daman1209arora commited on
Commit
e02c50d
·
verified ·
1 Parent(s): 3395ec9

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "max_position_embeddings": 40960,
15
+ "max_window_layers": 28,
16
+ "model_type": "qwen3",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 8,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": null,
22
+ "rope_theta": 1000000,
23
+ "sliding_window": null,
24
+ "tie_word_embeddings": true,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.52.4",
27
+ "use_cache": false,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151936
30
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.52.4"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a09b576c6547b576dd3f3c2e90f1bd8bd3d33bf6b39256c4c50f0982aa9d4f7
3
+ size 2384234968
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c955e06bcf511c9eb0c576d80879d6a956be4d7dd398236ca1a9fcc66a31c19
3
+ size 4768663315
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1325f40886cd907d4160d5d36a50b2aa4ce8180e738c6b0b7cab177c2ad1a3f
3
+ size 14917
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4546f7cde630d9b35e94345f93acc2b980dd4330583e65ddf22236c7c806d400
3
+ size 14917
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e674de87535c7fd5082b44c684bb517d78a71251342918b720c27ef01e0552bc
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "padding_side": "right",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
trainer_state.json ADDED
@@ -0,0 +1,1524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 400,
3
+ "best_metric": 1.0,
4
+ "best_model_checkpoint": "/projects/bffw/darora1/llm_ipc/final_models/mpi_async_n3/checkpoint-400",
5
+ "epoch": 0.15521642383784734,
6
+ "eval_steps": 40,
7
+ "global_step": 400,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0007760821191892367,
14
+ "grad_norm": 12.262979507446289,
15
+ "learning_rate": 2.0000000000000002e-07,
16
+ "loss": 0.5292,
17
+ "step": 2
18
+ },
19
+ {
20
+ "epoch": 0.0015521642383784734,
21
+ "grad_norm": 10.550226211547852,
22
+ "learning_rate": 6.000000000000001e-07,
23
+ "loss": 0.5158,
24
+ "step": 4
25
+ },
26
+ {
27
+ "epoch": 0.00232824635756771,
28
+ "grad_norm": 11.25029182434082,
29
+ "learning_rate": 1.0000000000000002e-06,
30
+ "loss": 0.5035,
31
+ "step": 6
32
+ },
33
+ {
34
+ "epoch": 0.003104328476756947,
35
+ "grad_norm": 12.211933135986328,
36
+ "learning_rate": 1.4000000000000001e-06,
37
+ "loss": 0.4699,
38
+ "step": 8
39
+ },
40
+ {
41
+ "epoch": 0.0038804105959461834,
42
+ "grad_norm": 13.521236419677734,
43
+ "learning_rate": 1.8000000000000001e-06,
44
+ "loss": 0.4353,
45
+ "step": 10
46
+ },
47
+ {
48
+ "epoch": 0.00465649271513542,
49
+ "grad_norm": 10.723718643188477,
50
+ "learning_rate": 2.2e-06,
51
+ "loss": 0.3387,
52
+ "step": 12
53
+ },
54
+ {
55
+ "epoch": 0.005432574834324657,
56
+ "grad_norm": 4.546169757843018,
57
+ "learning_rate": 2.6e-06,
58
+ "loss": 0.2936,
59
+ "step": 14
60
+ },
61
+ {
62
+ "epoch": 0.006208656953513894,
63
+ "grad_norm": 2.195192813873291,
64
+ "learning_rate": 3e-06,
65
+ "loss": 0.1848,
66
+ "step": 16
67
+ },
68
+ {
69
+ "epoch": 0.00698473907270313,
70
+ "grad_norm": 2.521470785140991,
71
+ "learning_rate": 3.4000000000000005e-06,
72
+ "loss": 0.1964,
73
+ "step": 18
74
+ },
75
+ {
76
+ "epoch": 0.007760821191892367,
77
+ "grad_norm": 1.8902873992919922,
78
+ "learning_rate": 3.8000000000000005e-06,
79
+ "loss": 0.1254,
80
+ "step": 20
81
+ },
82
+ {
83
+ "epoch": 0.008536903311081603,
84
+ "grad_norm": 1.6655786037445068,
85
+ "learning_rate": 4.2000000000000004e-06,
86
+ "loss": 0.1055,
87
+ "step": 22
88
+ },
89
+ {
90
+ "epoch": 0.00931298543027084,
91
+ "grad_norm": 1.4653961658477783,
92
+ "learning_rate": 4.600000000000001e-06,
93
+ "loss": 0.091,
94
+ "step": 24
95
+ },
96
+ {
97
+ "epoch": 0.010089067549460077,
98
+ "grad_norm": 1.1770055294036865,
99
+ "learning_rate": 5e-06,
100
+ "loss": 0.0762,
101
+ "step": 26
102
+ },
103
+ {
104
+ "epoch": 0.010865149668649314,
105
+ "grad_norm": 1.4054973125457764,
106
+ "learning_rate": 5.400000000000001e-06,
107
+ "loss": 0.0433,
108
+ "step": 28
109
+ },
110
+ {
111
+ "epoch": 0.011641231787838551,
112
+ "grad_norm": 1.2623802423477173,
113
+ "learning_rate": 5.8e-06,
114
+ "loss": 0.0367,
115
+ "step": 30
116
+ },
117
+ {
118
+ "epoch": 0.012417313907027787,
119
+ "grad_norm": 0.6245309114456177,
120
+ "learning_rate": 6.200000000000001e-06,
121
+ "loss": 0.0269,
122
+ "step": 32
123
+ },
124
+ {
125
+ "epoch": 0.013193396026217025,
126
+ "grad_norm": 0.5869189500808716,
127
+ "learning_rate": 6.600000000000001e-06,
128
+ "loss": 0.0224,
129
+ "step": 34
130
+ },
131
+ {
132
+ "epoch": 0.01396947814540626,
133
+ "grad_norm": 0.828778862953186,
134
+ "learning_rate": 7e-06,
135
+ "loss": 0.0243,
136
+ "step": 36
137
+ },
138
+ {
139
+ "epoch": 0.014745560264595498,
140
+ "grad_norm": 0.5178276896476746,
141
+ "learning_rate": 7.4e-06,
142
+ "loss": 0.0143,
143
+ "step": 38
144
+ },
145
+ {
146
+ "epoch": 0.015521642383784734,
147
+ "grad_norm": 1.2490451335906982,
148
+ "learning_rate": 7.800000000000002e-06,
149
+ "loss": 0.0145,
150
+ "step": 40
151
+ },
152
+ {
153
+ "epoch": 0.015521642383784734,
154
+ "eval_accuracy": 0.9950695396598972,
155
+ "eval_loss": 0.01397051103413105,
156
+ "eval_runtime": 131.2714,
157
+ "eval_samples_per_second": 38.089,
158
+ "eval_steps_per_second": 9.522,
159
+ "step": 40
160
+ },
161
+ {
162
+ "epoch": 0.01629772450297397,
163
+ "grad_norm": 0.6127598881721497,
164
+ "learning_rate": 8.2e-06,
165
+ "loss": 0.0147,
166
+ "step": 42
167
+ },
168
+ {
169
+ "epoch": 0.017073806622163207,
170
+ "grad_norm": 0.7742691040039062,
171
+ "learning_rate": 8.6e-06,
172
+ "loss": 0.0092,
173
+ "step": 44
174
+ },
175
+ {
176
+ "epoch": 0.017849888741352446,
177
+ "grad_norm": 0.43619677424430847,
178
+ "learning_rate": 9e-06,
179
+ "loss": 0.0139,
180
+ "step": 46
181
+ },
182
+ {
183
+ "epoch": 0.01862597086054168,
184
+ "grad_norm": 0.6179471015930176,
185
+ "learning_rate": 9.4e-06,
186
+ "loss": 0.0137,
187
+ "step": 48
188
+ },
189
+ {
190
+ "epoch": 0.019402052979730917,
191
+ "grad_norm": 0.6856386065483093,
192
+ "learning_rate": 9.800000000000001e-06,
193
+ "loss": 0.0114,
194
+ "step": 50
195
+ },
196
+ {
197
+ "epoch": 0.020178135098920153,
198
+ "grad_norm": 0.4444126486778259,
199
+ "learning_rate": 9.999998993000299e-06,
200
+ "loss": 0.0072,
201
+ "step": 52
202
+ },
203
+ {
204
+ "epoch": 0.020954217218109392,
205
+ "grad_norm": 0.44204798340797424,
206
+ "learning_rate": 9.999990937005126e-06,
207
+ "loss": 0.0081,
208
+ "step": 54
209
+ },
210
+ {
211
+ "epoch": 0.021730299337298628,
212
+ "grad_norm": 0.6200250387191772,
213
+ "learning_rate": 9.999974825027756e-06,
214
+ "loss": 0.0068,
215
+ "step": 56
216
+ },
217
+ {
218
+ "epoch": 0.022506381456487864,
219
+ "grad_norm": 0.3666571378707886,
220
+ "learning_rate": 9.999950657094151e-06,
221
+ "loss": 0.0056,
222
+ "step": 58
223
+ },
224
+ {
225
+ "epoch": 0.023282463575677103,
226
+ "grad_norm": 0.37394317984580994,
227
+ "learning_rate": 9.999918433243253e-06,
228
+ "loss": 0.0057,
229
+ "step": 60
230
+ },
231
+ {
232
+ "epoch": 0.02405854569486634,
233
+ "grad_norm": 0.3526070713996887,
234
+ "learning_rate": 9.999878153526974e-06,
235
+ "loss": 0.0046,
236
+ "step": 62
237
+ },
238
+ {
239
+ "epoch": 0.024834627814055574,
240
+ "grad_norm": 0.37286990880966187,
241
+ "learning_rate": 9.99982981801022e-06,
242
+ "loss": 0.0055,
243
+ "step": 64
244
+ },
245
+ {
246
+ "epoch": 0.02561070993324481,
247
+ "grad_norm": 0.2880455255508423,
248
+ "learning_rate": 9.999773426770864e-06,
249
+ "loss": 0.0055,
250
+ "step": 66
251
+ },
252
+ {
253
+ "epoch": 0.02638679205243405,
254
+ "grad_norm": 0.1844996213912964,
255
+ "learning_rate": 9.999708979899769e-06,
256
+ "loss": 0.0053,
257
+ "step": 68
258
+ },
259
+ {
260
+ "epoch": 0.027162874171623285,
261
+ "grad_norm": 0.3575407564640045,
262
+ "learning_rate": 9.999636477500765e-06,
263
+ "loss": 0.0046,
264
+ "step": 70
265
+ },
266
+ {
267
+ "epoch": 0.02793895629081252,
268
+ "grad_norm": 0.4409068524837494,
269
+ "learning_rate": 9.999555919690673e-06,
270
+ "loss": 0.0047,
271
+ "step": 72
272
+ },
273
+ {
274
+ "epoch": 0.02871503841000176,
275
+ "grad_norm": 0.28502458333969116,
276
+ "learning_rate": 9.999467306599285e-06,
277
+ "loss": 0.0039,
278
+ "step": 74
279
+ },
280
+ {
281
+ "epoch": 0.029491120529190996,
282
+ "grad_norm": 0.3887697458267212,
283
+ "learning_rate": 9.999370638369377e-06,
284
+ "loss": 0.0032,
285
+ "step": 76
286
+ },
287
+ {
288
+ "epoch": 0.03026720264838023,
289
+ "grad_norm": 0.3041154742240906,
290
+ "learning_rate": 9.999265915156697e-06,
291
+ "loss": 0.0028,
292
+ "step": 78
293
+ },
294
+ {
295
+ "epoch": 0.031043284767569467,
296
+ "grad_norm": 0.3210655748844147,
297
+ "learning_rate": 9.999153137129978e-06,
298
+ "loss": 0.0034,
299
+ "step": 80
300
+ },
301
+ {
302
+ "epoch": 0.031043284767569467,
303
+ "eval_accuracy": 0.9989959166226455,
304
+ "eval_loss": 0.0025868695229291916,
305
+ "eval_runtime": 127.9011,
306
+ "eval_samples_per_second": 39.093,
307
+ "eval_steps_per_second": 9.773,
308
+ "step": 80
309
+ },
310
+ {
311
+ "epoch": 0.0318193668867587,
312
+ "grad_norm": 0.6757090091705322,
313
+ "learning_rate": 9.999032304470926e-06,
314
+ "loss": 0.003,
315
+ "step": 82
316
+ },
317
+ {
318
+ "epoch": 0.03259544900594794,
319
+ "grad_norm": 0.16272921860218048,
320
+ "learning_rate": 9.998903417374228e-06,
321
+ "loss": 0.0017,
322
+ "step": 84
323
+ },
324
+ {
325
+ "epoch": 0.03337153112513718,
326
+ "grad_norm": 0.6212184429168701,
327
+ "learning_rate": 9.998766476047546e-06,
328
+ "loss": 0.0033,
329
+ "step": 86
330
+ },
331
+ {
332
+ "epoch": 0.034147613244326414,
333
+ "grad_norm": 0.19103288650512695,
334
+ "learning_rate": 9.998621480711522e-06,
335
+ "loss": 0.0016,
336
+ "step": 88
337
+ },
338
+ {
339
+ "epoch": 0.03492369536351565,
340
+ "grad_norm": 0.4599171280860901,
341
+ "learning_rate": 9.998468431599768e-06,
342
+ "loss": 0.0035,
343
+ "step": 90
344
+ },
345
+ {
346
+ "epoch": 0.03569977748270489,
347
+ "grad_norm": 0.22474364936351776,
348
+ "learning_rate": 9.99830732895888e-06,
349
+ "loss": 0.0016,
350
+ "step": 92
351
+ },
352
+ {
353
+ "epoch": 0.036475859601894124,
354
+ "grad_norm": 0.19210362434387207,
355
+ "learning_rate": 9.998138173048424e-06,
356
+ "loss": 0.0015,
357
+ "step": 94
358
+ },
359
+ {
360
+ "epoch": 0.03725194172108336,
361
+ "grad_norm": 0.22696685791015625,
362
+ "learning_rate": 9.997960964140946e-06,
363
+ "loss": 0.0012,
364
+ "step": 96
365
+ },
366
+ {
367
+ "epoch": 0.038028023840272596,
368
+ "grad_norm": 0.3195860981941223,
369
+ "learning_rate": 9.997775702521965e-06,
370
+ "loss": 0.0012,
371
+ "step": 98
372
+ },
373
+ {
374
+ "epoch": 0.038804105959461835,
375
+ "grad_norm": 0.3686668574810028,
376
+ "learning_rate": 9.997582388489975e-06,
377
+ "loss": 0.0014,
378
+ "step": 100
379
+ },
380
+ {
381
+ "epoch": 0.039580188078651074,
382
+ "grad_norm": 0.28173354268074036,
383
+ "learning_rate": 9.99738102235644e-06,
384
+ "loss": 0.0014,
385
+ "step": 102
386
+ },
387
+ {
388
+ "epoch": 0.040356270197840306,
389
+ "grad_norm": 0.1499175727367401,
390
+ "learning_rate": 9.997171604445803e-06,
391
+ "loss": 0.0015,
392
+ "step": 104
393
+ },
394
+ {
395
+ "epoch": 0.041132352317029545,
396
+ "grad_norm": 0.4097079038619995,
397
+ "learning_rate": 9.99695413509548e-06,
398
+ "loss": 0.0019,
399
+ "step": 106
400
+ },
401
+ {
402
+ "epoch": 0.041908434436218785,
403
+ "grad_norm": 0.1767456978559494,
404
+ "learning_rate": 9.996728614655854e-06,
405
+ "loss": 0.0009,
406
+ "step": 108
407
+ },
408
+ {
409
+ "epoch": 0.04268451655540802,
410
+ "grad_norm": 0.2755231559276581,
411
+ "learning_rate": 9.996495043490285e-06,
412
+ "loss": 0.0012,
413
+ "step": 110
414
+ },
415
+ {
416
+ "epoch": 0.043460598674597256,
417
+ "grad_norm": 0.21294231712818146,
418
+ "learning_rate": 9.996253421975103e-06,
419
+ "loss": 0.001,
420
+ "step": 112
421
+ },
422
+ {
423
+ "epoch": 0.044236680793786495,
424
+ "grad_norm": 0.5105843544006348,
425
+ "learning_rate": 9.996003750499608e-06,
426
+ "loss": 0.0017,
427
+ "step": 114
428
+ },
429
+ {
430
+ "epoch": 0.04501276291297573,
431
+ "grad_norm": 0.20320548117160797,
432
+ "learning_rate": 9.995746029466071e-06,
433
+ "loss": 0.0011,
434
+ "step": 116
435
+ },
436
+ {
437
+ "epoch": 0.04578884503216497,
438
+ "grad_norm": 0.1121864914894104,
439
+ "learning_rate": 9.995480259289731e-06,
440
+ "loss": 0.0012,
441
+ "step": 118
442
+ },
443
+ {
444
+ "epoch": 0.046564927151354206,
445
+ "grad_norm": 0.13900773227214813,
446
+ "learning_rate": 9.995206440398798e-06,
447
+ "loss": 0.0005,
448
+ "step": 120
449
+ },
450
+ {
451
+ "epoch": 0.046564927151354206,
452
+ "eval_accuracy": 0.9996412933698419,
453
+ "eval_loss": 0.0013112464221194386,
454
+ "eval_runtime": 129.9264,
455
+ "eval_samples_per_second": 38.483,
456
+ "eval_steps_per_second": 9.621,
457
+ "step": 120
458
+ },
459
+ {
460
+ "epoch": 0.04734100927054344,
461
+ "grad_norm": 0.17495128512382507,
462
+ "learning_rate": 9.994924573234448e-06,
463
+ "loss": 0.0011,
464
+ "step": 122
465
+ },
466
+ {
467
+ "epoch": 0.04811709138973268,
468
+ "grad_norm": 0.44951504468917847,
469
+ "learning_rate": 9.994634658250825e-06,
470
+ "loss": 0.0015,
471
+ "step": 124
472
+ },
473
+ {
474
+ "epoch": 0.04889317350892191,
475
+ "grad_norm": 0.28001341223716736,
476
+ "learning_rate": 9.994336695915041e-06,
477
+ "loss": 0.0009,
478
+ "step": 126
479
+ },
480
+ {
481
+ "epoch": 0.04966925562811115,
482
+ "grad_norm": 0.15198007225990295,
483
+ "learning_rate": 9.994030686707171e-06,
484
+ "loss": 0.0008,
485
+ "step": 128
486
+ },
487
+ {
488
+ "epoch": 0.05044533774730039,
489
+ "grad_norm": 0.2052275389432907,
490
+ "learning_rate": 9.993716631120259e-06,
491
+ "loss": 0.0008,
492
+ "step": 130
493
+ },
494
+ {
495
+ "epoch": 0.05122141986648962,
496
+ "grad_norm": 0.20360974967479706,
497
+ "learning_rate": 9.993394529660307e-06,
498
+ "loss": 0.0008,
499
+ "step": 132
500
+ },
501
+ {
502
+ "epoch": 0.05199750198567886,
503
+ "grad_norm": 0.20459742844104767,
504
+ "learning_rate": 9.99306438284629e-06,
505
+ "loss": 0.0007,
506
+ "step": 134
507
+ },
508
+ {
509
+ "epoch": 0.0527735841048681,
510
+ "grad_norm": 0.12038147449493408,
511
+ "learning_rate": 9.992726191210139e-06,
512
+ "loss": 0.0008,
513
+ "step": 136
514
+ },
515
+ {
516
+ "epoch": 0.05354966622405733,
517
+ "grad_norm": 0.2902871072292328,
518
+ "learning_rate": 9.992379955296745e-06,
519
+ "loss": 0.0005,
520
+ "step": 138
521
+ },
522
+ {
523
+ "epoch": 0.05432574834324657,
524
+ "grad_norm": 0.11465182155370712,
525
+ "learning_rate": 9.992025675663966e-06,
526
+ "loss": 0.0006,
527
+ "step": 140
528
+ },
529
+ {
530
+ "epoch": 0.05510183046243581,
531
+ "grad_norm": 0.10924035310745239,
532
+ "learning_rate": 9.991663352882615e-06,
533
+ "loss": 0.0005,
534
+ "step": 142
535
+ },
536
+ {
537
+ "epoch": 0.05587791258162504,
538
+ "grad_norm": 0.10540606826543808,
539
+ "learning_rate": 9.991292987536469e-06,
540
+ "loss": 0.0003,
541
+ "step": 144
542
+ },
543
+ {
544
+ "epoch": 0.05665399470081428,
545
+ "grad_norm": 0.10914743691682816,
546
+ "learning_rate": 9.990914580222258e-06,
547
+ "loss": 0.0004,
548
+ "step": 146
549
+ },
550
+ {
551
+ "epoch": 0.05743007682000352,
552
+ "grad_norm": 0.06488844007253647,
553
+ "learning_rate": 9.990528131549674e-06,
554
+ "loss": 0.0004,
555
+ "step": 148
556
+ },
557
+ {
558
+ "epoch": 0.05820615893919275,
559
+ "grad_norm": 0.11523474752902985,
560
+ "learning_rate": 9.990133642141359e-06,
561
+ "loss": 0.0003,
562
+ "step": 150
563
+ },
564
+ {
565
+ "epoch": 0.05898224105838199,
566
+ "grad_norm": 0.17658241093158722,
567
+ "learning_rate": 9.989731112632917e-06,
568
+ "loss": 0.0004,
569
+ "step": 152
570
+ },
571
+ {
572
+ "epoch": 0.059758323177571224,
573
+ "grad_norm": 0.1516527682542801,
574
+ "learning_rate": 9.989320543672904e-06,
575
+ "loss": 0.0002,
576
+ "step": 154
577
+ },
578
+ {
579
+ "epoch": 0.06053440529676046,
580
+ "grad_norm": 0.14159496128559113,
581
+ "learning_rate": 9.988901935922826e-06,
582
+ "loss": 0.0001,
583
+ "step": 156
584
+ },
585
+ {
586
+ "epoch": 0.0613104874159497,
587
+ "grad_norm": 0.19340620934963226,
588
+ "learning_rate": 9.988475290057145e-06,
589
+ "loss": 0.0006,
590
+ "step": 158
591
+ },
592
+ {
593
+ "epoch": 0.062086569535138934,
594
+ "grad_norm": 0.17848193645477295,
595
+ "learning_rate": 9.988040606763272e-06,
596
+ "loss": 0.0003,
597
+ "step": 160
598
+ },
599
+ {
600
+ "epoch": 0.062086569535138934,
601
+ "eval_accuracy": 0.9999064094433845,
602
+ "eval_loss": 0.00032003907836042345,
603
+ "eval_runtime": 127.2112,
604
+ "eval_samples_per_second": 39.305,
605
+ "eval_steps_per_second": 9.826,
606
+ "step": 160
607
+ },
608
+ {
609
+ "epoch": 0.06286265165432818,
610
+ "grad_norm": 0.15477371215820312,
611
+ "learning_rate": 9.98759788674157e-06,
612
+ "loss": 0.0003,
613
+ "step": 162
614
+ },
615
+ {
616
+ "epoch": 0.0636387337735174,
617
+ "grad_norm": 0.1277933269739151,
618
+ "learning_rate": 9.987147130705347e-06,
619
+ "loss": 0.0004,
620
+ "step": 164
621
+ },
622
+ {
623
+ "epoch": 0.06441481589270664,
624
+ "grad_norm": 0.14449910819530487,
625
+ "learning_rate": 9.986688339380863e-06,
626
+ "loss": 0.0002,
627
+ "step": 166
628
+ },
629
+ {
630
+ "epoch": 0.06519089801189588,
631
+ "grad_norm": 0.6293010115623474,
632
+ "learning_rate": 9.98622151350732e-06,
633
+ "loss": 0.0006,
634
+ "step": 168
635
+ },
636
+ {
637
+ "epoch": 0.06596698013108512,
638
+ "grad_norm": 0.2988656163215637,
639
+ "learning_rate": 9.985746653836867e-06,
640
+ "loss": 0.0005,
641
+ "step": 170
642
+ },
643
+ {
644
+ "epoch": 0.06674306225027436,
645
+ "grad_norm": 0.0764790028333664,
646
+ "learning_rate": 9.985263761134602e-06,
647
+ "loss": 0.0005,
648
+ "step": 172
649
+ },
650
+ {
651
+ "epoch": 0.06751914436946359,
652
+ "grad_norm": 0.3135935366153717,
653
+ "learning_rate": 9.984772836178559e-06,
654
+ "loss": 0.0006,
655
+ "step": 174
656
+ },
657
+ {
658
+ "epoch": 0.06829522648865283,
659
+ "grad_norm": 0.4241097569465637,
660
+ "learning_rate": 9.984273879759713e-06,
661
+ "loss": 0.0008,
662
+ "step": 176
663
+ },
664
+ {
665
+ "epoch": 0.06907130860784207,
666
+ "grad_norm": 0.07492109388113022,
667
+ "learning_rate": 9.983766892681985e-06,
668
+ "loss": 0.0003,
669
+ "step": 178
670
+ },
671
+ {
672
+ "epoch": 0.0698473907270313,
673
+ "grad_norm": 0.15513752400875092,
674
+ "learning_rate": 9.983251875762234e-06,
675
+ "loss": 0.0003,
676
+ "step": 180
677
+ },
678
+ {
679
+ "epoch": 0.07062347284622054,
680
+ "grad_norm": 0.2630753815174103,
681
+ "learning_rate": 9.982728829830252e-06,
682
+ "loss": 0.0006,
683
+ "step": 182
684
+ },
685
+ {
686
+ "epoch": 0.07139955496540978,
687
+ "grad_norm": 0.07824663817882538,
688
+ "learning_rate": 9.982197755728771e-06,
689
+ "loss": 0.0003,
690
+ "step": 184
691
+ },
692
+ {
693
+ "epoch": 0.07217563708459901,
694
+ "grad_norm": 0.03119218535721302,
695
+ "learning_rate": 9.981658654313458e-06,
696
+ "loss": 0.0006,
697
+ "step": 186
698
+ },
699
+ {
700
+ "epoch": 0.07295171920378825,
701
+ "grad_norm": 0.5731412768363953,
702
+ "learning_rate": 9.981111526452912e-06,
703
+ "loss": 0.0015,
704
+ "step": 188
705
+ },
706
+ {
707
+ "epoch": 0.07372780132297749,
708
+ "grad_norm": 0.13840052485466003,
709
+ "learning_rate": 9.980556373028665e-06,
710
+ "loss": 0.0002,
711
+ "step": 190
712
+ },
713
+ {
714
+ "epoch": 0.07450388344216673,
715
+ "grad_norm": 0.04643406346440315,
716
+ "learning_rate": 9.979993194935182e-06,
717
+ "loss": 0.0002,
718
+ "step": 192
719
+ },
720
+ {
721
+ "epoch": 0.07527996556135597,
722
+ "grad_norm": 0.05373441055417061,
723
+ "learning_rate": 9.979421993079853e-06,
724
+ "loss": 0.0003,
725
+ "step": 194
726
+ },
727
+ {
728
+ "epoch": 0.07605604768054519,
729
+ "grad_norm": 0.21675284206867218,
730
+ "learning_rate": 9.978842768382999e-06,
731
+ "loss": 0.0004,
732
+ "step": 196
733
+ },
734
+ {
735
+ "epoch": 0.07683212979973443,
736
+ "grad_norm": 0.18371616303920746,
737
+ "learning_rate": 9.978255521777865e-06,
738
+ "loss": 0.0002,
739
+ "step": 198
740
+ },
741
+ {
742
+ "epoch": 0.07760821191892367,
743
+ "grad_norm": 0.09996998310089111,
744
+ "learning_rate": 9.977660254210623e-06,
745
+ "loss": 0.0003,
746
+ "step": 200
747
+ },
748
+ {
749
+ "epoch": 0.07760821191892367,
750
+ "eval_accuracy": 0.9999704461885914,
751
+ "eval_loss": 0.00016504956874996424,
752
+ "eval_runtime": 129.4586,
753
+ "eval_samples_per_second": 38.622,
754
+ "eval_steps_per_second": 9.656,
755
+ "step": 200
756
+ },
757
+ {
758
+ "epoch": 0.07838429403811291,
759
+ "grad_norm": 0.1638646125793457,
760
+ "learning_rate": 9.977056966640368e-06,
761
+ "loss": 0.0004,
762
+ "step": 202
763
+ },
764
+ {
765
+ "epoch": 0.07916037615730215,
766
+ "grad_norm": 0.13783912360668182,
767
+ "learning_rate": 9.976445660039118e-06,
768
+ "loss": 0.0001,
769
+ "step": 204
770
+ },
771
+ {
772
+ "epoch": 0.07993645827649139,
773
+ "grad_norm": 0.14015792310237885,
774
+ "learning_rate": 9.975826335391808e-06,
775
+ "loss": 0.0001,
776
+ "step": 206
777
+ },
778
+ {
779
+ "epoch": 0.08071254039568061,
780
+ "grad_norm": 0.2642574906349182,
781
+ "learning_rate": 9.975198993696294e-06,
782
+ "loss": 0.0002,
783
+ "step": 208
784
+ },
785
+ {
786
+ "epoch": 0.08148862251486985,
787
+ "grad_norm": 0.15489515662193298,
788
+ "learning_rate": 9.974563635963348e-06,
789
+ "loss": 0.0006,
790
+ "step": 210
791
+ },
792
+ {
793
+ "epoch": 0.08226470463405909,
794
+ "grad_norm": 0.35902225971221924,
795
+ "learning_rate": 9.973920263216658e-06,
796
+ "loss": 0.0004,
797
+ "step": 212
798
+ },
799
+ {
800
+ "epoch": 0.08304078675324833,
801
+ "grad_norm": 0.4768538773059845,
802
+ "learning_rate": 9.973268876492827e-06,
803
+ "loss": 0.0005,
804
+ "step": 214
805
+ },
806
+ {
807
+ "epoch": 0.08381686887243757,
808
+ "grad_norm": 0.13987833261489868,
809
+ "learning_rate": 9.972609476841368e-06,
810
+ "loss": 0.0002,
811
+ "step": 216
812
+ },
813
+ {
814
+ "epoch": 0.0845929509916268,
815
+ "grad_norm": 0.1310640126466751,
816
+ "learning_rate": 9.971942065324704e-06,
817
+ "loss": 0.0003,
818
+ "step": 218
819
+ },
820
+ {
821
+ "epoch": 0.08536903311081603,
822
+ "grad_norm": 0.2835996747016907,
823
+ "learning_rate": 9.971266643018171e-06,
824
+ "loss": 0.0004,
825
+ "step": 220
826
+ },
827
+ {
828
+ "epoch": 0.08614511523000527,
829
+ "grad_norm": 0.14516514539718628,
830
+ "learning_rate": 9.970583211010008e-06,
831
+ "loss": 0.0005,
832
+ "step": 222
833
+ },
834
+ {
835
+ "epoch": 0.08692119734919451,
836
+ "grad_norm": 0.1896241158246994,
837
+ "learning_rate": 9.969891770401358e-06,
838
+ "loss": 0.0011,
839
+ "step": 224
840
+ },
841
+ {
842
+ "epoch": 0.08769727946838375,
843
+ "grad_norm": 0.13900001347064972,
844
+ "learning_rate": 9.969192322306271e-06,
845
+ "loss": 0.0006,
846
+ "step": 226
847
+ },
848
+ {
849
+ "epoch": 0.08847336158757299,
850
+ "grad_norm": 0.12469799816608429,
851
+ "learning_rate": 9.968484867851698e-06,
852
+ "loss": 0.0003,
853
+ "step": 228
854
+ },
855
+ {
856
+ "epoch": 0.08924944370676222,
857
+ "grad_norm": 0.2005859613418579,
858
+ "learning_rate": 9.96776940817749e-06,
859
+ "loss": 0.0008,
860
+ "step": 230
861
+ },
862
+ {
863
+ "epoch": 0.09002552582595146,
864
+ "grad_norm": 0.13151948153972626,
865
+ "learning_rate": 9.967045944436392e-06,
866
+ "loss": 0.0002,
867
+ "step": 232
868
+ },
869
+ {
870
+ "epoch": 0.0908016079451407,
871
+ "grad_norm": 0.10286468267440796,
872
+ "learning_rate": 9.966314477794052e-06,
873
+ "loss": 0.0004,
874
+ "step": 234
875
+ },
876
+ {
877
+ "epoch": 0.09157769006432993,
878
+ "grad_norm": 0.08904605358839035,
879
+ "learning_rate": 9.965575009429006e-06,
880
+ "loss": 0.0003,
881
+ "step": 236
882
+ },
883
+ {
884
+ "epoch": 0.09235377218351917,
885
+ "grad_norm": 0.110069639980793,
886
+ "learning_rate": 9.964827540532685e-06,
887
+ "loss": 0.0004,
888
+ "step": 238
889
+ },
890
+ {
891
+ "epoch": 0.09312985430270841,
892
+ "grad_norm": 0.044081419706344604,
893
+ "learning_rate": 9.964072072309412e-06,
894
+ "loss": 0.0002,
895
+ "step": 240
896
+ },
897
+ {
898
+ "epoch": 0.09312985430270841,
899
+ "eval_accuracy": 0.9998835115818656,
900
+ "eval_loss": 0.00031863132608123124,
901
+ "eval_runtime": 127.6209,
902
+ "eval_samples_per_second": 39.179,
903
+ "eval_steps_per_second": 9.795,
904
+ "step": 240
905
+ },
906
+ {
907
+ "epoch": 0.09390593642189764,
908
+ "grad_norm": 0.11008896678686142,
909
+ "learning_rate": 9.963308605976397e-06,
910
+ "loss": 0.0002,
911
+ "step": 242
912
+ },
913
+ {
914
+ "epoch": 0.09468201854108688,
915
+ "grad_norm": 0.07347576320171356,
916
+ "learning_rate": 9.962537142763733e-06,
917
+ "loss": 0.0003,
918
+ "step": 244
919
+ },
920
+ {
921
+ "epoch": 0.09545810066027612,
922
+ "grad_norm": 0.061514757573604584,
923
+ "learning_rate": 9.961757683914406e-06,
924
+ "loss": 0.0001,
925
+ "step": 246
926
+ },
927
+ {
928
+ "epoch": 0.09623418277946535,
929
+ "grad_norm": 0.030034126713871956,
930
+ "learning_rate": 9.960970230684276e-06,
931
+ "loss": 0.0001,
932
+ "step": 248
933
+ },
934
+ {
935
+ "epoch": 0.0970102648986546,
936
+ "grad_norm": 0.06853067874908447,
937
+ "learning_rate": 9.96017478434209e-06,
938
+ "loss": 0.0001,
939
+ "step": 250
940
+ },
941
+ {
942
+ "epoch": 0.09778634701784382,
943
+ "grad_norm": 0.08918727934360504,
944
+ "learning_rate": 9.959371346169466e-06,
945
+ "loss": 0.0001,
946
+ "step": 252
947
+ },
948
+ {
949
+ "epoch": 0.09856242913703306,
950
+ "grad_norm": 0.014135139063000679,
951
+ "learning_rate": 9.958559917460909e-06,
952
+ "loss": 0.0,
953
+ "step": 254
954
+ },
955
+ {
956
+ "epoch": 0.0993385112562223,
957
+ "grad_norm": 0.03587706759572029,
958
+ "learning_rate": 9.957740499523787e-06,
959
+ "loss": 0.0,
960
+ "step": 256
961
+ },
962
+ {
963
+ "epoch": 0.10011459337541154,
964
+ "grad_norm": 0.028472531586885452,
965
+ "learning_rate": 9.95691309367835e-06,
966
+ "loss": 0.0,
967
+ "step": 258
968
+ },
969
+ {
970
+ "epoch": 0.10089067549460078,
971
+ "grad_norm": 0.10992776602506638,
972
+ "learning_rate": 9.95607770125771e-06,
973
+ "loss": 0.0,
974
+ "step": 260
975
+ },
976
+ {
977
+ "epoch": 0.10166675761379002,
978
+ "grad_norm": 0.015062687918543816,
979
+ "learning_rate": 9.955234323607854e-06,
980
+ "loss": 0.0,
981
+ "step": 262
982
+ },
983
+ {
984
+ "epoch": 0.10244283973297924,
985
+ "grad_norm": 0.09784650802612305,
986
+ "learning_rate": 9.954382962087628e-06,
987
+ "loss": 0.0001,
988
+ "step": 264
989
+ },
990
+ {
991
+ "epoch": 0.10321892185216848,
992
+ "grad_norm": 0.004005913157016039,
993
+ "learning_rate": 9.95352361806875e-06,
994
+ "loss": 0.0001,
995
+ "step": 266
996
+ },
997
+ {
998
+ "epoch": 0.10399500397135772,
999
+ "grad_norm": 0.0028742440044879913,
1000
+ "learning_rate": 9.95265629293579e-06,
1001
+ "loss": 0.0,
1002
+ "step": 268
1003
+ },
1004
+ {
1005
+ "epoch": 0.10477108609054696,
1006
+ "grad_norm": 0.01080241333693266,
1007
+ "learning_rate": 9.951780988086183e-06,
1008
+ "loss": 0.0,
1009
+ "step": 270
1010
+ },
1011
+ {
1012
+ "epoch": 0.1055471682097362,
1013
+ "grad_norm": 0.006698825862258673,
1014
+ "learning_rate": 9.950897704930223e-06,
1015
+ "loss": 0.0,
1016
+ "step": 272
1017
+ },
1018
+ {
1019
+ "epoch": 0.10632325032892542,
1020
+ "grad_norm": 0.0032098847441375256,
1021
+ "learning_rate": 9.95000644489105e-06,
1022
+ "loss": 0.0,
1023
+ "step": 274
1024
+ },
1025
+ {
1026
+ "epoch": 0.10709933244811466,
1027
+ "grad_norm": 0.014737925492227077,
1028
+ "learning_rate": 9.949107209404664e-06,
1029
+ "loss": 0.0,
1030
+ "step": 276
1031
+ },
1032
+ {
1033
+ "epoch": 0.1078754145673039,
1034
+ "grad_norm": 0.002784354379400611,
1035
+ "learning_rate": 9.948199999919914e-06,
1036
+ "loss": 0.0,
1037
+ "step": 278
1038
+ },
1039
+ {
1040
+ "epoch": 0.10865149668649314,
1041
+ "grad_norm": 0.001066903700120747,
1042
+ "learning_rate": 9.947284817898493e-06,
1043
+ "loss": 0.0,
1044
+ "step": 280
1045
+ },
1046
+ {
1047
+ "epoch": 0.10865149668649314,
1048
+ "eval_accuracy": 0.9999963963963965,
1049
+ "eval_loss": 2.1505837139557116e-05,
1050
+ "eval_runtime": 127.9196,
1051
+ "eval_samples_per_second": 39.087,
1052
+ "eval_steps_per_second": 9.772,
1053
+ "step": 280
1054
+ },
1055
+ {
1056
+ "epoch": 0.10942757880568238,
1057
+ "grad_norm": 0.0006136983865872025,
1058
+ "learning_rate": 9.946361664814942e-06,
1059
+ "loss": 0.0,
1060
+ "step": 282
1061
+ },
1062
+ {
1063
+ "epoch": 0.11020366092487162,
1064
+ "grad_norm": 0.021285895258188248,
1065
+ "learning_rate": 9.945430542156647e-06,
1066
+ "loss": 0.0,
1067
+ "step": 284
1068
+ },
1069
+ {
1070
+ "epoch": 0.11097974304406084,
1071
+ "grad_norm": 0.0006559508037753403,
1072
+ "learning_rate": 9.944491451423829e-06,
1073
+ "loss": 0.0,
1074
+ "step": 286
1075
+ },
1076
+ {
1077
+ "epoch": 0.11175582516325008,
1078
+ "grad_norm": 0.0006491419044323266,
1079
+ "learning_rate": 9.943544394129552e-06,
1080
+ "loss": 0.0,
1081
+ "step": 288
1082
+ },
1083
+ {
1084
+ "epoch": 0.11253190728243932,
1085
+ "grad_norm": 0.0015669898129999638,
1086
+ "learning_rate": 9.942589371799715e-06,
1087
+ "loss": 0.0,
1088
+ "step": 290
1089
+ },
1090
+ {
1091
+ "epoch": 0.11330798940162856,
1092
+ "grad_norm": 0.003007555613294244,
1093
+ "learning_rate": 9.941626385973047e-06,
1094
+ "loss": 0.0,
1095
+ "step": 292
1096
+ },
1097
+ {
1098
+ "epoch": 0.1140840715208178,
1099
+ "grad_norm": 0.004424386657774448,
1100
+ "learning_rate": 9.940655438201113e-06,
1101
+ "loss": 0.0,
1102
+ "step": 294
1103
+ },
1104
+ {
1105
+ "epoch": 0.11486015364000704,
1106
+ "grad_norm": 0.0031703764107078314,
1107
+ "learning_rate": 9.9396765300483e-06,
1108
+ "loss": 0.0,
1109
+ "step": 296
1110
+ },
1111
+ {
1112
+ "epoch": 0.11563623575919627,
1113
+ "grad_norm": 0.00221498915925622,
1114
+ "learning_rate": 9.938689663091828e-06,
1115
+ "loss": 0.0,
1116
+ "step": 298
1117
+ },
1118
+ {
1119
+ "epoch": 0.1164123178783855,
1120
+ "grad_norm": 0.005704451352357864,
1121
+ "learning_rate": 9.937694838921734e-06,
1122
+ "loss": 0.0,
1123
+ "step": 300
1124
+ },
1125
+ {
1126
+ "epoch": 0.11718839999757474,
1127
+ "grad_norm": 0.0013350360095500946,
1128
+ "learning_rate": 9.93669205914088e-06,
1129
+ "loss": 0.0,
1130
+ "step": 302
1131
+ },
1132
+ {
1133
+ "epoch": 0.11796448211676398,
1134
+ "grad_norm": 0.0008370972354896367,
1135
+ "learning_rate": 9.93568132536494e-06,
1136
+ "loss": 0.0,
1137
+ "step": 304
1138
+ },
1139
+ {
1140
+ "epoch": 0.11874056423595322,
1141
+ "grad_norm": 0.01809551753103733,
1142
+ "learning_rate": 9.934662639222412e-06,
1143
+ "loss": 0.0,
1144
+ "step": 306
1145
+ },
1146
+ {
1147
+ "epoch": 0.11951664635514245,
1148
+ "grad_norm": 0.0005159855354577303,
1149
+ "learning_rate": 9.9336360023546e-06,
1150
+ "loss": 0.0,
1151
+ "step": 308
1152
+ },
1153
+ {
1154
+ "epoch": 0.12029272847433169,
1155
+ "grad_norm": 0.0009325972059741616,
1156
+ "learning_rate": 9.932601416415622e-06,
1157
+ "loss": 0.0,
1158
+ "step": 310
1159
+ },
1160
+ {
1161
+ "epoch": 0.12106881059352093,
1162
+ "grad_norm": 0.003970442805439234,
1163
+ "learning_rate": 9.931558883072403e-06,
1164
+ "loss": 0.0,
1165
+ "step": 312
1166
+ },
1167
+ {
1168
+ "epoch": 0.12184489271271016,
1169
+ "grad_norm": 0.000802877766545862,
1170
+ "learning_rate": 9.930508404004668e-06,
1171
+ "loss": 0.0,
1172
+ "step": 314
1173
+ },
1174
+ {
1175
+ "epoch": 0.1226209748318994,
1176
+ "grad_norm": 0.000747400859836489,
1177
+ "learning_rate": 9.929449980904952e-06,
1178
+ "loss": 0.0,
1179
+ "step": 316
1180
+ },
1181
+ {
1182
+ "epoch": 0.12339705695108864,
1183
+ "grad_norm": 0.05219698324799538,
1184
+ "learning_rate": 9.928383615478586e-06,
1185
+ "loss": 0.0,
1186
+ "step": 318
1187
+ },
1188
+ {
1189
+ "epoch": 0.12417313907027787,
1190
+ "grad_norm": 0.0008670546812936664,
1191
+ "learning_rate": 9.927309309443696e-06,
1192
+ "loss": 0.0,
1193
+ "step": 320
1194
+ },
1195
+ {
1196
+ "epoch": 0.12417313907027787,
1197
+ "eval_accuracy": 0.9999981981981982,
1198
+ "eval_loss": 1.2200940545881167e-05,
1199
+ "eval_runtime": 126.4816,
1200
+ "eval_samples_per_second": 39.531,
1201
+ "eval_steps_per_second": 9.883,
1202
+ "step": 320
1203
+ },
1204
+ {
1205
+ "epoch": 0.12494922118946711,
1206
+ "grad_norm": 0.0031448816880583763,
1207
+ "learning_rate": 9.9262270645312e-06,
1208
+ "loss": 0.0,
1209
+ "step": 322
1210
+ },
1211
+ {
1212
+ "epoch": 0.12572530330865636,
1213
+ "grad_norm": 0.0009269348229281604,
1214
+ "learning_rate": 9.925136882484816e-06,
1215
+ "loss": 0.0,
1216
+ "step": 324
1217
+ },
1218
+ {
1219
+ "epoch": 0.1265013854278456,
1220
+ "grad_norm": 0.00048692882410250604,
1221
+ "learning_rate": 9.924038765061042e-06,
1222
+ "loss": 0.0,
1223
+ "step": 326
1224
+ },
1225
+ {
1226
+ "epoch": 0.1272774675470348,
1227
+ "grad_norm": 0.0416707918047905,
1228
+ "learning_rate": 9.922932714029163e-06,
1229
+ "loss": 0.0,
1230
+ "step": 328
1231
+ },
1232
+ {
1233
+ "epoch": 0.12805354966622406,
1234
+ "grad_norm": 0.0007638961542397738,
1235
+ "learning_rate": 9.921818731171249e-06,
1236
+ "loss": 0.0,
1237
+ "step": 330
1238
+ },
1239
+ {
1240
+ "epoch": 0.1288296317854133,
1241
+ "grad_norm": 0.0007810278912074864,
1242
+ "learning_rate": 9.920696818282147e-06,
1243
+ "loss": 0.0,
1244
+ "step": 332
1245
+ },
1246
+ {
1247
+ "epoch": 0.12960571390460254,
1248
+ "grad_norm": 0.0021285091061145067,
1249
+ "learning_rate": 9.919566977169486e-06,
1250
+ "loss": 0.0,
1251
+ "step": 334
1252
+ },
1253
+ {
1254
+ "epoch": 0.13038179602379177,
1255
+ "grad_norm": 0.0033166895154863596,
1256
+ "learning_rate": 9.918429209653662e-06,
1257
+ "loss": 0.0,
1258
+ "step": 336
1259
+ },
1260
+ {
1261
+ "epoch": 0.131157878142981,
1262
+ "grad_norm": 0.0007379205781035125,
1263
+ "learning_rate": 9.917283517567845e-06,
1264
+ "loss": 0.0,
1265
+ "step": 338
1266
+ },
1267
+ {
1268
+ "epoch": 0.13193396026217025,
1269
+ "grad_norm": 0.0007624090649187565,
1270
+ "learning_rate": 9.916129902757977e-06,
1271
+ "loss": 0.0,
1272
+ "step": 340
1273
+ },
1274
+ {
1275
+ "epoch": 0.13271004238135947,
1276
+ "grad_norm": 0.00043780903797596693,
1277
+ "learning_rate": 9.914968367082756e-06,
1278
+ "loss": 0.0,
1279
+ "step": 342
1280
+ },
1281
+ {
1282
+ "epoch": 0.13348612450054873,
1283
+ "grad_norm": 0.0003401880676392466,
1284
+ "learning_rate": 9.913798912413653e-06,
1285
+ "loss": 0.0,
1286
+ "step": 344
1287
+ },
1288
+ {
1289
+ "epoch": 0.13426220661973795,
1290
+ "grad_norm": 0.3694112002849579,
1291
+ "learning_rate": 9.912621540634889e-06,
1292
+ "loss": 0.0001,
1293
+ "step": 346
1294
+ },
1295
+ {
1296
+ "epoch": 0.13503828873892718,
1297
+ "grad_norm": 0.0005322833894751966,
1298
+ "learning_rate": 9.911436253643445e-06,
1299
+ "loss": 0.0,
1300
+ "step": 348
1301
+ },
1302
+ {
1303
+ "epoch": 0.13581437085811643,
1304
+ "grad_norm": 0.023259738460183144,
1305
+ "learning_rate": 9.910243053349055e-06,
1306
+ "loss": 0.0,
1307
+ "step": 350
1308
+ },
1309
+ {
1310
+ "epoch": 0.13659045297730565,
1311
+ "grad_norm": 0.00202095415443182,
1312
+ "learning_rate": 9.909041941674205e-06,
1313
+ "loss": 0.0002,
1314
+ "step": 352
1315
+ },
1316
+ {
1317
+ "epoch": 0.1373665350964949,
1318
+ "grad_norm": 0.001592564396560192,
1319
+ "learning_rate": 9.90783292055412e-06,
1320
+ "loss": 0.0001,
1321
+ "step": 354
1322
+ },
1323
+ {
1324
+ "epoch": 0.13814261721568413,
1325
+ "grad_norm": 0.2993152141571045,
1326
+ "learning_rate": 9.906615991936781e-06,
1327
+ "loss": 0.0005,
1328
+ "step": 356
1329
+ },
1330
+ {
1331
+ "epoch": 0.13891869933487336,
1332
+ "grad_norm": 0.016617566347122192,
1333
+ "learning_rate": 9.905391157782897e-06,
1334
+ "loss": 0.0001,
1335
+ "step": 358
1336
+ },
1337
+ {
1338
+ "epoch": 0.1396947814540626,
1339
+ "grad_norm": 0.10565357655286789,
1340
+ "learning_rate": 9.904158420065923e-06,
1341
+ "loss": 0.0001,
1342
+ "step": 360
1343
+ },
1344
+ {
1345
+ "epoch": 0.1396947814540626,
1346
+ "eval_accuracy": 0.9999483411112575,
1347
+ "eval_loss": 0.0001308279752265662,
1348
+ "eval_runtime": 125.426,
1349
+ "eval_samples_per_second": 39.864,
1350
+ "eval_steps_per_second": 9.966,
1351
+ "step": 360
1352
+ },
1353
+ {
1354
+ "epoch": 0.14047086357325184,
1355
+ "grad_norm": 0.19931581616401672,
1356
+ "learning_rate": 9.902917780772043e-06,
1357
+ "loss": 0.0002,
1358
+ "step": 362
1359
+ },
1360
+ {
1361
+ "epoch": 0.1412469456924411,
1362
+ "grad_norm": 0.016386395320296288,
1363
+ "learning_rate": 9.901669241900178e-06,
1364
+ "loss": 0.0,
1365
+ "step": 364
1366
+ },
1367
+ {
1368
+ "epoch": 0.14202302781163031,
1369
+ "grad_norm": 0.03343227878212929,
1370
+ "learning_rate": 9.900412805461968e-06,
1371
+ "loss": 0.0,
1372
+ "step": 366
1373
+ },
1374
+ {
1375
+ "epoch": 0.14279910993081957,
1376
+ "grad_norm": 0.007820719853043556,
1377
+ "learning_rate": 9.899148473481786e-06,
1378
+ "loss": 0.0,
1379
+ "step": 368
1380
+ },
1381
+ {
1382
+ "epoch": 0.1435751920500088,
1383
+ "grad_norm": 0.006656871177256107,
1384
+ "learning_rate": 9.89787624799672e-06,
1385
+ "loss": 0.0,
1386
+ "step": 370
1387
+ },
1388
+ {
1389
+ "epoch": 0.14435127416919802,
1390
+ "grad_norm": 0.008290220983326435,
1391
+ "learning_rate": 9.896596131056583e-06,
1392
+ "loss": 0.0,
1393
+ "step": 372
1394
+ },
1395
+ {
1396
+ "epoch": 0.14512735628838727,
1397
+ "grad_norm": 0.006194319576025009,
1398
+ "learning_rate": 9.895308124723897e-06,
1399
+ "loss": 0.0,
1400
+ "step": 374
1401
+ },
1402
+ {
1403
+ "epoch": 0.1459034384075765,
1404
+ "grad_norm": 0.008603195659816265,
1405
+ "learning_rate": 9.894012231073895e-06,
1406
+ "loss": 0.0,
1407
+ "step": 376
1408
+ },
1409
+ {
1410
+ "epoch": 0.14667952052676575,
1411
+ "grad_norm": 0.023244811221957207,
1412
+ "learning_rate": 9.892708452194522e-06,
1413
+ "loss": 0.0001,
1414
+ "step": 378
1415
+ },
1416
+ {
1417
+ "epoch": 0.14745560264595498,
1418
+ "grad_norm": 0.005203426815569401,
1419
+ "learning_rate": 9.891396790186424e-06,
1420
+ "loss": 0.0,
1421
+ "step": 380
1422
+ },
1423
+ {
1424
+ "epoch": 0.1482316847651442,
1425
+ "grad_norm": 0.0015924626495689154,
1426
+ "learning_rate": 9.890077247162951e-06,
1427
+ "loss": 0.0,
1428
+ "step": 382
1429
+ },
1430
+ {
1431
+ "epoch": 0.14900776688433345,
1432
+ "grad_norm": 0.002123458543792367,
1433
+ "learning_rate": 9.888749825250151e-06,
1434
+ "loss": 0.0,
1435
+ "step": 384
1436
+ },
1437
+ {
1438
+ "epoch": 0.14978384900352268,
1439
+ "grad_norm": 0.0028824363835155964,
1440
+ "learning_rate": 9.887414526586764e-06,
1441
+ "loss": 0.0,
1442
+ "step": 386
1443
+ },
1444
+ {
1445
+ "epoch": 0.15055993112271193,
1446
+ "grad_norm": 0.002307540737092495,
1447
+ "learning_rate": 9.886071353324223e-06,
1448
+ "loss": 0.0,
1449
+ "step": 388
1450
+ },
1451
+ {
1452
+ "epoch": 0.15133601324190116,
1453
+ "grad_norm": 0.0008634248515591025,
1454
+ "learning_rate": 9.884720307626647e-06,
1455
+ "loss": 0.0,
1456
+ "step": 390
1457
+ },
1458
+ {
1459
+ "epoch": 0.15211209536109038,
1460
+ "grad_norm": 0.005232020281255245,
1461
+ "learning_rate": 9.883361391670841e-06,
1462
+ "loss": 0.0,
1463
+ "step": 392
1464
+ },
1465
+ {
1466
+ "epoch": 0.15288817748027964,
1467
+ "grad_norm": 0.0016856775619089603,
1468
+ "learning_rate": 9.881994607646288e-06,
1469
+ "loss": 0.0,
1470
+ "step": 394
1471
+ },
1472
+ {
1473
+ "epoch": 0.15366425959946886,
1474
+ "grad_norm": 0.07297682762145996,
1475
+ "learning_rate": 9.880619957755151e-06,
1476
+ "loss": 0.0002,
1477
+ "step": 396
1478
+ },
1479
+ {
1480
+ "epoch": 0.15444034171865811,
1481
+ "grad_norm": 0.017098629847168922,
1482
+ "learning_rate": 9.879237444212265e-06,
1483
+ "loss": 0.0,
1484
+ "step": 398
1485
+ },
1486
+ {
1487
+ "epoch": 0.15521642383784734,
1488
+ "grad_norm": 0.0019075347809121013,
1489
+ "learning_rate": 9.877847069245134e-06,
1490
+ "loss": 0.0,
1491
+ "step": 400
1492
+ },
1493
+ {
1494
+ "epoch": 0.15521642383784734,
1495
+ "eval_accuracy": 1.0,
1496
+ "eval_loss": 6.299953383859247e-06,
1497
+ "eval_runtime": 127.1041,
1498
+ "eval_samples_per_second": 39.338,
1499
+ "eval_steps_per_second": 9.834,
1500
+ "step": 400
1501
+ }
1502
+ ],
1503
+ "logging_steps": 2,
1504
+ "max_steps": 5000,
1505
+ "num_input_tokens_seen": 0,
1506
+ "num_train_epochs": 2,
1507
+ "save_steps": 40,
1508
+ "stateful_callbacks": {
1509
+ "TrainerControl": {
1510
+ "args": {
1511
+ "should_epoch_stop": false,
1512
+ "should_evaluate": false,
1513
+ "should_log": false,
1514
+ "should_save": true,
1515
+ "should_training_stop": false
1516
+ },
1517
+ "attributes": {}
1518
+ }
1519
+ },
1520
+ "total_flos": 2.902886523894497e+17,
1521
+ "train_batch_size": 1,
1522
+ "trial_name": null,
1523
+ "trial_params": null
1524
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1a910710b31f99b947273772abd52b7aad579803a1185a4498ccc66e82ef7a0
3
+ size 6161
vocab.json ADDED
The diff for this file is too large to render. See raw diff