ellendagher commited on
Commit
d11af70
·
verified ·
1 Parent(s): 5f52254

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -5,13 +5,13 @@
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 151643,
8
- "eos_token_id": 151645,
9
  "head_dim": 128,
10
  "hidden_act": "silu",
11
  "hidden_size": 1024,
12
  "initializer_range": 0.02,
13
  "intermediate_size": 3072,
14
- "max_position_embeddings": 40960,
15
  "max_window_layers": 28,
16
  "model_type": "qwen3",
17
  "num_attention_heads": 16,
 
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
  "head_dim": 128,
10
  "hidden_act": "silu",
11
  "hidden_size": 1024,
12
  "initializer_range": 0.02,
13
  "intermediate_size": 3072,
14
+ "max_position_embeddings": 32768,
15
  "max_window_layers": 28,
16
  "model_type": "qwen3",
17
  "num_attention_heads": 16,
generation_config.json CHANGED
@@ -1,13 +1,6 @@
1
  {
2
  "bos_token_id": 151643,
3
- "do_sample": true,
4
- "eos_token_id": [
5
- 151645,
6
- 151643
7
- ],
8
- "pad_token_id": 151643,
9
- "temperature": 0.6,
10
- "top_k": 20,
11
- "top_p": 0.95,
12
  "transformers_version": "4.51.3"
13
  }
 
1
  {
2
  "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
 
 
 
 
 
 
 
5
  "transformers_version": "4.51.3"
6
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6feec19be08a1cb6bdaaeb1ef10313b0ff034b95ddd1cfb172570ba0a9d5f44
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f69768e0b8c9a65816aabb418e88789b5f81b7e044131ed25080a4ea179809ea
3
  size 2384234968
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d38ba7f28d2ff254e5b07c7271ab61ac92dc9e13aff554bfc0ea905d47468c7a
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55de7658973251ff4bc714dc23c0fd39bfb4c6134b24943293a4e73605048c0a
3
  size 4768663315
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7063580a565cb4ab0c1d36b25d817a35a16d1f21f4a993a9f25cdba6efadcb9d
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de69a2834426ff9ef8199d077e00892579278af31d8969d77f98235b5cfc010a
3
  size 14645
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41b34dcd0cff02c6ffc7608e693c2a196add41da5739504686225043c01447c4
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9094c17298642f38cf309a4c458bf716435118fd82328f2994b59befd5513506
3
  size 1383
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5cb65c15636c55bd9e5006387a80459708278556b496f4197710f6f74c0b6424
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:803e3d3a19f8999f7f46a9f3874e90b8b99ec2637ef8ca8cb769d1aa65c7cb2c
3
  size 1465
special_tokens_map.json CHANGED
@@ -15,7 +15,7 @@
15
  "<|video_pad|>"
16
  ],
17
  "eos_token": {
18
- "content": "<|im_end|>",
19
  "lstrip": false,
20
  "normalized": false,
21
  "rstrip": false,
 
15
  "<|video_pad|>"
16
  ],
17
  "eos_token": {
18
+ "content": "<|endoftext|>",
19
  "lstrip": false,
20
  "normalized": false,
21
  "rstrip": false,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08a6b43b5f875be3dd0696ffbcfdcc45541445e127f83a2bd28f4ebdbdb60340
3
- size 11422750
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer_config.json CHANGED
@@ -227,9 +227,9 @@
227
  "<|video_pad|>"
228
  ],
229
  "bos_token": null,
230
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
  "clean_up_tokenization_spaces": false,
232
- "eos_token": "<|im_end|>",
233
  "errors": "replace",
234
  "extra_special_tokens": {},
235
  "model_max_length": 131072,
 
227
  "<|video_pad|>"
228
  ],
229
  "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
  "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|endoftext|>",
233
  "errors": "replace",
234
  "extra_special_tokens": {},
235
  "model_max_length": 131072,
trainer_state.json CHANGED
@@ -1,1421 +1,167 @@
1
  {
2
- "best_global_step": 9908,
3
- "best_metric": 1.7954870462417603,
4
- "best_model_checkpoint": "./mcqa_qwen3_letter_final/checkpoint-9908",
5
- "epoch": 2.0,
6
  "eval_steps": 500,
7
- "global_step": 9908,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.010092854259184497,
14
- "grad_norm": 31.45309066772461,
15
- "learning_rate": 9.081735620585268e-07,
16
- "loss": 2.6792,
17
  "step": 50
18
  },
19
  {
20
- "epoch": 0.020185708518368994,
21
- "grad_norm": 29.842939376831055,
22
- "learning_rate": 1.9172552976791123e-06,
23
- "loss": 2.3336,
24
  "step": 100
25
  },
26
  {
27
- "epoch": 0.030278562777553492,
28
- "grad_norm": 25.984067916870117,
29
- "learning_rate": 2.926337033299698e-06,
30
- "loss": 2.1298,
31
  "step": 150
32
  },
33
  {
34
- "epoch": 0.04037141703673799,
35
- "grad_norm": 21.91876983642578,
36
- "learning_rate": 3.935418768920283e-06,
37
- "loss": 2.0582,
38
  "step": 200
39
  },
40
  {
41
- "epoch": 0.050464271295922486,
42
- "grad_norm": 18.466697692871094,
43
- "learning_rate": 4.944500504540869e-06,
44
- "loss": 1.9876,
45
  "step": 250
46
  },
47
  {
48
- "epoch": 0.060557125555106985,
49
- "grad_norm": 23.480918884277344,
50
- "learning_rate": 5.953582240161454e-06,
51
- "loss": 1.9532,
52
  "step": 300
53
  },
54
  {
55
- "epoch": 0.07064997981429148,
56
- "grad_norm": 20.459003448486328,
57
- "learning_rate": 6.962663975782039e-06,
58
- "loss": 1.9739,
59
  "step": 350
60
  },
61
  {
62
- "epoch": 0.08074283407347597,
63
- "grad_norm": 21.53769302368164,
64
- "learning_rate": 7.971745711402625e-06,
65
- "loss": 1.9591,
66
  "step": 400
67
  },
68
  {
69
- "epoch": 0.09083568833266048,
70
- "grad_norm": 19.838491439819336,
71
- "learning_rate": 8.98082744702321e-06,
72
- "loss": 1.9862,
73
  "step": 450
74
  },
75
  {
76
- "epoch": 0.10092854259184497,
77
- "grad_norm": 14.181418418884277,
78
- "learning_rate": 9.989909182643795e-06,
79
- "loss": 1.9422,
80
  "step": 500
81
  },
82
  {
83
- "epoch": 0.11102139685102948,
84
- "grad_norm": 17.275489807128906,
85
- "learning_rate": 1.0998990918264381e-05,
86
- "loss": 1.9537,
87
  "step": 550
88
  },
89
  {
90
- "epoch": 0.12111425111021397,
91
- "grad_norm": 23.366567611694336,
92
- "learning_rate": 1.2008072653884966e-05,
93
- "loss": 1.9305,
94
  "step": 600
95
  },
96
  {
97
- "epoch": 0.13120710536939847,
98
- "grad_norm": 17.37462043762207,
99
- "learning_rate": 1.301715438950555e-05,
100
- "loss": 1.9227,
101
  "step": 650
102
  },
103
  {
104
- "epoch": 0.14129995962858297,
105
- "grad_norm": 18.20924186706543,
106
- "learning_rate": 1.4026236125126136e-05,
107
- "loss": 1.9499,
108
  "step": 700
109
  },
110
  {
111
- "epoch": 0.15139281388776746,
112
- "grad_norm": 15.593676567077637,
113
- "learning_rate": 1.503531786074672e-05,
114
- "loss": 1.9175,
115
  "step": 750
116
  },
117
  {
118
- "epoch": 0.16148566814695195,
119
- "grad_norm": 16.923185348510742,
120
- "learning_rate": 1.6044399596367305e-05,
121
- "loss": 1.9491,
122
  "step": 800
123
  },
124
  {
125
- "epoch": 0.17157852240613647,
126
- "grad_norm": 16.0386905670166,
127
- "learning_rate": 1.7053481331987892e-05,
128
- "loss": 1.9246,
129
  "step": 850
130
  },
131
  {
132
- "epoch": 0.18167137666532096,
133
- "grad_norm": 15.55115032196045,
134
- "learning_rate": 1.806256306760848e-05,
135
- "loss": 1.9162,
136
  "step": 900
137
  },
138
  {
139
- "epoch": 0.19176423092450545,
140
- "grad_norm": 19.219303131103516,
141
- "learning_rate": 1.9071644803229064e-05,
142
- "loss": 1.9408,
143
  "step": 950
144
  },
145
  {
146
- "epoch": 0.20185708518368994,
147
- "grad_norm": 22.23096466064453,
148
- "learning_rate": 1.999102837277111e-05,
149
- "loss": 1.9427,
150
  "step": 1000
151
  },
152
  {
153
- "epoch": 0.21194993944287444,
154
- "grad_norm": 14.07517147064209,
155
- "learning_rate": 1.9878883032410005e-05,
156
- "loss": 1.9638,
157
- "step": 1050
158
- },
159
- {
160
- "epoch": 0.22204279370205895,
161
- "grad_norm": 17.114988327026367,
162
- "learning_rate": 1.9766737692048896e-05,
163
- "loss": 1.9503,
164
- "step": 1100
165
- },
166
- {
167
- "epoch": 0.23213564796124345,
168
- "grad_norm": 14.243387222290039,
169
- "learning_rate": 1.965459235168779e-05,
170
- "loss": 1.9495,
171
- "step": 1150
172
- },
173
- {
174
- "epoch": 0.24222850222042794,
175
- "grad_norm": 12.465641021728516,
176
- "learning_rate": 1.954244701132668e-05,
177
- "loss": 1.9259,
178
- "step": 1200
179
- },
180
- {
181
- "epoch": 0.25232135647961246,
182
- "grad_norm": 11.812984466552734,
183
- "learning_rate": 1.9430301670965574e-05,
184
- "loss": 1.9725,
185
- "step": 1250
186
- },
187
- {
188
- "epoch": 0.26241421073879695,
189
- "grad_norm": 12.729921340942383,
190
- "learning_rate": 1.9318156330604464e-05,
191
- "loss": 1.9252,
192
- "step": 1300
193
- },
194
- {
195
- "epoch": 0.27250706499798144,
196
- "grad_norm": 15.226152420043945,
197
- "learning_rate": 1.920601099024336e-05,
198
- "loss": 1.8983,
199
- "step": 1350
200
- },
201
- {
202
- "epoch": 0.28259991925716593,
203
- "grad_norm": 11.439791679382324,
204
- "learning_rate": 1.909386564988225e-05,
205
- "loss": 1.9063,
206
- "step": 1400
207
- },
208
- {
209
- "epoch": 0.2926927735163504,
210
- "grad_norm": 19.441160202026367,
211
- "learning_rate": 1.8981720309521143e-05,
212
- "loss": 1.9032,
213
- "step": 1450
214
- },
215
- {
216
- "epoch": 0.3027856277755349,
217
- "grad_norm": 13.702200889587402,
218
- "learning_rate": 1.8869574969160033e-05,
219
- "loss": 1.886,
220
- "step": 1500
221
- },
222
- {
223
- "epoch": 0.3128784820347194,
224
- "grad_norm": 13.252537727355957,
225
- "learning_rate": 1.8757429628798924e-05,
226
- "loss": 1.938,
227
- "step": 1550
228
- },
229
- {
230
- "epoch": 0.3229713362939039,
231
- "grad_norm": 13.876880645751953,
232
- "learning_rate": 1.8645284288437818e-05,
233
- "loss": 1.7923,
234
- "step": 1600
235
- },
236
- {
237
- "epoch": 0.3330641905530884,
238
- "grad_norm": 13.790823936462402,
239
- "learning_rate": 1.853313894807671e-05,
240
- "loss": 1.8618,
241
- "step": 1650
242
- },
243
- {
244
- "epoch": 0.34315704481227294,
245
- "grad_norm": 12.250268936157227,
246
- "learning_rate": 1.8420993607715602e-05,
247
- "loss": 1.8595,
248
- "step": 1700
249
- },
250
- {
251
- "epoch": 0.35324989907145743,
252
- "grad_norm": 11.78331470489502,
253
- "learning_rate": 1.8308848267354492e-05,
254
- "loss": 1.8957,
255
- "step": 1750
256
- },
257
- {
258
- "epoch": 0.3633427533306419,
259
- "grad_norm": 13.84703540802002,
260
- "learning_rate": 1.8196702926993386e-05,
261
- "loss": 1.8921,
262
- "step": 1800
263
- },
264
- {
265
- "epoch": 0.3734356075898264,
266
- "grad_norm": 16.251209259033203,
267
- "learning_rate": 1.8084557586632277e-05,
268
- "loss": 1.8639,
269
- "step": 1850
270
- },
271
- {
272
- "epoch": 0.3835284618490109,
273
- "grad_norm": 10.786506652832031,
274
- "learning_rate": 1.797241224627117e-05,
275
- "loss": 1.9083,
276
- "step": 1900
277
- },
278
- {
279
- "epoch": 0.3936213161081954,
280
- "grad_norm": 18.62939453125,
281
- "learning_rate": 1.786026690591006e-05,
282
- "loss": 1.8543,
283
- "step": 1950
284
- },
285
- {
286
- "epoch": 0.4037141703673799,
287
- "grad_norm": 12.85810661315918,
288
- "learning_rate": 1.774812156554895e-05,
289
- "loss": 1.8541,
290
- "step": 2000
291
- },
292
- {
293
- "epoch": 0.4138070246265644,
294
- "grad_norm": 15.795912742614746,
295
- "learning_rate": 1.7635976225187845e-05,
296
- "loss": 1.8663,
297
- "step": 2050
298
- },
299
- {
300
- "epoch": 0.42389987888574887,
301
- "grad_norm": 20.85138702392578,
302
- "learning_rate": 1.7523830884826736e-05,
303
- "loss": 1.8814,
304
- "step": 2100
305
- },
306
- {
307
- "epoch": 0.43399273314493336,
308
- "grad_norm": 13.458274841308594,
309
- "learning_rate": 1.741168554446563e-05,
310
- "loss": 1.8972,
311
- "step": 2150
312
- },
313
- {
314
- "epoch": 0.4440855874041179,
315
- "grad_norm": 13.698914527893066,
316
- "learning_rate": 1.729954020410452e-05,
317
- "loss": 1.8952,
318
- "step": 2200
319
- },
320
- {
321
- "epoch": 0.4541784416633024,
322
- "grad_norm": 15.174164772033691,
323
- "learning_rate": 1.718739486374341e-05,
324
- "loss": 1.8839,
325
- "step": 2250
326
- },
327
- {
328
- "epoch": 0.4642712959224869,
329
- "grad_norm": 11.87475299835205,
330
- "learning_rate": 1.7075249523382305e-05,
331
- "loss": 1.8756,
332
- "step": 2300
333
- },
334
- {
335
- "epoch": 0.4743641501816714,
336
- "grad_norm": 11.978715896606445,
337
- "learning_rate": 1.696534708982842e-05,
338
- "loss": 1.8022,
339
- "step": 2350
340
- },
341
- {
342
- "epoch": 0.4844570044408559,
343
- "grad_norm": 8.314435958862305,
344
- "learning_rate": 1.685320174946731e-05,
345
- "loss": 1.8306,
346
- "step": 2400
347
- },
348
- {
349
- "epoch": 0.49454985870004037,
350
- "grad_norm": 12.93226146697998,
351
- "learning_rate": 1.67410564091062e-05,
352
- "loss": 1.8749,
353
- "step": 2450
354
- },
355
- {
356
- "epoch": 0.5046427129592249,
357
- "grad_norm": 11.91203498840332,
358
- "learning_rate": 1.6628911068745095e-05,
359
- "loss": 1.843,
360
- "step": 2500
361
- },
362
- {
363
- "epoch": 0.5147355672184094,
364
- "grad_norm": 12.928333282470703,
365
- "learning_rate": 1.651676572838399e-05,
366
- "loss": 1.8897,
367
- "step": 2550
368
- },
369
- {
370
- "epoch": 0.5248284214775939,
371
- "grad_norm": 11.979988098144531,
372
- "learning_rate": 1.640462038802288e-05,
373
- "loss": 1.8362,
374
- "step": 2600
375
- },
376
- {
377
- "epoch": 0.5349212757367784,
378
- "grad_norm": 13.687814712524414,
379
- "learning_rate": 1.629247504766177e-05,
380
- "loss": 1.8539,
381
- "step": 2650
382
- },
383
- {
384
- "epoch": 0.5450141299959629,
385
- "grad_norm": 13.337515830993652,
386
- "learning_rate": 1.6180329707300664e-05,
387
- "loss": 1.8721,
388
- "step": 2700
389
- },
390
- {
391
- "epoch": 0.5551069842551474,
392
- "grad_norm": 11.664098739624023,
393
- "learning_rate": 1.6068184366939554e-05,
394
- "loss": 1.8677,
395
- "step": 2750
396
- },
397
- {
398
- "epoch": 0.5651998385143319,
399
- "grad_norm": 11.584458351135254,
400
- "learning_rate": 1.5956039026578448e-05,
401
- "loss": 1.8326,
402
- "step": 2800
403
- },
404
- {
405
- "epoch": 0.5752926927735164,
406
- "grad_norm": 12.478504180908203,
407
- "learning_rate": 1.584389368621734e-05,
408
- "loss": 1.8089,
409
- "step": 2850
410
- },
411
- {
412
- "epoch": 0.5853855470327008,
413
- "grad_norm": 9.926409721374512,
414
- "learning_rate": 1.573174834585623e-05,
415
- "loss": 1.8459,
416
- "step": 2900
417
- },
418
- {
419
- "epoch": 0.5954784012918853,
420
- "grad_norm": 10.55431842803955,
421
- "learning_rate": 1.5619603005495123e-05,
422
- "loss": 1.7877,
423
- "step": 2950
424
- },
425
- {
426
- "epoch": 0.6055712555510698,
427
- "grad_norm": 10.03345775604248,
428
- "learning_rate": 1.5507457665134017e-05,
429
- "loss": 1.7983,
430
- "step": 3000
431
- },
432
- {
433
- "epoch": 0.6156641098102543,
434
- "grad_norm": 18.028541564941406,
435
- "learning_rate": 1.5395312324772907e-05,
436
- "loss": 1.835,
437
- "step": 3050
438
- },
439
- {
440
- "epoch": 0.6257569640694388,
441
- "grad_norm": 16.779207229614258,
442
- "learning_rate": 1.5283166984411798e-05,
443
- "loss": 1.7876,
444
- "step": 3100
445
- },
446
- {
447
- "epoch": 0.6358498183286233,
448
- "grad_norm": 10.918420791625977,
449
- "learning_rate": 1.5171021644050692e-05,
450
- "loss": 1.8199,
451
- "step": 3150
452
- },
453
- {
454
- "epoch": 0.6459426725878078,
455
- "grad_norm": 14.741921424865723,
456
- "learning_rate": 1.5058876303689582e-05,
457
- "loss": 1.8695,
458
- "step": 3200
459
- },
460
- {
461
- "epoch": 0.6560355268469923,
462
- "grad_norm": 14.065166473388672,
463
- "learning_rate": 1.4946730963328474e-05,
464
- "loss": 1.8596,
465
- "step": 3250
466
- },
467
- {
468
- "epoch": 0.6661283811061768,
469
- "grad_norm": 9.158289909362793,
470
- "learning_rate": 1.4834585622967368e-05,
471
- "loss": 1.7999,
472
- "step": 3300
473
- },
474
- {
475
- "epoch": 0.6762212353653613,
476
- "grad_norm": 13.310529708862305,
477
- "learning_rate": 1.4722440282606259e-05,
478
- "loss": 1.8628,
479
- "step": 3350
480
- },
481
- {
482
- "epoch": 0.6863140896245459,
483
- "grad_norm": 16.720178604125977,
484
- "learning_rate": 1.4610294942245151e-05,
485
- "loss": 1.8143,
486
- "step": 3400
487
- },
488
- {
489
- "epoch": 0.6964069438837304,
490
- "grad_norm": 10.033896446228027,
491
- "learning_rate": 1.4498149601884043e-05,
492
- "loss": 1.8191,
493
- "step": 3450
494
- },
495
- {
496
- "epoch": 0.7064997981429149,
497
- "grad_norm": 10.272814750671387,
498
- "learning_rate": 1.4386004261522934e-05,
499
- "loss": 1.8152,
500
- "step": 3500
501
- },
502
- {
503
- "epoch": 0.7165926524020994,
504
- "grad_norm": 12.844585418701172,
505
- "learning_rate": 1.4273858921161828e-05,
506
- "loss": 1.7685,
507
- "step": 3550
508
- },
509
- {
510
- "epoch": 0.7266855066612838,
511
- "grad_norm": 9.883562088012695,
512
- "learning_rate": 1.4161713580800718e-05,
513
- "loss": 1.775,
514
- "step": 3600
515
- },
516
- {
517
- "epoch": 0.7367783609204683,
518
- "grad_norm": 13.27017593383789,
519
- "learning_rate": 1.404956824043961e-05,
520
- "loss": 1.8214,
521
- "step": 3650
522
- },
523
- {
524
- "epoch": 0.7468712151796528,
525
- "grad_norm": 9.754312515258789,
526
- "learning_rate": 1.3937422900078504e-05,
527
- "loss": 1.8177,
528
- "step": 3700
529
- },
530
- {
531
- "epoch": 0.7569640694388373,
532
- "grad_norm": 9.161224365234375,
533
- "learning_rate": 1.3825277559717395e-05,
534
- "loss": 1.7826,
535
- "step": 3750
536
- },
537
- {
538
- "epoch": 0.7670569236980218,
539
- "grad_norm": 15.469295501708984,
540
- "learning_rate": 1.3713132219356287e-05,
541
- "loss": 1.7669,
542
- "step": 3800
543
- },
544
- {
545
- "epoch": 0.7771497779572063,
546
- "grad_norm": 9.38441276550293,
547
- "learning_rate": 1.3600986878995179e-05,
548
- "loss": 1.694,
549
- "step": 3850
550
- },
551
- {
552
- "epoch": 0.7872426322163908,
553
- "grad_norm": 9.595329284667969,
554
- "learning_rate": 1.348884153863407e-05,
555
- "loss": 1.7962,
556
- "step": 3900
557
- },
558
- {
559
- "epoch": 0.7973354864755753,
560
- "grad_norm": 13.450725555419922,
561
- "learning_rate": 1.3376696198272963e-05,
562
- "loss": 1.834,
563
- "step": 3950
564
- },
565
- {
566
- "epoch": 0.8074283407347598,
567
- "grad_norm": 10.953201293945312,
568
- "learning_rate": 1.3264550857911855e-05,
569
- "loss": 1.7788,
570
- "step": 4000
571
- },
572
- {
573
- "epoch": 0.8175211949939443,
574
- "grad_norm": 11.066990852355957,
575
- "learning_rate": 1.3152405517550746e-05,
576
- "loss": 1.819,
577
- "step": 4050
578
- },
579
- {
580
- "epoch": 0.8276140492531288,
581
- "grad_norm": 10.758559226989746,
582
- "learning_rate": 1.3040260177189638e-05,
583
- "loss": 1.7824,
584
- "step": 4100
585
- },
586
- {
587
- "epoch": 0.8377069035123133,
588
- "grad_norm": 11.183074951171875,
589
- "learning_rate": 1.2928114836828532e-05,
590
- "loss": 1.8395,
591
- "step": 4150
592
- },
593
- {
594
- "epoch": 0.8477997577714977,
595
- "grad_norm": 18.876482009887695,
596
- "learning_rate": 1.2815969496467423e-05,
597
- "loss": 1.8085,
598
- "step": 4200
599
- },
600
- {
601
- "epoch": 0.8578926120306822,
602
- "grad_norm": 9.296488761901855,
603
- "learning_rate": 1.2703824156106315e-05,
604
- "loss": 1.7582,
605
- "step": 4250
606
- },
607
- {
608
- "epoch": 0.8679854662898667,
609
- "grad_norm": 10.75462532043457,
610
- "learning_rate": 1.2591678815745207e-05,
611
- "loss": 1.756,
612
- "step": 4300
613
- },
614
- {
615
- "epoch": 0.8780783205490512,
616
- "grad_norm": 9.275300025939941,
617
- "learning_rate": 1.2479533475384097e-05,
618
- "loss": 1.776,
619
- "step": 4350
620
- },
621
- {
622
- "epoch": 0.8881711748082358,
623
- "grad_norm": 15.259178161621094,
624
- "learning_rate": 1.2367388135022991e-05,
625
- "loss": 1.7603,
626
- "step": 4400
627
- },
628
- {
629
- "epoch": 0.8982640290674203,
630
- "grad_norm": 10.773064613342285,
631
- "learning_rate": 1.2255242794661883e-05,
632
- "loss": 1.7885,
633
- "step": 4450
634
- },
635
- {
636
- "epoch": 0.9083568833266048,
637
- "grad_norm": 8.615636825561523,
638
- "learning_rate": 1.2143097454300774e-05,
639
- "loss": 1.7873,
640
- "step": 4500
641
- },
642
- {
643
- "epoch": 0.9184497375857893,
644
- "grad_norm": 11.73543643951416,
645
- "learning_rate": 1.2030952113939666e-05,
646
- "loss": 1.782,
647
- "step": 4550
648
- },
649
- {
650
- "epoch": 0.9285425918449738,
651
- "grad_norm": 11.97884750366211,
652
- "learning_rate": 1.191880677357856e-05,
653
- "loss": 1.7151,
654
- "step": 4600
655
- },
656
- {
657
- "epoch": 0.9386354461041583,
658
- "grad_norm": 9.290508270263672,
659
- "learning_rate": 1.180666143321745e-05,
660
- "loss": 1.7588,
661
- "step": 4650
662
- },
663
- {
664
- "epoch": 0.9487283003633428,
665
- "grad_norm": 7.962852478027344,
666
- "learning_rate": 1.1694516092856343e-05,
667
- "loss": 1.7455,
668
- "step": 4700
669
- },
670
- {
671
- "epoch": 0.9588211546225273,
672
- "grad_norm": 10.642402648925781,
673
- "learning_rate": 1.1582370752495235e-05,
674
- "loss": 1.8426,
675
- "step": 4750
676
- },
677
- {
678
- "epoch": 0.9689140088817118,
679
- "grad_norm": 9.8108491897583,
680
- "learning_rate": 1.1470225412134125e-05,
681
- "loss": 1.7877,
682
- "step": 4800
683
- },
684
- {
685
- "epoch": 0.9790068631408962,
686
- "grad_norm": 12.963693618774414,
687
- "learning_rate": 1.135808007177302e-05,
688
- "loss": 1.782,
689
- "step": 4850
690
- },
691
- {
692
- "epoch": 0.9890997174000807,
693
- "grad_norm": 12.592732429504395,
694
- "learning_rate": 1.1245934731411911e-05,
695
- "loss": 1.7852,
696
- "step": 4900
697
- },
698
- {
699
- "epoch": 0.9991925716592652,
700
- "grad_norm": 15.016729354858398,
701
- "learning_rate": 1.1133789391050802e-05,
702
- "loss": 1.7959,
703
- "step": 4950
704
- },
705
- {
706
- "epoch": 1.0,
707
- "eval_loss": 1.8044700622558594,
708
- "eval_runtime": 226.5642,
709
- "eval_samples_per_second": 16.397,
710
- "eval_steps_per_second": 2.052,
711
- "step": 4954
712
- },
713
- {
714
- "epoch": 1.0092854259184498,
715
- "grad_norm": 12.544588088989258,
716
- "learning_rate": 1.1021644050689696e-05,
717
- "loss": 1.3976,
718
- "step": 5000
719
- },
720
- {
721
- "epoch": 1.0193782801776343,
722
- "grad_norm": 15.890256881713867,
723
- "learning_rate": 1.0909498710328588e-05,
724
- "loss": 1.3058,
725
- "step": 5050
726
- },
727
- {
728
- "epoch": 1.0294711344368188,
729
- "grad_norm": 12.59524154663086,
730
- "learning_rate": 1.0797353369967478e-05,
731
- "loss": 1.3811,
732
- "step": 5100
733
- },
734
- {
735
- "epoch": 1.0395639886960033,
736
- "grad_norm": 11.901808738708496,
737
- "learning_rate": 1.0687450936413592e-05,
738
- "loss": 1.3774,
739
- "step": 5150
740
- },
741
- {
742
- "epoch": 1.0496568429551878,
743
- "grad_norm": 9.971845626831055,
744
- "learning_rate": 1.0575305596052484e-05,
745
- "loss": 1.3099,
746
- "step": 5200
747
- },
748
- {
749
- "epoch": 1.0597496972143723,
750
- "grad_norm": 12.993363380432129,
751
- "learning_rate": 1.0463160255691377e-05,
752
- "loss": 1.3509,
753
- "step": 5250
754
- },
755
- {
756
- "epoch": 1.0698425514735568,
757
- "grad_norm": 11.694303512573242,
758
- "learning_rate": 1.0351014915330269e-05,
759
- "loss": 1.3512,
760
- "step": 5300
761
- },
762
- {
763
- "epoch": 1.0799354057327413,
764
- "grad_norm": 14.644658088684082,
765
- "learning_rate": 1.0238869574969161e-05,
766
- "loss": 1.3742,
767
- "step": 5350
768
- },
769
- {
770
- "epoch": 1.0900282599919258,
771
- "grad_norm": 14.014612197875977,
772
- "learning_rate": 1.0126724234608051e-05,
773
- "loss": 1.3546,
774
- "step": 5400
775
- },
776
- {
777
- "epoch": 1.1001211142511103,
778
- "grad_norm": 11.02804946899414,
779
- "learning_rate": 1.0014578894246945e-05,
780
- "loss": 1.3341,
781
- "step": 5450
782
- },
783
- {
784
- "epoch": 1.1102139685102947,
785
- "grad_norm": 11.991540908813477,
786
- "learning_rate": 9.902433553885838e-06,
787
- "loss": 1.3558,
788
- "step": 5500
789
- },
790
- {
791
- "epoch": 1.1203068227694792,
792
- "grad_norm": 13.833789825439453,
793
- "learning_rate": 9.79028821352473e-06,
794
- "loss": 1.3334,
795
- "step": 5550
796
- },
797
- {
798
- "epoch": 1.1303996770286637,
799
- "grad_norm": 13.477495193481445,
800
- "learning_rate": 9.67814287316362e-06,
801
- "loss": 1.29,
802
- "step": 5600
803
- },
804
- {
805
- "epoch": 1.1404925312878482,
806
- "grad_norm": 16.3387508392334,
807
- "learning_rate": 9.565997532802512e-06,
808
- "loss": 1.3758,
809
- "step": 5650
810
- },
811
- {
812
- "epoch": 1.1505853855470327,
813
- "grad_norm": 10.865882873535156,
814
- "learning_rate": 9.453852192441405e-06,
815
- "loss": 1.3259,
816
- "step": 5700
817
- },
818
- {
819
- "epoch": 1.1606782398062172,
820
- "grad_norm": 10.249828338623047,
821
- "learning_rate": 9.341706852080297e-06,
822
- "loss": 1.3175,
823
- "step": 5750
824
- },
825
- {
826
- "epoch": 1.1707710940654017,
827
- "grad_norm": 10.611560821533203,
828
- "learning_rate": 9.229561511719189e-06,
829
- "loss": 1.3067,
830
- "step": 5800
831
- },
832
- {
833
- "epoch": 1.1808639483245862,
834
- "grad_norm": 11.055877685546875,
835
- "learning_rate": 9.117416171358081e-06,
836
- "loss": 1.3428,
837
- "step": 5850
838
- },
839
- {
840
- "epoch": 1.1909568025837707,
841
- "grad_norm": 13.777265548706055,
842
- "learning_rate": 9.005270830996973e-06,
843
- "loss": 1.3555,
844
- "step": 5900
845
- },
846
- {
847
- "epoch": 1.2010496568429552,
848
- "grad_norm": 8.595498085021973,
849
- "learning_rate": 8.893125490635864e-06,
850
- "loss": 1.3357,
851
- "step": 5950
852
- },
853
- {
854
- "epoch": 1.2111425111021397,
855
- "grad_norm": 15.421058654785156,
856
- "learning_rate": 8.780980150274758e-06,
857
- "loss": 1.3201,
858
- "step": 6000
859
- },
860
- {
861
- "epoch": 1.2212353653613242,
862
- "grad_norm": 13.1820707321167,
863
- "learning_rate": 8.668834809913648e-06,
864
- "loss": 1.346,
865
- "step": 6050
866
- },
867
- {
868
- "epoch": 1.2313282196205086,
869
- "grad_norm": 9.598958015441895,
870
- "learning_rate": 8.55668946955254e-06,
871
- "loss": 1.3975,
872
- "step": 6100
873
- },
874
- {
875
- "epoch": 1.2414210738796931,
876
- "grad_norm": 9.70576000213623,
877
- "learning_rate": 8.444544129191433e-06,
878
- "loss": 1.3446,
879
- "step": 6150
880
- },
881
- {
882
- "epoch": 1.2515139281388776,
883
- "grad_norm": 16.419450759887695,
884
- "learning_rate": 8.332398788830325e-06,
885
- "loss": 1.3239,
886
- "step": 6200
887
- },
888
- {
889
- "epoch": 1.2616067823980621,
890
- "grad_norm": 13.559700012207031,
891
- "learning_rate": 8.220253448469217e-06,
892
- "loss": 1.3283,
893
- "step": 6250
894
- },
895
- {
896
- "epoch": 1.2716996366572466,
897
- "grad_norm": 12.93370532989502,
898
- "learning_rate": 8.108108108108109e-06,
899
- "loss": 1.373,
900
- "step": 6300
901
- },
902
- {
903
- "epoch": 1.281792490916431,
904
- "grad_norm": 11.545220375061035,
905
- "learning_rate": 7.995962767747001e-06,
906
- "loss": 1.3332,
907
- "step": 6350
908
- },
909
- {
910
- "epoch": 1.2918853451756156,
911
- "grad_norm": 14.145684242248535,
912
- "learning_rate": 7.883817427385892e-06,
913
- "loss": 1.34,
914
- "step": 6400
915
- },
916
- {
917
- "epoch": 1.3019781994348,
918
- "grad_norm": 12.836868286132812,
919
- "learning_rate": 7.771672087024786e-06,
920
- "loss": 1.341,
921
- "step": 6450
922
- },
923
- {
924
- "epoch": 1.3120710536939846,
925
- "grad_norm": 12.780885696411133,
926
- "learning_rate": 7.659526746663676e-06,
927
- "loss": 1.357,
928
- "step": 6500
929
- },
930
- {
931
- "epoch": 1.3221639079531693,
932
- "grad_norm": 12.356983184814453,
933
- "learning_rate": 7.547381406302568e-06,
934
- "loss": 1.3531,
935
- "step": 6550
936
- },
937
- {
938
- "epoch": 1.3322567622123538,
939
- "grad_norm": 9.624804496765137,
940
- "learning_rate": 7.4352360659414604e-06,
941
- "loss": 1.3193,
942
- "step": 6600
943
- },
944
- {
945
- "epoch": 1.3423496164715383,
946
- "grad_norm": 11.769197463989258,
947
- "learning_rate": 7.323090725580353e-06,
948
- "loss": 1.3279,
949
- "step": 6650
950
- },
951
- {
952
- "epoch": 1.3524424707307228,
953
- "grad_norm": 7.835779666900635,
954
- "learning_rate": 7.210945385219245e-06,
955
- "loss": 1.3125,
956
- "step": 6700
957
- },
958
- {
959
- "epoch": 1.3625353249899073,
960
- "grad_norm": 11.88305377960205,
961
- "learning_rate": 7.098800044858136e-06,
962
- "loss": 1.3217,
963
- "step": 6750
964
- },
965
- {
966
- "epoch": 1.3726281792490918,
967
- "grad_norm": 13.909214973449707,
968
- "learning_rate": 6.986654704497029e-06,
969
- "loss": 1.3698,
970
- "step": 6800
971
- },
972
- {
973
- "epoch": 1.3827210335082762,
974
- "grad_norm": 13.501673698425293,
975
- "learning_rate": 6.8745093641359205e-06,
976
- "loss": 1.3027,
977
- "step": 6850
978
- },
979
- {
980
- "epoch": 1.3928138877674607,
981
- "grad_norm": 9.617853164672852,
982
- "learning_rate": 6.762364023774813e-06,
983
- "loss": 1.3278,
984
- "step": 6900
985
- },
986
- {
987
- "epoch": 1.4029067420266452,
988
- "grad_norm": 15.783841133117676,
989
- "learning_rate": 6.650218683413705e-06,
990
- "loss": 1.2946,
991
- "step": 6950
992
- },
993
- {
994
- "epoch": 1.4129995962858297,
995
- "grad_norm": 17.270166397094727,
996
- "learning_rate": 6.538073343052597e-06,
997
- "loss": 1.3485,
998
- "step": 7000
999
- },
1000
- {
1001
- "epoch": 1.4230924505450142,
1002
- "grad_norm": 10.313908576965332,
1003
- "learning_rate": 6.425928002691488e-06,
1004
- "loss": 1.3049,
1005
- "step": 7050
1006
- },
1007
- {
1008
- "epoch": 1.4331853048041987,
1009
- "grad_norm": 14.23890495300293,
1010
- "learning_rate": 6.313782662330381e-06,
1011
- "loss": 1.3516,
1012
- "step": 7100
1013
- },
1014
- {
1015
- "epoch": 1.4432781590633832,
1016
- "grad_norm": 12.376551628112793,
1017
- "learning_rate": 6.201637321969273e-06,
1018
- "loss": 1.2714,
1019
- "step": 7150
1020
- },
1021
- {
1022
- "epoch": 1.4533710133225677,
1023
- "grad_norm": 8.100600242614746,
1024
- "learning_rate": 6.089491981608164e-06,
1025
- "loss": 1.3395,
1026
- "step": 7200
1027
- },
1028
- {
1029
- "epoch": 1.4634638675817522,
1030
- "grad_norm": 15.866579055786133,
1031
- "learning_rate": 5.977346641247057e-06,
1032
- "loss": 1.3524,
1033
- "step": 7250
1034
- },
1035
- {
1036
- "epoch": 1.4735567218409367,
1037
- "grad_norm": 12.644124031066895,
1038
- "learning_rate": 5.8652013008859484e-06,
1039
- "loss": 1.3386,
1040
- "step": 7300
1041
- },
1042
- {
1043
- "epoch": 1.4836495761001212,
1044
- "grad_norm": 12.569697380065918,
1045
- "learning_rate": 5.753055960524841e-06,
1046
- "loss": 1.3369,
1047
- "step": 7350
1048
- },
1049
- {
1050
- "epoch": 1.4937424303593057,
1051
- "grad_norm": 11.428314208984375,
1052
- "learning_rate": 5.640910620163733e-06,
1053
- "loss": 1.3232,
1054
- "step": 7400
1055
- },
1056
- {
1057
- "epoch": 1.5038352846184901,
1058
- "grad_norm": 14.005457878112793,
1059
- "learning_rate": 5.528765279802625e-06,
1060
- "loss": 1.328,
1061
- "step": 7450
1062
- },
1063
- {
1064
- "epoch": 1.5139281388776746,
1065
- "grad_norm": 11.448248863220215,
1066
- "learning_rate": 5.418862846248739e-06,
1067
- "loss": 1.3106,
1068
- "step": 7500
1069
- },
1070
- {
1071
- "epoch": 1.5240209931368591,
1072
- "grad_norm": 11.041686058044434,
1073
- "learning_rate": 5.306717505887631e-06,
1074
- "loss": 1.3496,
1075
- "step": 7550
1076
- },
1077
- {
1078
- "epoch": 1.5341138473960436,
1079
- "grad_norm": 12.765951156616211,
1080
- "learning_rate": 5.194572165526522e-06,
1081
- "loss": 1.2758,
1082
- "step": 7600
1083
- },
1084
- {
1085
- "epoch": 1.544206701655228,
1086
- "grad_norm": 14.556378364562988,
1087
- "learning_rate": 5.082426825165415e-06,
1088
- "loss": 1.3376,
1089
- "step": 7650
1090
- },
1091
- {
1092
- "epoch": 1.5542995559144126,
1093
- "grad_norm": 10.397843360900879,
1094
- "learning_rate": 4.970281484804307e-06,
1095
- "loss": 1.2924,
1096
- "step": 7700
1097
- },
1098
- {
1099
- "epoch": 1.564392410173597,
1100
- "grad_norm": 22.283723831176758,
1101
- "learning_rate": 4.858136144443199e-06,
1102
- "loss": 1.3393,
1103
- "step": 7750
1104
- },
1105
- {
1106
- "epoch": 1.5744852644327816,
1107
- "grad_norm": 12.424785614013672,
1108
- "learning_rate": 4.74599080408209e-06,
1109
- "loss": 1.3193,
1110
- "step": 7800
1111
- },
1112
- {
1113
- "epoch": 1.584578118691966,
1114
- "grad_norm": 10.400694847106934,
1115
- "learning_rate": 4.633845463720983e-06,
1116
- "loss": 1.288,
1117
- "step": 7850
1118
- },
1119
- {
1120
- "epoch": 1.5946709729511506,
1121
- "grad_norm": 8.679718017578125,
1122
- "learning_rate": 4.521700123359875e-06,
1123
- "loss": 1.3217,
1124
- "step": 7900
1125
- },
1126
- {
1127
- "epoch": 1.604763827210335,
1128
- "grad_norm": 11.473520278930664,
1129
- "learning_rate": 4.409554782998767e-06,
1130
- "loss": 1.343,
1131
- "step": 7950
1132
- },
1133
- {
1134
- "epoch": 1.6148566814695196,
1135
- "grad_norm": 13.197211265563965,
1136
- "learning_rate": 4.297409442637659e-06,
1137
- "loss": 1.3501,
1138
- "step": 8000
1139
- },
1140
- {
1141
- "epoch": 1.624949535728704,
1142
- "grad_norm": 10.097699165344238,
1143
- "learning_rate": 4.18526410227655e-06,
1144
- "loss": 1.3053,
1145
- "step": 8050
1146
- },
1147
- {
1148
- "epoch": 1.6350423899878885,
1149
- "grad_norm": 13.534586906433105,
1150
- "learning_rate": 4.0731187619154425e-06,
1151
- "loss": 1.3039,
1152
- "step": 8100
1153
- },
1154
- {
1155
- "epoch": 1.645135244247073,
1156
- "grad_norm": 12.015509605407715,
1157
- "learning_rate": 3.960973421554335e-06,
1158
- "loss": 1.3044,
1159
- "step": 8150
1160
- },
1161
- {
1162
- "epoch": 1.6552280985062575,
1163
- "grad_norm": 13.811707496643066,
1164
- "learning_rate": 3.848828081193227e-06,
1165
- "loss": 1.2875,
1166
- "step": 8200
1167
- },
1168
- {
1169
- "epoch": 1.665320952765442,
1170
- "grad_norm": 8.733817100524902,
1171
- "learning_rate": 3.7366827408321186e-06,
1172
- "loss": 1.2781,
1173
- "step": 8250
1174
- },
1175
- {
1176
- "epoch": 1.6754138070246265,
1177
- "grad_norm": 10.037213325500488,
1178
- "learning_rate": 3.624537400471011e-06,
1179
- "loss": 1.3017,
1180
- "step": 8300
1181
- },
1182
- {
1183
- "epoch": 1.685506661283811,
1184
- "grad_norm": 18.557842254638672,
1185
- "learning_rate": 3.5123920601099026e-06,
1186
- "loss": 1.2988,
1187
- "step": 8350
1188
- },
1189
- {
1190
- "epoch": 1.6955995155429955,
1191
- "grad_norm": 11.54616641998291,
1192
- "learning_rate": 3.4002467197487947e-06,
1193
- "loss": 1.3061,
1194
- "step": 8400
1195
- },
1196
- {
1197
- "epoch": 1.70569236980218,
1198
- "grad_norm": 10.847959518432617,
1199
- "learning_rate": 3.2881013793876865e-06,
1200
- "loss": 1.327,
1201
- "step": 8450
1202
- },
1203
- {
1204
- "epoch": 1.7157852240613645,
1205
- "grad_norm": 13.53753662109375,
1206
- "learning_rate": 3.1759560390265787e-06,
1207
- "loss": 1.3172,
1208
- "step": 8500
1209
- },
1210
- {
1211
- "epoch": 1.725878078320549,
1212
- "grad_norm": 15.62736701965332,
1213
- "learning_rate": 3.063810698665471e-06,
1214
- "loss": 1.2641,
1215
- "step": 8550
1216
- },
1217
- {
1218
- "epoch": 1.7359709325797335,
1219
- "grad_norm": 11.705061912536621,
1220
- "learning_rate": 2.9516653583043626e-06,
1221
- "loss": 1.2898,
1222
- "step": 8600
1223
- },
1224
- {
1225
- "epoch": 1.746063786838918,
1226
- "grad_norm": 10.831978797912598,
1227
- "learning_rate": 2.839520017943255e-06,
1228
- "loss": 1.3224,
1229
- "step": 8650
1230
- },
1231
- {
1232
- "epoch": 1.7561566410981024,
1233
- "grad_norm": 10.530208587646484,
1234
- "learning_rate": 2.727374677582147e-06,
1235
- "loss": 1.2515,
1236
- "step": 8700
1237
- },
1238
- {
1239
- "epoch": 1.766249495357287,
1240
- "grad_norm": 12.877543449401855,
1241
- "learning_rate": 2.6152293372210387e-06,
1242
- "loss": 1.3279,
1243
- "step": 8750
1244
- },
1245
- {
1246
- "epoch": 1.7763423496164714,
1247
- "grad_norm": 13.460204124450684,
1248
- "learning_rate": 2.503083996859931e-06,
1249
- "loss": 1.2812,
1250
- "step": 8800
1251
- },
1252
- {
1253
- "epoch": 1.786435203875656,
1254
- "grad_norm": 14.455750465393066,
1255
- "learning_rate": 2.3909386564988227e-06,
1256
- "loss": 1.2797,
1257
- "step": 8850
1258
- },
1259
- {
1260
- "epoch": 1.7965280581348404,
1261
- "grad_norm": 16.0786075592041,
1262
- "learning_rate": 2.2787933161377144e-06,
1263
- "loss": 1.2896,
1264
- "step": 8900
1265
- },
1266
- {
1267
- "epoch": 1.806620912394025,
1268
- "grad_norm": 11.142393112182617,
1269
- "learning_rate": 2.1666479757766066e-06,
1270
- "loss": 1.2908,
1271
- "step": 8950
1272
- },
1273
- {
1274
- "epoch": 1.8167137666532094,
1275
- "grad_norm": 13.860331535339355,
1276
- "learning_rate": 2.0545026354154988e-06,
1277
- "loss": 1.322,
1278
- "step": 9000
1279
- },
1280
- {
1281
- "epoch": 1.8268066209123939,
1282
- "grad_norm": 15.06369400024414,
1283
- "learning_rate": 1.9423572950543905e-06,
1284
- "loss": 1.3083,
1285
- "step": 9050
1286
- },
1287
- {
1288
- "epoch": 1.8368994751715784,
1289
- "grad_norm": 15.326362609863281,
1290
- "learning_rate": 1.8302119546932825e-06,
1291
- "loss": 1.2909,
1292
- "step": 9100
1293
- },
1294
- {
1295
- "epoch": 1.8469923294307629,
1296
- "grad_norm": 16.93311882019043,
1297
- "learning_rate": 1.7180666143321747e-06,
1298
- "loss": 1.3016,
1299
- "step": 9150
1300
- },
1301
- {
1302
- "epoch": 1.8570851836899476,
1303
- "grad_norm": 15.057299613952637,
1304
- "learning_rate": 1.6059212739710667e-06,
1305
- "loss": 1.2914,
1306
- "step": 9200
1307
- },
1308
- {
1309
- "epoch": 1.867178037949132,
1310
- "grad_norm": 10.040867805480957,
1311
- "learning_rate": 1.4937759336099586e-06,
1312
- "loss": 1.3225,
1313
- "step": 9250
1314
- },
1315
- {
1316
- "epoch": 1.8772708922083166,
1317
- "grad_norm": 21.653919219970703,
1318
- "learning_rate": 1.3816305932488506e-06,
1319
- "loss": 1.2867,
1320
- "step": 9300
1321
- },
1322
- {
1323
- "epoch": 1.887363746467501,
1324
- "grad_norm": 10.50373363494873,
1325
- "learning_rate": 1.2694852528877428e-06,
1326
- "loss": 1.3098,
1327
- "step": 9350
1328
- },
1329
- {
1330
- "epoch": 1.8974566007266855,
1331
- "grad_norm": 12.554201126098633,
1332
- "learning_rate": 1.1573399125266345e-06,
1333
- "loss": 1.2786,
1334
- "step": 9400
1335
- },
1336
- {
1337
- "epoch": 1.90754945498587,
1338
- "grad_norm": 11.518633842468262,
1339
- "learning_rate": 1.0451945721655265e-06,
1340
- "loss": 1.289,
1341
- "step": 9450
1342
- },
1343
- {
1344
- "epoch": 1.9176423092450545,
1345
- "grad_norm": 12.421961784362793,
1346
- "learning_rate": 9.330492318044186e-07,
1347
- "loss": 1.2706,
1348
- "step": 9500
1349
- },
1350
- {
1351
- "epoch": 1.927735163504239,
1352
- "grad_norm": 14.83471393585205,
1353
- "learning_rate": 8.209038914433106e-07,
1354
- "loss": 1.262,
1355
- "step": 9550
1356
- },
1357
- {
1358
- "epoch": 1.9378280177634235,
1359
- "grad_norm": 12.76465129852295,
1360
- "learning_rate": 7.087585510822026e-07,
1361
- "loss": 1.2898,
1362
- "step": 9600
1363
- },
1364
- {
1365
- "epoch": 1.947920872022608,
1366
- "grad_norm": 12.1636381149292,
1367
- "learning_rate": 5.966132107210946e-07,
1368
- "loss": 1.2823,
1369
- "step": 9650
1370
- },
1371
- {
1372
- "epoch": 1.9580137262817925,
1373
- "grad_norm": 14.129364967346191,
1374
- "learning_rate": 4.844678703599866e-07,
1375
- "loss": 1.356,
1376
- "step": 9700
1377
- },
1378
- {
1379
- "epoch": 1.968106580540977,
1380
- "grad_norm": 12.133772850036621,
1381
- "learning_rate": 3.723225299988786e-07,
1382
- "loss": 1.28,
1383
- "step": 9750
1384
- },
1385
- {
1386
- "epoch": 1.9781994348001615,
1387
- "grad_norm": 9.618387222290039,
1388
- "learning_rate": 2.6017718963777056e-07,
1389
- "loss": 1.2651,
1390
- "step": 9800
1391
- },
1392
- {
1393
- "epoch": 1.988292289059346,
1394
- "grad_norm": 11.654873847961426,
1395
- "learning_rate": 1.4803184927666255e-07,
1396
- "loss": 1.3191,
1397
- "step": 9850
1398
- },
1399
- {
1400
- "epoch": 1.9983851433185305,
1401
- "grad_norm": 11.9207763671875,
1402
- "learning_rate": 3.588650891555456e-08,
1403
- "loss": 1.3182,
1404
- "step": 9900
1405
- },
1406
- {
1407
- "epoch": 2.0,
1408
- "eval_loss": 1.7954870462417603,
1409
- "eval_runtime": 226.5861,
1410
- "eval_samples_per_second": 16.396,
1411
- "eval_steps_per_second": 2.052,
1412
- "step": 9908
1413
  }
1414
  ],
1415
  "logging_steps": 50,
1416
- "max_steps": 9908,
1417
  "num_input_tokens_seen": 0,
1418
- "num_train_epochs": 2,
1419
  "save_steps": 500,
1420
  "stateful_callbacks": {
1421
  "TrainerControl": {
@@ -1429,8 +175,8 @@
1429
  "attributes": {}
1430
  }
1431
  },
1432
- "total_flos": 5.362669429614182e+16,
1433
- "train_batch_size": 2,
1434
  "trial_name": null,
1435
  "trial_params": null
1436
  }
 
1
  {
2
+ "best_global_step": 1037,
3
+ "best_metric": 0.49640655517578125,
4
+ "best_model_checkpoint": "./mcqa_model/checkpoint-1037",
5
+ "epoch": 0.9995180722891567,
6
  "eval_steps": 500,
7
+ "global_step": 1037,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.04819277108433735,
14
+ "grad_norm": 42.9211540222168,
15
+ "learning_rate": 4.2307692307692304e-07,
16
+ "loss": 0.7894,
17
  "step": 50
18
  },
19
  {
20
+ "epoch": 0.0963855421686747,
21
+ "grad_norm": 72.75926208496094,
22
+ "learning_rate": 9.038461538461538e-07,
23
+ "loss": 0.6269,
24
  "step": 100
25
  },
26
  {
27
+ "epoch": 0.14457831325301204,
28
+ "grad_norm": 73.69217681884766,
29
+ "learning_rate": 9.571275455519828e-07,
30
+ "loss": 0.5828,
31
  "step": 150
32
  },
33
  {
34
+ "epoch": 0.1927710843373494,
35
+ "grad_norm": 49.02070617675781,
36
+ "learning_rate": 9.035369774919614e-07,
37
+ "loss": 0.5468,
38
  "step": 200
39
  },
40
  {
41
+ "epoch": 0.24096385542168675,
42
+ "grad_norm": 36.791542053222656,
43
+ "learning_rate": 8.4994640943194e-07,
44
+ "loss": 0.6211,
45
  "step": 250
46
  },
47
  {
48
+ "epoch": 0.2891566265060241,
49
+ "grad_norm": 76.28284454345703,
50
+ "learning_rate": 7.963558413719184e-07,
51
+ "loss": 0.5783,
52
  "step": 300
53
  },
54
  {
55
+ "epoch": 0.3373493975903614,
56
+ "grad_norm": 42.93614196777344,
57
+ "learning_rate": 7.42765273311897e-07,
58
+ "loss": 0.5927,
59
  "step": 350
60
  },
61
  {
62
+ "epoch": 0.3855421686746988,
63
+ "grad_norm": 68.8043212890625,
64
+ "learning_rate": 6.891747052518756e-07,
65
+ "loss": 0.5652,
66
  "step": 400
67
  },
68
  {
69
+ "epoch": 0.43373493975903615,
70
+ "grad_norm": 58.556663513183594,
71
+ "learning_rate": 6.355841371918542e-07,
72
+ "loss": 0.5515,
73
  "step": 450
74
  },
75
  {
76
+ "epoch": 0.4819277108433735,
77
+ "grad_norm": 47.458282470703125,
78
+ "learning_rate": 5.819935691318327e-07,
79
+ "loss": 0.5392,
80
  "step": 500
81
  },
82
  {
83
+ "epoch": 0.5301204819277109,
84
+ "grad_norm": 53.92032241821289,
85
+ "learning_rate": 5.284030010718113e-07,
86
+ "loss": 0.5744,
87
  "step": 550
88
  },
89
  {
90
+ "epoch": 0.5783132530120482,
91
+ "grad_norm": 61.76318359375,
92
+ "learning_rate": 4.748124330117899e-07,
93
+ "loss": 0.5361,
94
  "step": 600
95
  },
96
  {
97
+ "epoch": 0.6265060240963856,
98
+ "grad_norm": 40.69456481933594,
99
+ "learning_rate": 4.2122186495176846e-07,
100
+ "loss": 0.6077,
101
  "step": 650
102
  },
103
  {
104
+ "epoch": 0.6746987951807228,
105
+ "grad_norm": 80.4998550415039,
106
+ "learning_rate": 3.6763129689174703e-07,
107
+ "loss": 0.5567,
108
  "step": 700
109
  },
110
  {
111
+ "epoch": 0.7228915662650602,
112
+ "grad_norm": 45.61492156982422,
113
+ "learning_rate": 3.140407288317256e-07,
114
+ "loss": 0.5114,
115
  "step": 750
116
  },
117
  {
118
+ "epoch": 0.7710843373493976,
119
+ "grad_norm": 47.459102630615234,
120
+ "learning_rate": 2.6045016077170417e-07,
121
+ "loss": 0.4868,
122
  "step": 800
123
  },
124
  {
125
+ "epoch": 0.8192771084337349,
126
+ "grad_norm": 86.90262603759766,
127
+ "learning_rate": 2.0685959271168274e-07,
128
+ "loss": 0.5324,
129
  "step": 850
130
  },
131
  {
132
+ "epoch": 0.8674698795180723,
133
+ "grad_norm": 45.50479507446289,
134
+ "learning_rate": 1.532690246516613e-07,
135
+ "loss": 0.5092,
136
  "step": 900
137
  },
138
  {
139
+ "epoch": 0.9156626506024096,
140
+ "grad_norm": 44.45884323120117,
141
+ "learning_rate": 9.967845659163988e-08,
142
+ "loss": 0.556,
143
  "step": 950
144
  },
145
  {
146
+ "epoch": 0.963855421686747,
147
+ "grad_norm": 54.122867584228516,
148
+ "learning_rate": 4.608788853161844e-08,
149
+ "loss": 0.4876,
150
  "step": 1000
151
  },
152
  {
153
+ "epoch": 0.9995180722891567,
154
+ "eval_loss": 0.49640655517578125,
155
+ "eval_runtime": 68.7272,
156
+ "eval_samples_per_second": 45.266,
157
+ "eval_steps_per_second": 5.66,
158
+ "step": 1037
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  }
160
  ],
161
  "logging_steps": 50,
162
+ "max_steps": 1037,
163
  "num_input_tokens_seen": 0,
164
+ "num_train_epochs": 1,
165
  "save_steps": 500,
166
  "stateful_callbacks": {
167
  "TrainerControl": {
 
175
  "attributes": {}
176
  }
177
  },
178
+ "total_flos": 7892894912348160.0,
179
+ "train_batch_size": 8,
180
  "trial_name": null,
181
  "trial_params": null
182
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49c1279eb875f7ef2fd123dcc9d04eddc05a2c63d5ccfebe19a516fc432c4bfa
3
  size 5713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:469a2bf0f14e0f4f12ec3d12ca471fb6a277722dc7ee87a05b1afeaa3495fa84
3
  size 5713