irishprancer commited on
Commit
d3f05fe
·
verified ·
1 Parent(s): 823c4ee

Training in progress, step 450, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04b0f779026ec70664afe700822ef58aa8ad2099293e8a97ff37d57139227678
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78ffc67243e9efe15ba19e9b31a035d2c93b575113b680c0e0bfd11442028482
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:883c0d10e8ceb16e5ba20eebd16f134a05b14cf22161289dc343eceb41e829a7
3
  size 1054136250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a561696f8f803801ea7f2ab87e64e1172e029a7d45f1b06fdf82b9fa3338298
3
  size 1054136250
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:226f394c3a9826cc7f74d0799aa02f643f1ee6b891784f44c588787dbc9c0cb3
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed14e15604e1097b80da74a65c68f380dc6bb673bf5694a945c25e7931ad5a75
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2673d78ac7304a2a7678ae71ed65422fa2295f07aca63cf23ca76e0b5c92da69
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb248e7cc2fe7b509c9e866be7b72af3b33225d8b86373c1a62393cc3a24f4da
3
  size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.7682406902313232,
3
- "best_model_checkpoint": "./output/checkpoint-150",
4
- "epoch": 3.3333333333333335,
5
  "eval_steps": 150,
6
- "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -120,6 +120,232 @@
120
  "eval_samples_per_second": 22.155,
121
  "eval_steps_per_second": 22.155,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
@@ -139,7 +365,7 @@
139
  "attributes": {}
140
  }
141
  },
142
- "total_flos": 1615667282657280.0,
143
  "train_batch_size": 2,
144
  "trial_name": null,
145
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.726381778717041,
3
+ "best_model_checkpoint": "./output/checkpoint-450",
4
+ "epoch": 10.0,
5
  "eval_steps": 150,
6
+ "global_step": 450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
120
  "eval_samples_per_second": 22.155,
121
  "eval_steps_per_second": 22.155,
122
  "step": 150
123
+ },
124
+ {
125
+ "epoch": 3.5555555555555554,
126
+ "grad_norm": 2.575423002243042,
127
+ "learning_rate": 2.9999892980750276e-05,
128
+ "loss": 0.6943,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 3.7777777777777777,
133
+ "grad_norm": 2.392334222793579,
134
+ "learning_rate": 2.9999854334972655e-05,
135
+ "loss": 0.6922,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 4.0,
140
+ "grad_norm": 2.5393826961517334,
141
+ "learning_rate": 2.999980974373202e-05,
142
+ "loss": 0.6677,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 4.222222222222222,
147
+ "grad_norm": 1.6046398878097534,
148
+ "learning_rate": 2.9999759207046055e-05,
149
+ "loss": 0.5898,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 4.444444444444445,
154
+ "grad_norm": 1.681414246559143,
155
+ "learning_rate": 2.9999702724934783e-05,
156
+ "loss": 0.7117,
157
+ "step": 200
158
+ },
159
+ {
160
+ "epoch": 4.666666666666667,
161
+ "grad_norm": 3.416290044784546,
162
+ "learning_rate": 2.99996402974206e-05,
163
+ "loss": 0.6278,
164
+ "step": 210
165
+ },
166
+ {
167
+ "epoch": 4.888888888888889,
168
+ "grad_norm": 2.0781354904174805,
169
+ "learning_rate": 2.9999571924528243e-05,
170
+ "loss": 0.6732,
171
+ "step": 220
172
+ },
173
+ {
174
+ "epoch": 5.111111111111111,
175
+ "grad_norm": 2.630134105682373,
176
+ "learning_rate": 2.9999497606284816e-05,
177
+ "loss": 0.6029,
178
+ "step": 230
179
+ },
180
+ {
181
+ "epoch": 5.333333333333333,
182
+ "grad_norm": 1.482037901878357,
183
+ "learning_rate": 2.9999417342719775e-05,
184
+ "loss": 0.6941,
185
+ "step": 240
186
+ },
187
+ {
188
+ "epoch": 5.555555555555555,
189
+ "grad_norm": 2.217900514602661,
190
+ "learning_rate": 2.9999331133864935e-05,
191
+ "loss": 0.6478,
192
+ "step": 250
193
+ },
194
+ {
195
+ "epoch": 5.777777777777778,
196
+ "grad_norm": 1.7131129503250122,
197
+ "learning_rate": 2.9999238979754465e-05,
198
+ "loss": 0.6095,
199
+ "step": 260
200
+ },
201
+ {
202
+ "epoch": 6.0,
203
+ "grad_norm": 1.908470869064331,
204
+ "learning_rate": 2.99991408804249e-05,
205
+ "loss": 0.5758,
206
+ "step": 270
207
+ },
208
+ {
209
+ "epoch": 6.222222222222222,
210
+ "grad_norm": 2.141641616821289,
211
+ "learning_rate": 2.999903683591511e-05,
212
+ "loss": 0.574,
213
+ "step": 280
214
+ },
215
+ {
216
+ "epoch": 6.444444444444445,
217
+ "grad_norm": 1.3931849002838135,
218
+ "learning_rate": 2.9998926846266345e-05,
219
+ "loss": 0.6139,
220
+ "step": 290
221
+ },
222
+ {
223
+ "epoch": 6.666666666666667,
224
+ "grad_norm": 2.278519868850708,
225
+ "learning_rate": 2.9998810911522193e-05,
226
+ "loss": 0.6227,
227
+ "step": 300
228
+ },
229
+ {
230
+ "epoch": 6.666666666666667,
231
+ "eval_loss": 0.7388573884963989,
232
+ "eval_runtime": 0.4661,
233
+ "eval_samples_per_second": 21.453,
234
+ "eval_steps_per_second": 21.453,
235
+ "step": 300
236
+ },
237
+ {
238
+ "epoch": 6.888888888888889,
239
+ "grad_norm": 2.712602138519287,
240
+ "learning_rate": 2.9998689031728615e-05,
241
+ "loss": 0.6479,
242
+ "step": 310
243
+ },
244
+ {
245
+ "epoch": 7.111111111111111,
246
+ "grad_norm": 1.7634906768798828,
247
+ "learning_rate": 2.9998561206933918e-05,
248
+ "loss": 0.5863,
249
+ "step": 320
250
+ },
251
+ {
252
+ "epoch": 7.333333333333333,
253
+ "grad_norm": 1.772024154663086,
254
+ "learning_rate": 2.9998427437188766e-05,
255
+ "loss": 0.5795,
256
+ "step": 330
257
+ },
258
+ {
259
+ "epoch": 7.555555555555555,
260
+ "grad_norm": 2.34784197807312,
261
+ "learning_rate": 2.999828772254618e-05,
262
+ "loss": 0.6034,
263
+ "step": 340
264
+ },
265
+ {
266
+ "epoch": 7.777777777777778,
267
+ "grad_norm": 2.455519199371338,
268
+ "learning_rate": 2.9998142063061544e-05,
269
+ "loss": 0.6625,
270
+ "step": 350
271
+ },
272
+ {
273
+ "epoch": 8.0,
274
+ "grad_norm": 1.3227782249450684,
275
+ "learning_rate": 2.9997990458792583e-05,
276
+ "loss": 0.6041,
277
+ "step": 360
278
+ },
279
+ {
280
+ "epoch": 8.222222222222221,
281
+ "grad_norm": 1.9034490585327148,
282
+ "learning_rate": 2.9997832909799397e-05,
283
+ "loss": 0.5491,
284
+ "step": 370
285
+ },
286
+ {
287
+ "epoch": 8.444444444444445,
288
+ "grad_norm": 1.9352225065231323,
289
+ "learning_rate": 2.9997669416144432e-05,
290
+ "loss": 0.6406,
291
+ "step": 380
292
+ },
293
+ {
294
+ "epoch": 8.666666666666666,
295
+ "grad_norm": 1.0488076210021973,
296
+ "learning_rate": 2.999749997789249e-05,
297
+ "loss": 0.5398,
298
+ "step": 390
299
+ },
300
+ {
301
+ "epoch": 8.88888888888889,
302
+ "grad_norm": 1.5216209888458252,
303
+ "learning_rate": 2.9997324595110723e-05,
304
+ "loss": 0.6545,
305
+ "step": 400
306
+ },
307
+ {
308
+ "epoch": 9.11111111111111,
309
+ "grad_norm": 1.3843863010406494,
310
+ "learning_rate": 2.9997143267868663e-05,
311
+ "loss": 0.5948,
312
+ "step": 410
313
+ },
314
+ {
315
+ "epoch": 9.333333333333334,
316
+ "grad_norm": 2.4701507091522217,
317
+ "learning_rate": 2.999695599623817e-05,
318
+ "loss": 0.6224,
319
+ "step": 420
320
+ },
321
+ {
322
+ "epoch": 9.555555555555555,
323
+ "grad_norm": 2.598496198654175,
324
+ "learning_rate": 2.9996762780293483e-05,
325
+ "loss": 0.575,
326
+ "step": 430
327
+ },
328
+ {
329
+ "epoch": 9.777777777777779,
330
+ "grad_norm": 1.5249278545379639,
331
+ "learning_rate": 2.9996563620111176e-05,
332
+ "loss": 0.5294,
333
+ "step": 440
334
+ },
335
+ {
336
+ "epoch": 10.0,
337
+ "grad_norm": 1.3830034732818604,
338
+ "learning_rate": 2.9996358515770198e-05,
339
+ "loss": 0.5417,
340
+ "step": 450
341
+ },
342
+ {
343
+ "epoch": 10.0,
344
+ "eval_loss": 0.726381778717041,
345
+ "eval_runtime": 0.4056,
346
+ "eval_samples_per_second": 24.653,
347
+ "eval_steps_per_second": 24.653,
348
+ "step": 450
349
  }
350
  ],
351
  "logging_steps": 10,
 
365
  "attributes": {}
366
  }
367
  },
368
+ "total_flos": 4801636770840576.0,
369
  "train_batch_size": 2,
370
  "trial_name": null,
371
  "trial_params": null