jflotz committed on
Commit
eea000e
·
1 Parent(s): a32776c

Training in progress, step 20000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36903f0243a52bf56c2303ead198d638e5dcf25f476cfa5723954adea34fe732
3
  size 893438545
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74edd0cbfef63026b31ad51e8f4bc01025df6b2f2a0198b920cb1fc03a8579b7
3
  size 893438545
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9720327dd677676021978fd27f18fb24496429062f72218e72439f8001b20be8
3
  size 449471589
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:532ee195f2906ab9ce0bf8722baaca50a9bcc629a6a0003a6ae623a01b7ea889
3
  size 449471589
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c101353913b86ff575c9018e352c46e19aa450e36d2ee34d697b2d5ed877d1d
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e0931a17310d4650ee3518b83c618a6ffd8c6840ad8bd778d326fb339eb375
3
  size 14503
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c101353913b86ff575c9018e352c46e19aa450e36d2ee34d697b2d5ed877d1d
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e0931a17310d4650ee3518b83c618a6ffd8c6840ad8bd778d326fb339eb375
3
  size 14503
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c101353913b86ff575c9018e352c46e19aa450e36d2ee34d697b2d5ed877d1d
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e0931a17310d4650ee3518b83c618a6ffd8c6840ad8bd778d326fb339eb375
3
  size 14503
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c101353913b86ff575c9018e352c46e19aa450e36d2ee34d697b2d5ed877d1d
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e0931a17310d4650ee3518b83c618a6ffd8c6840ad8bd778d326fb339eb375
3
  size 14503
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c101353913b86ff575c9018e352c46e19aa450e36d2ee34d697b2d5ed877d1d
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e0931a17310d4650ee3518b83c618a6ffd8c6840ad8bd778d326fb339eb375
3
  size 14503
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c101353913b86ff575c9018e352c46e19aa450e36d2ee34d697b2d5ed877d1d
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e0931a17310d4650ee3518b83c618a6ffd8c6840ad8bd778d326fb339eb375
3
  size 14503
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c101353913b86ff575c9018e352c46e19aa450e36d2ee34d697b2d5ed877d1d
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e0931a17310d4650ee3518b83c618a6ffd8c6840ad8bd778d326fb339eb375
3
  size 14503
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c101353913b86ff575c9018e352c46e19aa450e36d2ee34d697b2d5ed877d1d
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e0931a17310d4650ee3518b83c618a6ffd8c6840ad8bd778d326fb339eb375
3
  size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6647dd16e16d38ed2bd48d2720f065055b2d402dd5cc12bc5f1fbd386132f2a
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ca250cf2344e8df6eed15e28ea548564b75ef302cf69c48962d23fb49df8b25
3
  size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.11152872422292362,
5
- "global_step": 10000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -206,11 +206,211 @@
206
  "eval_samples_per_second": 957.11,
207
  "eval_steps_per_second": 15.0,
208
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  }
210
  ],
211
  "max_steps": 1000000,
212
  "num_train_epochs": 12,
213
- "total_flos": 7.010026103034715e+20,
214
  "trial_name": null,
215
  "trial_params": null
216
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.22305744844584724,
5
+ "global_step": 20000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
206
  "eval_samples_per_second": 957.11,
207
  "eval_steps_per_second": 15.0,
208
  "step": 10000
209
+ },
210
+ {
211
+ "epoch": 0.12,
212
+ "learning_rate": 3.149999999999999e-05,
213
+ "loss": 0.6293,
214
+ "step": 10500
215
+ },
216
+ {
217
+ "epoch": 0.12,
218
+ "learning_rate": 3.2999999999999996e-05,
219
+ "loss": 0.6245,
220
+ "step": 11000
221
+ },
222
+ {
223
+ "epoch": 0.12,
224
+ "eval_loss": 0.6135886311531067,
225
+ "eval_runtime": 2.3072,
226
+ "eval_samples_per_second": 995.595,
227
+ "eval_steps_per_second": 15.604,
228
+ "step": 11000
229
+ },
230
+ {
231
+ "epoch": 0.13,
232
+ "learning_rate": 3.45e-05,
233
+ "loss": 0.6197,
234
+ "step": 11500
235
+ },
236
+ {
237
+ "epoch": 0.13,
238
+ "learning_rate": 3.5999999999999994e-05,
239
+ "loss": 0.6148,
240
+ "step": 12000
241
+ },
242
+ {
243
+ "epoch": 0.13,
244
+ "eval_loss": 0.6036174297332764,
245
+ "eval_runtime": 2.3862,
246
+ "eval_samples_per_second": 962.612,
247
+ "eval_steps_per_second": 15.087,
248
+ "step": 12000
249
+ },
250
+ {
251
+ "epoch": 0.14,
252
+ "learning_rate": 3.75e-05,
253
+ "loss": 0.6096,
254
+ "step": 12500
255
+ },
256
+ {
257
+ "epoch": 0.14,
258
+ "learning_rate": 3.9e-05,
259
+ "loss": 0.6046,
260
+ "step": 13000
261
+ },
262
+ {
263
+ "epoch": 0.14,
264
+ "eval_loss": 0.5924868583679199,
265
+ "eval_runtime": 2.386,
266
+ "eval_samples_per_second": 962.685,
267
+ "eval_steps_per_second": 15.088,
268
+ "step": 13000
269
+ },
270
+ {
271
+ "epoch": 0.15,
272
+ "learning_rate": 4.05e-05,
273
+ "loss": 0.5998,
274
+ "step": 13500
275
+ },
276
+ {
277
+ "epoch": 0.16,
278
+ "learning_rate": 4.2e-05,
279
+ "loss": 0.5951,
280
+ "step": 14000
281
+ },
282
+ {
283
+ "epoch": 0.16,
284
+ "eval_loss": 0.5810989141464233,
285
+ "eval_runtime": 2.4081,
286
+ "eval_samples_per_second": 953.875,
287
+ "eval_steps_per_second": 14.95,
288
+ "step": 14000
289
+ },
290
+ {
291
+ "epoch": 0.16,
292
+ "learning_rate": 4.3499999999999993e-05,
293
+ "loss": 0.5908,
294
+ "step": 14500
295
+ },
296
+ {
297
+ "epoch": 0.17,
298
+ "learning_rate": 4.4999999999999996e-05,
299
+ "loss": 0.586,
300
+ "step": 15000
301
+ },
302
+ {
303
+ "epoch": 0.17,
304
+ "eval_loss": 0.5703989863395691,
305
+ "eval_runtime": 2.3852,
306
+ "eval_samples_per_second": 963.035,
307
+ "eval_steps_per_second": 15.093,
308
+ "step": 15000
309
+ },
310
+ {
311
+ "epoch": 0.17,
312
+ "learning_rate": 4.65e-05,
313
+ "loss": 0.5815,
314
+ "step": 15500
315
+ },
316
+ {
317
+ "epoch": 0.18,
318
+ "learning_rate": 4.7999999999999994e-05,
319
+ "loss": 0.5769,
320
+ "step": 16000
321
+ },
322
+ {
323
+ "epoch": 0.18,
324
+ "eval_loss": 0.5616013407707214,
325
+ "eval_runtime": 2.3872,
326
+ "eval_samples_per_second": 962.217,
327
+ "eval_steps_per_second": 15.08,
328
+ "step": 16000
329
+ },
330
+ {
331
+ "epoch": 0.18,
332
+ "learning_rate": 4.95e-05,
333
+ "loss": 0.5722,
334
+ "step": 16500
335
+ },
336
+ {
337
+ "epoch": 0.19,
338
+ "learning_rate": 5.1e-05,
339
+ "loss": 0.5673,
340
+ "step": 17000
341
+ },
342
+ {
343
+ "epoch": 0.19,
344
+ "eval_loss": 0.5503653287887573,
345
+ "eval_runtime": 2.3451,
346
+ "eval_samples_per_second": 979.501,
347
+ "eval_steps_per_second": 15.351,
348
+ "step": 17000
349
+ },
350
+ {
351
+ "epoch": 0.2,
352
+ "learning_rate": 5.2499999999999995e-05,
353
+ "loss": 0.5616,
354
+ "step": 17500
355
+ },
356
+ {
357
+ "epoch": 0.2,
358
+ "learning_rate": 5.399999999999999e-05,
359
+ "loss": 0.5553,
360
+ "step": 18000
361
+ },
362
+ {
363
+ "epoch": 0.2,
364
+ "eval_loss": 0.5397240519523621,
365
+ "eval_runtime": 2.4358,
366
+ "eval_samples_per_second": 943.009,
367
+ "eval_steps_per_second": 14.779,
368
+ "step": 18000
369
+ },
370
+ {
371
+ "epoch": 0.21,
372
+ "learning_rate": 5.5499999999999994e-05,
373
+ "loss": 0.5491,
374
+ "step": 18500
375
+ },
376
+ {
377
+ "epoch": 0.21,
378
+ "learning_rate": 5.6999999999999996e-05,
379
+ "loss": 0.5421,
380
+ "step": 19000
381
+ },
382
+ {
383
+ "epoch": 0.21,
384
+ "eval_loss": 0.5266169905662537,
385
+ "eval_runtime": 2.4081,
386
+ "eval_samples_per_second": 953.853,
387
+ "eval_steps_per_second": 14.949,
388
+ "step": 19000
389
+ },
390
+ {
391
+ "epoch": 0.22,
392
+ "learning_rate": 5.85e-05,
393
+ "loss": 0.5357,
394
+ "step": 19500
395
+ },
396
+ {
397
+ "epoch": 0.22,
398
+ "learning_rate": 5.9999999999999995e-05,
399
+ "loss": 0.5301,
400
+ "step": 20000
401
+ },
402
+ {
403
+ "epoch": 0.22,
404
+ "eval_loss": 0.5150080919265747,
405
+ "eval_runtime": 2.3725,
406
+ "eval_samples_per_second": 968.19,
407
+ "eval_steps_per_second": 15.174,
408
+ "step": 20000
409
  }
410
  ],
411
  "max_steps": 1000000,
412
  "num_train_epochs": 12,
413
+ "total_flos": 1.402005220606943e+21,
414
  "trial_name": null,
415
  "trial_params": null
416
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9720327dd677676021978fd27f18fb24496429062f72218e72439f8001b20be8
3
  size 449471589
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:532ee195f2906ab9ce0bf8722baaca50a9bcc629a6a0003a6ae623a01b7ea889
3
  size 449471589