Wilsonwin commited on
Commit
9cf2abb
·
verified ·
1 Parent(s): 01f9a48

Training in progress, step 11838, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08019f8dd515e2b7dd1e16b7ad39ac1098307f1c6ab16b9ed965910d0892bc77
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b44b5dec3eb521e4966741844bbf6502227d6ed08a1303474080332dfbe45e5
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5fe73ae0629191b3874b216e4c6336651d3274bf403ab0a26100f714397a9f08
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6886cac9d99b59f70ddb2a5a11358ca7a3e8d9f6f65ffabbc7d41be6c68dc0f9
3
  size 318646859
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:116f0b85bffdc97adeb264e8dbd65d6acc7d514e82a48ea5ea50bd5091784a48
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b172ef1a2b23540cb3d53eed9b6dcd9ee9e06553bb8c4f5a46142cb0fe60689
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.9428957594188208,
6
  "eval_steps": 500,
7
- "global_step": 11500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8249,6 +8249,237 @@
8249
  "eval_samples_per_second": 273.376,
8250
  "eval_steps_per_second": 5.741,
8251
  "step": 11500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8252
  }
8253
  ],
8254
  "logging_steps": 10,
@@ -8263,12 +8494,12 @@
8263
  "should_evaluate": false,
8264
  "should_log": false,
8265
  "should_save": true,
8266
- "should_training_stop": false
8267
  },
8268
  "attributes": {}
8269
  }
8270
  },
8271
- "total_flos": 3.846232536325816e+17,
8272
  "train_batch_size": 48,
8273
  "trial_name": null,
8274
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
  "eval_steps": 500,
7
+ "global_step": 11838,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8249
  "eval_samples_per_second": 273.376,
8250
  "eval_steps_per_second": 5.741,
8251
  "step": 11500
8252
+ },
8253
+ {
8254
+ "epoch": 1.9445852339922283,
8255
+ "grad_norm": 0.4365793466567993,
8256
+ "learning_rate": 8.270650415620584e-07,
8257
+ "loss": 4.29614372253418,
8258
+ "step": 11510
8259
+ },
8260
+ {
8261
+ "epoch": 1.9462747085656362,
8262
+ "grad_norm": 0.43951302766799927,
8263
+ "learning_rate": 7.775944524542055e-07,
8264
+ "loss": 4.270536422729492,
8265
+ "step": 11520
8266
+ },
8267
+ {
8268
+ "epoch": 1.9479641831390437,
8269
+ "grad_norm": 0.45033299922943115,
8270
+ "learning_rate": 7.296455308872406e-07,
8271
+ "loss": 4.282721710205078,
8272
+ "step": 11530
8273
+ },
8274
+ {
8275
+ "epoch": 1.9496536577124515,
8276
+ "grad_norm": 0.43896329402923584,
8277
+ "learning_rate": 6.832187658113441e-07,
8278
+ "loss": 4.2960052490234375,
8279
+ "step": 11540
8280
+ },
8281
+ {
8282
+ "epoch": 1.9513431322858592,
8283
+ "grad_norm": 0.44476914405822754,
8284
+ "learning_rate": 6.383146306547626e-07,
8285
+ "loss": 4.30328483581543,
8286
+ "step": 11550
8287
+ },
8288
+ {
8289
+ "epoch": 1.9530326068592667,
8290
+ "grad_norm": 0.4521080553531647,
8291
+ "learning_rate": 5.949335833189628e-07,
8292
+ "loss": 4.327355575561524,
8293
+ "step": 11560
8294
+ },
8295
+ {
8296
+ "epoch": 1.9547220814326745,
8297
+ "grad_norm": 0.4513733983039856,
8298
+ "learning_rate": 5.530760661741018e-07,
8299
+ "loss": 4.302740859985351,
8300
+ "step": 11570
8301
+ },
8302
+ {
8303
+ "epoch": 1.9564115560060822,
8304
+ "grad_norm": 0.4388287663459778,
8305
+ "learning_rate": 5.127425060543478e-07,
8306
+ "loss": 4.277825546264649,
8307
+ "step": 11580
8308
+ },
8309
+ {
8310
+ "epoch": 1.9581010305794897,
8311
+ "grad_norm": 0.44225117564201355,
8312
+ "learning_rate": 4.7393331425364943e-07,
8313
+ "loss": 4.280667877197265,
8314
+ "step": 11590
8315
+ },
8316
+ {
8317
+ "epoch": 1.9597905051528974,
8318
+ "grad_norm": 0.44035524129867554,
8319
+ "learning_rate": 4.3664888652144017e-07,
8320
+ "loss": 4.278044891357422,
8321
+ "step": 11600
8322
+ },
8323
+ {
8324
+ "epoch": 1.9614799797263052,
8325
+ "grad_norm": 0.44098883867263794,
8326
+ "learning_rate": 4.008896030587072e-07,
8327
+ "loss": 4.268376159667969,
8328
+ "step": 11610
8329
+ },
8330
+ {
8331
+ "epoch": 1.9631694542997127,
8332
+ "grad_norm": 0.43533217906951904,
8333
+ "learning_rate": 3.6665582851406195e-07,
8334
+ "loss": 4.295290756225586,
8335
+ "step": 11620
8336
+ },
8337
+ {
8338
+ "epoch": 1.9648589288731204,
8339
+ "grad_norm": 0.45106539130210876,
8340
+ "learning_rate": 3.3394791198000927e-07,
8341
+ "loss": 4.281253051757813,
8342
+ "step": 11630
8343
+ },
8344
+ {
8345
+ "epoch": 1.9665484034465281,
8346
+ "grad_norm": 0.44754281640052795,
8347
+ "learning_rate": 3.027661869893672e-07,
8348
+ "loss": 4.281909942626953,
8349
+ "step": 11640
8350
+ },
8351
+ {
8352
+ "epoch": 1.9682378780199357,
8353
+ "grad_norm": 0.438475638628006,
8354
+ "learning_rate": 2.731109715119861e-07,
8355
+ "loss": 4.280799484252929,
8356
+ "step": 11650
8357
+ },
8358
+ {
8359
+ "epoch": 1.9699273525933436,
8360
+ "grad_norm": 0.44646841287612915,
8361
+ "learning_rate": 2.4498256795135173e-07,
8362
+ "loss": 4.306585693359375,
8363
+ "step": 11660
8364
+ },
8365
+ {
8366
+ "epoch": 1.9716168271667511,
8367
+ "grad_norm": 0.4341582953929901,
8368
+ "learning_rate": 2.183812631415871e-07,
8369
+ "loss": 4.274542617797851,
8370
+ "step": 11670
8371
+ },
8372
+ {
8373
+ "epoch": 1.9733063017401589,
8374
+ "grad_norm": 0.4331877827644348,
8375
+ "learning_rate": 1.933073283445219e-07,
8376
+ "loss": 4.2908935546875,
8377
+ "step": 11680
8378
+ },
8379
+ {
8380
+ "epoch": 1.9749957763135666,
8381
+ "grad_norm": 0.447518914937973,
8382
+ "learning_rate": 1.697610192469112e-07,
8383
+ "loss": 4.3111217498779295,
8384
+ "step": 11690
8385
+ },
8386
+ {
8387
+ "epoch": 1.976685250886974,
8388
+ "grad_norm": 0.44273945689201355,
8389
+ "learning_rate": 1.4774257595783766e-07,
8390
+ "loss": 4.300546264648437,
8391
+ "step": 11700
8392
+ },
8393
+ {
8394
+ "epoch": 1.9783747254603818,
8395
+ "grad_norm": 0.45125117897987366,
8396
+ "learning_rate": 1.272522230062467e-07,
8397
+ "loss": 4.289936828613281,
8398
+ "step": 11710
8399
+ },
8400
+ {
8401
+ "epoch": 1.9800642000337896,
8402
+ "grad_norm": 0.43694615364074707,
8403
+ "learning_rate": 1.0829016933869838e-07,
8404
+ "loss": 4.289299392700196,
8405
+ "step": 11720
8406
+ },
8407
+ {
8408
+ "epoch": 1.981753674607197,
8409
+ "grad_norm": 0.44341588020324707,
8410
+ "learning_rate": 9.085660831715247e-08,
8411
+ "loss": 4.297845458984375,
8412
+ "step": 11730
8413
+ },
8414
+ {
8415
+ "epoch": 1.9834431491806048,
8416
+ "grad_norm": 0.446321576833725,
8417
+ "learning_rate": 7.495171771710328e-08,
8418
+ "loss": 4.293007659912109,
8419
+ "step": 11740
8420
+ },
8421
+ {
8422
+ "epoch": 1.9851326237540126,
8423
+ "grad_norm": 0.44870322942733765,
8424
+ "learning_rate": 6.057565972568123e-08,
8425
+ "loss": 4.291889190673828,
8426
+ "step": 11750
8427
+ },
8428
+ {
8429
+ "epoch": 1.98682209832742,
8430
+ "grad_norm": 0.44810283184051514,
8431
+ "learning_rate": 4.772858094005405e-08,
8432
+ "loss": 4.299283981323242,
8433
+ "step": 11760
8434
+ },
8435
+ {
8436
+ "epoch": 1.9885115729008278,
8437
+ "grad_norm": 0.44559845328330994,
8438
+ "learning_rate": 3.641061236591136e-08,
8439
+ "loss": 4.282249832153321,
8440
+ "step": 11770
8441
+ },
8442
+ {
8443
+ "epoch": 1.9902010474742355,
8444
+ "grad_norm": 0.4399174153804779,
8445
+ "learning_rate": 2.6621869416099118e-08,
8446
+ "loss": 4.288850021362305,
8447
+ "step": 11780
8448
+ },
8449
+ {
8450
+ "epoch": 1.991890522047643,
8451
+ "grad_norm": 0.4408097267150879,
8452
+ "learning_rate": 1.8362451909520458e-08,
8453
+ "loss": 4.286129760742187,
8454
+ "step": 11790
8455
+ },
8456
+ {
8457
+ "epoch": 1.993579996621051,
8458
+ "grad_norm": 0.44703418016433716,
8459
+ "learning_rate": 1.16324440700033e-08,
8460
+ "loss": 4.295662307739258,
8461
+ "step": 11800
8462
+ },
8463
+ {
8464
+ "epoch": 1.9952694711944585,
8465
+ "grad_norm": 0.45136868953704834,
8466
+ "learning_rate": 6.431914525567572e-09,
8467
+ "loss": 4.288112640380859,
8468
+ "step": 11810
8469
+ },
8470
+ {
8471
+ "epoch": 1.996958945767866,
8472
+ "grad_norm": 0.4494999647140503,
8473
+ "learning_rate": 2.760916307625871e-09,
8474
+ "loss": 4.304160308837891,
8475
+ "step": 11820
8476
+ },
8477
+ {
8478
+ "epoch": 1.998648420341274,
8479
+ "grad_norm": 0.4357486665248871,
8480
+ "learning_rate": 6.194868504838524e-10,
8481
+ "loss": 4.277233123779297,
8482
+ "step": 11830
8483
  }
8484
  ],
8485
  "logging_steps": 10,
 
8494
  "should_evaluate": false,
8495
  "should_log": false,
8496
  "should_save": true,
8497
+ "should_training_stop": true
8498
  },
8499
  "attributes": {}
8500
  }
8501
  },
8502
+ "total_flos": 3.959258038224814e+17,
8503
  "train_batch_size": 48,
8504
  "trial_name": null,
8505
  "trial_params": null