shareit commited on
Commit
2795bd6
·
verified ·
1 Parent(s): 535e38d

Training in progress, step 664, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a5d279c3421528a68f15a8871b6a50cb5db0de42b8f36b73bee3644fdc6d8ac
3
  size 170415112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f7af443d21ff67d874a56085a16f52737d7f56789de192ba1be426211ea41b6
3
  size 170415112
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f50bdcfcb9f3aae3901f3568c5b1bce16eaf0c76d7de6c67ddb821b4888989b1
3
  size 86718091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:735530cb1b764c0ff16a697bfc1916069fab6eabfae0a8fa9e1c5bfb930501e4
3
  size 86718091
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:954108a301f31704884688b043fe00abb56c97201b7aaacebc614e30abd254ee
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e48795a8d2fd5455a045e7eb527144bdace15d2d820342ea71a23e2a449fc389
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.6144578313253013,
6
  "eval_steps": 500,
7
- "global_step": 600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4208,6 +4208,454 @@
4208
  "learning_rate": 1.9726858877086497e-05,
4209
  "loss": 0.8407,
4210
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4211
  }
4212
  ],
4213
  "logging_steps": 1,
@@ -4222,12 +4670,12 @@
4222
  "should_evaluate": false,
4223
  "should_log": false,
4224
  "should_save": true,
4225
- "should_training_stop": false
4226
  },
4227
  "attributes": {}
4228
  }
4229
  },
4230
- "total_flos": 6.610222931331318e+18,
4231
  "train_batch_size": 8,
4232
  "trial_name": null,
4233
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
  "eval_steps": 500,
7
+ "global_step": 664,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4208
  "learning_rate": 1.9726858877086497e-05,
4209
  "loss": 0.8407,
4210
  "step": 600
4211
+ },
4212
+ {
4213
+ "epoch": 3.6204819277108435,
4214
+ "grad_norm": 0.057724274694919586,
4215
+ "learning_rate": 1.9423368740515935e-05,
4216
+ "loss": 0.7717,
4217
+ "step": 601
4218
+ },
4219
+ {
4220
+ "epoch": 3.6265060240963853,
4221
+ "grad_norm": 0.0628497302532196,
4222
+ "learning_rate": 1.9119878603945374e-05,
4223
+ "loss": 0.9699,
4224
+ "step": 602
4225
+ },
4226
+ {
4227
+ "epoch": 3.6325301204819276,
4228
+ "grad_norm": 0.06557570397853851,
4229
+ "learning_rate": 1.8816388467374812e-05,
4230
+ "loss": 0.8877,
4231
+ "step": 603
4232
+ },
4233
+ {
4234
+ "epoch": 3.63855421686747,
4235
+ "grad_norm": 0.05746385082602501,
4236
+ "learning_rate": 1.851289833080425e-05,
4237
+ "loss": 0.9611,
4238
+ "step": 604
4239
+ },
4240
+ {
4241
+ "epoch": 3.644578313253012,
4242
+ "grad_norm": 0.05730944871902466,
4243
+ "learning_rate": 1.8209408194233688e-05,
4244
+ "loss": 0.9038,
4245
+ "step": 605
4246
+ },
4247
+ {
4248
+ "epoch": 3.6506024096385543,
4249
+ "grad_norm": 0.06290185451507568,
4250
+ "learning_rate": 1.7905918057663127e-05,
4251
+ "loss": 0.9113,
4252
+ "step": 606
4253
+ },
4254
+ {
4255
+ "epoch": 3.6566265060240966,
4256
+ "grad_norm": 0.052313752472400665,
4257
+ "learning_rate": 1.7602427921092565e-05,
4258
+ "loss": 0.8877,
4259
+ "step": 607
4260
+ },
4261
+ {
4262
+ "epoch": 3.662650602409639,
4263
+ "grad_norm": 0.05167490243911743,
4264
+ "learning_rate": 1.7298937784522003e-05,
4265
+ "loss": 0.8335,
4266
+ "step": 608
4267
+ },
4268
+ {
4269
+ "epoch": 3.6686746987951806,
4270
+ "grad_norm": 0.0641271322965622,
4271
+ "learning_rate": 1.699544764795144e-05,
4272
+ "loss": 0.9001,
4273
+ "step": 609
4274
+ },
4275
+ {
4276
+ "epoch": 3.674698795180723,
4277
+ "grad_norm": 0.053020380437374115,
4278
+ "learning_rate": 1.6691957511380883e-05,
4279
+ "loss": 0.9843,
4280
+ "step": 610
4281
+ },
4282
+ {
4283
+ "epoch": 3.680722891566265,
4284
+ "grad_norm": 0.05865924432873726,
4285
+ "learning_rate": 1.638846737481032e-05,
4286
+ "loss": 0.7744,
4287
+ "step": 611
4288
+ },
4289
+ {
4290
+ "epoch": 3.6867469879518073,
4291
+ "grad_norm": 0.052690133452415466,
4292
+ "learning_rate": 1.6084977238239756e-05,
4293
+ "loss": 0.9141,
4294
+ "step": 612
4295
+ },
4296
+ {
4297
+ "epoch": 3.692771084337349,
4298
+ "grad_norm": 0.055314671248197556,
4299
+ "learning_rate": 1.5781487101669194e-05,
4300
+ "loss": 0.8527,
4301
+ "step": 613
4302
+ },
4303
+ {
4304
+ "epoch": 3.6987951807228914,
4305
+ "grad_norm": 0.055480074137449265,
4306
+ "learning_rate": 1.5477996965098636e-05,
4307
+ "loss": 0.7843,
4308
+ "step": 614
4309
+ },
4310
+ {
4311
+ "epoch": 3.7048192771084336,
4312
+ "grad_norm": 0.05978460609912872,
4313
+ "learning_rate": 1.5174506828528074e-05,
4314
+ "loss": 0.953,
4315
+ "step": 615
4316
+ },
4317
+ {
4318
+ "epoch": 3.710843373493976,
4319
+ "grad_norm": 0.05480135232210159,
4320
+ "learning_rate": 1.4871016691957512e-05,
4321
+ "loss": 0.8885,
4322
+ "step": 616
4323
+ },
4324
+ {
4325
+ "epoch": 3.716867469879518,
4326
+ "grad_norm": 0.06841973960399628,
4327
+ "learning_rate": 1.456752655538695e-05,
4328
+ "loss": 0.8523,
4329
+ "step": 617
4330
+ },
4331
+ {
4332
+ "epoch": 3.7228915662650603,
4333
+ "grad_norm": 0.05191405862569809,
4334
+ "learning_rate": 1.426403641881639e-05,
4335
+ "loss": 0.9134,
4336
+ "step": 618
4337
+ },
4338
+ {
4339
+ "epoch": 3.7289156626506026,
4340
+ "grad_norm": 0.06994569301605225,
4341
+ "learning_rate": 1.3960546282245829e-05,
4342
+ "loss": 0.9769,
4343
+ "step": 619
4344
+ },
4345
+ {
4346
+ "epoch": 3.734939759036145,
4347
+ "grad_norm": 0.06147489696741104,
4348
+ "learning_rate": 1.3657056145675265e-05,
4349
+ "loss": 0.9384,
4350
+ "step": 620
4351
+ },
4352
+ {
4353
+ "epoch": 3.7409638554216866,
4354
+ "grad_norm": 0.0558396577835083,
4355
+ "learning_rate": 1.3353566009104704e-05,
4356
+ "loss": 1.0132,
4357
+ "step": 621
4358
+ },
4359
+ {
4360
+ "epoch": 3.746987951807229,
4361
+ "grad_norm": 0.054801881313323975,
4362
+ "learning_rate": 1.3050075872534142e-05,
4363
+ "loss": 0.885,
4364
+ "step": 622
4365
+ },
4366
+ {
4367
+ "epoch": 3.753012048192771,
4368
+ "grad_norm": 0.06346871703863144,
4369
+ "learning_rate": 1.2746585735963582e-05,
4370
+ "loss": 0.9055,
4371
+ "step": 623
4372
+ },
4373
+ {
4374
+ "epoch": 3.7590361445783134,
4375
+ "grad_norm": 0.06990102678537369,
4376
+ "learning_rate": 1.244309559939302e-05,
4377
+ "loss": 1.0826,
4378
+ "step": 624
4379
+ },
4380
+ {
4381
+ "epoch": 3.765060240963855,
4382
+ "grad_norm": 0.058159008622169495,
4383
+ "learning_rate": 1.213960546282246e-05,
4384
+ "loss": 0.8537,
4385
+ "step": 625
4386
+ },
4387
+ {
4388
+ "epoch": 3.7710843373493974,
4389
+ "grad_norm": 0.056310877203941345,
4390
+ "learning_rate": 1.1836115326251897e-05,
4391
+ "loss": 0.8969,
4392
+ "step": 626
4393
+ },
4394
+ {
4395
+ "epoch": 3.7771084337349397,
4396
+ "grad_norm": 0.0730551928281784,
4397
+ "learning_rate": 1.1532625189681335e-05,
4398
+ "loss": 0.7622,
4399
+ "step": 627
4400
+ },
4401
+ {
4402
+ "epoch": 3.783132530120482,
4403
+ "grad_norm": 0.055493585765361786,
4404
+ "learning_rate": 1.1229135053110775e-05,
4405
+ "loss": 0.9374,
4406
+ "step": 628
4407
+ },
4408
+ {
4409
+ "epoch": 3.789156626506024,
4410
+ "grad_norm": 0.06163483485579491,
4411
+ "learning_rate": 1.0925644916540213e-05,
4412
+ "loss": 0.9165,
4413
+ "step": 629
4414
+ },
4415
+ {
4416
+ "epoch": 3.7951807228915664,
4417
+ "grad_norm": 0.05568576976656914,
4418
+ "learning_rate": 1.0622154779969651e-05,
4419
+ "loss": 0.9599,
4420
+ "step": 630
4421
+ },
4422
+ {
4423
+ "epoch": 3.8012048192771086,
4424
+ "grad_norm": 0.05647989735007286,
4425
+ "learning_rate": 1.031866464339909e-05,
4426
+ "loss": 0.9085,
4427
+ "step": 631
4428
+ },
4429
+ {
4430
+ "epoch": 3.807228915662651,
4431
+ "grad_norm": 0.07673195749521255,
4432
+ "learning_rate": 1.0015174506828528e-05,
4433
+ "loss": 1.0056,
4434
+ "step": 632
4435
+ },
4436
+ {
4437
+ "epoch": 3.8132530120481927,
4438
+ "grad_norm": 0.052770134061574936,
4439
+ "learning_rate": 9.711684370257968e-06,
4440
+ "loss": 0.835,
4441
+ "step": 633
4442
+ },
4443
+ {
4444
+ "epoch": 3.819277108433735,
4445
+ "grad_norm": 0.06691750884056091,
4446
+ "learning_rate": 9.408194233687406e-06,
4447
+ "loss": 0.8668,
4448
+ "step": 634
4449
+ },
4450
+ {
4451
+ "epoch": 3.825301204819277,
4452
+ "grad_norm": 0.06338905543088913,
4453
+ "learning_rate": 9.104704097116844e-06,
4454
+ "loss": 0.9121,
4455
+ "step": 635
4456
+ },
4457
+ {
4458
+ "epoch": 3.8313253012048194,
4459
+ "grad_norm": 0.07239171117544174,
4460
+ "learning_rate": 8.801213960546282e-06,
4461
+ "loss": 0.9947,
4462
+ "step": 636
4463
+ },
4464
+ {
4465
+ "epoch": 3.837349397590361,
4466
+ "grad_norm": 0.05657603591680527,
4467
+ "learning_rate": 8.49772382397572e-06,
4468
+ "loss": 0.955,
4469
+ "step": 637
4470
+ },
4471
+ {
4472
+ "epoch": 3.8433734939759034,
4473
+ "grad_norm": 0.05116644501686096,
4474
+ "learning_rate": 8.19423368740516e-06,
4475
+ "loss": 0.8011,
4476
+ "step": 638
4477
+ },
4478
+ {
4479
+ "epoch": 3.8493975903614457,
4480
+ "grad_norm": 0.059036966413259506,
4481
+ "learning_rate": 7.890743550834597e-06,
4482
+ "loss": 0.9505,
4483
+ "step": 639
4484
+ },
4485
+ {
4486
+ "epoch": 3.855421686746988,
4487
+ "grad_norm": 0.061433471739292145,
4488
+ "learning_rate": 7.587253414264037e-06,
4489
+ "loss": 0.9425,
4490
+ "step": 640
4491
+ },
4492
+ {
4493
+ "epoch": 3.86144578313253,
4494
+ "grad_norm": 0.050406377762556076,
4495
+ "learning_rate": 7.283763277693475e-06,
4496
+ "loss": 0.9486,
4497
+ "step": 641
4498
+ },
4499
+ {
4500
+ "epoch": 3.8674698795180724,
4501
+ "grad_norm": 0.0669349730014801,
4502
+ "learning_rate": 6.980273141122914e-06,
4503
+ "loss": 0.9278,
4504
+ "step": 642
4505
+ },
4506
+ {
4507
+ "epoch": 3.8734939759036147,
4508
+ "grad_norm": 0.0585930198431015,
4509
+ "learning_rate": 6.676783004552352e-06,
4510
+ "loss": 0.7989,
4511
+ "step": 643
4512
+ },
4513
+ {
4514
+ "epoch": 3.8795180722891565,
4515
+ "grad_norm": 0.0559052973985672,
4516
+ "learning_rate": 6.373292867981791e-06,
4517
+ "loss": 0.8278,
4518
+ "step": 644
4519
+ },
4520
+ {
4521
+ "epoch": 3.8855421686746987,
4522
+ "grad_norm": 0.053793761879205704,
4523
+ "learning_rate": 6.06980273141123e-06,
4524
+ "loss": 0.861,
4525
+ "step": 645
4526
+ },
4527
+ {
4528
+ "epoch": 3.891566265060241,
4529
+ "grad_norm": 0.06693346053361893,
4530
+ "learning_rate": 5.766312594840667e-06,
4531
+ "loss": 0.8576,
4532
+ "step": 646
4533
+ },
4534
+ {
4535
+ "epoch": 3.897590361445783,
4536
+ "grad_norm": 0.06896480917930603,
4537
+ "learning_rate": 5.4628224582701065e-06,
4538
+ "loss": 1.0072,
4539
+ "step": 647
4540
+ },
4541
+ {
4542
+ "epoch": 3.9036144578313254,
4543
+ "grad_norm": 0.055934589356184006,
4544
+ "learning_rate": 5.159332321699545e-06,
4545
+ "loss": 0.8718,
4546
+ "step": 648
4547
+ },
4548
+ {
4549
+ "epoch": 3.9096385542168672,
4550
+ "grad_norm": 0.05610671639442444,
4551
+ "learning_rate": 4.855842185128984e-06,
4552
+ "loss": 0.8591,
4553
+ "step": 649
4554
+ },
4555
+ {
4556
+ "epoch": 3.9156626506024095,
4557
+ "grad_norm": 0.058563508093357086,
4558
+ "learning_rate": 4.552352048558422e-06,
4559
+ "loss": 0.9538,
4560
+ "step": 650
4561
+ },
4562
+ {
4563
+ "epoch": 3.9216867469879517,
4564
+ "grad_norm": 0.05412083491683006,
4565
+ "learning_rate": 4.24886191198786e-06,
4566
+ "loss": 0.9795,
4567
+ "step": 651
4568
+ },
4569
+ {
4570
+ "epoch": 3.927710843373494,
4571
+ "grad_norm": 0.059804435819387436,
4572
+ "learning_rate": 3.9453717754172986e-06,
4573
+ "loss": 0.9015,
4574
+ "step": 652
4575
+ },
4576
+ {
4577
+ "epoch": 3.933734939759036,
4578
+ "grad_norm": 0.06050240248441696,
4579
+ "learning_rate": 3.6418816388467377e-06,
4580
+ "loss": 0.8491,
4581
+ "step": 653
4582
+ },
4583
+ {
4584
+ "epoch": 3.9397590361445785,
4585
+ "grad_norm": 0.06998637318611145,
4586
+ "learning_rate": 3.338391502276176e-06,
4587
+ "loss": 0.9078,
4588
+ "step": 654
4589
+ },
4590
+ {
4591
+ "epoch": 3.9457831325301207,
4592
+ "grad_norm": 0.05342373624444008,
4593
+ "learning_rate": 3.034901365705615e-06,
4594
+ "loss": 0.89,
4595
+ "step": 655
4596
+ },
4597
+ {
4598
+ "epoch": 3.9518072289156625,
4599
+ "grad_norm": 0.060358088463544846,
4600
+ "learning_rate": 2.7314112291350532e-06,
4601
+ "loss": 0.936,
4602
+ "step": 656
4603
+ },
4604
+ {
4605
+ "epoch": 3.9578313253012047,
4606
+ "grad_norm": 0.04981756955385208,
4607
+ "learning_rate": 2.427921092564492e-06,
4608
+ "loss": 0.8473,
4609
+ "step": 657
4610
+ },
4611
+ {
4612
+ "epoch": 3.963855421686747,
4613
+ "grad_norm": 0.05234144255518913,
4614
+ "learning_rate": 2.12443095599393e-06,
4615
+ "loss": 0.9084,
4616
+ "step": 658
4617
+ },
4618
+ {
4619
+ "epoch": 3.9698795180722892,
4620
+ "grad_norm": 0.05660410597920418,
4621
+ "learning_rate": 1.8209408194233688e-06,
4622
+ "loss": 0.7614,
4623
+ "step": 659
4624
+ },
4625
+ {
4626
+ "epoch": 3.9759036144578315,
4627
+ "grad_norm": 0.05609305948019028,
4628
+ "learning_rate": 1.5174506828528075e-06,
4629
+ "loss": 0.9501,
4630
+ "step": 660
4631
+ },
4632
+ {
4633
+ "epoch": 3.9819277108433733,
4634
+ "grad_norm": 0.06469684839248657,
4635
+ "learning_rate": 1.213960546282246e-06,
4636
+ "loss": 0.8804,
4637
+ "step": 661
4638
+ },
4639
+ {
4640
+ "epoch": 3.9879518072289155,
4641
+ "grad_norm": 0.06128552928566933,
4642
+ "learning_rate": 9.104704097116844e-07,
4643
+ "loss": 0.8887,
4644
+ "step": 662
4645
+ },
4646
+ {
4647
+ "epoch": 3.9939759036144578,
4648
+ "grad_norm": 0.05402183160185814,
4649
+ "learning_rate": 6.06980273141123e-07,
4650
+ "loss": 0.8302,
4651
+ "step": 663
4652
+ },
4653
+ {
4654
+ "epoch": 4.0,
4655
+ "grad_norm": 0.06712618470191956,
4656
+ "learning_rate": 3.034901365705615e-07,
4657
+ "loss": 0.904,
4658
+ "step": 664
4659
  }
4660
  ],
4661
  "logging_steps": 1,
 
4670
  "should_evaluate": false,
4671
  "should_log": false,
4672
  "should_save": true,
4673
+ "should_training_stop": true
4674
  },
4675
  "attributes": {}
4676
  }
4677
  },
4678
+ "total_flos": 7.313907386851e+18,
4679
  "train_batch_size": 8,
4680
  "trial_name": null,
4681
  "trial_params": null