Wilsonwin committed on
Commit
49199dd
·
verified ·
1 Parent(s): d44f401

Training in progress, step 6500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c74bfe809433060df3635ef406235f0717bc42781fff9acd5df0f855eb57b3f
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:539ec21ed2f2d5401b90d0d0b28a43621343b47ec158a5dc912ef7d73a069cdf
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70990f23441c3c0fadf8ff7b5b48864178e6a3f9dbc5c1184cb7c19ddf968c0f
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c631e1446372f309276121049f5c8b7603bed555765afc41b0a5db7f194949eb
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11940f1313899a11d3e47a2d43f508134dd8e03ac7613f4eca32c754da2d1839
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59ab6babcc58d5a8a0338e2999283607960e6faa29d71e8d0c3f11e2480b272d
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5732bb4fae95fda377427872ad7c4fed0c45a84922701b3143ffa39cf761f9db
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad5a3c7ee6384cdea60f7a41957135fc1d6a8e0bdd3b9a0dd5c4c46f69d638ec
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0136847440446022,
6
  "eval_steps": 500,
7
- "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4311,6 +4311,364 @@
4311
  "eval_samples_per_second": 208.452,
4312
  "eval_steps_per_second": 4.377,
4313
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4314
  }
4315
  ],
4316
  "logging_steps": 10,
@@ -4330,7 +4688,7 @@
4330
  "attributes": {}
4331
  }
4332
  },
4333
- "total_flos": 2.0067200216019763e+17,
4334
  "train_batch_size": 48,
4335
  "trial_name": null,
4336
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0981584727149856,
6
  "eval_steps": 500,
7
+ "global_step": 6500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4311
  "eval_samples_per_second": 208.452,
4312
  "eval_steps_per_second": 4.377,
4313
  "step": 6000
4314
+ },
4315
+ {
4316
+ "epoch": 1.0153742186180097,
4317
+ "grad_norm": 0.5339483022689819,
4318
+ "learning_rate": 0.00019297799453520028,
4319
+ "loss": 4.500830459594726,
4320
+ "step": 6010
4321
+ },
4322
+ {
4323
+ "epoch": 1.0170636931914174,
4324
+ "grad_norm": 0.5642256736755371,
4325
+ "learning_rate": 0.00019251885987680252,
4326
+ "loss": 4.485746002197265,
4327
+ "step": 6020
4328
+ },
4329
+ {
4330
+ "epoch": 1.0187531677648252,
4331
+ "grad_norm": 0.5060975551605225,
4332
+ "learning_rate": 0.00019205929164029217,
4333
+ "loss": 4.475402450561523,
4334
+ "step": 6030
4335
+ },
4336
+ {
4337
+ "epoch": 1.020442642338233,
4338
+ "grad_norm": 0.49786120653152466,
4339
+ "learning_rate": 0.00019159929451203033,
4340
+ "loss": 4.486777114868164,
4341
+ "step": 6040
4342
+ },
4343
+ {
4344
+ "epoch": 1.0221321169116404,
4345
+ "grad_norm": 0.506598949432373,
4346
+ "learning_rate": 0.00019113887318275149,
4347
+ "loss": 4.489146041870117,
4348
+ "step": 6050
4349
+ },
4350
+ {
4351
+ "epoch": 1.0238215914850481,
4352
+ "grad_norm": 0.48270103335380554,
4353
+ "learning_rate": 0.00019067803234751603,
4354
+ "loss": 4.474691009521484,
4355
+ "step": 6060
4356
+ },
4357
+ {
4358
+ "epoch": 1.0255110660584559,
4359
+ "grad_norm": 0.48239970207214355,
4360
+ "learning_rate": 0.00019021677670566208,
4361
+ "loss": 4.4708606719970705,
4362
+ "step": 6070
4363
+ },
4364
+ {
4365
+ "epoch": 1.0272005406318634,
4366
+ "grad_norm": 0.4966093599796295,
4367
+ "learning_rate": 0.00018975511096075762,
4368
+ "loss": 4.505655670166016,
4369
+ "step": 6080
4370
+ },
4371
+ {
4372
+ "epoch": 1.0288900152052711,
4373
+ "grad_norm": 0.5429375767707825,
4374
+ "learning_rate": 0.00018929303982055272,
4375
+ "loss": 4.499347305297851,
4376
+ "step": 6090
4377
+ },
4378
+ {
4379
+ "epoch": 1.0305794897786789,
4380
+ "grad_norm": 0.4981507360935211,
4381
+ "learning_rate": 0.00018883056799693125,
4382
+ "loss": 4.461819839477539,
4383
+ "step": 6100
4384
+ },
4385
+ {
4386
+ "epoch": 1.0322689643520866,
4387
+ "grad_norm": 0.5121614336967468,
4388
+ "learning_rate": 0.00018836770020586315,
4389
+ "loss": 4.478689956665039,
4390
+ "step": 6110
4391
+ },
4392
+ {
4393
+ "epoch": 1.033958438925494,
4394
+ "grad_norm": 0.4835728406906128,
4395
+ "learning_rate": 0.00018790444116735595,
4396
+ "loss": 4.47772216796875,
4397
+ "step": 6120
4398
+ },
4399
+ {
4400
+ "epoch": 1.0356479134989018,
4401
+ "grad_norm": 0.4881206154823303,
4402
+ "learning_rate": 0.00018744079560540695,
4403
+ "loss": 4.479801177978516,
4404
+ "step": 6130
4405
+ },
4406
+ {
4407
+ "epoch": 1.0373373880723096,
4408
+ "grad_norm": 0.47434431314468384,
4409
+ "learning_rate": 0.000186976768247955,
4410
+ "loss": 4.480235290527344,
4411
+ "step": 6140
4412
+ },
4413
+ {
4414
+ "epoch": 1.039026862645717,
4415
+ "grad_norm": 0.48258504271507263,
4416
+ "learning_rate": 0.00018651236382683225,
4417
+ "loss": 4.469864273071289,
4418
+ "step": 6150
4419
+ },
4420
+ {
4421
+ "epoch": 1.0407163372191248,
4422
+ "grad_norm": 0.5025637745857239,
4423
+ "learning_rate": 0.0001860475870777157,
4424
+ "loss": 4.472750091552735,
4425
+ "step": 6160
4426
+ },
4427
+ {
4428
+ "epoch": 1.0424058117925326,
4429
+ "grad_norm": 0.4636594355106354,
4430
+ "learning_rate": 0.0001855824427400793,
4431
+ "loss": 4.450835418701172,
4432
+ "step": 6170
4433
+ },
4434
+ {
4435
+ "epoch": 1.0440952863659403,
4436
+ "grad_norm": 0.4901501536369324,
4437
+ "learning_rate": 0.00018511693555714535,
4438
+ "loss": 4.490735626220703,
4439
+ "step": 6180
4440
+ },
4441
+ {
4442
+ "epoch": 1.0457847609393478,
4443
+ "grad_norm": 0.5198561549186707,
4444
+ "learning_rate": 0.00018465107027583615,
4445
+ "loss": 4.474180221557617,
4446
+ "step": 6190
4447
+ },
4448
+ {
4449
+ "epoch": 1.0474742355127555,
4450
+ "grad_norm": 0.4723539352416992,
4451
+ "learning_rate": 0.00018418485164672574,
4452
+ "loss": 4.4745361328125,
4453
+ "step": 6200
4454
+ },
4455
+ {
4456
+ "epoch": 1.0491637100861633,
4457
+ "grad_norm": 0.5074954628944397,
4458
+ "learning_rate": 0.00018371828442399128,
4459
+ "loss": 4.469810485839844,
4460
+ "step": 6210
4461
+ },
4462
+ {
4463
+ "epoch": 1.0508531846595708,
4464
+ "grad_norm": 0.49918699264526367,
4465
+ "learning_rate": 0.00018325137336536464,
4466
+ "loss": 4.442096710205078,
4467
+ "step": 6220
4468
+ },
4469
+ {
4470
+ "epoch": 1.0525426592329785,
4471
+ "grad_norm": 0.5088530778884888,
4472
+ "learning_rate": 0.00018278412323208392,
4473
+ "loss": 4.484762573242188,
4474
+ "step": 6230
4475
+ },
4476
+ {
4477
+ "epoch": 1.0542321338063863,
4478
+ "grad_norm": 0.506341814994812,
4479
+ "learning_rate": 0.00018231653878884486,
4480
+ "loss": 4.486656188964844,
4481
+ "step": 6240
4482
+ },
4483
+ {
4484
+ "epoch": 1.055921608379794,
4485
+ "grad_norm": 0.5262649059295654,
4486
+ "learning_rate": 0.00018184862480375233,
4487
+ "loss": 4.455668640136719,
4488
+ "step": 6250
4489
+ },
4490
+ {
4491
+ "epoch": 1.0576110829532015,
4492
+ "grad_norm": 0.5115051865577698,
4493
+ "learning_rate": 0.00018138038604827153,
4494
+ "loss": 4.479043960571289,
4495
+ "step": 6260
4496
+ },
4497
+ {
4498
+ "epoch": 1.0593005575266092,
4499
+ "grad_norm": 0.50110924243927,
4500
+ "learning_rate": 0.0001809118272971795,
4501
+ "loss": 4.446685409545898,
4502
+ "step": 6270
4503
+ },
4504
+ {
4505
+ "epoch": 1.060990032100017,
4506
+ "grad_norm": 0.5022484660148621,
4507
+ "learning_rate": 0.0001804429533285164,
4508
+ "loss": 4.4593353271484375,
4509
+ "step": 6280
4510
+ },
4511
+ {
4512
+ "epoch": 1.0626795066734245,
4513
+ "grad_norm": 0.492165744304657,
4514
+ "learning_rate": 0.00017997376892353668,
4515
+ "loss": 4.496971511840821,
4516
+ "step": 6290
4517
+ },
4518
+ {
4519
+ "epoch": 1.0643689812468322,
4520
+ "grad_norm": 0.5134599208831787,
4521
+ "learning_rate": 0.0001795042788666605,
4522
+ "loss": 4.465629196166992,
4523
+ "step": 6300
4524
+ },
4525
+ {
4526
+ "epoch": 1.06605845582024,
4527
+ "grad_norm": 0.5151488184928894,
4528
+ "learning_rate": 0.00017903448794542488,
4529
+ "loss": 4.454899597167969,
4530
+ "step": 6310
4531
+ },
4532
+ {
4533
+ "epoch": 1.0677479303936477,
4534
+ "grad_norm": 0.5240500569343567,
4535
+ "learning_rate": 0.00017856440095043464,
4536
+ "loss": 4.481625747680664,
4537
+ "step": 6320
4538
+ },
4539
+ {
4540
+ "epoch": 1.0694374049670552,
4541
+ "grad_norm": 0.5187123417854309,
4542
+ "learning_rate": 0.00017809402267531405,
4543
+ "loss": 4.437789535522461,
4544
+ "step": 6330
4545
+ },
4546
+ {
4547
+ "epoch": 1.071126879540463,
4548
+ "grad_norm": 0.4693409502506256,
4549
+ "learning_rate": 0.00017762335791665735,
4550
+ "loss": 4.450423812866211,
4551
+ "step": 6340
4552
+ },
4553
+ {
4554
+ "epoch": 1.0728163541138707,
4555
+ "grad_norm": 0.5061246752738953,
4556
+ "learning_rate": 0.00017715241147398035,
4557
+ "loss": 4.46313705444336,
4558
+ "step": 6350
4559
+ },
4560
+ {
4561
+ "epoch": 1.0745058286872782,
4562
+ "grad_norm": 0.47927796840667725,
4563
+ "learning_rate": 0.00017668118814967126,
4564
+ "loss": 4.446915817260742,
4565
+ "step": 6360
4566
+ },
4567
+ {
4568
+ "epoch": 1.076195303260686,
4569
+ "grad_norm": 0.47587907314300537,
4570
+ "learning_rate": 0.00017620969274894163,
4571
+ "loss": 4.461398696899414,
4572
+ "step": 6370
4573
+ },
4574
+ {
4575
+ "epoch": 1.0778847778340936,
4576
+ "grad_norm": 0.5091392397880554,
4577
+ "learning_rate": 0.00017573793007977763,
4578
+ "loss": 4.450970458984375,
4579
+ "step": 6380
4580
+ },
4581
+ {
4582
+ "epoch": 1.0795742524075012,
4583
+ "grad_norm": 0.5105127692222595,
4584
+ "learning_rate": 0.0001752659049528906,
4585
+ "loss": 4.458657455444336,
4586
+ "step": 6390
4587
+ },
4588
+ {
4589
+ "epoch": 1.081263726980909,
4590
+ "grad_norm": 0.5196726322174072,
4591
+ "learning_rate": 0.00017479362218166854,
4592
+ "loss": 4.444008636474609,
4593
+ "step": 6400
4594
+ },
4595
+ {
4596
+ "epoch": 1.0829532015543166,
4597
+ "grad_norm": 0.4891359210014343,
4598
+ "learning_rate": 0.0001743210865821265,
4599
+ "loss": 4.436445236206055,
4600
+ "step": 6410
4601
+ },
4602
+ {
4603
+ "epoch": 1.0846426761277244,
4604
+ "grad_norm": 0.5141095519065857,
4605
+ "learning_rate": 0.0001738483029728578,
4606
+ "loss": 4.455481338500976,
4607
+ "step": 6420
4608
+ },
4609
+ {
4610
+ "epoch": 1.0863321507011319,
4611
+ "grad_norm": 0.5223525166511536,
4612
+ "learning_rate": 0.00017337527617498474,
4613
+ "loss": 4.485405731201172,
4614
+ "step": 6430
4615
+ },
4616
+ {
4617
+ "epoch": 1.0880216252745396,
4618
+ "grad_norm": 0.4939091205596924,
4619
+ "learning_rate": 0.0001729020110121096,
4620
+ "loss": 4.448784255981446,
4621
+ "step": 6440
4622
+ },
4623
+ {
4624
+ "epoch": 1.0897110998479473,
4625
+ "grad_norm": 0.49695253372192383,
4626
+ "learning_rate": 0.0001724285123102652,
4627
+ "loss": 4.4587146759033205,
4628
+ "step": 6450
4629
+ },
4630
+ {
4631
+ "epoch": 1.091400574421355,
4632
+ "grad_norm": 0.4882517158985138,
4633
+ "learning_rate": 0.00017195478489786593,
4634
+ "loss": 4.43580207824707,
4635
+ "step": 6460
4636
+ },
4637
+ {
4638
+ "epoch": 1.0930900489947626,
4639
+ "grad_norm": 0.4971882998943329,
4640
+ "learning_rate": 0.00017148083360565836,
4641
+ "loss": 4.436479949951172,
4642
+ "step": 6470
4643
+ },
4644
+ {
4645
+ "epoch": 1.0947795235681703,
4646
+ "grad_norm": 0.4835260808467865,
4647
+ "learning_rate": 0.00017100666326667202,
4648
+ "loss": 4.476963043212891,
4649
+ "step": 6480
4650
+ },
4651
+ {
4652
+ "epoch": 1.096468998141578,
4653
+ "grad_norm": 0.4847490191459656,
4654
+ "learning_rate": 0.00017053227871617027,
4655
+ "loss": 4.449015426635742,
4656
+ "step": 6490
4657
+ },
4658
+ {
4659
+ "epoch": 1.0981584727149856,
4660
+ "grad_norm": 0.5305824279785156,
4661
+ "learning_rate": 0.00017005768479160064,
4662
+ "loss": 4.452330780029297,
4663
+ "step": 6500
4664
+ },
4665
+ {
4666
+ "epoch": 1.0981584727149856,
4667
+ "eval_loss": 4.447469711303711,
4668
+ "eval_runtime": 4.0239,
4669
+ "eval_samples_per_second": 248.518,
4670
+ "eval_steps_per_second": 5.219,
4671
+ "step": 6500
4672
  }
4673
  ],
4674
  "logging_steps": 10,
 
4688
  "attributes": {}
4689
  }
4690
  },
4691
+ "total_flos": 2.1739484320314163e+17,
4692
  "train_batch_size": 48,
4693
  "trial_name": null,
4694
  "trial_params": null