Wilsonwin committed on
Commit
ccf2b74
·
verified ·
1 Parent(s): 71521b0

Training in progress, step 6500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f198d05f5a6f7322d5950baad97f98d6f59bcdb9ed02f220583ce5fd10a379c7
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb4549408e52f631e5a2754236ea70999d0d21bd6cdb0e3578808e3ad0ec0af
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:302e7a816c65dc7ea036853d2e134881bf37e4d7e3ce31f671702ad86c5f1616
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5fa1630ee9673533bbca0fabe5cc81512e307a45647863ef671c972b6a648c2
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef678004bfc53268aeb4845a442c0327144244832e571a2be41a7160145765eb
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b04f884e61b89876d8b9b16b9a44bf2c7f027c2c95e35ca0aba5b86933c2288c
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5732bb4fae95fda377427872ad7c4fed0c45a84922701b3143ffa39cf761f9db
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad5a3c7ee6384cdea60f7a41957135fc1d6a8e0bdd3b9a0dd5c4c46f69d638ec
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0136847440446022,
6
  "eval_steps": 500,
7
- "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4311,6 +4311,364 @@
4311
  "eval_samples_per_second": 276.003,
4312
  "eval_steps_per_second": 5.796,
4313
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4314
  }
4315
  ],
4316
  "logging_steps": 10,
@@ -4330,7 +4688,7 @@
4330
  "attributes": {}
4331
  }
4332
  },
4333
- "total_flos": 2.0067200216019763e+17,
4334
  "train_batch_size": 48,
4335
  "trial_name": null,
4336
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0981584727149856,
6
  "eval_steps": 500,
7
+ "global_step": 6500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4311
  "eval_samples_per_second": 276.003,
4312
  "eval_steps_per_second": 5.796,
4313
  "step": 6000
4314
+ },
4315
+ {
4316
+ "epoch": 1.0153742186180097,
4317
+ "grad_norm": 0.518569827079773,
4318
+ "learning_rate": 0.00019297799453520028,
4319
+ "loss": 4.499275207519531,
4320
+ "step": 6010
4321
+ },
4322
+ {
4323
+ "epoch": 1.0170636931914174,
4324
+ "grad_norm": 0.5655678510665894,
4325
+ "learning_rate": 0.00019251885987680252,
4326
+ "loss": 4.483604049682617,
4327
+ "step": 6020
4328
+ },
4329
+ {
4330
+ "epoch": 1.0187531677648252,
4331
+ "grad_norm": 0.5033740997314453,
4332
+ "learning_rate": 0.00019205929164029217,
4333
+ "loss": 4.474781036376953,
4334
+ "step": 6030
4335
+ },
4336
+ {
4337
+ "epoch": 1.020442642338233,
4338
+ "grad_norm": 0.5125960111618042,
4339
+ "learning_rate": 0.00019159929451203033,
4340
+ "loss": 4.485195922851562,
4341
+ "step": 6040
4342
+ },
4343
+ {
4344
+ "epoch": 1.0221321169116404,
4345
+ "grad_norm": 0.5002242922782898,
4346
+ "learning_rate": 0.00019113887318275149,
4347
+ "loss": 4.486893081665039,
4348
+ "step": 6050
4349
+ },
4350
+ {
4351
+ "epoch": 1.0238215914850481,
4352
+ "grad_norm": 0.48877793550491333,
4353
+ "learning_rate": 0.00019067803234751603,
4354
+ "loss": 4.473563766479492,
4355
+ "step": 6060
4356
+ },
4357
+ {
4358
+ "epoch": 1.0255110660584559,
4359
+ "grad_norm": 0.485661119222641,
4360
+ "learning_rate": 0.00019021677670566208,
4361
+ "loss": 4.469658660888672,
4362
+ "step": 6070
4363
+ },
4364
+ {
4365
+ "epoch": 1.0272005406318634,
4366
+ "grad_norm": 0.5000821352005005,
4367
+ "learning_rate": 0.00018975511096075762,
4368
+ "loss": 4.504412078857422,
4369
+ "step": 6080
4370
+ },
4371
+ {
4372
+ "epoch": 1.0288900152052711,
4373
+ "grad_norm": 0.5075719356536865,
4374
+ "learning_rate": 0.00018929303982055272,
4375
+ "loss": 4.497782135009766,
4376
+ "step": 6090
4377
+ },
4378
+ {
4379
+ "epoch": 1.0305794897786789,
4380
+ "grad_norm": 0.477532297372818,
4381
+ "learning_rate": 0.00018883056799693125,
4382
+ "loss": 4.46100082397461,
4383
+ "step": 6100
4384
+ },
4385
+ {
4386
+ "epoch": 1.0322689643520866,
4387
+ "grad_norm": 0.5213661789894104,
4388
+ "learning_rate": 0.00018836770020586315,
4389
+ "loss": 4.476996612548828,
4390
+ "step": 6110
4391
+ },
4392
+ {
4393
+ "epoch": 1.033958438925494,
4394
+ "grad_norm": 0.5093067288398743,
4395
+ "learning_rate": 0.00018790444116735595,
4396
+ "loss": 4.477323150634765,
4397
+ "step": 6120
4398
+ },
4399
+ {
4400
+ "epoch": 1.0356479134989018,
4401
+ "grad_norm": 0.480839341878891,
4402
+ "learning_rate": 0.00018744079560540695,
4403
+ "loss": 4.478923797607422,
4404
+ "step": 6130
4405
+ },
4406
+ {
4407
+ "epoch": 1.0373373880723096,
4408
+ "grad_norm": 0.47398701310157776,
4409
+ "learning_rate": 0.000186976768247955,
4410
+ "loss": 4.478921508789062,
4411
+ "step": 6140
4412
+ },
4413
+ {
4414
+ "epoch": 1.039026862645717,
4415
+ "grad_norm": 0.4890805780887604,
4416
+ "learning_rate": 0.00018651236382683225,
4417
+ "loss": 4.468624877929687,
4418
+ "step": 6150
4419
+ },
4420
+ {
4421
+ "epoch": 1.0407163372191248,
4422
+ "grad_norm": 0.49367958307266235,
4423
+ "learning_rate": 0.0001860475870777157,
4424
+ "loss": 4.472190475463867,
4425
+ "step": 6160
4426
+ },
4427
+ {
4428
+ "epoch": 1.0424058117925326,
4429
+ "grad_norm": 0.4590769112110138,
4430
+ "learning_rate": 0.0001855824427400793,
4431
+ "loss": 4.449500274658203,
4432
+ "step": 6170
4433
+ },
4434
+ {
4435
+ "epoch": 1.0440952863659403,
4436
+ "grad_norm": 0.4810253381729126,
4437
+ "learning_rate": 0.00018511693555714535,
4438
+ "loss": 4.490542221069336,
4439
+ "step": 6180
4440
+ },
4441
+ {
4442
+ "epoch": 1.0457847609393478,
4443
+ "grad_norm": 0.5299515128135681,
4444
+ "learning_rate": 0.00018465107027583615,
4445
+ "loss": 4.474026489257812,
4446
+ "step": 6190
4447
+ },
4448
+ {
4449
+ "epoch": 1.0474742355127555,
4450
+ "grad_norm": 0.4833298623561859,
4451
+ "learning_rate": 0.00018418485164672574,
4452
+ "loss": 4.473223114013672,
4453
+ "step": 6200
4454
+ },
4455
+ {
4456
+ "epoch": 1.0491637100861633,
4457
+ "grad_norm": 0.4987802803516388,
4458
+ "learning_rate": 0.00018371828442399128,
4459
+ "loss": 4.467764663696289,
4460
+ "step": 6210
4461
+ },
4462
+ {
4463
+ "epoch": 1.0508531846595708,
4464
+ "grad_norm": 0.49086934328079224,
4465
+ "learning_rate": 0.00018325137336536464,
4466
+ "loss": 4.441515350341797,
4467
+ "step": 6220
4468
+ },
4469
+ {
4470
+ "epoch": 1.0525426592329785,
4471
+ "grad_norm": 0.5031701326370239,
4472
+ "learning_rate": 0.00018278412323208392,
4473
+ "loss": 4.483510208129883,
4474
+ "step": 6230
4475
+ },
4476
+ {
4477
+ "epoch": 1.0542321338063863,
4478
+ "grad_norm": 0.509184718132019,
4479
+ "learning_rate": 0.00018231653878884486,
4480
+ "loss": 4.485199356079102,
4481
+ "step": 6240
4482
+ },
4483
+ {
4484
+ "epoch": 1.055921608379794,
4485
+ "grad_norm": 0.48335397243499756,
4486
+ "learning_rate": 0.00018184862480375233,
4487
+ "loss": 4.454570388793945,
4488
+ "step": 6250
4489
+ },
4490
+ {
4491
+ "epoch": 1.0576110829532015,
4492
+ "grad_norm": 0.5146468281745911,
4493
+ "learning_rate": 0.00018138038604827153,
4494
+ "loss": 4.477815628051758,
4495
+ "step": 6260
4496
+ },
4497
+ {
4498
+ "epoch": 1.0593005575266092,
4499
+ "grad_norm": 0.5049527883529663,
4500
+ "learning_rate": 0.0001809118272971795,
4501
+ "loss": 4.445434951782227,
4502
+ "step": 6270
4503
+ },
4504
+ {
4505
+ "epoch": 1.060990032100017,
4506
+ "grad_norm": 0.47304192185401917,
4507
+ "learning_rate": 0.0001804429533285164,
4508
+ "loss": 4.458169555664062,
4509
+ "step": 6280
4510
+ },
4511
+ {
4512
+ "epoch": 1.0626795066734245,
4513
+ "grad_norm": 0.4755364954471588,
4514
+ "learning_rate": 0.00017997376892353668,
4515
+ "loss": 4.495440292358398,
4516
+ "step": 6290
4517
+ },
4518
+ {
4519
+ "epoch": 1.0643689812468322,
4520
+ "grad_norm": 0.49506038427352905,
4521
+ "learning_rate": 0.0001795042788666605,
4522
+ "loss": 4.4639404296875,
4523
+ "step": 6300
4524
+ },
4525
+ {
4526
+ "epoch": 1.06605845582024,
4527
+ "grad_norm": 0.5216291546821594,
4528
+ "learning_rate": 0.00017903448794542488,
4529
+ "loss": 4.4542278289794925,
4530
+ "step": 6310
4531
+ },
4532
+ {
4533
+ "epoch": 1.0677479303936477,
4534
+ "grad_norm": 0.5284595489501953,
4535
+ "learning_rate": 0.00017856440095043464,
4536
+ "loss": 4.479632186889648,
4537
+ "step": 6320
4538
+ },
4539
+ {
4540
+ "epoch": 1.0694374049670552,
4541
+ "grad_norm": 0.5182107090950012,
4542
+ "learning_rate": 0.00017809402267531405,
4543
+ "loss": 4.4362133026123045,
4544
+ "step": 6330
4545
+ },
4546
+ {
4547
+ "epoch": 1.071126879540463,
4548
+ "grad_norm": 0.5018042922019958,
4549
+ "learning_rate": 0.00017762335791665735,
4550
+ "loss": 4.452248001098633,
4551
+ "step": 6340
4552
+ },
4553
+ {
4554
+ "epoch": 1.0728163541138707,
4555
+ "grad_norm": 0.5280482172966003,
4556
+ "learning_rate": 0.00017715241147398035,
4557
+ "loss": 4.464836120605469,
4558
+ "step": 6350
4559
+ },
4560
+ {
4561
+ "epoch": 1.0745058286872782,
4562
+ "grad_norm": 0.47761428356170654,
4563
+ "learning_rate": 0.00017668118814967126,
4564
+ "loss": 4.447597503662109,
4565
+ "step": 6360
4566
+ },
4567
+ {
4568
+ "epoch": 1.076195303260686,
4569
+ "grad_norm": 0.4841929078102112,
4570
+ "learning_rate": 0.00017620969274894163,
4571
+ "loss": 4.4613292694091795,
4572
+ "step": 6370
4573
+ },
4574
+ {
4575
+ "epoch": 1.0778847778340936,
4576
+ "grad_norm": 0.5038534998893738,
4577
+ "learning_rate": 0.00017573793007977763,
4578
+ "loss": 4.451330184936523,
4579
+ "step": 6380
4580
+ },
4581
+ {
4582
+ "epoch": 1.0795742524075012,
4583
+ "grad_norm": 0.5004971027374268,
4584
+ "learning_rate": 0.0001752659049528906,
4585
+ "loss": 4.457633972167969,
4586
+ "step": 6390
4587
+ },
4588
+ {
4589
+ "epoch": 1.081263726980909,
4590
+ "grad_norm": 0.5123668909072876,
4591
+ "learning_rate": 0.00017479362218166854,
4592
+ "loss": 4.443200302124024,
4593
+ "step": 6400
4594
+ },
4595
+ {
4596
+ "epoch": 1.0829532015543166,
4597
+ "grad_norm": 0.5099160075187683,
4598
+ "learning_rate": 0.0001743210865821265,
4599
+ "loss": 4.436219787597656,
4600
+ "step": 6410
4601
+ },
4602
+ {
4603
+ "epoch": 1.0846426761277244,
4604
+ "grad_norm": 0.5162463784217834,
4605
+ "learning_rate": 0.0001738483029728578,
4606
+ "loss": 4.45533561706543,
4607
+ "step": 6420
4608
+ },
4609
+ {
4610
+ "epoch": 1.0863321507011319,
4611
+ "grad_norm": 0.5178755521774292,
4612
+ "learning_rate": 0.00017337527617498474,
4613
+ "loss": 4.48522720336914,
4614
+ "step": 6430
4615
+ },
4616
+ {
4617
+ "epoch": 1.0880216252745396,
4618
+ "grad_norm": 0.49394717812538147,
4619
+ "learning_rate": 0.0001729020110121096,
4620
+ "loss": 4.447189712524414,
4621
+ "step": 6440
4622
+ },
4623
+ {
4624
+ "epoch": 1.0897110998479473,
4625
+ "grad_norm": 0.4908885955810547,
4626
+ "learning_rate": 0.0001724285123102652,
4627
+ "loss": 4.457671737670898,
4628
+ "step": 6450
4629
+ },
4630
+ {
4631
+ "epoch": 1.091400574421355,
4632
+ "grad_norm": 0.5045267343521118,
4633
+ "learning_rate": 0.00017195478489786593,
4634
+ "loss": 4.435376358032227,
4635
+ "step": 6460
4636
+ },
4637
+ {
4638
+ "epoch": 1.0930900489947626,
4639
+ "grad_norm": 0.5065691471099854,
4640
+ "learning_rate": 0.00017148083360565836,
4641
+ "loss": 4.435953903198242,
4642
+ "step": 6470
4643
+ },
4644
+ {
4645
+ "epoch": 1.0947795235681703,
4646
+ "grad_norm": 0.4825722575187683,
4647
+ "learning_rate": 0.00017100666326667202,
4648
+ "loss": 4.4766490936279295,
4649
+ "step": 6480
4650
+ },
4651
+ {
4652
+ "epoch": 1.096468998141578,
4653
+ "grad_norm": 0.4787653982639313,
4654
+ "learning_rate": 0.00017053227871617027,
4655
+ "loss": 4.448079299926758,
4656
+ "step": 6490
4657
+ },
4658
+ {
4659
+ "epoch": 1.0981584727149856,
4660
+ "grad_norm": 0.5119611024856567,
4661
+ "learning_rate": 0.00017005768479160064,
4662
+ "loss": 4.452360534667969,
4663
+ "step": 6500
4664
+ },
4665
+ {
4666
+ "epoch": 1.0981584727149856,
4667
+ "eval_loss": 4.429732799530029,
4668
+ "eval_runtime": 3.6558,
4669
+ "eval_samples_per_second": 273.54,
4670
+ "eval_steps_per_second": 5.744,
4671
+ "step": 6500
4672
  }
4673
  ],
4674
  "logging_steps": 10,
 
4688
  "attributes": {}
4689
  }
4690
  },
4691
+ "total_flos": 2.1739484320314163e+17,
4692
  "train_batch_size": 48,
4693
  "trial_name": null,
4694
  "trial_params": null