FormlessAI commited on
Commit
3f687a8
·
verified ·
1 Parent(s): 04e989d

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:361c716e9d659af0f39bdbc4ae43e035e27a740b934d15518ada4b2cf9b6eda5
3
  size 1037269336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f371c80024225652f3c86237652abc582dcf0f83241d7917c781b6818ee9f107
3
  size 1037269336
last-checkpoint/global_step3050/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9702b049d245d16c002c1456338f72710c06f952b0478544008ad989a5de7e07
3
+ size 781993445
last-checkpoint/global_step3050/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4294cab9b940c0087225b9c7649512d19f24fa6a3f5f01b7538b149f5d7be8ab
3
+ size 781993509
last-checkpoint/global_step3050/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11d7132340fd716524b3218b067e868d891edb7ac95bd8b57e53e48ee29e0838
3
+ size 781993509
last-checkpoint/global_step3050/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6c4a5a81b58fe1331524664aed42e5082b13c66d3fadfb6645ff1316cf85beb
3
+ size 781993509
last-checkpoint/global_step3050/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2d5e8ed0bb6d30443a8b0f9ac2b192e359d428409f7528391fcd07030056005
3
+ size 2610290277
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step2850
 
1
+ global_step3050
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:296a6ee956d4e7f171ae24e548556b8cc9db16a2b3b5267a93081e841cd2d54f
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e467591174f4d5f061fdc6867a8959bae4dd3ff9f561e079a51d1986c3871bef
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a52bed46ef108ac8871b7898a08f1745f27bbe75c31d481ed89e7717758ce61
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2af72cce7586fc024c88a31600f7b9bd8f97fac8953bf342b40bab89d92f4d3d
3
  size 15429
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e051a87038e9d1c34a6b99d96282794b5fc3a153ac0563f4cb4e418c57165626
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:267b8e17d32bdaab462ce2a11855474cca07a7c3d899baff6bd1f852d0f4b42e
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcbdd5111ef3195d3f21479a27577b5cfe61981c34932671437006ec501d9f4d
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69cff40eb67e607ef56c9df4fce05c9d4f61aef835fb92458f77bb2b8ff22109
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91e18d52b697cf338b1ff89fd713a420b43ebaabb684d60d0dea3fb8b664d9d7
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e4f4eb8437c35cc3bc21ff2f135541f1bea2ca5b0d67f12d8ea935606929e82
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 2.0408315658569336,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4143044047099869,
6
  "eval_steps": 50,
7
- "global_step": 2850,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4454,6 +4454,318 @@
4454
  "eval_samples_per_second": 173.899,
4455
  "eval_steps_per_second": 10.905,
4456
  "step": 2850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4457
  }
4458
  ],
4459
  "logging_steps": 5,
@@ -4482,7 +4794,7 @@
4482
  "attributes": {}
4483
  }
4484
  },
4485
- "total_flos": 7.41822245675991e+17,
4486
  "train_batch_size": 4,
4487
  "trial_name": null,
4488
  "trial_params": null
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 2.019763946533203,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.44337839802296847,
6
  "eval_steps": 50,
7
+ "global_step": 3050,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4454
  "eval_samples_per_second": 173.899,
4455
  "eval_steps_per_second": 10.905,
4456
  "step": 2850
4457
+ },
4458
+ {
4459
+ "epoch": 0.4150312545428115,
4460
+ "grad_norm": 2.5634236335754395,
4461
+ "learning_rate": 8.225223779709644e-05,
4462
+ "loss": 2.143,
4463
+ "step": 2855
4464
+ },
4465
+ {
4466
+ "epoch": 0.415758104375636,
4467
+ "grad_norm": 2.3738303184509277,
4468
+ "learning_rate": 8.219124026368057e-05,
4469
+ "loss": 2.2716,
4470
+ "step": 2860
4471
+ },
4472
+ {
4473
+ "epoch": 0.41648495420846054,
4474
+ "grad_norm": 2.5236852169036865,
4475
+ "learning_rate": 8.213016193649599e-05,
4476
+ "loss": 2.1629,
4477
+ "step": 2865
4478
+ },
4479
+ {
4480
+ "epoch": 0.41721180404128505,
4481
+ "grad_norm": 2.6418023109436035,
4482
+ "learning_rate": 8.206900296930731e-05,
4483
+ "loss": 2.2035,
4484
+ "step": 2870
4485
+ },
4486
+ {
4487
+ "epoch": 0.4179386538741096,
4488
+ "grad_norm": 2.70849609375,
4489
+ "learning_rate": 8.200776351608213e-05,
4490
+ "loss": 2.1833,
4491
+ "step": 2875
4492
+ },
4493
+ {
4494
+ "epoch": 0.4186655037069342,
4495
+ "grad_norm": 2.2143619060516357,
4496
+ "learning_rate": 8.194644373099076e-05,
4497
+ "loss": 2.1152,
4498
+ "step": 2880
4499
+ },
4500
+ {
4501
+ "epoch": 0.4193923535397587,
4502
+ "grad_norm": 2.5225560665130615,
4503
+ "learning_rate": 8.18850437684056e-05,
4504
+ "loss": 2.2915,
4505
+ "step": 2885
4506
+ },
4507
+ {
4508
+ "epoch": 0.42011920337258324,
4509
+ "grad_norm": 2.643038511276245,
4510
+ "learning_rate": 8.182356378290107e-05,
4511
+ "loss": 2.2131,
4512
+ "step": 2890
4513
+ },
4514
+ {
4515
+ "epoch": 0.42084605320540774,
4516
+ "grad_norm": 2.9499423503875732,
4517
+ "learning_rate": 8.17620039292529e-05,
4518
+ "loss": 2.2959,
4519
+ "step": 2895
4520
+ },
4521
+ {
4522
+ "epoch": 0.4215729030382323,
4523
+ "grad_norm": 2.53491473197937,
4524
+ "learning_rate": 8.170036436243797e-05,
4525
+ "loss": 2.1247,
4526
+ "step": 2900
4527
+ },
4528
+ {
4529
+ "epoch": 0.4215729030382323,
4530
+ "eval_loss": 2.030867576599121,
4531
+ "eval_runtime": 21.6628,
4532
+ "eval_samples_per_second": 152.381,
4533
+ "eval_steps_per_second": 9.556,
4534
+ "step": 2900
4535
+ },
4536
+ {
4537
+ "epoch": 0.42229975287105687,
4538
+ "grad_norm": 2.75742769241333,
4539
+ "learning_rate": 8.163864523763382e-05,
4540
+ "loss": 1.9965,
4541
+ "step": 2905
4542
+ },
4543
+ {
4544
+ "epoch": 0.42302660270388137,
4545
+ "grad_norm": 4.27183198928833,
4546
+ "learning_rate": 8.157684671021828e-05,
4547
+ "loss": 2.1029,
4548
+ "step": 2910
4549
+ },
4550
+ {
4551
+ "epoch": 0.42375345253670593,
4552
+ "grad_norm": 2.9568264484405518,
4553
+ "learning_rate": 8.151496893576904e-05,
4554
+ "loss": 2.2166,
4555
+ "step": 2915
4556
+ },
4557
+ {
4558
+ "epoch": 0.42448030236953044,
4559
+ "grad_norm": 2.716278314590454,
4560
+ "learning_rate": 8.145301207006335e-05,
4561
+ "loss": 2.1629,
4562
+ "step": 2920
4563
+ },
4564
+ {
4565
+ "epoch": 0.425207152202355,
4566
+ "grad_norm": 2.635277032852173,
4567
+ "learning_rate": 8.139097626907753e-05,
4568
+ "loss": 2.2077,
4569
+ "step": 2925
4570
+ },
4571
+ {
4572
+ "epoch": 0.4259340020351795,
4573
+ "grad_norm": 2.677725076675415,
4574
+ "learning_rate": 8.132886168898666e-05,
4575
+ "loss": 2.2313,
4576
+ "step": 2930
4577
+ },
4578
+ {
4579
+ "epoch": 0.42666085186800407,
4580
+ "grad_norm": 2.510044813156128,
4581
+ "learning_rate": 8.12666684861641e-05,
4582
+ "loss": 2.03,
4583
+ "step": 2935
4584
+ },
4585
+ {
4586
+ "epoch": 0.4273877017008286,
4587
+ "grad_norm": 2.279388904571533,
4588
+ "learning_rate": 8.120439681718117e-05,
4589
+ "loss": 2.1885,
4590
+ "step": 2940
4591
+ },
4592
+ {
4593
+ "epoch": 0.42811455153365313,
4594
+ "grad_norm": 2.61489200592041,
4595
+ "learning_rate": 8.114204683880671e-05,
4596
+ "loss": 2.2475,
4597
+ "step": 2945
4598
+ },
4599
+ {
4600
+ "epoch": 0.4288414013664777,
4601
+ "grad_norm": 2.7564356327056885,
4602
+ "learning_rate": 8.107961870800672e-05,
4603
+ "loss": 2.2717,
4604
+ "step": 2950
4605
+ },
4606
+ {
4607
+ "epoch": 0.4288414013664777,
4608
+ "eval_loss": 2.0410735607147217,
4609
+ "eval_runtime": 19.0203,
4610
+ "eval_samples_per_second": 173.552,
4611
+ "eval_steps_per_second": 10.883,
4612
+ "step": 2950
4613
+ },
4614
+ {
4615
+ "epoch": 0.4295682511993022,
4616
+ "grad_norm": 2.354588270187378,
4617
+ "learning_rate": 8.101711258194397e-05,
4618
+ "loss": 2.0337,
4619
+ "step": 2955
4620
+ },
4621
+ {
4622
+ "epoch": 0.43029510103212676,
4623
+ "grad_norm": 2.4436914920806885,
4624
+ "learning_rate": 8.095452861797751e-05,
4625
+ "loss": 2.0731,
4626
+ "step": 2960
4627
+ },
4628
+ {
4629
+ "epoch": 0.4310219508649513,
4630
+ "grad_norm": 2.4441328048706055,
4631
+ "learning_rate": 8.089186697366247e-05,
4632
+ "loss": 2.0913,
4633
+ "step": 2965
4634
+ },
4635
+ {
4636
+ "epoch": 0.4317488006977758,
4637
+ "grad_norm": 2.439755916595459,
4638
+ "learning_rate": 8.082912780674939e-05,
4639
+ "loss": 1.9794,
4640
+ "step": 2970
4641
+ },
4642
+ {
4643
+ "epoch": 0.4324756505306004,
4644
+ "grad_norm": 3.0894908905029297,
4645
+ "learning_rate": 8.076631127518407e-05,
4646
+ "loss": 2.2068,
4647
+ "step": 2975
4648
+ },
4649
+ {
4650
+ "epoch": 0.4332025003634249,
4651
+ "grad_norm": 2.3073198795318604,
4652
+ "learning_rate": 8.070341753710708e-05,
4653
+ "loss": 2.153,
4654
+ "step": 2980
4655
+ },
4656
+ {
4657
+ "epoch": 0.43392935019624945,
4658
+ "grad_norm": 2.387176513671875,
4659
+ "learning_rate": 8.06404467508533e-05,
4660
+ "loss": 2.0941,
4661
+ "step": 2985
4662
+ },
4663
+ {
4664
+ "epoch": 0.434656200029074,
4665
+ "grad_norm": 2.364358425140381,
4666
+ "learning_rate": 8.057739907495163e-05,
4667
+ "loss": 2.1182,
4668
+ "step": 2990
4669
+ },
4670
+ {
4671
+ "epoch": 0.4353830498618985,
4672
+ "grad_norm": 2.8649942874908447,
4673
+ "learning_rate": 8.05142746681245e-05,
4674
+ "loss": 2.0715,
4675
+ "step": 2995
4676
+ },
4677
+ {
4678
+ "epoch": 0.4361098996947231,
4679
+ "grad_norm": 2.504004716873169,
4680
+ "learning_rate": 8.045107368928755e-05,
4681
+ "loss": 2.183,
4682
+ "step": 3000
4683
+ },
4684
+ {
4685
+ "epoch": 0.4361098996947231,
4686
+ "eval_loss": 2.0339367389678955,
4687
+ "eval_runtime": 19.159,
4688
+ "eval_samples_per_second": 172.295,
4689
+ "eval_steps_per_second": 10.804,
4690
+ "step": 3000
4691
+ },
4692
+ {
4693
+ "epoch": 0.4368367495275476,
4694
+ "grad_norm": 2.4988174438476562,
4695
+ "learning_rate": 8.038779629754915e-05,
4696
+ "loss": 2.1443,
4697
+ "step": 3005
4698
+ },
4699
+ {
4700
+ "epoch": 0.43756359936037215,
4701
+ "grad_norm": 2.5082359313964844,
4702
+ "learning_rate": 8.032444265221006e-05,
4703
+ "loss": 2.0544,
4704
+ "step": 3010
4705
+ },
4706
+ {
4707
+ "epoch": 0.4382904491931967,
4708
+ "grad_norm": 2.3334364891052246,
4709
+ "learning_rate": 8.026101291276302e-05,
4710
+ "loss": 2.1904,
4711
+ "step": 3015
4712
+ },
4713
+ {
4714
+ "epoch": 0.4390172990260212,
4715
+ "grad_norm": 2.405759572982788,
4716
+ "learning_rate": 8.019750723889232e-05,
4717
+ "loss": 2.0836,
4718
+ "step": 3020
4719
+ },
4720
+ {
4721
+ "epoch": 0.4397441488588458,
4722
+ "grad_norm": 2.2676541805267334,
4723
+ "learning_rate": 8.013392579047339e-05,
4724
+ "loss": 2.1745,
4725
+ "step": 3025
4726
+ },
4727
+ {
4728
+ "epoch": 0.4404709986916703,
4729
+ "grad_norm": 2.144158124923706,
4730
+ "learning_rate": 8.00702687275725e-05,
4731
+ "loss": 2.2107,
4732
+ "step": 3030
4733
+ },
4734
+ {
4735
+ "epoch": 0.44119784852449484,
4736
+ "grad_norm": 2.9987900257110596,
4737
+ "learning_rate": 8.000653621044621e-05,
4738
+ "loss": 2.1826,
4739
+ "step": 3035
4740
+ },
4741
+ {
4742
+ "epoch": 0.4419246983573194,
4743
+ "grad_norm": 2.3955495357513428,
4744
+ "learning_rate": 7.994272839954103e-05,
4745
+ "loss": 2.1445,
4746
+ "step": 3040
4747
+ },
4748
+ {
4749
+ "epoch": 0.4426515481901439,
4750
+ "grad_norm": 3.0471301078796387,
4751
+ "learning_rate": 7.987884545549309e-05,
4752
+ "loss": 2.1338,
4753
+ "step": 3045
4754
+ },
4755
+ {
4756
+ "epoch": 0.44337839802296847,
4757
+ "grad_norm": 2.6408660411834717,
4758
+ "learning_rate": 7.981488753912759e-05,
4759
+ "loss": 2.1363,
4760
+ "step": 3050
4761
+ },
4762
+ {
4763
+ "epoch": 0.44337839802296847,
4764
+ "eval_loss": 2.019763946533203,
4765
+ "eval_runtime": 18.8959,
4766
+ "eval_samples_per_second": 174.694,
4767
+ "eval_steps_per_second": 10.955,
4768
+ "step": 3050
4769
  }
4770
  ],
4771
  "logging_steps": 5,
 
4794
  "attributes": {}
4795
  }
4796
  },
4797
+ "total_flos": 7.952395068977971e+17,
4798
  "train_batch_size": 4,
4799
  "trial_name": null,
4800
  "trial_params": null