FormlessAI commited on
Commit
bbdff17
·
verified ·
1 Parent(s): 7a6c2ab

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4309676b677f4c45fa6920d209e1c2f12b3e4ee7403bed0916c1f32cebbb28b1
3
  size 1037269336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0ccb8a0129c7d9eaf16b9290d28992901ea364a3eb99fab68c551e58865ad1d
3
  size 1037269336
last-checkpoint/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8582c27675be97fbe175662f0617ffe889ff3637d47df468a239c01727d77734
3
+ size 781993445
last-checkpoint/global_step500/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6016bca51cbed4fb5c504cbb187a0a788949484fb1cb48367af54ed9a5386209
3
+ size 781993509
last-checkpoint/global_step500/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2054e8d1388acd691635ad517a65efb430d628b720c279ee8d82dd1c29deefa8
3
+ size 781993509
last-checkpoint/global_step500/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0669a96fa76f6656565eb40cf556723e5d0cf16d8246191dc18bef5b155485d2
3
+ size 781993509
last-checkpoint/global_step500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ac632624f93cba312efba8e16ea77a85d2d3ba6cb34c3a2b435a69c990f108c
3
+ size 2610290277
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step350
 
1
+ global_step500
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d21dde17f9d9a99170acf034e536c5632372f501fec0f61fa850b399a279e4b
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee5d289ec7768cbf0e07f6f91891b3cd40d731766941a42a578606b1c1b8dc08
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05603e4e06b9be10365e879b5235243dbbfe82cd9517e88d67d00e72d67835a5
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d94bb40a8ce66db77dee5b9f49872d85599609de34f6189c8101364fa21ff9b
3
  size 15429
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ed4a37f9e4fd517d3e6e7b7e7c7a2c363dff932cd44578c70bbefdee8b0e2e9
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c477cacf16acc18aecb2516f63a1ba0461197152443bea744139c7ff46a46f73
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6eebac46efa7f2bd7fd13551dc3528de444025722f566ee67f960be4415bda97
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:810b92355364d8d84b8db0ab868b5a2796dc3ae691f062feeffae3690cbf1153
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4457fca0aac48cf0905dae3a185a31d975d5e88a968660d320a67ba2923e9a35
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e72eb35b7c5b7fc898e5e035cd606aabe0783358153b154eea9980f36a0e64c
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 2.3381261825561523,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.05087948829771769,
6
  "eval_steps": 50,
7
- "global_step": 350,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -554,6 +554,240 @@
554
  "eval_samples_per_second": 174.411,
555
  "eval_steps_per_second": 10.937,
556
  "step": 350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
  }
558
  ],
559
  "logging_steps": 5,
@@ -582,7 +816,7 @@
582
  "attributes": {}
583
  }
584
  },
585
- "total_flos": 9.118819589449318e+16,
586
  "train_batch_size": 4,
587
  "trial_name": null,
588
  "trial_params": null
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 2.276088237762451,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.07268498328245385,
6
  "eval_steps": 50,
7
+ "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
554
  "eval_samples_per_second": 174.411,
555
  "eval_steps_per_second": 10.937,
556
  "step": 350
557
+ },
558
+ {
559
+ "epoch": 0.05160633813054223,
560
+ "grad_norm": 2.7642250061035156,
561
+ "learning_rate": 0.00010003408475268547,
562
+ "loss": 2.293,
563
+ "step": 355
564
+ },
565
+ {
566
+ "epoch": 0.05233318796336677,
567
+ "grad_norm": 2.7389674186706543,
568
+ "learning_rate": 0.00010002762180560444,
569
+ "loss": 2.4124,
570
+ "step": 360
571
+ },
572
+ {
573
+ "epoch": 0.053060037796191305,
574
+ "grad_norm": 2.504678726196289,
575
+ "learning_rate": 0.00010002103316168538,
576
+ "loss": 2.3363,
577
+ "step": 365
578
+ },
579
+ {
580
+ "epoch": 0.053786887629015845,
581
+ "grad_norm": 2.936523199081421,
582
+ "learning_rate": 0.00010001431883751522,
583
+ "loss": 2.5376,
584
+ "step": 370
585
+ },
586
+ {
587
+ "epoch": 0.054513737461840385,
588
+ "grad_norm": 2.2680201530456543,
589
+ "learning_rate": 0.00010000747884999726,
590
+ "loss": 2.351,
591
+ "step": 375
592
+ },
593
+ {
594
+ "epoch": 0.055240587294664925,
595
+ "grad_norm": 2.2338175773620605,
596
+ "learning_rate": 0.00010000051321635116,
597
+ "loss": 2.2443,
598
+ "step": 380
599
+ },
600
+ {
601
+ "epoch": 0.05596743712748946,
602
+ "grad_norm": 2.376094341278076,
603
+ "learning_rate": 9.999342195411289e-05,
604
+ "loss": 2.3425,
605
+ "step": 385
606
+ },
607
+ {
608
+ "epoch": 0.056694286960314,
609
+ "grad_norm": 2.237074136734009,
610
+ "learning_rate": 9.998620508113469e-05,
611
+ "loss": 2.2599,
612
+ "step": 390
613
+ },
614
+ {
615
+ "epoch": 0.05742113679313854,
616
+ "grad_norm": 2.188169002532959,
617
+ "learning_rate": 9.997886261558505e-05,
618
+ "loss": 2.4128,
619
+ "step": 395
620
+ },
621
+ {
622
+ "epoch": 0.05814798662596308,
623
+ "grad_norm": 2.172314167022705,
624
+ "learning_rate": 9.99713945759486e-05,
625
+ "loss": 2.2818,
626
+ "step": 400
627
+ },
628
+ {
629
+ "epoch": 0.05814798662596308,
630
+ "eval_loss": 2.3017640113830566,
631
+ "eval_runtime": 23.369,
632
+ "eval_samples_per_second": 141.256,
633
+ "eval_steps_per_second": 8.858,
634
+ "step": 400
635
+ },
636
+ {
637
+ "epoch": 0.05887483645878761,
638
+ "grad_norm": 2.5056467056274414,
639
+ "learning_rate": 9.996380098102613e-05,
640
+ "loss": 2.3248,
641
+ "step": 405
642
+ },
643
+ {
644
+ "epoch": 0.05960168629161215,
645
+ "grad_norm": 2.3642866611480713,
646
+ "learning_rate": 9.99560818499345e-05,
647
+ "loss": 2.3405,
648
+ "step": 410
649
+ },
650
+ {
651
+ "epoch": 0.06032853612443669,
652
+ "grad_norm": 2.5992562770843506,
653
+ "learning_rate": 9.994823720210662e-05,
654
+ "loss": 2.2849,
655
+ "step": 415
656
+ },
657
+ {
658
+ "epoch": 0.06105538595726123,
659
+ "grad_norm": 2.5721242427825928,
660
+ "learning_rate": 9.994026705729136e-05,
661
+ "loss": 2.2368,
662
+ "step": 420
663
+ },
664
+ {
665
+ "epoch": 0.061782235790085765,
666
+ "grad_norm": 2.104682207107544,
667
+ "learning_rate": 9.993217143555357e-05,
668
+ "loss": 2.3928,
669
+ "step": 425
670
+ },
671
+ {
672
+ "epoch": 0.06250908562291031,
673
+ "grad_norm": 2.82171630859375,
674
+ "learning_rate": 9.992395035727397e-05,
675
+ "loss": 2.2192,
676
+ "step": 430
677
+ },
678
+ {
679
+ "epoch": 0.06323593545573485,
680
+ "grad_norm": 2.1614480018615723,
681
+ "learning_rate": 9.99156038431491e-05,
682
+ "loss": 2.257,
683
+ "step": 435
684
+ },
685
+ {
686
+ "epoch": 0.06396278528855938,
687
+ "grad_norm": 2.601987600326538,
688
+ "learning_rate": 9.990713191419133e-05,
689
+ "loss": 2.319,
690
+ "step": 440
691
+ },
692
+ {
693
+ "epoch": 0.06468963512138393,
694
+ "grad_norm": 2.4330639839172363,
695
+ "learning_rate": 9.989853459172868e-05,
696
+ "loss": 2.5235,
697
+ "step": 445
698
+ },
699
+ {
700
+ "epoch": 0.06541648495420846,
701
+ "grad_norm": 2.4703032970428467,
702
+ "learning_rate": 9.988981189740496e-05,
703
+ "loss": 2.3522,
704
+ "step": 450
705
+ },
706
+ {
707
+ "epoch": 0.06541648495420846,
708
+ "eval_loss": 2.3036184310913086,
709
+ "eval_runtime": 19.0462,
710
+ "eval_samples_per_second": 173.316,
711
+ "eval_steps_per_second": 10.868,
712
+ "step": 450
713
+ },
714
+ {
715
+ "epoch": 0.06614333478703299,
716
+ "grad_norm": 2.469402313232422,
717
+ "learning_rate": 9.988096385317949e-05,
718
+ "loss": 2.3827,
719
+ "step": 455
720
+ },
721
+ {
722
+ "epoch": 0.06687018461985754,
723
+ "grad_norm": 2.229930877685547,
724
+ "learning_rate": 9.987199048132724e-05,
725
+ "loss": 2.4999,
726
+ "step": 460
727
+ },
728
+ {
729
+ "epoch": 0.06759703445268207,
730
+ "grad_norm": 2.1780316829681396,
731
+ "learning_rate": 9.986289180443866e-05,
732
+ "loss": 2.4012,
733
+ "step": 465
734
+ },
735
+ {
736
+ "epoch": 0.06832388428550662,
737
+ "grad_norm": 2.6075403690338135,
738
+ "learning_rate": 9.985366784541965e-05,
739
+ "loss": 2.1278,
740
+ "step": 470
741
+ },
742
+ {
743
+ "epoch": 0.06905073411833115,
744
+ "grad_norm": 2.6112635135650635,
745
+ "learning_rate": 9.984431862749151e-05,
746
+ "loss": 2.3663,
747
+ "step": 475
748
+ },
749
+ {
750
+ "epoch": 0.06977758395115569,
751
+ "grad_norm": 2.6977710723876953,
752
+ "learning_rate": 9.98348441741909e-05,
753
+ "loss": 2.438,
754
+ "step": 480
755
+ },
756
+ {
757
+ "epoch": 0.07050443378398023,
758
+ "grad_norm": 2.4466493129730225,
759
+ "learning_rate": 9.982524450936976e-05,
760
+ "loss": 2.2735,
761
+ "step": 485
762
+ },
763
+ {
764
+ "epoch": 0.07123128361680477,
765
+ "grad_norm": 2.569622755050659,
766
+ "learning_rate": 9.981551965719518e-05,
767
+ "loss": 2.2769,
768
+ "step": 490
769
+ },
770
+ {
771
+ "epoch": 0.07195813344962931,
772
+ "grad_norm": 2.5103085041046143,
773
+ "learning_rate": 9.980566964214952e-05,
774
+ "loss": 2.3861,
775
+ "step": 495
776
+ },
777
+ {
778
+ "epoch": 0.07268498328245385,
779
+ "grad_norm": 2.7069687843322754,
780
+ "learning_rate": 9.979569448903016e-05,
781
+ "loss": 2.1348,
782
+ "step": 500
783
+ },
784
+ {
785
+ "epoch": 0.07268498328245385,
786
+ "eval_loss": 2.276088237762451,
787
+ "eval_runtime": 18.7696,
788
+ "eval_samples_per_second": 175.869,
789
+ "eval_steps_per_second": 11.028,
790
+ "step": 500
791
  }
792
  ],
793
  "logging_steps": 5,
 
816
  "attributes": {}
817
  }
818
  },
819
+ "total_flos": 1.304977240836014e+17,
820
  "train_batch_size": 4,
821
  "trial_name": null,
822
  "trial_params": null