FormlessAI commited on
Commit
ea7b461
·
verified ·
1 Parent(s): ea8927f

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc277642dc0b06e4ef5f27d5d9e2c2de2e591d599ed3f0e0949f75571b1cc34c
3
  size 1037269336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8574164bb4d11eaf453dcb6ad3966428cd591430ae9c31f0937299ed1a487081
3
  size 1037269336
last-checkpoint/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a09647bd6cf33248479b856f41c1b82c476851b19566acd615bb9266f2b1b0ee
3
+ size 781993445
last-checkpoint/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f60fae928a89055d942fe282de8f7700321637408cadeae0860f984db5297c7
3
+ size 781993509
last-checkpoint/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5e367c94a7f4f9ffb0449a994883423b0ee48aa8ed33c52f9254bc053771053
3
+ size 781993509
last-checkpoint/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22ebe4a089211b888c7db1c62207d175ec46c124ad172ec679adaea45438cb12
3
+ size 781993509
last-checkpoint/global_step5000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6eda741d4f4d77768028d65e555ce867c47e40bb8497ec8a08f5c144c7be204e
3
+ size 2610290277
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step4850
 
1
+ global_step5000
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6240912d01e192733ef6be739d7b09f31a1f74d3c2153dd5b7bb314e27267ccf
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd00f37ba9aa2f280e60110d762d55bd77f2e19074544210642612fc0d0c6aed
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52d878f10e98abb1122007071c71e47bf1782972c530015971c5f9bcece9d472
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da7f2a246e741148e024dc29f274d353214e019d5f548b483c4905c46044d9c6
3
  size 15429
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9fee06aeb30dffa295e5deaef161931b146d5691264c40430f2d7f1d7c37ddf
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59fe33085db221039a6aa12c757a1cedc0cc5b1d3be922c202529c8eb1b8058a
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:731e4717014f6fa0804acda5ac6424642876d39e465767a39ea8e341536660c0
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15166ad530c105df387795709025f21626f6ea307321c73af1fa12ffc3d040d0
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2648a958bbf7c08c15a521db8bff4a4ac3ef2beed98d51a27a6f1dfcf292094
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ccb65ec1efdeb7bb899bcfdbd59da40edf4d90e5de5df4ddf919745dfd59ebe
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 1.8672053813934326,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7050443378398022,
6
  "eval_steps": 50,
7
- "global_step": 4850,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7574,6 +7574,240 @@
7574
  "eval_samples_per_second": 173.724,
7575
  "eval_steps_per_second": 10.894,
7576
  "step": 4850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7577
  }
7578
  ],
7579
  "logging_steps": 5,
@@ -7602,7 +7836,7 @@
7602
  "attributes": {}
7603
  }
7604
  },
7605
- "total_flos": 1.2640433748731494e+18,
7606
  "train_batch_size": 4,
7607
  "trial_name": null,
7608
  "trial_params": null
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 1.8494781255722046,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.7268498328245384,
6
  "eval_steps": 50,
7
+ "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7574
  "eval_samples_per_second": 173.724,
7575
  "eval_steps_per_second": 10.894,
7576
  "step": 4850
7577
+ },
7578
+ {
7579
+ "epoch": 0.7057711876726268,
7580
+ "grad_norm": 2.359971761703491,
7581
+ "learning_rate": 5.321233654341051e-05,
7582
+ "loss": 2.0426,
7583
+ "step": 4855
7584
+ },
7585
+ {
7586
+ "epoch": 0.7064980375054514,
7587
+ "grad_norm": 2.5104758739471436,
7588
+ "learning_rate": 5.3132997135038396e-05,
7589
+ "loss": 2.075,
7590
+ "step": 4860
7591
+ },
7592
+ {
7593
+ "epoch": 0.7072248873382759,
7594
+ "grad_norm": 2.3607850074768066,
7595
+ "learning_rate": 5.305365008699002e-05,
7596
+ "loss": 2.184,
7597
+ "step": 4865
7598
+ },
7599
+ {
7600
+ "epoch": 0.7079517371711005,
7601
+ "grad_norm": 2.6986582279205322,
7602
+ "learning_rate": 5.2974295599021475e-05,
7603
+ "loss": 2.0019,
7604
+ "step": 4870
7605
+ },
7606
+ {
7607
+ "epoch": 0.7086785870039249,
7608
+ "grad_norm": 2.2969441413879395,
7609
+ "learning_rate": 5.289493387090762e-05,
7610
+ "loss": 2.1051,
7611
+ "step": 4875
7612
+ },
7613
+ {
7614
+ "epoch": 0.7094054368367495,
7615
+ "grad_norm": 2.4311702251434326,
7616
+ "learning_rate": 5.2815565102441487e-05,
7617
+ "loss": 2.0222,
7618
+ "step": 4880
7619
+ },
7620
+ {
7621
+ "epoch": 0.7101322866695741,
7622
+ "grad_norm": 2.284479856491089,
7623
+ "learning_rate": 5.273618949343387e-05,
7624
+ "loss": 2.0578,
7625
+ "step": 4885
7626
+ },
7627
+ {
7628
+ "epoch": 0.7108591365023986,
7629
+ "grad_norm": 2.054469108581543,
7630
+ "learning_rate": 5.265680724371276e-05,
7631
+ "loss": 2.0806,
7632
+ "step": 4890
7633
+ },
7634
+ {
7635
+ "epoch": 0.7115859863352232,
7636
+ "grad_norm": 2.0409023761749268,
7637
+ "learning_rate": 5.257741855312288e-05,
7638
+ "loss": 2.1366,
7639
+ "step": 4895
7640
+ },
7641
+ {
7642
+ "epoch": 0.7123128361680476,
7643
+ "grad_norm": 2.3130247592926025,
7644
+ "learning_rate": 5.2498023621525144e-05,
7645
+ "loss": 1.9231,
7646
+ "step": 4900
7647
+ },
7648
+ {
7649
+ "epoch": 0.7123128361680476,
7650
+ "eval_loss": 1.85334312915802,
7651
+ "eval_runtime": 21.9469,
7652
+ "eval_samples_per_second": 150.409,
7653
+ "eval_steps_per_second": 9.432,
7654
+ "step": 4900
7655
+ },
7656
+ {
7657
+ "epoch": 0.7130396860008722,
7658
+ "grad_norm": 2.8905739784240723,
7659
+ "learning_rate": 5.241862264879624e-05,
7660
+ "loss": 2.1506,
7661
+ "step": 4905
7662
+ },
7663
+ {
7664
+ "epoch": 0.7137665358336968,
7665
+ "grad_norm": 1.8220387697219849,
7666
+ "learning_rate": 5.2339215834828e-05,
7667
+ "loss": 1.8484,
7668
+ "step": 4910
7669
+ },
7670
+ {
7671
+ "epoch": 0.7144933856665213,
7672
+ "grad_norm": 2.53902530670166,
7673
+ "learning_rate": 5.225980337952697e-05,
7674
+ "loss": 1.9491,
7675
+ "step": 4915
7676
+ },
7677
+ {
7678
+ "epoch": 0.7152202354993459,
7679
+ "grad_norm": 2.232422351837158,
7680
+ "learning_rate": 5.2180385482813935e-05,
7681
+ "loss": 1.9356,
7682
+ "step": 4920
7683
+ },
7684
+ {
7685
+ "epoch": 0.7159470853321703,
7686
+ "grad_norm": 2.471998691558838,
7687
+ "learning_rate": 5.210096234462335e-05,
7688
+ "loss": 2.0199,
7689
+ "step": 4925
7690
+ },
7691
+ {
7692
+ "epoch": 0.7166739351649949,
7693
+ "grad_norm": 2.3903968334198,
7694
+ "learning_rate": 5.202153416490285e-05,
7695
+ "loss": 2.0745,
7696
+ "step": 4930
7697
+ },
7698
+ {
7699
+ "epoch": 0.7174007849978195,
7700
+ "grad_norm": 2.582702159881592,
7701
+ "learning_rate": 5.1942101143612804e-05,
7702
+ "loss": 2.1917,
7703
+ "step": 4935
7704
+ },
7705
+ {
7706
+ "epoch": 0.718127634830644,
7707
+ "grad_norm": 2.2047088146209717,
7708
+ "learning_rate": 5.186266348072575e-05,
7709
+ "loss": 2.0905,
7710
+ "step": 4940
7711
+ },
7712
+ {
7713
+ "epoch": 0.7188544846634686,
7714
+ "grad_norm": 2.3632895946502686,
7715
+ "learning_rate": 5.178322137622589e-05,
7716
+ "loss": 1.8037,
7717
+ "step": 4945
7718
+ },
7719
+ {
7720
+ "epoch": 0.719581334496293,
7721
+ "grad_norm": 2.1407690048217773,
7722
+ "learning_rate": 5.170377503010865e-05,
7723
+ "loss": 1.9275,
7724
+ "step": 4950
7725
+ },
7726
+ {
7727
+ "epoch": 0.719581334496293,
7728
+ "eval_loss": 1.8587294816970825,
7729
+ "eval_runtime": 19.3641,
7730
+ "eval_samples_per_second": 170.47,
7731
+ "eval_steps_per_second": 10.69,
7732
+ "step": 4950
7733
+ },
7734
+ {
7735
+ "epoch": 0.7203081843291176,
7736
+ "grad_norm": 2.4468822479248047,
7737
+ "learning_rate": 5.16243246423801e-05,
7738
+ "loss": 2.0012,
7739
+ "step": 4955
7740
+ },
7741
+ {
7742
+ "epoch": 0.7210350341619421,
7743
+ "grad_norm": 2.2367379665374756,
7744
+ "learning_rate": 5.15448704130565e-05,
7745
+ "loss": 2.1336,
7746
+ "step": 4960
7747
+ },
7748
+ {
7749
+ "epoch": 0.7217618839947667,
7750
+ "grad_norm": 2.382683515548706,
7751
+ "learning_rate": 5.1465412542163777e-05,
7752
+ "loss": 2.0299,
7753
+ "step": 4965
7754
+ },
7755
+ {
7756
+ "epoch": 0.7224887338275913,
7757
+ "grad_norm": 2.802795648574829,
7758
+ "learning_rate": 5.138595122973702e-05,
7759
+ "loss": 2.1449,
7760
+ "step": 4970
7761
+ },
7762
+ {
7763
+ "epoch": 0.7232155836604157,
7764
+ "grad_norm": 2.422428846359253,
7765
+ "learning_rate": 5.130648667582e-05,
7766
+ "loss": 1.9257,
7767
+ "step": 4975
7768
+ },
7769
+ {
7770
+ "epoch": 0.7239424334932403,
7771
+ "grad_norm": 2.619701862335205,
7772
+ "learning_rate": 5.1227019080464614e-05,
7773
+ "loss": 2.1349,
7774
+ "step": 4980
7775
+ },
7776
+ {
7777
+ "epoch": 0.7246692833260648,
7778
+ "grad_norm": 2.259448289871216,
7779
+ "learning_rate": 5.114754864373048e-05,
7780
+ "loss": 1.9518,
7781
+ "step": 4985
7782
+ },
7783
+ {
7784
+ "epoch": 0.7253961331588894,
7785
+ "grad_norm": 2.466169834136963,
7786
+ "learning_rate": 5.106807556568429e-05,
7787
+ "loss": 2.0608,
7788
+ "step": 4990
7789
+ },
7790
+ {
7791
+ "epoch": 0.726122982991714,
7792
+ "grad_norm": 2.4360663890838623,
7793
+ "learning_rate": 5.098860004639943e-05,
7794
+ "loss": 2.0255,
7795
+ "step": 4995
7796
+ },
7797
+ {
7798
+ "epoch": 0.7268498328245384,
7799
+ "grad_norm": 2.5744364261627197,
7800
+ "learning_rate": 5.0909122285955454e-05,
7801
+ "loss": 2.0253,
7802
+ "step": 5000
7803
+ },
7804
+ {
7805
+ "epoch": 0.7268498328245384,
7806
+ "eval_loss": 1.8494781255722046,
7807
+ "eval_runtime": 19.054,
7808
+ "eval_samples_per_second": 173.245,
7809
+ "eval_steps_per_second": 10.864,
7810
+ "step": 5000
7811
  }
7812
  ],
7813
  "logging_steps": 5,
 
7836
  "attributes": {}
7837
  }
7838
  },
7839
+ "total_flos": 1.3038086059374674e+18,
7840
  "train_batch_size": 4,
7841
  "trial_name": null,
7842
  "trial_params": null