AiAF commited on
Commit
283c16a
·
verified ·
1 Parent(s): 75de095

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7ce7d64d20fdf389d531e9adbdd3b5ff3d852fea5307986d9255b260474f590
3
  size 332316480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60f39bf0e2d04be586e3cdf61f38e4002d8f4ccf2b3e7506a28c3eebf56ae883
3
  size 332316480
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:725564e9f8183d7e28b741e7e93b7d22f858ed11067577a8e0e1c8143b298950
3
  size 169158677
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d52cab9303197d3ac09cad8604a16b3cd57e8bcfc3339e579cfd996a7044c903
3
  size 169158677
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:901a91e07bc4351a859fdeb9218e622accee766b4b0a88f88399d63dfa225750
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:391d73924ccf821cbbf0fdb4254c0376997b82cf4a16ef088c45a6a109b91100
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c08494ee770db422d5f2ff781935e817305fb84dab453dfcf5a5df2443b7693d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f94ebdc28f5491fc51fc2ecbab5d9e2e3ba6be348d92d880d778a28fcd2cbce
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.95,
6
  "eval_steps": 50,
7
- "global_step": 950,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -10678,6 +10678,567 @@
10678
  "memory/max_active (GiB)": 7.78,
10679
  "memory/max_allocated (GiB)": 7.78,
10680
  "step": 950
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10681
  }
10682
  ],
10683
  "logging_steps": 1,
@@ -10692,12 +11253,12 @@
10692
  "should_evaluate": false,
10693
  "should_log": false,
10694
  "should_save": true,
10695
- "should_training_stop": false
10696
  },
10697
  "attributes": {}
10698
  }
10699
  },
10700
- "total_flos": 1.9677301637854003e+17,
10701
  "train_batch_size": 1,
10702
  "trial_name": null,
10703
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
  "eval_steps": 50,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
10678
  "memory/max_active (GiB)": 7.78,
10679
  "memory/max_allocated (GiB)": 7.78,
10680
  "step": 950
10681
+ },
10682
+ {
10683
+ "epoch": 0.951,
10684
+ "grad_norm": 0.8099629878997803,
10685
+ "learning_rate": 1.30832912661093e-06,
10686
+ "loss": 2.287,
10687
+ "memory/device_reserved (GiB)": 17.74,
10688
+ "memory/max_active (GiB)": 17.43,
10689
+ "memory/max_allocated (GiB)": 17.43,
10690
+ "step": 951,
10691
+ "tokens_per_second_per_gpu": 997.07
10692
+ },
10693
+ {
10694
+ "epoch": 0.952,
10695
+ "grad_norm": 0.910591185092926,
10696
+ "learning_rate": 1.2566280820298426e-06,
10697
+ "loss": 2.2643,
10698
+ "memory/device_reserved (GiB)": 17.78,
10699
+ "memory/max_active (GiB)": 17.43,
10700
+ "memory/max_allocated (GiB)": 17.43,
10701
+ "step": 952,
10702
+ "tokens_per_second_per_gpu": 821.07
10703
+ },
10704
+ {
10705
+ "epoch": 0.953,
10706
+ "grad_norm": 0.8110288381576538,
10707
+ "learning_rate": 1.2059628086956044e-06,
10708
+ "loss": 2.3573,
10709
+ "memory/device_reserved (GiB)": 17.78,
10710
+ "memory/max_active (GiB)": 17.43,
10711
+ "memory/max_allocated (GiB)": 17.43,
10712
+ "step": 953,
10713
+ "tokens_per_second_per_gpu": 1025.55
10714
+ },
10715
+ {
10716
+ "epoch": 0.954,
10717
+ "grad_norm": 0.8043859004974365,
10718
+ "learning_rate": 1.1563338380629618e-06,
10719
+ "loss": 2.5223,
10720
+ "memory/device_reserved (GiB)": 17.78,
10721
+ "memory/max_active (GiB)": 17.43,
10722
+ "memory/max_allocated (GiB)": 17.43,
10723
+ "step": 954,
10724
+ "tokens_per_second_per_gpu": 1124.93
10725
+ },
10726
+ {
10727
+ "epoch": 0.955,
10728
+ "grad_norm": 0.7169449329376221,
10729
+ "learning_rate": 1.1077416907163574e-06,
10730
+ "loss": 2.1511,
10731
+ "memory/device_reserved (GiB)": 17.79,
10732
+ "memory/max_active (GiB)": 17.43,
10733
+ "memory/max_allocated (GiB)": 17.43,
10734
+ "step": 955,
10735
+ "tokens_per_second_per_gpu": 1207.88
10736
+ },
10737
+ {
10738
+ "epoch": 0.956,
10739
+ "grad_norm": 0.9546728730201721,
10740
+ "learning_rate": 1.0601868763643996e-06,
10741
+ "loss": 2.3221,
10742
+ "memory/device_reserved (GiB)": 17.79,
10743
+ "memory/max_active (GiB)": 17.43,
10744
+ "memory/max_allocated (GiB)": 17.43,
10745
+ "step": 956,
10746
+ "tokens_per_second_per_gpu": 715.9
10747
+ },
10748
+ {
10749
+ "epoch": 0.957,
10750
+ "grad_norm": 0.9332824945449829,
10751
+ "learning_rate": 1.0136698938346011e-06,
10752
+ "loss": 2.4007,
10753
+ "memory/device_reserved (GiB)": 17.79,
10754
+ "memory/max_active (GiB)": 17.43,
10755
+ "memory/max_allocated (GiB)": 17.43,
10756
+ "step": 957,
10757
+ "tokens_per_second_per_gpu": 909.16
10758
+ },
10759
+ {
10760
+ "epoch": 0.958,
10761
+ "grad_norm": 0.948166012763977,
10762
+ "learning_rate": 9.68191231068083e-07,
10763
+ "loss": 2.2667,
10764
+ "memory/device_reserved (GiB)": 17.79,
10765
+ "memory/max_active (GiB)": 17.43,
10766
+ "memory/max_allocated (GiB)": 17.43,
10767
+ "step": 958,
10768
+ "tokens_per_second_per_gpu": 739.18
10769
+ },
10770
+ {
10771
+ "epoch": 0.959,
10772
+ "grad_norm": 0.7676699161529541,
10773
+ "learning_rate": 9.237513651145225e-07,
10774
+ "loss": 2.1496,
10775
+ "memory/device_reserved (GiB)": 17.79,
10776
+ "memory/max_active (GiB)": 17.43,
10777
+ "memory/max_allocated (GiB)": 17.43,
10778
+ "step": 959,
10779
+ "tokens_per_second_per_gpu": 1144.45
10780
+ },
10781
+ {
10782
+ "epoch": 0.96,
10783
+ "grad_norm": 0.7553421854972839,
10784
+ "learning_rate": 8.803507621270579e-07,
10785
+ "loss": 2.4495,
10786
+ "memory/device_reserved (GiB)": 17.79,
10787
+ "memory/max_active (GiB)": 17.43,
10788
+ "memory/max_allocated (GiB)": 17.43,
10789
+ "step": 960,
10790
+ "tokens_per_second_per_gpu": 1184.03
10791
+ },
10792
+ {
10793
+ "epoch": 0.961,
10794
+ "grad_norm": 0.8734245896339417,
10795
+ "learning_rate": 8.379898773574924e-07,
10796
+ "loss": 2.4696,
10797
+ "memory/device_reserved (GiB)": 17.79,
10798
+ "memory/max_active (GiB)": 17.43,
10799
+ "memory/max_allocated (GiB)": 17.43,
10800
+ "step": 961,
10801
+ "tokens_per_second_per_gpu": 985.05
10802
+ },
10803
+ {
10804
+ "epoch": 0.962,
10805
+ "grad_norm": 0.9553205370903015,
10806
+ "learning_rate": 7.966691551514527e-07,
10807
+ "loss": 2.5366,
10808
+ "memory/device_reserved (GiB)": 17.79,
10809
+ "memory/max_active (GiB)": 17.43,
10810
+ "memory/max_allocated (GiB)": 17.43,
10811
+ "step": 962,
10812
+ "tokens_per_second_per_gpu": 789.81
10813
+ },
10814
+ {
10815
+ "epoch": 0.963,
10816
+ "grad_norm": 1.0424203872680664,
10817
+ "learning_rate": 7.563890289437825e-07,
10818
+ "loss": 2.3128,
10819
+ "memory/device_reserved (GiB)": 17.79,
10820
+ "memory/max_active (GiB)": 17.43,
10821
+ "memory/max_allocated (GiB)": 17.43,
10822
+ "step": 963,
10823
+ "tokens_per_second_per_gpu": 642.35
10824
+ },
10825
+ {
10826
+ "epoch": 0.964,
10827
+ "grad_norm": 0.9237273931503296,
10828
+ "learning_rate": 7.171499212539123e-07,
10829
+ "loss": 2.3017,
10830
+ "memory/device_reserved (GiB)": 17.79,
10831
+ "memory/max_active (GiB)": 17.43,
10832
+ "memory/max_allocated (GiB)": 17.43,
10833
+ "step": 964,
10834
+ "tokens_per_second_per_gpu": 771.48
10835
+ },
10836
+ {
10837
+ "epoch": 0.965,
10838
+ "grad_norm": 0.821221649646759,
10839
+ "learning_rate": 6.78952243681541e-07,
10840
+ "loss": 2.3436,
10841
+ "memory/device_reserved (GiB)": 17.79,
10842
+ "memory/max_active (GiB)": 17.43,
10843
+ "memory/max_allocated (GiB)": 17.43,
10844
+ "step": 965,
10845
+ "tokens_per_second_per_gpu": 1111.51
10846
+ },
10847
+ {
10848
+ "epoch": 0.966,
10849
+ "grad_norm": 0.8634496331214905,
10850
+ "learning_rate": 6.41796396902239e-07,
10851
+ "loss": 2.0172,
10852
+ "memory/device_reserved (GiB)": 17.79,
10853
+ "memory/max_active (GiB)": 17.43,
10854
+ "memory/max_allocated (GiB)": 17.43,
10855
+ "step": 966,
10856
+ "tokens_per_second_per_gpu": 789.7
10857
+ },
10858
+ {
10859
+ "epoch": 0.967,
10860
+ "grad_norm": 0.8414502143859863,
10861
+ "learning_rate": 6.056827706632185e-07,
10862
+ "loss": 2.282,
10863
+ "memory/device_reserved (GiB)": 17.79,
10864
+ "memory/max_active (GiB)": 17.43,
10865
+ "memory/max_allocated (GiB)": 17.43,
10866
+ "step": 967,
10867
+ "tokens_per_second_per_gpu": 1027.47
10868
+ },
10869
+ {
10870
+ "epoch": 0.968,
10871
+ "grad_norm": 1.1495898962020874,
10872
+ "learning_rate": 5.706117437793701e-07,
10873
+ "loss": 2.3397,
10874
+ "memory/device_reserved (GiB)": 17.79,
10875
+ "memory/max_active (GiB)": 17.43,
10876
+ "memory/max_allocated (GiB)": 17.43,
10877
+ "step": 968,
10878
+ "tokens_per_second_per_gpu": 534.09
10879
+ },
10880
+ {
10881
+ "epoch": 0.969,
10882
+ "grad_norm": 0.8324930667877197,
10883
+ "learning_rate": 5.365836841291438e-07,
10884
+ "loss": 2.4827,
10885
+ "memory/device_reserved (GiB)": 17.79,
10886
+ "memory/max_active (GiB)": 17.43,
10887
+ "memory/max_allocated (GiB)": 17.43,
10888
+ "step": 969,
10889
+ "tokens_per_second_per_gpu": 1116.92
10890
+ },
10891
+ {
10892
+ "epoch": 0.97,
10893
+ "grad_norm": 1.0874335765838623,
10894
+ "learning_rate": 5.035989486508075e-07,
10895
+ "loss": 2.3351,
10896
+ "memory/device_reserved (GiB)": 17.79,
10897
+ "memory/max_active (GiB)": 17.43,
10898
+ "memory/max_allocated (GiB)": 17.43,
10899
+ "step": 970,
10900
+ "tokens_per_second_per_gpu": 579.44
10901
+ },
10902
+ {
10903
+ "epoch": 0.971,
10904
+ "grad_norm": 0.791401743888855,
10905
+ "learning_rate": 4.7165788333860536e-07,
10906
+ "loss": 2.2922,
10907
+ "memory/device_reserved (GiB)": 17.79,
10908
+ "memory/max_active (GiB)": 17.43,
10909
+ "memory/max_allocated (GiB)": 17.43,
10910
+ "step": 971,
10911
+ "tokens_per_second_per_gpu": 1109.94
10912
+ },
10913
+ {
10914
+ "epoch": 0.972,
10915
+ "grad_norm": 0.8731902241706848,
10916
+ "learning_rate": 4.4076082323920576e-07,
10917
+ "loss": 2.4829,
10918
+ "memory/device_reserved (GiB)": 17.79,
10919
+ "memory/max_active (GiB)": 17.43,
10920
+ "memory/max_allocated (GiB)": 17.43,
10921
+ "step": 972,
10922
+ "tokens_per_second_per_gpu": 877.97
10923
+ },
10924
+ {
10925
+ "epoch": 0.973,
10926
+ "grad_norm": 0.8560281991958618,
10927
+ "learning_rate": 4.1090809244814785e-07,
10928
+ "loss": 2.2317,
10929
+ "memory/device_reserved (GiB)": 17.79,
10930
+ "memory/max_active (GiB)": 17.43,
10931
+ "memory/max_allocated (GiB)": 17.43,
10932
+ "step": 973,
10933
+ "tokens_per_second_per_gpu": 932.27
10934
+ },
10935
+ {
10936
+ "epoch": 0.974,
10937
+ "grad_norm": 0.9274902939796448,
10938
+ "learning_rate": 3.82100004106456e-07,
10939
+ "loss": 2.5177,
10940
+ "memory/device_reserved (GiB)": 17.79,
10941
+ "memory/max_active (GiB)": 17.43,
10942
+ "memory/max_allocated (GiB)": 17.43,
10943
+ "step": 974,
10944
+ "tokens_per_second_per_gpu": 967.07
10945
+ },
10946
+ {
10947
+ "epoch": 0.975,
10948
+ "grad_norm": 0.9513389468193054,
10949
+ "learning_rate": 3.543368603973529e-07,
10950
+ "loss": 2.3584,
10951
+ "memory/device_reserved (GiB)": 17.79,
10952
+ "memory/max_active (GiB)": 17.43,
10953
+ "memory/max_allocated (GiB)": 17.43,
10954
+ "step": 975,
10955
+ "tokens_per_second_per_gpu": 776.96
10956
+ },
10957
+ {
10958
+ "epoch": 0.976,
10959
+ "grad_norm": 0.8030345439910889,
10960
+ "learning_rate": 3.2761895254306287e-07,
10961
+ "loss": 2.1989,
10962
+ "memory/device_reserved (GiB)": 17.79,
10963
+ "memory/max_active (GiB)": 17.43,
10964
+ "memory/max_allocated (GiB)": 17.43,
10965
+ "step": 976,
10966
+ "tokens_per_second_per_gpu": 1064.08
10967
+ },
10968
+ {
10969
+ "epoch": 0.977,
10970
+ "grad_norm": 0.8223397135734558,
10971
+ "learning_rate": 3.019465608018024e-07,
10972
+ "loss": 2.3525,
10973
+ "memory/device_reserved (GiB)": 17.79,
10974
+ "memory/max_active (GiB)": 17.43,
10975
+ "memory/max_allocated (GiB)": 17.43,
10976
+ "step": 977,
10977
+ "tokens_per_second_per_gpu": 1110.89
10978
+ },
10979
+ {
10980
+ "epoch": 0.978,
10981
+ "grad_norm": 1.0492770671844482,
10982
+ "learning_rate": 2.773199544648164e-07,
10983
+ "loss": 2.336,
10984
+ "memory/device_reserved (GiB)": 17.79,
10985
+ "memory/max_active (GiB)": 17.43,
10986
+ "memory/max_allocated (GiB)": 17.43,
10987
+ "step": 978,
10988
+ "tokens_per_second_per_gpu": 779.71
10989
+ },
10990
+ {
10991
+ "epoch": 0.979,
10992
+ "grad_norm": 0.897686779499054,
10993
+ "learning_rate": 2.537393918535358e-07,
10994
+ "loss": 2.357,
10995
+ "memory/device_reserved (GiB)": 17.79,
10996
+ "memory/max_active (GiB)": 17.43,
10997
+ "memory/max_allocated (GiB)": 17.43,
10998
+ "step": 979,
10999
+ "tokens_per_second_per_gpu": 980.21
11000
+ },
11001
+ {
11002
+ "epoch": 0.98,
11003
+ "grad_norm": 0.8448941707611084,
11004
+ "learning_rate": 2.312051203169352e-07,
11005
+ "loss": 2.176,
11006
+ "memory/device_reserved (GiB)": 17.79,
11007
+ "memory/max_active (GiB)": 17.43,
11008
+ "memory/max_allocated (GiB)": 17.43,
11009
+ "step": 980,
11010
+ "tokens_per_second_per_gpu": 1001.24
11011
+ },
11012
+ {
11013
+ "epoch": 0.981,
11014
+ "grad_norm": 0.9005848169326782,
11015
+ "learning_rate": 2.0971737622883515e-07,
11016
+ "loss": 2.2181,
11017
+ "memory/device_reserved (GiB)": 17.79,
11018
+ "memory/max_active (GiB)": 17.43,
11019
+ "memory/max_allocated (GiB)": 17.43,
11020
+ "step": 981,
11021
+ "tokens_per_second_per_gpu": 900.62
11022
+ },
11023
+ {
11024
+ "epoch": 0.982,
11025
+ "grad_norm": 0.8972439169883728,
11026
+ "learning_rate": 1.8927638498551502e-07,
11027
+ "loss": 2.2886,
11028
+ "memory/device_reserved (GiB)": 17.79,
11029
+ "memory/max_active (GiB)": 17.43,
11030
+ "memory/max_allocated (GiB)": 17.43,
11031
+ "step": 982,
11032
+ "tokens_per_second_per_gpu": 875.01
11033
+ },
11034
+ {
11035
+ "epoch": 0.983,
11036
+ "grad_norm": 0.8892665505409241,
11037
+ "learning_rate": 1.6988236100329292e-07,
11038
+ "loss": 2.2567,
11039
+ "memory/device_reserved (GiB)": 17.79,
11040
+ "memory/max_active (GiB)": 17.43,
11041
+ "memory/max_allocated (GiB)": 17.43,
11042
+ "step": 983,
11043
+ "tokens_per_second_per_gpu": 844.42
11044
+ },
11045
+ {
11046
+ "epoch": 0.984,
11047
+ "grad_norm": 0.915696918964386,
11048
+ "learning_rate": 1.5153550771630498e-07,
11049
+ "loss": 2.3351,
11050
+ "memory/device_reserved (GiB)": 17.79,
11051
+ "memory/max_active (GiB)": 17.43,
11052
+ "memory/max_allocated (GiB)": 17.43,
11053
+ "step": 984,
11054
+ "tokens_per_second_per_gpu": 815.64
11055
+ },
11056
+ {
11057
+ "epoch": 0.985,
11058
+ "grad_norm": 0.9981441497802734,
11059
+ "learning_rate": 1.3423601757436287e-07,
11060
+ "loss": 2.2343,
11061
+ "memory/device_reserved (GiB)": 17.79,
11062
+ "memory/max_active (GiB)": 17.43,
11063
+ "memory/max_allocated (GiB)": 17.43,
11064
+ "step": 985,
11065
+ "tokens_per_second_per_gpu": 702.43
11066
+ },
11067
+ {
11068
+ "epoch": 0.986,
11069
+ "grad_norm": 0.8215169906616211,
11070
+ "learning_rate": 1.179840720409331e-07,
11071
+ "loss": 2.192,
11072
+ "memory/device_reserved (GiB)": 17.79,
11073
+ "memory/max_active (GiB)": 17.43,
11074
+ "memory/max_allocated (GiB)": 17.43,
11075
+ "step": 986,
11076
+ "tokens_per_second_per_gpu": 1012.4
11077
+ },
11078
+ {
11079
+ "epoch": 0.987,
11080
+ "grad_norm": 1.0433471202850342,
11081
+ "learning_rate": 1.0277984159122733e-07,
11082
+ "loss": 2.3544,
11083
+ "memory/device_reserved (GiB)": 17.79,
11084
+ "memory/max_active (GiB)": 17.43,
11085
+ "memory/max_allocated (GiB)": 17.43,
11086
+ "step": 987,
11087
+ "tokens_per_second_per_gpu": 829.67
11088
+ },
11089
+ {
11090
+ "epoch": 0.988,
11091
+ "grad_norm": 0.8312088847160339,
11092
+ "learning_rate": 8.862348571043733e-08,
11093
+ "loss": 2.3737,
11094
+ "memory/device_reserved (GiB)": 17.79,
11095
+ "memory/max_active (GiB)": 17.43,
11096
+ "memory/max_allocated (GiB)": 17.43,
11097
+ "step": 988,
11098
+ "tokens_per_second_per_gpu": 1123.09
11099
+ },
11100
+ {
11101
+ "epoch": 0.989,
11102
+ "grad_norm": 1.0085126161575317,
11103
+ "learning_rate": 7.551515289203615e-08,
11104
+ "loss": 2.0985,
11105
+ "memory/device_reserved (GiB)": 17.79,
11106
+ "memory/max_active (GiB)": 17.43,
11107
+ "memory/max_allocated (GiB)": 17.43,
11108
+ "step": 989,
11109
+ "tokens_per_second_per_gpu": 642.57
11110
+ },
11111
+ {
11112
+ "epoch": 0.99,
11113
+ "grad_norm": 0.9324679970741272,
11114
+ "learning_rate": 6.34549806362239e-08,
11115
+ "loss": 2.5521,
11116
+ "memory/device_reserved (GiB)": 17.79,
11117
+ "memory/max_active (GiB)": 17.43,
11118
+ "memory/max_allocated (GiB)": 17.43,
11119
+ "step": 990,
11120
+ "tokens_per_second_per_gpu": 979.2
11121
+ },
11122
+ {
11123
+ "epoch": 0.991,
11124
+ "grad_norm": 0.8679972290992737,
11125
+ "learning_rate": 5.2443095448506674e-08,
11126
+ "loss": 2.2688,
11127
+ "memory/device_reserved (GiB)": 17.79,
11128
+ "memory/max_active (GiB)": 17.43,
11129
+ "memory/max_allocated (GiB)": 17.43,
11130
+ "step": 991,
11131
+ "tokens_per_second_per_gpu": 857.33
11132
+ },
11133
+ {
11134
+ "epoch": 0.992,
11135
+ "grad_norm": 0.8510658740997314,
11136
+ "learning_rate": 4.247961283835311e-08,
11137
+ "loss": 2.2254,
11138
+ "memory/device_reserved (GiB)": 17.79,
11139
+ "memory/max_active (GiB)": 17.43,
11140
+ "memory/max_allocated (GiB)": 17.43,
11141
+ "step": 992,
11142
+ "tokens_per_second_per_gpu": 952.28
11143
+ },
11144
+ {
11145
+ "epoch": 0.993,
11146
+ "grad_norm": 0.8851034641265869,
11147
+ "learning_rate": 3.356463731798432e-08,
11148
+ "loss": 2.3777,
11149
+ "memory/device_reserved (GiB)": 17.79,
11150
+ "memory/max_active (GiB)": 17.43,
11151
+ "memory/max_allocated (GiB)": 17.43,
11152
+ "step": 993,
11153
+ "tokens_per_second_per_gpu": 930.35
11154
+ },
11155
+ {
11156
+ "epoch": 0.994,
11157
+ "grad_norm": 0.847767174243927,
11158
+ "learning_rate": 2.5698262401263605e-08,
11159
+ "loss": 2.501,
11160
+ "memory/device_reserved (GiB)": 17.79,
11161
+ "memory/max_active (GiB)": 17.43,
11162
+ "memory/max_allocated (GiB)": 17.43,
11163
+ "step": 994,
11164
+ "tokens_per_second_per_gpu": 1058.39
11165
+ },
11166
+ {
11167
+ "epoch": 0.995,
11168
+ "grad_norm": 0.8249082565307617,
11169
+ "learning_rate": 1.888057060274173e-08,
11170
+ "loss": 2.0256,
11171
+ "memory/device_reserved (GiB)": 17.79,
11172
+ "memory/max_active (GiB)": 17.43,
11173
+ "memory/max_allocated (GiB)": 17.43,
11174
+ "step": 995,
11175
+ "tokens_per_second_per_gpu": 914.1
11176
+ },
11177
+ {
11178
+ "epoch": 0.996,
11179
+ "grad_norm": 1.0485869646072388,
11180
+ "learning_rate": 1.3111633436779791e-08,
11181
+ "loss": 2.3413,
11182
+ "memory/device_reserved (GiB)": 17.79,
11183
+ "memory/max_active (GiB)": 17.43,
11184
+ "memory/max_allocated (GiB)": 17.43,
11185
+ "step": 996,
11186
+ "tokens_per_second_per_gpu": 546.19
11187
+ },
11188
+ {
11189
+ "epoch": 0.997,
11190
+ "grad_norm": 0.923591136932373,
11191
+ "learning_rate": 8.391511416816489e-09,
11192
+ "loss": 2.4325,
11193
+ "memory/device_reserved (GiB)": 17.79,
11194
+ "memory/max_active (GiB)": 17.43,
11195
+ "memory/max_allocated (GiB)": 17.43,
11196
+ "step": 997,
11197
+ "tokens_per_second_per_gpu": 844.28
11198
+ },
11199
+ {
11200
+ "epoch": 0.998,
11201
+ "grad_norm": 1.0989410877227783,
11202
+ "learning_rate": 4.720254054679796e-09,
11203
+ "loss": 2.2315,
11204
+ "memory/device_reserved (GiB)": 17.79,
11205
+ "memory/max_active (GiB)": 17.43,
11206
+ "memory/max_allocated (GiB)": 17.43,
11207
+ "step": 998,
11208
+ "tokens_per_second_per_gpu": 518.76
11209
+ },
11210
+ {
11211
+ "epoch": 0.999,
11212
+ "grad_norm": 1.190458059310913,
11213
+ "learning_rate": 2.0978998601206556e-09,
11214
+ "loss": 2.5925,
11215
+ "memory/device_reserved (GiB)": 17.79,
11216
+ "memory/max_active (GiB)": 17.43,
11217
+ "memory/max_allocated (GiB)": 17.43,
11218
+ "step": 999,
11219
+ "tokens_per_second_per_gpu": 618.02
11220
+ },
11221
+ {
11222
+ "epoch": 1.0,
11223
+ "grad_norm": 0.9831822514533997,
11224
+ "learning_rate": 5.244763404133046e-10,
11225
+ "loss": 2.2586,
11226
+ "memory/device_reserved (GiB)": 17.79,
11227
+ "memory/max_active (GiB)": 17.43,
11228
+ "memory/max_allocated (GiB)": 17.43,
11229
+ "step": 1000,
11230
+ "tokens_per_second_per_gpu": 740.06
11231
+ },
11232
+ {
11233
+ "epoch": 1.0,
11234
+ "eval_loss": 2.245497226715088,
11235
+ "eval_runtime": 67.8857,
11236
+ "eval_samples_per_second": 2.887,
11237
+ "eval_steps_per_second": 1.444,
11238
+ "memory/device_reserved (GiB)": 17.79,
11239
+ "memory/max_active (GiB)": 7.78,
11240
+ "memory/max_allocated (GiB)": 7.78,
11241
+ "step": 1000
11242
  }
11243
  ],
11244
  "logging_steps": 1,
 
11253
  "should_evaluate": false,
11254
  "should_log": false,
11255
  "should_save": true,
11256
+ "should_training_stop": true
11257
  },
11258
  "attributes": {}
11259
  }
11260
  },
11261
+ "total_flos": 2.0713221703021363e+17,
11262
  "train_batch_size": 1,
11263
  "trial_name": null,
11264
  "trial_params": null