AiAF commited on
Commit
4672c04
·
verified ·
1 Parent(s): 7e5d2d6

Training in progress, step 800, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f145aad3e393aacb1ea6687fe5c794bd1505c6b68c50e5038c6eac34efa7e4d6
3
  size 102264160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd804fe5a6a07ca92c0d9df3ee8901a99a952af466c85b5d67804f3b9b5754fc
3
  size 102264160
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:140bdab4eebed8c5ba2417db0ed65f56201fa6307a32fb787ad292b97ae34b13
3
  size 52162827
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc0bed6cff1a4618fb4cd1381e691366f8ad28f8182c56da1f0df2fb19366078
3
  size 52162827
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4295d68f9590a1ee84490e5a76cd2d12d84f3c4e7c7542a7915be508cf875fe0
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f05bb1ddd76152fd645931407e88adee7bc96ff7799e0d5b2faef63c077f8ed
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6af5f150dbd15fa79794ceabe67cfe7018c07d61742eb73c3c6b041388c26d7c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c0f6da37afd2d18fa5e85c27927c29b3e2c21ee39c49983ca41ec400e0b2cd5
3
  size 1465
last-checkpoint/tokens_state.json CHANGED
@@ -1 +1 @@
1
- {"total": 10467328, "trainable": 4329291}
 
1
+ {"total": 11163776, "trainable": 4620168}
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.36580904767711253,
6
  "eval_steps": 50,
7
- "global_step": 750,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -10700,6 +10700,718 @@
10700
  "memory/max_active (GiB)": 11.76,
10701
  "memory/max_allocated (GiB)": 11.76,
10702
  "step": 750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10703
  }
10704
  ],
10705
  "logging_steps": 1,
@@ -10719,7 +11431,7 @@
10719
  "attributes": {}
10720
  }
10721
  },
10722
- "total_flos": 1.287529657836503e+17,
10723
  "train_batch_size": 2,
10724
  "trial_name": null,
10725
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.39019631752225337,
6
  "eval_steps": 50,
7
+ "global_step": 800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
10700
  "memory/max_active (GiB)": 11.76,
10701
  "memory/max_allocated (GiB)": 11.76,
10702
  "step": 750
10703
+ },
10704
+ {
10705
+ "epoch": 0.36629679307401536,
10706
+ "grad_norm": 0.13251733779907227,
10707
+ "learning_rate": 3.102762227218957e-05,
10708
+ "loss": 2.4578309059143066,
10709
+ "memory/device_reserved (GiB)": 29.55,
10710
+ "memory/max_active (GiB)": 16.51,
10711
+ "memory/max_allocated (GiB)": 16.51,
10712
+ "ppl": 11.67945,
10713
+ "step": 751,
10714
+ "tokens/total": 10483072,
10715
+ "tokens/train_per_sec_per_gpu": 3532.64,
10716
+ "tokens/trainable": 4337168
10717
+ },
10718
+ {
10719
+ "epoch": 0.3667845384709182,
10720
+ "grad_norm": 0.17637301981449127,
10721
+ "learning_rate": 3.079347503220351e-05,
10722
+ "loss": 2.6546099185943604,
10723
+ "memory/device_reserved (GiB)": 29.55,
10724
+ "memory/max_active (GiB)": 15.63,
10725
+ "memory/max_allocated (GiB)": 15.63,
10726
+ "ppl": 14.21944,
10727
+ "step": 752,
10728
+ "tokens/total": 10496896,
10729
+ "tokens/train_per_sec_per_gpu": 1129.56,
10730
+ "tokens/trainable": 4342196
10731
+ },
10732
+ {
10733
+ "epoch": 0.367272283867821,
10734
+ "grad_norm": 0.13249512016773224,
10735
+ "learning_rate": 3.056005373591637e-05,
10736
+ "loss": 2.4976649284362793,
10737
+ "memory/device_reserved (GiB)": 29.55,
10738
+ "memory/max_active (GiB)": 16.42,
10739
+ "memory/max_allocated (GiB)": 16.42,
10740
+ "ppl": 12.15408,
10741
+ "step": 753,
10742
+ "tokens/total": 10511232,
10743
+ "tokens/train_per_sec_per_gpu": 2885.15,
10744
+ "tokens/trainable": 4350017
10745
+ },
10746
+ {
10747
+ "epoch": 0.3677600292647238,
10748
+ "grad_norm": 0.1888270080089569,
10749
+ "learning_rate": 3.032736083180716e-05,
10750
+ "loss": 2.5618886947631836,
10751
+ "memory/device_reserved (GiB)": 29.55,
10752
+ "memory/max_active (GiB)": 13.85,
10753
+ "memory/max_allocated (GiB)": 13.85,
10754
+ "ppl": 12.96027,
10755
+ "step": 754,
10756
+ "tokens/total": 10523136,
10757
+ "tokens/train_per_sec_per_gpu": 1028.47,
10758
+ "tokens/trainable": 4353662
10759
+ },
10760
+ {
10761
+ "epoch": 0.3682477746616266,
10762
+ "grad_norm": 0.17043054103851318,
10763
+ "learning_rate": 3.0095398760714267e-05,
10764
+ "loss": 2.4277312755584717,
10765
+ "memory/device_reserved (GiB)": 29.55,
10766
+ "memory/max_active (GiB)": 16.42,
10767
+ "memory/max_allocated (GiB)": 16.42,
10768
+ "ppl": 11.33314,
10769
+ "step": 755,
10770
+ "tokens/total": 10535040,
10771
+ "tokens/train_per_sec_per_gpu": 949.66,
10772
+ "tokens/trainable": 4358548
10773
+ },
10774
+ {
10775
+ "epoch": 0.36873552005852944,
10776
+ "grad_norm": 0.1492493599653244,
10777
+ "learning_rate": 2.9864169955810084e-05,
10778
+ "loss": 2.565107583999634,
10779
+ "memory/device_reserved (GiB)": 29.55,
10780
+ "memory/max_active (GiB)": 16.51,
10781
+ "memory/max_allocated (GiB)": 16.51,
10782
+ "ppl": 13.00206,
10783
+ "step": 756,
10784
+ "tokens/total": 10549888,
10785
+ "tokens/train_per_sec_per_gpu": 2031.11,
10786
+ "tokens/trainable": 4364785
10787
+ },
10788
+ {
10789
+ "epoch": 0.36922326545543227,
10790
+ "grad_norm": 0.16900953650474548,
10791
+ "learning_rate": 2.9633676842575387e-05,
10792
+ "loss": 2.4396462440490723,
10793
+ "memory/device_reserved (GiB)": 29.55,
10794
+ "memory/max_active (GiB)": 16.07,
10795
+ "memory/max_allocated (GiB)": 16.07,
10796
+ "ppl": 11.46898,
10797
+ "step": 757,
10798
+ "tokens/total": 10563840,
10799
+ "tokens/train_per_sec_per_gpu": 1820.3,
10800
+ "tokens/trainable": 4369356
10801
+ },
10802
+ {
10803
+ "epoch": 0.3697110108523351,
10804
+ "grad_norm": 0.15214021503925323,
10805
+ "learning_rate": 2.940392183877382e-05,
10806
+ "loss": 2.6643388271331787,
10807
+ "memory/device_reserved (GiB)": 29.55,
10808
+ "memory/max_active (GiB)": 15.98,
10809
+ "memory/max_allocated (GiB)": 15.98,
10810
+ "ppl": 14.35845,
10811
+ "step": 758,
10812
+ "tokens/total": 10577536,
10813
+ "tokens/train_per_sec_per_gpu": 1402.8,
10814
+ "tokens/trainable": 4375453
10815
+ },
10816
+ {
10817
+ "epoch": 0.37019875624923787,
10818
+ "grad_norm": 0.15483756363391876,
10819
+ "learning_rate": 2.9174907354426696e-05,
10820
+ "loss": 2.4720706939697266,
10821
+ "memory/device_reserved (GiB)": 29.55,
10822
+ "memory/max_active (GiB)": 16.07,
10823
+ "memory/max_allocated (GiB)": 16.07,
10824
+ "ppl": 11.84695,
10825
+ "step": 759,
10826
+ "tokens/total": 10590848,
10827
+ "tokens/train_per_sec_per_gpu": 1571.9,
10828
+ "tokens/trainable": 4381149
10829
+ },
10830
+ {
10831
+ "epoch": 0.3706865016461407,
10832
+ "grad_norm": 0.14039497077465057,
10833
+ "learning_rate": 2.8946635791787545e-05,
10834
+ "loss": 2.5491788387298584,
10835
+ "memory/device_reserved (GiB)": 29.55,
10836
+ "memory/max_active (GiB)": 16.07,
10837
+ "memory/max_allocated (GiB)": 16.07,
10838
+ "ppl": 12.79659,
10839
+ "step": 760,
10840
+ "tokens/total": 10604800,
10841
+ "tokens/train_per_sec_per_gpu": 1587.23,
10842
+ "tokens/trainable": 4387980
10843
+ },
10844
+ {
10845
+ "epoch": 0.3711742470430435,
10846
+ "grad_norm": 0.12205954641103745,
10847
+ "learning_rate": 2.8719109545317103e-05,
10848
+ "loss": 2.476264476776123,
10849
+ "memory/device_reserved (GiB)": 29.55,
10850
+ "memory/max_active (GiB)": 16.07,
10851
+ "memory/max_allocated (GiB)": 16.07,
10852
+ "ppl": 11.89674,
10853
+ "step": 761,
10854
+ "tokens/total": 10620416,
10855
+ "tokens/train_per_sec_per_gpu": 2379.38,
10856
+ "tokens/trainable": 4397120
10857
+ },
10858
+ {
10859
+ "epoch": 0.37166199243994635,
10860
+ "grad_norm": 0.15171197056770325,
10861
+ "learning_rate": 2.8492331001657945e-05,
10862
+ "loss": 2.5069305896759033,
10863
+ "memory/device_reserved (GiB)": 29.55,
10864
+ "memory/max_active (GiB)": 15.54,
10865
+ "memory/max_allocated (GiB)": 15.54,
10866
+ "ppl": 12.26722,
10867
+ "step": 762,
10868
+ "tokens/total": 10633216,
10869
+ "tokens/train_per_sec_per_gpu": 710.79,
10870
+ "tokens/trainable": 4402732
10871
+ },
10872
+ {
10873
+ "epoch": 0.3721497378368492,
10874
+ "grad_norm": 0.13738340139389038,
10875
+ "learning_rate": 2.8266302539609745e-05,
10876
+ "loss": 2.423926830291748,
10877
+ "memory/device_reserved (GiB)": 29.55,
10878
+ "memory/max_active (GiB)": 16.51,
10879
+ "memory/max_allocated (GiB)": 16.51,
10880
+ "ppl": 11.29011,
10881
+ "step": 763,
10882
+ "tokens/total": 10647808,
10883
+ "tokens/train_per_sec_per_gpu": 2262.63,
10884
+ "tokens/trainable": 4409476
10885
+ },
10886
+ {
10887
+ "epoch": 0.37263748323375195,
10888
+ "grad_norm": 0.16071482002735138,
10889
+ "learning_rate": 2.804102653010414e-05,
10890
+ "loss": 2.723536252975464,
10891
+ "memory/device_reserved (GiB)": 29.55,
10892
+ "memory/max_active (GiB)": 15.63,
10893
+ "memory/max_allocated (GiB)": 15.63,
10894
+ "ppl": 15.2341,
10895
+ "step": 764,
10896
+ "tokens/total": 10662656,
10897
+ "tokens/train_per_sec_per_gpu": 656.81,
10898
+ "tokens/trainable": 4414619
10899
+ },
10900
+ {
10901
+ "epoch": 0.3731252286306548,
10902
+ "grad_norm": 0.1205301433801651,
10903
+ "learning_rate": 2.7816505336179798e-05,
10904
+ "loss": 2.4741783142089844,
10905
+ "memory/device_reserved (GiB)": 29.55,
10906
+ "memory/max_active (GiB)": 16.07,
10907
+ "memory/max_allocated (GiB)": 16.07,
10908
+ "ppl": 11.87195,
10909
+ "step": 765,
10910
+ "tokens/total": 10676992,
10911
+ "tokens/train_per_sec_per_gpu": 2993.83,
10912
+ "tokens/trainable": 4423601
10913
+ },
10914
+ {
10915
+ "epoch": 0.3736129740275576,
10916
+ "grad_norm": 0.13879121840000153,
10917
+ "learning_rate": 2.759274131295787e-05,
10918
+ "loss": 2.4349002838134766,
10919
+ "memory/device_reserved (GiB)": 29.55,
10920
+ "memory/max_active (GiB)": 16.42,
10921
+ "memory/max_allocated (GiB)": 16.42,
10922
+ "ppl": 11.41468,
10923
+ "step": 766,
10924
+ "tokens/total": 10692096,
10925
+ "tokens/train_per_sec_per_gpu": 2115.95,
10926
+ "tokens/trainable": 4430839
10927
+ },
10928
+ {
10929
+ "epoch": 0.37410071942446044,
10930
+ "grad_norm": 0.1562461405992508,
10931
+ "learning_rate": 2.736973680761702e-05,
10932
+ "loss": 2.4425415992736816,
10933
+ "memory/device_reserved (GiB)": 29.55,
10934
+ "memory/max_active (GiB)": 15.18,
10935
+ "memory/max_allocated (GiB)": 15.18,
10936
+ "ppl": 11.50224,
10937
+ "step": 767,
10938
+ "tokens/total": 10704768,
10939
+ "tokens/train_per_sec_per_gpu": 2592.22,
10940
+ "tokens/trainable": 4435996
10941
+ },
10942
+ {
10943
+ "epoch": 0.37458846482136327,
10944
+ "grad_norm": 0.1498877853155136,
10945
+ "learning_rate": 2.7147494159369036e-05,
10946
+ "loss": 2.5003294944763184,
10947
+ "memory/device_reserved (GiB)": 29.55,
10948
+ "memory/max_active (GiB)": 16.07,
10949
+ "memory/max_allocated (GiB)": 16.07,
10950
+ "ppl": 12.18651,
10951
+ "step": 768,
10952
+ "tokens/total": 10718848,
10953
+ "tokens/train_per_sec_per_gpu": 1520.91,
10954
+ "tokens/trainable": 4441847
10955
+ },
10956
+ {
10957
+ "epoch": 0.3750762102182661,
10958
+ "grad_norm": 0.15580855309963226,
10959
+ "learning_rate": 2.6926015699434072e-05,
10960
+ "loss": 2.697448253631592,
10961
+ "memory/device_reserved (GiB)": 29.55,
10962
+ "memory/max_active (GiB)": 15.63,
10963
+ "memory/max_allocated (GiB)": 15.63,
10964
+ "ppl": 14.84181,
10965
+ "step": 769,
10966
+ "tokens/total": 10732416,
10967
+ "tokens/train_per_sec_per_gpu": 3266.58,
10968
+ "tokens/trainable": 4447624
10969
+ },
10970
+ {
10971
+ "epoch": 0.37556395561516887,
10972
+ "grad_norm": 0.14440256357192993,
10973
+ "learning_rate": 2.6705303751016408e-05,
10974
+ "loss": 2.406161308288574,
10975
+ "memory/device_reserved (GiB)": 29.55,
10976
+ "memory/max_active (GiB)": 16.42,
10977
+ "memory/max_allocated (GiB)": 16.42,
10978
+ "ppl": 11.0913,
10979
+ "step": 770,
10980
+ "tokens/total": 10747392,
10981
+ "tokens/train_per_sec_per_gpu": 3311.44,
10982
+ "tokens/trainable": 4453204
10983
+ },
10984
+ {
10985
+ "epoch": 0.3760517010120717,
10986
+ "grad_norm": 0.14817574620246887,
10987
+ "learning_rate": 2.6485360629279987e-05,
10988
+ "loss": 2.578953981399536,
10989
+ "memory/device_reserved (GiB)": 29.55,
10990
+ "memory/max_active (GiB)": 15.18,
10991
+ "memory/max_allocated (GiB)": 15.18,
10992
+ "ppl": 13.18334,
10993
+ "step": 771,
10994
+ "tokens/total": 10761856,
10995
+ "tokens/train_per_sec_per_gpu": 2836.99,
10996
+ "tokens/trainable": 4460156
10997
+ },
10998
+ {
10999
+ "epoch": 0.3765394464089745,
11000
+ "grad_norm": 0.182297021150589,
11001
+ "learning_rate": 2.6266188641323996e-05,
11002
+ "loss": 2.5378308296203613,
11003
+ "memory/device_reserved (GiB)": 29.55,
11004
+ "memory/max_active (GiB)": 16.42,
11005
+ "memory/max_allocated (GiB)": 16.42,
11006
+ "ppl": 12.6522,
11007
+ "step": 772,
11008
+ "tokens/total": 10775424,
11009
+ "tokens/train_per_sec_per_gpu": 1735.97,
11010
+ "tokens/trainable": 4464199
11011
+ },
11012
+ {
11013
+ "epoch": 0.37702719180587735,
11014
+ "grad_norm": 0.1523345559835434,
11015
+ "learning_rate": 2.6047790086158952e-05,
11016
+ "loss": 2.4858243465423584,
11017
+ "memory/device_reserved (GiB)": 29.55,
11018
+ "memory/max_active (GiB)": 15.54,
11019
+ "memory/max_allocated (GiB)": 15.54,
11020
+ "ppl": 12.01102,
11021
+ "step": 773,
11022
+ "tokens/total": 10789248,
11023
+ "tokens/train_per_sec_per_gpu": 1157.59,
11024
+ "tokens/trainable": 4469950
11025
+ },
11026
+ {
11027
+ "epoch": 0.3775149372027802,
11028
+ "grad_norm": 0.13964441418647766,
11029
+ "learning_rate": 2.5830167254682257e-05,
11030
+ "loss": 2.5482704639434814,
11031
+ "memory/device_reserved (GiB)": 29.55,
11032
+ "memory/max_active (GiB)": 15.09,
11033
+ "memory/max_allocated (GiB)": 15.09,
11034
+ "ppl": 12.78497,
11035
+ "step": 774,
11036
+ "tokens/total": 10802944,
11037
+ "tokens/train_per_sec_per_gpu": 1879.89,
11038
+ "tokens/trainable": 4476738
11039
+ },
11040
+ {
11041
+ "epoch": 0.37800268259968295,
11042
+ "grad_norm": 0.16566026210784912,
11043
+ "learning_rate": 2.5613322429654574e-05,
11044
+ "loss": 2.579946279525757,
11045
+ "memory/device_reserved (GiB)": 29.55,
11046
+ "memory/max_active (GiB)": 16.51,
11047
+ "memory/max_allocated (GiB)": 16.51,
11048
+ "ppl": 13.19643,
11049
+ "step": 775,
11050
+ "tokens/total": 10816128,
11051
+ "tokens/train_per_sec_per_gpu": 630.56,
11052
+ "tokens/trainable": 4481633
11053
+ },
11054
+ {
11055
+ "epoch": 0.3784904279965858,
11056
+ "grad_norm": 0.18692387640476227,
11057
+ "learning_rate": 2.5397257885675397e-05,
11058
+ "loss": 2.35819411277771,
11059
+ "memory/device_reserved (GiB)": 29.55,
11060
+ "memory/max_active (GiB)": 15.63,
11061
+ "memory/max_allocated (GiB)": 15.63,
11062
+ "ppl": 10.57184,
11063
+ "step": 776,
11064
+ "tokens/total": 10829312,
11065
+ "tokens/train_per_sec_per_gpu": 2158.15,
11066
+ "tokens/trainable": 4486282
11067
+ },
11068
+ {
11069
+ "epoch": 0.3789781733934886,
11070
+ "grad_norm": 0.13402055203914642,
11071
+ "learning_rate": 2.5181975889159615e-05,
11072
+ "loss": 2.6000072956085205,
11073
+ "memory/device_reserved (GiB)": 29.55,
11074
+ "memory/max_active (GiB)": 15.63,
11075
+ "memory/max_allocated (GiB)": 15.63,
11076
+ "ppl": 13.46384,
11077
+ "step": 777,
11078
+ "tokens/total": 10843520,
11079
+ "tokens/train_per_sec_per_gpu": 2489.19,
11080
+ "tokens/trainable": 4493796
11081
+ },
11082
+ {
11083
+ "epoch": 0.37946591879039143,
11084
+ "grad_norm": 0.1505974680185318,
11085
+ "learning_rate": 2.496747869831345e-05,
11086
+ "loss": 2.4257397651672363,
11087
+ "memory/device_reserved (GiB)": 29.55,
11088
+ "memory/max_active (GiB)": 16.07,
11089
+ "memory/max_allocated (GiB)": 16.07,
11090
+ "ppl": 11.31059,
11091
+ "step": 778,
11092
+ "tokens/total": 10857984,
11093
+ "tokens/train_per_sec_per_gpu": 2561.66,
11094
+ "tokens/trainable": 4499804
11095
+ },
11096
+ {
11097
+ "epoch": 0.37995366418729426,
11098
+ "grad_norm": 0.13848432898521423,
11099
+ "learning_rate": 2.475376856311097e-05,
11100
+ "loss": 2.3233590126037598,
11101
+ "memory/device_reserved (GiB)": 29.55,
11102
+ "memory/max_active (GiB)": 16.51,
11103
+ "memory/max_allocated (GiB)": 16.51,
11104
+ "ppl": 10.20991,
11105
+ "step": 779,
11106
+ "tokens/total": 10872960,
11107
+ "tokens/train_per_sec_per_gpu": 2813.77,
11108
+ "tokens/trainable": 4506550
11109
+ },
11110
+ {
11111
+ "epoch": 0.38044140958419703,
11112
+ "grad_norm": 0.1617778092622757,
11113
+ "learning_rate": 2.4540847725270378e-05,
11114
+ "loss": 2.4378297328948975,
11115
+ "memory/device_reserved (GiB)": 29.55,
11116
+ "memory/max_active (GiB)": 16.42,
11117
+ "memory/max_allocated (GiB)": 16.42,
11118
+ "ppl": 11.44817,
11119
+ "step": 780,
11120
+ "tokens/total": 10887168,
11121
+ "tokens/train_per_sec_per_gpu": 309.72,
11122
+ "tokens/trainable": 4511183
11123
+ },
11124
+ {
11125
+ "epoch": 0.38092915498109986,
11126
+ "grad_norm": 0.1438380777835846,
11127
+ "learning_rate": 2.432871841823047e-05,
11128
+ "loss": 2.430607557296753,
11129
+ "memory/device_reserved (GiB)": 29.55,
11130
+ "memory/max_active (GiB)": 15.54,
11131
+ "memory/max_allocated (GiB)": 15.54,
11132
+ "ppl": 11.36579,
11133
+ "step": 781,
11134
+ "tokens/total": 10900608,
11135
+ "tokens/train_per_sec_per_gpu": 1996.21,
11136
+ "tokens/trainable": 4517458
11137
+ },
11138
+ {
11139
+ "epoch": 0.3814169003780027,
11140
+ "grad_norm": 0.14792795479297638,
11141
+ "learning_rate": 2.411738286712735e-05,
11142
+ "loss": 2.4632468223571777,
11143
+ "memory/device_reserved (GiB)": 29.55,
11144
+ "memory/max_active (GiB)": 15.18,
11145
+ "memory/max_allocated (GiB)": 15.18,
11146
+ "ppl": 11.74288,
11147
+ "step": 782,
11148
+ "tokens/total": 10913664,
11149
+ "tokens/train_per_sec_per_gpu": 2785.12,
11150
+ "tokens/trainable": 4524146
11151
+ },
11152
+ {
11153
+ "epoch": 0.3819046457749055,
11154
+ "grad_norm": 0.16730709373950958,
11155
+ "learning_rate": 2.3906843288770886e-05,
11156
+ "loss": 2.556550979614258,
11157
+ "memory/device_reserved (GiB)": 29.55,
11158
+ "memory/max_active (GiB)": 16.07,
11159
+ "memory/max_allocated (GiB)": 16.07,
11160
+ "ppl": 12.89128,
11161
+ "step": 783,
11162
+ "tokens/total": 10928128,
11163
+ "tokens/train_per_sec_per_gpu": 672.37,
11164
+ "tokens/trainable": 4528859
11165
+ },
11166
+ {
11167
+ "epoch": 0.38239239117180834,
11168
+ "grad_norm": 0.1668711155653,
11169
+ "learning_rate": 2.3697101891621697e-05,
11170
+ "loss": 2.1584508419036865,
11171
+ "memory/device_reserved (GiB)": 29.55,
11172
+ "memory/max_active (GiB)": 16.51,
11173
+ "memory/max_allocated (GiB)": 16.51,
11174
+ "ppl": 8.65772,
11175
+ "step": 784,
11176
+ "tokens/total": 10941824,
11177
+ "tokens/train_per_sec_per_gpu": 134.14,
11178
+ "tokens/trainable": 4533160
11179
+ },
11180
+ {
11181
+ "epoch": 0.3828801365687111,
11182
+ "grad_norm": 0.1528262495994568,
11183
+ "learning_rate": 2.3488160875767717e-05,
11184
+ "loss": 2.454880714416504,
11185
+ "memory/device_reserved (GiB)": 29.55,
11186
+ "memory/max_active (GiB)": 16.07,
11187
+ "memory/max_allocated (GiB)": 16.07,
11188
+ "ppl": 11.64504,
11189
+ "step": 785,
11190
+ "tokens/total": 10956288,
11191
+ "tokens/train_per_sec_per_gpu": 1773.05,
11192
+ "tokens/trainable": 4538778
11193
+ },
11194
+ {
11195
+ "epoch": 0.38336788196561394,
11196
+ "grad_norm": 0.1478903591632843,
11197
+ "learning_rate": 2.3280022432901383e-05,
11198
+ "loss": 2.1368329524993896,
11199
+ "memory/device_reserved (GiB)": 29.55,
11200
+ "memory/max_active (GiB)": 15.18,
11201
+ "memory/max_allocated (GiB)": 15.18,
11202
+ "ppl": 8.47256,
11203
+ "step": 786,
11204
+ "tokens/total": 10968960,
11205
+ "tokens/train_per_sec_per_gpu": 394.55,
11206
+ "tokens/trainable": 4544577
11207
+ },
11208
+ {
11209
+ "epoch": 0.38385562736251677,
11210
+ "grad_norm": 0.17052386701107025,
11211
+ "learning_rate": 2.307268874629649e-05,
11212
+ "loss": 2.4743740558624268,
11213
+ "memory/device_reserved (GiB)": 29.55,
11214
+ "memory/max_active (GiB)": 16.07,
11215
+ "memory/max_allocated (GiB)": 16.07,
11216
+ "ppl": 11.87427,
11217
+ "step": 787,
11218
+ "tokens/total": 10983936,
11219
+ "tokens/train_per_sec_per_gpu": 2616.52,
11220
+ "tokens/trainable": 4549516
11221
+ },
11222
+ {
11223
+ "epoch": 0.3843433727594196,
11224
+ "grad_norm": 0.1929779350757599,
11225
+ "learning_rate": 2.2866161990785228e-05,
11226
+ "loss": 2.54533314704895,
11227
+ "memory/device_reserved (GiB)": 29.55,
11228
+ "memory/max_active (GiB)": 16.07,
11229
+ "memory/max_allocated (GiB)": 16.07,
11230
+ "ppl": 12.74747,
11231
+ "step": 788,
11232
+ "tokens/total": 10997376,
11233
+ "tokens/train_per_sec_per_gpu": 1739.04,
11234
+ "tokens/trainable": 4553203
11235
+ },
11236
+ {
11237
+ "epoch": 0.3848311181563224,
11238
+ "grad_norm": 0.19457341730594635,
11239
+ "learning_rate": 2.266044433273562e-05,
11240
+ "loss": 2.3346762657165527,
11241
+ "memory/device_reserved (GiB)": 29.55,
11242
+ "memory/max_active (GiB)": 16.51,
11243
+ "memory/max_allocated (GiB)": 16.51,
11244
+ "ppl": 10.32612,
11245
+ "step": 789,
11246
+ "tokens/total": 11010688,
11247
+ "tokens/train_per_sec_per_gpu": 2363.73,
11248
+ "tokens/trainable": 4556841
11249
+ },
11250
+ {
11251
+ "epoch": 0.3853188635532252,
11252
+ "grad_norm": 0.12720970809459686,
11253
+ "learning_rate": 2.245553793002849e-05,
11254
+ "loss": 2.5888097286224365,
11255
+ "memory/device_reserved (GiB)": 29.55,
11256
+ "memory/max_active (GiB)": 16.51,
11257
+ "memory/max_allocated (GiB)": 16.51,
11258
+ "ppl": 13.31391,
11259
+ "step": 790,
11260
+ "tokens/total": 11026304,
11261
+ "tokens/train_per_sec_per_gpu": 1456.55,
11262
+ "tokens/trainable": 4566086
11263
+ },
11264
+ {
11265
+ "epoch": 0.385806608950128,
11266
+ "grad_norm": 0.15932175517082214,
11267
+ "learning_rate": 2.2251444932035094e-05,
11268
+ "loss": 2.7473325729370117,
11269
+ "memory/device_reserved (GiB)": 29.55,
11270
+ "memory/max_active (GiB)": 13.76,
11271
+ "memory/max_allocated (GiB)": 13.76,
11272
+ "ppl": 15.60096,
11273
+ "step": 791,
11274
+ "tokens/total": 11038592,
11275
+ "tokens/train_per_sec_per_gpu": 1133.74,
11276
+ "tokens/trainable": 4572237
11277
+ },
11278
+ {
11279
+ "epoch": 0.38629435434703085,
11280
+ "grad_norm": 0.15806850790977478,
11281
+ "learning_rate": 2.204816747959434e-05,
11282
+ "loss": 2.3397216796875,
11283
+ "memory/device_reserved (GiB)": 29.55,
11284
+ "memory/max_active (GiB)": 15.98,
11285
+ "memory/max_allocated (GiB)": 15.98,
11286
+ "ppl": 10.37835,
11287
+ "step": 792,
11288
+ "tokens/total": 11052800,
11289
+ "tokens/train_per_sec_per_gpu": 1664.54,
11290
+ "tokens/trainable": 4577892
11291
+ },
11292
+ {
11293
+ "epoch": 0.3867820997439337,
11294
+ "grad_norm": 0.16440050303936005,
11295
+ "learning_rate": 2.184570770499056e-05,
11296
+ "loss": 2.379885196685791,
11297
+ "memory/device_reserved (GiB)": 29.55,
11298
+ "memory/max_active (GiB)": 15.63,
11299
+ "memory/max_allocated (GiB)": 15.63,
11300
+ "ppl": 10.80366,
11301
+ "step": 793,
11302
+ "tokens/total": 11066112,
11303
+ "tokens/train_per_sec_per_gpu": 2098.95,
11304
+ "tokens/trainable": 4582681
11305
+ },
11306
+ {
11307
+ "epoch": 0.3872698451408365,
11308
+ "grad_norm": 0.14843714237213135,
11309
+ "learning_rate": 2.1644067731931007e-05,
11310
+ "loss": 2.3706493377685547,
11311
+ "memory/device_reserved (GiB)": 29.55,
11312
+ "memory/max_active (GiB)": 16.07,
11313
+ "memory/max_allocated (GiB)": 16.07,
11314
+ "ppl": 10.70434,
11315
+ "step": 794,
11316
+ "tokens/total": 11079552,
11317
+ "tokens/train_per_sec_per_gpu": 182.87,
11318
+ "tokens/trainable": 4588104
11319
+ },
11320
+ {
11321
+ "epoch": 0.3877575905377393,
11322
+ "grad_norm": 0.16309773921966553,
11323
+ "learning_rate": 2.1443249675523536e-05,
11324
+ "loss": 2.451366424560547,
11325
+ "memory/device_reserved (GiB)": 29.55,
11326
+ "memory/max_active (GiB)": 15.98,
11327
+ "memory/max_allocated (GiB)": 15.98,
11328
+ "ppl": 11.60419,
11329
+ "step": 795,
11330
+ "tokens/total": 11093632,
11331
+ "tokens/train_per_sec_per_gpu": 182.4,
11332
+ "tokens/trainable": 4593013
11333
+ },
11334
+ {
11335
+ "epoch": 0.3882453359346421,
11336
+ "grad_norm": 0.14842580258846283,
11337
+ "learning_rate": 2.1243255642254578e-05,
11338
+ "loss": 2.5915379524230957,
11339
+ "memory/device_reserved (GiB)": 29.55,
11340
+ "memory/max_active (GiB)": 15.63,
11341
+ "memory/max_allocated (GiB)": 15.63,
11342
+ "ppl": 13.35029,
11343
+ "step": 796,
11344
+ "tokens/total": 11107328,
11345
+ "tokens/train_per_sec_per_gpu": 113.69,
11346
+ "tokens/trainable": 4599134
11347
+ },
11348
+ {
11349
+ "epoch": 0.38873308133154494,
11350
+ "grad_norm": 0.14871464669704437,
11351
+ "learning_rate": 2.1044087729966856e-05,
11352
+ "loss": 2.5890448093414307,
11353
+ "memory/device_reserved (GiB)": 29.55,
11354
+ "memory/max_active (GiB)": 16.51,
11355
+ "memory/max_allocated (GiB)": 16.51,
11356
+ "ppl": 13.31705,
11357
+ "step": 797,
11358
+ "tokens/total": 11121792,
11359
+ "tokens/train_per_sec_per_gpu": 1460.67,
11360
+ "tokens/trainable": 4605136
11361
+ },
11362
+ {
11363
+ "epoch": 0.38922082672844777,
11364
+ "grad_norm": 0.18083694577217102,
11365
+ "learning_rate": 2.0845748027837586e-05,
11366
+ "loss": 2.543159008026123,
11367
+ "memory/device_reserved (GiB)": 29.55,
11368
+ "memory/max_active (GiB)": 15.18,
11369
+ "memory/max_allocated (GiB)": 15.18,
11370
+ "ppl": 12.71979,
11371
+ "step": 798,
11372
+ "tokens/total": 11135232,
11373
+ "tokens/train_per_sec_per_gpu": 1358.54,
11374
+ "tokens/trainable": 4609430
11375
+ },
11376
+ {
11377
+ "epoch": 0.3897085721253506,
11378
+ "grad_norm": 0.1697179079055786,
11379
+ "learning_rate": 2.0648238616356332e-05,
11380
+ "loss": 2.6967287063598633,
11381
+ "memory/device_reserved (GiB)": 29.55,
11382
+ "memory/max_active (GiB)": 16.07,
11383
+ "memory/max_allocated (GiB)": 16.07,
11384
+ "ppl": 14.83114,
11385
+ "step": 799,
11386
+ "tokens/total": 11150208,
11387
+ "tokens/train_per_sec_per_gpu": 1863.18,
11388
+ "tokens/trainable": 4614633
11389
+ },
11390
+ {
11391
+ "epoch": 0.39019631752225337,
11392
+ "grad_norm": 0.1563408523797989,
11393
+ "learning_rate": 2.045156156730338e-05,
11394
+ "loss": 2.489025354385376,
11395
+ "memory/device_reserved (GiB)": 29.55,
11396
+ "memory/max_active (GiB)": 14.74,
11397
+ "memory/max_allocated (GiB)": 14.74,
11398
+ "ppl": 12.04953,
11399
+ "step": 800,
11400
+ "tokens/total": 11163776,
11401
+ "tokens/train_per_sec_per_gpu": 2263.21,
11402
+ "tokens/trainable": 4620168
11403
+ },
11404
+ {
11405
+ "epoch": 0.39019631752225337,
11406
+ "eval_loss": 2.494191884994507,
11407
+ "eval_ppl": 12.11194,
11408
+ "eval_runtime": 6.1585,
11409
+ "eval_samples_per_second": 32.475,
11410
+ "eval_steps_per_second": 16.238,
11411
+ "memory/device_reserved (GiB)": 29.55,
11412
+ "memory/max_active (GiB)": 11.76,
11413
+ "memory/max_allocated (GiB)": 11.76,
11414
+ "step": 800
11415
  }
11416
  ],
11417
  "logging_steps": 1,
 
11431
  "attributes": {}
11432
  }
11433
  },
11434
+ "total_flos": 1.3731959764176077e+17,
11435
  "train_batch_size": 2,
11436
  "trial_name": null,
11437
  "trial_params": null