CocoRoF commited on
Commit
4bb3bfd
·
verified ·
1 Parent(s): 4500399

Training in progress, step 15000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be61347e986df1813f8514099ca7495d01a0aaff2cb1086c995e9f56e4864f44
3
  size 737632172
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a186d2cbe6d7d5ac4c2cb2dffcb32f2c152be1b999f1d2203cb01e12498cf45
3
  size 737632172
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f13242f121b2c14586fd1d56f596180d4a87cf3c3e6a8b06d2aa80aac67af52
3
  size 1475354682
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cd8d731853da7018e3f5c4a04692a24cce0335468b21c7f5e72c91a67c23f4c
3
  size 1475354682
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0169a1fcfdb795965595a4f242f88323799f2590a349006ef637e474b948bd8
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39d0a42a76c6856b42516358f397705cff8f5ae2210de23f6abc8fc7d370ce43
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c87b1e14f1bde64851a26e79d9e7529d68eb1e143f87cb05c0bdf4c84c676fd8
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c79d39762bf88c59ea58ab8c192f4d9721ab6eba78debc69a369654d4199af50
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0bea1a1e24ce9ba044268ec704e9b5435b962ac6a2de09e9847caeff5397ad96
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1426a2458db12639f98377335a1109abade08a448981fb41c315ef1f9fd4191e
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ec0b58b8e0950d68e3cbb11f67305b0912f5521574e32c474ae22410e7fadc8
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e598407075fbe89ea2094160f92052415ad3b0b80d125438201859b9875d537b
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:284664914b90ff8b8a0dc92bb3d3f63cfa784487322d7dc115bd6038f6758aca
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:437e8f540f343ec9a874078e652ba30d02c2d12e3039d8092e96942ade74967b
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0468917ff18a692d9dedf8d79fa5e11dd93feec07877799ca86aca3ed690d129
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23cfb7770b02ea62450b4853818ad587c09df566939a04e941091111cc9b7cf2
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:842bb0cfac2bb06e2e811dabf1e415d78f36184efada16b4098faa08d32c3580
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54fe581e399def26af9ab0920fdea64c37d1eed5ad5a4b3fec55e45525aba99f
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca857437aedb445e7bf3dd47069eecac538c7d7fd16d601188beef14a54e520e
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:491793f3baa6e0a6171458158bd6f4cce55a8696d0c0e279c19b74fbf532973f
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:123bdb81188a6d8339925f843c784a6596cb7ed0221abdafc8c5e0e110c82c27
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c420d6b7ad7b972ca48fc034e3641c21f7aede383a84979b4bca5295d5ea7ac1
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5547511663643273,
5
  "eval_steps": 1000,
6
- "global_step": 12500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -8853,6 +8853,1780 @@
8853
  "learning_rate": 9.978330047590683e-06,
8854
  "loss": 10.3868,
8855
  "step": 12500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8856
  }
8857
  ],
8858
  "logging_steps": 10,
@@ -8872,7 +10646,7 @@
8872
  "attributes": {}
8873
  }
8874
  },
8875
- "total_flos": 4.36215400628224e+18,
8876
  "train_batch_size": 4,
8877
  "trial_name": null,
8878
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.6657013996371928,
5
  "eval_steps": 1000,
6
+ "global_step": 15000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
8853
  "learning_rate": 9.978330047590683e-06,
8854
  "loss": 10.3868,
8855
  "step": 12500
8856
+ },
8857
+ {
8858
+ "epoch": 0.5551949672974188,
8859
+ "grad_norm": 81.77082061767578,
8860
+ "learning_rate": 9.978312711628756e-06,
8861
+ "loss": 10.7695,
8862
+ "step": 12510
8863
+ },
8864
+ {
8865
+ "epoch": 0.5556387682305102,
8866
+ "grad_norm": 87.88277435302734,
8867
+ "learning_rate": 9.978295375666829e-06,
8868
+ "loss": 11.4438,
8869
+ "step": 12520
8870
+ },
8871
+ {
8872
+ "epoch": 0.5560825691636017,
8873
+ "grad_norm": 72.19029998779297,
8874
+ "learning_rate": 9.9782780397049e-06,
8875
+ "loss": 10.6795,
8876
+ "step": 12530
8877
+ },
8878
+ {
8879
+ "epoch": 0.5565263700966931,
8880
+ "grad_norm": 66.47805786132812,
8881
+ "learning_rate": 9.978260703742973e-06,
8882
+ "loss": 10.5683,
8883
+ "step": 12540
8884
+ },
8885
+ {
8886
+ "epoch": 0.5569701710297846,
8887
+ "grad_norm": 79.33062744140625,
8888
+ "learning_rate": 9.978243367781046e-06,
8889
+ "loss": 10.8922,
8890
+ "step": 12550
8891
+ },
8892
+ {
8893
+ "epoch": 0.557413971962876,
8894
+ "grad_norm": 67.08543395996094,
8895
+ "learning_rate": 9.97822603181912e-06,
8896
+ "loss": 10.2346,
8897
+ "step": 12560
8898
+ },
8899
+ {
8900
+ "epoch": 0.5578577728959675,
8901
+ "grad_norm": 73.28809356689453,
8902
+ "learning_rate": 9.978208695857191e-06,
8903
+ "loss": 11.0026,
8904
+ "step": 12570
8905
+ },
8906
+ {
8907
+ "epoch": 0.558301573829059,
8908
+ "grad_norm": 75.66752624511719,
8909
+ "learning_rate": 9.978191359895264e-06,
8910
+ "loss": 10.7005,
8911
+ "step": 12580
8912
+ },
8913
+ {
8914
+ "epoch": 0.5587453747621505,
8915
+ "grad_norm": 75.37193298339844,
8916
+ "learning_rate": 9.978174023933337e-06,
8917
+ "loss": 11.1813,
8918
+ "step": 12590
8919
+ },
8920
+ {
8921
+ "epoch": 0.5591891756952418,
8922
+ "grad_norm": 72.89427947998047,
8923
+ "learning_rate": 9.978156687971408e-06,
8924
+ "loss": 11.0323,
8925
+ "step": 12600
8926
+ },
8927
+ {
8928
+ "epoch": 0.5596329766283333,
8929
+ "grad_norm": 70.98633575439453,
8930
+ "learning_rate": 9.978139352009482e-06,
8931
+ "loss": 10.8357,
8932
+ "step": 12610
8933
+ },
8934
+ {
8935
+ "epoch": 0.5600767775614248,
8936
+ "grad_norm": 79.14083099365234,
8937
+ "learning_rate": 9.978122016047555e-06,
8938
+ "loss": 11.1229,
8939
+ "step": 12620
8940
+ },
8941
+ {
8942
+ "epoch": 0.5605205784945163,
8943
+ "grad_norm": 77.51022338867188,
8944
+ "learning_rate": 9.978104680085626e-06,
8945
+ "loss": 10.6968,
8946
+ "step": 12630
8947
+ },
8948
+ {
8949
+ "epoch": 0.5609643794276078,
8950
+ "grad_norm": 84.46963500976562,
8951
+ "learning_rate": 9.978087344123699e-06,
8952
+ "loss": 11.0026,
8953
+ "step": 12640
8954
+ },
8955
+ {
8956
+ "epoch": 0.5614081803606992,
8957
+ "grad_norm": 69.38136291503906,
8958
+ "learning_rate": 9.978070008161772e-06,
8959
+ "loss": 10.9979,
8960
+ "step": 12650
8961
+ },
8962
+ {
8963
+ "epoch": 0.5618519812937907,
8964
+ "grad_norm": 79.57415008544922,
8965
+ "learning_rate": 9.978052672199844e-06,
8966
+ "loss": 11.0241,
8967
+ "step": 12660
8968
+ },
8969
+ {
8970
+ "epoch": 0.5622957822268821,
8971
+ "grad_norm": 76.59188079833984,
8972
+ "learning_rate": 9.978035336237917e-06,
8973
+ "loss": 10.9994,
8974
+ "step": 12670
8975
+ },
8976
+ {
8977
+ "epoch": 0.5627395831599736,
8978
+ "grad_norm": 71.59693145751953,
8979
+ "learning_rate": 9.97801800027599e-06,
8980
+ "loss": 10.8948,
8981
+ "step": 12680
8982
+ },
8983
+ {
8984
+ "epoch": 0.563183384093065,
8985
+ "grad_norm": 87.1396484375,
8986
+ "learning_rate": 9.978000664314061e-06,
8987
+ "loss": 11.0541,
8988
+ "step": 12690
8989
+ },
8990
+ {
8991
+ "epoch": 0.5636271850261565,
8992
+ "grad_norm": 80.40076446533203,
8993
+ "learning_rate": 9.977983328352134e-06,
8994
+ "loss": 10.9173,
8995
+ "step": 12700
8996
+ },
8997
+ {
8998
+ "epoch": 0.564070985959248,
8999
+ "grad_norm": 70.14698791503906,
9000
+ "learning_rate": 9.977965992390207e-06,
9001
+ "loss": 10.8389,
9002
+ "step": 12710
9003
+ },
9004
+ {
9005
+ "epoch": 0.5645147868923395,
9006
+ "grad_norm": 77.14825439453125,
9007
+ "learning_rate": 9.977948656428279e-06,
9008
+ "loss": 10.7059,
9009
+ "step": 12720
9010
+ },
9011
+ {
9012
+ "epoch": 0.564958587825431,
9013
+ "grad_norm": 70.7795181274414,
9014
+ "learning_rate": 9.977931320466352e-06,
9015
+ "loss": 11.038,
9016
+ "step": 12730
9017
+ },
9018
+ {
9019
+ "epoch": 0.5654023887585223,
9020
+ "grad_norm": 91.8403549194336,
9021
+ "learning_rate": 9.977913984504425e-06,
9022
+ "loss": 11.0422,
9023
+ "step": 12740
9024
+ },
9025
+ {
9026
+ "epoch": 0.5658461896916138,
9027
+ "grad_norm": 74.08330535888672,
9028
+ "learning_rate": 9.977896648542496e-06,
9029
+ "loss": 11.2937,
9030
+ "step": 12750
9031
+ },
9032
+ {
9033
+ "epoch": 0.5662899906247053,
9034
+ "grad_norm": 65.14645385742188,
9035
+ "learning_rate": 9.97787931258057e-06,
9036
+ "loss": 10.9717,
9037
+ "step": 12760
9038
+ },
9039
+ {
9040
+ "epoch": 0.5667337915577968,
9041
+ "grad_norm": 71.15072631835938,
9042
+ "learning_rate": 9.977861976618642e-06,
9043
+ "loss": 10.9591,
9044
+ "step": 12770
9045
+ },
9046
+ {
9047
+ "epoch": 0.5671775924908882,
9048
+ "grad_norm": 76.79869079589844,
9049
+ "learning_rate": 9.977844640656715e-06,
9050
+ "loss": 10.5707,
9051
+ "step": 12780
9052
+ },
9053
+ {
9054
+ "epoch": 0.5676213934239797,
9055
+ "grad_norm": 85.81656646728516,
9056
+ "learning_rate": 9.977827304694787e-06,
9057
+ "loss": 11.0967,
9058
+ "step": 12790
9059
+ },
9060
+ {
9061
+ "epoch": 0.5680651943570711,
9062
+ "grad_norm": 62.44831848144531,
9063
+ "learning_rate": 9.97780996873286e-06,
9064
+ "loss": 10.7672,
9065
+ "step": 12800
9066
+ },
9067
+ {
9068
+ "epoch": 0.5685089952901626,
9069
+ "grad_norm": 79.21768188476562,
9070
+ "learning_rate": 9.977792632770933e-06,
9071
+ "loss": 10.8746,
9072
+ "step": 12810
9073
+ },
9074
+ {
9075
+ "epoch": 0.568952796223254,
9076
+ "grad_norm": 75.3280258178711,
9077
+ "learning_rate": 9.977775296809004e-06,
9078
+ "loss": 10.6935,
9079
+ "step": 12820
9080
+ },
9081
+ {
9082
+ "epoch": 0.5693965971563455,
9083
+ "grad_norm": 75.73821258544922,
9084
+ "learning_rate": 9.977757960847077e-06,
9085
+ "loss": 11.4327,
9086
+ "step": 12830
9087
+ },
9088
+ {
9089
+ "epoch": 0.569840398089437,
9090
+ "grad_norm": 77.80765533447266,
9091
+ "learning_rate": 9.97774062488515e-06,
9092
+ "loss": 10.4745,
9093
+ "step": 12840
9094
+ },
9095
+ {
9096
+ "epoch": 0.5702841990225285,
9097
+ "grad_norm": 75.14087677001953,
9098
+ "learning_rate": 9.977723288923222e-06,
9099
+ "loss": 11.1868,
9100
+ "step": 12850
9101
+ },
9102
+ {
9103
+ "epoch": 0.57072799995562,
9104
+ "grad_norm": 91.36666107177734,
9105
+ "learning_rate": 9.977705952961295e-06,
9106
+ "loss": 10.9202,
9107
+ "step": 12860
9108
+ },
9109
+ {
9110
+ "epoch": 0.5711718008887113,
9111
+ "grad_norm": 75.56893920898438,
9112
+ "learning_rate": 9.977688616999368e-06,
9113
+ "loss": 10.4079,
9114
+ "step": 12870
9115
+ },
9116
+ {
9117
+ "epoch": 0.5716156018218028,
9118
+ "grad_norm": 78.30206298828125,
9119
+ "learning_rate": 9.97767128103744e-06,
9120
+ "loss": 10.688,
9121
+ "step": 12880
9122
+ },
9123
+ {
9124
+ "epoch": 0.5720594027548943,
9125
+ "grad_norm": 67.02020263671875,
9126
+ "learning_rate": 9.977653945075512e-06,
9127
+ "loss": 11.0276,
9128
+ "step": 12890
9129
+ },
9130
+ {
9131
+ "epoch": 0.5725032036879858,
9132
+ "grad_norm": 82.73100280761719,
9133
+ "learning_rate": 9.977636609113586e-06,
9134
+ "loss": 10.2337,
9135
+ "step": 12900
9136
+ },
9137
+ {
9138
+ "epoch": 0.5729470046210772,
9139
+ "grad_norm": 68.80712127685547,
9140
+ "learning_rate": 9.977619273151657e-06,
9141
+ "loss": 10.8567,
9142
+ "step": 12910
9143
+ },
9144
+ {
9145
+ "epoch": 0.5733908055541687,
9146
+ "grad_norm": 84.04026794433594,
9147
+ "learning_rate": 9.97760193718973e-06,
9148
+ "loss": 10.6981,
9149
+ "step": 12920
9150
+ },
9151
+ {
9152
+ "epoch": 0.5738346064872601,
9153
+ "grad_norm": 76.38339233398438,
9154
+ "learning_rate": 9.977584601227803e-06,
9155
+ "loss": 10.8466,
9156
+ "step": 12930
9157
+ },
9158
+ {
9159
+ "epoch": 0.5742784074203516,
9160
+ "grad_norm": 72.25580596923828,
9161
+ "learning_rate": 9.977567265265874e-06,
9162
+ "loss": 10.8808,
9163
+ "step": 12940
9164
+ },
9165
+ {
9166
+ "epoch": 0.574722208353443,
9167
+ "grad_norm": 71.599853515625,
9168
+ "learning_rate": 9.977549929303948e-06,
9169
+ "loss": 11.0439,
9170
+ "step": 12950
9171
+ },
9172
+ {
9173
+ "epoch": 0.5751660092865345,
9174
+ "grad_norm": 70.68738555908203,
9175
+ "learning_rate": 9.97753259334202e-06,
9176
+ "loss": 10.314,
9177
+ "step": 12960
9178
+ },
9179
+ {
9180
+ "epoch": 0.575609810219626,
9181
+ "grad_norm": 85.19171905517578,
9182
+ "learning_rate": 9.977515257380092e-06,
9183
+ "loss": 10.8608,
9184
+ "step": 12970
9185
+ },
9186
+ {
9187
+ "epoch": 0.5760536111527175,
9188
+ "grad_norm": 69.8178482055664,
9189
+ "learning_rate": 9.977497921418165e-06,
9190
+ "loss": 10.6028,
9191
+ "step": 12980
9192
+ },
9193
+ {
9194
+ "epoch": 0.576497412085809,
9195
+ "grad_norm": 75.49757385253906,
9196
+ "learning_rate": 9.977480585456238e-06,
9197
+ "loss": 10.3789,
9198
+ "step": 12990
9199
+ },
9200
+ {
9201
+ "epoch": 0.5769412130189003,
9202
+ "grad_norm": 71.4937515258789,
9203
+ "learning_rate": 9.977463249494311e-06,
9204
+ "loss": 10.6222,
9205
+ "step": 13000
9206
+ },
9207
+ {
9208
+ "epoch": 0.5769412130189003,
9209
+ "eval_loss": 0.3380095958709717,
9210
+ "eval_runtime": 676.243,
9211
+ "eval_samples_per_second": 1795.791,
9212
+ "eval_steps_per_second": 56.119,
9213
+ "step": 13000
9214
+ },
9215
+ {
9216
+ "epoch": 0.5773850139519918,
9217
+ "grad_norm": 80.01882934570312,
9218
+ "learning_rate": 9.977445913532383e-06,
9219
+ "loss": 11.0475,
9220
+ "step": 13010
9221
+ },
9222
+ {
9223
+ "epoch": 0.5778288148850833,
9224
+ "grad_norm": 79.94010925292969,
9225
+ "learning_rate": 9.977428577570456e-06,
9226
+ "loss": 10.7743,
9227
+ "step": 13020
9228
+ },
9229
+ {
9230
+ "epoch": 0.5782726158181748,
9231
+ "grad_norm": 78.5164566040039,
9232
+ "learning_rate": 9.977411241608529e-06,
9233
+ "loss": 10.7199,
9234
+ "step": 13030
9235
+ },
9236
+ {
9237
+ "epoch": 0.5787164167512662,
9238
+ "grad_norm": 74.05512237548828,
9239
+ "learning_rate": 9.9773939056466e-06,
9240
+ "loss": 11.3993,
9241
+ "step": 13040
9242
+ },
9243
+ {
9244
+ "epoch": 0.5791602176843577,
9245
+ "grad_norm": 71.45179748535156,
9246
+ "learning_rate": 9.977376569684673e-06,
9247
+ "loss": 10.6827,
9248
+ "step": 13050
9249
+ },
9250
+ {
9251
+ "epoch": 0.5796040186174491,
9252
+ "grad_norm": 69.38019561767578,
9253
+ "learning_rate": 9.977359233722746e-06,
9254
+ "loss": 11.3264,
9255
+ "step": 13060
9256
+ },
9257
+ {
9258
+ "epoch": 0.5800478195505406,
9259
+ "grad_norm": 81.93159484863281,
9260
+ "learning_rate": 9.977341897760818e-06,
9261
+ "loss": 10.2077,
9262
+ "step": 13070
9263
+ },
9264
+ {
9265
+ "epoch": 0.5804916204836321,
9266
+ "grad_norm": 79.02632141113281,
9267
+ "learning_rate": 9.97732456179889e-06,
9268
+ "loss": 10.7618,
9269
+ "step": 13080
9270
+ },
9271
+ {
9272
+ "epoch": 0.5809354214167235,
9273
+ "grad_norm": 83.42288970947266,
9274
+ "learning_rate": 9.977307225836964e-06,
9275
+ "loss": 10.335,
9276
+ "step": 13090
9277
+ },
9278
+ {
9279
+ "epoch": 0.581379222349815,
9280
+ "grad_norm": 86.47683715820312,
9281
+ "learning_rate": 9.977289889875035e-06,
9282
+ "loss": 10.56,
9283
+ "step": 13100
9284
+ },
9285
+ {
9286
+ "epoch": 0.5818230232829065,
9287
+ "grad_norm": 87.43464660644531,
9288
+ "learning_rate": 9.977272553913108e-06,
9289
+ "loss": 11.0672,
9290
+ "step": 13110
9291
+ },
9292
+ {
9293
+ "epoch": 0.582266824215998,
9294
+ "grad_norm": 79.6264877319336,
9295
+ "learning_rate": 9.977255217951181e-06,
9296
+ "loss": 11.4854,
9297
+ "step": 13120
9298
+ },
9299
+ {
9300
+ "epoch": 0.5827106251490893,
9301
+ "grad_norm": 68.71961212158203,
9302
+ "learning_rate": 9.977237881989253e-06,
9303
+ "loss": 10.6174,
9304
+ "step": 13130
9305
+ },
9306
+ {
9307
+ "epoch": 0.5831544260821808,
9308
+ "grad_norm": 64.4797592163086,
9309
+ "learning_rate": 9.977220546027326e-06,
9310
+ "loss": 10.2814,
9311
+ "step": 13140
9312
+ },
9313
+ {
9314
+ "epoch": 0.5835982270152723,
9315
+ "grad_norm": 77.29915618896484,
9316
+ "learning_rate": 9.977203210065399e-06,
9317
+ "loss": 11.197,
9318
+ "step": 13150
9319
+ },
9320
+ {
9321
+ "epoch": 0.5840420279483638,
9322
+ "grad_norm": 71.70263671875,
9323
+ "learning_rate": 9.97718587410347e-06,
9324
+ "loss": 10.7686,
9325
+ "step": 13160
9326
+ },
9327
+ {
9328
+ "epoch": 0.5844858288814552,
9329
+ "grad_norm": 78.5580062866211,
9330
+ "learning_rate": 9.977168538141543e-06,
9331
+ "loss": 10.7684,
9332
+ "step": 13170
9333
+ },
9334
+ {
9335
+ "epoch": 0.5849296298145467,
9336
+ "grad_norm": 87.69745635986328,
9337
+ "learning_rate": 9.977151202179616e-06,
9338
+ "loss": 11.1967,
9339
+ "step": 13180
9340
+ },
9341
+ {
9342
+ "epoch": 0.5853734307476381,
9343
+ "grad_norm": 79.14582824707031,
9344
+ "learning_rate": 9.977133866217688e-06,
9345
+ "loss": 10.4763,
9346
+ "step": 13190
9347
+ },
9348
+ {
9349
+ "epoch": 0.5858172316807296,
9350
+ "grad_norm": 87.4928207397461,
9351
+ "learning_rate": 9.977116530255761e-06,
9352
+ "loss": 10.9238,
9353
+ "step": 13200
9354
+ },
9355
+ {
9356
+ "epoch": 0.5862610326138211,
9357
+ "grad_norm": 76.60111999511719,
9358
+ "learning_rate": 9.977099194293834e-06,
9359
+ "loss": 10.7317,
9360
+ "step": 13210
9361
+ },
9362
+ {
9363
+ "epoch": 0.5867048335469125,
9364
+ "grad_norm": 80.5156478881836,
9365
+ "learning_rate": 9.977081858331907e-06,
9366
+ "loss": 10.0798,
9367
+ "step": 13220
9368
+ },
9369
+ {
9370
+ "epoch": 0.587148634480004,
9371
+ "grad_norm": 76.1066665649414,
9372
+ "learning_rate": 9.977064522369978e-06,
9373
+ "loss": 10.5962,
9374
+ "step": 13230
9375
+ },
9376
+ {
9377
+ "epoch": 0.5875924354130955,
9378
+ "grad_norm": 76.55358123779297,
9379
+ "learning_rate": 9.977047186408052e-06,
9380
+ "loss": 10.4487,
9381
+ "step": 13240
9382
+ },
9383
+ {
9384
+ "epoch": 0.588036236346187,
9385
+ "grad_norm": 79.84068298339844,
9386
+ "learning_rate": 9.977029850446125e-06,
9387
+ "loss": 10.9349,
9388
+ "step": 13250
9389
+ },
9390
+ {
9391
+ "epoch": 0.5884800372792783,
9392
+ "grad_norm": 67.1181411743164,
9393
+ "learning_rate": 9.977012514484196e-06,
9394
+ "loss": 10.5641,
9395
+ "step": 13260
9396
+ },
9397
+ {
9398
+ "epoch": 0.5889238382123698,
9399
+ "grad_norm": 75.07349395751953,
9400
+ "learning_rate": 9.976995178522269e-06,
9401
+ "loss": 10.9706,
9402
+ "step": 13270
9403
+ },
9404
+ {
9405
+ "epoch": 0.5893676391454613,
9406
+ "grad_norm": 70.72966003417969,
9407
+ "learning_rate": 9.976977842560342e-06,
9408
+ "loss": 10.4643,
9409
+ "step": 13280
9410
+ },
9411
+ {
9412
+ "epoch": 0.5898114400785528,
9413
+ "grad_norm": 73.792724609375,
9414
+ "learning_rate": 9.976960506598414e-06,
9415
+ "loss": 10.8425,
9416
+ "step": 13290
9417
+ },
9418
+ {
9419
+ "epoch": 0.5902552410116442,
9420
+ "grad_norm": 67.8843994140625,
9421
+ "learning_rate": 9.976943170636487e-06,
9422
+ "loss": 10.7675,
9423
+ "step": 13300
9424
+ },
9425
+ {
9426
+ "epoch": 0.5906990419447357,
9427
+ "grad_norm": 64.45490264892578,
9428
+ "learning_rate": 9.97692583467456e-06,
9429
+ "loss": 9.6526,
9430
+ "step": 13310
9431
+ },
9432
+ {
9433
+ "epoch": 0.5911428428778271,
9434
+ "grad_norm": 83.5705337524414,
9435
+ "learning_rate": 9.976908498712631e-06,
9436
+ "loss": 11.1669,
9437
+ "step": 13320
9438
+ },
9439
+ {
9440
+ "epoch": 0.5915866438109186,
9441
+ "grad_norm": 71.82415008544922,
9442
+ "learning_rate": 9.976891162750704e-06,
9443
+ "loss": 10.5378,
9444
+ "step": 13330
9445
+ },
9446
+ {
9447
+ "epoch": 0.5920304447440101,
9448
+ "grad_norm": 80.347900390625,
9449
+ "learning_rate": 9.976873826788777e-06,
9450
+ "loss": 10.9272,
9451
+ "step": 13340
9452
+ },
9453
+ {
9454
+ "epoch": 0.5924742456771015,
9455
+ "grad_norm": 85.71224212646484,
9456
+ "learning_rate": 9.976856490826849e-06,
9457
+ "loss": 10.9012,
9458
+ "step": 13350
9459
+ },
9460
+ {
9461
+ "epoch": 0.592918046610193,
9462
+ "grad_norm": 79.83573150634766,
9463
+ "learning_rate": 9.976839154864922e-06,
9464
+ "loss": 10.5738,
9465
+ "step": 13360
9466
+ },
9467
+ {
9468
+ "epoch": 0.5933618475432845,
9469
+ "grad_norm": 74.77920532226562,
9470
+ "learning_rate": 9.976821818902995e-06,
9471
+ "loss": 10.9535,
9472
+ "step": 13370
9473
+ },
9474
+ {
9475
+ "epoch": 0.593805648476376,
9476
+ "grad_norm": 78.11053466796875,
9477
+ "learning_rate": 9.976804482941068e-06,
9478
+ "loss": 10.6037,
9479
+ "step": 13380
9480
+ },
9481
+ {
9482
+ "epoch": 0.5942494494094673,
9483
+ "grad_norm": 74.39498901367188,
9484
+ "learning_rate": 9.97678714697914e-06,
9485
+ "loss": 10.7146,
9486
+ "step": 13390
9487
+ },
9488
+ {
9489
+ "epoch": 0.5946932503425588,
9490
+ "grad_norm": 60.10570526123047,
9491
+ "learning_rate": 9.976769811017212e-06,
9492
+ "loss": 10.5725,
9493
+ "step": 13400
9494
+ },
9495
+ {
9496
+ "epoch": 0.5951370512756503,
9497
+ "grad_norm": 73.86632537841797,
9498
+ "learning_rate": 9.976752475055285e-06,
9499
+ "loss": 11.0233,
9500
+ "step": 13410
9501
+ },
9502
+ {
9503
+ "epoch": 0.5955808522087418,
9504
+ "grad_norm": 88.34583282470703,
9505
+ "learning_rate": 9.976735139093357e-06,
9506
+ "loss": 10.9409,
9507
+ "step": 13420
9508
+ },
9509
+ {
9510
+ "epoch": 0.5960246531418333,
9511
+ "grad_norm": 79.4537353515625,
9512
+ "learning_rate": 9.97671780313143e-06,
9513
+ "loss": 10.3377,
9514
+ "step": 13430
9515
+ },
9516
+ {
9517
+ "epoch": 0.5964684540749247,
9518
+ "grad_norm": 78.50579833984375,
9519
+ "learning_rate": 9.976700467169503e-06,
9520
+ "loss": 10.7454,
9521
+ "step": 13440
9522
+ },
9523
+ {
9524
+ "epoch": 0.5969122550080161,
9525
+ "grad_norm": 86.72323608398438,
9526
+ "learning_rate": 9.976683131207574e-06,
9527
+ "loss": 11.0431,
9528
+ "step": 13450
9529
+ },
9530
+ {
9531
+ "epoch": 0.5973560559411076,
9532
+ "grad_norm": 74.95063781738281,
9533
+ "learning_rate": 9.976665795245647e-06,
9534
+ "loss": 10.871,
9535
+ "step": 13460
9536
+ },
9537
+ {
9538
+ "epoch": 0.5977998568741991,
9539
+ "grad_norm": 81.35505676269531,
9540
+ "learning_rate": 9.97664845928372e-06,
9541
+ "loss": 10.8405,
9542
+ "step": 13470
9543
+ },
9544
+ {
9545
+ "epoch": 0.5982436578072905,
9546
+ "grad_norm": 76.54270935058594,
9547
+ "learning_rate": 9.976631123321794e-06,
9548
+ "loss": 10.7319,
9549
+ "step": 13480
9550
+ },
9551
+ {
9552
+ "epoch": 0.598687458740382,
9553
+ "grad_norm": 85.25194549560547,
9554
+ "learning_rate": 9.976613787359865e-06,
9555
+ "loss": 10.6786,
9556
+ "step": 13490
9557
+ },
9558
+ {
9559
+ "epoch": 0.5991312596734735,
9560
+ "grad_norm": 71.38860321044922,
9561
+ "learning_rate": 9.976596451397938e-06,
9562
+ "loss": 10.5208,
9563
+ "step": 13500
9564
+ },
9565
+ {
9566
+ "epoch": 0.599575060606565,
9567
+ "grad_norm": 96.68553924560547,
9568
+ "learning_rate": 9.976579115436011e-06,
9569
+ "loss": 11.0773,
9570
+ "step": 13510
9571
+ },
9572
+ {
9573
+ "epoch": 0.6000188615396563,
9574
+ "grad_norm": 84.14801025390625,
9575
+ "learning_rate": 9.976561779474083e-06,
9576
+ "loss": 10.6248,
9577
+ "step": 13520
9578
+ },
9579
+ {
9580
+ "epoch": 0.6004626624727478,
9581
+ "grad_norm": 79.10609436035156,
9582
+ "learning_rate": 9.976544443512156e-06,
9583
+ "loss": 10.942,
9584
+ "step": 13530
9585
+ },
9586
+ {
9587
+ "epoch": 0.6009064634058393,
9588
+ "grad_norm": 70.84247589111328,
9589
+ "learning_rate": 9.976527107550229e-06,
9590
+ "loss": 11.0524,
9591
+ "step": 13540
9592
+ },
9593
+ {
9594
+ "epoch": 0.6013502643389308,
9595
+ "grad_norm": 73.7003402709961,
9596
+ "learning_rate": 9.9765097715883e-06,
9597
+ "loss": 10.8864,
9598
+ "step": 13550
9599
+ },
9600
+ {
9601
+ "epoch": 0.6017940652720223,
9602
+ "grad_norm": 83.23018646240234,
9603
+ "learning_rate": 9.976492435626373e-06,
9604
+ "loss": 10.9026,
9605
+ "step": 13560
9606
+ },
9607
+ {
9608
+ "epoch": 0.6022378662051137,
9609
+ "grad_norm": 76.42780303955078,
9610
+ "learning_rate": 9.976475099664446e-06,
9611
+ "loss": 10.7181,
9612
+ "step": 13570
9613
+ },
9614
+ {
9615
+ "epoch": 0.6026816671382051,
9616
+ "grad_norm": 75.6198501586914,
9617
+ "learning_rate": 9.976457763702518e-06,
9618
+ "loss": 11.1458,
9619
+ "step": 13580
9620
+ },
9621
+ {
9622
+ "epoch": 0.6031254680712966,
9623
+ "grad_norm": 88.8167495727539,
9624
+ "learning_rate": 9.97644042774059e-06,
9625
+ "loss": 10.8837,
9626
+ "step": 13590
9627
+ },
9628
+ {
9629
+ "epoch": 0.6035692690043881,
9630
+ "grad_norm": 71.14923858642578,
9631
+ "learning_rate": 9.976423091778664e-06,
9632
+ "loss": 10.8807,
9633
+ "step": 13600
9634
+ },
9635
+ {
9636
+ "epoch": 0.6040130699374795,
9637
+ "grad_norm": 98.20439147949219,
9638
+ "learning_rate": 9.976405755816737e-06,
9639
+ "loss": 11.4224,
9640
+ "step": 13610
9641
+ },
9642
+ {
9643
+ "epoch": 0.604456870870571,
9644
+ "grad_norm": 75.78369140625,
9645
+ "learning_rate": 9.976388419854808e-06,
9646
+ "loss": 10.2893,
9647
+ "step": 13620
9648
+ },
9649
+ {
9650
+ "epoch": 0.6049006718036625,
9651
+ "grad_norm": 79.95215606689453,
9652
+ "learning_rate": 9.976371083892881e-06,
9653
+ "loss": 10.8222,
9654
+ "step": 13630
9655
+ },
9656
+ {
9657
+ "epoch": 0.605344472736754,
9658
+ "grad_norm": 83.35610961914062,
9659
+ "learning_rate": 9.976353747930954e-06,
9660
+ "loss": 10.7412,
9661
+ "step": 13640
9662
+ },
9663
+ {
9664
+ "epoch": 0.6057882736698454,
9665
+ "grad_norm": 75.5788345336914,
9666
+ "learning_rate": 9.976336411969026e-06,
9667
+ "loss": 10.5057,
9668
+ "step": 13650
9669
+ },
9670
+ {
9671
+ "epoch": 0.6062320746029368,
9672
+ "grad_norm": 72.3238296508789,
9673
+ "learning_rate": 9.976319076007099e-06,
9674
+ "loss": 11.2517,
9675
+ "step": 13660
9676
+ },
9677
+ {
9678
+ "epoch": 0.6066758755360283,
9679
+ "grad_norm": 75.04680633544922,
9680
+ "learning_rate": 9.976301740045172e-06,
9681
+ "loss": 10.5344,
9682
+ "step": 13670
9683
+ },
9684
+ {
9685
+ "epoch": 0.6071196764691198,
9686
+ "grad_norm": 63.219268798828125,
9687
+ "learning_rate": 9.976284404083243e-06,
9688
+ "loss": 10.9847,
9689
+ "step": 13680
9690
+ },
9691
+ {
9692
+ "epoch": 0.6075634774022113,
9693
+ "grad_norm": 87.58104705810547,
9694
+ "learning_rate": 9.976267068121316e-06,
9695
+ "loss": 11.4142,
9696
+ "step": 13690
9697
+ },
9698
+ {
9699
+ "epoch": 0.6080072783353027,
9700
+ "grad_norm": 70.27639770507812,
9701
+ "learning_rate": 9.97624973215939e-06,
9702
+ "loss": 10.9125,
9703
+ "step": 13700
9704
+ },
9705
+ {
9706
+ "epoch": 0.6084510792683941,
9707
+ "grad_norm": 75.89519500732422,
9708
+ "learning_rate": 9.97623239619746e-06,
9709
+ "loss": 10.8769,
9710
+ "step": 13710
9711
+ },
9712
+ {
9713
+ "epoch": 0.6088948802014856,
9714
+ "grad_norm": 78.69032287597656,
9715
+ "learning_rate": 9.976215060235534e-06,
9716
+ "loss": 10.4481,
9717
+ "step": 13720
9718
+ },
9719
+ {
9720
+ "epoch": 0.6093386811345771,
9721
+ "grad_norm": 73.8470230102539,
9722
+ "learning_rate": 9.976197724273607e-06,
9723
+ "loss": 10.5418,
9724
+ "step": 13730
9725
+ },
9726
+ {
9727
+ "epoch": 0.6097824820676685,
9728
+ "grad_norm": 74.75282287597656,
9729
+ "learning_rate": 9.97618038831168e-06,
9730
+ "loss": 10.6919,
9731
+ "step": 13740
9732
+ },
9733
+ {
9734
+ "epoch": 0.61022628300076,
9735
+ "grad_norm": 82.48995971679688,
9736
+ "learning_rate": 9.976163052349751e-06,
9737
+ "loss": 11.2039,
9738
+ "step": 13750
9739
+ },
9740
+ {
9741
+ "epoch": 0.6106700839338515,
9742
+ "grad_norm": 86.70307159423828,
9743
+ "learning_rate": 9.976145716387825e-06,
9744
+ "loss": 10.645,
9745
+ "step": 13760
9746
+ },
9747
+ {
9748
+ "epoch": 0.611113884866943,
9749
+ "grad_norm": 87.02120208740234,
9750
+ "learning_rate": 9.976128380425898e-06,
9751
+ "loss": 10.8805,
9752
+ "step": 13770
9753
+ },
9754
+ {
9755
+ "epoch": 0.6115576858000344,
9756
+ "grad_norm": 67.70435333251953,
9757
+ "learning_rate": 9.976111044463969e-06,
9758
+ "loss": 10.7013,
9759
+ "step": 13780
9760
+ },
9761
+ {
9762
+ "epoch": 0.6120014867331258,
9763
+ "grad_norm": 78.09877014160156,
9764
+ "learning_rate": 9.976093708502042e-06,
9765
+ "loss": 10.7801,
9766
+ "step": 13790
9767
+ },
9768
+ {
9769
+ "epoch": 0.6124452876662173,
9770
+ "grad_norm": 85.01838684082031,
9771
+ "learning_rate": 9.976076372540115e-06,
9772
+ "loss": 10.6342,
9773
+ "step": 13800
9774
+ },
9775
+ {
9776
+ "epoch": 0.6128890885993088,
9777
+ "grad_norm": 83.46656799316406,
9778
+ "learning_rate": 9.976059036578187e-06,
9779
+ "loss": 10.5866,
9780
+ "step": 13810
9781
+ },
9782
+ {
9783
+ "epoch": 0.6133328895324003,
9784
+ "grad_norm": 81.93741607666016,
9785
+ "learning_rate": 9.97604170061626e-06,
9786
+ "loss": 10.4993,
9787
+ "step": 13820
9788
+ },
9789
+ {
9790
+ "epoch": 0.6137766904654917,
9791
+ "grad_norm": 60.87858963012695,
9792
+ "learning_rate": 9.976024364654333e-06,
9793
+ "loss": 10.246,
9794
+ "step": 13830
9795
+ },
9796
+ {
9797
+ "epoch": 0.6142204913985831,
9798
+ "grad_norm": 85.0611343383789,
9799
+ "learning_rate": 9.976007028692404e-06,
9800
+ "loss": 11.0282,
9801
+ "step": 13840
9802
+ },
9803
+ {
9804
+ "epoch": 0.6146642923316746,
9805
+ "grad_norm": 65.17115783691406,
9806
+ "learning_rate": 9.975989692730477e-06,
9807
+ "loss": 10.2572,
9808
+ "step": 13850
9809
+ },
9810
+ {
9811
+ "epoch": 0.6151080932647661,
9812
+ "grad_norm": 67.50042724609375,
9813
+ "learning_rate": 9.97597235676855e-06,
9814
+ "loss": 10.2284,
9815
+ "step": 13860
9816
+ },
9817
+ {
9818
+ "epoch": 0.6155518941978575,
9819
+ "grad_norm": 89.74058532714844,
9820
+ "learning_rate": 9.975955020806623e-06,
9821
+ "loss": 10.8248,
9822
+ "step": 13870
9823
+ },
9824
+ {
9825
+ "epoch": 0.615995695130949,
9826
+ "grad_norm": 86.69886779785156,
9827
+ "learning_rate": 9.975937684844695e-06,
9828
+ "loss": 10.8384,
9829
+ "step": 13880
9830
+ },
9831
+ {
9832
+ "epoch": 0.6164394960640405,
9833
+ "grad_norm": 68.96355438232422,
9834
+ "learning_rate": 9.975920348882768e-06,
9835
+ "loss": 10.2142,
9836
+ "step": 13890
9837
+ },
9838
+ {
9839
+ "epoch": 0.616883296997132,
9840
+ "grad_norm": 91.22903442382812,
9841
+ "learning_rate": 9.97590301292084e-06,
9842
+ "loss": 11.3292,
9843
+ "step": 13900
9844
+ },
9845
+ {
9846
+ "epoch": 0.6173270979302234,
9847
+ "grad_norm": 67.73802185058594,
9848
+ "learning_rate": 9.975885676958912e-06,
9849
+ "loss": 10.8209,
9850
+ "step": 13910
9851
+ },
9852
+ {
9853
+ "epoch": 0.6177708988633148,
9854
+ "grad_norm": 80.69165802001953,
9855
+ "learning_rate": 9.975868340996985e-06,
9856
+ "loss": 10.6029,
9857
+ "step": 13920
9858
+ },
9859
+ {
9860
+ "epoch": 0.6182146997964063,
9861
+ "grad_norm": 73.6860580444336,
9862
+ "learning_rate": 9.975851005035058e-06,
9863
+ "loss": 10.3577,
9864
+ "step": 13930
9865
+ },
9866
+ {
9867
+ "epoch": 0.6186585007294978,
9868
+ "grad_norm": 69.81961822509766,
9869
+ "learning_rate": 9.97583366907313e-06,
9870
+ "loss": 10.3961,
9871
+ "step": 13940
9872
+ },
9873
+ {
9874
+ "epoch": 0.6191023016625893,
9875
+ "grad_norm": 76.92399597167969,
9876
+ "learning_rate": 9.975816333111203e-06,
9877
+ "loss": 11.1469,
9878
+ "step": 13950
9879
+ },
9880
+ {
9881
+ "epoch": 0.6195461025956807,
9882
+ "grad_norm": 72.5120849609375,
9883
+ "learning_rate": 9.975798997149276e-06,
9884
+ "loss": 10.7128,
9885
+ "step": 13960
9886
+ },
9887
+ {
9888
+ "epoch": 0.6199899035287721,
9889
+ "grad_norm": 70.09220123291016,
9890
+ "learning_rate": 9.975781661187347e-06,
9891
+ "loss": 10.5301,
9892
+ "step": 13970
9893
+ },
9894
+ {
9895
+ "epoch": 0.6204337044618636,
9896
+ "grad_norm": 67.22700500488281,
9897
+ "learning_rate": 9.97576432522542e-06,
9898
+ "loss": 10.1562,
9899
+ "step": 13980
9900
+ },
9901
+ {
9902
+ "epoch": 0.6208775053949551,
9903
+ "grad_norm": 69.6814956665039,
9904
+ "learning_rate": 9.975746989263493e-06,
9905
+ "loss": 11.2697,
9906
+ "step": 13990
9907
+ },
9908
+ {
9909
+ "epoch": 0.6213213063280466,
9910
+ "grad_norm": 81.13983917236328,
9911
+ "learning_rate": 9.975729653301567e-06,
9912
+ "loss": 10.9927,
9913
+ "step": 14000
9914
+ },
9915
+ {
9916
+ "epoch": 0.6213213063280466,
9917
+ "eval_loss": 0.3354085683822632,
9918
+ "eval_runtime": 673.6456,
9919
+ "eval_samples_per_second": 1802.715,
9920
+ "eval_steps_per_second": 56.335,
9921
+ "step": 14000
9922
+ },
9923
+ {
9924
+ "epoch": 0.621765107261138,
9925
+ "grad_norm": 73.27105712890625,
9926
+ "learning_rate": 9.975712317339638e-06,
9927
+ "loss": 10.5364,
9928
+ "step": 14010
9929
+ },
9930
+ {
9931
+ "epoch": 0.6222089081942295,
9932
+ "grad_norm": 74.4351577758789,
9933
+ "learning_rate": 9.975694981377711e-06,
9934
+ "loss": 10.6971,
9935
+ "step": 14020
9936
+ },
9937
+ {
9938
+ "epoch": 0.622652709127321,
9939
+ "grad_norm": 75.62439727783203,
9940
+ "learning_rate": 9.975677645415784e-06,
9941
+ "loss": 10.709,
9942
+ "step": 14030
9943
+ },
9944
+ {
9945
+ "epoch": 0.6230965100604124,
9946
+ "grad_norm": 71.058349609375,
9947
+ "learning_rate": 9.975660309453855e-06,
9948
+ "loss": 10.7634,
9949
+ "step": 14040
9950
+ },
9951
+ {
9952
+ "epoch": 0.6235403109935038,
9953
+ "grad_norm": 85.23941802978516,
9954
+ "learning_rate": 9.975642973491929e-06,
9955
+ "loss": 11.1897,
9956
+ "step": 14050
9957
+ },
9958
+ {
9959
+ "epoch": 0.6239841119265953,
9960
+ "grad_norm": 78.38990020751953,
9961
+ "learning_rate": 9.975625637530002e-06,
9962
+ "loss": 10.9573,
9963
+ "step": 14060
9964
+ },
9965
+ {
9966
+ "epoch": 0.6244279128596868,
9967
+ "grad_norm": 73.34696197509766,
9968
+ "learning_rate": 9.975608301568073e-06,
9969
+ "loss": 11.1118,
9970
+ "step": 14070
9971
+ },
9972
+ {
9973
+ "epoch": 0.6248717137927783,
9974
+ "grad_norm": 65.3375244140625,
9975
+ "learning_rate": 9.975590965606146e-06,
9976
+ "loss": 10.7498,
9977
+ "step": 14080
9978
+ },
9979
+ {
9980
+ "epoch": 0.6253155147258697,
9981
+ "grad_norm": 89.46272277832031,
9982
+ "learning_rate": 9.97557362964422e-06,
9983
+ "loss": 10.7714,
9984
+ "step": 14090
9985
+ },
9986
+ {
9987
+ "epoch": 0.6257593156589611,
9988
+ "grad_norm": 73.509521484375,
9989
+ "learning_rate": 9.975556293682292e-06,
9990
+ "loss": 10.5213,
9991
+ "step": 14100
9992
+ },
9993
+ {
9994
+ "epoch": 0.6262031165920526,
9995
+ "grad_norm": 58.2670783996582,
9996
+ "learning_rate": 9.975538957720364e-06,
9997
+ "loss": 10.8304,
9998
+ "step": 14110
9999
+ },
10000
+ {
10001
+ "epoch": 0.6266469175251441,
10002
+ "grad_norm": 71.2006607055664,
10003
+ "learning_rate": 9.975521621758437e-06,
10004
+ "loss": 10.727,
10005
+ "step": 14120
10006
+ },
10007
+ {
10008
+ "epoch": 0.6270907184582356,
10009
+ "grad_norm": 82.89636993408203,
10010
+ "learning_rate": 9.97550428579651e-06,
10011
+ "loss": 10.5115,
10012
+ "step": 14130
10013
+ },
10014
+ {
10015
+ "epoch": 0.627534519391327,
10016
+ "grad_norm": 69.2073974609375,
10017
+ "learning_rate": 9.975486949834581e-06,
10018
+ "loss": 10.3403,
10019
+ "step": 14140
10020
+ },
10021
+ {
10022
+ "epoch": 0.6279783203244185,
10023
+ "grad_norm": 71.10247039794922,
10024
+ "learning_rate": 9.975469613872654e-06,
10025
+ "loss": 11.1554,
10026
+ "step": 14150
10027
+ },
10028
+ {
10029
+ "epoch": 0.62842212125751,
10030
+ "grad_norm": 70.13909912109375,
10031
+ "learning_rate": 9.975452277910727e-06,
10032
+ "loss": 10.4193,
10033
+ "step": 14160
10034
+ },
10035
+ {
10036
+ "epoch": 0.6288659221906014,
10037
+ "grad_norm": 81.4350814819336,
10038
+ "learning_rate": 9.975434941948799e-06,
10039
+ "loss": 10.6163,
10040
+ "step": 14170
10041
+ },
10042
+ {
10043
+ "epoch": 0.6293097231236928,
10044
+ "grad_norm": 71.56867980957031,
10045
+ "learning_rate": 9.975417605986872e-06,
10046
+ "loss": 10.7015,
10047
+ "step": 14180
10048
+ },
10049
+ {
10050
+ "epoch": 0.6297535240567843,
10051
+ "grad_norm": 81.04633331298828,
10052
+ "learning_rate": 9.975400270024945e-06,
10053
+ "loss": 10.8405,
10054
+ "step": 14190
10055
+ },
10056
+ {
10057
+ "epoch": 0.6301973249898758,
10058
+ "grad_norm": 70.49227905273438,
10059
+ "learning_rate": 9.975382934063016e-06,
10060
+ "loss": 9.9988,
10061
+ "step": 14200
10062
+ },
10063
+ {
10064
+ "epoch": 0.6306411259229673,
10065
+ "grad_norm": 74.22464752197266,
10066
+ "learning_rate": 9.97536559810109e-06,
10067
+ "loss": 10.6188,
10068
+ "step": 14210
10069
+ },
10070
+ {
10071
+ "epoch": 0.6310849268560588,
10072
+ "grad_norm": 71.94355773925781,
10073
+ "learning_rate": 9.975348262139162e-06,
10074
+ "loss": 11.1014,
10075
+ "step": 14220
10076
+ },
10077
+ {
10078
+ "epoch": 0.6315287277891501,
10079
+ "grad_norm": 77.49762725830078,
10080
+ "learning_rate": 9.975330926177234e-06,
10081
+ "loss": 10.8575,
10082
+ "step": 14230
10083
+ },
10084
+ {
10085
+ "epoch": 0.6319725287222416,
10086
+ "grad_norm": 60.8555793762207,
10087
+ "learning_rate": 9.975313590215307e-06,
10088
+ "loss": 10.5589,
10089
+ "step": 14240
10090
+ },
10091
+ {
10092
+ "epoch": 0.6324163296553331,
10093
+ "grad_norm": 75.85652160644531,
10094
+ "learning_rate": 9.97529625425338e-06,
10095
+ "loss": 10.7895,
10096
+ "step": 14250
10097
+ },
10098
+ {
10099
+ "epoch": 0.6328601305884246,
10100
+ "grad_norm": 78.68632507324219,
10101
+ "learning_rate": 9.975278918291451e-06,
10102
+ "loss": 10.5087,
10103
+ "step": 14260
10104
+ },
10105
+ {
10106
+ "epoch": 0.633303931521516,
10107
+ "grad_norm": 67.67071533203125,
10108
+ "learning_rate": 9.975261582329524e-06,
10109
+ "loss": 10.6245,
10110
+ "step": 14270
10111
+ },
10112
+ {
10113
+ "epoch": 0.6337477324546075,
10114
+ "grad_norm": 71.56282806396484,
10115
+ "learning_rate": 9.975244246367597e-06,
10116
+ "loss": 10.2392,
10117
+ "step": 14280
10118
+ },
10119
+ {
10120
+ "epoch": 0.634191533387699,
10121
+ "grad_norm": 88.3572769165039,
10122
+ "learning_rate": 9.975226910405669e-06,
10123
+ "loss": 10.4799,
10124
+ "step": 14290
10125
+ },
10126
+ {
10127
+ "epoch": 0.6346353343207904,
10128
+ "grad_norm": 82.04935455322266,
10129
+ "learning_rate": 9.975209574443742e-06,
10130
+ "loss": 10.4068,
10131
+ "step": 14300
10132
+ },
10133
+ {
10134
+ "epoch": 0.6350791352538818,
10135
+ "grad_norm": 84.6961441040039,
10136
+ "learning_rate": 9.975192238481815e-06,
10137
+ "loss": 11.09,
10138
+ "step": 14310
10139
+ },
10140
+ {
10141
+ "epoch": 0.6355229361869733,
10142
+ "grad_norm": 71.76660919189453,
10143
+ "learning_rate": 9.975174902519888e-06,
10144
+ "loss": 10.3988,
10145
+ "step": 14320
10146
+ },
10147
+ {
10148
+ "epoch": 0.6359667371200648,
10149
+ "grad_norm": 80.33756256103516,
10150
+ "learning_rate": 9.97515756655796e-06,
10151
+ "loss": 11.2503,
10152
+ "step": 14330
10153
+ },
10154
+ {
10155
+ "epoch": 0.6364105380531563,
10156
+ "grad_norm": 62.710575103759766,
10157
+ "learning_rate": 9.975140230596033e-06,
10158
+ "loss": 10.4881,
10159
+ "step": 14340
10160
+ },
10161
+ {
10162
+ "epoch": 0.6368543389862478,
10163
+ "grad_norm": 71.74394989013672,
10164
+ "learning_rate": 9.975122894634106e-06,
10165
+ "loss": 10.9982,
10166
+ "step": 14350
10167
+ },
10168
+ {
10169
+ "epoch": 0.6372981399193391,
10170
+ "grad_norm": 68.05677795410156,
10171
+ "learning_rate": 9.975105558672177e-06,
10172
+ "loss": 10.6098,
10173
+ "step": 14360
10174
+ },
10175
+ {
10176
+ "epoch": 0.6377419408524306,
10177
+ "grad_norm": 78.377197265625,
10178
+ "learning_rate": 9.97508822271025e-06,
10179
+ "loss": 10.4365,
10180
+ "step": 14370
10181
+ },
10182
+ {
10183
+ "epoch": 0.6381857417855221,
10184
+ "grad_norm": 70.45510864257812,
10185
+ "learning_rate": 9.975070886748323e-06,
10186
+ "loss": 10.625,
10187
+ "step": 14380
10188
+ },
10189
+ {
10190
+ "epoch": 0.6386295427186136,
10191
+ "grad_norm": 73.3565444946289,
10192
+ "learning_rate": 9.975053550786395e-06,
10193
+ "loss": 11.1326,
10194
+ "step": 14390
10195
+ },
10196
+ {
10197
+ "epoch": 0.639073343651705,
10198
+ "grad_norm": 62.82444763183594,
10199
+ "learning_rate": 9.975036214824468e-06,
10200
+ "loss": 10.5034,
10201
+ "step": 14400
10202
+ },
10203
+ {
10204
+ "epoch": 0.6395171445847965,
10205
+ "grad_norm": 69.73179626464844,
10206
+ "learning_rate": 9.97501887886254e-06,
10207
+ "loss": 11.1584,
10208
+ "step": 14410
10209
+ },
10210
+ {
10211
+ "epoch": 0.639960945517888,
10212
+ "grad_norm": 71.01941680908203,
10213
+ "learning_rate": 9.975001542900612e-06,
10214
+ "loss": 10.6757,
10215
+ "step": 14420
10216
+ },
10217
+ {
10218
+ "epoch": 0.6404047464509794,
10219
+ "grad_norm": 66.62494659423828,
10220
+ "learning_rate": 9.974984206938685e-06,
10221
+ "loss": 10.9316,
10222
+ "step": 14430
10223
+ },
10224
+ {
10225
+ "epoch": 0.6408485473840708,
10226
+ "grad_norm": 70.99148559570312,
10227
+ "learning_rate": 9.974966870976758e-06,
10228
+ "loss": 10.5745,
10229
+ "step": 14440
10230
+ },
10231
+ {
10232
+ "epoch": 0.6412923483171623,
10233
+ "grad_norm": 68.47904968261719,
10234
+ "learning_rate": 9.97494953501483e-06,
10235
+ "loss": 10.7873,
10236
+ "step": 14450
10237
+ },
10238
+ {
10239
+ "epoch": 0.6417361492502538,
10240
+ "grad_norm": 78.76315307617188,
10241
+ "learning_rate": 9.974932199052903e-06,
10242
+ "loss": 10.8907,
10243
+ "step": 14460
10244
+ },
10245
+ {
10246
+ "epoch": 0.6421799501833453,
10247
+ "grad_norm": 69.0668716430664,
10248
+ "learning_rate": 9.974914863090976e-06,
10249
+ "loss": 10.9457,
10250
+ "step": 14470
10251
+ },
10252
+ {
10253
+ "epoch": 0.6426237511164368,
10254
+ "grad_norm": 70.29750061035156,
10255
+ "learning_rate": 9.974897527129047e-06,
10256
+ "loss": 10.9257,
10257
+ "step": 14480
10258
+ },
10259
+ {
10260
+ "epoch": 0.6430675520495281,
10261
+ "grad_norm": 66.38678741455078,
10262
+ "learning_rate": 9.97488019116712e-06,
10263
+ "loss": 10.2022,
10264
+ "step": 14490
10265
+ },
10266
+ {
10267
+ "epoch": 0.6435113529826196,
10268
+ "grad_norm": 66.664306640625,
10269
+ "learning_rate": 9.974862855205193e-06,
10270
+ "loss": 10.2297,
10271
+ "step": 14500
10272
+ },
10273
+ {
10274
+ "epoch": 0.6439551539157111,
10275
+ "grad_norm": 75.4664306640625,
10276
+ "learning_rate": 9.974845519243266e-06,
10277
+ "loss": 10.8164,
10278
+ "step": 14510
10279
+ },
10280
+ {
10281
+ "epoch": 0.6443989548488026,
10282
+ "grad_norm": 68.78631591796875,
10283
+ "learning_rate": 9.974828183281338e-06,
10284
+ "loss": 10.6404,
10285
+ "step": 14520
10286
+ },
10287
+ {
10288
+ "epoch": 0.644842755781894,
10289
+ "grad_norm": 79.80715942382812,
10290
+ "learning_rate": 9.974810847319411e-06,
10291
+ "loss": 10.6104,
10292
+ "step": 14530
10293
+ },
10294
+ {
10295
+ "epoch": 0.6452865567149855,
10296
+ "grad_norm": 66.43132019042969,
10297
+ "learning_rate": 9.974793511357484e-06,
10298
+ "loss": 10.3112,
10299
+ "step": 14540
10300
+ },
10301
+ {
10302
+ "epoch": 0.645730357648077,
10303
+ "grad_norm": 67.95149993896484,
10304
+ "learning_rate": 9.974776175395555e-06,
10305
+ "loss": 10.6031,
10306
+ "step": 14550
10307
+ },
10308
+ {
10309
+ "epoch": 0.6461741585811684,
10310
+ "grad_norm": 79.79896545410156,
10311
+ "learning_rate": 9.974758839433628e-06,
10312
+ "loss": 10.501,
10313
+ "step": 14560
10314
+ },
10315
+ {
10316
+ "epoch": 0.6466179595142599,
10317
+ "grad_norm": 65.96994018554688,
10318
+ "learning_rate": 9.974741503471701e-06,
10319
+ "loss": 10.2683,
10320
+ "step": 14570
10321
+ },
10322
+ {
10323
+ "epoch": 0.6470617604473513,
10324
+ "grad_norm": 76.4495849609375,
10325
+ "learning_rate": 9.974724167509773e-06,
10326
+ "loss": 10.6038,
10327
+ "step": 14580
10328
+ },
10329
+ {
10330
+ "epoch": 0.6475055613804428,
10331
+ "grad_norm": 69.33879089355469,
10332
+ "learning_rate": 9.974706831547846e-06,
10333
+ "loss": 10.2104,
10334
+ "step": 14590
10335
+ },
10336
+ {
10337
+ "epoch": 0.6479493623135343,
10338
+ "grad_norm": 68.40853881835938,
10339
+ "learning_rate": 9.974689495585919e-06,
10340
+ "loss": 10.7651,
10341
+ "step": 14600
10342
+ },
10343
+ {
10344
+ "epoch": 0.6483931632466258,
10345
+ "grad_norm": 71.53189849853516,
10346
+ "learning_rate": 9.97467215962399e-06,
10347
+ "loss": 10.6313,
10348
+ "step": 14610
10349
+ },
10350
+ {
10351
+ "epoch": 0.6488369641797171,
10352
+ "grad_norm": 73.11844635009766,
10353
+ "learning_rate": 9.974654823662064e-06,
10354
+ "loss": 11.0404,
10355
+ "step": 14620
10356
+ },
10357
+ {
10358
+ "epoch": 0.6492807651128086,
10359
+ "grad_norm": 72.54490661621094,
10360
+ "learning_rate": 9.974637487700137e-06,
10361
+ "loss": 10.0347,
10362
+ "step": 14630
10363
+ },
10364
+ {
10365
+ "epoch": 0.6497245660459001,
10366
+ "grad_norm": 77.51940155029297,
10367
+ "learning_rate": 9.974620151738208e-06,
10368
+ "loss": 10.875,
10369
+ "step": 14640
10370
+ },
10371
+ {
10372
+ "epoch": 0.6501683669789916,
10373
+ "grad_norm": 75.68498229980469,
10374
+ "learning_rate": 9.974602815776281e-06,
10375
+ "loss": 10.7511,
10376
+ "step": 14650
10377
+ },
10378
+ {
10379
+ "epoch": 0.650612167912083,
10380
+ "grad_norm": 85.01661682128906,
10381
+ "learning_rate": 9.974585479814354e-06,
10382
+ "loss": 11.0653,
10383
+ "step": 14660
10384
+ },
10385
+ {
10386
+ "epoch": 0.6510559688451745,
10387
+ "grad_norm": 71.71186065673828,
10388
+ "learning_rate": 9.974568143852426e-06,
10389
+ "loss": 10.1264,
10390
+ "step": 14670
10391
+ },
10392
+ {
10393
+ "epoch": 0.651499769778266,
10394
+ "grad_norm": 85.85321807861328,
10395
+ "learning_rate": 9.974550807890499e-06,
10396
+ "loss": 11.0676,
10397
+ "step": 14680
10398
+ },
10399
+ {
10400
+ "epoch": 0.6519435707113574,
10401
+ "grad_norm": 73.28590393066406,
10402
+ "learning_rate": 9.974533471928572e-06,
10403
+ "loss": 10.6028,
10404
+ "step": 14690
10405
+ },
10406
+ {
10407
+ "epoch": 0.6523873716444489,
10408
+ "grad_norm": 66.07901000976562,
10409
+ "learning_rate": 9.974516135966643e-06,
10410
+ "loss": 10.6085,
10411
+ "step": 14700
10412
+ },
10413
+ {
10414
+ "epoch": 0.6528311725775403,
10415
+ "grad_norm": 79.432861328125,
10416
+ "learning_rate": 9.974498800004716e-06,
10417
+ "loss": 10.8179,
10418
+ "step": 14710
10419
+ },
10420
+ {
10421
+ "epoch": 0.6532749735106318,
10422
+ "grad_norm": 64.7619857788086,
10423
+ "learning_rate": 9.97448146404279e-06,
10424
+ "loss": 10.6211,
10425
+ "step": 14720
10426
+ },
10427
+ {
10428
+ "epoch": 0.6537187744437233,
10429
+ "grad_norm": 79.81686401367188,
10430
+ "learning_rate": 9.974464128080862e-06,
10431
+ "loss": 10.3651,
10432
+ "step": 14730
10433
+ },
10434
+ {
10435
+ "epoch": 0.6541625753768148,
10436
+ "grad_norm": 80.43688201904297,
10437
+ "learning_rate": 9.974446792118934e-06,
10438
+ "loss": 10.7854,
10439
+ "step": 14740
10440
+ },
10441
+ {
10442
+ "epoch": 0.6546063763099061,
10443
+ "grad_norm": 78.90631866455078,
10444
+ "learning_rate": 9.974429456157007e-06,
10445
+ "loss": 10.7801,
10446
+ "step": 14750
10447
+ },
10448
+ {
10449
+ "epoch": 0.6550501772429976,
10450
+ "grad_norm": 82.28389739990234,
10451
+ "learning_rate": 9.97441212019508e-06,
10452
+ "loss": 10.6682,
10453
+ "step": 14760
10454
+ },
10455
+ {
10456
+ "epoch": 0.6554939781760891,
10457
+ "grad_norm": 71.01078033447266,
10458
+ "learning_rate": 9.974394784233151e-06,
10459
+ "loss": 11.0055,
10460
+ "step": 14770
10461
+ },
10462
+ {
10463
+ "epoch": 0.6559377791091806,
10464
+ "grad_norm": 74.68998718261719,
10465
+ "learning_rate": 9.974377448271224e-06,
10466
+ "loss": 10.3595,
10467
+ "step": 14780
10468
+ },
10469
+ {
10470
+ "epoch": 0.656381580042272,
10471
+ "grad_norm": 86.77886199951172,
10472
+ "learning_rate": 9.974360112309297e-06,
10473
+ "loss": 10.404,
10474
+ "step": 14790
10475
+ },
10476
+ {
10477
+ "epoch": 0.6568253809753635,
10478
+ "grad_norm": 72.12997436523438,
10479
+ "learning_rate": 9.974342776347369e-06,
10480
+ "loss": 10.5215,
10481
+ "step": 14800
10482
+ },
10483
+ {
10484
+ "epoch": 0.657269181908455,
10485
+ "grad_norm": 82.83440399169922,
10486
+ "learning_rate": 9.974325440385442e-06,
10487
+ "loss": 11.087,
10488
+ "step": 14810
10489
+ },
10490
+ {
10491
+ "epoch": 0.6577129828415464,
10492
+ "grad_norm": 62.340938568115234,
10493
+ "learning_rate": 9.974308104423515e-06,
10494
+ "loss": 10.3713,
10495
+ "step": 14820
10496
+ },
10497
+ {
10498
+ "epoch": 0.6581567837746379,
10499
+ "grad_norm": 65.31613159179688,
10500
+ "learning_rate": 9.974290768461586e-06,
10501
+ "loss": 10.6538,
10502
+ "step": 14830
10503
+ },
10504
+ {
10505
+ "epoch": 0.6586005847077293,
10506
+ "grad_norm": 69.7147445678711,
10507
+ "learning_rate": 9.97427343249966e-06,
10508
+ "loss": 10.8514,
10509
+ "step": 14840
10510
+ },
10511
+ {
10512
+ "epoch": 0.6590443856408208,
10513
+ "grad_norm": 63.39995574951172,
10514
+ "learning_rate": 9.974256096537732e-06,
10515
+ "loss": 10.4676,
10516
+ "step": 14850
10517
+ },
10518
+ {
10519
+ "epoch": 0.6594881865739123,
10520
+ "grad_norm": 74.32138061523438,
10521
+ "learning_rate": 9.974238760575804e-06,
10522
+ "loss": 10.4392,
10523
+ "step": 14860
10524
+ },
10525
+ {
10526
+ "epoch": 0.6599319875070038,
10527
+ "grad_norm": 86.13739776611328,
10528
+ "learning_rate": 9.974221424613877e-06,
10529
+ "loss": 10.5601,
10530
+ "step": 14870
10531
+ },
10532
+ {
10533
+ "epoch": 0.6603757884400951,
10534
+ "grad_norm": 71.16339874267578,
10535
+ "learning_rate": 9.97420408865195e-06,
10536
+ "loss": 11.0757,
10537
+ "step": 14880
10538
+ },
10539
+ {
10540
+ "epoch": 0.6608195893731866,
10541
+ "grad_norm": 85.1015625,
10542
+ "learning_rate": 9.974186752690021e-06,
10543
+ "loss": 11.1856,
10544
+ "step": 14890
10545
+ },
10546
+ {
10547
+ "epoch": 0.6612633903062781,
10548
+ "grad_norm": 64.8528823852539,
10549
+ "learning_rate": 9.974169416728094e-06,
10550
+ "loss": 11.0716,
10551
+ "step": 14900
10552
+ },
10553
+ {
10554
+ "epoch": 0.6617071912393696,
10555
+ "grad_norm": 81.894775390625,
10556
+ "learning_rate": 9.974152080766168e-06,
10557
+ "loss": 10.3157,
10558
+ "step": 14910
10559
+ },
10560
+ {
10561
+ "epoch": 0.6621509921724611,
10562
+ "grad_norm": 77.81661224365234,
10563
+ "learning_rate": 9.974134744804239e-06,
10564
+ "loss": 10.4957,
10565
+ "step": 14920
10566
+ },
10567
+ {
10568
+ "epoch": 0.6625947931055525,
10569
+ "grad_norm": 73.48043823242188,
10570
+ "learning_rate": 9.974117408842312e-06,
10571
+ "loss": 10.9361,
10572
+ "step": 14930
10573
+ },
10574
+ {
10575
+ "epoch": 0.663038594038644,
10576
+ "grad_norm": 67.01435089111328,
10577
+ "learning_rate": 9.974100072880385e-06,
10578
+ "loss": 10.4762,
10579
+ "step": 14940
10580
+ },
10581
+ {
10582
+ "epoch": 0.6634823949717354,
10583
+ "grad_norm": 76.67558288574219,
10584
+ "learning_rate": 9.974082736918458e-06,
10585
+ "loss": 10.5872,
10586
+ "step": 14950
10587
+ },
10588
+ {
10589
+ "epoch": 0.6639261959048269,
10590
+ "grad_norm": 69.8974838256836,
10591
+ "learning_rate": 9.97406540095653e-06,
10592
+ "loss": 10.6485,
10593
+ "step": 14960
10594
+ },
10595
+ {
10596
+ "epoch": 0.6643699968379183,
10597
+ "grad_norm": 77.30745697021484,
10598
+ "learning_rate": 9.974048064994603e-06,
10599
+ "loss": 10.601,
10600
+ "step": 14970
10601
+ },
10602
+ {
10603
+ "epoch": 0.6648137977710098,
10604
+ "grad_norm": 60.251041412353516,
10605
+ "learning_rate": 9.974030729032676e-06,
10606
+ "loss": 10.1426,
10607
+ "step": 14980
10608
+ },
10609
+ {
10610
+ "epoch": 0.6652575987041013,
10611
+ "grad_norm": 80.3484878540039,
10612
+ "learning_rate": 9.974013393070747e-06,
10613
+ "loss": 10.6465,
10614
+ "step": 14990
10615
+ },
10616
+ {
10617
+ "epoch": 0.6657013996371928,
10618
+ "grad_norm": 67.18524932861328,
10619
+ "learning_rate": 9.97399605710882e-06,
10620
+ "loss": 10.8912,
10621
+ "step": 15000
10622
+ },
10623
+ {
10624
+ "epoch": 0.6657013996371928,
10625
+ "eval_loss": 0.33302244544029236,
10626
+ "eval_runtime": 671.9227,
10627
+ "eval_samples_per_second": 1807.337,
10628
+ "eval_steps_per_second": 56.48,
10629
+ "step": 15000
10630
  }
10631
  ],
10632
  "logging_steps": 10,
 
10646
  "attributes": {}
10647
  }
10648
  },
10649
+ "total_flos": 5.234584807538688e+18,
10650
  "train_batch_size": 4,
10651
  "trial_name": null,
10652
  "trial_params": null