CocoRoF commited on
Commit
6715abb
·
verified ·
1 Parent(s): 1477fb2

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55d347bc1642ba36f81bf1c53bbf532d55be3ebef8d9dafa099a5b1a7a3135f3
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2696563487c81338ee1c316d03a16b8cdda3a230f8c30ff43050548155251701
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d92863646892c67d414151d51e68dd618cd29310d9197b20ad3650d50f4c83e
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c48c8f079566e8995d74a2dde5c2bad04c58bdad806c74f723414147f3b56d46
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04cb5208648fd09a2e0403d51973f74ffbfd93cbd5da59e1e99c8df03769a86c
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e735ed11597ed40a2b6854e0229902e1a21fedc0a0dbc608ca905fae57d5b06b
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7034685b36b93a4dd3a50697b0b1c314b249b2189ec2cb96b757312b1514a579
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ba3815fc0953b1b7f08cea092dfc0a62c4bbc2a2c68780d3f4dd0b5e22582a7
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e851fe1c1de0057f4eecefed6a131fa9021334eb43f6e7e65fdb270a25ac864
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:647ac15563fcad903adbb616e9b2c36b237a3ed5939d088620212da969930f6c
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:978379030048e432baa510ec4fc9514faa08fe564ab964b3a4d05e8f60306495
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93e3733c5b180986b7efbec17b663bf5231343d187374d184768fcd913797167
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdbc75d90af112615b53d15931e8157a80e37bcd110aac9a3089f5f6f5344171
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9820ea4fec1b01f3da091290c3e8b5ddb86a3a3fa17285c248b64910c2d0b4f0
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c8a310f6ca2ca89570eb2cc68544656b30224f00b2d6d96eeda6e0cb8be50ab
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7413035def085e41776a629afc94fc24fe5a955f1ad83b32f9b370ab60f9a18d
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c5b8110fcf6e044b6860c6305be969cfe03129549b92dc6fc2394448e9265d6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91e3953bcbf4089415abffbd914fbbe4580121f6c843eabbf70624c5ed144814
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f936acaf5a2d5fe8c38d945450417facbf1577584c216908a396d3cc20bec88
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:369fde7bff4dfc0d6b9cf773cf9b0352696083f84763999e05a631ee6d52c5e3
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9043c499ba403bc8a9781e7fc0340c346cc44c1643553465dd6859e68c52b3a9
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97c307475b202e1cf87916aaf07584ea570dab26688da354222b9b509afaa3e1
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8638375985314761,
5
  "eval_steps": 500,
6
- "global_step": 3500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4963,6 +4963,714 @@
4963
  "eval_samples_per_second": 1118.746,
4964
  "eval_steps_per_second": 34.961,
4965
  "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4966
  }
4967
  ],
4968
  "logging_steps": 5,
@@ -4982,7 +5690,7 @@
4982
  "attributes": {}
4983
  }
4984
  },
4985
- "total_flos": 1.5163252974760755e+19,
4986
  "train_batch_size": 4,
4987
  "trial_name": null,
4988
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9872429697502584,
5
  "eval_steps": 500,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4963
  "eval_samples_per_second": 1118.746,
4964
  "eval_steps_per_second": 34.961,
4965
  "step": 3500
4966
+ },
4967
+ {
4968
+ "epoch": 0.8650716522436639,
4969
+ "grad_norm": 105.625,
4970
+ "learning_rate": 2.995884773662551e-07,
4971
+ "loss": 68.0234,
4972
+ "step": 3505
4973
+ },
4974
+ {
4975
+ "epoch": 0.8663057059558518,
4976
+ "grad_norm": 105.4375,
4977
+ "learning_rate": 2.968449931412894e-07,
4978
+ "loss": 67.563,
4979
+ "step": 3510
4980
+ },
4981
+ {
4982
+ "epoch": 0.8675397596680395,
4983
+ "grad_norm": 102.125,
4984
+ "learning_rate": 2.941015089163237e-07,
4985
+ "loss": 68.1967,
4986
+ "step": 3515
4987
+ },
4988
+ {
4989
+ "epoch": 0.8687738133802274,
4990
+ "grad_norm": 105.0,
4991
+ "learning_rate": 2.91358024691358e-07,
4992
+ "loss": 68.515,
4993
+ "step": 3520
4994
+ },
4995
+ {
4996
+ "epoch": 0.8700078670924152,
4997
+ "grad_norm": 103.3125,
4998
+ "learning_rate": 2.886145404663923e-07,
4999
+ "loss": 68.8215,
5000
+ "step": 3525
5001
+ },
5002
+ {
5003
+ "epoch": 0.871241920804603,
5004
+ "grad_norm": 106.8125,
5005
+ "learning_rate": 2.858710562414266e-07,
5006
+ "loss": 67.0145,
5007
+ "step": 3530
5008
+ },
5009
+ {
5010
+ "epoch": 0.8724759745167908,
5011
+ "grad_norm": 107.375,
5012
+ "learning_rate": 2.8312757201646087e-07,
5013
+ "loss": 68.5533,
5014
+ "step": 3535
5015
+ },
5016
+ {
5017
+ "epoch": 0.8737100282289787,
5018
+ "grad_norm": 104.3125,
5019
+ "learning_rate": 2.803840877914952e-07,
5020
+ "loss": 66.9931,
5021
+ "step": 3540
5022
+ },
5023
+ {
5024
+ "epoch": 0.8749440819411665,
5025
+ "grad_norm": 106.5,
5026
+ "learning_rate": 2.7764060356652947e-07,
5027
+ "loss": 68.8769,
5028
+ "step": 3545
5029
+ },
5030
+ {
5031
+ "epoch": 0.8761781356533543,
5032
+ "grad_norm": 106.3125,
5033
+ "learning_rate": 2.748971193415638e-07,
5034
+ "loss": 67.0216,
5035
+ "step": 3550
5036
+ },
5037
+ {
5038
+ "epoch": 0.8774121893655421,
5039
+ "grad_norm": 102.5,
5040
+ "learning_rate": 2.7215363511659807e-07,
5041
+ "loss": 69.8652,
5042
+ "step": 3555
5043
+ },
5044
+ {
5045
+ "epoch": 0.87864624307773,
5046
+ "grad_norm": 104.25,
5047
+ "learning_rate": 2.694101508916324e-07,
5048
+ "loss": 68.6337,
5049
+ "step": 3560
5050
+ },
5051
+ {
5052
+ "epoch": 0.8798802967899177,
5053
+ "grad_norm": 102.625,
5054
+ "learning_rate": 2.6666666666666667e-07,
5055
+ "loss": 66.5836,
5056
+ "step": 3565
5057
+ },
5058
+ {
5059
+ "epoch": 0.8811143505021056,
5060
+ "grad_norm": 102.875,
5061
+ "learning_rate": 2.63923182441701e-07,
5062
+ "loss": 67.1445,
5063
+ "step": 3570
5064
+ },
5065
+ {
5066
+ "epoch": 0.8823484042142934,
5067
+ "grad_norm": 106.375,
5068
+ "learning_rate": 2.611796982167352e-07,
5069
+ "loss": 68.3128,
5070
+ "step": 3575
5071
+ },
5072
+ {
5073
+ "epoch": 0.8835824579264813,
5074
+ "grad_norm": 106.5,
5075
+ "learning_rate": 2.584362139917695e-07,
5076
+ "loss": 67.5264,
5077
+ "step": 3580
5078
+ },
5079
+ {
5080
+ "epoch": 0.884816511638669,
5081
+ "grad_norm": 105.25,
5082
+ "learning_rate": 2.556927297668038e-07,
5083
+ "loss": 68.2184,
5084
+ "step": 3585
5085
+ },
5086
+ {
5087
+ "epoch": 0.8860505653508569,
5088
+ "grad_norm": 107.0,
5089
+ "learning_rate": 2.529492455418381e-07,
5090
+ "loss": 68.3838,
5091
+ "step": 3590
5092
+ },
5093
+ {
5094
+ "epoch": 0.8872846190630447,
5095
+ "grad_norm": 101.25,
5096
+ "learning_rate": 2.502057613168724e-07,
5097
+ "loss": 67.683,
5098
+ "step": 3595
5099
+ },
5100
+ {
5101
+ "epoch": 0.8885186727752326,
5102
+ "grad_norm": 104.125,
5103
+ "learning_rate": 2.474622770919067e-07,
5104
+ "loss": 69.2804,
5105
+ "step": 3600
5106
+ },
5107
+ {
5108
+ "epoch": 0.8897527264874203,
5109
+ "grad_norm": 106.0625,
5110
+ "learning_rate": 2.44718792866941e-07,
5111
+ "loss": 68.8471,
5112
+ "step": 3605
5113
+ },
5114
+ {
5115
+ "epoch": 0.8909867801996082,
5116
+ "grad_norm": 103.125,
5117
+ "learning_rate": 2.419753086419753e-07,
5118
+ "loss": 68.7409,
5119
+ "step": 3610
5120
+ },
5121
+ {
5122
+ "epoch": 0.892220833911796,
5123
+ "grad_norm": 106.4375,
5124
+ "learning_rate": 2.3923182441700957e-07,
5125
+ "loss": 67.3829,
5126
+ "step": 3615
5127
+ },
5128
+ {
5129
+ "epoch": 0.8934548876239838,
5130
+ "grad_norm": 104.5,
5131
+ "learning_rate": 2.364883401920439e-07,
5132
+ "loss": 68.2949,
5133
+ "step": 3620
5134
+ },
5135
+ {
5136
+ "epoch": 0.8946889413361716,
5137
+ "grad_norm": 103.4375,
5138
+ "learning_rate": 2.337448559670782e-07,
5139
+ "loss": 68.3005,
5140
+ "step": 3625
5141
+ },
5142
+ {
5143
+ "epoch": 0.8959229950483595,
5144
+ "grad_norm": 104.0,
5145
+ "learning_rate": 2.3100137174211247e-07,
5146
+ "loss": 65.948,
5147
+ "step": 3630
5148
+ },
5149
+ {
5150
+ "epoch": 0.8971570487605472,
5151
+ "grad_norm": 104.0,
5152
+ "learning_rate": 2.2825788751714677e-07,
5153
+ "loss": 67.4345,
5154
+ "step": 3635
5155
+ },
5156
+ {
5157
+ "epoch": 0.8983911024727351,
5158
+ "grad_norm": 106.125,
5159
+ "learning_rate": 2.2551440329218105e-07,
5160
+ "loss": 68.0186,
5161
+ "step": 3640
5162
+ },
5163
+ {
5164
+ "epoch": 0.899625156184923,
5165
+ "grad_norm": 106.6875,
5166
+ "learning_rate": 2.2277091906721535e-07,
5167
+ "loss": 69.481,
5168
+ "step": 3645
5169
+ },
5170
+ {
5171
+ "epoch": 0.9008592098971108,
5172
+ "grad_norm": 103.875,
5173
+ "learning_rate": 2.2002743484224965e-07,
5174
+ "loss": 68.3314,
5175
+ "step": 3650
5176
+ },
5177
+ {
5178
+ "epoch": 0.9020932636092986,
5179
+ "grad_norm": 103.3125,
5180
+ "learning_rate": 2.1728395061728395e-07,
5181
+ "loss": 66.6179,
5182
+ "step": 3655
5183
+ },
5184
+ {
5185
+ "epoch": 0.9033273173214864,
5186
+ "grad_norm": 102.25,
5187
+ "learning_rate": 2.1454046639231825e-07,
5188
+ "loss": 68.0456,
5189
+ "step": 3660
5190
+ },
5191
+ {
5192
+ "epoch": 0.9045613710336743,
5193
+ "grad_norm": 110.25,
5194
+ "learning_rate": 2.1179698216735252e-07,
5195
+ "loss": 68.8177,
5196
+ "step": 3665
5197
+ },
5198
+ {
5199
+ "epoch": 0.905795424745862,
5200
+ "grad_norm": 104.6875,
5201
+ "learning_rate": 2.0905349794238682e-07,
5202
+ "loss": 67.566,
5203
+ "step": 3670
5204
+ },
5205
+ {
5206
+ "epoch": 0.9070294784580499,
5207
+ "grad_norm": 103.875,
5208
+ "learning_rate": 2.0631001371742112e-07,
5209
+ "loss": 66.4167,
5210
+ "step": 3675
5211
+ },
5212
+ {
5213
+ "epoch": 0.9082635321702377,
5214
+ "grad_norm": 104.8125,
5215
+ "learning_rate": 2.035665294924554e-07,
5216
+ "loss": 68.3844,
5217
+ "step": 3680
5218
+ },
5219
+ {
5220
+ "epoch": 0.9094975858824256,
5221
+ "grad_norm": 104.0625,
5222
+ "learning_rate": 2.008230452674897e-07,
5223
+ "loss": 67.0489,
5224
+ "step": 3685
5225
+ },
5226
+ {
5227
+ "epoch": 0.9107316395946133,
5228
+ "grad_norm": 107.3125,
5229
+ "learning_rate": 1.98079561042524e-07,
5230
+ "loss": 67.8705,
5231
+ "step": 3690
5232
+ },
5233
+ {
5234
+ "epoch": 0.9119656933068012,
5235
+ "grad_norm": 103.375,
5236
+ "learning_rate": 1.953360768175583e-07,
5237
+ "loss": 67.953,
5238
+ "step": 3695
5239
+ },
5240
+ {
5241
+ "epoch": 0.913199747018989,
5242
+ "grad_norm": 105.25,
5243
+ "learning_rate": 1.9259259259259257e-07,
5244
+ "loss": 67.5511,
5245
+ "step": 3700
5246
+ },
5247
+ {
5248
+ "epoch": 0.9144338007311769,
5249
+ "grad_norm": 103.125,
5250
+ "learning_rate": 1.8984910836762687e-07,
5251
+ "loss": 67.6629,
5252
+ "step": 3705
5253
+ },
5254
+ {
5255
+ "epoch": 0.9156678544433646,
5256
+ "grad_norm": 107.0625,
5257
+ "learning_rate": 1.8710562414266117e-07,
5258
+ "loss": 68.8038,
5259
+ "step": 3710
5260
+ },
5261
+ {
5262
+ "epoch": 0.9169019081555525,
5263
+ "grad_norm": 103.6875,
5264
+ "learning_rate": 1.8436213991769547e-07,
5265
+ "loss": 66.1586,
5266
+ "step": 3715
5267
+ },
5268
+ {
5269
+ "epoch": 0.9181359618677403,
5270
+ "grad_norm": 104.8125,
5271
+ "learning_rate": 1.8161865569272977e-07,
5272
+ "loss": 67.8019,
5273
+ "step": 3720
5274
+ },
5275
+ {
5276
+ "epoch": 0.9193700155799281,
5277
+ "grad_norm": 101.6875,
5278
+ "learning_rate": 1.7887517146776405e-07,
5279
+ "loss": 67.1005,
5280
+ "step": 3725
5281
+ },
5282
+ {
5283
+ "epoch": 0.9206040692921159,
5284
+ "grad_norm": 107.9375,
5285
+ "learning_rate": 1.7613168724279835e-07,
5286
+ "loss": 67.2893,
5287
+ "step": 3730
5288
+ },
5289
+ {
5290
+ "epoch": 0.9218381230043038,
5291
+ "grad_norm": 105.0625,
5292
+ "learning_rate": 1.7338820301783262e-07,
5293
+ "loss": 65.8601,
5294
+ "step": 3735
5295
+ },
5296
+ {
5297
+ "epoch": 0.9230721767164916,
5298
+ "grad_norm": 105.8125,
5299
+ "learning_rate": 1.7064471879286692e-07,
5300
+ "loss": 66.3078,
5301
+ "step": 3740
5302
+ },
5303
+ {
5304
+ "epoch": 0.9243062304286794,
5305
+ "grad_norm": 106.1875,
5306
+ "learning_rate": 1.6790123456790122e-07,
5307
+ "loss": 66.7744,
5308
+ "step": 3745
5309
+ },
5310
+ {
5311
+ "epoch": 0.9255402841408672,
5312
+ "grad_norm": 104.625,
5313
+ "learning_rate": 1.6515775034293552e-07,
5314
+ "loss": 68.7563,
5315
+ "step": 3750
5316
+ },
5317
+ {
5318
+ "epoch": 0.9267743378530551,
5319
+ "grad_norm": 104.625,
5320
+ "learning_rate": 1.6241426611796983e-07,
5321
+ "loss": 67.7685,
5322
+ "step": 3755
5323
+ },
5324
+ {
5325
+ "epoch": 0.9280083915652428,
5326
+ "grad_norm": 105.3125,
5327
+ "learning_rate": 1.5967078189300413e-07,
5328
+ "loss": 68.9463,
5329
+ "step": 3760
5330
+ },
5331
+ {
5332
+ "epoch": 0.9292424452774307,
5333
+ "grad_norm": 99.75,
5334
+ "learning_rate": 1.569272976680384e-07,
5335
+ "loss": 68.4013,
5336
+ "step": 3765
5337
+ },
5338
+ {
5339
+ "epoch": 0.9304764989896185,
5340
+ "grad_norm": 102.875,
5341
+ "learning_rate": 1.5418381344307267e-07,
5342
+ "loss": 67.2276,
5343
+ "step": 3770
5344
+ },
5345
+ {
5346
+ "epoch": 0.9317105527018064,
5347
+ "grad_norm": 105.4375,
5348
+ "learning_rate": 1.5144032921810697e-07,
5349
+ "loss": 67.9008,
5350
+ "step": 3775
5351
+ },
5352
+ {
5353
+ "epoch": 0.9329446064139941,
5354
+ "grad_norm": 106.3125,
5355
+ "learning_rate": 1.4869684499314127e-07,
5356
+ "loss": 68.7067,
5357
+ "step": 3780
5358
+ },
5359
+ {
5360
+ "epoch": 0.934178660126182,
5361
+ "grad_norm": 103.625,
5362
+ "learning_rate": 1.4595336076817558e-07,
5363
+ "loss": 66.8319,
5364
+ "step": 3785
5365
+ },
5366
+ {
5367
+ "epoch": 0.9354127138383698,
5368
+ "grad_norm": 106.0,
5369
+ "learning_rate": 1.4320987654320988e-07,
5370
+ "loss": 66.7882,
5371
+ "step": 3790
5372
+ },
5373
+ {
5374
+ "epoch": 0.9366467675505576,
5375
+ "grad_norm": 109.5625,
5376
+ "learning_rate": 1.4046639231824418e-07,
5377
+ "loss": 67.7522,
5378
+ "step": 3795
5379
+ },
5380
+ {
5381
+ "epoch": 0.9378808212627454,
5382
+ "grad_norm": 103.8125,
5383
+ "learning_rate": 1.3772290809327848e-07,
5384
+ "loss": 67.2511,
5385
+ "step": 3800
5386
+ },
5387
+ {
5388
+ "epoch": 0.9391148749749333,
5389
+ "grad_norm": 104.6875,
5390
+ "learning_rate": 1.3497942386831278e-07,
5391
+ "loss": 67.226,
5392
+ "step": 3805
5393
+ },
5394
+ {
5395
+ "epoch": 0.940348928687121,
5396
+ "grad_norm": 101.8125,
5397
+ "learning_rate": 1.3223593964334702e-07,
5398
+ "loss": 66.5479,
5399
+ "step": 3810
5400
+ },
5401
+ {
5402
+ "epoch": 0.9415829823993089,
5403
+ "grad_norm": 104.3125,
5404
+ "learning_rate": 1.2949245541838133e-07,
5405
+ "loss": 67.8139,
5406
+ "step": 3815
5407
+ },
5408
+ {
5409
+ "epoch": 0.9428170361114967,
5410
+ "grad_norm": 104.375,
5411
+ "learning_rate": 1.2674897119341563e-07,
5412
+ "loss": 67.2027,
5413
+ "step": 3820
5414
+ },
5415
+ {
5416
+ "epoch": 0.9440510898236846,
5417
+ "grad_norm": 104.8125,
5418
+ "learning_rate": 1.2400548696844993e-07,
5419
+ "loss": 67.2941,
5420
+ "step": 3825
5421
+ },
5422
+ {
5423
+ "epoch": 0.9452851435358725,
5424
+ "grad_norm": 103.9375,
5425
+ "learning_rate": 1.2126200274348423e-07,
5426
+ "loss": 66.4945,
5427
+ "step": 3830
5428
+ },
5429
+ {
5430
+ "epoch": 0.9465191972480602,
5431
+ "grad_norm": 102.375,
5432
+ "learning_rate": 1.1851851851851851e-07,
5433
+ "loss": 66.7565,
5434
+ "step": 3835
5435
+ },
5436
+ {
5437
+ "epoch": 0.9477532509602481,
5438
+ "grad_norm": 104.75,
5439
+ "learning_rate": 1.157750342935528e-07,
5440
+ "loss": 69.663,
5441
+ "step": 3840
5442
+ },
5443
+ {
5444
+ "epoch": 0.9489873046724359,
5445
+ "grad_norm": 102.0625,
5446
+ "learning_rate": 1.130315500685871e-07,
5447
+ "loss": 67.022,
5448
+ "step": 3845
5449
+ },
5450
+ {
5451
+ "epoch": 0.9502213583846237,
5452
+ "grad_norm": 103.5625,
5453
+ "learning_rate": 1.1028806584362139e-07,
5454
+ "loss": 66.7464,
5455
+ "step": 3850
5456
+ },
5457
+ {
5458
+ "epoch": 0.9514554120968115,
5459
+ "grad_norm": 106.125,
5460
+ "learning_rate": 1.0754458161865569e-07,
5461
+ "loss": 67.5819,
5462
+ "step": 3855
5463
+ },
5464
+ {
5465
+ "epoch": 0.9526894658089994,
5466
+ "grad_norm": 107.1875,
5467
+ "learning_rate": 1.0480109739368999e-07,
5468
+ "loss": 68.8449,
5469
+ "step": 3860
5470
+ },
5471
+ {
5472
+ "epoch": 0.9539235195211871,
5473
+ "grad_norm": 104.25,
5474
+ "learning_rate": 1.0205761316872428e-07,
5475
+ "loss": 67.4359,
5476
+ "step": 3865
5477
+ },
5478
+ {
5479
+ "epoch": 0.955157573233375,
5480
+ "grad_norm": 110.0625,
5481
+ "learning_rate": 9.931412894375856e-08,
5482
+ "loss": 66.3128,
5483
+ "step": 3870
5484
+ },
5485
+ {
5486
+ "epoch": 0.9563916269455628,
5487
+ "grad_norm": 106.5625,
5488
+ "learning_rate": 9.657064471879286e-08,
5489
+ "loss": 67.7631,
5490
+ "step": 3875
5491
+ },
5492
+ {
5493
+ "epoch": 0.9576256806577507,
5494
+ "grad_norm": 104.25,
5495
+ "learning_rate": 9.382716049382716e-08,
5496
+ "loss": 67.0168,
5497
+ "step": 3880
5498
+ },
5499
+ {
5500
+ "epoch": 0.9588597343699384,
5501
+ "grad_norm": 99.5,
5502
+ "learning_rate": 9.108367626886144e-08,
5503
+ "loss": 67.202,
5504
+ "step": 3885
5505
+ },
5506
+ {
5507
+ "epoch": 0.9600937880821263,
5508
+ "grad_norm": 103.0625,
5509
+ "learning_rate": 8.834019204389574e-08,
5510
+ "loss": 66.6468,
5511
+ "step": 3890
5512
+ },
5513
+ {
5514
+ "epoch": 0.9613278417943141,
5515
+ "grad_norm": 104.25,
5516
+ "learning_rate": 8.559670781893004e-08,
5517
+ "loss": 67.3951,
5518
+ "step": 3895
5519
+ },
5520
+ {
5521
+ "epoch": 0.962561895506502,
5522
+ "grad_norm": 103.8125,
5523
+ "learning_rate": 8.285322359396434e-08,
5524
+ "loss": 66.0457,
5525
+ "step": 3900
5526
+ },
5527
+ {
5528
+ "epoch": 0.9637959492186897,
5529
+ "grad_norm": 105.125,
5530
+ "learning_rate": 8.010973936899861e-08,
5531
+ "loss": 66.7826,
5532
+ "step": 3905
5533
+ },
5534
+ {
5535
+ "epoch": 0.9650300029308776,
5536
+ "grad_norm": 104.125,
5537
+ "learning_rate": 7.736625514403291e-08,
5538
+ "loss": 67.4089,
5539
+ "step": 3910
5540
+ },
5541
+ {
5542
+ "epoch": 0.9662640566430654,
5543
+ "grad_norm": 105.4375,
5544
+ "learning_rate": 7.462277091906722e-08,
5545
+ "loss": 67.6246,
5546
+ "step": 3915
5547
+ },
5548
+ {
5549
+ "epoch": 0.9674981103552532,
5550
+ "grad_norm": 101.875,
5551
+ "learning_rate": 7.187928669410152e-08,
5552
+ "loss": 67.3768,
5553
+ "step": 3920
5554
+ },
5555
+ {
5556
+ "epoch": 0.968732164067441,
5557
+ "grad_norm": 104.3125,
5558
+ "learning_rate": 6.913580246913579e-08,
5559
+ "loss": 67.9191,
5560
+ "step": 3925
5561
+ },
5562
+ {
5563
+ "epoch": 0.9699662177796289,
5564
+ "grad_norm": 103.4375,
5565
+ "learning_rate": 6.639231824417009e-08,
5566
+ "loss": 66.7172,
5567
+ "step": 3930
5568
+ },
5569
+ {
5570
+ "epoch": 0.9712002714918166,
5571
+ "grad_norm": 104.875,
5572
+ "learning_rate": 6.364883401920439e-08,
5573
+ "loss": 68.1103,
5574
+ "step": 3935
5575
+ },
5576
+ {
5577
+ "epoch": 0.9724343252040045,
5578
+ "grad_norm": 106.1875,
5579
+ "learning_rate": 6.090534979423868e-08,
5580
+ "loss": 66.7532,
5581
+ "step": 3940
5582
+ },
5583
+ {
5584
+ "epoch": 0.9736683789161923,
5585
+ "grad_norm": 101.9375,
5586
+ "learning_rate": 5.816186556927297e-08,
5587
+ "loss": 66.8054,
5588
+ "step": 3945
5589
+ },
5590
+ {
5591
+ "epoch": 0.9749024326283802,
5592
+ "grad_norm": 107.1875,
5593
+ "learning_rate": 5.541838134430727e-08,
5594
+ "loss": 66.1427,
5595
+ "step": 3950
5596
+ },
5597
+ {
5598
+ "epoch": 0.9761364863405679,
5599
+ "grad_norm": 99.8125,
5600
+ "learning_rate": 5.267489711934156e-08,
5601
+ "loss": 66.5402,
5602
+ "step": 3955
5603
+ },
5604
+ {
5605
+ "epoch": 0.9773705400527558,
5606
+ "grad_norm": 104.0625,
5607
+ "learning_rate": 4.993141289437586e-08,
5608
+ "loss": 68.0075,
5609
+ "step": 3960
5610
+ },
5611
+ {
5612
+ "epoch": 0.9786045937649436,
5613
+ "grad_norm": 106.125,
5614
+ "learning_rate": 4.718792866941015e-08,
5615
+ "loss": 67.5935,
5616
+ "step": 3965
5617
+ },
5618
+ {
5619
+ "epoch": 0.9798386474771315,
5620
+ "grad_norm": 104.9375,
5621
+ "learning_rate": 4.444444444444444e-08,
5622
+ "loss": 66.8711,
5623
+ "step": 3970
5624
+ },
5625
+ {
5626
+ "epoch": 0.9810727011893192,
5627
+ "grad_norm": 104.5,
5628
+ "learning_rate": 4.1700960219478735e-08,
5629
+ "loss": 66.3836,
5630
+ "step": 3975
5631
+ },
5632
+ {
5633
+ "epoch": 0.9823067549015071,
5634
+ "grad_norm": 102.0625,
5635
+ "learning_rate": 3.895747599451303e-08,
5636
+ "loss": 65.9148,
5637
+ "step": 3980
5638
+ },
5639
+ {
5640
+ "epoch": 0.9835408086136949,
5641
+ "grad_norm": 105.375,
5642
+ "learning_rate": 3.621399176954732e-08,
5643
+ "loss": 66.4969,
5644
+ "step": 3985
5645
+ },
5646
+ {
5647
+ "epoch": 0.9847748623258827,
5648
+ "grad_norm": 101.1875,
5649
+ "learning_rate": 3.3470507544581616e-08,
5650
+ "loss": 66.926,
5651
+ "step": 3990
5652
+ },
5653
+ {
5654
+ "epoch": 0.9860089160380705,
5655
+ "grad_norm": 102.4375,
5656
+ "learning_rate": 3.072702331961591e-08,
5657
+ "loss": 65.3679,
5658
+ "step": 3995
5659
+ },
5660
+ {
5661
+ "epoch": 0.9872429697502584,
5662
+ "grad_norm": 107.875,
5663
+ "learning_rate": 2.7983539094650204e-08,
5664
+ "loss": 67.4716,
5665
+ "step": 4000
5666
+ },
5667
+ {
5668
+ "epoch": 0.9872429697502584,
5669
+ "eval_loss": 2.095496892929077,
5670
+ "eval_runtime": 197.7366,
5671
+ "eval_samples_per_second": 1104.317,
5672
+ "eval_steps_per_second": 34.511,
5673
+ "step": 4000
5674
  }
5675
  ],
5676
  "logging_steps": 5,
 
5690
  "attributes": {}
5691
  }
5692
  },
5693
+ "total_flos": 1.7329431971155149e+19,
5694
  "train_batch_size": 4,
5695
  "trial_name": null,
5696
  "trial_params": null