CocoRoF commited on
Commit
521d0fd
·
verified ·
1 Parent(s): 25e487c

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a158678a9913c22e822b56e488bc5beae7ec2a0c2aed4dcac0b3f632242ce08
3
  size 791869518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bbb7854b2bafc4b0e3606708ae4d48e79f7dae198813843750b86a3309c3ff2
3
  size 791869518
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9b4d8078da3a562ec7c1abfe025ead553b88b222214f46c5cace69dac85c305
3
  size 2375752250
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f4c6b74ce94790e80aeccdcc0c96350cf3007ec072e27d539b8d75b3737cd30
3
  size 2375752250
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04cb5208648fd09a2e0403d51973f74ffbfd93cbd5da59e1e99c8df03769a86c
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e735ed11597ed40a2b6854e0229902e1a21fedc0a0dbc608ca905fae57d5b06b
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7034685b36b93a4dd3a50697b0b1c314b249b2189ec2cb96b757312b1514a579
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ba3815fc0953b1b7f08cea092dfc0a62c4bbc2a2c68780d3f4dd0b5e22582a7
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e851fe1c1de0057f4eecefed6a131fa9021334eb43f6e7e65fdb270a25ac864
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:647ac15563fcad903adbb616e9b2c36b237a3ed5939d088620212da969930f6c
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:978379030048e432baa510ec4fc9514faa08fe564ab964b3a4d05e8f60306495
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93e3733c5b180986b7efbec17b663bf5231343d187374d184768fcd913797167
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdbc75d90af112615b53d15931e8157a80e37bcd110aac9a3089f5f6f5344171
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9820ea4fec1b01f3da091290c3e8b5ddb86a3a3fa17285c248b64910c2d0b4f0
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c8a310f6ca2ca89570eb2cc68544656b30224f00b2d6d96eeda6e0cb8be50ab
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7413035def085e41776a629afc94fc24fe5a955f1ad83b32f9b370ab60f9a18d
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c5b8110fcf6e044b6860c6305be969cfe03129549b92dc6fc2394448e9265d6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91e3953bcbf4089415abffbd914fbbe4580121f6c843eabbf70624c5ed144814
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f936acaf5a2d5fe8c38d945450417facbf1577584c216908a396d3cc20bec88
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:369fde7bff4dfc0d6b9cf773cf9b0352696083f84763999e05a631ee6d52c5e3
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:215e4654fd9445711cb9dfea2667862985f77e204a2b8b6ad2d7416e86b834fc
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10c4704142b6f369cf4bf151e113e45f019dc64e0a7de8f91691f0f749dea2d6
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.809108247124776,
5
  "eval_steps": 500,
6
- "global_step": 3500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4963,6 +4963,714 @@
4963
  "eval_samples_per_second": 1103.838,
4964
  "eval_steps_per_second": 34.498,
4965
  "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4966
  }
4967
  ],
4968
  "logging_steps": 5,
@@ -4982,7 +5690,7 @@
4982
  "attributes": {}
4983
  }
4984
  },
4985
- "total_flos": 1.5163252974760755e+19,
4986
  "train_batch_size": 4,
4987
  "trial_name": null,
4988
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9246951395711727,
5
  "eval_steps": 500,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4963
  "eval_samples_per_second": 1103.838,
4964
  "eval_steps_per_second": 34.498,
4965
  "step": 3500
4966
+ },
4967
+ {
4968
+ "epoch": 0.81026411604924,
4969
+ "grad_norm": 165.75,
4970
+ "learning_rate": 1.0534429599177801e-06,
4971
+ "loss": 66.3025,
4972
+ "step": 3505
4973
+ },
4974
+ {
4975
+ "epoch": 0.8114199849737039,
4976
+ "grad_norm": 181.625,
4977
+ "learning_rate": 1.0470195272353546e-06,
4978
+ "loss": 65.6947,
4979
+ "step": 3510
4980
+ },
4981
+ {
4982
+ "epoch": 0.812575853898168,
4983
+ "grad_norm": 172.25,
4984
+ "learning_rate": 1.0405960945529292e-06,
4985
+ "loss": 66.646,
4986
+ "step": 3515
4987
+ },
4988
+ {
4989
+ "epoch": 0.813731722822632,
4990
+ "grad_norm": 168.375,
4991
+ "learning_rate": 1.0341726618705036e-06,
4992
+ "loss": 66.0262,
4993
+ "step": 3520
4994
+ },
4995
+ {
4996
+ "epoch": 0.8148875917470959,
4997
+ "grad_norm": 171.625,
4998
+ "learning_rate": 1.0277492291880783e-06,
4999
+ "loss": 67.0085,
5000
+ "step": 3525
5001
+ },
5002
+ {
5003
+ "epoch": 0.8160434606715599,
5004
+ "grad_norm": 174.5,
5005
+ "learning_rate": 1.0213257965056527e-06,
5006
+ "loss": 65.7099,
5007
+ "step": 3530
5008
+ },
5009
+ {
5010
+ "epoch": 0.8171993295960238,
5011
+ "grad_norm": 167.375,
5012
+ "learning_rate": 1.0149023638232273e-06,
5013
+ "loss": 67.0235,
5014
+ "step": 3535
5015
+ },
5016
+ {
5017
+ "epoch": 0.8183551985204878,
5018
+ "grad_norm": 174.625,
5019
+ "learning_rate": 1.0084789311408017e-06,
5020
+ "loss": 66.8565,
5021
+ "step": 3540
5022
+ },
5023
+ {
5024
+ "epoch": 0.8195110674449517,
5025
+ "grad_norm": 165.25,
5026
+ "learning_rate": 1.0020554984583762e-06,
5027
+ "loss": 67.6963,
5028
+ "step": 3545
5029
+ },
5030
+ {
5031
+ "epoch": 0.8206669363694157,
5032
+ "grad_norm": 182.625,
5033
+ "learning_rate": 9.956320657759508e-07,
5034
+ "loss": 66.9619,
5035
+ "step": 3550
5036
+ },
5037
+ {
5038
+ "epoch": 0.8218228052938796,
5039
+ "grad_norm": 173.0,
5040
+ "learning_rate": 9.892086330935252e-07,
5041
+ "loss": 68.0282,
5042
+ "step": 3555
5043
+ },
5044
+ {
5045
+ "epoch": 0.8229786742183436,
5046
+ "grad_norm": 167.625,
5047
+ "learning_rate": 9.827852004110999e-07,
5048
+ "loss": 66.8721,
5049
+ "step": 3560
5050
+ },
5051
+ {
5052
+ "epoch": 0.8241345431428077,
5053
+ "grad_norm": 163.375,
5054
+ "learning_rate": 9.763617677286743e-07,
5055
+ "loss": 64.3104,
5056
+ "step": 3565
5057
+ },
5058
+ {
5059
+ "epoch": 0.8252904120672716,
5060
+ "grad_norm": 173.375,
5061
+ "learning_rate": 9.699383350462487e-07,
5062
+ "loss": 66.3384,
5063
+ "step": 3570
5064
+ },
5065
+ {
5066
+ "epoch": 0.8264462809917356,
5067
+ "grad_norm": 185.625,
5068
+ "learning_rate": 9.635149023638233e-07,
5069
+ "loss": 65.7862,
5070
+ "step": 3575
5071
+ },
5072
+ {
5073
+ "epoch": 0.8276021499161995,
5074
+ "grad_norm": 165.375,
5075
+ "learning_rate": 9.570914696813978e-07,
5076
+ "loss": 66.4101,
5077
+ "step": 3580
5078
+ },
5079
+ {
5080
+ "epoch": 0.8287580188406635,
5081
+ "grad_norm": 169.125,
5082
+ "learning_rate": 9.506680369989724e-07,
5083
+ "loss": 67.1195,
5084
+ "step": 3585
5085
+ },
5086
+ {
5087
+ "epoch": 0.8299138877651274,
5088
+ "grad_norm": 167.25,
5089
+ "learning_rate": 9.442446043165468e-07,
5090
+ "loss": 66.6352,
5091
+ "step": 3590
5092
+ },
5093
+ {
5094
+ "epoch": 0.8310697566895914,
5095
+ "grad_norm": 159.125,
5096
+ "learning_rate": 9.378211716341213e-07,
5097
+ "loss": 66.0377,
5098
+ "step": 3595
5099
+ },
5100
+ {
5101
+ "epoch": 0.8322256256140553,
5102
+ "grad_norm": 189.125,
5103
+ "learning_rate": 9.313977389516958e-07,
5104
+ "loss": 67.3261,
5105
+ "step": 3600
5106
+ },
5107
+ {
5108
+ "epoch": 0.8333814945385193,
5109
+ "grad_norm": 165.375,
5110
+ "learning_rate": 9.249743062692704e-07,
5111
+ "loss": 67.397,
5112
+ "step": 3605
5113
+ },
5114
+ {
5115
+ "epoch": 0.8345373634629834,
5116
+ "grad_norm": 173.375,
5117
+ "learning_rate": 9.185508735868449e-07,
5118
+ "loss": 67.4292,
5119
+ "step": 3610
5120
+ },
5121
+ {
5122
+ "epoch": 0.8356932323874473,
5123
+ "grad_norm": 174.5,
5124
+ "learning_rate": 9.121274409044194e-07,
5125
+ "loss": 66.4497,
5126
+ "step": 3615
5127
+ },
5128
+ {
5129
+ "epoch": 0.8368491013119113,
5130
+ "grad_norm": 163.75,
5131
+ "learning_rate": 9.05704008221994e-07,
5132
+ "loss": 67.061,
5133
+ "step": 3620
5134
+ },
5135
+ {
5136
+ "epoch": 0.8380049702363752,
5137
+ "grad_norm": 169.25,
5138
+ "learning_rate": 8.992805755395684e-07,
5139
+ "loss": 65.7249,
5140
+ "step": 3625
5141
+ },
5142
+ {
5143
+ "epoch": 0.8391608391608392,
5144
+ "grad_norm": 168.5,
5145
+ "learning_rate": 8.928571428571429e-07,
5146
+ "loss": 65.5726,
5147
+ "step": 3630
5148
+ },
5149
+ {
5150
+ "epoch": 0.8403167080853031,
5151
+ "grad_norm": 188.25,
5152
+ "learning_rate": 8.864337101747174e-07,
5153
+ "loss": 67.4626,
5154
+ "step": 3635
5155
+ },
5156
+ {
5157
+ "epoch": 0.8414725770097671,
5158
+ "grad_norm": 181.5,
5159
+ "learning_rate": 8.80010277492292e-07,
5160
+ "loss": 67.5812,
5161
+ "step": 3640
5162
+ },
5163
+ {
5164
+ "epoch": 0.842628445934231,
5165
+ "grad_norm": 180.5,
5166
+ "learning_rate": 8.735868448098665e-07,
5167
+ "loss": 67.5507,
5168
+ "step": 3645
5169
+ },
5170
+ {
5171
+ "epoch": 0.843784314858695,
5172
+ "grad_norm": 178.125,
5173
+ "learning_rate": 8.67163412127441e-07,
5174
+ "loss": 65.7228,
5175
+ "step": 3650
5176
+ },
5177
+ {
5178
+ "epoch": 0.8449401837831589,
5179
+ "grad_norm": 172.25,
5180
+ "learning_rate": 8.607399794450155e-07,
5181
+ "loss": 65.4235,
5182
+ "step": 3655
5183
+ },
5184
+ {
5185
+ "epoch": 0.846096052707623,
5186
+ "grad_norm": 164.375,
5187
+ "learning_rate": 8.543165467625899e-07,
5188
+ "loss": 66.6319,
5189
+ "step": 3660
5190
+ },
5191
+ {
5192
+ "epoch": 0.847251921632087,
5193
+ "grad_norm": 167.625,
5194
+ "learning_rate": 8.478931140801645e-07,
5195
+ "loss": 65.731,
5196
+ "step": 3665
5197
+ },
5198
+ {
5199
+ "epoch": 0.8484077905565509,
5200
+ "grad_norm": 189.5,
5201
+ "learning_rate": 8.41469681397739e-07,
5202
+ "loss": 66.946,
5203
+ "step": 3670
5204
+ },
5205
+ {
5206
+ "epoch": 0.8495636594810149,
5207
+ "grad_norm": 163.0,
5208
+ "learning_rate": 8.350462487153135e-07,
5209
+ "loss": 66.6999,
5210
+ "step": 3675
5211
+ },
5212
+ {
5213
+ "epoch": 0.8507195284054788,
5214
+ "grad_norm": 167.875,
5215
+ "learning_rate": 8.286228160328881e-07,
5216
+ "loss": 66.8708,
5217
+ "step": 3680
5218
+ },
5219
+ {
5220
+ "epoch": 0.8518753973299428,
5221
+ "grad_norm": 189.125,
5222
+ "learning_rate": 8.221993833504626e-07,
5223
+ "loss": 65.4334,
5224
+ "step": 3685
5225
+ },
5226
+ {
5227
+ "epoch": 0.8530312662544067,
5228
+ "grad_norm": 170.125,
5229
+ "learning_rate": 8.157759506680371e-07,
5230
+ "loss": 66.7173,
5231
+ "step": 3690
5232
+ },
5233
+ {
5234
+ "epoch": 0.8541871351788707,
5235
+ "grad_norm": 177.75,
5236
+ "learning_rate": 8.093525179856115e-07,
5237
+ "loss": 65.8568,
5238
+ "step": 3695
5239
+ },
5240
+ {
5241
+ "epoch": 0.8553430041033346,
5242
+ "grad_norm": 171.125,
5243
+ "learning_rate": 8.029290853031861e-07,
5244
+ "loss": 66.3738,
5245
+ "step": 3700
5246
+ },
5247
+ {
5248
+ "epoch": 0.8564988730277987,
5249
+ "grad_norm": 186.625,
5250
+ "learning_rate": 7.965056526207606e-07,
5251
+ "loss": 66.5799,
5252
+ "step": 3705
5253
+ },
5254
+ {
5255
+ "epoch": 0.8576547419522627,
5256
+ "grad_norm": 166.75,
5257
+ "learning_rate": 7.900822199383351e-07,
5258
+ "loss": 66.2826,
5259
+ "step": 3710
5260
+ },
5261
+ {
5262
+ "epoch": 0.8588106108767266,
5263
+ "grad_norm": 163.25,
5264
+ "learning_rate": 7.836587872559097e-07,
5265
+ "loss": 65.4569,
5266
+ "step": 3715
5267
+ },
5268
+ {
5269
+ "epoch": 0.8599664798011906,
5270
+ "grad_norm": 172.5,
5271
+ "learning_rate": 7.772353545734842e-07,
5272
+ "loss": 66.8164,
5273
+ "step": 3720
5274
+ },
5275
+ {
5276
+ "epoch": 0.8611223487256545,
5277
+ "grad_norm": 172.875,
5278
+ "learning_rate": 7.708119218910587e-07,
5279
+ "loss": 65.8996,
5280
+ "step": 3725
5281
+ },
5282
+ {
5283
+ "epoch": 0.8622782176501185,
5284
+ "grad_norm": 167.625,
5285
+ "learning_rate": 7.643884892086331e-07,
5286
+ "loss": 66.3647,
5287
+ "step": 3730
5288
+ },
5289
+ {
5290
+ "epoch": 0.8634340865745824,
5291
+ "grad_norm": 178.5,
5292
+ "learning_rate": 7.579650565262076e-07,
5293
+ "loss": 66.4823,
5294
+ "step": 3735
5295
+ },
5296
+ {
5297
+ "epoch": 0.8645899554990464,
5298
+ "grad_norm": 171.875,
5299
+ "learning_rate": 7.515416238437822e-07,
5300
+ "loss": 64.8069,
5301
+ "step": 3740
5302
+ },
5303
+ {
5304
+ "epoch": 0.8657458244235103,
5305
+ "grad_norm": 186.0,
5306
+ "learning_rate": 7.451181911613567e-07,
5307
+ "loss": 65.7009,
5308
+ "step": 3745
5309
+ },
5310
+ {
5311
+ "epoch": 0.8669016933479743,
5312
+ "grad_norm": 181.625,
5313
+ "learning_rate": 7.386947584789312e-07,
5314
+ "loss": 67.0425,
5315
+ "step": 3750
5316
+ },
5317
+ {
5318
+ "epoch": 0.8680575622724384,
5319
+ "grad_norm": 177.625,
5320
+ "learning_rate": 7.322713257965057e-07,
5321
+ "loss": 67.3124,
5322
+ "step": 3755
5323
+ },
5324
+ {
5325
+ "epoch": 0.8692134311969023,
5326
+ "grad_norm": 161.75,
5327
+ "learning_rate": 7.258478931140803e-07,
5328
+ "loss": 67.4977,
5329
+ "step": 3760
5330
+ },
5331
+ {
5332
+ "epoch": 0.8703693001213663,
5333
+ "grad_norm": 169.625,
5334
+ "learning_rate": 7.194244604316547e-07,
5335
+ "loss": 66.7436,
5336
+ "step": 3765
5337
+ },
5338
+ {
5339
+ "epoch": 0.8715251690458302,
5340
+ "grad_norm": 174.625,
5341
+ "learning_rate": 7.130010277492292e-07,
5342
+ "loss": 66.0341,
5343
+ "step": 3770
5344
+ },
5345
+ {
5346
+ "epoch": 0.8726810379702942,
5347
+ "grad_norm": 168.625,
5348
+ "learning_rate": 7.065775950668037e-07,
5349
+ "loss": 66.7868,
5350
+ "step": 3775
5351
+ },
5352
+ {
5353
+ "epoch": 0.8738369068947581,
5354
+ "grad_norm": 168.5,
5355
+ "learning_rate": 7.001541623843783e-07,
5356
+ "loss": 66.8334,
5357
+ "step": 3780
5358
+ },
5359
+ {
5360
+ "epoch": 0.8749927758192221,
5361
+ "grad_norm": 172.5,
5362
+ "learning_rate": 6.937307297019528e-07,
5363
+ "loss": 66.601,
5364
+ "step": 3785
5365
+ },
5366
+ {
5367
+ "epoch": 0.876148644743686,
5368
+ "grad_norm": 171.0,
5369
+ "learning_rate": 6.873072970195273e-07,
5370
+ "loss": 66.1991,
5371
+ "step": 3790
5372
+ },
5373
+ {
5374
+ "epoch": 0.87730451366815,
5375
+ "grad_norm": 167.75,
5376
+ "learning_rate": 6.808838643371019e-07,
5377
+ "loss": 66.5028,
5378
+ "step": 3795
5379
+ },
5380
+ {
5381
+ "epoch": 0.878460382592614,
5382
+ "grad_norm": 186.125,
5383
+ "learning_rate": 6.744604316546763e-07,
5384
+ "loss": 67.2106,
5385
+ "step": 3800
5386
+ },
5387
+ {
5388
+ "epoch": 0.879616251517078,
5389
+ "grad_norm": 173.5,
5390
+ "learning_rate": 6.680369989722508e-07,
5391
+ "loss": 65.993,
5392
+ "step": 3805
5393
+ },
5394
+ {
5395
+ "epoch": 0.880772120441542,
5396
+ "grad_norm": 162.75,
5397
+ "learning_rate": 6.616135662898253e-07,
5398
+ "loss": 65.3315,
5399
+ "step": 3810
5400
+ },
5401
+ {
5402
+ "epoch": 0.8819279893660059,
5403
+ "grad_norm": 173.875,
5404
+ "learning_rate": 6.551901336073999e-07,
5405
+ "loss": 66.5064,
5406
+ "step": 3815
5407
+ },
5408
+ {
5409
+ "epoch": 0.8830838582904699,
5410
+ "grad_norm": 170.125,
5411
+ "learning_rate": 6.487667009249743e-07,
5412
+ "loss": 66.6228,
5413
+ "step": 3820
5414
+ },
5415
+ {
5416
+ "epoch": 0.8842397272149338,
5417
+ "grad_norm": 176.75,
5418
+ "learning_rate": 6.423432682425489e-07,
5419
+ "loss": 66.9273,
5420
+ "step": 3825
5421
+ },
5422
+ {
5423
+ "epoch": 0.8853955961393978,
5424
+ "grad_norm": 161.25,
5425
+ "learning_rate": 6.359198355601234e-07,
5426
+ "loss": 66.3972,
5427
+ "step": 3830
5428
+ },
5429
+ {
5430
+ "epoch": 0.8865514650638617,
5431
+ "grad_norm": 160.5,
5432
+ "learning_rate": 6.294964028776979e-07,
5433
+ "loss": 65.0881,
5434
+ "step": 3835
5435
+ },
5436
+ {
5437
+ "epoch": 0.8877073339883257,
5438
+ "grad_norm": 163.875,
5439
+ "learning_rate": 6.230729701952724e-07,
5440
+ "loss": 67.1789,
5441
+ "step": 3840
5442
+ },
5443
+ {
5444
+ "epoch": 0.8888632029127896,
5445
+ "grad_norm": 159.625,
5446
+ "learning_rate": 6.16649537512847e-07,
5447
+ "loss": 65.1281,
5448
+ "step": 3845
5449
+ },
5450
+ {
5451
+ "epoch": 0.8900190718372537,
5452
+ "grad_norm": 166.125,
5453
+ "learning_rate": 6.102261048304214e-07,
5454
+ "loss": 65.8177,
5455
+ "step": 3850
5456
+ },
5457
+ {
5458
+ "epoch": 0.8911749407617177,
5459
+ "grad_norm": 170.5,
5460
+ "learning_rate": 6.038026721479959e-07,
5461
+ "loss": 65.8602,
5462
+ "step": 3855
5463
+ },
5464
+ {
5465
+ "epoch": 0.8923308096861816,
5466
+ "grad_norm": 167.375,
5467
+ "learning_rate": 5.973792394655705e-07,
5468
+ "loss": 67.0154,
5469
+ "step": 3860
5470
+ },
5471
+ {
5472
+ "epoch": 0.8934866786106456,
5473
+ "grad_norm": 177.0,
5474
+ "learning_rate": 5.90955806783145e-07,
5475
+ "loss": 66.4106,
5476
+ "step": 3865
5477
+ },
5478
+ {
5479
+ "epoch": 0.8946425475351095,
5480
+ "grad_norm": 171.0,
5481
+ "learning_rate": 5.845323741007194e-07,
5482
+ "loss": 64.8226,
5483
+ "step": 3870
5484
+ },
5485
+ {
5486
+ "epoch": 0.8957984164595735,
5487
+ "grad_norm": 170.25,
5488
+ "learning_rate": 5.78108941418294e-07,
5489
+ "loss": 65.3194,
5490
+ "step": 3875
5491
+ },
5492
+ {
5493
+ "epoch": 0.8969542853840374,
5494
+ "grad_norm": 164.375,
5495
+ "learning_rate": 5.716855087358686e-07,
5496
+ "loss": 66.7081,
5497
+ "step": 3880
5498
+ },
5499
+ {
5500
+ "epoch": 0.8981101543085014,
5501
+ "grad_norm": 173.125,
5502
+ "learning_rate": 5.65262076053443e-07,
5503
+ "loss": 65.5161,
5504
+ "step": 3885
5505
+ },
5506
+ {
5507
+ "epoch": 0.8992660232329653,
5508
+ "grad_norm": 179.375,
5509
+ "learning_rate": 5.588386433710175e-07,
5510
+ "loss": 65.9439,
5511
+ "step": 3890
5512
+ },
5513
+ {
5514
+ "epoch": 0.9004218921574293,
5515
+ "grad_norm": 180.875,
5516
+ "learning_rate": 5.524152106885921e-07,
5517
+ "loss": 64.9139,
5518
+ "step": 3895
5519
+ },
5520
+ {
5521
+ "epoch": 0.9015777610818934,
5522
+ "grad_norm": 168.75,
5523
+ "learning_rate": 5.459917780061665e-07,
5524
+ "loss": 64.3953,
5525
+ "step": 3900
5526
+ },
5527
+ {
5528
+ "epoch": 0.9027336300063573,
5529
+ "grad_norm": 178.125,
5530
+ "learning_rate": 5.39568345323741e-07,
5531
+ "loss": 66.4299,
5532
+ "step": 3905
5533
+ },
5534
+ {
5535
+ "epoch": 0.9038894989308213,
5536
+ "grad_norm": 171.0,
5537
+ "learning_rate": 5.331449126413155e-07,
5538
+ "loss": 65.5295,
5539
+ "step": 3910
5540
+ },
5541
+ {
5542
+ "epoch": 0.9050453678552852,
5543
+ "grad_norm": 168.875,
5544
+ "learning_rate": 5.267214799588901e-07,
5545
+ "loss": 67.0198,
5546
+ "step": 3915
5547
+ },
5548
+ {
5549
+ "epoch": 0.9062012367797492,
5550
+ "grad_norm": 167.25,
5551
+ "learning_rate": 5.202980472764646e-07,
5552
+ "loss": 65.2047,
5553
+ "step": 3920
5554
+ },
5555
+ {
5556
+ "epoch": 0.9073571057042131,
5557
+ "grad_norm": 163.625,
5558
+ "learning_rate": 5.138746145940391e-07,
5559
+ "loss": 66.2454,
5560
+ "step": 3925
5561
+ },
5562
+ {
5563
+ "epoch": 0.9085129746286771,
5564
+ "grad_norm": 168.375,
5565
+ "learning_rate": 5.074511819116137e-07,
5566
+ "loss": 65.8559,
5567
+ "step": 3930
5568
+ },
5569
+ {
5570
+ "epoch": 0.909668843553141,
5571
+ "grad_norm": 191.25,
5572
+ "learning_rate": 5.010277492291881e-07,
5573
+ "loss": 65.7141,
5574
+ "step": 3935
5575
+ },
5576
+ {
5577
+ "epoch": 0.910824712477605,
5578
+ "grad_norm": 159.25,
5579
+ "learning_rate": 4.946043165467626e-07,
5580
+ "loss": 65.7896,
5581
+ "step": 3940
5582
+ },
5583
+ {
5584
+ "epoch": 0.9119805814020691,
5585
+ "grad_norm": 157.375,
5586
+ "learning_rate": 4.881808838643371e-07,
5587
+ "loss": 65.8169,
5588
+ "step": 3945
5589
+ },
5590
+ {
5591
+ "epoch": 0.913136450326533,
5592
+ "grad_norm": 186.5,
5593
+ "learning_rate": 4.817574511819117e-07,
5594
+ "loss": 64.8462,
5595
+ "step": 3950
5596
+ },
5597
+ {
5598
+ "epoch": 0.914292319250997,
5599
+ "grad_norm": 157.625,
5600
+ "learning_rate": 4.753340184994862e-07,
5601
+ "loss": 65.2834,
5602
+ "step": 3955
5603
+ },
5604
+ {
5605
+ "epoch": 0.9154481881754609,
5606
+ "grad_norm": 175.75,
5607
+ "learning_rate": 4.6891058581706067e-07,
5608
+ "loss": 66.5881,
5609
+ "step": 3960
5610
+ },
5611
+ {
5612
+ "epoch": 0.9166040570999249,
5613
+ "grad_norm": 163.375,
5614
+ "learning_rate": 4.624871531346352e-07,
5615
+ "loss": 66.1449,
5616
+ "step": 3965
5617
+ },
5618
+ {
5619
+ "epoch": 0.9177599260243888,
5620
+ "grad_norm": 166.875,
5621
+ "learning_rate": 4.560637204522097e-07,
5622
+ "loss": 65.73,
5623
+ "step": 3970
5624
+ },
5625
+ {
5626
+ "epoch": 0.9189157949488528,
5627
+ "grad_norm": 159.125,
5628
+ "learning_rate": 4.496402877697842e-07,
5629
+ "loss": 65.5529,
5630
+ "step": 3975
5631
+ },
5632
+ {
5633
+ "epoch": 0.9200716638733167,
5634
+ "grad_norm": 170.375,
5635
+ "learning_rate": 4.432168550873587e-07,
5636
+ "loss": 64.6188,
5637
+ "step": 3980
5638
+ },
5639
+ {
5640
+ "epoch": 0.9212275327977807,
5641
+ "grad_norm": 155.375,
5642
+ "learning_rate": 4.3679342240493327e-07,
5643
+ "loss": 64.926,
5644
+ "step": 3985
5645
+ },
5646
+ {
5647
+ "epoch": 0.9223834017222446,
5648
+ "grad_norm": 166.375,
5649
+ "learning_rate": 4.3036998972250774e-07,
5650
+ "loss": 65.8812,
5651
+ "step": 3990
5652
+ },
5653
+ {
5654
+ "epoch": 0.9235392706467087,
5655
+ "grad_norm": 176.5,
5656
+ "learning_rate": 4.2394655704008227e-07,
5657
+ "loss": 64.8619,
5658
+ "step": 3995
5659
+ },
5660
+ {
5661
+ "epoch": 0.9246951395711727,
5662
+ "grad_norm": 163.75,
5663
+ "learning_rate": 4.1752312435765675e-07,
5664
+ "loss": 66.7275,
5665
+ "step": 4000
5666
+ },
5667
+ {
5668
+ "epoch": 0.9246951395711727,
5669
+ "eval_loss": 2.0627987384796143,
5670
+ "eval_runtime": 212.1215,
5671
+ "eval_samples_per_second": 1099.059,
5672
+ "eval_steps_per_second": 34.348,
5673
+ "step": 4000
5674
  }
5675
  ],
5676
  "logging_steps": 5,
 
5690
  "attributes": {}
5691
  }
5692
  },
5693
+ "total_flos": 1.7329431971155149e+19,
5694
  "train_batch_size": 4,
5695
  "trial_name": null,
5696
  "trial_params": null