rovdetection commited on
Commit
42d45da
·
verified ·
1 Parent(s): c0c7b9b

Training in progress, step 4500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:835db88c2c568a2a5b9eecd0ca20228d562ccd37375f6d5e37ee4f667bd5c028
3
  size 9446744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b732646b1016d0368b94920529e0e03c133894ca8756d67e145a97d90d254777
3
  size 9446744
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bde2b53b9a0c26662086027ef84b0578651b731c913f116872da22f0740efeab
3
  size 4879947
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1728e885cf58302b2e8ae68b6c9f146637db471aa0ed43e5c883bad6235443e
3
  size 4879947
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa8f41c51c4c045061b2c14ad0e244d1f18ea14e355c0937c51abc1c22235765
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16de339ad05cf2ba88ca8586907951353749d574c9326b3098589fb0f62ac32e
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcdef9cce1358b15f98ec011b2742b883d23020479104f9b5467277f0c257b88
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cefe33faabb000e8f719c6f02e0099d6289469d78aca45133006441981cd323
3
  size 14917
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0033c7745b46bdca3ecab5787678834ca68f7f7e1288869dceeb38812abc253
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b62db0ba9861d9ab63380744e79a287faa461a1bf55700140a411fe1e976f1cd
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5ee800e7df74b641553b418c04566b716dade6c517cb6fd519bb2168d1739f3
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b41aa0c086667ab13fd1c3da2f8b431d894c7368cafdbcdd2e5351f4800eddf8
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 6.873415001074576,
6
  "eval_steps": 500,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4008,6 +4008,506 @@
4008
  "mean_token_accuracy": 0.6625144556164742,
4009
  "num_tokens": 23764831.0,
4010
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4011
  }
4012
  ],
4013
  "logging_steps": 10,
@@ -4027,7 +4527,7 @@
4027
  "attributes": {}
4028
  }
4029
  },
4030
- "total_flos": 1.951545327353856e+17,
4031
  "train_batch_size": 2,
4032
  "trial_name": null,
4033
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 7.732430689877498,
6
  "eval_steps": 500,
7
+ "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4008
  "mean_token_accuracy": 0.6625144556164742,
4009
  "num_tokens": 23764831.0,
4010
  "step": 4000
4011
+ },
4012
+ {
4013
+ "entropy": 1.699565550684929,
4014
+ "epoch": 6.890608209757146,
4015
+ "grad_norm": 0.7662839889526367,
4016
+ "learning_rate": 3.964e-05,
4017
+ "loss": 1.7373327255249023,
4018
+ "mean_token_accuracy": 0.6809282444417477,
4019
+ "num_tokens": 23825367.0,
4020
+ "step": 4010
4021
+ },
4022
+ {
4023
+ "entropy": 1.6455101184546947,
4024
+ "epoch": 6.907801418439716,
4025
+ "grad_norm": 0.7619901299476624,
4026
+ "learning_rate": 3.9240000000000004e-05,
4027
+ "loss": 1.709805679321289,
4028
+ "mean_token_accuracy": 0.6812954246997833,
4029
+ "num_tokens": 23887369.0,
4030
+ "step": 4020
4031
+ },
4032
+ {
4033
+ "entropy": 1.7952800825238229,
4034
+ "epoch": 6.924994627122286,
4035
+ "grad_norm": 0.7858437299728394,
4036
+ "learning_rate": 3.884e-05,
4037
+ "loss": 1.8688398361206056,
4038
+ "mean_token_accuracy": 0.6621494639664889,
4039
+ "num_tokens": 23949358.0,
4040
+ "step": 4030
4041
+ },
4042
+ {
4043
+ "entropy": 1.772008201479912,
4044
+ "epoch": 6.942187835804857,
4045
+ "grad_norm": 0.7586779594421387,
4046
+ "learning_rate": 3.8440000000000005e-05,
4047
+ "loss": 1.798760986328125,
4048
+ "mean_token_accuracy": 0.667642817273736,
4049
+ "num_tokens": 24009691.0,
4050
+ "step": 4040
4051
+ },
4052
+ {
4053
+ "entropy": 1.7289930269122125,
4054
+ "epoch": 6.959381044487428,
4055
+ "grad_norm": 0.854505717754364,
4056
+ "learning_rate": 3.804e-05,
4057
+ "loss": 1.771562385559082,
4058
+ "mean_token_accuracy": 0.6692178774625063,
4059
+ "num_tokens": 24064506.0,
4060
+ "step": 4050
4061
+ },
4062
+ {
4063
+ "entropy": 1.715189914405346,
4064
+ "epoch": 6.976574253169998,
4065
+ "grad_norm": 0.758488655090332,
4066
+ "learning_rate": 3.7640000000000006e-05,
4067
+ "loss": 1.756412887573242,
4068
+ "mean_token_accuracy": 0.6710222817957401,
4069
+ "num_tokens": 24126841.0,
4070
+ "step": 4060
4071
+ },
4072
+ {
4073
+ "entropy": 1.7383173301815986,
4074
+ "epoch": 6.993767461852569,
4075
+ "grad_norm": 0.7450618147850037,
4076
+ "learning_rate": 3.724e-05,
4077
+ "loss": 1.7997669219970702,
4078
+ "mean_token_accuracy": 0.6649864386767149,
4079
+ "num_tokens": 24186159.0,
4080
+ "step": 4070
4081
+ },
4082
+ {
4083
+ "entropy": 1.7172312767474682,
4084
+ "epoch": 7.010315925209542,
4085
+ "grad_norm": 0.8475770950317383,
4086
+ "learning_rate": 3.684e-05,
4087
+ "loss": 1.7585922241210938,
4088
+ "mean_token_accuracy": 0.6746863397684965,
4089
+ "num_tokens": 24239759.0,
4090
+ "step": 4080
4091
+ },
4092
+ {
4093
+ "entropy": 1.7192407630383968,
4094
+ "epoch": 7.027509133892113,
4095
+ "grad_norm": 0.7818967700004578,
4096
+ "learning_rate": 3.6440000000000003e-05,
4097
+ "loss": 1.7634265899658204,
4098
+ "mean_token_accuracy": 0.6724576361477375,
4099
+ "num_tokens": 24298775.0,
4100
+ "step": 4090
4101
+ },
4102
+ {
4103
+ "entropy": 1.7496131911873818,
4104
+ "epoch": 7.044702342574683,
4105
+ "grad_norm": 0.8118335008621216,
4106
+ "learning_rate": 3.604e-05,
4107
+ "loss": 1.802253532409668,
4108
+ "mean_token_accuracy": 0.6702191606163979,
4109
+ "num_tokens": 24361142.0,
4110
+ "step": 4100
4111
+ },
4112
+ {
4113
+ "entropy": 1.7090509735047816,
4114
+ "epoch": 7.061895551257253,
4115
+ "grad_norm": 0.8414726257324219,
4116
+ "learning_rate": 3.5640000000000004e-05,
4117
+ "loss": 1.7347373962402344,
4118
+ "mean_token_accuracy": 0.679864277690649,
4119
+ "num_tokens": 24419838.0,
4120
+ "step": 4110
4121
+ },
4122
+ {
4123
+ "entropy": 1.6807728812098504,
4124
+ "epoch": 7.079088759939824,
4125
+ "grad_norm": 0.8567139506340027,
4126
+ "learning_rate": 3.524e-05,
4127
+ "loss": 1.7365150451660156,
4128
+ "mean_token_accuracy": 0.6765194039791822,
4129
+ "num_tokens": 24477518.0,
4130
+ "step": 4120
4131
+ },
4132
+ {
4133
+ "entropy": 1.709678091108799,
4134
+ "epoch": 7.096281968622394,
4135
+ "grad_norm": 0.8345620036125183,
4136
+ "learning_rate": 3.484e-05,
4137
+ "loss": 1.730575180053711,
4138
+ "mean_token_accuracy": 0.6709145799279213,
4139
+ "num_tokens": 24534560.0,
4140
+ "step": 4130
4141
+ },
4142
+ {
4143
+ "entropy": 1.6541544690728187,
4144
+ "epoch": 7.113475177304965,
4145
+ "grad_norm": 0.8509814143180847,
4146
+ "learning_rate": 3.444e-05,
4147
+ "loss": 1.6795757293701172,
4148
+ "mean_token_accuracy": 0.6856038823723793,
4149
+ "num_tokens": 24594829.0,
4150
+ "step": 4140
4151
+ },
4152
+ {
4153
+ "entropy": 1.7498343527317046,
4154
+ "epoch": 7.130668385987535,
4155
+ "grad_norm": 0.8674039244651794,
4156
+ "learning_rate": 3.404e-05,
4157
+ "loss": 1.8083892822265626,
4158
+ "mean_token_accuracy": 0.6709578204900026,
4159
+ "num_tokens": 24656798.0,
4160
+ "step": 4150
4161
+ },
4162
+ {
4163
+ "entropy": 1.677807478606701,
4164
+ "epoch": 7.147861594670105,
4165
+ "grad_norm": 0.8016234040260315,
4166
+ "learning_rate": 3.3639999999999996e-05,
4167
+ "loss": 1.7206790924072266,
4168
+ "mean_token_accuracy": 0.6754934191703796,
4169
+ "num_tokens": 24714009.0,
4170
+ "step": 4160
4171
+ },
4172
+ {
4173
+ "entropy": 1.672835360467434,
4174
+ "epoch": 7.1650548033526755,
4175
+ "grad_norm": 0.7139334082603455,
4176
+ "learning_rate": 3.324e-05,
4177
+ "loss": 1.7049163818359374,
4178
+ "mean_token_accuracy": 0.6851269513368606,
4179
+ "num_tokens": 24778022.0,
4180
+ "step": 4170
4181
+ },
4182
+ {
4183
+ "entropy": 1.6577355667948723,
4184
+ "epoch": 7.182248012035246,
4185
+ "grad_norm": 0.9129847288131714,
4186
+ "learning_rate": 3.2840000000000004e-05,
4187
+ "loss": 1.7073640823364258,
4188
+ "mean_token_accuracy": 0.6768647953867912,
4189
+ "num_tokens": 24837669.0,
4190
+ "step": 4180
4191
+ },
4192
+ {
4193
+ "entropy": 1.7049853071570396,
4194
+ "epoch": 7.199441220717817,
4195
+ "grad_norm": 0.7545643448829651,
4196
+ "learning_rate": 3.244e-05,
4197
+ "loss": 1.754374122619629,
4198
+ "mean_token_accuracy": 0.6808854278177023,
4199
+ "num_tokens": 24898991.0,
4200
+ "step": 4190
4201
+ },
4202
+ {
4203
+ "entropy": 1.6785477355122567,
4204
+ "epoch": 7.216634429400387,
4205
+ "grad_norm": 0.8802333474159241,
4206
+ "learning_rate": 3.2040000000000005e-05,
4207
+ "loss": 1.6974828720092774,
4208
+ "mean_token_accuracy": 0.6824289247393608,
4209
+ "num_tokens": 24957348.0,
4210
+ "step": 4200
4211
+ },
4212
+ {
4213
+ "entropy": 1.7312355414032936,
4214
+ "epoch": 7.233827638082957,
4215
+ "grad_norm": 0.8227038383483887,
4216
+ "learning_rate": 3.164e-05,
4217
+ "loss": 1.7645183563232423,
4218
+ "mean_token_accuracy": 0.6661410238593817,
4219
+ "num_tokens": 25016658.0,
4220
+ "step": 4210
4221
+ },
4222
+ {
4223
+ "entropy": 1.8124181643128394,
4224
+ "epoch": 7.2510208467655275,
4225
+ "grad_norm": 0.8563106060028076,
4226
+ "learning_rate": 3.1240000000000006e-05,
4227
+ "loss": 1.8163776397705078,
4228
+ "mean_token_accuracy": 0.6610642150044441,
4229
+ "num_tokens": 25074658.0,
4230
+ "step": 4220
4231
+ },
4232
+ {
4233
+ "entropy": 1.776869924366474,
4234
+ "epoch": 7.268214055448098,
4235
+ "grad_norm": 0.8615058064460754,
4236
+ "learning_rate": 3.084e-05,
4237
+ "loss": 1.861563491821289,
4238
+ "mean_token_accuracy": 0.6624562762677669,
4239
+ "num_tokens": 25132732.0,
4240
+ "step": 4230
4241
+ },
4242
+ {
4243
+ "entropy": 1.742109003663063,
4244
+ "epoch": 7.285407264130669,
4245
+ "grad_norm": 0.7851050496101379,
4246
+ "learning_rate": 3.0440000000000003e-05,
4247
+ "loss": 1.7527351379394531,
4248
+ "mean_token_accuracy": 0.6712357953190804,
4249
+ "num_tokens": 25194009.0,
4250
+ "step": 4240
4251
+ },
4252
+ {
4253
+ "entropy": 1.7356494843959809,
4254
+ "epoch": 7.302600472813239,
4255
+ "grad_norm": 0.8842288255691528,
4256
+ "learning_rate": 3.004e-05,
4257
+ "loss": 1.8091196060180663,
4258
+ "mean_token_accuracy": 0.6680308949202299,
4259
+ "num_tokens": 25250681.0,
4260
+ "step": 4250
4261
+ },
4262
+ {
4263
+ "entropy": 1.714112138748169,
4264
+ "epoch": 7.319793681495809,
4265
+ "grad_norm": 0.8050926923751831,
4266
+ "learning_rate": 2.964e-05,
4267
+ "loss": 1.741617774963379,
4268
+ "mean_token_accuracy": 0.6764710985124112,
4269
+ "num_tokens": 25307119.0,
4270
+ "step": 4260
4271
+ },
4272
+ {
4273
+ "entropy": 1.7806825146079064,
4274
+ "epoch": 7.3369868901783795,
4275
+ "grad_norm": 0.755797803401947,
4276
+ "learning_rate": 2.924e-05,
4277
+ "loss": 1.8448747634887694,
4278
+ "mean_token_accuracy": 0.6646751999855042,
4279
+ "num_tokens": 25365721.0,
4280
+ "step": 4270
4281
+ },
4282
+ {
4283
+ "entropy": 1.7478718511760234,
4284
+ "epoch": 7.35418009886095,
4285
+ "grad_norm": 0.8148614764213562,
4286
+ "learning_rate": 2.8840000000000002e-05,
4287
+ "loss": 1.8303293228149413,
4288
+ "mean_token_accuracy": 0.6662985436618328,
4289
+ "num_tokens": 25423309.0,
4290
+ "step": 4280
4291
+ },
4292
+ {
4293
+ "entropy": 1.6996045634150505,
4294
+ "epoch": 7.371373307543521,
4295
+ "grad_norm": 0.7613778114318848,
4296
+ "learning_rate": 2.844e-05,
4297
+ "loss": 1.7077817916870117,
4298
+ "mean_token_accuracy": 0.679437268525362,
4299
+ "num_tokens": 25480080.0,
4300
+ "step": 4290
4301
+ },
4302
+ {
4303
+ "entropy": 1.8055237784981728,
4304
+ "epoch": 7.38856651622609,
4305
+ "grad_norm": 0.899900496006012,
4306
+ "learning_rate": 2.804e-05,
4307
+ "loss": 1.882634735107422,
4308
+ "mean_token_accuracy": 0.659589122608304,
4309
+ "num_tokens": 25538885.0,
4310
+ "step": 4300
4311
+ },
4312
+ {
4313
+ "entropy": 1.6835025876760483,
4314
+ "epoch": 7.405759724908661,
4315
+ "grad_norm": 0.7718909382820129,
4316
+ "learning_rate": 2.764e-05,
4317
+ "loss": 1.7145641326904297,
4318
+ "mean_token_accuracy": 0.6805526971817016,
4319
+ "num_tokens": 25598830.0,
4320
+ "step": 4310
4321
+ },
4322
+ {
4323
+ "entropy": 1.7392980232834816,
4324
+ "epoch": 7.422952933591231,
4325
+ "grad_norm": 0.7144562005996704,
4326
+ "learning_rate": 2.724e-05,
4327
+ "loss": 1.7779796600341797,
4328
+ "mean_token_accuracy": 0.6709600411355495,
4329
+ "num_tokens": 25660275.0,
4330
+ "step": 4320
4331
+ },
4332
+ {
4333
+ "entropy": 1.7193088322877883,
4334
+ "epoch": 7.440146142273802,
4335
+ "grad_norm": 0.8038010001182556,
4336
+ "learning_rate": 2.6840000000000004e-05,
4337
+ "loss": 1.7928234100341798,
4338
+ "mean_token_accuracy": 0.6767275612801313,
4339
+ "num_tokens": 25719958.0,
4340
+ "step": 4330
4341
+ },
4342
+ {
4343
+ "entropy": 1.7314304433763028,
4344
+ "epoch": 7.457339350956373,
4345
+ "grad_norm": 0.7783089876174927,
4346
+ "learning_rate": 2.6440000000000004e-05,
4347
+ "loss": 1.7952003479003906,
4348
+ "mean_token_accuracy": 0.6740467935800553,
4349
+ "num_tokens": 25776689.0,
4350
+ "step": 4340
4351
+ },
4352
+ {
4353
+ "entropy": 1.74028614833951,
4354
+ "epoch": 7.474532559638942,
4355
+ "grad_norm": 0.8052565455436707,
4356
+ "learning_rate": 2.6040000000000005e-05,
4357
+ "loss": 1.7803146362304687,
4358
+ "mean_token_accuracy": 0.6733121275901794,
4359
+ "num_tokens": 25837916.0,
4360
+ "step": 4350
4361
+ },
4362
+ {
4363
+ "entropy": 1.6831192195415496,
4364
+ "epoch": 7.491725768321513,
4365
+ "grad_norm": 0.8941977024078369,
4366
+ "learning_rate": 2.5640000000000002e-05,
4367
+ "loss": 1.7077743530273437,
4368
+ "mean_token_accuracy": 0.6749852932989597,
4369
+ "num_tokens": 25896712.0,
4370
+ "step": 4360
4371
+ },
4372
+ {
4373
+ "entropy": 1.7840609520673751,
4374
+ "epoch": 7.508918977004083,
4375
+ "grad_norm": 0.818671703338623,
4376
+ "learning_rate": 2.5240000000000002e-05,
4377
+ "loss": 1.8329656600952149,
4378
+ "mean_token_accuracy": 0.6679215718060731,
4379
+ "num_tokens": 25958383.0,
4380
+ "step": 4370
4381
+ },
4382
+ {
4383
+ "entropy": 1.76528559923172,
4384
+ "epoch": 7.526112185686654,
4385
+ "grad_norm": 0.7579294443130493,
4386
+ "learning_rate": 2.4840000000000003e-05,
4387
+ "loss": 1.7914703369140625,
4388
+ "mean_token_accuracy": 0.6695499271154404,
4389
+ "num_tokens": 26017754.0,
4390
+ "step": 4380
4391
+ },
4392
+ {
4393
+ "entropy": 1.704708030819893,
4394
+ "epoch": 7.5433053943692245,
4395
+ "grad_norm": 0.8200159668922424,
4396
+ "learning_rate": 2.4440000000000003e-05,
4397
+ "loss": 1.774311637878418,
4398
+ "mean_token_accuracy": 0.6739427134394645,
4399
+ "num_tokens": 26075760.0,
4400
+ "step": 4390
4401
+ },
4402
+ {
4403
+ "entropy": 1.7540104657411575,
4404
+ "epoch": 7.560498603051794,
4405
+ "grad_norm": 0.8373399972915649,
4406
+ "learning_rate": 2.404e-05,
4407
+ "loss": 1.796240997314453,
4408
+ "mean_token_accuracy": 0.6640590511262416,
4409
+ "num_tokens": 26133858.0,
4410
+ "step": 4400
4411
+ },
4412
+ {
4413
+ "entropy": 1.754172220826149,
4414
+ "epoch": 7.577691811734365,
4415
+ "grad_norm": 0.7368677258491516,
4416
+ "learning_rate": 2.364e-05,
4417
+ "loss": 1.8175994873046875,
4418
+ "mean_token_accuracy": 0.6717667855322361,
4419
+ "num_tokens": 26197518.0,
4420
+ "step": 4410
4421
+ },
4422
+ {
4423
+ "entropy": 1.6564558774232865,
4424
+ "epoch": 7.594885020416935,
4425
+ "grad_norm": 0.8868939280509949,
4426
+ "learning_rate": 2.324e-05,
4427
+ "loss": 1.669070053100586,
4428
+ "mean_token_accuracy": 0.6839951984584332,
4429
+ "num_tokens": 26250823.0,
4430
+ "step": 4420
4431
+ },
4432
+ {
4433
+ "entropy": 1.7594470486044884,
4434
+ "epoch": 7.612078229099506,
4435
+ "grad_norm": 0.86412513256073,
4436
+ "learning_rate": 2.284e-05,
4437
+ "loss": 1.8095222473144532,
4438
+ "mean_token_accuracy": 0.666244950518012,
4439
+ "num_tokens": 26312548.0,
4440
+ "step": 4430
4441
+ },
4442
+ {
4443
+ "entropy": 1.7646627604961396,
4444
+ "epoch": 7.6292714377820765,
4445
+ "grad_norm": 0.7128214836120605,
4446
+ "learning_rate": 2.244e-05,
4447
+ "loss": 1.832158660888672,
4448
+ "mean_token_accuracy": 0.6679420609027147,
4449
+ "num_tokens": 26376747.0,
4450
+ "step": 4440
4451
+ },
4452
+ {
4453
+ "entropy": 1.7401177063584328,
4454
+ "epoch": 7.646464646464646,
4455
+ "grad_norm": 0.7479432225227356,
4456
+ "learning_rate": 2.2040000000000002e-05,
4457
+ "loss": 1.7779264450073242,
4458
+ "mean_token_accuracy": 0.6710429213941097,
4459
+ "num_tokens": 26438907.0,
4460
+ "step": 4450
4461
+ },
4462
+ {
4463
+ "entropy": 1.6960709124803544,
4464
+ "epoch": 7.663657855147217,
4465
+ "grad_norm": 0.8182732462882996,
4466
+ "learning_rate": 2.1640000000000003e-05,
4467
+ "loss": 1.7709745407104491,
4468
+ "mean_token_accuracy": 0.6782359674572944,
4469
+ "num_tokens": 26499840.0,
4470
+ "step": 4460
4471
+ },
4472
+ {
4473
+ "entropy": 1.8024938970804214,
4474
+ "epoch": 7.680851063829787,
4475
+ "grad_norm": 0.8208670020103455,
4476
+ "learning_rate": 2.124e-05,
4477
+ "loss": 1.8752277374267579,
4478
+ "mean_token_accuracy": 0.6610838636755944,
4479
+ "num_tokens": 26561739.0,
4480
+ "step": 4470
4481
+ },
4482
+ {
4483
+ "entropy": 1.6679524429142476,
4484
+ "epoch": 7.698044272512358,
4485
+ "grad_norm": 0.7669119834899902,
4486
+ "learning_rate": 2.084e-05,
4487
+ "loss": 1.6840700149536132,
4488
+ "mean_token_accuracy": 0.6839361816644669,
4489
+ "num_tokens": 26618997.0,
4490
+ "step": 4480
4491
+ },
4492
+ {
4493
+ "entropy": 1.669876104593277,
4494
+ "epoch": 7.715237481194928,
4495
+ "grad_norm": 0.8296427130699158,
4496
+ "learning_rate": 2.044e-05,
4497
+ "loss": 1.6926704406738282,
4498
+ "mean_token_accuracy": 0.6837400387972593,
4499
+ "num_tokens": 26677617.0,
4500
+ "step": 4490
4501
+ },
4502
+ {
4503
+ "entropy": 1.7478768080472946,
4504
+ "epoch": 7.732430689877498,
4505
+ "grad_norm": 0.9231081008911133,
4506
+ "learning_rate": 2.004e-05,
4507
+ "loss": 1.8043970108032226,
4508
+ "mean_token_accuracy": 0.6680058591067791,
4509
+ "num_tokens": 26735542.0,
4510
+ "step": 4500
4511
  }
4512
  ],
4513
  "logging_steps": 10,
 
4527
  "attributes": {}
4528
  }
4529
  },
4530
+ "total_flos": 2.19451190411264e+17,
4531
  "train_batch_size": 2,
4532
  "trial_name": null,
4533
  "trial_params": null