Wilsonwin commited on
Commit
d0eb430
·
verified ·
1 Parent(s): 7f762e0

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:850517b9cf5da4903168f8b9dbfcfcb01385d34bc0d5bd1c93041c99d5afbbab
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c74bfe809433060df3635ef406235f0717bc42781fff9acd5df0f855eb57b3f
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8eaed0cac576a8a9a03addbea043ecae521ca2a1d3d91c2f8f4543bcfc559783
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70990f23441c3c0fadf8ff7b5b48864178e6a3f9dbc5c1184cb7c19ddf968c0f
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a9c47849ad44860f45019fca12bd8b47e7589be1317a01ad6705b924156a6be
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11940f1313899a11d3e47a2d43f508134dd8e03ac7613f4eca32c754da2d1839
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bda4b56b57284b5d776cea834f86539fa062d5e046885e07dcb7516921ccd6ee
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5732bb4fae95fda377427872ad7c4fed0c45a84922701b3143ffa39cf761f9db
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9292110153742186,
6
  "eval_steps": 500,
7
- "global_step": 5500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3953,6 +3953,364 @@
3953
  "eval_samples_per_second": 275.136,
3954
  "eval_steps_per_second": 5.778,
3955
  "step": 5500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3956
  }
3957
  ],
3958
  "logging_steps": 10,
@@ -3972,7 +4330,7 @@
3972
  "attributes": {}
3973
  }
3974
  },
3975
- "total_flos": 1.83951251472384e+17,
3976
  "train_batch_size": 48,
3977
  "trial_name": null,
3978
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0136847440446022,
6
  "eval_steps": 500,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3953
  "eval_samples_per_second": 275.136,
3954
  "eval_steps_per_second": 5.778,
3955
  "step": 5500
3956
+ },
3957
+ {
3958
+ "epoch": 0.9309004899476263,
3959
+ "grad_norm": 0.4983241558074951,
3960
+ "learning_rate": 0.00021527977734609537,
3961
+ "loss": 4.547625732421875,
3962
+ "step": 5510
3963
+ },
3964
+ {
3965
+ "epoch": 0.9325899645210339,
3966
+ "grad_norm": 0.5012770295143127,
3967
+ "learning_rate": 0.00021484818619522722,
3968
+ "loss": 4.557040023803711,
3969
+ "step": 5520
3970
+ },
3971
+ {
3972
+ "epoch": 0.9342794390944417,
3973
+ "grad_norm": 0.5078200101852417,
3974
+ "learning_rate": 0.00021441593376712224,
3975
+ "loss": 4.553184890747071,
3976
+ "step": 5530
3977
+ },
3978
+ {
3979
+ "epoch": 0.9359689136678493,
3980
+ "grad_norm": 0.48705384135246277,
3981
+ "learning_rate": 0.0002139830244695935,
3982
+ "loss": 4.5813232421875,
3983
+ "step": 5540
3984
+ },
3985
+ {
3986
+ "epoch": 0.937658388241257,
3987
+ "grad_norm": 0.5023474097251892,
3988
+ "learning_rate": 0.00021354946271715265,
3989
+ "loss": 4.552815628051758,
3990
+ "step": 5550
3991
+ },
3992
+ {
3993
+ "epoch": 0.9393478628146646,
3994
+ "grad_norm": 0.5058281421661377,
3995
+ "learning_rate": 0.00021311525293096444,
3996
+ "loss": 4.541165924072265,
3997
+ "step": 5560
3998
+ },
3999
+ {
4000
+ "epoch": 0.9410373373880723,
4001
+ "grad_norm": 0.5129496455192566,
4002
+ "learning_rate": 0.00021268039953880184,
4003
+ "loss": 4.529154968261719,
4004
+ "step": 5570
4005
+ },
4006
+ {
4007
+ "epoch": 0.94272681196148,
4008
+ "grad_norm": 0.5097109079360962,
4009
+ "learning_rate": 0.00021224490697500088,
4010
+ "loss": 4.535088348388672,
4011
+ "step": 5580
4012
+ },
4013
+ {
4014
+ "epoch": 0.9444162865348876,
4015
+ "grad_norm": 0.5103420615196228,
4016
+ "learning_rate": 0.00021180877968041552,
4017
+ "loss": 4.553527069091797,
4018
+ "step": 5590
4019
+ },
4020
+ {
4021
+ "epoch": 0.9461057611082954,
4022
+ "grad_norm": 0.4936409294605255,
4023
+ "learning_rate": 0.00021137202210237213,
4024
+ "loss": 4.54007568359375,
4025
+ "step": 5600
4026
+ },
4027
+ {
4028
+ "epoch": 0.947795235681703,
4029
+ "grad_norm": 0.5701144933700562,
4030
+ "learning_rate": 0.0002109346386946243,
4031
+ "loss": 4.558887100219726,
4032
+ "step": 5610
4033
+ },
4034
+ {
4035
+ "epoch": 0.9494847102551106,
4036
+ "grad_norm": 0.4890182912349701,
4037
+ "learning_rate": 0.00021049663391730752,
4038
+ "loss": 4.543179702758789,
4039
+ "step": 5620
4040
+ },
4041
+ {
4042
+ "epoch": 0.9511741848285183,
4043
+ "grad_norm": 0.5074143409729004,
4044
+ "learning_rate": 0.00021005801223689344,
4045
+ "loss": 4.5704292297363285,
4046
+ "step": 5630
4047
+ },
4048
+ {
4049
+ "epoch": 0.952863659401926,
4050
+ "grad_norm": 0.4767675995826721,
4051
+ "learning_rate": 0.00020961877812614458,
4052
+ "loss": 4.569948196411133,
4053
+ "step": 5640
4054
+ },
4055
+ {
4056
+ "epoch": 0.9545531339753337,
4057
+ "grad_norm": 0.5034293532371521,
4058
+ "learning_rate": 0.00020917893606406843,
4059
+ "loss": 4.524322128295898,
4060
+ "step": 5650
4061
+ },
4062
+ {
4063
+ "epoch": 0.9562426085487413,
4064
+ "grad_norm": 0.5619840621948242,
4065
+ "learning_rate": 0.0002087384905358722,
4066
+ "loss": 4.528865051269531,
4067
+ "step": 5660
4068
+ },
4069
+ {
4070
+ "epoch": 0.9579320831221491,
4071
+ "grad_norm": 0.5692474842071533,
4072
+ "learning_rate": 0.00020829744603291663,
4073
+ "loss": 4.5155292510986325,
4074
+ "step": 5670
4075
+ },
4076
+ {
4077
+ "epoch": 0.9596215576955567,
4078
+ "grad_norm": 0.504224419593811,
4079
+ "learning_rate": 0.00020785580705267047,
4080
+ "loss": 4.559905624389648,
4081
+ "step": 5680
4082
+ },
4083
+ {
4084
+ "epoch": 0.9613110322689643,
4085
+ "grad_norm": 0.563014805316925,
4086
+ "learning_rate": 0.00020741357809866447,
4087
+ "loss": 4.556017303466797,
4088
+ "step": 5690
4089
+ },
4090
+ {
4091
+ "epoch": 0.963000506842372,
4092
+ "grad_norm": 0.4872301518917084,
4093
+ "learning_rate": 0.0002069707636804457,
4094
+ "loss": 4.550839233398437,
4095
+ "step": 5700
4096
+ },
4097
+ {
4098
+ "epoch": 0.9646899814157797,
4099
+ "grad_norm": 0.5135483145713806,
4100
+ "learning_rate": 0.0002065273683135312,
4101
+ "loss": 4.550697708129883,
4102
+ "step": 5710
4103
+ },
4104
+ {
4105
+ "epoch": 0.9663794559891874,
4106
+ "grad_norm": 0.4852290451526642,
4107
+ "learning_rate": 0.00020608339651936224,
4108
+ "loss": 4.531842422485352,
4109
+ "step": 5720
4110
+ },
4111
+ {
4112
+ "epoch": 0.968068930562595,
4113
+ "grad_norm": 0.5045028924942017,
4114
+ "learning_rate": 0.00020563885282525802,
4115
+ "loss": 4.532521057128906,
4116
+ "step": 5730
4117
+ },
4118
+ {
4119
+ "epoch": 0.9697584051360028,
4120
+ "grad_norm": 0.530616044998169,
4121
+ "learning_rate": 0.00020519374176436968,
4122
+ "loss": 4.546891403198242,
4123
+ "step": 5740
4124
+ },
4125
+ {
4126
+ "epoch": 0.9714478797094104,
4127
+ "grad_norm": 0.49565091729164124,
4128
+ "learning_rate": 0.00020474806787563392,
4129
+ "loss": 4.533766555786133,
4130
+ "step": 5750
4131
+ },
4132
+ {
4133
+ "epoch": 0.973137354282818,
4134
+ "grad_norm": 0.5225724577903748,
4135
+ "learning_rate": 0.0002043018357037267,
4136
+ "loss": 4.542942810058594,
4137
+ "step": 5760
4138
+ },
4139
+ {
4140
+ "epoch": 0.9748268288562257,
4141
+ "grad_norm": 0.49189162254333496,
4142
+ "learning_rate": 0.00020385504979901712,
4143
+ "loss": 4.545899200439453,
4144
+ "step": 5770
4145
+ },
4146
+ {
4147
+ "epoch": 0.9765163034296334,
4148
+ "grad_norm": 0.5116291642189026,
4149
+ "learning_rate": 0.00020340771471752078,
4150
+ "loss": 4.532541656494141,
4151
+ "step": 5780
4152
+ },
4153
+ {
4154
+ "epoch": 0.9782057780030411,
4155
+ "grad_norm": 0.5132644772529602,
4156
+ "learning_rate": 0.0002029598350208534,
4157
+ "loss": 4.524928283691406,
4158
+ "step": 5790
4159
+ },
4160
+ {
4161
+ "epoch": 0.9798952525764487,
4162
+ "grad_norm": 0.4904372990131378,
4163
+ "learning_rate": 0.00020251141527618434,
4164
+ "loss": 4.532776641845703,
4165
+ "step": 5800
4166
+ },
4167
+ {
4168
+ "epoch": 0.9815847271498563,
4169
+ "grad_norm": 0.48598089814186096,
4170
+ "learning_rate": 0.00020206246005618998,
4171
+ "loss": 4.519465637207031,
4172
+ "step": 5810
4173
+ },
4174
+ {
4175
+ "epoch": 0.9832742017232641,
4176
+ "grad_norm": 0.5415476560592651,
4177
+ "learning_rate": 0.00020161297393900713,
4178
+ "loss": 4.512179565429688,
4179
+ "step": 5820
4180
+ },
4181
+ {
4182
+ "epoch": 0.9849636762966717,
4183
+ "grad_norm": 0.5061231255531311,
4184
+ "learning_rate": 0.00020116296150818623,
4185
+ "loss": 4.534863662719727,
4186
+ "step": 5830
4187
+ },
4188
+ {
4189
+ "epoch": 0.9866531508700794,
4190
+ "grad_norm": 0.5157834887504578,
4191
+ "learning_rate": 0.0002007124273526449,
4192
+ "loss": 4.50738639831543,
4193
+ "step": 5840
4194
+ },
4195
+ {
4196
+ "epoch": 0.988342625443487,
4197
+ "grad_norm": 0.509292483329773,
4198
+ "learning_rate": 0.00020026137606662077,
4199
+ "loss": 4.5266845703125,
4200
+ "step": 5850
4201
+ },
4202
+ {
4203
+ "epoch": 0.9900321000168948,
4204
+ "grad_norm": 0.5107020139694214,
4205
+ "learning_rate": 0.0001998098122496249,
4206
+ "loss": 4.533035659790039,
4207
+ "step": 5860
4208
+ },
4209
+ {
4210
+ "epoch": 0.9917215745903024,
4211
+ "grad_norm": 0.5432437062263489,
4212
+ "learning_rate": 0.00019935774050639472,
4213
+ "loss": 4.518278884887695,
4214
+ "step": 5870
4215
+ },
4216
+ {
4217
+ "epoch": 0.99341104916371,
4218
+ "grad_norm": 0.5360410213470459,
4219
+ "learning_rate": 0.0001989051654468473,
4220
+ "loss": 4.502675628662109,
4221
+ "step": 5880
4222
+ },
4223
+ {
4224
+ "epoch": 0.9951005237371178,
4225
+ "grad_norm": 0.5418276786804199,
4226
+ "learning_rate": 0.00019845209168603195,
4227
+ "loss": 4.5235343933105465,
4228
+ "step": 5890
4229
+ },
4230
+ {
4231
+ "epoch": 0.9967899983105254,
4232
+ "grad_norm": 0.5157185792922974,
4233
+ "learning_rate": 0.00019799852384408355,
4234
+ "loss": 4.524081420898438,
4235
+ "step": 5900
4236
+ },
4237
+ {
4238
+ "epoch": 0.9984794728839331,
4239
+ "grad_norm": 0.5043293237686157,
4240
+ "learning_rate": 0.00019754446654617527,
4241
+ "loss": 4.508223342895508,
4242
+ "step": 5910
4243
+ },
4244
+ {
4245
+ "epoch": 1.0001689474573408,
4246
+ "grad_norm": 0.5386601090431213,
4247
+ "learning_rate": 0.00019708992442247136,
4248
+ "loss": 4.5236083984375,
4249
+ "step": 5920
4250
+ },
4251
+ {
4252
+ "epoch": 1.0018584220307485,
4253
+ "grad_norm": 0.5341511368751526,
4254
+ "learning_rate": 0.0001966349021080801,
4255
+ "loss": 4.459320068359375,
4256
+ "step": 5930
4257
+ },
4258
+ {
4259
+ "epoch": 1.003547896604156,
4260
+ "grad_norm": 0.5038416981697083,
4261
+ "learning_rate": 0.0001961794042430062,
4262
+ "loss": 4.505880355834961,
4263
+ "step": 5940
4264
+ },
4265
+ {
4266
+ "epoch": 1.0052373711775637,
4267
+ "grad_norm": 0.47585076093673706,
4268
+ "learning_rate": 0.000195723435472104,
4269
+ "loss": 4.477125930786133,
4270
+ "step": 5950
4271
+ },
4272
+ {
4273
+ "epoch": 1.0069268457509715,
4274
+ "grad_norm": 0.49405696988105774,
4275
+ "learning_rate": 0.00019526700044502956,
4276
+ "loss": 4.483388137817383,
4277
+ "step": 5960
4278
+ },
4279
+ {
4280
+ "epoch": 1.0086163203243792,
4281
+ "grad_norm": 0.47832658886909485,
4282
+ "learning_rate": 0.0001948101038161937,
4283
+ "loss": 4.474266052246094,
4284
+ "step": 5970
4285
+ },
4286
+ {
4287
+ "epoch": 1.0103057948977867,
4288
+ "grad_norm": 0.470113068819046,
4289
+ "learning_rate": 0.0001943527502447141,
4290
+ "loss": 4.483303833007812,
4291
+ "step": 5980
4292
+ },
4293
+ {
4294
+ "epoch": 1.0119952694711944,
4295
+ "grad_norm": 0.4839136004447937,
4296
+ "learning_rate": 0.00019389494439436836,
4297
+ "loss": 4.453615188598633,
4298
+ "step": 5990
4299
+ },
4300
+ {
4301
+ "epoch": 1.0136847440446022,
4302
+ "grad_norm": 0.482327401638031,
4303
+ "learning_rate": 0.0001934366909335458,
4304
+ "loss": 4.491983413696289,
4305
+ "step": 6000
4306
+ },
4307
+ {
4308
+ "epoch": 1.0136847440446022,
4309
+ "eval_loss": 4.487085819244385,
4310
+ "eval_runtime": 4.7973,
4311
+ "eval_samples_per_second": 208.452,
4312
+ "eval_steps_per_second": 4.377,
4313
+ "step": 6000
4314
  }
4315
  ],
4316
  "logging_steps": 10,
 
4330
  "attributes": {}
4331
  }
4332
  },
4333
+ "total_flos": 2.0067200216019763e+17,
4334
  "train_batch_size": 48,
4335
  "trial_name": null,
4336
  "trial_params": null