Wilsonwin committed on
Commit
7184234
·
verified ·
1 Parent(s): 522809a

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e8ae4d439763bede675a7bb8407ca626ba1a1ca1d28d508145ff27990bcdfd60
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f198d05f5a6f7322d5950baad97f98d6f59bcdb9ed02f220583ce5fd10a379c7
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8df4f8c8c0f93c7a4647906cc1e5f85c72386b1b581eb687df3d305abbdc44a7
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:302e7a816c65dc7ea036853d2e134881bf37e4d7e3ce31f671702ad86c5f1616
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8af3cc3f1560f815527e73bcdf0bbfb03998a87b5067ff9928ca94f46e638231
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef678004bfc53268aeb4845a442c0327144244832e571a2be41a7160145765eb
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bda4b56b57284b5d776cea834f86539fa062d5e046885e07dcb7516921ccd6ee
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5732bb4fae95fda377427872ad7c4fed0c45a84922701b3143ffa39cf761f9db
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9292110153742186,
6
  "eval_steps": 500,
7
- "global_step": 5500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3953,6 +3953,364 @@
3953
  "eval_samples_per_second": 276.03,
3954
  "eval_steps_per_second": 5.797,
3955
  "step": 5500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3956
  }
3957
  ],
3958
  "logging_steps": 10,
@@ -3972,7 +4330,7 @@
3972
  "attributes": {}
3973
  }
3974
  },
3975
- "total_flos": 1.83951251472384e+17,
3976
  "train_batch_size": 48,
3977
  "trial_name": null,
3978
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0136847440446022,
6
  "eval_steps": 500,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3953
  "eval_samples_per_second": 276.03,
3954
  "eval_steps_per_second": 5.797,
3955
  "step": 5500
3956
+ },
3957
+ {
3958
+ "epoch": 0.9309004899476263,
3959
+ "grad_norm": 0.4981231689453125,
3960
+ "learning_rate": 0.00021527977734609537,
3961
+ "loss": 4.546344757080078,
3962
+ "step": 5510
3963
+ },
3964
+ {
3965
+ "epoch": 0.9325899645210339,
3966
+ "grad_norm": 0.5102471709251404,
3967
+ "learning_rate": 0.00021484818619522722,
3968
+ "loss": 4.555732727050781,
3969
+ "step": 5520
3970
+ },
3971
+ {
3972
+ "epoch": 0.9342794390944417,
3973
+ "grad_norm": 0.4952332675457001,
3974
+ "learning_rate": 0.00021441593376712224,
3975
+ "loss": 4.551390838623047,
3976
+ "step": 5530
3977
+ },
3978
+ {
3979
+ "epoch": 0.9359689136678493,
3980
+ "grad_norm": 0.48818397521972656,
3981
+ "learning_rate": 0.0002139830244695935,
3982
+ "loss": 4.579892349243164,
3983
+ "step": 5540
3984
+ },
3985
+ {
3986
+ "epoch": 0.937658388241257,
3987
+ "grad_norm": 0.5026493072509766,
3988
+ "learning_rate": 0.00021354946271715265,
3989
+ "loss": 4.550901794433594,
3990
+ "step": 5550
3991
+ },
3992
+ {
3993
+ "epoch": 0.9393478628146646,
3994
+ "grad_norm": 0.5072263479232788,
3995
+ "learning_rate": 0.00021311525293096444,
3996
+ "loss": 4.540952682495117,
3997
+ "step": 5560
3998
+ },
3999
+ {
4000
+ "epoch": 0.9410373373880723,
4001
+ "grad_norm": 0.506399929523468,
4002
+ "learning_rate": 0.00021268039953880184,
4003
+ "loss": 4.528360748291016,
4004
+ "step": 5570
4005
+ },
4006
+ {
4007
+ "epoch": 0.94272681196148,
4008
+ "grad_norm": 0.4890686571598053,
4009
+ "learning_rate": 0.00021224490697500088,
4010
+ "loss": 4.535118484497071,
4011
+ "step": 5580
4012
+ },
4013
+ {
4014
+ "epoch": 0.9444162865348876,
4015
+ "grad_norm": 0.53489750623703,
4016
+ "learning_rate": 0.00021180877968041552,
4017
+ "loss": 4.554486846923828,
4018
+ "step": 5590
4019
+ },
4020
+ {
4021
+ "epoch": 0.9461057611082954,
4022
+ "grad_norm": 0.5218686461448669,
4023
+ "learning_rate": 0.00021137202210237213,
4024
+ "loss": 4.540655136108398,
4025
+ "step": 5600
4026
+ },
4027
+ {
4028
+ "epoch": 0.947795235681703,
4029
+ "grad_norm": 0.5503886342048645,
4030
+ "learning_rate": 0.0002109346386946243,
4031
+ "loss": 4.5588642120361325,
4032
+ "step": 5610
4033
+ },
4034
+ {
4035
+ "epoch": 0.9494847102551106,
4036
+ "grad_norm": 0.4940740168094635,
4037
+ "learning_rate": 0.00021049663391730752,
4038
+ "loss": 4.542883682250976,
4039
+ "step": 5620
4040
+ },
4041
+ {
4042
+ "epoch": 0.9511741848285183,
4043
+ "grad_norm": 0.4873588979244232,
4044
+ "learning_rate": 0.00021005801223689344,
4045
+ "loss": 4.569081115722656,
4046
+ "step": 5630
4047
+ },
4048
+ {
4049
+ "epoch": 0.952863659401926,
4050
+ "grad_norm": 0.4529159367084503,
4051
+ "learning_rate": 0.00020961877812614458,
4052
+ "loss": 4.569264984130859,
4053
+ "step": 5640
4054
+ },
4055
+ {
4056
+ "epoch": 0.9545531339753337,
4057
+ "grad_norm": 0.5239872932434082,
4058
+ "learning_rate": 0.00020917893606406843,
4059
+ "loss": 4.523569869995117,
4060
+ "step": 5650
4061
+ },
4062
+ {
4063
+ "epoch": 0.9562426085487413,
4064
+ "grad_norm": 0.5355167388916016,
4065
+ "learning_rate": 0.0002087384905358722,
4066
+ "loss": 4.528088760375977,
4067
+ "step": 5660
4068
+ },
4069
+ {
4070
+ "epoch": 0.9579320831221491,
4071
+ "grad_norm": 0.5981546640396118,
4072
+ "learning_rate": 0.00020829744603291663,
4073
+ "loss": 4.515169525146485,
4074
+ "step": 5670
4075
+ },
4076
+ {
4077
+ "epoch": 0.9596215576955567,
4078
+ "grad_norm": 0.5237213969230652,
4079
+ "learning_rate": 0.00020785580705267047,
4080
+ "loss": 4.559556198120117,
4081
+ "step": 5680
4082
+ },
4083
+ {
4084
+ "epoch": 0.9613110322689643,
4085
+ "grad_norm": 0.5060997009277344,
4086
+ "learning_rate": 0.00020741357809866447,
4087
+ "loss": 4.5545307159423825,
4088
+ "step": 5690
4089
+ },
4090
+ {
4091
+ "epoch": 0.963000506842372,
4092
+ "grad_norm": 0.4923208951950073,
4093
+ "learning_rate": 0.0002069707636804457,
4094
+ "loss": 4.550180053710937,
4095
+ "step": 5700
4096
+ },
4097
+ {
4098
+ "epoch": 0.9646899814157797,
4099
+ "grad_norm": 0.5420214533805847,
4100
+ "learning_rate": 0.0002065273683135312,
4101
+ "loss": 4.5501148223876955,
4102
+ "step": 5710
4103
+ },
4104
+ {
4105
+ "epoch": 0.9663794559891874,
4106
+ "grad_norm": 0.48059365153312683,
4107
+ "learning_rate": 0.00020608339651936224,
4108
+ "loss": 4.532232284545898,
4109
+ "step": 5720
4110
+ },
4111
+ {
4112
+ "epoch": 0.968068930562595,
4113
+ "grad_norm": 0.49933409690856934,
4114
+ "learning_rate": 0.00020563885282525802,
4115
+ "loss": 4.532613372802734,
4116
+ "step": 5730
4117
+ },
4118
+ {
4119
+ "epoch": 0.9697584051360028,
4120
+ "grad_norm": 0.5204219222068787,
4121
+ "learning_rate": 0.00020519374176436968,
4122
+ "loss": 4.545319366455078,
4123
+ "step": 5740
4124
+ },
4125
+ {
4126
+ "epoch": 0.9714478797094104,
4127
+ "grad_norm": 0.49551549553871155,
4128
+ "learning_rate": 0.00020474806787563392,
4129
+ "loss": 4.532552337646484,
4130
+ "step": 5750
4131
+ },
4132
+ {
4133
+ "epoch": 0.973137354282818,
4134
+ "grad_norm": 0.512352705001831,
4135
+ "learning_rate": 0.0002043018357037267,
4136
+ "loss": 4.541680908203125,
4137
+ "step": 5760
4138
+ },
4139
+ {
4140
+ "epoch": 0.9748268288562257,
4141
+ "grad_norm": 0.49258846044540405,
4142
+ "learning_rate": 0.00020385504979901712,
4143
+ "loss": 4.545413970947266,
4144
+ "step": 5770
4145
+ },
4146
+ {
4147
+ "epoch": 0.9765163034296334,
4148
+ "grad_norm": 0.5037888288497925,
4149
+ "learning_rate": 0.00020340771471752078,
4150
+ "loss": 4.531426239013672,
4151
+ "step": 5780
4152
+ },
4153
+ {
4154
+ "epoch": 0.9782057780030411,
4155
+ "grad_norm": 0.526168167591095,
4156
+ "learning_rate": 0.0002029598350208534,
4157
+ "loss": 4.524025344848633,
4158
+ "step": 5790
4159
+ },
4160
+ {
4161
+ "epoch": 0.9798952525764487,
4162
+ "grad_norm": 0.5037376880645752,
4163
+ "learning_rate": 0.00020251141527618434,
4164
+ "loss": 4.531801223754883,
4165
+ "step": 5800
4166
+ },
4167
+ {
4168
+ "epoch": 0.9815847271498563,
4169
+ "grad_norm": 0.49936115741729736,
4170
+ "learning_rate": 0.00020206246005618998,
4171
+ "loss": 4.517900848388672,
4172
+ "step": 5810
4173
+ },
4174
+ {
4175
+ "epoch": 0.9832742017232641,
4176
+ "grad_norm": 0.5162473917007446,
4177
+ "learning_rate": 0.00020161297393900713,
4178
+ "loss": 4.51179313659668,
4179
+ "step": 5820
4180
+ },
4181
+ {
4182
+ "epoch": 0.9849636762966717,
4183
+ "grad_norm": 0.5165606141090393,
4184
+ "learning_rate": 0.00020116296150818623,
4185
+ "loss": 4.53326187133789,
4186
+ "step": 5830
4187
+ },
4188
+ {
4189
+ "epoch": 0.9866531508700794,
4190
+ "grad_norm": 0.5134915113449097,
4191
+ "learning_rate": 0.0002007124273526449,
4192
+ "loss": 4.505707168579102,
4193
+ "step": 5840
4194
+ },
4195
+ {
4196
+ "epoch": 0.988342625443487,
4197
+ "grad_norm": 0.4991665184497833,
4198
+ "learning_rate": 0.00020026137606662077,
4199
+ "loss": 4.525319671630859,
4200
+ "step": 5850
4201
+ },
4202
+ {
4203
+ "epoch": 0.9900321000168948,
4204
+ "grad_norm": 0.5060558915138245,
4205
+ "learning_rate": 0.0001998098122496249,
4206
+ "loss": 4.531586456298828,
4207
+ "step": 5860
4208
+ },
4209
+ {
4210
+ "epoch": 0.9917215745903024,
4211
+ "grad_norm": 0.5269056558609009,
4212
+ "learning_rate": 0.00019935774050639472,
4213
+ "loss": 4.517117691040039,
4214
+ "step": 5870
4215
+ },
4216
+ {
4217
+ "epoch": 0.99341104916371,
4218
+ "grad_norm": 0.5109555125236511,
4219
+ "learning_rate": 0.0001989051654468473,
4220
+ "loss": 4.501250076293945,
4221
+ "step": 5880
4222
+ },
4223
+ {
4224
+ "epoch": 0.9951005237371178,
4225
+ "grad_norm": 0.5808560848236084,
4226
+ "learning_rate": 0.00019845209168603195,
4227
+ "loss": 4.523174285888672,
4228
+ "step": 5890
4229
+ },
4230
+ {
4231
+ "epoch": 0.9967899983105254,
4232
+ "grad_norm": 0.509011447429657,
4233
+ "learning_rate": 0.00019799852384408355,
4234
+ "loss": 4.526637268066406,
4235
+ "step": 5900
4236
+ },
4237
+ {
4238
+ "epoch": 0.9984794728839331,
4239
+ "grad_norm": 0.48725831508636475,
4240
+ "learning_rate": 0.00019754446654617527,
4241
+ "loss": 4.508483123779297,
4242
+ "step": 5910
4243
+ },
4244
+ {
4245
+ "epoch": 1.0001689474573408,
4246
+ "grad_norm": 0.5509161353111267,
4247
+ "learning_rate": 0.00019708992442247136,
4248
+ "loss": 4.524269866943359,
4249
+ "step": 5920
4250
+ },
4251
+ {
4252
+ "epoch": 1.0018584220307485,
4253
+ "grad_norm": 0.5454822182655334,
4254
+ "learning_rate": 0.0001966349021080801,
4255
+ "loss": 4.459218597412109,
4256
+ "step": 5930
4257
+ },
4258
+ {
4259
+ "epoch": 1.003547896604156,
4260
+ "grad_norm": 0.5172731280326843,
4261
+ "learning_rate": 0.0001961794042430062,
4262
+ "loss": 4.506275939941406,
4263
+ "step": 5940
4264
+ },
4265
+ {
4266
+ "epoch": 1.0052373711775637,
4267
+ "grad_norm": 0.4621833562850952,
4268
+ "learning_rate": 0.000195723435472104,
4269
+ "loss": 4.4765571594238285,
4270
+ "step": 5950
4271
+ },
4272
+ {
4273
+ "epoch": 1.0069268457509715,
4274
+ "grad_norm": 0.4911273717880249,
4275
+ "learning_rate": 0.00019526700044502956,
4276
+ "loss": 4.482321929931641,
4277
+ "step": 5960
4278
+ },
4279
+ {
4280
+ "epoch": 1.0086163203243792,
4281
+ "grad_norm": 0.4957529306411743,
4282
+ "learning_rate": 0.0001948101038161937,
4283
+ "loss": 4.473802947998047,
4284
+ "step": 5970
4285
+ },
4286
+ {
4287
+ "epoch": 1.0103057948977867,
4288
+ "grad_norm": 0.4635160267353058,
4289
+ "learning_rate": 0.0001943527502447141,
4290
+ "loss": 4.482971572875977,
4291
+ "step": 5980
4292
+ },
4293
+ {
4294
+ "epoch": 1.0119952694711944,
4295
+ "grad_norm": 0.4733022451400757,
4296
+ "learning_rate": 0.00019389494439436836,
4297
+ "loss": 4.454212188720703,
4298
+ "step": 5990
4299
+ },
4300
+ {
4301
+ "epoch": 1.0136847440446022,
4302
+ "grad_norm": 0.49631762504577637,
4303
+ "learning_rate": 0.0001934366909335458,
4304
+ "loss": 4.491296005249024,
4305
+ "step": 6000
4306
+ },
4307
+ {
4308
+ "epoch": 1.0136847440446022,
4309
+ "eval_loss": 4.470248699188232,
4310
+ "eval_runtime": 3.6231,
4311
+ "eval_samples_per_second": 276.003,
4312
+ "eval_steps_per_second": 5.796,
4313
+ "step": 6000
4314
  }
4315
  ],
4316
  "logging_steps": 10,
 
4330
  "attributes": {}
4331
  }
4332
  },
4333
+ "total_flos": 2.0067200216019763e+17,
4334
  "train_batch_size": 48,
4335
  "trial_name": null,
4336
  "trial_params": null