Wilsonwin commited on
Commit
baf5c29
·
verified ·
1 Parent(s): a0f6758

Training in progress, step 9000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b08dad283213606d07f6ab5db889fe475967297819d0fa97888daa2251428bc5
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b6a100ff38d4f00b501f20a5190982d96bca76e8f9a3dd9afd41838295e088c
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a825b2575d588500993f41103ac272cc25e9d2d7632d64e83467f98084e396cb
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:318e4ae9012739627ee7e1642d03ed5987f8ac51a72f0db40543b41f04528304
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2170c077dd4bfe6d0d497b721bc49c7786a9b4086e60e7a16be839d33838b66
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3343121e0ab3aeb674ab29d872307564462c4bd82cdd92e6577a4ff26999fc00
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cda9bcc9266ec91d2da20eab50cd7cea609c16666645a54519c40bab7f69f1a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:528ba9a1d2a5739586b1652bb1454f9e977f93a6ae9e9c38a71b51bc41c45de4
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.4360533873965196,
6
  "eval_steps": 500,
7
- "global_step": 8500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6101,6 +6101,364 @@
6101
  "eval_samples_per_second": 186.962,
6102
  "eval_steps_per_second": 3.926,
6103
  "step": 8500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6104
  }
6105
  ],
6106
  "logging_steps": 10,
@@ -6120,7 +6478,7 @@
6120
  "attributes": {}
6121
  }
6122
  },
6123
- "total_flos": 2.8428620737491763e+17,
6124
  "train_batch_size": 48,
6125
  "trial_name": null,
6126
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.5205271160669032,
6
  "eval_steps": 500,
7
+ "global_step": 9000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6101
  "eval_samples_per_second": 186.962,
6102
  "eval_steps_per_second": 3.926,
6103
  "step": 8500
6104
+ },
6105
+ {
6106
+ "epoch": 1.4377428619699273,
6107
+ "grad_norm": 0.4886073172092438,
6108
+ "learning_rate": 7.706964398504293e-05,
6109
+ "loss": 4.375003814697266,
6110
+ "step": 8510
6111
+ },
6112
+ {
6113
+ "epoch": 1.439432336543335,
6114
+ "grad_norm": 0.48204493522644043,
6115
+ "learning_rate": 7.665144535049224e-05,
6116
+ "loss": 4.33798828125,
6117
+ "step": 8520
6118
+ },
6119
+ {
6120
+ "epoch": 1.4411218111167428,
6121
+ "grad_norm": 0.46427401900291443,
6122
+ "learning_rate": 7.623399467409416e-05,
6123
+ "loss": 4.350474166870117,
6124
+ "step": 8530
6125
+ },
6126
+ {
6127
+ "epoch": 1.4428112856901505,
6128
+ "grad_norm": 0.48773905634880066,
6129
+ "learning_rate": 7.581729621272386e-05,
6130
+ "loss": 4.331120300292969,
6131
+ "step": 8540
6132
+ },
6133
+ {
6134
+ "epoch": 1.444500760263558,
6135
+ "grad_norm": 0.49667221307754517,
6136
+ "learning_rate": 7.540135421558585e-05,
6137
+ "loss": 4.313655090332031,
6138
+ "step": 8550
6139
+ },
6140
+ {
6141
+ "epoch": 1.4461902348369657,
6142
+ "grad_norm": 0.4690784215927124,
6143
+ "learning_rate": 7.498617292417074e-05,
6144
+ "loss": 4.368997573852539,
6145
+ "step": 8560
6146
+ },
6147
+ {
6148
+ "epoch": 1.4478797094103735,
6149
+ "grad_norm": 0.48315441608428955,
6150
+ "learning_rate": 7.457175657221194e-05,
6151
+ "loss": 4.365554428100586,
6152
+ "step": 8570
6153
+ },
6154
+ {
6155
+ "epoch": 1.449569183983781,
6156
+ "grad_norm": 0.47335466742515564,
6157
+ "learning_rate": 7.415810938564277e-05,
6158
+ "loss": 4.337088394165039,
6159
+ "step": 8580
6160
+ },
6161
+ {
6162
+ "epoch": 1.4512586585571887,
6163
+ "grad_norm": 0.45826077461242676,
6164
+ "learning_rate": 7.37452355825528e-05,
6165
+ "loss": 4.34198112487793,
6166
+ "step": 8590
6167
+ },
6168
+ {
6169
+ "epoch": 1.4529481331305965,
6170
+ "grad_norm": 0.4623316824436188,
6171
+ "learning_rate": 7.333313937314548e-05,
6172
+ "loss": 4.346709442138672,
6173
+ "step": 8600
6174
+ },
6175
+ {
6176
+ "epoch": 1.454637607704004,
6177
+ "grad_norm": 0.48673200607299805,
6178
+ "learning_rate": 7.292182495969462e-05,
6179
+ "loss": 4.370217514038086,
6180
+ "step": 8610
6181
+ },
6182
+ {
6183
+ "epoch": 1.4563270822774117,
6184
+ "grad_norm": 0.4870317280292511,
6185
+ "learning_rate": 7.251129653650206e-05,
6186
+ "loss": 4.340325927734375,
6187
+ "step": 8620
6188
+ },
6189
+ {
6190
+ "epoch": 1.4580165568508194,
6191
+ "grad_norm": 0.4829833507537842,
6192
+ "learning_rate": 7.210155828985447e-05,
6193
+ "loss": 4.333442687988281,
6194
+ "step": 8630
6195
+ },
6196
+ {
6197
+ "epoch": 1.459706031424227,
6198
+ "grad_norm": 0.4647566080093384,
6199
+ "learning_rate": 7.169261439798083e-05,
6200
+ "loss": 4.3144184112548825,
6201
+ "step": 8640
6202
+ },
6203
+ {
6204
+ "epoch": 1.4613955059976347,
6205
+ "grad_norm": 0.48941001296043396,
6206
+ "learning_rate": 7.128446903101004e-05,
6207
+ "loss": 4.31253662109375,
6208
+ "step": 8650
6209
+ },
6210
+ {
6211
+ "epoch": 1.4630849805710424,
6212
+ "grad_norm": 0.46602746844291687,
6213
+ "learning_rate": 7.087712635092802e-05,
6214
+ "loss": 4.346303176879883,
6215
+ "step": 8660
6216
+ },
6217
+ {
6218
+ "epoch": 1.4647744551444501,
6219
+ "grad_norm": 0.5055034756660461,
6220
+ "learning_rate": 7.047059051153538e-05,
6221
+ "loss": 4.3370361328125,
6222
+ "step": 8670
6223
+ },
6224
+ {
6225
+ "epoch": 1.4664639297178579,
6226
+ "grad_norm": 0.49361884593963623,
6227
+ "learning_rate": 7.006486565840532e-05,
6228
+ "loss": 4.337132263183594,
6229
+ "step": 8680
6230
+ },
6231
+ {
6232
+ "epoch": 1.4681534042912654,
6233
+ "grad_norm": 0.4785706400871277,
6234
+ "learning_rate": 6.96599559288411e-05,
6235
+ "loss": 4.349030303955078,
6236
+ "step": 8690
6237
+ },
6238
+ {
6239
+ "epoch": 1.4698428788646731,
6240
+ "grad_norm": 0.49940159916877747,
6241
+ "learning_rate": 6.925586545183383e-05,
6242
+ "loss": 4.356793212890625,
6243
+ "step": 8700
6244
+ },
6245
+ {
6246
+ "epoch": 1.4715323534380809,
6247
+ "grad_norm": 0.4632912576198578,
6248
+ "learning_rate": 6.885259834802042e-05,
6249
+ "loss": 4.333657836914062,
6250
+ "step": 8710
6251
+ },
6252
+ {
6253
+ "epoch": 1.4732218280114884,
6254
+ "grad_norm": 0.4802776575088501,
6255
+ "learning_rate": 6.845015872964179e-05,
6256
+ "loss": 4.345002365112305,
6257
+ "step": 8720
6258
+ },
6259
+ {
6260
+ "epoch": 1.4749113025848961,
6261
+ "grad_norm": 0.4794064164161682,
6262
+ "learning_rate": 6.80485507005005e-05,
6263
+ "loss": 4.348992538452149,
6264
+ "step": 8730
6265
+ },
6266
+ {
6267
+ "epoch": 1.4766007771583038,
6268
+ "grad_norm": 0.48898664116859436,
6269
+ "learning_rate": 6.764777835591921e-05,
6270
+ "loss": 4.341244125366211,
6271
+ "step": 8740
6272
+ },
6273
+ {
6274
+ "epoch": 1.4782902517317114,
6275
+ "grad_norm": 0.4965602159500122,
6276
+ "learning_rate": 6.724784578269892e-05,
6277
+ "loss": 4.321900939941406,
6278
+ "step": 8750
6279
+ },
6280
+ {
6281
+ "epoch": 1.479979726305119,
6282
+ "grad_norm": 0.4652167856693268,
6283
+ "learning_rate": 6.684875705907722e-05,
6284
+ "loss": 4.334490203857422,
6285
+ "step": 8760
6286
+ },
6287
+ {
6288
+ "epoch": 1.4816692008785268,
6289
+ "grad_norm": 0.4919753968715668,
6290
+ "learning_rate": 6.645051625468657e-05,
6291
+ "loss": 4.318844604492187,
6292
+ "step": 8770
6293
+ },
6294
+ {
6295
+ "epoch": 1.4833586754519343,
6296
+ "grad_norm": 0.48315659165382385,
6297
+ "learning_rate": 6.605312743051297e-05,
6298
+ "loss": 4.349975967407227,
6299
+ "step": 8780
6300
+ },
6301
+ {
6302
+ "epoch": 1.485048150025342,
6303
+ "grad_norm": 0.4814257323741913,
6304
+ "learning_rate": 6.565659463885467e-05,
6305
+ "loss": 4.339570236206055,
6306
+ "step": 8790
6307
+ },
6308
+ {
6309
+ "epoch": 1.4867376245987498,
6310
+ "grad_norm": 0.48735612630844116,
6311
+ "learning_rate": 6.526092192328048e-05,
6312
+ "loss": 4.335529708862305,
6313
+ "step": 8800
6314
+ },
6315
+ {
6316
+ "epoch": 1.4884270991721575,
6317
+ "grad_norm": 0.4753458499908447,
6318
+ "learning_rate": 6.486611331858879e-05,
6319
+ "loss": 4.328804779052734,
6320
+ "step": 8810
6321
+ },
6322
+ {
6323
+ "epoch": 1.490116573745565,
6324
+ "grad_norm": 0.46705493330955505,
6325
+ "learning_rate": 6.447217285076651e-05,
6326
+ "loss": 4.353744125366211,
6327
+ "step": 8820
6328
+ },
6329
+ {
6330
+ "epoch": 1.4918060483189728,
6331
+ "grad_norm": 0.4967743456363678,
6332
+ "learning_rate": 6.407910453694782e-05,
6333
+ "loss": 4.356158065795898,
6334
+ "step": 8830
6335
+ },
6336
+ {
6337
+ "epoch": 1.4934955228923805,
6338
+ "grad_norm": 0.4624764621257782,
6339
+ "learning_rate": 6.368691238537321e-05,
6340
+ "loss": 4.316521453857422,
6341
+ "step": 8840
6342
+ },
6343
+ {
6344
+ "epoch": 1.4951849974657883,
6345
+ "grad_norm": 0.5081548094749451,
6346
+ "learning_rate": 6.329560039534874e-05,
6347
+ "loss": 4.3620750427246096,
6348
+ "step": 8850
6349
+ },
6350
+ {
6351
+ "epoch": 1.4968744720391958,
6352
+ "grad_norm": 0.486570805311203,
6353
+ "learning_rate": 6.290517255720505e-05,
6354
+ "loss": 4.351879501342774,
6355
+ "step": 8860
6356
+ },
6357
+ {
6358
+ "epoch": 1.4985639466126035,
6359
+ "grad_norm": 0.4706440567970276,
6360
+ "learning_rate": 6.251563285225707e-05,
6361
+ "loss": 4.324571228027343,
6362
+ "step": 8870
6363
+ },
6364
+ {
6365
+ "epoch": 1.5002534211860112,
6366
+ "grad_norm": 0.49965882301330566,
6367
+ "learning_rate": 6.212698525276294e-05,
6368
+ "loss": 4.34442367553711,
6369
+ "step": 8880
6370
+ },
6371
+ {
6372
+ "epoch": 1.5019428957594188,
6373
+ "grad_norm": 0.4871665835380554,
6374
+ "learning_rate": 6.173923372188372e-05,
6375
+ "loss": 4.329629516601562,
6376
+ "step": 8890
6377
+ },
6378
+ {
6379
+ "epoch": 1.5036323703328265,
6380
+ "grad_norm": 0.47697439789772034,
6381
+ "learning_rate": 6.135238221364313e-05,
6382
+ "loss": 4.3523296356201175,
6383
+ "step": 8900
6384
+ },
6385
+ {
6386
+ "epoch": 1.5053218449062342,
6387
+ "grad_norm": 0.48661452531814575,
6388
+ "learning_rate": 6.096643467288703e-05,
6389
+ "loss": 4.330023956298828,
6390
+ "step": 8910
6391
+ },
6392
+ {
6393
+ "epoch": 1.5070113194796417,
6394
+ "grad_norm": 0.4829593002796173,
6395
+ "learning_rate": 6.058139503524314e-05,
6396
+ "loss": 4.348539352416992,
6397
+ "step": 8920
6398
+ },
6399
+ {
6400
+ "epoch": 1.5087007940530495,
6401
+ "grad_norm": 0.47934937477111816,
6402
+ "learning_rate": 6.019726722708104e-05,
6403
+ "loss": 4.323921966552734,
6404
+ "step": 8930
6405
+ },
6406
+ {
6407
+ "epoch": 1.5103902686264572,
6408
+ "grad_norm": 0.5149379372596741,
6409
+ "learning_rate": 5.981405516547222e-05,
6410
+ "loss": 4.312050628662109,
6411
+ "step": 8940
6412
+ },
6413
+ {
6414
+ "epoch": 1.5120797431998647,
6415
+ "grad_norm": 0.48116961121559143,
6416
+ "learning_rate": 5.9431762758149875e-05,
6417
+ "loss": 4.327413940429688,
6418
+ "step": 8950
6419
+ },
6420
+ {
6421
+ "epoch": 1.5137692177732727,
6422
+ "grad_norm": 0.49428287148475647,
6423
+ "learning_rate": 5.9050393903469215e-05,
6424
+ "loss": 4.323257827758789,
6425
+ "step": 8960
6426
+ },
6427
+ {
6428
+ "epoch": 1.5154586923466802,
6429
+ "grad_norm": 0.5180572271347046,
6430
+ "learning_rate": 5.866995249036775e-05,
6431
+ "loss": 4.333328628540039,
6432
+ "step": 8970
6433
+ },
6434
+ {
6435
+ "epoch": 1.5171481669200877,
6436
+ "grad_norm": 0.4911746382713318,
6437
+ "learning_rate": 5.829044239832564e-05,
6438
+ "loss": 4.323813247680664,
6439
+ "step": 8980
6440
+ },
6441
+ {
6442
+ "epoch": 1.5188376414934956,
6443
+ "grad_norm": 0.49372172355651855,
6444
+ "learning_rate": 5.791186749732594e-05,
6445
+ "loss": 4.345953750610351,
6446
+ "step": 8990
6447
+ },
6448
+ {
6449
+ "epoch": 1.5205271160669032,
6450
+ "grad_norm": 0.4822508990764618,
6451
+ "learning_rate": 5.7534231647815244e-05,
6452
+ "loss": 4.349853134155273,
6453
+ "step": 9000
6454
+ },
6455
+ {
6456
+ "epoch": 1.5205271160669032,
6457
+ "eval_loss": 4.292741298675537,
6458
+ "eval_runtime": 3.7165,
6459
+ "eval_samples_per_second": 269.07,
6460
+ "eval_steps_per_second": 5.65,
6461
+ "step": 9000
6462
  }
6463
  ],
6464
  "logging_steps": 10,
 
6478
  "attributes": {}
6479
  }
6480
  },
6481
+ "total_flos": 3.010090484178616e+17,
6482
  "train_batch_size": 48,
6483
  "trial_name": null,
6484
  "trial_params": null