Wilsonwin commited on
Commit
c06f8cc
·
verified ·
1 Parent(s): 7aab02f

Training in progress, step 9000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:663d31a8b6ad2423dc3c0b8759bef8029d3f5914e7b173b5be641f54497bab8c
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5822c5d51ff1a3f6c8d63d9491441c689004f44619d361568f98a19df1caeab
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c93129eb07b3c389c642dd3ac521458eb6b0b8b0b4b6634a4a4ec236e73b73dd
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2c88f007992dd9990ea0216c73aaca02a8b4aebfac4c43fbb77c941bb9cf18e
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95d6f8a42fc11a5f0262b0c737f666f824322b1b030452310cca3fb10ffef9ad
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88595be53afbf68c948f838fbf4b1fa7776619d23de4baf3620fece471fafed5
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cda9bcc9266ec91d2da20eab50cd7cea609c16666645a54519c40bab7f69f1a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:528ba9a1d2a5739586b1652bb1454f9e977f93a6ae9e9c38a71b51bc41c45de4
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.4360533873965196,
6
  "eval_steps": 500,
7
- "global_step": 8500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6101,6 +6101,364 @@
6101
  "eval_samples_per_second": 271.089,
6102
  "eval_steps_per_second": 5.693,
6103
  "step": 8500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6104
  }
6105
  ],
6106
  "logging_steps": 10,
@@ -6120,7 +6478,7 @@
6120
  "attributes": {}
6121
  }
6122
  },
6123
- "total_flos": 2.8428620737491763e+17,
6124
  "train_batch_size": 48,
6125
  "trial_name": null,
6126
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.5205271160669032,
6
  "eval_steps": 500,
7
+ "global_step": 9000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6101
  "eval_samples_per_second": 271.089,
6102
  "eval_steps_per_second": 5.693,
6103
  "step": 8500
6104
+ },
6105
+ {
6106
+ "epoch": 1.4377428619699273,
6107
+ "grad_norm": 0.4925293028354645,
6108
+ "learning_rate": 7.706964398504293e-05,
6109
+ "loss": 4.376210403442383,
6110
+ "step": 8510
6111
+ },
6112
+ {
6113
+ "epoch": 1.439432336543335,
6114
+ "grad_norm": 0.4719123840332031,
6115
+ "learning_rate": 7.665144535049224e-05,
6116
+ "loss": 4.338931274414063,
6117
+ "step": 8520
6118
+ },
6119
+ {
6120
+ "epoch": 1.4411218111167428,
6121
+ "grad_norm": 0.4722173511981964,
6122
+ "learning_rate": 7.623399467409416e-05,
6123
+ "loss": 4.352537536621094,
6124
+ "step": 8530
6125
+ },
6126
+ {
6127
+ "epoch": 1.4428112856901505,
6128
+ "grad_norm": 0.4844585955142975,
6129
+ "learning_rate": 7.581729621272386e-05,
6130
+ "loss": 4.332356262207031,
6131
+ "step": 8540
6132
+ },
6133
+ {
6134
+ "epoch": 1.444500760263558,
6135
+ "grad_norm": 0.49630841612815857,
6136
+ "learning_rate": 7.540135421558585e-05,
6137
+ "loss": 4.3133392333984375,
6138
+ "step": 8550
6139
+ },
6140
+ {
6141
+ "epoch": 1.4461902348369657,
6142
+ "grad_norm": 0.472133994102478,
6143
+ "learning_rate": 7.498617292417074e-05,
6144
+ "loss": 4.3697349548339846,
6145
+ "step": 8560
6146
+ },
6147
+ {
6148
+ "epoch": 1.4478797094103735,
6149
+ "grad_norm": 0.48327624797821045,
6150
+ "learning_rate": 7.457175657221194e-05,
6151
+ "loss": 4.366666030883789,
6152
+ "step": 8570
6153
+ },
6154
+ {
6155
+ "epoch": 1.449569183983781,
6156
+ "grad_norm": 0.4768034815788269,
6157
+ "learning_rate": 7.415810938564277e-05,
6158
+ "loss": 4.33704719543457,
6159
+ "step": 8580
6160
+ },
6161
+ {
6162
+ "epoch": 1.4512586585571887,
6163
+ "grad_norm": 0.4592680037021637,
6164
+ "learning_rate": 7.37452355825528e-05,
6165
+ "loss": 4.343940734863281,
6166
+ "step": 8590
6167
+ },
6168
+ {
6169
+ "epoch": 1.4529481331305965,
6170
+ "grad_norm": 0.4643280804157257,
6171
+ "learning_rate": 7.333313937314548e-05,
6172
+ "loss": 4.346873474121094,
6173
+ "step": 8600
6174
+ },
6175
+ {
6176
+ "epoch": 1.454637607704004,
6177
+ "grad_norm": 0.4980602264404297,
6178
+ "learning_rate": 7.292182495969462e-05,
6179
+ "loss": 4.370085525512695,
6180
+ "step": 8610
6181
+ },
6182
+ {
6183
+ "epoch": 1.4563270822774117,
6184
+ "grad_norm": 0.4845782518386841,
6185
+ "learning_rate": 7.251129653650206e-05,
6186
+ "loss": 4.3420463562011715,
6187
+ "step": 8620
6188
+ },
6189
+ {
6190
+ "epoch": 1.4580165568508194,
6191
+ "grad_norm": 0.47701558470726013,
6192
+ "learning_rate": 7.210155828985447e-05,
6193
+ "loss": 4.333865356445313,
6194
+ "step": 8630
6195
+ },
6196
+ {
6197
+ "epoch": 1.459706031424227,
6198
+ "grad_norm": 0.4681967794895172,
6199
+ "learning_rate": 7.169261439798083e-05,
6200
+ "loss": 4.315822982788086,
6201
+ "step": 8640
6202
+ },
6203
+ {
6204
+ "epoch": 1.4613955059976347,
6205
+ "grad_norm": 0.48438313603401184,
6206
+ "learning_rate": 7.128446903101004e-05,
6207
+ "loss": 4.31340446472168,
6208
+ "step": 8650
6209
+ },
6210
+ {
6211
+ "epoch": 1.4630849805710424,
6212
+ "grad_norm": 0.4675985872745514,
6213
+ "learning_rate": 7.087712635092802e-05,
6214
+ "loss": 4.347599792480469,
6215
+ "step": 8660
6216
+ },
6217
+ {
6218
+ "epoch": 1.4647744551444501,
6219
+ "grad_norm": 0.5026019215583801,
6220
+ "learning_rate": 7.047059051153538e-05,
6221
+ "loss": 4.3385356903076175,
6222
+ "step": 8670
6223
+ },
6224
+ {
6225
+ "epoch": 1.4664639297178579,
6226
+ "grad_norm": 0.4908424913883209,
6227
+ "learning_rate": 7.006486565840532e-05,
6228
+ "loss": 4.337771224975586,
6229
+ "step": 8680
6230
+ },
6231
+ {
6232
+ "epoch": 1.4681534042912654,
6233
+ "grad_norm": 0.47692814469337463,
6234
+ "learning_rate": 6.96599559288411e-05,
6235
+ "loss": 4.350002288818359,
6236
+ "step": 8690
6237
+ },
6238
+ {
6239
+ "epoch": 1.4698428788646731,
6240
+ "grad_norm": 0.4985916316509247,
6241
+ "learning_rate": 6.925586545183383e-05,
6242
+ "loss": 4.357270812988281,
6243
+ "step": 8700
6244
+ },
6245
+ {
6246
+ "epoch": 1.4715323534380809,
6247
+ "grad_norm": 0.4779921770095825,
6248
+ "learning_rate": 6.885259834802042e-05,
6249
+ "loss": 4.3343353271484375,
6250
+ "step": 8710
6251
+ },
6252
+ {
6253
+ "epoch": 1.4732218280114884,
6254
+ "grad_norm": 0.4964430630207062,
6255
+ "learning_rate": 6.845015872964179e-05,
6256
+ "loss": 4.345649337768554,
6257
+ "step": 8720
6258
+ },
6259
+ {
6260
+ "epoch": 1.4749113025848961,
6261
+ "grad_norm": 0.4816732108592987,
6262
+ "learning_rate": 6.80485507005005e-05,
6263
+ "loss": 4.349812316894531,
6264
+ "step": 8730
6265
+ },
6266
+ {
6267
+ "epoch": 1.4766007771583038,
6268
+ "grad_norm": 0.4839925765991211,
6269
+ "learning_rate": 6.764777835591921e-05,
6270
+ "loss": 4.342644119262696,
6271
+ "step": 8740
6272
+ },
6273
+ {
6274
+ "epoch": 1.4782902517317114,
6275
+ "grad_norm": 0.5161303877830505,
6276
+ "learning_rate": 6.724784578269892e-05,
6277
+ "loss": 4.322945022583008,
6278
+ "step": 8750
6279
+ },
6280
+ {
6281
+ "epoch": 1.479979726305119,
6282
+ "grad_norm": 0.4845769703388214,
6283
+ "learning_rate": 6.684875705907722e-05,
6284
+ "loss": 4.33643798828125,
6285
+ "step": 8760
6286
+ },
6287
+ {
6288
+ "epoch": 1.4816692008785268,
6289
+ "grad_norm": 0.48371464014053345,
6290
+ "learning_rate": 6.645051625468657e-05,
6291
+ "loss": 4.319810104370117,
6292
+ "step": 8770
6293
+ },
6294
+ {
6295
+ "epoch": 1.4833586754519343,
6296
+ "grad_norm": 0.4810192286968231,
6297
+ "learning_rate": 6.605312743051297e-05,
6298
+ "loss": 4.350659561157227,
6299
+ "step": 8780
6300
+ },
6301
+ {
6302
+ "epoch": 1.485048150025342,
6303
+ "grad_norm": 0.4886019825935364,
6304
+ "learning_rate": 6.565659463885467e-05,
6305
+ "loss": 4.340823364257813,
6306
+ "step": 8790
6307
+ },
6308
+ {
6309
+ "epoch": 1.4867376245987498,
6310
+ "grad_norm": 0.4922144114971161,
6311
+ "learning_rate": 6.526092192328048e-05,
6312
+ "loss": 4.337167358398437,
6313
+ "step": 8800
6314
+ },
6315
+ {
6316
+ "epoch": 1.4884270991721575,
6317
+ "grad_norm": 0.47720760107040405,
6318
+ "learning_rate": 6.486611331858879e-05,
6319
+ "loss": 4.330669403076172,
6320
+ "step": 8810
6321
+ },
6322
+ {
6323
+ "epoch": 1.490116573745565,
6324
+ "grad_norm": 0.45629069209098816,
6325
+ "learning_rate": 6.447217285076651e-05,
6326
+ "loss": 4.354007339477539,
6327
+ "step": 8820
6328
+ },
6329
+ {
6330
+ "epoch": 1.4918060483189728,
6331
+ "grad_norm": 0.4794461727142334,
6332
+ "learning_rate": 6.407910453694782e-05,
6333
+ "loss": 4.356667327880859,
6334
+ "step": 8830
6335
+ },
6336
+ {
6337
+ "epoch": 1.4934955228923805,
6338
+ "grad_norm": 0.4836932420730591,
6339
+ "learning_rate": 6.368691238537321e-05,
6340
+ "loss": 4.3167163848876955,
6341
+ "step": 8840
6342
+ },
6343
+ {
6344
+ "epoch": 1.4951849974657883,
6345
+ "grad_norm": 0.5060141086578369,
6346
+ "learning_rate": 6.329560039534874e-05,
6347
+ "loss": 4.362548828125,
6348
+ "step": 8850
6349
+ },
6350
+ {
6351
+ "epoch": 1.4968744720391958,
6352
+ "grad_norm": 0.48216700553894043,
6353
+ "learning_rate": 6.290517255720505e-05,
6354
+ "loss": 4.3512012481689455,
6355
+ "step": 8860
6356
+ },
6357
+ {
6358
+ "epoch": 1.4985639466126035,
6359
+ "grad_norm": 0.46019911766052246,
6360
+ "learning_rate": 6.251563285225707e-05,
6361
+ "loss": 4.32593002319336,
6362
+ "step": 8870
6363
+ },
6364
+ {
6365
+ "epoch": 1.5002534211860112,
6366
+ "grad_norm": 0.4773600697517395,
6367
+ "learning_rate": 6.212698525276294e-05,
6368
+ "loss": 4.345823287963867,
6369
+ "step": 8880
6370
+ },
6371
+ {
6372
+ "epoch": 1.5019428957594188,
6373
+ "grad_norm": 0.4903421401977539,
6374
+ "learning_rate": 6.173923372188372e-05,
6375
+ "loss": 4.330167770385742,
6376
+ "step": 8890
6377
+ },
6378
+ {
6379
+ "epoch": 1.5036323703328265,
6380
+ "grad_norm": 0.47027841210365295,
6381
+ "learning_rate": 6.135238221364313e-05,
6382
+ "loss": 4.352994155883789,
6383
+ "step": 8900
6384
+ },
6385
+ {
6386
+ "epoch": 1.5053218449062342,
6387
+ "grad_norm": 0.4893588125705719,
6388
+ "learning_rate": 6.096643467288703e-05,
6389
+ "loss": 4.3315269470214846,
6390
+ "step": 8910
6391
+ },
6392
+ {
6393
+ "epoch": 1.5070113194796417,
6394
+ "grad_norm": 0.4835808277130127,
6395
+ "learning_rate": 6.058139503524314e-05,
6396
+ "loss": 4.349056625366211,
6397
+ "step": 8920
6398
+ },
6399
+ {
6400
+ "epoch": 1.5087007940530495,
6401
+ "grad_norm": 0.4750809967517853,
6402
+ "learning_rate": 6.019726722708104e-05,
6403
+ "loss": 4.325545120239258,
6404
+ "step": 8930
6405
+ },
6406
+ {
6407
+ "epoch": 1.5103902686264572,
6408
+ "grad_norm": 0.4945700466632843,
6409
+ "learning_rate": 5.981405516547222e-05,
6410
+ "loss": 4.312815093994141,
6411
+ "step": 8940
6412
+ },
6413
+ {
6414
+ "epoch": 1.5120797431998647,
6415
+ "grad_norm": 0.4704221487045288,
6416
+ "learning_rate": 5.9431762758149875e-05,
6417
+ "loss": 4.328189849853516,
6418
+ "step": 8950
6419
+ },
6420
+ {
6421
+ "epoch": 1.5137692177732727,
6422
+ "grad_norm": 0.48752453923225403,
6423
+ "learning_rate": 5.9050393903469215e-05,
6424
+ "loss": 4.324124145507812,
6425
+ "step": 8960
6426
+ },
6427
+ {
6428
+ "epoch": 1.5154586923466802,
6429
+ "grad_norm": 0.5149093270301819,
6430
+ "learning_rate": 5.866995249036775e-05,
6431
+ "loss": 4.334346771240234,
6432
+ "step": 8970
6433
+ },
6434
+ {
6435
+ "epoch": 1.5171481669200877,
6436
+ "grad_norm": 0.49064958095550537,
6437
+ "learning_rate": 5.829044239832564e-05,
6438
+ "loss": 4.324323654174805,
6439
+ "step": 8980
6440
+ },
6441
+ {
6442
+ "epoch": 1.5188376414934956,
6443
+ "grad_norm": 0.486092746257782,
6444
+ "learning_rate": 5.791186749732594e-05,
6445
+ "loss": 4.346895599365235,
6446
+ "step": 8990
6447
+ },
6448
+ {
6449
+ "epoch": 1.5205271160669032,
6450
+ "grad_norm": 0.48512768745422363,
6451
+ "learning_rate": 5.7534231647815244e-05,
6452
+ "loss": 4.350548934936524,
6453
+ "step": 9000
6454
+ },
6455
+ {
6456
+ "epoch": 1.5205271160669032,
6457
+ "eval_loss": 4.312350273132324,
6458
+ "eval_runtime": 4.1596,
6459
+ "eval_samples_per_second": 240.409,
6460
+ "eval_steps_per_second": 5.049,
6461
+ "step": 9000
6462
  }
6463
  ],
6464
  "logging_steps": 10,
 
6478
  "attributes": {}
6479
  }
6480
  },
6481
+ "total_flos": 3.010090484178616e+17,
6482
  "train_batch_size": 48,
6483
  "trial_name": null,
6484
  "trial_params": null