minpeter commited on
Commit
289fc1a
·
verified ·
1 Parent(s): 9f068ac

Training in progress, step 57000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37d8cb0a48975371483c8d0f0ea6cbaf3751d2262e07e187669c5fa919e08bdd
3
  size 373077376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cae5bd40cebc93aa05562030f2b12652a8c928f29de2177774bdfb46d57e338
3
  size 373077376
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0698df592b4199333ec83aae72823b0afe639bf7b1724f696793eea67074662d
3
  size 422377931
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3455cdb481c045d67e6c29cc19cbf512f3f4349a97202825124c73528f7b3652
3
  size 422377931
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b1f38a3490635b3698eb8dc53557f282297c0ad5c2a0688b8ef197ae933742a
3
  size 15365
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e189d953d56fbbb1dc48bf345790e84a3fa8ff54652aa62e6c6b85a7192fc179
3
  size 15365
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79ac1919bb29ab4ae2b1b0fd8994bb6245a7e65efdd03caed0c32a8061880453
3
  size 15365
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f952436541ae47c1950b5a1b819228a6aa1f641c3a191645aa67b0892fe0b260
3
  size 15365
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d1a2675a27b09581871a511c3a1b6270fe19135aa45d2b488c95610aa4071bf
3
  size 15365
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71bb0fe2b2559162529fb3a1e66e184ec5cc1d927ba0e24ba8b4215d6d671a7b
3
  size 15365
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:093642c6af6e0eff31531a5cc5f33bd12f66b88e390ca8a0b353843eca5d3d88
3
  size 15365
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf14548f27ec2bb28d193492f3a62a0d7bf30afb378a1eaed2530adf64f04c79
3
  size 15365
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a43a1adab58b12e1b9f398e0b2131e69a1f26da62e19f0d8154a7da3af2a7c5
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cca6fb53f371a50c66a1841bfc607b1baa7b2a69fcea3747532bd4d0962b4499
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 55000,
3
- "best_metric": 2.553321599960327,
4
- "best_model_checkpoint": "./artifacts/models/pretrain-4gpu-8k-ckpt/checkpoint-55000",
5
- "epoch": 0.98083861701755,
6
  "eval_steps": 1000,
7
- "global_step": 56000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -16136,6 +16136,294 @@
16136
  "eval_samples_per_second": 100.371,
16137
  "eval_steps_per_second": 3.158,
16138
  "step": 56000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16139
  }
16140
  ],
16141
  "logging_steps": 25,
@@ -16155,7 +16443,7 @@
16155
  "attributes": {}
16156
  }
16157
  },
16158
- "total_flos": 2.8403068460916015e+19,
16159
  "train_batch_size": 16,
16160
  "trial_name": null,
16161
  "trial_params": null
 
1
  {
2
+ "best_global_step": 57000,
3
+ "best_metric": 2.5533201694488525,
4
+ "best_model_checkpoint": "./artifacts/models/pretrain-4gpu-8k-ckpt/checkpoint-57000",
5
+ "epoch": 0.9983535923214348,
6
  "eval_steps": 1000,
7
+ "global_step": 57000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
16136
  "eval_samples_per_second": 100.371,
16137
  "eval_steps_per_second": 3.158,
16138
  "step": 56000
16139
+ },
16140
+ {
16141
+ "epoch": 0.9812764914001472,
16142
+ "grad_norm": 0.2109375,
16143
+ "learning_rate": 9.59941896743366e-07,
16144
+ "loss": 2.5361,
16145
+ "step": 56025
16146
+ },
16147
+ {
16148
+ "epoch": 0.9817143657827443,
16149
+ "grad_norm": 0.20703125,
16150
+ "learning_rate": 9.156223661438423e-07,
16151
+ "loss": 2.5396,
16152
+ "step": 56050
16153
+ },
16154
+ {
16155
+ "epoch": 0.9821522401653414,
16156
+ "grad_norm": 0.212890625,
16157
+ "learning_rate": 8.723493140556648e-07,
16158
+ "loss": 2.5361,
16159
+ "step": 56075
16160
+ },
16161
+ {
16162
+ "epoch": 0.9825901145479385,
16163
+ "grad_norm": 0.203125,
16164
+ "learning_rate": 8.301228312136422e-07,
16165
+ "loss": 2.5437,
16166
+ "step": 56100
16167
+ },
16168
+ {
16169
+ "epoch": 0.9830279889305356,
16170
+ "grad_norm": 0.2177734375,
16171
+ "learning_rate": 7.88943006158116e-07,
16172
+ "loss": 2.5395,
16173
+ "step": 56125
16174
+ },
16175
+ {
16176
+ "epoch": 0.9834658633131327,
16177
+ "grad_norm": 0.21875,
16178
+ "learning_rate": 7.488099252347946e-07,
16179
+ "loss": 2.5497,
16180
+ "step": 56150
16181
+ },
16182
+ {
16183
+ "epoch": 0.9839037376957298,
16184
+ "grad_norm": 0.21484375,
16185
+ "learning_rate": 7.097236725945866e-07,
16186
+ "loss": 2.5393,
16187
+ "step": 56175
16188
+ },
16189
+ {
16190
+ "epoch": 0.9843416120783269,
16191
+ "grad_norm": 0.212890625,
16192
+ "learning_rate": 6.716843301934894e-07,
16193
+ "loss": 2.5412,
16194
+ "step": 56200
16195
+ },
16196
+ {
16197
+ "epoch": 0.9847794864609241,
16198
+ "grad_norm": 0.2138671875,
16199
+ "learning_rate": 6.346919777922011e-07,
16200
+ "loss": 2.546,
16201
+ "step": 56225
16202
+ },
16203
+ {
16204
+ "epoch": 0.9852173608435212,
16205
+ "grad_norm": 0.212890625,
16206
+ "learning_rate": 5.987466929561757e-07,
16207
+ "loss": 2.546,
16208
+ "step": 56250
16209
+ },
16210
+ {
16211
+ "epoch": 0.9856552352261183,
16212
+ "grad_norm": 0.2265625,
16213
+ "learning_rate": 5.638485510554014e-07,
16214
+ "loss": 2.5372,
16215
+ "step": 56275
16216
+ },
16217
+ {
16218
+ "epoch": 0.9860931096087154,
16219
+ "grad_norm": 0.2197265625,
16220
+ "learning_rate": 5.29997625264178e-07,
16221
+ "loss": 2.5395,
16222
+ "step": 56300
16223
+ },
16224
+ {
16225
+ "epoch": 0.9865309839913126,
16226
+ "grad_norm": 0.255859375,
16227
+ "learning_rate": 4.971939865610064e-07,
16228
+ "loss": 2.5328,
16229
+ "step": 56325
16230
+ },
16231
+ {
16232
+ "epoch": 0.9869688583739097,
16233
+ "grad_norm": 0.2255859375,
16234
+ "learning_rate": 4.654377037284774e-07,
16235
+ "loss": 2.5318,
16236
+ "step": 56350
16237
+ },
16238
+ {
16239
+ "epoch": 0.9874067327565068,
16240
+ "grad_norm": 0.2021484375,
16241
+ "learning_rate": 4.347288433530494e-07,
16242
+ "loss": 2.5302,
16243
+ "step": 56375
16244
+ },
16245
+ {
16246
+ "epoch": 0.9878446071391039,
16247
+ "grad_norm": 0.20703125,
16248
+ "learning_rate": 4.050674698248824e-07,
16249
+ "loss": 2.5494,
16250
+ "step": 56400
16251
+ },
16252
+ {
16253
+ "epoch": 0.9882824815217011,
16254
+ "grad_norm": 0.2138671875,
16255
+ "learning_rate": 3.764536453380041e-07,
16256
+ "loss": 2.5285,
16257
+ "step": 56425
16258
+ },
16259
+ {
16260
+ "epoch": 0.9887203559042982,
16261
+ "grad_norm": 0.2158203125,
16262
+ "learning_rate": 3.4888742988964383e-07,
16263
+ "loss": 2.5454,
16264
+ "step": 56450
16265
+ },
16266
+ {
16267
+ "epoch": 0.9891582302868953,
16268
+ "grad_norm": 0.2177734375,
16269
+ "learning_rate": 3.2236888128067687e-07,
16270
+ "loss": 2.5468,
16271
+ "step": 56475
16272
+ },
16273
+ {
16274
+ "epoch": 0.9895961046694924,
16275
+ "grad_norm": 0.21484375,
16276
+ "learning_rate": 2.968980551150136e-07,
16277
+ "loss": 2.5422,
16278
+ "step": 56500
16279
+ },
16280
+ {
16281
+ "epoch": 0.9900339790520896,
16282
+ "grad_norm": 0.21875,
16283
+ "learning_rate": 2.7247500479982145e-07,
16284
+ "loss": 2.5383,
16285
+ "step": 56525
16286
+ },
16287
+ {
16288
+ "epoch": 0.9904718534346867,
16289
+ "grad_norm": 0.208984375,
16290
+ "learning_rate": 2.490997815453033e-07,
16291
+ "loss": 2.5277,
16292
+ "step": 56550
16293
+ },
16294
+ {
16295
+ "epoch": 0.9909097278172838,
16296
+ "grad_norm": 0.2197265625,
16297
+ "learning_rate": 2.2677243436453056e-07,
16298
+ "loss": 2.5163,
16299
+ "step": 56575
16300
+ },
16301
+ {
16302
+ "epoch": 0.9913476021998809,
16303
+ "grad_norm": 0.220703125,
16304
+ "learning_rate": 2.054930100734431e-07,
16305
+ "loss": 2.5343,
16306
+ "step": 56600
16307
+ },
16308
+ {
16309
+ "epoch": 0.9917854765824781,
16310
+ "grad_norm": 0.212890625,
16311
+ "learning_rate": 1.8526155329057214e-07,
16312
+ "loss": 2.5355,
16313
+ "step": 56625
16314
+ },
16315
+ {
16316
+ "epoch": 0.9922233509650752,
16317
+ "grad_norm": 0.2138671875,
16318
+ "learning_rate": 1.6607810643731737e-07,
16319
+ "loss": 2.5432,
16320
+ "step": 56650
16321
+ },
16322
+ {
16323
+ "epoch": 0.9926612253476722,
16324
+ "grad_norm": 0.216796875,
16325
+ "learning_rate": 1.47942709737392e-07,
16326
+ "loss": 2.5337,
16327
+ "step": 56675
16328
+ },
16329
+ {
16330
+ "epoch": 0.9930990997302693,
16331
+ "grad_norm": 0.2119140625,
16332
+ "learning_rate": 1.3085540121698937e-07,
16333
+ "loss": 2.5476,
16334
+ "step": 56700
16335
+ },
16336
+ {
16337
+ "epoch": 0.9935369741128665,
16338
+ "grad_norm": 0.224609375,
16339
+ "learning_rate": 1.1481621670478282e-07,
16340
+ "loss": 2.5355,
16341
+ "step": 56725
16342
+ },
16343
+ {
16344
+ "epoch": 0.9939748484954636,
16345
+ "grad_norm": 0.2109375,
16346
+ "learning_rate": 9.982518983170375e-08,
16347
+ "loss": 2.5383,
16348
+ "step": 56750
16349
+ },
16350
+ {
16351
+ "epoch": 0.9944127228780607,
16352
+ "grad_norm": 0.2080078125,
16353
+ "learning_rate": 8.58823520308305e-08,
16354
+ "loss": 2.542,
16355
+ "step": 56775
16356
+ },
16357
+ {
16358
+ "epoch": 0.9948505972606578,
16359
+ "grad_norm": 0.220703125,
16360
+ "learning_rate": 7.298773253749946e-08,
16361
+ "loss": 2.5432,
16362
+ "step": 56800
16363
+ },
16364
+ {
16365
+ "epoch": 0.995288471643255,
16366
+ "grad_norm": 0.212890625,
16367
+ "learning_rate": 6.114135838908296e-08,
16368
+ "loss": 2.5368,
16369
+ "step": 56825
16370
+ },
16371
+ {
16372
+ "epoch": 0.9957263460258521,
16373
+ "grad_norm": 0.212890625,
16374
+ "learning_rate": 5.0343254425044837e-08,
16375
+ "loss": 2.5428,
16376
+ "step": 56850
16377
+ },
16378
+ {
16379
+ "epoch": 0.9961642204084492,
16380
+ "grad_norm": 0.216796875,
16381
+ "learning_rate": 4.0593443286773834e-08,
16382
+ "loss": 2.5417,
16383
+ "step": 56875
16384
+ },
16385
+ {
16386
+ "epoch": 0.9966020947910463,
16387
+ "grad_norm": 0.2109375,
16388
+ "learning_rate": 3.189194541769469e-08,
16389
+ "loss": 2.5482,
16390
+ "step": 56900
16391
+ },
16392
+ {
16393
+ "epoch": 0.9970399691736435,
16394
+ "grad_norm": 0.21484375,
16395
+ "learning_rate": 2.4238779063046057e-08,
16396
+ "loss": 2.5316,
16397
+ "step": 56925
16398
+ },
16399
+ {
16400
+ "epoch": 0.9974778435562406,
16401
+ "grad_norm": 0.2138671875,
16402
+ "learning_rate": 1.7633960269991535e-08,
16403
+ "loss": 2.5438,
16404
+ "step": 56950
16405
+ },
16406
+ {
16407
+ "epoch": 0.9979157179388377,
16408
+ "grad_norm": 0.2119140625,
16409
+ "learning_rate": 1.2077502887453129e-08,
16410
+ "loss": 2.538,
16411
+ "step": 56975
16412
+ },
16413
+ {
16414
+ "epoch": 0.9983535923214348,
16415
+ "grad_norm": 0.2109375,
16416
+ "learning_rate": 7.569418566222286e-09,
16417
+ "loss": 2.5309,
16418
+ "step": 57000
16419
+ },
16420
+ {
16421
+ "epoch": 0.9983535923214348,
16422
+ "eval_loss": 2.5533201694488525,
16423
+ "eval_runtime": 36.4092,
16424
+ "eval_samples_per_second": 100.387,
16425
+ "eval_steps_per_second": 3.159,
16426
+ "step": 57000
16427
  }
16428
  ],
16429
  "logging_steps": 25,
 
16443
  "attributes": {}
16444
  }
16445
  },
16446
+ "total_flos": 2.8910359451038384e+19,
16447
  "train_batch_size": 16,
16448
  "trial_name": null,
16449
  "trial_params": null