shulijia commited on
Commit
5db4ed5
·
verified ·
1 Parent(s): 3cf6a55

Training in progress, step 9500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68351bdfc9f2775d55135f85d4da406751f2017ee1e6038a2f1f11809940cc1b
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:070b5b7acfb870eafcd0bf40ce133115da39bd3236dee84b8493ea73e863aebf
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f079cb7f5616b227a12a8a76dc488cc35d940f620f47389c7b64c2321c29ddbc
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6986d123949d70dafc8db16862d29980777537d8be6a72c449522a071032d5c
3
  size 4768663315
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6023996335cf65957e240cbf24c23b1c1817d3b9ce032bb60007899ddd598fd
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40beb3dc5129ab4ac6babe96012ebdd87569ab488ea6742096d9d349a8d4cd73
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.548872446631501,
6
  "eval_steps": 100,
7
- "global_step": 9000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8108,6 +8108,456 @@
8108
  "mean_token_accuracy": 0.7797822907567025,
8109
  "num_tokens": 73725952.0,
8110
  "step": 9000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8111
  }
8112
  ],
8113
  "logging_steps": 10,
@@ -8127,7 +8577,7 @@
8127
  "attributes": {}
8128
  }
8129
  },
8130
- "total_flos": 1.9484329511170867e+17,
8131
  "train_batch_size": 2,
8132
  "trial_name": null,
8133
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.6904804049987607,
6
  "eval_steps": 100,
7
+ "global_step": 9500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8108
  "mean_token_accuracy": 0.7797822907567025,
8109
  "num_tokens": 73725952.0,
8110
  "step": 9000
8111
+ },
8112
+ {
8113
+ "epoch": 2.5517046057988457,
8114
+ "grad_norm": 1.1519678831100464,
8115
+ "learning_rate": 1.661596559320256e-06,
8116
+ "loss": 0.1289,
8117
+ "mean_token_accuracy": 0.764640410989523,
8118
+ "num_tokens": 73807872.0,
8119
+ "step": 9010
8120
+ },
8121
+ {
8122
+ "epoch": 2.554536764966191,
8123
+ "grad_norm": 1.1929394006729126,
8124
+ "learning_rate": 1.65110668205182e-06,
8125
+ "loss": 0.1074,
8126
+ "mean_token_accuracy": 0.7785836592316627,
8127
+ "num_tokens": 73889792.0,
8128
+ "step": 9020
8129
+ },
8130
+ {
8131
+ "epoch": 2.5573689241335362,
8132
+ "grad_norm": 1.3088452816009521,
8133
+ "learning_rate": 1.6406168047833843e-06,
8134
+ "loss": 0.0986,
8135
+ "mean_token_accuracy": 0.7910469707101584,
8136
+ "num_tokens": 73971712.0,
8137
+ "step": 9030
8138
+ },
8139
+ {
8140
+ "epoch": 2.5602010833008815,
8141
+ "grad_norm": 1.795518159866333,
8142
+ "learning_rate": 1.6301269275149482e-06,
8143
+ "loss": 0.1081,
8144
+ "mean_token_accuracy": 0.7642245594412088,
8145
+ "num_tokens": 74053632.0,
8146
+ "step": 9040
8147
+ },
8148
+ {
8149
+ "epoch": 2.5630332424682267,
8150
+ "grad_norm": 1.3358420133590698,
8151
+ "learning_rate": 1.6196370502465123e-06,
8152
+ "loss": 0.1147,
8153
+ "mean_token_accuracy": 0.7747553832828998,
8154
+ "num_tokens": 74135552.0,
8155
+ "step": 9050
8156
+ },
8157
+ {
8158
+ "epoch": 2.565865401635572,
8159
+ "grad_norm": 1.489589810371399,
8160
+ "learning_rate": 1.6091471729780763e-06,
8161
+ "loss": 0.1207,
8162
+ "mean_token_accuracy": 0.7772504851222038,
8163
+ "num_tokens": 74217472.0,
8164
+ "step": 9060
8165
+ },
8166
+ {
8167
+ "epoch": 2.5686975608029172,
8168
+ "grad_norm": 1.3394817113876343,
8169
+ "learning_rate": 1.5986572957096402e-06,
8170
+ "loss": 0.1376,
8171
+ "mean_token_accuracy": 0.7540484316647053,
8172
+ "num_tokens": 74299392.0,
8173
+ "step": 9070
8174
+ },
8175
+ {
8176
+ "epoch": 2.5715297199702625,
8177
+ "grad_norm": 1.119963526725769,
8178
+ "learning_rate": 1.5881674184412043e-06,
8179
+ "loss": 0.1185,
8180
+ "mean_token_accuracy": 0.7621819950640202,
8181
+ "num_tokens": 74381312.0,
8182
+ "step": 9080
8183
+ },
8184
+ {
8185
+ "epoch": 2.5743618791376077,
8186
+ "grad_norm": 1.4001566171646118,
8187
+ "learning_rate": 1.5776775411727686e-06,
8188
+ "loss": 0.104,
8189
+ "mean_token_accuracy": 0.7811521515250206,
8190
+ "num_tokens": 74463232.0,
8191
+ "step": 9090
8192
+ },
8193
+ {
8194
+ "epoch": 2.5771940383049525,
8195
+ "grad_norm": 1.5772784948349,
8196
+ "learning_rate": 1.5671876639043324e-06,
8197
+ "loss": 0.1152,
8198
+ "mean_token_accuracy": 0.7650195706635714,
8199
+ "num_tokens": 74545152.0,
8200
+ "step": 9100
8201
+ },
8202
+ {
8203
+ "epoch": 2.5800261974722978,
8204
+ "grad_norm": 1.7766703367233276,
8205
+ "learning_rate": 1.5566977866358965e-06,
8206
+ "loss": 0.113,
8207
+ "mean_token_accuracy": 0.7801981404423713,
8208
+ "num_tokens": 74627072.0,
8209
+ "step": 9110
8210
+ },
8211
+ {
8212
+ "epoch": 2.582858356639643,
8213
+ "grad_norm": 1.4249588251113892,
8214
+ "learning_rate": 1.5462079093674606e-06,
8215
+ "loss": 0.1236,
8216
+ "mean_token_accuracy": 0.7641022481024266,
8217
+ "num_tokens": 74708992.0,
8218
+ "step": 9120
8219
+ },
8220
+ {
8221
+ "epoch": 2.5856905158069883,
8222
+ "grad_norm": 1.6609476804733276,
8223
+ "learning_rate": 1.5357180320990244e-06,
8224
+ "loss": 0.1154,
8225
+ "mean_token_accuracy": 0.763050389662385,
8226
+ "num_tokens": 74790912.0,
8227
+ "step": 9130
8228
+ },
8229
+ {
8230
+ "epoch": 2.5885226749743335,
8231
+ "grad_norm": 0.9137653708457947,
8232
+ "learning_rate": 1.5252281548305885e-06,
8233
+ "loss": 0.1179,
8234
+ "mean_token_accuracy": 0.7632950112223625,
8235
+ "num_tokens": 74872832.0,
8236
+ "step": 9140
8237
+ },
8238
+ {
8239
+ "epoch": 2.5913548341416788,
8240
+ "grad_norm": 0.9380526542663574,
8241
+ "learning_rate": 1.5147382775621528e-06,
8242
+ "loss": 0.1242,
8243
+ "mean_token_accuracy": 0.7689334619790316,
8244
+ "num_tokens": 74954752.0,
8245
+ "step": 9150
8246
+ },
8247
+ {
8248
+ "epoch": 2.594186993309024,
8249
+ "grad_norm": 1.246500849723816,
8250
+ "learning_rate": 1.5042484002937167e-06,
8251
+ "loss": 0.1106,
8252
+ "mean_token_accuracy": 0.758524950966239,
8253
+ "num_tokens": 75036672.0,
8254
+ "step": 9160
8255
+ },
8256
+ {
8257
+ "epoch": 2.5970191524763693,
8258
+ "grad_norm": 1.2258425951004028,
8259
+ "learning_rate": 1.4937585230252807e-06,
8260
+ "loss": 0.121,
8261
+ "mean_token_accuracy": 0.7841976564377546,
8262
+ "num_tokens": 75118592.0,
8263
+ "step": 9170
8264
+ },
8265
+ {
8266
+ "epoch": 2.5998513116437145,
8267
+ "grad_norm": 1.4543510675430298,
8268
+ "learning_rate": 1.4832686457568448e-06,
8269
+ "loss": 0.0928,
8270
+ "mean_token_accuracy": 0.794337086752057,
8271
+ "num_tokens": 75200512.0,
8272
+ "step": 9180
8273
+ },
8274
+ {
8275
+ "epoch": 2.6026834708110593,
8276
+ "grad_norm": 1.4098447561264038,
8277
+ "learning_rate": 1.4727787684884087e-06,
8278
+ "loss": 0.1181,
8279
+ "mean_token_accuracy": 0.7629280813038349,
8280
+ "num_tokens": 75282432.0,
8281
+ "step": 9190
8282
+ },
8283
+ {
8284
+ "epoch": 2.605515629978405,
8285
+ "grad_norm": 1.3578165769577026,
8286
+ "learning_rate": 1.4622888912199728e-06,
8287
+ "loss": 0.1072,
8288
+ "mean_token_accuracy": 0.7831213314086198,
8289
+ "num_tokens": 75364352.0,
8290
+ "step": 9200
8291
+ },
8292
+ {
8293
+ "epoch": 2.60834778914575,
8294
+ "grad_norm": 1.7388701438903809,
8295
+ "learning_rate": 1.451799013951537e-06,
8296
+ "loss": 0.1029,
8297
+ "mean_token_accuracy": 0.7875366933643818,
8298
+ "num_tokens": 75446272.0,
8299
+ "step": 9210
8300
+ },
8301
+ {
8302
+ "epoch": 2.611179948313095,
8303
+ "grad_norm": 1.3704735040664673,
8304
+ "learning_rate": 1.441309136683101e-06,
8305
+ "loss": 0.1546,
8306
+ "mean_token_accuracy": 0.7491927597671747,
8307
+ "num_tokens": 75528192.0,
8308
+ "step": 9220
8309
+ },
8310
+ {
8311
+ "epoch": 2.6140121074804403,
8312
+ "grad_norm": 1.2139005661010742,
8313
+ "learning_rate": 1.430819259414665e-06,
8314
+ "loss": 0.1298,
8315
+ "mean_token_accuracy": 0.7586350254714489,
8316
+ "num_tokens": 75610112.0,
8317
+ "step": 9230
8318
+ },
8319
+ {
8320
+ "epoch": 2.6168442666477856,
8321
+ "grad_norm": 2.0187840461730957,
8322
+ "learning_rate": 1.420329382146229e-06,
8323
+ "loss": 0.1319,
8324
+ "mean_token_accuracy": 0.7546477496623993,
8325
+ "num_tokens": 75692032.0,
8326
+ "step": 9240
8327
+ },
8328
+ {
8329
+ "epoch": 2.619676425815131,
8330
+ "grad_norm": 1.0713800191879272,
8331
+ "learning_rate": 1.409839504877793e-06,
8332
+ "loss": 0.0989,
8333
+ "mean_token_accuracy": 0.7925269067287445,
8334
+ "num_tokens": 75773952.0,
8335
+ "step": 9250
8336
+ },
8337
+ {
8338
+ "epoch": 2.622508584982476,
8339
+ "grad_norm": 1.284598469734192,
8340
+ "learning_rate": 1.3993496276093572e-06,
8341
+ "loss": 0.1198,
8342
+ "mean_token_accuracy": 0.7871208406984807,
8343
+ "num_tokens": 75855872.0,
8344
+ "step": 9260
8345
+ },
8346
+ {
8347
+ "epoch": 2.6253407441498213,
8348
+ "grad_norm": 0.9530990123748779,
8349
+ "learning_rate": 1.3888597503409213e-06,
8350
+ "loss": 0.1194,
8351
+ "mean_token_accuracy": 0.7739114474505187,
8352
+ "num_tokens": 75937792.0,
8353
+ "step": 9270
8354
+ },
8355
+ {
8356
+ "epoch": 2.6281729033171666,
8357
+ "grad_norm": 1.252050757408142,
8358
+ "learning_rate": 1.3783698730724852e-06,
8359
+ "loss": 0.1143,
8360
+ "mean_token_accuracy": 0.7736790612339973,
8361
+ "num_tokens": 76019712.0,
8362
+ "step": 9280
8363
+ },
8364
+ {
8365
+ "epoch": 2.631005062484512,
8366
+ "grad_norm": 1.2160993814468384,
8367
+ "learning_rate": 1.3678799958040492e-06,
8368
+ "loss": 0.1035,
8369
+ "mean_token_accuracy": 0.7817025430500507,
8370
+ "num_tokens": 76101632.0,
8371
+ "step": 9290
8372
+ },
8373
+ {
8374
+ "epoch": 2.6338372216518566,
8375
+ "grad_norm": 1.4404122829437256,
8376
+ "learning_rate": 1.3573901185356133e-06,
8377
+ "loss": 0.1285,
8378
+ "mean_token_accuracy": 0.7545743621885777,
8379
+ "num_tokens": 76183552.0,
8380
+ "step": 9300
8381
+ },
8382
+ {
8383
+ "epoch": 2.636669380819202,
8384
+ "grad_norm": 1.158105492591858,
8385
+ "learning_rate": 1.3469002412671772e-06,
8386
+ "loss": 0.1127,
8387
+ "mean_token_accuracy": 0.7763209372758866,
8388
+ "num_tokens": 76265472.0,
8389
+ "step": 9310
8390
+ },
8391
+ {
8392
+ "epoch": 2.639501539986547,
8393
+ "grad_norm": 1.2974953651428223,
8394
+ "learning_rate": 1.3364103639987415e-06,
8395
+ "loss": 0.1254,
8396
+ "mean_token_accuracy": 0.7779109582304955,
8397
+ "num_tokens": 76347392.0,
8398
+ "step": 9320
8399
+ },
8400
+ {
8401
+ "epoch": 2.6423336991538924,
8402
+ "grad_norm": 1.4528638124465942,
8403
+ "learning_rate": 1.3259204867303055e-06,
8404
+ "loss": 0.1089,
8405
+ "mean_token_accuracy": 0.7840998075902462,
8406
+ "num_tokens": 76429312.0,
8407
+ "step": 9330
8408
+ },
8409
+ {
8410
+ "epoch": 2.6451658583212376,
8411
+ "grad_norm": 0.9896726012229919,
8412
+ "learning_rate": 1.3154306094618694e-06,
8413
+ "loss": 0.103,
8414
+ "mean_token_accuracy": 0.7768224064260721,
8415
+ "num_tokens": 76511232.0,
8416
+ "step": 9340
8417
+ },
8418
+ {
8419
+ "epoch": 2.647998017488583,
8420
+ "grad_norm": 1.1756311655044556,
8421
+ "learning_rate": 1.3049407321934335e-06,
8422
+ "loss": 0.1033,
8423
+ "mean_token_accuracy": 0.7787426613271237,
8424
+ "num_tokens": 76593152.0,
8425
+ "step": 9350
8426
+ },
8427
+ {
8428
+ "epoch": 2.650830176655928,
8429
+ "grad_norm": 1.280672550201416,
8430
+ "learning_rate": 1.2944508549249975e-06,
8431
+ "loss": 0.1049,
8432
+ "mean_token_accuracy": 0.7903131127357483,
8433
+ "num_tokens": 76675072.0,
8434
+ "step": 9360
8435
+ },
8436
+ {
8437
+ "epoch": 2.6536623358232734,
8438
+ "grad_norm": 1.228232979774475,
8439
+ "learning_rate": 1.2839609776565614e-06,
8440
+ "loss": 0.1449,
8441
+ "mean_token_accuracy": 0.743456457555294,
8442
+ "num_tokens": 76756992.0,
8443
+ "step": 9370
8444
+ },
8445
+ {
8446
+ "epoch": 2.6564944949906186,
8447
+ "grad_norm": 1.4639639854431152,
8448
+ "learning_rate": 1.2734711003881257e-06,
8449
+ "loss": 0.1358,
8450
+ "mean_token_accuracy": 0.7659980464726687,
8451
+ "num_tokens": 76838912.0,
8452
+ "step": 9380
8453
+ },
8454
+ {
8455
+ "epoch": 2.6593266541579634,
8456
+ "grad_norm": 1.4914389848709106,
8457
+ "learning_rate": 1.2629812231196898e-06,
8458
+ "loss": 0.1121,
8459
+ "mean_token_accuracy": 0.7777764193713665,
8460
+ "num_tokens": 76920832.0,
8461
+ "step": 9390
8462
+ },
8463
+ {
8464
+ "epoch": 2.662158813325309,
8465
+ "grad_norm": 1.1283109188079834,
8466
+ "learning_rate": 1.2524913458512536e-06,
8467
+ "loss": 0.113,
8468
+ "mean_token_accuracy": 0.7669520601630211,
8469
+ "num_tokens": 77002752.0,
8470
+ "step": 9400
8471
+ },
8472
+ {
8473
+ "epoch": 2.664990972492654,
8474
+ "grad_norm": 1.1668506860733032,
8475
+ "learning_rate": 1.2420014685828177e-06,
8476
+ "loss": 0.1224,
8477
+ "mean_token_accuracy": 0.7749633088707923,
8478
+ "num_tokens": 77084672.0,
8479
+ "step": 9410
8480
+ },
8481
+ {
8482
+ "epoch": 2.667823131659999,
8483
+ "grad_norm": 1.8604751825332642,
8484
+ "learning_rate": 1.2315115913143818e-06,
8485
+ "loss": 0.1349,
8486
+ "mean_token_accuracy": 0.7719178080558777,
8487
+ "num_tokens": 77166592.0,
8488
+ "step": 9420
8489
+ },
8490
+ {
8491
+ "epoch": 2.6706552908273444,
8492
+ "grad_norm": 2.2527692317962646,
8493
+ "learning_rate": 1.2210217140459456e-06,
8494
+ "loss": 0.1357,
8495
+ "mean_token_accuracy": 0.7704500976949931,
8496
+ "num_tokens": 77248512.0,
8497
+ "step": 9430
8498
+ },
8499
+ {
8500
+ "epoch": 2.6734874499946897,
8501
+ "grad_norm": 1.1649688482284546,
8502
+ "learning_rate": 1.21053183677751e-06,
8503
+ "loss": 0.1056,
8504
+ "mean_token_accuracy": 0.7830601751804351,
8505
+ "num_tokens": 77330432.0,
8506
+ "step": 9440
8507
+ },
8508
+ {
8509
+ "epoch": 2.676319609162035,
8510
+ "grad_norm": 1.1416834592819214,
8511
+ "learning_rate": 1.2000419595090738e-06,
8512
+ "loss": 0.1265,
8513
+ "mean_token_accuracy": 0.7677592922002077,
8514
+ "num_tokens": 77412352.0,
8515
+ "step": 9450
8516
+ },
8517
+ {
8518
+ "epoch": 2.67915176832938,
8519
+ "grad_norm": 1.1690260171890259,
8520
+ "learning_rate": 1.1895520822406379e-06,
8521
+ "loss": 0.1164,
8522
+ "mean_token_accuracy": 0.7738502897322178,
8523
+ "num_tokens": 77494272.0,
8524
+ "step": 9460
8525
+ },
8526
+ {
8527
+ "epoch": 2.6819839274967254,
8528
+ "grad_norm": 1.4305615425109863,
8529
+ "learning_rate": 1.179062204972202e-06,
8530
+ "loss": 0.1248,
8531
+ "mean_token_accuracy": 0.7664261247962714,
8532
+ "num_tokens": 77576192.0,
8533
+ "step": 9470
8534
+ },
8535
+ {
8536
+ "epoch": 2.6848160866640707,
8537
+ "grad_norm": 1.3226728439331055,
8538
+ "learning_rate": 1.168572327703766e-06,
8539
+ "loss": 0.1253,
8540
+ "mean_token_accuracy": 0.7715998016297817,
8541
+ "num_tokens": 77658112.0,
8542
+ "step": 9480
8543
+ },
8544
+ {
8545
+ "epoch": 2.687648245831416,
8546
+ "grad_norm": 1.2239925861358643,
8547
+ "learning_rate": 1.1580824504353299e-06,
8548
+ "loss": 0.1412,
8549
+ "mean_token_accuracy": 0.7618395283818244,
8550
+ "num_tokens": 77740032.0,
8551
+ "step": 9490
8552
+ },
8553
+ {
8554
+ "epoch": 2.6904804049987607,
8555
+ "grad_norm": 1.3090022802352905,
8556
+ "learning_rate": 1.1475925731668942e-06,
8557
+ "loss": 0.1944,
8558
+ "mean_token_accuracy": 0.7248899202793837,
8559
+ "num_tokens": 77821952.0,
8560
+ "step": 9500
8561
  }
8562
  ],
8563
  "logging_steps": 10,
 
8577
  "attributes": {}
8578
  }
8579
  },
8580
+ "total_flos": 2.0566822331036467e+17,
8581
  "train_batch_size": 2,
8582
  "trial_name": null,
8583
  "trial_params": null