Wilsonwin commited on
Commit
6454d9b
·
verified ·
1 Parent(s): 631c79d

Training in progress, step 10500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:318c2656039c95a58242e4619aba90de89d286abfdd50c932ac46a5bbc6d6b36
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:289c5f7a117bc2146cbc4b2792b4927c7ce3188416b6d12c24b53c92eac18575
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fdbed07e432554d329c7e8d5c0f65220a1bfeee29ae26fa92a6aa0d5901ae56
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78aff07fca8298b71f09331247658b387bcda955f1390e18f38dbc6caf805220
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5948a5161f7923aa0acf66b01adf35dc2196a8acf5bd2c21227561e5bff45666
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88a0861f9132710b799b6fa2e167a1b0b3b522e3a288bf5f69138ff390819689
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53471871a37f3cc35b4a656a6f0cfda18046c304a91d9bf8b29b14eea2ccc156
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f96c5626b64f285225e7bd0540a942ee4b22f3baba9f0a0f2189b039b8bf46c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.6894745734076704,
6
  "eval_steps": 500,
7
- "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7175,6 +7175,364 @@
7175
  "eval_samples_per_second": 273.932,
7176
  "eval_steps_per_second": 5.753,
7177
  "step": 10000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7178
  }
7179
  ],
7180
  "logging_steps": 10,
@@ -7194,7 +7552,7 @@
7194
  "attributes": {}
7195
  }
7196
  },
7197
- "total_flos": 3.344547305037496e+17,
7198
  "train_batch_size": 48,
7199
  "trial_name": null,
7200
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.7739483020780538,
6
  "eval_steps": 500,
7
+ "global_step": 10500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7175
  "eval_samples_per_second": 273.932,
7176
  "eval_steps_per_second": 5.753,
7177
  "step": 10000
7178
+ },
7179
+ {
7180
+ "epoch": 1.691164047981078,
7181
+ "grad_norm": 0.47159892320632935,
7182
+ "learning_rate": 2.4865303937104007e-05,
7183
+ "loss": 4.28497314453125,
7184
+ "step": 10010
7185
+ },
7186
+ {
7187
+ "epoch": 1.6928535225544854,
7188
+ "grad_norm": 0.45282673835754395,
7189
+ "learning_rate": 2.460181551284876e-05,
7190
+ "loss": 4.316118621826172,
7191
+ "step": 10020
7192
+ },
7193
+ {
7194
+ "epoch": 1.6945429971278934,
7195
+ "grad_norm": 0.4685194492340088,
7196
+ "learning_rate": 2.433960581310091e-05,
7197
+ "loss": 4.295747375488281,
7198
+ "step": 10030
7199
+ },
7200
+ {
7201
+ "epoch": 1.6962324717013009,
7202
+ "grad_norm": 0.4553631842136383,
7203
+ "learning_rate": 2.4078677511694776e-05,
7204
+ "loss": 4.325288391113281,
7205
+ "step": 10040
7206
+ },
7207
+ {
7208
+ "epoch": 1.6979219462747086,
7209
+ "grad_norm": 0.46230271458625793,
7210
+ "learning_rate": 2.381903326939777e-05,
7211
+ "loss": 4.269796752929688,
7212
+ "step": 10050
7213
+ },
7214
+ {
7215
+ "epoch": 1.6996114208481163,
7216
+ "grad_norm": 0.4607372283935547,
7217
+ "learning_rate": 2.356067573388355e-05,
7218
+ "loss": 4.311262893676758,
7219
+ "step": 10060
7220
+ },
7221
+ {
7222
+ "epoch": 1.7013008954215239,
7223
+ "grad_norm": 0.4789498448371887,
7224
+ "learning_rate": 2.3303607539704628e-05,
7225
+ "loss": 4.303665542602539,
7226
+ "step": 10070
7227
+ },
7228
+ {
7229
+ "epoch": 1.7029903699949316,
7230
+ "grad_norm": 0.46580952405929565,
7231
+ "learning_rate": 2.3047831308265845e-05,
7232
+ "loss": 4.283160400390625,
7233
+ "step": 10080
7234
+ },
7235
+ {
7236
+ "epoch": 1.7046798445683393,
7237
+ "grad_norm": 0.4807932674884796,
7238
+ "learning_rate": 2.2793349647797372e-05,
7239
+ "loss": 4.308661270141601,
7240
+ "step": 10090
7241
+ },
7242
+ {
7243
+ "epoch": 1.7063693191417468,
7244
+ "grad_norm": 0.4682171046733856,
7245
+ "learning_rate": 2.2540165153328345e-05,
7246
+ "loss": 4.298659896850586,
7247
+ "step": 10100
7248
+ },
7249
+ {
7250
+ "epoch": 1.7080587937151546,
7251
+ "grad_norm": 0.46330752968788147,
7252
+ "learning_rate": 2.2288280406660237e-05,
7253
+ "loss": 4.294895935058594,
7254
+ "step": 10110
7255
+ },
7256
+ {
7257
+ "epoch": 1.7097482682885623,
7258
+ "grad_norm": 0.4711052179336548,
7259
+ "learning_rate": 2.2037697976340525e-05,
7260
+ "loss": 4.325272750854492,
7261
+ "step": 10120
7262
+ },
7263
+ {
7264
+ "epoch": 1.7114377428619698,
7265
+ "grad_norm": 0.47657638788223267,
7266
+ "learning_rate": 2.1788420417636704e-05,
7267
+ "loss": 4.280495834350586,
7268
+ "step": 10130
7269
+ },
7270
+ {
7271
+ "epoch": 1.7131272174353775,
7272
+ "grad_norm": 0.45199576020240784,
7273
+ "learning_rate": 2.1540450272509986e-05,
7274
+ "loss": 4.289173889160156,
7275
+ "step": 10140
7276
+ },
7277
+ {
7278
+ "epoch": 1.7148166920087853,
7279
+ "grad_norm": 0.4709782004356384,
7280
+ "learning_rate": 2.129379006958944e-05,
7281
+ "loss": 4.30334243774414,
7282
+ "step": 10150
7283
+ },
7284
+ {
7285
+ "epoch": 1.7165061665821928,
7286
+ "grad_norm": 0.4583008289337158,
7287
+ "learning_rate": 2.104844232414634e-05,
7288
+ "loss": 4.33288459777832,
7289
+ "step": 10160
7290
+ },
7291
+ {
7292
+ "epoch": 1.7181956411556008,
7293
+ "grad_norm": 0.4560486674308777,
7294
+ "learning_rate": 2.080440953806844e-05,
7295
+ "loss": 4.312181091308593,
7296
+ "step": 10170
7297
+ },
7298
+ {
7299
+ "epoch": 1.7198851157290083,
7300
+ "grad_norm": 0.45241913199424744,
7301
+ "learning_rate": 2.056169419983432e-05,
7302
+ "loss": 4.302781677246093,
7303
+ "step": 10180
7304
+ },
7305
+ {
7306
+ "epoch": 1.721574590302416,
7307
+ "grad_norm": 0.4574364125728607,
7308
+ "learning_rate": 2.0320298784488177e-05,
7309
+ "loss": 4.296425628662109,
7310
+ "step": 10190
7311
+ },
7312
+ {
7313
+ "epoch": 1.7232640648758237,
7314
+ "grad_norm": 0.4723096489906311,
7315
+ "learning_rate": 2.008022575361464e-05,
7316
+ "loss": 4.3003795623779295,
7317
+ "step": 10200
7318
+ },
7319
+ {
7320
+ "epoch": 1.7249535394492312,
7321
+ "grad_norm": 0.4733101427555084,
7322
+ "learning_rate": 1.9841477555313428e-05,
7323
+ "loss": 4.288959503173828,
7324
+ "step": 10210
7325
+ },
7326
+ {
7327
+ "epoch": 1.726643014022639,
7328
+ "grad_norm": 0.4613873362541199,
7329
+ "learning_rate": 1.960405662417458e-05,
7330
+ "loss": 4.3150989532470705,
7331
+ "step": 10220
7332
+ },
7333
+ {
7334
+ "epoch": 1.7283324885960467,
7335
+ "grad_norm": 0.47345536947250366,
7336
+ "learning_rate": 1.9367965381253632e-05,
7337
+ "loss": 4.287479400634766,
7338
+ "step": 10230
7339
+ },
7340
+ {
7341
+ "epoch": 1.7300219631694542,
7342
+ "grad_norm": 0.4718509912490845,
7343
+ "learning_rate": 1.9133206234046833e-05,
7344
+ "loss": 4.321730422973633,
7345
+ "step": 10240
7346
+ },
7347
+ {
7348
+ "epoch": 1.731711437742862,
7349
+ "grad_norm": 0.455735445022583,
7350
+ "learning_rate": 1.8899781576466605e-05,
7351
+ "loss": 4.2946220397949215,
7352
+ "step": 10250
7353
+ },
7354
+ {
7355
+ "epoch": 1.7334009123162697,
7356
+ "grad_norm": 0.4965671896934509,
7357
+ "learning_rate": 1.86676937888172e-05,
7358
+ "loss": 4.301831817626953,
7359
+ "step": 10260
7360
+ },
7361
+ {
7362
+ "epoch": 1.7350903868896772,
7363
+ "grad_norm": 0.4581054449081421,
7364
+ "learning_rate": 1.8436945237770347e-05,
7365
+ "loss": 4.305143737792969,
7366
+ "step": 10270
7367
+ },
7368
+ {
7369
+ "epoch": 1.736779861463085,
7370
+ "grad_norm": 0.4591616094112396,
7371
+ "learning_rate": 1.8207538276341255e-05,
7372
+ "loss": 4.309583282470703,
7373
+ "step": 10280
7374
+ },
7375
+ {
7376
+ "epoch": 1.7384693360364927,
7377
+ "grad_norm": 0.4735301733016968,
7378
+ "learning_rate": 1.7979475243864422e-05,
7379
+ "loss": 4.28990478515625,
7380
+ "step": 10290
7381
+ },
7382
+ {
7383
+ "epoch": 1.7401588106099002,
7384
+ "grad_norm": 0.46391761302948,
7385
+ "learning_rate": 1.7752758465969835e-05,
7386
+ "loss": 4.2906452178955075,
7387
+ "step": 10300
7388
+ },
7389
+ {
7390
+ "epoch": 1.7418482851833081,
7391
+ "grad_norm": 0.4546545147895813,
7392
+ "learning_rate": 1.7527390254559564e-05,
7393
+ "loss": 4.305644226074219,
7394
+ "step": 10310
7395
+ },
7396
+ {
7397
+ "epoch": 1.7435377597567157,
7398
+ "grad_norm": 0.4470182955265045,
7399
+ "learning_rate": 1.7303372907783646e-05,
7400
+ "loss": 4.288211059570313,
7401
+ "step": 10320
7402
+ },
7403
+ {
7404
+ "epoch": 1.7452272343301232,
7405
+ "grad_norm": 0.4606943726539612,
7406
+ "learning_rate": 1.708070871001704e-05,
7407
+ "loss": 4.294968795776367,
7408
+ "step": 10330
7409
+ },
7410
+ {
7411
+ "epoch": 1.7469167089035311,
7412
+ "grad_norm": 0.4543667733669281,
7413
+ "learning_rate": 1.6859399931836182e-05,
7414
+ "loss": 4.301618194580078,
7415
+ "step": 10340
7416
+ },
7417
+ {
7418
+ "epoch": 1.7486061834769386,
7419
+ "grad_norm": 0.472310870885849,
7420
+ "learning_rate": 1.663944882999596e-05,
7421
+ "loss": 4.318760681152344,
7422
+ "step": 10350
7423
+ },
7424
+ {
7425
+ "epoch": 1.7502956580503464,
7426
+ "grad_norm": 0.44963911175727844,
7427
+ "learning_rate": 1.6420857647406533e-05,
7428
+ "loss": 4.308442687988281,
7429
+ "step": 10360
7430
+ },
7431
+ {
7432
+ "epoch": 1.751985132623754,
7433
+ "grad_norm": 0.45367759466171265,
7434
+ "learning_rate": 1.6203628613110513e-05,
7435
+ "loss": 4.320900344848633,
7436
+ "step": 10370
7437
+ },
7438
+ {
7439
+ "epoch": 1.7536746071971616,
7440
+ "grad_norm": 0.4687769114971161,
7441
+ "learning_rate": 1.598776394226035e-05,
7442
+ "loss": 4.342009735107422,
7443
+ "step": 10380
7444
+ },
7445
+ {
7446
+ "epoch": 1.7553640817705694,
7447
+ "grad_norm": 0.4652376174926758,
7448
+ "learning_rate": 1.5773265836095615e-05,
7449
+ "loss": 4.283346557617188,
7450
+ "step": 10390
7451
+ },
7452
+ {
7453
+ "epoch": 1.757053556343977,
7454
+ "grad_norm": 0.44677111506462097,
7455
+ "learning_rate": 1.5560136481920583e-05,
7456
+ "loss": 4.30499496459961,
7457
+ "step": 10400
7458
+ },
7459
+ {
7460
+ "epoch": 1.7587430309173846,
7461
+ "grad_norm": 0.4536132514476776,
7462
+ "learning_rate": 1.5348378053081885e-05,
7463
+ "loss": 4.284192657470703,
7464
+ "step": 10410
7465
+ },
7466
+ {
7467
+ "epoch": 1.7604325054907923,
7468
+ "grad_norm": 0.4781353175640106,
7469
+ "learning_rate": 1.5137992708946522e-05,
7470
+ "loss": 4.299782180786133,
7471
+ "step": 10420
7472
+ },
7473
+ {
7474
+ "epoch": 1.7621219800642,
7475
+ "grad_norm": 0.46639731526374817,
7476
+ "learning_rate": 1.4928982594879602e-05,
7477
+ "loss": 4.301108169555664,
7478
+ "step": 10430
7479
+ },
7480
+ {
7481
+ "epoch": 1.7638114546376076,
7482
+ "grad_norm": 0.4624445140361786,
7483
+ "learning_rate": 1.4721349842222623e-05,
7484
+ "loss": 4.283761596679687,
7485
+ "step": 10440
7486
+ },
7487
+ {
7488
+ "epoch": 1.7655009292110155,
7489
+ "grad_norm": 0.47024649381637573,
7490
+ "learning_rate": 1.4515096568271728e-05,
7491
+ "loss": 4.299430084228516,
7492
+ "step": 10450
7493
+ },
7494
+ {
7495
+ "epoch": 1.767190403784423,
7496
+ "grad_norm": 0.4745561182498932,
7497
+ "learning_rate": 1.4310224876256071e-05,
7498
+ "loss": 4.319121551513672,
7499
+ "step": 10460
7500
+ },
7501
+ {
7502
+ "epoch": 1.7688798783578306,
7503
+ "grad_norm": 0.4728463292121887,
7504
+ "learning_rate": 1.410673685531638e-05,
7505
+ "loss": 4.306048583984375,
7506
+ "step": 10470
7507
+ },
7508
+ {
7509
+ "epoch": 1.7705693529312385,
7510
+ "grad_norm": 0.460742712020874,
7511
+ "learning_rate": 1.390463458048357e-05,
7512
+ "loss": 4.31497802734375,
7513
+ "step": 10480
7514
+ },
7515
+ {
7516
+ "epoch": 1.772258827504646,
7517
+ "grad_norm": 0.46218180656433105,
7518
+ "learning_rate": 1.3703920112657856e-05,
7519
+ "loss": 4.3034709930419925,
7520
+ "step": 10490
7521
+ },
7522
+ {
7523
+ "epoch": 1.7739483020780538,
7524
+ "grad_norm": 0.46378350257873535,
7525
+ "learning_rate": 1.3504595498587378e-05,
7526
+ "loss": 4.3008544921875,
7527
+ "step": 10500
7528
+ },
7529
+ {
7530
+ "epoch": 1.7739483020780538,
7531
+ "eval_loss": 4.253804683685303,
7532
+ "eval_runtime": 3.6144,
7533
+ "eval_samples_per_second": 276.668,
7534
+ "eval_steps_per_second": 5.81,
7535
+ "step": 10500
7536
  }
7537
  ],
7538
  "logging_steps": 10,
 
7552
  "attributes": {}
7553
  }
7554
  },
7555
+ "total_flos": 3.511775715466936e+17,
7556
  "train_batch_size": 48,
7557
  "trial_name": null,
7558
  "trial_params": null