Azrail commited on
Commit
15f2bbe
·
verified ·
1 Parent(s): 5fafef5

Training in progress, step 53000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17f79bfd92c936c07be11debb700728ae4b7e0771937dc9aee38748f4dc80dc3
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76223d6bdee171cffd4cda1d9c4bbbab95942f789f412cecdfbcec4b8715383c
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fdcb0e96beb98fcdfd50cc3b612cd068e544f01ef0961afbf353f3d6eabba3ce
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e3b0c1d87658be3018021f1815500a16d4cf88fae3993a3710e48c97c61995c
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:746267b8ba996549a033d105e363328c635034a7afa0e3070ea8447957aaca5a
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5f53c01b35d1753a4f571c1ddd2b16976530a7b71c320877f1fbd74ce1de4ed
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24b3fcbecd3d55078c913506015bb6e1182f04ee52bf4c0845fc043823a61161
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3771019f4815646a43bbc09acce698c65d4ba61e6cbb0516a172314f7fbb077
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.34977794145591706,
6
  "eval_steps": 500,
7
- "global_step": 52000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9264,11 +9264,189 @@
9264
  "eval_steps_per_second": 23.525,
9265
  "num_input_tokens_seen": 13631488000,
9266
  "step": 52000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9267
  }
9268
  ],
9269
  "logging_steps": 50,
9270
  "max_steps": 60000,
9271
- "num_input_tokens_seen": 13631488000,
9272
  "num_train_epochs": 1,
9273
  "save_steps": 1000,
9274
  "stateful_callbacks": {
@@ -9283,7 +9461,7 @@
9283
  "attributes": {}
9284
  }
9285
  },
9286
- "total_flos": 3.64655608332288e+18,
9287
  "train_batch_size": 64,
9288
  "trial_name": null,
9289
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3565044403300693,
6
  "eval_steps": 500,
7
+ "global_step": 53000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9264
  "eval_steps_per_second": 23.525,
9265
  "num_input_tokens_seen": 13631488000,
9266
  "step": 52000
9267
+ },
9268
+ {
9269
+ "epoch": 0.3501142663996247,
9270
+ "grad_norm": 0.22132734954357147,
9271
+ "learning_rate": 0.001,
9272
+ "loss": 3.0564,
9273
+ "num_input_tokens_seen": 13644595200,
9274
+ "step": 52050
9275
+ },
9276
+ {
9277
+ "epoch": 0.3504505913433323,
9278
+ "grad_norm": 0.19554653763771057,
9279
+ "learning_rate": 0.001,
9280
+ "loss": 3.0457,
9281
+ "num_input_tokens_seen": 13657702400,
9282
+ "step": 52100
9283
+ },
9284
+ {
9285
+ "epoch": 0.3507869162870399,
9286
+ "grad_norm": 0.23935073614120483,
9287
+ "learning_rate": 0.001,
9288
+ "loss": 3.0465,
9289
+ "num_input_tokens_seen": 13670809600,
9290
+ "step": 52150
9291
+ },
9292
+ {
9293
+ "epoch": 0.3511232412307475,
9294
+ "grad_norm": 0.2895826995372772,
9295
+ "learning_rate": 0.001,
9296
+ "loss": 3.0509,
9297
+ "num_input_tokens_seen": 13683916800,
9298
+ "step": 52200
9299
+ },
9300
+ {
9301
+ "epoch": 0.3514595661744551,
9302
+ "grad_norm": 0.24599236249923706,
9303
+ "learning_rate": 0.001,
9304
+ "loss": 3.0385,
9305
+ "num_input_tokens_seen": 13697024000,
9306
+ "step": 52250
9307
+ },
9308
+ {
9309
+ "epoch": 0.35179589111816273,
9310
+ "grad_norm": 0.19500850141048431,
9311
+ "learning_rate": 0.001,
9312
+ "loss": 3.0523,
9313
+ "num_input_tokens_seen": 13710131200,
9314
+ "step": 52300
9315
+ },
9316
+ {
9317
+ "epoch": 0.35213221606187034,
9318
+ "grad_norm": 0.20790818333625793,
9319
+ "learning_rate": 0.001,
9320
+ "loss": 3.0547,
9321
+ "num_input_tokens_seen": 13723238400,
9322
+ "step": 52350
9323
+ },
9324
+ {
9325
+ "epoch": 0.35246854100557795,
9326
+ "grad_norm": 0.18653196096420288,
9327
+ "learning_rate": 0.001,
9328
+ "loss": 3.0545,
9329
+ "num_input_tokens_seen": 13736345600,
9330
+ "step": 52400
9331
+ },
9332
+ {
9333
+ "epoch": 0.35280486594928556,
9334
+ "grad_norm": 0.22097791731357574,
9335
+ "learning_rate": 0.001,
9336
+ "loss": 3.0573,
9337
+ "num_input_tokens_seen": 13749452800,
9338
+ "step": 52450
9339
+ },
9340
+ {
9341
+ "epoch": 0.3531411908929932,
9342
+ "grad_norm": 0.22931267321109772,
9343
+ "learning_rate": 0.001,
9344
+ "loss": 3.0478,
9345
+ "num_input_tokens_seen": 13762560000,
9346
+ "step": 52500
9347
+ },
9348
+ {
9349
+ "epoch": 0.3531411908929932,
9350
+ "eval_loss": 2.9459915161132812,
9351
+ "eval_runtime": 52.6495,
9352
+ "eval_samples_per_second": 94.968,
9353
+ "eval_steps_per_second": 23.742,
9354
+ "num_input_tokens_seen": 13762560000,
9355
+ "step": 52500
9356
+ },
9357
+ {
9358
+ "epoch": 0.3534775158367008,
9359
+ "grad_norm": 0.31109049916267395,
9360
+ "learning_rate": 0.001,
9361
+ "loss": 3.0462,
9362
+ "num_input_tokens_seen": 13775667200,
9363
+ "step": 52550
9364
+ },
9365
+ {
9366
+ "epoch": 0.3538138407804084,
9367
+ "grad_norm": 1.7297276258468628,
9368
+ "learning_rate": 0.001,
9369
+ "loss": 3.0629,
9370
+ "num_input_tokens_seen": 13788774400,
9371
+ "step": 52600
9372
+ },
9373
+ {
9374
+ "epoch": 0.354150165724116,
9375
+ "grad_norm": 0.4056268334388733,
9376
+ "learning_rate": 0.001,
9377
+ "loss": 3.0763,
9378
+ "num_input_tokens_seen": 13801881600,
9379
+ "step": 52650
9380
+ },
9381
+ {
9382
+ "epoch": 0.3544864906678236,
9383
+ "grad_norm": 0.3694227635860443,
9384
+ "learning_rate": 0.001,
9385
+ "loss": 3.099,
9386
+ "num_input_tokens_seen": 13814988800,
9387
+ "step": 52700
9388
+ },
9389
+ {
9390
+ "epoch": 0.35482281561153123,
9391
+ "grad_norm": 0.2708556056022644,
9392
+ "learning_rate": 0.001,
9393
+ "loss": 3.0985,
9394
+ "num_input_tokens_seen": 13828096000,
9395
+ "step": 52750
9396
+ },
9397
+ {
9398
+ "epoch": 0.35515914055523884,
9399
+ "grad_norm": 0.27150145173072815,
9400
+ "learning_rate": 0.001,
9401
+ "loss": 3.0694,
9402
+ "num_input_tokens_seen": 13841203200,
9403
+ "step": 52800
9404
+ },
9405
+ {
9406
+ "epoch": 0.35549546549894645,
9407
+ "grad_norm": 0.2626855969429016,
9408
+ "learning_rate": 0.001,
9409
+ "loss": 3.0642,
9410
+ "num_input_tokens_seen": 13854310400,
9411
+ "step": 52850
9412
+ },
9413
+ {
9414
+ "epoch": 0.35583179044265406,
9415
+ "grad_norm": 0.20539118349552155,
9416
+ "learning_rate": 0.001,
9417
+ "loss": 3.059,
9418
+ "num_input_tokens_seen": 13867417600,
9419
+ "step": 52900
9420
+ },
9421
+ {
9422
+ "epoch": 0.35616811538636167,
9423
+ "grad_norm": 0.21489828824996948,
9424
+ "learning_rate": 0.001,
9425
+ "loss": 3.054,
9426
+ "num_input_tokens_seen": 13880524800,
9427
+ "step": 52950
9428
+ },
9429
+ {
9430
+ "epoch": 0.3565044403300693,
9431
+ "grad_norm": 0.263488233089447,
9432
+ "learning_rate": 0.001,
9433
+ "loss": 3.044,
9434
+ "num_input_tokens_seen": 13893632000,
9435
+ "step": 53000
9436
+ },
9437
+ {
9438
+ "epoch": 0.3565044403300693,
9439
+ "eval_loss": 2.9570043087005615,
9440
+ "eval_runtime": 53.2194,
9441
+ "eval_samples_per_second": 93.951,
9442
+ "eval_steps_per_second": 23.488,
9443
+ "num_input_tokens_seen": 13893632000,
9444
+ "step": 53000
9445
  }
9446
  ],
9447
  "logging_steps": 50,
9448
  "max_steps": 60000,
9449
+ "num_input_tokens_seen": 13893632000,
9450
  "num_train_epochs": 1,
9451
  "save_steps": 1000,
9452
  "stateful_callbacks": {
 
9461
  "attributes": {}
9462
  }
9463
  },
9464
+ "total_flos": 3.71668216184832e+18,
9465
  "train_batch_size": 64,
9466
  "trial_name": null,
9467
  "trial_params": null