Azrail commited on
Commit
02c54cb
·
verified ·
1 Parent(s): f02308b

Training in progress, step 70000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c67ab3cac009a5afdc201af7f0117dd68a478413d54e0923fe125d5f63dd515
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0815144315751afde957889b3801664e381aaf78af5aaa224fc2449fb124f643
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d01209ddef39b46affb20fe03502cb8000499194b31764df158aa95dc134101e
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1eb8d729362c485fd51b577b67e8426946112c542b67c6dbee290cc17eda6309
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1397d04798a1fd86f4b074ba5cc769a269eab9bb0994d2bcfee86faa58f609a6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f91f0395ad8bb44fd81f1444330dede040f6b66dbc15e61e2a7fe4c1ef60aa2a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bba508cada3fb6a2130ffab8142880b38ad6264731466b5965eb74743d23afc9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c3a675e6db9104dd282c679a41cb4bdc17a98118d756c948d809458e24a6b37
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4641284223165053,
6
  "eval_steps": 500,
7
- "global_step": 69000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12290,11 +12290,189 @@
12290
  "eval_steps_per_second": 23.529,
12291
  "num_input_tokens_seen": 18087936000,
12292
  "step": 69000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12293
  }
12294
  ],
12295
  "logging_steps": 50,
12296
  "max_steps": 70000,
12297
- "num_input_tokens_seen": 18087936000,
12298
  "num_train_epochs": 1,
12299
  "save_steps": 1000,
12300
  "stateful_callbacks": {
@@ -12304,12 +12482,12 @@
12304
  "should_evaluate": false,
12305
  "should_log": false,
12306
  "should_save": true,
12307
- "should_training_stop": false
12308
  },
12309
  "attributes": {}
12310
  }
12311
  },
12312
- "total_flos": 4.83869941825536e+18,
12313
  "train_batch_size": 64,
12314
  "trial_name": null,
12315
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4708549211906576,
6
  "eval_steps": 500,
7
+ "global_step": 70000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12290
  "eval_steps_per_second": 23.529,
12291
  "num_input_tokens_seen": 18087936000,
12292
  "step": 69000
12293
+ },
12294
+ {
12295
+ "epoch": 0.4644647472602129,
12296
+ "grad_norm": 0.1424110382795334,
12297
+ "learning_rate": 8.67336033464411e-06,
12298
+ "loss": 2.9591,
12299
+ "num_input_tokens_seen": 18101043200,
12300
+ "step": 69050
12301
+ },
12302
+ {
12303
+ "epoch": 0.46480107220392053,
12304
+ "grad_norm": 0.14686723053455353,
12305
+ "learning_rate": 7.786715955054202e-06,
12306
+ "loss": 2.9561,
12307
+ "num_input_tokens_seen": 18114150400,
12308
+ "step": 69100
12309
+ },
12310
+ {
12311
+ "epoch": 0.46513739714762814,
12312
+ "grad_norm": 0.13719068467617035,
12313
+ "learning_rate": 6.947512116245669e-06,
12314
+ "loss": 2.9629,
12315
+ "num_input_tokens_seen": 18127257600,
12316
+ "step": 69150
12317
+ },
12318
+ {
12319
+ "epoch": 0.46547372209133575,
12320
+ "grad_norm": 0.14337210357189178,
12321
+ "learning_rate": 6.15582970243117e-06,
12322
+ "loss": 2.9713,
12323
+ "num_input_tokens_seen": 18140364800,
12324
+ "step": 69200
12325
+ },
12326
+ {
12327
+ "epoch": 0.46581004703504336,
12328
+ "grad_norm": 0.18305008113384247,
12329
+ "learning_rate": 5.411745017609493e-06,
12330
+ "loss": 2.9659,
12331
+ "num_input_tokens_seen": 18153472000,
12332
+ "step": 69250
12333
+ },
12334
+ {
12335
+ "epoch": 0.466146371978751,
12336
+ "grad_norm": 0.137322798371315,
12337
+ "learning_rate": 4.715329778211374e-06,
12338
+ "loss": 2.9678,
12339
+ "num_input_tokens_seen": 18166579200,
12340
+ "step": 69300
12341
+ },
12342
+ {
12343
+ "epoch": 0.4664826969224586,
12344
+ "grad_norm": 0.13300293684005737,
12345
+ "learning_rate": 4.066651106186981e-06,
12346
+ "loss": 2.9647,
12347
+ "num_input_tokens_seen": 18179686400,
12348
+ "step": 69350
12349
+ },
12350
+ {
12351
+ "epoch": 0.4668190218661662,
12352
+ "grad_norm": 0.13357709348201752,
12353
+ "learning_rate": 3.4657715225368535e-06,
12354
+ "loss": 2.965,
12355
+ "num_input_tokens_seen": 18192793600,
12356
+ "step": 69400
12357
+ },
12358
+ {
12359
+ "epoch": 0.4671553468098738,
12360
+ "grad_norm": 0.13399702310562134,
12361
+ "learning_rate": 2.9127489412859033e-06,
12362
+ "loss": 2.9614,
12363
+ "num_input_tokens_seen": 18205900800,
12364
+ "step": 69450
12365
+ },
12366
+ {
12367
+ "epoch": 0.4674916717535814,
12368
+ "grad_norm": 0.13703274726867676,
12369
+ "learning_rate": 2.4076366639015913e-06,
12370
+ "loss": 2.964,
12371
+ "num_input_tokens_seen": 18219008000,
12372
+ "step": 69500
12373
+ },
12374
+ {
12375
+ "epoch": 0.4674916717535814,
12376
+ "eval_loss": 2.8645894527435303,
12377
+ "eval_runtime": 53.3524,
12378
+ "eval_samples_per_second": 93.716,
12379
+ "eval_steps_per_second": 23.429,
12380
+ "num_input_tokens_seen": 18219008000,
12381
+ "step": 69500
12382
+ },
12383
+ {
12384
+ "epoch": 0.46782799669728903,
12385
+ "grad_norm": 0.3837803900241852,
12386
+ "learning_rate": 1.950483374156431e-06,
12387
+ "loss": 2.9665,
12388
+ "num_input_tokens_seen": 18232115200,
12389
+ "step": 69550
12390
+ },
12391
+ {
12392
+ "epoch": 0.46816432164099664,
12393
+ "grad_norm": 0.13585589826107025,
12394
+ "learning_rate": 1.541333133436018e-06,
12395
+ "loss": 2.9579,
12396
+ "num_input_tokens_seen": 18245222400,
12397
+ "step": 69600
12398
+ },
12399
+ {
12400
+ "epoch": 0.4685006465847043,
12401
+ "grad_norm": 0.13347585499286652,
12402
+ "learning_rate": 1.18022537649215e-06,
12403
+ "loss": 2.9636,
12404
+ "num_input_tokens_seen": 18258329600,
12405
+ "step": 69650
12406
+ },
12407
+ {
12408
+ "epoch": 0.4688369715284119,
12409
+ "grad_norm": 0.13726544380187988,
12410
+ "learning_rate": 8.671949076420882e-07,
12411
+ "loss": 2.9626,
12412
+ "num_input_tokens_seen": 18271436800,
12413
+ "step": 69700
12414
+ },
12415
+ {
12416
+ "epoch": 0.4691732964721195,
12417
+ "grad_norm": 0.14254987239837646,
12418
+ "learning_rate": 6.022718974137975e-07,
12419
+ "loss": 2.9698,
12420
+ "num_input_tokens_seen": 18284544000,
12421
+ "step": 69750
12422
+ },
12423
+ {
12424
+ "epoch": 0.46950962141582714,
12425
+ "grad_norm": 0.1329219937324524,
12426
+ "learning_rate": 3.854818796385495e-07,
12427
+ "loss": 2.96,
12428
+ "num_input_tokens_seen": 18297651200,
12429
+ "step": 69800
12430
+ },
12431
+ {
12432
+ "epoch": 0.46984594635953475,
12433
+ "grad_norm": 0.1384582668542862,
12434
+ "learning_rate": 2.1684574898939157e-07,
12435
+ "loss": 2.9693,
12436
+ "num_input_tokens_seen": 18310758400,
12437
+ "step": 69850
12438
+ },
12439
+ {
12440
+ "epoch": 0.47018227130324236,
12441
+ "grad_norm": 0.14365264773368835,
12442
+ "learning_rate": 9.637975896759077e-08,
12443
+ "loss": 2.9686,
12444
+ "num_input_tokens_seen": 18323865600,
12445
+ "step": 69900
12446
+ },
12447
+ {
12448
+ "epoch": 0.47051859624694997,
12449
+ "grad_norm": 0.13613733649253845,
12450
+ "learning_rate": 2.4095520335998265e-08,
12451
+ "loss": 2.9607,
12452
+ "num_input_tokens_seen": 18336972800,
12453
+ "step": 69950
12454
+ },
12455
+ {
12456
+ "epoch": 0.4708549211906576,
12457
+ "grad_norm": 0.14377959072589874,
12458
+ "learning_rate": 0.0,
12459
+ "loss": 2.9684,
12460
+ "num_input_tokens_seen": 18350080000,
12461
+ "step": 70000
12462
+ },
12463
+ {
12464
+ "epoch": 0.4708549211906576,
12465
+ "eval_loss": 2.8644959926605225,
12466
+ "eval_runtime": 54.0337,
12467
+ "eval_samples_per_second": 92.535,
12468
+ "eval_steps_per_second": 23.134,
12469
+ "num_input_tokens_seen": 18350080000,
12470
+ "step": 70000
12471
  }
12472
  ],
12473
  "logging_steps": 50,
12474
  "max_steps": 70000,
12475
+ "num_input_tokens_seen": 18350080000,
12476
  "num_train_epochs": 1,
12477
  "save_steps": 1000,
12478
  "stateful_callbacks": {
 
12482
  "should_evaluate": false,
12483
  "should_log": false,
12484
  "should_save": true,
12485
+ "should_training_stop": true
12486
  },
12487
  "attributes": {}
12488
  }
12489
  },
12490
+ "total_flos": 4.9088254967808e+18,
12491
  "train_batch_size": 64,
12492
  "trial_name": null,
12493
  "trial_params": null