ccore committed on
Commit
d9fa30c
·
verified ·
1 Parent(s): 69d352e

Training in progress, epoch 10, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18aa3793eafce3749c6627f617fc9beefd20e5e39a72c4be54bc7466a8da3a58
3
  size 504109968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:400714933e436ed4942f6ac14653ba65257ac6b5a7d4ce477d416ba55e17474e
3
  size 504109968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:769e2d328832f9e8977633a3bf16aca606696a0874a02a1edd348042a65107cd
3
  size 1008339066
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a3e3d7e456d6422407d86032db6f5536b675b02f9b9cb0ae22f5ea3f872fd94
3
  size 1008339066
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e44951d2ae26589c4948133e119adf0d1d7d8c7788a1be4749c0b21311e2d966
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:961c3f8777af59bc7b4c75a96dab229e529cd80db0c10138ac15c22505754430
3
  size 14244
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd9f3ba97711824f0c4f1f355e670342edfe13d4e26033a3152ec68013cdb3f1
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4fc465b0742a760c8a88fa10c0e66b96b132949734205aaaadb5090b96a9191
3
  size 988
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:617e3f89c08ec8ae94159021694f0a7719bab4370769175cb516700c1b355f9f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9906eb8c72456a53a67a4ac3ea98c57e6203ca53bfe5b869a429143d2009f53
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 2484,
3
  "best_metric": 1.949218511581421,
4
  "best_model_checkpoint": "./opt_thinker_ckpts/checkpoint-2484",
5
- "epoch": 9.0,
6
  "eval_steps": 500,
7
- "global_step": 2484,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -423,6 +423,56 @@
423
  "eval_samples_per_second": 8.425,
424
  "eval_steps_per_second": 4.212,
425
  "step": 2484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
  }
427
  ],
428
  "logging_steps": 50,
@@ -437,12 +487,12 @@
437
  "should_evaluate": false,
438
  "should_log": false,
439
  "should_save": true,
440
- "should_training_stop": false
441
  },
442
  "attributes": {}
443
  }
444
  },
445
- "total_flos": 3.792025365696e+16,
446
  "train_batch_size": 2,
447
  "trial_name": null,
448
  "trial_params": null
 
2
  "best_global_step": 2484,
3
  "best_metric": 1.949218511581421,
4
  "best_model_checkpoint": "./opt_thinker_ckpts/checkpoint-2484",
5
+ "epoch": 10.0,
6
  "eval_steps": 500,
7
+ "global_step": 2760,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
423
  "eval_samples_per_second": 8.425,
424
  "eval_steps_per_second": 4.212,
425
  "step": 2484
426
+ },
427
+ {
428
+ "epoch": 9.058023572076156,
429
+ "grad_norm": 12.567099571228027,
430
+ "learning_rate": 2.257449032190323e-06,
431
+ "loss": 12.8177,
432
+ "step": 2500
433
+ },
434
+ {
435
+ "epoch": 9.239347234814144,
436
+ "grad_norm": 12.118515014648438,
437
+ "learning_rate": 1.4898066712173974e-06,
438
+ "loss": 12.8555,
439
+ "step": 2550
440
+ },
441
+ {
442
+ "epoch": 9.42067089755213,
443
+ "grad_norm": 12.42186164855957,
444
+ "learning_rate": 8.792502686258752e-07,
445
+ "loss": 12.9066,
446
+ "step": 2600
447
+ },
448
+ {
449
+ "epoch": 9.601994560290118,
450
+ "grad_norm": 12.5271635055542,
451
+ "learning_rate": 4.277569313094809e-07,
452
+ "loss": 12.9596,
453
+ "step": 2650
454
+ },
455
+ {
456
+ "epoch": 9.783318223028106,
457
+ "grad_norm": 12.402328491210938,
458
+ "learning_rate": 1.3678868732311946e-07,
459
+ "loss": 12.8895,
460
+ "step": 2700
461
+ },
462
+ {
463
+ "epoch": 9.964641885766092,
464
+ "grad_norm": 12.048426628112793,
465
+ "learning_rate": 7.287751536050324e-09,
466
+ "loss": 12.9195,
467
+ "step": 2750
468
+ },
469
+ {
470
+ "epoch": 10.0,
471
+ "eval_loss": 1.9497145414352417,
472
+ "eval_runtime": 58.2174,
473
+ "eval_samples_per_second": 8.417,
474
+ "eval_steps_per_second": 4.208,
475
+ "step": 2760
476
  }
477
  ],
478
  "logging_steps": 50,
 
487
  "should_evaluate": false,
488
  "should_log": false,
489
  "should_save": true,
490
+ "should_training_stop": true
491
  },
492
  "attributes": {}
493
  }
494
  },
495
+ "total_flos": 4.213723912704e+16,
496
  "train_batch_size": 2,
497
  "trial_name": null,
498
  "trial_params": null