Azrail committed on
Commit
5290b29
·
verified ·
1 Parent(s): ec0a567

Training in progress, step 60000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59a706f60964ffe8cd2b221f9a7465c0f56181a98072bee3057047cce8e408cf
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c41967e5432db5ed91bc1228a51744d8af764a94e341f801caf2cc8d0b340946
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ef2119eabf69c54d09db0a76c3313d847c900937c3e2edb463f3eba3b1000af
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70cad043527913fd0557530d296a1fe5bc45ca60997f5c855298840644081537
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfe4fcebd5141fdf7604535ed8dc60cda464d7e4d084d78ec5c9b7105325f9b5
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6a4cb233f004dcf5c1bd7310c625e6acfeb53e49f5aa9a513759dc7631fff0b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e5b084cf754d7494e17fb8efe3747874197d5052ad1bcb013283a3027835137
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9201fef1295387122e53aeeb3fe425d2797e674a7be3dba9faefda446e2071fd
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3968634335749828,
6
  "eval_steps": 500,
7
- "global_step": 59000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -10510,11 +10510,189 @@
10510
  "eval_steps_per_second": 23.346,
10511
  "num_input_tokens_seen": 15466496000,
10512
  "step": 59000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10513
  }
10514
  ],
10515
  "logging_steps": 50,
10516
  "max_steps": 60000,
10517
- "num_input_tokens_seen": 15466496000,
10518
  "num_train_epochs": 1,
10519
  "save_steps": 1000,
10520
  "stateful_callbacks": {
@@ -10524,12 +10702,12 @@
10524
  "should_evaluate": false,
10525
  "should_log": false,
10526
  "should_save": true,
10527
- "should_training_stop": false
10528
  },
10529
  "attributes": {}
10530
  }
10531
  },
10532
- "total_flos": 4.13743863300096e+18,
10533
  "train_batch_size": 64,
10534
  "trial_name": null,
10535
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.40358993244913505,
6
  "eval_steps": 500,
7
+ "global_step": 60000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
10510
  "eval_steps_per_second": 23.346,
10511
  "num_input_tokens_seen": 15466496000,
10512
  "step": 59000
10513
+ },
10514
+ {
10515
+ "epoch": 0.39719975851869044,
10516
+ "grad_norm": 0.14572475850582123,
10517
+ "learning_rate": 6.059144366901737e-05,
10518
+ "loss": 2.9861,
10519
+ "num_input_tokens_seen": 15479603200,
10520
+ "step": 59050
10521
+ },
10522
+ {
10523
+ "epoch": 0.39753608346239805,
10524
+ "grad_norm": 0.5027282238006592,
10525
+ "learning_rate": 5.449673790581611e-05,
10526
+ "loss": 2.9773,
10527
+ "num_input_tokens_seen": 15492710400,
10528
+ "step": 59100
10529
+ },
10530
+ {
10531
+ "epoch": 0.39787240840610566,
10532
+ "grad_norm": 0.192597895860672,
10533
+ "learning_rate": 4.87073578250698e-05,
10534
+ "loss": 2.9874,
10535
+ "num_input_tokens_seen": 15505817600,
10536
+ "step": 59150
10537
+ },
10538
+ {
10539
+ "epoch": 0.39820873334981327,
10540
+ "grad_norm": 0.15083667635917664,
10541
+ "learning_rate": 4.322727117869951e-05,
10542
+ "loss": 2.987,
10543
+ "num_input_tokens_seen": 15518924800,
10544
+ "step": 59200
10545
+ },
10546
+ {
10547
+ "epoch": 0.3985450582935209,
10548
+ "grad_norm": 0.14701534807682037,
10549
+ "learning_rate": 3.806023374435663e-05,
10550
+ "loss": 2.9858,
10551
+ "num_input_tokens_seen": 15532032000,
10552
+ "step": 59250
10553
+ },
10554
+ {
10555
+ "epoch": 0.3988813832372285,
10556
+ "grad_norm": 0.145115464925766,
10557
+ "learning_rate": 3.3209786751399184e-05,
10558
+ "loss": 2.9926,
10559
+ "num_input_tokens_seen": 15545139200,
10560
+ "step": 59300
10561
+ },
10562
+ {
10563
+ "epoch": 0.3992177081809361,
10564
+ "grad_norm": 0.15828457474708557,
10565
+ "learning_rate": 2.8679254453910786e-05,
10566
+ "loss": 2.9803,
10567
+ "num_input_tokens_seen": 15558246400,
10568
+ "step": 59350
10569
+ },
10570
+ {
10571
+ "epoch": 0.3995540331246437,
10572
+ "grad_norm": 0.14400678873062134,
10573
+ "learning_rate": 2.4471741852423235e-05,
10574
+ "loss": 2.9701,
10575
+ "num_input_tokens_seen": 15571353600,
10576
+ "step": 59400
10577
+ },
10578
+ {
10579
+ "epoch": 0.3998903580683513,
10580
+ "grad_norm": 0.14925344288349152,
10581
+ "learning_rate": 2.0590132565903473e-05,
10582
+ "loss": 2.989,
10583
+ "num_input_tokens_seen": 15584460800,
10584
+ "step": 59450
10585
+ },
10586
+ {
10587
+ "epoch": 0.40022668301205894,
10588
+ "grad_norm": 0.14081260561943054,
10589
+ "learning_rate": 1.70370868554659e-05,
10590
+ "loss": 2.9824,
10591
+ "num_input_tokens_seen": 15597568000,
10592
+ "step": 59500
10593
+ },
10594
+ {
10595
+ "epoch": 0.40022668301205894,
10596
+ "eval_loss": 2.882228136062622,
10597
+ "eval_runtime": 53.7595,
10598
+ "eval_samples_per_second": 93.007,
10599
+ "eval_steps_per_second": 23.252,
10600
+ "num_input_tokens_seen": 15597568000,
10601
+ "step": 59500
10602
+ },
10603
+ {
10604
+ "epoch": 0.40056300795576655,
10605
+ "grad_norm": 0.13585136830806732,
10606
+ "learning_rate": 1.3815039801161721e-05,
10607
+ "loss": 2.9883,
10608
+ "num_input_tokens_seen": 15610675200,
10609
+ "step": 59550
10610
+ },
10611
+ {
10612
+ "epoch": 0.40089933289947416,
10613
+ "grad_norm": 0.1438748985528946,
10614
+ "learning_rate": 1.0926199633097156e-05,
10615
+ "loss": 2.9781,
10616
+ "num_input_tokens_seen": 15623782400,
10617
+ "step": 59600
10618
+ },
10619
+ {
10620
+ "epoch": 0.40123565784318177,
10621
+ "grad_norm": 0.3345394730567932,
10622
+ "learning_rate": 8.372546218022748e-06,
10623
+ "loss": 2.9869,
10624
+ "num_input_tokens_seen": 15636889600,
10625
+ "step": 59650
10626
+ },
10627
+ {
10628
+ "epoch": 0.4015719827868894,
10629
+ "grad_norm": 0.14581316709518433,
10630
+ "learning_rate": 6.15582970243117e-06,
10631
+ "loss": 2.9882,
10632
+ "num_input_tokens_seen": 15649996800,
10633
+ "step": 59700
10634
+ },
10635
+ {
10636
+ "epoch": 0.401908307730597,
10637
+ "grad_norm": 0.1409323662519455,
10638
+ "learning_rate": 4.277569313094809e-06,
10639
+ "loss": 2.9833,
10640
+ "num_input_tokens_seen": 15663104000,
10641
+ "step": 59750
10642
+ },
10643
+ {
10644
+ "epoch": 0.4022446326743046,
10645
+ "grad_norm": 0.1412041187286377,
10646
+ "learning_rate": 2.739052315863355e-06,
10647
+ "loss": 2.9835,
10648
+ "num_input_tokens_seen": 15676211200,
10649
+ "step": 59800
10650
+ },
10651
+ {
10652
+ "epoch": 0.4025809576180122,
10653
+ "grad_norm": 0.14011850953102112,
10654
+ "learning_rate": 1.541333133436018e-06,
10655
+ "loss": 2.9819,
10656
+ "num_input_tokens_seen": 15689318400,
10657
+ "step": 59850
10658
+ },
10659
+ {
10660
+ "epoch": 0.4029172825617198,
10661
+ "grad_norm": 0.14772015810012817,
10662
+ "learning_rate": 6.852326227130834e-07,
10663
+ "loss": 2.9855,
10664
+ "num_input_tokens_seen": 15702425600,
10665
+ "step": 59900
10666
+ },
10667
+ {
10668
+ "epoch": 0.40325360750542744,
10669
+ "grad_norm": 0.14281156659126282,
10670
+ "learning_rate": 1.7133751222137007e-07,
10671
+ "loss": 2.978,
10672
+ "num_input_tokens_seen": 15715532800,
10673
+ "step": 59950
10674
+ },
10675
+ {
10676
+ "epoch": 0.40358993244913505,
10677
+ "grad_norm": 0.14420129358768463,
10678
+ "learning_rate": 0.0,
10679
+ "loss": 2.9789,
10680
+ "num_input_tokens_seen": 15728640000,
10681
+ "step": 60000
10682
+ },
10683
+ {
10684
+ "epoch": 0.40358993244913505,
10685
+ "eval_loss": 2.8818726539611816,
10686
+ "eval_runtime": 53.5982,
10687
+ "eval_samples_per_second": 93.287,
10688
+ "eval_steps_per_second": 23.322,
10689
+ "num_input_tokens_seen": 15728640000,
10690
+ "step": 60000
10691
  }
10692
  ],
10693
  "logging_steps": 50,
10694
  "max_steps": 60000,
10695
+ "num_input_tokens_seen": 15728640000,
10696
  "num_train_epochs": 1,
10697
  "save_steps": 1000,
10698
  "stateful_callbacks": {
 
10702
  "should_evaluate": false,
10703
  "should_log": false,
10704
  "should_save": true,
10705
+ "should_training_stop": true
10706
  },
10707
  "attributes": {}
10708
  }
10709
  },
10710
+ "total_flos": 4.2075647115264e+18,
10711
  "train_batch_size": 64,
10712
  "trial_name": null,
10713
  "trial_params": null