Azrail commited on
Commit
d083a74
·
verified ·
1 Parent(s): fa67c72

Training in progress, step 66000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ad9801ec7b3ea03c8febaf16be0cca903ae6c5e7ba16db1d0ab836be5805c8b
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91b3f88ddbda82d579d7e857e17e157a938e94cf97682c36dea7a9e8ddcf3d14
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcaa21f2d1112b5786bb6cb8a7af07df0a486ccdc4e343d067ea09aba3ebc0cf
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1773eecaec3a2d8883e5d344c33d10650e6ebcee793cb11cc46ab81989c4cf9e
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5abe0ab18889dbab668e6d9fae1d62109a3226e616d0e681a91c9a668ea4330
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5506f8ab70fc0520e3fcff77fee663d3576573119296fd847d8ec1a26a45a3cf
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b166fab474c8d8470da4ff5d475f9ae65d65d8dd07f0e702e6e8c799bab73616
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a07bef738a41ab3ac6ef10bbe9890f379f768870bcb200cb24b86bcef1753cd
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.31005163552237736,
6
  "eval_steps": 500,
7
- "global_step": 65000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11578,11 +11578,189 @@
11578
  "eval_steps_per_second": 23.41,
11579
  "num_input_tokens_seen": 17039355456,
11580
  "step": 65000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11581
  }
11582
  ],
11583
  "logging_steps": 50,
11584
  "max_steps": 70000,
11585
- "num_input_tokens_seen": 17039355456,
11586
  "num_train_epochs": 1,
11587
  "save_steps": 1000,
11588
  "stateful_callbacks": {
@@ -11597,7 +11775,7 @@
11597
  "attributes": {}
11598
  }
11599
  },
11600
- "total_flos": 4.5581938885892506e+18,
11601
  "train_batch_size": 64,
11602
  "trial_name": null,
11603
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3148216606842601,
6
  "eval_steps": 500,
7
+ "global_step": 66000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11578
  "eval_steps_per_second": 23.41,
11579
  "num_input_tokens_seen": 17039355456,
11580
  "step": 65000
11581
+ },
11582
+ {
11583
+ "epoch": 0.3102901367804715,
11584
+ "grad_norm": 0.17116400599479675,
11585
+ "learning_rate": 0.000278017467984759,
11586
+ "loss": 2.5504,
11587
+ "num_input_tokens_seen": 17052462656,
11588
+ "step": 65050
11589
+ },
11590
+ {
11591
+ "epoch": 0.3105286380385657,
11592
+ "grad_norm": 0.17055106163024902,
11593
+ "learning_rate": 0.00027300475013022663,
11594
+ "loss": 2.543,
11595
+ "num_input_tokens_seen": 17065569856,
11596
+ "step": 65100
11597
+ },
11598
+ {
11599
+ "epoch": 0.3107671392966598,
11600
+ "grad_norm": 0.17849299311637878,
11601
+ "learning_rate": 0.000268020607911083,
11602
+ "loss": 2.5476,
11603
+ "num_input_tokens_seen": 17078677056,
11604
+ "step": 65150
11605
+ },
11606
+ {
11607
+ "epoch": 0.31100564055475394,
11608
+ "grad_norm": 0.17608341574668884,
11609
+ "learning_rate": 0.0002630656687635007,
11610
+ "loss": 2.5452,
11611
+ "num_input_tokens_seen": 17091784256,
11612
+ "step": 65200
11613
+ },
11614
+ {
11615
+ "epoch": 0.31124414181284804,
11616
+ "grad_norm": 0.19086676836013794,
11617
+ "learning_rate": 0.0002581405564473801,
11618
+ "loss": 2.5562,
11619
+ "num_input_tokens_seen": 17104891456,
11620
+ "step": 65250
11621
+ },
11622
+ {
11623
+ "epoch": 0.3114826430709422,
11624
+ "grad_norm": 0.1721603125333786,
11625
+ "learning_rate": 0.00025324589096782657,
11626
+ "loss": 2.5402,
11627
+ "num_input_tokens_seen": 17117998656,
11628
+ "step": 65300
11629
+ },
11630
+ {
11631
+ "epoch": 0.31172114432903636,
11632
+ "grad_norm": 0.16727598011493683,
11633
+ "learning_rate": 0.00024838228849709997,
11634
+ "loss": 2.5253,
11635
+ "num_input_tokens_seen": 17131105856,
11636
+ "step": 65350
11637
+ },
11638
+ {
11639
+ "epoch": 0.31195964558713046,
11640
+ "grad_norm": 0.1664544939994812,
11641
+ "learning_rate": 0.000243550361297047,
11642
+ "loss": 2.5519,
11643
+ "num_input_tokens_seen": 17144213056,
11644
+ "step": 65400
11645
+ },
11646
+ {
11647
+ "epoch": 0.3121981468452246,
11648
+ "grad_norm": 0.17195752263069153,
11649
+ "learning_rate": 0.00023875071764202561,
11650
+ "loss": 2.5297,
11651
+ "num_input_tokens_seen": 17157320256,
11652
+ "step": 65450
11653
+ },
11654
+ {
11655
+ "epoch": 0.3124366481033187,
11656
+ "grad_norm": 0.19001176953315735,
11657
+ "learning_rate": 0.00023398396174233177,
11658
+ "loss": 2.5439,
11659
+ "num_input_tokens_seen": 17170427456,
11660
+ "step": 65500
11661
+ },
11662
+ {
11663
+ "epoch": 0.3124366481033187,
11664
+ "eval_loss": 2.426327705383301,
11665
+ "eval_runtime": 53.7603,
11666
+ "eval_samples_per_second": 93.005,
11667
+ "eval_steps_per_second": 23.251,
11668
+ "num_input_tokens_seen": 17170427456,
11669
+ "step": 65500
11670
+ },
11671
+ {
11672
+ "epoch": 0.3126751493614129,
11673
+ "grad_norm": 0.17215538024902344,
11674
+ "learning_rate": 0.00022925069366813716,
11675
+ "loss": 2.5442,
11676
+ "num_input_tokens_seen": 17183534656,
11677
+ "step": 65550
11678
+ },
11679
+ {
11680
+ "epoch": 0.31291365061950704,
11681
+ "grad_norm": 0.16736114025115967,
11682
+ "learning_rate": 0.0002245515092739488,
11683
+ "loss": 2.5472,
11684
+ "num_input_tokens_seen": 17196641856,
11685
+ "step": 65600
11686
+ },
11687
+ {
11688
+ "epoch": 0.31315215187760115,
11689
+ "grad_norm": 0.1739792823791504,
11690
+ "learning_rate": 0.00021988700012359863,
11691
+ "loss": 2.5401,
11692
+ "num_input_tokens_seen": 17209749056,
11693
+ "step": 65650
11694
+ },
11695
+ {
11696
+ "epoch": 0.3133906531356953,
11697
+ "grad_norm": 0.17363224923610687,
11698
+ "learning_rate": 0.00021525775341577403,
11699
+ "loss": 2.5539,
11700
+ "num_input_tokens_seen": 17222856256,
11701
+ "step": 65700
11702
+ },
11703
+ {
11704
+ "epoch": 0.3136291543937894,
11705
+ "grad_norm": 0.16787610948085785,
11706
+ "learning_rate": 0.00021066435191009715,
11707
+ "loss": 2.5338,
11708
+ "num_input_tokens_seen": 17235963456,
11709
+ "step": 65750
11710
+ },
11711
+ {
11712
+ "epoch": 0.31386765565188357,
11713
+ "grad_norm": 0.17158125340938568,
11714
+ "learning_rate": 0.00020610737385376348,
11715
+ "loss": 2.5531,
11716
+ "num_input_tokens_seen": 17249070656,
11717
+ "step": 65800
11718
+ },
11719
+ {
11720
+ "epoch": 0.3141061569099777,
11721
+ "grad_norm": 0.1693524569272995,
11722
+ "learning_rate": 0.00020158739290874821,
11723
+ "loss": 2.5286,
11724
+ "num_input_tokens_seen": 17262177856,
11725
+ "step": 65850
11726
+ },
11727
+ {
11728
+ "epoch": 0.31434465816807183,
11729
+ "grad_norm": 0.1730414181947708,
11730
+ "learning_rate": 0.0001971049780795901,
11731
+ "loss": 2.5228,
11732
+ "num_input_tokens_seen": 17275285056,
11733
+ "step": 65900
11734
+ },
11735
+ {
11736
+ "epoch": 0.314583159426166,
11737
+ "grad_norm": 0.16220349073410034,
11738
+ "learning_rate": 0.00019266069364176142,
11739
+ "loss": 2.5445,
11740
+ "num_input_tokens_seen": 17288392256,
11741
+ "step": 65950
11742
+ },
11743
+ {
11744
+ "epoch": 0.3148216606842601,
11745
+ "grad_norm": 0.1605050265789032,
11746
+ "learning_rate": 0.00018825509907063325,
11747
+ "loss": 2.5491,
11748
+ "num_input_tokens_seen": 17301499456,
11749
+ "step": 66000
11750
+ },
11751
+ {
11752
+ "epoch": 0.3148216606842601,
11753
+ "eval_loss": 2.4224469661712646,
11754
+ "eval_runtime": 53.2989,
11755
+ "eval_samples_per_second": 93.811,
11756
+ "eval_steps_per_second": 23.453,
11757
+ "num_input_tokens_seen": 17301499456,
11758
+ "step": 66000
11759
  }
11760
  ],
11761
  "logging_steps": 50,
11762
  "max_steps": 70000,
11763
+ "num_input_tokens_seen": 17301499456,
11764
  "num_train_epochs": 1,
11765
  "save_steps": 1000,
11766
  "stateful_callbacks": {
 
11775
  "attributes": {}
11776
  }
11777
  },
11778
+ "total_flos": 4.628319967114691e+18,
11779
  "train_batch_size": 64,
11780
  "trial_name": null,
11781
  "trial_params": null