Azrail commited on
Commit
84d5e71
·
verified ·
1 Parent(s): 0a2f99f

Training in progress, step 66000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97f833e77e28bcce2d00fc8f583d642be803be2e4268c16065f001da61ccfb12
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6627f46453f0eddcb5503378a89a14a6529d63c8f3e731e04b523860ef73959
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6fb466dd570b07209b2b66d3759663a3b462b568c13bb8f7963bf1191bda0a0
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55adb983e10ce2c91d34635b0e2c61b12341302e3599339214fbe162d24db56d
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5abe0ab18889dbab668e6d9fae1d62109a3226e616d0e681a91c9a668ea4330
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5506f8ab70fc0520e3fcff77fee663d3576573119296fd847d8ec1a26a45a3cf
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83439c671f875b1f809ad8f03d85b4a006312176c0266e869dc1f2efa804bb73
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d828325c04baaeca4bef8dd14dbbff2a89fb26da8a22793521965c92d2ced694
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4372224268198963,
6
  "eval_steps": 500,
7
- "global_step": 65000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11578,11 +11578,189 @@
11578
  "eval_steps_per_second": 23.574,
11579
  "num_input_tokens_seen": 17039360000,
11580
  "step": 65000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11581
  }
11582
  ],
11583
  "logging_steps": 50,
11584
  "max_steps": 70000,
11585
- "num_input_tokens_seen": 17039360000,
11586
  "num_train_epochs": 1,
11587
  "save_steps": 1000,
11588
  "stateful_callbacks": {
@@ -11597,7 +11775,7 @@
11597
  "attributes": {}
11598
  }
11599
  },
11600
- "total_flos": 4.5581951041536e+18,
11601
  "train_batch_size": 64,
11602
  "trial_name": null,
11603
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.44394892569404854,
6
  "eval_steps": 500,
7
+ "global_step": 66000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11578
  "eval_steps_per_second": 23.574,
11579
  "num_input_tokens_seen": 17039360000,
11580
  "step": 65000
11581
+ },
11582
+ {
11583
+ "epoch": 0.4375587517636039,
11584
+ "grad_norm": 0.16188842058181763,
11585
+ "learning_rate": 0.00021814686889249158,
11586
+ "loss": 2.9812,
11587
+ "num_input_tokens_seen": 17052467200,
11588
+ "step": 65050
11589
+ },
11590
+ {
11591
+ "epoch": 0.43789507670731154,
11592
+ "grad_norm": 0.14550812542438507,
11593
+ "learning_rate": 0.00021410601988619394,
11594
+ "loss": 2.9856,
11595
+ "num_input_tokens_seen": 17065574400,
11596
+ "step": 65100
11597
+ },
11598
+ {
11599
+ "epoch": 0.43823140165101915,
11600
+ "grad_norm": 0.1500539779663086,
11601
+ "learning_rate": 0.00021009272593674322,
11602
+ "loss": 2.9827,
11603
+ "num_input_tokens_seen": 17078681600,
11604
+ "step": 65150
11605
+ },
11606
+ {
11607
+ "epoch": 0.43856772659472676,
11608
+ "grad_norm": 0.1571357101202011,
11609
+ "learning_rate": 0.00020610737385376348,
11610
+ "loss": 2.9788,
11611
+ "num_input_tokens_seen": 17091788800,
11612
+ "step": 65200
11613
+ },
11614
+ {
11615
+ "epoch": 0.43890405153843437,
11616
+ "grad_norm": 0.1671544760465622,
11617
+ "learning_rate": 0.00020215034775378332,
11618
+ "loss": 2.9758,
11619
+ "num_input_tokens_seen": 17104896000,
11620
+ "step": 65250
11621
+ },
11622
+ {
11623
+ "epoch": 0.439240376482142,
11624
+ "grad_norm": 0.15525776147842407,
11625
+ "learning_rate": 0.0001982220290232143,
11626
+ "loss": 2.9823,
11627
+ "num_input_tokens_seen": 17118003200,
11628
+ "step": 65300
11629
+ },
11630
+ {
11631
+ "epoch": 0.4395767014258496,
11632
+ "grad_norm": 0.14799903333187103,
11633
+ "learning_rate": 0.00019432279628159188,
11634
+ "loss": 2.9781,
11635
+ "num_input_tokens_seen": 17131110400,
11636
+ "step": 65350
11637
+ },
11638
+ {
11639
+ "epoch": 0.4399130263695572,
11640
+ "grad_norm": 0.16087676584720612,
11641
+ "learning_rate": 0.00019045302534508295,
11642
+ "loss": 2.9805,
11643
+ "num_input_tokens_seen": 17144217600,
11644
+ "step": 65400
11645
+ },
11646
+ {
11647
+ "epoch": 0.4402493513132648,
11648
+ "grad_norm": 0.15892113745212555,
11649
+ "learning_rate": 0.0001866130891902653,
11650
+ "loss": 2.9823,
11651
+ "num_input_tokens_seen": 17157324800,
11652
+ "step": 65450
11653
+ },
11654
+ {
11655
+ "epoch": 0.4405856762569724,
11656
+ "grad_norm": 0.187602236866951,
11657
+ "learning_rate": 0.00018280335791817732,
11658
+ "loss": 2.9804,
11659
+ "num_input_tokens_seen": 17170432000,
11660
+ "step": 65500
11661
+ },
11662
+ {
11663
+ "epoch": 0.4405856762569724,
11664
+ "eval_loss": 2.875824451446533,
11665
+ "eval_runtime": 53.0867,
11666
+ "eval_samples_per_second": 94.186,
11667
+ "eval_steps_per_second": 23.546,
11668
+ "num_input_tokens_seen": 17170432000,
11669
+ "step": 65500
11670
+ },
11671
+ {
11672
+ "epoch": 0.44092200120068004,
11673
+ "grad_norm": 0.15579210221767426,
11674
+ "learning_rate": 0.0001790241987186485,
11675
+ "loss": 2.9734,
11676
+ "num_input_tokens_seen": 17183539200,
11677
+ "step": 65550
11678
+ },
11679
+ {
11680
+ "epoch": 0.44125832614438765,
11681
+ "grad_norm": 0.15250550210475922,
11682
+ "learning_rate": 0.00017527597583490823,
11683
+ "loss": 2.9787,
11684
+ "num_input_tokens_seen": 17196646400,
11685
+ "step": 65600
11686
+ },
11687
+ {
11688
+ "epoch": 0.44159465108809526,
11689
+ "grad_norm": 0.15954890847206116,
11690
+ "learning_rate": 0.00017155905052847938,
11691
+ "loss": 2.978,
11692
+ "num_input_tokens_seen": 17209753600,
11693
+ "step": 65650
11694
+ },
11695
+ {
11696
+ "epoch": 0.44193097603180287,
11697
+ "grad_norm": 0.15598754584789276,
11698
+ "learning_rate": 0.00016787378104435928,
11699
+ "loss": 2.9809,
11700
+ "num_input_tokens_seen": 17222860800,
11701
+ "step": 65700
11702
+ },
11703
+ {
11704
+ "epoch": 0.4422673009755105,
11705
+ "grad_norm": 0.14709477126598358,
11706
+ "learning_rate": 0.00016422052257649078,
11707
+ "loss": 2.9793,
11708
+ "num_input_tokens_seen": 17235968000,
11709
+ "step": 65750
11710
+ },
11711
+ {
11712
+ "epoch": 0.4426036259192181,
11713
+ "grad_norm": 0.15505217015743256,
11714
+ "learning_rate": 0.0001605996272335291,
11715
+ "loss": 2.9763,
11716
+ "num_input_tokens_seen": 17249075200,
11717
+ "step": 65800
11718
+ },
11719
+ {
11720
+ "epoch": 0.4429399508629257,
11721
+ "grad_norm": 0.14491549134254456,
11722
+ "learning_rate": 0.0001570114440049037,
11723
+ "loss": 2.9756,
11724
+ "num_input_tokens_seen": 17262182400,
11725
+ "step": 65850
11726
+ },
11727
+ {
11728
+ "epoch": 0.4432762758066333,
11729
+ "grad_norm": 0.1571652740240097,
11730
+ "learning_rate": 0.00015345631872718213,
11731
+ "loss": 2.977,
11732
+ "num_input_tokens_seen": 17275289600,
11733
+ "step": 65900
11734
+ },
11735
+ {
11736
+ "epoch": 0.4436126007503409,
11737
+ "grad_norm": 0.18299035727977753,
11738
+ "learning_rate": 0.00014993459405073824,
11739
+ "loss": 2.9788,
11740
+ "num_input_tokens_seen": 17288396800,
11741
+ "step": 65950
11742
+ },
11743
+ {
11744
+ "epoch": 0.44394892569404854,
11745
+ "grad_norm": 0.14829285442829132,
11746
+ "learning_rate": 0.00014644660940672628,
11747
+ "loss": 2.9851,
11748
+ "num_input_tokens_seen": 17301504000,
11749
+ "step": 66000
11750
+ },
11751
+ {
11752
+ "epoch": 0.44394892569404854,
11753
+ "eval_loss": 2.8729286193847656,
11754
+ "eval_runtime": 53.2839,
11755
+ "eval_samples_per_second": 93.837,
11756
+ "eval_steps_per_second": 23.459,
11757
+ "num_input_tokens_seen": 17301504000,
11758
+ "step": 66000
11759
  }
11760
  ],
11761
  "logging_steps": 50,
11762
  "max_steps": 70000,
11763
+ "num_input_tokens_seen": 17301504000,
11764
  "num_train_epochs": 1,
11765
  "save_steps": 1000,
11766
  "stateful_callbacks": {
 
11775
  "attributes": {}
11776
  }
11777
  },
11778
+ "total_flos": 4.62832118267904e+18,
11779
  "train_batch_size": 64,
11780
  "trial_name": null,
11781
  "trial_params": null