Azrail commited on
Commit
5e94c44
·
verified ·
1 Parent(s): 1e72ac2

Training in progress, step 140000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6173b4bc562c2e11366705c8c76e7d31698b3a60389b9a754914d9b8842cf90f
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6e43382fe5ddb78fed06a23ba6c7b8489c50f8ee7949d8db86e49cd8910036e
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3048a59b63da999ae8fc02b473b5d2a50c2be60b98f1004a6c79f0035ac60f1
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c18874d88aac76ea7c7006e997509fca95df88b10d2c13b5a6816de7643ed6e
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ef3d8a81eedcecdd331f8207cd63df8c3721e9e06bbee141ce7de5f7de358d9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82993dca9aea22266a253201514efb5478f36bf5a374573dc48fbab5e03c52d6
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0f0628bbbac738b6a9aa97ca88652280d641a00de879a3f6b83636f7c99513d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf74877c1fcc66d6df58cb7c2b28db5c3be81aec77034ec2a9ace3e30449eb22
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.3260693800159795,
6
  "eval_steps": 500,
7
- "global_step": 139000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -24750,11 +24750,189 @@
24750
  "eval_steps_per_second": 15.14,
24751
  "num_input_tokens_seen": 72864248896,
24752
  "step": 139000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24753
  }
24754
  ],
24755
  "logging_steps": 50,
24756
  "max_steps": 140000,
24757
- "num_input_tokens_seen": 72864248896,
24758
  "num_train_epochs": 2,
24759
  "save_steps": 1000,
24760
  "stateful_callbacks": {
@@ -24764,12 +24942,12 @@
24764
  "should_evaluate": false,
24765
  "should_log": false,
24766
  "should_save": true,
24767
- "should_training_stop": false
24768
  },
24769
  "attributes": {}
24770
  }
24771
  },
24772
- "total_flos": 1.2895643010692137e+20,
24773
  "train_batch_size": 32,
24774
  "trial_name": null,
24775
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.335609430339745,
6
  "eval_steps": 500,
7
+ "global_step": 140000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
24750
  "eval_steps_per_second": 15.14,
24751
  "num_input_tokens_seen": 72864248896,
24752
  "step": 139000
24753
+ },
24754
+ {
24755
+ "epoch": 1.3265463825321677,
24756
+ "grad_norm": 0.11243559420108795,
24757
+ "learning_rate": 2.837655575097964e-06,
24758
+ "loss": 2.0318,
24759
+ "num_input_tokens_seen": 72890458688,
24760
+ "step": 139050
24761
+ },
24762
+ {
24763
+ "epoch": 1.3270233850483562,
24764
+ "grad_norm": 0.11617834120988846,
24765
+ "learning_rate": 2.547062725623828e-06,
24766
+ "loss": 2.0384,
24767
+ "num_input_tokens_seen": 72916673088,
24768
+ "step": 139100
24769
+ },
24770
+ {
24771
+ "epoch": 1.3275003875645444,
24772
+ "grad_norm": 0.11737903952598572,
24773
+ "learning_rate": 2.2721256504567023e-06,
24774
+ "loss": 2.0235,
24775
+ "num_input_tokens_seen": 72942884768,
24776
+ "step": 139150
24777
+ },
24778
+ {
24779
+ "epoch": 1.3279773900807328,
24780
+ "grad_norm": 0.10866422206163406,
24781
+ "learning_rate": 2.012853002380466e-06,
24782
+ "loss": 2.024,
24783
+ "num_input_tokens_seen": 72969088544,
24784
+ "step": 139200
24785
+ },
24786
+ {
24787
+ "epoch": 1.328454392596921,
24788
+ "grad_norm": 0.11547800898551941,
24789
+ "learning_rate": 1.769252941190458e-06,
24790
+ "loss": 2.0323,
24791
+ "num_input_tokens_seen": 72995301472,
24792
+ "step": 139250
24793
+ },
24794
+ {
24795
+ "epoch": 1.3289313951131092,
24796
+ "grad_norm": 0.11617856472730637,
24797
+ "learning_rate": 1.541333133436018e-06,
24798
+ "loss": 2.0294,
24799
+ "num_input_tokens_seen": 73021507392,
24800
+ "step": 139300
24801
+ },
24802
+ {
24803
+ "epoch": 1.3294083976292974,
24804
+ "grad_norm": 0.11435816437005997,
24805
+ "learning_rate": 1.3291007521799014e-06,
24806
+ "loss": 2.0288,
24807
+ "num_input_tokens_seen": 73047719968,
24808
+ "step": 139350
24809
+ },
24810
+ {
24811
+ "epoch": 1.3298854001454858,
24812
+ "grad_norm": 0.11262206733226776,
24813
+ "learning_rate": 1.132562476771959e-06,
24814
+ "loss": 2.0301,
24815
+ "num_input_tokens_seen": 73073924576,
24816
+ "step": 139400
24817
+ },
24818
+ {
24819
+ "epoch": 1.330362402661674,
24820
+ "grad_norm": 0.11383078992366791,
24821
+ "learning_rate": 9.517244926393609e-07,
24822
+ "loss": 2.0187,
24823
+ "num_input_tokens_seen": 73100138976,
24824
+ "step": 139450
24825
+ },
24826
+ {
24827
+ "epoch": 1.3308394051778623,
24828
+ "grad_norm": 0.1159028634428978,
24829
+ "learning_rate": 7.865924910916978e-07,
24830
+ "loss": 2.0366,
24831
+ "num_input_tokens_seen": 73126349984,
24832
+ "step": 139500
24833
+ },
24834
+ {
24835
+ "epoch": 1.3308394051778623,
24836
+ "eval_loss": 1.9510103464126587,
24837
+ "eval_runtime": 82.8489,
24838
+ "eval_samples_per_second": 60.351,
24839
+ "eval_steps_per_second": 15.088,
24840
+ "num_input_tokens_seen": 73126349984,
24841
+ "step": 139500
24842
+ },
24843
+ {
24844
+ "epoch": 1.3313164076940507,
24845
+ "grad_norm": 0.1160767450928688,
24846
+ "learning_rate": 6.371716691419005e-07,
24847
+ "loss": 2.0374,
24848
+ "num_input_tokens_seen": 73152559296,
24849
+ "step": 139550
24850
+ },
24851
+ {
24852
+ "epoch": 1.331793410210239,
24853
+ "grad_norm": 0.11154640465974808,
24854
+ "learning_rate": 5.034667293427053e-07,
24855
+ "loss": 2.0385,
24856
+ "num_input_tokens_seen": 73178773696,
24857
+ "step": 139600
24858
+ },
24859
+ {
24860
+ "epoch": 1.332270412726427,
24861
+ "grad_norm": 0.11127237975597382,
24862
+ "learning_rate": 3.854818796385495e-07,
24863
+ "loss": 2.0281,
24864
+ "num_input_tokens_seen": 73204985664,
24865
+ "step": 139650
24866
+ },
24867
+ {
24868
+ "epoch": 1.3327474152426153,
24869
+ "grad_norm": 0.11270651966333389,
24870
+ "learning_rate": 2.8322083323334415e-07,
24871
+ "loss": 2.022,
24872
+ "num_input_tokens_seen": 73231192992,
24873
+ "step": 139700
24874
+ },
24875
+ {
24876
+ "epoch": 1.3332244177588037,
24877
+ "grad_norm": 0.11388963460922241,
24878
+ "learning_rate": 1.9668680847356734e-07,
24879
+ "loss": 2.0305,
24880
+ "num_input_tokens_seen": 73257397792,
24881
+ "step": 139750
24882
+ },
24883
+ {
24884
+ "epoch": 1.333701420274992,
24885
+ "grad_norm": 0.11808367073535919,
24886
+ "learning_rate": 1.2588252874673466e-07,
24887
+ "loss": 2.0302,
24888
+ "num_input_tokens_seen": 73283607648,
24889
+ "step": 139800
24890
+ },
24891
+ {
24892
+ "epoch": 1.3341784227911802,
24893
+ "grad_norm": 0.11369805783033371,
24894
+ "learning_rate": 7.081022239591173e-08,
24895
+ "loss": 2.0355,
24896
+ "num_input_tokens_seen": 73309822048,
24897
+ "step": 139850
24898
+ },
24899
+ {
24900
+ "epoch": 1.3346554253073686,
24901
+ "grad_norm": 0.11115424335002899,
24902
+ "learning_rate": 3.147162264971471e-08,
24903
+ "loss": 2.027,
24904
+ "num_input_tokens_seen": 73336032384,
24905
+ "step": 139900
24906
+ },
24907
+ {
24908
+ "epoch": 1.3351324278235568,
24909
+ "grad_norm": 0.11730392277240753,
24910
+ "learning_rate": 7.867967567354306e-09,
24911
+ "loss": 2.0268,
24912
+ "num_input_tokens_seen": 73362242112,
24913
+ "step": 139950
24914
+ },
24915
+ {
24916
+ "epoch": 1.335609430339745,
24917
+ "grad_norm": 0.11209023743867874,
24918
+ "learning_rate": 0.0,
24919
+ "loss": 2.0315,
24920
+ "num_input_tokens_seen": 73388446624,
24921
+ "step": 140000
24922
+ },
24923
+ {
24924
+ "epoch": 1.335609430339745,
24925
+ "eval_loss": 1.9509990215301514,
24926
+ "eval_runtime": 82.6099,
24927
+ "eval_samples_per_second": 60.525,
24928
+ "eval_steps_per_second": 15.131,
24929
+ "num_input_tokens_seen": 73388446624,
24930
+ "step": 140000
24931
  }
24932
  ],
24933
  "logging_steps": 50,
24934
  "max_steps": 140000,
24935
+ "num_input_tokens_seen": 73388446624,
24936
  "num_train_epochs": 2,
24937
  "save_steps": 1000,
24938
  "stateful_callbacks": {
 
24942
  "should_evaluate": false,
24943
  "should_log": false,
24944
  "should_save": true,
24945
+ "should_training_stop": true
24946
  },
24947
  "attributes": {}
24948
  }
24949
  },
24950
+ "total_flos": 1.2988416447181578e+20,
24951
  "train_batch_size": 32,
24952
  "trial_name": null,
24953
  "trial_params": null