Azrail commited on
Commit
916dee2
·
verified ·
1 Parent(s): bad5581

Training in progress, step 67000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91b3f88ddbda82d579d7e857e17e157a938e94cf97682c36dea7a9e8ddcf3d14
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4df562358f0b3d93fdb48e67f5210b057adeffd8b788222cd6d30c1e17d16a45
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1773eecaec3a2d8883e5d344c33d10650e6ebcee793cb11cc46ab81989c4cf9e
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:095f32100e867e0fe913cd1c8e425177cd1f66e07c341665a191649c37a86bd3
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5506f8ab70fc0520e3fcff77fee663d3576573119296fd847d8ec1a26a45a3cf
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2f0aa502d64898ee3e50486c039d0e2439e7552237090a80d559862b18540a7
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a07bef738a41ab3ac6ef10bbe9890f379f768870bcb200cb24b86bcef1753cd
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00a7e117096eaa1f05b475c020696dc81b37bf94c840c6a7b407a88337130d26
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3148216606842601,
6
  "eval_steps": 500,
7
- "global_step": 66000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11756,11 +11756,189 @@
11756
  "eval_steps_per_second": 23.453,
11757
  "num_input_tokens_seen": 17301499456,
11758
  "step": 66000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11759
  }
11760
  ],
11761
  "logging_steps": 50,
11762
  "max_steps": 70000,
11763
- "num_input_tokens_seen": 17301499456,
11764
  "num_train_epochs": 1,
11765
  "save_steps": 1000,
11766
  "stateful_callbacks": {
@@ -11775,7 +11953,7 @@
11775
  "attributes": {}
11776
  }
11777
  },
11778
- "total_flos": 4.628319967114691e+18,
11779
  "train_batch_size": 64,
11780
  "trial_name": null,
11781
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.31959168584614284,
6
  "eval_steps": 500,
7
+ "global_step": 67000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11756
  "eval_steps_per_second": 23.453,
11757
  "num_input_tokens_seen": 17301499456,
11758
  "step": 66000
11759
+ },
11760
+ {
11761
+ "epoch": 0.31506016194235426,
11762
+ "grad_norm": 0.16586218774318695,
11763
+ "learning_rate": 0.00018388874897104518,
11764
+ "loss": 2.5468,
11765
+ "num_input_tokens_seen": 17314606656,
11766
+ "step": 66050
11767
+ },
11768
+ {
11769
+ "epoch": 0.31529866320044836,
11770
+ "grad_norm": 0.1646813303232193,
11771
+ "learning_rate": 0.00017956219300748795,
11772
+ "loss": 2.5352,
11773
+ "num_input_tokens_seen": 17327713856,
11774
+ "step": 66100
11775
+ },
11776
+ {
11777
+ "epoch": 0.3155371644585425,
11778
+ "grad_norm": 0.18712937831878662,
11779
+ "learning_rate": 0.00017527597583490823,
11780
+ "loss": 2.5412,
11781
+ "num_input_tokens_seen": 17340821056,
11782
+ "step": 66150
11783
+ },
11784
+ {
11785
+ "epoch": 0.3157756657166367,
11786
+ "grad_norm": 0.1631355583667755,
11787
+ "learning_rate": 0.00017103063703014372,
11788
+ "loss": 2.5272,
11789
+ "num_input_tokens_seen": 17353928256,
11790
+ "step": 66200
11791
+ },
11792
+ {
11793
+ "epoch": 0.3160141669747308,
11794
+ "grad_norm": 0.15910203754901886,
11795
+ "learning_rate": 0.00016682671102399805,
11796
+ "loss": 2.5333,
11797
+ "num_input_tokens_seen": 17367035456,
11798
+ "step": 66250
11799
+ },
11800
+ {
11801
+ "epoch": 0.31625266823282494,
11802
+ "grad_norm": 0.5742849707603455,
11803
+ "learning_rate": 0.00016266472703396284,
11804
+ "loss": 2.5463,
11805
+ "num_input_tokens_seen": 17380142656,
11806
+ "step": 66300
11807
+ },
11808
+ {
11809
+ "epoch": 0.31649116949091904,
11810
+ "grad_norm": 0.17517830431461334,
11811
+ "learning_rate": 0.00015854520899759655,
11812
+ "loss": 2.5511,
11813
+ "num_input_tokens_seen": 17393249856,
11814
+ "step": 66350
11815
+ },
11816
+ {
11817
+ "epoch": 0.3167296707490132,
11818
+ "grad_norm": 0.6962131857872009,
11819
+ "learning_rate": 0.00015446867550656767,
11820
+ "loss": 2.5452,
11821
+ "num_input_tokens_seen": 17406357056,
11822
+ "step": 66400
11823
+ },
11824
+ {
11825
+ "epoch": 0.31696817200710736,
11826
+ "grad_norm": 0.16677837073802948,
11827
+ "learning_rate": 0.00015043563974137132,
11828
+ "loss": 2.5392,
11829
+ "num_input_tokens_seen": 17419464256,
11830
+ "step": 66450
11831
+ },
11832
+ {
11833
+ "epoch": 0.31720667326520147,
11834
+ "grad_norm": 0.16235870122909546,
11835
+ "learning_rate": 0.00014644660940672628,
11836
+ "loss": 2.5125,
11837
+ "num_input_tokens_seen": 17432571456,
11838
+ "step": 66500
11839
+ },
11840
+ {
11841
+ "epoch": 0.31720667326520147,
11842
+ "eval_loss": 2.419802188873291,
11843
+ "eval_runtime": 52.8641,
11844
+ "eval_samples_per_second": 94.582,
11845
+ "eval_steps_per_second": 23.646,
11846
+ "num_input_tokens_seen": 17432571456,
11847
+ "step": 66500
11848
+ },
11849
+ {
11850
+ "epoch": 0.3174451745232956,
11851
+ "grad_norm": 0.17308832705020905,
11852
+ "learning_rate": 0.00014250208666766236,
11853
+ "loss": 2.5349,
11854
+ "num_input_tokens_seen": 17445678656,
11855
+ "step": 66550
11856
+ },
11857
+ {
11858
+ "epoch": 0.31768367578138973,
11859
+ "grad_norm": 0.16299477219581604,
11860
+ "learning_rate": 0.00013860256808630427,
11861
+ "loss": 2.5277,
11862
+ "num_input_tokens_seen": 17458785856,
11863
+ "step": 66600
11864
+ },
11865
+ {
11866
+ "epoch": 0.3179221770394839,
11867
+ "grad_norm": 0.18277022242546082,
11868
+ "learning_rate": 0.00013474854455936125,
11869
+ "loss": 2.5203,
11870
+ "num_input_tokens_seen": 17471893056,
11871
+ "step": 66650
11872
+ },
11873
+ {
11874
+ "epoch": 0.318160678297578,
11875
+ "grad_norm": 0.16096614301204681,
11876
+ "learning_rate": 0.00013094050125632973,
11877
+ "loss": 2.535,
11878
+ "num_input_tokens_seen": 17485000256,
11879
+ "step": 66700
11880
+ },
11881
+ {
11882
+ "epoch": 0.31839917955567215,
11883
+ "grad_norm": 0.1723272204399109,
11884
+ "learning_rate": 0.0001271789175584172,
11885
+ "loss": 2.549,
11886
+ "num_input_tokens_seen": 17498107456,
11887
+ "step": 66750
11888
+ },
11889
+ {
11890
+ "epoch": 0.3186376808137663,
11891
+ "grad_norm": 0.15782694518566132,
11892
+ "learning_rate": 0.00012346426699819457,
11893
+ "loss": 2.5317,
11894
+ "num_input_tokens_seen": 17511214656,
11895
+ "step": 66800
11896
+ },
11897
+ {
11898
+ "epoch": 0.3188761820718604,
11899
+ "grad_norm": 0.1627569943666458,
11900
+ "learning_rate": 0.00011979701719998454,
11901
+ "loss": 2.5382,
11902
+ "num_input_tokens_seen": 17524321856,
11903
+ "step": 66850
11904
+ },
11905
+ {
11906
+ "epoch": 0.3191146833299546,
11907
+ "grad_norm": 0.16340333223342896,
11908
+ "learning_rate": 0.00011617762982099444,
11909
+ "loss": 2.5477,
11910
+ "num_input_tokens_seen": 17537429056,
11911
+ "step": 66900
11912
+ },
11913
+ {
11914
+ "epoch": 0.3193531845880487,
11915
+ "grad_norm": 0.15788671374320984,
11916
+ "learning_rate": 0.00011260656049319957,
11917
+ "loss": 2.537,
11918
+ "num_input_tokens_seen": 17550536256,
11919
+ "step": 66950
11920
+ },
11921
+ {
11922
+ "epoch": 0.31959168584614284,
11923
+ "grad_norm": 0.16191193461418152,
11924
+ "learning_rate": 0.0001090842587659851,
11925
+ "loss": 2.5394,
11926
+ "num_input_tokens_seen": 17563643456,
11927
+ "step": 67000
11928
+ },
11929
+ {
11930
+ "epoch": 0.31959168584614284,
11931
+ "eval_loss": 2.417813301086426,
11932
+ "eval_runtime": 53.532,
11933
+ "eval_samples_per_second": 93.402,
11934
+ "eval_steps_per_second": 23.351,
11935
+ "num_input_tokens_seen": 17563643456,
11936
+ "step": 67000
11937
  }
11938
  ],
11939
  "logging_steps": 50,
11940
  "max_steps": 70000,
11941
+ "num_input_tokens_seen": 17563643456,
11942
  "num_train_epochs": 1,
11943
  "save_steps": 1000,
11944
  "stateful_callbacks": {
 
11953
  "attributes": {}
11954
  }
11955
  },
11956
+ "total_flos": 4.698446045640131e+18,
11957
  "train_batch_size": 64,
11958
  "trial_name": null,
11959
  "trial_params": null