Azrail commited on
Commit
7300058
·
verified ·
1 Parent(s): 86291a0

Training in progress, step 135000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d56ac5cac24a22412473f2135127ddabb38b319ea83b674e986a42239b250e9
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:079464073c9724ceb804666b522429a90a4928e290e5da217f3ad8b9d68b8886
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4d6a881e9f26105deee08c944e754ddbf4c77f455ab89089e93e0141d4bbc5a
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fd285cfac8e5c0f6d1266cf8e23ce20a797130dac2828587dcc5345232fa441
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24b5f8b02f183c01b91dfb927bcee2fd08e29422009a0f8c863f42c2374d464d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:874cf93e738f75197422ec1e62b162ef1e398b581422e23932b758446980a6af
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f96155b98d632c68f19e59b549aa9343e95b0d1b8978f18da42e6a70e5498d0e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e1e7a01b81e1907abf43be3318a5c567fc57f95dbaef634f44d30b341186326
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2783691283971523,
6
  "eval_steps": 500,
7
- "global_step": 134000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -23860,11 +23860,189 @@
23860
  "eval_steps_per_second": 15.134,
23861
  "num_input_tokens_seen": 70243253472,
23862
  "step": 134000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23863
  }
23864
  ],
23865
  "logging_steps": 50,
23866
  "max_steps": 140000,
23867
- "num_input_tokens_seen": 70243253472,
23868
  "num_train_epochs": 2,
23869
  "save_steps": 1000,
23870
  "stateful_callbacks": {
@@ -23879,7 +24057,7 @@
23879
  "attributes": {}
23880
  }
23881
  },
23882
- "total_flos": 1.243177462760067e+20,
23883
  "train_batch_size": 32,
23884
  "trial_name": null,
23885
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2879091787209178,
6
  "eval_steps": 500,
7
+ "global_step": 135000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
23860
  "eval_steps_per_second": 15.134,
23861
  "num_input_tokens_seen": 70243253472,
23862
  "step": 134000
23863
+ },
23864
+ {
23865
+ "epoch": 1.2788461309133405,
23866
+ "grad_norm": 0.12490282952785492,
23867
+ "learning_rate": 0.00010734153455962764,
23868
+ "loss": 2.0308,
23869
+ "num_input_tokens_seen": 70269466208,
23870
+ "step": 134050
23871
+ },
23872
+ {
23873
+ "epoch": 1.279323133429529,
23874
+ "grad_norm": 0.12396061420440674,
23875
+ "learning_rate": 0.00010561116804955451,
23876
+ "loss": 2.036,
23877
+ "num_input_tokens_seen": 70295676096,
23878
+ "step": 134100
23879
+ },
23880
+ {
23881
+ "epoch": 1.2798001359457172,
23882
+ "grad_norm": 0.12122515588998795,
23883
+ "learning_rate": 0.00010389321369363636,
23884
+ "loss": 2.0424,
23885
+ "num_input_tokens_seen": 70321882272,
23886
+ "step": 134150
23887
+ },
23888
+ {
23889
+ "epoch": 1.2802771384619054,
23890
+ "grad_norm": 0.12559206783771515,
23891
+ "learning_rate": 0.00010218772555910954,
23892
+ "loss": 2.0456,
23893
+ "num_input_tokens_seen": 70348095808,
23894
+ "step": 134200
23895
+ },
23896
+ {
23897
+ "epoch": 1.2807541409780936,
23898
+ "grad_norm": 0.11915505677461624,
23899
+ "learning_rate": 0.0001004947573208756,
23900
+ "loss": 2.0412,
23901
+ "num_input_tokens_seen": 70374304800,
23902
+ "step": 134250
23903
+ },
23904
+ {
23905
+ "epoch": 1.2812311434942818,
23906
+ "grad_norm": 0.12196268141269684,
23907
+ "learning_rate": 9.881436225981105e-05,
23908
+ "loss": 2.0386,
23909
+ "num_input_tokens_seen": 70400510976,
23910
+ "step": 134300
23911
+ },
23912
+ {
23913
+ "epoch": 1.2817081460104702,
23914
+ "grad_norm": 0.12415535002946854,
23915
+ "learning_rate": 9.714659326109137e-05,
23916
+ "loss": 2.0448,
23917
+ "num_input_tokens_seen": 70426725376,
23918
+ "step": 134350
23919
+ },
23920
+ {
23921
+ "epoch": 1.2821851485266584,
23922
+ "grad_norm": 0.12361661344766617,
23923
+ "learning_rate": 9.549150281252633e-05,
23924
+ "loss": 2.0371,
23925
+ "num_input_tokens_seen": 70452929792,
23926
+ "step": 134400
23927
+ },
23928
+ {
23929
+ "epoch": 1.2826621510428469,
23930
+ "grad_norm": 0.12377167493104935,
23931
+ "learning_rate": 9.384914300290748e-05,
23932
+ "loss": 2.0344,
23933
+ "num_input_tokens_seen": 70479144192,
23934
+ "step": 134450
23935
+ },
23936
+ {
23937
+ "epoch": 1.283139153559035,
23938
+ "grad_norm": 0.11863281577825546,
23939
+ "learning_rate": 9.221956552036992e-05,
23940
+ "loss": 2.0393,
23941
+ "num_input_tokens_seen": 70505353504,
23942
+ "step": 134500
23943
+ },
23944
+ {
23945
+ "epoch": 1.283139153559035,
23946
+ "eval_loss": 1.9545812606811523,
23947
+ "eval_runtime": 82.3767,
23948
+ "eval_samples_per_second": 60.697,
23949
+ "eval_steps_per_second": 15.174,
23950
+ "num_input_tokens_seen": 70505353504,
23951
+ "step": 134500
23952
+ },
23953
+ {
23954
+ "epoch": 1.2836161560752233,
23955
+ "grad_norm": 0.12550202012062073,
23956
+ "learning_rate": 9.060282165076461e-05,
23957
+ "loss": 2.0483,
23958
+ "num_input_tokens_seen": 70531564640,
23959
+ "step": 134550
23960
+ },
23961
+ {
23962
+ "epoch": 1.2840931585914115,
23963
+ "grad_norm": 0.12165137380361557,
23964
+ "learning_rate": 8.899896227604509e-05,
23965
+ "loss": 2.034,
23966
+ "num_input_tokens_seen": 70557777824,
23967
+ "step": 134600
23968
+ },
23969
+ {
23970
+ "epoch": 1.2845701611076,
23971
+ "grad_norm": 0.12417840212583542,
23972
+ "learning_rate": 8.740803787266521e-05,
23973
+ "loss": 2.0381,
23974
+ "num_input_tokens_seen": 70583987456,
23975
+ "step": 134650
23976
+ },
23977
+ {
23978
+ "epoch": 1.2850471636237881,
23979
+ "grad_norm": 0.12609820067882538,
23980
+ "learning_rate": 8.58300985099918e-05,
23981
+ "loss": 2.0369,
23982
+ "num_input_tokens_seen": 70610189152,
23983
+ "step": 134700
23984
+ },
23985
+ {
23986
+ "epoch": 1.2855241661399763,
23987
+ "grad_norm": 0.1163376122713089,
23988
+ "learning_rate": 8.426519384872733e-05,
23989
+ "loss": 2.0236,
23990
+ "num_input_tokens_seen": 70636401088,
23991
+ "step": 134750
23992
+ },
23993
+ {
23994
+ "epoch": 1.2860011686561648,
23995
+ "grad_norm": 0.11958843469619751,
23996
+ "learning_rate": 8.271337313934868e-05,
23997
+ "loss": 2.0465,
23998
+ "num_input_tokens_seen": 70662608672,
23999
+ "step": 134800
24000
+ },
24001
+ {
24002
+ "epoch": 1.286478171172353,
24003
+ "grad_norm": 0.12234240025281906,
24004
+ "learning_rate": 8.117468522055577e-05,
24005
+ "loss": 2.0384,
24006
+ "num_input_tokens_seen": 70688820640,
24007
+ "step": 134850
24008
+ },
24009
+ {
24010
+ "epoch": 1.2869551736885412,
24011
+ "grad_norm": 0.11501733213663101,
24012
+ "learning_rate": 7.964917851773496e-05,
24013
+ "loss": 2.0343,
24014
+ "num_input_tokens_seen": 70715035040,
24015
+ "step": 134900
24016
+ },
24017
+ {
24018
+ "epoch": 1.2874321762047294,
24019
+ "grad_norm": 0.12062328308820724,
24020
+ "learning_rate": 7.813690104143555e-05,
24021
+ "loss": 2.0211,
24022
+ "num_input_tokens_seen": 70741249088,
24023
+ "step": 134950
24024
+ },
24025
+ {
24026
+ "epoch": 1.2879091787209178,
24027
+ "grad_norm": 0.11405592411756516,
24028
+ "learning_rate": 7.663790038585794e-05,
24029
+ "loss": 2.0401,
24030
+ "num_input_tokens_seen": 70767457344,
24031
+ "step": 135000
24032
+ },
24033
+ {
24034
+ "epoch": 1.2879091787209178,
24035
+ "eval_loss": 1.9541493654251099,
24036
+ "eval_runtime": 82.5619,
24037
+ "eval_samples_per_second": 60.561,
24038
+ "eval_steps_per_second": 15.14,
24039
+ "num_input_tokens_seen": 70767457344,
24040
+ "step": 135000
24041
  }
24042
  ],
24043
  "logging_steps": 50,
24044
  "max_steps": 140000,
24045
+ "num_input_tokens_seen": 70767457344,
24046
  "num_train_epochs": 2,
24047
  "save_steps": 1000,
24048
  "stateful_callbacks": {
 
24057
  "attributes": {}
24058
  }
24059
  },
24060
+ "total_flos": 1.2524549151466045e+20,
24061
  "train_batch_size": 32,
24062
  "trial_name": null,
24063
  "trial_params": null