Azrail commited on
Commit
e321181
·
verified ·
1 Parent(s): ec1e826

Training in progress, step 62000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4dc1817a301fc24319ca1c05c92090e28d0ab00a3a5d43949da4772ff52fcf2b
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c05cde8285dd52085342b46430f4e5412103d775ef2ecb3ff92fe973f05563a
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c0576570955fbd0c77602fddc48b3da384f1445f3f7054045594138138a2617
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13bd3612e3785a0d69245374e9d503a45ff63d121c602ad8a1a69ce58b21ee6f
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8ee7735caca4437694ef1fa1c7821cadab81eb5dba9c8318224d8baee7f9384
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b604bf86b8b70beb6e4043604c61f8577f1fbe75a9d1e20249b5622ec5aa2654
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88c7ed774bb0bea4c8451805c5254d2a8728348d14f02b8481173830b417e9b0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b86dd42ce2bfa419ab9d950fa2e032bc9074c23516cf132dad718a38dfd9a2d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.29097153487484645,
6
  "eval_steps": 500,
7
- "global_step": 61000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -10866,11 +10866,189 @@
10866
  "eval_steps_per_second": 23.343,
10867
  "num_input_tokens_seen": 15990779456,
10868
  "step": 61000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10869
  }
10870
  ],
10871
  "logging_steps": 50,
10872
  "max_steps": 70000,
10873
- "num_input_tokens_seen": 15990779456,
10874
  "num_train_epochs": 1,
10875
  "save_steps": 1000,
10876
  "stateful_callbacks": {
@@ -10885,7 +11063,7 @@
10885
  "attributes": {}
10886
  }
10887
  },
10888
- "total_flos": 4.2776895744874906e+18,
10889
  "train_batch_size": 64,
10890
  "trial_name": null,
10891
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2957415600367292,
6
  "eval_steps": 500,
7
+ "global_step": 62000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
10866
  "eval_steps_per_second": 23.343,
10867
  "num_input_tokens_seen": 15990779456,
10868
  "step": 61000
10869
+ },
10870
+ {
10871
+ "epoch": 0.2912100361329406,
10872
+ "grad_norm": 0.19767510890960693,
10873
+ "learning_rate": 0.0007118738970516943,
10874
+ "loss": 2.5963,
10875
+ "num_input_tokens_seen": 16003886656,
10876
+ "step": 61050
10877
+ },
10878
+ {
10879
+ "epoch": 0.2914485373910347,
10880
+ "grad_norm": 0.21463529765605927,
10881
+ "learning_rate": 0.0007067792524832604,
10882
+ "loss": 2.5825,
10883
+ "num_input_tokens_seen": 16016993856,
10884
+ "step": 61100
10885
+ },
10886
+ {
10887
+ "epoch": 0.29168703864912887,
10888
+ "grad_norm": 0.2011532485485077,
10889
+ "learning_rate": 0.0007016585772004026,
10890
+ "loss": 2.5783,
10891
+ "num_input_tokens_seen": 16030101056,
10892
+ "step": 61150
10893
+ },
10894
+ {
10895
+ "epoch": 0.29192553990722303,
10896
+ "grad_norm": 0.19351401925086975,
10897
+ "learning_rate": 0.0006965125158269618,
10898
+ "loss": 2.5619,
10899
+ "num_input_tokens_seen": 16043208256,
10900
+ "step": 61200
10901
+ },
10902
+ {
10903
+ "epoch": 0.29216404116531713,
10904
+ "grad_norm": 0.1988568902015686,
10905
+ "learning_rate": 0.000691341716182545,
10906
+ "loss": 2.6007,
10907
+ "num_input_tokens_seen": 16056315456,
10908
+ "step": 61250
10909
+ },
10910
+ {
10911
+ "epoch": 0.2924025424234113,
10912
+ "grad_norm": 0.20459413528442383,
10913
+ "learning_rate": 0.0006861468292009726,
10914
+ "loss": 2.5762,
10915
+ "num_input_tokens_seen": 16069422656,
10916
+ "step": 61300
10917
+ },
10918
+ {
10919
+ "epoch": 0.2926410436815054,
10920
+ "grad_norm": 0.1914205551147461,
10921
+ "learning_rate": 0.0006809285088483361,
10922
+ "loss": 2.5734,
10923
+ "num_input_tokens_seen": 16082529856,
10924
+ "step": 61350
10925
+ },
10926
+ {
10927
+ "epoch": 0.29287954493959956,
10928
+ "grad_norm": 0.194325253367424,
10929
+ "learning_rate": 0.0006756874120406714,
10930
+ "loss": 2.5874,
10931
+ "num_input_tokens_seen": 16095637056,
10932
+ "step": 61400
10933
+ },
10934
+ {
10935
+ "epoch": 0.2931180461976937,
10936
+ "grad_norm": 0.20854853093624115,
10937
+ "learning_rate": 0.0006704241985612625,
10938
+ "loss": 2.5865,
10939
+ "num_input_tokens_seen": 16108744256,
10940
+ "step": 61450
10941
+ },
10942
+ {
10943
+ "epoch": 0.2933565474557878,
10944
+ "grad_norm": 0.190395787358284,
10945
+ "learning_rate": 0.0006651395309775837,
10946
+ "loss": 2.5716,
10947
+ "num_input_tokens_seen": 16121851456,
10948
+ "step": 61500
10949
+ },
10950
+ {
10951
+ "epoch": 0.2933565474557878,
10952
+ "eval_loss": 2.4551966190338135,
10953
+ "eval_runtime": 53.3343,
10954
+ "eval_samples_per_second": 93.748,
10955
+ "eval_steps_per_second": 23.437,
10956
+ "num_input_tokens_seen": 16121851456,
10957
+ "step": 61500
10958
+ },
10959
+ {
10960
+ "epoch": 0.293595048713882,
10961
+ "grad_norm": 0.20652073621749878,
10962
+ "learning_rate": 0.0006598340745578908,
10963
+ "loss": 2.5765,
10964
+ "num_input_tokens_seen": 16134958656,
10965
+ "step": 61550
10966
+ },
10967
+ {
10968
+ "epoch": 0.2938335499719761,
10969
+ "grad_norm": 0.20701836049556732,
10970
+ "learning_rate": 0.0006545084971874737,
10971
+ "loss": 2.5653,
10972
+ "num_input_tokens_seen": 16148065856,
10973
+ "step": 61600
10974
+ },
10975
+ {
10976
+ "epoch": 0.29407205123007024,
10977
+ "grad_norm": 0.1792392134666443,
10978
+ "learning_rate": 0.000649163469284578,
10979
+ "loss": 2.577,
10980
+ "num_input_tokens_seen": 16161173056,
10981
+ "step": 61650
10982
+ },
10983
+ {
10984
+ "epoch": 0.2943105524881644,
10985
+ "grad_norm": 0.21742790937423706,
10986
+ "learning_rate": 0.0006437996637160086,
10987
+ "loss": 2.574,
10988
+ "num_input_tokens_seen": 16174280256,
10989
+ "step": 61700
10990
+ },
10991
+ {
10992
+ "epoch": 0.2945490537462585,
10993
+ "grad_norm": 0.20747682452201843,
10994
+ "learning_rate": 0.0006384177557124247,
10995
+ "loss": 2.564,
10996
+ "num_input_tokens_seen": 16187387456,
10997
+ "step": 61750
10998
+ },
10999
+ {
11000
+ "epoch": 0.29478755500435266,
11001
+ "grad_norm": 0.19990311563014984,
11002
+ "learning_rate": 0.0006330184227833376,
11003
+ "loss": 2.5866,
11004
+ "num_input_tokens_seen": 16200494656,
11005
+ "step": 61800
11006
+ },
11007
+ {
11008
+ "epoch": 0.29502605626244677,
11009
+ "grad_norm": 0.20410317182540894,
11010
+ "learning_rate": 0.0006276023446318213,
11011
+ "loss": 2.5559,
11012
+ "num_input_tokens_seen": 16213601856,
11013
+ "step": 61850
11014
+ },
11015
+ {
11016
+ "epoch": 0.2952645575205409,
11017
+ "grad_norm": 0.19365034997463226,
11018
+ "learning_rate": 0.000622170203068947,
11019
+ "loss": 2.5705,
11020
+ "num_input_tokens_seen": 16226709056,
11021
+ "step": 61900
11022
+ },
11023
+ {
11024
+ "epoch": 0.29550305877863503,
11025
+ "grad_norm": 0.2115161269903183,
11026
+ "learning_rate": 0.0006167226819279528,
11027
+ "loss": 2.5621,
11028
+ "num_input_tokens_seen": 16239816256,
11029
+ "step": 61950
11030
+ },
11031
+ {
11032
+ "epoch": 0.2957415600367292,
11033
+ "grad_norm": 0.22992485761642456,
11034
+ "learning_rate": 0.0006112604669781572,
11035
+ "loss": 2.5587,
11036
+ "num_input_tokens_seen": 16252923456,
11037
+ "step": 62000
11038
+ },
11039
+ {
11040
+ "epoch": 0.2957415600367292,
11041
+ "eval_loss": 2.452096462249756,
11042
+ "eval_runtime": 53.6354,
11043
+ "eval_samples_per_second": 93.222,
11044
+ "eval_steps_per_second": 23.306,
11045
+ "num_input_tokens_seen": 16252923456,
11046
+ "step": 62000
11047
  }
11048
  ],
11049
  "logging_steps": 50,
11050
  "max_steps": 70000,
11051
+ "num_input_tokens_seen": 16252923456,
11052
  "num_train_epochs": 1,
11053
  "save_steps": 1000,
11054
  "stateful_callbacks": {
 
11063
  "attributes": {}
11064
  }
11065
  },
11066
+ "total_flos": 4.3478156530129306e+18,
11067
  "train_batch_size": 64,
11068
  "trial_name": null,
11069
  "trial_params": null