mohammadmahdinouri commited on
Commit
32ad964
·
verified ·
1 Parent(s): 1c90f7f

Training in progress, step 29000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2057e4bc4ccb7266894aa681fe099f5645555d35372ed2c2f53abaad870b8285
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:746096b767c4c47c0c49b66fcf9e67e43d00132964e1a14503d0dc54e61a88ce
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49e7c91022600e2317a6a9b8ec33d6b3225250425e275f6eed0bdadc714f7fa6
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ff72e1a59e706ba21c0c5fc5faf4ff560d04a9269b480a240031d2014cadf01
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f92647ded7f1a6725e7ffd2310a8d2fbafb5da62cf15755b5f3e6fb2fdf499f
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46ed9fc518619ac92c06b536cae3d8dd21e3799906ab806f17d4dd1aa6e8dd9d
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5badfec76e553ebbd712f8d9135dd4df979bf9196652df1ae9ad27ae709e59c4
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0bcf7c080583def4d92e63cc47df57eaf4cf519a6a214957e6214d525864a6a
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8fdddb3d61ba5e574c0c975793584282bdce7b095bac6bf2d58912967ca7933b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbfa9291779333cc6de79bd13fa6c586039654ce156817d635f2b7564e084805
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3be341579a31269cdfe494164e23b8a4ba61b71f1f432b36a2c0aef7d49c9b92
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2d36fdfb70bf9082281ebe37b706d22a6591594718aa46603291c3e49697116
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18c359f46f82e1c9ecfbab9a4532bc57a1a730dfa02c76c631eb621b98761e8a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cd39fe9272798d41cdbf7f22a06af7a14c62772e1b67733185e58a79e1dfc7e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.05459925608513584,
6
  "eval_steps": 500,
7
- "global_step": 28000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9808,6 +9808,356 @@
9808
  "learning_rate": 0.000491060032601183,
9809
  "loss": 18.3958,
9810
  "step": 28000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9811
  }
9812
  ],
9813
  "logging_steps": 20,
@@ -9827,7 +10177,7 @@
9827
  "attributes": {}
9828
  }
9829
  },
9830
- "total_flos": 2.058460925948802e+19,
9831
  "train_batch_size": 48,
9832
  "trial_name": null,
9833
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.05654922951674783,
6
  "eval_steps": 500,
7
+ "global_step": 29000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9808
  "learning_rate": 0.000491060032601183,
9809
  "loss": 18.3958,
9810
  "step": 28000
9811
+ },
9812
+ {
9813
+ "epoch": 0.05463825555376808,
9814
+ "grad_norm": 8.4375,
9815
+ "learning_rate": 0.0004910535305702737,
9816
+ "loss": 18.3374,
9817
+ "step": 28020
9818
+ },
9819
+ {
9820
+ "epoch": 0.05467725502240032,
9821
+ "grad_norm": 8.4375,
9822
+ "learning_rate": 0.0004910470285393643,
9823
+ "loss": 18.3651,
9824
+ "step": 28040
9825
+ },
9826
+ {
9827
+ "epoch": 0.05471625449103256,
9828
+ "grad_norm": 7.9375,
9829
+ "learning_rate": 0.000491040526508455,
9830
+ "loss": 18.2815,
9831
+ "step": 28060
9832
+ },
9833
+ {
9834
+ "epoch": 0.0547552539596648,
9835
+ "grad_norm": 8.1875,
9836
+ "learning_rate": 0.0004910340244775455,
9837
+ "loss": 18.3387,
9838
+ "step": 28080
9839
+ },
9840
+ {
9841
+ "epoch": 0.05479425342829704,
9842
+ "grad_norm": 8.125,
9843
+ "learning_rate": 0.0004910275224466362,
9844
+ "loss": 18.4166,
9845
+ "step": 28100
9846
+ },
9847
+ {
9848
+ "epoch": 0.05483325289692928,
9849
+ "grad_norm": 8.3125,
9850
+ "learning_rate": 0.0004910210204157268,
9851
+ "loss": 18.3669,
9852
+ "step": 28120
9853
+ },
9854
+ {
9855
+ "epoch": 0.05487225236556152,
9856
+ "grad_norm": 8.1875,
9857
+ "learning_rate": 0.0004910145183848175,
9858
+ "loss": 18.3254,
9859
+ "step": 28140
9860
+ },
9861
+ {
9862
+ "epoch": 0.05491125183419376,
9863
+ "grad_norm": 7.90625,
9864
+ "learning_rate": 0.0004910080163539082,
9865
+ "loss": 18.3784,
9866
+ "step": 28160
9867
+ },
9868
+ {
9869
+ "epoch": 0.054950251302826,
9870
+ "grad_norm": 8.0625,
9871
+ "learning_rate": 0.0004910015143229988,
9872
+ "loss": 18.3403,
9873
+ "step": 28180
9874
+ },
9875
+ {
9876
+ "epoch": 0.05498925077145824,
9877
+ "grad_norm": 9.375,
9878
+ "learning_rate": 0.0004909950122920895,
9879
+ "loss": 18.3102,
9880
+ "step": 28200
9881
+ },
9882
+ {
9883
+ "epoch": 0.055028250240090476,
9884
+ "grad_norm": 9.0625,
9885
+ "learning_rate": 0.00049098851026118,
9886
+ "loss": 18.4089,
9887
+ "step": 28220
9888
+ },
9889
+ {
9890
+ "epoch": 0.05506724970872272,
9891
+ "grad_norm": 8.5625,
9892
+ "learning_rate": 0.0004909820082302707,
9893
+ "loss": 18.3223,
9894
+ "step": 28240
9895
+ },
9896
+ {
9897
+ "epoch": 0.05510624917735496,
9898
+ "grad_norm": 8.5625,
9899
+ "learning_rate": 0.0004909755061993613,
9900
+ "loss": 18.3313,
9901
+ "step": 28260
9902
+ },
9903
+ {
9904
+ "epoch": 0.055145248645987195,
9905
+ "grad_norm": 8.6875,
9906
+ "learning_rate": 0.000490969004168452,
9907
+ "loss": 18.3614,
9908
+ "step": 28280
9909
+ },
9910
+ {
9911
+ "epoch": 0.05518424811461944,
9912
+ "grad_norm": 8.25,
9913
+ "learning_rate": 0.0004909625021375427,
9914
+ "loss": 18.4159,
9915
+ "step": 28300
9916
+ },
9917
+ {
9918
+ "epoch": 0.05522324758325168,
9919
+ "grad_norm": 9.625,
9920
+ "learning_rate": 0.0004909560001066333,
9921
+ "loss": 18.3133,
9922
+ "step": 28320
9923
+ },
9924
+ {
9925
+ "epoch": 0.05526224705188392,
9926
+ "grad_norm": 8.625,
9927
+ "learning_rate": 0.000490949498075724,
9928
+ "loss": 18.2357,
9929
+ "step": 28340
9930
+ },
9931
+ {
9932
+ "epoch": 0.055301246520516156,
9933
+ "grad_norm": 8.625,
9934
+ "learning_rate": 0.0004909429960448146,
9935
+ "loss": 18.3533,
9936
+ "step": 28360
9937
+ },
9938
+ {
9939
+ "epoch": 0.0553402459891484,
9940
+ "grad_norm": 8.0625,
9941
+ "learning_rate": 0.0004909364940139053,
9942
+ "loss": 18.4396,
9943
+ "step": 28380
9944
+ },
9945
+ {
9946
+ "epoch": 0.05537924545778064,
9947
+ "grad_norm": 9.25,
9948
+ "learning_rate": 0.0004909299919829958,
9949
+ "loss": 18.298,
9950
+ "step": 28400
9951
+ },
9952
+ {
9953
+ "epoch": 0.055418244926412875,
9954
+ "grad_norm": 8.125,
9955
+ "learning_rate": 0.0004909234899520865,
9956
+ "loss": 18.271,
9957
+ "step": 28420
9958
+ },
9959
+ {
9960
+ "epoch": 0.05545724439504512,
9961
+ "grad_norm": 7.1875,
9962
+ "learning_rate": 0.0004909169879211771,
9963
+ "loss": 18.3477,
9964
+ "step": 28440
9965
+ },
9966
+ {
9967
+ "epoch": 0.05549624386367736,
9968
+ "grad_norm": 9.125,
9969
+ "learning_rate": 0.0004909104858902678,
9970
+ "loss": 18.346,
9971
+ "step": 28460
9972
+ },
9973
+ {
9974
+ "epoch": 0.055535243332309595,
9975
+ "grad_norm": 9.125,
9976
+ "learning_rate": 0.0004909039838593585,
9977
+ "loss": 18.3856,
9978
+ "step": 28480
9979
+ },
9980
+ {
9981
+ "epoch": 0.05557424280094184,
9982
+ "grad_norm": 8.0625,
9983
+ "learning_rate": 0.0004908974818284491,
9984
+ "loss": 18.3034,
9985
+ "step": 28500
9986
+ },
9987
+ {
9988
+ "epoch": 0.05561324226957408,
9989
+ "grad_norm": 9.375,
9990
+ "learning_rate": 0.0004908909797975398,
9991
+ "loss": 18.2843,
9992
+ "step": 28520
9993
+ },
9994
+ {
9995
+ "epoch": 0.055652241738206314,
9996
+ "grad_norm": 9.4375,
9997
+ "learning_rate": 0.0004908844777666304,
9998
+ "loss": 18.2606,
9999
+ "step": 28540
10000
+ },
10001
+ {
10002
+ "epoch": 0.055691241206838556,
10003
+ "grad_norm": 8.25,
10004
+ "learning_rate": 0.0004908779757357211,
10005
+ "loss": 18.3211,
10006
+ "step": 28560
10007
+ },
10008
+ {
10009
+ "epoch": 0.0557302406754708,
10010
+ "grad_norm": 8.0625,
10011
+ "learning_rate": 0.0004908714737048117,
10012
+ "loss": 18.2635,
10013
+ "step": 28580
10014
+ },
10015
+ {
10016
+ "epoch": 0.05576924014410304,
10017
+ "grad_norm": 8.4375,
10018
+ "learning_rate": 0.0004908649716739024,
10019
+ "loss": 18.3031,
10020
+ "step": 28600
10021
+ },
10022
+ {
10023
+ "epoch": 0.055808239612735275,
10024
+ "grad_norm": 8.1875,
10025
+ "learning_rate": 0.0004908584696429931,
10026
+ "loss": 18.3625,
10027
+ "step": 28620
10028
+ },
10029
+ {
10030
+ "epoch": 0.05584723908136752,
10031
+ "grad_norm": 9.125,
10032
+ "learning_rate": 0.0004908519676120837,
10033
+ "loss": 18.2676,
10034
+ "step": 28640
10035
+ },
10036
+ {
10037
+ "epoch": 0.05588623854999976,
10038
+ "grad_norm": 8.625,
10039
+ "learning_rate": 0.0004908454655811743,
10040
+ "loss": 18.356,
10041
+ "step": 28660
10042
+ },
10043
+ {
10044
+ "epoch": 0.055925238018631994,
10045
+ "grad_norm": 8.5625,
10046
+ "learning_rate": 0.0004908389635502649,
10047
+ "loss": 18.3445,
10048
+ "step": 28680
10049
+ },
10050
+ {
10051
+ "epoch": 0.055964237487264236,
10052
+ "grad_norm": 10.4375,
10053
+ "learning_rate": 0.0004908324615193556,
10054
+ "loss": 18.2686,
10055
+ "step": 28700
10056
+ },
10057
+ {
10058
+ "epoch": 0.05600323695589648,
10059
+ "grad_norm": 9.125,
10060
+ "learning_rate": 0.0004908259594884462,
10061
+ "loss": 18.3601,
10062
+ "step": 28720
10063
+ },
10064
+ {
10065
+ "epoch": 0.05604223642452871,
10066
+ "grad_norm": 8.5625,
10067
+ "learning_rate": 0.0004908194574575369,
10068
+ "loss": 18.2382,
10069
+ "step": 28740
10070
+ },
10071
+ {
10072
+ "epoch": 0.056081235893160955,
10073
+ "grad_norm": 8.5,
10074
+ "learning_rate": 0.0004908129554266275,
10075
+ "loss": 18.2971,
10076
+ "step": 28760
10077
+ },
10078
+ {
10079
+ "epoch": 0.0561202353617932,
10080
+ "grad_norm": 8.3125,
10081
+ "learning_rate": 0.0004908064533957182,
10082
+ "loss": 18.2238,
10083
+ "step": 28780
10084
+ },
10085
+ {
10086
+ "epoch": 0.05615923483042543,
10087
+ "grad_norm": 8.1875,
10088
+ "learning_rate": 0.0004907999513648089,
10089
+ "loss": 18.32,
10090
+ "step": 28800
10091
+ },
10092
+ {
10093
+ "epoch": 0.056198234299057674,
10094
+ "grad_norm": 9.25,
10095
+ "learning_rate": 0.0004907934493338995,
10096
+ "loss": 18.2592,
10097
+ "step": 28820
10098
+ },
10099
+ {
10100
+ "epoch": 0.056237233767689916,
10101
+ "grad_norm": 8.25,
10102
+ "learning_rate": 0.0004907869473029901,
10103
+ "loss": 18.3136,
10104
+ "step": 28840
10105
+ },
10106
+ {
10107
+ "epoch": 0.05627623323632216,
10108
+ "grad_norm": 7.875,
10109
+ "learning_rate": 0.0004907804452720807,
10110
+ "loss": 18.2562,
10111
+ "step": 28860
10112
+ },
10113
+ {
10114
+ "epoch": 0.05631523270495439,
10115
+ "grad_norm": 9.25,
10116
+ "learning_rate": 0.0004907739432411714,
10117
+ "loss": 18.2101,
10118
+ "step": 28880
10119
+ },
10120
+ {
10121
+ "epoch": 0.056354232173586635,
10122
+ "grad_norm": 8.75,
10123
+ "learning_rate": 0.000490767441210262,
10124
+ "loss": 18.3202,
10125
+ "step": 28900
10126
+ },
10127
+ {
10128
+ "epoch": 0.05639323164221888,
10129
+ "grad_norm": 9.5,
10130
+ "learning_rate": 0.0004907609391793527,
10131
+ "loss": 18.3016,
10132
+ "step": 28920
10133
+ },
10134
+ {
10135
+ "epoch": 0.05643223111085111,
10136
+ "grad_norm": 8.8125,
10137
+ "learning_rate": 0.0004907544371484434,
10138
+ "loss": 18.2731,
10139
+ "step": 28940
10140
+ },
10141
+ {
10142
+ "epoch": 0.056471230579483354,
10143
+ "grad_norm": 8.4375,
10144
+ "learning_rate": 0.000490747935117534,
10145
+ "loss": 18.2778,
10146
+ "step": 28960
10147
+ },
10148
+ {
10149
+ "epoch": 0.056510230048115596,
10150
+ "grad_norm": 7.1875,
10151
+ "learning_rate": 0.0004907414330866247,
10152
+ "loss": 18.2984,
10153
+ "step": 28980
10154
+ },
10155
+ {
10156
+ "epoch": 0.05654922951674783,
10157
+ "grad_norm": 8.3125,
10158
+ "learning_rate": 0.0004907349310557152,
10159
+ "loss": 18.2915,
10160
+ "step": 29000
10161
  }
10162
  ],
10163
  "logging_steps": 20,
 
10177
  "attributes": {}
10178
  }
10179
  },
10180
+ "total_flos": 2.1319717710265844e+19,
10181
  "train_batch_size": 48,
10182
  "trial_name": null,
10183
  "trial_params": null