mohammadmahdinouri commited on
Commit
d39894e
·
verified ·
1 Parent(s): 4e38c9c

Training in progress, step 72000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:757633efe84a53c5ec97a90a7f4675f908dbeafb070171c08276f4ceae89bf82
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbefc43fcc2f8bf8bb8522016041f2a9a7a1389e937a0c7f9efe740c9281e923
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c319382408e536debfaba9985144c2b85aedc267f1adb41fa2fcd682a710d69
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e10ee0c90a6cc09cdc24b1085749ee192ca52841ac52349ee023c635a106f71
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b55180ad5c333f626bc6ef839beda747e8f0633fdb8a2329d1af0642155fcad0
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a2f1706dfc950df47249e8d65d6df596c2f98887c24dba54cde743e4804d2cf
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6fb9e669a1e66d6084675ac17f9361f1d66f6538870dda5d62bb9fedf0717021
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:488c74f8a1dc2a7148ae3d9f18c7e9fcbb141512e2f149cd1d29674d054be2f3
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93c86e46203b6a91184b0093d776c5c5cbb5568a55f409f62928f5b11605d793
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77448ddbc0e5f35d8ef3a4b1063eb25209d701957cc23b3671796af1520e431c
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57649dd5fae41007b8326ad8bceda3664e8263c16462c398827f7c60518777a9
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3acb48030fde17938d59bf929c695a9b6dbd4fe2687e2cce76096a6e14351d6
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:065fc078fd1aeeb645695c18fb1eff98c533b26302779a57f06b17d1e0565e6a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:514d743b09cdf67b5f7ccba0c67283da3d20aa73a759bcf5ebfccf66234e08c8
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.10517334344577499,
6
  "eval_steps": 500,
7
- "global_step": 71000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -24858,6 +24858,356 @@
24858
  "learning_rate": 0.00048259050104507866,
24859
  "loss": 16.5599,
24860
  "step": 71000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24861
  }
24862
  ],
24863
  "logging_steps": 20,
@@ -24877,7 +25227,7 @@
24877
  "attributes": {}
24878
  }
24879
  },
24880
- "total_flos": 5.220171364156257e+19,
24881
  "train_batch_size": 48,
24882
  "trial_name": null,
24883
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.10665465814219437,
6
  "eval_steps": 500,
7
+ "global_step": 72000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
24858
  "learning_rate": 0.00048259050104507866,
24859
  "loss": 16.5599,
24860
  "step": 71000
24861
+ },
24862
+ {
24863
+ "epoch": 0.10520296973970338,
24864
+ "grad_norm": 6.84375,
24865
+ "learning_rate": 0.0004825855621100711,
24866
+ "loss": 16.4962,
24867
+ "step": 71020
24868
+ },
24869
+ {
24870
+ "epoch": 0.10523259603363178,
24871
+ "grad_norm": 6.875,
24872
+ "learning_rate": 0.0004825806231750635,
24873
+ "loss": 16.4845,
24874
+ "step": 71040
24875
+ },
24876
+ {
24877
+ "epoch": 0.10526222232756016,
24878
+ "grad_norm": 6.71875,
24879
+ "learning_rate": 0.00048257568424005595,
24880
+ "loss": 16.5411,
24881
+ "step": 71060
24882
+ },
24883
+ {
24884
+ "epoch": 0.10529184862148855,
24885
+ "grad_norm": 6.65625,
24886
+ "learning_rate": 0.0004825707453050484,
24887
+ "loss": 16.5235,
24888
+ "step": 71080
24889
+ },
24890
+ {
24891
+ "epoch": 0.10532147491541693,
24892
+ "grad_norm": 6.375,
24893
+ "learning_rate": 0.00048256580637004085,
24894
+ "loss": 16.5079,
24895
+ "step": 71100
24896
+ },
24897
+ {
24898
+ "epoch": 0.10535110120934532,
24899
+ "grad_norm": 6.375,
24900
+ "learning_rate": 0.00048256086743503324,
24901
+ "loss": 16.5093,
24902
+ "step": 71120
24903
+ },
24904
+ {
24905
+ "epoch": 0.10538072750327371,
24906
+ "grad_norm": 6.84375,
24907
+ "learning_rate": 0.0004825559285000257,
24908
+ "loss": 16.5136,
24909
+ "step": 71140
24910
+ },
24911
+ {
24912
+ "epoch": 0.1054103537972021,
24913
+ "grad_norm": 6.8125,
24914
+ "learning_rate": 0.00048255098956501814,
24915
+ "loss": 16.4724,
24916
+ "step": 71160
24917
+ },
24918
+ {
24919
+ "epoch": 0.10543998009113048,
24920
+ "grad_norm": 6.25,
24921
+ "learning_rate": 0.0004825460506300106,
24922
+ "loss": 16.4691,
24923
+ "step": 71180
24924
+ },
24925
+ {
24926
+ "epoch": 0.10546960638505887,
24927
+ "grad_norm": 7.96875,
24928
+ "learning_rate": 0.000482541111695003,
24929
+ "loss": 16.4072,
24930
+ "step": 71200
24931
+ },
24932
+ {
24933
+ "epoch": 0.10549923267898725,
24934
+ "grad_norm": 6.375,
24935
+ "learning_rate": 0.0004825361727599954,
24936
+ "loss": 16.531,
24937
+ "step": 71220
24938
+ },
24939
+ {
24940
+ "epoch": 0.10552885897291564,
24941
+ "grad_norm": 6.3125,
24942
+ "learning_rate": 0.0004825312338249879,
24943
+ "loss": 16.5211,
24944
+ "step": 71240
24945
+ },
24946
+ {
24947
+ "epoch": 0.10555848526684403,
24948
+ "grad_norm": 6.625,
24949
+ "learning_rate": 0.00048252629488998027,
24950
+ "loss": 16.5079,
24951
+ "step": 71260
24952
+ },
24953
+ {
24954
+ "epoch": 0.10558811156077241,
24955
+ "grad_norm": 6.5625,
24956
+ "learning_rate": 0.0004825213559549727,
24957
+ "loss": 16.4813,
24958
+ "step": 71280
24959
+ },
24960
+ {
24961
+ "epoch": 0.1056177378547008,
24962
+ "grad_norm": 7.5,
24963
+ "learning_rate": 0.00048251641701996516,
24964
+ "loss": 16.5194,
24965
+ "step": 71300
24966
+ },
24967
+ {
24968
+ "epoch": 0.10564736414862919,
24969
+ "grad_norm": 6.84375,
24970
+ "learning_rate": 0.0004825114780849576,
24971
+ "loss": 16.4672,
24972
+ "step": 71320
24973
+ },
24974
+ {
24975
+ "epoch": 0.10567699044255757,
24976
+ "grad_norm": 6.90625,
24977
+ "learning_rate": 0.00048250653914995,
24978
+ "loss": 16.5088,
24979
+ "step": 71340
24980
+ },
24981
+ {
24982
+ "epoch": 0.10570661673648597,
24983
+ "grad_norm": 7.53125,
24984
+ "learning_rate": 0.00048250160021494245,
24985
+ "loss": 16.5076,
24986
+ "step": 71360
24987
+ },
24988
+ {
24989
+ "epoch": 0.10573624303041436,
24990
+ "grad_norm": 5.65625,
24991
+ "learning_rate": 0.0004824966612799349,
24992
+ "loss": 16.4723,
24993
+ "step": 71380
24994
+ },
24995
+ {
24996
+ "epoch": 0.10576586932434275,
24997
+ "grad_norm": 6.3125,
24998
+ "learning_rate": 0.00048249172234492735,
24999
+ "loss": 16.4759,
25000
+ "step": 71400
25001
+ },
25002
+ {
25003
+ "epoch": 0.10579549561827113,
25004
+ "grad_norm": 6.03125,
25005
+ "learning_rate": 0.00048248678340991974,
25006
+ "loss": 16.4789,
25007
+ "step": 71420
25008
+ },
25009
+ {
25010
+ "epoch": 0.10582512191219952,
25011
+ "grad_norm": 6.71875,
25012
+ "learning_rate": 0.0004824818444749122,
25013
+ "loss": 16.5194,
25014
+ "step": 71440
25015
+ },
25016
+ {
25017
+ "epoch": 0.1058547482061279,
25018
+ "grad_norm": 7.1875,
25019
+ "learning_rate": 0.00048247690553990464,
25020
+ "loss": 16.5426,
25021
+ "step": 71460
25022
+ },
25023
+ {
25024
+ "epoch": 0.10588437450005629,
25025
+ "grad_norm": 6.40625,
25026
+ "learning_rate": 0.0004824719666048971,
25027
+ "loss": 16.5093,
25028
+ "step": 71480
25029
+ },
25030
+ {
25031
+ "epoch": 0.10591400079398468,
25032
+ "grad_norm": 7.28125,
25033
+ "learning_rate": 0.0004824670276698895,
25034
+ "loss": 16.5204,
25035
+ "step": 71500
25036
+ },
25037
+ {
25038
+ "epoch": 0.10594362708791306,
25039
+ "grad_norm": 7.0,
25040
+ "learning_rate": 0.000482462088734882,
25041
+ "loss": 16.5197,
25042
+ "step": 71520
25043
+ },
25044
+ {
25045
+ "epoch": 0.10597325338184145,
25046
+ "grad_norm": 6.1875,
25047
+ "learning_rate": 0.0004824571497998744,
25048
+ "loss": 16.4954,
25049
+ "step": 71540
25050
+ },
25051
+ {
25052
+ "epoch": 0.10600287967576984,
25053
+ "grad_norm": 6.6875,
25054
+ "learning_rate": 0.00048245221086486677,
25055
+ "loss": 16.4379,
25056
+ "step": 71560
25057
+ },
25058
+ {
25059
+ "epoch": 0.10603250596969822,
25060
+ "grad_norm": 7.8125,
25061
+ "learning_rate": 0.0004824472719298592,
25062
+ "loss": 16.4314,
25063
+ "step": 71580
25064
+ },
25065
+ {
25066
+ "epoch": 0.10606213226362661,
25067
+ "grad_norm": 7.40625,
25068
+ "learning_rate": 0.00048244233299485166,
25069
+ "loss": 16.4561,
25070
+ "step": 71600
25071
+ },
25072
+ {
25073
+ "epoch": 0.106091758557555,
25074
+ "grad_norm": 6.625,
25075
+ "learning_rate": 0.0004824373940598441,
25076
+ "loss": 16.5224,
25077
+ "step": 71620
25078
+ },
25079
+ {
25080
+ "epoch": 0.10612138485148338,
25081
+ "grad_norm": 6.78125,
25082
+ "learning_rate": 0.0004824324551248365,
25083
+ "loss": 16.4732,
25084
+ "step": 71640
25085
+ },
25086
+ {
25087
+ "epoch": 0.10615101114541177,
25088
+ "grad_norm": 6.90625,
25089
+ "learning_rate": 0.00048242751618982895,
25090
+ "loss": 16.4654,
25091
+ "step": 71660
25092
+ },
25093
+ {
25094
+ "epoch": 0.10618063743934017,
25095
+ "grad_norm": 7.875,
25096
+ "learning_rate": 0.0004824225772548214,
25097
+ "loss": 16.5161,
25098
+ "step": 71680
25099
+ },
25100
+ {
25101
+ "epoch": 0.10621026373326856,
25102
+ "grad_norm": 6.09375,
25103
+ "learning_rate": 0.00048241763831981385,
25104
+ "loss": 16.446,
25105
+ "step": 71700
25106
+ },
25107
+ {
25108
+ "epoch": 0.10623989002719694,
25109
+ "grad_norm": 7.1875,
25110
+ "learning_rate": 0.00048241269938480624,
25111
+ "loss": 16.5245,
25112
+ "step": 71720
25113
+ },
25114
+ {
25115
+ "epoch": 0.10626951632112533,
25116
+ "grad_norm": 6.1875,
25117
+ "learning_rate": 0.0004824077604497987,
25118
+ "loss": 16.4989,
25119
+ "step": 71740
25120
+ },
25121
+ {
25122
+ "epoch": 0.10629914261505372,
25123
+ "grad_norm": 7.0,
25124
+ "learning_rate": 0.00048240282151479114,
25125
+ "loss": 16.5321,
25126
+ "step": 71760
25127
+ },
25128
+ {
25129
+ "epoch": 0.1063287689089821,
25130
+ "grad_norm": 6.15625,
25131
+ "learning_rate": 0.0004823978825797836,
25132
+ "loss": 16.4498,
25133
+ "step": 71780
25134
+ },
25135
+ {
25136
+ "epoch": 0.10635839520291049,
25137
+ "grad_norm": 6.875,
25138
+ "learning_rate": 0.000482392943644776,
25139
+ "loss": 16.4793,
25140
+ "step": 71800
25141
+ },
25142
+ {
25143
+ "epoch": 0.10638802149683887,
25144
+ "grad_norm": 7.90625,
25145
+ "learning_rate": 0.0004823880047097685,
25146
+ "loss": 16.4312,
25147
+ "step": 71820
25148
+ },
25149
+ {
25150
+ "epoch": 0.10641764779076726,
25151
+ "grad_norm": 6.625,
25152
+ "learning_rate": 0.0004823830657747609,
25153
+ "loss": 16.5154,
25154
+ "step": 71840
25155
+ },
25156
+ {
25157
+ "epoch": 0.10644727408469565,
25158
+ "grad_norm": 7.5,
25159
+ "learning_rate": 0.00048237812683975327,
25160
+ "loss": 16.4446,
25161
+ "step": 71860
25162
+ },
25163
+ {
25164
+ "epoch": 0.10647690037862403,
25165
+ "grad_norm": 7.25,
25166
+ "learning_rate": 0.0004823731879047457,
25167
+ "loss": 16.5021,
25168
+ "step": 71880
25169
+ },
25170
+ {
25171
+ "epoch": 0.10650652667255242,
25172
+ "grad_norm": 6.5,
25173
+ "learning_rate": 0.00048236824896973816,
25174
+ "loss": 16.4583,
25175
+ "step": 71900
25176
+ },
25177
+ {
25178
+ "epoch": 0.10653615296648081,
25179
+ "grad_norm": 6.625,
25180
+ "learning_rate": 0.0004823633100347306,
25181
+ "loss": 16.4529,
25182
+ "step": 71920
25183
+ },
25184
+ {
25185
+ "epoch": 0.1065657792604092,
25186
+ "grad_norm": 6.15625,
25187
+ "learning_rate": 0.000482358371099723,
25188
+ "loss": 16.4529,
25189
+ "step": 71940
25190
+ },
25191
+ {
25192
+ "epoch": 0.10659540555433758,
25193
+ "grad_norm": 6.53125,
25194
+ "learning_rate": 0.00048235343216471545,
25195
+ "loss": 16.4638,
25196
+ "step": 71960
25197
+ },
25198
+ {
25199
+ "epoch": 0.10662503184826597,
25200
+ "grad_norm": 6.5,
25201
+ "learning_rate": 0.0004823484932297079,
25202
+ "loss": 16.4739,
25203
+ "step": 71980
25204
+ },
25205
+ {
25206
+ "epoch": 0.10665465814219437,
25207
+ "grad_norm": 6.75,
25208
+ "learning_rate": 0.00048234355429470035,
25209
+ "loss": 16.5261,
25210
+ "step": 72000
25211
  }
25212
  ],
25213
  "logging_steps": 20,
 
25227
  "attributes": {}
25228
  }
25229
  },
25230
+ "total_flos": 5.293707639198528e+19,
25231
  "train_batch_size": 48,
25232
  "trial_name": null,
25233
  "trial_params": null