mohammadmahdinouri commited on
Commit
d5e815e
·
verified ·
1 Parent(s): 3b8e1dd

Training in progress, step 18000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e1f6084c2fd12874836176a807971d304a89f7ecfc63e2081a9bd54f224b13b
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13fbe4723123a9c016392f22f5c5a607f137024e3a3211fa73da181d0f6cd1aa
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:729c0d767d06adf4295f1acf80d3c9a43aee84e3de6cc9a899725bd2d9ba998b
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aa6efd41ace1816d77bf0b60c121855a1169e94c3066ee2c4a8939be056cb68
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6775411b7c96ce112db0ff86dbc4c7f4f5876ba69512e78981d49611b5ed959e
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98e45d3c16114f00517a9e754366d6be11045def442e0374684988d3ee13c529
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c2e00f40f2b965358ee58725a6039af41eeb8a8f4527ae152ec5dad618307fd
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:773184c6d03f9fc1dff724dd2ebc3487575db231883b47dc4663fdc68f33bddb
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37ee15f1c9ceef9e456d1af53da3ed0fd0ec244051b974379f15c285ed42f8b7
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9a97caacfd2ffecaa53d612d1aaec198c719ff4db983e8469e19a70730a6af9
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e79d7f1dfea25dc4809dc0e5c220d70f3b690693b546131b59ad7f9ed9b129c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee307f509a475bceeb88f57a12c9dbe31c5cc43a16b915e7c00fca8b909b56f5
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84d957adbd57639a95ced1440a685d29db26c75001a9b3061d2f7af9b9a721b1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:add33ce1c647f1ad24436fdd2c7095ade5081fad618777000690c7e187278b49
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.033149548337403904,
6
  "eval_steps": 500,
7
- "global_step": 17000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5958,6 +5958,356 @@
5958
  "learning_rate": 0.000494636149601328,
5959
  "loss": 20.0712,
5960
  "step": 17000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5961
  }
5962
  ],
5963
  "logging_steps": 20,
@@ -5977,7 +6327,7 @@
5977
  "attributes": {}
5978
  }
5979
  },
5980
- "total_flos": 1.2497927616331776e+19,
5981
  "train_batch_size": 48,
5982
  "trial_name": null,
5983
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.035099521769015894,
6
  "eval_steps": 500,
7
+ "global_step": 18000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5958
  "learning_rate": 0.000494636149601328,
5959
  "loss": 20.0712,
5960
  "step": 17000
5961
+ },
5962
+ {
5963
+ "epoch": 0.03318854780603614,
5964
+ "grad_norm": 11.0,
5965
+ "learning_rate": 0.0004946296475704186,
5966
+ "loss": 20.1068,
5967
+ "step": 17020
5968
+ },
5969
+ {
5970
+ "epoch": 0.03322754727466838,
5971
+ "grad_norm": 10.1875,
5972
+ "learning_rate": 0.0004946231455395093,
5973
+ "loss": 20.0552,
5974
+ "step": 17040
5975
+ },
5976
+ {
5977
+ "epoch": 0.03326654674330062,
5978
+ "grad_norm": 12.5,
5979
+ "learning_rate": 0.0004946166435085999,
5980
+ "loss": 20.0382,
5981
+ "step": 17060
5982
+ },
5983
+ {
5984
+ "epoch": 0.033305546211932865,
5985
+ "grad_norm": 11.125,
5986
+ "learning_rate": 0.0004946101414776906,
5987
+ "loss": 20.1285,
5988
+ "step": 17080
5989
+ },
5990
+ {
5991
+ "epoch": 0.0333445456805651,
5992
+ "grad_norm": 10.375,
5993
+ "learning_rate": 0.0004946036394467813,
5994
+ "loss": 20.0373,
5995
+ "step": 17100
5996
+ },
5997
+ {
5998
+ "epoch": 0.03338354514919734,
5999
+ "grad_norm": 9.9375,
6000
+ "learning_rate": 0.0004945971374158719,
6001
+ "loss": 20.1946,
6002
+ "step": 17120
6003
+ },
6004
+ {
6005
+ "epoch": 0.033422544617829585,
6006
+ "grad_norm": 10.6875,
6007
+ "learning_rate": 0.0004945906353849625,
6008
+ "loss": 20.1412,
6009
+ "step": 17140
6010
+ },
6011
+ {
6012
+ "epoch": 0.03346154408646182,
6013
+ "grad_norm": 9.5,
6014
+ "learning_rate": 0.0004945841333540531,
6015
+ "loss": 20.078,
6016
+ "step": 17160
6017
+ },
6018
+ {
6019
+ "epoch": 0.03350054355509406,
6020
+ "grad_norm": 10.5,
6021
+ "learning_rate": 0.0004945776313231438,
6022
+ "loss": 20.0913,
6023
+ "step": 17180
6024
+ },
6025
+ {
6026
+ "epoch": 0.033539543023726304,
6027
+ "grad_norm": 11.75,
6028
+ "learning_rate": 0.0004945711292922344,
6029
+ "loss": 20.1428,
6030
+ "step": 17200
6031
+ },
6032
+ {
6033
+ "epoch": 0.03357854249235854,
6034
+ "grad_norm": 10.8125,
6035
+ "learning_rate": 0.0004945646272613251,
6036
+ "loss": 20.0407,
6037
+ "step": 17220
6038
+ },
6039
+ {
6040
+ "epoch": 0.03361754196099078,
6041
+ "grad_norm": 10.5625,
6042
+ "learning_rate": 0.0004945581252304157,
6043
+ "loss": 20.1396,
6044
+ "step": 17240
6045
+ },
6046
+ {
6047
+ "epoch": 0.03365654142962302,
6048
+ "grad_norm": 10.0,
6049
+ "learning_rate": 0.0004945516231995064,
6050
+ "loss": 20.0334,
6051
+ "step": 17260
6052
+ },
6053
+ {
6054
+ "epoch": 0.03369554089825526,
6055
+ "grad_norm": 9.875,
6056
+ "learning_rate": 0.0004945451211685971,
6057
+ "loss": 19.9909,
6058
+ "step": 17280
6059
+ },
6060
+ {
6061
+ "epoch": 0.0337345403668875,
6062
+ "grad_norm": 12.4375,
6063
+ "learning_rate": 0.0004945386191376876,
6064
+ "loss": 20.0374,
6065
+ "step": 17300
6066
+ },
6067
+ {
6068
+ "epoch": 0.03377353983551974,
6069
+ "grad_norm": 9.75,
6070
+ "learning_rate": 0.0004945321171067783,
6071
+ "loss": 20.0703,
6072
+ "step": 17320
6073
+ },
6074
+ {
6075
+ "epoch": 0.033812539304151984,
6076
+ "grad_norm": 11.375,
6077
+ "learning_rate": 0.0004945256150758689,
6078
+ "loss": 19.9489,
6079
+ "step": 17340
6080
+ },
6081
+ {
6082
+ "epoch": 0.03385153877278422,
6083
+ "grad_norm": 11.3125,
6084
+ "learning_rate": 0.0004945191130449596,
6085
+ "loss": 19.9904,
6086
+ "step": 17360
6087
+ },
6088
+ {
6089
+ "epoch": 0.03389053824141646,
6090
+ "grad_norm": 10.3125,
6091
+ "learning_rate": 0.0004945126110140502,
6092
+ "loss": 19.9895,
6093
+ "step": 17380
6094
+ },
6095
+ {
6096
+ "epoch": 0.0339295377100487,
6097
+ "grad_norm": 11.0,
6098
+ "learning_rate": 0.0004945061089831409,
6099
+ "loss": 20.0525,
6100
+ "step": 17400
6101
+ },
6102
+ {
6103
+ "epoch": 0.03396853717868094,
6104
+ "grad_norm": 10.0,
6105
+ "learning_rate": 0.0004944996069522316,
6106
+ "loss": 20.0451,
6107
+ "step": 17420
6108
+ },
6109
+ {
6110
+ "epoch": 0.03400753664731318,
6111
+ "grad_norm": 10.625,
6112
+ "learning_rate": 0.0004944931049213222,
6113
+ "loss": 20.0506,
6114
+ "step": 17440
6115
+ },
6116
+ {
6117
+ "epoch": 0.03404653611594542,
6118
+ "grad_norm": 10.4375,
6119
+ "learning_rate": 0.0004944866028904128,
6120
+ "loss": 19.9625,
6121
+ "step": 17460
6122
+ },
6123
+ {
6124
+ "epoch": 0.03408553558457766,
6125
+ "grad_norm": 11.625,
6126
+ "learning_rate": 0.0004944801008595034,
6127
+ "loss": 19.995,
6128
+ "step": 17480
6129
+ },
6130
+ {
6131
+ "epoch": 0.0341245350532099,
6132
+ "grad_norm": 11.0625,
6133
+ "learning_rate": 0.0004944735988285941,
6134
+ "loss": 20.1062,
6135
+ "step": 17500
6136
+ },
6137
+ {
6138
+ "epoch": 0.03416353452184214,
6139
+ "grad_norm": 10.4375,
6140
+ "learning_rate": 0.0004944670967976847,
6141
+ "loss": 19.9454,
6142
+ "step": 17520
6143
+ },
6144
+ {
6145
+ "epoch": 0.034202533990474376,
6146
+ "grad_norm": 9.3125,
6147
+ "learning_rate": 0.0004944605947667754,
6148
+ "loss": 19.8752,
6149
+ "step": 17540
6150
+ },
6151
+ {
6152
+ "epoch": 0.03424153345910662,
6153
+ "grad_norm": 10.375,
6154
+ "learning_rate": 0.000494454092735866,
6155
+ "loss": 19.9649,
6156
+ "step": 17560
6157
+ },
6158
+ {
6159
+ "epoch": 0.03428053292773886,
6160
+ "grad_norm": 10.125,
6161
+ "learning_rate": 0.0004944475907049567,
6162
+ "loss": 19.9261,
6163
+ "step": 17580
6164
+ },
6165
+ {
6166
+ "epoch": 0.0343195323963711,
6167
+ "grad_norm": 9.875,
6168
+ "learning_rate": 0.0004944410886740474,
6169
+ "loss": 19.909,
6170
+ "step": 17600
6171
+ },
6172
+ {
6173
+ "epoch": 0.03435853186500334,
6174
+ "grad_norm": 10.75,
6175
+ "learning_rate": 0.000494434586643138,
6176
+ "loss": 19.9778,
6177
+ "step": 17620
6178
+ },
6179
+ {
6180
+ "epoch": 0.03439753133363558,
6181
+ "grad_norm": 11.5,
6182
+ "learning_rate": 0.0004944280846122287,
6183
+ "loss": 19.9709,
6184
+ "step": 17640
6185
+ },
6186
+ {
6187
+ "epoch": 0.03443653080226782,
6188
+ "grad_norm": 11.125,
6189
+ "learning_rate": 0.0004944215825813193,
6190
+ "loss": 19.9898,
6191
+ "step": 17660
6192
+ },
6193
+ {
6194
+ "epoch": 0.03447553027090006,
6195
+ "grad_norm": 10.5625,
6196
+ "learning_rate": 0.00049441508055041,
6197
+ "loss": 19.9979,
6198
+ "step": 17680
6199
+ },
6200
+ {
6201
+ "epoch": 0.0345145297395323,
6202
+ "grad_norm": 9.5625,
6203
+ "learning_rate": 0.0004944085785195005,
6204
+ "loss": 19.9206,
6205
+ "step": 17700
6206
+ },
6207
+ {
6208
+ "epoch": 0.03455352920816454,
6209
+ "grad_norm": 11.0,
6210
+ "learning_rate": 0.0004944020764885912,
6211
+ "loss": 19.9701,
6212
+ "step": 17720
6213
+ },
6214
+ {
6215
+ "epoch": 0.034592528676796776,
6216
+ "grad_norm": 10.75,
6217
+ "learning_rate": 0.0004943955744576818,
6218
+ "loss": 19.9937,
6219
+ "step": 17740
6220
+ },
6221
+ {
6222
+ "epoch": 0.03463152814542902,
6223
+ "grad_norm": 12.5625,
6224
+ "learning_rate": 0.0004943890724267725,
6225
+ "loss": 20.0349,
6226
+ "step": 17760
6227
+ },
6228
+ {
6229
+ "epoch": 0.03467052761406126,
6230
+ "grad_norm": 11.3125,
6231
+ "learning_rate": 0.0004943825703958632,
6232
+ "loss": 19.8582,
6233
+ "step": 17780
6234
+ },
6235
+ {
6236
+ "epoch": 0.034709527082693495,
6237
+ "grad_norm": 12.125,
6238
+ "learning_rate": 0.0004943760683649538,
6239
+ "loss": 19.9185,
6240
+ "step": 17800
6241
+ },
6242
+ {
6243
+ "epoch": 0.03474852655132574,
6244
+ "grad_norm": 10.625,
6245
+ "learning_rate": 0.0004943695663340445,
6246
+ "loss": 19.9073,
6247
+ "step": 17820
6248
+ },
6249
+ {
6250
+ "epoch": 0.03478752601995798,
6251
+ "grad_norm": 9.8125,
6252
+ "learning_rate": 0.0004943630643031351,
6253
+ "loss": 19.8189,
6254
+ "step": 17840
6255
+ },
6256
+ {
6257
+ "epoch": 0.03482652548859022,
6258
+ "grad_norm": 12.375,
6259
+ "learning_rate": 0.0004943565622722258,
6260
+ "loss": 20.0152,
6261
+ "step": 17860
6262
+ },
6263
+ {
6264
+ "epoch": 0.034865524957222456,
6265
+ "grad_norm": 10.0,
6266
+ "learning_rate": 0.0004943500602413164,
6267
+ "loss": 19.9768,
6268
+ "step": 17880
6269
+ },
6270
+ {
6271
+ "epoch": 0.0349045244258547,
6272
+ "grad_norm": 10.5,
6273
+ "learning_rate": 0.0004943435582104071,
6274
+ "loss": 19.9124,
6275
+ "step": 17900
6276
+ },
6277
+ {
6278
+ "epoch": 0.03494352389448694,
6279
+ "grad_norm": 9.5,
6280
+ "learning_rate": 0.0004943370561794977,
6281
+ "loss": 19.8925,
6282
+ "step": 17920
6283
+ },
6284
+ {
6285
+ "epoch": 0.034982523363119175,
6286
+ "grad_norm": 8.75,
6287
+ "learning_rate": 0.0004943305541485883,
6288
+ "loss": 19.9456,
6289
+ "step": 17940
6290
+ },
6291
+ {
6292
+ "epoch": 0.03502152283175142,
6293
+ "grad_norm": 10.625,
6294
+ "learning_rate": 0.000494324052117679,
6295
+ "loss": 19.8603,
6296
+ "step": 17960
6297
+ },
6298
+ {
6299
+ "epoch": 0.03506052230038366,
6300
+ "grad_norm": 10.25,
6301
+ "learning_rate": 0.0004943175500867696,
6302
+ "loss": 19.909,
6303
+ "step": 17980
6304
+ },
6305
+ {
6306
+ "epoch": 0.035099521769015894,
6307
+ "grad_norm": 9.5625,
6308
+ "learning_rate": 0.0004943110480558603,
6309
+ "loss": 19.8528,
6310
+ "step": 18000
6311
  }
6312
  ],
6313
  "logging_steps": 20,
 
6327
  "attributes": {}
6328
  }
6329
  },
6330
+ "total_flos": 1.3232995623550058e+19,
6331
  "train_batch_size": 48,
6332
  "trial_name": null,
6333
  "trial_params": null