Wilsonwin commited on
Commit
479bd3a
·
verified ·
1 Parent(s): 63f8fba

Training in progress, step 11500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62f77201047c0ff7c5527ffc5ccf11b4138f77fae747adff964ee88ae1f98afc
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbe911786fb4b3454d02608c237b36cc20b52333d42fa68272921c094a01a632
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ace327ac217bad5e9c3541a67c8adbffd0c6930f7ad271ab5e15f9a6306ce52e
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83e156617695722c4ccab8876c70abb964581f51616c0cec63d83f236c2f3130
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca9715fac08ad0b70edb3a378bc21ad649dabc882b316cdb77b215f678babe3b
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ffe8fac68804a21cd2d55f992832d3e8fb9ed8d46f7a6aafd6debfef9c29633
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d77946d2c30708215d82675369c6b0f4ea0ac50e0bfa8851a58c893e34baac40
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:116f0b85bffdc97adeb264e8dbd65d6acc7d514e82a48ea5ea50bd5091784a48
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.8584220307484371,
6
  "eval_steps": 500,
7
- "global_step": 11000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7891,6 +7891,364 @@
7891
  "eval_samples_per_second": 262.842,
7892
  "eval_steps_per_second": 5.52,
7893
  "step": 11000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7894
  }
7895
  ],
7896
  "logging_steps": 10,
@@ -7910,7 +8268,7 @@
7910
  "attributes": {}
7911
  }
7912
  },
7913
- "total_flos": 3.679004125896376e+17,
7914
  "train_batch_size": 48,
7915
  "trial_name": null,
7916
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.9428957594188208,
6
  "eval_steps": 500,
7
+ "global_step": 11500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7891
  "eval_samples_per_second": 262.842,
7892
  "eval_steps_per_second": 5.52,
7893
  "step": 11000
7894
+ },
7895
+ {
7896
+ "epoch": 1.8601115053218449,
7897
+ "grad_norm": 0.455091655254364,
7898
+ "learning_rate": 5.225389455120976e-06,
7899
+ "loss": 4.29366455078125,
7900
+ "step": 11010
7901
+ },
7902
+ {
7903
+ "epoch": 1.8618009798952526,
7904
+ "grad_norm": 0.46570661664009094,
7905
+ "learning_rate": 5.1008000218881576e-06,
7906
+ "loss": 4.286912536621093,
7907
+ "step": 11020
7908
+ },
7909
+ {
7910
+ "epoch": 1.8634904544686601,
7911
+ "grad_norm": 0.4428755044937134,
7912
+ "learning_rate": 4.977688171116923e-06,
7913
+ "loss": 4.300152206420899,
7914
+ "step": 11030
7915
+ },
7916
+ {
7917
+ "epoch": 1.8651799290420679,
7918
+ "grad_norm": 0.4450303316116333,
7919
+ "learning_rate": 4.856055158217298e-06,
7920
+ "loss": 4.289414978027343,
7921
+ "step": 11040
7922
+ },
7923
+ {
7924
+ "epoch": 1.8668694036154756,
7925
+ "grad_norm": 0.45025452971458435,
7926
+ "learning_rate": 4.735902223519173e-06,
7927
+ "loss": 4.273600006103516,
7928
+ "step": 11050
7929
+ },
7930
+ {
7931
+ "epoch": 1.868558878188883,
7932
+ "grad_norm": 0.4395146071910858,
7933
+ "learning_rate": 4.6172305922595746e-06,
7934
+ "loss": 4.288070297241211,
7935
+ "step": 11060
7936
+ },
7937
+ {
7938
+ "epoch": 1.870248352762291,
7939
+ "grad_norm": 0.45017367601394653,
7940
+ "learning_rate": 4.500041474570265e-06,
7941
+ "loss": 4.298558044433594,
7942
+ "step": 11070
7943
+ },
7944
+ {
7945
+ "epoch": 1.8719378273356986,
7946
+ "grad_norm": 0.45083948969841003,
7947
+ "learning_rate": 4.384336065465349e-06,
7948
+ "loss": 4.278664398193359,
7949
+ "step": 11080
7950
+ },
7951
+ {
7952
+ "epoch": 1.8736273019091063,
7953
+ "grad_norm": 0.4492949843406677,
7954
+ "learning_rate": 4.270115544829017e-06,
7955
+ "loss": 4.304440307617187,
7956
+ "step": 11090
7957
+ },
7958
+ {
7959
+ "epoch": 1.875316776482514,
7960
+ "grad_norm": 0.4543094336986542,
7961
+ "learning_rate": 4.1573810774037044e-06,
7962
+ "loss": 4.284811401367188,
7963
+ "step": 11100
7964
+ },
7965
+ {
7966
+ "epoch": 1.8770062510559216,
7967
+ "grad_norm": 0.44232332706451416,
7968
+ "learning_rate": 4.046133812777985e-06,
7969
+ "loss": 4.275522232055664,
7970
+ "step": 11110
7971
+ },
7972
+ {
7973
+ "epoch": 1.8786957256293293,
7974
+ "grad_norm": 0.4532018005847931,
7975
+ "learning_rate": 3.936374885375049e-06,
7976
+ "loss": 4.325132751464844,
7977
+ "step": 11120
7978
+ },
7979
+ {
7980
+ "epoch": 1.880385200202737,
7981
+ "grad_norm": 0.44870230555534363,
7982
+ "learning_rate": 3.828105414440974e-06,
7983
+ "loss": 4.293384170532226,
7984
+ "step": 11130
7985
+ },
7986
+ {
7987
+ "epoch": 1.8820746747761445,
7988
+ "grad_norm": 0.4408150017261505,
7989
+ "learning_rate": 3.7213265040334394e-06,
7990
+ "loss": 4.296081161499023,
7991
+ "step": 11140
7992
+ },
7993
+ {
7994
+ "epoch": 1.8837641493495523,
7995
+ "grad_norm": 0.44336998462677,
7996
+ "learning_rate": 3.616039243010399e-06,
7997
+ "loss": 4.299095916748047,
7998
+ "step": 11150
7999
+ },
8000
+ {
8001
+ "epoch": 1.88545362392296,
8002
+ "grad_norm": 0.4412024915218353,
8003
+ "learning_rate": 3.5122447050189573e-06,
8004
+ "loss": 4.285486221313477,
8005
+ "step": 11160
8006
+ },
8007
+ {
8008
+ "epoch": 1.8871430984963675,
8009
+ "grad_norm": 0.45283156633377075,
8010
+ "learning_rate": 3.4099439484844947e-06,
8011
+ "loss": 4.294749069213867,
8012
+ "step": 11170
8013
+ },
8014
+ {
8015
+ "epoch": 1.8888325730697753,
8016
+ "grad_norm": 0.4460100829601288,
8017
+ "learning_rate": 3.3091380165998103e-06,
8018
+ "loss": 4.310376739501953,
8019
+ "step": 11180
8020
+ },
8021
+ {
8022
+ "epoch": 1.890522047643183,
8023
+ "grad_norm": 0.44468414783477783,
8024
+ "learning_rate": 3.2098279373145463e-06,
8025
+ "loss": 4.327771377563477,
8026
+ "step": 11190
8027
+ },
8028
+ {
8029
+ "epoch": 1.8922115222165905,
8030
+ "grad_norm": 0.4378024637699127,
8031
+ "learning_rate": 3.1120147233246463e-06,
8032
+ "loss": 4.273694610595703,
8033
+ "step": 11200
8034
+ },
8035
+ {
8036
+ "epoch": 1.8939009967899985,
8037
+ "grad_norm": 0.44141000509262085,
8038
+ "learning_rate": 3.0156993720619804e-06,
8039
+ "loss": 4.287034606933593,
8040
+ "step": 11210
8041
+ },
8042
+ {
8043
+ "epoch": 1.895590471363406,
8044
+ "grad_norm": 0.4455374479293823,
8045
+ "learning_rate": 2.9208828656843876e-06,
8046
+ "loss": 4.320920181274414,
8047
+ "step": 11220
8048
+ },
8049
+ {
8050
+ "epoch": 1.8972799459368137,
8051
+ "grad_norm": 0.45368343591690063,
8052
+ "learning_rate": 2.827566171065415e-06,
8053
+ "loss": 4.285198974609375,
8054
+ "step": 11230
8055
+ },
8056
+ {
8057
+ "epoch": 1.8989694205102214,
8058
+ "grad_norm": 0.44222062826156616,
8059
+ "learning_rate": 2.7357502397845454e-06,
8060
+ "loss": 4.296764755249024,
8061
+ "step": 11240
8062
+ },
8063
+ {
8064
+ "epoch": 1.900658895083629,
8065
+ "grad_norm": 0.45191657543182373,
8066
+ "learning_rate": 2.645436008117602e-06,
8067
+ "loss": 4.27384033203125,
8068
+ "step": 11250
8069
+ },
8070
+ {
8071
+ "epoch": 1.9023483696570367,
8072
+ "grad_norm": 0.4548667371273041,
8073
+ "learning_rate": 2.5566243970270073e-06,
8074
+ "loss": 4.297956085205078,
8075
+ "step": 11260
8076
+ },
8077
+ {
8078
+ "epoch": 1.9040378442304444,
8079
+ "grad_norm": 0.4443969130516052,
8080
+ "learning_rate": 2.469316312152575e-06,
8081
+ "loss": 4.291641616821289,
8082
+ "step": 11270
8083
+ },
8084
+ {
8085
+ "epoch": 1.905727318803852,
8086
+ "grad_norm": 0.4367770850658417,
8087
+ "learning_rate": 2.3835126438021156e-06,
8088
+ "loss": 4.266088485717773,
8089
+ "step": 11280
8090
+ },
8091
+ {
8092
+ "epoch": 1.9074167933772597,
8093
+ "grad_norm": 0.4365804195404053,
8094
+ "learning_rate": 2.299214266942495e-06,
8095
+ "loss": 4.263021850585938,
8096
+ "step": 11290
8097
+ },
8098
+ {
8099
+ "epoch": 1.9091062679506674,
8100
+ "grad_norm": 0.4369988441467285,
8101
+ "learning_rate": 2.2164220411906407e-06,
8102
+ "loss": 4.288222122192383,
8103
+ "step": 11300
8104
+ },
8105
+ {
8106
+ "epoch": 1.910795742524075,
8107
+ "grad_norm": 0.44547080993652344,
8108
+ "learning_rate": 2.1351368108047495e-06,
8109
+ "loss": 4.26991081237793,
8110
+ "step": 11310
8111
+ },
8112
+ {
8113
+ "epoch": 1.9124852170974826,
8114
+ "grad_norm": 0.45165297389030457,
8115
+ "learning_rate": 2.0553594046757438e-06,
8116
+ "loss": 4.2671764373779295,
8117
+ "step": 11320
8118
+ },
8119
+ {
8120
+ "epoch": 1.9141746916708904,
8121
+ "grad_norm": 0.4523044526576996,
8122
+ "learning_rate": 1.9770906363187787e-06,
8123
+ "loss": 4.28791618347168,
8124
+ "step": 11330
8125
+ },
8126
+ {
8127
+ "epoch": 1.915864166244298,
8128
+ "grad_norm": 0.43898409605026245,
8129
+ "learning_rate": 1.9003313038649826e-06,
8130
+ "loss": 4.301726150512695,
8131
+ "step": 11340
8132
+ },
8133
+ {
8134
+ "epoch": 1.9175536408177059,
8135
+ "grad_norm": 0.44454851746559143,
8136
+ "learning_rate": 1.825082190053262e-06,
8137
+ "loss": 4.280124664306641,
8138
+ "step": 11350
8139
+ },
8140
+ {
8141
+ "epoch": 1.9192431153911134,
8142
+ "grad_norm": 0.44016656279563904,
8143
+ "learning_rate": 1.7513440622223762e-06,
8144
+ "loss": 4.312954330444336,
8145
+ "step": 11360
8146
+ },
8147
+ {
8148
+ "epoch": 1.9209325899645209,
8149
+ "grad_norm": 0.439481645822525,
8150
+ "learning_rate": 1.6791176723030763e-06,
8151
+ "loss": 4.291484069824219,
8152
+ "step": 11370
8153
+ },
8154
+ {
8155
+ "epoch": 1.9226220645379288,
8156
+ "grad_norm": 0.44403141736984253,
8157
+ "learning_rate": 1.608403756810428e-06,
8158
+ "loss": 4.297753524780274,
8159
+ "step": 11380
8160
+ },
8161
+ {
8162
+ "epoch": 1.9243115391113363,
8163
+ "grad_norm": 0.4380677342414856,
8164
+ "learning_rate": 1.5392030368363839e-06,
8165
+ "loss": 4.311534881591797,
8166
+ "step": 11390
8167
+ },
8168
+ {
8169
+ "epoch": 1.926001013684744,
8170
+ "grad_norm": 0.4598468542098999,
8171
+ "learning_rate": 1.4715162180422902e-06,
8172
+ "loss": 4.272250747680664,
8173
+ "step": 11400
8174
+ },
8175
+ {
8176
+ "epoch": 1.9276904882581518,
8177
+ "grad_norm": 0.44107797741889954,
8178
+ "learning_rate": 1.405343990651825e-06,
8179
+ "loss": 4.283835601806641,
8180
+ "step": 11410
8181
+ },
8182
+ {
8183
+ "epoch": 1.9293799628315593,
8184
+ "grad_norm": 0.43588972091674805,
8185
+ "learning_rate": 1.3406870294438876e-06,
8186
+ "loss": 4.273925399780273,
8187
+ "step": 11420
8188
+ },
8189
+ {
8190
+ "epoch": 1.931069437404967,
8191
+ "grad_norm": 0.4393414258956909,
8192
+ "learning_rate": 1.2775459937457544e-06,
8193
+ "loss": 4.295301055908203,
8194
+ "step": 11430
8195
+ },
8196
+ {
8197
+ "epoch": 1.9327589119783748,
8198
+ "grad_norm": 0.44228672981262207,
8199
+ "learning_rate": 1.2159215274262834e-06,
8200
+ "loss": 4.273171997070312,
8201
+ "step": 11440
8202
+ },
8203
+ {
8204
+ "epoch": 1.9344483865517823,
8205
+ "grad_norm": 0.4422619640827179,
8206
+ "learning_rate": 1.155814258889437e-06,
8207
+ "loss": 4.285517883300781,
8208
+ "step": 11450
8209
+ },
8210
+ {
8211
+ "epoch": 1.93613786112519,
8212
+ "grad_norm": 0.4427924156188965,
8213
+ "learning_rate": 1.0972248010678365e-06,
8214
+ "loss": 4.312974548339843,
8215
+ "step": 11460
8216
+ },
8217
+ {
8218
+ "epoch": 1.9378273356985978,
8219
+ "grad_norm": 0.44030192494392395,
8220
+ "learning_rate": 1.040153751416517e-06,
8221
+ "loss": 4.302379989624024,
8222
+ "step": 11470
8223
+ },
8224
+ {
8225
+ "epoch": 1.9395168102720053,
8226
+ "grad_norm": 0.44400596618652344,
8227
+ "learning_rate": 9.846016919068167e-07,
8228
+ "loss": 4.280198287963867,
8229
+ "step": 11480
8230
+ },
8231
+ {
8232
+ "epoch": 1.941206284845413,
8233
+ "grad_norm": 0.4478650689125061,
8234
+ "learning_rate": 9.305691890204469e-07,
8235
+ "loss": 4.281633758544922,
8236
+ "step": 11490
8237
+ },
8238
+ {
8239
+ "epoch": 1.9428957594188208,
8240
+ "grad_norm": 0.4458984434604645,
8241
+ "learning_rate": 8.780567937437644e-07,
8242
+ "loss": 4.278944396972657,
8243
+ "step": 11500
8244
+ },
8245
+ {
8246
+ "epoch": 1.9428957594188208,
8247
+ "eval_loss": 4.265942573547363,
8248
+ "eval_runtime": 4.0629,
8249
+ "eval_samples_per_second": 246.128,
8250
+ "eval_steps_per_second": 5.169,
8251
+ "step": 11500
8252
  }
8253
  ],
8254
  "logging_steps": 10,
 
8268
  "attributes": {}
8269
  }
8270
  },
8271
+ "total_flos": 3.846232536325816e+17,
8272
  "train_batch_size": 48,
8273
  "trial_name": null,
8274
  "trial_params": null