FormlessAI commited on
Commit
1f9cebb
·
verified ·
1 Parent(s): c72a43a

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0a21dc5f4f9acf3af3f8980785056b6b5ada5cb15eef4540db5bca39c790390
3
  size 1037269336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83240b591b305b77a7f0e03a1614297d5289c0ea99896646801a0c1dbd574862
3
  size 1037269336
last-checkpoint/global_step4650/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e51d545c208c91904449605d111c48b0b9cd7cffe8820bff9335d42b333c838
3
+ size 781993445
last-checkpoint/global_step4650/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:042c2f523987a0cc723abcd2f7298c860835da787e4bbd1db139579226329fb2
3
+ size 781993509
last-checkpoint/global_step4650/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87022fc476e96c495276b8a125e2cd268dfd3b042f21127f9100c1570e463907
3
+ size 781993509
last-checkpoint/global_step4650/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:557b588731c35b0d0eafa43689a8fb120627a74098c020a845f3338f31555e7c
3
+ size 781993509
last-checkpoint/global_step4650/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:986e5ce5926507822e2b6e2503b40a1f334a287ff6957abefa9b133fbadf4b81
3
+ size 2610290277
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step4500
 
1
+ global_step4650
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3630f5c7f559df2743db6022b9a9e3a578f1caa3a824d427deb7eb53b5753113
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a1d5dc1450e1f7d92df3b8367376288a592dc32fb455c0cd4248d71d3a7f2b5
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a975041d5ccbda078ebb49cae6863f266b7176846aea763c1f5991e324beb6a
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fb2912e97dbc350f2bdb8248e072bd5fc3be1df66f8fc3c1a669133cca92882
3
  size 15429
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e16d82cf7cd32b948d7f53723214355031cb0c2f352b62b817e45196b5c3bed
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e97f3af51e8b6ba933c0395cf8132efd073aae835daafe97b9b1543a75390d4e
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2faeda1cf20088c59a4c59ca63cd8875d237d2179a7055592aef1e315f61c7ea
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29aafcf8ce3f67acef842d3fa0b0a4c6e670568793675e69ea643de91260101d
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e70fa6096403ae68870c39096182c6dd70befee0d4111f312991f4b6364fbfa
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a589015430e6f0d6c31bfd6d790e8fd16af3732cfc9fd2552a05ca53c4825d5
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 1.89194917678833,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6541648495420846,
6
  "eval_steps": 50,
7
- "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7028,6 +7028,240 @@
7028
  "eval_samples_per_second": 170.525,
7029
  "eval_steps_per_second": 10.693,
7030
  "step": 4500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7031
  }
7032
  ],
7033
  "logging_steps": 5,
@@ -7056,7 +7290,7 @@
7056
  "attributes": {}
7057
  }
7058
  },
7059
- "total_flos": 1.1728234225273405e+18,
7060
  "train_batch_size": 4,
7061
  "trial_name": null,
7062
  "trial_params": null
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 1.8841668367385864,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.6759703445268208,
6
  "eval_steps": 50,
7
+ "global_step": 4650,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7028
  "eval_samples_per_second": 170.525,
7029
  "eval_steps_per_second": 10.693,
7030
  "step": 4500
7031
+ },
7032
+ {
7033
+ "epoch": 0.6548916993749091,
7034
+ "grad_norm": 2.508161783218384,
7035
+ "learning_rate": 5.873522555612962e-05,
7036
+ "loss": 2.2274,
7037
+ "step": 4505
7038
+ },
7039
+ {
7040
+ "epoch": 0.6556185492077337,
7041
+ "grad_norm": 2.5135767459869385,
7042
+ "learning_rate": 5.8656915655531224e-05,
7043
+ "loss": 2.0161,
7044
+ "step": 4510
7045
+ },
7046
+ {
7047
+ "epoch": 0.6563453990405582,
7048
+ "grad_norm": 2.4507250785827637,
7049
+ "learning_rate": 5.8578584208798255e-05,
7050
+ "loss": 1.9389,
7051
+ "step": 4515
7052
+ },
7053
+ {
7054
+ "epoch": 0.6570722488733828,
7055
+ "grad_norm": 2.4963390827178955,
7056
+ "learning_rate": 5.850023141313007e-05,
7057
+ "loss": 2.2685,
7058
+ "step": 4520
7059
+ },
7060
+ {
7061
+ "epoch": 0.6577990987062073,
7062
+ "grad_norm": 2.3876497745513916,
7063
+ "learning_rate": 5.842185746577973e-05,
7064
+ "loss": 2.1684,
7065
+ "step": 4525
7066
+ },
7067
+ {
7068
+ "epoch": 0.6585259485390318,
7069
+ "grad_norm": 2.5659289360046387,
7070
+ "learning_rate": 5.834346256405354e-05,
7071
+ "loss": 2.0895,
7072
+ "step": 4530
7073
+ },
7074
+ {
7075
+ "epoch": 0.6592527983718564,
7076
+ "grad_norm": 2.480208396911621,
7077
+ "learning_rate": 5.826504690531059e-05,
7078
+ "loss": 2.086,
7079
+ "step": 4535
7080
+ },
7081
+ {
7082
+ "epoch": 0.6599796482046809,
7083
+ "grad_norm": 2.4734959602355957,
7084
+ "learning_rate": 5.818661068696221e-05,
7085
+ "loss": 2.1213,
7086
+ "step": 4540
7087
+ },
7088
+ {
7089
+ "epoch": 0.6607064980375055,
7090
+ "grad_norm": 2.8239712715148926,
7091
+ "learning_rate": 5.810815410647147e-05,
7092
+ "loss": 2.0349,
7093
+ "step": 4545
7094
+ },
7095
+ {
7096
+ "epoch": 0.66143334787033,
7097
+ "grad_norm": 2.229339122772217,
7098
+ "learning_rate": 5.8029677361352714e-05,
7099
+ "loss": 1.9909,
7100
+ "step": 4550
7101
+ },
7102
+ {
7103
+ "epoch": 0.66143334787033,
7104
+ "eval_loss": 1.8908016681671143,
7105
+ "eval_runtime": 22.1348,
7106
+ "eval_samples_per_second": 149.132,
7107
+ "eval_steps_per_second": 9.352,
7108
+ "step": 4550
7109
+ },
7110
+ {
7111
+ "epoch": 0.6621601977031545,
7112
+ "grad_norm": 2.306365966796875,
7113
+ "learning_rate": 5.795118064917109e-05,
7114
+ "loss": 1.9745,
7115
+ "step": 4555
7116
+ },
7117
+ {
7118
+ "epoch": 0.662887047535979,
7119
+ "grad_norm": 2.618732213973999,
7120
+ "learning_rate": 5.787266416754193e-05,
7121
+ "loss": 2.1639,
7122
+ "step": 4560
7123
+ },
7124
+ {
7125
+ "epoch": 0.6636138973688036,
7126
+ "grad_norm": 2.4831111431121826,
7127
+ "learning_rate": 5.779412811413042e-05,
7128
+ "loss": 1.8808,
7129
+ "step": 4565
7130
+ },
7131
+ {
7132
+ "epoch": 0.6643407472016282,
7133
+ "grad_norm": 2.3205296993255615,
7134
+ "learning_rate": 5.771557268665096e-05,
7135
+ "loss": 1.9686,
7136
+ "step": 4570
7137
+ },
7138
+ {
7139
+ "epoch": 0.6650675970344527,
7140
+ "grad_norm": 2.1423285007476807,
7141
+ "learning_rate": 5.763699808286676e-05,
7142
+ "loss": 1.9517,
7143
+ "step": 4575
7144
+ },
7145
+ {
7146
+ "epoch": 0.6657944468672772,
7147
+ "grad_norm": 2.134899854660034,
7148
+ "learning_rate": 5.755840450058927e-05,
7149
+ "loss": 2.0311,
7150
+ "step": 4580
7151
+ },
7152
+ {
7153
+ "epoch": 0.6665212967001017,
7154
+ "grad_norm": 2.3795955181121826,
7155
+ "learning_rate": 5.747979213767777e-05,
7156
+ "loss": 1.9214,
7157
+ "step": 4585
7158
+ },
7159
+ {
7160
+ "epoch": 0.6672481465329263,
7161
+ "grad_norm": 2.3388452529907227,
7162
+ "learning_rate": 5.740116119203877e-05,
7163
+ "loss": 2.1742,
7164
+ "step": 4590
7165
+ },
7166
+ {
7167
+ "epoch": 0.6679749963657509,
7168
+ "grad_norm": 2.438502073287964,
7169
+ "learning_rate": 5.732251186162558e-05,
7170
+ "loss": 1.9072,
7171
+ "step": 4595
7172
+ },
7173
+ {
7174
+ "epoch": 0.6687018461985754,
7175
+ "grad_norm": 2.352613925933838,
7176
+ "learning_rate": 5.7243844344437806e-05,
7177
+ "loss": 2.162,
7178
+ "step": 4600
7179
+ },
7180
+ {
7181
+ "epoch": 0.6687018461985754,
7182
+ "eval_loss": 1.893505334854126,
7183
+ "eval_runtime": 19.1578,
7184
+ "eval_samples_per_second": 172.305,
7185
+ "eval_steps_per_second": 10.805,
7186
+ "step": 4600
7187
+ },
7188
+ {
7189
+ "epoch": 0.6694286960313999,
7190
+ "grad_norm": 2.3778982162475586,
7191
+ "learning_rate": 5.716515883852082e-05,
7192
+ "loss": 2.0784,
7193
+ "step": 4605
7194
+ },
7195
+ {
7196
+ "epoch": 0.6701555458642244,
7197
+ "grad_norm": 2.6638474464416504,
7198
+ "learning_rate": 5.708645554196528e-05,
7199
+ "loss": 2.0468,
7200
+ "step": 4610
7201
+ },
7202
+ {
7203
+ "epoch": 0.670882395697049,
7204
+ "grad_norm": 2.4324584007263184,
7205
+ "learning_rate": 5.700773465290667e-05,
7206
+ "loss": 2.0943,
7207
+ "step": 4615
7208
+ },
7209
+ {
7210
+ "epoch": 0.6716092455298736,
7211
+ "grad_norm": 2.2958381175994873,
7212
+ "learning_rate": 5.692899636952473e-05,
7213
+ "loss": 2.0988,
7214
+ "step": 4620
7215
+ },
7216
+ {
7217
+ "epoch": 0.6723360953626981,
7218
+ "grad_norm": 2.202683448791504,
7219
+ "learning_rate": 5.6850240890042966e-05,
7220
+ "loss": 2.1533,
7221
+ "step": 4625
7222
+ },
7223
+ {
7224
+ "epoch": 0.6730629451955226,
7225
+ "grad_norm": 1.9483098983764648,
7226
+ "learning_rate": 5.677146841272821e-05,
7227
+ "loss": 1.9827,
7228
+ "step": 4630
7229
+ },
7230
+ {
7231
+ "epoch": 0.6737897950283471,
7232
+ "grad_norm": 2.550309658050537,
7233
+ "learning_rate": 5.669267913589012e-05,
7234
+ "loss": 1.9718,
7235
+ "step": 4635
7236
+ },
7237
+ {
7238
+ "epoch": 0.6745166448611717,
7239
+ "grad_norm": 2.50044846534729,
7240
+ "learning_rate": 5.661387325788056e-05,
7241
+ "loss": 2.0441,
7242
+ "step": 4640
7243
+ },
7244
+ {
7245
+ "epoch": 0.6752434946939962,
7246
+ "grad_norm": 2.406494140625,
7247
+ "learning_rate": 5.653505097709326e-05,
7248
+ "loss": 1.9735,
7249
+ "step": 4645
7250
+ },
7251
+ {
7252
+ "epoch": 0.6759703445268208,
7253
+ "grad_norm": 2.304180383682251,
7254
+ "learning_rate": 5.645621249196321e-05,
7255
+ "loss": 1.9182,
7256
+ "step": 4650
7257
+ },
7258
+ {
7259
+ "epoch": 0.6759703445268208,
7260
+ "eval_loss": 1.8841668367385864,
7261
+ "eval_runtime": 18.863,
7262
+ "eval_samples_per_second": 174.999,
7263
+ "eval_steps_per_second": 10.974,
7264
+ "step": 4650
7265
  }
7266
  ],
7267
  "logging_steps": 5,
 
7290
  "attributes": {}
7291
  }
7292
  },
7293
+ "total_flos": 1.211591505395843e+18,
7294
  "train_batch_size": 4,
7295
  "trial_name": null,
7296
  "trial_params": null