Delete checkpoints
Browse files- checkpoints/checkpoint-1280/model.safetensors +0 -3
- checkpoints/checkpoint-1280/optimizer.pt +0 -3
- checkpoints/checkpoint-1280/rng_state.pth +0 -3
- checkpoints/checkpoint-1280/scheduler.pt +0 -3
- checkpoints/checkpoint-1280/trainer_state.json +0 -503
- checkpoints/checkpoint-1280/training_args.bin +0 -3
- checkpoints/checkpoint-202/model.safetensors +0 -3
- checkpoints/checkpoint-202/optimizer.pt +0 -3
- checkpoints/checkpoint-202/rng_state.pth +0 -3
- checkpoints/checkpoint-202/scheduler.pt +0 -3
- checkpoints/checkpoint-202/trainer_state.json +0 -132
- checkpoints/checkpoint-202/training_args.bin +0 -3
- checkpoints/checkpoint-606-2/model.safetensors +0 -3
- checkpoints/checkpoint-606-2/optimizer.pt +0 -3
- checkpoints/checkpoint-606-2/rng_state.pth +0 -3
- checkpoints/checkpoint-606-2/scheduler.pt +0 -3
- checkpoints/checkpoint-606-2/trainer_state.json +0 -235
- checkpoints/checkpoint-606-2/training_args.bin +0 -3
- checkpoints/checkpoint-606/model.safetensors +0 -3
- checkpoints/checkpoint-606/optimizer.pt +0 -3
- checkpoints/checkpoint-606/rng_state.pth +0 -3
- checkpoints/checkpoint-606/scheduler.pt +0 -3
- checkpoints/checkpoint-606/trainer_state.json +0 -235
- checkpoints/checkpoint-606/training_args.bin +0 -3
checkpoints/checkpoint-1280/model.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:69e7936faa0f4f96da50c28e7fc64aa98b67f8da2d8c84d20a6c2a1111b17e0f
|
| 3 |
-
size 2297612372
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-1280/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:44c88d6887930c13f119d5ac5d9efd93094abb40e4255a84aa20844c912d5e44
|
| 3 |
-
size 548599104
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-1280/rng_state.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c9d223714e45bc425e11bbcc5a937a01cf97e4f8bbd782e3737caf11063855d0
|
| 3 |
-
size 14180
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-1280/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:29eaea96c12c4154ffed8572a07eb967ef6c8db7290858962558445a58bedf6e
|
| 3 |
-
size 1064
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-1280/trainer_state.json
DELETED
|
@@ -1,503 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": null,
|
| 3 |
-
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 6.320987654320987,
|
| 5 |
-
"eval_steps": 128,
|
| 6 |
-
"global_step": 1280,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 0.1580246913580247,
|
| 13 |
-
"grad_norm": 9.916343688964844,
|
| 14 |
-
"learning_rate": 1.4851485148514851e-05,
|
| 15 |
-
"loss": 13.4879,
|
| 16 |
-
"step": 32
|
| 17 |
-
},
|
| 18 |
-
{
|
| 19 |
-
"epoch": 0.3160493827160494,
|
| 20 |
-
"grad_norm": 8.410324096679688,
|
| 21 |
-
"learning_rate": 3.06930693069307e-05,
|
| 22 |
-
"loss": 11.0681,
|
| 23 |
-
"step": 64
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"epoch": 0.4740740740740741,
|
| 27 |
-
"grad_norm": 7.2831854820251465,
|
| 28 |
-
"learning_rate": 4.653465346534654e-05,
|
| 29 |
-
"loss": 10.2179,
|
| 30 |
-
"step": 96
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"epoch": 0.6320987654320988,
|
| 34 |
-
"grad_norm": 5.446238040924072,
|
| 35 |
-
"learning_rate": 6.237623762376238e-05,
|
| 36 |
-
"loss": 9.8657,
|
| 37 |
-
"step": 128
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.6320987654320988,
|
| 41 |
-
"eval_bleu": 0.009206539746414727,
|
| 42 |
-
"eval_cap_loss": 4.180750540658539,
|
| 43 |
-
"eval_con_loss": 2.05906052682914,
|
| 44 |
-
"eval_loss": 8.298871559255263,
|
| 45 |
-
"step": 128
|
| 46 |
-
},
|
| 47 |
-
{
|
| 48 |
-
"epoch": 0.6320987654320988,
|
| 49 |
-
"eval_bleu": 0.009206539746414727,
|
| 50 |
-
"eval_cap_loss": 4.180750540658539,
|
| 51 |
-
"eval_con_loss": 2.05906052682914,
|
| 52 |
-
"eval_loss": 8.298871559255263,
|
| 53 |
-
"eval_runtime": 160.1737,
|
| 54 |
-
"eval_samples_per_second": 5.051,
|
| 55 |
-
"eval_steps_per_second": 0.637,
|
| 56 |
-
"step": 128
|
| 57 |
-
},
|
| 58 |
-
{
|
| 59 |
-
"epoch": 0.7901234567901234,
|
| 60 |
-
"grad_norm": 6.679168224334717,
|
| 61 |
-
"learning_rate": 7.821782178217822e-05,
|
| 62 |
-
"loss": 9.6441,
|
| 63 |
-
"step": 160
|
| 64 |
-
},
|
| 65 |
-
{
|
| 66 |
-
"epoch": 0.9481481481481482,
|
| 67 |
-
"grad_norm": 3.790262222290039,
|
| 68 |
-
"learning_rate": 9.405940594059406e-05,
|
| 69 |
-
"loss": 9.5422,
|
| 70 |
-
"step": 192
|
| 71 |
-
},
|
| 72 |
-
{
|
| 73 |
-
"epoch": 1.106172839506173,
|
| 74 |
-
"grad_norm": 5.2132487297058105,
|
| 75 |
-
"learning_rate": 9.99701414469309e-05,
|
| 76 |
-
"loss": 9.2999,
|
| 77 |
-
"step": 224
|
| 78 |
-
},
|
| 79 |
-
{
|
| 80 |
-
"epoch": 1.2641975308641975,
|
| 81 |
-
"grad_norm": 3.9284615516662598,
|
| 82 |
-
"learning_rate": 9.979827188241365e-05,
|
| 83 |
-
"loss": 9.2528,
|
| 84 |
-
"step": 256
|
| 85 |
-
},
|
| 86 |
-
{
|
| 87 |
-
"epoch": 1.2641975308641975,
|
| 88 |
-
"eval_bleu": 0.010232611843559726,
|
| 89 |
-
"eval_cap_loss": 3.7518067640416763,
|
| 90 |
-
"eval_con_loss": 2.0590475727530086,
|
| 91 |
-
"eval_loss": 7.869901939934375,
|
| 92 |
-
"step": 256
|
| 93 |
-
},
|
| 94 |
-
{
|
| 95 |
-
"epoch": 1.2641975308641975,
|
| 96 |
-
"eval_bleu": 0.010232611843559726,
|
| 97 |
-
"eval_cap_loss": 3.7518067640416763,
|
| 98 |
-
"eval_con_loss": 2.0590475727530086,
|
| 99 |
-
"eval_loss": 7.869901939934375,
|
| 100 |
-
"eval_runtime": 163.0281,
|
| 101 |
-
"eval_samples_per_second": 4.962,
|
| 102 |
-
"eval_steps_per_second": 0.626,
|
| 103 |
-
"step": 256
|
| 104 |
-
},
|
| 105 |
-
{
|
| 106 |
-
"epoch": 1.4222222222222223,
|
| 107 |
-
"grad_norm": 2.924140691757202,
|
| 108 |
-
"learning_rate": 9.947416695486633e-05,
|
| 109 |
-
"loss": 9.2148,
|
| 110 |
-
"step": 288
|
| 111 |
-
},
|
| 112 |
-
{
|
| 113 |
-
"epoch": 1.5802469135802468,
|
| 114 |
-
"grad_norm": 2.8188695907592773,
|
| 115 |
-
"learning_rate": 9.899881746636785e-05,
|
| 116 |
-
"loss": 9.2119,
|
| 117 |
-
"step": 320
|
| 118 |
-
},
|
| 119 |
-
{
|
| 120 |
-
"epoch": 1.7382716049382716,
|
| 121 |
-
"grad_norm": 2.9822909832000732,
|
| 122 |
-
"learning_rate": 9.837367657983356e-05,
|
| 123 |
-
"loss": 9.1222,
|
| 124 |
-
"step": 352
|
| 125 |
-
},
|
| 126 |
-
{
|
| 127 |
-
"epoch": 1.8962962962962964,
|
| 128 |
-
"grad_norm": 2.3413400650024414,
|
| 129 |
-
"learning_rate": 9.760065537663649e-05,
|
| 130 |
-
"loss": 9.1112,
|
| 131 |
-
"step": 384
|
| 132 |
-
},
|
| 133 |
-
{
|
| 134 |
-
"epoch": 1.8962962962962964,
|
| 135 |
-
"eval_bleu": 0.014012859200134394,
|
| 136 |
-
"eval_cap_loss": 3.5916168689727783,
|
| 137 |
-
"eval_con_loss": 2.0590362034591974,
|
| 138 |
-
"eval_loss": 7.709689268878862,
|
| 139 |
-
"step": 384
|
| 140 |
-
},
|
| 141 |
-
{
|
| 142 |
-
"epoch": 1.8962962962962964,
|
| 143 |
-
"eval_bleu": 0.014012859200134394,
|
| 144 |
-
"eval_cap_loss": 3.5916168689727783,
|
| 145 |
-
"eval_con_loss": 2.0590362034591974,
|
| 146 |
-
"eval_loss": 7.709689268878862,
|
| 147 |
-
"eval_runtime": 160.6733,
|
| 148 |
-
"eval_samples_per_second": 5.035,
|
| 149 |
-
"eval_steps_per_second": 0.635,
|
| 150 |
-
"step": 384
|
| 151 |
-
},
|
| 152 |
-
{
|
| 153 |
-
"epoch": 2.054320987654321,
|
| 154 |
-
"grad_norm": 3.7009811401367188,
|
| 155 |
-
"learning_rate": 9.668211701435327e-05,
|
| 156 |
-
"loss": 9.0482,
|
| 157 |
-
"step": 416
|
| 158 |
-
},
|
| 159 |
-
{
|
| 160 |
-
"epoch": 2.212345679012346,
|
| 161 |
-
"grad_norm": 3.00201153755188,
|
| 162 |
-
"learning_rate": 9.562086950249409e-05,
|
| 163 |
-
"loss": 9.0231,
|
| 164 |
-
"step": 448
|
| 165 |
-
},
|
| 166 |
-
{
|
| 167 |
-
"epoch": 2.3703703703703702,
|
| 168 |
-
"grad_norm": 2.188750743865967,
|
| 169 |
-
"learning_rate": 9.442015711830245e-05,
|
| 170 |
-
"loss": 9.0336,
|
| 171 |
-
"step": 480
|
| 172 |
-
},
|
| 173 |
-
{
|
| 174 |
-
"epoch": 2.528395061728395,
|
| 175 |
-
"grad_norm": 2.7350165843963623,
|
| 176 |
-
"learning_rate": 9.308365048886625e-05,
|
| 177 |
-
"loss": 9.0336,
|
| 178 |
-
"step": 512
|
| 179 |
-
},
|
| 180 |
-
{
|
| 181 |
-
"epoch": 2.528395061728395,
|
| 182 |
-
"eval_bleu": 0.019149744160649594,
|
| 183 |
-
"eval_cap_loss": 3.5493307885001686,
|
| 184 |
-
"eval_con_loss": 2.059034511154773,
|
| 185 |
-
"eval_loss": 7.6673998201594635,
|
| 186 |
-
"step": 512
|
| 187 |
-
},
|
| 188 |
-
{
|
| 189 |
-
"epoch": 2.528395061728395,
|
| 190 |
-
"eval_bleu": 0.019149744160649594,
|
| 191 |
-
"eval_cap_loss": 3.5493307885001686,
|
| 192 |
-
"eval_con_loss": 2.059034511154773,
|
| 193 |
-
"eval_loss": 7.6673998201594635,
|
| 194 |
-
"eval_runtime": 161.0817,
|
| 195 |
-
"eval_samples_per_second": 5.022,
|
| 196 |
-
"eval_steps_per_second": 0.633,
|
| 197 |
-
"step": 512
|
| 198 |
-
},
|
| 199 |
-
{
|
| 200 |
-
"epoch": 2.68641975308642,
|
| 201 |
-
"grad_norm": 2.4831721782684326,
|
| 202 |
-
"learning_rate": 9.161543536985996e-05,
|
| 203 |
-
"loss": 8.9656,
|
| 204 |
-
"step": 544
|
| 205 |
-
},
|
| 206 |
-
{
|
| 207 |
-
"epoch": 2.8444444444444446,
|
| 208 |
-
"grad_norm": 3.178410768508911,
|
| 209 |
-
"learning_rate": 9.00200001552218e-05,
|
| 210 |
-
"loss": 9.0066,
|
| 211 |
-
"step": 576
|
| 212 |
-
},
|
| 213 |
-
{
|
| 214 |
-
"epoch": 3.0024691358024693,
|
| 215 |
-
"grad_norm": 2.4883828163146973,
|
| 216 |
-
"learning_rate": 8.83022221559489e-05,
|
| 217 |
-
"loss": 8.8965,
|
| 218 |
-
"step": 608
|
| 219 |
-
},
|
| 220 |
-
{
|
| 221 |
-
"epoch": 3.1604938271604937,
|
| 222 |
-
"grad_norm": 3.184849500656128,
|
| 223 |
-
"learning_rate": 8.646735268995731e-05,
|
| 224 |
-
"loss": 8.8889,
|
| 225 |
-
"step": 640
|
| 226 |
-
},
|
| 227 |
-
{
|
| 228 |
-
"epoch": 3.1604938271604937,
|
| 229 |
-
"eval_bleu": 0.016690347492961013,
|
| 230 |
-
"eval_cap_loss": 3.445917959306754,
|
| 231 |
-
"eval_con_loss": 2.059011185870451,
|
| 232 |
-
"eval_loss": 7.563940347409716,
|
| 233 |
-
"step": 640
|
| 234 |
-
},
|
| 235 |
-
{
|
| 236 |
-
"epoch": 3.1604938271604937,
|
| 237 |
-
"eval_bleu": 0.016690347492961013,
|
| 238 |
-
"eval_cap_loss": 3.445917959306754,
|
| 239 |
-
"eval_con_loss": 2.059011185870451,
|
| 240 |
-
"eval_loss": 7.563940347409716,
|
| 241 |
-
"eval_runtime": 160.8496,
|
| 242 |
-
"eval_samples_per_second": 5.03,
|
| 243 |
-
"eval_steps_per_second": 0.634,
|
| 244 |
-
"step": 640
|
| 245 |
-
},
|
| 246 |
-
{
|
| 247 |
-
"epoch": 3.3185185185185184,
|
| 248 |
-
"grad_norm": 2.856328248977661,
|
| 249 |
-
"learning_rate": 8.452100102858734e-05,
|
| 250 |
-
"loss": 8.8877,
|
| 251 |
-
"step": 672
|
| 252 |
-
},
|
| 253 |
-
{
|
| 254 |
-
"epoch": 3.476543209876543,
|
| 255 |
-
"grad_norm": 3.4148852825164795,
|
| 256 |
-
"learning_rate": 8.246911724883068e-05,
|
| 257 |
-
"loss": 8.8006,
|
| 258 |
-
"step": 704
|
| 259 |
-
},
|
| 260 |
-
{
|
| 261 |
-
"epoch": 3.634567901234568,
|
| 262 |
-
"grad_norm": 3.2651753425598145,
|
| 263 |
-
"learning_rate": 8.031797404370057e-05,
|
| 264 |
-
"loss": 8.8842,
|
| 265 |
-
"step": 736
|
| 266 |
-
},
|
| 267 |
-
{
|
| 268 |
-
"epoch": 3.7925925925925927,
|
| 269 |
-
"grad_norm": 3.55135440826416,
|
| 270 |
-
"learning_rate": 7.807414754635145e-05,
|
| 271 |
-
"loss": 8.8534,
|
| 272 |
-
"step": 768
|
| 273 |
-
},
|
| 274 |
-
{
|
| 275 |
-
"epoch": 3.7925925925925927,
|
| 276 |
-
"eval_bleu": 0.018187566983297536,
|
| 277 |
-
"eval_cap_loss": 3.4394626383687936,
|
| 278 |
-
"eval_con_loss": 2.0590050898346246,
|
| 279 |
-
"eval_loss": 7.557472799338546,
|
| 280 |
-
"step": 768
|
| 281 |
-
},
|
| 282 |
-
{
|
| 283 |
-
"epoch": 3.7925925925925927,
|
| 284 |
-
"eval_bleu": 0.018187566983297536,
|
| 285 |
-
"eval_cap_loss": 3.4394626383687936,
|
| 286 |
-
"eval_con_loss": 2.0590050898346246,
|
| 287 |
-
"eval_loss": 7.557472799338546,
|
| 288 |
-
"eval_runtime": 164.2478,
|
| 289 |
-
"eval_samples_per_second": 4.925,
|
| 290 |
-
"eval_steps_per_second": 0.621,
|
| 291 |
-
"step": 768
|
| 292 |
-
},
|
| 293 |
-
{
|
| 294 |
-
"epoch": 3.950617283950617,
|
| 295 |
-
"grad_norm": 2.6633050441741943,
|
| 296 |
-
"learning_rate": 7.574449722656991e-05,
|
| 297 |
-
"loss": 8.8075,
|
| 298 |
-
"step": 800
|
| 299 |
-
},
|
| 300 |
-
{
|
| 301 |
-
"epoch": 4.108641975308642,
|
| 302 |
-
"grad_norm": 2.6695327758789062,
|
| 303 |
-
"learning_rate": 7.333614492109364e-05,
|
| 304 |
-
"loss": 8.7768,
|
| 305 |
-
"step": 832
|
| 306 |
-
},
|
| 307 |
-
{
|
| 308 |
-
"epoch": 4.266666666666667,
|
| 309 |
-
"grad_norm": 2.1547067165374756,
|
| 310 |
-
"learning_rate": 7.08564530618639e-05,
|
| 311 |
-
"loss": 8.7896,
|
| 312 |
-
"step": 864
|
| 313 |
-
},
|
| 314 |
-
{
|
| 315 |
-
"epoch": 4.424691358024692,
|
| 316 |
-
"grad_norm": 2.484839916229248,
|
| 317 |
-
"learning_rate": 6.831300216876873e-05,
|
| 318 |
-
"loss": 8.7546,
|
| 319 |
-
"step": 896
|
| 320 |
-
},
|
| 321 |
-
{
|
| 322 |
-
"epoch": 4.424691358024692,
|
| 323 |
-
"eval_bleu": 0.017297916814474356,
|
| 324 |
-
"eval_cap_loss": 3.4138665620018456,
|
| 325 |
-
"eval_con_loss": 2.058998935362872,
|
| 326 |
-
"eval_loss": 7.531864435065026,
|
| 327 |
-
"step": 896
|
| 328 |
-
},
|
| 329 |
-
{
|
| 330 |
-
"epoch": 4.424691358024692,
|
| 331 |
-
"eval_bleu": 0.017297916814474356,
|
| 332 |
-
"eval_cap_loss": 3.4138665620018456,
|
| 333 |
-
"eval_con_loss": 2.058998935362872,
|
| 334 |
-
"eval_loss": 7.531864435065026,
|
| 335 |
-
"eval_runtime": 163.539,
|
| 336 |
-
"eval_samples_per_second": 4.947,
|
| 337 |
-
"eval_steps_per_second": 0.624,
|
| 338 |
-
"step": 896
|
| 339 |
-
},
|
| 340 |
-
{
|
| 341 |
-
"epoch": 4.582716049382716,
|
| 342 |
-
"grad_norm": 2.8547232151031494,
|
| 343 |
-
"learning_rate": 6.571356767568207e-05,
|
| 344 |
-
"loss": 8.7195,
|
| 345 |
-
"step": 928
|
| 346 |
-
},
|
| 347 |
-
{
|
| 348 |
-
"epoch": 4.7407407407407405,
|
| 349 |
-
"grad_norm": 3.8819704055786133,
|
| 350 |
-
"learning_rate": 6.306609616064304e-05,
|
| 351 |
-
"loss": 8.7259,
|
| 352 |
-
"step": 960
|
| 353 |
-
},
|
| 354 |
-
{
|
| 355 |
-
"epoch": 4.898765432098766,
|
| 356 |
-
"grad_norm": 3.2503316402435303,
|
| 357 |
-
"learning_rate": 6.037868105284045e-05,
|
| 358 |
-
"loss": 8.7204,
|
| 359 |
-
"step": 992
|
| 360 |
-
},
|
| 361 |
-
{
|
| 362 |
-
"epoch": 5.05679012345679,
|
| 363 |
-
"grad_norm": 2.2758522033691406,
|
| 364 |
-
"learning_rate": 5.7659537890667145e-05,
|
| 365 |
-
"loss": 8.717,
|
| 366 |
-
"step": 1024
|
| 367 |
-
},
|
| 368 |
-
{
|
| 369 |
-
"epoch": 5.05679012345679,
|
| 370 |
-
"eval_bleu": 0.0211266020043842,
|
| 371 |
-
"eval_cap_loss": 3.340357939402262,
|
| 372 |
-
"eval_con_loss": 2.059003666335461,
|
| 373 |
-
"eval_loss": 7.45836528376037,
|
| 374 |
-
"step": 1024
|
| 375 |
-
},
|
| 376 |
-
{
|
| 377 |
-
"epoch": 5.05679012345679,
|
| 378 |
-
"eval_bleu": 0.0211266020043842,
|
| 379 |
-
"eval_cap_loss": 3.340357939402262,
|
| 380 |
-
"eval_con_loss": 2.059003666335461,
|
| 381 |
-
"eval_loss": 7.45836528376037,
|
| 382 |
-
"eval_runtime": 164.548,
|
| 383 |
-
"eval_samples_per_second": 4.916,
|
| 384 |
-
"eval_steps_per_second": 0.62,
|
| 385 |
-
"step": 1024
|
| 386 |
-
},
|
| 387 |
-
{
|
| 388 |
-
"epoch": 5.214814814814815,
|
| 389 |
-
"grad_norm": 2.7144651412963867,
|
| 390 |
-
"learning_rate": 5.491697920648174e-05,
|
| 391 |
-
"loss": 8.6562,
|
| 392 |
-
"step": 1056
|
| 393 |
-
},
|
| 394 |
-
{
|
| 395 |
-
"epoch": 5.37283950617284,
|
| 396 |
-
"grad_norm": 2.6319425106048584,
|
| 397 |
-
"learning_rate": 5.2159389114855585e-05,
|
| 398 |
-
"loss": 8.667,
|
| 399 |
-
"step": 1088
|
| 400 |
-
},
|
| 401 |
-
{
|
| 402 |
-
"epoch": 5.530864197530864,
|
| 403 |
-
"grad_norm": 3.410334348678589,
|
| 404 |
-
"learning_rate": 4.939519768199012e-05,
|
| 405 |
-
"loss": 8.6587,
|
| 406 |
-
"step": 1120
|
| 407 |
-
},
|
| 408 |
-
{
|
| 409 |
-
"epoch": 5.688888888888889,
|
| 410 |
-
"grad_norm": 4.210638523101807,
|
| 411 |
-
"learning_rate": 4.663285515465818e-05,
|
| 412 |
-
"loss": 8.6541,
|
| 413 |
-
"step": 1152
|
| 414 |
-
},
|
| 415 |
-
{
|
| 416 |
-
"epoch": 5.688888888888889,
|
| 417 |
-
"eval_bleu": 0.02044929088069335,
|
| 418 |
-
"eval_cap_loss": 3.2661077111375096,
|
| 419 |
-
"eval_con_loss": 2.059012539246503,
|
| 420 |
-
"eval_loss": 7.384132796642827,
|
| 421 |
-
"step": 1152
|
| 422 |
-
},
|
| 423 |
-
{
|
| 424 |
-
"epoch": 5.688888888888889,
|
| 425 |
-
"eval_bleu": 0.02044929088069335,
|
| 426 |
-
"eval_cap_loss": 3.2661077111375096,
|
| 427 |
-
"eval_con_loss": 2.059012539246503,
|
| 428 |
-
"eval_loss": 7.384132796642827,
|
| 429 |
-
"eval_runtime": 163.0673,
|
| 430 |
-
"eval_samples_per_second": 4.961,
|
| 431 |
-
"eval_steps_per_second": 0.626,
|
| 432 |
-
"step": 1152
|
| 433 |
-
},
|
| 434 |
-
{
|
| 435 |
-
"epoch": 5.846913580246913,
|
| 436 |
-
"grad_norm": 4.617128372192383,
|
| 437 |
-
"learning_rate": 4.388080612745244e-05,
|
| 438 |
-
"loss": 8.6481,
|
| 439 |
-
"step": 1184
|
| 440 |
-
},
|
| 441 |
-
{
|
| 442 |
-
"epoch": 6.004938271604939,
|
| 443 |
-
"grad_norm": 2.4542462825775146,
|
| 444 |
-
"learning_rate": 4.114746372731275e-05,
|
| 445 |
-
"loss": 8.6291,
|
| 446 |
-
"step": 1216
|
| 447 |
-
},
|
| 448 |
-
{
|
| 449 |
-
"epoch": 6.162962962962963,
|
| 450 |
-
"grad_norm": 4.868436813354492,
|
| 451 |
-
"learning_rate": 3.844118389425153e-05,
|
| 452 |
-
"loss": 8.5827,
|
| 453 |
-
"step": 1248
|
| 454 |
-
},
|
| 455 |
-
{
|
| 456 |
-
"epoch": 6.320987654320987,
|
| 457 |
-
"grad_norm": 3.225403308868408,
|
| 458 |
-
"learning_rate": 3.577023983690177e-05,
|
| 459 |
-
"loss": 8.5461,
|
| 460 |
-
"step": 1280
|
| 461 |
-
},
|
| 462 |
-
{
|
| 463 |
-
"epoch": 6.320987654320987,
|
| 464 |
-
"eval_bleu": 0.023454420732711045,
|
| 465 |
-
"eval_cap_loss": 3.2792342223373114,
|
| 466 |
-
"eval_con_loss": 2.0589846162235035,
|
| 467 |
-
"eval_loss": 7.397203454784319,
|
| 468 |
-
"step": 1280
|
| 469 |
-
},
|
| 470 |
-
{
|
| 471 |
-
"epoch": 6.320987654320987,
|
| 472 |
-
"eval_bleu": 0.023454420732711045,
|
| 473 |
-
"eval_cap_loss": 3.2792342223373114,
|
| 474 |
-
"eval_con_loss": 2.0589846162235035,
|
| 475 |
-
"eval_loss": 7.397203454784319,
|
| 476 |
-
"eval_runtime": 161.0376,
|
| 477 |
-
"eval_samples_per_second": 5.024,
|
| 478 |
-
"eval_steps_per_second": 0.633,
|
| 479 |
-
"step": 1280
|
| 480 |
-
}
|
| 481 |
-
],
|
| 482 |
-
"logging_steps": 32,
|
| 483 |
-
"max_steps": 2020,
|
| 484 |
-
"num_input_tokens_seen": 0,
|
| 485 |
-
"num_train_epochs": 10,
|
| 486 |
-
"save_steps": 128,
|
| 487 |
-
"stateful_callbacks": {
|
| 488 |
-
"TrainerControl": {
|
| 489 |
-
"args": {
|
| 490 |
-
"should_epoch_stop": false,
|
| 491 |
-
"should_evaluate": false,
|
| 492 |
-
"should_log": false,
|
| 493 |
-
"should_save": true,
|
| 494 |
-
"should_training_stop": false
|
| 495 |
-
},
|
| 496 |
-
"attributes": {}
|
| 497 |
-
}
|
| 498 |
-
},
|
| 499 |
-
"total_flos": 0.0,
|
| 500 |
-
"train_batch_size": 16,
|
| 501 |
-
"trial_name": null,
|
| 502 |
-
"trial_params": null
|
| 503 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-1280/training_args.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:ee0fb7ff03fa7d579a0122f63c8133057dbe8dded973c9246203fc477a16730e
|
| 3 |
-
size 5112
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-202/model.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:0dffee69756e606b8b3871e102c92751a40f17cbd2e6e307e668c6d6967a32c6
|
| 3 |
-
size 5821173932
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-202/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:6bc77f1f9f675b5a7d79bee225ca0f8410101081232d6ece7d43854ab8b66d84
|
| 3 |
-
size 312119520
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-202/rng_state.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c0cff64e8412933e783886fbdffd3f6efbcf0ae4d2d1512c2e684b0f3d664dd3
|
| 3 |
-
size 14244
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-202/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:17ee40d21180a1ccc4e69d8fdf2bfff2f3c4b3a31fe3c79203430eab430365bf
|
| 3 |
-
size 1064
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-202/trainer_state.json
DELETED
|
@@ -1,132 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": null,
|
| 3 |
-
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.9975308641975309,
|
| 5 |
-
"eval_steps": 64,
|
| 6 |
-
"global_step": 202,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 0.1580246913580247,
|
| 13 |
-
"grad_norm": 3.374070882797241,
|
| 14 |
-
"learning_rate": 2.972743532698138e-05,
|
| 15 |
-
"loss": 12.1032,
|
| 16 |
-
"step": 32
|
| 17 |
-
},
|
| 18 |
-
{
|
| 19 |
-
"epoch": 0.3160493827160494,
|
| 20 |
-
"grad_norm": 3.2159523963928223,
|
| 21 |
-
"learning_rate": 2.601262828482597e-05,
|
| 22 |
-
"loss": 11.7808,
|
| 23 |
-
"step": 64
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"epoch": 0.3160493827160494,
|
| 27 |
-
"eval_bleu": 0.0,
|
| 28 |
-
"eval_cap_loss": 8.910892418452672,
|
| 29 |
-
"eval_con_loss": 1.3794510922408456,
|
| 30 |
-
"eval_loss": 10.290343505408377,
|
| 31 |
-
"step": 64
|
| 32 |
-
},
|
| 33 |
-
{
|
| 34 |
-
"epoch": 0.3160493827160494,
|
| 35 |
-
"eval_bleu": 0.0,
|
| 36 |
-
"eval_cap_loss": 8.910892418452672,
|
| 37 |
-
"eval_con_loss": 1.3794510922408456,
|
| 38 |
-
"eval_loss": 10.290343505408377,
|
| 39 |
-
"eval_runtime": 220.4456,
|
| 40 |
-
"eval_samples_per_second": 3.67,
|
| 41 |
-
"eval_steps_per_second": 0.921,
|
| 42 |
-
"step": 64
|
| 43 |
-
},
|
| 44 |
-
{
|
| 45 |
-
"epoch": 0.4740740740740741,
|
| 46 |
-
"grad_norm": 2.8607146739959717,
|
| 47 |
-
"learning_rate": 1.8986967266497293e-05,
|
| 48 |
-
"loss": 11.5303,
|
| 49 |
-
"step": 96
|
| 50 |
-
},
|
| 51 |
-
{
|
| 52 |
-
"epoch": 0.6320987654320988,
|
| 53 |
-
"grad_norm": 2.915891408920288,
|
| 54 |
-
"learning_rate": 1.0762658106621542e-05,
|
| 55 |
-
"loss": 11.4537,
|
| 56 |
-
"step": 128
|
| 57 |
-
},
|
| 58 |
-
{
|
| 59 |
-
"epoch": 0.6320987654320988,
|
| 60 |
-
"eval_bleu": 0.0,
|
| 61 |
-
"eval_cap_loss": 8.568840851337452,
|
| 62 |
-
"eval_con_loss": 1.379400504046473,
|
| 63 |
-
"eval_loss": 9.94824136771592,
|
| 64 |
-
"step": 128
|
| 65 |
-
},
|
| 66 |
-
{
|
| 67 |
-
"epoch": 0.6320987654320988,
|
| 68 |
-
"eval_bleu": 0.0,
|
| 69 |
-
"eval_cap_loss": 8.568840851337452,
|
| 70 |
-
"eval_con_loss": 1.379400504046473,
|
| 71 |
-
"eval_loss": 9.94824136771592,
|
| 72 |
-
"eval_runtime": 220.8837,
|
| 73 |
-
"eval_samples_per_second": 3.663,
|
| 74 |
-
"eval_steps_per_second": 0.919,
|
| 75 |
-
"step": 128
|
| 76 |
-
},
|
| 77 |
-
{
|
| 78 |
-
"epoch": 0.7901234567901234,
|
| 79 |
-
"grad_norm": 3.0462822914123535,
|
| 80 |
-
"learning_rate": 3.812270111907451e-06,
|
| 81 |
-
"loss": 11.2808,
|
| 82 |
-
"step": 160
|
| 83 |
-
},
|
| 84 |
-
{
|
| 85 |
-
"epoch": 0.9481481481481482,
|
| 86 |
-
"grad_norm": 2.9391884803771973,
|
| 87 |
-
"learning_rate": 2.2537891617109508e-07,
|
| 88 |
-
"loss": 11.3221,
|
| 89 |
-
"step": 192
|
| 90 |
-
},
|
| 91 |
-
{
|
| 92 |
-
"epoch": 0.9481481481481482,
|
| 93 |
-
"eval_bleu": 0.0,
|
| 94 |
-
"eval_cap_loss": 8.536650624768487,
|
| 95 |
-
"eval_con_loss": 1.379456711520115,
|
| 96 |
-
"eval_loss": 9.916107299879855,
|
| 97 |
-
"step": 192
|
| 98 |
-
},
|
| 99 |
-
{
|
| 100 |
-
"epoch": 0.9481481481481482,
|
| 101 |
-
"eval_bleu": 0.0,
|
| 102 |
-
"eval_cap_loss": 8.536650624768487,
|
| 103 |
-
"eval_con_loss": 1.379456711520115,
|
| 104 |
-
"eval_loss": 9.916107299879855,
|
| 105 |
-
"eval_runtime": 220.5088,
|
| 106 |
-
"eval_samples_per_second": 3.669,
|
| 107 |
-
"eval_steps_per_second": 0.921,
|
| 108 |
-
"step": 192
|
| 109 |
-
}
|
| 110 |
-
],
|
| 111 |
-
"logging_steps": 32,
|
| 112 |
-
"max_steps": 202,
|
| 113 |
-
"num_input_tokens_seen": 0,
|
| 114 |
-
"num_train_epochs": 1,
|
| 115 |
-
"save_steps": 64,
|
| 116 |
-
"stateful_callbacks": {
|
| 117 |
-
"TrainerControl": {
|
| 118 |
-
"args": {
|
| 119 |
-
"should_epoch_stop": false,
|
| 120 |
-
"should_evaluate": false,
|
| 121 |
-
"should_log": false,
|
| 122 |
-
"should_save": true,
|
| 123 |
-
"should_training_stop": true
|
| 124 |
-
},
|
| 125 |
-
"attributes": {}
|
| 126 |
-
}
|
| 127 |
-
},
|
| 128 |
-
"total_flos": 0.0,
|
| 129 |
-
"train_batch_size": 16,
|
| 130 |
-
"trial_name": null,
|
| 131 |
-
"trial_params": null
|
| 132 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-202/training_args.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:bd2b23c0025977f304dc794f158c7bad81d7c588ae408e484fc9c79f6fec2528
|
| 3 |
-
size 5112
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606-2/model.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:30b715a98d0790402e95d413d3b3d080f453af0c8f70b74f5171a5e6a5a39f15
|
| 3 |
-
size 2297612372
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606-2/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:3ebde8deb09d4e8e6e449042dcefc4cc0997334c8ac8fa04c8acdcd6c76c089e
|
| 3 |
-
size 178998372
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606-2/rng_state.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:51d23a921626f1fecc8b752c0dc40ad68da4137994e71ad7c66137caf507a3e6
|
| 3 |
-
size 14180
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606-2/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e51415869b2f0df9eed69859df5822396e1c56ea421d89bb22b2c580ba0e2803
|
| 3 |
-
size 1064
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606-2/trainer_state.json
DELETED
|
@@ -1,235 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": null,
|
| 3 |
-
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 2.9925925925925925,
|
| 5 |
-
"eval_steps": 128,
|
| 6 |
-
"global_step": 606,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 0.1580246913580247,
|
| 13 |
-
"grad_norm": 3.291957378387451,
|
| 14 |
-
"learning_rate": 4.918032786885246e-06,
|
| 15 |
-
"loss": 9.3959,
|
| 16 |
-
"step": 32
|
| 17 |
-
},
|
| 18 |
-
{
|
| 19 |
-
"epoch": 0.3160493827160494,
|
| 20 |
-
"grad_norm": 4.1592698097229,
|
| 21 |
-
"learning_rate": 9.999916929744365e-06,
|
| 22 |
-
"loss": 9.151,
|
| 23 |
-
"step": 64
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"epoch": 0.4740740740740741,
|
| 27 |
-
"grad_norm": 5.414369583129883,
|
| 28 |
-
"learning_rate": 9.909808702018315e-06,
|
| 29 |
-
"loss": 9.1311,
|
| 30 |
-
"step": 96
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"epoch": 0.6320987654320988,
|
| 34 |
-
"grad_norm": 4.628862380981445,
|
| 35 |
-
"learning_rate": 9.653114094889128e-06,
|
| 36 |
-
"loss": 9.1676,
|
| 37 |
-
"step": 128
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.6320987654320988,
|
| 41 |
-
"eval_bleu": 0.018568904197283402,
|
| 42 |
-
"eval_cap_loss": 3.457257219389373,
|
| 43 |
-
"eval_con_loss": 2.059046186652838,
|
| 44 |
-
"eval_loss": 8.97356055764591,
|
| 45 |
-
"step": 128
|
| 46 |
-
},
|
| 47 |
-
{
|
| 48 |
-
"epoch": 0.6320987654320988,
|
| 49 |
-
"eval_bleu": 0.018568904197283402,
|
| 50 |
-
"eval_cap_loss": 3.457257219389373,
|
| 51 |
-
"eval_con_loss": 2.059046186652838,
|
| 52 |
-
"eval_loss": 8.97356055764591,
|
| 53 |
-
"eval_runtime": 161.0868,
|
| 54 |
-
"eval_samples_per_second": 5.022,
|
| 55 |
-
"eval_steps_per_second": 0.633,
|
| 56 |
-
"step": 128
|
| 57 |
-
},
|
| 58 |
-
{
|
| 59 |
-
"epoch": 0.7901234567901234,
|
| 60 |
-
"grad_norm": 6.027233123779297,
|
| 61 |
-
"learning_rate": 9.253761031990218e-06,
|
| 62 |
-
"loss": 9.1153,
|
| 63 |
-
"step": 160
|
| 64 |
-
},
|
| 65 |
-
{
|
| 66 |
-
"epoch": 0.9481481481481482,
|
| 67 |
-
"grad_norm": 6.049582004547119,
|
| 68 |
-
"learning_rate": 8.699609944263219e-06,
|
| 69 |
-
"loss": 9.1281,
|
| 70 |
-
"step": 192
|
| 71 |
-
},
|
| 72 |
-
{
|
| 73 |
-
"epoch": 1.106172839506173,
|
| 74 |
-
"grad_norm": 4.76241397857666,
|
| 75 |
-
"learning_rate": 8.019933675572389e-06,
|
| 76 |
-
"loss": 9.021,
|
| 77 |
-
"step": 224
|
| 78 |
-
},
|
| 79 |
-
{
|
| 80 |
-
"epoch": 1.2641975308641975,
|
| 81 |
-
"grad_norm": 4.239706516265869,
|
| 82 |
-
"learning_rate": 7.2377931669113934e-06,
|
| 83 |
-
"loss": 9.0272,
|
| 84 |
-
"step": 256
|
| 85 |
-
},
|
| 86 |
-
{
|
| 87 |
-
"epoch": 1.2641975308641975,
|
| 88 |
-
"eval_bleu": 0.016148135533212146,
|
| 89 |
-
"eval_cap_loss": 3.4138525163426117,
|
| 90 |
-
"eval_con_loss": 2.059033068956113,
|
| 91 |
-
"eval_loss": 8.886738094629026,
|
| 92 |
-
"step": 256
|
| 93 |
-
},
|
| 94 |
-
{
|
| 95 |
-
"epoch": 1.2641975308641975,
|
| 96 |
-
"eval_bleu": 0.016148135533212146,
|
| 97 |
-
"eval_cap_loss": 3.4138525163426117,
|
| 98 |
-
"eval_con_loss": 2.059033068956113,
|
| 99 |
-
"eval_loss": 8.886738094629026,
|
| 100 |
-
"eval_runtime": 161.0562,
|
| 101 |
-
"eval_samples_per_second": 5.023,
|
| 102 |
-
"eval_steps_per_second": 0.633,
|
| 103 |
-
"step": 256
|
| 104 |
-
},
|
| 105 |
-
{
|
| 106 |
-
"epoch": 1.4222222222222223,
|
| 107 |
-
"grad_norm": 5.311102867126465,
|
| 108 |
-
"learning_rate": 6.379725899357408e-06,
|
| 109 |
-
"loss": 9.0054,
|
| 110 |
-
"step": 288
|
| 111 |
-
},
|
| 112 |
-
{
|
| 113 |
-
"epoch": 1.5802469135802468,
|
| 114 |
-
"grad_norm": 5.357556343078613,
|
| 115 |
-
"learning_rate": 5.474845495876518e-06,
|
| 116 |
-
"loss": 9.1177,
|
| 117 |
-
"step": 320
|
| 118 |
-
},
|
| 119 |
-
{
|
| 120 |
-
"epoch": 1.7382716049382716,
|
| 121 |
-
"grad_norm": 5.401686668395996,
|
| 122 |
-
"learning_rate": 4.553853916434448e-06,
|
| 123 |
-
"loss": 9.0223,
|
| 124 |
-
"step": 352
|
| 125 |
-
},
|
| 126 |
-
{
|
| 127 |
-
"epoch": 1.8962962962962964,
|
| 128 |
-
"grad_norm": 6.359274387359619,
|
| 129 |
-
"learning_rate": 3.6479997619424605e-06,
|
| 130 |
-
"loss": 9.083,
|
| 131 |
-
"step": 384
|
| 132 |
-
},
|
| 133 |
-
{
|
| 134 |
-
"epoch": 1.8962962962962964,
|
| 135 |
-
"eval_bleu": 0.017365712049326326,
|
| 136 |
-
"eval_cap_loss": 3.416674846527623,
|
| 137 |
-
"eval_con_loss": 2.0590534537446263,
|
| 138 |
-
"eval_loss": 8.892403146799873,
|
| 139 |
-
"step": 384
|
| 140 |
-
},
|
| 141 |
-
{
|
| 142 |
-
"epoch": 1.8962962962962964,
|
| 143 |
-
"eval_bleu": 0.017365712049326326,
|
| 144 |
-
"eval_cap_loss": 3.416674846527623,
|
| 145 |
-
"eval_con_loss": 2.0590534537446263,
|
| 146 |
-
"eval_loss": 8.892403146799873,
|
| 147 |
-
"eval_runtime": 159.3342,
|
| 148 |
-
"eval_samples_per_second": 5.077,
|
| 149 |
-
"eval_steps_per_second": 0.64,
|
| 150 |
-
"step": 384
|
| 151 |
-
},
|
| 152 |
-
{
|
| 153 |
-
"epoch": 2.054320987654321,
|
| 154 |
-
"grad_norm": 3.601047992706299,
|
| 155 |
-
"learning_rate": 2.7880180310578546e-06,
|
| 156 |
-
"loss": 9.1251,
|
| 157 |
-
"step": 416
|
| 158 |
-
},
|
| 159 |
-
{
|
| 160 |
-
"epoch": 2.212345679012346,
|
| 161 |
-
"grad_norm": 4.511282920837402,
|
| 162 |
-
"learning_rate": 2.0030873031501274e-06,
|
| 163 |
-
"loss": 9.1476,
|
| 164 |
-
"step": 448
|
| 165 |
-
},
|
| 166 |
-
{
|
| 167 |
-
"epoch": 2.3703703703703702,
|
| 168 |
-
"grad_norm": 3.5873196125030518,
|
| 169 |
-
"learning_rate": 1.3198397294863285e-06,
|
| 170 |
-
"loss": 9.2698,
|
| 171 |
-
"step": 480
|
| 172 |
-
},
|
| 173 |
-
{
|
| 174 |
-
"epoch": 2.528395061728395,
|
| 175 |
-
"grad_norm": 2.907109498977661,
|
| 176 |
-
"learning_rate": 7.614574229430432e-07,
|
| 177 |
-
"loss": 9.3526,
|
| 178 |
-
"step": 512
|
| 179 |
-
},
|
| 180 |
-
{
|
| 181 |
-
"epoch": 2.528395061728395,
|
| 182 |
-
"eval_bleu": 0.018224741893114085,
|
| 183 |
-
"eval_cap_loss": 3.4440931338889924,
|
| 184 |
-
"eval_con_loss": 2.059036243195627,
|
| 185 |
-
"eval_loss": 8.947222527335672,
|
| 186 |
-
"step": 512
|
| 187 |
-
},
|
| 188 |
-
{
|
| 189 |
-
"epoch": 2.528395061728395,
|
| 190 |
-
"eval_bleu": 0.018224741893114085,
|
| 191 |
-
"eval_cap_loss": 3.4440931338889924,
|
| 192 |
-
"eval_con_loss": 2.059036243195627,
|
| 193 |
-
"eval_loss": 8.947222527335672,
|
| 194 |
-
"eval_runtime": 159.1467,
|
| 195 |
-
"eval_samples_per_second": 5.083,
|
| 196 |
-
"eval_steps_per_second": 0.641,
|
| 197 |
-
"step": 512
|
| 198 |
-
},
|
| 199 |
-
{
|
| 200 |
-
"epoch": 2.68641975308642,
|
| 201 |
-
"grad_norm": 5.598337650299072,
|
| 202 |
-
"learning_rate": 3.4688590511087304e-07,
|
| 203 |
-
"loss": 9.3263,
|
| 204 |
-
"step": 544
|
| 205 |
-
},
|
| 206 |
-
{
|
| 207 |
-
"epoch": 2.8444444444444446,
|
| 208 |
-
"grad_norm": 5.659440040588379,
|
| 209 |
-
"learning_rate": 9.019129798168658e-08,
|
| 210 |
-
"loss": 9.4575,
|
| 211 |
-
"step": 576
|
| 212 |
-
}
|
| 213 |
-
],
|
| 214 |
-
"logging_steps": 32,
|
| 215 |
-
"max_steps": 606,
|
| 216 |
-
"num_input_tokens_seen": 0,
|
| 217 |
-
"num_train_epochs": 3,
|
| 218 |
-
"save_steps": 128,
|
| 219 |
-
"stateful_callbacks": {
|
| 220 |
-
"TrainerControl": {
|
| 221 |
-
"args": {
|
| 222 |
-
"should_epoch_stop": false,
|
| 223 |
-
"should_evaluate": false,
|
| 224 |
-
"should_log": false,
|
| 225 |
-
"should_save": true,
|
| 226 |
-
"should_training_stop": true
|
| 227 |
-
},
|
| 228 |
-
"attributes": {}
|
| 229 |
-
}
|
| 230 |
-
},
|
| 231 |
-
"total_flos": 0.0,
|
| 232 |
-
"train_batch_size": 16,
|
| 233 |
-
"trial_name": null,
|
| 234 |
-
"trial_params": null
|
| 235 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606-2/training_args.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:3c7cc8cc74cd8f844a12e1e98a8984236a6126dfae3836b014d3e373369f69d7
|
| 3 |
-
size 5112
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606/model.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2dab7fa42fa66d5815df8b7bbb5b720fa77fdbc70f25393dc810583332dfc3da
|
| 3 |
-
size 2297612372
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:95bad9522fdb87255cb62c18caec57e7911e2ff092e39c135fe2882ebcf7bbd6
|
| 3 |
-
size 178998372
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606/rng_state.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:fd68f9468bdece9db631b8983960a037ce75a3b363c645663d54244d569fdce3
|
| 3 |
-
size 14180
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:aa2b7f16f016bea816df5df16d92c79d6f816d8a4d91bb613af70a9f91f3326d
|
| 3 |
-
size 1064
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606/trainer_state.json
DELETED
|
@@ -1,235 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"best_metric": null,
|
| 3 |
-
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 2.996291718170581,
|
| 5 |
-
"eval_steps": 128,
|
| 6 |
-
"global_step": 606,
|
| 7 |
-
"is_hyper_param_search": false,
|
| 8 |
-
"is_local_process_zero": true,
|
| 9 |
-
"is_world_process_zero": true,
|
| 10 |
-
"log_history": [
|
| 11 |
-
{
|
| 12 |
-
"epoch": 0.15822002472187885,
|
| 13 |
-
"grad_norm": 11.1969633102417,
|
| 14 |
-
"learning_rate": 0.00015737704918032785,
|
| 15 |
-
"loss": 14.339,
|
| 16 |
-
"step": 32
|
| 17 |
-
},
|
| 18 |
-
{
|
| 19 |
-
"epoch": 0.3164400494437577,
|
| 20 |
-
"grad_norm": 12.77004337310791,
|
| 21 |
-
"learning_rate": 0.00029997757152782376,
|
| 22 |
-
"loss": 10.8537,
|
| 23 |
-
"step": 64
|
| 24 |
-
},
|
| 25 |
-
{
|
| 26 |
-
"epoch": 0.4746600741656366,
|
| 27 |
-
"grad_norm": 8.878470420837402,
|
| 28 |
-
"learning_rate": 0.0002969575009832261,
|
| 29 |
-
"loss": 10.1762,
|
| 30 |
-
"step": 96
|
| 31 |
-
},
|
| 32 |
-
{
|
| 33 |
-
"epoch": 0.6328800988875154,
|
| 34 |
-
"grad_norm": 7.734447479248047,
|
| 35 |
-
"learning_rate": 0.00028895126509070673,
|
| 36 |
-
"loss": 9.9993,
|
| 37 |
-
"step": 128
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.6328800988875154,
|
| 41 |
-
"eval_bleu": 0.009866227706267426,
|
| 42 |
-
"eval_cap_loss": 3.8397970962994203,
|
| 43 |
-
"eval_con_loss": 1.3794401672673342,
|
| 44 |
-
"eval_loss": 9.059034354581033,
|
| 45 |
-
"step": 128
|
| 46 |
-
},
|
| 47 |
-
{
|
| 48 |
-
"epoch": 0.6328800988875154,
|
| 49 |
-
"eval_bleu": 0.009866227706267426,
|
| 50 |
-
"eval_cap_loss": 3.8397970962994203,
|
| 51 |
-
"eval_con_loss": 1.3794401672673342,
|
| 52 |
-
"eval_loss": 9.059034354581033,
|
| 53 |
-
"eval_runtime": 165.4257,
|
| 54 |
-
"eval_samples_per_second": 4.89,
|
| 55 |
-
"eval_steps_per_second": 1.227,
|
| 56 |
-
"step": 128
|
| 57 |
-
},
|
| 58 |
-
{
|
| 59 |
-
"epoch": 0.7911001236093943,
|
| 60 |
-
"grad_norm": 5.295111179351807,
|
| 61 |
-
"learning_rate": 0.0002766954985019261,
|
| 62 |
-
"loss": 9.7893,
|
| 63 |
-
"step": 160
|
| 64 |
-
},
|
| 65 |
-
{
|
| 66 |
-
"epoch": 0.9493201483312732,
|
| 67 |
-
"grad_norm": 6.028670310974121,
|
| 68 |
-
"learning_rate": 0.0002598176493606703,
|
| 69 |
-
"loss": 9.7303,
|
| 70 |
-
"step": 192
|
| 71 |
-
},
|
| 72 |
-
{
|
| 73 |
-
"epoch": 1.107540173053152,
|
| 74 |
-
"grad_norm": 6.170614719390869,
|
| 75 |
-
"learning_rate": 0.00023921376409217457,
|
| 76 |
-
"loss": 9.3777,
|
| 77 |
-
"step": 224
|
| 78 |
-
},
|
| 79 |
-
{
|
| 80 |
-
"epoch": 1.2657601977750308,
|
| 81 |
-
"grad_norm": 8.26496410369873,
|
| 82 |
-
"learning_rate": 0.00021558291813029553,
|
| 83 |
-
"loss": 9.2487,
|
| 84 |
-
"step": 256
|
| 85 |
-
},
|
| 86 |
-
{
|
| 87 |
-
"epoch": 1.2657601977750308,
|
| 88 |
-
"eval_bleu": 0.012158325589574045,
|
| 89 |
-
"eval_cap_loss": 3.663852495512939,
|
| 90 |
-
"eval_con_loss": 1.3794238802247447,
|
| 91 |
-
"eval_loss": 8.70712886068034,
|
| 92 |
-
"step": 256
|
| 93 |
-
},
|
| 94 |
-
{
|
| 95 |
-
"epoch": 1.2657601977750308,
|
| 96 |
-
"eval_bleu": 0.012158325589574045,
|
| 97 |
-
"eval_cap_loss": 3.663852495512939,
|
| 98 |
-
"eval_con_loss": 1.3794238802247447,
|
| 99 |
-
"eval_loss": 8.70712886068034,
|
| 100 |
-
"eval_runtime": 165.1008,
|
| 101 |
-
"eval_samples_per_second": 4.9,
|
| 102 |
-
"eval_steps_per_second": 1.23,
|
| 103 |
-
"step": 256
|
| 104 |
-
},
|
| 105 |
-
{
|
| 106 |
-
"epoch": 1.4239802224969098,
|
| 107 |
-
"grad_norm": 5.113515377044678,
|
| 108 |
-
"learning_rate": 0.00018972688957066537,
|
| 109 |
-
"loss": 9.2305,
|
| 110 |
-
"step": 288
|
| 111 |
-
},
|
| 112 |
-
{
|
| 113 |
-
"epoch": 1.5822002472187886,
|
| 114 |
-
"grad_norm": 4.397431373596191,
|
| 115 |
-
"learning_rate": 0.00016252295540000706,
|
| 116 |
-
"loss": 9.2278,
|
| 117 |
-
"step": 320
|
| 118 |
-
},
|
| 119 |
-
{
|
| 120 |
-
"epoch": 1.7404202719406676,
|
| 121 |
-
"grad_norm": 4.647693157196045,
|
| 122 |
-
"learning_rate": 0.00013489412610081624,
|
| 123 |
-
"loss": 9.0332,
|
| 124 |
-
"step": 352
|
| 125 |
-
},
|
| 126 |
-
{
|
| 127 |
-
"epoch": 1.8986402966625464,
|
| 128 |
-
"grad_norm": 6.768193244934082,
|
| 129 |
-
"learning_rate": 0.00010777782855047967,
|
| 130 |
-
"loss": 9.0303,
|
| 131 |
-
"step": 384
|
| 132 |
-
},
|
| 133 |
-
{
|
| 134 |
-
"epoch": 1.8986402966625464,
|
| 135 |
-
"eval_bleu": 0.015940976494587023,
|
| 136 |
-
"eval_cap_loss": 3.5265488871212662,
|
| 137 |
-
"eval_con_loss": 1.3794590686929638,
|
| 138 |
-
"eval_loss": 8.432556859378156,
|
| 139 |
-
"step": 384
|
| 140 |
-
},
|
| 141 |
-
{
|
| 142 |
-
"epoch": 1.8986402966625464,
|
| 143 |
-
"eval_bleu": 0.015940976494587023,
|
| 144 |
-
"eval_cap_loss": 3.5265488871212662,
|
| 145 |
-
"eval_con_loss": 1.3794590686929638,
|
| 146 |
-
"eval_loss": 8.432556859378156,
|
| 147 |
-
"eval_runtime": 164.9408,
|
| 148 |
-
"eval_samples_per_second": 4.905,
|
| 149 |
-
"eval_steps_per_second": 1.231,
|
| 150 |
-
"step": 384
|
| 151 |
-
},
|
| 152 |
-
{
|
| 153 |
-
"epoch": 2.056860321384425,
|
| 154 |
-
"grad_norm": 3.732090473175049,
|
| 155 |
-
"learning_rate": 8.286620499265821e-05,
|
| 156 |
-
"loss": 8.9538,
|
| 157 |
-
"step": 416
|
| 158 |
-
},
|
| 159 |
-
{
|
| 160 |
-
"epoch": 2.215080346106304,
|
| 161 |
-
"grad_norm": 4.6480584144592285,
|
| 162 |
-
"learning_rate": 5.9401989732828384e-05,
|
| 163 |
-
"loss": 8.8722,
|
| 164 |
-
"step": 448
|
| 165 |
-
},
|
| 166 |
-
{
|
| 167 |
-
"epoch": 2.373300370828183,
|
| 168 |
-
"grad_norm": 5.697710037231445,
|
| 169 |
-
"learning_rate": 3.901170167210344e-05,
|
| 170 |
-
"loss": 8.8853,
|
| 171 |
-
"step": 480
|
| 172 |
-
},
|
| 173 |
-
{
|
| 174 |
-
"epoch": 2.5315203955500616,
|
| 175 |
-
"grad_norm": 3.8718154430389404,
|
| 176 |
-
"learning_rate": 2.238716904029349e-05,
|
| 177 |
-
"loss": 8.8722,
|
| 178 |
-
"step": 512
|
| 179 |
-
},
|
| 180 |
-
{
|
| 181 |
-
"epoch": 2.5315203955500616,
|
| 182 |
-
"eval_bleu": 0.022739422488822453,
|
| 183 |
-
"eval_cap_loss": 3.4747910476083237,
|
| 184 |
-
"eval_con_loss": 1.37942426369108,
|
| 185 |
-
"eval_loss": 8.329006359494965,
|
| 186 |
-
"step": 512
|
| 187 |
-
},
|
| 188 |
-
{
|
| 189 |
-
"epoch": 2.5315203955500616,
|
| 190 |
-
"eval_bleu": 0.022739422488822453,
|
| 191 |
-
"eval_cap_loss": 3.4747910476083237,
|
| 192 |
-
"eval_con_loss": 1.37942426369108,
|
| 193 |
-
"eval_loss": 8.329006359494965,
|
| 194 |
-
"eval_runtime": 163.9894,
|
| 195 |
-
"eval_samples_per_second": 4.933,
|
| 196 |
-
"eval_steps_per_second": 1.238,
|
| 197 |
-
"step": 512
|
| 198 |
-
},
|
| 199 |
-
{
|
| 200 |
-
"epoch": 2.689740420271941,
|
| 201 |
-
"grad_norm": 4.082241535186768,
|
| 202 |
-
"learning_rate": 1.0092450616157804e-05,
|
| 203 |
-
"loss": 8.7704,
|
| 204 |
-
"step": 544
|
| 205 |
-
},
|
| 206 |
-
{
|
| 207 |
-
"epoch": 2.8479604449938196,
|
| 208 |
-
"grad_norm": 3.609088897705078,
|
| 209 |
-
"learning_rate": 2.544697607077684e-06,
|
| 210 |
-
"loss": 8.8435,
|
| 211 |
-
"step": 576
|
| 212 |
-
}
|
| 213 |
-
],
|
| 214 |
-
"logging_steps": 32,
|
| 215 |
-
"max_steps": 606,
|
| 216 |
-
"num_input_tokens_seen": 0,
|
| 217 |
-
"num_train_epochs": 3,
|
| 218 |
-
"save_steps": 128,
|
| 219 |
-
"stateful_callbacks": {
|
| 220 |
-
"TrainerControl": {
|
| 221 |
-
"args": {
|
| 222 |
-
"should_epoch_stop": false,
|
| 223 |
-
"should_evaluate": false,
|
| 224 |
-
"should_log": false,
|
| 225 |
-
"should_save": true,
|
| 226 |
-
"should_training_stop": true
|
| 227 |
-
},
|
| 228 |
-
"attributes": {}
|
| 229 |
-
}
|
| 230 |
-
},
|
| 231 |
-
"total_flos": 0.0,
|
| 232 |
-
"train_batch_size": 8,
|
| 233 |
-
"trial_name": null,
|
| 234 |
-
"trial_params": null
|
| 235 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/checkpoint-606/training_args.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c438a6e15f14bb991774d44e2f18eed0b82a51ff2a702113d87a3659795d03d1
|
| 3 |
-
size 5112
|
|
|
|
|
|
|
|
|
|
|
|