{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 100.0, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.7171610593795776, "learning_rate": 1.985e-05, "loss": 0.6936, "step": 4 }, { "epoch": 2.0, "grad_norm": 0.931102991104126, "learning_rate": 1.9650000000000003e-05, "loss": 0.6903, "step": 8 }, { "epoch": 3.0, "grad_norm": 0.9753431677818298, "learning_rate": 1.9450000000000002e-05, "loss": 0.691, "step": 12 }, { "epoch": 4.0, "grad_norm": 0.7282153964042664, "learning_rate": 1.925e-05, "loss": 0.6909, "step": 16 }, { "epoch": 5.0, "grad_norm": 0.7919716238975525, "learning_rate": 1.9050000000000002e-05, "loss": 0.6853, "step": 20 }, { "epoch": 6.0, "grad_norm": 2.4427859783172607, "learning_rate": 1.885e-05, "loss": 0.6789, "step": 24 }, { "epoch": 7.0, "grad_norm": 0.7891402840614319, "learning_rate": 1.8650000000000003e-05, "loss": 0.6753, "step": 28 }, { "epoch": 8.0, "grad_norm": 0.6697407960891724, "learning_rate": 1.845e-05, "loss": 0.6762, "step": 32 }, { "epoch": 9.0, "grad_norm": 0.7295678853988647, "learning_rate": 1.825e-05, "loss": 0.6714, "step": 36 }, { "epoch": 10.0, "grad_norm": 0.7619945406913757, "learning_rate": 1.805e-05, "loss": 0.6697, "step": 40 }, { "epoch": 11.0, "grad_norm": 1.8652944564819336, "learning_rate": 1.785e-05, "loss": 0.6609, "step": 44 }, { "epoch": 12.0, "grad_norm": 0.9776553511619568, "learning_rate": 1.7650000000000002e-05, "loss": 0.6554, "step": 48 }, { "epoch": 13.0, "grad_norm": 0.8226175308227539, "learning_rate": 1.7450000000000004e-05, "loss": 0.6505, "step": 52 }, { "epoch": 14.0, "grad_norm": 1.9432940483093262, "learning_rate": 1.7250000000000003e-05, "loss": 0.6451, "step": 56 }, { "epoch": 15.0, "grad_norm": 1.1705070734024048, "learning_rate": 1.705e-05, "loss": 0.6293, "step": 60 }, { "epoch": 16.0, "grad_norm": 1.1913769245147705, "learning_rate": 1.6850000000000003e-05, "loss": 0.6219, "step": 64 }, { "epoch": 17.0, "grad_norm": 1.1586858034133911, "learning_rate": 1.665e-05, "loss": 0.6151, "step": 68 }, { "epoch": 18.0, "grad_norm": 1.3686275482177734, "learning_rate": 1.645e-05, "loss": 0.6057, "step": 72 }, { "epoch": 19.0, "grad_norm": 1.2270820140838623, "learning_rate": 1.6250000000000002e-05, "loss": 0.5921, "step": 76 }, { "epoch": 20.0, "grad_norm": 2.155693531036377, "learning_rate": 1.605e-05, "loss": 0.5771, "step": 80 }, { "epoch": 21.0, "grad_norm": 1.8586078882217407, "learning_rate": 1.5850000000000002e-05, "loss": 0.562, "step": 84 }, { "epoch": 22.0, "grad_norm": 2.844381809234619, "learning_rate": 1.565e-05, "loss": 0.5483, "step": 88 }, { "epoch": 23.0, "grad_norm": 1.532677412033081, "learning_rate": 1.545e-05, "loss": 0.5323, "step": 92 }, { "epoch": 24.0, "grad_norm": 2.501610040664673, "learning_rate": 1.525e-05, "loss": 0.5106, "step": 96 }, { "epoch": 25.0, "grad_norm": 3.366448402404785, "learning_rate": 1.505e-05, "loss": 0.5028, "step": 100 }, { "epoch": 26.0, "grad_norm": 2.540175199508667, "learning_rate": 1.4850000000000002e-05, "loss": 0.4743, "step": 104 }, { "epoch": 27.0, "grad_norm": 2.1043853759765625, "learning_rate": 1.4650000000000002e-05, "loss": 0.4676, "step": 108 }, { "epoch": 28.0, "grad_norm": 2.4121694564819336, "learning_rate": 1.4450000000000002e-05, "loss": 0.4418, "step": 112 }, { "epoch": 29.0, "grad_norm": 1.871059775352478, "learning_rate": 1.425e-05, "loss": 0.4188, "step": 116 }, { "epoch": 30.0, "grad_norm": 3.22082257270813, "learning_rate": 1.4050000000000001e-05, "loss": 0.3973, "step": 120 }, { "epoch": 31.0, "grad_norm": 2.0184738636016846, "learning_rate": 1.3850000000000001e-05, "loss": 0.3767, "step": 124 }, { "epoch": 32.0, "grad_norm": 1.8004070520401, "learning_rate": 1.3650000000000001e-05, "loss": 0.3687, "step": 128 }, { "epoch": 33.0, "grad_norm": 2.161533832550049, "learning_rate": 1.3450000000000002e-05, "loss": 0.3419, "step": 132 }, { "epoch": 34.0, "grad_norm": 2.215999126434326, "learning_rate": 1.325e-05, "loss": 0.3259, "step": 136 }, { "epoch": 35.0, "grad_norm": 1.8289316892623901, "learning_rate": 1.305e-05, "loss": 0.2965, "step": 140 }, { "epoch": 36.0, "grad_norm": 1.7603213787078857, "learning_rate": 1.285e-05, "loss": 0.2784, "step": 144 }, { "epoch": 37.0, "grad_norm": 1.9211527109146118, "learning_rate": 1.2650000000000001e-05, "loss": 0.2624, "step": 148 }, { "epoch": 38.0, "grad_norm": 1.7408591508865356, "learning_rate": 1.2450000000000003e-05, "loss": 0.2301, "step": 152 }, { "epoch": 39.0, "grad_norm": 1.8422377109527588, "learning_rate": 1.2250000000000001e-05, "loss": 0.2316, "step": 156 }, { "epoch": 40.0, "grad_norm": 2.905261754989624, "learning_rate": 1.2050000000000002e-05, "loss": 0.2066, "step": 160 }, { "epoch": 41.0, "grad_norm": 1.5432759523391724, "learning_rate": 1.1850000000000002e-05, "loss": 0.2084, "step": 164 }, { "epoch": 42.0, "grad_norm": 1.6602318286895752, "learning_rate": 1.1650000000000002e-05, "loss": 0.1901, "step": 168 }, { "epoch": 43.0, "grad_norm": 1.7276387214660645, "learning_rate": 1.145e-05, "loss": 0.1635, "step": 172 }, { "epoch": 44.0, "grad_norm": 3.0626723766326904, "learning_rate": 1.125e-05, "loss": 0.1493, "step": 176 }, { "epoch": 45.0, "grad_norm": 1.6950130462646484, "learning_rate": 1.1050000000000001e-05, "loss": 0.128, "step": 180 }, { "epoch": 46.0, "grad_norm": 1.41054105758667, "learning_rate": 1.0850000000000001e-05, "loss": 0.1241, "step": 184 }, { "epoch": 47.0, "grad_norm": 1.694176435470581, "learning_rate": 1.065e-05, "loss": 0.126, "step": 188 }, { "epoch": 48.0, "grad_norm": 1.3726774454116821, "learning_rate": 1.045e-05, "loss": 0.1127, "step": 192 }, { "epoch": 49.0, "grad_norm": 2.0337917804718018, "learning_rate": 1.025e-05, "loss": 0.1056, "step": 196 }, { "epoch": 50.0, "grad_norm": 1.3560911417007446, "learning_rate": 1.005e-05, "loss": 0.0995, "step": 200 }, { "epoch": 51.0, "grad_norm": 1.0479848384857178, "learning_rate": 9.85e-06, "loss": 0.0848, "step": 204 }, { "epoch": 52.0, "grad_norm": 0.9078042507171631, "learning_rate": 9.65e-06, "loss": 0.0789, "step": 208 }, { "epoch": 53.0, "grad_norm": 1.6278938055038452, "learning_rate": 9.450000000000001e-06, "loss": 0.077, "step": 212 }, { "epoch": 54.0, "grad_norm": 1.9590917825698853, "learning_rate": 9.250000000000001e-06, "loss": 0.0807, "step": 216 }, { "epoch": 55.0, "grad_norm": 1.2972891330718994, "learning_rate": 9.050000000000001e-06, "loss": 0.0614, "step": 220 }, { "epoch": 56.0, "grad_norm": 0.8540873527526855, "learning_rate": 8.85e-06, "loss": 0.0606, "step": 224 }, { "epoch": 57.0, "grad_norm": 0.6654326319694519, "learning_rate": 8.65e-06, "loss": 0.0551, "step": 228 }, { "epoch": 58.0, "grad_norm": 0.9245683550834656, "learning_rate": 8.45e-06, "loss": 0.054, "step": 232 }, { "epoch": 59.0, "grad_norm": 0.5625425577163696, "learning_rate": 8.25e-06, "loss": 0.0496, "step": 236 }, { "epoch": 60.0, "grad_norm": 0.664634644985199, "learning_rate": 8.050000000000001e-06, "loss": 0.0493, "step": 240 }, { "epoch": 61.0, "grad_norm": 0.5101817846298218, "learning_rate": 7.850000000000001e-06, "loss": 0.0442, "step": 244 }, { "epoch": 62.0, "grad_norm": 0.5927309393882751, "learning_rate": 7.650000000000001e-06, "loss": 0.0423, "step": 248 }, { "epoch": 63.0, "grad_norm": 0.7394993305206299, "learning_rate": 7.450000000000001e-06, "loss": 0.0434, "step": 252 }, { "epoch": 64.0, "grad_norm": 0.653026819229126, "learning_rate": 7.25e-06, "loss": 0.0373, "step": 256 }, { "epoch": 65.0, "grad_norm": 0.4957493543624878, "learning_rate": 7.05e-06, "loss": 0.0345, "step": 260 }, { "epoch": 66.0, "grad_norm": 0.6404949426651001, "learning_rate": 6.850000000000001e-06, "loss": 0.0347, "step": 264 }, { "epoch": 67.0, "grad_norm": 0.4832979440689087, "learning_rate": 6.650000000000001e-06, "loss": 0.0318, "step": 268 }, { "epoch": 68.0, "grad_norm": 0.5346927046775818, "learning_rate": 6.450000000000001e-06, "loss": 0.0334, "step": 272 }, { "epoch": 69.0, "grad_norm": 0.46299833059310913, "learning_rate": 6.25e-06, "loss": 0.0329, "step": 276 }, { "epoch": 70.0, "grad_norm": 0.39228323101997375, "learning_rate": 6.0500000000000005e-06, "loss": 0.0299, "step": 280 }, { "epoch": 71.0, "grad_norm": 0.4643970727920532, "learning_rate": 5.85e-06, "loss": 0.0297, "step": 284 }, { "epoch": 72.0, "grad_norm": 0.4702988862991333, "learning_rate": 5.65e-06, "loss": 0.0292, "step": 288 }, { "epoch": 73.0, "grad_norm": 0.4042636752128601, "learning_rate": 5.450000000000001e-06, "loss": 0.0276, "step": 292 }, { "epoch": 74.0, "grad_norm": 0.49854159355163574, "learning_rate": 5.2500000000000006e-06, "loss": 0.0285, "step": 296 }, { "epoch": 75.0, "grad_norm": 0.33747512102127075, "learning_rate": 5.050000000000001e-06, "loss": 0.0259, "step": 300 }, { "epoch": 76.0, "grad_norm": 0.5222832560539246, "learning_rate": 4.85e-06, "loss": 0.027, "step": 304 }, { "epoch": 77.0, "grad_norm": 0.3840760588645935, "learning_rate": 4.65e-06, "loss": 0.0257, "step": 308 }, { "epoch": 78.0, "grad_norm": 0.3676559627056122, "learning_rate": 4.450000000000001e-06, "loss": 0.0253, "step": 312 }, { "epoch": 79.0, "grad_norm": 0.3206919729709625, "learning_rate": 4.25e-06, "loss": 0.0245, "step": 316 }, { "epoch": 80.0, "grad_norm": 0.38936108350753784, "learning_rate": 4.05e-06, "loss": 0.0246, "step": 320 }, { "epoch": 81.0, "grad_norm": 1.3330600261688232, "learning_rate": 3.85e-06, "loss": 0.0245, "step": 324 }, { "epoch": 82.0, "grad_norm": 0.3317999839782715, "learning_rate": 3.65e-06, "loss": 0.0225, "step": 328 }, { "epoch": 83.0, "grad_norm": 0.35797789692878723, "learning_rate": 3.45e-06, "loss": 0.0237, "step": 332 }, { "epoch": 84.0, "grad_norm": 0.3166642189025879, "learning_rate": 3.2500000000000002e-06, "loss": 0.0233, "step": 336 }, { "epoch": 85.0, "grad_norm": 0.3116203248500824, "learning_rate": 3.05e-06, "loss": 0.0235, "step": 340 }, { "epoch": 86.0, "grad_norm": 0.3509286940097809, "learning_rate": 2.85e-06, "loss": 0.0221, "step": 344 }, { "epoch": 87.0, "grad_norm": 0.33957698941230774, "learning_rate": 2.6500000000000005e-06, "loss": 0.0219, "step": 348 }, { "epoch": 88.0, "grad_norm": 0.36599016189575195, "learning_rate": 2.4500000000000003e-06, "loss": 0.0219, "step": 352 }, { "epoch": 89.0, "grad_norm": 0.30192670226097107, "learning_rate": 2.25e-06, "loss": 0.0215, "step": 356 }, { "epoch": 90.0, "grad_norm": 0.4861908257007599, "learning_rate": 2.05e-06, "loss": 0.0216, "step": 360 }, { "epoch": 91.0, "grad_norm": 0.43383175134658813, "learning_rate": 1.85e-06, "loss": 0.0211, "step": 364 }, { "epoch": 92.0, "grad_norm": 0.32720497250556946, "learning_rate": 1.6500000000000003e-06, "loss": 0.0218, "step": 368 }, { "epoch": 93.0, "grad_norm": 0.36105918884277344, "learning_rate": 1.45e-06, "loss": 0.0212, "step": 372 }, { "epoch": 94.0, "grad_norm": 0.3829093277454376, "learning_rate": 1.25e-06, "loss": 0.0202, "step": 376 }, { "epoch": 95.0, "grad_norm": 0.3548564016819, "learning_rate": 1.0500000000000001e-06, "loss": 0.0216, "step": 380 }, { "epoch": 96.0, "grad_norm": 0.52253657579422, "learning_rate": 8.500000000000001e-07, "loss": 0.0211, "step": 384 }, { "epoch": 97.0, "grad_norm": 0.29113584756851196, "learning_rate": 6.5e-07, "loss": 0.0216, "step": 388 }, { "epoch": 98.0, "grad_norm": 0.35965240001678467, "learning_rate": 4.5000000000000003e-07, "loss": 0.0209, "step": 392 }, { "epoch": 99.0, "grad_norm": 0.2798146605491638, "learning_rate": 2.5000000000000004e-07, "loss": 0.0208, "step": 396 }, { "epoch": 100.0, "grad_norm": 0.30020079016685486, "learning_rate": 5.0000000000000004e-08, "loss": 0.0208, "step": 400 } ], "logging_steps": 500, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 23228751974400.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }