{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 394, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012706480304955527, "grad_norm": 1.0249743461608887, "learning_rate": 1.2121212121212122e-06, "loss": 1.2769, "step": 5 }, { "epoch": 0.025412960609911054, "grad_norm": 0.894174337387085, "learning_rate": 2.7272727272727272e-06, "loss": 1.3117, "step": 10 }, { "epoch": 0.03811944091486658, "grad_norm": 0.6720679998397827, "learning_rate": 4.242424242424242e-06, "loss": 1.33, "step": 15 }, { "epoch": 0.05082592121982211, "grad_norm": 0.7012405395507812, "learning_rate": 5.757575757575758e-06, "loss": 1.3123, "step": 20 }, { "epoch": 0.06353240152477764, "grad_norm": 0.8811719417572021, "learning_rate": 7.272727272727273e-06, "loss": 1.2455, "step": 25 }, { "epoch": 0.07623888182973317, "grad_norm": 0.5775105357170105, "learning_rate": 8.787878787878788e-06, "loss": 1.2503, "step": 30 }, { "epoch": 0.08894536213468869, "grad_norm": 0.6601129770278931, "learning_rate": 1.0303030303030302e-05, "loss": 1.2622, "step": 35 }, { "epoch": 0.10165184243964422, "grad_norm": 0.563920795917511, "learning_rate": 1.1818181818181819e-05, "loss": 1.2293, "step": 40 }, { "epoch": 0.11435832274459974, "grad_norm": 0.6447675824165344, "learning_rate": 1.3333333333333333e-05, "loss": 1.1266, "step": 45 }, { "epoch": 0.12706480304955528, "grad_norm": 0.6008123755455017, "learning_rate": 1.484848484848485e-05, "loss": 1.1518, "step": 50 }, { "epoch": 0.1397712833545108, "grad_norm": 0.5885509848594666, "learning_rate": 1.6363636363636363e-05, "loss": 1.2858, "step": 55 }, { "epoch": 0.15247776365946633, "grad_norm": 0.47118306159973145, "learning_rate": 1.7878787878787877e-05, "loss": 1.1583, "step": 60 }, { "epoch": 0.16518424396442186, "grad_norm": 0.5283555388450623, "learning_rate": 1.9393939393939395e-05, "loss": 1.2376, "step": 65 }, { "epoch": 0.17789072426937738, "grad_norm": 0.5695042014122009, "learning_rate": 2.090909090909091e-05, "loss": 1.2086, "step": 70 }, { "epoch": 0.1905972045743329, "grad_norm": 0.7871887683868408, "learning_rate": 2.2424242424242424e-05, "loss": 1.178, "step": 75 }, { "epoch": 0.20330368487928843, "grad_norm": 0.5051509141921997, "learning_rate": 2.3939393939393942e-05, "loss": 1.1244, "step": 80 }, { "epoch": 0.21601016518424396, "grad_norm": 0.6592188477516174, "learning_rate": 2.5454545454545457e-05, "loss": 1.1719, "step": 85 }, { "epoch": 0.22871664548919948, "grad_norm": 0.5462698340415955, "learning_rate": 2.696969696969697e-05, "loss": 1.1111, "step": 90 }, { "epoch": 0.241423125794155, "grad_norm": 0.5268920063972473, "learning_rate": 2.8484848484848486e-05, "loss": 1.1048, "step": 95 }, { "epoch": 0.25412960609911056, "grad_norm": 0.5009660124778748, "learning_rate": 3e-05, "loss": 1.1586, "step": 100 }, { "epoch": 0.2668360864040661, "grad_norm": 0.6100103259086609, "learning_rate": 2.999947137111298e-05, "loss": 1.1005, "step": 105 }, { "epoch": 0.2795425667090216, "grad_norm": 0.5694701075553894, "learning_rate": 2.9997885521711738e-05, "loss": 1.0613, "step": 110 }, { "epoch": 0.29224904701397714, "grad_norm": 0.5084468722343445, "learning_rate": 2.9995242563573035e-05, "loss": 1.0681, "step": 115 }, { "epoch": 0.30495552731893266, "grad_norm": 0.5901311635971069, "learning_rate": 2.9991542682982747e-05, "loss": 1.0505, "step": 120 }, { "epoch": 0.3176620076238882, "grad_norm": 0.5251779556274414, "learning_rate": 2.99867861407227e-05, "loss": 1.0543, "step": 125 }, { "epoch": 0.3303684879288437, "grad_norm": 0.5326008796691895, "learning_rate": 2.9980973272052328e-05, "loss": 1.0744, "step": 130 }, { "epoch": 0.34307496823379924, "grad_norm": 0.5696294903755188, "learning_rate": 2.9974104486684988e-05, "loss": 1.0599, "step": 135 }, { "epoch": 0.35578144853875476, "grad_norm": 0.5977292656898499, "learning_rate": 2.996618026875914e-05, "loss": 1.071, "step": 140 }, { "epoch": 0.3684879288437103, "grad_norm": 0.5755591988563538, "learning_rate": 2.995720117680417e-05, "loss": 1.0214, "step": 145 }, { "epoch": 0.3811944091486658, "grad_norm": 0.8020675778388977, "learning_rate": 2.994716784370108e-05, "loss": 1.0587, "step": 150 }, { "epoch": 0.39390088945362134, "grad_norm": 0.6691463589668274, "learning_rate": 2.9936080976637823e-05, "loss": 0.9774, "step": 155 }, { "epoch": 0.40660736975857686, "grad_norm": 0.6647286415100098, "learning_rate": 2.992394135705949e-05, "loss": 1.0251, "step": 160 }, { "epoch": 0.4193138500635324, "grad_norm": 0.7553662061691284, "learning_rate": 2.9910749840613233e-05, "loss": 0.9626, "step": 165 }, { "epoch": 0.4320203303684879, "grad_norm": 0.6578956842422485, "learning_rate": 2.9896507357087928e-05, "loss": 0.9649, "step": 170 }, { "epoch": 0.44472681067344344, "grad_norm": 0.6698135137557983, "learning_rate": 2.988121491034868e-05, "loss": 0.9372, "step": 175 }, { "epoch": 0.45743329097839897, "grad_norm": 0.7247338891029358, "learning_rate": 2.9864873578266034e-05, "loss": 0.9995, "step": 180 }, { "epoch": 0.4701397712833545, "grad_norm": 0.7628229856491089, "learning_rate": 2.9847484512640018e-05, "loss": 0.9421, "step": 185 }, { "epoch": 0.48284625158831, "grad_norm": 0.7909829020500183, "learning_rate": 2.9829048939118944e-05, "loss": 0.9142, "step": 190 }, { "epoch": 0.49555273189326554, "grad_norm": 0.7012385725975037, "learning_rate": 2.9809568157113047e-05, "loss": 0.9399, "step": 195 }, { "epoch": 0.5082592121982211, "grad_norm": 0.7712706327438354, "learning_rate": 2.9789043539702875e-05, "loss": 0.9837, "step": 200 }, { "epoch": 0.5209656925031766, "grad_norm": 0.7406087517738342, "learning_rate": 2.9767476533542513e-05, "loss": 0.9507, "step": 205 }, { "epoch": 0.5336721728081322, "grad_norm": 0.7858076095581055, "learning_rate": 2.9744868658757628e-05, "loss": 0.9529, "step": 210 }, { "epoch": 0.5463786531130876, "grad_norm": 0.7638741731643677, "learning_rate": 2.9721221508838302e-05, "loss": 0.9502, "step": 215 }, { "epoch": 0.5590851334180432, "grad_norm": 0.9117738604545593, "learning_rate": 2.9696536750526748e-05, "loss": 0.8913, "step": 220 }, { "epoch": 0.5717916137229987, "grad_norm": 0.8238304853439331, "learning_rate": 2.9670816123699812e-05, "loss": 0.8704, "step": 225 }, { "epoch": 0.5844980940279543, "grad_norm": 0.7925108075141907, "learning_rate": 2.9644061441246323e-05, "loss": 0.8565, "step": 230 }, { "epoch": 0.5972045743329097, "grad_norm": 0.7725275754928589, "learning_rate": 2.9616274588939364e-05, "loss": 0.8292, "step": 235 }, { "epoch": 0.6099110546378653, "grad_norm": 0.8474175333976746, "learning_rate": 2.9587457525303305e-05, "loss": 0.8822, "step": 240 }, { "epoch": 0.6226175349428208, "grad_norm": 1.0394459962844849, "learning_rate": 2.9557612281475776e-05, "loss": 0.8602, "step": 245 }, { "epoch": 0.6353240152477764, "grad_norm": 0.9201862812042236, "learning_rate": 2.9526740961064516e-05, "loss": 0.8374, "step": 250 }, { "epoch": 0.6480304955527318, "grad_norm": 0.9244139790534973, "learning_rate": 2.9494845739999103e-05, "loss": 0.8361, "step": 255 }, { "epoch": 0.6607369758576874, "grad_norm": 0.9039394855499268, "learning_rate": 2.9461928866377553e-05, "loss": 0.8261, "step": 260 }, { "epoch": 0.6734434561626429, "grad_norm": 0.7772687673568726, "learning_rate": 2.942799266030791e-05, "loss": 0.8687, "step": 265 }, { "epoch": 0.6861499364675985, "grad_norm": 1.1102648973464966, "learning_rate": 2.9393039513744684e-05, "loss": 0.8003, "step": 270 }, { "epoch": 0.6988564167725541, "grad_norm": 0.8548868298530579, "learning_rate": 2.9357071890320262e-05, "loss": 0.8169, "step": 275 }, { "epoch": 0.7115628970775095, "grad_norm": 0.8160343766212463, "learning_rate": 2.9320092325171292e-05, "loss": 0.7822, "step": 280 }, { "epoch": 0.7242693773824651, "grad_norm": 0.825340986251831, "learning_rate": 2.9282103424759935e-05, "loss": 0.7904, "step": 285 }, { "epoch": 0.7369758576874206, "grad_norm": 0.7895886898040771, "learning_rate": 2.924310786669023e-05, "loss": 0.8006, "step": 290 }, { "epoch": 0.7496823379923762, "grad_norm": 1.0612972974777222, "learning_rate": 2.9203108399519295e-05, "loss": 0.7724, "step": 295 }, { "epoch": 0.7623888182973316, "grad_norm": 0.937221884727478, "learning_rate": 2.9162107842563645e-05, "loss": 0.7597, "step": 300 }, { "epoch": 0.7750952986022872, "grad_norm": 0.9658941626548767, "learning_rate": 2.9120109085700443e-05, "loss": 0.7786, "step": 305 }, { "epoch": 0.7878017789072427, "grad_norm": 1.1432039737701416, "learning_rate": 2.9077115089163842e-05, "loss": 0.8014, "step": 310 }, { "epoch": 0.8005082592121983, "grad_norm": 0.9009749293327332, "learning_rate": 2.903312888333631e-05, "loss": 0.8, "step": 315 }, { "epoch": 0.8132147395171537, "grad_norm": 1.0406774282455444, "learning_rate": 2.8988153568535053e-05, "loss": 0.743, "step": 320 }, { "epoch": 0.8259212198221093, "grad_norm": 1.0573517084121704, "learning_rate": 2.8942192314793486e-05, "loss": 0.746, "step": 325 }, { "epoch": 0.8386277001270648, "grad_norm": 0.9490292072296143, "learning_rate": 2.8895248361637795e-05, "loss": 0.7417, "step": 330 }, { "epoch": 0.8513341804320204, "grad_norm": 1.0302095413208008, "learning_rate": 2.8847325017858608e-05, "loss": 0.7573, "step": 335 }, { "epoch": 0.8640406607369758, "grad_norm": 0.8958389759063721, "learning_rate": 2.879842566127778e-05, "loss": 0.7489, "step": 340 }, { "epoch": 0.8767471410419314, "grad_norm": 0.9970207214355469, "learning_rate": 2.8748553738510296e-05, "loss": 0.6952, "step": 345 }, { "epoch": 0.8894536213468869, "grad_norm": 1.0113465785980225, "learning_rate": 2.869771276472137e-05, "loss": 0.6985, "step": 350 }, { "epoch": 0.9021601016518425, "grad_norm": 0.9318073391914368, "learning_rate": 2.8645906323378642e-05, "loss": 0.7184, "step": 355 }, { "epoch": 0.9148665819567979, "grad_norm": 0.9506980180740356, "learning_rate": 2.8593138065999648e-05, "loss": 0.6907, "step": 360 }, { "epoch": 0.9275730622617535, "grad_norm": 0.9164155721664429, "learning_rate": 2.8539411711894397e-05, "loss": 0.7064, "step": 365 }, { "epoch": 0.940279542566709, "grad_norm": 1.1175841093063354, "learning_rate": 2.8484731047903274e-05, "loss": 0.6761, "step": 370 }, { "epoch": 0.9529860228716646, "grad_norm": 0.9533513188362122, "learning_rate": 2.842909992813007e-05, "loss": 0.6527, "step": 375 }, { "epoch": 0.96569250317662, "grad_norm": 1.088073968887329, "learning_rate": 2.8372522273670386e-05, "loss": 0.7256, "step": 380 }, { "epoch": 0.9783989834815756, "grad_norm": 0.9813941121101379, "learning_rate": 2.8315002072335216e-05, "loss": 0.6904, "step": 385 }, { "epoch": 0.9911054637865311, "grad_norm": 1.0815173387527466, "learning_rate": 2.8256543378369906e-05, "loss": 0.7024, "step": 390 } ], "logging_steps": 5, "max_steps": 1970, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.47483574861824e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }