{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 394,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012706480304955527,
      "grad_norm": 1.0249743461608887,
      "learning_rate": 1.2121212121212122e-06,
      "loss": 1.2769,
      "step": 5
    },
    {
      "epoch": 0.025412960609911054,
      "grad_norm": 0.894174337387085,
      "learning_rate": 2.7272727272727272e-06,
      "loss": 1.3117,
      "step": 10
    },
    {
      "epoch": 0.03811944091486658,
      "grad_norm": 0.6720679998397827,
      "learning_rate": 4.242424242424242e-06,
      "loss": 1.33,
      "step": 15
    },
    {
      "epoch": 0.05082592121982211,
      "grad_norm": 0.7012405395507812,
      "learning_rate": 5.757575757575758e-06,
      "loss": 1.3123,
      "step": 20
    },
    {
      "epoch": 0.06353240152477764,
      "grad_norm": 0.8811719417572021,
      "learning_rate": 7.272727272727273e-06,
      "loss": 1.2455,
      "step": 25
    },
    {
      "epoch": 0.07623888182973317,
      "grad_norm": 0.5775105357170105,
      "learning_rate": 8.787878787878788e-06,
      "loss": 1.2503,
      "step": 30
    },
    {
      "epoch": 0.08894536213468869,
      "grad_norm": 0.6601129770278931,
      "learning_rate": 1.0303030303030302e-05,
      "loss": 1.2622,
      "step": 35
    },
    {
      "epoch": 0.10165184243964422,
      "grad_norm": 0.563920795917511,
      "learning_rate": 1.1818181818181819e-05,
      "loss": 1.2293,
      "step": 40
    },
    {
      "epoch": 0.11435832274459974,
      "grad_norm": 0.6447675824165344,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 1.1266,
      "step": 45
    },
    {
      "epoch": 0.12706480304955528,
      "grad_norm": 0.6008123755455017,
      "learning_rate": 1.484848484848485e-05,
      "loss": 1.1518,
      "step": 50
    },
    {
      "epoch": 0.1397712833545108,
      "grad_norm": 0.5885509848594666,
      "learning_rate": 1.6363636363636363e-05,
      "loss": 1.2858,
      "step": 55
    },
    {
      "epoch": 0.15247776365946633,
      "grad_norm": 0.47118306159973145,
      "learning_rate": 1.7878787878787877e-05,
      "loss": 1.1583,
      "step": 60
    },
    {
      "epoch": 0.16518424396442186,
      "grad_norm": 0.5283555388450623,
      "learning_rate": 1.9393939393939395e-05,
      "loss": 1.2376,
      "step": 65
    },
    {
      "epoch": 0.17789072426937738,
      "grad_norm": 0.5695042014122009,
      "learning_rate": 2.090909090909091e-05,
      "loss": 1.2086,
      "step": 70
    },
    {
      "epoch": 0.1905972045743329,
      "grad_norm": 0.7871887683868408,
      "learning_rate": 2.2424242424242424e-05,
      "loss": 1.178,
      "step": 75
    },
    {
      "epoch": 0.20330368487928843,
      "grad_norm": 0.5051509141921997,
      "learning_rate": 2.3939393939393942e-05,
      "loss": 1.1244,
      "step": 80
    },
    {
      "epoch": 0.21601016518424396,
      "grad_norm": 0.6592188477516174,
      "learning_rate": 2.5454545454545457e-05,
      "loss": 1.1719,
      "step": 85
    },
    {
      "epoch": 0.22871664548919948,
      "grad_norm": 0.5462698340415955,
      "learning_rate": 2.696969696969697e-05,
      "loss": 1.1111,
      "step": 90
    },
    {
      "epoch": 0.241423125794155,
      "grad_norm": 0.5268920063972473,
      "learning_rate": 2.8484848484848486e-05,
      "loss": 1.1048,
      "step": 95
    },
    {
      "epoch": 0.25412960609911056,
      "grad_norm": 0.5009660124778748,
      "learning_rate": 3e-05,
      "loss": 1.1586,
      "step": 100
    },
    {
      "epoch": 0.2668360864040661,
      "grad_norm": 0.6100103259086609,
      "learning_rate": 2.999947137111298e-05,
      "loss": 1.1005,
      "step": 105
    },
    {
      "epoch": 0.2795425667090216,
      "grad_norm": 0.5694701075553894,
      "learning_rate": 2.9997885521711738e-05,
      "loss": 1.0613,
      "step": 110
    },
    {
      "epoch": 0.29224904701397714,
      "grad_norm": 0.5084468722343445,
      "learning_rate": 2.9995242563573035e-05,
      "loss": 1.0681,
      "step": 115
    },
    {
      "epoch": 0.30495552731893266,
      "grad_norm": 0.5901311635971069,
      "learning_rate": 2.9991542682982747e-05,
      "loss": 1.0505,
      "step": 120
    },
    {
      "epoch": 0.3176620076238882,
      "grad_norm": 0.5251779556274414,
      "learning_rate": 2.99867861407227e-05,
      "loss": 1.0543,
      "step": 125
    },
    {
      "epoch": 0.3303684879288437,
      "grad_norm": 0.5326008796691895,
      "learning_rate": 2.9980973272052328e-05,
      "loss": 1.0744,
      "step": 130
    },
    {
      "epoch": 0.34307496823379924,
      "grad_norm": 0.5696294903755188,
      "learning_rate": 2.9974104486684988e-05,
      "loss": 1.0599,
      "step": 135
    },
    {
      "epoch": 0.35578144853875476,
      "grad_norm": 0.5977292656898499,
      "learning_rate": 2.996618026875914e-05,
      "loss": 1.071,
      "step": 140
    },
    {
      "epoch": 0.3684879288437103,
      "grad_norm": 0.5755591988563538,
      "learning_rate": 2.995720117680417e-05,
      "loss": 1.0214,
      "step": 145
    },
    {
      "epoch": 0.3811944091486658,
      "grad_norm": 0.8020675778388977,
      "learning_rate": 2.994716784370108e-05,
      "loss": 1.0587,
      "step": 150
    },
    {
      "epoch": 0.39390088945362134,
      "grad_norm": 0.6691463589668274,
      "learning_rate": 2.9936080976637823e-05,
      "loss": 0.9774,
      "step": 155
    },
    {
      "epoch": 0.40660736975857686,
      "grad_norm": 0.6647286415100098,
      "learning_rate": 2.992394135705949e-05,
      "loss": 1.0251,
      "step": 160
    },
    {
      "epoch": 0.4193138500635324,
      "grad_norm": 0.7553662061691284,
      "learning_rate": 2.9910749840613233e-05,
      "loss": 0.9626,
      "step": 165
    },
    {
      "epoch": 0.4320203303684879,
      "grad_norm": 0.6578956842422485,
      "learning_rate": 2.9896507357087928e-05,
      "loss": 0.9649,
      "step": 170
    },
    {
      "epoch": 0.44472681067344344,
      "grad_norm": 0.6698135137557983,
      "learning_rate": 2.988121491034868e-05,
      "loss": 0.9372,
      "step": 175
    },
    {
      "epoch": 0.45743329097839897,
      "grad_norm": 0.7247338891029358,
      "learning_rate": 2.9864873578266034e-05,
      "loss": 0.9995,
      "step": 180
    },
    {
      "epoch": 0.4701397712833545,
      "grad_norm": 0.7628229856491089,
      "learning_rate": 2.9847484512640018e-05,
      "loss": 0.9421,
      "step": 185
    },
    {
      "epoch": 0.48284625158831,
      "grad_norm": 0.7909829020500183,
      "learning_rate": 2.9829048939118944e-05,
      "loss": 0.9142,
      "step": 190
    },
    {
      "epoch": 0.49555273189326554,
      "grad_norm": 0.7012385725975037,
      "learning_rate": 2.9809568157113047e-05,
      "loss": 0.9399,
      "step": 195
    },
    {
      "epoch": 0.5082592121982211,
      "grad_norm": 0.7712706327438354,
      "learning_rate": 2.9789043539702875e-05,
      "loss": 0.9837,
      "step": 200
    },
    {
      "epoch": 0.5209656925031766,
      "grad_norm": 0.7406087517738342,
      "learning_rate": 2.9767476533542513e-05,
      "loss": 0.9507,
      "step": 205
    },
    {
      "epoch": 0.5336721728081322,
      "grad_norm": 0.7858076095581055,
      "learning_rate": 2.9744868658757628e-05,
      "loss": 0.9529,
      "step": 210
    },
    {
      "epoch": 0.5463786531130876,
      "grad_norm": 0.7638741731643677,
      "learning_rate": 2.9721221508838302e-05,
      "loss": 0.9502,
      "step": 215
    },
    {
      "epoch": 0.5590851334180432,
      "grad_norm": 0.9117738604545593,
      "learning_rate": 2.9696536750526748e-05,
      "loss": 0.8913,
      "step": 220
    },
    {
      "epoch": 0.5717916137229987,
      "grad_norm": 0.8238304853439331,
      "learning_rate": 2.9670816123699812e-05,
      "loss": 0.8704,
      "step": 225
    },
    {
      "epoch": 0.5844980940279543,
      "grad_norm": 0.7925108075141907,
      "learning_rate": 2.9644061441246323e-05,
      "loss": 0.8565,
      "step": 230
    },
    {
      "epoch": 0.5972045743329097,
      "grad_norm": 0.7725275754928589,
      "learning_rate": 2.9616274588939364e-05,
      "loss": 0.8292,
      "step": 235
    },
    {
      "epoch": 0.6099110546378653,
      "grad_norm": 0.8474175333976746,
      "learning_rate": 2.9587457525303305e-05,
      "loss": 0.8822,
      "step": 240
    },
    {
      "epoch": 0.6226175349428208,
      "grad_norm": 1.0394459962844849,
      "learning_rate": 2.9557612281475776e-05,
      "loss": 0.8602,
      "step": 245
    },
    {
      "epoch": 0.6353240152477764,
      "grad_norm": 0.9201862812042236,
      "learning_rate": 2.9526740961064516e-05,
      "loss": 0.8374,
      "step": 250
    },
    {
      "epoch": 0.6480304955527318,
      "grad_norm": 0.9244139790534973,
      "learning_rate": 2.9494845739999103e-05,
      "loss": 0.8361,
      "step": 255
    },
    {
      "epoch": 0.6607369758576874,
      "grad_norm": 0.9039394855499268,
      "learning_rate": 2.9461928866377553e-05,
      "loss": 0.8261,
      "step": 260
    },
    {
      "epoch": 0.6734434561626429,
      "grad_norm": 0.7772687673568726,
      "learning_rate": 2.942799266030791e-05,
      "loss": 0.8687,
      "step": 265
    },
    {
      "epoch": 0.6861499364675985,
      "grad_norm": 1.1102648973464966,
      "learning_rate": 2.9393039513744684e-05,
      "loss": 0.8003,
      "step": 270
    },
    {
      "epoch": 0.6988564167725541,
      "grad_norm": 0.8548868298530579,
      "learning_rate": 2.9357071890320262e-05,
      "loss": 0.8169,
      "step": 275
    },
    {
      "epoch": 0.7115628970775095,
      "grad_norm": 0.8160343766212463,
      "learning_rate": 2.9320092325171292e-05,
      "loss": 0.7822,
      "step": 280
    },
    {
      "epoch": 0.7242693773824651,
      "grad_norm": 0.825340986251831,
      "learning_rate": 2.9282103424759935e-05,
      "loss": 0.7904,
      "step": 285
    },
    {
      "epoch": 0.7369758576874206,
      "grad_norm": 0.7895886898040771,
      "learning_rate": 2.924310786669023e-05,
      "loss": 0.8006,
      "step": 290
    },
    {
      "epoch": 0.7496823379923762,
      "grad_norm": 1.0612972974777222,
      "learning_rate": 2.9203108399519295e-05,
      "loss": 0.7724,
      "step": 295
    },
    {
      "epoch": 0.7623888182973316,
      "grad_norm": 0.937221884727478,
      "learning_rate": 2.9162107842563645e-05,
      "loss": 0.7597,
      "step": 300
    },
    {
      "epoch": 0.7750952986022872,
      "grad_norm": 0.9658941626548767,
      "learning_rate": 2.9120109085700443e-05,
      "loss": 0.7786,
      "step": 305
    },
    {
      "epoch": 0.7878017789072427,
      "grad_norm": 1.1432039737701416,
      "learning_rate": 2.9077115089163842e-05,
      "loss": 0.8014,
      "step": 310
    },
    {
      "epoch": 0.8005082592121983,
      "grad_norm": 0.9009749293327332,
      "learning_rate": 2.903312888333631e-05,
      "loss": 0.8,
      "step": 315
    },
    {
      "epoch": 0.8132147395171537,
      "grad_norm": 1.0406774282455444,
      "learning_rate": 2.8988153568535053e-05,
      "loss": 0.743,
      "step": 320
    },
    {
      "epoch": 0.8259212198221093,
      "grad_norm": 1.0573517084121704,
      "learning_rate": 2.8942192314793486e-05,
      "loss": 0.746,
      "step": 325
    },
    {
      "epoch": 0.8386277001270648,
      "grad_norm": 0.9490292072296143,
      "learning_rate": 2.8895248361637795e-05,
      "loss": 0.7417,
      "step": 330
    },
    {
      "epoch": 0.8513341804320204,
      "grad_norm": 1.0302095413208008,
      "learning_rate": 2.8847325017858608e-05,
      "loss": 0.7573,
      "step": 335
    },
    {
      "epoch": 0.8640406607369758,
      "grad_norm": 0.8958389759063721,
      "learning_rate": 2.879842566127778e-05,
      "loss": 0.7489,
      "step": 340
    },
    {
      "epoch": 0.8767471410419314,
      "grad_norm": 0.9970207214355469,
      "learning_rate": 2.8748553738510296e-05,
      "loss": 0.6952,
      "step": 345
    },
    {
      "epoch": 0.8894536213468869,
      "grad_norm": 1.0113465785980225,
      "learning_rate": 2.869771276472137e-05,
      "loss": 0.6985,
      "step": 350
    },
    {
      "epoch": 0.9021601016518425,
      "grad_norm": 0.9318073391914368,
      "learning_rate": 2.8645906323378642e-05,
      "loss": 0.7184,
      "step": 355
    },
    {
      "epoch": 0.9148665819567979,
      "grad_norm": 0.9506980180740356,
      "learning_rate": 2.8593138065999648e-05,
      "loss": 0.6907,
      "step": 360
    },
    {
      "epoch": 0.9275730622617535,
      "grad_norm": 0.9164155721664429,
      "learning_rate": 2.8539411711894397e-05,
      "loss": 0.7064,
      "step": 365
    },
    {
      "epoch": 0.940279542566709,
      "grad_norm": 1.1175841093063354,
      "learning_rate": 2.8484731047903274e-05,
      "loss": 0.6761,
      "step": 370
    },
    {
      "epoch": 0.9529860228716646,
      "grad_norm": 0.9533513188362122,
      "learning_rate": 2.842909992813007e-05,
      "loss": 0.6527,
      "step": 375
    },
    {
      "epoch": 0.96569250317662,
      "grad_norm": 1.088073968887329,
      "learning_rate": 2.8372522273670386e-05,
      "loss": 0.7256,
      "step": 380
    },
    {
      "epoch": 0.9783989834815756,
      "grad_norm": 0.9813941121101379,
      "learning_rate": 2.8315002072335216e-05,
      "loss": 0.6904,
      "step": 385
    },
    {
      "epoch": 0.9911054637865311,
      "grad_norm": 1.0815173387527466,
      "learning_rate": 2.8256543378369906e-05,
      "loss": 0.7024,
      "step": 390
    }
  ],
  "logging_steps": 5,
  "max_steps": 1970,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.47483574861824e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}