{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 0.16436488926410675, "learning_rate": 3e-06, "loss": 1.2626, "step": 10 }, { "grad_norm": 0.17262399196624756, "learning_rate": 6.333333333333334e-06, "loss": 1.2533, "step": 20 }, { "grad_norm": 0.296794056892395, "learning_rate": 9.666666666666667e-06, "loss": 1.2115, "step": 30 }, { "grad_norm": 0.18189626932144165, "learning_rate": 1.3000000000000001e-05, "loss": 1.1331, "step": 40 }, { "grad_norm": 0.19689704477787018, "learning_rate": 1.6333333333333335e-05, "loss": 1.0918, "step": 50 }, { "grad_norm": 0.33070850372314453, "learning_rate": 1.9666666666666666e-05, "loss": 1.075, "step": 60 }, { "grad_norm": 0.18935681879520416, "learning_rate": 2.3000000000000003e-05, "loss": 1.0615, "step": 70 }, { "grad_norm": 0.38514193892478943, "learning_rate": 2.633333333333333e-05, "loss": 1.0459, "step": 80 }, { "grad_norm": 0.27131959795951843, "learning_rate": 2.9666666666666672e-05, "loss": 1.0219, "step": 90 }, { "grad_norm": 0.5233219265937805, "learning_rate": 3.3e-05, "loss": 0.9815, "step": 100 }, { "grad_norm": 0.3682015836238861, "learning_rate": 3.633333333333333e-05, "loss": 0.9351, "step": 110 }, { "grad_norm": 0.5758795738220215, "learning_rate": 3.966666666666667e-05, "loss": 0.8842, "step": 120 }, { "grad_norm": 0.9897364974021912, "learning_rate": 4.3e-05, "loss": 0.8308, "step": 130 }, { "grad_norm": 0.6590216755867004, "learning_rate": 4.633333333333333e-05, "loss": 0.7733, "step": 140 }, { "grad_norm": 0.850604236125946, "learning_rate": 4.966666666666667e-05, "loss": 0.7366, "step": 150 }, { "grad_norm": 0.7046625018119812, "learning_rate": 5.300000000000001e-05, "loss": 0.6911, "step": 160 }, { "grad_norm": 0.8093236684799194, "learning_rate": 5.633333333333334e-05, "loss": 0.6561, "step": 170 }, { "grad_norm": 1.1920536756515503, "learning_rate": 5.966666666666667e-05, "loss": 0.624, "step": 180 }, { "grad_norm": 0.8813220858573914, "learning_rate": 6.3e-05, "loss": 0.6004, "step": 190 }, { "grad_norm": 0.9447306394577026, "learning_rate": 6.633333333333334e-05, "loss": 0.5778, "step": 200 }, { "grad_norm": 1.016005039215088, "learning_rate": 6.966666666666668e-05, "loss": 0.5577, "step": 210 }, { "grad_norm": 0.9053665995597839, "learning_rate": 7.3e-05, "loss": 0.541, "step": 220 }, { "grad_norm": 1.1023659706115723, "learning_rate": 7.633333333333334e-05, "loss": 0.516, "step": 230 }, { "grad_norm": 1.2229526042938232, "learning_rate": 7.966666666666666e-05, "loss": 0.4996, "step": 240 }, { "grad_norm": 1.1556382179260254, "learning_rate": 8.3e-05, "loss": 0.4822, "step": 250 }, { "grad_norm": 1.349492073059082, "learning_rate": 8.633333333333334e-05, "loss": 0.4579, "step": 260 }, { "grad_norm": 1.251599669456482, "learning_rate": 8.966666666666666e-05, "loss": 0.4291, "step": 270 }, { "grad_norm": 1.1909986734390259, "learning_rate": 9.300000000000001e-05, "loss": 0.4068, "step": 280 }, { "grad_norm": 1.2850487232208252, "learning_rate": 9.633333333333335e-05, "loss": 0.3932, "step": 290 }, { "grad_norm": 1.3327720165252686, "learning_rate": 9.966666666666667e-05, "loss": 0.3716, "step": 300 }, { "grad_norm": 1.6960358619689941, "learning_rate": 9.999938485971279e-05, "loss": 0.3564, "step": 310 }, { "grad_norm": 1.3837307691574097, "learning_rate": 9.999725846827562e-05, "loss": 0.3295, "step": 320 }, { "grad_norm": 0.9719492793083191, "learning_rate": 9.999361329594254e-05, "loss": 0.3078, "step": 330 }, { "grad_norm": 1.0894017219543457, "learning_rate": 9.998844945344405e-05, "loss": 0.2905, "step": 340 }, { "grad_norm": 1.0385677814483643, "learning_rate": 9.99817670976436e-05, "loss": 0.2703, "step": 350 }, { "grad_norm": 1.2726777791976929, "learning_rate": 9.997356643153303e-05, "loss": 0.2523, "step": 360 }, { "grad_norm": 1.377550482749939, "learning_rate": 9.996384770422629e-05, "loss": 0.2416, "step": 370 }, { "grad_norm": 1.5664293766021729, "learning_rate": 9.995261121095194e-05, "loss": 0.2256, "step": 380 }, { "grad_norm": 1.2346878051757812, "learning_rate": 9.993985729304408e-05, "loss": 0.1967, "step": 390 }, { "grad_norm": 1.0488691329956055, "learning_rate": 9.992558633793212e-05, "loss": 0.1839, "step": 400 }, { "grad_norm": 1.8018057346343994, "learning_rate": 9.990979877912891e-05, "loss": 0.1722, "step": 410 }, { "grad_norm": 1.0807803869247437, "learning_rate": 9.989249509621759e-05, "loss": 0.1529, "step": 420 }, { "grad_norm": 1.1561813354492188, "learning_rate": 9.987367581483705e-05, "loss": 0.144, "step": 430 }, { "grad_norm": 1.4849426746368408, "learning_rate": 9.985334150666592e-05, "loss": 0.1278, "step": 440 }, { "grad_norm": 1.3472307920455933, "learning_rate": 9.983149278940526e-05, "loss": 0.1207, "step": 450 }, { "grad_norm": 1.3177121877670288, "learning_rate": 9.980813032675974e-05, "loss": 0.1188, "step": 460 }, { "grad_norm": 1.1293466091156006, "learning_rate": 9.978325482841753e-05, "loss": 0.108, "step": 470 }, { "grad_norm": 1.2579174041748047, "learning_rate": 9.975686705002867e-05, "loss": 0.0956, "step": 480 }, { "grad_norm": 1.0814399719238281, "learning_rate": 9.972896779318219e-05, "loss": 0.0928, "step": 490 }, { "grad_norm": 1.103074312210083, "learning_rate": 9.969955790538175e-05, "loss": 0.0919, "step": 500 }, { "grad_norm": 1.310774564743042, "learning_rate": 9.966863828001982e-05, "loss": 0.0948, "step": 510 }, { "grad_norm": 1.3567174673080444, "learning_rate": 9.963620985635065e-05, "loss": 0.0905, "step": 520 }, { "grad_norm": 1.0070871114730835, "learning_rate": 9.960227361946164e-05, "loss": 0.093, "step": 530 }, { "grad_norm": 1.145613431930542, "learning_rate": 9.95668306002435e-05, "loss": 0.0951, "step": 540 }, { "grad_norm": 0.9971445202827454, "learning_rate": 9.952988187535886e-05, "loss": 0.094, "step": 550 }, { "grad_norm": 1.1573495864868164, "learning_rate": 9.949142856720961e-05, "loss": 0.0842, "step": 560 }, { "grad_norm": 1.1136912107467651, "learning_rate": 9.945147184390278e-05, "loss": 0.0855, "step": 570 }, { "grad_norm": 1.0231609344482422, "learning_rate": 9.941001291921512e-05, "loss": 0.0775, "step": 580 }, { "grad_norm": 1.0133978128433228, "learning_rate": 9.936705305255612e-05, "loss": 0.083, "step": 590 }, { "grad_norm": 1.1500823497772217, "learning_rate": 9.932259354892984e-05, "loss": 0.0867, "step": 600 }, { "grad_norm": 1.0416063070297241, "learning_rate": 9.927663575889521e-05, "loss": 0.0796, "step": 610 }, { "grad_norm": 1.0557258129119873, "learning_rate": 9.922918107852504e-05, "loss": 0.0826, "step": 620 }, { "grad_norm": 1.2491735219955444, "learning_rate": 9.918023094936363e-05, "loss": 0.0817, "step": 630 }, { "grad_norm": 1.1741544008255005, "learning_rate": 9.912978685838294e-05, "loss": 0.0807, "step": 640 }, { "grad_norm": 1.2721768617630005, "learning_rate": 9.90778503379374e-05, "loss": 0.0754, "step": 650 }, { "grad_norm": 0.9619813561439514, "learning_rate": 9.902442296571743e-05, "loss": 0.0754, "step": 660 }, { "grad_norm": 1.1113922595977783, "learning_rate": 9.896950636470147e-05, "loss": 0.0736, "step": 670 }, { "grad_norm": 0.8889420032501221, "learning_rate": 9.891310220310666e-05, "loss": 0.0774, "step": 680 }, { "grad_norm": 0.8371496200561523, "learning_rate": 9.885521219433823e-05, "loss": 0.0752, "step": 690 }, { "grad_norm": 1.193962812423706, "learning_rate": 9.879583809693738e-05, "loss": 0.0739, "step": 700 }, { "grad_norm": 0.9352642893791199, "learning_rate": 9.873498171452789e-05, "loss": 0.074, "step": 710 }, { "grad_norm": 0.8476113677024841, "learning_rate": 9.867264489576135e-05, "loss": 0.0687, "step": 720 }, { "grad_norm": 0.85703045129776, "learning_rate": 9.860882953426099e-05, "loss": 0.0721, "step": 730 }, { "grad_norm": 1.0026068687438965, "learning_rate": 9.854353756856412e-05, "loss": 0.0632, "step": 740 }, { "grad_norm": 0.8964142799377441, "learning_rate": 9.847677098206332e-05, "loss": 0.0628, "step": 750 }, { "grad_norm": 0.9155259728431702, "learning_rate": 9.840853180294608e-05, "loss": 0.0647, "step": 760 }, { "grad_norm": 0.9439025521278381, "learning_rate": 9.833882210413332e-05, "loss": 0.0699, "step": 770 }, { "grad_norm": 0.9577764272689819, "learning_rate": 9.826764400321633e-05, "loss": 0.0639, "step": 780 }, { "grad_norm": 1.0065102577209473, "learning_rate": 9.819499966239243e-05, "loss": 0.0613, "step": 790 }, { "grad_norm": 0.9528071880340576, "learning_rate": 9.812089128839938e-05, "loss": 0.0643, "step": 800 }, { "grad_norm": 0.8194341659545898, "learning_rate": 9.804532113244828e-05, "loss": 0.0647, "step": 810 }, { "grad_norm": 0.8991771936416626, "learning_rate": 9.796829149015517e-05, "loss": 0.063, "step": 820 }, { "grad_norm": 0.7631344795227051, "learning_rate": 9.788980470147132e-05, "loss": 0.0651, "step": 830 }, { "grad_norm": 0.8909981846809387, "learning_rate": 9.780986315061218e-05, "loss": 0.0594, "step": 840 }, { "grad_norm": 0.8621935248374939, "learning_rate": 9.772846926598491e-05, "loss": 0.0623, "step": 850 }, { "grad_norm": 0.7843512296676636, "learning_rate": 9.76456255201146e-05, "loss": 0.061, "step": 860 }, { "grad_norm": 0.8107233047485352, "learning_rate": 9.756133442956923e-05, "loss": 0.06, "step": 870 }, { "grad_norm": 0.8250086307525635, "learning_rate": 9.747559855488313e-05, "loss": 0.0608, "step": 880 }, { "grad_norm": 1.0516151189804077, "learning_rate": 9.73884205004793e-05, "loss": 0.0587, "step": 890 }, { "grad_norm": 0.7967276573181152, "learning_rate": 9.729980291459019e-05, "loss": 0.0554, "step": 900 }, { "grad_norm": 0.7640511393547058, "learning_rate": 9.720974848917735e-05, "loss": 0.0649, "step": 910 }, { "grad_norm": 0.9864993095397949, "learning_rate": 9.711825995984957e-05, "loss": 0.0597, "step": 920 }, { "grad_norm": 0.8970789313316345, "learning_rate": 9.702534010577991e-05, "loss": 0.0559, "step": 930 }, { "grad_norm": 0.9033194780349731, "learning_rate": 9.693099174962103e-05, "loss": 0.0502, "step": 940 }, { "grad_norm": 0.891749918460846, "learning_rate": 9.683521775741977e-05, "loss": 0.0646, "step": 950 }, { "grad_norm": 0.8868605494499207, "learning_rate": 9.673802103852979e-05, "loss": 0.0592, "step": 960 }, { "grad_norm": 0.9233248829841614, "learning_rate": 9.663940454552342e-05, "loss": 0.0566, "step": 970 }, { "grad_norm": 0.8602434992790222, "learning_rate": 9.65393712741018e-05, "loss": 0.0534, "step": 980 }, { "grad_norm": 0.8748095631599426, "learning_rate": 9.6437924263004e-05, "loss": 0.0543, "step": 990 }, { "grad_norm": 0.7950878143310547, "learning_rate": 9.63350665939146e-05, "loss": 0.0517, "step": 1000 }, { "grad_norm": 0.8243885040283203, "learning_rate": 9.623080139137023e-05, "loss": 0.0516, "step": 1010 }, { "grad_norm": 0.772615373134613, "learning_rate": 9.612513182266447e-05, "loss": 0.053, "step": 1020 }, { "grad_norm": 0.8965969681739807, "learning_rate": 9.601806109775179e-05, "loss": 0.0514, "step": 1030 }, { "grad_norm": 0.8220785856246948, "learning_rate": 9.590959246914995e-05, "loss": 0.0585, "step": 1040 }, { "grad_norm": 0.8071192502975464, "learning_rate": 9.579972923184122e-05, "loss": 0.0528, "step": 1050 }, { "grad_norm": 0.715766429901123, "learning_rate": 9.568847472317232e-05, "loss": 0.0549, "step": 1060 }, { "grad_norm": 0.7731180191040039, "learning_rate": 9.557583232275303e-05, "loss": 0.051, "step": 1070 }, { "grad_norm": 0.7870914936065674, "learning_rate": 9.546180545235344e-05, "loss": 0.0535, "step": 1080 }, { "grad_norm": 0.7550817131996155, "learning_rate": 9.534639757580013e-05, "loss": 0.0556, "step": 1090 }, { "grad_norm": 0.8336426615715027, "learning_rate": 9.522961219887092e-05, "loss": 0.0515, "step": 1100 }, { "grad_norm": 0.7580074071884155, "learning_rate": 9.511145286918828e-05, "loss": 0.0547, "step": 1110 }, { "grad_norm": 0.7820581793785095, "learning_rate": 9.499192317611167e-05, "loss": 0.0546, "step": 1120 }, { "grad_norm": 0.8143420815467834, "learning_rate": 9.487102675062851e-05, "loss": 0.051, "step": 1130 }, { "grad_norm": 0.9172049164772034, "learning_rate": 9.474876726524374e-05, "loss": 0.055, "step": 1140 }, { "grad_norm": 0.6472367644309998, "learning_rate": 9.462514843386845e-05, "loss": 0.0475, "step": 1150 }, { "grad_norm": 0.8475685119628906, "learning_rate": 9.450017401170689e-05, "loss": 0.0481, "step": 1160 }, { "grad_norm": 0.6527271866798401, "learning_rate": 9.437384779514256e-05, "loss": 0.049, "step": 1170 }, { "grad_norm": 0.7633675932884216, "learning_rate": 9.424617362162271e-05, "loss": 0.044, "step": 1180 }, { "grad_norm": 0.6868181824684143, "learning_rate": 9.411715536954196e-05, "loss": 0.0493, "step": 1190 }, { "grad_norm": 0.7303510904312134, "learning_rate": 9.39867969581243e-05, "loss": 0.0526, "step": 1200 }, { "grad_norm": 0.7577109336853027, "learning_rate": 9.385510234730415e-05, "loss": 0.0549, "step": 1210 }, { "grad_norm": 0.671566903591156, "learning_rate": 9.372207553760603e-05, "loss": 0.0492, "step": 1220 }, { "grad_norm": 0.6442204713821411, "learning_rate": 9.358772057002312e-05, "loss": 0.0489, "step": 1230 }, { "grad_norm": 0.8377021551132202, "learning_rate": 9.345204152589428e-05, "loss": 0.0462, "step": 1240 }, { "grad_norm": 0.7773048281669617, "learning_rate": 9.331504252678037e-05, "loss": 0.0519, "step": 1250 }, { "grad_norm": 0.8093121647834778, "learning_rate": 9.317672773433876e-05, "loss": 0.0497, "step": 1260 }, { "grad_norm": 0.8040132522583008, "learning_rate": 9.30371013501972e-05, "loss": 0.0512, "step": 1270 }, { "grad_norm": 0.7714861035346985, "learning_rate": 9.289616761582587e-05, "loss": 0.0496, "step": 1280 }, { "grad_norm": 0.8005548715591431, "learning_rate": 9.275393081240882e-05, "loss": 0.0469, "step": 1290 }, { "grad_norm": 0.6162385940551758, "learning_rate": 9.261039526071374e-05, "loss": 0.0455, "step": 1300 }, { "grad_norm": 0.8203111290931702, "learning_rate": 9.246556532096078e-05, "loss": 0.0501, "step": 1310 }, { "grad_norm": 0.7906405329704285, "learning_rate": 9.231944539269009e-05, "loss": 0.0511, "step": 1320 }, { "grad_norm": 0.6650158166885376, "learning_rate": 9.217203991462815e-05, "loss": 0.0449, "step": 1330 }, { "grad_norm": 0.681294858455658, "learning_rate": 9.202335336455296e-05, "loss": 0.0497, "step": 1340 }, { "grad_norm": 0.7014471888542175, "learning_rate": 9.187339025915802e-05, "loss": 0.0454, "step": 1350 }, { "grad_norm": 0.768068790435791, "learning_rate": 9.17221551539151e-05, "loss": 0.048, "step": 1360 }, { "grad_norm": 0.6616342663764954, "learning_rate": 9.156965264293586e-05, "loss": 0.0446, "step": 1370 }, { "grad_norm": 0.8056400418281555, "learning_rate": 9.141588735883232e-05, "loss": 0.0496, "step": 1380 }, { "grad_norm": 0.7025634050369263, "learning_rate": 9.126086397257612e-05, "loss": 0.0477, "step": 1390 }, { "grad_norm": 0.7773793935775757, "learning_rate": 9.110458719335659e-05, "loss": 0.0478, "step": 1400 }, { "grad_norm": 0.696651041507721, "learning_rate": 9.094706176843777e-05, "loss": 0.0472, "step": 1410 }, { "grad_norm": 0.6660913228988647, "learning_rate": 9.078829248301417e-05, "loss": 0.0465, "step": 1420 }, { "grad_norm": 0.7574058771133423, "learning_rate": 9.062828416006539e-05, "loss": 0.0443, "step": 1430 }, { "grad_norm": 0.6740890145301819, "learning_rate": 9.046704166020961e-05, "loss": 0.0472, "step": 1440 }, { "grad_norm": 0.6242374181747437, "learning_rate": 9.030456988155596e-05, "loss": 0.0477, "step": 1450 }, { "grad_norm": 0.6770017147064209, "learning_rate": 9.014087375955573e-05, "loss": 0.0469, "step": 1460 }, { "grad_norm": 0.6208916306495667, "learning_rate": 8.997595826685243e-05, "loss": 0.0451, "step": 1470 }, { "grad_norm": 0.6913909316062927, "learning_rate": 8.980982841313074e-05, "loss": 0.0432, "step": 1480 }, { "grad_norm": 0.67692631483078, "learning_rate": 8.964248924496435e-05, "loss": 0.0475, "step": 1490 }, { "grad_norm": 0.6939643621444702, "learning_rate": 8.947394584566258e-05, "loss": 0.0395, "step": 1500 } ], "logging_steps": 10, "max_steps": 6000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 90, "trial_name": null, "trial_params": null }