{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2361, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012706480304955527, "grad_norm": 29.66147254458293, "learning_rate": 5.070422535211268e-07, "loss": 2.2184, "step": 10 }, { "epoch": 0.025412960609911054, "grad_norm": 13.392595629867637, "learning_rate": 1.0704225352112677e-06, "loss": 1.8971, "step": 20 }, { "epoch": 0.03811944091486658, "grad_norm": 11.776335522019831, "learning_rate": 1.6338028169014086e-06, "loss": 1.4505, "step": 30 }, { "epoch": 0.05082592121982211, "grad_norm": 9.270920607469225, "learning_rate": 2.19718309859155e-06, "loss": 1.2166, "step": 40 }, { "epoch": 0.06353240152477764, "grad_norm": 8.813337405138961, "learning_rate": 2.7605633802816906e-06, "loss": 1.1232, "step": 50 }, { "epoch": 0.07623888182973317, "grad_norm": 8.759370998993289, "learning_rate": 3.3239436619718313e-06, "loss": 1.1167, "step": 60 }, { "epoch": 0.08894536213468869, "grad_norm": 9.774431896253528, "learning_rate": 3.887323943661972e-06, "loss": 1.0426, "step": 70 }, { "epoch": 0.10165184243964422, "grad_norm": 6.934105141250302, "learning_rate": 4.450704225352113e-06, "loss": 0.9485, "step": 80 }, { "epoch": 0.11435832274459974, "grad_norm": 9.541786088857878, "learning_rate": 5.014084507042254e-06, "loss": 0.971, "step": 90 }, { "epoch": 0.12706480304955528, "grad_norm": 8.771108246775684, "learning_rate": 5.577464788732395e-06, "loss": 1.0133, "step": 100 }, { "epoch": 0.1397712833545108, "grad_norm": 8.314820650161352, "learning_rate": 6.1408450704225356e-06, "loss": 0.9729, "step": 110 }, { "epoch": 0.15247776365946633, "grad_norm": 8.539055316805458, "learning_rate": 6.704225352112676e-06, "loss": 0.9679, "step": 120 }, { "epoch": 0.16518424396442186, "grad_norm": 7.751785436062567, "learning_rate": 7.267605633802818e-06, "loss": 0.9634, "step": 130 }, { "epoch": 0.17789072426937738, "grad_norm": 8.338219679133466, "learning_rate": 7.830985915492958e-06, "loss": 0.9674, "step": 140 }, { "epoch": 0.1905972045743329, "grad_norm": 6.970312776486159, "learning_rate": 8.3943661971831e-06, "loss": 0.9323, "step": 150 }, { "epoch": 0.20330368487928843, "grad_norm": 8.934334887580613, "learning_rate": 8.957746478873241e-06, "loss": 0.9296, "step": 160 }, { "epoch": 0.21601016518424396, "grad_norm": 8.855402068732014, "learning_rate": 9.521126760563381e-06, "loss": 0.8979, "step": 170 }, { "epoch": 0.22871664548919948, "grad_norm": 7.627448152527667, "learning_rate": 1.0084507042253523e-05, "loss": 0.9398, "step": 180 }, { "epoch": 0.241423125794155, "grad_norm": 8.293311252648705, "learning_rate": 1.0647887323943662e-05, "loss": 0.9903, "step": 190 }, { "epoch": 0.25412960609911056, "grad_norm": 7.860820039563278, "learning_rate": 1.1211267605633804e-05, "loss": 0.953, "step": 200 }, { "epoch": 0.2668360864040661, "grad_norm": 12.101780056709387, "learning_rate": 1.1774647887323944e-05, "loss": 0.9014, "step": 210 }, { "epoch": 0.2795425667090216, "grad_norm": 8.180635063012854, "learning_rate": 1.2338028169014084e-05, "loss": 0.9612, "step": 220 }, { "epoch": 0.29224904701397714, "grad_norm": 7.043614458189797, "learning_rate": 1.2901408450704227e-05, "loss": 0.9492, "step": 230 }, { "epoch": 0.30495552731893266, "grad_norm": 6.22917625029958, "learning_rate": 1.3464788732394367e-05, "loss": 0.9073, "step": 240 }, { "epoch": 0.3176620076238882, "grad_norm": 6.158034885075652, "learning_rate": 1.4028169014084507e-05, "loss": 0.899, "step": 250 }, { "epoch": 0.3303684879288437, "grad_norm": 6.511044998547078, "learning_rate": 1.459154929577465e-05, "loss": 0.9147, "step": 260 }, { "epoch": 0.34307496823379924, "grad_norm": 7.2764171631878405, "learning_rate": 1.515492957746479e-05, "loss": 0.9637, "step": 270 }, { "epoch": 0.35578144853875476, "grad_norm": 7.6464115804873405, "learning_rate": 1.571830985915493e-05, "loss": 0.9521, "step": 280 }, { "epoch": 0.3684879288437103, "grad_norm": 7.027026272110715, "learning_rate": 1.6281690140845072e-05, "loss": 0.9738, "step": 290 }, { "epoch": 0.3811944091486658, "grad_norm": 7.0177338278091765, "learning_rate": 1.6845070422535213e-05, "loss": 1.0046, "step": 300 }, { "epoch": 0.39390088945362134, "grad_norm": 6.988788493899054, "learning_rate": 1.740845070422535e-05, "loss": 0.9481, "step": 310 }, { "epoch": 0.40660736975857686, "grad_norm": 6.592900845765771, "learning_rate": 1.7971830985915497e-05, "loss": 0.9588, "step": 320 }, { "epoch": 0.4193138500635324, "grad_norm": 5.836583770100009, "learning_rate": 1.8535211267605635e-05, "loss": 0.9404, "step": 330 }, { "epoch": 0.4320203303684879, "grad_norm": 6.69521882642329, "learning_rate": 1.9098591549295776e-05, "loss": 0.9385, "step": 340 }, { "epoch": 0.44472681067344344, "grad_norm": 5.945281536047344, "learning_rate": 1.9661971830985918e-05, "loss": 0.9394, "step": 350 }, { "epoch": 0.45743329097839897, "grad_norm": 5.686864112756425, "learning_rate": 1.9999803787597817e-05, "loss": 0.9764, "step": 360 }, { "epoch": 0.4701397712833545, "grad_norm": 5.459918154381771, "learning_rate": 1.9997596486500402e-05, "loss": 0.9827, "step": 370 }, { "epoch": 0.48284625158831, "grad_norm": 5.164520519383337, "learning_rate": 1.999293716197302e-05, "loss": 0.9291, "step": 380 }, { "epoch": 0.49555273189326554, "grad_norm": 6.177104322189275, "learning_rate": 1.998582695676762e-05, "loss": 1.0364, "step": 390 }, { "epoch": 0.5082592121982211, "grad_norm": 5.346131678285112, "learning_rate": 1.997626761474232e-05, "loss": 0.9728, "step": 400 }, { "epoch": 0.5209656925031766, "grad_norm": 5.075630186929998, "learning_rate": 1.99642614804337e-05, "loss": 0.957, "step": 410 }, { "epoch": 0.5336721728081322, "grad_norm": 5.766342613636245, "learning_rate": 1.9949811498481763e-05, "loss": 0.9856, "step": 420 }, { "epoch": 0.5463786531130876, "grad_norm": 4.773734435548506, "learning_rate": 1.9932921212907753e-05, "loss": 1.0065, "step": 430 }, { "epoch": 0.5590851334180432, "grad_norm": 4.8306675684219105, "learning_rate": 1.991359476624493e-05, "loss": 0.9175, "step": 440 }, { "epoch": 0.5717916137229987, "grad_norm": 4.8928892220094236, "learning_rate": 1.9891836898522566e-05, "loss": 0.9014, "step": 450 }, { "epoch": 0.5844980940279543, "grad_norm": 4.466687752161082, "learning_rate": 1.9867652946103413e-05, "loss": 0.9324, "step": 460 }, { "epoch": 0.5972045743329097, "grad_norm": 4.706203130518702, "learning_rate": 1.9841048840374885e-05, "loss": 0.9311, "step": 470 }, { "epoch": 0.6099110546378653, "grad_norm": 4.776570809357277, "learning_rate": 1.9812031106294314e-05, "loss": 0.952, "step": 480 }, { "epoch": 0.6226175349428208, "grad_norm": 5.443465949868151, "learning_rate": 1.978060686078866e-05, "loss": 0.9067, "step": 490 }, { "epoch": 0.6353240152477764, "grad_norm": 4.985579208130221, "learning_rate": 1.974678381100896e-05, "loss": 0.9559, "step": 500 }, { "epoch": 0.6480304955527318, "grad_norm": 4.406803792781764, "learning_rate": 1.9710570252440106e-05, "loss": 0.9082, "step": 510 }, { "epoch": 0.6607369758576874, "grad_norm": 4.620905177890474, "learning_rate": 1.9671975066866254e-05, "loss": 0.9241, "step": 520 }, { "epoch": 0.6734434561626429, "grad_norm": 3.9984494959651533, "learning_rate": 1.9631007720192475e-05, "loss": 0.8811, "step": 530 }, { "epoch": 0.6861499364675985, "grad_norm": 4.760193821922472, "learning_rate": 1.9587678260123146e-05, "loss": 0.9314, "step": 540 }, { "epoch": 0.6988564167725541, "grad_norm": 4.4328809578626895, "learning_rate": 1.9541997313697614e-05, "loss": 0.9018, "step": 550 }, { "epoch": 0.7115628970775095, "grad_norm": 3.785123304702001, "learning_rate": 1.9493976084683814e-05, "loss": 0.9349, "step": 560 }, { "epoch": 0.7242693773824651, "grad_norm": 4.623522007776074, "learning_rate": 1.9443626350830417e-05, "loss": 0.9283, "step": 570 }, { "epoch": 0.7369758576874206, "grad_norm": 4.367382745999128, "learning_rate": 1.9390960460978188e-05, "loss": 0.8936, "step": 580 }, { "epoch": 0.7496823379923762, "grad_norm": 4.640745350515662, "learning_rate": 1.933599133203131e-05, "loss": 0.9529, "step": 590 }, { "epoch": 0.7623888182973316, "grad_norm": 4.0946471225054974, "learning_rate": 1.9278732445789364e-05, "loss": 0.8961, "step": 600 }, { "epoch": 0.7750952986022872, "grad_norm": 4.250614479191838, "learning_rate": 1.9219197845640766e-05, "loss": 0.9028, "step": 610 }, { "epoch": 0.7878017789072427, "grad_norm": 4.147828875270731, "learning_rate": 1.9157402133118454e-05, "loss": 0.9302, "step": 620 }, { "epoch": 0.8005082592121983, "grad_norm": 5.568981020266887, "learning_rate": 1.909336046431871e-05, "loss": 0.9233, "step": 630 }, { "epoch": 0.8132147395171537, "grad_norm": 4.311517125711432, "learning_rate": 1.9027088546183968e-05, "loss": 0.9694, "step": 640 }, { "epoch": 0.8259212198221093, "grad_norm": 5.556351823725932, "learning_rate": 1.8958602632650474e-05, "loss": 0.9003, "step": 650 }, { "epoch": 0.8386277001270648, "grad_norm": 4.269905367926679, "learning_rate": 1.8887919520661867e-05, "loss": 0.8805, "step": 660 }, { "epoch": 0.8513341804320204, "grad_norm": 3.863721835826297, "learning_rate": 1.8815056546049505e-05, "loss": 0.9158, "step": 670 }, { "epoch": 0.8640406607369758, "grad_norm": 3.963824200874715, "learning_rate": 1.8740031579280667e-05, "loss": 0.8835, "step": 680 }, { "epoch": 0.8767471410419314, "grad_norm": 3.680960497113959, "learning_rate": 1.8662863021075632e-05, "loss": 0.898, "step": 690 }, { "epoch": 0.8894536213468869, "grad_norm": 3.7414803428899606, "learning_rate": 1.8583569797894673e-05, "loss": 0.9253, "step": 700 }, { "epoch": 0.9021601016518425, "grad_norm": 4.680988801232008, "learning_rate": 1.8502171357296144e-05, "loss": 0.848, "step": 710 }, { "epoch": 0.9148665819567979, "grad_norm": 3.9671267724005785, "learning_rate": 1.8418687663166745e-05, "loss": 0.8965, "step": 720 }, { "epoch": 0.9275730622617535, "grad_norm": 4.137039499686447, "learning_rate": 1.833313919082515e-05, "loss": 0.8553, "step": 730 }, { "epoch": 0.940279542566709, "grad_norm": 4.784766455706121, "learning_rate": 1.8245546922000207e-05, "loss": 0.8695, "step": 740 }, { "epoch": 0.9529860228716646, "grad_norm": 4.418195979726905, "learning_rate": 1.815593233968492e-05, "loss": 0.8497, "step": 750 }, { "epoch": 0.96569250317662, "grad_norm": 4.103893841492413, "learning_rate": 1.806431742286752e-05, "loss": 0.8746, "step": 760 }, { "epoch": 0.9783989834815756, "grad_norm": 3.798164417492566, "learning_rate": 1.7970724641140864e-05, "loss": 0.8708, "step": 770 }, { "epoch": 0.9911054637865311, "grad_norm": 4.623760315878684, "learning_rate": 1.7875176949191506e-05, "loss": 0.94, "step": 780 }, { "epoch": 1.0038119440914866, "grad_norm": 3.381126634985229, "learning_rate": 1.7777697781169813e-05, "loss": 0.8297, "step": 790 }, { "epoch": 1.0165184243964422, "grad_norm": 4.072031882597377, "learning_rate": 1.7678311044942464e-05, "loss": 0.6761, "step": 800 }, { "epoch": 1.0292249047013977, "grad_norm": 4.666218927514245, "learning_rate": 1.757704111622878e-05, "loss": 0.6868, "step": 810 }, { "epoch": 1.0419313850063532, "grad_norm": 3.010389554548932, "learning_rate": 1.747391283262231e-05, "loss": 0.6994, "step": 820 }, { "epoch": 1.0546378653113089, "grad_norm": 4.690411895539488, "learning_rate": 1.736895148749911e-05, "loss": 0.7141, "step": 830 }, { "epoch": 1.0673443456162643, "grad_norm": 4.135758513727204, "learning_rate": 1.7262182823814297e-05, "loss": 0.6941, "step": 840 }, { "epoch": 1.0800508259212198, "grad_norm": 4.378977675253243, "learning_rate": 1.7153633027788252e-05, "loss": 0.6662, "step": 850 }, { "epoch": 1.0927573062261753, "grad_norm": 3.8569291056754498, "learning_rate": 1.704332872248418e-05, "loss": 0.6575, "step": 860 }, { "epoch": 1.105463786531131, "grad_norm": 3.9650953005920666, "learning_rate": 1.69312969612785e-05, "loss": 0.6959, "step": 870 }, { "epoch": 1.1181702668360864, "grad_norm": 3.349386344864765, "learning_rate": 1.6817565221225698e-05, "loss": 0.6701, "step": 880 }, { "epoch": 1.130876747141042, "grad_norm": 4.530446985368436, "learning_rate": 1.6702161396319266e-05, "loss": 0.7168, "step": 890 }, { "epoch": 1.1435832274459974, "grad_norm": 4.048659358174538, "learning_rate": 1.658511379065039e-05, "loss": 0.7087, "step": 900 }, { "epoch": 1.156289707750953, "grad_norm": 3.897340539186477, "learning_rate": 1.6466451111466044e-05, "loss": 0.7509, "step": 910 }, { "epoch": 1.1689961880559085, "grad_norm": 3.106349799248209, "learning_rate": 1.6346202462128228e-05, "loss": 0.6793, "step": 920 }, { "epoch": 1.181702668360864, "grad_norm": 3.7338218401998753, "learning_rate": 1.6224397334976023e-05, "loss": 0.7172, "step": 930 }, { "epoch": 1.1944091486658195, "grad_norm": 5.11718627522725, "learning_rate": 1.610106560409227e-05, "loss": 0.6759, "step": 940 }, { "epoch": 1.2071156289707752, "grad_norm": 3.6889308944466177, "learning_rate": 1.597623751797662e-05, "loss": 0.6822, "step": 950 }, { "epoch": 1.2198221092757306, "grad_norm": 3.6223318506400135, "learning_rate": 1.584994369212673e-05, "loss": 0.7034, "step": 960 }, { "epoch": 1.2325285895806861, "grad_norm": 3.3333910693718662, "learning_rate": 1.572221510152949e-05, "loss": 0.767, "step": 970 }, { "epoch": 1.2452350698856416, "grad_norm": 4.265447578007238, "learning_rate": 1.5593083073064037e-05, "loss": 0.7358, "step": 980 }, { "epoch": 1.2579415501905973, "grad_norm": 3.874622904654225, "learning_rate": 1.5462579277818498e-05, "loss": 0.7336, "step": 990 }, { "epoch": 1.2706480304955527, "grad_norm": 3.925758808832438, "learning_rate": 1.5330735723322282e-05, "loss": 0.7102, "step": 1000 }, { "epoch": 1.2833545108005082, "grad_norm": 4.212874894353556, "learning_rate": 1.5197584745695904e-05, "loss": 0.7053, "step": 1010 }, { "epoch": 1.2960609911054637, "grad_norm": 3.7288496569236154, "learning_rate": 1.506315900172014e-05, "loss": 0.7223, "step": 1020 }, { "epoch": 1.3087674714104194, "grad_norm": 3.79413472563588, "learning_rate": 1.4927491460826626e-05, "loss": 0.7185, "step": 1030 }, { "epoch": 1.3214739517153749, "grad_norm": 4.197391869723048, "learning_rate": 1.4790615397011703e-05, "loss": 0.6293, "step": 1040 }, { "epoch": 1.3341804320203303, "grad_norm": 3.3274802014296254, "learning_rate": 1.4652564380675616e-05, "loss": 0.7111, "step": 1050 }, { "epoch": 1.346886912325286, "grad_norm": 3.984633199779957, "learning_rate": 1.4513372270388967e-05, "loss": 0.6926, "step": 1060 }, { "epoch": 1.3595933926302415, "grad_norm": 4.32141196403412, "learning_rate": 1.4373073204588556e-05, "loss": 0.7126, "step": 1070 }, { "epoch": 1.372299872935197, "grad_norm": 3.7790442182857302, "learning_rate": 1.42317015932045e-05, "loss": 0.6873, "step": 1080 }, { "epoch": 1.3850063532401524, "grad_norm": 4.2661658978513355, "learning_rate": 1.4089292109220852e-05, "loss": 0.7642, "step": 1090 }, { "epoch": 1.397712833545108, "grad_norm": 4.2591149854567645, "learning_rate": 1.394587968017162e-05, "loss": 0.6799, "step": 1100 }, { "epoch": 1.4104193138500636, "grad_norm": 3.689601844022756, "learning_rate": 1.3801499479574431e-05, "loss": 0.6536, "step": 1110 }, { "epoch": 1.423125794155019, "grad_norm": 4.289242494025662, "learning_rate": 1.3656186918303804e-05, "loss": 0.7092, "step": 1120 }, { "epoch": 1.4358322744599745, "grad_norm": 3.891766076099888, "learning_rate": 1.3509977635906241e-05, "loss": 0.6536, "step": 1130 }, { "epoch": 1.4485387547649302, "grad_norm": 3.4313665664745465, "learning_rate": 1.3362907491859227e-05, "loss": 0.6474, "step": 1140 }, { "epoch": 1.4612452350698857, "grad_norm": 4.303628344639665, "learning_rate": 1.3215012556776287e-05, "loss": 0.715, "step": 1150 }, { "epoch": 1.4739517153748412, "grad_norm": 4.009317272354951, "learning_rate": 1.3066329103560267e-05, "loss": 0.715, "step": 1160 }, { "epoch": 1.4866581956797966, "grad_norm": 3.171330560062687, "learning_rate": 1.2916893598506981e-05, "loss": 0.6217, "step": 1170 }, { "epoch": 1.499364675984752, "grad_norm": 3.3926952435565676, "learning_rate": 1.276674269236145e-05, "loss": 0.7366, "step": 1180 }, { "epoch": 1.5120711562897078, "grad_norm": 3.8316403134343537, "learning_rate": 1.2615913211328894e-05, "loss": 0.6939, "step": 1190 }, { "epoch": 1.5247776365946633, "grad_norm": 4.868361745818093, "learning_rate": 1.2464442148042679e-05, "loss": 0.6919, "step": 1200 }, { "epoch": 1.537484116899619, "grad_norm": 3.5185484888328644, "learning_rate": 1.2312366652491476e-05, "loss": 0.6791, "step": 1210 }, { "epoch": 1.5501905972045744, "grad_norm": 3.543401291583064, "learning_rate": 1.2159724022907786e-05, "loss": 0.6574, "step": 1220 }, { "epoch": 1.5628970775095299, "grad_norm": 3.6437779582291063, "learning_rate": 1.2006551696620135e-05, "loss": 0.701, "step": 1230 }, { "epoch": 1.5756035578144854, "grad_norm": 3.2559101294982025, "learning_rate": 1.1852887240871145e-05, "loss": 0.6546, "step": 1240 }, { "epoch": 1.5883100381194408, "grad_norm": 3.9272330209126634, "learning_rate": 1.1698768343603753e-05, "loss": 0.6643, "step": 1250 }, { "epoch": 1.6010165184243963, "grad_norm": 4.624643945291569, "learning_rate": 1.1544232804217805e-05, "loss": 0.6982, "step": 1260 }, { "epoch": 1.613722998729352, "grad_norm": 3.7368581014964803, "learning_rate": 1.1389318524299332e-05, "loss": 0.6591, "step": 1270 }, { "epoch": 1.6264294790343075, "grad_norm": 3.4323757873137177, "learning_rate": 1.1234063498324764e-05, "loss": 0.6743, "step": 1280 }, { "epoch": 1.6391359593392631, "grad_norm": 4.208550713330492, "learning_rate": 1.1078505804342327e-05, "loss": 0.7147, "step": 1290 }, { "epoch": 1.6518424396442186, "grad_norm": 2.978768874310465, "learning_rate": 1.092268359463302e-05, "loss": 0.671, "step": 1300 }, { "epoch": 1.664548919949174, "grad_norm": 3.5924777944521606, "learning_rate": 1.0766635086353298e-05, "loss": 0.6713, "step": 1310 }, { "epoch": 1.6772554002541296, "grad_norm": 3.495623048824376, "learning_rate": 1.06103985521619e-05, "loss": 0.6629, "step": 1320 }, { "epoch": 1.689961880559085, "grad_norm": 4.086638389260075, "learning_rate": 1.0454012310833034e-05, "loss": 0.7035, "step": 1330 }, { "epoch": 1.7026683608640405, "grad_norm": 3.475772078501932, "learning_rate": 1.0297514717858286e-05, "loss": 0.6631, "step": 1340 }, { "epoch": 1.7153748411689962, "grad_norm": 3.5510342885210164, "learning_rate": 1.0140944156039481e-05, "loss": 0.685, "step": 1350 }, { "epoch": 1.7280813214739519, "grad_norm": 3.5594852661382634, "learning_rate": 9.984339026074881e-06, "loss": 0.6549, "step": 1360 }, { "epoch": 1.7407878017789074, "grad_norm": 3.3395635194008415, "learning_rate": 9.827737737140983e-06, "loss": 0.6467, "step": 1370 }, { "epoch": 1.7534942820838628, "grad_norm": 3.219821540782638, "learning_rate": 9.671178697472217e-06, "loss": 0.6543, "step": 1380 }, { "epoch": 1.7662007623888183, "grad_norm": 3.384594388965041, "learning_rate": 9.514700304940901e-06, "loss": 0.6922, "step": 1390 }, { "epoch": 1.7789072426937738, "grad_norm": 3.64590250632275, "learning_rate": 9.358340937639746e-06, "loss": 0.6557, "step": 1400 }, { "epoch": 1.7916137229987292, "grad_norm": 3.765353121248252, "learning_rate": 9.202138944469168e-06, "loss": 0.688, "step": 1410 }, { "epoch": 1.804320203303685, "grad_norm": 3.7449398399867624, "learning_rate": 9.046132635731816e-06, "loss": 0.6675, "step": 1420 }, { "epoch": 1.8170266836086404, "grad_norm": 3.942030599345544, "learning_rate": 8.890360273736504e-06, "loss": 0.6584, "step": 1430 }, { "epoch": 1.829733163913596, "grad_norm": 4.037931457583538, "learning_rate": 8.734860063413974e-06, "loss": 0.6735, "step": 1440 }, { "epoch": 1.8424396442185516, "grad_norm": 3.6205476660211247, "learning_rate": 8.579670142946701e-06, "loss": 0.7102, "step": 1450 }, { "epoch": 1.855146124523507, "grad_norm": 3.821487835967331, "learning_rate": 8.42482857441506e-06, "loss": 0.6749, "step": 1460 }, { "epoch": 1.8678526048284625, "grad_norm": 3.3623194464637574, "learning_rate": 8.270373334462193e-06, "loss": 0.672, "step": 1470 }, { "epoch": 1.880559085133418, "grad_norm": 4.020841970961885, "learning_rate": 8.116342304979783e-06, "loss": 0.6863, "step": 1480 }, { "epoch": 1.8932655654383734, "grad_norm": 4.08254040286643, "learning_rate": 7.962773263817114e-06, "loss": 0.6815, "step": 1490 }, { "epoch": 1.9059720457433291, "grad_norm": 4.148274894889353, "learning_rate": 7.809703875515613e-06, "loss": 0.6417, "step": 1500 }, { "epoch": 1.9186785260482846, "grad_norm": 4.640824882446659, "learning_rate": 7.657171682071198e-06, "loss": 0.62, "step": 1510 }, { "epoch": 1.9313850063532403, "grad_norm": 4.7797510297359835, "learning_rate": 7.505214093726692e-06, "loss": 0.6439, "step": 1520 }, { "epoch": 1.9440914866581958, "grad_norm": 3.613563186875674, "learning_rate": 7.353868379796518e-06, "loss": 0.6705, "step": 1530 }, { "epoch": 1.9567979669631512, "grad_norm": 3.271201239131824, "learning_rate": 7.203171659526e-06, "loss": 0.6324, "step": 1540 }, { "epoch": 1.9695044472681067, "grad_norm": 3.89489541610708, "learning_rate": 7.053160892987434e-06, "loss": 0.6757, "step": 1550 }, { "epoch": 1.9822109275730622, "grad_norm": 3.701828258351079, "learning_rate": 6.903872872015209e-06, "loss": 0.6456, "step": 1560 }, { "epoch": 1.9949174078780176, "grad_norm": 3.5373164710070957, "learning_rate": 6.755344211182221e-06, "loss": 0.6166, "step": 1570 }, { "epoch": 2.007623888182973, "grad_norm": 2.425760113176382, "learning_rate": 6.607611338819697e-06, "loss": 0.5016, "step": 1580 }, { "epoch": 2.020330368487929, "grad_norm": 3.427501282817139, "learning_rate": 6.460710488082774e-06, "loss": 0.374, "step": 1590 }, { "epoch": 2.0330368487928845, "grad_norm": 3.4855149165350636, "learning_rate": 6.31467768806388e-06, "loss": 0.3524, "step": 1600 }, { "epoch": 2.04574332909784, "grad_norm": 3.5473678303457996, "learning_rate": 6.169548754956201e-06, "loss": 0.3485, "step": 1610 }, { "epoch": 2.0584498094027954, "grad_norm": 3.2554977371598466, "learning_rate": 6.025359283269363e-06, "loss": 0.348, "step": 1620 }, { "epoch": 2.071156289707751, "grad_norm": 3.4222657332943376, "learning_rate": 5.882144637099465e-06, "loss": 0.3753, "step": 1630 }, { "epoch": 2.0838627700127064, "grad_norm": 2.9777568505895675, "learning_rate": 5.739939941455644e-06, "loss": 0.3526, "step": 1640 }, { "epoch": 2.096569250317662, "grad_norm": 3.7955516489911805, "learning_rate": 5.598780073645267e-06, "loss": 0.3543, "step": 1650 }, { "epoch": 2.1092757306226178, "grad_norm": 3.8406500166667885, "learning_rate": 5.458699654719873e-06, "loss": 0.3642, "step": 1660 }, { "epoch": 2.121982210927573, "grad_norm": 3.813395969645494, "learning_rate": 5.319733040983972e-06, "loss": 0.3428, "step": 1670 }, { "epoch": 2.1346886912325287, "grad_norm": 3.7266891839301763, "learning_rate": 5.181914315568782e-06, "loss": 0.3403, "step": 1680 }, { "epoch": 2.147395171537484, "grad_norm": 3.688709734552298, "learning_rate": 5.0452772800729375e-06, "loss": 0.3469, "step": 1690 }, { "epoch": 2.1601016518424396, "grad_norm": 3.6629109337292403, "learning_rate": 4.909855446272288e-06, "loss": 0.3454, "step": 1700 }, { "epoch": 2.172808132147395, "grad_norm": 3.7085182263998555, "learning_rate": 4.775682027900739e-06, "loss": 0.341, "step": 1710 }, { "epoch": 2.1855146124523506, "grad_norm": 3.481723946532174, "learning_rate": 4.6427899325042135e-06, "loss": 0.3352, "step": 1720 }, { "epoch": 2.198221092757306, "grad_norm": 3.2839395610983027, "learning_rate": 4.511211753369712e-06, "loss": 0.3447, "step": 1730 }, { "epoch": 2.210927573062262, "grad_norm": 3.6755308055006464, "learning_rate": 4.380979761531431e-06, "loss": 0.3531, "step": 1740 }, { "epoch": 2.2236340533672174, "grad_norm": 3.7905960831955916, "learning_rate": 4.2521258978559324e-06, "loss": 0.356, "step": 1750 }, { "epoch": 2.236340533672173, "grad_norm": 3.627875927246556, "learning_rate": 4.124681765208286e-06, "loss": 0.3266, "step": 1760 }, { "epoch": 2.2490470139771284, "grad_norm": 3.3246186092589447, "learning_rate": 3.998678620701102e-06, "loss": 0.3386, "step": 1770 }, { "epoch": 2.261753494282084, "grad_norm": 3.804007286983282, "learning_rate": 3.874147368028396e-06, "loss": 0.3544, "step": 1780 }, { "epoch": 2.2744599745870393, "grad_norm": 3.143040423820396, "learning_rate": 3.751118549886065e-06, "loss": 0.3227, "step": 1790 }, { "epoch": 2.2871664548919948, "grad_norm": 3.352132852945674, "learning_rate": 3.6296223404809903e-06, "loss": 0.3399, "step": 1800 }, { "epoch": 2.2998729351969507, "grad_norm": 4.043987038976339, "learning_rate": 3.509688538130448e-06, "loss": 0.3369, "step": 1810 }, { "epoch": 2.312579415501906, "grad_norm": 3.954856965708331, "learning_rate": 3.39134655795374e-06, "loss": 0.341, "step": 1820 }, { "epoch": 2.3252858958068616, "grad_norm": 3.5214147520563626, "learning_rate": 3.2746254246578167e-06, "loss": 0.3365, "step": 1830 }, { "epoch": 2.337992376111817, "grad_norm": 3.218428553726758, "learning_rate": 3.1595537654186114e-06, "loss": 0.3546, "step": 1840 }, { "epoch": 2.3506988564167726, "grad_norm": 3.163287967416541, "learning_rate": 3.0461598028599305e-06, "loss": 0.3431, "step": 1850 }, { "epoch": 2.363405336721728, "grad_norm": 3.0988204272069573, "learning_rate": 2.9344713481315225e-06, "loss": 0.3303, "step": 1860 }, { "epoch": 2.3761118170266835, "grad_norm": 3.9034586935786395, "learning_rate": 2.8245157940880784e-06, "loss": 0.3337, "step": 1870 }, { "epoch": 2.388818297331639, "grad_norm": 3.5690630552722786, "learning_rate": 2.7163201085708424e-06, "loss": 0.3223, "step": 1880 }, { "epoch": 2.4015247776365944, "grad_norm": 3.163806174642701, "learning_rate": 2.6099108277934105e-06, "loss": 0.3398, "step": 1890 }, { "epoch": 2.4142312579415504, "grad_norm": 3.7465583268537275, "learning_rate": 2.505314049833457e-06, "loss": 0.3483, "step": 1900 }, { "epoch": 2.426937738246506, "grad_norm": 3.516374761436456, "learning_rate": 2.402555428231872e-06, "loss": 0.3273, "step": 1910 }, { "epoch": 2.4396442185514613, "grad_norm": 3.5353549798113284, "learning_rate": 2.3016601657009364e-06, "loss": 0.3374, "step": 1920 }, { "epoch": 2.4523506988564168, "grad_norm": 3.357432157861631, "learning_rate": 2.202653007943093e-06, "loss": 0.3464, "step": 1930 }, { "epoch": 2.4650571791613722, "grad_norm": 3.6506743298663675, "learning_rate": 2.1055582375817475e-06, "loss": 0.325, "step": 1940 }, { "epoch": 2.4777636594663277, "grad_norm": 3.907282101797735, "learning_rate": 2.0103996682057235e-06, "loss": 0.3255, "step": 1950 }, { "epoch": 2.490470139771283, "grad_norm": 3.711785490906897, "learning_rate": 1.9172006385286723e-06, "loss": 0.3391, "step": 1960 }, { "epoch": 2.503176620076239, "grad_norm": 3.2473323176322135, "learning_rate": 1.8259840066650136e-06, "loss": 0.3389, "step": 1970 }, { "epoch": 2.5158831003811946, "grad_norm": 3.6433209864443916, "learning_rate": 1.7367721445237285e-06, "loss": 0.3258, "step": 1980 }, { "epoch": 2.52858958068615, "grad_norm": 4.12961794749056, "learning_rate": 1.6495869323213654e-06, "loss": 0.3185, "step": 1990 }, { "epoch": 2.5412960609911055, "grad_norm": 4.1376649833602865, "learning_rate": 1.564449753215711e-06, "loss": 0.3247, "step": 2000 }, { "epoch": 2.554002541296061, "grad_norm": 4.441583691608097, "learning_rate": 1.4813814880612942e-06, "loss": 0.3198, "step": 2010 }, { "epoch": 2.5667090216010164, "grad_norm": 2.7847992176083047, "learning_rate": 1.4004025102881402e-06, "loss": 0.3143, "step": 2020 }, { "epoch": 2.579415501905972, "grad_norm": 3.1337243741428473, "learning_rate": 1.321532680904959e-06, "loss": 0.3312, "step": 2030 }, { "epoch": 2.5921219822109274, "grad_norm": 3.3653692372757225, "learning_rate": 1.2447913436279879e-06, "loss": 0.3129, "step": 2040 }, { "epoch": 2.604828462515883, "grad_norm": 3.7337029805672635, "learning_rate": 1.1701973201367544e-06, "loss": 0.3253, "step": 2050 }, { "epoch": 2.6175349428208388, "grad_norm": 4.214904637605022, "learning_rate": 1.09776890545782e-06, "loss": 0.3531, "step": 2060 }, { "epoch": 2.6302414231257942, "grad_norm": 3.3613800857078764, "learning_rate": 1.0275238634777441e-06, "loss": 0.3105, "step": 2070 }, { "epoch": 2.6429479034307497, "grad_norm": 3.7555272174929595, "learning_rate": 9.594794225862692e-07, "loss": 0.3331, "step": 2080 }, { "epoch": 2.655654383735705, "grad_norm": 3.6364434124366, "learning_rate": 8.936522714508678e-07, "loss": 0.3336, "step": 2090 }, { "epoch": 2.6683608640406606, "grad_norm": 3.684690863431174, "learning_rate": 8.300585549236773e-07, "loss": 0.3232, "step": 2100 }, { "epoch": 2.681067344345616, "grad_norm": 4.307296056522447, "learning_rate": 7.687138700817598e-07, "loss": 0.3165, "step": 2110 }, { "epoch": 2.693773824650572, "grad_norm": 3.418062959790304, "learning_rate": 7.096332624017755e-07, "loss": 0.3126, "step": 2120 }, { "epoch": 2.7064803049555275, "grad_norm": 3.5508346766299397, "learning_rate": 6.528312220698885e-07, "loss": 0.3303, "step": 2130 }, { "epoch": 2.719186785260483, "grad_norm": 4.518044935525217, "learning_rate": 5.983216804278869e-07, "loss": 0.3191, "step": 2140 }, { "epoch": 2.7318932655654384, "grad_norm": 3.2181606746612044, "learning_rate": 5.461180065563787e-07, "loss": 0.3059, "step": 2150 }, { "epoch": 2.744599745870394, "grad_norm": 3.8710207257234144, "learning_rate": 4.962330039958585e-07, "loss": 0.3194, "step": 2160 }, { "epoch": 2.7573062261753494, "grad_norm": 3.6523179671049215, "learning_rate": 4.486789076064968e-07, "loss": 0.3148, "step": 2170 }, { "epoch": 2.770012706480305, "grad_norm": 3.5451882545264053, "learning_rate": 4.034673805674116e-07, "loss": 0.3285, "step": 2180 }, { "epoch": 2.7827191867852603, "grad_norm": 3.0920265120158827, "learning_rate": 3.606095115161279e-07, "loss": 0.3172, "step": 2190 }, { "epoch": 2.795425667090216, "grad_norm": 3.496745467546405, "learning_rate": 3.201158118289793e-07, "loss": 0.3183, "step": 2200 }, { "epoch": 2.8081321473951717, "grad_norm": 3.361865186046809, "learning_rate": 2.8199621304306425e-07, "loss": 0.3209, "step": 2210 }, { "epoch": 2.820838627700127, "grad_norm": 3.951234253131063, "learning_rate": 2.46260064420426e-07, "loss": 0.3165, "step": 2220 }, { "epoch": 2.8335451080050826, "grad_norm": 4.033338515766138, "learning_rate": 2.1291613065504313e-07, "loss": 0.3233, "step": 2230 }, { "epoch": 2.846251588310038, "grad_norm": 3.8424683095800303, "learning_rate": 1.819725897231872e-07, "loss": 0.318, "step": 2240 }, { "epoch": 2.8589580686149936, "grad_norm": 3.6002718533091, "learning_rate": 1.5343703087768225e-07, "loss": 0.3323, "step": 2250 }, { "epoch": 2.871664548919949, "grad_norm": 3.392955644057836, "learning_rate": 1.2731645278655448e-07, "loss": 0.3088, "step": 2260 }, { "epoch": 2.884371029224905, "grad_norm": 3.9060628771881842, "learning_rate": 1.0361726181653209e-07, "loss": 0.3213, "step": 2270 }, { "epoch": 2.8970775095298604, "grad_norm": 3.6937829161145515, "learning_rate": 8.234527046180885e-08, "loss": 0.3193, "step": 2280 }, { "epoch": 2.909783989834816, "grad_norm": 3.7501161450474036, "learning_rate": 6.350569591846434e-08, "loss": 0.3334, "step": 2290 }, { "epoch": 2.9224904701397714, "grad_norm": 3.4784424672902605, "learning_rate": 4.710315880489091e-08, "loss": 0.3273, "step": 2300 }, { "epoch": 2.935196950444727, "grad_norm": 3.4126846620318707, "learning_rate": 3.31416820285313e-08, "loss": 0.3177, "step": 2310 }, { "epoch": 2.9479034307496823, "grad_norm": 3.9848361195981785, "learning_rate": 2.1624689799214503e-08, "loss": 0.322, "step": 2320 }, { "epoch": 2.9606099110546378, "grad_norm": 3.6028548305792203, "learning_rate": 1.2555006789334301e-08, "loss": 0.3038, "step": 2330 }, { "epoch": 2.9733163913595932, "grad_norm": 4.107748523282802, "learning_rate": 5.934857441062258e-09, "loss": 0.313, "step": 2340 }, { "epoch": 2.9860228716645487, "grad_norm": 4.057112734457165, "learning_rate": 1.765865420779722e-09, "loss": 0.315, "step": 2350 }, { "epoch": 2.998729351969504, "grad_norm": 3.491454630886239, "learning_rate": 4.9053220856354335e-11, "loss": 0.328, "step": 2360 }, { "epoch": 3.0, "step": 2361, "total_flos": 121880697913344.0, "train_loss": 0.6645450910134418, "train_runtime": 16043.0595, "train_samples_per_second": 1.177, "train_steps_per_second": 0.147 } ], "logging_steps": 10, "max_steps": 2361, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 121880697913344.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }