{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3951, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 1.0227874517440796, "eval_runtime": 1025.9766, "eval_samples_per_second": 41.442, "eval_steps_per_second": 0.648, "step": 0 }, { "epoch": 0.0002531004808909137, "grad_norm": 1.5422555208206177, "learning_rate": 0.0, "loss": 0.7739, "step": 1 }, { "epoch": 0.0005062009617818274, "grad_norm": 1.7739096879959106, "learning_rate": 2.0000000000000002e-07, "loss": 0.7967, "step": 2 }, { "epoch": 0.0007593014426727411, "grad_norm": 1.5074769258499146, "learning_rate": 4.0000000000000003e-07, "loss": 0.7494, "step": 3 }, { "epoch": 0.0010124019235636548, "grad_norm": 1.5649176836013794, "learning_rate": 6.000000000000001e-07, "loss": 0.7765, "step": 4 }, { "epoch": 0.0012655024044545685, "grad_norm": 1.6183850765228271, "learning_rate": 8.000000000000001e-07, "loss": 0.7806, "step": 5 }, { "epoch": 0.0015186028853454822, "grad_norm": 1.4134117364883423, "learning_rate": 1.0000000000000002e-06, "loss": 0.7581, "step": 6 }, { "epoch": 0.001771703366236396, "grad_norm": 1.2370158433914185, "learning_rate": 1.2000000000000002e-06, "loss": 0.7565, "step": 7 }, { "epoch": 0.0020248038471273096, "grad_norm": 1.086237907409668, "learning_rate": 1.4000000000000001e-06, "loss": 0.7564, "step": 8 }, { "epoch": 0.002277904328018223, "grad_norm": 0.9099379777908325, "learning_rate": 1.6000000000000001e-06, "loss": 0.7606, "step": 9 }, { "epoch": 0.002531004808909137, "grad_norm": 0.7214070558547974, "learning_rate": 1.8000000000000001e-06, "loss": 0.7334, "step": 10 }, { "epoch": 0.0027841052898000505, "grad_norm": 0.5764913558959961, "learning_rate": 2.0000000000000003e-06, "loss": 0.7254, "step": 11 }, { "epoch": 0.0030372057706909645, "grad_norm": 0.4785311222076416, "learning_rate": 2.2e-06, "loss": 0.7321, "step": 12 }, { "epoch": 0.003290306251581878, "grad_norm": 0.39615628123283386, "learning_rate": 2.4000000000000003e-06, "loss": 0.7124, "step": 13 }, { "epoch": 0.003543406732472792, "grad_norm": 0.3793116509914398, "learning_rate": 2.6e-06, "loss": 0.6932, "step": 14 }, { "epoch": 0.0037965072133637054, "grad_norm": 0.3727288842201233, "learning_rate": 2.8000000000000003e-06, "loss": 0.7294, "step": 15 }, { "epoch": 0.004049607694254619, "grad_norm": 0.40745407342910767, "learning_rate": 3e-06, "loss": 0.705, "step": 16 }, { "epoch": 0.004302708175145533, "grad_norm": 0.42191222310066223, "learning_rate": 3.2000000000000003e-06, "loss": 0.7075, "step": 17 }, { "epoch": 0.004555808656036446, "grad_norm": 0.40793243050575256, "learning_rate": 3.4000000000000005e-06, "loss": 0.7056, "step": 18 }, { "epoch": 0.00480890913692736, "grad_norm": 0.40160873532295227, "learning_rate": 3.6000000000000003e-06, "loss": 0.72, "step": 19 }, { "epoch": 0.005062009617818274, "grad_norm": 0.3526245951652527, "learning_rate": 3.8000000000000005e-06, "loss": 0.7102, "step": 20 }, { "epoch": 0.005315110098709187, "grad_norm": 0.3622744381427765, "learning_rate": 4.000000000000001e-06, "loss": 0.6996, "step": 21 }, { "epoch": 0.005568210579600101, "grad_norm": 0.3431853950023651, "learning_rate": 4.2000000000000004e-06, "loss": 0.6969, "step": 22 }, { "epoch": 0.005821311060491015, "grad_norm": 0.28951337933540344, "learning_rate": 4.4e-06, "loss": 0.7094, "step": 23 }, { "epoch": 0.006074411541381929, "grad_norm": 0.28734734654426575, "learning_rate": 4.600000000000001e-06, "loss": 0.6972, "step": 24 }, { "epoch": 0.006327512022272842, "grad_norm": 0.20745716989040375, "learning_rate": 4.800000000000001e-06, "loss": 0.6743, "step": 25 }, { "epoch": 0.006580612503163756, "grad_norm": 0.20546671748161316, "learning_rate": 5e-06, "loss": 0.6905, "step": 26 }, { "epoch": 0.00683371298405467, "grad_norm": 0.2157692164182663, "learning_rate": 5.2e-06, "loss": 0.6685, "step": 27 }, { "epoch": 0.007086813464945584, "grad_norm": 0.20869505405426025, "learning_rate": 5.400000000000001e-06, "loss": 0.6728, "step": 28 }, { "epoch": 0.007339913945836497, "grad_norm": 0.22692961990833282, "learning_rate": 5.600000000000001e-06, "loss": 0.6878, "step": 29 }, { "epoch": 0.007593014426727411, "grad_norm": 0.2152116298675537, "learning_rate": 5.8e-06, "loss": 0.6974, "step": 30 }, { "epoch": 0.007846114907618324, "grad_norm": 0.21653513610363007, "learning_rate": 6e-06, "loss": 0.6917, "step": 31 }, { "epoch": 0.008099215388509239, "grad_norm": 0.22303710877895355, "learning_rate": 6.200000000000001e-06, "loss": 0.7065, "step": 32 }, { "epoch": 0.008352315869400152, "grad_norm": 0.21773776412010193, "learning_rate": 6.4000000000000006e-06, "loss": 0.6777, "step": 33 }, { "epoch": 0.008605416350291066, "grad_norm": 0.19383567571640015, "learning_rate": 6.600000000000001e-06, "loss": 0.7071, "step": 34 }, { "epoch": 0.00885851683118198, "grad_norm": 0.17862704396247864, "learning_rate": 6.800000000000001e-06, "loss": 0.6903, "step": 35 }, { "epoch": 0.009111617312072893, "grad_norm": 0.18009145557880402, "learning_rate": 7e-06, "loss": 0.6597, "step": 36 }, { "epoch": 0.009364717792963807, "grad_norm": 0.28171759843826294, "learning_rate": 7.2000000000000005e-06, "loss": 0.6641, "step": 37 }, { "epoch": 0.00961781827385472, "grad_norm": 0.18906806409358978, "learning_rate": 7.4e-06, "loss": 0.6988, "step": 38 }, { "epoch": 0.009870918754745633, "grad_norm": 0.24441087245941162, "learning_rate": 7.600000000000001e-06, "loss": 0.6586, "step": 39 }, { "epoch": 0.010124019235636548, "grad_norm": 0.19522865116596222, "learning_rate": 7.800000000000002e-06, "loss": 0.6526, "step": 40 }, { "epoch": 0.010377119716527461, "grad_norm": 0.21378129720687866, "learning_rate": 8.000000000000001e-06, "loss": 0.6883, "step": 41 }, { "epoch": 0.010630220197418374, "grad_norm": 0.1778658777475357, "learning_rate": 8.2e-06, "loss": 0.6701, "step": 42 }, { "epoch": 0.010883320678309289, "grad_norm": 0.16810527443885803, "learning_rate": 8.400000000000001e-06, "loss": 0.6445, "step": 43 }, { "epoch": 0.011136421159200202, "grad_norm": 0.18885721266269684, "learning_rate": 8.6e-06, "loss": 0.6977, "step": 44 }, { "epoch": 0.011389521640091117, "grad_norm": 0.19532494246959686, "learning_rate": 8.8e-06, "loss": 0.6694, "step": 45 }, { "epoch": 0.01164262212098203, "grad_norm": 0.1723746508359909, "learning_rate": 9e-06, "loss": 0.6703, "step": 46 }, { "epoch": 0.011895722601872943, "grad_norm": 0.18233156204223633, "learning_rate": 9.200000000000002e-06, "loss": 0.6674, "step": 47 }, { "epoch": 0.012148823082763858, "grad_norm": 0.18409250676631927, "learning_rate": 9.4e-06, "loss": 0.6562, "step": 48 }, { "epoch": 0.012401923563654771, "grad_norm": 0.1799033135175705, "learning_rate": 9.600000000000001e-06, "loss": 0.6391, "step": 49 }, { "epoch": 0.012655024044545684, "grad_norm": 0.17093200981616974, "learning_rate": 9.800000000000001e-06, "loss": 0.6609, "step": 50 }, { "epoch": 0.012908124525436599, "grad_norm": 0.16845975816249847, "learning_rate": 1e-05, "loss": 0.6591, "step": 51 }, { "epoch": 0.013161225006327512, "grad_norm": 0.16641423106193542, "learning_rate": 1.02e-05, "loss": 0.6759, "step": 52 }, { "epoch": 0.013414325487218427, "grad_norm": 0.19389550387859344, "learning_rate": 1.04e-05, "loss": 0.6882, "step": 53 }, { "epoch": 0.01366742596810934, "grad_norm": 0.16377364099025726, "learning_rate": 1.0600000000000002e-05, "loss": 0.6783, "step": 54 }, { "epoch": 0.013920526449000253, "grad_norm": 0.1614464521408081, "learning_rate": 1.0800000000000002e-05, "loss": 0.6597, "step": 55 }, { "epoch": 0.014173626929891167, "grad_norm": 0.18549728393554688, "learning_rate": 1.1000000000000001e-05, "loss": 0.6742, "step": 56 }, { "epoch": 0.01442672741078208, "grad_norm": 0.16043926775455475, "learning_rate": 1.1200000000000001e-05, "loss": 0.6525, "step": 57 }, { "epoch": 0.014679827891672994, "grad_norm": 0.1643228679895401, "learning_rate": 1.14e-05, "loss": 0.6534, "step": 58 }, { "epoch": 0.014932928372563908, "grad_norm": 0.1760501265525818, "learning_rate": 1.16e-05, "loss": 0.6384, "step": 59 }, { "epoch": 0.015186028853454821, "grad_norm": 0.1705779880285263, "learning_rate": 1.18e-05, "loss": 0.6611, "step": 60 }, { "epoch": 0.015439129334345734, "grad_norm": 0.16293965280056, "learning_rate": 1.2e-05, "loss": 0.6395, "step": 61 }, { "epoch": 0.015692229815236648, "grad_norm": 0.1599552184343338, "learning_rate": 1.22e-05, "loss": 0.6309, "step": 62 }, { "epoch": 0.015945330296127564, "grad_norm": 0.16157129406929016, "learning_rate": 1.2400000000000002e-05, "loss": 0.6506, "step": 63 }, { "epoch": 0.016198430777018477, "grad_norm": 0.16981346905231476, "learning_rate": 1.2600000000000001e-05, "loss": 0.6367, "step": 64 }, { "epoch": 0.01645153125790939, "grad_norm": 0.16686077415943146, "learning_rate": 1.2800000000000001e-05, "loss": 0.6335, "step": 65 }, { "epoch": 0.016704631738800303, "grad_norm": 0.16791872680187225, "learning_rate": 1.3000000000000001e-05, "loss": 0.659, "step": 66 }, { "epoch": 0.016957732219691216, "grad_norm": 0.18651379644870758, "learning_rate": 1.3200000000000002e-05, "loss": 0.6621, "step": 67 }, { "epoch": 0.017210832700582133, "grad_norm": 0.16107667982578278, "learning_rate": 1.3400000000000002e-05, "loss": 0.6526, "step": 68 }, { "epoch": 0.017463933181473046, "grad_norm": 0.19045163691043854, "learning_rate": 1.3600000000000002e-05, "loss": 0.6356, "step": 69 }, { "epoch": 0.01771703366236396, "grad_norm": 0.1727728545665741, "learning_rate": 1.38e-05, "loss": 0.6321, "step": 70 }, { "epoch": 0.017970134143254872, "grad_norm": 0.16364595293998718, "learning_rate": 1.4e-05, "loss": 0.6547, "step": 71 }, { "epoch": 0.018223234624145785, "grad_norm": 0.1609407365322113, "learning_rate": 1.4200000000000001e-05, "loss": 0.6085, "step": 72 }, { "epoch": 0.018476335105036698, "grad_norm": 0.1595495045185089, "learning_rate": 1.4400000000000001e-05, "loss": 0.6321, "step": 73 }, { "epoch": 0.018729435585927615, "grad_norm": 0.19799435138702393, "learning_rate": 1.46e-05, "loss": 0.6253, "step": 74 }, { "epoch": 0.018982536066818528, "grad_norm": 0.16346189379692078, "learning_rate": 1.48e-05, "loss": 0.6364, "step": 75 }, { "epoch": 0.01923563654770944, "grad_norm": 0.16418568789958954, "learning_rate": 1.5000000000000002e-05, "loss": 0.6375, "step": 76 }, { "epoch": 0.019488737028600354, "grad_norm": 0.18783988058567047, "learning_rate": 1.5200000000000002e-05, "loss": 0.6447, "step": 77 }, { "epoch": 0.019741837509491267, "grad_norm": 0.19690784811973572, "learning_rate": 1.54e-05, "loss": 0.6385, "step": 78 }, { "epoch": 0.019994937990382183, "grad_norm": 0.17404019832611084, "learning_rate": 1.5600000000000003e-05, "loss": 0.6358, "step": 79 }, { "epoch": 0.020248038471273096, "grad_norm": 0.1718037873506546, "learning_rate": 1.58e-05, "loss": 0.6317, "step": 80 }, { "epoch": 0.02050113895216401, "grad_norm": 0.1750226616859436, "learning_rate": 1.6000000000000003e-05, "loss": 0.6287, "step": 81 }, { "epoch": 0.020754239433054922, "grad_norm": 0.17224791646003723, "learning_rate": 1.62e-05, "loss": 0.6459, "step": 82 }, { "epoch": 0.021007339913945836, "grad_norm": 0.18394415080547333, "learning_rate": 1.64e-05, "loss": 0.6467, "step": 83 }, { "epoch": 0.02126044039483675, "grad_norm": 0.16639743745326996, "learning_rate": 1.66e-05, "loss": 0.6748, "step": 84 }, { "epoch": 0.021513540875727665, "grad_norm": 0.18489432334899902, "learning_rate": 1.6800000000000002e-05, "loss": 0.6451, "step": 85 }, { "epoch": 0.021766641356618578, "grad_norm": 0.165752574801445, "learning_rate": 1.7e-05, "loss": 0.6058, "step": 86 }, { "epoch": 0.02201974183750949, "grad_norm": 0.18663623929023743, "learning_rate": 1.72e-05, "loss": 0.6741, "step": 87 }, { "epoch": 0.022272842318400404, "grad_norm": 0.18166765570640564, "learning_rate": 1.7400000000000003e-05, "loss": 0.6332, "step": 88 }, { "epoch": 0.022525942799291317, "grad_norm": 0.17642953991889954, "learning_rate": 1.76e-05, "loss": 0.6135, "step": 89 }, { "epoch": 0.022779043280182234, "grad_norm": 0.1743820458650589, "learning_rate": 1.7800000000000002e-05, "loss": 0.6025, "step": 90 }, { "epoch": 0.023032143761073147, "grad_norm": 0.17671117186546326, "learning_rate": 1.8e-05, "loss": 0.6119, "step": 91 }, { "epoch": 0.02328524424196406, "grad_norm": 0.17659315466880798, "learning_rate": 1.8200000000000002e-05, "loss": 0.6435, "step": 92 }, { "epoch": 0.023538344722854973, "grad_norm": 0.176067516207695, "learning_rate": 1.8400000000000003e-05, "loss": 0.6242, "step": 93 }, { "epoch": 0.023791445203745886, "grad_norm": 0.177460178732872, "learning_rate": 1.86e-05, "loss": 0.6531, "step": 94 }, { "epoch": 0.0240445456846368, "grad_norm": 0.21800711750984192, "learning_rate": 1.88e-05, "loss": 0.5926, "step": 95 }, { "epoch": 0.024297646165527716, "grad_norm": 0.17698881030082703, "learning_rate": 1.9e-05, "loss": 0.6201, "step": 96 }, { "epoch": 0.02455074664641863, "grad_norm": 0.17561809718608856, "learning_rate": 1.9200000000000003e-05, "loss": 0.6235, "step": 97 }, { "epoch": 0.024803847127309542, "grad_norm": 0.17871560156345367, "learning_rate": 1.94e-05, "loss": 0.6317, "step": 98 }, { "epoch": 0.025056947608200455, "grad_norm": 0.17323796451091766, "learning_rate": 1.9600000000000002e-05, "loss": 0.614, "step": 99 }, { "epoch": 0.025310048089091368, "grad_norm": 0.1735820323228836, "learning_rate": 1.98e-05, "loss": 0.6155, "step": 100 }, { "epoch": 0.025563148569982284, "grad_norm": 0.18773189187049866, "learning_rate": 2e-05, "loss": 0.6282, "step": 101 }, { "epoch": 0.025816249050873197, "grad_norm": 0.17454087734222412, "learning_rate": 1.999999971336061e-05, "loss": 0.6495, "step": 102 }, { "epoch": 0.02606934953176411, "grad_norm": 0.17310722172260284, "learning_rate": 1.999999885344244e-05, "loss": 0.664, "step": 103 }, { "epoch": 0.026322450012655024, "grad_norm": 0.17041310667991638, "learning_rate": 1.9999997420245554e-05, "loss": 0.5958, "step": 104 }, { "epoch": 0.026575550493545937, "grad_norm": 0.17476394772529602, "learning_rate": 1.999999541377003e-05, "loss": 0.6042, "step": 105 }, { "epoch": 0.026828650974436853, "grad_norm": 0.18969106674194336, "learning_rate": 1.999999283401598e-05, "loss": 0.6414, "step": 106 }, { "epoch": 0.027081751455327766, "grad_norm": 0.17666743695735931, "learning_rate": 1.999998968098355e-05, "loss": 0.6095, "step": 107 }, { "epoch": 0.02733485193621868, "grad_norm": 0.16765183210372925, "learning_rate": 1.9999985954672926e-05, "loss": 0.6298, "step": 108 }, { "epoch": 0.027587952417109592, "grad_norm": 0.16784022748470306, "learning_rate": 1.999998165508432e-05, "loss": 0.6207, "step": 109 }, { "epoch": 0.027841052898000505, "grad_norm": 0.1832384467124939, "learning_rate": 1.999997678221798e-05, "loss": 0.643, "step": 110 }, { "epoch": 0.02809415337889142, "grad_norm": 0.18284334242343903, "learning_rate": 1.9999971336074178e-05, "loss": 0.6085, "step": 111 }, { "epoch": 0.028347253859782335, "grad_norm": 0.25969794392585754, "learning_rate": 1.9999965316653238e-05, "loss": 0.591, "step": 112 }, { "epoch": 0.028600354340673248, "grad_norm": 0.17420271039009094, "learning_rate": 1.9999958723955496e-05, "loss": 0.6081, "step": 113 }, { "epoch": 0.02885345482156416, "grad_norm": 0.17406368255615234, "learning_rate": 1.999995155798133e-05, "loss": 0.6538, "step": 114 }, { "epoch": 0.029106555302455074, "grad_norm": 0.2751551866531372, "learning_rate": 1.9999943818731156e-05, "loss": 0.5929, "step": 115 }, { "epoch": 0.029359655783345987, "grad_norm": 0.1678651124238968, "learning_rate": 1.9999935506205416e-05, "loss": 0.6078, "step": 116 }, { "epoch": 0.029612756264236904, "grad_norm": 0.1709502935409546, "learning_rate": 1.9999926620404585e-05, "loss": 0.5928, "step": 117 }, { "epoch": 0.029865856745127817, "grad_norm": 0.17515309154987335, "learning_rate": 1.9999917161329173e-05, "loss": 0.6355, "step": 118 }, { "epoch": 0.03011895722601873, "grad_norm": 0.17262603342533112, "learning_rate": 1.9999907128979723e-05, "loss": 0.5745, "step": 119 }, { "epoch": 0.030372057706909643, "grad_norm": 0.18772061169147491, "learning_rate": 1.999989652335681e-05, "loss": 0.6168, "step": 120 }, { "epoch": 0.030625158187800556, "grad_norm": 0.17533883452415466, "learning_rate": 1.999988534446104e-05, "loss": 0.6442, "step": 121 }, { "epoch": 0.03087825866869147, "grad_norm": 0.16918431222438812, "learning_rate": 1.9999873592293054e-05, "loss": 0.6262, "step": 122 }, { "epoch": 0.031131359149582385, "grad_norm": 0.17905767261981964, "learning_rate": 1.999986126685353e-05, "loss": 0.6213, "step": 123 }, { "epoch": 0.031384459630473295, "grad_norm": 0.1797955334186554, "learning_rate": 1.999984836814317e-05, "loss": 0.6448, "step": 124 }, { "epoch": 0.03163756011136421, "grad_norm": 0.17527543008327484, "learning_rate": 1.9999834896162716e-05, "loss": 0.6416, "step": 125 }, { "epoch": 0.03189066059225513, "grad_norm": 0.18013723194599152, "learning_rate": 1.999982085091294e-05, "loss": 0.6293, "step": 126 }, { "epoch": 0.03214376107314604, "grad_norm": 0.27788278460502625, "learning_rate": 1.9999806232394644e-05, "loss": 0.6174, "step": 127 }, { "epoch": 0.032396861554036954, "grad_norm": 0.17403407394886017, "learning_rate": 1.9999791040608674e-05, "loss": 0.6236, "step": 128 }, { "epoch": 0.032649962034927864, "grad_norm": 0.18010959029197693, "learning_rate": 1.999977527555589e-05, "loss": 0.6268, "step": 129 }, { "epoch": 0.03290306251581878, "grad_norm": 0.168148934841156, "learning_rate": 1.9999758937237206e-05, "loss": 0.5944, "step": 130 }, { "epoch": 0.0331561629967097, "grad_norm": 0.18338632583618164, "learning_rate": 1.999974202565355e-05, "loss": 0.584, "step": 131 }, { "epoch": 0.033409263477600606, "grad_norm": 0.17366443574428558, "learning_rate": 1.9999724540805898e-05, "loss": 0.6433, "step": 132 }, { "epoch": 0.03366236395849152, "grad_norm": 0.16738726198673248, "learning_rate": 1.9999706482695248e-05, "loss": 0.6111, "step": 133 }, { "epoch": 0.03391546443938243, "grad_norm": 0.17759299278259277, "learning_rate": 1.999968785132264e-05, "loss": 0.6017, "step": 134 }, { "epoch": 0.03416856492027335, "grad_norm": 0.1689685583114624, "learning_rate": 1.9999668646689137e-05, "loss": 0.6109, "step": 135 }, { "epoch": 0.034421665401164266, "grad_norm": 0.25977635383605957, "learning_rate": 1.9999648868795845e-05, "loss": 0.6312, "step": 136 }, { "epoch": 0.034674765882055175, "grad_norm": 0.1700809746980667, "learning_rate": 1.9999628517643888e-05, "loss": 0.6069, "step": 137 }, { "epoch": 0.03492786636294609, "grad_norm": 0.1755145788192749, "learning_rate": 1.999960759323445e-05, "loss": 0.6094, "step": 138 }, { "epoch": 0.035180966843837, "grad_norm": 0.17850720882415771, "learning_rate": 1.9999586095568714e-05, "loss": 0.6176, "step": 139 }, { "epoch": 0.03543406732472792, "grad_norm": 0.1870347112417221, "learning_rate": 1.999956402464792e-05, "loss": 0.648, "step": 140 }, { "epoch": 0.03568716780561883, "grad_norm": 0.17585211992263794, "learning_rate": 1.999954138047333e-05, "loss": 0.5954, "step": 141 }, { "epoch": 0.035940268286509744, "grad_norm": 0.17679710686206818, "learning_rate": 1.9999518163046246e-05, "loss": 0.5972, "step": 142 }, { "epoch": 0.03619336876740066, "grad_norm": 0.18154670298099518, "learning_rate": 1.9999494372367997e-05, "loss": 0.6638, "step": 143 }, { "epoch": 0.03644646924829157, "grad_norm": 0.1948469579219818, "learning_rate": 1.9999470008439947e-05, "loss": 0.573, "step": 144 }, { "epoch": 0.03669956972918249, "grad_norm": 0.17780163884162903, "learning_rate": 1.999944507126349e-05, "loss": 0.6046, "step": 145 }, { "epoch": 0.036952670210073396, "grad_norm": 0.17574277520179749, "learning_rate": 1.9999419560840063e-05, "loss": 0.6046, "step": 146 }, { "epoch": 0.03720577069096431, "grad_norm": 0.16369116306304932, "learning_rate": 1.999939347717112e-05, "loss": 0.5947, "step": 147 }, { "epoch": 0.03745887117185523, "grad_norm": 0.17040003836154938, "learning_rate": 1.9999366820258165e-05, "loss": 0.6176, "step": 148 }, { "epoch": 0.03771197165274614, "grad_norm": 0.1718555986881256, "learning_rate": 1.9999339590102718e-05, "loss": 0.6008, "step": 149 }, { "epoch": 0.037965072133637055, "grad_norm": 0.17747220396995544, "learning_rate": 1.9999311786706343e-05, "loss": 0.5756, "step": 150 }, { "epoch": 0.038218172614527965, "grad_norm": 0.17003098130226135, "learning_rate": 1.9999283410070632e-05, "loss": 0.6252, "step": 151 }, { "epoch": 0.03847127309541888, "grad_norm": 0.18186484277248383, "learning_rate": 1.999925446019722e-05, "loss": 0.5814, "step": 152 }, { "epoch": 0.0387243735763098, "grad_norm": 0.17330726981163025, "learning_rate": 1.9999224937087754e-05, "loss": 0.6205, "step": 153 }, { "epoch": 0.03897747405720071, "grad_norm": 0.1642872840166092, "learning_rate": 1.9999194840743938e-05, "loss": 0.6079, "step": 154 }, { "epoch": 0.039230574538091624, "grad_norm": 0.1646062582731247, "learning_rate": 1.999916417116749e-05, "loss": 0.6499, "step": 155 }, { "epoch": 0.039483675018982534, "grad_norm": 0.16851483285427094, "learning_rate": 1.9999132928360172e-05, "loss": 0.6216, "step": 156 }, { "epoch": 0.03973677549987345, "grad_norm": 0.16902515292167664, "learning_rate": 1.9999101112323774e-05, "loss": 0.6106, "step": 157 }, { "epoch": 0.03998987598076437, "grad_norm": 0.16902323067188263, "learning_rate": 1.999906872306012e-05, "loss": 0.6175, "step": 158 }, { "epoch": 0.040242976461655276, "grad_norm": 0.17191338539123535, "learning_rate": 1.9999035760571065e-05, "loss": 0.6585, "step": 159 }, { "epoch": 0.04049607694254619, "grad_norm": 0.16515901684761047, "learning_rate": 1.99990022248585e-05, "loss": 0.6281, "step": 160 }, { "epoch": 0.0407491774234371, "grad_norm": 0.18414366245269775, "learning_rate": 1.9998968115924352e-05, "loss": 0.6127, "step": 161 }, { "epoch": 0.04100227790432802, "grad_norm": 0.16314849257469177, "learning_rate": 1.999893343377057e-05, "loss": 0.5814, "step": 162 }, { "epoch": 0.04125537838521893, "grad_norm": 0.16900253295898438, "learning_rate": 1.9998898178399142e-05, "loss": 0.6057, "step": 163 }, { "epoch": 0.041508478866109845, "grad_norm": 0.1643771082162857, "learning_rate": 1.999886234981209e-05, "loss": 0.6169, "step": 164 }, { "epoch": 0.04176157934700076, "grad_norm": 0.1646137237548828, "learning_rate": 1.9998825948011476e-05, "loss": 0.6144, "step": 165 }, { "epoch": 0.04201467982789167, "grad_norm": 0.17198063433170319, "learning_rate": 1.9998788972999374e-05, "loss": 0.5821, "step": 166 }, { "epoch": 0.04226778030878259, "grad_norm": 0.15963180363178253, "learning_rate": 1.9998751424777914e-05, "loss": 0.5805, "step": 167 }, { "epoch": 0.0425208807896735, "grad_norm": 0.17139393091201782, "learning_rate": 1.9998713303349242e-05, "loss": 0.6, "step": 168 }, { "epoch": 0.042773981270564414, "grad_norm": 0.17048956453800201, "learning_rate": 1.999867460871555e-05, "loss": 0.598, "step": 169 }, { "epoch": 0.04302708175145533, "grad_norm": 0.16242019832134247, "learning_rate": 1.9998635340879046e-05, "loss": 0.6278, "step": 170 }, { "epoch": 0.04328018223234624, "grad_norm": 0.1649109423160553, "learning_rate": 1.9998595499841994e-05, "loss": 0.6153, "step": 171 }, { "epoch": 0.043533282713237156, "grad_norm": 0.1605859398841858, "learning_rate": 1.9998555085606668e-05, "loss": 0.6199, "step": 172 }, { "epoch": 0.043786383194128066, "grad_norm": 0.18250787258148193, "learning_rate": 1.9998514098175388e-05, "loss": 0.6057, "step": 173 }, { "epoch": 0.04403948367501898, "grad_norm": 0.1787337362766266, "learning_rate": 1.9998472537550505e-05, "loss": 0.5691, "step": 174 }, { "epoch": 0.0442925841559099, "grad_norm": 0.15843160450458527, "learning_rate": 1.9998430403734402e-05, "loss": 0.6294, "step": 175 }, { "epoch": 0.04454568463680081, "grad_norm": 0.16140268743038177, "learning_rate": 1.999838769672949e-05, "loss": 0.6127, "step": 176 }, { "epoch": 0.044798785117691725, "grad_norm": 0.20213818550109863, "learning_rate": 1.9998344416538225e-05, "loss": 0.6074, "step": 177 }, { "epoch": 0.045051885598582635, "grad_norm": 0.17139162123203278, "learning_rate": 1.999830056316308e-05, "loss": 0.6083, "step": 178 }, { "epoch": 0.04530498607947355, "grad_norm": 0.18342524766921997, "learning_rate": 1.999825613660657e-05, "loss": 0.6143, "step": 179 }, { "epoch": 0.04555808656036447, "grad_norm": 0.15636856853961945, "learning_rate": 1.9998211136871252e-05, "loss": 0.6055, "step": 180 }, { "epoch": 0.04581118704125538, "grad_norm": 0.19432251155376434, "learning_rate": 1.999816556395969e-05, "loss": 0.6049, "step": 181 }, { "epoch": 0.046064287522146294, "grad_norm": 0.18179355561733246, "learning_rate": 1.999811941787451e-05, "loss": 0.6087, "step": 182 }, { "epoch": 0.0463173880030372, "grad_norm": 0.16457490622997284, "learning_rate": 1.999807269861835e-05, "loss": 0.6325, "step": 183 }, { "epoch": 0.04657048848392812, "grad_norm": 0.17942102253437042, "learning_rate": 1.999802540619389e-05, "loss": 0.5882, "step": 184 }, { "epoch": 0.046823588964819036, "grad_norm": 0.1658487170934677, "learning_rate": 1.9997977540603845e-05, "loss": 0.602, "step": 185 }, { "epoch": 0.047076689445709946, "grad_norm": 0.16457369923591614, "learning_rate": 1.999792910185095e-05, "loss": 0.5894, "step": 186 }, { "epoch": 0.04732978992660086, "grad_norm": 0.1655249446630478, "learning_rate": 1.9997880089937995e-05, "loss": 0.5942, "step": 187 }, { "epoch": 0.04758289040749177, "grad_norm": 0.1669139415025711, "learning_rate": 1.9997830504867777e-05, "loss": 0.606, "step": 188 }, { "epoch": 0.04783599088838269, "grad_norm": 0.17530295252799988, "learning_rate": 1.9997780346643147e-05, "loss": 0.616, "step": 189 }, { "epoch": 0.0480890913692736, "grad_norm": 0.16950328648090363, "learning_rate": 1.9997729615266975e-05, "loss": 0.614, "step": 190 }, { "epoch": 0.048342191850164515, "grad_norm": 0.16296598315238953, "learning_rate": 1.999767831074217e-05, "loss": 0.6147, "step": 191 }, { "epoch": 0.04859529233105543, "grad_norm": 0.15502794086933136, "learning_rate": 1.999762643307168e-05, "loss": 0.5812, "step": 192 }, { "epoch": 0.04884839281194634, "grad_norm": 0.16831602156162262, "learning_rate": 1.999757398225847e-05, "loss": 0.5909, "step": 193 }, { "epoch": 0.04910149329283726, "grad_norm": 0.1634550243616104, "learning_rate": 1.9997520958305556e-05, "loss": 0.5875, "step": 194 }, { "epoch": 0.04935459377372817, "grad_norm": 0.17237253487110138, "learning_rate": 1.9997467361215966e-05, "loss": 0.632, "step": 195 }, { "epoch": 0.049607694254619084, "grad_norm": 0.16299310326576233, "learning_rate": 1.9997413190992785e-05, "loss": 0.6021, "step": 196 }, { "epoch": 0.04986079473551, "grad_norm": 0.16284602880477905, "learning_rate": 1.999735844763911e-05, "loss": 0.591, "step": 197 }, { "epoch": 0.05011389521640091, "grad_norm": 0.16349320113658905, "learning_rate": 1.9997303131158082e-05, "loss": 0.5971, "step": 198 }, { "epoch": 0.050366995697291826, "grad_norm": 0.1635074019432068, "learning_rate": 1.9997247241552872e-05, "loss": 0.6136, "step": 199 }, { "epoch": 0.050620096178182736, "grad_norm": 0.16308331489562988, "learning_rate": 1.9997190778826685e-05, "loss": 0.5461, "step": 200 }, { "epoch": 0.05087319665907365, "grad_norm": 0.1602434664964676, "learning_rate": 1.9997133742982755e-05, "loss": 0.6125, "step": 201 }, { "epoch": 0.05112629713996457, "grad_norm": 0.16787537932395935, "learning_rate": 1.9997076134024356e-05, "loss": 0.6193, "step": 202 }, { "epoch": 0.05137939762085548, "grad_norm": 0.16162943840026855, "learning_rate": 1.9997017951954788e-05, "loss": 0.6082, "step": 203 }, { "epoch": 0.051632498101746395, "grad_norm": 0.17052938044071198, "learning_rate": 1.9996959196777388e-05, "loss": 0.5929, "step": 204 }, { "epoch": 0.051885598582637305, "grad_norm": 0.17506612837314606, "learning_rate": 1.9996899868495524e-05, "loss": 0.618, "step": 205 }, { "epoch": 0.05213869906352822, "grad_norm": 0.19307924807071686, "learning_rate": 1.9996839967112595e-05, "loss": 0.5826, "step": 206 }, { "epoch": 0.05239179954441914, "grad_norm": 0.16263261437416077, "learning_rate": 1.9996779492632035e-05, "loss": 0.5959, "step": 207 }, { "epoch": 0.05264490002531005, "grad_norm": 0.2174728810787201, "learning_rate": 1.999671844505731e-05, "loss": 0.5998, "step": 208 }, { "epoch": 0.052898000506200964, "grad_norm": 0.1637093722820282, "learning_rate": 1.9996656824391927e-05, "loss": 0.5784, "step": 209 }, { "epoch": 0.05315110098709187, "grad_norm": 0.17159394919872284, "learning_rate": 1.9996594630639415e-05, "loss": 0.5975, "step": 210 }, { "epoch": 0.05340420146798279, "grad_norm": 0.16354690492153168, "learning_rate": 1.9996531863803334e-05, "loss": 0.61, "step": 211 }, { "epoch": 0.053657301948873706, "grad_norm": 0.16591696441173553, "learning_rate": 1.9996468523887286e-05, "loss": 0.604, "step": 212 }, { "epoch": 0.053910402429764616, "grad_norm": 0.16789790987968445, "learning_rate": 1.9996404610894905e-05, "loss": 0.5941, "step": 213 }, { "epoch": 0.05416350291065553, "grad_norm": 0.1629846841096878, "learning_rate": 1.999634012482985e-05, "loss": 0.5854, "step": 214 }, { "epoch": 0.05441660339154644, "grad_norm": 0.17834800481796265, "learning_rate": 1.9996275065695823e-05, "loss": 0.6057, "step": 215 }, { "epoch": 0.05466970387243736, "grad_norm": 0.16474172472953796, "learning_rate": 1.9996209433496546e-05, "loss": 0.6017, "step": 216 }, { "epoch": 0.05492280435332827, "grad_norm": 0.15831775963306427, "learning_rate": 1.9996143228235793e-05, "loss": 0.6176, "step": 217 }, { "epoch": 0.055175904834219185, "grad_norm": 0.1640530377626419, "learning_rate": 1.999607644991735e-05, "loss": 0.6052, "step": 218 }, { "epoch": 0.0554290053151101, "grad_norm": 0.1691807508468628, "learning_rate": 1.9996009098545047e-05, "loss": 0.6023, "step": 219 }, { "epoch": 0.05568210579600101, "grad_norm": 0.17375995218753815, "learning_rate": 1.999594117412275e-05, "loss": 0.588, "step": 220 }, { "epoch": 0.05593520627689193, "grad_norm": 0.1587343066930771, "learning_rate": 1.9995872676654346e-05, "loss": 0.5992, "step": 221 }, { "epoch": 0.05618830675778284, "grad_norm": 0.16630913317203522, "learning_rate": 1.9995803606143768e-05, "loss": 0.613, "step": 222 }, { "epoch": 0.05644140723867375, "grad_norm": 0.15650874376296997, "learning_rate": 1.9995733962594966e-05, "loss": 0.5881, "step": 223 }, { "epoch": 0.05669450771956467, "grad_norm": 0.17238104343414307, "learning_rate": 1.9995663746011947e-05, "loss": 0.5898, "step": 224 }, { "epoch": 0.05694760820045558, "grad_norm": 0.1555783599615097, "learning_rate": 1.9995592956398725e-05, "loss": 0.5927, "step": 225 }, { "epoch": 0.057200708681346496, "grad_norm": 0.16336458921432495, "learning_rate": 1.9995521593759365e-05, "loss": 0.6055, "step": 226 }, { "epoch": 0.057453809162237406, "grad_norm": 0.17016665637493134, "learning_rate": 1.999544965809795e-05, "loss": 0.6168, "step": 227 }, { "epoch": 0.05770690964312832, "grad_norm": 0.1600721925497055, "learning_rate": 1.9995377149418613e-05, "loss": 0.5961, "step": 228 }, { "epoch": 0.05796001012401924, "grad_norm": 0.18717394769191742, "learning_rate": 1.9995304067725504e-05, "loss": 0.5645, "step": 229 }, { "epoch": 0.05821311060491015, "grad_norm": 0.1661251187324524, "learning_rate": 1.9995230413022816e-05, "loss": 0.5786, "step": 230 }, { "epoch": 0.058466211085801065, "grad_norm": 0.1633238047361374, "learning_rate": 1.9995156185314774e-05, "loss": 0.6195, "step": 231 }, { "epoch": 0.058719311566691974, "grad_norm": 0.16088761389255524, "learning_rate": 1.999508138460563e-05, "loss": 0.5744, "step": 232 }, { "epoch": 0.05897241204758289, "grad_norm": 0.17186705768108368, "learning_rate": 1.9995006010899668e-05, "loss": 0.6101, "step": 233 }, { "epoch": 0.05922551252847381, "grad_norm": 0.16114409267902374, "learning_rate": 1.9994930064201214e-05, "loss": 0.5771, "step": 234 }, { "epoch": 0.05947861300936472, "grad_norm": 0.15168946981430054, "learning_rate": 1.999485354451462e-05, "loss": 0.5918, "step": 235 }, { "epoch": 0.059731713490255633, "grad_norm": 0.17712467908859253, "learning_rate": 1.999477645184428e-05, "loss": 0.5937, "step": 236 }, { "epoch": 0.05998481397114654, "grad_norm": 0.16912341117858887, "learning_rate": 1.99946987861946e-05, "loss": 0.5937, "step": 237 }, { "epoch": 0.06023791445203746, "grad_norm": 0.1597040891647339, "learning_rate": 1.9994620547570044e-05, "loss": 0.5889, "step": 238 }, { "epoch": 0.06049101493292837, "grad_norm": 0.16447393596172333, "learning_rate": 1.999454173597509e-05, "loss": 0.6258, "step": 239 }, { "epoch": 0.060744115413819286, "grad_norm": 0.166543111205101, "learning_rate": 1.9994462351414264e-05, "loss": 0.594, "step": 240 }, { "epoch": 0.0609972158947102, "grad_norm": 0.16105465590953827, "learning_rate": 1.999438239389211e-05, "loss": 0.5903, "step": 241 }, { "epoch": 0.06125031637560111, "grad_norm": 0.15936554968357086, "learning_rate": 1.999430186341321e-05, "loss": 0.5915, "step": 242 }, { "epoch": 0.06150341685649203, "grad_norm": 0.15656346082687378, "learning_rate": 1.999422075998219e-05, "loss": 0.5699, "step": 243 }, { "epoch": 0.06175651733738294, "grad_norm": 0.1559823900461197, "learning_rate": 1.999413908360369e-05, "loss": 0.6093, "step": 244 }, { "epoch": 0.062009617818273854, "grad_norm": 0.16356448829174042, "learning_rate": 1.99940568342824e-05, "loss": 0.6096, "step": 245 }, { "epoch": 0.06226271829916477, "grad_norm": 0.167751282453537, "learning_rate": 1.9993974012023027e-05, "loss": 0.5826, "step": 246 }, { "epoch": 0.06251581878005569, "grad_norm": 0.15648062527179718, "learning_rate": 1.9993890616830325e-05, "loss": 0.5984, "step": 247 }, { "epoch": 0.06276891926094659, "grad_norm": 0.1637192964553833, "learning_rate": 1.9993806648709074e-05, "loss": 0.6117, "step": 248 }, { "epoch": 0.0630220197418375, "grad_norm": 0.17687813937664032, "learning_rate": 1.999372210766409e-05, "loss": 0.6144, "step": 249 }, { "epoch": 0.06327512022272842, "grad_norm": 0.15885214507579803, "learning_rate": 1.9993636993700215e-05, "loss": 0.5917, "step": 250 }, { "epoch": 0.06352822070361934, "grad_norm": 0.15977385640144348, "learning_rate": 1.9993551306822327e-05, "loss": 0.6064, "step": 251 }, { "epoch": 0.06378132118451026, "grad_norm": 0.169663205742836, "learning_rate": 1.999346504703534e-05, "loss": 0.6055, "step": 252 }, { "epoch": 0.06403442166540116, "grad_norm": 0.17267578840255737, "learning_rate": 1.999337821434421e-05, "loss": 0.6009, "step": 253 }, { "epoch": 0.06428752214629208, "grad_norm": 0.1592559516429901, "learning_rate": 1.9993290808753895e-05, "loss": 0.5861, "step": 254 }, { "epoch": 0.06454062262718299, "grad_norm": 0.15290877223014832, "learning_rate": 1.999320283026942e-05, "loss": 0.5827, "step": 255 }, { "epoch": 0.06479372310807391, "grad_norm": 0.15359878540039062, "learning_rate": 1.9993114278895825e-05, "loss": 0.602, "step": 256 }, { "epoch": 0.06504682358896482, "grad_norm": 0.15598230063915253, "learning_rate": 1.999302515463819e-05, "loss": 0.6146, "step": 257 }, { "epoch": 0.06529992406985573, "grad_norm": 0.15773145854473114, "learning_rate": 1.9992935457501613e-05, "loss": 0.5922, "step": 258 }, { "epoch": 0.06555302455074664, "grad_norm": 0.15868717432022095, "learning_rate": 1.999284518749125e-05, "loss": 0.6124, "step": 259 }, { "epoch": 0.06580612503163756, "grad_norm": 0.15584896504878998, "learning_rate": 1.9992754344612265e-05, "loss": 0.5999, "step": 260 }, { "epoch": 0.06605922551252848, "grad_norm": 0.15427203476428986, "learning_rate": 1.9992662928869874e-05, "loss": 0.5952, "step": 261 }, { "epoch": 0.0663123259934194, "grad_norm": 0.15341795980930328, "learning_rate": 1.9992570940269313e-05, "loss": 0.587, "step": 262 }, { "epoch": 0.0665654264743103, "grad_norm": 0.16140134632587433, "learning_rate": 1.9992478378815857e-05, "loss": 0.5778, "step": 263 }, { "epoch": 0.06681852695520121, "grad_norm": 0.14912493526935577, "learning_rate": 1.999238524451481e-05, "loss": 0.5857, "step": 264 }, { "epoch": 0.06707162743609213, "grad_norm": 0.17488797008991241, "learning_rate": 1.999229153737152e-05, "loss": 0.5844, "step": 265 }, { "epoch": 0.06732472791698305, "grad_norm": 0.15687042474746704, "learning_rate": 1.9992197257391344e-05, "loss": 0.6298, "step": 266 }, { "epoch": 0.06757782839787396, "grad_norm": 0.16537171602249146, "learning_rate": 1.9992102404579697e-05, "loss": 0.6044, "step": 267 }, { "epoch": 0.06783092887876487, "grad_norm": 0.1586325317621231, "learning_rate": 1.999200697894202e-05, "loss": 0.6134, "step": 268 }, { "epoch": 0.06808402935965578, "grad_norm": 0.281053751707077, "learning_rate": 1.9991910980483772e-05, "loss": 0.5833, "step": 269 }, { "epoch": 0.0683371298405467, "grad_norm": 0.16128815710544586, "learning_rate": 1.9991814409210465e-05, "loss": 0.6059, "step": 270 }, { "epoch": 0.06859023032143761, "grad_norm": 0.16216106712818146, "learning_rate": 1.999171726512763e-05, "loss": 0.6201, "step": 271 }, { "epoch": 0.06884333080232853, "grad_norm": 0.16704806685447693, "learning_rate": 1.9991619548240844e-05, "loss": 0.593, "step": 272 }, { "epoch": 0.06909643128321943, "grad_norm": 0.1574796736240387, "learning_rate": 1.9991521258555703e-05, "loss": 0.5881, "step": 273 }, { "epoch": 0.06934953176411035, "grad_norm": 0.1586303412914276, "learning_rate": 1.999142239607784e-05, "loss": 0.591, "step": 274 }, { "epoch": 0.06960263224500127, "grad_norm": 0.16143415868282318, "learning_rate": 1.9991322960812928e-05, "loss": 0.5879, "step": 275 }, { "epoch": 0.06985573272589218, "grad_norm": 0.1673780381679535, "learning_rate": 1.9991222952766663e-05, "loss": 0.6059, "step": 276 }, { "epoch": 0.07010883320678309, "grad_norm": 0.16469413042068481, "learning_rate": 1.9991122371944784e-05, "loss": 0.5997, "step": 277 }, { "epoch": 0.070361933687674, "grad_norm": 0.16841571033000946, "learning_rate": 1.999102121835305e-05, "loss": 0.6089, "step": 278 }, { "epoch": 0.07061503416856492, "grad_norm": 0.16411472856998444, "learning_rate": 1.9990919491997262e-05, "loss": 0.6135, "step": 279 }, { "epoch": 0.07086813464945584, "grad_norm": 0.15883317589759827, "learning_rate": 1.999081719288325e-05, "loss": 0.6049, "step": 280 }, { "epoch": 0.07112123513034675, "grad_norm": 0.16443659365177155, "learning_rate": 1.9990714321016888e-05, "loss": 0.6231, "step": 281 }, { "epoch": 0.07137433561123765, "grad_norm": 0.16413702070713043, "learning_rate": 1.999061087640406e-05, "loss": 0.6017, "step": 282 }, { "epoch": 0.07162743609212857, "grad_norm": 0.16081872582435608, "learning_rate": 1.9990506859050706e-05, "loss": 0.5843, "step": 283 }, { "epoch": 0.07188053657301949, "grad_norm": 0.15878289937973022, "learning_rate": 1.9990402268962786e-05, "loss": 0.5691, "step": 284 }, { "epoch": 0.0721336370539104, "grad_norm": 0.15292367339134216, "learning_rate": 1.99902971061463e-05, "loss": 0.5998, "step": 285 }, { "epoch": 0.07238673753480132, "grad_norm": 0.15839214622974396, "learning_rate": 1.9990191370607268e-05, "loss": 0.5947, "step": 286 }, { "epoch": 0.07263983801569222, "grad_norm": 0.15390464663505554, "learning_rate": 1.9990085062351755e-05, "loss": 0.6064, "step": 287 }, { "epoch": 0.07289293849658314, "grad_norm": 0.1568797379732132, "learning_rate": 1.998997818138586e-05, "loss": 0.5988, "step": 288 }, { "epoch": 0.07314603897747406, "grad_norm": 0.15333013236522675, "learning_rate": 1.9989870727715706e-05, "loss": 0.5986, "step": 289 }, { "epoch": 0.07339913945836497, "grad_norm": 0.15738581120967865, "learning_rate": 1.9989762701347457e-05, "loss": 0.6029, "step": 290 }, { "epoch": 0.07365223993925589, "grad_norm": 0.15028329193592072, "learning_rate": 1.99896541022873e-05, "loss": 0.5913, "step": 291 }, { "epoch": 0.07390534042014679, "grad_norm": 0.15606540441513062, "learning_rate": 1.9989544930541464e-05, "loss": 0.5727, "step": 292 }, { "epoch": 0.07415844090103771, "grad_norm": 0.16559909284114838, "learning_rate": 1.9989435186116206e-05, "loss": 0.6129, "step": 293 }, { "epoch": 0.07441154138192863, "grad_norm": 0.16808965802192688, "learning_rate": 1.998932486901782e-05, "loss": 0.593, "step": 294 }, { "epoch": 0.07466464186281954, "grad_norm": 0.154685840010643, "learning_rate": 1.9989213979252633e-05, "loss": 0.6267, "step": 295 }, { "epoch": 0.07491774234371046, "grad_norm": 0.15596535801887512, "learning_rate": 1.9989102516826992e-05, "loss": 0.5819, "step": 296 }, { "epoch": 0.07517084282460136, "grad_norm": 0.16899852454662323, "learning_rate": 1.9988990481747296e-05, "loss": 0.6208, "step": 297 }, { "epoch": 0.07542394330549228, "grad_norm": 0.15050692856311798, "learning_rate": 1.9988877874019964e-05, "loss": 0.5892, "step": 298 }, { "epoch": 0.0756770437863832, "grad_norm": 0.1622265726327896, "learning_rate": 1.9988764693651454e-05, "loss": 0.5773, "step": 299 }, { "epoch": 0.07593014426727411, "grad_norm": 0.18525199592113495, "learning_rate": 1.9988650940648252e-05, "loss": 0.6063, "step": 300 }, { "epoch": 0.07618324474816503, "grad_norm": 0.17563289403915405, "learning_rate": 1.998853661501688e-05, "loss": 0.5942, "step": 301 }, { "epoch": 0.07643634522905593, "grad_norm": 0.1748930960893631, "learning_rate": 1.9988421716763892e-05, "loss": 0.5884, "step": 302 }, { "epoch": 0.07668944570994685, "grad_norm": 0.15376964211463928, "learning_rate": 1.9988306245895873e-05, "loss": 0.5922, "step": 303 }, { "epoch": 0.07694254619083776, "grad_norm": 0.16042698919773102, "learning_rate": 1.9988190202419443e-05, "loss": 0.5657, "step": 304 }, { "epoch": 0.07719564667172868, "grad_norm": 0.16305531561374664, "learning_rate": 1.998807358634126e-05, "loss": 0.5933, "step": 305 }, { "epoch": 0.0774487471526196, "grad_norm": 0.15527382493019104, "learning_rate": 1.9987956397668005e-05, "loss": 0.5753, "step": 306 }, { "epoch": 0.0777018476335105, "grad_norm": 0.1665937453508377, "learning_rate": 1.9987838636406397e-05, "loss": 0.5871, "step": 307 }, { "epoch": 0.07795494811440142, "grad_norm": 0.15807491540908813, "learning_rate": 1.9987720302563184e-05, "loss": 0.5953, "step": 308 }, { "epoch": 0.07820804859529233, "grad_norm": 0.16152141988277435, "learning_rate": 1.9987601396145154e-05, "loss": 0.593, "step": 309 }, { "epoch": 0.07846114907618325, "grad_norm": 0.16260159015655518, "learning_rate": 1.9987481917159123e-05, "loss": 0.6233, "step": 310 }, { "epoch": 0.07871424955707416, "grad_norm": 0.15751411020755768, "learning_rate": 1.998736186561194e-05, "loss": 0.5676, "step": 311 }, { "epoch": 0.07896735003796507, "grad_norm": 0.15959788858890533, "learning_rate": 1.9987241241510485e-05, "loss": 0.6108, "step": 312 }, { "epoch": 0.07922045051885598, "grad_norm": 0.15218234062194824, "learning_rate": 1.9987120044861676e-05, "loss": 0.5774, "step": 313 }, { "epoch": 0.0794735509997469, "grad_norm": 0.15036055445671082, "learning_rate": 1.9986998275672458e-05, "loss": 0.5523, "step": 314 }, { "epoch": 0.07972665148063782, "grad_norm": 0.16426804661750793, "learning_rate": 1.9986875933949815e-05, "loss": 0.5858, "step": 315 }, { "epoch": 0.07997975196152873, "grad_norm": 0.15715593099594116, "learning_rate": 1.9986753019700758e-05, "loss": 0.6016, "step": 316 }, { "epoch": 0.08023285244241964, "grad_norm": 0.17541229724884033, "learning_rate": 1.998662953293234e-05, "loss": 0.5837, "step": 317 }, { "epoch": 0.08048595292331055, "grad_norm": 0.1641518473625183, "learning_rate": 1.9986505473651628e-05, "loss": 0.6009, "step": 318 }, { "epoch": 0.08073905340420147, "grad_norm": 0.15388673543930054, "learning_rate": 1.9986380841865746e-05, "loss": 0.5843, "step": 319 }, { "epoch": 0.08099215388509239, "grad_norm": 0.17019514739513397, "learning_rate": 1.998625563758183e-05, "loss": 0.5964, "step": 320 }, { "epoch": 0.0812452543659833, "grad_norm": 0.15808385610580444, "learning_rate": 1.9986129860807063e-05, "loss": 0.5857, "step": 321 }, { "epoch": 0.0814983548468742, "grad_norm": 0.16654977202415466, "learning_rate": 1.9986003511548655e-05, "loss": 0.5848, "step": 322 }, { "epoch": 0.08175145532776512, "grad_norm": 0.16539111733436584, "learning_rate": 1.9985876589813848e-05, "loss": 0.5946, "step": 323 }, { "epoch": 0.08200455580865604, "grad_norm": 0.15268370509147644, "learning_rate": 1.998574909560992e-05, "loss": 0.5755, "step": 324 }, { "epoch": 0.08225765628954695, "grad_norm": 0.15759532153606415, "learning_rate": 1.9985621028944174e-05, "loss": 0.576, "step": 325 }, { "epoch": 0.08251075677043786, "grad_norm": 0.189273402094841, "learning_rate": 1.9985492389823958e-05, "loss": 0.5808, "step": 326 }, { "epoch": 0.08276385725132877, "grad_norm": 0.1537943333387375, "learning_rate": 1.998536317825665e-05, "loss": 0.599, "step": 327 }, { "epoch": 0.08301695773221969, "grad_norm": 0.16000860929489136, "learning_rate": 1.998523339424965e-05, "loss": 0.592, "step": 328 }, { "epoch": 0.0832700582131106, "grad_norm": 0.1547561138868332, "learning_rate": 1.9985103037810396e-05, "loss": 0.5943, "step": 329 }, { "epoch": 0.08352315869400152, "grad_norm": 0.1555517166852951, "learning_rate": 1.998497210894637e-05, "loss": 0.5634, "step": 330 }, { "epoch": 0.08377625917489243, "grad_norm": 0.1513368785381317, "learning_rate": 1.9984840607665073e-05, "loss": 0.5469, "step": 331 }, { "epoch": 0.08402935965578334, "grad_norm": 0.15102490782737732, "learning_rate": 1.9984708533974043e-05, "loss": 0.5763, "step": 332 }, { "epoch": 0.08428246013667426, "grad_norm": 0.1598501354455948, "learning_rate": 1.9984575887880854e-05, "loss": 0.5852, "step": 333 }, { "epoch": 0.08453556061756518, "grad_norm": 0.17087587714195251, "learning_rate": 1.998444266939311e-05, "loss": 0.5749, "step": 334 }, { "epoch": 0.08478866109845609, "grad_norm": 0.17225144803524017, "learning_rate": 1.9984308878518446e-05, "loss": 0.5848, "step": 335 }, { "epoch": 0.085041761579347, "grad_norm": 0.1622695028781891, "learning_rate": 1.998417451526453e-05, "loss": 0.5964, "step": 336 }, { "epoch": 0.08529486206023791, "grad_norm": 0.1584172397851944, "learning_rate": 1.9984039579639073e-05, "loss": 0.6052, "step": 337 }, { "epoch": 0.08554796254112883, "grad_norm": 0.1658150851726532, "learning_rate": 1.9983904071649803e-05, "loss": 0.6025, "step": 338 }, { "epoch": 0.08580106302201974, "grad_norm": 0.1496206670999527, "learning_rate": 1.998376799130449e-05, "loss": 0.5741, "step": 339 }, { "epoch": 0.08605416350291066, "grad_norm": 0.15472014248371124, "learning_rate": 1.998363133861094e-05, "loss": 0.5833, "step": 340 }, { "epoch": 0.08630726398380156, "grad_norm": 0.1652199923992157, "learning_rate": 1.998349411357698e-05, "loss": 0.6036, "step": 341 }, { "epoch": 0.08656036446469248, "grad_norm": 0.15922397375106812, "learning_rate": 1.998335631621048e-05, "loss": 0.5952, "step": 342 }, { "epoch": 0.0868134649455834, "grad_norm": 0.1518596112728119, "learning_rate": 1.998321794651934e-05, "loss": 0.5826, "step": 343 }, { "epoch": 0.08706656542647431, "grad_norm": 0.16738763451576233, "learning_rate": 1.9983079004511488e-05, "loss": 0.6016, "step": 344 }, { "epoch": 0.08731966590736523, "grad_norm": 0.15630985796451569, "learning_rate": 1.9982939490194902e-05, "loss": 0.6241, "step": 345 }, { "epoch": 0.08757276638825613, "grad_norm": 0.15981461107730865, "learning_rate": 1.9982799403577564e-05, "loss": 0.5889, "step": 346 }, { "epoch": 0.08782586686914705, "grad_norm": 0.27540192008018494, "learning_rate": 1.9982658744667517e-05, "loss": 0.6059, "step": 347 }, { "epoch": 0.08807896735003796, "grad_norm": 0.15519198775291443, "learning_rate": 1.9982517513472813e-05, "loss": 0.5731, "step": 348 }, { "epoch": 0.08833206783092888, "grad_norm": 0.15499025583267212, "learning_rate": 1.998237571000156e-05, "loss": 0.5499, "step": 349 }, { "epoch": 0.0885851683118198, "grad_norm": 0.15701013803482056, "learning_rate": 1.9982233334261885e-05, "loss": 0.5907, "step": 350 }, { "epoch": 0.0888382687927107, "grad_norm": 0.15649379789829254, "learning_rate": 1.9982090386261944e-05, "loss": 0.5898, "step": 351 }, { "epoch": 0.08909136927360162, "grad_norm": 0.15530402958393097, "learning_rate": 1.9981946866009936e-05, "loss": 0.5883, "step": 352 }, { "epoch": 0.08934446975449253, "grad_norm": 0.15217220783233643, "learning_rate": 1.9981802773514087e-05, "loss": 0.5809, "step": 353 }, { "epoch": 0.08959757023538345, "grad_norm": 0.1590508371591568, "learning_rate": 1.9981658108782663e-05, "loss": 0.5807, "step": 354 }, { "epoch": 0.08985067071627437, "grad_norm": 0.15429863333702087, "learning_rate": 1.998151287182395e-05, "loss": 0.5834, "step": 355 }, { "epoch": 0.09010377119716527, "grad_norm": 0.15378670394420624, "learning_rate": 1.9981367062646277e-05, "loss": 0.6003, "step": 356 }, { "epoch": 0.09035687167805619, "grad_norm": 0.15638568997383118, "learning_rate": 1.9981220681258004e-05, "loss": 0.5928, "step": 357 }, { "epoch": 0.0906099721589471, "grad_norm": 0.16201607882976532, "learning_rate": 1.9981073727667523e-05, "loss": 0.5835, "step": 358 }, { "epoch": 0.09086307263983802, "grad_norm": 0.157405823469162, "learning_rate": 1.9980926201883254e-05, "loss": 0.5621, "step": 359 }, { "epoch": 0.09111617312072894, "grad_norm": 0.1522633582353592, "learning_rate": 1.9980778103913663e-05, "loss": 0.5531, "step": 360 }, { "epoch": 0.09136927360161984, "grad_norm": 0.16538803279399872, "learning_rate": 1.998062943376723e-05, "loss": 0.5816, "step": 361 }, { "epoch": 0.09162237408251075, "grad_norm": 0.1762588918209076, "learning_rate": 1.9980480191452487e-05, "loss": 0.5875, "step": 362 }, { "epoch": 0.09187547456340167, "grad_norm": 0.15592646598815918, "learning_rate": 1.9980330376977984e-05, "loss": 0.6372, "step": 363 }, { "epoch": 0.09212857504429259, "grad_norm": 0.1568923443555832, "learning_rate": 1.998017999035231e-05, "loss": 0.5961, "step": 364 }, { "epoch": 0.0923816755251835, "grad_norm": 0.15493977069854736, "learning_rate": 1.998002903158409e-05, "loss": 0.5596, "step": 365 }, { "epoch": 0.0926347760060744, "grad_norm": 0.15471848845481873, "learning_rate": 1.9979877500681973e-05, "loss": 0.595, "step": 366 }, { "epoch": 0.09288787648696532, "grad_norm": 0.15223823487758636, "learning_rate": 1.997972539765465e-05, "loss": 0.5923, "step": 367 }, { "epoch": 0.09314097696785624, "grad_norm": 0.154722198843956, "learning_rate": 1.997957272251084e-05, "loss": 0.5848, "step": 368 }, { "epoch": 0.09339407744874716, "grad_norm": 0.14988532662391663, "learning_rate": 1.9979419475259293e-05, "loss": 0.5879, "step": 369 }, { "epoch": 0.09364717792963807, "grad_norm": 0.15272821485996246, "learning_rate": 1.9979265655908797e-05, "loss": 0.5856, "step": 370 }, { "epoch": 0.09390027841052898, "grad_norm": 0.1536874920129776, "learning_rate": 1.9979111264468172e-05, "loss": 0.5675, "step": 371 }, { "epoch": 0.09415337889141989, "grad_norm": 0.15007144212722778, "learning_rate": 1.9978956300946265e-05, "loss": 0.5832, "step": 372 }, { "epoch": 0.09440647937231081, "grad_norm": 0.15521340072155, "learning_rate": 1.997880076535196e-05, "loss": 0.5555, "step": 373 }, { "epoch": 0.09465957985320173, "grad_norm": 0.15635685622692108, "learning_rate": 1.9978644657694174e-05, "loss": 0.5907, "step": 374 }, { "epoch": 0.09491268033409264, "grad_norm": 0.15637192130088806, "learning_rate": 1.997848797798186e-05, "loss": 0.5855, "step": 375 }, { "epoch": 0.09516578081498354, "grad_norm": 0.15218724310398102, "learning_rate": 1.9978330726223992e-05, "loss": 0.5708, "step": 376 }, { "epoch": 0.09541888129587446, "grad_norm": 0.21592919528484344, "learning_rate": 1.9978172902429595e-05, "loss": 0.5739, "step": 377 }, { "epoch": 0.09567198177676538, "grad_norm": 0.2044958621263504, "learning_rate": 1.9978014506607705e-05, "loss": 0.5678, "step": 378 }, { "epoch": 0.0959250822576563, "grad_norm": 0.1602644920349121, "learning_rate": 1.9977855538767416e-05, "loss": 0.5839, "step": 379 }, { "epoch": 0.0961781827385472, "grad_norm": 0.15126170217990875, "learning_rate": 1.9977695998917833e-05, "loss": 0.5982, "step": 380 }, { "epoch": 0.09643128321943811, "grad_norm": 0.1637371927499771, "learning_rate": 1.99775358870681e-05, "loss": 0.589, "step": 381 }, { "epoch": 0.09668438370032903, "grad_norm": 0.1484130471944809, "learning_rate": 1.9977375203227403e-05, "loss": 0.5574, "step": 382 }, { "epoch": 0.09693748418121995, "grad_norm": 0.15195773541927338, "learning_rate": 1.997721394740495e-05, "loss": 0.5996, "step": 383 }, { "epoch": 0.09719058466211086, "grad_norm": 0.18043164908885956, "learning_rate": 1.9977052119609984e-05, "loss": 0.5906, "step": 384 }, { "epoch": 0.09744368514300177, "grad_norm": 0.15861275792121887, "learning_rate": 1.9976889719851785e-05, "loss": 0.579, "step": 385 }, { "epoch": 0.09769678562389268, "grad_norm": 0.14669226109981537, "learning_rate": 1.9976726748139658e-05, "loss": 0.5891, "step": 386 }, { "epoch": 0.0979498861047836, "grad_norm": 0.15282531082630157, "learning_rate": 1.9976563204482952e-05, "loss": 0.586, "step": 387 }, { "epoch": 0.09820298658567451, "grad_norm": 0.15087860822677612, "learning_rate": 1.997639908889104e-05, "loss": 0.5916, "step": 388 }, { "epoch": 0.09845608706656543, "grad_norm": 0.15215586125850677, "learning_rate": 1.9976234401373335e-05, "loss": 0.6154, "step": 389 }, { "epoch": 0.09870918754745633, "grad_norm": 0.15243254601955414, "learning_rate": 1.9976069141939268e-05, "loss": 0.5911, "step": 390 }, { "epoch": 0.09896228802834725, "grad_norm": 0.1560385674238205, "learning_rate": 1.9975903310598323e-05, "loss": 0.5991, "step": 391 }, { "epoch": 0.09921538850923817, "grad_norm": 0.15781332552433014, "learning_rate": 1.9975736907359997e-05, "loss": 0.5921, "step": 392 }, { "epoch": 0.09946848899012908, "grad_norm": 0.16044297814369202, "learning_rate": 1.997556993223384e-05, "loss": 0.5954, "step": 393 }, { "epoch": 0.09972158947102, "grad_norm": 0.14789925515651703, "learning_rate": 1.997540238522942e-05, "loss": 0.5788, "step": 394 }, { "epoch": 0.0999746899519109, "grad_norm": 0.1528216302394867, "learning_rate": 1.997523426635634e-05, "loss": 0.5963, "step": 395 }, { "epoch": 0.10022779043280182, "grad_norm": 0.15586043894290924, "learning_rate": 1.9975065575624237e-05, "loss": 0.5847, "step": 396 }, { "epoch": 0.10048089091369274, "grad_norm": 0.14834704995155334, "learning_rate": 1.9974896313042784e-05, "loss": 0.5776, "step": 397 }, { "epoch": 0.10073399139458365, "grad_norm": 0.1561892032623291, "learning_rate": 1.9974726478621688e-05, "loss": 0.5692, "step": 398 }, { "epoch": 0.10098709187547457, "grad_norm": 0.1573437750339508, "learning_rate": 1.9974556072370678e-05, "loss": 0.6105, "step": 399 }, { "epoch": 0.10124019235636547, "grad_norm": 0.15625901520252228, "learning_rate": 1.997438509429953e-05, "loss": 0.5999, "step": 400 }, { "epoch": 0.10149329283725639, "grad_norm": 0.15824973583221436, "learning_rate": 1.997421354441804e-05, "loss": 0.6109, "step": 401 }, { "epoch": 0.1017463933181473, "grad_norm": 0.1485176682472229, "learning_rate": 1.997404142273605e-05, "loss": 0.6122, "step": 402 }, { "epoch": 0.10199949379903822, "grad_norm": 0.1522447168827057, "learning_rate": 1.997386872926342e-05, "loss": 0.5805, "step": 403 }, { "epoch": 0.10225259427992914, "grad_norm": 0.14381229877471924, "learning_rate": 1.997369546401005e-05, "loss": 0.5514, "step": 404 }, { "epoch": 0.10250569476082004, "grad_norm": 0.15505284070968628, "learning_rate": 1.997352162698588e-05, "loss": 0.5728, "step": 405 }, { "epoch": 0.10275879524171096, "grad_norm": 0.16043666005134583, "learning_rate": 1.9973347218200867e-05, "loss": 0.579, "step": 406 }, { "epoch": 0.10301189572260187, "grad_norm": 0.15423263609409332, "learning_rate": 1.9973172237665014e-05, "loss": 0.571, "step": 407 }, { "epoch": 0.10326499620349279, "grad_norm": 0.18292605876922607, "learning_rate": 1.9972996685388353e-05, "loss": 0.5797, "step": 408 }, { "epoch": 0.1035180966843837, "grad_norm": 0.1477111428976059, "learning_rate": 1.997282056138095e-05, "loss": 0.5803, "step": 409 }, { "epoch": 0.10377119716527461, "grad_norm": 0.1475038230419159, "learning_rate": 1.99726438656529e-05, "loss": 0.566, "step": 410 }, { "epoch": 0.10402429764616553, "grad_norm": 0.15047602355480194, "learning_rate": 1.9972466598214328e-05, "loss": 0.5598, "step": 411 }, { "epoch": 0.10427739812705644, "grad_norm": 0.17586417496204376, "learning_rate": 1.9972288759075402e-05, "loss": 0.5897, "step": 412 }, { "epoch": 0.10453049860794736, "grad_norm": 0.19568774104118347, "learning_rate": 1.9972110348246313e-05, "loss": 0.5697, "step": 413 }, { "epoch": 0.10478359908883828, "grad_norm": 0.15748457610607147, "learning_rate": 1.9971931365737293e-05, "loss": 0.601, "step": 414 }, { "epoch": 0.10503669956972918, "grad_norm": 0.1497504711151123, "learning_rate": 1.9971751811558598e-05, "loss": 0.5851, "step": 415 }, { "epoch": 0.1052898000506201, "grad_norm": 0.1707945168018341, "learning_rate": 1.9971571685720524e-05, "loss": 0.606, "step": 416 }, { "epoch": 0.10554290053151101, "grad_norm": 0.15110230445861816, "learning_rate": 1.99713909882334e-05, "loss": 0.5805, "step": 417 }, { "epoch": 0.10579600101240193, "grad_norm": 0.1568954437971115, "learning_rate": 1.9971209719107585e-05, "loss": 0.5896, "step": 418 }, { "epoch": 0.10604910149329284, "grad_norm": 0.15485233068466187, "learning_rate": 1.9971027878353464e-05, "loss": 0.5706, "step": 419 }, { "epoch": 0.10630220197418375, "grad_norm": 0.15614329278469086, "learning_rate": 1.9970845465981466e-05, "loss": 0.598, "step": 420 }, { "epoch": 0.10655530245507466, "grad_norm": 0.15889252722263336, "learning_rate": 1.9970662482002047e-05, "loss": 0.5861, "step": 421 }, { "epoch": 0.10680840293596558, "grad_norm": 0.15649782121181488, "learning_rate": 1.99704789264257e-05, "loss": 0.5865, "step": 422 }, { "epoch": 0.1070615034168565, "grad_norm": 0.15167191624641418, "learning_rate": 1.9970294799262946e-05, "loss": 0.5862, "step": 423 }, { "epoch": 0.10731460389774741, "grad_norm": 0.15284578502178192, "learning_rate": 1.9970110100524343e-05, "loss": 0.5757, "step": 424 }, { "epoch": 0.10756770437863832, "grad_norm": 0.1515786200761795, "learning_rate": 1.996992483022047e-05, "loss": 0.5718, "step": 425 }, { "epoch": 0.10782080485952923, "grad_norm": 0.16168825328350067, "learning_rate": 1.9969738988361963e-05, "loss": 0.5758, "step": 426 }, { "epoch": 0.10807390534042015, "grad_norm": 0.1584496945142746, "learning_rate": 1.9969552574959464e-05, "loss": 0.5663, "step": 427 }, { "epoch": 0.10832700582131106, "grad_norm": 0.15288542211055756, "learning_rate": 1.9969365590023662e-05, "loss": 0.5915, "step": 428 }, { "epoch": 0.10858010630220197, "grad_norm": 0.19142642617225647, "learning_rate": 1.9969178033565278e-05, "loss": 0.5811, "step": 429 }, { "epoch": 0.10883320678309288, "grad_norm": 0.16370010375976562, "learning_rate": 1.996898990559507e-05, "loss": 0.6023, "step": 430 }, { "epoch": 0.1090863072639838, "grad_norm": 0.1556045413017273, "learning_rate": 1.9968801206123815e-05, "loss": 0.5838, "step": 431 }, { "epoch": 0.10933940774487472, "grad_norm": 0.14515145123004913, "learning_rate": 1.9968611935162328e-05, "loss": 0.5954, "step": 432 }, { "epoch": 0.10959250822576563, "grad_norm": 0.15034079551696777, "learning_rate": 1.996842209272147e-05, "loss": 0.5751, "step": 433 }, { "epoch": 0.10984560870665654, "grad_norm": 0.15695899724960327, "learning_rate": 1.9968231678812117e-05, "loss": 0.5727, "step": 434 }, { "epoch": 0.11009870918754745, "grad_norm": 0.15632854402065277, "learning_rate": 1.996804069344519e-05, "loss": 0.5893, "step": 435 }, { "epoch": 0.11035180966843837, "grad_norm": 0.1531461477279663, "learning_rate": 1.996784913663163e-05, "loss": 0.5851, "step": 436 }, { "epoch": 0.11060491014932929, "grad_norm": 0.15281325578689575, "learning_rate": 1.9967657008382425e-05, "loss": 0.5838, "step": 437 }, { "epoch": 0.1108580106302202, "grad_norm": 0.14954638481140137, "learning_rate": 1.996746430870859e-05, "loss": 0.5907, "step": 438 }, { "epoch": 0.1111111111111111, "grad_norm": 0.16380441188812256, "learning_rate": 1.9967271037621167e-05, "loss": 0.5849, "step": 439 }, { "epoch": 0.11136421159200202, "grad_norm": 0.19176936149597168, "learning_rate": 1.996707719513124e-05, "loss": 0.5831, "step": 440 }, { "epoch": 0.11161731207289294, "grad_norm": 0.1506921797990799, "learning_rate": 1.996688278124992e-05, "loss": 0.5423, "step": 441 }, { "epoch": 0.11187041255378385, "grad_norm": 0.15436936914920807, "learning_rate": 1.996668779598835e-05, "loss": 0.5848, "step": 442 }, { "epoch": 0.11212351303467477, "grad_norm": 0.15891426801681519, "learning_rate": 1.9966492239357712e-05, "loss": 0.5739, "step": 443 }, { "epoch": 0.11237661351556567, "grad_norm": 0.15040729939937592, "learning_rate": 1.9966296111369215e-05, "loss": 0.5511, "step": 444 }, { "epoch": 0.11262971399645659, "grad_norm": 0.15116922557353973, "learning_rate": 1.9966099412034104e-05, "loss": 0.5953, "step": 445 }, { "epoch": 0.1128828144773475, "grad_norm": 0.15750816464424133, "learning_rate": 1.996590214136365e-05, "loss": 0.5751, "step": 446 }, { "epoch": 0.11313591495823842, "grad_norm": 0.1517753303050995, "learning_rate": 1.996570429936917e-05, "loss": 0.6068, "step": 447 }, { "epoch": 0.11338901543912934, "grad_norm": 0.14933405816555023, "learning_rate": 1.9965505886062004e-05, "loss": 0.577, "step": 448 }, { "epoch": 0.11364211592002024, "grad_norm": 0.14812275767326355, "learning_rate": 1.996530690145352e-05, "loss": 0.5704, "step": 449 }, { "epoch": 0.11389521640091116, "grad_norm": 0.16272808611392975, "learning_rate": 1.9965107345555133e-05, "loss": 0.5688, "step": 450 }, { "epoch": 0.11414831688180208, "grad_norm": 0.14796628057956696, "learning_rate": 1.996490721837828e-05, "loss": 0.5806, "step": 451 }, { "epoch": 0.11440141736269299, "grad_norm": 0.14997173845767975, "learning_rate": 1.9964706519934432e-05, "loss": 0.5763, "step": 452 }, { "epoch": 0.11465451784358391, "grad_norm": 0.15785318613052368, "learning_rate": 1.99645052502351e-05, "loss": 0.5757, "step": 453 }, { "epoch": 0.11490761832447481, "grad_norm": 0.14751143753528595, "learning_rate": 1.996430340929182e-05, "loss": 0.5592, "step": 454 }, { "epoch": 0.11516071880536573, "grad_norm": 0.14795641601085663, "learning_rate": 1.996410099711616e-05, "loss": 0.5707, "step": 455 }, { "epoch": 0.11541381928625664, "grad_norm": 0.1508340984582901, "learning_rate": 1.9963898013719726e-05, "loss": 0.576, "step": 456 }, { "epoch": 0.11566691976714756, "grad_norm": 0.1608547866344452, "learning_rate": 1.9963694459114155e-05, "loss": 0.5707, "step": 457 }, { "epoch": 0.11592002024803848, "grad_norm": 0.15005795657634735, "learning_rate": 1.9963490333311116e-05, "loss": 0.5692, "step": 458 }, { "epoch": 0.11617312072892938, "grad_norm": 0.14821495115756989, "learning_rate": 1.9963285636322312e-05, "loss": 0.5444, "step": 459 }, { "epoch": 0.1164262212098203, "grad_norm": 0.15709495544433594, "learning_rate": 1.9963080368159476e-05, "loss": 0.5943, "step": 460 }, { "epoch": 0.11667932169071121, "grad_norm": 0.14981617033481598, "learning_rate": 1.996287452883438e-05, "loss": 0.5595, "step": 461 }, { "epoch": 0.11693242217160213, "grad_norm": 0.15191037952899933, "learning_rate": 1.9962668118358814e-05, "loss": 0.5768, "step": 462 }, { "epoch": 0.11718552265249305, "grad_norm": 0.14880812168121338, "learning_rate": 1.9962461136744624e-05, "loss": 0.5714, "step": 463 }, { "epoch": 0.11743862313338395, "grad_norm": 0.17096273601055145, "learning_rate": 1.9962253584003666e-05, "loss": 0.5672, "step": 464 }, { "epoch": 0.11769172361427487, "grad_norm": 0.15059910714626312, "learning_rate": 1.9962045460147843e-05, "loss": 0.5638, "step": 465 }, { "epoch": 0.11794482409516578, "grad_norm": 0.1539696604013443, "learning_rate": 1.9961836765189088e-05, "loss": 0.579, "step": 466 }, { "epoch": 0.1181979245760567, "grad_norm": 0.15227748453617096, "learning_rate": 1.996162749913936e-05, "loss": 0.5598, "step": 467 }, { "epoch": 0.11845102505694761, "grad_norm": 0.144993394613266, "learning_rate": 1.996141766201066e-05, "loss": 0.6033, "step": 468 }, { "epoch": 0.11870412553783852, "grad_norm": 0.15821410715579987, "learning_rate": 1.996120725381502e-05, "loss": 0.6059, "step": 469 }, { "epoch": 0.11895722601872943, "grad_norm": 0.15914399921894073, "learning_rate": 1.9960996274564493e-05, "loss": 0.5961, "step": 470 }, { "epoch": 0.11921032649962035, "grad_norm": 0.1500304937362671, "learning_rate": 1.996078472427118e-05, "loss": 0.5553, "step": 471 }, { "epoch": 0.11946342698051127, "grad_norm": 0.15498086810112, "learning_rate": 1.996057260294721e-05, "loss": 0.5705, "step": 472 }, { "epoch": 0.11971652746140218, "grad_norm": 0.14791083335876465, "learning_rate": 1.996035991060474e-05, "loss": 0.5588, "step": 473 }, { "epoch": 0.11996962794229309, "grad_norm": 0.15489189326763153, "learning_rate": 1.9960146647255962e-05, "loss": 0.58, "step": 474 }, { "epoch": 0.120222728423184, "grad_norm": 0.14758768677711487, "learning_rate": 1.9959932812913108e-05, "loss": 0.5926, "step": 475 }, { "epoch": 0.12047582890407492, "grad_norm": 0.15335911512374878, "learning_rate": 1.9959718407588436e-05, "loss": 0.5965, "step": 476 }, { "epoch": 0.12072892938496584, "grad_norm": 0.16314727067947388, "learning_rate": 1.995950343129423e-05, "loss": 0.5807, "step": 477 }, { "epoch": 0.12098202986585674, "grad_norm": 0.15755096077919006, "learning_rate": 1.995928788404282e-05, "loss": 0.5603, "step": 478 }, { "epoch": 0.12123513034674765, "grad_norm": 0.15094897150993347, "learning_rate": 1.995907176584656e-05, "loss": 0.5649, "step": 479 }, { "epoch": 0.12148823082763857, "grad_norm": 0.1507454514503479, "learning_rate": 1.9958855076717844e-05, "loss": 0.5594, "step": 480 }, { "epoch": 0.12174133130852949, "grad_norm": 0.14879706501960754, "learning_rate": 1.995863781666909e-05, "loss": 0.5627, "step": 481 }, { "epoch": 0.1219944317894204, "grad_norm": 0.14595156908035278, "learning_rate": 1.9958419985712756e-05, "loss": 0.6164, "step": 482 }, { "epoch": 0.12224753227031131, "grad_norm": 0.14950570464134216, "learning_rate": 1.995820158386133e-05, "loss": 0.5731, "step": 483 }, { "epoch": 0.12250063275120222, "grad_norm": 0.15327396988868713, "learning_rate": 1.9957982611127328e-05, "loss": 0.5676, "step": 484 }, { "epoch": 0.12275373323209314, "grad_norm": 0.14594602584838867, "learning_rate": 1.9957763067523304e-05, "loss": 0.5656, "step": 485 }, { "epoch": 0.12300683371298406, "grad_norm": 0.1484612226486206, "learning_rate": 1.9957542953061853e-05, "loss": 0.5828, "step": 486 }, { "epoch": 0.12325993419387497, "grad_norm": 0.15902866423130035, "learning_rate": 1.995732226775558e-05, "loss": 0.5838, "step": 487 }, { "epoch": 0.12351303467476588, "grad_norm": 0.15555420517921448, "learning_rate": 1.9957101011617147e-05, "loss": 0.5901, "step": 488 }, { "epoch": 0.12376613515565679, "grad_norm": 0.1557220220565796, "learning_rate": 1.9956879184659235e-05, "loss": 0.5861, "step": 489 }, { "epoch": 0.12401923563654771, "grad_norm": 0.15559321641921997, "learning_rate": 1.9956656786894558e-05, "loss": 0.5857, "step": 490 }, { "epoch": 0.12427233611743863, "grad_norm": 0.15801867842674255, "learning_rate": 1.995643381833587e-05, "loss": 0.5823, "step": 491 }, { "epoch": 0.12452543659832954, "grad_norm": 0.15406377613544464, "learning_rate": 1.995621027899595e-05, "loss": 0.5894, "step": 492 }, { "epoch": 0.12477853707922044, "grad_norm": 0.1492113322019577, "learning_rate": 1.9955986168887614e-05, "loss": 0.5648, "step": 493 }, { "epoch": 0.12503163756011137, "grad_norm": 0.17829811573028564, "learning_rate": 1.995576148802371e-05, "loss": 0.5827, "step": 494 }, { "epoch": 0.1252847380410023, "grad_norm": 0.20612825453281403, "learning_rate": 1.995553623641712e-05, "loss": 0.5863, "step": 495 }, { "epoch": 0.12553783852189318, "grad_norm": 0.15315592288970947, "learning_rate": 1.9955310414080756e-05, "loss": 0.5823, "step": 496 }, { "epoch": 0.1257909390027841, "grad_norm": 0.15542149543762207, "learning_rate": 1.9955084021027563e-05, "loss": 0.5756, "step": 497 }, { "epoch": 0.126044039483675, "grad_norm": 0.144132599234581, "learning_rate": 1.9954857057270517e-05, "loss": 0.5542, "step": 498 }, { "epoch": 0.12629713996456593, "grad_norm": 0.1674090474843979, "learning_rate": 1.9954629522822636e-05, "loss": 0.5658, "step": 499 }, { "epoch": 0.12655024044545685, "grad_norm": 0.1668354570865631, "learning_rate": 1.995440141769696e-05, "loss": 0.5959, "step": 500 }, { "epoch": 0.12680334092634776, "grad_norm": 0.15888547897338867, "learning_rate": 1.9954172741906566e-05, "loss": 0.5757, "step": 501 }, { "epoch": 0.12705644140723868, "grad_norm": 0.15670830011367798, "learning_rate": 1.9953943495464563e-05, "loss": 0.5624, "step": 502 }, { "epoch": 0.1273095418881296, "grad_norm": 0.15593712031841278, "learning_rate": 1.9953713678384097e-05, "loss": 0.5626, "step": 503 }, { "epoch": 0.1275626423690205, "grad_norm": 0.15130479633808136, "learning_rate": 1.9953483290678334e-05, "loss": 0.582, "step": 504 }, { "epoch": 0.1278157428499114, "grad_norm": 0.1545686274766922, "learning_rate": 1.9953252332360495e-05, "loss": 0.5529, "step": 505 }, { "epoch": 0.12806884333080232, "grad_norm": 0.1504255086183548, "learning_rate": 1.995302080344381e-05, "loss": 0.5847, "step": 506 }, { "epoch": 0.12832194381169323, "grad_norm": 0.15030816197395325, "learning_rate": 1.9952788703941554e-05, "loss": 0.5671, "step": 507 }, { "epoch": 0.12857504429258415, "grad_norm": 0.15274129807949066, "learning_rate": 1.9952556033867036e-05, "loss": 0.6036, "step": 508 }, { "epoch": 0.12882814477347507, "grad_norm": 0.16378158330917358, "learning_rate": 1.995232279323359e-05, "loss": 0.5824, "step": 509 }, { "epoch": 0.12908124525436598, "grad_norm": 0.14647503197193146, "learning_rate": 1.9952088982054592e-05, "loss": 0.5627, "step": 510 }, { "epoch": 0.1293343457352569, "grad_norm": 0.14909817278385162, "learning_rate": 1.9951854600343443e-05, "loss": 0.567, "step": 511 }, { "epoch": 0.12958744621614782, "grad_norm": 0.1562184989452362, "learning_rate": 1.995161964811358e-05, "loss": 0.5692, "step": 512 }, { "epoch": 0.12984054669703873, "grad_norm": 0.15351241827011108, "learning_rate": 1.995138412537847e-05, "loss": 0.6028, "step": 513 }, { "epoch": 0.13009364717792965, "grad_norm": 0.15046747028827667, "learning_rate": 1.9951148032151623e-05, "loss": 0.574, "step": 514 }, { "epoch": 0.13034674765882054, "grad_norm": 0.15114392340183258, "learning_rate": 1.995091136844656e-05, "loss": 0.5647, "step": 515 }, { "epoch": 0.13059984813971146, "grad_norm": 0.1495080143213272, "learning_rate": 1.995067413427686e-05, "loss": 0.5757, "step": 516 }, { "epoch": 0.13085294862060237, "grad_norm": 0.16302980482578278, "learning_rate": 1.9950436329656126e-05, "loss": 0.561, "step": 517 }, { "epoch": 0.1311060491014933, "grad_norm": 0.15612943470478058, "learning_rate": 1.9950197954597976e-05, "loss": 0.5664, "step": 518 }, { "epoch": 0.1313591495823842, "grad_norm": 0.15150874853134155, "learning_rate": 1.994995900911609e-05, "loss": 0.5726, "step": 519 }, { "epoch": 0.13161225006327512, "grad_norm": 0.15911947190761566, "learning_rate": 1.9949719493224156e-05, "loss": 0.5715, "step": 520 }, { "epoch": 0.13186535054416604, "grad_norm": 0.17096064984798431, "learning_rate": 1.994947940693591e-05, "loss": 0.5867, "step": 521 }, { "epoch": 0.13211845102505695, "grad_norm": 0.15780529379844666, "learning_rate": 1.9949238750265114e-05, "loss": 0.573, "step": 522 }, { "epoch": 0.13237155150594787, "grad_norm": 0.14687465131282806, "learning_rate": 1.9948997523225567e-05, "loss": 0.548, "step": 523 }, { "epoch": 0.1326246519868388, "grad_norm": 0.14603029191493988, "learning_rate": 1.9948755725831096e-05, "loss": 0.5669, "step": 524 }, { "epoch": 0.13287775246772968, "grad_norm": 0.14853985607624054, "learning_rate": 1.9948513358095565e-05, "loss": 0.6054, "step": 525 }, { "epoch": 0.1331308529486206, "grad_norm": 0.150251105427742, "learning_rate": 1.9948270420032862e-05, "loss": 0.5661, "step": 526 }, { "epoch": 0.1333839534295115, "grad_norm": 0.16034545004367828, "learning_rate": 1.994802691165692e-05, "loss": 0.5722, "step": 527 }, { "epoch": 0.13363705391040243, "grad_norm": 0.15148800611495972, "learning_rate": 1.99477828329817e-05, "loss": 0.5698, "step": 528 }, { "epoch": 0.13389015439129334, "grad_norm": 0.15985143184661865, "learning_rate": 1.994753818402119e-05, "loss": 0.5695, "step": 529 }, { "epoch": 0.13414325487218426, "grad_norm": 0.1527390331029892, "learning_rate": 1.994729296478942e-05, "loss": 0.5791, "step": 530 }, { "epoch": 0.13439635535307518, "grad_norm": 0.1592749059200287, "learning_rate": 1.994704717530044e-05, "loss": 0.6112, "step": 531 }, { "epoch": 0.1346494558339661, "grad_norm": 0.15453064441680908, "learning_rate": 1.9946800815568347e-05, "loss": 0.5678, "step": 532 }, { "epoch": 0.134902556314857, "grad_norm": 0.15128736197948456, "learning_rate": 1.9946553885607263e-05, "loss": 0.5487, "step": 533 }, { "epoch": 0.13515565679574792, "grad_norm": 0.15509158372879028, "learning_rate": 1.9946306385431344e-05, "loss": 0.5729, "step": 534 }, { "epoch": 0.1354087572766388, "grad_norm": 0.15262316167354584, "learning_rate": 1.994605831505478e-05, "loss": 0.5601, "step": 535 }, { "epoch": 0.13566185775752973, "grad_norm": 0.15524417161941528, "learning_rate": 1.994580967449179e-05, "loss": 0.5639, "step": 536 }, { "epoch": 0.13591495823842065, "grad_norm": 0.15370914340019226, "learning_rate": 1.994556046375663e-05, "loss": 0.5525, "step": 537 }, { "epoch": 0.13616805871931156, "grad_norm": 0.1504281908273697, "learning_rate": 1.994531068286358e-05, "loss": 0.6125, "step": 538 }, { "epoch": 0.13642115920020248, "grad_norm": 0.15541692078113556, "learning_rate": 1.994506033182697e-05, "loss": 0.5826, "step": 539 }, { "epoch": 0.1366742596810934, "grad_norm": 0.1468469351530075, "learning_rate": 1.9944809410661147e-05, "loss": 0.5543, "step": 540 }, { "epoch": 0.1369273601619843, "grad_norm": 0.14687389135360718, "learning_rate": 1.9944557919380492e-05, "loss": 0.5673, "step": 541 }, { "epoch": 0.13718046064287523, "grad_norm": 0.15926910936832428, "learning_rate": 1.994430585799943e-05, "loss": 0.5855, "step": 542 }, { "epoch": 0.13743356112376615, "grad_norm": 0.15011908113956451, "learning_rate": 1.9944053226532408e-05, "loss": 0.5867, "step": 543 }, { "epoch": 0.13768666160465706, "grad_norm": 0.14907962083816528, "learning_rate": 1.9943800024993906e-05, "loss": 0.5923, "step": 544 }, { "epoch": 0.13793976208554795, "grad_norm": 0.15277652442455292, "learning_rate": 1.9943546253398443e-05, "loss": 0.5812, "step": 545 }, { "epoch": 0.13819286256643887, "grad_norm": 0.14851029217243195, "learning_rate": 1.9943291911760564e-05, "loss": 0.5568, "step": 546 }, { "epoch": 0.13844596304732978, "grad_norm": 0.1487397402524948, "learning_rate": 1.9943037000094857e-05, "loss": 0.5823, "step": 547 }, { "epoch": 0.1386990635282207, "grad_norm": 0.14926819503307343, "learning_rate": 1.9942781518415927e-05, "loss": 0.5911, "step": 548 }, { "epoch": 0.13895216400911162, "grad_norm": 0.14806623756885529, "learning_rate": 1.9942525466738423e-05, "loss": 0.6017, "step": 549 }, { "epoch": 0.13920526449000253, "grad_norm": 0.14928856492042542, "learning_rate": 1.9942268845077022e-05, "loss": 0.5873, "step": 550 }, { "epoch": 0.13945836497089345, "grad_norm": 0.14869250357151031, "learning_rate": 1.9942011653446444e-05, "loss": 0.5809, "step": 551 }, { "epoch": 0.13971146545178437, "grad_norm": 0.142907053232193, "learning_rate": 1.9941753891861425e-05, "loss": 0.5781, "step": 552 }, { "epoch": 0.13996456593267528, "grad_norm": 0.14452791213989258, "learning_rate": 1.9941495560336742e-05, "loss": 0.5649, "step": 553 }, { "epoch": 0.14021766641356617, "grad_norm": 0.15031449496746063, "learning_rate": 1.9941236658887207e-05, "loss": 0.5417, "step": 554 }, { "epoch": 0.1404707668944571, "grad_norm": 0.1468997448682785, "learning_rate": 1.9940977187527665e-05, "loss": 0.5621, "step": 555 }, { "epoch": 0.140723867375348, "grad_norm": 0.15059807896614075, "learning_rate": 1.9940717146272988e-05, "loss": 0.5626, "step": 556 }, { "epoch": 0.14097696785623892, "grad_norm": 0.14762374758720398, "learning_rate": 1.994045653513808e-05, "loss": 0.5476, "step": 557 }, { "epoch": 0.14123006833712984, "grad_norm": 0.15583224594593048, "learning_rate": 1.9940195354137888e-05, "loss": 0.5522, "step": 558 }, { "epoch": 0.14148316881802075, "grad_norm": 0.1424167901277542, "learning_rate": 1.993993360328738e-05, "loss": 0.5484, "step": 559 }, { "epoch": 0.14173626929891167, "grad_norm": 0.18569441139698029, "learning_rate": 1.9939671282601564e-05, "loss": 0.5813, "step": 560 }, { "epoch": 0.1419893697798026, "grad_norm": 0.15066631138324738, "learning_rate": 1.993940839209548e-05, "loss": 0.5771, "step": 561 }, { "epoch": 0.1422424702606935, "grad_norm": 0.14673936367034912, "learning_rate": 1.9939144931784198e-05, "loss": 0.5665, "step": 562 }, { "epoch": 0.14249557074158442, "grad_norm": 0.1482633650302887, "learning_rate": 1.9938880901682817e-05, "loss": 0.5797, "step": 563 }, { "epoch": 0.1427486712224753, "grad_norm": 0.1495112031698227, "learning_rate": 1.993861630180648e-05, "loss": 0.562, "step": 564 }, { "epoch": 0.14300177170336623, "grad_norm": 0.15051144361495972, "learning_rate": 1.9938351132170348e-05, "loss": 0.5414, "step": 565 }, { "epoch": 0.14325487218425714, "grad_norm": 0.1446501761674881, "learning_rate": 1.993808539278963e-05, "loss": 0.534, "step": 566 }, { "epoch": 0.14350797266514806, "grad_norm": 0.14814580976963043, "learning_rate": 1.9937819083679557e-05, "loss": 0.5824, "step": 567 }, { "epoch": 0.14376107314603898, "grad_norm": 0.16074736416339874, "learning_rate": 1.9937552204855395e-05, "loss": 0.5501, "step": 568 }, { "epoch": 0.1440141736269299, "grad_norm": 0.1435040831565857, "learning_rate": 1.9937284756332445e-05, "loss": 0.5444, "step": 569 }, { "epoch": 0.1442672741078208, "grad_norm": 0.1509789079427719, "learning_rate": 1.993701673812604e-05, "loss": 0.5827, "step": 570 }, { "epoch": 0.14452037458871173, "grad_norm": 0.16079916059970856, "learning_rate": 1.9936748150251546e-05, "loss": 0.5837, "step": 571 }, { "epoch": 0.14477347506960264, "grad_norm": 0.15574325621128082, "learning_rate": 1.993647899272436e-05, "loss": 0.5711, "step": 572 }, { "epoch": 0.14502657555049356, "grad_norm": 0.14947296679019928, "learning_rate": 1.9936209265559908e-05, "loss": 0.5493, "step": 573 }, { "epoch": 0.14527967603138445, "grad_norm": 0.1521664559841156, "learning_rate": 1.9935938968773656e-05, "loss": 0.5855, "step": 574 }, { "epoch": 0.14553277651227536, "grad_norm": 0.14926466345787048, "learning_rate": 1.99356681023811e-05, "loss": 0.5916, "step": 575 }, { "epoch": 0.14578587699316628, "grad_norm": 0.14865681529045105, "learning_rate": 1.993539666639777e-05, "loss": 0.5578, "step": 576 }, { "epoch": 0.1460389774740572, "grad_norm": 0.15686194598674774, "learning_rate": 1.993512466083922e-05, "loss": 0.5774, "step": 577 }, { "epoch": 0.1462920779549481, "grad_norm": 0.14981764554977417, "learning_rate": 1.993485208572105e-05, "loss": 0.5951, "step": 578 }, { "epoch": 0.14654517843583903, "grad_norm": 0.15296462178230286, "learning_rate": 1.9934578941058883e-05, "loss": 0.5828, "step": 579 }, { "epoch": 0.14679827891672995, "grad_norm": 0.149521604180336, "learning_rate": 1.9934305226868383e-05, "loss": 0.593, "step": 580 }, { "epoch": 0.14705137939762086, "grad_norm": 0.14766348898410797, "learning_rate": 1.9934030943165235e-05, "loss": 0.5868, "step": 581 }, { "epoch": 0.14730447987851178, "grad_norm": 0.16118831932544708, "learning_rate": 1.9933756089965163e-05, "loss": 0.5953, "step": 582 }, { "epoch": 0.1475575803594027, "grad_norm": 0.14837884902954102, "learning_rate": 1.993348066728393e-05, "loss": 0.549, "step": 583 }, { "epoch": 0.14781068084029358, "grad_norm": 0.16255691647529602, "learning_rate": 1.993320467513732e-05, "loss": 0.5629, "step": 584 }, { "epoch": 0.1480637813211845, "grad_norm": 0.148208349943161, "learning_rate": 1.993292811354116e-05, "loss": 0.5518, "step": 585 }, { "epoch": 0.14831688180207542, "grad_norm": 0.15558750927448273, "learning_rate": 1.9932650982511296e-05, "loss": 0.5983, "step": 586 }, { "epoch": 0.14856998228296633, "grad_norm": 0.15297412872314453, "learning_rate": 1.9932373282063623e-05, "loss": 0.5745, "step": 587 }, { "epoch": 0.14882308276385725, "grad_norm": 0.1489609032869339, "learning_rate": 1.993209501221406e-05, "loss": 0.5471, "step": 588 }, { "epoch": 0.14907618324474817, "grad_norm": 0.1520264446735382, "learning_rate": 1.9931816172978556e-05, "loss": 0.5614, "step": 589 }, { "epoch": 0.14932928372563908, "grad_norm": 0.2518834173679352, "learning_rate": 1.99315367643731e-05, "loss": 0.5713, "step": 590 }, { "epoch": 0.14958238420653, "grad_norm": 0.21436260640621185, "learning_rate": 1.993125678641371e-05, "loss": 0.5649, "step": 591 }, { "epoch": 0.14983548468742092, "grad_norm": 0.14589393138885498, "learning_rate": 1.9930976239116436e-05, "loss": 0.5597, "step": 592 }, { "epoch": 0.15008858516831183, "grad_norm": 0.16369180381298065, "learning_rate": 1.9930695122497354e-05, "loss": 0.5536, "step": 593 }, { "epoch": 0.15034168564920272, "grad_norm": 0.148586705327034, "learning_rate": 1.9930413436572592e-05, "loss": 0.5754, "step": 594 }, { "epoch": 0.15059478613009364, "grad_norm": 0.14744792878627777, "learning_rate": 1.993013118135829e-05, "loss": 0.5879, "step": 595 }, { "epoch": 0.15084788661098456, "grad_norm": 0.15067927539348602, "learning_rate": 1.9929848356870632e-05, "loss": 0.5458, "step": 596 }, { "epoch": 0.15110098709187547, "grad_norm": 0.1520412117242813, "learning_rate": 1.992956496312583e-05, "loss": 0.5677, "step": 597 }, { "epoch": 0.1513540875727664, "grad_norm": 0.1419924944639206, "learning_rate": 1.9929281000140134e-05, "loss": 0.5646, "step": 598 }, { "epoch": 0.1516071880536573, "grad_norm": 0.1541939228773117, "learning_rate": 1.992899646792982e-05, "loss": 0.5952, "step": 599 }, { "epoch": 0.15186028853454822, "grad_norm": 0.16430619359016418, "learning_rate": 1.9928711366511198e-05, "loss": 0.5945, "step": 600 }, { "epoch": 0.15211338901543914, "grad_norm": 0.15097370743751526, "learning_rate": 1.9928425695900618e-05, "loss": 0.5813, "step": 601 }, { "epoch": 0.15236648949633005, "grad_norm": 0.1600654274225235, "learning_rate": 1.9928139456114452e-05, "loss": 0.5904, "step": 602 }, { "epoch": 0.15261958997722094, "grad_norm": 0.15649215877056122, "learning_rate": 1.992785264716911e-05, "loss": 0.5657, "step": 603 }, { "epoch": 0.15287269045811186, "grad_norm": 0.15534299612045288, "learning_rate": 1.9927565269081035e-05, "loss": 0.5694, "step": 604 }, { "epoch": 0.15312579093900278, "grad_norm": 0.15252354741096497, "learning_rate": 1.9927277321866704e-05, "loss": 0.5811, "step": 605 }, { "epoch": 0.1533788914198937, "grad_norm": 0.14859551191329956, "learning_rate": 1.992698880554262e-05, "loss": 0.57, "step": 606 }, { "epoch": 0.1536319919007846, "grad_norm": 0.1567496955394745, "learning_rate": 1.9926699720125325e-05, "loss": 0.5831, "step": 607 }, { "epoch": 0.15388509238167553, "grad_norm": 0.14938685297966003, "learning_rate": 1.9926410065631396e-05, "loss": 0.5716, "step": 608 }, { "epoch": 0.15413819286256644, "grad_norm": 0.15608009696006775, "learning_rate": 1.9926119842077433e-05, "loss": 0.5598, "step": 609 }, { "epoch": 0.15439129334345736, "grad_norm": 0.14903607964515686, "learning_rate": 1.9925829049480074e-05, "loss": 0.583, "step": 610 }, { "epoch": 0.15464439382434828, "grad_norm": 0.5127801299095154, "learning_rate": 1.992553768785599e-05, "loss": 0.5919, "step": 611 }, { "epoch": 0.1548974943052392, "grad_norm": 0.14690221846103668, "learning_rate": 1.9925245757221885e-05, "loss": 0.5601, "step": 612 }, { "epoch": 0.15515059478613008, "grad_norm": 0.1513320654630661, "learning_rate": 1.9924953257594494e-05, "loss": 0.5698, "step": 613 }, { "epoch": 0.155403695267021, "grad_norm": 0.15853796899318695, "learning_rate": 1.9924660188990587e-05, "loss": 0.5994, "step": 614 }, { "epoch": 0.1556567957479119, "grad_norm": 0.15242621302604675, "learning_rate": 1.9924366551426965e-05, "loss": 0.5915, "step": 615 }, { "epoch": 0.15590989622880283, "grad_norm": 0.15101855993270874, "learning_rate": 1.992407234492046e-05, "loss": 0.5847, "step": 616 }, { "epoch": 0.15616299670969375, "grad_norm": 0.15262015163898468, "learning_rate": 1.992377756948794e-05, "loss": 0.5834, "step": 617 }, { "epoch": 0.15641609719058466, "grad_norm": 0.1572626531124115, "learning_rate": 1.99234822251463e-05, "loss": 0.574, "step": 618 }, { "epoch": 0.15666919767147558, "grad_norm": 0.1495586633682251, "learning_rate": 1.9923186311912474e-05, "loss": 0.5489, "step": 619 }, { "epoch": 0.1569222981523665, "grad_norm": 0.14650045335292816, "learning_rate": 1.9922889829803428e-05, "loss": 0.5674, "step": 620 }, { "epoch": 0.1571753986332574, "grad_norm": 0.1512831598520279, "learning_rate": 1.9922592778836156e-05, "loss": 0.5705, "step": 621 }, { "epoch": 0.15742849911414833, "grad_norm": 0.1478670984506607, "learning_rate": 1.9922295159027692e-05, "loss": 0.5543, "step": 622 }, { "epoch": 0.15768159959503922, "grad_norm": 0.14747242629528046, "learning_rate": 1.992199697039509e-05, "loss": 0.5556, "step": 623 }, { "epoch": 0.15793470007593013, "grad_norm": 0.15013568103313446, "learning_rate": 1.9921698212955447e-05, "loss": 0.5663, "step": 624 }, { "epoch": 0.15818780055682105, "grad_norm": 0.16112448275089264, "learning_rate": 1.9921398886725897e-05, "loss": 0.5755, "step": 625 }, { "epoch": 0.15844090103771197, "grad_norm": 0.15023668110370636, "learning_rate": 1.9921098991723592e-05, "loss": 0.5578, "step": 626 }, { "epoch": 0.15869400151860288, "grad_norm": 0.16092228889465332, "learning_rate": 1.9920798527965724e-05, "loss": 0.5613, "step": 627 }, { "epoch": 0.1589471019994938, "grad_norm": 0.1508610099554062, "learning_rate": 1.9920497495469526e-05, "loss": 0.5534, "step": 628 }, { "epoch": 0.15920020248038472, "grad_norm": 0.151445209980011, "learning_rate": 1.9920195894252248e-05, "loss": 0.5834, "step": 629 }, { "epoch": 0.15945330296127563, "grad_norm": 0.14864689111709595, "learning_rate": 1.991989372433118e-05, "loss": 0.5356, "step": 630 }, { "epoch": 0.15970640344216655, "grad_norm": 0.15049447119235992, "learning_rate": 1.9919590985723653e-05, "loss": 0.5648, "step": 631 }, { "epoch": 0.15995950392305747, "grad_norm": 0.20305077731609344, "learning_rate": 1.991928767844701e-05, "loss": 0.5546, "step": 632 }, { "epoch": 0.16021260440394836, "grad_norm": 0.15501059591770172, "learning_rate": 1.991898380251865e-05, "loss": 0.5523, "step": 633 }, { "epoch": 0.16046570488483927, "grad_norm": 0.16190926730632782, "learning_rate": 1.9918679357955987e-05, "loss": 0.5666, "step": 634 }, { "epoch": 0.1607188053657302, "grad_norm": 0.15073847770690918, "learning_rate": 1.991837434477648e-05, "loss": 0.5442, "step": 635 }, { "epoch": 0.1609719058466211, "grad_norm": 0.15194927155971527, "learning_rate": 1.9918068762997607e-05, "loss": 0.568, "step": 636 }, { "epoch": 0.16122500632751202, "grad_norm": 0.1503583937883377, "learning_rate": 1.991776261263689e-05, "loss": 0.5732, "step": 637 }, { "epoch": 0.16147810680840294, "grad_norm": 0.16482821106910706, "learning_rate": 1.9917455893711883e-05, "loss": 0.6229, "step": 638 }, { "epoch": 0.16173120728929385, "grad_norm": 0.14933739602565765, "learning_rate": 1.9917148606240167e-05, "loss": 0.5671, "step": 639 }, { "epoch": 0.16198430777018477, "grad_norm": 0.15639838576316833, "learning_rate": 1.991684075023936e-05, "loss": 0.5761, "step": 640 }, { "epoch": 0.1622374082510757, "grad_norm": 0.1661742925643921, "learning_rate": 1.9916532325727105e-05, "loss": 0.5833, "step": 641 }, { "epoch": 0.1624905087319666, "grad_norm": 0.14919333159923553, "learning_rate": 1.9916223332721088e-05, "loss": 0.564, "step": 642 }, { "epoch": 0.1627436092128575, "grad_norm": 0.15366952121257782, "learning_rate": 1.9915913771239022e-05, "loss": 0.5893, "step": 643 }, { "epoch": 0.1629967096937484, "grad_norm": 0.15431956946849823, "learning_rate": 1.9915603641298654e-05, "loss": 0.5827, "step": 644 }, { "epoch": 0.16324981017463933, "grad_norm": 0.15347975492477417, "learning_rate": 1.991529294291776e-05, "loss": 0.5685, "step": 645 }, { "epoch": 0.16350291065553024, "grad_norm": 0.1506311148405075, "learning_rate": 1.991498167611416e-05, "loss": 0.5558, "step": 646 }, { "epoch": 0.16375601113642116, "grad_norm": 0.15514300763607025, "learning_rate": 1.991466984090569e-05, "loss": 0.5563, "step": 647 }, { "epoch": 0.16400911161731208, "grad_norm": 0.24941538274288177, "learning_rate": 1.9914357437310228e-05, "loss": 0.5965, "step": 648 }, { "epoch": 0.164262212098203, "grad_norm": 0.1533135324716568, "learning_rate": 1.9914044465345688e-05, "loss": 0.5596, "step": 649 }, { "epoch": 0.1645153125790939, "grad_norm": 0.1488742232322693, "learning_rate": 1.9913730925030005e-05, "loss": 0.589, "step": 650 }, { "epoch": 0.16476841305998483, "grad_norm": 0.1442538946866989, "learning_rate": 1.991341681638116e-05, "loss": 0.572, "step": 651 }, { "epoch": 0.16502151354087571, "grad_norm": 0.15130048990249634, "learning_rate": 1.9913102139417157e-05, "loss": 0.5927, "step": 652 }, { "epoch": 0.16527461402176663, "grad_norm": 0.15967319905757904, "learning_rate": 1.9912786894156038e-05, "loss": 0.5635, "step": 653 }, { "epoch": 0.16552771450265755, "grad_norm": 0.14939410984516144, "learning_rate": 1.9912471080615873e-05, "loss": 0.5761, "step": 654 }, { "epoch": 0.16578081498354846, "grad_norm": 0.15269893407821655, "learning_rate": 1.9912154698814765e-05, "loss": 0.5579, "step": 655 }, { "epoch": 0.16603391546443938, "grad_norm": 0.1510346531867981, "learning_rate": 1.9911837748770857e-05, "loss": 0.5591, "step": 656 }, { "epoch": 0.1662870159453303, "grad_norm": 0.14931762218475342, "learning_rate": 1.9911520230502316e-05, "loss": 0.5706, "step": 657 }, { "epoch": 0.1665401164262212, "grad_norm": 0.15608814358711243, "learning_rate": 1.9911202144027343e-05, "loss": 0.5527, "step": 658 }, { "epoch": 0.16679321690711213, "grad_norm": 0.1536053717136383, "learning_rate": 1.9910883489364178e-05, "loss": 0.5994, "step": 659 }, { "epoch": 0.16704631738800305, "grad_norm": 0.14757314324378967, "learning_rate": 1.9910564266531084e-05, "loss": 0.5724, "step": 660 }, { "epoch": 0.16729941786889396, "grad_norm": 0.1864614188671112, "learning_rate": 1.9910244475546362e-05, "loss": 0.5572, "step": 661 }, { "epoch": 0.16755251834978485, "grad_norm": 0.1520412564277649, "learning_rate": 1.990992411642835e-05, "loss": 0.5817, "step": 662 }, { "epoch": 0.16780561883067577, "grad_norm": 0.15760135650634766, "learning_rate": 1.9909603189195405e-05, "loss": 0.5514, "step": 663 }, { "epoch": 0.16805871931156668, "grad_norm": 0.14856499433517456, "learning_rate": 1.9909281693865935e-05, "loss": 0.5622, "step": 664 }, { "epoch": 0.1683118197924576, "grad_norm": 0.14778557419776917, "learning_rate": 1.9908959630458362e-05, "loss": 0.5571, "step": 665 }, { "epoch": 0.16856492027334852, "grad_norm": 0.1456754058599472, "learning_rate": 1.9908636998991156e-05, "loss": 0.5677, "step": 666 }, { "epoch": 0.16881802075423943, "grad_norm": 0.1472209244966507, "learning_rate": 1.9908313799482807e-05, "loss": 0.5751, "step": 667 }, { "epoch": 0.16907112123513035, "grad_norm": 0.15274663269519806, "learning_rate": 1.9907990031951847e-05, "loss": 0.5367, "step": 668 }, { "epoch": 0.16932422171602127, "grad_norm": 0.14899861812591553, "learning_rate": 1.9907665696416835e-05, "loss": 0.5721, "step": 669 }, { "epoch": 0.16957732219691218, "grad_norm": 0.1461005061864853, "learning_rate": 1.9907340792896362e-05, "loss": 0.5664, "step": 670 }, { "epoch": 0.1698304226778031, "grad_norm": 0.1556117981672287, "learning_rate": 1.9907015321409063e-05, "loss": 0.5747, "step": 671 }, { "epoch": 0.170083523158694, "grad_norm": 0.15157921612262726, "learning_rate": 1.9906689281973585e-05, "loss": 0.5702, "step": 672 }, { "epoch": 0.1703366236395849, "grad_norm": 0.16483035683631897, "learning_rate": 1.990636267460863e-05, "loss": 0.594, "step": 673 }, { "epoch": 0.17058972412047582, "grad_norm": 0.1515565663576126, "learning_rate": 1.9906035499332917e-05, "loss": 0.5831, "step": 674 }, { "epoch": 0.17084282460136674, "grad_norm": 0.1511337012052536, "learning_rate": 1.9905707756165198e-05, "loss": 0.5637, "step": 675 }, { "epoch": 0.17109592508225765, "grad_norm": 0.15434476733207703, "learning_rate": 1.9905379445124267e-05, "loss": 0.5996, "step": 676 }, { "epoch": 0.17134902556314857, "grad_norm": 0.14820517599582672, "learning_rate": 1.9905050566228945e-05, "loss": 0.5669, "step": 677 }, { "epoch": 0.1716021260440395, "grad_norm": 0.15169605612754822, "learning_rate": 1.9904721119498084e-05, "loss": 0.5638, "step": 678 }, { "epoch": 0.1718552265249304, "grad_norm": 0.1516595184803009, "learning_rate": 1.990439110495057e-05, "loss": 0.5726, "step": 679 }, { "epoch": 0.17210832700582132, "grad_norm": 0.15837806463241577, "learning_rate": 1.9904060522605327e-05, "loss": 0.5851, "step": 680 }, { "epoch": 0.17236142748671224, "grad_norm": 0.15037870407104492, "learning_rate": 1.9903729372481302e-05, "loss": 0.5679, "step": 681 }, { "epoch": 0.17261452796760313, "grad_norm": 0.1476915329694748, "learning_rate": 1.9903397654597482e-05, "loss": 0.5509, "step": 682 }, { "epoch": 0.17286762844849404, "grad_norm": 0.15054358541965485, "learning_rate": 1.990306536897288e-05, "loss": 0.5645, "step": 683 }, { "epoch": 0.17312072892938496, "grad_norm": 0.1502549946308136, "learning_rate": 1.9902732515626546e-05, "loss": 0.6015, "step": 684 }, { "epoch": 0.17337382941027588, "grad_norm": 0.1518615037202835, "learning_rate": 1.9902399094577566e-05, "loss": 0.5778, "step": 685 }, { "epoch": 0.1736269298911668, "grad_norm": 0.1569449007511139, "learning_rate": 1.990206510584505e-05, "loss": 0.5819, "step": 686 }, { "epoch": 0.1738800303720577, "grad_norm": 0.1500752717256546, "learning_rate": 1.9901730549448147e-05, "loss": 0.5498, "step": 687 }, { "epoch": 0.17413313085294863, "grad_norm": 0.15543176233768463, "learning_rate": 1.9901395425406033e-05, "loss": 0.5501, "step": 688 }, { "epoch": 0.17438623133383954, "grad_norm": 0.15402987599372864, "learning_rate": 1.9901059733737923e-05, "loss": 0.5868, "step": 689 }, { "epoch": 0.17463933181473046, "grad_norm": 0.15203896164894104, "learning_rate": 1.9900723474463063e-05, "loss": 0.5802, "step": 690 }, { "epoch": 0.17489243229562138, "grad_norm": 0.14999938011169434, "learning_rate": 1.9900386647600726e-05, "loss": 0.5744, "step": 691 }, { "epoch": 0.17514553277651226, "grad_norm": 0.14887036383152008, "learning_rate": 1.9900049253170226e-05, "loss": 0.565, "step": 692 }, { "epoch": 0.17539863325740318, "grad_norm": 0.15507274866104126, "learning_rate": 1.9899711291190898e-05, "loss": 0.5731, "step": 693 }, { "epoch": 0.1756517337382941, "grad_norm": 0.14773182570934296, "learning_rate": 1.9899372761682128e-05, "loss": 0.5971, "step": 694 }, { "epoch": 0.175904834219185, "grad_norm": 0.17984947562217712, "learning_rate": 1.9899033664663312e-05, "loss": 0.5553, "step": 695 }, { "epoch": 0.17615793470007593, "grad_norm": 0.14991788566112518, "learning_rate": 1.9898694000153896e-05, "loss": 0.5433, "step": 696 }, { "epoch": 0.17641103518096685, "grad_norm": 0.1493489146232605, "learning_rate": 1.989835376817335e-05, "loss": 0.554, "step": 697 }, { "epoch": 0.17666413566185776, "grad_norm": 0.1503385454416275, "learning_rate": 1.9898012968741178e-05, "loss": 0.5822, "step": 698 }, { "epoch": 0.17691723614274868, "grad_norm": 0.1620415300130844, "learning_rate": 1.989767160187692e-05, "loss": 0.5697, "step": 699 }, { "epoch": 0.1771703366236396, "grad_norm": 0.15772275626659393, "learning_rate": 1.9897329667600143e-05, "loss": 0.5615, "step": 700 }, { "epoch": 0.17742343710453048, "grad_norm": 0.15086622536182404, "learning_rate": 1.9896987165930455e-05, "loss": 0.5746, "step": 701 }, { "epoch": 0.1776765375854214, "grad_norm": 0.14750191569328308, "learning_rate": 1.9896644096887483e-05, "loss": 0.591, "step": 702 }, { "epoch": 0.17792963806631232, "grad_norm": 0.18089640140533447, "learning_rate": 1.98963004604909e-05, "loss": 0.5773, "step": 703 }, { "epoch": 0.17818273854720323, "grad_norm": 0.15691858530044556, "learning_rate": 1.9895956256760403e-05, "loss": 0.6003, "step": 704 }, { "epoch": 0.17843583902809415, "grad_norm": 0.15080849826335907, "learning_rate": 1.9895611485715726e-05, "loss": 0.5561, "step": 705 }, { "epoch": 0.17868893950898507, "grad_norm": 0.1499396711587906, "learning_rate": 1.9895266147376634e-05, "loss": 0.5699, "step": 706 }, { "epoch": 0.17894203998987598, "grad_norm": 0.14726616442203522, "learning_rate": 1.989492024176292e-05, "loss": 0.5649, "step": 707 }, { "epoch": 0.1791951404707669, "grad_norm": 0.14629290997982025, "learning_rate": 1.9894573768894423e-05, "loss": 0.5599, "step": 708 }, { "epoch": 0.17944824095165782, "grad_norm": 0.14857056736946106, "learning_rate": 1.9894226728790998e-05, "loss": 0.5825, "step": 709 }, { "epoch": 0.17970134143254873, "grad_norm": 0.1579265296459198, "learning_rate": 1.9893879121472546e-05, "loss": 0.5486, "step": 710 }, { "epoch": 0.17995444191343962, "grad_norm": 0.1460217982530594, "learning_rate": 1.9893530946958987e-05, "loss": 0.5746, "step": 711 }, { "epoch": 0.18020754239433054, "grad_norm": 0.1500665843486786, "learning_rate": 1.989318220527029e-05, "loss": 0.5593, "step": 712 }, { "epoch": 0.18046064287522146, "grad_norm": 0.16921649873256683, "learning_rate": 1.9892832896426438e-05, "loss": 0.5542, "step": 713 }, { "epoch": 0.18071374335611237, "grad_norm": 0.14597009122371674, "learning_rate": 1.9892483020447463e-05, "loss": 0.5825, "step": 714 }, { "epoch": 0.1809668438370033, "grad_norm": 0.15014439821243286, "learning_rate": 1.989213257735342e-05, "loss": 0.5663, "step": 715 }, { "epoch": 0.1812199443178942, "grad_norm": 0.15399925410747528, "learning_rate": 1.9891781567164404e-05, "loss": 0.5428, "step": 716 }, { "epoch": 0.18147304479878512, "grad_norm": 0.1528647243976593, "learning_rate": 1.9891429989900527e-05, "loss": 0.5704, "step": 717 }, { "epoch": 0.18172614527967604, "grad_norm": 0.15238700807094574, "learning_rate": 1.9891077845581957e-05, "loss": 0.5716, "step": 718 }, { "epoch": 0.18197924576056695, "grad_norm": 0.15280528366565704, "learning_rate": 1.989072513422887e-05, "loss": 0.607, "step": 719 }, { "epoch": 0.18223234624145787, "grad_norm": 0.14587153494358063, "learning_rate": 1.98903718558615e-05, "loss": 0.5656, "step": 720 }, { "epoch": 0.18248544672234876, "grad_norm": 0.1593136191368103, "learning_rate": 1.989001801050008e-05, "loss": 0.5719, "step": 721 }, { "epoch": 0.18273854720323968, "grad_norm": 0.14155927300453186, "learning_rate": 1.9889663598164915e-05, "loss": 0.595, "step": 722 }, { "epoch": 0.1829916476841306, "grad_norm": 0.15472820401191711, "learning_rate": 1.9889308618876317e-05, "loss": 0.5775, "step": 723 }, { "epoch": 0.1832447481650215, "grad_norm": 0.15889547765254974, "learning_rate": 1.9888953072654624e-05, "loss": 0.5711, "step": 724 }, { "epoch": 0.18349784864591243, "grad_norm": 0.14979174733161926, "learning_rate": 1.9888596959520234e-05, "loss": 0.5955, "step": 725 }, { "epoch": 0.18375094912680334, "grad_norm": 0.17373642325401306, "learning_rate": 1.9888240279493557e-05, "loss": 0.5716, "step": 726 }, { "epoch": 0.18400404960769426, "grad_norm": 0.14577554166316986, "learning_rate": 1.9887883032595037e-05, "loss": 0.5856, "step": 727 }, { "epoch": 0.18425715008858518, "grad_norm": 0.14496983587741852, "learning_rate": 1.988752521884516e-05, "loss": 0.5905, "step": 728 }, { "epoch": 0.1845102505694761, "grad_norm": 0.15008766949176788, "learning_rate": 1.9887166838264434e-05, "loss": 0.5736, "step": 729 }, { "epoch": 0.184763351050367, "grad_norm": 0.14624342322349548, "learning_rate": 1.9886807890873404e-05, "loss": 0.5416, "step": 730 }, { "epoch": 0.1850164515312579, "grad_norm": 0.1502377986907959, "learning_rate": 1.9886448376692656e-05, "loss": 0.5535, "step": 731 }, { "epoch": 0.1852695520121488, "grad_norm": 0.14800770580768585, "learning_rate": 1.988608829574279e-05, "loss": 0.5827, "step": 732 }, { "epoch": 0.18552265249303973, "grad_norm": 0.1491914838552475, "learning_rate": 1.9885727648044453e-05, "loss": 0.5701, "step": 733 }, { "epoch": 0.18577575297393065, "grad_norm": 0.15261493623256683, "learning_rate": 1.9885366433618322e-05, "loss": 0.5806, "step": 734 }, { "epoch": 0.18602885345482156, "grad_norm": 0.14738062024116516, "learning_rate": 1.9885004652485103e-05, "loss": 0.5835, "step": 735 }, { "epoch": 0.18628195393571248, "grad_norm": 0.15116408467292786, "learning_rate": 1.988464230466553e-05, "loss": 0.5753, "step": 736 }, { "epoch": 0.1865350544166034, "grad_norm": 0.1526593565940857, "learning_rate": 1.988427939018039e-05, "loss": 0.5727, "step": 737 }, { "epoch": 0.1867881548974943, "grad_norm": 0.14892533421516418, "learning_rate": 1.9883915909050472e-05, "loss": 0.5694, "step": 738 }, { "epoch": 0.18704125537838523, "grad_norm": 0.14990945160388947, "learning_rate": 1.9883551861296626e-05, "loss": 0.5663, "step": 739 }, { "epoch": 0.18729435585927615, "grad_norm": 0.14417888224124908, "learning_rate": 1.9883187246939717e-05, "loss": 0.5517, "step": 740 }, { "epoch": 0.18754745634016703, "grad_norm": 0.14893411099910736, "learning_rate": 1.9882822066000644e-05, "loss": 0.5656, "step": 741 }, { "epoch": 0.18780055682105795, "grad_norm": 0.15740130841732025, "learning_rate": 1.9882456318500347e-05, "loss": 0.5595, "step": 742 }, { "epoch": 0.18805365730194887, "grad_norm": 0.15170875191688538, "learning_rate": 1.9882090004459794e-05, "loss": 0.5772, "step": 743 }, { "epoch": 0.18830675778283978, "grad_norm": 0.1540304720401764, "learning_rate": 1.9881723123899984e-05, "loss": 0.582, "step": 744 }, { "epoch": 0.1885598582637307, "grad_norm": 0.15017232298851013, "learning_rate": 1.9881355676841947e-05, "loss": 0.5557, "step": 745 }, { "epoch": 0.18881295874462162, "grad_norm": 0.1520720273256302, "learning_rate": 1.988098766330675e-05, "loss": 0.5682, "step": 746 }, { "epoch": 0.18906605922551253, "grad_norm": 0.14698469638824463, "learning_rate": 1.9880619083315495e-05, "loss": 0.5703, "step": 747 }, { "epoch": 0.18931915970640345, "grad_norm": 0.16416220366954803, "learning_rate": 1.98802499368893e-05, "loss": 0.5467, "step": 748 }, { "epoch": 0.18957226018729437, "grad_norm": 0.14882448315620422, "learning_rate": 1.9879880224049337e-05, "loss": 0.5549, "step": 749 }, { "epoch": 0.18982536066818528, "grad_norm": 0.15382224321365356, "learning_rate": 1.98795099448168e-05, "loss": 0.5716, "step": 750 }, { "epoch": 0.19007846114907617, "grad_norm": 0.14566577970981598, "learning_rate": 1.9879139099212912e-05, "loss": 0.5848, "step": 751 }, { "epoch": 0.1903315616299671, "grad_norm": 0.1462111920118332, "learning_rate": 1.987876768725894e-05, "loss": 0.5762, "step": 752 }, { "epoch": 0.190584662110858, "grad_norm": 0.14414694905281067, "learning_rate": 1.9878395708976164e-05, "loss": 0.5293, "step": 753 }, { "epoch": 0.19083776259174892, "grad_norm": 0.2745702564716339, "learning_rate": 1.987802316438592e-05, "loss": 0.5625, "step": 754 }, { "epoch": 0.19109086307263984, "grad_norm": 0.14843228459358215, "learning_rate": 1.9877650053509566e-05, "loss": 0.5498, "step": 755 }, { "epoch": 0.19134396355353075, "grad_norm": 0.1457221359014511, "learning_rate": 1.9877276376368483e-05, "loss": 0.5439, "step": 756 }, { "epoch": 0.19159706403442167, "grad_norm": 0.15008778870105743, "learning_rate": 1.98769021329841e-05, "loss": 0.5675, "step": 757 }, { "epoch": 0.1918501645153126, "grad_norm": 0.1420939564704895, "learning_rate": 1.987652732337787e-05, "loss": 0.5727, "step": 758 }, { "epoch": 0.1921032649962035, "grad_norm": 0.1531420350074768, "learning_rate": 1.9876151947571273e-05, "loss": 0.5537, "step": 759 }, { "epoch": 0.1923563654770944, "grad_norm": 0.1537618339061737, "learning_rate": 1.9875776005585838e-05, "loss": 0.5251, "step": 760 }, { "epoch": 0.1926094659579853, "grad_norm": 0.1491342931985855, "learning_rate": 1.9875399497443114e-05, "loss": 0.5642, "step": 761 }, { "epoch": 0.19286256643887623, "grad_norm": 0.14750774204730988, "learning_rate": 1.9875022423164686e-05, "loss": 0.5551, "step": 762 }, { "epoch": 0.19311566691976714, "grad_norm": 0.14625151455402374, "learning_rate": 1.9874644782772167e-05, "loss": 0.5609, "step": 763 }, { "epoch": 0.19336876740065806, "grad_norm": 0.15412314236164093, "learning_rate": 1.9874266576287215e-05, "loss": 0.5612, "step": 764 }, { "epoch": 0.19362186788154898, "grad_norm": 0.1581011265516281, "learning_rate": 1.98738878037315e-05, "loss": 0.5554, "step": 765 }, { "epoch": 0.1938749683624399, "grad_norm": 0.14776253700256348, "learning_rate": 1.9873508465126744e-05, "loss": 0.5755, "step": 766 }, { "epoch": 0.1941280688433308, "grad_norm": 0.15599747002124786, "learning_rate": 1.987312856049469e-05, "loss": 0.5744, "step": 767 }, { "epoch": 0.19438116932422173, "grad_norm": 0.15313883125782013, "learning_rate": 1.9872748089857123e-05, "loss": 0.5595, "step": 768 }, { "epoch": 0.19463426980511264, "grad_norm": 0.1496153026819229, "learning_rate": 1.9872367053235847e-05, "loss": 0.5607, "step": 769 }, { "epoch": 0.19488737028600353, "grad_norm": 0.15209335088729858, "learning_rate": 1.987198545065271e-05, "loss": 0.5883, "step": 770 }, { "epoch": 0.19514047076689445, "grad_norm": 0.14694365859031677, "learning_rate": 1.9871603282129588e-05, "loss": 0.5692, "step": 771 }, { "epoch": 0.19539357124778536, "grad_norm": 0.1486019343137741, "learning_rate": 1.9871220547688392e-05, "loss": 0.5753, "step": 772 }, { "epoch": 0.19564667172867628, "grad_norm": 0.15324336290359497, "learning_rate": 1.987083724735106e-05, "loss": 0.5873, "step": 773 }, { "epoch": 0.1958997722095672, "grad_norm": 0.14359359443187714, "learning_rate": 1.9870453381139565e-05, "loss": 0.5718, "step": 774 }, { "epoch": 0.1961528726904581, "grad_norm": 0.1478722244501114, "learning_rate": 1.987006894907592e-05, "loss": 0.5813, "step": 775 }, { "epoch": 0.19640597317134903, "grad_norm": 0.1535591036081314, "learning_rate": 1.9869683951182154e-05, "loss": 0.5818, "step": 776 }, { "epoch": 0.19665907365223995, "grad_norm": 0.1460777223110199, "learning_rate": 1.9869298387480345e-05, "loss": 0.5696, "step": 777 }, { "epoch": 0.19691217413313086, "grad_norm": 0.15877650678157806, "learning_rate": 1.9868912257992593e-05, "loss": 0.5593, "step": 778 }, { "epoch": 0.19716527461402178, "grad_norm": 0.14716073870658875, "learning_rate": 1.986852556274104e-05, "loss": 0.5495, "step": 779 }, { "epoch": 0.19741837509491267, "grad_norm": 0.14650224149227142, "learning_rate": 1.9868138301747845e-05, "loss": 0.5565, "step": 780 }, { "epoch": 0.19767147557580358, "grad_norm": 0.15673044323921204, "learning_rate": 1.9867750475035216e-05, "loss": 0.5877, "step": 781 }, { "epoch": 0.1979245760566945, "grad_norm": 0.15667545795440674, "learning_rate": 1.9867362082625386e-05, "loss": 0.5733, "step": 782 }, { "epoch": 0.19817767653758542, "grad_norm": 0.18397942185401917, "learning_rate": 1.9866973124540617e-05, "loss": 0.5653, "step": 783 }, { "epoch": 0.19843077701847633, "grad_norm": 0.15077096223831177, "learning_rate": 1.9866583600803208e-05, "loss": 0.55, "step": 784 }, { "epoch": 0.19868387749936725, "grad_norm": 0.14992989599704742, "learning_rate": 1.9866193511435492e-05, "loss": 0.5574, "step": 785 }, { "epoch": 0.19893697798025817, "grad_norm": 0.14983786642551422, "learning_rate": 1.9865802856459832e-05, "loss": 0.5891, "step": 786 }, { "epoch": 0.19919007846114908, "grad_norm": 0.15408428013324738, "learning_rate": 1.9865411635898623e-05, "loss": 0.5957, "step": 787 }, { "epoch": 0.19944317894204, "grad_norm": 0.15763822197914124, "learning_rate": 1.9865019849774287e-05, "loss": 0.5684, "step": 788 }, { "epoch": 0.19969627942293092, "grad_norm": 0.14608003199100494, "learning_rate": 1.9864627498109292e-05, "loss": 0.5338, "step": 789 }, { "epoch": 0.1999493799038218, "grad_norm": 0.152947336435318, "learning_rate": 1.9864234580926127e-05, "loss": 0.573, "step": 790 }, { "epoch": 0.20020248038471272, "grad_norm": 0.15223805606365204, "learning_rate": 1.9863841098247318e-05, "loss": 0.5662, "step": 791 }, { "epoch": 0.20045558086560364, "grad_norm": 0.181410551071167, "learning_rate": 1.9863447050095425e-05, "loss": 0.5738, "step": 792 }, { "epoch": 0.20070868134649456, "grad_norm": 0.14844632148742676, "learning_rate": 1.986305243649303e-05, "loss": 0.571, "step": 793 }, { "epoch": 0.20096178182738547, "grad_norm": 0.15224933624267578, "learning_rate": 1.9862657257462764e-05, "loss": 0.575, "step": 794 }, { "epoch": 0.2012148823082764, "grad_norm": 0.14983992278575897, "learning_rate": 1.9862261513027278e-05, "loss": 0.5459, "step": 795 }, { "epoch": 0.2014679827891673, "grad_norm": 0.15010716021060944, "learning_rate": 1.986186520320926e-05, "loss": 0.5684, "step": 796 }, { "epoch": 0.20172108327005822, "grad_norm": 0.15364646911621094, "learning_rate": 1.9861468328031427e-05, "loss": 0.5923, "step": 797 }, { "epoch": 0.20197418375094914, "grad_norm": 0.24683018028736115, "learning_rate": 1.9861070887516538e-05, "loss": 0.5493, "step": 798 }, { "epoch": 0.20222728423184005, "grad_norm": 0.15156035125255585, "learning_rate": 1.986067288168737e-05, "loss": 0.5553, "step": 799 }, { "epoch": 0.20248038471273094, "grad_norm": 0.1583360731601715, "learning_rate": 1.9860274310566743e-05, "loss": 0.5715, "step": 800 }, { "epoch": 0.20273348519362186, "grad_norm": 0.14934858679771423, "learning_rate": 1.9859875174177507e-05, "loss": 0.5931, "step": 801 }, { "epoch": 0.20298658567451278, "grad_norm": 0.16253922879695892, "learning_rate": 1.985947547254254e-05, "loss": 0.6016, "step": 802 }, { "epoch": 0.2032396861554037, "grad_norm": 0.14571331441402435, "learning_rate": 1.9859075205684763e-05, "loss": 0.5308, "step": 803 }, { "epoch": 0.2034927866362946, "grad_norm": 0.1456516981124878, "learning_rate": 1.9858674373627113e-05, "loss": 0.581, "step": 804 }, { "epoch": 0.20374588711718553, "grad_norm": 0.15310640633106232, "learning_rate": 1.9858272976392574e-05, "loss": 0.5679, "step": 805 }, { "epoch": 0.20399898759807644, "grad_norm": 0.16352048516273499, "learning_rate": 1.985787101400416e-05, "loss": 0.5891, "step": 806 }, { "epoch": 0.20425208807896736, "grad_norm": 0.1480332612991333, "learning_rate": 1.985746848648491e-05, "loss": 0.5655, "step": 807 }, { "epoch": 0.20450518855985828, "grad_norm": 0.1917625367641449, "learning_rate": 1.98570653938579e-05, "loss": 0.5484, "step": 808 }, { "epoch": 0.20475828904074916, "grad_norm": 0.150888592004776, "learning_rate": 1.9856661736146244e-05, "loss": 0.5603, "step": 809 }, { "epoch": 0.20501138952164008, "grad_norm": 0.1545407623052597, "learning_rate": 1.9856257513373077e-05, "loss": 0.5379, "step": 810 }, { "epoch": 0.205264490002531, "grad_norm": 0.1561100035905838, "learning_rate": 1.9855852725561575e-05, "loss": 0.5782, "step": 811 }, { "epoch": 0.2055175904834219, "grad_norm": 0.14499087631702423, "learning_rate": 1.9855447372734943e-05, "loss": 0.5654, "step": 812 }, { "epoch": 0.20577069096431283, "grad_norm": 0.16896207630634308, "learning_rate": 1.9855041454916416e-05, "loss": 0.5453, "step": 813 }, { "epoch": 0.20602379144520375, "grad_norm": 0.15073281526565552, "learning_rate": 1.9854634972129272e-05, "loss": 0.5577, "step": 814 }, { "epoch": 0.20627689192609466, "grad_norm": 0.17091280221939087, "learning_rate": 1.9854227924396804e-05, "loss": 0.5716, "step": 815 }, { "epoch": 0.20652999240698558, "grad_norm": 0.14324995875358582, "learning_rate": 1.985382031174236e-05, "loss": 0.5561, "step": 816 }, { "epoch": 0.2067830928878765, "grad_norm": 0.15467405319213867, "learning_rate": 1.9853412134189292e-05, "loss": 0.5706, "step": 817 }, { "epoch": 0.2070361933687674, "grad_norm": 0.14770232141017914, "learning_rate": 1.985300339176101e-05, "loss": 0.5569, "step": 818 }, { "epoch": 0.2072892938496583, "grad_norm": 0.1511012762784958, "learning_rate": 1.9852594084480946e-05, "loss": 0.5587, "step": 819 }, { "epoch": 0.20754239433054922, "grad_norm": 0.1511695683002472, "learning_rate": 1.985218421237256e-05, "loss": 0.57, "step": 820 }, { "epoch": 0.20779549481144013, "grad_norm": 0.1476137489080429, "learning_rate": 1.9851773775459356e-05, "loss": 0.5687, "step": 821 }, { "epoch": 0.20804859529233105, "grad_norm": 0.14952382445335388, "learning_rate": 1.985136277376486e-05, "loss": 0.5763, "step": 822 }, { "epoch": 0.20830169577322197, "grad_norm": 0.15390071272850037, "learning_rate": 1.9850951207312628e-05, "loss": 0.5937, "step": 823 }, { "epoch": 0.20855479625411288, "grad_norm": 0.14283983409404755, "learning_rate": 1.9850539076126262e-05, "loss": 0.5638, "step": 824 }, { "epoch": 0.2088078967350038, "grad_norm": 0.1518326848745346, "learning_rate": 1.9850126380229386e-05, "loss": 0.5858, "step": 825 }, { "epoch": 0.20906099721589472, "grad_norm": 0.15040968358516693, "learning_rate": 1.984971311964566e-05, "loss": 0.5581, "step": 826 }, { "epoch": 0.20931409769678563, "grad_norm": 0.15228532254695892, "learning_rate": 1.9849299294398773e-05, "loss": 0.5566, "step": 827 }, { "epoch": 0.20956719817767655, "grad_norm": 0.15812933444976807, "learning_rate": 1.9848884904512453e-05, "loss": 0.5818, "step": 828 }, { "epoch": 0.20982029865856744, "grad_norm": 0.14517231285572052, "learning_rate": 1.984846995001045e-05, "loss": 0.5614, "step": 829 }, { "epoch": 0.21007339913945836, "grad_norm": 0.14441753923892975, "learning_rate": 1.9848054430916558e-05, "loss": 0.5678, "step": 830 }, { "epoch": 0.21032649962034927, "grad_norm": 0.14744623005390167, "learning_rate": 1.9847638347254594e-05, "loss": 0.5865, "step": 831 }, { "epoch": 0.2105796001012402, "grad_norm": 0.1468220204114914, "learning_rate": 1.9847221699048417e-05, "loss": 0.5514, "step": 832 }, { "epoch": 0.2108327005821311, "grad_norm": 0.14899007976055145, "learning_rate": 1.9846804486321902e-05, "loss": 0.5589, "step": 833 }, { "epoch": 0.21108580106302202, "grad_norm": 0.15209320187568665, "learning_rate": 1.9846386709098977e-05, "loss": 0.5523, "step": 834 }, { "epoch": 0.21133890154391294, "grad_norm": 0.14415016770362854, "learning_rate": 1.984596836740359e-05, "loss": 0.5686, "step": 835 }, { "epoch": 0.21159200202480385, "grad_norm": 0.14713110029697418, "learning_rate": 1.9845549461259715e-05, "loss": 0.5911, "step": 836 }, { "epoch": 0.21184510250569477, "grad_norm": 0.15125249326229095, "learning_rate": 1.9845129990691382e-05, "loss": 0.5706, "step": 837 }, { "epoch": 0.2120982029865857, "grad_norm": 0.14549441635608673, "learning_rate": 1.9844709955722627e-05, "loss": 0.5641, "step": 838 }, { "epoch": 0.21235130346747658, "grad_norm": 0.15025334060192108, "learning_rate": 1.9844289356377534e-05, "loss": 0.5769, "step": 839 }, { "epoch": 0.2126044039483675, "grad_norm": 0.14516359567642212, "learning_rate": 1.9843868192680213e-05, "loss": 0.5508, "step": 840 }, { "epoch": 0.2128575044292584, "grad_norm": 0.15259525179862976, "learning_rate": 1.9843446464654814e-05, "loss": 0.5719, "step": 841 }, { "epoch": 0.21311060491014933, "grad_norm": 0.1537070870399475, "learning_rate": 1.9843024172325504e-05, "loss": 0.5914, "step": 842 }, { "epoch": 0.21336370539104024, "grad_norm": 0.15067122876644135, "learning_rate": 1.98426013157165e-05, "loss": 0.5737, "step": 843 }, { "epoch": 0.21361680587193116, "grad_norm": 0.15000620484352112, "learning_rate": 1.984217789485204e-05, "loss": 0.59, "step": 844 }, { "epoch": 0.21386990635282208, "grad_norm": 0.1379261314868927, "learning_rate": 1.9841753909756406e-05, "loss": 0.5331, "step": 845 }, { "epoch": 0.214123006833713, "grad_norm": 0.3226446509361267, "learning_rate": 1.984132936045389e-05, "loss": 0.5666, "step": 846 }, { "epoch": 0.2143761073146039, "grad_norm": 0.1511106640100479, "learning_rate": 1.9840904246968837e-05, "loss": 0.5392, "step": 847 }, { "epoch": 0.21462920779549483, "grad_norm": 0.1527481973171234, "learning_rate": 1.9840478569325624e-05, "loss": 0.5539, "step": 848 }, { "epoch": 0.21488230827638571, "grad_norm": 0.15316209197044373, "learning_rate": 1.9840052327548642e-05, "loss": 0.5653, "step": 849 }, { "epoch": 0.21513540875727663, "grad_norm": 0.14784926176071167, "learning_rate": 1.9839625521662338e-05, "loss": 0.5627, "step": 850 }, { "epoch": 0.21538850923816755, "grad_norm": 0.14797787368297577, "learning_rate": 1.9839198151691172e-05, "loss": 0.5637, "step": 851 }, { "epoch": 0.21564160971905846, "grad_norm": 0.1469213217496872, "learning_rate": 1.983877021765965e-05, "loss": 0.5445, "step": 852 }, { "epoch": 0.21589471019994938, "grad_norm": 0.14689572155475616, "learning_rate": 1.98383417195923e-05, "loss": 0.5447, "step": 853 }, { "epoch": 0.2161478106808403, "grad_norm": 0.1469879001379013, "learning_rate": 1.983791265751369e-05, "loss": 0.5541, "step": 854 }, { "epoch": 0.2164009111617312, "grad_norm": 0.1490246057510376, "learning_rate": 1.9837483031448414e-05, "loss": 0.5866, "step": 855 }, { "epoch": 0.21665401164262213, "grad_norm": 0.15119101107120514, "learning_rate": 1.9837052841421106e-05, "loss": 0.564, "step": 856 }, { "epoch": 0.21690711212351305, "grad_norm": 0.15075601637363434, "learning_rate": 1.9836622087456422e-05, "loss": 0.5776, "step": 857 }, { "epoch": 0.21716021260440393, "grad_norm": 0.20193101465702057, "learning_rate": 1.9836190769579063e-05, "loss": 0.5576, "step": 858 }, { "epoch": 0.21741331308529485, "grad_norm": 0.16276125609874725, "learning_rate": 1.983575888781375e-05, "loss": 0.5525, "step": 859 }, { "epoch": 0.21766641356618577, "grad_norm": 0.14671862125396729, "learning_rate": 1.9835326442185247e-05, "loss": 0.5494, "step": 860 }, { "epoch": 0.21791951404707668, "grad_norm": 0.14973184466362, "learning_rate": 1.9834893432718338e-05, "loss": 0.563, "step": 861 }, { "epoch": 0.2181726145279676, "grad_norm": 0.14972767233848572, "learning_rate": 1.9834459859437856e-05, "loss": 0.5496, "step": 862 }, { "epoch": 0.21842571500885852, "grad_norm": 0.14641425013542175, "learning_rate": 1.9834025722368646e-05, "loss": 0.5531, "step": 863 }, { "epoch": 0.21867881548974943, "grad_norm": 0.14714324474334717, "learning_rate": 1.9833591021535604e-05, "loss": 0.5565, "step": 864 }, { "epoch": 0.21893191597064035, "grad_norm": 0.16813084483146667, "learning_rate": 1.983315575696365e-05, "loss": 0.5525, "step": 865 }, { "epoch": 0.21918501645153127, "grad_norm": 0.1511027067899704, "learning_rate": 1.9832719928677734e-05, "loss": 0.5575, "step": 866 }, { "epoch": 0.21943811693242218, "grad_norm": 0.14805647730827332, "learning_rate": 1.983228353670284e-05, "loss": 0.5616, "step": 867 }, { "epoch": 0.21969121741331307, "grad_norm": 0.14975033700466156, "learning_rate": 1.983184658106399e-05, "loss": 0.5771, "step": 868 }, { "epoch": 0.219944317894204, "grad_norm": 0.15038229525089264, "learning_rate": 1.9831409061786228e-05, "loss": 0.5907, "step": 869 }, { "epoch": 0.2201974183750949, "grad_norm": 0.1614668369293213, "learning_rate": 1.983097097889464e-05, "loss": 0.58, "step": 870 }, { "epoch": 0.22045051885598582, "grad_norm": 0.15763644874095917, "learning_rate": 1.9830532332414343e-05, "loss": 0.567, "step": 871 }, { "epoch": 0.22070361933687674, "grad_norm": 0.15375757217407227, "learning_rate": 1.9830093122370476e-05, "loss": 0.5407, "step": 872 }, { "epoch": 0.22095671981776766, "grad_norm": 0.14617526531219482, "learning_rate": 1.9829653348788228e-05, "loss": 0.5578, "step": 873 }, { "epoch": 0.22120982029865857, "grad_norm": 0.14947310090065002, "learning_rate": 1.98292130116928e-05, "loss": 0.5344, "step": 874 }, { "epoch": 0.2214629207795495, "grad_norm": 0.149801105260849, "learning_rate": 1.982877211110944e-05, "loss": 0.5652, "step": 875 }, { "epoch": 0.2217160212604404, "grad_norm": 0.1476932018995285, "learning_rate": 1.9828330647063424e-05, "loss": 0.5811, "step": 876 }, { "epoch": 0.22196912174133132, "grad_norm": 0.14732202887535095, "learning_rate": 1.9827888619580065e-05, "loss": 0.5533, "step": 877 }, { "epoch": 0.2222222222222222, "grad_norm": 0.14460918307304382, "learning_rate": 1.9827446028684695e-05, "loss": 0.5354, "step": 878 }, { "epoch": 0.22247532270311313, "grad_norm": 0.14462372660636902, "learning_rate": 1.982700287440269e-05, "loss": 0.5588, "step": 879 }, { "epoch": 0.22272842318400404, "grad_norm": 0.15274891257286072, "learning_rate": 1.9826559156759458e-05, "loss": 0.5396, "step": 880 }, { "epoch": 0.22298152366489496, "grad_norm": 0.14966580271720886, "learning_rate": 1.9826114875780434e-05, "loss": 0.5699, "step": 881 }, { "epoch": 0.22323462414578588, "grad_norm": 0.15385785698890686, "learning_rate": 1.9825670031491086e-05, "loss": 0.5526, "step": 882 }, { "epoch": 0.2234877246266768, "grad_norm": 0.15709258615970612, "learning_rate": 1.9825224623916917e-05, "loss": 0.5922, "step": 883 }, { "epoch": 0.2237408251075677, "grad_norm": 0.1467486023902893, "learning_rate": 1.9824778653083463e-05, "loss": 0.5793, "step": 884 }, { "epoch": 0.22399392558845863, "grad_norm": 0.14842136204242706, "learning_rate": 1.982433211901629e-05, "loss": 0.5616, "step": 885 }, { "epoch": 0.22424702606934954, "grad_norm": 0.1539982259273529, "learning_rate": 1.9823885021740995e-05, "loss": 0.5709, "step": 886 }, { "epoch": 0.22450012655024046, "grad_norm": 0.14926017820835114, "learning_rate": 1.9823437361283213e-05, "loss": 0.5449, "step": 887 }, { "epoch": 0.22475322703113135, "grad_norm": 0.14725536108016968, "learning_rate": 1.9822989137668603e-05, "loss": 0.6158, "step": 888 }, { "epoch": 0.22500632751202226, "grad_norm": 0.14215263724327087, "learning_rate": 1.9822540350922865e-05, "loss": 0.5622, "step": 889 }, { "epoch": 0.22525942799291318, "grad_norm": 0.16503040492534637, "learning_rate": 1.9822091001071724e-05, "loss": 0.536, "step": 890 }, { "epoch": 0.2255125284738041, "grad_norm": 0.1484951227903366, "learning_rate": 1.982164108814094e-05, "loss": 0.57, "step": 891 }, { "epoch": 0.225765628954695, "grad_norm": 0.15657170116901398, "learning_rate": 1.9821190612156307e-05, "loss": 0.5483, "step": 892 }, { "epoch": 0.22601872943558593, "grad_norm": 0.1487235128879547, "learning_rate": 1.982073957314365e-05, "loss": 0.5811, "step": 893 }, { "epoch": 0.22627182991647685, "grad_norm": 0.15387752652168274, "learning_rate": 1.9820287971128822e-05, "loss": 0.5595, "step": 894 }, { "epoch": 0.22652493039736776, "grad_norm": 0.15158569812774658, "learning_rate": 1.981983580613772e-05, "loss": 0.5584, "step": 895 }, { "epoch": 0.22677803087825868, "grad_norm": 0.15643306076526642, "learning_rate": 1.9819383078196258e-05, "loss": 0.5429, "step": 896 }, { "epoch": 0.2270311313591496, "grad_norm": 0.15127789974212646, "learning_rate": 1.9818929787330396e-05, "loss": 0.5938, "step": 897 }, { "epoch": 0.22728423184004048, "grad_norm": 0.1479039490222931, "learning_rate": 1.9818475933566116e-05, "loss": 0.5891, "step": 898 }, { "epoch": 0.2275373323209314, "grad_norm": 0.1658899486064911, "learning_rate": 1.981802151692944e-05, "loss": 0.597, "step": 899 }, { "epoch": 0.22779043280182232, "grad_norm": 0.16984491050243378, "learning_rate": 1.9817566537446415e-05, "loss": 0.5894, "step": 900 }, { "epoch": 0.22804353328271323, "grad_norm": 0.1491304188966751, "learning_rate": 1.9817110995143127e-05, "loss": 0.5737, "step": 901 }, { "epoch": 0.22829663376360415, "grad_norm": 0.14824657142162323, "learning_rate": 1.981665489004569e-05, "loss": 0.5786, "step": 902 }, { "epoch": 0.22854973424449507, "grad_norm": 0.14434686303138733, "learning_rate": 1.9816198222180252e-05, "loss": 0.5582, "step": 903 }, { "epoch": 0.22880283472538598, "grad_norm": 0.145100399851799, "learning_rate": 1.981574099157299e-05, "loss": 0.5673, "step": 904 }, { "epoch": 0.2290559352062769, "grad_norm": 0.14965581893920898, "learning_rate": 1.9815283198250125e-05, "loss": 0.5773, "step": 905 }, { "epoch": 0.22930903568716782, "grad_norm": 0.1608322560787201, "learning_rate": 1.9814824842237888e-05, "loss": 0.5536, "step": 906 }, { "epoch": 0.2295621361680587, "grad_norm": 0.1505739390850067, "learning_rate": 1.9814365923562563e-05, "loss": 0.5733, "step": 907 }, { "epoch": 0.22981523664894962, "grad_norm": 0.15027667582035065, "learning_rate": 1.981390644225046e-05, "loss": 0.5668, "step": 908 }, { "epoch": 0.23006833712984054, "grad_norm": 0.14859870076179504, "learning_rate": 1.981344639832792e-05, "loss": 0.5559, "step": 909 }, { "epoch": 0.23032143761073146, "grad_norm": 0.15014663338661194, "learning_rate": 1.9812985791821314e-05, "loss": 0.5742, "step": 910 }, { "epoch": 0.23057453809162237, "grad_norm": 0.14658929407596588, "learning_rate": 1.9812524622757047e-05, "loss": 0.5456, "step": 911 }, { "epoch": 0.2308276385725133, "grad_norm": 0.14573732018470764, "learning_rate": 1.981206289116156e-05, "loss": 0.5387, "step": 912 }, { "epoch": 0.2310807390534042, "grad_norm": 0.1504942774772644, "learning_rate": 1.981160059706132e-05, "loss": 0.5549, "step": 913 }, { "epoch": 0.23133383953429512, "grad_norm": 0.14284364879131317, "learning_rate": 1.9811137740482825e-05, "loss": 0.529, "step": 914 }, { "epoch": 0.23158694001518604, "grad_norm": 0.15270258486270905, "learning_rate": 1.9810674321452624e-05, "loss": 0.5676, "step": 915 }, { "epoch": 0.23184004049607695, "grad_norm": 0.1552465260028839, "learning_rate": 1.981021033999727e-05, "loss": 0.5408, "step": 916 }, { "epoch": 0.23209314097696784, "grad_norm": 0.15350569784641266, "learning_rate": 1.9809745796143368e-05, "loss": 0.5783, "step": 917 }, { "epoch": 0.23234624145785876, "grad_norm": 0.16560080647468567, "learning_rate": 1.980928068991755e-05, "loss": 0.5625, "step": 918 }, { "epoch": 0.23259934193874968, "grad_norm": 0.1464039832353592, "learning_rate": 1.9808815021346474e-05, "loss": 0.5525, "step": 919 }, { "epoch": 0.2328524424196406, "grad_norm": 0.14381320774555206, "learning_rate": 1.9808348790456845e-05, "loss": 0.5513, "step": 920 }, { "epoch": 0.2331055429005315, "grad_norm": 0.14749474823474884, "learning_rate": 1.980788199727538e-05, "loss": 0.5706, "step": 921 }, { "epoch": 0.23335864338142243, "grad_norm": 0.1491667926311493, "learning_rate": 1.980741464182885e-05, "loss": 0.5641, "step": 922 }, { "epoch": 0.23361174386231334, "grad_norm": 0.1418030709028244, "learning_rate": 1.980694672414404e-05, "loss": 0.527, "step": 923 }, { "epoch": 0.23386484434320426, "grad_norm": 0.14927972853183746, "learning_rate": 1.980647824424778e-05, "loss": 0.543, "step": 924 }, { "epoch": 0.23411794482409518, "grad_norm": 0.15179072320461273, "learning_rate": 1.980600920216692e-05, "loss": 0.5694, "step": 925 }, { "epoch": 0.2343710453049861, "grad_norm": 0.15006226301193237, "learning_rate": 1.9805539597928356e-05, "loss": 0.5692, "step": 926 }, { "epoch": 0.23462414578587698, "grad_norm": 0.14651387929916382, "learning_rate": 1.9805069431559007e-05, "loss": 0.5385, "step": 927 }, { "epoch": 0.2348772462667679, "grad_norm": 0.1494535505771637, "learning_rate": 1.9804598703085825e-05, "loss": 0.5311, "step": 928 }, { "epoch": 0.2351303467476588, "grad_norm": 0.15092428028583527, "learning_rate": 1.98041274125358e-05, "loss": 0.5597, "step": 929 }, { "epoch": 0.23538344722854973, "grad_norm": 0.14304505288600922, "learning_rate": 1.9803655559935943e-05, "loss": 0.5712, "step": 930 }, { "epoch": 0.23563654770944065, "grad_norm": 0.1441313475370407, "learning_rate": 1.980318314531331e-05, "loss": 0.5584, "step": 931 }, { "epoch": 0.23588964819033156, "grad_norm": 0.14732512831687927, "learning_rate": 1.9802710168694984e-05, "loss": 0.542, "step": 932 }, { "epoch": 0.23614274867122248, "grad_norm": 0.14828568696975708, "learning_rate": 1.9802236630108077e-05, "loss": 0.5549, "step": 933 }, { "epoch": 0.2363958491521134, "grad_norm": 0.14553432166576385, "learning_rate": 1.9801762529579737e-05, "loss": 0.5616, "step": 934 }, { "epoch": 0.2366489496330043, "grad_norm": 0.15267543494701385, "learning_rate": 1.9801287867137143e-05, "loss": 0.5649, "step": 935 }, { "epoch": 0.23690205011389523, "grad_norm": 0.14400067925453186, "learning_rate": 1.9800812642807508e-05, "loss": 0.5277, "step": 936 }, { "epoch": 0.23715515059478612, "grad_norm": 0.15066170692443848, "learning_rate": 1.9800336856618073e-05, "loss": 0.592, "step": 937 }, { "epoch": 0.23740825107567703, "grad_norm": 0.15186914801597595, "learning_rate": 1.9799860508596116e-05, "loss": 0.5745, "step": 938 }, { "epoch": 0.23766135155656795, "grad_norm": 0.15037445724010468, "learning_rate": 1.979938359876894e-05, "loss": 0.549, "step": 939 }, { "epoch": 0.23791445203745887, "grad_norm": 0.14689569175243378, "learning_rate": 1.9798906127163892e-05, "loss": 0.5535, "step": 940 }, { "epoch": 0.23816755251834978, "grad_norm": 0.15130499005317688, "learning_rate": 1.9798428093808343e-05, "loss": 0.5915, "step": 941 }, { "epoch": 0.2384206529992407, "grad_norm": 0.14659197628498077, "learning_rate": 1.9797949498729692e-05, "loss": 0.5625, "step": 942 }, { "epoch": 0.23867375348013162, "grad_norm": 0.14219814538955688, "learning_rate": 1.9797470341955383e-05, "loss": 0.5488, "step": 943 }, { "epoch": 0.23892685396102253, "grad_norm": 0.14325040578842163, "learning_rate": 1.9796990623512885e-05, "loss": 0.5544, "step": 944 }, { "epoch": 0.23917995444191345, "grad_norm": 0.14300473034381866, "learning_rate": 1.979651034342969e-05, "loss": 0.5794, "step": 945 }, { "epoch": 0.23943305492280437, "grad_norm": 0.14769800007343292, "learning_rate": 1.9796029501733343e-05, "loss": 0.5599, "step": 946 }, { "epoch": 0.23968615540369526, "grad_norm": 0.1453614979982376, "learning_rate": 1.9795548098451404e-05, "loss": 0.5685, "step": 947 }, { "epoch": 0.23993925588458617, "grad_norm": 0.15104912221431732, "learning_rate": 1.9795066133611468e-05, "loss": 0.5482, "step": 948 }, { "epoch": 0.2401923563654771, "grad_norm": 0.14418837428092957, "learning_rate": 1.9794583607241168e-05, "loss": 0.5535, "step": 949 }, { "epoch": 0.240445456846368, "grad_norm": 0.1435074359178543, "learning_rate": 1.979410051936817e-05, "loss": 0.5507, "step": 950 }, { "epoch": 0.24069855732725892, "grad_norm": 0.14348246157169342, "learning_rate": 1.979361687002016e-05, "loss": 0.5475, "step": 951 }, { "epoch": 0.24095165780814984, "grad_norm": 0.16258108615875244, "learning_rate": 1.9793132659224875e-05, "loss": 0.5258, "step": 952 }, { "epoch": 0.24120475828904075, "grad_norm": 0.14690545201301575, "learning_rate": 1.9792647887010066e-05, "loss": 0.5423, "step": 953 }, { "epoch": 0.24145785876993167, "grad_norm": 0.15458691120147705, "learning_rate": 1.9792162553403527e-05, "loss": 0.5876, "step": 954 }, { "epoch": 0.2417109592508226, "grad_norm": 0.14689841866493225, "learning_rate": 1.979167665843308e-05, "loss": 0.5349, "step": 955 }, { "epoch": 0.24196405973171348, "grad_norm": 0.14967821538448334, "learning_rate": 1.9791190202126578e-05, "loss": 0.5568, "step": 956 }, { "epoch": 0.2422171602126044, "grad_norm": 0.1518346518278122, "learning_rate": 1.9790703184511916e-05, "loss": 0.5653, "step": 957 }, { "epoch": 0.2424702606934953, "grad_norm": 0.14848634600639343, "learning_rate": 1.9790215605617007e-05, "loss": 0.5578, "step": 958 }, { "epoch": 0.24272336117438623, "grad_norm": 0.14639200270175934, "learning_rate": 1.97897274654698e-05, "loss": 0.5557, "step": 959 }, { "epoch": 0.24297646165527714, "grad_norm": 0.15468524396419525, "learning_rate": 1.9789238764098292e-05, "loss": 0.5588, "step": 960 }, { "epoch": 0.24322956213616806, "grad_norm": 0.1442486047744751, "learning_rate": 1.9788749501530488e-05, "loss": 0.5539, "step": 961 }, { "epoch": 0.24348266261705898, "grad_norm": 0.1456901878118515, "learning_rate": 1.9788259677794436e-05, "loss": 0.5666, "step": 962 }, { "epoch": 0.2437357630979499, "grad_norm": 0.1506296843290329, "learning_rate": 1.9787769292918222e-05, "loss": 0.5503, "step": 963 }, { "epoch": 0.2439888635788408, "grad_norm": 0.14512263238430023, "learning_rate": 1.9787278346929956e-05, "loss": 0.5555, "step": 964 }, { "epoch": 0.24424196405973173, "grad_norm": 0.14852070808410645, "learning_rate": 1.9786786839857785e-05, "loss": 0.5746, "step": 965 }, { "epoch": 0.24449506454062261, "grad_norm": 0.14244824647903442, "learning_rate": 1.9786294771729886e-05, "loss": 0.5293, "step": 966 }, { "epoch": 0.24474816502151353, "grad_norm": 0.16617433726787567, "learning_rate": 1.9785802142574464e-05, "loss": 0.6001, "step": 967 }, { "epoch": 0.24500126550240445, "grad_norm": 0.15356621146202087, "learning_rate": 1.9785308952419764e-05, "loss": 0.5785, "step": 968 }, { "epoch": 0.24525436598329536, "grad_norm": 0.17322207987308502, "learning_rate": 1.9784815201294058e-05, "loss": 0.5471, "step": 969 }, { "epoch": 0.24550746646418628, "grad_norm": 0.14599476754665375, "learning_rate": 1.9784320889225655e-05, "loss": 0.5515, "step": 970 }, { "epoch": 0.2457605669450772, "grad_norm": 0.15090207755565643, "learning_rate": 1.978382601624289e-05, "loss": 0.5684, "step": 971 }, { "epoch": 0.2460136674259681, "grad_norm": 0.1482008546590805, "learning_rate": 1.9783330582374128e-05, "loss": 0.5457, "step": 972 }, { "epoch": 0.24626676790685903, "grad_norm": 0.17851801216602325, "learning_rate": 1.9782834587647782e-05, "loss": 0.5695, "step": 973 }, { "epoch": 0.24651986838774995, "grad_norm": 0.14859318733215332, "learning_rate": 1.9782338032092282e-05, "loss": 0.5883, "step": 974 }, { "epoch": 0.24677296886864086, "grad_norm": 0.15535728633403778, "learning_rate": 1.978184091573609e-05, "loss": 0.5583, "step": 975 }, { "epoch": 0.24702606934953175, "grad_norm": 0.15031661093235016, "learning_rate": 1.9781343238607708e-05, "loss": 0.5608, "step": 976 }, { "epoch": 0.24727916983042267, "grad_norm": 0.15604668855667114, "learning_rate": 1.978084500073567e-05, "loss": 0.5537, "step": 977 }, { "epoch": 0.24753227031131358, "grad_norm": 0.14635708928108215, "learning_rate": 1.9780346202148533e-05, "loss": 0.5273, "step": 978 }, { "epoch": 0.2477853707922045, "grad_norm": 0.1487453132867813, "learning_rate": 1.9779846842874895e-05, "loss": 0.5561, "step": 979 }, { "epoch": 0.24803847127309542, "grad_norm": 0.15110675990581512, "learning_rate": 1.9779346922943384e-05, "loss": 0.5223, "step": 980 }, { "epoch": 0.24829157175398633, "grad_norm": 0.15664315223693848, "learning_rate": 1.9778846442382656e-05, "loss": 0.5848, "step": 981 }, { "epoch": 0.24854467223487725, "grad_norm": 0.15137054026126862, "learning_rate": 1.9778345401221407e-05, "loss": 0.5318, "step": 982 }, { "epoch": 0.24879777271576817, "grad_norm": 0.14949896931648254, "learning_rate": 1.9777843799488354e-05, "loss": 0.5627, "step": 983 }, { "epoch": 0.24905087319665908, "grad_norm": 0.15440170466899872, "learning_rate": 1.9777341637212264e-05, "loss": 0.5306, "step": 984 }, { "epoch": 0.24930397367755, "grad_norm": 0.14904992282390594, "learning_rate": 1.9776838914421913e-05, "loss": 0.5559, "step": 985 }, { "epoch": 0.2495570741584409, "grad_norm": 0.14369764924049377, "learning_rate": 1.9776335631146128e-05, "loss": 0.5448, "step": 986 }, { "epoch": 0.2498101746393318, "grad_norm": 0.15963445603847504, "learning_rate": 1.9775831787413757e-05, "loss": 0.5561, "step": 987 }, { "epoch": 0.25006327512022275, "grad_norm": 0.15341433882713318, "learning_rate": 1.977532738325369e-05, "loss": 0.5736, "step": 988 }, { "epoch": 0.25031637560111364, "grad_norm": 0.14597854018211365, "learning_rate": 1.977482241869484e-05, "loss": 0.5471, "step": 989 }, { "epoch": 0.2505694760820046, "grad_norm": 0.15139272809028625, "learning_rate": 1.9774316893766152e-05, "loss": 0.5557, "step": 990 }, { "epoch": 0.25082257656289547, "grad_norm": 0.14927662909030914, "learning_rate": 1.9773810808496612e-05, "loss": 0.5761, "step": 991 }, { "epoch": 0.25107567704378636, "grad_norm": 0.14922501146793365, "learning_rate": 1.977330416291523e-05, "loss": 0.5638, "step": 992 }, { "epoch": 0.2513287775246773, "grad_norm": 0.16757948696613312, "learning_rate": 1.9772796957051055e-05, "loss": 0.555, "step": 993 }, { "epoch": 0.2515818780055682, "grad_norm": 0.15029583871364594, "learning_rate": 1.977228919093316e-05, "loss": 0.545, "step": 994 }, { "epoch": 0.25183497848645914, "grad_norm": 0.14153912663459778, "learning_rate": 1.9771780864590654e-05, "loss": 0.5454, "step": 995 }, { "epoch": 0.25208807896735, "grad_norm": 0.147403284907341, "learning_rate": 1.9771271978052677e-05, "loss": 0.6348, "step": 996 }, { "epoch": 0.25234117944824097, "grad_norm": 0.15162605047225952, "learning_rate": 1.977076253134841e-05, "loss": 0.5687, "step": 997 }, { "epoch": 0.25259427992913186, "grad_norm": 0.1470058113336563, "learning_rate": 1.977025252450705e-05, "loss": 0.5339, "step": 998 }, { "epoch": 0.2528473804100228, "grad_norm": 0.15223732590675354, "learning_rate": 1.976974195755784e-05, "loss": 0.5607, "step": 999 }, { "epoch": 0.2531004808909137, "grad_norm": 0.14976049959659576, "learning_rate": 1.9769230830530044e-05, "loss": 0.5582, "step": 1000 }, { "epoch": 0.2533535813718046, "grad_norm": 0.14229172468185425, "learning_rate": 1.976871914345297e-05, "loss": 0.5762, "step": 1001 }, { "epoch": 0.2536066818526955, "grad_norm": 0.147952601313591, "learning_rate": 1.9768206896355945e-05, "loss": 0.5771, "step": 1002 }, { "epoch": 0.2538597823335864, "grad_norm": 0.14495033025741577, "learning_rate": 1.9767694089268346e-05, "loss": 0.5754, "step": 1003 }, { "epoch": 0.25411288281447736, "grad_norm": 0.14356327056884766, "learning_rate": 1.976718072221956e-05, "loss": 0.5333, "step": 1004 }, { "epoch": 0.25436598329536825, "grad_norm": 0.1483432650566101, "learning_rate": 1.9766666795239026e-05, "loss": 0.5369, "step": 1005 }, { "epoch": 0.2546190837762592, "grad_norm": 0.14932604134082794, "learning_rate": 1.9766152308356198e-05, "loss": 0.5582, "step": 1006 }, { "epoch": 0.2548721842571501, "grad_norm": 0.14025354385375977, "learning_rate": 1.9765637261600577e-05, "loss": 0.5303, "step": 1007 }, { "epoch": 0.255125284738041, "grad_norm": 0.14725551009178162, "learning_rate": 1.9765121655001683e-05, "loss": 0.5725, "step": 1008 }, { "epoch": 0.2553783852189319, "grad_norm": 0.15981052815914154, "learning_rate": 1.9764605488589083e-05, "loss": 0.5395, "step": 1009 }, { "epoch": 0.2556314856998228, "grad_norm": 0.14977654814720154, "learning_rate": 1.976408876239236e-05, "loss": 0.5677, "step": 1010 }, { "epoch": 0.25588458618071375, "grad_norm": 0.14924289286136627, "learning_rate": 1.9763571476441144e-05, "loss": 0.5729, "step": 1011 }, { "epoch": 0.25613768666160464, "grad_norm": 0.1671869158744812, "learning_rate": 1.9763053630765085e-05, "loss": 0.5458, "step": 1012 }, { "epoch": 0.2563907871424956, "grad_norm": 0.22122712433338165, "learning_rate": 1.976253522539387e-05, "loss": 0.5681, "step": 1013 }, { "epoch": 0.25664388762338647, "grad_norm": 0.18188539147377014, "learning_rate": 1.9762016260357222e-05, "loss": 0.5387, "step": 1014 }, { "epoch": 0.2568969881042774, "grad_norm": 0.149360790848732, "learning_rate": 1.9761496735684886e-05, "loss": 0.5721, "step": 1015 }, { "epoch": 0.2571500885851683, "grad_norm": 0.15543420612812042, "learning_rate": 1.976097665140665e-05, "loss": 0.5607, "step": 1016 }, { "epoch": 0.25740318906605925, "grad_norm": 0.1418234258890152, "learning_rate": 1.976045600755233e-05, "loss": 0.5512, "step": 1017 }, { "epoch": 0.25765628954695013, "grad_norm": 0.15278293192386627, "learning_rate": 1.975993480415177e-05, "loss": 0.5859, "step": 1018 }, { "epoch": 0.2579093900278411, "grad_norm": 0.14391061663627625, "learning_rate": 1.975941304123485e-05, "loss": 0.5694, "step": 1019 }, { "epoch": 0.25816249050873197, "grad_norm": 0.14631696045398712, "learning_rate": 1.975889071883148e-05, "loss": 0.5629, "step": 1020 }, { "epoch": 0.25841559098962286, "grad_norm": 0.1435028314590454, "learning_rate": 1.975836783697161e-05, "loss": 0.5582, "step": 1021 }, { "epoch": 0.2586686914705138, "grad_norm": 0.15036992728710175, "learning_rate": 1.975784439568521e-05, "loss": 0.5799, "step": 1022 }, { "epoch": 0.2589217919514047, "grad_norm": 0.17731989920139313, "learning_rate": 1.9757320395002288e-05, "loss": 0.5387, "step": 1023 }, { "epoch": 0.25917489243229563, "grad_norm": 0.13910239934921265, "learning_rate": 1.9756795834952892e-05, "loss": 0.5487, "step": 1024 }, { "epoch": 0.2594279929131865, "grad_norm": 0.1503012776374817, "learning_rate": 1.975627071556708e-05, "loss": 0.5585, "step": 1025 }, { "epoch": 0.25968109339407747, "grad_norm": 0.14616143703460693, "learning_rate": 1.9755745036874967e-05, "loss": 0.5525, "step": 1026 }, { "epoch": 0.25993419387496836, "grad_norm": 0.14453400671482086, "learning_rate": 1.9755218798906683e-05, "loss": 0.574, "step": 1027 }, { "epoch": 0.2601872943558593, "grad_norm": 0.14514601230621338, "learning_rate": 1.97546920016924e-05, "loss": 0.5384, "step": 1028 }, { "epoch": 0.2604403948367502, "grad_norm": 0.14687170088291168, "learning_rate": 1.9754164645262316e-05, "loss": 0.5624, "step": 1029 }, { "epoch": 0.2606934953176411, "grad_norm": 0.1529356688261032, "learning_rate": 1.9753636729646665e-05, "loss": 0.6067, "step": 1030 }, { "epoch": 0.260946595798532, "grad_norm": 0.16056649386882782, "learning_rate": 1.975310825487571e-05, "loss": 0.5768, "step": 1031 }, { "epoch": 0.2611996962794229, "grad_norm": 0.1485505998134613, "learning_rate": 1.9752579220979746e-05, "loss": 0.5675, "step": 1032 }, { "epoch": 0.26145279676031385, "grad_norm": 0.15249554812908173, "learning_rate": 1.9752049627989106e-05, "loss": 0.5823, "step": 1033 }, { "epoch": 0.26170589724120474, "grad_norm": 0.15143629908561707, "learning_rate": 1.9751519475934143e-05, "loss": 0.6127, "step": 1034 }, { "epoch": 0.2619589977220957, "grad_norm": 0.16558900475502014, "learning_rate": 1.9750988764845257e-05, "loss": 0.5608, "step": 1035 }, { "epoch": 0.2622120982029866, "grad_norm": 0.15004250407218933, "learning_rate": 1.975045749475287e-05, "loss": 0.5455, "step": 1036 }, { "epoch": 0.2624651986838775, "grad_norm": 0.144332155585289, "learning_rate": 1.9749925665687436e-05, "loss": 0.5497, "step": 1037 }, { "epoch": 0.2627182991647684, "grad_norm": 0.14648112654685974, "learning_rate": 1.9749393277679445e-05, "loss": 0.5332, "step": 1038 }, { "epoch": 0.26297139964565935, "grad_norm": 0.1481088399887085, "learning_rate": 1.9748860330759417e-05, "loss": 0.5823, "step": 1039 }, { "epoch": 0.26322450012655024, "grad_norm": 0.1628430187702179, "learning_rate": 1.974832682495791e-05, "loss": 0.5508, "step": 1040 }, { "epoch": 0.26347760060744113, "grad_norm": 0.14736312627792358, "learning_rate": 1.9747792760305504e-05, "loss": 0.5527, "step": 1041 }, { "epoch": 0.2637307010883321, "grad_norm": 0.13812898099422455, "learning_rate": 1.974725813683281e-05, "loss": 0.5156, "step": 1042 }, { "epoch": 0.26398380156922296, "grad_norm": 0.16094870865345, "learning_rate": 1.974672295457049e-05, "loss": 0.5695, "step": 1043 }, { "epoch": 0.2642369020501139, "grad_norm": 0.1549401581287384, "learning_rate": 1.974618721354922e-05, "loss": 0.5764, "step": 1044 }, { "epoch": 0.2644900025310048, "grad_norm": 0.1427125632762909, "learning_rate": 1.9745650913799706e-05, "loss": 0.5509, "step": 1045 }, { "epoch": 0.26474310301189574, "grad_norm": 0.1518612951040268, "learning_rate": 1.97451140553527e-05, "loss": 0.5554, "step": 1046 }, { "epoch": 0.26499620349278663, "grad_norm": 0.149056077003479, "learning_rate": 1.9744576638238975e-05, "loss": 0.5316, "step": 1047 }, { "epoch": 0.2652493039736776, "grad_norm": 0.1529719978570938, "learning_rate": 1.9744038662489344e-05, "loss": 0.5777, "step": 1048 }, { "epoch": 0.26550240445456846, "grad_norm": 0.15761038661003113, "learning_rate": 1.9743500128134646e-05, "loss": 0.559, "step": 1049 }, { "epoch": 0.26575550493545935, "grad_norm": 0.16031967103481293, "learning_rate": 1.9742961035205753e-05, "loss": 0.5575, "step": 1050 }, { "epoch": 0.2660086054163503, "grad_norm": 0.1591987907886505, "learning_rate": 1.9742421383733572e-05, "loss": 0.5489, "step": 1051 }, { "epoch": 0.2662617058972412, "grad_norm": 0.14696821570396423, "learning_rate": 1.974188117374904e-05, "loss": 0.5625, "step": 1052 }, { "epoch": 0.26651480637813213, "grad_norm": 0.1537933647632599, "learning_rate": 1.9741340405283123e-05, "loss": 0.5595, "step": 1053 }, { "epoch": 0.266767906859023, "grad_norm": 0.14726348221302032, "learning_rate": 1.9740799078366827e-05, "loss": 0.5377, "step": 1054 }, { "epoch": 0.26702100733991396, "grad_norm": 0.14925776422023773, "learning_rate": 1.974025719303118e-05, "loss": 0.5792, "step": 1055 }, { "epoch": 0.26727410782080485, "grad_norm": 0.14656715095043182, "learning_rate": 1.9739714749307248e-05, "loss": 0.5509, "step": 1056 }, { "epoch": 0.2675272083016958, "grad_norm": 0.14511851966381073, "learning_rate": 1.973917174722613e-05, "loss": 0.541, "step": 1057 }, { "epoch": 0.2677803087825867, "grad_norm": 0.15105004608631134, "learning_rate": 1.973862818681896e-05, "loss": 0.5634, "step": 1058 }, { "epoch": 0.2680334092634776, "grad_norm": 0.15590152144432068, "learning_rate": 1.9738084068116888e-05, "loss": 0.5651, "step": 1059 }, { "epoch": 0.2682865097443685, "grad_norm": 0.14832177758216858, "learning_rate": 1.973753939115112e-05, "loss": 0.564, "step": 1060 }, { "epoch": 0.2685396102252594, "grad_norm": 0.1544780135154724, "learning_rate": 1.9736994155952868e-05, "loss": 0.5605, "step": 1061 }, { "epoch": 0.26879271070615035, "grad_norm": 0.1504780650138855, "learning_rate": 1.97364483625534e-05, "loss": 0.5283, "step": 1062 }, { "epoch": 0.26904581118704124, "grad_norm": 0.14311392605304718, "learning_rate": 1.9735902010983995e-05, "loss": 0.5329, "step": 1063 }, { "epoch": 0.2692989116679322, "grad_norm": 0.14662767946720123, "learning_rate": 1.9735355101275987e-05, "loss": 0.5584, "step": 1064 }, { "epoch": 0.2695520121488231, "grad_norm": 0.17058195173740387, "learning_rate": 1.9734807633460717e-05, "loss": 0.5709, "step": 1065 }, { "epoch": 0.269805112629714, "grad_norm": 0.14818315207958221, "learning_rate": 1.973425960756958e-05, "loss": 0.5637, "step": 1066 }, { "epoch": 0.2700582131106049, "grad_norm": 0.15794546902179718, "learning_rate": 1.9733711023633985e-05, "loss": 0.5734, "step": 1067 }, { "epoch": 0.27031131359149585, "grad_norm": 0.1491701602935791, "learning_rate": 1.9733161881685383e-05, "loss": 0.5565, "step": 1068 }, { "epoch": 0.27056441407238674, "grad_norm": 0.15116646885871887, "learning_rate": 1.973261218175526e-05, "loss": 0.5499, "step": 1069 }, { "epoch": 0.2708175145532776, "grad_norm": 0.1520608812570572, "learning_rate": 1.9732061923875126e-05, "loss": 0.5686, "step": 1070 }, { "epoch": 0.27107061503416857, "grad_norm": 0.1513378620147705, "learning_rate": 1.973151110807652e-05, "loss": 0.566, "step": 1071 }, { "epoch": 0.27132371551505946, "grad_norm": 0.15130463242530823, "learning_rate": 1.9730959734391032e-05, "loss": 0.572, "step": 1072 }, { "epoch": 0.2715768159959504, "grad_norm": 0.14602863788604736, "learning_rate": 1.973040780285026e-05, "loss": 0.5447, "step": 1073 }, { "epoch": 0.2718299164768413, "grad_norm": 0.1462998390197754, "learning_rate": 1.9729855313485853e-05, "loss": 0.5335, "step": 1074 }, { "epoch": 0.27208301695773224, "grad_norm": 0.14722299575805664, "learning_rate": 1.972930226632948e-05, "loss": 0.5566, "step": 1075 }, { "epoch": 0.2723361174386231, "grad_norm": 0.14984026551246643, "learning_rate": 1.972874866141284e-05, "loss": 0.5535, "step": 1076 }, { "epoch": 0.27258921791951407, "grad_norm": 0.1488526165485382, "learning_rate": 1.9728194498767682e-05, "loss": 0.5658, "step": 1077 }, { "epoch": 0.27284231840040496, "grad_norm": 0.15000055730342865, "learning_rate": 1.972763977842577e-05, "loss": 0.5566, "step": 1078 }, { "epoch": 0.27309541888129585, "grad_norm": 0.14696164429187775, "learning_rate": 1.9727084500418902e-05, "loss": 0.5505, "step": 1079 }, { "epoch": 0.2733485193621868, "grad_norm": 0.14558060467243195, "learning_rate": 1.9726528664778916e-05, "loss": 0.5552, "step": 1080 }, { "epoch": 0.2736016198430777, "grad_norm": 0.14551407098770142, "learning_rate": 1.972597227153767e-05, "loss": 0.5329, "step": 1081 }, { "epoch": 0.2738547203239686, "grad_norm": 0.14947450160980225, "learning_rate": 1.9725415320727067e-05, "loss": 0.5746, "step": 1082 }, { "epoch": 0.2741078208048595, "grad_norm": 0.14890162646770477, "learning_rate": 1.972485781237903e-05, "loss": 0.5518, "step": 1083 }, { "epoch": 0.27436092128575046, "grad_norm": 0.15217174589633942, "learning_rate": 1.972429974652553e-05, "loss": 0.5623, "step": 1084 }, { "epoch": 0.27461402176664135, "grad_norm": 0.1404881477355957, "learning_rate": 1.9723741123198548e-05, "loss": 0.5472, "step": 1085 }, { "epoch": 0.2748671222475323, "grad_norm": 0.14228922128677368, "learning_rate": 1.9723181942430117e-05, "loss": 0.5433, "step": 1086 }, { "epoch": 0.2751202227284232, "grad_norm": 0.1508197784423828, "learning_rate": 1.9722622204252285e-05, "loss": 0.5471, "step": 1087 }, { "epoch": 0.2753733232093141, "grad_norm": 0.1493370085954666, "learning_rate": 1.972206190869715e-05, "loss": 0.5522, "step": 1088 }, { "epoch": 0.275626423690205, "grad_norm": 0.1405772864818573, "learning_rate": 1.972150105579683e-05, "loss": 0.5476, "step": 1089 }, { "epoch": 0.2758795241710959, "grad_norm": 0.1434200257062912, "learning_rate": 1.9720939645583477e-05, "loss": 0.5685, "step": 1090 }, { "epoch": 0.27613262465198685, "grad_norm": 0.15157896280288696, "learning_rate": 1.972037767808927e-05, "loss": 0.5849, "step": 1091 }, { "epoch": 0.27638572513287774, "grad_norm": 0.15583936870098114, "learning_rate": 1.971981515334643e-05, "loss": 0.5486, "step": 1092 }, { "epoch": 0.2766388256137687, "grad_norm": 0.1595613956451416, "learning_rate": 1.9719252071387213e-05, "loss": 0.5666, "step": 1093 }, { "epoch": 0.27689192609465957, "grad_norm": 0.14364835619926453, "learning_rate": 1.9718688432243883e-05, "loss": 0.5715, "step": 1094 }, { "epoch": 0.2771450265755505, "grad_norm": 0.14432287216186523, "learning_rate": 1.9718124235948765e-05, "loss": 0.5561, "step": 1095 }, { "epoch": 0.2773981270564414, "grad_norm": 0.1456945687532425, "learning_rate": 1.97175594825342e-05, "loss": 0.5504, "step": 1096 }, { "epoch": 0.27765122753733235, "grad_norm": 0.14358799159526825, "learning_rate": 1.971699417203256e-05, "loss": 0.5498, "step": 1097 }, { "epoch": 0.27790432801822323, "grad_norm": 0.15574151277542114, "learning_rate": 1.9716428304476255e-05, "loss": 0.5538, "step": 1098 }, { "epoch": 0.2781574284991141, "grad_norm": 0.14136652648448944, "learning_rate": 1.971586187989773e-05, "loss": 0.5256, "step": 1099 }, { "epoch": 0.27841052898000507, "grad_norm": 0.14398375153541565, "learning_rate": 1.9715294898329453e-05, "loss": 0.5811, "step": 1100 }, { "epoch": 0.27866362946089596, "grad_norm": 0.15071871876716614, "learning_rate": 1.9714727359803926e-05, "loss": 0.5514, "step": 1101 }, { "epoch": 0.2789167299417869, "grad_norm": 0.14579877257347107, "learning_rate": 1.971415926435369e-05, "loss": 0.5469, "step": 1102 }, { "epoch": 0.2791698304226778, "grad_norm": 0.14999797940254211, "learning_rate": 1.9713590612011306e-05, "loss": 0.5769, "step": 1103 }, { "epoch": 0.27942293090356873, "grad_norm": 0.14470462501049042, "learning_rate": 1.9713021402809378e-05, "loss": 0.5819, "step": 1104 }, { "epoch": 0.2796760313844596, "grad_norm": 0.1511867791414261, "learning_rate": 1.9712451636780536e-05, "loss": 0.5677, "step": 1105 }, { "epoch": 0.27992913186535057, "grad_norm": 0.1428070068359375, "learning_rate": 1.9711881313957442e-05, "loss": 0.5517, "step": 1106 }, { "epoch": 0.28018223234624146, "grad_norm": 0.1462501436471939, "learning_rate": 1.97113104343728e-05, "loss": 0.5535, "step": 1107 }, { "epoch": 0.28043533282713234, "grad_norm": 0.14507222175598145, "learning_rate": 1.9710738998059326e-05, "loss": 0.5755, "step": 1108 }, { "epoch": 0.2806884333080233, "grad_norm": 0.15307988226413727, "learning_rate": 1.9710167005049786e-05, "loss": 0.5374, "step": 1109 }, { "epoch": 0.2809415337889142, "grad_norm": 0.14656472206115723, "learning_rate": 1.970959445537697e-05, "loss": 0.5394, "step": 1110 }, { "epoch": 0.2811946342698051, "grad_norm": 0.3027561604976654, "learning_rate": 1.9709021349073697e-05, "loss": 0.5404, "step": 1111 }, { "epoch": 0.281447734750696, "grad_norm": 0.14776207506656647, "learning_rate": 1.970844768617283e-05, "loss": 0.5317, "step": 1112 }, { "epoch": 0.28170083523158695, "grad_norm": 0.14800283312797546, "learning_rate": 1.9707873466707247e-05, "loss": 0.5876, "step": 1113 }, { "epoch": 0.28195393571247784, "grad_norm": 0.14399170875549316, "learning_rate": 1.9707298690709874e-05, "loss": 0.551, "step": 1114 }, { "epoch": 0.2822070361933688, "grad_norm": 0.1502523124217987, "learning_rate": 1.9706723358213654e-05, "loss": 0.5611, "step": 1115 }, { "epoch": 0.2824601366742597, "grad_norm": 0.15089723467826843, "learning_rate": 1.9706147469251577e-05, "loss": 0.5505, "step": 1116 }, { "epoch": 0.2827132371551506, "grad_norm": 0.13856129348278046, "learning_rate": 1.9705571023856656e-05, "loss": 0.5112, "step": 1117 }, { "epoch": 0.2829663376360415, "grad_norm": 0.14749446511268616, "learning_rate": 1.9704994022061938e-05, "loss": 0.5543, "step": 1118 }, { "epoch": 0.2832194381169324, "grad_norm": 0.1500622034072876, "learning_rate": 1.970441646390049e-05, "loss": 0.5463, "step": 1119 }, { "epoch": 0.28347253859782334, "grad_norm": 0.14998632669448853, "learning_rate": 1.970383834940544e-05, "loss": 0.5653, "step": 1120 }, { "epoch": 0.28372563907871423, "grad_norm": 0.14342360198497772, "learning_rate": 1.970325967860992e-05, "loss": 0.5488, "step": 1121 }, { "epoch": 0.2839787395596052, "grad_norm": 0.16454951465129852, "learning_rate": 1.9702680451547103e-05, "loss": 0.562, "step": 1122 }, { "epoch": 0.28423184004049606, "grad_norm": 0.14895156025886536, "learning_rate": 1.97021006682502e-05, "loss": 0.5917, "step": 1123 }, { "epoch": 0.284484940521387, "grad_norm": 0.15073780715465546, "learning_rate": 1.9701520328752446e-05, "loss": 0.5855, "step": 1124 }, { "epoch": 0.2847380410022779, "grad_norm": 0.15511606633663177, "learning_rate": 1.9700939433087112e-05, "loss": 0.5347, "step": 1125 }, { "epoch": 0.28499114148316884, "grad_norm": 0.14976346492767334, "learning_rate": 1.9700357981287498e-05, "loss": 0.5744, "step": 1126 }, { "epoch": 0.28524424196405973, "grad_norm": 0.1468544751405716, "learning_rate": 1.9699775973386935e-05, "loss": 0.5385, "step": 1127 }, { "epoch": 0.2854973424449506, "grad_norm": 0.15148510038852692, "learning_rate": 1.9699193409418793e-05, "loss": 0.5484, "step": 1128 }, { "epoch": 0.28575044292584156, "grad_norm": 0.1473529189825058, "learning_rate": 1.9698610289416466e-05, "loss": 0.5308, "step": 1129 }, { "epoch": 0.28600354340673245, "grad_norm": 0.1504306197166443, "learning_rate": 1.9698026613413384e-05, "loss": 0.5399, "step": 1130 }, { "epoch": 0.2862566438876234, "grad_norm": 0.15331873297691345, "learning_rate": 1.969744238144301e-05, "loss": 0.5695, "step": 1131 }, { "epoch": 0.2865097443685143, "grad_norm": 0.17417678236961365, "learning_rate": 1.9696857593538836e-05, "loss": 0.5674, "step": 1132 }, { "epoch": 0.28676284484940523, "grad_norm": 0.1782664656639099, "learning_rate": 1.9696272249734383e-05, "loss": 0.5739, "step": 1133 }, { "epoch": 0.2870159453302961, "grad_norm": 0.20433051884174347, "learning_rate": 1.969568635006321e-05, "loss": 0.5494, "step": 1134 }, { "epoch": 0.28726904581118706, "grad_norm": 0.16913430392742157, "learning_rate": 1.9695099894558907e-05, "loss": 0.5318, "step": 1135 }, { "epoch": 0.28752214629207795, "grad_norm": 0.14963854849338531, "learning_rate": 1.9694512883255094e-05, "loss": 0.5509, "step": 1136 }, { "epoch": 0.2877752467729689, "grad_norm": 0.1432947814464569, "learning_rate": 1.969392531618542e-05, "loss": 0.5402, "step": 1137 }, { "epoch": 0.2880283472538598, "grad_norm": 0.14945542812347412, "learning_rate": 1.969333719338357e-05, "loss": 0.5601, "step": 1138 }, { "epoch": 0.2882814477347507, "grad_norm": 0.1545867770910263, "learning_rate": 1.9692748514883258e-05, "loss": 0.5447, "step": 1139 }, { "epoch": 0.2885345482156416, "grad_norm": 0.14961642026901245, "learning_rate": 1.9692159280718237e-05, "loss": 0.5686, "step": 1140 }, { "epoch": 0.2887876486965325, "grad_norm": 0.15187720954418182, "learning_rate": 1.9691569490922283e-05, "loss": 0.5876, "step": 1141 }, { "epoch": 0.28904074917742345, "grad_norm": 0.14029815793037415, "learning_rate": 1.9690979145529213e-05, "loss": 0.5591, "step": 1142 }, { "epoch": 0.28929384965831434, "grad_norm": 0.14870896935462952, "learning_rate": 1.969038824457286e-05, "loss": 0.5839, "step": 1143 }, { "epoch": 0.2895469501392053, "grad_norm": 0.146173894405365, "learning_rate": 1.9689796788087106e-05, "loss": 0.5531, "step": 1144 }, { "epoch": 0.2898000506200962, "grad_norm": 0.14388002455234528, "learning_rate": 1.968920477610586e-05, "loss": 0.5314, "step": 1145 }, { "epoch": 0.2900531511009871, "grad_norm": 0.14754292368888855, "learning_rate": 1.9688612208663052e-05, "loss": 0.5731, "step": 1146 }, { "epoch": 0.290306251581878, "grad_norm": 0.14382413029670715, "learning_rate": 1.968801908579266e-05, "loss": 0.541, "step": 1147 }, { "epoch": 0.2905593520627689, "grad_norm": 0.1487230360507965, "learning_rate": 1.9687425407528685e-05, "loss": 0.5698, "step": 1148 }, { "epoch": 0.29081245254365984, "grad_norm": 0.15192517638206482, "learning_rate": 1.9686831173905162e-05, "loss": 0.5622, "step": 1149 }, { "epoch": 0.2910655530245507, "grad_norm": 0.14377538859844208, "learning_rate": 1.9686236384956156e-05, "loss": 0.5611, "step": 1150 }, { "epoch": 0.29131865350544167, "grad_norm": 0.15064801275730133, "learning_rate": 1.9685641040715765e-05, "loss": 0.5642, "step": 1151 }, { "epoch": 0.29157175398633256, "grad_norm": 0.14578548073768616, "learning_rate": 1.9685045141218114e-05, "loss": 0.5795, "step": 1152 }, { "epoch": 0.2918248544672235, "grad_norm": 0.20898699760437012, "learning_rate": 1.9684448686497377e-05, "loss": 0.5623, "step": 1153 }, { "epoch": 0.2920779549481144, "grad_norm": 0.14994247257709503, "learning_rate": 1.9683851676587732e-05, "loss": 0.5562, "step": 1154 }, { "epoch": 0.29233105542900534, "grad_norm": 0.146097332239151, "learning_rate": 1.9683254111523417e-05, "loss": 0.5474, "step": 1155 }, { "epoch": 0.2925841559098962, "grad_norm": 0.1490073949098587, "learning_rate": 1.968265599133868e-05, "loss": 0.5753, "step": 1156 }, { "epoch": 0.2928372563907871, "grad_norm": 0.14542317390441895, "learning_rate": 1.968205731606782e-05, "loss": 0.5386, "step": 1157 }, { "epoch": 0.29309035687167806, "grad_norm": 0.15466979146003723, "learning_rate": 1.9681458085745148e-05, "loss": 0.5305, "step": 1158 }, { "epoch": 0.29334345735256895, "grad_norm": 0.13925255835056305, "learning_rate": 1.9680858300405027e-05, "loss": 0.5257, "step": 1159 }, { "epoch": 0.2935965578334599, "grad_norm": 0.14682196080684662, "learning_rate": 1.9680257960081828e-05, "loss": 0.5431, "step": 1160 }, { "epoch": 0.2938496583143508, "grad_norm": 0.15029916167259216, "learning_rate": 1.9679657064809977e-05, "loss": 0.5535, "step": 1161 }, { "epoch": 0.2941027587952417, "grad_norm": 0.14853662252426147, "learning_rate": 1.9679055614623918e-05, "loss": 0.5311, "step": 1162 }, { "epoch": 0.2943558592761326, "grad_norm": 0.1501016914844513, "learning_rate": 1.9678453609558136e-05, "loss": 0.5555, "step": 1163 }, { "epoch": 0.29460895975702356, "grad_norm": 0.15274439752101898, "learning_rate": 1.967785104964713e-05, "loss": 0.535, "step": 1164 }, { "epoch": 0.29486206023791445, "grad_norm": 0.18682987987995148, "learning_rate": 1.967724793492546e-05, "loss": 0.5742, "step": 1165 }, { "epoch": 0.2951151607188054, "grad_norm": 0.14630046486854553, "learning_rate": 1.9676644265427692e-05, "loss": 0.5471, "step": 1166 }, { "epoch": 0.2953682611996963, "grad_norm": 1.4593929052352905, "learning_rate": 1.967604004118844e-05, "loss": 0.5525, "step": 1167 }, { "epoch": 0.29562136168058717, "grad_norm": 0.1469281017780304, "learning_rate": 1.967543526224233e-05, "loss": 0.5748, "step": 1168 }, { "epoch": 0.2958744621614781, "grad_norm": 0.1593310683965683, "learning_rate": 1.9674829928624042e-05, "loss": 0.5467, "step": 1169 }, { "epoch": 0.296127562642369, "grad_norm": 0.15252695977687836, "learning_rate": 1.9674224040368277e-05, "loss": 0.5604, "step": 1170 }, { "epoch": 0.29638066312325995, "grad_norm": 0.14795660972595215, "learning_rate": 1.9673617597509774e-05, "loss": 0.5525, "step": 1171 }, { "epoch": 0.29663376360415084, "grad_norm": 0.14674055576324463, "learning_rate": 1.9673010600083287e-05, "loss": 0.5807, "step": 1172 }, { "epoch": 0.2968868640850418, "grad_norm": 0.1470927894115448, "learning_rate": 1.9672403048123624e-05, "loss": 0.5216, "step": 1173 }, { "epoch": 0.29713996456593267, "grad_norm": 0.15062348544597626, "learning_rate": 1.9671794941665613e-05, "loss": 0.5343, "step": 1174 }, { "epoch": 0.2973930650468236, "grad_norm": 0.16333657503128052, "learning_rate": 1.9671186280744114e-05, "loss": 0.5568, "step": 1175 }, { "epoch": 0.2976461655277145, "grad_norm": 0.1702549308538437, "learning_rate": 1.9670577065394018e-05, "loss": 0.549, "step": 1176 }, { "epoch": 0.2978992660086054, "grad_norm": 0.14794114232063293, "learning_rate": 1.9669967295650256e-05, "loss": 0.563, "step": 1177 }, { "epoch": 0.29815236648949633, "grad_norm": 0.14850711822509766, "learning_rate": 1.9669356971547778e-05, "loss": 0.5508, "step": 1178 }, { "epoch": 0.2984054669703872, "grad_norm": 0.1458379328250885, "learning_rate": 1.966874609312158e-05, "loss": 0.5793, "step": 1179 }, { "epoch": 0.29865856745127817, "grad_norm": 0.18383198976516724, "learning_rate": 1.9668134660406675e-05, "loss": 0.5607, "step": 1180 }, { "epoch": 0.29891166793216906, "grad_norm": 0.14815421402454376, "learning_rate": 1.966752267343812e-05, "loss": 0.5542, "step": 1181 }, { "epoch": 0.29916476841306, "grad_norm": 0.150907501578331, "learning_rate": 1.9666910132250995e-05, "loss": 0.5533, "step": 1182 }, { "epoch": 0.2994178688939509, "grad_norm": 0.1607259064912796, "learning_rate": 1.966629703688042e-05, "loss": 0.5646, "step": 1183 }, { "epoch": 0.29967096937484183, "grad_norm": 0.1456514447927475, "learning_rate": 1.966568338736154e-05, "loss": 0.5466, "step": 1184 }, { "epoch": 0.2999240698557327, "grad_norm": 0.14388611912727356, "learning_rate": 1.966506918372953e-05, "loss": 0.5208, "step": 1185 }, { "epoch": 0.30017717033662367, "grad_norm": 0.14943061769008636, "learning_rate": 1.9664454426019614e-05, "loss": 0.5642, "step": 1186 }, { "epoch": 0.30043027081751456, "grad_norm": 0.16205556690692902, "learning_rate": 1.966383911426702e-05, "loss": 0.5611, "step": 1187 }, { "epoch": 0.30068337129840544, "grad_norm": 0.15247951447963715, "learning_rate": 1.9663223248507034e-05, "loss": 0.5699, "step": 1188 }, { "epoch": 0.3009364717792964, "grad_norm": 0.1501820832490921, "learning_rate": 1.9662606828774956e-05, "loss": 0.574, "step": 1189 }, { "epoch": 0.3011895722601873, "grad_norm": 0.15078707039356232, "learning_rate": 1.9661989855106122e-05, "loss": 0.5715, "step": 1190 }, { "epoch": 0.3014426727410782, "grad_norm": 0.15016232430934906, "learning_rate": 1.966137232753591e-05, "loss": 0.5592, "step": 1191 }, { "epoch": 0.3016957732219691, "grad_norm": 0.1505821794271469, "learning_rate": 1.9660754246099715e-05, "loss": 0.5865, "step": 1192 }, { "epoch": 0.30194887370286005, "grad_norm": 0.2461215853691101, "learning_rate": 1.9660135610832966e-05, "loss": 0.5559, "step": 1193 }, { "epoch": 0.30220197418375094, "grad_norm": 0.15415988862514496, "learning_rate": 1.965951642177114e-05, "loss": 0.5582, "step": 1194 }, { "epoch": 0.3024550746646419, "grad_norm": 0.14573028683662415, "learning_rate": 1.9658896678949726e-05, "loss": 0.5366, "step": 1195 }, { "epoch": 0.3027081751455328, "grad_norm": 0.14445725083351135, "learning_rate": 1.9658276382404255e-05, "loss": 0.5591, "step": 1196 }, { "epoch": 0.30296127562642367, "grad_norm": 0.15039972960948944, "learning_rate": 1.9657655532170286e-05, "loss": 0.5732, "step": 1197 }, { "epoch": 0.3032143761073146, "grad_norm": 0.14960485696792603, "learning_rate": 1.965703412828341e-05, "loss": 0.5558, "step": 1198 }, { "epoch": 0.3034674765882055, "grad_norm": 0.14577481150627136, "learning_rate": 1.9656412170779254e-05, "loss": 0.5279, "step": 1199 }, { "epoch": 0.30372057706909644, "grad_norm": 0.14354942739009857, "learning_rate": 1.9655789659693473e-05, "loss": 0.5384, "step": 1200 }, { "epoch": 0.30397367754998733, "grad_norm": 0.1729145646095276, "learning_rate": 1.965516659506175e-05, "loss": 0.5649, "step": 1201 }, { "epoch": 0.3042267780308783, "grad_norm": 0.14442485570907593, "learning_rate": 1.965454297691981e-05, "loss": 0.5646, "step": 1202 }, { "epoch": 0.30447987851176916, "grad_norm": 0.14704465866088867, "learning_rate": 1.96539188053034e-05, "loss": 0.5459, "step": 1203 }, { "epoch": 0.3047329789926601, "grad_norm": 0.1534062772989273, "learning_rate": 1.9653294080248302e-05, "loss": 0.5841, "step": 1204 }, { "epoch": 0.304986079473551, "grad_norm": 0.15259750187397003, "learning_rate": 1.9652668801790334e-05, "loss": 0.5793, "step": 1205 }, { "epoch": 0.3052391799544419, "grad_norm": 0.1467946618795395, "learning_rate": 1.965204296996534e-05, "loss": 0.548, "step": 1206 }, { "epoch": 0.30549228043533283, "grad_norm": 0.15427266061306, "learning_rate": 1.9651416584809192e-05, "loss": 0.5533, "step": 1207 }, { "epoch": 0.3057453809162237, "grad_norm": 0.15036946535110474, "learning_rate": 1.9650789646357803e-05, "loss": 0.5539, "step": 1208 }, { "epoch": 0.30599848139711466, "grad_norm": 0.14675118029117584, "learning_rate": 1.965016215464712e-05, "loss": 0.5656, "step": 1209 }, { "epoch": 0.30625158187800555, "grad_norm": 0.1452735811471939, "learning_rate": 1.964953410971311e-05, "loss": 0.5528, "step": 1210 }, { "epoch": 0.3065046823588965, "grad_norm": 0.17103785276412964, "learning_rate": 1.964890551159178e-05, "loss": 0.5444, "step": 1211 }, { "epoch": 0.3067577828397874, "grad_norm": 0.1458415538072586, "learning_rate": 1.9648276360319163e-05, "loss": 0.5467, "step": 1212 }, { "epoch": 0.30701088332067833, "grad_norm": 0.14854510128498077, "learning_rate": 1.9647646655931327e-05, "loss": 0.5493, "step": 1213 }, { "epoch": 0.3072639838015692, "grad_norm": 0.14343342185020447, "learning_rate": 1.9647016398464377e-05, "loss": 0.5302, "step": 1214 }, { "epoch": 0.30751708428246016, "grad_norm": 0.17061619460582733, "learning_rate": 1.9646385587954437e-05, "loss": 0.5539, "step": 1215 }, { "epoch": 0.30777018476335105, "grad_norm": 0.14777931571006775, "learning_rate": 1.9645754224437675e-05, "loss": 0.5513, "step": 1216 }, { "epoch": 0.30802328524424194, "grad_norm": 0.14115074276924133, "learning_rate": 1.9645122307950283e-05, "loss": 0.5096, "step": 1217 }, { "epoch": 0.3082763857251329, "grad_norm": 0.14359930157661438, "learning_rate": 1.964448983852849e-05, "loss": 0.5517, "step": 1218 }, { "epoch": 0.3085294862060238, "grad_norm": 0.15274740755558014, "learning_rate": 1.9643856816208554e-05, "loss": 0.5539, "step": 1219 }, { "epoch": 0.3087825866869147, "grad_norm": 0.1473120003938675, "learning_rate": 1.964322324102676e-05, "loss": 0.5475, "step": 1220 }, { "epoch": 0.3090356871678056, "grad_norm": 0.14768411219120026, "learning_rate": 1.964258911301944e-05, "loss": 0.5569, "step": 1221 }, { "epoch": 0.30928878764869655, "grad_norm": 0.14215229451656342, "learning_rate": 1.9641954432222932e-05, "loss": 0.542, "step": 1222 }, { "epoch": 0.30954188812958744, "grad_norm": 0.15531481802463531, "learning_rate": 1.9641319198673634e-05, "loss": 0.5389, "step": 1223 }, { "epoch": 0.3097949886104784, "grad_norm": 0.14298652112483978, "learning_rate": 1.964068341240796e-05, "loss": 0.5622, "step": 1224 }, { "epoch": 0.31004808909136927, "grad_norm": 0.1486559808254242, "learning_rate": 1.964004707346235e-05, "loss": 0.5679, "step": 1225 }, { "epoch": 0.31030118957226016, "grad_norm": 0.14799034595489502, "learning_rate": 1.9639410181873296e-05, "loss": 0.5861, "step": 1226 }, { "epoch": 0.3105542900531511, "grad_norm": 0.15091361105442047, "learning_rate": 1.96387727376773e-05, "loss": 0.5825, "step": 1227 }, { "epoch": 0.310807390534042, "grad_norm": 0.1536412090063095, "learning_rate": 1.9638134740910914e-05, "loss": 0.5909, "step": 1228 }, { "epoch": 0.31106049101493294, "grad_norm": 0.15981240570545197, "learning_rate": 1.9637496191610703e-05, "loss": 0.5426, "step": 1229 }, { "epoch": 0.3113135914958238, "grad_norm": 0.15578638017177582, "learning_rate": 1.963685708981328e-05, "loss": 0.5561, "step": 1230 }, { "epoch": 0.31156669197671477, "grad_norm": 0.14452330768108368, "learning_rate": 1.9636217435555282e-05, "loss": 0.5751, "step": 1231 }, { "epoch": 0.31181979245760566, "grad_norm": 0.1484268307685852, "learning_rate": 1.963557722887338e-05, "loss": 0.5433, "step": 1232 }, { "epoch": 0.3120728929384966, "grad_norm": 0.14375555515289307, "learning_rate": 1.963493646980428e-05, "loss": 0.566, "step": 1233 }, { "epoch": 0.3123259934193875, "grad_norm": 0.14822691679000854, "learning_rate": 1.96342951583847e-05, "loss": 0.5551, "step": 1234 }, { "epoch": 0.31257909390027844, "grad_norm": 0.14368198812007904, "learning_rate": 1.963365329465142e-05, "loss": 0.5509, "step": 1235 }, { "epoch": 0.3128321943811693, "grad_norm": 0.1450127214193344, "learning_rate": 1.9633010878641236e-05, "loss": 0.5796, "step": 1236 }, { "epoch": 0.3130852948620602, "grad_norm": 0.14795133471488953, "learning_rate": 1.963236791039097e-05, "loss": 0.5687, "step": 1237 }, { "epoch": 0.31333839534295116, "grad_norm": 0.14844830334186554, "learning_rate": 1.9631724389937478e-05, "loss": 0.5522, "step": 1238 }, { "epoch": 0.31359149582384205, "grad_norm": 0.14689283072948456, "learning_rate": 1.9631080317317662e-05, "loss": 0.559, "step": 1239 }, { "epoch": 0.313844596304733, "grad_norm": 0.14795449376106262, "learning_rate": 1.9630435692568443e-05, "loss": 0.5182, "step": 1240 }, { "epoch": 0.3140976967856239, "grad_norm": 0.14915379881858826, "learning_rate": 1.9629790515726773e-05, "loss": 0.5407, "step": 1241 }, { "epoch": 0.3143507972665148, "grad_norm": 0.15982119739055634, "learning_rate": 1.9629144786829642e-05, "loss": 0.5515, "step": 1242 }, { "epoch": 0.3146038977474057, "grad_norm": 0.1536249965429306, "learning_rate": 1.9628498505914065e-05, "loss": 0.557, "step": 1243 }, { "epoch": 0.31485699822829666, "grad_norm": 0.15894600749015808, "learning_rate": 1.962785167301709e-05, "loss": 0.5411, "step": 1244 }, { "epoch": 0.31511009870918755, "grad_norm": 0.30941352248191833, "learning_rate": 1.9627204288175806e-05, "loss": 0.5907, "step": 1245 }, { "epoch": 0.31536319919007844, "grad_norm": 0.15000180900096893, "learning_rate": 1.9626556351427318e-05, "loss": 0.5546, "step": 1246 }, { "epoch": 0.3156162996709694, "grad_norm": 0.1520778387784958, "learning_rate": 1.9625907862808777e-05, "loss": 0.5743, "step": 1247 }, { "epoch": 0.31586940015186027, "grad_norm": 0.14461271464824677, "learning_rate": 1.9625258822357355e-05, "loss": 0.5321, "step": 1248 }, { "epoch": 0.3161225006327512, "grad_norm": 0.15170255303382874, "learning_rate": 1.9624609230110266e-05, "loss": 0.5558, "step": 1249 }, { "epoch": 0.3163756011136421, "grad_norm": 0.14372846484184265, "learning_rate": 1.9623959086104746e-05, "loss": 0.5265, "step": 1250 }, { "epoch": 0.31662870159453305, "grad_norm": 0.14450037479400635, "learning_rate": 1.9623308390378062e-05, "loss": 0.5717, "step": 1251 }, { "epoch": 0.31688180207542394, "grad_norm": 0.14532533288002014, "learning_rate": 1.9622657142967523e-05, "loss": 0.5581, "step": 1252 }, { "epoch": 0.3171349025563149, "grad_norm": 0.1428423672914505, "learning_rate": 1.9622005343910464e-05, "loss": 0.5561, "step": 1253 }, { "epoch": 0.31738800303720577, "grad_norm": 0.14719721674919128, "learning_rate": 1.9621352993244244e-05, "loss": 0.576, "step": 1254 }, { "epoch": 0.31764110351809666, "grad_norm": 0.15930767357349396, "learning_rate": 1.9620700091006274e-05, "loss": 0.5626, "step": 1255 }, { "epoch": 0.3178942039989876, "grad_norm": 0.13895705342292786, "learning_rate": 1.962004663723397e-05, "loss": 0.517, "step": 1256 }, { "epoch": 0.3181473044798785, "grad_norm": 0.1523996889591217, "learning_rate": 1.96193926319648e-05, "loss": 0.5687, "step": 1257 }, { "epoch": 0.31840040496076943, "grad_norm": 0.14647218585014343, "learning_rate": 1.9618738075236258e-05, "loss": 0.5316, "step": 1258 }, { "epoch": 0.3186535054416603, "grad_norm": 0.18226373195648193, "learning_rate": 1.961808296708586e-05, "loss": 0.5278, "step": 1259 }, { "epoch": 0.31890660592255127, "grad_norm": 0.1543813794851303, "learning_rate": 1.961742730755117e-05, "loss": 0.5644, "step": 1260 }, { "epoch": 0.31915970640344216, "grad_norm": 0.14397448301315308, "learning_rate": 1.961677109666978e-05, "loss": 0.5058, "step": 1261 }, { "epoch": 0.3194128068843331, "grad_norm": 0.14688347280025482, "learning_rate": 1.9616114334479293e-05, "loss": 0.5484, "step": 1262 }, { "epoch": 0.319665907365224, "grad_norm": 0.16274970769882202, "learning_rate": 1.9615457021017376e-05, "loss": 0.5387, "step": 1263 }, { "epoch": 0.31991900784611493, "grad_norm": 0.14345817267894745, "learning_rate": 1.96147991563217e-05, "loss": 0.5318, "step": 1264 }, { "epoch": 0.3201721083270058, "grad_norm": 0.14996305108070374, "learning_rate": 1.9614140740429987e-05, "loss": 0.5673, "step": 1265 }, { "epoch": 0.3204252088078967, "grad_norm": 0.14910423755645752, "learning_rate": 1.961348177337998e-05, "loss": 0.5521, "step": 1266 }, { "epoch": 0.32067830928878766, "grad_norm": 0.15267117321491241, "learning_rate": 1.9612822255209448e-05, "loss": 0.5619, "step": 1267 }, { "epoch": 0.32093140976967854, "grad_norm": 0.15022993087768555, "learning_rate": 1.9612162185956215e-05, "loss": 0.5441, "step": 1268 }, { "epoch": 0.3211845102505695, "grad_norm": 0.14584487676620483, "learning_rate": 1.9611501565658112e-05, "loss": 0.5511, "step": 1269 }, { "epoch": 0.3214376107314604, "grad_norm": 0.14786754548549652, "learning_rate": 1.9610840394353012e-05, "loss": 0.5311, "step": 1270 }, { "epoch": 0.3216907112123513, "grad_norm": 0.14073446393013, "learning_rate": 1.961017867207882e-05, "loss": 0.5499, "step": 1271 }, { "epoch": 0.3219438116932422, "grad_norm": 0.152338907122612, "learning_rate": 1.960951639887347e-05, "loss": 0.5565, "step": 1272 }, { "epoch": 0.32219691217413315, "grad_norm": 0.14627261459827423, "learning_rate": 1.9608853574774928e-05, "loss": 0.5581, "step": 1273 }, { "epoch": 0.32245001265502404, "grad_norm": 0.1517730951309204, "learning_rate": 1.9608190199821194e-05, "loss": 0.5723, "step": 1274 }, { "epoch": 0.32270311313591493, "grad_norm": 0.14177106320858002, "learning_rate": 1.9607526274050296e-05, "loss": 0.5347, "step": 1275 }, { "epoch": 0.3229562136168059, "grad_norm": 0.1493445485830307, "learning_rate": 1.9606861797500297e-05, "loss": 0.5429, "step": 1276 }, { "epoch": 0.32320931409769676, "grad_norm": 0.14698271453380585, "learning_rate": 1.9606196770209293e-05, "loss": 0.5202, "step": 1277 }, { "epoch": 0.3234624145785877, "grad_norm": 0.15068919956684113, "learning_rate": 1.96055311922154e-05, "loss": 0.5311, "step": 1278 }, { "epoch": 0.3237155150594786, "grad_norm": 0.16112153232097626, "learning_rate": 1.9604865063556782e-05, "loss": 0.5354, "step": 1279 }, { "epoch": 0.32396861554036954, "grad_norm": 0.14112502336502075, "learning_rate": 1.9604198384271623e-05, "loss": 0.5646, "step": 1280 }, { "epoch": 0.32422171602126043, "grad_norm": 0.16473205387592316, "learning_rate": 1.9603531154398142e-05, "loss": 0.5443, "step": 1281 }, { "epoch": 0.3244748165021514, "grad_norm": 0.1423436552286148, "learning_rate": 1.9602863373974598e-05, "loss": 0.5461, "step": 1282 }, { "epoch": 0.32472791698304226, "grad_norm": 0.17554926872253418, "learning_rate": 1.9602195043039262e-05, "loss": 0.5414, "step": 1283 }, { "epoch": 0.3249810174639332, "grad_norm": 0.1491205394268036, "learning_rate": 1.960152616163045e-05, "loss": 0.55, "step": 1284 }, { "epoch": 0.3252341179448241, "grad_norm": 0.14338554441928864, "learning_rate": 1.9600856729786515e-05, "loss": 0.5461, "step": 1285 }, { "epoch": 0.325487218425715, "grad_norm": 0.14553028345108032, "learning_rate": 1.960018674754583e-05, "loss": 0.5636, "step": 1286 }, { "epoch": 0.32574031890660593, "grad_norm": 0.15363600850105286, "learning_rate": 1.9599516214946802e-05, "loss": 0.5671, "step": 1287 }, { "epoch": 0.3259934193874968, "grad_norm": 0.14060044288635254, "learning_rate": 1.959884513202787e-05, "loss": 0.5303, "step": 1288 }, { "epoch": 0.32624651986838776, "grad_norm": 0.15078315138816833, "learning_rate": 1.959817349882751e-05, "loss": 0.5658, "step": 1289 }, { "epoch": 0.32649962034927865, "grad_norm": 0.1502080112695694, "learning_rate": 1.9597501315384223e-05, "loss": 0.5571, "step": 1290 }, { "epoch": 0.3267527208301696, "grad_norm": 0.14814816415309906, "learning_rate": 1.9596828581736545e-05, "loss": 0.5646, "step": 1291 }, { "epoch": 0.3270058213110605, "grad_norm": 0.15655817091464996, "learning_rate": 1.9596155297923037e-05, "loss": 0.5691, "step": 1292 }, { "epoch": 0.32725892179195143, "grad_norm": 0.1446794867515564, "learning_rate": 1.9595481463982308e-05, "loss": 0.5333, "step": 1293 }, { "epoch": 0.3275120222728423, "grad_norm": 0.1509925127029419, "learning_rate": 1.9594807079952978e-05, "loss": 0.55, "step": 1294 }, { "epoch": 0.3277651227537332, "grad_norm": 0.1459847241640091, "learning_rate": 1.959413214587371e-05, "loss": 0.5386, "step": 1295 }, { "epoch": 0.32801822323462415, "grad_norm": 0.1447642296552658, "learning_rate": 1.95934566617832e-05, "loss": 0.5471, "step": 1296 }, { "epoch": 0.32827132371551504, "grad_norm": 0.15009340643882751, "learning_rate": 1.9592780627720168e-05, "loss": 0.5552, "step": 1297 }, { "epoch": 0.328524424196406, "grad_norm": 0.1540219485759735, "learning_rate": 1.9592104043723372e-05, "loss": 0.5529, "step": 1298 }, { "epoch": 0.3287775246772969, "grad_norm": 0.14725595712661743, "learning_rate": 1.9591426909831595e-05, "loss": 0.5668, "step": 1299 }, { "epoch": 0.3290306251581878, "grad_norm": 0.1435309201478958, "learning_rate": 1.9590749226083664e-05, "loss": 0.5603, "step": 1300 }, { "epoch": 0.3292837256390787, "grad_norm": 0.151546910405159, "learning_rate": 1.959007099251842e-05, "loss": 0.5497, "step": 1301 }, { "epoch": 0.32953682611996965, "grad_norm": 0.1673007309436798, "learning_rate": 1.9589392209174756e-05, "loss": 0.5575, "step": 1302 }, { "epoch": 0.32978992660086054, "grad_norm": 0.1480465680360794, "learning_rate": 1.9588712876091572e-05, "loss": 0.5506, "step": 1303 }, { "epoch": 0.33004302708175143, "grad_norm": 0.15586607158184052, "learning_rate": 1.958803299330782e-05, "loss": 0.5569, "step": 1304 }, { "epoch": 0.33029612756264237, "grad_norm": 0.1441100686788559, "learning_rate": 1.9587352560862473e-05, "loss": 0.5527, "step": 1305 }, { "epoch": 0.33054922804353326, "grad_norm": 0.15386159718036652, "learning_rate": 1.9586671578794544e-05, "loss": 0.5527, "step": 1306 }, { "epoch": 0.3308023285244242, "grad_norm": 0.15203054249286652, "learning_rate": 1.958599004714307e-05, "loss": 0.5776, "step": 1307 }, { "epoch": 0.3310554290053151, "grad_norm": 0.15152958035469055, "learning_rate": 1.958530796594712e-05, "loss": 0.5666, "step": 1308 }, { "epoch": 0.33130852948620604, "grad_norm": 0.15217675268650055, "learning_rate": 1.9584625335245792e-05, "loss": 0.5522, "step": 1309 }, { "epoch": 0.3315616299670969, "grad_norm": 0.15009896457195282, "learning_rate": 1.958394215507823e-05, "loss": 0.5431, "step": 1310 }, { "epoch": 0.33181473044798787, "grad_norm": 0.14295952022075653, "learning_rate": 1.958325842548359e-05, "loss": 0.5504, "step": 1311 }, { "epoch": 0.33206783092887876, "grad_norm": 0.14832429587841034, "learning_rate": 1.9582574146501077e-05, "loss": 0.5468, "step": 1312 }, { "epoch": 0.3323209314097697, "grad_norm": 0.14665763080120087, "learning_rate": 1.9581889318169915e-05, "loss": 0.5596, "step": 1313 }, { "epoch": 0.3325740318906606, "grad_norm": 0.15014337003231049, "learning_rate": 1.9581203940529362e-05, "loss": 0.5636, "step": 1314 }, { "epoch": 0.3328271323715515, "grad_norm": 0.2629657983779907, "learning_rate": 1.9580518013618714e-05, "loss": 0.5379, "step": 1315 }, { "epoch": 0.3330802328524424, "grad_norm": 0.14648117125034332, "learning_rate": 1.9579831537477286e-05, "loss": 0.5614, "step": 1316 }, { "epoch": 0.3333333333333333, "grad_norm": 0.15964412689208984, "learning_rate": 1.9579144512144442e-05, "loss": 0.5497, "step": 1317 }, { "epoch": 0.33358643381422426, "grad_norm": 0.14608141779899597, "learning_rate": 1.957845693765956e-05, "loss": 0.5498, "step": 1318 }, { "epoch": 0.33383953429511515, "grad_norm": 0.1524888575077057, "learning_rate": 1.9577768814062058e-05, "loss": 0.5459, "step": 1319 }, { "epoch": 0.3340926347760061, "grad_norm": 0.14381134510040283, "learning_rate": 1.9577080141391393e-05, "loss": 0.5614, "step": 1320 }, { "epoch": 0.334345735256897, "grad_norm": 0.14823700487613678, "learning_rate": 1.9576390919687033e-05, "loss": 0.5611, "step": 1321 }, { "epoch": 0.3345988357377879, "grad_norm": 0.14616329967975616, "learning_rate": 1.9575701148988497e-05, "loss": 0.52, "step": 1322 }, { "epoch": 0.3348519362186788, "grad_norm": 0.1496943235397339, "learning_rate": 1.9575010829335328e-05, "loss": 0.5572, "step": 1323 }, { "epoch": 0.3351050366995697, "grad_norm": 0.1528523713350296, "learning_rate": 1.95743199607671e-05, "loss": 0.5452, "step": 1324 }, { "epoch": 0.33535813718046065, "grad_norm": 0.14889608323574066, "learning_rate": 1.9573628543323414e-05, "loss": 0.5365, "step": 1325 }, { "epoch": 0.33561123766135154, "grad_norm": 0.1547989845275879, "learning_rate": 1.9572936577043915e-05, "loss": 0.5476, "step": 1326 }, { "epoch": 0.3358643381422425, "grad_norm": 0.17157310247421265, "learning_rate": 1.9572244061968265e-05, "loss": 0.5476, "step": 1327 }, { "epoch": 0.33611743862313337, "grad_norm": 0.1616293042898178, "learning_rate": 1.9571550998136172e-05, "loss": 0.5407, "step": 1328 }, { "epoch": 0.3363705391040243, "grad_norm": 0.15019971132278442, "learning_rate": 1.9570857385587363e-05, "loss": 0.5362, "step": 1329 }, { "epoch": 0.3366236395849152, "grad_norm": 0.15603739023208618, "learning_rate": 1.9570163224361602e-05, "loss": 0.5458, "step": 1330 }, { "epoch": 0.33687674006580615, "grad_norm": 0.14277146756649017, "learning_rate": 1.9569468514498683e-05, "loss": 0.5376, "step": 1331 }, { "epoch": 0.33712984054669703, "grad_norm": 0.14798814058303833, "learning_rate": 1.9568773256038437e-05, "loss": 0.5618, "step": 1332 }, { "epoch": 0.337382941027588, "grad_norm": 0.15534618496894836, "learning_rate": 1.9568077449020714e-05, "loss": 0.5585, "step": 1333 }, { "epoch": 0.33763604150847887, "grad_norm": 0.15266895294189453, "learning_rate": 1.9567381093485407e-05, "loss": 0.5676, "step": 1334 }, { "epoch": 0.33788914198936976, "grad_norm": 0.14401869475841522, "learning_rate": 1.9566684189472437e-05, "loss": 0.5631, "step": 1335 }, { "epoch": 0.3381422424702607, "grad_norm": 0.14957746863365173, "learning_rate": 1.9565986737021755e-05, "loss": 0.5474, "step": 1336 }, { "epoch": 0.3383953429511516, "grad_norm": 0.1466982066631317, "learning_rate": 1.9565288736173347e-05, "loss": 0.5371, "step": 1337 }, { "epoch": 0.33864844343204253, "grad_norm": 0.153359055519104, "learning_rate": 1.9564590186967224e-05, "loss": 0.5191, "step": 1338 }, { "epoch": 0.3389015439129334, "grad_norm": 0.1684064418077469, "learning_rate": 1.9563891089443436e-05, "loss": 0.5869, "step": 1339 }, { "epoch": 0.33915464439382437, "grad_norm": 0.14754392206668854, "learning_rate": 1.956319144364206e-05, "loss": 0.5393, "step": 1340 }, { "epoch": 0.33940774487471526, "grad_norm": 0.14714030921459198, "learning_rate": 1.9562491249603205e-05, "loss": 0.5592, "step": 1341 }, { "epoch": 0.3396608453556062, "grad_norm": 0.14940780401229858, "learning_rate": 1.956179050736701e-05, "loss": 0.5767, "step": 1342 }, { "epoch": 0.3399139458364971, "grad_norm": 0.1524302363395691, "learning_rate": 1.9561089216973644e-05, "loss": 0.5508, "step": 1343 }, { "epoch": 0.340167046317388, "grad_norm": 0.1543436348438263, "learning_rate": 1.956038737846332e-05, "loss": 0.5439, "step": 1344 }, { "epoch": 0.3404201467982789, "grad_norm": 0.1465919464826584, "learning_rate": 1.9559684991876264e-05, "loss": 0.5602, "step": 1345 }, { "epoch": 0.3406732472791698, "grad_norm": 0.1619967222213745, "learning_rate": 1.9558982057252747e-05, "loss": 0.5605, "step": 1346 }, { "epoch": 0.34092634776006076, "grad_norm": 0.1516886055469513, "learning_rate": 1.9558278574633066e-05, "loss": 0.5531, "step": 1347 }, { "epoch": 0.34117944824095164, "grad_norm": 0.13994504511356354, "learning_rate": 1.9557574544057552e-05, "loss": 0.5326, "step": 1348 }, { "epoch": 0.3414325487218426, "grad_norm": 0.145668625831604, "learning_rate": 1.955686996556656e-05, "loss": 0.5703, "step": 1349 }, { "epoch": 0.3416856492027335, "grad_norm": 0.14654158055782318, "learning_rate": 1.9556164839200487e-05, "loss": 0.5418, "step": 1350 }, { "epoch": 0.3419387496836244, "grad_norm": 0.14354093372821808, "learning_rate": 1.9555459164999752e-05, "loss": 0.5769, "step": 1351 }, { "epoch": 0.3421918501645153, "grad_norm": 0.1480148732662201, "learning_rate": 1.9554752943004816e-05, "loss": 0.5573, "step": 1352 }, { "epoch": 0.3424449506454062, "grad_norm": 0.16286349296569824, "learning_rate": 1.955404617325616e-05, "loss": 0.544, "step": 1353 }, { "epoch": 0.34269805112629714, "grad_norm": 0.15159283578395844, "learning_rate": 1.9553338855794302e-05, "loss": 0.5495, "step": 1354 }, { "epoch": 0.34295115160718803, "grad_norm": 0.14285486936569214, "learning_rate": 1.9552630990659796e-05, "loss": 0.5738, "step": 1355 }, { "epoch": 0.343204252088079, "grad_norm": 0.14942722022533417, "learning_rate": 1.9551922577893214e-05, "loss": 0.586, "step": 1356 }, { "epoch": 0.34345735256896986, "grad_norm": 0.1516982465982437, "learning_rate": 1.9551213617535176e-05, "loss": 0.5626, "step": 1357 }, { "epoch": 0.3437104530498608, "grad_norm": 0.15070468187332153, "learning_rate": 1.9550504109626324e-05, "loss": 0.5479, "step": 1358 }, { "epoch": 0.3439635535307517, "grad_norm": 0.1549229472875595, "learning_rate": 1.9549794054207324e-05, "loss": 0.547, "step": 1359 }, { "epoch": 0.34421665401164264, "grad_norm": 0.1502760797739029, "learning_rate": 1.9549083451318893e-05, "loss": 0.542, "step": 1360 }, { "epoch": 0.34446975449253353, "grad_norm": 0.14848487079143524, "learning_rate": 1.9548372301001764e-05, "loss": 0.5609, "step": 1361 }, { "epoch": 0.3447228549734245, "grad_norm": 0.1615775227546692, "learning_rate": 1.9547660603296702e-05, "loss": 0.5792, "step": 1362 }, { "epoch": 0.34497595545431536, "grad_norm": 0.15550091862678528, "learning_rate": 1.9546948358244513e-05, "loss": 0.5413, "step": 1363 }, { "epoch": 0.34522905593520625, "grad_norm": 0.14651690423488617, "learning_rate": 1.9546235565886024e-05, "loss": 0.5402, "step": 1364 }, { "epoch": 0.3454821564160972, "grad_norm": 0.1453961730003357, "learning_rate": 1.9545522226262102e-05, "loss": 0.5549, "step": 1365 }, { "epoch": 0.3457352568969881, "grad_norm": 0.14784598350524902, "learning_rate": 1.954480833941364e-05, "loss": 0.5399, "step": 1366 }, { "epoch": 0.34598835737787903, "grad_norm": 0.16435948014259338, "learning_rate": 1.954409390538156e-05, "loss": 0.5305, "step": 1367 }, { "epoch": 0.3462414578587699, "grad_norm": 0.1500237137079239, "learning_rate": 1.954337892420682e-05, "loss": 0.533, "step": 1368 }, { "epoch": 0.34649455833966086, "grad_norm": 0.14695794880390167, "learning_rate": 1.9542663395930414e-05, "loss": 0.5699, "step": 1369 }, { "epoch": 0.34674765882055175, "grad_norm": 0.1687469333410263, "learning_rate": 1.9541947320593356e-05, "loss": 0.5452, "step": 1370 }, { "epoch": 0.3470007593014427, "grad_norm": 0.14847731590270996, "learning_rate": 1.9541230698236703e-05, "loss": 0.5434, "step": 1371 }, { "epoch": 0.3472538597823336, "grad_norm": 0.14137008786201477, "learning_rate": 1.954051352890153e-05, "loss": 0.5174, "step": 1372 }, { "epoch": 0.3475069602632245, "grad_norm": 0.1569334715604782, "learning_rate": 1.953979581262895e-05, "loss": 0.5393, "step": 1373 }, { "epoch": 0.3477600607441154, "grad_norm": 0.14948101341724396, "learning_rate": 1.953907754946012e-05, "loss": 0.5647, "step": 1374 }, { "epoch": 0.3480131612250063, "grad_norm": 0.1450328826904297, "learning_rate": 1.9538358739436206e-05, "loss": 0.57, "step": 1375 }, { "epoch": 0.34826626170589725, "grad_norm": 0.14773407578468323, "learning_rate": 1.9537639382598417e-05, "loss": 0.553, "step": 1376 }, { "epoch": 0.34851936218678814, "grad_norm": 0.18741102516651154, "learning_rate": 1.9536919478987995e-05, "loss": 0.5694, "step": 1377 }, { "epoch": 0.3487724626676791, "grad_norm": 0.14810268580913544, "learning_rate": 1.953619902864621e-05, "loss": 0.5612, "step": 1378 }, { "epoch": 0.34902556314857, "grad_norm": 0.1470220386981964, "learning_rate": 1.9535478031614362e-05, "loss": 0.5428, "step": 1379 }, { "epoch": 0.3492786636294609, "grad_norm": 0.15955446660518646, "learning_rate": 1.9534756487933784e-05, "loss": 0.5645, "step": 1380 }, { "epoch": 0.3495317641103518, "grad_norm": 0.15951718389987946, "learning_rate": 1.9534034397645844e-05, "loss": 0.5398, "step": 1381 }, { "epoch": 0.34978486459124275, "grad_norm": 0.15523962676525116, "learning_rate": 1.9533311760791937e-05, "loss": 0.5445, "step": 1382 }, { "epoch": 0.35003796507213364, "grad_norm": 0.14976316690444946, "learning_rate": 1.9532588577413487e-05, "loss": 0.5801, "step": 1383 }, { "epoch": 0.3502910655530245, "grad_norm": 0.15287049114704132, "learning_rate": 1.9531864847551958e-05, "loss": 0.5723, "step": 1384 }, { "epoch": 0.35054416603391547, "grad_norm": 0.14190274477005005, "learning_rate": 1.9531140571248835e-05, "loss": 0.547, "step": 1385 }, { "epoch": 0.35079726651480636, "grad_norm": 0.14908644556999207, "learning_rate": 1.9530415748545638e-05, "loss": 0.5525, "step": 1386 }, { "epoch": 0.3510503669956973, "grad_norm": 0.1516028642654419, "learning_rate": 1.9529690379483926e-05, "loss": 0.5697, "step": 1387 }, { "epoch": 0.3513034674765882, "grad_norm": 0.14582683145999908, "learning_rate": 1.9528964464105276e-05, "loss": 0.5341, "step": 1388 }, { "epoch": 0.35155656795747914, "grad_norm": 0.15602953732013702, "learning_rate": 1.952823800245131e-05, "loss": 0.5573, "step": 1389 }, { "epoch": 0.35180966843837, "grad_norm": 0.1433050036430359, "learning_rate": 1.952751099456367e-05, "loss": 0.5232, "step": 1390 }, { "epoch": 0.35206276891926097, "grad_norm": 0.156890407204628, "learning_rate": 1.952678344048404e-05, "loss": 0.5444, "step": 1391 }, { "epoch": 0.35231586940015186, "grad_norm": 0.15238720178604126, "learning_rate": 1.9526055340254117e-05, "loss": 0.5577, "step": 1392 }, { "epoch": 0.35256896988104275, "grad_norm": 0.14713051915168762, "learning_rate": 1.952532669391565e-05, "loss": 0.5548, "step": 1393 }, { "epoch": 0.3528220703619337, "grad_norm": 0.14882564544677734, "learning_rate": 1.9524597501510408e-05, "loss": 0.5599, "step": 1394 }, { "epoch": 0.3530751708428246, "grad_norm": 0.15113122761249542, "learning_rate": 1.95238677630802e-05, "loss": 0.5508, "step": 1395 }, { "epoch": 0.3533282713237155, "grad_norm": 0.14699649810791016, "learning_rate": 1.952313747866685e-05, "loss": 0.5297, "step": 1396 }, { "epoch": 0.3535813718046064, "grad_norm": 0.14875228703022003, "learning_rate": 1.9522406648312232e-05, "loss": 0.5425, "step": 1397 }, { "epoch": 0.35383447228549736, "grad_norm": 0.1482452005147934, "learning_rate": 1.952167527205824e-05, "loss": 0.5393, "step": 1398 }, { "epoch": 0.35408757276638825, "grad_norm": 0.1498897224664688, "learning_rate": 1.95209433499468e-05, "loss": 0.5603, "step": 1399 }, { "epoch": 0.3543406732472792, "grad_norm": 0.15813641250133514, "learning_rate": 1.9520210882019878e-05, "loss": 0.5328, "step": 1400 }, { "epoch": 0.3545937737281701, "grad_norm": 0.15140895545482635, "learning_rate": 1.9519477868319457e-05, "loss": 0.5577, "step": 1401 }, { "epoch": 0.35484687420906097, "grad_norm": 0.15345318615436554, "learning_rate": 1.9518744308887566e-05, "loss": 0.5369, "step": 1402 }, { "epoch": 0.3550999746899519, "grad_norm": 0.16097603738307953, "learning_rate": 1.9518010203766256e-05, "loss": 0.5621, "step": 1403 }, { "epoch": 0.3553530751708428, "grad_norm": 0.152102530002594, "learning_rate": 1.9517275552997605e-05, "loss": 0.5524, "step": 1404 }, { "epoch": 0.35560617565173375, "grad_norm": 0.13988344371318817, "learning_rate": 1.9516540356623742e-05, "loss": 0.547, "step": 1405 }, { "epoch": 0.35585927613262464, "grad_norm": 0.1525922417640686, "learning_rate": 1.9515804614686804e-05, "loss": 0.5702, "step": 1406 }, { "epoch": 0.3561123766135156, "grad_norm": 0.14725516736507416, "learning_rate": 1.951506832722897e-05, "loss": 0.5532, "step": 1407 }, { "epoch": 0.35636547709440647, "grad_norm": 0.14990653097629547, "learning_rate": 1.9514331494292458e-05, "loss": 0.5763, "step": 1408 }, { "epoch": 0.3566185775752974, "grad_norm": 0.16074863076210022, "learning_rate": 1.95135941159195e-05, "loss": 0.5419, "step": 1409 }, { "epoch": 0.3568716780561883, "grad_norm": 0.1455734223127365, "learning_rate": 1.9512856192152376e-05, "loss": 0.5462, "step": 1410 }, { "epoch": 0.35712477853707925, "grad_norm": 0.1445416659116745, "learning_rate": 1.951211772303338e-05, "loss": 0.5269, "step": 1411 }, { "epoch": 0.35737787901797013, "grad_norm": 0.14358116686344147, "learning_rate": 1.9511378708604857e-05, "loss": 0.5465, "step": 1412 }, { "epoch": 0.357630979498861, "grad_norm": 0.15081366896629333, "learning_rate": 1.951063914890917e-05, "loss": 0.5418, "step": 1413 }, { "epoch": 0.35788407997975197, "grad_norm": 0.14173376560211182, "learning_rate": 1.950989904398871e-05, "loss": 0.5512, "step": 1414 }, { "epoch": 0.35813718046064286, "grad_norm": 0.14065559208393097, "learning_rate": 1.9509158393885914e-05, "loss": 0.5422, "step": 1415 }, { "epoch": 0.3583902809415338, "grad_norm": 0.14772234857082367, "learning_rate": 1.9508417198643234e-05, "loss": 0.5548, "step": 1416 }, { "epoch": 0.3586433814224247, "grad_norm": 0.14474010467529297, "learning_rate": 1.950767545830317e-05, "loss": 0.553, "step": 1417 }, { "epoch": 0.35889648190331563, "grad_norm": 0.1493399441242218, "learning_rate": 1.950693317290824e-05, "loss": 0.5676, "step": 1418 }, { "epoch": 0.3591495823842065, "grad_norm": 0.15082105994224548, "learning_rate": 1.9506190342500997e-05, "loss": 0.5533, "step": 1419 }, { "epoch": 0.35940268286509747, "grad_norm": 0.1458379477262497, "learning_rate": 1.9505446967124025e-05, "loss": 0.5529, "step": 1420 }, { "epoch": 0.35965578334598836, "grad_norm": 0.15100237727165222, "learning_rate": 1.9504703046819944e-05, "loss": 0.5715, "step": 1421 }, { "epoch": 0.35990888382687924, "grad_norm": 0.14533960819244385, "learning_rate": 1.9503958581631396e-05, "loss": 0.5622, "step": 1422 }, { "epoch": 0.3601619843077702, "grad_norm": 0.17402461171150208, "learning_rate": 1.9503213571601067e-05, "loss": 0.5827, "step": 1423 }, { "epoch": 0.3604150847886611, "grad_norm": 0.1494351178407669, "learning_rate": 1.950246801677166e-05, "loss": 0.5436, "step": 1424 }, { "epoch": 0.360668185269552, "grad_norm": 0.14508825540542603, "learning_rate": 1.950172191718592e-05, "loss": 0.5235, "step": 1425 }, { "epoch": 0.3609212857504429, "grad_norm": 0.14634418487548828, "learning_rate": 1.9500975272886616e-05, "loss": 0.5168, "step": 1426 }, { "epoch": 0.36117438623133385, "grad_norm": 0.14786958694458008, "learning_rate": 1.9500228083916554e-05, "loss": 0.5493, "step": 1427 }, { "epoch": 0.36142748671222474, "grad_norm": 0.1488710641860962, "learning_rate": 1.949948035031857e-05, "loss": 0.5677, "step": 1428 }, { "epoch": 0.3616805871931157, "grad_norm": 0.14901059865951538, "learning_rate": 1.9498732072135526e-05, "loss": 0.5349, "step": 1429 }, { "epoch": 0.3619336876740066, "grad_norm": 0.1447782665491104, "learning_rate": 1.9497983249410324e-05, "loss": 0.5348, "step": 1430 }, { "epoch": 0.3621867881548975, "grad_norm": 0.14676731824874878, "learning_rate": 1.9497233882185886e-05, "loss": 0.5376, "step": 1431 }, { "epoch": 0.3624398886357884, "grad_norm": 0.1531912237405777, "learning_rate": 1.949648397050518e-05, "loss": 0.5415, "step": 1432 }, { "epoch": 0.3626929891166793, "grad_norm": 0.18537327647209167, "learning_rate": 1.9495733514411187e-05, "loss": 0.5479, "step": 1433 }, { "epoch": 0.36294608959757024, "grad_norm": 0.1582462340593338, "learning_rate": 1.9494982513946937e-05, "loss": 0.5685, "step": 1434 }, { "epoch": 0.36319919007846113, "grad_norm": 0.15838588774204254, "learning_rate": 1.9494230969155484e-05, "loss": 0.5342, "step": 1435 }, { "epoch": 0.3634522905593521, "grad_norm": 0.14424557983875275, "learning_rate": 1.9493478880079903e-05, "loss": 0.5342, "step": 1436 }, { "epoch": 0.36370539104024296, "grad_norm": 0.14751091599464417, "learning_rate": 1.9492726246763322e-05, "loss": 0.5417, "step": 1437 }, { "epoch": 0.3639584915211339, "grad_norm": 0.22001822292804718, "learning_rate": 1.949197306924888e-05, "loss": 0.5519, "step": 1438 }, { "epoch": 0.3642115920020248, "grad_norm": 0.14862795174121857, "learning_rate": 1.9491219347579752e-05, "loss": 0.5499, "step": 1439 }, { "epoch": 0.36446469248291574, "grad_norm": 0.14559324085712433, "learning_rate": 1.9490465081799158e-05, "loss": 0.541, "step": 1440 }, { "epoch": 0.36471779296380663, "grad_norm": 0.14515161514282227, "learning_rate": 1.9489710271950327e-05, "loss": 0.5354, "step": 1441 }, { "epoch": 0.3649708934446975, "grad_norm": 0.15026681125164032, "learning_rate": 1.9488954918076538e-05, "loss": 0.5751, "step": 1442 }, { "epoch": 0.36522399392558846, "grad_norm": 0.1478470116853714, "learning_rate": 1.9488199020221094e-05, "loss": 0.5488, "step": 1443 }, { "epoch": 0.36547709440647935, "grad_norm": 0.1495615541934967, "learning_rate": 1.9487442578427328e-05, "loss": 0.5296, "step": 1444 }, { "epoch": 0.3657301948873703, "grad_norm": 0.14521771669387817, "learning_rate": 1.94866855927386e-05, "loss": 0.548, "step": 1445 }, { "epoch": 0.3659832953682612, "grad_norm": 0.14744411408901215, "learning_rate": 1.9485928063198313e-05, "loss": 0.5557, "step": 1446 }, { "epoch": 0.36623639584915213, "grad_norm": 0.14081555604934692, "learning_rate": 1.948516998984989e-05, "loss": 0.5326, "step": 1447 }, { "epoch": 0.366489496330043, "grad_norm": 0.14889878034591675, "learning_rate": 1.9484411372736797e-05, "loss": 0.5661, "step": 1448 }, { "epoch": 0.36674259681093396, "grad_norm": 0.1612987369298935, "learning_rate": 1.948365221190251e-05, "loss": 0.5478, "step": 1449 }, { "epoch": 0.36699569729182485, "grad_norm": 0.15219271183013916, "learning_rate": 1.9482892507390568e-05, "loss": 0.5557, "step": 1450 }, { "epoch": 0.36724879777271574, "grad_norm": 0.15758199989795685, "learning_rate": 1.948213225924451e-05, "loss": 0.6024, "step": 1451 }, { "epoch": 0.3675018982536067, "grad_norm": 0.14763115346431732, "learning_rate": 1.9481371467507923e-05, "loss": 0.5329, "step": 1452 }, { "epoch": 0.3677549987344976, "grad_norm": 0.14281617105007172, "learning_rate": 1.948061013222442e-05, "loss": 0.5341, "step": 1453 }, { "epoch": 0.3680080992153885, "grad_norm": 0.14848028123378754, "learning_rate": 1.9479848253437652e-05, "loss": 0.5416, "step": 1454 }, { "epoch": 0.3682611996962794, "grad_norm": 0.1465156376361847, "learning_rate": 1.947908583119129e-05, "loss": 0.5307, "step": 1455 }, { "epoch": 0.36851430017717035, "grad_norm": 0.14618930220603943, "learning_rate": 1.947832286552905e-05, "loss": 0.5412, "step": 1456 }, { "epoch": 0.36876740065806124, "grad_norm": 0.15129581093788147, "learning_rate": 1.9477559356494662e-05, "loss": 0.5447, "step": 1457 }, { "epoch": 0.3690205011389522, "grad_norm": 0.1473216861486435, "learning_rate": 1.94767953041319e-05, "loss": 0.5243, "step": 1458 }, { "epoch": 0.3692736016198431, "grad_norm": 0.14416159689426422, "learning_rate": 1.9476030708484568e-05, "loss": 0.5422, "step": 1459 }, { "epoch": 0.369526702100734, "grad_norm": 0.14985951781272888, "learning_rate": 1.9475265569596495e-05, "loss": 0.5885, "step": 1460 }, { "epoch": 0.3697798025816249, "grad_norm": 0.15145282447338104, "learning_rate": 1.9474499887511546e-05, "loss": 0.5507, "step": 1461 }, { "epoch": 0.3700329030625158, "grad_norm": 0.1508771777153015, "learning_rate": 1.9473733662273618e-05, "loss": 0.5431, "step": 1462 }, { "epoch": 0.37028600354340674, "grad_norm": 0.15162132680416107, "learning_rate": 1.947296689392663e-05, "loss": 0.5388, "step": 1463 }, { "epoch": 0.3705391040242976, "grad_norm": 0.14948506653308868, "learning_rate": 1.947219958251455e-05, "loss": 0.5429, "step": 1464 }, { "epoch": 0.37079220450518857, "grad_norm": 0.1548861265182495, "learning_rate": 1.947143172808136e-05, "loss": 0.5538, "step": 1465 }, { "epoch": 0.37104530498607946, "grad_norm": 0.1493985801935196, "learning_rate": 1.9470663330671077e-05, "loss": 0.5254, "step": 1466 }, { "epoch": 0.3712984054669704, "grad_norm": 0.1530543714761734, "learning_rate": 1.946989439032776e-05, "loss": 0.5355, "step": 1467 }, { "epoch": 0.3715515059478613, "grad_norm": 0.1417504996061325, "learning_rate": 1.9469124907095483e-05, "loss": 0.5415, "step": 1468 }, { "epoch": 0.37180460642875224, "grad_norm": 0.14839012920856476, "learning_rate": 1.946835488101836e-05, "loss": 0.5535, "step": 1469 }, { "epoch": 0.3720577069096431, "grad_norm": 0.1545441746711731, "learning_rate": 1.9467584312140538e-05, "loss": 0.5375, "step": 1470 }, { "epoch": 0.372310807390534, "grad_norm": 0.1494404524564743, "learning_rate": 1.946681320050619e-05, "loss": 0.5845, "step": 1471 }, { "epoch": 0.37256390787142496, "grad_norm": 0.14412353932857513, "learning_rate": 1.946604154615952e-05, "loss": 0.5404, "step": 1472 }, { "epoch": 0.37281700835231585, "grad_norm": 0.14045435190200806, "learning_rate": 1.9465269349144772e-05, "loss": 0.5584, "step": 1473 }, { "epoch": 0.3730701088332068, "grad_norm": 0.14929087460041046, "learning_rate": 1.946449660950621e-05, "loss": 0.5725, "step": 1474 }, { "epoch": 0.3733232093140977, "grad_norm": 0.14448022842407227, "learning_rate": 1.9463723327288134e-05, "loss": 0.5482, "step": 1475 }, { "epoch": 0.3735763097949886, "grad_norm": 0.145907461643219, "learning_rate": 1.946294950253487e-05, "loss": 0.5593, "step": 1476 }, { "epoch": 0.3738294102758795, "grad_norm": 0.15283805131912231, "learning_rate": 1.946217513529079e-05, "loss": 0.5719, "step": 1477 }, { "epoch": 0.37408251075677046, "grad_norm": 0.22472895681858063, "learning_rate": 1.9461400225600276e-05, "loss": 0.5779, "step": 1478 }, { "epoch": 0.37433561123766135, "grad_norm": 0.14655548334121704, "learning_rate": 1.946062477350776e-05, "loss": 0.5612, "step": 1479 }, { "epoch": 0.3745887117185523, "grad_norm": 0.14366453886032104, "learning_rate": 1.9459848779057694e-05, "loss": 0.5442, "step": 1480 }, { "epoch": 0.3748418121994432, "grad_norm": 0.148257315158844, "learning_rate": 1.9459072242294566e-05, "loss": 0.544, "step": 1481 }, { "epoch": 0.37509491268033407, "grad_norm": 0.16711963713169098, "learning_rate": 1.945829516326289e-05, "loss": 0.5755, "step": 1482 }, { "epoch": 0.375348013161225, "grad_norm": 0.1500495970249176, "learning_rate": 1.9457517542007212e-05, "loss": 0.5389, "step": 1483 }, { "epoch": 0.3756011136421159, "grad_norm": 0.14473144710063934, "learning_rate": 1.945673937857212e-05, "loss": 0.5487, "step": 1484 }, { "epoch": 0.37585421412300685, "grad_norm": 0.15230558812618256, "learning_rate": 1.9455960673002214e-05, "loss": 0.5679, "step": 1485 }, { "epoch": 0.37610731460389774, "grad_norm": 0.14546474814414978, "learning_rate": 1.9455181425342146e-05, "loss": 0.5279, "step": 1486 }, { "epoch": 0.3763604150847887, "grad_norm": 0.14962340891361237, "learning_rate": 1.945440163563658e-05, "loss": 0.547, "step": 1487 }, { "epoch": 0.37661351556567957, "grad_norm": 0.1480061560869217, "learning_rate": 1.9453621303930225e-05, "loss": 0.5455, "step": 1488 }, { "epoch": 0.3768666160465705, "grad_norm": 0.16731922328472137, "learning_rate": 1.9452840430267815e-05, "loss": 0.5802, "step": 1489 }, { "epoch": 0.3771197165274614, "grad_norm": 0.15016759932041168, "learning_rate": 1.9452059014694115e-05, "loss": 0.5399, "step": 1490 }, { "epoch": 0.3773728170083523, "grad_norm": 0.14352989196777344, "learning_rate": 1.945127705725392e-05, "loss": 0.5498, "step": 1491 }, { "epoch": 0.37762591748924323, "grad_norm": 0.16667211055755615, "learning_rate": 1.945049455799206e-05, "loss": 0.5452, "step": 1492 }, { "epoch": 0.3778790179701341, "grad_norm": 0.15130731463432312, "learning_rate": 1.9449711516953394e-05, "loss": 0.5647, "step": 1493 }, { "epoch": 0.37813211845102507, "grad_norm": 0.1445522904396057, "learning_rate": 1.9448927934182812e-05, "loss": 0.5613, "step": 1494 }, { "epoch": 0.37838521893191596, "grad_norm": 0.14771664142608643, "learning_rate": 1.9448143809725234e-05, "loss": 0.5514, "step": 1495 }, { "epoch": 0.3786383194128069, "grad_norm": 0.1448034942150116, "learning_rate": 1.9447359143625614e-05, "loss": 0.5448, "step": 1496 }, { "epoch": 0.3788914198936978, "grad_norm": 0.16282175481319427, "learning_rate": 1.9446573935928937e-05, "loss": 0.5674, "step": 1497 }, { "epoch": 0.37914452037458873, "grad_norm": 0.14873945713043213, "learning_rate": 1.9445788186680214e-05, "loss": 0.5233, "step": 1498 }, { "epoch": 0.3793976208554796, "grad_norm": 0.14670808613300323, "learning_rate": 1.9445001895924486e-05, "loss": 0.5589, "step": 1499 }, { "epoch": 0.37965072133637057, "grad_norm": 0.1473490446805954, "learning_rate": 1.944421506370684e-05, "loss": 0.5664, "step": 1500 }, { "epoch": 0.37990382181726146, "grad_norm": 0.14425857365131378, "learning_rate": 1.9443427690072377e-05, "loss": 0.5402, "step": 1501 }, { "epoch": 0.38015692229815234, "grad_norm": 0.14268651604652405, "learning_rate": 1.9442639775066235e-05, "loss": 0.5331, "step": 1502 }, { "epoch": 0.3804100227790433, "grad_norm": 0.14433452486991882, "learning_rate": 1.9441851318733586e-05, "loss": 0.5492, "step": 1503 }, { "epoch": 0.3806631232599342, "grad_norm": 0.15032480657100677, "learning_rate": 1.9441062321119628e-05, "loss": 0.5567, "step": 1504 }, { "epoch": 0.3809162237408251, "grad_norm": 0.14846478402614594, "learning_rate": 1.9440272782269595e-05, "loss": 0.5623, "step": 1505 }, { "epoch": 0.381169324221716, "grad_norm": 0.15278089046478271, "learning_rate": 1.9439482702228748e-05, "loss": 0.5611, "step": 1506 }, { "epoch": 0.38142242470260695, "grad_norm": 0.15438181161880493, "learning_rate": 1.943869208104238e-05, "loss": 0.5657, "step": 1507 }, { "epoch": 0.38167552518349784, "grad_norm": 0.14381231367588043, "learning_rate": 1.943790091875582e-05, "loss": 0.5571, "step": 1508 }, { "epoch": 0.3819286256643888, "grad_norm": 0.145586296916008, "learning_rate": 1.943710921541442e-05, "loss": 0.5621, "step": 1509 }, { "epoch": 0.3821817261452797, "grad_norm": 0.15460233390331268, "learning_rate": 1.943631697106356e-05, "loss": 0.5711, "step": 1510 }, { "epoch": 0.38243482662617057, "grad_norm": 0.14406736195087433, "learning_rate": 1.9435524185748673e-05, "loss": 0.546, "step": 1511 }, { "epoch": 0.3826879271070615, "grad_norm": 0.14487136900424957, "learning_rate": 1.9434730859515195e-05, "loss": 0.5353, "step": 1512 }, { "epoch": 0.3829410275879524, "grad_norm": 0.1524420529603958, "learning_rate": 1.9433936992408615e-05, "loss": 0.5387, "step": 1513 }, { "epoch": 0.38319412806884334, "grad_norm": 0.14421406388282776, "learning_rate": 1.943314258447443e-05, "loss": 0.5534, "step": 1514 }, { "epoch": 0.38344722854973423, "grad_norm": 0.14695259928703308, "learning_rate": 1.94323476357582e-05, "loss": 0.5437, "step": 1515 }, { "epoch": 0.3837003290306252, "grad_norm": 0.14831073582172394, "learning_rate": 1.9431552146305484e-05, "loss": 0.5444, "step": 1516 }, { "epoch": 0.38395342951151606, "grad_norm": 0.14436882734298706, "learning_rate": 1.9430756116161892e-05, "loss": 0.5358, "step": 1517 }, { "epoch": 0.384206529992407, "grad_norm": 0.15285713970661163, "learning_rate": 1.9429959545373056e-05, "loss": 0.5586, "step": 1518 }, { "epoch": 0.3844596304732979, "grad_norm": 0.14679375290870667, "learning_rate": 1.9429162433984642e-05, "loss": 0.5565, "step": 1519 }, { "epoch": 0.3847127309541888, "grad_norm": 0.14445728063583374, "learning_rate": 1.9428364782042345e-05, "loss": 0.5491, "step": 1520 }, { "epoch": 0.38496583143507973, "grad_norm": 0.14439482986927032, "learning_rate": 1.9427566589591896e-05, "loss": 0.5623, "step": 1521 }, { "epoch": 0.3852189319159706, "grad_norm": 0.14486654102802277, "learning_rate": 1.9426767856679055e-05, "loss": 0.5658, "step": 1522 }, { "epoch": 0.38547203239686156, "grad_norm": 0.14940501749515533, "learning_rate": 1.9425968583349608e-05, "loss": 0.5596, "step": 1523 }, { "epoch": 0.38572513287775245, "grad_norm": 0.14470356702804565, "learning_rate": 1.9425168769649377e-05, "loss": 0.5471, "step": 1524 }, { "epoch": 0.3859782333586434, "grad_norm": 0.1546734869480133, "learning_rate": 1.9424368415624216e-05, "loss": 0.5414, "step": 1525 }, { "epoch": 0.3862313338395343, "grad_norm": 0.14746522903442383, "learning_rate": 1.942356752132e-05, "loss": 0.5369, "step": 1526 }, { "epoch": 0.38648443432042523, "grad_norm": 0.2848914861679077, "learning_rate": 1.942276608678265e-05, "loss": 0.5377, "step": 1527 }, { "epoch": 0.3867375348013161, "grad_norm": 0.14106248319149017, "learning_rate": 1.9421964112058108e-05, "loss": 0.5719, "step": 1528 }, { "epoch": 0.38699063528220706, "grad_norm": 0.15155471861362457, "learning_rate": 1.942116159719235e-05, "loss": 0.5347, "step": 1529 }, { "epoch": 0.38724373576309795, "grad_norm": 0.14718204736709595, "learning_rate": 1.9420358542231383e-05, "loss": 0.5376, "step": 1530 }, { "epoch": 0.38749683624398884, "grad_norm": 0.14534416794776917, "learning_rate": 1.9419554947221245e-05, "loss": 0.5681, "step": 1531 }, { "epoch": 0.3877499367248798, "grad_norm": 0.15082937479019165, "learning_rate": 1.9418750812208002e-05, "loss": 0.5679, "step": 1532 }, { "epoch": 0.3880030372057707, "grad_norm": 0.14653445780277252, "learning_rate": 1.941794613723775e-05, "loss": 0.5427, "step": 1533 }, { "epoch": 0.3882561376866616, "grad_norm": 0.19520319998264313, "learning_rate": 1.9417140922356626e-05, "loss": 0.5413, "step": 1534 }, { "epoch": 0.3885092381675525, "grad_norm": 0.14684267342090607, "learning_rate": 1.9416335167610793e-05, "loss": 0.5604, "step": 1535 }, { "epoch": 0.38876233864844345, "grad_norm": 0.1453063189983368, "learning_rate": 1.9415528873046434e-05, "loss": 0.5403, "step": 1536 }, { "epoch": 0.38901543912933434, "grad_norm": 0.15310315787792206, "learning_rate": 1.941472203870978e-05, "loss": 0.5437, "step": 1537 }, { "epoch": 0.3892685396102253, "grad_norm": 0.1449703723192215, "learning_rate": 1.941391466464708e-05, "loss": 0.5481, "step": 1538 }, { "epoch": 0.3895216400911162, "grad_norm": 0.1476258635520935, "learning_rate": 1.9413106750904623e-05, "loss": 0.5554, "step": 1539 }, { "epoch": 0.38977474057200706, "grad_norm": 0.14646273851394653, "learning_rate": 1.9412298297528727e-05, "loss": 0.5382, "step": 1540 }, { "epoch": 0.390027841052898, "grad_norm": 0.14629773795604706, "learning_rate": 1.9411489304565735e-05, "loss": 0.527, "step": 1541 }, { "epoch": 0.3902809415337889, "grad_norm": 0.14340394735336304, "learning_rate": 1.941067977206202e-05, "loss": 0.5303, "step": 1542 }, { "epoch": 0.39053404201467984, "grad_norm": 0.14393474161624908, "learning_rate": 1.9409869700064e-05, "loss": 0.5402, "step": 1543 }, { "epoch": 0.3907871424955707, "grad_norm": 0.14596255123615265, "learning_rate": 1.9409059088618106e-05, "loss": 0.543, "step": 1544 }, { "epoch": 0.39104024297646167, "grad_norm": 0.14661453664302826, "learning_rate": 1.9408247937770817e-05, "loss": 0.5639, "step": 1545 }, { "epoch": 0.39129334345735256, "grad_norm": 0.1494133323431015, "learning_rate": 1.9407436247568633e-05, "loss": 0.5322, "step": 1546 }, { "epoch": 0.3915464439382435, "grad_norm": 0.15496422350406647, "learning_rate": 1.940662401805808e-05, "loss": 0.5692, "step": 1547 }, { "epoch": 0.3917995444191344, "grad_norm": 0.14746661484241486, "learning_rate": 1.940581124928573e-05, "loss": 0.5359, "step": 1548 }, { "epoch": 0.39205264490002534, "grad_norm": 0.14533816277980804, "learning_rate": 1.940499794129817e-05, "loss": 0.5512, "step": 1549 }, { "epoch": 0.3923057453809162, "grad_norm": 0.14947715401649475, "learning_rate": 1.940418409414203e-05, "loss": 0.5743, "step": 1550 }, { "epoch": 0.3925588458618071, "grad_norm": 0.14406318962574005, "learning_rate": 1.940336970786396e-05, "loss": 0.5474, "step": 1551 }, { "epoch": 0.39281194634269806, "grad_norm": 0.146559938788414, "learning_rate": 1.9402554782510657e-05, "loss": 0.5486, "step": 1552 }, { "epoch": 0.39306504682358895, "grad_norm": 0.16369974613189697, "learning_rate": 1.9401739318128832e-05, "loss": 0.5568, "step": 1553 }, { "epoch": 0.3933181473044799, "grad_norm": 0.14416536688804626, "learning_rate": 1.9400923314765235e-05, "loss": 0.5214, "step": 1554 }, { "epoch": 0.3935712477853708, "grad_norm": 0.14466746151447296, "learning_rate": 1.9400106772466645e-05, "loss": 0.5392, "step": 1555 }, { "epoch": 0.3938243482662617, "grad_norm": 0.15031152963638306, "learning_rate": 1.9399289691279874e-05, "loss": 0.5702, "step": 1556 }, { "epoch": 0.3940774487471526, "grad_norm": 0.14659929275512695, "learning_rate": 1.9398472071251765e-05, "loss": 0.554, "step": 1557 }, { "epoch": 0.39433054922804356, "grad_norm": 0.14912793040275574, "learning_rate": 1.9397653912429187e-05, "loss": 0.5275, "step": 1558 }, { "epoch": 0.39458364970893445, "grad_norm": 0.14978481829166412, "learning_rate": 1.9396835214859044e-05, "loss": 0.5502, "step": 1559 }, { "epoch": 0.39483675018982534, "grad_norm": 0.1445114016532898, "learning_rate": 1.9396015978588273e-05, "loss": 0.5567, "step": 1560 }, { "epoch": 0.3950898506707163, "grad_norm": 0.1440342217683792, "learning_rate": 1.9395196203663837e-05, "loss": 0.5358, "step": 1561 }, { "epoch": 0.39534295115160717, "grad_norm": 0.14023324847221375, "learning_rate": 1.939437589013273e-05, "loss": 0.566, "step": 1562 }, { "epoch": 0.3955960516324981, "grad_norm": 0.14542604982852936, "learning_rate": 1.939355503804198e-05, "loss": 0.5286, "step": 1563 }, { "epoch": 0.395849152113389, "grad_norm": 0.1461302787065506, "learning_rate": 1.939273364743865e-05, "loss": 0.561, "step": 1564 }, { "epoch": 0.39610225259427995, "grad_norm": 0.1431075483560562, "learning_rate": 1.939191171836982e-05, "loss": 0.5478, "step": 1565 }, { "epoch": 0.39635535307517084, "grad_norm": 0.1537792831659317, "learning_rate": 1.939108925088262e-05, "loss": 0.5173, "step": 1566 }, { "epoch": 0.3966084535560618, "grad_norm": 0.1548088788986206, "learning_rate": 1.939026624502419e-05, "loss": 0.5566, "step": 1567 }, { "epoch": 0.39686155403695267, "grad_norm": 0.14292335510253906, "learning_rate": 1.9389442700841714e-05, "loss": 0.5571, "step": 1568 }, { "epoch": 0.39711465451784356, "grad_norm": 0.14941321313381195, "learning_rate": 1.9388618618382405e-05, "loss": 0.561, "step": 1569 }, { "epoch": 0.3973677549987345, "grad_norm": 0.14457207918167114, "learning_rate": 1.938779399769351e-05, "loss": 0.561, "step": 1570 }, { "epoch": 0.3976208554796254, "grad_norm": 0.14793317019939423, "learning_rate": 1.9386968838822296e-05, "loss": 0.5394, "step": 1571 }, { "epoch": 0.39787395596051633, "grad_norm": 0.14733612537384033, "learning_rate": 1.9386143141816075e-05, "loss": 0.5519, "step": 1572 }, { "epoch": 0.3981270564414072, "grad_norm": 0.14648863673210144, "learning_rate": 1.9385316906722173e-05, "loss": 0.5532, "step": 1573 }, { "epoch": 0.39838015692229817, "grad_norm": 0.140003502368927, "learning_rate": 1.938449013358796e-05, "loss": 0.5484, "step": 1574 }, { "epoch": 0.39863325740318906, "grad_norm": 0.14673306047916412, "learning_rate": 1.9383662822460838e-05, "loss": 0.5486, "step": 1575 }, { "epoch": 0.39888635788408, "grad_norm": 0.15231101214885712, "learning_rate": 1.938283497338823e-05, "loss": 0.562, "step": 1576 }, { "epoch": 0.3991394583649709, "grad_norm": 0.14271137118339539, "learning_rate": 1.9382006586417597e-05, "loss": 0.5364, "step": 1577 }, { "epoch": 0.39939255884586183, "grad_norm": 0.15283474326133728, "learning_rate": 1.9381177661596426e-05, "loss": 0.5377, "step": 1578 }, { "epoch": 0.3996456593267527, "grad_norm": 0.1438225507736206, "learning_rate": 1.938034819897224e-05, "loss": 0.5361, "step": 1579 }, { "epoch": 0.3998987598076436, "grad_norm": 0.14464469254016876, "learning_rate": 1.9379518198592593e-05, "loss": 0.549, "step": 1580 }, { "epoch": 0.40015186028853456, "grad_norm": 0.1797109991312027, "learning_rate": 1.9378687660505063e-05, "loss": 0.5717, "step": 1581 }, { "epoch": 0.40040496076942544, "grad_norm": 0.1502915769815445, "learning_rate": 1.9377856584757258e-05, "loss": 0.5535, "step": 1582 }, { "epoch": 0.4006580612503164, "grad_norm": 0.1443057805299759, "learning_rate": 1.9377024971396835e-05, "loss": 0.5183, "step": 1583 }, { "epoch": 0.4009111617312073, "grad_norm": 0.15263494849205017, "learning_rate": 1.937619282047146e-05, "loss": 0.576, "step": 1584 }, { "epoch": 0.4011642622120982, "grad_norm": 0.14717890322208405, "learning_rate": 1.9375360132028836e-05, "loss": 0.5739, "step": 1585 }, { "epoch": 0.4014173626929891, "grad_norm": 0.144087553024292, "learning_rate": 1.9374526906116707e-05, "loss": 0.5565, "step": 1586 }, { "epoch": 0.40167046317388005, "grad_norm": 0.14913515746593475, "learning_rate": 1.9373693142782834e-05, "loss": 0.5462, "step": 1587 }, { "epoch": 0.40192356365477094, "grad_norm": 0.14807447791099548, "learning_rate": 1.9372858842075017e-05, "loss": 0.5827, "step": 1588 }, { "epoch": 0.40217666413566183, "grad_norm": 0.1502460241317749, "learning_rate": 1.9372024004041085e-05, "loss": 0.5567, "step": 1589 }, { "epoch": 0.4024297646165528, "grad_norm": 0.14845815300941467, "learning_rate": 1.9371188628728896e-05, "loss": 0.5603, "step": 1590 }, { "epoch": 0.40268286509744367, "grad_norm": 0.150599405169487, "learning_rate": 1.9370352716186346e-05, "loss": 0.5804, "step": 1591 }, { "epoch": 0.4029359655783346, "grad_norm": 0.1461457461118698, "learning_rate": 1.9369516266461348e-05, "loss": 0.5178, "step": 1592 }, { "epoch": 0.4031890660592255, "grad_norm": 0.14655934274196625, "learning_rate": 1.9368679279601855e-05, "loss": 0.5434, "step": 1593 }, { "epoch": 0.40344216654011644, "grad_norm": 0.14532841742038727, "learning_rate": 1.9367841755655856e-05, "loss": 0.5416, "step": 1594 }, { "epoch": 0.40369526702100733, "grad_norm": 0.14369773864746094, "learning_rate": 1.936700369467136e-05, "loss": 0.5253, "step": 1595 }, { "epoch": 0.4039483675018983, "grad_norm": 0.1581760197877884, "learning_rate": 1.9366165096696412e-05, "loss": 0.5493, "step": 1596 }, { "epoch": 0.40420146798278916, "grad_norm": 0.14258648455142975, "learning_rate": 1.9365325961779085e-05, "loss": 0.5264, "step": 1597 }, { "epoch": 0.4044545684636801, "grad_norm": 0.15081873536109924, "learning_rate": 1.936448628996749e-05, "loss": 0.5568, "step": 1598 }, { "epoch": 0.404707668944571, "grad_norm": 0.14594605565071106, "learning_rate": 1.9363646081309757e-05, "loss": 0.5538, "step": 1599 }, { "epoch": 0.4049607694254619, "grad_norm": 0.1677783578634262, "learning_rate": 1.936280533585406e-05, "loss": 0.5693, "step": 1600 }, { "epoch": 0.40521386990635283, "grad_norm": 0.14986838400363922, "learning_rate": 1.9361964053648594e-05, "loss": 0.5563, "step": 1601 }, { "epoch": 0.4054669703872437, "grad_norm": 0.15021882951259613, "learning_rate": 1.9361122234741585e-05, "loss": 0.5725, "step": 1602 }, { "epoch": 0.40572007086813466, "grad_norm": 0.1499585509300232, "learning_rate": 1.9360279879181294e-05, "loss": 0.5709, "step": 1603 }, { "epoch": 0.40597317134902555, "grad_norm": 0.15348757803440094, "learning_rate": 1.9359436987016016e-05, "loss": 0.5275, "step": 1604 }, { "epoch": 0.4062262718299165, "grad_norm": 0.14585711061954498, "learning_rate": 1.935859355829407e-05, "loss": 0.5352, "step": 1605 }, { "epoch": 0.4064793723108074, "grad_norm": 0.14915527403354645, "learning_rate": 1.9357749593063806e-05, "loss": 0.542, "step": 1606 }, { "epoch": 0.40673247279169833, "grad_norm": 0.15031583607196808, "learning_rate": 1.9356905091373606e-05, "loss": 0.5603, "step": 1607 }, { "epoch": 0.4069855732725892, "grad_norm": 0.14817659556865692, "learning_rate": 1.9356060053271887e-05, "loss": 0.5525, "step": 1608 }, { "epoch": 0.4072386737534801, "grad_norm": 0.14792583882808685, "learning_rate": 1.935521447880709e-05, "loss": 0.5678, "step": 1609 }, { "epoch": 0.40749177423437105, "grad_norm": 0.14879105985164642, "learning_rate": 1.9354368368027696e-05, "loss": 0.5516, "step": 1610 }, { "epoch": 0.40774487471526194, "grad_norm": 0.1414695829153061, "learning_rate": 1.93535217209822e-05, "loss": 0.5196, "step": 1611 }, { "epoch": 0.4079979751961529, "grad_norm": 0.14334243535995483, "learning_rate": 1.9352674537719155e-05, "loss": 0.5462, "step": 1612 }, { "epoch": 0.4082510756770438, "grad_norm": 0.14682495594024658, "learning_rate": 1.9351826818287107e-05, "loss": 0.5489, "step": 1613 }, { "epoch": 0.4085041761579347, "grad_norm": 0.25355473160743713, "learning_rate": 1.935097856273467e-05, "loss": 0.5499, "step": 1614 }, { "epoch": 0.4087572766388256, "grad_norm": 0.14532774686813354, "learning_rate": 1.935012977111047e-05, "loss": 0.5304, "step": 1615 }, { "epoch": 0.40901037711971655, "grad_norm": 0.14377574622631073, "learning_rate": 1.9349280443463157e-05, "loss": 0.5354, "step": 1616 }, { "epoch": 0.40926347760060744, "grad_norm": 0.15197570621967316, "learning_rate": 1.9348430579841437e-05, "loss": 0.5301, "step": 1617 }, { "epoch": 0.40951657808149833, "grad_norm": 0.15150560438632965, "learning_rate": 1.9347580180294015e-05, "loss": 0.5514, "step": 1618 }, { "epoch": 0.4097696785623893, "grad_norm": 0.14334368705749512, "learning_rate": 1.9346729244869654e-05, "loss": 0.5402, "step": 1619 }, { "epoch": 0.41002277904328016, "grad_norm": 0.1479242444038391, "learning_rate": 1.934587777361713e-05, "loss": 0.55, "step": 1620 }, { "epoch": 0.4102758795241711, "grad_norm": 0.14709392189979553, "learning_rate": 1.9345025766585258e-05, "loss": 0.5662, "step": 1621 }, { "epoch": 0.410528980005062, "grad_norm": 0.14203697443008423, "learning_rate": 1.9344173223822883e-05, "loss": 0.5651, "step": 1622 }, { "epoch": 0.41078208048595294, "grad_norm": 0.15245656669139862, "learning_rate": 1.934332014537888e-05, "loss": 0.5257, "step": 1623 }, { "epoch": 0.4110351809668438, "grad_norm": 0.14380770921707153, "learning_rate": 1.9342466531302148e-05, "loss": 0.53, "step": 1624 }, { "epoch": 0.41128828144773477, "grad_norm": 0.14402315020561218, "learning_rate": 1.9341612381641632e-05, "loss": 0.547, "step": 1625 }, { "epoch": 0.41154138192862566, "grad_norm": 0.14973874390125275, "learning_rate": 1.934075769644629e-05, "loss": 0.5315, "step": 1626 }, { "epoch": 0.4117944824095166, "grad_norm": 0.1417977660894394, "learning_rate": 1.9339902475765125e-05, "loss": 0.543, "step": 1627 }, { "epoch": 0.4120475828904075, "grad_norm": 0.14770272374153137, "learning_rate": 1.9339046719647166e-05, "loss": 0.5412, "step": 1628 }, { "epoch": 0.4123006833712984, "grad_norm": 0.14769352972507477, "learning_rate": 1.9338190428141463e-05, "loss": 0.533, "step": 1629 }, { "epoch": 0.4125537838521893, "grad_norm": 0.14572639763355255, "learning_rate": 1.9337333601297118e-05, "loss": 0.5772, "step": 1630 }, { "epoch": 0.4128068843330802, "grad_norm": 0.14324599504470825, "learning_rate": 1.933647623916324e-05, "loss": 0.5535, "step": 1631 }, { "epoch": 0.41305998481397116, "grad_norm": 0.14937616884708405, "learning_rate": 1.9335618341788983e-05, "loss": 0.5634, "step": 1632 }, { "epoch": 0.41331308529486205, "grad_norm": 0.14965523779392242, "learning_rate": 1.9334759909223534e-05, "loss": 0.5319, "step": 1633 }, { "epoch": 0.413566185775753, "grad_norm": 0.1447480022907257, "learning_rate": 1.93339009415161e-05, "loss": 0.5188, "step": 1634 }, { "epoch": 0.4138192862566439, "grad_norm": 0.14657054841518402, "learning_rate": 1.933304143871592e-05, "loss": 0.545, "step": 1635 }, { "epoch": 0.4140723867375348, "grad_norm": 0.15385763347148895, "learning_rate": 1.9332181400872273e-05, "loss": 0.5667, "step": 1636 }, { "epoch": 0.4143254872184257, "grad_norm": 0.14401037991046906, "learning_rate": 1.9331320828034466e-05, "loss": 0.5383, "step": 1637 }, { "epoch": 0.4145785876993166, "grad_norm": 0.15646244585514069, "learning_rate": 1.933045972025183e-05, "loss": 0.5537, "step": 1638 }, { "epoch": 0.41483168818020755, "grad_norm": 0.15087704360485077, "learning_rate": 1.9329598077573727e-05, "loss": 0.543, "step": 1639 }, { "epoch": 0.41508478866109844, "grad_norm": 0.1671897917985916, "learning_rate": 1.932873590004956e-05, "loss": 0.5443, "step": 1640 }, { "epoch": 0.4153378891419894, "grad_norm": 0.1444534808397293, "learning_rate": 1.9327873187728747e-05, "loss": 0.5732, "step": 1641 }, { "epoch": 0.41559098962288027, "grad_norm": 0.1481046974658966, "learning_rate": 1.9327009940660755e-05, "loss": 0.5425, "step": 1642 }, { "epoch": 0.4158440901037712, "grad_norm": 0.19806204736232758, "learning_rate": 1.9326146158895067e-05, "loss": 0.5439, "step": 1643 }, { "epoch": 0.4160971905846621, "grad_norm": 0.1478814035654068, "learning_rate": 1.9325281842481206e-05, "loss": 0.5357, "step": 1644 }, { "epoch": 0.41635029106555305, "grad_norm": 0.15157166123390198, "learning_rate": 1.9324416991468712e-05, "loss": 0.5549, "step": 1645 }, { "epoch": 0.41660339154644394, "grad_norm": 0.15019840002059937, "learning_rate": 1.9323551605907175e-05, "loss": 0.5526, "step": 1646 }, { "epoch": 0.4168564920273349, "grad_norm": 0.1437724530696869, "learning_rate": 1.9322685685846202e-05, "loss": 0.5212, "step": 1647 }, { "epoch": 0.41710959250822577, "grad_norm": 0.15244163572788239, "learning_rate": 1.9321819231335435e-05, "loss": 0.5564, "step": 1648 }, { "epoch": 0.41736269298911666, "grad_norm": 0.15348710119724274, "learning_rate": 1.9320952242424542e-05, "loss": 0.5519, "step": 1649 }, { "epoch": 0.4176157934700076, "grad_norm": 0.14565591514110565, "learning_rate": 1.9320084719163232e-05, "loss": 0.5734, "step": 1650 }, { "epoch": 0.4178688939508985, "grad_norm": 0.15512077510356903, "learning_rate": 1.9319216661601233e-05, "loss": 0.5552, "step": 1651 }, { "epoch": 0.41812199443178943, "grad_norm": 0.15159796178340912, "learning_rate": 1.9318348069788314e-05, "loss": 0.5572, "step": 1652 }, { "epoch": 0.4183750949126803, "grad_norm": 0.1493181437253952, "learning_rate": 1.9317478943774263e-05, "loss": 0.553, "step": 1653 }, { "epoch": 0.41862819539357127, "grad_norm": 0.14635220170021057, "learning_rate": 1.931660928360891e-05, "loss": 0.521, "step": 1654 }, { "epoch": 0.41888129587446216, "grad_norm": 0.14356295764446259, "learning_rate": 1.931573908934211e-05, "loss": 0.5318, "step": 1655 }, { "epoch": 0.4191343963553531, "grad_norm": 0.1549730747938156, "learning_rate": 1.931486836102375e-05, "loss": 0.5262, "step": 1656 }, { "epoch": 0.419387496836244, "grad_norm": 0.1705036610364914, "learning_rate": 1.9313997098703747e-05, "loss": 0.5646, "step": 1657 }, { "epoch": 0.4196405973171349, "grad_norm": 0.14875201880931854, "learning_rate": 1.9313125302432045e-05, "loss": 0.5448, "step": 1658 }, { "epoch": 0.4198936977980258, "grad_norm": 0.16302737593650818, "learning_rate": 1.9312252972258624e-05, "loss": 0.5347, "step": 1659 }, { "epoch": 0.4201467982789167, "grad_norm": 0.1432584524154663, "learning_rate": 1.9311380108233495e-05, "loss": 0.533, "step": 1660 }, { "epoch": 0.42039989875980766, "grad_norm": 0.15105532109737396, "learning_rate": 1.9310506710406696e-05, "loss": 0.5633, "step": 1661 }, { "epoch": 0.42065299924069854, "grad_norm": 0.17110736668109894, "learning_rate": 1.93096327788283e-05, "loss": 0.5322, "step": 1662 }, { "epoch": 0.4209060997215895, "grad_norm": 0.1431974321603775, "learning_rate": 1.9308758313548397e-05, "loss": 0.5432, "step": 1663 }, { "epoch": 0.4211592002024804, "grad_norm": 0.14605098962783813, "learning_rate": 1.9307883314617136e-05, "loss": 0.5548, "step": 1664 }, { "epoch": 0.4214123006833713, "grad_norm": 0.19013462960720062, "learning_rate": 1.9307007782084662e-05, "loss": 0.5568, "step": 1665 }, { "epoch": 0.4216654011642622, "grad_norm": 0.15632565319538116, "learning_rate": 1.9306131716001175e-05, "loss": 0.5494, "step": 1666 }, { "epoch": 0.4219185016451531, "grad_norm": 0.1458219289779663, "learning_rate": 1.93052551164169e-05, "loss": 0.5526, "step": 1667 }, { "epoch": 0.42217160212604404, "grad_norm": 0.14812980592250824, "learning_rate": 1.9304377983382085e-05, "loss": 0.5425, "step": 1668 }, { "epoch": 0.42242470260693493, "grad_norm": 0.14942355453968048, "learning_rate": 1.930350031694702e-05, "loss": 0.5632, "step": 1669 }, { "epoch": 0.4226778030878259, "grad_norm": 0.1527610868215561, "learning_rate": 1.9302622117162015e-05, "loss": 0.5406, "step": 1670 }, { "epoch": 0.42293090356871677, "grad_norm": 0.15162545442581177, "learning_rate": 1.9301743384077416e-05, "loss": 0.5474, "step": 1671 }, { "epoch": 0.4231840040496077, "grad_norm": 0.1516416221857071, "learning_rate": 1.9300864117743602e-05, "loss": 0.5354, "step": 1672 }, { "epoch": 0.4234371045304986, "grad_norm": 0.14847074449062347, "learning_rate": 1.9299984318210977e-05, "loss": 0.5454, "step": 1673 }, { "epoch": 0.42369020501138954, "grad_norm": 0.15890692174434662, "learning_rate": 1.9299103985529977e-05, "loss": 0.5702, "step": 1674 }, { "epoch": 0.42394330549228043, "grad_norm": 0.15740667283535004, "learning_rate": 1.9298223119751076e-05, "loss": 0.5293, "step": 1675 }, { "epoch": 0.4241964059731714, "grad_norm": 0.15397818386554718, "learning_rate": 1.9297341720924762e-05, "loss": 0.5496, "step": 1676 }, { "epoch": 0.42444950645406226, "grad_norm": 0.15494826436042786, "learning_rate": 1.9296459789101574e-05, "loss": 0.5804, "step": 1677 }, { "epoch": 0.42470260693495315, "grad_norm": 0.14730721712112427, "learning_rate": 1.9295577324332062e-05, "loss": 0.5443, "step": 1678 }, { "epoch": 0.4249557074158441, "grad_norm": 0.1446990668773651, "learning_rate": 1.929469432666682e-05, "loss": 0.5542, "step": 1679 }, { "epoch": 0.425208807896735, "grad_norm": 0.14975771307945251, "learning_rate": 1.9293810796156475e-05, "loss": 0.5316, "step": 1680 }, { "epoch": 0.42546190837762593, "grad_norm": 0.14780595898628235, "learning_rate": 1.9292926732851663e-05, "loss": 0.5337, "step": 1681 }, { "epoch": 0.4257150088585168, "grad_norm": 0.14161317050457, "learning_rate": 1.929204213680308e-05, "loss": 0.517, "step": 1682 }, { "epoch": 0.42596810933940776, "grad_norm": 0.14389091730117798, "learning_rate": 1.929115700806143e-05, "loss": 0.5502, "step": 1683 }, { "epoch": 0.42622120982029865, "grad_norm": 0.1470528095960617, "learning_rate": 1.9290271346677458e-05, "loss": 0.5359, "step": 1684 }, { "epoch": 0.4264743103011896, "grad_norm": 0.1421509087085724, "learning_rate": 1.9289385152701935e-05, "loss": 0.5495, "step": 1685 }, { "epoch": 0.4267274107820805, "grad_norm": 0.1501965969800949, "learning_rate": 1.9288498426185665e-05, "loss": 0.5387, "step": 1686 }, { "epoch": 0.4269805112629714, "grad_norm": 0.14985966682434082, "learning_rate": 1.9287611167179484e-05, "loss": 0.5687, "step": 1687 }, { "epoch": 0.4272336117438623, "grad_norm": 0.14769534766674042, "learning_rate": 1.928672337573426e-05, "loss": 0.5509, "step": 1688 }, { "epoch": 0.4274867122247532, "grad_norm": 0.14639385044574738, "learning_rate": 1.9285835051900883e-05, "loss": 0.5462, "step": 1689 }, { "epoch": 0.42773981270564415, "grad_norm": 0.14832186698913574, "learning_rate": 1.9284946195730278e-05, "loss": 0.5421, "step": 1690 }, { "epoch": 0.42799291318653504, "grad_norm": 0.14823070168495178, "learning_rate": 1.9284056807273404e-05, "loss": 0.5426, "step": 1691 }, { "epoch": 0.428246013667426, "grad_norm": 0.14643365144729614, "learning_rate": 1.9283166886581247e-05, "loss": 0.5626, "step": 1692 }, { "epoch": 0.4284991141483169, "grad_norm": 0.1499713808298111, "learning_rate": 1.9282276433704824e-05, "loss": 0.5682, "step": 1693 }, { "epoch": 0.4287522146292078, "grad_norm": 0.1642482578754425, "learning_rate": 1.9281385448695182e-05, "loss": 0.5255, "step": 1694 }, { "epoch": 0.4290053151100987, "grad_norm": 0.14676684141159058, "learning_rate": 1.9280493931603404e-05, "loss": 0.5685, "step": 1695 }, { "epoch": 0.42925841559098965, "grad_norm": 0.1522381454706192, "learning_rate": 1.9279601882480592e-05, "loss": 0.5644, "step": 1696 }, { "epoch": 0.42951151607188054, "grad_norm": 0.14444655179977417, "learning_rate": 1.927870930137789e-05, "loss": 0.5436, "step": 1697 }, { "epoch": 0.42976461655277143, "grad_norm": 0.14501915872097015, "learning_rate": 1.9277816188346464e-05, "loss": 0.5218, "step": 1698 }, { "epoch": 0.43001771703366237, "grad_norm": 0.14758257567882538, "learning_rate": 1.9276922543437516e-05, "loss": 0.5536, "step": 1699 }, { "epoch": 0.43027081751455326, "grad_norm": 0.150035098195076, "learning_rate": 1.927602836670228e-05, "loss": 0.5363, "step": 1700 }, { "epoch": 0.4305239179954442, "grad_norm": 0.13837407529354095, "learning_rate": 1.9275133658192015e-05, "loss": 0.5222, "step": 1701 }, { "epoch": 0.4307770184763351, "grad_norm": 0.15080954134464264, "learning_rate": 1.927423841795801e-05, "loss": 0.5429, "step": 1702 }, { "epoch": 0.43103011895722604, "grad_norm": 0.15365765988826752, "learning_rate": 1.927334264605159e-05, "loss": 0.5326, "step": 1703 }, { "epoch": 0.4312832194381169, "grad_norm": 0.159660205245018, "learning_rate": 1.9272446342524106e-05, "loss": 0.5433, "step": 1704 }, { "epoch": 0.43153631991900787, "grad_norm": 0.15141738951206207, "learning_rate": 1.9271549507426943e-05, "loss": 0.5516, "step": 1705 }, { "epoch": 0.43178942039989876, "grad_norm": 0.1540350466966629, "learning_rate": 1.9270652140811513e-05, "loss": 0.5618, "step": 1706 }, { "epoch": 0.43204252088078965, "grad_norm": 0.14833320677280426, "learning_rate": 1.9269754242729265e-05, "loss": 0.5541, "step": 1707 }, { "epoch": 0.4322956213616806, "grad_norm": 0.1516059786081314, "learning_rate": 1.9268855813231665e-05, "loss": 0.5626, "step": 1708 }, { "epoch": 0.4325487218425715, "grad_norm": 0.1475950926542282, "learning_rate": 1.9267956852370226e-05, "loss": 0.5293, "step": 1709 }, { "epoch": 0.4328018223234624, "grad_norm": 0.1647576540708542, "learning_rate": 1.9267057360196478e-05, "loss": 0.5369, "step": 1710 }, { "epoch": 0.4330549228043533, "grad_norm": 0.14087119698524475, "learning_rate": 1.926615733676199e-05, "loss": 0.5426, "step": 1711 }, { "epoch": 0.43330802328524426, "grad_norm": 0.1437087506055832, "learning_rate": 1.926525678211836e-05, "loss": 0.5482, "step": 1712 }, { "epoch": 0.43356112376613515, "grad_norm": 0.17049936950206757, "learning_rate": 1.926435569631721e-05, "loss": 0.5395, "step": 1713 }, { "epoch": 0.4338142242470261, "grad_norm": 0.16222238540649414, "learning_rate": 1.92634540794102e-05, "loss": 0.568, "step": 1714 }, { "epoch": 0.434067324727917, "grad_norm": 0.14295589923858643, "learning_rate": 1.926255193144902e-05, "loss": 0.5175, "step": 1715 }, { "epoch": 0.43432042520880787, "grad_norm": 0.1490984857082367, "learning_rate": 1.9261649252485383e-05, "loss": 0.5408, "step": 1716 }, { "epoch": 0.4345735256896988, "grad_norm": 0.15359872579574585, "learning_rate": 1.9260746042571038e-05, "loss": 0.5514, "step": 1717 }, { "epoch": 0.4348266261705897, "grad_norm": 0.14625461399555206, "learning_rate": 1.925984230175777e-05, "loss": 0.5183, "step": 1718 }, { "epoch": 0.43507972665148065, "grad_norm": 0.1523420363664627, "learning_rate": 1.9258938030097388e-05, "loss": 0.5517, "step": 1719 }, { "epoch": 0.43533282713237154, "grad_norm": 0.14779441058635712, "learning_rate": 1.9258033227641725e-05, "loss": 0.5505, "step": 1720 }, { "epoch": 0.4355859276132625, "grad_norm": 0.14362023770809174, "learning_rate": 1.9257127894442658e-05, "loss": 0.5055, "step": 1721 }, { "epoch": 0.43583902809415337, "grad_norm": 0.14773982763290405, "learning_rate": 1.9256222030552086e-05, "loss": 0.5672, "step": 1722 }, { "epoch": 0.4360921285750443, "grad_norm": 0.14201810956001282, "learning_rate": 1.9255315636021935e-05, "loss": 0.5279, "step": 1723 }, { "epoch": 0.4363452290559352, "grad_norm": 0.1453840434551239, "learning_rate": 1.9254408710904177e-05, "loss": 0.535, "step": 1724 }, { "epoch": 0.43659832953682615, "grad_norm": 0.15022654831409454, "learning_rate": 1.9253501255250794e-05, "loss": 0.5481, "step": 1725 }, { "epoch": 0.43685143001771704, "grad_norm": 0.14555461704730988, "learning_rate": 1.9252593269113816e-05, "loss": 0.5605, "step": 1726 }, { "epoch": 0.4371045304986079, "grad_norm": 0.16268178820610046, "learning_rate": 1.925168475254529e-05, "loss": 0.5463, "step": 1727 }, { "epoch": 0.43735763097949887, "grad_norm": 0.1464332938194275, "learning_rate": 1.9250775705597307e-05, "loss": 0.5602, "step": 1728 }, { "epoch": 0.43761073146038976, "grad_norm": 0.1467064619064331, "learning_rate": 1.9249866128321972e-05, "loss": 0.5525, "step": 1729 }, { "epoch": 0.4378638319412807, "grad_norm": 0.17261680960655212, "learning_rate": 1.9248956020771434e-05, "loss": 0.5433, "step": 1730 }, { "epoch": 0.4381169324221716, "grad_norm": 0.15159925818443298, "learning_rate": 1.9248045382997866e-05, "loss": 0.5539, "step": 1731 }, { "epoch": 0.43837003290306253, "grad_norm": 0.1457923799753189, "learning_rate": 1.9247134215053477e-05, "loss": 0.549, "step": 1732 }, { "epoch": 0.4386231333839534, "grad_norm": 0.14411090314388275, "learning_rate": 1.9246222516990495e-05, "loss": 0.5325, "step": 1733 }, { "epoch": 0.43887623386484437, "grad_norm": 0.1534961760044098, "learning_rate": 1.924531028886119e-05, "loss": 0.5479, "step": 1734 }, { "epoch": 0.43912933434573526, "grad_norm": 0.1493685394525528, "learning_rate": 1.924439753071786e-05, "loss": 0.5513, "step": 1735 }, { "epoch": 0.43938243482662614, "grad_norm": 0.14418727159500122, "learning_rate": 1.9243484242612827e-05, "loss": 0.5372, "step": 1736 }, { "epoch": 0.4396355353075171, "grad_norm": 0.14740322530269623, "learning_rate": 1.924257042459845e-05, "loss": 0.5319, "step": 1737 }, { "epoch": 0.439888635788408, "grad_norm": 0.14894923567771912, "learning_rate": 1.9241656076727116e-05, "loss": 0.5773, "step": 1738 }, { "epoch": 0.4401417362692989, "grad_norm": 0.14886252582073212, "learning_rate": 1.9240741199051244e-05, "loss": 0.5588, "step": 1739 }, { "epoch": 0.4403948367501898, "grad_norm": 0.14675700664520264, "learning_rate": 1.923982579162328e-05, "loss": 0.5579, "step": 1740 }, { "epoch": 0.44064793723108076, "grad_norm": 0.15053215622901917, "learning_rate": 1.9238909854495705e-05, "loss": 0.5507, "step": 1741 }, { "epoch": 0.44090103771197164, "grad_norm": 0.1464221179485321, "learning_rate": 1.9237993387721026e-05, "loss": 0.5213, "step": 1742 }, { "epoch": 0.4411541381928626, "grad_norm": 0.15776830911636353, "learning_rate": 1.923707639135178e-05, "loss": 0.5412, "step": 1743 }, { "epoch": 0.4414072386737535, "grad_norm": 0.14986103773117065, "learning_rate": 1.923615886544054e-05, "loss": 0.5156, "step": 1744 }, { "epoch": 0.4416603391546444, "grad_norm": 0.16009564697742462, "learning_rate": 1.9235240810039903e-05, "loss": 0.529, "step": 1745 }, { "epoch": 0.4419134396355353, "grad_norm": 0.1472238302230835, "learning_rate": 1.9234322225202503e-05, "loss": 0.5621, "step": 1746 }, { "epoch": 0.4421665401164262, "grad_norm": 0.1461733728647232, "learning_rate": 1.9233403110980997e-05, "loss": 0.5306, "step": 1747 }, { "epoch": 0.44241964059731714, "grad_norm": 0.15014013648033142, "learning_rate": 1.9232483467428077e-05, "loss": 0.5384, "step": 1748 }, { "epoch": 0.44267274107820803, "grad_norm": 0.14777792990207672, "learning_rate": 1.9231563294596466e-05, "loss": 0.5613, "step": 1749 }, { "epoch": 0.442925841559099, "grad_norm": 0.15301355719566345, "learning_rate": 1.923064259253891e-05, "loss": 0.5542, "step": 1750 }, { "epoch": 0.44317894203998986, "grad_norm": 0.16541852056980133, "learning_rate": 1.9229721361308196e-05, "loss": 0.5659, "step": 1751 }, { "epoch": 0.4434320425208808, "grad_norm": 0.15273579955101013, "learning_rate": 1.9228799600957132e-05, "loss": 0.5443, "step": 1752 }, { "epoch": 0.4436851430017717, "grad_norm": 0.14688630402088165, "learning_rate": 1.922787731153857e-05, "loss": 0.569, "step": 1753 }, { "epoch": 0.44393824348266264, "grad_norm": 0.14831644296646118, "learning_rate": 1.922695449310537e-05, "loss": 0.5373, "step": 1754 }, { "epoch": 0.44419134396355353, "grad_norm": 0.1455741822719574, "learning_rate": 1.9226031145710443e-05, "loss": 0.5579, "step": 1755 }, { "epoch": 0.4444444444444444, "grad_norm": 0.1440512090921402, "learning_rate": 1.9225107269406726e-05, "loss": 0.539, "step": 1756 }, { "epoch": 0.44469754492533536, "grad_norm": 0.15093894302845, "learning_rate": 1.9224182864247173e-05, "loss": 0.5554, "step": 1757 }, { "epoch": 0.44495064540622625, "grad_norm": 0.14180652797222137, "learning_rate": 1.9223257930284785e-05, "loss": 0.5351, "step": 1758 }, { "epoch": 0.4452037458871172, "grad_norm": 0.16017498075962067, "learning_rate": 1.9222332467572583e-05, "loss": 0.5538, "step": 1759 }, { "epoch": 0.4454568463680081, "grad_norm": 0.14740543067455292, "learning_rate": 1.9221406476163625e-05, "loss": 0.5569, "step": 1760 }, { "epoch": 0.44570994684889903, "grad_norm": 0.14455154538154602, "learning_rate": 1.9220479956110997e-05, "loss": 0.5332, "step": 1761 }, { "epoch": 0.4459630473297899, "grad_norm": 0.14225231111049652, "learning_rate": 1.9219552907467806e-05, "loss": 0.552, "step": 1762 }, { "epoch": 0.44621614781068086, "grad_norm": 0.1488070785999298, "learning_rate": 1.9218625330287207e-05, "loss": 0.5376, "step": 1763 }, { "epoch": 0.44646924829157175, "grad_norm": 0.1501726508140564, "learning_rate": 1.9217697224622373e-05, "loss": 0.5524, "step": 1764 }, { "epoch": 0.44672234877246264, "grad_norm": 0.14537248015403748, "learning_rate": 1.921676859052651e-05, "loss": 0.5589, "step": 1765 }, { "epoch": 0.4469754492533536, "grad_norm": 0.1483267843723297, "learning_rate": 1.9215839428052853e-05, "loss": 0.5697, "step": 1766 }, { "epoch": 0.4472285497342445, "grad_norm": 0.15355317294597626, "learning_rate": 1.921490973725467e-05, "loss": 0.5543, "step": 1767 }, { "epoch": 0.4474816502151354, "grad_norm": 0.15834340453147888, "learning_rate": 1.9213979518185265e-05, "loss": 0.5474, "step": 1768 }, { "epoch": 0.4477347506960263, "grad_norm": 0.14593762159347534, "learning_rate": 1.921304877089795e-05, "loss": 0.5595, "step": 1769 }, { "epoch": 0.44798785117691725, "grad_norm": 0.1434435397386551, "learning_rate": 1.92121174954461e-05, "loss": 0.5335, "step": 1770 }, { "epoch": 0.44824095165780814, "grad_norm": 0.19594144821166992, "learning_rate": 1.9211185691883094e-05, "loss": 0.5765, "step": 1771 }, { "epoch": 0.4484940521386991, "grad_norm": 0.14419157803058624, "learning_rate": 1.9210253360262346e-05, "loss": 0.533, "step": 1772 }, { "epoch": 0.44874715261959, "grad_norm": 0.1428258717060089, "learning_rate": 1.9209320500637316e-05, "loss": 0.5276, "step": 1773 }, { "epoch": 0.4490002531004809, "grad_norm": 0.1456083357334137, "learning_rate": 1.9208387113061475e-05, "loss": 0.5512, "step": 1774 }, { "epoch": 0.4492533535813718, "grad_norm": 0.14498069882392883, "learning_rate": 1.9207453197588335e-05, "loss": 0.5324, "step": 1775 }, { "epoch": 0.4495064540622627, "grad_norm": 0.14620797336101532, "learning_rate": 1.9206518754271434e-05, "loss": 0.5346, "step": 1776 }, { "epoch": 0.44975955454315364, "grad_norm": 0.1486203372478485, "learning_rate": 1.920558378316434e-05, "loss": 0.5358, "step": 1777 }, { "epoch": 0.45001265502404453, "grad_norm": 0.17899760603904724, "learning_rate": 1.9204648284320656e-05, "loss": 0.5691, "step": 1778 }, { "epoch": 0.45026575550493547, "grad_norm": 0.15098373591899872, "learning_rate": 1.9203712257794013e-05, "loss": 0.559, "step": 1779 }, { "epoch": 0.45051885598582636, "grad_norm": 0.14738845825195312, "learning_rate": 1.9202775703638066e-05, "loss": 0.5347, "step": 1780 }, { "epoch": 0.4507719564667173, "grad_norm": 0.1463220864534378, "learning_rate": 1.920183862190651e-05, "loss": 0.5438, "step": 1781 }, { "epoch": 0.4510250569476082, "grad_norm": 0.14913417398929596, "learning_rate": 1.9200901012653067e-05, "loss": 0.5127, "step": 1782 }, { "epoch": 0.45127815742849914, "grad_norm": 0.143700510263443, "learning_rate": 1.9199962875931487e-05, "loss": 0.5456, "step": 1783 }, { "epoch": 0.45153125790939, "grad_norm": 0.14566923677921295, "learning_rate": 1.919902421179555e-05, "loss": 0.5535, "step": 1784 }, { "epoch": 0.4517843583902809, "grad_norm": 0.16239500045776367, "learning_rate": 1.9198085020299067e-05, "loss": 0.5576, "step": 1785 }, { "epoch": 0.45203745887117186, "grad_norm": 0.1463373899459839, "learning_rate": 1.919714530149588e-05, "loss": 0.5507, "step": 1786 }, { "epoch": 0.45229055935206275, "grad_norm": 0.15224839746952057, "learning_rate": 1.9196205055439866e-05, "loss": 0.53, "step": 1787 }, { "epoch": 0.4525436598329537, "grad_norm": 0.18018022179603577, "learning_rate": 1.9195264282184924e-05, "loss": 0.5549, "step": 1788 }, { "epoch": 0.4527967603138446, "grad_norm": 0.14686517417430878, "learning_rate": 1.919432298178498e-05, "loss": 0.5316, "step": 1789 }, { "epoch": 0.4530498607947355, "grad_norm": 0.15180079638957977, "learning_rate": 1.9193381154294007e-05, "loss": 0.5711, "step": 1790 }, { "epoch": 0.4533029612756264, "grad_norm": 0.14719291031360626, "learning_rate": 1.9192438799765995e-05, "loss": 0.5459, "step": 1791 }, { "epoch": 0.45355606175651736, "grad_norm": 0.1472300887107849, "learning_rate": 1.9191495918254964e-05, "loss": 0.5458, "step": 1792 }, { "epoch": 0.45380916223740825, "grad_norm": 0.14407874643802643, "learning_rate": 1.919055250981497e-05, "loss": 0.5088, "step": 1793 }, { "epoch": 0.4540622627182992, "grad_norm": 0.16175606846809387, "learning_rate": 1.9189608574500095e-05, "loss": 0.5237, "step": 1794 }, { "epoch": 0.4543153631991901, "grad_norm": 0.14462268352508545, "learning_rate": 1.9188664112364456e-05, "loss": 0.5458, "step": 1795 }, { "epoch": 0.45456846368008097, "grad_norm": 0.14778609573841095, "learning_rate": 1.9187719123462192e-05, "loss": 0.5306, "step": 1796 }, { "epoch": 0.4548215641609719, "grad_norm": 0.14949549734592438, "learning_rate": 1.9186773607847482e-05, "loss": 0.5617, "step": 1797 }, { "epoch": 0.4550746646418628, "grad_norm": 0.19620174169540405, "learning_rate": 1.9185827565574526e-05, "loss": 0.5321, "step": 1798 }, { "epoch": 0.45532776512275375, "grad_norm": 0.1574760526418686, "learning_rate": 1.9184880996697562e-05, "loss": 0.5657, "step": 1799 }, { "epoch": 0.45558086560364464, "grad_norm": 0.14064092934131622, "learning_rate": 1.9183933901270857e-05, "loss": 0.5031, "step": 1800 }, { "epoch": 0.4558339660845356, "grad_norm": 0.14611414074897766, "learning_rate": 1.9182986279348702e-05, "loss": 0.5557, "step": 1801 }, { "epoch": 0.45608706656542647, "grad_norm": 0.1446884572505951, "learning_rate": 1.918203813098542e-05, "loss": 0.5143, "step": 1802 }, { "epoch": 0.4563401670463174, "grad_norm": 0.14710043370723724, "learning_rate": 1.9181089456235373e-05, "loss": 0.5441, "step": 1803 }, { "epoch": 0.4565932675272083, "grad_norm": 0.14978434145450592, "learning_rate": 1.918014025515294e-05, "loss": 0.5581, "step": 1804 }, { "epoch": 0.4568463680080992, "grad_norm": 0.1453111171722412, "learning_rate": 1.917919052779254e-05, "loss": 0.5704, "step": 1805 }, { "epoch": 0.45709946848899013, "grad_norm": 0.1428987681865692, "learning_rate": 1.9178240274208616e-05, "loss": 0.5422, "step": 1806 }, { "epoch": 0.457352568969881, "grad_norm": 0.14089812338352203, "learning_rate": 1.917728949445565e-05, "loss": 0.5229, "step": 1807 }, { "epoch": 0.45760566945077197, "grad_norm": 0.15608355402946472, "learning_rate": 1.917633818858814e-05, "loss": 0.5376, "step": 1808 }, { "epoch": 0.45785876993166286, "grad_norm": 0.14855574071407318, "learning_rate": 1.9175386356660632e-05, "loss": 0.5576, "step": 1809 }, { "epoch": 0.4581118704125538, "grad_norm": 0.15267117321491241, "learning_rate": 1.917443399872769e-05, "loss": 0.5298, "step": 1810 }, { "epoch": 0.4583649708934447, "grad_norm": 0.14579527080059052, "learning_rate": 1.9173481114843898e-05, "loss": 0.5365, "step": 1811 }, { "epoch": 0.45861807137433563, "grad_norm": 0.15329217910766602, "learning_rate": 1.91725277050639e-05, "loss": 0.5436, "step": 1812 }, { "epoch": 0.4588711718552265, "grad_norm": 0.14536508917808533, "learning_rate": 1.9171573769442348e-05, "loss": 0.5544, "step": 1813 }, { "epoch": 0.4591242723361174, "grad_norm": 0.147845059633255, "learning_rate": 1.917061930803392e-05, "loss": 0.528, "step": 1814 }, { "epoch": 0.45937737281700836, "grad_norm": 0.14528067409992218, "learning_rate": 1.9169664320893345e-05, "loss": 0.5697, "step": 1815 }, { "epoch": 0.45963047329789924, "grad_norm": 0.15772676467895508, "learning_rate": 1.916870880807536e-05, "loss": 0.5549, "step": 1816 }, { "epoch": 0.4598835737787902, "grad_norm": 0.1482834368944168, "learning_rate": 1.9167752769634754e-05, "loss": 0.5628, "step": 1817 }, { "epoch": 0.4601366742596811, "grad_norm": 0.1466667354106903, "learning_rate": 1.9166796205626328e-05, "loss": 0.5327, "step": 1818 }, { "epoch": 0.460389774740572, "grad_norm": 0.14505578577518463, "learning_rate": 1.916583911610492e-05, "loss": 0.5499, "step": 1819 }, { "epoch": 0.4606428752214629, "grad_norm": 0.1450030505657196, "learning_rate": 1.9164881501125398e-05, "loss": 0.5536, "step": 1820 }, { "epoch": 0.46089597570235386, "grad_norm": 0.1456303894519806, "learning_rate": 1.916392336074266e-05, "loss": 0.5356, "step": 1821 }, { "epoch": 0.46114907618324474, "grad_norm": 0.1493786871433258, "learning_rate": 1.9162964695011635e-05, "loss": 0.5576, "step": 1822 }, { "epoch": 0.4614021766641357, "grad_norm": 0.14639952778816223, "learning_rate": 1.916200550398728e-05, "loss": 0.5622, "step": 1823 }, { "epoch": 0.4616552771450266, "grad_norm": 0.14597994089126587, "learning_rate": 1.916104578772459e-05, "loss": 0.5695, "step": 1824 }, { "epoch": 0.46190837762591747, "grad_norm": 0.15567152202129364, "learning_rate": 1.916008554627857e-05, "loss": 0.5363, "step": 1825 }, { "epoch": 0.4621614781068084, "grad_norm": 0.14252327382564545, "learning_rate": 1.915912477970428e-05, "loss": 0.5514, "step": 1826 }, { "epoch": 0.4624145785876993, "grad_norm": 0.14835482835769653, "learning_rate": 1.9158163488056794e-05, "loss": 0.5464, "step": 1827 }, { "epoch": 0.46266767906859024, "grad_norm": 0.1697893738746643, "learning_rate": 1.9157201671391222e-05, "loss": 0.5105, "step": 1828 }, { "epoch": 0.46292077954948113, "grad_norm": 0.14517392218112946, "learning_rate": 1.91562393297627e-05, "loss": 0.5348, "step": 1829 }, { "epoch": 0.4631738800303721, "grad_norm": 0.1532156765460968, "learning_rate": 1.9155276463226405e-05, "loss": 0.5373, "step": 1830 }, { "epoch": 0.46342698051126296, "grad_norm": 0.15020602941513062, "learning_rate": 1.915431307183753e-05, "loss": 0.5449, "step": 1831 }, { "epoch": 0.4636800809921539, "grad_norm": 0.15006892383098602, "learning_rate": 1.9153349155651305e-05, "loss": 0.5433, "step": 1832 }, { "epoch": 0.4639331814730448, "grad_norm": 0.15749923884868622, "learning_rate": 1.915238471472299e-05, "loss": 0.5485, "step": 1833 }, { "epoch": 0.4641862819539357, "grad_norm": 0.1460724174976349, "learning_rate": 1.915141974910787e-05, "loss": 0.5582, "step": 1834 }, { "epoch": 0.46443938243482663, "grad_norm": 0.15296821296215057, "learning_rate": 1.915045425886127e-05, "loss": 0.5241, "step": 1835 }, { "epoch": 0.4646924829157175, "grad_norm": 0.1486324518918991, "learning_rate": 1.914948824403854e-05, "loss": 0.542, "step": 1836 }, { "epoch": 0.46494558339660846, "grad_norm": 0.16442471742630005, "learning_rate": 1.9148521704695056e-05, "loss": 0.5169, "step": 1837 }, { "epoch": 0.46519868387749935, "grad_norm": 0.14902648329734802, "learning_rate": 1.914755464088623e-05, "loss": 0.5619, "step": 1838 }, { "epoch": 0.4654517843583903, "grad_norm": 0.17024584114551544, "learning_rate": 1.9146587052667497e-05, "loss": 0.5473, "step": 1839 }, { "epoch": 0.4657048848392812, "grad_norm": 0.1540568619966507, "learning_rate": 1.9145618940094334e-05, "loss": 0.5338, "step": 1840 }, { "epoch": 0.46595798532017213, "grad_norm": 0.15819701552391052, "learning_rate": 1.9144650303222233e-05, "loss": 0.5662, "step": 1841 }, { "epoch": 0.466211085801063, "grad_norm": 0.15010058879852295, "learning_rate": 1.914368114210673e-05, "loss": 0.5713, "step": 1842 }, { "epoch": 0.46646418628195396, "grad_norm": 0.15871453285217285, "learning_rate": 1.9142711456803383e-05, "loss": 0.5358, "step": 1843 }, { "epoch": 0.46671728676284485, "grad_norm": 0.1482570320367813, "learning_rate": 1.9141741247367785e-05, "loss": 0.5361, "step": 1844 }, { "epoch": 0.46697038724373574, "grad_norm": 0.1534900814294815, "learning_rate": 1.9140770513855552e-05, "loss": 0.535, "step": 1845 }, { "epoch": 0.4672234877246267, "grad_norm": 0.1464172750711441, "learning_rate": 1.9139799256322334e-05, "loss": 0.5098, "step": 1846 }, { "epoch": 0.4674765882055176, "grad_norm": 0.148983895778656, "learning_rate": 1.913882747482381e-05, "loss": 0.5457, "step": 1847 }, { "epoch": 0.4677296886864085, "grad_norm": 0.15446853637695312, "learning_rate": 1.91378551694157e-05, "loss": 0.5631, "step": 1848 }, { "epoch": 0.4679827891672994, "grad_norm": 0.14498282968997955, "learning_rate": 1.913688234015373e-05, "loss": 0.5376, "step": 1849 }, { "epoch": 0.46823588964819035, "grad_norm": 0.14600622653961182, "learning_rate": 1.913590898709368e-05, "loss": 0.5287, "step": 1850 }, { "epoch": 0.46848899012908124, "grad_norm": 0.14623919129371643, "learning_rate": 1.9134935110291345e-05, "loss": 0.5239, "step": 1851 }, { "epoch": 0.4687420906099722, "grad_norm": 0.15560786426067352, "learning_rate": 1.913396070980256e-05, "loss": 0.5311, "step": 1852 }, { "epoch": 0.4689951910908631, "grad_norm": 0.17758597433567047, "learning_rate": 1.913298578568318e-05, "loss": 0.5569, "step": 1853 }, { "epoch": 0.46924829157175396, "grad_norm": 0.14047466218471527, "learning_rate": 1.91320103379891e-05, "loss": 0.485, "step": 1854 }, { "epoch": 0.4695013920526449, "grad_norm": 0.1628057211637497, "learning_rate": 1.913103436677624e-05, "loss": 0.5464, "step": 1855 }, { "epoch": 0.4697544925335358, "grad_norm": 0.15403884649276733, "learning_rate": 1.9130057872100546e-05, "loss": 0.5315, "step": 1856 }, { "epoch": 0.47000759301442674, "grad_norm": 0.14413578808307648, "learning_rate": 1.9129080854018003e-05, "loss": 0.5402, "step": 1857 }, { "epoch": 0.4702606934953176, "grad_norm": 0.14322863519191742, "learning_rate": 1.912810331258462e-05, "loss": 0.5062, "step": 1858 }, { "epoch": 0.47051379397620857, "grad_norm": 0.15314650535583496, "learning_rate": 1.9127125247856437e-05, "loss": 0.5432, "step": 1859 }, { "epoch": 0.47076689445709946, "grad_norm": 0.13958516716957092, "learning_rate": 1.912614665988952e-05, "loss": 0.5348, "step": 1860 }, { "epoch": 0.4710199949379904, "grad_norm": 0.1479014754295349, "learning_rate": 1.912516754873998e-05, "loss": 0.5773, "step": 1861 }, { "epoch": 0.4712730954188813, "grad_norm": 0.1426335573196411, "learning_rate": 1.9124187914463936e-05, "loss": 0.5176, "step": 1862 }, { "epoch": 0.4715261958997722, "grad_norm": 0.14238254725933075, "learning_rate": 1.9123207757117557e-05, "loss": 0.5656, "step": 1863 }, { "epoch": 0.4717792963806631, "grad_norm": 0.14684829115867615, "learning_rate": 1.912222707675703e-05, "loss": 0.5471, "step": 1864 }, { "epoch": 0.472032396861554, "grad_norm": 0.15091629326343536, "learning_rate": 1.912124587343857e-05, "loss": 0.5219, "step": 1865 }, { "epoch": 0.47228549734244496, "grad_norm": 0.14322194457054138, "learning_rate": 1.9120264147218438e-05, "loss": 0.5373, "step": 1866 }, { "epoch": 0.47253859782333585, "grad_norm": 0.15718382596969604, "learning_rate": 1.9119281898152907e-05, "loss": 0.5386, "step": 1867 }, { "epoch": 0.4727916983042268, "grad_norm": 0.15968430042266846, "learning_rate": 1.911829912629829e-05, "loss": 0.5569, "step": 1868 }, { "epoch": 0.4730447987851177, "grad_norm": 0.14726126194000244, "learning_rate": 1.9117315831710926e-05, "loss": 0.5441, "step": 1869 }, { "epoch": 0.4732978992660086, "grad_norm": 0.14613080024719238, "learning_rate": 1.9116332014447187e-05, "loss": 0.5269, "step": 1870 }, { "epoch": 0.4735509997468995, "grad_norm": 0.1488690972328186, "learning_rate": 1.9115347674563468e-05, "loss": 0.5511, "step": 1871 }, { "epoch": 0.47380410022779046, "grad_norm": 0.14886198937892914, "learning_rate": 1.9114362812116203e-05, "loss": 0.5406, "step": 1872 }, { "epoch": 0.47405720070868135, "grad_norm": 0.14333635568618774, "learning_rate": 1.9113377427161852e-05, "loss": 0.5205, "step": 1873 }, { "epoch": 0.47431030118957224, "grad_norm": 0.148750901222229, "learning_rate": 1.9112391519756905e-05, "loss": 0.5591, "step": 1874 }, { "epoch": 0.4745634016704632, "grad_norm": 0.15048296749591827, "learning_rate": 1.9111405089957886e-05, "loss": 0.535, "step": 1875 }, { "epoch": 0.47481650215135407, "grad_norm": 0.15181617438793182, "learning_rate": 1.9110418137821332e-05, "loss": 0.5396, "step": 1876 }, { "epoch": 0.475069602632245, "grad_norm": 0.15228953957557678, "learning_rate": 1.910943066340384e-05, "loss": 0.5433, "step": 1877 }, { "epoch": 0.4753227031131359, "grad_norm": 0.14914223551750183, "learning_rate": 1.9108442666762008e-05, "loss": 0.5206, "step": 1878 }, { "epoch": 0.47557580359402685, "grad_norm": 0.14637982845306396, "learning_rate": 1.910745414795248e-05, "loss": 0.5691, "step": 1879 }, { "epoch": 0.47582890407491774, "grad_norm": 0.14993123710155487, "learning_rate": 1.9106465107031922e-05, "loss": 0.5713, "step": 1880 }, { "epoch": 0.4760820045558087, "grad_norm": 0.14913460612297058, "learning_rate": 1.9105475544057038e-05, "loss": 0.5671, "step": 1881 }, { "epoch": 0.47633510503669957, "grad_norm": 0.16538570821285248, "learning_rate": 1.9104485459084555e-05, "loss": 0.5331, "step": 1882 }, { "epoch": 0.47658820551759046, "grad_norm": 0.15010684728622437, "learning_rate": 1.9103494852171237e-05, "loss": 0.5802, "step": 1883 }, { "epoch": 0.4768413059984814, "grad_norm": 0.15425536036491394, "learning_rate": 1.910250372337387e-05, "loss": 0.5355, "step": 1884 }, { "epoch": 0.4770944064793723, "grad_norm": 0.1477648764848709, "learning_rate": 1.9101512072749267e-05, "loss": 0.513, "step": 1885 }, { "epoch": 0.47734750696026323, "grad_norm": 0.1467532515525818, "learning_rate": 1.9100519900354288e-05, "loss": 0.549, "step": 1886 }, { "epoch": 0.4776006074411541, "grad_norm": 0.15148386359214783, "learning_rate": 1.9099527206245807e-05, "loss": 0.5582, "step": 1887 }, { "epoch": 0.47785370792204507, "grad_norm": 0.14593085646629333, "learning_rate": 1.9098533990480735e-05, "loss": 0.5277, "step": 1888 }, { "epoch": 0.47810680840293596, "grad_norm": 0.14520245790481567, "learning_rate": 1.9097540253116007e-05, "loss": 0.5274, "step": 1889 }, { "epoch": 0.4783599088838269, "grad_norm": 0.14359933137893677, "learning_rate": 1.9096545994208594e-05, "loss": 0.5226, "step": 1890 }, { "epoch": 0.4786130093647178, "grad_norm": 0.1448543220758438, "learning_rate": 1.9095551213815497e-05, "loss": 0.5339, "step": 1891 }, { "epoch": 0.47886610984560873, "grad_norm": 0.14873534440994263, "learning_rate": 1.9094555911993742e-05, "loss": 0.547, "step": 1892 }, { "epoch": 0.4791192103264996, "grad_norm": 0.14431843161582947, "learning_rate": 1.909356008880039e-05, "loss": 0.5374, "step": 1893 }, { "epoch": 0.4793723108073905, "grad_norm": 0.15551522374153137, "learning_rate": 1.9092563744292526e-05, "loss": 0.5421, "step": 1894 }, { "epoch": 0.47962541128828146, "grad_norm": 0.14870351552963257, "learning_rate": 1.909156687852727e-05, "loss": 0.5532, "step": 1895 }, { "epoch": 0.47987851176917234, "grad_norm": 0.14925895631313324, "learning_rate": 1.909056949156177e-05, "loss": 0.5419, "step": 1896 }, { "epoch": 0.4801316122500633, "grad_norm": 0.1646534502506256, "learning_rate": 1.9089571583453206e-05, "loss": 0.5399, "step": 1897 }, { "epoch": 0.4803847127309542, "grad_norm": 0.14854730665683746, "learning_rate": 1.9088573154258782e-05, "loss": 0.5431, "step": 1898 }, { "epoch": 0.4806378132118451, "grad_norm": 0.14499633014202118, "learning_rate": 1.9087574204035738e-05, "loss": 0.5554, "step": 1899 }, { "epoch": 0.480890913692736, "grad_norm": 0.14545610547065735, "learning_rate": 1.9086574732841346e-05, "loss": 0.5275, "step": 1900 }, { "epoch": 0.48114401417362695, "grad_norm": 0.14720550179481506, "learning_rate": 1.90855747407329e-05, "loss": 0.5711, "step": 1901 }, { "epoch": 0.48139711465451784, "grad_norm": 0.15526871383190155, "learning_rate": 1.908457422776772e-05, "loss": 0.5352, "step": 1902 }, { "epoch": 0.48165021513540873, "grad_norm": 0.1483507752418518, "learning_rate": 1.9083573194003173e-05, "loss": 0.5635, "step": 1903 }, { "epoch": 0.4819033156162997, "grad_norm": 0.14749382436275482, "learning_rate": 1.9082571639496644e-05, "loss": 0.5267, "step": 1904 }, { "epoch": 0.48215641609719057, "grad_norm": 0.1496928334236145, "learning_rate": 1.908156956430555e-05, "loss": 0.5527, "step": 1905 }, { "epoch": 0.4824095165780815, "grad_norm": 0.15124240517616272, "learning_rate": 1.9080566968487337e-05, "loss": 0.553, "step": 1906 }, { "epoch": 0.4826626170589724, "grad_norm": 0.14676426351070404, "learning_rate": 1.907956385209948e-05, "loss": 0.5288, "step": 1907 }, { "epoch": 0.48291571753986334, "grad_norm": 0.1487855762243271, "learning_rate": 1.907856021519949e-05, "loss": 0.5259, "step": 1908 }, { "epoch": 0.48316881802075423, "grad_norm": 0.14760038256645203, "learning_rate": 1.9077556057844898e-05, "loss": 0.5174, "step": 1909 }, { "epoch": 0.4834219185016452, "grad_norm": 0.1448255032300949, "learning_rate": 1.9076551380093274e-05, "loss": 0.5207, "step": 1910 }, { "epoch": 0.48367501898253606, "grad_norm": 0.15068931877613068, "learning_rate": 1.9075546182002213e-05, "loss": 0.5478, "step": 1911 }, { "epoch": 0.48392811946342695, "grad_norm": 0.14817774295806885, "learning_rate": 1.907454046362934e-05, "loss": 0.5553, "step": 1912 }, { "epoch": 0.4841812199443179, "grad_norm": 0.14044204354286194, "learning_rate": 1.9073534225032313e-05, "loss": 0.5021, "step": 1913 }, { "epoch": 0.4844343204252088, "grad_norm": 0.14910194277763367, "learning_rate": 1.9072527466268816e-05, "loss": 0.5579, "step": 1914 }, { "epoch": 0.48468742090609973, "grad_norm": 0.14622847735881805, "learning_rate": 1.9071520187396563e-05, "loss": 0.5489, "step": 1915 }, { "epoch": 0.4849405213869906, "grad_norm": 0.14573965966701508, "learning_rate": 1.90705123884733e-05, "loss": 0.5139, "step": 1916 }, { "epoch": 0.48519362186788156, "grad_norm": 0.1451754868030548, "learning_rate": 1.9069504069556806e-05, "loss": 0.5397, "step": 1917 }, { "epoch": 0.48544672234877245, "grad_norm": 0.14228999614715576, "learning_rate": 1.9068495230704878e-05, "loss": 0.5257, "step": 1918 }, { "epoch": 0.4856998228296634, "grad_norm": 0.14980851113796234, "learning_rate": 1.9067485871975356e-05, "loss": 0.565, "step": 1919 }, { "epoch": 0.4859529233105543, "grad_norm": 0.14844326674938202, "learning_rate": 1.9066475993426105e-05, "loss": 0.554, "step": 1920 }, { "epoch": 0.48620602379144523, "grad_norm": 0.14422109723091125, "learning_rate": 1.9065465595115018e-05, "loss": 0.5296, "step": 1921 }, { "epoch": 0.4864591242723361, "grad_norm": 0.14998356997966766, "learning_rate": 1.9064454677100016e-05, "loss": 0.5451, "step": 1922 }, { "epoch": 0.486712224753227, "grad_norm": 0.14755471050739288, "learning_rate": 1.906344323943906e-05, "loss": 0.5637, "step": 1923 }, { "epoch": 0.48696532523411795, "grad_norm": 0.1462131142616272, "learning_rate": 1.9062431282190125e-05, "loss": 0.5349, "step": 1924 }, { "epoch": 0.48721842571500884, "grad_norm": 0.14705601334571838, "learning_rate": 1.9061418805411228e-05, "loss": 0.5413, "step": 1925 }, { "epoch": 0.4874715261958998, "grad_norm": 0.1487126648426056, "learning_rate": 1.9060405809160412e-05, "loss": 0.5382, "step": 1926 }, { "epoch": 0.4877246266767907, "grad_norm": 0.14743807911872864, "learning_rate": 1.905939229349575e-05, "loss": 0.5381, "step": 1927 }, { "epoch": 0.4879777271576816, "grad_norm": 0.1568838655948639, "learning_rate": 1.9058378258475346e-05, "loss": 0.5607, "step": 1928 }, { "epoch": 0.4882308276385725, "grad_norm": 0.1473371833562851, "learning_rate": 1.9057363704157334e-05, "loss": 0.5569, "step": 1929 }, { "epoch": 0.48848392811946345, "grad_norm": 0.14230087399482727, "learning_rate": 1.905634863059987e-05, "loss": 0.5394, "step": 1930 }, { "epoch": 0.48873702860035434, "grad_norm": 0.21522068977355957, "learning_rate": 1.9055333037861154e-05, "loss": 0.5066, "step": 1931 }, { "epoch": 0.48899012908124523, "grad_norm": 0.15192356705665588, "learning_rate": 1.9054316925999398e-05, "loss": 0.5596, "step": 1932 }, { "epoch": 0.4892432295621362, "grad_norm": 0.1506957709789276, "learning_rate": 1.9053300295072864e-05, "loss": 0.5596, "step": 1933 }, { "epoch": 0.48949633004302706, "grad_norm": 0.14356692135334015, "learning_rate": 1.9052283145139825e-05, "loss": 0.5294, "step": 1934 }, { "epoch": 0.489749430523918, "grad_norm": 0.14666032791137695, "learning_rate": 1.9051265476258596e-05, "loss": 0.5412, "step": 1935 }, { "epoch": 0.4900025310048089, "grad_norm": 0.14896340668201447, "learning_rate": 1.9050247288487517e-05, "loss": 0.5583, "step": 1936 }, { "epoch": 0.49025563148569984, "grad_norm": 0.14283309876918793, "learning_rate": 1.904922858188496e-05, "loss": 0.5163, "step": 1937 }, { "epoch": 0.4905087319665907, "grad_norm": 0.15178611874580383, "learning_rate": 1.9048209356509323e-05, "loss": 0.5424, "step": 1938 }, { "epoch": 0.49076183244748167, "grad_norm": 0.1689726561307907, "learning_rate": 1.9047189612419037e-05, "loss": 0.5351, "step": 1939 }, { "epoch": 0.49101493292837256, "grad_norm": 0.1439141482114792, "learning_rate": 1.9046169349672564e-05, "loss": 0.5146, "step": 1940 }, { "epoch": 0.4912680334092635, "grad_norm": 0.1454126238822937, "learning_rate": 1.904514856832839e-05, "loss": 0.5702, "step": 1941 }, { "epoch": 0.4915211338901544, "grad_norm": 0.14274144172668457, "learning_rate": 1.9044127268445033e-05, "loss": 0.52, "step": 1942 }, { "epoch": 0.4917742343710453, "grad_norm": 0.14609690010547638, "learning_rate": 1.9043105450081047e-05, "loss": 0.5065, "step": 1943 }, { "epoch": 0.4920273348519362, "grad_norm": 0.15331365168094635, "learning_rate": 1.9042083113295006e-05, "loss": 0.55, "step": 1944 }, { "epoch": 0.4922804353328271, "grad_norm": 0.15041755139827728, "learning_rate": 1.9041060258145525e-05, "loss": 0.5474, "step": 1945 }, { "epoch": 0.49253353581371806, "grad_norm": 0.14926937222480774, "learning_rate": 1.9040036884691235e-05, "loss": 0.56, "step": 1946 }, { "epoch": 0.49278663629460895, "grad_norm": 0.15952521562576294, "learning_rate": 1.9039012992990806e-05, "loss": 0.5525, "step": 1947 }, { "epoch": 0.4930397367754999, "grad_norm": 0.14475597441196442, "learning_rate": 1.9037988583102936e-05, "loss": 0.5547, "step": 1948 }, { "epoch": 0.4932928372563908, "grad_norm": 0.1460406631231308, "learning_rate": 1.9036963655086353e-05, "loss": 0.5205, "step": 1949 }, { "epoch": 0.4935459377372817, "grad_norm": 0.14597095549106598, "learning_rate": 1.9035938208999812e-05, "loss": 0.5318, "step": 1950 }, { "epoch": 0.4937990382181726, "grad_norm": 0.1466267853975296, "learning_rate": 1.9034912244902103e-05, "loss": 0.5391, "step": 1951 }, { "epoch": 0.4940521386990635, "grad_norm": 0.1474587321281433, "learning_rate": 1.903388576285204e-05, "loss": 0.5297, "step": 1952 }, { "epoch": 0.49430523917995445, "grad_norm": 0.1442517638206482, "learning_rate": 1.9032858762908468e-05, "loss": 0.5211, "step": 1953 }, { "epoch": 0.49455833966084534, "grad_norm": 0.14450058341026306, "learning_rate": 1.9031831245130265e-05, "loss": 0.5576, "step": 1954 }, { "epoch": 0.4948114401417363, "grad_norm": 0.15431705117225647, "learning_rate": 1.9030803209576337e-05, "loss": 0.5432, "step": 1955 }, { "epoch": 0.49506454062262717, "grad_norm": 0.1450243592262268, "learning_rate": 1.9029774656305614e-05, "loss": 0.5456, "step": 1956 }, { "epoch": 0.4953176411035181, "grad_norm": 0.1464066505432129, "learning_rate": 1.9028745585377066e-05, "loss": 0.5454, "step": 1957 }, { "epoch": 0.495570741584409, "grad_norm": 0.14703373610973358, "learning_rate": 1.9027715996849686e-05, "loss": 0.5622, "step": 1958 }, { "epoch": 0.49582384206529995, "grad_norm": 0.14474087953567505, "learning_rate": 1.9026685890782498e-05, "loss": 0.5169, "step": 1959 }, { "epoch": 0.49607694254619084, "grad_norm": 0.14639674127101898, "learning_rate": 1.902565526723455e-05, "loss": 0.5581, "step": 1960 }, { "epoch": 0.4963300430270817, "grad_norm": 0.15299645066261292, "learning_rate": 1.9024624126264938e-05, "loss": 0.5219, "step": 1961 }, { "epoch": 0.49658314350797267, "grad_norm": 0.14949935674667358, "learning_rate": 1.9023592467932767e-05, "loss": 0.5601, "step": 1962 }, { "epoch": 0.49683624398886356, "grad_norm": 0.14503586292266846, "learning_rate": 1.9022560292297183e-05, "loss": 0.544, "step": 1963 }, { "epoch": 0.4970893444697545, "grad_norm": 0.17673051357269287, "learning_rate": 1.9021527599417354e-05, "loss": 0.5612, "step": 1964 }, { "epoch": 0.4973424449506454, "grad_norm": 0.14651919901371002, "learning_rate": 1.9020494389352483e-05, "loss": 0.5746, "step": 1965 }, { "epoch": 0.49759554543153633, "grad_norm": 0.15588942170143127, "learning_rate": 1.9019460662161807e-05, "loss": 0.5579, "step": 1966 }, { "epoch": 0.4978486459124272, "grad_norm": 0.1492077112197876, "learning_rate": 1.901842641790458e-05, "loss": 0.5499, "step": 1967 }, { "epoch": 0.49810174639331817, "grad_norm": 0.18058110773563385, "learning_rate": 1.9017391656640097e-05, "loss": 0.5418, "step": 1968 }, { "epoch": 0.49835484687420906, "grad_norm": 0.16760560870170593, "learning_rate": 1.9016356378427683e-05, "loss": 0.5246, "step": 1969 }, { "epoch": 0.4986079473551, "grad_norm": 0.1536208987236023, "learning_rate": 1.901532058332668e-05, "loss": 0.581, "step": 1970 }, { "epoch": 0.4988610478359909, "grad_norm": 0.16915126144886017, "learning_rate": 1.901428427139647e-05, "loss": 0.5336, "step": 1971 }, { "epoch": 0.4991141483168818, "grad_norm": 0.14436998963356018, "learning_rate": 1.901324744269647e-05, "loss": 0.5401, "step": 1972 }, { "epoch": 0.4993672487977727, "grad_norm": 0.14993751049041748, "learning_rate": 1.901221009728611e-05, "loss": 0.5555, "step": 1973 }, { "epoch": 0.4996203492786636, "grad_norm": 0.14491701126098633, "learning_rate": 1.901117223522486e-05, "loss": 0.5338, "step": 1974 }, { "epoch": 0.49987344975955456, "grad_norm": 0.14353638887405396, "learning_rate": 1.9010133856572222e-05, "loss": 0.5344, "step": 1975 }, { "epoch": 0.5001265502404455, "grad_norm": 0.14797140657901764, "learning_rate": 1.9009094961387726e-05, "loss": 0.525, "step": 1976 }, { "epoch": 0.5003796507213364, "grad_norm": 0.14880260825157166, "learning_rate": 1.900805554973092e-05, "loss": 0.5453, "step": 1977 }, { "epoch": 0.5006327512022273, "grad_norm": 0.14741623401641846, "learning_rate": 1.90070156216614e-05, "loss": 0.5341, "step": 1978 }, { "epoch": 0.5008858516831182, "grad_norm": 0.14488853514194489, "learning_rate": 1.9005975177238784e-05, "loss": 0.5299, "step": 1979 }, { "epoch": 0.5011389521640092, "grad_norm": 0.14642424881458282, "learning_rate": 1.9004934216522714e-05, "loss": 0.5569, "step": 1980 }, { "epoch": 0.5013920526449, "grad_norm": 0.14464901387691498, "learning_rate": 1.9003892739572863e-05, "loss": 0.5588, "step": 1981 }, { "epoch": 0.5016451531257909, "grad_norm": 0.147584930062294, "learning_rate": 1.9002850746448947e-05, "loss": 0.5346, "step": 1982 }, { "epoch": 0.5018982536066818, "grad_norm": 0.1435047686100006, "learning_rate": 1.900180823721069e-05, "loss": 0.5466, "step": 1983 }, { "epoch": 0.5021513540875727, "grad_norm": 0.169583261013031, "learning_rate": 1.9000765211917862e-05, "loss": 0.5527, "step": 1984 }, { "epoch": 0.5024044545684637, "grad_norm": 0.15055251121520996, "learning_rate": 1.8999721670630256e-05, "loss": 0.5795, "step": 1985 }, { "epoch": 0.5026575550493546, "grad_norm": 0.15124401450157166, "learning_rate": 1.8998677613407703e-05, "loss": 0.5141, "step": 1986 }, { "epoch": 0.5029106555302455, "grad_norm": 0.14702655375003815, "learning_rate": 1.8997633040310047e-05, "loss": 0.5314, "step": 1987 }, { "epoch": 0.5031637560111364, "grad_norm": 0.1500476449728012, "learning_rate": 1.8996587951397176e-05, "loss": 0.5811, "step": 1988 }, { "epoch": 0.5034168564920274, "grad_norm": 0.14876651763916016, "learning_rate": 1.8995542346729003e-05, "loss": 0.5242, "step": 1989 }, { "epoch": 0.5036699569729183, "grad_norm": 0.14694412052631378, "learning_rate": 1.8994496226365467e-05, "loss": 0.517, "step": 1990 }, { "epoch": 0.5039230574538092, "grad_norm": 0.1451118141412735, "learning_rate": 1.899344959036654e-05, "loss": 0.533, "step": 1991 }, { "epoch": 0.5041761579347, "grad_norm": 0.14241361618041992, "learning_rate": 1.8992402438792225e-05, "loss": 0.5199, "step": 1992 }, { "epoch": 0.5044292584155909, "grad_norm": 0.1467846781015396, "learning_rate": 1.8991354771702557e-05, "loss": 0.5298, "step": 1993 }, { "epoch": 0.5046823588964819, "grad_norm": 0.14562153816223145, "learning_rate": 1.899030658915759e-05, "loss": 0.5373, "step": 1994 }, { "epoch": 0.5049354593773728, "grad_norm": 0.1485190987586975, "learning_rate": 1.8989257891217422e-05, "loss": 0.566, "step": 1995 }, { "epoch": 0.5051885598582637, "grad_norm": 0.14917857944965363, "learning_rate": 1.8988208677942163e-05, "loss": 0.5349, "step": 1996 }, { "epoch": 0.5054416603391546, "grad_norm": 0.1508703976869583, "learning_rate": 1.8987158949391965e-05, "loss": 0.5426, "step": 1997 }, { "epoch": 0.5056947608200456, "grad_norm": 0.14861728250980377, "learning_rate": 1.898610870562701e-05, "loss": 0.5555, "step": 1998 }, { "epoch": 0.5059478613009365, "grad_norm": 0.14600548148155212, "learning_rate": 1.8985057946707507e-05, "loss": 0.5273, "step": 1999 }, { "epoch": 0.5062009617818274, "grad_norm": 0.14683488011360168, "learning_rate": 1.8984006672693688e-05, "loss": 0.5254, "step": 2000 }, { "epoch": 0.5064540622627183, "grad_norm": 0.14343638718128204, "learning_rate": 1.8982954883645826e-05, "loss": 0.5368, "step": 2001 }, { "epoch": 0.5067071627436092, "grad_norm": 0.1483989655971527, "learning_rate": 1.8981902579624216e-05, "loss": 0.5546, "step": 2002 }, { "epoch": 0.5069602632245002, "grad_norm": 0.14976057410240173, "learning_rate": 1.898084976068918e-05, "loss": 0.5373, "step": 2003 }, { "epoch": 0.507213363705391, "grad_norm": 0.14165432751178741, "learning_rate": 1.897979642690108e-05, "loss": 0.5337, "step": 2004 }, { "epoch": 0.5074664641862819, "grad_norm": 0.14637352526187897, "learning_rate": 1.8978742578320303e-05, "loss": 0.5268, "step": 2005 }, { "epoch": 0.5077195646671728, "grad_norm": 0.14603778719902039, "learning_rate": 1.897768821500726e-05, "loss": 0.5407, "step": 2006 }, { "epoch": 0.5079726651480638, "grad_norm": 0.14633269608020782, "learning_rate": 1.897663333702239e-05, "loss": 0.558, "step": 2007 }, { "epoch": 0.5082257656289547, "grad_norm": 0.1476481556892395, "learning_rate": 1.8975577944426177e-05, "loss": 0.5352, "step": 2008 }, { "epoch": 0.5084788661098456, "grad_norm": 0.14400698244571686, "learning_rate": 1.8974522037279116e-05, "loss": 0.5296, "step": 2009 }, { "epoch": 0.5087319665907365, "grad_norm": 0.14734113216400146, "learning_rate": 1.8973465615641745e-05, "loss": 0.5487, "step": 2010 }, { "epoch": 0.5089850670716274, "grad_norm": 0.14925172924995422, "learning_rate": 1.8972408679574625e-05, "loss": 0.5543, "step": 2011 }, { "epoch": 0.5092381675525184, "grad_norm": 0.14813505113124847, "learning_rate": 1.897135122913835e-05, "loss": 0.52, "step": 2012 }, { "epoch": 0.5094912680334093, "grad_norm": 0.14566028118133545, "learning_rate": 1.8970293264393536e-05, "loss": 0.5769, "step": 2013 }, { "epoch": 0.5097443685143002, "grad_norm": 0.14642831683158875, "learning_rate": 1.896923478540084e-05, "loss": 0.5491, "step": 2014 }, { "epoch": 0.509997468995191, "grad_norm": 0.1512906402349472, "learning_rate": 1.896817579222094e-05, "loss": 0.5539, "step": 2015 }, { "epoch": 0.510250569476082, "grad_norm": 0.15196779370307922, "learning_rate": 1.8967116284914545e-05, "loss": 0.5313, "step": 2016 }, { "epoch": 0.5105036699569729, "grad_norm": 0.1474650502204895, "learning_rate": 1.8966056263542393e-05, "loss": 0.5331, "step": 2017 }, { "epoch": 0.5107567704378638, "grad_norm": 0.15592193603515625, "learning_rate": 1.8964995728165255e-05, "loss": 0.5485, "step": 2018 }, { "epoch": 0.5110098709187547, "grad_norm": 0.1451408714056015, "learning_rate": 1.8963934678843933e-05, "loss": 0.543, "step": 2019 }, { "epoch": 0.5112629713996456, "grad_norm": 0.14020602405071259, "learning_rate": 1.8962873115639246e-05, "loss": 0.5583, "step": 2020 }, { "epoch": 0.5115160718805366, "grad_norm": 0.14597153663635254, "learning_rate": 1.8961811038612057e-05, "loss": 0.5399, "step": 2021 }, { "epoch": 0.5117691723614275, "grad_norm": 0.17113880813121796, "learning_rate": 1.896074844782325e-05, "loss": 0.5566, "step": 2022 }, { "epoch": 0.5120222728423184, "grad_norm": 0.15941676497459412, "learning_rate": 1.8959685343333746e-05, "loss": 0.5279, "step": 2023 }, { "epoch": 0.5122753733232093, "grad_norm": 0.13685846328735352, "learning_rate": 1.8958621725204486e-05, "loss": 0.5301, "step": 2024 }, { "epoch": 0.5125284738041003, "grad_norm": 0.14705871045589447, "learning_rate": 1.8957557593496442e-05, "loss": 0.5471, "step": 2025 }, { "epoch": 0.5127815742849912, "grad_norm": 0.1425859034061432, "learning_rate": 1.8956492948270625e-05, "loss": 0.525, "step": 2026 }, { "epoch": 0.513034674765882, "grad_norm": 0.16177019476890564, "learning_rate": 1.895542778958807e-05, "loss": 0.5966, "step": 2027 }, { "epoch": 0.5132877752467729, "grad_norm": 0.15402144193649292, "learning_rate": 1.895436211750983e-05, "loss": 0.5515, "step": 2028 }, { "epoch": 0.5135408757276639, "grad_norm": 0.14698517322540283, "learning_rate": 1.8953295932097007e-05, "loss": 0.5314, "step": 2029 }, { "epoch": 0.5137939762085548, "grad_norm": 0.15535616874694824, "learning_rate": 1.895222923341072e-05, "loss": 0.5531, "step": 2030 }, { "epoch": 0.5140470766894457, "grad_norm": 0.15035369992256165, "learning_rate": 1.895116202151212e-05, "loss": 0.5351, "step": 2031 }, { "epoch": 0.5143001771703366, "grad_norm": 0.14429210126399994, "learning_rate": 1.8950094296462392e-05, "loss": 0.5495, "step": 2032 }, { "epoch": 0.5145532776512275, "grad_norm": 0.14750072360038757, "learning_rate": 1.894902605832274e-05, "loss": 0.55, "step": 2033 }, { "epoch": 0.5148063781321185, "grad_norm": 0.15020400285720825, "learning_rate": 1.894795730715441e-05, "loss": 0.5516, "step": 2034 }, { "epoch": 0.5150594786130094, "grad_norm": 0.14803938567638397, "learning_rate": 1.8946888043018666e-05, "loss": 0.5431, "step": 2035 }, { "epoch": 0.5153125790939003, "grad_norm": 0.1507357358932495, "learning_rate": 1.894581826597681e-05, "loss": 0.5701, "step": 2036 }, { "epoch": 0.5155656795747912, "grad_norm": 0.1515742987394333, "learning_rate": 1.894474797609017e-05, "loss": 0.5624, "step": 2037 }, { "epoch": 0.5158187800556822, "grad_norm": 0.1476578265428543, "learning_rate": 1.89436771734201e-05, "loss": 0.5491, "step": 2038 }, { "epoch": 0.516071880536573, "grad_norm": 0.14802879095077515, "learning_rate": 1.8942605858027988e-05, "loss": 0.5459, "step": 2039 }, { "epoch": 0.5163249810174639, "grad_norm": 0.1512601375579834, "learning_rate": 1.8941534029975257e-05, "loss": 0.5526, "step": 2040 }, { "epoch": 0.5165780814983548, "grad_norm": 0.1454947143793106, "learning_rate": 1.8940461689323343e-05, "loss": 0.5341, "step": 2041 }, { "epoch": 0.5168311819792457, "grad_norm": 0.1484716832637787, "learning_rate": 1.8939388836133728e-05, "loss": 0.5406, "step": 2042 }, { "epoch": 0.5170842824601367, "grad_norm": 0.14195962250232697, "learning_rate": 1.8938315470467912e-05, "loss": 0.5365, "step": 2043 }, { "epoch": 0.5173373829410276, "grad_norm": 0.14998741447925568, "learning_rate": 1.893724159238743e-05, "loss": 0.5313, "step": 2044 }, { "epoch": 0.5175904834219185, "grad_norm": 0.14848732948303223, "learning_rate": 1.893616720195385e-05, "loss": 0.5479, "step": 2045 }, { "epoch": 0.5178435839028094, "grad_norm": 0.15063674747943878, "learning_rate": 1.8935092299228753e-05, "loss": 0.58, "step": 2046 }, { "epoch": 0.5180966843837004, "grad_norm": 0.14607171714305878, "learning_rate": 1.8934016884273773e-05, "loss": 0.5336, "step": 2047 }, { "epoch": 0.5183497848645913, "grad_norm": 0.14624135196208954, "learning_rate": 1.8932940957150557e-05, "loss": 0.5441, "step": 2048 }, { "epoch": 0.5186028853454822, "grad_norm": 0.1554187834262848, "learning_rate": 1.893186451792078e-05, "loss": 0.5666, "step": 2049 }, { "epoch": 0.518855985826373, "grad_norm": 0.14865043759346008, "learning_rate": 1.893078756664616e-05, "loss": 0.5323, "step": 2050 }, { "epoch": 0.5191090863072639, "grad_norm": 0.14657746255397797, "learning_rate": 1.8929710103388435e-05, "loss": 0.5371, "step": 2051 }, { "epoch": 0.5193621867881549, "grad_norm": 0.15419794619083405, "learning_rate": 1.8928632128209368e-05, "loss": 0.5338, "step": 2052 }, { "epoch": 0.5196152872690458, "grad_norm": 0.14877189695835114, "learning_rate": 1.8927553641170764e-05, "loss": 0.5763, "step": 2053 }, { "epoch": 0.5198683877499367, "grad_norm": 0.15488377213478088, "learning_rate": 1.8926474642334443e-05, "loss": 0.5632, "step": 2054 }, { "epoch": 0.5201214882308276, "grad_norm": 0.14712969958782196, "learning_rate": 1.892539513176227e-05, "loss": 0.5599, "step": 2055 }, { "epoch": 0.5203745887117186, "grad_norm": 0.14861759543418884, "learning_rate": 1.8924315109516124e-05, "loss": 0.525, "step": 2056 }, { "epoch": 0.5206276891926095, "grad_norm": 0.1469864696264267, "learning_rate": 1.8923234575657928e-05, "loss": 0.56, "step": 2057 }, { "epoch": 0.5208807896735004, "grad_norm": 0.14894641935825348, "learning_rate": 1.892215353024962e-05, "loss": 0.5274, "step": 2058 }, { "epoch": 0.5211338901543913, "grad_norm": 0.15288713574409485, "learning_rate": 1.8921071973353174e-05, "loss": 0.551, "step": 2059 }, { "epoch": 0.5213869906352822, "grad_norm": 0.14344966411590576, "learning_rate": 1.89199899050306e-05, "loss": 0.533, "step": 2060 }, { "epoch": 0.5216400911161732, "grad_norm": 0.1435471773147583, "learning_rate": 1.891890732534392e-05, "loss": 0.5379, "step": 2061 }, { "epoch": 0.521893191597064, "grad_norm": 0.14533542096614838, "learning_rate": 1.8917824234355205e-05, "loss": 0.5225, "step": 2062 }, { "epoch": 0.5221462920779549, "grad_norm": 0.14329683780670166, "learning_rate": 1.8916740632126544e-05, "loss": 0.5398, "step": 2063 }, { "epoch": 0.5223993925588458, "grad_norm": 0.14607039093971252, "learning_rate": 1.891565651872006e-05, "loss": 0.5059, "step": 2064 }, { "epoch": 0.5226524930397368, "grad_norm": 0.14713475108146667, "learning_rate": 1.8914571894197894e-05, "loss": 0.5527, "step": 2065 }, { "epoch": 0.5229055935206277, "grad_norm": 0.1503877490758896, "learning_rate": 1.8913486758622236e-05, "loss": 0.5449, "step": 2066 }, { "epoch": 0.5231586940015186, "grad_norm": 0.1482800990343094, "learning_rate": 1.8912401112055288e-05, "loss": 0.5663, "step": 2067 }, { "epoch": 0.5234117944824095, "grad_norm": 0.14930813014507294, "learning_rate": 1.891131495455929e-05, "loss": 0.5481, "step": 2068 }, { "epoch": 0.5236648949633004, "grad_norm": 0.14039750397205353, "learning_rate": 1.891022828619651e-05, "loss": 0.517, "step": 2069 }, { "epoch": 0.5239179954441914, "grad_norm": 0.1495741307735443, "learning_rate": 1.8909141107029237e-05, "loss": 0.5328, "step": 2070 }, { "epoch": 0.5241710959250823, "grad_norm": 0.14653369784355164, "learning_rate": 1.890805341711981e-05, "loss": 0.5524, "step": 2071 }, { "epoch": 0.5244241964059732, "grad_norm": 0.15061494708061218, "learning_rate": 1.8906965216530574e-05, "loss": 0.564, "step": 2072 }, { "epoch": 0.524677296886864, "grad_norm": 0.1590503454208374, "learning_rate": 1.8905876505323918e-05, "loss": 0.5553, "step": 2073 }, { "epoch": 0.524930397367755, "grad_norm": 0.19967687129974365, "learning_rate": 1.890478728356225e-05, "loss": 0.557, "step": 2074 }, { "epoch": 0.5251834978486459, "grad_norm": 0.14689359068870544, "learning_rate": 1.8903697551308018e-05, "loss": 0.5203, "step": 2075 }, { "epoch": 0.5254365983295368, "grad_norm": 0.14794990420341492, "learning_rate": 1.890260730862369e-05, "loss": 0.5395, "step": 2076 }, { "epoch": 0.5256896988104277, "grad_norm": 0.14844904839992523, "learning_rate": 1.8901516555571772e-05, "loss": 0.5522, "step": 2077 }, { "epoch": 0.5259427992913187, "grad_norm": 0.14163947105407715, "learning_rate": 1.8900425292214794e-05, "loss": 0.5318, "step": 2078 }, { "epoch": 0.5261958997722096, "grad_norm": 0.15207208693027496, "learning_rate": 1.889933351861531e-05, "loss": 0.5234, "step": 2079 }, { "epoch": 0.5264490002531005, "grad_norm": 0.14353814721107483, "learning_rate": 1.8898241234835914e-05, "loss": 0.5376, "step": 2080 }, { "epoch": 0.5267021007339914, "grad_norm": 0.14712877571582794, "learning_rate": 1.8897148440939227e-05, "loss": 0.5412, "step": 2081 }, { "epoch": 0.5269552012148823, "grad_norm": 0.1465034931898117, "learning_rate": 1.889605513698789e-05, "loss": 0.5296, "step": 2082 }, { "epoch": 0.5272083016957733, "grad_norm": 0.14930671453475952, "learning_rate": 1.8894961323044584e-05, "loss": 0.5479, "step": 2083 }, { "epoch": 0.5274614021766642, "grad_norm": 0.14586983621120453, "learning_rate": 1.889386699917201e-05, "loss": 0.5548, "step": 2084 }, { "epoch": 0.527714502657555, "grad_norm": 0.14690659940242767, "learning_rate": 1.889277216543291e-05, "loss": 0.5351, "step": 2085 }, { "epoch": 0.5279676031384459, "grad_norm": 0.1415756791830063, "learning_rate": 1.8891676821890047e-05, "loss": 0.5311, "step": 2086 }, { "epoch": 0.5282207036193369, "grad_norm": 0.14690400660037994, "learning_rate": 1.8890580968606208e-05, "loss": 0.5541, "step": 2087 }, { "epoch": 0.5284738041002278, "grad_norm": 0.14690932631492615, "learning_rate": 1.8889484605644226e-05, "loss": 0.545, "step": 2088 }, { "epoch": 0.5287269045811187, "grad_norm": 0.13867908716201782, "learning_rate": 1.8888387733066946e-05, "loss": 0.5583, "step": 2089 }, { "epoch": 0.5289800050620096, "grad_norm": 0.14592419564723969, "learning_rate": 1.8887290350937253e-05, "loss": 0.5415, "step": 2090 }, { "epoch": 0.5292331055429005, "grad_norm": 0.14960640668869019, "learning_rate": 1.8886192459318056e-05, "loss": 0.554, "step": 2091 }, { "epoch": 0.5294862060237915, "grad_norm": 0.148869588971138, "learning_rate": 1.8885094058272296e-05, "loss": 0.5576, "step": 2092 }, { "epoch": 0.5297393065046824, "grad_norm": 0.14470264315605164, "learning_rate": 1.888399514786294e-05, "loss": 0.5394, "step": 2093 }, { "epoch": 0.5299924069855733, "grad_norm": 0.1456449329853058, "learning_rate": 1.8882895728152986e-05, "loss": 0.5323, "step": 2094 }, { "epoch": 0.5302455074664641, "grad_norm": 0.14392712712287903, "learning_rate": 1.8881795799205465e-05, "loss": 0.5256, "step": 2095 }, { "epoch": 0.5304986079473551, "grad_norm": 0.14633582532405853, "learning_rate": 1.888069536108343e-05, "loss": 0.536, "step": 2096 }, { "epoch": 0.530751708428246, "grad_norm": 0.1577252596616745, "learning_rate": 1.8879594413849966e-05, "loss": 0.5289, "step": 2097 }, { "epoch": 0.5310048089091369, "grad_norm": 0.14655692875385284, "learning_rate": 1.8878492957568193e-05, "loss": 0.5507, "step": 2098 }, { "epoch": 0.5312579093900278, "grad_norm": 0.14495889842510223, "learning_rate": 1.8877390992301252e-05, "loss": 0.5758, "step": 2099 }, { "epoch": 0.5315110098709187, "grad_norm": 0.16211102902889252, "learning_rate": 1.8876288518112315e-05, "loss": 0.5455, "step": 2100 }, { "epoch": 0.5317641103518097, "grad_norm": 0.1435222029685974, "learning_rate": 1.8875185535064588e-05, "loss": 0.5419, "step": 2101 }, { "epoch": 0.5320172108327006, "grad_norm": 0.14342010021209717, "learning_rate": 1.88740820432213e-05, "loss": 0.5603, "step": 2102 }, { "epoch": 0.5322703113135915, "grad_norm": 0.14504124224185944, "learning_rate": 1.8872978042645714e-05, "loss": 0.5188, "step": 2103 }, { "epoch": 0.5325234117944824, "grad_norm": 0.1456257551908493, "learning_rate": 1.8871873533401113e-05, "loss": 0.5403, "step": 2104 }, { "epoch": 0.5327765122753734, "grad_norm": 0.16915258765220642, "learning_rate": 1.8870768515550826e-05, "loss": 0.5291, "step": 2105 }, { "epoch": 0.5330296127562643, "grad_norm": 0.14916908740997314, "learning_rate": 1.8869662989158197e-05, "loss": 0.5471, "step": 2106 }, { "epoch": 0.5332827132371551, "grad_norm": 0.15560995042324066, "learning_rate": 1.8868556954286604e-05, "loss": 0.5356, "step": 2107 }, { "epoch": 0.533535813718046, "grad_norm": 0.1501181274652481, "learning_rate": 1.8867450410999453e-05, "loss": 0.5277, "step": 2108 }, { "epoch": 0.5337889141989369, "grad_norm": 0.1498737931251526, "learning_rate": 1.886634335936018e-05, "loss": 0.5623, "step": 2109 }, { "epoch": 0.5340420146798279, "grad_norm": 0.142748162150383, "learning_rate": 1.8865235799432246e-05, "loss": 0.5373, "step": 2110 }, { "epoch": 0.5342951151607188, "grad_norm": 0.1436242014169693, "learning_rate": 1.886412773127915e-05, "loss": 0.5397, "step": 2111 }, { "epoch": 0.5345482156416097, "grad_norm": 0.1510160118341446, "learning_rate": 1.8863019154964418e-05, "loss": 0.5333, "step": 2112 }, { "epoch": 0.5348013161225006, "grad_norm": 0.14627863466739655, "learning_rate": 1.8861910070551597e-05, "loss": 0.5535, "step": 2113 }, { "epoch": 0.5350544166033916, "grad_norm": 0.14565078914165497, "learning_rate": 1.886080047810427e-05, "loss": 0.5402, "step": 2114 }, { "epoch": 0.5353075170842825, "grad_norm": 0.14789433777332306, "learning_rate": 1.8859690377686046e-05, "loss": 0.5256, "step": 2115 }, { "epoch": 0.5355606175651734, "grad_norm": 0.1485886126756668, "learning_rate": 1.885857976936057e-05, "loss": 0.5282, "step": 2116 }, { "epoch": 0.5358137180460643, "grad_norm": 0.1415485143661499, "learning_rate": 1.8857468653191503e-05, "loss": 0.5339, "step": 2117 }, { "epoch": 0.5360668185269551, "grad_norm": 0.1438799798488617, "learning_rate": 1.885635702924255e-05, "loss": 0.526, "step": 2118 }, { "epoch": 0.5363199190078461, "grad_norm": 0.14721158146858215, "learning_rate": 1.8855244897577432e-05, "loss": 0.5386, "step": 2119 }, { "epoch": 0.536573019488737, "grad_norm": 0.1481776386499405, "learning_rate": 1.885413225825991e-05, "loss": 0.5634, "step": 2120 }, { "epoch": 0.5368261199696279, "grad_norm": 0.14346905052661896, "learning_rate": 1.8853019111353766e-05, "loss": 0.5261, "step": 2121 }, { "epoch": 0.5370792204505188, "grad_norm": 0.14726269245147705, "learning_rate": 1.8851905456922818e-05, "loss": 0.525, "step": 2122 }, { "epoch": 0.5373323209314098, "grad_norm": 0.14352542161941528, "learning_rate": 1.8850791295030904e-05, "loss": 0.5329, "step": 2123 }, { "epoch": 0.5375854214123007, "grad_norm": 0.16002048552036285, "learning_rate": 1.8849676625741905e-05, "loss": 0.5259, "step": 2124 }, { "epoch": 0.5378385218931916, "grad_norm": 0.14723941683769226, "learning_rate": 1.8848561449119713e-05, "loss": 0.5293, "step": 2125 }, { "epoch": 0.5380916223740825, "grad_norm": 0.14481991529464722, "learning_rate": 1.8847445765228262e-05, "loss": 0.5285, "step": 2126 }, { "epoch": 0.5383447228549735, "grad_norm": 0.14677830040454865, "learning_rate": 1.8846329574131514e-05, "loss": 0.5222, "step": 2127 }, { "epoch": 0.5385978233358644, "grad_norm": 0.14093263447284698, "learning_rate": 1.8845212875893457e-05, "loss": 0.524, "step": 2128 }, { "epoch": 0.5388509238167553, "grad_norm": 0.1465352177619934, "learning_rate": 1.884409567057811e-05, "loss": 0.5328, "step": 2129 }, { "epoch": 0.5391040242976461, "grad_norm": 0.14699774980545044, "learning_rate": 1.884297795824952e-05, "loss": 0.5203, "step": 2130 }, { "epoch": 0.539357124778537, "grad_norm": 0.14181242883205414, "learning_rate": 1.8841859738971758e-05, "loss": 0.5508, "step": 2131 }, { "epoch": 0.539610225259428, "grad_norm": 0.14994683861732483, "learning_rate": 1.8840741012808937e-05, "loss": 0.5595, "step": 2132 }, { "epoch": 0.5398633257403189, "grad_norm": 0.14866229891777039, "learning_rate": 1.883962177982518e-05, "loss": 0.5573, "step": 2133 }, { "epoch": 0.5401164262212098, "grad_norm": 0.14127808809280396, "learning_rate": 1.8838502040084667e-05, "loss": 0.5283, "step": 2134 }, { "epoch": 0.5403695267021007, "grad_norm": 0.14179453253746033, "learning_rate": 1.8837381793651574e-05, "loss": 0.5463, "step": 2135 }, { "epoch": 0.5406226271829917, "grad_norm": 0.1535675823688507, "learning_rate": 1.8836261040590132e-05, "loss": 0.5538, "step": 2136 }, { "epoch": 0.5408757276638826, "grad_norm": 0.14412690699100494, "learning_rate": 1.8835139780964583e-05, "loss": 0.5468, "step": 2137 }, { "epoch": 0.5411288281447735, "grad_norm": 0.14546814560890198, "learning_rate": 1.8834018014839216e-05, "loss": 0.5348, "step": 2138 }, { "epoch": 0.5413819286256644, "grad_norm": 0.16463163495063782, "learning_rate": 1.883289574227834e-05, "loss": 0.55, "step": 2139 }, { "epoch": 0.5416350291065553, "grad_norm": 0.14167420566082, "learning_rate": 1.883177296334628e-05, "loss": 0.5556, "step": 2140 }, { "epoch": 0.5418881295874463, "grad_norm": 0.1812027245759964, "learning_rate": 1.8830649678107415e-05, "loss": 0.5446, "step": 2141 }, { "epoch": 0.5421412300683371, "grad_norm": 0.1430598348379135, "learning_rate": 1.8829525886626135e-05, "loss": 0.5163, "step": 2142 }, { "epoch": 0.542394330549228, "grad_norm": 0.16585777699947357, "learning_rate": 1.8828401588966864e-05, "loss": 0.5631, "step": 2143 }, { "epoch": 0.5426474310301189, "grad_norm": 0.14548391103744507, "learning_rate": 1.8827276785194056e-05, "loss": 0.5474, "step": 2144 }, { "epoch": 0.5429005315110099, "grad_norm": 0.1467544436454773, "learning_rate": 1.88261514753722e-05, "loss": 0.5493, "step": 2145 }, { "epoch": 0.5431536319919008, "grad_norm": 0.14689014852046967, "learning_rate": 1.8825025659565798e-05, "loss": 0.5416, "step": 2146 }, { "epoch": 0.5434067324727917, "grad_norm": 0.14642195403575897, "learning_rate": 1.8823899337839394e-05, "loss": 0.5426, "step": 2147 }, { "epoch": 0.5436598329536826, "grad_norm": 0.14804759621620178, "learning_rate": 1.8822772510257562e-05, "loss": 0.5637, "step": 2148 }, { "epoch": 0.5439129334345735, "grad_norm": 0.1435912698507309, "learning_rate": 1.8821645176884894e-05, "loss": 0.5184, "step": 2149 }, { "epoch": 0.5441660339154645, "grad_norm": 0.1469103991985321, "learning_rate": 1.8820517337786023e-05, "loss": 0.5412, "step": 2150 }, { "epoch": 0.5444191343963554, "grad_norm": 0.1398661583662033, "learning_rate": 1.8819388993025603e-05, "loss": 0.544, "step": 2151 }, { "epoch": 0.5446722348772463, "grad_norm": 0.15352042019367218, "learning_rate": 1.881826014266832e-05, "loss": 0.5487, "step": 2152 }, { "epoch": 0.5449253353581371, "grad_norm": 0.14718757569789886, "learning_rate": 1.8817130786778888e-05, "loss": 0.5265, "step": 2153 }, { "epoch": 0.5451784358390281, "grad_norm": 0.1449514478445053, "learning_rate": 1.8816000925422053e-05, "loss": 0.5498, "step": 2154 }, { "epoch": 0.545431536319919, "grad_norm": 0.14808648824691772, "learning_rate": 1.8814870558662582e-05, "loss": 0.5468, "step": 2155 }, { "epoch": 0.5456846368008099, "grad_norm": 0.14138783514499664, "learning_rate": 1.8813739686565285e-05, "loss": 0.5224, "step": 2156 }, { "epoch": 0.5459377372817008, "grad_norm": 0.14915712177753448, "learning_rate": 1.881260830919499e-05, "loss": 0.5228, "step": 2157 }, { "epoch": 0.5461908377625917, "grad_norm": 0.14445608854293823, "learning_rate": 1.881147642661655e-05, "loss": 0.5146, "step": 2158 }, { "epoch": 0.5464439382434827, "grad_norm": 0.14614073932170868, "learning_rate": 1.8810344038894857e-05, "loss": 0.5391, "step": 2159 }, { "epoch": 0.5466970387243736, "grad_norm": 0.14145520329475403, "learning_rate": 1.880921114609483e-05, "loss": 0.5195, "step": 2160 }, { "epoch": 0.5469501392052645, "grad_norm": 0.1484239399433136, "learning_rate": 1.8808077748281415e-05, "loss": 0.569, "step": 2161 }, { "epoch": 0.5472032396861554, "grad_norm": 0.14622029662132263, "learning_rate": 1.8806943845519586e-05, "loss": 0.526, "step": 2162 }, { "epoch": 0.5474563401670464, "grad_norm": 0.14384983479976654, "learning_rate": 1.8805809437874346e-05, "loss": 0.5245, "step": 2163 }, { "epoch": 0.5477094406479373, "grad_norm": 0.1476360708475113, "learning_rate": 1.8804674525410734e-05, "loss": 0.536, "step": 2164 }, { "epoch": 0.5479625411288281, "grad_norm": 0.14406946301460266, "learning_rate": 1.880353910819381e-05, "loss": 0.5128, "step": 2165 }, { "epoch": 0.548215641609719, "grad_norm": 0.14427940547466278, "learning_rate": 1.8802403186288657e-05, "loss": 0.5478, "step": 2166 }, { "epoch": 0.5484687420906099, "grad_norm": 0.14759938418865204, "learning_rate": 1.8801266759760408e-05, "loss": 0.5471, "step": 2167 }, { "epoch": 0.5487218425715009, "grad_norm": 0.1455715447664261, "learning_rate": 1.8800129828674202e-05, "loss": 0.5435, "step": 2168 }, { "epoch": 0.5489749430523918, "grad_norm": 0.14836999773979187, "learning_rate": 1.879899239309522e-05, "loss": 0.5447, "step": 2169 }, { "epoch": 0.5492280435332827, "grad_norm": 0.14490824937820435, "learning_rate": 1.879785445308867e-05, "loss": 0.5628, "step": 2170 }, { "epoch": 0.5494811440141736, "grad_norm": 0.14529091119766235, "learning_rate": 1.8796716008719784e-05, "loss": 0.542, "step": 2171 }, { "epoch": 0.5497342444950646, "grad_norm": 0.14771731197834015, "learning_rate": 1.879557706005383e-05, "loss": 0.5596, "step": 2172 }, { "epoch": 0.5499873449759555, "grad_norm": 0.1474103182554245, "learning_rate": 1.8794437607156105e-05, "loss": 0.5354, "step": 2173 }, { "epoch": 0.5502404454568464, "grad_norm": 0.14134001731872559, "learning_rate": 1.8793297650091922e-05, "loss": 0.5248, "step": 2174 }, { "epoch": 0.5504935459377372, "grad_norm": 0.1465873420238495, "learning_rate": 1.8792157188926642e-05, "loss": 0.5564, "step": 2175 }, { "epoch": 0.5507466464186282, "grad_norm": 0.14788149297237396, "learning_rate": 1.8791016223725634e-05, "loss": 0.5726, "step": 2176 }, { "epoch": 0.5509997468995191, "grad_norm": 0.15029706060886383, "learning_rate": 1.878987475455432e-05, "loss": 0.5511, "step": 2177 }, { "epoch": 0.55125284738041, "grad_norm": 0.14880117774009705, "learning_rate": 1.878873278147813e-05, "loss": 0.5333, "step": 2178 }, { "epoch": 0.5515059478613009, "grad_norm": 0.1465713381767273, "learning_rate": 1.878759030456253e-05, "loss": 0.5935, "step": 2179 }, { "epoch": 0.5517590483421918, "grad_norm": 0.14811015129089355, "learning_rate": 1.878644732387302e-05, "loss": 0.5665, "step": 2180 }, { "epoch": 0.5520121488230828, "grad_norm": 0.14152702689170837, "learning_rate": 1.8785303839475126e-05, "loss": 0.5455, "step": 2181 }, { "epoch": 0.5522652493039737, "grad_norm": 0.14692920446395874, "learning_rate": 1.8784159851434395e-05, "loss": 0.5515, "step": 2182 }, { "epoch": 0.5525183497848646, "grad_norm": 0.14347903430461884, "learning_rate": 1.8783015359816415e-05, "loss": 0.5495, "step": 2183 }, { "epoch": 0.5527714502657555, "grad_norm": 0.1463506817817688, "learning_rate": 1.8781870364686795e-05, "loss": 0.525, "step": 2184 }, { "epoch": 0.5530245507466465, "grad_norm": 0.14531154930591583, "learning_rate": 1.8780724866111175e-05, "loss": 0.51, "step": 2185 }, { "epoch": 0.5532776512275374, "grad_norm": 0.1414245069026947, "learning_rate": 1.8779578864155222e-05, "loss": 0.5449, "step": 2186 }, { "epoch": 0.5535307517084282, "grad_norm": 0.15202537178993225, "learning_rate": 1.877843235888464e-05, "loss": 0.5324, "step": 2187 }, { "epoch": 0.5537838521893191, "grad_norm": 0.14601637423038483, "learning_rate": 1.8777285350365153e-05, "loss": 0.5444, "step": 2188 }, { "epoch": 0.55403695267021, "grad_norm": 0.14495153725147247, "learning_rate": 1.8776137838662513e-05, "loss": 0.5449, "step": 2189 }, { "epoch": 0.554290053151101, "grad_norm": 0.14015650749206543, "learning_rate": 1.8774989823842505e-05, "loss": 0.5079, "step": 2190 }, { "epoch": 0.5545431536319919, "grad_norm": 0.1459050178527832, "learning_rate": 1.877384130597095e-05, "loss": 0.5518, "step": 2191 }, { "epoch": 0.5547962541128828, "grad_norm": 0.14173124730587006, "learning_rate": 1.8772692285113684e-05, "loss": 0.5035, "step": 2192 }, { "epoch": 0.5550493545937737, "grad_norm": 0.14195701479911804, "learning_rate": 1.8771542761336574e-05, "loss": 0.5648, "step": 2193 }, { "epoch": 0.5553024550746647, "grad_norm": 0.15466424822807312, "learning_rate": 1.8770392734705523e-05, "loss": 0.5433, "step": 2194 }, { "epoch": 0.5555555555555556, "grad_norm": 0.14722977578639984, "learning_rate": 1.8769242205286463e-05, "loss": 0.5526, "step": 2195 }, { "epoch": 0.5558086560364465, "grad_norm": 0.1462264060974121, "learning_rate": 1.876809117314535e-05, "loss": 0.5403, "step": 2196 }, { "epoch": 0.5560617565173374, "grad_norm": 0.14811238646507263, "learning_rate": 1.876693963834817e-05, "loss": 0.559, "step": 2197 }, { "epoch": 0.5563148569982282, "grad_norm": 0.14408525824546814, "learning_rate": 1.8765787600960935e-05, "loss": 0.5405, "step": 2198 }, { "epoch": 0.5565679574791192, "grad_norm": 0.1455586701631546, "learning_rate": 1.8764635061049695e-05, "loss": 0.5599, "step": 2199 }, { "epoch": 0.5568210579600101, "grad_norm": 0.15099912881851196, "learning_rate": 1.8763482018680517e-05, "loss": 0.526, "step": 2200 }, { "epoch": 0.557074158440901, "grad_norm": 0.14662666618824005, "learning_rate": 1.8762328473919503e-05, "loss": 0.5466, "step": 2201 }, { "epoch": 0.5573272589217919, "grad_norm": 0.1466791331768036, "learning_rate": 1.8761174426832785e-05, "loss": 0.5418, "step": 2202 }, { "epoch": 0.5575803594026829, "grad_norm": 0.14384949207305908, "learning_rate": 1.876001987748652e-05, "loss": 0.4963, "step": 2203 }, { "epoch": 0.5578334598835738, "grad_norm": 0.1430201381444931, "learning_rate": 1.87588648259469e-05, "loss": 0.5282, "step": 2204 }, { "epoch": 0.5580865603644647, "grad_norm": 0.1408693492412567, "learning_rate": 1.875770927228014e-05, "loss": 0.549, "step": 2205 }, { "epoch": 0.5583396608453556, "grad_norm": 0.14347919821739197, "learning_rate": 1.8756553216552485e-05, "loss": 0.5466, "step": 2206 }, { "epoch": 0.5585927613262465, "grad_norm": 0.14288023114204407, "learning_rate": 1.8755396658830205e-05, "loss": 0.5286, "step": 2207 }, { "epoch": 0.5588458618071375, "grad_norm": 0.1465696394443512, "learning_rate": 1.875423959917961e-05, "loss": 0.5712, "step": 2208 }, { "epoch": 0.5590989622880284, "grad_norm": 0.15595479309558868, "learning_rate": 1.875308203766703e-05, "loss": 0.518, "step": 2209 }, { "epoch": 0.5593520627689192, "grad_norm": 0.14882028102874756, "learning_rate": 1.875192397435882e-05, "loss": 0.5272, "step": 2210 }, { "epoch": 0.5596051632498101, "grad_norm": 0.148061141371727, "learning_rate": 1.8750765409321375e-05, "loss": 0.5247, "step": 2211 }, { "epoch": 0.5598582637307011, "grad_norm": 0.145848348736763, "learning_rate": 1.874960634262111e-05, "loss": 0.5299, "step": 2212 }, { "epoch": 0.560111364211592, "grad_norm": 0.1456480175256729, "learning_rate": 1.8748446774324477e-05, "loss": 0.5334, "step": 2213 }, { "epoch": 0.5603644646924829, "grad_norm": 0.14741340279579163, "learning_rate": 1.8747286704497947e-05, "loss": 0.4874, "step": 2214 }, { "epoch": 0.5606175651733738, "grad_norm": 0.14687207341194153, "learning_rate": 1.874612613320802e-05, "loss": 0.5166, "step": 2215 }, { "epoch": 0.5608706656542647, "grad_norm": 0.1464032232761383, "learning_rate": 1.8744965060521245e-05, "loss": 0.554, "step": 2216 }, { "epoch": 0.5611237661351557, "grad_norm": 0.16772732138633728, "learning_rate": 1.8743803486504168e-05, "loss": 0.5445, "step": 2217 }, { "epoch": 0.5613768666160466, "grad_norm": 0.1479538083076477, "learning_rate": 1.874264141122338e-05, "loss": 0.5392, "step": 2218 }, { "epoch": 0.5616299670969375, "grad_norm": 0.1854136735200882, "learning_rate": 1.874147883474551e-05, "loss": 0.5588, "step": 2219 }, { "epoch": 0.5618830675778284, "grad_norm": 0.14578451216220856, "learning_rate": 1.8740315757137204e-05, "loss": 0.5041, "step": 2220 }, { "epoch": 0.5621361680587194, "grad_norm": 0.1439344435930252, "learning_rate": 1.8739152178465133e-05, "loss": 0.5386, "step": 2221 }, { "epoch": 0.5623892685396102, "grad_norm": 0.14381521940231323, "learning_rate": 1.8737988098796e-05, "loss": 0.5571, "step": 2222 }, { "epoch": 0.5626423690205011, "grad_norm": 0.15286916494369507, "learning_rate": 1.8736823518196556e-05, "loss": 0.5573, "step": 2223 }, { "epoch": 0.562895469501392, "grad_norm": 0.15535466372966766, "learning_rate": 1.8735658436733545e-05, "loss": 0.5573, "step": 2224 }, { "epoch": 0.563148569982283, "grad_norm": 0.14613652229309082, "learning_rate": 1.8734492854473766e-05, "loss": 0.5397, "step": 2225 }, { "epoch": 0.5634016704631739, "grad_norm": 0.1451738327741623, "learning_rate": 1.8733326771484043e-05, "loss": 0.5629, "step": 2226 }, { "epoch": 0.5636547709440648, "grad_norm": 0.14860254526138306, "learning_rate": 1.873216018783122e-05, "loss": 0.5463, "step": 2227 }, { "epoch": 0.5639078714249557, "grad_norm": 0.15874744951725006, "learning_rate": 1.8730993103582177e-05, "loss": 0.5412, "step": 2228 }, { "epoch": 0.5641609719058466, "grad_norm": 0.1528647392988205, "learning_rate": 1.8729825518803818e-05, "loss": 0.5532, "step": 2229 }, { "epoch": 0.5644140723867376, "grad_norm": 0.16232876479625702, "learning_rate": 1.8728657433563082e-05, "loss": 0.5397, "step": 2230 }, { "epoch": 0.5646671728676285, "grad_norm": 0.1460360735654831, "learning_rate": 1.872748884792693e-05, "loss": 0.5474, "step": 2231 }, { "epoch": 0.5649202733485194, "grad_norm": 0.14589981734752655, "learning_rate": 1.8726319761962358e-05, "loss": 0.5209, "step": 2232 }, { "epoch": 0.5651733738294102, "grad_norm": 0.14501315355300903, "learning_rate": 1.8725150175736383e-05, "loss": 0.5467, "step": 2233 }, { "epoch": 0.5654264743103012, "grad_norm": 0.150316059589386, "learning_rate": 1.8723980089316054e-05, "loss": 0.5632, "step": 2234 }, { "epoch": 0.5656795747911921, "grad_norm": 0.14659665524959564, "learning_rate": 1.8722809502768455e-05, "loss": 0.5366, "step": 2235 }, { "epoch": 0.565932675272083, "grad_norm": 0.14401592314243317, "learning_rate": 1.872163841616069e-05, "loss": 0.534, "step": 2236 }, { "epoch": 0.5661857757529739, "grad_norm": 0.14456796646118164, "learning_rate": 1.8720466829559896e-05, "loss": 0.5673, "step": 2237 }, { "epoch": 0.5664388762338648, "grad_norm": 0.1423196941614151, "learning_rate": 1.8719294743033235e-05, "loss": 0.5155, "step": 2238 }, { "epoch": 0.5666919767147558, "grad_norm": 0.1501038521528244, "learning_rate": 1.8718122156647905e-05, "loss": 0.5379, "step": 2239 }, { "epoch": 0.5669450771956467, "grad_norm": 0.14496496319770813, "learning_rate": 1.8716949070471123e-05, "loss": 0.5323, "step": 2240 }, { "epoch": 0.5671981776765376, "grad_norm": 0.18220806121826172, "learning_rate": 1.871577548457014e-05, "loss": 0.5159, "step": 2241 }, { "epoch": 0.5674512781574285, "grad_norm": 0.14391760528087616, "learning_rate": 1.871460139901224e-05, "loss": 0.5315, "step": 2242 }, { "epoch": 0.5677043786383195, "grad_norm": 0.14471866190433502, "learning_rate": 1.8713426813864727e-05, "loss": 0.5259, "step": 2243 }, { "epoch": 0.5679574791192104, "grad_norm": 0.15548956394195557, "learning_rate": 1.8712251729194936e-05, "loss": 0.5802, "step": 2244 }, { "epoch": 0.5682105796001012, "grad_norm": 0.14285972714424133, "learning_rate": 1.8711076145070234e-05, "loss": 0.5192, "step": 2245 }, { "epoch": 0.5684636800809921, "grad_norm": 0.14926937222480774, "learning_rate": 1.8709900061558016e-05, "loss": 0.5381, "step": 2246 }, { "epoch": 0.568716780561883, "grad_norm": 0.14849525690078735, "learning_rate": 1.8708723478725704e-05, "loss": 0.5575, "step": 2247 }, { "epoch": 0.568969881042774, "grad_norm": 0.14708726108074188, "learning_rate": 1.870754639664075e-05, "loss": 0.5146, "step": 2248 }, { "epoch": 0.5692229815236649, "grad_norm": 0.1492762565612793, "learning_rate": 1.870636881537063e-05, "loss": 0.5367, "step": 2249 }, { "epoch": 0.5694760820045558, "grad_norm": 0.1519898623228073, "learning_rate": 1.8705190734982853e-05, "loss": 0.5548, "step": 2250 }, { "epoch": 0.5697291824854467, "grad_norm": 0.1466929167509079, "learning_rate": 1.8704012155544958e-05, "loss": 0.5345, "step": 2251 }, { "epoch": 0.5699822829663377, "grad_norm": 0.1472339779138565, "learning_rate": 1.8702833077124508e-05, "loss": 0.5272, "step": 2252 }, { "epoch": 0.5702353834472286, "grad_norm": 0.13981996476650238, "learning_rate": 1.87016534997891e-05, "loss": 0.5352, "step": 2253 }, { "epoch": 0.5704884839281195, "grad_norm": 0.1445668786764145, "learning_rate": 1.8700473423606355e-05, "loss": 0.5247, "step": 2254 }, { "epoch": 0.5707415844090103, "grad_norm": 0.1454024463891983, "learning_rate": 1.8699292848643926e-05, "loss": 0.5476, "step": 2255 }, { "epoch": 0.5709946848899012, "grad_norm": 0.15462176501750946, "learning_rate": 1.8698111774969488e-05, "loss": 0.541, "step": 2256 }, { "epoch": 0.5712477853707922, "grad_norm": 0.1474360227584839, "learning_rate": 1.8696930202650755e-05, "loss": 0.5543, "step": 2257 }, { "epoch": 0.5715008858516831, "grad_norm": 0.14274410903453827, "learning_rate": 1.8695748131755463e-05, "loss": 0.5589, "step": 2258 }, { "epoch": 0.571753986332574, "grad_norm": 0.14441858232021332, "learning_rate": 1.8694565562351374e-05, "loss": 0.5442, "step": 2259 }, { "epoch": 0.5720070868134649, "grad_norm": 0.1430259644985199, "learning_rate": 1.869338249450629e-05, "loss": 0.5226, "step": 2260 }, { "epoch": 0.5722601872943559, "grad_norm": 0.14691650867462158, "learning_rate": 1.869219892828802e-05, "loss": 0.5594, "step": 2261 }, { "epoch": 0.5725132877752468, "grad_norm": 0.15393652021884918, "learning_rate": 1.8691014863764426e-05, "loss": 0.5436, "step": 2262 }, { "epoch": 0.5727663882561377, "grad_norm": 0.1516573429107666, "learning_rate": 1.8689830301003387e-05, "loss": 0.5349, "step": 2263 }, { "epoch": 0.5730194887370286, "grad_norm": 0.147565558552742, "learning_rate": 1.8688645240072813e-05, "loss": 0.5319, "step": 2264 }, { "epoch": 0.5732725892179195, "grad_norm": 0.14211657643318176, "learning_rate": 1.8687459681040634e-05, "loss": 0.504, "step": 2265 }, { "epoch": 0.5735256896988105, "grad_norm": 0.15049326419830322, "learning_rate": 1.868627362397482e-05, "loss": 0.5531, "step": 2266 }, { "epoch": 0.5737787901797013, "grad_norm": 0.14671093225479126, "learning_rate": 1.8685087068943365e-05, "loss": 0.5268, "step": 2267 }, { "epoch": 0.5740318906605922, "grad_norm": 0.1650523841381073, "learning_rate": 1.8683900016014292e-05, "loss": 0.5423, "step": 2268 }, { "epoch": 0.5742849911414831, "grad_norm": 0.1483929306268692, "learning_rate": 1.868271246525565e-05, "loss": 0.5411, "step": 2269 }, { "epoch": 0.5745380916223741, "grad_norm": 0.14577732980251312, "learning_rate": 1.8681524416735524e-05, "loss": 0.513, "step": 2270 }, { "epoch": 0.574791192103265, "grad_norm": 0.14955361187458038, "learning_rate": 1.868033587052202e-05, "loss": 0.5267, "step": 2271 }, { "epoch": 0.5750442925841559, "grad_norm": 0.1502561718225479, "learning_rate": 1.867914682668327e-05, "loss": 0.5302, "step": 2272 }, { "epoch": 0.5752973930650468, "grad_norm": 0.1462802290916443, "learning_rate": 1.867795728528744e-05, "loss": 0.5489, "step": 2273 }, { "epoch": 0.5755504935459378, "grad_norm": 0.1433577835559845, "learning_rate": 1.8676767246402735e-05, "loss": 0.5323, "step": 2274 }, { "epoch": 0.5758035940268287, "grad_norm": 0.1570679247379303, "learning_rate": 1.8675576710097365e-05, "loss": 0.5642, "step": 2275 }, { "epoch": 0.5760566945077196, "grad_norm": 0.14700014889240265, "learning_rate": 1.8674385676439584e-05, "loss": 0.5428, "step": 2276 }, { "epoch": 0.5763097949886105, "grad_norm": 0.14331628382205963, "learning_rate": 1.8673194145497675e-05, "loss": 0.5405, "step": 2277 }, { "epoch": 0.5765628954695013, "grad_norm": 0.1505105346441269, "learning_rate": 1.8672002117339944e-05, "loss": 0.5564, "step": 2278 }, { "epoch": 0.5768159959503923, "grad_norm": 0.1457102745771408, "learning_rate": 1.8670809592034724e-05, "loss": 0.5481, "step": 2279 }, { "epoch": 0.5770690964312832, "grad_norm": 0.15212388336658478, "learning_rate": 1.8669616569650384e-05, "loss": 0.5633, "step": 2280 }, { "epoch": 0.5773221969121741, "grad_norm": 0.14920096099376678, "learning_rate": 1.8668423050255317e-05, "loss": 0.5369, "step": 2281 }, { "epoch": 0.577575297393065, "grad_norm": 0.14896419644355774, "learning_rate": 1.8667229033917946e-05, "loss": 0.5323, "step": 2282 }, { "epoch": 0.577828397873956, "grad_norm": 0.14505890011787415, "learning_rate": 1.8666034520706718e-05, "loss": 0.5552, "step": 2283 }, { "epoch": 0.5780814983548469, "grad_norm": 0.14572517573833466, "learning_rate": 1.8664839510690114e-05, "loss": 0.5367, "step": 2284 }, { "epoch": 0.5783345988357378, "grad_norm": 0.13905175030231476, "learning_rate": 1.8663644003936642e-05, "loss": 0.5251, "step": 2285 }, { "epoch": 0.5785876993166287, "grad_norm": 0.1470813900232315, "learning_rate": 1.8662448000514834e-05, "loss": 0.54, "step": 2286 }, { "epoch": 0.5788407997975196, "grad_norm": 0.1497400999069214, "learning_rate": 1.866125150049326e-05, "loss": 0.5415, "step": 2287 }, { "epoch": 0.5790939002784106, "grad_norm": 0.15024422109127045, "learning_rate": 1.866005450394051e-05, "loss": 0.5501, "step": 2288 }, { "epoch": 0.5793470007593015, "grad_norm": 0.14332029223442078, "learning_rate": 1.8658857010925205e-05, "loss": 0.5296, "step": 2289 }, { "epoch": 0.5796001012401923, "grad_norm": 0.14805105328559875, "learning_rate": 1.8657659021515996e-05, "loss": 0.5874, "step": 2290 }, { "epoch": 0.5798532017210832, "grad_norm": 0.14845344424247742, "learning_rate": 1.8656460535781557e-05, "loss": 0.5426, "step": 2291 }, { "epoch": 0.5801063022019742, "grad_norm": 0.15134577453136444, "learning_rate": 1.86552615537906e-05, "loss": 0.5523, "step": 2292 }, { "epoch": 0.5803594026828651, "grad_norm": 0.14015473425388336, "learning_rate": 1.8654062075611857e-05, "loss": 0.5131, "step": 2293 }, { "epoch": 0.580612503163756, "grad_norm": 0.16274476051330566, "learning_rate": 1.8652862101314097e-05, "loss": 0.5529, "step": 2294 }, { "epoch": 0.5808656036446469, "grad_norm": 0.15004579722881317, "learning_rate": 1.86516616309661e-05, "loss": 0.5347, "step": 2295 }, { "epoch": 0.5811187041255378, "grad_norm": 0.14771991968154907, "learning_rate": 1.8650460664636702e-05, "loss": 0.5313, "step": 2296 }, { "epoch": 0.5813718046064288, "grad_norm": 0.15148195624351501, "learning_rate": 1.864925920239474e-05, "loss": 0.5418, "step": 2297 }, { "epoch": 0.5816249050873197, "grad_norm": 0.14730021357536316, "learning_rate": 1.8648057244309094e-05, "loss": 0.5132, "step": 2298 }, { "epoch": 0.5818780055682106, "grad_norm": 0.15396273136138916, "learning_rate": 1.8646854790448673e-05, "loss": 0.5329, "step": 2299 }, { "epoch": 0.5821311060491015, "grad_norm": 0.14523978531360626, "learning_rate": 1.8645651840882407e-05, "loss": 0.5529, "step": 2300 }, { "epoch": 0.5823842065299925, "grad_norm": 0.1450643390417099, "learning_rate": 1.8644448395679263e-05, "loss": 0.5157, "step": 2301 }, { "epoch": 0.5826373070108833, "grad_norm": 0.1512904018163681, "learning_rate": 1.8643244454908224e-05, "loss": 0.5279, "step": 2302 }, { "epoch": 0.5828904074917742, "grad_norm": 0.15034013986587524, "learning_rate": 1.864204001863832e-05, "loss": 0.5261, "step": 2303 }, { "epoch": 0.5831435079726651, "grad_norm": 0.14855656027793884, "learning_rate": 1.8640835086938593e-05, "loss": 0.5558, "step": 2304 }, { "epoch": 0.583396608453556, "grad_norm": 0.14781597256660461, "learning_rate": 1.863962965987812e-05, "loss": 0.5356, "step": 2305 }, { "epoch": 0.583649708934447, "grad_norm": 0.14822877943515778, "learning_rate": 1.8638423737526004e-05, "loss": 0.5268, "step": 2306 }, { "epoch": 0.5839028094153379, "grad_norm": 0.14480765163898468, "learning_rate": 1.863721731995138e-05, "loss": 0.5303, "step": 2307 }, { "epoch": 0.5841559098962288, "grad_norm": 0.1457347571849823, "learning_rate": 1.863601040722341e-05, "loss": 0.5483, "step": 2308 }, { "epoch": 0.5844090103771197, "grad_norm": 0.150473952293396, "learning_rate": 1.863480299941128e-05, "loss": 0.5518, "step": 2309 }, { "epoch": 0.5846621108580107, "grad_norm": 0.1446637511253357, "learning_rate": 1.863359509658421e-05, "loss": 0.5555, "step": 2310 }, { "epoch": 0.5849152113389016, "grad_norm": 0.15238513052463531, "learning_rate": 1.8632386698811448e-05, "loss": 0.5533, "step": 2311 }, { "epoch": 0.5851683118197925, "grad_norm": 0.14772778749465942, "learning_rate": 1.863117780616227e-05, "loss": 0.5573, "step": 2312 }, { "epoch": 0.5854214123006833, "grad_norm": 0.15186363458633423, "learning_rate": 1.8629968418705976e-05, "loss": 0.5362, "step": 2313 }, { "epoch": 0.5856745127815742, "grad_norm": 0.16332495212554932, "learning_rate": 1.8628758536511897e-05, "loss": 0.5473, "step": 2314 }, { "epoch": 0.5859276132624652, "grad_norm": 0.16811923682689667, "learning_rate": 1.8627548159649395e-05, "loss": 0.5479, "step": 2315 }, { "epoch": 0.5861807137433561, "grad_norm": 0.14447632431983948, "learning_rate": 1.8626337288187862e-05, "loss": 0.5457, "step": 2316 }, { "epoch": 0.586433814224247, "grad_norm": 0.1498802751302719, "learning_rate": 1.8625125922196707e-05, "loss": 0.5309, "step": 2317 }, { "epoch": 0.5866869147051379, "grad_norm": 0.14454385638237, "learning_rate": 1.862391406174538e-05, "loss": 0.5439, "step": 2318 }, { "epoch": 0.5869400151860289, "grad_norm": 0.1505521684885025, "learning_rate": 1.8622701706903354e-05, "loss": 0.5367, "step": 2319 }, { "epoch": 0.5871931156669198, "grad_norm": 0.14599654078483582, "learning_rate": 1.862148885774013e-05, "loss": 0.5464, "step": 2320 }, { "epoch": 0.5874462161478107, "grad_norm": 0.14601285755634308, "learning_rate": 1.8620275514325236e-05, "loss": 0.5436, "step": 2321 }, { "epoch": 0.5876993166287016, "grad_norm": 0.14640100300312042, "learning_rate": 1.8619061676728235e-05, "loss": 0.5207, "step": 2322 }, { "epoch": 0.5879524171095926, "grad_norm": 0.14489838480949402, "learning_rate": 1.861784734501871e-05, "loss": 0.5487, "step": 2323 }, { "epoch": 0.5882055175904835, "grad_norm": 0.14972306787967682, "learning_rate": 1.861663251926628e-05, "loss": 0.5453, "step": 2324 }, { "epoch": 0.5884586180713743, "grad_norm": 0.14117872714996338, "learning_rate": 1.861541719954058e-05, "loss": 0.5155, "step": 2325 }, { "epoch": 0.5887117185522652, "grad_norm": 0.1484600156545639, "learning_rate": 1.8614201385911292e-05, "loss": 0.5498, "step": 2326 }, { "epoch": 0.5889648190331561, "grad_norm": 0.15175874531269073, "learning_rate": 1.861298507844811e-05, "loss": 0.528, "step": 2327 }, { "epoch": 0.5892179195140471, "grad_norm": 0.1560920774936676, "learning_rate": 1.8611768277220764e-05, "loss": 0.5315, "step": 2328 }, { "epoch": 0.589471019994938, "grad_norm": 0.14526920020580292, "learning_rate": 1.8610550982299007e-05, "loss": 0.5317, "step": 2329 }, { "epoch": 0.5897241204758289, "grad_norm": 0.15285490453243256, "learning_rate": 1.860933319375263e-05, "loss": 0.5376, "step": 2330 }, { "epoch": 0.5899772209567198, "grad_norm": 0.14467842876911163, "learning_rate": 1.860811491165144e-05, "loss": 0.514, "step": 2331 }, { "epoch": 0.5902303214376108, "grad_norm": 0.14383679628372192, "learning_rate": 1.8606896136065288e-05, "loss": 0.5353, "step": 2332 }, { "epoch": 0.5904834219185017, "grad_norm": 0.14506690204143524, "learning_rate": 1.8605676867064034e-05, "loss": 0.5497, "step": 2333 }, { "epoch": 0.5907365223993926, "grad_norm": 0.1569226235151291, "learning_rate": 1.8604457104717582e-05, "loss": 0.546, "step": 2334 }, { "epoch": 0.5909896228802834, "grad_norm": 0.14538037776947021, "learning_rate": 1.860323684909586e-05, "loss": 0.5406, "step": 2335 }, { "epoch": 0.5912427233611743, "grad_norm": 0.14282667636871338, "learning_rate": 1.8602016100268812e-05, "loss": 0.511, "step": 2336 }, { "epoch": 0.5914958238420653, "grad_norm": 0.14653204381465912, "learning_rate": 1.860079485830643e-05, "loss": 0.5259, "step": 2337 }, { "epoch": 0.5917489243229562, "grad_norm": 0.14614234864711761, "learning_rate": 1.8599573123278724e-05, "loss": 0.5176, "step": 2338 }, { "epoch": 0.5920020248038471, "grad_norm": 0.14530158042907715, "learning_rate": 1.8598350895255734e-05, "loss": 0.5582, "step": 2339 }, { "epoch": 0.592255125284738, "grad_norm": 0.1459869146347046, "learning_rate": 1.8597128174307524e-05, "loss": 0.5513, "step": 2340 }, { "epoch": 0.592508225765629, "grad_norm": 0.14628392457962036, "learning_rate": 1.859590496050419e-05, "loss": 0.5419, "step": 2341 }, { "epoch": 0.5927613262465199, "grad_norm": 0.15488635003566742, "learning_rate": 1.8594681253915864e-05, "loss": 0.5631, "step": 2342 }, { "epoch": 0.5930144267274108, "grad_norm": 0.15236783027648926, "learning_rate": 1.859345705461269e-05, "loss": 0.5406, "step": 2343 }, { "epoch": 0.5932675272083017, "grad_norm": 0.14610011875629425, "learning_rate": 1.8592232362664853e-05, "loss": 0.5398, "step": 2344 }, { "epoch": 0.5935206276891926, "grad_norm": 0.15310116112232208, "learning_rate": 1.859100717814256e-05, "loss": 0.538, "step": 2345 }, { "epoch": 0.5937737281700836, "grad_norm": 0.14699193835258484, "learning_rate": 1.858978150111605e-05, "loss": 0.4988, "step": 2346 }, { "epoch": 0.5940268286509744, "grad_norm": 0.1398555487394333, "learning_rate": 1.858855533165559e-05, "loss": 0.5139, "step": 2347 }, { "epoch": 0.5942799291318653, "grad_norm": 0.14508651196956635, "learning_rate": 1.8587328669831467e-05, "loss": 0.5433, "step": 2348 }, { "epoch": 0.5945330296127562, "grad_norm": 0.1478363573551178, "learning_rate": 1.858610151571401e-05, "loss": 0.5223, "step": 2349 }, { "epoch": 0.5947861300936472, "grad_norm": 0.14768554270267487, "learning_rate": 1.858487386937356e-05, "loss": 0.5343, "step": 2350 }, { "epoch": 0.5950392305745381, "grad_norm": 0.14980356395244598, "learning_rate": 1.8583645730880508e-05, "loss": 0.5467, "step": 2351 }, { "epoch": 0.595292331055429, "grad_norm": 0.14696039259433746, "learning_rate": 1.8582417100305252e-05, "loss": 0.555, "step": 2352 }, { "epoch": 0.5955454315363199, "grad_norm": 0.17784389853477478, "learning_rate": 1.858118797771823e-05, "loss": 0.5386, "step": 2353 }, { "epoch": 0.5957985320172108, "grad_norm": 0.14381876587867737, "learning_rate": 1.8579958363189906e-05, "loss": 0.5349, "step": 2354 }, { "epoch": 0.5960516324981018, "grad_norm": 0.15086035430431366, "learning_rate": 1.857872825679077e-05, "loss": 0.5461, "step": 2355 }, { "epoch": 0.5963047329789927, "grad_norm": 0.15046915411949158, "learning_rate": 1.8577497658591333e-05, "loss": 0.5328, "step": 2356 }, { "epoch": 0.5965578334598836, "grad_norm": 0.14956995844841003, "learning_rate": 1.8576266568662156e-05, "loss": 0.5735, "step": 2357 }, { "epoch": 0.5968109339407744, "grad_norm": 0.1443953663110733, "learning_rate": 1.8575034987073806e-05, "loss": 0.5291, "step": 2358 }, { "epoch": 0.5970640344216654, "grad_norm": 0.1495794653892517, "learning_rate": 1.8573802913896895e-05, "loss": 0.5279, "step": 2359 }, { "epoch": 0.5973171349025563, "grad_norm": 0.14987659454345703, "learning_rate": 1.8572570349202045e-05, "loss": 0.5253, "step": 2360 }, { "epoch": 0.5975702353834472, "grad_norm": 0.14494718611240387, "learning_rate": 1.8571337293059923e-05, "loss": 0.5352, "step": 2361 }, { "epoch": 0.5978233358643381, "grad_norm": 0.14781028032302856, "learning_rate": 1.857010374554122e-05, "loss": 0.5185, "step": 2362 }, { "epoch": 0.598076436345229, "grad_norm": 0.14690136909484863, "learning_rate": 1.856886970671664e-05, "loss": 0.5433, "step": 2363 }, { "epoch": 0.59832953682612, "grad_norm": 0.14500904083251953, "learning_rate": 1.8567635176656945e-05, "loss": 0.5261, "step": 2364 }, { "epoch": 0.5985826373070109, "grad_norm": 0.14209990203380585, "learning_rate": 1.8566400155432893e-05, "loss": 0.5517, "step": 2365 }, { "epoch": 0.5988357377879018, "grad_norm": 0.1500253528356552, "learning_rate": 1.8565164643115296e-05, "loss": 0.5416, "step": 2366 }, { "epoch": 0.5990888382687927, "grad_norm": 0.14733339846134186, "learning_rate": 1.8563928639774976e-05, "loss": 0.5387, "step": 2367 }, { "epoch": 0.5993419387496837, "grad_norm": 0.14200736582279205, "learning_rate": 1.8562692145482793e-05, "loss": 0.5811, "step": 2368 }, { "epoch": 0.5995950392305746, "grad_norm": 0.14497509598731995, "learning_rate": 1.8561455160309634e-05, "loss": 0.5104, "step": 2369 }, { "epoch": 0.5998481397114654, "grad_norm": 0.13897013664245605, "learning_rate": 1.856021768432641e-05, "loss": 0.5241, "step": 2370 }, { "epoch": 0.6001012401923563, "grad_norm": 0.14394894242286682, "learning_rate": 1.8558979717604065e-05, "loss": 0.5312, "step": 2371 }, { "epoch": 0.6003543406732473, "grad_norm": 0.14755356311798096, "learning_rate": 1.8557741260213572e-05, "loss": 0.5343, "step": 2372 }, { "epoch": 0.6006074411541382, "grad_norm": 0.15196183323860168, "learning_rate": 1.8556502312225922e-05, "loss": 0.568, "step": 2373 }, { "epoch": 0.6008605416350291, "grad_norm": 0.14456459879875183, "learning_rate": 1.8555262873712145e-05, "loss": 0.5162, "step": 2374 }, { "epoch": 0.60111364211592, "grad_norm": 0.15117591619491577, "learning_rate": 1.8554022944743296e-05, "loss": 0.5446, "step": 2375 }, { "epoch": 0.6013667425968109, "grad_norm": 0.14727413654327393, "learning_rate": 1.8552782525390458e-05, "loss": 0.5459, "step": 2376 }, { "epoch": 0.6016198430777019, "grad_norm": 0.14920492470264435, "learning_rate": 1.855154161572474e-05, "loss": 0.5383, "step": 2377 }, { "epoch": 0.6018729435585928, "grad_norm": 0.1506134271621704, "learning_rate": 1.8550300215817277e-05, "loss": 0.55, "step": 2378 }, { "epoch": 0.6021260440394837, "grad_norm": 0.14898306131362915, "learning_rate": 1.8549058325739246e-05, "loss": 0.5564, "step": 2379 }, { "epoch": 0.6023791445203746, "grad_norm": 0.14046598970890045, "learning_rate": 1.8547815945561832e-05, "loss": 0.5334, "step": 2380 }, { "epoch": 0.6026322450012656, "grad_norm": 0.14773893356323242, "learning_rate": 1.8546573075356265e-05, "loss": 0.5528, "step": 2381 }, { "epoch": 0.6028853454821564, "grad_norm": 0.14395421743392944, "learning_rate": 1.8545329715193786e-05, "loss": 0.5663, "step": 2382 }, { "epoch": 0.6031384459630473, "grad_norm": 0.156617671251297, "learning_rate": 1.854408586514569e-05, "loss": 0.5349, "step": 2383 }, { "epoch": 0.6033915464439382, "grad_norm": 0.15281963348388672, "learning_rate": 1.8542841525283268e-05, "loss": 0.5849, "step": 2384 }, { "epoch": 0.6036446469248291, "grad_norm": 0.1514674872159958, "learning_rate": 1.8541596695677868e-05, "loss": 0.5624, "step": 2385 }, { "epoch": 0.6038977474057201, "grad_norm": 0.1519140750169754, "learning_rate": 1.8540351376400848e-05, "loss": 0.5502, "step": 2386 }, { "epoch": 0.604150847886611, "grad_norm": 0.14331114292144775, "learning_rate": 1.8539105567523598e-05, "loss": 0.5499, "step": 2387 }, { "epoch": 0.6044039483675019, "grad_norm": 0.1429108828306198, "learning_rate": 1.8537859269117538e-05, "loss": 0.5287, "step": 2388 }, { "epoch": 0.6046570488483928, "grad_norm": 0.1414855569601059, "learning_rate": 1.8536612481254116e-05, "loss": 0.5485, "step": 2389 }, { "epoch": 0.6049101493292838, "grad_norm": 0.15185344219207764, "learning_rate": 1.8535365204004815e-05, "loss": 0.5415, "step": 2390 }, { "epoch": 0.6051632498101747, "grad_norm": 0.14885537326335907, "learning_rate": 1.8534117437441125e-05, "loss": 0.5474, "step": 2391 }, { "epoch": 0.6054163502910656, "grad_norm": 0.14955249428749084, "learning_rate": 1.853286918163459e-05, "loss": 0.5135, "step": 2392 }, { "epoch": 0.6056694507719564, "grad_norm": 0.1478264480829239, "learning_rate": 1.8531620436656762e-05, "loss": 0.5503, "step": 2393 }, { "epoch": 0.6059225512528473, "grad_norm": 0.14942239224910736, "learning_rate": 1.8530371202579238e-05, "loss": 0.517, "step": 2394 }, { "epoch": 0.6061756517337383, "grad_norm": 0.15352338552474976, "learning_rate": 1.852912147947362e-05, "loss": 0.5355, "step": 2395 }, { "epoch": 0.6064287522146292, "grad_norm": 0.14409947395324707, "learning_rate": 1.8527871267411564e-05, "loss": 0.5252, "step": 2396 }, { "epoch": 0.6066818526955201, "grad_norm": 0.14866919815540314, "learning_rate": 1.852662056646474e-05, "loss": 0.5382, "step": 2397 }, { "epoch": 0.606934953176411, "grad_norm": 0.14645080268383026, "learning_rate": 1.852536937670484e-05, "loss": 0.533, "step": 2398 }, { "epoch": 0.607188053657302, "grad_norm": 0.1523124724626541, "learning_rate": 1.8524117698203605e-05, "loss": 0.5808, "step": 2399 }, { "epoch": 0.6074411541381929, "grad_norm": 0.14742736518383026, "learning_rate": 1.852286553103278e-05, "loss": 0.5433, "step": 2400 }, { "epoch": 0.6076942546190838, "grad_norm": 0.15319399535655975, "learning_rate": 1.8521612875264154e-05, "loss": 0.5473, "step": 2401 }, { "epoch": 0.6079473550999747, "grad_norm": 0.15313813090324402, "learning_rate": 1.852035973096954e-05, "loss": 0.5392, "step": 2402 }, { "epoch": 0.6082004555808656, "grad_norm": 0.1409512609243393, "learning_rate": 1.8519106098220775e-05, "loss": 0.5332, "step": 2403 }, { "epoch": 0.6084535560617566, "grad_norm": 0.14349418878555298, "learning_rate": 1.8517851977089727e-05, "loss": 0.5146, "step": 2404 }, { "epoch": 0.6087066565426474, "grad_norm": 0.15203545987606049, "learning_rate": 1.8516597367648295e-05, "loss": 0.5236, "step": 2405 }, { "epoch": 0.6089597570235383, "grad_norm": 0.15104606747627258, "learning_rate": 1.85153422699684e-05, "loss": 0.5633, "step": 2406 }, { "epoch": 0.6092128575044292, "grad_norm": 0.14073584973812103, "learning_rate": 1.8514086684121997e-05, "loss": 0.5533, "step": 2407 }, { "epoch": 0.6094659579853202, "grad_norm": 0.14999449253082275, "learning_rate": 1.8512830610181067e-05, "loss": 0.5343, "step": 2408 }, { "epoch": 0.6097190584662111, "grad_norm": 0.14399008452892303, "learning_rate": 1.8511574048217614e-05, "loss": 0.515, "step": 2409 }, { "epoch": 0.609972158947102, "grad_norm": 0.14407195150852203, "learning_rate": 1.8510316998303675e-05, "loss": 0.5277, "step": 2410 }, { "epoch": 0.6102252594279929, "grad_norm": 0.14424079656600952, "learning_rate": 1.8509059460511315e-05, "loss": 0.5507, "step": 2411 }, { "epoch": 0.6104783599088838, "grad_norm": 0.15625551342964172, "learning_rate": 1.8507801434912626e-05, "loss": 0.519, "step": 2412 }, { "epoch": 0.6107314603897748, "grad_norm": 0.15180808305740356, "learning_rate": 1.8506542921579732e-05, "loss": 0.5304, "step": 2413 }, { "epoch": 0.6109845608706657, "grad_norm": 0.14952440559864044, "learning_rate": 1.8505283920584773e-05, "loss": 0.5348, "step": 2414 }, { "epoch": 0.6112376613515565, "grad_norm": 0.139666348695755, "learning_rate": 1.850402443199993e-05, "loss": 0.5515, "step": 2415 }, { "epoch": 0.6114907618324474, "grad_norm": 0.14391599595546722, "learning_rate": 1.8502764455897398e-05, "loss": 0.5238, "step": 2416 }, { "epoch": 0.6117438623133384, "grad_norm": 0.1433994621038437, "learning_rate": 1.8501503992349426e-05, "loss": 0.5279, "step": 2417 }, { "epoch": 0.6119969627942293, "grad_norm": 0.14672143757343292, "learning_rate": 1.850024304142826e-05, "loss": 0.5533, "step": 2418 }, { "epoch": 0.6122500632751202, "grad_norm": 0.14460209012031555, "learning_rate": 1.849898160320619e-05, "loss": 0.5376, "step": 2419 }, { "epoch": 0.6125031637560111, "grad_norm": 0.146707683801651, "learning_rate": 1.8497719677755537e-05, "loss": 0.5413, "step": 2420 }, { "epoch": 0.6127562642369021, "grad_norm": 0.2081783264875412, "learning_rate": 1.849645726514864e-05, "loss": 0.5439, "step": 2421 }, { "epoch": 0.613009364717793, "grad_norm": 0.15196694433689117, "learning_rate": 1.8495194365457866e-05, "loss": 0.5475, "step": 2422 }, { "epoch": 0.6132624651986839, "grad_norm": 0.14226335287094116, "learning_rate": 1.8493930978755625e-05, "loss": 0.5598, "step": 2423 }, { "epoch": 0.6135155656795748, "grad_norm": 0.1489374339580536, "learning_rate": 1.8492667105114335e-05, "loss": 0.5462, "step": 2424 }, { "epoch": 0.6137686661604657, "grad_norm": 0.14674276113510132, "learning_rate": 1.8491402744606456e-05, "loss": 0.5588, "step": 2425 }, { "epoch": 0.6140217666413567, "grad_norm": 0.14677469432353973, "learning_rate": 1.8490137897304474e-05, "loss": 0.527, "step": 2426 }, { "epoch": 0.6142748671222475, "grad_norm": 0.15478141605854034, "learning_rate": 1.848887256328089e-05, "loss": 0.5391, "step": 2427 }, { "epoch": 0.6145279676031384, "grad_norm": 0.1517610400915146, "learning_rate": 1.848760674260825e-05, "loss": 0.5195, "step": 2428 }, { "epoch": 0.6147810680840293, "grad_norm": 0.1444411426782608, "learning_rate": 1.8486340435359125e-05, "loss": 0.53, "step": 2429 }, { "epoch": 0.6150341685649203, "grad_norm": 0.14189781248569489, "learning_rate": 1.84850736416061e-05, "loss": 0.5274, "step": 2430 }, { "epoch": 0.6152872690458112, "grad_norm": 0.14328473806381226, "learning_rate": 1.84838063614218e-05, "loss": 0.5269, "step": 2431 }, { "epoch": 0.6155403695267021, "grad_norm": 0.15161679685115814, "learning_rate": 1.848253859487888e-05, "loss": 0.5177, "step": 2432 }, { "epoch": 0.615793470007593, "grad_norm": 0.14411920309066772, "learning_rate": 1.848127034205002e-05, "loss": 0.5102, "step": 2433 }, { "epoch": 0.6160465704884839, "grad_norm": 0.1530870944261551, "learning_rate": 1.8480001603007914e-05, "loss": 0.5249, "step": 2434 }, { "epoch": 0.6162996709693749, "grad_norm": 0.1427738517522812, "learning_rate": 1.847873237782531e-05, "loss": 0.544, "step": 2435 }, { "epoch": 0.6165527714502658, "grad_norm": 0.14298276603221893, "learning_rate": 1.8477462666574963e-05, "loss": 0.5407, "step": 2436 }, { "epoch": 0.6168058719311567, "grad_norm": 0.16381780803203583, "learning_rate": 1.8476192469329663e-05, "loss": 0.536, "step": 2437 }, { "epoch": 0.6170589724120475, "grad_norm": 0.14814378321170807, "learning_rate": 1.847492178616223e-05, "loss": 0.5579, "step": 2438 }, { "epoch": 0.6173120728929385, "grad_norm": 0.14587941765785217, "learning_rate": 1.8473650617145507e-05, "loss": 0.5485, "step": 2439 }, { "epoch": 0.6175651733738294, "grad_norm": 0.20234888792037964, "learning_rate": 1.847237896235237e-05, "loss": 0.5435, "step": 2440 }, { "epoch": 0.6178182738547203, "grad_norm": 0.14384739100933075, "learning_rate": 1.847110682185572e-05, "loss": 0.5457, "step": 2441 }, { "epoch": 0.6180713743356112, "grad_norm": 0.14170606434345245, "learning_rate": 1.8469834195728484e-05, "loss": 0.5325, "step": 2442 }, { "epoch": 0.6183244748165021, "grad_norm": 0.14463554322719574, "learning_rate": 1.846856108404362e-05, "loss": 0.5357, "step": 2443 }, { "epoch": 0.6185775752973931, "grad_norm": 0.14453023672103882, "learning_rate": 1.8467287486874114e-05, "loss": 0.523, "step": 2444 }, { "epoch": 0.618830675778284, "grad_norm": 0.14751245081424713, "learning_rate": 1.8466013404292974e-05, "loss": 0.5157, "step": 2445 }, { "epoch": 0.6190837762591749, "grad_norm": 0.1481376588344574, "learning_rate": 1.8464738836373248e-05, "loss": 0.533, "step": 2446 }, { "epoch": 0.6193368767400658, "grad_norm": 0.15021999180316925, "learning_rate": 1.8463463783187997e-05, "loss": 0.5214, "step": 2447 }, { "epoch": 0.6195899772209568, "grad_norm": 0.1563500463962555, "learning_rate": 1.8462188244810324e-05, "loss": 0.5549, "step": 2448 }, { "epoch": 0.6198430777018477, "grad_norm": 0.14146524667739868, "learning_rate": 1.846091222131335e-05, "loss": 0.5374, "step": 2449 }, { "epoch": 0.6200961781827385, "grad_norm": 0.1462208777666092, "learning_rate": 1.845963571277022e-05, "loss": 0.5402, "step": 2450 }, { "epoch": 0.6203492786636294, "grad_norm": 0.14394448697566986, "learning_rate": 1.8458358719254125e-05, "loss": 0.5462, "step": 2451 }, { "epoch": 0.6206023791445203, "grad_norm": 0.14380502700805664, "learning_rate": 1.8457081240838265e-05, "loss": 0.5331, "step": 2452 }, { "epoch": 0.6208554796254113, "grad_norm": 0.13871550559997559, "learning_rate": 1.8455803277595877e-05, "loss": 0.5364, "step": 2453 }, { "epoch": 0.6211085801063022, "grad_norm": 0.16735586524009705, "learning_rate": 1.8454524829600223e-05, "loss": 0.566, "step": 2454 }, { "epoch": 0.6213616805871931, "grad_norm": 0.16703766584396362, "learning_rate": 1.8453245896924595e-05, "loss": 0.5477, "step": 2455 }, { "epoch": 0.621614781068084, "grad_norm": 0.1464633047580719, "learning_rate": 1.8451966479642312e-05, "loss": 0.5451, "step": 2456 }, { "epoch": 0.621867881548975, "grad_norm": 0.1467081904411316, "learning_rate": 1.845068657782672e-05, "loss": 0.5581, "step": 2457 }, { "epoch": 0.6221209820298659, "grad_norm": 0.14057934284210205, "learning_rate": 1.844940619155119e-05, "loss": 0.5022, "step": 2458 }, { "epoch": 0.6223740825107568, "grad_norm": 0.14661408960819244, "learning_rate": 1.8448125320889127e-05, "loss": 0.5311, "step": 2459 }, { "epoch": 0.6226271829916477, "grad_norm": 0.14641529321670532, "learning_rate": 1.8446843965913962e-05, "loss": 0.5181, "step": 2460 }, { "epoch": 0.6228802834725385, "grad_norm": 0.14614209532737732, "learning_rate": 1.844556212669915e-05, "loss": 0.5363, "step": 2461 }, { "epoch": 0.6231333839534295, "grad_norm": 0.14240095019340515, "learning_rate": 1.844427980331817e-05, "loss": 0.5336, "step": 2462 }, { "epoch": 0.6233864844343204, "grad_norm": 0.14581473171710968, "learning_rate": 1.844299699584455e-05, "loss": 0.5273, "step": 2463 }, { "epoch": 0.6236395849152113, "grad_norm": 0.14479555189609528, "learning_rate": 1.8441713704351815e-05, "loss": 0.5366, "step": 2464 }, { "epoch": 0.6238926853961022, "grad_norm": 0.14247867465019226, "learning_rate": 1.8440429928913545e-05, "loss": 0.5473, "step": 2465 }, { "epoch": 0.6241457858769932, "grad_norm": 0.1444939225912094, "learning_rate": 1.843914566960333e-05, "loss": 0.5127, "step": 2466 }, { "epoch": 0.6243988863578841, "grad_norm": 0.14626888930797577, "learning_rate": 1.8437860926494794e-05, "loss": 0.5229, "step": 2467 }, { "epoch": 0.624651986838775, "grad_norm": 0.15107285976409912, "learning_rate": 1.8436575699661592e-05, "loss": 0.529, "step": 2468 }, { "epoch": 0.6249050873196659, "grad_norm": 0.1470448076725006, "learning_rate": 1.84352899891774e-05, "loss": 0.5315, "step": 2469 }, { "epoch": 0.6251581878005569, "grad_norm": 0.1635686606168747, "learning_rate": 1.8434003795115928e-05, "loss": 0.532, "step": 2470 }, { "epoch": 0.6254112882814478, "grad_norm": 0.15282386541366577, "learning_rate": 1.8432717117550908e-05, "loss": 0.5301, "step": 2471 }, { "epoch": 0.6256643887623387, "grad_norm": 0.14698238670825958, "learning_rate": 1.84314299565561e-05, "loss": 0.5389, "step": 2472 }, { "epoch": 0.6259174892432295, "grad_norm": 0.14671845734119415, "learning_rate": 1.8430142312205302e-05, "loss": 0.5372, "step": 2473 }, { "epoch": 0.6261705897241204, "grad_norm": 0.14897477626800537, "learning_rate": 1.8428854184572328e-05, "loss": 0.5462, "step": 2474 }, { "epoch": 0.6264236902050114, "grad_norm": 0.15275777876377106, "learning_rate": 1.842756557373102e-05, "loss": 0.5164, "step": 2475 }, { "epoch": 0.6266767906859023, "grad_norm": 0.3233988583087921, "learning_rate": 1.8426276479755258e-05, "loss": 0.5316, "step": 2476 }, { "epoch": 0.6269298911667932, "grad_norm": 0.18216943740844727, "learning_rate": 1.8424986902718935e-05, "loss": 0.5207, "step": 2477 }, { "epoch": 0.6271829916476841, "grad_norm": 0.14639009535312653, "learning_rate": 1.842369684269599e-05, "loss": 0.5525, "step": 2478 }, { "epoch": 0.6274360921285751, "grad_norm": 0.1494276374578476, "learning_rate": 1.842240629976037e-05, "loss": 0.5458, "step": 2479 }, { "epoch": 0.627689192609466, "grad_norm": 0.14708733558654785, "learning_rate": 1.8421115273986064e-05, "loss": 0.5474, "step": 2480 }, { "epoch": 0.6279422930903569, "grad_norm": 0.14327023923397064, "learning_rate": 1.8419823765447082e-05, "loss": 0.5489, "step": 2481 }, { "epoch": 0.6281953935712478, "grad_norm": 0.14275063574314117, "learning_rate": 1.8418531774217463e-05, "loss": 0.5303, "step": 2482 }, { "epoch": 0.6284484940521387, "grad_norm": 0.14552482962608337, "learning_rate": 1.8417239300371273e-05, "loss": 0.5427, "step": 2483 }, { "epoch": 0.6287015945330297, "grad_norm": 0.14963699877262115, "learning_rate": 1.8415946343982614e-05, "loss": 0.5655, "step": 2484 }, { "epoch": 0.6289546950139205, "grad_norm": 0.14270426332950592, "learning_rate": 1.84146529051256e-05, "loss": 0.5368, "step": 2485 }, { "epoch": 0.6292077954948114, "grad_norm": 0.15036006271839142, "learning_rate": 1.8413358983874387e-05, "loss": 0.5523, "step": 2486 }, { "epoch": 0.6294608959757023, "grad_norm": 0.1527857482433319, "learning_rate": 1.841206458030315e-05, "loss": 0.5378, "step": 2487 }, { "epoch": 0.6297139964565933, "grad_norm": 0.14708644151687622, "learning_rate": 1.8410769694486093e-05, "loss": 0.5349, "step": 2488 }, { "epoch": 0.6299670969374842, "grad_norm": 0.1494288295507431, "learning_rate": 1.8409474326497455e-05, "loss": 0.5604, "step": 2489 }, { "epoch": 0.6302201974183751, "grad_norm": 0.14631909132003784, "learning_rate": 1.8408178476411487e-05, "loss": 0.5081, "step": 2490 }, { "epoch": 0.630473297899266, "grad_norm": 0.15233495831489563, "learning_rate": 1.8406882144302483e-05, "loss": 0.5179, "step": 2491 }, { "epoch": 0.6307263983801569, "grad_norm": 0.14197777211666107, "learning_rate": 1.840558533024476e-05, "loss": 0.5134, "step": 2492 }, { "epoch": 0.6309794988610479, "grad_norm": 0.14182843267917633, "learning_rate": 1.8404288034312664e-05, "loss": 0.525, "step": 2493 }, { "epoch": 0.6312325993419388, "grad_norm": 0.15093275904655457, "learning_rate": 1.8402990256580556e-05, "loss": 0.5374, "step": 2494 }, { "epoch": 0.6314856998228296, "grad_norm": 0.14455483853816986, "learning_rate": 1.8401691997122844e-05, "loss": 0.5121, "step": 2495 }, { "epoch": 0.6317388003037205, "grad_norm": 0.151828333735466, "learning_rate": 1.8400393256013955e-05, "loss": 0.5319, "step": 2496 }, { "epoch": 0.6319919007846115, "grad_norm": 0.14533530175685883, "learning_rate": 1.8399094033328336e-05, "loss": 0.5333, "step": 2497 }, { "epoch": 0.6322450012655024, "grad_norm": 0.1414755880832672, "learning_rate": 1.8397794329140475e-05, "loss": 0.5117, "step": 2498 }, { "epoch": 0.6324981017463933, "grad_norm": 0.1505860835313797, "learning_rate": 1.8396494143524877e-05, "loss": 0.5003, "step": 2499 }, { "epoch": 0.6327512022272842, "grad_norm": 0.14254526793956757, "learning_rate": 1.8395193476556085e-05, "loss": 0.5017, "step": 2500 }, { "epoch": 0.6330043027081751, "grad_norm": 0.14562487602233887, "learning_rate": 1.8393892328308654e-05, "loss": 0.5563, "step": 2501 }, { "epoch": 0.6332574031890661, "grad_norm": 0.14674434065818787, "learning_rate": 1.8392590698857184e-05, "loss": 0.5054, "step": 2502 }, { "epoch": 0.633510503669957, "grad_norm": 0.1408618837594986, "learning_rate": 1.839128858827629e-05, "loss": 0.5065, "step": 2503 }, { "epoch": 0.6337636041508479, "grad_norm": 0.14369329810142517, "learning_rate": 1.8389985996640623e-05, "loss": 0.5329, "step": 2504 }, { "epoch": 0.6340167046317388, "grad_norm": 0.14057013392448425, "learning_rate": 1.8388682924024854e-05, "loss": 0.5195, "step": 2505 }, { "epoch": 0.6342698051126298, "grad_norm": 0.1421726793050766, "learning_rate": 1.8387379370503692e-05, "loss": 0.5104, "step": 2506 }, { "epoch": 0.6345229055935206, "grad_norm": 0.14999838173389435, "learning_rate": 1.8386075336151857e-05, "loss": 0.5628, "step": 2507 }, { "epoch": 0.6347760060744115, "grad_norm": 0.15406276285648346, "learning_rate": 1.8384770821044115e-05, "loss": 0.5568, "step": 2508 }, { "epoch": 0.6350291065553024, "grad_norm": 0.14896829426288605, "learning_rate": 1.8383465825255244e-05, "loss": 0.552, "step": 2509 }, { "epoch": 0.6352822070361933, "grad_norm": 0.1516672670841217, "learning_rate": 1.8382160348860063e-05, "loss": 0.5199, "step": 2510 }, { "epoch": 0.6355353075170843, "grad_norm": 0.1618236005306244, "learning_rate": 1.8380854391933413e-05, "loss": 0.5039, "step": 2511 }, { "epoch": 0.6357884079979752, "grad_norm": 0.14674592018127441, "learning_rate": 1.8379547954550157e-05, "loss": 0.544, "step": 2512 }, { "epoch": 0.6360415084788661, "grad_norm": 0.14451873302459717, "learning_rate": 1.8378241036785186e-05, "loss": 0.5554, "step": 2513 }, { "epoch": 0.636294608959757, "grad_norm": 0.14248131215572357, "learning_rate": 1.8376933638713438e-05, "loss": 0.5241, "step": 2514 }, { "epoch": 0.636547709440648, "grad_norm": 0.1481953114271164, "learning_rate": 1.8375625760409848e-05, "loss": 0.5685, "step": 2515 }, { "epoch": 0.6368008099215389, "grad_norm": 0.15451690554618835, "learning_rate": 1.8374317401949403e-05, "loss": 0.5293, "step": 2516 }, { "epoch": 0.6370539104024298, "grad_norm": 0.14824581146240234, "learning_rate": 1.83730085634071e-05, "loss": 0.5394, "step": 2517 }, { "epoch": 0.6373070108833206, "grad_norm": 0.1481318473815918, "learning_rate": 1.837169924485798e-05, "loss": 0.5484, "step": 2518 }, { "epoch": 0.6375601113642116, "grad_norm": 0.14752435684204102, "learning_rate": 1.8370389446377103e-05, "loss": 0.5054, "step": 2519 }, { "epoch": 0.6378132118451025, "grad_norm": 0.143208846449852, "learning_rate": 1.8369079168039555e-05, "loss": 0.5242, "step": 2520 }, { "epoch": 0.6380663123259934, "grad_norm": 0.14739929139614105, "learning_rate": 1.836776840992045e-05, "loss": 0.5317, "step": 2521 }, { "epoch": 0.6383194128068843, "grad_norm": 0.14670588076114655, "learning_rate": 1.836645717209493e-05, "loss": 0.5345, "step": 2522 }, { "epoch": 0.6385725132877752, "grad_norm": 0.1452939361333847, "learning_rate": 1.836514545463817e-05, "loss": 0.5427, "step": 2523 }, { "epoch": 0.6388256137686662, "grad_norm": 0.14539825916290283, "learning_rate": 1.8363833257625365e-05, "loss": 0.5147, "step": 2524 }, { "epoch": 0.6390787142495571, "grad_norm": 0.1460723876953125, "learning_rate": 1.8362520581131738e-05, "loss": 0.5728, "step": 2525 }, { "epoch": 0.639331814730448, "grad_norm": 0.1652328222990036, "learning_rate": 1.8361207425232548e-05, "loss": 0.5341, "step": 2526 }, { "epoch": 0.6395849152113389, "grad_norm": 0.14856599271297455, "learning_rate": 1.8359893790003074e-05, "loss": 0.5075, "step": 2527 }, { "epoch": 0.6398380156922299, "grad_norm": 0.16230234503746033, "learning_rate": 1.835857967551862e-05, "loss": 0.5655, "step": 2528 }, { "epoch": 0.6400911161731208, "grad_norm": 0.1495852917432785, "learning_rate": 1.8357265081854522e-05, "loss": 0.5381, "step": 2529 }, { "epoch": 0.6403442166540116, "grad_norm": 0.1538114994764328, "learning_rate": 1.8355950009086148e-05, "loss": 0.5569, "step": 2530 }, { "epoch": 0.6405973171349025, "grad_norm": 0.15153038501739502, "learning_rate": 1.8354634457288885e-05, "loss": 0.534, "step": 2531 }, { "epoch": 0.6408504176157934, "grad_norm": 0.15110176801681519, "learning_rate": 1.835331842653815e-05, "loss": 0.5507, "step": 2532 }, { "epoch": 0.6411035180966844, "grad_norm": 0.1493983119726181, "learning_rate": 1.8352001916909387e-05, "loss": 0.5493, "step": 2533 }, { "epoch": 0.6413566185775753, "grad_norm": 0.15783622860908508, "learning_rate": 1.8350684928478072e-05, "loss": 0.5666, "step": 2534 }, { "epoch": 0.6416097190584662, "grad_norm": 0.14776931703090668, "learning_rate": 1.8349367461319704e-05, "loss": 0.5761, "step": 2535 }, { "epoch": 0.6418628195393571, "grad_norm": 0.15445038676261902, "learning_rate": 1.8348049515509814e-05, "loss": 0.5574, "step": 2536 }, { "epoch": 0.6421159200202481, "grad_norm": 0.15116912126541138, "learning_rate": 1.834673109112395e-05, "loss": 0.5492, "step": 2537 }, { "epoch": 0.642369020501139, "grad_norm": 0.14761857688426971, "learning_rate": 1.83454121882377e-05, "loss": 0.5171, "step": 2538 }, { "epoch": 0.6426221209820299, "grad_norm": 0.14666865766048431, "learning_rate": 1.834409280692667e-05, "loss": 0.5354, "step": 2539 }, { "epoch": 0.6428752214629208, "grad_norm": 0.14642728865146637, "learning_rate": 1.83427729472665e-05, "loss": 0.5374, "step": 2540 }, { "epoch": 0.6431283219438116, "grad_norm": 0.1411811113357544, "learning_rate": 1.8341452609332852e-05, "loss": 0.5296, "step": 2541 }, { "epoch": 0.6433814224247026, "grad_norm": 0.15042644739151, "learning_rate": 1.834013179320142e-05, "loss": 0.5422, "step": 2542 }, { "epoch": 0.6436345229055935, "grad_norm": 0.15062500536441803, "learning_rate": 1.8338810498947928e-05, "loss": 0.5535, "step": 2543 }, { "epoch": 0.6438876233864844, "grad_norm": 0.14488181471824646, "learning_rate": 1.8337488726648118e-05, "loss": 0.5395, "step": 2544 }, { "epoch": 0.6441407238673753, "grad_norm": 0.1423717439174652, "learning_rate": 1.8336166476377763e-05, "loss": 0.522, "step": 2545 }, { "epoch": 0.6443938243482663, "grad_norm": 0.1495235413312912, "learning_rate": 1.8334843748212666e-05, "loss": 0.5149, "step": 2546 }, { "epoch": 0.6446469248291572, "grad_norm": 0.14710858464241028, "learning_rate": 1.8333520542228657e-05, "loss": 0.5424, "step": 2547 }, { "epoch": 0.6449000253100481, "grad_norm": 0.16457505524158478, "learning_rate": 1.8332196858501593e-05, "loss": 0.5536, "step": 2548 }, { "epoch": 0.645153125790939, "grad_norm": 0.14198842644691467, "learning_rate": 1.833087269710736e-05, "loss": 0.5159, "step": 2549 }, { "epoch": 0.6454062262718299, "grad_norm": 0.15280668437480927, "learning_rate": 1.8329548058121865e-05, "loss": 0.5615, "step": 2550 }, { "epoch": 0.6456593267527209, "grad_norm": 0.14535677433013916, "learning_rate": 1.832822294162105e-05, "loss": 0.5491, "step": 2551 }, { "epoch": 0.6459124272336118, "grad_norm": 0.14739260077476501, "learning_rate": 1.832689734768088e-05, "loss": 0.536, "step": 2552 }, { "epoch": 0.6461655277145026, "grad_norm": 0.14564937353134155, "learning_rate": 1.8325571276377347e-05, "loss": 0.5162, "step": 2553 }, { "epoch": 0.6464186281953935, "grad_norm": 0.14750128984451294, "learning_rate": 1.8324244727786477e-05, "loss": 0.544, "step": 2554 }, { "epoch": 0.6466717286762845, "grad_norm": 0.15399467945098877, "learning_rate": 1.832291770198431e-05, "loss": 0.5167, "step": 2555 }, { "epoch": 0.6469248291571754, "grad_norm": 0.14649620652198792, "learning_rate": 1.832159019904693e-05, "loss": 0.5685, "step": 2556 }, { "epoch": 0.6471779296380663, "grad_norm": 0.14589351415634155, "learning_rate": 1.8320262219050432e-05, "loss": 0.5276, "step": 2557 }, { "epoch": 0.6474310301189572, "grad_norm": 0.14570419490337372, "learning_rate": 1.8318933762070954e-05, "loss": 0.5218, "step": 2558 }, { "epoch": 0.6476841305998481, "grad_norm": 0.14705124497413635, "learning_rate": 1.8317604828184648e-05, "loss": 0.5315, "step": 2559 }, { "epoch": 0.6479372310807391, "grad_norm": 0.14263677597045898, "learning_rate": 1.83162754174677e-05, "loss": 0.5315, "step": 2560 }, { "epoch": 0.64819033156163, "grad_norm": 0.1443737894296646, "learning_rate": 1.8314945529996328e-05, "loss": 0.5539, "step": 2561 }, { "epoch": 0.6484434320425209, "grad_norm": 0.14575058221817017, "learning_rate": 1.831361516584676e-05, "loss": 0.5229, "step": 2562 }, { "epoch": 0.6486965325234118, "grad_norm": 0.15036718547344208, "learning_rate": 1.8312284325095274e-05, "loss": 0.5123, "step": 2563 }, { "epoch": 0.6489496330043028, "grad_norm": 0.1482842117547989, "learning_rate": 1.8310953007818158e-05, "loss": 0.5636, "step": 2564 }, { "epoch": 0.6492027334851936, "grad_norm": 0.1420038640499115, "learning_rate": 1.8309621214091734e-05, "loss": 0.5231, "step": 2565 }, { "epoch": 0.6494558339660845, "grad_norm": 0.1481584757566452, "learning_rate": 1.8308288943992357e-05, "loss": 0.5328, "step": 2566 }, { "epoch": 0.6497089344469754, "grad_norm": 0.14541727304458618, "learning_rate": 1.8306956197596397e-05, "loss": 0.5228, "step": 2567 }, { "epoch": 0.6499620349278664, "grad_norm": 0.14279106259346008, "learning_rate": 1.830562297498026e-05, "loss": 0.54, "step": 2568 }, { "epoch": 0.6502151354087573, "grad_norm": 0.1472083181142807, "learning_rate": 1.8304289276220375e-05, "loss": 0.5329, "step": 2569 }, { "epoch": 0.6504682358896482, "grad_norm": 0.15057985484600067, "learning_rate": 1.83029551013932e-05, "loss": 0.5335, "step": 2570 }, { "epoch": 0.6507213363705391, "grad_norm": 0.3025047481060028, "learning_rate": 1.8301620450575225e-05, "loss": 0.5398, "step": 2571 }, { "epoch": 0.65097443685143, "grad_norm": 0.14984352886676788, "learning_rate": 1.8300285323842953e-05, "loss": 0.5528, "step": 2572 }, { "epoch": 0.651227537332321, "grad_norm": 0.14657318592071533, "learning_rate": 1.8298949721272935e-05, "loss": 0.546, "step": 2573 }, { "epoch": 0.6514806378132119, "grad_norm": 0.14619691669940948, "learning_rate": 1.8297613642941736e-05, "loss": 0.5644, "step": 2574 }, { "epoch": 0.6517337382941027, "grad_norm": 0.14889754354953766, "learning_rate": 1.8296277088925944e-05, "loss": 0.529, "step": 2575 }, { "epoch": 0.6519868387749936, "grad_norm": 0.14526212215423584, "learning_rate": 1.8294940059302185e-05, "loss": 0.5276, "step": 2576 }, { "epoch": 0.6522399392558846, "grad_norm": 0.14411452412605286, "learning_rate": 1.829360255414711e-05, "loss": 0.5287, "step": 2577 }, { "epoch": 0.6524930397367755, "grad_norm": 0.14212995767593384, "learning_rate": 1.8292264573537392e-05, "loss": 0.5244, "step": 2578 }, { "epoch": 0.6527461402176664, "grad_norm": 0.15822291374206543, "learning_rate": 1.8290926117549737e-05, "loss": 0.5323, "step": 2579 }, { "epoch": 0.6529992406985573, "grad_norm": 0.14641554653644562, "learning_rate": 1.8289587186260874e-05, "loss": 0.5468, "step": 2580 }, { "epoch": 0.6532523411794482, "grad_norm": 0.14154264330863953, "learning_rate": 1.8288247779747564e-05, "loss": 0.5034, "step": 2581 }, { "epoch": 0.6535054416603392, "grad_norm": 0.15055370330810547, "learning_rate": 1.8286907898086586e-05, "loss": 0.5511, "step": 2582 }, { "epoch": 0.6537585421412301, "grad_norm": 0.14376474916934967, "learning_rate": 1.828556754135476e-05, "loss": 0.5399, "step": 2583 }, { "epoch": 0.654011642622121, "grad_norm": 0.1468782275915146, "learning_rate": 1.8284226709628917e-05, "loss": 0.5159, "step": 2584 }, { "epoch": 0.6542647431030119, "grad_norm": 0.14444656670093536, "learning_rate": 1.8282885402985936e-05, "loss": 0.5384, "step": 2585 }, { "epoch": 0.6545178435839029, "grad_norm": 0.14160342514514923, "learning_rate": 1.8281543621502706e-05, "loss": 0.5061, "step": 2586 }, { "epoch": 0.6547709440647937, "grad_norm": 0.15068010985851288, "learning_rate": 1.828020136525614e-05, "loss": 0.5265, "step": 2587 }, { "epoch": 0.6550240445456846, "grad_norm": 0.14576853811740875, "learning_rate": 1.82788586343232e-05, "loss": 0.5338, "step": 2588 }, { "epoch": 0.6552771450265755, "grad_norm": 0.1483764946460724, "learning_rate": 1.8277515428780855e-05, "loss": 0.549, "step": 2589 }, { "epoch": 0.6555302455074664, "grad_norm": 0.14455454051494598, "learning_rate": 1.8276171748706107e-05, "loss": 0.54, "step": 2590 }, { "epoch": 0.6557833459883574, "grad_norm": 0.1460620015859604, "learning_rate": 1.827482759417599e-05, "loss": 0.5433, "step": 2591 }, { "epoch": 0.6560364464692483, "grad_norm": 0.14440762996673584, "learning_rate": 1.827348296526756e-05, "loss": 0.5548, "step": 2592 }, { "epoch": 0.6562895469501392, "grad_norm": 0.1437801718711853, "learning_rate": 1.82721378620579e-05, "loss": 0.5473, "step": 2593 }, { "epoch": 0.6565426474310301, "grad_norm": 0.18944020569324493, "learning_rate": 1.8270792284624127e-05, "loss": 0.5425, "step": 2594 }, { "epoch": 0.6567957479119211, "grad_norm": 0.1519591063261032, "learning_rate": 1.8269446233043373e-05, "loss": 0.5229, "step": 2595 }, { "epoch": 0.657048848392812, "grad_norm": 0.15800318121910095, "learning_rate": 1.826809970739281e-05, "loss": 0.5294, "step": 2596 }, { "epoch": 0.6573019488737029, "grad_norm": 0.1494808793067932, "learning_rate": 1.826675270774963e-05, "loss": 0.5266, "step": 2597 }, { "epoch": 0.6575550493545937, "grad_norm": 0.151277557015419, "learning_rate": 1.826540523419105e-05, "loss": 0.5543, "step": 2598 }, { "epoch": 0.6578081498354846, "grad_norm": 0.14490482211112976, "learning_rate": 1.8264057286794323e-05, "loss": 0.5365, "step": 2599 }, { "epoch": 0.6580612503163756, "grad_norm": 0.15144558250904083, "learning_rate": 1.8262708865636724e-05, "loss": 0.5418, "step": 2600 }, { "epoch": 0.6583143507972665, "grad_norm": 0.14295633137226105, "learning_rate": 1.826135997079555e-05, "loss": 0.5208, "step": 2601 }, { "epoch": 0.6585674512781574, "grad_norm": 0.1654975265264511, "learning_rate": 1.8260010602348136e-05, "loss": 0.5612, "step": 2602 }, { "epoch": 0.6588205517590483, "grad_norm": 0.1482984572649002, "learning_rate": 1.825866076037183e-05, "loss": 0.5505, "step": 2603 }, { "epoch": 0.6590736522399393, "grad_norm": 0.34943100810050964, "learning_rate": 1.8257310444944028e-05, "loss": 0.5318, "step": 2604 }, { "epoch": 0.6593267527208302, "grad_norm": 0.14697709679603577, "learning_rate": 1.8255959656142127e-05, "loss": 0.5271, "step": 2605 }, { "epoch": 0.6595798532017211, "grad_norm": 0.15266238152980804, "learning_rate": 1.8254608394043578e-05, "loss": 0.5424, "step": 2606 }, { "epoch": 0.659832953682612, "grad_norm": 0.15145014226436615, "learning_rate": 1.8253256658725837e-05, "loss": 0.5597, "step": 2607 }, { "epoch": 0.6600860541635029, "grad_norm": 0.14370834827423096, "learning_rate": 1.82519044502664e-05, "loss": 0.5167, "step": 2608 }, { "epoch": 0.6603391546443939, "grad_norm": 0.1422983705997467, "learning_rate": 1.8250551768742783e-05, "loss": 0.522, "step": 2609 }, { "epoch": 0.6605922551252847, "grad_norm": 0.14248280227184296, "learning_rate": 1.8249198614232538e-05, "loss": 0.5082, "step": 2610 }, { "epoch": 0.6608453556061756, "grad_norm": 0.14176413416862488, "learning_rate": 1.8247844986813233e-05, "loss": 0.5422, "step": 2611 }, { "epoch": 0.6610984560870665, "grad_norm": 0.14454533159732819, "learning_rate": 1.824649088656247e-05, "loss": 0.534, "step": 2612 }, { "epoch": 0.6613515565679575, "grad_norm": 0.15026485919952393, "learning_rate": 1.8245136313557876e-05, "loss": 0.548, "step": 2613 }, { "epoch": 0.6616046570488484, "grad_norm": 0.1425452083349228, "learning_rate": 1.8243781267877108e-05, "loss": 0.5117, "step": 2614 }, { "epoch": 0.6618577575297393, "grad_norm": 0.14487503468990326, "learning_rate": 1.824242574959785e-05, "loss": 0.5207, "step": 2615 }, { "epoch": 0.6621108580106302, "grad_norm": 0.28814759850502014, "learning_rate": 1.8241069758797805e-05, "loss": 0.5417, "step": 2616 }, { "epoch": 0.6623639584915212, "grad_norm": 0.1512315720319748, "learning_rate": 1.823971329555471e-05, "loss": 0.5347, "step": 2617 }, { "epoch": 0.6626170589724121, "grad_norm": 0.1426512598991394, "learning_rate": 1.8238356359946337e-05, "loss": 0.5465, "step": 2618 }, { "epoch": 0.662870159453303, "grad_norm": 0.1508060097694397, "learning_rate": 1.8236998952050467e-05, "loss": 0.5312, "step": 2619 }, { "epoch": 0.6631232599341939, "grad_norm": 0.16253367066383362, "learning_rate": 1.8235641071944916e-05, "loss": 0.5339, "step": 2620 }, { "epoch": 0.6633763604150847, "grad_norm": 0.1454850435256958, "learning_rate": 1.8234282719707534e-05, "loss": 0.5083, "step": 2621 }, { "epoch": 0.6636294608959757, "grad_norm": 0.14943727850914001, "learning_rate": 1.823292389541619e-05, "loss": 0.5344, "step": 2622 }, { "epoch": 0.6638825613768666, "grad_norm": 0.15019634366035461, "learning_rate": 1.8231564599148787e-05, "loss": 0.545, "step": 2623 }, { "epoch": 0.6641356618577575, "grad_norm": 0.15300233662128448, "learning_rate": 1.8230204830983243e-05, "loss": 0.53, "step": 2624 }, { "epoch": 0.6643887623386484, "grad_norm": 0.14591997861862183, "learning_rate": 1.8228844590997514e-05, "loss": 0.5388, "step": 2625 }, { "epoch": 0.6646418628195394, "grad_norm": 0.1456891894340515, "learning_rate": 1.822748387926958e-05, "loss": 0.5484, "step": 2626 }, { "epoch": 0.6648949633004303, "grad_norm": 0.1455857753753662, "learning_rate": 1.822612269587745e-05, "loss": 0.5297, "step": 2627 }, { "epoch": 0.6651480637813212, "grad_norm": 0.14482049643993378, "learning_rate": 1.8224761040899154e-05, "loss": 0.5385, "step": 2628 }, { "epoch": 0.6654011642622121, "grad_norm": 0.14514224231243134, "learning_rate": 1.8223398914412755e-05, "loss": 0.5399, "step": 2629 }, { "epoch": 0.665654264743103, "grad_norm": 0.14818979799747467, "learning_rate": 1.8222036316496342e-05, "loss": 0.521, "step": 2630 }, { "epoch": 0.665907365223994, "grad_norm": 0.14780977368354797, "learning_rate": 1.8220673247228022e-05, "loss": 0.5621, "step": 2631 }, { "epoch": 0.6661604657048849, "grad_norm": 0.14855843782424927, "learning_rate": 1.821930970668595e-05, "loss": 0.5312, "step": 2632 }, { "epoch": 0.6664135661857757, "grad_norm": 0.14819836616516113, "learning_rate": 1.821794569494828e-05, "loss": 0.495, "step": 2633 }, { "epoch": 0.6666666666666666, "grad_norm": 0.1473757028579712, "learning_rate": 1.821658121209322e-05, "loss": 0.5421, "step": 2634 }, { "epoch": 0.6669197671475576, "grad_norm": 0.14525184035301208, "learning_rate": 1.821521625819899e-05, "loss": 0.5207, "step": 2635 }, { "epoch": 0.6671728676284485, "grad_norm": 0.14685095846652985, "learning_rate": 1.8213850833343836e-05, "loss": 0.5216, "step": 2636 }, { "epoch": 0.6674259681093394, "grad_norm": 0.14816656708717346, "learning_rate": 1.821248493760604e-05, "loss": 0.5292, "step": 2637 }, { "epoch": 0.6676790685902303, "grad_norm": 0.14218780398368835, "learning_rate": 1.8211118571063896e-05, "loss": 0.5333, "step": 2638 }, { "epoch": 0.6679321690711212, "grad_norm": 0.15113654732704163, "learning_rate": 1.820975173379575e-05, "loss": 0.5349, "step": 2639 }, { "epoch": 0.6681852695520122, "grad_norm": 0.1410970240831375, "learning_rate": 1.820838442587995e-05, "loss": 0.5299, "step": 2640 }, { "epoch": 0.6684383700329031, "grad_norm": 0.1561165153980255, "learning_rate": 1.8207016647394882e-05, "loss": 0.5349, "step": 2641 }, { "epoch": 0.668691470513794, "grad_norm": 0.15471969544887543, "learning_rate": 1.8205648398418957e-05, "loss": 0.5291, "step": 2642 }, { "epoch": 0.6689445709946849, "grad_norm": 0.1493500918149948, "learning_rate": 1.8204279679030617e-05, "loss": 0.5074, "step": 2643 }, { "epoch": 0.6691976714755759, "grad_norm": 0.14829988777637482, "learning_rate": 1.8202910489308327e-05, "loss": 0.4893, "step": 2644 }, { "epoch": 0.6694507719564667, "grad_norm": 0.14577631652355194, "learning_rate": 1.820154082933058e-05, "loss": 0.5165, "step": 2645 }, { "epoch": 0.6697038724373576, "grad_norm": 0.1467095911502838, "learning_rate": 1.820017069917589e-05, "loss": 0.5523, "step": 2646 }, { "epoch": 0.6699569729182485, "grad_norm": 0.1475784182548523, "learning_rate": 1.8198800098922812e-05, "loss": 0.5437, "step": 2647 }, { "epoch": 0.6702100733991394, "grad_norm": 0.14039447903633118, "learning_rate": 1.8197429028649916e-05, "loss": 0.5496, "step": 2648 }, { "epoch": 0.6704631738800304, "grad_norm": 0.14430488646030426, "learning_rate": 1.8196057488435802e-05, "loss": 0.5046, "step": 2649 }, { "epoch": 0.6707162743609213, "grad_norm": 0.15438096225261688, "learning_rate": 1.81946854783591e-05, "loss": 0.557, "step": 2650 }, { "epoch": 0.6709693748418122, "grad_norm": 0.1537015587091446, "learning_rate": 1.8193312998498458e-05, "loss": 0.5419, "step": 2651 }, { "epoch": 0.6712224753227031, "grad_norm": 0.1503497213125229, "learning_rate": 1.8191940048932565e-05, "loss": 0.5565, "step": 2652 }, { "epoch": 0.6714755758035941, "grad_norm": 0.14642778038978577, "learning_rate": 1.8190566629740125e-05, "loss": 0.5475, "step": 2653 }, { "epoch": 0.671728676284485, "grad_norm": 0.14955230057239532, "learning_rate": 1.8189192740999876e-05, "loss": 0.5181, "step": 2654 }, { "epoch": 0.6719817767653758, "grad_norm": 0.1473504602909088, "learning_rate": 1.8187818382790575e-05, "loss": 0.5528, "step": 2655 }, { "epoch": 0.6722348772462667, "grad_norm": 0.15076345205307007, "learning_rate": 1.818644355519102e-05, "loss": 0.5451, "step": 2656 }, { "epoch": 0.6724879777271576, "grad_norm": 0.14599618315696716, "learning_rate": 1.8185068258280013e-05, "loss": 0.5012, "step": 2657 }, { "epoch": 0.6727410782080486, "grad_norm": 0.14549905061721802, "learning_rate": 1.818369249213641e-05, "loss": 0.5373, "step": 2658 }, { "epoch": 0.6729941786889395, "grad_norm": 0.146087646484375, "learning_rate": 1.818231625683908e-05, "loss": 0.5227, "step": 2659 }, { "epoch": 0.6732472791698304, "grad_norm": 0.18517613410949707, "learning_rate": 1.818093955246691e-05, "loss": 0.5253, "step": 2660 }, { "epoch": 0.6735003796507213, "grad_norm": 0.15097062289714813, "learning_rate": 1.817956237909883e-05, "loss": 0.5283, "step": 2661 }, { "epoch": 0.6737534801316123, "grad_norm": 0.14378787577152252, "learning_rate": 1.8178184736813792e-05, "loss": 0.506, "step": 2662 }, { "epoch": 0.6740065806125032, "grad_norm": 0.14214852452278137, "learning_rate": 1.8176806625690768e-05, "loss": 0.5049, "step": 2663 }, { "epoch": 0.6742596810933941, "grad_norm": 0.1420966237783432, "learning_rate": 1.8175428045808767e-05, "loss": 0.5188, "step": 2664 }, { "epoch": 0.674512781574285, "grad_norm": 0.14496557414531708, "learning_rate": 1.8174048997246818e-05, "loss": 0.518, "step": 2665 }, { "epoch": 0.674765882055176, "grad_norm": 0.14519834518432617, "learning_rate": 1.8172669480083978e-05, "loss": 0.5376, "step": 2666 }, { "epoch": 0.6750189825360668, "grad_norm": 0.15130527317523956, "learning_rate": 1.8171289494399335e-05, "loss": 0.533, "step": 2667 }, { "epoch": 0.6752720830169577, "grad_norm": 0.15248116850852966, "learning_rate": 1.8169909040271997e-05, "loss": 0.5196, "step": 2668 }, { "epoch": 0.6755251834978486, "grad_norm": 0.1468237340450287, "learning_rate": 1.8168528117781104e-05, "loss": 0.5035, "step": 2669 }, { "epoch": 0.6757782839787395, "grad_norm": 0.1470155268907547, "learning_rate": 1.8167146727005824e-05, "loss": 0.5208, "step": 2670 }, { "epoch": 0.6760313844596305, "grad_norm": 0.14594605565071106, "learning_rate": 1.8165764868025344e-05, "loss": 0.5228, "step": 2671 }, { "epoch": 0.6762844849405214, "grad_norm": 0.14652718603610992, "learning_rate": 1.816438254091889e-05, "loss": 0.547, "step": 2672 }, { "epoch": 0.6765375854214123, "grad_norm": 0.14855660498142242, "learning_rate": 1.8162999745765696e-05, "loss": 0.5525, "step": 2673 }, { "epoch": 0.6767906859023032, "grad_norm": 0.14703433215618134, "learning_rate": 1.8161616482645048e-05, "loss": 0.5521, "step": 2674 }, { "epoch": 0.6770437863831942, "grad_norm": 0.15182270109653473, "learning_rate": 1.816023275163624e-05, "loss": 0.5567, "step": 2675 }, { "epoch": 0.6772968868640851, "grad_norm": 0.15013697743415833, "learning_rate": 1.8158848552818592e-05, "loss": 0.5535, "step": 2676 }, { "epoch": 0.677549987344976, "grad_norm": 0.14551712572574615, "learning_rate": 1.815746388627147e-05, "loss": 0.5388, "step": 2677 }, { "epoch": 0.6778030878258668, "grad_norm": 0.1438574492931366, "learning_rate": 1.8156078752074246e-05, "loss": 0.5222, "step": 2678 }, { "epoch": 0.6780561883067577, "grad_norm": 0.14907999336719513, "learning_rate": 1.8154693150306327e-05, "loss": 0.5325, "step": 2679 }, { "epoch": 0.6783092887876487, "grad_norm": 0.1412954479455948, "learning_rate": 1.815330708104715e-05, "loss": 0.5149, "step": 2680 }, { "epoch": 0.6785623892685396, "grad_norm": 0.14831040799617767, "learning_rate": 1.815192054437617e-05, "loss": 0.5388, "step": 2681 }, { "epoch": 0.6788154897494305, "grad_norm": 0.14994782209396362, "learning_rate": 1.815053354037288e-05, "loss": 0.5435, "step": 2682 }, { "epoch": 0.6790685902303214, "grad_norm": 0.142563596367836, "learning_rate": 1.814914606911679e-05, "loss": 0.5269, "step": 2683 }, { "epoch": 0.6793216907112124, "grad_norm": 0.14263266324996948, "learning_rate": 1.814775813068744e-05, "loss": 0.5313, "step": 2684 }, { "epoch": 0.6795747911921033, "grad_norm": 0.14363744854927063, "learning_rate": 1.81463697251644e-05, "loss": 0.5108, "step": 2685 }, { "epoch": 0.6798278916729942, "grad_norm": 0.187117338180542, "learning_rate": 1.8144980852627266e-05, "loss": 0.5346, "step": 2686 }, { "epoch": 0.6800809921538851, "grad_norm": 0.1482447385787964, "learning_rate": 1.8143591513155662e-05, "loss": 0.5538, "step": 2687 }, { "epoch": 0.680334092634776, "grad_norm": 0.1461954563856125, "learning_rate": 1.8142201706829225e-05, "loss": 0.5215, "step": 2688 }, { "epoch": 0.680587193115667, "grad_norm": 0.1450091302394867, "learning_rate": 1.8140811433727633e-05, "loss": 0.5306, "step": 2689 }, { "epoch": 0.6808402935965578, "grad_norm": 0.14692509174346924, "learning_rate": 1.81394206939306e-05, "loss": 0.5418, "step": 2690 }, { "epoch": 0.6810933940774487, "grad_norm": 0.14713634550571442, "learning_rate": 1.8138029487517833e-05, "loss": 0.5421, "step": 2691 }, { "epoch": 0.6813464945583396, "grad_norm": 0.140730082988739, "learning_rate": 1.8136637814569108e-05, "loss": 0.5322, "step": 2692 }, { "epoch": 0.6815995950392306, "grad_norm": 0.14739148318767548, "learning_rate": 1.813524567516419e-05, "loss": 0.5394, "step": 2693 }, { "epoch": 0.6818526955201215, "grad_norm": 0.14859172701835632, "learning_rate": 1.81338530693829e-05, "loss": 0.5419, "step": 2694 }, { "epoch": 0.6821057960010124, "grad_norm": 0.14800290763378143, "learning_rate": 1.8132459997305063e-05, "loss": 0.5281, "step": 2695 }, { "epoch": 0.6823588964819033, "grad_norm": 0.2411351054906845, "learning_rate": 1.8131066459010546e-05, "loss": 0.5375, "step": 2696 }, { "epoch": 0.6826119969627942, "grad_norm": 0.14017412066459656, "learning_rate": 1.812967245457924e-05, "loss": 0.5304, "step": 2697 }, { "epoch": 0.6828650974436852, "grad_norm": 0.1507704257965088, "learning_rate": 1.812827798409105e-05, "loss": 0.5114, "step": 2698 }, { "epoch": 0.6831181979245761, "grad_norm": 0.14874079823493958, "learning_rate": 1.8126883047625933e-05, "loss": 0.5418, "step": 2699 }, { "epoch": 0.683371298405467, "grad_norm": 0.14708906412124634, "learning_rate": 1.8125487645263847e-05, "loss": 0.5366, "step": 2700 }, { "epoch": 0.6836243988863578, "grad_norm": 0.1492803692817688, "learning_rate": 1.812409177708479e-05, "loss": 0.5398, "step": 2701 }, { "epoch": 0.6838774993672488, "grad_norm": 0.14203189313411713, "learning_rate": 1.8122695443168785e-05, "loss": 0.5186, "step": 2702 }, { "epoch": 0.6841305998481397, "grad_norm": 0.15312273800373077, "learning_rate": 1.812129864359588e-05, "loss": 0.5266, "step": 2703 }, { "epoch": 0.6843837003290306, "grad_norm": 0.14636339247226715, "learning_rate": 1.811990137844615e-05, "loss": 0.5243, "step": 2704 }, { "epoch": 0.6846368008099215, "grad_norm": 0.14452330768108368, "learning_rate": 1.81185036477997e-05, "loss": 0.5438, "step": 2705 }, { "epoch": 0.6848899012908124, "grad_norm": 0.1509128361940384, "learning_rate": 1.8117105451736657e-05, "loss": 0.5311, "step": 2706 }, { "epoch": 0.6851430017717034, "grad_norm": 0.15748311579227448, "learning_rate": 1.8115706790337176e-05, "loss": 0.5302, "step": 2707 }, { "epoch": 0.6853961022525943, "grad_norm": 0.14035460352897644, "learning_rate": 1.8114307663681444e-05, "loss": 0.5152, "step": 2708 }, { "epoch": 0.6856492027334852, "grad_norm": 0.14641892910003662, "learning_rate": 1.811290807184966e-05, "loss": 0.5333, "step": 2709 }, { "epoch": 0.6859023032143761, "grad_norm": 0.14782601594924927, "learning_rate": 1.811150801492207e-05, "loss": 0.545, "step": 2710 }, { "epoch": 0.6861554036952671, "grad_norm": 0.16503480076789856, "learning_rate": 1.811010749297893e-05, "loss": 0.5038, "step": 2711 }, { "epoch": 0.686408504176158, "grad_norm": 0.16098101437091827, "learning_rate": 1.810870650610053e-05, "loss": 0.5523, "step": 2712 }, { "epoch": 0.6866616046570488, "grad_norm": 0.14806942641735077, "learning_rate": 1.8107305054367188e-05, "loss": 0.5503, "step": 2713 }, { "epoch": 0.6869147051379397, "grad_norm": 0.14722320437431335, "learning_rate": 1.8105903137859248e-05, "loss": 0.5556, "step": 2714 }, { "epoch": 0.6871678056188307, "grad_norm": 0.15449842810630798, "learning_rate": 1.8104500756657072e-05, "loss": 0.5254, "step": 2715 }, { "epoch": 0.6874209060997216, "grad_norm": 0.14408960938453674, "learning_rate": 1.8103097910841055e-05, "loss": 0.528, "step": 2716 }, { "epoch": 0.6876740065806125, "grad_norm": 0.14892727136611938, "learning_rate": 1.810169460049163e-05, "loss": 0.5322, "step": 2717 }, { "epoch": 0.6879271070615034, "grad_norm": 0.14627705514431, "learning_rate": 1.8100290825689238e-05, "loss": 0.536, "step": 2718 }, { "epoch": 0.6881802075423943, "grad_norm": 0.1423405110836029, "learning_rate": 1.8098886586514356e-05, "loss": 0.5169, "step": 2719 }, { "epoch": 0.6884333080232853, "grad_norm": 0.14613592624664307, "learning_rate": 1.8097481883047483e-05, "loss": 0.5193, "step": 2720 }, { "epoch": 0.6886864085041762, "grad_norm": 0.1515716165304184, "learning_rate": 1.8096076715369152e-05, "loss": 0.5355, "step": 2721 }, { "epoch": 0.6889395089850671, "grad_norm": 0.14382004737854004, "learning_rate": 1.8094671083559918e-05, "loss": 0.5517, "step": 2722 }, { "epoch": 0.689192609465958, "grad_norm": 0.1456882804632187, "learning_rate": 1.809326498770036e-05, "loss": 0.539, "step": 2723 }, { "epoch": 0.689445709946849, "grad_norm": 0.1479150950908661, "learning_rate": 1.809185842787109e-05, "loss": 0.5223, "step": 2724 }, { "epoch": 0.6896988104277398, "grad_norm": 0.14168787002563477, "learning_rate": 1.8090451404152736e-05, "loss": 0.5394, "step": 2725 }, { "epoch": 0.6899519109086307, "grad_norm": 0.149136021733284, "learning_rate": 1.808904391662597e-05, "loss": 0.5013, "step": 2726 }, { "epoch": 0.6902050113895216, "grad_norm": 0.1428527981042862, "learning_rate": 1.808763596537147e-05, "loss": 0.5037, "step": 2727 }, { "epoch": 0.6904581118704125, "grad_norm": 0.1451261341571808, "learning_rate": 1.8086227550469965e-05, "loss": 0.5326, "step": 2728 }, { "epoch": 0.6907112123513035, "grad_norm": 0.14612074196338654, "learning_rate": 1.808481867200218e-05, "loss": 0.5391, "step": 2729 }, { "epoch": 0.6909643128321944, "grad_norm": 0.147441565990448, "learning_rate": 1.808340933004889e-05, "loss": 0.5487, "step": 2730 }, { "epoch": 0.6912174133130853, "grad_norm": 0.14560317993164062, "learning_rate": 1.808199952469089e-05, "loss": 0.5093, "step": 2731 }, { "epoch": 0.6914705137939762, "grad_norm": 0.14556057751178741, "learning_rate": 1.8080589256009007e-05, "loss": 0.5582, "step": 2732 }, { "epoch": 0.6917236142748672, "grad_norm": 0.1445273905992508, "learning_rate": 1.8079178524084077e-05, "loss": 0.5189, "step": 2733 }, { "epoch": 0.6919767147557581, "grad_norm": 0.14358773827552795, "learning_rate": 1.8077767328996983e-05, "loss": 0.515, "step": 2734 }, { "epoch": 0.692229815236649, "grad_norm": 0.1437392234802246, "learning_rate": 1.8076355670828626e-05, "loss": 0.5014, "step": 2735 }, { "epoch": 0.6924829157175398, "grad_norm": 0.1445649117231369, "learning_rate": 1.8074943549659923e-05, "loss": 0.5834, "step": 2736 }, { "epoch": 0.6927360161984307, "grad_norm": 0.14494314789772034, "learning_rate": 1.8073530965571838e-05, "loss": 0.5269, "step": 2737 }, { "epoch": 0.6929891166793217, "grad_norm": 0.14743253588676453, "learning_rate": 1.807211791864535e-05, "loss": 0.5393, "step": 2738 }, { "epoch": 0.6932422171602126, "grad_norm": 0.14686620235443115, "learning_rate": 1.8070704408961463e-05, "loss": 0.5481, "step": 2739 }, { "epoch": 0.6934953176411035, "grad_norm": 0.1491858810186386, "learning_rate": 1.806929043660121e-05, "loss": 0.5529, "step": 2740 }, { "epoch": 0.6937484181219944, "grad_norm": 0.1462259441614151, "learning_rate": 1.806787600164566e-05, "loss": 0.5477, "step": 2741 }, { "epoch": 0.6940015186028854, "grad_norm": 0.14706359803676605, "learning_rate": 1.8066461104175888e-05, "loss": 0.5374, "step": 2742 }, { "epoch": 0.6942546190837763, "grad_norm": 0.14483709633350372, "learning_rate": 1.8065045744273015e-05, "loss": 0.5161, "step": 2743 }, { "epoch": 0.6945077195646672, "grad_norm": 0.15205207467079163, "learning_rate": 1.8063629922018172e-05, "loss": 0.5238, "step": 2744 }, { "epoch": 0.6947608200455581, "grad_norm": 0.15554554760456085, "learning_rate": 1.8062213637492538e-05, "loss": 0.52, "step": 2745 }, { "epoch": 0.695013920526449, "grad_norm": 0.15813574194908142, "learning_rate": 1.8060796890777294e-05, "loss": 0.5371, "step": 2746 }, { "epoch": 0.69526702100734, "grad_norm": 0.14827005565166473, "learning_rate": 1.805937968195366e-05, "loss": 0.5507, "step": 2747 }, { "epoch": 0.6955201214882308, "grad_norm": 0.14521510899066925, "learning_rate": 1.805796201110289e-05, "loss": 0.5171, "step": 2748 }, { "epoch": 0.6957732219691217, "grad_norm": 0.1462366133928299, "learning_rate": 1.8056543878306247e-05, "loss": 0.5548, "step": 2749 }, { "epoch": 0.6960263224500126, "grad_norm": 0.14651119709014893, "learning_rate": 1.8055125283645036e-05, "loss": 0.5377, "step": 2750 }, { "epoch": 0.6962794229309036, "grad_norm": 0.14567963778972626, "learning_rate": 1.805370622720058e-05, "loss": 0.5358, "step": 2751 }, { "epoch": 0.6965325234117945, "grad_norm": 0.15595458447933197, "learning_rate": 1.8052286709054226e-05, "loss": 0.5452, "step": 2752 }, { "epoch": 0.6967856238926854, "grad_norm": 0.1589277982711792, "learning_rate": 1.805086672928736e-05, "loss": 0.5441, "step": 2753 }, { "epoch": 0.6970387243735763, "grad_norm": 0.14181219041347504, "learning_rate": 1.804944628798138e-05, "loss": 0.5248, "step": 2754 }, { "epoch": 0.6972918248544672, "grad_norm": 0.14502781629562378, "learning_rate": 1.804802538521772e-05, "loss": 0.5059, "step": 2755 }, { "epoch": 0.6975449253353582, "grad_norm": 0.14499282836914062, "learning_rate": 1.8046604021077834e-05, "loss": 0.5058, "step": 2756 }, { "epoch": 0.6977980258162491, "grad_norm": 0.14618387818336487, "learning_rate": 1.804518219564321e-05, "loss": 0.5181, "step": 2757 }, { "epoch": 0.69805112629714, "grad_norm": 0.14578969776630402, "learning_rate": 1.8043759908995355e-05, "loss": 0.539, "step": 2758 }, { "epoch": 0.6983042267780308, "grad_norm": 0.14962175488471985, "learning_rate": 1.8042337161215808e-05, "loss": 0.5291, "step": 2759 }, { "epoch": 0.6985573272589218, "grad_norm": 0.16270391643047333, "learning_rate": 1.8040913952386134e-05, "loss": 0.5516, "step": 2760 }, { "epoch": 0.6988104277398127, "grad_norm": 0.15037229657173157, "learning_rate": 1.8039490282587916e-05, "loss": 0.5411, "step": 2761 }, { "epoch": 0.6990635282207036, "grad_norm": 0.14972001314163208, "learning_rate": 1.803806615190278e-05, "loss": 0.5267, "step": 2762 }, { "epoch": 0.6993166287015945, "grad_norm": 0.14504480361938477, "learning_rate": 1.8036641560412355e-05, "loss": 0.5524, "step": 2763 }, { "epoch": 0.6995697291824855, "grad_norm": 0.14563485980033875, "learning_rate": 1.8035216508198318e-05, "loss": 0.5161, "step": 2764 }, { "epoch": 0.6998228296633764, "grad_norm": 0.14676491916179657, "learning_rate": 1.8033790995342368e-05, "loss": 0.5358, "step": 2765 }, { "epoch": 0.7000759301442673, "grad_norm": 0.1475553661584854, "learning_rate": 1.8032365021926218e-05, "loss": 0.5367, "step": 2766 }, { "epoch": 0.7003290306251582, "grad_norm": 0.1452532410621643, "learning_rate": 1.803093858803162e-05, "loss": 0.5359, "step": 2767 }, { "epoch": 0.700582131106049, "grad_norm": 0.16440361738204956, "learning_rate": 1.802951169374035e-05, "loss": 0.5153, "step": 2768 }, { "epoch": 0.70083523158694, "grad_norm": 0.14603191614151, "learning_rate": 1.8028084339134205e-05, "loss": 0.5208, "step": 2769 }, { "epoch": 0.7010883320678309, "grad_norm": 0.14808522164821625, "learning_rate": 1.8026656524295018e-05, "loss": 0.5457, "step": 2770 }, { "epoch": 0.7013414325487218, "grad_norm": 0.1442599594593048, "learning_rate": 1.8025228249304637e-05, "loss": 0.5235, "step": 2771 }, { "epoch": 0.7015945330296127, "grad_norm": 0.14662116765975952, "learning_rate": 1.8023799514244943e-05, "loss": 0.5088, "step": 2772 }, { "epoch": 0.7018476335105037, "grad_norm": 0.142899751663208, "learning_rate": 1.8022370319197847e-05, "loss": 0.538, "step": 2773 }, { "epoch": 0.7021007339913946, "grad_norm": 0.14384151995182037, "learning_rate": 1.8020940664245272e-05, "loss": 0.5427, "step": 2774 }, { "epoch": 0.7023538344722855, "grad_norm": 0.1488441526889801, "learning_rate": 1.8019510549469185e-05, "loss": 0.5414, "step": 2775 }, { "epoch": 0.7026069349531764, "grad_norm": 0.14776454865932465, "learning_rate": 1.8018079974951574e-05, "loss": 0.5214, "step": 2776 }, { "epoch": 0.7028600354340673, "grad_norm": 0.14912188053131104, "learning_rate": 1.801664894077444e-05, "loss": 0.5411, "step": 2777 }, { "epoch": 0.7031131359149583, "grad_norm": 0.1462947577238083, "learning_rate": 1.8015217447019832e-05, "loss": 0.5682, "step": 2778 }, { "epoch": 0.7033662363958492, "grad_norm": 0.1409514993429184, "learning_rate": 1.8013785493769806e-05, "loss": 0.5121, "step": 2779 }, { "epoch": 0.70361933687674, "grad_norm": 0.1406039148569107, "learning_rate": 1.8012353081106463e-05, "loss": 0.5262, "step": 2780 }, { "epoch": 0.7038724373576309, "grad_norm": 0.14898115396499634, "learning_rate": 1.8010920209111908e-05, "loss": 0.537, "step": 2781 }, { "epoch": 0.7041255378385219, "grad_norm": 0.14424747228622437, "learning_rate": 1.8009486877868293e-05, "loss": 0.5258, "step": 2782 }, { "epoch": 0.7043786383194128, "grad_norm": 0.15015709400177002, "learning_rate": 1.8008053087457784e-05, "loss": 0.4968, "step": 2783 }, { "epoch": 0.7046317388003037, "grad_norm": 0.14862953126430511, "learning_rate": 1.800661883796258e-05, "loss": 0.508, "step": 2784 }, { "epoch": 0.7048848392811946, "grad_norm": 0.14401929080486298, "learning_rate": 1.8005184129464905e-05, "loss": 0.5225, "step": 2785 }, { "epoch": 0.7051379397620855, "grad_norm": 0.14750038087368011, "learning_rate": 1.8003748962046997e-05, "loss": 0.5555, "step": 2786 }, { "epoch": 0.7053910402429765, "grad_norm": 0.14002522826194763, "learning_rate": 1.8002313335791143e-05, "loss": 0.5049, "step": 2787 }, { "epoch": 0.7056441407238674, "grad_norm": 0.16281020641326904, "learning_rate": 1.800087725077964e-05, "loss": 0.5209, "step": 2788 }, { "epoch": 0.7058972412047583, "grad_norm": 0.14687448740005493, "learning_rate": 1.799944070709482e-05, "loss": 0.5479, "step": 2789 }, { "epoch": 0.7061503416856492, "grad_norm": 0.1478552669286728, "learning_rate": 1.799800370481903e-05, "loss": 0.5372, "step": 2790 }, { "epoch": 0.7064034421665402, "grad_norm": 0.15007026493549347, "learning_rate": 1.799656624403465e-05, "loss": 0.5264, "step": 2791 }, { "epoch": 0.706656542647431, "grad_norm": 0.14968924224376678, "learning_rate": 1.7995128324824094e-05, "loss": 0.5485, "step": 2792 }, { "epoch": 0.7069096431283219, "grad_norm": 0.1551859825849533, "learning_rate": 1.799368994726979e-05, "loss": 0.5162, "step": 2793 }, { "epoch": 0.7071627436092128, "grad_norm": 0.14584921300411224, "learning_rate": 1.7992251111454198e-05, "loss": 0.5153, "step": 2794 }, { "epoch": 0.7074158440901037, "grad_norm": 0.15128177404403687, "learning_rate": 1.7990811817459802e-05, "loss": 0.5425, "step": 2795 }, { "epoch": 0.7076689445709947, "grad_norm": 0.14871525764465332, "learning_rate": 1.7989372065369118e-05, "loss": 0.534, "step": 2796 }, { "epoch": 0.7079220450518856, "grad_norm": 0.14760757982730865, "learning_rate": 1.7987931855264677e-05, "loss": 0.5279, "step": 2797 }, { "epoch": 0.7081751455327765, "grad_norm": 0.14915476739406586, "learning_rate": 1.798649118722905e-05, "loss": 0.5133, "step": 2798 }, { "epoch": 0.7084282460136674, "grad_norm": 0.14028678834438324, "learning_rate": 1.798505006134482e-05, "loss": 0.5203, "step": 2799 }, { "epoch": 0.7086813464945584, "grad_norm": 0.14444823563098907, "learning_rate": 1.7983608477694616e-05, "loss": 0.5349, "step": 2800 }, { "epoch": 0.7089344469754493, "grad_norm": 0.1436929553747177, "learning_rate": 1.7982166436361067e-05, "loss": 0.5176, "step": 2801 }, { "epoch": 0.7091875474563402, "grad_norm": 0.14400990307331085, "learning_rate": 1.7980723937426848e-05, "loss": 0.5246, "step": 2802 }, { "epoch": 0.709440647937231, "grad_norm": 0.14669135212898254, "learning_rate": 1.7979280980974658e-05, "loss": 0.5344, "step": 2803 }, { "epoch": 0.7096937484181219, "grad_norm": 0.15003477036952972, "learning_rate": 1.7977837567087214e-05, "loss": 0.5285, "step": 2804 }, { "epoch": 0.7099468488990129, "grad_norm": 0.15274128317832947, "learning_rate": 1.7976393695847267e-05, "loss": 0.5622, "step": 2805 }, { "epoch": 0.7101999493799038, "grad_norm": 0.14622192084789276, "learning_rate": 1.7974949367337586e-05, "loss": 0.5131, "step": 2806 }, { "epoch": 0.7104530498607947, "grad_norm": 0.14860200881958008, "learning_rate": 1.7973504581640978e-05, "loss": 0.5315, "step": 2807 }, { "epoch": 0.7107061503416856, "grad_norm": 0.1511184573173523, "learning_rate": 1.7972059338840262e-05, "loss": 0.57, "step": 2808 }, { "epoch": 0.7109592508225766, "grad_norm": 0.14598067104816437, "learning_rate": 1.7970613639018297e-05, "loss": 0.5285, "step": 2809 }, { "epoch": 0.7112123513034675, "grad_norm": 0.15503932535648346, "learning_rate": 1.796916748225796e-05, "loss": 0.5564, "step": 2810 }, { "epoch": 0.7114654517843584, "grad_norm": 0.14427991211414337, "learning_rate": 1.7967720868642156e-05, "loss": 0.5314, "step": 2811 }, { "epoch": 0.7117185522652493, "grad_norm": 0.14314691722393036, "learning_rate": 1.7966273798253815e-05, "loss": 0.5052, "step": 2812 }, { "epoch": 0.7119716527461403, "grad_norm": 0.15298913419246674, "learning_rate": 1.7964826271175896e-05, "loss": 0.5472, "step": 2813 }, { "epoch": 0.7122247532270312, "grad_norm": 0.15115098655223846, "learning_rate": 1.7963378287491383e-05, "loss": 0.5443, "step": 2814 }, { "epoch": 0.712477853707922, "grad_norm": 0.14653074741363525, "learning_rate": 1.7961929847283284e-05, "loss": 0.5363, "step": 2815 }, { "epoch": 0.7127309541888129, "grad_norm": 0.15043623745441437, "learning_rate": 1.7960480950634635e-05, "loss": 0.5715, "step": 2816 }, { "epoch": 0.7129840546697038, "grad_norm": 0.14523740112781525, "learning_rate": 1.7959031597628504e-05, "loss": 0.5246, "step": 2817 }, { "epoch": 0.7132371551505948, "grad_norm": 0.14821510016918182, "learning_rate": 1.7957581788347967e-05, "loss": 0.5442, "step": 2818 }, { "epoch": 0.7134902556314857, "grad_norm": 0.14689341187477112, "learning_rate": 1.795613152287615e-05, "loss": 0.5327, "step": 2819 }, { "epoch": 0.7137433561123766, "grad_norm": 0.1495596319437027, "learning_rate": 1.795468080129619e-05, "loss": 0.52, "step": 2820 }, { "epoch": 0.7139964565932675, "grad_norm": 0.14510971307754517, "learning_rate": 1.7953229623691256e-05, "loss": 0.526, "step": 2821 }, { "epoch": 0.7142495570741585, "grad_norm": 0.14755523204803467, "learning_rate": 1.7951777990144534e-05, "loss": 0.5426, "step": 2822 }, { "epoch": 0.7145026575550494, "grad_norm": 0.1472373902797699, "learning_rate": 1.795032590073925e-05, "loss": 0.5396, "step": 2823 }, { "epoch": 0.7147557580359403, "grad_norm": 0.15088455379009247, "learning_rate": 1.7948873355558647e-05, "loss": 0.5292, "step": 2824 }, { "epoch": 0.7150088585168312, "grad_norm": 0.18523003160953522, "learning_rate": 1.7947420354685995e-05, "loss": 0.543, "step": 2825 }, { "epoch": 0.715261958997722, "grad_norm": 0.14839759469032288, "learning_rate": 1.794596689820459e-05, "loss": 0.5121, "step": 2826 }, { "epoch": 0.715515059478613, "grad_norm": 0.14841121435165405, "learning_rate": 1.794451298619776e-05, "loss": 0.5192, "step": 2827 }, { "epoch": 0.7157681599595039, "grad_norm": 0.1472892314195633, "learning_rate": 1.7943058618748853e-05, "loss": 0.5244, "step": 2828 }, { "epoch": 0.7160212604403948, "grad_norm": 0.1522616297006607, "learning_rate": 1.7941603795941247e-05, "loss": 0.54, "step": 2829 }, { "epoch": 0.7162743609212857, "grad_norm": 0.14400440454483032, "learning_rate": 1.794014851785834e-05, "loss": 0.5232, "step": 2830 }, { "epoch": 0.7165274614021767, "grad_norm": 0.15369334816932678, "learning_rate": 1.793869278458356e-05, "loss": 0.5649, "step": 2831 }, { "epoch": 0.7167805618830676, "grad_norm": 0.15156877040863037, "learning_rate": 1.7937236596200362e-05, "loss": 0.5444, "step": 2832 }, { "epoch": 0.7170336623639585, "grad_norm": 0.1472565233707428, "learning_rate": 1.793577995279223e-05, "loss": 0.5596, "step": 2833 }, { "epoch": 0.7172867628448494, "grad_norm": 0.15399563312530518, "learning_rate": 1.793432285444266e-05, "loss": 0.5209, "step": 2834 }, { "epoch": 0.7175398633257403, "grad_norm": 0.1464615911245346, "learning_rate": 1.79328653012352e-05, "loss": 0.5228, "step": 2835 }, { "epoch": 0.7177929638066313, "grad_norm": 0.1457682102918625, "learning_rate": 1.7931407293253396e-05, "loss": 0.5353, "step": 2836 }, { "epoch": 0.7180460642875222, "grad_norm": 0.14437851309776306, "learning_rate": 1.7929948830580837e-05, "loss": 0.5311, "step": 2837 }, { "epoch": 0.718299164768413, "grad_norm": 0.14645318686962128, "learning_rate": 1.7928489913301132e-05, "loss": 0.534, "step": 2838 }, { "epoch": 0.7185522652493039, "grad_norm": 0.14460812509059906, "learning_rate": 1.792703054149792e-05, "loss": 0.5089, "step": 2839 }, { "epoch": 0.7188053657301949, "grad_norm": 0.14507655799388885, "learning_rate": 1.792557071525486e-05, "loss": 0.5444, "step": 2840 }, { "epoch": 0.7190584662110858, "grad_norm": 0.149433434009552, "learning_rate": 1.7924110434655645e-05, "loss": 0.5198, "step": 2841 }, { "epoch": 0.7193115666919767, "grad_norm": 0.14685337245464325, "learning_rate": 1.7922649699783985e-05, "loss": 0.5351, "step": 2842 }, { "epoch": 0.7195646671728676, "grad_norm": 0.1567326933145523, "learning_rate": 1.792118851072363e-05, "loss": 0.5206, "step": 2843 }, { "epoch": 0.7198177676537585, "grad_norm": 0.14403514564037323, "learning_rate": 1.7919726867558333e-05, "loss": 0.5313, "step": 2844 }, { "epoch": 0.7200708681346495, "grad_norm": 0.14330951869487762, "learning_rate": 1.7918264770371897e-05, "loss": 0.5412, "step": 2845 }, { "epoch": 0.7203239686155404, "grad_norm": 0.18638013303279877, "learning_rate": 1.7916802219248136e-05, "loss": 0.5498, "step": 2846 }, { "epoch": 0.7205770690964313, "grad_norm": 0.15019692480564117, "learning_rate": 1.79153392142709e-05, "loss": 0.5252, "step": 2847 }, { "epoch": 0.7208301695773222, "grad_norm": 0.14442569017410278, "learning_rate": 1.7913875755524058e-05, "loss": 0.5343, "step": 2848 }, { "epoch": 0.7210832700582132, "grad_norm": 0.15182821452617645, "learning_rate": 1.7912411843091505e-05, "loss": 0.5427, "step": 2849 }, { "epoch": 0.721336370539104, "grad_norm": 0.1439221352338791, "learning_rate": 1.791094747705717e-05, "loss": 0.5196, "step": 2850 }, { "epoch": 0.7215894710199949, "grad_norm": 0.1714148223400116, "learning_rate": 1.7909482657504988e-05, "loss": 0.536, "step": 2851 }, { "epoch": 0.7218425715008858, "grad_norm": 0.14558939635753632, "learning_rate": 1.7908017384518946e-05, "loss": 0.5459, "step": 2852 }, { "epoch": 0.7220956719817767, "grad_norm": 0.1415179967880249, "learning_rate": 1.7906551658183047e-05, "loss": 0.5152, "step": 2853 }, { "epoch": 0.7223487724626677, "grad_norm": 0.16388234496116638, "learning_rate": 1.790508547858131e-05, "loss": 0.5399, "step": 2854 }, { "epoch": 0.7226018729435586, "grad_norm": 0.14092190563678741, "learning_rate": 1.7903618845797792e-05, "loss": 0.5031, "step": 2855 }, { "epoch": 0.7228549734244495, "grad_norm": 0.1417672336101532, "learning_rate": 1.790215175991657e-05, "loss": 0.5043, "step": 2856 }, { "epoch": 0.7231080739053404, "grad_norm": 0.14391210675239563, "learning_rate": 1.7900684221021747e-05, "loss": 0.5341, "step": 2857 }, { "epoch": 0.7233611743862314, "grad_norm": 0.1473853588104248, "learning_rate": 1.7899216229197463e-05, "loss": 0.5526, "step": 2858 }, { "epoch": 0.7236142748671223, "grad_norm": 0.14629141986370087, "learning_rate": 1.7897747784527864e-05, "loss": 0.5292, "step": 2859 }, { "epoch": 0.7238673753480132, "grad_norm": 0.14511753618717194, "learning_rate": 1.789627888709714e-05, "loss": 0.5424, "step": 2860 }, { "epoch": 0.724120475828904, "grad_norm": 0.1441185474395752, "learning_rate": 1.7894809536989498e-05, "loss": 0.5352, "step": 2861 }, { "epoch": 0.724373576309795, "grad_norm": 0.1520334780216217, "learning_rate": 1.789333973428917e-05, "loss": 0.5463, "step": 2862 }, { "epoch": 0.7246266767906859, "grad_norm": 0.1481693983078003, "learning_rate": 1.7891869479080418e-05, "loss": 0.5418, "step": 2863 }, { "epoch": 0.7248797772715768, "grad_norm": 0.15643611550331116, "learning_rate": 1.7890398771447534e-05, "loss": 0.5349, "step": 2864 }, { "epoch": 0.7251328777524677, "grad_norm": 0.1441963016986847, "learning_rate": 1.788892761147482e-05, "loss": 0.5285, "step": 2865 }, { "epoch": 0.7253859782333586, "grad_norm": 0.15031108260154724, "learning_rate": 1.7887455999246623e-05, "loss": 0.552, "step": 2866 }, { "epoch": 0.7256390787142496, "grad_norm": 0.1494501680135727, "learning_rate": 1.7885983934847307e-05, "loss": 0.5487, "step": 2867 }, { "epoch": 0.7258921791951405, "grad_norm": 0.14510177075862885, "learning_rate": 1.7884511418361256e-05, "loss": 0.5262, "step": 2868 }, { "epoch": 0.7261452796760314, "grad_norm": 0.15942813456058502, "learning_rate": 1.7883038449872892e-05, "loss": 0.5171, "step": 2869 }, { "epoch": 0.7263983801569223, "grad_norm": 0.1431449055671692, "learning_rate": 1.7881565029466656e-05, "loss": 0.5199, "step": 2870 }, { "epoch": 0.7266514806378133, "grad_norm": 0.14851166307926178, "learning_rate": 1.7880091157227015e-05, "loss": 0.5258, "step": 2871 }, { "epoch": 0.7269045811187042, "grad_norm": 0.1499960571527481, "learning_rate": 1.7878616833238466e-05, "loss": 0.5357, "step": 2872 }, { "epoch": 0.727157681599595, "grad_norm": 0.14485423266887665, "learning_rate": 1.7877142057585525e-05, "loss": 0.5071, "step": 2873 }, { "epoch": 0.7274107820804859, "grad_norm": 0.15008753538131714, "learning_rate": 1.7875666830352737e-05, "loss": 0.5372, "step": 2874 }, { "epoch": 0.7276638825613768, "grad_norm": 0.14161038398742676, "learning_rate": 1.787419115162468e-05, "loss": 0.5088, "step": 2875 }, { "epoch": 0.7279169830422678, "grad_norm": 0.14768251776695251, "learning_rate": 1.787271502148594e-05, "loss": 0.517, "step": 2876 }, { "epoch": 0.7281700835231587, "grad_norm": 0.15208204090595245, "learning_rate": 1.7871238440021154e-05, "loss": 0.5045, "step": 2877 }, { "epoch": 0.7284231840040496, "grad_norm": 0.14633716642856598, "learning_rate": 1.7869761407314966e-05, "loss": 0.5477, "step": 2878 }, { "epoch": 0.7286762844849405, "grad_norm": 0.14961759746074677, "learning_rate": 1.7868283923452046e-05, "loss": 0.5197, "step": 2879 }, { "epoch": 0.7289293849658315, "grad_norm": 0.1462971568107605, "learning_rate": 1.7866805988517102e-05, "loss": 0.5121, "step": 2880 }, { "epoch": 0.7291824854467224, "grad_norm": 0.14941370487213135, "learning_rate": 1.7865327602594855e-05, "loss": 0.5557, "step": 2881 }, { "epoch": 0.7294355859276133, "grad_norm": 0.1482287496328354, "learning_rate": 1.786384876577006e-05, "loss": 0.5476, "step": 2882 }, { "epoch": 0.7296886864085042, "grad_norm": 0.149621844291687, "learning_rate": 1.7862369478127502e-05, "loss": 0.5243, "step": 2883 }, { "epoch": 0.729941786889395, "grad_norm": 0.14481297135353088, "learning_rate": 1.786088973975198e-05, "loss": 0.5, "step": 2884 }, { "epoch": 0.730194887370286, "grad_norm": 0.14421936869621277, "learning_rate": 1.785940955072832e-05, "loss": 0.5492, "step": 2885 }, { "epoch": 0.7304479878511769, "grad_norm": 0.15094096958637238, "learning_rate": 1.7857928911141382e-05, "loss": 0.505, "step": 2886 }, { "epoch": 0.7307010883320678, "grad_norm": 0.14592532813549042, "learning_rate": 1.7856447821076052e-05, "loss": 0.5383, "step": 2887 }, { "epoch": 0.7309541888129587, "grad_norm": 0.1469985991716385, "learning_rate": 1.785496628061723e-05, "loss": 0.5283, "step": 2888 }, { "epoch": 0.7312072892938497, "grad_norm": 0.14522351324558258, "learning_rate": 1.7853484289849854e-05, "loss": 0.4775, "step": 2889 }, { "epoch": 0.7314603897747406, "grad_norm": 0.14416377246379852, "learning_rate": 1.7852001848858883e-05, "loss": 0.5045, "step": 2890 }, { "epoch": 0.7317134902556315, "grad_norm": 0.14876104891300201, "learning_rate": 1.7850518957729305e-05, "loss": 0.5208, "step": 2891 }, { "epoch": 0.7319665907365224, "grad_norm": 0.14898617565631866, "learning_rate": 1.784903561654612e-05, "loss": 0.5489, "step": 2892 }, { "epoch": 0.7322196912174133, "grad_norm": 0.1452137529850006, "learning_rate": 1.784755182539438e-05, "loss": 0.5454, "step": 2893 }, { "epoch": 0.7324727916983043, "grad_norm": 0.15900442004203796, "learning_rate": 1.7846067584359138e-05, "loss": 0.5291, "step": 2894 }, { "epoch": 0.7327258921791951, "grad_norm": 0.14678388833999634, "learning_rate": 1.7844582893525487e-05, "loss": 0.5292, "step": 2895 }, { "epoch": 0.732978992660086, "grad_norm": 0.1841592639684677, "learning_rate": 1.784309775297854e-05, "loss": 0.5152, "step": 2896 }, { "epoch": 0.7332320931409769, "grad_norm": 0.1804284304380417, "learning_rate": 1.7841612162803434e-05, "loss": 0.5267, "step": 2897 }, { "epoch": 0.7334851936218679, "grad_norm": 0.14499682188034058, "learning_rate": 1.7840126123085332e-05, "loss": 0.5372, "step": 2898 }, { "epoch": 0.7337382941027588, "grad_norm": 0.1472013294696808, "learning_rate": 1.7838639633909435e-05, "loss": 0.5349, "step": 2899 }, { "epoch": 0.7339913945836497, "grad_norm": 0.2258056253194809, "learning_rate": 1.783715269536096e-05, "loss": 0.5387, "step": 2900 }, { "epoch": 0.7342444950645406, "grad_norm": 0.14326857030391693, "learning_rate": 1.7835665307525137e-05, "loss": 0.5373, "step": 2901 }, { "epoch": 0.7344975955454315, "grad_norm": 0.14536747336387634, "learning_rate": 1.7834177470487242e-05, "loss": 0.5171, "step": 2902 }, { "epoch": 0.7347506960263225, "grad_norm": 0.15388627350330353, "learning_rate": 1.783268918433258e-05, "loss": 0.5477, "step": 2903 }, { "epoch": 0.7350037965072134, "grad_norm": 0.15929150581359863, "learning_rate": 1.7831200449146457e-05, "loss": 0.5249, "step": 2904 }, { "epoch": 0.7352568969881043, "grad_norm": 0.15158522129058838, "learning_rate": 1.7829711265014224e-05, "loss": 0.5385, "step": 2905 }, { "epoch": 0.7355099974689951, "grad_norm": 0.14774282276630402, "learning_rate": 1.782822163202125e-05, "loss": 0.5179, "step": 2906 }, { "epoch": 0.7357630979498861, "grad_norm": 0.14512324333190918, "learning_rate": 1.7826731550252943e-05, "loss": 0.5443, "step": 2907 }, { "epoch": 0.736016198430777, "grad_norm": 0.6089526414871216, "learning_rate": 1.7825241019794715e-05, "loss": 0.5556, "step": 2908 }, { "epoch": 0.7362692989116679, "grad_norm": 0.14531570672988892, "learning_rate": 1.7823750040732016e-05, "loss": 0.5472, "step": 2909 }, { "epoch": 0.7365223993925588, "grad_norm": 0.14886292815208435, "learning_rate": 1.7822258613150327e-05, "loss": 0.5265, "step": 2910 }, { "epoch": 0.7367754998734498, "grad_norm": 0.15178433060646057, "learning_rate": 1.7820766737135143e-05, "loss": 0.5366, "step": 2911 }, { "epoch": 0.7370286003543407, "grad_norm": 0.15800794959068298, "learning_rate": 1.781927441277199e-05, "loss": 0.5543, "step": 2912 }, { "epoch": 0.7372817008352316, "grad_norm": 0.14842455089092255, "learning_rate": 1.7817781640146426e-05, "loss": 0.5194, "step": 2913 }, { "epoch": 0.7375348013161225, "grad_norm": 0.1446068435907364, "learning_rate": 1.7816288419344018e-05, "loss": 0.5142, "step": 2914 }, { "epoch": 0.7377879017970134, "grad_norm": 0.1476709246635437, "learning_rate": 1.781479475045038e-05, "loss": 0.5527, "step": 2915 }, { "epoch": 0.7380410022779044, "grad_norm": 0.15049278736114502, "learning_rate": 1.7813300633551135e-05, "loss": 0.5229, "step": 2916 }, { "epoch": 0.7382941027587953, "grad_norm": 0.14928176999092102, "learning_rate": 1.7811806068731937e-05, "loss": 0.5459, "step": 2917 }, { "epoch": 0.7385472032396861, "grad_norm": 0.1485675424337387, "learning_rate": 1.7810311056078468e-05, "loss": 0.5202, "step": 2918 }, { "epoch": 0.738800303720577, "grad_norm": 0.14436054229736328, "learning_rate": 1.7808815595676434e-05, "loss": 0.5389, "step": 2919 }, { "epoch": 0.739053404201468, "grad_norm": 0.16016635298728943, "learning_rate": 1.7807319687611565e-05, "loss": 0.5204, "step": 2920 }, { "epoch": 0.7393065046823589, "grad_norm": 0.1503237783908844, "learning_rate": 1.7805823331969625e-05, "loss": 0.5428, "step": 2921 }, { "epoch": 0.7395596051632498, "grad_norm": 0.14970757067203522, "learning_rate": 1.7804326528836387e-05, "loss": 0.5139, "step": 2922 }, { "epoch": 0.7398127056441407, "grad_norm": 0.15020276606082916, "learning_rate": 1.7802829278297663e-05, "loss": 0.5323, "step": 2923 }, { "epoch": 0.7400658061250316, "grad_norm": 0.15152312815189362, "learning_rate": 1.7801331580439288e-05, "loss": 0.5466, "step": 2924 }, { "epoch": 0.7403189066059226, "grad_norm": 0.14687307178974152, "learning_rate": 1.7799833435347127e-05, "loss": 0.5373, "step": 2925 }, { "epoch": 0.7405720070868135, "grad_norm": 0.14877833425998688, "learning_rate": 1.7798334843107056e-05, "loss": 0.5114, "step": 2926 }, { "epoch": 0.7408251075677044, "grad_norm": 0.14333681762218475, "learning_rate": 1.779683580380499e-05, "loss": 0.5213, "step": 2927 }, { "epoch": 0.7410782080485953, "grad_norm": 0.15205039083957672, "learning_rate": 1.779533631752687e-05, "loss": 0.5585, "step": 2928 }, { "epoch": 0.7413313085294863, "grad_norm": 0.14635588228702545, "learning_rate": 1.7793836384358653e-05, "loss": 0.5142, "step": 2929 }, { "epoch": 0.7415844090103771, "grad_norm": 0.14950081706047058, "learning_rate": 1.779233600438633e-05, "loss": 0.5303, "step": 2930 }, { "epoch": 0.741837509491268, "grad_norm": 0.14850197732448578, "learning_rate": 1.7790835177695913e-05, "loss": 0.5561, "step": 2931 }, { "epoch": 0.7420906099721589, "grad_norm": 0.1456206738948822, "learning_rate": 1.7789333904373442e-05, "loss": 0.5173, "step": 2932 }, { "epoch": 0.7423437104530498, "grad_norm": 0.1439744085073471, "learning_rate": 1.778783218450498e-05, "loss": 0.4964, "step": 2933 }, { "epoch": 0.7425968109339408, "grad_norm": 0.15093915164470673, "learning_rate": 1.7786330018176617e-05, "loss": 0.5184, "step": 2934 }, { "epoch": 0.7428499114148317, "grad_norm": 0.15115399658679962, "learning_rate": 1.7784827405474472e-05, "loss": 0.5649, "step": 2935 }, { "epoch": 0.7431030118957226, "grad_norm": 0.1543717235326767, "learning_rate": 1.7783324346484687e-05, "loss": 0.5345, "step": 2936 }, { "epoch": 0.7433561123766135, "grad_norm": 0.16179496049880981, "learning_rate": 1.7781820841293426e-05, "loss": 0.5542, "step": 2937 }, { "epoch": 0.7436092128575045, "grad_norm": 0.1514633148908615, "learning_rate": 1.7780316889986883e-05, "loss": 0.5284, "step": 2938 }, { "epoch": 0.7438623133383954, "grad_norm": 0.15350931882858276, "learning_rate": 1.7778812492651275e-05, "loss": 0.5345, "step": 2939 }, { "epoch": 0.7441154138192863, "grad_norm": 0.1504724770784378, "learning_rate": 1.777730764937285e-05, "loss": 0.5424, "step": 2940 }, { "epoch": 0.7443685143001771, "grad_norm": 0.14843443036079407, "learning_rate": 1.7775802360237877e-05, "loss": 0.5354, "step": 2941 }, { "epoch": 0.744621614781068, "grad_norm": 0.1463453620672226, "learning_rate": 1.7774296625332647e-05, "loss": 0.5058, "step": 2942 }, { "epoch": 0.744874715261959, "grad_norm": 0.15474829077720642, "learning_rate": 1.7772790444743485e-05, "loss": 0.5209, "step": 2943 }, { "epoch": 0.7451278157428499, "grad_norm": 0.14824678003787994, "learning_rate": 1.7771283818556727e-05, "loss": 0.5152, "step": 2944 }, { "epoch": 0.7453809162237408, "grad_norm": 0.15518391132354736, "learning_rate": 1.776977674685876e-05, "loss": 0.5217, "step": 2945 }, { "epoch": 0.7456340167046317, "grad_norm": 0.14659546315670013, "learning_rate": 1.776826922973597e-05, "loss": 0.5318, "step": 2946 }, { "epoch": 0.7458871171855227, "grad_norm": 0.1435319483280182, "learning_rate": 1.7766761267274786e-05, "loss": 0.5375, "step": 2947 }, { "epoch": 0.7461402176664136, "grad_norm": 0.14836201071739197, "learning_rate": 1.7765252859561655e-05, "loss": 0.5419, "step": 2948 }, { "epoch": 0.7463933181473045, "grad_norm": 0.15242090821266174, "learning_rate": 1.7763744006683047e-05, "loss": 0.5492, "step": 2949 }, { "epoch": 0.7466464186281954, "grad_norm": 0.14794063568115234, "learning_rate": 1.7762234708725464e-05, "loss": 0.509, "step": 2950 }, { "epoch": 0.7468995191090863, "grad_norm": 0.14903810620307922, "learning_rate": 1.776072496577543e-05, "loss": 0.5268, "step": 2951 }, { "epoch": 0.7471526195899773, "grad_norm": 0.14988981187343597, "learning_rate": 1.7759214777919496e-05, "loss": 0.5397, "step": 2952 }, { "epoch": 0.7474057200708681, "grad_norm": 0.23774631321430206, "learning_rate": 1.7757704145244238e-05, "loss": 0.5274, "step": 2953 }, { "epoch": 0.747658820551759, "grad_norm": 0.16623346507549286, "learning_rate": 1.7756193067836262e-05, "loss": 0.5404, "step": 2954 }, { "epoch": 0.7479119210326499, "grad_norm": 0.1483059525489807, "learning_rate": 1.7754681545782186e-05, "loss": 0.533, "step": 2955 }, { "epoch": 0.7481650215135409, "grad_norm": 0.15368886291980743, "learning_rate": 1.7753169579168664e-05, "loss": 0.532, "step": 2956 }, { "epoch": 0.7484181219944318, "grad_norm": 0.14759455621242523, "learning_rate": 1.7751657168082383e-05, "loss": 0.5253, "step": 2957 }, { "epoch": 0.7486712224753227, "grad_norm": 0.1529926359653473, "learning_rate": 1.7750144312610033e-05, "loss": 0.5471, "step": 2958 }, { "epoch": 0.7489243229562136, "grad_norm": 0.14726297557353973, "learning_rate": 1.7748631012838353e-05, "loss": 0.5447, "step": 2959 }, { "epoch": 0.7491774234371046, "grad_norm": 0.14679226279258728, "learning_rate": 1.7747117268854093e-05, "loss": 0.5019, "step": 2960 }, { "epoch": 0.7494305239179955, "grad_norm": 0.14927375316619873, "learning_rate": 1.7745603080744032e-05, "loss": 0.5083, "step": 2961 }, { "epoch": 0.7496836243988864, "grad_norm": 0.14736905694007874, "learning_rate": 1.774408844859498e-05, "loss": 0.5464, "step": 2962 }, { "epoch": 0.7499367248797773, "grad_norm": 0.16927950084209442, "learning_rate": 1.7742573372493765e-05, "loss": 0.522, "step": 2963 }, { "epoch": 0.7501898253606681, "grad_norm": 0.14778970181941986, "learning_rate": 1.7741057852527238e-05, "loss": 0.5143, "step": 2964 }, { "epoch": 0.7504429258415591, "grad_norm": 0.1510339081287384, "learning_rate": 1.7739541888782287e-05, "loss": 0.5104, "step": 2965 }, { "epoch": 0.75069602632245, "grad_norm": 0.14848393201828003, "learning_rate": 1.773802548134582e-05, "loss": 0.5317, "step": 2966 }, { "epoch": 0.7509491268033409, "grad_norm": 0.14839640259742737, "learning_rate": 1.7736508630304762e-05, "loss": 0.5395, "step": 2967 }, { "epoch": 0.7512022272842318, "grad_norm": 0.14730995893478394, "learning_rate": 1.7734991335746077e-05, "loss": 0.5198, "step": 2968 }, { "epoch": 0.7514553277651228, "grad_norm": 0.15580318868160248, "learning_rate": 1.7733473597756746e-05, "loss": 0.5475, "step": 2969 }, { "epoch": 0.7517084282460137, "grad_norm": 0.2519559860229492, "learning_rate": 1.7731955416423778e-05, "loss": 0.5294, "step": 2970 }, { "epoch": 0.7519615287269046, "grad_norm": 0.14028480648994446, "learning_rate": 1.773043679183421e-05, "loss": 0.5416, "step": 2971 }, { "epoch": 0.7522146292077955, "grad_norm": 0.14994113147258759, "learning_rate": 1.7728917724075096e-05, "loss": 0.5408, "step": 2972 }, { "epoch": 0.7524677296886864, "grad_norm": 0.14393679797649384, "learning_rate": 1.7727398213233525e-05, "loss": 0.5421, "step": 2973 }, { "epoch": 0.7527208301695774, "grad_norm": 0.14897584915161133, "learning_rate": 1.7725878259396605e-05, "loss": 0.5341, "step": 2974 }, { "epoch": 0.7529739306504682, "grad_norm": 0.16702575981616974, "learning_rate": 1.7724357862651474e-05, "loss": 0.5156, "step": 2975 }, { "epoch": 0.7532270311313591, "grad_norm": 0.15600398182868958, "learning_rate": 1.772283702308529e-05, "loss": 0.5292, "step": 2976 }, { "epoch": 0.75348013161225, "grad_norm": 0.15482543408870697, "learning_rate": 1.7721315740785244e-05, "loss": 0.5635, "step": 2977 }, { "epoch": 0.753733232093141, "grad_norm": 0.14389260113239288, "learning_rate": 1.7719794015838547e-05, "loss": 0.5263, "step": 2978 }, { "epoch": 0.7539863325740319, "grad_norm": 0.14956559240818024, "learning_rate": 1.771827184833243e-05, "loss": 0.5106, "step": 2979 }, { "epoch": 0.7542394330549228, "grad_norm": 0.1460314691066742, "learning_rate": 1.771674923835416e-05, "loss": 0.5413, "step": 2980 }, { "epoch": 0.7544925335358137, "grad_norm": 0.15577438473701477, "learning_rate": 1.7715226185991028e-05, "loss": 0.537, "step": 2981 }, { "epoch": 0.7547456340167046, "grad_norm": 0.17240791022777557, "learning_rate": 1.7713702691330346e-05, "loss": 0.5057, "step": 2982 }, { "epoch": 0.7549987344975956, "grad_norm": 0.14926257729530334, "learning_rate": 1.771217875445945e-05, "loss": 0.5397, "step": 2983 }, { "epoch": 0.7552518349784865, "grad_norm": 0.1437961757183075, "learning_rate": 1.7710654375465705e-05, "loss": 0.5347, "step": 2984 }, { "epoch": 0.7555049354593774, "grad_norm": 0.14255636930465698, "learning_rate": 1.77091295544365e-05, "loss": 0.5273, "step": 2985 }, { "epoch": 0.7557580359402682, "grad_norm": 0.1450057029724121, "learning_rate": 1.770760429145925e-05, "loss": 0.5289, "step": 2986 }, { "epoch": 0.7560111364211592, "grad_norm": 0.14657136797904968, "learning_rate": 1.77060785866214e-05, "loss": 0.5278, "step": 2987 }, { "epoch": 0.7562642369020501, "grad_norm": 0.1571706384420395, "learning_rate": 1.7704552440010406e-05, "loss": 0.5494, "step": 2988 }, { "epoch": 0.756517337382941, "grad_norm": 0.1852877289056778, "learning_rate": 1.7703025851713768e-05, "loss": 0.5202, "step": 2989 }, { "epoch": 0.7567704378638319, "grad_norm": 0.15663079917430878, "learning_rate": 1.7701498821818993e-05, "loss": 0.525, "step": 2990 }, { "epoch": 0.7570235383447228, "grad_norm": 0.1503055989742279, "learning_rate": 1.769997135041363e-05, "loss": 0.5151, "step": 2991 }, { "epoch": 0.7572766388256138, "grad_norm": 0.15526209771633148, "learning_rate": 1.7698443437585244e-05, "loss": 0.5367, "step": 2992 }, { "epoch": 0.7575297393065047, "grad_norm": 0.1541331559419632, "learning_rate": 1.7696915083421427e-05, "loss": 0.5565, "step": 2993 }, { "epoch": 0.7577828397873956, "grad_norm": 0.15073953568935394, "learning_rate": 1.769538628800979e-05, "loss": 0.5038, "step": 2994 }, { "epoch": 0.7580359402682865, "grad_norm": 0.16339759528636932, "learning_rate": 1.7693857051437986e-05, "loss": 0.5062, "step": 2995 }, { "epoch": 0.7582890407491775, "grad_norm": 0.15231944620609283, "learning_rate": 1.7692327373793675e-05, "loss": 0.5424, "step": 2996 }, { "epoch": 0.7585421412300684, "grad_norm": 0.1665373295545578, "learning_rate": 1.7690797255164557e-05, "loss": 0.505, "step": 2997 }, { "epoch": 0.7587952417109592, "grad_norm": 0.1562441736459732, "learning_rate": 1.7689266695638345e-05, "loss": 0.5234, "step": 2998 }, { "epoch": 0.7590483421918501, "grad_norm": 0.1558438539505005, "learning_rate": 1.768773569530278e-05, "loss": 0.5427, "step": 2999 }, { "epoch": 0.7593014426727411, "grad_norm": 0.1525963395833969, "learning_rate": 1.7686204254245638e-05, "loss": 0.5334, "step": 3000 }, { "epoch": 0.759554543153632, "grad_norm": 0.15848256647586823, "learning_rate": 1.768467237255471e-05, "loss": 0.5222, "step": 3001 }, { "epoch": 0.7598076436345229, "grad_norm": 0.14888495206832886, "learning_rate": 1.7683140050317816e-05, "loss": 0.5297, "step": 3002 }, { "epoch": 0.7600607441154138, "grad_norm": 0.15064217150211334, "learning_rate": 1.76816072876228e-05, "loss": 0.5171, "step": 3003 }, { "epoch": 0.7603138445963047, "grad_norm": 0.15602967143058777, "learning_rate": 1.7680074084557533e-05, "loss": 0.5343, "step": 3004 }, { "epoch": 0.7605669450771957, "grad_norm": 0.150221049785614, "learning_rate": 1.7678540441209914e-05, "loss": 0.5375, "step": 3005 }, { "epoch": 0.7608200455580866, "grad_norm": 0.15444840490818024, "learning_rate": 1.7677006357667855e-05, "loss": 0.5289, "step": 3006 }, { "epoch": 0.7610731460389775, "grad_norm": 0.15188942849636078, "learning_rate": 1.7675471834019307e-05, "loss": 0.5396, "step": 3007 }, { "epoch": 0.7613262465198684, "grad_norm": 0.15647144615650177, "learning_rate": 1.767393687035224e-05, "loss": 0.5393, "step": 3008 }, { "epoch": 0.7615793470007594, "grad_norm": 0.15710274875164032, "learning_rate": 1.7672401466754653e-05, "loss": 0.551, "step": 3009 }, { "epoch": 0.7618324474816502, "grad_norm": 0.14301566779613495, "learning_rate": 1.7670865623314563e-05, "loss": 0.5356, "step": 3010 }, { "epoch": 0.7620855479625411, "grad_norm": 0.15355347096920013, "learning_rate": 1.766932934012002e-05, "loss": 0.5102, "step": 3011 }, { "epoch": 0.762338648443432, "grad_norm": 0.14850030839443207, "learning_rate": 1.766779261725909e-05, "loss": 0.5363, "step": 3012 }, { "epoch": 0.7625917489243229, "grad_norm": 0.14782600104808807, "learning_rate": 1.766625545481988e-05, "loss": 0.5195, "step": 3013 }, { "epoch": 0.7628448494052139, "grad_norm": 0.15266305208206177, "learning_rate": 1.7664717852890506e-05, "loss": 0.5204, "step": 3014 }, { "epoch": 0.7630979498861048, "grad_norm": 0.15124641358852386, "learning_rate": 1.7663179811559112e-05, "loss": 0.5319, "step": 3015 }, { "epoch": 0.7633510503669957, "grad_norm": 0.15111200511455536, "learning_rate": 1.7661641330913878e-05, "loss": 0.5351, "step": 3016 }, { "epoch": 0.7636041508478866, "grad_norm": 0.15270401537418365, "learning_rate": 1.7660102411042998e-05, "loss": 0.5143, "step": 3017 }, { "epoch": 0.7638572513287776, "grad_norm": 0.15359775722026825, "learning_rate": 1.7658563052034697e-05, "loss": 0.5495, "step": 3018 }, { "epoch": 0.7641103518096685, "grad_norm": 0.14975665509700775, "learning_rate": 1.7657023253977222e-05, "loss": 0.5194, "step": 3019 }, { "epoch": 0.7643634522905594, "grad_norm": 0.1495814472436905, "learning_rate": 1.7655483016958844e-05, "loss": 0.5133, "step": 3020 }, { "epoch": 0.7646165527714502, "grad_norm": 0.1468273401260376, "learning_rate": 1.7653942341067867e-05, "loss": 0.5532, "step": 3021 }, { "epoch": 0.7648696532523411, "grad_norm": 0.2196628600358963, "learning_rate": 1.7652401226392608e-05, "loss": 0.5279, "step": 3022 }, { "epoch": 0.7651227537332321, "grad_norm": 0.1500619500875473, "learning_rate": 1.765085967302142e-05, "loss": 0.5471, "step": 3023 }, { "epoch": 0.765375854214123, "grad_norm": 0.14568065106868744, "learning_rate": 1.7649317681042676e-05, "loss": 0.5312, "step": 3024 }, { "epoch": 0.7656289546950139, "grad_norm": 0.15396185219287872, "learning_rate": 1.764777525054478e-05, "loss": 0.5609, "step": 3025 }, { "epoch": 0.7658820551759048, "grad_norm": 0.16178679466247559, "learning_rate": 1.7646232381616147e-05, "loss": 0.5326, "step": 3026 }, { "epoch": 0.7661351556567958, "grad_norm": 0.14604564011096954, "learning_rate": 1.764468907434523e-05, "loss": 0.5096, "step": 3027 }, { "epoch": 0.7663882561376867, "grad_norm": 0.15339668095111847, "learning_rate": 1.7643145328820508e-05, "loss": 0.5429, "step": 3028 }, { "epoch": 0.7666413566185776, "grad_norm": 0.177531898021698, "learning_rate": 1.7641601145130476e-05, "loss": 0.5295, "step": 3029 }, { "epoch": 0.7668944570994685, "grad_norm": 0.14776462316513062, "learning_rate": 1.764005652336366e-05, "loss": 0.522, "step": 3030 }, { "epoch": 0.7671475575803594, "grad_norm": 0.15326423943042755, "learning_rate": 1.763851146360861e-05, "loss": 0.5607, "step": 3031 }, { "epoch": 0.7674006580612504, "grad_norm": 0.1485346108675003, "learning_rate": 1.76369659659539e-05, "loss": 0.5401, "step": 3032 }, { "epoch": 0.7676537585421412, "grad_norm": 0.15064039826393127, "learning_rate": 1.7635420030488136e-05, "loss": 0.5439, "step": 3033 }, { "epoch": 0.7679068590230321, "grad_norm": 0.16441243886947632, "learning_rate": 1.7633873657299932e-05, "loss": 0.5329, "step": 3034 }, { "epoch": 0.768159959503923, "grad_norm": 0.1476014405488968, "learning_rate": 1.7632326846477946e-05, "loss": 0.5312, "step": 3035 }, { "epoch": 0.768413059984814, "grad_norm": 0.15498928725719452, "learning_rate": 1.7630779598110855e-05, "loss": 0.5561, "step": 3036 }, { "epoch": 0.7686661604657049, "grad_norm": 0.14413794875144958, "learning_rate": 1.7629231912287355e-05, "loss": 0.5048, "step": 3037 }, { "epoch": 0.7689192609465958, "grad_norm": 0.14934541285037994, "learning_rate": 1.762768378909617e-05, "loss": 0.5343, "step": 3038 }, { "epoch": 0.7691723614274867, "grad_norm": 0.1492408812046051, "learning_rate": 1.7626135228626057e-05, "loss": 0.5277, "step": 3039 }, { "epoch": 0.7694254619083776, "grad_norm": 0.14385581016540527, "learning_rate": 1.7624586230965785e-05, "loss": 0.5309, "step": 3040 }, { "epoch": 0.7696785623892686, "grad_norm": 0.16025836765766144, "learning_rate": 1.762303679620416e-05, "loss": 0.4861, "step": 3041 }, { "epoch": 0.7699316628701595, "grad_norm": 0.15009579062461853, "learning_rate": 1.7621486924430006e-05, "loss": 0.5447, "step": 3042 }, { "epoch": 0.7701847633510503, "grad_norm": 0.17060710489749908, "learning_rate": 1.7619936615732172e-05, "loss": 0.5049, "step": 3043 }, { "epoch": 0.7704378638319412, "grad_norm": 0.151773139834404, "learning_rate": 1.761838587019954e-05, "loss": 0.541, "step": 3044 }, { "epoch": 0.7706909643128322, "grad_norm": 0.14400655031204224, "learning_rate": 1.7616834687921e-05, "loss": 0.5236, "step": 3045 }, { "epoch": 0.7709440647937231, "grad_norm": 0.14931342005729675, "learning_rate": 1.7615283068985488e-05, "loss": 0.5249, "step": 3046 }, { "epoch": 0.771197165274614, "grad_norm": 0.1493956744670868, "learning_rate": 1.761373101348195e-05, "loss": 0.5402, "step": 3047 }, { "epoch": 0.7714502657555049, "grad_norm": 0.14991344511508942, "learning_rate": 1.7612178521499368e-05, "loss": 0.5284, "step": 3048 }, { "epoch": 0.7717033662363959, "grad_norm": 0.17889820039272308, "learning_rate": 1.7610625593126736e-05, "loss": 0.5188, "step": 3049 }, { "epoch": 0.7719564667172868, "grad_norm": 0.14607857167720795, "learning_rate": 1.760907222845308e-05, "loss": 0.5412, "step": 3050 }, { "epoch": 0.7722095671981777, "grad_norm": 0.1670723855495453, "learning_rate": 1.760751842756746e-05, "loss": 0.5264, "step": 3051 }, { "epoch": 0.7724626676790686, "grad_norm": 0.15447920560836792, "learning_rate": 1.760596419055894e-05, "loss": 0.5309, "step": 3052 }, { "epoch": 0.7727157681599595, "grad_norm": 0.14942969381809235, "learning_rate": 1.760440951751663e-05, "loss": 0.5255, "step": 3053 }, { "epoch": 0.7729688686408505, "grad_norm": 0.14934472739696503, "learning_rate": 1.760285440852965e-05, "loss": 0.5333, "step": 3054 }, { "epoch": 0.7732219691217413, "grad_norm": 0.14957554638385773, "learning_rate": 1.7601298863687158e-05, "loss": 0.5171, "step": 3055 }, { "epoch": 0.7734750696026322, "grad_norm": 0.16215239465236664, "learning_rate": 1.7599742883078324e-05, "loss": 0.5404, "step": 3056 }, { "epoch": 0.7737281700835231, "grad_norm": 0.14555588364601135, "learning_rate": 1.7598186466792355e-05, "loss": 0.5077, "step": 3057 }, { "epoch": 0.7739812705644141, "grad_norm": 0.15099303424358368, "learning_rate": 1.759662961491847e-05, "loss": 0.5154, "step": 3058 }, { "epoch": 0.774234371045305, "grad_norm": 0.16148875653743744, "learning_rate": 1.7595072327545923e-05, "loss": 0.5071, "step": 3059 }, { "epoch": 0.7744874715261959, "grad_norm": 0.15326759219169617, "learning_rate": 1.759351460476399e-05, "loss": 0.5523, "step": 3060 }, { "epoch": 0.7747405720070868, "grad_norm": 0.13800647854804993, "learning_rate": 1.7591956446661974e-05, "loss": 0.5073, "step": 3061 }, { "epoch": 0.7749936724879777, "grad_norm": 0.15216606855392456, "learning_rate": 1.7590397853329203e-05, "loss": 0.54, "step": 3062 }, { "epoch": 0.7752467729688687, "grad_norm": 0.14922036230564117, "learning_rate": 1.758883882485502e-05, "loss": 0.5434, "step": 3063 }, { "epoch": 0.7754998734497596, "grad_norm": 0.14790558815002441, "learning_rate": 1.7587279361328805e-05, "loss": 0.5239, "step": 3064 }, { "epoch": 0.7757529739306505, "grad_norm": 0.14883247017860413, "learning_rate": 1.758571946283996e-05, "loss": 0.5324, "step": 3065 }, { "epoch": 0.7760060744115413, "grad_norm": 0.16245825588703156, "learning_rate": 1.7584159129477908e-05, "loss": 0.5258, "step": 3066 }, { "epoch": 0.7762591748924323, "grad_norm": 0.14453540742397308, "learning_rate": 1.7582598361332103e-05, "loss": 0.5489, "step": 3067 }, { "epoch": 0.7765122753733232, "grad_norm": 0.1497040092945099, "learning_rate": 1.7581037158492015e-05, "loss": 0.5344, "step": 3068 }, { "epoch": 0.7767653758542141, "grad_norm": 0.15549802780151367, "learning_rate": 1.7579475521047152e-05, "loss": 0.5162, "step": 3069 }, { "epoch": 0.777018476335105, "grad_norm": 0.1497444063425064, "learning_rate": 1.7577913449087028e-05, "loss": 0.524, "step": 3070 }, { "epoch": 0.7772715768159959, "grad_norm": 0.1515781432390213, "learning_rate": 1.757635094270121e-05, "loss": 0.5101, "step": 3071 }, { "epoch": 0.7775246772968869, "grad_norm": 0.15219537913799286, "learning_rate": 1.7574788001979255e-05, "loss": 0.5372, "step": 3072 }, { "epoch": 0.7777777777777778, "grad_norm": 0.1431799679994583, "learning_rate": 1.7573224627010778e-05, "loss": 0.5365, "step": 3073 }, { "epoch": 0.7780308782586687, "grad_norm": 0.14396248757839203, "learning_rate": 1.75716608178854e-05, "loss": 0.5347, "step": 3074 }, { "epoch": 0.7782839787395596, "grad_norm": 0.1475006639957428, "learning_rate": 1.757009657469276e-05, "loss": 0.5199, "step": 3075 }, { "epoch": 0.7785370792204506, "grad_norm": 0.14710633456707, "learning_rate": 1.756853189752255e-05, "loss": 0.5318, "step": 3076 }, { "epoch": 0.7787901797013415, "grad_norm": 0.14377863705158234, "learning_rate": 1.7566966786464457e-05, "loss": 0.5217, "step": 3077 }, { "epoch": 0.7790432801822323, "grad_norm": 0.14841139316558838, "learning_rate": 1.7565401241608205e-05, "loss": 0.5151, "step": 3078 }, { "epoch": 0.7792963806631232, "grad_norm": 0.14787758886814117, "learning_rate": 1.7563835263043556e-05, "loss": 0.5102, "step": 3079 }, { "epoch": 0.7795494811440141, "grad_norm": 0.15038910508155823, "learning_rate": 1.756226885086027e-05, "loss": 0.5221, "step": 3080 }, { "epoch": 0.7798025816249051, "grad_norm": 0.14832082390785217, "learning_rate": 1.7560702005148156e-05, "loss": 0.5488, "step": 3081 }, { "epoch": 0.780055682105796, "grad_norm": 0.17327068746089935, "learning_rate": 1.7559134725997034e-05, "loss": 0.541, "step": 3082 }, { "epoch": 0.7803087825866869, "grad_norm": 0.17171186208724976, "learning_rate": 1.755756701349675e-05, "loss": 0.5284, "step": 3083 }, { "epoch": 0.7805618830675778, "grad_norm": 0.14251437783241272, "learning_rate": 1.7555998867737184e-05, "loss": 0.5228, "step": 3084 }, { "epoch": 0.7808149835484688, "grad_norm": 0.14049778878688812, "learning_rate": 1.7554430288808228e-05, "loss": 0.5065, "step": 3085 }, { "epoch": 0.7810680840293597, "grad_norm": 0.1463402658700943, "learning_rate": 1.7552861276799812e-05, "loss": 0.5221, "step": 3086 }, { "epoch": 0.7813211845102506, "grad_norm": 0.15021516382694244, "learning_rate": 1.7551291831801876e-05, "loss": 0.5149, "step": 3087 }, { "epoch": 0.7815742849911415, "grad_norm": 0.14419084787368774, "learning_rate": 1.75497219539044e-05, "loss": 0.5424, "step": 3088 }, { "epoch": 0.7818273854720323, "grad_norm": 0.1514299064874649, "learning_rate": 1.754815164319738e-05, "loss": 0.5362, "step": 3089 }, { "epoch": 0.7820804859529233, "grad_norm": 0.18080902099609375, "learning_rate": 1.754658089977084e-05, "loss": 0.5379, "step": 3090 }, { "epoch": 0.7823335864338142, "grad_norm": 0.1468861997127533, "learning_rate": 1.754500972371482e-05, "loss": 0.5347, "step": 3091 }, { "epoch": 0.7825866869147051, "grad_norm": 0.1527579426765442, "learning_rate": 1.7543438115119397e-05, "loss": 0.5425, "step": 3092 }, { "epoch": 0.782839787395596, "grad_norm": 0.14295974373817444, "learning_rate": 1.754186607407467e-05, "loss": 0.54, "step": 3093 }, { "epoch": 0.783092887876487, "grad_norm": 0.1470569223165512, "learning_rate": 1.7540293600670758e-05, "loss": 0.5512, "step": 3094 }, { "epoch": 0.7833459883573779, "grad_norm": 0.1645229160785675, "learning_rate": 1.7538720694997814e-05, "loss": 0.5642, "step": 3095 }, { "epoch": 0.7835990888382688, "grad_norm": 0.41865524649620056, "learning_rate": 1.7537147357145998e-05, "loss": 0.5418, "step": 3096 }, { "epoch": 0.7838521893191597, "grad_norm": 0.1472993940114975, "learning_rate": 1.7535573587205514e-05, "loss": 0.5286, "step": 3097 }, { "epoch": 0.7841052898000507, "grad_norm": 0.14632809162139893, "learning_rate": 1.7533999385266582e-05, "loss": 0.5716, "step": 3098 }, { "epoch": 0.7843583902809416, "grad_norm": 0.14350329339504242, "learning_rate": 1.7532424751419445e-05, "loss": 0.5391, "step": 3099 }, { "epoch": 0.7846114907618325, "grad_norm": 0.16144879162311554, "learning_rate": 1.7530849685754377e-05, "loss": 0.5133, "step": 3100 }, { "epoch": 0.7848645912427233, "grad_norm": 0.14275042712688446, "learning_rate": 1.7529274188361673e-05, "loss": 0.5426, "step": 3101 }, { "epoch": 0.7851176917236142, "grad_norm": 0.15260683000087738, "learning_rate": 1.7527698259331645e-05, "loss": 0.5563, "step": 3102 }, { "epoch": 0.7853707922045052, "grad_norm": 0.16672103106975555, "learning_rate": 1.7526121898754648e-05, "loss": 0.5346, "step": 3103 }, { "epoch": 0.7856238926853961, "grad_norm": 0.14481595158576965, "learning_rate": 1.752454510672105e-05, "loss": 0.5507, "step": 3104 }, { "epoch": 0.785876993166287, "grad_norm": 0.1431320309638977, "learning_rate": 1.7522967883321236e-05, "loss": 0.5097, "step": 3105 }, { "epoch": 0.7861300936471779, "grad_norm": 0.14806000888347626, "learning_rate": 1.7521390228645635e-05, "loss": 0.5266, "step": 3106 }, { "epoch": 0.7863831941280689, "grad_norm": 0.1451040655374527, "learning_rate": 1.7519812142784687e-05, "loss": 0.5436, "step": 3107 }, { "epoch": 0.7866362946089598, "grad_norm": 0.14261282980442047, "learning_rate": 1.751823362582886e-05, "loss": 0.5431, "step": 3108 }, { "epoch": 0.7868893950898507, "grad_norm": 0.1590554565191269, "learning_rate": 1.7516654677868645e-05, "loss": 0.538, "step": 3109 }, { "epoch": 0.7871424955707416, "grad_norm": 0.16258887946605682, "learning_rate": 1.7515075298994566e-05, "loss": 0.5309, "step": 3110 }, { "epoch": 0.7873955960516325, "grad_norm": 0.14656253159046173, "learning_rate": 1.7513495489297158e-05, "loss": 0.506, "step": 3111 }, { "epoch": 0.7876486965325235, "grad_norm": 0.14780651032924652, "learning_rate": 1.7511915248866993e-05, "loss": 0.5291, "step": 3112 }, { "epoch": 0.7879017970134143, "grad_norm": 0.2545357644557953, "learning_rate": 1.7510334577794662e-05, "loss": 0.5201, "step": 3113 }, { "epoch": 0.7881548974943052, "grad_norm": 0.1433885544538498, "learning_rate": 1.7508753476170778e-05, "loss": 0.521, "step": 3114 }, { "epoch": 0.7884079979751961, "grad_norm": 0.1426301747560501, "learning_rate": 1.750717194408599e-05, "loss": 0.5202, "step": 3115 }, { "epoch": 0.7886610984560871, "grad_norm": 0.14188823103904724, "learning_rate": 1.7505589981630955e-05, "loss": 0.5006, "step": 3116 }, { "epoch": 0.788914198936978, "grad_norm": 0.15551066398620605, "learning_rate": 1.7504007588896366e-05, "loss": 0.5568, "step": 3117 }, { "epoch": 0.7891672994178689, "grad_norm": 0.1495133489370346, "learning_rate": 1.7502424765972944e-05, "loss": 0.5346, "step": 3118 }, { "epoch": 0.7894203998987598, "grad_norm": 0.14981497824192047, "learning_rate": 1.7500841512951422e-05, "loss": 0.5241, "step": 3119 }, { "epoch": 0.7896735003796507, "grad_norm": 0.14472204446792603, "learning_rate": 1.7499257829922573e-05, "loss": 0.5329, "step": 3120 }, { "epoch": 0.7899266008605417, "grad_norm": 0.1536359339952469, "learning_rate": 1.7497673716977174e-05, "loss": 0.5196, "step": 3121 }, { "epoch": 0.7901797013414326, "grad_norm": 0.23148512840270996, "learning_rate": 1.7496089174206044e-05, "loss": 0.5343, "step": 3122 }, { "epoch": 0.7904328018223234, "grad_norm": 0.14624980092048645, "learning_rate": 1.7494504201700026e-05, "loss": 0.5281, "step": 3123 }, { "epoch": 0.7906859023032143, "grad_norm": 0.14661769568920135, "learning_rate": 1.7492918799549977e-05, "loss": 0.5172, "step": 3124 }, { "epoch": 0.7909390027841053, "grad_norm": 0.14322149753570557, "learning_rate": 1.7491332967846792e-05, "loss": 0.5349, "step": 3125 }, { "epoch": 0.7911921032649962, "grad_norm": 0.14741265773773193, "learning_rate": 1.7489746706681376e-05, "loss": 0.5275, "step": 3126 }, { "epoch": 0.7914452037458871, "grad_norm": 0.14919212460517883, "learning_rate": 1.7488160016144672e-05, "loss": 0.5266, "step": 3127 }, { "epoch": 0.791698304226778, "grad_norm": 0.14996758103370667, "learning_rate": 1.7486572896327635e-05, "loss": 0.541, "step": 3128 }, { "epoch": 0.7919514047076689, "grad_norm": 0.15300825238227844, "learning_rate": 1.7484985347321254e-05, "loss": 0.5179, "step": 3129 }, { "epoch": 0.7922045051885599, "grad_norm": 0.15273669362068176, "learning_rate": 1.7483397369216543e-05, "loss": 0.5135, "step": 3130 }, { "epoch": 0.7924576056694508, "grad_norm": 0.14794354140758514, "learning_rate": 1.7481808962104536e-05, "loss": 0.5397, "step": 3131 }, { "epoch": 0.7927107061503417, "grad_norm": 0.14445775747299194, "learning_rate": 1.7480220126076287e-05, "loss": 0.5334, "step": 3132 }, { "epoch": 0.7929638066312326, "grad_norm": 0.15021532773971558, "learning_rate": 1.747863086122289e-05, "loss": 0.5534, "step": 3133 }, { "epoch": 0.7932169071121236, "grad_norm": 0.14712731540203094, "learning_rate": 1.7477041167635448e-05, "loss": 0.5136, "step": 3134 }, { "epoch": 0.7934700075930144, "grad_norm": 0.14748620986938477, "learning_rate": 1.7475451045405098e-05, "loss": 0.5219, "step": 3135 }, { "epoch": 0.7937231080739053, "grad_norm": 0.14623963832855225, "learning_rate": 1.7473860494622995e-05, "loss": 0.5267, "step": 3136 }, { "epoch": 0.7939762085547962, "grad_norm": 0.14472338557243347, "learning_rate": 1.7472269515380325e-05, "loss": 0.5269, "step": 3137 }, { "epoch": 0.7942293090356871, "grad_norm": 0.14391745626926422, "learning_rate": 1.747067810776829e-05, "loss": 0.5367, "step": 3138 }, { "epoch": 0.7944824095165781, "grad_norm": 0.22898557782173157, "learning_rate": 1.7469086271878132e-05, "loss": 0.5164, "step": 3139 }, { "epoch": 0.794735509997469, "grad_norm": 0.1457047462463379, "learning_rate": 1.74674940078011e-05, "loss": 0.5, "step": 3140 }, { "epoch": 0.7949886104783599, "grad_norm": 0.1578938215970993, "learning_rate": 1.746590131562848e-05, "loss": 0.5397, "step": 3141 }, { "epoch": 0.7952417109592508, "grad_norm": 0.1459478735923767, "learning_rate": 1.746430819545157e-05, "loss": 0.5432, "step": 3142 }, { "epoch": 0.7954948114401418, "grad_norm": 0.1432630568742752, "learning_rate": 1.7462714647361704e-05, "loss": 0.5264, "step": 3143 }, { "epoch": 0.7957479119210327, "grad_norm": 0.17483381927013397, "learning_rate": 1.746112067145024e-05, "loss": 0.5239, "step": 3144 }, { "epoch": 0.7960010124019236, "grad_norm": 0.15125927329063416, "learning_rate": 1.7459526267808554e-05, "loss": 0.5368, "step": 3145 }, { "epoch": 0.7962541128828144, "grad_norm": 0.16469672322273254, "learning_rate": 1.7457931436528047e-05, "loss": 0.5258, "step": 3146 }, { "epoch": 0.7965072133637054, "grad_norm": 0.15689164400100708, "learning_rate": 1.7456336177700156e-05, "loss": 0.5478, "step": 3147 }, { "epoch": 0.7967603138445963, "grad_norm": 0.1473947912454605, "learning_rate": 1.745474049141633e-05, "loss": 0.5448, "step": 3148 }, { "epoch": 0.7970134143254872, "grad_norm": 0.144256129860878, "learning_rate": 1.745314437776804e-05, "loss": 0.5333, "step": 3149 }, { "epoch": 0.7972665148063781, "grad_norm": 0.14789961278438568, "learning_rate": 1.7451547836846792e-05, "loss": 0.5524, "step": 3150 }, { "epoch": 0.797519615287269, "grad_norm": 0.15695874392986298, "learning_rate": 1.744995086874412e-05, "loss": 0.5175, "step": 3151 }, { "epoch": 0.79777271576816, "grad_norm": 0.1487826704978943, "learning_rate": 1.7448353473551562e-05, "loss": 0.5258, "step": 3152 }, { "epoch": 0.7980258162490509, "grad_norm": 0.16587235033512115, "learning_rate": 1.74467556513607e-05, "loss": 0.5524, "step": 3153 }, { "epoch": 0.7982789167299418, "grad_norm": 0.14169293642044067, "learning_rate": 1.7445157402263136e-05, "loss": 0.4876, "step": 3154 }, { "epoch": 0.7985320172108327, "grad_norm": 0.14788098633289337, "learning_rate": 1.7443558726350487e-05, "loss": 0.5213, "step": 3155 }, { "epoch": 0.7987851176917237, "grad_norm": 0.1869196593761444, "learning_rate": 1.744195962371441e-05, "loss": 0.5223, "step": 3156 }, { "epoch": 0.7990382181726146, "grad_norm": 0.1450669914484024, "learning_rate": 1.744036009444657e-05, "loss": 0.5241, "step": 3157 }, { "epoch": 0.7992913186535054, "grad_norm": 0.14606769382953644, "learning_rate": 1.7438760138638667e-05, "loss": 0.5261, "step": 3158 }, { "epoch": 0.7995444191343963, "grad_norm": 0.15291200578212738, "learning_rate": 1.7437159756382432e-05, "loss": 0.5507, "step": 3159 }, { "epoch": 0.7997975196152872, "grad_norm": 0.14692464470863342, "learning_rate": 1.74355589477696e-05, "loss": 0.5357, "step": 3160 }, { "epoch": 0.8000506200961782, "grad_norm": 0.14842897653579712, "learning_rate": 1.7433957712891946e-05, "loss": 0.5375, "step": 3161 }, { "epoch": 0.8003037205770691, "grad_norm": 0.21580153703689575, "learning_rate": 1.7432356051841265e-05, "loss": 0.5178, "step": 3162 }, { "epoch": 0.80055682105796, "grad_norm": 0.14377190172672272, "learning_rate": 1.743075396470938e-05, "loss": 0.5037, "step": 3163 }, { "epoch": 0.8008099215388509, "grad_norm": 0.14572538435459137, "learning_rate": 1.742915145158813e-05, "loss": 0.5235, "step": 3164 }, { "epoch": 0.8010630220197419, "grad_norm": 0.14468161761760712, "learning_rate": 1.7427548512569384e-05, "loss": 0.571, "step": 3165 }, { "epoch": 0.8013161225006328, "grad_norm": 0.1520310938358307, "learning_rate": 1.742594514774504e-05, "loss": 0.5298, "step": 3166 }, { "epoch": 0.8015692229815237, "grad_norm": 0.1469186395406723, "learning_rate": 1.7424341357207015e-05, "loss": 0.5416, "step": 3167 }, { "epoch": 0.8018223234624146, "grad_norm": 0.15198953449726105, "learning_rate": 1.742273714104725e-05, "loss": 0.5843, "step": 3168 }, { "epoch": 0.8020754239433054, "grad_norm": 0.14566631615161896, "learning_rate": 1.7421132499357706e-05, "loss": 0.5275, "step": 3169 }, { "epoch": 0.8023285244241964, "grad_norm": 0.14933887124061584, "learning_rate": 1.7419527432230378e-05, "loss": 0.5412, "step": 3170 }, { "epoch": 0.8025816249050873, "grad_norm": 0.15247420966625214, "learning_rate": 1.7417921939757284e-05, "loss": 0.5539, "step": 3171 }, { "epoch": 0.8028347253859782, "grad_norm": 0.14642424881458282, "learning_rate": 1.7416316022030458e-05, "loss": 0.5373, "step": 3172 }, { "epoch": 0.8030878258668691, "grad_norm": 0.14506737887859344, "learning_rate": 1.7414709679141966e-05, "loss": 0.5391, "step": 3173 }, { "epoch": 0.8033409263477601, "grad_norm": 0.13656027615070343, "learning_rate": 1.7413102911183895e-05, "loss": 0.529, "step": 3174 }, { "epoch": 0.803594026828651, "grad_norm": 0.15322208404541016, "learning_rate": 1.7411495718248364e-05, "loss": 0.5286, "step": 3175 }, { "epoch": 0.8038471273095419, "grad_norm": 0.24947340786457062, "learning_rate": 1.74098881004275e-05, "loss": 0.4954, "step": 3176 }, { "epoch": 0.8041002277904328, "grad_norm": 0.14178350567817688, "learning_rate": 1.7408280057813474e-05, "loss": 0.5062, "step": 3177 }, { "epoch": 0.8043533282713237, "grad_norm": 0.148432195186615, "learning_rate": 1.7406671590498466e-05, "loss": 0.5114, "step": 3178 }, { "epoch": 0.8046064287522147, "grad_norm": 0.15027928352355957, "learning_rate": 1.7405062698574685e-05, "loss": 0.5337, "step": 3179 }, { "epoch": 0.8048595292331056, "grad_norm": 0.14550426602363586, "learning_rate": 1.740345338213437e-05, "loss": 0.5211, "step": 3180 }, { "epoch": 0.8051126297139964, "grad_norm": 0.14625242352485657, "learning_rate": 1.7401843641269773e-05, "loss": 0.5349, "step": 3181 }, { "epoch": 0.8053657301948873, "grad_norm": 0.14504191279411316, "learning_rate": 1.7400233476073184e-05, "loss": 0.5092, "step": 3182 }, { "epoch": 0.8056188306757783, "grad_norm": 0.14711694419384003, "learning_rate": 1.7398622886636908e-05, "loss": 0.5212, "step": 3183 }, { "epoch": 0.8058719311566692, "grad_norm": 0.1485500931739807, "learning_rate": 1.7397011873053274e-05, "loss": 0.5389, "step": 3184 }, { "epoch": 0.8061250316375601, "grad_norm": 0.15183328092098236, "learning_rate": 1.7395400435414643e-05, "loss": 0.5496, "step": 3185 }, { "epoch": 0.806378132118451, "grad_norm": 0.14625895023345947, "learning_rate": 1.739378857381339e-05, "loss": 0.5241, "step": 3186 }, { "epoch": 0.8066312325993419, "grad_norm": 0.15633103251457214, "learning_rate": 1.7392176288341925e-05, "loss": 0.5152, "step": 3187 }, { "epoch": 0.8068843330802329, "grad_norm": 0.1578294336795807, "learning_rate": 1.739056357909267e-05, "loss": 0.5136, "step": 3188 }, { "epoch": 0.8071374335611238, "grad_norm": 0.16467800736427307, "learning_rate": 1.7388950446158083e-05, "loss": 0.5284, "step": 3189 }, { "epoch": 0.8073905340420147, "grad_norm": 0.14969292283058167, "learning_rate": 1.7387336889630645e-05, "loss": 0.535, "step": 3190 }, { "epoch": 0.8076436345229056, "grad_norm": 0.1427565962076187, "learning_rate": 1.738572290960285e-05, "loss": 0.5158, "step": 3191 }, { "epoch": 0.8078967350037966, "grad_norm": 0.14638501405715942, "learning_rate": 1.7384108506167225e-05, "loss": 0.5222, "step": 3192 }, { "epoch": 0.8081498354846874, "grad_norm": 0.14268317818641663, "learning_rate": 1.7382493679416327e-05, "loss": 0.5272, "step": 3193 }, { "epoch": 0.8084029359655783, "grad_norm": 0.1447330117225647, "learning_rate": 1.7380878429442727e-05, "loss": 0.535, "step": 3194 }, { "epoch": 0.8086560364464692, "grad_norm": 0.1458996683359146, "learning_rate": 1.737926275633902e-05, "loss": 0.5487, "step": 3195 }, { "epoch": 0.8089091369273602, "grad_norm": 0.14721274375915527, "learning_rate": 1.7377646660197832e-05, "loss": 0.5311, "step": 3196 }, { "epoch": 0.8091622374082511, "grad_norm": 0.14739663898944855, "learning_rate": 1.7376030141111816e-05, "loss": 0.5212, "step": 3197 }, { "epoch": 0.809415337889142, "grad_norm": 0.17076340317726135, "learning_rate": 1.7374413199173634e-05, "loss": 0.5319, "step": 3198 }, { "epoch": 0.8096684383700329, "grad_norm": 0.14928781986236572, "learning_rate": 1.737279583447599e-05, "loss": 0.557, "step": 3199 }, { "epoch": 0.8099215388509238, "grad_norm": 0.14964807033538818, "learning_rate": 1.7371178047111594e-05, "loss": 0.5267, "step": 3200 }, { "epoch": 0.8101746393318148, "grad_norm": 0.14851973950862885, "learning_rate": 1.73695598371732e-05, "loss": 0.5376, "step": 3201 }, { "epoch": 0.8104277398127057, "grad_norm": 0.15421175956726074, "learning_rate": 1.7367941204753575e-05, "loss": 0.5065, "step": 3202 }, { "epoch": 0.8106808402935965, "grad_norm": 0.1504693329334259, "learning_rate": 1.7366322149945506e-05, "loss": 0.5075, "step": 3203 }, { "epoch": 0.8109339407744874, "grad_norm": 0.15099111199378967, "learning_rate": 1.7364702672841816e-05, "loss": 0.5157, "step": 3204 }, { "epoch": 0.8111870412553784, "grad_norm": 0.16988548636436462, "learning_rate": 1.7363082773535347e-05, "loss": 0.5494, "step": 3205 }, { "epoch": 0.8114401417362693, "grad_norm": 0.1674482673406601, "learning_rate": 1.736146245211896e-05, "loss": 0.5345, "step": 3206 }, { "epoch": 0.8116932422171602, "grad_norm": 0.1654837727546692, "learning_rate": 1.7359841708685543e-05, "loss": 0.5403, "step": 3207 }, { "epoch": 0.8119463426980511, "grad_norm": 0.14797748625278473, "learning_rate": 1.7358220543328014e-05, "loss": 0.5348, "step": 3208 }, { "epoch": 0.812199443178942, "grad_norm": 0.14437085390090942, "learning_rate": 1.735659895613931e-05, "loss": 0.5212, "step": 3209 }, { "epoch": 0.812452543659833, "grad_norm": 0.14685693383216858, "learning_rate": 1.7354976947212395e-05, "loss": 0.5458, "step": 3210 }, { "epoch": 0.8127056441407239, "grad_norm": 0.15425729751586914, "learning_rate": 1.735335451664025e-05, "loss": 0.5454, "step": 3211 }, { "epoch": 0.8129587446216148, "grad_norm": 0.14977982640266418, "learning_rate": 1.735173166451589e-05, "loss": 0.5311, "step": 3212 }, { "epoch": 0.8132118451025057, "grad_norm": 0.14677029848098755, "learning_rate": 1.735010839093235e-05, "loss": 0.5359, "step": 3213 }, { "epoch": 0.8134649455833967, "grad_norm": 0.14466652274131775, "learning_rate": 1.7348484695982684e-05, "loss": 0.5378, "step": 3214 }, { "epoch": 0.8137180460642875, "grad_norm": 0.14722996950149536, "learning_rate": 1.7346860579759984e-05, "loss": 0.5162, "step": 3215 }, { "epoch": 0.8139711465451784, "grad_norm": 0.14407674968242645, "learning_rate": 1.7345236042357346e-05, "loss": 0.529, "step": 3216 }, { "epoch": 0.8142242470260693, "grad_norm": 0.15334564447402954, "learning_rate": 1.734361108386791e-05, "loss": 0.5351, "step": 3217 }, { "epoch": 0.8144773475069602, "grad_norm": 0.14499294757843018, "learning_rate": 1.7341985704384827e-05, "loss": 0.5332, "step": 3218 }, { "epoch": 0.8147304479878512, "grad_norm": 0.14575597643852234, "learning_rate": 1.734035990400128e-05, "loss": 0.5214, "step": 3219 }, { "epoch": 0.8149835484687421, "grad_norm": 0.14648140966892242, "learning_rate": 1.7338733682810468e-05, "loss": 0.5355, "step": 3220 }, { "epoch": 0.815236648949633, "grad_norm": 0.14309579133987427, "learning_rate": 1.733710704090562e-05, "loss": 0.508, "step": 3221 }, { "epoch": 0.8154897494305239, "grad_norm": 0.14522792398929596, "learning_rate": 1.733547997837999e-05, "loss": 0.5297, "step": 3222 }, { "epoch": 0.8157428499114149, "grad_norm": 0.1431078463792801, "learning_rate": 1.7333852495326852e-05, "loss": 0.507, "step": 3223 }, { "epoch": 0.8159959503923058, "grad_norm": 0.1608077734708786, "learning_rate": 1.733222459183951e-05, "loss": 0.5215, "step": 3224 }, { "epoch": 0.8162490508731967, "grad_norm": 0.15187807381153107, "learning_rate": 1.7330596268011283e-05, "loss": 0.5548, "step": 3225 }, { "epoch": 0.8165021513540875, "grad_norm": 0.14093150198459625, "learning_rate": 1.7328967523935525e-05, "loss": 0.517, "step": 3226 }, { "epoch": 0.8167552518349784, "grad_norm": 0.1472300887107849, "learning_rate": 1.7327338359705606e-05, "loss": 0.5161, "step": 3227 }, { "epoch": 0.8170083523158694, "grad_norm": 0.1484699696302414, "learning_rate": 1.7325708775414917e-05, "loss": 0.5565, "step": 3228 }, { "epoch": 0.8172614527967603, "grad_norm": 0.17139586806297302, "learning_rate": 1.7324078771156887e-05, "loss": 0.5184, "step": 3229 }, { "epoch": 0.8175145532776512, "grad_norm": 0.14693978428840637, "learning_rate": 1.7322448347024957e-05, "loss": 0.5205, "step": 3230 }, { "epoch": 0.8177676537585421, "grad_norm": 0.14895689487457275, "learning_rate": 1.7320817503112595e-05, "loss": 0.5273, "step": 3231 }, { "epoch": 0.8180207542394331, "grad_norm": 0.14781410992145538, "learning_rate": 1.7319186239513294e-05, "loss": 0.5613, "step": 3232 }, { "epoch": 0.818273854720324, "grad_norm": 0.14414310455322266, "learning_rate": 1.7317554556320573e-05, "loss": 0.5353, "step": 3233 }, { "epoch": 0.8185269552012149, "grad_norm": 0.15891197323799133, "learning_rate": 1.731592245362797e-05, "loss": 0.5516, "step": 3234 }, { "epoch": 0.8187800556821058, "grad_norm": 0.14879481494426727, "learning_rate": 1.7314289931529055e-05, "loss": 0.5143, "step": 3235 }, { "epoch": 0.8190331561629967, "grad_norm": 0.15288427472114563, "learning_rate": 1.7312656990117413e-05, "loss": 0.5373, "step": 3236 }, { "epoch": 0.8192862566438877, "grad_norm": 0.1443529576063156, "learning_rate": 1.7311023629486653e-05, "loss": 0.5258, "step": 3237 }, { "epoch": 0.8195393571247785, "grad_norm": 0.1469467282295227, "learning_rate": 1.7309389849730423e-05, "loss": 0.5344, "step": 3238 }, { "epoch": 0.8197924576056694, "grad_norm": 0.14776411652565002, "learning_rate": 1.730775565094237e-05, "loss": 0.5424, "step": 3239 }, { "epoch": 0.8200455580865603, "grad_norm": 0.14843793213367462, "learning_rate": 1.7306121033216198e-05, "loss": 0.5512, "step": 3240 }, { "epoch": 0.8202986585674513, "grad_norm": 0.1491561084985733, "learning_rate": 1.7304485996645595e-05, "loss": 0.5534, "step": 3241 }, { "epoch": 0.8205517590483422, "grad_norm": 0.1519213169813156, "learning_rate": 1.730285054132431e-05, "loss": 0.5349, "step": 3242 }, { "epoch": 0.8208048595292331, "grad_norm": 0.14969325065612793, "learning_rate": 1.7301214667346093e-05, "loss": 0.5252, "step": 3243 }, { "epoch": 0.821057960010124, "grad_norm": 0.14714865386486053, "learning_rate": 1.729957837480473e-05, "loss": 0.521, "step": 3244 }, { "epoch": 0.821311060491015, "grad_norm": 0.14812898635864258, "learning_rate": 1.7297941663794016e-05, "loss": 0.5325, "step": 3245 }, { "epoch": 0.8215641609719059, "grad_norm": 0.17630314826965332, "learning_rate": 1.7296304534407794e-05, "loss": 0.5195, "step": 3246 }, { "epoch": 0.8218172614527968, "grad_norm": 0.14810848236083984, "learning_rate": 1.7294666986739908e-05, "loss": 0.5131, "step": 3247 }, { "epoch": 0.8220703619336877, "grad_norm": 0.1505279690027237, "learning_rate": 1.7293029020884236e-05, "loss": 0.535, "step": 3248 }, { "epoch": 0.8223234624145785, "grad_norm": 0.17762263119220734, "learning_rate": 1.7291390636934683e-05, "loss": 0.5364, "step": 3249 }, { "epoch": 0.8225765628954695, "grad_norm": 0.14034011960029602, "learning_rate": 1.728975183498517e-05, "loss": 0.4963, "step": 3250 }, { "epoch": 0.8228296633763604, "grad_norm": 0.14482980966567993, "learning_rate": 1.7288112615129645e-05, "loss": 0.5497, "step": 3251 }, { "epoch": 0.8230827638572513, "grad_norm": 0.1575547158718109, "learning_rate": 1.7286472977462087e-05, "loss": 0.5212, "step": 3252 }, { "epoch": 0.8233358643381422, "grad_norm": 0.1547786146402359, "learning_rate": 1.7284832922076487e-05, "loss": 0.5203, "step": 3253 }, { "epoch": 0.8235889648190332, "grad_norm": 0.1497178077697754, "learning_rate": 1.728319244906687e-05, "loss": 0.5195, "step": 3254 }, { "epoch": 0.8238420652999241, "grad_norm": 0.15711848437786102, "learning_rate": 1.728155155852728e-05, "loss": 0.5324, "step": 3255 }, { "epoch": 0.824095165780815, "grad_norm": 0.1447044163942337, "learning_rate": 1.7279910250551784e-05, "loss": 0.5522, "step": 3256 }, { "epoch": 0.8243482662617059, "grad_norm": 0.14661699533462524, "learning_rate": 1.7278268525234478e-05, "loss": 0.529, "step": 3257 }, { "epoch": 0.8246013667425968, "grad_norm": 0.1551191210746765, "learning_rate": 1.7276626382669476e-05, "loss": 0.5019, "step": 3258 }, { "epoch": 0.8248544672234878, "grad_norm": 0.14299927651882172, "learning_rate": 1.7274983822950916e-05, "loss": 0.5204, "step": 3259 }, { "epoch": 0.8251075677043787, "grad_norm": 0.14080898463726044, "learning_rate": 1.7273340846172967e-05, "loss": 0.4934, "step": 3260 }, { "epoch": 0.8253606681852695, "grad_norm": 0.14775314927101135, "learning_rate": 1.7271697452429816e-05, "loss": 0.5323, "step": 3261 }, { "epoch": 0.8256137686661604, "grad_norm": 0.15185517072677612, "learning_rate": 1.727005364181568e-05, "loss": 0.5701, "step": 3262 }, { "epoch": 0.8258668691470514, "grad_norm": 0.14407968521118164, "learning_rate": 1.726840941442478e-05, "loss": 0.5227, "step": 3263 }, { "epoch": 0.8261199696279423, "grad_norm": 0.15256400406360626, "learning_rate": 1.7266764770351394e-05, "loss": 0.5361, "step": 3264 }, { "epoch": 0.8263730701088332, "grad_norm": 0.16510039567947388, "learning_rate": 1.7265119709689794e-05, "loss": 0.5447, "step": 3265 }, { "epoch": 0.8266261705897241, "grad_norm": 0.24316293001174927, "learning_rate": 1.7263474232534295e-05, "loss": 0.5298, "step": 3266 }, { "epoch": 0.826879271070615, "grad_norm": 0.15302090346813202, "learning_rate": 1.7261828338979226e-05, "loss": 0.5348, "step": 3267 }, { "epoch": 0.827132371551506, "grad_norm": 0.1483800858259201, "learning_rate": 1.726018202911894e-05, "loss": 0.5187, "step": 3268 }, { "epoch": 0.8273854720323969, "grad_norm": 0.14614009857177734, "learning_rate": 1.7258535303047822e-05, "loss": 0.5164, "step": 3269 }, { "epoch": 0.8276385725132878, "grad_norm": 0.14864173531532288, "learning_rate": 1.7256888160860272e-05, "loss": 0.5293, "step": 3270 }, { "epoch": 0.8278916729941787, "grad_norm": 0.14359018206596375, "learning_rate": 1.7255240602650715e-05, "loss": 0.5088, "step": 3271 }, { "epoch": 0.8281447734750697, "grad_norm": 0.14949128031730652, "learning_rate": 1.7253592628513604e-05, "loss": 0.525, "step": 3272 }, { "epoch": 0.8283978739559605, "grad_norm": 0.14719711244106293, "learning_rate": 1.7251944238543418e-05, "loss": 0.5173, "step": 3273 }, { "epoch": 0.8286509744368514, "grad_norm": 0.16194112598896027, "learning_rate": 1.725029543283465e-05, "loss": 0.5512, "step": 3274 }, { "epoch": 0.8289040749177423, "grad_norm": 0.14672024548053741, "learning_rate": 1.724864621148182e-05, "loss": 0.5567, "step": 3275 }, { "epoch": 0.8291571753986332, "grad_norm": 0.14654190838336945, "learning_rate": 1.7246996574579486e-05, "loss": 0.5086, "step": 3276 }, { "epoch": 0.8294102758795242, "grad_norm": 0.14794012904167175, "learning_rate": 1.7245346522222207e-05, "loss": 0.5252, "step": 3277 }, { "epoch": 0.8296633763604151, "grad_norm": 0.14955581724643707, "learning_rate": 1.7243696054504583e-05, "loss": 0.5419, "step": 3278 }, { "epoch": 0.829916476841306, "grad_norm": 0.15737499296665192, "learning_rate": 1.724204517152123e-05, "loss": 0.5238, "step": 3279 }, { "epoch": 0.8301695773221969, "grad_norm": 0.14875125885009766, "learning_rate": 1.7240393873366784e-05, "loss": 0.5243, "step": 3280 }, { "epoch": 0.8304226778030879, "grad_norm": 0.14579667150974274, "learning_rate": 1.7238742160135922e-05, "loss": 0.5218, "step": 3281 }, { "epoch": 0.8306757782839788, "grad_norm": 0.1460975706577301, "learning_rate": 1.7237090031923324e-05, "loss": 0.5135, "step": 3282 }, { "epoch": 0.8309288787648696, "grad_norm": 0.14335250854492188, "learning_rate": 1.723543748882371e-05, "loss": 0.5198, "step": 3283 }, { "epoch": 0.8311819792457605, "grad_norm": 0.14790262281894684, "learning_rate": 1.723378453093181e-05, "loss": 0.5373, "step": 3284 }, { "epoch": 0.8314350797266514, "grad_norm": 0.16086900234222412, "learning_rate": 1.7232131158342386e-05, "loss": 0.5271, "step": 3285 }, { "epoch": 0.8316881802075424, "grad_norm": 0.15057741105556488, "learning_rate": 1.7230477371150224e-05, "loss": 0.5109, "step": 3286 }, { "epoch": 0.8319412806884333, "grad_norm": 0.15472213923931122, "learning_rate": 1.7228823169450137e-05, "loss": 0.5213, "step": 3287 }, { "epoch": 0.8321943811693242, "grad_norm": 0.15323445200920105, "learning_rate": 1.7227168553336947e-05, "loss": 0.5177, "step": 3288 }, { "epoch": 0.8324474816502151, "grad_norm": 0.16149663925170898, "learning_rate": 1.7225513522905512e-05, "loss": 0.5255, "step": 3289 }, { "epoch": 0.8327005821311061, "grad_norm": 0.14462552964687347, "learning_rate": 1.7223858078250718e-05, "loss": 0.502, "step": 3290 }, { "epoch": 0.832953682611997, "grad_norm": 0.14670826494693756, "learning_rate": 1.7222202219467465e-05, "loss": 0.5492, "step": 3291 }, { "epoch": 0.8332067830928879, "grad_norm": 0.1441624015569687, "learning_rate": 1.7220545946650675e-05, "loss": 0.5432, "step": 3292 }, { "epoch": 0.8334598835737788, "grad_norm": 0.15543736517429352, "learning_rate": 1.7218889259895308e-05, "loss": 0.5281, "step": 3293 }, { "epoch": 0.8337129840546698, "grad_norm": 0.1510584056377411, "learning_rate": 1.721723215929633e-05, "loss": 0.5099, "step": 3294 }, { "epoch": 0.8339660845355606, "grad_norm": 0.14710479974746704, "learning_rate": 1.7215574644948742e-05, "loss": 0.4992, "step": 3295 }, { "epoch": 0.8342191850164515, "grad_norm": 0.1451963633298874, "learning_rate": 1.721391671694756e-05, "loss": 0.5309, "step": 3296 }, { "epoch": 0.8344722854973424, "grad_norm": 0.1436157077550888, "learning_rate": 1.721225837538784e-05, "loss": 0.5383, "step": 3297 }, { "epoch": 0.8347253859782333, "grad_norm": 0.14644017815589905, "learning_rate": 1.721059962036465e-05, "loss": 0.5557, "step": 3298 }, { "epoch": 0.8349784864591243, "grad_norm": 0.15466825664043427, "learning_rate": 1.7208940451973074e-05, "loss": 0.5359, "step": 3299 }, { "epoch": 0.8352315869400152, "grad_norm": 0.14470620453357697, "learning_rate": 1.7207280870308233e-05, "loss": 0.5146, "step": 3300 }, { "epoch": 0.8354846874209061, "grad_norm": 0.15013274550437927, "learning_rate": 1.720562087546527e-05, "loss": 0.5022, "step": 3301 }, { "epoch": 0.835737787901797, "grad_norm": 0.1453494131565094, "learning_rate": 1.7203960467539348e-05, "loss": 0.5232, "step": 3302 }, { "epoch": 0.835990888382688, "grad_norm": 0.15376313030719757, "learning_rate": 1.7202299646625653e-05, "loss": 0.5572, "step": 3303 }, { "epoch": 0.8362439888635789, "grad_norm": 0.15615098178386688, "learning_rate": 1.7200638412819396e-05, "loss": 0.5075, "step": 3304 }, { "epoch": 0.8364970893444698, "grad_norm": 0.14668036997318268, "learning_rate": 1.7198976766215813e-05, "loss": 0.5481, "step": 3305 }, { "epoch": 0.8367501898253606, "grad_norm": 0.14969007670879364, "learning_rate": 1.7197314706910164e-05, "loss": 0.5528, "step": 3306 }, { "epoch": 0.8370032903062515, "grad_norm": 0.18027009069919586, "learning_rate": 1.719565223499773e-05, "loss": 0.502, "step": 3307 }, { "epoch": 0.8372563907871425, "grad_norm": 0.1496511697769165, "learning_rate": 1.719398935057382e-05, "loss": 0.5427, "step": 3308 }, { "epoch": 0.8375094912680334, "grad_norm": 0.1542242020368576, "learning_rate": 1.7192326053733757e-05, "loss": 0.5389, "step": 3309 }, { "epoch": 0.8377625917489243, "grad_norm": 0.14819049835205078, "learning_rate": 1.71906623445729e-05, "loss": 0.5316, "step": 3310 }, { "epoch": 0.8380156922298152, "grad_norm": 0.1437612771987915, "learning_rate": 1.718899822318662e-05, "loss": 0.5434, "step": 3311 }, { "epoch": 0.8382687927107062, "grad_norm": 0.18938961625099182, "learning_rate": 1.7187333689670324e-05, "loss": 0.5158, "step": 3312 }, { "epoch": 0.8385218931915971, "grad_norm": 0.14875753223896027, "learning_rate": 1.7185668744119433e-05, "loss": 0.5239, "step": 3313 }, { "epoch": 0.838774993672488, "grad_norm": 0.17988501489162445, "learning_rate": 1.7184003386629397e-05, "loss": 0.5066, "step": 3314 }, { "epoch": 0.8390280941533789, "grad_norm": 0.14523835480213165, "learning_rate": 1.7182337617295685e-05, "loss": 0.5136, "step": 3315 }, { "epoch": 0.8392811946342698, "grad_norm": 0.16122743487358093, "learning_rate": 1.7180671436213793e-05, "loss": 0.517, "step": 3316 }, { "epoch": 0.8395342951151608, "grad_norm": 0.15069933235645294, "learning_rate": 1.717900484347924e-05, "loss": 0.5515, "step": 3317 }, { "epoch": 0.8397873955960516, "grad_norm": 0.14569856226444244, "learning_rate": 1.7177337839187566e-05, "loss": 0.5173, "step": 3318 }, { "epoch": 0.8400404960769425, "grad_norm": 0.1768755167722702, "learning_rate": 1.7175670423434342e-05, "loss": 0.5326, "step": 3319 }, { "epoch": 0.8402935965578334, "grad_norm": 0.14865732192993164, "learning_rate": 1.7174002596315153e-05, "loss": 0.544, "step": 3320 }, { "epoch": 0.8405466970387244, "grad_norm": 0.1527811735868454, "learning_rate": 1.717233435792561e-05, "loss": 0.547, "step": 3321 }, { "epoch": 0.8407997975196153, "grad_norm": 0.14593443274497986, "learning_rate": 1.7170665708361357e-05, "loss": 0.5048, "step": 3322 }, { "epoch": 0.8410528980005062, "grad_norm": 0.1512899398803711, "learning_rate": 1.7168996647718045e-05, "loss": 0.5298, "step": 3323 }, { "epoch": 0.8413059984813971, "grad_norm": 0.14689087867736816, "learning_rate": 1.7167327176091365e-05, "loss": 0.5352, "step": 3324 }, { "epoch": 0.841559098962288, "grad_norm": 0.1569533348083496, "learning_rate": 1.716565729357702e-05, "loss": 0.5072, "step": 3325 }, { "epoch": 0.841812199443179, "grad_norm": 0.14808864891529083, "learning_rate": 1.7163987000270747e-05, "loss": 0.5568, "step": 3326 }, { "epoch": 0.8420652999240699, "grad_norm": 0.15098944306373596, "learning_rate": 1.716231629626829e-05, "loss": 0.5224, "step": 3327 }, { "epoch": 0.8423184004049608, "grad_norm": 0.14621658623218536, "learning_rate": 1.7160645181665432e-05, "loss": 0.5386, "step": 3328 }, { "epoch": 0.8425715008858516, "grad_norm": 0.1677350252866745, "learning_rate": 1.715897365655798e-05, "loss": 0.5322, "step": 3329 }, { "epoch": 0.8428246013667426, "grad_norm": 0.26365259289741516, "learning_rate": 1.7157301721041752e-05, "loss": 0.5288, "step": 3330 }, { "epoch": 0.8430777018476335, "grad_norm": 0.14822180569171906, "learning_rate": 1.7155629375212602e-05, "loss": 0.5216, "step": 3331 }, { "epoch": 0.8433308023285244, "grad_norm": 0.14585661888122559, "learning_rate": 1.7153956619166395e-05, "loss": 0.5387, "step": 3332 }, { "epoch": 0.8435839028094153, "grad_norm": 0.1485254466533661, "learning_rate": 1.7152283452999033e-05, "loss": 0.5345, "step": 3333 }, { "epoch": 0.8438370032903062, "grad_norm": 0.1552872657775879, "learning_rate": 1.715060987680643e-05, "loss": 0.5132, "step": 3334 }, { "epoch": 0.8440901037711972, "grad_norm": 0.14705923199653625, "learning_rate": 1.714893589068453e-05, "loss": 0.5186, "step": 3335 }, { "epoch": 0.8443432042520881, "grad_norm": 0.1518663763999939, "learning_rate": 1.7147261494729303e-05, "loss": 0.5358, "step": 3336 }, { "epoch": 0.844596304732979, "grad_norm": 0.1611744910478592, "learning_rate": 1.7145586689036734e-05, "loss": 0.5435, "step": 3337 }, { "epoch": 0.8448494052138699, "grad_norm": 0.14860674738883972, "learning_rate": 1.7143911473702837e-05, "loss": 0.5276, "step": 3338 }, { "epoch": 0.8451025056947609, "grad_norm": 0.1732797622680664, "learning_rate": 1.714223584882365e-05, "loss": 0.5404, "step": 3339 }, { "epoch": 0.8453556061756518, "grad_norm": 0.1530172973871231, "learning_rate": 1.714055981449523e-05, "loss": 0.5241, "step": 3340 }, { "epoch": 0.8456087066565426, "grad_norm": 0.1485951542854309, "learning_rate": 1.7138883370813667e-05, "loss": 0.5515, "step": 3341 }, { "epoch": 0.8458618071374335, "grad_norm": 0.15209129452705383, "learning_rate": 1.7137206517875062e-05, "loss": 0.5398, "step": 3342 }, { "epoch": 0.8461149076183245, "grad_norm": 0.15377309918403625, "learning_rate": 1.7135529255775546e-05, "loss": 0.5234, "step": 3343 }, { "epoch": 0.8463680080992154, "grad_norm": 0.14883680641651154, "learning_rate": 1.7133851584611276e-05, "loss": 0.514, "step": 3344 }, { "epoch": 0.8466211085801063, "grad_norm": 0.16214464604854584, "learning_rate": 1.7132173504478425e-05, "loss": 0.5718, "step": 3345 }, { "epoch": 0.8468742090609972, "grad_norm": 0.15174496173858643, "learning_rate": 1.7130495015473196e-05, "loss": 0.523, "step": 3346 }, { "epoch": 0.8471273095418881, "grad_norm": 0.14569640159606934, "learning_rate": 1.7128816117691814e-05, "loss": 0.5099, "step": 3347 }, { "epoch": 0.8473804100227791, "grad_norm": 0.15412220358848572, "learning_rate": 1.7127136811230527e-05, "loss": 0.5391, "step": 3348 }, { "epoch": 0.84763351050367, "grad_norm": 0.14210274815559387, "learning_rate": 1.7125457096185605e-05, "loss": 0.5132, "step": 3349 }, { "epoch": 0.8478866109845609, "grad_norm": 0.14505943655967712, "learning_rate": 1.7123776972653342e-05, "loss": 0.508, "step": 3350 }, { "epoch": 0.8481397114654518, "grad_norm": 0.15040002763271332, "learning_rate": 1.7122096440730052e-05, "loss": 0.5218, "step": 3351 }, { "epoch": 0.8483928119463428, "grad_norm": 0.14346162974834442, "learning_rate": 1.7120415500512088e-05, "loss": 0.4979, "step": 3352 }, { "epoch": 0.8486459124272336, "grad_norm": 0.17275333404541016, "learning_rate": 1.7118734152095803e-05, "loss": 0.551, "step": 3353 }, { "epoch": 0.8488990129081245, "grad_norm": 0.14918582141399384, "learning_rate": 1.711705239557759e-05, "loss": 0.5649, "step": 3354 }, { "epoch": 0.8491521133890154, "grad_norm": 0.14397351443767548, "learning_rate": 1.7115370231053864e-05, "loss": 0.511, "step": 3355 }, { "epoch": 0.8494052138699063, "grad_norm": 0.14942991733551025, "learning_rate": 1.7113687658621052e-05, "loss": 0.527, "step": 3356 }, { "epoch": 0.8496583143507973, "grad_norm": 0.1508914977312088, "learning_rate": 1.711200467837562e-05, "loss": 0.5517, "step": 3357 }, { "epoch": 0.8499114148316882, "grad_norm": 0.15089473128318787, "learning_rate": 1.7110321290414044e-05, "loss": 0.528, "step": 3358 }, { "epoch": 0.8501645153125791, "grad_norm": 0.3767695128917694, "learning_rate": 1.7108637494832828e-05, "loss": 0.5304, "step": 3359 }, { "epoch": 0.85041761579347, "grad_norm": 0.1531301587820053, "learning_rate": 1.7106953291728507e-05, "loss": 0.4992, "step": 3360 }, { "epoch": 0.850670716274361, "grad_norm": 0.1518569141626358, "learning_rate": 1.7105268681197628e-05, "loss": 0.5077, "step": 3361 }, { "epoch": 0.8509238167552519, "grad_norm": 0.1432340443134308, "learning_rate": 1.7103583663336766e-05, "loss": 0.5153, "step": 3362 }, { "epoch": 0.8511769172361427, "grad_norm": 0.1390077918767929, "learning_rate": 1.7101898238242525e-05, "loss": 0.507, "step": 3363 }, { "epoch": 0.8514300177170336, "grad_norm": 0.16909977793693542, "learning_rate": 1.7100212406011524e-05, "loss": 0.5703, "step": 3364 }, { "epoch": 0.8516831181979245, "grad_norm": 0.15069939196109772, "learning_rate": 1.70985261667404e-05, "loss": 0.5155, "step": 3365 }, { "epoch": 0.8519362186788155, "grad_norm": 0.14972269535064697, "learning_rate": 1.7096839520525838e-05, "loss": 0.5285, "step": 3366 }, { "epoch": 0.8521893191597064, "grad_norm": 0.1468164324760437, "learning_rate": 1.709515246746451e-05, "loss": 0.5289, "step": 3367 }, { "epoch": 0.8524424196405973, "grad_norm": 0.14484484493732452, "learning_rate": 1.709346500765315e-05, "loss": 0.5252, "step": 3368 }, { "epoch": 0.8526955201214882, "grad_norm": 0.14705991744995117, "learning_rate": 1.7091777141188485e-05, "loss": 0.5301, "step": 3369 }, { "epoch": 0.8529486206023792, "grad_norm": 0.15270598232746124, "learning_rate": 1.7090088868167278e-05, "loss": 0.551, "step": 3370 }, { "epoch": 0.8532017210832701, "grad_norm": 0.14679788053035736, "learning_rate": 1.7088400188686317e-05, "loss": 0.569, "step": 3371 }, { "epoch": 0.853454821564161, "grad_norm": 0.15219935774803162, "learning_rate": 1.708671110284241e-05, "loss": 0.5224, "step": 3372 }, { "epoch": 0.8537079220450519, "grad_norm": 0.1575467735528946, "learning_rate": 1.708502161073239e-05, "loss": 0.5049, "step": 3373 }, { "epoch": 0.8539610225259427, "grad_norm": 0.14440415799617767, "learning_rate": 1.7083331712453108e-05, "loss": 0.5304, "step": 3374 }, { "epoch": 0.8542141230068337, "grad_norm": 0.14947082102298737, "learning_rate": 1.7081641408101444e-05, "loss": 0.5362, "step": 3375 }, { "epoch": 0.8544672234877246, "grad_norm": 0.14875511825084686, "learning_rate": 1.7079950697774303e-05, "loss": 0.5192, "step": 3376 }, { "epoch": 0.8547203239686155, "grad_norm": 0.14965280890464783, "learning_rate": 1.7078259581568603e-05, "loss": 0.5293, "step": 3377 }, { "epoch": 0.8549734244495064, "grad_norm": 0.15363718569278717, "learning_rate": 1.7076568059581298e-05, "loss": 0.5452, "step": 3378 }, { "epoch": 0.8552265249303974, "grad_norm": 0.15703356266021729, "learning_rate": 1.7074876131909355e-05, "loss": 0.5243, "step": 3379 }, { "epoch": 0.8554796254112883, "grad_norm": 0.14936314523220062, "learning_rate": 1.7073183798649773e-05, "loss": 0.5293, "step": 3380 }, { "epoch": 0.8557327258921792, "grad_norm": 0.16604667901992798, "learning_rate": 1.7071491059899567e-05, "loss": 0.5686, "step": 3381 }, { "epoch": 0.8559858263730701, "grad_norm": 0.14733180403709412, "learning_rate": 1.706979791575578e-05, "loss": 0.5176, "step": 3382 }, { "epoch": 0.856238926853961, "grad_norm": 0.14883117377758026, "learning_rate": 1.7068104366315476e-05, "loss": 0.5362, "step": 3383 }, { "epoch": 0.856492027334852, "grad_norm": 0.17202907800674438, "learning_rate": 1.706641041167574e-05, "loss": 0.539, "step": 3384 }, { "epoch": 0.8567451278157429, "grad_norm": 0.15562129020690918, "learning_rate": 1.7064716051933684e-05, "loss": 0.5317, "step": 3385 }, { "epoch": 0.8569982282966337, "grad_norm": 0.15075981616973877, "learning_rate": 1.7063021287186443e-05, "loss": 0.5421, "step": 3386 }, { "epoch": 0.8572513287775246, "grad_norm": 0.15465915203094482, "learning_rate": 1.7061326117531175e-05, "loss": 0.531, "step": 3387 }, { "epoch": 0.8575044292584156, "grad_norm": 0.14956246316432953, "learning_rate": 1.7059630543065058e-05, "loss": 0.5317, "step": 3388 }, { "epoch": 0.8577575297393065, "grad_norm": 0.15158401429653168, "learning_rate": 1.7057934563885298e-05, "loss": 0.5141, "step": 3389 }, { "epoch": 0.8580106302201974, "grad_norm": 0.14342811703681946, "learning_rate": 1.7056238180089123e-05, "loss": 0.5054, "step": 3390 }, { "epoch": 0.8582637307010883, "grad_norm": 0.1439913660287857, "learning_rate": 1.7054541391773778e-05, "loss": 0.5372, "step": 3391 }, { "epoch": 0.8585168311819793, "grad_norm": 0.14679889380931854, "learning_rate": 1.705284419903654e-05, "loss": 0.5523, "step": 3392 }, { "epoch": 0.8587699316628702, "grad_norm": 0.1553446352481842, "learning_rate": 1.7051146601974707e-05, "loss": 0.498, "step": 3393 }, { "epoch": 0.8590230321437611, "grad_norm": 0.14746935665607452, "learning_rate": 1.7049448600685593e-05, "loss": 0.5152, "step": 3394 }, { "epoch": 0.859276132624652, "grad_norm": 0.15107740461826324, "learning_rate": 1.704775019526655e-05, "loss": 0.533, "step": 3395 }, { "epoch": 0.8595292331055429, "grad_norm": 0.1483919769525528, "learning_rate": 1.7046051385814934e-05, "loss": 0.5497, "step": 3396 }, { "epoch": 0.8597823335864339, "grad_norm": 0.14124512672424316, "learning_rate": 1.7044352172428137e-05, "loss": 0.5559, "step": 3397 }, { "epoch": 0.8600354340673247, "grad_norm": 0.14931374788284302, "learning_rate": 1.704265255520358e-05, "loss": 0.5402, "step": 3398 }, { "epoch": 0.8602885345482156, "grad_norm": 0.14312680065631866, "learning_rate": 1.7040952534238683e-05, "loss": 0.5381, "step": 3399 }, { "epoch": 0.8605416350291065, "grad_norm": 0.15884354710578918, "learning_rate": 1.7039252109630915e-05, "loss": 0.5195, "step": 3400 }, { "epoch": 0.8607947355099975, "grad_norm": 0.15197265148162842, "learning_rate": 1.7037551281477756e-05, "loss": 0.4994, "step": 3401 }, { "epoch": 0.8610478359908884, "grad_norm": 0.1559198796749115, "learning_rate": 1.7035850049876712e-05, "loss": 0.512, "step": 3402 }, { "epoch": 0.8613009364717793, "grad_norm": 0.1487860530614853, "learning_rate": 1.7034148414925308e-05, "loss": 0.5242, "step": 3403 }, { "epoch": 0.8615540369526702, "grad_norm": 0.14872916042804718, "learning_rate": 1.7032446376721097e-05, "loss": 0.5138, "step": 3404 }, { "epoch": 0.8618071374335611, "grad_norm": 0.16058118641376495, "learning_rate": 1.7030743935361652e-05, "loss": 0.5241, "step": 3405 }, { "epoch": 0.8620602379144521, "grad_norm": 0.15902970731258392, "learning_rate": 1.702904109094457e-05, "loss": 0.5281, "step": 3406 }, { "epoch": 0.862313338395343, "grad_norm": 0.15066573023796082, "learning_rate": 1.7027337843567478e-05, "loss": 0.5462, "step": 3407 }, { "epoch": 0.8625664388762339, "grad_norm": 0.15071547031402588, "learning_rate": 1.7025634193328008e-05, "loss": 0.5336, "step": 3408 }, { "epoch": 0.8628195393571247, "grad_norm": 0.14600256085395813, "learning_rate": 1.7023930140323835e-05, "loss": 0.5374, "step": 3409 }, { "epoch": 0.8630726398380157, "grad_norm": 0.1508922576904297, "learning_rate": 1.7022225684652648e-05, "loss": 0.5232, "step": 3410 }, { "epoch": 0.8633257403189066, "grad_norm": 0.1466078758239746, "learning_rate": 1.7020520826412156e-05, "loss": 0.5264, "step": 3411 }, { "epoch": 0.8635788407997975, "grad_norm": 0.1576623171567917, "learning_rate": 1.70188155657001e-05, "loss": 0.5064, "step": 3412 }, { "epoch": 0.8638319412806884, "grad_norm": 1.3963921070098877, "learning_rate": 1.7017109902614234e-05, "loss": 0.5552, "step": 3413 }, { "epoch": 0.8640850417615793, "grad_norm": 0.15045902132987976, "learning_rate": 1.701540383725234e-05, "loss": 0.5103, "step": 3414 }, { "epoch": 0.8643381422424703, "grad_norm": 0.147593155503273, "learning_rate": 1.701369736971223e-05, "loss": 0.5332, "step": 3415 }, { "epoch": 0.8645912427233612, "grad_norm": 0.14418303966522217, "learning_rate": 1.7011990500091723e-05, "loss": 0.5234, "step": 3416 }, { "epoch": 0.8648443432042521, "grad_norm": 0.14922396838665009, "learning_rate": 1.701028322848868e-05, "loss": 0.5325, "step": 3417 }, { "epoch": 0.865097443685143, "grad_norm": 0.1431153565645218, "learning_rate": 1.7008575555000968e-05, "loss": 0.5187, "step": 3418 }, { "epoch": 0.865350544166034, "grad_norm": 0.14386627078056335, "learning_rate": 1.7006867479726486e-05, "loss": 0.5331, "step": 3419 }, { "epoch": 0.8656036446469249, "grad_norm": 0.14942172169685364, "learning_rate": 1.7005159002763153e-05, "loss": 0.5147, "step": 3420 }, { "epoch": 0.8658567451278157, "grad_norm": 0.14960147440433502, "learning_rate": 1.7003450124208915e-05, "loss": 0.5402, "step": 3421 }, { "epoch": 0.8661098456087066, "grad_norm": 0.15860414505004883, "learning_rate": 1.700174084416174e-05, "loss": 0.4954, "step": 3422 }, { "epoch": 0.8663629460895975, "grad_norm": 0.15297777950763702, "learning_rate": 1.700003116271961e-05, "loss": 0.5221, "step": 3423 }, { "epoch": 0.8666160465704885, "grad_norm": 0.1475251168012619, "learning_rate": 1.6998321079980548e-05, "loss": 0.5191, "step": 3424 }, { "epoch": 0.8668691470513794, "grad_norm": 0.14576828479766846, "learning_rate": 1.699661059604258e-05, "loss": 0.5193, "step": 3425 }, { "epoch": 0.8671222475322703, "grad_norm": 0.1448698192834854, "learning_rate": 1.699489971100377e-05, "loss": 0.5347, "step": 3426 }, { "epoch": 0.8673753480131612, "grad_norm": 0.14953352510929108, "learning_rate": 1.6993188424962195e-05, "loss": 0.5475, "step": 3427 }, { "epoch": 0.8676284484940522, "grad_norm": 0.1521630883216858, "learning_rate": 1.699147673801596e-05, "loss": 0.5027, "step": 3428 }, { "epoch": 0.8678815489749431, "grad_norm": 0.14906881749629974, "learning_rate": 1.69897646502632e-05, "loss": 0.5631, "step": 3429 }, { "epoch": 0.868134649455834, "grad_norm": 0.1829783320426941, "learning_rate": 1.6988052161802056e-05, "loss": 0.5442, "step": 3430 }, { "epoch": 0.8683877499367249, "grad_norm": 0.14839577674865723, "learning_rate": 1.6986339272730704e-05, "loss": 0.5088, "step": 3431 }, { "epoch": 0.8686408504176157, "grad_norm": 0.1514139473438263, "learning_rate": 1.6984625983147346e-05, "loss": 0.5364, "step": 3432 }, { "epoch": 0.8688939508985067, "grad_norm": 0.1505652517080307, "learning_rate": 1.698291229315019e-05, "loss": 0.5424, "step": 3433 }, { "epoch": 0.8691470513793976, "grad_norm": 0.150529146194458, "learning_rate": 1.698119820283749e-05, "loss": 0.5398, "step": 3434 }, { "epoch": 0.8694001518602885, "grad_norm": 0.15734519064426422, "learning_rate": 1.6979483712307504e-05, "loss": 0.5216, "step": 3435 }, { "epoch": 0.8696532523411794, "grad_norm": 0.15409469604492188, "learning_rate": 1.697776882165852e-05, "loss": 0.5326, "step": 3436 }, { "epoch": 0.8699063528220704, "grad_norm": 0.14382171630859375, "learning_rate": 1.6976053530988857e-05, "loss": 0.5117, "step": 3437 }, { "epoch": 0.8701594533029613, "grad_norm": 0.15217280387878418, "learning_rate": 1.6974337840396836e-05, "loss": 0.5507, "step": 3438 }, { "epoch": 0.8704125537838522, "grad_norm": 0.14626486599445343, "learning_rate": 1.6972621749980822e-05, "loss": 0.522, "step": 3439 }, { "epoch": 0.8706656542647431, "grad_norm": 0.14933772385120392, "learning_rate": 1.69709052598392e-05, "loss": 0.5086, "step": 3440 }, { "epoch": 0.8709187547456341, "grad_norm": 0.1597888320684433, "learning_rate": 1.696918837007036e-05, "loss": 0.5405, "step": 3441 }, { "epoch": 0.871171855226525, "grad_norm": 0.18638035655021667, "learning_rate": 1.6967471080772734e-05, "loss": 0.5286, "step": 3442 }, { "epoch": 0.8714249557074158, "grad_norm": 0.14873693883419037, "learning_rate": 1.6965753392044772e-05, "loss": 0.5144, "step": 3443 }, { "epoch": 0.8716780561883067, "grad_norm": 0.15111809968948364, "learning_rate": 1.6964035303984944e-05, "loss": 0.5011, "step": 3444 }, { "epoch": 0.8719311566691976, "grad_norm": 0.14639481902122498, "learning_rate": 1.6962316816691745e-05, "loss": 0.5235, "step": 3445 }, { "epoch": 0.8721842571500886, "grad_norm": 0.15030495822429657, "learning_rate": 1.6960597930263692e-05, "loss": 0.5344, "step": 3446 }, { "epoch": 0.8724373576309795, "grad_norm": 0.1590227335691452, "learning_rate": 1.6958878644799326e-05, "loss": 0.5219, "step": 3447 }, { "epoch": 0.8726904581118704, "grad_norm": 0.15427719056606293, "learning_rate": 1.6957158960397207e-05, "loss": 0.5118, "step": 3448 }, { "epoch": 0.8729435585927613, "grad_norm": 0.14708839356899261, "learning_rate": 1.6955438877155923e-05, "loss": 0.5465, "step": 3449 }, { "epoch": 0.8731966590736523, "grad_norm": 0.15478238463401794, "learning_rate": 1.6953718395174083e-05, "loss": 0.5647, "step": 3450 }, { "epoch": 0.8734497595545432, "grad_norm": 0.14843030273914337, "learning_rate": 1.6951997514550318e-05, "loss": 0.498, "step": 3451 }, { "epoch": 0.8737028600354341, "grad_norm": 0.14159445464611053, "learning_rate": 1.6950276235383277e-05, "loss": 0.5286, "step": 3452 }, { "epoch": 0.873955960516325, "grad_norm": 0.1486474871635437, "learning_rate": 1.694855455777165e-05, "loss": 0.5433, "step": 3453 }, { "epoch": 0.8742090609972158, "grad_norm": 0.14780612289905548, "learning_rate": 1.694683248181413e-05, "loss": 0.5018, "step": 3454 }, { "epoch": 0.8744621614781068, "grad_norm": 0.1476944535970688, "learning_rate": 1.6945110007609434e-05, "loss": 0.5325, "step": 3455 }, { "epoch": 0.8747152619589977, "grad_norm": 0.14772316813468933, "learning_rate": 1.6943387135256314e-05, "loss": 0.5074, "step": 3456 }, { "epoch": 0.8749683624398886, "grad_norm": 0.14985302090644836, "learning_rate": 1.694166386485354e-05, "loss": 0.5572, "step": 3457 }, { "epoch": 0.8752214629207795, "grad_norm": 0.14288417994976044, "learning_rate": 1.6939940196499904e-05, "loss": 0.525, "step": 3458 }, { "epoch": 0.8754745634016705, "grad_norm": 0.15138815343379974, "learning_rate": 1.6938216130294217e-05, "loss": 0.5268, "step": 3459 }, { "epoch": 0.8757276638825614, "grad_norm": 0.1470714509487152, "learning_rate": 1.6936491666335315e-05, "loss": 0.5145, "step": 3460 }, { "epoch": 0.8759807643634523, "grad_norm": 0.166818305850029, "learning_rate": 1.6934766804722062e-05, "loss": 0.5407, "step": 3461 }, { "epoch": 0.8762338648443432, "grad_norm": 0.14688996970653534, "learning_rate": 1.6933041545553336e-05, "loss": 0.5394, "step": 3462 }, { "epoch": 0.8764869653252341, "grad_norm": 0.15566755831241608, "learning_rate": 1.6931315888928047e-05, "loss": 0.5442, "step": 3463 }, { "epoch": 0.8767400658061251, "grad_norm": 0.16220787167549133, "learning_rate": 1.6929589834945118e-05, "loss": 0.5196, "step": 3464 }, { "epoch": 0.876993166287016, "grad_norm": 0.15492060780525208, "learning_rate": 1.6927863383703506e-05, "loss": 0.5181, "step": 3465 }, { "epoch": 0.8772462667679068, "grad_norm": 0.15213742852210999, "learning_rate": 1.692613653530218e-05, "loss": 0.519, "step": 3466 }, { "epoch": 0.8774993672487977, "grad_norm": 0.16015225648880005, "learning_rate": 1.6924409289840137e-05, "loss": 0.515, "step": 3467 }, { "epoch": 0.8777524677296887, "grad_norm": 0.15063226222991943, "learning_rate": 1.6922681647416404e-05, "loss": 0.5274, "step": 3468 }, { "epoch": 0.8780055682105796, "grad_norm": 0.14600348472595215, "learning_rate": 1.692095360813001e-05, "loss": 0.5058, "step": 3469 }, { "epoch": 0.8782586686914705, "grad_norm": 0.144175186753273, "learning_rate": 1.6919225172080033e-05, "loss": 0.4963, "step": 3470 }, { "epoch": 0.8785117691723614, "grad_norm": 0.15371057391166687, "learning_rate": 1.6917496339365547e-05, "loss": 0.5396, "step": 3471 }, { "epoch": 0.8787648696532523, "grad_norm": 0.1446683406829834, "learning_rate": 1.6915767110085675e-05, "loss": 0.5455, "step": 3472 }, { "epoch": 0.8790179701341433, "grad_norm": 0.14825096726417542, "learning_rate": 1.6914037484339544e-05, "loss": 0.5222, "step": 3473 }, { "epoch": 0.8792710706150342, "grad_norm": 0.1521778106689453, "learning_rate": 1.6912307462226306e-05, "loss": 0.5532, "step": 3474 }, { "epoch": 0.8795241710959251, "grad_norm": 0.14979393780231476, "learning_rate": 1.691057704384515e-05, "loss": 0.5484, "step": 3475 }, { "epoch": 0.879777271576816, "grad_norm": 0.15452872216701508, "learning_rate": 1.6908846229295267e-05, "loss": 0.516, "step": 3476 }, { "epoch": 0.880030372057707, "grad_norm": 0.15019750595092773, "learning_rate": 1.6907115018675884e-05, "loss": 0.5342, "step": 3477 }, { "epoch": 0.8802834725385978, "grad_norm": 0.15492092072963715, "learning_rate": 1.690538341208625e-05, "loss": 0.5472, "step": 3478 }, { "epoch": 0.8805365730194887, "grad_norm": 0.22255095839500427, "learning_rate": 1.690365140962564e-05, "loss": 0.5288, "step": 3479 }, { "epoch": 0.8807896735003796, "grad_norm": 0.15164735913276672, "learning_rate": 1.6901919011393332e-05, "loss": 0.5284, "step": 3480 }, { "epoch": 0.8810427739812705, "grad_norm": 0.1517164558172226, "learning_rate": 1.6900186217488648e-05, "loss": 0.5831, "step": 3481 }, { "epoch": 0.8812958744621615, "grad_norm": 0.14784644544124603, "learning_rate": 1.6898453028010925e-05, "loss": 0.5238, "step": 3482 }, { "epoch": 0.8815489749430524, "grad_norm": 0.17462372779846191, "learning_rate": 1.6896719443059525e-05, "loss": 0.4889, "step": 3483 }, { "epoch": 0.8818020754239433, "grad_norm": 0.14936992526054382, "learning_rate": 1.6894985462733827e-05, "loss": 0.5319, "step": 3484 }, { "epoch": 0.8820551759048342, "grad_norm": 0.15361931920051575, "learning_rate": 1.689325108713324e-05, "loss": 0.517, "step": 3485 }, { "epoch": 0.8823082763857252, "grad_norm": 0.1460724174976349, "learning_rate": 1.689151631635719e-05, "loss": 0.5471, "step": 3486 }, { "epoch": 0.8825613768666161, "grad_norm": 0.14905647933483124, "learning_rate": 1.6889781150505127e-05, "loss": 0.5226, "step": 3487 }, { "epoch": 0.882814477347507, "grad_norm": 0.14525733888149261, "learning_rate": 1.6888045589676526e-05, "loss": 0.5362, "step": 3488 }, { "epoch": 0.8830675778283978, "grad_norm": 0.14851044118404388, "learning_rate": 1.6886309633970882e-05, "loss": 0.5343, "step": 3489 }, { "epoch": 0.8833206783092888, "grad_norm": 0.15067359805107117, "learning_rate": 1.6884573283487718e-05, "loss": 0.5401, "step": 3490 }, { "epoch": 0.8835737787901797, "grad_norm": 0.1583697348833084, "learning_rate": 1.6882836538326567e-05, "loss": 0.5097, "step": 3491 }, { "epoch": 0.8838268792710706, "grad_norm": 0.1492602527141571, "learning_rate": 1.6881099398586997e-05, "loss": 0.5606, "step": 3492 }, { "epoch": 0.8840799797519615, "grad_norm": 0.14807946979999542, "learning_rate": 1.68793618643686e-05, "loss": 0.5129, "step": 3493 }, { "epoch": 0.8843330802328524, "grad_norm": 0.15059691667556763, "learning_rate": 1.6877623935770977e-05, "loss": 0.5209, "step": 3494 }, { "epoch": 0.8845861807137434, "grad_norm": 0.14356523752212524, "learning_rate": 1.6875885612893763e-05, "loss": 0.5011, "step": 3495 }, { "epoch": 0.8848392811946343, "grad_norm": 0.14546121656894684, "learning_rate": 1.6874146895836615e-05, "loss": 0.5161, "step": 3496 }, { "epoch": 0.8850923816755252, "grad_norm": 0.14519689977169037, "learning_rate": 1.6872407784699204e-05, "loss": 0.5251, "step": 3497 }, { "epoch": 0.8853454821564161, "grad_norm": 0.14190933108329773, "learning_rate": 1.6870668279581232e-05, "loss": 0.5033, "step": 3498 }, { "epoch": 0.8855985826373071, "grad_norm": 0.14906252920627594, "learning_rate": 1.6868928380582424e-05, "loss": 0.5047, "step": 3499 }, { "epoch": 0.885851683118198, "grad_norm": 0.1466013342142105, "learning_rate": 1.686718808780252e-05, "loss": 0.5193, "step": 3500 }, { "epoch": 0.8861047835990888, "grad_norm": 0.15025852620601654, "learning_rate": 1.686544740134129e-05, "loss": 0.5506, "step": 3501 }, { "epoch": 0.8863578840799797, "grad_norm": 0.15151681005954742, "learning_rate": 1.686370632129853e-05, "loss": 0.5384, "step": 3502 }, { "epoch": 0.8866109845608706, "grad_norm": 0.1482160985469818, "learning_rate": 1.686196484777404e-05, "loss": 0.5193, "step": 3503 }, { "epoch": 0.8868640850417616, "grad_norm": 0.15177269279956818, "learning_rate": 1.686022298086766e-05, "loss": 0.5265, "step": 3504 }, { "epoch": 0.8871171855226525, "grad_norm": 0.1474560648202896, "learning_rate": 1.6858480720679257e-05, "loss": 0.5094, "step": 3505 }, { "epoch": 0.8873702860035434, "grad_norm": 0.1532500833272934, "learning_rate": 1.6856738067308695e-05, "loss": 0.4938, "step": 3506 }, { "epoch": 0.8876233864844343, "grad_norm": 0.1511625349521637, "learning_rate": 1.6854995020855886e-05, "loss": 0.5344, "step": 3507 }, { "epoch": 0.8878764869653253, "grad_norm": 0.15170307457447052, "learning_rate": 1.6853251581420755e-05, "loss": 0.5198, "step": 3508 }, { "epoch": 0.8881295874462162, "grad_norm": 0.1520860344171524, "learning_rate": 1.6851507749103245e-05, "loss": 0.5533, "step": 3509 }, { "epoch": 0.8883826879271071, "grad_norm": 0.14674068987369537, "learning_rate": 1.6849763524003334e-05, "loss": 0.5151, "step": 3510 }, { "epoch": 0.888635788407998, "grad_norm": 0.16098830103874207, "learning_rate": 1.684801890622101e-05, "loss": 0.5354, "step": 3511 }, { "epoch": 0.8888888888888888, "grad_norm": 0.14611618220806122, "learning_rate": 1.6846273895856287e-05, "loss": 0.5335, "step": 3512 }, { "epoch": 0.8891419893697798, "grad_norm": 0.1465662717819214, "learning_rate": 1.6844528493009202e-05, "loss": 0.5089, "step": 3513 }, { "epoch": 0.8893950898506707, "grad_norm": 0.14643220603466034, "learning_rate": 1.6842782697779818e-05, "loss": 0.5292, "step": 3514 }, { "epoch": 0.8896481903315616, "grad_norm": 0.20344223082065582, "learning_rate": 1.6841036510268218e-05, "loss": 0.5048, "step": 3515 }, { "epoch": 0.8899012908124525, "grad_norm": 0.15312816202640533, "learning_rate": 1.683928993057451e-05, "loss": 0.5291, "step": 3516 }, { "epoch": 0.8901543912933435, "grad_norm": 0.15690524876117706, "learning_rate": 1.683754295879881e-05, "loss": 0.518, "step": 3517 }, { "epoch": 0.8904074917742344, "grad_norm": 0.15592250227928162, "learning_rate": 1.6835795595041284e-05, "loss": 0.5612, "step": 3518 }, { "epoch": 0.8906605922551253, "grad_norm": 0.1496099978685379, "learning_rate": 1.6834047839402096e-05, "loss": 0.5312, "step": 3519 }, { "epoch": 0.8909136927360162, "grad_norm": 0.17859135568141937, "learning_rate": 1.683229969198144e-05, "loss": 0.5369, "step": 3520 }, { "epoch": 0.8911667932169071, "grad_norm": 0.1483176201581955, "learning_rate": 1.6830551152879534e-05, "loss": 0.5306, "step": 3521 }, { "epoch": 0.8914198936977981, "grad_norm": 0.1436195969581604, "learning_rate": 1.682880222219662e-05, "loss": 0.504, "step": 3522 }, { "epoch": 0.891672994178689, "grad_norm": 0.14228114485740662, "learning_rate": 1.6827052900032963e-05, "loss": 0.5106, "step": 3523 }, { "epoch": 0.8919260946595798, "grad_norm": 0.13983725011348724, "learning_rate": 1.6825303186488843e-05, "loss": 0.5211, "step": 3524 }, { "epoch": 0.8921791951404707, "grad_norm": 0.1424570530653, "learning_rate": 1.6823553081664568e-05, "loss": 0.5133, "step": 3525 }, { "epoch": 0.8924322956213617, "grad_norm": 0.14800359308719635, "learning_rate": 1.6821802585660475e-05, "loss": 0.5283, "step": 3526 }, { "epoch": 0.8926853961022526, "grad_norm": 0.14731517434120178, "learning_rate": 1.6820051698576906e-05, "loss": 0.5271, "step": 3527 }, { "epoch": 0.8929384965831435, "grad_norm": 0.14860732853412628, "learning_rate": 1.681830042051424e-05, "loss": 0.5287, "step": 3528 }, { "epoch": 0.8931915970640344, "grad_norm": 0.14400489628314972, "learning_rate": 1.6816548751572875e-05, "loss": 0.5087, "step": 3529 }, { "epoch": 0.8934446975449253, "grad_norm": 0.1470673680305481, "learning_rate": 1.681479669185323e-05, "loss": 0.5077, "step": 3530 }, { "epoch": 0.8936977980258163, "grad_norm": 0.14760112762451172, "learning_rate": 1.6813044241455747e-05, "loss": 0.514, "step": 3531 }, { "epoch": 0.8939508985067072, "grad_norm": 0.1499701291322708, "learning_rate": 1.6811291400480886e-05, "loss": 0.5066, "step": 3532 }, { "epoch": 0.8942039989875981, "grad_norm": 0.15296226739883423, "learning_rate": 1.680953816902914e-05, "loss": 0.5396, "step": 3533 }, { "epoch": 0.894457099468489, "grad_norm": 0.1517123132944107, "learning_rate": 1.680778454720102e-05, "loss": 0.5158, "step": 3534 }, { "epoch": 0.89471019994938, "grad_norm": 0.14845465123653412, "learning_rate": 1.6806030535097045e-05, "loss": 0.5176, "step": 3535 }, { "epoch": 0.8949633004302708, "grad_norm": 0.15333910286426544, "learning_rate": 1.6804276132817784e-05, "loss": 0.5703, "step": 3536 }, { "epoch": 0.8952164009111617, "grad_norm": 0.14943931996822357, "learning_rate": 1.68025213404638e-05, "loss": 0.5164, "step": 3537 }, { "epoch": 0.8954695013920526, "grad_norm": 0.14630047976970673, "learning_rate": 1.68007661581357e-05, "loss": 0.553, "step": 3538 }, { "epoch": 0.8957226018729436, "grad_norm": 0.1605490893125534, "learning_rate": 1.67990105859341e-05, "loss": 0.5257, "step": 3539 }, { "epoch": 0.8959757023538345, "grad_norm": 0.14859427511692047, "learning_rate": 1.6797254623959648e-05, "loss": 0.5092, "step": 3540 }, { "epoch": 0.8962288028347254, "grad_norm": 0.15891937911510468, "learning_rate": 1.6795498272313005e-05, "loss": 0.5291, "step": 3541 }, { "epoch": 0.8964819033156163, "grad_norm": 0.15164145827293396, "learning_rate": 1.6793741531094862e-05, "loss": 0.5138, "step": 3542 }, { "epoch": 0.8967350037965072, "grad_norm": 0.14699804782867432, "learning_rate": 1.679198440040593e-05, "loss": 0.534, "step": 3543 }, { "epoch": 0.8969881042773982, "grad_norm": 0.1932336390018463, "learning_rate": 1.6790226880346938e-05, "loss": 0.5186, "step": 3544 }, { "epoch": 0.8972412047582891, "grad_norm": 0.14946523308753967, "learning_rate": 1.6788468971018645e-05, "loss": 0.5245, "step": 3545 }, { "epoch": 0.89749430523918, "grad_norm": 0.1494015008211136, "learning_rate": 1.6786710672521823e-05, "loss": 0.5365, "step": 3546 }, { "epoch": 0.8977474057200708, "grad_norm": 0.14856594800949097, "learning_rate": 1.6784951984957272e-05, "loss": 0.5111, "step": 3547 }, { "epoch": 0.8980005062009618, "grad_norm": 0.1525156944990158, "learning_rate": 1.678319290842582e-05, "loss": 0.5072, "step": 3548 }, { "epoch": 0.8982536066818527, "grad_norm": 0.14739874005317688, "learning_rate": 1.6781433443028306e-05, "loss": 0.5156, "step": 3549 }, { "epoch": 0.8985067071627436, "grad_norm": 0.14433911442756653, "learning_rate": 1.67796735888656e-05, "loss": 0.5122, "step": 3550 }, { "epoch": 0.8987598076436345, "grad_norm": 0.1427396535873413, "learning_rate": 1.6777913346038586e-05, "loss": 0.5353, "step": 3551 }, { "epoch": 0.8990129081245254, "grad_norm": 0.14944280683994293, "learning_rate": 1.6776152714648178e-05, "loss": 0.5271, "step": 3552 }, { "epoch": 0.8992660086054164, "grad_norm": 0.15227444469928741, "learning_rate": 1.677439169479531e-05, "loss": 0.5553, "step": 3553 }, { "epoch": 0.8995191090863073, "grad_norm": 0.15288929641246796, "learning_rate": 1.6772630286580938e-05, "loss": 0.5201, "step": 3554 }, { "epoch": 0.8997722095671982, "grad_norm": 0.1477259397506714, "learning_rate": 1.6770868490106035e-05, "loss": 0.5427, "step": 3555 }, { "epoch": 0.9000253100480891, "grad_norm": 0.15653710067272186, "learning_rate": 1.6769106305471602e-05, "loss": 0.5349, "step": 3556 }, { "epoch": 0.90027841052898, "grad_norm": 0.14116887748241425, "learning_rate": 1.6767343732778667e-05, "loss": 0.5183, "step": 3557 }, { "epoch": 0.9005315110098709, "grad_norm": 0.1575784683227539, "learning_rate": 1.6765580772128268e-05, "loss": 0.5231, "step": 3558 }, { "epoch": 0.9007846114907618, "grad_norm": 0.15128043293952942, "learning_rate": 1.676381742362148e-05, "loss": 0.5116, "step": 3559 }, { "epoch": 0.9010377119716527, "grad_norm": 0.15102870762348175, "learning_rate": 1.6762053687359384e-05, "loss": 0.5229, "step": 3560 }, { "epoch": 0.9012908124525436, "grad_norm": 0.14146219193935394, "learning_rate": 1.6760289563443094e-05, "loss": 0.5177, "step": 3561 }, { "epoch": 0.9015439129334346, "grad_norm": 0.14700660109519958, "learning_rate": 1.6758525051973742e-05, "loss": 0.5387, "step": 3562 }, { "epoch": 0.9017970134143255, "grad_norm": 0.1503869742155075, "learning_rate": 1.6756760153052486e-05, "loss": 0.533, "step": 3563 }, { "epoch": 0.9020501138952164, "grad_norm": 0.1457030475139618, "learning_rate": 1.6754994866780502e-05, "loss": 0.514, "step": 3564 }, { "epoch": 0.9023032143761073, "grad_norm": 0.14772522449493408, "learning_rate": 1.6753229193258997e-05, "loss": 0.5597, "step": 3565 }, { "epoch": 0.9025563148569983, "grad_norm": 0.1503283530473709, "learning_rate": 1.6751463132589184e-05, "loss": 0.5361, "step": 3566 }, { "epoch": 0.9028094153378892, "grad_norm": 0.1497548520565033, "learning_rate": 1.674969668487231e-05, "loss": 0.5669, "step": 3567 }, { "epoch": 0.90306251581878, "grad_norm": 0.15100781619548798, "learning_rate": 1.6747929850209645e-05, "loss": 0.5151, "step": 3568 }, { "epoch": 0.9033156162996709, "grad_norm": 0.14710834622383118, "learning_rate": 1.6746162628702472e-05, "loss": 0.5, "step": 3569 }, { "epoch": 0.9035687167805618, "grad_norm": 0.1542469710111618, "learning_rate": 1.674439502045211e-05, "loss": 0.5578, "step": 3570 }, { "epoch": 0.9038218172614528, "grad_norm": 0.15553085505962372, "learning_rate": 1.674262702555988e-05, "loss": 0.5352, "step": 3571 }, { "epoch": 0.9040749177423437, "grad_norm": 0.14215399324893951, "learning_rate": 1.6740858644127153e-05, "loss": 0.5502, "step": 3572 }, { "epoch": 0.9043280182232346, "grad_norm": 0.14907367527484894, "learning_rate": 1.67390898762553e-05, "loss": 0.5246, "step": 3573 }, { "epoch": 0.9045811187041255, "grad_norm": 0.14666853845119476, "learning_rate": 1.6737320722045715e-05, "loss": 0.5213, "step": 3574 }, { "epoch": 0.9048342191850165, "grad_norm": 0.15853703022003174, "learning_rate": 1.6735551181599827e-05, "loss": 0.5466, "step": 3575 }, { "epoch": 0.9050873196659074, "grad_norm": 0.14738543331623077, "learning_rate": 1.6733781255019076e-05, "loss": 0.5472, "step": 3576 }, { "epoch": 0.9053404201467983, "grad_norm": 0.15210974216461182, "learning_rate": 1.673201094240493e-05, "loss": 0.5388, "step": 3577 }, { "epoch": 0.9055935206276892, "grad_norm": 0.14926952123641968, "learning_rate": 1.673024024385888e-05, "loss": 0.5079, "step": 3578 }, { "epoch": 0.90584662110858, "grad_norm": 0.14893130958080292, "learning_rate": 1.672846915948243e-05, "loss": 0.5282, "step": 3579 }, { "epoch": 0.906099721589471, "grad_norm": 0.14674939215183258, "learning_rate": 1.6726697689377112e-05, "loss": 0.4976, "step": 3580 }, { "epoch": 0.9063528220703619, "grad_norm": 0.1544337272644043, "learning_rate": 1.6724925833644495e-05, "loss": 0.5224, "step": 3581 }, { "epoch": 0.9066059225512528, "grad_norm": 0.1453384906053543, "learning_rate": 1.6723153592386137e-05, "loss": 0.511, "step": 3582 }, { "epoch": 0.9068590230321437, "grad_norm": 0.18287062644958496, "learning_rate": 1.6721380965703646e-05, "loss": 0.5394, "step": 3583 }, { "epoch": 0.9071121235130347, "grad_norm": 0.1470341831445694, "learning_rate": 1.671960795369864e-05, "loss": 0.5319, "step": 3584 }, { "epoch": 0.9073652239939256, "grad_norm": 0.14185889065265656, "learning_rate": 1.671783455647277e-05, "loss": 0.5097, "step": 3585 }, { "epoch": 0.9076183244748165, "grad_norm": 0.1518610417842865, "learning_rate": 1.671606077412769e-05, "loss": 0.5246, "step": 3586 }, { "epoch": 0.9078714249557074, "grad_norm": 0.16382010281085968, "learning_rate": 1.6714286606765098e-05, "loss": 0.5506, "step": 3587 }, { "epoch": 0.9081245254365984, "grad_norm": 0.14890503883361816, "learning_rate": 1.671251205448669e-05, "loss": 0.5394, "step": 3588 }, { "epoch": 0.9083776259174893, "grad_norm": 0.15002837777137756, "learning_rate": 1.671073711739421e-05, "loss": 0.5452, "step": 3589 }, { "epoch": 0.9086307263983802, "grad_norm": 0.14891470968723297, "learning_rate": 1.6708961795589406e-05, "loss": 0.533, "step": 3590 }, { "epoch": 0.908883826879271, "grad_norm": 0.1472836583852768, "learning_rate": 1.670718608917405e-05, "loss": 0.5244, "step": 3591 }, { "epoch": 0.9091369273601619, "grad_norm": 0.14605462551116943, "learning_rate": 1.6705409998249947e-05, "loss": 0.5079, "step": 3592 }, { "epoch": 0.9093900278410529, "grad_norm": 0.14506979286670685, "learning_rate": 1.6703633522918908e-05, "loss": 0.5274, "step": 3593 }, { "epoch": 0.9096431283219438, "grad_norm": 0.148104727268219, "learning_rate": 1.6701856663282786e-05, "loss": 0.5499, "step": 3594 }, { "epoch": 0.9098962288028347, "grad_norm": 0.16289789974689484, "learning_rate": 1.670007941944343e-05, "loss": 0.5337, "step": 3595 }, { "epoch": 0.9101493292837256, "grad_norm": 0.15084603428840637, "learning_rate": 1.669830179150274e-05, "loss": 0.5294, "step": 3596 }, { "epoch": 0.9104024297646166, "grad_norm": 0.14832569658756256, "learning_rate": 1.6696523779562614e-05, "loss": 0.5128, "step": 3597 }, { "epoch": 0.9106555302455075, "grad_norm": 0.14730194211006165, "learning_rate": 1.6694745383724984e-05, "loss": 0.5169, "step": 3598 }, { "epoch": 0.9109086307263984, "grad_norm": 0.14623935520648956, "learning_rate": 1.6692966604091804e-05, "loss": 0.5247, "step": 3599 }, { "epoch": 0.9111617312072893, "grad_norm": 0.5078619122505188, "learning_rate": 1.6691187440765044e-05, "loss": 0.5421, "step": 3600 }, { "epoch": 0.9114148316881802, "grad_norm": 0.15417684614658356, "learning_rate": 1.6689407893846702e-05, "loss": 0.5199, "step": 3601 }, { "epoch": 0.9116679321690712, "grad_norm": 0.14338022470474243, "learning_rate": 1.6687627963438798e-05, "loss": 0.5228, "step": 3602 }, { "epoch": 0.911921032649962, "grad_norm": 0.14711850881576538, "learning_rate": 1.6685847649643368e-05, "loss": 0.513, "step": 3603 }, { "epoch": 0.9121741331308529, "grad_norm": 0.1450110822916031, "learning_rate": 1.6684066952562474e-05, "loss": 0.4989, "step": 3604 }, { "epoch": 0.9124272336117438, "grad_norm": 0.14669804275035858, "learning_rate": 1.6682285872298195e-05, "loss": 0.504, "step": 3605 }, { "epoch": 0.9126803340926348, "grad_norm": 0.15428633987903595, "learning_rate": 1.668050440895265e-05, "loss": 0.5345, "step": 3606 }, { "epoch": 0.9129334345735257, "grad_norm": 0.17304518818855286, "learning_rate": 1.6678722562627954e-05, "loss": 0.5385, "step": 3607 }, { "epoch": 0.9131865350544166, "grad_norm": 0.1551153063774109, "learning_rate": 1.6676940333426262e-05, "loss": 0.5421, "step": 3608 }, { "epoch": 0.9134396355353075, "grad_norm": 0.15000928938388824, "learning_rate": 1.6675157721449743e-05, "loss": 0.5319, "step": 3609 }, { "epoch": 0.9136927360161984, "grad_norm": 0.1465054601430893, "learning_rate": 1.6673374726800592e-05, "loss": 0.5426, "step": 3610 }, { "epoch": 0.9139458364970894, "grad_norm": 0.16044995188713074, "learning_rate": 1.6671591349581024e-05, "loss": 0.5261, "step": 3611 }, { "epoch": 0.9141989369779803, "grad_norm": 0.1494501680135727, "learning_rate": 1.6669807589893277e-05, "loss": 0.5438, "step": 3612 }, { "epoch": 0.9144520374588712, "grad_norm": 0.1447419673204422, "learning_rate": 1.6668023447839607e-05, "loss": 0.5166, "step": 3613 }, { "epoch": 0.914705137939762, "grad_norm": 0.14979945123195648, "learning_rate": 1.6666238923522296e-05, "loss": 0.5397, "step": 3614 }, { "epoch": 0.914958238420653, "grad_norm": 0.14727343618869781, "learning_rate": 1.666445401704365e-05, "loss": 0.5271, "step": 3615 }, { "epoch": 0.9152113389015439, "grad_norm": 0.14938634634017944, "learning_rate": 1.6662668728505992e-05, "loss": 0.5385, "step": 3616 }, { "epoch": 0.9154644393824348, "grad_norm": 0.21567074954509735, "learning_rate": 1.6660883058011672e-05, "loss": 0.5465, "step": 3617 }, { "epoch": 0.9157175398633257, "grad_norm": 0.15296199917793274, "learning_rate": 1.6659097005663052e-05, "loss": 0.5444, "step": 3618 }, { "epoch": 0.9159706403442166, "grad_norm": 0.1634710282087326, "learning_rate": 1.665731057156253e-05, "loss": 0.5251, "step": 3619 }, { "epoch": 0.9162237408251076, "grad_norm": 0.14891543984413147, "learning_rate": 1.665552375581251e-05, "loss": 0.5243, "step": 3620 }, { "epoch": 0.9164768413059985, "grad_norm": 0.1558849960565567, "learning_rate": 1.665373655851543e-05, "loss": 0.5047, "step": 3621 }, { "epoch": 0.9167299417868894, "grad_norm": 0.14574921131134033, "learning_rate": 1.6651948979773754e-05, "loss": 0.5377, "step": 3622 }, { "epoch": 0.9169830422677803, "grad_norm": 0.14607584476470947, "learning_rate": 1.665016101968995e-05, "loss": 0.5197, "step": 3623 }, { "epoch": 0.9172361427486713, "grad_norm": 0.15331031382083893, "learning_rate": 1.664837267836652e-05, "loss": 0.5147, "step": 3624 }, { "epoch": 0.9174892432295622, "grad_norm": 0.14590170979499817, "learning_rate": 1.664658395590599e-05, "loss": 0.5443, "step": 3625 }, { "epoch": 0.917742343710453, "grad_norm": 0.14982114732265472, "learning_rate": 1.6644794852410896e-05, "loss": 0.5336, "step": 3626 }, { "epoch": 0.9179954441913439, "grad_norm": 0.1437799483537674, "learning_rate": 1.6643005367983815e-05, "loss": 0.5172, "step": 3627 }, { "epoch": 0.9182485446722348, "grad_norm": 0.1481659859418869, "learning_rate": 1.6641215502727322e-05, "loss": 0.51, "step": 3628 }, { "epoch": 0.9185016451531258, "grad_norm": 0.14843769371509552, "learning_rate": 1.663942525674403e-05, "loss": 0.5126, "step": 3629 }, { "epoch": 0.9187547456340167, "grad_norm": 0.14817391335964203, "learning_rate": 1.663763463013658e-05, "loss": 0.493, "step": 3630 }, { "epoch": 0.9190078461149076, "grad_norm": 0.1455070525407791, "learning_rate": 1.6635843623007612e-05, "loss": 0.5583, "step": 3631 }, { "epoch": 0.9192609465957985, "grad_norm": 0.14654161036014557, "learning_rate": 1.6634052235459807e-05, "loss": 0.5174, "step": 3632 }, { "epoch": 0.9195140470766895, "grad_norm": 0.1489616334438324, "learning_rate": 1.6632260467595855e-05, "loss": 0.5653, "step": 3633 }, { "epoch": 0.9197671475575804, "grad_norm": 0.14681079983711243, "learning_rate": 1.6630468319518485e-05, "loss": 0.5171, "step": 3634 }, { "epoch": 0.9200202480384713, "grad_norm": 0.14830783009529114, "learning_rate": 1.6628675791330428e-05, "loss": 0.5485, "step": 3635 }, { "epoch": 0.9202733485193622, "grad_norm": 0.15489326417446136, "learning_rate": 1.662688288313445e-05, "loss": 0.517, "step": 3636 }, { "epoch": 0.9205264490002532, "grad_norm": 0.1531219482421875, "learning_rate": 1.662508959503333e-05, "loss": 0.5279, "step": 3637 }, { "epoch": 0.920779549481144, "grad_norm": 0.1536158174276352, "learning_rate": 1.6623295927129884e-05, "loss": 0.553, "step": 3638 }, { "epoch": 0.9210326499620349, "grad_norm": 0.1454981416463852, "learning_rate": 1.6621501879526926e-05, "loss": 0.5416, "step": 3639 }, { "epoch": 0.9212857504429258, "grad_norm": 0.14393465220928192, "learning_rate": 1.6619707452327315e-05, "loss": 0.4994, "step": 3640 }, { "epoch": 0.9215388509238167, "grad_norm": 0.14966842532157898, "learning_rate": 1.6617912645633915e-05, "loss": 0.5153, "step": 3641 }, { "epoch": 0.9217919514047077, "grad_norm": 0.14466114342212677, "learning_rate": 1.6616117459549626e-05, "loss": 0.5079, "step": 3642 }, { "epoch": 0.9220450518855986, "grad_norm": 0.15230457484722137, "learning_rate": 1.661432189417735e-05, "loss": 0.5477, "step": 3643 }, { "epoch": 0.9222981523664895, "grad_norm": 0.15053333342075348, "learning_rate": 1.6612525949620034e-05, "loss": 0.5474, "step": 3644 }, { "epoch": 0.9225512528473804, "grad_norm": 0.1569037139415741, "learning_rate": 1.6610729625980634e-05, "loss": 0.5449, "step": 3645 }, { "epoch": 0.9228043533282714, "grad_norm": 0.15597319602966309, "learning_rate": 1.6608932923362126e-05, "loss": 0.5321, "step": 3646 }, { "epoch": 0.9230574538091623, "grad_norm": 0.14706282317638397, "learning_rate": 1.660713584186751e-05, "loss": 0.5087, "step": 3647 }, { "epoch": 0.9233105542900532, "grad_norm": 0.14015620946884155, "learning_rate": 1.6605338381599817e-05, "loss": 0.5065, "step": 3648 }, { "epoch": 0.923563654770944, "grad_norm": 0.14668244123458862, "learning_rate": 1.660354054266208e-05, "loss": 0.5135, "step": 3649 }, { "epoch": 0.9238167552518349, "grad_norm": 0.14619354903697968, "learning_rate": 1.6601742325157374e-05, "loss": 0.5209, "step": 3650 }, { "epoch": 0.9240698557327259, "grad_norm": 0.1468517780303955, "learning_rate": 1.6599943729188788e-05, "loss": 0.5401, "step": 3651 }, { "epoch": 0.9243229562136168, "grad_norm": 0.1478777825832367, "learning_rate": 1.659814475485942e-05, "loss": 0.5466, "step": 3652 }, { "epoch": 0.9245760566945077, "grad_norm": 0.15836574137210846, "learning_rate": 1.659634540227242e-05, "loss": 0.5302, "step": 3653 }, { "epoch": 0.9248291571753986, "grad_norm": 0.1507386863231659, "learning_rate": 1.6594545671530924e-05, "loss": 0.5455, "step": 3654 }, { "epoch": 0.9250822576562896, "grad_norm": 0.14204141497612, "learning_rate": 1.6592745562738113e-05, "loss": 0.5231, "step": 3655 }, { "epoch": 0.9253353581371805, "grad_norm": 0.15151117742061615, "learning_rate": 1.6590945075997186e-05, "loss": 0.5254, "step": 3656 }, { "epoch": 0.9255884586180714, "grad_norm": 0.14500434696674347, "learning_rate": 1.6589144211411357e-05, "loss": 0.5194, "step": 3657 }, { "epoch": 0.9258415590989623, "grad_norm": 0.15036217868328094, "learning_rate": 1.6587342969083867e-05, "loss": 0.5352, "step": 3658 }, { "epoch": 0.9260946595798532, "grad_norm": 0.14913876354694366, "learning_rate": 1.658554134911798e-05, "loss": 0.5438, "step": 3659 }, { "epoch": 0.9263477600607442, "grad_norm": 0.1413276195526123, "learning_rate": 1.6583739351616975e-05, "loss": 0.5293, "step": 3660 }, { "epoch": 0.926600860541635, "grad_norm": 0.15294210612773895, "learning_rate": 1.658193697668416e-05, "loss": 0.5114, "step": 3661 }, { "epoch": 0.9268539610225259, "grad_norm": 0.15567360818386078, "learning_rate": 1.658013422442286e-05, "loss": 0.5246, "step": 3662 }, { "epoch": 0.9271070615034168, "grad_norm": 0.14649812877178192, "learning_rate": 1.6578331094936423e-05, "loss": 0.5083, "step": 3663 }, { "epoch": 0.9273601619843078, "grad_norm": 0.14088179171085358, "learning_rate": 1.657652758832822e-05, "loss": 0.5252, "step": 3664 }, { "epoch": 0.9276132624651987, "grad_norm": 0.14526082575321198, "learning_rate": 1.657472370470164e-05, "loss": 0.5262, "step": 3665 }, { "epoch": 0.9278663629460896, "grad_norm": 0.14780530333518982, "learning_rate": 1.6572919444160093e-05, "loss": 0.5149, "step": 3666 }, { "epoch": 0.9281194634269805, "grad_norm": 0.15143126249313354, "learning_rate": 1.657111480680702e-05, "loss": 0.5299, "step": 3667 }, { "epoch": 0.9283725639078714, "grad_norm": 0.14445963501930237, "learning_rate": 1.6569309792745873e-05, "loss": 0.5063, "step": 3668 }, { "epoch": 0.9286256643887624, "grad_norm": 0.14489617943763733, "learning_rate": 1.6567504402080134e-05, "loss": 0.5001, "step": 3669 }, { "epoch": 0.9288787648696533, "grad_norm": 0.15362706780433655, "learning_rate": 1.6565698634913296e-05, "loss": 0.517, "step": 3670 }, { "epoch": 0.9291318653505442, "grad_norm": 0.15799693763256073, "learning_rate": 1.6563892491348882e-05, "loss": 0.5247, "step": 3671 }, { "epoch": 0.929384965831435, "grad_norm": 0.1488887518644333, "learning_rate": 1.6562085971490437e-05, "loss": 0.5233, "step": 3672 }, { "epoch": 0.929638066312326, "grad_norm": 0.16144323348999023, "learning_rate": 1.6560279075441522e-05, "loss": 0.5007, "step": 3673 }, { "epoch": 0.9298911667932169, "grad_norm": 0.14974623918533325, "learning_rate": 1.6558471803305723e-05, "loss": 0.539, "step": 3674 }, { "epoch": 0.9301442672741078, "grad_norm": 0.1455133706331253, "learning_rate": 1.655666415518665e-05, "loss": 0.5211, "step": 3675 }, { "epoch": 0.9303973677549987, "grad_norm": 0.15050366520881653, "learning_rate": 1.655485613118793e-05, "loss": 0.5491, "step": 3676 }, { "epoch": 0.9306504682358896, "grad_norm": 0.15126065909862518, "learning_rate": 1.6553047731413208e-05, "loss": 0.5299, "step": 3677 }, { "epoch": 0.9309035687167806, "grad_norm": 0.14508146047592163, "learning_rate": 1.6551238955966164e-05, "loss": 0.5337, "step": 3678 }, { "epoch": 0.9311566691976715, "grad_norm": 0.14636819064617157, "learning_rate": 1.6549429804950484e-05, "loss": 0.5469, "step": 3679 }, { "epoch": 0.9314097696785624, "grad_norm": 0.14605185389518738, "learning_rate": 1.6547620278469886e-05, "loss": 0.5174, "step": 3680 }, { "epoch": 0.9316628701594533, "grad_norm": 0.15410538017749786, "learning_rate": 1.6545810376628112e-05, "loss": 0.5187, "step": 3681 }, { "epoch": 0.9319159706403443, "grad_norm": 0.14263054728507996, "learning_rate": 1.654400009952891e-05, "loss": 0.5093, "step": 3682 }, { "epoch": 0.9321690711212351, "grad_norm": 0.15540753304958344, "learning_rate": 1.6542189447276062e-05, "loss": 0.5138, "step": 3683 }, { "epoch": 0.932422171602126, "grad_norm": 0.14605596661567688, "learning_rate": 1.6540378419973374e-05, "loss": 0.503, "step": 3684 }, { "epoch": 0.9326752720830169, "grad_norm": 0.14798161387443542, "learning_rate": 1.6538567017724663e-05, "loss": 0.528, "step": 3685 }, { "epoch": 0.9329283725639079, "grad_norm": 0.1593511402606964, "learning_rate": 1.6536755240633777e-05, "loss": 0.5304, "step": 3686 }, { "epoch": 0.9331814730447988, "grad_norm": 0.15393568575382233, "learning_rate": 1.653494308880458e-05, "loss": 0.5125, "step": 3687 }, { "epoch": 0.9334345735256897, "grad_norm": 0.14946870505809784, "learning_rate": 1.6533130562340957e-05, "loss": 0.5391, "step": 3688 }, { "epoch": 0.9336876740065806, "grad_norm": 0.14910483360290527, "learning_rate": 1.6531317661346813e-05, "loss": 0.5577, "step": 3689 }, { "epoch": 0.9339407744874715, "grad_norm": 0.14952532947063446, "learning_rate": 1.6529504385926086e-05, "loss": 0.5466, "step": 3690 }, { "epoch": 0.9341938749683625, "grad_norm": 0.14770744740962982, "learning_rate": 1.652769073618272e-05, "loss": 0.5334, "step": 3691 }, { "epoch": 0.9344469754492534, "grad_norm": 0.14591385424137115, "learning_rate": 1.6525876712220696e-05, "loss": 0.5506, "step": 3692 }, { "epoch": 0.9347000759301443, "grad_norm": 0.1432468444108963, "learning_rate": 1.6524062314143997e-05, "loss": 0.4886, "step": 3693 }, { "epoch": 0.9349531764110351, "grad_norm": 0.1527366042137146, "learning_rate": 1.6522247542056652e-05, "loss": 0.5242, "step": 3694 }, { "epoch": 0.9352062768919261, "grad_norm": 0.1392030417919159, "learning_rate": 1.6520432396062685e-05, "loss": 0.4892, "step": 3695 }, { "epoch": 0.935459377372817, "grad_norm": 0.1481018364429474, "learning_rate": 1.651861687626616e-05, "loss": 0.5395, "step": 3696 }, { "epoch": 0.9357124778537079, "grad_norm": 0.14707832038402557, "learning_rate": 1.651680098277116e-05, "loss": 0.5346, "step": 3697 }, { "epoch": 0.9359655783345988, "grad_norm": 0.14899218082427979, "learning_rate": 1.6514984715681783e-05, "loss": 0.5683, "step": 3698 }, { "epoch": 0.9362186788154897, "grad_norm": 0.1743021011352539, "learning_rate": 1.651316807510215e-05, "loss": 0.5254, "step": 3699 }, { "epoch": 0.9364717792963807, "grad_norm": 0.14781922101974487, "learning_rate": 1.651135106113641e-05, "loss": 0.564, "step": 3700 }, { "epoch": 0.9367248797772716, "grad_norm": 0.16526828706264496, "learning_rate": 1.6509533673888722e-05, "loss": 0.5316, "step": 3701 }, { "epoch": 0.9369779802581625, "grad_norm": 0.1428469568490982, "learning_rate": 1.650771591346328e-05, "loss": 0.5436, "step": 3702 }, { "epoch": 0.9372310807390534, "grad_norm": 0.14892153441905975, "learning_rate": 1.6505897779964288e-05, "loss": 0.5438, "step": 3703 }, { "epoch": 0.9374841812199444, "grad_norm": 0.14730869233608246, "learning_rate": 1.6504079273495977e-05, "loss": 0.5255, "step": 3704 }, { "epoch": 0.9377372817008353, "grad_norm": 0.15142153203487396, "learning_rate": 1.6502260394162598e-05, "loss": 0.4979, "step": 3705 }, { "epoch": 0.9379903821817261, "grad_norm": 0.14910633862018585, "learning_rate": 1.6500441142068426e-05, "loss": 0.5294, "step": 3706 }, { "epoch": 0.938243482662617, "grad_norm": 0.14492537081241608, "learning_rate": 1.649862151731775e-05, "loss": 0.5155, "step": 3707 }, { "epoch": 0.9384965831435079, "grad_norm": 0.16272975504398346, "learning_rate": 1.6496801520014886e-05, "loss": 0.5372, "step": 3708 }, { "epoch": 0.9387496836243989, "grad_norm": 0.1430039256811142, "learning_rate": 1.6494981150264172e-05, "loss": 0.539, "step": 3709 }, { "epoch": 0.9390027841052898, "grad_norm": 0.15006831288337708, "learning_rate": 1.6493160408169972e-05, "loss": 0.5207, "step": 3710 }, { "epoch": 0.9392558845861807, "grad_norm": 0.1520131528377533, "learning_rate": 1.6491339293836654e-05, "loss": 0.5295, "step": 3711 }, { "epoch": 0.9395089850670716, "grad_norm": 0.15275251865386963, "learning_rate": 1.6489517807368626e-05, "loss": 0.533, "step": 3712 }, { "epoch": 0.9397620855479626, "grad_norm": 0.1537601202726364, "learning_rate": 1.6487695948870307e-05, "loss": 0.548, "step": 3713 }, { "epoch": 0.9400151860288535, "grad_norm": 0.14867492020130157, "learning_rate": 1.648587371844614e-05, "loss": 0.522, "step": 3714 }, { "epoch": 0.9402682865097444, "grad_norm": 0.1484834849834442, "learning_rate": 1.6484051116200594e-05, "loss": 0.5443, "step": 3715 }, { "epoch": 0.9405213869906353, "grad_norm": 0.14887021481990814, "learning_rate": 1.648222814223815e-05, "loss": 0.5181, "step": 3716 }, { "epoch": 0.9407744874715261, "grad_norm": 0.14832769334316254, "learning_rate": 1.6480404796663316e-05, "loss": 0.5076, "step": 3717 }, { "epoch": 0.9410275879524171, "grad_norm": 0.155690997838974, "learning_rate": 1.647858107958062e-05, "loss": 0.5299, "step": 3718 }, { "epoch": 0.941280688433308, "grad_norm": 0.14807718992233276, "learning_rate": 1.6476756991094614e-05, "loss": 0.5245, "step": 3719 }, { "epoch": 0.9415337889141989, "grad_norm": 0.1461859941482544, "learning_rate": 1.6474932531309874e-05, "loss": 0.5091, "step": 3720 }, { "epoch": 0.9417868893950898, "grad_norm": 0.14575421810150146, "learning_rate": 1.6473107700330983e-05, "loss": 0.5237, "step": 3721 }, { "epoch": 0.9420399898759808, "grad_norm": 0.14872266352176666, "learning_rate": 1.6471282498262557e-05, "loss": 0.545, "step": 3722 }, { "epoch": 0.9422930903568717, "grad_norm": 0.15600840747356415, "learning_rate": 1.6469456925209235e-05, "loss": 0.5221, "step": 3723 }, { "epoch": 0.9425461908377626, "grad_norm": 0.15039996802806854, "learning_rate": 1.6467630981275672e-05, "loss": 0.5293, "step": 3724 }, { "epoch": 0.9427992913186535, "grad_norm": 0.1508973389863968, "learning_rate": 1.6465804666566542e-05, "loss": 0.541, "step": 3725 }, { "epoch": 0.9430523917995444, "grad_norm": 0.14584723114967346, "learning_rate": 1.6463977981186545e-05, "loss": 0.5288, "step": 3726 }, { "epoch": 0.9433054922804354, "grad_norm": 0.14509539306163788, "learning_rate": 1.6462150925240403e-05, "loss": 0.5199, "step": 3727 }, { "epoch": 0.9435585927613263, "grad_norm": 0.15895618498325348, "learning_rate": 1.6460323498832856e-05, "loss": 0.5303, "step": 3728 }, { "epoch": 0.9438116932422171, "grad_norm": 0.15297171473503113, "learning_rate": 1.6458495702068667e-05, "loss": 0.5213, "step": 3729 }, { "epoch": 0.944064793723108, "grad_norm": 0.15110565721988678, "learning_rate": 1.645666753505262e-05, "loss": 0.5212, "step": 3730 }, { "epoch": 0.944317894203999, "grad_norm": 0.16017645597457886, "learning_rate": 1.645483899788952e-05, "loss": 0.5363, "step": 3731 }, { "epoch": 0.9445709946848899, "grad_norm": 0.21678687632083893, "learning_rate": 1.645301009068419e-05, "loss": 0.4942, "step": 3732 }, { "epoch": 0.9448240951657808, "grad_norm": 0.20207969844341278, "learning_rate": 1.6451180813541483e-05, "loss": 0.5625, "step": 3733 }, { "epoch": 0.9450771956466717, "grad_norm": 0.16444139182567596, "learning_rate": 1.6449351166566262e-05, "loss": 0.5143, "step": 3734 }, { "epoch": 0.9453302961275627, "grad_norm": 0.14706484973430634, "learning_rate": 1.644752114986342e-05, "loss": 0.5175, "step": 3735 }, { "epoch": 0.9455833966084536, "grad_norm": 0.14995858073234558, "learning_rate": 1.6445690763537867e-05, "loss": 0.5114, "step": 3736 }, { "epoch": 0.9458364970893445, "grad_norm": 0.16415844857692719, "learning_rate": 1.644386000769454e-05, "loss": 0.5115, "step": 3737 }, { "epoch": 0.9460895975702354, "grad_norm": 0.14436942338943481, "learning_rate": 1.6442028882438382e-05, "loss": 0.4989, "step": 3738 }, { "epoch": 0.9463426980511263, "grad_norm": 0.15018820762634277, "learning_rate": 1.6440197387874378e-05, "loss": 0.5292, "step": 3739 }, { "epoch": 0.9465957985320173, "grad_norm": 0.14948798716068268, "learning_rate": 1.6438365524107514e-05, "loss": 0.5559, "step": 3740 }, { "epoch": 0.9468488990129081, "grad_norm": 0.156710684299469, "learning_rate": 1.6436533291242814e-05, "loss": 0.527, "step": 3741 }, { "epoch": 0.947101999493799, "grad_norm": 0.1518458127975464, "learning_rate": 1.6434700689385313e-05, "loss": 0.545, "step": 3742 }, { "epoch": 0.9473550999746899, "grad_norm": 0.15717007219791412, "learning_rate": 1.6432867718640072e-05, "loss": 0.5381, "step": 3743 }, { "epoch": 0.9476082004555809, "grad_norm": 0.14824333786964417, "learning_rate": 1.6431034379112166e-05, "loss": 0.5292, "step": 3744 }, { "epoch": 0.9478613009364718, "grad_norm": 0.14966285228729248, "learning_rate": 1.6429200670906705e-05, "loss": 0.5353, "step": 3745 }, { "epoch": 0.9481144014173627, "grad_norm": 0.14533835649490356, "learning_rate": 1.642736659412881e-05, "loss": 0.5127, "step": 3746 }, { "epoch": 0.9483675018982536, "grad_norm": 0.14732496440410614, "learning_rate": 1.642553214888362e-05, "loss": 0.5474, "step": 3747 }, { "epoch": 0.9486206023791445, "grad_norm": 0.15061214566230774, "learning_rate": 1.64236973352763e-05, "loss": 0.519, "step": 3748 }, { "epoch": 0.9488737028600355, "grad_norm": 0.1512715071439743, "learning_rate": 1.642186215341204e-05, "loss": 0.5354, "step": 3749 }, { "epoch": 0.9491268033409264, "grad_norm": 0.1489042043685913, "learning_rate": 1.6420026603396046e-05, "loss": 0.5044, "step": 3750 }, { "epoch": 0.9493799038218173, "grad_norm": 0.14633409678936005, "learning_rate": 1.6418190685333545e-05, "loss": 0.537, "step": 3751 }, { "epoch": 0.9496330043027081, "grad_norm": 0.15067242085933685, "learning_rate": 1.6416354399329785e-05, "loss": 0.5315, "step": 3752 }, { "epoch": 0.9498861047835991, "grad_norm": 0.1458524614572525, "learning_rate": 1.6414517745490038e-05, "loss": 0.5463, "step": 3753 }, { "epoch": 0.95013920526449, "grad_norm": 0.15035071969032288, "learning_rate": 1.6412680723919593e-05, "loss": 0.5375, "step": 3754 }, { "epoch": 0.9503923057453809, "grad_norm": 0.14916814863681793, "learning_rate": 1.6410843334723768e-05, "loss": 0.515, "step": 3755 }, { "epoch": 0.9506454062262718, "grad_norm": 0.1461304873228073, "learning_rate": 1.6409005578007896e-05, "loss": 0.5117, "step": 3756 }, { "epoch": 0.9508985067071627, "grad_norm": 0.15287987887859344, "learning_rate": 1.6407167453877323e-05, "loss": 0.5396, "step": 3757 }, { "epoch": 0.9511516071880537, "grad_norm": 0.15371842682361603, "learning_rate": 1.6405328962437436e-05, "loss": 0.5179, "step": 3758 }, { "epoch": 0.9514047076689446, "grad_norm": 0.14629611372947693, "learning_rate": 1.6403490103793627e-05, "loss": 0.5133, "step": 3759 }, { "epoch": 0.9516578081498355, "grad_norm": 0.16815464198589325, "learning_rate": 1.6401650878051313e-05, "loss": 0.5232, "step": 3760 }, { "epoch": 0.9519109086307264, "grad_norm": 0.14734789729118347, "learning_rate": 1.639981128531593e-05, "loss": 0.525, "step": 3761 }, { "epoch": 0.9521640091116174, "grad_norm": 0.16419310867786407, "learning_rate": 1.6397971325692945e-05, "loss": 0.5383, "step": 3762 }, { "epoch": 0.9524171095925082, "grad_norm": 0.14848332107067108, "learning_rate": 1.6396130999287834e-05, "loss": 0.5169, "step": 3763 }, { "epoch": 0.9526702100733991, "grad_norm": 0.14701737463474274, "learning_rate": 1.6394290306206103e-05, "loss": 0.5309, "step": 3764 }, { "epoch": 0.95292331055429, "grad_norm": 0.14737538993358612, "learning_rate": 1.6392449246553273e-05, "loss": 0.5259, "step": 3765 }, { "epoch": 0.9531764110351809, "grad_norm": 0.15066717565059662, "learning_rate": 1.6390607820434884e-05, "loss": 0.5376, "step": 3766 }, { "epoch": 0.9534295115160719, "grad_norm": 0.15112866461277008, "learning_rate": 1.6388766027956507e-05, "loss": 0.5348, "step": 3767 }, { "epoch": 0.9536826119969628, "grad_norm": 0.17869140207767487, "learning_rate": 1.6386923869223723e-05, "loss": 0.5409, "step": 3768 }, { "epoch": 0.9539357124778537, "grad_norm": 0.1470973640680313, "learning_rate": 1.6385081344342144e-05, "loss": 0.5224, "step": 3769 }, { "epoch": 0.9541888129587446, "grad_norm": 0.14399096369743347, "learning_rate": 1.6383238453417397e-05, "loss": 0.503, "step": 3770 }, { "epoch": 0.9544419134396356, "grad_norm": 0.14881475269794464, "learning_rate": 1.638139519655513e-05, "loss": 0.5471, "step": 3771 }, { "epoch": 0.9546950139205265, "grad_norm": 0.15086811780929565, "learning_rate": 1.637955157386101e-05, "loss": 0.5539, "step": 3772 }, { "epoch": 0.9549481144014174, "grad_norm": 0.1603875756263733, "learning_rate": 1.6377707585440733e-05, "loss": 0.5547, "step": 3773 }, { "epoch": 0.9552012148823082, "grad_norm": 0.3683100640773773, "learning_rate": 1.6375863231400005e-05, "loss": 0.5051, "step": 3774 }, { "epoch": 0.9554543153631991, "grad_norm": 0.14303649961948395, "learning_rate": 1.637401851184457e-05, "loss": 0.5323, "step": 3775 }, { "epoch": 0.9557074158440901, "grad_norm": 0.14671659469604492, "learning_rate": 1.637217342688017e-05, "loss": 0.533, "step": 3776 }, { "epoch": 0.955960516324981, "grad_norm": 0.15702015161514282, "learning_rate": 1.637032797661258e-05, "loss": 0.5313, "step": 3777 }, { "epoch": 0.9562136168058719, "grad_norm": 0.14805901050567627, "learning_rate": 1.636848216114761e-05, "loss": 0.5072, "step": 3778 }, { "epoch": 0.9564667172867628, "grad_norm": 0.1447458118200302, "learning_rate": 1.636663598059106e-05, "loss": 0.5454, "step": 3779 }, { "epoch": 0.9567198177676538, "grad_norm": 0.1459626704454422, "learning_rate": 1.6364789435048772e-05, "loss": 0.5377, "step": 3780 }, { "epoch": 0.9569729182485447, "grad_norm": 0.1516941636800766, "learning_rate": 1.6362942524626614e-05, "loss": 0.5233, "step": 3781 }, { "epoch": 0.9572260187294356, "grad_norm": 0.14520078897476196, "learning_rate": 1.6361095249430455e-05, "loss": 0.5124, "step": 3782 }, { "epoch": 0.9574791192103265, "grad_norm": 0.15192930400371552, "learning_rate": 1.63592476095662e-05, "loss": 0.5162, "step": 3783 }, { "epoch": 0.9577322196912175, "grad_norm": 0.14521363377571106, "learning_rate": 1.635739960513977e-05, "loss": 0.5255, "step": 3784 }, { "epoch": 0.9579853201721084, "grad_norm": 0.14702746272087097, "learning_rate": 1.6355551236257102e-05, "loss": 0.5122, "step": 3785 }, { "epoch": 0.9582384206529992, "grad_norm": 0.1460426151752472, "learning_rate": 1.6353702503024168e-05, "loss": 0.5111, "step": 3786 }, { "epoch": 0.9584915211338901, "grad_norm": 0.1451456993818283, "learning_rate": 1.6351853405546944e-05, "loss": 0.522, "step": 3787 }, { "epoch": 0.958744621614781, "grad_norm": 0.14988650381565094, "learning_rate": 1.635000394393144e-05, "loss": 0.5293, "step": 3788 }, { "epoch": 0.958997722095672, "grad_norm": 0.15334585309028625, "learning_rate": 1.634815411828368e-05, "loss": 0.5087, "step": 3789 }, { "epoch": 0.9592508225765629, "grad_norm": 0.14127138257026672, "learning_rate": 1.6346303928709706e-05, "loss": 0.5311, "step": 3790 }, { "epoch": 0.9595039230574538, "grad_norm": 0.1901031732559204, "learning_rate": 1.6344453375315593e-05, "loss": 0.5593, "step": 3791 }, { "epoch": 0.9597570235383447, "grad_norm": 0.15265797078609467, "learning_rate": 1.6342602458207427e-05, "loss": 0.519, "step": 3792 }, { "epoch": 0.9600101240192357, "grad_norm": 0.1488872468471527, "learning_rate": 1.6340751177491316e-05, "loss": 0.5418, "step": 3793 }, { "epoch": 0.9602632245001266, "grad_norm": 0.1467711478471756, "learning_rate": 1.6338899533273388e-05, "loss": 0.5083, "step": 3794 }, { "epoch": 0.9605163249810175, "grad_norm": 0.14361095428466797, "learning_rate": 1.6337047525659802e-05, "loss": 0.492, "step": 3795 }, { "epoch": 0.9607694254619084, "grad_norm": 0.1403983235359192, "learning_rate": 1.6335195154756716e-05, "loss": 0.4922, "step": 3796 }, { "epoch": 0.9610225259427992, "grad_norm": 0.14709924161434174, "learning_rate": 1.6333342420670338e-05, "loss": 0.5513, "step": 3797 }, { "epoch": 0.9612756264236902, "grad_norm": 0.14563478529453278, "learning_rate": 1.6331489323506868e-05, "loss": 0.525, "step": 3798 }, { "epoch": 0.9615287269045811, "grad_norm": 0.15184901654720306, "learning_rate": 1.6329635863372548e-05, "loss": 0.5041, "step": 3799 }, { "epoch": 0.961781827385472, "grad_norm": 0.14962069690227509, "learning_rate": 1.6327782040373626e-05, "loss": 0.5218, "step": 3800 }, { "epoch": 0.9620349278663629, "grad_norm": 0.14599859714508057, "learning_rate": 1.632592785461639e-05, "loss": 0.513, "step": 3801 }, { "epoch": 0.9622880283472539, "grad_norm": 0.14386872947216034, "learning_rate": 1.6324073306207125e-05, "loss": 0.5376, "step": 3802 }, { "epoch": 0.9625411288281448, "grad_norm": 0.14680960774421692, "learning_rate": 1.6322218395252153e-05, "loss": 0.5156, "step": 3803 }, { "epoch": 0.9627942293090357, "grad_norm": 0.15352913737297058, "learning_rate": 1.6320363121857808e-05, "loss": 0.5279, "step": 3804 }, { "epoch": 0.9630473297899266, "grad_norm": 0.16522404551506042, "learning_rate": 1.6318507486130456e-05, "loss": 0.5416, "step": 3805 }, { "epoch": 0.9633004302708175, "grad_norm": 0.15523165464401245, "learning_rate": 1.631665148817647e-05, "loss": 0.4957, "step": 3806 }, { "epoch": 0.9635535307517085, "grad_norm": 0.1665385216474533, "learning_rate": 1.6314795128102256e-05, "loss": 0.5234, "step": 3807 }, { "epoch": 0.9638066312325994, "grad_norm": 0.16250942647457123, "learning_rate": 1.631293840601423e-05, "loss": 0.554, "step": 3808 }, { "epoch": 0.9640597317134902, "grad_norm": 0.1459292620420456, "learning_rate": 1.631108132201884e-05, "loss": 0.5082, "step": 3809 }, { "epoch": 0.9643128321943811, "grad_norm": 0.1556709259748459, "learning_rate": 1.6309223876222545e-05, "loss": 0.5629, "step": 3810 }, { "epoch": 0.9645659326752721, "grad_norm": 0.15264128148555756, "learning_rate": 1.6307366068731827e-05, "loss": 0.5323, "step": 3811 }, { "epoch": 0.964819033156163, "grad_norm": 0.14854317903518677, "learning_rate": 1.6305507899653193e-05, "loss": 0.5327, "step": 3812 }, { "epoch": 0.9650721336370539, "grad_norm": 0.14538481831550598, "learning_rate": 1.6303649369093165e-05, "loss": 0.5338, "step": 3813 }, { "epoch": 0.9653252341179448, "grad_norm": 0.1563609093427658, "learning_rate": 1.6301790477158294e-05, "loss": 0.5166, "step": 3814 }, { "epoch": 0.9655783345988357, "grad_norm": 0.14705339074134827, "learning_rate": 1.6299931223955136e-05, "loss": 0.5347, "step": 3815 }, { "epoch": 0.9658314350797267, "grad_norm": 0.14251990616321564, "learning_rate": 1.6298071609590286e-05, "loss": 0.5251, "step": 3816 }, { "epoch": 0.9660845355606176, "grad_norm": 0.16012468934059143, "learning_rate": 1.6296211634170354e-05, "loss": 0.5513, "step": 3817 }, { "epoch": 0.9663376360415085, "grad_norm": 0.1472565084695816, "learning_rate": 1.629435129780196e-05, "loss": 0.5347, "step": 3818 }, { "epoch": 0.9665907365223994, "grad_norm": 0.14485855400562286, "learning_rate": 1.629249060059176e-05, "loss": 0.5403, "step": 3819 }, { "epoch": 0.9668438370032904, "grad_norm": 0.14992330968379974, "learning_rate": 1.629062954264643e-05, "loss": 0.5299, "step": 3820 }, { "epoch": 0.9670969374841812, "grad_norm": 0.15015843510627747, "learning_rate": 1.628876812407264e-05, "loss": 0.5461, "step": 3821 }, { "epoch": 0.9673500379650721, "grad_norm": 0.14781992137432098, "learning_rate": 1.628690634497712e-05, "loss": 0.5088, "step": 3822 }, { "epoch": 0.967603138445963, "grad_norm": 0.1436677873134613, "learning_rate": 1.6285044205466592e-05, "loss": 0.5124, "step": 3823 }, { "epoch": 0.9678562389268539, "grad_norm": 0.15176212787628174, "learning_rate": 1.6283181705647812e-05, "loss": 0.557, "step": 3824 }, { "epoch": 0.9681093394077449, "grad_norm": 0.14748024940490723, "learning_rate": 1.628131884562756e-05, "loss": 0.518, "step": 3825 }, { "epoch": 0.9683624398886358, "grad_norm": 0.15286625921726227, "learning_rate": 1.6279455625512614e-05, "loss": 0.503, "step": 3826 }, { "epoch": 0.9686155403695267, "grad_norm": 0.1510486900806427, "learning_rate": 1.6277592045409802e-05, "loss": 0.5352, "step": 3827 }, { "epoch": 0.9688686408504176, "grad_norm": 0.148910254240036, "learning_rate": 1.6275728105425953e-05, "loss": 0.5318, "step": 3828 }, { "epoch": 0.9691217413313086, "grad_norm": 0.14115232229232788, "learning_rate": 1.6273863805667923e-05, "loss": 0.5019, "step": 3829 }, { "epoch": 0.9693748418121995, "grad_norm": 0.16815748810768127, "learning_rate": 1.627199914624259e-05, "loss": 0.5088, "step": 3830 }, { "epoch": 0.9696279422930904, "grad_norm": 0.16133743524551392, "learning_rate": 1.6270134127256853e-05, "loss": 0.5311, "step": 3831 }, { "epoch": 0.9698810427739812, "grad_norm": 0.1418672502040863, "learning_rate": 1.626826874881762e-05, "loss": 0.512, "step": 3832 }, { "epoch": 0.9701341432548722, "grad_norm": 0.1418655812740326, "learning_rate": 1.6266403011031844e-05, "loss": 0.5298, "step": 3833 }, { "epoch": 0.9703872437357631, "grad_norm": 0.14854706823825836, "learning_rate": 1.6264536914006472e-05, "loss": 0.5426, "step": 3834 }, { "epoch": 0.970640344216654, "grad_norm": 0.15471938252449036, "learning_rate": 1.6262670457848488e-05, "loss": 0.5248, "step": 3835 }, { "epoch": 0.9708934446975449, "grad_norm": 0.1533239185810089, "learning_rate": 1.6260803642664893e-05, "loss": 0.5646, "step": 3836 }, { "epoch": 0.9711465451784358, "grad_norm": 0.14689037203788757, "learning_rate": 1.6258936468562702e-05, "loss": 0.5296, "step": 3837 }, { "epoch": 0.9713996456593268, "grad_norm": 0.1485079973936081, "learning_rate": 1.6257068935648965e-05, "loss": 0.5097, "step": 3838 }, { "epoch": 0.9716527461402177, "grad_norm": 0.15799418091773987, "learning_rate": 1.6255201044030734e-05, "loss": 0.5244, "step": 3839 }, { "epoch": 0.9719058466211086, "grad_norm": 0.1632402241230011, "learning_rate": 1.6253332793815097e-05, "loss": 0.5378, "step": 3840 }, { "epoch": 0.9721589471019995, "grad_norm": 0.14659827947616577, "learning_rate": 1.6251464185109157e-05, "loss": 0.511, "step": 3841 }, { "epoch": 0.9724120475828905, "grad_norm": 0.1553066223859787, "learning_rate": 1.6249595218020037e-05, "loss": 0.5075, "step": 3842 }, { "epoch": 0.9726651480637813, "grad_norm": 0.14816173911094666, "learning_rate": 1.624772589265488e-05, "loss": 0.5452, "step": 3843 }, { "epoch": 0.9729182485446722, "grad_norm": 0.14994974434375763, "learning_rate": 1.6245856209120847e-05, "loss": 0.5466, "step": 3844 }, { "epoch": 0.9731713490255631, "grad_norm": 0.14969618618488312, "learning_rate": 1.624398616752513e-05, "loss": 0.5011, "step": 3845 }, { "epoch": 0.973424449506454, "grad_norm": 0.14794595539569855, "learning_rate": 1.624211576797493e-05, "loss": 0.5262, "step": 3846 }, { "epoch": 0.973677549987345, "grad_norm": 0.15413470566272736, "learning_rate": 1.6240245010577474e-05, "loss": 0.5484, "step": 3847 }, { "epoch": 0.9739306504682359, "grad_norm": 0.14749734103679657, "learning_rate": 1.6238373895440006e-05, "loss": 0.5156, "step": 3848 }, { "epoch": 0.9741837509491268, "grad_norm": 0.14444613456726074, "learning_rate": 1.62365024226698e-05, "loss": 0.5126, "step": 3849 }, { "epoch": 0.9744368514300177, "grad_norm": 0.15196967124938965, "learning_rate": 1.623463059237414e-05, "loss": 0.5392, "step": 3850 }, { "epoch": 0.9746899519109087, "grad_norm": 0.1544530689716339, "learning_rate": 1.623275840466033e-05, "loss": 0.5233, "step": 3851 }, { "epoch": 0.9749430523917996, "grad_norm": 0.15481902658939362, "learning_rate": 1.6230885859635703e-05, "loss": 0.5226, "step": 3852 }, { "epoch": 0.9751961528726905, "grad_norm": 0.14608952403068542, "learning_rate": 1.6229012957407604e-05, "loss": 0.5013, "step": 3853 }, { "epoch": 0.9754492533535813, "grad_norm": 0.22510144114494324, "learning_rate": 1.622713969808341e-05, "loss": 0.5354, "step": 3854 }, { "epoch": 0.9757023538344722, "grad_norm": 0.14725567400455475, "learning_rate": 1.6225266081770503e-05, "loss": 0.5239, "step": 3855 }, { "epoch": 0.9759554543153632, "grad_norm": 0.15374629199504852, "learning_rate": 1.6223392108576298e-05, "loss": 0.543, "step": 3856 }, { "epoch": 0.9762085547962541, "grad_norm": 0.1542677879333496, "learning_rate": 1.6221517778608227e-05, "loss": 0.507, "step": 3857 }, { "epoch": 0.976461655277145, "grad_norm": 0.1502179205417633, "learning_rate": 1.621964309197374e-05, "loss": 0.5383, "step": 3858 }, { "epoch": 0.9767147557580359, "grad_norm": 0.14840199053287506, "learning_rate": 1.6217768048780304e-05, "loss": 0.5541, "step": 3859 }, { "epoch": 0.9769678562389269, "grad_norm": 0.16259582340717316, "learning_rate": 1.6215892649135412e-05, "loss": 0.5344, "step": 3860 }, { "epoch": 0.9772209567198178, "grad_norm": 0.1516430675983429, "learning_rate": 1.6214016893146584e-05, "loss": 0.5297, "step": 3861 }, { "epoch": 0.9774740572007087, "grad_norm": 0.18059854209423065, "learning_rate": 1.621214078092135e-05, "loss": 0.5492, "step": 3862 }, { "epoch": 0.9777271576815996, "grad_norm": 0.15057961642742157, "learning_rate": 1.621026431256726e-05, "loss": 0.5481, "step": 3863 }, { "epoch": 0.9779802581624905, "grad_norm": 0.14518338441848755, "learning_rate": 1.6208387488191895e-05, "loss": 0.5244, "step": 3864 }, { "epoch": 0.9782333586433815, "grad_norm": 0.15141542255878448, "learning_rate": 1.6206510307902838e-05, "loss": 0.4997, "step": 3865 }, { "epoch": 0.9784864591242723, "grad_norm": 0.15622003376483917, "learning_rate": 1.620463277180772e-05, "loss": 0.5277, "step": 3866 }, { "epoch": 0.9787395596051632, "grad_norm": 0.1454780101776123, "learning_rate": 1.6202754880014158e-05, "loss": 0.5247, "step": 3867 }, { "epoch": 0.9789926600860541, "grad_norm": 0.14858952164649963, "learning_rate": 1.620087663262982e-05, "loss": 0.5299, "step": 3868 }, { "epoch": 0.9792457605669451, "grad_norm": 0.14994552731513977, "learning_rate": 1.6198998029762376e-05, "loss": 0.5275, "step": 3869 }, { "epoch": 0.979498861047836, "grad_norm": 0.14378589391708374, "learning_rate": 1.6197119071519528e-05, "loss": 0.5253, "step": 3870 }, { "epoch": 0.9797519615287269, "grad_norm": 0.14868232607841492, "learning_rate": 1.6195239758008985e-05, "loss": 0.5236, "step": 3871 }, { "epoch": 0.9800050620096178, "grad_norm": 0.1519884616136551, "learning_rate": 1.6193360089338493e-05, "loss": 0.5475, "step": 3872 }, { "epoch": 0.9802581624905087, "grad_norm": 0.14714932441711426, "learning_rate": 1.6191480065615798e-05, "loss": 0.4973, "step": 3873 }, { "epoch": 0.9805112629713997, "grad_norm": 0.14794203639030457, "learning_rate": 1.6189599686948693e-05, "loss": 0.5031, "step": 3874 }, { "epoch": 0.9807643634522906, "grad_norm": 0.14940579235553741, "learning_rate": 1.618771895344496e-05, "loss": 0.5202, "step": 3875 }, { "epoch": 0.9810174639331815, "grad_norm": 0.1445947140455246, "learning_rate": 1.618583786521243e-05, "loss": 0.5114, "step": 3876 }, { "epoch": 0.9812705644140723, "grad_norm": 0.15065394341945648, "learning_rate": 1.6183956422358935e-05, "loss": 0.5115, "step": 3877 }, { "epoch": 0.9815236648949633, "grad_norm": 0.1555759757757187, "learning_rate": 1.6182074624992338e-05, "loss": 0.5206, "step": 3878 }, { "epoch": 0.9817767653758542, "grad_norm": 0.15297970175743103, "learning_rate": 1.6180192473220512e-05, "loss": 0.5322, "step": 3879 }, { "epoch": 0.9820298658567451, "grad_norm": 0.1459963321685791, "learning_rate": 1.6178309967151366e-05, "loss": 0.4921, "step": 3880 }, { "epoch": 0.982282966337636, "grad_norm": 0.14950400590896606, "learning_rate": 1.6176427106892814e-05, "loss": 0.5332, "step": 3881 }, { "epoch": 0.982536066818527, "grad_norm": 0.15379725396633148, "learning_rate": 1.6174543892552793e-05, "loss": 0.5238, "step": 3882 }, { "epoch": 0.9827891672994179, "grad_norm": 0.15259183943271637, "learning_rate": 1.6172660324239275e-05, "loss": 0.5376, "step": 3883 }, { "epoch": 0.9830422677803088, "grad_norm": 0.15693873167037964, "learning_rate": 1.617077640206023e-05, "loss": 0.5315, "step": 3884 }, { "epoch": 0.9832953682611997, "grad_norm": 0.1517692655324936, "learning_rate": 1.6168892126123665e-05, "loss": 0.5344, "step": 3885 }, { "epoch": 0.9835484687420906, "grad_norm": 0.14472903311252594, "learning_rate": 1.61670074965376e-05, "loss": 0.5075, "step": 3886 }, { "epoch": 0.9838015692229816, "grad_norm": 0.14296984672546387, "learning_rate": 1.616512251341008e-05, "loss": 0.5123, "step": 3887 }, { "epoch": 0.9840546697038725, "grad_norm": 0.14558005332946777, "learning_rate": 1.616323717684916e-05, "loss": 0.4849, "step": 3888 }, { "epoch": 0.9843077701847633, "grad_norm": 0.15538431704044342, "learning_rate": 1.6161351486962928e-05, "loss": 0.541, "step": 3889 }, { "epoch": 0.9845608706656542, "grad_norm": 0.15170827507972717, "learning_rate": 1.6159465443859482e-05, "loss": 0.5256, "step": 3890 }, { "epoch": 0.9848139711465452, "grad_norm": 0.14378522336483002, "learning_rate": 1.615757904764695e-05, "loss": 0.5343, "step": 3891 }, { "epoch": 0.9850670716274361, "grad_norm": 0.14715576171875, "learning_rate": 1.6155692298433475e-05, "loss": 0.528, "step": 3892 }, { "epoch": 0.985320172108327, "grad_norm": 0.1473788172006607, "learning_rate": 1.6153805196327216e-05, "loss": 0.532, "step": 3893 }, { "epoch": 0.9855732725892179, "grad_norm": 0.15451231598854065, "learning_rate": 1.615191774143636e-05, "loss": 0.5179, "step": 3894 }, { "epoch": 0.9858263730701088, "grad_norm": 0.14568965137004852, "learning_rate": 1.6150029933869107e-05, "loss": 0.5334, "step": 3895 }, { "epoch": 0.9860794735509998, "grad_norm": 0.15080852806568146, "learning_rate": 1.6148141773733685e-05, "loss": 0.5326, "step": 3896 }, { "epoch": 0.9863325740318907, "grad_norm": 0.14815910160541534, "learning_rate": 1.614625326113834e-05, "loss": 0.5167, "step": 3897 }, { "epoch": 0.9865856745127816, "grad_norm": 0.14249533414840698, "learning_rate": 1.614436439619133e-05, "loss": 0.4969, "step": 3898 }, { "epoch": 0.9868387749936725, "grad_norm": 0.1495925337076187, "learning_rate": 1.6142475179000943e-05, "loss": 0.5222, "step": 3899 }, { "epoch": 0.9870918754745635, "grad_norm": 0.14467386901378632, "learning_rate": 1.6140585609675486e-05, "loss": 0.5245, "step": 3900 }, { "epoch": 0.9873449759554543, "grad_norm": 0.15033195912837982, "learning_rate": 1.6138695688323278e-05, "loss": 0.5071, "step": 3901 }, { "epoch": 0.9875980764363452, "grad_norm": 0.14334291219711304, "learning_rate": 1.613680541505267e-05, "loss": 0.5415, "step": 3902 }, { "epoch": 0.9878511769172361, "grad_norm": 0.14398124814033508, "learning_rate": 1.6134914789972022e-05, "loss": 0.5135, "step": 3903 }, { "epoch": 0.988104277398127, "grad_norm": 0.15389041602611542, "learning_rate": 1.6133023813189728e-05, "loss": 0.5159, "step": 3904 }, { "epoch": 0.988357377879018, "grad_norm": 0.14809522032737732, "learning_rate": 1.6131132484814184e-05, "loss": 0.5142, "step": 3905 }, { "epoch": 0.9886104783599089, "grad_norm": 0.14882373809814453, "learning_rate": 1.6129240804953825e-05, "loss": 0.5294, "step": 3906 }, { "epoch": 0.9888635788407998, "grad_norm": 0.15086989104747772, "learning_rate": 1.6127348773717087e-05, "loss": 0.5456, "step": 3907 }, { "epoch": 0.9891166793216907, "grad_norm": 0.1454935073852539, "learning_rate": 1.6125456391212446e-05, "loss": 0.515, "step": 3908 }, { "epoch": 0.9893697798025817, "grad_norm": 0.14580386877059937, "learning_rate": 1.6123563657548382e-05, "loss": 0.4836, "step": 3909 }, { "epoch": 0.9896228802834726, "grad_norm": 0.1508018523454666, "learning_rate": 1.61216705728334e-05, "loss": 0.5305, "step": 3910 }, { "epoch": 0.9898759807643635, "grad_norm": 0.14905580878257751, "learning_rate": 1.6119777137176035e-05, "loss": 0.5172, "step": 3911 }, { "epoch": 0.9901290812452543, "grad_norm": 0.1432248204946518, "learning_rate": 1.6117883350684827e-05, "loss": 0.5242, "step": 3912 }, { "epoch": 0.9903821817261452, "grad_norm": 0.1491553783416748, "learning_rate": 1.6115989213468338e-05, "loss": 0.5177, "step": 3913 }, { "epoch": 0.9906352822070362, "grad_norm": 0.14771565794944763, "learning_rate": 1.6114094725635168e-05, "loss": 0.507, "step": 3914 }, { "epoch": 0.9908883826879271, "grad_norm": 0.15701542794704437, "learning_rate": 1.6112199887293916e-05, "loss": 0.5237, "step": 3915 }, { "epoch": 0.991141483168818, "grad_norm": 0.1535111665725708, "learning_rate": 1.6110304698553206e-05, "loss": 0.5408, "step": 3916 }, { "epoch": 0.9913945836497089, "grad_norm": 0.15653599798679352, "learning_rate": 1.6108409159521692e-05, "loss": 0.5303, "step": 3917 }, { "epoch": 0.9916476841305999, "grad_norm": 0.16198381781578064, "learning_rate": 1.610651327030804e-05, "loss": 0.5241, "step": 3918 }, { "epoch": 0.9919007846114908, "grad_norm": 0.14654488861560822, "learning_rate": 1.610461703102093e-05, "loss": 0.5249, "step": 3919 }, { "epoch": 0.9921538850923817, "grad_norm": 0.15079180896282196, "learning_rate": 1.6102720441769077e-05, "loss": 0.5479, "step": 3920 }, { "epoch": 0.9924069855732726, "grad_norm": 0.1462228149175644, "learning_rate": 1.610082350266121e-05, "loss": 0.5034, "step": 3921 }, { "epoch": 0.9926600860541634, "grad_norm": 0.1475261002779007, "learning_rate": 1.6098926213806068e-05, "loss": 0.5169, "step": 3922 }, { "epoch": 0.9929131865350544, "grad_norm": 0.1557222157716751, "learning_rate": 1.6097028575312427e-05, "loss": 0.501, "step": 3923 }, { "epoch": 0.9931662870159453, "grad_norm": 0.15464568138122559, "learning_rate": 1.609513058728907e-05, "loss": 0.5179, "step": 3924 }, { "epoch": 0.9934193874968362, "grad_norm": 0.14977793395519257, "learning_rate": 1.6093232249844807e-05, "loss": 0.5462, "step": 3925 }, { "epoch": 0.9936724879777271, "grad_norm": 0.14974834024906158, "learning_rate": 1.6091333563088462e-05, "loss": 0.5274, "step": 3926 }, { "epoch": 0.9939255884586181, "grad_norm": 0.15120790898799896, "learning_rate": 1.6089434527128886e-05, "loss": 0.5097, "step": 3927 }, { "epoch": 0.994178688939509, "grad_norm": 0.15376362204551697, "learning_rate": 1.6087535142074948e-05, "loss": 0.5162, "step": 3928 }, { "epoch": 0.9944317894203999, "grad_norm": 0.14804089069366455, "learning_rate": 1.608563540803553e-05, "loss": 0.5307, "step": 3929 }, { "epoch": 0.9946848899012908, "grad_norm": 0.1548173874616623, "learning_rate": 1.6083735325119545e-05, "loss": 0.5015, "step": 3930 }, { "epoch": 0.9949379903821818, "grad_norm": 0.16197916865348816, "learning_rate": 1.6081834893435918e-05, "loss": 0.5291, "step": 3931 }, { "epoch": 0.9951910908630727, "grad_norm": 0.155508354306221, "learning_rate": 1.6079934113093598e-05, "loss": 0.5499, "step": 3932 }, { "epoch": 0.9954441913439636, "grad_norm": 0.14825887978076935, "learning_rate": 1.6078032984201553e-05, "loss": 0.5146, "step": 3933 }, { "epoch": 0.9956972918248544, "grad_norm": 0.15199661254882812, "learning_rate": 1.607613150686877e-05, "loss": 0.5164, "step": 3934 }, { "epoch": 0.9959503923057453, "grad_norm": 0.1496342420578003, "learning_rate": 1.6074229681204254e-05, "loss": 0.5235, "step": 3935 }, { "epoch": 0.9962034927866363, "grad_norm": 0.15322662889957428, "learning_rate": 1.6072327507317037e-05, "loss": 0.536, "step": 3936 }, { "epoch": 0.9964565932675272, "grad_norm": 0.1493048220872879, "learning_rate": 1.6070424985316165e-05, "loss": 0.5078, "step": 3937 }, { "epoch": 0.9967096937484181, "grad_norm": 0.14688515663146973, "learning_rate": 1.6068522115310705e-05, "loss": 0.5198, "step": 3938 }, { "epoch": 0.996962794229309, "grad_norm": 0.14200732111930847, "learning_rate": 1.6066618897409746e-05, "loss": 0.5224, "step": 3939 }, { "epoch": 0.9972158947102, "grad_norm": 0.14864256978034973, "learning_rate": 1.6064715331722395e-05, "loss": 0.5359, "step": 3940 }, { "epoch": 0.9974689951910909, "grad_norm": 0.1501983106136322, "learning_rate": 1.6062811418357777e-05, "loss": 0.4984, "step": 3941 }, { "epoch": 0.9977220956719818, "grad_norm": 0.15095578134059906, "learning_rate": 1.6060907157425044e-05, "loss": 0.5167, "step": 3942 }, { "epoch": 0.9979751961528727, "grad_norm": 0.1415361911058426, "learning_rate": 1.6059002549033355e-05, "loss": 0.5057, "step": 3943 }, { "epoch": 0.9982282966337636, "grad_norm": 0.14795713126659393, "learning_rate": 1.605709759329191e-05, "loss": 0.5122, "step": 3944 }, { "epoch": 0.9984813971146546, "grad_norm": 0.162861630320549, "learning_rate": 1.6055192290309904e-05, "loss": 0.5262, "step": 3945 }, { "epoch": 0.9987344975955454, "grad_norm": 0.16867463290691376, "learning_rate": 1.6053286640196567e-05, "loss": 0.5071, "step": 3946 }, { "epoch": 0.9989875980764363, "grad_norm": 0.14524881541728973, "learning_rate": 1.605138064306115e-05, "loss": 0.4979, "step": 3947 }, { "epoch": 0.9992406985573272, "grad_norm": 0.1482476443052292, "learning_rate": 1.6049474299012915e-05, "loss": 0.5309, "step": 3948 }, { "epoch": 0.9994937990382182, "grad_norm": 0.15403158962726593, "learning_rate": 1.6047567608161154e-05, "loss": 0.5286, "step": 3949 }, { "epoch": 0.9997468995191091, "grad_norm": 0.14373505115509033, "learning_rate": 1.6045660570615168e-05, "loss": 0.5067, "step": 3950 }, { "epoch": 1.0, "grad_norm": 0.1479228436946869, "learning_rate": 1.6043753186484287e-05, "loss": 0.5169, "step": 3951 }, { "epoch": 1.0, "eval_loss": 0.802573561668396, "eval_runtime": 1052.193, "eval_samples_per_second": 40.41, "eval_steps_per_second": 0.632, "step": 3951 } ], "logging_steps": 1, "max_steps": 13221, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.030110469860717e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }