{ "best_global_step": 9356, "best_metric": 0.9798825256975033, "best_model_checkpoint": "runs/de_sapbert/checkpoint-9356", "epoch": 2.0, "eval_steps": 500, "global_step": 9356, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010688328345446773, "grad_norm": 683.587890625, "learning_rate": 8.547008547008549e-08, "loss": 50.3236, "step": 5 }, { "epoch": 0.0021376656690893546, "grad_norm": 546.2178955078125, "learning_rate": 1.9230769230769234e-07, "loss": 59.3515, "step": 10 }, { "epoch": 0.0032064985036340315, "grad_norm": 602.040771484375, "learning_rate": 2.991452991452992e-07, "loss": 62.6096, "step": 15 }, { "epoch": 0.004275331338178709, "grad_norm": 697.5074462890625, "learning_rate": 4.05982905982906e-07, "loss": 71.224, "step": 20 }, { "epoch": 0.005344164172723386, "grad_norm": 629.8714599609375, "learning_rate": 5.128205128205128e-07, "loss": 58.249, "step": 25 }, { "epoch": 0.006412997007268063, "grad_norm": 850.3056640625, "learning_rate": 6.196581196581197e-07, "loss": 53.1647, "step": 30 }, { "epoch": 0.007481829841812741, "grad_norm": 535.2101440429688, "learning_rate": 7.264957264957266e-07, "loss": 74.0864, "step": 35 }, { "epoch": 0.008550662676357419, "grad_norm": 877.2177734375, "learning_rate": 8.333333333333333e-07, "loss": 68.1396, "step": 40 }, { "epoch": 0.009619495510902095, "grad_norm": 827.8037109375, "learning_rate": 9.401709401709402e-07, "loss": 67.5764, "step": 45 }, { "epoch": 0.010688328345446772, "grad_norm": 738.37158203125, "learning_rate": 1.047008547008547e-06, "loss": 59.9784, "step": 50 }, { "epoch": 0.01175716117999145, "grad_norm": 698.6246337890625, "learning_rate": 1.153846153846154e-06, "loss": 51.3131, "step": 55 }, { "epoch": 0.012825994014536126, "grad_norm": 683.5339965820312, "learning_rate": 1.2606837606837608e-06, "loss": 68.0457, "step": 60 }, { "epoch": 0.013894826849080803, "grad_norm": 766.8721923828125, "learning_rate": 1.3675213675213678e-06, "loss": 58.9018, "step": 65 }, { "epoch": 0.014963659683625482, "grad_norm": 694.8308715820312, "learning_rate": 1.4743589743589745e-06, "loss": 59.4911, "step": 70 }, { "epoch": 0.01603249251817016, "grad_norm": 585.236572265625, "learning_rate": 1.5811965811965813e-06, "loss": 58.0199, "step": 75 }, { "epoch": 0.017101325352714837, "grad_norm": 778.44287109375, "learning_rate": 1.6880341880341883e-06, "loss": 63.7679, "step": 80 }, { "epoch": 0.018170158187259512, "grad_norm": 646.8430786132812, "learning_rate": 1.794871794871795e-06, "loss": 60.7314, "step": 85 }, { "epoch": 0.01923899102180419, "grad_norm": 989.9755249023438, "learning_rate": 1.9017094017094018e-06, "loss": 77.8096, "step": 90 }, { "epoch": 0.020307823856348866, "grad_norm": 834.5411987304688, "learning_rate": 2.008547008547009e-06, "loss": 66.2088, "step": 95 }, { "epoch": 0.021376656690893545, "grad_norm": 643.298095703125, "learning_rate": 2.1153846153846155e-06, "loss": 65.6054, "step": 100 }, { "epoch": 0.02244548952543822, "grad_norm": 1178.525146484375, "learning_rate": 2.222222222222222e-06, "loss": 80.6449, "step": 105 }, { "epoch": 0.0235143223599829, "grad_norm": 697.0106201171875, "learning_rate": 2.3290598290598295e-06, "loss": 69.9855, "step": 110 }, { "epoch": 0.024583155194527577, "grad_norm": 783.412841796875, "learning_rate": 2.435897435897436e-06, "loss": 56.6307, "step": 115 }, { "epoch": 0.025651988029072252, "grad_norm": 692.8482666015625, "learning_rate": 2.542735042735043e-06, "loss": 46.5356, "step": 120 }, { "epoch": 0.02672082086361693, "grad_norm": 731.8605346679688, "learning_rate": 2.64957264957265e-06, "loss": 57.7131, "step": 125 }, { "epoch": 0.027789653698161606, "grad_norm": 710.8729858398438, "learning_rate": 2.756410256410257e-06, "loss": 49.0737, "step": 130 }, { "epoch": 0.028858486532706284, "grad_norm": 608.5044555664062, "learning_rate": 2.8632478632478635e-06, "loss": 50.7495, "step": 135 }, { "epoch": 0.029927319367250963, "grad_norm": 723.5994873046875, "learning_rate": 2.9700854700854705e-06, "loss": 57.1029, "step": 140 }, { "epoch": 0.030996152201795638, "grad_norm": 697.4583740234375, "learning_rate": 3.0769230769230774e-06, "loss": 45.4298, "step": 145 }, { "epoch": 0.03206498503634032, "grad_norm": 567.723388671875, "learning_rate": 3.183760683760684e-06, "loss": 57.8409, "step": 150 }, { "epoch": 0.03313381787088499, "grad_norm": 688.6309814453125, "learning_rate": 3.290598290598291e-06, "loss": 68.8388, "step": 155 }, { "epoch": 0.034202650705429674, "grad_norm": 628.3114624023438, "learning_rate": 3.397435897435898e-06, "loss": 64.5809, "step": 160 }, { "epoch": 0.03527148353997435, "grad_norm": 556.7693481445312, "learning_rate": 3.5042735042735045e-06, "loss": 54.5407, "step": 165 }, { "epoch": 0.036340316374519024, "grad_norm": 494.6075744628906, "learning_rate": 3.6111111111111115e-06, "loss": 48.341, "step": 170 }, { "epoch": 0.0374091492090637, "grad_norm": 554.0121459960938, "learning_rate": 3.7179487179487184e-06, "loss": 44.3806, "step": 175 }, { "epoch": 0.03847798204360838, "grad_norm": 758.5612182617188, "learning_rate": 3.8247863247863246e-06, "loss": 59.123, "step": 180 }, { "epoch": 0.03954681487815306, "grad_norm": 716.5413208007812, "learning_rate": 3.9316239316239315e-06, "loss": 59.4863, "step": 185 }, { "epoch": 0.04061564771269773, "grad_norm": 503.7677307128906, "learning_rate": 4.0384615384615385e-06, "loss": 65.4498, "step": 190 }, { "epoch": 0.041684480547242414, "grad_norm": 613.170654296875, "learning_rate": 4.145299145299146e-06, "loss": 52.9526, "step": 195 }, { "epoch": 0.04275331338178709, "grad_norm": 725.29833984375, "learning_rate": 4.2521367521367524e-06, "loss": 46.6744, "step": 200 }, { "epoch": 0.043822146216331764, "grad_norm": 525.0426635742188, "learning_rate": 4.358974358974359e-06, "loss": 37.1728, "step": 205 }, { "epoch": 0.04489097905087644, "grad_norm": 627.0368041992188, "learning_rate": 4.465811965811966e-06, "loss": 63.3973, "step": 210 }, { "epoch": 0.04595981188542112, "grad_norm": 501.9342956542969, "learning_rate": 4.5726495726495725e-06, "loss": 51.1136, "step": 215 }, { "epoch": 0.0470286447199658, "grad_norm": 535.6387329101562, "learning_rate": 4.6794871794871795e-06, "loss": 40.9712, "step": 220 }, { "epoch": 0.04809747755451047, "grad_norm": 492.3857421875, "learning_rate": 4.786324786324787e-06, "loss": 46.4765, "step": 225 }, { "epoch": 0.049166310389055154, "grad_norm": 579.775390625, "learning_rate": 4.8931623931623934e-06, "loss": 47.3894, "step": 230 }, { "epoch": 0.05023514322359983, "grad_norm": 530.4070434570312, "learning_rate": 5e-06, "loss": 36.866, "step": 235 }, { "epoch": 0.051303976058144504, "grad_norm": 476.2954406738281, "learning_rate": 5.1068376068376065e-06, "loss": 32.3704, "step": 240 }, { "epoch": 0.052372808892689186, "grad_norm": 412.430419921875, "learning_rate": 5.213675213675214e-06, "loss": 36.0298, "step": 245 }, { "epoch": 0.05344164172723386, "grad_norm": 467.79412841796875, "learning_rate": 5.320512820512821e-06, "loss": 43.7503, "step": 250 }, { "epoch": 0.05451047456177854, "grad_norm": 395.0292663574219, "learning_rate": 5.4273504273504275e-06, "loss": 33.9929, "step": 255 }, { "epoch": 0.05557930739632321, "grad_norm": 543.4691162109375, "learning_rate": 5.534188034188035e-06, "loss": 38.4924, "step": 260 }, { "epoch": 0.056648140230867894, "grad_norm": 446.6466369628906, "learning_rate": 5.641025641025641e-06, "loss": 30.8329, "step": 265 }, { "epoch": 0.05771697306541257, "grad_norm": 409.0653381347656, "learning_rate": 5.7478632478632475e-06, "loss": 29.0382, "step": 270 }, { "epoch": 0.058785805899957244, "grad_norm": 397.3152770996094, "learning_rate": 5.854700854700855e-06, "loss": 25.2408, "step": 275 }, { "epoch": 0.059854638734501926, "grad_norm": 324.9696350097656, "learning_rate": 5.961538461538462e-06, "loss": 30.9856, "step": 280 }, { "epoch": 0.0609234715690466, "grad_norm": 439.66241455078125, "learning_rate": 6.0683760683760684e-06, "loss": 25.987, "step": 285 }, { "epoch": 0.061992304403591277, "grad_norm": 299.0304260253906, "learning_rate": 6.175213675213676e-06, "loss": 24.4493, "step": 290 }, { "epoch": 0.06306113723813596, "grad_norm": 324.8219909667969, "learning_rate": 6.282051282051282e-06, "loss": 25.6893, "step": 295 }, { "epoch": 0.06412997007268063, "grad_norm": 419.6073303222656, "learning_rate": 6.3888888888888885e-06, "loss": 29.4356, "step": 300 }, { "epoch": 0.06519880290722531, "grad_norm": 299.56048583984375, "learning_rate": 6.495726495726496e-06, "loss": 22.2, "step": 305 }, { "epoch": 0.06626763574176998, "grad_norm": 303.1939697265625, "learning_rate": 6.602564102564103e-06, "loss": 24.3048, "step": 310 }, { "epoch": 0.06733646857631466, "grad_norm": 203.7785186767578, "learning_rate": 6.7094017094017094e-06, "loss": 18.5714, "step": 315 }, { "epoch": 0.06840530141085935, "grad_norm": 292.95050048828125, "learning_rate": 6.816239316239317e-06, "loss": 17.4822, "step": 320 }, { "epoch": 0.06947413424540402, "grad_norm": 145.5703125, "learning_rate": 6.923076923076923e-06, "loss": 15.8966, "step": 325 }, { "epoch": 0.0705429670799487, "grad_norm": 261.8589782714844, "learning_rate": 7.02991452991453e-06, "loss": 14.8771, "step": 330 }, { "epoch": 0.07161179991449337, "grad_norm": 215.4441375732422, "learning_rate": 7.136752136752137e-06, "loss": 16.0905, "step": 335 }, { "epoch": 0.07268063274903805, "grad_norm": 206.23388671875, "learning_rate": 7.243589743589744e-06, "loss": 11.9737, "step": 340 }, { "epoch": 0.07374946558358272, "grad_norm": 200.727294921875, "learning_rate": 7.350427350427351e-06, "loss": 12.1407, "step": 345 }, { "epoch": 0.0748182984181274, "grad_norm": 183.34083557128906, "learning_rate": 7.457264957264958e-06, "loss": 11.5492, "step": 350 }, { "epoch": 0.07588713125267209, "grad_norm": 155.31253051757812, "learning_rate": 7.564102564102564e-06, "loss": 13.0664, "step": 355 }, { "epoch": 0.07695596408721676, "grad_norm": 141.0535125732422, "learning_rate": 7.670940170940172e-06, "loss": 10.0428, "step": 360 }, { "epoch": 0.07802479692176144, "grad_norm": 164.8147430419922, "learning_rate": 7.77777777777778e-06, "loss": 9.2962, "step": 365 }, { "epoch": 0.07909362975630611, "grad_norm": 113.84266662597656, "learning_rate": 7.884615384615384e-06, "loss": 8.6304, "step": 370 }, { "epoch": 0.08016246259085079, "grad_norm": 90.67302703857422, "learning_rate": 7.991452991452993e-06, "loss": 5.7954, "step": 375 }, { "epoch": 0.08123129542539546, "grad_norm": 77.02782440185547, "learning_rate": 8.098290598290598e-06, "loss": 6.0213, "step": 380 }, { "epoch": 0.08230012825994014, "grad_norm": 77.24604797363281, "learning_rate": 8.205128205128205e-06, "loss": 6.8873, "step": 385 }, { "epoch": 0.08336896109448483, "grad_norm": 78.4577407836914, "learning_rate": 8.311965811965812e-06, "loss": 5.6347, "step": 390 }, { "epoch": 0.0844377939290295, "grad_norm": 70.10798645019531, "learning_rate": 8.41880341880342e-06, "loss": 5.7346, "step": 395 }, { "epoch": 0.08550662676357418, "grad_norm": 53.02711486816406, "learning_rate": 8.525641025641026e-06, "loss": 4.2817, "step": 400 }, { "epoch": 0.08657545959811885, "grad_norm": 58.17922592163086, "learning_rate": 8.632478632478633e-06, "loss": 3.9817, "step": 405 }, { "epoch": 0.08764429243266353, "grad_norm": 47.072662353515625, "learning_rate": 8.73931623931624e-06, "loss": 3.1871, "step": 410 }, { "epoch": 0.0887131252672082, "grad_norm": 35.73280715942383, "learning_rate": 8.846153846153847e-06, "loss": 3.2088, "step": 415 }, { "epoch": 0.08978195810175288, "grad_norm": 40.767581939697266, "learning_rate": 8.952991452991454e-06, "loss": 3.5216, "step": 420 }, { "epoch": 0.09085079093629757, "grad_norm": 32.53750991821289, "learning_rate": 9.059829059829061e-06, "loss": 2.3657, "step": 425 }, { "epoch": 0.09191962377084224, "grad_norm": 31.6849422454834, "learning_rate": 9.166666666666666e-06, "loss": 2.3054, "step": 430 }, { "epoch": 0.09298845660538692, "grad_norm": 29.081796646118164, "learning_rate": 9.273504273504275e-06, "loss": 2.174, "step": 435 }, { "epoch": 0.0940572894399316, "grad_norm": 34.65196990966797, "learning_rate": 9.38034188034188e-06, "loss": 2.4017, "step": 440 }, { "epoch": 0.09512612227447627, "grad_norm": 19.63212776184082, "learning_rate": 9.487179487179487e-06, "loss": 2.1189, "step": 445 }, { "epoch": 0.09619495510902094, "grad_norm": 24.055822372436523, "learning_rate": 9.594017094017094e-06, "loss": 2.3965, "step": 450 }, { "epoch": 0.09726378794356563, "grad_norm": 18.823020935058594, "learning_rate": 9.700854700854701e-06, "loss": 1.7638, "step": 455 }, { "epoch": 0.09833262077811031, "grad_norm": 16.97188949584961, "learning_rate": 9.807692307692308e-06, "loss": 1.4081, "step": 460 }, { "epoch": 0.09940145361265498, "grad_norm": 18.629823684692383, "learning_rate": 9.914529914529915e-06, "loss": 1.5501, "step": 465 }, { "epoch": 0.10047028644719966, "grad_norm": 16.413358688354492, "learning_rate": 1.0021367521367522e-05, "loss": 1.4015, "step": 470 }, { "epoch": 0.10153911928174433, "grad_norm": 14.555413246154785, "learning_rate": 1.012820512820513e-05, "loss": 1.3726, "step": 475 }, { "epoch": 0.10260795211628901, "grad_norm": 17.65382194519043, "learning_rate": 1.0235042735042734e-05, "loss": 1.1044, "step": 480 }, { "epoch": 0.10367678495083368, "grad_norm": 13.994580268859863, "learning_rate": 1.0341880341880343e-05, "loss": 1.1651, "step": 485 }, { "epoch": 0.10474561778537837, "grad_norm": 13.052831649780273, "learning_rate": 1.044871794871795e-05, "loss": 1.1674, "step": 490 }, { "epoch": 0.10581445061992305, "grad_norm": 16.172752380371094, "learning_rate": 1.0555555555555557e-05, "loss": 1.2274, "step": 495 }, { "epoch": 0.10688328345446772, "grad_norm": 12.005922317504883, "learning_rate": 1.0662393162393162e-05, "loss": 1.0606, "step": 500 }, { "epoch": 0.1079521162890124, "grad_norm": 13.400876998901367, "learning_rate": 1.076923076923077e-05, "loss": 1.2207, "step": 505 }, { "epoch": 0.10902094912355707, "grad_norm": 14.658180236816406, "learning_rate": 1.0876068376068376e-05, "loss": 1.12, "step": 510 }, { "epoch": 0.11008978195810175, "grad_norm": 11.943291664123535, "learning_rate": 1.0982905982905985e-05, "loss": 0.9925, "step": 515 }, { "epoch": 0.11115861479264642, "grad_norm": 10.507229804992676, "learning_rate": 1.1089743589743592e-05, "loss": 0.9542, "step": 520 }, { "epoch": 0.11222744762719111, "grad_norm": 12.802043914794922, "learning_rate": 1.1196581196581197e-05, "loss": 1.1911, "step": 525 }, { "epoch": 0.11329628046173579, "grad_norm": 10.767659187316895, "learning_rate": 1.1303418803418804e-05, "loss": 1.2418, "step": 530 }, { "epoch": 0.11436511329628046, "grad_norm": 12.087440490722656, "learning_rate": 1.1410256410256411e-05, "loss": 1.0926, "step": 535 }, { "epoch": 0.11543394613082514, "grad_norm": 9.390274047851562, "learning_rate": 1.1517094017094016e-05, "loss": 0.9052, "step": 540 }, { "epoch": 0.11650277896536981, "grad_norm": 10.022379875183105, "learning_rate": 1.1623931623931625e-05, "loss": 0.9725, "step": 545 }, { "epoch": 0.11757161179991449, "grad_norm": 12.384991645812988, "learning_rate": 1.1730769230769232e-05, "loss": 1.0631, "step": 550 }, { "epoch": 0.11864044463445918, "grad_norm": 10.217049598693848, "learning_rate": 1.1837606837606839e-05, "loss": 1.0069, "step": 555 }, { "epoch": 0.11970927746900385, "grad_norm": 9.567984580993652, "learning_rate": 1.1944444444444444e-05, "loss": 0.9301, "step": 560 }, { "epoch": 0.12077811030354853, "grad_norm": 7.623697280883789, "learning_rate": 1.2051282051282051e-05, "loss": 0.7413, "step": 565 }, { "epoch": 0.1218469431380932, "grad_norm": 12.299339294433594, "learning_rate": 1.2158119658119658e-05, "loss": 1.0359, "step": 570 }, { "epoch": 0.12291577597263788, "grad_norm": 10.420650482177734, "learning_rate": 1.2264957264957267e-05, "loss": 0.9793, "step": 575 }, { "epoch": 0.12398460880718255, "grad_norm": 9.585742950439453, "learning_rate": 1.2371794871794874e-05, "loss": 0.7139, "step": 580 }, { "epoch": 0.12505344164172724, "grad_norm": 8.004850387573242, "learning_rate": 1.247863247863248e-05, "loss": 1.0595, "step": 585 }, { "epoch": 0.12612227447627192, "grad_norm": 9.993664741516113, "learning_rate": 1.2585470085470086e-05, "loss": 0.8869, "step": 590 }, { "epoch": 0.1271911073108166, "grad_norm": 9.177787780761719, "learning_rate": 1.2692307692307693e-05, "loss": 0.8318, "step": 595 }, { "epoch": 0.12825994014536127, "grad_norm": 10.756118774414062, "learning_rate": 1.2799145299145298e-05, "loss": 0.7328, "step": 600 }, { "epoch": 0.12932877297990594, "grad_norm": 9.876399040222168, "learning_rate": 1.2905982905982907e-05, "loss": 0.7785, "step": 605 }, { "epoch": 0.13039760581445062, "grad_norm": 9.022135734558105, "learning_rate": 1.3012820512820514e-05, "loss": 0.8507, "step": 610 }, { "epoch": 0.1314664386489953, "grad_norm": 9.838420867919922, "learning_rate": 1.3119658119658121e-05, "loss": 1.0039, "step": 615 }, { "epoch": 0.13253527148353997, "grad_norm": 11.163064002990723, "learning_rate": 1.3226495726495728e-05, "loss": 0.7514, "step": 620 }, { "epoch": 0.13360410431808464, "grad_norm": 7.968421459197998, "learning_rate": 1.3333333333333333e-05, "loss": 0.6666, "step": 625 }, { "epoch": 0.13467293715262932, "grad_norm": 9.873268127441406, "learning_rate": 1.3440170940170942e-05, "loss": 0.6639, "step": 630 }, { "epoch": 0.135741769987174, "grad_norm": 7.193119049072266, "learning_rate": 1.3547008547008549e-05, "loss": 0.77, "step": 635 }, { "epoch": 0.1368106028217187, "grad_norm": 9.957530975341797, "learning_rate": 1.3653846153846156e-05, "loss": 0.8079, "step": 640 }, { "epoch": 0.13787943565626337, "grad_norm": 9.682191848754883, "learning_rate": 1.3760683760683761e-05, "loss": 0.8061, "step": 645 }, { "epoch": 0.13894826849080805, "grad_norm": 7.611622333526611, "learning_rate": 1.3867521367521368e-05, "loss": 0.7666, "step": 650 }, { "epoch": 0.14001710132535272, "grad_norm": 9.452914237976074, "learning_rate": 1.3974358974358975e-05, "loss": 0.7835, "step": 655 }, { "epoch": 0.1410859341598974, "grad_norm": 7.584820747375488, "learning_rate": 1.4081196581196584e-05, "loss": 0.7319, "step": 660 }, { "epoch": 0.14215476699444207, "grad_norm": 7.296449661254883, "learning_rate": 1.4188034188034189e-05, "loss": 0.5945, "step": 665 }, { "epoch": 0.14322359982898675, "grad_norm": 8.2069730758667, "learning_rate": 1.4294871794871796e-05, "loss": 0.7008, "step": 670 }, { "epoch": 0.14429243266353142, "grad_norm": 8.537630081176758, "learning_rate": 1.4401709401709403e-05, "loss": 0.9728, "step": 675 }, { "epoch": 0.1453612654980761, "grad_norm": 8.83273696899414, "learning_rate": 1.450854700854701e-05, "loss": 0.7346, "step": 680 }, { "epoch": 0.14643009833262077, "grad_norm": 9.676769256591797, "learning_rate": 1.4615384615384615e-05, "loss": 0.8463, "step": 685 }, { "epoch": 0.14749893116716545, "grad_norm": 8.45507526397705, "learning_rate": 1.4722222222222224e-05, "loss": 0.7251, "step": 690 }, { "epoch": 0.14856776400171012, "grad_norm": 9.440534591674805, "learning_rate": 1.4829059829059831e-05, "loss": 0.7444, "step": 695 }, { "epoch": 0.1496365968362548, "grad_norm": 7.491715431213379, "learning_rate": 1.4935897435897438e-05, "loss": 0.6496, "step": 700 }, { "epoch": 0.15070542967079947, "grad_norm": 6.313747406005859, "learning_rate": 1.5042735042735043e-05, "loss": 0.7623, "step": 705 }, { "epoch": 0.15177426250534418, "grad_norm": 7.4156060218811035, "learning_rate": 1.514957264957265e-05, "loss": 0.8285, "step": 710 }, { "epoch": 0.15284309533988885, "grad_norm": 8.984630584716797, "learning_rate": 1.5256410256410257e-05, "loss": 0.886, "step": 715 }, { "epoch": 0.15391192817443353, "grad_norm": 7.339222431182861, "learning_rate": 1.5363247863247866e-05, "loss": 0.7274, "step": 720 }, { "epoch": 0.1549807610089782, "grad_norm": 7.579738140106201, "learning_rate": 1.5470085470085473e-05, "loss": 0.8669, "step": 725 }, { "epoch": 0.15604959384352288, "grad_norm": 8.190738677978516, "learning_rate": 1.557692307692308e-05, "loss": 0.6741, "step": 730 }, { "epoch": 0.15711842667806755, "grad_norm": 7.186857223510742, "learning_rate": 1.5683760683760683e-05, "loss": 0.7396, "step": 735 }, { "epoch": 0.15818725951261223, "grad_norm": 6.874704360961914, "learning_rate": 1.579059829059829e-05, "loss": 0.7197, "step": 740 }, { "epoch": 0.1592560923471569, "grad_norm": 9.56429386138916, "learning_rate": 1.5897435897435897e-05, "loss": 0.9099, "step": 745 }, { "epoch": 0.16032492518170158, "grad_norm": 7.520778179168701, "learning_rate": 1.6004273504273508e-05, "loss": 0.7344, "step": 750 }, { "epoch": 0.16139375801624625, "grad_norm": 5.668506622314453, "learning_rate": 1.6111111111111115e-05, "loss": 0.6107, "step": 755 }, { "epoch": 0.16246259085079093, "grad_norm": 9.869526863098145, "learning_rate": 1.6217948717948718e-05, "loss": 0.6929, "step": 760 }, { "epoch": 0.1635314236853356, "grad_norm": 8.029936790466309, "learning_rate": 1.6324786324786325e-05, "loss": 0.7019, "step": 765 }, { "epoch": 0.16460025651988028, "grad_norm": 8.148101806640625, "learning_rate": 1.6431623931623932e-05, "loss": 0.5759, "step": 770 }, { "epoch": 0.16566908935442498, "grad_norm": 7.96873664855957, "learning_rate": 1.653846153846154e-05, "loss": 0.5736, "step": 775 }, { "epoch": 0.16673792218896966, "grad_norm": 9.016830444335938, "learning_rate": 1.6645299145299146e-05, "loss": 0.5878, "step": 780 }, { "epoch": 0.16780675502351433, "grad_norm": 7.439241886138916, "learning_rate": 1.6752136752136753e-05, "loss": 0.7349, "step": 785 }, { "epoch": 0.168875587858059, "grad_norm": 9.404788970947266, "learning_rate": 1.685897435897436e-05, "loss": 0.7691, "step": 790 }, { "epoch": 0.16994442069260368, "grad_norm": 6.773132801055908, "learning_rate": 1.6965811965811967e-05, "loss": 0.6228, "step": 795 }, { "epoch": 0.17101325352714836, "grad_norm": 6.86265754699707, "learning_rate": 1.7072649572649574e-05, "loss": 0.6394, "step": 800 }, { "epoch": 0.17208208636169303, "grad_norm": 6.647765159606934, "learning_rate": 1.717948717948718e-05, "loss": 0.624, "step": 805 }, { "epoch": 0.1731509191962377, "grad_norm": 6.882334232330322, "learning_rate": 1.7286324786324788e-05, "loss": 0.6487, "step": 810 }, { "epoch": 0.17421975203078238, "grad_norm": 8.620728492736816, "learning_rate": 1.7393162393162395e-05, "loss": 0.6001, "step": 815 }, { "epoch": 0.17528858486532706, "grad_norm": 7.544363021850586, "learning_rate": 1.7500000000000002e-05, "loss": 0.6685, "step": 820 }, { "epoch": 0.17635741769987173, "grad_norm": 8.941640853881836, "learning_rate": 1.760683760683761e-05, "loss": 0.8176, "step": 825 }, { "epoch": 0.1774262505344164, "grad_norm": 6.829235553741455, "learning_rate": 1.7713675213675216e-05, "loss": 0.6417, "step": 830 }, { "epoch": 0.17849508336896108, "grad_norm": 7.0878705978393555, "learning_rate": 1.7820512820512823e-05, "loss": 0.6353, "step": 835 }, { "epoch": 0.17956391620350576, "grad_norm": 9.062278747558594, "learning_rate": 1.792735042735043e-05, "loss": 0.5946, "step": 840 }, { "epoch": 0.18063274903805046, "grad_norm": 7.967255115509033, "learning_rate": 1.8034188034188037e-05, "loss": 0.6122, "step": 845 }, { "epoch": 0.18170158187259514, "grad_norm": 7.076515197753906, "learning_rate": 1.8141025641025644e-05, "loss": 0.6499, "step": 850 }, { "epoch": 0.1827704147071398, "grad_norm": 7.658061981201172, "learning_rate": 1.8247863247863247e-05, "loss": 0.601, "step": 855 }, { "epoch": 0.1838392475416845, "grad_norm": 7.605343341827393, "learning_rate": 1.8354700854700854e-05, "loss": 0.5404, "step": 860 }, { "epoch": 0.18490808037622916, "grad_norm": 6.7278900146484375, "learning_rate": 1.8461538461538465e-05, "loss": 0.6574, "step": 865 }, { "epoch": 0.18597691321077384, "grad_norm": 6.190138339996338, "learning_rate": 1.856837606837607e-05, "loss": 0.5377, "step": 870 }, { "epoch": 0.1870457460453185, "grad_norm": 5.700743198394775, "learning_rate": 1.867521367521368e-05, "loss": 0.5537, "step": 875 }, { "epoch": 0.1881145788798632, "grad_norm": 9.452567100524902, "learning_rate": 1.8782051282051282e-05, "loss": 0.7377, "step": 880 }, { "epoch": 0.18918341171440786, "grad_norm": 8.572616577148438, "learning_rate": 1.888888888888889e-05, "loss": 0.8466, "step": 885 }, { "epoch": 0.19025224454895254, "grad_norm": 6.903915882110596, "learning_rate": 1.8995726495726496e-05, "loss": 0.4808, "step": 890 }, { "epoch": 0.1913210773834972, "grad_norm": 6.828094959259033, "learning_rate": 1.9102564102564106e-05, "loss": 0.5482, "step": 895 }, { "epoch": 0.1923899102180419, "grad_norm": 9.153865814208984, "learning_rate": 1.920940170940171e-05, "loss": 0.6588, "step": 900 }, { "epoch": 0.19345874305258656, "grad_norm": 8.290953636169434, "learning_rate": 1.9316239316239317e-05, "loss": 0.6953, "step": 905 }, { "epoch": 0.19452757588713127, "grad_norm": 6.111261367797852, "learning_rate": 1.9423076923076924e-05, "loss": 0.4803, "step": 910 }, { "epoch": 0.19559640872167594, "grad_norm": 8.402656555175781, "learning_rate": 1.952991452991453e-05, "loss": 0.6869, "step": 915 }, { "epoch": 0.19666524155622062, "grad_norm": 5.929531574249268, "learning_rate": 1.9636752136752138e-05, "loss": 0.5393, "step": 920 }, { "epoch": 0.1977340743907653, "grad_norm": 7.195873260498047, "learning_rate": 1.9743589743589745e-05, "loss": 0.4829, "step": 925 }, { "epoch": 0.19880290722530997, "grad_norm": 8.35781192779541, "learning_rate": 1.9850427350427352e-05, "loss": 0.5071, "step": 930 }, { "epoch": 0.19987174005985464, "grad_norm": 7.967381954193115, "learning_rate": 1.995726495726496e-05, "loss": 0.6828, "step": 935 }, { "epoch": 0.20094057289439932, "grad_norm": 7.339260101318359, "learning_rate": 1.999287410926366e-05, "loss": 0.6468, "step": 940 }, { "epoch": 0.202009405728944, "grad_norm": 6.559432029724121, "learning_rate": 1.9980997624703088e-05, "loss": 0.6169, "step": 945 }, { "epoch": 0.20307823856348867, "grad_norm": 6.627939701080322, "learning_rate": 1.996912114014252e-05, "loss": 0.5389, "step": 950 }, { "epoch": 0.20414707139803334, "grad_norm": 6.5172119140625, "learning_rate": 1.995724465558195e-05, "loss": 0.4527, "step": 955 }, { "epoch": 0.20521590423257802, "grad_norm": 6.31046199798584, "learning_rate": 1.994536817102138e-05, "loss": 0.5405, "step": 960 }, { "epoch": 0.2062847370671227, "grad_norm": 6.0619425773620605, "learning_rate": 1.993349168646081e-05, "loss": 0.6583, "step": 965 }, { "epoch": 0.20735356990166737, "grad_norm": 7.9872941970825195, "learning_rate": 1.992161520190024e-05, "loss": 0.7037, "step": 970 }, { "epoch": 0.20842240273621207, "grad_norm": 6.310743808746338, "learning_rate": 1.9909738717339668e-05, "loss": 0.564, "step": 975 }, { "epoch": 0.20949123557075675, "grad_norm": 6.636473655700684, "learning_rate": 1.9897862232779098e-05, "loss": 0.4531, "step": 980 }, { "epoch": 0.21056006840530142, "grad_norm": 6.087254047393799, "learning_rate": 1.988598574821853e-05, "loss": 0.5454, "step": 985 }, { "epoch": 0.2116289012398461, "grad_norm": 6.705723762512207, "learning_rate": 1.987410926365796e-05, "loss": 0.4836, "step": 990 }, { "epoch": 0.21269773407439077, "grad_norm": 6.238287448883057, "learning_rate": 1.9862232779097387e-05, "loss": 0.5458, "step": 995 }, { "epoch": 0.21376656690893545, "grad_norm": 5.439404010772705, "learning_rate": 1.985035629453682e-05, "loss": 0.5202, "step": 1000 }, { "epoch": 0.21483539974348012, "grad_norm": 5.1525654792785645, "learning_rate": 1.9838479809976248e-05, "loss": 0.5532, "step": 1005 }, { "epoch": 0.2159042325780248, "grad_norm": 6.952949047088623, "learning_rate": 1.9826603325415678e-05, "loss": 0.63, "step": 1010 }, { "epoch": 0.21697306541256947, "grad_norm": 5.5152788162231445, "learning_rate": 1.981472684085511e-05, "loss": 0.6299, "step": 1015 }, { "epoch": 0.21804189824711415, "grad_norm": 7.090893745422363, "learning_rate": 1.980285035629454e-05, "loss": 0.5405, "step": 1020 }, { "epoch": 0.21911073108165882, "grad_norm": 5.758279323577881, "learning_rate": 1.979097387173397e-05, "loss": 0.4379, "step": 1025 }, { "epoch": 0.2201795639162035, "grad_norm": 10.006664276123047, "learning_rate": 1.9779097387173397e-05, "loss": 0.7446, "step": 1030 }, { "epoch": 0.22124839675074817, "grad_norm": 6.134273529052734, "learning_rate": 1.9767220902612828e-05, "loss": 0.6652, "step": 1035 }, { "epoch": 0.22231722958529285, "grad_norm": 7.395203113555908, "learning_rate": 1.9755344418052258e-05, "loss": 0.6873, "step": 1040 }, { "epoch": 0.22338606241983755, "grad_norm": 4.713867664337158, "learning_rate": 1.974346793349169e-05, "loss": 0.4904, "step": 1045 }, { "epoch": 0.22445489525438223, "grad_norm": 6.1399359703063965, "learning_rate": 1.973159144893112e-05, "loss": 0.5763, "step": 1050 }, { "epoch": 0.2255237280889269, "grad_norm": 7.5219879150390625, "learning_rate": 1.971971496437055e-05, "loss": 0.5801, "step": 1055 }, { "epoch": 0.22659256092347158, "grad_norm": 5.3690619468688965, "learning_rate": 1.9707838479809977e-05, "loss": 0.518, "step": 1060 }, { "epoch": 0.22766139375801625, "grad_norm": 7.701142311096191, "learning_rate": 1.9695961995249407e-05, "loss": 0.5928, "step": 1065 }, { "epoch": 0.22873022659256093, "grad_norm": 5.431284427642822, "learning_rate": 1.9684085510688838e-05, "loss": 0.5575, "step": 1070 }, { "epoch": 0.2297990594271056, "grad_norm": 5.841889381408691, "learning_rate": 1.967220902612827e-05, "loss": 0.497, "step": 1075 }, { "epoch": 0.23086789226165028, "grad_norm": 6.84688663482666, "learning_rate": 1.9660332541567696e-05, "loss": 0.4607, "step": 1080 }, { "epoch": 0.23193672509619495, "grad_norm": 7.2094645500183105, "learning_rate": 1.964845605700713e-05, "loss": 0.5531, "step": 1085 }, { "epoch": 0.23300555793073963, "grad_norm": 8.189807891845703, "learning_rate": 1.9636579572446557e-05, "loss": 0.5372, "step": 1090 }, { "epoch": 0.2340743907652843, "grad_norm": 6.64928674697876, "learning_rate": 1.9624703087885987e-05, "loss": 0.4641, "step": 1095 }, { "epoch": 0.23514322359982898, "grad_norm": 5.907129764556885, "learning_rate": 1.9612826603325418e-05, "loss": 0.4724, "step": 1100 }, { "epoch": 0.23621205643437365, "grad_norm": 5.550957202911377, "learning_rate": 1.960095011876485e-05, "loss": 0.4422, "step": 1105 }, { "epoch": 0.23728088926891835, "grad_norm": 5.1877899169921875, "learning_rate": 1.9589073634204276e-05, "loss": 0.4571, "step": 1110 }, { "epoch": 0.23834972210346303, "grad_norm": 6.098719120025635, "learning_rate": 1.9577197149643706e-05, "loss": 0.5104, "step": 1115 }, { "epoch": 0.2394185549380077, "grad_norm": 5.2909770011901855, "learning_rate": 1.9565320665083137e-05, "loss": 0.5217, "step": 1120 }, { "epoch": 0.24048738777255238, "grad_norm": 7.134459018707275, "learning_rate": 1.9553444180522567e-05, "loss": 0.4481, "step": 1125 }, { "epoch": 0.24155622060709706, "grad_norm": 7.295234203338623, "learning_rate": 1.9541567695961994e-05, "loss": 0.5244, "step": 1130 }, { "epoch": 0.24262505344164173, "grad_norm": 6.2853193283081055, "learning_rate": 1.952969121140143e-05, "loss": 0.4593, "step": 1135 }, { "epoch": 0.2436938862761864, "grad_norm": 6.563023567199707, "learning_rate": 1.9517814726840856e-05, "loss": 0.5566, "step": 1140 }, { "epoch": 0.24476271911073108, "grad_norm": 5.089166164398193, "learning_rate": 1.9505938242280286e-05, "loss": 0.5473, "step": 1145 }, { "epoch": 0.24583155194527576, "grad_norm": 4.706131458282471, "learning_rate": 1.9494061757719717e-05, "loss": 0.3649, "step": 1150 }, { "epoch": 0.24690038477982043, "grad_norm": 7.967005729675293, "learning_rate": 1.9482185273159147e-05, "loss": 0.5466, "step": 1155 }, { "epoch": 0.2479692176143651, "grad_norm": 6.625776767730713, "learning_rate": 1.9470308788598574e-05, "loss": 0.4687, "step": 1160 }, { "epoch": 0.24903805044890978, "grad_norm": 6.631053447723389, "learning_rate": 1.9458432304038005e-05, "loss": 0.521, "step": 1165 }, { "epoch": 0.2501068832834545, "grad_norm": 5.4099555015563965, "learning_rate": 1.944655581947744e-05, "loss": 0.4539, "step": 1170 }, { "epoch": 0.25117571611799916, "grad_norm": 6.302776336669922, "learning_rate": 1.9434679334916866e-05, "loss": 0.5975, "step": 1175 }, { "epoch": 0.25224454895254383, "grad_norm": 6.055430889129639, "learning_rate": 1.9422802850356297e-05, "loss": 0.4108, "step": 1180 }, { "epoch": 0.2533133817870885, "grad_norm": 6.507791519165039, "learning_rate": 1.9410926365795727e-05, "loss": 0.4052, "step": 1185 }, { "epoch": 0.2543822146216332, "grad_norm": 4.61367130279541, "learning_rate": 1.9399049881235158e-05, "loss": 0.3805, "step": 1190 }, { "epoch": 0.25545104745617786, "grad_norm": 7.931022644042969, "learning_rate": 1.9387173396674585e-05, "loss": 0.508, "step": 1195 }, { "epoch": 0.25651988029072254, "grad_norm": 7.164324760437012, "learning_rate": 1.9375296912114015e-05, "loss": 0.4517, "step": 1200 }, { "epoch": 0.2575887131252672, "grad_norm": 5.5631890296936035, "learning_rate": 1.9363420427553446e-05, "loss": 0.5546, "step": 1205 }, { "epoch": 0.2586575459598119, "grad_norm": 6.603108882904053, "learning_rate": 1.9351543942992876e-05, "loss": 0.5427, "step": 1210 }, { "epoch": 0.25972637879435656, "grad_norm": 6.033792018890381, "learning_rate": 1.9339667458432304e-05, "loss": 0.4869, "step": 1215 }, { "epoch": 0.26079521162890124, "grad_norm": 6.105428218841553, "learning_rate": 1.9327790973871738e-05, "loss": 0.5849, "step": 1220 }, { "epoch": 0.2618640444634459, "grad_norm": 5.2770161628723145, "learning_rate": 1.9315914489311165e-05, "loss": 0.4608, "step": 1225 }, { "epoch": 0.2629328772979906, "grad_norm": 8.29669189453125, "learning_rate": 1.9304038004750595e-05, "loss": 0.6211, "step": 1230 }, { "epoch": 0.26400171013253526, "grad_norm": 5.556075572967529, "learning_rate": 1.9292161520190026e-05, "loss": 0.5136, "step": 1235 }, { "epoch": 0.26507054296707994, "grad_norm": 6.262655258178711, "learning_rate": 1.9280285035629456e-05, "loss": 0.4788, "step": 1240 }, { "epoch": 0.2661393758016246, "grad_norm": 7.117279052734375, "learning_rate": 1.9268408551068884e-05, "loss": 0.4763, "step": 1245 }, { "epoch": 0.2672082086361693, "grad_norm": 7.450889587402344, "learning_rate": 1.9256532066508314e-05, "loss": 0.5398, "step": 1250 }, { "epoch": 0.26827704147071396, "grad_norm": 6.718365669250488, "learning_rate": 1.9244655581947745e-05, "loss": 0.5782, "step": 1255 }, { "epoch": 0.26934587430525864, "grad_norm": 5.374184608459473, "learning_rate": 1.9232779097387175e-05, "loss": 0.4568, "step": 1260 }, { "epoch": 0.2704147071398033, "grad_norm": 4.485583305358887, "learning_rate": 1.9220902612826606e-05, "loss": 0.3874, "step": 1265 }, { "epoch": 0.271483539974348, "grad_norm": 5.478529930114746, "learning_rate": 1.9209026128266036e-05, "loss": 0.4604, "step": 1270 }, { "epoch": 0.27255237280889266, "grad_norm": 5.911350727081299, "learning_rate": 1.9197149643705463e-05, "loss": 0.4202, "step": 1275 }, { "epoch": 0.2736212056434374, "grad_norm": 7.953678131103516, "learning_rate": 1.9185273159144894e-05, "loss": 0.4467, "step": 1280 }, { "epoch": 0.27469003847798207, "grad_norm": 6.318038463592529, "learning_rate": 1.9173396674584325e-05, "loss": 0.3631, "step": 1285 }, { "epoch": 0.27575887131252674, "grad_norm": 7.290485382080078, "learning_rate": 1.9161520190023755e-05, "loss": 0.5917, "step": 1290 }, { "epoch": 0.2768277041470714, "grad_norm": 6.057776927947998, "learning_rate": 1.9149643705463182e-05, "loss": 0.4315, "step": 1295 }, { "epoch": 0.2778965369816161, "grad_norm": 5.482032775878906, "learning_rate": 1.9137767220902613e-05, "loss": 0.3898, "step": 1300 }, { "epoch": 0.27896536981616077, "grad_norm": 7.219336032867432, "learning_rate": 1.9125890736342047e-05, "loss": 0.5546, "step": 1305 }, { "epoch": 0.28003420265070544, "grad_norm": 6.556499481201172, "learning_rate": 1.9114014251781474e-05, "loss": 0.4766, "step": 1310 }, { "epoch": 0.2811030354852501, "grad_norm": 8.849128723144531, "learning_rate": 1.9102137767220904e-05, "loss": 0.5883, "step": 1315 }, { "epoch": 0.2821718683197948, "grad_norm": 6.604886054992676, "learning_rate": 1.9090261282660335e-05, "loss": 0.4837, "step": 1320 }, { "epoch": 0.28324070115433947, "grad_norm": 6.3507304191589355, "learning_rate": 1.9078384798099766e-05, "loss": 0.4576, "step": 1325 }, { "epoch": 0.28430953398888414, "grad_norm": 7.592872619628906, "learning_rate": 1.9066508313539193e-05, "loss": 0.3894, "step": 1330 }, { "epoch": 0.2853783668234288, "grad_norm": 7.806624889373779, "learning_rate": 1.9054631828978623e-05, "loss": 0.5239, "step": 1335 }, { "epoch": 0.2864471996579735, "grad_norm": 5.70356559753418, "learning_rate": 1.9042755344418054e-05, "loss": 0.4255, "step": 1340 }, { "epoch": 0.28751603249251817, "grad_norm": 6.017592906951904, "learning_rate": 1.9030878859857484e-05, "loss": 0.3944, "step": 1345 }, { "epoch": 0.28858486532706284, "grad_norm": 8.678641319274902, "learning_rate": 1.9019002375296915e-05, "loss": 0.6219, "step": 1350 }, { "epoch": 0.2896536981616075, "grad_norm": 5.638593673706055, "learning_rate": 1.9007125890736345e-05, "loss": 0.4791, "step": 1355 }, { "epoch": 0.2907225309961522, "grad_norm": 6.662184238433838, "learning_rate": 1.8995249406175773e-05, "loss": 0.4101, "step": 1360 }, { "epoch": 0.29179136383069687, "grad_norm": 5.850408554077148, "learning_rate": 1.8983372921615203e-05, "loss": 0.4422, "step": 1365 }, { "epoch": 0.29286019666524155, "grad_norm": 6.298422813415527, "learning_rate": 1.8971496437054634e-05, "loss": 0.4426, "step": 1370 }, { "epoch": 0.2939290294997862, "grad_norm": 6.113378524780273, "learning_rate": 1.8959619952494064e-05, "loss": 0.4108, "step": 1375 }, { "epoch": 0.2949978623343309, "grad_norm": 5.136318206787109, "learning_rate": 1.894774346793349e-05, "loss": 0.4734, "step": 1380 }, { "epoch": 0.29606669516887557, "grad_norm": 7.1877760887146, "learning_rate": 1.8935866983372922e-05, "loss": 0.4678, "step": 1385 }, { "epoch": 0.29713552800342025, "grad_norm": 7.322067737579346, "learning_rate": 1.8923990498812352e-05, "loss": 0.4334, "step": 1390 }, { "epoch": 0.2982043608379649, "grad_norm": 4.906497001647949, "learning_rate": 1.8912114014251783e-05, "loss": 0.4211, "step": 1395 }, { "epoch": 0.2992731936725096, "grad_norm": 4.929844379425049, "learning_rate": 1.8900237529691214e-05, "loss": 0.5807, "step": 1400 }, { "epoch": 0.30034202650705427, "grad_norm": 6.196166515350342, "learning_rate": 1.8888361045130644e-05, "loss": 0.5956, "step": 1405 }, { "epoch": 0.30141085934159895, "grad_norm": 5.226170539855957, "learning_rate": 1.887648456057007e-05, "loss": 0.4175, "step": 1410 }, { "epoch": 0.3024796921761437, "grad_norm": 4.843142509460449, "learning_rate": 1.8864608076009502e-05, "loss": 0.4233, "step": 1415 }, { "epoch": 0.30354852501068835, "grad_norm": 5.112825393676758, "learning_rate": 1.8852731591448932e-05, "loss": 0.4118, "step": 1420 }, { "epoch": 0.304617357845233, "grad_norm": 6.756041526794434, "learning_rate": 1.8840855106888363e-05, "loss": 0.3919, "step": 1425 }, { "epoch": 0.3056861906797777, "grad_norm": 5.811524868011475, "learning_rate": 1.882897862232779e-05, "loss": 0.5152, "step": 1430 }, { "epoch": 0.3067550235143224, "grad_norm": 5.891305446624756, "learning_rate": 1.8817102137767224e-05, "loss": 0.393, "step": 1435 }, { "epoch": 0.30782385634886705, "grad_norm": 6.220530986785889, "learning_rate": 1.880522565320665e-05, "loss": 0.3765, "step": 1440 }, { "epoch": 0.30889268918341173, "grad_norm": 6.09738826751709, "learning_rate": 1.8793349168646082e-05, "loss": 0.4476, "step": 1445 }, { "epoch": 0.3099615220179564, "grad_norm": 4.718704700469971, "learning_rate": 1.8781472684085512e-05, "loss": 0.5094, "step": 1450 }, { "epoch": 0.3110303548525011, "grad_norm": 5.264518737792969, "learning_rate": 1.8769596199524943e-05, "loss": 0.4496, "step": 1455 }, { "epoch": 0.31209918768704575, "grad_norm": 5.551924705505371, "learning_rate": 1.8757719714964373e-05, "loss": 0.4305, "step": 1460 }, { "epoch": 0.31316802052159043, "grad_norm": 4.252546787261963, "learning_rate": 1.87458432304038e-05, "loss": 0.495, "step": 1465 }, { "epoch": 0.3142368533561351, "grad_norm": 4.372467517852783, "learning_rate": 1.8733966745843235e-05, "loss": 0.5561, "step": 1470 }, { "epoch": 0.3153056861906798, "grad_norm": 6.216442108154297, "learning_rate": 1.872209026128266e-05, "loss": 0.4432, "step": 1475 }, { "epoch": 0.31637451902522445, "grad_norm": 3.8125741481781006, "learning_rate": 1.8710213776722092e-05, "loss": 0.3382, "step": 1480 }, { "epoch": 0.31744335185976913, "grad_norm": 5.539150714874268, "learning_rate": 1.8698337292161523e-05, "loss": 0.4209, "step": 1485 }, { "epoch": 0.3185121846943138, "grad_norm": 6.593637466430664, "learning_rate": 1.8686460807600953e-05, "loss": 0.3776, "step": 1490 }, { "epoch": 0.3195810175288585, "grad_norm": 5.109198570251465, "learning_rate": 1.867458432304038e-05, "loss": 0.4271, "step": 1495 }, { "epoch": 0.32064985036340315, "grad_norm": 7.083045959472656, "learning_rate": 1.866270783847981e-05, "loss": 0.5628, "step": 1500 }, { "epoch": 0.32171868319794783, "grad_norm": 7.068709850311279, "learning_rate": 1.865083135391924e-05, "loss": 0.4438, "step": 1505 }, { "epoch": 0.3227875160324925, "grad_norm": 4.62941312789917, "learning_rate": 1.8638954869358672e-05, "loss": 0.3863, "step": 1510 }, { "epoch": 0.3238563488670372, "grad_norm": 4.4039788246154785, "learning_rate": 1.86270783847981e-05, "loss": 0.5629, "step": 1515 }, { "epoch": 0.32492518170158186, "grad_norm": 6.153443813323975, "learning_rate": 1.8615201900237533e-05, "loss": 0.4851, "step": 1520 }, { "epoch": 0.32599401453612653, "grad_norm": 4.207914352416992, "learning_rate": 1.860332541567696e-05, "loss": 0.3386, "step": 1525 }, { "epoch": 0.3270628473706712, "grad_norm": 5.669225692749023, "learning_rate": 1.859144893111639e-05, "loss": 0.4546, "step": 1530 }, { "epoch": 0.3281316802052159, "grad_norm": 6.5213775634765625, "learning_rate": 1.857957244655582e-05, "loss": 0.4147, "step": 1535 }, { "epoch": 0.32920051303976056, "grad_norm": 5.153679370880127, "learning_rate": 1.8567695961995252e-05, "loss": 0.403, "step": 1540 }, { "epoch": 0.33026934587430523, "grad_norm": 5.655941009521484, "learning_rate": 1.855581947743468e-05, "loss": 0.4155, "step": 1545 }, { "epoch": 0.33133817870884996, "grad_norm": 6.6513895988464355, "learning_rate": 1.854394299287411e-05, "loss": 0.4831, "step": 1550 }, { "epoch": 0.33240701154339464, "grad_norm": 5.994706153869629, "learning_rate": 1.853206650831354e-05, "loss": 0.388, "step": 1555 }, { "epoch": 0.3334758443779393, "grad_norm": 4.132383346557617, "learning_rate": 1.852019002375297e-05, "loss": 0.4184, "step": 1560 }, { "epoch": 0.334544677212484, "grad_norm": 4.975070953369141, "learning_rate": 1.8508313539192398e-05, "loss": 0.3645, "step": 1565 }, { "epoch": 0.33561351004702866, "grad_norm": 6.475266933441162, "learning_rate": 1.8496437054631832e-05, "loss": 0.4653, "step": 1570 }, { "epoch": 0.33668234288157334, "grad_norm": 5.302603244781494, "learning_rate": 1.848456057007126e-05, "loss": 0.5196, "step": 1575 }, { "epoch": 0.337751175716118, "grad_norm": 6.404365539550781, "learning_rate": 1.847268408551069e-05, "loss": 0.5252, "step": 1580 }, { "epoch": 0.3388200085506627, "grad_norm": 5.3015923500061035, "learning_rate": 1.846080760095012e-05, "loss": 0.6971, "step": 1585 }, { "epoch": 0.33988884138520736, "grad_norm": 6.321039199829102, "learning_rate": 1.844893111638955e-05, "loss": 0.3881, "step": 1590 }, { "epoch": 0.34095767421975204, "grad_norm": 4.614476680755615, "learning_rate": 1.843705463182898e-05, "loss": 0.4302, "step": 1595 }, { "epoch": 0.3420265070542967, "grad_norm": 5.174408912658691, "learning_rate": 1.842517814726841e-05, "loss": 0.3701, "step": 1600 }, { "epoch": 0.3430953398888414, "grad_norm": 4.7469706535339355, "learning_rate": 1.8413301662707842e-05, "loss": 0.3893, "step": 1605 }, { "epoch": 0.34416417272338606, "grad_norm": 5.967380046844482, "learning_rate": 1.840142517814727e-05, "loss": 0.4246, "step": 1610 }, { "epoch": 0.34523300555793074, "grad_norm": 4.841580867767334, "learning_rate": 1.83895486935867e-05, "loss": 0.3006, "step": 1615 }, { "epoch": 0.3463018383924754, "grad_norm": 5.739339351654053, "learning_rate": 1.837767220902613e-05, "loss": 0.7078, "step": 1620 }, { "epoch": 0.3473706712270201, "grad_norm": 5.888680458068848, "learning_rate": 1.836579572446556e-05, "loss": 0.375, "step": 1625 }, { "epoch": 0.34843950406156476, "grad_norm": 6.077122211456299, "learning_rate": 1.835391923990499e-05, "loss": 0.4425, "step": 1630 }, { "epoch": 0.34950833689610944, "grad_norm": 6.087640762329102, "learning_rate": 1.834204275534442e-05, "loss": 0.3841, "step": 1635 }, { "epoch": 0.3505771697306541, "grad_norm": 7.3536529541015625, "learning_rate": 1.833016627078385e-05, "loss": 0.4075, "step": 1640 }, { "epoch": 0.3516460025651988, "grad_norm": 6.833067893981934, "learning_rate": 1.831828978622328e-05, "loss": 0.5383, "step": 1645 }, { "epoch": 0.35271483539974346, "grad_norm": 5.849217414855957, "learning_rate": 1.8306413301662707e-05, "loss": 0.4623, "step": 1650 }, { "epoch": 0.35378366823428814, "grad_norm": 5.285182952880859, "learning_rate": 1.829453681710214e-05, "loss": 0.3986, "step": 1655 }, { "epoch": 0.3548525010688328, "grad_norm": 5.706902980804443, "learning_rate": 1.8282660332541568e-05, "loss": 0.4038, "step": 1660 }, { "epoch": 0.3559213339033775, "grad_norm": 4.221705436706543, "learning_rate": 1.8270783847981e-05, "loss": 0.5615, "step": 1665 }, { "epoch": 0.35699016673792217, "grad_norm": 6.5307745933532715, "learning_rate": 1.825890736342043e-05, "loss": 0.5535, "step": 1670 }, { "epoch": 0.35805899957246684, "grad_norm": 5.936892509460449, "learning_rate": 1.824703087885986e-05, "loss": 0.3316, "step": 1675 }, { "epoch": 0.3591278324070115, "grad_norm": 4.413790702819824, "learning_rate": 1.8235154394299287e-05, "loss": 0.3916, "step": 1680 }, { "epoch": 0.36019666524155625, "grad_norm": 5.399665355682373, "learning_rate": 1.8223277909738718e-05, "loss": 0.3723, "step": 1685 }, { "epoch": 0.3612654980761009, "grad_norm": 8.413554191589355, "learning_rate": 1.8211401425178148e-05, "loss": 0.5188, "step": 1690 }, { "epoch": 0.3623343309106456, "grad_norm": 3.7601664066314697, "learning_rate": 1.819952494061758e-05, "loss": 0.3817, "step": 1695 }, { "epoch": 0.36340316374519027, "grad_norm": 5.661569595336914, "learning_rate": 1.818764845605701e-05, "loss": 0.4036, "step": 1700 }, { "epoch": 0.36447199657973495, "grad_norm": 6.07588005065918, "learning_rate": 1.817577197149644e-05, "loss": 0.4224, "step": 1705 }, { "epoch": 0.3655408294142796, "grad_norm": 5.329127311706543, "learning_rate": 1.8163895486935867e-05, "loss": 0.4171, "step": 1710 }, { "epoch": 0.3666096622488243, "grad_norm": 7.156865119934082, "learning_rate": 1.8152019002375298e-05, "loss": 0.4122, "step": 1715 }, { "epoch": 0.367678495083369, "grad_norm": 5.72195291519165, "learning_rate": 1.8140142517814728e-05, "loss": 0.4024, "step": 1720 }, { "epoch": 0.36874732791791365, "grad_norm": 4.991401672363281, "learning_rate": 1.812826603325416e-05, "loss": 0.3882, "step": 1725 }, { "epoch": 0.3698161607524583, "grad_norm": 4.662073612213135, "learning_rate": 1.811638954869359e-05, "loss": 0.3282, "step": 1730 }, { "epoch": 0.370884993587003, "grad_norm": 5.966677188873291, "learning_rate": 1.8104513064133016e-05, "loss": 0.2961, "step": 1735 }, { "epoch": 0.3719538264215477, "grad_norm": 5.708690166473389, "learning_rate": 1.809263657957245e-05, "loss": 0.4568, "step": 1740 }, { "epoch": 0.37302265925609235, "grad_norm": 5.69785213470459, "learning_rate": 1.8080760095011877e-05, "loss": 0.3544, "step": 1745 }, { "epoch": 0.374091492090637, "grad_norm": 6.101360321044922, "learning_rate": 1.8068883610451308e-05, "loss": 0.4282, "step": 1750 }, { "epoch": 0.3751603249251817, "grad_norm": 6.585791110992432, "learning_rate": 1.805700712589074e-05, "loss": 0.3798, "step": 1755 }, { "epoch": 0.3762291577597264, "grad_norm": 5.618402481079102, "learning_rate": 1.804513064133017e-05, "loss": 0.4618, "step": 1760 }, { "epoch": 0.37729799059427105, "grad_norm": 6.637610912322998, "learning_rate": 1.8033254156769596e-05, "loss": 0.3876, "step": 1765 }, { "epoch": 0.3783668234288157, "grad_norm": 4.9898295402526855, "learning_rate": 1.8021377672209027e-05, "loss": 0.3393, "step": 1770 }, { "epoch": 0.3794356562633604, "grad_norm": 6.182595252990723, "learning_rate": 1.8009501187648457e-05, "loss": 0.4276, "step": 1775 }, { "epoch": 0.3805044890979051, "grad_norm": 5.460147380828857, "learning_rate": 1.7997624703087888e-05, "loss": 0.4641, "step": 1780 }, { "epoch": 0.38157332193244975, "grad_norm": 4.0731940269470215, "learning_rate": 1.798574821852732e-05, "loss": 0.4248, "step": 1785 }, { "epoch": 0.3826421547669944, "grad_norm": 3.6468496322631836, "learning_rate": 1.797387173396675e-05, "loss": 0.3601, "step": 1790 }, { "epoch": 0.3837109876015391, "grad_norm": 3.701404094696045, "learning_rate": 1.7961995249406176e-05, "loss": 0.3384, "step": 1795 }, { "epoch": 0.3847798204360838, "grad_norm": 6.082109451293945, "learning_rate": 1.7950118764845607e-05, "loss": 0.3598, "step": 1800 }, { "epoch": 0.38584865327062845, "grad_norm": 4.901666164398193, "learning_rate": 1.7938242280285037e-05, "loss": 0.4363, "step": 1805 }, { "epoch": 0.3869174861051731, "grad_norm": 3.848799467086792, "learning_rate": 1.7926365795724468e-05, "loss": 0.335, "step": 1810 }, { "epoch": 0.38798631893971786, "grad_norm": 4.457520484924316, "learning_rate": 1.7914489311163895e-05, "loss": 0.3892, "step": 1815 }, { "epoch": 0.38905515177426253, "grad_norm": 6.423126697540283, "learning_rate": 1.7902612826603326e-05, "loss": 0.4103, "step": 1820 }, { "epoch": 0.3901239846088072, "grad_norm": 5.50001335144043, "learning_rate": 1.7890736342042756e-05, "loss": 0.3959, "step": 1825 }, { "epoch": 0.3911928174433519, "grad_norm": 3.85994553565979, "learning_rate": 1.7878859857482187e-05, "loss": 0.3593, "step": 1830 }, { "epoch": 0.39226165027789656, "grad_norm": 6.009896278381348, "learning_rate": 1.7866983372921617e-05, "loss": 0.4366, "step": 1835 }, { "epoch": 0.39333048311244123, "grad_norm": 4.844223499298096, "learning_rate": 1.7855106888361048e-05, "loss": 0.3633, "step": 1840 }, { "epoch": 0.3943993159469859, "grad_norm": 5.032964706420898, "learning_rate": 1.7843230403800475e-05, "loss": 0.4333, "step": 1845 }, { "epoch": 0.3954681487815306, "grad_norm": 5.1685872077941895, "learning_rate": 1.7831353919239905e-05, "loss": 0.4825, "step": 1850 }, { "epoch": 0.39653698161607526, "grad_norm": 5.741828918457031, "learning_rate": 1.7819477434679336e-05, "loss": 0.3041, "step": 1855 }, { "epoch": 0.39760581445061993, "grad_norm": 5.440220832824707, "learning_rate": 1.7807600950118767e-05, "loss": 0.3416, "step": 1860 }, { "epoch": 0.3986746472851646, "grad_norm": 4.476759433746338, "learning_rate": 1.7795724465558197e-05, "loss": 0.4578, "step": 1865 }, { "epoch": 0.3997434801197093, "grad_norm": 6.7310991287231445, "learning_rate": 1.7783847980997628e-05, "loss": 0.5441, "step": 1870 }, { "epoch": 0.40081231295425396, "grad_norm": 5.929594993591309, "learning_rate": 1.7771971496437058e-05, "loss": 0.5124, "step": 1875 }, { "epoch": 0.40188114578879863, "grad_norm": 4.516419410705566, "learning_rate": 1.7760095011876485e-05, "loss": 0.4026, "step": 1880 }, { "epoch": 0.4029499786233433, "grad_norm": 5.7698798179626465, "learning_rate": 1.7748218527315916e-05, "loss": 0.4537, "step": 1885 }, { "epoch": 0.404018811457888, "grad_norm": 4.604269027709961, "learning_rate": 1.7736342042755346e-05, "loss": 0.3983, "step": 1890 }, { "epoch": 0.40508764429243266, "grad_norm": 6.4217610359191895, "learning_rate": 1.7724465558194777e-05, "loss": 0.3894, "step": 1895 }, { "epoch": 0.40615647712697733, "grad_norm": 5.296751022338867, "learning_rate": 1.7712589073634204e-05, "loss": 0.4747, "step": 1900 }, { "epoch": 0.407225309961522, "grad_norm": 4.870068550109863, "learning_rate": 1.7700712589073638e-05, "loss": 0.4042, "step": 1905 }, { "epoch": 0.4082941427960667, "grad_norm": 4.312191486358643, "learning_rate": 1.7688836104513065e-05, "loss": 0.3477, "step": 1910 }, { "epoch": 0.40936297563061136, "grad_norm": 5.281498432159424, "learning_rate": 1.7676959619952496e-05, "loss": 0.4512, "step": 1915 }, { "epoch": 0.41043180846515603, "grad_norm": 4.401067733764648, "learning_rate": 1.7665083135391926e-05, "loss": 0.3451, "step": 1920 }, { "epoch": 0.4115006412997007, "grad_norm": 5.28626012802124, "learning_rate": 1.7653206650831357e-05, "loss": 0.3983, "step": 1925 }, { "epoch": 0.4125694741342454, "grad_norm": 5.951436519622803, "learning_rate": 1.7641330166270784e-05, "loss": 0.62, "step": 1930 }, { "epoch": 0.41363830696879006, "grad_norm": 3.4126088619232178, "learning_rate": 1.7629453681710215e-05, "loss": 0.4632, "step": 1935 }, { "epoch": 0.41470713980333473, "grad_norm": 4.540611267089844, "learning_rate": 1.7617577197149645e-05, "loss": 0.3757, "step": 1940 }, { "epoch": 0.4157759726378794, "grad_norm": 5.913720607757568, "learning_rate": 1.7605700712589076e-05, "loss": 0.4752, "step": 1945 }, { "epoch": 0.41684480547242414, "grad_norm": 4.386907577514648, "learning_rate": 1.7593824228028503e-05, "loss": 0.453, "step": 1950 }, { "epoch": 0.4179136383069688, "grad_norm": 4.836590766906738, "learning_rate": 1.7581947743467937e-05, "loss": 0.4348, "step": 1955 }, { "epoch": 0.4189824711415135, "grad_norm": 4.215417861938477, "learning_rate": 1.7570071258907364e-05, "loss": 0.3944, "step": 1960 }, { "epoch": 0.42005130397605817, "grad_norm": 5.9303789138793945, "learning_rate": 1.7558194774346795e-05, "loss": 0.3702, "step": 1965 }, { "epoch": 0.42112013681060284, "grad_norm": 5.648311138153076, "learning_rate": 1.7546318289786225e-05, "loss": 0.3888, "step": 1970 }, { "epoch": 0.4221889696451475, "grad_norm": 5.413701057434082, "learning_rate": 1.7534441805225656e-05, "loss": 0.3968, "step": 1975 }, { "epoch": 0.4232578024796922, "grad_norm": 4.331090450286865, "learning_rate": 1.7522565320665083e-05, "loss": 0.3609, "step": 1980 }, { "epoch": 0.42432663531423687, "grad_norm": 4.991115093231201, "learning_rate": 1.7510688836104513e-05, "loss": 0.4328, "step": 1985 }, { "epoch": 0.42539546814878154, "grad_norm": 5.451033115386963, "learning_rate": 1.7498812351543944e-05, "loss": 0.516, "step": 1990 }, { "epoch": 0.4264643009833262, "grad_norm": 5.011542320251465, "learning_rate": 1.7486935866983374e-05, "loss": 0.3605, "step": 1995 }, { "epoch": 0.4275331338178709, "grad_norm": 5.4983086585998535, "learning_rate": 1.74750593824228e-05, "loss": 0.3094, "step": 2000 }, { "epoch": 0.42860196665241557, "grad_norm": 5.928680896759033, "learning_rate": 1.7463182897862236e-05, "loss": 0.3866, "step": 2005 }, { "epoch": 0.42967079948696024, "grad_norm": 4.630986213684082, "learning_rate": 1.7451306413301666e-05, "loss": 0.3943, "step": 2010 }, { "epoch": 0.4307396323215049, "grad_norm": 4.091104030609131, "learning_rate": 1.7439429928741093e-05, "loss": 0.3931, "step": 2015 }, { "epoch": 0.4318084651560496, "grad_norm": 6.031238555908203, "learning_rate": 1.7427553444180524e-05, "loss": 0.3728, "step": 2020 }, { "epoch": 0.43287729799059427, "grad_norm": 4.81741189956665, "learning_rate": 1.7415676959619954e-05, "loss": 0.2953, "step": 2025 }, { "epoch": 0.43394613082513894, "grad_norm": 5.144311904907227, "learning_rate": 1.7403800475059385e-05, "loss": 0.3547, "step": 2030 }, { "epoch": 0.4350149636596836, "grad_norm": 4.806643009185791, "learning_rate": 1.7391923990498812e-05, "loss": 0.4275, "step": 2035 }, { "epoch": 0.4360837964942283, "grad_norm": 4.138782501220703, "learning_rate": 1.7380047505938246e-05, "loss": 0.2959, "step": 2040 }, { "epoch": 0.43715262932877297, "grad_norm": 5.7593255043029785, "learning_rate": 1.7368171021377673e-05, "loss": 0.3245, "step": 2045 }, { "epoch": 0.43822146216331764, "grad_norm": 4.043095588684082, "learning_rate": 1.7356294536817104e-05, "loss": 0.4148, "step": 2050 }, { "epoch": 0.4392902949978623, "grad_norm": 4.848685264587402, "learning_rate": 1.7344418052256534e-05, "loss": 0.3542, "step": 2055 }, { "epoch": 0.440359127832407, "grad_norm": 5.738672256469727, "learning_rate": 1.7332541567695965e-05, "loss": 0.4493, "step": 2060 }, { "epoch": 0.44142796066695167, "grad_norm": 4.470565319061279, "learning_rate": 1.7320665083135392e-05, "loss": 0.376, "step": 2065 }, { "epoch": 0.44249679350149634, "grad_norm": 4.22749137878418, "learning_rate": 1.7308788598574823e-05, "loss": 0.322, "step": 2070 }, { "epoch": 0.443565626336041, "grad_norm": 5.158305644989014, "learning_rate": 1.7296912114014253e-05, "loss": 0.3227, "step": 2075 }, { "epoch": 0.4446344591705857, "grad_norm": 6.257720947265625, "learning_rate": 1.7285035629453684e-05, "loss": 0.3046, "step": 2080 }, { "epoch": 0.4457032920051304, "grad_norm": 5.981179237365723, "learning_rate": 1.727315914489311e-05, "loss": 0.3064, "step": 2085 }, { "epoch": 0.4467721248396751, "grad_norm": 5.584667682647705, "learning_rate": 1.7261282660332545e-05, "loss": 0.3199, "step": 2090 }, { "epoch": 0.4478409576742198, "grad_norm": 5.660790920257568, "learning_rate": 1.7249406175771972e-05, "loss": 0.3774, "step": 2095 }, { "epoch": 0.44890979050876445, "grad_norm": 4.129720687866211, "learning_rate": 1.7237529691211402e-05, "loss": 0.3212, "step": 2100 }, { "epoch": 0.4499786233433091, "grad_norm": 3.2054107189178467, "learning_rate": 1.7225653206650833e-05, "loss": 0.3302, "step": 2105 }, { "epoch": 0.4510474561778538, "grad_norm": 3.934522867202759, "learning_rate": 1.7213776722090264e-05, "loss": 0.3205, "step": 2110 }, { "epoch": 0.4521162890123985, "grad_norm": 5.592263221740723, "learning_rate": 1.720190023752969e-05, "loss": 0.3673, "step": 2115 }, { "epoch": 0.45318512184694315, "grad_norm": 5.707674026489258, "learning_rate": 1.719002375296912e-05, "loss": 0.3752, "step": 2120 }, { "epoch": 0.4542539546814878, "grad_norm": 4.284328937530518, "learning_rate": 1.7178147268408552e-05, "loss": 0.3192, "step": 2125 }, { "epoch": 0.4553227875160325, "grad_norm": 4.87931489944458, "learning_rate": 1.7166270783847982e-05, "loss": 0.3372, "step": 2130 }, { "epoch": 0.4563916203505772, "grad_norm": 5.3206048011779785, "learning_rate": 1.7154394299287413e-05, "loss": 0.321, "step": 2135 }, { "epoch": 0.45746045318512185, "grad_norm": 5.118194103240967, "learning_rate": 1.7142517814726843e-05, "loss": 0.4086, "step": 2140 }, { "epoch": 0.4585292860196665, "grad_norm": 5.390005111694336, "learning_rate": 1.7130641330166274e-05, "loss": 0.4092, "step": 2145 }, { "epoch": 0.4595981188542112, "grad_norm": 6.221261978149414, "learning_rate": 1.71187648456057e-05, "loss": 0.4591, "step": 2150 }, { "epoch": 0.4606669516887559, "grad_norm": 4.9464497566223145, "learning_rate": 1.7106888361045132e-05, "loss": 0.503, "step": 2155 }, { "epoch": 0.46173578452330055, "grad_norm": 6.745388984680176, "learning_rate": 1.7095011876484562e-05, "loss": 0.4767, "step": 2160 }, { "epoch": 0.4628046173578452, "grad_norm": 5.506555080413818, "learning_rate": 1.7083135391923993e-05, "loss": 0.3425, "step": 2165 }, { "epoch": 0.4638734501923899, "grad_norm": 5.21577787399292, "learning_rate": 1.707125890736342e-05, "loss": 0.372, "step": 2170 }, { "epoch": 0.4649422830269346, "grad_norm": 4.69103479385376, "learning_rate": 1.7059382422802854e-05, "loss": 0.4671, "step": 2175 }, { "epoch": 0.46601111586147925, "grad_norm": 4.060796737670898, "learning_rate": 1.704750593824228e-05, "loss": 0.3767, "step": 2180 }, { "epoch": 0.4670799486960239, "grad_norm": 6.448695659637451, "learning_rate": 1.703562945368171e-05, "loss": 0.3096, "step": 2185 }, { "epoch": 0.4681487815305686, "grad_norm": 4.255459308624268, "learning_rate": 1.7023752969121142e-05, "loss": 0.3654, "step": 2190 }, { "epoch": 0.4692176143651133, "grad_norm": 5.383869647979736, "learning_rate": 1.7011876484560573e-05, "loss": 0.4243, "step": 2195 }, { "epoch": 0.47028644719965795, "grad_norm": 4.97196102142334, "learning_rate": 1.7e-05, "loss": 0.4411, "step": 2200 }, { "epoch": 0.47135528003420263, "grad_norm": 4.9628071784973145, "learning_rate": 1.698812351543943e-05, "loss": 0.5001, "step": 2205 }, { "epoch": 0.4724241128687473, "grad_norm": 5.05242919921875, "learning_rate": 1.697624703087886e-05, "loss": 0.3485, "step": 2210 }, { "epoch": 0.473492945703292, "grad_norm": 4.373459339141846, "learning_rate": 1.696437054631829e-05, "loss": 0.3546, "step": 2215 }, { "epoch": 0.4745617785378367, "grad_norm": 5.0651397705078125, "learning_rate": 1.6952494061757722e-05, "loss": 0.4037, "step": 2220 }, { "epoch": 0.4756306113723814, "grad_norm": 6.026737213134766, "learning_rate": 1.6940617577197153e-05, "loss": 0.3114, "step": 2225 }, { "epoch": 0.47669944420692606, "grad_norm": 4.404332160949707, "learning_rate": 1.692874109263658e-05, "loss": 0.3422, "step": 2230 }, { "epoch": 0.47776827704147073, "grad_norm": 5.780966281890869, "learning_rate": 1.691686460807601e-05, "loss": 0.4241, "step": 2235 }, { "epoch": 0.4788371098760154, "grad_norm": 5.648661136627197, "learning_rate": 1.690498812351544e-05, "loss": 0.3912, "step": 2240 }, { "epoch": 0.4799059427105601, "grad_norm": 3.616197109222412, "learning_rate": 1.689311163895487e-05, "loss": 0.3748, "step": 2245 }, { "epoch": 0.48097477554510476, "grad_norm": 4.634115219116211, "learning_rate": 1.68812351543943e-05, "loss": 0.3746, "step": 2250 }, { "epoch": 0.48204360837964944, "grad_norm": 4.268435478210449, "learning_rate": 1.686935866983373e-05, "loss": 0.3544, "step": 2255 }, { "epoch": 0.4831124412141941, "grad_norm": 4.208693504333496, "learning_rate": 1.685748218527316e-05, "loss": 0.3246, "step": 2260 }, { "epoch": 0.4841812740487388, "grad_norm": 7.521546840667725, "learning_rate": 1.684560570071259e-05, "loss": 0.3739, "step": 2265 }, { "epoch": 0.48525010688328346, "grad_norm": 5.12343692779541, "learning_rate": 1.683372921615202e-05, "loss": 0.3606, "step": 2270 }, { "epoch": 0.48631893971782814, "grad_norm": 6.54265022277832, "learning_rate": 1.682185273159145e-05, "loss": 0.3891, "step": 2275 }, { "epoch": 0.4873877725523728, "grad_norm": 4.471118450164795, "learning_rate": 1.680997624703088e-05, "loss": 0.2855, "step": 2280 }, { "epoch": 0.4884566053869175, "grad_norm": 7.488130569458008, "learning_rate": 1.679809976247031e-05, "loss": 0.4829, "step": 2285 }, { "epoch": 0.48952543822146216, "grad_norm": 6.3466033935546875, "learning_rate": 1.678622327790974e-05, "loss": 0.4529, "step": 2290 }, { "epoch": 0.49059427105600684, "grad_norm": 7.353418350219727, "learning_rate": 1.677434679334917e-05, "loss": 0.4819, "step": 2295 }, { "epoch": 0.4916631038905515, "grad_norm": 4.575865745544434, "learning_rate": 1.67624703087886e-05, "loss": 0.3453, "step": 2300 }, { "epoch": 0.4927319367250962, "grad_norm": 4.988368511199951, "learning_rate": 1.675059382422803e-05, "loss": 0.3425, "step": 2305 }, { "epoch": 0.49380076955964086, "grad_norm": 5.05146598815918, "learning_rate": 1.6738717339667462e-05, "loss": 0.2884, "step": 2310 }, { "epoch": 0.49486960239418554, "grad_norm": 6.10252571105957, "learning_rate": 1.672684085510689e-05, "loss": 0.3184, "step": 2315 }, { "epoch": 0.4959384352287302, "grad_norm": 5.356700420379639, "learning_rate": 1.671496437054632e-05, "loss": 0.3043, "step": 2320 }, { "epoch": 0.4970072680632749, "grad_norm": 4.550732135772705, "learning_rate": 1.670308788598575e-05, "loss": 0.3746, "step": 2325 }, { "epoch": 0.49807610089781956, "grad_norm": 4.781940937042236, "learning_rate": 1.669121140142518e-05, "loss": 0.4023, "step": 2330 }, { "epoch": 0.49914493373236424, "grad_norm": 3.1689300537109375, "learning_rate": 1.6679334916864608e-05, "loss": 0.2994, "step": 2335 }, { "epoch": 0.500213766566909, "grad_norm": 5.919034004211426, "learning_rate": 1.6667458432304042e-05, "loss": 0.3858, "step": 2340 }, { "epoch": 0.5012825994014536, "grad_norm": 4.044144153594971, "learning_rate": 1.665558194774347e-05, "loss": 0.3488, "step": 2345 }, { "epoch": 0.5023514322359983, "grad_norm": 5.063786506652832, "learning_rate": 1.66437054631829e-05, "loss": 0.4467, "step": 2350 }, { "epoch": 0.5034202650705429, "grad_norm": 4.159796237945557, "learning_rate": 1.663182897862233e-05, "loss": 0.3199, "step": 2355 }, { "epoch": 0.5044890979050877, "grad_norm": 4.232370853424072, "learning_rate": 1.661995249406176e-05, "loss": 0.3124, "step": 2360 }, { "epoch": 0.5055579307396323, "grad_norm": 3.8301782608032227, "learning_rate": 1.6608076009501188e-05, "loss": 0.3229, "step": 2365 }, { "epoch": 0.506626763574177, "grad_norm": 5.729179382324219, "learning_rate": 1.6596199524940618e-05, "loss": 0.3416, "step": 2370 }, { "epoch": 0.5076955964087216, "grad_norm": 4.137636184692383, "learning_rate": 1.658432304038005e-05, "loss": 0.3555, "step": 2375 }, { "epoch": 0.5087644292432664, "grad_norm": 6.014377593994141, "learning_rate": 1.657244655581948e-05, "loss": 0.3078, "step": 2380 }, { "epoch": 0.509833262077811, "grad_norm": 5.031920909881592, "learning_rate": 1.6560570071258906e-05, "loss": 0.3105, "step": 2385 }, { "epoch": 0.5109020949123557, "grad_norm": 4.162966728210449, "learning_rate": 1.654869358669834e-05, "loss": 0.3565, "step": 2390 }, { "epoch": 0.5119709277469003, "grad_norm": 5.57382345199585, "learning_rate": 1.6536817102137768e-05, "loss": 0.3113, "step": 2395 }, { "epoch": 0.5130397605814451, "grad_norm": 6.443201065063477, "learning_rate": 1.6524940617577198e-05, "loss": 0.4326, "step": 2400 }, { "epoch": 0.5141085934159897, "grad_norm": 7.050893306732178, "learning_rate": 1.651306413301663e-05, "loss": 0.4395, "step": 2405 }, { "epoch": 0.5151774262505344, "grad_norm": 4.315305709838867, "learning_rate": 1.650118764845606e-05, "loss": 0.3549, "step": 2410 }, { "epoch": 0.516246259085079, "grad_norm": 3.76841402053833, "learning_rate": 1.6489311163895486e-05, "loss": 0.4271, "step": 2415 }, { "epoch": 0.5173150919196238, "grad_norm": 4.878926753997803, "learning_rate": 1.6477434679334917e-05, "loss": 0.3136, "step": 2420 }, { "epoch": 0.5183839247541685, "grad_norm": 4.831075668334961, "learning_rate": 1.646555819477435e-05, "loss": 0.3235, "step": 2425 }, { "epoch": 0.5194527575887131, "grad_norm": 4.886428356170654, "learning_rate": 1.6453681710213778e-05, "loss": 0.2909, "step": 2430 }, { "epoch": 0.5205215904232579, "grad_norm": 5.281339645385742, "learning_rate": 1.644180522565321e-05, "loss": 0.3296, "step": 2435 }, { "epoch": 0.5215904232578025, "grad_norm": 4.9752516746521, "learning_rate": 1.642992874109264e-05, "loss": 0.4124, "step": 2440 }, { "epoch": 0.5226592560923472, "grad_norm": 5.5705952644348145, "learning_rate": 1.641805225653207e-05, "loss": 0.4444, "step": 2445 }, { "epoch": 0.5237280889268918, "grad_norm": 4.4641499519348145, "learning_rate": 1.6406175771971497e-05, "loss": 0.3649, "step": 2450 }, { "epoch": 0.5247969217614366, "grad_norm": 4.909672260284424, "learning_rate": 1.6394299287410927e-05, "loss": 0.3897, "step": 2455 }, { "epoch": 0.5258657545959812, "grad_norm": 5.340948581695557, "learning_rate": 1.6382422802850358e-05, "loss": 0.36, "step": 2460 }, { "epoch": 0.5269345874305259, "grad_norm": 5.204975128173828, "learning_rate": 1.637054631828979e-05, "loss": 0.3899, "step": 2465 }, { "epoch": 0.5280034202650705, "grad_norm": 5.030284881591797, "learning_rate": 1.6358669833729216e-05, "loss": 0.3303, "step": 2470 }, { "epoch": 0.5290722530996153, "grad_norm": 3.7952535152435303, "learning_rate": 1.634679334916865e-05, "loss": 0.3115, "step": 2475 }, { "epoch": 0.5301410859341599, "grad_norm": 5.823569297790527, "learning_rate": 1.6334916864608077e-05, "loss": 0.4637, "step": 2480 }, { "epoch": 0.5312099187687046, "grad_norm": 6.1813483238220215, "learning_rate": 1.6323040380047507e-05, "loss": 0.4402, "step": 2485 }, { "epoch": 0.5322787516032492, "grad_norm": 3.668980360031128, "learning_rate": 1.6311163895486938e-05, "loss": 0.2825, "step": 2490 }, { "epoch": 0.533347584437794, "grad_norm": 4.954606056213379, "learning_rate": 1.629928741092637e-05, "loss": 0.3389, "step": 2495 }, { "epoch": 0.5344164172723386, "grad_norm": 4.136919021606445, "learning_rate": 1.6287410926365796e-05, "loss": 0.3513, "step": 2500 }, { "epoch": 0.5354852501068833, "grad_norm": 5.383963108062744, "learning_rate": 1.6275534441805226e-05, "loss": 0.4301, "step": 2505 }, { "epoch": 0.5365540829414279, "grad_norm": 4.818902015686035, "learning_rate": 1.6263657957244657e-05, "loss": 0.3733, "step": 2510 }, { "epoch": 0.5376229157759727, "grad_norm": 4.797301769256592, "learning_rate": 1.6251781472684087e-05, "loss": 0.3241, "step": 2515 }, { "epoch": 0.5386917486105173, "grad_norm": 5.040024757385254, "learning_rate": 1.6239904988123514e-05, "loss": 0.3457, "step": 2520 }, { "epoch": 0.539760581445062, "grad_norm": 5.214640140533447, "learning_rate": 1.622802850356295e-05, "loss": 0.4533, "step": 2525 }, { "epoch": 0.5408294142796066, "grad_norm": 3.6819052696228027, "learning_rate": 1.6216152019002375e-05, "loss": 0.3397, "step": 2530 }, { "epoch": 0.5418982471141514, "grad_norm": 4.882740020751953, "learning_rate": 1.6204275534441806e-05, "loss": 0.4097, "step": 2535 }, { "epoch": 0.542967079948696, "grad_norm": 4.784149646759033, "learning_rate": 1.6192399049881237e-05, "loss": 0.3083, "step": 2540 }, { "epoch": 0.5440359127832407, "grad_norm": 5.621673107147217, "learning_rate": 1.6180522565320667e-05, "loss": 0.3199, "step": 2545 }, { "epoch": 0.5451047456177853, "grad_norm": 5.204516887664795, "learning_rate": 1.6168646080760094e-05, "loss": 0.3971, "step": 2550 }, { "epoch": 0.54617357845233, "grad_norm": 4.5771026611328125, "learning_rate": 1.6156769596199525e-05, "loss": 0.5229, "step": 2555 }, { "epoch": 0.5472424112868748, "grad_norm": 5.919792652130127, "learning_rate": 1.6144893111638955e-05, "loss": 0.3765, "step": 2560 }, { "epoch": 0.5483112441214194, "grad_norm": 4.573512554168701, "learning_rate": 1.6133016627078386e-05, "loss": 0.3727, "step": 2565 }, { "epoch": 0.5493800769559641, "grad_norm": 3.752349615097046, "learning_rate": 1.6121140142517816e-05, "loss": 0.3252, "step": 2570 }, { "epoch": 0.5504489097905088, "grad_norm": 3.7579493522644043, "learning_rate": 1.6109263657957247e-05, "loss": 0.2897, "step": 2575 }, { "epoch": 0.5515177426250535, "grad_norm": 3.408615827560425, "learning_rate": 1.6097387173396678e-05, "loss": 0.2867, "step": 2580 }, { "epoch": 0.5525865754595981, "grad_norm": 6.79346227645874, "learning_rate": 1.6085510688836105e-05, "loss": 0.3058, "step": 2585 }, { "epoch": 0.5536554082941428, "grad_norm": 4.814434051513672, "learning_rate": 1.6073634204275535e-05, "loss": 0.461, "step": 2590 }, { "epoch": 0.5547242411286875, "grad_norm": 4.379047393798828, "learning_rate": 1.6061757719714966e-05, "loss": 0.2341, "step": 2595 }, { "epoch": 0.5557930739632322, "grad_norm": 7.072385787963867, "learning_rate": 1.6049881235154396e-05, "loss": 0.3446, "step": 2600 }, { "epoch": 0.5568619067977768, "grad_norm": 6.0254411697387695, "learning_rate": 1.6038004750593824e-05, "loss": 0.2844, "step": 2605 }, { "epoch": 0.5579307396323215, "grad_norm": 3.961240768432617, "learning_rate": 1.6026128266033257e-05, "loss": 0.3414, "step": 2610 }, { "epoch": 0.5589995724668662, "grad_norm": 4.460314750671387, "learning_rate": 1.6014251781472685e-05, "loss": 0.3575, "step": 2615 }, { "epoch": 0.5600684053014109, "grad_norm": 4.68889856338501, "learning_rate": 1.6002375296912115e-05, "loss": 0.3799, "step": 2620 }, { "epoch": 0.5611372381359555, "grad_norm": 4.315304756164551, "learning_rate": 1.5990498812351546e-05, "loss": 0.3553, "step": 2625 }, { "epoch": 0.5622060709705002, "grad_norm": 5.276904582977295, "learning_rate": 1.5978622327790976e-05, "loss": 0.3938, "step": 2630 }, { "epoch": 0.5632749038050449, "grad_norm": 6.12239408493042, "learning_rate": 1.5966745843230403e-05, "loss": 0.4941, "step": 2635 }, { "epoch": 0.5643437366395896, "grad_norm": 3.896017074584961, "learning_rate": 1.5954869358669834e-05, "loss": 0.3454, "step": 2640 }, { "epoch": 0.5654125694741342, "grad_norm": 4.870078086853027, "learning_rate": 1.5942992874109265e-05, "loss": 0.3009, "step": 2645 }, { "epoch": 0.5664814023086789, "grad_norm": 4.661655426025391, "learning_rate": 1.5931116389548695e-05, "loss": 0.4625, "step": 2650 }, { "epoch": 0.5675502351432236, "grad_norm": 4.946725368499756, "learning_rate": 1.5919239904988126e-05, "loss": 0.3426, "step": 2655 }, { "epoch": 0.5686190679777683, "grad_norm": 5.536448955535889, "learning_rate": 1.5907363420427556e-05, "loss": 0.3403, "step": 2660 }, { "epoch": 0.5696879008123129, "grad_norm": 4.526655673980713, "learning_rate": 1.5895486935866983e-05, "loss": 0.3571, "step": 2665 }, { "epoch": 0.5707567336468576, "grad_norm": 5.318846225738525, "learning_rate": 1.5883610451306414e-05, "loss": 0.235, "step": 2670 }, { "epoch": 0.5718255664814023, "grad_norm": 4.3493571281433105, "learning_rate": 1.5871733966745844e-05, "loss": 0.3939, "step": 2675 }, { "epoch": 0.572894399315947, "grad_norm": 4.984584331512451, "learning_rate": 1.5859857482185275e-05, "loss": 0.3041, "step": 2680 }, { "epoch": 0.5739632321504916, "grad_norm": 5.118055820465088, "learning_rate": 1.5847980997624702e-05, "loss": 0.398, "step": 2685 }, { "epoch": 0.5750320649850363, "grad_norm": 3.7693285942077637, "learning_rate": 1.5836104513064136e-05, "loss": 0.4327, "step": 2690 }, { "epoch": 0.5761008978195811, "grad_norm": 4.113673210144043, "learning_rate": 1.5824228028503563e-05, "loss": 0.3622, "step": 2695 }, { "epoch": 0.5771697306541257, "grad_norm": 4.5102386474609375, "learning_rate": 1.5812351543942994e-05, "loss": 0.3461, "step": 2700 }, { "epoch": 0.5782385634886704, "grad_norm": 4.592400074005127, "learning_rate": 1.5800475059382424e-05, "loss": 0.409, "step": 2705 }, { "epoch": 0.579307396323215, "grad_norm": 4.869931697845459, "learning_rate": 1.5788598574821855e-05, "loss": 0.3561, "step": 2710 }, { "epoch": 0.5803762291577598, "grad_norm": 4.971279621124268, "learning_rate": 1.5776722090261285e-05, "loss": 0.3608, "step": 2715 }, { "epoch": 0.5814450619923044, "grad_norm": 4.390021324157715, "learning_rate": 1.5764845605700713e-05, "loss": 0.3824, "step": 2720 }, { "epoch": 0.5825138948268491, "grad_norm": 4.252533912658691, "learning_rate": 1.5752969121140143e-05, "loss": 0.3403, "step": 2725 }, { "epoch": 0.5835827276613937, "grad_norm": 4.273214817047119, "learning_rate": 1.5741092636579574e-05, "loss": 0.4133, "step": 2730 }, { "epoch": 0.5846515604959385, "grad_norm": 6.121555328369141, "learning_rate": 1.5729216152019004e-05, "loss": 0.4104, "step": 2735 }, { "epoch": 0.5857203933304831, "grad_norm": 4.297682762145996, "learning_rate": 1.5717339667458435e-05, "loss": 0.2704, "step": 2740 }, { "epoch": 0.5867892261650278, "grad_norm": 3.2164599895477295, "learning_rate": 1.5705463182897865e-05, "loss": 0.275, "step": 2745 }, { "epoch": 0.5878580589995724, "grad_norm": 5.293271541595459, "learning_rate": 1.5693586698337293e-05, "loss": 0.3642, "step": 2750 }, { "epoch": 0.5889268918341172, "grad_norm": 7.932840824127197, "learning_rate": 1.5681710213776723e-05, "loss": 0.3882, "step": 2755 }, { "epoch": 0.5899957246686618, "grad_norm": 4.301117897033691, "learning_rate": 1.5669833729216154e-05, "loss": 0.445, "step": 2760 }, { "epoch": 0.5910645575032065, "grad_norm": 5.11594820022583, "learning_rate": 1.5657957244655584e-05, "loss": 0.275, "step": 2765 }, { "epoch": 0.5921333903377511, "grad_norm": 6.5174384117126465, "learning_rate": 1.564608076009501e-05, "loss": 0.3727, "step": 2770 }, { "epoch": 0.5932022231722959, "grad_norm": 4.847846508026123, "learning_rate": 1.5634204275534445e-05, "loss": 0.3458, "step": 2775 }, { "epoch": 0.5942710560068405, "grad_norm": 4.418210983276367, "learning_rate": 1.5622327790973872e-05, "loss": 0.3775, "step": 2780 }, { "epoch": 0.5953398888413852, "grad_norm": 4.810405731201172, "learning_rate": 1.5610451306413303e-05, "loss": 0.3731, "step": 2785 }, { "epoch": 0.5964087216759298, "grad_norm": 3.9812352657318115, "learning_rate": 1.5598574821852734e-05, "loss": 0.3073, "step": 2790 }, { "epoch": 0.5974775545104746, "grad_norm": 4.542743682861328, "learning_rate": 1.5586698337292164e-05, "loss": 0.3113, "step": 2795 }, { "epoch": 0.5985463873450192, "grad_norm": 4.736385345458984, "learning_rate": 1.557482185273159e-05, "loss": 0.3666, "step": 2800 }, { "epoch": 0.5996152201795639, "grad_norm": 5.22001314163208, "learning_rate": 1.5562945368171022e-05, "loss": 0.3769, "step": 2805 }, { "epoch": 0.6006840530141085, "grad_norm": 5.7952680587768555, "learning_rate": 1.5551068883610452e-05, "loss": 0.3267, "step": 2810 }, { "epoch": 0.6017528858486533, "grad_norm": 4.174045085906982, "learning_rate": 1.5539192399049883e-05, "loss": 0.2513, "step": 2815 }, { "epoch": 0.6028217186831979, "grad_norm": 3.869800090789795, "learning_rate": 1.552731591448931e-05, "loss": 0.273, "step": 2820 }, { "epoch": 0.6038905515177426, "grad_norm": 4.380319118499756, "learning_rate": 1.5515439429928744e-05, "loss": 0.3712, "step": 2825 }, { "epoch": 0.6049593843522874, "grad_norm": 3.972041368484497, "learning_rate": 1.550356294536817e-05, "loss": 0.4728, "step": 2830 }, { "epoch": 0.606028217186832, "grad_norm": 5.212732791900635, "learning_rate": 1.5491686460807602e-05, "loss": 0.3608, "step": 2835 }, { "epoch": 0.6070970500213767, "grad_norm": 5.559129238128662, "learning_rate": 1.5479809976247032e-05, "loss": 0.3765, "step": 2840 }, { "epoch": 0.6081658828559213, "grad_norm": 4.520800590515137, "learning_rate": 1.5467933491686463e-05, "loss": 0.3285, "step": 2845 }, { "epoch": 0.609234715690466, "grad_norm": 6.37885856628418, "learning_rate": 1.5456057007125893e-05, "loss": 0.4822, "step": 2850 }, { "epoch": 0.6103035485250107, "grad_norm": 3.292531967163086, "learning_rate": 1.544418052256532e-05, "loss": 0.3185, "step": 2855 }, { "epoch": 0.6113723813595554, "grad_norm": 4.683765411376953, "learning_rate": 1.5432304038004754e-05, "loss": 0.3027, "step": 2860 }, { "epoch": 0.6124412141941, "grad_norm": 6.004202365875244, "learning_rate": 1.542042755344418e-05, "loss": 0.3819, "step": 2865 }, { "epoch": 0.6135100470286448, "grad_norm": 4.668170928955078, "learning_rate": 1.5408551068883612e-05, "loss": 0.2819, "step": 2870 }, { "epoch": 0.6145788798631894, "grad_norm": 6.2482781410217285, "learning_rate": 1.5396674584323043e-05, "loss": 0.3369, "step": 2875 }, { "epoch": 0.6156477126977341, "grad_norm": 4.60993766784668, "learning_rate": 1.5384798099762473e-05, "loss": 0.331, "step": 2880 }, { "epoch": 0.6167165455322787, "grad_norm": 5.188110828399658, "learning_rate": 1.53729216152019e-05, "loss": 0.3662, "step": 2885 }, { "epoch": 0.6177853783668235, "grad_norm": 5.201088905334473, "learning_rate": 1.536104513064133e-05, "loss": 0.3472, "step": 2890 }, { "epoch": 0.6188542112013681, "grad_norm": 5.363198280334473, "learning_rate": 1.534916864608076e-05, "loss": 0.387, "step": 2895 }, { "epoch": 0.6199230440359128, "grad_norm": 5.238138198852539, "learning_rate": 1.5337292161520192e-05, "loss": 0.3017, "step": 2900 }, { "epoch": 0.6209918768704574, "grad_norm": 4.188523769378662, "learning_rate": 1.532541567695962e-05, "loss": 0.2628, "step": 2905 }, { "epoch": 0.6220607097050022, "grad_norm": 4.730754852294922, "learning_rate": 1.5313539192399053e-05, "loss": 0.2683, "step": 2910 }, { "epoch": 0.6231295425395468, "grad_norm": 3.7036404609680176, "learning_rate": 1.530166270783848e-05, "loss": 0.3189, "step": 2915 }, { "epoch": 0.6241983753740915, "grad_norm": 4.961543560028076, "learning_rate": 1.528978622327791e-05, "loss": 0.3389, "step": 2920 }, { "epoch": 0.6252672082086361, "grad_norm": 4.376546859741211, "learning_rate": 1.527790973871734e-05, "loss": 0.3552, "step": 2925 }, { "epoch": 0.6263360410431809, "grad_norm": 3.2792232036590576, "learning_rate": 1.5266033254156772e-05, "loss": 0.2784, "step": 2930 }, { "epoch": 0.6274048738777255, "grad_norm": 4.739627838134766, "learning_rate": 1.5254156769596201e-05, "loss": 0.3416, "step": 2935 }, { "epoch": 0.6284737067122702, "grad_norm": 4.889829635620117, "learning_rate": 1.5242280285035631e-05, "loss": 0.3805, "step": 2940 }, { "epoch": 0.6295425395468148, "grad_norm": 5.562602519989014, "learning_rate": 1.523040380047506e-05, "loss": 0.4616, "step": 2945 }, { "epoch": 0.6306113723813596, "grad_norm": 6.154614448547363, "learning_rate": 1.521852731591449e-05, "loss": 0.3584, "step": 2950 }, { "epoch": 0.6316802052159042, "grad_norm": 4.117344856262207, "learning_rate": 1.520665083135392e-05, "loss": 0.3439, "step": 2955 }, { "epoch": 0.6327490380504489, "grad_norm": 4.961648941040039, "learning_rate": 1.519477434679335e-05, "loss": 0.3569, "step": 2960 }, { "epoch": 0.6338178708849936, "grad_norm": 4.030764579772949, "learning_rate": 1.5182897862232779e-05, "loss": 0.2775, "step": 2965 }, { "epoch": 0.6348867037195383, "grad_norm": 5.615406036376953, "learning_rate": 1.517102137767221e-05, "loss": 0.3758, "step": 2970 }, { "epoch": 0.635955536554083, "grad_norm": 5.250066757202148, "learning_rate": 1.5159144893111638e-05, "loss": 0.4178, "step": 2975 }, { "epoch": 0.6370243693886276, "grad_norm": 3.862907648086548, "learning_rate": 1.514726840855107e-05, "loss": 0.2725, "step": 2980 }, { "epoch": 0.6380932022231723, "grad_norm": 7.1906023025512695, "learning_rate": 1.51353919239905e-05, "loss": 0.4838, "step": 2985 }, { "epoch": 0.639162035057717, "grad_norm": 4.240938663482666, "learning_rate": 1.512351543942993e-05, "loss": 0.3184, "step": 2990 }, { "epoch": 0.6402308678922617, "grad_norm": 5.662024974822998, "learning_rate": 1.511163895486936e-05, "loss": 0.3265, "step": 2995 }, { "epoch": 0.6412997007268063, "grad_norm": 5.721799850463867, "learning_rate": 1.509976247030879e-05, "loss": 0.3344, "step": 3000 }, { "epoch": 0.642368533561351, "grad_norm": 4.524104118347168, "learning_rate": 1.508788598574822e-05, "loss": 0.284, "step": 3005 }, { "epoch": 0.6434373663958957, "grad_norm": 4.907393455505371, "learning_rate": 1.5076009501187649e-05, "loss": 0.3337, "step": 3010 }, { "epoch": 0.6445061992304404, "grad_norm": 4.567984580993652, "learning_rate": 1.5064133016627081e-05, "loss": 0.3085, "step": 3015 }, { "epoch": 0.645575032064985, "grad_norm": 6.088601589202881, "learning_rate": 1.505225653206651e-05, "loss": 0.3685, "step": 3020 }, { "epoch": 0.6466438648995297, "grad_norm": 5.842155456542969, "learning_rate": 1.504038004750594e-05, "loss": 0.4621, "step": 3025 }, { "epoch": 0.6477126977340744, "grad_norm": 4.505978584289551, "learning_rate": 1.502850356294537e-05, "loss": 0.2958, "step": 3030 }, { "epoch": 0.6487815305686191, "grad_norm": 3.832209825515747, "learning_rate": 1.50166270783848e-05, "loss": 0.4165, "step": 3035 }, { "epoch": 0.6498503634031637, "grad_norm": 3.149580240249634, "learning_rate": 1.5004750593824229e-05, "loss": 0.336, "step": 3040 }, { "epoch": 0.6509191962377084, "grad_norm": 4.5704121589660645, "learning_rate": 1.499287410926366e-05, "loss": 0.328, "step": 3045 }, { "epoch": 0.6519880290722531, "grad_norm": 5.424034595489502, "learning_rate": 1.4980997624703088e-05, "loss": 0.3256, "step": 3050 }, { "epoch": 0.6530568619067978, "grad_norm": 4.873384475708008, "learning_rate": 1.4969121140142519e-05, "loss": 0.2877, "step": 3055 }, { "epoch": 0.6541256947413424, "grad_norm": 4.21671199798584, "learning_rate": 1.4957244655581948e-05, "loss": 0.3468, "step": 3060 }, { "epoch": 0.6551945275758871, "grad_norm": 4.723153591156006, "learning_rate": 1.494536817102138e-05, "loss": 0.3836, "step": 3065 }, { "epoch": 0.6562633604104318, "grad_norm": 4.3572587966918945, "learning_rate": 1.4933491686460809e-05, "loss": 0.3084, "step": 3070 }, { "epoch": 0.6573321932449765, "grad_norm": 4.8245439529418945, "learning_rate": 1.492161520190024e-05, "loss": 0.2879, "step": 3075 }, { "epoch": 0.6584010260795211, "grad_norm": 4.260484218597412, "learning_rate": 1.4909738717339668e-05, "loss": 0.2836, "step": 3080 }, { "epoch": 0.6594698589140658, "grad_norm": 3.668529748916626, "learning_rate": 1.4897862232779099e-05, "loss": 0.3049, "step": 3085 }, { "epoch": 0.6605386917486105, "grad_norm": 5.860143661499023, "learning_rate": 1.4885985748218528e-05, "loss": 0.3972, "step": 3090 }, { "epoch": 0.6616075245831552, "grad_norm": 3.9581236839294434, "learning_rate": 1.4874109263657958e-05, "loss": 0.3189, "step": 3095 }, { "epoch": 0.6626763574176999, "grad_norm": 2.8415067195892334, "learning_rate": 1.4862232779097387e-05, "loss": 0.207, "step": 3100 }, { "epoch": 0.6637451902522445, "grad_norm": 5.096329689025879, "learning_rate": 1.485035629453682e-05, "loss": 0.2844, "step": 3105 }, { "epoch": 0.6648140230867893, "grad_norm": 5.822755813598633, "learning_rate": 1.4838479809976248e-05, "loss": 0.3583, "step": 3110 }, { "epoch": 0.6658828559213339, "grad_norm": 5.467360019683838, "learning_rate": 1.4826603325415679e-05, "loss": 0.2681, "step": 3115 }, { "epoch": 0.6669516887558786, "grad_norm": 5.418729305267334, "learning_rate": 1.4814726840855107e-05, "loss": 0.3788, "step": 3120 }, { "epoch": 0.6680205215904232, "grad_norm": 5.312787055969238, "learning_rate": 1.4802850356294538e-05, "loss": 0.3335, "step": 3125 }, { "epoch": 0.669089354424968, "grad_norm": 4.632271766662598, "learning_rate": 1.4790973871733969e-05, "loss": 0.2958, "step": 3130 }, { "epoch": 0.6701581872595126, "grad_norm": 5.137240886688232, "learning_rate": 1.4779097387173397e-05, "loss": 0.343, "step": 3135 }, { "epoch": 0.6712270200940573, "grad_norm": 4.227065086364746, "learning_rate": 1.4767220902612828e-05, "loss": 0.3026, "step": 3140 }, { "epoch": 0.672295852928602, "grad_norm": 4.9906110763549805, "learning_rate": 1.4755344418052257e-05, "loss": 0.3575, "step": 3145 }, { "epoch": 0.6733646857631467, "grad_norm": 6.338077545166016, "learning_rate": 1.4743467933491689e-05, "loss": 0.3962, "step": 3150 }, { "epoch": 0.6744335185976913, "grad_norm": 5.018848896026611, "learning_rate": 1.4731591448931118e-05, "loss": 0.292, "step": 3155 }, { "epoch": 0.675502351432236, "grad_norm": 5.4188432693481445, "learning_rate": 1.4719714964370548e-05, "loss": 0.4226, "step": 3160 }, { "epoch": 0.6765711842667806, "grad_norm": 5.020565032958984, "learning_rate": 1.4707838479809977e-05, "loss": 0.3999, "step": 3165 }, { "epoch": 0.6776400171013254, "grad_norm": 5.457892894744873, "learning_rate": 1.4695961995249408e-05, "loss": 0.3949, "step": 3170 }, { "epoch": 0.67870884993587, "grad_norm": 4.842294216156006, "learning_rate": 1.4684085510688837e-05, "loss": 0.2844, "step": 3175 }, { "epoch": 0.6797776827704147, "grad_norm": 4.515163421630859, "learning_rate": 1.4672209026128267e-05, "loss": 0.3003, "step": 3180 }, { "epoch": 0.6808465156049593, "grad_norm": 3.4031429290771484, "learning_rate": 1.4660332541567696e-05, "loss": 0.2636, "step": 3185 }, { "epoch": 0.6819153484395041, "grad_norm": 4.693248748779297, "learning_rate": 1.4648456057007128e-05, "loss": 0.2334, "step": 3190 }, { "epoch": 0.6829841812740487, "grad_norm": 4.690431118011475, "learning_rate": 1.4636579572446557e-05, "loss": 0.2574, "step": 3195 }, { "epoch": 0.6840530141085934, "grad_norm": 3.9794492721557617, "learning_rate": 1.4624703087885988e-05, "loss": 0.3476, "step": 3200 }, { "epoch": 0.685121846943138, "grad_norm": 4.062690258026123, "learning_rate": 1.4612826603325417e-05, "loss": 0.3763, "step": 3205 }, { "epoch": 0.6861906797776828, "grad_norm": 2.888495683670044, "learning_rate": 1.4600950118764847e-05, "loss": 0.2873, "step": 3210 }, { "epoch": 0.6872595126122274, "grad_norm": 4.061041355133057, "learning_rate": 1.4589073634204276e-05, "loss": 0.2859, "step": 3215 }, { "epoch": 0.6883283454467721, "grad_norm": 5.954913139343262, "learning_rate": 1.4577197149643707e-05, "loss": 0.3335, "step": 3220 }, { "epoch": 0.6893971782813167, "grad_norm": 4.9537434577941895, "learning_rate": 1.4565320665083135e-05, "loss": 0.3712, "step": 3225 }, { "epoch": 0.6904660111158615, "grad_norm": 3.5754384994506836, "learning_rate": 1.4553444180522566e-05, "loss": 0.4072, "step": 3230 }, { "epoch": 0.6915348439504062, "grad_norm": 6.583157062530518, "learning_rate": 1.4541567695961995e-05, "loss": 0.3442, "step": 3235 }, { "epoch": 0.6926036767849508, "grad_norm": 4.144803524017334, "learning_rate": 1.4529691211401427e-05, "loss": 0.32, "step": 3240 }, { "epoch": 0.6936725096194956, "grad_norm": 3.350670576095581, "learning_rate": 1.4517814726840856e-05, "loss": 0.3076, "step": 3245 }, { "epoch": 0.6947413424540402, "grad_norm": 3.798152208328247, "learning_rate": 1.4505938242280287e-05, "loss": 0.3134, "step": 3250 }, { "epoch": 0.6958101752885849, "grad_norm": 4.410452365875244, "learning_rate": 1.4494061757719715e-05, "loss": 0.3155, "step": 3255 }, { "epoch": 0.6968790081231295, "grad_norm": 5.064853191375732, "learning_rate": 1.4482185273159146e-05, "loss": 0.3097, "step": 3260 }, { "epoch": 0.6979478409576743, "grad_norm": 5.49769401550293, "learning_rate": 1.4470308788598575e-05, "loss": 0.2727, "step": 3265 }, { "epoch": 0.6990166737922189, "grad_norm": 4.130645751953125, "learning_rate": 1.4458432304038005e-05, "loss": 0.3666, "step": 3270 }, { "epoch": 0.7000855066267636, "grad_norm": 5.358222484588623, "learning_rate": 1.4446555819477438e-05, "loss": 0.3049, "step": 3275 }, { "epoch": 0.7011543394613082, "grad_norm": 3.783137559890747, "learning_rate": 1.4434679334916866e-05, "loss": 0.3506, "step": 3280 }, { "epoch": 0.702223172295853, "grad_norm": 4.486612319946289, "learning_rate": 1.4422802850356297e-05, "loss": 0.3027, "step": 3285 }, { "epoch": 0.7032920051303976, "grad_norm": 5.604061126708984, "learning_rate": 1.4410926365795726e-05, "loss": 0.2706, "step": 3290 }, { "epoch": 0.7043608379649423, "grad_norm": 5.663457870483398, "learning_rate": 1.4399049881235156e-05, "loss": 0.3165, "step": 3295 }, { "epoch": 0.7054296707994869, "grad_norm": 4.874339580535889, "learning_rate": 1.4387173396674585e-05, "loss": 0.3567, "step": 3300 }, { "epoch": 0.7064985036340317, "grad_norm": 5.478762626647949, "learning_rate": 1.4375296912114016e-05, "loss": 0.2795, "step": 3305 }, { "epoch": 0.7075673364685763, "grad_norm": 4.213021278381348, "learning_rate": 1.4363420427553445e-05, "loss": 0.2905, "step": 3310 }, { "epoch": 0.708636169303121, "grad_norm": 4.549129009246826, "learning_rate": 1.4351543942992875e-05, "loss": 0.2946, "step": 3315 }, { "epoch": 0.7097050021376656, "grad_norm": 4.900253772735596, "learning_rate": 1.4339667458432304e-05, "loss": 0.298, "step": 3320 }, { "epoch": 0.7107738349722104, "grad_norm": 5.591811656951904, "learning_rate": 1.4327790973871736e-05, "loss": 0.289, "step": 3325 }, { "epoch": 0.711842667806755, "grad_norm": 3.1972029209136963, "learning_rate": 1.4315914489311165e-05, "loss": 0.3194, "step": 3330 }, { "epoch": 0.7129115006412997, "grad_norm": 3.692401647567749, "learning_rate": 1.4304038004750596e-05, "loss": 0.2719, "step": 3335 }, { "epoch": 0.7139803334758443, "grad_norm": 6.502699851989746, "learning_rate": 1.4292161520190025e-05, "loss": 0.3079, "step": 3340 }, { "epoch": 0.7150491663103891, "grad_norm": 4.761363506317139, "learning_rate": 1.4280285035629455e-05, "loss": 0.3373, "step": 3345 }, { "epoch": 0.7161179991449337, "grad_norm": 5.628553867340088, "learning_rate": 1.4268408551068884e-05, "loss": 0.3103, "step": 3350 }, { "epoch": 0.7171868319794784, "grad_norm": 5.576054096221924, "learning_rate": 1.4256532066508314e-05, "loss": 0.3384, "step": 3355 }, { "epoch": 0.718255664814023, "grad_norm": 4.364500999450684, "learning_rate": 1.4244655581947743e-05, "loss": 0.3785, "step": 3360 }, { "epoch": 0.7193244976485678, "grad_norm": 2.8248353004455566, "learning_rate": 1.4232779097387176e-05, "loss": 0.2583, "step": 3365 }, { "epoch": 0.7203933304831125, "grad_norm": 5.5604987144470215, "learning_rate": 1.4220902612826604e-05, "loss": 0.2992, "step": 3370 }, { "epoch": 0.7214621633176571, "grad_norm": 4.8770527839660645, "learning_rate": 1.4209026128266035e-05, "loss": 0.2196, "step": 3375 }, { "epoch": 0.7225309961522018, "grad_norm": 4.998085021972656, "learning_rate": 1.4197149643705464e-05, "loss": 0.3438, "step": 3380 }, { "epoch": 0.7235998289867465, "grad_norm": 4.125364303588867, "learning_rate": 1.4185273159144894e-05, "loss": 0.333, "step": 3385 }, { "epoch": 0.7246686618212912, "grad_norm": 5.174322605133057, "learning_rate": 1.4173396674584323e-05, "loss": 0.4422, "step": 3390 }, { "epoch": 0.7257374946558358, "grad_norm": 4.850910186767578, "learning_rate": 1.4161520190023754e-05, "loss": 0.458, "step": 3395 }, { "epoch": 0.7268063274903805, "grad_norm": 4.238053321838379, "learning_rate": 1.4149643705463183e-05, "loss": 0.2526, "step": 3400 }, { "epoch": 0.7278751603249252, "grad_norm": 4.8868842124938965, "learning_rate": 1.4137767220902613e-05, "loss": 0.2443, "step": 3405 }, { "epoch": 0.7289439931594699, "grad_norm": 6.352740287780762, "learning_rate": 1.4125890736342045e-05, "loss": 0.4024, "step": 3410 }, { "epoch": 0.7300128259940145, "grad_norm": 3.7694151401519775, "learning_rate": 1.4114014251781474e-05, "loss": 0.3057, "step": 3415 }, { "epoch": 0.7310816588285592, "grad_norm": 4.326847553253174, "learning_rate": 1.4102137767220905e-05, "loss": 0.3417, "step": 3420 }, { "epoch": 0.7321504916631039, "grad_norm": 4.306587219238281, "learning_rate": 1.4090261282660334e-05, "loss": 0.3535, "step": 3425 }, { "epoch": 0.7332193244976486, "grad_norm": 4.4991044998168945, "learning_rate": 1.4078384798099764e-05, "loss": 0.3814, "step": 3430 }, { "epoch": 0.7342881573321932, "grad_norm": 4.0679779052734375, "learning_rate": 1.4066508313539193e-05, "loss": 0.3196, "step": 3435 }, { "epoch": 0.735356990166738, "grad_norm": 4.0540666580200195, "learning_rate": 1.4054631828978624e-05, "loss": 0.2738, "step": 3440 }, { "epoch": 0.7364258230012826, "grad_norm": 4.532857894897461, "learning_rate": 1.4042755344418053e-05, "loss": 0.2127, "step": 3445 }, { "epoch": 0.7374946558358273, "grad_norm": 4.681793212890625, "learning_rate": 1.4030878859857485e-05, "loss": 0.2415, "step": 3450 }, { "epoch": 0.7385634886703719, "grad_norm": 5.458173751831055, "learning_rate": 1.4019002375296914e-05, "loss": 0.368, "step": 3455 }, { "epoch": 0.7396323215049166, "grad_norm": 4.303793430328369, "learning_rate": 1.4007125890736344e-05, "loss": 0.2965, "step": 3460 }, { "epoch": 0.7407011543394613, "grad_norm": 5.24821138381958, "learning_rate": 1.3995249406175773e-05, "loss": 0.3676, "step": 3465 }, { "epoch": 0.741769987174006, "grad_norm": 7.041927337646484, "learning_rate": 1.3983372921615204e-05, "loss": 0.4793, "step": 3470 }, { "epoch": 0.7428388200085506, "grad_norm": 4.38003396987915, "learning_rate": 1.3971496437054632e-05, "loss": 0.2924, "step": 3475 }, { "epoch": 0.7439076528430953, "grad_norm": 4.844277858734131, "learning_rate": 1.3959619952494063e-05, "loss": 0.3051, "step": 3480 }, { "epoch": 0.74497648567764, "grad_norm": 4.943488121032715, "learning_rate": 1.3947743467933492e-05, "loss": 0.3206, "step": 3485 }, { "epoch": 0.7460453185121847, "grad_norm": 3.5360701084136963, "learning_rate": 1.3935866983372922e-05, "loss": 0.3062, "step": 3490 }, { "epoch": 0.7471141513467293, "grad_norm": 4.964517116546631, "learning_rate": 1.3923990498812351e-05, "loss": 0.3099, "step": 3495 }, { "epoch": 0.748182984181274, "grad_norm": 4.1770124435424805, "learning_rate": 1.3912114014251783e-05, "loss": 0.3528, "step": 3500 }, { "epoch": 0.7492518170158188, "grad_norm": 4.830697059631348, "learning_rate": 1.3900237529691212e-05, "loss": 0.3075, "step": 3505 }, { "epoch": 0.7503206498503634, "grad_norm": 4.7558512687683105, "learning_rate": 1.3888361045130643e-05, "loss": 0.3132, "step": 3510 }, { "epoch": 0.7513894826849081, "grad_norm": 5.082642555236816, "learning_rate": 1.3876484560570072e-05, "loss": 0.3789, "step": 3515 }, { "epoch": 0.7524583155194527, "grad_norm": 5.486532211303711, "learning_rate": 1.3864608076009502e-05, "loss": 0.3316, "step": 3520 }, { "epoch": 0.7535271483539975, "grad_norm": 4.763543605804443, "learning_rate": 1.3852731591448931e-05, "loss": 0.3113, "step": 3525 }, { "epoch": 0.7545959811885421, "grad_norm": 4.146590709686279, "learning_rate": 1.3840855106888362e-05, "loss": 0.2481, "step": 3530 }, { "epoch": 0.7556648140230868, "grad_norm": 4.292271614074707, "learning_rate": 1.382897862232779e-05, "loss": 0.3174, "step": 3535 }, { "epoch": 0.7567336468576314, "grad_norm": 5.971374988555908, "learning_rate": 1.3817102137767223e-05, "loss": 0.3116, "step": 3540 }, { "epoch": 0.7578024796921762, "grad_norm": 4.599390983581543, "learning_rate": 1.3805225653206652e-05, "loss": 0.29, "step": 3545 }, { "epoch": 0.7588713125267208, "grad_norm": 3.7273731231689453, "learning_rate": 1.3793349168646082e-05, "loss": 0.33, "step": 3550 }, { "epoch": 0.7599401453612655, "grad_norm": 3.681992530822754, "learning_rate": 1.3781472684085513e-05, "loss": 0.2002, "step": 3555 }, { "epoch": 0.7610089781958101, "grad_norm": 5.324198246002197, "learning_rate": 1.3769596199524942e-05, "loss": 0.3566, "step": 3560 }, { "epoch": 0.7620778110303549, "grad_norm": 4.434847354888916, "learning_rate": 1.3757719714964372e-05, "loss": 0.2618, "step": 3565 }, { "epoch": 0.7631466438648995, "grad_norm": 5.279498100280762, "learning_rate": 1.3745843230403801e-05, "loss": 0.316, "step": 3570 }, { "epoch": 0.7642154766994442, "grad_norm": 3.4741098880767822, "learning_rate": 1.3733966745843233e-05, "loss": 0.2997, "step": 3575 }, { "epoch": 0.7652843095339888, "grad_norm": 4.7899909019470215, "learning_rate": 1.372209026128266e-05, "loss": 0.2809, "step": 3580 }, { "epoch": 0.7663531423685336, "grad_norm": 4.318710803985596, "learning_rate": 1.3710213776722093e-05, "loss": 0.2023, "step": 3585 }, { "epoch": 0.7674219752030782, "grad_norm": 4.148991107940674, "learning_rate": 1.3698337292161522e-05, "loss": 0.2726, "step": 3590 }, { "epoch": 0.7684908080376229, "grad_norm": 5.0960373878479, "learning_rate": 1.3686460807600952e-05, "loss": 0.2878, "step": 3595 }, { "epoch": 0.7695596408721675, "grad_norm": 5.928832530975342, "learning_rate": 1.3674584323040381e-05, "loss": 0.4026, "step": 3600 }, { "epoch": 0.7706284737067123, "grad_norm": 4.24060583114624, "learning_rate": 1.3662707838479811e-05, "loss": 0.3205, "step": 3605 }, { "epoch": 0.7716973065412569, "grad_norm": 4.517853736877441, "learning_rate": 1.365083135391924e-05, "loss": 0.3092, "step": 3610 }, { "epoch": 0.7727661393758016, "grad_norm": 5.5383501052856445, "learning_rate": 1.3638954869358671e-05, "loss": 0.3249, "step": 3615 }, { "epoch": 0.7738349722103463, "grad_norm": 3.5598056316375732, "learning_rate": 1.36270783847981e-05, "loss": 0.2898, "step": 3620 }, { "epoch": 0.774903805044891, "grad_norm": 5.0517578125, "learning_rate": 1.3615201900237532e-05, "loss": 0.3464, "step": 3625 }, { "epoch": 0.7759726378794357, "grad_norm": 4.764474868774414, "learning_rate": 1.360332541567696e-05, "loss": 0.3755, "step": 3630 }, { "epoch": 0.7770414707139803, "grad_norm": 4.272229194641113, "learning_rate": 1.3591448931116391e-05, "loss": 0.3236, "step": 3635 }, { "epoch": 0.7781103035485251, "grad_norm": 4.496946811676025, "learning_rate": 1.357957244655582e-05, "loss": 0.3298, "step": 3640 }, { "epoch": 0.7791791363830697, "grad_norm": 3.3338801860809326, "learning_rate": 1.356769596199525e-05, "loss": 0.3301, "step": 3645 }, { "epoch": 0.7802479692176144, "grad_norm": 4.775890350341797, "learning_rate": 1.355581947743468e-05, "loss": 0.2428, "step": 3650 }, { "epoch": 0.781316802052159, "grad_norm": 3.7741811275482178, "learning_rate": 1.354394299287411e-05, "loss": 0.2789, "step": 3655 }, { "epoch": 0.7823856348867038, "grad_norm": 5.699966907501221, "learning_rate": 1.3532066508313539e-05, "loss": 0.4398, "step": 3660 }, { "epoch": 0.7834544677212484, "grad_norm": 5.20950174331665, "learning_rate": 1.352019002375297e-05, "loss": 0.3211, "step": 3665 }, { "epoch": 0.7845233005557931, "grad_norm": 4.900545120239258, "learning_rate": 1.3508313539192398e-05, "loss": 0.3079, "step": 3670 }, { "epoch": 0.7855921333903377, "grad_norm": 4.627389907836914, "learning_rate": 1.349643705463183e-05, "loss": 0.2765, "step": 3675 }, { "epoch": 0.7866609662248825, "grad_norm": 3.996687889099121, "learning_rate": 1.348456057007126e-05, "loss": 0.2414, "step": 3680 }, { "epoch": 0.7877297990594271, "grad_norm": 4.968347072601318, "learning_rate": 1.347268408551069e-05, "loss": 0.3142, "step": 3685 }, { "epoch": 0.7887986318939718, "grad_norm": 5.365523815155029, "learning_rate": 1.346080760095012e-05, "loss": 0.4895, "step": 3690 }, { "epoch": 0.7898674647285164, "grad_norm": 3.6716244220733643, "learning_rate": 1.344893111638955e-05, "loss": 0.3058, "step": 3695 }, { "epoch": 0.7909362975630612, "grad_norm": 3.6110551357269287, "learning_rate": 1.343705463182898e-05, "loss": 0.2568, "step": 3700 }, { "epoch": 0.7920051303976058, "grad_norm": 3.8466339111328125, "learning_rate": 1.3425178147268409e-05, "loss": 0.2505, "step": 3705 }, { "epoch": 0.7930739632321505, "grad_norm": 6.473718643188477, "learning_rate": 1.3413301662707841e-05, "loss": 0.3416, "step": 3710 }, { "epoch": 0.7941427960666951, "grad_norm": 4.931123733520508, "learning_rate": 1.340142517814727e-05, "loss": 0.2867, "step": 3715 }, { "epoch": 0.7952116289012399, "grad_norm": 4.821789741516113, "learning_rate": 1.33895486935867e-05, "loss": 0.2696, "step": 3720 }, { "epoch": 0.7962804617357845, "grad_norm": 3.5999889373779297, "learning_rate": 1.337767220902613e-05, "loss": 0.293, "step": 3725 }, { "epoch": 0.7973492945703292, "grad_norm": 3.716235637664795, "learning_rate": 1.336579572446556e-05, "loss": 0.2741, "step": 3730 }, { "epoch": 0.7984181274048738, "grad_norm": 3.1744401454925537, "learning_rate": 1.3353919239904989e-05, "loss": 0.3276, "step": 3735 }, { "epoch": 0.7994869602394186, "grad_norm": 4.65699577331543, "learning_rate": 1.334204275534442e-05, "loss": 0.2688, "step": 3740 }, { "epoch": 0.8005557930739632, "grad_norm": 3.338193416595459, "learning_rate": 1.3330166270783848e-05, "loss": 0.2408, "step": 3745 }, { "epoch": 0.8016246259085079, "grad_norm": 4.22088098526001, "learning_rate": 1.3318289786223279e-05, "loss": 0.2926, "step": 3750 }, { "epoch": 0.8026934587430525, "grad_norm": 5.624631881713867, "learning_rate": 1.3306413301662708e-05, "loss": 0.3119, "step": 3755 }, { "epoch": 0.8037622915775973, "grad_norm": 3.8507394790649414, "learning_rate": 1.329453681710214e-05, "loss": 0.3018, "step": 3760 }, { "epoch": 0.804831124412142, "grad_norm": 4.6665239334106445, "learning_rate": 1.3282660332541569e-05, "loss": 0.3448, "step": 3765 }, { "epoch": 0.8058999572466866, "grad_norm": 4.100464344024658, "learning_rate": 1.3270783847981e-05, "loss": 0.3539, "step": 3770 }, { "epoch": 0.8069687900812313, "grad_norm": 6.0533623695373535, "learning_rate": 1.3258907363420428e-05, "loss": 0.2776, "step": 3775 }, { "epoch": 0.808037622915776, "grad_norm": 3.781015396118164, "learning_rate": 1.3247030878859859e-05, "loss": 0.2255, "step": 3780 }, { "epoch": 0.8091064557503207, "grad_norm": 5.616995334625244, "learning_rate": 1.3235154394299288e-05, "loss": 0.2507, "step": 3785 }, { "epoch": 0.8101752885848653, "grad_norm": 5.021564960479736, "learning_rate": 1.3223277909738718e-05, "loss": 0.3463, "step": 3790 }, { "epoch": 0.81124412141941, "grad_norm": 4.946634769439697, "learning_rate": 1.3211401425178147e-05, "loss": 0.2849, "step": 3795 }, { "epoch": 0.8123129542539547, "grad_norm": 3.1573128700256348, "learning_rate": 1.319952494061758e-05, "loss": 0.2678, "step": 3800 }, { "epoch": 0.8133817870884994, "grad_norm": 5.302856922149658, "learning_rate": 1.3187648456057008e-05, "loss": 0.3446, "step": 3805 }, { "epoch": 0.814450619923044, "grad_norm": 5.2195820808410645, "learning_rate": 1.3175771971496439e-05, "loss": 0.344, "step": 3810 }, { "epoch": 0.8155194527575887, "grad_norm": 5.514340877532959, "learning_rate": 1.3163895486935867e-05, "loss": 0.3305, "step": 3815 }, { "epoch": 0.8165882855921334, "grad_norm": 4.197089195251465, "learning_rate": 1.3152019002375298e-05, "loss": 0.2728, "step": 3820 }, { "epoch": 0.8176571184266781, "grad_norm": 4.766973972320557, "learning_rate": 1.3140142517814727e-05, "loss": 0.4181, "step": 3825 }, { "epoch": 0.8187259512612227, "grad_norm": 5.202324390411377, "learning_rate": 1.3128266033254157e-05, "loss": 0.3351, "step": 3830 }, { "epoch": 0.8197947840957674, "grad_norm": 3.472627878189087, "learning_rate": 1.311638954869359e-05, "loss": 0.2646, "step": 3835 }, { "epoch": 0.8208636169303121, "grad_norm": 4.589137554168701, "learning_rate": 1.3104513064133017e-05, "loss": 0.2628, "step": 3840 }, { "epoch": 0.8219324497648568, "grad_norm": 3.9725475311279297, "learning_rate": 1.3092636579572449e-05, "loss": 0.2747, "step": 3845 }, { "epoch": 0.8230012825994014, "grad_norm": 3.832432985305786, "learning_rate": 1.3080760095011878e-05, "loss": 0.2253, "step": 3850 }, { "epoch": 0.8240701154339461, "grad_norm": 4.213531494140625, "learning_rate": 1.3068883610451308e-05, "loss": 0.2741, "step": 3855 }, { "epoch": 0.8251389482684908, "grad_norm": 6.430481910705566, "learning_rate": 1.3057007125890737e-05, "loss": 0.3982, "step": 3860 }, { "epoch": 0.8262077811030355, "grad_norm": 2.416151762008667, "learning_rate": 1.3045130641330168e-05, "loss": 0.3014, "step": 3865 }, { "epoch": 0.8272766139375801, "grad_norm": 4.334439754486084, "learning_rate": 1.3033254156769597e-05, "loss": 0.2696, "step": 3870 }, { "epoch": 0.8283454467721248, "grad_norm": 3.599234104156494, "learning_rate": 1.3021377672209027e-05, "loss": 0.2607, "step": 3875 }, { "epoch": 0.8294142796066695, "grad_norm": 4.65981388092041, "learning_rate": 1.3009501187648456e-05, "loss": 0.3154, "step": 3880 }, { "epoch": 0.8304831124412142, "grad_norm": 5.147418975830078, "learning_rate": 1.2997624703087888e-05, "loss": 0.3275, "step": 3885 }, { "epoch": 0.8315519452757588, "grad_norm": 4.910894870758057, "learning_rate": 1.2985748218527317e-05, "loss": 0.274, "step": 3890 }, { "epoch": 0.8326207781103035, "grad_norm": 3.3270483016967773, "learning_rate": 1.2973871733966748e-05, "loss": 0.3042, "step": 3895 }, { "epoch": 0.8336896109448483, "grad_norm": 5.005611419677734, "learning_rate": 1.2961995249406177e-05, "loss": 0.2692, "step": 3900 }, { "epoch": 0.8347584437793929, "grad_norm": 3.320770263671875, "learning_rate": 1.2950118764845607e-05, "loss": 0.2505, "step": 3905 }, { "epoch": 0.8358272766139376, "grad_norm": 4.788522720336914, "learning_rate": 1.2938242280285036e-05, "loss": 0.3762, "step": 3910 }, { "epoch": 0.8368961094484823, "grad_norm": 5.107404708862305, "learning_rate": 1.2926365795724467e-05, "loss": 0.2467, "step": 3915 }, { "epoch": 0.837964942283027, "grad_norm": 3.5440781116485596, "learning_rate": 1.2914489311163895e-05, "loss": 0.2227, "step": 3920 }, { "epoch": 0.8390337751175716, "grad_norm": 5.089791774749756, "learning_rate": 1.2902612826603326e-05, "loss": 0.2513, "step": 3925 }, { "epoch": 0.8401026079521163, "grad_norm": 5.978660583496094, "learning_rate": 1.2890736342042755e-05, "loss": 0.313, "step": 3930 }, { "epoch": 0.841171440786661, "grad_norm": 4.347848415374756, "learning_rate": 1.2878859857482187e-05, "loss": 0.265, "step": 3935 }, { "epoch": 0.8422402736212057, "grad_norm": 5.038461208343506, "learning_rate": 1.2866983372921616e-05, "loss": 0.2839, "step": 3940 }, { "epoch": 0.8433091064557503, "grad_norm": 4.367410659790039, "learning_rate": 1.2855106888361046e-05, "loss": 0.3432, "step": 3945 }, { "epoch": 0.844377939290295, "grad_norm": 4.267697334289551, "learning_rate": 1.2843230403800475e-05, "loss": 0.2168, "step": 3950 }, { "epoch": 0.8454467721248397, "grad_norm": 4.99351167678833, "learning_rate": 1.2831353919239906e-05, "loss": 0.3083, "step": 3955 }, { "epoch": 0.8465156049593844, "grad_norm": 3.725167751312256, "learning_rate": 1.2819477434679335e-05, "loss": 0.3362, "step": 3960 }, { "epoch": 0.847584437793929, "grad_norm": 4.825465679168701, "learning_rate": 1.2807600950118765e-05, "loss": 0.2897, "step": 3965 }, { "epoch": 0.8486532706284737, "grad_norm": 4.231856822967529, "learning_rate": 1.2795724465558198e-05, "loss": 0.299, "step": 3970 }, { "epoch": 0.8497221034630184, "grad_norm": 3.8439395427703857, "learning_rate": 1.2783847980997626e-05, "loss": 0.3421, "step": 3975 }, { "epoch": 0.8507909362975631, "grad_norm": 4.338144779205322, "learning_rate": 1.2771971496437057e-05, "loss": 0.2886, "step": 3980 }, { "epoch": 0.8518597691321077, "grad_norm": 5.123786449432373, "learning_rate": 1.2760095011876486e-05, "loss": 0.3563, "step": 3985 }, { "epoch": 0.8529286019666524, "grad_norm": 5.506287574768066, "learning_rate": 1.2748218527315916e-05, "loss": 0.3204, "step": 3990 }, { "epoch": 0.853997434801197, "grad_norm": 3.644973039627075, "learning_rate": 1.2736342042755345e-05, "loss": 0.3025, "step": 3995 }, { "epoch": 0.8550662676357418, "grad_norm": 5.109133720397949, "learning_rate": 1.2724465558194776e-05, "loss": 0.2813, "step": 4000 }, { "epoch": 0.8561351004702864, "grad_norm": 5.544173717498779, "learning_rate": 1.2712589073634205e-05, "loss": 0.2787, "step": 4005 }, { "epoch": 0.8572039333048311, "grad_norm": 5.382670879364014, "learning_rate": 1.2700712589073637e-05, "loss": 0.2643, "step": 4010 }, { "epoch": 0.8582727661393758, "grad_norm": 5.406363010406494, "learning_rate": 1.2688836104513064e-05, "loss": 0.291, "step": 4015 }, { "epoch": 0.8593415989739205, "grad_norm": 3.5062954425811768, "learning_rate": 1.2676959619952496e-05, "loss": 0.2467, "step": 4020 }, { "epoch": 0.8604104318084651, "grad_norm": 5.817686080932617, "learning_rate": 1.2665083135391925e-05, "loss": 0.3489, "step": 4025 }, { "epoch": 0.8614792646430098, "grad_norm": 3.931792974472046, "learning_rate": 1.2653206650831356e-05, "loss": 0.2613, "step": 4030 }, { "epoch": 0.8625480974775546, "grad_norm": 4.279338359832764, "learning_rate": 1.2641330166270785e-05, "loss": 0.3007, "step": 4035 }, { "epoch": 0.8636169303120992, "grad_norm": 3.9646289348602295, "learning_rate": 1.2629453681710215e-05, "loss": 0.2685, "step": 4040 }, { "epoch": 0.8646857631466439, "grad_norm": 5.029911518096924, "learning_rate": 1.2617577197149644e-05, "loss": 0.2984, "step": 4045 }, { "epoch": 0.8657545959811885, "grad_norm": 4.78744649887085, "learning_rate": 1.2605700712589074e-05, "loss": 0.2321, "step": 4050 }, { "epoch": 0.8668234288157333, "grad_norm": 3.825188636779785, "learning_rate": 1.2593824228028503e-05, "loss": 0.2417, "step": 4055 }, { "epoch": 0.8678922616502779, "grad_norm": 4.478353500366211, "learning_rate": 1.2581947743467936e-05, "loss": 0.3164, "step": 4060 }, { "epoch": 0.8689610944848226, "grad_norm": 5.523867607116699, "learning_rate": 1.2570071258907364e-05, "loss": 0.3769, "step": 4065 }, { "epoch": 0.8700299273193672, "grad_norm": 6.190155506134033, "learning_rate": 1.2558194774346795e-05, "loss": 0.3385, "step": 4070 }, { "epoch": 0.871098760153912, "grad_norm": 4.058770179748535, "learning_rate": 1.2546318289786224e-05, "loss": 0.3135, "step": 4075 }, { "epoch": 0.8721675929884566, "grad_norm": 5.607039928436279, "learning_rate": 1.2534441805225654e-05, "loss": 0.3295, "step": 4080 }, { "epoch": 0.8732364258230013, "grad_norm": 4.902414321899414, "learning_rate": 1.2522565320665083e-05, "loss": 0.2992, "step": 4085 }, { "epoch": 0.8743052586575459, "grad_norm": 4.188961505889893, "learning_rate": 1.2510688836104514e-05, "loss": 0.2723, "step": 4090 }, { "epoch": 0.8753740914920907, "grad_norm": 4.536145210266113, "learning_rate": 1.2498812351543943e-05, "loss": 0.2805, "step": 4095 }, { "epoch": 0.8764429243266353, "grad_norm": 3.7727832794189453, "learning_rate": 1.2486935866983373e-05, "loss": 0.2171, "step": 4100 }, { "epoch": 0.87751175716118, "grad_norm": 4.528228759765625, "learning_rate": 1.2475059382422802e-05, "loss": 0.2618, "step": 4105 }, { "epoch": 0.8785805899957246, "grad_norm": 4.920950412750244, "learning_rate": 1.2463182897862234e-05, "loss": 0.2994, "step": 4110 }, { "epoch": 0.8796494228302694, "grad_norm": 4.851797580718994, "learning_rate": 1.2451306413301665e-05, "loss": 0.2866, "step": 4115 }, { "epoch": 0.880718255664814, "grad_norm": 3.021509885787964, "learning_rate": 1.2439429928741094e-05, "loss": 0.2091, "step": 4120 }, { "epoch": 0.8817870884993587, "grad_norm": 5.19913911819458, "learning_rate": 1.2427553444180524e-05, "loss": 0.3285, "step": 4125 }, { "epoch": 0.8828559213339033, "grad_norm": 4.311760902404785, "learning_rate": 1.2415676959619953e-05, "loss": 0.2854, "step": 4130 }, { "epoch": 0.8839247541684481, "grad_norm": 5.5093994140625, "learning_rate": 1.2403800475059384e-05, "loss": 0.3004, "step": 4135 }, { "epoch": 0.8849935870029927, "grad_norm": 3.5908706188201904, "learning_rate": 1.2391923990498813e-05, "loss": 0.2335, "step": 4140 }, { "epoch": 0.8860624198375374, "grad_norm": 3.561647653579712, "learning_rate": 1.2380047505938245e-05, "loss": 0.2919, "step": 4145 }, { "epoch": 0.887131252672082, "grad_norm": 3.1781160831451416, "learning_rate": 1.2368171021377674e-05, "loss": 0.2786, "step": 4150 }, { "epoch": 0.8882000855066268, "grad_norm": 4.471413612365723, "learning_rate": 1.2356294536817104e-05, "loss": 0.3312, "step": 4155 }, { "epoch": 0.8892689183411714, "grad_norm": 5.232965469360352, "learning_rate": 1.2344418052256533e-05, "loss": 0.2677, "step": 4160 }, { "epoch": 0.8903377511757161, "grad_norm": 4.883133888244629, "learning_rate": 1.2332541567695964e-05, "loss": 0.2774, "step": 4165 }, { "epoch": 0.8914065840102608, "grad_norm": 4.092249393463135, "learning_rate": 1.2320665083135392e-05, "loss": 0.2649, "step": 4170 }, { "epoch": 0.8924754168448055, "grad_norm": 3.5607283115386963, "learning_rate": 1.2308788598574823e-05, "loss": 0.3119, "step": 4175 }, { "epoch": 0.8935442496793502, "grad_norm": 4.573966026306152, "learning_rate": 1.2296912114014252e-05, "loss": 0.243, "step": 4180 }, { "epoch": 0.8946130825138948, "grad_norm": 4.2962775230407715, "learning_rate": 1.2285035629453684e-05, "loss": 0.292, "step": 4185 }, { "epoch": 0.8956819153484396, "grad_norm": 4.585544109344482, "learning_rate": 1.2273159144893111e-05, "loss": 0.352, "step": 4190 }, { "epoch": 0.8967507481829842, "grad_norm": 4.529600143432617, "learning_rate": 1.2261282660332543e-05, "loss": 0.2422, "step": 4195 }, { "epoch": 0.8978195810175289, "grad_norm": 2.9587581157684326, "learning_rate": 1.2249406175771972e-05, "loss": 0.2427, "step": 4200 }, { "epoch": 0.8988884138520735, "grad_norm": 4.409660339355469, "learning_rate": 1.2237529691211403e-05, "loss": 0.2246, "step": 4205 }, { "epoch": 0.8999572466866183, "grad_norm": 3.328666925430298, "learning_rate": 1.2225653206650832e-05, "loss": 0.2275, "step": 4210 }, { "epoch": 0.9010260795211629, "grad_norm": 4.411447048187256, "learning_rate": 1.2213776722090262e-05, "loss": 0.3766, "step": 4215 }, { "epoch": 0.9020949123557076, "grad_norm": 3.3779454231262207, "learning_rate": 1.2201900237529691e-05, "loss": 0.2748, "step": 4220 }, { "epoch": 0.9031637451902522, "grad_norm": 5.558443069458008, "learning_rate": 1.2190023752969122e-05, "loss": 0.2941, "step": 4225 }, { "epoch": 0.904232578024797, "grad_norm": 3.7313380241394043, "learning_rate": 1.217814726840855e-05, "loss": 0.2693, "step": 4230 }, { "epoch": 0.9053014108593416, "grad_norm": 3.5401077270507812, "learning_rate": 1.2166270783847983e-05, "loss": 0.3058, "step": 4235 }, { "epoch": 0.9063702436938863, "grad_norm": 3.6305854320526123, "learning_rate": 1.2154394299287412e-05, "loss": 0.2167, "step": 4240 }, { "epoch": 0.9074390765284309, "grad_norm": 4.208883285522461, "learning_rate": 1.2142517814726842e-05, "loss": 0.2371, "step": 4245 }, { "epoch": 0.9085079093629757, "grad_norm": 4.586354732513428, "learning_rate": 1.2130641330166273e-05, "loss": 0.2199, "step": 4250 }, { "epoch": 0.9095767421975203, "grad_norm": 3.673724889755249, "learning_rate": 1.2118764845605702e-05, "loss": 0.2803, "step": 4255 }, { "epoch": 0.910645575032065, "grad_norm": 4.0301337242126465, "learning_rate": 1.2106888361045132e-05, "loss": 0.2765, "step": 4260 }, { "epoch": 0.9117144078666096, "grad_norm": 4.114202976226807, "learning_rate": 1.2095011876484561e-05, "loss": 0.2734, "step": 4265 }, { "epoch": 0.9127832407011544, "grad_norm": 6.415131568908691, "learning_rate": 1.2083135391923993e-05, "loss": 0.3345, "step": 4270 }, { "epoch": 0.913852073535699, "grad_norm": 4.800512790679932, "learning_rate": 1.207125890736342e-05, "loss": 0.2873, "step": 4275 }, { "epoch": 0.9149209063702437, "grad_norm": 4.536464214324951, "learning_rate": 1.2059382422802853e-05, "loss": 0.2506, "step": 4280 }, { "epoch": 0.9159897392047883, "grad_norm": 4.594064235687256, "learning_rate": 1.2047505938242281e-05, "loss": 0.2335, "step": 4285 }, { "epoch": 0.917058572039333, "grad_norm": 5.493027687072754, "learning_rate": 1.2035629453681712e-05, "loss": 0.3218, "step": 4290 }, { "epoch": 0.9181274048738777, "grad_norm": 4.560657501220703, "learning_rate": 1.2023752969121141e-05, "loss": 0.2971, "step": 4295 }, { "epoch": 0.9191962377084224, "grad_norm": 3.5777430534362793, "learning_rate": 1.2011876484560571e-05, "loss": 0.2296, "step": 4300 }, { "epoch": 0.9202650705429671, "grad_norm": 4.112082481384277, "learning_rate": 1.2e-05, "loss": 0.3087, "step": 4305 }, { "epoch": 0.9213339033775118, "grad_norm": 3.815093994140625, "learning_rate": 1.1988123515439431e-05, "loss": 0.3353, "step": 4310 }, { "epoch": 0.9224027362120565, "grad_norm": 5.078567028045654, "learning_rate": 1.197624703087886e-05, "loss": 0.3046, "step": 4315 }, { "epoch": 0.9234715690466011, "grad_norm": 3.549429178237915, "learning_rate": 1.1964370546318292e-05, "loss": 0.3431, "step": 4320 }, { "epoch": 0.9245404018811458, "grad_norm": 4.466531276702881, "learning_rate": 1.195249406175772e-05, "loss": 0.2707, "step": 4325 }, { "epoch": 0.9256092347156905, "grad_norm": 5.423553943634033, "learning_rate": 1.1940617577197151e-05, "loss": 0.284, "step": 4330 }, { "epoch": 0.9266780675502352, "grad_norm": 4.436051845550537, "learning_rate": 1.192874109263658e-05, "loss": 0.2714, "step": 4335 }, { "epoch": 0.9277469003847798, "grad_norm": 4.404295444488525, "learning_rate": 1.191686460807601e-05, "loss": 0.2751, "step": 4340 }, { "epoch": 0.9288157332193245, "grad_norm": 4.390391826629639, "learning_rate": 1.190498812351544e-05, "loss": 0.3047, "step": 4345 }, { "epoch": 0.9298845660538692, "grad_norm": 4.6937479972839355, "learning_rate": 1.189311163895487e-05, "loss": 0.2867, "step": 4350 }, { "epoch": 0.9309533988884139, "grad_norm": 4.352549076080322, "learning_rate": 1.1881235154394299e-05, "loss": 0.2895, "step": 4355 }, { "epoch": 0.9320222317229585, "grad_norm": 4.013473033905029, "learning_rate": 1.186935866983373e-05, "loss": 0.2749, "step": 4360 }, { "epoch": 0.9330910645575032, "grad_norm": 3.603860378265381, "learning_rate": 1.1857482185273158e-05, "loss": 0.2549, "step": 4365 }, { "epoch": 0.9341598973920479, "grad_norm": 5.079062461853027, "learning_rate": 1.184560570071259e-05, "loss": 0.2648, "step": 4370 }, { "epoch": 0.9352287302265926, "grad_norm": 6.029326438903809, "learning_rate": 1.183372921615202e-05, "loss": 0.3001, "step": 4375 }, { "epoch": 0.9362975630611372, "grad_norm": 4.8559041023254395, "learning_rate": 1.182185273159145e-05, "loss": 0.3198, "step": 4380 }, { "epoch": 0.9373663958956819, "grad_norm": 4.295980453491211, "learning_rate": 1.1809976247030879e-05, "loss": 0.2583, "step": 4385 }, { "epoch": 0.9384352287302266, "grad_norm": 6.648914337158203, "learning_rate": 1.179809976247031e-05, "loss": 0.2894, "step": 4390 }, { "epoch": 0.9395040615647713, "grad_norm": 5.454647064208984, "learning_rate": 1.178622327790974e-05, "loss": 0.3017, "step": 4395 }, { "epoch": 0.9405728943993159, "grad_norm": 5.520369529724121, "learning_rate": 1.1774346793349169e-05, "loss": 0.2754, "step": 4400 }, { "epoch": 0.9416417272338606, "grad_norm": 3.847935914993286, "learning_rate": 1.1762470308788601e-05, "loss": 0.3289, "step": 4405 }, { "epoch": 0.9427105600684053, "grad_norm": 4.063333988189697, "learning_rate": 1.175059382422803e-05, "loss": 0.2787, "step": 4410 }, { "epoch": 0.94377939290295, "grad_norm": 4.977645397186279, "learning_rate": 1.173871733966746e-05, "loss": 0.2406, "step": 4415 }, { "epoch": 0.9448482257374946, "grad_norm": 4.375988483428955, "learning_rate": 1.172684085510689e-05, "loss": 0.3144, "step": 4420 }, { "epoch": 0.9459170585720393, "grad_norm": 4.656064987182617, "learning_rate": 1.171496437054632e-05, "loss": 0.3237, "step": 4425 }, { "epoch": 0.946985891406584, "grad_norm": 4.027129650115967, "learning_rate": 1.1703087885985749e-05, "loss": 0.2641, "step": 4430 }, { "epoch": 0.9480547242411287, "grad_norm": 4.126834869384766, "learning_rate": 1.169121140142518e-05, "loss": 0.2875, "step": 4435 }, { "epoch": 0.9491235570756734, "grad_norm": 3.4707841873168945, "learning_rate": 1.1679334916864608e-05, "loss": 0.3211, "step": 4440 }, { "epoch": 0.950192389910218, "grad_norm": 2.8617501258850098, "learning_rate": 1.166745843230404e-05, "loss": 0.2403, "step": 4445 }, { "epoch": 0.9512612227447628, "grad_norm": 4.50408935546875, "learning_rate": 1.1655581947743468e-05, "loss": 0.3018, "step": 4450 }, { "epoch": 0.9523300555793074, "grad_norm": 3.976015329360962, "learning_rate": 1.16437054631829e-05, "loss": 0.2531, "step": 4455 }, { "epoch": 0.9533988884138521, "grad_norm": 6.214652061462402, "learning_rate": 1.1631828978622329e-05, "loss": 0.349, "step": 4460 }, { "epoch": 0.9544677212483967, "grad_norm": 3.969996929168701, "learning_rate": 1.161995249406176e-05, "loss": 0.238, "step": 4465 }, { "epoch": 0.9555365540829415, "grad_norm": 3.9902470111846924, "learning_rate": 1.1608076009501188e-05, "loss": 0.2768, "step": 4470 }, { "epoch": 0.9566053869174861, "grad_norm": 4.20414924621582, "learning_rate": 1.1596199524940619e-05, "loss": 0.2944, "step": 4475 }, { "epoch": 0.9576742197520308, "grad_norm": 3.5199337005615234, "learning_rate": 1.1584323040380048e-05, "loss": 0.3043, "step": 4480 }, { "epoch": 0.9587430525865754, "grad_norm": 3.7765684127807617, "learning_rate": 1.1572446555819478e-05, "loss": 0.2434, "step": 4485 }, { "epoch": 0.9598118854211202, "grad_norm": 3.9338152408599854, "learning_rate": 1.1560570071258907e-05, "loss": 0.2451, "step": 4490 }, { "epoch": 0.9608807182556648, "grad_norm": 2.86897873878479, "learning_rate": 1.154869358669834e-05, "loss": 0.213, "step": 4495 }, { "epoch": 0.9619495510902095, "grad_norm": 4.536627292633057, "learning_rate": 1.1536817102137768e-05, "loss": 0.26, "step": 4500 }, { "epoch": 0.9630183839247541, "grad_norm": 5.863621234893799, "learning_rate": 1.1524940617577199e-05, "loss": 0.3173, "step": 4505 }, { "epoch": 0.9640872167592989, "grad_norm": 5.156888008117676, "learning_rate": 1.1513064133016627e-05, "loss": 0.2745, "step": 4510 }, { "epoch": 0.9651560495938435, "grad_norm": 3.947845220565796, "learning_rate": 1.1501187648456058e-05, "loss": 0.2824, "step": 4515 }, { "epoch": 0.9662248824283882, "grad_norm": 3.6855573654174805, "learning_rate": 1.1489311163895487e-05, "loss": 0.2769, "step": 4520 }, { "epoch": 0.9672937152629328, "grad_norm": 3.929898977279663, "learning_rate": 1.1477434679334917e-05, "loss": 0.2464, "step": 4525 }, { "epoch": 0.9683625480974776, "grad_norm": 3.9288270473480225, "learning_rate": 1.146555819477435e-05, "loss": 0.3213, "step": 4530 }, { "epoch": 0.9694313809320222, "grad_norm": 5.536011219024658, "learning_rate": 1.1453681710213777e-05, "loss": 0.3606, "step": 4535 }, { "epoch": 0.9705002137665669, "grad_norm": 3.3420379161834717, "learning_rate": 1.1441805225653209e-05, "loss": 0.2183, "step": 4540 }, { "epoch": 0.9715690466011115, "grad_norm": 3.492932081222534, "learning_rate": 1.1429928741092638e-05, "loss": 0.2567, "step": 4545 }, { "epoch": 0.9726378794356563, "grad_norm": 5.132521629333496, "learning_rate": 1.1418052256532068e-05, "loss": 0.2521, "step": 4550 }, { "epoch": 0.9737067122702009, "grad_norm": 4.512472152709961, "learning_rate": 1.1406175771971497e-05, "loss": 0.2696, "step": 4555 }, { "epoch": 0.9747755451047456, "grad_norm": 5.246362686157227, "learning_rate": 1.1394299287410928e-05, "loss": 0.3409, "step": 4560 }, { "epoch": 0.9758443779392902, "grad_norm": 4.033038139343262, "learning_rate": 1.1382422802850357e-05, "loss": 0.2732, "step": 4565 }, { "epoch": 0.976913210773835, "grad_norm": 4.162726879119873, "learning_rate": 1.1370546318289787e-05, "loss": 0.3003, "step": 4570 }, { "epoch": 0.9779820436083797, "grad_norm": 5.6553730964660645, "learning_rate": 1.1358669833729216e-05, "loss": 0.3426, "step": 4575 }, { "epoch": 0.9790508764429243, "grad_norm": 3.857776403427124, "learning_rate": 1.1346793349168648e-05, "loss": 0.2873, "step": 4580 }, { "epoch": 0.980119709277469, "grad_norm": 4.109443187713623, "learning_rate": 1.1334916864608077e-05, "loss": 0.3, "step": 4585 }, { "epoch": 0.9811885421120137, "grad_norm": 3.3073673248291016, "learning_rate": 1.1323040380047508e-05, "loss": 0.2074, "step": 4590 }, { "epoch": 0.9822573749465584, "grad_norm": 3.0706233978271484, "learning_rate": 1.1311163895486937e-05, "loss": 0.2521, "step": 4595 }, { "epoch": 0.983326207781103, "grad_norm": 5.8296356201171875, "learning_rate": 1.1299287410926367e-05, "loss": 0.3123, "step": 4600 }, { "epoch": 0.9843950406156478, "grad_norm": 3.409862995147705, "learning_rate": 1.1287410926365796e-05, "loss": 0.2492, "step": 4605 }, { "epoch": 0.9854638734501924, "grad_norm": 5.090631008148193, "learning_rate": 1.1275534441805227e-05, "loss": 0.3012, "step": 4610 }, { "epoch": 0.9865327062847371, "grad_norm": 6.443350315093994, "learning_rate": 1.1263657957244655e-05, "loss": 0.2516, "step": 4615 }, { "epoch": 0.9876015391192817, "grad_norm": 4.340301513671875, "learning_rate": 1.1251781472684088e-05, "loss": 0.3629, "step": 4620 }, { "epoch": 0.9886703719538265, "grad_norm": 4.117158889770508, "learning_rate": 1.1239904988123515e-05, "loss": 0.2484, "step": 4625 }, { "epoch": 0.9897392047883711, "grad_norm": 4.39588737487793, "learning_rate": 1.1228028503562947e-05, "loss": 0.2749, "step": 4630 }, { "epoch": 0.9908080376229158, "grad_norm": 4.059388637542725, "learning_rate": 1.1216152019002376e-05, "loss": 0.2064, "step": 4635 }, { "epoch": 0.9918768704574604, "grad_norm": 3.4412331581115723, "learning_rate": 1.1204275534441806e-05, "loss": 0.3089, "step": 4640 }, { "epoch": 0.9929457032920052, "grad_norm": 4.691385746002197, "learning_rate": 1.1192399049881235e-05, "loss": 0.3145, "step": 4645 }, { "epoch": 0.9940145361265498, "grad_norm": 3.472172737121582, "learning_rate": 1.1180522565320666e-05, "loss": 0.2357, "step": 4650 }, { "epoch": 0.9950833689610945, "grad_norm": 4.1867289543151855, "learning_rate": 1.1168646080760095e-05, "loss": 0.2803, "step": 4655 }, { "epoch": 0.9961522017956391, "grad_norm": 4.0518083572387695, "learning_rate": 1.1156769596199525e-05, "loss": 0.2437, "step": 4660 }, { "epoch": 0.9972210346301839, "grad_norm": 3.507197141647339, "learning_rate": 1.1144893111638954e-05, "loss": 0.2708, "step": 4665 }, { "epoch": 0.9982898674647285, "grad_norm": 5.1572585105896, "learning_rate": 1.1133016627078386e-05, "loss": 0.253, "step": 4670 }, { "epoch": 0.9993587002992732, "grad_norm": 4.823436737060547, "learning_rate": 1.1121140142517817e-05, "loss": 0.2219, "step": 4675 }, { "epoch": 1.0, "eval_loss": 0.1271175593137741, "eval_mrr": 0.9770190895741555, "eval_runtime": 313.9716, "eval_samples_per_second": 7.23, "eval_steps_per_second": 0.905, "step": 4678 }, { "epoch": 1.000427533133818, "grad_norm": 5.583497047424316, "learning_rate": 1.1109263657957246e-05, "loss": 0.2621, "step": 4680 }, { "epoch": 1.0014963659683624, "grad_norm": 4.658013343811035, "learning_rate": 1.1097387173396676e-05, "loss": 0.382, "step": 4685 }, { "epoch": 1.0025651988029072, "grad_norm": 3.0044312477111816, "learning_rate": 1.1085510688836105e-05, "loss": 0.3026, "step": 4690 }, { "epoch": 1.003634031637452, "grad_norm": 4.063423156738281, "learning_rate": 1.1073634204275536e-05, "loss": 0.3643, "step": 4695 }, { "epoch": 1.0047028644719966, "grad_norm": 4.625239372253418, "learning_rate": 1.1061757719714965e-05, "loss": 0.382, "step": 4700 }, { "epoch": 1.0057716973065411, "grad_norm": 3.8251540660858154, "learning_rate": 1.1049881235154397e-05, "loss": 0.3082, "step": 4705 }, { "epoch": 1.0068405301410859, "grad_norm": 4.241628170013428, "learning_rate": 1.1038004750593824e-05, "loss": 0.3411, "step": 4710 }, { "epoch": 1.0079093629756306, "grad_norm": 5.6527276039123535, "learning_rate": 1.1026128266033256e-05, "loss": 0.317, "step": 4715 }, { "epoch": 1.0089781958101753, "grad_norm": 5.0404052734375, "learning_rate": 1.1014251781472685e-05, "loss": 0.4396, "step": 4720 }, { "epoch": 1.01004702864472, "grad_norm": 4.585846900939941, "learning_rate": 1.1002375296912116e-05, "loss": 0.4034, "step": 4725 }, { "epoch": 1.0111158614792646, "grad_norm": 4.704357624053955, "learning_rate": 1.0990498812351544e-05, "loss": 0.2875, "step": 4730 }, { "epoch": 1.0121846943138093, "grad_norm": 5.956788063049316, "learning_rate": 1.0978622327790975e-05, "loss": 0.4919, "step": 4735 }, { "epoch": 1.013253527148354, "grad_norm": 4.240102291107178, "learning_rate": 1.0966745843230404e-05, "loss": 0.3118, "step": 4740 }, { "epoch": 1.0143223599828988, "grad_norm": 4.7897515296936035, "learning_rate": 1.0954869358669834e-05, "loss": 0.3976, "step": 4745 }, { "epoch": 1.0153911928174433, "grad_norm": 3.1631078720092773, "learning_rate": 1.0942992874109263e-05, "loss": 0.2919, "step": 4750 }, { "epoch": 1.016460025651988, "grad_norm": 4.258396148681641, "learning_rate": 1.0931116389548696e-05, "loss": 0.5247, "step": 4755 }, { "epoch": 1.0175288584865327, "grad_norm": 3.010542392730713, "learning_rate": 1.0919239904988124e-05, "loss": 0.2126, "step": 4760 }, { "epoch": 1.0185976913210775, "grad_norm": 3.0874409675598145, "learning_rate": 1.0907363420427555e-05, "loss": 0.3455, "step": 4765 }, { "epoch": 1.019666524155622, "grad_norm": 4.446132183074951, "learning_rate": 1.0895486935866984e-05, "loss": 0.3498, "step": 4770 }, { "epoch": 1.0207353569901667, "grad_norm": 4.1357502937316895, "learning_rate": 1.0883610451306414e-05, "loss": 0.29, "step": 4775 }, { "epoch": 1.0218041898247114, "grad_norm": 6.850640296936035, "learning_rate": 1.0871733966745843e-05, "loss": 0.3684, "step": 4780 }, { "epoch": 1.0228730226592562, "grad_norm": 3.9681396484375, "learning_rate": 1.0859857482185274e-05, "loss": 0.3033, "step": 4785 }, { "epoch": 1.0239418554938007, "grad_norm": 3.521563768386841, "learning_rate": 1.0847980997624703e-05, "loss": 0.2747, "step": 4790 }, { "epoch": 1.0250106883283454, "grad_norm": 4.060203552246094, "learning_rate": 1.0836104513064135e-05, "loss": 0.2813, "step": 4795 }, { "epoch": 1.0260795211628901, "grad_norm": 3.187224864959717, "learning_rate": 1.0824228028503562e-05, "loss": 0.2945, "step": 4800 }, { "epoch": 1.0271483539974349, "grad_norm": 3.6413896083831787, "learning_rate": 1.0812351543942994e-05, "loss": 0.421, "step": 4805 }, { "epoch": 1.0282171868319794, "grad_norm": 4.686298847198486, "learning_rate": 1.0800475059382423e-05, "loss": 0.3365, "step": 4810 }, { "epoch": 1.029286019666524, "grad_norm": 5.890400409698486, "learning_rate": 1.0788598574821854e-05, "loss": 0.3708, "step": 4815 }, { "epoch": 1.0303548525010688, "grad_norm": 3.566652774810791, "learning_rate": 1.0776722090261284e-05, "loss": 0.4345, "step": 4820 }, { "epoch": 1.0314236853356136, "grad_norm": 5.6578826904296875, "learning_rate": 1.0764845605700713e-05, "loss": 0.3728, "step": 4825 }, { "epoch": 1.032492518170158, "grad_norm": 4.193053245544434, "learning_rate": 1.0752969121140144e-05, "loss": 0.3173, "step": 4830 }, { "epoch": 1.0335613510047028, "grad_norm": 4.646356105804443, "learning_rate": 1.0741092636579572e-05, "loss": 0.2574, "step": 4835 }, { "epoch": 1.0346301838392475, "grad_norm": 3.941087245941162, "learning_rate": 1.0729216152019005e-05, "loss": 0.3128, "step": 4840 }, { "epoch": 1.0356990166737923, "grad_norm": 4.5648884773254395, "learning_rate": 1.0717339667458434e-05, "loss": 0.2542, "step": 4845 }, { "epoch": 1.036767849508337, "grad_norm": 3.661923408508301, "learning_rate": 1.0705463182897864e-05, "loss": 0.1649, "step": 4850 }, { "epoch": 1.0378366823428815, "grad_norm": 5.052914619445801, "learning_rate": 1.0693586698337293e-05, "loss": 0.4091, "step": 4855 }, { "epoch": 1.0389055151774262, "grad_norm": 5.769303321838379, "learning_rate": 1.0681710213776724e-05, "loss": 0.3553, "step": 4860 }, { "epoch": 1.039974348011971, "grad_norm": 8.323318481445312, "learning_rate": 1.0669833729216152e-05, "loss": 0.5474, "step": 4865 }, { "epoch": 1.0410431808465157, "grad_norm": 5.351403713226318, "learning_rate": 1.0657957244655583e-05, "loss": 0.3586, "step": 4870 }, { "epoch": 1.0421120136810602, "grad_norm": 3.5083069801330566, "learning_rate": 1.0646080760095012e-05, "loss": 0.2589, "step": 4875 }, { "epoch": 1.043180846515605, "grad_norm": 3.8574445247650146, "learning_rate": 1.0634204275534444e-05, "loss": 0.3143, "step": 4880 }, { "epoch": 1.0442496793501497, "grad_norm": 3.950756311416626, "learning_rate": 1.0622327790973871e-05, "loss": 0.3248, "step": 4885 }, { "epoch": 1.0453185121846944, "grad_norm": 5.606834411621094, "learning_rate": 1.0610451306413303e-05, "loss": 0.4631, "step": 4890 }, { "epoch": 1.046387345019239, "grad_norm": 4.092567443847656, "learning_rate": 1.0598574821852732e-05, "loss": 0.2977, "step": 4895 }, { "epoch": 1.0474561778537836, "grad_norm": 5.365922451019287, "learning_rate": 1.0586698337292163e-05, "loss": 0.2487, "step": 4900 }, { "epoch": 1.0485250106883284, "grad_norm": 5.173450946807861, "learning_rate": 1.0574821852731592e-05, "loss": 0.3554, "step": 4905 }, { "epoch": 1.049593843522873, "grad_norm": 5.21553373336792, "learning_rate": 1.0562945368171022e-05, "loss": 0.3579, "step": 4910 }, { "epoch": 1.0506626763574176, "grad_norm": 4.973548889160156, "learning_rate": 1.0551068883610451e-05, "loss": 0.3562, "step": 4915 }, { "epoch": 1.0517315091919623, "grad_norm": 6.216787815093994, "learning_rate": 1.0539192399049882e-05, "loss": 0.4625, "step": 4920 }, { "epoch": 1.052800342026507, "grad_norm": 4.355904579162598, "learning_rate": 1.052731591448931e-05, "loss": 0.2834, "step": 4925 }, { "epoch": 1.0538691748610518, "grad_norm": 4.44107723236084, "learning_rate": 1.0515439429928743e-05, "loss": 0.4072, "step": 4930 }, { "epoch": 1.0549380076955963, "grad_norm": 5.2141289710998535, "learning_rate": 1.0503562945368172e-05, "loss": 0.2831, "step": 4935 }, { "epoch": 1.056006840530141, "grad_norm": 4.4729228019714355, "learning_rate": 1.0491686460807602e-05, "loss": 0.2642, "step": 4940 }, { "epoch": 1.0570756733646858, "grad_norm": 4.615827560424805, "learning_rate": 1.0479809976247031e-05, "loss": 0.2887, "step": 4945 }, { "epoch": 1.0581445061992305, "grad_norm": 4.060108661651611, "learning_rate": 1.0467933491686462e-05, "loss": 0.4394, "step": 4950 }, { "epoch": 1.059213339033775, "grad_norm": 3.323357582092285, "learning_rate": 1.0456057007125892e-05, "loss": 0.2957, "step": 4955 }, { "epoch": 1.0602821718683197, "grad_norm": 3.9010369777679443, "learning_rate": 1.0444180522565321e-05, "loss": 0.298, "step": 4960 }, { "epoch": 1.0613510047028645, "grad_norm": 4.847980976104736, "learning_rate": 1.0432304038004753e-05, "loss": 0.2643, "step": 4965 }, { "epoch": 1.0624198375374092, "grad_norm": 4.916622638702393, "learning_rate": 1.0420427553444182e-05, "loss": 0.3147, "step": 4970 }, { "epoch": 1.063488670371954, "grad_norm": 6.059121131896973, "learning_rate": 1.0408551068883613e-05, "loss": 0.3433, "step": 4975 }, { "epoch": 1.0645575032064984, "grad_norm": 4.212458610534668, "learning_rate": 1.0396674584323041e-05, "loss": 0.2797, "step": 4980 }, { "epoch": 1.0656263360410432, "grad_norm": 3.9374332427978516, "learning_rate": 1.0384798099762472e-05, "loss": 0.2958, "step": 4985 }, { "epoch": 1.066695168875588, "grad_norm": 7.207469940185547, "learning_rate": 1.0372921615201901e-05, "loss": 0.3972, "step": 4990 }, { "epoch": 1.0677640017101326, "grad_norm": 5.122316360473633, "learning_rate": 1.0361045130641331e-05, "loss": 0.3361, "step": 4995 }, { "epoch": 1.0688328345446771, "grad_norm": 3.4103052616119385, "learning_rate": 1.034916864608076e-05, "loss": 0.3174, "step": 5000 }, { "epoch": 1.0699016673792219, "grad_norm": 4.129265308380127, "learning_rate": 1.033729216152019e-05, "loss": 0.3346, "step": 5005 }, { "epoch": 1.0709705002137666, "grad_norm": 4.027009963989258, "learning_rate": 1.032541567695962e-05, "loss": 0.326, "step": 5010 }, { "epoch": 1.0720393330483113, "grad_norm": 3.362579822540283, "learning_rate": 1.0313539192399052e-05, "loss": 0.2975, "step": 5015 }, { "epoch": 1.0731081658828558, "grad_norm": 5.225454330444336, "learning_rate": 1.030166270783848e-05, "loss": 0.3303, "step": 5020 }, { "epoch": 1.0741769987174006, "grad_norm": 3.756742000579834, "learning_rate": 1.0289786223277911e-05, "loss": 0.2285, "step": 5025 }, { "epoch": 1.0752458315519453, "grad_norm": 4.867086887359619, "learning_rate": 1.027790973871734e-05, "loss": 0.3798, "step": 5030 }, { "epoch": 1.07631466438649, "grad_norm": 4.204124927520752, "learning_rate": 1.026603325415677e-05, "loss": 0.2882, "step": 5035 }, { "epoch": 1.0773834972210345, "grad_norm": 4.995541095733643, "learning_rate": 1.02541567695962e-05, "loss": 0.4249, "step": 5040 }, { "epoch": 1.0784523300555793, "grad_norm": 4.921726226806641, "learning_rate": 1.024228028503563e-05, "loss": 0.4139, "step": 5045 }, { "epoch": 1.079521162890124, "grad_norm": 5.5460734367370605, "learning_rate": 1.0230403800475059e-05, "loss": 0.4502, "step": 5050 }, { "epoch": 1.0805899957246687, "grad_norm": 4.828423023223877, "learning_rate": 1.0218527315914491e-05, "loss": 0.3521, "step": 5055 }, { "epoch": 1.0816588285592132, "grad_norm": 3.87648344039917, "learning_rate": 1.0206650831353918e-05, "loss": 0.342, "step": 5060 }, { "epoch": 1.082727661393758, "grad_norm": 4.833287715911865, "learning_rate": 1.019477434679335e-05, "loss": 0.294, "step": 5065 }, { "epoch": 1.0837964942283027, "grad_norm": 4.559665679931641, "learning_rate": 1.018289786223278e-05, "loss": 0.2994, "step": 5070 }, { "epoch": 1.0848653270628474, "grad_norm": 4.908376216888428, "learning_rate": 1.017102137767221e-05, "loss": 0.3467, "step": 5075 }, { "epoch": 1.085934159897392, "grad_norm": 6.717689514160156, "learning_rate": 1.0159144893111639e-05, "loss": 0.443, "step": 5080 }, { "epoch": 1.0870029927319367, "grad_norm": 3.5398693084716797, "learning_rate": 1.014726840855107e-05, "loss": 0.2759, "step": 5085 }, { "epoch": 1.0880718255664814, "grad_norm": 4.501621246337891, "learning_rate": 1.0135391923990498e-05, "loss": 0.2614, "step": 5090 }, { "epoch": 1.0891406584010261, "grad_norm": 5.194151401519775, "learning_rate": 1.0123515439429929e-05, "loss": 0.3649, "step": 5095 }, { "epoch": 1.0902094912355706, "grad_norm": 4.68430757522583, "learning_rate": 1.0111638954869361e-05, "loss": 0.3699, "step": 5100 }, { "epoch": 1.0912783240701154, "grad_norm": 6.266822814941406, "learning_rate": 1.009976247030879e-05, "loss": 0.3785, "step": 5105 }, { "epoch": 1.09234715690466, "grad_norm": 5.040585517883301, "learning_rate": 1.008788598574822e-05, "loss": 0.325, "step": 5110 }, { "epoch": 1.0934159897392048, "grad_norm": 6.65608024597168, "learning_rate": 1.007600950118765e-05, "loss": 0.3954, "step": 5115 }, { "epoch": 1.0944848225737496, "grad_norm": 4.3081488609313965, "learning_rate": 1.006413301662708e-05, "loss": 0.2887, "step": 5120 }, { "epoch": 1.095553655408294, "grad_norm": 3.5742883682250977, "learning_rate": 1.0052256532066509e-05, "loss": 0.2565, "step": 5125 }, { "epoch": 1.0966224882428388, "grad_norm": 4.272683620452881, "learning_rate": 1.004038004750594e-05, "loss": 0.2661, "step": 5130 }, { "epoch": 1.0976913210773835, "grad_norm": 3.7064707279205322, "learning_rate": 1.0028503562945368e-05, "loss": 0.286, "step": 5135 }, { "epoch": 1.0987601539119283, "grad_norm": 6.0004963874816895, "learning_rate": 1.00166270783848e-05, "loss": 0.3969, "step": 5140 }, { "epoch": 1.0998289867464728, "grad_norm": 6.301701068878174, "learning_rate": 1.0004750593824228e-05, "loss": 0.2451, "step": 5145 }, { "epoch": 1.1008978195810175, "grad_norm": 5.471083164215088, "learning_rate": 9.992874109263658e-06, "loss": 0.2472, "step": 5150 }, { "epoch": 1.1019666524155622, "grad_norm": 6.456992149353027, "learning_rate": 9.980997624703089e-06, "loss": 0.286, "step": 5155 }, { "epoch": 1.103035485250107, "grad_norm": 6.616683006286621, "learning_rate": 9.969121140142518e-06, "loss": 0.3109, "step": 5160 }, { "epoch": 1.1041043180846515, "grad_norm": 5.748746871948242, "learning_rate": 9.95724465558195e-06, "loss": 0.441, "step": 5165 }, { "epoch": 1.1051731509191962, "grad_norm": 4.254424571990967, "learning_rate": 9.945368171021379e-06, "loss": 0.3494, "step": 5170 }, { "epoch": 1.106241983753741, "grad_norm": 6.022365093231201, "learning_rate": 9.93349168646081e-06, "loss": 0.345, "step": 5175 }, { "epoch": 1.1073108165882857, "grad_norm": 3.26804518699646, "learning_rate": 9.921615201900238e-06, "loss": 0.2503, "step": 5180 }, { "epoch": 1.1083796494228302, "grad_norm": 3.100945234298706, "learning_rate": 9.909738717339669e-06, "loss": 0.3922, "step": 5185 }, { "epoch": 1.109448482257375, "grad_norm": 4.631006717681885, "learning_rate": 9.897862232779099e-06, "loss": 0.3429, "step": 5190 }, { "epoch": 1.1105173150919196, "grad_norm": 4.623953819274902, "learning_rate": 9.885985748218528e-06, "loss": 0.3437, "step": 5195 }, { "epoch": 1.1115861479264644, "grad_norm": 3.877652406692505, "learning_rate": 9.874109263657959e-06, "loss": 0.2751, "step": 5200 }, { "epoch": 1.1126549807610089, "grad_norm": 4.4313225746154785, "learning_rate": 9.862232779097387e-06, "loss": 0.3634, "step": 5205 }, { "epoch": 1.1137238135955536, "grad_norm": 5.426332473754883, "learning_rate": 9.850356294536818e-06, "loss": 0.2685, "step": 5210 }, { "epoch": 1.1147926464300983, "grad_norm": 3.7707009315490723, "learning_rate": 9.838479809976248e-06, "loss": 0.2121, "step": 5215 }, { "epoch": 1.115861479264643, "grad_norm": 3.5573911666870117, "learning_rate": 9.826603325415677e-06, "loss": 0.4083, "step": 5220 }, { "epoch": 1.1169303120991876, "grad_norm": 3.2365455627441406, "learning_rate": 9.814726840855108e-06, "loss": 0.2497, "step": 5225 }, { "epoch": 1.1179991449337323, "grad_norm": 3.604321241378784, "learning_rate": 9.802850356294538e-06, "loss": 0.3521, "step": 5230 }, { "epoch": 1.119067977768277, "grad_norm": 4.779599666595459, "learning_rate": 9.790973871733967e-06, "loss": 0.3048, "step": 5235 }, { "epoch": 1.1201368106028218, "grad_norm": 3.685837745666504, "learning_rate": 9.779097387173398e-06, "loss": 0.2622, "step": 5240 }, { "epoch": 1.1212056434373663, "grad_norm": 4.687803268432617, "learning_rate": 9.767220902612827e-06, "loss": 0.2651, "step": 5245 }, { "epoch": 1.122274476271911, "grad_norm": 3.861872911453247, "learning_rate": 9.755344418052257e-06, "loss": 0.2971, "step": 5250 }, { "epoch": 1.1233433091064557, "grad_norm": 5.285345077514648, "learning_rate": 9.743467933491688e-06, "loss": 0.3186, "step": 5255 }, { "epoch": 1.1244121419410005, "grad_norm": 4.946507930755615, "learning_rate": 9.731591448931117e-06, "loss": 0.3277, "step": 5260 }, { "epoch": 1.1254809747755452, "grad_norm": 3.527979612350464, "learning_rate": 9.719714964370547e-06, "loss": 0.2521, "step": 5265 }, { "epoch": 1.1265498076100897, "grad_norm": 4.42695426940918, "learning_rate": 9.707838479809976e-06, "loss": 0.3114, "step": 5270 }, { "epoch": 1.1276186404446344, "grad_norm": 2.8614661693573, "learning_rate": 9.695961995249407e-06, "loss": 0.3582, "step": 5275 }, { "epoch": 1.1286874732791792, "grad_norm": 4.813942909240723, "learning_rate": 9.684085510688837e-06, "loss": 0.3617, "step": 5280 }, { "epoch": 1.129756306113724, "grad_norm": 4.115063667297363, "learning_rate": 9.672209026128266e-06, "loss": 0.328, "step": 5285 }, { "epoch": 1.1308251389482684, "grad_norm": 2.9611902236938477, "learning_rate": 9.660332541567697e-06, "loss": 0.2627, "step": 5290 }, { "epoch": 1.1318939717828131, "grad_norm": 4.242338180541992, "learning_rate": 9.648456057007125e-06, "loss": 0.3054, "step": 5295 }, { "epoch": 1.1329628046173579, "grad_norm": 3.4355380535125732, "learning_rate": 9.636579572446556e-06, "loss": 0.5342, "step": 5300 }, { "epoch": 1.1340316374519026, "grad_norm": 3.823155641555786, "learning_rate": 9.624703087885987e-06, "loss": 0.2956, "step": 5305 }, { "epoch": 1.1351004702864471, "grad_norm": 3.815985679626465, "learning_rate": 9.612826603325417e-06, "loss": 0.3008, "step": 5310 }, { "epoch": 1.1361693031209918, "grad_norm": 6.562064170837402, "learning_rate": 9.600950118764848e-06, "loss": 0.4844, "step": 5315 }, { "epoch": 1.1372381359555366, "grad_norm": 4.454050540924072, "learning_rate": 9.589073634204276e-06, "loss": 0.2941, "step": 5320 }, { "epoch": 1.1383069687900813, "grad_norm": 4.194582462310791, "learning_rate": 9.577197149643707e-06, "loss": 0.3928, "step": 5325 }, { "epoch": 1.1393758016246258, "grad_norm": 5.349386215209961, "learning_rate": 9.565320665083136e-06, "loss": 0.2804, "step": 5330 }, { "epoch": 1.1404446344591705, "grad_norm": 4.539825916290283, "learning_rate": 9.553444180522566e-06, "loss": 0.2605, "step": 5335 }, { "epoch": 1.1415134672937153, "grad_norm": 4.817893028259277, "learning_rate": 9.541567695961997e-06, "loss": 0.2871, "step": 5340 }, { "epoch": 1.14258230012826, "grad_norm": 3.5281782150268555, "learning_rate": 9.529691211401426e-06, "loss": 0.2106, "step": 5345 }, { "epoch": 1.1436511329628045, "grad_norm": 5.409219264984131, "learning_rate": 9.517814726840856e-06, "loss": 0.3053, "step": 5350 }, { "epoch": 1.1447199657973492, "grad_norm": 4.795240879058838, "learning_rate": 9.505938242280285e-06, "loss": 0.3719, "step": 5355 }, { "epoch": 1.145788798631894, "grad_norm": 6.551200866699219, "learning_rate": 9.494061757719716e-06, "loss": 0.396, "step": 5360 }, { "epoch": 1.1468576314664387, "grad_norm": 4.7790703773498535, "learning_rate": 9.482185273159146e-06, "loss": 0.4105, "step": 5365 }, { "epoch": 1.1479264643009834, "grad_norm": 5.062493801116943, "learning_rate": 9.470308788598575e-06, "loss": 0.3882, "step": 5370 }, { "epoch": 1.148995297135528, "grad_norm": 4.342947006225586, "learning_rate": 9.458432304038006e-06, "loss": 0.2815, "step": 5375 }, { "epoch": 1.1500641299700727, "grad_norm": 4.391014099121094, "learning_rate": 9.446555819477435e-06, "loss": 0.2477, "step": 5380 }, { "epoch": 1.1511329628046174, "grad_norm": 3.2322447299957275, "learning_rate": 9.434679334916865e-06, "loss": 0.2798, "step": 5385 }, { "epoch": 1.152201795639162, "grad_norm": 3.8520939350128174, "learning_rate": 9.422802850356296e-06, "loss": 0.2758, "step": 5390 }, { "epoch": 1.1532706284737066, "grad_norm": 3.970700740814209, "learning_rate": 9.410926365795725e-06, "loss": 0.2938, "step": 5395 }, { "epoch": 1.1543394613082514, "grad_norm": 4.378193378448486, "learning_rate": 9.399049881235155e-06, "loss": 0.3946, "step": 5400 }, { "epoch": 1.155408294142796, "grad_norm": 3.779149293899536, "learning_rate": 9.387173396674586e-06, "loss": 0.2348, "step": 5405 }, { "epoch": 1.1564771269773408, "grad_norm": 3.6311495304107666, "learning_rate": 9.375296912114015e-06, "loss": 0.2701, "step": 5410 }, { "epoch": 1.1575459598118853, "grad_norm": 4.026009559631348, "learning_rate": 9.363420427553445e-06, "loss": 0.3154, "step": 5415 }, { "epoch": 1.15861479264643, "grad_norm": 3.796111583709717, "learning_rate": 9.351543942992874e-06, "loss": 0.2617, "step": 5420 }, { "epoch": 1.1596836254809748, "grad_norm": 4.301056861877441, "learning_rate": 9.339667458432304e-06, "loss": 0.3529, "step": 5425 }, { "epoch": 1.1607524583155195, "grad_norm": 4.392220973968506, "learning_rate": 9.327790973871735e-06, "loss": 0.2858, "step": 5430 }, { "epoch": 1.161821291150064, "grad_norm": 3.698474168777466, "learning_rate": 9.315914489311164e-06, "loss": 0.42, "step": 5435 }, { "epoch": 1.1628901239846088, "grad_norm": 4.409991264343262, "learning_rate": 9.304038004750594e-06, "loss": 0.3894, "step": 5440 }, { "epoch": 1.1639589568191535, "grad_norm": 2.799488067626953, "learning_rate": 9.292161520190025e-06, "loss": 0.3073, "step": 5445 }, { "epoch": 1.1650277896536982, "grad_norm": 3.6285009384155273, "learning_rate": 9.280285035629456e-06, "loss": 0.2824, "step": 5450 }, { "epoch": 1.1660966224882428, "grad_norm": 4.096553802490234, "learning_rate": 9.268408551068884e-06, "loss": 0.3139, "step": 5455 }, { "epoch": 1.1671654553227875, "grad_norm": 6.436227798461914, "learning_rate": 9.256532066508315e-06, "loss": 0.4651, "step": 5460 }, { "epoch": 1.1682342881573322, "grad_norm": 4.163245677947998, "learning_rate": 9.244655581947744e-06, "loss": 0.2611, "step": 5465 }, { "epoch": 1.169303120991877, "grad_norm": 4.249100208282471, "learning_rate": 9.232779097387174e-06, "loss": 0.2663, "step": 5470 }, { "epoch": 1.1703719538264215, "grad_norm": 4.877579212188721, "learning_rate": 9.220902612826605e-06, "loss": 0.252, "step": 5475 }, { "epoch": 1.1714407866609662, "grad_norm": 4.387494087219238, "learning_rate": 9.209026128266034e-06, "loss": 0.4996, "step": 5480 }, { "epoch": 1.172509619495511, "grad_norm": 3.6732518672943115, "learning_rate": 9.197149643705464e-06, "loss": 0.2786, "step": 5485 }, { "epoch": 1.1735784523300556, "grad_norm": 4.684414386749268, "learning_rate": 9.185273159144895e-06, "loss": 0.2628, "step": 5490 }, { "epoch": 1.1746472851646002, "grad_norm": 5.551144599914551, "learning_rate": 9.173396674584324e-06, "loss": 0.3326, "step": 5495 }, { "epoch": 1.1757161179991449, "grad_norm": 3.942741632461548, "learning_rate": 9.161520190023754e-06, "loss": 0.3294, "step": 5500 }, { "epoch": 1.1767849508336896, "grad_norm": 4.97520637512207, "learning_rate": 9.149643705463183e-06, "loss": 0.341, "step": 5505 }, { "epoch": 1.1778537836682343, "grad_norm": 4.264441967010498, "learning_rate": 9.137767220902614e-06, "loss": 0.2878, "step": 5510 }, { "epoch": 1.178922616502779, "grad_norm": 5.5287299156188965, "learning_rate": 9.125890736342044e-06, "loss": 0.354, "step": 5515 }, { "epoch": 1.1799914493373236, "grad_norm": 2.997340679168701, "learning_rate": 9.114014251781473e-06, "loss": 0.2667, "step": 5520 }, { "epoch": 1.1810602821718683, "grad_norm": 4.381051540374756, "learning_rate": 9.102137767220904e-06, "loss": 0.2932, "step": 5525 }, { "epoch": 1.182129115006413, "grad_norm": 3.4648494720458984, "learning_rate": 9.090261282660332e-06, "loss": 0.2608, "step": 5530 }, { "epoch": 1.1831979478409576, "grad_norm": 4.567250728607178, "learning_rate": 9.078384798099763e-06, "loss": 0.3078, "step": 5535 }, { "epoch": 1.1842667806755023, "grad_norm": 4.373274326324463, "learning_rate": 9.066508313539194e-06, "loss": 0.4028, "step": 5540 }, { "epoch": 1.185335613510047, "grad_norm": 4.338989734649658, "learning_rate": 9.054631828978622e-06, "loss": 0.3429, "step": 5545 }, { "epoch": 1.1864044463445917, "grad_norm": 4.9778008460998535, "learning_rate": 9.042755344418053e-06, "loss": 0.3481, "step": 5550 }, { "epoch": 1.1874732791791365, "grad_norm": 4.068686008453369, "learning_rate": 9.030878859857482e-06, "loss": 0.2931, "step": 5555 }, { "epoch": 1.188542112013681, "grad_norm": 3.909130096435547, "learning_rate": 9.019002375296912e-06, "loss": 0.2719, "step": 5560 }, { "epoch": 1.1896109448482257, "grad_norm": 4.785898208618164, "learning_rate": 9.007125890736343e-06, "loss": 0.3968, "step": 5565 }, { "epoch": 1.1906797776827704, "grad_norm": 5.576188087463379, "learning_rate": 8.995249406175772e-06, "loss": 0.4653, "step": 5570 }, { "epoch": 1.1917486105173152, "grad_norm": 3.010072946548462, "learning_rate": 8.983372921615202e-06, "loss": 0.2727, "step": 5575 }, { "epoch": 1.1928174433518597, "grad_norm": 4.709297180175781, "learning_rate": 8.971496437054633e-06, "loss": 0.4521, "step": 5580 }, { "epoch": 1.1938862761864044, "grad_norm": 5.573824405670166, "learning_rate": 8.959619952494063e-06, "loss": 0.3042, "step": 5585 }, { "epoch": 1.1949551090209491, "grad_norm": 4.321738243103027, "learning_rate": 8.947743467933492e-06, "loss": 0.2687, "step": 5590 }, { "epoch": 1.1960239418554939, "grad_norm": 5.602605819702148, "learning_rate": 8.935866983372923e-06, "loss": 0.2892, "step": 5595 }, { "epoch": 1.1970927746900384, "grad_norm": 3.6464884281158447, "learning_rate": 8.923990498812353e-06, "loss": 0.2515, "step": 5600 }, { "epoch": 1.1981616075245831, "grad_norm": 3.9809868335723877, "learning_rate": 8.912114014251782e-06, "loss": 0.3022, "step": 5605 }, { "epoch": 1.1992304403591278, "grad_norm": 4.83494758605957, "learning_rate": 8.900237529691213e-06, "loss": 0.4018, "step": 5610 }, { "epoch": 1.2002992731936726, "grad_norm": 3.961460590362549, "learning_rate": 8.888361045130642e-06, "loss": 0.2532, "step": 5615 }, { "epoch": 1.2013681060282173, "grad_norm": 2.4498984813690186, "learning_rate": 8.876484560570072e-06, "loss": 0.2285, "step": 5620 }, { "epoch": 1.2024369388627618, "grad_norm": 3.654311418533325, "learning_rate": 8.864608076009503e-06, "loss": 0.3307, "step": 5625 }, { "epoch": 1.2035057716973065, "grad_norm": 4.238831996917725, "learning_rate": 8.852731591448932e-06, "loss": 0.2353, "step": 5630 }, { "epoch": 1.2045746045318513, "grad_norm": 3.811962842941284, "learning_rate": 8.840855106888362e-06, "loss": 0.3137, "step": 5635 }, { "epoch": 1.2056434373663958, "grad_norm": 3.8361501693725586, "learning_rate": 8.828978622327791e-06, "loss": 0.2549, "step": 5640 }, { "epoch": 1.2067122702009405, "grad_norm": 4.136886119842529, "learning_rate": 8.817102137767222e-06, "loss": 0.2901, "step": 5645 }, { "epoch": 1.2077811030354852, "grad_norm": 4.573363304138184, "learning_rate": 8.805225653206652e-06, "loss": 0.4423, "step": 5650 }, { "epoch": 1.20884993587003, "grad_norm": 4.777524948120117, "learning_rate": 8.793349168646081e-06, "loss": 0.2959, "step": 5655 }, { "epoch": 1.2099187687045747, "grad_norm": 4.250500679016113, "learning_rate": 8.781472684085511e-06, "loss": 0.2523, "step": 5660 }, { "epoch": 1.2109876015391192, "grad_norm": 4.024094581604004, "learning_rate": 8.769596199524942e-06, "loss": 0.1746, "step": 5665 }, { "epoch": 1.212056434373664, "grad_norm": 4.290604591369629, "learning_rate": 8.757719714964371e-06, "loss": 0.3541, "step": 5670 }, { "epoch": 1.2131252672082087, "grad_norm": 3.597705125808716, "learning_rate": 8.745843230403801e-06, "loss": 0.2801, "step": 5675 }, { "epoch": 1.2141941000427534, "grad_norm": 5.059614181518555, "learning_rate": 8.73396674584323e-06, "loss": 0.2846, "step": 5680 }, { "epoch": 1.215262932877298, "grad_norm": 3.8920083045959473, "learning_rate": 8.722090261282661e-06, "loss": 0.3503, "step": 5685 }, { "epoch": 1.2163317657118426, "grad_norm": 4.512190818786621, "learning_rate": 8.710213776722091e-06, "loss": 0.2831, "step": 5690 }, { "epoch": 1.2174005985463874, "grad_norm": 4.729888916015625, "learning_rate": 8.69833729216152e-06, "loss": 0.2708, "step": 5695 }, { "epoch": 1.218469431380932, "grad_norm": 4.533064365386963, "learning_rate": 8.68646080760095e-06, "loss": 0.2846, "step": 5700 }, { "epoch": 1.2195382642154766, "grad_norm": 3.9406075477600098, "learning_rate": 8.67458432304038e-06, "loss": 0.3136, "step": 5705 }, { "epoch": 1.2206070970500214, "grad_norm": 6.5291924476623535, "learning_rate": 8.66270783847981e-06, "loss": 0.3377, "step": 5710 }, { "epoch": 1.221675929884566, "grad_norm": 4.291172981262207, "learning_rate": 8.65083135391924e-06, "loss": 0.4648, "step": 5715 }, { "epoch": 1.2227447627191108, "grad_norm": 5.999503135681152, "learning_rate": 8.63895486935867e-06, "loss": 0.371, "step": 5720 }, { "epoch": 1.2238135955536553, "grad_norm": 3.673821449279785, "learning_rate": 8.6270783847981e-06, "loss": 0.3419, "step": 5725 }, { "epoch": 1.2248824283882, "grad_norm": 4.6455607414245605, "learning_rate": 8.61520190023753e-06, "loss": 0.3034, "step": 5730 }, { "epoch": 1.2259512612227448, "grad_norm": 4.375533103942871, "learning_rate": 8.603325415676961e-06, "loss": 0.2912, "step": 5735 }, { "epoch": 1.2270200940572895, "grad_norm": 4.1931376457214355, "learning_rate": 8.59144893111639e-06, "loss": 0.3251, "step": 5740 }, { "epoch": 1.228088926891834, "grad_norm": 7.451878547668457, "learning_rate": 8.57957244655582e-06, "loss": 0.3989, "step": 5745 }, { "epoch": 1.2291577597263788, "grad_norm": 5.163100242614746, "learning_rate": 8.567695961995251e-06, "loss": 0.3193, "step": 5750 }, { "epoch": 1.2302265925609235, "grad_norm": 6.099165439605713, "learning_rate": 8.55581947743468e-06, "loss": 0.3586, "step": 5755 }, { "epoch": 1.2312954253954682, "grad_norm": 3.8234498500823975, "learning_rate": 8.54394299287411e-06, "loss": 0.2832, "step": 5760 }, { "epoch": 1.232364258230013, "grad_norm": 4.173794269561768, "learning_rate": 8.53206650831354e-06, "loss": 0.389, "step": 5765 }, { "epoch": 1.2334330910645575, "grad_norm": 4.987196922302246, "learning_rate": 8.52019002375297e-06, "loss": 0.3889, "step": 5770 }, { "epoch": 1.2345019238991022, "grad_norm": 3.354900360107422, "learning_rate": 8.5083135391924e-06, "loss": 0.2243, "step": 5775 }, { "epoch": 1.235570756733647, "grad_norm": 4.882574558258057, "learning_rate": 8.49643705463183e-06, "loss": 0.2416, "step": 5780 }, { "epoch": 1.2366395895681914, "grad_norm": 4.3282790184021, "learning_rate": 8.48456057007126e-06, "loss": 0.3066, "step": 5785 }, { "epoch": 1.2377084224027362, "grad_norm": 5.309357166290283, "learning_rate": 8.472684085510689e-06, "loss": 0.3227, "step": 5790 }, { "epoch": 1.2387772552372809, "grad_norm": 3.708139181137085, "learning_rate": 8.46080760095012e-06, "loss": 0.3194, "step": 5795 }, { "epoch": 1.2398460880718256, "grad_norm": 5.823927879333496, "learning_rate": 8.44893111638955e-06, "loss": 0.3851, "step": 5800 }, { "epoch": 1.2409149209063703, "grad_norm": 5.825521945953369, "learning_rate": 8.437054631828979e-06, "loss": 0.2793, "step": 5805 }, { "epoch": 1.2419837537409149, "grad_norm": 4.350478172302246, "learning_rate": 8.42517814726841e-06, "loss": 0.2482, "step": 5810 }, { "epoch": 1.2430525865754596, "grad_norm": 4.824470043182373, "learning_rate": 8.413301662707838e-06, "loss": 0.3311, "step": 5815 }, { "epoch": 1.2441214194100043, "grad_norm": 4.695113182067871, "learning_rate": 8.401425178147269e-06, "loss": 0.2456, "step": 5820 }, { "epoch": 1.245190252244549, "grad_norm": 5.539307594299316, "learning_rate": 8.3895486935867e-06, "loss": 0.3267, "step": 5825 }, { "epoch": 1.2462590850790936, "grad_norm": 4.055349826812744, "learning_rate": 8.377672209026128e-06, "loss": 0.2496, "step": 5830 }, { "epoch": 1.2473279179136383, "grad_norm": 4.012608051300049, "learning_rate": 8.365795724465559e-06, "loss": 0.2896, "step": 5835 }, { "epoch": 1.248396750748183, "grad_norm": 4.369838714599609, "learning_rate": 8.35391923990499e-06, "loss": 0.2717, "step": 5840 }, { "epoch": 1.2494655835827277, "grad_norm": 7.311318874359131, "learning_rate": 8.342042755344418e-06, "loss": 0.344, "step": 5845 }, { "epoch": 1.2505344164172723, "grad_norm": 3.8691282272338867, "learning_rate": 8.330166270783849e-06, "loss": 0.1997, "step": 5850 }, { "epoch": 1.251603249251817, "grad_norm": 4.140939712524414, "learning_rate": 8.318289786223278e-06, "loss": 0.2454, "step": 5855 }, { "epoch": 1.2526720820863617, "grad_norm": 4.034096717834473, "learning_rate": 8.306413301662708e-06, "loss": 0.2923, "step": 5860 }, { "epoch": 1.2537409149209064, "grad_norm": 4.175270080566406, "learning_rate": 8.294536817102139e-06, "loss": 0.268, "step": 5865 }, { "epoch": 1.2548097477554512, "grad_norm": 5.182862758636475, "learning_rate": 8.28266033254157e-06, "loss": 0.2469, "step": 5870 }, { "epoch": 1.2558785805899957, "grad_norm": 3.4455058574676514, "learning_rate": 8.270783847980998e-06, "loss": 0.2488, "step": 5875 }, { "epoch": 1.2569474134245404, "grad_norm": 3.5229389667510986, "learning_rate": 8.258907363420429e-06, "loss": 0.2809, "step": 5880 }, { "epoch": 1.2580162462590851, "grad_norm": 5.2068071365356445, "learning_rate": 8.247030878859859e-06, "loss": 0.2967, "step": 5885 }, { "epoch": 1.2590850790936297, "grad_norm": 5.500560283660889, "learning_rate": 8.235154394299288e-06, "loss": 0.3366, "step": 5890 }, { "epoch": 1.2601539119281744, "grad_norm": 3.9053938388824463, "learning_rate": 8.223277909738719e-06, "loss": 0.2968, "step": 5895 }, { "epoch": 1.2612227447627191, "grad_norm": 3.7163820266723633, "learning_rate": 8.211401425178147e-06, "loss": 0.2, "step": 5900 }, { "epoch": 1.2622915775972638, "grad_norm": 4.347673416137695, "learning_rate": 8.199524940617578e-06, "loss": 0.2761, "step": 5905 }, { "epoch": 1.2633604104318086, "grad_norm": 3.297481060028076, "learning_rate": 8.187648456057008e-06, "loss": 0.2177, "step": 5910 }, { "epoch": 1.264429243266353, "grad_norm": 5.587257385253906, "learning_rate": 8.175771971496437e-06, "loss": 0.2721, "step": 5915 }, { "epoch": 1.2654980761008978, "grad_norm": 3.562802791595459, "learning_rate": 8.163895486935868e-06, "loss": 0.2592, "step": 5920 }, { "epoch": 1.2665669089354425, "grad_norm": 5.265760898590088, "learning_rate": 8.152019002375298e-06, "loss": 0.453, "step": 5925 }, { "epoch": 1.267635741769987, "grad_norm": 4.091883182525635, "learning_rate": 8.140142517814727e-06, "loss": 0.1952, "step": 5930 }, { "epoch": 1.2687045746045318, "grad_norm": 4.552518844604492, "learning_rate": 8.128266033254158e-06, "loss": 0.3269, "step": 5935 }, { "epoch": 1.2697734074390765, "grad_norm": 4.755618572235107, "learning_rate": 8.116389548693587e-06, "loss": 0.2776, "step": 5940 }, { "epoch": 1.2708422402736212, "grad_norm": 4.392646312713623, "learning_rate": 8.104513064133017e-06, "loss": 0.2627, "step": 5945 }, { "epoch": 1.271911073108166, "grad_norm": 2.6964704990386963, "learning_rate": 8.092636579572448e-06, "loss": 0.2524, "step": 5950 }, { "epoch": 1.2729799059427105, "grad_norm": 3.914213180541992, "learning_rate": 8.080760095011877e-06, "loss": 0.2713, "step": 5955 }, { "epoch": 1.2740487387772552, "grad_norm": 3.009427785873413, "learning_rate": 8.068883610451307e-06, "loss": 0.2618, "step": 5960 }, { "epoch": 1.2751175716118, "grad_norm": 4.362711429595947, "learning_rate": 8.057007125890736e-06, "loss": 0.2735, "step": 5965 }, { "epoch": 1.2761864044463445, "grad_norm": 5.038128852844238, "learning_rate": 8.045130641330167e-06, "loss": 0.327, "step": 5970 }, { "epoch": 1.2772552372808892, "grad_norm": 5.867886543273926, "learning_rate": 8.033254156769597e-06, "loss": 0.3294, "step": 5975 }, { "epoch": 1.278324070115434, "grad_norm": 3.8101038932800293, "learning_rate": 8.021377672209026e-06, "loss": 0.3299, "step": 5980 }, { "epoch": 1.2793929029499786, "grad_norm": 4.082939624786377, "learning_rate": 8.009501187648457e-06, "loss": 0.2328, "step": 5985 }, { "epoch": 1.2804617357845234, "grad_norm": 5.352798938751221, "learning_rate": 7.997624703087885e-06, "loss": 0.2554, "step": 5990 }, { "epoch": 1.281530568619068, "grad_norm": 2.7532148361206055, "learning_rate": 7.985748218527316e-06, "loss": 0.3285, "step": 5995 }, { "epoch": 1.2825994014536126, "grad_norm": 4.2501349449157715, "learning_rate": 7.973871733966747e-06, "loss": 0.2884, "step": 6000 }, { "epoch": 1.2836682342881574, "grad_norm": 3.0817322731018066, "learning_rate": 7.961995249406177e-06, "loss": 0.3201, "step": 6005 }, { "epoch": 1.284737067122702, "grad_norm": 4.214169502258301, "learning_rate": 7.950118764845608e-06, "loss": 0.3529, "step": 6010 }, { "epoch": 1.2858058999572468, "grad_norm": 4.896885871887207, "learning_rate": 7.938242280285036e-06, "loss": 0.3113, "step": 6015 }, { "epoch": 1.2868747327917913, "grad_norm": 4.869765758514404, "learning_rate": 7.926365795724467e-06, "loss": 0.386, "step": 6020 }, { "epoch": 1.287943565626336, "grad_norm": 4.720851421356201, "learning_rate": 7.914489311163896e-06, "loss": 0.2453, "step": 6025 }, { "epoch": 1.2890123984608808, "grad_norm": 4.764908790588379, "learning_rate": 7.902612826603326e-06, "loss": 0.3845, "step": 6030 }, { "epoch": 1.2900812312954253, "grad_norm": 4.5335845947265625, "learning_rate": 7.890736342042757e-06, "loss": 0.3311, "step": 6035 }, { "epoch": 1.29115006412997, "grad_norm": 5.650118350982666, "learning_rate": 7.878859857482186e-06, "loss": 0.3556, "step": 6040 }, { "epoch": 1.2922188969645148, "grad_norm": 4.7145209312438965, "learning_rate": 7.866983372921616e-06, "loss": 0.2491, "step": 6045 }, { "epoch": 1.2932877297990595, "grad_norm": 5.045220851898193, "learning_rate": 7.855106888361045e-06, "loss": 0.3478, "step": 6050 }, { "epoch": 1.2943565626336042, "grad_norm": 3.746929407119751, "learning_rate": 7.843230403800476e-06, "loss": 0.175, "step": 6055 }, { "epoch": 1.2954253954681487, "grad_norm": 3.4932451248168945, "learning_rate": 7.831353919239906e-06, "loss": 0.2475, "step": 6060 }, { "epoch": 1.2964942283026935, "grad_norm": 4.507287502288818, "learning_rate": 7.819477434679335e-06, "loss": 0.2793, "step": 6065 }, { "epoch": 1.2975630611372382, "grad_norm": 3.872846841812134, "learning_rate": 7.807600950118766e-06, "loss": 0.3745, "step": 6070 }, { "epoch": 1.2986318939717827, "grad_norm": 3.80639910697937, "learning_rate": 7.795724465558195e-06, "loss": 0.2619, "step": 6075 }, { "epoch": 1.2997007268063274, "grad_norm": 4.278339862823486, "learning_rate": 7.783847980997625e-06, "loss": 0.2882, "step": 6080 }, { "epoch": 1.3007695596408722, "grad_norm": 3.2503674030303955, "learning_rate": 7.771971496437056e-06, "loss": 0.2651, "step": 6085 }, { "epoch": 1.3018383924754169, "grad_norm": 3.709991216659546, "learning_rate": 7.760095011876485e-06, "loss": 0.3257, "step": 6090 }, { "epoch": 1.3029072253099616, "grad_norm": 4.797738075256348, "learning_rate": 7.748218527315915e-06, "loss": 0.2626, "step": 6095 }, { "epoch": 1.3039760581445061, "grad_norm": 3.289095163345337, "learning_rate": 7.736342042755346e-06, "loss": 0.3391, "step": 6100 }, { "epoch": 1.3050448909790509, "grad_norm": 5.237732410430908, "learning_rate": 7.724465558194774e-06, "loss": 0.2868, "step": 6105 }, { "epoch": 1.3061137238135956, "grad_norm": 3.3352086544036865, "learning_rate": 7.712589073634205e-06, "loss": 0.2358, "step": 6110 }, { "epoch": 1.30718255664814, "grad_norm": 4.8291168212890625, "learning_rate": 7.700712589073634e-06, "loss": 0.3505, "step": 6115 }, { "epoch": 1.308251389482685, "grad_norm": 6.421624183654785, "learning_rate": 7.688836104513064e-06, "loss": 0.3643, "step": 6120 }, { "epoch": 1.3093202223172296, "grad_norm": 2.7074790000915527, "learning_rate": 7.676959619952495e-06, "loss": 0.2729, "step": 6125 }, { "epoch": 1.3103890551517743, "grad_norm": 4.26420783996582, "learning_rate": 7.665083135391924e-06, "loss": 0.2363, "step": 6130 }, { "epoch": 1.311457887986319, "grad_norm": 6.1749773025512695, "learning_rate": 7.653206650831354e-06, "loss": 0.3173, "step": 6135 }, { "epoch": 1.3125267208208635, "grad_norm": 3.3525917530059814, "learning_rate": 7.641330166270783e-06, "loss": 0.2797, "step": 6140 }, { "epoch": 1.3135955536554083, "grad_norm": 3.1302783489227295, "learning_rate": 7.629453681710216e-06, "loss": 0.3054, "step": 6145 }, { "epoch": 1.314664386489953, "grad_norm": 3.0552220344543457, "learning_rate": 7.617577197149645e-06, "loss": 0.2596, "step": 6150 }, { "epoch": 1.3157332193244977, "grad_norm": 5.424324035644531, "learning_rate": 7.605700712589075e-06, "loss": 0.3871, "step": 6155 }, { "epoch": 1.3168020521590424, "grad_norm": 4.735466480255127, "learning_rate": 7.593824228028505e-06, "loss": 0.2798, "step": 6160 }, { "epoch": 1.317870884993587, "grad_norm": 5.158178806304932, "learning_rate": 7.581947743467934e-06, "loss": 0.2532, "step": 6165 }, { "epoch": 1.3189397178281317, "grad_norm": 5.720581531524658, "learning_rate": 7.570071258907364e-06, "loss": 0.2649, "step": 6170 }, { "epoch": 1.3200085506626764, "grad_norm": 4.740435600280762, "learning_rate": 7.5581947743467946e-06, "loss": 0.3457, "step": 6175 }, { "epoch": 1.321077383497221, "grad_norm": 4.528372287750244, "learning_rate": 7.546318289786224e-06, "loss": 0.4062, "step": 6180 }, { "epoch": 1.3221462163317657, "grad_norm": 5.7430243492126465, "learning_rate": 7.534441805225654e-06, "loss": 0.4318, "step": 6185 }, { "epoch": 1.3232150491663104, "grad_norm": 3.7349984645843506, "learning_rate": 7.522565320665084e-06, "loss": 0.2305, "step": 6190 }, { "epoch": 1.3242838820008551, "grad_norm": 3.384366273880005, "learning_rate": 7.510688836104514e-06, "loss": 0.218, "step": 6195 }, { "epoch": 1.3253527148353998, "grad_norm": 4.311688423156738, "learning_rate": 7.498812351543944e-06, "loss": 0.2738, "step": 6200 }, { "epoch": 1.3264215476699444, "grad_norm": 3.9737985134124756, "learning_rate": 7.486935866983374e-06, "loss": 0.3043, "step": 6205 }, { "epoch": 1.327490380504489, "grad_norm": 3.2927355766296387, "learning_rate": 7.475059382422803e-06, "loss": 0.2055, "step": 6210 }, { "epoch": 1.3285592133390338, "grad_norm": 4.364592552185059, "learning_rate": 7.463182897862233e-06, "loss": 0.2528, "step": 6215 }, { "epoch": 1.3296280461735783, "grad_norm": 4.896527290344238, "learning_rate": 7.451306413301664e-06, "loss": 0.3514, "step": 6220 }, { "epoch": 1.330696879008123, "grad_norm": 3.7543258666992188, "learning_rate": 7.439429928741093e-06, "loss": 0.3199, "step": 6225 }, { "epoch": 1.3317657118426678, "grad_norm": 4.389688491821289, "learning_rate": 7.427553444180523e-06, "loss": 0.2453, "step": 6230 }, { "epoch": 1.3328345446772125, "grad_norm": 5.297595500946045, "learning_rate": 7.415676959619953e-06, "loss": 0.3237, "step": 6235 }, { "epoch": 1.3339033775117572, "grad_norm": 4.290585041046143, "learning_rate": 7.403800475059383e-06, "loss": 0.3409, "step": 6240 }, { "epoch": 1.3349722103463018, "grad_norm": 3.8684494495391846, "learning_rate": 7.391923990498813e-06, "loss": 0.268, "step": 6245 }, { "epoch": 1.3360410431808465, "grad_norm": 7.344365119934082, "learning_rate": 7.380047505938243e-06, "loss": 0.4072, "step": 6250 }, { "epoch": 1.3371098760153912, "grad_norm": 4.403175354003906, "learning_rate": 7.368171021377672e-06, "loss": 0.3601, "step": 6255 }, { "epoch": 1.338178708849936, "grad_norm": 4.6706414222717285, "learning_rate": 7.356294536817102e-06, "loss": 0.3997, "step": 6260 }, { "epoch": 1.3392475416844807, "grad_norm": 3.4723129272460938, "learning_rate": 7.344418052256533e-06, "loss": 0.2062, "step": 6265 }, { "epoch": 1.3403163745190252, "grad_norm": 3.8669190406799316, "learning_rate": 7.332541567695962e-06, "loss": 0.2703, "step": 6270 }, { "epoch": 1.34138520735357, "grad_norm": 4.620151519775391, "learning_rate": 7.320665083135392e-06, "loss": 0.2498, "step": 6275 }, { "epoch": 1.3424540401881147, "grad_norm": 4.765347480773926, "learning_rate": 7.308788598574822e-06, "loss": 0.3418, "step": 6280 }, { "epoch": 1.3435228730226592, "grad_norm": 3.9806559085845947, "learning_rate": 7.296912114014253e-06, "loss": 0.2046, "step": 6285 }, { "epoch": 1.344591705857204, "grad_norm": 6.489411354064941, "learning_rate": 7.285035629453683e-06, "loss": 0.3267, "step": 6290 }, { "epoch": 1.3456605386917486, "grad_norm": 4.385682582855225, "learning_rate": 7.2731591448931125e-06, "loss": 0.2756, "step": 6295 }, { "epoch": 1.3467293715262934, "grad_norm": 5.30741548538208, "learning_rate": 7.261282660332542e-06, "loss": 0.2808, "step": 6300 }, { "epoch": 1.347798204360838, "grad_norm": 3.52230167388916, "learning_rate": 7.249406175771973e-06, "loss": 0.3037, "step": 6305 }, { "epoch": 1.3488670371953826, "grad_norm": 3.3302509784698486, "learning_rate": 7.2375296912114025e-06, "loss": 0.3837, "step": 6310 }, { "epoch": 1.3499358700299273, "grad_norm": 4.349034309387207, "learning_rate": 7.225653206650832e-06, "loss": 0.2496, "step": 6315 }, { "epoch": 1.351004702864472, "grad_norm": 3.651261329650879, "learning_rate": 7.213776722090262e-06, "loss": 0.3131, "step": 6320 }, { "epoch": 1.3520735356990166, "grad_norm": 4.3042144775390625, "learning_rate": 7.201900237529692e-06, "loss": 0.3038, "step": 6325 }, { "epoch": 1.3531423685335613, "grad_norm": 4.746523380279541, "learning_rate": 7.190023752969122e-06, "loss": 0.2872, "step": 6330 }, { "epoch": 1.354211201368106, "grad_norm": 3.058163642883301, "learning_rate": 7.178147268408552e-06, "loss": 0.3548, "step": 6335 }, { "epoch": 1.3552800342026508, "grad_norm": 4.4561309814453125, "learning_rate": 7.1662707838479815e-06, "loss": 0.1994, "step": 6340 }, { "epoch": 1.3563488670371955, "grad_norm": 3.580275535583496, "learning_rate": 7.154394299287411e-06, "loss": 0.2383, "step": 6345 }, { "epoch": 1.35741769987174, "grad_norm": 4.0294671058654785, "learning_rate": 7.142517814726842e-06, "loss": 0.3412, "step": 6350 }, { "epoch": 1.3584865327062847, "grad_norm": 4.032179355621338, "learning_rate": 7.1306413301662715e-06, "loss": 0.2392, "step": 6355 }, { "epoch": 1.3595553655408295, "grad_norm": 3.6529910564422607, "learning_rate": 7.118764845605701e-06, "loss": 0.2946, "step": 6360 }, { "epoch": 1.360624198375374, "grad_norm": 5.5632781982421875, "learning_rate": 7.106888361045131e-06, "loss": 0.3075, "step": 6365 }, { "epoch": 1.3616930312099187, "grad_norm": 4.6378865242004395, "learning_rate": 7.0950118764845614e-06, "loss": 0.2695, "step": 6370 }, { "epoch": 1.3627618640444634, "grad_norm": 4.01276969909668, "learning_rate": 7.083135391923991e-06, "loss": 0.2289, "step": 6375 }, { "epoch": 1.3638306968790082, "grad_norm": 2.731029748916626, "learning_rate": 7.071258907363421e-06, "loss": 0.339, "step": 6380 }, { "epoch": 1.3648995297135529, "grad_norm": 6.142641544342041, "learning_rate": 7.0593824228028505e-06, "loss": 0.3048, "step": 6385 }, { "epoch": 1.3659683625480974, "grad_norm": 4.8854289054870605, "learning_rate": 7.04750593824228e-06, "loss": 0.3437, "step": 6390 }, { "epoch": 1.3670371953826421, "grad_norm": 4.592909336090088, "learning_rate": 7.035629453681711e-06, "loss": 0.2587, "step": 6395 }, { "epoch": 1.3681060282171869, "grad_norm": 4.572000026702881, "learning_rate": 7.0237529691211405e-06, "loss": 0.3156, "step": 6400 }, { "epoch": 1.3691748610517316, "grad_norm": 6.196121692657471, "learning_rate": 7.01187648456057e-06, "loss": 0.2827, "step": 6405 }, { "epoch": 1.3702436938862763, "grad_norm": 3.967109441757202, "learning_rate": 7e-06, "loss": 0.2337, "step": 6410 }, { "epoch": 1.3713125267208208, "grad_norm": 3.1756539344787598, "learning_rate": 6.98812351543943e-06, "loss": 0.265, "step": 6415 }, { "epoch": 1.3723813595553656, "grad_norm": 3.4292986392974854, "learning_rate": 6.97624703087886e-06, "loss": 0.2822, "step": 6420 }, { "epoch": 1.3734501923899103, "grad_norm": 4.521055698394775, "learning_rate": 6.964370546318291e-06, "loss": 0.2697, "step": 6425 }, { "epoch": 1.3745190252244548, "grad_norm": 3.9092273712158203, "learning_rate": 6.95249406175772e-06, "loss": 0.2136, "step": 6430 }, { "epoch": 1.3755878580589995, "grad_norm": 3.5216240882873535, "learning_rate": 6.940617577197151e-06, "loss": 0.2549, "step": 6435 }, { "epoch": 1.3766566908935443, "grad_norm": 5.987946510314941, "learning_rate": 6.928741092636581e-06, "loss": 0.2631, "step": 6440 }, { "epoch": 1.377725523728089, "grad_norm": 5.098079681396484, "learning_rate": 6.91686460807601e-06, "loss": 0.324, "step": 6445 }, { "epoch": 1.3787943565626337, "grad_norm": 4.314655303955078, "learning_rate": 6.90498812351544e-06, "loss": 0.3722, "step": 6450 }, { "epoch": 1.3798631893971782, "grad_norm": 5.151162147521973, "learning_rate": 6.893111638954871e-06, "loss": 0.453, "step": 6455 }, { "epoch": 1.380932022231723, "grad_norm": 4.187003135681152, "learning_rate": 6.8812351543943e-06, "loss": 0.2798, "step": 6460 }, { "epoch": 1.3820008550662677, "grad_norm": 5.253510475158691, "learning_rate": 6.86935866983373e-06, "loss": 0.2605, "step": 6465 }, { "epoch": 1.3830696879008122, "grad_norm": 2.9405324459075928, "learning_rate": 6.85748218527316e-06, "loss": 0.3834, "step": 6470 }, { "epoch": 1.384138520735357, "grad_norm": 3.8434178829193115, "learning_rate": 6.845605700712589e-06, "loss": 0.2683, "step": 6475 }, { "epoch": 1.3852073535699017, "grad_norm": 4.633339881896973, "learning_rate": 6.83372921615202e-06, "loss": 0.243, "step": 6480 }, { "epoch": 1.3862761864044464, "grad_norm": 4.103108882904053, "learning_rate": 6.82185273159145e-06, "loss": 0.3287, "step": 6485 }, { "epoch": 1.3873450192389911, "grad_norm": 4.187243938446045, "learning_rate": 6.809976247030879e-06, "loss": 0.2754, "step": 6490 }, { "epoch": 1.3884138520735356, "grad_norm": 5.196486949920654, "learning_rate": 6.798099762470309e-06, "loss": 0.3701, "step": 6495 }, { "epoch": 1.3894826849080804, "grad_norm": 4.622681140899658, "learning_rate": 6.78622327790974e-06, "loss": 0.2762, "step": 6500 }, { "epoch": 1.390551517742625, "grad_norm": 2.859978675842285, "learning_rate": 6.774346793349169e-06, "loss": 0.266, "step": 6505 }, { "epoch": 1.3916203505771696, "grad_norm": 5.8184332847595215, "learning_rate": 6.762470308788599e-06, "loss": 0.2958, "step": 6510 }, { "epoch": 1.3926891834117143, "grad_norm": 3.787079334259033, "learning_rate": 6.750593824228029e-06, "loss": 0.2754, "step": 6515 }, { "epoch": 1.393758016246259, "grad_norm": 4.132429599761963, "learning_rate": 6.7387173396674584e-06, "loss": 0.3634, "step": 6520 }, { "epoch": 1.3948268490808038, "grad_norm": 5.011837005615234, "learning_rate": 6.726840855106889e-06, "loss": 0.2974, "step": 6525 }, { "epoch": 1.3958956819153485, "grad_norm": 5.0287957191467285, "learning_rate": 6.714964370546319e-06, "loss": 0.3158, "step": 6530 }, { "epoch": 1.396964514749893, "grad_norm": 3.846284866333008, "learning_rate": 6.703087885985748e-06, "loss": 0.2718, "step": 6535 }, { "epoch": 1.3980333475844378, "grad_norm": 5.715949058532715, "learning_rate": 6.691211401425178e-06, "loss": 0.2586, "step": 6540 }, { "epoch": 1.3991021804189825, "grad_norm": 2.0641372203826904, "learning_rate": 6.679334916864609e-06, "loss": 0.2071, "step": 6545 }, { "epoch": 1.4001710132535272, "grad_norm": 3.989108085632324, "learning_rate": 6.667458432304038e-06, "loss": 0.3308, "step": 6550 }, { "epoch": 1.401239846088072, "grad_norm": 5.488873481750488, "learning_rate": 6.655581947743468e-06, "loss": 0.2517, "step": 6555 }, { "epoch": 1.4023086789226165, "grad_norm": 4.91823673248291, "learning_rate": 6.643705463182898e-06, "loss": 0.234, "step": 6560 }, { "epoch": 1.4033775117571612, "grad_norm": 5.4402289390563965, "learning_rate": 6.631828978622329e-06, "loss": 0.3025, "step": 6565 }, { "epoch": 1.404446344591706, "grad_norm": 5.417737007141113, "learning_rate": 6.619952494061759e-06, "loss": 0.2619, "step": 6570 }, { "epoch": 1.4055151774262504, "grad_norm": 3.603675127029419, "learning_rate": 6.6080760095011885e-06, "loss": 0.2521, "step": 6575 }, { "epoch": 1.4065840102607952, "grad_norm": 4.426266670227051, "learning_rate": 6.596199524940618e-06, "loss": 0.3113, "step": 6580 }, { "epoch": 1.40765284309534, "grad_norm": 4.535027980804443, "learning_rate": 6.584323040380049e-06, "loss": 0.3537, "step": 6585 }, { "epoch": 1.4087216759298846, "grad_norm": 3.585488796234131, "learning_rate": 6.5724465558194785e-06, "loss": 0.2386, "step": 6590 }, { "epoch": 1.4097905087644294, "grad_norm": 5.358974456787109, "learning_rate": 6.560570071258908e-06, "loss": 0.3081, "step": 6595 }, { "epoch": 1.4108593415989739, "grad_norm": 3.859417200088501, "learning_rate": 6.548693586698338e-06, "loss": 0.189, "step": 6600 }, { "epoch": 1.4119281744335186, "grad_norm": 3.350184679031372, "learning_rate": 6.536817102137768e-06, "loss": 0.2643, "step": 6605 }, { "epoch": 1.4129970072680633, "grad_norm": 3.4859519004821777, "learning_rate": 6.524940617577198e-06, "loss": 0.2206, "step": 6610 }, { "epoch": 1.4140658401026078, "grad_norm": 6.238532543182373, "learning_rate": 6.513064133016628e-06, "loss": 0.3687, "step": 6615 }, { "epoch": 1.4151346729371526, "grad_norm": 6.955577850341797, "learning_rate": 6.5011876484560576e-06, "loss": 0.3626, "step": 6620 }, { "epoch": 1.4162035057716973, "grad_norm": 4.4574995040893555, "learning_rate": 6.489311163895487e-06, "loss": 0.2711, "step": 6625 }, { "epoch": 1.417272338606242, "grad_norm": 4.533407211303711, "learning_rate": 6.477434679334918e-06, "loss": 0.3304, "step": 6630 }, { "epoch": 1.4183411714407868, "grad_norm": 2.947624921798706, "learning_rate": 6.4655581947743475e-06, "loss": 0.321, "step": 6635 }, { "epoch": 1.4194100042753313, "grad_norm": 4.557621955871582, "learning_rate": 6.453681710213777e-06, "loss": 0.319, "step": 6640 }, { "epoch": 1.420478837109876, "grad_norm": 4.511264324188232, "learning_rate": 6.441805225653207e-06, "loss": 0.2363, "step": 6645 }, { "epoch": 1.4215476699444207, "grad_norm": 4.200313568115234, "learning_rate": 6.429928741092637e-06, "loss": 0.2319, "step": 6650 }, { "epoch": 1.4226165027789655, "grad_norm": 7.376286506652832, "learning_rate": 6.418052256532067e-06, "loss": 0.3526, "step": 6655 }, { "epoch": 1.4236853356135102, "grad_norm": 4.415379047393799, "learning_rate": 6.406175771971497e-06, "loss": 0.4186, "step": 6660 }, { "epoch": 1.4247541684480547, "grad_norm": 4.578277587890625, "learning_rate": 6.394299287410927e-06, "loss": 0.2846, "step": 6665 }, { "epoch": 1.4258230012825994, "grad_norm": 4.811502456665039, "learning_rate": 6.382422802850356e-06, "loss": 0.3077, "step": 6670 }, { "epoch": 1.4268918341171442, "grad_norm": 3.3036532402038574, "learning_rate": 6.370546318289787e-06, "loss": 0.3709, "step": 6675 }, { "epoch": 1.4279606669516887, "grad_norm": 4.229010105133057, "learning_rate": 6.3586698337292165e-06, "loss": 0.3438, "step": 6680 }, { "epoch": 1.4290294997862334, "grad_norm": 7.352675914764404, "learning_rate": 6.346793349168646e-06, "loss": 0.4159, "step": 6685 }, { "epoch": 1.4300983326207781, "grad_norm": 3.935654878616333, "learning_rate": 6.334916864608076e-06, "loss": 0.3511, "step": 6690 }, { "epoch": 1.4311671654553229, "grad_norm": 4.271127700805664, "learning_rate": 6.323040380047506e-06, "loss": 0.3061, "step": 6695 }, { "epoch": 1.4322359982898676, "grad_norm": 4.57000207901001, "learning_rate": 6.311163895486936e-06, "loss": 0.2694, "step": 6700 }, { "epoch": 1.433304831124412, "grad_norm": 4.243838787078857, "learning_rate": 6.299287410926367e-06, "loss": 0.3412, "step": 6705 }, { "epoch": 1.4343736639589568, "grad_norm": 4.534287929534912, "learning_rate": 6.2874109263657964e-06, "loss": 0.2631, "step": 6710 }, { "epoch": 1.4354424967935016, "grad_norm": 4.457566261291504, "learning_rate": 6.275534441805227e-06, "loss": 0.3879, "step": 6715 }, { "epoch": 1.436511329628046, "grad_norm": 3.7211356163024902, "learning_rate": 6.263657957244657e-06, "loss": 0.2942, "step": 6720 }, { "epoch": 1.4375801624625908, "grad_norm": 6.8076300621032715, "learning_rate": 6.251781472684086e-06, "loss": 0.3901, "step": 6725 }, { "epoch": 1.4386489952971355, "grad_norm": 6.238668441772461, "learning_rate": 6.239904988123516e-06, "loss": 0.3192, "step": 6730 }, { "epoch": 1.4397178281316803, "grad_norm": 4.374307155609131, "learning_rate": 6.228028503562946e-06, "loss": 0.2269, "step": 6735 }, { "epoch": 1.440786660966225, "grad_norm": 5.202229976654053, "learning_rate": 6.216152019002376e-06, "loss": 0.372, "step": 6740 }, { "epoch": 1.4418554938007695, "grad_norm": 4.483334064483643, "learning_rate": 6.204275534441806e-06, "loss": 0.2467, "step": 6745 }, { "epoch": 1.4429243266353142, "grad_norm": 3.3366737365722656, "learning_rate": 6.192399049881236e-06, "loss": 0.2313, "step": 6750 }, { "epoch": 1.443993159469859, "grad_norm": 6.443348407745361, "learning_rate": 6.1805225653206655e-06, "loss": 0.3538, "step": 6755 }, { "epoch": 1.4450619923044035, "grad_norm": 3.4701974391937256, "learning_rate": 6.168646080760096e-06, "loss": 0.252, "step": 6760 }, { "epoch": 1.4461308251389482, "grad_norm": 3.572749137878418, "learning_rate": 6.156769596199526e-06, "loss": 0.3049, "step": 6765 }, { "epoch": 1.447199657973493, "grad_norm": 4.363938808441162, "learning_rate": 6.144893111638955e-06, "loss": 0.2796, "step": 6770 }, { "epoch": 1.4482684908080377, "grad_norm": 3.493666172027588, "learning_rate": 6.133016627078385e-06, "loss": 0.2128, "step": 6775 }, { "epoch": 1.4493373236425824, "grad_norm": 4.754271507263184, "learning_rate": 6.121140142517815e-06, "loss": 0.3407, "step": 6780 }, { "epoch": 1.450406156477127, "grad_norm": 4.948278903961182, "learning_rate": 6.109263657957245e-06, "loss": 0.2196, "step": 6785 }, { "epoch": 1.4514749893116716, "grad_norm": 4.344764709472656, "learning_rate": 6.097387173396675e-06, "loss": 0.2472, "step": 6790 }, { "epoch": 1.4525438221462164, "grad_norm": 4.455203056335449, "learning_rate": 6.085510688836105e-06, "loss": 0.2788, "step": 6795 }, { "epoch": 1.453612654980761, "grad_norm": 5.69878625869751, "learning_rate": 6.0736342042755345e-06, "loss": 0.305, "step": 6800 }, { "epoch": 1.4546814878153058, "grad_norm": 4.746001243591309, "learning_rate": 6.061757719714965e-06, "loss": 0.3072, "step": 6805 }, { "epoch": 1.4557503206498503, "grad_norm": 3.463618755340576, "learning_rate": 6.049881235154395e-06, "loss": 0.2879, "step": 6810 }, { "epoch": 1.456819153484395, "grad_norm": 3.4969255924224854, "learning_rate": 6.0380047505938244e-06, "loss": 0.4406, "step": 6815 }, { "epoch": 1.4578879863189398, "grad_norm": 3.6291632652282715, "learning_rate": 6.026128266033254e-06, "loss": 0.3371, "step": 6820 }, { "epoch": 1.4589568191534843, "grad_norm": 4.0304765701293945, "learning_rate": 6.014251781472684e-06, "loss": 0.2775, "step": 6825 }, { "epoch": 1.460025651988029, "grad_norm": 3.6861469745635986, "learning_rate": 6.002375296912114e-06, "loss": 0.2872, "step": 6830 }, { "epoch": 1.4610944848225738, "grad_norm": 4.720432758331299, "learning_rate": 5.990498812351544e-06, "loss": 0.3056, "step": 6835 }, { "epoch": 1.4621633176571185, "grad_norm": 3.8419721126556396, "learning_rate": 5.978622327790974e-06, "loss": 0.3183, "step": 6840 }, { "epoch": 1.4632321504916632, "grad_norm": 4.320315361022949, "learning_rate": 5.9667458432304035e-06, "loss": 0.2801, "step": 6845 }, { "epoch": 1.4643009833262077, "grad_norm": 4.07327127456665, "learning_rate": 5.954869358669835e-06, "loss": 0.2641, "step": 6850 }, { "epoch": 1.4653698161607525, "grad_norm": 5.109342098236084, "learning_rate": 5.942992874109265e-06, "loss": 0.2903, "step": 6855 }, { "epoch": 1.4664386489952972, "grad_norm": 5.147985458374023, "learning_rate": 5.931116389548694e-06, "loss": 0.4097, "step": 6860 }, { "epoch": 1.4675074818298417, "grad_norm": 5.812030792236328, "learning_rate": 5.919239904988124e-06, "loss": 0.2133, "step": 6865 }, { "epoch": 1.4685763146643864, "grad_norm": 4.3751220703125, "learning_rate": 5.9073634204275545e-06, "loss": 0.3064, "step": 6870 }, { "epoch": 1.4696451474989312, "grad_norm": 3.8219094276428223, "learning_rate": 5.895486935866984e-06, "loss": 0.2629, "step": 6875 }, { "epoch": 1.470713980333476, "grad_norm": 3.550219774246216, "learning_rate": 5.883610451306414e-06, "loss": 0.1846, "step": 6880 }, { "epoch": 1.4717828131680206, "grad_norm": 4.344959259033203, "learning_rate": 5.871733966745844e-06, "loss": 0.2552, "step": 6885 }, { "epoch": 1.4728516460025651, "grad_norm": 3.7821099758148193, "learning_rate": 5.859857482185274e-06, "loss": 0.2812, "step": 6890 }, { "epoch": 1.4739204788371099, "grad_norm": 5.074913501739502, "learning_rate": 5.847980997624704e-06, "loss": 0.2796, "step": 6895 }, { "epoch": 1.4749893116716546, "grad_norm": 5.702268600463867, "learning_rate": 5.836104513064134e-06, "loss": 0.3157, "step": 6900 }, { "epoch": 1.476058144506199, "grad_norm": 4.769154071807861, "learning_rate": 5.824228028503563e-06, "loss": 0.271, "step": 6905 }, { "epoch": 1.4771269773407438, "grad_norm": 3.915893077850342, "learning_rate": 5.812351543942993e-06, "loss": 0.2352, "step": 6910 }, { "epoch": 1.4781958101752886, "grad_norm": 5.49572229385376, "learning_rate": 5.8004750593824236e-06, "loss": 0.3752, "step": 6915 }, { "epoch": 1.4792646430098333, "grad_norm": 5.197114944458008, "learning_rate": 5.788598574821853e-06, "loss": 0.2811, "step": 6920 }, { "epoch": 1.480333475844378, "grad_norm": 4.672935485839844, "learning_rate": 5.776722090261283e-06, "loss": 0.3303, "step": 6925 }, { "epoch": 1.4814023086789225, "grad_norm": 3.5662314891815186, "learning_rate": 5.764845605700713e-06, "loss": 0.3382, "step": 6930 }, { "epoch": 1.4824711415134673, "grad_norm": 3.7478342056274414, "learning_rate": 5.752969121140143e-06, "loss": 0.2235, "step": 6935 }, { "epoch": 1.483539974348012, "grad_norm": 5.836414813995361, "learning_rate": 5.741092636579573e-06, "loss": 0.2446, "step": 6940 }, { "epoch": 1.4846088071825567, "grad_norm": 4.945041179656982, "learning_rate": 5.729216152019003e-06, "loss": 0.2745, "step": 6945 }, { "epoch": 1.4856776400171015, "grad_norm": 4.556496620178223, "learning_rate": 5.717339667458432e-06, "loss": 0.3061, "step": 6950 }, { "epoch": 1.486746472851646, "grad_norm": 5.837685585021973, "learning_rate": 5.705463182897862e-06, "loss": 0.3059, "step": 6955 }, { "epoch": 1.4878153056861907, "grad_norm": 3.4342663288116455, "learning_rate": 5.6935866983372926e-06, "loss": 0.2692, "step": 6960 }, { "epoch": 1.4888841385207354, "grad_norm": 4.30683708190918, "learning_rate": 5.681710213776722e-06, "loss": 0.2853, "step": 6965 }, { "epoch": 1.48995297135528, "grad_norm": 3.7401227951049805, "learning_rate": 5.669833729216152e-06, "loss": 0.238, "step": 6970 }, { "epoch": 1.4910218041898247, "grad_norm": 3.991908311843872, "learning_rate": 5.657957244655582e-06, "loss": 0.3086, "step": 6975 }, { "epoch": 1.4920906370243694, "grad_norm": 5.546383857727051, "learning_rate": 5.646080760095012e-06, "loss": 0.3016, "step": 6980 }, { "epoch": 1.4931594698589141, "grad_norm": 4.429809093475342, "learning_rate": 5.634204275534442e-06, "loss": 0.3272, "step": 6985 }, { "epoch": 1.4942283026934589, "grad_norm": 4.91778564453125, "learning_rate": 5.6223277909738725e-06, "loss": 0.3125, "step": 6990 }, { "epoch": 1.4952971355280034, "grad_norm": 5.806905269622803, "learning_rate": 5.610451306413302e-06, "loss": 0.2777, "step": 6995 }, { "epoch": 1.496365968362548, "grad_norm": 5.0485711097717285, "learning_rate": 5.598574821852733e-06, "loss": 0.3026, "step": 7000 }, { "epoch": 1.4974348011970928, "grad_norm": 4.642349720001221, "learning_rate": 5.5866983372921624e-06, "loss": 0.2522, "step": 7005 }, { "epoch": 1.4985036340316373, "grad_norm": 3.192457437515259, "learning_rate": 5.574821852731592e-06, "loss": 0.2487, "step": 7010 }, { "epoch": 1.499572466866182, "grad_norm": 4.002120494842529, "learning_rate": 5.562945368171022e-06, "loss": 0.2316, "step": 7015 }, { "epoch": 1.5006412997007268, "grad_norm": 4.840696334838867, "learning_rate": 5.551068883610452e-06, "loss": 0.2484, "step": 7020 }, { "epoch": 1.5017101325352715, "grad_norm": 4.7393927574157715, "learning_rate": 5.539192399049882e-06, "loss": 0.2852, "step": 7025 }, { "epoch": 1.5027789653698163, "grad_norm": 4.964815616607666, "learning_rate": 5.527315914489312e-06, "loss": 0.2944, "step": 7030 }, { "epoch": 1.5038477982043608, "grad_norm": 4.7306342124938965, "learning_rate": 5.5154394299287415e-06, "loss": 0.3133, "step": 7035 }, { "epoch": 1.5049166310389055, "grad_norm": 5.262001991271973, "learning_rate": 5.503562945368171e-06, "loss": 0.2557, "step": 7040 }, { "epoch": 1.5059854638734502, "grad_norm": 4.136565685272217, "learning_rate": 5.491686460807602e-06, "loss": 0.232, "step": 7045 }, { "epoch": 1.5070542967079947, "grad_norm": 3.917520046234131, "learning_rate": 5.4798099762470315e-06, "loss": 0.2635, "step": 7050 }, { "epoch": 1.5081231295425397, "grad_norm": 5.6809210777282715, "learning_rate": 5.467933491686461e-06, "loss": 0.3033, "step": 7055 }, { "epoch": 1.5091919623770842, "grad_norm": 3.7200369834899902, "learning_rate": 5.456057007125891e-06, "loss": 0.2477, "step": 7060 }, { "epoch": 1.510260795211629, "grad_norm": 4.6949543952941895, "learning_rate": 5.444180522565321e-06, "loss": 0.2443, "step": 7065 }, { "epoch": 1.5113296280461737, "grad_norm": 4.025641918182373, "learning_rate": 5.432304038004751e-06, "loss": 0.4329, "step": 7070 }, { "epoch": 1.5123984608807182, "grad_norm": 3.7725117206573486, "learning_rate": 5.420427553444181e-06, "loss": 0.2682, "step": 7075 }, { "epoch": 1.513467293715263, "grad_norm": 4.11836051940918, "learning_rate": 5.4085510688836105e-06, "loss": 0.3149, "step": 7080 }, { "epoch": 1.5145361265498076, "grad_norm": 4.033612251281738, "learning_rate": 5.39667458432304e-06, "loss": 0.353, "step": 7085 }, { "epoch": 1.5156049593843521, "grad_norm": 5.4751482009887695, "learning_rate": 5.384798099762471e-06, "loss": 0.2247, "step": 7090 }, { "epoch": 1.516673792218897, "grad_norm": 4.203334808349609, "learning_rate": 5.3729216152019005e-06, "loss": 0.2862, "step": 7095 }, { "epoch": 1.5177426250534416, "grad_norm": 5.31473970413208, "learning_rate": 5.36104513064133e-06, "loss": 0.287, "step": 7100 }, { "epoch": 1.5188114578879863, "grad_norm": 4.896878719329834, "learning_rate": 5.34916864608076e-06, "loss": 0.3141, "step": 7105 }, { "epoch": 1.519880290722531, "grad_norm": 3.62528133392334, "learning_rate": 5.33729216152019e-06, "loss": 0.4446, "step": 7110 }, { "epoch": 1.5209491235570756, "grad_norm": 5.231464385986328, "learning_rate": 5.32541567695962e-06, "loss": 0.2853, "step": 7115 }, { "epoch": 1.5220179563916203, "grad_norm": 3.0587196350097656, "learning_rate": 5.31353919239905e-06, "loss": 0.2662, "step": 7120 }, { "epoch": 1.523086789226165, "grad_norm": 5.080547332763672, "learning_rate": 5.3016627078384795e-06, "loss": 0.2729, "step": 7125 }, { "epoch": 1.5241556220607098, "grad_norm": 3.547877073287964, "learning_rate": 5.289786223277911e-06, "loss": 0.2376, "step": 7130 }, { "epoch": 1.5252244548952545, "grad_norm": 3.9913973808288574, "learning_rate": 5.277909738717341e-06, "loss": 0.2434, "step": 7135 }, { "epoch": 1.526293287729799, "grad_norm": 3.9852547645568848, "learning_rate": 5.26603325415677e-06, "loss": 0.302, "step": 7140 }, { "epoch": 1.5273621205643437, "grad_norm": 3.660104274749756, "learning_rate": 5.2541567695962e-06, "loss": 0.2346, "step": 7145 }, { "epoch": 1.5284309533988885, "grad_norm": 4.887364387512207, "learning_rate": 5.242280285035631e-06, "loss": 0.4036, "step": 7150 }, { "epoch": 1.529499786233433, "grad_norm": 5.766690254211426, "learning_rate": 5.23040380047506e-06, "loss": 0.2902, "step": 7155 }, { "epoch": 1.530568619067978, "grad_norm": 5.018100738525391, "learning_rate": 5.21852731591449e-06, "loss": 0.4254, "step": 7160 }, { "epoch": 1.5316374519025224, "grad_norm": 2.8769116401672363, "learning_rate": 5.20665083135392e-06, "loss": 0.2863, "step": 7165 }, { "epoch": 1.5327062847370672, "grad_norm": 4.766345024108887, "learning_rate": 5.194774346793349e-06, "loss": 0.2618, "step": 7170 }, { "epoch": 1.533775117571612, "grad_norm": 4.371603012084961, "learning_rate": 5.18289786223278e-06, "loss": 0.3614, "step": 7175 }, { "epoch": 1.5348439504061564, "grad_norm": 3.7386531829833984, "learning_rate": 5.17102137767221e-06, "loss": 0.3084, "step": 7180 }, { "epoch": 1.5359127832407011, "grad_norm": 3.2616264820098877, "learning_rate": 5.159144893111639e-06, "loss": 0.2799, "step": 7185 }, { "epoch": 1.5369816160752459, "grad_norm": 4.840415000915527, "learning_rate": 5.147268408551069e-06, "loss": 0.2843, "step": 7190 }, { "epoch": 1.5380504489097904, "grad_norm": 2.643326997756958, "learning_rate": 5.1353919239905e-06, "loss": 0.255, "step": 7195 }, { "epoch": 1.5391192817443353, "grad_norm": 3.9539496898651123, "learning_rate": 5.123515439429929e-06, "loss": 0.2278, "step": 7200 }, { "epoch": 1.5401881145788798, "grad_norm": 4.173327922821045, "learning_rate": 5.111638954869359e-06, "loss": 0.3137, "step": 7205 }, { "epoch": 1.5412569474134246, "grad_norm": 4.327914237976074, "learning_rate": 5.099762470308789e-06, "loss": 0.3365, "step": 7210 }, { "epoch": 1.5423257802479693, "grad_norm": 2.9048960208892822, "learning_rate": 5.087885985748218e-06, "loss": 0.1981, "step": 7215 }, { "epoch": 1.5433946130825138, "grad_norm": 4.26038932800293, "learning_rate": 5.076009501187649e-06, "loss": 0.2338, "step": 7220 }, { "epoch": 1.5444634459170585, "grad_norm": 5.362328052520752, "learning_rate": 5.064133016627079e-06, "loss": 0.2692, "step": 7225 }, { "epoch": 1.5455322787516033, "grad_norm": 4.408464431762695, "learning_rate": 5.052256532066508e-06, "loss": 0.2129, "step": 7230 }, { "epoch": 1.5466011115861478, "grad_norm": 5.237843990325928, "learning_rate": 5.040380047505938e-06, "loss": 0.2395, "step": 7235 }, { "epoch": 1.5476699444206927, "grad_norm": 6.1017045974731445, "learning_rate": 5.028503562945369e-06, "loss": 0.485, "step": 7240 }, { "epoch": 1.5487387772552372, "grad_norm": 5.8066582679748535, "learning_rate": 5.016627078384798e-06, "loss": 0.2166, "step": 7245 }, { "epoch": 1.549807610089782, "grad_norm": 6.7323899269104, "learning_rate": 5.004750593824228e-06, "loss": 0.2799, "step": 7250 }, { "epoch": 1.5508764429243267, "grad_norm": 4.477848052978516, "learning_rate": 4.9928741092636586e-06, "loss": 0.2856, "step": 7255 }, { "epoch": 1.5519452757588712, "grad_norm": 3.282881498336792, "learning_rate": 4.980997624703088e-06, "loss": 0.272, "step": 7260 }, { "epoch": 1.5530141085934162, "grad_norm": 4.757537364959717, "learning_rate": 4.969121140142518e-06, "loss": 0.299, "step": 7265 }, { "epoch": 1.5540829414279607, "grad_norm": 6.090857028961182, "learning_rate": 4.9572446555819485e-06, "loss": 0.3309, "step": 7270 }, { "epoch": 1.5551517742625054, "grad_norm": 3.326892137527466, "learning_rate": 4.945368171021378e-06, "loss": 0.223, "step": 7275 }, { "epoch": 1.5562206070970501, "grad_norm": 3.5346665382385254, "learning_rate": 4.933491686460808e-06, "loss": 0.2351, "step": 7280 }, { "epoch": 1.5572894399315946, "grad_norm": 3.1125802993774414, "learning_rate": 4.921615201900238e-06, "loss": 0.2177, "step": 7285 }, { "epoch": 1.5583582727661394, "grad_norm": 3.7614200115203857, "learning_rate": 4.909738717339667e-06, "loss": 0.2606, "step": 7290 }, { "epoch": 1.559427105600684, "grad_norm": 3.761014223098755, "learning_rate": 4.897862232779098e-06, "loss": 0.3972, "step": 7295 }, { "epoch": 1.5604959384352286, "grad_norm": 3.6661438941955566, "learning_rate": 4.885985748218528e-06, "loss": 0.2594, "step": 7300 }, { "epoch": 1.5615647712697736, "grad_norm": 4.455360412597656, "learning_rate": 4.874109263657958e-06, "loss": 0.2934, "step": 7305 }, { "epoch": 1.562633604104318, "grad_norm": 4.19691801071167, "learning_rate": 4.862232779097388e-06, "loss": 0.4105, "step": 7310 }, { "epoch": 1.5637024369388628, "grad_norm": 4.041048049926758, "learning_rate": 4.8503562945368175e-06, "loss": 0.1971, "step": 7315 }, { "epoch": 1.5647712697734075, "grad_norm": 3.2611756324768066, "learning_rate": 4.838479809976247e-06, "loss": 0.2107, "step": 7320 }, { "epoch": 1.565840102607952, "grad_norm": 3.419591188430786, "learning_rate": 4.826603325415678e-06, "loss": 0.2441, "step": 7325 }, { "epoch": 1.5669089354424968, "grad_norm": 4.567037105560303, "learning_rate": 4.8147268408551075e-06, "loss": 0.2413, "step": 7330 }, { "epoch": 1.5679777682770415, "grad_norm": 3.887484550476074, "learning_rate": 4.802850356294537e-06, "loss": 0.2619, "step": 7335 }, { "epoch": 1.569046601111586, "grad_norm": 4.95120906829834, "learning_rate": 4.790973871733967e-06, "loss": 0.3098, "step": 7340 }, { "epoch": 1.570115433946131, "grad_norm": 4.205053806304932, "learning_rate": 4.779097387173397e-06, "loss": 0.3024, "step": 7345 }, { "epoch": 1.5711842667806755, "grad_norm": 6.198763847351074, "learning_rate": 4.767220902612827e-06, "loss": 0.2548, "step": 7350 }, { "epoch": 1.5722530996152202, "grad_norm": 4.158599853515625, "learning_rate": 4.755344418052257e-06, "loss": 0.2925, "step": 7355 }, { "epoch": 1.573321932449765, "grad_norm": 3.3105695247650146, "learning_rate": 4.7434679334916866e-06, "loss": 0.2245, "step": 7360 }, { "epoch": 1.5743907652843094, "grad_norm": 2.852360963821411, "learning_rate": 4.731591448931116e-06, "loss": 0.2612, "step": 7365 }, { "epoch": 1.5754595981188542, "grad_norm": 5.082930564880371, "learning_rate": 4.719714964370547e-06, "loss": 0.4075, "step": 7370 }, { "epoch": 1.576528430953399, "grad_norm": 3.626047372817993, "learning_rate": 4.7078384798099765e-06, "loss": 0.2457, "step": 7375 }, { "epoch": 1.5775972637879434, "grad_norm": 3.2513113021850586, "learning_rate": 4.695961995249407e-06, "loss": 0.214, "step": 7380 }, { "epoch": 1.5786660966224884, "grad_norm": 4.396987438201904, "learning_rate": 4.684085510688837e-06, "loss": 0.2761, "step": 7385 }, { "epoch": 1.5797349294570329, "grad_norm": 4.177000045776367, "learning_rate": 4.6722090261282665e-06, "loss": 0.278, "step": 7390 }, { "epoch": 1.5808037622915776, "grad_norm": 6.472886562347412, "learning_rate": 4.660332541567696e-06, "loss": 0.4008, "step": 7395 }, { "epoch": 1.5818725951261223, "grad_norm": 5.244050979614258, "learning_rate": 4.648456057007127e-06, "loss": 0.3222, "step": 7400 }, { "epoch": 1.5829414279606668, "grad_norm": 3.3180673122406006, "learning_rate": 4.636579572446556e-06, "loss": 0.2645, "step": 7405 }, { "epoch": 1.5840102607952118, "grad_norm": 4.317756652832031, "learning_rate": 4.624703087885986e-06, "loss": 0.2121, "step": 7410 }, { "epoch": 1.5850790936297563, "grad_norm": 5.13472843170166, "learning_rate": 4.612826603325416e-06, "loss": 0.2679, "step": 7415 }, { "epoch": 1.586147926464301, "grad_norm": 4.850220680236816, "learning_rate": 4.6009501187648455e-06, "loss": 0.342, "step": 7420 }, { "epoch": 1.5872167592988458, "grad_norm": 3.7907469272613525, "learning_rate": 4.589073634204276e-06, "loss": 0.2312, "step": 7425 }, { "epoch": 1.5882855921333903, "grad_norm": 5.306363582611084, "learning_rate": 4.577197149643706e-06, "loss": 0.3332, "step": 7430 }, { "epoch": 1.589354424967935, "grad_norm": 4.227755069732666, "learning_rate": 4.5653206650831355e-06, "loss": 0.2628, "step": 7435 }, { "epoch": 1.5904232578024797, "grad_norm": 4.175191879272461, "learning_rate": 4.553444180522565e-06, "loss": 0.2824, "step": 7440 }, { "epoch": 1.5914920906370242, "grad_norm": 4.70232629776001, "learning_rate": 4.541567695961996e-06, "loss": 0.3249, "step": 7445 }, { "epoch": 1.5925609234715692, "grad_norm": 5.078143119812012, "learning_rate": 4.5296912114014254e-06, "loss": 0.3738, "step": 7450 }, { "epoch": 1.5936297563061137, "grad_norm": 3.0150363445281982, "learning_rate": 4.517814726840856e-06, "loss": 0.3357, "step": 7455 }, { "epoch": 1.5946985891406584, "grad_norm": 6.010279655456543, "learning_rate": 4.505938242280286e-06, "loss": 0.2563, "step": 7460 }, { "epoch": 1.5957674219752032, "grad_norm": 4.169801712036133, "learning_rate": 4.494061757719715e-06, "loss": 0.2091, "step": 7465 }, { "epoch": 1.5968362548097477, "grad_norm": 5.483653545379639, "learning_rate": 4.482185273159145e-06, "loss": 0.2425, "step": 7470 }, { "epoch": 1.5979050876442924, "grad_norm": 3.874551773071289, "learning_rate": 4.470308788598575e-06, "loss": 0.2699, "step": 7475 }, { "epoch": 1.5989739204788371, "grad_norm": 5.686993598937988, "learning_rate": 4.458432304038005e-06, "loss": 0.3409, "step": 7480 }, { "epoch": 1.6000427533133816, "grad_norm": 4.527751922607422, "learning_rate": 4.446555819477435e-06, "loss": 0.2233, "step": 7485 }, { "epoch": 1.6011115861479266, "grad_norm": 4.663357257843018, "learning_rate": 4.434679334916865e-06, "loss": 0.3269, "step": 7490 }, { "epoch": 1.602180418982471, "grad_norm": 5.009659767150879, "learning_rate": 4.4228028503562945e-06, "loss": 0.3029, "step": 7495 }, { "epoch": 1.6032492518170158, "grad_norm": 3.9787962436676025, "learning_rate": 4.410926365795725e-06, "loss": 0.2547, "step": 7500 }, { "epoch": 1.6043180846515606, "grad_norm": 5.281296253204346, "learning_rate": 4.399049881235155e-06, "loss": 0.2855, "step": 7505 }, { "epoch": 1.605386917486105, "grad_norm": 6.091033935546875, "learning_rate": 4.387173396674584e-06, "loss": 0.3106, "step": 7510 }, { "epoch": 1.6064557503206498, "grad_norm": 5.57248067855835, "learning_rate": 4.375296912114015e-06, "loss": 0.262, "step": 7515 }, { "epoch": 1.6075245831551945, "grad_norm": 4.538100242614746, "learning_rate": 4.363420427553445e-06, "loss": 0.3016, "step": 7520 }, { "epoch": 1.608593415989739, "grad_norm": 2.859865665435791, "learning_rate": 4.351543942992874e-06, "loss": 0.2852, "step": 7525 }, { "epoch": 1.609662248824284, "grad_norm": 4.841543197631836, "learning_rate": 4.339667458432305e-06, "loss": 0.3126, "step": 7530 }, { "epoch": 1.6107310816588285, "grad_norm": 4.134354114532471, "learning_rate": 4.327790973871735e-06, "loss": 0.2779, "step": 7535 }, { "epoch": 1.6117999144933732, "grad_norm": 5.4539875984191895, "learning_rate": 4.315914489311164e-06, "loss": 0.2811, "step": 7540 }, { "epoch": 1.612868747327918, "grad_norm": 4.018299579620361, "learning_rate": 4.304038004750594e-06, "loss": 0.259, "step": 7545 }, { "epoch": 1.6139375801624625, "grad_norm": 3.978214740753174, "learning_rate": 4.292161520190024e-06, "loss": 0.2602, "step": 7550 }, { "epoch": 1.6150064129970074, "grad_norm": 4.782619953155518, "learning_rate": 4.280285035629454e-06, "loss": 0.2481, "step": 7555 }, { "epoch": 1.616075245831552, "grad_norm": 4.34796142578125, "learning_rate": 4.268408551068884e-06, "loss": 0.2022, "step": 7560 }, { "epoch": 1.6171440786660967, "grad_norm": 4.58864688873291, "learning_rate": 4.256532066508314e-06, "loss": 0.2943, "step": 7565 }, { "epoch": 1.6182129115006414, "grad_norm": 3.2588422298431396, "learning_rate": 4.244655581947743e-06, "loss": 0.2144, "step": 7570 }, { "epoch": 1.619281744335186, "grad_norm": 4.609071731567383, "learning_rate": 4.232779097387174e-06, "loss": 0.2589, "step": 7575 }, { "epoch": 1.6203505771697306, "grad_norm": 3.8828067779541016, "learning_rate": 4.220902612826604e-06, "loss": 0.1999, "step": 7580 }, { "epoch": 1.6214194100042754, "grad_norm": 5.068613052368164, "learning_rate": 4.209026128266034e-06, "loss": 0.3035, "step": 7585 }, { "epoch": 1.6224882428388199, "grad_norm": 3.4416937828063965, "learning_rate": 4.197149643705464e-06, "loss": 0.2322, "step": 7590 }, { "epoch": 1.6235570756733648, "grad_norm": 4.246146202087402, "learning_rate": 4.185273159144894e-06, "loss": 0.237, "step": 7595 }, { "epoch": 1.6246259085079093, "grad_norm": 4.175546646118164, "learning_rate": 4.173396674584323e-06, "loss": 0.2815, "step": 7600 }, { "epoch": 1.625694741342454, "grad_norm": 5.142884254455566, "learning_rate": 4.161520190023753e-06, "loss": 0.4136, "step": 7605 }, { "epoch": 1.6267635741769988, "grad_norm": 4.261429309844971, "learning_rate": 4.1496437054631835e-06, "loss": 0.2474, "step": 7610 }, { "epoch": 1.6278324070115433, "grad_norm": 5.0894646644592285, "learning_rate": 4.137767220902613e-06, "loss": 0.3143, "step": 7615 }, { "epoch": 1.628901239846088, "grad_norm": 4.596246242523193, "learning_rate": 4.125890736342043e-06, "loss": 0.244, "step": 7620 }, { "epoch": 1.6299700726806328, "grad_norm": 4.05454158782959, "learning_rate": 4.114014251781473e-06, "loss": 0.3134, "step": 7625 }, { "epoch": 1.6310389055151773, "grad_norm": 5.604685306549072, "learning_rate": 4.102137767220903e-06, "loss": 0.2516, "step": 7630 }, { "epoch": 1.6321077383497222, "grad_norm": 2.5428969860076904, "learning_rate": 4.090261282660333e-06, "loss": 0.3159, "step": 7635 }, { "epoch": 1.6331765711842667, "grad_norm": 3.228505849838257, "learning_rate": 4.078384798099763e-06, "loss": 0.2519, "step": 7640 }, { "epoch": 1.6342454040188115, "grad_norm": 5.0502753257751465, "learning_rate": 4.066508313539192e-06, "loss": 0.2785, "step": 7645 }, { "epoch": 1.6353142368533562, "grad_norm": 3.824427366256714, "learning_rate": 4.054631828978622e-06, "loss": 0.2627, "step": 7650 }, { "epoch": 1.6363830696879007, "grad_norm": 4.460954666137695, "learning_rate": 4.0427553444180526e-06, "loss": 0.2677, "step": 7655 }, { "epoch": 1.6374519025224454, "grad_norm": 3.3890676498413086, "learning_rate": 4.030878859857483e-06, "loss": 0.1962, "step": 7660 }, { "epoch": 1.6385207353569902, "grad_norm": 4.556974411010742, "learning_rate": 4.019002375296913e-06, "loss": 0.3779, "step": 7665 }, { "epoch": 1.639589568191535, "grad_norm": 3.9803950786590576, "learning_rate": 4.0071258907363425e-06, "loss": 0.2731, "step": 7670 }, { "epoch": 1.6406584010260796, "grad_norm": 3.7230427265167236, "learning_rate": 3.995249406175772e-06, "loss": 0.2806, "step": 7675 }, { "epoch": 1.6417272338606241, "grad_norm": 3.6325037479400635, "learning_rate": 3.983372921615202e-06, "loss": 0.2582, "step": 7680 }, { "epoch": 1.6427960666951689, "grad_norm": 4.024942398071289, "learning_rate": 3.9714964370546325e-06, "loss": 0.2064, "step": 7685 }, { "epoch": 1.6438648995297136, "grad_norm": 4.7745819091796875, "learning_rate": 3.959619952494062e-06, "loss": 0.263, "step": 7690 }, { "epoch": 1.644933732364258, "grad_norm": 3.8132996559143066, "learning_rate": 3.947743467933492e-06, "loss": 0.3126, "step": 7695 }, { "epoch": 1.646002565198803, "grad_norm": 3.711763620376587, "learning_rate": 3.9358669833729216e-06, "loss": 0.2889, "step": 7700 }, { "epoch": 1.6470713980333476, "grad_norm": 3.696894645690918, "learning_rate": 3.923990498812352e-06, "loss": 0.2372, "step": 7705 }, { "epoch": 1.6481402308678923, "grad_norm": 5.242607593536377, "learning_rate": 3.912114014251782e-06, "loss": 0.2062, "step": 7710 }, { "epoch": 1.649209063702437, "grad_norm": 3.8635284900665283, "learning_rate": 3.9002375296912115e-06, "loss": 0.3085, "step": 7715 }, { "epoch": 1.6502778965369815, "grad_norm": 4.494617938995361, "learning_rate": 3.888361045130641e-06, "loss": 0.22, "step": 7720 }, { "epoch": 1.6513467293715263, "grad_norm": 5.683468818664551, "learning_rate": 3.876484560570072e-06, "loss": 0.258, "step": 7725 }, { "epoch": 1.652415562206071, "grad_norm": 7.1560845375061035, "learning_rate": 3.8646080760095015e-06, "loss": 0.2496, "step": 7730 }, { "epoch": 1.6534843950406155, "grad_norm": 4.27496337890625, "learning_rate": 3.852731591448932e-06, "loss": 0.2975, "step": 7735 }, { "epoch": 1.6545532278751605, "grad_norm": 5.494519233703613, "learning_rate": 3.840855106888362e-06, "loss": 0.2744, "step": 7740 }, { "epoch": 1.655622060709705, "grad_norm": 4.088238716125488, "learning_rate": 3.8289786223277914e-06, "loss": 0.2255, "step": 7745 }, { "epoch": 1.6566908935442497, "grad_norm": 3.627351760864258, "learning_rate": 3.817102137767221e-06, "loss": 0.2387, "step": 7750 }, { "epoch": 1.6577597263787944, "grad_norm": 4.195761680603027, "learning_rate": 3.8052256532066513e-06, "loss": 0.2724, "step": 7755 }, { "epoch": 1.658828559213339, "grad_norm": 4.758053779602051, "learning_rate": 3.793349168646081e-06, "loss": 0.282, "step": 7760 }, { "epoch": 1.6598973920478837, "grad_norm": 3.427823066711426, "learning_rate": 3.781472684085511e-06, "loss": 0.166, "step": 7765 }, { "epoch": 1.6609662248824284, "grad_norm": 4.784726142883301, "learning_rate": 3.769596199524941e-06, "loss": 0.2653, "step": 7770 }, { "epoch": 1.662035057716973, "grad_norm": 4.018444538116455, "learning_rate": 3.757719714964371e-06, "loss": 0.2368, "step": 7775 }, { "epoch": 1.6631038905515179, "grad_norm": 4.532012462615967, "learning_rate": 3.7458432304038006e-06, "loss": 0.2235, "step": 7780 }, { "epoch": 1.6641727233860624, "grad_norm": 4.576938152313232, "learning_rate": 3.7339667458432303e-06, "loss": 0.309, "step": 7785 }, { "epoch": 1.665241556220607, "grad_norm": 4.126202583312988, "learning_rate": 3.7220902612826604e-06, "loss": 0.3402, "step": 7790 }, { "epoch": 1.6663103890551518, "grad_norm": 5.895056247711182, "learning_rate": 3.710213776722091e-06, "loss": 0.2741, "step": 7795 }, { "epoch": 1.6673792218896963, "grad_norm": 5.252209663391113, "learning_rate": 3.6983372921615207e-06, "loss": 0.2282, "step": 7800 }, { "epoch": 1.6684480547242413, "grad_norm": 5.411665439605713, "learning_rate": 3.6864608076009504e-06, "loss": 0.3233, "step": 7805 }, { "epoch": 1.6695168875587858, "grad_norm": 3.801215887069702, "learning_rate": 3.6745843230403805e-06, "loss": 0.23, "step": 7810 }, { "epoch": 1.6705857203933305, "grad_norm": 5.455605983734131, "learning_rate": 3.6627078384798102e-06, "loss": 0.22, "step": 7815 }, { "epoch": 1.6716545532278753, "grad_norm": 3.8827927112579346, "learning_rate": 3.6508313539192404e-06, "loss": 0.216, "step": 7820 }, { "epoch": 1.6727233860624198, "grad_norm": 3.9195375442504883, "learning_rate": 3.63895486935867e-06, "loss": 0.3479, "step": 7825 }, { "epoch": 1.6737922188969645, "grad_norm": 4.495283603668213, "learning_rate": 3.6270783847981e-06, "loss": 0.2256, "step": 7830 }, { "epoch": 1.6748610517315092, "grad_norm": 5.642339706420898, "learning_rate": 3.61520190023753e-06, "loss": 0.3205, "step": 7835 }, { "epoch": 1.6759298845660537, "grad_norm": 5.5151495933532715, "learning_rate": 3.60332541567696e-06, "loss": 0.28, "step": 7840 }, { "epoch": 1.6769987174005987, "grad_norm": 3.8195252418518066, "learning_rate": 3.5914489311163897e-06, "loss": 0.255, "step": 7845 }, { "epoch": 1.6780675502351432, "grad_norm": 5.310424327850342, "learning_rate": 3.5795724465558194e-06, "loss": 0.2584, "step": 7850 }, { "epoch": 1.679136383069688, "grad_norm": 5.491156101226807, "learning_rate": 3.5676959619952495e-06, "loss": 0.254, "step": 7855 }, { "epoch": 1.6802052159042327, "grad_norm": 4.094849109649658, "learning_rate": 3.5558194774346792e-06, "loss": 0.2051, "step": 7860 }, { "epoch": 1.6812740487387772, "grad_norm": 3.9543018341064453, "learning_rate": 3.54394299287411e-06, "loss": 0.2653, "step": 7865 }, { "epoch": 1.682342881573322, "grad_norm": 4.145587921142578, "learning_rate": 3.5320665083135395e-06, "loss": 0.2882, "step": 7870 }, { "epoch": 1.6834117144078666, "grad_norm": 3.4505057334899902, "learning_rate": 3.5201900237529696e-06, "loss": 0.2685, "step": 7875 }, { "epoch": 1.6844805472424111, "grad_norm": 4.536677837371826, "learning_rate": 3.5083135391923993e-06, "loss": 0.2606, "step": 7880 }, { "epoch": 1.685549380076956, "grad_norm": 5.157629013061523, "learning_rate": 3.4964370546318295e-06, "loss": 0.2266, "step": 7885 }, { "epoch": 1.6866182129115006, "grad_norm": 4.595909595489502, "learning_rate": 3.484560570071259e-06, "loss": 0.2112, "step": 7890 }, { "epoch": 1.6876870457460453, "grad_norm": 4.331202030181885, "learning_rate": 3.4726840855106893e-06, "loss": 0.245, "step": 7895 }, { "epoch": 1.68875587858059, "grad_norm": 5.239740371704102, "learning_rate": 3.460807600950119e-06, "loss": 0.2144, "step": 7900 }, { "epoch": 1.6898247114151346, "grad_norm": 3.1925699710845947, "learning_rate": 3.448931116389549e-06, "loss": 0.3334, "step": 7905 }, { "epoch": 1.6908935442496793, "grad_norm": 3.5667247772216797, "learning_rate": 3.437054631828979e-06, "loss": 0.2754, "step": 7910 }, { "epoch": 1.691962377084224, "grad_norm": 4.145174026489258, "learning_rate": 3.4251781472684085e-06, "loss": 0.3048, "step": 7915 }, { "epoch": 1.6930312099187685, "grad_norm": 3.559020519256592, "learning_rate": 3.4133016627078386e-06, "loss": 0.2319, "step": 7920 }, { "epoch": 1.6941000427533135, "grad_norm": 3.1762850284576416, "learning_rate": 3.4014251781472683e-06, "loss": 0.3505, "step": 7925 }, { "epoch": 1.695168875587858, "grad_norm": 4.600183963775635, "learning_rate": 3.3895486935866985e-06, "loss": 0.3171, "step": 7930 }, { "epoch": 1.6962377084224027, "grad_norm": 4.069181442260742, "learning_rate": 3.3776722090261286e-06, "loss": 0.2359, "step": 7935 }, { "epoch": 1.6973065412569475, "grad_norm": 5.979001998901367, "learning_rate": 3.3657957244655587e-06, "loss": 0.259, "step": 7940 }, { "epoch": 1.698375374091492, "grad_norm": 4.2909040451049805, "learning_rate": 3.3539192399049884e-06, "loss": 0.2345, "step": 7945 }, { "epoch": 1.699444206926037, "grad_norm": 4.572742938995361, "learning_rate": 3.3420427553444185e-06, "loss": 0.2364, "step": 7950 }, { "epoch": 1.7005130397605814, "grad_norm": 4.979130744934082, "learning_rate": 3.3301662707838482e-06, "loss": 0.3125, "step": 7955 }, { "epoch": 1.7015818725951262, "grad_norm": 8.828888893127441, "learning_rate": 3.3182897862232784e-06, "loss": 0.3855, "step": 7960 }, { "epoch": 1.702650705429671, "grad_norm": 3.5113627910614014, "learning_rate": 3.306413301662708e-06, "loss": 0.3085, "step": 7965 }, { "epoch": 1.7037195382642154, "grad_norm": 3.138580322265625, "learning_rate": 3.294536817102138e-06, "loss": 0.2558, "step": 7970 }, { "epoch": 1.7047883710987601, "grad_norm": 3.382124900817871, "learning_rate": 3.282660332541568e-06, "loss": 0.2863, "step": 7975 }, { "epoch": 1.7058572039333049, "grad_norm": 4.64111328125, "learning_rate": 3.2707838479809976e-06, "loss": 0.2763, "step": 7980 }, { "epoch": 1.7069260367678494, "grad_norm": 3.9928252696990967, "learning_rate": 3.2589073634204277e-06, "loss": 0.2307, "step": 7985 }, { "epoch": 1.7079948696023943, "grad_norm": 4.402683258056641, "learning_rate": 3.2470308788598574e-06, "loss": 0.2604, "step": 7990 }, { "epoch": 1.7090637024369388, "grad_norm": 4.634458541870117, "learning_rate": 3.2351543942992876e-06, "loss": 0.3524, "step": 7995 }, { "epoch": 1.7101325352714836, "grad_norm": 3.9876441955566406, "learning_rate": 3.2232779097387173e-06, "loss": 0.2591, "step": 8000 }, { "epoch": 1.7112013681060283, "grad_norm": 5.491477012634277, "learning_rate": 3.211401425178148e-06, "loss": 0.2945, "step": 8005 }, { "epoch": 1.7122702009405728, "grad_norm": 3.348909378051758, "learning_rate": 3.1995249406175775e-06, "loss": 0.2698, "step": 8010 }, { "epoch": 1.7133390337751175, "grad_norm": 2.3808627128601074, "learning_rate": 3.1876484560570076e-06, "loss": 0.2091, "step": 8015 }, { "epoch": 1.7144078666096623, "grad_norm": 4.511120319366455, "learning_rate": 3.1757719714964373e-06, "loss": 0.2313, "step": 8020 }, { "epoch": 1.7154766994442068, "grad_norm": 3.1614320278167725, "learning_rate": 3.1638954869358675e-06, "loss": 0.2935, "step": 8025 }, { "epoch": 1.7165455322787517, "grad_norm": 4.708336353302002, "learning_rate": 3.152019002375297e-06, "loss": 0.2358, "step": 8030 }, { "epoch": 1.7176143651132962, "grad_norm": 5.274806499481201, "learning_rate": 3.1401425178147273e-06, "loss": 0.2842, "step": 8035 }, { "epoch": 1.718683197947841, "grad_norm": 4.673067569732666, "learning_rate": 3.128266033254157e-06, "loss": 0.26, "step": 8040 }, { "epoch": 1.7197520307823857, "grad_norm": 7.412868499755859, "learning_rate": 3.1163895486935867e-06, "loss": 0.312, "step": 8045 }, { "epoch": 1.7208208636169302, "grad_norm": 5.098508834838867, "learning_rate": 3.104513064133017e-06, "loss": 0.2776, "step": 8050 }, { "epoch": 1.721889696451475, "grad_norm": 2.9823100566864014, "learning_rate": 3.0926365795724465e-06, "loss": 0.1612, "step": 8055 }, { "epoch": 1.7229585292860197, "grad_norm": 3.906702995300293, "learning_rate": 3.0807600950118767e-06, "loss": 0.1803, "step": 8060 }, { "epoch": 1.7240273621205642, "grad_norm": 4.462987899780273, "learning_rate": 3.0688836104513064e-06, "loss": 0.2677, "step": 8065 }, { "epoch": 1.7250961949551091, "grad_norm": 3.3349108695983887, "learning_rate": 3.0570071258907365e-06, "loss": 0.2315, "step": 8070 }, { "epoch": 1.7261650277896536, "grad_norm": 3.8888843059539795, "learning_rate": 3.0451306413301666e-06, "loss": 0.2583, "step": 8075 }, { "epoch": 1.7272338606241984, "grad_norm": 3.5807013511657715, "learning_rate": 3.0332541567695967e-06, "loss": 0.2488, "step": 8080 }, { "epoch": 1.728302693458743, "grad_norm": 4.443240165710449, "learning_rate": 3.0213776722090264e-06, "loss": 0.2379, "step": 8085 }, { "epoch": 1.7293715262932876, "grad_norm": 4.572385311126709, "learning_rate": 3.0095011876484566e-06, "loss": 0.2637, "step": 8090 }, { "epoch": 1.7304403591278326, "grad_norm": 3.8426921367645264, "learning_rate": 2.9976247030878863e-06, "loss": 0.2101, "step": 8095 }, { "epoch": 1.731509191962377, "grad_norm": 3.6695351600646973, "learning_rate": 2.9857482185273164e-06, "loss": 0.283, "step": 8100 }, { "epoch": 1.7325780247969218, "grad_norm": 4.494965076446533, "learning_rate": 2.973871733966746e-06, "loss": 0.2163, "step": 8105 }, { "epoch": 1.7336468576314665, "grad_norm": 4.575949192047119, "learning_rate": 2.961995249406176e-06, "loss": 0.2474, "step": 8110 }, { "epoch": 1.734715690466011, "grad_norm": 5.060282230377197, "learning_rate": 2.950118764845606e-06, "loss": 0.3346, "step": 8115 }, { "epoch": 1.7357845233005558, "grad_norm": 5.1213274002075195, "learning_rate": 2.9382422802850356e-06, "loss": 0.2031, "step": 8120 }, { "epoch": 1.7368533561351005, "grad_norm": 4.754722595214844, "learning_rate": 2.9263657957244658e-06, "loss": 0.2301, "step": 8125 }, { "epoch": 1.737922188969645, "grad_norm": 3.7561304569244385, "learning_rate": 2.9144893111638955e-06, "loss": 0.3413, "step": 8130 }, { "epoch": 1.73899102180419, "grad_norm": 4.434960842132568, "learning_rate": 2.9026128266033256e-06, "loss": 0.3121, "step": 8135 }, { "epoch": 1.7400598546387345, "grad_norm": 3.5216495990753174, "learning_rate": 2.8907363420427553e-06, "loss": 0.2696, "step": 8140 }, { "epoch": 1.7411286874732792, "grad_norm": 3.2195262908935547, "learning_rate": 2.878859857482186e-06, "loss": 0.1871, "step": 8145 }, { "epoch": 1.742197520307824, "grad_norm": 2.6963675022125244, "learning_rate": 2.8669833729216155e-06, "loss": 0.2427, "step": 8150 }, { "epoch": 1.7432663531423684, "grad_norm": 3.3632442951202393, "learning_rate": 2.8551068883610457e-06, "loss": 0.215, "step": 8155 }, { "epoch": 1.7443351859769132, "grad_norm": 4.627504825592041, "learning_rate": 2.8432304038004754e-06, "loss": 0.2603, "step": 8160 }, { "epoch": 1.745404018811458, "grad_norm": 4.896625995635986, "learning_rate": 2.8313539192399055e-06, "loss": 0.2149, "step": 8165 }, { "epoch": 1.7464728516460024, "grad_norm": 3.6175167560577393, "learning_rate": 2.819477434679335e-06, "loss": 0.2961, "step": 8170 }, { "epoch": 1.7475416844805474, "grad_norm": 2.9704079627990723, "learning_rate": 2.807600950118765e-06, "loss": 0.2363, "step": 8175 }, { "epoch": 1.7486105173150919, "grad_norm": 5.211386203765869, "learning_rate": 2.795724465558195e-06, "loss": 0.2238, "step": 8180 }, { "epoch": 1.7496793501496366, "grad_norm": 4.538329601287842, "learning_rate": 2.7838479809976247e-06, "loss": 0.2522, "step": 8185 }, { "epoch": 1.7507481829841813, "grad_norm": 4.693541049957275, "learning_rate": 2.771971496437055e-06, "loss": 0.2314, "step": 8190 }, { "epoch": 1.7518170158187258, "grad_norm": 6.232285499572754, "learning_rate": 2.7600950118764846e-06, "loss": 0.34, "step": 8195 }, { "epoch": 1.7528858486532708, "grad_norm": 3.3624300956726074, "learning_rate": 2.7482185273159147e-06, "loss": 0.3113, "step": 8200 }, { "epoch": 1.7539546814878153, "grad_norm": 4.9479193687438965, "learning_rate": 2.7363420427553444e-06, "loss": 0.2022, "step": 8205 }, { "epoch": 1.75502351432236, "grad_norm": 2.4150469303131104, "learning_rate": 2.7244655581947745e-06, "loss": 0.2244, "step": 8210 }, { "epoch": 1.7560923471569048, "grad_norm": 2.7240800857543945, "learning_rate": 2.7125890736342046e-06, "loss": 0.1794, "step": 8215 }, { "epoch": 1.7571611799914493, "grad_norm": 5.584763526916504, "learning_rate": 2.7007125890736348e-06, "loss": 0.3058, "step": 8220 }, { "epoch": 1.758230012825994, "grad_norm": 6.3999505043029785, "learning_rate": 2.6888361045130645e-06, "loss": 0.2495, "step": 8225 }, { "epoch": 1.7592988456605387, "grad_norm": 3.8963570594787598, "learning_rate": 2.6769596199524946e-06, "loss": 0.3586, "step": 8230 }, { "epoch": 1.7603676784950832, "grad_norm": 4.307738780975342, "learning_rate": 2.6650831353919243e-06, "loss": 0.2726, "step": 8235 }, { "epoch": 1.7614365113296282, "grad_norm": 4.685522079467773, "learning_rate": 2.653206650831354e-06, "loss": 0.2774, "step": 8240 }, { "epoch": 1.7625053441641727, "grad_norm": 3.685218095779419, "learning_rate": 2.641330166270784e-06, "loss": 0.3009, "step": 8245 }, { "epoch": 1.7635741769987174, "grad_norm": 3.9087140560150146, "learning_rate": 2.629453681710214e-06, "loss": 0.2813, "step": 8250 }, { "epoch": 1.7646430098332622, "grad_norm": 4.455633163452148, "learning_rate": 2.617577197149644e-06, "loss": 0.2942, "step": 8255 }, { "epoch": 1.7657118426678067, "grad_norm": 3.3832907676696777, "learning_rate": 2.6057007125890737e-06, "loss": 0.2463, "step": 8260 }, { "epoch": 1.7667806755023514, "grad_norm": 4.235377788543701, "learning_rate": 2.5938242280285038e-06, "loss": 0.2399, "step": 8265 }, { "epoch": 1.7678495083368961, "grad_norm": 5.997225761413574, "learning_rate": 2.5819477434679335e-06, "loss": 0.2725, "step": 8270 }, { "epoch": 1.7689183411714406, "grad_norm": 3.9668803215026855, "learning_rate": 2.5700712589073636e-06, "loss": 0.2171, "step": 8275 }, { "epoch": 1.7699871740059856, "grad_norm": 6.379711151123047, "learning_rate": 2.5581947743467933e-06, "loss": 0.3037, "step": 8280 }, { "epoch": 1.77105600684053, "grad_norm": 4.1840901374816895, "learning_rate": 2.546318289786224e-06, "loss": 0.1921, "step": 8285 }, { "epoch": 1.7721248396750748, "grad_norm": 3.4607646465301514, "learning_rate": 2.5344418052256536e-06, "loss": 0.2519, "step": 8290 }, { "epoch": 1.7731936725096196, "grad_norm": 4.899019241333008, "learning_rate": 2.5225653206650837e-06, "loss": 0.2644, "step": 8295 }, { "epoch": 1.774262505344164, "grad_norm": 3.769134283065796, "learning_rate": 2.5106888361045134e-06, "loss": 0.3707, "step": 8300 }, { "epoch": 1.7753313381787088, "grad_norm": 3.0456831455230713, "learning_rate": 2.4988123515439435e-06, "loss": 0.1496, "step": 8305 }, { "epoch": 1.7764001710132535, "grad_norm": 4.198024749755859, "learning_rate": 2.4869358669833732e-06, "loss": 0.2251, "step": 8310 }, { "epoch": 1.777469003847798, "grad_norm": 3.964083194732666, "learning_rate": 2.475059382422803e-06, "loss": 0.2426, "step": 8315 }, { "epoch": 1.778537836682343, "grad_norm": 4.519120216369629, "learning_rate": 2.463182897862233e-06, "loss": 0.2853, "step": 8320 }, { "epoch": 1.7796066695168875, "grad_norm": 4.322653293609619, "learning_rate": 2.4513064133016627e-06, "loss": 0.2156, "step": 8325 }, { "epoch": 1.7806755023514322, "grad_norm": 2.6961798667907715, "learning_rate": 2.439429928741093e-06, "loss": 0.2293, "step": 8330 }, { "epoch": 1.781744335185977, "grad_norm": 4.139772415161133, "learning_rate": 2.4275534441805226e-06, "loss": 0.2516, "step": 8335 }, { "epoch": 1.7828131680205215, "grad_norm": 3.3040573596954346, "learning_rate": 2.4156769596199527e-06, "loss": 0.2272, "step": 8340 }, { "epoch": 1.7838820008550664, "grad_norm": 4.51014518737793, "learning_rate": 2.403800475059383e-06, "loss": 0.2995, "step": 8345 }, { "epoch": 1.784950833689611, "grad_norm": 3.647020101547241, "learning_rate": 2.3919239904988125e-06, "loss": 0.2825, "step": 8350 }, { "epoch": 1.7860196665241557, "grad_norm": 3.456620931625366, "learning_rate": 2.3800475059382427e-06, "loss": 0.2604, "step": 8355 }, { "epoch": 1.7870884993587004, "grad_norm": 5.626756191253662, "learning_rate": 2.3681710213776724e-06, "loss": 0.2216, "step": 8360 }, { "epoch": 1.788157332193245, "grad_norm": 4.277560710906982, "learning_rate": 2.356294536817102e-06, "loss": 0.3034, "step": 8365 }, { "epoch": 1.7892261650277896, "grad_norm": 2.8576090335845947, "learning_rate": 2.344418052256532e-06, "loss": 0.229, "step": 8370 }, { "epoch": 1.7902949978623344, "grad_norm": 4.79686975479126, "learning_rate": 2.3325415676959623e-06, "loss": 0.271, "step": 8375 }, { "epoch": 1.7913638306968789, "grad_norm": 5.135036945343018, "learning_rate": 2.320665083135392e-06, "loss": 0.2371, "step": 8380 }, { "epoch": 1.7924326635314238, "grad_norm": 5.7761406898498535, "learning_rate": 2.308788598574822e-06, "loss": 0.2592, "step": 8385 }, { "epoch": 1.7935014963659683, "grad_norm": 2.8430325984954834, "learning_rate": 2.296912114014252e-06, "loss": 0.206, "step": 8390 }, { "epoch": 1.794570329200513, "grad_norm": 4.540223598480225, "learning_rate": 2.285035629453682e-06, "loss": 0.2167, "step": 8395 }, { "epoch": 1.7956391620350578, "grad_norm": 4.889501094818115, "learning_rate": 2.2731591448931117e-06, "loss": 0.2521, "step": 8400 }, { "epoch": 1.7967079948696023, "grad_norm": 3.3274142742156982, "learning_rate": 2.261282660332542e-06, "loss": 0.2283, "step": 8405 }, { "epoch": 1.797776827704147, "grad_norm": 3.501002073287964, "learning_rate": 2.249406175771972e-06, "loss": 0.2161, "step": 8410 }, { "epoch": 1.7988456605386918, "grad_norm": 2.9936413764953613, "learning_rate": 2.2375296912114016e-06, "loss": 0.2233, "step": 8415 }, { "epoch": 1.7999144933732363, "grad_norm": 4.086530685424805, "learning_rate": 2.2256532066508318e-06, "loss": 0.2799, "step": 8420 }, { "epoch": 1.8009833262077812, "grad_norm": 4.791090965270996, "learning_rate": 2.2137767220902615e-06, "loss": 0.2558, "step": 8425 }, { "epoch": 1.8020521590423257, "grad_norm": 4.07485294342041, "learning_rate": 2.201900237529691e-06, "loss": 0.3093, "step": 8430 }, { "epoch": 1.8031209918768705, "grad_norm": 4.454413414001465, "learning_rate": 2.1900237529691213e-06, "loss": 0.2751, "step": 8435 }, { "epoch": 1.8041898247114152, "grad_norm": 4.849613666534424, "learning_rate": 2.178147268408551e-06, "loss": 0.268, "step": 8440 }, { "epoch": 1.8052586575459597, "grad_norm": 4.424874782562256, "learning_rate": 2.166270783847981e-06, "loss": 0.2473, "step": 8445 }, { "epoch": 1.8063274903805044, "grad_norm": 5.070244789123535, "learning_rate": 2.1543942992874112e-06, "loss": 0.3218, "step": 8450 }, { "epoch": 1.8073963232150492, "grad_norm": 4.407561302185059, "learning_rate": 2.142517814726841e-06, "loss": 0.2602, "step": 8455 }, { "epoch": 1.8084651560495937, "grad_norm": 3.2732160091400146, "learning_rate": 2.130641330166271e-06, "loss": 0.2118, "step": 8460 }, { "epoch": 1.8095339888841386, "grad_norm": 6.757079124450684, "learning_rate": 2.1187648456057008e-06, "loss": 0.2526, "step": 8465 }, { "epoch": 1.8106028217186831, "grad_norm": 3.9517734050750732, "learning_rate": 2.106888361045131e-06, "loss": 0.2797, "step": 8470 }, { "epoch": 1.8116716545532279, "grad_norm": 3.6137807369232178, "learning_rate": 2.0950118764845606e-06, "loss": 0.2177, "step": 8475 }, { "epoch": 1.8127404873877726, "grad_norm": 3.5731587409973145, "learning_rate": 2.0831353919239907e-06, "loss": 0.2264, "step": 8480 }, { "epoch": 1.8138093202223171, "grad_norm": 4.859638690948486, "learning_rate": 2.071258907363421e-06, "loss": 0.253, "step": 8485 }, { "epoch": 1.814878153056862, "grad_norm": 4.231696605682373, "learning_rate": 2.0593824228028506e-06, "loss": 0.199, "step": 8490 }, { "epoch": 1.8159469858914066, "grad_norm": 3.7343459129333496, "learning_rate": 2.0475059382422803e-06, "loss": 0.2484, "step": 8495 }, { "epoch": 1.8170158187259513, "grad_norm": 4.6749958992004395, "learning_rate": 2.0356294536817104e-06, "loss": 0.2666, "step": 8500 }, { "epoch": 1.818084651560496, "grad_norm": 3.5160164833068848, "learning_rate": 2.02375296912114e-06, "loss": 0.2423, "step": 8505 }, { "epoch": 1.8191534843950405, "grad_norm": 5.324501037597656, "learning_rate": 2.01187648456057e-06, "loss": 0.3206, "step": 8510 }, { "epoch": 1.8202223172295853, "grad_norm": 4.3562092781066895, "learning_rate": 2.0000000000000003e-06, "loss": 0.1885, "step": 8515 }, { "epoch": 1.82129115006413, "grad_norm": 3.188398838043213, "learning_rate": 1.98812351543943e-06, "loss": 0.3103, "step": 8520 }, { "epoch": 1.8223599828986745, "grad_norm": 3.9081082344055176, "learning_rate": 1.97624703087886e-06, "loss": 0.2204, "step": 8525 }, { "epoch": 1.8234288157332195, "grad_norm": 4.220818519592285, "learning_rate": 1.96437054631829e-06, "loss": 0.2269, "step": 8530 }, { "epoch": 1.824497648567764, "grad_norm": 4.2256035804748535, "learning_rate": 1.95249406175772e-06, "loss": 0.3602, "step": 8535 }, { "epoch": 1.8255664814023087, "grad_norm": 3.092357635498047, "learning_rate": 1.9406175771971497e-06, "loss": 0.1735, "step": 8540 }, { "epoch": 1.8266353142368534, "grad_norm": 5.8758649826049805, "learning_rate": 1.9287410926365794e-06, "loss": 0.3107, "step": 8545 }, { "epoch": 1.827704147071398, "grad_norm": 4.43316650390625, "learning_rate": 1.91686460807601e-06, "loss": 0.2346, "step": 8550 }, { "epoch": 1.8287729799059427, "grad_norm": 4.877310276031494, "learning_rate": 1.9049881235154396e-06, "loss": 0.2372, "step": 8555 }, { "epoch": 1.8298418127404874, "grad_norm": 5.378355026245117, "learning_rate": 1.8931116389548696e-06, "loss": 0.2665, "step": 8560 }, { "epoch": 1.830910645575032, "grad_norm": 4.576028347015381, "learning_rate": 1.8812351543942995e-06, "loss": 0.2722, "step": 8565 }, { "epoch": 1.8319794784095769, "grad_norm": 2.452864646911621, "learning_rate": 1.8693586698337294e-06, "loss": 0.1879, "step": 8570 }, { "epoch": 1.8330483112441214, "grad_norm": 4.013648509979248, "learning_rate": 1.8574821852731593e-06, "loss": 0.2434, "step": 8575 }, { "epoch": 1.834117144078666, "grad_norm": 4.295891761779785, "learning_rate": 1.845605700712589e-06, "loss": 0.3408, "step": 8580 }, { "epoch": 1.8351859769132108, "grad_norm": 5.399013042449951, "learning_rate": 1.8337292161520193e-06, "loss": 0.2819, "step": 8585 }, { "epoch": 1.8362548097477553, "grad_norm": 5.267197608947754, "learning_rate": 1.8218527315914493e-06, "loss": 0.2828, "step": 8590 }, { "epoch": 1.8373236425823, "grad_norm": 4.1791672706604, "learning_rate": 1.809976247030879e-06, "loss": 0.1821, "step": 8595 }, { "epoch": 1.8383924754168448, "grad_norm": 4.158424377441406, "learning_rate": 1.7980997624703089e-06, "loss": 0.2193, "step": 8600 }, { "epoch": 1.8394613082513895, "grad_norm": 3.101128101348877, "learning_rate": 1.7862232779097388e-06, "loss": 0.3155, "step": 8605 }, { "epoch": 1.8405301410859343, "grad_norm": 3.661057233810425, "learning_rate": 1.7743467933491687e-06, "loss": 0.2129, "step": 8610 }, { "epoch": 1.8415989739204788, "grad_norm": 3.7547378540039062, "learning_rate": 1.7624703087885986e-06, "loss": 0.2321, "step": 8615 }, { "epoch": 1.8426678067550235, "grad_norm": 4.53202486038208, "learning_rate": 1.7505938242280287e-06, "loss": 0.1748, "step": 8620 }, { "epoch": 1.8437366395895682, "grad_norm": 3.7189040184020996, "learning_rate": 1.7387173396674587e-06, "loss": 0.2355, "step": 8625 }, { "epoch": 1.8448054724241127, "grad_norm": 5.827390670776367, "learning_rate": 1.7268408551068886e-06, "loss": 0.2226, "step": 8630 }, { "epoch": 1.8458743052586577, "grad_norm": 4.365615367889404, "learning_rate": 1.7149643705463185e-06, "loss": 0.2812, "step": 8635 }, { "epoch": 1.8469431380932022, "grad_norm": 4.593905925750732, "learning_rate": 1.7030878859857484e-06, "loss": 0.2542, "step": 8640 }, { "epoch": 1.848011970927747, "grad_norm": 4.3599419593811035, "learning_rate": 1.691211401425178e-06, "loss": 0.214, "step": 8645 }, { "epoch": 1.8490808037622917, "grad_norm": 5.342328071594238, "learning_rate": 1.679334916864608e-06, "loss": 0.2157, "step": 8650 }, { "epoch": 1.8501496365968362, "grad_norm": 3.1678943634033203, "learning_rate": 1.6674584323040384e-06, "loss": 0.2336, "step": 8655 }, { "epoch": 1.851218469431381, "grad_norm": 4.464089870452881, "learning_rate": 1.655581947743468e-06, "loss": 0.3409, "step": 8660 }, { "epoch": 1.8522873022659256, "grad_norm": 4.1919755935668945, "learning_rate": 1.643705463182898e-06, "loss": 0.2902, "step": 8665 }, { "epoch": 1.8533561351004701, "grad_norm": 3.814858913421631, "learning_rate": 1.6318289786223279e-06, "loss": 0.2329, "step": 8670 }, { "epoch": 1.854424967935015, "grad_norm": 3.2706382274627686, "learning_rate": 1.6199524940617578e-06, "loss": 0.1849, "step": 8675 }, { "epoch": 1.8554938007695596, "grad_norm": 3.6442952156066895, "learning_rate": 1.6080760095011877e-06, "loss": 0.2919, "step": 8680 }, { "epoch": 1.8565626336041043, "grad_norm": 3.179872512817383, "learning_rate": 1.5961995249406176e-06, "loss": 0.2157, "step": 8685 }, { "epoch": 1.857631466438649, "grad_norm": 3.71156644821167, "learning_rate": 1.5843230403800478e-06, "loss": 0.2286, "step": 8690 }, { "epoch": 1.8587002992731936, "grad_norm": 5.000162124633789, "learning_rate": 1.5724465558194777e-06, "loss": 0.1957, "step": 8695 }, { "epoch": 1.8597691321077383, "grad_norm": 3.7217514514923096, "learning_rate": 1.5605700712589076e-06, "loss": 0.2102, "step": 8700 }, { "epoch": 1.860837964942283, "grad_norm": 5.23848295211792, "learning_rate": 1.5486935866983375e-06, "loss": 0.3172, "step": 8705 }, { "epoch": 1.8619067977768275, "grad_norm": 3.95940899848938, "learning_rate": 1.5368171021377672e-06, "loss": 0.2619, "step": 8710 }, { "epoch": 1.8629756306113725, "grad_norm": 4.389864921569824, "learning_rate": 1.5249406175771971e-06, "loss": 0.2898, "step": 8715 }, { "epoch": 1.864044463445917, "grad_norm": 4.196899890899658, "learning_rate": 1.513064133016627e-06, "loss": 0.2523, "step": 8720 }, { "epoch": 1.8651132962804617, "grad_norm": 4.35107946395874, "learning_rate": 1.5011876484560572e-06, "loss": 0.2534, "step": 8725 }, { "epoch": 1.8661821291150065, "grad_norm": 5.233465194702148, "learning_rate": 1.489311163895487e-06, "loss": 0.2546, "step": 8730 }, { "epoch": 1.867250961949551, "grad_norm": 4.285619735717773, "learning_rate": 1.477434679334917e-06, "loss": 0.2171, "step": 8735 }, { "epoch": 1.868319794784096, "grad_norm": 5.0237579345703125, "learning_rate": 1.465558194774347e-06, "loss": 0.2617, "step": 8740 }, { "epoch": 1.8693886276186404, "grad_norm": 3.848062753677368, "learning_rate": 1.4536817102137768e-06, "loss": 0.1917, "step": 8745 }, { "epoch": 1.8704574604531852, "grad_norm": 3.6329150199890137, "learning_rate": 1.4418052256532067e-06, "loss": 0.2256, "step": 8750 }, { "epoch": 1.87152629328773, "grad_norm": 4.504333019256592, "learning_rate": 1.4299287410926366e-06, "loss": 0.2319, "step": 8755 }, { "epoch": 1.8725951261222744, "grad_norm": 6.011372089385986, "learning_rate": 1.4180522565320668e-06, "loss": 0.2783, "step": 8760 }, { "epoch": 1.8736639589568191, "grad_norm": 4.750868320465088, "learning_rate": 1.4061757719714967e-06, "loss": 0.2885, "step": 8765 }, { "epoch": 1.8747327917913639, "grad_norm": 3.2728309631347656, "learning_rate": 1.3942992874109266e-06, "loss": 0.2586, "step": 8770 }, { "epoch": 1.8758016246259084, "grad_norm": 3.3371262550354004, "learning_rate": 1.3824228028503565e-06, "loss": 0.2009, "step": 8775 }, { "epoch": 1.8768704574604533, "grad_norm": 3.7395825386047363, "learning_rate": 1.3705463182897862e-06, "loss": 0.273, "step": 8780 }, { "epoch": 1.8779392902949978, "grad_norm": 4.672481060028076, "learning_rate": 1.3586698337292161e-06, "loss": 0.2502, "step": 8785 }, { "epoch": 1.8790081231295426, "grad_norm": 2.957099676132202, "learning_rate": 1.346793349168646e-06, "loss": 0.2174, "step": 8790 }, { "epoch": 1.8800769559640873, "grad_norm": 4.8943915367126465, "learning_rate": 1.3349168646080762e-06, "loss": 0.2723, "step": 8795 }, { "epoch": 1.8811457887986318, "grad_norm": 4.067677021026611, "learning_rate": 1.323040380047506e-06, "loss": 0.2633, "step": 8800 }, { "epoch": 1.8822146216331765, "grad_norm": 4.314869403839111, "learning_rate": 1.311163895486936e-06, "loss": 0.2794, "step": 8805 }, { "epoch": 1.8832834544677213, "grad_norm": 4.225076675415039, "learning_rate": 1.299287410926366e-06, "loss": 0.2961, "step": 8810 }, { "epoch": 1.8843522873022658, "grad_norm": 3.992135763168335, "learning_rate": 1.2874109263657958e-06, "loss": 0.2598, "step": 8815 }, { "epoch": 1.8854211201368107, "grad_norm": 4.5158586502075195, "learning_rate": 1.2755344418052257e-06, "loss": 0.2794, "step": 8820 }, { "epoch": 1.8864899529713552, "grad_norm": 4.226551055908203, "learning_rate": 1.2636579572446556e-06, "loss": 0.2447, "step": 8825 }, { "epoch": 1.8875587858059, "grad_norm": 3.2052338123321533, "learning_rate": 1.2517814726840858e-06, "loss": 0.2741, "step": 8830 }, { "epoch": 1.8886276186404447, "grad_norm": 3.315537929534912, "learning_rate": 1.2399049881235155e-06, "loss": 0.2192, "step": 8835 }, { "epoch": 1.8896964514749892, "grad_norm": 4.095473289489746, "learning_rate": 1.2280285035629456e-06, "loss": 0.3188, "step": 8840 }, { "epoch": 1.890765284309534, "grad_norm": 4.654134273529053, "learning_rate": 1.2161520190023753e-06, "loss": 0.294, "step": 8845 }, { "epoch": 1.8918341171440787, "grad_norm": 3.982452154159546, "learning_rate": 1.2042755344418052e-06, "loss": 0.2961, "step": 8850 }, { "epoch": 1.8929029499786232, "grad_norm": 3.594325542449951, "learning_rate": 1.1923990498812353e-06, "loss": 0.2288, "step": 8855 }, { "epoch": 1.8939717828131681, "grad_norm": 4.437509059906006, "learning_rate": 1.1805225653206653e-06, "loss": 0.2796, "step": 8860 }, { "epoch": 1.8950406156477126, "grad_norm": 4.6788716316223145, "learning_rate": 1.1686460807600952e-06, "loss": 0.2464, "step": 8865 }, { "epoch": 1.8961094484822574, "grad_norm": 4.381009578704834, "learning_rate": 1.1567695961995249e-06, "loss": 0.2435, "step": 8870 }, { "epoch": 1.897178281316802, "grad_norm": 4.203982353210449, "learning_rate": 1.144893111638955e-06, "loss": 0.2993, "step": 8875 }, { "epoch": 1.8982471141513466, "grad_norm": 3.9560775756835938, "learning_rate": 1.133016627078385e-06, "loss": 0.2049, "step": 8880 }, { "epoch": 1.8993159469858916, "grad_norm": 4.908998012542725, "learning_rate": 1.1211401425178148e-06, "loss": 0.2588, "step": 8885 }, { "epoch": 1.900384779820436, "grad_norm": 2.399383544921875, "learning_rate": 1.1092636579572447e-06, "loss": 0.2559, "step": 8890 }, { "epoch": 1.9014536126549808, "grad_norm": 5.100274085998535, "learning_rate": 1.0973871733966747e-06, "loss": 0.261, "step": 8895 }, { "epoch": 1.9025224454895255, "grad_norm": 1.9479761123657227, "learning_rate": 1.0855106888361046e-06, "loss": 0.2132, "step": 8900 }, { "epoch": 1.90359127832407, "grad_norm": 4.266331195831299, "learning_rate": 1.0736342042755345e-06, "loss": 0.2184, "step": 8905 }, { "epoch": 1.9046601111586148, "grad_norm": 3.761469841003418, "learning_rate": 1.0617577197149644e-06, "loss": 0.2551, "step": 8910 }, { "epoch": 1.9057289439931595, "grad_norm": 5.301465034484863, "learning_rate": 1.0498812351543943e-06, "loss": 0.2302, "step": 8915 }, { "epoch": 1.906797776827704, "grad_norm": 4.8627095222473145, "learning_rate": 1.0380047505938242e-06, "loss": 0.2441, "step": 8920 }, { "epoch": 1.907866609662249, "grad_norm": 3.7152163982391357, "learning_rate": 1.0261282660332544e-06, "loss": 0.2176, "step": 8925 }, { "epoch": 1.9089354424967935, "grad_norm": 4.612980365753174, "learning_rate": 1.0142517814726843e-06, "loss": 0.3041, "step": 8930 }, { "epoch": 1.9100042753313382, "grad_norm": 3.9601426124572754, "learning_rate": 1.002375296912114e-06, "loss": 0.2325, "step": 8935 }, { "epoch": 1.911073108165883, "grad_norm": 3.773958921432495, "learning_rate": 9.904988123515439e-07, "loss": 0.2463, "step": 8940 }, { "epoch": 1.9121419410004274, "grad_norm": 5.172873020172119, "learning_rate": 9.78622327790974e-07, "loss": 0.2997, "step": 8945 }, { "epoch": 1.9132107738349722, "grad_norm": 3.382683038711548, "learning_rate": 9.66745843230404e-07, "loss": 0.2202, "step": 8950 }, { "epoch": 1.914279606669517, "grad_norm": 5.699649333953857, "learning_rate": 9.548693586698338e-07, "loss": 0.2745, "step": 8955 }, { "epoch": 1.9153484395040614, "grad_norm": 4.574731349945068, "learning_rate": 9.429928741092638e-07, "loss": 0.2642, "step": 8960 }, { "epoch": 1.9164172723386064, "grad_norm": 7.173608303070068, "learning_rate": 9.311163895486937e-07, "loss": 0.2782, "step": 8965 }, { "epoch": 1.9174861051731509, "grad_norm": 3.9324846267700195, "learning_rate": 9.192399049881236e-07, "loss": 0.2435, "step": 8970 }, { "epoch": 1.9185549380076956, "grad_norm": 3.742494583129883, "learning_rate": 9.073634204275535e-07, "loss": 0.2492, "step": 8975 }, { "epoch": 1.9196237708422403, "grad_norm": 5.236582279205322, "learning_rate": 8.954869358669835e-07, "loss": 0.2161, "step": 8980 }, { "epoch": 1.9206926036767848, "grad_norm": 3.473259449005127, "learning_rate": 8.836104513064133e-07, "loss": 0.2549, "step": 8985 }, { "epoch": 1.9217614365113296, "grad_norm": 3.2006514072418213, "learning_rate": 8.717339667458432e-07, "loss": 0.2217, "step": 8990 }, { "epoch": 1.9228302693458743, "grad_norm": 3.0505008697509766, "learning_rate": 8.598574821852733e-07, "loss": 0.266, "step": 8995 }, { "epoch": 1.9238991021804188, "grad_norm": 3.8124094009399414, "learning_rate": 8.479809976247032e-07, "loss": 0.2909, "step": 9000 }, { "epoch": 1.9249679350149638, "grad_norm": 3.0390665531158447, "learning_rate": 8.361045130641331e-07, "loss": 0.2149, "step": 9005 }, { "epoch": 1.9260367678495083, "grad_norm": 3.928755521774292, "learning_rate": 8.24228028503563e-07, "loss": 0.3099, "step": 9010 }, { "epoch": 1.927105600684053, "grad_norm": 4.092939376831055, "learning_rate": 8.12351543942993e-07, "loss": 0.1929, "step": 9015 }, { "epoch": 1.9281744335185977, "grad_norm": 4.7592573165893555, "learning_rate": 8.004750593824228e-07, "loss": 0.2854, "step": 9020 }, { "epoch": 1.9292432663531422, "grad_norm": 3.904730796813965, "learning_rate": 7.885985748218527e-07, "loss": 0.1857, "step": 9025 }, { "epoch": 1.9303120991876872, "grad_norm": 4.656405925750732, "learning_rate": 7.767220902612828e-07, "loss": 0.2445, "step": 9030 }, { "epoch": 1.9313809320222317, "grad_norm": 3.890486240386963, "learning_rate": 7.648456057007127e-07, "loss": 0.226, "step": 9035 }, { "epoch": 1.9324497648567764, "grad_norm": 4.5724334716796875, "learning_rate": 7.529691211401426e-07, "loss": 0.2822, "step": 9040 }, { "epoch": 1.9335185976913212, "grad_norm": 4.720613479614258, "learning_rate": 7.410926365795724e-07, "loss": 0.2541, "step": 9045 }, { "epoch": 1.9345874305258657, "grad_norm": 3.9262373447418213, "learning_rate": 7.292161520190025e-07, "loss": 0.2442, "step": 9050 }, { "epoch": 1.9356562633604104, "grad_norm": 3.6456849575042725, "learning_rate": 7.173396674584323e-07, "loss": 0.2504, "step": 9055 }, { "epoch": 1.9367250961949551, "grad_norm": 3.021383762359619, "learning_rate": 7.054631828978623e-07, "loss": 0.2073, "step": 9060 }, { "epoch": 1.9377939290294997, "grad_norm": 4.671846389770508, "learning_rate": 6.935866983372923e-07, "loss": 0.2245, "step": 9065 }, { "epoch": 1.9388627618640446, "grad_norm": 4.805634021759033, "learning_rate": 6.817102137767222e-07, "loss": 0.2442, "step": 9070 }, { "epoch": 1.9399315946985891, "grad_norm": 3.9393720626831055, "learning_rate": 6.698337292161521e-07, "loss": 0.2382, "step": 9075 }, { "epoch": 1.9410004275331338, "grad_norm": 5.1551408767700195, "learning_rate": 6.579572446555819e-07, "loss": 0.2482, "step": 9080 }, { "epoch": 1.9420692603676786, "grad_norm": 5.381765365600586, "learning_rate": 6.460807600950119e-07, "loss": 0.2849, "step": 9085 }, { "epoch": 1.943138093202223, "grad_norm": 3.842059850692749, "learning_rate": 6.342042755344418e-07, "loss": 0.2666, "step": 9090 }, { "epoch": 1.9442069260367678, "grad_norm": 4.254835605621338, "learning_rate": 6.223277909738719e-07, "loss": 0.224, "step": 9095 }, { "epoch": 1.9452757588713125, "grad_norm": 5.467522144317627, "learning_rate": 6.104513064133017e-07, "loss": 0.2961, "step": 9100 }, { "epoch": 1.946344591705857, "grad_norm": 4.110438823699951, "learning_rate": 5.985748218527317e-07, "loss": 0.217, "step": 9105 }, { "epoch": 1.947413424540402, "grad_norm": 4.675514221191406, "learning_rate": 5.866983372921616e-07, "loss": 0.2384, "step": 9110 }, { "epoch": 1.9484822573749465, "grad_norm": 4.90285062789917, "learning_rate": 5.748218527315915e-07, "loss": 0.2205, "step": 9115 }, { "epoch": 1.9495510902094912, "grad_norm": 4.838087558746338, "learning_rate": 5.629453681710214e-07, "loss": 0.2807, "step": 9120 }, { "epoch": 1.950619923044036, "grad_norm": 4.49014949798584, "learning_rate": 5.510688836104513e-07, "loss": 0.2577, "step": 9125 }, { "epoch": 1.9516887558785805, "grad_norm": 6.248046398162842, "learning_rate": 5.391923990498813e-07, "loss": 0.3212, "step": 9130 }, { "epoch": 1.9527575887131252, "grad_norm": 2.6727161407470703, "learning_rate": 5.273159144893112e-07, "loss": 0.239, "step": 9135 }, { "epoch": 1.95382642154767, "grad_norm": 5.567617416381836, "learning_rate": 5.154394299287412e-07, "loss": 0.2283, "step": 9140 }, { "epoch": 1.9548952543822147, "grad_norm": 4.877483367919922, "learning_rate": 5.03562945368171e-07, "loss": 0.2606, "step": 9145 }, { "epoch": 1.9559640872167594, "grad_norm": 4.150485515594482, "learning_rate": 4.91686460807601e-07, "loss": 0.2684, "step": 9150 }, { "epoch": 1.957032920051304, "grad_norm": 4.878507614135742, "learning_rate": 4.798099762470309e-07, "loss": 0.2857, "step": 9155 }, { "epoch": 1.9581017528858486, "grad_norm": 5.343387126922607, "learning_rate": 4.6793349168646085e-07, "loss": 0.2962, "step": 9160 }, { "epoch": 1.9591705857203934, "grad_norm": 4.346437454223633, "learning_rate": 4.560570071258908e-07, "loss": 0.2639, "step": 9165 }, { "epoch": 1.9602394185549379, "grad_norm": 5.331128120422363, "learning_rate": 4.441805225653207e-07, "loss": 0.2549, "step": 9170 }, { "epoch": 1.9613082513894828, "grad_norm": 4.075921535491943, "learning_rate": 4.3230403800475065e-07, "loss": 0.2561, "step": 9175 }, { "epoch": 1.9623770842240273, "grad_norm": 4.879267692565918, "learning_rate": 4.2042755344418056e-07, "loss": 0.2324, "step": 9180 }, { "epoch": 1.963445917058572, "grad_norm": 4.325537204742432, "learning_rate": 4.085510688836105e-07, "loss": 0.3142, "step": 9185 }, { "epoch": 1.9645147498931168, "grad_norm": 3.530134439468384, "learning_rate": 3.966745843230404e-07, "loss": 0.2778, "step": 9190 }, { "epoch": 1.9655835827276613, "grad_norm": 2.6315903663635254, "learning_rate": 3.8479809976247036e-07, "loss": 0.2603, "step": 9195 }, { "epoch": 1.966652415562206, "grad_norm": 4.100142002105713, "learning_rate": 3.729216152019002e-07, "loss": 0.2191, "step": 9200 }, { "epoch": 1.9677212483967508, "grad_norm": 3.4908711910247803, "learning_rate": 3.610451306413302e-07, "loss": 0.2658, "step": 9205 }, { "epoch": 1.9687900812312953, "grad_norm": 4.331186771392822, "learning_rate": 3.4916864608076015e-07, "loss": 0.2701, "step": 9210 }, { "epoch": 1.9698589140658402, "grad_norm": 6.090305805206299, "learning_rate": 3.3729216152019e-07, "loss": 0.3, "step": 9215 }, { "epoch": 1.9709277469003847, "grad_norm": 3.7345423698425293, "learning_rate": 3.2541567695962e-07, "loss": 0.2728, "step": 9220 }, { "epoch": 1.9719965797349295, "grad_norm": 6.370054244995117, "learning_rate": 3.135391923990499e-07, "loss": 0.2724, "step": 9225 }, { "epoch": 1.9730654125694742, "grad_norm": 3.2030200958251953, "learning_rate": 3.0166270783847986e-07, "loss": 0.1735, "step": 9230 }, { "epoch": 1.9741342454040187, "grad_norm": 3.904633045196533, "learning_rate": 2.897862232779098e-07, "loss": 0.25, "step": 9235 }, { "epoch": 1.9752030782385634, "grad_norm": 5.196364402770996, "learning_rate": 2.779097387173397e-07, "loss": 0.276, "step": 9240 }, { "epoch": 1.9762719110731082, "grad_norm": 5.8324785232543945, "learning_rate": 2.660332541567696e-07, "loss": 0.2581, "step": 9245 }, { "epoch": 1.9773407439076527, "grad_norm": 3.4866878986358643, "learning_rate": 2.541567695961995e-07, "loss": 0.2545, "step": 9250 }, { "epoch": 1.9784095767421976, "grad_norm": 5.080046653747559, "learning_rate": 2.422802850356295e-07, "loss": 0.2251, "step": 9255 }, { "epoch": 1.9794784095767421, "grad_norm": 4.654627799987793, "learning_rate": 2.304038004750594e-07, "loss": 0.269, "step": 9260 }, { "epoch": 1.9805472424112869, "grad_norm": 4.3756327629089355, "learning_rate": 2.1852731591448934e-07, "loss": 0.2026, "step": 9265 }, { "epoch": 1.9816160752458316, "grad_norm": 4.612358093261719, "learning_rate": 2.0665083135391925e-07, "loss": 0.2297, "step": 9270 }, { "epoch": 1.9826849080803761, "grad_norm": 4.363190174102783, "learning_rate": 1.9477434679334917e-07, "loss": 0.2414, "step": 9275 }, { "epoch": 1.983753740914921, "grad_norm": 4.239806175231934, "learning_rate": 1.828978622327791e-07, "loss": 0.2724, "step": 9280 }, { "epoch": 1.9848225737494656, "grad_norm": 3.087779998779297, "learning_rate": 1.7102137767220902e-07, "loss": 0.2338, "step": 9285 }, { "epoch": 1.9858914065840103, "grad_norm": 5.1465277671813965, "learning_rate": 1.59144893111639e-07, "loss": 0.2606, "step": 9290 }, { "epoch": 1.986960239418555, "grad_norm": 3.789433240890503, "learning_rate": 1.4726840855106888e-07, "loss": 0.2747, "step": 9295 }, { "epoch": 1.9880290722530995, "grad_norm": 3.880868673324585, "learning_rate": 1.3539192399049882e-07, "loss": 0.1792, "step": 9300 }, { "epoch": 1.9890979050876443, "grad_norm": 4.200949668884277, "learning_rate": 1.2351543942992876e-07, "loss": 0.2479, "step": 9305 }, { "epoch": 1.990166737922189, "grad_norm": 4.372617721557617, "learning_rate": 1.1163895486935867e-07, "loss": 0.2554, "step": 9310 }, { "epoch": 1.9912355707567335, "grad_norm": 3.7008919715881348, "learning_rate": 9.97624703087886e-08, "loss": 0.25, "step": 9315 }, { "epoch": 1.9923044035912785, "grad_norm": 3.9479458332061768, "learning_rate": 8.788598574821854e-08, "loss": 0.2814, "step": 9320 }, { "epoch": 1.993373236425823, "grad_norm": 4.310093402862549, "learning_rate": 7.600950118764846e-08, "loss": 0.2102, "step": 9325 }, { "epoch": 1.9944420692603677, "grad_norm": 3.808363199234009, "learning_rate": 6.41330166270784e-08, "loss": 0.2466, "step": 9330 }, { "epoch": 1.9955109020949124, "grad_norm": 4.076649188995361, "learning_rate": 5.225653206650832e-08, "loss": 0.2547, "step": 9335 }, { "epoch": 1.996579734929457, "grad_norm": 3.773390531539917, "learning_rate": 4.0380047505938245e-08, "loss": 0.2216, "step": 9340 }, { "epoch": 1.9976485677640017, "grad_norm": 3.149965286254883, "learning_rate": 2.8503562945368176e-08, "loss": 0.2521, "step": 9345 }, { "epoch": 1.9987174005985464, "grad_norm": 3.375763177871704, "learning_rate": 1.66270783847981e-08, "loss": 0.2558, "step": 9350 }, { "epoch": 1.999786233433091, "grad_norm": 5.134764194488525, "learning_rate": 4.7505938242280285e-09, "loss": 0.2345, "step": 9355 }, { "epoch": 2.0, "eval_loss": 0.12192188948392868, "eval_mrr": 0.9798825256975033, "eval_runtime": 315.6223, "eval_samples_per_second": 7.192, "eval_steps_per_second": 0.9, "step": 9356 } ], "logging_steps": 5, "max_steps": 9356, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }