{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.99806875241406, "eval_steps": 500, "global_step": 3235, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001544998068752414, "grad_norm": 5.710525847568793, "learning_rate": 6.17283950617284e-08, "loss": 0.828, "step": 1 }, { "epoch": 0.003089996137504828, "grad_norm": 6.049764326616172, "learning_rate": 1.234567901234568e-07, "loss": 0.8666, "step": 2 }, { "epoch": 0.004634994206257242, "grad_norm": 5.969137518557101, "learning_rate": 1.8518518518518518e-07, "loss": 0.8889, "step": 3 }, { "epoch": 0.006179992275009656, "grad_norm": 5.870647993334988, "learning_rate": 2.469135802469136e-07, "loss": 0.8908, "step": 4 }, { "epoch": 0.00772499034376207, "grad_norm": 5.776488445262133, "learning_rate": 3.08641975308642e-07, "loss": 0.8474, "step": 5 }, { "epoch": 0.009269988412514484, "grad_norm": 5.986642964937279, "learning_rate": 3.7037037037037036e-07, "loss": 0.9165, "step": 6 }, { "epoch": 0.010814986481266898, "grad_norm": 5.737332367580903, "learning_rate": 4.320987654320988e-07, "loss": 0.8589, "step": 7 }, { "epoch": 0.012359984550019312, "grad_norm": 5.751941500758043, "learning_rate": 4.938271604938272e-07, "loss": 0.8599, "step": 8 }, { "epoch": 0.013904982618771726, "grad_norm": 5.5764251248486385, "learning_rate": 5.555555555555555e-07, "loss": 0.831, "step": 9 }, { "epoch": 0.01544998068752414, "grad_norm": 5.6244644081886355, "learning_rate": 6.17283950617284e-07, "loss": 0.8528, "step": 10 }, { "epoch": 0.016994978756276556, "grad_norm": 5.576197394856864, "learning_rate": 6.790123456790124e-07, "loss": 0.8773, "step": 11 }, { "epoch": 0.01853997682502897, "grad_norm": 5.218349525083682, "learning_rate": 7.407407407407407e-07, "loss": 0.8251, "step": 12 }, { "epoch": 0.020084974893781384, "grad_norm": 4.354819408598521, "learning_rate": 8.024691358024692e-07, "loss": 0.8081, "step": 13 }, { "epoch": 0.021629972962533797, "grad_norm": 4.385835177613865, "learning_rate": 8.641975308641976e-07, "loss": 0.8391, "step": 14 }, { "epoch": 0.023174971031286212, "grad_norm": 4.347317412611628, "learning_rate": 9.259259259259259e-07, "loss": 0.7988, "step": 15 }, { "epoch": 0.024719969100038625, "grad_norm": 4.048400848628607, "learning_rate": 9.876543209876544e-07, "loss": 0.7927, "step": 16 }, { "epoch": 0.02626496716879104, "grad_norm": 3.9877110150261186, "learning_rate": 1.0493827160493827e-06, "loss": 0.8054, "step": 17 }, { "epoch": 0.027809965237543453, "grad_norm": 2.3715386759576953, "learning_rate": 1.111111111111111e-06, "loss": 0.7751, "step": 18 }, { "epoch": 0.02935496330629587, "grad_norm": 2.2769664112952768, "learning_rate": 1.1728395061728396e-06, "loss": 0.7403, "step": 19 }, { "epoch": 0.03089996137504828, "grad_norm": 2.2218579830138374, "learning_rate": 1.234567901234568e-06, "loss": 0.78, "step": 20 }, { "epoch": 0.03244495944380069, "grad_norm": 2.1379924026779618, "learning_rate": 1.2962962962962962e-06, "loss": 0.7733, "step": 21 }, { "epoch": 0.03398995751255311, "grad_norm": 1.8457096028802598, "learning_rate": 1.3580246913580248e-06, "loss": 0.7288, "step": 22 }, { "epoch": 0.035534955581305525, "grad_norm": 1.8873630419957341, "learning_rate": 1.4197530864197531e-06, "loss": 0.73, "step": 23 }, { "epoch": 0.03707995365005794, "grad_norm": 1.8317693080161108, "learning_rate": 1.4814814814814815e-06, "loss": 0.7215, "step": 24 }, { "epoch": 0.03862495171881035, "grad_norm": 2.5317900166988556, "learning_rate": 1.54320987654321e-06, "loss": 0.7202, "step": 25 }, { "epoch": 0.04016994978756277, "grad_norm": 2.6505836257318647, "learning_rate": 1.6049382716049383e-06, "loss": 0.7014, "step": 26 }, { "epoch": 0.04171494785631518, "grad_norm": 2.7994022700436454, "learning_rate": 1.6666666666666667e-06, "loss": 0.706, "step": 27 }, { "epoch": 0.04325994592506759, "grad_norm": 2.7783647055277068, "learning_rate": 1.7283950617283952e-06, "loss": 0.7232, "step": 28 }, { "epoch": 0.044804943993820005, "grad_norm": 2.5399605687854714, "learning_rate": 1.7901234567901235e-06, "loss": 0.7209, "step": 29 }, { "epoch": 0.046349942062572425, "grad_norm": 2.312688193327278, "learning_rate": 1.8518518518518519e-06, "loss": 0.7406, "step": 30 }, { "epoch": 0.04789494013132484, "grad_norm": 1.8960747207075574, "learning_rate": 1.9135802469135804e-06, "loss": 0.7009, "step": 31 }, { "epoch": 0.04943993820007725, "grad_norm": 1.3513049689310215, "learning_rate": 1.9753086419753087e-06, "loss": 0.669, "step": 32 }, { "epoch": 0.05098493626882966, "grad_norm": 1.0603683955872711, "learning_rate": 2.037037037037037e-06, "loss": 0.6744, "step": 33 }, { "epoch": 0.05252993433758208, "grad_norm": 1.0458531070849169, "learning_rate": 2.0987654320987654e-06, "loss": 0.6877, "step": 34 }, { "epoch": 0.05407493240633449, "grad_norm": 1.1575837214473375, "learning_rate": 2.1604938271604937e-06, "loss": 0.6703, "step": 35 }, { "epoch": 0.055619930475086905, "grad_norm": 1.1729860640338754, "learning_rate": 2.222222222222222e-06, "loss": 0.7058, "step": 36 }, { "epoch": 0.05716492854383932, "grad_norm": 1.073518447265809, "learning_rate": 2.283950617283951e-06, "loss": 0.6245, "step": 37 }, { "epoch": 0.05870992661259174, "grad_norm": 0.9894736380011858, "learning_rate": 2.345679012345679e-06, "loss": 0.6441, "step": 38 }, { "epoch": 0.06025492468134415, "grad_norm": 0.9380622149660193, "learning_rate": 2.4074074074074075e-06, "loss": 0.6554, "step": 39 }, { "epoch": 0.06179992275009656, "grad_norm": 0.878333894029556, "learning_rate": 2.469135802469136e-06, "loss": 0.6383, "step": 40 }, { "epoch": 0.06334492081884897, "grad_norm": 0.8238495498332643, "learning_rate": 2.5308641975308646e-06, "loss": 0.6634, "step": 41 }, { "epoch": 0.06488991888760139, "grad_norm": 0.6652884751646784, "learning_rate": 2.5925925925925925e-06, "loss": 0.6148, "step": 42 }, { "epoch": 0.0664349169563538, "grad_norm": 0.81345363447846, "learning_rate": 2.6543209876543212e-06, "loss": 0.6145, "step": 43 }, { "epoch": 0.06797991502510622, "grad_norm": 0.8388957796314146, "learning_rate": 2.7160493827160496e-06, "loss": 0.6206, "step": 44 }, { "epoch": 0.06952491309385864, "grad_norm": 0.8819596531907908, "learning_rate": 2.7777777777777783e-06, "loss": 0.6465, "step": 45 }, { "epoch": 0.07106991116261105, "grad_norm": 0.8070872503058211, "learning_rate": 2.8395061728395062e-06, "loss": 0.5965, "step": 46 }, { "epoch": 0.07261490923136346, "grad_norm": 0.6119664646194845, "learning_rate": 2.901234567901235e-06, "loss": 0.6237, "step": 47 }, { "epoch": 0.07415990730011587, "grad_norm": 0.5653181040561361, "learning_rate": 2.962962962962963e-06, "loss": 0.6295, "step": 48 }, { "epoch": 0.07570490536886829, "grad_norm": 0.5809711842730865, "learning_rate": 3.0246913580246917e-06, "loss": 0.6164, "step": 49 }, { "epoch": 0.0772499034376207, "grad_norm": 0.7702740655770473, "learning_rate": 3.08641975308642e-06, "loss": 0.6239, "step": 50 }, { "epoch": 0.07879490150637311, "grad_norm": 0.791815078440016, "learning_rate": 3.1481481481481483e-06, "loss": 0.5986, "step": 51 }, { "epoch": 0.08033989957512554, "grad_norm": 0.5923030796849932, "learning_rate": 3.2098765432098767e-06, "loss": 0.6027, "step": 52 }, { "epoch": 0.08188489764387795, "grad_norm": 0.5719205057137366, "learning_rate": 3.2716049382716054e-06, "loss": 0.6137, "step": 53 }, { "epoch": 0.08342989571263036, "grad_norm": 0.5344346494884967, "learning_rate": 3.3333333333333333e-06, "loss": 0.562, "step": 54 }, { "epoch": 0.08497489378138277, "grad_norm": 0.5460600127796734, "learning_rate": 3.395061728395062e-06, "loss": 0.5898, "step": 55 }, { "epoch": 0.08651989185013519, "grad_norm": 0.6196357495733394, "learning_rate": 3.4567901234567904e-06, "loss": 0.5855, "step": 56 }, { "epoch": 0.0880648899188876, "grad_norm": 0.5276719975914509, "learning_rate": 3.5185185185185187e-06, "loss": 0.5992, "step": 57 }, { "epoch": 0.08960988798764001, "grad_norm": 0.46677560361446563, "learning_rate": 3.580246913580247e-06, "loss": 0.562, "step": 58 }, { "epoch": 0.09115488605639242, "grad_norm": 0.561976163733214, "learning_rate": 3.641975308641976e-06, "loss": 0.6004, "step": 59 }, { "epoch": 0.09269988412514485, "grad_norm": 0.39497739925454306, "learning_rate": 3.7037037037037037e-06, "loss": 0.5407, "step": 60 }, { "epoch": 0.09424488219389726, "grad_norm": 0.41389566270817507, "learning_rate": 3.7654320987654325e-06, "loss": 0.5626, "step": 61 }, { "epoch": 0.09578988026264967, "grad_norm": 0.45144559080031976, "learning_rate": 3.827160493827161e-06, "loss": 0.5949, "step": 62 }, { "epoch": 0.09733487833140209, "grad_norm": 0.48213410339126606, "learning_rate": 3.88888888888889e-06, "loss": 0.5678, "step": 63 }, { "epoch": 0.0988798764001545, "grad_norm": 0.4043994993080344, "learning_rate": 3.9506172839506175e-06, "loss": 0.5474, "step": 64 }, { "epoch": 0.10042487446890691, "grad_norm": 0.4307196585420853, "learning_rate": 4.012345679012346e-06, "loss": 0.591, "step": 65 }, { "epoch": 0.10196987253765932, "grad_norm": 0.4030450395887483, "learning_rate": 4.074074074074074e-06, "loss": 0.5592, "step": 66 }, { "epoch": 0.10351487060641174, "grad_norm": 0.41302615422620265, "learning_rate": 4.135802469135803e-06, "loss": 0.5764, "step": 67 }, { "epoch": 0.10505986867516416, "grad_norm": 0.3770052620726786, "learning_rate": 4.197530864197531e-06, "loss": 0.5538, "step": 68 }, { "epoch": 0.10660486674391657, "grad_norm": 0.38236206990463684, "learning_rate": 4.2592592592592596e-06, "loss": 0.5483, "step": 69 }, { "epoch": 0.10814986481266899, "grad_norm": 0.43652729087231673, "learning_rate": 4.3209876543209875e-06, "loss": 0.572, "step": 70 }, { "epoch": 0.1096948628814214, "grad_norm": 0.36998118091746773, "learning_rate": 4.382716049382716e-06, "loss": 0.5479, "step": 71 }, { "epoch": 0.11123986095017381, "grad_norm": 0.41030965162983296, "learning_rate": 4.444444444444444e-06, "loss": 0.5647, "step": 72 }, { "epoch": 0.11278485901892622, "grad_norm": 0.394576285547281, "learning_rate": 4.506172839506173e-06, "loss": 0.5791, "step": 73 }, { "epoch": 0.11432985708767864, "grad_norm": 0.3817968746002213, "learning_rate": 4.567901234567902e-06, "loss": 0.5792, "step": 74 }, { "epoch": 0.11587485515643106, "grad_norm": 0.38145004401034627, "learning_rate": 4.62962962962963e-06, "loss": 0.5414, "step": 75 }, { "epoch": 0.11741985322518347, "grad_norm": 0.39500926226882566, "learning_rate": 4.691358024691358e-06, "loss": 0.5442, "step": 76 }, { "epoch": 0.11896485129393589, "grad_norm": 0.39305436784692144, "learning_rate": 4.753086419753087e-06, "loss": 0.5566, "step": 77 }, { "epoch": 0.1205098493626883, "grad_norm": 0.432274129509891, "learning_rate": 4.814814814814815e-06, "loss": 0.5333, "step": 78 }, { "epoch": 0.12205484743144071, "grad_norm": 0.37521165834388315, "learning_rate": 4.876543209876544e-06, "loss": 0.5585, "step": 79 }, { "epoch": 0.12359984550019312, "grad_norm": 0.40507566194275074, "learning_rate": 4.938271604938272e-06, "loss": 0.5746, "step": 80 }, { "epoch": 0.12514484356894554, "grad_norm": 0.4700873538089607, "learning_rate": 5e-06, "loss": 0.5495, "step": 81 }, { "epoch": 0.12668984163769795, "grad_norm": 0.38548467592230895, "learning_rate": 5.061728395061729e-06, "loss": 0.5641, "step": 82 }, { "epoch": 0.12823483970645036, "grad_norm": 0.38554777177664373, "learning_rate": 5.123456790123458e-06, "loss": 0.5524, "step": 83 }, { "epoch": 0.12977983777520277, "grad_norm": 0.3640393478163548, "learning_rate": 5.185185185185185e-06, "loss": 0.5451, "step": 84 }, { "epoch": 0.13132483584395518, "grad_norm": 0.43476227477610885, "learning_rate": 5.246913580246914e-06, "loss": 0.5359, "step": 85 }, { "epoch": 0.1328698339127076, "grad_norm": 0.3912135753317222, "learning_rate": 5.3086419753086425e-06, "loss": 0.5428, "step": 86 }, { "epoch": 0.13441483198146004, "grad_norm": 0.38785590560382627, "learning_rate": 5.370370370370371e-06, "loss": 0.5538, "step": 87 }, { "epoch": 0.13595983005021245, "grad_norm": 0.4337816795039643, "learning_rate": 5.432098765432099e-06, "loss": 0.5477, "step": 88 }, { "epoch": 0.13750482811896486, "grad_norm": 0.427697562609148, "learning_rate": 5.493827160493828e-06, "loss": 0.5591, "step": 89 }, { "epoch": 0.13904982618771727, "grad_norm": 0.400324725397789, "learning_rate": 5.555555555555557e-06, "loss": 0.543, "step": 90 }, { "epoch": 0.14059482425646969, "grad_norm": 0.3805022762729413, "learning_rate": 5.617283950617285e-06, "loss": 0.5366, "step": 91 }, { "epoch": 0.1421398223252221, "grad_norm": 0.40268562562969573, "learning_rate": 5.6790123456790125e-06, "loss": 0.5506, "step": 92 }, { "epoch": 0.1436848203939745, "grad_norm": 0.3741472472829562, "learning_rate": 5.740740740740741e-06, "loss": 0.5605, "step": 93 }, { "epoch": 0.14522981846272692, "grad_norm": 0.4389216559272842, "learning_rate": 5.80246913580247e-06, "loss": 0.5498, "step": 94 }, { "epoch": 0.14677481653147934, "grad_norm": 0.3504966664738845, "learning_rate": 5.864197530864199e-06, "loss": 0.5693, "step": 95 }, { "epoch": 0.14831981460023175, "grad_norm": 0.43687521278677166, "learning_rate": 5.925925925925926e-06, "loss": 0.5385, "step": 96 }, { "epoch": 0.14986481266898416, "grad_norm": 0.36997011618120434, "learning_rate": 5.9876543209876546e-06, "loss": 0.5365, "step": 97 }, { "epoch": 0.15140981073773657, "grad_norm": 0.41633654438197415, "learning_rate": 6.049382716049383e-06, "loss": 0.5482, "step": 98 }, { "epoch": 0.15295480880648898, "grad_norm": 0.36078310725818136, "learning_rate": 6.111111111111112e-06, "loss": 0.5447, "step": 99 }, { "epoch": 0.1544998068752414, "grad_norm": 0.40689414392129963, "learning_rate": 6.17283950617284e-06, "loss": 0.5571, "step": 100 }, { "epoch": 0.1560448049439938, "grad_norm": 0.3959637322505997, "learning_rate": 6.234567901234569e-06, "loss": 0.5497, "step": 101 }, { "epoch": 0.15758980301274622, "grad_norm": 0.3612104003828734, "learning_rate": 6.296296296296297e-06, "loss": 0.5321, "step": 102 }, { "epoch": 0.15913480108149866, "grad_norm": 0.40867295169168494, "learning_rate": 6.358024691358025e-06, "loss": 0.5008, "step": 103 }, { "epoch": 0.16067979915025107, "grad_norm": 0.422364732115871, "learning_rate": 6.419753086419753e-06, "loss": 0.4988, "step": 104 }, { "epoch": 0.16222479721900349, "grad_norm": 0.41840477055355874, "learning_rate": 6.481481481481482e-06, "loss": 0.5504, "step": 105 }, { "epoch": 0.1637697952877559, "grad_norm": 0.44488795453834623, "learning_rate": 6.543209876543211e-06, "loss": 0.5408, "step": 106 }, { "epoch": 0.1653147933565083, "grad_norm": 0.4090047927808193, "learning_rate": 6.60493827160494e-06, "loss": 0.5184, "step": 107 }, { "epoch": 0.16685979142526072, "grad_norm": 0.43566246713243156, "learning_rate": 6.666666666666667e-06, "loss": 0.5583, "step": 108 }, { "epoch": 0.16840478949401314, "grad_norm": 0.4336549982819173, "learning_rate": 6.728395061728395e-06, "loss": 0.5239, "step": 109 }, { "epoch": 0.16994978756276555, "grad_norm": 0.38725459953935476, "learning_rate": 6.790123456790124e-06, "loss": 0.5373, "step": 110 }, { "epoch": 0.17149478563151796, "grad_norm": 0.3886574111490645, "learning_rate": 6.851851851851853e-06, "loss": 0.5345, "step": 111 }, { "epoch": 0.17303978370027037, "grad_norm": 0.3807102882267326, "learning_rate": 6.913580246913581e-06, "loss": 0.5431, "step": 112 }, { "epoch": 0.17458478176902278, "grad_norm": 0.38851624188796763, "learning_rate": 6.975308641975309e-06, "loss": 0.525, "step": 113 }, { "epoch": 0.1761297798377752, "grad_norm": 0.39904321694201544, "learning_rate": 7.0370370370370375e-06, "loss": 0.5271, "step": 114 }, { "epoch": 0.1776747779065276, "grad_norm": 0.37763006414403505, "learning_rate": 7.098765432098766e-06, "loss": 0.5545, "step": 115 }, { "epoch": 0.17921977597528002, "grad_norm": 0.39690352437756915, "learning_rate": 7.160493827160494e-06, "loss": 0.5332, "step": 116 }, { "epoch": 0.18076477404403243, "grad_norm": 0.42185494395341117, "learning_rate": 7.222222222222223e-06, "loss": 0.5216, "step": 117 }, { "epoch": 0.18230977211278485, "grad_norm": 0.3946640051572986, "learning_rate": 7.283950617283952e-06, "loss": 0.5282, "step": 118 }, { "epoch": 0.18385477018153729, "grad_norm": 0.41587886340648617, "learning_rate": 7.34567901234568e-06, "loss": 0.5236, "step": 119 }, { "epoch": 0.1853997682502897, "grad_norm": 0.4083071883690087, "learning_rate": 7.4074074074074075e-06, "loss": 0.518, "step": 120 }, { "epoch": 0.1869447663190421, "grad_norm": 0.4670179452606362, "learning_rate": 7.469135802469136e-06, "loss": 0.5235, "step": 121 }, { "epoch": 0.18848976438779452, "grad_norm": 0.3832184526111515, "learning_rate": 7.530864197530865e-06, "loss": 0.5093, "step": 122 }, { "epoch": 0.19003476245654694, "grad_norm": 0.43110954184162165, "learning_rate": 7.592592592592594e-06, "loss": 0.5462, "step": 123 }, { "epoch": 0.19157976052529935, "grad_norm": 0.47899541042901816, "learning_rate": 7.654320987654322e-06, "loss": 0.5124, "step": 124 }, { "epoch": 0.19312475859405176, "grad_norm": 0.3563423219363521, "learning_rate": 7.71604938271605e-06, "loss": 0.5306, "step": 125 }, { "epoch": 0.19466975666280417, "grad_norm": 0.494167357130286, "learning_rate": 7.77777777777778e-06, "loss": 0.5476, "step": 126 }, { "epoch": 0.19621475473155658, "grad_norm": 0.4199281821246535, "learning_rate": 7.839506172839507e-06, "loss": 0.5129, "step": 127 }, { "epoch": 0.197759752800309, "grad_norm": 0.42874803775500075, "learning_rate": 7.901234567901235e-06, "loss": 0.5377, "step": 128 }, { "epoch": 0.1993047508690614, "grad_norm": 0.48444994069769653, "learning_rate": 7.962962962962963e-06, "loss": 0.5259, "step": 129 }, { "epoch": 0.20084974893781382, "grad_norm": 0.39490613364382426, "learning_rate": 8.024691358024692e-06, "loss": 0.5308, "step": 130 }, { "epoch": 0.20239474700656623, "grad_norm": 0.43757115803668, "learning_rate": 8.08641975308642e-06, "loss": 0.5107, "step": 131 }, { "epoch": 0.20393974507531865, "grad_norm": 0.3788906376675599, "learning_rate": 8.148148148148148e-06, "loss": 0.5217, "step": 132 }, { "epoch": 0.20548474314407106, "grad_norm": 0.41231540673430483, "learning_rate": 8.209876543209876e-06, "loss": 0.5202, "step": 133 }, { "epoch": 0.20702974121282347, "grad_norm": 0.41114922991305, "learning_rate": 8.271604938271606e-06, "loss": 0.5028, "step": 134 }, { "epoch": 0.2085747392815759, "grad_norm": 0.4314656415641388, "learning_rate": 8.333333333333334e-06, "loss": 0.5115, "step": 135 }, { "epoch": 0.21011973735032832, "grad_norm": 0.46072988154083205, "learning_rate": 8.395061728395062e-06, "loss": 0.5226, "step": 136 }, { "epoch": 0.21166473541908074, "grad_norm": 0.434467906903538, "learning_rate": 8.456790123456791e-06, "loss": 0.4874, "step": 137 }, { "epoch": 0.21320973348783315, "grad_norm": 0.497863367875963, "learning_rate": 8.518518518518519e-06, "loss": 0.4896, "step": 138 }, { "epoch": 0.21475473155658556, "grad_norm": 0.46475096439828056, "learning_rate": 8.580246913580249e-06, "loss": 0.523, "step": 139 }, { "epoch": 0.21629972962533797, "grad_norm": 0.46473376520270904, "learning_rate": 8.641975308641975e-06, "loss": 0.5222, "step": 140 }, { "epoch": 0.21784472769409038, "grad_norm": 0.48249089068999007, "learning_rate": 8.703703703703705e-06, "loss": 0.5284, "step": 141 }, { "epoch": 0.2193897257628428, "grad_norm": 0.47356534737372047, "learning_rate": 8.765432098765432e-06, "loss": 0.5411, "step": 142 }, { "epoch": 0.2209347238315952, "grad_norm": 0.49655813462641374, "learning_rate": 8.827160493827162e-06, "loss": 0.5288, "step": 143 }, { "epoch": 0.22247972190034762, "grad_norm": 0.47797832058367923, "learning_rate": 8.888888888888888e-06, "loss": 0.5011, "step": 144 }, { "epoch": 0.22402471996910003, "grad_norm": 0.4891629495482122, "learning_rate": 8.950617283950618e-06, "loss": 0.5375, "step": 145 }, { "epoch": 0.22556971803785245, "grad_norm": 0.4995540667356458, "learning_rate": 9.012345679012346e-06, "loss": 0.5066, "step": 146 }, { "epoch": 0.22711471610660486, "grad_norm": 0.4913072450986421, "learning_rate": 9.074074074074075e-06, "loss": 0.513, "step": 147 }, { "epoch": 0.22865971417535727, "grad_norm": 0.5139430700115885, "learning_rate": 9.135802469135803e-06, "loss": 0.5249, "step": 148 }, { "epoch": 0.23020471224410968, "grad_norm": 0.5001392757430173, "learning_rate": 9.197530864197531e-06, "loss": 0.5241, "step": 149 }, { "epoch": 0.23174971031286212, "grad_norm": 0.588979551967436, "learning_rate": 9.25925925925926e-06, "loss": 0.4902, "step": 150 }, { "epoch": 0.23329470838161454, "grad_norm": 0.44327600345852003, "learning_rate": 9.320987654320989e-06, "loss": 0.535, "step": 151 }, { "epoch": 0.23483970645036695, "grad_norm": 0.6588938978046119, "learning_rate": 9.382716049382717e-06, "loss": 0.5287, "step": 152 }, { "epoch": 0.23638470451911936, "grad_norm": 0.5019071274471444, "learning_rate": 9.444444444444445e-06, "loss": 0.5174, "step": 153 }, { "epoch": 0.23792970258787177, "grad_norm": 0.5798004831920888, "learning_rate": 9.506172839506174e-06, "loss": 0.5366, "step": 154 }, { "epoch": 0.23947470065662418, "grad_norm": 0.5573489243712513, "learning_rate": 9.567901234567902e-06, "loss": 0.51, "step": 155 }, { "epoch": 0.2410196987253766, "grad_norm": 0.6116355203416278, "learning_rate": 9.62962962962963e-06, "loss": 0.5238, "step": 156 }, { "epoch": 0.242564696794129, "grad_norm": 0.4303349064919002, "learning_rate": 9.691358024691358e-06, "loss": 0.5052, "step": 157 }, { "epoch": 0.24410969486288142, "grad_norm": 0.5846495029223572, "learning_rate": 9.753086419753087e-06, "loss": 0.5155, "step": 158 }, { "epoch": 0.24565469293163383, "grad_norm": 0.5317637386673317, "learning_rate": 9.814814814814815e-06, "loss": 0.5281, "step": 159 }, { "epoch": 0.24719969100038625, "grad_norm": 0.5302534238607497, "learning_rate": 9.876543209876543e-06, "loss": 0.5305, "step": 160 }, { "epoch": 0.24874468906913866, "grad_norm": 0.5266882577241278, "learning_rate": 9.938271604938273e-06, "loss": 0.5355, "step": 161 }, { "epoch": 0.25028968713789107, "grad_norm": 0.49922155532066165, "learning_rate": 1e-05, "loss": 0.5031, "step": 162 }, { "epoch": 0.2518346852066435, "grad_norm": 0.4592908271000004, "learning_rate": 1.0061728395061729e-05, "loss": 0.5021, "step": 163 }, { "epoch": 0.2533796832753959, "grad_norm": 0.4530082968374955, "learning_rate": 1.0123456790123458e-05, "loss": 0.4686, "step": 164 }, { "epoch": 0.2549246813441483, "grad_norm": 0.42978356992811056, "learning_rate": 1.0185185185185186e-05, "loss": 0.5135, "step": 165 }, { "epoch": 0.2564696794129007, "grad_norm": 0.5126396192177177, "learning_rate": 1.0246913580246916e-05, "loss": 0.5235, "step": 166 }, { "epoch": 0.25801467748165313, "grad_norm": 0.5225132732049089, "learning_rate": 1.0308641975308642e-05, "loss": 0.5005, "step": 167 }, { "epoch": 0.25955967555040554, "grad_norm": 0.4043135233504965, "learning_rate": 1.037037037037037e-05, "loss": 0.5002, "step": 168 }, { "epoch": 0.26110467361915796, "grad_norm": 0.5245231745704769, "learning_rate": 1.04320987654321e-05, "loss": 0.5011, "step": 169 }, { "epoch": 0.26264967168791037, "grad_norm": 0.47196065790321456, "learning_rate": 1.0493827160493827e-05, "loss": 0.5275, "step": 170 }, { "epoch": 0.2641946697566628, "grad_norm": 0.4523629419808231, "learning_rate": 1.0555555555555557e-05, "loss": 0.5118, "step": 171 }, { "epoch": 0.2657396678254152, "grad_norm": 0.4979806152654715, "learning_rate": 1.0617283950617285e-05, "loss": 0.5097, "step": 172 }, { "epoch": 0.2672846658941676, "grad_norm": 0.44509743127948187, "learning_rate": 1.0679012345679015e-05, "loss": 0.4993, "step": 173 }, { "epoch": 0.2688296639629201, "grad_norm": 0.4455034650385514, "learning_rate": 1.0740740740740742e-05, "loss": 0.508, "step": 174 }, { "epoch": 0.2703746620316725, "grad_norm": 0.5192570517083308, "learning_rate": 1.0802469135802469e-05, "loss": 0.492, "step": 175 }, { "epoch": 0.2719196601004249, "grad_norm": 0.4456452449258413, "learning_rate": 1.0864197530864198e-05, "loss": 0.4821, "step": 176 }, { "epoch": 0.2734646581691773, "grad_norm": 0.5144461893432273, "learning_rate": 1.0925925925925926e-05, "loss": 0.4978, "step": 177 }, { "epoch": 0.2750096562379297, "grad_norm": 0.49015908319324175, "learning_rate": 1.0987654320987656e-05, "loss": 0.5077, "step": 178 }, { "epoch": 0.27655465430668214, "grad_norm": 0.611048353148209, "learning_rate": 1.1049382716049384e-05, "loss": 0.5114, "step": 179 }, { "epoch": 0.27809965237543455, "grad_norm": 0.4301067128883658, "learning_rate": 1.1111111111111113e-05, "loss": 0.4879, "step": 180 }, { "epoch": 0.27964465044418696, "grad_norm": 0.5184200944165325, "learning_rate": 1.1172839506172841e-05, "loss": 0.4991, "step": 181 }, { "epoch": 0.28118964851293937, "grad_norm": 0.48906460056767975, "learning_rate": 1.123456790123457e-05, "loss": 0.5017, "step": 182 }, { "epoch": 0.2827346465816918, "grad_norm": 0.5068739265341573, "learning_rate": 1.1296296296296297e-05, "loss": 0.5301, "step": 183 }, { "epoch": 0.2842796446504442, "grad_norm": 0.5653539409601965, "learning_rate": 1.1358024691358025e-05, "loss": 0.5036, "step": 184 }, { "epoch": 0.2858246427191966, "grad_norm": 0.43226329912829126, "learning_rate": 1.1419753086419753e-05, "loss": 0.497, "step": 185 }, { "epoch": 0.287369640787949, "grad_norm": 0.4623648533960864, "learning_rate": 1.1481481481481482e-05, "loss": 0.4925, "step": 186 }, { "epoch": 0.28891463885670143, "grad_norm": 0.6058844212091814, "learning_rate": 1.154320987654321e-05, "loss": 0.5212, "step": 187 }, { "epoch": 0.29045963692545385, "grad_norm": 0.48211167669773336, "learning_rate": 1.160493827160494e-05, "loss": 0.5278, "step": 188 }, { "epoch": 0.29200463499420626, "grad_norm": 0.5543321529994507, "learning_rate": 1.1666666666666668e-05, "loss": 0.5055, "step": 189 }, { "epoch": 0.29354963306295867, "grad_norm": 0.48112189333487765, "learning_rate": 1.1728395061728398e-05, "loss": 0.5076, "step": 190 }, { "epoch": 0.2950946311317111, "grad_norm": 0.5639410534011825, "learning_rate": 1.1790123456790124e-05, "loss": 0.4828, "step": 191 }, { "epoch": 0.2966396292004635, "grad_norm": 0.4735479759283006, "learning_rate": 1.1851851851851852e-05, "loss": 0.5093, "step": 192 }, { "epoch": 0.2981846272692159, "grad_norm": 0.5643110697130628, "learning_rate": 1.1913580246913581e-05, "loss": 0.4854, "step": 193 }, { "epoch": 0.2997296253379683, "grad_norm": 0.4984204062912655, "learning_rate": 1.1975308641975309e-05, "loss": 0.4926, "step": 194 }, { "epoch": 0.30127462340672073, "grad_norm": 0.49997353608332434, "learning_rate": 1.2037037037037039e-05, "loss": 0.504, "step": 195 }, { "epoch": 0.30281962147547314, "grad_norm": 0.49136878100791626, "learning_rate": 1.2098765432098767e-05, "loss": 0.5038, "step": 196 }, { "epoch": 0.30436461954422556, "grad_norm": 0.47745757828321805, "learning_rate": 1.2160493827160496e-05, "loss": 0.4939, "step": 197 }, { "epoch": 0.30590961761297797, "grad_norm": 0.5437743379083488, "learning_rate": 1.2222222222222224e-05, "loss": 0.5229, "step": 198 }, { "epoch": 0.3074546156817304, "grad_norm": 0.5127922337289456, "learning_rate": 1.228395061728395e-05, "loss": 0.4895, "step": 199 }, { "epoch": 0.3089996137504828, "grad_norm": 0.5361316785945024, "learning_rate": 1.234567901234568e-05, "loss": 0.4901, "step": 200 }, { "epoch": 0.3105446118192352, "grad_norm": 0.5238404471815981, "learning_rate": 1.2407407407407408e-05, "loss": 0.5, "step": 201 }, { "epoch": 0.3120896098879876, "grad_norm": 0.5810647787246795, "learning_rate": 1.2469135802469137e-05, "loss": 0.5223, "step": 202 }, { "epoch": 0.31363460795674003, "grad_norm": 0.43923275832310554, "learning_rate": 1.2530864197530865e-05, "loss": 0.4868, "step": 203 }, { "epoch": 0.31517960602549244, "grad_norm": 0.5797605900093968, "learning_rate": 1.2592592592592593e-05, "loss": 0.5303, "step": 204 }, { "epoch": 0.31672460409424485, "grad_norm": 0.4788145310602815, "learning_rate": 1.2654320987654323e-05, "loss": 0.5078, "step": 205 }, { "epoch": 0.3182696021629973, "grad_norm": 0.5151966869820197, "learning_rate": 1.271604938271605e-05, "loss": 0.49, "step": 206 }, { "epoch": 0.31981460023174973, "grad_norm": 0.4643533022702465, "learning_rate": 1.2777777777777777e-05, "loss": 0.5175, "step": 207 }, { "epoch": 0.32135959830050215, "grad_norm": 0.4451559371965026, "learning_rate": 1.2839506172839507e-05, "loss": 0.4917, "step": 208 }, { "epoch": 0.32290459636925456, "grad_norm": 0.5769348621435174, "learning_rate": 1.2901234567901235e-05, "loss": 0.5154, "step": 209 }, { "epoch": 0.32444959443800697, "grad_norm": 0.564752183818162, "learning_rate": 1.2962962962962964e-05, "loss": 0.4617, "step": 210 }, { "epoch": 0.3259945925067594, "grad_norm": 0.4609257950185553, "learning_rate": 1.3024691358024692e-05, "loss": 0.5045, "step": 211 }, { "epoch": 0.3275395905755118, "grad_norm": 0.6104720841523222, "learning_rate": 1.3086419753086422e-05, "loss": 0.5118, "step": 212 }, { "epoch": 0.3290845886442642, "grad_norm": 0.5241343410978266, "learning_rate": 1.314814814814815e-05, "loss": 0.5171, "step": 213 }, { "epoch": 0.3306295867130166, "grad_norm": 0.5108306342986028, "learning_rate": 1.320987654320988e-05, "loss": 0.5102, "step": 214 }, { "epoch": 0.33217458478176903, "grad_norm": 0.5106959399995965, "learning_rate": 1.3271604938271605e-05, "loss": 0.4935, "step": 215 }, { "epoch": 0.33371958285052145, "grad_norm": 0.41082270149969163, "learning_rate": 1.3333333333333333e-05, "loss": 0.4964, "step": 216 }, { "epoch": 0.33526458091927386, "grad_norm": 0.5648703892036134, "learning_rate": 1.3395061728395063e-05, "loss": 0.4775, "step": 217 }, { "epoch": 0.33680957898802627, "grad_norm": 0.5302837175036034, "learning_rate": 1.345679012345679e-05, "loss": 0.5036, "step": 218 }, { "epoch": 0.3383545770567787, "grad_norm": 0.4405493806296677, "learning_rate": 1.351851851851852e-05, "loss": 0.4813, "step": 219 }, { "epoch": 0.3398995751255311, "grad_norm": 0.5559943474355175, "learning_rate": 1.3580246913580248e-05, "loss": 0.5368, "step": 220 }, { "epoch": 0.3414445731942835, "grad_norm": 0.6690313798977477, "learning_rate": 1.3641975308641976e-05, "loss": 0.5088, "step": 221 }, { "epoch": 0.3429895712630359, "grad_norm": 0.4987422253598504, "learning_rate": 1.3703703703703706e-05, "loss": 0.5012, "step": 222 }, { "epoch": 0.34453456933178833, "grad_norm": 0.565733054713056, "learning_rate": 1.3765432098765432e-05, "loss": 0.5074, "step": 223 }, { "epoch": 0.34607956740054074, "grad_norm": 0.5852915014173296, "learning_rate": 1.3827160493827162e-05, "loss": 0.5149, "step": 224 }, { "epoch": 0.34762456546929316, "grad_norm": 0.48386909237880565, "learning_rate": 1.388888888888889e-05, "loss": 0.4948, "step": 225 }, { "epoch": 0.34916956353804557, "grad_norm": 0.5090404912577391, "learning_rate": 1.3950617283950617e-05, "loss": 0.4806, "step": 226 }, { "epoch": 0.350714561606798, "grad_norm": 0.5405487911023028, "learning_rate": 1.4012345679012347e-05, "loss": 0.4836, "step": 227 }, { "epoch": 0.3522595596755504, "grad_norm": 0.6020795755895314, "learning_rate": 1.4074074074074075e-05, "loss": 0.5183, "step": 228 }, { "epoch": 0.3538045577443028, "grad_norm": 0.7210588264696691, "learning_rate": 1.4135802469135805e-05, "loss": 0.4954, "step": 229 }, { "epoch": 0.3553495558130552, "grad_norm": 0.5610102185492789, "learning_rate": 1.4197530864197532e-05, "loss": 0.4958, "step": 230 }, { "epoch": 0.35689455388180763, "grad_norm": 0.7370524380576285, "learning_rate": 1.4259259259259259e-05, "loss": 0.4834, "step": 231 }, { "epoch": 0.35843955195056004, "grad_norm": 0.6174843175213943, "learning_rate": 1.4320987654320988e-05, "loss": 0.469, "step": 232 }, { "epoch": 0.35998455001931245, "grad_norm": 0.7489823117793495, "learning_rate": 1.4382716049382716e-05, "loss": 0.4985, "step": 233 }, { "epoch": 0.36152954808806487, "grad_norm": 0.7482892211550086, "learning_rate": 1.4444444444444446e-05, "loss": 0.498, "step": 234 }, { "epoch": 0.3630745461568173, "grad_norm": 0.5663826590735073, "learning_rate": 1.4506172839506174e-05, "loss": 0.493, "step": 235 }, { "epoch": 0.3646195442255697, "grad_norm": 0.56031875302314, "learning_rate": 1.4567901234567903e-05, "loss": 0.4887, "step": 236 }, { "epoch": 0.36616454229432216, "grad_norm": 0.5545020045898295, "learning_rate": 1.4629629629629631e-05, "loss": 0.4975, "step": 237 }, { "epoch": 0.36770954036307457, "grad_norm": 0.8105529978457248, "learning_rate": 1.469135802469136e-05, "loss": 0.5139, "step": 238 }, { "epoch": 0.369254538431827, "grad_norm": 0.5051490011149278, "learning_rate": 1.4753086419753087e-05, "loss": 0.5122, "step": 239 }, { "epoch": 0.3707995365005794, "grad_norm": 0.6917082513185616, "learning_rate": 1.4814814814814815e-05, "loss": 0.4883, "step": 240 }, { "epoch": 0.3723445345693318, "grad_norm": 0.5442273270786291, "learning_rate": 1.4876543209876545e-05, "loss": 0.4996, "step": 241 }, { "epoch": 0.3738895326380842, "grad_norm": 0.5886022896267327, "learning_rate": 1.4938271604938272e-05, "loss": 0.5136, "step": 242 }, { "epoch": 0.37543453070683663, "grad_norm": 0.6176946491140586, "learning_rate": 1.5000000000000002e-05, "loss": 0.5008, "step": 243 }, { "epoch": 0.37697952877558905, "grad_norm": 0.48163962361497564, "learning_rate": 1.506172839506173e-05, "loss": 0.4621, "step": 244 }, { "epoch": 0.37852452684434146, "grad_norm": 0.5823819633531065, "learning_rate": 1.5123456790123458e-05, "loss": 0.5014, "step": 245 }, { "epoch": 0.38006952491309387, "grad_norm": 0.5556409825974377, "learning_rate": 1.5185185185185187e-05, "loss": 0.4908, "step": 246 }, { "epoch": 0.3816145229818463, "grad_norm": 0.49204456782278017, "learning_rate": 1.5246913580246915e-05, "loss": 0.4794, "step": 247 }, { "epoch": 0.3831595210505987, "grad_norm": 0.5218847012886274, "learning_rate": 1.5308641975308643e-05, "loss": 0.4819, "step": 248 }, { "epoch": 0.3847045191193511, "grad_norm": 0.6707936691548406, "learning_rate": 1.537037037037037e-05, "loss": 0.4954, "step": 249 }, { "epoch": 0.3862495171881035, "grad_norm": 0.4792858143106399, "learning_rate": 1.54320987654321e-05, "loss": 0.4704, "step": 250 }, { "epoch": 0.38779451525685593, "grad_norm": 0.6550631856766055, "learning_rate": 1.549382716049383e-05, "loss": 0.4632, "step": 251 }, { "epoch": 0.38933951332560834, "grad_norm": 0.6148232911899961, "learning_rate": 1.555555555555556e-05, "loss": 0.4957, "step": 252 }, { "epoch": 0.39088451139436076, "grad_norm": 0.6189124576819793, "learning_rate": 1.5617283950617285e-05, "loss": 0.5076, "step": 253 }, { "epoch": 0.39242950946311317, "grad_norm": 0.48862799716904554, "learning_rate": 1.5679012345679014e-05, "loss": 0.4945, "step": 254 }, { "epoch": 0.3939745075318656, "grad_norm": 0.585144087103819, "learning_rate": 1.5740740740740744e-05, "loss": 0.4924, "step": 255 }, { "epoch": 0.395519505600618, "grad_norm": 0.5667450122163912, "learning_rate": 1.580246913580247e-05, "loss": 0.484, "step": 256 }, { "epoch": 0.3970645036693704, "grad_norm": 0.6095236035793079, "learning_rate": 1.58641975308642e-05, "loss": 0.516, "step": 257 }, { "epoch": 0.3986095017381228, "grad_norm": 0.5144278114192096, "learning_rate": 1.5925925925925926e-05, "loss": 0.4946, "step": 258 }, { "epoch": 0.40015449980687523, "grad_norm": 0.4978337276381844, "learning_rate": 1.5987654320987655e-05, "loss": 0.4938, "step": 259 }, { "epoch": 0.40169949787562764, "grad_norm": 0.48962016774341105, "learning_rate": 1.6049382716049385e-05, "loss": 0.4941, "step": 260 }, { "epoch": 0.40324449594438005, "grad_norm": 0.513535643726524, "learning_rate": 1.6111111111111115e-05, "loss": 0.5076, "step": 261 }, { "epoch": 0.40478949401313247, "grad_norm": 0.6084652900237991, "learning_rate": 1.617283950617284e-05, "loss": 0.4794, "step": 262 }, { "epoch": 0.4063344920818849, "grad_norm": 0.56685351617812, "learning_rate": 1.623456790123457e-05, "loss": 0.5078, "step": 263 }, { "epoch": 0.4078794901506373, "grad_norm": 0.529846953013078, "learning_rate": 1.6296296296296297e-05, "loss": 0.4966, "step": 264 }, { "epoch": 0.4094244882193897, "grad_norm": 0.5597284968234656, "learning_rate": 1.6358024691358026e-05, "loss": 0.5213, "step": 265 }, { "epoch": 0.4109694862881421, "grad_norm": 0.5534294076758068, "learning_rate": 1.6419753086419752e-05, "loss": 0.482, "step": 266 }, { "epoch": 0.41251448435689453, "grad_norm": 0.5300280687363526, "learning_rate": 1.6481481481481482e-05, "loss": 0.4957, "step": 267 }, { "epoch": 0.41405948242564694, "grad_norm": 0.5028678429558248, "learning_rate": 1.654320987654321e-05, "loss": 0.4843, "step": 268 }, { "epoch": 0.4156044804943994, "grad_norm": 0.5332912817161174, "learning_rate": 1.660493827160494e-05, "loss": 0.5011, "step": 269 }, { "epoch": 0.4171494785631518, "grad_norm": 0.6141790491096283, "learning_rate": 1.6666666666666667e-05, "loss": 0.4807, "step": 270 }, { "epoch": 0.41869447663190423, "grad_norm": 0.4920994336331858, "learning_rate": 1.6728395061728397e-05, "loss": 0.4889, "step": 271 }, { "epoch": 0.42023947470065665, "grad_norm": 0.5980275124249509, "learning_rate": 1.6790123456790123e-05, "loss": 0.5132, "step": 272 }, { "epoch": 0.42178447276940906, "grad_norm": 0.5394625611650974, "learning_rate": 1.6851851851851853e-05, "loss": 0.4627, "step": 273 }, { "epoch": 0.42332947083816147, "grad_norm": 0.7248723930474641, "learning_rate": 1.6913580246913582e-05, "loss": 0.4869, "step": 274 }, { "epoch": 0.4248744689069139, "grad_norm": 0.6373196784188249, "learning_rate": 1.697530864197531e-05, "loss": 0.4934, "step": 275 }, { "epoch": 0.4264194669756663, "grad_norm": 0.49093690922622474, "learning_rate": 1.7037037037037038e-05, "loss": 0.4946, "step": 276 }, { "epoch": 0.4279644650444187, "grad_norm": 0.611357856932913, "learning_rate": 1.7098765432098768e-05, "loss": 0.4948, "step": 277 }, { "epoch": 0.4295094631131711, "grad_norm": 0.5810786039103336, "learning_rate": 1.7160493827160498e-05, "loss": 0.4611, "step": 278 }, { "epoch": 0.43105446118192353, "grad_norm": 0.5998516333559146, "learning_rate": 1.7222222222222224e-05, "loss": 0.4955, "step": 279 }, { "epoch": 0.43259945925067594, "grad_norm": 0.5198736292192212, "learning_rate": 1.728395061728395e-05, "loss": 0.4966, "step": 280 }, { "epoch": 0.43414445731942836, "grad_norm": 0.5536821306171171, "learning_rate": 1.734567901234568e-05, "loss": 0.4862, "step": 281 }, { "epoch": 0.43568945538818077, "grad_norm": 0.5604288378022935, "learning_rate": 1.740740740740741e-05, "loss": 0.4877, "step": 282 }, { "epoch": 0.4372344534569332, "grad_norm": 0.5949880800600871, "learning_rate": 1.746913580246914e-05, "loss": 0.4689, "step": 283 }, { "epoch": 0.4387794515256856, "grad_norm": 0.4700379462095166, "learning_rate": 1.7530864197530865e-05, "loss": 0.4747, "step": 284 }, { "epoch": 0.440324449594438, "grad_norm": 0.62160832986606, "learning_rate": 1.7592592592592595e-05, "loss": 0.4902, "step": 285 }, { "epoch": 0.4418694476631904, "grad_norm": 0.6861003303955511, "learning_rate": 1.7654320987654324e-05, "loss": 0.4802, "step": 286 }, { "epoch": 0.44341444573194283, "grad_norm": 0.6203798095778803, "learning_rate": 1.771604938271605e-05, "loss": 0.5007, "step": 287 }, { "epoch": 0.44495944380069524, "grad_norm": 0.5859656663071188, "learning_rate": 1.7777777777777777e-05, "loss": 0.4887, "step": 288 }, { "epoch": 0.44650444186944765, "grad_norm": 0.6548629496649446, "learning_rate": 1.7839506172839506e-05, "loss": 0.503, "step": 289 }, { "epoch": 0.44804943993820007, "grad_norm": 0.6950491664057514, "learning_rate": 1.7901234567901236e-05, "loss": 0.4986, "step": 290 }, { "epoch": 0.4495944380069525, "grad_norm": 0.5537226284187394, "learning_rate": 1.7962962962962965e-05, "loss": 0.5076, "step": 291 }, { "epoch": 0.4511394360757049, "grad_norm": 0.6388361745867758, "learning_rate": 1.802469135802469e-05, "loss": 0.4779, "step": 292 }, { "epoch": 0.4526844341444573, "grad_norm": 0.6085640687426014, "learning_rate": 1.808641975308642e-05, "loss": 0.5179, "step": 293 }, { "epoch": 0.4542294322132097, "grad_norm": 0.5517518414624861, "learning_rate": 1.814814814814815e-05, "loss": 0.4881, "step": 294 }, { "epoch": 0.45577443028196213, "grad_norm": 0.7151623923954042, "learning_rate": 1.820987654320988e-05, "loss": 0.4801, "step": 295 }, { "epoch": 0.45731942835071454, "grad_norm": 0.7042975048721097, "learning_rate": 1.8271604938271607e-05, "loss": 0.4938, "step": 296 }, { "epoch": 0.45886442641946695, "grad_norm": 0.4995757593080137, "learning_rate": 1.8333333333333333e-05, "loss": 0.4857, "step": 297 }, { "epoch": 0.46040942448821937, "grad_norm": 0.6187745198391312, "learning_rate": 1.8395061728395062e-05, "loss": 0.5052, "step": 298 }, { "epoch": 0.4619544225569718, "grad_norm": 0.6059646363284727, "learning_rate": 1.8456790123456792e-05, "loss": 0.4713, "step": 299 }, { "epoch": 0.46349942062572425, "grad_norm": 0.6482948009625152, "learning_rate": 1.851851851851852e-05, "loss": 0.5005, "step": 300 }, { "epoch": 0.46504441869447666, "grad_norm": 0.6992107201545023, "learning_rate": 1.8580246913580248e-05, "loss": 0.5022, "step": 301 }, { "epoch": 0.46658941676322907, "grad_norm": 0.7222756350520918, "learning_rate": 1.8641975308641977e-05, "loss": 0.4914, "step": 302 }, { "epoch": 0.4681344148319815, "grad_norm": 0.5273457902203218, "learning_rate": 1.8703703703703707e-05, "loss": 0.4895, "step": 303 }, { "epoch": 0.4696794129007339, "grad_norm": 0.7559504982375984, "learning_rate": 1.8765432098765433e-05, "loss": 0.4728, "step": 304 }, { "epoch": 0.4712244109694863, "grad_norm": 0.6653810506880293, "learning_rate": 1.8827160493827163e-05, "loss": 0.4959, "step": 305 }, { "epoch": 0.4727694090382387, "grad_norm": 0.6398622780969125, "learning_rate": 1.888888888888889e-05, "loss": 0.5179, "step": 306 }, { "epoch": 0.47431440710699113, "grad_norm": 0.7253897617529732, "learning_rate": 1.895061728395062e-05, "loss": 0.4873, "step": 307 }, { "epoch": 0.47585940517574354, "grad_norm": 0.7271988533673481, "learning_rate": 1.901234567901235e-05, "loss": 0.4802, "step": 308 }, { "epoch": 0.47740440324449596, "grad_norm": 0.7021909017029754, "learning_rate": 1.9074074074074075e-05, "loss": 0.4674, "step": 309 }, { "epoch": 0.47894940131324837, "grad_norm": 0.9323527639437065, "learning_rate": 1.9135802469135804e-05, "loss": 0.4872, "step": 310 }, { "epoch": 0.4804943993820008, "grad_norm": 0.8973508473397817, "learning_rate": 1.9197530864197534e-05, "loss": 0.4766, "step": 311 }, { "epoch": 0.4820393974507532, "grad_norm": 0.576113380299896, "learning_rate": 1.925925925925926e-05, "loss": 0.4845, "step": 312 }, { "epoch": 0.4835843955195056, "grad_norm": 0.6191199246371925, "learning_rate": 1.932098765432099e-05, "loss": 0.4976, "step": 313 }, { "epoch": 0.485129393588258, "grad_norm": 0.7369196700033259, "learning_rate": 1.9382716049382716e-05, "loss": 0.4961, "step": 314 }, { "epoch": 0.48667439165701043, "grad_norm": 0.6341586678347879, "learning_rate": 1.9444444444444445e-05, "loss": 0.4809, "step": 315 }, { "epoch": 0.48821938972576284, "grad_norm": 0.7384561853673551, "learning_rate": 1.9506172839506175e-05, "loss": 0.4999, "step": 316 }, { "epoch": 0.48976438779451525, "grad_norm": 0.7336115929437365, "learning_rate": 1.9567901234567905e-05, "loss": 0.4558, "step": 317 }, { "epoch": 0.49130938586326767, "grad_norm": 0.7486709134260394, "learning_rate": 1.962962962962963e-05, "loss": 0.4788, "step": 318 }, { "epoch": 0.4928543839320201, "grad_norm": 0.6720544533117421, "learning_rate": 1.969135802469136e-05, "loss": 0.5111, "step": 319 }, { "epoch": 0.4943993820007725, "grad_norm": 0.7761063436131116, "learning_rate": 1.9753086419753087e-05, "loss": 0.4782, "step": 320 }, { "epoch": 0.4959443800695249, "grad_norm": 0.528674070490898, "learning_rate": 1.9814814814814816e-05, "loss": 0.4852, "step": 321 }, { "epoch": 0.4974893781382773, "grad_norm": 0.6706818047677288, "learning_rate": 1.9876543209876546e-05, "loss": 0.4802, "step": 322 }, { "epoch": 0.49903437620702973, "grad_norm": 0.6339238603786248, "learning_rate": 1.9938271604938272e-05, "loss": 0.5068, "step": 323 }, { "epoch": 0.5005793742757821, "grad_norm": 0.7173836836933206, "learning_rate": 2e-05, "loss": 0.4649, "step": 324 }, { "epoch": 0.5021243723445346, "grad_norm": 0.559933412612215, "learning_rate": 1.9999994176483684e-05, "loss": 0.4978, "step": 325 }, { "epoch": 0.503669370413287, "grad_norm": 0.7436099661686131, "learning_rate": 1.9999976705941517e-05, "loss": 0.4839, "step": 326 }, { "epoch": 0.5052143684820394, "grad_norm": 0.5838636102611117, "learning_rate": 1.999994758839384e-05, "loss": 0.4857, "step": 327 }, { "epoch": 0.5067593665507918, "grad_norm": 0.7734271288071158, "learning_rate": 1.9999906823874576e-05, "loss": 0.5111, "step": 328 }, { "epoch": 0.5083043646195442, "grad_norm": 0.787973036598539, "learning_rate": 1.99998544124312e-05, "loss": 0.5045, "step": 329 }, { "epoch": 0.5098493626882966, "grad_norm": 0.5203082033516738, "learning_rate": 1.9999790354124753e-05, "loss": 0.4719, "step": 330 }, { "epoch": 0.511394360757049, "grad_norm": 0.6743999318154785, "learning_rate": 1.999971464902985e-05, "loss": 0.4788, "step": 331 }, { "epoch": 0.5129393588258014, "grad_norm": 0.5879354810039861, "learning_rate": 1.9999627297234657e-05, "loss": 0.4832, "step": 332 }, { "epoch": 0.5144843568945539, "grad_norm": 0.5201223059400362, "learning_rate": 1.9999528298840922e-05, "loss": 0.4832, "step": 333 }, { "epoch": 0.5160293549633063, "grad_norm": 0.5652951794150841, "learning_rate": 1.999941765396394e-05, "loss": 0.4776, "step": 334 }, { "epoch": 0.5175743530320587, "grad_norm": 0.5775260383487908, "learning_rate": 1.999929536273259e-05, "loss": 0.4557, "step": 335 }, { "epoch": 0.5191193511008111, "grad_norm": 0.6030130531747527, "learning_rate": 1.9999161425289293e-05, "loss": 0.5049, "step": 336 }, { "epoch": 0.5206643491695635, "grad_norm": 0.6873617700228188, "learning_rate": 1.999901584179005e-05, "loss": 0.4926, "step": 337 }, { "epoch": 0.5222093472383159, "grad_norm": 0.7045808369205199, "learning_rate": 1.999885861240443e-05, "loss": 0.4786, "step": 338 }, { "epoch": 0.5237543453070683, "grad_norm": 0.5845456254159683, "learning_rate": 1.9998689737315552e-05, "loss": 0.4924, "step": 339 }, { "epoch": 0.5252993433758207, "grad_norm": 0.6601874220285872, "learning_rate": 1.9998509216720105e-05, "loss": 0.4786, "step": 340 }, { "epoch": 0.5268443414445731, "grad_norm": 0.6372741303131043, "learning_rate": 1.9998317050828344e-05, "loss": 0.4806, "step": 341 }, { "epoch": 0.5283893395133256, "grad_norm": 0.5489797827607252, "learning_rate": 1.9998113239864084e-05, "loss": 0.4755, "step": 342 }, { "epoch": 0.529934337582078, "grad_norm": 0.5301387031120968, "learning_rate": 1.9997897784064703e-05, "loss": 0.4736, "step": 343 }, { "epoch": 0.5314793356508304, "grad_norm": 0.5901801068285081, "learning_rate": 1.9997670683681147e-05, "loss": 0.5029, "step": 344 }, { "epoch": 0.5330243337195828, "grad_norm": 0.5580833911916225, "learning_rate": 1.9997431938977917e-05, "loss": 0.4799, "step": 345 }, { "epoch": 0.5345693317883352, "grad_norm": 0.674579231832587, "learning_rate": 1.9997181550233087e-05, "loss": 0.5011, "step": 346 }, { "epoch": 0.5361143298570876, "grad_norm": 0.5824731876820698, "learning_rate": 1.9996919517738274e-05, "loss": 0.478, "step": 347 }, { "epoch": 0.5376593279258401, "grad_norm": 0.6944167289759114, "learning_rate": 1.9996645841798674e-05, "loss": 0.4808, "step": 348 }, { "epoch": 0.5392043259945926, "grad_norm": 0.6292738786597291, "learning_rate": 1.9996360522733042e-05, "loss": 0.4741, "step": 349 }, { "epoch": 0.540749324063345, "grad_norm": 0.5779598417118014, "learning_rate": 1.9996063560873683e-05, "loss": 0.481, "step": 350 }, { "epoch": 0.5422943221320974, "grad_norm": 0.5661547406370173, "learning_rate": 1.9995754956566474e-05, "loss": 0.4808, "step": 351 }, { "epoch": 0.5438393202008498, "grad_norm": 0.6338637806603226, "learning_rate": 1.9995434710170846e-05, "loss": 0.5095, "step": 352 }, { "epoch": 0.5453843182696022, "grad_norm": 0.6057747222209161, "learning_rate": 1.999510282205979e-05, "loss": 0.4811, "step": 353 }, { "epoch": 0.5469293163383546, "grad_norm": 0.5979925160054058, "learning_rate": 1.999475929261986e-05, "loss": 0.4912, "step": 354 }, { "epoch": 0.548474314407107, "grad_norm": 0.5673995805516399, "learning_rate": 1.9994404122251165e-05, "loss": 0.4956, "step": 355 }, { "epoch": 0.5500193124758594, "grad_norm": 0.5429092104503371, "learning_rate": 1.999403731136737e-05, "loss": 0.4694, "step": 356 }, { "epoch": 0.5515643105446119, "grad_norm": 0.6162747275117934, "learning_rate": 1.9993658860395705e-05, "loss": 0.4639, "step": 357 }, { "epoch": 0.5531093086133643, "grad_norm": 0.5289958498571179, "learning_rate": 1.999326876977695e-05, "loss": 0.489, "step": 358 }, { "epoch": 0.5546543066821167, "grad_norm": 0.6956854989866178, "learning_rate": 1.9992867039965446e-05, "loss": 0.4801, "step": 359 }, { "epoch": 0.5561993047508691, "grad_norm": 0.45506994619166224, "learning_rate": 1.999245367142909e-05, "loss": 0.4562, "step": 360 }, { "epoch": 0.5577443028196215, "grad_norm": 0.5803537902707872, "learning_rate": 1.9992028664649333e-05, "loss": 0.4844, "step": 361 }, { "epoch": 0.5592893008883739, "grad_norm": 0.6999398067672966, "learning_rate": 1.999159202012118e-05, "loss": 0.4973, "step": 362 }, { "epoch": 0.5608342989571263, "grad_norm": 0.5884486560625902, "learning_rate": 1.9991143738353192e-05, "loss": 0.4567, "step": 363 }, { "epoch": 0.5623792970258787, "grad_norm": 0.7686029400088842, "learning_rate": 1.9990683819867488e-05, "loss": 0.5116, "step": 364 }, { "epoch": 0.5639242950946312, "grad_norm": 0.8027722669305977, "learning_rate": 1.9990212265199738e-05, "loss": 0.4644, "step": 365 }, { "epoch": 0.5654692931633836, "grad_norm": 0.53603964469965, "learning_rate": 1.9989729074899155e-05, "loss": 0.488, "step": 366 }, { "epoch": 0.567014291232136, "grad_norm": 0.7992162509762684, "learning_rate": 1.998923424952852e-05, "loss": 0.4901, "step": 367 }, { "epoch": 0.5685592893008884, "grad_norm": 0.9592338513559997, "learning_rate": 1.9988727789664156e-05, "loss": 0.4807, "step": 368 }, { "epoch": 0.5701042873696408, "grad_norm": 0.5239783663668971, "learning_rate": 1.9988209695895934e-05, "loss": 0.4614, "step": 369 }, { "epoch": 0.5716492854383932, "grad_norm": 0.8890855920263243, "learning_rate": 1.9987679968827284e-05, "loss": 0.4718, "step": 370 }, { "epoch": 0.5731942835071456, "grad_norm": 0.7679960499673951, "learning_rate": 1.998713860907518e-05, "loss": 0.4557, "step": 371 }, { "epoch": 0.574739281575898, "grad_norm": 0.5265352371361026, "learning_rate": 1.9986585617270146e-05, "loss": 0.5142, "step": 372 }, { "epoch": 0.5762842796446505, "grad_norm": 0.8569226107458833, "learning_rate": 1.9986020994056253e-05, "loss": 0.5081, "step": 373 }, { "epoch": 0.5778292777134029, "grad_norm": 0.5730006960440248, "learning_rate": 1.9985444740091115e-05, "loss": 0.4677, "step": 374 }, { "epoch": 0.5793742757821553, "grad_norm": 0.7060033644385073, "learning_rate": 1.9984856856045905e-05, "loss": 0.4518, "step": 375 }, { "epoch": 0.5809192738509077, "grad_norm": 0.5764080189980941, "learning_rate": 1.998425734260533e-05, "loss": 0.4936, "step": 376 }, { "epoch": 0.5824642719196601, "grad_norm": 0.6530402141725, "learning_rate": 1.9983646200467642e-05, "loss": 0.4849, "step": 377 }, { "epoch": 0.5840092699884125, "grad_norm": 0.5922520687527244, "learning_rate": 1.9983023430344645e-05, "loss": 0.476, "step": 378 }, { "epoch": 0.5855542680571649, "grad_norm": 0.6622960301785015, "learning_rate": 1.9982389032961674e-05, "loss": 0.4738, "step": 379 }, { "epoch": 0.5870992661259173, "grad_norm": 0.5932387762499344, "learning_rate": 1.9981743009057625e-05, "loss": 0.4721, "step": 380 }, { "epoch": 0.5886442641946698, "grad_norm": 0.604356990297672, "learning_rate": 1.998108535938492e-05, "loss": 0.4549, "step": 381 }, { "epoch": 0.5901892622634222, "grad_norm": 0.6115703619011411, "learning_rate": 1.998041608470952e-05, "loss": 0.4827, "step": 382 }, { "epoch": 0.5917342603321746, "grad_norm": 0.5263040606071182, "learning_rate": 1.9979735185810934e-05, "loss": 0.4632, "step": 383 }, { "epoch": 0.593279258400927, "grad_norm": 0.52183161353744, "learning_rate": 1.997904266348221e-05, "loss": 0.5046, "step": 384 }, { "epoch": 0.5948242564696794, "grad_norm": 0.5839422620278649, "learning_rate": 1.9978338518529927e-05, "loss": 0.4828, "step": 385 }, { "epoch": 0.5963692545384318, "grad_norm": 0.4875831343294605, "learning_rate": 1.997762275177421e-05, "loss": 0.4994, "step": 386 }, { "epoch": 0.5979142526071842, "grad_norm": 0.5633325866593137, "learning_rate": 1.997689536404871e-05, "loss": 0.478, "step": 387 }, { "epoch": 0.5994592506759366, "grad_norm": 0.5847424662418874, "learning_rate": 1.9976156356200624e-05, "loss": 0.4523, "step": 388 }, { "epoch": 0.601004248744689, "grad_norm": 0.6139122738669522, "learning_rate": 1.9975405729090672e-05, "loss": 0.4712, "step": 389 }, { "epoch": 0.6025492468134415, "grad_norm": 0.47469630122984285, "learning_rate": 1.9974643483593113e-05, "loss": 0.4818, "step": 390 }, { "epoch": 0.6040942448821939, "grad_norm": 0.56327001825302, "learning_rate": 1.9973869620595735e-05, "loss": 0.4837, "step": 391 }, { "epoch": 0.6056392429509463, "grad_norm": 0.5349368768413365, "learning_rate": 1.997308414099986e-05, "loss": 0.4711, "step": 392 }, { "epoch": 0.6071842410196987, "grad_norm": 0.5336241812914135, "learning_rate": 1.997228704572034e-05, "loss": 0.48, "step": 393 }, { "epoch": 0.6087292390884511, "grad_norm": 0.5706105355552523, "learning_rate": 1.9971478335685554e-05, "loss": 0.4814, "step": 394 }, { "epoch": 0.6102742371572035, "grad_norm": 0.5521450734072392, "learning_rate": 1.9970658011837404e-05, "loss": 0.485, "step": 395 }, { "epoch": 0.6118192352259559, "grad_norm": 0.572963367170591, "learning_rate": 1.9969826075131332e-05, "loss": 0.4469, "step": 396 }, { "epoch": 0.6133642332947083, "grad_norm": 0.7156578110712472, "learning_rate": 1.99689825265363e-05, "loss": 0.4842, "step": 397 }, { "epoch": 0.6149092313634608, "grad_norm": 0.5707287593061962, "learning_rate": 1.996812736703478e-05, "loss": 0.4828, "step": 398 }, { "epoch": 0.6164542294322132, "grad_norm": 0.6192176042391873, "learning_rate": 1.9967260597622783e-05, "loss": 0.4812, "step": 399 }, { "epoch": 0.6179992275009656, "grad_norm": 0.46037341427538186, "learning_rate": 1.9966382219309844e-05, "loss": 0.4802, "step": 400 }, { "epoch": 0.619544225569718, "grad_norm": 0.6250606047819924, "learning_rate": 1.9965492233119007e-05, "loss": 0.4611, "step": 401 }, { "epoch": 0.6210892236384704, "grad_norm": 0.5013136460382464, "learning_rate": 1.996459064008684e-05, "loss": 0.4539, "step": 402 }, { "epoch": 0.6226342217072228, "grad_norm": 0.49658115991264595, "learning_rate": 1.9963677441263444e-05, "loss": 0.4881, "step": 403 }, { "epoch": 0.6241792197759752, "grad_norm": 0.4980556963905113, "learning_rate": 1.9962752637712407e-05, "loss": 0.4773, "step": 404 }, { "epoch": 0.6257242178447276, "grad_norm": 0.4606748293454025, "learning_rate": 1.9961816230510865e-05, "loss": 0.4685, "step": 405 }, { "epoch": 0.6272692159134801, "grad_norm": 0.513098348974989, "learning_rate": 1.996086822074945e-05, "loss": 0.4729, "step": 406 }, { "epoch": 0.6288142139822325, "grad_norm": 0.6745382495067648, "learning_rate": 1.995990860953231e-05, "loss": 0.4715, "step": 407 }, { "epoch": 0.6303592120509849, "grad_norm": 0.5812800257502622, "learning_rate": 1.9958937397977106e-05, "loss": 0.4882, "step": 408 }, { "epoch": 0.6319042101197373, "grad_norm": 0.4967563402161336, "learning_rate": 1.9957954587215015e-05, "loss": 0.4465, "step": 409 }, { "epoch": 0.6334492081884897, "grad_norm": 0.5353724919428766, "learning_rate": 1.9956960178390722e-05, "loss": 0.4652, "step": 410 }, { "epoch": 0.6349942062572422, "grad_norm": 0.5977484243441894, "learning_rate": 1.9955954172662412e-05, "loss": 0.471, "step": 411 }, { "epoch": 0.6365392043259946, "grad_norm": 0.5728110478964238, "learning_rate": 1.9954936571201787e-05, "loss": 0.4644, "step": 412 }, { "epoch": 0.6380842023947471, "grad_norm": 0.4966127297578567, "learning_rate": 1.9953907375194048e-05, "loss": 0.4841, "step": 413 }, { "epoch": 0.6396292004634995, "grad_norm": 0.4238906650293999, "learning_rate": 1.9952866585837907e-05, "loss": 0.4767, "step": 414 }, { "epoch": 0.6411741985322519, "grad_norm": 0.5021223189099361, "learning_rate": 1.9951814204345573e-05, "loss": 0.4828, "step": 415 }, { "epoch": 0.6427191966010043, "grad_norm": 0.45071322258656704, "learning_rate": 1.9950750231942756e-05, "loss": 0.4731, "step": 416 }, { "epoch": 0.6442641946697567, "grad_norm": 0.5143892247548273, "learning_rate": 1.994967466986867e-05, "loss": 0.4634, "step": 417 }, { "epoch": 0.6458091927385091, "grad_norm": 0.4860208525828681, "learning_rate": 1.9948587519376024e-05, "loss": 0.4625, "step": 418 }, { "epoch": 0.6473541908072615, "grad_norm": 0.5396692239515574, "learning_rate": 1.994748878173103e-05, "loss": 0.4875, "step": 419 }, { "epoch": 0.6488991888760139, "grad_norm": 0.504906333166527, "learning_rate": 1.9946378458213385e-05, "loss": 0.4756, "step": 420 }, { "epoch": 0.6504441869447664, "grad_norm": 0.5478797890987712, "learning_rate": 1.994525655011629e-05, "loss": 0.4655, "step": 421 }, { "epoch": 0.6519891850135188, "grad_norm": 0.48466577172367287, "learning_rate": 1.9944123058746438e-05, "loss": 0.4845, "step": 422 }, { "epoch": 0.6535341830822712, "grad_norm": 0.48727796935979073, "learning_rate": 1.9942977985424008e-05, "loss": 0.4956, "step": 423 }, { "epoch": 0.6550791811510236, "grad_norm": 0.5404229468156803, "learning_rate": 1.9941821331482668e-05, "loss": 0.4745, "step": 424 }, { "epoch": 0.656624179219776, "grad_norm": 0.5218649888859695, "learning_rate": 1.9940653098269576e-05, "loss": 0.4739, "step": 425 }, { "epoch": 0.6581691772885284, "grad_norm": 0.6277511722785452, "learning_rate": 1.9939473287145383e-05, "loss": 0.4838, "step": 426 }, { "epoch": 0.6597141753572808, "grad_norm": 0.5792420993844729, "learning_rate": 1.9938281899484212e-05, "loss": 0.4712, "step": 427 }, { "epoch": 0.6612591734260332, "grad_norm": 0.4664485056454156, "learning_rate": 1.993707893667368e-05, "loss": 0.4699, "step": 428 }, { "epoch": 0.6628041714947857, "grad_norm": 0.5895150414801137, "learning_rate": 1.9935864400114883e-05, "loss": 0.4816, "step": 429 }, { "epoch": 0.6643491695635381, "grad_norm": 0.4989619826581152, "learning_rate": 1.9934638291222393e-05, "loss": 0.4759, "step": 430 }, { "epoch": 0.6658941676322905, "grad_norm": 0.5409970663691207, "learning_rate": 1.9933400611424263e-05, "loss": 0.4994, "step": 431 }, { "epoch": 0.6674391657010429, "grad_norm": 0.44069921822749025, "learning_rate": 1.9932151362162023e-05, "loss": 0.4646, "step": 432 }, { "epoch": 0.6689841637697953, "grad_norm": 0.4743165405387189, "learning_rate": 1.993089054489068e-05, "loss": 0.4512, "step": 433 }, { "epoch": 0.6705291618385477, "grad_norm": 0.4603537336942946, "learning_rate": 1.9929618161078706e-05, "loss": 0.494, "step": 434 }, { "epoch": 0.6720741599073001, "grad_norm": 0.4994980644579075, "learning_rate": 1.992833421220806e-05, "loss": 0.4862, "step": 435 }, { "epoch": 0.6736191579760525, "grad_norm": 0.482300744590858, "learning_rate": 1.9927038699774153e-05, "loss": 0.4596, "step": 436 }, { "epoch": 0.675164156044805, "grad_norm": 0.45968194841108734, "learning_rate": 1.9925731625285877e-05, "loss": 0.4588, "step": 437 }, { "epoch": 0.6767091541135574, "grad_norm": 0.5306478390618934, "learning_rate": 1.9924412990265583e-05, "loss": 0.4469, "step": 438 }, { "epoch": 0.6782541521823098, "grad_norm": 0.4523498225363492, "learning_rate": 1.992308279624909e-05, "loss": 0.4726, "step": 439 }, { "epoch": 0.6797991502510622, "grad_norm": 0.5050806980150873, "learning_rate": 1.9921741044785686e-05, "loss": 0.4695, "step": 440 }, { "epoch": 0.6813441483198146, "grad_norm": 0.4973788637615642, "learning_rate": 1.9920387737438106e-05, "loss": 0.4498, "step": 441 }, { "epoch": 0.682889146388567, "grad_norm": 0.5324747468011528, "learning_rate": 1.991902287578255e-05, "loss": 0.4684, "step": 442 }, { "epoch": 0.6844341444573194, "grad_norm": 0.5207643117728168, "learning_rate": 1.991764646140868e-05, "loss": 0.4567, "step": 443 }, { "epoch": 0.6859791425260718, "grad_norm": 0.6917547831340377, "learning_rate": 1.9916258495919613e-05, "loss": 0.4619, "step": 444 }, { "epoch": 0.6875241405948243, "grad_norm": 0.5831283735958166, "learning_rate": 1.9914858980931915e-05, "loss": 0.4779, "step": 445 }, { "epoch": 0.6890691386635767, "grad_norm": 0.6434653697660098, "learning_rate": 1.9913447918075603e-05, "loss": 0.4843, "step": 446 }, { "epoch": 0.6906141367323291, "grad_norm": 0.6888110907722848, "learning_rate": 1.9912025308994146e-05, "loss": 0.4656, "step": 447 }, { "epoch": 0.6921591348010815, "grad_norm": 0.5572773141730595, "learning_rate": 1.991059115534447e-05, "loss": 0.4799, "step": 448 }, { "epoch": 0.6937041328698339, "grad_norm": 0.6228827937417516, "learning_rate": 1.990914545879693e-05, "loss": 0.4863, "step": 449 }, { "epoch": 0.6952491309385863, "grad_norm": 0.9385549718614218, "learning_rate": 1.9907688221035334e-05, "loss": 0.4965, "step": 450 }, { "epoch": 0.6967941290073387, "grad_norm": 0.6216201408540786, "learning_rate": 1.9906219443756934e-05, "loss": 0.4944, "step": 451 }, { "epoch": 0.6983391270760911, "grad_norm": 0.7374497687556242, "learning_rate": 1.990473912867242e-05, "loss": 0.4785, "step": 452 }, { "epoch": 0.6998841251448435, "grad_norm": 0.7242281547004954, "learning_rate": 1.9903247277505918e-05, "loss": 0.5072, "step": 453 }, { "epoch": 0.701429123213596, "grad_norm": 0.6380953606446602, "learning_rate": 1.990174389199499e-05, "loss": 0.4507, "step": 454 }, { "epoch": 0.7029741212823484, "grad_norm": 0.63217422340224, "learning_rate": 1.9900228973890642e-05, "loss": 0.4566, "step": 455 }, { "epoch": 0.7045191193511008, "grad_norm": 0.6753151746239234, "learning_rate": 1.9898702524957297e-05, "loss": 0.4972, "step": 456 }, { "epoch": 0.7060641174198532, "grad_norm": 0.5716164825628418, "learning_rate": 1.9897164546972818e-05, "loss": 0.4652, "step": 457 }, { "epoch": 0.7076091154886056, "grad_norm": 0.5781162887062791, "learning_rate": 1.989561504172849e-05, "loss": 0.479, "step": 458 }, { "epoch": 0.709154113557358, "grad_norm": 0.5546548532909156, "learning_rate": 1.989405401102903e-05, "loss": 0.4659, "step": 459 }, { "epoch": 0.7106991116261104, "grad_norm": 0.497707112874905, "learning_rate": 1.9892481456692577e-05, "loss": 0.4548, "step": 460 }, { "epoch": 0.7122441096948628, "grad_norm": 0.5487426776093526, "learning_rate": 1.989089738055068e-05, "loss": 0.4815, "step": 461 }, { "epoch": 0.7137891077636153, "grad_norm": 0.5357768679760406, "learning_rate": 1.988930178444833e-05, "loss": 0.4763, "step": 462 }, { "epoch": 0.7153341058323677, "grad_norm": 0.49667199874196705, "learning_rate": 1.988769467024392e-05, "loss": 0.4762, "step": 463 }, { "epoch": 0.7168791039011201, "grad_norm": 0.49018263206008067, "learning_rate": 1.9886076039809258e-05, "loss": 0.481, "step": 464 }, { "epoch": 0.7184241019698725, "grad_norm": 0.4932635622505462, "learning_rate": 1.9884445895029566e-05, "loss": 0.5002, "step": 465 }, { "epoch": 0.7199691000386249, "grad_norm": 0.6478933955895217, "learning_rate": 1.9882804237803487e-05, "loss": 0.4829, "step": 466 }, { "epoch": 0.7215140981073773, "grad_norm": 0.550351756704014, "learning_rate": 1.9881151070043056e-05, "loss": 0.4651, "step": 467 }, { "epoch": 0.7230590961761297, "grad_norm": 0.4803448277234995, "learning_rate": 1.987948639367373e-05, "loss": 0.4703, "step": 468 }, { "epoch": 0.7246040942448821, "grad_norm": 0.6703270815763992, "learning_rate": 1.987781021063436e-05, "loss": 0.4844, "step": 469 }, { "epoch": 0.7261490923136346, "grad_norm": 0.5130509795552595, "learning_rate": 1.98761225228772e-05, "loss": 0.4589, "step": 470 }, { "epoch": 0.727694090382387, "grad_norm": 0.6124474909651425, "learning_rate": 1.9874423332367906e-05, "loss": 0.485, "step": 471 }, { "epoch": 0.7292390884511394, "grad_norm": 0.5849000615106008, "learning_rate": 1.9872712641085532e-05, "loss": 0.4548, "step": 472 }, { "epoch": 0.7307840865198918, "grad_norm": 0.6493342997987456, "learning_rate": 1.987099045102253e-05, "loss": 0.4738, "step": 473 }, { "epoch": 0.7323290845886443, "grad_norm": 0.521504029687783, "learning_rate": 1.9869256764184732e-05, "loss": 0.4623, "step": 474 }, { "epoch": 0.7338740826573967, "grad_norm": 0.6071760408826405, "learning_rate": 1.986751158259137e-05, "loss": 0.4853, "step": 475 }, { "epoch": 0.7354190807261491, "grad_norm": 0.6133100208095442, "learning_rate": 1.986575490827507e-05, "loss": 0.48, "step": 476 }, { "epoch": 0.7369640787949016, "grad_norm": 0.6052978907744019, "learning_rate": 1.986398674328183e-05, "loss": 0.4691, "step": 477 }, { "epoch": 0.738509076863654, "grad_norm": 0.5211547069567607, "learning_rate": 1.986220708967104e-05, "loss": 0.4687, "step": 478 }, { "epoch": 0.7400540749324064, "grad_norm": 0.6504062797439619, "learning_rate": 1.9860415949515465e-05, "loss": 0.4617, "step": 479 }, { "epoch": 0.7415990730011588, "grad_norm": 0.4631275306118629, "learning_rate": 1.9858613324901258e-05, "loss": 0.4699, "step": 480 }, { "epoch": 0.7431440710699112, "grad_norm": 0.5408772862256126, "learning_rate": 1.9856799217927937e-05, "loss": 0.4745, "step": 481 }, { "epoch": 0.7446890691386636, "grad_norm": 0.615734729476013, "learning_rate": 1.9854973630708398e-05, "loss": 0.493, "step": 482 }, { "epoch": 0.746234067207416, "grad_norm": 0.5497481369298501, "learning_rate": 1.9853136565368912e-05, "loss": 0.4773, "step": 483 }, { "epoch": 0.7477790652761684, "grad_norm": 0.5395695768899293, "learning_rate": 1.985128802404911e-05, "loss": 0.4692, "step": 484 }, { "epoch": 0.7493240633449209, "grad_norm": 0.4717158441955894, "learning_rate": 1.9849428008901998e-05, "loss": 0.4443, "step": 485 }, { "epoch": 0.7508690614136733, "grad_norm": 0.5519431318167544, "learning_rate": 1.984755652209394e-05, "loss": 0.4353, "step": 486 }, { "epoch": 0.7524140594824257, "grad_norm": 0.5704660425361644, "learning_rate": 1.9845673565804668e-05, "loss": 0.4672, "step": 487 }, { "epoch": 0.7539590575511781, "grad_norm": 0.5019881000170023, "learning_rate": 1.9843779142227258e-05, "loss": 0.4707, "step": 488 }, { "epoch": 0.7555040556199305, "grad_norm": 0.8104508302720752, "learning_rate": 1.984187325356816e-05, "loss": 0.4673, "step": 489 }, { "epoch": 0.7570490536886829, "grad_norm": 0.5557089923302476, "learning_rate": 1.9839955902047158e-05, "loss": 0.4766, "step": 490 }, { "epoch": 0.7585940517574353, "grad_norm": 0.8025127585338392, "learning_rate": 1.983802708989741e-05, "loss": 0.4763, "step": 491 }, { "epoch": 0.7601390498261877, "grad_norm": 0.6276227244932889, "learning_rate": 1.9836086819365402e-05, "loss": 0.4681, "step": 492 }, { "epoch": 0.7616840478949402, "grad_norm": 0.5408146556433482, "learning_rate": 1.9834135092710974e-05, "loss": 0.4571, "step": 493 }, { "epoch": 0.7632290459636926, "grad_norm": 0.6639150288183177, "learning_rate": 1.9832171912207315e-05, "loss": 0.476, "step": 494 }, { "epoch": 0.764774044032445, "grad_norm": 0.566307892664434, "learning_rate": 1.9830197280140942e-05, "loss": 0.4789, "step": 495 }, { "epoch": 0.7663190421011974, "grad_norm": 0.48340622646584586, "learning_rate": 1.9828211198811712e-05, "loss": 0.457, "step": 496 }, { "epoch": 0.7678640401699498, "grad_norm": 0.5611417647939552, "learning_rate": 1.982621367053283e-05, "loss": 0.4735, "step": 497 }, { "epoch": 0.7694090382387022, "grad_norm": 0.8124664520976523, "learning_rate": 1.9824204697630813e-05, "loss": 0.4871, "step": 498 }, { "epoch": 0.7709540363074546, "grad_norm": 0.5828019722844524, "learning_rate": 1.9822184282445526e-05, "loss": 0.4842, "step": 499 }, { "epoch": 0.772499034376207, "grad_norm": 0.6152676172737613, "learning_rate": 1.982015242733015e-05, "loss": 0.4606, "step": 500 }, { "epoch": 0.7740440324449595, "grad_norm": 0.6059648748037076, "learning_rate": 1.9818109134651195e-05, "loss": 0.4626, "step": 501 }, { "epoch": 0.7755890305137119, "grad_norm": 0.497175893371789, "learning_rate": 1.9816054406788487e-05, "loss": 0.4758, "step": 502 }, { "epoch": 0.7771340285824643, "grad_norm": 0.7034466843527079, "learning_rate": 1.981398824613518e-05, "loss": 0.4722, "step": 503 }, { "epoch": 0.7786790266512167, "grad_norm": 0.72551984241267, "learning_rate": 1.9811910655097738e-05, "loss": 0.4835, "step": 504 }, { "epoch": 0.7802240247199691, "grad_norm": 0.474056779686596, "learning_rate": 1.9809821636095928e-05, "loss": 0.4453, "step": 505 }, { "epoch": 0.7817690227887215, "grad_norm": 0.6552409532263126, "learning_rate": 1.980772119156285e-05, "loss": 0.501, "step": 506 }, { "epoch": 0.7833140208574739, "grad_norm": 0.6656122111544996, "learning_rate": 1.980560932394489e-05, "loss": 0.4566, "step": 507 }, { "epoch": 0.7848590189262263, "grad_norm": 0.5108710688172251, "learning_rate": 1.980348603570175e-05, "loss": 0.4827, "step": 508 }, { "epoch": 0.7864040169949787, "grad_norm": 0.7029240631369389, "learning_rate": 1.980135132930643e-05, "loss": 0.4906, "step": 509 }, { "epoch": 0.7879490150637312, "grad_norm": 0.5357521646243408, "learning_rate": 1.979920520724523e-05, "loss": 0.4739, "step": 510 }, { "epoch": 0.7894940131324836, "grad_norm": 0.6138856333289459, "learning_rate": 1.979704767201775e-05, "loss": 0.4521, "step": 511 }, { "epoch": 0.791039011201236, "grad_norm": 0.7655034083350113, "learning_rate": 1.979487872613687e-05, "loss": 0.462, "step": 512 }, { "epoch": 0.7925840092699884, "grad_norm": 0.5506482235274379, "learning_rate": 1.9792698372128773e-05, "loss": 0.4721, "step": 513 }, { "epoch": 0.7941290073387408, "grad_norm": 0.6399879733970042, "learning_rate": 1.9790506612532922e-05, "loss": 0.479, "step": 514 }, { "epoch": 0.7956740054074932, "grad_norm": 0.6743218492238005, "learning_rate": 1.978830344990207e-05, "loss": 0.4959, "step": 515 }, { "epoch": 0.7972190034762456, "grad_norm": 0.6245094087709018, "learning_rate": 1.978608888680225e-05, "loss": 0.489, "step": 516 }, { "epoch": 0.798764001544998, "grad_norm": 0.5231892205730603, "learning_rate": 1.9783862925812763e-05, "loss": 0.4729, "step": 517 }, { "epoch": 0.8003089996137505, "grad_norm": 0.6980445425608665, "learning_rate": 1.9781625569526196e-05, "loss": 0.4761, "step": 518 }, { "epoch": 0.8018539976825029, "grad_norm": 0.46725084526314115, "learning_rate": 1.9779376820548406e-05, "loss": 0.48, "step": 519 }, { "epoch": 0.8033989957512553, "grad_norm": 0.7529080485036865, "learning_rate": 1.977711668149852e-05, "loss": 0.4517, "step": 520 }, { "epoch": 0.8049439938200077, "grad_norm": 0.5514745904883499, "learning_rate": 1.977484515500893e-05, "loss": 0.4535, "step": 521 }, { "epoch": 0.8064889918887601, "grad_norm": 0.5227150688951454, "learning_rate": 1.9772562243725284e-05, "loss": 0.4596, "step": 522 }, { "epoch": 0.8080339899575125, "grad_norm": 0.5961824948692727, "learning_rate": 1.97702679503065e-05, "loss": 0.47, "step": 523 }, { "epoch": 0.8095789880262649, "grad_norm": 0.45568517785748874, "learning_rate": 1.976796227742475e-05, "loss": 0.4537, "step": 524 }, { "epoch": 0.8111239860950173, "grad_norm": 0.7131825657348263, "learning_rate": 1.976564522776546e-05, "loss": 0.4866, "step": 525 }, { "epoch": 0.8126689841637698, "grad_norm": 0.5749936938909824, "learning_rate": 1.97633168040273e-05, "loss": 0.4721, "step": 526 }, { "epoch": 0.8142139822325222, "grad_norm": 0.5036849330761999, "learning_rate": 1.9760977008922198e-05, "loss": 0.4678, "step": 527 }, { "epoch": 0.8157589803012746, "grad_norm": 0.559371349758974, "learning_rate": 1.9758625845175315e-05, "loss": 0.4577, "step": 528 }, { "epoch": 0.817303978370027, "grad_norm": 0.5225927138157542, "learning_rate": 1.975626331552507e-05, "loss": 0.463, "step": 529 }, { "epoch": 0.8188489764387794, "grad_norm": 0.5819899267909925, "learning_rate": 1.97538894227231e-05, "loss": 0.4736, "step": 530 }, { "epoch": 0.8203939745075318, "grad_norm": 0.5586374174962812, "learning_rate": 1.9751504169534285e-05, "loss": 0.4619, "step": 531 }, { "epoch": 0.8219389725762842, "grad_norm": 0.5047928750527703, "learning_rate": 1.9749107558736738e-05, "loss": 0.4694, "step": 532 }, { "epoch": 0.8234839706450366, "grad_norm": 0.48203382422173785, "learning_rate": 1.9746699593121807e-05, "loss": 0.4711, "step": 533 }, { "epoch": 0.8250289687137891, "grad_norm": 0.5411100458678653, "learning_rate": 1.974428027549405e-05, "loss": 0.44, "step": 534 }, { "epoch": 0.8265739667825415, "grad_norm": 0.5856872320334934, "learning_rate": 1.9741849608671254e-05, "loss": 0.4579, "step": 535 }, { "epoch": 0.8281189648512939, "grad_norm": 0.5214238748558009, "learning_rate": 1.9739407595484428e-05, "loss": 0.4587, "step": 536 }, { "epoch": 0.8296639629200464, "grad_norm": 0.5294386215618423, "learning_rate": 1.9736954238777793e-05, "loss": 0.4755, "step": 537 }, { "epoch": 0.8312089609887988, "grad_norm": 0.5116389255050486, "learning_rate": 1.9734489541408778e-05, "loss": 0.4714, "step": 538 }, { "epoch": 0.8327539590575512, "grad_norm": 0.5285352088299049, "learning_rate": 1.9732013506248024e-05, "loss": 0.4887, "step": 539 }, { "epoch": 0.8342989571263036, "grad_norm": 0.4742494651877579, "learning_rate": 1.9729526136179383e-05, "loss": 0.4765, "step": 540 }, { "epoch": 0.835843955195056, "grad_norm": 0.5709179492275687, "learning_rate": 1.9727027434099896e-05, "loss": 0.4565, "step": 541 }, { "epoch": 0.8373889532638085, "grad_norm": 0.43801779641616123, "learning_rate": 1.972451740291981e-05, "loss": 0.4552, "step": 542 }, { "epoch": 0.8389339513325609, "grad_norm": 0.4694962544663004, "learning_rate": 1.9721996045562574e-05, "loss": 0.4531, "step": 543 }, { "epoch": 0.8404789494013133, "grad_norm": 0.5273924264239146, "learning_rate": 1.9719463364964815e-05, "loss": 0.4391, "step": 544 }, { "epoch": 0.8420239474700657, "grad_norm": 0.5070158927946227, "learning_rate": 1.9716919364076356e-05, "loss": 0.4698, "step": 545 }, { "epoch": 0.8435689455388181, "grad_norm": 0.5113370169214586, "learning_rate": 1.97143640458602e-05, "loss": 0.4511, "step": 546 }, { "epoch": 0.8451139436075705, "grad_norm": 0.5466141496586362, "learning_rate": 1.971179741329254e-05, "loss": 0.4448, "step": 547 }, { "epoch": 0.8466589416763229, "grad_norm": 0.5978693975102003, "learning_rate": 1.9709219469362738e-05, "loss": 0.4632, "step": 548 }, { "epoch": 0.8482039397450754, "grad_norm": 0.6031815135517822, "learning_rate": 1.9706630217073332e-05, "loss": 0.4641, "step": 549 }, { "epoch": 0.8497489378138278, "grad_norm": 0.5299613352275072, "learning_rate": 1.9704029659440034e-05, "loss": 0.4499, "step": 550 }, { "epoch": 0.8512939358825802, "grad_norm": 0.616012663057653, "learning_rate": 1.970141779949173e-05, "loss": 0.482, "step": 551 }, { "epoch": 0.8528389339513326, "grad_norm": 0.5807161773091191, "learning_rate": 1.9698794640270445e-05, "loss": 0.4593, "step": 552 }, { "epoch": 0.854383932020085, "grad_norm": 0.5077450623135112, "learning_rate": 1.96961601848314e-05, "loss": 0.4559, "step": 553 }, { "epoch": 0.8559289300888374, "grad_norm": 0.5955785751148015, "learning_rate": 1.969351443624294e-05, "loss": 0.4765, "step": 554 }, { "epoch": 0.8574739281575898, "grad_norm": 0.5471914657772435, "learning_rate": 1.9690857397586576e-05, "loss": 0.47, "step": 555 }, { "epoch": 0.8590189262263422, "grad_norm": 0.5477763873151372, "learning_rate": 1.9688189071956986e-05, "loss": 0.4914, "step": 556 }, { "epoch": 0.8605639242950947, "grad_norm": 0.5064162410586999, "learning_rate": 1.968550946246196e-05, "loss": 0.4466, "step": 557 }, { "epoch": 0.8621089223638471, "grad_norm": 0.5612997169646585, "learning_rate": 1.9682818572222455e-05, "loss": 0.4688, "step": 558 }, { "epoch": 0.8636539204325995, "grad_norm": 0.5922371199978138, "learning_rate": 1.968011640437256e-05, "loss": 0.4884, "step": 559 }, { "epoch": 0.8651989185013519, "grad_norm": 0.5733100931552012, "learning_rate": 1.9677402962059498e-05, "loss": 0.4691, "step": 560 }, { "epoch": 0.8667439165701043, "grad_norm": 0.4788326899620197, "learning_rate": 1.9674678248443623e-05, "loss": 0.469, "step": 561 }, { "epoch": 0.8682889146388567, "grad_norm": 0.5108286913024527, "learning_rate": 1.9671942266698422e-05, "loss": 0.4605, "step": 562 }, { "epoch": 0.8698339127076091, "grad_norm": 0.41026938235452415, "learning_rate": 1.9669195020010497e-05, "loss": 0.4675, "step": 563 }, { "epoch": 0.8713789107763615, "grad_norm": 0.5633424290252143, "learning_rate": 1.966643651157958e-05, "loss": 0.458, "step": 564 }, { "epoch": 0.872923908845114, "grad_norm": 0.4811957159097772, "learning_rate": 1.9663666744618505e-05, "loss": 0.4753, "step": 565 }, { "epoch": 0.8744689069138664, "grad_norm": 0.5145018367353924, "learning_rate": 1.966088572235324e-05, "loss": 0.4751, "step": 566 }, { "epoch": 0.8760139049826188, "grad_norm": 0.6608912963282882, "learning_rate": 1.9658093448022843e-05, "loss": 0.4951, "step": 567 }, { "epoch": 0.8775589030513712, "grad_norm": 0.4873176004907011, "learning_rate": 1.965528992487949e-05, "loss": 0.4407, "step": 568 }, { "epoch": 0.8791039011201236, "grad_norm": 0.52924829289414, "learning_rate": 1.9652475156188454e-05, "loss": 0.4457, "step": 569 }, { "epoch": 0.880648899188876, "grad_norm": 0.5224915021592323, "learning_rate": 1.96496491452281e-05, "loss": 0.4784, "step": 570 }, { "epoch": 0.8821938972576284, "grad_norm": 0.4328176416081436, "learning_rate": 1.9646811895289898e-05, "loss": 0.4638, "step": 571 }, { "epoch": 0.8837388953263808, "grad_norm": 0.6267819564657475, "learning_rate": 1.9643963409678397e-05, "loss": 0.4697, "step": 572 }, { "epoch": 0.8852838933951332, "grad_norm": 0.5726349193744767, "learning_rate": 1.9641103691711237e-05, "loss": 0.4504, "step": 573 }, { "epoch": 0.8868288914638857, "grad_norm": 0.6172099383632251, "learning_rate": 1.963823274471915e-05, "loss": 0.4772, "step": 574 }, { "epoch": 0.8883738895326381, "grad_norm": 0.5712016426831111, "learning_rate": 1.963535057204593e-05, "loss": 0.4654, "step": 575 }, { "epoch": 0.8899188876013905, "grad_norm": 0.6525543206349429, "learning_rate": 1.963245717704845e-05, "loss": 0.4645, "step": 576 }, { "epoch": 0.8914638856701429, "grad_norm": 0.5468160048979546, "learning_rate": 1.9629552563096663e-05, "loss": 0.4596, "step": 577 }, { "epoch": 0.8930088837388953, "grad_norm": 0.5960945179184008, "learning_rate": 1.962663673357358e-05, "loss": 0.4501, "step": 578 }, { "epoch": 0.8945538818076477, "grad_norm": 0.48223789706471104, "learning_rate": 1.9623709691875276e-05, "loss": 0.4612, "step": 579 }, { "epoch": 0.8960988798764001, "grad_norm": 0.6933949975822168, "learning_rate": 1.962077144141089e-05, "loss": 0.4717, "step": 580 }, { "epoch": 0.8976438779451525, "grad_norm": 0.6216858041696675, "learning_rate": 1.9617821985602602e-05, "loss": 0.4406, "step": 581 }, { "epoch": 0.899188876013905, "grad_norm": 0.5247659248935891, "learning_rate": 1.961486132788567e-05, "loss": 0.4351, "step": 582 }, { "epoch": 0.9007338740826574, "grad_norm": 0.5673144096102493, "learning_rate": 1.9611889471708363e-05, "loss": 0.4761, "step": 583 }, { "epoch": 0.9022788721514098, "grad_norm": 0.5852411323255016, "learning_rate": 1.9608906420532022e-05, "loss": 0.4785, "step": 584 }, { "epoch": 0.9038238702201622, "grad_norm": 0.5156951100998572, "learning_rate": 1.9605912177831017e-05, "loss": 0.4377, "step": 585 }, { "epoch": 0.9053688682889146, "grad_norm": 0.652584440811431, "learning_rate": 1.9602906747092748e-05, "loss": 0.4427, "step": 586 }, { "epoch": 0.906913866357667, "grad_norm": 0.572741140001938, "learning_rate": 1.9599890131817654e-05, "loss": 0.4735, "step": 587 }, { "epoch": 0.9084588644264194, "grad_norm": 0.5389575841277862, "learning_rate": 1.959686233551919e-05, "loss": 0.4659, "step": 588 }, { "epoch": 0.9100038624951718, "grad_norm": 0.5544813005610821, "learning_rate": 1.959382336172385e-05, "loss": 0.4711, "step": 589 }, { "epoch": 0.9115488605639243, "grad_norm": 0.4629794079326743, "learning_rate": 1.959077321397113e-05, "loss": 0.4485, "step": 590 }, { "epoch": 0.9130938586326767, "grad_norm": 0.5543012926620978, "learning_rate": 1.9587711895813543e-05, "loss": 0.4772, "step": 591 }, { "epoch": 0.9146388567014291, "grad_norm": 0.5097890428803978, "learning_rate": 1.9584639410816626e-05, "loss": 0.4372, "step": 592 }, { "epoch": 0.9161838547701815, "grad_norm": 0.5003904469671335, "learning_rate": 1.958155576255891e-05, "loss": 0.4553, "step": 593 }, { "epoch": 0.9177288528389339, "grad_norm": 0.6447345651505191, "learning_rate": 1.9578460954631925e-05, "loss": 0.4599, "step": 594 }, { "epoch": 0.9192738509076863, "grad_norm": 0.45998220830039327, "learning_rate": 1.9575354990640207e-05, "loss": 0.4748, "step": 595 }, { "epoch": 0.9208188489764387, "grad_norm": 0.576612511028923, "learning_rate": 1.957223787420128e-05, "loss": 0.4624, "step": 596 }, { "epoch": 0.9223638470451911, "grad_norm": 0.5009049994961855, "learning_rate": 1.956910960894567e-05, "loss": 0.4735, "step": 597 }, { "epoch": 0.9239088451139436, "grad_norm": 0.5460626834367621, "learning_rate": 1.9565970198516865e-05, "loss": 0.4846, "step": 598 }, { "epoch": 0.925453843182696, "grad_norm": 0.5015386578208575, "learning_rate": 1.9562819646571352e-05, "loss": 0.4609, "step": 599 }, { "epoch": 0.9269988412514485, "grad_norm": 0.5024530445286935, "learning_rate": 1.955965795677859e-05, "loss": 0.467, "step": 600 }, { "epoch": 0.9285438393202009, "grad_norm": 0.5214495803470149, "learning_rate": 1.9556485132821008e-05, "loss": 0.468, "step": 601 }, { "epoch": 0.9300888373889533, "grad_norm": 0.5469904473746168, "learning_rate": 1.9553301178394e-05, "loss": 0.4675, "step": 602 }, { "epoch": 0.9316338354577057, "grad_norm": 0.42801213668000626, "learning_rate": 1.9550106097205936e-05, "loss": 0.4385, "step": 603 }, { "epoch": 0.9331788335264581, "grad_norm": 0.6286126945693475, "learning_rate": 1.9546899892978133e-05, "loss": 0.4671, "step": 604 }, { "epoch": 0.9347238315952106, "grad_norm": 0.5298545848972486, "learning_rate": 1.954368256944487e-05, "loss": 0.4528, "step": 605 }, { "epoch": 0.936268829663963, "grad_norm": 0.6284357598017777, "learning_rate": 1.954045413035337e-05, "loss": 0.4618, "step": 606 }, { "epoch": 0.9378138277327154, "grad_norm": 0.5494468363897199, "learning_rate": 1.9537214579463813e-05, "loss": 0.4401, "step": 607 }, { "epoch": 0.9393588258014678, "grad_norm": 0.5803567598323889, "learning_rate": 1.9533963920549307e-05, "loss": 0.4482, "step": 608 }, { "epoch": 0.9409038238702202, "grad_norm": 0.4848098858889297, "learning_rate": 1.953070215739591e-05, "loss": 0.4584, "step": 609 }, { "epoch": 0.9424488219389726, "grad_norm": 0.5287737798300541, "learning_rate": 1.952742929380261e-05, "loss": 0.4449, "step": 610 }, { "epoch": 0.943993820007725, "grad_norm": 0.44746267147798136, "learning_rate": 1.9524145333581315e-05, "loss": 0.4645, "step": 611 }, { "epoch": 0.9455388180764774, "grad_norm": 0.47006753115030436, "learning_rate": 1.952085028055687e-05, "loss": 0.4585, "step": 612 }, { "epoch": 0.9470838161452299, "grad_norm": 0.477649050783476, "learning_rate": 1.951754413856703e-05, "loss": 0.4803, "step": 613 }, { "epoch": 0.9486288142139823, "grad_norm": 0.5861456310642301, "learning_rate": 1.9514226911462476e-05, "loss": 0.4715, "step": 614 }, { "epoch": 0.9501738122827347, "grad_norm": 0.586466955629601, "learning_rate": 1.9510898603106785e-05, "loss": 0.4661, "step": 615 }, { "epoch": 0.9517188103514871, "grad_norm": 0.5250264066549754, "learning_rate": 1.9507559217376454e-05, "loss": 0.4774, "step": 616 }, { "epoch": 0.9532638084202395, "grad_norm": 0.6259679352919468, "learning_rate": 1.9504208758160875e-05, "loss": 0.4734, "step": 617 }, { "epoch": 0.9548088064889919, "grad_norm": 0.6274651281533699, "learning_rate": 1.9500847229362336e-05, "loss": 0.4697, "step": 618 }, { "epoch": 0.9563538045577443, "grad_norm": 0.56420356915564, "learning_rate": 1.949747463489603e-05, "loss": 0.4612, "step": 619 }, { "epoch": 0.9578988026264967, "grad_norm": 0.6539424139332953, "learning_rate": 1.9494090978690012e-05, "loss": 0.4277, "step": 620 }, { "epoch": 0.9594438006952491, "grad_norm": 0.5397383366998847, "learning_rate": 1.9490696264685255e-05, "loss": 0.4914, "step": 621 }, { "epoch": 0.9609887987640016, "grad_norm": 0.7074533966019366, "learning_rate": 1.948729049683558e-05, "loss": 0.4706, "step": 622 }, { "epoch": 0.962533796832754, "grad_norm": 0.638739683017697, "learning_rate": 1.9483873679107708e-05, "loss": 0.4706, "step": 623 }, { "epoch": 0.9640787949015064, "grad_norm": 0.5642274047110885, "learning_rate": 1.9480445815481208e-05, "loss": 0.4603, "step": 624 }, { "epoch": 0.9656237929702588, "grad_norm": 0.7915082915304136, "learning_rate": 1.9477006909948528e-05, "loss": 0.4736, "step": 625 }, { "epoch": 0.9671687910390112, "grad_norm": 0.7193895705299945, "learning_rate": 1.947355696651497e-05, "loss": 0.4536, "step": 626 }, { "epoch": 0.9687137891077636, "grad_norm": 0.6874467686400824, "learning_rate": 1.9470095989198698e-05, "loss": 0.4609, "step": 627 }, { "epoch": 0.970258787176516, "grad_norm": 0.742725656530099, "learning_rate": 1.9466623982030724e-05, "loss": 0.4667, "step": 628 }, { "epoch": 0.9718037852452684, "grad_norm": 0.6351055914182705, "learning_rate": 1.94631409490549e-05, "loss": 0.4622, "step": 629 }, { "epoch": 0.9733487833140209, "grad_norm": 0.620862753529517, "learning_rate": 1.9459646894327935e-05, "loss": 0.4662, "step": 630 }, { "epoch": 0.9748937813827733, "grad_norm": 0.8008314887333335, "learning_rate": 1.945614182191936e-05, "loss": 0.4573, "step": 631 }, { "epoch": 0.9764387794515257, "grad_norm": 0.5871249570400383, "learning_rate": 1.9452625735911538e-05, "loss": 0.4658, "step": 632 }, { "epoch": 0.9779837775202781, "grad_norm": 0.6981720820056847, "learning_rate": 1.944909864039968e-05, "loss": 0.4563, "step": 633 }, { "epoch": 0.9795287755890305, "grad_norm": 0.6308706397897398, "learning_rate": 1.9445560539491797e-05, "loss": 0.4692, "step": 634 }, { "epoch": 0.9810737736577829, "grad_norm": 0.6738567666349204, "learning_rate": 1.9442011437308726e-05, "loss": 0.4609, "step": 635 }, { "epoch": 0.9826187717265353, "grad_norm": 0.7992587303022194, "learning_rate": 1.9438451337984123e-05, "loss": 0.4777, "step": 636 }, { "epoch": 0.9841637697952877, "grad_norm": 0.43096464793857103, "learning_rate": 1.9434880245664445e-05, "loss": 0.4904, "step": 637 }, { "epoch": 0.9857087678640402, "grad_norm": 0.71312683936272, "learning_rate": 1.9431298164508953e-05, "loss": 0.4776, "step": 638 }, { "epoch": 0.9872537659327926, "grad_norm": 0.5245795847338929, "learning_rate": 1.942770509868971e-05, "loss": 0.4434, "step": 639 }, { "epoch": 0.988798764001545, "grad_norm": 0.5112752811456024, "learning_rate": 1.9424101052391568e-05, "loss": 0.4426, "step": 640 }, { "epoch": 0.9903437620702974, "grad_norm": 0.5640047688975489, "learning_rate": 1.942048602981218e-05, "loss": 0.4721, "step": 641 }, { "epoch": 0.9918887601390498, "grad_norm": 0.46816840841005464, "learning_rate": 1.9416860035161965e-05, "loss": 0.4437, "step": 642 }, { "epoch": 0.9934337582078022, "grad_norm": 0.5821896523793214, "learning_rate": 1.9413223072664143e-05, "loss": 0.4498, "step": 643 }, { "epoch": 0.9949787562765546, "grad_norm": 0.47576663271203234, "learning_rate": 1.9409575146554684e-05, "loss": 0.4545, "step": 644 }, { "epoch": 0.996523754345307, "grad_norm": 0.5772400583391829, "learning_rate": 1.940591626108234e-05, "loss": 0.4411, "step": 645 }, { "epoch": 0.9980687524140595, "grad_norm": 0.6078500383698241, "learning_rate": 1.9402246420508634e-05, "loss": 0.4609, "step": 646 }, { "epoch": 0.9996137504828119, "grad_norm": 0.4634291476559771, "learning_rate": 1.9398565629107838e-05, "loss": 0.4437, "step": 647 }, { "epoch": 1.0011587485515643, "grad_norm": 1.3364766284298728, "learning_rate": 1.9394873891166985e-05, "loss": 0.83, "step": 648 }, { "epoch": 1.0027037466203168, "grad_norm": 0.5565785975610199, "learning_rate": 1.9391171210985847e-05, "loss": 0.4026, "step": 649 }, { "epoch": 1.004248744689069, "grad_norm": 0.5509198518930501, "learning_rate": 1.938745759287695e-05, "loss": 0.438, "step": 650 }, { "epoch": 1.0057937427578216, "grad_norm": 0.5951778555061867, "learning_rate": 1.9383733041165563e-05, "loss": 0.4149, "step": 651 }, { "epoch": 1.007338740826574, "grad_norm": 0.5284799565685759, "learning_rate": 1.9379997560189677e-05, "loss": 0.4682, "step": 652 }, { "epoch": 1.0088837388953265, "grad_norm": 0.5210194784476061, "learning_rate": 1.937625115430002e-05, "loss": 0.4404, "step": 653 }, { "epoch": 1.0104287369640788, "grad_norm": 0.4892814831648519, "learning_rate": 1.9372493827860047e-05, "loss": 0.4135, "step": 654 }, { "epoch": 1.0119737350328313, "grad_norm": 0.48258859054387077, "learning_rate": 1.9368725585245927e-05, "loss": 0.4395, "step": 655 }, { "epoch": 1.0135187331015836, "grad_norm": 0.46970426209679145, "learning_rate": 1.9364946430846538e-05, "loss": 0.4063, "step": 656 }, { "epoch": 1.015063731170336, "grad_norm": 0.5328869222669156, "learning_rate": 1.9361156369063483e-05, "loss": 0.416, "step": 657 }, { "epoch": 1.0166087292390884, "grad_norm": 0.47828689390546153, "learning_rate": 1.9357355404311054e-05, "loss": 0.4321, "step": 658 }, { "epoch": 1.018153727307841, "grad_norm": 0.4773884667010597, "learning_rate": 1.935354354101625e-05, "loss": 0.4151, "step": 659 }, { "epoch": 1.0196987253765932, "grad_norm": 0.45625683075286116, "learning_rate": 1.9349720783618755e-05, "loss": 0.4358, "step": 660 }, { "epoch": 1.0212437234453458, "grad_norm": 0.5932557093736336, "learning_rate": 1.9345887136570952e-05, "loss": 0.44, "step": 661 }, { "epoch": 1.022788721514098, "grad_norm": 0.5758348597743984, "learning_rate": 1.93420426043379e-05, "loss": 0.4147, "step": 662 }, { "epoch": 1.0243337195828506, "grad_norm": 0.624340611421523, "learning_rate": 1.9338187191397342e-05, "loss": 0.4332, "step": 663 }, { "epoch": 1.0258787176516029, "grad_norm": 0.6276127466161742, "learning_rate": 1.933432090223969e-05, "loss": 0.4481, "step": 664 }, { "epoch": 1.0274237157203554, "grad_norm": 0.5168811520056401, "learning_rate": 1.933044374136801e-05, "loss": 0.4051, "step": 665 }, { "epoch": 1.0289687137891077, "grad_norm": 0.5754915934973664, "learning_rate": 1.9326555713298062e-05, "loss": 0.4062, "step": 666 }, { "epoch": 1.0305137118578602, "grad_norm": 0.5744758037778054, "learning_rate": 1.9322656822558235e-05, "loss": 0.43, "step": 667 }, { "epoch": 1.0320587099266125, "grad_norm": 0.6450058946656205, "learning_rate": 1.9318747073689587e-05, "loss": 0.4102, "step": 668 }, { "epoch": 1.033603707995365, "grad_norm": 0.5735956489042, "learning_rate": 1.931482647124581e-05, "loss": 0.4389, "step": 669 }, { "epoch": 1.0351487060641174, "grad_norm": 0.4862255453240811, "learning_rate": 1.931089501979324e-05, "loss": 0.3945, "step": 670 }, { "epoch": 1.0366937041328699, "grad_norm": 0.5473489958820873, "learning_rate": 1.9306952723910856e-05, "loss": 0.415, "step": 671 }, { "epoch": 1.0382387022016222, "grad_norm": 0.46699583337260786, "learning_rate": 1.930299958819026e-05, "loss": 0.3957, "step": 672 }, { "epoch": 1.0397837002703747, "grad_norm": 0.69399110704959, "learning_rate": 1.9299035617235685e-05, "loss": 0.4311, "step": 673 }, { "epoch": 1.041328698339127, "grad_norm": 0.5457221304889184, "learning_rate": 1.9295060815663978e-05, "loss": 0.4505, "step": 674 }, { "epoch": 1.0428736964078795, "grad_norm": 0.5111172140342872, "learning_rate": 1.9291075188104605e-05, "loss": 0.4207, "step": 675 }, { "epoch": 1.0444186944766318, "grad_norm": 0.45343473769361, "learning_rate": 1.9287078739199643e-05, "loss": 0.4357, "step": 676 }, { "epoch": 1.0459636925453843, "grad_norm": 0.49330134697784683, "learning_rate": 1.9283071473603764e-05, "loss": 0.4099, "step": 677 }, { "epoch": 1.0475086906141367, "grad_norm": 0.5790520408944024, "learning_rate": 1.927905339598424e-05, "loss": 0.4531, "step": 678 }, { "epoch": 1.0490536886828892, "grad_norm": 0.5376770919900155, "learning_rate": 1.927502451102095e-05, "loss": 0.4256, "step": 679 }, { "epoch": 1.0505986867516415, "grad_norm": 0.455085118024656, "learning_rate": 1.9270984823406337e-05, "loss": 0.4225, "step": 680 }, { "epoch": 1.052143684820394, "grad_norm": 0.6164677350751747, "learning_rate": 1.926693433784545e-05, "loss": 0.4223, "step": 681 }, { "epoch": 1.0536886828891463, "grad_norm": 0.4949093888853463, "learning_rate": 1.9262873059055894e-05, "loss": 0.4919, "step": 682 }, { "epoch": 1.0552336809578988, "grad_norm": 0.6082818519785694, "learning_rate": 1.9258800991767855e-05, "loss": 0.4155, "step": 683 }, { "epoch": 1.0567786790266511, "grad_norm": 0.3873388850145095, "learning_rate": 1.925471814072409e-05, "loss": 0.3878, "step": 684 }, { "epoch": 1.0583236770954036, "grad_norm": 0.6308796892091995, "learning_rate": 1.92506245106799e-05, "loss": 0.4038, "step": 685 }, { "epoch": 1.059868675164156, "grad_norm": 0.4892413771363583, "learning_rate": 1.9246520106403155e-05, "loss": 0.3919, "step": 686 }, { "epoch": 1.0614136732329085, "grad_norm": 0.6470204902374019, "learning_rate": 1.9242404932674268e-05, "loss": 0.4241, "step": 687 }, { "epoch": 1.0629586713016608, "grad_norm": 0.49844497940236554, "learning_rate": 1.923827899428619e-05, "loss": 0.4379, "step": 688 }, { "epoch": 1.0645036693704133, "grad_norm": 0.6095008329750325, "learning_rate": 1.923414229604442e-05, "loss": 0.4468, "step": 689 }, { "epoch": 1.0660486674391656, "grad_norm": 0.5631524885811506, "learning_rate": 1.9229994842766985e-05, "loss": 0.4235, "step": 690 }, { "epoch": 1.0675936655079181, "grad_norm": 0.6392391018971592, "learning_rate": 1.922583663928443e-05, "loss": 0.4363, "step": 691 }, { "epoch": 1.0691386635766706, "grad_norm": 0.397586848867128, "learning_rate": 1.9221667690439834e-05, "loss": 0.3874, "step": 692 }, { "epoch": 1.070683661645423, "grad_norm": 0.5529912437247614, "learning_rate": 1.9217488001088784e-05, "loss": 0.4453, "step": 693 }, { "epoch": 1.0722286597141752, "grad_norm": 0.424608449318674, "learning_rate": 1.9213297576099382e-05, "loss": 0.4523, "step": 694 }, { "epoch": 1.0737736577829278, "grad_norm": 0.5488953011486416, "learning_rate": 1.920909642035222e-05, "loss": 0.4071, "step": 695 }, { "epoch": 1.0753186558516803, "grad_norm": 0.6300890160377641, "learning_rate": 1.9204884538740408e-05, "loss": 0.4625, "step": 696 }, { "epoch": 1.0768636539204326, "grad_norm": 0.4794220407642638, "learning_rate": 1.9200661936169532e-05, "loss": 0.4092, "step": 697 }, { "epoch": 1.078408651989185, "grad_norm": 0.4667174163361139, "learning_rate": 1.9196428617557673e-05, "loss": 0.3862, "step": 698 }, { "epoch": 1.0799536500579374, "grad_norm": 0.5159425208042543, "learning_rate": 1.9192184587835392e-05, "loss": 0.4289, "step": 699 }, { "epoch": 1.08149864812669, "grad_norm": 0.5578635489921652, "learning_rate": 1.918792985194572e-05, "loss": 0.4225, "step": 700 }, { "epoch": 1.0830436461954422, "grad_norm": 0.5435318454089418, "learning_rate": 1.9183664414844166e-05, "loss": 0.4502, "step": 701 }, { "epoch": 1.0845886442641948, "grad_norm": 0.5149684183497798, "learning_rate": 1.91793882814987e-05, "loss": 0.4049, "step": 702 }, { "epoch": 1.086133642332947, "grad_norm": 0.5484973349411677, "learning_rate": 1.9175101456889743e-05, "loss": 0.4235, "step": 703 }, { "epoch": 1.0876786404016996, "grad_norm": 0.5506799173147281, "learning_rate": 1.9170803946010178e-05, "loss": 0.434, "step": 704 }, { "epoch": 1.089223638470452, "grad_norm": 0.527429906112722, "learning_rate": 1.916649575386533e-05, "loss": 0.4347, "step": 705 }, { "epoch": 1.0907686365392044, "grad_norm": 0.6166633691568066, "learning_rate": 1.9162176885472958e-05, "loss": 0.4619, "step": 706 }, { "epoch": 1.0923136346079567, "grad_norm": 0.4092027428749796, "learning_rate": 1.915784734586327e-05, "loss": 0.3812, "step": 707 }, { "epoch": 1.0938586326767092, "grad_norm": 0.5124640276445508, "learning_rate": 1.915350714007889e-05, "loss": 0.4392, "step": 708 }, { "epoch": 1.0954036307454615, "grad_norm": 0.42222674077533784, "learning_rate": 1.914915627317487e-05, "loss": 0.402, "step": 709 }, { "epoch": 1.096948628814214, "grad_norm": 0.45241911237444915, "learning_rate": 1.9144794750218688e-05, "loss": 0.4109, "step": 710 }, { "epoch": 1.0984936268829664, "grad_norm": 0.46151903022722857, "learning_rate": 1.9140422576290213e-05, "loss": 0.4514, "step": 711 }, { "epoch": 1.100038624951719, "grad_norm": 0.37578065915984293, "learning_rate": 1.913603975648173e-05, "loss": 0.4136, "step": 712 }, { "epoch": 1.1015836230204712, "grad_norm": 0.4608033270536375, "learning_rate": 1.913164629589793e-05, "loss": 0.4277, "step": 713 }, { "epoch": 1.1031286210892237, "grad_norm": 0.4688430509147066, "learning_rate": 1.9127242199655888e-05, "loss": 0.4082, "step": 714 }, { "epoch": 1.104673619157976, "grad_norm": 0.4799852558499316, "learning_rate": 1.912282747288507e-05, "loss": 0.4158, "step": 715 }, { "epoch": 1.1062186172267285, "grad_norm": 0.5424314288329353, "learning_rate": 1.9118402120727323e-05, "loss": 0.4461, "step": 716 }, { "epoch": 1.1077636152954808, "grad_norm": 0.48425404415644613, "learning_rate": 1.9113966148336866e-05, "loss": 0.3806, "step": 717 }, { "epoch": 1.1093086133642334, "grad_norm": 0.5081477212309261, "learning_rate": 1.9109519560880297e-05, "loss": 0.4504, "step": 718 }, { "epoch": 1.1108536114329857, "grad_norm": 0.5523788161986258, "learning_rate": 1.9105062363536563e-05, "loss": 0.3907, "step": 719 }, { "epoch": 1.1123986095017382, "grad_norm": 0.467569504967294, "learning_rate": 1.910059456149698e-05, "loss": 0.4478, "step": 720 }, { "epoch": 1.1139436075704905, "grad_norm": 0.6066983489373099, "learning_rate": 1.9096116159965213e-05, "loss": 0.4309, "step": 721 }, { "epoch": 1.115488605639243, "grad_norm": 0.5145244625714894, "learning_rate": 1.9091627164157268e-05, "loss": 0.4395, "step": 722 }, { "epoch": 1.1170336037079953, "grad_norm": 0.5650109940617468, "learning_rate": 1.9087127579301495e-05, "loss": 0.385, "step": 723 }, { "epoch": 1.1185786017767478, "grad_norm": 0.5962376530239254, "learning_rate": 1.9082617410638577e-05, "loss": 0.4433, "step": 724 }, { "epoch": 1.1201235998455001, "grad_norm": 0.6495897405376705, "learning_rate": 1.9078096663421516e-05, "loss": 0.4298, "step": 725 }, { "epoch": 1.1216685979142527, "grad_norm": 0.4829825060433689, "learning_rate": 1.9073565342915648e-05, "loss": 0.4159, "step": 726 }, { "epoch": 1.123213595983005, "grad_norm": 0.6072439941979277, "learning_rate": 1.9069023454398613e-05, "loss": 0.4137, "step": 727 }, { "epoch": 1.1247585940517575, "grad_norm": 0.6118954817979739, "learning_rate": 1.9064471003160365e-05, "loss": 0.435, "step": 728 }, { "epoch": 1.1263035921205098, "grad_norm": 0.4518956228314214, "learning_rate": 1.9059907994503158e-05, "loss": 0.4138, "step": 729 }, { "epoch": 1.1278485901892623, "grad_norm": 0.5152111124647966, "learning_rate": 1.905533443374154e-05, "loss": 0.4098, "step": 730 }, { "epoch": 1.1293935882580146, "grad_norm": 0.4613459372184929, "learning_rate": 1.905075032620236e-05, "loss": 0.4429, "step": 731 }, { "epoch": 1.1309385863267671, "grad_norm": 0.513810810573902, "learning_rate": 1.9046155677224733e-05, "loss": 0.4258, "step": 732 }, { "epoch": 1.1324835843955194, "grad_norm": 0.4689188495134162, "learning_rate": 1.904155049216007e-05, "loss": 0.4164, "step": 733 }, { "epoch": 1.134028582464272, "grad_norm": 0.43359128642830813, "learning_rate": 1.903693477637204e-05, "loss": 0.3971, "step": 734 }, { "epoch": 1.1355735805330243, "grad_norm": 0.47714176476568504, "learning_rate": 1.9032308535236585e-05, "loss": 0.4296, "step": 735 }, { "epoch": 1.1371185786017768, "grad_norm": 0.4661256615890847, "learning_rate": 1.90276717741419e-05, "loss": 0.4146, "step": 736 }, { "epoch": 1.138663576670529, "grad_norm": 0.5063503172163795, "learning_rate": 1.9023024498488444e-05, "loss": 0.4238, "step": 737 }, { "epoch": 1.1402085747392816, "grad_norm": 0.43116827542655406, "learning_rate": 1.9018366713688902e-05, "loss": 0.3854, "step": 738 }, { "epoch": 1.141753572808034, "grad_norm": 0.44437990493560264, "learning_rate": 1.901369842516822e-05, "loss": 0.4117, "step": 739 }, { "epoch": 1.1432985708767864, "grad_norm": 0.4468615786225864, "learning_rate": 1.9009019638363568e-05, "loss": 0.4247, "step": 740 }, { "epoch": 1.1448435689455387, "grad_norm": 0.4921327293478226, "learning_rate": 1.9004330358724337e-05, "loss": 0.4376, "step": 741 }, { "epoch": 1.1463885670142913, "grad_norm": 0.42386060416003174, "learning_rate": 1.8999630591712157e-05, "loss": 0.4086, "step": 742 }, { "epoch": 1.1479335650830436, "grad_norm": 0.4976894009465658, "learning_rate": 1.8994920342800856e-05, "loss": 0.4393, "step": 743 }, { "epoch": 1.149478563151796, "grad_norm": 0.43422083652728927, "learning_rate": 1.8990199617476475e-05, "loss": 0.4149, "step": 744 }, { "epoch": 1.1510235612205484, "grad_norm": 0.5026720268679562, "learning_rate": 1.898546842123726e-05, "loss": 0.4375, "step": 745 }, { "epoch": 1.152568559289301, "grad_norm": 0.3908282983950561, "learning_rate": 1.8980726759593655e-05, "loss": 0.4038, "step": 746 }, { "epoch": 1.1541135573580532, "grad_norm": 0.5080414429365883, "learning_rate": 1.897597463806828e-05, "loss": 0.4286, "step": 747 }, { "epoch": 1.1556585554268057, "grad_norm": 0.4418511939787641, "learning_rate": 1.897121206219595e-05, "loss": 0.4432, "step": 748 }, { "epoch": 1.157203553495558, "grad_norm": 0.5834575973269833, "learning_rate": 1.896643903752366e-05, "loss": 0.4597, "step": 749 }, { "epoch": 1.1587485515643106, "grad_norm": 0.4577077781338633, "learning_rate": 1.8961655569610557e-05, "loss": 0.3892, "step": 750 }, { "epoch": 1.1602935496330629, "grad_norm": 0.45873577511676983, "learning_rate": 1.8956861664027967e-05, "loss": 0.4222, "step": 751 }, { "epoch": 1.1618385477018154, "grad_norm": 0.53409290763028, "learning_rate": 1.895205732635937e-05, "loss": 0.4358, "step": 752 }, { "epoch": 1.163383545770568, "grad_norm": 0.4475145444354131, "learning_rate": 1.8947242562200384e-05, "loss": 0.4065, "step": 753 }, { "epoch": 1.1649285438393202, "grad_norm": 0.4528239025194661, "learning_rate": 1.8942417377158787e-05, "loss": 0.4256, "step": 754 }, { "epoch": 1.1664735419080725, "grad_norm": 0.45535492007488315, "learning_rate": 1.893758177685449e-05, "loss": 0.4286, "step": 755 }, { "epoch": 1.168018539976825, "grad_norm": 0.5179531521675605, "learning_rate": 1.893273576691953e-05, "loss": 0.4618, "step": 756 }, { "epoch": 1.1695635380455776, "grad_norm": 0.4364168741696879, "learning_rate": 1.892787935299807e-05, "loss": 0.3965, "step": 757 }, { "epoch": 1.1711085361143299, "grad_norm": 0.5164985137290942, "learning_rate": 1.892301254074639e-05, "loss": 0.3822, "step": 758 }, { "epoch": 1.1726535341830822, "grad_norm": 0.5062525439419493, "learning_rate": 1.8918135335832887e-05, "loss": 0.4248, "step": 759 }, { "epoch": 1.1741985322518347, "grad_norm": 0.4565975072847523, "learning_rate": 1.8913247743938052e-05, "loss": 0.4363, "step": 760 }, { "epoch": 1.1757435303205872, "grad_norm": 0.4386411685626734, "learning_rate": 1.890834977075448e-05, "loss": 0.3843, "step": 761 }, { "epoch": 1.1772885283893395, "grad_norm": 0.6743202722336329, "learning_rate": 1.8903441421986862e-05, "loss": 0.4616, "step": 762 }, { "epoch": 1.1788335264580918, "grad_norm": 0.4380278600162091, "learning_rate": 1.889852270335196e-05, "loss": 0.3985, "step": 763 }, { "epoch": 1.1803785245268443, "grad_norm": 0.522491512128541, "learning_rate": 1.8893593620578627e-05, "loss": 0.409, "step": 764 }, { "epoch": 1.1819235225955969, "grad_norm": 0.4399324016428471, "learning_rate": 1.888865417940778e-05, "loss": 0.4243, "step": 765 }, { "epoch": 1.1834685206643492, "grad_norm": 0.48665549134471187, "learning_rate": 1.88837043855924e-05, "loss": 0.4586, "step": 766 }, { "epoch": 1.1850135187331017, "grad_norm": 0.4174352741946846, "learning_rate": 1.8878744244897533e-05, "loss": 0.4103, "step": 767 }, { "epoch": 1.186558516801854, "grad_norm": 0.42271424515322037, "learning_rate": 1.8873773763100268e-05, "loss": 0.417, "step": 768 }, { "epoch": 1.1881035148706065, "grad_norm": 0.43245653601677747, "learning_rate": 1.886879294598974e-05, "loss": 0.438, "step": 769 }, { "epoch": 1.1896485129393588, "grad_norm": 0.43184361291590384, "learning_rate": 1.8863801799367127e-05, "loss": 0.416, "step": 770 }, { "epoch": 1.1911935110081113, "grad_norm": 0.39952138502007134, "learning_rate": 1.885880032904563e-05, "loss": 0.4242, "step": 771 }, { "epoch": 1.1927385090768636, "grad_norm": 0.503972164197035, "learning_rate": 1.885378854085048e-05, "loss": 0.4095, "step": 772 }, { "epoch": 1.1942835071456162, "grad_norm": 0.5037254937862574, "learning_rate": 1.884876644061892e-05, "loss": 0.4163, "step": 773 }, { "epoch": 1.1958285052143685, "grad_norm": 0.5174438485814317, "learning_rate": 1.8843734034200212e-05, "loss": 0.4545, "step": 774 }, { "epoch": 1.197373503283121, "grad_norm": 0.4583425023056468, "learning_rate": 1.883869132745561e-05, "loss": 0.4004, "step": 775 }, { "epoch": 1.1989185013518733, "grad_norm": 0.5960085260084351, "learning_rate": 1.8833638326258378e-05, "loss": 0.4679, "step": 776 }, { "epoch": 1.2004634994206258, "grad_norm": 0.4493274872898307, "learning_rate": 1.8828575036493757e-05, "loss": 0.384, "step": 777 }, { "epoch": 1.202008497489378, "grad_norm": 0.5274529225696944, "learning_rate": 1.882350146405898e-05, "loss": 0.44, "step": 778 }, { "epoch": 1.2035534955581306, "grad_norm": 0.4239573155807951, "learning_rate": 1.8818417614863252e-05, "loss": 0.4329, "step": 779 }, { "epoch": 1.205098493626883, "grad_norm": 0.6769455830956056, "learning_rate": 1.881332349482775e-05, "loss": 0.4167, "step": 780 }, { "epoch": 1.2066434916956355, "grad_norm": 0.4269337998332165, "learning_rate": 1.8808219109885607e-05, "loss": 0.4242, "step": 781 }, { "epoch": 1.2081884897643878, "grad_norm": 0.563957198250286, "learning_rate": 1.8803104465981925e-05, "loss": 0.408, "step": 782 }, { "epoch": 1.2097334878331403, "grad_norm": 0.4055951866200763, "learning_rate": 1.8797979569073744e-05, "loss": 0.4143, "step": 783 }, { "epoch": 1.2112784859018926, "grad_norm": 0.5109473437123518, "learning_rate": 1.8792844425130042e-05, "loss": 0.4069, "step": 784 }, { "epoch": 1.212823483970645, "grad_norm": 0.4445808642578272, "learning_rate": 1.8787699040131747e-05, "loss": 0.4141, "step": 785 }, { "epoch": 1.2143684820393974, "grad_norm": 0.5420631544278582, "learning_rate": 1.8782543420071702e-05, "loss": 0.4385, "step": 786 }, { "epoch": 1.21591348010815, "grad_norm": 0.40616611731350544, "learning_rate": 1.8777377570954672e-05, "loss": 0.3989, "step": 787 }, { "epoch": 1.2174584781769022, "grad_norm": 0.4707714435148161, "learning_rate": 1.877220149879734e-05, "loss": 0.4284, "step": 788 }, { "epoch": 1.2190034762456547, "grad_norm": 0.47268361998379266, "learning_rate": 1.8767015209628297e-05, "loss": 0.4042, "step": 789 }, { "epoch": 1.220548474314407, "grad_norm": 0.4158289555397981, "learning_rate": 1.8761818709488027e-05, "loss": 0.3734, "step": 790 }, { "epoch": 1.2220934723831596, "grad_norm": 0.5918300916911394, "learning_rate": 1.875661200442891e-05, "loss": 0.4588, "step": 791 }, { "epoch": 1.2236384704519119, "grad_norm": 0.4000714748382632, "learning_rate": 1.8751395100515216e-05, "loss": 0.4043, "step": 792 }, { "epoch": 1.2251834685206644, "grad_norm": 0.47074021394727583, "learning_rate": 1.874616800382309e-05, "loss": 0.4291, "step": 793 }, { "epoch": 1.2267284665894167, "grad_norm": 0.4074299919124258, "learning_rate": 1.8740930720440547e-05, "loss": 0.4495, "step": 794 }, { "epoch": 1.2282734646581692, "grad_norm": 0.44463894701454254, "learning_rate": 1.8735683256467466e-05, "loss": 0.4426, "step": 795 }, { "epoch": 1.2298184627269215, "grad_norm": 0.456438272452797, "learning_rate": 1.8730425618015585e-05, "loss": 0.4226, "step": 796 }, { "epoch": 1.231363460795674, "grad_norm": 0.41437369179734895, "learning_rate": 1.8725157811208503e-05, "loss": 0.3996, "step": 797 }, { "epoch": 1.2329084588644263, "grad_norm": 0.4381008303106899, "learning_rate": 1.8719879842181637e-05, "loss": 0.4222, "step": 798 }, { "epoch": 1.2344534569331789, "grad_norm": 0.45219154433272574, "learning_rate": 1.871459171708226e-05, "loss": 0.4355, "step": 799 }, { "epoch": 1.2359984550019312, "grad_norm": 0.43884072093074766, "learning_rate": 1.8709293442069478e-05, "loss": 0.4149, "step": 800 }, { "epoch": 1.2375434530706837, "grad_norm": 0.4153099719657308, "learning_rate": 1.8703985023314197e-05, "loss": 0.4353, "step": 801 }, { "epoch": 1.239088451139436, "grad_norm": 0.48527223352148174, "learning_rate": 1.8698666466999155e-05, "loss": 0.4457, "step": 802 }, { "epoch": 1.2406334492081885, "grad_norm": 0.49426665700033673, "learning_rate": 1.869333777931889e-05, "loss": 0.3766, "step": 803 }, { "epoch": 1.2421784472769408, "grad_norm": 0.47562382375407, "learning_rate": 1.8687998966479744e-05, "loss": 0.4132, "step": 804 }, { "epoch": 1.2437234453456933, "grad_norm": 0.5668875037105555, "learning_rate": 1.8682650034699846e-05, "loss": 0.411, "step": 805 }, { "epoch": 1.2452684434144456, "grad_norm": 0.5402337545034578, "learning_rate": 1.8677290990209123e-05, "loss": 0.4424, "step": 806 }, { "epoch": 1.2468134414831982, "grad_norm": 0.5239988001262941, "learning_rate": 1.8671921839249257e-05, "loss": 0.4384, "step": 807 }, { "epoch": 1.2483584395519505, "grad_norm": 0.5288860580098513, "learning_rate": 1.866654258807373e-05, "loss": 0.4041, "step": 808 }, { "epoch": 1.249903437620703, "grad_norm": 0.5432074604687258, "learning_rate": 1.8661153242947767e-05, "loss": 0.4372, "step": 809 }, { "epoch": 1.2514484356894555, "grad_norm": 0.5835677291859726, "learning_rate": 1.8655753810148355e-05, "loss": 0.44, "step": 810 }, { "epoch": 1.2529934337582078, "grad_norm": 0.47512737536131294, "learning_rate": 1.8650344295964235e-05, "loss": 0.4154, "step": 811 }, { "epoch": 1.2545384318269601, "grad_norm": 0.6938256343264145, "learning_rate": 1.8644924706695882e-05, "loss": 0.4702, "step": 812 }, { "epoch": 1.2560834298957126, "grad_norm": 0.5336003948148897, "learning_rate": 1.863949504865551e-05, "loss": 0.3836, "step": 813 }, { "epoch": 1.2576284279644652, "grad_norm": 0.7205769857033815, "learning_rate": 1.863405532816706e-05, "loss": 0.4484, "step": 814 }, { "epoch": 1.2591734260332175, "grad_norm": 0.7111209456301347, "learning_rate": 1.8628605551566192e-05, "loss": 0.388, "step": 815 }, { "epoch": 1.2607184241019698, "grad_norm": 0.48172365712809784, "learning_rate": 1.862314572520028e-05, "loss": 0.4708, "step": 816 }, { "epoch": 1.2622634221707223, "grad_norm": 0.6679843879681505, "learning_rate": 1.86176758554284e-05, "loss": 0.413, "step": 817 }, { "epoch": 1.2638084202394748, "grad_norm": 0.4288760740803521, "learning_rate": 1.8612195948621326e-05, "loss": 0.4037, "step": 818 }, { "epoch": 1.2653534183082271, "grad_norm": 0.693474410094532, "learning_rate": 1.8606706011161525e-05, "loss": 0.4332, "step": 819 }, { "epoch": 1.2668984163769794, "grad_norm": 0.5139787163371892, "learning_rate": 1.8601206049443147e-05, "loss": 0.4026, "step": 820 }, { "epoch": 1.268443414445732, "grad_norm": 0.6958188940573934, "learning_rate": 1.8595696069872013e-05, "loss": 0.4756, "step": 821 }, { "epoch": 1.2699884125144845, "grad_norm": 0.5734882542897282, "learning_rate": 1.8590176078865613e-05, "loss": 0.4288, "step": 822 }, { "epoch": 1.2715334105832368, "grad_norm": 0.5595603734149939, "learning_rate": 1.8584646082853104e-05, "loss": 0.4118, "step": 823 }, { "epoch": 1.273078408651989, "grad_norm": 0.4831649906250932, "learning_rate": 1.8579106088275283e-05, "loss": 0.4035, "step": 824 }, { "epoch": 1.2746234067207416, "grad_norm": 0.662868451682626, "learning_rate": 1.8573556101584605e-05, "loss": 0.4117, "step": 825 }, { "epoch": 1.2761684047894941, "grad_norm": 0.5233364070938898, "learning_rate": 1.8567996129245154e-05, "loss": 0.389, "step": 826 }, { "epoch": 1.2777134028582464, "grad_norm": 0.6715176687276262, "learning_rate": 1.856242617773265e-05, "loss": 0.4589, "step": 827 }, { "epoch": 1.2792584009269987, "grad_norm": 0.5179703863690273, "learning_rate": 1.8556846253534434e-05, "loss": 0.3973, "step": 828 }, { "epoch": 1.2808033989957512, "grad_norm": 0.5583584369069439, "learning_rate": 1.855125636314946e-05, "loss": 0.4465, "step": 829 }, { "epoch": 1.2823483970645038, "grad_norm": 0.5298708463588024, "learning_rate": 1.8545656513088298e-05, "loss": 0.466, "step": 830 }, { "epoch": 1.283893395133256, "grad_norm": 0.5048655625729561, "learning_rate": 1.85400467098731e-05, "loss": 0.4177, "step": 831 }, { "epoch": 1.2854383932020084, "grad_norm": 0.45461643220935843, "learning_rate": 1.8534426960037633e-05, "loss": 0.4306, "step": 832 }, { "epoch": 1.286983391270761, "grad_norm": 0.5433937164438797, "learning_rate": 1.8528797270127233e-05, "loss": 0.4342, "step": 833 }, { "epoch": 1.2885283893395134, "grad_norm": 0.4183035071044246, "learning_rate": 1.8523157646698818e-05, "loss": 0.3993, "step": 834 }, { "epoch": 1.2900733874082657, "grad_norm": 0.5888925197450727, "learning_rate": 1.851750809632088e-05, "loss": 0.4474, "step": 835 }, { "epoch": 1.291618385477018, "grad_norm": 0.5632715695442245, "learning_rate": 1.851184862557346e-05, "loss": 0.4141, "step": 836 }, { "epoch": 1.2931633835457705, "grad_norm": 0.5258629120662136, "learning_rate": 1.850617924104817e-05, "loss": 0.4292, "step": 837 }, { "epoch": 1.294708381614523, "grad_norm": 0.5860119125421862, "learning_rate": 1.850049994934816e-05, "loss": 0.4117, "step": 838 }, { "epoch": 1.2962533796832754, "grad_norm": 0.5056794829116673, "learning_rate": 1.8494810757088116e-05, "loss": 0.4167, "step": 839 }, { "epoch": 1.2977983777520279, "grad_norm": 0.6689277202296586, "learning_rate": 1.848911167089426e-05, "loss": 0.4414, "step": 840 }, { "epoch": 1.2993433758207802, "grad_norm": 0.5576363255315622, "learning_rate": 1.848340269740434e-05, "loss": 0.3892, "step": 841 }, { "epoch": 1.3008883738895327, "grad_norm": 0.5126569392076785, "learning_rate": 1.847768384326761e-05, "loss": 0.4256, "step": 842 }, { "epoch": 1.302433371958285, "grad_norm": 0.6566473109262699, "learning_rate": 1.847195511514484e-05, "loss": 0.3779, "step": 843 }, { "epoch": 1.3039783700270375, "grad_norm": 0.5337028890179826, "learning_rate": 1.8466216519708303e-05, "loss": 0.4268, "step": 844 }, { "epoch": 1.3055233680957898, "grad_norm": 0.7006580042162586, "learning_rate": 1.8460468063641756e-05, "loss": 0.4159, "step": 845 }, { "epoch": 1.3070683661645424, "grad_norm": 0.5153077795234265, "learning_rate": 1.8454709753640442e-05, "loss": 0.4348, "step": 846 }, { "epoch": 1.3086133642332947, "grad_norm": 0.503457363947637, "learning_rate": 1.8448941596411085e-05, "loss": 0.3942, "step": 847 }, { "epoch": 1.3101583623020472, "grad_norm": 0.6034607807473501, "learning_rate": 1.844316359867188e-05, "loss": 0.4734, "step": 848 }, { "epoch": 1.3117033603707995, "grad_norm": 0.5092576898647186, "learning_rate": 1.8437375767152478e-05, "loss": 0.4155, "step": 849 }, { "epoch": 1.313248358439552, "grad_norm": 0.5801893391853876, "learning_rate": 1.8431578108593982e-05, "loss": 0.3724, "step": 850 }, { "epoch": 1.3147933565083043, "grad_norm": 0.582002729480085, "learning_rate": 1.8425770629748952e-05, "loss": 0.4153, "step": 851 }, { "epoch": 1.3163383545770568, "grad_norm": 0.5764000921504487, "learning_rate": 1.8419953337381366e-05, "loss": 0.4059, "step": 852 }, { "epoch": 1.3178833526458091, "grad_norm": 0.5811778382929148, "learning_rate": 1.8414126238266656e-05, "loss": 0.4217, "step": 853 }, { "epoch": 1.3194283507145617, "grad_norm": 0.6296416123813867, "learning_rate": 1.840828933919165e-05, "loss": 0.4297, "step": 854 }, { "epoch": 1.320973348783314, "grad_norm": 0.5893140604718962, "learning_rate": 1.8402442646954617e-05, "loss": 0.444, "step": 855 }, { "epoch": 1.3225183468520665, "grad_norm": 0.5472566359921525, "learning_rate": 1.8396586168365208e-05, "loss": 0.418, "step": 856 }, { "epoch": 1.3240633449208188, "grad_norm": 0.4704009197953403, "learning_rate": 1.8390719910244487e-05, "loss": 0.3872, "step": 857 }, { "epoch": 1.3256083429895713, "grad_norm": 0.46965874420200904, "learning_rate": 1.8384843879424905e-05, "loss": 0.4273, "step": 858 }, { "epoch": 1.3271533410583236, "grad_norm": 0.49510358414123096, "learning_rate": 1.8378958082750294e-05, "loss": 0.4131, "step": 859 }, { "epoch": 1.3286983391270761, "grad_norm": 0.5147354155194656, "learning_rate": 1.8373062527075855e-05, "loss": 0.4657, "step": 860 }, { "epoch": 1.3302433371958284, "grad_norm": 0.5691023255687069, "learning_rate": 1.836715721926817e-05, "loss": 0.4256, "step": 861 }, { "epoch": 1.331788335264581, "grad_norm": 0.527918759192356, "learning_rate": 1.8361242166205162e-05, "loss": 0.4245, "step": 862 }, { "epoch": 1.3333333333333333, "grad_norm": 0.4789602641670178, "learning_rate": 1.8355317374776118e-05, "loss": 0.4484, "step": 863 }, { "epoch": 1.3348783314020858, "grad_norm": 0.496346213415839, "learning_rate": 1.834938285188166e-05, "loss": 0.4129, "step": 864 }, { "epoch": 1.336423329470838, "grad_norm": 0.4366284682232012, "learning_rate": 1.8343438604433748e-05, "loss": 0.399, "step": 865 }, { "epoch": 1.3379683275395906, "grad_norm": 0.5924292371651109, "learning_rate": 1.8337484639355662e-05, "loss": 0.412, "step": 866 }, { "epoch": 1.339513325608343, "grad_norm": 0.49117096820753126, "learning_rate": 1.8331520963582008e-05, "loss": 0.4627, "step": 867 }, { "epoch": 1.3410583236770954, "grad_norm": 0.6161883728803011, "learning_rate": 1.83255475840587e-05, "loss": 0.4025, "step": 868 }, { "epoch": 1.3426033217458477, "grad_norm": 0.46288879588800663, "learning_rate": 1.8319564507742948e-05, "loss": 0.4203, "step": 869 }, { "epoch": 1.3441483198146003, "grad_norm": 0.5582464836225155, "learning_rate": 1.8313571741603266e-05, "loss": 0.4335, "step": 870 }, { "epoch": 1.3456933178833528, "grad_norm": 0.3834136145752225, "learning_rate": 1.8307569292619444e-05, "loss": 0.4112, "step": 871 }, { "epoch": 1.347238315952105, "grad_norm": 0.6045961684721476, "learning_rate": 1.8301557167782556e-05, "loss": 0.4347, "step": 872 }, { "epoch": 1.3487833140208574, "grad_norm": 0.4421211332310866, "learning_rate": 1.829553537409494e-05, "loss": 0.4209, "step": 873 }, { "epoch": 1.35032831208961, "grad_norm": 0.47763240811133, "learning_rate": 1.8289503918570202e-05, "loss": 0.4047, "step": 874 }, { "epoch": 1.3518733101583624, "grad_norm": 0.4786964942885881, "learning_rate": 1.8283462808233202e-05, "loss": 0.4325, "step": 875 }, { "epoch": 1.3534183082271147, "grad_norm": 0.47668842803636313, "learning_rate": 1.8277412050120032e-05, "loss": 0.4323, "step": 876 }, { "epoch": 1.354963306295867, "grad_norm": 0.5082042714893231, "learning_rate": 1.8271351651278034e-05, "loss": 0.4381, "step": 877 }, { "epoch": 1.3565083043646196, "grad_norm": 0.4675136508606097, "learning_rate": 1.8265281618765775e-05, "loss": 0.4335, "step": 878 }, { "epoch": 1.358053302433372, "grad_norm": 0.44088313665716206, "learning_rate": 1.825920195965304e-05, "loss": 0.4178, "step": 879 }, { "epoch": 1.3595983005021244, "grad_norm": 0.4799998049535272, "learning_rate": 1.825311268102083e-05, "loss": 0.4066, "step": 880 }, { "epoch": 1.3611432985708767, "grad_norm": 0.4422017258664749, "learning_rate": 1.824701378996135e-05, "loss": 0.4658, "step": 881 }, { "epoch": 1.3626882966396292, "grad_norm": 0.5481235689770617, "learning_rate": 1.824090529357799e-05, "loss": 0.4066, "step": 882 }, { "epoch": 1.3642332947083817, "grad_norm": 0.39668987705466796, "learning_rate": 1.8234787198985344e-05, "loss": 0.4271, "step": 883 }, { "epoch": 1.365778292777134, "grad_norm": 0.43999236490421967, "learning_rate": 1.822865951330917e-05, "loss": 0.4135, "step": 884 }, { "epoch": 1.3673232908458863, "grad_norm": 0.4028788138486918, "learning_rate": 1.822252224368641e-05, "loss": 0.3866, "step": 885 }, { "epoch": 1.3688682889146389, "grad_norm": 0.4706359445149962, "learning_rate": 1.8216375397265156e-05, "loss": 0.4414, "step": 886 }, { "epoch": 1.3704132869833914, "grad_norm": 0.43199006808529056, "learning_rate": 1.8210218981204667e-05, "loss": 0.4301, "step": 887 }, { "epoch": 1.3719582850521437, "grad_norm": 0.5109984832709967, "learning_rate": 1.8204053002675332e-05, "loss": 0.4371, "step": 888 }, { "epoch": 1.373503283120896, "grad_norm": 0.5057102595895472, "learning_rate": 1.819787746885869e-05, "loss": 0.445, "step": 889 }, { "epoch": 1.3750482811896485, "grad_norm": 0.41414355683875387, "learning_rate": 1.819169238694741e-05, "loss": 0.3943, "step": 890 }, { "epoch": 1.376593279258401, "grad_norm": 0.4708298059381896, "learning_rate": 1.8185497764145273e-05, "loss": 0.3904, "step": 891 }, { "epoch": 1.3781382773271533, "grad_norm": 0.5046625669106174, "learning_rate": 1.8179293607667177e-05, "loss": 0.4219, "step": 892 }, { "epoch": 1.3796832753959056, "grad_norm": 0.42300606637211485, "learning_rate": 1.8173079924739126e-05, "loss": 0.4039, "step": 893 }, { "epoch": 1.3812282734646582, "grad_norm": 0.5304124140133345, "learning_rate": 1.816685672259821e-05, "loss": 0.416, "step": 894 }, { "epoch": 1.3827732715334107, "grad_norm": 0.4491524971210518, "learning_rate": 1.8160624008492616e-05, "loss": 0.4133, "step": 895 }, { "epoch": 1.384318269602163, "grad_norm": 0.4615362518480325, "learning_rate": 1.815438178968161e-05, "loss": 0.4296, "step": 896 }, { "epoch": 1.3858632676709153, "grad_norm": 0.5710077523466568, "learning_rate": 1.8148130073435522e-05, "loss": 0.4794, "step": 897 }, { "epoch": 1.3874082657396678, "grad_norm": 0.4146139292943481, "learning_rate": 1.8141868867035745e-05, "loss": 0.4026, "step": 898 }, { "epoch": 1.3889532638084203, "grad_norm": 0.4766542949444053, "learning_rate": 1.8135598177774727e-05, "loss": 0.4347, "step": 899 }, { "epoch": 1.3904982618771726, "grad_norm": 0.586507014795295, "learning_rate": 1.812931801295596e-05, "loss": 0.4239, "step": 900 }, { "epoch": 1.3920432599459251, "grad_norm": 0.4412321523261593, "learning_rate": 1.8123028379893978e-05, "loss": 0.4264, "step": 901 }, { "epoch": 1.3935882580146775, "grad_norm": 0.5544330254655024, "learning_rate": 1.811672928591433e-05, "loss": 0.4248, "step": 902 }, { "epoch": 1.39513325608343, "grad_norm": 0.49452624360858843, "learning_rate": 1.8110420738353592e-05, "loss": 0.4431, "step": 903 }, { "epoch": 1.3966782541521823, "grad_norm": 0.4875019409862713, "learning_rate": 1.8104102744559356e-05, "loss": 0.4063, "step": 904 }, { "epoch": 1.3982232522209348, "grad_norm": 0.5206288503610863, "learning_rate": 1.8097775311890202e-05, "loss": 0.4156, "step": 905 }, { "epoch": 1.399768250289687, "grad_norm": 0.48772952710005985, "learning_rate": 1.8091438447715715e-05, "loss": 0.45, "step": 906 }, { "epoch": 1.4013132483584396, "grad_norm": 0.5016636120265034, "learning_rate": 1.8085092159416465e-05, "loss": 0.4235, "step": 907 }, { "epoch": 1.402858246427192, "grad_norm": 0.5109607196561832, "learning_rate": 1.807873645438399e-05, "loss": 0.4307, "step": 908 }, { "epoch": 1.4044032444959444, "grad_norm": 0.4510810602064695, "learning_rate": 1.80723713400208e-05, "loss": 0.4604, "step": 909 }, { "epoch": 1.4059482425646967, "grad_norm": 0.4072164332284222, "learning_rate": 1.806599682374037e-05, "loss": 0.3996, "step": 910 }, { "epoch": 1.4074932406334493, "grad_norm": 0.41715200632940785, "learning_rate": 1.8059612912967112e-05, "loss": 0.426, "step": 911 }, { "epoch": 1.4090382387022016, "grad_norm": 0.43621728263307896, "learning_rate": 1.8053219615136398e-05, "loss": 0.4423, "step": 912 }, { "epoch": 1.410583236770954, "grad_norm": 0.4405286790407957, "learning_rate": 1.8046816937694507e-05, "loss": 0.3876, "step": 913 }, { "epoch": 1.4121282348397064, "grad_norm": 0.5113841092391364, "learning_rate": 1.804040488809868e-05, "loss": 0.4452, "step": 914 }, { "epoch": 1.413673232908459, "grad_norm": 0.4625613132501593, "learning_rate": 1.803398347381703e-05, "loss": 0.4218, "step": 915 }, { "epoch": 1.4152182309772112, "grad_norm": 0.46740840058281885, "learning_rate": 1.8027552702328615e-05, "loss": 0.4006, "step": 916 }, { "epoch": 1.4167632290459637, "grad_norm": 0.5367324776327906, "learning_rate": 1.8021112581123368e-05, "loss": 0.415, "step": 917 }, { "epoch": 1.418308227114716, "grad_norm": 0.47149302201616977, "learning_rate": 1.801466311770212e-05, "loss": 0.4202, "step": 918 }, { "epoch": 1.4198532251834686, "grad_norm": 0.44430364014340556, "learning_rate": 1.8008204319576585e-05, "loss": 0.4498, "step": 919 }, { "epoch": 1.4213982232522209, "grad_norm": 0.45778425143738466, "learning_rate": 1.8001736194269344e-05, "loss": 0.4106, "step": 920 }, { "epoch": 1.4229432213209734, "grad_norm": 0.5068357830105168, "learning_rate": 1.7995258749313845e-05, "loss": 0.4196, "step": 921 }, { "epoch": 1.4244882193897257, "grad_norm": 0.39846484826868767, "learning_rate": 1.7988771992254385e-05, "loss": 0.4126, "step": 922 }, { "epoch": 1.4260332174584782, "grad_norm": 0.5840262542698686, "learning_rate": 1.7982275930646118e-05, "loss": 0.4274, "step": 923 }, { "epoch": 1.4275782155272305, "grad_norm": 0.511589946587284, "learning_rate": 1.7975770572055024e-05, "loss": 0.4544, "step": 924 }, { "epoch": 1.429123213595983, "grad_norm": 0.5612891612277126, "learning_rate": 1.7969255924057917e-05, "loss": 0.4256, "step": 925 }, { "epoch": 1.4306682116647353, "grad_norm": 0.4212774544985197, "learning_rate": 1.7962731994242423e-05, "loss": 0.408, "step": 926 }, { "epoch": 1.4322132097334879, "grad_norm": 0.5874029710301969, "learning_rate": 1.795619879020699e-05, "loss": 0.4359, "step": 927 }, { "epoch": 1.4337582078022402, "grad_norm": 0.4582090346562365, "learning_rate": 1.7949656319560867e-05, "loss": 0.4212, "step": 928 }, { "epoch": 1.4353032058709927, "grad_norm": 0.5426752216138848, "learning_rate": 1.794310458992408e-05, "loss": 0.4371, "step": 929 }, { "epoch": 1.436848203939745, "grad_norm": 0.4354876938872931, "learning_rate": 1.7936543608927454e-05, "loss": 0.411, "step": 930 }, { "epoch": 1.4383932020084975, "grad_norm": 0.4891820853169601, "learning_rate": 1.792997338421259e-05, "loss": 0.4112, "step": 931 }, { "epoch": 1.43993820007725, "grad_norm": 0.46657914027549213, "learning_rate": 1.7923393923431846e-05, "loss": 0.4184, "step": 932 }, { "epoch": 1.4414831981460023, "grad_norm": 0.454828728778665, "learning_rate": 1.791680523424834e-05, "loss": 0.426, "step": 933 }, { "epoch": 1.4430281962147546, "grad_norm": 0.4979793657157445, "learning_rate": 1.7910207324335938e-05, "loss": 0.4219, "step": 934 }, { "epoch": 1.4445731942835072, "grad_norm": 0.48675800684632714, "learning_rate": 1.7903600201379253e-05, "loss": 0.4174, "step": 935 }, { "epoch": 1.4461181923522597, "grad_norm": 0.4213405691334379, "learning_rate": 1.7896983873073622e-05, "loss": 0.4515, "step": 936 }, { "epoch": 1.447663190421012, "grad_norm": 0.5327820604117295, "learning_rate": 1.7890358347125102e-05, "loss": 0.4194, "step": 937 }, { "epoch": 1.4492081884897643, "grad_norm": 0.4026624387051131, "learning_rate": 1.7883723631250466e-05, "loss": 0.4042, "step": 938 }, { "epoch": 1.4507531865585168, "grad_norm": 0.5333247827160055, "learning_rate": 1.7877079733177185e-05, "loss": 0.4193, "step": 939 }, { "epoch": 1.4522981846272693, "grad_norm": 0.45332836907839, "learning_rate": 1.7870426660643434e-05, "loss": 0.3926, "step": 940 }, { "epoch": 1.4538431826960216, "grad_norm": 0.4875272717665886, "learning_rate": 1.7863764421398063e-05, "loss": 0.4168, "step": 941 }, { "epoch": 1.455388180764774, "grad_norm": 0.5943245609386203, "learning_rate": 1.7857093023200615e-05, "loss": 0.4473, "step": 942 }, { "epoch": 1.4569331788335265, "grad_norm": 0.4563352060532699, "learning_rate": 1.7850412473821276e-05, "loss": 0.3996, "step": 943 }, { "epoch": 1.458478176902279, "grad_norm": 0.41543091822343664, "learning_rate": 1.784372278104091e-05, "loss": 0.4105, "step": 944 }, { "epoch": 1.4600231749710313, "grad_norm": 0.423066996768143, "learning_rate": 1.783702395265102e-05, "loss": 0.4284, "step": 945 }, { "epoch": 1.4615681730397836, "grad_norm": 0.40054604688760026, "learning_rate": 1.783031599645376e-05, "loss": 0.4515, "step": 946 }, { "epoch": 1.4631131711085361, "grad_norm": 0.464289692029655, "learning_rate": 1.7823598920261905e-05, "loss": 0.4231, "step": 947 }, { "epoch": 1.4646581691772886, "grad_norm": 0.4695214485515377, "learning_rate": 1.7816872731898854e-05, "loss": 0.4004, "step": 948 }, { "epoch": 1.466203167246041, "grad_norm": 0.5064446330972402, "learning_rate": 1.7810137439198623e-05, "loss": 0.4186, "step": 949 }, { "epoch": 1.4677481653147932, "grad_norm": 0.56654412231934, "learning_rate": 1.7803393050005827e-05, "loss": 0.4295, "step": 950 }, { "epoch": 1.4692931633835458, "grad_norm": 0.44143628229399845, "learning_rate": 1.7796639572175684e-05, "loss": 0.4251, "step": 951 }, { "epoch": 1.4708381614522983, "grad_norm": 0.5903005096516218, "learning_rate": 1.778987701357398e-05, "loss": 0.4629, "step": 952 }, { "epoch": 1.4723831595210506, "grad_norm": 0.4743303307740833, "learning_rate": 1.77831053820771e-05, "loss": 0.4386, "step": 953 }, { "epoch": 1.473928157589803, "grad_norm": 0.4133708183016042, "learning_rate": 1.777632468557198e-05, "loss": 0.3899, "step": 954 }, { "epoch": 1.4754731556585554, "grad_norm": 0.4099509095745342, "learning_rate": 1.776953493195612e-05, "loss": 0.3977, "step": 955 }, { "epoch": 1.477018153727308, "grad_norm": 0.4892746834724251, "learning_rate": 1.7762736129137573e-05, "loss": 0.4601, "step": 956 }, { "epoch": 1.4785631517960602, "grad_norm": 0.4606664185264968, "learning_rate": 1.7755928285034922e-05, "loss": 0.4411, "step": 957 }, { "epoch": 1.4801081498648125, "grad_norm": 0.4888063223420539, "learning_rate": 1.7749111407577285e-05, "loss": 0.4658, "step": 958 }, { "epoch": 1.481653147933565, "grad_norm": 0.41123703228065644, "learning_rate": 1.7742285504704304e-05, "loss": 0.3595, "step": 959 }, { "epoch": 1.4831981460023176, "grad_norm": 0.4771417376077512, "learning_rate": 1.7735450584366126e-05, "loss": 0.4503, "step": 960 }, { "epoch": 1.4847431440710699, "grad_norm": 0.5504398056568721, "learning_rate": 1.772860665452341e-05, "loss": 0.4196, "step": 961 }, { "epoch": 1.4862881421398222, "grad_norm": 0.4174463321764409, "learning_rate": 1.7721753723147303e-05, "loss": 0.4693, "step": 962 }, { "epoch": 1.4878331402085747, "grad_norm": 0.4898438817074545, "learning_rate": 1.7714891798219432e-05, "loss": 0.4263, "step": 963 }, { "epoch": 1.4893781382773272, "grad_norm": 0.3960717073176924, "learning_rate": 1.7708020887731907e-05, "loss": 0.4202, "step": 964 }, { "epoch": 1.4909231363460795, "grad_norm": 0.47148079844329527, "learning_rate": 1.7701140999687297e-05, "loss": 0.4279, "step": 965 }, { "epoch": 1.492468134414832, "grad_norm": 0.4103545027885097, "learning_rate": 1.7694252142098633e-05, "loss": 0.3953, "step": 966 }, { "epoch": 1.4940131324835844, "grad_norm": 0.5692760013107627, "learning_rate": 1.768735432298939e-05, "loss": 0.4106, "step": 967 }, { "epoch": 1.4955581305523369, "grad_norm": 0.4869141600803218, "learning_rate": 1.7680447550393484e-05, "loss": 0.4225, "step": 968 }, { "epoch": 1.4971031286210892, "grad_norm": 0.5157407143373595, "learning_rate": 1.7673531832355243e-05, "loss": 0.4292, "step": 969 }, { "epoch": 1.4986481266898417, "grad_norm": 0.5366608131587157, "learning_rate": 1.766660717692944e-05, "loss": 0.4166, "step": 970 }, { "epoch": 1.500193124758594, "grad_norm": 0.442046095840421, "learning_rate": 1.7659673592181238e-05, "loss": 0.3993, "step": 971 }, { "epoch": 1.5017381228273465, "grad_norm": 0.4717468064214897, "learning_rate": 1.76527310861862e-05, "loss": 0.4109, "step": 972 }, { "epoch": 1.5032831208960988, "grad_norm": 0.5055048135490848, "learning_rate": 1.76457796670303e-05, "loss": 0.4694, "step": 973 }, { "epoch": 1.5048281189648511, "grad_norm": 0.42027109913566585, "learning_rate": 1.7638819342809866e-05, "loss": 0.399, "step": 974 }, { "epoch": 1.5063731170336037, "grad_norm": 0.5088457566736103, "learning_rate": 1.7631850121631613e-05, "loss": 0.4279, "step": 975 }, { "epoch": 1.5079181151023562, "grad_norm": 0.6505391601176911, "learning_rate": 1.7624872011612616e-05, "loss": 0.4135, "step": 976 }, { "epoch": 1.5094631131711085, "grad_norm": 0.6212394710525051, "learning_rate": 1.7617885020880306e-05, "loss": 0.4519, "step": 977 }, { "epoch": 1.5110081112398608, "grad_norm": 0.5556644040701827, "learning_rate": 1.7610889157572453e-05, "loss": 0.4177, "step": 978 }, { "epoch": 1.5125531093086133, "grad_norm": 0.5367753777384147, "learning_rate": 1.7603884429837154e-05, "loss": 0.4563, "step": 979 }, { "epoch": 1.5140981073773658, "grad_norm": 0.6625984103456459, "learning_rate": 1.759687084583285e-05, "loss": 0.4217, "step": 980 }, { "epoch": 1.5156431054461184, "grad_norm": 0.4474548850735172, "learning_rate": 1.758984841372828e-05, "loss": 0.4122, "step": 981 }, { "epoch": 1.5171881035148707, "grad_norm": 0.4808938686955035, "learning_rate": 1.758281714170249e-05, "loss": 0.4255, "step": 982 }, { "epoch": 1.518733101583623, "grad_norm": 0.4815848862777789, "learning_rate": 1.7575777037944832e-05, "loss": 0.4311, "step": 983 }, { "epoch": 1.5202780996523755, "grad_norm": 0.5008243387929454, "learning_rate": 1.756872811065493e-05, "loss": 0.3934, "step": 984 }, { "epoch": 1.521823097721128, "grad_norm": 0.4064843914579355, "learning_rate": 1.75616703680427e-05, "loss": 0.4377, "step": 985 }, { "epoch": 1.5233680957898803, "grad_norm": 0.5384348261076747, "learning_rate": 1.7554603818328316e-05, "loss": 0.3924, "step": 986 }, { "epoch": 1.5249130938586326, "grad_norm": 0.4554320019960905, "learning_rate": 1.754752846974221e-05, "loss": 0.4476, "step": 987 }, { "epoch": 1.5264580919273851, "grad_norm": 0.4899525812696042, "learning_rate": 1.7540444330525062e-05, "loss": 0.4216, "step": 988 }, { "epoch": 1.5280030899961377, "grad_norm": 0.48880509107273545, "learning_rate": 1.7533351408927798e-05, "loss": 0.4146, "step": 989 }, { "epoch": 1.52954808806489, "grad_norm": 0.49382495609953264, "learning_rate": 1.752624971321156e-05, "loss": 0.41, "step": 990 }, { "epoch": 1.5310930861336423, "grad_norm": 0.47303250972051536, "learning_rate": 1.751913925164772e-05, "loss": 0.3851, "step": 991 }, { "epoch": 1.5326380842023948, "grad_norm": 0.49975851137486366, "learning_rate": 1.7512020032517855e-05, "loss": 0.4388, "step": 992 }, { "epoch": 1.5341830822711473, "grad_norm": 0.4994628325142338, "learning_rate": 1.750489206411374e-05, "loss": 0.442, "step": 993 }, { "epoch": 1.5357280803398996, "grad_norm": 0.39288420285558795, "learning_rate": 1.7497755354737348e-05, "loss": 0.4068, "step": 994 }, { "epoch": 1.537273078408652, "grad_norm": 0.5946395363539785, "learning_rate": 1.7490609912700826e-05, "loss": 0.4066, "step": 995 }, { "epoch": 1.5388180764774044, "grad_norm": 0.46723721904000864, "learning_rate": 1.7483455746326496e-05, "loss": 0.4463, "step": 996 }, { "epoch": 1.540363074546157, "grad_norm": 0.581262991018618, "learning_rate": 1.7476292863946832e-05, "loss": 0.4191, "step": 997 }, { "epoch": 1.5419080726149093, "grad_norm": 0.46072034457636807, "learning_rate": 1.746912127390447e-05, "loss": 0.3769, "step": 998 }, { "epoch": 1.5434530706836616, "grad_norm": 0.4618575077395252, "learning_rate": 1.746194098455219e-05, "loss": 0.4393, "step": 999 }, { "epoch": 1.544998068752414, "grad_norm": 0.5217036418531954, "learning_rate": 1.745475200425289e-05, "loss": 0.4014, "step": 1000 }, { "epoch": 1.5465430668211666, "grad_norm": 0.4063397393354951, "learning_rate": 1.7447554341379604e-05, "loss": 0.4372, "step": 1001 }, { "epoch": 1.548088064889919, "grad_norm": 0.48401583029160256, "learning_rate": 1.744034800431547e-05, "loss": 0.4299, "step": 1002 }, { "epoch": 1.5496330629586712, "grad_norm": 0.39052255578625183, "learning_rate": 1.7433133001453733e-05, "loss": 0.4084, "step": 1003 }, { "epoch": 1.5511780610274237, "grad_norm": 0.4337565815780213, "learning_rate": 1.7425909341197736e-05, "loss": 0.4115, "step": 1004 }, { "epoch": 1.5527230590961763, "grad_norm": 0.395787844337923, "learning_rate": 1.741867703196089e-05, "loss": 0.4192, "step": 1005 }, { "epoch": 1.5542680571649286, "grad_norm": 0.452158953721951, "learning_rate": 1.7411436082166698e-05, "loss": 0.4518, "step": 1006 }, { "epoch": 1.5558130552336809, "grad_norm": 0.4243405130123789, "learning_rate": 1.740418650024871e-05, "loss": 0.428, "step": 1007 }, { "epoch": 1.5573580533024334, "grad_norm": 0.4119264272340879, "learning_rate": 1.7396928294650544e-05, "loss": 0.4209, "step": 1008 }, { "epoch": 1.558903051371186, "grad_norm": 0.4508659065519288, "learning_rate": 1.7389661473825854e-05, "loss": 0.4546, "step": 1009 }, { "epoch": 1.5604480494399382, "grad_norm": 0.44362349425289, "learning_rate": 1.738238604623833e-05, "loss": 0.4011, "step": 1010 }, { "epoch": 1.5619930475086905, "grad_norm": 0.42849256749775155, "learning_rate": 1.7375102020361684e-05, "loss": 0.3983, "step": 1011 }, { "epoch": 1.563538045577443, "grad_norm": 0.43614760379468864, "learning_rate": 1.7367809404679645e-05, "loss": 0.4463, "step": 1012 }, { "epoch": 1.5650830436461955, "grad_norm": 0.50345337572537, "learning_rate": 1.7360508207685953e-05, "loss": 0.433, "step": 1013 }, { "epoch": 1.5666280417149478, "grad_norm": 0.4461145699233713, "learning_rate": 1.7353198437884324e-05, "loss": 0.4219, "step": 1014 }, { "epoch": 1.5681730397837002, "grad_norm": 0.5003327075295656, "learning_rate": 1.7345880103788483e-05, "loss": 0.4167, "step": 1015 }, { "epoch": 1.5697180378524527, "grad_norm": 0.4220597542794742, "learning_rate": 1.7338553213922106e-05, "loss": 0.402, "step": 1016 }, { "epoch": 1.5712630359212052, "grad_norm": 0.6927983125189849, "learning_rate": 1.7331217776818857e-05, "loss": 0.4597, "step": 1017 }, { "epoch": 1.5728080339899575, "grad_norm": 0.4523352933138053, "learning_rate": 1.7323873801022336e-05, "loss": 0.4024, "step": 1018 }, { "epoch": 1.5743530320587098, "grad_norm": 0.4406282131047334, "learning_rate": 1.7316521295086096e-05, "loss": 0.4399, "step": 1019 }, { "epoch": 1.5758980301274623, "grad_norm": 0.4670282049764913, "learning_rate": 1.730916026757363e-05, "loss": 0.4094, "step": 1020 }, { "epoch": 1.5774430281962148, "grad_norm": 0.45769828179068067, "learning_rate": 1.7301790727058344e-05, "loss": 0.3998, "step": 1021 }, { "epoch": 1.5789880262649671, "grad_norm": 0.47524459735603775, "learning_rate": 1.729441268212357e-05, "loss": 0.4565, "step": 1022 }, { "epoch": 1.5805330243337194, "grad_norm": 0.38203499395546797, "learning_rate": 1.7287026141362538e-05, "loss": 0.4105, "step": 1023 }, { "epoch": 1.582078022402472, "grad_norm": 0.5943249966519746, "learning_rate": 1.727963111337838e-05, "loss": 0.4187, "step": 1024 }, { "epoch": 1.5836230204712245, "grad_norm": 0.39636716445657233, "learning_rate": 1.7272227606784108e-05, "loss": 0.4015, "step": 1025 }, { "epoch": 1.5851680185399768, "grad_norm": 0.4821415809895145, "learning_rate": 1.7264815630202608e-05, "loss": 0.4239, "step": 1026 }, { "epoch": 1.586713016608729, "grad_norm": 0.4095946412846323, "learning_rate": 1.7257395192266638e-05, "loss": 0.4077, "step": 1027 }, { "epoch": 1.5882580146774816, "grad_norm": 0.46320533517297113, "learning_rate": 1.7249966301618803e-05, "loss": 0.4672, "step": 1028 }, { "epoch": 1.5898030127462341, "grad_norm": 0.4243175456416756, "learning_rate": 1.7242528966911555e-05, "loss": 0.4114, "step": 1029 }, { "epoch": 1.5913480108149864, "grad_norm": 0.4341383183105472, "learning_rate": 1.7235083196807184e-05, "loss": 0.4523, "step": 1030 }, { "epoch": 1.5928930088837387, "grad_norm": 0.46358936039044313, "learning_rate": 1.7227628999977798e-05, "loss": 0.4287, "step": 1031 }, { "epoch": 1.5944380069524913, "grad_norm": 0.49957217804382614, "learning_rate": 1.7220166385105334e-05, "loss": 0.4194, "step": 1032 }, { "epoch": 1.5959830050212438, "grad_norm": 0.45552962799584457, "learning_rate": 1.7212695360881516e-05, "loss": 0.4105, "step": 1033 }, { "epoch": 1.597528003089996, "grad_norm": 0.4839725637176701, "learning_rate": 1.720521593600787e-05, "loss": 0.4525, "step": 1034 }, { "epoch": 1.5990730011587484, "grad_norm": 0.4978411554178996, "learning_rate": 1.7197728119195713e-05, "loss": 0.4147, "step": 1035 }, { "epoch": 1.600617999227501, "grad_norm": 0.4810788732813052, "learning_rate": 1.7190231919166126e-05, "loss": 0.4038, "step": 1036 }, { "epoch": 1.6021629972962534, "grad_norm": 0.4345462486279959, "learning_rate": 1.7182727344649955e-05, "loss": 0.4457, "step": 1037 }, { "epoch": 1.6037079953650057, "grad_norm": 0.4667674610729828, "learning_rate": 1.7175214404387806e-05, "loss": 0.4108, "step": 1038 }, { "epoch": 1.605252993433758, "grad_norm": 0.491750081379796, "learning_rate": 1.716769310713003e-05, "loss": 0.4126, "step": 1039 }, { "epoch": 1.6067979915025106, "grad_norm": 0.5321245192437956, "learning_rate": 1.716016346163669e-05, "loss": 0.4329, "step": 1040 }, { "epoch": 1.608342989571263, "grad_norm": 0.5705642649837284, "learning_rate": 1.7152625476677607e-05, "loss": 0.4572, "step": 1041 }, { "epoch": 1.6098879876400156, "grad_norm": 0.41962782455573117, "learning_rate": 1.7145079161032285e-05, "loss": 0.3945, "step": 1042 }, { "epoch": 1.611432985708768, "grad_norm": 0.5834876437510723, "learning_rate": 1.713752452348995e-05, "loss": 0.4005, "step": 1043 }, { "epoch": 1.6129779837775202, "grad_norm": 0.4640599896072275, "learning_rate": 1.7129961572849504e-05, "loss": 0.4237, "step": 1044 }, { "epoch": 1.6145229818462727, "grad_norm": 0.5113637542744628, "learning_rate": 1.712239031791955e-05, "loss": 0.4315, "step": 1045 }, { "epoch": 1.6160679799150253, "grad_norm": 0.5437102971306271, "learning_rate": 1.7114810767518348e-05, "loss": 0.4299, "step": 1046 }, { "epoch": 1.6176129779837776, "grad_norm": 0.46588623192136386, "learning_rate": 1.7107222930473825e-05, "loss": 0.4247, "step": 1047 }, { "epoch": 1.6191579760525299, "grad_norm": 0.5687430380865951, "learning_rate": 1.709962681562356e-05, "loss": 0.4158, "step": 1048 }, { "epoch": 1.6207029741212824, "grad_norm": 0.4219901505636826, "learning_rate": 1.7092022431814776e-05, "loss": 0.4384, "step": 1049 }, { "epoch": 1.622247972190035, "grad_norm": 0.6086529775889625, "learning_rate": 1.7084409787904318e-05, "loss": 0.4333, "step": 1050 }, { "epoch": 1.6237929702587872, "grad_norm": 0.3933954907913078, "learning_rate": 1.707678889275866e-05, "loss": 0.3966, "step": 1051 }, { "epoch": 1.6253379683275395, "grad_norm": 0.5698473474556648, "learning_rate": 1.7069159755253885e-05, "loss": 0.4295, "step": 1052 }, { "epoch": 1.626882966396292, "grad_norm": 0.5583565295778193, "learning_rate": 1.7061522384275676e-05, "loss": 0.4404, "step": 1053 }, { "epoch": 1.6284279644650446, "grad_norm": 0.37984214038138536, "learning_rate": 1.70538767887193e-05, "loss": 0.4027, "step": 1054 }, { "epoch": 1.6299729625337969, "grad_norm": 0.5292648917757365, "learning_rate": 1.7046222977489604e-05, "loss": 0.3927, "step": 1055 }, { "epoch": 1.6315179606025492, "grad_norm": 0.39388175702148265, "learning_rate": 1.7038560959501013e-05, "loss": 0.4392, "step": 1056 }, { "epoch": 1.6330629586713017, "grad_norm": 0.6131475830634713, "learning_rate": 1.70308907436775e-05, "loss": 0.4258, "step": 1057 }, { "epoch": 1.6346079567400542, "grad_norm": 0.48654115212978744, "learning_rate": 1.7023212338952597e-05, "loss": 0.4411, "step": 1058 }, { "epoch": 1.6361529548088065, "grad_norm": 0.5399153842841071, "learning_rate": 1.7015525754269363e-05, "loss": 0.3923, "step": 1059 }, { "epoch": 1.6376979528775588, "grad_norm": 0.5496305939572897, "learning_rate": 1.7007830998580384e-05, "loss": 0.441, "step": 1060 }, { "epoch": 1.6392429509463113, "grad_norm": 0.5823889373911426, "learning_rate": 1.7000128080847777e-05, "loss": 0.4274, "step": 1061 }, { "epoch": 1.6407879490150639, "grad_norm": 0.5258119400476815, "learning_rate": 1.6992417010043144e-05, "loss": 0.3915, "step": 1062 }, { "epoch": 1.6423329470838162, "grad_norm": 0.538372681888035, "learning_rate": 1.69846977951476e-05, "loss": 0.4039, "step": 1063 }, { "epoch": 1.6438779451525685, "grad_norm": 0.5328102894010672, "learning_rate": 1.6976970445151743e-05, "loss": 0.4002, "step": 1064 }, { "epoch": 1.645422943221321, "grad_norm": 0.555179635562809, "learning_rate": 1.6969234969055636e-05, "loss": 0.4312, "step": 1065 }, { "epoch": 1.6469679412900735, "grad_norm": 0.43623942457001436, "learning_rate": 1.6961491375868816e-05, "loss": 0.4039, "step": 1066 }, { "epoch": 1.6485129393588258, "grad_norm": 0.5409787456712991, "learning_rate": 1.6953739674610273e-05, "loss": 0.4304, "step": 1067 }, { "epoch": 1.6500579374275781, "grad_norm": 0.5548322117826611, "learning_rate": 1.6945979874308437e-05, "loss": 0.4097, "step": 1068 }, { "epoch": 1.6516029354963306, "grad_norm": 0.41991467711648706, "learning_rate": 1.6938211984001177e-05, "loss": 0.4049, "step": 1069 }, { "epoch": 1.6531479335650832, "grad_norm": 0.5874502331421213, "learning_rate": 1.6930436012735772e-05, "loss": 0.4028, "step": 1070 }, { "epoch": 1.6546929316338355, "grad_norm": 0.4960880812102513, "learning_rate": 1.6922651969568927e-05, "loss": 0.4196, "step": 1071 }, { "epoch": 1.6562379297025878, "grad_norm": 0.6468504112312653, "learning_rate": 1.6914859863566743e-05, "loss": 0.445, "step": 1072 }, { "epoch": 1.6577829277713403, "grad_norm": 0.5536732764228746, "learning_rate": 1.6907059703804708e-05, "loss": 0.4387, "step": 1073 }, { "epoch": 1.6593279258400928, "grad_norm": 0.5315498593211347, "learning_rate": 1.6899251499367693e-05, "loss": 0.4408, "step": 1074 }, { "epoch": 1.6608729239088451, "grad_norm": 0.6016438683600485, "learning_rate": 1.6891435259349946e-05, "loss": 0.4277, "step": 1075 }, { "epoch": 1.6624179219775974, "grad_norm": 0.3880154616053888, "learning_rate": 1.688361099285506e-05, "loss": 0.3862, "step": 1076 }, { "epoch": 1.66396292004635, "grad_norm": 0.5735513235216411, "learning_rate": 1.6875778708995984e-05, "loss": 0.4399, "step": 1077 }, { "epoch": 1.6655079181151025, "grad_norm": 0.3426318769068146, "learning_rate": 1.6867938416895013e-05, "loss": 0.3603, "step": 1078 }, { "epoch": 1.6670529161838548, "grad_norm": 0.5512757987984395, "learning_rate": 1.686009012568375e-05, "loss": 0.4507, "step": 1079 }, { "epoch": 1.668597914252607, "grad_norm": 0.4585223444156026, "learning_rate": 1.6852233844503125e-05, "loss": 0.4078, "step": 1080 }, { "epoch": 1.6701429123213596, "grad_norm": 0.46617891208928275, "learning_rate": 1.6844369582503385e-05, "loss": 0.4415, "step": 1081 }, { "epoch": 1.671687910390112, "grad_norm": 0.5182270610690971, "learning_rate": 1.6836497348844054e-05, "loss": 0.4379, "step": 1082 }, { "epoch": 1.6732329084588644, "grad_norm": 0.4231542550185723, "learning_rate": 1.6828617152693952e-05, "loss": 0.3974, "step": 1083 }, { "epoch": 1.6747779065276167, "grad_norm": 0.456985759425072, "learning_rate": 1.6820729003231165e-05, "loss": 0.4291, "step": 1084 }, { "epoch": 1.6763229045963692, "grad_norm": 0.3586413401119008, "learning_rate": 1.6812832909643048e-05, "loss": 0.4274, "step": 1085 }, { "epoch": 1.6778679026651218, "grad_norm": 0.3958370214908289, "learning_rate": 1.680492888112621e-05, "loss": 0.4191, "step": 1086 }, { "epoch": 1.679412900733874, "grad_norm": 0.3912286725885716, "learning_rate": 1.67970169268865e-05, "loss": 0.4312, "step": 1087 }, { "epoch": 1.6809578988026264, "grad_norm": 0.4403903193726935, "learning_rate": 1.6789097056138986e-05, "loss": 0.4134, "step": 1088 }, { "epoch": 1.6825028968713789, "grad_norm": 0.36112175845961403, "learning_rate": 1.6781169278107977e-05, "loss": 0.3902, "step": 1089 }, { "epoch": 1.6840478949401314, "grad_norm": 0.4631080202289778, "learning_rate": 1.677323360202698e-05, "loss": 0.4303, "step": 1090 }, { "epoch": 1.6855928930088837, "grad_norm": 0.45957132845336096, "learning_rate": 1.67652900371387e-05, "loss": 0.4276, "step": 1091 }, { "epoch": 1.687137891077636, "grad_norm": 0.3956287138802762, "learning_rate": 1.6757338592695042e-05, "loss": 0.3991, "step": 1092 }, { "epoch": 1.6886828891463885, "grad_norm": 0.4579883971487316, "learning_rate": 1.674937927795707e-05, "loss": 0.4231, "step": 1093 }, { "epoch": 1.690227887215141, "grad_norm": 0.46543494773471034, "learning_rate": 1.674141210219502e-05, "loss": 0.3998, "step": 1094 }, { "epoch": 1.6917728852838934, "grad_norm": 0.4441750968301943, "learning_rate": 1.6733437074688305e-05, "loss": 0.426, "step": 1095 }, { "epoch": 1.6933178833526457, "grad_norm": 0.4251121137982081, "learning_rate": 1.6725454204725443e-05, "loss": 0.403, "step": 1096 }, { "epoch": 1.6948628814213982, "grad_norm": 0.4585415338162839, "learning_rate": 1.671746350160413e-05, "loss": 0.4355, "step": 1097 }, { "epoch": 1.6964078794901507, "grad_norm": 0.38864650747527674, "learning_rate": 1.670946497463115e-05, "loss": 0.3996, "step": 1098 }, { "epoch": 1.697952877558903, "grad_norm": 0.5318543726518846, "learning_rate": 1.6701458633122418e-05, "loss": 0.4262, "step": 1099 }, { "epoch": 1.6994978756276553, "grad_norm": 0.4239883521161212, "learning_rate": 1.6693444486402946e-05, "loss": 0.4099, "step": 1100 }, { "epoch": 1.7010428736964078, "grad_norm": 0.4790834602290601, "learning_rate": 1.6685422543806836e-05, "loss": 0.4183, "step": 1101 }, { "epoch": 1.7025878717651604, "grad_norm": 0.4266477941159462, "learning_rate": 1.6677392814677277e-05, "loss": 0.4483, "step": 1102 }, { "epoch": 1.7041328698339127, "grad_norm": 0.4810412861601916, "learning_rate": 1.666935530836651e-05, "loss": 0.4015, "step": 1103 }, { "epoch": 1.705677867902665, "grad_norm": 0.4125803165360151, "learning_rate": 1.6661310034235852e-05, "loss": 0.4275, "step": 1104 }, { "epoch": 1.7072228659714175, "grad_norm": 0.42750481754423475, "learning_rate": 1.6653257001655652e-05, "loss": 0.42, "step": 1105 }, { "epoch": 1.70876786404017, "grad_norm": 0.4082409310490051, "learning_rate": 1.664519622000532e-05, "loss": 0.4211, "step": 1106 }, { "epoch": 1.7103128621089225, "grad_norm": 0.37385776706389195, "learning_rate": 1.6637127698673257e-05, "loss": 0.4434, "step": 1107 }, { "epoch": 1.7118578601776748, "grad_norm": 0.42472385386863054, "learning_rate": 1.6629051447056904e-05, "loss": 0.3898, "step": 1108 }, { "epoch": 1.7134028582464271, "grad_norm": 0.37511991746796214, "learning_rate": 1.66209674745627e-05, "loss": 0.4352, "step": 1109 }, { "epoch": 1.7149478563151797, "grad_norm": 0.4653178888511881, "learning_rate": 1.6612875790606065e-05, "loss": 0.4265, "step": 1110 }, { "epoch": 1.7164928543839322, "grad_norm": 0.3737059845454942, "learning_rate": 1.660477640461142e-05, "loss": 0.4349, "step": 1111 }, { "epoch": 1.7180378524526845, "grad_norm": 0.44489409943539515, "learning_rate": 1.659666932601214e-05, "loss": 0.3996, "step": 1112 }, { "epoch": 1.7195828505214368, "grad_norm": 0.4060688521067978, "learning_rate": 1.658855456425057e-05, "loss": 0.4225, "step": 1113 }, { "epoch": 1.7211278485901893, "grad_norm": 0.4112391700979759, "learning_rate": 1.6580432128778e-05, "loss": 0.4208, "step": 1114 }, { "epoch": 1.7226728466589418, "grad_norm": 0.4485941217587587, "learning_rate": 1.657230202905465e-05, "loss": 0.412, "step": 1115 }, { "epoch": 1.7242178447276941, "grad_norm": 0.47033474730496, "learning_rate": 1.6564164274549678e-05, "loss": 0.4247, "step": 1116 }, { "epoch": 1.7257628427964464, "grad_norm": 0.451035910624295, "learning_rate": 1.6556018874741155e-05, "loss": 0.4332, "step": 1117 }, { "epoch": 1.727307840865199, "grad_norm": 0.4714593573788001, "learning_rate": 1.6547865839116054e-05, "loss": 0.4033, "step": 1118 }, { "epoch": 1.7288528389339515, "grad_norm": 0.4238640791971488, "learning_rate": 1.653970517717024e-05, "loss": 0.4397, "step": 1119 }, { "epoch": 1.7303978370027038, "grad_norm": 0.4421702779021047, "learning_rate": 1.6531536898408465e-05, "loss": 0.4169, "step": 1120 }, { "epoch": 1.731942835071456, "grad_norm": 0.5295413060120978, "learning_rate": 1.6523361012344348e-05, "loss": 0.4106, "step": 1121 }, { "epoch": 1.7334878331402086, "grad_norm": 0.40429856303319545, "learning_rate": 1.651517752850037e-05, "loss": 0.4437, "step": 1122 }, { "epoch": 1.7350328312089611, "grad_norm": 0.5373106862781462, "learning_rate": 1.6506986456407866e-05, "loss": 0.4199, "step": 1123 }, { "epoch": 1.7365778292777134, "grad_norm": 0.3904785014991924, "learning_rate": 1.6498787805606998e-05, "loss": 0.4284, "step": 1124 }, { "epoch": 1.7381228273464657, "grad_norm": 0.5331935737106657, "learning_rate": 1.6490581585646764e-05, "loss": 0.4335, "step": 1125 }, { "epoch": 1.7396678254152182, "grad_norm": 0.46847170307869046, "learning_rate": 1.648236780608498e-05, "loss": 0.4085, "step": 1126 }, { "epoch": 1.7412128234839708, "grad_norm": 0.38723378189232005, "learning_rate": 1.647414647648825e-05, "loss": 0.4433, "step": 1127 }, { "epoch": 1.742757821552723, "grad_norm": 0.37269262592273045, "learning_rate": 1.6465917606431993e-05, "loss": 0.3809, "step": 1128 }, { "epoch": 1.7443028196214754, "grad_norm": 0.3993284084896468, "learning_rate": 1.6457681205500396e-05, "loss": 0.4163, "step": 1129 }, { "epoch": 1.745847817690228, "grad_norm": 0.45156254846691735, "learning_rate": 1.644943728328643e-05, "loss": 0.422, "step": 1130 }, { "epoch": 1.7473928157589804, "grad_norm": 0.4201127965399996, "learning_rate": 1.644118584939181e-05, "loss": 0.4138, "step": 1131 }, { "epoch": 1.7489378138277327, "grad_norm": 0.5526932125962268, "learning_rate": 1.6432926913427013e-05, "loss": 0.4375, "step": 1132 }, { "epoch": 1.750482811896485, "grad_norm": 0.423302834544846, "learning_rate": 1.6424660485011245e-05, "loss": 0.4252, "step": 1133 }, { "epoch": 1.7520278099652375, "grad_norm": 0.567182498479579, "learning_rate": 1.641638657377244e-05, "loss": 0.4084, "step": 1134 }, { "epoch": 1.75357280803399, "grad_norm": 0.5256441949747406, "learning_rate": 1.6408105189347256e-05, "loss": 0.4567, "step": 1135 }, { "epoch": 1.7551178061027424, "grad_norm": 0.45572108810657974, "learning_rate": 1.6399816341381043e-05, "loss": 0.3956, "step": 1136 }, { "epoch": 1.7566628041714947, "grad_norm": 0.5075408162955704, "learning_rate": 1.6391520039527855e-05, "loss": 0.4001, "step": 1137 }, { "epoch": 1.7582078022402472, "grad_norm": 0.5261526373756911, "learning_rate": 1.6383216293450414e-05, "loss": 0.4456, "step": 1138 }, { "epoch": 1.7597528003089997, "grad_norm": 0.49852123588147795, "learning_rate": 1.6374905112820125e-05, "loss": 0.4073, "step": 1139 }, { "epoch": 1.761297798377752, "grad_norm": 0.4160723131462996, "learning_rate": 1.6366586507317053e-05, "loss": 0.4001, "step": 1140 }, { "epoch": 1.7628427964465043, "grad_norm": 0.5833407884519073, "learning_rate": 1.635826048662989e-05, "loss": 0.4513, "step": 1141 }, { "epoch": 1.7643877945152568, "grad_norm": 0.4051078050643858, "learning_rate": 1.6349927060455995e-05, "loss": 0.3935, "step": 1142 }, { "epoch": 1.7659327925840094, "grad_norm": 0.5161122590465061, "learning_rate": 1.6341586238501328e-05, "loss": 0.4289, "step": 1143 }, { "epoch": 1.7674777906527617, "grad_norm": 0.4754395799258531, "learning_rate": 1.6333238030480473e-05, "loss": 0.4047, "step": 1144 }, { "epoch": 1.769022788721514, "grad_norm": 0.47139154917748255, "learning_rate": 1.632488244611661e-05, "loss": 0.4436, "step": 1145 }, { "epoch": 1.7705677867902665, "grad_norm": 0.4412287652912194, "learning_rate": 1.631651949514153e-05, "loss": 0.3824, "step": 1146 }, { "epoch": 1.772112784859019, "grad_norm": 0.43485585021694073, "learning_rate": 1.6308149187295574e-05, "loss": 0.4459, "step": 1147 }, { "epoch": 1.7736577829277713, "grad_norm": 0.47043687460380107, "learning_rate": 1.6299771532327678e-05, "loss": 0.4539, "step": 1148 }, { "epoch": 1.7752027809965236, "grad_norm": 0.38180897760324495, "learning_rate": 1.6291386539995315e-05, "loss": 0.4106, "step": 1149 }, { "epoch": 1.7767477790652761, "grad_norm": 0.5062005710248006, "learning_rate": 1.628299422006452e-05, "loss": 0.4314, "step": 1150 }, { "epoch": 1.7782927771340287, "grad_norm": 0.4069539551224503, "learning_rate": 1.6274594582309847e-05, "loss": 0.3624, "step": 1151 }, { "epoch": 1.779837775202781, "grad_norm": 0.4788503288383385, "learning_rate": 1.6266187636514392e-05, "loss": 0.4616, "step": 1152 }, { "epoch": 1.7813827732715333, "grad_norm": 0.4716097788610631, "learning_rate": 1.6257773392469746e-05, "loss": 0.4106, "step": 1153 }, { "epoch": 1.7829277713402858, "grad_norm": 0.44325674898456585, "learning_rate": 1.624935185997601e-05, "loss": 0.4218, "step": 1154 }, { "epoch": 1.7844727694090383, "grad_norm": 0.3672590509728591, "learning_rate": 1.6240923048841765e-05, "loss": 0.3718, "step": 1155 }, { "epoch": 1.7860177674777906, "grad_norm": 0.44314151566080306, "learning_rate": 1.623248696888408e-05, "loss": 0.4048, "step": 1156 }, { "epoch": 1.787562765546543, "grad_norm": 0.42316617608664175, "learning_rate": 1.622404362992848e-05, "loss": 0.4231, "step": 1157 }, { "epoch": 1.7891077636152954, "grad_norm": 0.5026977406205784, "learning_rate": 1.6215593041808952e-05, "loss": 0.4076, "step": 1158 }, { "epoch": 1.790652761684048, "grad_norm": 0.4111284851479003, "learning_rate": 1.6207135214367927e-05, "loss": 0.4402, "step": 1159 }, { "epoch": 1.7921977597528003, "grad_norm": 0.4791635986715569, "learning_rate": 1.619867015745626e-05, "loss": 0.4033, "step": 1160 }, { "epoch": 1.7937427578215526, "grad_norm": 0.4206350682096085, "learning_rate": 1.619019788093323e-05, "loss": 0.3969, "step": 1161 }, { "epoch": 1.795287755890305, "grad_norm": 0.5118175137446966, "learning_rate": 1.618171839466653e-05, "loss": 0.4486, "step": 1162 }, { "epoch": 1.7968327539590576, "grad_norm": 0.37469193611011, "learning_rate": 1.6173231708532236e-05, "loss": 0.4223, "step": 1163 }, { "epoch": 1.79837775202781, "grad_norm": 0.38363755005196315, "learning_rate": 1.6164737832414828e-05, "loss": 0.4249, "step": 1164 }, { "epoch": 1.7999227500965622, "grad_norm": 0.45225010893014483, "learning_rate": 1.615623677620715e-05, "loss": 0.4215, "step": 1165 }, { "epoch": 1.8014677481653147, "grad_norm": 0.4336552492217712, "learning_rate": 1.6147728549810405e-05, "loss": 0.4205, "step": 1166 }, { "epoch": 1.8030127462340673, "grad_norm": 0.41502336244133237, "learning_rate": 1.6139213163134156e-05, "loss": 0.3947, "step": 1167 }, { "epoch": 1.8045577443028196, "grad_norm": 0.3626512490800638, "learning_rate": 1.6130690626096303e-05, "loss": 0.4151, "step": 1168 }, { "epoch": 1.806102742371572, "grad_norm": 0.41471569967330335, "learning_rate": 1.6122160948623067e-05, "loss": 0.4215, "step": 1169 }, { "epoch": 1.8076477404403244, "grad_norm": 0.43319412806349006, "learning_rate": 1.6113624140648994e-05, "loss": 0.404, "step": 1170 }, { "epoch": 1.809192738509077, "grad_norm": 0.3901867966422059, "learning_rate": 1.6105080212116936e-05, "loss": 0.4472, "step": 1171 }, { "epoch": 1.8107377365778294, "grad_norm": 0.42603123799012155, "learning_rate": 1.609652917297803e-05, "loss": 0.4006, "step": 1172 }, { "epoch": 1.8122827346465817, "grad_norm": 0.4477437811567181, "learning_rate": 1.6087971033191697e-05, "loss": 0.4132, "step": 1173 }, { "epoch": 1.813827732715334, "grad_norm": 0.4978610159559326, "learning_rate": 1.6079405802725637e-05, "loss": 0.445, "step": 1174 }, { "epoch": 1.8153727307840866, "grad_norm": 0.3768518193409819, "learning_rate": 1.60708334915558e-05, "loss": 0.402, "step": 1175 }, { "epoch": 1.816917728852839, "grad_norm": 0.482116112525008, "learning_rate": 1.6062254109666383e-05, "loss": 0.4621, "step": 1176 }, { "epoch": 1.8184627269215914, "grad_norm": 0.4199364072463108, "learning_rate": 1.6053667667049818e-05, "loss": 0.3961, "step": 1177 }, { "epoch": 1.8200077249903437, "grad_norm": 0.538431055053487, "learning_rate": 1.6045074173706768e-05, "loss": 0.4018, "step": 1178 }, { "epoch": 1.8215527230590962, "grad_norm": 0.42648164237169384, "learning_rate": 1.6036473639646096e-05, "loss": 0.4148, "step": 1179 }, { "epoch": 1.8230977211278487, "grad_norm": 0.4736005614498484, "learning_rate": 1.6027866074884878e-05, "loss": 0.4489, "step": 1180 }, { "epoch": 1.824642719196601, "grad_norm": 0.41355734751216566, "learning_rate": 1.601925148944837e-05, "loss": 0.3779, "step": 1181 }, { "epoch": 1.8261877172653533, "grad_norm": 0.3807327999311575, "learning_rate": 1.601062989337001e-05, "loss": 0.4154, "step": 1182 }, { "epoch": 1.8277327153341059, "grad_norm": 0.44707245963173015, "learning_rate": 1.6002001296691396e-05, "loss": 0.4789, "step": 1183 }, { "epoch": 1.8292777134028584, "grad_norm": 0.39862702717192483, "learning_rate": 1.5993365709462287e-05, "loss": 0.412, "step": 1184 }, { "epoch": 1.8308227114716107, "grad_norm": 0.4070883568619711, "learning_rate": 1.5984723141740578e-05, "loss": 0.4104, "step": 1185 }, { "epoch": 1.832367709540363, "grad_norm": 0.38820691798818935, "learning_rate": 1.5976073603592287e-05, "loss": 0.4159, "step": 1186 }, { "epoch": 1.8339127076091155, "grad_norm": 0.4348042474093562, "learning_rate": 1.5967417105091576e-05, "loss": 0.4202, "step": 1187 }, { "epoch": 1.835457705677868, "grad_norm": 0.37607469424926, "learning_rate": 1.5958753656320682e-05, "loss": 0.3992, "step": 1188 }, { "epoch": 1.8370027037466203, "grad_norm": 0.44957091479916494, "learning_rate": 1.5950083267369963e-05, "loss": 0.4487, "step": 1189 }, { "epoch": 1.8385477018153726, "grad_norm": 0.3629894205452323, "learning_rate": 1.594140594833784e-05, "loss": 0.4258, "step": 1190 }, { "epoch": 1.8400926998841252, "grad_norm": 0.4149048979610815, "learning_rate": 1.5932721709330823e-05, "loss": 0.4144, "step": 1191 }, { "epoch": 1.8416376979528777, "grad_norm": 0.43015668406736984, "learning_rate": 1.5924030560463467e-05, "loss": 0.3945, "step": 1192 }, { "epoch": 1.84318269602163, "grad_norm": 0.40875245683215083, "learning_rate": 1.5915332511858386e-05, "loss": 0.4273, "step": 1193 }, { "epoch": 1.8447276940903823, "grad_norm": 0.48678232529343357, "learning_rate": 1.590662757364622e-05, "loss": 0.4022, "step": 1194 }, { "epoch": 1.8462726921591348, "grad_norm": 0.48710434068490177, "learning_rate": 1.589791575596565e-05, "loss": 0.4435, "step": 1195 }, { "epoch": 1.8478176902278873, "grad_norm": 0.47540620535720457, "learning_rate": 1.5889197068963348e-05, "loss": 0.4119, "step": 1196 }, { "epoch": 1.8493626882966396, "grad_norm": 0.5118424582389081, "learning_rate": 1.5880471522794002e-05, "loss": 0.4182, "step": 1197 }, { "epoch": 1.850907686365392, "grad_norm": 0.5117975475744425, "learning_rate": 1.587173912762028e-05, "loss": 0.4291, "step": 1198 }, { "epoch": 1.8524526844341445, "grad_norm": 0.40914787337941433, "learning_rate": 1.5862999893612835e-05, "loss": 0.3813, "step": 1199 }, { "epoch": 1.853997682502897, "grad_norm": 0.4843361735693343, "learning_rate": 1.5854253830950278e-05, "loss": 0.445, "step": 1200 }, { "epoch": 1.8555426805716493, "grad_norm": 0.3909686604074257, "learning_rate": 1.5845500949819182e-05, "loss": 0.4303, "step": 1201 }, { "epoch": 1.8570876786404016, "grad_norm": 0.44102589457978264, "learning_rate": 1.583674126041405e-05, "loss": 0.4294, "step": 1202 }, { "epoch": 1.858632676709154, "grad_norm": 0.4358430193434794, "learning_rate": 1.5827974772937324e-05, "loss": 0.3977, "step": 1203 }, { "epoch": 1.8601776747779066, "grad_norm": 0.4461991481644745, "learning_rate": 1.5819201497599363e-05, "loss": 0.4392, "step": 1204 }, { "epoch": 1.861722672846659, "grad_norm": 0.42261435698545313, "learning_rate": 1.5810421444618427e-05, "loss": 0.4526, "step": 1205 }, { "epoch": 1.8632676709154112, "grad_norm": 0.3402254322481036, "learning_rate": 1.580163462422067e-05, "loss": 0.381, "step": 1206 }, { "epoch": 1.8648126689841638, "grad_norm": 0.4138469158049446, "learning_rate": 1.5792841046640136e-05, "loss": 0.4162, "step": 1207 }, { "epoch": 1.8663576670529163, "grad_norm": 0.37457277171011155, "learning_rate": 1.5784040722118724e-05, "loss": 0.4146, "step": 1208 }, { "epoch": 1.8679026651216686, "grad_norm": 0.4185974755493213, "learning_rate": 1.5775233660906213e-05, "loss": 0.4136, "step": 1209 }, { "epoch": 1.8694476631904209, "grad_norm": 0.40677840126521025, "learning_rate": 1.5766419873260212e-05, "loss": 0.4391, "step": 1210 }, { "epoch": 1.8709926612591734, "grad_norm": 0.4287022074904789, "learning_rate": 1.575759936944616e-05, "loss": 0.386, "step": 1211 }, { "epoch": 1.872537659327926, "grad_norm": 0.44311547782932187, "learning_rate": 1.5748772159737333e-05, "loss": 0.4508, "step": 1212 }, { "epoch": 1.8740826573966782, "grad_norm": 0.47026059885845845, "learning_rate": 1.573993825441481e-05, "loss": 0.3718, "step": 1213 }, { "epoch": 1.8756276554654305, "grad_norm": 0.44693750733916593, "learning_rate": 1.5731097663767473e-05, "loss": 0.4353, "step": 1214 }, { "epoch": 1.877172653534183, "grad_norm": 0.3532672357113444, "learning_rate": 1.5722250398091984e-05, "loss": 0.3704, "step": 1215 }, { "epoch": 1.8787176516029356, "grad_norm": 0.48592958370028755, "learning_rate": 1.571339646769278e-05, "loss": 0.4042, "step": 1216 }, { "epoch": 1.8802626496716879, "grad_norm": 0.3980956030292733, "learning_rate": 1.5704535882882063e-05, "loss": 0.4079, "step": 1217 }, { "epoch": 1.8818076477404402, "grad_norm": 0.4130771410435422, "learning_rate": 1.569566865397979e-05, "loss": 0.413, "step": 1218 }, { "epoch": 1.8833526458091927, "grad_norm": 0.39290901289372004, "learning_rate": 1.5686794791313643e-05, "loss": 0.4219, "step": 1219 }, { "epoch": 1.8848976438779452, "grad_norm": 0.49030977023004885, "learning_rate": 1.5677914305219052e-05, "loss": 0.4554, "step": 1220 }, { "epoch": 1.8864426419466975, "grad_norm": 0.523021478767902, "learning_rate": 1.5669027206039134e-05, "loss": 0.4285, "step": 1221 }, { "epoch": 1.8879876400154498, "grad_norm": 0.39339129324702127, "learning_rate": 1.566013350412473e-05, "loss": 0.3948, "step": 1222 }, { "epoch": 1.8895326380842024, "grad_norm": 0.6633490682794599, "learning_rate": 1.565123320983436e-05, "loss": 0.4195, "step": 1223 }, { "epoch": 1.8910776361529549, "grad_norm": 0.45755548659148687, "learning_rate": 1.564232633353423e-05, "loss": 0.457, "step": 1224 }, { "epoch": 1.8926226342217072, "grad_norm": 0.5676764535120085, "learning_rate": 1.5633412885598206e-05, "loss": 0.4306, "step": 1225 }, { "epoch": 1.8941676322904595, "grad_norm": 0.375617665728306, "learning_rate": 1.562449287640781e-05, "loss": 0.3844, "step": 1226 }, { "epoch": 1.895712630359212, "grad_norm": 0.4879781560405763, "learning_rate": 1.5615566316352203e-05, "loss": 0.4292, "step": 1227 }, { "epoch": 1.8972576284279645, "grad_norm": 0.4473773022344148, "learning_rate": 1.5606633215828184e-05, "loss": 0.4118, "step": 1228 }, { "epoch": 1.8988026264967168, "grad_norm": 0.4906895846209599, "learning_rate": 1.5597693585240158e-05, "loss": 0.4219, "step": 1229 }, { "epoch": 1.9003476245654691, "grad_norm": 0.5858117366027312, "learning_rate": 1.5588747435000144e-05, "loss": 0.4481, "step": 1230 }, { "epoch": 1.9018926226342217, "grad_norm": 0.43998595261659823, "learning_rate": 1.557979477552776e-05, "loss": 0.4239, "step": 1231 }, { "epoch": 1.9034376207029742, "grad_norm": 0.5136977111415142, "learning_rate": 1.5570835617250187e-05, "loss": 0.4398, "step": 1232 }, { "epoch": 1.9049826187717267, "grad_norm": 0.5516205589768617, "learning_rate": 1.5561869970602192e-05, "loss": 0.4111, "step": 1233 }, { "epoch": 1.906527616840479, "grad_norm": 0.4847977235601811, "learning_rate": 1.555289784602609e-05, "loss": 0.3988, "step": 1234 }, { "epoch": 1.9080726149092313, "grad_norm": 0.47670775052947384, "learning_rate": 1.5543919253971747e-05, "loss": 0.4068, "step": 1235 }, { "epoch": 1.9096176129779838, "grad_norm": 0.5280189086362712, "learning_rate": 1.5534934204896554e-05, "loss": 0.4189, "step": 1236 }, { "epoch": 1.9111626110467363, "grad_norm": 0.41085466754669236, "learning_rate": 1.5525942709265432e-05, "loss": 0.4114, "step": 1237 }, { "epoch": 1.9127076091154886, "grad_norm": 0.41300104013917077, "learning_rate": 1.5516944777550798e-05, "loss": 0.4198, "step": 1238 }, { "epoch": 1.914252607184241, "grad_norm": 0.3768370805792257, "learning_rate": 1.550794042023258e-05, "loss": 0.4066, "step": 1239 }, { "epoch": 1.9157976052529935, "grad_norm": 0.4052064630767921, "learning_rate": 1.549892964779818e-05, "loss": 0.4066, "step": 1240 }, { "epoch": 1.917342603321746, "grad_norm": 0.4662924136254279, "learning_rate": 1.5489912470742475e-05, "loss": 0.4435, "step": 1241 }, { "epoch": 1.9188876013904983, "grad_norm": 0.4362606747636451, "learning_rate": 1.5480888899567797e-05, "loss": 0.4393, "step": 1242 }, { "epoch": 1.9204325994592506, "grad_norm": 0.4651922304999439, "learning_rate": 1.5471858944783933e-05, "loss": 0.4367, "step": 1243 }, { "epoch": 1.9219775975280031, "grad_norm": 0.4863725387342473, "learning_rate": 1.5462822616908096e-05, "loss": 0.3961, "step": 1244 }, { "epoch": 1.9235225955967556, "grad_norm": 0.4409380404943186, "learning_rate": 1.545377992646493e-05, "loss": 0.4074, "step": 1245 }, { "epoch": 1.925067593665508, "grad_norm": 0.5015668687283318, "learning_rate": 1.5444730883986484e-05, "loss": 0.4443, "step": 1246 }, { "epoch": 1.9266125917342602, "grad_norm": 0.37548053864129166, "learning_rate": 1.5435675500012212e-05, "loss": 0.3809, "step": 1247 }, { "epoch": 1.9281575898030128, "grad_norm": 0.4027955187117131, "learning_rate": 1.5426613785088945e-05, "loss": 0.4373, "step": 1248 }, { "epoch": 1.9297025878717653, "grad_norm": 0.4444772090477185, "learning_rate": 1.5417545749770894e-05, "loss": 0.4053, "step": 1249 }, { "epoch": 1.9312475859405176, "grad_norm": 0.3356993908290934, "learning_rate": 1.5408471404619623e-05, "loss": 0.4185, "step": 1250 }, { "epoch": 1.93279258400927, "grad_norm": 0.45216644102837233, "learning_rate": 1.5399390760204064e-05, "loss": 0.4382, "step": 1251 }, { "epoch": 1.9343375820780224, "grad_norm": 0.39659986506695455, "learning_rate": 1.539030382710046e-05, "loss": 0.4132, "step": 1252 }, { "epoch": 1.935882580146775, "grad_norm": 0.42507097893473217, "learning_rate": 1.53812106158924e-05, "loss": 0.4013, "step": 1253 }, { "epoch": 1.9374275782155272, "grad_norm": 0.3706839869951866, "learning_rate": 1.5372111137170774e-05, "loss": 0.3958, "step": 1254 }, { "epoch": 1.9389725762842795, "grad_norm": 0.5539726028342381, "learning_rate": 1.5363005401533772e-05, "loss": 0.453, "step": 1255 }, { "epoch": 1.940517574353032, "grad_norm": 0.4317186472128484, "learning_rate": 1.535389341958688e-05, "loss": 0.409, "step": 1256 }, { "epoch": 1.9420625724217846, "grad_norm": 0.49391845388045147, "learning_rate": 1.534477520194285e-05, "loss": 0.4444, "step": 1257 }, { "epoch": 1.943607570490537, "grad_norm": 0.3983615396225619, "learning_rate": 1.53356507592217e-05, "loss": 0.393, "step": 1258 }, { "epoch": 1.9451525685592892, "grad_norm": 0.5728982459230999, "learning_rate": 1.5326520102050693e-05, "loss": 0.4145, "step": 1259 }, { "epoch": 1.9466975666280417, "grad_norm": 0.42529455333828947, "learning_rate": 1.5317383241064344e-05, "loss": 0.4329, "step": 1260 }, { "epoch": 1.9482425646967942, "grad_norm": 0.41272288598070117, "learning_rate": 1.5308240186904382e-05, "loss": 0.3925, "step": 1261 }, { "epoch": 1.9497875627655465, "grad_norm": 0.42937201471943554, "learning_rate": 1.5299090950219748e-05, "loss": 0.4437, "step": 1262 }, { "epoch": 1.9513325608342988, "grad_norm": 0.34675359296194197, "learning_rate": 1.5289935541666595e-05, "loss": 0.3766, "step": 1263 }, { "epoch": 1.9528775589030514, "grad_norm": 0.4415739544977423, "learning_rate": 1.528077397190825e-05, "loss": 0.5182, "step": 1264 }, { "epoch": 1.954422556971804, "grad_norm": 0.36706926469713314, "learning_rate": 1.5271606251615228e-05, "loss": 0.3949, "step": 1265 }, { "epoch": 1.9559675550405562, "grad_norm": 0.47375181784914705, "learning_rate": 1.52624323914652e-05, "loss": 0.4209, "step": 1266 }, { "epoch": 1.9575125531093085, "grad_norm": 0.4538680744612514, "learning_rate": 1.5253252402142989e-05, "loss": 0.4233, "step": 1267 }, { "epoch": 1.959057551178061, "grad_norm": 0.40183510175278914, "learning_rate": 1.5244066294340565e-05, "loss": 0.4201, "step": 1268 }, { "epoch": 1.9606025492468135, "grad_norm": 0.4349192596248898, "learning_rate": 1.5234874078757011e-05, "loss": 0.4375, "step": 1269 }, { "epoch": 1.9621475473155658, "grad_norm": 0.4684407582185239, "learning_rate": 1.5225675766098538e-05, "loss": 0.4201, "step": 1270 }, { "epoch": 1.9636925453843181, "grad_norm": 0.4676977134199685, "learning_rate": 1.5216471367078444e-05, "loss": 0.4327, "step": 1271 }, { "epoch": 1.9652375434530707, "grad_norm": 0.4651364451819135, "learning_rate": 1.5207260892417123e-05, "loss": 0.4058, "step": 1272 }, { "epoch": 1.9667825415218232, "grad_norm": 0.4195519384505539, "learning_rate": 1.519804435284205e-05, "loss": 0.4196, "step": 1273 }, { "epoch": 1.9683275395905755, "grad_norm": 0.5034847226308515, "learning_rate": 1.5188821759087754e-05, "loss": 0.415, "step": 1274 }, { "epoch": 1.9698725376593278, "grad_norm": 0.45715412621213153, "learning_rate": 1.5179593121895822e-05, "loss": 0.4044, "step": 1275 }, { "epoch": 1.9714175357280803, "grad_norm": 0.400344773388348, "learning_rate": 1.5170358452014875e-05, "loss": 0.4238, "step": 1276 }, { "epoch": 1.9729625337968328, "grad_norm": 0.3774795508426065, "learning_rate": 1.5161117760200572e-05, "loss": 0.3959, "step": 1277 }, { "epoch": 1.9745075318655851, "grad_norm": 0.41817123336845213, "learning_rate": 1.5151871057215563e-05, "loss": 0.4425, "step": 1278 }, { "epoch": 1.9760525299343374, "grad_norm": 0.3821156240727613, "learning_rate": 1.5142618353829525e-05, "loss": 0.4049, "step": 1279 }, { "epoch": 1.97759752800309, "grad_norm": 0.461972860672087, "learning_rate": 1.5133359660819108e-05, "loss": 0.4241, "step": 1280 }, { "epoch": 1.9791425260718425, "grad_norm": 0.4906883124179141, "learning_rate": 1.5124094988967939e-05, "loss": 0.4375, "step": 1281 }, { "epoch": 1.9806875241405948, "grad_norm": 0.4061863051387895, "learning_rate": 1.5114824349066615e-05, "loss": 0.3941, "step": 1282 }, { "epoch": 1.982232522209347, "grad_norm": 0.5219158938990639, "learning_rate": 1.5105547751912677e-05, "loss": 0.4077, "step": 1283 }, { "epoch": 1.9837775202780996, "grad_norm": 0.40815694718964124, "learning_rate": 1.5096265208310614e-05, "loss": 0.4302, "step": 1284 }, { "epoch": 1.9853225183468521, "grad_norm": 0.44778106647158383, "learning_rate": 1.5086976729071831e-05, "loss": 0.3937, "step": 1285 }, { "epoch": 1.9868675164156044, "grad_norm": 0.40163147480068184, "learning_rate": 1.5077682325014648e-05, "loss": 0.4108, "step": 1286 }, { "epoch": 1.9884125144843567, "grad_norm": 0.4517967629763744, "learning_rate": 1.5068382006964293e-05, "loss": 0.4317, "step": 1287 }, { "epoch": 1.9899575125531093, "grad_norm": 0.47635160917343544, "learning_rate": 1.5059075785752874e-05, "loss": 0.4019, "step": 1288 }, { "epoch": 1.9915025106218618, "grad_norm": 0.47615460061669673, "learning_rate": 1.5049763672219375e-05, "loss": 0.4392, "step": 1289 }, { "epoch": 1.993047508690614, "grad_norm": 0.43686943884665175, "learning_rate": 1.5040445677209647e-05, "loss": 0.3905, "step": 1290 }, { "epoch": 1.9945925067593664, "grad_norm": 0.4046263122893071, "learning_rate": 1.5031121811576392e-05, "loss": 0.4072, "step": 1291 }, { "epoch": 1.996137504828119, "grad_norm": 0.4816429969549619, "learning_rate": 1.5021792086179142e-05, "loss": 0.4099, "step": 1292 }, { "epoch": 1.9976825028968714, "grad_norm": 0.48351418508216265, "learning_rate": 1.5012456511884262e-05, "loss": 0.4574, "step": 1293 }, { "epoch": 1.9992275009656237, "grad_norm": 0.5270574470254615, "learning_rate": 1.5003115099564923e-05, "loss": 0.393, "step": 1294 }, { "epoch": 2.000772499034376, "grad_norm": 1.0759787731157788, "learning_rate": 1.49937678601011e-05, "loss": 0.7395, "step": 1295 }, { "epoch": 2.0023174971031286, "grad_norm": 0.6426604250753457, "learning_rate": 1.4984414804379555e-05, "loss": 0.3562, "step": 1296 }, { "epoch": 2.003862495171881, "grad_norm": 0.5850065399891953, "learning_rate": 1.4975055943293817e-05, "loss": 0.3683, "step": 1297 }, { "epoch": 2.0054074932406336, "grad_norm": 0.6215215758010988, "learning_rate": 1.496569128774419e-05, "loss": 0.3784, "step": 1298 }, { "epoch": 2.0069524913093857, "grad_norm": 0.4349501123428446, "learning_rate": 1.4956320848637715e-05, "loss": 0.3661, "step": 1299 }, { "epoch": 2.008497489378138, "grad_norm": 0.6614580525361466, "learning_rate": 1.494694463688817e-05, "loss": 0.3796, "step": 1300 }, { "epoch": 2.0100424874468907, "grad_norm": 0.5126760011471372, "learning_rate": 1.4937562663416062e-05, "loss": 0.3816, "step": 1301 }, { "epoch": 2.0115874855156433, "grad_norm": 0.47771980584377577, "learning_rate": 1.492817493914861e-05, "loss": 0.3468, "step": 1302 }, { "epoch": 2.0131324835843953, "grad_norm": 0.6026042336654378, "learning_rate": 1.4918781475019719e-05, "loss": 0.4066, "step": 1303 }, { "epoch": 2.014677481653148, "grad_norm": 0.4139301578281827, "learning_rate": 1.4909382281969995e-05, "loss": 0.3709, "step": 1304 }, { "epoch": 2.0162224797219004, "grad_norm": 0.5918917779669979, "learning_rate": 1.4899977370946705e-05, "loss": 0.3642, "step": 1305 }, { "epoch": 2.017767477790653, "grad_norm": 0.450041652346319, "learning_rate": 1.4890566752903776e-05, "loss": 0.349, "step": 1306 }, { "epoch": 2.019312475859405, "grad_norm": 0.4506785555827132, "learning_rate": 1.4881150438801797e-05, "loss": 0.3721, "step": 1307 }, { "epoch": 2.0208574739281575, "grad_norm": 0.570795595652325, "learning_rate": 1.4871728439607967e-05, "loss": 0.3999, "step": 1308 }, { "epoch": 2.02240247199691, "grad_norm": 0.42308643571061916, "learning_rate": 1.4862300766296125e-05, "loss": 0.3615, "step": 1309 }, { "epoch": 2.0239474700656626, "grad_norm": 0.48653062670462627, "learning_rate": 1.4852867429846716e-05, "loss": 0.3807, "step": 1310 }, { "epoch": 2.0254924681344146, "grad_norm": 0.42650810659547944, "learning_rate": 1.4843428441246768e-05, "loss": 0.3539, "step": 1311 }, { "epoch": 2.027037466203167, "grad_norm": 0.40651676050486135, "learning_rate": 1.4833983811489914e-05, "loss": 0.3558, "step": 1312 }, { "epoch": 2.0285824642719197, "grad_norm": 0.3639439150609093, "learning_rate": 1.4824533551576336e-05, "loss": 0.3321, "step": 1313 }, { "epoch": 2.030127462340672, "grad_norm": 0.5075290307266537, "learning_rate": 1.4815077672512788e-05, "loss": 0.4081, "step": 1314 }, { "epoch": 2.0316724604094243, "grad_norm": 0.3731487161297325, "learning_rate": 1.4805616185312558e-05, "loss": 0.3604, "step": 1315 }, { "epoch": 2.033217458478177, "grad_norm": 0.42259471360322426, "learning_rate": 1.4796149100995477e-05, "loss": 0.3499, "step": 1316 }, { "epoch": 2.0347624565469293, "grad_norm": 0.4566985305376784, "learning_rate": 1.4786676430587884e-05, "loss": 0.398, "step": 1317 }, { "epoch": 2.036307454615682, "grad_norm": 0.42021509303273197, "learning_rate": 1.477719818512263e-05, "loss": 0.3801, "step": 1318 }, { "epoch": 2.037852452684434, "grad_norm": 0.3620059259269264, "learning_rate": 1.4767714375639064e-05, "loss": 0.3266, "step": 1319 }, { "epoch": 2.0393974507531865, "grad_norm": 0.4329279909469547, "learning_rate": 1.4758225013182998e-05, "loss": 0.3903, "step": 1320 }, { "epoch": 2.040942448821939, "grad_norm": 0.3769089147551581, "learning_rate": 1.4748730108806737e-05, "loss": 0.3408, "step": 1321 }, { "epoch": 2.0424874468906915, "grad_norm": 0.3607144203351907, "learning_rate": 1.473922967356902e-05, "loss": 0.3705, "step": 1322 }, { "epoch": 2.0440324449594436, "grad_norm": 0.48006555083301444, "learning_rate": 1.4729723718535034e-05, "loss": 0.3824, "step": 1323 }, { "epoch": 2.045577443028196, "grad_norm": 0.35260660964738766, "learning_rate": 1.47202122547764e-05, "loss": 0.3796, "step": 1324 }, { "epoch": 2.0471224410969486, "grad_norm": 0.4382487148084734, "learning_rate": 1.4710695293371145e-05, "loss": 0.3932, "step": 1325 }, { "epoch": 2.048667439165701, "grad_norm": 0.32038049111582656, "learning_rate": 1.470117284540371e-05, "loss": 0.3198, "step": 1326 }, { "epoch": 2.0502124372344532, "grad_norm": 0.5011484955987627, "learning_rate": 1.4691644921964923e-05, "loss": 0.4311, "step": 1327 }, { "epoch": 2.0517574353032058, "grad_norm": 0.39233383545052114, "learning_rate": 1.468211153415198e-05, "loss": 0.3894, "step": 1328 }, { "epoch": 2.0533024333719583, "grad_norm": 0.4081674305594968, "learning_rate": 1.4672572693068454e-05, "loss": 0.364, "step": 1329 }, { "epoch": 2.054847431440711, "grad_norm": 0.3686773590214129, "learning_rate": 1.4663028409824267e-05, "loss": 0.3855, "step": 1330 }, { "epoch": 2.0563924295094633, "grad_norm": 0.4087788197073645, "learning_rate": 1.465347869553567e-05, "loss": 0.3639, "step": 1331 }, { "epoch": 2.0579374275782154, "grad_norm": 0.37917271821222304, "learning_rate": 1.4643923561325251e-05, "loss": 0.3442, "step": 1332 }, { "epoch": 2.059482425646968, "grad_norm": 0.4027114446240674, "learning_rate": 1.4634363018321904e-05, "loss": 0.3691, "step": 1333 }, { "epoch": 2.0610274237157205, "grad_norm": 0.4155862015510876, "learning_rate": 1.4624797077660823e-05, "loss": 0.3638, "step": 1334 }, { "epoch": 2.062572421784473, "grad_norm": 0.35345402996642744, "learning_rate": 1.4615225750483499e-05, "loss": 0.3617, "step": 1335 }, { "epoch": 2.064117419853225, "grad_norm": 0.4532733796855912, "learning_rate": 1.4605649047937677e-05, "loss": 0.392, "step": 1336 }, { "epoch": 2.0656624179219776, "grad_norm": 0.4224996935221109, "learning_rate": 1.459606698117738e-05, "loss": 0.387, "step": 1337 }, { "epoch": 2.06720741599073, "grad_norm": 0.3404337892790566, "learning_rate": 1.4586479561362872e-05, "loss": 0.3345, "step": 1338 }, { "epoch": 2.0687524140594826, "grad_norm": 0.47441334853308326, "learning_rate": 1.4576886799660647e-05, "loss": 0.3758, "step": 1339 }, { "epoch": 2.0702974121282347, "grad_norm": 0.3816818928456925, "learning_rate": 1.4567288707243435e-05, "loss": 0.3374, "step": 1340 }, { "epoch": 2.0718424101969872, "grad_norm": 0.4027979155460083, "learning_rate": 1.4557685295290156e-05, "loss": 0.3783, "step": 1341 }, { "epoch": 2.0733874082657398, "grad_norm": 0.35945638081905074, "learning_rate": 1.4548076574985941e-05, "loss": 0.3591, "step": 1342 }, { "epoch": 2.0749324063344923, "grad_norm": 0.428838173753213, "learning_rate": 1.4538462557522098e-05, "loss": 0.3742, "step": 1343 }, { "epoch": 2.0764774044032444, "grad_norm": 0.35857306475196454, "learning_rate": 1.4528843254096101e-05, "loss": 0.3658, "step": 1344 }, { "epoch": 2.078022402471997, "grad_norm": 0.38762347547922466, "learning_rate": 1.4519218675911588e-05, "loss": 0.3793, "step": 1345 }, { "epoch": 2.0795674005407494, "grad_norm": 0.41780547141692204, "learning_rate": 1.4509588834178334e-05, "loss": 0.3448, "step": 1346 }, { "epoch": 2.081112398609502, "grad_norm": 0.35468393255465985, "learning_rate": 1.449995374011225e-05, "loss": 0.3703, "step": 1347 }, { "epoch": 2.082657396678254, "grad_norm": 0.41736109626820306, "learning_rate": 1.4490313404935354e-05, "loss": 0.3461, "step": 1348 }, { "epoch": 2.0842023947470065, "grad_norm": 0.44696702503058494, "learning_rate": 1.4480667839875786e-05, "loss": 0.3887, "step": 1349 }, { "epoch": 2.085747392815759, "grad_norm": 0.39881733713625106, "learning_rate": 1.4471017056167762e-05, "loss": 0.3696, "step": 1350 }, { "epoch": 2.0872923908845116, "grad_norm": 0.34752641191324113, "learning_rate": 1.446136106505158e-05, "loss": 0.3385, "step": 1351 }, { "epoch": 2.0888373889532637, "grad_norm": 0.45189573641367403, "learning_rate": 1.4451699877773606e-05, "loss": 0.3797, "step": 1352 }, { "epoch": 2.090382387022016, "grad_norm": 0.37232676934433356, "learning_rate": 1.4442033505586257e-05, "loss": 0.3696, "step": 1353 }, { "epoch": 2.0919273850907687, "grad_norm": 0.41339427190548983, "learning_rate": 1.4432361959747987e-05, "loss": 0.3668, "step": 1354 }, { "epoch": 2.093472383159521, "grad_norm": 0.3509534173916223, "learning_rate": 1.4422685251523278e-05, "loss": 0.3286, "step": 1355 }, { "epoch": 2.0950173812282733, "grad_norm": 0.4370018707975151, "learning_rate": 1.4413003392182623e-05, "loss": 0.3635, "step": 1356 }, { "epoch": 2.096562379297026, "grad_norm": 0.46712091531707495, "learning_rate": 1.4403316393002515e-05, "loss": 0.3819, "step": 1357 }, { "epoch": 2.0981073773657783, "grad_norm": 0.45090084672165176, "learning_rate": 1.4393624265265436e-05, "loss": 0.3646, "step": 1358 }, { "epoch": 2.099652375434531, "grad_norm": 0.5143423350130466, "learning_rate": 1.4383927020259836e-05, "loss": 0.373, "step": 1359 }, { "epoch": 2.101197373503283, "grad_norm": 0.4800964490992476, "learning_rate": 1.4374224669280126e-05, "loss": 0.3663, "step": 1360 }, { "epoch": 2.1027423715720355, "grad_norm": 0.39633825219537294, "learning_rate": 1.4364517223626672e-05, "loss": 0.357, "step": 1361 }, { "epoch": 2.104287369640788, "grad_norm": 0.5043784157862997, "learning_rate": 1.4354804694605761e-05, "loss": 0.3816, "step": 1362 }, { "epoch": 2.1058323677095405, "grad_norm": 0.42918752700146817, "learning_rate": 1.4345087093529612e-05, "loss": 0.3707, "step": 1363 }, { "epoch": 2.1073773657782926, "grad_norm": 0.43937179739768983, "learning_rate": 1.4335364431716347e-05, "loss": 0.3603, "step": 1364 }, { "epoch": 2.108922363847045, "grad_norm": 0.4341973690891753, "learning_rate": 1.4325636720489976e-05, "loss": 0.3739, "step": 1365 }, { "epoch": 2.1104673619157976, "grad_norm": 0.44682831166197123, "learning_rate": 1.4315903971180402e-05, "loss": 0.371, "step": 1366 }, { "epoch": 2.11201235998455, "grad_norm": 0.37105552668369196, "learning_rate": 1.430616619512339e-05, "loss": 0.3497, "step": 1367 }, { "epoch": 2.1135573580533022, "grad_norm": 0.4617967341779264, "learning_rate": 1.4296423403660553e-05, "loss": 0.3548, "step": 1368 }, { "epoch": 2.1151023561220548, "grad_norm": 0.43869109268060746, "learning_rate": 1.4286675608139363e-05, "loss": 0.3886, "step": 1369 }, { "epoch": 2.1166473541908073, "grad_norm": 0.3722703089267696, "learning_rate": 1.4276922819913094e-05, "loss": 0.3563, "step": 1370 }, { "epoch": 2.11819235225956, "grad_norm": 0.4787522503745591, "learning_rate": 1.4267165050340866e-05, "loss": 0.3705, "step": 1371 }, { "epoch": 2.119737350328312, "grad_norm": 0.42037785691079366, "learning_rate": 1.4257402310787577e-05, "loss": 0.3886, "step": 1372 }, { "epoch": 2.1212823483970644, "grad_norm": 0.392966809146565, "learning_rate": 1.424763461262392e-05, "loss": 0.3419, "step": 1373 }, { "epoch": 2.122827346465817, "grad_norm": 0.41803682426888483, "learning_rate": 1.4237861967226368e-05, "loss": 0.38, "step": 1374 }, { "epoch": 2.1243723445345695, "grad_norm": 0.4499047177185132, "learning_rate": 1.4228084385977154e-05, "loss": 0.4071, "step": 1375 }, { "epoch": 2.1259173426033215, "grad_norm": 0.4161025781889693, "learning_rate": 1.4218301880264256e-05, "loss": 0.3394, "step": 1376 }, { "epoch": 2.127462340672074, "grad_norm": 0.4209702233001214, "learning_rate": 1.4208514461481393e-05, "loss": 0.4086, "step": 1377 }, { "epoch": 2.1290073387408266, "grad_norm": 0.4053946490089977, "learning_rate": 1.4198722141028003e-05, "loss": 0.3687, "step": 1378 }, { "epoch": 2.130552336809579, "grad_norm": 0.35901738583970455, "learning_rate": 1.4188924930309232e-05, "loss": 0.3303, "step": 1379 }, { "epoch": 2.132097334878331, "grad_norm": 0.40068734290821084, "learning_rate": 1.4179122840735924e-05, "loss": 0.3553, "step": 1380 }, { "epoch": 2.1336423329470837, "grad_norm": 0.32566922960762856, "learning_rate": 1.4169315883724606e-05, "loss": 0.3429, "step": 1381 }, { "epoch": 2.1351873310158362, "grad_norm": 0.4440137922722412, "learning_rate": 1.415950407069747e-05, "loss": 0.3813, "step": 1382 }, { "epoch": 2.1367323290845888, "grad_norm": 0.411019619509668, "learning_rate": 1.414968741308237e-05, "loss": 0.3739, "step": 1383 }, { "epoch": 2.1382773271533413, "grad_norm": 0.38593634589804976, "learning_rate": 1.4139865922312795e-05, "loss": 0.3503, "step": 1384 }, { "epoch": 2.1398223252220934, "grad_norm": 0.4083794943601374, "learning_rate": 1.4130039609827872e-05, "loss": 0.3828, "step": 1385 }, { "epoch": 2.141367323290846, "grad_norm": 0.3704417490037932, "learning_rate": 1.4120208487072338e-05, "loss": 0.3544, "step": 1386 }, { "epoch": 2.1429123213595984, "grad_norm": 0.3943305722395482, "learning_rate": 1.411037256549653e-05, "loss": 0.3746, "step": 1387 }, { "epoch": 2.1444573194283505, "grad_norm": 0.3870364756251109, "learning_rate": 1.4100531856556382e-05, "loss": 0.3436, "step": 1388 }, { "epoch": 2.146002317497103, "grad_norm": 0.36245903233524757, "learning_rate": 1.4090686371713403e-05, "loss": 0.3582, "step": 1389 }, { "epoch": 2.1475473155658555, "grad_norm": 0.36258117988797456, "learning_rate": 1.408083612243465e-05, "loss": 0.3505, "step": 1390 }, { "epoch": 2.149092313634608, "grad_norm": 0.41694209665316695, "learning_rate": 1.4070981120192753e-05, "loss": 0.3786, "step": 1391 }, { "epoch": 2.1506373117033606, "grad_norm": 0.37711532935502623, "learning_rate": 1.4061121376465856e-05, "loss": 0.3977, "step": 1392 }, { "epoch": 2.1521823097721127, "grad_norm": 0.3744748043759873, "learning_rate": 1.405125690273764e-05, "loss": 0.3615, "step": 1393 }, { "epoch": 2.153727307840865, "grad_norm": 0.47534941024017124, "learning_rate": 1.4041387710497288e-05, "loss": 0.3594, "step": 1394 }, { "epoch": 2.1552723059096177, "grad_norm": 0.4446796652825382, "learning_rate": 1.4031513811239481e-05, "loss": 0.3693, "step": 1395 }, { "epoch": 2.15681730397837, "grad_norm": 0.46495215861585426, "learning_rate": 1.402163521646438e-05, "loss": 0.3827, "step": 1396 }, { "epoch": 2.1583623020471223, "grad_norm": 0.4751427703471162, "learning_rate": 1.401175193767762e-05, "loss": 0.381, "step": 1397 }, { "epoch": 2.159907300115875, "grad_norm": 0.43428244519518505, "learning_rate": 1.4001863986390283e-05, "loss": 0.37, "step": 1398 }, { "epoch": 2.1614522981846274, "grad_norm": 0.3901193541544352, "learning_rate": 1.3991971374118905e-05, "loss": 0.341, "step": 1399 }, { "epoch": 2.16299729625338, "grad_norm": 0.5295000413737412, "learning_rate": 1.3982074112385438e-05, "loss": 0.3699, "step": 1400 }, { "epoch": 2.164542294322132, "grad_norm": 0.37290745085815785, "learning_rate": 1.3972172212717254e-05, "loss": 0.3774, "step": 1401 }, { "epoch": 2.1660872923908845, "grad_norm": 0.48647902246630303, "learning_rate": 1.3962265686647134e-05, "loss": 0.3624, "step": 1402 }, { "epoch": 2.167632290459637, "grad_norm": 0.43451486713078374, "learning_rate": 1.3952354545713238e-05, "loss": 0.3791, "step": 1403 }, { "epoch": 2.1691772885283895, "grad_norm": 0.38676191071603455, "learning_rate": 1.3942438801459105e-05, "loss": 0.3577, "step": 1404 }, { "epoch": 2.1707222865971416, "grad_norm": 0.4079485216333912, "learning_rate": 1.393251846543363e-05, "loss": 0.3742, "step": 1405 }, { "epoch": 2.172267284665894, "grad_norm": 0.4245194977963728, "learning_rate": 1.3922593549191067e-05, "loss": 0.3438, "step": 1406 }, { "epoch": 2.1738122827346467, "grad_norm": 0.4710108240998202, "learning_rate": 1.3912664064290996e-05, "loss": 0.3804, "step": 1407 }, { "epoch": 2.175357280803399, "grad_norm": 0.35032978198671444, "learning_rate": 1.390273002229832e-05, "loss": 0.3495, "step": 1408 }, { "epoch": 2.1769022788721513, "grad_norm": 0.47043752547469747, "learning_rate": 1.3892791434783252e-05, "loss": 0.3982, "step": 1409 }, { "epoch": 2.178447276940904, "grad_norm": 0.3395709394993951, "learning_rate": 1.3882848313321295e-05, "loss": 0.368, "step": 1410 }, { "epoch": 2.1799922750096563, "grad_norm": 0.3921345171454981, "learning_rate": 1.3872900669493236e-05, "loss": 0.3394, "step": 1411 }, { "epoch": 2.181537273078409, "grad_norm": 0.3603669422232152, "learning_rate": 1.3862948514885127e-05, "loss": 0.3818, "step": 1412 }, { "epoch": 2.183082271147161, "grad_norm": 0.4586326523644426, "learning_rate": 1.3852991861088276e-05, "loss": 0.3781, "step": 1413 }, { "epoch": 2.1846272692159134, "grad_norm": 0.33363207179753945, "learning_rate": 1.3843030719699233e-05, "loss": 0.3517, "step": 1414 }, { "epoch": 2.186172267284666, "grad_norm": 0.4664591651148466, "learning_rate": 1.3833065102319765e-05, "loss": 0.3637, "step": 1415 }, { "epoch": 2.1877172653534185, "grad_norm": 0.36525066517370136, "learning_rate": 1.3823095020556869e-05, "loss": 0.3644, "step": 1416 }, { "epoch": 2.1892622634221706, "grad_norm": 0.4186672176861902, "learning_rate": 1.3813120486022719e-05, "loss": 0.3642, "step": 1417 }, { "epoch": 2.190807261490923, "grad_norm": 0.353099704191407, "learning_rate": 1.3803141510334698e-05, "loss": 0.3714, "step": 1418 }, { "epoch": 2.1923522595596756, "grad_norm": 0.40659281356024, "learning_rate": 1.379315810511535e-05, "loss": 0.3629, "step": 1419 }, { "epoch": 2.193897257628428, "grad_norm": 0.334897947542878, "learning_rate": 1.3783170281992378e-05, "loss": 0.375, "step": 1420 }, { "epoch": 2.19544225569718, "grad_norm": 0.38754183449297214, "learning_rate": 1.377317805259863e-05, "loss": 0.3783, "step": 1421 }, { "epoch": 2.1969872537659327, "grad_norm": 0.33152437892695147, "learning_rate": 1.3763181428572092e-05, "loss": 0.3336, "step": 1422 }, { "epoch": 2.1985322518346853, "grad_norm": 0.34925043302948444, "learning_rate": 1.3753180421555863e-05, "loss": 0.3539, "step": 1423 }, { "epoch": 2.200077249903438, "grad_norm": 0.38985650737074445, "learning_rate": 1.3743175043198147e-05, "loss": 0.4065, "step": 1424 }, { "epoch": 2.20162224797219, "grad_norm": 0.38001782793988226, "learning_rate": 1.3733165305152242e-05, "loss": 0.4034, "step": 1425 }, { "epoch": 2.2031672460409424, "grad_norm": 0.3702474349989427, "learning_rate": 1.3723151219076526e-05, "loss": 0.3525, "step": 1426 }, { "epoch": 2.204712244109695, "grad_norm": 0.39762206754927837, "learning_rate": 1.3713132796634434e-05, "loss": 0.3997, "step": 1427 }, { "epoch": 2.2062572421784474, "grad_norm": 0.3818956247443618, "learning_rate": 1.3703110049494454e-05, "loss": 0.3494, "step": 1428 }, { "epoch": 2.2078022402471995, "grad_norm": 0.40692580884582075, "learning_rate": 1.3693082989330115e-05, "loss": 0.3652, "step": 1429 }, { "epoch": 2.209347238315952, "grad_norm": 0.38065068978162814, "learning_rate": 1.368305162781997e-05, "loss": 0.3412, "step": 1430 }, { "epoch": 2.2108922363847046, "grad_norm": 0.41461988930560145, "learning_rate": 1.367301597664757e-05, "loss": 0.358, "step": 1431 }, { "epoch": 2.212437234453457, "grad_norm": 0.3696473375669328, "learning_rate": 1.3662976047501474e-05, "loss": 0.3702, "step": 1432 }, { "epoch": 2.213982232522209, "grad_norm": 0.359029225480873, "learning_rate": 1.3652931852075227e-05, "loss": 0.3537, "step": 1433 }, { "epoch": 2.2155272305909617, "grad_norm": 0.4114894284425776, "learning_rate": 1.3642883402067326e-05, "loss": 0.3998, "step": 1434 }, { "epoch": 2.217072228659714, "grad_norm": 0.3700464526175181, "learning_rate": 1.363283070918124e-05, "loss": 0.3536, "step": 1435 }, { "epoch": 2.2186172267284667, "grad_norm": 0.39100078723026666, "learning_rate": 1.3622773785125371e-05, "loss": 0.3587, "step": 1436 }, { "epoch": 2.220162224797219, "grad_norm": 0.45883521433582575, "learning_rate": 1.3612712641613053e-05, "loss": 0.3886, "step": 1437 }, { "epoch": 2.2217072228659713, "grad_norm": 0.3368867045992942, "learning_rate": 1.3602647290362528e-05, "loss": 0.3658, "step": 1438 }, { "epoch": 2.223252220934724, "grad_norm": 0.43359597364473584, "learning_rate": 1.359257774309695e-05, "loss": 0.3688, "step": 1439 }, { "epoch": 2.2247972190034764, "grad_norm": 0.39772827401684735, "learning_rate": 1.358250401154435e-05, "loss": 0.3486, "step": 1440 }, { "epoch": 2.2263422170722285, "grad_norm": 0.39574297156552674, "learning_rate": 1.3572426107437632e-05, "loss": 0.3712, "step": 1441 }, { "epoch": 2.227887215140981, "grad_norm": 0.40449422272862884, "learning_rate": 1.3562344042514572e-05, "loss": 0.3633, "step": 1442 }, { "epoch": 2.2294322132097335, "grad_norm": 0.4407856586288068, "learning_rate": 1.3552257828517779e-05, "loss": 0.3832, "step": 1443 }, { "epoch": 2.230977211278486, "grad_norm": 0.37009374168708414, "learning_rate": 1.3542167477194703e-05, "loss": 0.3726, "step": 1444 }, { "epoch": 2.2325222093472386, "grad_norm": 0.3861429399744957, "learning_rate": 1.3532073000297603e-05, "loss": 0.3536, "step": 1445 }, { "epoch": 2.2340672074159906, "grad_norm": 0.37097508015820607, "learning_rate": 1.3521974409583553e-05, "loss": 0.3698, "step": 1446 }, { "epoch": 2.235612205484743, "grad_norm": 0.3955047378794769, "learning_rate": 1.3511871716814416e-05, "loss": 0.3715, "step": 1447 }, { "epoch": 2.2371572035534957, "grad_norm": 0.36206180305120106, "learning_rate": 1.3501764933756829e-05, "loss": 0.3734, "step": 1448 }, { "epoch": 2.2387022016222478, "grad_norm": 0.39669951955998534, "learning_rate": 1.3491654072182194e-05, "loss": 0.3701, "step": 1449 }, { "epoch": 2.2402471996910003, "grad_norm": 0.3769240803861721, "learning_rate": 1.3481539143866669e-05, "loss": 0.3446, "step": 1450 }, { "epoch": 2.241792197759753, "grad_norm": 0.4769873611258111, "learning_rate": 1.3471420160591142e-05, "loss": 0.3982, "step": 1451 }, { "epoch": 2.2433371958285053, "grad_norm": 0.438479106616143, "learning_rate": 1.3461297134141221e-05, "loss": 0.4132, "step": 1452 }, { "epoch": 2.244882193897258, "grad_norm": 0.3454166783113564, "learning_rate": 1.3451170076307234e-05, "loss": 0.3701, "step": 1453 }, { "epoch": 2.24642719196601, "grad_norm": 0.43304648268053036, "learning_rate": 1.3441038998884199e-05, "loss": 0.3831, "step": 1454 }, { "epoch": 2.2479721900347625, "grad_norm": 0.3184518368707185, "learning_rate": 1.3430903913671806e-05, "loss": 0.3194, "step": 1455 }, { "epoch": 2.249517188103515, "grad_norm": 0.45760229534218727, "learning_rate": 1.3420764832474433e-05, "loss": 0.4064, "step": 1456 }, { "epoch": 2.251062186172267, "grad_norm": 0.33300741428622593, "learning_rate": 1.3410621767101093e-05, "loss": 0.349, "step": 1457 }, { "epoch": 2.2526071842410196, "grad_norm": 0.3622091907516793, "learning_rate": 1.3400474729365457e-05, "loss": 0.3589, "step": 1458 }, { "epoch": 2.254152182309772, "grad_norm": 0.38910358220401364, "learning_rate": 1.33903237310858e-05, "loss": 0.3887, "step": 1459 }, { "epoch": 2.2556971803785246, "grad_norm": 0.34828129536463026, "learning_rate": 1.3380168784085028e-05, "loss": 0.3859, "step": 1460 }, { "epoch": 2.257242178447277, "grad_norm": 0.32501753204521494, "learning_rate": 1.3370009900190647e-05, "loss": 0.3172, "step": 1461 }, { "epoch": 2.2587871765160292, "grad_norm": 0.3667675380633063, "learning_rate": 1.3359847091234734e-05, "loss": 0.3794, "step": 1462 }, { "epoch": 2.2603321745847818, "grad_norm": 0.43081353248471516, "learning_rate": 1.3349680369053948e-05, "loss": 0.3997, "step": 1463 }, { "epoch": 2.2618771726535343, "grad_norm": 0.3740436843549171, "learning_rate": 1.3339509745489507e-05, "loss": 0.3366, "step": 1464 }, { "epoch": 2.2634221707222864, "grad_norm": 0.4153882363960919, "learning_rate": 1.3329335232387169e-05, "loss": 0.3691, "step": 1465 }, { "epoch": 2.264967168791039, "grad_norm": 0.36298553325964145, "learning_rate": 1.3319156841597216e-05, "loss": 0.3699, "step": 1466 }, { "epoch": 2.2665121668597914, "grad_norm": 0.36414337877061437, "learning_rate": 1.3308974584974462e-05, "loss": 0.3747, "step": 1467 }, { "epoch": 2.268057164928544, "grad_norm": 0.3817063809769564, "learning_rate": 1.3298788474378208e-05, "loss": 0.3954, "step": 1468 }, { "epoch": 2.2696021629972964, "grad_norm": 0.35983719089230964, "learning_rate": 1.3288598521672249e-05, "loss": 0.3557, "step": 1469 }, { "epoch": 2.2711471610660485, "grad_norm": 0.41833029573205366, "learning_rate": 1.3278404738724864e-05, "loss": 0.3832, "step": 1470 }, { "epoch": 2.272692159134801, "grad_norm": 0.3479456697466919, "learning_rate": 1.3268207137408777e-05, "loss": 0.3468, "step": 1471 }, { "epoch": 2.2742371572035536, "grad_norm": 0.38118731511897375, "learning_rate": 1.3258005729601178e-05, "loss": 0.3589, "step": 1472 }, { "epoch": 2.275782155272306, "grad_norm": 0.3512428631756438, "learning_rate": 1.3247800527183668e-05, "loss": 0.3711, "step": 1473 }, { "epoch": 2.277327153341058, "grad_norm": 0.36075107343858775, "learning_rate": 1.3237591542042286e-05, "loss": 0.3885, "step": 1474 }, { "epoch": 2.2788721514098107, "grad_norm": 0.3838403745110564, "learning_rate": 1.322737878606747e-05, "loss": 0.366, "step": 1475 }, { "epoch": 2.280417149478563, "grad_norm": 0.36213700045454317, "learning_rate": 1.3217162271154048e-05, "loss": 0.3921, "step": 1476 }, { "epoch": 2.2819621475473157, "grad_norm": 0.3223962554134506, "learning_rate": 1.320694200920123e-05, "loss": 0.3338, "step": 1477 }, { "epoch": 2.283507145616068, "grad_norm": 0.41486725710658423, "learning_rate": 1.3196718012112588e-05, "loss": 0.3688, "step": 1478 }, { "epoch": 2.2850521436848203, "grad_norm": 0.3772957002868288, "learning_rate": 1.3186490291796046e-05, "loss": 0.3725, "step": 1479 }, { "epoch": 2.286597141753573, "grad_norm": 0.3785028401147254, "learning_rate": 1.317625886016386e-05, "loss": 0.3493, "step": 1480 }, { "epoch": 2.2881421398223254, "grad_norm": 0.49821839771056026, "learning_rate": 1.3166023729132616e-05, "loss": 0.3744, "step": 1481 }, { "epoch": 2.2896871378910775, "grad_norm": 0.3840029405558574, "learning_rate": 1.3155784910623202e-05, "loss": 0.3776, "step": 1482 }, { "epoch": 2.29123213595983, "grad_norm": 0.37619192947931485, "learning_rate": 1.3145542416560803e-05, "loss": 0.3397, "step": 1483 }, { "epoch": 2.2927771340285825, "grad_norm": 0.3761329229497339, "learning_rate": 1.3135296258874885e-05, "loss": 0.3707, "step": 1484 }, { "epoch": 2.294322132097335, "grad_norm": 0.44154051787961873, "learning_rate": 1.3125046449499184e-05, "loss": 0.4026, "step": 1485 }, { "epoch": 2.295867130166087, "grad_norm": 0.4056703268311263, "learning_rate": 1.3114793000371688e-05, "loss": 0.3643, "step": 1486 }, { "epoch": 2.2974121282348396, "grad_norm": 0.43074578884948644, "learning_rate": 1.3104535923434613e-05, "loss": 0.4114, "step": 1487 }, { "epoch": 2.298957126303592, "grad_norm": 0.4255419242998696, "learning_rate": 1.309427523063442e-05, "loss": 0.3342, "step": 1488 }, { "epoch": 2.3005021243723447, "grad_norm": 0.4253368995388549, "learning_rate": 1.3084010933921768e-05, "loss": 0.3925, "step": 1489 }, { "epoch": 2.3020471224410968, "grad_norm": 0.42044575708827026, "learning_rate": 1.3073743045251515e-05, "loss": 0.3508, "step": 1490 }, { "epoch": 2.3035921205098493, "grad_norm": 0.4592537221306995, "learning_rate": 1.3063471576582707e-05, "loss": 0.391, "step": 1491 }, { "epoch": 2.305137118578602, "grad_norm": 0.4027595336443124, "learning_rate": 1.3053196539878554e-05, "loss": 0.3499, "step": 1492 }, { "epoch": 2.3066821166473543, "grad_norm": 0.4185054100906129, "learning_rate": 1.3042917947106432e-05, "loss": 0.3715, "step": 1493 }, { "epoch": 2.3082271147161064, "grad_norm": 0.3823836000830192, "learning_rate": 1.3032635810237841e-05, "loss": 0.3535, "step": 1494 }, { "epoch": 2.309772112784859, "grad_norm": 0.4868879417526453, "learning_rate": 1.3022350141248428e-05, "loss": 0.3697, "step": 1495 }, { "epoch": 2.3113171108536115, "grad_norm": 0.5987961287043786, "learning_rate": 1.3012060952117943e-05, "loss": 0.3801, "step": 1496 }, { "epoch": 2.312862108922364, "grad_norm": 0.4341803773899347, "learning_rate": 1.3001768254830234e-05, "loss": 0.3727, "step": 1497 }, { "epoch": 2.314407106991116, "grad_norm": 0.3959792828320748, "learning_rate": 1.2991472061373245e-05, "loss": 0.3364, "step": 1498 }, { "epoch": 2.3159521050598686, "grad_norm": 0.3630080413235252, "learning_rate": 1.298117238373898e-05, "loss": 0.3448, "step": 1499 }, { "epoch": 2.317497103128621, "grad_norm": 0.4158979668924574, "learning_rate": 1.2970869233923517e-05, "loss": 0.3789, "step": 1500 }, { "epoch": 2.3190421011973736, "grad_norm": 0.38735876788463375, "learning_rate": 1.2960562623926958e-05, "loss": 0.3499, "step": 1501 }, { "epoch": 2.3205870992661257, "grad_norm": 0.397914896315446, "learning_rate": 1.2950252565753447e-05, "loss": 0.3682, "step": 1502 }, { "epoch": 2.3221320973348782, "grad_norm": 0.49715437762269304, "learning_rate": 1.293993907141115e-05, "loss": 0.4185, "step": 1503 }, { "epoch": 2.3236770954036308, "grad_norm": 0.34734098096138294, "learning_rate": 1.2929622152912218e-05, "loss": 0.3342, "step": 1504 }, { "epoch": 2.3252220934723833, "grad_norm": 0.5302460876060274, "learning_rate": 1.2919301822272803e-05, "loss": 0.3905, "step": 1505 }, { "epoch": 2.326767091541136, "grad_norm": 0.3875427180586835, "learning_rate": 1.290897809151303e-05, "loss": 0.3811, "step": 1506 }, { "epoch": 2.328312089609888, "grad_norm": 0.42414819109941143, "learning_rate": 1.2898650972656984e-05, "loss": 0.3805, "step": 1507 }, { "epoch": 2.3298570876786404, "grad_norm": 0.3740318133183141, "learning_rate": 1.2888320477732686e-05, "loss": 0.3492, "step": 1508 }, { "epoch": 2.331402085747393, "grad_norm": 0.47966746218141526, "learning_rate": 1.2877986618772102e-05, "loss": 0.4223, "step": 1509 }, { "epoch": 2.332947083816145, "grad_norm": 0.3227496393571366, "learning_rate": 1.2867649407811113e-05, "loss": 0.3256, "step": 1510 }, { "epoch": 2.3344920818848975, "grad_norm": 0.4705238828087119, "learning_rate": 1.2857308856889499e-05, "loss": 0.3528, "step": 1511 }, { "epoch": 2.33603707995365, "grad_norm": 0.37272711509665085, "learning_rate": 1.2846964978050934e-05, "loss": 0.3562, "step": 1512 }, { "epoch": 2.3375820780224026, "grad_norm": 0.3496175735336385, "learning_rate": 1.2836617783342968e-05, "loss": 0.3493, "step": 1513 }, { "epoch": 2.339127076091155, "grad_norm": 0.4241492387658265, "learning_rate": 1.2826267284817015e-05, "loss": 0.3788, "step": 1514 }, { "epoch": 2.340672074159907, "grad_norm": 0.3444610135870008, "learning_rate": 1.2815913494528329e-05, "loss": 0.3724, "step": 1515 }, { "epoch": 2.3422170722286597, "grad_norm": 0.4237702873885263, "learning_rate": 1.2805556424536006e-05, "loss": 0.3537, "step": 1516 }, { "epoch": 2.3437620702974122, "grad_norm": 0.3386918272881773, "learning_rate": 1.2795196086902963e-05, "loss": 0.3439, "step": 1517 }, { "epoch": 2.3453070683661643, "grad_norm": 0.36390778443502575, "learning_rate": 1.278483249369591e-05, "loss": 0.399, "step": 1518 }, { "epoch": 2.346852066434917, "grad_norm": 0.46486398687069375, "learning_rate": 1.2774465656985366e-05, "loss": 0.3888, "step": 1519 }, { "epoch": 2.3483970645036694, "grad_norm": 0.36511103256892335, "learning_rate": 1.276409558884562e-05, "loss": 0.3522, "step": 1520 }, { "epoch": 2.349942062572422, "grad_norm": 0.4240435494678701, "learning_rate": 1.2753722301354719e-05, "loss": 0.3701, "step": 1521 }, { "epoch": 2.3514870606411744, "grad_norm": 0.35595301513538447, "learning_rate": 1.2743345806594465e-05, "loss": 0.3538, "step": 1522 }, { "epoch": 2.3530320587099265, "grad_norm": 0.3392303921144407, "learning_rate": 1.2732966116650398e-05, "loss": 0.3747, "step": 1523 }, { "epoch": 2.354577056778679, "grad_norm": 0.4180615550439474, "learning_rate": 1.2722583243611779e-05, "loss": 0.3771, "step": 1524 }, { "epoch": 2.3561220548474315, "grad_norm": 0.3326992917791569, "learning_rate": 1.2712197199571567e-05, "loss": 0.3779, "step": 1525 }, { "epoch": 2.3576670529161836, "grad_norm": 0.37638281356692926, "learning_rate": 1.2701807996626428e-05, "loss": 0.3406, "step": 1526 }, { "epoch": 2.359212050984936, "grad_norm": 0.4467309014837173, "learning_rate": 1.26914156468767e-05, "loss": 0.3854, "step": 1527 }, { "epoch": 2.3607570490536887, "grad_norm": 0.3651961202304041, "learning_rate": 1.2681020162426381e-05, "loss": 0.3367, "step": 1528 }, { "epoch": 2.362302047122441, "grad_norm": 0.4234804791183684, "learning_rate": 1.267062155538313e-05, "loss": 0.3746, "step": 1529 }, { "epoch": 2.3638470451911937, "grad_norm": 0.4144698550641894, "learning_rate": 1.2660219837858233e-05, "loss": 0.3636, "step": 1530 }, { "epoch": 2.365392043259946, "grad_norm": 0.43121758198157845, "learning_rate": 1.264981502196662e-05, "loss": 0.4111, "step": 1531 }, { "epoch": 2.3669370413286983, "grad_norm": 0.4051484129046985, "learning_rate": 1.2639407119826797e-05, "loss": 0.3619, "step": 1532 }, { "epoch": 2.368482039397451, "grad_norm": 0.42242691648430375, "learning_rate": 1.2628996143560888e-05, "loss": 0.3648, "step": 1533 }, { "epoch": 2.3700270374662034, "grad_norm": 0.43706388056113105, "learning_rate": 1.261858210529459e-05, "loss": 0.3801, "step": 1534 }, { "epoch": 2.3715720355349554, "grad_norm": 0.4133203002744403, "learning_rate": 1.2608165017157172e-05, "loss": 0.3759, "step": 1535 }, { "epoch": 2.373117033603708, "grad_norm": 0.3957795736861924, "learning_rate": 1.2597744891281442e-05, "loss": 0.343, "step": 1536 }, { "epoch": 2.3746620316724605, "grad_norm": 0.46289661156132217, "learning_rate": 1.2587321739803765e-05, "loss": 0.404, "step": 1537 }, { "epoch": 2.376207029741213, "grad_norm": 0.3738631079589091, "learning_rate": 1.2576895574864012e-05, "loss": 0.3572, "step": 1538 }, { "epoch": 2.377752027809965, "grad_norm": 0.36969186353423017, "learning_rate": 1.2566466408605571e-05, "loss": 0.3618, "step": 1539 }, { "epoch": 2.3792970258787176, "grad_norm": 0.39404381573595004, "learning_rate": 1.2556034253175328e-05, "loss": 0.3843, "step": 1540 }, { "epoch": 2.38084202394747, "grad_norm": 0.41086107205233785, "learning_rate": 1.2545599120723653e-05, "loss": 0.4059, "step": 1541 }, { "epoch": 2.3823870220162227, "grad_norm": 0.35312462026705205, "learning_rate": 1.2535161023404372e-05, "loss": 0.3381, "step": 1542 }, { "epoch": 2.3839320200849747, "grad_norm": 0.41538713905498775, "learning_rate": 1.2524719973374773e-05, "loss": 0.4071, "step": 1543 }, { "epoch": 2.3854770181537273, "grad_norm": 0.4041678593682328, "learning_rate": 1.2514275982795581e-05, "loss": 0.3592, "step": 1544 }, { "epoch": 2.38702201622248, "grad_norm": 0.3922538471708698, "learning_rate": 1.2503829063830951e-05, "loss": 0.3444, "step": 1545 }, { "epoch": 2.3885670142912323, "grad_norm": 0.40147497637387947, "learning_rate": 1.2493379228648435e-05, "loss": 0.3956, "step": 1546 }, { "epoch": 2.3901120123599844, "grad_norm": 0.5301264133275635, "learning_rate": 1.2482926489418994e-05, "loss": 0.3823, "step": 1547 }, { "epoch": 2.391657010428737, "grad_norm": 0.3591143598375493, "learning_rate": 1.247247085831697e-05, "loss": 0.3635, "step": 1548 }, { "epoch": 2.3932020084974894, "grad_norm": 0.4700938168066273, "learning_rate": 1.2462012347520068e-05, "loss": 0.3649, "step": 1549 }, { "epoch": 2.394747006566242, "grad_norm": 0.4692489142515503, "learning_rate": 1.2451550969209347e-05, "loss": 0.378, "step": 1550 }, { "epoch": 2.396292004634994, "grad_norm": 0.44131343406676554, "learning_rate": 1.2441086735569217e-05, "loss": 0.376, "step": 1551 }, { "epoch": 2.3978370027037466, "grad_norm": 0.39291529103239475, "learning_rate": 1.2430619658787398e-05, "loss": 0.342, "step": 1552 }, { "epoch": 2.399382000772499, "grad_norm": 0.4276829641349647, "learning_rate": 1.2420149751054928e-05, "loss": 0.3746, "step": 1553 }, { "epoch": 2.4009269988412516, "grad_norm": 0.38923221241587547, "learning_rate": 1.2409677024566145e-05, "loss": 0.3493, "step": 1554 }, { "epoch": 2.4024719969100037, "grad_norm": 0.45144647188994214, "learning_rate": 1.2399201491518668e-05, "loss": 0.3618, "step": 1555 }, { "epoch": 2.404016994978756, "grad_norm": 0.3622136349830833, "learning_rate": 1.2388723164113387e-05, "loss": 0.3643, "step": 1556 }, { "epoch": 2.4055619930475087, "grad_norm": 0.46375987246182593, "learning_rate": 1.2378242054554437e-05, "loss": 0.3767, "step": 1557 }, { "epoch": 2.4071069911162613, "grad_norm": 0.3753801451204473, "learning_rate": 1.2367758175049205e-05, "loss": 0.3316, "step": 1558 }, { "epoch": 2.4086519891850133, "grad_norm": 0.4176092707195243, "learning_rate": 1.2357271537808305e-05, "loss": 0.3978, "step": 1559 }, { "epoch": 2.410196987253766, "grad_norm": 0.38497041739970334, "learning_rate": 1.2346782155045545e-05, "loss": 0.3567, "step": 1560 }, { "epoch": 2.4117419853225184, "grad_norm": 0.40469089805379294, "learning_rate": 1.233629003897795e-05, "loss": 0.3691, "step": 1561 }, { "epoch": 2.413286983391271, "grad_norm": 0.397008354054557, "learning_rate": 1.232579520182573e-05, "loss": 0.3406, "step": 1562 }, { "epoch": 2.414831981460023, "grad_norm": 0.4327180998686038, "learning_rate": 1.2315297655812245e-05, "loss": 0.391, "step": 1563 }, { "epoch": 2.4163769795287755, "grad_norm": 0.3794533017096521, "learning_rate": 1.230479741316402e-05, "loss": 0.363, "step": 1564 }, { "epoch": 2.417921977597528, "grad_norm": 0.3979890553187387, "learning_rate": 1.229429448611073e-05, "loss": 0.3648, "step": 1565 }, { "epoch": 2.4194669756662806, "grad_norm": 0.36807844953169777, "learning_rate": 1.2283788886885164e-05, "loss": 0.3588, "step": 1566 }, { "epoch": 2.421011973735033, "grad_norm": 0.424778522735063, "learning_rate": 1.2273280627723229e-05, "loss": 0.3974, "step": 1567 }, { "epoch": 2.422556971803785, "grad_norm": 0.3607302488017555, "learning_rate": 1.2262769720863928e-05, "loss": 0.355, "step": 1568 }, { "epoch": 2.4241019698725377, "grad_norm": 0.46700358665080766, "learning_rate": 1.2252256178549347e-05, "loss": 0.3815, "step": 1569 }, { "epoch": 2.42564696794129, "grad_norm": 0.4428941894673897, "learning_rate": 1.2241740013024646e-05, "loss": 0.3827, "step": 1570 }, { "epoch": 2.4271919660100423, "grad_norm": 0.3974770139024617, "learning_rate": 1.2231221236538036e-05, "loss": 0.3897, "step": 1571 }, { "epoch": 2.428736964078795, "grad_norm": 0.3906881021857222, "learning_rate": 1.2220699861340767e-05, "loss": 0.3812, "step": 1572 }, { "epoch": 2.4302819621475473, "grad_norm": 0.4029443524451781, "learning_rate": 1.221017589968713e-05, "loss": 0.3451, "step": 1573 }, { "epoch": 2.4318269602163, "grad_norm": 0.43273727386622335, "learning_rate": 1.2199649363834403e-05, "loss": 0.3575, "step": 1574 }, { "epoch": 2.4333719582850524, "grad_norm": 0.37302268121258025, "learning_rate": 1.2189120266042882e-05, "loss": 0.3748, "step": 1575 }, { "epoch": 2.4349169563538045, "grad_norm": 0.4275209169090592, "learning_rate": 1.2178588618575848e-05, "loss": 0.3815, "step": 1576 }, { "epoch": 2.436461954422557, "grad_norm": 0.3319999626135359, "learning_rate": 1.2168054433699538e-05, "loss": 0.3528, "step": 1577 }, { "epoch": 2.4380069524913095, "grad_norm": 0.4067361109698382, "learning_rate": 1.2157517723683154e-05, "loss": 0.3568, "step": 1578 }, { "epoch": 2.4395519505600616, "grad_norm": 0.4433421917928247, "learning_rate": 1.2146978500798835e-05, "loss": 0.389, "step": 1579 }, { "epoch": 2.441096948628814, "grad_norm": 0.3705496249240087, "learning_rate": 1.2136436777321654e-05, "loss": 0.3665, "step": 1580 }, { "epoch": 2.4426419466975666, "grad_norm": 0.3291281731979841, "learning_rate": 1.2125892565529584e-05, "loss": 0.3627, "step": 1581 }, { "epoch": 2.444186944766319, "grad_norm": 0.37793609578429166, "learning_rate": 1.21153458777035e-05, "loss": 0.3671, "step": 1582 }, { "epoch": 2.4457319428350717, "grad_norm": 0.4001058175236968, "learning_rate": 1.2104796726127177e-05, "loss": 0.3646, "step": 1583 }, { "epoch": 2.4472769409038237, "grad_norm": 0.31602994668131923, "learning_rate": 1.2094245123087233e-05, "loss": 0.3486, "step": 1584 }, { "epoch": 2.4488219389725763, "grad_norm": 0.3564860768735522, "learning_rate": 1.208369108087316e-05, "loss": 0.3765, "step": 1585 }, { "epoch": 2.450366937041329, "grad_norm": 0.3904154915473986, "learning_rate": 1.2073134611777286e-05, "loss": 0.3958, "step": 1586 }, { "epoch": 2.451911935110081, "grad_norm": 0.38280484756691824, "learning_rate": 1.2062575728094765e-05, "loss": 0.4035, "step": 1587 }, { "epoch": 2.4534569331788334, "grad_norm": 0.3468685689401559, "learning_rate": 1.205201444212356e-05, "loss": 0.3629, "step": 1588 }, { "epoch": 2.455001931247586, "grad_norm": 0.38453844020902084, "learning_rate": 1.2041450766164438e-05, "loss": 0.3785, "step": 1589 }, { "epoch": 2.4565469293163384, "grad_norm": 0.3472638959424853, "learning_rate": 1.2030884712520949e-05, "loss": 0.3621, "step": 1590 }, { "epoch": 2.458091927385091, "grad_norm": 0.3095567691318013, "learning_rate": 1.2020316293499406e-05, "loss": 0.3683, "step": 1591 }, { "epoch": 2.459636925453843, "grad_norm": 0.38472866632733393, "learning_rate": 1.200974552140888e-05, "loss": 0.3829, "step": 1592 }, { "epoch": 2.4611819235225956, "grad_norm": 0.36650445342089505, "learning_rate": 1.199917240856119e-05, "loss": 0.3755, "step": 1593 }, { "epoch": 2.462726921591348, "grad_norm": 0.3707901478558478, "learning_rate": 1.1988596967270871e-05, "loss": 0.3641, "step": 1594 }, { "epoch": 2.4642719196601006, "grad_norm": 0.35636071724918433, "learning_rate": 1.1978019209855174e-05, "loss": 0.3686, "step": 1595 }, { "epoch": 2.4658169177288527, "grad_norm": 0.358718004161485, "learning_rate": 1.1967439148634048e-05, "loss": 0.3752, "step": 1596 }, { "epoch": 2.467361915797605, "grad_norm": 0.37309502428610986, "learning_rate": 1.1956856795930125e-05, "loss": 0.3648, "step": 1597 }, { "epoch": 2.4689069138663577, "grad_norm": 0.3868412553134633, "learning_rate": 1.1946272164068706e-05, "loss": 0.3713, "step": 1598 }, { "epoch": 2.4704519119351103, "grad_norm": 0.3570443630062963, "learning_rate": 1.1935685265377744e-05, "loss": 0.3628, "step": 1599 }, { "epoch": 2.4719969100038623, "grad_norm": 0.38659293311832776, "learning_rate": 1.192509611218784e-05, "loss": 0.3735, "step": 1600 }, { "epoch": 2.473541908072615, "grad_norm": 0.36614166940687065, "learning_rate": 1.191450471683221e-05, "loss": 0.3878, "step": 1601 }, { "epoch": 2.4750869061413674, "grad_norm": 0.30860683926039556, "learning_rate": 1.1903911091646684e-05, "loss": 0.3116, "step": 1602 }, { "epoch": 2.47663190421012, "grad_norm": 0.3837738108242698, "learning_rate": 1.18933152489697e-05, "loss": 0.3939, "step": 1603 }, { "epoch": 2.478176902278872, "grad_norm": 0.3251661277016553, "learning_rate": 1.1882717201142268e-05, "loss": 0.3641, "step": 1604 }, { "epoch": 2.4797219003476245, "grad_norm": 0.329832910428953, "learning_rate": 1.1872116960507967e-05, "loss": 0.3335, "step": 1605 }, { "epoch": 2.481266898416377, "grad_norm": 0.35595723128330176, "learning_rate": 1.1861514539412929e-05, "loss": 0.3243, "step": 1606 }, { "epoch": 2.4828118964851296, "grad_norm": 0.4423864475477478, "learning_rate": 1.1850909950205836e-05, "loss": 0.4151, "step": 1607 }, { "epoch": 2.4843568945538816, "grad_norm": 0.35035962991499586, "learning_rate": 1.1840303205237881e-05, "loss": 0.3535, "step": 1608 }, { "epoch": 2.485901892622634, "grad_norm": 0.3749364271144841, "learning_rate": 1.182969431686278e-05, "loss": 0.3403, "step": 1609 }, { "epoch": 2.4874468906913867, "grad_norm": 0.37239868853752134, "learning_rate": 1.1819083297436736e-05, "loss": 0.3652, "step": 1610 }, { "epoch": 2.488991888760139, "grad_norm": 0.38170193269860564, "learning_rate": 1.1808470159318437e-05, "loss": 0.3567, "step": 1611 }, { "epoch": 2.4905368868288913, "grad_norm": 0.36037749878868286, "learning_rate": 1.1797854914869045e-05, "loss": 0.3942, "step": 1612 }, { "epoch": 2.492081884897644, "grad_norm": 0.37582221240332014, "learning_rate": 1.1787237576452163e-05, "loss": 0.346, "step": 1613 }, { "epoch": 2.4936268829663963, "grad_norm": 0.37008982532758955, "learning_rate": 1.1776618156433844e-05, "loss": 0.3849, "step": 1614 }, { "epoch": 2.495171881035149, "grad_norm": 0.41018458264773117, "learning_rate": 1.1765996667182561e-05, "loss": 0.3685, "step": 1615 }, { "epoch": 2.496716879103901, "grad_norm": 0.38164914683639606, "learning_rate": 1.1755373121069193e-05, "loss": 0.3568, "step": 1616 }, { "epoch": 2.4982618771726535, "grad_norm": 0.36045260793646444, "learning_rate": 1.1744747530467024e-05, "loss": 0.3604, "step": 1617 }, { "epoch": 2.499806875241406, "grad_norm": 0.4234092809716452, "learning_rate": 1.1734119907751718e-05, "loss": 0.3479, "step": 1618 }, { "epoch": 2.5013518733101585, "grad_norm": 0.4506685346594244, "learning_rate": 1.172349026530129e-05, "loss": 0.3618, "step": 1619 }, { "epoch": 2.502896871378911, "grad_norm": 0.35827147140055726, "learning_rate": 1.1712858615496126e-05, "loss": 0.3895, "step": 1620 }, { "epoch": 2.504441869447663, "grad_norm": 0.40668868027028227, "learning_rate": 1.1702224970718945e-05, "loss": 0.3779, "step": 1621 }, { "epoch": 2.5059868675164156, "grad_norm": 0.43721462910238873, "learning_rate": 1.1691589343354783e-05, "loss": 0.3764, "step": 1622 }, { "epoch": 2.507531865585168, "grad_norm": 0.3329977452270092, "learning_rate": 1.1680951745790995e-05, "loss": 0.3622, "step": 1623 }, { "epoch": 2.5090768636539202, "grad_norm": 0.42107709908755453, "learning_rate": 1.1670312190417222e-05, "loss": 0.3758, "step": 1624 }, { "epoch": 2.5106218617226728, "grad_norm": 0.4208908662563971, "learning_rate": 1.1659670689625388e-05, "loss": 0.3608, "step": 1625 }, { "epoch": 2.5121668597914253, "grad_norm": 0.3766854190530986, "learning_rate": 1.1649027255809688e-05, "loss": 0.377, "step": 1626 }, { "epoch": 2.513711857860178, "grad_norm": 0.3774492349506252, "learning_rate": 1.163838190136656e-05, "loss": 0.3505, "step": 1627 }, { "epoch": 2.5152568559289303, "grad_norm": 0.3435191835092907, "learning_rate": 1.1627734638694686e-05, "loss": 0.3426, "step": 1628 }, { "epoch": 2.5168018539976824, "grad_norm": 0.39902803115667496, "learning_rate": 1.1617085480194965e-05, "loss": 0.4169, "step": 1629 }, { "epoch": 2.518346852066435, "grad_norm": 0.35488588854952313, "learning_rate": 1.1606434438270506e-05, "loss": 0.3516, "step": 1630 }, { "epoch": 2.5198918501351875, "grad_norm": 0.32376837838614114, "learning_rate": 1.1595781525326615e-05, "loss": 0.3706, "step": 1631 }, { "epoch": 2.5214368482039395, "grad_norm": 0.4321354148256689, "learning_rate": 1.1585126753770779e-05, "loss": 0.3958, "step": 1632 }, { "epoch": 2.522981846272692, "grad_norm": 0.37602770442381045, "learning_rate": 1.1574470136012634e-05, "loss": 0.4077, "step": 1633 }, { "epoch": 2.5245268443414446, "grad_norm": 0.34235964899557103, "learning_rate": 1.1563811684463983e-05, "loss": 0.3281, "step": 1634 }, { "epoch": 2.526071842410197, "grad_norm": 0.3355625889946589, "learning_rate": 1.1553151411538763e-05, "loss": 0.3533, "step": 1635 }, { "epoch": 2.5276168404789496, "grad_norm": 0.4036884287235769, "learning_rate": 1.1542489329653024e-05, "loss": 0.3608, "step": 1636 }, { "epoch": 2.5291618385477017, "grad_norm": 0.34850457433814847, "learning_rate": 1.1531825451224929e-05, "loss": 0.4007, "step": 1637 }, { "epoch": 2.5307068366164542, "grad_norm": 0.36751741530628984, "learning_rate": 1.1521159788674732e-05, "loss": 0.362, "step": 1638 }, { "epoch": 2.5322518346852068, "grad_norm": 0.34711116881643217, "learning_rate": 1.1510492354424764e-05, "loss": 0.3786, "step": 1639 }, { "epoch": 2.533796832753959, "grad_norm": 0.4150461794108452, "learning_rate": 1.1499823160899423e-05, "loss": 0.3754, "step": 1640 }, { "epoch": 2.5353418308227114, "grad_norm": 0.34348378830998577, "learning_rate": 1.148915222052515e-05, "loss": 0.3734, "step": 1641 }, { "epoch": 2.536886828891464, "grad_norm": 0.37311263386487675, "learning_rate": 1.1478479545730427e-05, "loss": 0.375, "step": 1642 }, { "epoch": 2.5384318269602164, "grad_norm": 0.3474038030648802, "learning_rate": 1.1467805148945754e-05, "loss": 0.3794, "step": 1643 }, { "epoch": 2.539976825028969, "grad_norm": 0.3177637932187878, "learning_rate": 1.145712904260363e-05, "loss": 0.3523, "step": 1644 }, { "epoch": 2.541521823097721, "grad_norm": 0.39390690996880134, "learning_rate": 1.1446451239138557e-05, "loss": 0.389, "step": 1645 }, { "epoch": 2.5430668211664735, "grad_norm": 0.34277634737778906, "learning_rate": 1.1435771750987006e-05, "loss": 0.3462, "step": 1646 }, { "epoch": 2.544611819235226, "grad_norm": 0.3556710520299068, "learning_rate": 1.1425090590587408e-05, "loss": 0.3781, "step": 1647 }, { "epoch": 2.546156817303978, "grad_norm": 0.33724042057846365, "learning_rate": 1.141440777038015e-05, "loss": 0.3584, "step": 1648 }, { "epoch": 2.5477018153727307, "grad_norm": 0.38337941265970843, "learning_rate": 1.1403723302807545e-05, "loss": 0.3729, "step": 1649 }, { "epoch": 2.549246813441483, "grad_norm": 0.3482890369406031, "learning_rate": 1.1393037200313833e-05, "loss": 0.3537, "step": 1650 }, { "epoch": 2.5507918115102357, "grad_norm": 0.4201359211988468, "learning_rate": 1.1382349475345144e-05, "loss": 0.3979, "step": 1651 }, { "epoch": 2.5523368095789882, "grad_norm": 0.35021976573173197, "learning_rate": 1.1371660140349508e-05, "loss": 0.3685, "step": 1652 }, { "epoch": 2.5538818076477403, "grad_norm": 0.3543193047283711, "learning_rate": 1.1360969207776833e-05, "loss": 0.3186, "step": 1653 }, { "epoch": 2.555426805716493, "grad_norm": 0.40946670670623375, "learning_rate": 1.135027669007888e-05, "loss": 0.3994, "step": 1654 }, { "epoch": 2.5569718037852454, "grad_norm": 0.3356230472697648, "learning_rate": 1.1339582599709259e-05, "loss": 0.3561, "step": 1655 }, { "epoch": 2.5585168018539974, "grad_norm": 0.40156839931049476, "learning_rate": 1.1328886949123412e-05, "loss": 0.4091, "step": 1656 }, { "epoch": 2.56006179992275, "grad_norm": 0.3444909181259304, "learning_rate": 1.13181897507786e-05, "loss": 0.3699, "step": 1657 }, { "epoch": 2.5616067979915025, "grad_norm": 0.34720282845136263, "learning_rate": 1.130749101713388e-05, "loss": 0.3503, "step": 1658 }, { "epoch": 2.563151796060255, "grad_norm": 0.37463668226442803, "learning_rate": 1.1296790760650104e-05, "loss": 0.3804, "step": 1659 }, { "epoch": 2.5646967941290075, "grad_norm": 0.3900258989304525, "learning_rate": 1.1286088993789905e-05, "loss": 0.3652, "step": 1660 }, { "epoch": 2.5662417921977596, "grad_norm": 0.33028918841264143, "learning_rate": 1.1275385729017652e-05, "loss": 0.3581, "step": 1661 }, { "epoch": 2.567786790266512, "grad_norm": 0.33961331276260387, "learning_rate": 1.1264680978799479e-05, "loss": 0.3462, "step": 1662 }, { "epoch": 2.5693317883352647, "grad_norm": 0.37867130935536425, "learning_rate": 1.1253974755603245e-05, "loss": 0.3496, "step": 1663 }, { "epoch": 2.5708767864040167, "grad_norm": 0.3613523979429839, "learning_rate": 1.1243267071898517e-05, "loss": 0.3864, "step": 1664 }, { "epoch": 2.5724217844727693, "grad_norm": 0.41202193115938407, "learning_rate": 1.1232557940156577e-05, "loss": 0.3658, "step": 1665 }, { "epoch": 2.573966782541522, "grad_norm": 0.4278616106203508, "learning_rate": 1.1221847372850378e-05, "loss": 0.3969, "step": 1666 }, { "epoch": 2.5755117806102743, "grad_norm": 0.3571632510602076, "learning_rate": 1.1211135382454558e-05, "loss": 0.3733, "step": 1667 }, { "epoch": 2.577056778679027, "grad_norm": 0.4029852529619227, "learning_rate": 1.1200421981445407e-05, "loss": 0.3251, "step": 1668 }, { "epoch": 2.578601776747779, "grad_norm": 0.42153390947577163, "learning_rate": 1.1189707182300853e-05, "loss": 0.3958, "step": 1669 }, { "epoch": 2.5801467748165314, "grad_norm": 0.3997622363095959, "learning_rate": 1.1178990997500465e-05, "loss": 0.3651, "step": 1670 }, { "epoch": 2.581691772885284, "grad_norm": 0.4370995184797026, "learning_rate": 1.1168273439525414e-05, "loss": 0.3564, "step": 1671 }, { "epoch": 2.583236770954036, "grad_norm": 0.38740278406564566, "learning_rate": 1.1157554520858473e-05, "loss": 0.3495, "step": 1672 }, { "epoch": 2.5847817690227886, "grad_norm": 0.39943570867253225, "learning_rate": 1.1146834253984008e-05, "loss": 0.3832, "step": 1673 }, { "epoch": 2.586326767091541, "grad_norm": 0.4286889655740783, "learning_rate": 1.113611265138794e-05, "loss": 0.3812, "step": 1674 }, { "epoch": 2.5878717651602936, "grad_norm": 0.4034269597280416, "learning_rate": 1.112538972555776e-05, "loss": 0.3497, "step": 1675 }, { "epoch": 2.589416763229046, "grad_norm": 0.3651631935470604, "learning_rate": 1.1114665488982495e-05, "loss": 0.3754, "step": 1676 }, { "epoch": 2.590961761297798, "grad_norm": 0.40988739779982775, "learning_rate": 1.11039399541527e-05, "loss": 0.3218, "step": 1677 }, { "epoch": 2.5925067593665507, "grad_norm": 0.36812055377792025, "learning_rate": 1.1093213133560434e-05, "loss": 0.3782, "step": 1678 }, { "epoch": 2.5940517574353033, "grad_norm": 0.4001732888113902, "learning_rate": 1.1082485039699267e-05, "loss": 0.3909, "step": 1679 }, { "epoch": 2.5955967555040558, "grad_norm": 0.3448566107573252, "learning_rate": 1.1071755685064235e-05, "loss": 0.3464, "step": 1680 }, { "epoch": 2.5971417535728083, "grad_norm": 0.3743101610618975, "learning_rate": 1.1061025082151867e-05, "loss": 0.3806, "step": 1681 }, { "epoch": 2.5986867516415604, "grad_norm": 0.35879392038638114, "learning_rate": 1.105029324346012e-05, "loss": 0.3586, "step": 1682 }, { "epoch": 2.600231749710313, "grad_norm": 0.377664275456811, "learning_rate": 1.1039560181488402e-05, "loss": 0.3718, "step": 1683 }, { "epoch": 2.6017767477790654, "grad_norm": 0.40933275088802623, "learning_rate": 1.1028825908737553e-05, "loss": 0.3777, "step": 1684 }, { "epoch": 2.6033217458478175, "grad_norm": 0.3298265958252012, "learning_rate": 1.1018090437709808e-05, "loss": 0.3772, "step": 1685 }, { "epoch": 2.60486674391657, "grad_norm": 0.3713450998673412, "learning_rate": 1.1007353780908806e-05, "loss": 0.3662, "step": 1686 }, { "epoch": 2.6064117419853225, "grad_norm": 0.35689444857277386, "learning_rate": 1.0996615950839571e-05, "loss": 0.3686, "step": 1687 }, { "epoch": 2.607956740054075, "grad_norm": 0.3343695076218598, "learning_rate": 1.0985876960008484e-05, "loss": 0.3448, "step": 1688 }, { "epoch": 2.6095017381228276, "grad_norm": 0.35158447812634286, "learning_rate": 1.0975136820923281e-05, "loss": 0.3467, "step": 1689 }, { "epoch": 2.6110467361915797, "grad_norm": 0.4050884338590485, "learning_rate": 1.0964395546093045e-05, "loss": 0.3922, "step": 1690 }, { "epoch": 2.612591734260332, "grad_norm": 0.3443820763949224, "learning_rate": 1.0953653148028169e-05, "loss": 0.3602, "step": 1691 }, { "epoch": 2.6141367323290847, "grad_norm": 0.3392318575087236, "learning_rate": 1.0942909639240357e-05, "loss": 0.3384, "step": 1692 }, { "epoch": 2.615681730397837, "grad_norm": 0.4190400000083697, "learning_rate": 1.0932165032242613e-05, "loss": 0.419, "step": 1693 }, { "epoch": 2.6172267284665893, "grad_norm": 0.3827326493258423, "learning_rate": 1.0921419339549214e-05, "loss": 0.3902, "step": 1694 }, { "epoch": 2.618771726535342, "grad_norm": 0.3434828708694504, "learning_rate": 1.0910672573675702e-05, "loss": 0.3545, "step": 1695 }, { "epoch": 2.6203167246040944, "grad_norm": 0.3494003086008042, "learning_rate": 1.0899924747138873e-05, "loss": 0.3547, "step": 1696 }, { "epoch": 2.621861722672847, "grad_norm": 0.3569697562163808, "learning_rate": 1.0889175872456752e-05, "loss": 0.3647, "step": 1697 }, { "epoch": 2.623406720741599, "grad_norm": 0.36951447010825045, "learning_rate": 1.0878425962148591e-05, "loss": 0.3906, "step": 1698 }, { "epoch": 2.6249517188103515, "grad_norm": 0.3839461866783587, "learning_rate": 1.0867675028734848e-05, "loss": 0.3746, "step": 1699 }, { "epoch": 2.626496716879104, "grad_norm": 0.37270244976724515, "learning_rate": 1.0856923084737163e-05, "loss": 0.3605, "step": 1700 }, { "epoch": 2.628041714947856, "grad_norm": 0.35260617503223335, "learning_rate": 1.0846170142678367e-05, "loss": 0.3822, "step": 1701 }, { "epoch": 2.6295867130166086, "grad_norm": 0.3751951935383054, "learning_rate": 1.0835416215082443e-05, "loss": 0.3834, "step": 1702 }, { "epoch": 2.631131711085361, "grad_norm": 0.33782451126880886, "learning_rate": 1.0824661314474527e-05, "loss": 0.3473, "step": 1703 }, { "epoch": 2.6326767091541137, "grad_norm": 0.40325154684522907, "learning_rate": 1.081390545338089e-05, "loss": 0.3657, "step": 1704 }, { "epoch": 2.634221707222866, "grad_norm": 0.352296627953665, "learning_rate": 1.0803148644328913e-05, "loss": 0.3773, "step": 1705 }, { "epoch": 2.6357667052916183, "grad_norm": 0.36878116398503147, "learning_rate": 1.0792390899847087e-05, "loss": 0.3523, "step": 1706 }, { "epoch": 2.637311703360371, "grad_norm": 0.34447925426176995, "learning_rate": 1.0781632232464998e-05, "loss": 0.3416, "step": 1707 }, { "epoch": 2.6388567014291233, "grad_norm": 0.367247887065588, "learning_rate": 1.0770872654713294e-05, "loss": 0.3984, "step": 1708 }, { "epoch": 2.6404016994978754, "grad_norm": 0.35206165902777403, "learning_rate": 1.0760112179123694e-05, "loss": 0.3427, "step": 1709 }, { "epoch": 2.641946697566628, "grad_norm": 0.376225110305087, "learning_rate": 1.0749350818228959e-05, "loss": 0.4161, "step": 1710 }, { "epoch": 2.6434916956353804, "grad_norm": 0.3337188637844167, "learning_rate": 1.073858858456288e-05, "loss": 0.3664, "step": 1711 }, { "epoch": 2.645036693704133, "grad_norm": 0.3607347301665339, "learning_rate": 1.0727825490660266e-05, "loss": 0.3545, "step": 1712 }, { "epoch": 2.6465816917728855, "grad_norm": 0.36889878210418037, "learning_rate": 1.0717061549056929e-05, "loss": 0.3664, "step": 1713 }, { "epoch": 2.6481266898416376, "grad_norm": 0.3178905450025926, "learning_rate": 1.0706296772289664e-05, "loss": 0.338, "step": 1714 }, { "epoch": 2.64967168791039, "grad_norm": 0.4662510774631601, "learning_rate": 1.0695531172896246e-05, "loss": 0.3665, "step": 1715 }, { "epoch": 2.6512166859791426, "grad_norm": 0.3913059630149199, "learning_rate": 1.0684764763415398e-05, "loss": 0.3465, "step": 1716 }, { "epoch": 2.6527616840478947, "grad_norm": 0.37168867217367046, "learning_rate": 1.0673997556386795e-05, "loss": 0.3617, "step": 1717 }, { "epoch": 2.654306682116647, "grad_norm": 0.3741854268960078, "learning_rate": 1.066322956435104e-05, "loss": 0.3612, "step": 1718 }, { "epoch": 2.6558516801853997, "grad_norm": 0.42080583378095715, "learning_rate": 1.0652460799849648e-05, "loss": 0.3883, "step": 1719 }, { "epoch": 2.6573966782541523, "grad_norm": 0.42444826560559384, "learning_rate": 1.0641691275425032e-05, "loss": 0.3901, "step": 1720 }, { "epoch": 2.658941676322905, "grad_norm": 0.3018040298859625, "learning_rate": 1.0630921003620494e-05, "loss": 0.3283, "step": 1721 }, { "epoch": 2.660486674391657, "grad_norm": 0.4260934759494013, "learning_rate": 1.0620149996980202e-05, "loss": 0.3525, "step": 1722 }, { "epoch": 2.6620316724604094, "grad_norm": 0.3547776104929509, "learning_rate": 1.0609378268049187e-05, "loss": 0.3709, "step": 1723 }, { "epoch": 2.663576670529162, "grad_norm": 0.36407433576938014, "learning_rate": 1.0598605829373314e-05, "loss": 0.3692, "step": 1724 }, { "epoch": 2.665121668597914, "grad_norm": 0.38886778634294844, "learning_rate": 1.0587832693499279e-05, "loss": 0.3903, "step": 1725 }, { "epoch": 2.6666666666666665, "grad_norm": 0.377113220427048, "learning_rate": 1.0577058872974586e-05, "loss": 0.3727, "step": 1726 }, { "epoch": 2.668211664735419, "grad_norm": 0.36392449361845985, "learning_rate": 1.056628438034754e-05, "loss": 0.3607, "step": 1727 }, { "epoch": 2.6697566628041716, "grad_norm": 0.3725018011499924, "learning_rate": 1.0555509228167229e-05, "loss": 0.349, "step": 1728 }, { "epoch": 2.671301660872924, "grad_norm": 0.38474166949387567, "learning_rate": 1.0544733428983507e-05, "loss": 0.3687, "step": 1729 }, { "epoch": 2.672846658941676, "grad_norm": 0.3387525027285337, "learning_rate": 1.0533956995346984e-05, "loss": 0.37, "step": 1730 }, { "epoch": 2.6743916570104287, "grad_norm": 0.3346063791724931, "learning_rate": 1.0523179939809003e-05, "loss": 0.3747, "step": 1731 }, { "epoch": 2.675936655079181, "grad_norm": 0.3540639692092411, "learning_rate": 1.051240227492164e-05, "loss": 0.3544, "step": 1732 }, { "epoch": 2.6774816531479333, "grad_norm": 0.3365447325844868, "learning_rate": 1.0501624013237677e-05, "loss": 0.3544, "step": 1733 }, { "epoch": 2.679026651216686, "grad_norm": 0.3701982526266977, "learning_rate": 1.0490845167310584e-05, "loss": 0.3897, "step": 1734 }, { "epoch": 2.6805716492854383, "grad_norm": 0.33394592876014084, "learning_rate": 1.0480065749694527e-05, "loss": 0.38, "step": 1735 }, { "epoch": 2.682116647354191, "grad_norm": 0.42012987156176734, "learning_rate": 1.0469285772944323e-05, "loss": 0.3944, "step": 1736 }, { "epoch": 2.6836616454229434, "grad_norm": 0.33450173026268304, "learning_rate": 1.0458505249615446e-05, "loss": 0.3384, "step": 1737 }, { "epoch": 2.6852066434916955, "grad_norm": 0.4209421814505849, "learning_rate": 1.0447724192264013e-05, "loss": 0.3921, "step": 1738 }, { "epoch": 2.686751641560448, "grad_norm": 0.3116701220633722, "learning_rate": 1.0436942613446747e-05, "loss": 0.353, "step": 1739 }, { "epoch": 2.6882966396292005, "grad_norm": 0.41880317759100466, "learning_rate": 1.0426160525720998e-05, "loss": 0.3618, "step": 1740 }, { "epoch": 2.689841637697953, "grad_norm": 0.3521440241958734, "learning_rate": 1.0415377941644693e-05, "loss": 0.3754, "step": 1741 }, { "epoch": 2.6913866357667056, "grad_norm": 0.36925119111411564, "learning_rate": 1.0404594873776341e-05, "loss": 0.3532, "step": 1742 }, { "epoch": 2.6929316338354576, "grad_norm": 0.3930311845014814, "learning_rate": 1.0393811334675022e-05, "loss": 0.3913, "step": 1743 }, { "epoch": 2.69447663190421, "grad_norm": 0.3957524573537371, "learning_rate": 1.0383027336900356e-05, "loss": 0.3658, "step": 1744 }, { "epoch": 2.6960216299729627, "grad_norm": 0.34381889112991654, "learning_rate": 1.0372242893012498e-05, "loss": 0.3529, "step": 1745 }, { "epoch": 2.6975666280417148, "grad_norm": 0.36062031390733496, "learning_rate": 1.036145801557213e-05, "loss": 0.3758, "step": 1746 }, { "epoch": 2.6991116261104673, "grad_norm": 0.35880279522688346, "learning_rate": 1.0350672717140434e-05, "loss": 0.3685, "step": 1747 }, { "epoch": 2.70065662417922, "grad_norm": 0.3277167898223584, "learning_rate": 1.0339887010279075e-05, "loss": 0.3562, "step": 1748 }, { "epoch": 2.7022016222479723, "grad_norm": 0.32860044985560033, "learning_rate": 1.0329100907550206e-05, "loss": 0.3873, "step": 1749 }, { "epoch": 2.703746620316725, "grad_norm": 0.33480738244800695, "learning_rate": 1.031831442151644e-05, "loss": 0.3583, "step": 1750 }, { "epoch": 2.705291618385477, "grad_norm": 0.3023068173186701, "learning_rate": 1.0307527564740825e-05, "loss": 0.3374, "step": 1751 }, { "epoch": 2.7068366164542295, "grad_norm": 0.32207342175670595, "learning_rate": 1.0296740349786855e-05, "loss": 0.3647, "step": 1752 }, { "epoch": 2.708381614522982, "grad_norm": 0.34534216880029756, "learning_rate": 1.0285952789218427e-05, "loss": 0.3596, "step": 1753 }, { "epoch": 2.709926612591734, "grad_norm": 0.3114698161956925, "learning_rate": 1.0275164895599856e-05, "loss": 0.333, "step": 1754 }, { "epoch": 2.7114716106604866, "grad_norm": 0.3355593735850947, "learning_rate": 1.0264376681495832e-05, "loss": 0.3684, "step": 1755 }, { "epoch": 2.713016608729239, "grad_norm": 0.35882847807664203, "learning_rate": 1.0253588159471423e-05, "loss": 0.4015, "step": 1756 }, { "epoch": 2.7145616067979916, "grad_norm": 0.3723391769865432, "learning_rate": 1.0242799342092057e-05, "loss": 0.3789, "step": 1757 }, { "epoch": 2.716106604866744, "grad_norm": 0.369534389468052, "learning_rate": 1.023201024192351e-05, "loss": 0.3614, "step": 1758 }, { "epoch": 2.7176516029354962, "grad_norm": 0.3716954927493281, "learning_rate": 1.022122087153187e-05, "loss": 0.3647, "step": 1759 }, { "epoch": 2.7191966010042488, "grad_norm": 0.4088874585438322, "learning_rate": 1.0210431243483562e-05, "loss": 0.3713, "step": 1760 }, { "epoch": 2.7207415990730013, "grad_norm": 0.3228418788565312, "learning_rate": 1.0199641370345299e-05, "loss": 0.3695, "step": 1761 }, { "epoch": 2.7222865971417534, "grad_norm": 0.32660210774213344, "learning_rate": 1.0188851264684078e-05, "loss": 0.3476, "step": 1762 }, { "epoch": 2.723831595210506, "grad_norm": 0.39733458318604004, "learning_rate": 1.0178060939067176e-05, "loss": 0.3792, "step": 1763 }, { "epoch": 2.7253765932792584, "grad_norm": 0.37355156259038086, "learning_rate": 1.0167270406062116e-05, "loss": 0.368, "step": 1764 }, { "epoch": 2.726921591348011, "grad_norm": 0.37116260305353566, "learning_rate": 1.0156479678236666e-05, "loss": 0.38, "step": 1765 }, { "epoch": 2.7284665894167635, "grad_norm": 0.32888676637010966, "learning_rate": 1.0145688768158825e-05, "loss": 0.3404, "step": 1766 }, { "epoch": 2.7300115874855155, "grad_norm": 0.36058652780117767, "learning_rate": 1.0134897688396799e-05, "loss": 0.3641, "step": 1767 }, { "epoch": 2.731556585554268, "grad_norm": 0.39540448075479745, "learning_rate": 1.0124106451518998e-05, "loss": 0.4011, "step": 1768 }, { "epoch": 2.7331015836230206, "grad_norm": 0.37473305877747215, "learning_rate": 1.0113315070094007e-05, "loss": 0.3602, "step": 1769 }, { "epoch": 2.7346465816917727, "grad_norm": 0.38046377002108167, "learning_rate": 1.0102523556690585e-05, "loss": 0.3911, "step": 1770 }, { "epoch": 2.736191579760525, "grad_norm": 0.33963001877777876, "learning_rate": 1.009173192387764e-05, "loss": 0.3417, "step": 1771 }, { "epoch": 2.7377365778292777, "grad_norm": 0.3948854092316902, "learning_rate": 1.0080940184224226e-05, "loss": 0.4059, "step": 1772 }, { "epoch": 2.7392815758980302, "grad_norm": 0.3694871476727327, "learning_rate": 1.0070148350299511e-05, "loss": 0.3622, "step": 1773 }, { "epoch": 2.7408265739667828, "grad_norm": 0.325187451309045, "learning_rate": 1.0059356434672789e-05, "loss": 0.3592, "step": 1774 }, { "epoch": 2.742371572035535, "grad_norm": 0.34211805533524775, "learning_rate": 1.0048564449913431e-05, "loss": 0.3519, "step": 1775 }, { "epoch": 2.7439165701042874, "grad_norm": 0.3416683900255319, "learning_rate": 1.0037772408590898e-05, "loss": 0.3849, "step": 1776 }, { "epoch": 2.74546156817304, "grad_norm": 0.37483994986464536, "learning_rate": 1.002698032327472e-05, "loss": 0.368, "step": 1777 }, { "epoch": 2.747006566241792, "grad_norm": 0.3601206917075784, "learning_rate": 1.0016188206534472e-05, "loss": 0.3288, "step": 1778 }, { "epoch": 2.7485515643105445, "grad_norm": 0.36637419921606434, "learning_rate": 1.0005396070939766e-05, "loss": 0.3994, "step": 1779 }, { "epoch": 2.750096562379297, "grad_norm": 0.3515745061370125, "learning_rate": 9.994603929060235e-06, "loss": 0.3413, "step": 1780 }, { "epoch": 2.7516415604480495, "grad_norm": 0.3759376437167684, "learning_rate": 9.983811793465531e-06, "loss": 0.3671, "step": 1781 }, { "epoch": 2.753186558516802, "grad_norm": 0.35128205594555295, "learning_rate": 9.973019676725284e-06, "loss": 0.3813, "step": 1782 }, { "epoch": 2.754731556585554, "grad_norm": 0.33221295529335093, "learning_rate": 9.962227591409102e-06, "loss": 0.3877, "step": 1783 }, { "epoch": 2.7562765546543067, "grad_norm": 0.31287488312446426, "learning_rate": 9.951435550086572e-06, "loss": 0.3794, "step": 1784 }, { "epoch": 2.757821552723059, "grad_norm": 0.37322262209559076, "learning_rate": 9.940643565327214e-06, "loss": 0.3538, "step": 1785 }, { "epoch": 2.7593665507918113, "grad_norm": 0.3343363891246558, "learning_rate": 9.92985164970049e-06, "loss": 0.3682, "step": 1786 }, { "epoch": 2.760911548860564, "grad_norm": 0.325747199982899, "learning_rate": 9.919059815775777e-06, "loss": 0.3672, "step": 1787 }, { "epoch": 2.7624565469293163, "grad_norm": 0.3446649847122903, "learning_rate": 9.908268076122362e-06, "loss": 0.3522, "step": 1788 }, { "epoch": 2.764001544998069, "grad_norm": 0.3577627631062896, "learning_rate": 9.897476443309417e-06, "loss": 0.3742, "step": 1789 }, { "epoch": 2.7655465430668214, "grad_norm": 0.32269852463577114, "learning_rate": 9.886684929905994e-06, "loss": 0.3728, "step": 1790 }, { "epoch": 2.7670915411355734, "grad_norm": 0.3102930608351158, "learning_rate": 9.875893548481005e-06, "loss": 0.367, "step": 1791 }, { "epoch": 2.768636539204326, "grad_norm": 0.3306665918816944, "learning_rate": 9.865102311603201e-06, "loss": 0.3855, "step": 1792 }, { "epoch": 2.7701815372730785, "grad_norm": 0.3485486973036903, "learning_rate": 9.854311231841178e-06, "loss": 0.3613, "step": 1793 }, { "epoch": 2.7717265353418306, "grad_norm": 0.3103820488909034, "learning_rate": 9.843520321763339e-06, "loss": 0.3556, "step": 1794 }, { "epoch": 2.773271533410583, "grad_norm": 0.3399853783364427, "learning_rate": 9.832729593937889e-06, "loss": 0.3812, "step": 1795 }, { "epoch": 2.7748165314793356, "grad_norm": 0.3348511383515678, "learning_rate": 9.821939060932828e-06, "loss": 0.3865, "step": 1796 }, { "epoch": 2.776361529548088, "grad_norm": 0.3483151988451605, "learning_rate": 9.811148735315925e-06, "loss": 0.3361, "step": 1797 }, { "epoch": 2.7779065276168406, "grad_norm": 0.3269934542000524, "learning_rate": 9.800358629654706e-06, "loss": 0.3607, "step": 1798 }, { "epoch": 2.7794515256855927, "grad_norm": 0.38415588083490315, "learning_rate": 9.78956875651644e-06, "loss": 0.4059, "step": 1799 }, { "epoch": 2.7809965237543453, "grad_norm": 0.34201416875920343, "learning_rate": 9.778779128468133e-06, "loss": 0.3817, "step": 1800 }, { "epoch": 2.7825415218230978, "grad_norm": 0.31407154779643937, "learning_rate": 9.767989758076497e-06, "loss": 0.3328, "step": 1801 }, { "epoch": 2.7840865198918503, "grad_norm": 0.32207375315414816, "learning_rate": 9.757200657907944e-06, "loss": 0.3801, "step": 1802 }, { "epoch": 2.785631517960603, "grad_norm": 0.338786746879186, "learning_rate": 9.74641184052858e-06, "loss": 0.394, "step": 1803 }, { "epoch": 2.787176516029355, "grad_norm": 0.328061189757231, "learning_rate": 9.735623318504173e-06, "loss": 0.3612, "step": 1804 }, { "epoch": 2.7887215140981074, "grad_norm": 0.31036925004136096, "learning_rate": 9.724835104400144e-06, "loss": 0.3639, "step": 1805 }, { "epoch": 2.79026651216686, "grad_norm": 0.3326793059889599, "learning_rate": 9.714047210781575e-06, "loss": 0.3813, "step": 1806 }, { "epoch": 2.791811510235612, "grad_norm": 0.3410020660434098, "learning_rate": 9.70325965021315e-06, "loss": 0.3734, "step": 1807 }, { "epoch": 2.7933565083043645, "grad_norm": 0.3233669600192115, "learning_rate": 9.692472435259175e-06, "loss": 0.3575, "step": 1808 }, { "epoch": 2.794901506373117, "grad_norm": 0.2852315461095015, "learning_rate": 9.681685578483562e-06, "loss": 0.3484, "step": 1809 }, { "epoch": 2.7964465044418696, "grad_norm": 0.34989565760412056, "learning_rate": 9.670899092449797e-06, "loss": 0.4057, "step": 1810 }, { "epoch": 2.797991502510622, "grad_norm": 0.32287125182895715, "learning_rate": 9.660112989720926e-06, "loss": 0.3326, "step": 1811 }, { "epoch": 2.799536500579374, "grad_norm": 0.3423193384827897, "learning_rate": 9.649327282859571e-06, "loss": 0.3927, "step": 1812 }, { "epoch": 2.8010814986481267, "grad_norm": 0.3104467003057835, "learning_rate": 9.638541984427874e-06, "loss": 0.3381, "step": 1813 }, { "epoch": 2.8026264967168792, "grad_norm": 0.3662473709029123, "learning_rate": 9.627757106987502e-06, "loss": 0.3874, "step": 1814 }, { "epoch": 2.8041714947856313, "grad_norm": 0.31064681400614186, "learning_rate": 9.616972663099648e-06, "loss": 0.3605, "step": 1815 }, { "epoch": 2.805716492854384, "grad_norm": 0.3163050575247288, "learning_rate": 9.606188665324981e-06, "loss": 0.3642, "step": 1816 }, { "epoch": 2.8072614909231364, "grad_norm": 0.32631869156766546, "learning_rate": 9.595405126223659e-06, "loss": 0.3614, "step": 1817 }, { "epoch": 2.808806488991889, "grad_norm": 0.3186261247415215, "learning_rate": 9.58462205835531e-06, "loss": 0.3463, "step": 1818 }, { "epoch": 2.8103514870606414, "grad_norm": 0.34946115367170294, "learning_rate": 9.573839474279004e-06, "loss": 0.3897, "step": 1819 }, { "epoch": 2.8118964851293935, "grad_norm": 0.3610056689405753, "learning_rate": 9.563057386553252e-06, "loss": 0.3744, "step": 1820 }, { "epoch": 2.813441483198146, "grad_norm": 0.36925556325002185, "learning_rate": 9.552275807735989e-06, "loss": 0.4194, "step": 1821 }, { "epoch": 2.8149864812668985, "grad_norm": 0.35377071773496876, "learning_rate": 9.541494750384555e-06, "loss": 0.3529, "step": 1822 }, { "epoch": 2.8165314793356506, "grad_norm": 0.37208819869500137, "learning_rate": 9.53071422705568e-06, "loss": 0.383, "step": 1823 }, { "epoch": 2.818076477404403, "grad_norm": 0.41289596521089295, "learning_rate": 9.519934250305476e-06, "loss": 0.3888, "step": 1824 }, { "epoch": 2.8196214754731557, "grad_norm": 0.31682351302326445, "learning_rate": 9.50915483268942e-06, "loss": 0.3794, "step": 1825 }, { "epoch": 2.821166473541908, "grad_norm": 0.38720873601521705, "learning_rate": 9.49837598676233e-06, "loss": 0.3972, "step": 1826 }, { "epoch": 2.8227114716106607, "grad_norm": 0.32938792095935354, "learning_rate": 9.487597725078364e-06, "loss": 0.3382, "step": 1827 }, { "epoch": 2.824256469679413, "grad_norm": 0.3836354672842586, "learning_rate": 9.476820060191002e-06, "loss": 0.3884, "step": 1828 }, { "epoch": 2.8258014677481653, "grad_norm": 0.3567268923171431, "learning_rate": 9.466043004653023e-06, "loss": 0.3525, "step": 1829 }, { "epoch": 2.827346465816918, "grad_norm": 0.35467769287766127, "learning_rate": 9.455266571016497e-06, "loss": 0.3787, "step": 1830 }, { "epoch": 2.82889146388567, "grad_norm": 0.37911919407734834, "learning_rate": 9.444490771832774e-06, "loss": 0.3598, "step": 1831 }, { "epoch": 2.8304364619544224, "grad_norm": 0.3679598847551083, "learning_rate": 9.433715619652465e-06, "loss": 0.3714, "step": 1832 }, { "epoch": 2.831981460023175, "grad_norm": 0.3551441554747631, "learning_rate": 9.422941127025418e-06, "loss": 0.3855, "step": 1833 }, { "epoch": 2.8335264580919275, "grad_norm": 0.4081929100395201, "learning_rate": 9.412167306500726e-06, "loss": 0.3787, "step": 1834 }, { "epoch": 2.83507145616068, "grad_norm": 0.3557286806748959, "learning_rate": 9.40139417062669e-06, "loss": 0.3268, "step": 1835 }, { "epoch": 2.836616454229432, "grad_norm": 0.4169391818715902, "learning_rate": 9.390621731950813e-06, "loss": 0.3748, "step": 1836 }, { "epoch": 2.8381614522981846, "grad_norm": 0.32312520133482814, "learning_rate": 9.379850003019801e-06, "loss": 0.3575, "step": 1837 }, { "epoch": 2.839706450366937, "grad_norm": 0.3720547393708458, "learning_rate": 9.36907899637951e-06, "loss": 0.3921, "step": 1838 }, { "epoch": 2.841251448435689, "grad_norm": 0.3565900090478252, "learning_rate": 9.35830872457497e-06, "loss": 0.3308, "step": 1839 }, { "epoch": 2.8427964465044417, "grad_norm": 0.36936869482214235, "learning_rate": 9.347539200150353e-06, "loss": 0.3957, "step": 1840 }, { "epoch": 2.8443414445731943, "grad_norm": 0.33665978880090697, "learning_rate": 9.336770435648963e-06, "loss": 0.3707, "step": 1841 }, { "epoch": 2.845886442641947, "grad_norm": 0.38194101011396897, "learning_rate": 9.326002443613205e-06, "loss": 0.3621, "step": 1842 }, { "epoch": 2.8474314407106993, "grad_norm": 0.33504548187835287, "learning_rate": 9.315235236584604e-06, "loss": 0.3158, "step": 1843 }, { "epoch": 2.8489764387794514, "grad_norm": 0.3623399139687453, "learning_rate": 9.304468827103759e-06, "loss": 0.4372, "step": 1844 }, { "epoch": 2.850521436848204, "grad_norm": 0.31336460318606113, "learning_rate": 9.293703227710338e-06, "loss": 0.3102, "step": 1845 }, { "epoch": 2.8520664349169564, "grad_norm": 0.34812885913099506, "learning_rate": 9.282938450943073e-06, "loss": 0.3938, "step": 1846 }, { "epoch": 2.8536114329857085, "grad_norm": 0.3237649882525916, "learning_rate": 9.272174509339737e-06, "loss": 0.3883, "step": 1847 }, { "epoch": 2.855156431054461, "grad_norm": 0.3754307597738313, "learning_rate": 9.261411415437122e-06, "loss": 0.3594, "step": 1848 }, { "epoch": 2.8567014291232136, "grad_norm": 0.3714313102917016, "learning_rate": 9.250649181771043e-06, "loss": 0.4007, "step": 1849 }, { "epoch": 2.858246427191966, "grad_norm": 0.33192425672421066, "learning_rate": 9.23988782087631e-06, "loss": 0.3172, "step": 1850 }, { "epoch": 2.8597914252607186, "grad_norm": 0.37731298967290744, "learning_rate": 9.22912734528671e-06, "loss": 0.3974, "step": 1851 }, { "epoch": 2.8613364233294707, "grad_norm": 0.3599660873090621, "learning_rate": 9.218367767535007e-06, "loss": 0.3752, "step": 1852 }, { "epoch": 2.862881421398223, "grad_norm": 0.3189128885030965, "learning_rate": 9.207609100152914e-06, "loss": 0.3739, "step": 1853 }, { "epoch": 2.8644264194669757, "grad_norm": 0.36697730511755916, "learning_rate": 9.196851355671092e-06, "loss": 0.3713, "step": 1854 }, { "epoch": 2.865971417535728, "grad_norm": 0.3289816212903146, "learning_rate": 9.186094546619113e-06, "loss": 0.3365, "step": 1855 }, { "epoch": 2.8675164156044803, "grad_norm": 0.3767158120357283, "learning_rate": 9.175338685525474e-06, "loss": 0.4198, "step": 1856 }, { "epoch": 2.869061413673233, "grad_norm": 0.3604726845177797, "learning_rate": 9.16458378491756e-06, "loss": 0.3473, "step": 1857 }, { "epoch": 2.8706064117419854, "grad_norm": 0.36843772814248144, "learning_rate": 9.153829857321636e-06, "loss": 0.3703, "step": 1858 }, { "epoch": 2.872151409810738, "grad_norm": 0.3353465677144737, "learning_rate": 9.14307691526284e-06, "loss": 0.3415, "step": 1859 }, { "epoch": 2.87369640787949, "grad_norm": 0.3734030809287896, "learning_rate": 9.132324971265158e-06, "loss": 0.3894, "step": 1860 }, { "epoch": 2.8752414059482425, "grad_norm": 0.34868294707846387, "learning_rate": 9.121574037851412e-06, "loss": 0.3534, "step": 1861 }, { "epoch": 2.876786404016995, "grad_norm": 0.39580330856371265, "learning_rate": 9.110824127543251e-06, "loss": 0.3603, "step": 1862 }, { "epoch": 2.8783314020857476, "grad_norm": 0.3623532660244772, "learning_rate": 9.100075252861132e-06, "loss": 0.372, "step": 1863 }, { "epoch": 2.8798764001545, "grad_norm": 0.3378791762996007, "learning_rate": 9.089327426324298e-06, "loss": 0.3383, "step": 1864 }, { "epoch": 2.881421398223252, "grad_norm": 0.47448094237387944, "learning_rate": 9.07858066045079e-06, "loss": 0.4105, "step": 1865 }, { "epoch": 2.8829663962920047, "grad_norm": 0.33304734997480795, "learning_rate": 9.06783496775739e-06, "loss": 0.3839, "step": 1866 }, { "epoch": 2.884511394360757, "grad_norm": 0.4243508179114609, "learning_rate": 9.057090360759643e-06, "loss": 0.3777, "step": 1867 }, { "epoch": 2.8860563924295093, "grad_norm": 0.32594481159882926, "learning_rate": 9.046346851971833e-06, "loss": 0.3451, "step": 1868 }, { "epoch": 2.887601390498262, "grad_norm": 0.35708947325916557, "learning_rate": 9.035604453906958e-06, "loss": 0.3566, "step": 1869 }, { "epoch": 2.8891463885670143, "grad_norm": 0.33713471564574893, "learning_rate": 9.024863179076717e-06, "loss": 0.3677, "step": 1870 }, { "epoch": 2.890691386635767, "grad_norm": 0.33277875564337334, "learning_rate": 9.014123039991519e-06, "loss": 0.3813, "step": 1871 }, { "epoch": 2.8922363847045194, "grad_norm": 0.361086262111271, "learning_rate": 9.003384049160432e-06, "loss": 0.3577, "step": 1872 }, { "epoch": 2.8937813827732715, "grad_norm": 0.36013299136069565, "learning_rate": 8.992646219091194e-06, "loss": 0.3878, "step": 1873 }, { "epoch": 2.895326380842024, "grad_norm": 0.33204506647111404, "learning_rate": 8.981909562290195e-06, "loss": 0.3968, "step": 1874 }, { "epoch": 2.8968713789107765, "grad_norm": 0.3191138181510106, "learning_rate": 8.97117409126245e-06, "loss": 0.3721, "step": 1875 }, { "epoch": 2.8984163769795286, "grad_norm": 0.3774025646373156, "learning_rate": 8.960439818511603e-06, "loss": 0.3508, "step": 1876 }, { "epoch": 2.899961375048281, "grad_norm": 0.3256596493776712, "learning_rate": 8.949706756539884e-06, "loss": 0.3545, "step": 1877 }, { "epoch": 2.9015063731170336, "grad_norm": 0.35782201950397946, "learning_rate": 8.938974917848136e-06, "loss": 0.3664, "step": 1878 }, { "epoch": 2.903051371185786, "grad_norm": 0.32557695242528206, "learning_rate": 8.928244314935766e-06, "loss": 0.3613, "step": 1879 }, { "epoch": 2.9045963692545387, "grad_norm": 0.33661206605627975, "learning_rate": 8.917514960300736e-06, "loss": 0.3658, "step": 1880 }, { "epoch": 2.9061413673232908, "grad_norm": 0.30611721230749417, "learning_rate": 8.906786866439569e-06, "loss": 0.3349, "step": 1881 }, { "epoch": 2.9076863653920433, "grad_norm": 0.3855923243570789, "learning_rate": 8.896060045847305e-06, "loss": 0.4335, "step": 1882 }, { "epoch": 2.909231363460796, "grad_norm": 0.32994284899084836, "learning_rate": 8.885334511017505e-06, "loss": 0.3888, "step": 1883 }, { "epoch": 2.910776361529548, "grad_norm": 0.3275017610912406, "learning_rate": 8.874610274442243e-06, "loss": 0.3744, "step": 1884 }, { "epoch": 2.9123213595983004, "grad_norm": 0.3204037352204569, "learning_rate": 8.863887348612064e-06, "loss": 0.3774, "step": 1885 }, { "epoch": 2.913866357667053, "grad_norm": 0.31008542710822184, "learning_rate": 8.853165746015997e-06, "loss": 0.3609, "step": 1886 }, { "epoch": 2.9154113557358055, "grad_norm": 0.3670116179174495, "learning_rate": 8.842445479141529e-06, "loss": 0.3826, "step": 1887 }, { "epoch": 2.916956353804558, "grad_norm": 0.3484955621769868, "learning_rate": 8.831726560474591e-06, "loss": 0.3832, "step": 1888 }, { "epoch": 2.91850135187331, "grad_norm": 0.3019005389728138, "learning_rate": 8.821009002499537e-06, "loss": 0.3359, "step": 1889 }, { "epoch": 2.9200463499420626, "grad_norm": 0.3508096905730115, "learning_rate": 8.810292817699148e-06, "loss": 0.3808, "step": 1890 }, { "epoch": 2.921591348010815, "grad_norm": 0.3753433779283418, "learning_rate": 8.799578018554598e-06, "loss": 0.4206, "step": 1891 }, { "epoch": 2.923136346079567, "grad_norm": 0.3084706311085096, "learning_rate": 8.788864617545441e-06, "loss": 0.3312, "step": 1892 }, { "epoch": 2.9246813441483197, "grad_norm": 0.33076269097945443, "learning_rate": 8.778152627149625e-06, "loss": 0.3518, "step": 1893 }, { "epoch": 2.9262263422170722, "grad_norm": 0.3540114476431238, "learning_rate": 8.767442059843428e-06, "loss": 0.3812, "step": 1894 }, { "epoch": 2.9277713402858248, "grad_norm": 0.32652140810027336, "learning_rate": 8.756732928101484e-06, "loss": 0.3528, "step": 1895 }, { "epoch": 2.9293163383545773, "grad_norm": 0.3537987522041188, "learning_rate": 8.746025244396758e-06, "loss": 0.3665, "step": 1896 }, { "epoch": 2.9308613364233294, "grad_norm": 0.37455174469212804, "learning_rate": 8.735319021200526e-06, "loss": 0.3836, "step": 1897 }, { "epoch": 2.932406334492082, "grad_norm": 0.35528944666588386, "learning_rate": 8.72461427098235e-06, "loss": 0.3532, "step": 1898 }, { "epoch": 2.9339513325608344, "grad_norm": 0.35780037846888135, "learning_rate": 8.713911006210098e-06, "loss": 0.389, "step": 1899 }, { "epoch": 2.9354963306295865, "grad_norm": 0.32932701072310766, "learning_rate": 8.703209239349898e-06, "loss": 0.3452, "step": 1900 }, { "epoch": 2.937041328698339, "grad_norm": 0.33473659772573405, "learning_rate": 8.69250898286612e-06, "loss": 0.3535, "step": 1901 }, { "epoch": 2.9385863267670915, "grad_norm": 0.3494841740116071, "learning_rate": 8.681810249221404e-06, "loss": 0.3771, "step": 1902 }, { "epoch": 2.940131324835844, "grad_norm": 0.31487184756731507, "learning_rate": 8.671113050876591e-06, "loss": 0.3486, "step": 1903 }, { "epoch": 2.9416763229045966, "grad_norm": 0.32507002090003284, "learning_rate": 8.660417400290748e-06, "loss": 0.3363, "step": 1904 }, { "epoch": 2.9432213209733487, "grad_norm": 0.3259933336875303, "learning_rate": 8.649723309921123e-06, "loss": 0.3833, "step": 1905 }, { "epoch": 2.944766319042101, "grad_norm": 0.33803273807909534, "learning_rate": 8.63903079222317e-06, "loss": 0.3514, "step": 1906 }, { "epoch": 2.9463113171108537, "grad_norm": 0.37259454540329684, "learning_rate": 8.628339859650494e-06, "loss": 0.4122, "step": 1907 }, { "epoch": 2.947856315179606, "grad_norm": 0.30265285947732246, "learning_rate": 8.617650524654859e-06, "loss": 0.3262, "step": 1908 }, { "epoch": 2.9494013132483583, "grad_norm": 0.35426517118802986, "learning_rate": 8.606962799686172e-06, "loss": 0.3787, "step": 1909 }, { "epoch": 2.950946311317111, "grad_norm": 0.36496997501986744, "learning_rate": 8.596276697192457e-06, "loss": 0.3954, "step": 1910 }, { "epoch": 2.9524913093858633, "grad_norm": 0.3221394516656487, "learning_rate": 8.585592229619852e-06, "loss": 0.3601, "step": 1911 }, { "epoch": 2.954036307454616, "grad_norm": 0.343284057510135, "learning_rate": 8.574909409412594e-06, "loss": 0.3601, "step": 1912 }, { "epoch": 2.955581305523368, "grad_norm": 0.36919276923767286, "learning_rate": 8.564228249012999e-06, "loss": 0.3597, "step": 1913 }, { "epoch": 2.9571263035921205, "grad_norm": 0.3558036409705332, "learning_rate": 8.553548760861445e-06, "loss": 0.3517, "step": 1914 }, { "epoch": 2.958671301660873, "grad_norm": 0.30528643053787236, "learning_rate": 8.542870957396372e-06, "loss": 0.3445, "step": 1915 }, { "epoch": 2.960216299729625, "grad_norm": 0.3895442296244485, "learning_rate": 8.532194851054251e-06, "loss": 0.4245, "step": 1916 }, { "epoch": 2.9617612977983776, "grad_norm": 0.31130293067874815, "learning_rate": 8.521520454269575e-06, "loss": 0.3162, "step": 1917 }, { "epoch": 2.96330629586713, "grad_norm": 0.33824772307552303, "learning_rate": 8.510847779474853e-06, "loss": 0.3948, "step": 1918 }, { "epoch": 2.9648512939358826, "grad_norm": 0.3063632495140437, "learning_rate": 8.500176839100582e-06, "loss": 0.3473, "step": 1919 }, { "epoch": 2.966396292004635, "grad_norm": 0.3917024736759489, "learning_rate": 8.489507645575237e-06, "loss": 0.3641, "step": 1920 }, { "epoch": 2.9679412900733873, "grad_norm": 0.35170761394628663, "learning_rate": 8.478840211325271e-06, "loss": 0.3923, "step": 1921 }, { "epoch": 2.9694862881421398, "grad_norm": 0.3139549745987255, "learning_rate": 8.468174548775074e-06, "loss": 0.3443, "step": 1922 }, { "epoch": 2.9710312862108923, "grad_norm": 0.37143813618497334, "learning_rate": 8.457510670346976e-06, "loss": 0.3644, "step": 1923 }, { "epoch": 2.9725762842796444, "grad_norm": 0.32588117037769393, "learning_rate": 8.44684858846124e-06, "loss": 0.3745, "step": 1924 }, { "epoch": 2.974121282348397, "grad_norm": 0.3300217091551878, "learning_rate": 8.43618831553602e-06, "loss": 0.3625, "step": 1925 }, { "epoch": 2.9756662804171494, "grad_norm": 0.3401074924266491, "learning_rate": 8.425529863987367e-06, "loss": 0.359, "step": 1926 }, { "epoch": 2.977211278485902, "grad_norm": 0.3484772895642021, "learning_rate": 8.414873246229224e-06, "loss": 0.3816, "step": 1927 }, { "epoch": 2.9787562765546545, "grad_norm": 0.41666062227266154, "learning_rate": 8.404218474673388e-06, "loss": 0.3617, "step": 1928 }, { "epoch": 2.9803012746234065, "grad_norm": 0.36388313660607097, "learning_rate": 8.393565561729494e-06, "loss": 0.3753, "step": 1929 }, { "epoch": 2.981846272692159, "grad_norm": 0.3370641596322351, "learning_rate": 8.382914519805038e-06, "loss": 0.3865, "step": 1930 }, { "epoch": 2.9833912707609116, "grad_norm": 0.39828880281255974, "learning_rate": 8.372265361305318e-06, "loss": 0.375, "step": 1931 }, { "epoch": 2.984936268829664, "grad_norm": 0.3509210589081975, "learning_rate": 8.361618098633447e-06, "loss": 0.3882, "step": 1932 }, { "epoch": 2.9864812668984166, "grad_norm": 0.32143059852995304, "learning_rate": 8.350972744190315e-06, "loss": 0.3628, "step": 1933 }, { "epoch": 2.9880262649671687, "grad_norm": 0.40462411169516604, "learning_rate": 8.340329310374615e-06, "loss": 0.4045, "step": 1934 }, { "epoch": 2.9895712630359212, "grad_norm": 0.3308810060641629, "learning_rate": 8.329687809582783e-06, "loss": 0.3419, "step": 1935 }, { "epoch": 2.9911162611046738, "grad_norm": 0.37756880497026074, "learning_rate": 8.319048254209008e-06, "loss": 0.3561, "step": 1936 }, { "epoch": 2.992661259173426, "grad_norm": 0.40392657935561116, "learning_rate": 8.308410656645219e-06, "loss": 0.4015, "step": 1937 }, { "epoch": 2.9942062572421784, "grad_norm": 0.310906995607671, "learning_rate": 8.297775029281059e-06, "loss": 0.3457, "step": 1938 }, { "epoch": 2.995751255310931, "grad_norm": 0.3936651896162998, "learning_rate": 8.287141384503875e-06, "loss": 0.3616, "step": 1939 }, { "epoch": 2.9972962533796834, "grad_norm": 0.36600560073789795, "learning_rate": 8.276509734698713e-06, "loss": 0.3656, "step": 1940 }, { "epoch": 2.998841251448436, "grad_norm": 0.33792106105923625, "learning_rate": 8.265880092248287e-06, "loss": 0.36, "step": 1941 }, { "epoch": 3.000386249517188, "grad_norm": 0.7884024451283067, "learning_rate": 8.255252469532976e-06, "loss": 0.6218, "step": 1942 }, { "epoch": 3.0019312475859405, "grad_norm": 0.41136239903748256, "learning_rate": 8.244626878930809e-06, "loss": 0.334, "step": 1943 }, { "epoch": 3.003476245654693, "grad_norm": 0.38557782826067777, "learning_rate": 8.234003332817444e-06, "loss": 0.3216, "step": 1944 }, { "epoch": 3.005021243723445, "grad_norm": 0.42737578312770047, "learning_rate": 8.22338184356616e-06, "loss": 0.3156, "step": 1945 }, { "epoch": 3.0065662417921977, "grad_norm": 0.49182535843540254, "learning_rate": 8.21276242354784e-06, "loss": 0.3526, "step": 1946 }, { "epoch": 3.00811123986095, "grad_norm": 0.3531231830588724, "learning_rate": 8.202145085130962e-06, "loss": 0.3045, "step": 1947 }, { "epoch": 3.0096562379297027, "grad_norm": 0.44744729912865344, "learning_rate": 8.191529840681566e-06, "loss": 0.3312, "step": 1948 }, { "epoch": 3.011201235998455, "grad_norm": 0.3978997469153042, "learning_rate": 8.180916702563269e-06, "loss": 0.332, "step": 1949 }, { "epoch": 3.0127462340672073, "grad_norm": 0.3803819680392497, "learning_rate": 8.170305683137224e-06, "loss": 0.3168, "step": 1950 }, { "epoch": 3.01429123213596, "grad_norm": 0.4060768177545252, "learning_rate": 8.159696794762117e-06, "loss": 0.3586, "step": 1951 }, { "epoch": 3.0158362302047124, "grad_norm": 0.40996850607772845, "learning_rate": 8.149090049794167e-06, "loss": 0.3433, "step": 1952 }, { "epoch": 3.0173812282734644, "grad_norm": 0.36604597032908637, "learning_rate": 8.138485460587073e-06, "loss": 0.3121, "step": 1953 }, { "epoch": 3.018926226342217, "grad_norm": 0.3863732691577218, "learning_rate": 8.127883039492034e-06, "loss": 0.3368, "step": 1954 }, { "epoch": 3.0204712244109695, "grad_norm": 0.45669908496616113, "learning_rate": 8.117282798857733e-06, "loss": 0.3365, "step": 1955 }, { "epoch": 3.022016222479722, "grad_norm": 0.3168426641482655, "learning_rate": 8.106684751030304e-06, "loss": 0.3018, "step": 1956 }, { "epoch": 3.0235612205484745, "grad_norm": 0.3563055027818958, "learning_rate": 8.096088908353316e-06, "loss": 0.3237, "step": 1957 }, { "epoch": 3.0251062186172266, "grad_norm": 0.38662101865397397, "learning_rate": 8.085495283167795e-06, "loss": 0.3016, "step": 1958 }, { "epoch": 3.026651216685979, "grad_norm": 0.3908623085978282, "learning_rate": 8.074903887812164e-06, "loss": 0.3678, "step": 1959 }, { "epoch": 3.0281962147547317, "grad_norm": 0.3517050583162218, "learning_rate": 8.064314734622261e-06, "loss": 0.3113, "step": 1960 }, { "epoch": 3.029741212823484, "grad_norm": 0.3228214063624704, "learning_rate": 8.053727835931296e-06, "loss": 0.2936, "step": 1961 }, { "epoch": 3.0312862108922363, "grad_norm": 0.359669390736013, "learning_rate": 8.043143204069878e-06, "loss": 0.3352, "step": 1962 }, { "epoch": 3.032831208960989, "grad_norm": 0.308843952012322, "learning_rate": 8.032560851365958e-06, "loss": 0.2842, "step": 1963 }, { "epoch": 3.0343762070297413, "grad_norm": 0.34620229646133166, "learning_rate": 8.021980790144828e-06, "loss": 0.3245, "step": 1964 }, { "epoch": 3.035921205098494, "grad_norm": 0.36237483206996585, "learning_rate": 8.011403032729132e-06, "loss": 0.3311, "step": 1965 }, { "epoch": 3.037466203167246, "grad_norm": 0.3222689549420564, "learning_rate": 8.000827591438813e-06, "loss": 0.3432, "step": 1966 }, { "epoch": 3.0390112012359984, "grad_norm": 0.30796098539082056, "learning_rate": 7.99025447859112e-06, "loss": 0.3114, "step": 1967 }, { "epoch": 3.040556199304751, "grad_norm": 0.33516739593446654, "learning_rate": 7.979683706500597e-06, "loss": 0.3053, "step": 1968 }, { "epoch": 3.0421011973735035, "grad_norm": 0.33592661681666536, "learning_rate": 7.969115287479054e-06, "loss": 0.3487, "step": 1969 }, { "epoch": 3.0436461954422556, "grad_norm": 0.30802980799402857, "learning_rate": 7.958549233835564e-06, "loss": 0.3227, "step": 1970 }, { "epoch": 3.045191193511008, "grad_norm": 0.32428406489386286, "learning_rate": 7.947985557876443e-06, "loss": 0.3298, "step": 1971 }, { "epoch": 3.0467361915797606, "grad_norm": 0.33083165390163505, "learning_rate": 7.937424271905239e-06, "loss": 0.2987, "step": 1972 }, { "epoch": 3.048281189648513, "grad_norm": 0.30363848445527514, "learning_rate": 7.926865388222716e-06, "loss": 0.3177, "step": 1973 }, { "epoch": 3.049826187717265, "grad_norm": 0.3042459141183939, "learning_rate": 7.916308919126843e-06, "loss": 0.3421, "step": 1974 }, { "epoch": 3.0513711857860177, "grad_norm": 0.34952555370647176, "learning_rate": 7.905754876912772e-06, "loss": 0.3556, "step": 1975 }, { "epoch": 3.0529161838547703, "grad_norm": 0.339210302082838, "learning_rate": 7.895203273872827e-06, "loss": 0.3274, "step": 1976 }, { "epoch": 3.054461181923523, "grad_norm": 0.2826868819966703, "learning_rate": 7.8846541222965e-06, "loss": 0.2865, "step": 1977 }, { "epoch": 3.056006179992275, "grad_norm": 0.32180958949436134, "learning_rate": 7.874107434470421e-06, "loss": 0.3625, "step": 1978 }, { "epoch": 3.0575511780610274, "grad_norm": 0.31396555494460543, "learning_rate": 7.863563222678347e-06, "loss": 0.3288, "step": 1979 }, { "epoch": 3.05909617612978, "grad_norm": 0.3126639314271169, "learning_rate": 7.853021499201166e-06, "loss": 0.2777, "step": 1980 }, { "epoch": 3.0606411741985324, "grad_norm": 0.31160175280762303, "learning_rate": 7.842482276316849e-06, "loss": 0.3397, "step": 1981 }, { "epoch": 3.0621861722672845, "grad_norm": 0.31100151834640594, "learning_rate": 7.831945566300462e-06, "loss": 0.2956, "step": 1982 }, { "epoch": 3.063731170336037, "grad_norm": 0.30758412203375307, "learning_rate": 7.821411381424155e-06, "loss": 0.33, "step": 1983 }, { "epoch": 3.0652761684047896, "grad_norm": 0.3076587795492542, "learning_rate": 7.81087973395712e-06, "loss": 0.3128, "step": 1984 }, { "epoch": 3.066821166473542, "grad_norm": 0.2918417125199751, "learning_rate": 7.800350636165598e-06, "loss": 0.3154, "step": 1985 }, { "epoch": 3.068366164542294, "grad_norm": 0.3238750849067064, "learning_rate": 7.789824100312875e-06, "loss": 0.313, "step": 1986 }, { "epoch": 3.0699111626110467, "grad_norm": 0.3014540710270625, "learning_rate": 7.779300138659236e-06, "loss": 0.3125, "step": 1987 }, { "epoch": 3.071456160679799, "grad_norm": 0.31200892317526047, "learning_rate": 7.76877876346197e-06, "loss": 0.3581, "step": 1988 }, { "epoch": 3.0730011587485517, "grad_norm": 0.30585921724220005, "learning_rate": 7.758259986975356e-06, "loss": 0.2841, "step": 1989 }, { "epoch": 3.074546156817304, "grad_norm": 0.375228761165033, "learning_rate": 7.747743821450655e-06, "loss": 0.3188, "step": 1990 }, { "epoch": 3.0760911548860563, "grad_norm": 0.3350819319157019, "learning_rate": 7.737230279136079e-06, "loss": 0.3459, "step": 1991 }, { "epoch": 3.077636152954809, "grad_norm": 0.39013391109305673, "learning_rate": 7.726719372276773e-06, "loss": 0.3285, "step": 1992 }, { "epoch": 3.0791811510235614, "grad_norm": 0.33211513329445735, "learning_rate": 7.716211113114837e-06, "loss": 0.3122, "step": 1993 }, { "epoch": 3.0807261490923135, "grad_norm": 0.3206731155619347, "learning_rate": 7.705705513889272e-06, "loss": 0.3209, "step": 1994 }, { "epoch": 3.082271147161066, "grad_norm": 0.32069768530522746, "learning_rate": 7.695202586835982e-06, "loss": 0.3113, "step": 1995 }, { "epoch": 3.0838161452298185, "grad_norm": 0.3241838019122408, "learning_rate": 7.68470234418776e-06, "loss": 0.3126, "step": 1996 }, { "epoch": 3.085361143298571, "grad_norm": 0.33491003881018966, "learning_rate": 7.674204798174274e-06, "loss": 0.3468, "step": 1997 }, { "epoch": 3.086906141367323, "grad_norm": 0.28363728227704965, "learning_rate": 7.663709961022047e-06, "loss": 0.2986, "step": 1998 }, { "epoch": 3.0884511394360756, "grad_norm": 0.32279458153433793, "learning_rate": 7.653217844954457e-06, "loss": 0.3233, "step": 1999 }, { "epoch": 3.089996137504828, "grad_norm": 0.28526838950564726, "learning_rate": 7.6427284621917e-06, "loss": 0.2882, "step": 2000 }, { "epoch": 3.0915411355735807, "grad_norm": 0.304610421649569, "learning_rate": 7.632241824950795e-06, "loss": 0.3226, "step": 2001 }, { "epoch": 3.0930861336423328, "grad_norm": 0.3036909973794187, "learning_rate": 7.621757945445566e-06, "loss": 0.3094, "step": 2002 }, { "epoch": 3.0946311317110853, "grad_norm": 0.31267060969745486, "learning_rate": 7.611276835886618e-06, "loss": 0.3258, "step": 2003 }, { "epoch": 3.096176129779838, "grad_norm": 0.31148989100417424, "learning_rate": 7.600798508481333e-06, "loss": 0.3388, "step": 2004 }, { "epoch": 3.0977211278485903, "grad_norm": 0.2824646007792682, "learning_rate": 7.590322975433857e-06, "loss": 0.3087, "step": 2005 }, { "epoch": 3.0992661259173424, "grad_norm": 0.2909734144095486, "learning_rate": 7.579850248945076e-06, "loss": 0.2977, "step": 2006 }, { "epoch": 3.100811123986095, "grad_norm": 0.30060098439672756, "learning_rate": 7.569380341212607e-06, "loss": 0.322, "step": 2007 }, { "epoch": 3.1023561220548475, "grad_norm": 0.33945241202534976, "learning_rate": 7.558913264430787e-06, "loss": 0.3394, "step": 2008 }, { "epoch": 3.1039011201236, "grad_norm": 0.2991750655405624, "learning_rate": 7.548449030790654e-06, "loss": 0.3311, "step": 2009 }, { "epoch": 3.105446118192352, "grad_norm": 0.30375865737056035, "learning_rate": 7.537987652479933e-06, "loss": 0.2999, "step": 2010 }, { "epoch": 3.1069911162611046, "grad_norm": 0.30206892316049744, "learning_rate": 7.527529141683033e-06, "loss": 0.3172, "step": 2011 }, { "epoch": 3.108536114329857, "grad_norm": 0.29866760647338564, "learning_rate": 7.51707351058101e-06, "loss": 0.3113, "step": 2012 }, { "epoch": 3.1100811123986096, "grad_norm": 0.323243751310988, "learning_rate": 7.506620771351571e-06, "loss": 0.3637, "step": 2013 }, { "epoch": 3.1116261104673617, "grad_norm": 0.3018218364508096, "learning_rate": 7.496170936169052e-06, "loss": 0.3046, "step": 2014 }, { "epoch": 3.1131711085361142, "grad_norm": 0.3241655560793737, "learning_rate": 7.4857240172044225e-06, "loss": 0.3098, "step": 2015 }, { "epoch": 3.1147161066048668, "grad_norm": 0.3361900566276189, "learning_rate": 7.475280026625231e-06, "loss": 0.3463, "step": 2016 }, { "epoch": 3.1162611046736193, "grad_norm": 0.3283291903875403, "learning_rate": 7.464838976595629e-06, "loss": 0.3366, "step": 2017 }, { "epoch": 3.117806102742372, "grad_norm": 0.3108628101587473, "learning_rate": 7.45440087927635e-06, "loss": 0.3214, "step": 2018 }, { "epoch": 3.119351100811124, "grad_norm": 0.3190568790064714, "learning_rate": 7.443965746824675e-06, "loss": 0.3297, "step": 2019 }, { "epoch": 3.1208960988798764, "grad_norm": 0.3741097089121587, "learning_rate": 7.433533591394431e-06, "loss": 0.337, "step": 2020 }, { "epoch": 3.122441096948629, "grad_norm": 0.2983209834019764, "learning_rate": 7.4231044251359915e-06, "loss": 0.3, "step": 2021 }, { "epoch": 3.123986095017381, "grad_norm": 0.32005915220299286, "learning_rate": 7.412678260196239e-06, "loss": 0.324, "step": 2022 }, { "epoch": 3.1255310930861335, "grad_norm": 0.2959414585390334, "learning_rate": 7.402255108718557e-06, "loss": 0.318, "step": 2023 }, { "epoch": 3.127076091154886, "grad_norm": 0.3270026006004426, "learning_rate": 7.391834982842832e-06, "loss": 0.3506, "step": 2024 }, { "epoch": 3.1286210892236386, "grad_norm": 0.30440766531275365, "learning_rate": 7.381417894705412e-06, "loss": 0.3357, "step": 2025 }, { "epoch": 3.130166087292391, "grad_norm": 0.30062877503735863, "learning_rate": 7.3710038564391136e-06, "loss": 0.299, "step": 2026 }, { "epoch": 3.131711085361143, "grad_norm": 0.30250903548825736, "learning_rate": 7.360592880173206e-06, "loss": 0.3228, "step": 2027 }, { "epoch": 3.1332560834298957, "grad_norm": 0.30006741172402757, "learning_rate": 7.350184978033386e-06, "loss": 0.3033, "step": 2028 }, { "epoch": 3.1348010814986482, "grad_norm": 0.3030524979542656, "learning_rate": 7.339780162141766e-06, "loss": 0.3299, "step": 2029 }, { "epoch": 3.1363460795674007, "grad_norm": 0.32601249704264784, "learning_rate": 7.3293784446168744e-06, "loss": 0.323, "step": 2030 }, { "epoch": 3.137891077636153, "grad_norm": 0.3280829831747014, "learning_rate": 7.3189798375736244e-06, "loss": 0.3314, "step": 2031 }, { "epoch": 3.1394360757049053, "grad_norm": 0.33861184901049984, "learning_rate": 7.308584353123304e-06, "loss": 0.3173, "step": 2032 }, { "epoch": 3.140981073773658, "grad_norm": 0.3217425801964215, "learning_rate": 7.298192003373574e-06, "loss": 0.3305, "step": 2033 }, { "epoch": 3.1425260718424104, "grad_norm": 0.3356774915195005, "learning_rate": 7.287802800428436e-06, "loss": 0.2906, "step": 2034 }, { "epoch": 3.1440710699111625, "grad_norm": 0.31040933404456317, "learning_rate": 7.277416756388225e-06, "loss": 0.321, "step": 2035 }, { "epoch": 3.145616067979915, "grad_norm": 0.31775670109709836, "learning_rate": 7.267033883349604e-06, "loss": 0.3169, "step": 2036 }, { "epoch": 3.1471610660486675, "grad_norm": 0.3446460689329911, "learning_rate": 7.256654193405538e-06, "loss": 0.3251, "step": 2037 }, { "epoch": 3.14870606411742, "grad_norm": 0.350018120631074, "learning_rate": 7.246277698645282e-06, "loss": 0.3241, "step": 2038 }, { "epoch": 3.150251062186172, "grad_norm": 0.33844654887442527, "learning_rate": 7.235904411154384e-06, "loss": 0.3343, "step": 2039 }, { "epoch": 3.1517960602549246, "grad_norm": 0.34759277688664425, "learning_rate": 7.225534343014635e-06, "loss": 0.326, "step": 2040 }, { "epoch": 3.153341058323677, "grad_norm": 0.3761804484694728, "learning_rate": 7.215167506304093e-06, "loss": 0.3337, "step": 2041 }, { "epoch": 3.1548860563924297, "grad_norm": 0.3036808549726752, "learning_rate": 7.2048039130970405e-06, "loss": 0.3026, "step": 2042 }, { "epoch": 3.1564310544611818, "grad_norm": 0.3308959534258229, "learning_rate": 7.194443575463997e-06, "loss": 0.3175, "step": 2043 }, { "epoch": 3.1579760525299343, "grad_norm": 0.4039625798245503, "learning_rate": 7.184086505471677e-06, "loss": 0.3389, "step": 2044 }, { "epoch": 3.159521050598687, "grad_norm": 0.3271092791236419, "learning_rate": 7.1737327151829886e-06, "loss": 0.3176, "step": 2045 }, { "epoch": 3.1610660486674393, "grad_norm": 0.3017867318273709, "learning_rate": 7.163382216657033e-06, "loss": 0.3395, "step": 2046 }, { "epoch": 3.1626110467361914, "grad_norm": 0.3072626547475011, "learning_rate": 7.153035021949071e-06, "loss": 0.3291, "step": 2047 }, { "epoch": 3.164156044804944, "grad_norm": 0.33070539685114875, "learning_rate": 7.142691143110503e-06, "loss": 0.3106, "step": 2048 }, { "epoch": 3.1657010428736965, "grad_norm": 0.34165141169244423, "learning_rate": 7.13235059218889e-06, "loss": 0.3256, "step": 2049 }, { "epoch": 3.167246040942449, "grad_norm": 0.30789275738121263, "learning_rate": 7.122013381227903e-06, "loss": 0.3255, "step": 2050 }, { "epoch": 3.168791039011201, "grad_norm": 0.3050434073330102, "learning_rate": 7.111679522267316e-06, "loss": 0.3234, "step": 2051 }, { "epoch": 3.1703360370799536, "grad_norm": 0.30472613845506935, "learning_rate": 7.10134902734302e-06, "loss": 0.2926, "step": 2052 }, { "epoch": 3.171881035148706, "grad_norm": 0.3240729947975448, "learning_rate": 7.091021908486972e-06, "loss": 0.3248, "step": 2053 }, { "epoch": 3.1734260332174586, "grad_norm": 0.30459117386909057, "learning_rate": 7.080698177727197e-06, "loss": 0.3012, "step": 2054 }, { "epoch": 3.1749710312862107, "grad_norm": 0.3019228803061939, "learning_rate": 7.070377847087785e-06, "loss": 0.3235, "step": 2055 }, { "epoch": 3.1765160293549632, "grad_norm": 0.308196996486944, "learning_rate": 7.060060928588854e-06, "loss": 0.3039, "step": 2056 }, { "epoch": 3.1780610274237158, "grad_norm": 0.3296019013877957, "learning_rate": 7.049747434246553e-06, "loss": 0.3179, "step": 2057 }, { "epoch": 3.1796060254924683, "grad_norm": 0.28393548048780604, "learning_rate": 7.039437376073046e-06, "loss": 0.3044, "step": 2058 }, { "epoch": 3.1811510235612204, "grad_norm": 0.31412331081483336, "learning_rate": 7.029130766076487e-06, "loss": 0.311, "step": 2059 }, { "epoch": 3.182696021629973, "grad_norm": 0.35929585889556886, "learning_rate": 7.018827616261019e-06, "loss": 0.3429, "step": 2060 }, { "epoch": 3.1842410196987254, "grad_norm": 0.31259887222877736, "learning_rate": 7.008527938626758e-06, "loss": 0.3192, "step": 2061 }, { "epoch": 3.185786017767478, "grad_norm": 0.33647655458832076, "learning_rate": 6.998231745169768e-06, "loss": 0.3372, "step": 2062 }, { "epoch": 3.18733101583623, "grad_norm": 0.34576919204731715, "learning_rate": 6.98793904788206e-06, "loss": 0.3409, "step": 2063 }, { "epoch": 3.1888760139049825, "grad_norm": 0.34336624485341644, "learning_rate": 6.977649858751574e-06, "loss": 0.3378, "step": 2064 }, { "epoch": 3.190421011973735, "grad_norm": 0.31616612559537477, "learning_rate": 6.967364189762161e-06, "loss": 0.3277, "step": 2065 }, { "epoch": 3.1919660100424876, "grad_norm": 0.2991788838981259, "learning_rate": 6.957082052893569e-06, "loss": 0.3091, "step": 2066 }, { "epoch": 3.1935110081112397, "grad_norm": 0.29740484486390106, "learning_rate": 6.946803460121447e-06, "loss": 0.3137, "step": 2067 }, { "epoch": 3.195056006179992, "grad_norm": 0.33067825144047314, "learning_rate": 6.936528423417297e-06, "loss": 0.3316, "step": 2068 }, { "epoch": 3.1966010042487447, "grad_norm": 0.3253881816390596, "learning_rate": 6.9262569547484895e-06, "loss": 0.3019, "step": 2069 }, { "epoch": 3.1981460023174972, "grad_norm": 0.35714210851306327, "learning_rate": 6.915989066078236e-06, "loss": 0.3094, "step": 2070 }, { "epoch": 3.1996910003862493, "grad_norm": 0.33828151857375915, "learning_rate": 6.905724769365584e-06, "loss": 0.3173, "step": 2071 }, { "epoch": 3.201235998455002, "grad_norm": 0.3258581626413422, "learning_rate": 6.89546407656539e-06, "loss": 0.3355, "step": 2072 }, { "epoch": 3.2027809965237544, "grad_norm": 0.31619364898142527, "learning_rate": 6.885206999628316e-06, "loss": 0.3253, "step": 2073 }, { "epoch": 3.204325994592507, "grad_norm": 0.3245215682117331, "learning_rate": 6.874953550500818e-06, "loss": 0.3128, "step": 2074 }, { "epoch": 3.205870992661259, "grad_norm": 0.3492998554629684, "learning_rate": 6.864703741125118e-06, "loss": 0.3335, "step": 2075 }, { "epoch": 3.2074159907300115, "grad_norm": 0.30558361699517855, "learning_rate": 6.854457583439198e-06, "loss": 0.3209, "step": 2076 }, { "epoch": 3.208960988798764, "grad_norm": 0.3099179747531365, "learning_rate": 6.8442150893768e-06, "loss": 0.319, "step": 2077 }, { "epoch": 3.2105059868675165, "grad_norm": 0.3157714570332714, "learning_rate": 6.833976270867389e-06, "loss": 0.3277, "step": 2078 }, { "epoch": 3.212050984936269, "grad_norm": 0.34331191286021584, "learning_rate": 6.823741139836141e-06, "loss": 0.3082, "step": 2079 }, { "epoch": 3.213595983005021, "grad_norm": 0.36280428217881505, "learning_rate": 6.813509708203957e-06, "loss": 0.3194, "step": 2080 }, { "epoch": 3.2151409810737737, "grad_norm": 0.32816061819420744, "learning_rate": 6.803281987887415e-06, "loss": 0.3342, "step": 2081 }, { "epoch": 3.216685979142526, "grad_norm": 0.2866495948592929, "learning_rate": 6.793057990798771e-06, "loss": 0.3038, "step": 2082 }, { "epoch": 3.2182309772112783, "grad_norm": 0.3271633964261483, "learning_rate": 6.782837728845955e-06, "loss": 0.3502, "step": 2083 }, { "epoch": 3.219775975280031, "grad_norm": 0.3366728652771696, "learning_rate": 6.772621213932534e-06, "loss": 0.3375, "step": 2084 }, { "epoch": 3.2213209733487833, "grad_norm": 0.30722878333647485, "learning_rate": 6.762408457957717e-06, "loss": 0.3292, "step": 2085 }, { "epoch": 3.222865971417536, "grad_norm": 0.2979270640399415, "learning_rate": 6.752199472816334e-06, "loss": 0.3285, "step": 2086 }, { "epoch": 3.2244109694862884, "grad_norm": 0.371411840945949, "learning_rate": 6.741994270398826e-06, "loss": 0.3479, "step": 2087 }, { "epoch": 3.2259559675550404, "grad_norm": 0.3122055816138479, "learning_rate": 6.7317928625912224e-06, "loss": 0.3293, "step": 2088 }, { "epoch": 3.227500965623793, "grad_norm": 0.29151359437983104, "learning_rate": 6.721595261275138e-06, "loss": 0.301, "step": 2089 }, { "epoch": 3.2290459636925455, "grad_norm": 0.365446720853779, "learning_rate": 6.711401478327753e-06, "loss": 0.382, "step": 2090 }, { "epoch": 3.2305909617612976, "grad_norm": 0.2982950900755694, "learning_rate": 6.7012115256217955e-06, "loss": 0.3185, "step": 2091 }, { "epoch": 3.23213595983005, "grad_norm": 0.32904392410846933, "learning_rate": 6.691025415025543e-06, "loss": 0.3037, "step": 2092 }, { "epoch": 3.2336809578988026, "grad_norm": 0.38552940351689985, "learning_rate": 6.680843158402787e-06, "loss": 0.3249, "step": 2093 }, { "epoch": 3.235225955967555, "grad_norm": 0.4798538635236364, "learning_rate": 6.670664767612834e-06, "loss": 0.3329, "step": 2094 }, { "epoch": 3.2367709540363077, "grad_norm": 0.2868402678908147, "learning_rate": 6.6604902545104945e-06, "loss": 0.3151, "step": 2095 }, { "epoch": 3.2383159521050597, "grad_norm": 0.30295128778510166, "learning_rate": 6.650319630946054e-06, "loss": 0.3459, "step": 2096 }, { "epoch": 3.2398609501738123, "grad_norm": 0.28477829890464335, "learning_rate": 6.640152908765271e-06, "loss": 0.2829, "step": 2097 }, { "epoch": 3.241405948242565, "grad_norm": 0.3043821765443384, "learning_rate": 6.6299900998093584e-06, "loss": 0.3342, "step": 2098 }, { "epoch": 3.2429509463113173, "grad_norm": 0.28882521212213424, "learning_rate": 6.619831215914974e-06, "loss": 0.3063, "step": 2099 }, { "epoch": 3.2444959443800694, "grad_norm": 0.3019053305770242, "learning_rate": 6.6096762689142055e-06, "loss": 0.3427, "step": 2100 }, { "epoch": 3.246040942448822, "grad_norm": 0.3072868697438517, "learning_rate": 6.599525270634547e-06, "loss": 0.3128, "step": 2101 }, { "epoch": 3.2475859405175744, "grad_norm": 0.3809685909950267, "learning_rate": 6.589378232898908e-06, "loss": 0.3457, "step": 2102 }, { "epoch": 3.249130938586327, "grad_norm": 0.2986082902350497, "learning_rate": 6.57923516752557e-06, "loss": 0.2906, "step": 2103 }, { "epoch": 3.250675936655079, "grad_norm": 0.332214871396409, "learning_rate": 6.569096086328193e-06, "loss": 0.337, "step": 2104 }, { "epoch": 3.2522209347238316, "grad_norm": 0.29882924577291337, "learning_rate": 6.558961001115804e-06, "loss": 0.3158, "step": 2105 }, { "epoch": 3.253765932792584, "grad_norm": 0.36285775334486814, "learning_rate": 6.54882992369277e-06, "loss": 0.3582, "step": 2106 }, { "epoch": 3.2553109308613366, "grad_norm": 0.3343521837112809, "learning_rate": 6.538702865858779e-06, "loss": 0.3319, "step": 2107 }, { "epoch": 3.2568559289300887, "grad_norm": 0.32710514230893734, "learning_rate": 6.528579839408862e-06, "loss": 0.3114, "step": 2108 }, { "epoch": 3.258400926998841, "grad_norm": 0.35437302562272066, "learning_rate": 6.518460856133333e-06, "loss": 0.3221, "step": 2109 }, { "epoch": 3.2599459250675937, "grad_norm": 0.2948561681272042, "learning_rate": 6.5083459278178055e-06, "loss": 0.3125, "step": 2110 }, { "epoch": 3.2614909231363463, "grad_norm": 0.3327096898152124, "learning_rate": 6.498235066243173e-06, "loss": 0.3233, "step": 2111 }, { "epoch": 3.2630359212050983, "grad_norm": 0.3491172613140132, "learning_rate": 6.488128283185587e-06, "loss": 0.3188, "step": 2112 }, { "epoch": 3.264580919273851, "grad_norm": 0.33942768190488043, "learning_rate": 6.478025590416448e-06, "loss": 0.3387, "step": 2113 }, { "epoch": 3.2661259173426034, "grad_norm": 0.3148019556965914, "learning_rate": 6.467926999702398e-06, "loss": 0.3363, "step": 2114 }, { "epoch": 3.267670915411356, "grad_norm": 0.3087260630845642, "learning_rate": 6.457832522805301e-06, "loss": 0.3226, "step": 2115 }, { "epoch": 3.269215913480108, "grad_norm": 0.2994605193902773, "learning_rate": 6.447742171482222e-06, "loss": 0.2829, "step": 2116 }, { "epoch": 3.2707609115488605, "grad_norm": 0.3162179077575026, "learning_rate": 6.43765595748543e-06, "loss": 0.3358, "step": 2117 }, { "epoch": 3.272305909617613, "grad_norm": 0.31149071455352867, "learning_rate": 6.427573892562371e-06, "loss": 0.3198, "step": 2118 }, { "epoch": 3.2738509076863656, "grad_norm": 0.3130243430360272, "learning_rate": 6.417495988455654e-06, "loss": 0.3176, "step": 2119 }, { "epoch": 3.2753959057551176, "grad_norm": 0.27271929828869107, "learning_rate": 6.407422256903053e-06, "loss": 0.2959, "step": 2120 }, { "epoch": 3.27694090382387, "grad_norm": 0.296875976677433, "learning_rate": 6.3973527096374755e-06, "loss": 0.3182, "step": 2121 }, { "epoch": 3.2784859018926227, "grad_norm": 0.3269470714609401, "learning_rate": 6.387287358386951e-06, "loss": 0.3394, "step": 2122 }, { "epoch": 3.280030899961375, "grad_norm": 0.3306813867980907, "learning_rate": 6.377226214874632e-06, "loss": 0.3214, "step": 2123 }, { "epoch": 3.2815758980301273, "grad_norm": 0.28955380829471505, "learning_rate": 6.367169290818764e-06, "loss": 0.3376, "step": 2124 }, { "epoch": 3.28312089609888, "grad_norm": 0.2975387284274261, "learning_rate": 6.357116597932678e-06, "loss": 0.3177, "step": 2125 }, { "epoch": 3.2846658941676323, "grad_norm": 0.30428738369842534, "learning_rate": 6.347068147924777e-06, "loss": 0.3232, "step": 2126 }, { "epoch": 3.286210892236385, "grad_norm": 0.32222213930482607, "learning_rate": 6.337023952498528e-06, "loss": 0.3272, "step": 2127 }, { "epoch": 3.287755890305137, "grad_norm": 0.2932860647027404, "learning_rate": 6.326984023352435e-06, "loss": 0.3266, "step": 2128 }, { "epoch": 3.2893008883738895, "grad_norm": 0.3148661002616021, "learning_rate": 6.316948372180033e-06, "loss": 0.3038, "step": 2129 }, { "epoch": 3.290845886442642, "grad_norm": 0.28731529331513167, "learning_rate": 6.306917010669887e-06, "loss": 0.3024, "step": 2130 }, { "epoch": 3.2923908845113945, "grad_norm": 0.28653310886300043, "learning_rate": 6.296889950505549e-06, "loss": 0.3393, "step": 2131 }, { "epoch": 3.293935882580147, "grad_norm": 0.29035156846679866, "learning_rate": 6.286867203365567e-06, "loss": 0.3348, "step": 2132 }, { "epoch": 3.295480880648899, "grad_norm": 0.2836383679593379, "learning_rate": 6.276848780923477e-06, "loss": 0.2971, "step": 2133 }, { "epoch": 3.2970258787176516, "grad_norm": 0.3000703288967473, "learning_rate": 6.266834694847761e-06, "loss": 0.3474, "step": 2134 }, { "epoch": 3.298570876786404, "grad_norm": 0.3200438298821115, "learning_rate": 6.256824956801855e-06, "loss": 0.3502, "step": 2135 }, { "epoch": 3.3001158748551562, "grad_norm": 0.2790993174208608, "learning_rate": 6.24681957844414e-06, "loss": 0.3249, "step": 2136 }, { "epoch": 3.3016608729239088, "grad_norm": 0.2991950969267082, "learning_rate": 6.236818571427914e-06, "loss": 0.3229, "step": 2137 }, { "epoch": 3.3032058709926613, "grad_norm": 0.32849858022653744, "learning_rate": 6.226821947401371e-06, "loss": 0.3516, "step": 2138 }, { "epoch": 3.304750869061414, "grad_norm": 0.2952936305787952, "learning_rate": 6.216829718007626e-06, "loss": 0.3307, "step": 2139 }, { "epoch": 3.3062958671301663, "grad_norm": 0.2926171259132466, "learning_rate": 6.206841894884652e-06, "loss": 0.2829, "step": 2140 }, { "epoch": 3.3078408651989184, "grad_norm": 0.29872188548930834, "learning_rate": 6.1968584896653024e-06, "loss": 0.33, "step": 2141 }, { "epoch": 3.309385863267671, "grad_norm": 0.2872084939935575, "learning_rate": 6.186879513977282e-06, "loss": 0.3092, "step": 2142 }, { "epoch": 3.3109308613364234, "grad_norm": 0.3231832994578448, "learning_rate": 6.176904979443137e-06, "loss": 0.321, "step": 2143 }, { "epoch": 3.3124758594051755, "grad_norm": 0.30650497191872844, "learning_rate": 6.166934897680235e-06, "loss": 0.3059, "step": 2144 }, { "epoch": 3.314020857473928, "grad_norm": 0.2873449907673149, "learning_rate": 6.1569692803007704e-06, "loss": 0.3127, "step": 2145 }, { "epoch": 3.3155658555426806, "grad_norm": 0.296086516056076, "learning_rate": 6.147008138911726e-06, "loss": 0.3236, "step": 2146 }, { "epoch": 3.317110853611433, "grad_norm": 0.28483034632816595, "learning_rate": 6.137051485114876e-06, "loss": 0.3414, "step": 2147 }, { "epoch": 3.3186558516801856, "grad_norm": 0.3110587357486399, "learning_rate": 6.127099330506767e-06, "loss": 0.3249, "step": 2148 }, { "epoch": 3.3202008497489377, "grad_norm": 0.3338260387944192, "learning_rate": 6.117151686678709e-06, "loss": 0.3434, "step": 2149 }, { "epoch": 3.3217458478176902, "grad_norm": 0.2762650761530332, "learning_rate": 6.10720856521675e-06, "loss": 0.2817, "step": 2150 }, { "epoch": 3.3232908458864427, "grad_norm": 0.28406837745293795, "learning_rate": 6.097269977701682e-06, "loss": 0.3408, "step": 2151 }, { "epoch": 3.324835843955195, "grad_norm": 0.3159990309856077, "learning_rate": 6.087335935709007e-06, "loss": 0.3398, "step": 2152 }, { "epoch": 3.3263808420239473, "grad_norm": 0.31687125526308096, "learning_rate": 6.077406450808936e-06, "loss": 0.3131, "step": 2153 }, { "epoch": 3.3279258400927, "grad_norm": 0.3228890382581072, "learning_rate": 6.067481534566373e-06, "loss": 0.3353, "step": 2154 }, { "epoch": 3.3294708381614524, "grad_norm": 0.30655674152913354, "learning_rate": 6.0575611985409e-06, "loss": 0.331, "step": 2155 }, { "epoch": 3.331015836230205, "grad_norm": 0.2890344119225039, "learning_rate": 6.047645454286766e-06, "loss": 0.3117, "step": 2156 }, { "epoch": 3.332560834298957, "grad_norm": 0.33760835295942354, "learning_rate": 6.037734313352867e-06, "loss": 0.3368, "step": 2157 }, { "epoch": 3.3341058323677095, "grad_norm": 0.3253283473574128, "learning_rate": 6.027827787282748e-06, "loss": 0.3082, "step": 2158 }, { "epoch": 3.335650830436462, "grad_norm": 0.2897352535879886, "learning_rate": 6.017925887614568e-06, "loss": 0.3, "step": 2159 }, { "epoch": 3.337195828505214, "grad_norm": 0.2955233160657908, "learning_rate": 6.008028625881097e-06, "loss": 0.3243, "step": 2160 }, { "epoch": 3.3387408265739666, "grad_norm": 0.32277690918307483, "learning_rate": 5.998136013609719e-06, "loss": 0.3192, "step": 2161 }, { "epoch": 3.340285824642719, "grad_norm": 0.2813433861438704, "learning_rate": 5.988248062322386e-06, "loss": 0.3018, "step": 2162 }, { "epoch": 3.3418308227114717, "grad_norm": 0.33556231369701195, "learning_rate": 5.9783647835356205e-06, "loss": 0.3427, "step": 2163 }, { "epoch": 3.343375820780224, "grad_norm": 0.27903381009972933, "learning_rate": 5.968486188760522e-06, "loss": 0.3101, "step": 2164 }, { "epoch": 3.3449208188489763, "grad_norm": 0.2975222766300107, "learning_rate": 5.958612289502717e-06, "loss": 0.3408, "step": 2165 }, { "epoch": 3.346465816917729, "grad_norm": 0.28742457323302933, "learning_rate": 5.948743097262362e-06, "loss": 0.2879, "step": 2166 }, { "epoch": 3.3480108149864813, "grad_norm": 0.33765388485959563, "learning_rate": 5.938878623534146e-06, "loss": 0.3233, "step": 2167 }, { "epoch": 3.349555813055234, "grad_norm": 0.28038682698947376, "learning_rate": 5.929018879807251e-06, "loss": 0.2952, "step": 2168 }, { "epoch": 3.351100811123986, "grad_norm": 0.2927492934637609, "learning_rate": 5.919163877565351e-06, "loss": 0.3222, "step": 2169 }, { "epoch": 3.3526458091927385, "grad_norm": 0.3080196352611929, "learning_rate": 5.9093136282866014e-06, "loss": 0.3634, "step": 2170 }, { "epoch": 3.354190807261491, "grad_norm": 0.2823896192396401, "learning_rate": 5.899468143443619e-06, "loss": 0.307, "step": 2171 }, { "epoch": 3.3557358053302435, "grad_norm": 0.3109390761464888, "learning_rate": 5.889627434503471e-06, "loss": 0.3293, "step": 2172 }, { "epoch": 3.3572808033989956, "grad_norm": 0.298851956421623, "learning_rate": 5.879791512927666e-06, "loss": 0.3075, "step": 2173 }, { "epoch": 3.358825801467748, "grad_norm": 0.33105321813931043, "learning_rate": 5.869960390172132e-06, "loss": 0.3636, "step": 2174 }, { "epoch": 3.3603707995365006, "grad_norm": 0.2726036122694373, "learning_rate": 5.860134077687206e-06, "loss": 0.2971, "step": 2175 }, { "epoch": 3.361915797605253, "grad_norm": 0.3169708065299759, "learning_rate": 5.850312586917633e-06, "loss": 0.3211, "step": 2176 }, { "epoch": 3.3634607956740052, "grad_norm": 0.3092352736008897, "learning_rate": 5.840495929302534e-06, "loss": 0.3261, "step": 2177 }, { "epoch": 3.3650057937427578, "grad_norm": 0.3013827075812925, "learning_rate": 5.830684116275399e-06, "loss": 0.2984, "step": 2178 }, { "epoch": 3.3665507918115103, "grad_norm": 0.3465164333641493, "learning_rate": 5.820877159264076e-06, "loss": 0.3451, "step": 2179 }, { "epoch": 3.368095789880263, "grad_norm": 0.29077128408034014, "learning_rate": 5.81107506969077e-06, "loss": 0.2956, "step": 2180 }, { "epoch": 3.369640787949015, "grad_norm": 0.31677518629789475, "learning_rate": 5.801277858972003e-06, "loss": 0.3458, "step": 2181 }, { "epoch": 3.3711857860177674, "grad_norm": 0.3040507887546481, "learning_rate": 5.791485538518609e-06, "loss": 0.295, "step": 2182 }, { "epoch": 3.37273078408652, "grad_norm": 0.3128835058310876, "learning_rate": 5.781698119735746e-06, "loss": 0.3672, "step": 2183 }, { "epoch": 3.3742757821552725, "grad_norm": 0.2907342364379208, "learning_rate": 5.771915614022849e-06, "loss": 0.3224, "step": 2184 }, { "epoch": 3.3758207802240245, "grad_norm": 0.32014471959534657, "learning_rate": 5.762138032773632e-06, "loss": 0.3523, "step": 2185 }, { "epoch": 3.377365778292777, "grad_norm": 0.28386760947114625, "learning_rate": 5.752365387376082e-06, "loss": 0.2751, "step": 2186 }, { "epoch": 3.3789107763615296, "grad_norm": 0.28339576558145446, "learning_rate": 5.742597689212427e-06, "loss": 0.3188, "step": 2187 }, { "epoch": 3.380455774430282, "grad_norm": 0.2900473735725753, "learning_rate": 5.732834949659137e-06, "loss": 0.3173, "step": 2188 }, { "epoch": 3.382000772499034, "grad_norm": 0.2951707486670391, "learning_rate": 5.723077180086908e-06, "loss": 0.3269, "step": 2189 }, { "epoch": 3.3835457705677867, "grad_norm": 0.30158203397634437, "learning_rate": 5.713324391860644e-06, "loss": 0.3261, "step": 2190 }, { "epoch": 3.3850907686365392, "grad_norm": 0.28878851001356914, "learning_rate": 5.703576596339446e-06, "loss": 0.3244, "step": 2191 }, { "epoch": 3.3866357667052918, "grad_norm": 0.2940470251906648, "learning_rate": 5.693833804876613e-06, "loss": 0.3213, "step": 2192 }, { "epoch": 3.388180764774044, "grad_norm": 0.317598196152587, "learning_rate": 5.684096028819599e-06, "loss": 0.3427, "step": 2193 }, { "epoch": 3.3897257628427964, "grad_norm": 0.28467757390432236, "learning_rate": 5.674363279510027e-06, "loss": 0.3244, "step": 2194 }, { "epoch": 3.391270760911549, "grad_norm": 0.2931704204030622, "learning_rate": 5.664635568283659e-06, "loss": 0.3129, "step": 2195 }, { "epoch": 3.3928157589803014, "grad_norm": 0.3025413526256767, "learning_rate": 5.654912906470391e-06, "loss": 0.321, "step": 2196 }, { "epoch": 3.3943607570490535, "grad_norm": 0.2941745857818625, "learning_rate": 5.64519530539424e-06, "loss": 0.3229, "step": 2197 }, { "epoch": 3.395905755117806, "grad_norm": 0.3115604266707593, "learning_rate": 5.635482776373331e-06, "loss": 0.3314, "step": 2198 }, { "epoch": 3.3974507531865585, "grad_norm": 0.3074004572362856, "learning_rate": 5.625775330719876e-06, "loss": 0.3333, "step": 2199 }, { "epoch": 3.398995751255311, "grad_norm": 0.31708100696238983, "learning_rate": 5.6160729797401635e-06, "loss": 0.3203, "step": 2200 }, { "epoch": 3.4005407493240636, "grad_norm": 0.3096151342251726, "learning_rate": 5.606375734734568e-06, "loss": 0.3345, "step": 2201 }, { "epoch": 3.4020857473928157, "grad_norm": 0.28062160873940917, "learning_rate": 5.596683606997488e-06, "loss": 0.3118, "step": 2202 }, { "epoch": 3.403630745461568, "grad_norm": 0.2932205910283833, "learning_rate": 5.586996607817378e-06, "loss": 0.3061, "step": 2203 }, { "epoch": 3.4051757435303207, "grad_norm": 0.3005313235639288, "learning_rate": 5.577314748476723e-06, "loss": 0.3516, "step": 2204 }, { "epoch": 3.406720741599073, "grad_norm": 0.28817249138292567, "learning_rate": 5.567638040252015e-06, "loss": 0.3176, "step": 2205 }, { "epoch": 3.4082657396678253, "grad_norm": 0.30504106832630595, "learning_rate": 5.5579664944137466e-06, "loss": 0.3072, "step": 2206 }, { "epoch": 3.409810737736578, "grad_norm": 0.31477321626223925, "learning_rate": 5.548300122226394e-06, "loss": 0.3216, "step": 2207 }, { "epoch": 3.4113557358053304, "grad_norm": 0.3055044793944136, "learning_rate": 5.538638934948426e-06, "loss": 0.3123, "step": 2208 }, { "epoch": 3.412900733874083, "grad_norm": 0.29927189771353724, "learning_rate": 5.528982943832244e-06, "loss": 0.3072, "step": 2209 }, { "epoch": 3.414445731942835, "grad_norm": 0.2790255164031628, "learning_rate": 5.519332160124215e-06, "loss": 0.3203, "step": 2210 }, { "epoch": 3.4159907300115875, "grad_norm": 0.3066440995673194, "learning_rate": 5.509686595064647e-06, "loss": 0.3417, "step": 2211 }, { "epoch": 3.41753572808034, "grad_norm": 0.29923042318609944, "learning_rate": 5.500046259887756e-06, "loss": 0.3098, "step": 2212 }, { "epoch": 3.419080726149092, "grad_norm": 0.2951716068534887, "learning_rate": 5.4904111658216666e-06, "loss": 0.2985, "step": 2213 }, { "epoch": 3.4206257242178446, "grad_norm": 0.372351075921913, "learning_rate": 5.480781324088413e-06, "loss": 0.3531, "step": 2214 }, { "epoch": 3.422170722286597, "grad_norm": 0.289646191998028, "learning_rate": 5.4711567459039005e-06, "loss": 0.3046, "step": 2215 }, { "epoch": 3.4237157203553497, "grad_norm": 0.334861868272502, "learning_rate": 5.461537442477905e-06, "loss": 0.3143, "step": 2216 }, { "epoch": 3.425260718424102, "grad_norm": 0.35436752391348036, "learning_rate": 5.451923425014062e-06, "loss": 0.3509, "step": 2217 }, { "epoch": 3.4268057164928543, "grad_norm": 0.3253298848395046, "learning_rate": 5.442314704709848e-06, "loss": 0.3137, "step": 2218 }, { "epoch": 3.428350714561607, "grad_norm": 0.3147118234142464, "learning_rate": 5.432711292756568e-06, "loss": 0.2989, "step": 2219 }, { "epoch": 3.4298957126303593, "grad_norm": 0.33981775763903643, "learning_rate": 5.423113200339354e-06, "loss": 0.336, "step": 2220 }, { "epoch": 3.4314407106991114, "grad_norm": 0.30742180018030135, "learning_rate": 5.413520438637132e-06, "loss": 0.3404, "step": 2221 }, { "epoch": 3.432985708767864, "grad_norm": 0.2936446615554233, "learning_rate": 5.4039330188226225e-06, "loss": 0.3067, "step": 2222 }, { "epoch": 3.4345307068366164, "grad_norm": 0.29735554095181127, "learning_rate": 5.394350952062326e-06, "loss": 0.3058, "step": 2223 }, { "epoch": 3.436075704905369, "grad_norm": 0.3090291013211636, "learning_rate": 5.384774249516507e-06, "loss": 0.3458, "step": 2224 }, { "epoch": 3.4376207029741215, "grad_norm": 0.2769269573347901, "learning_rate": 5.375202922339176e-06, "loss": 0.3295, "step": 2225 }, { "epoch": 3.4391657010428736, "grad_norm": 0.29422663152701095, "learning_rate": 5.365636981678098e-06, "loss": 0.3317, "step": 2226 }, { "epoch": 3.440710699111626, "grad_norm": 0.28587821943007236, "learning_rate": 5.356076438674753e-06, "loss": 0.337, "step": 2227 }, { "epoch": 3.4422556971803786, "grad_norm": 0.2756579345667861, "learning_rate": 5.34652130446433e-06, "loss": 0.3168, "step": 2228 }, { "epoch": 3.443800695249131, "grad_norm": 0.29157506610518374, "learning_rate": 5.336971590175739e-06, "loss": 0.3281, "step": 2229 }, { "epoch": 3.445345693317883, "grad_norm": 0.3369440077771602, "learning_rate": 5.32742730693155e-06, "loss": 0.2985, "step": 2230 }, { "epoch": 3.4468906913866357, "grad_norm": 0.31445580441515014, "learning_rate": 5.317888465848021e-06, "loss": 0.3282, "step": 2231 }, { "epoch": 3.4484356894553883, "grad_norm": 0.28482222574007016, "learning_rate": 5.308355078035081e-06, "loss": 0.3321, "step": 2232 }, { "epoch": 3.4499806875241408, "grad_norm": 0.3027108473280835, "learning_rate": 5.298827154596291e-06, "loss": 0.3103, "step": 2233 }, { "epoch": 3.451525685592893, "grad_norm": 0.2696250010309581, "learning_rate": 5.289304706628857e-06, "loss": 0.2925, "step": 2234 }, { "epoch": 3.4530706836616454, "grad_norm": 0.33329611712467594, "learning_rate": 5.279787745223601e-06, "loss": 0.3525, "step": 2235 }, { "epoch": 3.454615681730398, "grad_norm": 0.281239150680963, "learning_rate": 5.270276281464971e-06, "loss": 0.3147, "step": 2236 }, { "epoch": 3.4561606797991504, "grad_norm": 0.29093877241478167, "learning_rate": 5.260770326430986e-06, "loss": 0.3277, "step": 2237 }, { "epoch": 3.4577056778679025, "grad_norm": 0.2869540771749508, "learning_rate": 5.251269891193264e-06, "loss": 0.3236, "step": 2238 }, { "epoch": 3.459250675936655, "grad_norm": 0.3095171077162968, "learning_rate": 5.241774986817003e-06, "loss": 0.3201, "step": 2239 }, { "epoch": 3.4607956740054076, "grad_norm": 0.2728683239339957, "learning_rate": 5.232285624360942e-06, "loss": 0.2892, "step": 2240 }, { "epoch": 3.46234067207416, "grad_norm": 0.3002781043393013, "learning_rate": 5.22280181487737e-06, "loss": 0.3422, "step": 2241 }, { "epoch": 3.463885670142912, "grad_norm": 0.3502283358940919, "learning_rate": 5.213323569412119e-06, "loss": 0.3375, "step": 2242 }, { "epoch": 3.4654306682116647, "grad_norm": 0.2861578524806576, "learning_rate": 5.203850899004526e-06, "loss": 0.3135, "step": 2243 }, { "epoch": 3.466975666280417, "grad_norm": 0.27517496874692066, "learning_rate": 5.194383814687445e-06, "loss": 0.3014, "step": 2244 }, { "epoch": 3.4685206643491697, "grad_norm": 0.33770254365156044, "learning_rate": 5.184922327487216e-06, "loss": 0.3396, "step": 2245 }, { "epoch": 3.470065662417922, "grad_norm": 0.3481527487827275, "learning_rate": 5.175466448423667e-06, "loss": 0.3277, "step": 2246 }, { "epoch": 3.4716106604866743, "grad_norm": 0.27866302166677176, "learning_rate": 5.1660161885100875e-06, "loss": 0.3063, "step": 2247 }, { "epoch": 3.473155658555427, "grad_norm": 0.28777286233526794, "learning_rate": 5.156571558753232e-06, "loss": 0.3188, "step": 2248 }, { "epoch": 3.4747006566241794, "grad_norm": 0.30524554889947425, "learning_rate": 5.147132570153288e-06, "loss": 0.3446, "step": 2249 }, { "epoch": 3.4762456546929315, "grad_norm": 0.3121312156022311, "learning_rate": 5.137699233703877e-06, "loss": 0.3273, "step": 2250 }, { "epoch": 3.477790652761684, "grad_norm": 0.32383150492577106, "learning_rate": 5.128271560392037e-06, "loss": 0.3507, "step": 2251 }, { "epoch": 3.4793356508304365, "grad_norm": 0.2816659022244789, "learning_rate": 5.118849561198209e-06, "loss": 0.2841, "step": 2252 }, { "epoch": 3.480880648899189, "grad_norm": 0.3272938751242537, "learning_rate": 5.109433247096223e-06, "loss": 0.3505, "step": 2253 }, { "epoch": 3.482425646967941, "grad_norm": 0.2981256625020646, "learning_rate": 5.100022629053298e-06, "loss": 0.3127, "step": 2254 }, { "epoch": 3.4839706450366936, "grad_norm": 0.2933188941169057, "learning_rate": 5.090617718030008e-06, "loss": 0.3169, "step": 2255 }, { "epoch": 3.485515643105446, "grad_norm": 0.2872258700955758, "learning_rate": 5.081218524980283e-06, "loss": 0.3326, "step": 2256 }, { "epoch": 3.4870606411741987, "grad_norm": 0.2908190598454137, "learning_rate": 5.0718250608513945e-06, "loss": 0.3207, "step": 2257 }, { "epoch": 3.4886056392429508, "grad_norm": 0.31922189252324423, "learning_rate": 5.062437336583941e-06, "loss": 0.3428, "step": 2258 }, { "epoch": 3.4901506373117033, "grad_norm": 0.2838826520533276, "learning_rate": 5.053055363111832e-06, "loss": 0.3108, "step": 2259 }, { "epoch": 3.491695635380456, "grad_norm": 0.30882710150906156, "learning_rate": 5.043679151362289e-06, "loss": 0.329, "step": 2260 }, { "epoch": 3.4932406334492083, "grad_norm": 0.290484700089672, "learning_rate": 5.034308712255812e-06, "loss": 0.3187, "step": 2261 }, { "epoch": 3.494785631517961, "grad_norm": 0.3082255198228457, "learning_rate": 5.024944056706185e-06, "loss": 0.3228, "step": 2262 }, { "epoch": 3.496330629586713, "grad_norm": 0.29372896753322114, "learning_rate": 5.015585195620446e-06, "loss": 0.3275, "step": 2263 }, { "epoch": 3.4978756276554654, "grad_norm": 0.29357696231398667, "learning_rate": 5.006232139898905e-06, "loss": 0.3169, "step": 2264 }, { "epoch": 3.499420625724218, "grad_norm": 0.2884814225710948, "learning_rate": 4.996884900435083e-06, "loss": 0.3142, "step": 2265 }, { "epoch": 3.50096562379297, "grad_norm": 0.28897152251065805, "learning_rate": 4.9875434881157414e-06, "loss": 0.3337, "step": 2266 }, { "epoch": 3.5025106218617226, "grad_norm": 0.27719911447912204, "learning_rate": 4.978207913820862e-06, "loss": 0.2983, "step": 2267 }, { "epoch": 3.504055619930475, "grad_norm": 0.2803186707776398, "learning_rate": 4.968878188423612e-06, "loss": 0.3288, "step": 2268 }, { "epoch": 3.5056006179992276, "grad_norm": 0.28386210426181124, "learning_rate": 4.959554322790353e-06, "loss": 0.3375, "step": 2269 }, { "epoch": 3.50714561606798, "grad_norm": 0.2901559918495454, "learning_rate": 4.950236327780626e-06, "loss": 0.3455, "step": 2270 }, { "epoch": 3.5086906141367322, "grad_norm": 0.268436856144372, "learning_rate": 4.940924214247129e-06, "loss": 0.2927, "step": 2271 }, { "epoch": 3.5102356122054847, "grad_norm": 0.2873116232600302, "learning_rate": 4.93161799303571e-06, "loss": 0.298, "step": 2272 }, { "epoch": 3.5117806102742373, "grad_norm": 0.29008235996760645, "learning_rate": 4.922317674985354e-06, "loss": 0.3479, "step": 2273 }, { "epoch": 3.5133256083429893, "grad_norm": 0.29239498312996054, "learning_rate": 4.913023270928173e-06, "loss": 0.322, "step": 2274 }, { "epoch": 3.514870606411742, "grad_norm": 0.281318082008107, "learning_rate": 4.903734791689386e-06, "loss": 0.3229, "step": 2275 }, { "epoch": 3.5164156044804944, "grad_norm": 0.2817868843524265, "learning_rate": 4.894452248087322e-06, "loss": 0.2985, "step": 2276 }, { "epoch": 3.517960602549247, "grad_norm": 0.29394148760759636, "learning_rate": 4.885175650933388e-06, "loss": 0.3235, "step": 2277 }, { "epoch": 3.5195056006179994, "grad_norm": 0.3020515025690975, "learning_rate": 4.8759050110320634e-06, "loss": 0.3361, "step": 2278 }, { "epoch": 3.5210505986867515, "grad_norm": 0.2840178284864125, "learning_rate": 4.866640339180897e-06, "loss": 0.2904, "step": 2279 }, { "epoch": 3.522595596755504, "grad_norm": 0.3124828725623448, "learning_rate": 4.857381646170479e-06, "loss": 0.3363, "step": 2280 }, { "epoch": 3.5241405948242566, "grad_norm": 0.33097852863800437, "learning_rate": 4.848128942784437e-06, "loss": 0.3466, "step": 2281 }, { "epoch": 3.5256855928930086, "grad_norm": 0.2697866047650532, "learning_rate": 4.838882239799431e-06, "loss": 0.2947, "step": 2282 }, { "epoch": 3.527230590961761, "grad_norm": 0.30067909409011023, "learning_rate": 4.829641547985126e-06, "loss": 0.3401, "step": 2283 }, { "epoch": 3.5287755890305137, "grad_norm": 0.29218942735755626, "learning_rate": 4.820406878104181e-06, "loss": 0.311, "step": 2284 }, { "epoch": 3.530320587099266, "grad_norm": 0.2978152219775323, "learning_rate": 4.81117824091225e-06, "loss": 0.3101, "step": 2285 }, { "epoch": 3.5318655851680187, "grad_norm": 0.2812636313720219, "learning_rate": 4.801955647157954e-06, "loss": 0.3377, "step": 2286 }, { "epoch": 3.533410583236771, "grad_norm": 0.28556018772655467, "learning_rate": 4.792739107582877e-06, "loss": 0.3, "step": 2287 }, { "epoch": 3.5349555813055233, "grad_norm": 0.3013218025013973, "learning_rate": 4.783528632921559e-06, "loss": 0.3347, "step": 2288 }, { "epoch": 3.536500579374276, "grad_norm": 0.29652092144814596, "learning_rate": 4.774324233901465e-06, "loss": 0.3067, "step": 2289 }, { "epoch": 3.538045577443028, "grad_norm": 0.2791103002717618, "learning_rate": 4.76512592124299e-06, "loss": 0.31, "step": 2290 }, { "epoch": 3.5395905755117805, "grad_norm": 0.28554021109499733, "learning_rate": 4.755933705659436e-06, "loss": 0.3372, "step": 2291 }, { "epoch": 3.541135573580533, "grad_norm": 0.28096024407371184, "learning_rate": 4.746747597857014e-06, "loss": 0.2821, "step": 2292 }, { "epoch": 3.5426805716492855, "grad_norm": 0.30103995246845716, "learning_rate": 4.737567608534807e-06, "loss": 0.3071, "step": 2293 }, { "epoch": 3.544225569718038, "grad_norm": 0.3234908431584731, "learning_rate": 4.728393748384775e-06, "loss": 0.3828, "step": 2294 }, { "epoch": 3.54577056778679, "grad_norm": 0.2738354771871757, "learning_rate": 4.719226028091754e-06, "loss": 0.2884, "step": 2295 }, { "epoch": 3.5473155658555426, "grad_norm": 0.28562524744336715, "learning_rate": 4.71006445833341e-06, "loss": 0.3252, "step": 2296 }, { "epoch": 3.548860563924295, "grad_norm": 0.3153980274813264, "learning_rate": 4.7009090497802515e-06, "loss": 0.3431, "step": 2297 }, { "epoch": 3.5504055619930472, "grad_norm": 0.28503184064485754, "learning_rate": 4.6917598130956185e-06, "loss": 0.2956, "step": 2298 }, { "epoch": 3.5519505600617998, "grad_norm": 0.286290725538258, "learning_rate": 4.68261675893566e-06, "loss": 0.3155, "step": 2299 }, { "epoch": 3.5534955581305523, "grad_norm": 0.283665528522596, "learning_rate": 4.6734798979493076e-06, "loss": 0.3296, "step": 2300 }, { "epoch": 3.555040556199305, "grad_norm": 0.27875530953738303, "learning_rate": 4.664349240778305e-06, "loss": 0.3236, "step": 2301 }, { "epoch": 3.5565855542680573, "grad_norm": 0.2910848344127417, "learning_rate": 4.6552247980571555e-06, "loss": 0.323, "step": 2302 }, { "epoch": 3.5581305523368094, "grad_norm": 0.2996673657862408, "learning_rate": 4.64610658041312e-06, "loss": 0.3322, "step": 2303 }, { "epoch": 3.559675550405562, "grad_norm": 0.29406997898555776, "learning_rate": 4.636994598466228e-06, "loss": 0.3282, "step": 2304 }, { "epoch": 3.5612205484743145, "grad_norm": 0.294812404719125, "learning_rate": 4.62788886282923e-06, "loss": 0.3274, "step": 2305 }, { "epoch": 3.562765546543067, "grad_norm": 0.26286667049575385, "learning_rate": 4.618789384107604e-06, "loss": 0.2954, "step": 2306 }, { "epoch": 3.5643105446118195, "grad_norm": 0.28117106184829754, "learning_rate": 4.609696172899544e-06, "loss": 0.3158, "step": 2307 }, { "epoch": 3.5658555426805716, "grad_norm": 0.28618769853049536, "learning_rate": 4.600609239795942e-06, "loss": 0.3056, "step": 2308 }, { "epoch": 3.567400540749324, "grad_norm": 0.2876234597527964, "learning_rate": 4.591528595380376e-06, "loss": 0.3447, "step": 2309 }, { "epoch": 3.5689455388180766, "grad_norm": 0.2649699556214327, "learning_rate": 4.582454250229109e-06, "loss": 0.3003, "step": 2310 }, { "epoch": 3.5704905368868287, "grad_norm": 0.28715806030805735, "learning_rate": 4.573386214911056e-06, "loss": 0.3406, "step": 2311 }, { "epoch": 3.5720355349555812, "grad_norm": 0.306079247776258, "learning_rate": 4.56432449998779e-06, "loss": 0.3066, "step": 2312 }, { "epoch": 3.5735805330243338, "grad_norm": 0.2960566601949477, "learning_rate": 4.5552691160135166e-06, "loss": 0.3227, "step": 2313 }, { "epoch": 3.5751255310930863, "grad_norm": 0.27474214014579057, "learning_rate": 4.546220073535073e-06, "loss": 0.3169, "step": 2314 }, { "epoch": 3.576670529161839, "grad_norm": 0.2792239517410719, "learning_rate": 4.537177383091905e-06, "loss": 0.3477, "step": 2315 }, { "epoch": 3.578215527230591, "grad_norm": 0.28330923193830465, "learning_rate": 4.52814105521607e-06, "loss": 0.3222, "step": 2316 }, { "epoch": 3.5797605252993434, "grad_norm": 0.30495621831737457, "learning_rate": 4.519111100432205e-06, "loss": 0.3387, "step": 2317 }, { "epoch": 3.581305523368096, "grad_norm": 0.3035515644759343, "learning_rate": 4.510087529257528e-06, "loss": 0.3241, "step": 2318 }, { "epoch": 3.582850521436848, "grad_norm": 0.27417614810409746, "learning_rate": 4.501070352201822e-06, "loss": 0.3007, "step": 2319 }, { "epoch": 3.5843955195056005, "grad_norm": 0.3327302924786083, "learning_rate": 4.492059579767423e-06, "loss": 0.3343, "step": 2320 }, { "epoch": 3.585940517574353, "grad_norm": 0.2920221136734065, "learning_rate": 4.483055222449205e-06, "loss": 0.3373, "step": 2321 }, { "epoch": 3.5874855156431056, "grad_norm": 0.27946705806576955, "learning_rate": 4.4740572907345715e-06, "loss": 0.3033, "step": 2322 }, { "epoch": 3.589030513711858, "grad_norm": 0.2772826415022957, "learning_rate": 4.465065795103449e-06, "loss": 0.3416, "step": 2323 }, { "epoch": 3.59057551178061, "grad_norm": 0.2988025221067634, "learning_rate": 4.456080746028258e-06, "loss": 0.3114, "step": 2324 }, { "epoch": 3.5921205098493627, "grad_norm": 0.28131839831341277, "learning_rate": 4.44710215397391e-06, "loss": 0.3267, "step": 2325 }, { "epoch": 3.5936655079181152, "grad_norm": 0.2673194517474683, "learning_rate": 4.438130029397809e-06, "loss": 0.3045, "step": 2326 }, { "epoch": 3.5952105059868673, "grad_norm": 0.3030327493148294, "learning_rate": 4.429164382749818e-06, "loss": 0.3263, "step": 2327 }, { "epoch": 3.59675550405562, "grad_norm": 0.2964503515134522, "learning_rate": 4.420205224472243e-06, "loss": 0.3224, "step": 2328 }, { "epoch": 3.5983005021243724, "grad_norm": 0.2565371205869835, "learning_rate": 4.411252564999856e-06, "loss": 0.3048, "step": 2329 }, { "epoch": 3.599845500193125, "grad_norm": 0.28072890270304685, "learning_rate": 4.402306414759846e-06, "loss": 0.3312, "step": 2330 }, { "epoch": 3.6013904982618774, "grad_norm": 0.29669430764988436, "learning_rate": 4.393366784171819e-06, "loss": 0.312, "step": 2331 }, { "epoch": 3.6029354963306295, "grad_norm": 0.2925674727172111, "learning_rate": 4.384433683647798e-06, "loss": 0.314, "step": 2332 }, { "epoch": 3.604480494399382, "grad_norm": 0.2869832468713185, "learning_rate": 4.375507123592194e-06, "loss": 0.3063, "step": 2333 }, { "epoch": 3.6060254924681345, "grad_norm": 0.2907609308543007, "learning_rate": 4.366587114401797e-06, "loss": 0.3561, "step": 2334 }, { "epoch": 3.6075704905368866, "grad_norm": 0.2753019981096271, "learning_rate": 4.357673666465774e-06, "loss": 0.3089, "step": 2335 }, { "epoch": 3.609115488605639, "grad_norm": 0.29948791147150927, "learning_rate": 4.348766790165644e-06, "loss": 0.334, "step": 2336 }, { "epoch": 3.6106604866743917, "grad_norm": 0.27687796000703996, "learning_rate": 4.339866495875272e-06, "loss": 0.2906, "step": 2337 }, { "epoch": 3.612205484743144, "grad_norm": 0.2790976938941624, "learning_rate": 4.330972793960868e-06, "loss": 0.3083, "step": 2338 }, { "epoch": 3.6137504828118967, "grad_norm": 0.29356439095240316, "learning_rate": 4.322085694780952e-06, "loss": 0.32, "step": 2339 }, { "epoch": 3.615295480880649, "grad_norm": 0.31398262183890996, "learning_rate": 4.3132052086863575e-06, "loss": 0.3578, "step": 2340 }, { "epoch": 3.6168404789494013, "grad_norm": 0.2721926001528886, "learning_rate": 4.304331346020214e-06, "loss": 0.3124, "step": 2341 }, { "epoch": 3.618385477018154, "grad_norm": 0.27836014826647015, "learning_rate": 4.29546411711794e-06, "loss": 0.3174, "step": 2342 }, { "epoch": 3.619930475086906, "grad_norm": 0.26836637334197044, "learning_rate": 4.286603532307227e-06, "loss": 0.3113, "step": 2343 }, { "epoch": 3.6214754731556584, "grad_norm": 0.2945983199066885, "learning_rate": 4.277749601908019e-06, "loss": 0.3434, "step": 2344 }, { "epoch": 3.623020471224411, "grad_norm": 0.3165093940014093, "learning_rate": 4.26890233623253e-06, "loss": 0.3215, "step": 2345 }, { "epoch": 3.6245654692931635, "grad_norm": 0.2891975746819121, "learning_rate": 4.260061745585192e-06, "loss": 0.2894, "step": 2346 }, { "epoch": 3.626110467361916, "grad_norm": 0.301821885448982, "learning_rate": 4.251227840262672e-06, "loss": 0.3289, "step": 2347 }, { "epoch": 3.627655465430668, "grad_norm": 0.30197853968639676, "learning_rate": 4.242400630553847e-06, "loss": 0.3332, "step": 2348 }, { "epoch": 3.6292004634994206, "grad_norm": 0.3492402747469276, "learning_rate": 4.233580126739797e-06, "loss": 0.3232, "step": 2349 }, { "epoch": 3.630745461568173, "grad_norm": 0.2825109886787622, "learning_rate": 4.224766339093789e-06, "loss": 0.3215, "step": 2350 }, { "epoch": 3.632290459636925, "grad_norm": 0.3259483323792399, "learning_rate": 4.2159592778812755e-06, "loss": 0.3161, "step": 2351 }, { "epoch": 3.6338354577056777, "grad_norm": 0.3046160767765696, "learning_rate": 4.2071589533598685e-06, "loss": 0.3365, "step": 2352 }, { "epoch": 3.6353804557744303, "grad_norm": 0.2870019230840828, "learning_rate": 4.198365375779329e-06, "loss": 0.3056, "step": 2353 }, { "epoch": 3.6369254538431828, "grad_norm": 0.28634070752172264, "learning_rate": 4.189578555381574e-06, "loss": 0.3221, "step": 2354 }, { "epoch": 3.6384704519119353, "grad_norm": 0.3085639926700382, "learning_rate": 4.180798502400641e-06, "loss": 0.2964, "step": 2355 }, { "epoch": 3.6400154499806874, "grad_norm": 0.29455924923821897, "learning_rate": 4.172025227062676e-06, "loss": 0.3237, "step": 2356 }, { "epoch": 3.64156044804944, "grad_norm": 0.282767655730405, "learning_rate": 4.163258739585953e-06, "loss": 0.3185, "step": 2357 }, { "epoch": 3.6431054461181924, "grad_norm": 0.27549679409464584, "learning_rate": 4.154499050180822e-06, "loss": 0.3274, "step": 2358 }, { "epoch": 3.6446504441869445, "grad_norm": 0.32129929976078514, "learning_rate": 4.145746169049723e-06, "loss": 0.322, "step": 2359 }, { "epoch": 3.646195442255697, "grad_norm": 0.27218886752571936, "learning_rate": 4.137000106387168e-06, "loss": 0.2981, "step": 2360 }, { "epoch": 3.6477404403244496, "grad_norm": 0.29119004895406364, "learning_rate": 4.128260872379723e-06, "loss": 0.3198, "step": 2361 }, { "epoch": 3.649285438393202, "grad_norm": 0.29391437316088503, "learning_rate": 4.119528477206002e-06, "loss": 0.3131, "step": 2362 }, { "epoch": 3.6508304364619546, "grad_norm": 0.29055974194966844, "learning_rate": 4.110802931036655e-06, "loss": 0.3035, "step": 2363 }, { "epoch": 3.6523754345307067, "grad_norm": 0.2890808737229718, "learning_rate": 4.102084244034353e-06, "loss": 0.3005, "step": 2364 }, { "epoch": 3.653920432599459, "grad_norm": 0.2926329191796007, "learning_rate": 4.093372426353776e-06, "loss": 0.3783, "step": 2365 }, { "epoch": 3.6554654306682117, "grad_norm": 0.28533462518139363, "learning_rate": 4.0846674881416144e-06, "loss": 0.3235, "step": 2366 }, { "epoch": 3.657010428736964, "grad_norm": 0.2872870681791413, "learning_rate": 4.075969439536534e-06, "loss": 0.3253, "step": 2367 }, { "epoch": 3.6585554268057168, "grad_norm": 0.2930492381119679, "learning_rate": 4.0672782906691796e-06, "loss": 0.3157, "step": 2368 }, { "epoch": 3.660100424874469, "grad_norm": 0.2797000544443934, "learning_rate": 4.058594051662162e-06, "loss": 0.3031, "step": 2369 }, { "epoch": 3.6616454229432214, "grad_norm": 0.26141060186570436, "learning_rate": 4.049916732630042e-06, "loss": 0.2936, "step": 2370 }, { "epoch": 3.663190421011974, "grad_norm": 0.2821585565080069, "learning_rate": 4.041246343679322e-06, "loss": 0.3039, "step": 2371 }, { "epoch": 3.664735419080726, "grad_norm": 0.29743988524711434, "learning_rate": 4.032582894908427e-06, "loss": 0.3438, "step": 2372 }, { "epoch": 3.6662804171494785, "grad_norm": 0.25495036834102386, "learning_rate": 4.023926396407713e-06, "loss": 0.3114, "step": 2373 }, { "epoch": 3.667825415218231, "grad_norm": 0.27381646930018905, "learning_rate": 4.015276858259427e-06, "loss": 0.3067, "step": 2374 }, { "epoch": 3.6693704132869835, "grad_norm": 0.28599944148952305, "learning_rate": 4.006634290537716e-06, "loss": 0.3178, "step": 2375 }, { "epoch": 3.670915411355736, "grad_norm": 0.31054110898662346, "learning_rate": 3.997998703308607e-06, "loss": 0.3376, "step": 2376 }, { "epoch": 3.672460409424488, "grad_norm": 0.28324079483642384, "learning_rate": 3.9893701066299935e-06, "loss": 0.2919, "step": 2377 }, { "epoch": 3.6740054074932407, "grad_norm": 0.2871511572419279, "learning_rate": 3.980748510551631e-06, "loss": 0.3439, "step": 2378 }, { "epoch": 3.675550405561993, "grad_norm": 0.28076972169375886, "learning_rate": 3.972133925115125e-06, "loss": 0.325, "step": 2379 }, { "epoch": 3.6770954036307453, "grad_norm": 0.30012863746594104, "learning_rate": 3.963526360353907e-06, "loss": 0.3434, "step": 2380 }, { "epoch": 3.678640401699498, "grad_norm": 0.29426540656348177, "learning_rate": 3.954925826293235e-06, "loss": 0.3402, "step": 2381 }, { "epoch": 3.6801853997682503, "grad_norm": 0.2785553092379665, "learning_rate": 3.946332332950187e-06, "loss": 0.3136, "step": 2382 }, { "epoch": 3.681730397837003, "grad_norm": 0.2855029878330256, "learning_rate": 3.937745890333623e-06, "loss": 0.3174, "step": 2383 }, { "epoch": 3.6832753959057554, "grad_norm": 0.2889645342399537, "learning_rate": 3.929166508444202e-06, "loss": 0.3282, "step": 2384 }, { "epoch": 3.6848203939745074, "grad_norm": 0.3014123647394371, "learning_rate": 3.9205941972743635e-06, "loss": 0.3354, "step": 2385 }, { "epoch": 3.68636539204326, "grad_norm": 0.2902558615689743, "learning_rate": 3.912028966808304e-06, "loss": 0.324, "step": 2386 }, { "epoch": 3.6879103901120125, "grad_norm": 0.302259699500998, "learning_rate": 3.903470827021971e-06, "loss": 0.3266, "step": 2387 }, { "epoch": 3.6894553881807646, "grad_norm": 0.2669074724711914, "learning_rate": 3.894919787883065e-06, "loss": 0.3293, "step": 2388 }, { "epoch": 3.691000386249517, "grad_norm": 0.2814141701242912, "learning_rate": 3.8863758593510074e-06, "loss": 0.3253, "step": 2389 }, { "epoch": 3.6925453843182696, "grad_norm": 0.2786579110909944, "learning_rate": 3.877839051376936e-06, "loss": 0.3136, "step": 2390 }, { "epoch": 3.694090382387022, "grad_norm": 0.3078452654348989, "learning_rate": 3.869309373903702e-06, "loss": 0.3059, "step": 2391 }, { "epoch": 3.6956353804557747, "grad_norm": 0.3022141364409151, "learning_rate": 3.860786836865848e-06, "loss": 0.3058, "step": 2392 }, { "epoch": 3.6971803785245267, "grad_norm": 0.3260246289205542, "learning_rate": 3.8522714501895956e-06, "loss": 0.3408, "step": 2393 }, { "epoch": 3.6987253765932793, "grad_norm": 0.28233718460621005, "learning_rate": 3.843763223792852e-06, "loss": 0.308, "step": 2394 }, { "epoch": 3.700270374662032, "grad_norm": 0.2944076983439895, "learning_rate": 3.835262167585173e-06, "loss": 0.3233, "step": 2395 }, { "epoch": 3.701815372730784, "grad_norm": 0.28225828265737507, "learning_rate": 3.826768291467766e-06, "loss": 0.2941, "step": 2396 }, { "epoch": 3.7033603707995364, "grad_norm": 0.2899248452344834, "learning_rate": 3.818281605333476e-06, "loss": 0.3261, "step": 2397 }, { "epoch": 3.704905368868289, "grad_norm": 0.27398393856978026, "learning_rate": 3.8098021190667734e-06, "loss": 0.3112, "step": 2398 }, { "epoch": 3.7064503669370414, "grad_norm": 0.2771615629906721, "learning_rate": 3.801329842543745e-06, "loss": 0.3304, "step": 2399 }, { "epoch": 3.707995365005794, "grad_norm": 0.2701960050462768, "learning_rate": 3.7928647856320744e-06, "loss": 0.3169, "step": 2400 }, { "epoch": 3.709540363074546, "grad_norm": 0.300822489280398, "learning_rate": 3.784406958191048e-06, "loss": 0.3256, "step": 2401 }, { "epoch": 3.7110853611432986, "grad_norm": 0.2996056944707502, "learning_rate": 3.775956370071523e-06, "loss": 0.312, "step": 2402 }, { "epoch": 3.712630359212051, "grad_norm": 0.27837072310290295, "learning_rate": 3.767513031115925e-06, "loss": 0.309, "step": 2403 }, { "epoch": 3.714175357280803, "grad_norm": 0.2693096347486841, "learning_rate": 3.7590769511582394e-06, "loss": 0.3036, "step": 2404 }, { "epoch": 3.7157203553495557, "grad_norm": 0.32662996945742845, "learning_rate": 3.750648140023996e-06, "loss": 0.3536, "step": 2405 }, { "epoch": 3.717265353418308, "grad_norm": 0.2756705470474485, "learning_rate": 3.7422266075302536e-06, "loss": 0.2954, "step": 2406 }, { "epoch": 3.7188103514870607, "grad_norm": 0.29602855593732685, "learning_rate": 3.7338123634856093e-06, "loss": 0.3197, "step": 2407 }, { "epoch": 3.7203553495558133, "grad_norm": 0.3032746778758695, "learning_rate": 3.725405417690153e-06, "loss": 0.3315, "step": 2408 }, { "epoch": 3.7219003476245653, "grad_norm": 0.285293303776543, "learning_rate": 3.717005779935482e-06, "loss": 0.3134, "step": 2409 }, { "epoch": 3.723445345693318, "grad_norm": 0.28516283657952524, "learning_rate": 3.7086134600046884e-06, "loss": 0.3199, "step": 2410 }, { "epoch": 3.7249903437620704, "grad_norm": 0.28737421760741216, "learning_rate": 3.7002284676723275e-06, "loss": 0.332, "step": 2411 }, { "epoch": 3.7265353418308225, "grad_norm": 0.2853255506619392, "learning_rate": 3.691850812704426e-06, "loss": 0.3278, "step": 2412 }, { "epoch": 3.728080339899575, "grad_norm": 0.26633934941241444, "learning_rate": 3.6834805048584734e-06, "loss": 0.3169, "step": 2413 }, { "epoch": 3.7296253379683275, "grad_norm": 0.30731984258088263, "learning_rate": 3.6751175538833895e-06, "loss": 0.3076, "step": 2414 }, { "epoch": 3.73117033603708, "grad_norm": 0.28496603760523437, "learning_rate": 3.6667619695195287e-06, "loss": 0.2885, "step": 2415 }, { "epoch": 3.7327153341058326, "grad_norm": 0.27574161879402365, "learning_rate": 3.658413761498675e-06, "loss": 0.294, "step": 2416 }, { "epoch": 3.7342603321745846, "grad_norm": 0.30651263847779714, "learning_rate": 3.650072939544007e-06, "loss": 0.3501, "step": 2417 }, { "epoch": 3.735805330243337, "grad_norm": 0.2879518921399523, "learning_rate": 3.6417395133701105e-06, "loss": 0.3031, "step": 2418 }, { "epoch": 3.7373503283120897, "grad_norm": 0.2774082284554297, "learning_rate": 3.633413492682952e-06, "loss": 0.2877, "step": 2419 }, { "epoch": 3.7388953263808418, "grad_norm": 0.30988087213619264, "learning_rate": 3.6250948871798764e-06, "loss": 0.3439, "step": 2420 }, { "epoch": 3.7404403244495943, "grad_norm": 0.2552787976778361, "learning_rate": 3.6167837065495857e-06, "loss": 0.3178, "step": 2421 }, { "epoch": 3.741985322518347, "grad_norm": 0.30276353151540203, "learning_rate": 3.6084799604721476e-06, "loss": 0.3224, "step": 2422 }, { "epoch": 3.7435303205870993, "grad_norm": 0.2881000322096692, "learning_rate": 3.600183658618959e-06, "loss": 0.3398, "step": 2423 }, { "epoch": 3.745075318655852, "grad_norm": 0.3030401930058737, "learning_rate": 3.591894810652747e-06, "loss": 0.354, "step": 2424 }, { "epoch": 3.746620316724604, "grad_norm": 0.29556313076610236, "learning_rate": 3.5836134262275635e-06, "loss": 0.3445, "step": 2425 }, { "epoch": 3.7481653147933565, "grad_norm": 0.2782574220777906, "learning_rate": 3.5753395149887613e-06, "loss": 0.3185, "step": 2426 }, { "epoch": 3.749710312862109, "grad_norm": 0.29886550199653916, "learning_rate": 3.567073086572993e-06, "loss": 0.3191, "step": 2427 }, { "epoch": 3.751255310930861, "grad_norm": 0.2892296432054014, "learning_rate": 3.558814150608191e-06, "loss": 0.3228, "step": 2428 }, { "epoch": 3.7528003089996136, "grad_norm": 0.2789843249541946, "learning_rate": 3.5505627167135713e-06, "loss": 0.3111, "step": 2429 }, { "epoch": 3.754345307068366, "grad_norm": 0.3034167859696258, "learning_rate": 3.542318794499604e-06, "loss": 0.3678, "step": 2430 }, { "epoch": 3.7558903051371186, "grad_norm": 0.2711441770520911, "learning_rate": 3.5340823935680103e-06, "loss": 0.2927, "step": 2431 }, { "epoch": 3.757435303205871, "grad_norm": 0.2592075372588375, "learning_rate": 3.5258535235117553e-06, "loss": 0.3071, "step": 2432 }, { "epoch": 3.7589803012746232, "grad_norm": 0.30000167531503863, "learning_rate": 3.5176321939150283e-06, "loss": 0.3373, "step": 2433 }, { "epoch": 3.7605252993433758, "grad_norm": 0.2772686649141341, "learning_rate": 3.509418414353237e-06, "loss": 0.3184, "step": 2434 }, { "epoch": 3.7620702974121283, "grad_norm": 0.2850132261704843, "learning_rate": 3.5012121943930055e-06, "loss": 0.3264, "step": 2435 }, { "epoch": 3.763615295480881, "grad_norm": 0.2746313966031941, "learning_rate": 3.4930135435921387e-06, "loss": 0.3048, "step": 2436 }, { "epoch": 3.7651602935496333, "grad_norm": 0.27696324109330944, "learning_rate": 3.484822471499629e-06, "loss": 0.3017, "step": 2437 }, { "epoch": 3.7667052916183854, "grad_norm": 0.2857465441210261, "learning_rate": 3.476638987655656e-06, "loss": 0.3394, "step": 2438 }, { "epoch": 3.768250289687138, "grad_norm": 0.2701391682588106, "learning_rate": 3.4684631015915405e-06, "loss": 0.3126, "step": 2439 }, { "epoch": 3.7697952877558905, "grad_norm": 0.28642088647933955, "learning_rate": 3.4602948228297617e-06, "loss": 0.3566, "step": 2440 }, { "epoch": 3.7713402858246425, "grad_norm": 0.2582867530032964, "learning_rate": 3.4521341608839485e-06, "loss": 0.3131, "step": 2441 }, { "epoch": 3.772885283893395, "grad_norm": 0.29603105677117764, "learning_rate": 3.4439811252588473e-06, "loss": 0.3307, "step": 2442 }, { "epoch": 3.7744302819621476, "grad_norm": 0.2816710326702153, "learning_rate": 3.4358357254503218e-06, "loss": 0.3097, "step": 2443 }, { "epoch": 3.7759752800309, "grad_norm": 0.2588600533991288, "learning_rate": 3.4276979709453517e-06, "loss": 0.2926, "step": 2444 }, { "epoch": 3.7775202780996526, "grad_norm": 0.29375109727692156, "learning_rate": 3.419567871222006e-06, "loss": 0.3346, "step": 2445 }, { "epoch": 3.7790652761684047, "grad_norm": 0.2816765493113852, "learning_rate": 3.411445435749431e-06, "loss": 0.3153, "step": 2446 }, { "epoch": 3.7806102742371572, "grad_norm": 0.26207482189737596, "learning_rate": 3.40333067398786e-06, "loss": 0.299, "step": 2447 }, { "epoch": 3.7821552723059098, "grad_norm": 0.2862967353699353, "learning_rate": 3.3952235953885826e-06, "loss": 0.3048, "step": 2448 }, { "epoch": 3.783700270374662, "grad_norm": 0.3027747024886073, "learning_rate": 3.3871242093939336e-06, "loss": 0.3337, "step": 2449 }, { "epoch": 3.7852452684434144, "grad_norm": 0.28634461492706575, "learning_rate": 3.3790325254373035e-06, "loss": 0.3179, "step": 2450 }, { "epoch": 3.786790266512167, "grad_norm": 0.2737228437395912, "learning_rate": 3.370948552943097e-06, "loss": 0.324, "step": 2451 }, { "epoch": 3.7883352645809194, "grad_norm": 0.30069097327329203, "learning_rate": 3.362872301326746e-06, "loss": 0.333, "step": 2452 }, { "epoch": 3.789880262649672, "grad_norm": 0.2772840030853766, "learning_rate": 3.3548037799946855e-06, "loss": 0.3105, "step": 2453 }, { "epoch": 3.791425260718424, "grad_norm": 0.2709040140809632, "learning_rate": 3.3467429983443477e-06, "loss": 0.3122, "step": 2454 }, { "epoch": 3.7929702587871765, "grad_norm": 0.29374149854838, "learning_rate": 3.3386899657641536e-06, "loss": 0.3566, "step": 2455 }, { "epoch": 3.794515256855929, "grad_norm": 0.25230393347593666, "learning_rate": 3.330644691633492e-06, "loss": 0.2863, "step": 2456 }, { "epoch": 3.796060254924681, "grad_norm": 0.30006389026141705, "learning_rate": 3.322607185322727e-06, "loss": 0.332, "step": 2457 }, { "epoch": 3.7976052529934337, "grad_norm": 0.28391997015756637, "learning_rate": 3.314577456193164e-06, "loss": 0.3231, "step": 2458 }, { "epoch": 3.799150251062186, "grad_norm": 0.2746275004697681, "learning_rate": 3.3065555135970563e-06, "loss": 0.3331, "step": 2459 }, { "epoch": 3.8006952491309387, "grad_norm": 0.28400137332988795, "learning_rate": 3.2985413668775843e-06, "loss": 0.3324, "step": 2460 }, { "epoch": 3.8022402471996912, "grad_norm": 0.27522551887911145, "learning_rate": 3.290535025368854e-06, "loss": 0.3306, "step": 2461 }, { "epoch": 3.8037852452684433, "grad_norm": 0.2502044392473161, "learning_rate": 3.282536498395872e-06, "loss": 0.2878, "step": 2462 }, { "epoch": 3.805330243337196, "grad_norm": 0.27666962631904396, "learning_rate": 3.2745457952745564e-06, "loss": 0.3218, "step": 2463 }, { "epoch": 3.8068752414059484, "grad_norm": 0.2920699542948679, "learning_rate": 3.2665629253117004e-06, "loss": 0.3163, "step": 2464 }, { "epoch": 3.8084202394747004, "grad_norm": 0.2570876032941772, "learning_rate": 3.25858789780498e-06, "loss": 0.276, "step": 2465 }, { "epoch": 3.809965237543453, "grad_norm": 0.3024643614314443, "learning_rate": 3.2506207220429364e-06, "loss": 0.3568, "step": 2466 }, { "epoch": 3.8115102356122055, "grad_norm": 0.3187306121938027, "learning_rate": 3.2426614073049634e-06, "loss": 0.3558, "step": 2467 }, { "epoch": 3.813055233680958, "grad_norm": 0.25672330629125384, "learning_rate": 3.2347099628612987e-06, "loss": 0.2898, "step": 2468 }, { "epoch": 3.8146002317497105, "grad_norm": 0.27521971881801127, "learning_rate": 3.226766397973021e-06, "loss": 0.3167, "step": 2469 }, { "epoch": 3.8161452298184626, "grad_norm": 0.28028964614559176, "learning_rate": 3.2188307218920256e-06, "loss": 0.3374, "step": 2470 }, { "epoch": 3.817690227887215, "grad_norm": 0.27067978983059543, "learning_rate": 3.210902943861014e-06, "loss": 0.34, "step": 2471 }, { "epoch": 3.8192352259559676, "grad_norm": 0.2966281754649168, "learning_rate": 3.2029830731135037e-06, "loss": 0.3427, "step": 2472 }, { "epoch": 3.8207802240247197, "grad_norm": 0.26225644313330754, "learning_rate": 3.195071118873794e-06, "loss": 0.3019, "step": 2473 }, { "epoch": 3.8223252220934723, "grad_norm": 0.30050751773723666, "learning_rate": 3.187167090356952e-06, "loss": 0.3093, "step": 2474 }, { "epoch": 3.8238702201622248, "grad_norm": 0.2895323993656114, "learning_rate": 3.1792709967688374e-06, "loss": 0.314, "step": 2475 }, { "epoch": 3.8254152182309773, "grad_norm": 0.2845895721775863, "learning_rate": 3.1713828473060516e-06, "loss": 0.3232, "step": 2476 }, { "epoch": 3.82696021629973, "grad_norm": 0.2788983484694356, "learning_rate": 3.1635026511559454e-06, "loss": 0.3019, "step": 2477 }, { "epoch": 3.828505214368482, "grad_norm": 0.299206235548388, "learning_rate": 3.155630417496616e-06, "loss": 0.3159, "step": 2478 }, { "epoch": 3.8300502124372344, "grad_norm": 0.26996516365524614, "learning_rate": 3.1477661554968754e-06, "loss": 0.3097, "step": 2479 }, { "epoch": 3.831595210505987, "grad_norm": 0.3447975781486293, "learning_rate": 3.139909874316256e-06, "loss": 0.2932, "step": 2480 }, { "epoch": 3.833140208574739, "grad_norm": 0.2953443736278557, "learning_rate": 3.132061583104993e-06, "loss": 0.3387, "step": 2481 }, { "epoch": 3.8346852066434916, "grad_norm": 0.2656990345073048, "learning_rate": 3.1242212910040182e-06, "loss": 0.3147, "step": 2482 }, { "epoch": 3.836230204712244, "grad_norm": 0.2894820301822276, "learning_rate": 3.1163890071449442e-06, "loss": 0.3237, "step": 2483 }, { "epoch": 3.8377752027809966, "grad_norm": 0.26301722348242473, "learning_rate": 3.108564740650055e-06, "loss": 0.3089, "step": 2484 }, { "epoch": 3.839320200849749, "grad_norm": 0.2859877329259118, "learning_rate": 3.100748500632306e-06, "loss": 0.3446, "step": 2485 }, { "epoch": 3.840865198918501, "grad_norm": 0.2730798483437427, "learning_rate": 3.0929402961952937e-06, "loss": 0.3001, "step": 2486 }, { "epoch": 3.8424101969872537, "grad_norm": 0.336685079173142, "learning_rate": 3.08514013643326e-06, "loss": 0.3659, "step": 2487 }, { "epoch": 3.8439551950560062, "grad_norm": 0.2599541468171893, "learning_rate": 3.0773480304310755e-06, "loss": 0.2874, "step": 2488 }, { "epoch": 3.8455001931247583, "grad_norm": 0.2860270421516179, "learning_rate": 3.0695639872642312e-06, "loss": 0.3086, "step": 2489 }, { "epoch": 3.847045191193511, "grad_norm": 0.26861705157078963, "learning_rate": 3.0617880159988254e-06, "loss": 0.3094, "step": 2490 }, { "epoch": 3.8485901892622634, "grad_norm": 0.2815136997019909, "learning_rate": 3.0540201256915634e-06, "loss": 0.3506, "step": 2491 }, { "epoch": 3.850135187331016, "grad_norm": 0.27877802817970965, "learning_rate": 3.0462603253897295e-06, "loss": 0.3109, "step": 2492 }, { "epoch": 3.8516801853997684, "grad_norm": 0.30209925612494853, "learning_rate": 3.038508624131187e-06, "loss": 0.321, "step": 2493 }, { "epoch": 3.8532251834685205, "grad_norm": 0.2830608093330047, "learning_rate": 3.030765030944369e-06, "loss": 0.3472, "step": 2494 }, { "epoch": 3.854770181537273, "grad_norm": 0.2763574592020169, "learning_rate": 3.0230295548482645e-06, "loss": 0.3143, "step": 2495 }, { "epoch": 3.8563151796060255, "grad_norm": 0.24869248240551284, "learning_rate": 3.015302204852403e-06, "loss": 0.2909, "step": 2496 }, { "epoch": 3.857860177674778, "grad_norm": 0.276922534016088, "learning_rate": 3.00758298995686e-06, "loss": 0.3337, "step": 2497 }, { "epoch": 3.8594051757435306, "grad_norm": 0.3005334205136595, "learning_rate": 2.9998719191522285e-06, "loss": 0.3393, "step": 2498 }, { "epoch": 3.8609501738122827, "grad_norm": 0.2639660690186735, "learning_rate": 2.9921690014196146e-06, "loss": 0.3234, "step": 2499 }, { "epoch": 3.862495171881035, "grad_norm": 0.2850851703479764, "learning_rate": 2.984474245730639e-06, "loss": 0.3301, "step": 2500 }, { "epoch": 3.8640401699497877, "grad_norm": 0.31943916453503357, "learning_rate": 2.976787661047407e-06, "loss": 0.3269, "step": 2501 }, { "epoch": 3.86558516801854, "grad_norm": 0.2855035066577431, "learning_rate": 2.9691092563224987e-06, "loss": 0.3399, "step": 2502 }, { "epoch": 3.8671301660872923, "grad_norm": 0.27097833054856457, "learning_rate": 2.961439040498989e-06, "loss": 0.2875, "step": 2503 }, { "epoch": 3.868675164156045, "grad_norm": 0.2876244973141876, "learning_rate": 2.953777022510399e-06, "loss": 0.3283, "step": 2504 }, { "epoch": 3.8702201622247974, "grad_norm": 0.29495025719323237, "learning_rate": 2.9461232112807026e-06, "loss": 0.3254, "step": 2505 }, { "epoch": 3.87176516029355, "grad_norm": 0.2793939810202331, "learning_rate": 2.9384776157243255e-06, "loss": 0.3101, "step": 2506 }, { "epoch": 3.873310158362302, "grad_norm": 0.32870411907404007, "learning_rate": 2.930840244746114e-06, "loss": 0.3527, "step": 2507 }, { "epoch": 3.8748551564310545, "grad_norm": 0.267899565796094, "learning_rate": 2.9232111072413425e-06, "loss": 0.2885, "step": 2508 }, { "epoch": 3.876400154499807, "grad_norm": 0.26407332238732506, "learning_rate": 2.915590212095685e-06, "loss": 0.3213, "step": 2509 }, { "epoch": 3.877945152568559, "grad_norm": 0.2956720896877126, "learning_rate": 2.9079775681852286e-06, "loss": 0.3402, "step": 2510 }, { "epoch": 3.8794901506373116, "grad_norm": 0.2880015544740385, "learning_rate": 2.900373184376444e-06, "loss": 0.3037, "step": 2511 }, { "epoch": 3.881035148706064, "grad_norm": 0.3142573096878454, "learning_rate": 2.892777069526177e-06, "loss": 0.3155, "step": 2512 }, { "epoch": 3.8825801467748167, "grad_norm": 0.27963558222192025, "learning_rate": 2.8851892324816545e-06, "loss": 0.3052, "step": 2513 }, { "epoch": 3.884125144843569, "grad_norm": 0.3087110967476342, "learning_rate": 2.8776096820804533e-06, "loss": 0.3256, "step": 2514 }, { "epoch": 3.8856701429123213, "grad_norm": 0.2660113614147622, "learning_rate": 2.8700384271504976e-06, "loss": 0.3414, "step": 2515 }, { "epoch": 3.887215140981074, "grad_norm": 0.28736381662818766, "learning_rate": 2.862475476510056e-06, "loss": 0.3227, "step": 2516 }, { "epoch": 3.8887601390498263, "grad_norm": 0.28480442164399583, "learning_rate": 2.8549208389677186e-06, "loss": 0.2999, "step": 2517 }, { "epoch": 3.8903051371185784, "grad_norm": 0.26694354485938204, "learning_rate": 2.8473745233223948e-06, "loss": 0.2852, "step": 2518 }, { "epoch": 3.891850135187331, "grad_norm": 0.26483590059175577, "learning_rate": 2.83983653836331e-06, "loss": 0.3254, "step": 2519 }, { "epoch": 3.8933951332560834, "grad_norm": 0.4365422920645754, "learning_rate": 2.832306892869976e-06, "loss": 0.3418, "step": 2520 }, { "epoch": 3.894940131324836, "grad_norm": 0.28036367525513534, "learning_rate": 2.824785595612195e-06, "loss": 0.3136, "step": 2521 }, { "epoch": 3.8964851293935885, "grad_norm": 0.26833613821794683, "learning_rate": 2.8172726553500475e-06, "loss": 0.3306, "step": 2522 }, { "epoch": 3.8980301274623406, "grad_norm": 0.29202942986368347, "learning_rate": 2.8097680808338777e-06, "loss": 0.3084, "step": 2523 }, { "epoch": 3.899575125531093, "grad_norm": 0.31466329293406414, "learning_rate": 2.802271880804288e-06, "loss": 0.3379, "step": 2524 }, { "epoch": 3.9011201235998456, "grad_norm": 0.2746590039037682, "learning_rate": 2.7947840639921308e-06, "loss": 0.3195, "step": 2525 }, { "epoch": 3.9026651216685977, "grad_norm": 0.2776155297062115, "learning_rate": 2.7873046391184876e-06, "loss": 0.3458, "step": 2526 }, { "epoch": 3.90421011973735, "grad_norm": 0.28379798540119755, "learning_rate": 2.779833614894667e-06, "loss": 0.3231, "step": 2527 }, { "epoch": 3.9057551178061027, "grad_norm": 0.282290654970775, "learning_rate": 2.7723710000222013e-06, "loss": 0.3355, "step": 2528 }, { "epoch": 3.9073001158748553, "grad_norm": 0.27994330517679616, "learning_rate": 2.764916803192822e-06, "loss": 0.3172, "step": 2529 }, { "epoch": 3.908845113943608, "grad_norm": 0.28807301998096235, "learning_rate": 2.757471033088448e-06, "loss": 0.3143, "step": 2530 }, { "epoch": 3.91039011201236, "grad_norm": 0.2670934014134746, "learning_rate": 2.7500336983812004e-06, "loss": 0.3227, "step": 2531 }, { "epoch": 3.9119351100811124, "grad_norm": 0.30166805275472464, "learning_rate": 2.7426048077333645e-06, "loss": 0.3205, "step": 2532 }, { "epoch": 3.913480108149865, "grad_norm": 0.2975501010058444, "learning_rate": 2.735184369797389e-06, "loss": 0.3173, "step": 2533 }, { "epoch": 3.915025106218617, "grad_norm": 0.2944776062183738, "learning_rate": 2.7277723932158906e-06, "loss": 0.3486, "step": 2534 }, { "epoch": 3.9165701042873695, "grad_norm": 0.29121904647797964, "learning_rate": 2.7203688866216195e-06, "loss": 0.3476, "step": 2535 }, { "epoch": 3.918115102356122, "grad_norm": 0.2601908814373125, "learning_rate": 2.7129738586374642e-06, "loss": 0.31, "step": 2536 }, { "epoch": 3.9196601004248746, "grad_norm": 0.27872417325971527, "learning_rate": 2.7055873178764326e-06, "loss": 0.3205, "step": 2537 }, { "epoch": 3.921205098493627, "grad_norm": 0.28305511381243725, "learning_rate": 2.698209272941659e-06, "loss": 0.3248, "step": 2538 }, { "epoch": 3.922750096562379, "grad_norm": 0.2717603105914359, "learning_rate": 2.6908397324263746e-06, "loss": 0.3068, "step": 2539 }, { "epoch": 3.9242950946311317, "grad_norm": 0.27617265176212685, "learning_rate": 2.6834787049139046e-06, "loss": 0.3421, "step": 2540 }, { "epoch": 3.925840092699884, "grad_norm": 0.2582584226868063, "learning_rate": 2.6761261989776667e-06, "loss": 0.3069, "step": 2541 }, { "epoch": 3.9273850907686363, "grad_norm": 0.2844409386241979, "learning_rate": 2.6687822231811466e-06, "loss": 0.3326, "step": 2542 }, { "epoch": 3.928930088837389, "grad_norm": 0.27678858798389055, "learning_rate": 2.661446786077896e-06, "loss": 0.3271, "step": 2543 }, { "epoch": 3.9304750869061413, "grad_norm": 0.3107603318217923, "learning_rate": 2.6541198962115235e-06, "loss": 0.3513, "step": 2544 }, { "epoch": 3.932020084974894, "grad_norm": 0.2596690084398148, "learning_rate": 2.6468015621156795e-06, "loss": 0.3263, "step": 2545 }, { "epoch": 3.9335650830436464, "grad_norm": 0.2577049910767489, "learning_rate": 2.6394917923140516e-06, "loss": 0.3066, "step": 2546 }, { "epoch": 3.9351100811123985, "grad_norm": 0.2742840237945727, "learning_rate": 2.632190595320356e-06, "loss": 0.3157, "step": 2547 }, { "epoch": 3.936655079181151, "grad_norm": 0.2874388066581053, "learning_rate": 2.6248979796383203e-06, "loss": 0.3417, "step": 2548 }, { "epoch": 3.9382000772499035, "grad_norm": 0.2641171853027971, "learning_rate": 2.617613953761675e-06, "loss": 0.3165, "step": 2549 }, { "epoch": 3.9397450753186556, "grad_norm": 0.25757630730624076, "learning_rate": 2.61033852617415e-06, "loss": 0.2954, "step": 2550 }, { "epoch": 3.941290073387408, "grad_norm": 0.28871174120342913, "learning_rate": 2.6030717053494594e-06, "loss": 0.3447, "step": 2551 }, { "epoch": 3.9428350714561606, "grad_norm": 0.26899938557460307, "learning_rate": 2.595813499751291e-06, "loss": 0.3125, "step": 2552 }, { "epoch": 3.944380069524913, "grad_norm": 0.27053856713912605, "learning_rate": 2.5885639178333055e-06, "loss": 0.3317, "step": 2553 }, { "epoch": 3.9459250675936657, "grad_norm": 0.2621928977218557, "learning_rate": 2.581322968039112e-06, "loss": 0.3001, "step": 2554 }, { "epoch": 3.9474700656624178, "grad_norm": 0.31073082669424956, "learning_rate": 2.574090658802265e-06, "loss": 0.2995, "step": 2555 }, { "epoch": 3.9490150637311703, "grad_norm": 0.27223315523092395, "learning_rate": 2.566866998546269e-06, "loss": 0.3316, "step": 2556 }, { "epoch": 3.950560061799923, "grad_norm": 0.26309289936037056, "learning_rate": 2.5596519956845333e-06, "loss": 0.3263, "step": 2557 }, { "epoch": 3.9521050598686753, "grad_norm": 0.2760315668432166, "learning_rate": 2.552445658620397e-06, "loss": 0.3201, "step": 2558 }, { "epoch": 3.953650057937428, "grad_norm": 0.31612520618117157, "learning_rate": 2.545247995747111e-06, "loss": 0.3232, "step": 2559 }, { "epoch": 3.95519505600618, "grad_norm": 0.2801694724730051, "learning_rate": 2.5380590154478123e-06, "loss": 0.3236, "step": 2560 }, { "epoch": 3.9567400540749325, "grad_norm": 0.27151968947849203, "learning_rate": 2.530878726095527e-06, "loss": 0.3296, "step": 2561 }, { "epoch": 3.958285052143685, "grad_norm": 0.2612661392541989, "learning_rate": 2.523707136053167e-06, "loss": 0.3038, "step": 2562 }, { "epoch": 3.959830050212437, "grad_norm": 0.2969613204013011, "learning_rate": 2.5165442536735063e-06, "loss": 0.3378, "step": 2563 }, { "epoch": 3.9613750482811896, "grad_norm": 0.2905351272080741, "learning_rate": 2.5093900872991773e-06, "loss": 0.3285, "step": 2564 }, { "epoch": 3.962920046349942, "grad_norm": 0.26261315552383974, "learning_rate": 2.5022446452626526e-06, "loss": 0.3299, "step": 2565 }, { "epoch": 3.9644650444186946, "grad_norm": 0.2696554939331233, "learning_rate": 2.4951079358862617e-06, "loss": 0.3313, "step": 2566 }, { "epoch": 3.966010042487447, "grad_norm": 0.2762737571132383, "learning_rate": 2.4879799674821502e-06, "loss": 0.2961, "step": 2567 }, { "epoch": 3.9675550405561992, "grad_norm": 0.29122297431880617, "learning_rate": 2.480860748352283e-06, "loss": 0.3359, "step": 2568 }, { "epoch": 3.9691000386249518, "grad_norm": 0.2570176670900592, "learning_rate": 2.4737502867884435e-06, "loss": 0.286, "step": 2569 }, { "epoch": 3.9706450366937043, "grad_norm": 0.26987345591744205, "learning_rate": 2.466648591072206e-06, "loss": 0.3271, "step": 2570 }, { "epoch": 3.9721900347624564, "grad_norm": 0.2699107176466893, "learning_rate": 2.45955566947494e-06, "loss": 0.3089, "step": 2571 }, { "epoch": 3.973735032831209, "grad_norm": 0.2708530591199158, "learning_rate": 2.452471530257794e-06, "loss": 0.3465, "step": 2572 }, { "epoch": 3.9752800308999614, "grad_norm": 0.2574374266513645, "learning_rate": 2.4453961816716877e-06, "loss": 0.3331, "step": 2573 }, { "epoch": 3.976825028968714, "grad_norm": 0.27547375127798024, "learning_rate": 2.4383296319573e-06, "loss": 0.3025, "step": 2574 }, { "epoch": 3.9783700270374664, "grad_norm": 0.2504763554314188, "learning_rate": 2.4312718893450705e-06, "loss": 0.2938, "step": 2575 }, { "epoch": 3.9799150251062185, "grad_norm": 0.2815546618219794, "learning_rate": 2.4242229620551716e-06, "loss": 0.3392, "step": 2576 }, { "epoch": 3.981460023174971, "grad_norm": 0.2786440090691761, "learning_rate": 2.417182858297512e-06, "loss": 0.3199, "step": 2577 }, { "epoch": 3.9830050212437236, "grad_norm": 0.27252398160735486, "learning_rate": 2.410151586271724e-06, "loss": 0.3246, "step": 2578 }, { "epoch": 3.9845500193124757, "grad_norm": 0.27187542466363995, "learning_rate": 2.403129154167153e-06, "loss": 0.315, "step": 2579 }, { "epoch": 3.986095017381228, "grad_norm": 0.28004031141644736, "learning_rate": 2.396115570162845e-06, "loss": 0.3507, "step": 2580 }, { "epoch": 3.9876400154499807, "grad_norm": 0.2497058580671791, "learning_rate": 2.38911084242755e-06, "loss": 0.2794, "step": 2581 }, { "epoch": 3.9891850135187332, "grad_norm": 0.355113027174499, "learning_rate": 2.382114979119696e-06, "loss": 0.3425, "step": 2582 }, { "epoch": 3.9907300115874857, "grad_norm": 0.27113915685805545, "learning_rate": 2.3751279883873836e-06, "loss": 0.2964, "step": 2583 }, { "epoch": 3.992275009656238, "grad_norm": 0.2654063245844106, "learning_rate": 2.368149878368391e-06, "loss": 0.3178, "step": 2584 }, { "epoch": 3.9938200077249904, "grad_norm": 0.27600129782630206, "learning_rate": 2.36118065719014e-06, "loss": 0.3049, "step": 2585 }, { "epoch": 3.995365005793743, "grad_norm": 0.27300799642990714, "learning_rate": 2.354220332969703e-06, "loss": 0.3492, "step": 2586 }, { "epoch": 3.996910003862495, "grad_norm": 0.28004218352269694, "learning_rate": 2.3472689138137993e-06, "loss": 0.321, "step": 2587 }, { "epoch": 3.9984550019312475, "grad_norm": 0.2779377101364117, "learning_rate": 2.340326407818767e-06, "loss": 0.2955, "step": 2588 }, { "epoch": 4.0, "grad_norm": 0.6560195256750083, "learning_rate": 2.3333928230705607e-06, "loss": 0.5436, "step": 2589 }, { "epoch": 4.001544998068752, "grad_norm": 0.35297570843211745, "learning_rate": 2.3264681676447563e-06, "loss": 0.2993, "step": 2590 }, { "epoch": 4.003089996137505, "grad_norm": 0.33274834199770514, "learning_rate": 2.3195524496065205e-06, "loss": 0.294, "step": 2591 }, { "epoch": 4.004634994206257, "grad_norm": 0.30847576183187997, "learning_rate": 2.312645677010613e-06, "loss": 0.2832, "step": 2592 }, { "epoch": 4.00617999227501, "grad_norm": 0.32455176724268037, "learning_rate": 2.305747857901368e-06, "loss": 0.2866, "step": 2593 }, { "epoch": 4.007724990343762, "grad_norm": 0.3611186008524997, "learning_rate": 2.2988590003127056e-06, "loss": 0.3171, "step": 2594 }, { "epoch": 4.009269988412514, "grad_norm": 0.3427626897240081, "learning_rate": 2.291979112268098e-06, "loss": 0.2898, "step": 2595 }, { "epoch": 4.010814986481267, "grad_norm": 0.3199721661637215, "learning_rate": 2.2851082017805704e-06, "loss": 0.2772, "step": 2596 }, { "epoch": 4.012359984550019, "grad_norm": 0.28832240717882623, "learning_rate": 2.2782462768527002e-06, "loss": 0.2684, "step": 2597 }, { "epoch": 4.013904982618771, "grad_norm": 0.28520278751778916, "learning_rate": 2.2713933454765914e-06, "loss": 0.2996, "step": 2598 }, { "epoch": 4.015449980687524, "grad_norm": 0.3182851078911527, "learning_rate": 2.2645494156338753e-06, "loss": 0.2953, "step": 2599 }, { "epoch": 4.016994978756276, "grad_norm": 0.330844105757205, "learning_rate": 2.2577144952957e-06, "loss": 0.2962, "step": 2600 }, { "epoch": 4.018539976825029, "grad_norm": 0.3024366019784113, "learning_rate": 2.2508885924227173e-06, "loss": 0.2975, "step": 2601 }, { "epoch": 4.0200849748937815, "grad_norm": 0.2886705901192864, "learning_rate": 2.2440717149650783e-06, "loss": 0.2873, "step": 2602 }, { "epoch": 4.0216299729625336, "grad_norm": 0.28858902519032026, "learning_rate": 2.237263870862427e-06, "loss": 0.2909, "step": 2603 }, { "epoch": 4.0231749710312865, "grad_norm": 0.26621458481397053, "learning_rate": 2.230465068043879e-06, "loss": 0.3008, "step": 2604 }, { "epoch": 4.024719969100039, "grad_norm": 0.2699219981753166, "learning_rate": 2.223675314428021e-06, "loss": 0.2734, "step": 2605 }, { "epoch": 4.026264967168791, "grad_norm": 0.2582877248698914, "learning_rate": 2.2168946179229023e-06, "loss": 0.317, "step": 2606 }, { "epoch": 4.027809965237544, "grad_norm": 0.2720648768546618, "learning_rate": 2.2101229864260232e-06, "loss": 0.2842, "step": 2607 }, { "epoch": 4.029354963306296, "grad_norm": 0.26802535537822, "learning_rate": 2.2033604278243203e-06, "loss": 0.2949, "step": 2608 }, { "epoch": 4.030899961375049, "grad_norm": 0.27609944326096486, "learning_rate": 2.1966069499941736e-06, "loss": 0.2769, "step": 2609 }, { "epoch": 4.032444959443801, "grad_norm": 0.2910402458473247, "learning_rate": 2.189862560801379e-06, "loss": 0.3021, "step": 2610 }, { "epoch": 4.033989957512553, "grad_norm": 0.25895784443425884, "learning_rate": 2.183127268101145e-06, "loss": 0.2743, "step": 2611 }, { "epoch": 4.035534955581306, "grad_norm": 0.29965986707604836, "learning_rate": 2.1764010797380984e-06, "loss": 0.3069, "step": 2612 }, { "epoch": 4.037079953650058, "grad_norm": 0.2832580927628535, "learning_rate": 2.169684003546243e-06, "loss": 0.3026, "step": 2613 }, { "epoch": 4.03862495171881, "grad_norm": 0.269207142087305, "learning_rate": 2.1629760473489804e-06, "loss": 0.3043, "step": 2614 }, { "epoch": 4.040169949787563, "grad_norm": 0.26990765328338545, "learning_rate": 2.1562772189590943e-06, "loss": 0.2885, "step": 2615 }, { "epoch": 4.041714947856315, "grad_norm": 0.2787470375976079, "learning_rate": 2.149587526178728e-06, "loss": 0.2732, "step": 2616 }, { "epoch": 4.043259945925068, "grad_norm": 0.2798282089887123, "learning_rate": 2.142906976799387e-06, "loss": 0.2902, "step": 2617 }, { "epoch": 4.04480494399382, "grad_norm": 0.27442833744364226, "learning_rate": 2.136235578601935e-06, "loss": 0.2689, "step": 2618 }, { "epoch": 4.046349942062572, "grad_norm": 0.2694923001634534, "learning_rate": 2.129573339356571e-06, "loss": 0.2809, "step": 2619 }, { "epoch": 4.047894940131325, "grad_norm": 0.27976334306233275, "learning_rate": 2.1229202668228197e-06, "loss": 0.2924, "step": 2620 }, { "epoch": 4.049439938200077, "grad_norm": 0.2746586631614338, "learning_rate": 2.116276368749538e-06, "loss": 0.3196, "step": 2621 }, { "epoch": 4.050984936268829, "grad_norm": 0.27945704461203497, "learning_rate": 2.1096416528749007e-06, "loss": 0.2966, "step": 2622 }, { "epoch": 4.052529934337582, "grad_norm": 0.2676789072308649, "learning_rate": 2.1030161269263803e-06, "loss": 0.302, "step": 2623 }, { "epoch": 4.054074932406334, "grad_norm": 0.27382597030308314, "learning_rate": 2.0963997986207463e-06, "loss": 0.2675, "step": 2624 }, { "epoch": 4.055619930475087, "grad_norm": 0.2853415243908205, "learning_rate": 2.089792675664063e-06, "loss": 0.2943, "step": 2625 }, { "epoch": 4.057164928543839, "grad_norm": 0.33592515264325296, "learning_rate": 2.083194765751665e-06, "loss": 0.2879, "step": 2626 }, { "epoch": 4.0587099266125914, "grad_norm": 0.2635244052725874, "learning_rate": 2.0766060765681585e-06, "loss": 0.2796, "step": 2627 }, { "epoch": 4.060254924681344, "grad_norm": 0.27632595655148084, "learning_rate": 2.070026615787414e-06, "loss": 0.3041, "step": 2628 }, { "epoch": 4.0617999227500965, "grad_norm": 0.27383695578636996, "learning_rate": 2.063456391072548e-06, "loss": 0.2715, "step": 2629 }, { "epoch": 4.063344920818849, "grad_norm": 0.26805508244654674, "learning_rate": 2.056895410075922e-06, "loss": 0.2926, "step": 2630 }, { "epoch": 4.0648899188876015, "grad_norm": 0.27079296904003086, "learning_rate": 2.0503436804391363e-06, "loss": 0.2968, "step": 2631 }, { "epoch": 4.066434916956354, "grad_norm": 0.2839634964734517, "learning_rate": 2.0438012097930103e-06, "loss": 0.2888, "step": 2632 }, { "epoch": 4.067979915025107, "grad_norm": 0.27693820809681, "learning_rate": 2.03726800575758e-06, "loss": 0.3041, "step": 2633 }, { "epoch": 4.069524913093859, "grad_norm": 0.2852672431775437, "learning_rate": 2.0307440759420893e-06, "loss": 0.2868, "step": 2634 }, { "epoch": 4.071069911162611, "grad_norm": 0.276889264279888, "learning_rate": 2.02422942794498e-06, "loss": 0.3132, "step": 2635 }, { "epoch": 4.072614909231364, "grad_norm": 0.2603384430608871, "learning_rate": 2.0177240693538837e-06, "loss": 0.3138, "step": 2636 }, { "epoch": 4.074159907300116, "grad_norm": 0.2859369347882085, "learning_rate": 2.011228007745616e-06, "loss": 0.2873, "step": 2637 }, { "epoch": 4.075704905368868, "grad_norm": 0.26524672779721087, "learning_rate": 2.0047412506861585e-06, "loss": 0.2947, "step": 2638 }, { "epoch": 4.077249903437621, "grad_norm": 0.267761406155791, "learning_rate": 1.998263805730658e-06, "loss": 0.2871, "step": 2639 }, { "epoch": 4.078794901506373, "grad_norm": 0.24396876394390982, "learning_rate": 1.9917956804234175e-06, "loss": 0.275, "step": 2640 }, { "epoch": 4.080339899575126, "grad_norm": 0.27563743994946327, "learning_rate": 1.9853368822978825e-06, "loss": 0.3002, "step": 2641 }, { "epoch": 4.081884897643878, "grad_norm": 0.27893572026242713, "learning_rate": 1.978887418876634e-06, "loss": 0.2793, "step": 2642 }, { "epoch": 4.08342989571263, "grad_norm": 0.29901495594444244, "learning_rate": 1.972447297671387e-06, "loss": 0.2794, "step": 2643 }, { "epoch": 4.084974893781383, "grad_norm": 0.2630927373691285, "learning_rate": 1.9660165261829713e-06, "loss": 0.3088, "step": 2644 }, { "epoch": 4.086519891850135, "grad_norm": 0.2581921768243413, "learning_rate": 1.9595951119013256e-06, "loss": 0.2937, "step": 2645 }, { "epoch": 4.088064889918887, "grad_norm": 0.27907608088607133, "learning_rate": 1.9531830623054904e-06, "loss": 0.2804, "step": 2646 }, { "epoch": 4.08960988798764, "grad_norm": 0.2588292706956756, "learning_rate": 1.946780384863608e-06, "loss": 0.305, "step": 2647 }, { "epoch": 4.091154886056392, "grad_norm": 0.26473793544808233, "learning_rate": 1.940387087032891e-06, "loss": 0.3015, "step": 2648 }, { "epoch": 4.092699884125145, "grad_norm": 0.2561182759478267, "learning_rate": 1.9340031762596322e-06, "loss": 0.2905, "step": 2649 }, { "epoch": 4.094244882193897, "grad_norm": 0.27197238223485004, "learning_rate": 1.9276286599792017e-06, "loss": 0.2855, "step": 2650 }, { "epoch": 4.095789880262649, "grad_norm": 0.2583100147877924, "learning_rate": 1.9212635456160135e-06, "loss": 0.2891, "step": 2651 }, { "epoch": 4.097334878331402, "grad_norm": 0.2591733986657217, "learning_rate": 1.9149078405835364e-06, "loss": 0.2959, "step": 2652 }, { "epoch": 4.098879876400154, "grad_norm": 0.26109272907718123, "learning_rate": 1.9085615522842847e-06, "loss": 0.3081, "step": 2653 }, { "epoch": 4.1004248744689065, "grad_norm": 0.26109805566801353, "learning_rate": 1.9022246881098006e-06, "loss": 0.2889, "step": 2654 }, { "epoch": 4.101969872537659, "grad_norm": 0.2685150784547928, "learning_rate": 1.8958972554406486e-06, "loss": 0.2874, "step": 2655 }, { "epoch": 4.1035148706064115, "grad_norm": 0.26623508576588406, "learning_rate": 1.8895792616464104e-06, "loss": 0.2628, "step": 2656 }, { "epoch": 4.1050598686751645, "grad_norm": 0.26965660997503416, "learning_rate": 1.8832707140856754e-06, "loss": 0.3011, "step": 2657 }, { "epoch": 4.106604866743917, "grad_norm": 0.2655433654395828, "learning_rate": 1.876971620106025e-06, "loss": 0.2899, "step": 2658 }, { "epoch": 4.108149864812669, "grad_norm": 0.28936180053100075, "learning_rate": 1.8706819870440408e-06, "loss": 0.2871, "step": 2659 }, { "epoch": 4.109694862881422, "grad_norm": 0.2693456122167831, "learning_rate": 1.8644018222252758e-06, "loss": 0.262, "step": 2660 }, { "epoch": 4.111239860950174, "grad_norm": 0.2728148425499023, "learning_rate": 1.8581311329642592e-06, "loss": 0.2611, "step": 2661 }, { "epoch": 4.112784859018927, "grad_norm": 0.2481959640311222, "learning_rate": 1.8518699265644824e-06, "loss": 0.2997, "step": 2662 }, { "epoch": 4.114329857087679, "grad_norm": 0.2717865128324141, "learning_rate": 1.845618210318394e-06, "loss": 0.2795, "step": 2663 }, { "epoch": 4.115874855156431, "grad_norm": 0.28004319623538276, "learning_rate": 1.839375991507385e-06, "loss": 0.2997, "step": 2664 }, { "epoch": 4.117419853225184, "grad_norm": 0.2632782300167009, "learning_rate": 1.8331432774017933e-06, "loss": 0.2972, "step": 2665 }, { "epoch": 4.118964851293936, "grad_norm": 0.2851311343079179, "learning_rate": 1.8269200752608784e-06, "loss": 0.2937, "step": 2666 }, { "epoch": 4.120509849362688, "grad_norm": 0.2812915875069273, "learning_rate": 1.820706392332824e-06, "loss": 0.2853, "step": 2667 }, { "epoch": 4.122054847431441, "grad_norm": 0.25013432328189417, "learning_rate": 1.8145022358547281e-06, "loss": 0.3019, "step": 2668 }, { "epoch": 4.123599845500193, "grad_norm": 0.2604723469813131, "learning_rate": 1.808307613052591e-06, "loss": 0.2828, "step": 2669 }, { "epoch": 4.125144843568946, "grad_norm": 0.26666330632362434, "learning_rate": 1.8021225311413094e-06, "loss": 0.2843, "step": 2670 }, { "epoch": 4.126689841637698, "grad_norm": 0.26119140973628086, "learning_rate": 1.7959469973246702e-06, "loss": 0.2916, "step": 2671 }, { "epoch": 4.12823483970645, "grad_norm": 0.26851563771555603, "learning_rate": 1.7897810187953368e-06, "loss": 0.2773, "step": 2672 }, { "epoch": 4.129779837775203, "grad_norm": 0.2646691764372043, "learning_rate": 1.7836246027348458e-06, "loss": 0.2871, "step": 2673 }, { "epoch": 4.131324835843955, "grad_norm": 0.266673904833293, "learning_rate": 1.7774777563135914e-06, "loss": 0.3005, "step": 2674 }, { "epoch": 4.132869833912707, "grad_norm": 0.27565335537117946, "learning_rate": 1.7713404866908324e-06, "loss": 0.285, "step": 2675 }, { "epoch": 4.13441483198146, "grad_norm": 0.2678535723349092, "learning_rate": 1.7652128010146607e-06, "loss": 0.3123, "step": 2676 }, { "epoch": 4.135959830050212, "grad_norm": 0.26791651418575857, "learning_rate": 1.759094706422011e-06, "loss": 0.2846, "step": 2677 }, { "epoch": 4.137504828118965, "grad_norm": 0.2608627463804873, "learning_rate": 1.752986210038653e-06, "loss": 0.2985, "step": 2678 }, { "epoch": 4.139049826187717, "grad_norm": 0.2695461806214163, "learning_rate": 1.7468873189791702e-06, "loss": 0.274, "step": 2679 }, { "epoch": 4.140594824256469, "grad_norm": 0.26804038858612883, "learning_rate": 1.7407980403469593e-06, "loss": 0.2914, "step": 2680 }, { "epoch": 4.142139822325222, "grad_norm": 0.2594543301141497, "learning_rate": 1.7347183812342262e-06, "loss": 0.2802, "step": 2681 }, { "epoch": 4.1436848203939745, "grad_norm": 0.271242315707833, "learning_rate": 1.7286483487219708e-06, "loss": 0.2748, "step": 2682 }, { "epoch": 4.1452298184627265, "grad_norm": 0.25877474446311616, "learning_rate": 1.7225879498799714e-06, "loss": 0.3038, "step": 2683 }, { "epoch": 4.1467748165314795, "grad_norm": 0.2583012066263862, "learning_rate": 1.7165371917668027e-06, "loss": 0.3117, "step": 2684 }, { "epoch": 4.148319814600232, "grad_norm": 0.25705142863926106, "learning_rate": 1.7104960814298e-06, "loss": 0.2854, "step": 2685 }, { "epoch": 4.1498648126689845, "grad_norm": 0.27534788160746365, "learning_rate": 1.7044646259050613e-06, "loss": 0.3006, "step": 2686 }, { "epoch": 4.151409810737737, "grad_norm": 0.2552998248208253, "learning_rate": 1.6984428322174474e-06, "loss": 0.2949, "step": 2687 }, { "epoch": 4.152954808806489, "grad_norm": 0.25399246575983253, "learning_rate": 1.692430707380559e-06, "loss": 0.2796, "step": 2688 }, { "epoch": 4.154499806875242, "grad_norm": 0.26916961785143106, "learning_rate": 1.6864282583967374e-06, "loss": 0.2919, "step": 2689 }, { "epoch": 4.156044804943994, "grad_norm": 0.25904941209274596, "learning_rate": 1.6804354922570531e-06, "loss": 0.3015, "step": 2690 }, { "epoch": 4.157589803012746, "grad_norm": 0.2593360878603433, "learning_rate": 1.6744524159413034e-06, "loss": 0.2921, "step": 2691 }, { "epoch": 4.159134801081499, "grad_norm": 0.2685305985790236, "learning_rate": 1.6684790364179915e-06, "loss": 0.2894, "step": 2692 }, { "epoch": 4.160679799150251, "grad_norm": 0.2764684547712535, "learning_rate": 1.662515360644339e-06, "loss": 0.2827, "step": 2693 }, { "epoch": 4.162224797219004, "grad_norm": 0.2736590945989404, "learning_rate": 1.6565613955662553e-06, "loss": 0.3102, "step": 2694 }, { "epoch": 4.163769795287756, "grad_norm": 0.2746167776023611, "learning_rate": 1.650617148118342e-06, "loss": 0.2943, "step": 2695 }, { "epoch": 4.165314793356508, "grad_norm": 0.2552248238989851, "learning_rate": 1.6446826252238847e-06, "loss": 0.2945, "step": 2696 }, { "epoch": 4.166859791425261, "grad_norm": 0.24565803383932303, "learning_rate": 1.6387578337948406e-06, "loss": 0.2962, "step": 2697 }, { "epoch": 4.168404789494013, "grad_norm": 0.27271274150833275, "learning_rate": 1.6328427807318325e-06, "loss": 0.2931, "step": 2698 }, { "epoch": 4.169949787562765, "grad_norm": 0.2542496219999202, "learning_rate": 1.626937472924146e-06, "loss": 0.2857, "step": 2699 }, { "epoch": 4.171494785631518, "grad_norm": 0.26111808200468095, "learning_rate": 1.6210419172497095e-06, "loss": 0.2922, "step": 2700 }, { "epoch": 4.17303978370027, "grad_norm": 0.26779907068759473, "learning_rate": 1.6151561205750975e-06, "loss": 0.2981, "step": 2701 }, { "epoch": 4.174584781769023, "grad_norm": 0.25712530536811584, "learning_rate": 1.609280089755515e-06, "loss": 0.3145, "step": 2702 }, { "epoch": 4.176129779837775, "grad_norm": 0.2674792405493472, "learning_rate": 1.6034138316347947e-06, "loss": 0.2935, "step": 2703 }, { "epoch": 4.177674777906527, "grad_norm": 0.2789707479462175, "learning_rate": 1.597557353045387e-06, "loss": 0.3127, "step": 2704 }, { "epoch": 4.17921977597528, "grad_norm": 0.2691537888051308, "learning_rate": 1.5917106608083499e-06, "loss": 0.2599, "step": 2705 }, { "epoch": 4.180764774044032, "grad_norm": 0.268291544998456, "learning_rate": 1.5858737617333475e-06, "loss": 0.2867, "step": 2706 }, { "epoch": 4.182309772112784, "grad_norm": 0.28028024419026, "learning_rate": 1.580046662618635e-06, "loss": 0.2871, "step": 2707 }, { "epoch": 4.183854770181537, "grad_norm": 0.2659007276542954, "learning_rate": 1.5742293702510503e-06, "loss": 0.2993, "step": 2708 }, { "epoch": 4.1853997682502895, "grad_norm": 0.2587222497070998, "learning_rate": 1.5684218914060168e-06, "loss": 0.3272, "step": 2709 }, { "epoch": 4.186944766319042, "grad_norm": 0.26472924951458243, "learning_rate": 1.562624232847526e-06, "loss": 0.2743, "step": 2710 }, { "epoch": 4.1884897643877945, "grad_norm": 0.2661526618516509, "learning_rate": 1.5568364013281222e-06, "loss": 0.299, "step": 2711 }, { "epoch": 4.190034762456547, "grad_norm": 0.265073035002172, "learning_rate": 1.5510584035889175e-06, "loss": 0.2942, "step": 2712 }, { "epoch": 4.1915797605253, "grad_norm": 0.2651760611815606, "learning_rate": 1.5452902463595632e-06, "loss": 0.272, "step": 2713 }, { "epoch": 4.193124758594052, "grad_norm": 0.25415926892736274, "learning_rate": 1.5395319363582484e-06, "loss": 0.3074, "step": 2714 }, { "epoch": 4.194669756662805, "grad_norm": 0.26101661136350696, "learning_rate": 1.5337834802916995e-06, "loss": 0.2948, "step": 2715 }, { "epoch": 4.196214754731557, "grad_norm": 0.261471985052514, "learning_rate": 1.528044884855161e-06, "loss": 0.2976, "step": 2716 }, { "epoch": 4.197759752800309, "grad_norm": 0.27707967768695646, "learning_rate": 1.522316156732393e-06, "loss": 0.2852, "step": 2717 }, { "epoch": 4.199304750869062, "grad_norm": 0.27245023111023586, "learning_rate": 1.5165973025956649e-06, "loss": 0.2824, "step": 2718 }, { "epoch": 4.200849748937814, "grad_norm": 0.2601781022324639, "learning_rate": 1.510888329105743e-06, "loss": 0.3073, "step": 2719 }, { "epoch": 4.202394747006566, "grad_norm": 0.277937579125253, "learning_rate": 1.5051892429118864e-06, "loss": 0.2954, "step": 2720 }, { "epoch": 4.203939745075319, "grad_norm": 0.2804163141991482, "learning_rate": 1.499500050651843e-06, "loss": 0.2927, "step": 2721 }, { "epoch": 4.205484743144071, "grad_norm": 0.2860035727567229, "learning_rate": 1.4938207589518316e-06, "loss": 0.298, "step": 2722 }, { "epoch": 4.207029741212823, "grad_norm": 0.2673931846723449, "learning_rate": 1.488151374426543e-06, "loss": 0.2626, "step": 2723 }, { "epoch": 4.208574739281576, "grad_norm": 0.2643676356963618, "learning_rate": 1.4824919036791264e-06, "loss": 0.2726, "step": 2724 }, { "epoch": 4.210119737350328, "grad_norm": 0.27728603364664633, "learning_rate": 1.4768423533011854e-06, "loss": 0.2798, "step": 2725 }, { "epoch": 4.211664735419081, "grad_norm": 0.2675668087701721, "learning_rate": 1.4712027298727695e-06, "loss": 0.2981, "step": 2726 }, { "epoch": 4.213209733487833, "grad_norm": 0.2625622307682466, "learning_rate": 1.4655730399623691e-06, "loss": 0.2804, "step": 2727 }, { "epoch": 4.214754731556585, "grad_norm": 0.28179833680572036, "learning_rate": 1.459953290126902e-06, "loss": 0.2817, "step": 2728 }, { "epoch": 4.216299729625338, "grad_norm": 0.25765596741273245, "learning_rate": 1.4543434869117069e-06, "loss": 0.2986, "step": 2729 }, { "epoch": 4.21784472769409, "grad_norm": 0.2699003255481433, "learning_rate": 1.4487436368505415e-06, "loss": 0.3087, "step": 2730 }, { "epoch": 4.219389725762843, "grad_norm": 0.3022573028513693, "learning_rate": 1.44315374646557e-06, "loss": 0.2911, "step": 2731 }, { "epoch": 4.220934723831595, "grad_norm": 0.28486003461154424, "learning_rate": 1.437573822267354e-06, "loss": 0.2801, "step": 2732 }, { "epoch": 4.222479721900347, "grad_norm": 0.2669694082240228, "learning_rate": 1.432003870754849e-06, "loss": 0.2945, "step": 2733 }, { "epoch": 4.2240247199691, "grad_norm": 0.2643763955292878, "learning_rate": 1.4264438984153994e-06, "loss": 0.2752, "step": 2734 }, { "epoch": 4.225569718037852, "grad_norm": 0.2569655474407925, "learning_rate": 1.420893911724721e-06, "loss": 0.3101, "step": 2735 }, { "epoch": 4.2271147161066045, "grad_norm": 0.2784143583887037, "learning_rate": 1.4153539171468989e-06, "loss": 0.299, "step": 2736 }, { "epoch": 4.2286597141753575, "grad_norm": 0.28294273443929013, "learning_rate": 1.4098239211343867e-06, "loss": 0.3066, "step": 2737 }, { "epoch": 4.2302047122441095, "grad_norm": 0.27971930018683044, "learning_rate": 1.4043039301279904e-06, "loss": 0.2936, "step": 2738 }, { "epoch": 4.2317497103128625, "grad_norm": 0.26227395944464504, "learning_rate": 1.3987939505568548e-06, "loss": 0.2982, "step": 2739 }, { "epoch": 4.233294708381615, "grad_norm": 0.27596571223736277, "learning_rate": 1.3932939888384757e-06, "loss": 0.2918, "step": 2740 }, { "epoch": 4.234839706450367, "grad_norm": 0.27686711737441777, "learning_rate": 1.387804051378676e-06, "loss": 0.2946, "step": 2741 }, { "epoch": 4.23638470451912, "grad_norm": 0.2626706540194456, "learning_rate": 1.3823241445716018e-06, "loss": 0.2855, "step": 2742 }, { "epoch": 4.237929702587872, "grad_norm": 0.2699177513677067, "learning_rate": 1.3768542747997215e-06, "loss": 0.2725, "step": 2743 }, { "epoch": 4.239474700656624, "grad_norm": 0.2476488526024897, "learning_rate": 1.3713944484338093e-06, "loss": 0.3028, "step": 2744 }, { "epoch": 4.241019698725377, "grad_norm": 0.2724306903831276, "learning_rate": 1.3659446718329428e-06, "loss": 0.3019, "step": 2745 }, { "epoch": 4.242564696794129, "grad_norm": 0.2788325580234708, "learning_rate": 1.3605049513444934e-06, "loss": 0.2947, "step": 2746 }, { "epoch": 4.244109694862882, "grad_norm": 0.2833089189097369, "learning_rate": 1.355075293304121e-06, "loss": 0.3046, "step": 2747 }, { "epoch": 4.245654692931634, "grad_norm": 0.2557300194025437, "learning_rate": 1.349655704035766e-06, "loss": 0.2725, "step": 2748 }, { "epoch": 4.247199691000386, "grad_norm": 0.2604867975518522, "learning_rate": 1.3442461898516445e-06, "loss": 0.2983, "step": 2749 }, { "epoch": 4.248744689069139, "grad_norm": 0.27446618513793725, "learning_rate": 1.3388467570522345e-06, "loss": 0.2997, "step": 2750 }, { "epoch": 4.250289687137891, "grad_norm": 0.2679213087673959, "learning_rate": 1.3334574119262712e-06, "loss": 0.282, "step": 2751 }, { "epoch": 4.251834685206643, "grad_norm": 0.26969135534197414, "learning_rate": 1.328078160750743e-06, "loss": 0.3124, "step": 2752 }, { "epoch": 4.253379683275396, "grad_norm": 0.28197892311272066, "learning_rate": 1.3227090097908823e-06, "loss": 0.291, "step": 2753 }, { "epoch": 4.254924681344148, "grad_norm": 0.2678194314118187, "learning_rate": 1.317349965300153e-06, "loss": 0.2991, "step": 2754 }, { "epoch": 4.256469679412901, "grad_norm": 0.2551082093823196, "learning_rate": 1.3120010335202582e-06, "loss": 0.274, "step": 2755 }, { "epoch": 4.258014677481653, "grad_norm": 0.2576683682515109, "learning_rate": 1.306662220681112e-06, "loss": 0.3014, "step": 2756 }, { "epoch": 4.259559675550405, "grad_norm": 0.27823452287515954, "learning_rate": 1.301333533000848e-06, "loss": 0.282, "step": 2757 }, { "epoch": 4.261104673619158, "grad_norm": 0.27064779011873485, "learning_rate": 1.2960149766858054e-06, "loss": 0.2808, "step": 2758 }, { "epoch": 4.26264967168791, "grad_norm": 0.28523853161033486, "learning_rate": 1.2907065579305256e-06, "loss": 0.297, "step": 2759 }, { "epoch": 4.264194669756662, "grad_norm": 0.2708041208211495, "learning_rate": 1.2854082829177405e-06, "loss": 0.2831, "step": 2760 }, { "epoch": 4.265739667825415, "grad_norm": 0.2648305312620129, "learning_rate": 1.2801201578183653e-06, "loss": 0.2973, "step": 2761 }, { "epoch": 4.267284665894167, "grad_norm": 0.2735956587065842, "learning_rate": 1.2748421887915019e-06, "loss": 0.2912, "step": 2762 }, { "epoch": 4.26882966396292, "grad_norm": 0.28383388984786767, "learning_rate": 1.2695743819844154e-06, "loss": 0.2874, "step": 2763 }, { "epoch": 4.2703746620316725, "grad_norm": 0.25961010957508923, "learning_rate": 1.2643167435325355e-06, "loss": 0.2707, "step": 2764 }, { "epoch": 4.271919660100425, "grad_norm": 0.2651184947655493, "learning_rate": 1.2590692795594583e-06, "loss": 0.2919, "step": 2765 }, { "epoch": 4.2734646581691775, "grad_norm": 0.2734338799923934, "learning_rate": 1.253831996176914e-06, "loss": 0.2928, "step": 2766 }, { "epoch": 4.27500965623793, "grad_norm": 0.25747978326708343, "learning_rate": 1.2486048994847844e-06, "loss": 0.2891, "step": 2767 }, { "epoch": 4.276554654306683, "grad_norm": 0.27079109561019277, "learning_rate": 1.2433879955710914e-06, "loss": 0.2872, "step": 2768 }, { "epoch": 4.278099652375435, "grad_norm": 0.2714083298952672, "learning_rate": 1.238181290511976e-06, "loss": 0.2832, "step": 2769 }, { "epoch": 4.279644650444187, "grad_norm": 0.2864030084682352, "learning_rate": 1.232984790371704e-06, "loss": 0.2823, "step": 2770 }, { "epoch": 4.28118964851294, "grad_norm": 0.26908930397344727, "learning_rate": 1.2277985012026606e-06, "loss": 0.2752, "step": 2771 }, { "epoch": 4.282734646581692, "grad_norm": 0.27363520551663223, "learning_rate": 1.2226224290453293e-06, "loss": 0.3072, "step": 2772 }, { "epoch": 4.284279644650444, "grad_norm": 0.27796990549595507, "learning_rate": 1.2174565799283e-06, "loss": 0.313, "step": 2773 }, { "epoch": 4.285824642719197, "grad_norm": 0.25546332790624254, "learning_rate": 1.2123009598682545e-06, "loss": 0.282, "step": 2774 }, { "epoch": 4.287369640787949, "grad_norm": 0.26275734343502666, "learning_rate": 1.2071555748699582e-06, "loss": 0.304, "step": 2775 }, { "epoch": 4.288914638856701, "grad_norm": 0.26566758703400944, "learning_rate": 1.2020204309262573e-06, "loss": 0.2953, "step": 2776 }, { "epoch": 4.290459636925454, "grad_norm": 0.2617061764170391, "learning_rate": 1.1968955340180755e-06, "loss": 0.283, "step": 2777 }, { "epoch": 4.292004634994206, "grad_norm": 0.25308744356989943, "learning_rate": 1.1917808901143934e-06, "loss": 0.2804, "step": 2778 }, { "epoch": 4.293549633062959, "grad_norm": 0.2558528318894234, "learning_rate": 1.186676505172254e-06, "loss": 0.2865, "step": 2779 }, { "epoch": 4.295094631131711, "grad_norm": 0.2626385833347452, "learning_rate": 1.1815823851367513e-06, "loss": 0.2725, "step": 2780 }, { "epoch": 4.296639629200463, "grad_norm": 0.2725329953811948, "learning_rate": 1.176498535941023e-06, "loss": 0.2918, "step": 2781 }, { "epoch": 4.298184627269216, "grad_norm": 0.2594510565272809, "learning_rate": 1.1714249635062457e-06, "loss": 0.2955, "step": 2782 }, { "epoch": 4.299729625337968, "grad_norm": 0.2669704298263696, "learning_rate": 1.1663616737416228e-06, "loss": 0.2774, "step": 2783 }, { "epoch": 4.301274623406721, "grad_norm": 0.267068551523087, "learning_rate": 1.161308672544389e-06, "loss": 0.292, "step": 2784 }, { "epoch": 4.302819621475473, "grad_norm": 0.2565141322395392, "learning_rate": 1.1562659657997898e-06, "loss": 0.2885, "step": 2785 }, { "epoch": 4.304364619544225, "grad_norm": 0.27089147362700666, "learning_rate": 1.151233559381081e-06, "loss": 0.2692, "step": 2786 }, { "epoch": 4.305909617612978, "grad_norm": 0.2593028497722825, "learning_rate": 1.1462114591495232e-06, "loss": 0.2922, "step": 2787 }, { "epoch": 4.30745461568173, "grad_norm": 0.2583390751579423, "learning_rate": 1.1411996709543726e-06, "loss": 0.3224, "step": 2788 }, { "epoch": 4.3089996137504825, "grad_norm": 0.2570556234252693, "learning_rate": 1.1361982006328753e-06, "loss": 0.291, "step": 2789 }, { "epoch": 4.310544611819235, "grad_norm": 0.2630380559439596, "learning_rate": 1.1312070540102616e-06, "loss": 0.3107, "step": 2790 }, { "epoch": 4.3120896098879875, "grad_norm": 0.2842415951632048, "learning_rate": 1.1262262368997345e-06, "loss": 0.2962, "step": 2791 }, { "epoch": 4.31363460795674, "grad_norm": 0.2756958314420368, "learning_rate": 1.1212557551024684e-06, "loss": 0.2956, "step": 2792 }, { "epoch": 4.3151796060254926, "grad_norm": 0.27349414353096263, "learning_rate": 1.116295614407602e-06, "loss": 0.2844, "step": 2793 }, { "epoch": 4.316724604094245, "grad_norm": 0.26465966124059365, "learning_rate": 1.1113458205922245e-06, "loss": 0.2763, "step": 2794 }, { "epoch": 4.318269602162998, "grad_norm": 0.25786288998792983, "learning_rate": 1.1064063794213752e-06, "loss": 0.3091, "step": 2795 }, { "epoch": 4.31981460023175, "grad_norm": 0.28142911391046926, "learning_rate": 1.1014772966480424e-06, "loss": 0.3115, "step": 2796 }, { "epoch": 4.321359598300502, "grad_norm": 0.2623039039743166, "learning_rate": 1.0965585780131416e-06, "loss": 0.2736, "step": 2797 }, { "epoch": 4.322904596369255, "grad_norm": 0.259862121274783, "learning_rate": 1.09165022924552e-06, "loss": 0.2796, "step": 2798 }, { "epoch": 4.324449594438007, "grad_norm": 0.25811668390839637, "learning_rate": 1.0867522560619502e-06, "loss": 0.2479, "step": 2799 }, { "epoch": 4.32599459250676, "grad_norm": 0.24983832907445375, "learning_rate": 1.081864664167116e-06, "loss": 0.2926, "step": 2800 }, { "epoch": 4.327539590575512, "grad_norm": 0.26213754971949565, "learning_rate": 1.0769874592536111e-06, "loss": 0.2918, "step": 2801 }, { "epoch": 4.329084588644264, "grad_norm": 0.2858979374114705, "learning_rate": 1.072120647001933e-06, "loss": 0.2861, "step": 2802 }, { "epoch": 4.330629586713017, "grad_norm": 0.28863507029482965, "learning_rate": 1.0672642330804727e-06, "loss": 0.2739, "step": 2803 }, { "epoch": 4.332174584781769, "grad_norm": 0.28426966585702257, "learning_rate": 1.0624182231455104e-06, "loss": 0.2795, "step": 2804 }, { "epoch": 4.333719582850521, "grad_norm": 0.28730206374309886, "learning_rate": 1.0575826228412133e-06, "loss": 0.2788, "step": 2805 }, { "epoch": 4.335264580919274, "grad_norm": 0.283858782754077, "learning_rate": 1.0527574377996186e-06, "loss": 0.2787, "step": 2806 }, { "epoch": 4.336809578988026, "grad_norm": 0.28756263622558814, "learning_rate": 1.047942673640635e-06, "loss": 0.2991, "step": 2807 }, { "epoch": 4.338354577056779, "grad_norm": 0.2764066697844148, "learning_rate": 1.0431383359720347e-06, "loss": 0.2989, "step": 2808 }, { "epoch": 4.339899575125531, "grad_norm": 0.28583249449052095, "learning_rate": 1.0383444303894453e-06, "loss": 0.2862, "step": 2809 }, { "epoch": 4.341444573194283, "grad_norm": 0.2841395484758755, "learning_rate": 1.0335609624763432e-06, "loss": 0.2995, "step": 2810 }, { "epoch": 4.342989571263036, "grad_norm": 0.2685794891937625, "learning_rate": 1.0287879378040487e-06, "loss": 0.2925, "step": 2811 }, { "epoch": 4.344534569331788, "grad_norm": 0.26258521380539535, "learning_rate": 1.0240253619317219e-06, "loss": 0.2747, "step": 2812 }, { "epoch": 4.34607956740054, "grad_norm": 0.267818235791399, "learning_rate": 1.0192732404063489e-06, "loss": 0.2924, "step": 2813 }, { "epoch": 4.347624565469293, "grad_norm": 0.2632359759113845, "learning_rate": 1.0145315787627409e-06, "loss": 0.3012, "step": 2814 }, { "epoch": 4.349169563538045, "grad_norm": 0.26988849870475357, "learning_rate": 1.0098003825235269e-06, "loss": 0.287, "step": 2815 }, { "epoch": 4.350714561606798, "grad_norm": 0.2587365917808564, "learning_rate": 1.0050796571991473e-06, "loss": 0.2936, "step": 2816 }, { "epoch": 4.3522595596755504, "grad_norm": 0.2627356130129623, "learning_rate": 1.0003694082878446e-06, "loss": 0.2996, "step": 2817 }, { "epoch": 4.3538045577443025, "grad_norm": 0.2687399318366919, "learning_rate": 9.956696412756628e-07, "loss": 0.2806, "step": 2818 }, { "epoch": 4.3553495558130555, "grad_norm": 0.2705244231198962, "learning_rate": 9.909803616364355e-07, "loss": 0.3125, "step": 2819 }, { "epoch": 4.356894553881808, "grad_norm": 0.28202591327206383, "learning_rate": 9.863015748317796e-07, "loss": 0.3125, "step": 2820 }, { "epoch": 4.35843955195056, "grad_norm": 0.2750386507428876, "learning_rate": 9.816332863111e-07, "loss": 0.282, "step": 2821 }, { "epoch": 4.359984550019313, "grad_norm": 0.2681117988087569, "learning_rate": 9.769755015115601e-07, "loss": 0.2852, "step": 2822 }, { "epoch": 4.361529548088065, "grad_norm": 0.2697871884519985, "learning_rate": 9.72328225858099e-07, "loss": 0.2649, "step": 2823 }, { "epoch": 4.363074546156818, "grad_norm": 0.27767839712511905, "learning_rate": 9.676914647634173e-07, "loss": 0.2934, "step": 2824 }, { "epoch": 4.36461954422557, "grad_norm": 0.2554776069992101, "learning_rate": 9.630652236279626e-07, "loss": 0.3008, "step": 2825 }, { "epoch": 4.366164542294322, "grad_norm": 0.26583486755810737, "learning_rate": 9.584495078399313e-07, "loss": 0.2901, "step": 2826 }, { "epoch": 4.367709540363075, "grad_norm": 0.2519230132890621, "learning_rate": 9.53844322775268e-07, "loss": 0.3048, "step": 2827 }, { "epoch": 4.369254538431827, "grad_norm": 0.26574937505559243, "learning_rate": 9.492496737976431e-07, "loss": 0.3112, "step": 2828 }, { "epoch": 4.370799536500579, "grad_norm": 0.2626698870037352, "learning_rate": 9.446655662584603e-07, "loss": 0.2958, "step": 2829 }, { "epoch": 4.372344534569332, "grad_norm": 0.258759543501891, "learning_rate": 9.400920054968454e-07, "loss": 0.2827, "step": 2830 }, { "epoch": 4.373889532638084, "grad_norm": 0.2588433969423612, "learning_rate": 9.355289968396375e-07, "loss": 0.2885, "step": 2831 }, { "epoch": 4.375434530706837, "grad_norm": 0.27537351975410845, "learning_rate": 9.309765456013864e-07, "loss": 0.301, "step": 2832 }, { "epoch": 4.376979528775589, "grad_norm": 0.26655686940733025, "learning_rate": 9.264346570843519e-07, "loss": 0.2811, "step": 2833 }, { "epoch": 4.378524526844341, "grad_norm": 0.2613567931591631, "learning_rate": 9.219033365784835e-07, "loss": 0.2925, "step": 2834 }, { "epoch": 4.380069524913094, "grad_norm": 0.2540529081441726, "learning_rate": 9.173825893614252e-07, "loss": 0.2774, "step": 2835 }, { "epoch": 4.381614522981846, "grad_norm": 0.26713640472672784, "learning_rate": 9.128724206985062e-07, "loss": 0.2865, "step": 2836 }, { "epoch": 4.383159521050599, "grad_norm": 0.26090527538647557, "learning_rate": 9.083728358427335e-07, "loss": 0.316, "step": 2837 }, { "epoch": 4.384704519119351, "grad_norm": 0.2678154420643848, "learning_rate": 9.038838400347905e-07, "loss": 0.2769, "step": 2838 }, { "epoch": 4.386249517188103, "grad_norm": 0.2604016875544415, "learning_rate": 8.994054385030216e-07, "loss": 0.2975, "step": 2839 }, { "epoch": 4.387794515256856, "grad_norm": 0.2554370248520068, "learning_rate": 8.949376364634399e-07, "loss": 0.2952, "step": 2840 }, { "epoch": 4.389339513325608, "grad_norm": 0.2699777521079376, "learning_rate": 8.904804391197064e-07, "loss": 0.3007, "step": 2841 }, { "epoch": 4.39088451139436, "grad_norm": 0.25390927589195533, "learning_rate": 8.86033851663135e-07, "loss": 0.2725, "step": 2842 }, { "epoch": 4.392429509463113, "grad_norm": 0.252348570230189, "learning_rate": 8.815978792726798e-07, "loss": 0.2838, "step": 2843 }, { "epoch": 4.3939745075318655, "grad_norm": 0.24264124783216434, "learning_rate": 8.771725271149323e-07, "loss": 0.2913, "step": 2844 }, { "epoch": 4.3955195056006175, "grad_norm": 0.25139812416061724, "learning_rate": 8.727578003441128e-07, "loss": 0.2889, "step": 2845 }, { "epoch": 4.3970645036693705, "grad_norm": 0.29343007188491793, "learning_rate": 8.683537041020717e-07, "loss": 0.2801, "step": 2846 }, { "epoch": 4.398609501738123, "grad_norm": 0.2771586970571665, "learning_rate": 8.639602435182715e-07, "loss": 0.287, "step": 2847 }, { "epoch": 4.400154499806876, "grad_norm": 0.2625415000119047, "learning_rate": 8.595774237097898e-07, "loss": 0.295, "step": 2848 }, { "epoch": 4.401699497875628, "grad_norm": 0.2635139314783241, "learning_rate": 8.552052497813156e-07, "loss": 0.2816, "step": 2849 }, { "epoch": 4.40324449594438, "grad_norm": 0.2623070396843959, "learning_rate": 8.508437268251301e-07, "loss": 0.2978, "step": 2850 }, { "epoch": 4.404789494013133, "grad_norm": 0.2623511479433405, "learning_rate": 8.464928599211108e-07, "loss": 0.2942, "step": 2851 }, { "epoch": 4.406334492081885, "grad_norm": 0.26188337142387347, "learning_rate": 8.421526541367331e-07, "loss": 0.273, "step": 2852 }, { "epoch": 4.407879490150638, "grad_norm": 0.26446170824551557, "learning_rate": 8.378231145270444e-07, "loss": 0.2763, "step": 2853 }, { "epoch": 4.40942448821939, "grad_norm": 0.267408301416425, "learning_rate": 8.335042461346731e-07, "loss": 0.2576, "step": 2854 }, { "epoch": 4.410969486288142, "grad_norm": 0.26129385670881317, "learning_rate": 8.291960539898237e-07, "loss": 0.2927, "step": 2855 }, { "epoch": 4.412514484356895, "grad_norm": 0.27837602199004946, "learning_rate": 8.248985431102596e-07, "loss": 0.2895, "step": 2856 }, { "epoch": 4.414059482425647, "grad_norm": 0.38602757865555726, "learning_rate": 8.206117185013018e-07, "loss": 0.2877, "step": 2857 }, { "epoch": 4.415604480494399, "grad_norm": 0.25112204595180504, "learning_rate": 8.163355851558341e-07, "loss": 0.2727, "step": 2858 }, { "epoch": 4.417149478563152, "grad_norm": 0.2614185847740753, "learning_rate": 8.120701480542814e-07, "loss": 0.3035, "step": 2859 }, { "epoch": 4.418694476631904, "grad_norm": 0.2553220314763591, "learning_rate": 8.078154121646098e-07, "loss": 0.3002, "step": 2860 }, { "epoch": 4.420239474700657, "grad_norm": 0.26121153339325115, "learning_rate": 8.035713824423286e-07, "loss": 0.3272, "step": 2861 }, { "epoch": 4.421784472769409, "grad_norm": 0.2709016788163464, "learning_rate": 7.993380638304693e-07, "loss": 0.2752, "step": 2862 }, { "epoch": 4.423329470838161, "grad_norm": 0.2814925228644675, "learning_rate": 7.951154612595935e-07, "loss": 0.2883, "step": 2863 }, { "epoch": 4.424874468906914, "grad_norm": 0.2614044026128978, "learning_rate": 7.909035796477804e-07, "loss": 0.2938, "step": 2864 }, { "epoch": 4.426419466975666, "grad_norm": 0.2637822021097448, "learning_rate": 7.867024239006216e-07, "loss": 0.2904, "step": 2865 }, { "epoch": 4.427964465044418, "grad_norm": 0.257545413756115, "learning_rate": 7.825119989112173e-07, "loss": 0.2921, "step": 2866 }, { "epoch": 4.429509463113171, "grad_norm": 0.2643875625180412, "learning_rate": 7.783323095601669e-07, "loss": 0.3041, "step": 2867 }, { "epoch": 4.431054461181923, "grad_norm": 0.2531790710161676, "learning_rate": 7.74163360715573e-07, "loss": 0.3023, "step": 2868 }, { "epoch": 4.432599459250676, "grad_norm": 0.25394923851501777, "learning_rate": 7.700051572330203e-07, "loss": 0.285, "step": 2869 }, { "epoch": 4.434144457319428, "grad_norm": 0.25031274968336986, "learning_rate": 7.658577039555826e-07, "loss": 0.2883, "step": 2870 }, { "epoch": 4.4356894553881805, "grad_norm": 0.27097355388026867, "learning_rate": 7.617210057138125e-07, "loss": 0.2856, "step": 2871 }, { "epoch": 4.4372344534569335, "grad_norm": 0.26240159789765377, "learning_rate": 7.57595067325736e-07, "loss": 0.2937, "step": 2872 }, { "epoch": 4.4387794515256855, "grad_norm": 0.2531922974276264, "learning_rate": 7.534798935968456e-07, "loss": 0.2779, "step": 2873 }, { "epoch": 4.440324449594438, "grad_norm": 0.26971927072917845, "learning_rate": 7.493754893201011e-07, "loss": 0.2902, "step": 2874 }, { "epoch": 4.441869447663191, "grad_norm": 0.2740877305136215, "learning_rate": 7.452818592759125e-07, "loss": 0.2498, "step": 2875 }, { "epoch": 4.443414445731943, "grad_norm": 0.2511143660039177, "learning_rate": 7.411990082321451e-07, "loss": 0.2714, "step": 2876 }, { "epoch": 4.444959443800696, "grad_norm": 0.2564829853397635, "learning_rate": 7.371269409441095e-07, "loss": 0.3017, "step": 2877 }, { "epoch": 4.446504441869448, "grad_norm": 0.27408731358717525, "learning_rate": 7.330656621545539e-07, "loss": 0.2833, "step": 2878 }, { "epoch": 4.4480494399382, "grad_norm": 0.2672650137994964, "learning_rate": 7.290151765936637e-07, "loss": 0.2782, "step": 2879 }, { "epoch": 4.449594438006953, "grad_norm": 0.2857822402391506, "learning_rate": 7.249754889790539e-07, "loss": 0.2804, "step": 2880 }, { "epoch": 4.451139436075705, "grad_norm": 0.25868586705556984, "learning_rate": 7.209466040157609e-07, "loss": 0.3066, "step": 2881 }, { "epoch": 4.452684434144457, "grad_norm": 0.2639988700481228, "learning_rate": 7.169285263962388e-07, "loss": 0.2891, "step": 2882 }, { "epoch": 4.45422943221321, "grad_norm": 0.26950420864000857, "learning_rate": 7.129212608003578e-07, "loss": 0.3012, "step": 2883 }, { "epoch": 4.455774430281962, "grad_norm": 0.28816115850113744, "learning_rate": 7.08924811895395e-07, "loss": 0.2913, "step": 2884 }, { "epoch": 4.457319428350715, "grad_norm": 0.26936776699619713, "learning_rate": 7.049391843360221e-07, "loss": 0.2913, "step": 2885 }, { "epoch": 4.458864426419467, "grad_norm": 0.2539951275036158, "learning_rate": 7.009643827643165e-07, "loss": 0.2743, "step": 2886 }, { "epoch": 4.460409424488219, "grad_norm": 0.26943928428630687, "learning_rate": 6.970004118097418e-07, "loss": 0.2975, "step": 2887 }, { "epoch": 4.461954422556972, "grad_norm": 0.25189473397020784, "learning_rate": 6.930472760891449e-07, "loss": 0.2905, "step": 2888 }, { "epoch": 4.463499420625724, "grad_norm": 0.26693902281433285, "learning_rate": 6.891049802067618e-07, "loss": 0.2929, "step": 2889 }, { "epoch": 4.465044418694477, "grad_norm": 0.2810224746999638, "learning_rate": 6.851735287541928e-07, "loss": 0.2759, "step": 2890 }, { "epoch": 4.466589416763229, "grad_norm": 0.26768536984031904, "learning_rate": 6.812529263104139e-07, "loss": 0.299, "step": 2891 }, { "epoch": 4.468134414831981, "grad_norm": 0.25728728354333524, "learning_rate": 6.773431774417638e-07, "loss": 0.3156, "step": 2892 }, { "epoch": 4.469679412900734, "grad_norm": 0.2450788324987171, "learning_rate": 6.734442867019397e-07, "loss": 0.2873, "step": 2893 }, { "epoch": 4.471224410969486, "grad_norm": 0.27023418205265815, "learning_rate": 6.695562586319904e-07, "loss": 0.2727, "step": 2894 }, { "epoch": 4.472769409038238, "grad_norm": 0.2539956150485677, "learning_rate": 6.656790977603155e-07, "loss": 0.3142, "step": 2895 }, { "epoch": 4.474314407106991, "grad_norm": 0.2672451179619062, "learning_rate": 6.618128086026599e-07, "loss": 0.2883, "step": 2896 }, { "epoch": 4.475859405175743, "grad_norm": 0.2601810918200648, "learning_rate": 6.57957395662101e-07, "loss": 0.2741, "step": 2897 }, { "epoch": 4.4774044032444955, "grad_norm": 0.2734009988942304, "learning_rate": 6.541128634290505e-07, "loss": 0.2861, "step": 2898 }, { "epoch": 4.4789494013132485, "grad_norm": 0.24055243201681886, "learning_rate": 6.502792163812477e-07, "loss": 0.2881, "step": 2899 }, { "epoch": 4.480494399382001, "grad_norm": 0.2631381553215582, "learning_rate": 6.464564589837552e-07, "loss": 0.2699, "step": 2900 }, { "epoch": 4.4820393974507535, "grad_norm": 0.26019209591455583, "learning_rate": 6.426445956889471e-07, "loss": 0.306, "step": 2901 }, { "epoch": 4.483584395519506, "grad_norm": 0.26652584863732715, "learning_rate": 6.388436309365187e-07, "loss": 0.2815, "step": 2902 }, { "epoch": 4.485129393588258, "grad_norm": 0.26983962550983454, "learning_rate": 6.350535691534632e-07, "loss": 0.29, "step": 2903 }, { "epoch": 4.486674391657011, "grad_norm": 0.2626357280596294, "learning_rate": 6.312744147540773e-07, "loss": 0.2839, "step": 2904 }, { "epoch": 4.488219389725763, "grad_norm": 0.27461410578512785, "learning_rate": 6.275061721399555e-07, "loss": 0.2824, "step": 2905 }, { "epoch": 4.489764387794516, "grad_norm": 0.2622361856679866, "learning_rate": 6.237488456999819e-07, "loss": 0.3024, "step": 2906 }, { "epoch": 4.491309385863268, "grad_norm": 0.2746370655224052, "learning_rate": 6.200024398103255e-07, "loss": 0.2812, "step": 2907 }, { "epoch": 4.49285438393202, "grad_norm": 0.26790869892038327, "learning_rate": 6.162669588344406e-07, "loss": 0.2755, "step": 2908 }, { "epoch": 4.494399382000773, "grad_norm": 0.2587599432352666, "learning_rate": 6.125424071230523e-07, "loss": 0.2728, "step": 2909 }, { "epoch": 4.495944380069525, "grad_norm": 0.2558817698825992, "learning_rate": 6.088287890141564e-07, "loss": 0.3052, "step": 2910 }, { "epoch": 4.497489378138277, "grad_norm": 0.2507394315815141, "learning_rate": 6.051261088330185e-07, "loss": 0.2867, "step": 2911 }, { "epoch": 4.49903437620703, "grad_norm": 0.2603363778609366, "learning_rate": 6.014343708921644e-07, "loss": 0.3073, "step": 2912 }, { "epoch": 4.500579374275782, "grad_norm": 0.2737691451682704, "learning_rate": 5.97753579491367e-07, "loss": 0.2798, "step": 2913 }, { "epoch": 4.502124372344534, "grad_norm": 0.26166135981659255, "learning_rate": 5.940837389176612e-07, "loss": 0.2928, "step": 2914 }, { "epoch": 4.503669370413287, "grad_norm": 0.2508364636247735, "learning_rate": 5.904248534453205e-07, "loss": 0.2815, "step": 2915 }, { "epoch": 4.505214368482039, "grad_norm": 0.27348174619572957, "learning_rate": 5.867769273358603e-07, "loss": 0.2839, "step": 2916 }, { "epoch": 4.506759366550792, "grad_norm": 0.25458086841431166, "learning_rate": 5.831399648380331e-07, "loss": 0.3015, "step": 2917 }, { "epoch": 4.508304364619544, "grad_norm": 0.25404222143132255, "learning_rate": 5.795139701878216e-07, "loss": 0.3096, "step": 2918 }, { "epoch": 4.509849362688296, "grad_norm": 0.2552609111656389, "learning_rate": 5.75898947608432e-07, "loss": 0.2891, "step": 2919 }, { "epoch": 4.511394360757049, "grad_norm": 0.2571698890294941, "learning_rate": 5.72294901310293e-07, "loss": 0.2987, "step": 2920 }, { "epoch": 4.512939358825801, "grad_norm": 0.26693124571129695, "learning_rate": 5.6870183549105e-07, "loss": 0.2851, "step": 2921 }, { "epoch": 4.514484356894554, "grad_norm": 0.25604903201484286, "learning_rate": 5.65119754335558e-07, "loss": 0.2761, "step": 2922 }, { "epoch": 4.516029354963306, "grad_norm": 0.2651818750695043, "learning_rate": 5.615486620158772e-07, "loss": 0.2848, "step": 2923 }, { "epoch": 4.5175743530320585, "grad_norm": 0.25795885877436847, "learning_rate": 5.57988562691274e-07, "loss": 0.2869, "step": 2924 }, { "epoch": 4.519119351100811, "grad_norm": 0.25586204386282513, "learning_rate": 5.544394605082049e-07, "loss": 0.2946, "step": 2925 }, { "epoch": 4.5206643491695635, "grad_norm": 0.26053398466300776, "learning_rate": 5.50901359600322e-07, "loss": 0.2853, "step": 2926 }, { "epoch": 4.522209347238316, "grad_norm": 0.2679671039153107, "learning_rate": 5.473742640884617e-07, "loss": 0.2739, "step": 2927 }, { "epoch": 4.5237543453070685, "grad_norm": 0.2655847575764897, "learning_rate": 5.438581780806452e-07, "loss": 0.2617, "step": 2928 }, { "epoch": 4.525299343375821, "grad_norm": 0.2639531409211295, "learning_rate": 5.403531056720667e-07, "loss": 0.2744, "step": 2929 }, { "epoch": 4.526844341444573, "grad_norm": 0.25717246551724393, "learning_rate": 5.368590509451e-07, "loss": 0.2768, "step": 2930 }, { "epoch": 4.528389339513326, "grad_norm": 0.25128657380483466, "learning_rate": 5.333760179692782e-07, "loss": 0.3154, "step": 2931 }, { "epoch": 4.529934337582078, "grad_norm": 0.24439737572549236, "learning_rate": 5.29904010801302e-07, "loss": 0.276, "step": 2932 }, { "epoch": 4.531479335650831, "grad_norm": 0.24616404234702524, "learning_rate": 5.264430334850313e-07, "loss": 0.294, "step": 2933 }, { "epoch": 4.533024333719583, "grad_norm": 0.2535756309886617, "learning_rate": 5.229930900514757e-07, "loss": 0.2787, "step": 2934 }, { "epoch": 4.534569331788335, "grad_norm": 0.2532351842631494, "learning_rate": 5.195541845187946e-07, "loss": 0.292, "step": 2935 }, { "epoch": 4.536114329857088, "grad_norm": 0.25151834498796366, "learning_rate": 5.161263208922951e-07, "loss": 0.2785, "step": 2936 }, { "epoch": 4.53765932792584, "grad_norm": 0.2599471032435072, "learning_rate": 5.127095031644203e-07, "loss": 0.2835, "step": 2937 }, { "epoch": 4.539204325994593, "grad_norm": 0.25798101126953255, "learning_rate": 5.093037353147468e-07, "loss": 0.2968, "step": 2938 }, { "epoch": 4.540749324063345, "grad_norm": 0.259273652181009, "learning_rate": 5.059090213099893e-07, "loss": 0.2702, "step": 2939 }, { "epoch": 4.542294322132097, "grad_norm": 0.2534112630277041, "learning_rate": 5.025253651039763e-07, "loss": 0.2834, "step": 2940 }, { "epoch": 4.54383932020085, "grad_norm": 0.24765028367211347, "learning_rate": 4.991527706376642e-07, "loss": 0.2929, "step": 2941 }, { "epoch": 4.545384318269602, "grad_norm": 0.2638538279876342, "learning_rate": 4.957912418391276e-07, "loss": 0.2938, "step": 2942 }, { "epoch": 4.546929316338355, "grad_norm": 0.25705911130371334, "learning_rate": 4.924407826235478e-07, "loss": 0.301, "step": 2943 }, { "epoch": 4.548474314407107, "grad_norm": 0.2533532326555761, "learning_rate": 4.891013968932157e-07, "loss": 0.2938, "step": 2944 }, { "epoch": 4.550019312475859, "grad_norm": 0.2604782381352908, "learning_rate": 4.857730885375256e-07, "loss": 0.3128, "step": 2945 }, { "epoch": 4.551564310544612, "grad_norm": 0.2661288559069528, "learning_rate": 4.8245586143297e-07, "loss": 0.2883, "step": 2946 }, { "epoch": 4.553109308613364, "grad_norm": 0.26494909619367757, "learning_rate": 4.791497194431327e-07, "loss": 0.2912, "step": 2947 }, { "epoch": 4.554654306682116, "grad_norm": 0.2573521022791769, "learning_rate": 4.7585466641868696e-07, "loss": 0.2976, "step": 2948 }, { "epoch": 4.556199304750869, "grad_norm": 0.2474134126608029, "learning_rate": 4.7257070619739385e-07, "loss": 0.2964, "step": 2949 }, { "epoch": 4.557744302819621, "grad_norm": 0.2662720882263185, "learning_rate": 4.6929784260409173e-07, "loss": 0.2923, "step": 2950 }, { "epoch": 4.5592893008883735, "grad_norm": 0.2721849113155332, "learning_rate": 4.660360794506946e-07, "loss": 0.3001, "step": 2951 }, { "epoch": 4.560834298957126, "grad_norm": 0.26924922315527816, "learning_rate": 4.6278542053619037e-07, "loss": 0.2922, "step": 2952 }, { "epoch": 4.5623792970258785, "grad_norm": 0.25583439387284956, "learning_rate": 4.595458696466315e-07, "loss": 0.2699, "step": 2953 }, { "epoch": 4.5639242950946315, "grad_norm": 0.26911058498170265, "learning_rate": 4.563174305551332e-07, "loss": 0.2825, "step": 2954 }, { "epoch": 4.565469293163384, "grad_norm": 0.2736019640925009, "learning_rate": 4.531001070218699e-07, "loss": 0.2726, "step": 2955 }, { "epoch": 4.567014291232136, "grad_norm": 0.27333788236445145, "learning_rate": 4.4989390279406743e-07, "loss": 0.2755, "step": 2956 }, { "epoch": 4.568559289300889, "grad_norm": 0.2670057757162366, "learning_rate": 4.46698821606002e-07, "loss": 0.2741, "step": 2957 }, { "epoch": 4.570104287369641, "grad_norm": 0.2530328585893621, "learning_rate": 4.435148671789968e-07, "loss": 0.2751, "step": 2958 }, { "epoch": 4.571649285438394, "grad_norm": 0.25370473439291286, "learning_rate": 4.403420432214134e-07, "loss": 0.2891, "step": 2959 }, { "epoch": 4.573194283507146, "grad_norm": 0.27988363989895887, "learning_rate": 4.371803534286501e-07, "loss": 0.3005, "step": 2960 }, { "epoch": 4.574739281575898, "grad_norm": 0.2682729877596255, "learning_rate": 4.340298014831368e-07, "loss": 0.29, "step": 2961 }, { "epoch": 4.576284279644651, "grad_norm": 0.2695052983399432, "learning_rate": 4.308903910543327e-07, "loss": 0.2714, "step": 2962 }, { "epoch": 4.577829277713403, "grad_norm": 0.26718898647008155, "learning_rate": 4.2776212579871744e-07, "loss": 0.2792, "step": 2963 }, { "epoch": 4.579374275782155, "grad_norm": 0.2612423478718931, "learning_rate": 4.2464500935979427e-07, "loss": 0.2819, "step": 2964 }, { "epoch": 4.580919273850908, "grad_norm": 0.2541099406038526, "learning_rate": 4.2153904536807677e-07, "loss": 0.2656, "step": 2965 }, { "epoch": 4.58246427191966, "grad_norm": 0.29918885368190606, "learning_rate": 4.184442374410924e-07, "loss": 0.3043, "step": 2966 }, { "epoch": 4.584009269988412, "grad_norm": 0.2618377788927011, "learning_rate": 4.1536058918337565e-07, "loss": 0.3008, "step": 2967 }, { "epoch": 4.585554268057165, "grad_norm": 0.2794756561280435, "learning_rate": 4.1228810418645794e-07, "loss": 0.289, "step": 2968 }, { "epoch": 4.587099266125917, "grad_norm": 0.25519422876902836, "learning_rate": 4.0922678602887457e-07, "loss": 0.291, "step": 2969 }, { "epoch": 4.58864426419467, "grad_norm": 0.2644912669965429, "learning_rate": 4.0617663827615337e-07, "loss": 0.3142, "step": 2970 }, { "epoch": 4.590189262263422, "grad_norm": 0.2659753879403537, "learning_rate": 4.0313766448081046e-07, "loss": 0.2987, "step": 2971 }, { "epoch": 4.591734260332174, "grad_norm": 0.24650181184175446, "learning_rate": 4.00109868182349e-07, "loss": 0.2842, "step": 2972 }, { "epoch": 4.593279258400927, "grad_norm": 0.26200916175613237, "learning_rate": 3.9709325290725356e-07, "loss": 0.3125, "step": 2973 }, { "epoch": 4.594824256469679, "grad_norm": 0.255982843411152, "learning_rate": 3.9408782216898476e-07, "loss": 0.294, "step": 2974 }, { "epoch": 4.596369254538432, "grad_norm": 0.26082442969723524, "learning_rate": 3.910935794679804e-07, "loss": 0.2831, "step": 2975 }, { "epoch": 4.597914252607184, "grad_norm": 0.2565516240416758, "learning_rate": 3.8811052829163975e-07, "loss": 0.2834, "step": 2976 }, { "epoch": 4.599459250675936, "grad_norm": 0.26869430439688397, "learning_rate": 3.8513867211433576e-07, "loss": 0.2926, "step": 2977 }, { "epoch": 4.601004248744689, "grad_norm": 0.24411083800035055, "learning_rate": 3.821780143973985e-07, "loss": 0.3005, "step": 2978 }, { "epoch": 4.6025492468134415, "grad_norm": 0.264484689882384, "learning_rate": 3.792285585891142e-07, "loss": 0.2935, "step": 2979 }, { "epoch": 4.6040942448821935, "grad_norm": 0.27406408483165134, "learning_rate": 3.76290308124726e-07, "loss": 0.2806, "step": 2980 }, { "epoch": 4.6056392429509465, "grad_norm": 0.28229215730793045, "learning_rate": 3.7336326642642306e-07, "loss": 0.2895, "step": 2981 }, { "epoch": 4.607184241019699, "grad_norm": 0.24526968251856454, "learning_rate": 3.704474369033395e-07, "loss": 0.3047, "step": 2982 }, { "epoch": 4.608729239088451, "grad_norm": 0.2603588113373752, "learning_rate": 3.675428229515521e-07, "loss": 0.2966, "step": 2983 }, { "epoch": 4.610274237157204, "grad_norm": 0.26010133112965883, "learning_rate": 3.6464942795407463e-07, "loss": 0.2812, "step": 2984 }, { "epoch": 4.611819235225956, "grad_norm": 0.4350910197910034, "learning_rate": 3.6176725528085155e-07, "loss": 0.293, "step": 2985 }, { "epoch": 4.613364233294709, "grad_norm": 0.240581016016014, "learning_rate": 3.5889630828876197e-07, "loss": 0.2817, "step": 2986 }, { "epoch": 4.614909231363461, "grad_norm": 0.25829303747171467, "learning_rate": 3.560365903216056e-07, "loss": 0.2897, "step": 2987 }, { "epoch": 4.616454229432213, "grad_norm": 0.25171261731038586, "learning_rate": 3.531881047101049e-07, "loss": 0.291, "step": 2988 }, { "epoch": 4.617999227500966, "grad_norm": 0.2562080633165383, "learning_rate": 3.5035085477190143e-07, "loss": 0.2825, "step": 2989 }, { "epoch": 4.619544225569718, "grad_norm": 0.2615515830021962, "learning_rate": 3.475248438115486e-07, "loss": 0.2805, "step": 2990 }, { "epoch": 4.621089223638471, "grad_norm": 0.27217200807195047, "learning_rate": 3.4471007512050905e-07, "loss": 0.299, "step": 2991 }, { "epoch": 4.622634221707223, "grad_norm": 0.2592325032075456, "learning_rate": 3.4190655197715696e-07, "loss": 0.2931, "step": 2992 }, { "epoch": 4.624179219775975, "grad_norm": 0.2787965410831059, "learning_rate": 3.391142776467626e-07, "loss": 0.2865, "step": 2993 }, { "epoch": 4.625724217844728, "grad_norm": 0.2763998086870409, "learning_rate": 3.3633325538149575e-07, "loss": 0.292, "step": 2994 }, { "epoch": 4.62726921591348, "grad_norm": 0.26539305967122, "learning_rate": 3.335634884204253e-07, "loss": 0.2925, "step": 2995 }, { "epoch": 4.628814213982232, "grad_norm": 0.2717194586118823, "learning_rate": 3.3080497998950525e-07, "loss": 0.3036, "step": 2996 }, { "epoch": 4.630359212050985, "grad_norm": 0.26471765157806754, "learning_rate": 3.2805773330157996e-07, "loss": 0.2814, "step": 2997 }, { "epoch": 4.631904210119737, "grad_norm": 0.2375266110362663, "learning_rate": 3.2532175155637666e-07, "loss": 0.3062, "step": 2998 }, { "epoch": 4.633449208188489, "grad_norm": 0.2584620883200809, "learning_rate": 3.22597037940503e-07, "loss": 0.2894, "step": 2999 }, { "epoch": 4.634994206257242, "grad_norm": 0.25014684661215736, "learning_rate": 3.1988359562744044e-07, "loss": 0.3023, "step": 3000 }, { "epoch": 4.636539204325994, "grad_norm": 0.2589175639190174, "learning_rate": 3.1718142777754647e-07, "loss": 0.3022, "step": 3001 }, { "epoch": 4.638084202394747, "grad_norm": 0.25962521875473066, "learning_rate": 3.144905375380436e-07, "loss": 0.2943, "step": 3002 }, { "epoch": 4.639629200463499, "grad_norm": 0.3068565495268349, "learning_rate": 3.1181092804301817e-07, "loss": 0.3029, "step": 3003 }, { "epoch": 4.641174198532251, "grad_norm": 0.2626998439727996, "learning_rate": 3.0914260241342253e-07, "loss": 0.2622, "step": 3004 }, { "epoch": 4.642719196601004, "grad_norm": 0.26091768745552785, "learning_rate": 3.0648556375706406e-07, "loss": 0.2884, "step": 3005 }, { "epoch": 4.6442641946697565, "grad_norm": 0.2484316313206766, "learning_rate": 3.0383981516860394e-07, "loss": 0.2942, "step": 3006 }, { "epoch": 4.6458091927385095, "grad_norm": 0.2568793288901372, "learning_rate": 3.012053597295539e-07, "loss": 0.2944, "step": 3007 }, { "epoch": 4.6473541908072615, "grad_norm": 0.2636369276262675, "learning_rate": 2.9858220050827393e-07, "loss": 0.2798, "step": 3008 }, { "epoch": 4.648899188876014, "grad_norm": 0.2619527142797112, "learning_rate": 2.959703405599645e-07, "loss": 0.2934, "step": 3009 }, { "epoch": 4.650444186944767, "grad_norm": 0.2599222047449872, "learning_rate": 2.9336978292666907e-07, "loss": 0.2905, "step": 3010 }, { "epoch": 4.651989185013519, "grad_norm": 0.2657360019258332, "learning_rate": 2.9078053063726574e-07, "loss": 0.2935, "step": 3011 }, { "epoch": 4.653534183082272, "grad_norm": 0.24039743290730614, "learning_rate": 2.8820258670746337e-07, "loss": 0.31, "step": 3012 }, { "epoch": 4.655079181151024, "grad_norm": 0.27607894269018923, "learning_rate": 2.8563595413980125e-07, "loss": 0.2983, "step": 3013 }, { "epoch": 4.656624179219776, "grad_norm": 0.2489877665400183, "learning_rate": 2.830806359236471e-07, "loss": 0.2959, "step": 3014 }, { "epoch": 4.658169177288529, "grad_norm": 0.2622471092746999, "learning_rate": 2.8053663503518677e-07, "loss": 0.3128, "step": 3015 }, { "epoch": 4.659714175357281, "grad_norm": 0.25792206556879244, "learning_rate": 2.78003954437428e-07, "loss": 0.304, "step": 3016 }, { "epoch": 4.661259173426033, "grad_norm": 0.2562575937944232, "learning_rate": 2.7548259708018996e-07, "loss": 0.2774, "step": 3017 }, { "epoch": 4.662804171494786, "grad_norm": 0.25706580209330043, "learning_rate": 2.729725659001081e-07, "loss": 0.3137, "step": 3018 }, { "epoch": 4.664349169563538, "grad_norm": 0.2517488393940567, "learning_rate": 2.7047386382062055e-07, "loss": 0.2803, "step": 3019 }, { "epoch": 4.66589416763229, "grad_norm": 0.26259992172895824, "learning_rate": 2.6798649375197715e-07, "loss": 0.307, "step": 3020 }, { "epoch": 4.667439165701043, "grad_norm": 0.2731097696196708, "learning_rate": 2.6551045859122494e-07, "loss": 0.2853, "step": 3021 }, { "epoch": 4.668984163769795, "grad_norm": 0.2574199095447165, "learning_rate": 2.6304576122221035e-07, "loss": 0.2797, "step": 3022 }, { "epoch": 4.670529161838548, "grad_norm": 0.25673009439199423, "learning_rate": 2.6059240451557277e-07, "loss": 0.2845, "step": 3023 }, { "epoch": 4.6720741599073, "grad_norm": 0.2551712252623502, "learning_rate": 2.581503913287475e-07, "loss": 0.3003, "step": 3024 }, { "epoch": 4.673619157976052, "grad_norm": 0.26621650568891075, "learning_rate": 2.5571972450595175e-07, "loss": 0.2888, "step": 3025 }, { "epoch": 4.675164156044805, "grad_norm": 0.2502730172591605, "learning_rate": 2.5330040687819414e-07, "loss": 0.2788, "step": 3026 }, { "epoch": 4.676709154113557, "grad_norm": 0.2613701492863004, "learning_rate": 2.5089244126326076e-07, "loss": 0.3035, "step": 3027 }, { "epoch": 4.67825415218231, "grad_norm": 0.24494238030412452, "learning_rate": 2.484958304657159e-07, "loss": 0.3085, "step": 3028 }, { "epoch": 4.679799150251062, "grad_norm": 0.2673429062687088, "learning_rate": 2.461105772769035e-07, "loss": 0.2971, "step": 3029 }, { "epoch": 4.681344148319814, "grad_norm": 0.2467919239565518, "learning_rate": 2.4373668447493225e-07, "loss": 0.289, "step": 3030 }, { "epoch": 4.682889146388567, "grad_norm": 0.26210152522922486, "learning_rate": 2.4137415482468507e-07, "loss": 0.2946, "step": 3031 }, { "epoch": 4.684434144457319, "grad_norm": 0.2574519780914216, "learning_rate": 2.3902299107780413e-07, "loss": 0.2881, "step": 3032 }, { "epoch": 4.6859791425260715, "grad_norm": 0.24016396234460047, "learning_rate": 2.366831959727023e-07, "loss": 0.2984, "step": 3033 }, { "epoch": 4.6875241405948245, "grad_norm": 0.2559903575327571, "learning_rate": 2.3435477223454406e-07, "loss": 0.2859, "step": 3034 }, { "epoch": 4.6890691386635766, "grad_norm": 0.2667882593031153, "learning_rate": 2.3203772257525126e-07, "loss": 0.2926, "step": 3035 }, { "epoch": 4.690614136732329, "grad_norm": 0.2585206420654447, "learning_rate": 2.2973204969350182e-07, "loss": 0.2755, "step": 3036 }, { "epoch": 4.692159134801082, "grad_norm": 0.2669232031167189, "learning_rate": 2.2743775627471988e-07, "loss": 0.2969, "step": 3037 }, { "epoch": 4.693704132869834, "grad_norm": 0.2614069628729293, "learning_rate": 2.2515484499107455e-07, "loss": 0.3, "step": 3038 }, { "epoch": 4.695249130938587, "grad_norm": 0.2568808792370943, "learning_rate": 2.2288331850148227e-07, "loss": 0.2898, "step": 3039 }, { "epoch": 4.696794129007339, "grad_norm": 0.25238459662138113, "learning_rate": 2.206231794515956e-07, "loss": 0.2896, "step": 3040 }, { "epoch": 4.698339127076091, "grad_norm": 0.2652471351972476, "learning_rate": 2.1837443047380558e-07, "loss": 0.2919, "step": 3041 }, { "epoch": 4.699884125144844, "grad_norm": 0.2553998606851428, "learning_rate": 2.1613707418723928e-07, "loss": 0.2848, "step": 3042 }, { "epoch": 4.701429123213596, "grad_norm": 0.25542649938633366, "learning_rate": 2.1391111319775337e-07, "loss": 0.3065, "step": 3043 }, { "epoch": 4.702974121282349, "grad_norm": 0.2812636092094736, "learning_rate": 2.1169655009792956e-07, "loss": 0.2972, "step": 3044 }, { "epoch": 4.704519119351101, "grad_norm": 0.25404358711792563, "learning_rate": 2.0949338746707793e-07, "loss": 0.2841, "step": 3045 }, { "epoch": 4.706064117419853, "grad_norm": 0.2627303253026273, "learning_rate": 2.073016278712292e-07, "loss": 0.2943, "step": 3046 }, { "epoch": 4.707609115488606, "grad_norm": 0.2573181980524582, "learning_rate": 2.051212738631314e-07, "loss": 0.2936, "step": 3047 }, { "epoch": 4.709154113557358, "grad_norm": 0.25450832270118967, "learning_rate": 2.0295232798225318e-07, "loss": 0.3028, "step": 3048 }, { "epoch": 4.71069911162611, "grad_norm": 0.2553376179609789, "learning_rate": 2.0079479275477042e-07, "loss": 0.2607, "step": 3049 }, { "epoch": 4.712244109694863, "grad_norm": 0.25177203954022603, "learning_rate": 1.9864867069357195e-07, "loss": 0.2797, "step": 3050 }, { "epoch": 4.713789107763615, "grad_norm": 0.2575875971545679, "learning_rate": 1.9651396429825275e-07, "loss": 0.2998, "step": 3051 }, { "epoch": 4.715334105832367, "grad_norm": 0.2635968138324678, "learning_rate": 1.9439067605511397e-07, "loss": 0.2922, "step": 3052 }, { "epoch": 4.71687910390112, "grad_norm": 0.24317901975873896, "learning_rate": 1.9227880843715295e-07, "loss": 0.2802, "step": 3053 }, { "epoch": 4.718424101969872, "grad_norm": 0.2547787471260892, "learning_rate": 1.901783639040733e-07, "loss": 0.3249, "step": 3054 }, { "epoch": 4.719969100038625, "grad_norm": 0.24521049953279442, "learning_rate": 1.8808934490226695e-07, "loss": 0.3147, "step": 3055 }, { "epoch": 4.721514098107377, "grad_norm": 0.24994998314683725, "learning_rate": 1.8601175386481983e-07, "loss": 0.2851, "step": 3056 }, { "epoch": 4.723059096176129, "grad_norm": 0.25838635320022785, "learning_rate": 1.8394559321151194e-07, "loss": 0.2852, "step": 3057 }, { "epoch": 4.724604094244882, "grad_norm": 0.26483125987400336, "learning_rate": 1.8189086534880717e-07, "loss": 0.3043, "step": 3058 }, { "epoch": 4.7261490923136344, "grad_norm": 0.24740621963643042, "learning_rate": 1.798475726698523e-07, "loss": 0.2942, "step": 3059 }, { "epoch": 4.727694090382387, "grad_norm": 0.25991414421279757, "learning_rate": 1.778157175544759e-07, "loss": 0.2813, "step": 3060 }, { "epoch": 4.7292390884511395, "grad_norm": 0.25323902055654574, "learning_rate": 1.757953023691894e-07, "loss": 0.3062, "step": 3061 }, { "epoch": 4.730784086519892, "grad_norm": 0.2522027590208502, "learning_rate": 1.7378632946717488e-07, "loss": 0.2843, "step": 3062 }, { "epoch": 4.7323290845886445, "grad_norm": 0.24787680203183227, "learning_rate": 1.7178880118828843e-07, "loss": 0.2922, "step": 3063 }, { "epoch": 4.733874082657397, "grad_norm": 0.2532974854279987, "learning_rate": 1.6980271985906128e-07, "loss": 0.3153, "step": 3064 }, { "epoch": 4.73541908072615, "grad_norm": 0.2580133948574958, "learning_rate": 1.6782808779268746e-07, "loss": 0.3055, "step": 3065 }, { "epoch": 4.736964078794902, "grad_norm": 0.25298169374581525, "learning_rate": 1.6586490728902505e-07, "loss": 0.2979, "step": 3066 }, { "epoch": 4.738509076863654, "grad_norm": 0.2538966998135153, "learning_rate": 1.6391318063459948e-07, "loss": 0.2805, "step": 3067 }, { "epoch": 4.740054074932407, "grad_norm": 0.26741859535088586, "learning_rate": 1.6197291010259243e-07, "loss": 0.2922, "step": 3068 }, { "epoch": 4.741599073001159, "grad_norm": 0.2642514248214674, "learning_rate": 1.600440979528428e-07, "loss": 0.2997, "step": 3069 }, { "epoch": 4.743144071069911, "grad_norm": 0.2521038821057413, "learning_rate": 1.5812674643184477e-07, "loss": 0.2813, "step": 3070 }, { "epoch": 4.744689069138664, "grad_norm": 0.26571433631296837, "learning_rate": 1.562208577727442e-07, "loss": 0.2934, "step": 3071 }, { "epoch": 4.746234067207416, "grad_norm": 0.24920533281973395, "learning_rate": 1.5432643419533545e-07, "loss": 0.2886, "step": 3072 }, { "epoch": 4.747779065276168, "grad_norm": 0.26623976506898966, "learning_rate": 1.5244347790606018e-07, "loss": 0.2815, "step": 3073 }, { "epoch": 4.749324063344921, "grad_norm": 0.24087325245714944, "learning_rate": 1.5057199109800304e-07, "loss": 0.2874, "step": 3074 }, { "epoch": 4.750869061413673, "grad_norm": 0.2561113082240993, "learning_rate": 1.487119759508904e-07, "loss": 0.2958, "step": 3075 }, { "epoch": 4.752414059482426, "grad_norm": 0.2680255621342339, "learning_rate": 1.4686343463109042e-07, "loss": 0.2946, "step": 3076 }, { "epoch": 4.753959057551178, "grad_norm": 0.25592499365391025, "learning_rate": 1.450263692916032e-07, "loss": 0.2639, "step": 3077 }, { "epoch": 4.75550405561993, "grad_norm": 0.25763694586491487, "learning_rate": 1.432007820720649e-07, "loss": 0.2917, "step": 3078 }, { "epoch": 4.757049053688683, "grad_norm": 0.25967977415257865, "learning_rate": 1.413866750987436e-07, "loss": 0.2805, "step": 3079 }, { "epoch": 4.758594051757435, "grad_norm": 0.2699266695225682, "learning_rate": 1.3958405048453473e-07, "loss": 0.3221, "step": 3080 }, { "epoch": 4.760139049826188, "grad_norm": 0.24106353756088736, "learning_rate": 1.3779291032896103e-07, "loss": 0.3241, "step": 3081 }, { "epoch": 4.76168404789494, "grad_norm": 0.24435704589938798, "learning_rate": 1.3601325671817157e-07, "loss": 0.2825, "step": 3082 }, { "epoch": 4.763229045963692, "grad_norm": 0.25947427873062445, "learning_rate": 1.3424509172493162e-07, "loss": 0.2993, "step": 3083 }, { "epoch": 4.764774044032445, "grad_norm": 0.25944673175945776, "learning_rate": 1.324884174086305e-07, "loss": 0.2929, "step": 3084 }, { "epoch": 4.766319042101197, "grad_norm": 0.25336135081447747, "learning_rate": 1.3074323581527048e-07, "loss": 0.2886, "step": 3085 }, { "epoch": 4.7678640401699495, "grad_norm": 0.252614519515273, "learning_rate": 1.2900954897747453e-07, "loss": 0.2775, "step": 3086 }, { "epoch": 4.769409038238702, "grad_norm": 0.24579476913754703, "learning_rate": 1.2728735891446852e-07, "loss": 0.2869, "step": 3087 }, { "epoch": 4.7709540363074545, "grad_norm": 0.2528807109876194, "learning_rate": 1.2557666763209574e-07, "loss": 0.305, "step": 3088 }, { "epoch": 4.772499034376207, "grad_norm": 0.24487471338204533, "learning_rate": 1.2387747712280241e-07, "loss": 0.312, "step": 3089 }, { "epoch": 4.77404403244496, "grad_norm": 0.2608447655149398, "learning_rate": 1.2218978936564317e-07, "loss": 0.2837, "step": 3090 }, { "epoch": 4.775589030513712, "grad_norm": 0.2658506811725821, "learning_rate": 1.2051360632627128e-07, "loss": 0.3044, "step": 3091 }, { "epoch": 4.777134028582465, "grad_norm": 0.25523968432562266, "learning_rate": 1.1884892995694397e-07, "loss": 0.2947, "step": 3092 }, { "epoch": 4.778679026651217, "grad_norm": 0.26107769789039603, "learning_rate": 1.1719576219651585e-07, "loss": 0.2747, "step": 3093 }, { "epoch": 4.780224024719969, "grad_norm": 0.25967638686201483, "learning_rate": 1.155541049704334e-07, "loss": 0.2898, "step": 3094 }, { "epoch": 4.781769022788722, "grad_norm": 0.24076064761231497, "learning_rate": 1.1392396019074492e-07, "loss": 0.2874, "step": 3095 }, { "epoch": 4.783314020857474, "grad_norm": 0.2538786543107622, "learning_rate": 1.1230532975608166e-07, "loss": 0.2977, "step": 3096 }, { "epoch": 4.784859018926227, "grad_norm": 0.2534819453332853, "learning_rate": 1.1069821555166893e-07, "loss": 0.3019, "step": 3097 }, { "epoch": 4.786404016994979, "grad_norm": 0.2702612126862035, "learning_rate": 1.0910261944931944e-07, "loss": 0.3054, "step": 3098 }, { "epoch": 4.787949015063731, "grad_norm": 0.254789858231426, "learning_rate": 1.0751854330742662e-07, "loss": 0.286, "step": 3099 }, { "epoch": 4.789494013132484, "grad_norm": 0.2445154486603855, "learning_rate": 1.0594598897097131e-07, "loss": 0.308, "step": 3100 }, { "epoch": 4.791039011201236, "grad_norm": 0.2514993116605904, "learning_rate": 1.0438495827151063e-07, "loss": 0.2897, "step": 3101 }, { "epoch": 4.792584009269988, "grad_norm": 0.27896843305717867, "learning_rate": 1.0283545302718467e-07, "loss": 0.2725, "step": 3102 }, { "epoch": 4.794129007338741, "grad_norm": 0.25113435147902635, "learning_rate": 1.0129747504270426e-07, "loss": 0.294, "step": 3103 }, { "epoch": 4.795674005407493, "grad_norm": 0.2572985147403647, "learning_rate": 9.977102610935873e-08, "loss": 0.2942, "step": 3104 }, { "epoch": 4.797219003476245, "grad_norm": 0.253533454474435, "learning_rate": 9.825610800500928e-08, "loss": 0.2912, "step": 3105 }, { "epoch": 4.798764001544998, "grad_norm": 0.2682009952194996, "learning_rate": 9.675272249408451e-08, "loss": 0.2943, "step": 3106 }, { "epoch": 4.80030899961375, "grad_norm": 0.2628965486878884, "learning_rate": 9.526087132758378e-08, "loss": 0.2793, "step": 3107 }, { "epoch": 4.801853997682503, "grad_norm": 0.241876425219939, "learning_rate": 9.378055624306937e-08, "loss": 0.2819, "step": 3108 }, { "epoch": 4.803398995751255, "grad_norm": 0.2522902335177405, "learning_rate": 9.23117789646688e-08, "loss": 0.2922, "step": 3109 }, { "epoch": 4.804943993820007, "grad_norm": 0.25012039531530555, "learning_rate": 9.085454120307369e-08, "loss": 0.2944, "step": 3110 }, { "epoch": 4.80648899188876, "grad_norm": 0.26196557563682565, "learning_rate": 8.9408844655533e-08, "loss": 0.3094, "step": 3111 }, { "epoch": 4.808033989957512, "grad_norm": 0.25034625471773875, "learning_rate": 8.797469100585432e-08, "loss": 0.3111, "step": 3112 }, { "epoch": 4.809578988026265, "grad_norm": 0.2573775303619353, "learning_rate": 8.655208192440035e-08, "loss": 0.2795, "step": 3113 }, { "epoch": 4.8111239860950175, "grad_norm": 0.24570076708153799, "learning_rate": 8.514101906808903e-08, "loss": 0.2981, "step": 3114 }, { "epoch": 4.8126689841637695, "grad_norm": 0.2560553607758964, "learning_rate": 8.374150408038906e-08, "loss": 0.2678, "step": 3115 }, { "epoch": 4.8142139822325225, "grad_norm": 0.25514993283212767, "learning_rate": 8.235353859132101e-08, "loss": 0.2912, "step": 3116 }, { "epoch": 4.815758980301275, "grad_norm": 0.2564593705558445, "learning_rate": 8.097712421745285e-08, "loss": 0.2939, "step": 3117 }, { "epoch": 4.817303978370027, "grad_norm": 0.24438243256142436, "learning_rate": 7.96122625618978e-08, "loss": 0.3062, "step": 3118 }, { "epoch": 4.81884897643878, "grad_norm": 0.2566158061297478, "learning_rate": 7.825895521431648e-08, "loss": 0.297, "step": 3119 }, { "epoch": 4.820393974507532, "grad_norm": 0.2647315094862275, "learning_rate": 7.691720375090917e-08, "loss": 0.2762, "step": 3120 }, { "epoch": 4.821938972576284, "grad_norm": 0.25332631178034265, "learning_rate": 7.558700973441913e-08, "loss": 0.2923, "step": 3121 }, { "epoch": 4.823483970645037, "grad_norm": 0.25754452034045594, "learning_rate": 7.426837471412484e-08, "loss": 0.3007, "step": 3122 }, { "epoch": 4.825028968713789, "grad_norm": 0.25802855412718945, "learning_rate": 7.296130022584891e-08, "loss": 0.2965, "step": 3123 }, { "epoch": 4.826573966782542, "grad_norm": 0.24171961863339633, "learning_rate": 7.166578779194244e-08, "loss": 0.2953, "step": 3124 }, { "epoch": 4.828118964851294, "grad_norm": 0.26848970212251244, "learning_rate": 7.038183892129402e-08, "loss": 0.2849, "step": 3125 }, { "epoch": 4.829663962920046, "grad_norm": 0.2732647192479697, "learning_rate": 6.910945510932299e-08, "loss": 0.2792, "step": 3126 }, { "epoch": 4.831208960988799, "grad_norm": 0.25119847200311296, "learning_rate": 6.784863783797834e-08, "loss": 0.2841, "step": 3127 }, { "epoch": 4.832753959057551, "grad_norm": 0.2659127426223045, "learning_rate": 6.659938857573989e-08, "loss": 0.2835, "step": 3128 }, { "epoch": 4.834298957126304, "grad_norm": 0.24287669202615508, "learning_rate": 6.536170877760928e-08, "loss": 0.3094, "step": 3129 }, { "epoch": 4.835843955195056, "grad_norm": 0.2524398474028034, "learning_rate": 6.413559988511898e-08, "loss": 0.2766, "step": 3130 }, { "epoch": 4.837388953263808, "grad_norm": 0.24636644294928553, "learning_rate": 6.292106332632109e-08, "loss": 0.2805, "step": 3131 }, { "epoch": 4.838933951332561, "grad_norm": 0.25059216219556507, "learning_rate": 6.171810051578964e-08, "loss": 0.294, "step": 3132 }, { "epoch": 4.840478949401313, "grad_norm": 0.2593449990780125, "learning_rate": 6.052671285462053e-08, "loss": 0.2732, "step": 3133 }, { "epoch": 4.842023947470066, "grad_norm": 0.265582989211521, "learning_rate": 5.934690173042601e-08, "loss": 0.2848, "step": 3134 }, { "epoch": 4.843568945538818, "grad_norm": 0.24836888259695616, "learning_rate": 5.8178668517335776e-08, "loss": 0.3193, "step": 3135 }, { "epoch": 4.84511394360757, "grad_norm": 0.2587337178212595, "learning_rate": 5.7022014575995874e-08, "loss": 0.2957, "step": 3136 }, { "epoch": 4.846658941676323, "grad_norm": 0.25640047361586515, "learning_rate": 5.5876941253563133e-08, "loss": 0.3007, "step": 3137 }, { "epoch": 4.848203939745075, "grad_norm": 0.25405299279784654, "learning_rate": 5.474344988370961e-08, "loss": 0.285, "step": 3138 }, { "epoch": 4.849748937813827, "grad_norm": 0.2524368973556902, "learning_rate": 5.362154178661705e-08, "loss": 0.2812, "step": 3139 }, { "epoch": 4.85129393588258, "grad_norm": 0.25438246106219387, "learning_rate": 5.2511218268974653e-08, "loss": 0.2916, "step": 3140 }, { "epoch": 4.8528389339513325, "grad_norm": 0.25761563933862874, "learning_rate": 5.1412480623979074e-08, "loss": 0.2883, "step": 3141 }, { "epoch": 4.854383932020085, "grad_norm": 0.3884347308779884, "learning_rate": 5.032533013133445e-08, "loss": 0.2972, "step": 3142 }, { "epoch": 4.8559289300888375, "grad_norm": 0.25459195301079524, "learning_rate": 4.9249768057247906e-08, "loss": 0.2923, "step": 3143 }, { "epoch": 4.85747392815759, "grad_norm": 0.2723526198000797, "learning_rate": 4.8185795654430714e-08, "loss": 0.2779, "step": 3144 }, { "epoch": 4.859018926226343, "grad_norm": 0.2597046608050061, "learning_rate": 4.713341416209494e-08, "loss": 0.2746, "step": 3145 }, { "epoch": 4.860563924295095, "grad_norm": 0.26742727602246746, "learning_rate": 4.6092624805953445e-08, "loss": 0.306, "step": 3146 }, { "epoch": 4.862108922363847, "grad_norm": 0.25912298128520866, "learning_rate": 4.5063428798215456e-08, "loss": 0.2744, "step": 3147 }, { "epoch": 4.8636539204326, "grad_norm": 0.24338053061352327, "learning_rate": 4.404582733758989e-08, "loss": 0.2751, "step": 3148 }, { "epoch": 4.865198918501352, "grad_norm": 0.2437854128883248, "learning_rate": 4.3039821609280883e-08, "loss": 0.2991, "step": 3149 }, { "epoch": 4.866743916570105, "grad_norm": 0.25091088930381433, "learning_rate": 4.204541278498453e-08, "loss": 0.2865, "step": 3150 }, { "epoch": 4.868288914638857, "grad_norm": 0.24341464505682345, "learning_rate": 4.106260202289436e-08, "loss": 0.2935, "step": 3151 }, { "epoch": 4.869833912707609, "grad_norm": 0.26994373494428825, "learning_rate": 4.00913904676925e-08, "loss": 0.2947, "step": 3152 }, { "epoch": 4.871378910776362, "grad_norm": 0.27505825971515313, "learning_rate": 3.913177925055189e-08, "loss": 0.3059, "step": 3153 }, { "epoch": 4.872923908845114, "grad_norm": 0.25677800328253775, "learning_rate": 3.818376948913516e-08, "loss": 0.2813, "step": 3154 }, { "epoch": 4.874468906913866, "grad_norm": 0.24802025913650247, "learning_rate": 3.72473622875924e-08, "loss": 0.2879, "step": 3155 }, { "epoch": 4.876013904982619, "grad_norm": 0.25375381537341724, "learning_rate": 3.632255873655899e-08, "loss": 0.2973, "step": 3156 }, { "epoch": 4.877558903051371, "grad_norm": 0.24565996198324672, "learning_rate": 3.540935991315886e-08, "loss": 0.3021, "step": 3157 }, { "epoch": 4.879103901120123, "grad_norm": 0.27031542113477763, "learning_rate": 3.4507766880996777e-08, "loss": 0.2717, "step": 3158 }, { "epoch": 4.880648899188876, "grad_norm": 0.26984848219184726, "learning_rate": 3.361778069015942e-08, "loss": 0.3028, "step": 3159 }, { "epoch": 4.882193897257628, "grad_norm": 0.25055738718507226, "learning_rate": 3.2739402377218733e-08, "loss": 0.2809, "step": 3160 }, { "epoch": 4.883738895326381, "grad_norm": 0.2553017173834425, "learning_rate": 3.187263296522414e-08, "loss": 0.3079, "step": 3161 }, { "epoch": 4.885283893395133, "grad_norm": 0.27134236346025103, "learning_rate": 3.101747346370476e-08, "loss": 0.2966, "step": 3162 }, { "epoch": 4.886828891463885, "grad_norm": 0.238700532193219, "learning_rate": 3.017392486866721e-08, "loss": 0.2904, "step": 3163 }, { "epoch": 4.888373889532638, "grad_norm": 0.2664104995700814, "learning_rate": 2.9341988162595593e-08, "loss": 0.2924, "step": 3164 }, { "epoch": 4.88991888760139, "grad_norm": 0.25312119786678605, "learning_rate": 2.852166431444925e-08, "loss": 0.2852, "step": 3165 }, { "epoch": 4.891463885670143, "grad_norm": 0.2726811827798265, "learning_rate": 2.7712954279662807e-08, "loss": 0.2973, "step": 3166 }, { "epoch": 4.893008883738895, "grad_norm": 0.2627360031991354, "learning_rate": 2.691585900014282e-08, "loss": 0.3028, "step": 3167 }, { "epoch": 4.8945538818076475, "grad_norm": 0.263266359101341, "learning_rate": 2.613037940426888e-08, "loss": 0.3111, "step": 3168 }, { "epoch": 4.8960988798764005, "grad_norm": 0.2555703880319152, "learning_rate": 2.5356516406890297e-08, "loss": 0.3061, "step": 3169 }, { "epoch": 4.8976438779451525, "grad_norm": 0.23354636919548324, "learning_rate": 2.459427090933053e-08, "loss": 0.2979, "step": 3170 }, { "epoch": 4.899188876013905, "grad_norm": 0.2650312391366454, "learning_rate": 2.3843643799377203e-08, "loss": 0.2889, "step": 3171 }, { "epoch": 4.900733874082658, "grad_norm": 0.25084262133255253, "learning_rate": 2.3104635951289865e-08, "loss": 0.306, "step": 3172 }, { "epoch": 4.90227887215141, "grad_norm": 0.2486852643493245, "learning_rate": 2.2377248225791126e-08, "loss": 0.2861, "step": 3173 }, { "epoch": 4.903823870220162, "grad_norm": 0.25855230519573574, "learning_rate": 2.166148147007441e-08, "loss": 0.283, "step": 3174 }, { "epoch": 4.905368868288915, "grad_norm": 0.24859770816223153, "learning_rate": 2.095733651779286e-08, "loss": 0.2909, "step": 3175 }, { "epoch": 4.906913866357667, "grad_norm": 0.25891763202999857, "learning_rate": 2.0264814189069338e-08, "loss": 0.299, "step": 3176 }, { "epoch": 4.90845886442642, "grad_norm": 0.2628373285797482, "learning_rate": 1.95839152904842e-08, "loss": 0.3001, "step": 3177 }, { "epoch": 4.910003862495172, "grad_norm": 0.2658454027222171, "learning_rate": 1.891464061508419e-08, "loss": 0.2808, "step": 3178 }, { "epoch": 4.911548860563924, "grad_norm": 0.2588877161165534, "learning_rate": 1.8256990942375763e-08, "loss": 0.2892, "step": 3179 }, { "epoch": 4.913093858632677, "grad_norm": 0.2558921809926812, "learning_rate": 1.7610967038325098e-08, "loss": 0.3087, "step": 3180 }, { "epoch": 4.914638856701429, "grad_norm": 0.2539661338213362, "learning_rate": 1.69765696553581e-08, "loss": 0.2635, "step": 3181 }, { "epoch": 4.916183854770182, "grad_norm": 0.26979473641104096, "learning_rate": 1.6353799532359272e-08, "loss": 0.3011, "step": 3182 }, { "epoch": 4.917728852838934, "grad_norm": 0.257195493811567, "learning_rate": 1.5742657394672845e-08, "loss": 0.2917, "step": 3183 }, { "epoch": 4.919273850907686, "grad_norm": 0.2577444843402606, "learning_rate": 1.5143143954096106e-08, "loss": 0.2915, "step": 3184 }, { "epoch": 4.920818848976439, "grad_norm": 0.2573012556974309, "learning_rate": 1.4555259908884956e-08, "loss": 0.3021, "step": 3185 }, { "epoch": 4.922363847045191, "grad_norm": 0.2756755319012173, "learning_rate": 1.3979005943749458e-08, "loss": 0.2764, "step": 3186 }, { "epoch": 4.923908845113943, "grad_norm": 0.23916518764361053, "learning_rate": 1.3414382729854958e-08, "loss": 0.2937, "step": 3187 }, { "epoch": 4.925453843182696, "grad_norm": 0.261849376442361, "learning_rate": 1.2861390924820971e-08, "loss": 0.2895, "step": 3188 }, { "epoch": 4.926998841251448, "grad_norm": 0.24826642618733027, "learning_rate": 1.232003117271674e-08, "loss": 0.2988, "step": 3189 }, { "epoch": 4.928543839320201, "grad_norm": 0.24853887948250036, "learning_rate": 1.1790304104066785e-08, "loss": 0.2911, "step": 3190 }, { "epoch": 4.930088837388953, "grad_norm": 0.24270067833376385, "learning_rate": 1.1272210335846467e-08, "loss": 0.3125, "step": 3191 }, { "epoch": 4.931633835457705, "grad_norm": 0.27524267578953826, "learning_rate": 1.0765750471480874e-08, "loss": 0.2974, "step": 3192 }, { "epoch": 4.933178833526458, "grad_norm": 0.2514539487626679, "learning_rate": 1.027092510084482e-08, "loss": 0.2829, "step": 3193 }, { "epoch": 4.93472383159521, "grad_norm": 0.25530309636590776, "learning_rate": 9.78773480026396e-09, "loss": 0.3063, "step": 3194 }, { "epoch": 4.9362688296639625, "grad_norm": 0.2430448526910672, "learning_rate": 9.316180132511454e-09, "loss": 0.2825, "step": 3195 }, { "epoch": 4.9378138277327155, "grad_norm": 0.24953605691053876, "learning_rate": 8.856261646807973e-09, "loss": 0.276, "step": 3196 }, { "epoch": 4.939358825801468, "grad_norm": 0.26085038721660375, "learning_rate": 8.407979878821693e-09, "loss": 0.2911, "step": 3197 }, { "epoch": 4.9409038238702205, "grad_norm": 0.2529795028057573, "learning_rate": 7.971335350669407e-09, "loss": 0.2877, "step": 3198 }, { "epoch": 4.942448821938973, "grad_norm": 0.25165456998556174, "learning_rate": 7.546328570912087e-09, "loss": 0.2926, "step": 3199 }, { "epoch": 4.943993820007725, "grad_norm": 0.261137378374808, "learning_rate": 7.132960034554881e-09, "loss": 0.2854, "step": 3200 }, { "epoch": 4.945538818076478, "grad_norm": 0.26188548062388917, "learning_rate": 6.731230223051555e-09, "loss": 0.2921, "step": 3201 }, { "epoch": 4.94708381614523, "grad_norm": 0.2551406478394897, "learning_rate": 6.3411396042967195e-09, "loss": 0.2981, "step": 3202 }, { "epoch": 4.948628814213983, "grad_norm": 0.25222437832835654, "learning_rate": 5.962688632630276e-09, "loss": 0.2774, "step": 3203 }, { "epoch": 4.950173812282735, "grad_norm": 0.27823138205186315, "learning_rate": 5.5958777488374084e-09, "loss": 0.2846, "step": 3204 }, { "epoch": 4.951718810351487, "grad_norm": 0.25918505017963855, "learning_rate": 5.240707380140819e-09, "loss": 0.2957, "step": 3205 }, { "epoch": 4.95326380842024, "grad_norm": 0.2691778992141782, "learning_rate": 4.897177940209607e-09, "loss": 0.2808, "step": 3206 }, { "epoch": 4.954808806488992, "grad_norm": 0.2506498611968295, "learning_rate": 4.565289829154829e-09, "loss": 0.2805, "step": 3207 }, { "epoch": 4.956353804557744, "grad_norm": 0.24703882740054822, "learning_rate": 4.245043433527274e-09, "loss": 0.2907, "step": 3208 }, { "epoch": 4.957898802626497, "grad_norm": 0.26781286296162743, "learning_rate": 3.936439126317471e-09, "loss": 0.2918, "step": 3209 }, { "epoch": 4.959443800695249, "grad_norm": 0.26449507230830843, "learning_rate": 3.6394772669601232e-09, "loss": 0.319, "step": 3210 }, { "epoch": 4.960988798764001, "grad_norm": 0.2645583190923361, "learning_rate": 3.354158201326341e-09, "loss": 0.2945, "step": 3211 }, { "epoch": 4.962533796832754, "grad_norm": 0.2701639285128124, "learning_rate": 3.0804822617269693e-09, "loss": 0.2917, "step": 3212 }, { "epoch": 4.964078794901506, "grad_norm": 0.244463820017173, "learning_rate": 2.8184497669159207e-09, "loss": 0.2754, "step": 3213 }, { "epoch": 4.965623792970259, "grad_norm": 0.25321065765381146, "learning_rate": 2.568061022082402e-09, "loss": 0.3006, "step": 3214 }, { "epoch": 4.967168791039011, "grad_norm": 0.2648764123155521, "learning_rate": 2.3293163188531365e-09, "loss": 0.2882, "step": 3215 }, { "epoch": 4.968713789107763, "grad_norm": 0.2576406493639142, "learning_rate": 2.1022159352979132e-09, "loss": 0.2918, "step": 3216 }, { "epoch": 4.970258787176516, "grad_norm": 0.2618865954101649, "learning_rate": 1.886760135918486e-09, "loss": 0.2926, "step": 3217 }, { "epoch": 4.971803785245268, "grad_norm": 0.266272378997755, "learning_rate": 1.6829491716585655e-09, "loss": 0.2759, "step": 3218 }, { "epoch": 4.973348783314021, "grad_norm": 0.2527104354146062, "learning_rate": 1.4907832798971567e-09, "loss": 0.3009, "step": 3219 }, { "epoch": 4.974893781382773, "grad_norm": 0.2664754974802911, "learning_rate": 1.3102626844507805e-09, "loss": 0.275, "step": 3220 }, { "epoch": 4.9764387794515255, "grad_norm": 0.25867265361391384, "learning_rate": 1.1413875955712529e-09, "loss": 0.2855, "step": 3221 }, { "epoch": 4.977983777520278, "grad_norm": 0.2485550225744536, "learning_rate": 9.841582099490154e-10, "loss": 0.2808, "step": 3222 }, { "epoch": 4.9795287755890305, "grad_norm": 0.25693374027664123, "learning_rate": 8.38574710708695e-10, "loss": 0.282, "step": 3223 }, { "epoch": 4.981073773657783, "grad_norm": 0.24549712369310722, "learning_rate": 7.046372674124336e-10, "loss": 0.293, "step": 3224 }, { "epoch": 4.9826187717265356, "grad_norm": 0.26406240573717155, "learning_rate": 5.823460360587784e-10, "loss": 0.2863, "step": 3225 }, { "epoch": 4.984163769795288, "grad_norm": 0.2627830362298323, "learning_rate": 4.717011590793518e-10, "loss": 0.2955, "step": 3226 }, { "epoch": 4.98570876786404, "grad_norm": 0.25065459456984196, "learning_rate": 3.7270276534218107e-10, "loss": 0.2975, "step": 3227 }, { "epoch": 4.987253765932793, "grad_norm": 0.26387785853687307, "learning_rate": 2.8535097015169875e-10, "loss": 0.2972, "step": 3228 }, { "epoch": 4.988798764001545, "grad_norm": 0.26240875180761636, "learning_rate": 2.0964587524652246e-10, "loss": 0.2884, "step": 3229 }, { "epoch": 4.990343762070298, "grad_norm": 0.26694758501253935, "learning_rate": 1.4558756880167502e-10, "loss": 0.2638, "step": 3230 }, { "epoch": 4.99188876013905, "grad_norm": 0.25432186019400693, "learning_rate": 9.317612542525389e-11, "loss": 0.2764, "step": 3231 }, { "epoch": 4.993433758207802, "grad_norm": 0.24276982906523575, "learning_rate": 5.2411606160651664e-11, "loss": 0.2877, "step": 3232 }, { "epoch": 4.994978756276555, "grad_norm": 0.2499445868789298, "learning_rate": 2.329405848655597e-11, "loss": 0.2799, "step": 3233 }, { "epoch": 4.996523754345307, "grad_norm": 0.2642190332747088, "learning_rate": 5.823516316949551e-12, "loss": 0.3005, "step": 3234 }, { "epoch": 4.99806875241406, "grad_norm": 0.25509562799069685, "learning_rate": 0.0, "loss": 0.2864, "step": 3235 }, { "epoch": 4.99806875241406, "step": 3235, "total_flos": 6.858435114997252e+18, "train_loss": 0.3828202440298322, "train_runtime": 133801.6515, "train_samples_per_second": 3.096, "train_steps_per_second": 0.024 } ], "logging_steps": 1.0, "max_steps": 3235, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.858435114997252e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }