{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997284084736556, "eval_steps": 500, "global_step": 2760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010863661053775121, "grad_norm": 47.42976265253127, "learning_rate": 0.0, "loss": 11.434, "step": 1 }, { "epoch": 0.0021727322107550242, "grad_norm": 47.68004442246608, "learning_rate": 1.8115942028985507e-07, "loss": 11.3418, "step": 2 }, { "epoch": 0.0032590983161325366, "grad_norm": 45.068010333084196, "learning_rate": 3.6231884057971015e-07, "loss": 11.5874, "step": 3 }, { "epoch": 0.0043454644215100485, "grad_norm": 46.649670849706524, "learning_rate": 5.434782608695653e-07, "loss": 11.3539, "step": 4 }, { "epoch": 0.005431830526887561, "grad_norm": 47.08562963169932, "learning_rate": 7.246376811594203e-07, "loss": 11.3614, "step": 5 }, { "epoch": 0.006518196632265073, "grad_norm": 49.647719538761756, "learning_rate": 9.057971014492754e-07, "loss": 11.3663, "step": 6 }, { "epoch": 0.0076045627376425855, "grad_norm": 47.01674764708084, "learning_rate": 1.0869565217391306e-06, "loss": 11.4344, "step": 7 }, { "epoch": 0.008690928843020097, "grad_norm": 52.09313128785783, "learning_rate": 1.2681159420289857e-06, "loss": 10.9463, "step": 8 }, { "epoch": 0.00977729494839761, "grad_norm": 46.02815358023105, "learning_rate": 1.4492753623188406e-06, "loss": 11.3141, "step": 9 }, { "epoch": 0.010863661053775122, "grad_norm": 49.371112006683596, "learning_rate": 1.6304347826086957e-06, "loss": 11.0773, "step": 10 }, { "epoch": 0.011950027159152634, "grad_norm": 67.30236569780485, "learning_rate": 1.8115942028985508e-06, "loss": 10.1833, "step": 11 }, { "epoch": 0.013036393264530146, "grad_norm": 66.45540616448592, "learning_rate": 1.9927536231884058e-06, "loss": 10.191, "step": 12 }, { "epoch": 0.014122759369907659, "grad_norm": 79.4421801208747, "learning_rate": 2.173913043478261e-06, "loss": 9.737, "step": 13 }, { "epoch": 0.015209125475285171, "grad_norm": 83.09920933118697, "learning_rate": 2.355072463768116e-06, "loss": 9.4759, "step": 14 }, { "epoch": 0.016295491580662683, "grad_norm": 82.75546589659622, "learning_rate": 2.5362318840579714e-06, "loss": 5.4515, "step": 15 }, { "epoch": 0.017381857686040194, "grad_norm": 74.56643305451783, "learning_rate": 2.7173913043478263e-06, "loss": 4.872, "step": 16 }, { "epoch": 0.018468223791417708, "grad_norm": 67.0651747186948, "learning_rate": 2.898550724637681e-06, "loss": 4.7424, "step": 17 }, { "epoch": 0.01955458989679522, "grad_norm": 49.58983529341401, "learning_rate": 3.0797101449275365e-06, "loss": 3.7345, "step": 18 }, { "epoch": 0.020640956002172733, "grad_norm": 42.82106366208801, "learning_rate": 3.2608695652173914e-06, "loss": 3.3123, "step": 19 }, { "epoch": 0.021727322107550243, "grad_norm": 9.871537529993306, "learning_rate": 3.4420289855072464e-06, "loss": 1.824, "step": 20 }, { "epoch": 0.022813688212927757, "grad_norm": 6.2443558856559225, "learning_rate": 3.6231884057971017e-06, "loss": 1.7047, "step": 21 }, { "epoch": 0.023900054318305268, "grad_norm": 8.605494563630991, "learning_rate": 3.804347826086957e-06, "loss": 1.6876, "step": 22 }, { "epoch": 0.024986420423682782, "grad_norm": 4.8381633958609145, "learning_rate": 3.9855072463768115e-06, "loss": 1.7046, "step": 23 }, { "epoch": 0.026072786529060293, "grad_norm": 3.8298906934134367, "learning_rate": 4.166666666666667e-06, "loss": 1.5484, "step": 24 }, { "epoch": 0.027159152634437807, "grad_norm": 3.1043417877337567, "learning_rate": 4.347826086956522e-06, "loss": 1.4028, "step": 25 }, { "epoch": 0.028245518739815317, "grad_norm": 2.5457200949214593, "learning_rate": 4.528985507246377e-06, "loss": 1.3538, "step": 26 }, { "epoch": 0.02933188484519283, "grad_norm": 1.9819893516712124, "learning_rate": 4.710144927536232e-06, "loss": 1.3066, "step": 27 }, { "epoch": 0.030418250950570342, "grad_norm": 1.7314098293735436, "learning_rate": 4.891304347826087e-06, "loss": 1.4263, "step": 28 }, { "epoch": 0.031504617055947856, "grad_norm": 1.3755571776862083, "learning_rate": 5.072463768115943e-06, "loss": 1.1298, "step": 29 }, { "epoch": 0.03259098316132537, "grad_norm": 8.87132189075883, "learning_rate": 5.253623188405797e-06, "loss": 1.0306, "step": 30 }, { "epoch": 0.03367734926670288, "grad_norm": 1.7995058100446353, "learning_rate": 5.4347826086956525e-06, "loss": 1.1264, "step": 31 }, { "epoch": 0.03476371537208039, "grad_norm": 1.1425503558784733, "learning_rate": 5.615942028985508e-06, "loss": 1.2436, "step": 32 }, { "epoch": 0.035850081477457905, "grad_norm": 0.8575692155276106, "learning_rate": 5.797101449275362e-06, "loss": 0.9673, "step": 33 }, { "epoch": 0.036936447582835416, "grad_norm": 0.9022077624005543, "learning_rate": 5.978260869565218e-06, "loss": 1.1056, "step": 34 }, { "epoch": 0.03802281368821293, "grad_norm": 0.8193865263174435, "learning_rate": 6.159420289855073e-06, "loss": 1.1485, "step": 35 }, { "epoch": 0.03910917979359044, "grad_norm": 0.7212984170572092, "learning_rate": 6.340579710144928e-06, "loss": 0.9802, "step": 36 }, { "epoch": 0.040195545898967955, "grad_norm": 0.6849715821599869, "learning_rate": 6.521739130434783e-06, "loss": 1.0494, "step": 37 }, { "epoch": 0.041281912004345465, "grad_norm": 0.7177998347564148, "learning_rate": 6.702898550724638e-06, "loss": 1.0036, "step": 38 }, { "epoch": 0.042368278109722976, "grad_norm": 0.6542128369487578, "learning_rate": 6.884057971014493e-06, "loss": 1.0708, "step": 39 }, { "epoch": 0.04345464421510049, "grad_norm": 0.6325722477016282, "learning_rate": 7.065217391304347e-06, "loss": 1.0924, "step": 40 }, { "epoch": 0.044541010320478004, "grad_norm": 0.6428606626162306, "learning_rate": 7.246376811594203e-06, "loss": 0.9409, "step": 41 }, { "epoch": 0.045627376425855515, "grad_norm": 0.6589541916442002, "learning_rate": 7.427536231884058e-06, "loss": 0.8022, "step": 42 }, { "epoch": 0.046713742531233025, "grad_norm": 0.5858770012410753, "learning_rate": 7.608695652173914e-06, "loss": 0.8985, "step": 43 }, { "epoch": 0.047800108636610536, "grad_norm": 0.5808285599783714, "learning_rate": 7.789855072463769e-06, "loss": 0.9994, "step": 44 }, { "epoch": 0.04888647474198805, "grad_norm": 0.5466520993115281, "learning_rate": 7.971014492753623e-06, "loss": 0.9522, "step": 45 }, { "epoch": 0.049972840847365564, "grad_norm": 0.5337135966592617, "learning_rate": 8.15217391304348e-06, "loss": 0.9646, "step": 46 }, { "epoch": 0.051059206952743075, "grad_norm": 0.5615089696443837, "learning_rate": 8.333333333333334e-06, "loss": 0.9139, "step": 47 }, { "epoch": 0.052145573058120585, "grad_norm": 0.5157434947378996, "learning_rate": 8.51449275362319e-06, "loss": 0.814, "step": 48 }, { "epoch": 0.053231939163498096, "grad_norm": 0.4969559881767152, "learning_rate": 8.695652173913044e-06, "loss": 0.7876, "step": 49 }, { "epoch": 0.054318305268875614, "grad_norm": 0.49539178197821104, "learning_rate": 8.876811594202899e-06, "loss": 0.95, "step": 50 }, { "epoch": 0.055404671374253124, "grad_norm": 0.47206056016318554, "learning_rate": 9.057971014492753e-06, "loss": 0.874, "step": 51 }, { "epoch": 0.056491037479630635, "grad_norm": 0.47657722173037265, "learning_rate": 9.239130434782608e-06, "loss": 1.0396, "step": 52 }, { "epoch": 0.057577403585008145, "grad_norm": 0.44635693268466214, "learning_rate": 9.420289855072464e-06, "loss": 0.7916, "step": 53 }, { "epoch": 0.05866376969038566, "grad_norm": 0.4303597391329919, "learning_rate": 9.601449275362319e-06, "loss": 0.9078, "step": 54 }, { "epoch": 0.05975013579576317, "grad_norm": 0.4003575052406969, "learning_rate": 9.782608695652175e-06, "loss": 0.8336, "step": 55 }, { "epoch": 0.060836501901140684, "grad_norm": 0.39773386635530766, "learning_rate": 9.96376811594203e-06, "loss": 0.77, "step": 56 }, { "epoch": 0.061922868006518195, "grad_norm": 0.3941398218902587, "learning_rate": 1.0144927536231885e-05, "loss": 0.9084, "step": 57 }, { "epoch": 0.06300923411189571, "grad_norm": 0.35736259296533923, "learning_rate": 1.032608695652174e-05, "loss": 0.769, "step": 58 }, { "epoch": 0.06409560021727322, "grad_norm": 0.3816418641371705, "learning_rate": 1.0507246376811594e-05, "loss": 0.7691, "step": 59 }, { "epoch": 0.06518196632265073, "grad_norm": 0.3810710913713711, "learning_rate": 1.068840579710145e-05, "loss": 0.8808, "step": 60 }, { "epoch": 0.06626833242802825, "grad_norm": 0.3443974353424044, "learning_rate": 1.0869565217391305e-05, "loss": 0.7281, "step": 61 }, { "epoch": 0.06735469853340575, "grad_norm": 0.36325592594690226, "learning_rate": 1.1050724637681161e-05, "loss": 0.8967, "step": 62 }, { "epoch": 0.06844106463878327, "grad_norm": 0.317348233816886, "learning_rate": 1.1231884057971016e-05, "loss": 0.8222, "step": 63 }, { "epoch": 0.06952743074416078, "grad_norm": 0.4030429139872056, "learning_rate": 1.141304347826087e-05, "loss": 0.9917, "step": 64 }, { "epoch": 0.0706137968495383, "grad_norm": 1.2956515484086424, "learning_rate": 1.1594202898550725e-05, "loss": 0.7572, "step": 65 }, { "epoch": 0.07170016295491581, "grad_norm": 0.31637366163291664, "learning_rate": 1.177536231884058e-05, "loss": 0.8348, "step": 66 }, { "epoch": 0.07278652906029331, "grad_norm": 0.34793727220215726, "learning_rate": 1.1956521739130435e-05, "loss": 0.7888, "step": 67 }, { "epoch": 0.07387289516567083, "grad_norm": 0.34770698912868103, "learning_rate": 1.213768115942029e-05, "loss": 0.927, "step": 68 }, { "epoch": 0.07495926127104835, "grad_norm": 0.3382503528214814, "learning_rate": 1.2318840579710146e-05, "loss": 0.9062, "step": 69 }, { "epoch": 0.07604562737642585, "grad_norm": 0.32335230434753537, "learning_rate": 1.25e-05, "loss": 0.8274, "step": 70 }, { "epoch": 0.07713199348180337, "grad_norm": 0.3018528417853627, "learning_rate": 1.2681159420289857e-05, "loss": 0.729, "step": 71 }, { "epoch": 0.07821835958718087, "grad_norm": 0.2995557029114903, "learning_rate": 1.286231884057971e-05, "loss": 0.7333, "step": 72 }, { "epoch": 0.07930472569255839, "grad_norm": 0.3440499942715796, "learning_rate": 1.3043478260869566e-05, "loss": 0.7702, "step": 73 }, { "epoch": 0.08039109179793591, "grad_norm": 0.3271394945591713, "learning_rate": 1.3224637681159422e-05, "loss": 0.841, "step": 74 }, { "epoch": 0.08147745790331341, "grad_norm": 0.588182977567395, "learning_rate": 1.3405797101449276e-05, "loss": 0.7931, "step": 75 }, { "epoch": 0.08256382400869093, "grad_norm": 0.3169973904469514, "learning_rate": 1.3586956521739131e-05, "loss": 0.8236, "step": 76 }, { "epoch": 0.08365019011406843, "grad_norm": 0.37937597793832695, "learning_rate": 1.3768115942028985e-05, "loss": 0.8394, "step": 77 }, { "epoch": 0.08473655621944595, "grad_norm": 0.3101121341173568, "learning_rate": 1.3949275362318842e-05, "loss": 0.7428, "step": 78 }, { "epoch": 0.08582292232482347, "grad_norm": 0.3491524871940984, "learning_rate": 1.4130434782608694e-05, "loss": 0.8074, "step": 79 }, { "epoch": 0.08690928843020097, "grad_norm": 0.3406761907650529, "learning_rate": 1.431159420289855e-05, "loss": 0.8764, "step": 80 }, { "epoch": 0.08799565453557849, "grad_norm": 0.39980761914812296, "learning_rate": 1.4492753623188407e-05, "loss": 0.9204, "step": 81 }, { "epoch": 0.08908202064095601, "grad_norm": 0.3503163549597415, "learning_rate": 1.4673913043478263e-05, "loss": 0.8341, "step": 82 }, { "epoch": 0.09016838674633351, "grad_norm": 0.32060623340338407, "learning_rate": 1.4855072463768116e-05, "loss": 0.7766, "step": 83 }, { "epoch": 0.09125475285171103, "grad_norm": 0.2777330480401291, "learning_rate": 1.5036231884057972e-05, "loss": 0.6719, "step": 84 }, { "epoch": 0.09234111895708853, "grad_norm": 0.2945319827995314, "learning_rate": 1.5217391304347828e-05, "loss": 0.6928, "step": 85 }, { "epoch": 0.09342748506246605, "grad_norm": 0.31722782519937764, "learning_rate": 1.539855072463768e-05, "loss": 0.6845, "step": 86 }, { "epoch": 0.09451385116784357, "grad_norm": 0.3649623509247646, "learning_rate": 1.5579710144927537e-05, "loss": 0.8863, "step": 87 }, { "epoch": 0.09560021727322107, "grad_norm": 0.3032228289540756, "learning_rate": 1.5760869565217393e-05, "loss": 0.8432, "step": 88 }, { "epoch": 0.09668658337859859, "grad_norm": 0.32268314698774975, "learning_rate": 1.5942028985507246e-05, "loss": 0.7853, "step": 89 }, { "epoch": 0.0977729494839761, "grad_norm": 0.28495836007013153, "learning_rate": 1.6123188405797102e-05, "loss": 0.6235, "step": 90 }, { "epoch": 0.09885931558935361, "grad_norm": 0.33761880748666284, "learning_rate": 1.630434782608696e-05, "loss": 0.6729, "step": 91 }, { "epoch": 0.09994568169473113, "grad_norm": 0.36551845557112056, "learning_rate": 1.6485507246376815e-05, "loss": 0.7086, "step": 92 }, { "epoch": 0.10103204780010863, "grad_norm": 0.3405316367499358, "learning_rate": 1.6666666666666667e-05, "loss": 0.7835, "step": 93 }, { "epoch": 0.10211841390548615, "grad_norm": 0.3151046251738467, "learning_rate": 1.6847826086956524e-05, "loss": 0.7496, "step": 94 }, { "epoch": 0.10320478001086367, "grad_norm": 0.3589280541517969, "learning_rate": 1.702898550724638e-05, "loss": 0.7816, "step": 95 }, { "epoch": 0.10429114611624117, "grad_norm": 0.3171029980790089, "learning_rate": 1.7210144927536233e-05, "loss": 0.7579, "step": 96 }, { "epoch": 0.10537751222161869, "grad_norm": 0.324607331491716, "learning_rate": 1.739130434782609e-05, "loss": 0.7955, "step": 97 }, { "epoch": 0.10646387832699619, "grad_norm": 0.3045111608096009, "learning_rate": 1.757246376811594e-05, "loss": 0.7903, "step": 98 }, { "epoch": 0.10755024443237371, "grad_norm": 0.3390686560725376, "learning_rate": 1.7753623188405798e-05, "loss": 0.7545, "step": 99 }, { "epoch": 0.10863661053775123, "grad_norm": 0.37459982974182887, "learning_rate": 1.793478260869565e-05, "loss": 0.7228, "step": 100 }, { "epoch": 0.10972297664312873, "grad_norm": 0.30758592530241896, "learning_rate": 1.8115942028985507e-05, "loss": 0.7649, "step": 101 }, { "epoch": 0.11080934274850625, "grad_norm": 0.39063023747580966, "learning_rate": 1.8297101449275363e-05, "loss": 0.7315, "step": 102 }, { "epoch": 0.11189570885388377, "grad_norm": 0.3311469026300531, "learning_rate": 1.8478260869565216e-05, "loss": 0.7472, "step": 103 }, { "epoch": 0.11298207495926127, "grad_norm": 0.2774611029803798, "learning_rate": 1.8659420289855072e-05, "loss": 0.652, "step": 104 }, { "epoch": 0.11406844106463879, "grad_norm": 0.3161529829810922, "learning_rate": 1.8840579710144928e-05, "loss": 0.6786, "step": 105 }, { "epoch": 0.11515480717001629, "grad_norm": 2.2978066782145703, "learning_rate": 1.9021739130434784e-05, "loss": 0.7107, "step": 106 }, { "epoch": 0.11624117327539381, "grad_norm": 0.31982445109059515, "learning_rate": 1.9202898550724637e-05, "loss": 0.7302, "step": 107 }, { "epoch": 0.11732753938077133, "grad_norm": 0.48605214446455897, "learning_rate": 1.9384057971014493e-05, "loss": 0.7575, "step": 108 }, { "epoch": 0.11841390548614883, "grad_norm": 0.39913111673105317, "learning_rate": 1.956521739130435e-05, "loss": 0.7749, "step": 109 }, { "epoch": 0.11950027159152635, "grad_norm": 0.33719293759419927, "learning_rate": 1.9746376811594202e-05, "loss": 0.6234, "step": 110 }, { "epoch": 0.12058663769690385, "grad_norm": 0.327961865870223, "learning_rate": 1.992753623188406e-05, "loss": 0.704, "step": 111 }, { "epoch": 0.12167300380228137, "grad_norm": 0.3302344862300122, "learning_rate": 2.0108695652173915e-05, "loss": 0.7068, "step": 112 }, { "epoch": 0.12275936990765889, "grad_norm": 0.3361049253227002, "learning_rate": 2.028985507246377e-05, "loss": 0.7866, "step": 113 }, { "epoch": 0.12384573601303639, "grad_norm": 0.3903014269729262, "learning_rate": 2.0471014492753624e-05, "loss": 0.7261, "step": 114 }, { "epoch": 0.1249321021184139, "grad_norm": 0.31105194176159484, "learning_rate": 2.065217391304348e-05, "loss": 0.6544, "step": 115 }, { "epoch": 0.12601846822379142, "grad_norm": 0.3242229606443768, "learning_rate": 2.0833333333333336e-05, "loss": 0.7285, "step": 116 }, { "epoch": 0.12710483432916894, "grad_norm": 0.38888502590729007, "learning_rate": 2.101449275362319e-05, "loss": 0.7276, "step": 117 }, { "epoch": 0.12819120043454643, "grad_norm": 0.2929981239749271, "learning_rate": 2.1195652173913045e-05, "loss": 0.679, "step": 118 }, { "epoch": 0.12927756653992395, "grad_norm": 0.38133473770575677, "learning_rate": 2.13768115942029e-05, "loss": 0.654, "step": 119 }, { "epoch": 0.13036393264530147, "grad_norm": 0.3261443908547802, "learning_rate": 2.1557971014492757e-05, "loss": 0.5957, "step": 120 }, { "epoch": 0.13145029875067898, "grad_norm": 0.3199852381723966, "learning_rate": 2.173913043478261e-05, "loss": 0.7514, "step": 121 }, { "epoch": 0.1325366648560565, "grad_norm": 0.3463408156461838, "learning_rate": 2.1920289855072466e-05, "loss": 0.6947, "step": 122 }, { "epoch": 0.133623030961434, "grad_norm": 0.30331803915860234, "learning_rate": 2.2101449275362323e-05, "loss": 0.702, "step": 123 }, { "epoch": 0.1347093970668115, "grad_norm": 0.3236777869223172, "learning_rate": 2.2282608695652175e-05, "loss": 0.7627, "step": 124 }, { "epoch": 0.13579576317218903, "grad_norm": 0.3406326199995319, "learning_rate": 2.246376811594203e-05, "loss": 0.7228, "step": 125 }, { "epoch": 0.13688212927756654, "grad_norm": 0.31271294656006476, "learning_rate": 2.2644927536231884e-05, "loss": 0.7498, "step": 126 }, { "epoch": 0.13796849538294406, "grad_norm": 0.3544136046542549, "learning_rate": 2.282608695652174e-05, "loss": 0.7462, "step": 127 }, { "epoch": 0.13905486148832155, "grad_norm": 0.3624930037682448, "learning_rate": 2.3007246376811593e-05, "loss": 0.6693, "step": 128 }, { "epoch": 0.14014122759369907, "grad_norm": 0.3308306947097468, "learning_rate": 2.318840579710145e-05, "loss": 0.7459, "step": 129 }, { "epoch": 0.1412275936990766, "grad_norm": 0.40058243180571346, "learning_rate": 2.3369565217391306e-05, "loss": 0.7318, "step": 130 }, { "epoch": 0.1423139598044541, "grad_norm": 0.4049033469685285, "learning_rate": 2.355072463768116e-05, "loss": 0.5989, "step": 131 }, { "epoch": 0.14340032590983162, "grad_norm": 0.4196653307192952, "learning_rate": 2.3731884057971015e-05, "loss": 0.6452, "step": 132 }, { "epoch": 0.1444866920152091, "grad_norm": 0.3079131265114905, "learning_rate": 2.391304347826087e-05, "loss": 0.6703, "step": 133 }, { "epoch": 0.14557305812058663, "grad_norm": 0.3879330071773021, "learning_rate": 2.4094202898550724e-05, "loss": 0.6932, "step": 134 }, { "epoch": 0.14665942422596415, "grad_norm": 0.3166100038309626, "learning_rate": 2.427536231884058e-05, "loss": 0.5912, "step": 135 }, { "epoch": 0.14774579033134166, "grad_norm": 0.37269856641571536, "learning_rate": 2.4456521739130436e-05, "loss": 0.732, "step": 136 }, { "epoch": 0.14883215643671918, "grad_norm": 0.31947226182032123, "learning_rate": 2.4637681159420292e-05, "loss": 0.6404, "step": 137 }, { "epoch": 0.1499185225420967, "grad_norm": 0.3312201876731154, "learning_rate": 2.4818840579710145e-05, "loss": 0.6647, "step": 138 }, { "epoch": 0.1510048886474742, "grad_norm": 0.34221142157501777, "learning_rate": 2.5e-05, "loss": 0.6815, "step": 139 }, { "epoch": 0.1520912547528517, "grad_norm": 0.3503437415659079, "learning_rate": 2.5181159420289857e-05, "loss": 0.6483, "step": 140 }, { "epoch": 0.15317762085822922, "grad_norm": 0.2999318649426732, "learning_rate": 2.5362318840579714e-05, "loss": 0.5561, "step": 141 }, { "epoch": 0.15426398696360674, "grad_norm": 0.3660175159790616, "learning_rate": 2.554347826086957e-05, "loss": 0.7898, "step": 142 }, { "epoch": 0.15535035306898426, "grad_norm": 0.3493355769561248, "learning_rate": 2.572463768115942e-05, "loss": 0.6534, "step": 143 }, { "epoch": 0.15643671917436175, "grad_norm": 0.37085245352189844, "learning_rate": 2.5905797101449275e-05, "loss": 0.6633, "step": 144 }, { "epoch": 0.15752308527973927, "grad_norm": 0.35068140901151135, "learning_rate": 2.608695652173913e-05, "loss": 0.663, "step": 145 }, { "epoch": 0.15860945138511678, "grad_norm": 0.4010679148963105, "learning_rate": 2.6268115942028988e-05, "loss": 0.7033, "step": 146 }, { "epoch": 0.1596958174904943, "grad_norm": 0.35956740117801717, "learning_rate": 2.6449275362318844e-05, "loss": 0.5322, "step": 147 }, { "epoch": 0.16078218359587182, "grad_norm": 0.3427983240303555, "learning_rate": 2.66304347826087e-05, "loss": 0.6437, "step": 148 }, { "epoch": 0.1618685497012493, "grad_norm": 0.3912470571466729, "learning_rate": 2.6811594202898553e-05, "loss": 0.6049, "step": 149 }, { "epoch": 0.16295491580662683, "grad_norm": 0.3936912655206134, "learning_rate": 2.6992753623188406e-05, "loss": 0.679, "step": 150 }, { "epoch": 0.16404128191200434, "grad_norm": 0.30905108315885166, "learning_rate": 2.7173913043478262e-05, "loss": 0.6683, "step": 151 }, { "epoch": 0.16512764801738186, "grad_norm": 0.3970867161679149, "learning_rate": 2.7355072463768118e-05, "loss": 0.7464, "step": 152 }, { "epoch": 0.16621401412275938, "grad_norm": 0.5978611987524829, "learning_rate": 2.753623188405797e-05, "loss": 0.8076, "step": 153 }, { "epoch": 0.16730038022813687, "grad_norm": 0.3751667415126323, "learning_rate": 2.7717391304347827e-05, "loss": 0.6232, "step": 154 }, { "epoch": 0.1683867463335144, "grad_norm": 0.31936567157999435, "learning_rate": 2.7898550724637683e-05, "loss": 0.6378, "step": 155 }, { "epoch": 0.1694731124388919, "grad_norm": 0.3884902722682578, "learning_rate": 2.807971014492754e-05, "loss": 0.5935, "step": 156 }, { "epoch": 0.17055947854426942, "grad_norm": 0.37944230471427176, "learning_rate": 2.826086956521739e-05, "loss": 0.7708, "step": 157 }, { "epoch": 0.17164584464964694, "grad_norm": 0.3936112796333657, "learning_rate": 2.8442028985507245e-05, "loss": 0.6791, "step": 158 }, { "epoch": 0.17273221075502446, "grad_norm": 0.4134004803118426, "learning_rate": 2.86231884057971e-05, "loss": 0.7304, "step": 159 }, { "epoch": 0.17381857686040195, "grad_norm": 0.39518223514982026, "learning_rate": 2.8804347826086957e-05, "loss": 0.6607, "step": 160 }, { "epoch": 0.17490494296577946, "grad_norm": 0.41617574784171485, "learning_rate": 2.8985507246376814e-05, "loss": 0.645, "step": 161 }, { "epoch": 0.17599130907115698, "grad_norm": 0.35109864157463505, "learning_rate": 2.916666666666667e-05, "loss": 0.6659, "step": 162 }, { "epoch": 0.1770776751765345, "grad_norm": 0.47995349689586203, "learning_rate": 2.9347826086956526e-05, "loss": 0.6961, "step": 163 }, { "epoch": 0.17816404128191202, "grad_norm": 0.3972611725023033, "learning_rate": 2.9528985507246375e-05, "loss": 0.6843, "step": 164 }, { "epoch": 0.1792504073872895, "grad_norm": 0.413568828059455, "learning_rate": 2.971014492753623e-05, "loss": 0.6295, "step": 165 }, { "epoch": 0.18033677349266702, "grad_norm": 0.3283858891915756, "learning_rate": 2.9891304347826088e-05, "loss": 0.6147, "step": 166 }, { "epoch": 0.18142313959804454, "grad_norm": 0.3857771015472112, "learning_rate": 3.0072463768115944e-05, "loss": 0.6899, "step": 167 }, { "epoch": 0.18250950570342206, "grad_norm": 0.3528254026635013, "learning_rate": 3.02536231884058e-05, "loss": 0.643, "step": 168 }, { "epoch": 0.18359587180879958, "grad_norm": 0.3859796992563831, "learning_rate": 3.0434782608695656e-05, "loss": 0.7077, "step": 169 }, { "epoch": 0.18468223791417707, "grad_norm": 0.33023238200514216, "learning_rate": 3.061594202898551e-05, "loss": 0.6102, "step": 170 }, { "epoch": 0.18576860401955458, "grad_norm": 0.4007072696623166, "learning_rate": 3.079710144927536e-05, "loss": 0.6065, "step": 171 }, { "epoch": 0.1868549701249321, "grad_norm": 0.3732474007713326, "learning_rate": 3.0978260869565215e-05, "loss": 0.6893, "step": 172 }, { "epoch": 0.18794133623030962, "grad_norm": 0.3684614005536571, "learning_rate": 3.1159420289855074e-05, "loss": 0.7654, "step": 173 }, { "epoch": 0.18902770233568714, "grad_norm": 0.3827899786773753, "learning_rate": 3.134057971014493e-05, "loss": 0.6234, "step": 174 }, { "epoch": 0.19011406844106463, "grad_norm": 0.44308755581238835, "learning_rate": 3.152173913043479e-05, "loss": 0.5625, "step": 175 }, { "epoch": 0.19120043454644214, "grad_norm": 0.3021687124502996, "learning_rate": 3.170289855072464e-05, "loss": 0.578, "step": 176 }, { "epoch": 0.19228680065181966, "grad_norm": 0.38081504872059607, "learning_rate": 3.188405797101449e-05, "loss": 0.7315, "step": 177 }, { "epoch": 0.19337316675719718, "grad_norm": 0.31960037620950316, "learning_rate": 3.2065217391304345e-05, "loss": 0.6919, "step": 178 }, { "epoch": 0.1944595328625747, "grad_norm": 0.3240566162270478, "learning_rate": 3.2246376811594205e-05, "loss": 0.6566, "step": 179 }, { "epoch": 0.1955458989679522, "grad_norm": 0.37684107271042017, "learning_rate": 3.242753623188406e-05, "loss": 0.6849, "step": 180 }, { "epoch": 0.1966322650733297, "grad_norm": 0.3606327144312569, "learning_rate": 3.260869565217392e-05, "loss": 0.7028, "step": 181 }, { "epoch": 0.19771863117870722, "grad_norm": 0.3379612298036947, "learning_rate": 3.278985507246377e-05, "loss": 0.6765, "step": 182 }, { "epoch": 0.19880499728408474, "grad_norm": 0.3715162187753512, "learning_rate": 3.297101449275363e-05, "loss": 0.6705, "step": 183 }, { "epoch": 0.19989136338946226, "grad_norm": 0.3416069288579867, "learning_rate": 3.3152173913043475e-05, "loss": 0.5462, "step": 184 }, { "epoch": 0.20097772949483977, "grad_norm": 0.35929998122430595, "learning_rate": 3.3333333333333335e-05, "loss": 0.6799, "step": 185 }, { "epoch": 0.20206409560021726, "grad_norm": 0.3646513724585104, "learning_rate": 3.351449275362319e-05, "loss": 0.555, "step": 186 }, { "epoch": 0.20315046170559478, "grad_norm": 0.34156943309279547, "learning_rate": 3.369565217391305e-05, "loss": 0.6862, "step": 187 }, { "epoch": 0.2042368278109723, "grad_norm": 0.36021154947787176, "learning_rate": 3.38768115942029e-05, "loss": 0.5927, "step": 188 }, { "epoch": 0.20532319391634982, "grad_norm": 0.33736444547194183, "learning_rate": 3.405797101449276e-05, "loss": 0.6357, "step": 189 }, { "epoch": 0.20640956002172733, "grad_norm": 0.3347793137961156, "learning_rate": 3.423913043478261e-05, "loss": 0.7018, "step": 190 }, { "epoch": 0.20749592612710482, "grad_norm": 0.33321501904490336, "learning_rate": 3.4420289855072465e-05, "loss": 0.5765, "step": 191 }, { "epoch": 0.20858229223248234, "grad_norm": 0.2985279214232776, "learning_rate": 3.460144927536232e-05, "loss": 0.5984, "step": 192 }, { "epoch": 0.20966865833785986, "grad_norm": 0.4393287226504694, "learning_rate": 3.478260869565218e-05, "loss": 0.6367, "step": 193 }, { "epoch": 0.21075502444323738, "grad_norm": 0.3976309357217895, "learning_rate": 3.496376811594203e-05, "loss": 0.674, "step": 194 }, { "epoch": 0.2118413905486149, "grad_norm": 0.3395805387632937, "learning_rate": 3.514492753623188e-05, "loss": 0.7247, "step": 195 }, { "epoch": 0.21292775665399238, "grad_norm": 0.3410895246039724, "learning_rate": 3.532608695652174e-05, "loss": 0.6201, "step": 196 }, { "epoch": 0.2140141227593699, "grad_norm": 0.4236852595545962, "learning_rate": 3.5507246376811596e-05, "loss": 0.6262, "step": 197 }, { "epoch": 0.21510048886474742, "grad_norm": 0.3319685047264388, "learning_rate": 3.568840579710145e-05, "loss": 0.6806, "step": 198 }, { "epoch": 0.21618685497012494, "grad_norm": 0.39776751755026146, "learning_rate": 3.58695652173913e-05, "loss": 0.629, "step": 199 }, { "epoch": 0.21727322107550245, "grad_norm": 0.4425674787934631, "learning_rate": 3.605072463768116e-05, "loss": 0.6415, "step": 200 }, { "epoch": 0.21835958718087994, "grad_norm": 0.3902632618556099, "learning_rate": 3.6231884057971014e-05, "loss": 0.6017, "step": 201 }, { "epoch": 0.21944595328625746, "grad_norm": 0.31252665069367547, "learning_rate": 3.641304347826087e-05, "loss": 0.593, "step": 202 }, { "epoch": 0.22053231939163498, "grad_norm": 0.37950687752452195, "learning_rate": 3.6594202898550726e-05, "loss": 0.665, "step": 203 }, { "epoch": 0.2216186854970125, "grad_norm": 0.41790640164319537, "learning_rate": 3.6775362318840586e-05, "loss": 0.7504, "step": 204 }, { "epoch": 0.22270505160239001, "grad_norm": 0.3934296220203009, "learning_rate": 3.695652173913043e-05, "loss": 0.7212, "step": 205 }, { "epoch": 0.22379141770776753, "grad_norm": 0.34836207270953384, "learning_rate": 3.713768115942029e-05, "loss": 0.5961, "step": 206 }, { "epoch": 0.22487778381314502, "grad_norm": 0.3465484436021716, "learning_rate": 3.7318840579710144e-05, "loss": 0.5953, "step": 207 }, { "epoch": 0.22596414991852254, "grad_norm": 0.4174788414604287, "learning_rate": 3.7500000000000003e-05, "loss": 0.6579, "step": 208 }, { "epoch": 0.22705051602390006, "grad_norm": 0.3452833272046794, "learning_rate": 3.7681159420289856e-05, "loss": 0.5755, "step": 209 }, { "epoch": 0.22813688212927757, "grad_norm": 0.362068326295876, "learning_rate": 3.7862318840579716e-05, "loss": 0.6234, "step": 210 }, { "epoch": 0.2292232482346551, "grad_norm": 0.40679228134760154, "learning_rate": 3.804347826086957e-05, "loss": 0.6763, "step": 211 }, { "epoch": 0.23030961434003258, "grad_norm": 0.5913714039916962, "learning_rate": 3.822463768115942e-05, "loss": 0.6732, "step": 212 }, { "epoch": 0.2313959804454101, "grad_norm": 0.3995785148498594, "learning_rate": 3.8405797101449274e-05, "loss": 0.6433, "step": 213 }, { "epoch": 0.23248234655078762, "grad_norm": 0.36605578312178133, "learning_rate": 3.8586956521739134e-05, "loss": 0.6126, "step": 214 }, { "epoch": 0.23356871265616513, "grad_norm": 0.42326961520317585, "learning_rate": 3.876811594202899e-05, "loss": 0.6457, "step": 215 }, { "epoch": 0.23465507876154265, "grad_norm": 0.31485350161070985, "learning_rate": 3.8949275362318846e-05, "loss": 0.5586, "step": 216 }, { "epoch": 0.23574144486692014, "grad_norm": 0.4442633600893104, "learning_rate": 3.91304347826087e-05, "loss": 0.6518, "step": 217 }, { "epoch": 0.23682781097229766, "grad_norm": 0.3109433023369161, "learning_rate": 3.931159420289855e-05, "loss": 0.5855, "step": 218 }, { "epoch": 0.23791417707767518, "grad_norm": 0.38569729043968454, "learning_rate": 3.9492753623188405e-05, "loss": 0.6142, "step": 219 }, { "epoch": 0.2390005431830527, "grad_norm": 0.38922464375361626, "learning_rate": 3.9673913043478264e-05, "loss": 0.5639, "step": 220 }, { "epoch": 0.2400869092884302, "grad_norm": 0.43484479750371985, "learning_rate": 3.985507246376812e-05, "loss": 0.6244, "step": 221 }, { "epoch": 0.2411732753938077, "grad_norm": 0.4530248144056785, "learning_rate": 4.003623188405797e-05, "loss": 0.627, "step": 222 }, { "epoch": 0.24225964149918522, "grad_norm": 0.4076332968728458, "learning_rate": 4.021739130434783e-05, "loss": 0.7224, "step": 223 }, { "epoch": 0.24334600760456274, "grad_norm": 0.5000314189743986, "learning_rate": 4.039855072463768e-05, "loss": 0.6852, "step": 224 }, { "epoch": 0.24443237370994025, "grad_norm": 0.34320362503846413, "learning_rate": 4.057971014492754e-05, "loss": 0.6058, "step": 225 }, { "epoch": 0.24551873981531777, "grad_norm": 0.47219376671830815, "learning_rate": 4.076086956521739e-05, "loss": 0.6674, "step": 226 }, { "epoch": 0.24660510592069526, "grad_norm": 0.3587452252464959, "learning_rate": 4.094202898550725e-05, "loss": 0.638, "step": 227 }, { "epoch": 0.24769147202607278, "grad_norm": 0.4613785827488836, "learning_rate": 4.11231884057971e-05, "loss": 0.6972, "step": 228 }, { "epoch": 0.2487778381314503, "grad_norm": 0.329991625761809, "learning_rate": 4.130434782608696e-05, "loss": 0.5529, "step": 229 }, { "epoch": 0.2498642042368278, "grad_norm": 0.3741029607892106, "learning_rate": 4.148550724637681e-05, "loss": 0.6011, "step": 230 }, { "epoch": 0.2509505703422053, "grad_norm": 0.4306240446824924, "learning_rate": 4.166666666666667e-05, "loss": 0.5325, "step": 231 }, { "epoch": 0.25203693644758285, "grad_norm": 0.367506786848363, "learning_rate": 4.1847826086956525e-05, "loss": 0.6824, "step": 232 }, { "epoch": 0.25312330255296034, "grad_norm": 0.47372227080571855, "learning_rate": 4.202898550724638e-05, "loss": 0.6374, "step": 233 }, { "epoch": 0.2542096686583379, "grad_norm": 0.5156921440414878, "learning_rate": 4.221014492753623e-05, "loss": 0.6427, "step": 234 }, { "epoch": 0.2552960347637154, "grad_norm": 0.45395768532559216, "learning_rate": 4.239130434782609e-05, "loss": 0.6612, "step": 235 }, { "epoch": 0.25638240086909286, "grad_norm": 0.4069535604489809, "learning_rate": 4.257246376811594e-05, "loss": 0.6174, "step": 236 }, { "epoch": 0.2574687669744704, "grad_norm": 0.344812449713345, "learning_rate": 4.27536231884058e-05, "loss": 0.6664, "step": 237 }, { "epoch": 0.2585551330798479, "grad_norm": 0.36495481854090916, "learning_rate": 4.2934782608695655e-05, "loss": 0.6023, "step": 238 }, { "epoch": 0.25964149918522544, "grad_norm": 0.30915257208219793, "learning_rate": 4.3115942028985515e-05, "loss": 0.469, "step": 239 }, { "epoch": 0.26072786529060293, "grad_norm": 0.3715021994062854, "learning_rate": 4.329710144927536e-05, "loss": 0.6198, "step": 240 }, { "epoch": 0.2618142313959804, "grad_norm": 0.35485980128558436, "learning_rate": 4.347826086956522e-05, "loss": 0.5899, "step": 241 }, { "epoch": 0.26290059750135797, "grad_norm": 0.37357085602982554, "learning_rate": 4.365942028985507e-05, "loss": 0.5866, "step": 242 }, { "epoch": 0.26398696360673546, "grad_norm": 0.43525843257231617, "learning_rate": 4.384057971014493e-05, "loss": 0.6105, "step": 243 }, { "epoch": 0.265073329712113, "grad_norm": 0.3567030087703949, "learning_rate": 4.4021739130434786e-05, "loss": 0.6394, "step": 244 }, { "epoch": 0.2661596958174905, "grad_norm": 0.38669710562942966, "learning_rate": 4.4202898550724645e-05, "loss": 0.6398, "step": 245 }, { "epoch": 0.267246061922868, "grad_norm": 0.4016485598358191, "learning_rate": 4.438405797101449e-05, "loss": 0.573, "step": 246 }, { "epoch": 0.26833242802824553, "grad_norm": 0.3994769171874989, "learning_rate": 4.456521739130435e-05, "loss": 0.6472, "step": 247 }, { "epoch": 0.269418794133623, "grad_norm": 0.3930082965173683, "learning_rate": 4.4746376811594203e-05, "loss": 0.6015, "step": 248 }, { "epoch": 0.27050516023900056, "grad_norm": 0.42427991032264106, "learning_rate": 4.492753623188406e-05, "loss": 0.5874, "step": 249 }, { "epoch": 0.27159152634437805, "grad_norm": 0.4556028921701948, "learning_rate": 4.5108695652173916e-05, "loss": 0.5721, "step": 250 }, { "epoch": 0.27267789244975554, "grad_norm": 0.31890617888812983, "learning_rate": 4.528985507246377e-05, "loss": 0.6188, "step": 251 }, { "epoch": 0.2737642585551331, "grad_norm": 0.5053993449529575, "learning_rate": 4.547101449275363e-05, "loss": 0.5797, "step": 252 }, { "epoch": 0.2748506246605106, "grad_norm": 0.42298251228676076, "learning_rate": 4.565217391304348e-05, "loss": 0.5552, "step": 253 }, { "epoch": 0.2759369907658881, "grad_norm": 0.4903420522876282, "learning_rate": 4.5833333333333334e-05, "loss": 0.6542, "step": 254 }, { "epoch": 0.2770233568712656, "grad_norm": 0.5125953322446878, "learning_rate": 4.601449275362319e-05, "loss": 0.5396, "step": 255 }, { "epoch": 0.2781097229766431, "grad_norm": 0.45310178344575475, "learning_rate": 4.6195652173913046e-05, "loss": 0.6162, "step": 256 }, { "epoch": 0.27919608908202065, "grad_norm": 0.4487708826560388, "learning_rate": 4.63768115942029e-05, "loss": 0.6502, "step": 257 }, { "epoch": 0.28028245518739814, "grad_norm": 0.4900859768124775, "learning_rate": 4.655797101449276e-05, "loss": 0.6008, "step": 258 }, { "epoch": 0.2813688212927757, "grad_norm": 0.3800038644969487, "learning_rate": 4.673913043478261e-05, "loss": 0.6958, "step": 259 }, { "epoch": 0.2824551873981532, "grad_norm": 0.5415490021182547, "learning_rate": 4.6920289855072464e-05, "loss": 0.593, "step": 260 }, { "epoch": 0.28354155350353066, "grad_norm": 0.3982427272311023, "learning_rate": 4.710144927536232e-05, "loss": 0.5966, "step": 261 }, { "epoch": 0.2846279196089082, "grad_norm": 0.5401528154973722, "learning_rate": 4.7282608695652177e-05, "loss": 0.619, "step": 262 }, { "epoch": 0.2857142857142857, "grad_norm": 0.5154413167754509, "learning_rate": 4.746376811594203e-05, "loss": 0.6592, "step": 263 }, { "epoch": 0.28680065181966324, "grad_norm": 0.4063213538113251, "learning_rate": 4.764492753623189e-05, "loss": 0.6317, "step": 264 }, { "epoch": 0.28788701792504073, "grad_norm": 0.6750975005739185, "learning_rate": 4.782608695652174e-05, "loss": 0.6438, "step": 265 }, { "epoch": 0.2889733840304182, "grad_norm": 0.4839237278771349, "learning_rate": 4.80072463768116e-05, "loss": 0.6567, "step": 266 }, { "epoch": 0.29005975013579577, "grad_norm": 0.37912659029120716, "learning_rate": 4.818840579710145e-05, "loss": 0.6049, "step": 267 }, { "epoch": 0.29114611624117326, "grad_norm": 0.5206265858409235, "learning_rate": 4.836956521739131e-05, "loss": 0.6277, "step": 268 }, { "epoch": 0.2922324823465508, "grad_norm": 0.39656501761716834, "learning_rate": 4.855072463768116e-05, "loss": 0.6818, "step": 269 }, { "epoch": 0.2933188484519283, "grad_norm": 0.3801854939712858, "learning_rate": 4.873188405797102e-05, "loss": 0.6587, "step": 270 }, { "epoch": 0.29440521455730584, "grad_norm": 0.3842074438497603, "learning_rate": 4.891304347826087e-05, "loss": 0.6342, "step": 271 }, { "epoch": 0.29549158066268333, "grad_norm": 0.369716441615996, "learning_rate": 4.909420289855073e-05, "loss": 0.6217, "step": 272 }, { "epoch": 0.2965779467680608, "grad_norm": 0.38531235268603364, "learning_rate": 4.9275362318840584e-05, "loss": 0.5868, "step": 273 }, { "epoch": 0.29766431287343836, "grad_norm": 0.43763126695088933, "learning_rate": 4.945652173913044e-05, "loss": 0.6285, "step": 274 }, { "epoch": 0.29875067897881585, "grad_norm": 0.3846139503382674, "learning_rate": 4.963768115942029e-05, "loss": 0.633, "step": 275 }, { "epoch": 0.2998370450841934, "grad_norm": 0.37756144400597547, "learning_rate": 4.981884057971015e-05, "loss": 0.6626, "step": 276 }, { "epoch": 0.3009234111895709, "grad_norm": 0.40371266565860925, "learning_rate": 5e-05, "loss": 0.6074, "step": 277 }, { "epoch": 0.3020097772949484, "grad_norm": 0.32114621663183585, "learning_rate": 4.997987117552335e-05, "loss": 0.5938, "step": 278 }, { "epoch": 0.3030961434003259, "grad_norm": 0.3458181623717006, "learning_rate": 4.99597423510467e-05, "loss": 0.6156, "step": 279 }, { "epoch": 0.3041825095057034, "grad_norm": 0.3476368008444952, "learning_rate": 4.9939613526570054e-05, "loss": 0.6102, "step": 280 }, { "epoch": 0.30526887561108096, "grad_norm": 0.33561993117861, "learning_rate": 4.99194847020934e-05, "loss": 0.6144, "step": 281 }, { "epoch": 0.30635524171645845, "grad_norm": 0.365635808209832, "learning_rate": 4.989935587761675e-05, "loss": 0.6704, "step": 282 }, { "epoch": 0.30744160782183594, "grad_norm": 0.9319024330202934, "learning_rate": 4.98792270531401e-05, "loss": 0.5774, "step": 283 }, { "epoch": 0.3085279739272135, "grad_norm": 0.4827298957200816, "learning_rate": 4.985909822866345e-05, "loss": 0.582, "step": 284 }, { "epoch": 0.309614340032591, "grad_norm": 0.4494417777315935, "learning_rate": 4.98389694041868e-05, "loss": 0.6873, "step": 285 }, { "epoch": 0.3107007061379685, "grad_norm": 0.328089095172115, "learning_rate": 4.981884057971015e-05, "loss": 0.5767, "step": 286 }, { "epoch": 0.311787072243346, "grad_norm": 0.49905370247791747, "learning_rate": 4.979871175523349e-05, "loss": 0.6716, "step": 287 }, { "epoch": 0.3128734383487235, "grad_norm": 0.362639625003065, "learning_rate": 4.9778582930756844e-05, "loss": 0.5449, "step": 288 }, { "epoch": 0.31395980445410104, "grad_norm": 0.3997785309672156, "learning_rate": 4.9758454106280194e-05, "loss": 0.567, "step": 289 }, { "epoch": 0.31504617055947853, "grad_norm": 0.4290083535794823, "learning_rate": 4.9738325281803545e-05, "loss": 0.5908, "step": 290 }, { "epoch": 0.3161325366648561, "grad_norm": 0.3343558099749465, "learning_rate": 4.9718196457326895e-05, "loss": 0.583, "step": 291 }, { "epoch": 0.31721890277023357, "grad_norm": 0.5192605304108577, "learning_rate": 4.9698067632850245e-05, "loss": 0.5764, "step": 292 }, { "epoch": 0.31830526887561106, "grad_norm": 0.3192570636090976, "learning_rate": 4.967793880837359e-05, "loss": 0.5839, "step": 293 }, { "epoch": 0.3193916349809886, "grad_norm": 0.4462005614754709, "learning_rate": 4.965780998389694e-05, "loss": 0.5667, "step": 294 }, { "epoch": 0.3204780010863661, "grad_norm": 0.3541672162214765, "learning_rate": 4.963768115942029e-05, "loss": 0.5752, "step": 295 }, { "epoch": 0.32156436719174364, "grad_norm": 0.39583173875572886, "learning_rate": 4.961755233494364e-05, "loss": 0.5916, "step": 296 }, { "epoch": 0.32265073329712113, "grad_norm": 0.34758321682146476, "learning_rate": 4.959742351046699e-05, "loss": 0.6263, "step": 297 }, { "epoch": 0.3237370994024986, "grad_norm": 0.31063123811297383, "learning_rate": 4.957729468599034e-05, "loss": 0.6546, "step": 298 }, { "epoch": 0.32482346550787616, "grad_norm": 0.3883953881635285, "learning_rate": 4.9557165861513685e-05, "loss": 0.5426, "step": 299 }, { "epoch": 0.32590983161325365, "grad_norm": 0.31263278329466, "learning_rate": 4.9537037037037035e-05, "loss": 0.6109, "step": 300 }, { "epoch": 0.3269961977186312, "grad_norm": 0.38985885363064005, "learning_rate": 4.9516908212560386e-05, "loss": 0.6025, "step": 301 }, { "epoch": 0.3280825638240087, "grad_norm": 0.3191894942994916, "learning_rate": 4.9496779388083736e-05, "loss": 0.6033, "step": 302 }, { "epoch": 0.3291689299293862, "grad_norm": 0.3368107721666587, "learning_rate": 4.947665056360709e-05, "loss": 0.4455, "step": 303 }, { "epoch": 0.3302552960347637, "grad_norm": 0.3849763384042673, "learning_rate": 4.945652173913044e-05, "loss": 0.5659, "step": 304 }, { "epoch": 0.3313416621401412, "grad_norm": 0.36683094636421276, "learning_rate": 4.943639291465378e-05, "loss": 0.6048, "step": 305 }, { "epoch": 0.33242802824551876, "grad_norm": 0.3714935735333472, "learning_rate": 4.941626409017713e-05, "loss": 0.6025, "step": 306 }, { "epoch": 0.33351439435089625, "grad_norm": 0.38816620562069254, "learning_rate": 4.939613526570048e-05, "loss": 0.6002, "step": 307 }, { "epoch": 0.33460076045627374, "grad_norm": 0.32552606544552226, "learning_rate": 4.937600644122383e-05, "loss": 0.5381, "step": 308 }, { "epoch": 0.3356871265616513, "grad_norm": 0.3631438765188129, "learning_rate": 4.935587761674719e-05, "loss": 0.6228, "step": 309 }, { "epoch": 0.3367734926670288, "grad_norm": 0.38865514545322416, "learning_rate": 4.933574879227053e-05, "loss": 0.56, "step": 310 }, { "epoch": 0.3378598587724063, "grad_norm": 0.2945498596109938, "learning_rate": 4.9315619967793884e-05, "loss": 0.5413, "step": 311 }, { "epoch": 0.3389462248777838, "grad_norm": 0.35670185898857193, "learning_rate": 4.9295491143317234e-05, "loss": 0.5514, "step": 312 }, { "epoch": 0.3400325909831613, "grad_norm": 0.3739222339447141, "learning_rate": 4.9275362318840584e-05, "loss": 0.6549, "step": 313 }, { "epoch": 0.34111895708853884, "grad_norm": 0.3439535357755469, "learning_rate": 4.9255233494363935e-05, "loss": 0.6364, "step": 314 }, { "epoch": 0.34220532319391633, "grad_norm": 0.352745580414187, "learning_rate": 4.9235104669887285e-05, "loss": 0.6083, "step": 315 }, { "epoch": 0.3432916892992939, "grad_norm": 0.38830265954465293, "learning_rate": 4.9214975845410636e-05, "loss": 0.6728, "step": 316 }, { "epoch": 0.34437805540467137, "grad_norm": 0.3459082342273997, "learning_rate": 4.919484702093398e-05, "loss": 0.6026, "step": 317 }, { "epoch": 0.3454644215100489, "grad_norm": 0.35359107437865145, "learning_rate": 4.917471819645733e-05, "loss": 0.5309, "step": 318 }, { "epoch": 0.3465507876154264, "grad_norm": 0.3427531573154007, "learning_rate": 4.915458937198068e-05, "loss": 0.581, "step": 319 }, { "epoch": 0.3476371537208039, "grad_norm": 0.3313277244505761, "learning_rate": 4.913446054750403e-05, "loss": 0.5841, "step": 320 }, { "epoch": 0.34872351982618144, "grad_norm": 0.39736199704082015, "learning_rate": 4.911433172302738e-05, "loss": 0.6124, "step": 321 }, { "epoch": 0.34980988593155893, "grad_norm": 0.3865933295807601, "learning_rate": 4.909420289855073e-05, "loss": 0.653, "step": 322 }, { "epoch": 0.3508962520369365, "grad_norm": 0.43086888798238226, "learning_rate": 4.9074074074074075e-05, "loss": 0.6435, "step": 323 }, { "epoch": 0.35198261814231396, "grad_norm": 0.38800754666667786, "learning_rate": 4.9053945249597426e-05, "loss": 0.5361, "step": 324 }, { "epoch": 0.35306898424769145, "grad_norm": 0.3660737159133813, "learning_rate": 4.9033816425120776e-05, "loss": 0.5684, "step": 325 }, { "epoch": 0.354155350353069, "grad_norm": 0.4204315194298004, "learning_rate": 4.901368760064413e-05, "loss": 0.6404, "step": 326 }, { "epoch": 0.3552417164584465, "grad_norm": 0.3170143607884224, "learning_rate": 4.899355877616748e-05, "loss": 0.6461, "step": 327 }, { "epoch": 0.35632808256382403, "grad_norm": 0.405516428025657, "learning_rate": 4.897342995169083e-05, "loss": 0.6566, "step": 328 }, { "epoch": 0.3574144486692015, "grad_norm": 0.34497300446535245, "learning_rate": 4.895330112721417e-05, "loss": 0.6325, "step": 329 }, { "epoch": 0.358500814774579, "grad_norm": 0.3310799633100821, "learning_rate": 4.893317230273752e-05, "loss": 0.6182, "step": 330 }, { "epoch": 0.35958718087995656, "grad_norm": 0.34962061106840475, "learning_rate": 4.891304347826087e-05, "loss": 0.5789, "step": 331 }, { "epoch": 0.36067354698533405, "grad_norm": 0.3436909599055896, "learning_rate": 4.889291465378422e-05, "loss": 0.6079, "step": 332 }, { "epoch": 0.3617599130907116, "grad_norm": 0.31423849242371116, "learning_rate": 4.887278582930757e-05, "loss": 0.617, "step": 333 }, { "epoch": 0.3628462791960891, "grad_norm": 0.32989006906558915, "learning_rate": 4.885265700483092e-05, "loss": 0.6286, "step": 334 }, { "epoch": 0.3639326453014666, "grad_norm": 0.2981938562505759, "learning_rate": 4.883252818035427e-05, "loss": 0.5523, "step": 335 }, { "epoch": 0.3650190114068441, "grad_norm": 0.2938899504012302, "learning_rate": 4.881239935587762e-05, "loss": 0.5559, "step": 336 }, { "epoch": 0.3661053775122216, "grad_norm": 0.306390335055949, "learning_rate": 4.879227053140097e-05, "loss": 0.5279, "step": 337 }, { "epoch": 0.36719174361759915, "grad_norm": 0.29770840644817853, "learning_rate": 4.877214170692432e-05, "loss": 0.5445, "step": 338 }, { "epoch": 0.36827810972297664, "grad_norm": 0.29063381897144974, "learning_rate": 4.875201288244767e-05, "loss": 0.5412, "step": 339 }, { "epoch": 0.36936447582835413, "grad_norm": 0.3662908450948418, "learning_rate": 4.873188405797102e-05, "loss": 0.5979, "step": 340 }, { "epoch": 0.3704508419337317, "grad_norm": 0.33264242580715425, "learning_rate": 4.871175523349436e-05, "loss": 0.5707, "step": 341 }, { "epoch": 0.37153720803910917, "grad_norm": 0.2887266102901276, "learning_rate": 4.869162640901771e-05, "loss": 0.5862, "step": 342 }, { "epoch": 0.3726235741444867, "grad_norm": 0.3811526902501501, "learning_rate": 4.8671497584541064e-05, "loss": 0.6734, "step": 343 }, { "epoch": 0.3737099402498642, "grad_norm": 0.3496973956465857, "learning_rate": 4.8651368760064414e-05, "loss": 0.6364, "step": 344 }, { "epoch": 0.3747963063552417, "grad_norm": 0.351321411857546, "learning_rate": 4.8631239935587765e-05, "loss": 0.5734, "step": 345 }, { "epoch": 0.37588267246061924, "grad_norm": 0.292271941619597, "learning_rate": 4.8611111111111115e-05, "loss": 0.5829, "step": 346 }, { "epoch": 0.37696903856599673, "grad_norm": 0.33761900413988344, "learning_rate": 4.859098228663446e-05, "loss": 0.5667, "step": 347 }, { "epoch": 0.3780554046713743, "grad_norm": 0.30480274436567273, "learning_rate": 4.857085346215781e-05, "loss": 0.6091, "step": 348 }, { "epoch": 0.37914177077675176, "grad_norm": 0.3449760413731048, "learning_rate": 4.855072463768116e-05, "loss": 0.5847, "step": 349 }, { "epoch": 0.38022813688212925, "grad_norm": 0.3224194522885285, "learning_rate": 4.853059581320451e-05, "loss": 0.534, "step": 350 }, { "epoch": 0.3813145029875068, "grad_norm": 0.4063330131193779, "learning_rate": 4.851046698872786e-05, "loss": 0.6307, "step": 351 }, { "epoch": 0.3824008690928843, "grad_norm": 0.38355335334847085, "learning_rate": 4.849033816425121e-05, "loss": 0.5599, "step": 352 }, { "epoch": 0.38348723519826183, "grad_norm": 0.36506468973896744, "learning_rate": 4.847020933977456e-05, "loss": 0.5872, "step": 353 }, { "epoch": 0.3845736013036393, "grad_norm": 0.3803701016182233, "learning_rate": 4.8450080515297905e-05, "loss": 0.5948, "step": 354 }, { "epoch": 0.3856599674090168, "grad_norm": 0.2813028230105774, "learning_rate": 4.8429951690821256e-05, "loss": 0.5189, "step": 355 }, { "epoch": 0.38674633351439436, "grad_norm": 0.36844120846851103, "learning_rate": 4.8409822866344606e-05, "loss": 0.7032, "step": 356 }, { "epoch": 0.38783269961977185, "grad_norm": 0.31506006431063643, "learning_rate": 4.8389694041867956e-05, "loss": 0.6779, "step": 357 }, { "epoch": 0.3889190657251494, "grad_norm": 0.273517743058034, "learning_rate": 4.836956521739131e-05, "loss": 0.5391, "step": 358 }, { "epoch": 0.3900054318305269, "grad_norm": 0.29652962043541536, "learning_rate": 4.834943639291466e-05, "loss": 0.6143, "step": 359 }, { "epoch": 0.3910917979359044, "grad_norm": 0.373764000931361, "learning_rate": 4.8329307568438e-05, "loss": 0.5608, "step": 360 }, { "epoch": 0.3921781640412819, "grad_norm": 0.32348571239823776, "learning_rate": 4.830917874396135e-05, "loss": 0.5999, "step": 361 }, { "epoch": 0.3932645301466594, "grad_norm": 0.3389935671156297, "learning_rate": 4.82890499194847e-05, "loss": 0.6179, "step": 362 }, { "epoch": 0.39435089625203695, "grad_norm": 0.33090309895557846, "learning_rate": 4.826892109500805e-05, "loss": 0.5677, "step": 363 }, { "epoch": 0.39543726235741444, "grad_norm": 0.3876842507208104, "learning_rate": 4.82487922705314e-05, "loss": 0.6422, "step": 364 }, { "epoch": 0.396523628462792, "grad_norm": 0.358037345170968, "learning_rate": 4.822866344605475e-05, "loss": 0.6851, "step": 365 }, { "epoch": 0.3976099945681695, "grad_norm": 0.354515736645034, "learning_rate": 4.82085346215781e-05, "loss": 0.5981, "step": 366 }, { "epoch": 0.39869636067354697, "grad_norm": 0.3285390908844221, "learning_rate": 4.818840579710145e-05, "loss": 0.5237, "step": 367 }, { "epoch": 0.3997827267789245, "grad_norm": 0.31136196007724176, "learning_rate": 4.81682769726248e-05, "loss": 0.567, "step": 368 }, { "epoch": 0.400869092884302, "grad_norm": 0.369774385851412, "learning_rate": 4.814814814814815e-05, "loss": 0.6756, "step": 369 }, { "epoch": 0.40195545898967955, "grad_norm": 0.33795041292459543, "learning_rate": 4.81280193236715e-05, "loss": 0.6412, "step": 370 }, { "epoch": 0.40304182509505704, "grad_norm": 0.3458101814916495, "learning_rate": 4.810789049919485e-05, "loss": 0.6094, "step": 371 }, { "epoch": 0.40412819120043453, "grad_norm": 0.5983070332631912, "learning_rate": 4.80877616747182e-05, "loss": 0.579, "step": 372 }, { "epoch": 0.4052145573058121, "grad_norm": 0.3548282635593562, "learning_rate": 4.806763285024155e-05, "loss": 0.5944, "step": 373 }, { "epoch": 0.40630092341118956, "grad_norm": 0.3727756358532041, "learning_rate": 4.80475040257649e-05, "loss": 0.6092, "step": 374 }, { "epoch": 0.4073872895165671, "grad_norm": 0.39011195814342375, "learning_rate": 4.802737520128825e-05, "loss": 0.666, "step": 375 }, { "epoch": 0.4084736556219446, "grad_norm": 0.39429371486432985, "learning_rate": 4.80072463768116e-05, "loss": 0.638, "step": 376 }, { "epoch": 0.4095600217273221, "grad_norm": 0.3875116222250447, "learning_rate": 4.7987117552334945e-05, "loss": 0.5522, "step": 377 }, { "epoch": 0.41064638783269963, "grad_norm": 0.3996911354997838, "learning_rate": 4.7966988727858295e-05, "loss": 0.6148, "step": 378 }, { "epoch": 0.4117327539380771, "grad_norm": 0.337395105654431, "learning_rate": 4.7946859903381646e-05, "loss": 0.5408, "step": 379 }, { "epoch": 0.41281912004345467, "grad_norm": 0.3981335379617971, "learning_rate": 4.7926731078904996e-05, "loss": 0.6355, "step": 380 }, { "epoch": 0.41390548614883216, "grad_norm": 0.3737563727353856, "learning_rate": 4.790660225442835e-05, "loss": 0.6024, "step": 381 }, { "epoch": 0.41499185225420965, "grad_norm": 0.3247813583283968, "learning_rate": 4.78864734299517e-05, "loss": 0.5253, "step": 382 }, { "epoch": 0.4160782183595872, "grad_norm": 2.352926786020598, "learning_rate": 4.786634460547504e-05, "loss": 0.7029, "step": 383 }, { "epoch": 0.4171645844649647, "grad_norm": 0.5385109046659324, "learning_rate": 4.784621578099839e-05, "loss": 0.6489, "step": 384 }, { "epoch": 0.41825095057034223, "grad_norm": 0.4569959869356735, "learning_rate": 4.782608695652174e-05, "loss": 0.6963, "step": 385 }, { "epoch": 0.4193373166757197, "grad_norm": 0.3568703067687905, "learning_rate": 4.780595813204509e-05, "loss": 0.5468, "step": 386 }, { "epoch": 0.4204236827810972, "grad_norm": 0.3364839363840677, "learning_rate": 4.778582930756844e-05, "loss": 0.5632, "step": 387 }, { "epoch": 0.42151004888647475, "grad_norm": 0.4361492326645728, "learning_rate": 4.776570048309179e-05, "loss": 0.6234, "step": 388 }, { "epoch": 0.42259641499185224, "grad_norm": 0.3406092056186368, "learning_rate": 4.7745571658615143e-05, "loss": 0.6089, "step": 389 }, { "epoch": 0.4236827810972298, "grad_norm": 0.41805200757740274, "learning_rate": 4.772544283413849e-05, "loss": 0.5879, "step": 390 }, { "epoch": 0.4247691472026073, "grad_norm": 0.9705772926450914, "learning_rate": 4.770531400966184e-05, "loss": 0.5705, "step": 391 }, { "epoch": 0.42585551330798477, "grad_norm": 0.43786365868102706, "learning_rate": 4.768518518518519e-05, "loss": 0.6706, "step": 392 }, { "epoch": 0.4269418794133623, "grad_norm": 0.3206057730617046, "learning_rate": 4.766505636070854e-05, "loss": 0.5156, "step": 393 }, { "epoch": 0.4280282455187398, "grad_norm": 0.4415140569898448, "learning_rate": 4.764492753623189e-05, "loss": 0.6042, "step": 394 }, { "epoch": 0.42911461162411735, "grad_norm": 0.45835863800919646, "learning_rate": 4.762479871175524e-05, "loss": 0.6008, "step": 395 }, { "epoch": 0.43020097772949484, "grad_norm": 0.43544998354048303, "learning_rate": 4.760466988727858e-05, "loss": 0.5701, "step": 396 }, { "epoch": 0.4312873438348723, "grad_norm": 0.41968345190790757, "learning_rate": 4.7584541062801933e-05, "loss": 0.5837, "step": 397 }, { "epoch": 0.4323737099402499, "grad_norm": 0.3148965008893944, "learning_rate": 4.7564412238325284e-05, "loss": 0.466, "step": 398 }, { "epoch": 0.43346007604562736, "grad_norm": 0.3572351751172452, "learning_rate": 4.7544283413848634e-05, "loss": 0.5494, "step": 399 }, { "epoch": 0.4345464421510049, "grad_norm": 0.3135010242332235, "learning_rate": 4.7524154589371985e-05, "loss": 0.6341, "step": 400 }, { "epoch": 0.4356328082563824, "grad_norm": 0.2798271317958357, "learning_rate": 4.7504025764895335e-05, "loss": 0.5368, "step": 401 }, { "epoch": 0.4367191743617599, "grad_norm": 0.3298353573731893, "learning_rate": 4.748389694041868e-05, "loss": 0.5844, "step": 402 }, { "epoch": 0.43780554046713743, "grad_norm": 0.30661860984953815, "learning_rate": 4.746376811594203e-05, "loss": 0.6086, "step": 403 }, { "epoch": 0.4388919065725149, "grad_norm": 0.34709263276033087, "learning_rate": 4.744363929146538e-05, "loss": 0.6001, "step": 404 }, { "epoch": 0.43997827267789247, "grad_norm": 0.34939843767031104, "learning_rate": 4.742351046698873e-05, "loss": 0.629, "step": 405 }, { "epoch": 0.44106463878326996, "grad_norm": 0.33674712165347076, "learning_rate": 4.740338164251208e-05, "loss": 0.5376, "step": 406 }, { "epoch": 0.44215100488864745, "grad_norm": 0.4029453970559045, "learning_rate": 4.738325281803543e-05, "loss": 0.6396, "step": 407 }, { "epoch": 0.443237370994025, "grad_norm": 0.3334452097984024, "learning_rate": 4.7363123993558775e-05, "loss": 0.5577, "step": 408 }, { "epoch": 0.4443237370994025, "grad_norm": 0.3766090552332058, "learning_rate": 4.7342995169082125e-05, "loss": 0.5945, "step": 409 }, { "epoch": 0.44541010320478003, "grad_norm": 4.463496357902373, "learning_rate": 4.7322866344605476e-05, "loss": 0.6133, "step": 410 }, { "epoch": 0.4464964693101575, "grad_norm": 0.45967669080480855, "learning_rate": 4.7302737520128826e-05, "loss": 0.6271, "step": 411 }, { "epoch": 0.44758283541553506, "grad_norm": 0.39684230397297027, "learning_rate": 4.7282608695652177e-05, "loss": 0.6375, "step": 412 }, { "epoch": 0.44866920152091255, "grad_norm": 0.4061856474766164, "learning_rate": 4.726247987117553e-05, "loss": 0.532, "step": 413 }, { "epoch": 0.44975556762629004, "grad_norm": 2.885097097723057, "learning_rate": 4.724235104669887e-05, "loss": 0.5319, "step": 414 }, { "epoch": 0.4508419337316676, "grad_norm": 0.9486670006835104, "learning_rate": 4.722222222222222e-05, "loss": 0.6808, "step": 415 }, { "epoch": 0.4519282998370451, "grad_norm": 0.605271527861985, "learning_rate": 4.720209339774557e-05, "loss": 0.6533, "step": 416 }, { "epoch": 0.4530146659424226, "grad_norm": 0.47183266093159826, "learning_rate": 4.718196457326892e-05, "loss": 0.5511, "step": 417 }, { "epoch": 0.4541010320478001, "grad_norm": 0.6212982558960221, "learning_rate": 4.716183574879227e-05, "loss": 0.6136, "step": 418 }, { "epoch": 0.4551873981531776, "grad_norm": 0.44123859162928697, "learning_rate": 4.714170692431562e-05, "loss": 0.6008, "step": 419 }, { "epoch": 0.45627376425855515, "grad_norm": 1.050279639323627, "learning_rate": 4.712157809983897e-05, "loss": 0.5292, "step": 420 }, { "epoch": 0.45736013036393264, "grad_norm": 0.7505332169054151, "learning_rate": 4.710144927536232e-05, "loss": 0.7037, "step": 421 }, { "epoch": 0.4584464964693102, "grad_norm": 0.4232688620731375, "learning_rate": 4.708132045088567e-05, "loss": 0.4935, "step": 422 }, { "epoch": 0.4595328625746877, "grad_norm": 0.612441149311464, "learning_rate": 4.706119162640902e-05, "loss": 0.5574, "step": 423 }, { "epoch": 0.46061922868006516, "grad_norm": 0.421751383862468, "learning_rate": 4.704106280193237e-05, "loss": 0.5847, "step": 424 }, { "epoch": 0.4617055947854427, "grad_norm": 4.097652592052832, "learning_rate": 4.702093397745572e-05, "loss": 0.7265, "step": 425 }, { "epoch": 0.4627919608908202, "grad_norm": 0.9223283885428981, "learning_rate": 4.700080515297907e-05, "loss": 0.6251, "step": 426 }, { "epoch": 0.46387832699619774, "grad_norm": 0.5410213463547794, "learning_rate": 4.698067632850241e-05, "loss": 0.589, "step": 427 }, { "epoch": 0.46496469310157523, "grad_norm": 0.5194858270108019, "learning_rate": 4.696054750402576e-05, "loss": 0.501, "step": 428 }, { "epoch": 0.4660510592069527, "grad_norm": 0.3855427884207616, "learning_rate": 4.6940418679549114e-05, "loss": 0.604, "step": 429 }, { "epoch": 0.46713742531233027, "grad_norm": 0.6151292792753239, "learning_rate": 4.6920289855072464e-05, "loss": 0.5851, "step": 430 }, { "epoch": 0.46822379141770776, "grad_norm": 1.9210884776045805, "learning_rate": 4.6900161030595815e-05, "loss": 0.662, "step": 431 }, { "epoch": 0.4693101575230853, "grad_norm": 0.5074808925096181, "learning_rate": 4.6880032206119165e-05, "loss": 0.5437, "step": 432 }, { "epoch": 0.4703965236284628, "grad_norm": 0.40279403557566856, "learning_rate": 4.6859903381642516e-05, "loss": 0.6097, "step": 433 }, { "epoch": 0.4714828897338403, "grad_norm": 0.4345193741758728, "learning_rate": 4.6839774557165866e-05, "loss": 0.5773, "step": 434 }, { "epoch": 0.4725692558392178, "grad_norm": 0.37253878365343635, "learning_rate": 4.6819645732689216e-05, "loss": 0.5877, "step": 435 }, { "epoch": 0.4736556219445953, "grad_norm": 0.3835532388486165, "learning_rate": 4.679951690821257e-05, "loss": 0.5329, "step": 436 }, { "epoch": 0.47474198804997286, "grad_norm": 0.4251609155319328, "learning_rate": 4.677938808373592e-05, "loss": 0.6629, "step": 437 }, { "epoch": 0.47582835415535035, "grad_norm": 0.38736986300486, "learning_rate": 4.675925925925926e-05, "loss": 0.586, "step": 438 }, { "epoch": 0.47691472026072784, "grad_norm": 0.32876979731597233, "learning_rate": 4.673913043478261e-05, "loss": 0.5594, "step": 439 }, { "epoch": 0.4780010863661054, "grad_norm": 0.33257637028455167, "learning_rate": 4.671900161030596e-05, "loss": 0.5366, "step": 440 }, { "epoch": 0.4790874524714829, "grad_norm": 0.3853167521064687, "learning_rate": 4.669887278582931e-05, "loss": 0.6959, "step": 441 }, { "epoch": 0.4801738185768604, "grad_norm": 0.27522978650279517, "learning_rate": 4.667874396135266e-05, "loss": 0.5369, "step": 442 }, { "epoch": 0.4812601846822379, "grad_norm": 0.373865493886352, "learning_rate": 4.665861513687601e-05, "loss": 0.625, "step": 443 }, { "epoch": 0.4823465507876154, "grad_norm": 0.33056286613514113, "learning_rate": 4.663848631239936e-05, "loss": 0.655, "step": 444 }, { "epoch": 0.48343291689299295, "grad_norm": 0.3914664715049825, "learning_rate": 4.661835748792271e-05, "loss": 0.6147, "step": 445 }, { "epoch": 0.48451928299837044, "grad_norm": 0.4023255181838839, "learning_rate": 4.659822866344606e-05, "loss": 0.6841, "step": 446 }, { "epoch": 0.485605649103748, "grad_norm": 0.3324300933780938, "learning_rate": 4.657809983896941e-05, "loss": 0.5685, "step": 447 }, { "epoch": 0.4866920152091255, "grad_norm": 0.30956604354778466, "learning_rate": 4.655797101449276e-05, "loss": 0.6251, "step": 448 }, { "epoch": 0.48777838131450296, "grad_norm": 0.3317558414480103, "learning_rate": 4.653784219001611e-05, "loss": 0.4871, "step": 449 }, { "epoch": 0.4888647474198805, "grad_norm": 0.2866221036000792, "learning_rate": 4.651771336553945e-05, "loss": 0.5532, "step": 450 }, { "epoch": 0.489951113525258, "grad_norm": 0.35268847077620036, "learning_rate": 4.64975845410628e-05, "loss": 0.5916, "step": 451 }, { "epoch": 0.49103747963063554, "grad_norm": 0.33296268411537283, "learning_rate": 4.6477455716586154e-05, "loss": 0.5842, "step": 452 }, { "epoch": 0.49212384573601303, "grad_norm": 0.33065731687999067, "learning_rate": 4.6457326892109504e-05, "loss": 0.5889, "step": 453 }, { "epoch": 0.4932102118413905, "grad_norm": 5.35387238173493, "learning_rate": 4.6437198067632854e-05, "loss": 0.7111, "step": 454 }, { "epoch": 0.49429657794676807, "grad_norm": 0.39725064939828564, "learning_rate": 4.6417069243156205e-05, "loss": 0.5718, "step": 455 }, { "epoch": 0.49538294405214556, "grad_norm": 0.2887144889454067, "learning_rate": 4.6396940418679555e-05, "loss": 0.5534, "step": 456 }, { "epoch": 0.4964693101575231, "grad_norm": 0.3178914994494645, "learning_rate": 4.63768115942029e-05, "loss": 0.592, "step": 457 }, { "epoch": 0.4975556762629006, "grad_norm": 0.3406891408747439, "learning_rate": 4.635668276972625e-05, "loss": 0.6025, "step": 458 }, { "epoch": 0.4986420423682781, "grad_norm": 0.30670554804553957, "learning_rate": 4.63365539452496e-05, "loss": 0.5931, "step": 459 }, { "epoch": 0.4997284084736556, "grad_norm": 0.3507545768963338, "learning_rate": 4.631642512077295e-05, "loss": 0.4789, "step": 460 }, { "epoch": 0.5008147745790331, "grad_norm": 0.3312432204424875, "learning_rate": 4.62962962962963e-05, "loss": 0.6251, "step": 461 }, { "epoch": 0.5019011406844106, "grad_norm": 0.2839629820117148, "learning_rate": 4.627616747181965e-05, "loss": 0.5726, "step": 462 }, { "epoch": 0.5029875067897882, "grad_norm": 0.34988725634101764, "learning_rate": 4.6256038647342995e-05, "loss": 0.5736, "step": 463 }, { "epoch": 0.5040738728951657, "grad_norm": 0.2867955995325992, "learning_rate": 4.6235909822866345e-05, "loss": 0.5117, "step": 464 }, { "epoch": 0.5051602390005432, "grad_norm": 0.31421665451219233, "learning_rate": 4.6215780998389696e-05, "loss": 0.5454, "step": 465 }, { "epoch": 0.5062466051059207, "grad_norm": 0.31798364787062666, "learning_rate": 4.6195652173913046e-05, "loss": 0.6131, "step": 466 }, { "epoch": 0.5073329712112982, "grad_norm": 0.26831960486164064, "learning_rate": 4.61755233494364e-05, "loss": 0.5238, "step": 467 }, { "epoch": 0.5084193373166758, "grad_norm": 2.10848088127481, "learning_rate": 4.615539452495975e-05, "loss": 0.6286, "step": 468 }, { "epoch": 0.5095057034220533, "grad_norm": 1.241828825140304, "learning_rate": 4.613526570048309e-05, "loss": 0.5952, "step": 469 }, { "epoch": 0.5105920695274307, "grad_norm": 0.40646433421750344, "learning_rate": 4.611513687600644e-05, "loss": 0.5723, "step": 470 }, { "epoch": 0.5116784356328082, "grad_norm": 0.40157270193570693, "learning_rate": 4.609500805152979e-05, "loss": 0.5497, "step": 471 }, { "epoch": 0.5127648017381857, "grad_norm": 0.5273288128529262, "learning_rate": 4.607487922705314e-05, "loss": 0.6246, "step": 472 }, { "epoch": 0.5138511678435633, "grad_norm": 0.3306495299562268, "learning_rate": 4.605475040257649e-05, "loss": 0.5542, "step": 473 }, { "epoch": 0.5149375339489408, "grad_norm": 0.4001468297434879, "learning_rate": 4.603462157809984e-05, "loss": 0.5398, "step": 474 }, { "epoch": 0.5160239000543183, "grad_norm": 0.37117547251666005, "learning_rate": 4.601449275362319e-05, "loss": 0.5051, "step": 475 }, { "epoch": 0.5171102661596958, "grad_norm": 0.37330474371205713, "learning_rate": 4.599436392914654e-05, "loss": 0.6274, "step": 476 }, { "epoch": 0.5181966322650733, "grad_norm": 0.3177150608318576, "learning_rate": 4.597423510466989e-05, "loss": 0.6233, "step": 477 }, { "epoch": 0.5192829983704509, "grad_norm": 0.3395758209267892, "learning_rate": 4.595410628019324e-05, "loss": 0.5558, "step": 478 }, { "epoch": 0.5203693644758284, "grad_norm": 0.43145196661623064, "learning_rate": 4.593397745571659e-05, "loss": 0.5135, "step": 479 }, { "epoch": 0.5214557305812059, "grad_norm": 0.3378938985710016, "learning_rate": 4.591384863123994e-05, "loss": 0.6307, "step": 480 }, { "epoch": 0.5225420966865834, "grad_norm": 0.3796611078692407, "learning_rate": 4.589371980676328e-05, "loss": 0.6247, "step": 481 }, { "epoch": 0.5236284627919608, "grad_norm": 0.31777229444380634, "learning_rate": 4.587359098228663e-05, "loss": 0.5715, "step": 482 }, { "epoch": 0.5247148288973384, "grad_norm": 1.2541220000841995, "learning_rate": 4.5853462157809983e-05, "loss": 0.6311, "step": 483 }, { "epoch": 0.5258011950027159, "grad_norm": 0.375274080817239, "learning_rate": 4.5833333333333334e-05, "loss": 0.6323, "step": 484 }, { "epoch": 0.5268875611080934, "grad_norm": 0.3030819222711369, "learning_rate": 4.5813204508856684e-05, "loss": 0.5558, "step": 485 }, { "epoch": 0.5279739272134709, "grad_norm": 0.29724438991716257, "learning_rate": 4.5793075684380035e-05, "loss": 0.5411, "step": 486 }, { "epoch": 0.5290602933188484, "grad_norm": 0.3692008426629504, "learning_rate": 4.577294685990338e-05, "loss": 0.5703, "step": 487 }, { "epoch": 0.530146659424226, "grad_norm": 0.3067195803068957, "learning_rate": 4.575281803542673e-05, "loss": 0.55, "step": 488 }, { "epoch": 0.5312330255296035, "grad_norm": 0.3210155514239663, "learning_rate": 4.573268921095008e-05, "loss": 0.6002, "step": 489 }, { "epoch": 0.532319391634981, "grad_norm": 0.3690643623258139, "learning_rate": 4.571256038647343e-05, "loss": 0.5103, "step": 490 }, { "epoch": 0.5334057577403585, "grad_norm": 0.2765860013081951, "learning_rate": 4.569243156199678e-05, "loss": 0.5799, "step": 491 }, { "epoch": 0.534492123845736, "grad_norm": 0.33429010965550565, "learning_rate": 4.567230273752013e-05, "loss": 0.6094, "step": 492 }, { "epoch": 0.5355784899511136, "grad_norm": 0.29136251790542295, "learning_rate": 4.565217391304348e-05, "loss": 0.5378, "step": 493 }, { "epoch": 0.5366648560564911, "grad_norm": 0.2844301702531187, "learning_rate": 4.5632045088566825e-05, "loss": 0.5864, "step": 494 }, { "epoch": 0.5377512221618685, "grad_norm": 0.5545061371868792, "learning_rate": 4.561191626409018e-05, "loss": 0.6204, "step": 495 }, { "epoch": 0.538837588267246, "grad_norm": 0.318517137156566, "learning_rate": 4.559178743961353e-05, "loss": 0.6169, "step": 496 }, { "epoch": 0.5399239543726235, "grad_norm": 0.5614006587860757, "learning_rate": 4.557165861513688e-05, "loss": 0.6405, "step": 497 }, { "epoch": 0.5410103204780011, "grad_norm": 4.322840399830027, "learning_rate": 4.555152979066023e-05, "loss": 0.7318, "step": 498 }, { "epoch": 0.5420966865833786, "grad_norm": 0.42310607386376065, "learning_rate": 4.553140096618358e-05, "loss": 0.6113, "step": 499 }, { "epoch": 0.5431830526887561, "grad_norm": 0.28008353513819445, "learning_rate": 4.551127214170693e-05, "loss": 0.5477, "step": 500 }, { "epoch": 0.5442694187941336, "grad_norm": 0.30356337594171395, "learning_rate": 4.549114331723028e-05, "loss": 0.5237, "step": 501 }, { "epoch": 0.5453557848995111, "grad_norm": 0.29136899459338034, "learning_rate": 4.547101449275363e-05, "loss": 0.546, "step": 502 }, { "epoch": 0.5464421510048887, "grad_norm": 0.3242872074752499, "learning_rate": 4.545088566827698e-05, "loss": 0.5685, "step": 503 }, { "epoch": 0.5475285171102662, "grad_norm": 0.289072080861752, "learning_rate": 4.543075684380033e-05, "loss": 0.6173, "step": 504 }, { "epoch": 0.5486148832156437, "grad_norm": 1.8161868411531035, "learning_rate": 4.541062801932367e-05, "loss": 0.6846, "step": 505 }, { "epoch": 0.5497012493210212, "grad_norm": 0.34978871574336756, "learning_rate": 4.539049919484702e-05, "loss": 0.5456, "step": 506 }, { "epoch": 0.5507876154263986, "grad_norm": 0.3532095536615177, "learning_rate": 4.5370370370370374e-05, "loss": 0.5817, "step": 507 }, { "epoch": 0.5518739815317762, "grad_norm": 0.3178968918320975, "learning_rate": 4.5350241545893724e-05, "loss": 0.6827, "step": 508 }, { "epoch": 0.5529603476371537, "grad_norm": 0.35437237560411355, "learning_rate": 4.5330112721417075e-05, "loss": 0.5824, "step": 509 }, { "epoch": 0.5540467137425312, "grad_norm": 0.35906433315398284, "learning_rate": 4.5309983896940425e-05, "loss": 0.6308, "step": 510 }, { "epoch": 0.5551330798479087, "grad_norm": 0.3371988682139157, "learning_rate": 4.528985507246377e-05, "loss": 0.6189, "step": 511 }, { "epoch": 0.5562194459532862, "grad_norm": 0.34773854106391766, "learning_rate": 4.526972624798712e-05, "loss": 0.6193, "step": 512 }, { "epoch": 0.5573058120586638, "grad_norm": 0.30040461367639065, "learning_rate": 4.524959742351047e-05, "loss": 0.6796, "step": 513 }, { "epoch": 0.5583921781640413, "grad_norm": 0.29839148503376767, "learning_rate": 4.522946859903382e-05, "loss": 0.4633, "step": 514 }, { "epoch": 0.5594785442694188, "grad_norm": 0.3898999776519876, "learning_rate": 4.520933977455717e-05, "loss": 0.6043, "step": 515 }, { "epoch": 0.5605649103747963, "grad_norm": 0.31323168094149345, "learning_rate": 4.518921095008052e-05, "loss": 0.5771, "step": 516 }, { "epoch": 0.5616512764801738, "grad_norm": 0.3599921358881633, "learning_rate": 4.5169082125603865e-05, "loss": 0.6147, "step": 517 }, { "epoch": 0.5627376425855514, "grad_norm": 0.36986379425585436, "learning_rate": 4.5148953301127215e-05, "loss": 0.629, "step": 518 }, { "epoch": 0.5638240086909289, "grad_norm": 0.3067346615929688, "learning_rate": 4.5128824476650565e-05, "loss": 0.5815, "step": 519 }, { "epoch": 0.5649103747963063, "grad_norm": 0.321662723162871, "learning_rate": 4.5108695652173916e-05, "loss": 0.4992, "step": 520 }, { "epoch": 0.5659967409016838, "grad_norm": 0.4372731119277421, "learning_rate": 4.5088566827697266e-05, "loss": 0.5525, "step": 521 }, { "epoch": 0.5670831070070613, "grad_norm": 0.2746048318744581, "learning_rate": 4.506843800322062e-05, "loss": 0.526, "step": 522 }, { "epoch": 0.5681694731124389, "grad_norm": 0.36321929270499054, "learning_rate": 4.504830917874396e-05, "loss": 0.5558, "step": 523 }, { "epoch": 0.5692558392178164, "grad_norm": 0.35078035811496033, "learning_rate": 4.502818035426731e-05, "loss": 0.6041, "step": 524 }, { "epoch": 0.5703422053231939, "grad_norm": 0.32097833519004565, "learning_rate": 4.500805152979066e-05, "loss": 0.5756, "step": 525 }, { "epoch": 0.5714285714285714, "grad_norm": 0.31309210162467127, "learning_rate": 4.498792270531401e-05, "loss": 0.5956, "step": 526 }, { "epoch": 0.5725149375339489, "grad_norm": 0.33988541935676575, "learning_rate": 4.496779388083736e-05, "loss": 0.4892, "step": 527 }, { "epoch": 0.5736013036393265, "grad_norm": 0.30805294072076145, "learning_rate": 4.494766505636071e-05, "loss": 0.5855, "step": 528 }, { "epoch": 0.574687669744704, "grad_norm": 0.3366594271820263, "learning_rate": 4.492753623188406e-05, "loss": 0.4805, "step": 529 }, { "epoch": 0.5757740358500815, "grad_norm": 0.37928548566196957, "learning_rate": 4.490740740740741e-05, "loss": 0.6241, "step": 530 }, { "epoch": 0.576860401955459, "grad_norm": 0.30742391467683355, "learning_rate": 4.488727858293076e-05, "loss": 0.6086, "step": 531 }, { "epoch": 0.5779467680608364, "grad_norm": 0.31576647315641115, "learning_rate": 4.486714975845411e-05, "loss": 0.5462, "step": 532 }, { "epoch": 0.579033134166214, "grad_norm": 0.28703115376320776, "learning_rate": 4.484702093397746e-05, "loss": 0.5784, "step": 533 }, { "epoch": 0.5801195002715915, "grad_norm": 0.392361699383929, "learning_rate": 4.482689210950081e-05, "loss": 0.5553, "step": 534 }, { "epoch": 0.581205866376969, "grad_norm": 0.2457996247238777, "learning_rate": 4.480676328502416e-05, "loss": 0.5183, "step": 535 }, { "epoch": 0.5822922324823465, "grad_norm": 0.3819855424666858, "learning_rate": 4.47866344605475e-05, "loss": 0.5848, "step": 536 }, { "epoch": 0.5833785985877241, "grad_norm": 0.3850425708340909, "learning_rate": 4.476650563607085e-05, "loss": 0.6249, "step": 537 }, { "epoch": 0.5844649646931016, "grad_norm": 0.31842638348730595, "learning_rate": 4.4746376811594203e-05, "loss": 0.5951, "step": 538 }, { "epoch": 0.5855513307984791, "grad_norm": 0.36303273537471714, "learning_rate": 4.4726247987117554e-05, "loss": 0.5858, "step": 539 }, { "epoch": 0.5866376969038566, "grad_norm": 0.3117510634246719, "learning_rate": 4.4706119162640904e-05, "loss": 0.5514, "step": 540 }, { "epoch": 0.5877240630092341, "grad_norm": 0.2670798922892835, "learning_rate": 4.4685990338164255e-05, "loss": 0.5277, "step": 541 }, { "epoch": 0.5888104291146117, "grad_norm": 0.3064884224005411, "learning_rate": 4.46658615136876e-05, "loss": 0.4741, "step": 542 }, { "epoch": 0.5898967952199892, "grad_norm": 0.310083165559287, "learning_rate": 4.464573268921095e-05, "loss": 0.6051, "step": 543 }, { "epoch": 0.5909831613253667, "grad_norm": 0.27622117537429336, "learning_rate": 4.46256038647343e-05, "loss": 0.6423, "step": 544 }, { "epoch": 0.5920695274307441, "grad_norm": 0.27993483123416674, "learning_rate": 4.460547504025765e-05, "loss": 0.4805, "step": 545 }, { "epoch": 0.5931558935361216, "grad_norm": 0.32417797778998203, "learning_rate": 4.4585346215781e-05, "loss": 0.5567, "step": 546 }, { "epoch": 0.5942422596414992, "grad_norm": 0.29605811507235974, "learning_rate": 4.456521739130435e-05, "loss": 0.5729, "step": 547 }, { "epoch": 0.5953286257468767, "grad_norm": 0.29828599424858376, "learning_rate": 4.4545088566827694e-05, "loss": 0.6101, "step": 548 }, { "epoch": 0.5964149918522542, "grad_norm": 0.28540790776426933, "learning_rate": 4.4524959742351045e-05, "loss": 0.588, "step": 549 }, { "epoch": 0.5975013579576317, "grad_norm": 0.3134156032331217, "learning_rate": 4.4504830917874395e-05, "loss": 0.5963, "step": 550 }, { "epoch": 0.5985877240630092, "grad_norm": 0.26562086939111457, "learning_rate": 4.4484702093397746e-05, "loss": 0.5348, "step": 551 }, { "epoch": 0.5996740901683868, "grad_norm": 0.24425537730145894, "learning_rate": 4.4464573268921096e-05, "loss": 0.4238, "step": 552 }, { "epoch": 0.6007604562737643, "grad_norm": 0.3227903761717977, "learning_rate": 4.4444444444444447e-05, "loss": 0.5276, "step": 553 }, { "epoch": 0.6018468223791418, "grad_norm": 0.26672863118483947, "learning_rate": 4.442431561996779e-05, "loss": 0.4945, "step": 554 }, { "epoch": 0.6029331884845193, "grad_norm": 0.28158559387630955, "learning_rate": 4.440418679549114e-05, "loss": 0.5702, "step": 555 }, { "epoch": 0.6040195545898968, "grad_norm": 0.2697349078497178, "learning_rate": 4.438405797101449e-05, "loss": 0.5005, "step": 556 }, { "epoch": 0.6051059206952744, "grad_norm": 0.2929744350260733, "learning_rate": 4.436392914653785e-05, "loss": 0.5482, "step": 557 }, { "epoch": 0.6061922868006518, "grad_norm": 0.2532143698595884, "learning_rate": 4.43438003220612e-05, "loss": 0.4897, "step": 558 }, { "epoch": 0.6072786529060293, "grad_norm": 0.2884881334308616, "learning_rate": 4.432367149758454e-05, "loss": 0.5349, "step": 559 }, { "epoch": 0.6083650190114068, "grad_norm": 0.2923979170211628, "learning_rate": 4.430354267310789e-05, "loss": 0.5402, "step": 560 }, { "epoch": 0.6094513851167843, "grad_norm": 0.26919893447193854, "learning_rate": 4.428341384863124e-05, "loss": 0.5799, "step": 561 }, { "epoch": 0.6105377512221619, "grad_norm": 0.2720077132435709, "learning_rate": 4.4263285024154594e-05, "loss": 0.5686, "step": 562 }, { "epoch": 0.6116241173275394, "grad_norm": 0.31827559695512353, "learning_rate": 4.4243156199677944e-05, "loss": 0.5619, "step": 563 }, { "epoch": 0.6127104834329169, "grad_norm": 2.1590873328974083, "learning_rate": 4.4223027375201295e-05, "loss": 0.5544, "step": 564 }, { "epoch": 0.6137968495382944, "grad_norm": 0.3937901432455096, "learning_rate": 4.4202898550724645e-05, "loss": 0.5944, "step": 565 }, { "epoch": 0.6148832156436719, "grad_norm": 0.30126820140514676, "learning_rate": 4.418276972624799e-05, "loss": 0.5689, "step": 566 }, { "epoch": 0.6159695817490495, "grad_norm": 0.8644691934910457, "learning_rate": 4.416264090177134e-05, "loss": 0.5902, "step": 567 }, { "epoch": 0.617055947854427, "grad_norm": 0.39910214076365913, "learning_rate": 4.414251207729469e-05, "loss": 0.5864, "step": 568 }, { "epoch": 0.6181423139598045, "grad_norm": 1.498557761959139, "learning_rate": 4.412238325281804e-05, "loss": 0.63, "step": 569 }, { "epoch": 0.619228680065182, "grad_norm": 0.3718653221031873, "learning_rate": 4.410225442834139e-05, "loss": 0.5651, "step": 570 }, { "epoch": 0.6203150461705594, "grad_norm": 0.3424011389445576, "learning_rate": 4.408212560386474e-05, "loss": 0.5961, "step": 571 }, { "epoch": 0.621401412275937, "grad_norm": 0.2817501362306431, "learning_rate": 4.4061996779388085e-05, "loss": 0.4771, "step": 572 }, { "epoch": 0.6224877783813145, "grad_norm": 0.34169844118510745, "learning_rate": 4.4041867954911435e-05, "loss": 0.5904, "step": 573 }, { "epoch": 0.623574144486692, "grad_norm": 0.533391377968989, "learning_rate": 4.4021739130434786e-05, "loss": 0.5331, "step": 574 }, { "epoch": 0.6246605105920695, "grad_norm": 0.3512470503769232, "learning_rate": 4.4001610305958136e-05, "loss": 0.5239, "step": 575 }, { "epoch": 0.625746876697447, "grad_norm": 0.30755374423752424, "learning_rate": 4.3981481481481486e-05, "loss": 0.4987, "step": 576 }, { "epoch": 0.6268332428028246, "grad_norm": 0.41962205581953893, "learning_rate": 4.396135265700484e-05, "loss": 0.5862, "step": 577 }, { "epoch": 0.6279196089082021, "grad_norm": 0.30481371468505797, "learning_rate": 4.394122383252818e-05, "loss": 0.5442, "step": 578 }, { "epoch": 0.6290059750135796, "grad_norm": 0.7041520938375232, "learning_rate": 4.392109500805153e-05, "loss": 0.5594, "step": 579 }, { "epoch": 0.6300923411189571, "grad_norm": 0.3865445475383624, "learning_rate": 4.390096618357488e-05, "loss": 0.556, "step": 580 }, { "epoch": 0.6311787072243346, "grad_norm": 0.32961327627949605, "learning_rate": 4.388083735909823e-05, "loss": 0.576, "step": 581 }, { "epoch": 0.6322650733297122, "grad_norm": 0.38058275544859693, "learning_rate": 4.386070853462158e-05, "loss": 0.591, "step": 582 }, { "epoch": 0.6333514394350896, "grad_norm": 0.33526118419921314, "learning_rate": 4.384057971014493e-05, "loss": 0.6066, "step": 583 }, { "epoch": 0.6344378055404671, "grad_norm": 0.33756631721835634, "learning_rate": 4.3820450885668276e-05, "loss": 0.64, "step": 584 }, { "epoch": 0.6355241716458446, "grad_norm": 0.29444577249577025, "learning_rate": 4.380032206119163e-05, "loss": 0.6073, "step": 585 }, { "epoch": 0.6366105377512221, "grad_norm": 0.2859112551693706, "learning_rate": 4.378019323671498e-05, "loss": 0.5398, "step": 586 }, { "epoch": 0.6376969038565997, "grad_norm": 9.723890578072, "learning_rate": 4.376006441223833e-05, "loss": 0.6119, "step": 587 }, { "epoch": 0.6387832699619772, "grad_norm": 0.4805041877736139, "learning_rate": 4.373993558776168e-05, "loss": 0.5741, "step": 588 }, { "epoch": 0.6398696360673547, "grad_norm": 3.2497703137343823, "learning_rate": 4.371980676328503e-05, "loss": 0.6855, "step": 589 }, { "epoch": 0.6409560021727322, "grad_norm": 0.4554920847526277, "learning_rate": 4.369967793880837e-05, "loss": 0.5145, "step": 590 }, { "epoch": 0.6420423682781097, "grad_norm": 1.452767280509615, "learning_rate": 4.367954911433172e-05, "loss": 0.6533, "step": 591 }, { "epoch": 0.6431287343834873, "grad_norm": 0.3280145389749109, "learning_rate": 4.365942028985507e-05, "loss": 0.6102, "step": 592 }, { "epoch": 0.6442151004888648, "grad_norm": 0.41219504893127973, "learning_rate": 4.3639291465378424e-05, "loss": 0.5571, "step": 593 }, { "epoch": 0.6453014665942423, "grad_norm": 0.4061820807397402, "learning_rate": 4.3619162640901774e-05, "loss": 0.5855, "step": 594 }, { "epoch": 0.6463878326996197, "grad_norm": 0.4667077341822779, "learning_rate": 4.3599033816425124e-05, "loss": 0.5684, "step": 595 }, { "epoch": 0.6474741988049972, "grad_norm": 0.36101846972999063, "learning_rate": 4.3578904991948475e-05, "loss": 0.5, "step": 596 }, { "epoch": 0.6485605649103748, "grad_norm": 0.32742924711361443, "learning_rate": 4.355877616747182e-05, "loss": 0.5602, "step": 597 }, { "epoch": 0.6496469310157523, "grad_norm": 0.5786510376734098, "learning_rate": 4.353864734299517e-05, "loss": 0.5653, "step": 598 }, { "epoch": 0.6507332971211298, "grad_norm": 0.497231474708839, "learning_rate": 4.351851851851852e-05, "loss": 0.6093, "step": 599 }, { "epoch": 0.6518196632265073, "grad_norm": 0.45371220351570457, "learning_rate": 4.349838969404187e-05, "loss": 0.5801, "step": 600 }, { "epoch": 0.6529060293318848, "grad_norm": 0.9959132375604128, "learning_rate": 4.347826086956522e-05, "loss": 0.5611, "step": 601 }, { "epoch": 0.6539923954372624, "grad_norm": 0.33467127645792394, "learning_rate": 4.345813204508857e-05, "loss": 0.5509, "step": 602 }, { "epoch": 0.6550787615426399, "grad_norm": 0.27527491907260876, "learning_rate": 4.3438003220611914e-05, "loss": 0.5369, "step": 603 }, { "epoch": 0.6561651276480174, "grad_norm": 0.31160241536778377, "learning_rate": 4.3417874396135265e-05, "loss": 0.5924, "step": 604 }, { "epoch": 0.6572514937533949, "grad_norm": 0.2949682416157997, "learning_rate": 4.3397745571658615e-05, "loss": 0.4769, "step": 605 }, { "epoch": 0.6583378598587724, "grad_norm": 0.2770679404424011, "learning_rate": 4.3377616747181966e-05, "loss": 0.5367, "step": 606 }, { "epoch": 0.65942422596415, "grad_norm": 0.3204665833051286, "learning_rate": 4.3357487922705316e-05, "loss": 0.606, "step": 607 }, { "epoch": 0.6605105920695274, "grad_norm": 0.3533211098968133, "learning_rate": 4.333735909822867e-05, "loss": 0.5723, "step": 608 }, { "epoch": 0.6615969581749049, "grad_norm": 0.2917609640082713, "learning_rate": 4.331723027375201e-05, "loss": 0.5012, "step": 609 }, { "epoch": 0.6626833242802824, "grad_norm": 0.2668205238288225, "learning_rate": 4.329710144927536e-05, "loss": 0.618, "step": 610 }, { "epoch": 0.6637696903856599, "grad_norm": 0.28828693680972994, "learning_rate": 4.327697262479871e-05, "loss": 0.6274, "step": 611 }, { "epoch": 0.6648560564910375, "grad_norm": 0.260098042679316, "learning_rate": 4.325684380032206e-05, "loss": 0.4796, "step": 612 }, { "epoch": 0.665942422596415, "grad_norm": 0.25575299931645673, "learning_rate": 4.323671497584541e-05, "loss": 0.5901, "step": 613 }, { "epoch": 0.6670287887017925, "grad_norm": 0.2880128021028856, "learning_rate": 4.321658615136876e-05, "loss": 0.576, "step": 614 }, { "epoch": 0.66811515480717, "grad_norm": 0.2786413194140194, "learning_rate": 4.3196457326892106e-05, "loss": 0.5811, "step": 615 }, { "epoch": 0.6692015209125475, "grad_norm": 0.26729524334276594, "learning_rate": 4.317632850241546e-05, "loss": 0.5471, "step": 616 }, { "epoch": 0.6702878870179251, "grad_norm": 0.28718095411137257, "learning_rate": 4.315619967793881e-05, "loss": 0.5881, "step": 617 }, { "epoch": 0.6713742531233026, "grad_norm": 0.3111032284500772, "learning_rate": 4.313607085346216e-05, "loss": 0.5691, "step": 618 }, { "epoch": 0.6724606192286801, "grad_norm": 0.2760244366881542, "learning_rate": 4.3115942028985515e-05, "loss": 0.5492, "step": 619 }, { "epoch": 0.6735469853340575, "grad_norm": 0.2966551083640536, "learning_rate": 4.309581320450886e-05, "loss": 0.5976, "step": 620 }, { "epoch": 0.674633351439435, "grad_norm": 0.3009639569158255, "learning_rate": 4.307568438003221e-05, "loss": 0.5936, "step": 621 }, { "epoch": 0.6757197175448126, "grad_norm": 0.27819869984505674, "learning_rate": 4.305555555555556e-05, "loss": 0.535, "step": 622 }, { "epoch": 0.6768060836501901, "grad_norm": 0.3456283291792938, "learning_rate": 4.303542673107891e-05, "loss": 0.5447, "step": 623 }, { "epoch": 0.6778924497555676, "grad_norm": 0.33303970538771116, "learning_rate": 4.301529790660226e-05, "loss": 0.5438, "step": 624 }, { "epoch": 0.6789788158609451, "grad_norm": 0.2507442182218885, "learning_rate": 4.299516908212561e-05, "loss": 0.4747, "step": 625 }, { "epoch": 0.6800651819663226, "grad_norm": 0.23942940483546146, "learning_rate": 4.2975040257648954e-05, "loss": 0.4954, "step": 626 }, { "epoch": 0.6811515480717002, "grad_norm": 0.2745055507751452, "learning_rate": 4.2954911433172305e-05, "loss": 0.5863, "step": 627 }, { "epoch": 0.6822379141770777, "grad_norm": 0.2562925223272716, "learning_rate": 4.2934782608695655e-05, "loss": 0.6109, "step": 628 }, { "epoch": 0.6833242802824552, "grad_norm": 4.148414117611739, "learning_rate": 4.2914653784219006e-05, "loss": 0.6466, "step": 629 }, { "epoch": 0.6844106463878327, "grad_norm": 0.27749300521358267, "learning_rate": 4.2894524959742356e-05, "loss": 0.5413, "step": 630 }, { "epoch": 0.6854970124932103, "grad_norm": 0.2445547551747571, "learning_rate": 4.2874396135265707e-05, "loss": 0.5333, "step": 631 }, { "epoch": 0.6865833785985878, "grad_norm": 0.3178682084443685, "learning_rate": 4.285426731078905e-05, "loss": 0.5376, "step": 632 }, { "epoch": 0.6876697447039652, "grad_norm": 0.2678348745418432, "learning_rate": 4.28341384863124e-05, "loss": 0.5441, "step": 633 }, { "epoch": 0.6887561108093427, "grad_norm": 0.27485224060711755, "learning_rate": 4.281400966183575e-05, "loss": 0.5046, "step": 634 }, { "epoch": 0.6898424769147202, "grad_norm": 0.564697388495304, "learning_rate": 4.27938808373591e-05, "loss": 0.5301, "step": 635 }, { "epoch": 0.6909288430200978, "grad_norm": 0.248263515034209, "learning_rate": 4.277375201288245e-05, "loss": 0.5382, "step": 636 }, { "epoch": 0.6920152091254753, "grad_norm": 0.3251023506366872, "learning_rate": 4.27536231884058e-05, "loss": 0.6709, "step": 637 }, { "epoch": 0.6931015752308528, "grad_norm": 0.30692247332319694, "learning_rate": 4.273349436392915e-05, "loss": 0.6091, "step": 638 }, { "epoch": 0.6941879413362303, "grad_norm": 0.3080740302913298, "learning_rate": 4.2713365539452496e-05, "loss": 0.6145, "step": 639 }, { "epoch": 0.6952743074416078, "grad_norm": 0.2930566622102029, "learning_rate": 4.269323671497585e-05, "loss": 0.5392, "step": 640 }, { "epoch": 0.6963606735469854, "grad_norm": 0.3280831003008318, "learning_rate": 4.26731078904992e-05, "loss": 0.5456, "step": 641 }, { "epoch": 0.6974470396523629, "grad_norm": 0.4805492195920435, "learning_rate": 4.265297906602255e-05, "loss": 0.6066, "step": 642 }, { "epoch": 0.6985334057577404, "grad_norm": 0.24695258144938748, "learning_rate": 4.26328502415459e-05, "loss": 0.5386, "step": 643 }, { "epoch": 0.6996197718631179, "grad_norm": 0.2813947518423543, "learning_rate": 4.261272141706925e-05, "loss": 0.5343, "step": 644 }, { "epoch": 0.7007061379684953, "grad_norm": 0.2613145520740588, "learning_rate": 4.259259259259259e-05, "loss": 0.5751, "step": 645 }, { "epoch": 0.701792504073873, "grad_norm": 0.33690560235698386, "learning_rate": 4.257246376811594e-05, "loss": 0.6462, "step": 646 }, { "epoch": 0.7028788701792504, "grad_norm": 0.26066930890994916, "learning_rate": 4.255233494363929e-05, "loss": 0.5851, "step": 647 }, { "epoch": 0.7039652362846279, "grad_norm": 0.32932355783873807, "learning_rate": 4.2532206119162644e-05, "loss": 0.5593, "step": 648 }, { "epoch": 0.7050516023900054, "grad_norm": 0.25981134270654777, "learning_rate": 4.2512077294685994e-05, "loss": 0.521, "step": 649 }, { "epoch": 0.7061379684953829, "grad_norm": 0.28902924324340834, "learning_rate": 4.2491948470209345e-05, "loss": 0.4812, "step": 650 }, { "epoch": 0.7072243346007605, "grad_norm": 10.06743098365392, "learning_rate": 4.247181964573269e-05, "loss": 0.5529, "step": 651 }, { "epoch": 0.708310700706138, "grad_norm": 0.32357709611205554, "learning_rate": 4.245169082125604e-05, "loss": 0.5927, "step": 652 }, { "epoch": 0.7093970668115155, "grad_norm": 0.28431590524100436, "learning_rate": 4.243156199677939e-05, "loss": 0.5696, "step": 653 }, { "epoch": 0.710483432916893, "grad_norm": 0.32134642338686126, "learning_rate": 4.241143317230274e-05, "loss": 0.5619, "step": 654 }, { "epoch": 0.7115697990222705, "grad_norm": 4.141287580874875, "learning_rate": 4.239130434782609e-05, "loss": 0.6274, "step": 655 }, { "epoch": 0.7126561651276481, "grad_norm": 0.40772054946991687, "learning_rate": 4.237117552334944e-05, "loss": 0.5365, "step": 656 }, { "epoch": 0.7137425312330256, "grad_norm": 0.4839377300138959, "learning_rate": 4.2351046698872784e-05, "loss": 0.5836, "step": 657 }, { "epoch": 0.714828897338403, "grad_norm": 0.3820150073425682, "learning_rate": 4.2330917874396135e-05, "loss": 0.5285, "step": 658 }, { "epoch": 0.7159152634437805, "grad_norm": 0.4936137533681445, "learning_rate": 4.2310789049919485e-05, "loss": 0.5211, "step": 659 }, { "epoch": 0.717001629549158, "grad_norm": 0.3792320106926484, "learning_rate": 4.2290660225442835e-05, "loss": 0.5912, "step": 660 }, { "epoch": 0.7180879956545356, "grad_norm": 0.7745387217596819, "learning_rate": 4.2270531400966186e-05, "loss": 0.6028, "step": 661 }, { "epoch": 0.7191743617599131, "grad_norm": 0.350436352699651, "learning_rate": 4.2250402576489536e-05, "loss": 0.5977, "step": 662 }, { "epoch": 0.7202607278652906, "grad_norm": 0.35486515499727767, "learning_rate": 4.223027375201288e-05, "loss": 0.5196, "step": 663 }, { "epoch": 0.7213470939706681, "grad_norm": 3.4480854242768464, "learning_rate": 4.221014492753623e-05, "loss": 0.5667, "step": 664 }, { "epoch": 0.7224334600760456, "grad_norm": 0.3487529192798777, "learning_rate": 4.219001610305958e-05, "loss": 0.5584, "step": 665 }, { "epoch": 0.7235198261814232, "grad_norm": 0.3354723598631871, "learning_rate": 4.216988727858293e-05, "loss": 0.5588, "step": 666 }, { "epoch": 0.7246061922868007, "grad_norm": 0.2855562592142307, "learning_rate": 4.214975845410628e-05, "loss": 0.5172, "step": 667 }, { "epoch": 0.7256925583921782, "grad_norm": 0.3548328884674583, "learning_rate": 4.212962962962963e-05, "loss": 0.592, "step": 668 }, { "epoch": 0.7267789244975557, "grad_norm": 0.30593557235001395, "learning_rate": 4.210950080515298e-05, "loss": 0.568, "step": 669 }, { "epoch": 0.7278652906029331, "grad_norm": 0.2949144197714805, "learning_rate": 4.2089371980676326e-05, "loss": 0.5626, "step": 670 }, { "epoch": 0.7289516567083107, "grad_norm": 0.3025699685538706, "learning_rate": 4.206924315619968e-05, "loss": 0.5688, "step": 671 }, { "epoch": 0.7300380228136882, "grad_norm": 0.3332359621205195, "learning_rate": 4.204911433172303e-05, "loss": 0.5706, "step": 672 }, { "epoch": 0.7311243889190657, "grad_norm": 0.2731063323068049, "learning_rate": 4.202898550724638e-05, "loss": 0.6172, "step": 673 }, { "epoch": 0.7322107550244432, "grad_norm": 0.30253053785614237, "learning_rate": 4.200885668276973e-05, "loss": 0.6403, "step": 674 }, { "epoch": 0.7332971211298207, "grad_norm": 0.28734959256417525, "learning_rate": 4.198872785829308e-05, "loss": 0.5705, "step": 675 }, { "epoch": 0.7343834872351983, "grad_norm": 0.8598897412258376, "learning_rate": 4.196859903381642e-05, "loss": 0.6266, "step": 676 }, { "epoch": 0.7354698533405758, "grad_norm": 0.2979166103148701, "learning_rate": 4.194847020933977e-05, "loss": 0.553, "step": 677 }, { "epoch": 0.7365562194459533, "grad_norm": 0.29885545487366105, "learning_rate": 4.192834138486312e-05, "loss": 0.5436, "step": 678 }, { "epoch": 0.7376425855513308, "grad_norm": 0.7777338313033743, "learning_rate": 4.1908212560386474e-05, "loss": 0.5097, "step": 679 }, { "epoch": 0.7387289516567083, "grad_norm": 0.3537864916134092, "learning_rate": 4.1888083735909824e-05, "loss": 0.5453, "step": 680 }, { "epoch": 0.7398153177620859, "grad_norm": 0.30962115836605336, "learning_rate": 4.1867954911433174e-05, "loss": 0.6226, "step": 681 }, { "epoch": 0.7409016838674634, "grad_norm": 0.23431865556189305, "learning_rate": 4.1847826086956525e-05, "loss": 0.4933, "step": 682 }, { "epoch": 0.7419880499728408, "grad_norm": 0.34707703827405256, "learning_rate": 4.1827697262479875e-05, "loss": 0.6046, "step": 683 }, { "epoch": 0.7430744160782183, "grad_norm": 0.3003130154518514, "learning_rate": 4.1807568438003226e-05, "loss": 0.5359, "step": 684 }, { "epoch": 0.7441607821835958, "grad_norm": 0.24148301167863973, "learning_rate": 4.1787439613526576e-05, "loss": 0.484, "step": 685 }, { "epoch": 0.7452471482889734, "grad_norm": 0.27754409203859676, "learning_rate": 4.176731078904993e-05, "loss": 0.6043, "step": 686 }, { "epoch": 0.7463335143943509, "grad_norm": 0.2914247265101662, "learning_rate": 4.174718196457327e-05, "loss": 0.5295, "step": 687 }, { "epoch": 0.7474198804997284, "grad_norm": 0.26307319614992436, "learning_rate": 4.172705314009662e-05, "loss": 0.5792, "step": 688 }, { "epoch": 0.7485062466051059, "grad_norm": 0.2636943134950351, "learning_rate": 4.170692431561997e-05, "loss": 0.5869, "step": 689 }, { "epoch": 0.7495926127104834, "grad_norm": 0.32060950276547995, "learning_rate": 4.168679549114332e-05, "loss": 0.5199, "step": 690 }, { "epoch": 0.750678978815861, "grad_norm": 0.882187778664977, "learning_rate": 4.166666666666667e-05, "loss": 0.5321, "step": 691 }, { "epoch": 0.7517653449212385, "grad_norm": 0.37068406265032716, "learning_rate": 4.164653784219002e-05, "loss": 0.5938, "step": 692 }, { "epoch": 0.752851711026616, "grad_norm": 0.2842601411566619, "learning_rate": 4.1626409017713366e-05, "loss": 0.5123, "step": 693 }, { "epoch": 0.7539380771319935, "grad_norm": 29.667463582628482, "learning_rate": 4.1606280193236717e-05, "loss": 0.7669, "step": 694 }, { "epoch": 0.755024443237371, "grad_norm": 0.3303629750986385, "learning_rate": 4.158615136876007e-05, "loss": 0.5847, "step": 695 }, { "epoch": 0.7561108093427485, "grad_norm": 0.2870976884151802, "learning_rate": 4.156602254428342e-05, "loss": 0.5249, "step": 696 }, { "epoch": 0.757197175448126, "grad_norm": 0.30246117464201583, "learning_rate": 4.154589371980677e-05, "loss": 0.6256, "step": 697 }, { "epoch": 0.7582835415535035, "grad_norm": 0.2984752172041035, "learning_rate": 4.152576489533012e-05, "loss": 0.5286, "step": 698 }, { "epoch": 0.759369907658881, "grad_norm": 0.2437657388710987, "learning_rate": 4.150563607085346e-05, "loss": 0.5064, "step": 699 }, { "epoch": 0.7604562737642585, "grad_norm": 0.2871678664258822, "learning_rate": 4.148550724637681e-05, "loss": 0.5769, "step": 700 }, { "epoch": 0.7615426398696361, "grad_norm": 0.27773420286402434, "learning_rate": 4.146537842190016e-05, "loss": 0.5282, "step": 701 }, { "epoch": 0.7626290059750136, "grad_norm": 0.2760150022404163, "learning_rate": 4.144524959742351e-05, "loss": 0.6701, "step": 702 }, { "epoch": 0.7637153720803911, "grad_norm": 0.26064436679649233, "learning_rate": 4.1425120772946864e-05, "loss": 0.5668, "step": 703 }, { "epoch": 0.7648017381857686, "grad_norm": 0.23353643418759817, "learning_rate": 4.1404991948470214e-05, "loss": 0.5363, "step": 704 }, { "epoch": 0.7658881042911461, "grad_norm": 1.8505654240953997, "learning_rate": 4.1384863123993565e-05, "loss": 0.5797, "step": 705 }, { "epoch": 0.7669744703965237, "grad_norm": 0.26465390669871636, "learning_rate": 4.136473429951691e-05, "loss": 0.5347, "step": 706 }, { "epoch": 0.7680608365019012, "grad_norm": 0.2556544771858215, "learning_rate": 4.134460547504026e-05, "loss": 0.5965, "step": 707 }, { "epoch": 0.7691472026072786, "grad_norm": 0.2955259715923408, "learning_rate": 4.132447665056361e-05, "loss": 0.6275, "step": 708 }, { "epoch": 0.7702335687126561, "grad_norm": 0.26900052141638664, "learning_rate": 4.130434782608696e-05, "loss": 0.5649, "step": 709 }, { "epoch": 0.7713199348180336, "grad_norm": 2.3051536032921125, "learning_rate": 4.128421900161031e-05, "loss": 0.5119, "step": 710 }, { "epoch": 0.7724063009234112, "grad_norm": 0.34442888452834425, "learning_rate": 4.126409017713366e-05, "loss": 0.5422, "step": 711 }, { "epoch": 0.7734926670287887, "grad_norm": 0.2800453254159243, "learning_rate": 4.1243961352657004e-05, "loss": 0.5706, "step": 712 }, { "epoch": 0.7745790331341662, "grad_norm": 0.29699615782529737, "learning_rate": 4.1223832528180355e-05, "loss": 0.5613, "step": 713 }, { "epoch": 0.7756653992395437, "grad_norm": 0.3284880685970291, "learning_rate": 4.1203703703703705e-05, "loss": 0.6179, "step": 714 }, { "epoch": 0.7767517653449212, "grad_norm": 0.27842692219898496, "learning_rate": 4.1183574879227056e-05, "loss": 0.5164, "step": 715 }, { "epoch": 0.7778381314502988, "grad_norm": 0.29862610120293087, "learning_rate": 4.1163446054750406e-05, "loss": 0.5754, "step": 716 }, { "epoch": 0.7789244975556763, "grad_norm": 0.6885546902707601, "learning_rate": 4.1143317230273756e-05, "loss": 0.6274, "step": 717 }, { "epoch": 0.7800108636610538, "grad_norm": 0.353089688429944, "learning_rate": 4.11231884057971e-05, "loss": 0.5273, "step": 718 }, { "epoch": 0.7810972297664313, "grad_norm": 0.2990255490024318, "learning_rate": 4.110305958132045e-05, "loss": 0.5469, "step": 719 }, { "epoch": 0.7821835958718087, "grad_norm": 0.3007405257409647, "learning_rate": 4.10829307568438e-05, "loss": 0.5148, "step": 720 }, { "epoch": 0.7832699619771863, "grad_norm": 0.2621028065821319, "learning_rate": 4.106280193236715e-05, "loss": 0.5376, "step": 721 }, { "epoch": 0.7843563280825638, "grad_norm": 0.2836371974880871, "learning_rate": 4.10426731078905e-05, "loss": 0.5518, "step": 722 }, { "epoch": 0.7854426941879413, "grad_norm": 0.2530682238771518, "learning_rate": 4.102254428341385e-05, "loss": 0.5214, "step": 723 }, { "epoch": 0.7865290602933188, "grad_norm": 0.3117797259234281, "learning_rate": 4.1002415458937196e-05, "loss": 0.6318, "step": 724 }, { "epoch": 0.7876154263986963, "grad_norm": 1.2783506727386331, "learning_rate": 4.0982286634460546e-05, "loss": 0.4967, "step": 725 }, { "epoch": 0.7887017925040739, "grad_norm": 0.351953827398753, "learning_rate": 4.09621578099839e-05, "loss": 0.5285, "step": 726 }, { "epoch": 0.7897881586094514, "grad_norm": 0.4129797517777289, "learning_rate": 4.094202898550725e-05, "loss": 0.5236, "step": 727 }, { "epoch": 0.7908745247148289, "grad_norm": 0.24524228489989242, "learning_rate": 4.09219001610306e-05, "loss": 0.5479, "step": 728 }, { "epoch": 0.7919608908202064, "grad_norm": 0.3279584985690663, "learning_rate": 4.090177133655395e-05, "loss": 0.5341, "step": 729 }, { "epoch": 0.793047256925584, "grad_norm": 0.31846985536532146, "learning_rate": 4.088164251207729e-05, "loss": 0.5318, "step": 730 }, { "epoch": 0.7941336230309615, "grad_norm": 0.44577619908413463, "learning_rate": 4.086151368760064e-05, "loss": 0.4921, "step": 731 }, { "epoch": 0.795219989136339, "grad_norm": 0.28245394852767447, "learning_rate": 4.084138486312399e-05, "loss": 0.5332, "step": 732 }, { "epoch": 0.7963063552417164, "grad_norm": 0.30965446196861396, "learning_rate": 4.082125603864734e-05, "loss": 0.5385, "step": 733 }, { "epoch": 0.7973927213470939, "grad_norm": 0.2995207375891687, "learning_rate": 4.0801127214170694e-05, "loss": 0.4753, "step": 734 }, { "epoch": 0.7984790874524715, "grad_norm": 0.267591404046388, "learning_rate": 4.0780998389694044e-05, "loss": 0.5871, "step": 735 }, { "epoch": 0.799565453557849, "grad_norm": 0.27324512097255105, "learning_rate": 4.076086956521739e-05, "loss": 0.5168, "step": 736 }, { "epoch": 0.8006518196632265, "grad_norm": 0.31291686027664367, "learning_rate": 4.074074074074074e-05, "loss": 0.5831, "step": 737 }, { "epoch": 0.801738185768604, "grad_norm": 0.2676703494195483, "learning_rate": 4.072061191626409e-05, "loss": 0.5259, "step": 738 }, { "epoch": 0.8028245518739815, "grad_norm": 0.29142320017925477, "learning_rate": 4.070048309178744e-05, "loss": 0.6505, "step": 739 }, { "epoch": 0.8039109179793591, "grad_norm": 0.25798570157773554, "learning_rate": 4.068035426731079e-05, "loss": 0.5011, "step": 740 }, { "epoch": 0.8049972840847366, "grad_norm": 0.2579388774983862, "learning_rate": 4.066022544283414e-05, "loss": 0.5621, "step": 741 }, { "epoch": 0.8060836501901141, "grad_norm": 0.2628587553716643, "learning_rate": 4.064009661835749e-05, "loss": 0.593, "step": 742 }, { "epoch": 0.8071700162954916, "grad_norm": 0.2800423862719079, "learning_rate": 4.061996779388084e-05, "loss": 0.5707, "step": 743 }, { "epoch": 0.8082563824008691, "grad_norm": 0.6277306619144902, "learning_rate": 4.059983896940419e-05, "loss": 0.5732, "step": 744 }, { "epoch": 0.8093427485062467, "grad_norm": 0.279420894632876, "learning_rate": 4.057971014492754e-05, "loss": 0.5407, "step": 745 }, { "epoch": 0.8104291146116241, "grad_norm": 0.27195750719501394, "learning_rate": 4.055958132045089e-05, "loss": 0.4679, "step": 746 }, { "epoch": 0.8115154807170016, "grad_norm": 0.2596665951612251, "learning_rate": 4.053945249597424e-05, "loss": 0.5807, "step": 747 }, { "epoch": 0.8126018468223791, "grad_norm": 0.22867754917245658, "learning_rate": 4.0519323671497586e-05, "loss": 0.5163, "step": 748 }, { "epoch": 0.8136882129277566, "grad_norm": 0.2734354332312743, "learning_rate": 4.049919484702094e-05, "loss": 0.4912, "step": 749 }, { "epoch": 0.8147745790331342, "grad_norm": 0.3373999895918355, "learning_rate": 4.047906602254429e-05, "loss": 0.6212, "step": 750 }, { "epoch": 0.8158609451385117, "grad_norm": 0.26593547183004707, "learning_rate": 4.045893719806764e-05, "loss": 0.5674, "step": 751 }, { "epoch": 0.8169473112438892, "grad_norm": 0.25317640562686333, "learning_rate": 4.043880837359099e-05, "loss": 0.542, "step": 752 }, { "epoch": 0.8180336773492667, "grad_norm": 0.3249295983522979, "learning_rate": 4.041867954911434e-05, "loss": 0.6132, "step": 753 }, { "epoch": 0.8191200434546442, "grad_norm": 0.2762656133928672, "learning_rate": 4.039855072463768e-05, "loss": 0.4904, "step": 754 }, { "epoch": 0.8202064095600218, "grad_norm": 0.27176468406979454, "learning_rate": 4.037842190016103e-05, "loss": 0.6022, "step": 755 }, { "epoch": 0.8212927756653993, "grad_norm": 0.2738756937989509, "learning_rate": 4.035829307568438e-05, "loss": 0.4845, "step": 756 }, { "epoch": 0.8223791417707768, "grad_norm": 0.3076049901331222, "learning_rate": 4.0338164251207733e-05, "loss": 0.6243, "step": 757 }, { "epoch": 0.8234655078761542, "grad_norm": 0.27304331683123056, "learning_rate": 4.0318035426731084e-05, "loss": 0.5068, "step": 758 }, { "epoch": 0.8245518739815317, "grad_norm": 0.8351173493606772, "learning_rate": 4.0297906602254434e-05, "loss": 0.6477, "step": 759 }, { "epoch": 0.8256382400869093, "grad_norm": 0.28485560343327176, "learning_rate": 4.027777777777778e-05, "loss": 0.4946, "step": 760 }, { "epoch": 0.8267246061922868, "grad_norm": 0.27164777302374987, "learning_rate": 4.025764895330113e-05, "loss": 0.4577, "step": 761 }, { "epoch": 0.8278109722976643, "grad_norm": 0.3155585931757386, "learning_rate": 4.023752012882448e-05, "loss": 0.561, "step": 762 }, { "epoch": 0.8288973384030418, "grad_norm": 0.2919342491094034, "learning_rate": 4.021739130434783e-05, "loss": 0.5444, "step": 763 }, { "epoch": 0.8299837045084193, "grad_norm": 0.3005591631461311, "learning_rate": 4.019726247987118e-05, "loss": 0.561, "step": 764 }, { "epoch": 0.8310700706137969, "grad_norm": 0.3044902642260814, "learning_rate": 4.017713365539453e-05, "loss": 0.5575, "step": 765 }, { "epoch": 0.8321564367191744, "grad_norm": 0.2759865668834764, "learning_rate": 4.0157004830917874e-05, "loss": 0.5357, "step": 766 }, { "epoch": 0.8332428028245519, "grad_norm": 0.3138726453322014, "learning_rate": 4.0136876006441224e-05, "loss": 0.563, "step": 767 }, { "epoch": 0.8343291689299294, "grad_norm": 0.2780518309199237, "learning_rate": 4.0116747181964575e-05, "loss": 0.5295, "step": 768 }, { "epoch": 0.8354155350353069, "grad_norm": 0.27714462391367833, "learning_rate": 4.0096618357487925e-05, "loss": 0.5779, "step": 769 }, { "epoch": 0.8365019011406845, "grad_norm": 0.26325660690010233, "learning_rate": 4.0076489533011276e-05, "loss": 0.4725, "step": 770 }, { "epoch": 0.837588267246062, "grad_norm": 0.2634944813877524, "learning_rate": 4.0056360708534626e-05, "loss": 0.5238, "step": 771 }, { "epoch": 0.8386746333514394, "grad_norm": 0.2561444688044345, "learning_rate": 4.003623188405797e-05, "loss": 0.5589, "step": 772 }, { "epoch": 0.8397609994568169, "grad_norm": 0.2447050467872321, "learning_rate": 4.001610305958132e-05, "loss": 0.5636, "step": 773 }, { "epoch": 0.8408473655621944, "grad_norm": 0.2664646704812476, "learning_rate": 3.999597423510467e-05, "loss": 0.5259, "step": 774 }, { "epoch": 0.841933731667572, "grad_norm": 0.28818614923540936, "learning_rate": 3.997584541062802e-05, "loss": 0.4316, "step": 775 }, { "epoch": 0.8430200977729495, "grad_norm": 0.23896546481470265, "learning_rate": 3.995571658615137e-05, "loss": 0.5619, "step": 776 }, { "epoch": 0.844106463878327, "grad_norm": 0.28549168034119277, "learning_rate": 3.993558776167472e-05, "loss": 0.5531, "step": 777 }, { "epoch": 0.8451928299837045, "grad_norm": 0.27998258283366667, "learning_rate": 3.991545893719807e-05, "loss": 0.5759, "step": 778 }, { "epoch": 0.846279196089082, "grad_norm": 1.9349665230753847, "learning_rate": 3.9895330112721416e-05, "loss": 0.5544, "step": 779 }, { "epoch": 0.8473655621944596, "grad_norm": 0.3637038589522213, "learning_rate": 3.9875201288244767e-05, "loss": 0.5406, "step": 780 }, { "epoch": 0.8484519282998371, "grad_norm": 0.3491846242813757, "learning_rate": 3.985507246376812e-05, "loss": 0.5139, "step": 781 }, { "epoch": 0.8495382944052146, "grad_norm": 0.2563099624280818, "learning_rate": 3.983494363929147e-05, "loss": 0.5729, "step": 782 }, { "epoch": 0.850624660510592, "grad_norm": 0.25842473740508304, "learning_rate": 3.981481481481482e-05, "loss": 0.5132, "step": 783 }, { "epoch": 0.8517110266159695, "grad_norm": 0.3059660885393781, "learning_rate": 3.979468599033817e-05, "loss": 0.5527, "step": 784 }, { "epoch": 0.8527973927213471, "grad_norm": 1.0542173111233135, "learning_rate": 3.977455716586151e-05, "loss": 0.4873, "step": 785 }, { "epoch": 0.8538837588267246, "grad_norm": 0.3052538264864143, "learning_rate": 3.975442834138486e-05, "loss": 0.5147, "step": 786 }, { "epoch": 0.8549701249321021, "grad_norm": 0.2541812658099018, "learning_rate": 3.973429951690821e-05, "loss": 0.4724, "step": 787 }, { "epoch": 0.8560564910374796, "grad_norm": 0.28090548583817215, "learning_rate": 3.971417069243156e-05, "loss": 0.5251, "step": 788 }, { "epoch": 0.8571428571428571, "grad_norm": 5.986279093221173, "learning_rate": 3.9694041867954914e-05, "loss": 0.6416, "step": 789 }, { "epoch": 0.8582292232482347, "grad_norm": 0.3159345616870806, "learning_rate": 3.9673913043478264e-05, "loss": 0.5181, "step": 790 }, { "epoch": 0.8593155893536122, "grad_norm": 0.2891626599323767, "learning_rate": 3.965378421900161e-05, "loss": 0.5563, "step": 791 }, { "epoch": 0.8604019554589897, "grad_norm": 0.2698024972051373, "learning_rate": 3.963365539452496e-05, "loss": 0.5243, "step": 792 }, { "epoch": 0.8614883215643672, "grad_norm": 0.2691491594099145, "learning_rate": 3.961352657004831e-05, "loss": 0.5232, "step": 793 }, { "epoch": 0.8625746876697447, "grad_norm": 0.300118408389596, "learning_rate": 3.959339774557166e-05, "loss": 0.5348, "step": 794 }, { "epoch": 0.8636610537751223, "grad_norm": 0.24360983155558202, "learning_rate": 3.957326892109501e-05, "loss": 0.4706, "step": 795 }, { "epoch": 0.8647474198804997, "grad_norm": 0.27876429465876457, "learning_rate": 3.955314009661836e-05, "loss": 0.6121, "step": 796 }, { "epoch": 0.8658337859858772, "grad_norm": 0.32048787842915216, "learning_rate": 3.9533011272141704e-05, "loss": 0.603, "step": 797 }, { "epoch": 0.8669201520912547, "grad_norm": 0.2691288364532005, "learning_rate": 3.9512882447665054e-05, "loss": 0.535, "step": 798 }, { "epoch": 0.8680065181966322, "grad_norm": 0.28140683262034155, "learning_rate": 3.9492753623188405e-05, "loss": 0.5671, "step": 799 }, { "epoch": 0.8690928843020098, "grad_norm": 0.32761809255819196, "learning_rate": 3.9472624798711755e-05, "loss": 0.6042, "step": 800 }, { "epoch": 0.8701792504073873, "grad_norm": 0.23858784137076766, "learning_rate": 3.9452495974235105e-05, "loss": 0.4949, "step": 801 }, { "epoch": 0.8712656165127648, "grad_norm": 0.2904536221876138, "learning_rate": 3.9432367149758456e-05, "loss": 0.5168, "step": 802 }, { "epoch": 0.8723519826181423, "grad_norm": 0.3367038400457924, "learning_rate": 3.94122383252818e-05, "loss": 0.5793, "step": 803 }, { "epoch": 0.8734383487235198, "grad_norm": 0.28431769742835755, "learning_rate": 3.939210950080515e-05, "loss": 0.613, "step": 804 }, { "epoch": 0.8745247148288974, "grad_norm": 0.6030947344030734, "learning_rate": 3.937198067632851e-05, "loss": 0.5526, "step": 805 }, { "epoch": 0.8756110809342749, "grad_norm": 0.2952144015751339, "learning_rate": 3.935185185185186e-05, "loss": 0.5189, "step": 806 }, { "epoch": 0.8766974470396524, "grad_norm": 0.3303125485330522, "learning_rate": 3.933172302737521e-05, "loss": 0.5044, "step": 807 }, { "epoch": 0.8777838131450298, "grad_norm": 0.31265824209859816, "learning_rate": 3.931159420289855e-05, "loss": 0.5501, "step": 808 }, { "epoch": 0.8788701792504073, "grad_norm": 0.3241930450104132, "learning_rate": 3.92914653784219e-05, "loss": 0.5621, "step": 809 }, { "epoch": 0.8799565453557849, "grad_norm": 0.27846750494247946, "learning_rate": 3.927133655394525e-05, "loss": 0.6187, "step": 810 }, { "epoch": 0.8810429114611624, "grad_norm": 0.5266449715958925, "learning_rate": 3.92512077294686e-05, "loss": 0.5277, "step": 811 }, { "epoch": 0.8821292775665399, "grad_norm": 0.33734180034044525, "learning_rate": 3.9231078904991954e-05, "loss": 0.5523, "step": 812 }, { "epoch": 0.8832156436719174, "grad_norm": 3.17269524327637, "learning_rate": 3.9210950080515304e-05, "loss": 0.6684, "step": 813 }, { "epoch": 0.8843020097772949, "grad_norm": 0.4098966654501822, "learning_rate": 3.9190821256038654e-05, "loss": 0.5407, "step": 814 }, { "epoch": 0.8853883758826725, "grad_norm": 1.105932834262566, "learning_rate": 3.9170692431562e-05, "loss": 0.6339, "step": 815 }, { "epoch": 0.88647474198805, "grad_norm": 0.41580563407109405, "learning_rate": 3.915056360708535e-05, "loss": 0.5001, "step": 816 }, { "epoch": 0.8875611080934275, "grad_norm": 0.45334240314324514, "learning_rate": 3.91304347826087e-05, "loss": 0.5716, "step": 817 }, { "epoch": 0.888647474198805, "grad_norm": 0.3456171398300509, "learning_rate": 3.911030595813205e-05, "loss": 0.5722, "step": 818 }, { "epoch": 0.8897338403041825, "grad_norm": 0.2946860584245836, "learning_rate": 3.90901771336554e-05, "loss": 0.5467, "step": 819 }, { "epoch": 0.8908202064095601, "grad_norm": 0.30957197446505724, "learning_rate": 3.907004830917875e-05, "loss": 0.5011, "step": 820 }, { "epoch": 0.8919065725149375, "grad_norm": 0.37930843278290544, "learning_rate": 3.9049919484702094e-05, "loss": 0.6059, "step": 821 }, { "epoch": 0.892992938620315, "grad_norm": 0.2864189452919919, "learning_rate": 3.9029790660225444e-05, "loss": 0.5436, "step": 822 }, { "epoch": 0.8940793047256925, "grad_norm": 0.3469618366978588, "learning_rate": 3.9009661835748795e-05, "loss": 0.5402, "step": 823 }, { "epoch": 0.8951656708310701, "grad_norm": 0.27308470750485514, "learning_rate": 3.8989533011272145e-05, "loss": 0.5874, "step": 824 }, { "epoch": 0.8962520369364476, "grad_norm": 0.26739951408596463, "learning_rate": 3.8969404186795496e-05, "loss": 0.5799, "step": 825 }, { "epoch": 0.8973384030418251, "grad_norm": 0.2435089076313175, "learning_rate": 3.8949275362318846e-05, "loss": 0.5216, "step": 826 }, { "epoch": 0.8984247691472026, "grad_norm": 0.2445531005258342, "learning_rate": 3.892914653784219e-05, "loss": 0.5154, "step": 827 }, { "epoch": 0.8995111352525801, "grad_norm": 0.38326743485896986, "learning_rate": 3.890901771336554e-05, "loss": 0.523, "step": 828 }, { "epoch": 0.9005975013579577, "grad_norm": 0.23257384279769122, "learning_rate": 3.888888888888889e-05, "loss": 0.5053, "step": 829 }, { "epoch": 0.9016838674633352, "grad_norm": 0.3276227922620216, "learning_rate": 3.886876006441224e-05, "loss": 0.7427, "step": 830 }, { "epoch": 0.9027702335687127, "grad_norm": 0.2736201170561869, "learning_rate": 3.884863123993559e-05, "loss": 0.5225, "step": 831 }, { "epoch": 0.9038565996740902, "grad_norm": 0.27826746194927593, "learning_rate": 3.882850241545894e-05, "loss": 0.4752, "step": 832 }, { "epoch": 0.9049429657794676, "grad_norm": 0.29201227049783846, "learning_rate": 3.8808373590982286e-05, "loss": 0.6368, "step": 833 }, { "epoch": 0.9060293318848452, "grad_norm": 0.3726147537228803, "learning_rate": 3.8788244766505636e-05, "loss": 0.5181, "step": 834 }, { "epoch": 0.9071156979902227, "grad_norm": 0.30046356945077135, "learning_rate": 3.876811594202899e-05, "loss": 0.6118, "step": 835 }, { "epoch": 0.9082020640956002, "grad_norm": 0.6460825463811598, "learning_rate": 3.874798711755234e-05, "loss": 0.5618, "step": 836 }, { "epoch": 0.9092884302009777, "grad_norm": 0.44054082109898834, "learning_rate": 3.872785829307569e-05, "loss": 0.5724, "step": 837 }, { "epoch": 0.9103747963063552, "grad_norm": 0.2665558440387349, "learning_rate": 3.870772946859904e-05, "loss": 0.5213, "step": 838 }, { "epoch": 0.9114611624117328, "grad_norm": 0.35219817364106676, "learning_rate": 3.868760064412238e-05, "loss": 0.5968, "step": 839 }, { "epoch": 0.9125475285171103, "grad_norm": 0.33520414331638465, "learning_rate": 3.866747181964573e-05, "loss": 0.4987, "step": 840 }, { "epoch": 0.9136338946224878, "grad_norm": 0.3054098280062182, "learning_rate": 3.864734299516908e-05, "loss": 0.4813, "step": 841 }, { "epoch": 0.9147202607278653, "grad_norm": 0.3329995154169293, "learning_rate": 3.862721417069243e-05, "loss": 0.6152, "step": 842 }, { "epoch": 0.9158066268332428, "grad_norm": 0.779884666008192, "learning_rate": 3.8607085346215783e-05, "loss": 0.495, "step": 843 }, { "epoch": 0.9168929929386204, "grad_norm": 0.31199834063165727, "learning_rate": 3.8586956521739134e-05, "loss": 0.5148, "step": 844 }, { "epoch": 0.9179793590439979, "grad_norm": 0.2665302851108355, "learning_rate": 3.8566827697262484e-05, "loss": 0.5744, "step": 845 }, { "epoch": 0.9190657251493753, "grad_norm": 0.26168032902447086, "learning_rate": 3.854669887278583e-05, "loss": 0.5375, "step": 846 }, { "epoch": 0.9201520912547528, "grad_norm": 0.2714101611630496, "learning_rate": 3.852657004830918e-05, "loss": 0.5169, "step": 847 }, { "epoch": 0.9212384573601303, "grad_norm": 0.2664585646979702, "learning_rate": 3.850644122383253e-05, "loss": 0.5557, "step": 848 }, { "epoch": 0.9223248234655079, "grad_norm": 0.28227328691444986, "learning_rate": 3.848631239935588e-05, "loss": 0.559, "step": 849 }, { "epoch": 0.9234111895708854, "grad_norm": 0.29995318636325385, "learning_rate": 3.846618357487923e-05, "loss": 0.6207, "step": 850 }, { "epoch": 0.9244975556762629, "grad_norm": 0.29948763437250714, "learning_rate": 3.844605475040258e-05, "loss": 0.5597, "step": 851 }, { "epoch": 0.9255839217816404, "grad_norm": 0.2605250325002057, "learning_rate": 3.8425925925925924e-05, "loss": 0.4881, "step": 852 }, { "epoch": 0.9266702878870179, "grad_norm": 0.29920604759066255, "learning_rate": 3.8405797101449274e-05, "loss": 0.5417, "step": 853 }, { "epoch": 0.9277566539923955, "grad_norm": 0.32839114730597735, "learning_rate": 3.8385668276972625e-05, "loss": 0.5751, "step": 854 }, { "epoch": 0.928843020097773, "grad_norm": 0.28790423606382204, "learning_rate": 3.8365539452495975e-05, "loss": 0.6164, "step": 855 }, { "epoch": 0.9299293862031505, "grad_norm": 0.23874509104722436, "learning_rate": 3.8345410628019326e-05, "loss": 0.5191, "step": 856 }, { "epoch": 0.931015752308528, "grad_norm": 0.2627320148417092, "learning_rate": 3.8325281803542676e-05, "loss": 0.4872, "step": 857 }, { "epoch": 0.9321021184139054, "grad_norm": 0.25642545267846867, "learning_rate": 3.830515297906602e-05, "loss": 0.545, "step": 858 }, { "epoch": 0.933188484519283, "grad_norm": 0.25718435782474786, "learning_rate": 3.828502415458937e-05, "loss": 0.5494, "step": 859 }, { "epoch": 0.9342748506246605, "grad_norm": 0.6172907298391395, "learning_rate": 3.826489533011272e-05, "loss": 0.5277, "step": 860 }, { "epoch": 0.935361216730038, "grad_norm": 0.24800078440410356, "learning_rate": 3.824476650563607e-05, "loss": 0.5283, "step": 861 }, { "epoch": 0.9364475828354155, "grad_norm": 0.2823389235125623, "learning_rate": 3.822463768115942e-05, "loss": 0.5274, "step": 862 }, { "epoch": 0.937533948940793, "grad_norm": 0.3082763173912555, "learning_rate": 3.820450885668277e-05, "loss": 0.5459, "step": 863 }, { "epoch": 0.9386203150461706, "grad_norm": 0.267653230330239, "learning_rate": 3.8184380032206116e-05, "loss": 0.5713, "step": 864 }, { "epoch": 0.9397066811515481, "grad_norm": 0.2734505377795628, "learning_rate": 3.8164251207729466e-05, "loss": 0.5513, "step": 865 }, { "epoch": 0.9407930472569256, "grad_norm": 0.2641638972475421, "learning_rate": 3.8144122383252816e-05, "loss": 0.5896, "step": 866 }, { "epoch": 0.9418794133623031, "grad_norm": 0.2754629954300957, "learning_rate": 3.8123993558776174e-05, "loss": 0.5979, "step": 867 }, { "epoch": 0.9429657794676806, "grad_norm": 0.26567082466526026, "learning_rate": 3.8103864734299524e-05, "loss": 0.489, "step": 868 }, { "epoch": 0.9440521455730582, "grad_norm": 0.2585909864929331, "learning_rate": 3.808373590982287e-05, "loss": 0.5074, "step": 869 }, { "epoch": 0.9451385116784357, "grad_norm": 0.24156530514115848, "learning_rate": 3.806360708534622e-05, "loss": 0.5568, "step": 870 }, { "epoch": 0.9462248777838131, "grad_norm": 0.23281443785168907, "learning_rate": 3.804347826086957e-05, "loss": 0.5003, "step": 871 }, { "epoch": 0.9473112438891906, "grad_norm": 0.2620891791410109, "learning_rate": 3.802334943639292e-05, "loss": 0.5516, "step": 872 }, { "epoch": 0.9483976099945681, "grad_norm": 0.24772693112508248, "learning_rate": 3.800322061191627e-05, "loss": 0.5459, "step": 873 }, { "epoch": 0.9494839760999457, "grad_norm": 0.22859138414284813, "learning_rate": 3.798309178743962e-05, "loss": 0.5414, "step": 874 }, { "epoch": 0.9505703422053232, "grad_norm": 0.2718888975262205, "learning_rate": 3.7962962962962964e-05, "loss": 0.5941, "step": 875 }, { "epoch": 0.9516567083107007, "grad_norm": 0.2487419947850102, "learning_rate": 3.7942834138486314e-05, "loss": 0.4687, "step": 876 }, { "epoch": 0.9527430744160782, "grad_norm": 0.22535334762887924, "learning_rate": 3.7922705314009665e-05, "loss": 0.5213, "step": 877 }, { "epoch": 0.9538294405214557, "grad_norm": 0.24224751617307666, "learning_rate": 3.7902576489533015e-05, "loss": 0.5831, "step": 878 }, { "epoch": 0.9549158066268333, "grad_norm": 0.2521081830607822, "learning_rate": 3.7882447665056365e-05, "loss": 0.5164, "step": 879 }, { "epoch": 0.9560021727322108, "grad_norm": 3.9991081123768124, "learning_rate": 3.7862318840579716e-05, "loss": 0.7453, "step": 880 }, { "epoch": 0.9570885388375883, "grad_norm": 0.2932998791269109, "learning_rate": 3.784219001610306e-05, "loss": 0.6079, "step": 881 }, { "epoch": 0.9581749049429658, "grad_norm": 0.2605584762457054, "learning_rate": 3.782206119162641e-05, "loss": 0.523, "step": 882 }, { "epoch": 0.9592612710483432, "grad_norm": 0.240875413163794, "learning_rate": 3.780193236714976e-05, "loss": 0.5719, "step": 883 }, { "epoch": 0.9603476371537208, "grad_norm": 0.22759281982576135, "learning_rate": 3.778180354267311e-05, "loss": 0.4943, "step": 884 }, { "epoch": 0.9614340032590983, "grad_norm": 0.2615465549683056, "learning_rate": 3.776167471819646e-05, "loss": 0.552, "step": 885 }, { "epoch": 0.9625203693644758, "grad_norm": 0.6028819706470147, "learning_rate": 3.774154589371981e-05, "loss": 0.5435, "step": 886 }, { "epoch": 0.9636067354698533, "grad_norm": 0.2908371673122274, "learning_rate": 3.772141706924316e-05, "loss": 0.6058, "step": 887 }, { "epoch": 0.9646931015752308, "grad_norm": 0.2700917450128883, "learning_rate": 3.7701288244766506e-05, "loss": 0.5147, "step": 888 }, { "epoch": 0.9657794676806084, "grad_norm": 0.2518948467419895, "learning_rate": 3.7681159420289856e-05, "loss": 0.5738, "step": 889 }, { "epoch": 0.9668658337859859, "grad_norm": 0.31501462606152025, "learning_rate": 3.766103059581321e-05, "loss": 0.5252, "step": 890 }, { "epoch": 0.9679521998913634, "grad_norm": 0.24484109596445042, "learning_rate": 3.764090177133656e-05, "loss": 0.599, "step": 891 }, { "epoch": 0.9690385659967409, "grad_norm": 0.22688476409362932, "learning_rate": 3.762077294685991e-05, "loss": 0.4685, "step": 892 }, { "epoch": 0.9701249321021184, "grad_norm": 0.22097250075824373, "learning_rate": 3.760064412238326e-05, "loss": 0.4721, "step": 893 }, { "epoch": 0.971211298207496, "grad_norm": 0.24504159711349122, "learning_rate": 3.75805152979066e-05, "loss": 0.5251, "step": 894 }, { "epoch": 0.9722976643128735, "grad_norm": 0.23695843649931325, "learning_rate": 3.756038647342995e-05, "loss": 0.4763, "step": 895 }, { "epoch": 0.973384030418251, "grad_norm": 0.27234318383339046, "learning_rate": 3.75402576489533e-05, "loss": 0.5785, "step": 896 }, { "epoch": 0.9744703965236284, "grad_norm": 0.6488538672663117, "learning_rate": 3.752012882447665e-05, "loss": 0.5613, "step": 897 }, { "epoch": 0.9755567626290059, "grad_norm": 0.25548516797547005, "learning_rate": 3.7500000000000003e-05, "loss": 0.5753, "step": 898 }, { "epoch": 0.9766431287343835, "grad_norm": 0.3082329285442018, "learning_rate": 3.7479871175523354e-05, "loss": 0.5909, "step": 899 }, { "epoch": 0.977729494839761, "grad_norm": 0.26370178562911084, "learning_rate": 3.74597423510467e-05, "loss": 0.571, "step": 900 }, { "epoch": 0.9788158609451385, "grad_norm": 0.24925397264889632, "learning_rate": 3.743961352657005e-05, "loss": 0.6022, "step": 901 }, { "epoch": 0.979902227050516, "grad_norm": 0.2763115926457623, "learning_rate": 3.74194847020934e-05, "loss": 0.5455, "step": 902 }, { "epoch": 0.9809885931558935, "grad_norm": 0.2742224074218038, "learning_rate": 3.739935587761675e-05, "loss": 0.6229, "step": 903 }, { "epoch": 0.9820749592612711, "grad_norm": 0.23670004690599916, "learning_rate": 3.73792270531401e-05, "loss": 0.5075, "step": 904 }, { "epoch": 0.9831613253666486, "grad_norm": 0.25696158769409755, "learning_rate": 3.735909822866345e-05, "loss": 0.5676, "step": 905 }, { "epoch": 0.9842476914720261, "grad_norm": 0.23855222352471606, "learning_rate": 3.7338969404186793e-05, "loss": 0.5085, "step": 906 }, { "epoch": 0.9853340575774036, "grad_norm": 0.23452009798518011, "learning_rate": 3.7318840579710144e-05, "loss": 0.4979, "step": 907 }, { "epoch": 0.986420423682781, "grad_norm": 0.240404735587708, "learning_rate": 3.7298711755233494e-05, "loss": 0.4641, "step": 908 }, { "epoch": 0.9875067897881586, "grad_norm": 2.1472734696369393, "learning_rate": 3.7278582930756845e-05, "loss": 0.5864, "step": 909 }, { "epoch": 0.9885931558935361, "grad_norm": 0.2912148664574692, "learning_rate": 3.7258454106280195e-05, "loss": 0.5234, "step": 910 }, { "epoch": 0.9896795219989136, "grad_norm": 0.29667857958763333, "learning_rate": 3.7238325281803546e-05, "loss": 0.5028, "step": 911 }, { "epoch": 0.9907658881042911, "grad_norm": 0.287979358568837, "learning_rate": 3.721819645732689e-05, "loss": 0.6362, "step": 912 }, { "epoch": 0.9918522542096686, "grad_norm": 0.2617309031728676, "learning_rate": 3.719806763285024e-05, "loss": 0.5498, "step": 913 }, { "epoch": 0.9929386203150462, "grad_norm": 0.3574780233046042, "learning_rate": 3.717793880837359e-05, "loss": 0.5742, "step": 914 }, { "epoch": 0.9940249864204237, "grad_norm": 0.2784038322157246, "learning_rate": 3.715780998389694e-05, "loss": 0.4891, "step": 915 }, { "epoch": 0.9951113525258012, "grad_norm": 0.23602271223620394, "learning_rate": 3.713768115942029e-05, "loss": 0.4983, "step": 916 }, { "epoch": 0.9961977186311787, "grad_norm": 0.5091371906601059, "learning_rate": 3.711755233494364e-05, "loss": 0.5458, "step": 917 }, { "epoch": 0.9972840847365562, "grad_norm": 0.3093279727765414, "learning_rate": 3.709742351046699e-05, "loss": 0.5173, "step": 918 }, { "epoch": 0.9983704508419338, "grad_norm": 0.2509943034194716, "learning_rate": 3.7077294685990336e-05, "loss": 0.5103, "step": 919 }, { "epoch": 0.9994568169473113, "grad_norm": 0.24539182493669096, "learning_rate": 3.7057165861513686e-05, "loss": 0.4749, "step": 920 }, { "epoch": 1.0, "grad_norm": 0.24539182493669096, "learning_rate": 3.7037037037037037e-05, "loss": 0.4854, "step": 921 }, { "epoch": 1.0010863661053775, "grad_norm": 4.096581733764304, "learning_rate": 3.701690821256039e-05, "loss": 0.4867, "step": 922 }, { "epoch": 1.002172732210755, "grad_norm": 0.3435586900291524, "learning_rate": 3.699677938808374e-05, "loss": 0.4873, "step": 923 }, { "epoch": 1.0032590983161325, "grad_norm": 0.3223990065699192, "learning_rate": 3.697665056360709e-05, "loss": 0.4863, "step": 924 }, { "epoch": 1.00434546442151, "grad_norm": 0.23912484595723676, "learning_rate": 3.695652173913043e-05, "loss": 0.4973, "step": 925 }, { "epoch": 1.0054318305268877, "grad_norm": 0.2832473400121941, "learning_rate": 3.693639291465378e-05, "loss": 0.4484, "step": 926 }, { "epoch": 1.0065181966322652, "grad_norm": 0.3273234821057411, "learning_rate": 3.691626409017713e-05, "loss": 0.4272, "step": 927 }, { "epoch": 1.0076045627376427, "grad_norm": 0.2607194066296622, "learning_rate": 3.689613526570048e-05, "loss": 0.4118, "step": 928 }, { "epoch": 1.0086909288430201, "grad_norm": 0.2670304957317674, "learning_rate": 3.687600644122383e-05, "loss": 0.4473, "step": 929 }, { "epoch": 1.0097772949483976, "grad_norm": 0.31827466150125117, "learning_rate": 3.6855877616747184e-05, "loss": 0.4522, "step": 930 }, { "epoch": 1.0108636610537751, "grad_norm": 0.307721316032473, "learning_rate": 3.6835748792270534e-05, "loss": 0.3965, "step": 931 }, { "epoch": 1.0119500271591526, "grad_norm": 0.28012644181466917, "learning_rate": 3.6815619967793885e-05, "loss": 0.4469, "step": 932 }, { "epoch": 1.01303639326453, "grad_norm": 0.33371667952722917, "learning_rate": 3.6795491143317235e-05, "loss": 0.3987, "step": 933 }, { "epoch": 1.0141227593699076, "grad_norm": 0.2677898790956546, "learning_rate": 3.6775362318840586e-05, "loss": 0.4757, "step": 934 }, { "epoch": 1.015209125475285, "grad_norm": 0.2843685827672019, "learning_rate": 3.6755233494363936e-05, "loss": 0.3915, "step": 935 }, { "epoch": 1.0162954915806628, "grad_norm": 1.1185190137722616, "learning_rate": 3.673510466988728e-05, "loss": 0.4451, "step": 936 }, { "epoch": 1.0173818576860403, "grad_norm": 0.25735817622671914, "learning_rate": 3.671497584541063e-05, "loss": 0.4685, "step": 937 }, { "epoch": 1.0184682237914178, "grad_norm": 0.5928408122212683, "learning_rate": 3.669484702093398e-05, "loss": 0.4129, "step": 938 }, { "epoch": 1.0195545898967953, "grad_norm": 0.26726557648992094, "learning_rate": 3.667471819645733e-05, "loss": 0.4442, "step": 939 }, { "epoch": 1.0206409560021728, "grad_norm": 0.2699055021154356, "learning_rate": 3.665458937198068e-05, "loss": 0.4699, "step": 940 }, { "epoch": 1.0217273221075502, "grad_norm": 0.24211685500193517, "learning_rate": 3.663446054750403e-05, "loss": 0.4398, "step": 941 }, { "epoch": 1.0228136882129277, "grad_norm": 0.31310134283194324, "learning_rate": 3.6614331723027376e-05, "loss": 0.4312, "step": 942 }, { "epoch": 1.0239000543183052, "grad_norm": 0.32279784176643145, "learning_rate": 3.6594202898550726e-05, "loss": 0.4484, "step": 943 }, { "epoch": 1.0249864204236827, "grad_norm": 0.3074982922839508, "learning_rate": 3.6574074074074076e-05, "loss": 0.5518, "step": 944 }, { "epoch": 1.0260727865290602, "grad_norm": 0.28536701182671503, "learning_rate": 3.655394524959743e-05, "loss": 0.3793, "step": 945 }, { "epoch": 1.027159152634438, "grad_norm": 0.26792845253716624, "learning_rate": 3.653381642512078e-05, "loss": 0.4508, "step": 946 }, { "epoch": 1.0282455187398154, "grad_norm": 0.3037030511229976, "learning_rate": 3.651368760064413e-05, "loss": 0.4564, "step": 947 }, { "epoch": 1.029331884845193, "grad_norm": 0.328929697755862, "learning_rate": 3.649355877616747e-05, "loss": 0.3944, "step": 948 }, { "epoch": 1.0304182509505704, "grad_norm": 0.27170048089409865, "learning_rate": 3.647342995169082e-05, "loss": 0.4381, "step": 949 }, { "epoch": 1.0315046170559479, "grad_norm": 0.3377913531806924, "learning_rate": 3.645330112721417e-05, "loss": 0.4892, "step": 950 }, { "epoch": 1.0325909831613254, "grad_norm": 0.2685225106528555, "learning_rate": 3.643317230273752e-05, "loss": 0.5149, "step": 951 }, { "epoch": 1.0336773492667028, "grad_norm": 0.29481075007632196, "learning_rate": 3.641304347826087e-05, "loss": 0.4794, "step": 952 }, { "epoch": 1.0347637153720803, "grad_norm": 0.31931657158388616, "learning_rate": 3.6392914653784224e-05, "loss": 0.413, "step": 953 }, { "epoch": 1.0358500814774578, "grad_norm": 0.2405636710903745, "learning_rate": 3.6372785829307574e-05, "loss": 0.4354, "step": 954 }, { "epoch": 1.0369364475828353, "grad_norm": 0.971669897089315, "learning_rate": 3.635265700483092e-05, "loss": 0.461, "step": 955 }, { "epoch": 1.038022813688213, "grad_norm": 0.7447885545182876, "learning_rate": 3.633252818035427e-05, "loss": 0.4158, "step": 956 }, { "epoch": 1.0391091797935905, "grad_norm": 0.25476025451992845, "learning_rate": 3.631239935587762e-05, "loss": 0.4986, "step": 957 }, { "epoch": 1.040195545898968, "grad_norm": 0.36399898709676215, "learning_rate": 3.629227053140097e-05, "loss": 0.4673, "step": 958 }, { "epoch": 1.0412819120043455, "grad_norm": 0.33208195778584965, "learning_rate": 3.627214170692432e-05, "loss": 0.4841, "step": 959 }, { "epoch": 1.042368278109723, "grad_norm": 6.313492404926699, "learning_rate": 3.625201288244767e-05, "loss": 0.4937, "step": 960 }, { "epoch": 1.0434546442151005, "grad_norm": 0.4241692514823635, "learning_rate": 3.6231884057971014e-05, "loss": 0.4561, "step": 961 }, { "epoch": 1.044541010320478, "grad_norm": 0.31514005440762083, "learning_rate": 3.6211755233494364e-05, "loss": 0.4631, "step": 962 }, { "epoch": 1.0456273764258555, "grad_norm": 0.32359834584670044, "learning_rate": 3.6191626409017714e-05, "loss": 0.4819, "step": 963 }, { "epoch": 1.046713742531233, "grad_norm": 0.4134068210078399, "learning_rate": 3.6171497584541065e-05, "loss": 0.5035, "step": 964 }, { "epoch": 1.0478001086366104, "grad_norm": 0.29391394705663304, "learning_rate": 3.6151368760064415e-05, "loss": 0.4218, "step": 965 }, { "epoch": 1.0488864747419882, "grad_norm": 0.2780114957501375, "learning_rate": 3.6131239935587766e-05, "loss": 0.5046, "step": 966 }, { "epoch": 1.0499728408473656, "grad_norm": 0.3052404706631375, "learning_rate": 3.611111111111111e-05, "loss": 0.4193, "step": 967 }, { "epoch": 1.0510592069527431, "grad_norm": 0.2504788204166979, "learning_rate": 3.609098228663446e-05, "loss": 0.4068, "step": 968 }, { "epoch": 1.0521455730581206, "grad_norm": 0.28995932371816807, "learning_rate": 3.607085346215781e-05, "loss": 0.4191, "step": 969 }, { "epoch": 1.053231939163498, "grad_norm": 0.24514600915570084, "learning_rate": 3.605072463768116e-05, "loss": 0.4632, "step": 970 }, { "epoch": 1.0543183052688756, "grad_norm": 0.32785819946389044, "learning_rate": 3.603059581320451e-05, "loss": 0.454, "step": 971 }, { "epoch": 1.055404671374253, "grad_norm": 0.27765171110729575, "learning_rate": 3.601046698872786e-05, "loss": 0.4389, "step": 972 }, { "epoch": 1.0564910374796306, "grad_norm": 0.24233618019673328, "learning_rate": 3.5990338164251205e-05, "loss": 0.4619, "step": 973 }, { "epoch": 1.057577403585008, "grad_norm": 0.2682089480603506, "learning_rate": 3.5970209339774556e-05, "loss": 0.3952, "step": 974 }, { "epoch": 1.0586637696903856, "grad_norm": 0.27610067399352006, "learning_rate": 3.5950080515297906e-05, "loss": 0.5048, "step": 975 }, { "epoch": 1.0597501357957633, "grad_norm": 0.23737218983439326, "learning_rate": 3.592995169082126e-05, "loss": 0.4726, "step": 976 }, { "epoch": 1.0608365019011408, "grad_norm": 0.2734857573904458, "learning_rate": 3.590982286634461e-05, "loss": 0.4498, "step": 977 }, { "epoch": 1.0619228680065183, "grad_norm": 2.501895793515387, "learning_rate": 3.588969404186796e-05, "loss": 0.5228, "step": 978 }, { "epoch": 1.0630092341118957, "grad_norm": 0.28632085041122474, "learning_rate": 3.58695652173913e-05, "loss": 0.4177, "step": 979 }, { "epoch": 1.0640956002172732, "grad_norm": 0.24189545428699696, "learning_rate": 3.584943639291465e-05, "loss": 0.4614, "step": 980 }, { "epoch": 1.0651819663226507, "grad_norm": 0.26493019395302925, "learning_rate": 3.5829307568438e-05, "loss": 0.5009, "step": 981 }, { "epoch": 1.0662683324280282, "grad_norm": 0.24511781798907026, "learning_rate": 3.580917874396135e-05, "loss": 0.5097, "step": 982 }, { "epoch": 1.0673546985334057, "grad_norm": 0.24675346739299348, "learning_rate": 3.57890499194847e-05, "loss": 0.3797, "step": 983 }, { "epoch": 1.0684410646387832, "grad_norm": 0.22629341341775236, "learning_rate": 3.5768921095008053e-05, "loss": 0.4278, "step": 984 }, { "epoch": 1.0695274307441607, "grad_norm": 0.24394604876249948, "learning_rate": 3.57487922705314e-05, "loss": 0.5082, "step": 985 }, { "epoch": 1.0706137968495384, "grad_norm": 0.28009539003169137, "learning_rate": 3.572866344605475e-05, "loss": 0.394, "step": 986 }, { "epoch": 1.0717001629549159, "grad_norm": 0.23677093596704807, "learning_rate": 3.57085346215781e-05, "loss": 0.43, "step": 987 }, { "epoch": 1.0727865290602934, "grad_norm": 0.23414963066408664, "learning_rate": 3.568840579710145e-05, "loss": 0.4078, "step": 988 }, { "epoch": 1.0738728951656709, "grad_norm": 0.24420552097440654, "learning_rate": 3.56682769726248e-05, "loss": 0.4522, "step": 989 }, { "epoch": 1.0749592612710483, "grad_norm": 0.24068684087833, "learning_rate": 3.564814814814815e-05, "loss": 0.4452, "step": 990 }, { "epoch": 1.0760456273764258, "grad_norm": 0.505083182051089, "learning_rate": 3.56280193236715e-05, "loss": 0.4322, "step": 991 }, { "epoch": 1.0771319934818033, "grad_norm": 0.34104402570044523, "learning_rate": 3.560789049919485e-05, "loss": 0.4702, "step": 992 }, { "epoch": 1.0782183595871808, "grad_norm": 0.29709080475979666, "learning_rate": 3.55877616747182e-05, "loss": 0.4884, "step": 993 }, { "epoch": 1.0793047256925583, "grad_norm": 0.24003439943638313, "learning_rate": 3.556763285024155e-05, "loss": 0.4542, "step": 994 }, { "epoch": 1.080391091797936, "grad_norm": 0.2371316875623259, "learning_rate": 3.55475040257649e-05, "loss": 0.4906, "step": 995 }, { "epoch": 1.0814774579033135, "grad_norm": 0.2316148934837761, "learning_rate": 3.552737520128825e-05, "loss": 0.4508, "step": 996 }, { "epoch": 1.082563824008691, "grad_norm": 0.2535936085145419, "learning_rate": 3.5507246376811596e-05, "loss": 0.4318, "step": 997 }, { "epoch": 1.0836501901140685, "grad_norm": 2.0139458693198464, "learning_rate": 3.5487117552334946e-05, "loss": 0.4469, "step": 998 }, { "epoch": 1.084736556219446, "grad_norm": 0.2840408665692626, "learning_rate": 3.5466988727858296e-05, "loss": 0.4101, "step": 999 }, { "epoch": 1.0858229223248235, "grad_norm": 0.28897323508029216, "learning_rate": 3.544685990338165e-05, "loss": 0.4301, "step": 1000 }, { "epoch": 1.086909288430201, "grad_norm": 0.23669304204882724, "learning_rate": 3.5426731078905e-05, "loss": 0.4947, "step": 1001 }, { "epoch": 1.0879956545355784, "grad_norm": 0.2915852497652277, "learning_rate": 3.540660225442835e-05, "loss": 0.4591, "step": 1002 }, { "epoch": 1.089082020640956, "grad_norm": 0.28694527556790017, "learning_rate": 3.538647342995169e-05, "loss": 0.5279, "step": 1003 }, { "epoch": 1.0901683867463334, "grad_norm": 0.29419937060839313, "learning_rate": 3.536634460547504e-05, "loss": 0.4356, "step": 1004 }, { "epoch": 1.091254752851711, "grad_norm": 0.24302220479944378, "learning_rate": 3.534621578099839e-05, "loss": 0.4311, "step": 1005 }, { "epoch": 1.0923411189570886, "grad_norm": 0.258626532649508, "learning_rate": 3.532608695652174e-05, "loss": 0.4593, "step": 1006 }, { "epoch": 1.0934274850624661, "grad_norm": 0.2734971326219132, "learning_rate": 3.530595813204509e-05, "loss": 0.4725, "step": 1007 }, { "epoch": 1.0945138511678436, "grad_norm": 0.27810126164112425, "learning_rate": 3.5285829307568444e-05, "loss": 0.4835, "step": 1008 }, { "epoch": 1.095600217273221, "grad_norm": 0.22987202787743227, "learning_rate": 3.526570048309179e-05, "loss": 0.4388, "step": 1009 }, { "epoch": 1.0966865833785986, "grad_norm": 0.23142487935043574, "learning_rate": 3.524557165861514e-05, "loss": 0.4729, "step": 1010 }, { "epoch": 1.097772949483976, "grad_norm": 0.26296509783347266, "learning_rate": 3.522544283413849e-05, "loss": 0.3948, "step": 1011 }, { "epoch": 1.0988593155893536, "grad_norm": 0.24660263144754396, "learning_rate": 3.520531400966184e-05, "loss": 0.4267, "step": 1012 }, { "epoch": 1.099945681694731, "grad_norm": 0.2851365829188785, "learning_rate": 3.518518518518519e-05, "loss": 0.4575, "step": 1013 }, { "epoch": 1.1010320478001085, "grad_norm": 0.2550301713820784, "learning_rate": 3.516505636070854e-05, "loss": 0.5117, "step": 1014 }, { "epoch": 1.1021184139054863, "grad_norm": 0.22097965945235218, "learning_rate": 3.514492753623188e-05, "loss": 0.4637, "step": 1015 }, { "epoch": 1.1032047800108638, "grad_norm": 0.2733663665776308, "learning_rate": 3.5124798711755234e-05, "loss": 0.4394, "step": 1016 }, { "epoch": 1.1042911461162412, "grad_norm": 0.2204280826876049, "learning_rate": 3.5104669887278584e-05, "loss": 0.424, "step": 1017 }, { "epoch": 1.1053775122216187, "grad_norm": 0.24781349570734876, "learning_rate": 3.5084541062801935e-05, "loss": 0.4491, "step": 1018 }, { "epoch": 1.1064638783269962, "grad_norm": 0.2626682566180793, "learning_rate": 3.5064412238325285e-05, "loss": 0.4322, "step": 1019 }, { "epoch": 1.1075502444323737, "grad_norm": 0.2081530331444225, "learning_rate": 3.5044283413848635e-05, "loss": 0.5029, "step": 1020 }, { "epoch": 1.1086366105377512, "grad_norm": 0.2587865457963551, "learning_rate": 3.502415458937198e-05, "loss": 0.4139, "step": 1021 }, { "epoch": 1.1097229766431287, "grad_norm": 0.24810621643443775, "learning_rate": 3.500402576489533e-05, "loss": 0.5284, "step": 1022 }, { "epoch": 1.1108093427485062, "grad_norm": 0.24229875789651775, "learning_rate": 3.498389694041868e-05, "loss": 0.4171, "step": 1023 }, { "epoch": 1.1118957088538837, "grad_norm": 0.24507477310866113, "learning_rate": 3.496376811594203e-05, "loss": 0.4612, "step": 1024 }, { "epoch": 1.1129820749592612, "grad_norm": 0.22217781527481012, "learning_rate": 3.494363929146538e-05, "loss": 0.4604, "step": 1025 }, { "epoch": 1.1140684410646389, "grad_norm": 0.2292524648685896, "learning_rate": 3.492351046698873e-05, "loss": 0.4434, "step": 1026 }, { "epoch": 1.1151548071700164, "grad_norm": 0.24321061382928627, "learning_rate": 3.490338164251208e-05, "loss": 0.4063, "step": 1027 }, { "epoch": 1.1162411732753938, "grad_norm": 0.2389164294734218, "learning_rate": 3.4883252818035425e-05, "loss": 0.4351, "step": 1028 }, { "epoch": 1.1173275393807713, "grad_norm": 0.2437956809165272, "learning_rate": 3.4863123993558776e-05, "loss": 0.4855, "step": 1029 }, { "epoch": 1.1184139054861488, "grad_norm": 0.22942730999883357, "learning_rate": 3.4842995169082126e-05, "loss": 0.482, "step": 1030 }, { "epoch": 1.1195002715915263, "grad_norm": 0.24244995010764345, "learning_rate": 3.482286634460548e-05, "loss": 0.4374, "step": 1031 }, { "epoch": 1.1205866376969038, "grad_norm": 0.22960547994267733, "learning_rate": 3.480273752012883e-05, "loss": 0.4633, "step": 1032 }, { "epoch": 1.1216730038022813, "grad_norm": 0.23060440510971394, "learning_rate": 3.478260869565218e-05, "loss": 0.4071, "step": 1033 }, { "epoch": 1.1227593699076588, "grad_norm": 0.27132327206584966, "learning_rate": 3.476247987117552e-05, "loss": 0.4099, "step": 1034 }, { "epoch": 1.1238457360130365, "grad_norm": 0.2405187167128041, "learning_rate": 3.474235104669887e-05, "loss": 0.5267, "step": 1035 }, { "epoch": 1.124932102118414, "grad_norm": 0.2773871870647737, "learning_rate": 3.472222222222222e-05, "loss": 0.4905, "step": 1036 }, { "epoch": 1.1260184682237915, "grad_norm": 0.26227190997045924, "learning_rate": 3.470209339774557e-05, "loss": 0.48, "step": 1037 }, { "epoch": 1.127104834329169, "grad_norm": 0.2261137451667926, "learning_rate": 3.468196457326892e-05, "loss": 0.4526, "step": 1038 }, { "epoch": 1.1281912004345465, "grad_norm": 0.2215491791407378, "learning_rate": 3.4661835748792274e-05, "loss": 0.5462, "step": 1039 }, { "epoch": 1.129277566539924, "grad_norm": 2.0864018745058854, "learning_rate": 3.464170692431562e-05, "loss": 0.4302, "step": 1040 }, { "epoch": 1.1303639326453014, "grad_norm": 0.2546633460214261, "learning_rate": 3.462157809983897e-05, "loss": 0.4735, "step": 1041 }, { "epoch": 1.131450298750679, "grad_norm": 0.28134142615557356, "learning_rate": 3.460144927536232e-05, "loss": 0.4641, "step": 1042 }, { "epoch": 1.1325366648560564, "grad_norm": 0.2297068677495829, "learning_rate": 3.458132045088567e-05, "loss": 0.4505, "step": 1043 }, { "epoch": 1.133623030961434, "grad_norm": 0.24219470102111612, "learning_rate": 3.456119162640902e-05, "loss": 0.4438, "step": 1044 }, { "epoch": 1.1347093970668114, "grad_norm": 0.28217336712710556, "learning_rate": 3.454106280193237e-05, "loss": 0.441, "step": 1045 }, { "epoch": 1.135795763172189, "grad_norm": 0.2600514118831663, "learning_rate": 3.452093397745571e-05, "loss": 0.4969, "step": 1046 }, { "epoch": 1.1368821292775666, "grad_norm": 0.2590413192844335, "learning_rate": 3.4500805152979064e-05, "loss": 0.4152, "step": 1047 }, { "epoch": 1.137968495382944, "grad_norm": 0.2533575981334367, "learning_rate": 3.4480676328502414e-05, "loss": 0.4956, "step": 1048 }, { "epoch": 1.1390548614883216, "grad_norm": 0.2329172376306262, "learning_rate": 3.4460547504025764e-05, "loss": 0.4951, "step": 1049 }, { "epoch": 1.140141227593699, "grad_norm": 0.23340515710718823, "learning_rate": 3.4440418679549115e-05, "loss": 0.4712, "step": 1050 }, { "epoch": 1.1412275936990766, "grad_norm": 0.22367793793715832, "learning_rate": 3.4420289855072465e-05, "loss": 0.4239, "step": 1051 }, { "epoch": 1.142313959804454, "grad_norm": 0.24360828277933694, "learning_rate": 3.440016103059581e-05, "loss": 0.4902, "step": 1052 }, { "epoch": 1.1434003259098315, "grad_norm": 0.784402643228356, "learning_rate": 3.438003220611916e-05, "loss": 0.4635, "step": 1053 }, { "epoch": 1.144486692015209, "grad_norm": 0.2525686697427147, "learning_rate": 3.4359903381642517e-05, "loss": 0.4335, "step": 1054 }, { "epoch": 1.1455730581205867, "grad_norm": 0.3009043825984474, "learning_rate": 3.433977455716587e-05, "loss": 0.4581, "step": 1055 }, { "epoch": 1.1466594242259642, "grad_norm": 0.24497966266609258, "learning_rate": 3.431964573268922e-05, "loss": 0.3911, "step": 1056 }, { "epoch": 1.1477457903313417, "grad_norm": 0.2333120602674974, "learning_rate": 3.429951690821256e-05, "loss": 0.4583, "step": 1057 }, { "epoch": 1.1488321564367192, "grad_norm": 0.25738936606396334, "learning_rate": 3.427938808373591e-05, "loss": 0.4761, "step": 1058 }, { "epoch": 1.1499185225420967, "grad_norm": 0.26383282186764023, "learning_rate": 3.425925925925926e-05, "loss": 0.4289, "step": 1059 }, { "epoch": 1.1510048886474742, "grad_norm": 0.2317489733375867, "learning_rate": 3.423913043478261e-05, "loss": 0.4876, "step": 1060 }, { "epoch": 1.1520912547528517, "grad_norm": 0.26547114757208523, "learning_rate": 3.421900161030596e-05, "loss": 0.4856, "step": 1061 }, { "epoch": 1.1531776208582292, "grad_norm": 0.2536109683557548, "learning_rate": 3.419887278582931e-05, "loss": 0.5488, "step": 1062 }, { "epoch": 1.1542639869636067, "grad_norm": 0.24161501361531615, "learning_rate": 3.4178743961352664e-05, "loss": 0.4426, "step": 1063 }, { "epoch": 1.1553503530689841, "grad_norm": 0.21609446466227203, "learning_rate": 3.415861513687601e-05, "loss": 0.4358, "step": 1064 }, { "epoch": 1.1564367191743616, "grad_norm": 0.24514246577111476, "learning_rate": 3.413848631239936e-05, "loss": 0.4563, "step": 1065 }, { "epoch": 1.1575230852797393, "grad_norm": 0.2799785826187416, "learning_rate": 3.411835748792271e-05, "loss": 0.4394, "step": 1066 }, { "epoch": 1.1586094513851168, "grad_norm": 0.23133537650216596, "learning_rate": 3.409822866344606e-05, "loss": 0.427, "step": 1067 }, { "epoch": 1.1596958174904943, "grad_norm": 0.27441023780795065, "learning_rate": 3.407809983896941e-05, "loss": 0.427, "step": 1068 }, { "epoch": 1.1607821835958718, "grad_norm": 0.2702689214735813, "learning_rate": 3.405797101449276e-05, "loss": 0.4774, "step": 1069 }, { "epoch": 1.1618685497012493, "grad_norm": 0.2838947430958246, "learning_rate": 3.40378421900161e-05, "loss": 0.5388, "step": 1070 }, { "epoch": 1.1629549158066268, "grad_norm": 0.3051109615754959, "learning_rate": 3.4017713365539454e-05, "loss": 0.444, "step": 1071 }, { "epoch": 1.1640412819120043, "grad_norm": 0.2537916075718411, "learning_rate": 3.3997584541062804e-05, "loss": 0.4915, "step": 1072 }, { "epoch": 1.1651276480173818, "grad_norm": 0.2283033372268475, "learning_rate": 3.3977455716586155e-05, "loss": 0.4558, "step": 1073 }, { "epoch": 1.1662140141227595, "grad_norm": 0.2988071361342792, "learning_rate": 3.3957326892109505e-05, "loss": 0.4311, "step": 1074 }, { "epoch": 1.167300380228137, "grad_norm": 0.240136849436925, "learning_rate": 3.3937198067632856e-05, "loss": 0.434, "step": 1075 }, { "epoch": 1.1683867463335145, "grad_norm": 0.21865446295894403, "learning_rate": 3.39170692431562e-05, "loss": 0.4496, "step": 1076 }, { "epoch": 1.169473112438892, "grad_norm": 0.2876216714163655, "learning_rate": 3.389694041867955e-05, "loss": 0.5022, "step": 1077 }, { "epoch": 1.1705594785442694, "grad_norm": 0.2345225238208405, "learning_rate": 3.38768115942029e-05, "loss": 0.4779, "step": 1078 }, { "epoch": 1.171645844649647, "grad_norm": 0.2833784076588519, "learning_rate": 3.385668276972625e-05, "loss": 0.4406, "step": 1079 }, { "epoch": 1.1727322107550244, "grad_norm": 0.22867113302494443, "learning_rate": 3.38365539452496e-05, "loss": 0.5244, "step": 1080 }, { "epoch": 1.173818576860402, "grad_norm": 0.29208528613512397, "learning_rate": 3.381642512077295e-05, "loss": 0.465, "step": 1081 }, { "epoch": 1.1749049429657794, "grad_norm": 0.24903175705856267, "learning_rate": 3.3796296296296295e-05, "loss": 0.4458, "step": 1082 }, { "epoch": 1.175991309071157, "grad_norm": 0.23710124465460813, "learning_rate": 3.3776167471819646e-05, "loss": 0.4376, "step": 1083 }, { "epoch": 1.1770776751765344, "grad_norm": 0.22265520412351097, "learning_rate": 3.3756038647342996e-05, "loss": 0.3987, "step": 1084 }, { "epoch": 1.178164041281912, "grad_norm": 0.2342410243040514, "learning_rate": 3.3735909822866346e-05, "loss": 0.5096, "step": 1085 }, { "epoch": 1.1792504073872896, "grad_norm": 0.3029439139333912, "learning_rate": 3.37157809983897e-05, "loss": 0.4575, "step": 1086 }, { "epoch": 1.180336773492667, "grad_norm": 0.2233039186494698, "learning_rate": 3.369565217391305e-05, "loss": 0.4712, "step": 1087 }, { "epoch": 1.1814231395980446, "grad_norm": 0.22894631413190777, "learning_rate": 3.367552334943639e-05, "loss": 0.4598, "step": 1088 }, { "epoch": 1.182509505703422, "grad_norm": 0.28461797545479506, "learning_rate": 3.365539452495974e-05, "loss": 0.4441, "step": 1089 }, { "epoch": 1.1835958718087995, "grad_norm": 0.2763353781153828, "learning_rate": 3.363526570048309e-05, "loss": 0.4027, "step": 1090 }, { "epoch": 1.184682237914177, "grad_norm": 0.23788105208088217, "learning_rate": 3.361513687600644e-05, "loss": 0.5119, "step": 1091 }, { "epoch": 1.1857686040195545, "grad_norm": 0.2872204422196742, "learning_rate": 3.359500805152979e-05, "loss": 0.4812, "step": 1092 }, { "epoch": 1.186854970124932, "grad_norm": 0.24194408267621004, "learning_rate": 3.357487922705314e-05, "loss": 0.4468, "step": 1093 }, { "epoch": 1.1879413362303097, "grad_norm": 0.24463790391212228, "learning_rate": 3.3554750402576494e-05, "loss": 0.4265, "step": 1094 }, { "epoch": 1.1890277023356872, "grad_norm": 0.2737624606390409, "learning_rate": 3.353462157809984e-05, "loss": 0.5001, "step": 1095 }, { "epoch": 1.1901140684410647, "grad_norm": 0.31688110411053627, "learning_rate": 3.351449275362319e-05, "loss": 0.4315, "step": 1096 }, { "epoch": 1.1912004345464422, "grad_norm": 0.2410502512433085, "learning_rate": 3.349436392914654e-05, "loss": 0.4597, "step": 1097 }, { "epoch": 1.1922868006518197, "grad_norm": 0.2478473643313258, "learning_rate": 3.347423510466989e-05, "loss": 0.5066, "step": 1098 }, { "epoch": 1.1933731667571972, "grad_norm": 0.27337127439860043, "learning_rate": 3.345410628019324e-05, "loss": 0.4605, "step": 1099 }, { "epoch": 1.1944595328625747, "grad_norm": 0.31806334818098386, "learning_rate": 3.343397745571659e-05, "loss": 0.5021, "step": 1100 }, { "epoch": 1.1955458989679522, "grad_norm": 0.2543308272601177, "learning_rate": 3.341384863123993e-05, "loss": 0.455, "step": 1101 }, { "epoch": 1.1966322650733296, "grad_norm": 0.2198648035181778, "learning_rate": 3.3393719806763284e-05, "loss": 0.3851, "step": 1102 }, { "epoch": 1.1977186311787071, "grad_norm": 0.30662551241064123, "learning_rate": 3.3373590982286634e-05, "loss": 0.4249, "step": 1103 }, { "epoch": 1.1988049972840846, "grad_norm": 0.22025038170892255, "learning_rate": 3.3353462157809984e-05, "loss": 0.4209, "step": 1104 }, { "epoch": 1.1998913633894623, "grad_norm": 0.21395917185132254, "learning_rate": 3.3333333333333335e-05, "loss": 0.4597, "step": 1105 }, { "epoch": 1.2009777294948398, "grad_norm": 0.24070027647792466, "learning_rate": 3.3313204508856685e-05, "loss": 0.4841, "step": 1106 }, { "epoch": 1.2020640956002173, "grad_norm": 0.2380206159590756, "learning_rate": 3.329307568438003e-05, "loss": 0.4189, "step": 1107 }, { "epoch": 1.2031504617055948, "grad_norm": 0.20494879418429934, "learning_rate": 3.327294685990338e-05, "loss": 0.4462, "step": 1108 }, { "epoch": 1.2042368278109723, "grad_norm": 0.22505638707872933, "learning_rate": 3.325281803542673e-05, "loss": 0.3998, "step": 1109 }, { "epoch": 1.2053231939163498, "grad_norm": 0.22237579735756877, "learning_rate": 3.323268921095008e-05, "loss": 0.4449, "step": 1110 }, { "epoch": 1.2064095600217273, "grad_norm": 0.24512749159303923, "learning_rate": 3.321256038647343e-05, "loss": 0.4407, "step": 1111 }, { "epoch": 1.2074959261271048, "grad_norm": 0.2213565143132146, "learning_rate": 3.319243156199678e-05, "loss": 0.4502, "step": 1112 }, { "epoch": 1.2085822922324823, "grad_norm": 0.24400363807103695, "learning_rate": 3.3172302737520125e-05, "loss": 0.4929, "step": 1113 }, { "epoch": 1.20966865833786, "grad_norm": 0.25589829269651704, "learning_rate": 3.3152173913043475e-05, "loss": 0.4209, "step": 1114 }, { "epoch": 1.2107550244432375, "grad_norm": 0.2161492381113643, "learning_rate": 3.3132045088566826e-05, "loss": 0.4827, "step": 1115 }, { "epoch": 1.211841390548615, "grad_norm": 0.2315919074677684, "learning_rate": 3.311191626409018e-05, "loss": 0.5235, "step": 1116 }, { "epoch": 1.2129277566539924, "grad_norm": 0.2374456809673137, "learning_rate": 3.3091787439613533e-05, "loss": 0.4588, "step": 1117 }, { "epoch": 1.21401412275937, "grad_norm": 0.21853750709549433, "learning_rate": 3.307165861513688e-05, "loss": 0.4092, "step": 1118 }, { "epoch": 1.2151004888647474, "grad_norm": 0.2092619245204511, "learning_rate": 3.305152979066023e-05, "loss": 0.4673, "step": 1119 }, { "epoch": 1.216186854970125, "grad_norm": 0.23038445210842573, "learning_rate": 3.303140096618358e-05, "loss": 0.5162, "step": 1120 }, { "epoch": 1.2172732210755024, "grad_norm": 0.23406991681884395, "learning_rate": 3.301127214170693e-05, "loss": 0.4526, "step": 1121 }, { "epoch": 1.2183595871808799, "grad_norm": 0.21490705525776288, "learning_rate": 3.299114331723028e-05, "loss": 0.4384, "step": 1122 }, { "epoch": 1.2194459532862574, "grad_norm": 0.22877268672630666, "learning_rate": 3.297101449275363e-05, "loss": 0.4147, "step": 1123 }, { "epoch": 1.2205323193916349, "grad_norm": 0.21777650680816169, "learning_rate": 3.295088566827697e-05, "loss": 0.4595, "step": 1124 }, { "epoch": 1.2216186854970126, "grad_norm": 0.22052616389491583, "learning_rate": 3.2930756843800323e-05, "loss": 0.4445, "step": 1125 }, { "epoch": 1.22270505160239, "grad_norm": 0.25084257410129557, "learning_rate": 3.2910628019323674e-05, "loss": 0.4765, "step": 1126 }, { "epoch": 1.2237914177077676, "grad_norm": 0.2489118541347577, "learning_rate": 3.2890499194847024e-05, "loss": 0.4846, "step": 1127 }, { "epoch": 1.224877783813145, "grad_norm": 0.24961350374916866, "learning_rate": 3.2870370370370375e-05, "loss": 0.4581, "step": 1128 }, { "epoch": 1.2259641499185225, "grad_norm": 0.22169742874937381, "learning_rate": 3.2850241545893725e-05, "loss": 0.4536, "step": 1129 }, { "epoch": 1.2270505160239, "grad_norm": 0.25296343220210715, "learning_rate": 3.2830112721417076e-05, "loss": 0.4442, "step": 1130 }, { "epoch": 1.2281368821292775, "grad_norm": 0.2279562060603678, "learning_rate": 3.280998389694042e-05, "loss": 0.4759, "step": 1131 }, { "epoch": 1.229223248234655, "grad_norm": 0.21961459830527194, "learning_rate": 3.278985507246377e-05, "loss": 0.4432, "step": 1132 }, { "epoch": 1.2303096143400325, "grad_norm": 0.2076363650576274, "learning_rate": 3.276972624798712e-05, "loss": 0.3974, "step": 1133 }, { "epoch": 1.2313959804454102, "grad_norm": 0.2144707464113267, "learning_rate": 3.274959742351047e-05, "loss": 0.4088, "step": 1134 }, { "epoch": 1.2324823465507877, "grad_norm": 0.23016920746449285, "learning_rate": 3.272946859903382e-05, "loss": 0.42, "step": 1135 }, { "epoch": 1.2335687126561652, "grad_norm": 0.23915664417123803, "learning_rate": 3.270933977455717e-05, "loss": 0.4128, "step": 1136 }, { "epoch": 1.2346550787615427, "grad_norm": 0.22589180946319046, "learning_rate": 3.2689210950080515e-05, "loss": 0.4065, "step": 1137 }, { "epoch": 1.2357414448669202, "grad_norm": 0.23427505452834796, "learning_rate": 3.2669082125603866e-05, "loss": 0.3947, "step": 1138 }, { "epoch": 1.2368278109722977, "grad_norm": 0.2783760551112833, "learning_rate": 3.2648953301127216e-05, "loss": 0.4297, "step": 1139 }, { "epoch": 1.2379141770776751, "grad_norm": 0.2509735969849862, "learning_rate": 3.2628824476650567e-05, "loss": 0.4278, "step": 1140 }, { "epoch": 1.2390005431830526, "grad_norm": 0.23511401133287094, "learning_rate": 3.260869565217392e-05, "loss": 0.4946, "step": 1141 }, { "epoch": 1.2400869092884301, "grad_norm": 0.22015179797657355, "learning_rate": 3.258856682769727e-05, "loss": 0.4074, "step": 1142 }, { "epoch": 1.2411732753938076, "grad_norm": 0.26384038274292687, "learning_rate": 3.256843800322061e-05, "loss": 0.4296, "step": 1143 }, { "epoch": 1.242259641499185, "grad_norm": 0.2286734031270758, "learning_rate": 3.254830917874396e-05, "loss": 0.3966, "step": 1144 }, { "epoch": 1.2433460076045628, "grad_norm": 0.23072086660943877, "learning_rate": 3.252818035426731e-05, "loss": 0.4853, "step": 1145 }, { "epoch": 1.2444323737099403, "grad_norm": 0.30597813381261174, "learning_rate": 3.250805152979066e-05, "loss": 0.4555, "step": 1146 }, { "epoch": 1.2455187398153178, "grad_norm": 0.23618542112119859, "learning_rate": 3.248792270531401e-05, "loss": 0.4267, "step": 1147 }, { "epoch": 1.2466051059206953, "grad_norm": 0.24638968902560185, "learning_rate": 3.246779388083736e-05, "loss": 0.4409, "step": 1148 }, { "epoch": 1.2476914720260728, "grad_norm": 0.25399873306200627, "learning_rate": 3.244766505636071e-05, "loss": 0.4483, "step": 1149 }, { "epoch": 1.2487778381314503, "grad_norm": 0.2512061128110236, "learning_rate": 3.242753623188406e-05, "loss": 0.5566, "step": 1150 }, { "epoch": 1.2498642042368278, "grad_norm": 0.24159049392945312, "learning_rate": 3.240740740740741e-05, "loss": 0.3752, "step": 1151 }, { "epoch": 1.2509505703422052, "grad_norm": 0.25650857713377545, "learning_rate": 3.238727858293076e-05, "loss": 0.4434, "step": 1152 }, { "epoch": 1.252036936447583, "grad_norm": 0.25574566316907726, "learning_rate": 3.236714975845411e-05, "loss": 0.5086, "step": 1153 }, { "epoch": 1.2531233025529604, "grad_norm": 0.2398269509003359, "learning_rate": 3.234702093397746e-05, "loss": 0.4546, "step": 1154 }, { "epoch": 1.254209668658338, "grad_norm": 0.2341187730428935, "learning_rate": 3.23268921095008e-05, "loss": 0.4484, "step": 1155 }, { "epoch": 1.2552960347637154, "grad_norm": 0.2476398928600138, "learning_rate": 3.230676328502415e-05, "loss": 0.4186, "step": 1156 }, { "epoch": 1.256382400869093, "grad_norm": 0.23505500827293851, "learning_rate": 3.2286634460547504e-05, "loss": 0.4933, "step": 1157 }, { "epoch": 1.2574687669744704, "grad_norm": 0.2692847903870762, "learning_rate": 3.2266505636070854e-05, "loss": 0.4588, "step": 1158 }, { "epoch": 1.258555133079848, "grad_norm": 0.2405437790608115, "learning_rate": 3.2246376811594205e-05, "loss": 0.4174, "step": 1159 }, { "epoch": 1.2596414991852254, "grad_norm": 0.25909177191155613, "learning_rate": 3.2226247987117555e-05, "loss": 0.4406, "step": 1160 }, { "epoch": 1.2607278652906029, "grad_norm": 0.26526785871805614, "learning_rate": 3.22061191626409e-05, "loss": 0.5004, "step": 1161 }, { "epoch": 1.2618142313959804, "grad_norm": 0.24987172141352076, "learning_rate": 3.218599033816425e-05, "loss": 0.4815, "step": 1162 }, { "epoch": 1.2629005975013579, "grad_norm": 0.25885483777079626, "learning_rate": 3.21658615136876e-05, "loss": 0.5021, "step": 1163 }, { "epoch": 1.2639869636067353, "grad_norm": 0.23876771333845853, "learning_rate": 3.214573268921095e-05, "loss": 0.44, "step": 1164 }, { "epoch": 1.265073329712113, "grad_norm": 0.2947708867615779, "learning_rate": 3.21256038647343e-05, "loss": 0.3916, "step": 1165 }, { "epoch": 1.2661596958174905, "grad_norm": 0.2431264075087459, "learning_rate": 3.210547504025765e-05, "loss": 0.4851, "step": 1166 }, { "epoch": 1.267246061922868, "grad_norm": 0.2505893108559895, "learning_rate": 3.2085346215781e-05, "loss": 0.4974, "step": 1167 }, { "epoch": 1.2683324280282455, "grad_norm": 0.22072667670416612, "learning_rate": 3.2065217391304345e-05, "loss": 0.3991, "step": 1168 }, { "epoch": 1.269418794133623, "grad_norm": 0.25421957854240207, "learning_rate": 3.2045088566827695e-05, "loss": 0.417, "step": 1169 }, { "epoch": 1.2705051602390005, "grad_norm": 0.2175502325988229, "learning_rate": 3.2024959742351046e-05, "loss": 0.4535, "step": 1170 }, { "epoch": 1.271591526344378, "grad_norm": 0.2596852897013073, "learning_rate": 3.2004830917874396e-05, "loss": 0.4368, "step": 1171 }, { "epoch": 1.2726778924497555, "grad_norm": 0.22381830374425843, "learning_rate": 3.198470209339775e-05, "loss": 0.4302, "step": 1172 }, { "epoch": 1.2737642585551332, "grad_norm": 0.21053924190743176, "learning_rate": 3.19645732689211e-05, "loss": 0.4847, "step": 1173 }, { "epoch": 1.2748506246605107, "grad_norm": 0.27162391705524525, "learning_rate": 3.194444444444444e-05, "loss": 0.4853, "step": 1174 }, { "epoch": 1.2759369907658882, "grad_norm": 0.21994839548041553, "learning_rate": 3.192431561996779e-05, "loss": 0.4571, "step": 1175 }, { "epoch": 1.2770233568712657, "grad_norm": 0.8289252471172837, "learning_rate": 3.190418679549114e-05, "loss": 0.4271, "step": 1176 }, { "epoch": 1.2781097229766432, "grad_norm": 0.2426340417183644, "learning_rate": 3.188405797101449e-05, "loss": 0.4416, "step": 1177 }, { "epoch": 1.2791960890820206, "grad_norm": 0.26990938801749964, "learning_rate": 3.186392914653785e-05, "loss": 0.4696, "step": 1178 }, { "epoch": 1.2802824551873981, "grad_norm": 0.23738091419226484, "learning_rate": 3.184380032206119e-05, "loss": 0.4153, "step": 1179 }, { "epoch": 1.2813688212927756, "grad_norm": 0.22702052240703421, "learning_rate": 3.1823671497584544e-05, "loss": 0.4447, "step": 1180 }, { "epoch": 1.2824551873981531, "grad_norm": 0.24827747342632064, "learning_rate": 3.1803542673107894e-05, "loss": 0.4321, "step": 1181 }, { "epoch": 1.2835415535035306, "grad_norm": 0.25263899201016726, "learning_rate": 3.1783413848631244e-05, "loss": 0.5303, "step": 1182 }, { "epoch": 1.284627919608908, "grad_norm": 0.2596966721834399, "learning_rate": 3.1763285024154595e-05, "loss": 0.5114, "step": 1183 }, { "epoch": 1.2857142857142856, "grad_norm": 0.2605054870769199, "learning_rate": 3.1743156199677945e-05, "loss": 0.4343, "step": 1184 }, { "epoch": 1.2868006518196633, "grad_norm": 0.23684387451901115, "learning_rate": 3.172302737520129e-05, "loss": 0.3863, "step": 1185 }, { "epoch": 1.2878870179250408, "grad_norm": 0.2504483402096455, "learning_rate": 3.170289855072464e-05, "loss": 0.4574, "step": 1186 }, { "epoch": 1.2889733840304183, "grad_norm": 0.24528104565610195, "learning_rate": 3.168276972624799e-05, "loss": 0.5117, "step": 1187 }, { "epoch": 1.2900597501357958, "grad_norm": 0.2605321014468727, "learning_rate": 3.166264090177134e-05, "loss": 0.3938, "step": 1188 }, { "epoch": 1.2911461162411733, "grad_norm": 0.23390385350725434, "learning_rate": 3.164251207729469e-05, "loss": 0.4328, "step": 1189 }, { "epoch": 1.2922324823465507, "grad_norm": 0.22748347329606972, "learning_rate": 3.162238325281804e-05, "loss": 0.4035, "step": 1190 }, { "epoch": 1.2933188484519282, "grad_norm": 0.2609694622271787, "learning_rate": 3.1602254428341385e-05, "loss": 0.4265, "step": 1191 }, { "epoch": 1.294405214557306, "grad_norm": 0.2449184099098431, "learning_rate": 3.1582125603864735e-05, "loss": 0.4469, "step": 1192 }, { "epoch": 1.2954915806626834, "grad_norm": 0.2452064110831413, "learning_rate": 3.1561996779388086e-05, "loss": 0.4006, "step": 1193 }, { "epoch": 1.296577946768061, "grad_norm": 0.2358153227657377, "learning_rate": 3.1541867954911436e-05, "loss": 0.4084, "step": 1194 }, { "epoch": 1.2976643128734384, "grad_norm": 0.23477867415065373, "learning_rate": 3.152173913043479e-05, "loss": 0.4555, "step": 1195 }, { "epoch": 1.298750678978816, "grad_norm": 0.2756705005849965, "learning_rate": 3.150161030595814e-05, "loss": 0.4767, "step": 1196 }, { "epoch": 1.2998370450841934, "grad_norm": 0.22863422165785738, "learning_rate": 3.148148148148148e-05, "loss": 0.4064, "step": 1197 }, { "epoch": 1.3009234111895709, "grad_norm": 0.22264659421723149, "learning_rate": 3.146135265700483e-05, "loss": 0.4366, "step": 1198 }, { "epoch": 1.3020097772949484, "grad_norm": 0.24564411869016275, "learning_rate": 3.144122383252818e-05, "loss": 0.479, "step": 1199 }, { "epoch": 1.3030961434003259, "grad_norm": 0.2822115341061486, "learning_rate": 3.142109500805153e-05, "loss": 0.3895, "step": 1200 }, { "epoch": 1.3041825095057034, "grad_norm": 0.21524865646376082, "learning_rate": 3.140096618357488e-05, "loss": 0.4719, "step": 1201 }, { "epoch": 1.3052688756110808, "grad_norm": 0.24205458122534193, "learning_rate": 3.138083735909823e-05, "loss": 0.441, "step": 1202 }, { "epoch": 1.3063552417164583, "grad_norm": 0.26294477108248737, "learning_rate": 3.136070853462158e-05, "loss": 0.4096, "step": 1203 }, { "epoch": 1.3074416078218358, "grad_norm": 0.22078597864450306, "learning_rate": 3.134057971014493e-05, "loss": 0.4423, "step": 1204 }, { "epoch": 1.3085279739272135, "grad_norm": 0.4538055391781604, "learning_rate": 3.132045088566828e-05, "loss": 0.4599, "step": 1205 }, { "epoch": 1.309614340032591, "grad_norm": 0.23157186654944598, "learning_rate": 3.130032206119163e-05, "loss": 0.4271, "step": 1206 }, { "epoch": 1.3107007061379685, "grad_norm": 0.24428788482096325, "learning_rate": 3.128019323671498e-05, "loss": 0.4055, "step": 1207 }, { "epoch": 1.311787072243346, "grad_norm": 0.21991434748393476, "learning_rate": 3.126006441223833e-05, "loss": 0.4206, "step": 1208 }, { "epoch": 1.3128734383487235, "grad_norm": 0.2211571148626369, "learning_rate": 3.123993558776168e-05, "loss": 0.4611, "step": 1209 }, { "epoch": 1.313959804454101, "grad_norm": 0.22733498492816676, "learning_rate": 3.121980676328502e-05, "loss": 0.4447, "step": 1210 }, { "epoch": 1.3150461705594785, "grad_norm": 0.23712286350507136, "learning_rate": 3.119967793880837e-05, "loss": 0.4754, "step": 1211 }, { "epoch": 1.3161325366648562, "grad_norm": 0.2479160762445838, "learning_rate": 3.1179549114331724e-05, "loss": 0.4471, "step": 1212 }, { "epoch": 1.3172189027702337, "grad_norm": 0.23252426549033917, "learning_rate": 3.1159420289855074e-05, "loss": 0.4551, "step": 1213 }, { "epoch": 1.3183052688756112, "grad_norm": 0.24686536723198974, "learning_rate": 3.1139291465378425e-05, "loss": 0.4716, "step": 1214 }, { "epoch": 1.3193916349809887, "grad_norm": 0.24517830223624798, "learning_rate": 3.1119162640901775e-05, "loss": 0.4773, "step": 1215 }, { "epoch": 1.3204780010863661, "grad_norm": 0.2599676854438751, "learning_rate": 3.109903381642512e-05, "loss": 0.5062, "step": 1216 }, { "epoch": 1.3215643671917436, "grad_norm": 0.28409173482828015, "learning_rate": 3.107890499194847e-05, "loss": 0.4334, "step": 1217 }, { "epoch": 1.3226507332971211, "grad_norm": 1.4636567637594722, "learning_rate": 3.105877616747182e-05, "loss": 0.4546, "step": 1218 }, { "epoch": 1.3237370994024986, "grad_norm": 0.27645378565647954, "learning_rate": 3.103864734299517e-05, "loss": 0.4366, "step": 1219 }, { "epoch": 1.324823465507876, "grad_norm": 0.23550526121376644, "learning_rate": 3.101851851851852e-05, "loss": 0.4757, "step": 1220 }, { "epoch": 1.3259098316132536, "grad_norm": 0.2177791125925779, "learning_rate": 3.099838969404187e-05, "loss": 0.4308, "step": 1221 }, { "epoch": 1.326996197718631, "grad_norm": 0.22661373446428323, "learning_rate": 3.0978260869565215e-05, "loss": 0.5314, "step": 1222 }, { "epoch": 1.3280825638240086, "grad_norm": 0.23945225161327768, "learning_rate": 3.0958132045088565e-05, "loss": 0.4581, "step": 1223 }, { "epoch": 1.329168929929386, "grad_norm": 0.22896271252605077, "learning_rate": 3.0938003220611916e-05, "loss": 0.5054, "step": 1224 }, { "epoch": 1.3302552960347638, "grad_norm": 0.22752522587883348, "learning_rate": 3.0917874396135266e-05, "loss": 0.4455, "step": 1225 }, { "epoch": 1.3313416621401413, "grad_norm": 0.24365758448222102, "learning_rate": 3.0897745571658616e-05, "loss": 0.4652, "step": 1226 }, { "epoch": 1.3324280282455188, "grad_norm": 0.2357197331538666, "learning_rate": 3.087761674718197e-05, "loss": 0.4464, "step": 1227 }, { "epoch": 1.3335143943508962, "grad_norm": 0.22132006827744394, "learning_rate": 3.085748792270531e-05, "loss": 0.4414, "step": 1228 }, { "epoch": 1.3346007604562737, "grad_norm": 0.21775431873252557, "learning_rate": 3.083735909822866e-05, "loss": 0.4645, "step": 1229 }, { "epoch": 1.3356871265616512, "grad_norm": 0.22757731964574426, "learning_rate": 3.081723027375201e-05, "loss": 0.4186, "step": 1230 }, { "epoch": 1.3367734926670287, "grad_norm": 0.22284454992235941, "learning_rate": 3.079710144927536e-05, "loss": 0.4318, "step": 1231 }, { "epoch": 1.3378598587724064, "grad_norm": 0.22303875390575564, "learning_rate": 3.077697262479871e-05, "loss": 0.4526, "step": 1232 }, { "epoch": 1.338946224877784, "grad_norm": 0.2451784675128546, "learning_rate": 3.075684380032206e-05, "loss": 0.4668, "step": 1233 }, { "epoch": 1.3400325909831614, "grad_norm": 0.24102430977540115, "learning_rate": 3.073671497584541e-05, "loss": 0.4388, "step": 1234 }, { "epoch": 1.341118957088539, "grad_norm": 0.23838125469082697, "learning_rate": 3.071658615136876e-05, "loss": 0.4781, "step": 1235 }, { "epoch": 1.3422053231939164, "grad_norm": 0.24171609595064703, "learning_rate": 3.069645732689211e-05, "loss": 0.4457, "step": 1236 }, { "epoch": 1.3432916892992939, "grad_norm": 0.23507994425136636, "learning_rate": 3.067632850241546e-05, "loss": 0.4333, "step": 1237 }, { "epoch": 1.3443780554046714, "grad_norm": 0.2152986469750305, "learning_rate": 3.065619967793881e-05, "loss": 0.4519, "step": 1238 }, { "epoch": 1.3454644215100489, "grad_norm": 0.24311710575575554, "learning_rate": 3.063607085346216e-05, "loss": 0.4073, "step": 1239 }, { "epoch": 1.3465507876154263, "grad_norm": 0.24918433404253185, "learning_rate": 3.061594202898551e-05, "loss": 0.4381, "step": 1240 }, { "epoch": 1.3476371537208038, "grad_norm": 0.2274358747465878, "learning_rate": 3.059581320450886e-05, "loss": 0.4355, "step": 1241 }, { "epoch": 1.3487235198261813, "grad_norm": 0.27236025468510106, "learning_rate": 3.057568438003221e-05, "loss": 0.4301, "step": 1242 }, { "epoch": 1.3498098859315588, "grad_norm": 0.27170353094300015, "learning_rate": 3.055555555555556e-05, "loss": 0.5027, "step": 1243 }, { "epoch": 1.3508962520369365, "grad_norm": 0.24880836271240533, "learning_rate": 3.053542673107891e-05, "loss": 0.4339, "step": 1244 }, { "epoch": 1.351982618142314, "grad_norm": 0.2389508436645932, "learning_rate": 3.0515297906602258e-05, "loss": 0.4174, "step": 1245 }, { "epoch": 1.3530689842476915, "grad_norm": 0.26586634358809735, "learning_rate": 3.049516908212561e-05, "loss": 0.3684, "step": 1246 }, { "epoch": 1.354155350353069, "grad_norm": 0.2344995650781429, "learning_rate": 3.0475040257648955e-05, "loss": 0.4736, "step": 1247 }, { "epoch": 1.3552417164584465, "grad_norm": 0.23375375028789, "learning_rate": 3.0454911433172306e-05, "loss": 0.4709, "step": 1248 }, { "epoch": 1.356328082563824, "grad_norm": 0.22824852941455626, "learning_rate": 3.0434782608695656e-05, "loss": 0.4319, "step": 1249 }, { "epoch": 1.3574144486692015, "grad_norm": 0.2595438704233301, "learning_rate": 3.0414653784219003e-05, "loss": 0.5149, "step": 1250 }, { "epoch": 1.358500814774579, "grad_norm": 0.2708708439492421, "learning_rate": 3.0394524959742354e-05, "loss": 0.4454, "step": 1251 }, { "epoch": 1.3595871808799567, "grad_norm": 0.23406180946878524, "learning_rate": 3.0374396135265704e-05, "loss": 0.5156, "step": 1252 }, { "epoch": 1.3606735469853342, "grad_norm": 0.2889172784737098, "learning_rate": 3.035426731078905e-05, "loss": 0.477, "step": 1253 }, { "epoch": 1.3617599130907116, "grad_norm": 0.2181198997610945, "learning_rate": 3.0334138486312402e-05, "loss": 0.3749, "step": 1254 }, { "epoch": 1.3628462791960891, "grad_norm": 0.2776790010395675, "learning_rate": 3.0314009661835752e-05, "loss": 0.4836, "step": 1255 }, { "epoch": 1.3639326453014666, "grad_norm": 0.23756141904771202, "learning_rate": 3.0293880837359103e-05, "loss": 0.404, "step": 1256 }, { "epoch": 1.3650190114068441, "grad_norm": 0.22440542529652538, "learning_rate": 3.027375201288245e-05, "loss": 0.4377, "step": 1257 }, { "epoch": 1.3661053775122216, "grad_norm": 0.23918606424771252, "learning_rate": 3.02536231884058e-05, "loss": 0.4146, "step": 1258 }, { "epoch": 1.367191743617599, "grad_norm": 0.2293132346738226, "learning_rate": 3.023349436392915e-05, "loss": 0.4628, "step": 1259 }, { "epoch": 1.3682781097229766, "grad_norm": 0.2317448116511707, "learning_rate": 3.0213365539452498e-05, "loss": 0.3924, "step": 1260 }, { "epoch": 1.369364475828354, "grad_norm": 0.2401067120340545, "learning_rate": 3.0193236714975848e-05, "loss": 0.4512, "step": 1261 }, { "epoch": 1.3704508419337316, "grad_norm": 0.22669848854699762, "learning_rate": 3.01731078904992e-05, "loss": 0.4452, "step": 1262 }, { "epoch": 1.371537208039109, "grad_norm": 0.2340666715615013, "learning_rate": 3.0152979066022546e-05, "loss": 0.3995, "step": 1263 }, { "epoch": 1.3726235741444868, "grad_norm": 0.22385881506809613, "learning_rate": 3.0132850241545896e-05, "loss": 0.4413, "step": 1264 }, { "epoch": 1.3737099402498643, "grad_norm": 0.2659515913729495, "learning_rate": 3.0112721417069246e-05, "loss": 0.5372, "step": 1265 }, { "epoch": 1.3747963063552417, "grad_norm": 0.24824609428657352, "learning_rate": 3.0092592592592593e-05, "loss": 0.4391, "step": 1266 }, { "epoch": 1.3758826724606192, "grad_norm": 0.24423284701443743, "learning_rate": 3.0072463768115944e-05, "loss": 0.4553, "step": 1267 }, { "epoch": 1.3769690385659967, "grad_norm": 0.21339582396844275, "learning_rate": 3.0052334943639294e-05, "loss": 0.4172, "step": 1268 }, { "epoch": 1.3780554046713742, "grad_norm": 0.2568335509047871, "learning_rate": 3.003220611916264e-05, "loss": 0.503, "step": 1269 }, { "epoch": 1.3791417707767517, "grad_norm": 0.2503203190995816, "learning_rate": 3.0012077294685992e-05, "loss": 0.4667, "step": 1270 }, { "epoch": 1.3802281368821292, "grad_norm": 0.23773901871981526, "learning_rate": 2.9991948470209342e-05, "loss": 0.4887, "step": 1271 }, { "epoch": 1.381314502987507, "grad_norm": 0.2330239315784738, "learning_rate": 2.997181964573269e-05, "loss": 0.5005, "step": 1272 }, { "epoch": 1.3824008690928844, "grad_norm": 0.256965123076063, "learning_rate": 2.995169082125604e-05, "loss": 0.4287, "step": 1273 }, { "epoch": 1.3834872351982619, "grad_norm": 0.20917932631243183, "learning_rate": 2.993156199677939e-05, "loss": 0.3895, "step": 1274 }, { "epoch": 1.3845736013036394, "grad_norm": 0.22523004530401142, "learning_rate": 2.9911433172302737e-05, "loss": 0.4213, "step": 1275 }, { "epoch": 1.3856599674090169, "grad_norm": 0.22551595427248014, "learning_rate": 2.9891304347826088e-05, "loss": 0.4221, "step": 1276 }, { "epoch": 1.3867463335143944, "grad_norm": 0.19628520577013478, "learning_rate": 2.9871175523349438e-05, "loss": 0.3687, "step": 1277 }, { "epoch": 1.3878326996197718, "grad_norm": 0.2481058337033831, "learning_rate": 2.9851046698872785e-05, "loss": 0.3891, "step": 1278 }, { "epoch": 1.3889190657251493, "grad_norm": 0.21430140118232333, "learning_rate": 2.9830917874396136e-05, "loss": 0.45, "step": 1279 }, { "epoch": 1.3900054318305268, "grad_norm": 0.2377794522404102, "learning_rate": 2.9810789049919486e-05, "loss": 0.4384, "step": 1280 }, { "epoch": 1.3910917979359043, "grad_norm": 0.2205360619921941, "learning_rate": 2.9790660225442833e-05, "loss": 0.502, "step": 1281 }, { "epoch": 1.3921781640412818, "grad_norm": 0.22576615048350415, "learning_rate": 2.9770531400966184e-05, "loss": 0.4702, "step": 1282 }, { "epoch": 1.3932645301466593, "grad_norm": 0.2324989244623088, "learning_rate": 2.9750402576489534e-05, "loss": 0.4799, "step": 1283 }, { "epoch": 1.394350896252037, "grad_norm": 0.27918997915552574, "learning_rate": 2.973027375201288e-05, "loss": 0.4329, "step": 1284 }, { "epoch": 1.3954372623574145, "grad_norm": 2.1918233955984605, "learning_rate": 2.971014492753623e-05, "loss": 0.4668, "step": 1285 }, { "epoch": 1.396523628462792, "grad_norm": 1.963729945899805, "learning_rate": 2.9690016103059582e-05, "loss": 0.5867, "step": 1286 }, { "epoch": 1.3976099945681695, "grad_norm": 0.2671510931838171, "learning_rate": 2.966988727858293e-05, "loss": 0.452, "step": 1287 }, { "epoch": 1.398696360673547, "grad_norm": 0.24827977330019543, "learning_rate": 2.964975845410628e-05, "loss": 0.4337, "step": 1288 }, { "epoch": 1.3997827267789245, "grad_norm": 0.2742936443148806, "learning_rate": 2.962962962962963e-05, "loss": 0.4384, "step": 1289 }, { "epoch": 1.400869092884302, "grad_norm": 0.25275310741323226, "learning_rate": 2.960950080515298e-05, "loss": 0.4623, "step": 1290 }, { "epoch": 1.4019554589896797, "grad_norm": 0.2655598922676472, "learning_rate": 2.9589371980676327e-05, "loss": 0.433, "step": 1291 }, { "epoch": 1.4030418250950571, "grad_norm": 0.27041542168315713, "learning_rate": 2.9569243156199678e-05, "loss": 0.4522, "step": 1292 }, { "epoch": 1.4041281912004346, "grad_norm": 7.293268457993132, "learning_rate": 2.9549114331723028e-05, "loss": 0.464, "step": 1293 }, { "epoch": 1.4052145573058121, "grad_norm": 0.24959164206260703, "learning_rate": 2.9528985507246375e-05, "loss": 0.4573, "step": 1294 }, { "epoch": 1.4063009234111896, "grad_norm": 0.28056792681526144, "learning_rate": 2.9508856682769726e-05, "loss": 0.4832, "step": 1295 }, { "epoch": 1.407387289516567, "grad_norm": 0.26639453328805546, "learning_rate": 2.9488727858293076e-05, "loss": 0.4338, "step": 1296 }, { "epoch": 1.4084736556219446, "grad_norm": 0.5559483437172271, "learning_rate": 2.9468599033816423e-05, "loss": 0.4351, "step": 1297 }, { "epoch": 1.409560021727322, "grad_norm": 0.27260872262333463, "learning_rate": 2.9448470209339774e-05, "loss": 0.445, "step": 1298 }, { "epoch": 1.4106463878326996, "grad_norm": 0.26844410002253827, "learning_rate": 2.9428341384863124e-05, "loss": 0.4245, "step": 1299 }, { "epoch": 1.411732753938077, "grad_norm": 0.2565426257958552, "learning_rate": 2.940821256038647e-05, "loss": 0.4623, "step": 1300 }, { "epoch": 1.4128191200434546, "grad_norm": 0.23727363776400567, "learning_rate": 2.938808373590982e-05, "loss": 0.4272, "step": 1301 }, { "epoch": 1.413905486148832, "grad_norm": 0.2751571785394199, "learning_rate": 2.9367954911433176e-05, "loss": 0.4882, "step": 1302 }, { "epoch": 1.4149918522542095, "grad_norm": 0.23762191619474696, "learning_rate": 2.9347826086956526e-05, "loss": 0.4541, "step": 1303 }, { "epoch": 1.4160782183595872, "grad_norm": 0.22995743187146994, "learning_rate": 2.9327697262479876e-05, "loss": 0.42, "step": 1304 }, { "epoch": 1.4171645844649647, "grad_norm": 0.26020492299343567, "learning_rate": 2.9307568438003223e-05, "loss": 0.4595, "step": 1305 }, { "epoch": 1.4182509505703422, "grad_norm": 0.22108602234224747, "learning_rate": 2.9287439613526574e-05, "loss": 0.3963, "step": 1306 }, { "epoch": 1.4193373166757197, "grad_norm": 0.24734990170957655, "learning_rate": 2.9267310789049924e-05, "loss": 0.4659, "step": 1307 }, { "epoch": 1.4204236827810972, "grad_norm": 0.22806627245522956, "learning_rate": 2.924718196457327e-05, "loss": 0.4067, "step": 1308 }, { "epoch": 1.4215100488864747, "grad_norm": 0.4827811462342706, "learning_rate": 2.9227053140096622e-05, "loss": 0.4462, "step": 1309 }, { "epoch": 1.4225964149918522, "grad_norm": 0.2541926247708357, "learning_rate": 2.9206924315619972e-05, "loss": 0.3759, "step": 1310 }, { "epoch": 1.42368278109723, "grad_norm": 0.2236142590704321, "learning_rate": 2.918679549114332e-05, "loss": 0.4625, "step": 1311 }, { "epoch": 1.4247691472026074, "grad_norm": 0.2535304384692317, "learning_rate": 2.916666666666667e-05, "loss": 0.43, "step": 1312 }, { "epoch": 1.4258555133079849, "grad_norm": 0.24539482475307722, "learning_rate": 2.914653784219002e-05, "loss": 0.5263, "step": 1313 }, { "epoch": 1.4269418794133624, "grad_norm": 0.2330928861444648, "learning_rate": 2.9126409017713367e-05, "loss": 0.4149, "step": 1314 }, { "epoch": 1.4280282455187399, "grad_norm": 0.2235727681940187, "learning_rate": 2.9106280193236718e-05, "loss": 0.4096, "step": 1315 }, { "epoch": 1.4291146116241173, "grad_norm": 0.23802962496647254, "learning_rate": 2.9086151368760068e-05, "loss": 0.4941, "step": 1316 }, { "epoch": 1.4302009777294948, "grad_norm": 0.2812615448199625, "learning_rate": 2.9066022544283415e-05, "loss": 0.4891, "step": 1317 }, { "epoch": 1.4312873438348723, "grad_norm": 0.2368810366208595, "learning_rate": 2.9045893719806766e-05, "loss": 0.4719, "step": 1318 }, { "epoch": 1.4323737099402498, "grad_norm": 0.2346730636592381, "learning_rate": 2.9025764895330116e-05, "loss": 0.4658, "step": 1319 }, { "epoch": 1.4334600760456273, "grad_norm": 0.28904199570942785, "learning_rate": 2.9005636070853463e-05, "loss": 0.5105, "step": 1320 }, { "epoch": 1.4345464421510048, "grad_norm": 0.2821907328946595, "learning_rate": 2.8985507246376814e-05, "loss": 0.4774, "step": 1321 }, { "epoch": 1.4356328082563823, "grad_norm": 0.254491054616487, "learning_rate": 2.8965378421900164e-05, "loss": 0.4458, "step": 1322 }, { "epoch": 1.4367191743617598, "grad_norm": 0.23285943002777493, "learning_rate": 2.894524959742351e-05, "loss": 0.4298, "step": 1323 }, { "epoch": 1.4378055404671375, "grad_norm": 107.45115623442813, "learning_rate": 2.892512077294686e-05, "loss": 0.6862, "step": 1324 }, { "epoch": 1.438891906572515, "grad_norm": 0.29095792960905975, "learning_rate": 2.8904991948470212e-05, "loss": 0.4545, "step": 1325 }, { "epoch": 1.4399782726778925, "grad_norm": 0.28493846576364823, "learning_rate": 2.8884863123993562e-05, "loss": 0.5042, "step": 1326 }, { "epoch": 1.44106463878327, "grad_norm": 0.23161368436694904, "learning_rate": 2.886473429951691e-05, "loss": 0.4082, "step": 1327 }, { "epoch": 1.4421510048886474, "grad_norm": 0.2537074287661293, "learning_rate": 2.884460547504026e-05, "loss": 0.4428, "step": 1328 }, { "epoch": 1.443237370994025, "grad_norm": 0.2519498948756939, "learning_rate": 2.882447665056361e-05, "loss": 0.5453, "step": 1329 }, { "epoch": 1.4443237370994024, "grad_norm": 2.4335993616930955, "learning_rate": 2.8804347826086957e-05, "loss": 0.4144, "step": 1330 }, { "epoch": 1.4454101032047801, "grad_norm": 0.27456087740930685, "learning_rate": 2.8784219001610308e-05, "loss": 0.4785, "step": 1331 }, { "epoch": 1.4464964693101576, "grad_norm": 0.23430255284225207, "learning_rate": 2.8764090177133658e-05, "loss": 0.4043, "step": 1332 }, { "epoch": 1.4475828354155351, "grad_norm": 0.2412036458015238, "learning_rate": 2.8743961352657005e-05, "loss": 0.4856, "step": 1333 }, { "epoch": 1.4486692015209126, "grad_norm": 0.2987770811350436, "learning_rate": 2.8723832528180356e-05, "loss": 0.4733, "step": 1334 }, { "epoch": 1.44975556762629, "grad_norm": 0.22483934292390634, "learning_rate": 2.8703703703703706e-05, "loss": 0.4399, "step": 1335 }, { "epoch": 1.4508419337316676, "grad_norm": 0.22670211239371835, "learning_rate": 2.8683574879227053e-05, "loss": 0.4737, "step": 1336 }, { "epoch": 1.451928299837045, "grad_norm": 0.3608992957039712, "learning_rate": 2.8663446054750404e-05, "loss": 0.5136, "step": 1337 }, { "epoch": 1.4530146659424226, "grad_norm": 0.23746005363754405, "learning_rate": 2.8643317230273754e-05, "loss": 0.4118, "step": 1338 }, { "epoch": 1.4541010320478, "grad_norm": 0.23396926192133843, "learning_rate": 2.86231884057971e-05, "loss": 0.4669, "step": 1339 }, { "epoch": 1.4551873981531775, "grad_norm": 0.25523420048470447, "learning_rate": 2.860305958132045e-05, "loss": 0.3926, "step": 1340 }, { "epoch": 1.456273764258555, "grad_norm": 0.2420403462585365, "learning_rate": 2.8582930756843802e-05, "loss": 0.4516, "step": 1341 }, { "epoch": 1.4573601303639325, "grad_norm": 0.22745557825856297, "learning_rate": 2.856280193236715e-05, "loss": 0.4326, "step": 1342 }, { "epoch": 1.4584464964693102, "grad_norm": 0.2314719622902338, "learning_rate": 2.85426731078905e-05, "loss": 0.4274, "step": 1343 }, { "epoch": 1.4595328625746877, "grad_norm": 0.23766092165409247, "learning_rate": 2.852254428341385e-05, "loss": 0.4643, "step": 1344 }, { "epoch": 1.4606192286800652, "grad_norm": 0.244078182454431, "learning_rate": 2.8502415458937197e-05, "loss": 0.3896, "step": 1345 }, { "epoch": 1.4617055947854427, "grad_norm": 0.23172024602661026, "learning_rate": 2.8482286634460548e-05, "loss": 0.4488, "step": 1346 }, { "epoch": 1.4627919608908202, "grad_norm": 0.23140996756842375, "learning_rate": 2.8462157809983898e-05, "loss": 0.4887, "step": 1347 }, { "epoch": 1.4638783269961977, "grad_norm": 0.26099672081308467, "learning_rate": 2.8442028985507245e-05, "loss": 0.493, "step": 1348 }, { "epoch": 1.4649646931015752, "grad_norm": 0.2777187730044441, "learning_rate": 2.8421900161030595e-05, "loss": 0.4435, "step": 1349 }, { "epoch": 1.4660510592069527, "grad_norm": 0.3762272337568911, "learning_rate": 2.8401771336553946e-05, "loss": 0.4556, "step": 1350 }, { "epoch": 1.4671374253123304, "grad_norm": 0.23682333876998138, "learning_rate": 2.8381642512077293e-05, "loss": 0.4465, "step": 1351 }, { "epoch": 1.4682237914177079, "grad_norm": 0.24867925056166043, "learning_rate": 2.8361513687600643e-05, "loss": 0.4038, "step": 1352 }, { "epoch": 1.4693101575230854, "grad_norm": 0.279849318080608, "learning_rate": 2.8341384863123994e-05, "loss": 0.4561, "step": 1353 }, { "epoch": 1.4703965236284628, "grad_norm": 0.24814394446756946, "learning_rate": 2.832125603864734e-05, "loss": 0.4448, "step": 1354 }, { "epoch": 1.4714828897338403, "grad_norm": 0.2291797228994703, "learning_rate": 2.830112721417069e-05, "loss": 0.4557, "step": 1355 }, { "epoch": 1.4725692558392178, "grad_norm": 0.25398218774162656, "learning_rate": 2.8280998389694042e-05, "loss": 0.4656, "step": 1356 }, { "epoch": 1.4736556219445953, "grad_norm": 0.25818428679468153, "learning_rate": 2.826086956521739e-05, "loss": 0.5125, "step": 1357 }, { "epoch": 1.4747419880499728, "grad_norm": 0.24374157310264943, "learning_rate": 2.824074074074074e-05, "loss": 0.5043, "step": 1358 }, { "epoch": 1.4758283541553503, "grad_norm": 0.278572678795655, "learning_rate": 2.822061191626409e-05, "loss": 0.4994, "step": 1359 }, { "epoch": 1.4769147202607278, "grad_norm": 0.25009178665931164, "learning_rate": 2.820048309178744e-05, "loss": 0.4204, "step": 1360 }, { "epoch": 1.4780010863661053, "grad_norm": 0.2277694608144695, "learning_rate": 2.8180354267310787e-05, "loss": 0.4213, "step": 1361 }, { "epoch": 1.4790874524714828, "grad_norm": 0.2372728007574726, "learning_rate": 2.8160225442834138e-05, "loss": 0.4456, "step": 1362 }, { "epoch": 1.4801738185768605, "grad_norm": 0.24852637988322676, "learning_rate": 2.8140096618357488e-05, "loss": 0.4378, "step": 1363 }, { "epoch": 1.481260184682238, "grad_norm": 0.2646148256579471, "learning_rate": 2.8119967793880842e-05, "loss": 0.508, "step": 1364 }, { "epoch": 1.4823465507876155, "grad_norm": 0.2385386187404478, "learning_rate": 2.8099838969404192e-05, "loss": 0.4924, "step": 1365 }, { "epoch": 1.483432916892993, "grad_norm": 0.24280611961972298, "learning_rate": 2.807971014492754e-05, "loss": 0.3984, "step": 1366 }, { "epoch": 1.4845192829983704, "grad_norm": 0.22473308656716673, "learning_rate": 2.805958132045089e-05, "loss": 0.4601, "step": 1367 }, { "epoch": 1.485605649103748, "grad_norm": 0.2578544830118931, "learning_rate": 2.803945249597424e-05, "loss": 0.4188, "step": 1368 }, { "epoch": 1.4866920152091254, "grad_norm": 0.22280541951972452, "learning_rate": 2.8019323671497587e-05, "loss": 0.4451, "step": 1369 }, { "epoch": 1.487778381314503, "grad_norm": 0.22926823316458908, "learning_rate": 2.7999194847020938e-05, "loss": 0.4797, "step": 1370 }, { "epoch": 1.4888647474198806, "grad_norm": 0.24597125878642298, "learning_rate": 2.7979066022544288e-05, "loss": 0.413, "step": 1371 }, { "epoch": 1.489951113525258, "grad_norm": 0.21985214960220667, "learning_rate": 2.7958937198067635e-05, "loss": 0.4663, "step": 1372 }, { "epoch": 1.4910374796306356, "grad_norm": 0.22407852556793345, "learning_rate": 2.7938808373590986e-05, "loss": 0.396, "step": 1373 }, { "epoch": 1.492123845736013, "grad_norm": 0.21114774034921593, "learning_rate": 2.7918679549114336e-05, "loss": 0.3808, "step": 1374 }, { "epoch": 1.4932102118413906, "grad_norm": 0.2325435544533275, "learning_rate": 2.7898550724637683e-05, "loss": 0.4195, "step": 1375 }, { "epoch": 1.494296577946768, "grad_norm": 0.2267942635253576, "learning_rate": 2.7878421900161034e-05, "loss": 0.4353, "step": 1376 }, { "epoch": 1.4953829440521456, "grad_norm": 0.22052317884640196, "learning_rate": 2.7858293075684384e-05, "loss": 0.5927, "step": 1377 }, { "epoch": 1.496469310157523, "grad_norm": 3.656806917586406, "learning_rate": 2.783816425120773e-05, "loss": 0.4243, "step": 1378 }, { "epoch": 1.4975556762629005, "grad_norm": 0.30970586872564376, "learning_rate": 2.781803542673108e-05, "loss": 0.4686, "step": 1379 }, { "epoch": 1.498642042368278, "grad_norm": 0.206313817884503, "learning_rate": 2.7797906602254432e-05, "loss": 0.3848, "step": 1380 }, { "epoch": 1.4997284084736555, "grad_norm": 0.2091237668159324, "learning_rate": 2.777777777777778e-05, "loss": 0.3915, "step": 1381 }, { "epoch": 1.500814774579033, "grad_norm": 0.2814890307557159, "learning_rate": 2.775764895330113e-05, "loss": 0.4406, "step": 1382 }, { "epoch": 1.5019011406844105, "grad_norm": 0.23194842585925082, "learning_rate": 2.773752012882448e-05, "loss": 0.4189, "step": 1383 }, { "epoch": 1.5029875067897882, "grad_norm": 0.2004074479803714, "learning_rate": 2.7717391304347827e-05, "loss": 0.4261, "step": 1384 }, { "epoch": 1.5040738728951657, "grad_norm": 0.23006520470948089, "learning_rate": 2.7697262479871177e-05, "loss": 0.4417, "step": 1385 }, { "epoch": 1.5051602390005432, "grad_norm": 0.22798280162619367, "learning_rate": 2.7677133655394528e-05, "loss": 0.404, "step": 1386 }, { "epoch": 1.5062466051059207, "grad_norm": 0.23536309563626204, "learning_rate": 2.7657004830917875e-05, "loss": 0.425, "step": 1387 }, { "epoch": 1.5073329712112982, "grad_norm": 0.2246956741974256, "learning_rate": 2.7636876006441225e-05, "loss": 0.4719, "step": 1388 }, { "epoch": 1.5084193373166759, "grad_norm": 0.23298195827292958, "learning_rate": 2.7616747181964576e-05, "loss": 0.4636, "step": 1389 }, { "epoch": 1.5095057034220534, "grad_norm": 0.2499194719582776, "learning_rate": 2.7596618357487923e-05, "loss": 0.4177, "step": 1390 }, { "epoch": 1.5105920695274309, "grad_norm": 0.2601867794857315, "learning_rate": 2.7576489533011273e-05, "loss": 0.435, "step": 1391 }, { "epoch": 1.5116784356328083, "grad_norm": 0.22447884260966602, "learning_rate": 2.7556360708534624e-05, "loss": 0.4901, "step": 1392 }, { "epoch": 1.5127648017381858, "grad_norm": 0.22565867582247612, "learning_rate": 2.753623188405797e-05, "loss": 0.3878, "step": 1393 }, { "epoch": 1.5138511678435633, "grad_norm": 0.22448174533408496, "learning_rate": 2.751610305958132e-05, "loss": 0.412, "step": 1394 }, { "epoch": 1.5149375339489408, "grad_norm": 0.22070258978577537, "learning_rate": 2.7495974235104672e-05, "loss": 0.4347, "step": 1395 }, { "epoch": 1.5160239000543183, "grad_norm": 0.21022871484317165, "learning_rate": 2.7475845410628022e-05, "loss": 0.4386, "step": 1396 }, { "epoch": 1.5171102661596958, "grad_norm": 0.24279972028786237, "learning_rate": 2.745571658615137e-05, "loss": 0.4278, "step": 1397 }, { "epoch": 1.5181966322650733, "grad_norm": 0.21633769069906458, "learning_rate": 2.743558776167472e-05, "loss": 0.4443, "step": 1398 }, { "epoch": 1.5192829983704508, "grad_norm": 0.2459438225114051, "learning_rate": 2.741545893719807e-05, "loss": 0.4259, "step": 1399 }, { "epoch": 1.5203693644758283, "grad_norm": 0.22377615935041106, "learning_rate": 2.7395330112721417e-05, "loss": 0.4691, "step": 1400 }, { "epoch": 1.5214557305812058, "grad_norm": 0.20801893069640698, "learning_rate": 2.7375201288244768e-05, "loss": 0.4172, "step": 1401 }, { "epoch": 1.5225420966865832, "grad_norm": 0.2673293057736015, "learning_rate": 2.7355072463768118e-05, "loss": 0.5043, "step": 1402 }, { "epoch": 1.5236284627919607, "grad_norm": 0.22579203317690252, "learning_rate": 2.7334943639291465e-05, "loss": 0.4781, "step": 1403 }, { "epoch": 1.5247148288973384, "grad_norm": 0.2517093163437243, "learning_rate": 2.7314814814814816e-05, "loss": 0.4806, "step": 1404 }, { "epoch": 1.525801195002716, "grad_norm": 0.260056974154187, "learning_rate": 2.7294685990338166e-05, "loss": 0.4788, "step": 1405 }, { "epoch": 1.5268875611080934, "grad_norm": 0.23997797269819238, "learning_rate": 2.7274557165861513e-05, "loss": 0.5006, "step": 1406 }, { "epoch": 1.527973927213471, "grad_norm": 0.2107406566634624, "learning_rate": 2.7254428341384863e-05, "loss": 0.4062, "step": 1407 }, { "epoch": 1.5290602933188484, "grad_norm": 0.24734723763889102, "learning_rate": 2.7234299516908214e-05, "loss": 0.4606, "step": 1408 }, { "epoch": 1.5301466594242261, "grad_norm": 0.24227324231245503, "learning_rate": 2.721417069243156e-05, "loss": 0.4482, "step": 1409 }, { "epoch": 1.5312330255296036, "grad_norm": 0.2169395966406519, "learning_rate": 2.719404186795491e-05, "loss": 0.408, "step": 1410 }, { "epoch": 1.532319391634981, "grad_norm": 0.20807202231208366, "learning_rate": 2.7173913043478262e-05, "loss": 0.4122, "step": 1411 }, { "epoch": 1.5334057577403586, "grad_norm": 0.2372973865680712, "learning_rate": 2.715378421900161e-05, "loss": 0.4088, "step": 1412 }, { "epoch": 1.534492123845736, "grad_norm": 0.2371128020448619, "learning_rate": 2.713365539452496e-05, "loss": 0.4578, "step": 1413 }, { "epoch": 1.5355784899511136, "grad_norm": 0.20889091625850692, "learning_rate": 2.711352657004831e-05, "loss": 0.4679, "step": 1414 }, { "epoch": 1.536664856056491, "grad_norm": 0.2558551882056223, "learning_rate": 2.7093397745571657e-05, "loss": 0.4571, "step": 1415 }, { "epoch": 1.5377512221618685, "grad_norm": 0.2733579761727715, "learning_rate": 2.7073268921095007e-05, "loss": 0.4658, "step": 1416 }, { "epoch": 1.538837588267246, "grad_norm": 0.21852655825104433, "learning_rate": 2.7053140096618358e-05, "loss": 0.4197, "step": 1417 }, { "epoch": 1.5399239543726235, "grad_norm": 0.23194354606477013, "learning_rate": 2.7033011272141705e-05, "loss": 0.4587, "step": 1418 }, { "epoch": 1.541010320478001, "grad_norm": 0.24943563624233964, "learning_rate": 2.7012882447665055e-05, "loss": 0.425, "step": 1419 }, { "epoch": 1.5420966865833785, "grad_norm": 0.2194920380773244, "learning_rate": 2.6992753623188406e-05, "loss": 0.4497, "step": 1420 }, { "epoch": 1.543183052688756, "grad_norm": 0.4181958797938272, "learning_rate": 2.6972624798711753e-05, "loss": 0.4948, "step": 1421 }, { "epoch": 1.5442694187941335, "grad_norm": 0.22785075197766488, "learning_rate": 2.6952495974235103e-05, "loss": 0.4445, "step": 1422 }, { "epoch": 1.545355784899511, "grad_norm": 0.21100584748202564, "learning_rate": 2.6932367149758454e-05, "loss": 0.3879, "step": 1423 }, { "epoch": 1.5464421510048887, "grad_norm": 0.24606167861317402, "learning_rate": 2.69122383252818e-05, "loss": 0.4563, "step": 1424 }, { "epoch": 1.5475285171102662, "grad_norm": 0.24471338733605763, "learning_rate": 2.689210950080515e-05, "loss": 0.4073, "step": 1425 }, { "epoch": 1.5486148832156437, "grad_norm": 0.20011033211452542, "learning_rate": 2.6871980676328505e-05, "loss": 0.4984, "step": 1426 }, { "epoch": 1.5497012493210212, "grad_norm": 0.2351639318826315, "learning_rate": 2.6851851851851855e-05, "loss": 0.4591, "step": 1427 }, { "epoch": 1.5507876154263986, "grad_norm": 0.2627383630699222, "learning_rate": 2.6831723027375206e-05, "loss": 0.4227, "step": 1428 }, { "epoch": 1.5518739815317764, "grad_norm": 0.5023740432072241, "learning_rate": 2.6811594202898553e-05, "loss": 0.4072, "step": 1429 }, { "epoch": 1.5529603476371538, "grad_norm": 0.2105297425302088, "learning_rate": 2.6791465378421903e-05, "loss": 0.4077, "step": 1430 }, { "epoch": 1.5540467137425313, "grad_norm": 1.0627481457456227, "learning_rate": 2.6771336553945254e-05, "loss": 0.4075, "step": 1431 }, { "epoch": 1.5551330798479088, "grad_norm": 0.20850303568009032, "learning_rate": 2.6751207729468604e-05, "loss": 0.4376, "step": 1432 }, { "epoch": 1.5562194459532863, "grad_norm": 0.22913056052225242, "learning_rate": 2.673107890499195e-05, "loss": 0.5775, "step": 1433 }, { "epoch": 1.5573058120586638, "grad_norm": 18.182624231327885, "learning_rate": 2.6710950080515302e-05, "loss": 0.4415, "step": 1434 }, { "epoch": 1.5583921781640413, "grad_norm": 0.26018823597851237, "learning_rate": 2.6690821256038652e-05, "loss": 0.488, "step": 1435 }, { "epoch": 1.5594785442694188, "grad_norm": 0.20993086229709648, "learning_rate": 2.6670692431562e-05, "loss": 0.3933, "step": 1436 }, { "epoch": 1.5605649103747963, "grad_norm": 0.2313776782180547, "learning_rate": 2.665056360708535e-05, "loss": 0.4955, "step": 1437 }, { "epoch": 1.5616512764801738, "grad_norm": 0.2085314794722738, "learning_rate": 2.66304347826087e-05, "loss": 0.4402, "step": 1438 }, { "epoch": 1.5627376425855513, "grad_norm": 0.2146092210272079, "learning_rate": 2.6610305958132047e-05, "loss": 0.4558, "step": 1439 }, { "epoch": 1.5638240086909287, "grad_norm": 0.34840840977866994, "learning_rate": 2.6590177133655398e-05, "loss": 0.4498, "step": 1440 }, { "epoch": 1.5649103747963062, "grad_norm": 0.231171774131653, "learning_rate": 2.6570048309178748e-05, "loss": 0.4893, "step": 1441 }, { "epoch": 1.5659967409016837, "grad_norm": 0.8740506080014578, "learning_rate": 2.6549919484702095e-05, "loss": 0.4799, "step": 1442 }, { "epoch": 1.5670831070070612, "grad_norm": 0.24883522488970566, "learning_rate": 2.6529790660225446e-05, "loss": 0.4372, "step": 1443 }, { "epoch": 1.568169473112439, "grad_norm": 0.23111216543913368, "learning_rate": 2.6509661835748796e-05, "loss": 0.4111, "step": 1444 }, { "epoch": 1.5692558392178164, "grad_norm": 0.24524201346519303, "learning_rate": 2.6489533011272143e-05, "loss": 0.4833, "step": 1445 }, { "epoch": 1.570342205323194, "grad_norm": 0.2382550710734777, "learning_rate": 2.6469404186795493e-05, "loss": 0.4492, "step": 1446 }, { "epoch": 1.5714285714285714, "grad_norm": 0.2668723039826882, "learning_rate": 2.6449275362318844e-05, "loss": 0.4454, "step": 1447 }, { "epoch": 1.5725149375339489, "grad_norm": 0.2768757274841134, "learning_rate": 2.642914653784219e-05, "loss": 0.4485, "step": 1448 }, { "epoch": 1.5736013036393266, "grad_norm": 0.24496819804388453, "learning_rate": 2.640901771336554e-05, "loss": 0.4634, "step": 1449 }, { "epoch": 1.574687669744704, "grad_norm": 0.2309677788099744, "learning_rate": 2.6388888888888892e-05, "loss": 0.4652, "step": 1450 }, { "epoch": 1.5757740358500816, "grad_norm": 0.22167265146935483, "learning_rate": 2.636876006441224e-05, "loss": 0.4406, "step": 1451 }, { "epoch": 1.576860401955459, "grad_norm": 0.23401185536534747, "learning_rate": 2.634863123993559e-05, "loss": 0.4818, "step": 1452 }, { "epoch": 1.5779467680608366, "grad_norm": 0.23451302632858212, "learning_rate": 2.632850241545894e-05, "loss": 0.4794, "step": 1453 }, { "epoch": 1.579033134166214, "grad_norm": 0.22381989168558708, "learning_rate": 2.6308373590982287e-05, "loss": 0.4341, "step": 1454 }, { "epoch": 1.5801195002715915, "grad_norm": 0.2221536107874731, "learning_rate": 2.6288244766505637e-05, "loss": 0.4334, "step": 1455 }, { "epoch": 1.581205866376969, "grad_norm": 0.24214725323086708, "learning_rate": 2.6268115942028988e-05, "loss": 0.482, "step": 1456 }, { "epoch": 1.5822922324823465, "grad_norm": 0.2667011240388616, "learning_rate": 2.6247987117552335e-05, "loss": 0.5021, "step": 1457 }, { "epoch": 1.583378598587724, "grad_norm": 0.23574037312801782, "learning_rate": 2.6227858293075685e-05, "loss": 0.4949, "step": 1458 }, { "epoch": 1.5844649646931015, "grad_norm": 0.2348090616189591, "learning_rate": 2.6207729468599036e-05, "loss": 0.4392, "step": 1459 }, { "epoch": 1.585551330798479, "grad_norm": 0.24162303467696905, "learning_rate": 2.6187600644122383e-05, "loss": 0.4357, "step": 1460 }, { "epoch": 1.5866376969038565, "grad_norm": 0.22982161007131469, "learning_rate": 2.6167471819645733e-05, "loss": 0.4778, "step": 1461 }, { "epoch": 1.587724063009234, "grad_norm": 0.252119245987095, "learning_rate": 2.6147342995169084e-05, "loss": 0.4856, "step": 1462 }, { "epoch": 1.5888104291146117, "grad_norm": 0.22244086598282226, "learning_rate": 2.612721417069243e-05, "loss": 0.4605, "step": 1463 }, { "epoch": 1.5898967952199892, "grad_norm": 1.342454664233099, "learning_rate": 2.610708534621578e-05, "loss": 0.5053, "step": 1464 }, { "epoch": 1.5909831613253667, "grad_norm": 0.2558340986929565, "learning_rate": 2.608695652173913e-05, "loss": 0.4463, "step": 1465 }, { "epoch": 1.5920695274307441, "grad_norm": 0.2085274572337876, "learning_rate": 2.6066827697262482e-05, "loss": 0.4731, "step": 1466 }, { "epoch": 1.5931558935361216, "grad_norm": 0.22427144081164402, "learning_rate": 2.604669887278583e-05, "loss": 0.432, "step": 1467 }, { "epoch": 1.5942422596414993, "grad_norm": 0.23790372424961667, "learning_rate": 2.602657004830918e-05, "loss": 0.4441, "step": 1468 }, { "epoch": 1.5953286257468768, "grad_norm": 1.2292995747267546, "learning_rate": 2.600644122383253e-05, "loss": 0.4614, "step": 1469 }, { "epoch": 1.5964149918522543, "grad_norm": 0.2392934733252527, "learning_rate": 2.5986312399355877e-05, "loss": 0.4646, "step": 1470 }, { "epoch": 1.5975013579576318, "grad_norm": 0.2497903488476396, "learning_rate": 2.5966183574879227e-05, "loss": 0.4711, "step": 1471 }, { "epoch": 1.5985877240630093, "grad_norm": 0.22829762116187363, "learning_rate": 2.5946054750402578e-05, "loss": 0.4755, "step": 1472 }, { "epoch": 1.5996740901683868, "grad_norm": 3.090829719833534, "learning_rate": 2.5925925925925925e-05, "loss": 0.494, "step": 1473 }, { "epoch": 1.6007604562737643, "grad_norm": 0.2577525730416267, "learning_rate": 2.5905797101449275e-05, "loss": 0.4353, "step": 1474 }, { "epoch": 1.6018468223791418, "grad_norm": 0.23097619915568207, "learning_rate": 2.5885668276972626e-05, "loss": 0.4507, "step": 1475 }, { "epoch": 1.6029331884845193, "grad_norm": 0.2390279550788415, "learning_rate": 2.5865539452495973e-05, "loss": 0.3678, "step": 1476 }, { "epoch": 1.6040195545898968, "grad_norm": 0.3470150562505843, "learning_rate": 2.5845410628019323e-05, "loss": 0.445, "step": 1477 }, { "epoch": 1.6051059206952742, "grad_norm": 0.23106828030966287, "learning_rate": 2.5825281803542674e-05, "loss": 0.4123, "step": 1478 }, { "epoch": 1.6061922868006517, "grad_norm": 0.2462150887145053, "learning_rate": 2.580515297906602e-05, "loss": 0.4539, "step": 1479 }, { "epoch": 1.6072786529060292, "grad_norm": 0.24813204055872873, "learning_rate": 2.578502415458937e-05, "loss": 0.4531, "step": 1480 }, { "epoch": 1.6083650190114067, "grad_norm": 0.24086359054265946, "learning_rate": 2.576489533011272e-05, "loss": 0.4341, "step": 1481 }, { "epoch": 1.6094513851167842, "grad_norm": 0.23608901165794802, "learning_rate": 2.574476650563607e-05, "loss": 0.4622, "step": 1482 }, { "epoch": 1.610537751222162, "grad_norm": 0.2518048511135043, "learning_rate": 2.572463768115942e-05, "loss": 0.4846, "step": 1483 }, { "epoch": 1.6116241173275394, "grad_norm": 0.24291303292827512, "learning_rate": 2.570450885668277e-05, "loss": 0.452, "step": 1484 }, { "epoch": 1.612710483432917, "grad_norm": 0.23136839830511577, "learning_rate": 2.5684380032206117e-05, "loss": 0.4292, "step": 1485 }, { "epoch": 1.6137968495382944, "grad_norm": 0.24196953993188414, "learning_rate": 2.5664251207729467e-05, "loss": 0.4947, "step": 1486 }, { "epoch": 1.6148832156436719, "grad_norm": 0.23987637215275673, "learning_rate": 2.5644122383252818e-05, "loss": 0.4549, "step": 1487 }, { "epoch": 1.6159695817490496, "grad_norm": 0.23025367316768597, "learning_rate": 2.562399355877617e-05, "loss": 0.4354, "step": 1488 }, { "epoch": 1.617055947854427, "grad_norm": 0.24030750741409113, "learning_rate": 2.5603864734299522e-05, "loss": 0.4885, "step": 1489 }, { "epoch": 1.6181423139598046, "grad_norm": 0.2196159455533167, "learning_rate": 2.558373590982287e-05, "loss": 0.4384, "step": 1490 }, { "epoch": 1.619228680065182, "grad_norm": 0.23429642473362447, "learning_rate": 2.556360708534622e-05, "loss": 0.4976, "step": 1491 }, { "epoch": 1.6203150461705595, "grad_norm": 0.2270459460995555, "learning_rate": 2.554347826086957e-05, "loss": 0.4357, "step": 1492 }, { "epoch": 1.621401412275937, "grad_norm": 0.2390587782884978, "learning_rate": 2.5523349436392917e-05, "loss": 0.4684, "step": 1493 }, { "epoch": 1.6224877783813145, "grad_norm": 0.25749463087119373, "learning_rate": 2.5503220611916267e-05, "loss": 0.4747, "step": 1494 }, { "epoch": 1.623574144486692, "grad_norm": 0.20073696796734447, "learning_rate": 2.5483091787439618e-05, "loss": 0.4028, "step": 1495 }, { "epoch": 1.6246605105920695, "grad_norm": 2.2758272900521233, "learning_rate": 2.5462962962962965e-05, "loss": 0.5022, "step": 1496 }, { "epoch": 1.625746876697447, "grad_norm": 0.23555546213023887, "learning_rate": 2.5442834138486315e-05, "loss": 0.4357, "step": 1497 }, { "epoch": 1.6268332428028245, "grad_norm": 0.26999588514437595, "learning_rate": 2.5422705314009666e-05, "loss": 0.4512, "step": 1498 }, { "epoch": 1.627919608908202, "grad_norm": 0.22835435083357958, "learning_rate": 2.5402576489533013e-05, "loss": 0.417, "step": 1499 }, { "epoch": 1.6290059750135795, "grad_norm": 0.22486460570630462, "learning_rate": 2.5382447665056363e-05, "loss": 0.4351, "step": 1500 }, { "epoch": 1.630092341118957, "grad_norm": 0.27609594325927467, "learning_rate": 2.5362318840579714e-05, "loss": 0.4752, "step": 1501 }, { "epoch": 1.6311787072243344, "grad_norm": 0.25395443844692384, "learning_rate": 2.534219001610306e-05, "loss": 0.4658, "step": 1502 }, { "epoch": 1.6322650733297122, "grad_norm": 0.2108789674204526, "learning_rate": 2.532206119162641e-05, "loss": 0.4344, "step": 1503 }, { "epoch": 1.6333514394350896, "grad_norm": 0.22244872236118615, "learning_rate": 2.530193236714976e-05, "loss": 0.4191, "step": 1504 }, { "epoch": 1.6344378055404671, "grad_norm": 0.24442045420497363, "learning_rate": 2.5281803542673112e-05, "loss": 0.4914, "step": 1505 }, { "epoch": 1.6355241716458446, "grad_norm": 0.23655655434237638, "learning_rate": 2.526167471819646e-05, "loss": 0.4768, "step": 1506 }, { "epoch": 1.6366105377512221, "grad_norm": 0.21437612978018564, "learning_rate": 2.524154589371981e-05, "loss": 0.4096, "step": 1507 }, { "epoch": 1.6376969038565998, "grad_norm": 0.24352838220454648, "learning_rate": 2.522141706924316e-05, "loss": 0.478, "step": 1508 }, { "epoch": 1.6387832699619773, "grad_norm": 0.2215737496831947, "learning_rate": 2.5201288244766507e-05, "loss": 0.4306, "step": 1509 }, { "epoch": 1.6398696360673548, "grad_norm": 0.20382101352976537, "learning_rate": 2.5181159420289857e-05, "loss": 0.4421, "step": 1510 }, { "epoch": 1.6409560021727323, "grad_norm": 0.23540054267485058, "learning_rate": 2.5161030595813208e-05, "loss": 0.389, "step": 1511 }, { "epoch": 1.6420423682781098, "grad_norm": 0.2235432379786471, "learning_rate": 2.5140901771336555e-05, "loss": 0.5115, "step": 1512 }, { "epoch": 1.6431287343834873, "grad_norm": 0.2075075687414023, "learning_rate": 2.5120772946859905e-05, "loss": 0.4472, "step": 1513 }, { "epoch": 1.6442151004888648, "grad_norm": 0.26021924204912017, "learning_rate": 2.5100644122383256e-05, "loss": 0.4541, "step": 1514 }, { "epoch": 1.6453014665942423, "grad_norm": 0.22374648461573285, "learning_rate": 2.5080515297906603e-05, "loss": 0.4264, "step": 1515 }, { "epoch": 1.6463878326996197, "grad_norm": 0.22972135693927978, "learning_rate": 2.5060386473429953e-05, "loss": 0.4718, "step": 1516 }, { "epoch": 1.6474741988049972, "grad_norm": 0.2299034813026004, "learning_rate": 2.5040257648953304e-05, "loss": 0.4709, "step": 1517 }, { "epoch": 1.6485605649103747, "grad_norm": 0.22467460950939125, "learning_rate": 2.502012882447665e-05, "loss": 0.4386, "step": 1518 }, { "epoch": 1.6496469310157522, "grad_norm": 0.23702229280084838, "learning_rate": 2.5e-05, "loss": 0.4486, "step": 1519 }, { "epoch": 1.6507332971211297, "grad_norm": 0.21847402978948496, "learning_rate": 2.497987117552335e-05, "loss": 0.4126, "step": 1520 }, { "epoch": 1.6518196632265072, "grad_norm": 0.20498945088247, "learning_rate": 2.49597423510467e-05, "loss": 0.4239, "step": 1521 }, { "epoch": 1.6529060293318847, "grad_norm": 0.2243239363170583, "learning_rate": 2.493961352657005e-05, "loss": 0.4396, "step": 1522 }, { "epoch": 1.6539923954372624, "grad_norm": 0.2157489538630051, "learning_rate": 2.49194847020934e-05, "loss": 0.4978, "step": 1523 }, { "epoch": 1.6550787615426399, "grad_norm": 0.22831609153318602, "learning_rate": 2.4899355877616747e-05, "loss": 0.5017, "step": 1524 }, { "epoch": 1.6561651276480174, "grad_norm": 0.2188628120894474, "learning_rate": 2.4879227053140097e-05, "loss": 0.4401, "step": 1525 }, { "epoch": 1.6572514937533949, "grad_norm": 2.848084130863775, "learning_rate": 2.4859098228663448e-05, "loss": 0.5343, "step": 1526 }, { "epoch": 1.6583378598587724, "grad_norm": 0.23285267650479952, "learning_rate": 2.4838969404186795e-05, "loss": 0.4856, "step": 1527 }, { "epoch": 1.65942422596415, "grad_norm": 0.23468459697243294, "learning_rate": 2.4818840579710145e-05, "loss": 0.4253, "step": 1528 }, { "epoch": 1.6605105920695276, "grad_norm": 0.21944917394574298, "learning_rate": 2.4798711755233495e-05, "loss": 0.5152, "step": 1529 }, { "epoch": 1.661596958174905, "grad_norm": 0.21003390793942028, "learning_rate": 2.4778582930756843e-05, "loss": 0.4246, "step": 1530 }, { "epoch": 1.6626833242802825, "grad_norm": 0.2363557389354369, "learning_rate": 2.4758454106280193e-05, "loss": 0.4397, "step": 1531 }, { "epoch": 1.66376969038566, "grad_norm": 0.22741870255490632, "learning_rate": 2.4738325281803543e-05, "loss": 0.465, "step": 1532 }, { "epoch": 1.6648560564910375, "grad_norm": 0.21753072129157985, "learning_rate": 2.471819645732689e-05, "loss": 0.4952, "step": 1533 }, { "epoch": 1.665942422596415, "grad_norm": 0.21855240010729024, "learning_rate": 2.469806763285024e-05, "loss": 0.4235, "step": 1534 }, { "epoch": 1.6670287887017925, "grad_norm": 0.22725388136180258, "learning_rate": 2.4677938808373595e-05, "loss": 0.4585, "step": 1535 }, { "epoch": 1.66811515480717, "grad_norm": 0.21984330464663945, "learning_rate": 2.4657809983896942e-05, "loss": 0.453, "step": 1536 }, { "epoch": 1.6692015209125475, "grad_norm": 0.24757275323846634, "learning_rate": 2.4637681159420292e-05, "loss": 0.4355, "step": 1537 }, { "epoch": 1.670287887017925, "grad_norm": 0.24092302171269703, "learning_rate": 2.4617552334943643e-05, "loss": 0.4773, "step": 1538 }, { "epoch": 1.6713742531233025, "grad_norm": 0.24418629913275142, "learning_rate": 2.459742351046699e-05, "loss": 0.465, "step": 1539 }, { "epoch": 1.67246061922868, "grad_norm": 0.22608060201343, "learning_rate": 2.457729468599034e-05, "loss": 0.4418, "step": 1540 }, { "epoch": 1.6735469853340574, "grad_norm": 0.2176284707598746, "learning_rate": 2.455716586151369e-05, "loss": 0.4295, "step": 1541 }, { "epoch": 1.674633351439435, "grad_norm": 0.23121752911509594, "learning_rate": 2.4537037037037038e-05, "loss": 0.4546, "step": 1542 }, { "epoch": 1.6757197175448126, "grad_norm": 0.233835576267656, "learning_rate": 2.4516908212560388e-05, "loss": 0.4427, "step": 1543 }, { "epoch": 1.6768060836501901, "grad_norm": 3.4240827686743067, "learning_rate": 2.449677938808374e-05, "loss": 0.5854, "step": 1544 }, { "epoch": 1.6778924497555676, "grad_norm": 0.29225496506890764, "learning_rate": 2.4476650563607086e-05, "loss": 0.4148, "step": 1545 }, { "epoch": 1.678978815860945, "grad_norm": 0.274607861012683, "learning_rate": 2.4456521739130436e-05, "loss": 0.4574, "step": 1546 }, { "epoch": 1.6800651819663226, "grad_norm": 0.2517967243752364, "learning_rate": 2.4436392914653786e-05, "loss": 0.4491, "step": 1547 }, { "epoch": 1.6811515480717003, "grad_norm": 0.23741900503018984, "learning_rate": 2.4416264090177134e-05, "loss": 0.4133, "step": 1548 }, { "epoch": 1.6822379141770778, "grad_norm": 4.08631048765838, "learning_rate": 2.4396135265700484e-05, "loss": 0.5232, "step": 1549 }, { "epoch": 1.6833242802824553, "grad_norm": 0.34698837053247056, "learning_rate": 2.4376006441223834e-05, "loss": 0.4167, "step": 1550 }, { "epoch": 1.6844106463878328, "grad_norm": 0.27309081445586864, "learning_rate": 2.435587761674718e-05, "loss": 0.4496, "step": 1551 }, { "epoch": 1.6854970124932103, "grad_norm": 0.27428145843922325, "learning_rate": 2.4335748792270532e-05, "loss": 0.4832, "step": 1552 }, { "epoch": 1.6865833785985878, "grad_norm": 0.27947285375374764, "learning_rate": 2.4315619967793882e-05, "loss": 0.4247, "step": 1553 }, { "epoch": 1.6876697447039652, "grad_norm": 0.32936934968518766, "learning_rate": 2.429549114331723e-05, "loss": 0.4285, "step": 1554 }, { "epoch": 1.6887561108093427, "grad_norm": 0.2359189563828577, "learning_rate": 2.427536231884058e-05, "loss": 0.4528, "step": 1555 }, { "epoch": 1.6898424769147202, "grad_norm": 0.3620515241481902, "learning_rate": 2.425523349436393e-05, "loss": 0.4271, "step": 1556 }, { "epoch": 1.6909288430200977, "grad_norm": 0.27376293370976257, "learning_rate": 2.423510466988728e-05, "loss": 0.4149, "step": 1557 }, { "epoch": 1.6920152091254752, "grad_norm": 0.2520482178749143, "learning_rate": 2.4214975845410628e-05, "loss": 0.4439, "step": 1558 }, { "epoch": 1.6931015752308527, "grad_norm": 0.2641630438694232, "learning_rate": 2.4194847020933978e-05, "loss": 0.5172, "step": 1559 }, { "epoch": 1.6941879413362302, "grad_norm": 0.24175762260084618, "learning_rate": 2.417471819645733e-05, "loss": 0.4433, "step": 1560 }, { "epoch": 1.6952743074416077, "grad_norm": 0.23037212114774208, "learning_rate": 2.4154589371980676e-05, "loss": 0.4603, "step": 1561 }, { "epoch": 1.6963606735469854, "grad_norm": 0.2550386160752378, "learning_rate": 2.4134460547504026e-05, "loss": 0.3824, "step": 1562 }, { "epoch": 1.6974470396523629, "grad_norm": 0.22690348783079994, "learning_rate": 2.4114331723027377e-05, "loss": 0.4209, "step": 1563 }, { "epoch": 1.6985334057577404, "grad_norm": 0.3102597286478119, "learning_rate": 2.4094202898550724e-05, "loss": 0.5333, "step": 1564 }, { "epoch": 1.6996197718631179, "grad_norm": 0.24108676105135354, "learning_rate": 2.4074074074074074e-05, "loss": 0.4214, "step": 1565 }, { "epoch": 1.7007061379684953, "grad_norm": 0.23601557116577393, "learning_rate": 2.4053945249597425e-05, "loss": 0.4276, "step": 1566 }, { "epoch": 1.701792504073873, "grad_norm": 0.24745384812145635, "learning_rate": 2.4033816425120775e-05, "loss": 0.4393, "step": 1567 }, { "epoch": 1.7028788701792505, "grad_norm": 0.21851982222096508, "learning_rate": 2.4013687600644125e-05, "loss": 0.4237, "step": 1568 }, { "epoch": 1.703965236284628, "grad_norm": 0.28223937418235445, "learning_rate": 2.3993558776167472e-05, "loss": 0.4811, "step": 1569 }, { "epoch": 1.7050516023900055, "grad_norm": 0.2510734553509359, "learning_rate": 2.3973429951690823e-05, "loss": 0.4215, "step": 1570 }, { "epoch": 1.706137968495383, "grad_norm": 0.2059065802994948, "learning_rate": 2.3953301127214173e-05, "loss": 0.4889, "step": 1571 }, { "epoch": 1.7072243346007605, "grad_norm": 0.2695614102127069, "learning_rate": 2.393317230273752e-05, "loss": 0.4451, "step": 1572 }, { "epoch": 1.708310700706138, "grad_norm": 0.30939089799688296, "learning_rate": 2.391304347826087e-05, "loss": 0.4481, "step": 1573 }, { "epoch": 1.7093970668115155, "grad_norm": 0.23116743423724095, "learning_rate": 2.389291465378422e-05, "loss": 0.4383, "step": 1574 }, { "epoch": 1.710483432916893, "grad_norm": 0.933342770471384, "learning_rate": 2.3872785829307572e-05, "loss": 0.3913, "step": 1575 }, { "epoch": 1.7115697990222705, "grad_norm": 0.2209086721451814, "learning_rate": 2.385265700483092e-05, "loss": 0.4705, "step": 1576 }, { "epoch": 1.712656165127648, "grad_norm": 0.28825704303257466, "learning_rate": 2.383252818035427e-05, "loss": 0.4879, "step": 1577 }, { "epoch": 1.7137425312330254, "grad_norm": 0.23299823445382284, "learning_rate": 2.381239935587762e-05, "loss": 0.4186, "step": 1578 }, { "epoch": 1.714828897338403, "grad_norm": 0.2347351492619705, "learning_rate": 2.3792270531400967e-05, "loss": 0.4816, "step": 1579 }, { "epoch": 1.7159152634437804, "grad_norm": 0.22165509209729997, "learning_rate": 2.3772141706924317e-05, "loss": 0.4242, "step": 1580 }, { "epoch": 1.717001629549158, "grad_norm": 0.253358653614834, "learning_rate": 2.3752012882447668e-05, "loss": 0.448, "step": 1581 }, { "epoch": 1.7180879956545356, "grad_norm": 0.2209760046217434, "learning_rate": 2.3731884057971015e-05, "loss": 0.4685, "step": 1582 }, { "epoch": 1.7191743617599131, "grad_norm": 0.2099261657171052, "learning_rate": 2.3711755233494365e-05, "loss": 0.4002, "step": 1583 }, { "epoch": 1.7202607278652906, "grad_norm": 0.23223298997280403, "learning_rate": 2.3691626409017716e-05, "loss": 0.4325, "step": 1584 }, { "epoch": 1.721347093970668, "grad_norm": 0.2252607472832024, "learning_rate": 2.3671497584541063e-05, "loss": 0.4408, "step": 1585 }, { "epoch": 1.7224334600760456, "grad_norm": 0.20407024340881222, "learning_rate": 2.3651368760064413e-05, "loss": 0.4018, "step": 1586 }, { "epoch": 1.7235198261814233, "grad_norm": 0.21408093840328893, "learning_rate": 2.3631239935587763e-05, "loss": 0.4376, "step": 1587 }, { "epoch": 1.7246061922868008, "grad_norm": 0.20740926088132386, "learning_rate": 2.361111111111111e-05, "loss": 0.3408, "step": 1588 }, { "epoch": 1.7256925583921783, "grad_norm": 0.21943087885932755, "learning_rate": 2.359098228663446e-05, "loss": 0.4792, "step": 1589 }, { "epoch": 1.7267789244975558, "grad_norm": 0.24583986372820857, "learning_rate": 2.357085346215781e-05, "loss": 0.4699, "step": 1590 }, { "epoch": 1.7278652906029333, "grad_norm": 0.20080224543396666, "learning_rate": 2.355072463768116e-05, "loss": 0.3808, "step": 1591 }, { "epoch": 1.7289516567083107, "grad_norm": 0.3019440098055368, "learning_rate": 2.353059581320451e-05, "loss": 0.3646, "step": 1592 }, { "epoch": 1.7300380228136882, "grad_norm": 0.21695015051968813, "learning_rate": 2.351046698872786e-05, "loss": 0.4283, "step": 1593 }, { "epoch": 1.7311243889190657, "grad_norm": 0.26527315899957365, "learning_rate": 2.3490338164251206e-05, "loss": 0.4322, "step": 1594 }, { "epoch": 1.7322107550244432, "grad_norm": 0.21936419437967947, "learning_rate": 2.3470209339774557e-05, "loss": 0.4808, "step": 1595 }, { "epoch": 1.7332971211298207, "grad_norm": 0.2235748457517892, "learning_rate": 2.3450080515297907e-05, "loss": 0.463, "step": 1596 }, { "epoch": 1.7343834872351982, "grad_norm": 0.23638733985259713, "learning_rate": 2.3429951690821258e-05, "loss": 0.4925, "step": 1597 }, { "epoch": 1.7354698533405757, "grad_norm": 0.25449710688014815, "learning_rate": 2.3409822866344608e-05, "loss": 0.4648, "step": 1598 }, { "epoch": 1.7365562194459532, "grad_norm": 0.23114015231943716, "learning_rate": 2.338969404186796e-05, "loss": 0.4995, "step": 1599 }, { "epoch": 1.7376425855513307, "grad_norm": 0.2402535023507465, "learning_rate": 2.3369565217391306e-05, "loss": 0.4744, "step": 1600 }, { "epoch": 1.7387289516567082, "grad_norm": 0.2217321667058383, "learning_rate": 2.3349436392914656e-05, "loss": 0.4506, "step": 1601 }, { "epoch": 1.7398153177620859, "grad_norm": 0.22133900624281572, "learning_rate": 2.3329307568438007e-05, "loss": 0.398, "step": 1602 }, { "epoch": 1.7409016838674634, "grad_norm": 0.23817773174331494, "learning_rate": 2.3309178743961354e-05, "loss": 0.4873, "step": 1603 }, { "epoch": 1.7419880499728408, "grad_norm": 0.23848167935638206, "learning_rate": 2.3289049919484704e-05, "loss": 0.4117, "step": 1604 }, { "epoch": 1.7430744160782183, "grad_norm": 0.24001260406894065, "learning_rate": 2.3268921095008055e-05, "loss": 0.4686, "step": 1605 }, { "epoch": 1.7441607821835958, "grad_norm": 0.2259642714530892, "learning_rate": 2.32487922705314e-05, "loss": 0.367, "step": 1606 }, { "epoch": 1.7452471482889735, "grad_norm": 0.2548788955144687, "learning_rate": 2.3228663446054752e-05, "loss": 0.4893, "step": 1607 }, { "epoch": 1.746333514394351, "grad_norm": 0.22680821708797338, "learning_rate": 2.3208534621578102e-05, "loss": 0.4433, "step": 1608 }, { "epoch": 1.7474198804997285, "grad_norm": 0.2155887575311321, "learning_rate": 2.318840579710145e-05, "loss": 0.4231, "step": 1609 }, { "epoch": 1.748506246605106, "grad_norm": 0.2358763524278629, "learning_rate": 2.31682769726248e-05, "loss": 0.4332, "step": 1610 }, { "epoch": 1.7495926127104835, "grad_norm": 0.18395673673723473, "learning_rate": 2.314814814814815e-05, "loss": 0.3815, "step": 1611 }, { "epoch": 1.750678978815861, "grad_norm": 0.21567495034167328, "learning_rate": 2.3128019323671497e-05, "loss": 0.4032, "step": 1612 }, { "epoch": 1.7517653449212385, "grad_norm": 0.23368021854283832, "learning_rate": 2.3107890499194848e-05, "loss": 0.4826, "step": 1613 }, { "epoch": 1.752851711026616, "grad_norm": 0.2016586612641964, "learning_rate": 2.30877616747182e-05, "loss": 0.46, "step": 1614 }, { "epoch": 1.7539380771319935, "grad_norm": 0.20555083533447668, "learning_rate": 2.3067632850241545e-05, "loss": 0.3946, "step": 1615 }, { "epoch": 1.755024443237371, "grad_norm": 0.23098045479161675, "learning_rate": 2.3047504025764896e-05, "loss": 0.4389, "step": 1616 }, { "epoch": 1.7561108093427484, "grad_norm": 0.5503966123110782, "learning_rate": 2.3027375201288246e-05, "loss": 0.4392, "step": 1617 }, { "epoch": 1.757197175448126, "grad_norm": 0.22784854943300148, "learning_rate": 2.3007246376811593e-05, "loss": 0.4546, "step": 1618 }, { "epoch": 1.7582835415535034, "grad_norm": 0.21575938479544504, "learning_rate": 2.2987117552334944e-05, "loss": 0.4828, "step": 1619 }, { "epoch": 1.759369907658881, "grad_norm": 0.2425022814841582, "learning_rate": 2.2966988727858294e-05, "loss": 0.4842, "step": 1620 }, { "epoch": 1.7604562737642584, "grad_norm": 0.21595197727235968, "learning_rate": 2.294685990338164e-05, "loss": 0.4225, "step": 1621 }, { "epoch": 1.761542639869636, "grad_norm": 0.22936149363858654, "learning_rate": 2.2926731078904992e-05, "loss": 0.4643, "step": 1622 }, { "epoch": 1.7626290059750136, "grad_norm": 0.21878435026719703, "learning_rate": 2.2906602254428342e-05, "loss": 0.477, "step": 1623 }, { "epoch": 1.763715372080391, "grad_norm": 0.2147140035955375, "learning_rate": 2.288647342995169e-05, "loss": 0.4241, "step": 1624 }, { "epoch": 1.7648017381857686, "grad_norm": 0.20190721092168987, "learning_rate": 2.286634460547504e-05, "loss": 0.3835, "step": 1625 }, { "epoch": 1.765888104291146, "grad_norm": 0.20101561779088925, "learning_rate": 2.284621578099839e-05, "loss": 0.4284, "step": 1626 }, { "epoch": 1.7669744703965238, "grad_norm": 0.20460552070678953, "learning_rate": 2.282608695652174e-05, "loss": 0.4377, "step": 1627 }, { "epoch": 1.7680608365019013, "grad_norm": 0.2062140215423944, "learning_rate": 2.280595813204509e-05, "loss": 0.44, "step": 1628 }, { "epoch": 1.7691472026072788, "grad_norm": 0.22386118430727056, "learning_rate": 2.278582930756844e-05, "loss": 0.4432, "step": 1629 }, { "epoch": 1.7702335687126562, "grad_norm": 0.21827402282000874, "learning_rate": 2.276570048309179e-05, "loss": 0.4271, "step": 1630 }, { "epoch": 1.7713199348180337, "grad_norm": 0.23235125960193795, "learning_rate": 2.274557165861514e-05, "loss": 0.4652, "step": 1631 }, { "epoch": 1.7724063009234112, "grad_norm": 0.22860496016996612, "learning_rate": 2.272544283413849e-05, "loss": 0.4663, "step": 1632 }, { "epoch": 1.7734926670287887, "grad_norm": 0.21805408614836688, "learning_rate": 2.2705314009661836e-05, "loss": 0.473, "step": 1633 }, { "epoch": 1.7745790331341662, "grad_norm": 0.21087497703159114, "learning_rate": 2.2685185185185187e-05, "loss": 0.5354, "step": 1634 }, { "epoch": 1.7756653992395437, "grad_norm": 0.26446227903263825, "learning_rate": 2.2665056360708537e-05, "loss": 0.421, "step": 1635 }, { "epoch": 1.7767517653449212, "grad_norm": 0.21614898755543957, "learning_rate": 2.2644927536231884e-05, "loss": 0.4014, "step": 1636 }, { "epoch": 1.7778381314502987, "grad_norm": 2.017141816435814, "learning_rate": 2.2624798711755235e-05, "loss": 0.7291, "step": 1637 }, { "epoch": 1.7789244975556762, "grad_norm": 0.2240132858994438, "learning_rate": 2.2604669887278585e-05, "loss": 0.4449, "step": 1638 }, { "epoch": 1.7800108636610537, "grad_norm": 0.20998677244092695, "learning_rate": 2.2584541062801932e-05, "loss": 0.448, "step": 1639 }, { "epoch": 1.7810972297664311, "grad_norm": 0.2547776912695504, "learning_rate": 2.2564412238325283e-05, "loss": 0.388, "step": 1640 }, { "epoch": 1.7821835958718086, "grad_norm": 0.2148965988085345, "learning_rate": 2.2544283413848633e-05, "loss": 0.4251, "step": 1641 }, { "epoch": 1.7832699619771863, "grad_norm": 0.19694786542973036, "learning_rate": 2.252415458937198e-05, "loss": 0.401, "step": 1642 }, { "epoch": 1.7843563280825638, "grad_norm": 0.22334013216353973, "learning_rate": 2.250402576489533e-05, "loss": 0.4759, "step": 1643 }, { "epoch": 1.7854426941879413, "grad_norm": 0.24315979954172248, "learning_rate": 2.248389694041868e-05, "loss": 0.4954, "step": 1644 }, { "epoch": 1.7865290602933188, "grad_norm": 0.21772713021196713, "learning_rate": 2.246376811594203e-05, "loss": 0.4319, "step": 1645 }, { "epoch": 1.7876154263986963, "grad_norm": 0.22568293862330524, "learning_rate": 2.244363929146538e-05, "loss": 0.4041, "step": 1646 }, { "epoch": 1.788701792504074, "grad_norm": 0.22121232561947735, "learning_rate": 2.242351046698873e-05, "loss": 0.4648, "step": 1647 }, { "epoch": 1.7897881586094515, "grad_norm": 0.2173394808115193, "learning_rate": 2.240338164251208e-05, "loss": 0.4581, "step": 1648 }, { "epoch": 1.790874524714829, "grad_norm": 0.21224546575119763, "learning_rate": 2.2383252818035427e-05, "loss": 0.454, "step": 1649 }, { "epoch": 1.7919608908202065, "grad_norm": 0.21846216033652144, "learning_rate": 2.2363123993558777e-05, "loss": 0.4693, "step": 1650 }, { "epoch": 1.793047256925584, "grad_norm": 0.2085632950059437, "learning_rate": 2.2342995169082127e-05, "loss": 0.4239, "step": 1651 }, { "epoch": 1.7941336230309615, "grad_norm": 0.20965857341982505, "learning_rate": 2.2322866344605474e-05, "loss": 0.4725, "step": 1652 }, { "epoch": 1.795219989136339, "grad_norm": 0.2107593717737179, "learning_rate": 2.2302737520128825e-05, "loss": 0.4836, "step": 1653 }, { "epoch": 1.7963063552417164, "grad_norm": 0.21472530784649374, "learning_rate": 2.2282608695652175e-05, "loss": 0.4356, "step": 1654 }, { "epoch": 1.797392721347094, "grad_norm": 0.21817423870490607, "learning_rate": 2.2262479871175522e-05, "loss": 0.4207, "step": 1655 }, { "epoch": 1.7984790874524714, "grad_norm": 0.2174529399721642, "learning_rate": 2.2242351046698873e-05, "loss": 0.4694, "step": 1656 }, { "epoch": 1.799565453557849, "grad_norm": 0.21681472823194764, "learning_rate": 2.2222222222222223e-05, "loss": 0.4499, "step": 1657 }, { "epoch": 1.8006518196632264, "grad_norm": 0.2538841618850897, "learning_rate": 2.220209339774557e-05, "loss": 0.5491, "step": 1658 }, { "epoch": 1.801738185768604, "grad_norm": 0.20846028244359183, "learning_rate": 2.2181964573268924e-05, "loss": 0.4187, "step": 1659 }, { "epoch": 1.8028245518739814, "grad_norm": 0.21782212467349246, "learning_rate": 2.216183574879227e-05, "loss": 0.4485, "step": 1660 }, { "epoch": 1.803910917979359, "grad_norm": 0.24556619370394628, "learning_rate": 2.214170692431562e-05, "loss": 0.5843, "step": 1661 }, { "epoch": 1.8049972840847366, "grad_norm": 0.2300380532991679, "learning_rate": 2.2121578099838972e-05, "loss": 0.4568, "step": 1662 }, { "epoch": 1.806083650190114, "grad_norm": 0.24409643064354, "learning_rate": 2.2101449275362323e-05, "loss": 0.4847, "step": 1663 }, { "epoch": 1.8071700162954916, "grad_norm": 0.2094658310964809, "learning_rate": 2.208132045088567e-05, "loss": 0.4554, "step": 1664 }, { "epoch": 1.808256382400869, "grad_norm": 0.21340891841394968, "learning_rate": 2.206119162640902e-05, "loss": 0.4193, "step": 1665 }, { "epoch": 1.8093427485062468, "grad_norm": 0.2141634641650583, "learning_rate": 2.204106280193237e-05, "loss": 0.4338, "step": 1666 }, { "epoch": 1.8104291146116243, "grad_norm": 0.21128464798034716, "learning_rate": 2.2020933977455718e-05, "loss": 0.4134, "step": 1667 }, { "epoch": 1.8115154807170017, "grad_norm": 0.5741955311606068, "learning_rate": 2.2000805152979068e-05, "loss": 0.5353, "step": 1668 }, { "epoch": 1.8126018468223792, "grad_norm": 0.23524205283634594, "learning_rate": 2.198067632850242e-05, "loss": 0.4675, "step": 1669 }, { "epoch": 1.8136882129277567, "grad_norm": 0.21068073798714113, "learning_rate": 2.1960547504025765e-05, "loss": 0.4593, "step": 1670 }, { "epoch": 1.8147745790331342, "grad_norm": 0.2440708977078283, "learning_rate": 2.1940418679549116e-05, "loss": 0.4943, "step": 1671 }, { "epoch": 1.8158609451385117, "grad_norm": 0.23446479714883006, "learning_rate": 2.1920289855072466e-05, "loss": 0.4754, "step": 1672 }, { "epoch": 1.8169473112438892, "grad_norm": 0.22505578993380645, "learning_rate": 2.1900161030595813e-05, "loss": 0.4333, "step": 1673 }, { "epoch": 1.8180336773492667, "grad_norm": 0.24302974796941548, "learning_rate": 2.1880032206119164e-05, "loss": 0.4453, "step": 1674 }, { "epoch": 1.8191200434546442, "grad_norm": 0.2269100063215582, "learning_rate": 2.1859903381642514e-05, "loss": 0.4538, "step": 1675 }, { "epoch": 1.8202064095600217, "grad_norm": 0.20871922248645758, "learning_rate": 2.183977455716586e-05, "loss": 0.4581, "step": 1676 }, { "epoch": 1.8212927756653992, "grad_norm": 0.8351239840266482, "learning_rate": 2.1819645732689212e-05, "loss": 0.4446, "step": 1677 }, { "epoch": 1.8223791417707766, "grad_norm": 0.23194975885173408, "learning_rate": 2.1799516908212562e-05, "loss": 0.4725, "step": 1678 }, { "epoch": 1.8234655078761541, "grad_norm": 0.2168397859529696, "learning_rate": 2.177938808373591e-05, "loss": 0.4365, "step": 1679 }, { "epoch": 1.8245518739815316, "grad_norm": 0.23395750820438257, "learning_rate": 2.175925925925926e-05, "loss": 0.4398, "step": 1680 }, { "epoch": 1.8256382400869093, "grad_norm": 0.22359328782339574, "learning_rate": 2.173913043478261e-05, "loss": 0.4085, "step": 1681 }, { "epoch": 1.8267246061922868, "grad_norm": 0.19682557580599977, "learning_rate": 2.1719001610305957e-05, "loss": 0.4286, "step": 1682 }, { "epoch": 1.8278109722976643, "grad_norm": 0.2130193235296123, "learning_rate": 2.1698872785829308e-05, "loss": 0.4276, "step": 1683 }, { "epoch": 1.8288973384030418, "grad_norm": 0.21029683795767515, "learning_rate": 2.1678743961352658e-05, "loss": 0.4537, "step": 1684 }, { "epoch": 1.8299837045084193, "grad_norm": 0.22627475175818143, "learning_rate": 2.1658615136876005e-05, "loss": 0.4259, "step": 1685 }, { "epoch": 1.831070070613797, "grad_norm": 0.21836274352336993, "learning_rate": 2.1638486312399356e-05, "loss": 0.4305, "step": 1686 }, { "epoch": 1.8321564367191745, "grad_norm": 0.23378163498641938, "learning_rate": 2.1618357487922706e-05, "loss": 0.4793, "step": 1687 }, { "epoch": 1.833242802824552, "grad_norm": 0.23899689072007793, "learning_rate": 2.1598228663446053e-05, "loss": 0.4066, "step": 1688 }, { "epoch": 1.8343291689299295, "grad_norm": 0.22886111616536278, "learning_rate": 2.1578099838969404e-05, "loss": 0.4599, "step": 1689 }, { "epoch": 1.835415535035307, "grad_norm": 0.25132545027323844, "learning_rate": 2.1557971014492757e-05, "loss": 0.4847, "step": 1690 }, { "epoch": 1.8365019011406845, "grad_norm": 0.22508264038711517, "learning_rate": 2.1537842190016104e-05, "loss": 0.4267, "step": 1691 }, { "epoch": 1.837588267246062, "grad_norm": 0.2191472731527922, "learning_rate": 2.1517713365539455e-05, "loss": 0.3842, "step": 1692 }, { "epoch": 1.8386746333514394, "grad_norm": 0.23292521897024546, "learning_rate": 2.1497584541062805e-05, "loss": 0.4707, "step": 1693 }, { "epoch": 1.839760999456817, "grad_norm": 0.2381083276508289, "learning_rate": 2.1477455716586152e-05, "loss": 0.4033, "step": 1694 }, { "epoch": 1.8408473655621944, "grad_norm": 0.2083569754094539, "learning_rate": 2.1457326892109503e-05, "loss": 0.449, "step": 1695 }, { "epoch": 1.841933731667572, "grad_norm": 0.2206049529120985, "learning_rate": 2.1437198067632853e-05, "loss": 0.4229, "step": 1696 }, { "epoch": 1.8430200977729494, "grad_norm": 0.2178844333556454, "learning_rate": 2.14170692431562e-05, "loss": 0.4607, "step": 1697 }, { "epoch": 1.8441064638783269, "grad_norm": 0.2331554883090805, "learning_rate": 2.139694041867955e-05, "loss": 0.481, "step": 1698 }, { "epoch": 1.8451928299837044, "grad_norm": 0.23084079213453793, "learning_rate": 2.13768115942029e-05, "loss": 0.4635, "step": 1699 }, { "epoch": 1.8462791960890819, "grad_norm": 0.23981001452557538, "learning_rate": 2.1356682769726248e-05, "loss": 0.4633, "step": 1700 }, { "epoch": 1.8473655621944596, "grad_norm": 0.2108445515846414, "learning_rate": 2.13365539452496e-05, "loss": 0.4608, "step": 1701 }, { "epoch": 1.848451928299837, "grad_norm": 4.524470233182752, "learning_rate": 2.131642512077295e-05, "loss": 0.4478, "step": 1702 }, { "epoch": 1.8495382944052146, "grad_norm": 3.5596193491052768, "learning_rate": 2.1296296296296296e-05, "loss": 0.398, "step": 1703 }, { "epoch": 1.850624660510592, "grad_norm": 0.28552339038467067, "learning_rate": 2.1276167471819647e-05, "loss": 0.4108, "step": 1704 }, { "epoch": 1.8517110266159695, "grad_norm": 0.22463758260615244, "learning_rate": 2.1256038647342997e-05, "loss": 0.4537, "step": 1705 }, { "epoch": 1.8527973927213472, "grad_norm": 0.26952065218317606, "learning_rate": 2.1235909822866344e-05, "loss": 0.4546, "step": 1706 }, { "epoch": 1.8538837588267247, "grad_norm": 0.2605751271677947, "learning_rate": 2.1215780998389695e-05, "loss": 0.4737, "step": 1707 }, { "epoch": 1.8549701249321022, "grad_norm": 0.2006994929604339, "learning_rate": 2.1195652173913045e-05, "loss": 0.462, "step": 1708 }, { "epoch": 1.8560564910374797, "grad_norm": 0.21139079751454573, "learning_rate": 2.1175523349436392e-05, "loss": 0.4457, "step": 1709 }, { "epoch": 1.8571428571428572, "grad_norm": 0.21724933553414427, "learning_rate": 2.1155394524959743e-05, "loss": 0.4101, "step": 1710 }, { "epoch": 1.8582292232482347, "grad_norm": 0.22929182009576435, "learning_rate": 2.1135265700483093e-05, "loss": 0.432, "step": 1711 }, { "epoch": 1.8593155893536122, "grad_norm": 0.2116182431812655, "learning_rate": 2.111513687600644e-05, "loss": 0.4026, "step": 1712 }, { "epoch": 1.8604019554589897, "grad_norm": 0.20432611645648555, "learning_rate": 2.109500805152979e-05, "loss": 0.4555, "step": 1713 }, { "epoch": 1.8614883215643672, "grad_norm": 0.21243215865065354, "learning_rate": 2.107487922705314e-05, "loss": 0.4701, "step": 1714 }, { "epoch": 1.8625746876697447, "grad_norm": 0.23488685676478913, "learning_rate": 2.105475040257649e-05, "loss": 0.4299, "step": 1715 }, { "epoch": 1.8636610537751221, "grad_norm": 0.21569133721395453, "learning_rate": 2.103462157809984e-05, "loss": 0.4072, "step": 1716 }, { "epoch": 1.8647474198804996, "grad_norm": 0.21809351286061573, "learning_rate": 2.101449275362319e-05, "loss": 0.4579, "step": 1717 }, { "epoch": 1.8658337859858771, "grad_norm": 0.2417743354546307, "learning_rate": 2.099436392914654e-05, "loss": 0.4866, "step": 1718 }, { "epoch": 1.8669201520912546, "grad_norm": 0.20906843506186926, "learning_rate": 2.0974235104669886e-05, "loss": 0.4223, "step": 1719 }, { "epoch": 1.868006518196632, "grad_norm": 0.21047796238176522, "learning_rate": 2.0954106280193237e-05, "loss": 0.4299, "step": 1720 }, { "epoch": 1.8690928843020098, "grad_norm": 0.1930356661842423, "learning_rate": 2.0933977455716587e-05, "loss": 0.4696, "step": 1721 }, { "epoch": 1.8701792504073873, "grad_norm": 0.24450704887950778, "learning_rate": 2.0913848631239938e-05, "loss": 0.4516, "step": 1722 }, { "epoch": 1.8712656165127648, "grad_norm": 0.21857392423284827, "learning_rate": 2.0893719806763288e-05, "loss": 0.4146, "step": 1723 }, { "epoch": 1.8723519826181423, "grad_norm": 0.2032342271796096, "learning_rate": 2.0873590982286635e-05, "loss": 0.42, "step": 1724 }, { "epoch": 1.8734383487235198, "grad_norm": 0.21147748421121498, "learning_rate": 2.0853462157809986e-05, "loss": 0.4164, "step": 1725 }, { "epoch": 1.8745247148288975, "grad_norm": 0.2150765961788, "learning_rate": 2.0833333333333336e-05, "loss": 0.4685, "step": 1726 }, { "epoch": 1.875611080934275, "grad_norm": 0.23251566209726526, "learning_rate": 2.0813204508856683e-05, "loss": 0.4246, "step": 1727 }, { "epoch": 1.8766974470396525, "grad_norm": 0.2238025948299729, "learning_rate": 2.0793075684380034e-05, "loss": 0.446, "step": 1728 }, { "epoch": 1.87778381314503, "grad_norm": 0.20719046093217527, "learning_rate": 2.0772946859903384e-05, "loss": 0.3856, "step": 1729 }, { "epoch": 1.8788701792504074, "grad_norm": 0.19922316568671644, "learning_rate": 2.075281803542673e-05, "loss": 0.4018, "step": 1730 }, { "epoch": 1.879956545355785, "grad_norm": 0.22173807696397999, "learning_rate": 2.073268921095008e-05, "loss": 0.4089, "step": 1731 }, { "epoch": 1.8810429114611624, "grad_norm": 3.112292595284667, "learning_rate": 2.0712560386473432e-05, "loss": 0.3823, "step": 1732 }, { "epoch": 1.88212927756654, "grad_norm": 0.22009010174266064, "learning_rate": 2.0692431561996782e-05, "loss": 0.4589, "step": 1733 }, { "epoch": 1.8832156436719174, "grad_norm": 0.334912516602485, "learning_rate": 2.067230273752013e-05, "loss": 0.3904, "step": 1734 }, { "epoch": 1.884302009777295, "grad_norm": 0.21773038246127693, "learning_rate": 2.065217391304348e-05, "loss": 0.4671, "step": 1735 }, { "epoch": 1.8853883758826724, "grad_norm": 0.21944600091242059, "learning_rate": 2.063204508856683e-05, "loss": 0.4151, "step": 1736 }, { "epoch": 1.8864747419880499, "grad_norm": 0.2349569617037284, "learning_rate": 2.0611916264090177e-05, "loss": 0.4555, "step": 1737 }, { "epoch": 1.8875611080934274, "grad_norm": 0.2518961713104872, "learning_rate": 2.0591787439613528e-05, "loss": 0.5246, "step": 1738 }, { "epoch": 1.8886474741988049, "grad_norm": 0.21656061998414655, "learning_rate": 2.0571658615136878e-05, "loss": 0.4266, "step": 1739 }, { "epoch": 1.8897338403041823, "grad_norm": 0.2190480022548414, "learning_rate": 2.0551529790660225e-05, "loss": 0.3892, "step": 1740 }, { "epoch": 1.89082020640956, "grad_norm": 0.2218764018097131, "learning_rate": 2.0531400966183576e-05, "loss": 0.5247, "step": 1741 }, { "epoch": 1.8919065725149375, "grad_norm": 0.2188309929667018, "learning_rate": 2.0511272141706926e-05, "loss": 0.4104, "step": 1742 }, { "epoch": 1.892992938620315, "grad_norm": 0.21768783393180977, "learning_rate": 2.0491143317230273e-05, "loss": 0.4669, "step": 1743 }, { "epoch": 1.8940793047256925, "grad_norm": 0.22832506326459717, "learning_rate": 2.0471014492753624e-05, "loss": 0.424, "step": 1744 }, { "epoch": 1.8951656708310702, "grad_norm": 0.210652329792862, "learning_rate": 2.0450885668276974e-05, "loss": 0.4199, "step": 1745 }, { "epoch": 1.8962520369364477, "grad_norm": 0.216009429614325, "learning_rate": 2.043075684380032e-05, "loss": 0.4189, "step": 1746 }, { "epoch": 1.8973384030418252, "grad_norm": 0.20398367555209185, "learning_rate": 2.041062801932367e-05, "loss": 0.4655, "step": 1747 }, { "epoch": 1.8984247691472027, "grad_norm": 0.25587256469532005, "learning_rate": 2.0390499194847022e-05, "loss": 0.4689, "step": 1748 }, { "epoch": 1.8995111352525802, "grad_norm": 0.25140083518759376, "learning_rate": 2.037037037037037e-05, "loss": 0.4626, "step": 1749 }, { "epoch": 1.9005975013579577, "grad_norm": 0.20723571065962645, "learning_rate": 2.035024154589372e-05, "loss": 0.4346, "step": 1750 }, { "epoch": 1.9016838674633352, "grad_norm": 0.24668540800540295, "learning_rate": 2.033011272141707e-05, "loss": 0.4751, "step": 1751 }, { "epoch": 1.9027702335687127, "grad_norm": 0.22404183323569943, "learning_rate": 2.030998389694042e-05, "loss": 0.4672, "step": 1752 }, { "epoch": 1.9038565996740902, "grad_norm": 0.21067323581542283, "learning_rate": 2.028985507246377e-05, "loss": 0.3976, "step": 1753 }, { "epoch": 1.9049429657794676, "grad_norm": 0.20839408723678715, "learning_rate": 2.026972624798712e-05, "loss": 0.3966, "step": 1754 }, { "epoch": 1.9060293318848451, "grad_norm": 0.2585035011790393, "learning_rate": 2.024959742351047e-05, "loss": 0.449, "step": 1755 }, { "epoch": 1.9071156979902226, "grad_norm": 0.24180521271410169, "learning_rate": 2.022946859903382e-05, "loss": 0.4709, "step": 1756 }, { "epoch": 1.9082020640956001, "grad_norm": 0.2217369034670369, "learning_rate": 2.020933977455717e-05, "loss": 0.4183, "step": 1757 }, { "epoch": 1.9092884302009776, "grad_norm": 0.20435198791141007, "learning_rate": 2.0189210950080516e-05, "loss": 0.4128, "step": 1758 }, { "epoch": 1.910374796306355, "grad_norm": 0.23543315361586173, "learning_rate": 2.0169082125603867e-05, "loss": 0.4484, "step": 1759 }, { "epoch": 1.9114611624117328, "grad_norm": 0.2395645472726932, "learning_rate": 2.0148953301127217e-05, "loss": 0.424, "step": 1760 }, { "epoch": 1.9125475285171103, "grad_norm": 0.9435767109481396, "learning_rate": 2.0128824476650564e-05, "loss": 0.413, "step": 1761 }, { "epoch": 1.9136338946224878, "grad_norm": 0.24228242121908464, "learning_rate": 2.0108695652173915e-05, "loss": 0.4146, "step": 1762 }, { "epoch": 1.9147202607278653, "grad_norm": 0.20385611304624252, "learning_rate": 2.0088566827697265e-05, "loss": 0.4334, "step": 1763 }, { "epoch": 1.9158066268332428, "grad_norm": 0.20699243829259753, "learning_rate": 2.0068438003220612e-05, "loss": 0.4638, "step": 1764 }, { "epoch": 1.9168929929386205, "grad_norm": 0.21604430970178778, "learning_rate": 2.0048309178743963e-05, "loss": 0.3618, "step": 1765 }, { "epoch": 1.917979359043998, "grad_norm": 0.22110023187688535, "learning_rate": 2.0028180354267313e-05, "loss": 0.4505, "step": 1766 }, { "epoch": 1.9190657251493755, "grad_norm": 0.22545136244036518, "learning_rate": 2.000805152979066e-05, "loss": 0.4538, "step": 1767 }, { "epoch": 1.920152091254753, "grad_norm": 0.22197459848807677, "learning_rate": 1.998792270531401e-05, "loss": 0.4875, "step": 1768 }, { "epoch": 1.9212384573601304, "grad_norm": 0.22564671270732542, "learning_rate": 1.996779388083736e-05, "loss": 0.4569, "step": 1769 }, { "epoch": 1.922324823465508, "grad_norm": 0.21347471945360896, "learning_rate": 1.9947665056360708e-05, "loss": 0.4043, "step": 1770 }, { "epoch": 1.9234111895708854, "grad_norm": 0.22311644777639872, "learning_rate": 1.992753623188406e-05, "loss": 0.4194, "step": 1771 }, { "epoch": 1.924497555676263, "grad_norm": 0.24301141259098813, "learning_rate": 1.990740740740741e-05, "loss": 0.4568, "step": 1772 }, { "epoch": 1.9255839217816404, "grad_norm": 0.21858898911637273, "learning_rate": 1.9887278582930756e-05, "loss": 0.443, "step": 1773 }, { "epoch": 1.9266702878870179, "grad_norm": 0.20963691178923274, "learning_rate": 1.9867149758454106e-05, "loss": 0.3889, "step": 1774 }, { "epoch": 1.9277566539923954, "grad_norm": 0.24009607986182097, "learning_rate": 1.9847020933977457e-05, "loss": 0.4066, "step": 1775 }, { "epoch": 1.9288430200977729, "grad_norm": 0.19571962258573353, "learning_rate": 1.9826892109500804e-05, "loss": 0.4132, "step": 1776 }, { "epoch": 1.9299293862031504, "grad_norm": 0.22149933190193638, "learning_rate": 1.9806763285024154e-05, "loss": 0.4808, "step": 1777 }, { "epoch": 1.9310157523085278, "grad_norm": 0.21538384796115012, "learning_rate": 1.9786634460547505e-05, "loss": 0.3982, "step": 1778 }, { "epoch": 1.9321021184139053, "grad_norm": 0.22380494372173054, "learning_rate": 1.9766505636070852e-05, "loss": 0.4316, "step": 1779 }, { "epoch": 1.933188484519283, "grad_norm": 0.2156216339082002, "learning_rate": 1.9746376811594202e-05, "loss": 0.4656, "step": 1780 }, { "epoch": 1.9342748506246605, "grad_norm": 0.23124551920660558, "learning_rate": 1.9726247987117553e-05, "loss": 0.4935, "step": 1781 }, { "epoch": 1.935361216730038, "grad_norm": 0.3464184985121441, "learning_rate": 1.97061191626409e-05, "loss": 0.3834, "step": 1782 }, { "epoch": 1.9364475828354155, "grad_norm": 0.20424431992377254, "learning_rate": 1.9685990338164254e-05, "loss": 0.5034, "step": 1783 }, { "epoch": 1.937533948940793, "grad_norm": 10.71337296721434, "learning_rate": 1.9665861513687604e-05, "loss": 0.4402, "step": 1784 }, { "epoch": 1.9386203150461707, "grad_norm": 0.2591428293974612, "learning_rate": 1.964573268921095e-05, "loss": 0.4281, "step": 1785 }, { "epoch": 1.9397066811515482, "grad_norm": 0.23129994358955838, "learning_rate": 1.96256038647343e-05, "loss": 0.4322, "step": 1786 }, { "epoch": 1.9407930472569257, "grad_norm": 0.21808970780200476, "learning_rate": 1.9605475040257652e-05, "loss": 0.4544, "step": 1787 }, { "epoch": 1.9418794133623032, "grad_norm": 0.25742737512354935, "learning_rate": 1.9585346215781e-05, "loss": 0.4276, "step": 1788 }, { "epoch": 1.9429657794676807, "grad_norm": 0.2500849458025678, "learning_rate": 1.956521739130435e-05, "loss": 0.4244, "step": 1789 }, { "epoch": 1.9440521455730582, "grad_norm": 0.2033657648851227, "learning_rate": 1.95450885668277e-05, "loss": 0.3949, "step": 1790 }, { "epoch": 1.9451385116784357, "grad_norm": 0.238596851513712, "learning_rate": 1.9524959742351047e-05, "loss": 0.4654, "step": 1791 }, { "epoch": 1.9462248777838131, "grad_norm": 0.2315364564617079, "learning_rate": 1.9504830917874397e-05, "loss": 0.4522, "step": 1792 }, { "epoch": 1.9473112438891906, "grad_norm": 0.21540942813980613, "learning_rate": 1.9484702093397748e-05, "loss": 0.4648, "step": 1793 }, { "epoch": 1.9483976099945681, "grad_norm": 0.2876595134238499, "learning_rate": 1.9464573268921095e-05, "loss": 0.5162, "step": 1794 }, { "epoch": 1.9494839760999456, "grad_norm": 0.23862726237694645, "learning_rate": 1.9444444444444445e-05, "loss": 0.4014, "step": 1795 }, { "epoch": 1.950570342205323, "grad_norm": 0.22813053489289586, "learning_rate": 1.9424315619967796e-05, "loss": 0.5119, "step": 1796 }, { "epoch": 1.9516567083107006, "grad_norm": 0.5918461293478091, "learning_rate": 1.9404186795491143e-05, "loss": 0.3935, "step": 1797 }, { "epoch": 1.952743074416078, "grad_norm": 0.21881397841062075, "learning_rate": 1.9384057971014493e-05, "loss": 0.4764, "step": 1798 }, { "epoch": 1.9538294405214556, "grad_norm": 0.2378997935035269, "learning_rate": 1.9363929146537844e-05, "loss": 0.4177, "step": 1799 }, { "epoch": 1.9549158066268333, "grad_norm": 0.23674659269501497, "learning_rate": 1.934380032206119e-05, "loss": 0.5516, "step": 1800 }, { "epoch": 1.9560021727322108, "grad_norm": 0.22640213107631288, "learning_rate": 1.932367149758454e-05, "loss": 0.4198, "step": 1801 }, { "epoch": 1.9570885388375883, "grad_norm": 0.2047980327850262, "learning_rate": 1.9303542673107892e-05, "loss": 0.4503, "step": 1802 }, { "epoch": 1.9581749049429658, "grad_norm": 0.22927670789250287, "learning_rate": 1.9283413848631242e-05, "loss": 0.4139, "step": 1803 }, { "epoch": 1.9592612710483432, "grad_norm": 0.2302031973576372, "learning_rate": 1.926328502415459e-05, "loss": 0.4693, "step": 1804 }, { "epoch": 1.960347637153721, "grad_norm": 0.20963433047965388, "learning_rate": 1.924315619967794e-05, "loss": 0.3943, "step": 1805 }, { "epoch": 1.9614340032590984, "grad_norm": 0.22976387320979433, "learning_rate": 1.922302737520129e-05, "loss": 0.4012, "step": 1806 }, { "epoch": 1.962520369364476, "grad_norm": 0.5034267294559746, "learning_rate": 1.9202898550724637e-05, "loss": 0.4459, "step": 1807 }, { "epoch": 1.9636067354698534, "grad_norm": 0.21736702259907567, "learning_rate": 1.9182769726247988e-05, "loss": 0.4389, "step": 1808 }, { "epoch": 1.964693101575231, "grad_norm": 0.21301724357218219, "learning_rate": 1.9162640901771338e-05, "loss": 0.5009, "step": 1809 }, { "epoch": 1.9657794676806084, "grad_norm": 0.2532914424088384, "learning_rate": 1.9142512077294685e-05, "loss": 0.4564, "step": 1810 }, { "epoch": 1.966865833785986, "grad_norm": 0.22477648927613542, "learning_rate": 1.9122383252818036e-05, "loss": 0.4073, "step": 1811 }, { "epoch": 1.9679521998913634, "grad_norm": 0.22650557874963748, "learning_rate": 1.9102254428341386e-05, "loss": 0.4168, "step": 1812 }, { "epoch": 1.9690385659967409, "grad_norm": 0.2433313836441464, "learning_rate": 1.9082125603864733e-05, "loss": 0.4497, "step": 1813 }, { "epoch": 1.9701249321021184, "grad_norm": 0.22632484552270987, "learning_rate": 1.9061996779388087e-05, "loss": 0.4783, "step": 1814 }, { "epoch": 1.9712112982074959, "grad_norm": 0.23100679682193118, "learning_rate": 1.9041867954911434e-05, "loss": 0.474, "step": 1815 }, { "epoch": 1.9722976643128733, "grad_norm": 0.24642168978671886, "learning_rate": 1.9021739130434784e-05, "loss": 0.4797, "step": 1816 }, { "epoch": 1.9733840304182508, "grad_norm": 0.22379763932922866, "learning_rate": 1.9001610305958135e-05, "loss": 0.3831, "step": 1817 }, { "epoch": 1.9744703965236283, "grad_norm": 0.19632278511721737, "learning_rate": 1.8981481481481482e-05, "loss": 0.418, "step": 1818 }, { "epoch": 1.9755567626290058, "grad_norm": 0.2146452857757584, "learning_rate": 1.8961352657004832e-05, "loss": 0.4636, "step": 1819 }, { "epoch": 1.9766431287343835, "grad_norm": 0.2437284929883647, "learning_rate": 1.8941223832528183e-05, "loss": 0.4743, "step": 1820 }, { "epoch": 1.977729494839761, "grad_norm": 0.44012017605987513, "learning_rate": 1.892109500805153e-05, "loss": 0.4658, "step": 1821 }, { "epoch": 1.9788158609451385, "grad_norm": 0.21199131764267656, "learning_rate": 1.890096618357488e-05, "loss": 0.401, "step": 1822 }, { "epoch": 1.979902227050516, "grad_norm": 0.24820792697733768, "learning_rate": 1.888083735909823e-05, "loss": 0.4753, "step": 1823 }, { "epoch": 1.9809885931558935, "grad_norm": 0.2244072427640501, "learning_rate": 1.886070853462158e-05, "loss": 0.4588, "step": 1824 }, { "epoch": 1.9820749592612712, "grad_norm": 0.24785363357331378, "learning_rate": 1.8840579710144928e-05, "loss": 0.3994, "step": 1825 }, { "epoch": 1.9831613253666487, "grad_norm": 0.22388236590829497, "learning_rate": 1.882045088566828e-05, "loss": 0.3876, "step": 1826 }, { "epoch": 1.9842476914720262, "grad_norm": 0.2381980620172005, "learning_rate": 1.880032206119163e-05, "loss": 0.4833, "step": 1827 }, { "epoch": 1.9853340575774037, "grad_norm": 0.22227436604971476, "learning_rate": 1.8780193236714976e-05, "loss": 0.4076, "step": 1828 }, { "epoch": 1.9864204236827812, "grad_norm": 0.22712442127926008, "learning_rate": 1.8760064412238327e-05, "loss": 0.4767, "step": 1829 }, { "epoch": 1.9875067897881586, "grad_norm": 0.22826785625565896, "learning_rate": 1.8739935587761677e-05, "loss": 0.4184, "step": 1830 }, { "epoch": 1.9885931558935361, "grad_norm": 0.22958731747402578, "learning_rate": 1.8719806763285024e-05, "loss": 0.4636, "step": 1831 }, { "epoch": 1.9896795219989136, "grad_norm": 0.20666521669384696, "learning_rate": 1.8699677938808374e-05, "loss": 0.418, "step": 1832 }, { "epoch": 1.9907658881042911, "grad_norm": 0.21690796442947696, "learning_rate": 1.8679549114331725e-05, "loss": 0.4631, "step": 1833 }, { "epoch": 1.9918522542096686, "grad_norm": 0.215576408265058, "learning_rate": 1.8659420289855072e-05, "loss": 0.4859, "step": 1834 }, { "epoch": 1.992938620315046, "grad_norm": 0.2095949881216714, "learning_rate": 1.8639291465378422e-05, "loss": 0.5087, "step": 1835 }, { "epoch": 1.9940249864204236, "grad_norm": 0.24681713787929377, "learning_rate": 1.8619162640901773e-05, "loss": 0.4269, "step": 1836 }, { "epoch": 1.995111352525801, "grad_norm": 0.1858305518872402, "learning_rate": 1.859903381642512e-05, "loss": 0.4109, "step": 1837 }, { "epoch": 1.9961977186311786, "grad_norm": 0.22727593880417504, "learning_rate": 1.857890499194847e-05, "loss": 0.3941, "step": 1838 }, { "epoch": 1.997284084736556, "grad_norm": 2.6620422976614706, "learning_rate": 1.855877616747182e-05, "loss": 0.4936, "step": 1839 }, { "epoch": 1.9983704508419338, "grad_norm": 0.21350564503729527, "learning_rate": 1.8538647342995168e-05, "loss": 0.4135, "step": 1840 }, { "epoch": 1.9994568169473113, "grad_norm": 0.22971666905019328, "learning_rate": 1.8518518518518518e-05, "loss": 0.4871, "step": 1841 }, { "epoch": 2.0, "grad_norm": 0.46282689110158026, "learning_rate": 1.849838969404187e-05, "loss": 0.3801, "step": 1842 }, { "epoch": 2.0010863661053775, "grad_norm": 0.2870187434669542, "learning_rate": 1.8478260869565216e-05, "loss": 0.3267, "step": 1843 }, { "epoch": 2.002172732210755, "grad_norm": 0.24879741570118086, "learning_rate": 1.8458132045088566e-05, "loss": 0.361, "step": 1844 }, { "epoch": 2.0032590983161325, "grad_norm": 0.29138866381479833, "learning_rate": 1.8438003220611917e-05, "loss": 0.3772, "step": 1845 }, { "epoch": 2.00434546442151, "grad_norm": 0.2969195642112669, "learning_rate": 1.8417874396135267e-05, "loss": 0.3385, "step": 1846 }, { "epoch": 2.0054318305268874, "grad_norm": 0.24286365995995732, "learning_rate": 1.8397745571658618e-05, "loss": 0.3689, "step": 1847 }, { "epoch": 2.006518196632265, "grad_norm": 0.227522648098104, "learning_rate": 1.8377616747181968e-05, "loss": 0.3447, "step": 1848 }, { "epoch": 2.0076045627376424, "grad_norm": 0.2594911397443991, "learning_rate": 1.8357487922705315e-05, "loss": 0.3648, "step": 1849 }, { "epoch": 2.00869092884302, "grad_norm": 0.24541190381104463, "learning_rate": 1.8337359098228665e-05, "loss": 0.3431, "step": 1850 }, { "epoch": 2.0097772949483974, "grad_norm": 0.26698534408930336, "learning_rate": 1.8317230273752016e-05, "loss": 0.3592, "step": 1851 }, { "epoch": 2.0108636610537753, "grad_norm": 0.2583507993262128, "learning_rate": 1.8297101449275363e-05, "loss": 0.3746, "step": 1852 }, { "epoch": 2.011950027159153, "grad_norm": 0.2544753466970483, "learning_rate": 1.8276972624798713e-05, "loss": 0.3753, "step": 1853 }, { "epoch": 2.0130363932645303, "grad_norm": 0.21585556277124757, "learning_rate": 1.8256843800322064e-05, "loss": 0.3067, "step": 1854 }, { "epoch": 2.014122759369908, "grad_norm": 0.24559111953305185, "learning_rate": 1.823671497584541e-05, "loss": 0.3502, "step": 1855 }, { "epoch": 2.0152091254752853, "grad_norm": 0.24135479074826696, "learning_rate": 1.821658615136876e-05, "loss": 0.3793, "step": 1856 }, { "epoch": 2.016295491580663, "grad_norm": 0.2099203220360881, "learning_rate": 1.8196457326892112e-05, "loss": 0.3314, "step": 1857 }, { "epoch": 2.0173818576860403, "grad_norm": 0.2402687258993633, "learning_rate": 1.817632850241546e-05, "loss": 0.3414, "step": 1858 }, { "epoch": 2.0184682237914178, "grad_norm": 0.22722043928452118, "learning_rate": 1.815619967793881e-05, "loss": 0.3281, "step": 1859 }, { "epoch": 2.0195545898967953, "grad_norm": 0.2611466207402919, "learning_rate": 1.813607085346216e-05, "loss": 0.4089, "step": 1860 }, { "epoch": 2.0206409560021728, "grad_norm": 0.214619317698866, "learning_rate": 1.8115942028985507e-05, "loss": 0.314, "step": 1861 }, { "epoch": 2.0217273221075502, "grad_norm": 0.2386225456997683, "learning_rate": 1.8095813204508857e-05, "loss": 0.3388, "step": 1862 }, { "epoch": 2.0228136882129277, "grad_norm": 0.2517986101221125, "learning_rate": 1.8075684380032208e-05, "loss": 0.3931, "step": 1863 }, { "epoch": 2.023900054318305, "grad_norm": 0.2153508328123521, "learning_rate": 1.8055555555555555e-05, "loss": 0.3318, "step": 1864 }, { "epoch": 2.0249864204236827, "grad_norm": 0.2331001991453205, "learning_rate": 1.8035426731078905e-05, "loss": 0.37, "step": 1865 }, { "epoch": 2.02607278652906, "grad_norm": 0.20819823212112806, "learning_rate": 1.8015297906602256e-05, "loss": 0.3251, "step": 1866 }, { "epoch": 2.0271591526344377, "grad_norm": 0.21613841314809545, "learning_rate": 1.7995169082125603e-05, "loss": 0.3573, "step": 1867 }, { "epoch": 2.028245518739815, "grad_norm": 0.24832014584460813, "learning_rate": 1.7975040257648953e-05, "loss": 0.3682, "step": 1868 }, { "epoch": 2.0293318848451927, "grad_norm": 0.23813986844171095, "learning_rate": 1.7954911433172304e-05, "loss": 0.338, "step": 1869 }, { "epoch": 2.03041825095057, "grad_norm": 0.22094927560328853, "learning_rate": 1.793478260869565e-05, "loss": 0.3641, "step": 1870 }, { "epoch": 2.0315046170559476, "grad_norm": 0.21626626294710868, "learning_rate": 1.7914653784219e-05, "loss": 0.3477, "step": 1871 }, { "epoch": 2.0325909831613256, "grad_norm": 0.23638227030635417, "learning_rate": 1.789452495974235e-05, "loss": 0.3235, "step": 1872 }, { "epoch": 2.033677349266703, "grad_norm": 0.45781800672241285, "learning_rate": 1.78743961352657e-05, "loss": 0.3814, "step": 1873 }, { "epoch": 2.0347637153720806, "grad_norm": 0.2208034692429255, "learning_rate": 1.785426731078905e-05, "loss": 0.3652, "step": 1874 }, { "epoch": 2.035850081477458, "grad_norm": 0.1991256705774849, "learning_rate": 1.78341384863124e-05, "loss": 0.3269, "step": 1875 }, { "epoch": 2.0369364475828355, "grad_norm": 0.21935777996856232, "learning_rate": 1.781400966183575e-05, "loss": 0.3597, "step": 1876 }, { "epoch": 2.038022813688213, "grad_norm": 0.2187301832182881, "learning_rate": 1.77938808373591e-05, "loss": 0.3683, "step": 1877 }, { "epoch": 2.0391091797935905, "grad_norm": 0.23673136663786548, "learning_rate": 1.777375201288245e-05, "loss": 0.362, "step": 1878 }, { "epoch": 2.040195545898968, "grad_norm": 0.2699028566938819, "learning_rate": 1.7753623188405798e-05, "loss": 0.3325, "step": 1879 }, { "epoch": 2.0412819120043455, "grad_norm": 0.2123916255518914, "learning_rate": 1.7733494363929148e-05, "loss": 0.339, "step": 1880 }, { "epoch": 2.042368278109723, "grad_norm": 0.21613070060037282, "learning_rate": 1.77133655394525e-05, "loss": 0.3916, "step": 1881 }, { "epoch": 2.0434546442151005, "grad_norm": 0.20286608754936247, "learning_rate": 1.7693236714975846e-05, "loss": 0.3267, "step": 1882 }, { "epoch": 2.044541010320478, "grad_norm": 0.206685136239173, "learning_rate": 1.7673107890499196e-05, "loss": 0.3179, "step": 1883 }, { "epoch": 2.0456273764258555, "grad_norm": 0.21128327929546187, "learning_rate": 1.7652979066022547e-05, "loss": 0.3469, "step": 1884 }, { "epoch": 2.046713742531233, "grad_norm": 0.21979631296671517, "learning_rate": 1.7632850241545894e-05, "loss": 0.3599, "step": 1885 }, { "epoch": 2.0478001086366104, "grad_norm": 0.19771596128929364, "learning_rate": 1.7612721417069244e-05, "loss": 0.3134, "step": 1886 }, { "epoch": 2.048886474741988, "grad_norm": 0.21665286221266702, "learning_rate": 1.7592592592592595e-05, "loss": 0.3675, "step": 1887 }, { "epoch": 2.0499728408473654, "grad_norm": 0.20293756474812957, "learning_rate": 1.757246376811594e-05, "loss": 0.3064, "step": 1888 }, { "epoch": 2.051059206952743, "grad_norm": 0.21018516625755873, "learning_rate": 1.7552334943639292e-05, "loss": 0.3368, "step": 1889 }, { "epoch": 2.0521455730581204, "grad_norm": 0.22610589294026906, "learning_rate": 1.7532206119162643e-05, "loss": 0.3714, "step": 1890 }, { "epoch": 2.053231939163498, "grad_norm": 0.22278153897179856, "learning_rate": 1.751207729468599e-05, "loss": 0.3769, "step": 1891 }, { "epoch": 2.054318305268876, "grad_norm": 0.20364438827851272, "learning_rate": 1.749194847020934e-05, "loss": 0.3186, "step": 1892 }, { "epoch": 2.0554046713742533, "grad_norm": 0.20241061312327363, "learning_rate": 1.747181964573269e-05, "loss": 0.3407, "step": 1893 }, { "epoch": 2.056491037479631, "grad_norm": 0.20747614517062823, "learning_rate": 1.745169082125604e-05, "loss": 0.3473, "step": 1894 }, { "epoch": 2.0575774035850083, "grad_norm": 0.19779646602698908, "learning_rate": 1.7431561996779388e-05, "loss": 0.3397, "step": 1895 }, { "epoch": 2.058663769690386, "grad_norm": 0.21882942957439244, "learning_rate": 1.741143317230274e-05, "loss": 0.32, "step": 1896 }, { "epoch": 2.0597501357957633, "grad_norm": 0.20729307856859433, "learning_rate": 1.739130434782609e-05, "loss": 0.3601, "step": 1897 }, { "epoch": 2.0608365019011408, "grad_norm": 0.19914198134633848, "learning_rate": 1.7371175523349436e-05, "loss": 0.363, "step": 1898 }, { "epoch": 2.0619228680065183, "grad_norm": 0.2176038824706047, "learning_rate": 1.7351046698872786e-05, "loss": 0.3613, "step": 1899 }, { "epoch": 2.0630092341118957, "grad_norm": 0.2107259934807206, "learning_rate": 1.7330917874396137e-05, "loss": 0.3114, "step": 1900 }, { "epoch": 2.0640956002172732, "grad_norm": 0.21779688151138127, "learning_rate": 1.7310789049919484e-05, "loss": 0.3528, "step": 1901 }, { "epoch": 2.0651819663226507, "grad_norm": 0.19762895601936287, "learning_rate": 1.7290660225442834e-05, "loss": 0.3518, "step": 1902 }, { "epoch": 2.066268332428028, "grad_norm": 0.227609138409403, "learning_rate": 1.7270531400966185e-05, "loss": 0.3708, "step": 1903 }, { "epoch": 2.0673546985334057, "grad_norm": 0.41132171975499965, "learning_rate": 1.7250402576489532e-05, "loss": 0.3625, "step": 1904 }, { "epoch": 2.068441064638783, "grad_norm": 0.20815614829837228, "learning_rate": 1.7230273752012882e-05, "loss": 0.3324, "step": 1905 }, { "epoch": 2.0695274307441607, "grad_norm": 0.21556193106716084, "learning_rate": 1.7210144927536233e-05, "loss": 0.3496, "step": 1906 }, { "epoch": 2.070613796849538, "grad_norm": 0.23081831880202286, "learning_rate": 1.719001610305958e-05, "loss": 0.3761, "step": 1907 }, { "epoch": 2.0717001629549157, "grad_norm": 0.22485595249877302, "learning_rate": 1.7169887278582934e-05, "loss": 0.3926, "step": 1908 }, { "epoch": 2.072786529060293, "grad_norm": 0.2216178239424652, "learning_rate": 1.714975845410628e-05, "loss": 0.3601, "step": 1909 }, { "epoch": 2.0738728951656706, "grad_norm": 0.21528358245944648, "learning_rate": 1.712962962962963e-05, "loss": 0.3593, "step": 1910 }, { "epoch": 2.074959261271048, "grad_norm": 0.20314553997656867, "learning_rate": 1.710950080515298e-05, "loss": 0.3474, "step": 1911 }, { "epoch": 2.076045627376426, "grad_norm": 0.21110466224350774, "learning_rate": 1.7089371980676332e-05, "loss": 0.3764, "step": 1912 }, { "epoch": 2.0771319934818036, "grad_norm": 0.21452701128831958, "learning_rate": 1.706924315619968e-05, "loss": 0.3647, "step": 1913 }, { "epoch": 2.078218359587181, "grad_norm": 0.21553593750396075, "learning_rate": 1.704911433172303e-05, "loss": 0.3777, "step": 1914 }, { "epoch": 2.0793047256925585, "grad_norm": 0.20548859606950876, "learning_rate": 1.702898550724638e-05, "loss": 0.3638, "step": 1915 }, { "epoch": 2.080391091797936, "grad_norm": 0.21373611626346736, "learning_rate": 1.7008856682769727e-05, "loss": 0.3772, "step": 1916 }, { "epoch": 2.0814774579033135, "grad_norm": 0.22492936866674854, "learning_rate": 1.6988727858293077e-05, "loss": 0.3644, "step": 1917 }, { "epoch": 2.082563824008691, "grad_norm": 0.2002886295799795, "learning_rate": 1.6968599033816428e-05, "loss": 0.3146, "step": 1918 }, { "epoch": 2.0836501901140685, "grad_norm": 0.2047153009448794, "learning_rate": 1.6948470209339775e-05, "loss": 0.3669, "step": 1919 }, { "epoch": 2.084736556219446, "grad_norm": 0.1960814193030332, "learning_rate": 1.6928341384863125e-05, "loss": 0.3589, "step": 1920 }, { "epoch": 2.0858229223248235, "grad_norm": 0.19854659098383437, "learning_rate": 1.6908212560386476e-05, "loss": 0.3169, "step": 1921 }, { "epoch": 2.086909288430201, "grad_norm": 0.22944329276207423, "learning_rate": 1.6888083735909823e-05, "loss": 0.3697, "step": 1922 }, { "epoch": 2.0879956545355784, "grad_norm": 0.2096051305641943, "learning_rate": 1.6867954911433173e-05, "loss": 0.3637, "step": 1923 }, { "epoch": 2.089082020640956, "grad_norm": 0.2085353483057617, "learning_rate": 1.6847826086956524e-05, "loss": 0.3242, "step": 1924 }, { "epoch": 2.0901683867463334, "grad_norm": 0.2093955822099982, "learning_rate": 1.682769726247987e-05, "loss": 0.3673, "step": 1925 }, { "epoch": 2.091254752851711, "grad_norm": 0.20912470038577166, "learning_rate": 1.680756843800322e-05, "loss": 0.3471, "step": 1926 }, { "epoch": 2.0923411189570884, "grad_norm": 0.19926615308692272, "learning_rate": 1.678743961352657e-05, "loss": 0.34, "step": 1927 }, { "epoch": 2.093427485062466, "grad_norm": 0.19315137530347481, "learning_rate": 1.676731078904992e-05, "loss": 0.3256, "step": 1928 }, { "epoch": 2.0945138511678434, "grad_norm": 0.21312271898711338, "learning_rate": 1.674718196457327e-05, "loss": 0.3568, "step": 1929 }, { "epoch": 2.095600217273221, "grad_norm": 0.2072677068880387, "learning_rate": 1.672705314009662e-05, "loss": 0.3339, "step": 1930 }, { "epoch": 2.096686583378599, "grad_norm": 0.2199601953449042, "learning_rate": 1.6706924315619967e-05, "loss": 0.3511, "step": 1931 }, { "epoch": 2.0977729494839763, "grad_norm": 0.1959404818388162, "learning_rate": 1.6686795491143317e-05, "loss": 0.3331, "step": 1932 }, { "epoch": 2.098859315589354, "grad_norm": 0.2126849689447706, "learning_rate": 1.6666666666666667e-05, "loss": 0.4114, "step": 1933 }, { "epoch": 2.0999456816947313, "grad_norm": 0.20771198043636993, "learning_rate": 1.6646537842190015e-05, "loss": 0.3304, "step": 1934 }, { "epoch": 2.1010320478001088, "grad_norm": 0.21895742345292987, "learning_rate": 1.6626409017713365e-05, "loss": 0.3357, "step": 1935 }, { "epoch": 2.1021184139054863, "grad_norm": 0.20686646095152836, "learning_rate": 1.6606280193236715e-05, "loss": 0.3473, "step": 1936 }, { "epoch": 2.1032047800108638, "grad_norm": 0.21059372499336845, "learning_rate": 1.6586151368760062e-05, "loss": 0.3721, "step": 1937 }, { "epoch": 2.1042911461162412, "grad_norm": 0.20527849470692058, "learning_rate": 1.6566022544283413e-05, "loss": 0.3307, "step": 1938 }, { "epoch": 2.1053775122216187, "grad_norm": 0.2148691793918337, "learning_rate": 1.6545893719806767e-05, "loss": 0.3439, "step": 1939 }, { "epoch": 2.106463878326996, "grad_norm": 0.21079299317402259, "learning_rate": 1.6525764895330114e-05, "loss": 0.3663, "step": 1940 }, { "epoch": 2.1075502444323737, "grad_norm": 3.2999320096787423, "learning_rate": 1.6505636070853464e-05, "loss": 0.6103, "step": 1941 }, { "epoch": 2.108636610537751, "grad_norm": 0.20835742938884844, "learning_rate": 1.6485507246376815e-05, "loss": 0.3266, "step": 1942 }, { "epoch": 2.1097229766431287, "grad_norm": 3.964611693374118, "learning_rate": 1.6465378421900162e-05, "loss": 0.3377, "step": 1943 }, { "epoch": 2.110809342748506, "grad_norm": 0.204450717408689, "learning_rate": 1.6445249597423512e-05, "loss": 0.2988, "step": 1944 }, { "epoch": 2.1118957088538837, "grad_norm": 0.21390759324506556, "learning_rate": 1.6425120772946863e-05, "loss": 0.3717, "step": 1945 }, { "epoch": 2.112982074959261, "grad_norm": 0.20661032996164183, "learning_rate": 1.640499194847021e-05, "loss": 0.3434, "step": 1946 }, { "epoch": 2.1140684410646386, "grad_norm": 0.22018221637629942, "learning_rate": 1.638486312399356e-05, "loss": 0.3672, "step": 1947 }, { "epoch": 2.115154807170016, "grad_norm": 0.2052632819983996, "learning_rate": 1.636473429951691e-05, "loss": 0.3373, "step": 1948 }, { "epoch": 2.1162411732753936, "grad_norm": 0.21398797344882442, "learning_rate": 1.6344605475040258e-05, "loss": 0.3355, "step": 1949 }, { "epoch": 2.117327539380771, "grad_norm": 0.1901223427626088, "learning_rate": 1.6324476650563608e-05, "loss": 0.3106, "step": 1950 }, { "epoch": 2.1184139054861486, "grad_norm": 0.21663557520568413, "learning_rate": 1.630434782608696e-05, "loss": 0.3464, "step": 1951 }, { "epoch": 2.1195002715915265, "grad_norm": 0.2420450181498098, "learning_rate": 1.6284219001610306e-05, "loss": 0.3501, "step": 1952 }, { "epoch": 2.120586637696904, "grad_norm": 0.19661005777232873, "learning_rate": 1.6264090177133656e-05, "loss": 0.3484, "step": 1953 }, { "epoch": 2.1216730038022815, "grad_norm": 0.206765435716025, "learning_rate": 1.6243961352657006e-05, "loss": 0.3393, "step": 1954 }, { "epoch": 2.122759369907659, "grad_norm": 0.23537358878679945, "learning_rate": 1.6223832528180353e-05, "loss": 0.3658, "step": 1955 }, { "epoch": 2.1238457360130365, "grad_norm": 0.20612675250477938, "learning_rate": 1.6203703703703704e-05, "loss": 0.3132, "step": 1956 }, { "epoch": 2.124932102118414, "grad_norm": 0.19466796684061458, "learning_rate": 1.6183574879227054e-05, "loss": 0.3178, "step": 1957 }, { "epoch": 2.1260184682237915, "grad_norm": 0.19016989805729717, "learning_rate": 1.61634460547504e-05, "loss": 0.3149, "step": 1958 }, { "epoch": 2.127104834329169, "grad_norm": 0.22071341030111014, "learning_rate": 1.6143317230273752e-05, "loss": 0.3531, "step": 1959 }, { "epoch": 2.1281912004345465, "grad_norm": 0.20738345988730883, "learning_rate": 1.6123188405797102e-05, "loss": 0.34, "step": 1960 }, { "epoch": 2.129277566539924, "grad_norm": 0.2120425409671068, "learning_rate": 1.610305958132045e-05, "loss": 0.3352, "step": 1961 }, { "epoch": 2.1303639326453014, "grad_norm": 0.22117588509345898, "learning_rate": 1.60829307568438e-05, "loss": 0.3271, "step": 1962 }, { "epoch": 2.131450298750679, "grad_norm": 0.20678981874817287, "learning_rate": 1.606280193236715e-05, "loss": 0.3508, "step": 1963 }, { "epoch": 2.1325366648560564, "grad_norm": 0.2211640841327925, "learning_rate": 1.60426731078905e-05, "loss": 0.3367, "step": 1964 }, { "epoch": 2.133623030961434, "grad_norm": 0.21128336822536747, "learning_rate": 1.6022544283413848e-05, "loss": 0.3315, "step": 1965 }, { "epoch": 2.1347093970668114, "grad_norm": 0.20954874147748687, "learning_rate": 1.6002415458937198e-05, "loss": 0.3247, "step": 1966 }, { "epoch": 2.135795763172189, "grad_norm": 0.20217317905313004, "learning_rate": 1.598228663446055e-05, "loss": 0.3211, "step": 1967 }, { "epoch": 2.1368821292775664, "grad_norm": 0.1993715527699818, "learning_rate": 1.5962157809983896e-05, "loss": 0.3364, "step": 1968 }, { "epoch": 2.137968495382944, "grad_norm": 0.20861787820539263, "learning_rate": 1.5942028985507246e-05, "loss": 0.3709, "step": 1969 }, { "epoch": 2.1390548614883214, "grad_norm": 0.22306782629608898, "learning_rate": 1.5921900161030597e-05, "loss": 0.3576, "step": 1970 }, { "epoch": 2.1401412275936993, "grad_norm": 0.2213781769250681, "learning_rate": 1.5901771336553947e-05, "loss": 0.3357, "step": 1971 }, { "epoch": 2.141227593699077, "grad_norm": 0.20680273862778067, "learning_rate": 1.5881642512077297e-05, "loss": 0.3429, "step": 1972 }, { "epoch": 2.1423139598044543, "grad_norm": 0.19727115543458393, "learning_rate": 1.5861513687600644e-05, "loss": 0.3144, "step": 1973 }, { "epoch": 2.1434003259098318, "grad_norm": 0.20601743110709017, "learning_rate": 1.5841384863123995e-05, "loss": 0.3505, "step": 1974 }, { "epoch": 2.1444866920152093, "grad_norm": 0.20104348022953747, "learning_rate": 1.5821256038647345e-05, "loss": 0.3203, "step": 1975 }, { "epoch": 2.1455730581205867, "grad_norm": 0.24196183511506594, "learning_rate": 1.5801127214170692e-05, "loss": 0.3731, "step": 1976 }, { "epoch": 2.1466594242259642, "grad_norm": 0.22472297813951633, "learning_rate": 1.5780998389694043e-05, "loss": 0.35, "step": 1977 }, { "epoch": 2.1477457903313417, "grad_norm": 0.20506895692758217, "learning_rate": 1.5760869565217393e-05, "loss": 0.3246, "step": 1978 }, { "epoch": 2.148832156436719, "grad_norm": 0.22171674398509586, "learning_rate": 1.574074074074074e-05, "loss": 0.3585, "step": 1979 }, { "epoch": 2.1499185225420967, "grad_norm": 0.2164383753482112, "learning_rate": 1.572061191626409e-05, "loss": 0.3525, "step": 1980 }, { "epoch": 2.151004888647474, "grad_norm": 0.21332450911247908, "learning_rate": 1.570048309178744e-05, "loss": 0.3574, "step": 1981 }, { "epoch": 2.1520912547528517, "grad_norm": 0.2376036910249498, "learning_rate": 1.568035426731079e-05, "loss": 0.3565, "step": 1982 }, { "epoch": 2.153177620858229, "grad_norm": 0.2084195690550928, "learning_rate": 1.566022544283414e-05, "loss": 0.3254, "step": 1983 }, { "epoch": 2.1542639869636067, "grad_norm": 0.1942991793517843, "learning_rate": 1.564009661835749e-05, "loss": 0.3247, "step": 1984 }, { "epoch": 2.155350353068984, "grad_norm": 0.21785211133237067, "learning_rate": 1.561996779388084e-05, "loss": 0.3541, "step": 1985 }, { "epoch": 2.1564367191743616, "grad_norm": 0.22753565880680157, "learning_rate": 1.5599838969404187e-05, "loss": 0.3746, "step": 1986 }, { "epoch": 2.157523085279739, "grad_norm": 1.259980493786396, "learning_rate": 1.5579710144927537e-05, "loss": 0.3157, "step": 1987 }, { "epoch": 2.1586094513851166, "grad_norm": 0.21499617559866005, "learning_rate": 1.5559581320450888e-05, "loss": 0.3377, "step": 1988 }, { "epoch": 2.159695817490494, "grad_norm": 0.21175970300927488, "learning_rate": 1.5539452495974235e-05, "loss": 0.3501, "step": 1989 }, { "epoch": 2.160782183595872, "grad_norm": 0.24220139032129234, "learning_rate": 1.5519323671497585e-05, "loss": 0.4187, "step": 1990 }, { "epoch": 2.161868549701249, "grad_norm": 0.2147992388209294, "learning_rate": 1.5499194847020936e-05, "loss": 0.3415, "step": 1991 }, { "epoch": 2.162954915806627, "grad_norm": 0.2161893283561458, "learning_rate": 1.5479066022544283e-05, "loss": 0.3717, "step": 1992 }, { "epoch": 2.1640412819120045, "grad_norm": 0.23501329757940223, "learning_rate": 1.5458937198067633e-05, "loss": 0.3655, "step": 1993 }, { "epoch": 2.165127648017382, "grad_norm": 0.21398505840534368, "learning_rate": 1.5438808373590983e-05, "loss": 0.3234, "step": 1994 }, { "epoch": 2.1662140141227595, "grad_norm": 0.2565264693219482, "learning_rate": 1.541867954911433e-05, "loss": 0.3895, "step": 1995 }, { "epoch": 2.167300380228137, "grad_norm": 0.20540885439425374, "learning_rate": 1.539855072463768e-05, "loss": 0.3198, "step": 1996 }, { "epoch": 2.1683867463335145, "grad_norm": 0.1991204484596128, "learning_rate": 1.537842190016103e-05, "loss": 0.3333, "step": 1997 }, { "epoch": 2.169473112438892, "grad_norm": 0.2190724461978393, "learning_rate": 1.535829307568438e-05, "loss": 0.3589, "step": 1998 }, { "epoch": 2.1705594785442694, "grad_norm": 0.1990647557673858, "learning_rate": 1.533816425120773e-05, "loss": 0.3263, "step": 1999 }, { "epoch": 2.171645844649647, "grad_norm": 0.19556532629948195, "learning_rate": 1.531803542673108e-05, "loss": 0.3264, "step": 2000 }, { "epoch": 2.1727322107550244, "grad_norm": 0.20374707312532403, "learning_rate": 1.529790660225443e-05, "loss": 0.3452, "step": 2001 }, { "epoch": 2.173818576860402, "grad_norm": 0.20455881997237513, "learning_rate": 1.527777777777778e-05, "loss": 0.3692, "step": 2002 }, { "epoch": 2.1749049429657794, "grad_norm": 0.21351974653324357, "learning_rate": 1.5257648953301129e-05, "loss": 0.3586, "step": 2003 }, { "epoch": 2.175991309071157, "grad_norm": 0.20543706762571465, "learning_rate": 1.5237520128824478e-05, "loss": 0.3723, "step": 2004 }, { "epoch": 2.1770776751765344, "grad_norm": 0.18865992325204017, "learning_rate": 1.5217391304347828e-05, "loss": 0.3099, "step": 2005 }, { "epoch": 2.178164041281912, "grad_norm": 0.20956346865703113, "learning_rate": 1.5197262479871177e-05, "loss": 0.4085, "step": 2006 }, { "epoch": 2.1792504073872894, "grad_norm": 0.2803112155011685, "learning_rate": 1.5177133655394526e-05, "loss": 0.4038, "step": 2007 }, { "epoch": 2.180336773492667, "grad_norm": 0.2250815803256458, "learning_rate": 1.5157004830917876e-05, "loss": 0.3582, "step": 2008 }, { "epoch": 2.1814231395980443, "grad_norm": 0.19935796774420106, "learning_rate": 1.5136876006441225e-05, "loss": 0.3454, "step": 2009 }, { "epoch": 2.182509505703422, "grad_norm": 0.18936211840827508, "learning_rate": 1.5116747181964575e-05, "loss": 0.3174, "step": 2010 }, { "epoch": 2.1835958718087998, "grad_norm": 0.19482502904451637, "learning_rate": 1.5096618357487924e-05, "loss": 0.3233, "step": 2011 }, { "epoch": 2.1846822379141773, "grad_norm": 0.20490548743933448, "learning_rate": 1.5076489533011273e-05, "loss": 0.3541, "step": 2012 }, { "epoch": 2.1857686040195548, "grad_norm": 0.21049753130245058, "learning_rate": 1.5056360708534623e-05, "loss": 0.3622, "step": 2013 }, { "epoch": 2.1868549701249322, "grad_norm": 0.19489503925713686, "learning_rate": 1.5036231884057972e-05, "loss": 0.3428, "step": 2014 }, { "epoch": 2.1879413362303097, "grad_norm": 0.22444720968562482, "learning_rate": 1.501610305958132e-05, "loss": 0.388, "step": 2015 }, { "epoch": 2.189027702335687, "grad_norm": 0.21122284846617903, "learning_rate": 1.4995974235104671e-05, "loss": 0.3317, "step": 2016 }, { "epoch": 2.1901140684410647, "grad_norm": 0.20180193245350958, "learning_rate": 1.497584541062802e-05, "loss": 0.3309, "step": 2017 }, { "epoch": 2.191200434546442, "grad_norm": 0.20945988535714127, "learning_rate": 1.4955716586151369e-05, "loss": 0.3853, "step": 2018 }, { "epoch": 2.1922868006518197, "grad_norm": 0.19817932257580814, "learning_rate": 1.4935587761674719e-05, "loss": 0.3491, "step": 2019 }, { "epoch": 2.193373166757197, "grad_norm": 0.21223249878466768, "learning_rate": 1.4915458937198068e-05, "loss": 0.3436, "step": 2020 }, { "epoch": 2.1944595328625747, "grad_norm": 0.20848024970502796, "learning_rate": 1.4895330112721417e-05, "loss": 0.3706, "step": 2021 }, { "epoch": 2.195545898967952, "grad_norm": 0.19789144426516236, "learning_rate": 1.4875201288244767e-05, "loss": 0.3411, "step": 2022 }, { "epoch": 2.1966322650733296, "grad_norm": 0.2053207248267669, "learning_rate": 1.4855072463768116e-05, "loss": 0.3525, "step": 2023 }, { "epoch": 2.197718631178707, "grad_norm": 0.21854400713874697, "learning_rate": 1.4834943639291465e-05, "loss": 0.3113, "step": 2024 }, { "epoch": 2.1988049972840846, "grad_norm": 0.21648271502971161, "learning_rate": 1.4814814814814815e-05, "loss": 0.3495, "step": 2025 }, { "epoch": 2.199891363389462, "grad_norm": 0.19389445334662098, "learning_rate": 1.4794685990338164e-05, "loss": 0.3382, "step": 2026 }, { "epoch": 2.2009777294948396, "grad_norm": 0.20463803425253013, "learning_rate": 1.4774557165861514e-05, "loss": 0.3312, "step": 2027 }, { "epoch": 2.202064095600217, "grad_norm": 0.2141998407544012, "learning_rate": 1.4754428341384863e-05, "loss": 0.3775, "step": 2028 }, { "epoch": 2.2031504617055946, "grad_norm": 0.21640289323642567, "learning_rate": 1.4734299516908212e-05, "loss": 0.4086, "step": 2029 }, { "epoch": 2.2042368278109725, "grad_norm": 0.20358992793479305, "learning_rate": 1.4714170692431562e-05, "loss": 0.3338, "step": 2030 }, { "epoch": 2.20532319391635, "grad_norm": 0.19805411322399388, "learning_rate": 1.469404186795491e-05, "loss": 0.3416, "step": 2031 }, { "epoch": 2.2064095600217275, "grad_norm": 0.21284297276069186, "learning_rate": 1.4673913043478263e-05, "loss": 0.361, "step": 2032 }, { "epoch": 2.207495926127105, "grad_norm": 0.21726516122320513, "learning_rate": 1.4653784219001612e-05, "loss": 0.3404, "step": 2033 }, { "epoch": 2.2085822922324825, "grad_norm": 0.20396766143465442, "learning_rate": 1.4633655394524962e-05, "loss": 0.331, "step": 2034 }, { "epoch": 2.20966865833786, "grad_norm": 0.21622666803451127, "learning_rate": 1.4613526570048311e-05, "loss": 0.3579, "step": 2035 }, { "epoch": 2.2107550244432375, "grad_norm": 0.1927821177822864, "learning_rate": 1.459339774557166e-05, "loss": 0.3448, "step": 2036 }, { "epoch": 2.211841390548615, "grad_norm": 0.36508960084586345, "learning_rate": 1.457326892109501e-05, "loss": 0.4101, "step": 2037 }, { "epoch": 2.2129277566539924, "grad_norm": 0.20553101288925374, "learning_rate": 1.4553140096618359e-05, "loss": 0.3568, "step": 2038 }, { "epoch": 2.21401412275937, "grad_norm": 0.21436488970502737, "learning_rate": 1.4533011272141708e-05, "loss": 0.3683, "step": 2039 }, { "epoch": 2.2151004888647474, "grad_norm": 0.20786034074048473, "learning_rate": 1.4512882447665058e-05, "loss": 0.3717, "step": 2040 }, { "epoch": 2.216186854970125, "grad_norm": 0.1982639948328698, "learning_rate": 1.4492753623188407e-05, "loss": 0.3606, "step": 2041 }, { "epoch": 2.2172732210755024, "grad_norm": 0.2085013880626117, "learning_rate": 1.4472624798711756e-05, "loss": 0.3798, "step": 2042 }, { "epoch": 2.21835958718088, "grad_norm": 0.20190850437159744, "learning_rate": 1.4452495974235106e-05, "loss": 0.3248, "step": 2043 }, { "epoch": 2.2194459532862574, "grad_norm": 0.21653007106405345, "learning_rate": 1.4432367149758455e-05, "loss": 0.3677, "step": 2044 }, { "epoch": 2.220532319391635, "grad_norm": 0.20947724214534472, "learning_rate": 1.4412238325281805e-05, "loss": 0.3595, "step": 2045 }, { "epoch": 2.2216186854970124, "grad_norm": 0.20726609495444978, "learning_rate": 1.4392109500805154e-05, "loss": 0.3301, "step": 2046 }, { "epoch": 2.22270505160239, "grad_norm": 0.20544829313489907, "learning_rate": 1.4371980676328503e-05, "loss": 0.3621, "step": 2047 }, { "epoch": 2.2237914177077673, "grad_norm": 0.21961280729644125, "learning_rate": 1.4351851851851853e-05, "loss": 0.34, "step": 2048 }, { "epoch": 2.224877783813145, "grad_norm": 0.2199994364332682, "learning_rate": 1.4331723027375202e-05, "loss": 0.404, "step": 2049 }, { "epoch": 2.2259641499185223, "grad_norm": 0.2045129511378531, "learning_rate": 1.431159420289855e-05, "loss": 0.3366, "step": 2050 }, { "epoch": 2.2270505160239003, "grad_norm": 0.20428540334480633, "learning_rate": 1.4291465378421901e-05, "loss": 0.3404, "step": 2051 }, { "epoch": 2.2281368821292777, "grad_norm": 0.19680513322901264, "learning_rate": 1.427133655394525e-05, "loss": 0.2891, "step": 2052 }, { "epoch": 2.2292232482346552, "grad_norm": 0.19945306988763875, "learning_rate": 1.4251207729468599e-05, "loss": 0.3042, "step": 2053 }, { "epoch": 2.2303096143400327, "grad_norm": 0.2081917439712128, "learning_rate": 1.4231078904991949e-05, "loss": 0.3581, "step": 2054 }, { "epoch": 2.23139598044541, "grad_norm": 0.20601871560494894, "learning_rate": 1.4210950080515298e-05, "loss": 0.3688, "step": 2055 }, { "epoch": 2.2324823465507877, "grad_norm": 0.21120823923594367, "learning_rate": 1.4190821256038646e-05, "loss": 0.3501, "step": 2056 }, { "epoch": 2.233568712656165, "grad_norm": 0.2092474581426901, "learning_rate": 1.4170692431561997e-05, "loss": 0.3361, "step": 2057 }, { "epoch": 2.2346550787615427, "grad_norm": 0.20439753116066248, "learning_rate": 1.4150563607085346e-05, "loss": 0.3671, "step": 2058 }, { "epoch": 2.23574144486692, "grad_norm": 0.20114993445166873, "learning_rate": 1.4130434782608694e-05, "loss": 0.3434, "step": 2059 }, { "epoch": 2.2368278109722977, "grad_norm": 0.21798860217216456, "learning_rate": 1.4110305958132045e-05, "loss": 0.3365, "step": 2060 }, { "epoch": 2.237914177077675, "grad_norm": 0.21692111920447105, "learning_rate": 1.4090177133655394e-05, "loss": 0.3641, "step": 2061 }, { "epoch": 2.2390005431830526, "grad_norm": 0.19810887266782917, "learning_rate": 1.4070048309178744e-05, "loss": 0.3342, "step": 2062 }, { "epoch": 2.24008690928843, "grad_norm": 0.2109801546832827, "learning_rate": 1.4049919484702096e-05, "loss": 0.3775, "step": 2063 }, { "epoch": 2.2411732753938076, "grad_norm": 0.20489449246427419, "learning_rate": 1.4029790660225445e-05, "loss": 0.3615, "step": 2064 }, { "epoch": 2.242259641499185, "grad_norm": 0.1954547465405373, "learning_rate": 1.4009661835748794e-05, "loss": 0.3323, "step": 2065 }, { "epoch": 2.2433460076045626, "grad_norm": 0.20367954796706625, "learning_rate": 1.3989533011272144e-05, "loss": 0.3405, "step": 2066 }, { "epoch": 2.24443237370994, "grad_norm": 0.19735216092653138, "learning_rate": 1.3969404186795493e-05, "loss": 0.3322, "step": 2067 }, { "epoch": 2.2455187398153176, "grad_norm": 0.22637996331655016, "learning_rate": 1.3949275362318842e-05, "loss": 0.3907, "step": 2068 }, { "epoch": 2.246605105920695, "grad_norm": 0.19774918267507602, "learning_rate": 1.3929146537842192e-05, "loss": 0.3385, "step": 2069 }, { "epoch": 2.247691472026073, "grad_norm": 0.19464176344497836, "learning_rate": 1.390901771336554e-05, "loss": 0.356, "step": 2070 }, { "epoch": 2.2487778381314505, "grad_norm": 0.22458187288380305, "learning_rate": 1.388888888888889e-05, "loss": 0.3612, "step": 2071 }, { "epoch": 2.249864204236828, "grad_norm": 0.2168836450271455, "learning_rate": 1.386876006441224e-05, "loss": 0.3685, "step": 2072 }, { "epoch": 2.2509505703422055, "grad_norm": 0.19970921067154776, "learning_rate": 1.3848631239935589e-05, "loss": 0.3409, "step": 2073 }, { "epoch": 2.252036936447583, "grad_norm": 0.21037090183186435, "learning_rate": 1.3828502415458937e-05, "loss": 0.3681, "step": 2074 }, { "epoch": 2.2531233025529604, "grad_norm": 0.19977411021887714, "learning_rate": 1.3808373590982288e-05, "loss": 0.3412, "step": 2075 }, { "epoch": 2.254209668658338, "grad_norm": 0.21198732443943719, "learning_rate": 1.3788244766505637e-05, "loss": 0.3569, "step": 2076 }, { "epoch": 2.2552960347637154, "grad_norm": 0.21489244759009038, "learning_rate": 1.3768115942028985e-05, "loss": 0.3656, "step": 2077 }, { "epoch": 2.256382400869093, "grad_norm": 0.20368425328377648, "learning_rate": 1.3747987117552336e-05, "loss": 0.3712, "step": 2078 }, { "epoch": 2.2574687669744704, "grad_norm": 0.21481159015867868, "learning_rate": 1.3727858293075685e-05, "loss": 0.3543, "step": 2079 }, { "epoch": 2.258555133079848, "grad_norm": 0.2127026802475687, "learning_rate": 1.3707729468599035e-05, "loss": 0.3738, "step": 2080 }, { "epoch": 2.2596414991852254, "grad_norm": 0.2069788884739888, "learning_rate": 1.3687600644122384e-05, "loss": 0.3584, "step": 2081 }, { "epoch": 2.260727865290603, "grad_norm": 0.22004952981560283, "learning_rate": 1.3667471819645733e-05, "loss": 0.3591, "step": 2082 }, { "epoch": 2.2618142313959804, "grad_norm": 0.21250207810162117, "learning_rate": 1.3647342995169083e-05, "loss": 0.3348, "step": 2083 }, { "epoch": 2.262900597501358, "grad_norm": 0.21862971479960092, "learning_rate": 1.3627214170692432e-05, "loss": 0.3752, "step": 2084 }, { "epoch": 2.2639869636067353, "grad_norm": 0.19697336170068988, "learning_rate": 1.360708534621578e-05, "loss": 0.3221, "step": 2085 }, { "epoch": 2.265073329712113, "grad_norm": 0.21546037220272227, "learning_rate": 1.3586956521739131e-05, "loss": 0.3826, "step": 2086 }, { "epoch": 2.2661596958174903, "grad_norm": 0.21307432949736355, "learning_rate": 1.356682769726248e-05, "loss": 0.3919, "step": 2087 }, { "epoch": 2.267246061922868, "grad_norm": 0.21534206403174047, "learning_rate": 1.3546698872785828e-05, "loss": 0.337, "step": 2088 }, { "epoch": 2.2683324280282458, "grad_norm": 0.213785439883237, "learning_rate": 1.3526570048309179e-05, "loss": 0.3726, "step": 2089 }, { "epoch": 2.269418794133623, "grad_norm": 0.21469479149901727, "learning_rate": 1.3506441223832528e-05, "loss": 0.3631, "step": 2090 }, { "epoch": 2.2705051602390007, "grad_norm": 0.20531302999822287, "learning_rate": 1.3486312399355876e-05, "loss": 0.3434, "step": 2091 }, { "epoch": 2.271591526344378, "grad_norm": 0.23090447260458935, "learning_rate": 1.3466183574879227e-05, "loss": 0.3681, "step": 2092 }, { "epoch": 2.2726778924497557, "grad_norm": 0.21312089968288117, "learning_rate": 1.3446054750402576e-05, "loss": 0.3494, "step": 2093 }, { "epoch": 2.273764258555133, "grad_norm": 0.20343369305967698, "learning_rate": 1.3425925925925928e-05, "loss": 0.3212, "step": 2094 }, { "epoch": 2.2748506246605107, "grad_norm": 0.23536372375489278, "learning_rate": 1.3405797101449276e-05, "loss": 0.4044, "step": 2095 }, { "epoch": 2.275936990765888, "grad_norm": 0.24906162060682596, "learning_rate": 1.3385668276972627e-05, "loss": 0.4263, "step": 2096 }, { "epoch": 2.2770233568712657, "grad_norm": 0.22765269114721987, "learning_rate": 1.3365539452495976e-05, "loss": 0.3582, "step": 2097 }, { "epoch": 2.278109722976643, "grad_norm": 0.21396752295036495, "learning_rate": 1.3345410628019326e-05, "loss": 0.3654, "step": 2098 }, { "epoch": 2.2791960890820206, "grad_norm": 0.19370332969262183, "learning_rate": 1.3325281803542675e-05, "loss": 0.2901, "step": 2099 }, { "epoch": 2.280282455187398, "grad_norm": 0.20729214093555734, "learning_rate": 1.3305152979066024e-05, "loss": 0.3172, "step": 2100 }, { "epoch": 2.2813688212927756, "grad_norm": 0.21425472341747337, "learning_rate": 1.3285024154589374e-05, "loss": 0.3748, "step": 2101 }, { "epoch": 2.282455187398153, "grad_norm": 0.23392664454317594, "learning_rate": 1.3264895330112723e-05, "loss": 0.4031, "step": 2102 }, { "epoch": 2.2835415535035306, "grad_norm": 0.20356046255971538, "learning_rate": 1.3244766505636072e-05, "loss": 0.3224, "step": 2103 }, { "epoch": 2.284627919608908, "grad_norm": 0.22535759370268071, "learning_rate": 1.3224637681159422e-05, "loss": 0.364, "step": 2104 }, { "epoch": 2.2857142857142856, "grad_norm": 0.21859712851027988, "learning_rate": 1.320450885668277e-05, "loss": 0.3301, "step": 2105 }, { "epoch": 2.286800651819663, "grad_norm": 0.19927474189031535, "learning_rate": 1.318438003220612e-05, "loss": 0.3475, "step": 2106 }, { "epoch": 2.2878870179250406, "grad_norm": 0.22231599494458648, "learning_rate": 1.316425120772947e-05, "loss": 0.3568, "step": 2107 }, { "epoch": 2.288973384030418, "grad_norm": 0.21321828841112467, "learning_rate": 1.3144122383252819e-05, "loss": 0.3529, "step": 2108 }, { "epoch": 2.2900597501357955, "grad_norm": 0.2275089988168356, "learning_rate": 1.3123993558776167e-05, "loss": 0.4218, "step": 2109 }, { "epoch": 2.2911461162411735, "grad_norm": 0.20993953280553418, "learning_rate": 1.3103864734299518e-05, "loss": 0.3658, "step": 2110 }, { "epoch": 2.292232482346551, "grad_norm": 0.20258899156246765, "learning_rate": 1.3083735909822867e-05, "loss": 0.3109, "step": 2111 }, { "epoch": 2.2933188484519285, "grad_norm": 0.20782457566922719, "learning_rate": 1.3063607085346215e-05, "loss": 0.3896, "step": 2112 }, { "epoch": 2.294405214557306, "grad_norm": 0.20933700685644419, "learning_rate": 1.3043478260869566e-05, "loss": 0.3154, "step": 2113 }, { "epoch": 2.2954915806626834, "grad_norm": 0.2101326732916708, "learning_rate": 1.3023349436392915e-05, "loss": 0.3544, "step": 2114 }, { "epoch": 2.296577946768061, "grad_norm": 0.21238809623437963, "learning_rate": 1.3003220611916265e-05, "loss": 0.3853, "step": 2115 }, { "epoch": 2.2976643128734384, "grad_norm": 0.5049432326962194, "learning_rate": 1.2983091787439614e-05, "loss": 0.3832, "step": 2116 }, { "epoch": 2.298750678978816, "grad_norm": 0.19541292228295332, "learning_rate": 1.2962962962962962e-05, "loss": 0.3417, "step": 2117 }, { "epoch": 2.2998370450841934, "grad_norm": 0.19797417971430567, "learning_rate": 1.2942834138486313e-05, "loss": 0.3364, "step": 2118 }, { "epoch": 2.300923411189571, "grad_norm": 0.20537226327580627, "learning_rate": 1.2922705314009662e-05, "loss": 0.352, "step": 2119 }, { "epoch": 2.3020097772949484, "grad_norm": 0.19814995203457608, "learning_rate": 1.290257648953301e-05, "loss": 0.3606, "step": 2120 }, { "epoch": 2.303096143400326, "grad_norm": 0.21648448297609832, "learning_rate": 1.288244766505636e-05, "loss": 0.3595, "step": 2121 }, { "epoch": 2.3041825095057034, "grad_norm": 0.20909955231582747, "learning_rate": 1.286231884057971e-05, "loss": 0.3577, "step": 2122 }, { "epoch": 2.305268875611081, "grad_norm": 0.1978308477274207, "learning_rate": 1.2842190016103058e-05, "loss": 0.3337, "step": 2123 }, { "epoch": 2.3063552417164583, "grad_norm": 0.20570139231185905, "learning_rate": 1.2822061191626409e-05, "loss": 0.366, "step": 2124 }, { "epoch": 2.307441607821836, "grad_norm": 0.19043137656481682, "learning_rate": 1.2801932367149761e-05, "loss": 0.3433, "step": 2125 }, { "epoch": 2.3085279739272133, "grad_norm": 0.20890742746976054, "learning_rate": 1.278180354267311e-05, "loss": 0.3457, "step": 2126 }, { "epoch": 2.309614340032591, "grad_norm": 0.21145451904311138, "learning_rate": 1.2761674718196458e-05, "loss": 0.3512, "step": 2127 }, { "epoch": 2.3107007061379683, "grad_norm": 0.20291695909883276, "learning_rate": 1.2741545893719809e-05, "loss": 0.3805, "step": 2128 }, { "epoch": 2.3117870722433462, "grad_norm": 0.2152134837946387, "learning_rate": 1.2721417069243158e-05, "loss": 0.3948, "step": 2129 }, { "epoch": 2.3128734383487233, "grad_norm": 0.20243957868356377, "learning_rate": 1.2701288244766506e-05, "loss": 0.3644, "step": 2130 }, { "epoch": 2.313959804454101, "grad_norm": 0.19765645231829934, "learning_rate": 1.2681159420289857e-05, "loss": 0.3454, "step": 2131 }, { "epoch": 2.3150461705594787, "grad_norm": 0.2112703791176636, "learning_rate": 1.2661030595813206e-05, "loss": 0.3841, "step": 2132 }, { "epoch": 2.316132536664856, "grad_norm": 0.20743233802936203, "learning_rate": 1.2640901771336556e-05, "loss": 0.3578, "step": 2133 }, { "epoch": 2.3172189027702337, "grad_norm": 0.18708994573135831, "learning_rate": 1.2620772946859905e-05, "loss": 0.3063, "step": 2134 }, { "epoch": 2.318305268875611, "grad_norm": 0.20141261185510592, "learning_rate": 1.2600644122383253e-05, "loss": 0.3725, "step": 2135 }, { "epoch": 2.3193916349809887, "grad_norm": 0.2110796171190004, "learning_rate": 1.2580515297906604e-05, "loss": 0.3513, "step": 2136 }, { "epoch": 2.320478001086366, "grad_norm": 0.21094019728709568, "learning_rate": 1.2560386473429953e-05, "loss": 0.3829, "step": 2137 }, { "epoch": 2.3215643671917436, "grad_norm": 0.1889267954580735, "learning_rate": 1.2540257648953301e-05, "loss": 0.3441, "step": 2138 }, { "epoch": 2.322650733297121, "grad_norm": 0.5603686947582637, "learning_rate": 1.2520128824476652e-05, "loss": 0.2934, "step": 2139 }, { "epoch": 2.3237370994024986, "grad_norm": 0.21833233431254295, "learning_rate": 1.25e-05, "loss": 0.3697, "step": 2140 }, { "epoch": 2.324823465507876, "grad_norm": 0.20262598964712972, "learning_rate": 1.247987117552335e-05, "loss": 0.3551, "step": 2141 }, { "epoch": 2.3259098316132536, "grad_norm": 0.18005679202062866, "learning_rate": 1.24597423510467e-05, "loss": 0.3009, "step": 2142 }, { "epoch": 2.326996197718631, "grad_norm": 0.20736371503563955, "learning_rate": 1.2439613526570049e-05, "loss": 0.3785, "step": 2143 }, { "epoch": 2.3280825638240086, "grad_norm": 0.21655475236052246, "learning_rate": 1.2419484702093397e-05, "loss": 0.3655, "step": 2144 }, { "epoch": 2.329168929929386, "grad_norm": 0.21976578364377755, "learning_rate": 1.2399355877616748e-05, "loss": 0.3955, "step": 2145 }, { "epoch": 2.3302552960347636, "grad_norm": 0.1927840200613274, "learning_rate": 1.2379227053140096e-05, "loss": 0.3422, "step": 2146 }, { "epoch": 2.331341662140141, "grad_norm": 0.20509269668961377, "learning_rate": 1.2359098228663445e-05, "loss": 0.3715, "step": 2147 }, { "epoch": 2.332428028245519, "grad_norm": 0.2296575990334497, "learning_rate": 1.2338969404186797e-05, "loss": 0.3632, "step": 2148 }, { "epoch": 2.333514394350896, "grad_norm": 0.2401795507462315, "learning_rate": 1.2318840579710146e-05, "loss": 0.3916, "step": 2149 }, { "epoch": 2.334600760456274, "grad_norm": 0.2034268982362041, "learning_rate": 1.2298711755233495e-05, "loss": 0.3165, "step": 2150 }, { "epoch": 2.3356871265616514, "grad_norm": 0.20369572376021672, "learning_rate": 1.2278582930756845e-05, "loss": 0.3199, "step": 2151 }, { "epoch": 2.336773492667029, "grad_norm": 0.23094818943708556, "learning_rate": 1.2258454106280194e-05, "loss": 0.3548, "step": 2152 }, { "epoch": 2.3378598587724064, "grad_norm": 0.22360916548009746, "learning_rate": 1.2238325281803543e-05, "loss": 0.366, "step": 2153 }, { "epoch": 2.338946224877784, "grad_norm": 0.19011872616374687, "learning_rate": 1.2218196457326893e-05, "loss": 0.319, "step": 2154 }, { "epoch": 2.3400325909831614, "grad_norm": 0.224334800624983, "learning_rate": 1.2198067632850242e-05, "loss": 0.3501, "step": 2155 }, { "epoch": 2.341118957088539, "grad_norm": 0.20780552302689484, "learning_rate": 1.217793880837359e-05, "loss": 0.3506, "step": 2156 }, { "epoch": 2.3422053231939164, "grad_norm": 0.2115260164654605, "learning_rate": 1.2157809983896941e-05, "loss": 0.371, "step": 2157 }, { "epoch": 2.343291689299294, "grad_norm": 0.20052642766079115, "learning_rate": 1.213768115942029e-05, "loss": 0.3369, "step": 2158 }, { "epoch": 2.3443780554046714, "grad_norm": 0.19274774345801823, "learning_rate": 1.211755233494364e-05, "loss": 0.3125, "step": 2159 }, { "epoch": 2.345464421510049, "grad_norm": 0.22497945600342775, "learning_rate": 1.2097423510466989e-05, "loss": 0.3747, "step": 2160 }, { "epoch": 2.3465507876154263, "grad_norm": 0.18684908486610569, "learning_rate": 1.2077294685990338e-05, "loss": 0.3048, "step": 2161 }, { "epoch": 2.347637153720804, "grad_norm": 0.19684780449239267, "learning_rate": 1.2057165861513688e-05, "loss": 0.3374, "step": 2162 }, { "epoch": 2.3487235198261813, "grad_norm": 0.20658393516103152, "learning_rate": 1.2037037037037037e-05, "loss": 0.3666, "step": 2163 }, { "epoch": 2.349809885931559, "grad_norm": 0.21208569638658398, "learning_rate": 1.2016908212560387e-05, "loss": 0.3396, "step": 2164 }, { "epoch": 2.3508962520369363, "grad_norm": 0.197311325058958, "learning_rate": 1.1996779388083736e-05, "loss": 0.3409, "step": 2165 }, { "epoch": 2.351982618142314, "grad_norm": 0.23416260031154668, "learning_rate": 1.1976650563607087e-05, "loss": 0.4021, "step": 2166 }, { "epoch": 2.3530689842476913, "grad_norm": 0.19954865568782043, "learning_rate": 1.1956521739130435e-05, "loss": 0.3524, "step": 2167 }, { "epoch": 2.3541553503530688, "grad_norm": 0.20463696708511941, "learning_rate": 1.1936392914653786e-05, "loss": 0.341, "step": 2168 }, { "epoch": 2.3552417164584467, "grad_norm": 0.2143659524694792, "learning_rate": 1.1916264090177135e-05, "loss": 0.3422, "step": 2169 }, { "epoch": 2.356328082563824, "grad_norm": 0.19092467069003802, "learning_rate": 1.1896135265700483e-05, "loss": 0.3122, "step": 2170 }, { "epoch": 2.3574144486692017, "grad_norm": 0.17976069174985718, "learning_rate": 1.1876006441223834e-05, "loss": 0.281, "step": 2171 }, { "epoch": 2.358500814774579, "grad_norm": 0.22007056891799492, "learning_rate": 1.1855877616747183e-05, "loss": 0.3869, "step": 2172 }, { "epoch": 2.3595871808799567, "grad_norm": 0.19051982297536132, "learning_rate": 1.1835748792270531e-05, "loss": 0.3354, "step": 2173 }, { "epoch": 2.360673546985334, "grad_norm": 0.20592104438072717, "learning_rate": 1.1815619967793882e-05, "loss": 0.3361, "step": 2174 }, { "epoch": 2.3617599130907116, "grad_norm": 0.21188210998041887, "learning_rate": 1.179549114331723e-05, "loss": 0.3664, "step": 2175 }, { "epoch": 2.362846279196089, "grad_norm": 0.20083728845722323, "learning_rate": 1.177536231884058e-05, "loss": 0.3124, "step": 2176 }, { "epoch": 2.3639326453014666, "grad_norm": 0.21433468640710654, "learning_rate": 1.175523349436393e-05, "loss": 0.361, "step": 2177 }, { "epoch": 2.365019011406844, "grad_norm": 0.2118169738377887, "learning_rate": 1.1735104669887278e-05, "loss": 0.3476, "step": 2178 }, { "epoch": 2.3661053775122216, "grad_norm": 0.20409073279267528, "learning_rate": 1.1714975845410629e-05, "loss": 0.36, "step": 2179 }, { "epoch": 2.367191743617599, "grad_norm": 0.22301731260016489, "learning_rate": 1.169484702093398e-05, "loss": 0.3564, "step": 2180 }, { "epoch": 2.3682781097229766, "grad_norm": 0.19559040955066337, "learning_rate": 1.1674718196457328e-05, "loss": 0.3303, "step": 2181 }, { "epoch": 2.369364475828354, "grad_norm": 0.20500160519590976, "learning_rate": 1.1654589371980677e-05, "loss": 0.3279, "step": 2182 }, { "epoch": 2.3704508419337316, "grad_norm": 0.2132824970635035, "learning_rate": 1.1634460547504027e-05, "loss": 0.3562, "step": 2183 }, { "epoch": 2.371537208039109, "grad_norm": 0.22184957194162674, "learning_rate": 1.1614331723027376e-05, "loss": 0.353, "step": 2184 }, { "epoch": 2.3726235741444865, "grad_norm": 0.1941759414903014, "learning_rate": 1.1594202898550725e-05, "loss": 0.3401, "step": 2185 }, { "epoch": 2.373709940249864, "grad_norm": 0.21201021209478907, "learning_rate": 1.1574074074074075e-05, "loss": 0.3749, "step": 2186 }, { "epoch": 2.3747963063552415, "grad_norm": 0.20386250087491448, "learning_rate": 1.1553945249597424e-05, "loss": 0.327, "step": 2187 }, { "epoch": 2.3758826724606195, "grad_norm": 0.21560104270100858, "learning_rate": 1.1533816425120773e-05, "loss": 0.3761, "step": 2188 }, { "epoch": 2.3769690385659965, "grad_norm": 0.21468885057181245, "learning_rate": 1.1513687600644123e-05, "loss": 0.3434, "step": 2189 }, { "epoch": 2.3780554046713744, "grad_norm": 0.20278769665732363, "learning_rate": 1.1493558776167472e-05, "loss": 0.2837, "step": 2190 }, { "epoch": 2.379141770776752, "grad_norm": 0.2055640090286918, "learning_rate": 1.147342995169082e-05, "loss": 0.3189, "step": 2191 }, { "epoch": 2.3802281368821294, "grad_norm": 0.2139624755414391, "learning_rate": 1.1453301127214171e-05, "loss": 0.3589, "step": 2192 }, { "epoch": 2.381314502987507, "grad_norm": 0.21922472540309676, "learning_rate": 1.143317230273752e-05, "loss": 0.4012, "step": 2193 }, { "epoch": 2.3824008690928844, "grad_norm": 0.19731399313536957, "learning_rate": 1.141304347826087e-05, "loss": 0.3479, "step": 2194 }, { "epoch": 2.383487235198262, "grad_norm": 0.2179218750434797, "learning_rate": 1.139291465378422e-05, "loss": 0.3313, "step": 2195 }, { "epoch": 2.3845736013036394, "grad_norm": 0.21954477433323685, "learning_rate": 1.137278582930757e-05, "loss": 0.3836, "step": 2196 }, { "epoch": 2.385659967409017, "grad_norm": 0.1932210761610591, "learning_rate": 1.1352657004830918e-05, "loss": 0.3203, "step": 2197 }, { "epoch": 2.3867463335143944, "grad_norm": 0.206443820772661, "learning_rate": 1.1332528180354269e-05, "loss": 0.3472, "step": 2198 }, { "epoch": 2.387832699619772, "grad_norm": 0.20955245317911902, "learning_rate": 1.1312399355877617e-05, "loss": 0.3736, "step": 2199 }, { "epoch": 2.3889190657251493, "grad_norm": 0.20319131073710417, "learning_rate": 1.1292270531400966e-05, "loss": 0.3514, "step": 2200 }, { "epoch": 2.390005431830527, "grad_norm": 0.222074405085774, "learning_rate": 1.1272141706924317e-05, "loss": 0.3459, "step": 2201 }, { "epoch": 2.3910917979359043, "grad_norm": 0.2136441774847945, "learning_rate": 1.1252012882447665e-05, "loss": 0.3505, "step": 2202 }, { "epoch": 2.392178164041282, "grad_norm": 0.19719872531928095, "learning_rate": 1.1231884057971016e-05, "loss": 0.3309, "step": 2203 }, { "epoch": 2.3932645301466593, "grad_norm": 0.20511985585989737, "learning_rate": 1.1211755233494365e-05, "loss": 0.3381, "step": 2204 }, { "epoch": 2.394350896252037, "grad_norm": 0.23197843951775385, "learning_rate": 1.1191626409017713e-05, "loss": 0.3806, "step": 2205 }, { "epoch": 2.3954372623574143, "grad_norm": 0.2265329175362859, "learning_rate": 1.1171497584541064e-05, "loss": 0.3909, "step": 2206 }, { "epoch": 2.396523628462792, "grad_norm": 0.2051920787952969, "learning_rate": 1.1151368760064412e-05, "loss": 0.3817, "step": 2207 }, { "epoch": 2.3976099945681693, "grad_norm": 0.20803034018569913, "learning_rate": 1.1131239935587761e-05, "loss": 0.3846, "step": 2208 }, { "epoch": 2.398696360673547, "grad_norm": 0.19558423653768808, "learning_rate": 1.1111111111111112e-05, "loss": 0.3204, "step": 2209 }, { "epoch": 2.3997827267789247, "grad_norm": 0.19882203893039066, "learning_rate": 1.1090982286634462e-05, "loss": 0.3222, "step": 2210 }, { "epoch": 2.400869092884302, "grad_norm": 0.2032443644564996, "learning_rate": 1.107085346215781e-05, "loss": 0.3404, "step": 2211 }, { "epoch": 2.4019554589896797, "grad_norm": 0.20003767824675614, "learning_rate": 1.1050724637681161e-05, "loss": 0.3105, "step": 2212 }, { "epoch": 2.403041825095057, "grad_norm": 0.209988971137647, "learning_rate": 1.103059581320451e-05, "loss": 0.3639, "step": 2213 }, { "epoch": 2.4041281912004346, "grad_norm": 0.18883758174249615, "learning_rate": 1.1010466988727859e-05, "loss": 0.332, "step": 2214 }, { "epoch": 2.405214557305812, "grad_norm": 1.1619888726676872, "learning_rate": 1.099033816425121e-05, "loss": 0.441, "step": 2215 }, { "epoch": 2.4063009234111896, "grad_norm": 0.19230519769676793, "learning_rate": 1.0970209339774558e-05, "loss": 0.3103, "step": 2216 }, { "epoch": 2.407387289516567, "grad_norm": 0.1997247081509586, "learning_rate": 1.0950080515297907e-05, "loss": 0.3122, "step": 2217 }, { "epoch": 2.4084736556219446, "grad_norm": 0.2196488657180352, "learning_rate": 1.0929951690821257e-05, "loss": 0.363, "step": 2218 }, { "epoch": 2.409560021727322, "grad_norm": 0.1989096102442866, "learning_rate": 1.0909822866344606e-05, "loss": 0.3128, "step": 2219 }, { "epoch": 2.4106463878326996, "grad_norm": 0.21372431429618677, "learning_rate": 1.0889694041867955e-05, "loss": 0.3529, "step": 2220 }, { "epoch": 2.411732753938077, "grad_norm": 0.20297528699664397, "learning_rate": 1.0869565217391305e-05, "loss": 0.3755, "step": 2221 }, { "epoch": 2.4128191200434546, "grad_norm": 0.203392482489394, "learning_rate": 1.0849436392914654e-05, "loss": 0.3414, "step": 2222 }, { "epoch": 2.413905486148832, "grad_norm": 0.2026047502560985, "learning_rate": 1.0829307568438003e-05, "loss": 0.3596, "step": 2223 }, { "epoch": 2.4149918522542095, "grad_norm": 0.20461658252374249, "learning_rate": 1.0809178743961353e-05, "loss": 0.3786, "step": 2224 }, { "epoch": 2.416078218359587, "grad_norm": 0.19766501256348917, "learning_rate": 1.0789049919484702e-05, "loss": 0.34, "step": 2225 }, { "epoch": 2.4171645844649645, "grad_norm": 0.20160919142733325, "learning_rate": 1.0768921095008052e-05, "loss": 0.3458, "step": 2226 }, { "epoch": 2.418250950570342, "grad_norm": 0.21869877671770777, "learning_rate": 1.0748792270531403e-05, "loss": 0.3658, "step": 2227 }, { "epoch": 2.41933731667572, "grad_norm": 0.21903531841812965, "learning_rate": 1.0728663446054751e-05, "loss": 0.3462, "step": 2228 }, { "epoch": 2.420423682781097, "grad_norm": 0.21878460007163575, "learning_rate": 1.07085346215781e-05, "loss": 0.395, "step": 2229 }, { "epoch": 2.421510048886475, "grad_norm": 0.20884975087934848, "learning_rate": 1.068840579710145e-05, "loss": 0.3406, "step": 2230 }, { "epoch": 2.4225964149918524, "grad_norm": 0.20906184142801265, "learning_rate": 1.06682769726248e-05, "loss": 0.3589, "step": 2231 }, { "epoch": 2.42368278109723, "grad_norm": 0.2197723701267213, "learning_rate": 1.0648148148148148e-05, "loss": 0.3494, "step": 2232 }, { "epoch": 2.4247691472026074, "grad_norm": 0.1977810354022188, "learning_rate": 1.0628019323671499e-05, "loss": 0.3537, "step": 2233 }, { "epoch": 2.425855513307985, "grad_norm": 0.2047686220603348, "learning_rate": 1.0607890499194847e-05, "loss": 0.3804, "step": 2234 }, { "epoch": 2.4269418794133624, "grad_norm": 0.21226829891529858, "learning_rate": 1.0587761674718196e-05, "loss": 0.3868, "step": 2235 }, { "epoch": 2.42802824551874, "grad_norm": 0.22723135534017921, "learning_rate": 1.0567632850241546e-05, "loss": 0.3954, "step": 2236 }, { "epoch": 2.4291146116241173, "grad_norm": 0.20505353005522095, "learning_rate": 1.0547504025764895e-05, "loss": 0.3539, "step": 2237 }, { "epoch": 2.430200977729495, "grad_norm": 0.20459945859537476, "learning_rate": 1.0527375201288246e-05, "loss": 0.3748, "step": 2238 }, { "epoch": 2.4312873438348723, "grad_norm": 0.2047470209406508, "learning_rate": 1.0507246376811594e-05, "loss": 0.3647, "step": 2239 }, { "epoch": 2.43237370994025, "grad_norm": 0.20053253724526848, "learning_rate": 1.0487117552334943e-05, "loss": 0.3514, "step": 2240 }, { "epoch": 2.4334600760456273, "grad_norm": 0.20351501151266066, "learning_rate": 1.0466988727858294e-05, "loss": 0.3313, "step": 2241 }, { "epoch": 2.434546442151005, "grad_norm": 0.20599242736327075, "learning_rate": 1.0446859903381644e-05, "loss": 0.3703, "step": 2242 }, { "epoch": 2.4356328082563823, "grad_norm": 0.20469472319173063, "learning_rate": 1.0426731078904993e-05, "loss": 0.3632, "step": 2243 }, { "epoch": 2.4367191743617598, "grad_norm": 0.23249504561779133, "learning_rate": 1.0406602254428342e-05, "loss": 0.3932, "step": 2244 }, { "epoch": 2.4378055404671373, "grad_norm": 0.20348758235924816, "learning_rate": 1.0386473429951692e-05, "loss": 0.3098, "step": 2245 }, { "epoch": 2.4388919065725148, "grad_norm": 0.19448101109484522, "learning_rate": 1.036634460547504e-05, "loss": 0.3613, "step": 2246 }, { "epoch": 2.4399782726778927, "grad_norm": 0.22823287462718644, "learning_rate": 1.0346215780998391e-05, "loss": 0.4003, "step": 2247 }, { "epoch": 2.4410646387832697, "grad_norm": 0.21138123422930505, "learning_rate": 1.032608695652174e-05, "loss": 0.3602, "step": 2248 }, { "epoch": 2.4421510048886477, "grad_norm": 0.18753547900008405, "learning_rate": 1.0305958132045089e-05, "loss": 0.3051, "step": 2249 }, { "epoch": 2.443237370994025, "grad_norm": 0.201251686679034, "learning_rate": 1.0285829307568439e-05, "loss": 0.3653, "step": 2250 }, { "epoch": 2.4443237370994026, "grad_norm": 0.21135461319513305, "learning_rate": 1.0265700483091788e-05, "loss": 0.345, "step": 2251 }, { "epoch": 2.44541010320478, "grad_norm": 0.5234749772890982, "learning_rate": 1.0245571658615137e-05, "loss": 0.368, "step": 2252 }, { "epoch": 2.4464964693101576, "grad_norm": 0.21229028793689272, "learning_rate": 1.0225442834138487e-05, "loss": 0.362, "step": 2253 }, { "epoch": 2.447582835415535, "grad_norm": 0.209128907970762, "learning_rate": 1.0205314009661836e-05, "loss": 0.3403, "step": 2254 }, { "epoch": 2.4486692015209126, "grad_norm": 0.21593369421083108, "learning_rate": 1.0185185185185185e-05, "loss": 0.4013, "step": 2255 }, { "epoch": 2.44975556762629, "grad_norm": 0.1886113188903276, "learning_rate": 1.0165056360708535e-05, "loss": 0.3401, "step": 2256 }, { "epoch": 2.4508419337316676, "grad_norm": 6.573278396836927, "learning_rate": 1.0144927536231885e-05, "loss": 0.7314, "step": 2257 }, { "epoch": 2.451928299837045, "grad_norm": 0.2362997561346966, "learning_rate": 1.0124798711755234e-05, "loss": 0.4005, "step": 2258 }, { "epoch": 2.4530146659424226, "grad_norm": 0.21670382783463954, "learning_rate": 1.0104669887278585e-05, "loss": 0.3641, "step": 2259 }, { "epoch": 2.4541010320478, "grad_norm": 0.2047350507128539, "learning_rate": 1.0084541062801933e-05, "loss": 0.3526, "step": 2260 }, { "epoch": 2.4551873981531775, "grad_norm": 0.19770326454588896, "learning_rate": 1.0064412238325282e-05, "loss": 0.3398, "step": 2261 }, { "epoch": 2.456273764258555, "grad_norm": 0.22258459994084337, "learning_rate": 1.0044283413848633e-05, "loss": 0.3866, "step": 2262 }, { "epoch": 2.4573601303639325, "grad_norm": 0.2244114780775002, "learning_rate": 1.0024154589371981e-05, "loss": 0.4081, "step": 2263 }, { "epoch": 2.45844649646931, "grad_norm": 0.20425155774920833, "learning_rate": 1.000402576489533e-05, "loss": 0.3419, "step": 2264 }, { "epoch": 2.4595328625746875, "grad_norm": 0.21011499451618026, "learning_rate": 9.98389694041868e-06, "loss": 0.3649, "step": 2265 }, { "epoch": 2.460619228680065, "grad_norm": 0.21955185234180447, "learning_rate": 9.96376811594203e-06, "loss": 0.3567, "step": 2266 }, { "epoch": 2.4617055947854425, "grad_norm": 0.20434226737892977, "learning_rate": 9.943639291465378e-06, "loss": 0.3392, "step": 2267 }, { "epoch": 2.4627919608908204, "grad_norm": 0.19666435858453876, "learning_rate": 9.923510466988728e-06, "loss": 0.3233, "step": 2268 }, { "epoch": 2.463878326996198, "grad_norm": 0.20169642786487493, "learning_rate": 9.903381642512077e-06, "loss": 0.3712, "step": 2269 }, { "epoch": 2.4649646931015754, "grad_norm": 0.218421475291231, "learning_rate": 9.883252818035426e-06, "loss": 0.3852, "step": 2270 }, { "epoch": 2.466051059206953, "grad_norm": 0.19218253409122513, "learning_rate": 9.863123993558776e-06, "loss": 0.3249, "step": 2271 }, { "epoch": 2.4671374253123304, "grad_norm": 0.22120428821027396, "learning_rate": 9.842995169082127e-06, "loss": 0.354, "step": 2272 }, { "epoch": 2.468223791417708, "grad_norm": 0.2004021135823681, "learning_rate": 9.822866344605476e-06, "loss": 0.382, "step": 2273 }, { "epoch": 2.4693101575230854, "grad_norm": 0.19969005670592174, "learning_rate": 9.802737520128826e-06, "loss": 0.3466, "step": 2274 }, { "epoch": 2.470396523628463, "grad_norm": 0.19894871814938392, "learning_rate": 9.782608695652175e-06, "loss": 0.33, "step": 2275 }, { "epoch": 2.4714828897338403, "grad_norm": 0.20693840679967654, "learning_rate": 9.762479871175523e-06, "loss": 0.3746, "step": 2276 }, { "epoch": 2.472569255839218, "grad_norm": 0.2039275300398435, "learning_rate": 9.742351046698874e-06, "loss": 0.3915, "step": 2277 }, { "epoch": 2.4736556219445953, "grad_norm": 0.20558110571406735, "learning_rate": 9.722222222222223e-06, "loss": 0.3327, "step": 2278 }, { "epoch": 2.474741988049973, "grad_norm": 0.18803735823589438, "learning_rate": 9.702093397745571e-06, "loss": 0.3307, "step": 2279 }, { "epoch": 2.4758283541553503, "grad_norm": 0.19324175243482403, "learning_rate": 9.681964573268922e-06, "loss": 0.3435, "step": 2280 }, { "epoch": 2.476914720260728, "grad_norm": 0.19908565260061034, "learning_rate": 9.66183574879227e-06, "loss": 0.3618, "step": 2281 }, { "epoch": 2.4780010863661053, "grad_norm": 0.18837798613540477, "learning_rate": 9.641706924315621e-06, "loss": 0.3376, "step": 2282 }, { "epoch": 2.4790874524714828, "grad_norm": 0.2054008263716131, "learning_rate": 9.62157809983897e-06, "loss": 0.3379, "step": 2283 }, { "epoch": 2.4801738185768603, "grad_norm": 0.2058189985756336, "learning_rate": 9.601449275362319e-06, "loss": 0.3682, "step": 2284 }, { "epoch": 2.4812601846822377, "grad_norm": 0.19481042649409808, "learning_rate": 9.581320450885669e-06, "loss": 0.3741, "step": 2285 }, { "epoch": 2.4823465507876152, "grad_norm": 0.19733558912365254, "learning_rate": 9.561191626409018e-06, "loss": 0.3294, "step": 2286 }, { "epoch": 2.483432916892993, "grad_norm": 0.20512518427344534, "learning_rate": 9.541062801932367e-06, "loss": 0.3472, "step": 2287 }, { "epoch": 2.48451928299837, "grad_norm": 0.2060451645020202, "learning_rate": 9.520933977455717e-06, "loss": 0.3355, "step": 2288 }, { "epoch": 2.485605649103748, "grad_norm": 0.321791803430481, "learning_rate": 9.500805152979067e-06, "loss": 0.3533, "step": 2289 }, { "epoch": 2.4866920152091256, "grad_norm": 0.2116527320875136, "learning_rate": 9.480676328502416e-06, "loss": 0.3828, "step": 2290 }, { "epoch": 2.487778381314503, "grad_norm": 0.20247104382933578, "learning_rate": 9.460547504025765e-06, "loss": 0.3628, "step": 2291 }, { "epoch": 2.4888647474198806, "grad_norm": 0.21567261997199785, "learning_rate": 9.440418679549115e-06, "loss": 0.3794, "step": 2292 }, { "epoch": 2.489951113525258, "grad_norm": 0.18895607642341603, "learning_rate": 9.420289855072464e-06, "loss": 0.3068, "step": 2293 }, { "epoch": 2.4910374796306356, "grad_norm": 0.2210357422575355, "learning_rate": 9.400161030595815e-06, "loss": 0.3709, "step": 2294 }, { "epoch": 2.492123845736013, "grad_norm": 0.3119219177173871, "learning_rate": 9.380032206119163e-06, "loss": 0.3299, "step": 2295 }, { "epoch": 2.4932102118413906, "grad_norm": 0.20348339964022172, "learning_rate": 9.359903381642512e-06, "loss": 0.3349, "step": 2296 }, { "epoch": 2.494296577946768, "grad_norm": 0.193685085943097, "learning_rate": 9.339774557165862e-06, "loss": 0.2983, "step": 2297 }, { "epoch": 2.4953829440521456, "grad_norm": 0.20690015646672602, "learning_rate": 9.319645732689211e-06, "loss": 0.3439, "step": 2298 }, { "epoch": 2.496469310157523, "grad_norm": 0.19971966358356352, "learning_rate": 9.29951690821256e-06, "loss": 0.3508, "step": 2299 }, { "epoch": 2.4975556762629005, "grad_norm": 0.20171674289644978, "learning_rate": 9.27938808373591e-06, "loss": 0.355, "step": 2300 }, { "epoch": 2.498642042368278, "grad_norm": 0.20133401905099996, "learning_rate": 9.259259259259259e-06, "loss": 0.3415, "step": 2301 }, { "epoch": 2.4997284084736555, "grad_norm": 0.20244426085188763, "learning_rate": 9.239130434782608e-06, "loss": 0.358, "step": 2302 }, { "epoch": 2.500814774579033, "grad_norm": 0.1939119016523947, "learning_rate": 9.219001610305958e-06, "loss": 0.3405, "step": 2303 }, { "epoch": 2.5019011406844105, "grad_norm": 0.20627711433669652, "learning_rate": 9.198872785829309e-06, "loss": 0.3519, "step": 2304 }, { "epoch": 2.502987506789788, "grad_norm": 0.2072737177050329, "learning_rate": 9.178743961352658e-06, "loss": 0.3478, "step": 2305 }, { "epoch": 2.504073872895166, "grad_norm": 0.18693575267107762, "learning_rate": 9.158615136876008e-06, "loss": 0.3221, "step": 2306 }, { "epoch": 2.505160239000543, "grad_norm": 0.2093863498513159, "learning_rate": 9.138486312399357e-06, "loss": 0.3612, "step": 2307 }, { "epoch": 2.506246605105921, "grad_norm": 0.20727360959900964, "learning_rate": 9.118357487922705e-06, "loss": 0.387, "step": 2308 }, { "epoch": 2.507332971211298, "grad_norm": 0.2007228078924616, "learning_rate": 9.098228663446056e-06, "loss": 0.3487, "step": 2309 }, { "epoch": 2.508419337316676, "grad_norm": 0.21168894058479204, "learning_rate": 9.078099838969405e-06, "loss": 0.3766, "step": 2310 }, { "epoch": 2.5095057034220534, "grad_norm": 0.19231857116609907, "learning_rate": 9.057971014492753e-06, "loss": 0.3183, "step": 2311 }, { "epoch": 2.510592069527431, "grad_norm": 0.19470763918939657, "learning_rate": 9.037842190016104e-06, "loss": 0.3523, "step": 2312 }, { "epoch": 2.5116784356328083, "grad_norm": 0.1896404529110242, "learning_rate": 9.017713365539453e-06, "loss": 0.3251, "step": 2313 }, { "epoch": 2.512764801738186, "grad_norm": 0.20032975586937393, "learning_rate": 8.997584541062801e-06, "loss": 0.3461, "step": 2314 }, { "epoch": 2.5138511678435633, "grad_norm": 0.20798640563736573, "learning_rate": 8.977455716586152e-06, "loss": 0.3496, "step": 2315 }, { "epoch": 2.514937533948941, "grad_norm": 24.74731128026353, "learning_rate": 8.9573268921095e-06, "loss": 0.6085, "step": 2316 }, { "epoch": 2.5160239000543183, "grad_norm": 0.20354768147647834, "learning_rate": 8.93719806763285e-06, "loss": 0.32, "step": 2317 }, { "epoch": 2.517110266159696, "grad_norm": 0.20105399762478168, "learning_rate": 8.9170692431562e-06, "loss": 0.3575, "step": 2318 }, { "epoch": 2.5181966322650733, "grad_norm": 0.2029666583229498, "learning_rate": 8.89694041867955e-06, "loss": 0.338, "step": 2319 }, { "epoch": 2.5192829983704508, "grad_norm": 0.1963971740432225, "learning_rate": 8.876811594202899e-06, "loss": 0.3398, "step": 2320 }, { "epoch": 2.5203693644758283, "grad_norm": 0.20569148275241145, "learning_rate": 8.85668276972625e-06, "loss": 0.3975, "step": 2321 }, { "epoch": 2.5214557305812058, "grad_norm": 0.18398668913429775, "learning_rate": 8.836553945249598e-06, "loss": 0.3021, "step": 2322 }, { "epoch": 2.5225420966865832, "grad_norm": 0.20743500575088458, "learning_rate": 8.816425120772947e-06, "loss": 0.3657, "step": 2323 }, { "epoch": 2.5236284627919607, "grad_norm": 0.18382693260329735, "learning_rate": 8.796296296296297e-06, "loss": 0.3381, "step": 2324 }, { "epoch": 2.5247148288973387, "grad_norm": 0.21008114373734507, "learning_rate": 8.776167471819646e-06, "loss": 0.3667, "step": 2325 }, { "epoch": 2.5258011950027157, "grad_norm": 0.2186523816118309, "learning_rate": 8.756038647342995e-06, "loss": 0.3633, "step": 2326 }, { "epoch": 2.5268875611080936, "grad_norm": 0.2095950526894437, "learning_rate": 8.735909822866345e-06, "loss": 0.3638, "step": 2327 }, { "epoch": 2.5279739272134707, "grad_norm": 0.20609930677883512, "learning_rate": 8.715780998389694e-06, "loss": 0.3147, "step": 2328 }, { "epoch": 2.5290602933188486, "grad_norm": 0.21829475382783184, "learning_rate": 8.695652173913044e-06, "loss": 0.3937, "step": 2329 }, { "epoch": 2.530146659424226, "grad_norm": 0.19944195027924255, "learning_rate": 8.675523349436393e-06, "loss": 0.343, "step": 2330 }, { "epoch": 2.5312330255296036, "grad_norm": 0.2150388863417788, "learning_rate": 8.655394524959742e-06, "loss": 0.3629, "step": 2331 }, { "epoch": 2.532319391634981, "grad_norm": 0.20026619425218217, "learning_rate": 8.635265700483092e-06, "loss": 0.3329, "step": 2332 }, { "epoch": 2.5334057577403586, "grad_norm": 0.20542010725201787, "learning_rate": 8.615136876006441e-06, "loss": 0.3436, "step": 2333 }, { "epoch": 2.534492123845736, "grad_norm": 0.19782967997906734, "learning_rate": 8.59500805152979e-06, "loss": 0.3405, "step": 2334 }, { "epoch": 2.5355784899511136, "grad_norm": 0.1945011441857383, "learning_rate": 8.57487922705314e-06, "loss": 0.355, "step": 2335 }, { "epoch": 2.536664856056491, "grad_norm": 0.19853103099907024, "learning_rate": 8.55475040257649e-06, "loss": 0.3272, "step": 2336 }, { "epoch": 2.5377512221618685, "grad_norm": 0.19631106289661293, "learning_rate": 8.53462157809984e-06, "loss": 0.2932, "step": 2337 }, { "epoch": 2.538837588267246, "grad_norm": 0.20727596867898831, "learning_rate": 8.51449275362319e-06, "loss": 0.3726, "step": 2338 }, { "epoch": 2.5399239543726235, "grad_norm": 0.20173856221275982, "learning_rate": 8.494363929146539e-06, "loss": 0.3489, "step": 2339 }, { "epoch": 2.541010320478001, "grad_norm": 0.7599612165942928, "learning_rate": 8.474235104669887e-06, "loss": 0.4354, "step": 2340 }, { "epoch": 2.5420966865833785, "grad_norm": 0.21308217075056005, "learning_rate": 8.454106280193238e-06, "loss": 0.3741, "step": 2341 }, { "epoch": 2.543183052688756, "grad_norm": 0.21180881174192112, "learning_rate": 8.433977455716587e-06, "loss": 0.3846, "step": 2342 }, { "epoch": 2.5442694187941335, "grad_norm": 0.22095692005551318, "learning_rate": 8.413848631239935e-06, "loss": 0.3839, "step": 2343 }, { "epoch": 2.545355784899511, "grad_norm": 0.19994726319248923, "learning_rate": 8.393719806763286e-06, "loss": 0.3448, "step": 2344 }, { "epoch": 2.5464421510048885, "grad_norm": 0.20962045385161812, "learning_rate": 8.373590982286635e-06, "loss": 0.3593, "step": 2345 }, { "epoch": 2.5475285171102664, "grad_norm": 0.20197348293459147, "learning_rate": 8.353462157809983e-06, "loss": 0.3413, "step": 2346 }, { "epoch": 2.5486148832156434, "grad_norm": 0.19045274469083842, "learning_rate": 8.333333333333334e-06, "loss": 0.3363, "step": 2347 }, { "epoch": 2.5497012493210214, "grad_norm": 0.20518211408285134, "learning_rate": 8.313204508856682e-06, "loss": 0.3379, "step": 2348 }, { "epoch": 2.5507876154263984, "grad_norm": 0.1997519513482995, "learning_rate": 8.293075684380031e-06, "loss": 0.3372, "step": 2349 }, { "epoch": 2.5518739815317764, "grad_norm": 0.18213576518588298, "learning_rate": 8.272946859903383e-06, "loss": 0.2921, "step": 2350 }, { "epoch": 2.552960347637154, "grad_norm": 0.20760478569709703, "learning_rate": 8.252818035426732e-06, "loss": 0.3645, "step": 2351 }, { "epoch": 2.5540467137425313, "grad_norm": 0.20151268209108092, "learning_rate": 8.232689210950081e-06, "loss": 0.3417, "step": 2352 }, { "epoch": 2.555133079847909, "grad_norm": 0.20093777525791787, "learning_rate": 8.212560386473431e-06, "loss": 0.34, "step": 2353 }, { "epoch": 2.5562194459532863, "grad_norm": 0.20319038751273105, "learning_rate": 8.19243156199678e-06, "loss": 0.3463, "step": 2354 }, { "epoch": 2.557305812058664, "grad_norm": 0.19728425233758984, "learning_rate": 8.172302737520129e-06, "loss": 0.3477, "step": 2355 }, { "epoch": 2.5583921781640413, "grad_norm": 0.18439342072522005, "learning_rate": 8.15217391304348e-06, "loss": 0.3481, "step": 2356 }, { "epoch": 2.559478544269419, "grad_norm": 0.20486522510906036, "learning_rate": 8.132045088566828e-06, "loss": 0.333, "step": 2357 }, { "epoch": 2.5605649103747963, "grad_norm": 0.1971235356378623, "learning_rate": 8.111916264090177e-06, "loss": 0.3504, "step": 2358 }, { "epoch": 2.5616512764801738, "grad_norm": 0.2192084484823143, "learning_rate": 8.091787439613527e-06, "loss": 0.3676, "step": 2359 }, { "epoch": 2.5627376425855513, "grad_norm": 0.1977373579154168, "learning_rate": 8.071658615136876e-06, "loss": 0.3693, "step": 2360 }, { "epoch": 2.5638240086909287, "grad_norm": 0.21790072459492704, "learning_rate": 8.051529790660225e-06, "loss": 0.3625, "step": 2361 }, { "epoch": 2.5649103747963062, "grad_norm": 0.1998033560265658, "learning_rate": 8.031400966183575e-06, "loss": 0.3631, "step": 2362 }, { "epoch": 2.5659967409016837, "grad_norm": 0.18703013957053777, "learning_rate": 8.011272141706924e-06, "loss": 0.3441, "step": 2363 }, { "epoch": 2.567083107007061, "grad_norm": 0.2061460076390843, "learning_rate": 7.991143317230274e-06, "loss": 0.3811, "step": 2364 }, { "epoch": 2.568169473112439, "grad_norm": 0.19759335795433602, "learning_rate": 7.971014492753623e-06, "loss": 0.3227, "step": 2365 }, { "epoch": 2.569255839217816, "grad_norm": 0.21637803059927438, "learning_rate": 7.950885668276973e-06, "loss": 0.3553, "step": 2366 }, { "epoch": 2.570342205323194, "grad_norm": 0.18440788190945478, "learning_rate": 7.930756843800322e-06, "loss": 0.3024, "step": 2367 }, { "epoch": 2.571428571428571, "grad_norm": 0.19361836188856824, "learning_rate": 7.910628019323673e-06, "loss": 0.333, "step": 2368 }, { "epoch": 2.572514937533949, "grad_norm": 0.20525240362127153, "learning_rate": 7.890499194847021e-06, "loss": 0.39, "step": 2369 }, { "epoch": 2.5736013036393266, "grad_norm": 0.2002208477339354, "learning_rate": 7.87037037037037e-06, "loss": 0.3505, "step": 2370 }, { "epoch": 2.574687669744704, "grad_norm": 0.19731582174232065, "learning_rate": 7.85024154589372e-06, "loss": 0.3543, "step": 2371 }, { "epoch": 2.5757740358500816, "grad_norm": 0.19928238109244947, "learning_rate": 7.83011272141707e-06, "loss": 0.3484, "step": 2372 }, { "epoch": 2.576860401955459, "grad_norm": 7.696905133857416, "learning_rate": 7.80998389694042e-06, "loss": 0.3686, "step": 2373 }, { "epoch": 2.5779467680608366, "grad_norm": 0.2089989129787135, "learning_rate": 7.789855072463769e-06, "loss": 0.3674, "step": 2374 }, { "epoch": 2.579033134166214, "grad_norm": 0.20767170638547056, "learning_rate": 7.769726247987117e-06, "loss": 0.3759, "step": 2375 }, { "epoch": 2.5801195002715915, "grad_norm": 0.20259746635972917, "learning_rate": 7.749597423510468e-06, "loss": 0.3659, "step": 2376 }, { "epoch": 2.581205866376969, "grad_norm": 0.19836318194657357, "learning_rate": 7.729468599033817e-06, "loss": 0.337, "step": 2377 }, { "epoch": 2.5822922324823465, "grad_norm": 0.18411334015654898, "learning_rate": 7.709339774557165e-06, "loss": 0.3298, "step": 2378 }, { "epoch": 2.583378598587724, "grad_norm": 0.19948945697142456, "learning_rate": 7.689210950080516e-06, "loss": 0.3303, "step": 2379 }, { "epoch": 2.5844649646931015, "grad_norm": 0.1866968784012527, "learning_rate": 7.669082125603864e-06, "loss": 0.293, "step": 2380 }, { "epoch": 2.585551330798479, "grad_norm": 0.1998907586922035, "learning_rate": 7.648953301127215e-06, "loss": 0.3174, "step": 2381 }, { "epoch": 2.5866376969038565, "grad_norm": 0.19966808350842657, "learning_rate": 7.6288244766505645e-06, "loss": 0.35, "step": 2382 }, { "epoch": 2.587724063009234, "grad_norm": 0.21699218689235958, "learning_rate": 7.608695652173914e-06, "loss": 0.4044, "step": 2383 }, { "epoch": 2.588810429114612, "grad_norm": 0.19572922299538864, "learning_rate": 7.588566827697263e-06, "loss": 0.3281, "step": 2384 }, { "epoch": 2.589896795219989, "grad_norm": 0.19797506347393498, "learning_rate": 7.568438003220612e-06, "loss": 0.3198, "step": 2385 }, { "epoch": 2.590983161325367, "grad_norm": 0.20350331673243774, "learning_rate": 7.548309178743962e-06, "loss": 0.3428, "step": 2386 }, { "epoch": 2.592069527430744, "grad_norm": 0.20058495403887117, "learning_rate": 7.528180354267312e-06, "loss": 0.3364, "step": 2387 }, { "epoch": 2.593155893536122, "grad_norm": 0.18641859896008245, "learning_rate": 7.50805152979066e-06, "loss": 0.3366, "step": 2388 }, { "epoch": 2.5942422596414993, "grad_norm": 0.21523893676947115, "learning_rate": 7.48792270531401e-06, "loss": 0.3518, "step": 2389 }, { "epoch": 2.595328625746877, "grad_norm": 0.3622153127022363, "learning_rate": 7.4677938808373595e-06, "loss": 0.3569, "step": 2390 }, { "epoch": 2.5964149918522543, "grad_norm": 0.20359546538465237, "learning_rate": 7.447665056360708e-06, "loss": 0.3611, "step": 2391 }, { "epoch": 2.597501357957632, "grad_norm": 0.2088927743314145, "learning_rate": 7.427536231884058e-06, "loss": 0.3544, "step": 2392 }, { "epoch": 2.5985877240630093, "grad_norm": 0.2185367569979803, "learning_rate": 7.4074074074074075e-06, "loss": 0.3318, "step": 2393 }, { "epoch": 2.599674090168387, "grad_norm": 0.1931174972478951, "learning_rate": 7.387278582930757e-06, "loss": 0.3707, "step": 2394 }, { "epoch": 2.6007604562737643, "grad_norm": 0.19973209191486005, "learning_rate": 7.367149758454106e-06, "loss": 0.3796, "step": 2395 }, { "epoch": 2.6018468223791418, "grad_norm": 0.19061231063621525, "learning_rate": 7.347020933977455e-06, "loss": 0.3336, "step": 2396 }, { "epoch": 2.6029331884845193, "grad_norm": 0.18485401569959448, "learning_rate": 7.326892109500806e-06, "loss": 0.3176, "step": 2397 }, { "epoch": 2.6040195545898968, "grad_norm": 0.19596466558315723, "learning_rate": 7.3067632850241555e-06, "loss": 0.3474, "step": 2398 }, { "epoch": 2.6051059206952742, "grad_norm": 0.1995769935613207, "learning_rate": 7.286634460547505e-06, "loss": 0.3575, "step": 2399 }, { "epoch": 2.6061922868006517, "grad_norm": 0.20516637713613572, "learning_rate": 7.266505636070854e-06, "loss": 0.3621, "step": 2400 }, { "epoch": 2.6072786529060292, "grad_norm": 0.20196165300827798, "learning_rate": 7.246376811594203e-06, "loss": 0.3426, "step": 2401 }, { "epoch": 2.6083650190114067, "grad_norm": 0.1961231677305673, "learning_rate": 7.226247987117553e-06, "loss": 0.3463, "step": 2402 }, { "epoch": 2.609451385116784, "grad_norm": 0.20064680544890912, "learning_rate": 7.206119162640903e-06, "loss": 0.3384, "step": 2403 }, { "epoch": 2.6105377512221617, "grad_norm": 0.19857815552944963, "learning_rate": 7.185990338164251e-06, "loss": 0.3438, "step": 2404 }, { "epoch": 2.6116241173275396, "grad_norm": 0.18865092852834933, "learning_rate": 7.165861513687601e-06, "loss": 0.3305, "step": 2405 }, { "epoch": 2.6127104834329167, "grad_norm": 0.21243640530597518, "learning_rate": 7.1457326892109505e-06, "loss": 0.3766, "step": 2406 }, { "epoch": 2.6137968495382946, "grad_norm": 0.1909976960734106, "learning_rate": 7.125603864734299e-06, "loss": 0.3351, "step": 2407 }, { "epoch": 2.6148832156436717, "grad_norm": 0.1846347984525972, "learning_rate": 7.105475040257649e-06, "loss": 0.3389, "step": 2408 }, { "epoch": 2.6159695817490496, "grad_norm": 0.1994905344760097, "learning_rate": 7.0853462157809985e-06, "loss": 0.3777, "step": 2409 }, { "epoch": 2.617055947854427, "grad_norm": 0.19862808662124723, "learning_rate": 7.065217391304347e-06, "loss": 0.3808, "step": 2410 }, { "epoch": 2.6181423139598046, "grad_norm": 0.18953619469406005, "learning_rate": 7.045088566827697e-06, "loss": 0.3432, "step": 2411 }, { "epoch": 2.619228680065182, "grad_norm": 0.19451731657555188, "learning_rate": 7.024959742351048e-06, "loss": 0.3362, "step": 2412 }, { "epoch": 2.6203150461705595, "grad_norm": 0.20195199051469345, "learning_rate": 7.004830917874397e-06, "loss": 0.3214, "step": 2413 }, { "epoch": 2.621401412275937, "grad_norm": 0.26621122601080444, "learning_rate": 6.9847020933977464e-06, "loss": 0.3919, "step": 2414 }, { "epoch": 2.6224877783813145, "grad_norm": 0.21308601859742246, "learning_rate": 6.964573268921096e-06, "loss": 0.355, "step": 2415 }, { "epoch": 2.623574144486692, "grad_norm": 0.18585510261930266, "learning_rate": 6.944444444444445e-06, "loss": 0.3308, "step": 2416 }, { "epoch": 2.6246605105920695, "grad_norm": 0.1903315071641054, "learning_rate": 6.924315619967794e-06, "loss": 0.3336, "step": 2417 }, { "epoch": 2.625746876697447, "grad_norm": 0.1903300335623604, "learning_rate": 6.904186795491144e-06, "loss": 0.3192, "step": 2418 }, { "epoch": 2.6268332428028245, "grad_norm": 0.19480908349229814, "learning_rate": 6.884057971014493e-06, "loss": 0.3346, "step": 2419 }, { "epoch": 2.627919608908202, "grad_norm": 0.1980021288011791, "learning_rate": 6.863929146537842e-06, "loss": 0.3638, "step": 2420 }, { "epoch": 2.6290059750135795, "grad_norm": 0.2022766891004004, "learning_rate": 6.843800322061192e-06, "loss": 0.3827, "step": 2421 }, { "epoch": 2.630092341118957, "grad_norm": 0.2048600090428871, "learning_rate": 6.8236714975845415e-06, "loss": 0.362, "step": 2422 }, { "epoch": 2.6311787072243344, "grad_norm": 0.17841190523599726, "learning_rate": 6.80354267310789e-06, "loss": 0.2838, "step": 2423 }, { "epoch": 2.6322650733297124, "grad_norm": 0.18859724029712296, "learning_rate": 6.78341384863124e-06, "loss": 0.2884, "step": 2424 }, { "epoch": 2.6333514394350894, "grad_norm": 0.19368780885883566, "learning_rate": 6.7632850241545894e-06, "loss": 0.3288, "step": 2425 }, { "epoch": 2.6344378055404674, "grad_norm": 0.19628216998224016, "learning_rate": 6.743156199677938e-06, "loss": 0.3528, "step": 2426 }, { "epoch": 2.6355241716458444, "grad_norm": 0.19812817156144735, "learning_rate": 6.723027375201288e-06, "loss": 0.3393, "step": 2427 }, { "epoch": 2.6366105377512223, "grad_norm": 0.21405655796851053, "learning_rate": 6.702898550724638e-06, "loss": 0.3955, "step": 2428 }, { "epoch": 2.6376969038566, "grad_norm": 0.212045728573147, "learning_rate": 6.682769726247988e-06, "loss": 0.3854, "step": 2429 }, { "epoch": 2.6387832699619773, "grad_norm": 0.19582785562534386, "learning_rate": 6.662640901771337e-06, "loss": 0.332, "step": 2430 }, { "epoch": 2.639869636067355, "grad_norm": 0.20951090783473453, "learning_rate": 6.642512077294687e-06, "loss": 0.3284, "step": 2431 }, { "epoch": 2.6409560021727323, "grad_norm": 0.1898206434476757, "learning_rate": 6.622383252818036e-06, "loss": 0.337, "step": 2432 }, { "epoch": 2.64204236827811, "grad_norm": 0.19567861218101404, "learning_rate": 6.602254428341385e-06, "loss": 0.358, "step": 2433 }, { "epoch": 2.6431287343834873, "grad_norm": 0.20806373342081783, "learning_rate": 6.582125603864735e-06, "loss": 0.4048, "step": 2434 }, { "epoch": 2.6442151004888648, "grad_norm": 0.20005629130933897, "learning_rate": 6.561996779388084e-06, "loss": 0.3166, "step": 2435 }, { "epoch": 2.6453014665942423, "grad_norm": 0.20647669308751876, "learning_rate": 6.541867954911433e-06, "loss": 0.3725, "step": 2436 }, { "epoch": 2.6463878326996197, "grad_norm": 0.20494318387352997, "learning_rate": 6.521739130434783e-06, "loss": 0.366, "step": 2437 }, { "epoch": 2.6474741988049972, "grad_norm": 0.19933949496457667, "learning_rate": 6.5016103059581325e-06, "loss": 0.3444, "step": 2438 }, { "epoch": 2.6485605649103747, "grad_norm": 0.185654308820602, "learning_rate": 6.481481481481481e-06, "loss": 0.3136, "step": 2439 }, { "epoch": 2.649646931015752, "grad_norm": 0.1848428539647059, "learning_rate": 6.461352657004831e-06, "loss": 0.3062, "step": 2440 }, { "epoch": 2.6507332971211297, "grad_norm": 0.1896968402470162, "learning_rate": 6.44122383252818e-06, "loss": 0.3019, "step": 2441 }, { "epoch": 2.651819663226507, "grad_norm": 0.20931220074697823, "learning_rate": 6.421095008051529e-06, "loss": 0.3633, "step": 2442 }, { "epoch": 2.6529060293318847, "grad_norm": 0.2315620100159995, "learning_rate": 6.4009661835748805e-06, "loss": 0.3572, "step": 2443 }, { "epoch": 2.653992395437262, "grad_norm": 0.19588120668428496, "learning_rate": 6.380837359098229e-06, "loss": 0.3554, "step": 2444 }, { "epoch": 2.65507876154264, "grad_norm": 0.19502272890561903, "learning_rate": 6.360708534621579e-06, "loss": 0.3502, "step": 2445 }, { "epoch": 2.656165127648017, "grad_norm": 0.19380153075016804, "learning_rate": 6.340579710144928e-06, "loss": 0.3452, "step": 2446 }, { "epoch": 2.657251493753395, "grad_norm": 0.20448820593247838, "learning_rate": 6.320450885668278e-06, "loss": 0.3853, "step": 2447 }, { "epoch": 2.658337859858772, "grad_norm": 0.20669054587654195, "learning_rate": 6.300322061191627e-06, "loss": 0.4049, "step": 2448 }, { "epoch": 2.65942422596415, "grad_norm": 0.1981607712009281, "learning_rate": 6.280193236714976e-06, "loss": 0.3695, "step": 2449 }, { "epoch": 2.6605105920695276, "grad_norm": 0.20635145027040866, "learning_rate": 6.260064412238326e-06, "loss": 0.3575, "step": 2450 }, { "epoch": 2.661596958174905, "grad_norm": 0.19109184027136683, "learning_rate": 6.239935587761675e-06, "loss": 0.353, "step": 2451 }, { "epoch": 2.6626833242802825, "grad_norm": 0.2006434345762522, "learning_rate": 6.219806763285024e-06, "loss": 0.3736, "step": 2452 }, { "epoch": 2.66376969038566, "grad_norm": 0.20389963385708004, "learning_rate": 6.199677938808374e-06, "loss": 0.3861, "step": 2453 }, { "epoch": 2.6648560564910375, "grad_norm": 0.2064849206504024, "learning_rate": 6.179549114331723e-06, "loss": 0.3817, "step": 2454 }, { "epoch": 2.665942422596415, "grad_norm": 0.17794858254337312, "learning_rate": 6.159420289855073e-06, "loss": 0.321, "step": 2455 }, { "epoch": 2.6670287887017925, "grad_norm": 0.2010988978760901, "learning_rate": 6.139291465378423e-06, "loss": 0.3924, "step": 2456 }, { "epoch": 2.66811515480717, "grad_norm": 0.18555512641169508, "learning_rate": 6.119162640901771e-06, "loss": 0.3491, "step": 2457 }, { "epoch": 2.6692015209125475, "grad_norm": 0.19475842670276106, "learning_rate": 6.099033816425121e-06, "loss": 0.3665, "step": 2458 }, { "epoch": 2.670287887017925, "grad_norm": 0.20234806916298304, "learning_rate": 6.078904991948471e-06, "loss": 0.3481, "step": 2459 }, { "epoch": 2.6713742531233025, "grad_norm": 0.20730299159110502, "learning_rate": 6.05877616747182e-06, "loss": 0.3688, "step": 2460 }, { "epoch": 2.67246061922868, "grad_norm": 0.20780210942006097, "learning_rate": 6.038647342995169e-06, "loss": 0.3602, "step": 2461 }, { "epoch": 2.6735469853340574, "grad_norm": 0.19188031776511139, "learning_rate": 6.0185185185185185e-06, "loss": 0.348, "step": 2462 }, { "epoch": 2.674633351439435, "grad_norm": 0.19683265194192678, "learning_rate": 5.998389694041868e-06, "loss": 0.3371, "step": 2463 }, { "epoch": 2.675719717544813, "grad_norm": 0.2080693166750357, "learning_rate": 5.978260869565218e-06, "loss": 0.3683, "step": 2464 }, { "epoch": 2.67680608365019, "grad_norm": 0.18925727265636574, "learning_rate": 5.958132045088567e-06, "loss": 0.3328, "step": 2465 }, { "epoch": 2.677892449755568, "grad_norm": 0.20810741924970252, "learning_rate": 5.938003220611917e-06, "loss": 0.3659, "step": 2466 }, { "epoch": 2.678978815860945, "grad_norm": 0.2033411816511986, "learning_rate": 5.917874396135266e-06, "loss": 0.3799, "step": 2467 }, { "epoch": 2.680065181966323, "grad_norm": 0.2080612683798272, "learning_rate": 5.897745571658615e-06, "loss": 0.363, "step": 2468 }, { "epoch": 2.6811515480717003, "grad_norm": 0.19351029558859098, "learning_rate": 5.877616747181965e-06, "loss": 0.342, "step": 2469 }, { "epoch": 2.682237914177078, "grad_norm": 0.1898570855209162, "learning_rate": 5.8574879227053144e-06, "loss": 0.3628, "step": 2470 }, { "epoch": 2.6833242802824553, "grad_norm": 0.20397919082699514, "learning_rate": 5.837359098228664e-06, "loss": 0.3524, "step": 2471 }, { "epoch": 2.6844106463878328, "grad_norm": 0.19294324698180382, "learning_rate": 5.817230273752014e-06, "loss": 0.3486, "step": 2472 }, { "epoch": 2.6854970124932103, "grad_norm": 0.18901676815045484, "learning_rate": 5.797101449275362e-06, "loss": 0.3226, "step": 2473 }, { "epoch": 2.6865833785985878, "grad_norm": 0.18102287550120094, "learning_rate": 5.776972624798712e-06, "loss": 0.331, "step": 2474 }, { "epoch": 2.6876697447039652, "grad_norm": 0.18525307367285085, "learning_rate": 5.7568438003220616e-06, "loss": 0.3213, "step": 2475 }, { "epoch": 2.6887561108093427, "grad_norm": 0.19249039355209516, "learning_rate": 5.73671497584541e-06, "loss": 0.3537, "step": 2476 }, { "epoch": 2.6898424769147202, "grad_norm": 0.19037383666529373, "learning_rate": 5.71658615136876e-06, "loss": 0.3094, "step": 2477 }, { "epoch": 2.6909288430200977, "grad_norm": 0.20253365437284507, "learning_rate": 5.69645732689211e-06, "loss": 0.3645, "step": 2478 }, { "epoch": 2.692015209125475, "grad_norm": 0.19650703951575868, "learning_rate": 5.676328502415459e-06, "loss": 0.3591, "step": 2479 }, { "epoch": 2.6931015752308527, "grad_norm": 1.7301634854236623, "learning_rate": 5.656199677938809e-06, "loss": 0.4147, "step": 2480 }, { "epoch": 2.69418794133623, "grad_norm": 0.18719444209683891, "learning_rate": 5.636070853462158e-06, "loss": 0.3205, "step": 2481 }, { "epoch": 2.6952743074416077, "grad_norm": 0.1904706235008018, "learning_rate": 5.615942028985508e-06, "loss": 0.3564, "step": 2482 }, { "epoch": 2.6963606735469856, "grad_norm": 0.1976631473876131, "learning_rate": 5.595813204508857e-06, "loss": 0.3626, "step": 2483 }, { "epoch": 2.6974470396523627, "grad_norm": 0.19715954446154862, "learning_rate": 5.575684380032206e-06, "loss": 0.3681, "step": 2484 }, { "epoch": 2.6985334057577406, "grad_norm": 0.20503528294510995, "learning_rate": 5.555555555555556e-06, "loss": 0.359, "step": 2485 }, { "epoch": 2.6996197718631176, "grad_norm": 0.19039260019804227, "learning_rate": 5.535426731078905e-06, "loss": 0.3076, "step": 2486 }, { "epoch": 2.7007061379684956, "grad_norm": 0.18284446730461568, "learning_rate": 5.515297906602255e-06, "loss": 0.3232, "step": 2487 }, { "epoch": 2.701792504073873, "grad_norm": 0.1970875673473729, "learning_rate": 5.495169082125605e-06, "loss": 0.3639, "step": 2488 }, { "epoch": 2.7028788701792505, "grad_norm": 0.19573581255440078, "learning_rate": 5.475040257648953e-06, "loss": 0.3458, "step": 2489 }, { "epoch": 2.703965236284628, "grad_norm": 0.20258550386833818, "learning_rate": 5.454911433172303e-06, "loss": 0.3722, "step": 2490 }, { "epoch": 2.7050516023900055, "grad_norm": 0.20773908984118247, "learning_rate": 5.4347826086956525e-06, "loss": 0.3444, "step": 2491 }, { "epoch": 2.706137968495383, "grad_norm": 0.18728942842203689, "learning_rate": 5.414653784219001e-06, "loss": 0.3299, "step": 2492 }, { "epoch": 2.7072243346007605, "grad_norm": 0.21463635609783793, "learning_rate": 5.394524959742351e-06, "loss": 0.3725, "step": 2493 }, { "epoch": 2.708310700706138, "grad_norm": 0.6296966778357845, "learning_rate": 5.374396135265701e-06, "loss": 0.3285, "step": 2494 }, { "epoch": 2.7093970668115155, "grad_norm": 0.18711587745426536, "learning_rate": 5.35426731078905e-06, "loss": 0.3489, "step": 2495 }, { "epoch": 2.710483432916893, "grad_norm": 0.20583718553632166, "learning_rate": 5.3341384863124e-06, "loss": 0.375, "step": 2496 }, { "epoch": 2.7115697990222705, "grad_norm": 0.19314397345912143, "learning_rate": 5.314009661835749e-06, "loss": 0.3414, "step": 2497 }, { "epoch": 2.712656165127648, "grad_norm": 0.18631159975555064, "learning_rate": 5.293880837359098e-06, "loss": 0.3238, "step": 2498 }, { "epoch": 2.7137425312330254, "grad_norm": 0.19521702974291075, "learning_rate": 5.273752012882448e-06, "loss": 0.3264, "step": 2499 }, { "epoch": 2.714828897338403, "grad_norm": 0.20455891870940493, "learning_rate": 5.253623188405797e-06, "loss": 0.3887, "step": 2500 }, { "epoch": 2.7159152634437804, "grad_norm": 0.1810674183220631, "learning_rate": 5.233494363929147e-06, "loss": 0.3183, "step": 2501 }, { "epoch": 2.717001629549158, "grad_norm": 0.18923209478140066, "learning_rate": 5.213365539452496e-06, "loss": 0.3266, "step": 2502 }, { "epoch": 2.7180879956545354, "grad_norm": 0.20028408959594457, "learning_rate": 5.193236714975846e-06, "loss": 0.3658, "step": 2503 }, { "epoch": 2.7191743617599133, "grad_norm": 0.19915630851980567, "learning_rate": 5.173107890499196e-06, "loss": 0.3484, "step": 2504 }, { "epoch": 2.7202607278652904, "grad_norm": 0.19219318888519182, "learning_rate": 5.152979066022544e-06, "loss": 0.3352, "step": 2505 }, { "epoch": 2.7213470939706683, "grad_norm": 0.20486876931068376, "learning_rate": 5.132850241545894e-06, "loss": 0.3533, "step": 2506 }, { "epoch": 2.7224334600760454, "grad_norm": 0.19052569907760789, "learning_rate": 5.1127214170692435e-06, "loss": 0.3413, "step": 2507 }, { "epoch": 2.7235198261814233, "grad_norm": 0.18661401405475012, "learning_rate": 5.092592592592592e-06, "loss": 0.3184, "step": 2508 }, { "epoch": 2.724606192286801, "grad_norm": 0.22804396596638396, "learning_rate": 5.072463768115943e-06, "loss": 0.3693, "step": 2509 }, { "epoch": 2.7256925583921783, "grad_norm": 0.19296398527642195, "learning_rate": 5.052334943639292e-06, "loss": 0.3338, "step": 2510 }, { "epoch": 2.7267789244975558, "grad_norm": 0.20127768104020377, "learning_rate": 5.032206119162641e-06, "loss": 0.3708, "step": 2511 }, { "epoch": 2.7278652906029333, "grad_norm": 0.1906280991321303, "learning_rate": 5.012077294685991e-06, "loss": 0.3272, "step": 2512 }, { "epoch": 2.7289516567083107, "grad_norm": 0.19337568128802998, "learning_rate": 4.99194847020934e-06, "loss": 0.3426, "step": 2513 }, { "epoch": 2.7300380228136882, "grad_norm": 0.19823107721840563, "learning_rate": 4.971819645732689e-06, "loss": 0.3369, "step": 2514 }, { "epoch": 2.7311243889190657, "grad_norm": 0.19189763537699045, "learning_rate": 4.951690821256039e-06, "loss": 0.3415, "step": 2515 }, { "epoch": 2.732210755024443, "grad_norm": 0.2084906621202298, "learning_rate": 4.931561996779388e-06, "loss": 0.3611, "step": 2516 }, { "epoch": 2.7332971211298207, "grad_norm": 0.2052105987190405, "learning_rate": 4.911433172302738e-06, "loss": 0.3808, "step": 2517 }, { "epoch": 2.734383487235198, "grad_norm": 0.2082527016794756, "learning_rate": 4.891304347826087e-06, "loss": 0.4116, "step": 2518 }, { "epoch": 2.7354698533405757, "grad_norm": 0.18595970209345364, "learning_rate": 4.871175523349437e-06, "loss": 0.3313, "step": 2519 }, { "epoch": 2.736556219445953, "grad_norm": 0.20425257520322965, "learning_rate": 4.851046698872786e-06, "loss": 0.3683, "step": 2520 }, { "epoch": 2.7376425855513307, "grad_norm": 0.19839706358970338, "learning_rate": 4.830917874396135e-06, "loss": 0.3264, "step": 2521 }, { "epoch": 2.738728951656708, "grad_norm": 0.20661182901756864, "learning_rate": 4.810789049919485e-06, "loss": 0.3814, "step": 2522 }, { "epoch": 2.739815317762086, "grad_norm": 0.1873577990402711, "learning_rate": 4.7906602254428345e-06, "loss": 0.3207, "step": 2523 }, { "epoch": 2.740901683867463, "grad_norm": 0.2113202600765857, "learning_rate": 4.770531400966183e-06, "loss": 0.3948, "step": 2524 }, { "epoch": 2.741988049972841, "grad_norm": 0.20274352467693907, "learning_rate": 4.750402576489534e-06, "loss": 0.3601, "step": 2525 }, { "epoch": 2.743074416078218, "grad_norm": 0.18405827005047293, "learning_rate": 4.7302737520128824e-06, "loss": 0.3274, "step": 2526 }, { "epoch": 2.744160782183596, "grad_norm": 0.18649199058181817, "learning_rate": 4.710144927536232e-06, "loss": 0.3287, "step": 2527 }, { "epoch": 2.7452471482889735, "grad_norm": 0.1961771819983141, "learning_rate": 4.690016103059582e-06, "loss": 0.3571, "step": 2528 }, { "epoch": 2.746333514394351, "grad_norm": 0.21143633883455779, "learning_rate": 4.669887278582931e-06, "loss": 0.3503, "step": 2529 }, { "epoch": 2.7474198804997285, "grad_norm": 0.20062229480065072, "learning_rate": 4.64975845410628e-06, "loss": 0.3707, "step": 2530 }, { "epoch": 2.748506246605106, "grad_norm": 0.43002412154292285, "learning_rate": 4.6296296296296296e-06, "loss": 0.3685, "step": 2531 }, { "epoch": 2.7495926127104835, "grad_norm": 0.1930421499420919, "learning_rate": 4.609500805152979e-06, "loss": 0.3577, "step": 2532 }, { "epoch": 2.750678978815861, "grad_norm": 0.19954060945604996, "learning_rate": 4.589371980676329e-06, "loss": 0.362, "step": 2533 }, { "epoch": 2.7517653449212385, "grad_norm": 0.19449412855422404, "learning_rate": 4.569243156199678e-06, "loss": 0.3322, "step": 2534 }, { "epoch": 2.752851711026616, "grad_norm": 0.20014418182821556, "learning_rate": 4.549114331723028e-06, "loss": 0.3433, "step": 2535 }, { "epoch": 2.7539380771319935, "grad_norm": 0.19970238432018053, "learning_rate": 4.528985507246377e-06, "loss": 0.35, "step": 2536 }, { "epoch": 2.755024443237371, "grad_norm": 0.1894191575422968, "learning_rate": 4.508856682769726e-06, "loss": 0.3501, "step": 2537 }, { "epoch": 2.7561108093427484, "grad_norm": 0.1995706897559087, "learning_rate": 4.488727858293076e-06, "loss": 0.3601, "step": 2538 }, { "epoch": 2.757197175448126, "grad_norm": 0.18699155040281007, "learning_rate": 4.468599033816425e-06, "loss": 0.3208, "step": 2539 }, { "epoch": 2.7582835415535034, "grad_norm": 0.21401993992207477, "learning_rate": 4.448470209339775e-06, "loss": 0.341, "step": 2540 }, { "epoch": 2.759369907658881, "grad_norm": 0.20298819577648206, "learning_rate": 4.428341384863125e-06, "loss": 0.3817, "step": 2541 }, { "epoch": 2.7604562737642584, "grad_norm": 0.18791314300469994, "learning_rate": 4.408212560386473e-06, "loss": 0.3147, "step": 2542 }, { "epoch": 2.761542639869636, "grad_norm": 0.19076538484084743, "learning_rate": 4.388083735909823e-06, "loss": 0.3469, "step": 2543 }, { "epoch": 2.762629005975014, "grad_norm": 0.18084941207956498, "learning_rate": 4.367954911433173e-06, "loss": 0.3212, "step": 2544 }, { "epoch": 2.763715372080391, "grad_norm": 0.19717779954234815, "learning_rate": 4.347826086956522e-06, "loss": 0.3227, "step": 2545 }, { "epoch": 2.764801738185769, "grad_norm": 0.20512702204703112, "learning_rate": 4.327697262479871e-06, "loss": 0.4106, "step": 2546 }, { "epoch": 2.765888104291146, "grad_norm": 0.19938601670077621, "learning_rate": 4.3075684380032205e-06, "loss": 0.34, "step": 2547 }, { "epoch": 2.7669744703965238, "grad_norm": 0.21173137495127614, "learning_rate": 4.28743961352657e-06, "loss": 0.3992, "step": 2548 }, { "epoch": 2.7680608365019013, "grad_norm": 0.1951337989996158, "learning_rate": 4.26731078904992e-06, "loss": 0.3317, "step": 2549 }, { "epoch": 2.7691472026072788, "grad_norm": 0.19643054956329214, "learning_rate": 4.247181964573269e-06, "loss": 0.3335, "step": 2550 }, { "epoch": 2.7702335687126562, "grad_norm": 0.19453597854621568, "learning_rate": 4.227053140096619e-06, "loss": 0.3325, "step": 2551 }, { "epoch": 2.7713199348180337, "grad_norm": 0.19020750467835218, "learning_rate": 4.206924315619968e-06, "loss": 0.3347, "step": 2552 }, { "epoch": 2.7724063009234112, "grad_norm": 0.19464663471447702, "learning_rate": 4.186795491143317e-06, "loss": 0.3615, "step": 2553 }, { "epoch": 2.7734926670287887, "grad_norm": 0.19625925635816774, "learning_rate": 4.166666666666667e-06, "loss": 0.3507, "step": 2554 }, { "epoch": 2.774579033134166, "grad_norm": 0.1851841654833825, "learning_rate": 4.146537842190016e-06, "loss": 0.3175, "step": 2555 }, { "epoch": 2.7756653992395437, "grad_norm": 0.3358081338021581, "learning_rate": 4.126409017713366e-06, "loss": 0.3573, "step": 2556 }, { "epoch": 2.776751765344921, "grad_norm": 0.20166513539813172, "learning_rate": 4.106280193236716e-06, "loss": 0.3883, "step": 2557 }, { "epoch": 2.7778381314502987, "grad_norm": 0.1938736397024277, "learning_rate": 4.086151368760064e-06, "loss": 0.3571, "step": 2558 }, { "epoch": 2.778924497555676, "grad_norm": 0.21808575369847247, "learning_rate": 4.066022544283414e-06, "loss": 0.4183, "step": 2559 }, { "epoch": 2.7800108636610537, "grad_norm": 0.21908031047438595, "learning_rate": 4.045893719806764e-06, "loss": 0.4028, "step": 2560 }, { "epoch": 2.781097229766431, "grad_norm": 0.18522893194958415, "learning_rate": 4.025764895330112e-06, "loss": 0.321, "step": 2561 }, { "epoch": 2.7821835958718086, "grad_norm": 0.2011824999454939, "learning_rate": 4.005636070853462e-06, "loss": 0.3882, "step": 2562 }, { "epoch": 2.7832699619771866, "grad_norm": 0.19057931588418517, "learning_rate": 3.9855072463768115e-06, "loss": 0.3277, "step": 2563 }, { "epoch": 2.7843563280825636, "grad_norm": 0.19931707496346726, "learning_rate": 3.965378421900161e-06, "loss": 0.4009, "step": 2564 }, { "epoch": 2.7854426941879415, "grad_norm": 0.184081313012606, "learning_rate": 3.945249597423511e-06, "loss": 0.3095, "step": 2565 }, { "epoch": 2.7865290602933186, "grad_norm": 0.19325919921636456, "learning_rate": 3.92512077294686e-06, "loss": 0.3285, "step": 2566 }, { "epoch": 2.7876154263986965, "grad_norm": 0.1978948794961651, "learning_rate": 3.90499194847021e-06, "loss": 0.3616, "step": 2567 }, { "epoch": 2.788701792504074, "grad_norm": 0.18343598990097337, "learning_rate": 3.884863123993559e-06, "loss": 0.3222, "step": 2568 }, { "epoch": 2.7897881586094515, "grad_norm": 0.18749067117576607, "learning_rate": 3.864734299516908e-06, "loss": 0.3381, "step": 2569 }, { "epoch": 2.790874524714829, "grad_norm": 0.19941876997367222, "learning_rate": 3.844605475040258e-06, "loss": 0.3631, "step": 2570 }, { "epoch": 2.7919608908202065, "grad_norm": 0.21913080313049124, "learning_rate": 3.8244766505636074e-06, "loss": 0.3845, "step": 2571 }, { "epoch": 2.793047256925584, "grad_norm": 0.18694542080631263, "learning_rate": 3.804347826086957e-06, "loss": 0.3507, "step": 2572 }, { "epoch": 2.7941336230309615, "grad_norm": 0.2062220477033725, "learning_rate": 3.784219001610306e-06, "loss": 0.4061, "step": 2573 }, { "epoch": 2.795219989136339, "grad_norm": 0.2035662742017655, "learning_rate": 3.764090177133656e-06, "loss": 0.368, "step": 2574 }, { "epoch": 2.7963063552417164, "grad_norm": 0.19206157958006614, "learning_rate": 3.743961352657005e-06, "loss": 0.3297, "step": 2575 }, { "epoch": 2.797392721347094, "grad_norm": 0.1846213739581415, "learning_rate": 3.723832528180354e-06, "loss": 0.3135, "step": 2576 }, { "epoch": 2.7984790874524714, "grad_norm": 0.1962366501027734, "learning_rate": 3.7037037037037037e-06, "loss": 0.3413, "step": 2577 }, { "epoch": 2.799565453557849, "grad_norm": 0.20655437927295936, "learning_rate": 3.683574879227053e-06, "loss": 0.4072, "step": 2578 }, { "epoch": 2.8006518196632264, "grad_norm": 0.21064742968628883, "learning_rate": 3.663446054750403e-06, "loss": 0.398, "step": 2579 }, { "epoch": 2.801738185768604, "grad_norm": 0.19187056989879683, "learning_rate": 3.6433172302737525e-06, "loss": 0.3594, "step": 2580 }, { "epoch": 2.8028245518739814, "grad_norm": 0.2157703628047709, "learning_rate": 3.6231884057971017e-06, "loss": 0.3823, "step": 2581 }, { "epoch": 2.8039109179793593, "grad_norm": 0.21010385905574966, "learning_rate": 3.6030595813204513e-06, "loss": 0.3794, "step": 2582 }, { "epoch": 2.8049972840847364, "grad_norm": 0.18293991757858505, "learning_rate": 3.5829307568438005e-06, "loss": 0.3253, "step": 2583 }, { "epoch": 2.8060836501901143, "grad_norm": 0.18819390922408316, "learning_rate": 3.5628019323671496e-06, "loss": 0.3387, "step": 2584 }, { "epoch": 2.8071700162954913, "grad_norm": 0.19050002755839493, "learning_rate": 3.5426731078904992e-06, "loss": 0.3336, "step": 2585 }, { "epoch": 2.8082563824008693, "grad_norm": 0.21113758288488815, "learning_rate": 3.5225442834138484e-06, "loss": 0.3947, "step": 2586 }, { "epoch": 2.8093427485062468, "grad_norm": 0.195124341483627, "learning_rate": 3.5024154589371984e-06, "loss": 0.3198, "step": 2587 }, { "epoch": 2.8104291146116243, "grad_norm": 0.1994414413195172, "learning_rate": 3.482286634460548e-06, "loss": 0.3483, "step": 2588 }, { "epoch": 2.8115154807170017, "grad_norm": 0.1940528299420318, "learning_rate": 3.462157809983897e-06, "loss": 0.3468, "step": 2589 }, { "epoch": 2.8126018468223792, "grad_norm": 0.24508635765556067, "learning_rate": 3.4420289855072464e-06, "loss": 0.2996, "step": 2590 }, { "epoch": 2.8136882129277567, "grad_norm": 0.19045612237346451, "learning_rate": 3.421900161030596e-06, "loss": 0.3227, "step": 2591 }, { "epoch": 2.814774579033134, "grad_norm": 0.18792332624673308, "learning_rate": 3.401771336553945e-06, "loss": 0.3277, "step": 2592 }, { "epoch": 2.8158609451385117, "grad_norm": 0.1822261577214099, "learning_rate": 3.3816425120772947e-06, "loss": 0.2808, "step": 2593 }, { "epoch": 2.816947311243889, "grad_norm": 0.1937639464916838, "learning_rate": 3.361513687600644e-06, "loss": 0.3282, "step": 2594 }, { "epoch": 2.8180336773492667, "grad_norm": 0.19897769286430783, "learning_rate": 3.341384863123994e-06, "loss": 0.3787, "step": 2595 }, { "epoch": 2.819120043454644, "grad_norm": 0.2009729180363465, "learning_rate": 3.3212560386473435e-06, "loss": 0.3889, "step": 2596 }, { "epoch": 2.8202064095600217, "grad_norm": 0.19540468714008902, "learning_rate": 3.3011272141706927e-06, "loss": 0.3451, "step": 2597 }, { "epoch": 2.821292775665399, "grad_norm": 0.19222478791322295, "learning_rate": 3.280998389694042e-06, "loss": 0.3104, "step": 2598 }, { "epoch": 2.8223791417707766, "grad_norm": 0.18804478730600585, "learning_rate": 3.2608695652173914e-06, "loss": 0.299, "step": 2599 }, { "epoch": 2.823465507876154, "grad_norm": 0.20259592542227597, "learning_rate": 3.2407407407407406e-06, "loss": 0.3428, "step": 2600 }, { "epoch": 2.8245518739815316, "grad_norm": 0.18038008867745972, "learning_rate": 3.22061191626409e-06, "loss": 0.2885, "step": 2601 }, { "epoch": 2.825638240086909, "grad_norm": 0.20563838434161413, "learning_rate": 3.2004830917874402e-06, "loss": 0.3473, "step": 2602 }, { "epoch": 2.826724606192287, "grad_norm": 0.2033857251347828, "learning_rate": 3.1803542673107894e-06, "loss": 0.3763, "step": 2603 }, { "epoch": 2.827810972297664, "grad_norm": 0.24876071098342464, "learning_rate": 3.160225442834139e-06, "loss": 0.3862, "step": 2604 }, { "epoch": 2.828897338403042, "grad_norm": 0.18708096968942614, "learning_rate": 3.140096618357488e-06, "loss": 0.3456, "step": 2605 }, { "epoch": 2.829983704508419, "grad_norm": 0.19447641664624757, "learning_rate": 3.1199677938808373e-06, "loss": 0.3669, "step": 2606 }, { "epoch": 2.831070070613797, "grad_norm": 0.2020890621238513, "learning_rate": 3.099838969404187e-06, "loss": 0.3497, "step": 2607 }, { "epoch": 2.8321564367191745, "grad_norm": 0.1856698164308254, "learning_rate": 3.0797101449275365e-06, "loss": 0.3185, "step": 2608 }, { "epoch": 2.833242802824552, "grad_norm": 0.3491089901977067, "learning_rate": 3.0595813204508857e-06, "loss": 0.4275, "step": 2609 }, { "epoch": 2.8343291689299295, "grad_norm": 0.1933912189585094, "learning_rate": 3.0394524959742353e-06, "loss": 0.3469, "step": 2610 }, { "epoch": 2.835415535035307, "grad_norm": 0.19496879018118, "learning_rate": 3.0193236714975845e-06, "loss": 0.3648, "step": 2611 }, { "epoch": 2.8365019011406845, "grad_norm": 0.19077708312006997, "learning_rate": 2.999194847020934e-06, "loss": 0.3197, "step": 2612 }, { "epoch": 2.837588267246062, "grad_norm": 0.19693106580405137, "learning_rate": 2.9790660225442837e-06, "loss": 0.3277, "step": 2613 }, { "epoch": 2.8386746333514394, "grad_norm": 0.19401436634452526, "learning_rate": 2.958937198067633e-06, "loss": 0.3456, "step": 2614 }, { "epoch": 2.839760999456817, "grad_norm": 0.19932648398133576, "learning_rate": 2.9388083735909824e-06, "loss": 0.3734, "step": 2615 }, { "epoch": 2.8408473655621944, "grad_norm": 0.19057298795289634, "learning_rate": 2.918679549114332e-06, "loss": 0.3381, "step": 2616 }, { "epoch": 2.841933731667572, "grad_norm": 0.209578828313627, "learning_rate": 2.898550724637681e-06, "loss": 0.3883, "step": 2617 }, { "epoch": 2.8430200977729494, "grad_norm": 0.19363120692814384, "learning_rate": 2.8784219001610308e-06, "loss": 0.3291, "step": 2618 }, { "epoch": 2.844106463878327, "grad_norm": 0.19800543994056655, "learning_rate": 2.85829307568438e-06, "loss": 0.366, "step": 2619 }, { "epoch": 2.8451928299837044, "grad_norm": 0.18243419052882495, "learning_rate": 2.8381642512077295e-06, "loss": 0.344, "step": 2620 }, { "epoch": 2.846279196089082, "grad_norm": 0.18829613761506012, "learning_rate": 2.818035426731079e-06, "loss": 0.3445, "step": 2621 }, { "epoch": 2.84736556219446, "grad_norm": 0.1862977419068281, "learning_rate": 2.7979066022544283e-06, "loss": 0.3458, "step": 2622 }, { "epoch": 2.848451928299837, "grad_norm": 0.18812630174762976, "learning_rate": 2.777777777777778e-06, "loss": 0.3332, "step": 2623 }, { "epoch": 2.8495382944052148, "grad_norm": 0.20201881206306593, "learning_rate": 2.7576489533011275e-06, "loss": 0.3648, "step": 2624 }, { "epoch": 2.850624660510592, "grad_norm": 2.8706076180315314, "learning_rate": 2.7375201288244767e-06, "loss": 0.5935, "step": 2625 }, { "epoch": 2.8517110266159698, "grad_norm": 0.1918647508822719, "learning_rate": 2.7173913043478263e-06, "loss": 0.3234, "step": 2626 }, { "epoch": 2.8527973927213472, "grad_norm": 0.19703425354806803, "learning_rate": 2.6972624798711754e-06, "loss": 0.3263, "step": 2627 }, { "epoch": 2.8538837588267247, "grad_norm": 0.19754625793414174, "learning_rate": 2.677133655394525e-06, "loss": 0.3275, "step": 2628 }, { "epoch": 2.8549701249321022, "grad_norm": 0.1954983814016778, "learning_rate": 2.6570048309178746e-06, "loss": 0.3611, "step": 2629 }, { "epoch": 2.8560564910374797, "grad_norm": 0.1844697487794452, "learning_rate": 2.636876006441224e-06, "loss": 0.3118, "step": 2630 }, { "epoch": 2.857142857142857, "grad_norm": 0.18870648622161765, "learning_rate": 2.6167471819645734e-06, "loss": 0.3113, "step": 2631 }, { "epoch": 2.8582292232482347, "grad_norm": 0.1939599381813623, "learning_rate": 2.596618357487923e-06, "loss": 0.3377, "step": 2632 }, { "epoch": 2.859315589353612, "grad_norm": 0.2095439233720718, "learning_rate": 2.576489533011272e-06, "loss": 0.3273, "step": 2633 }, { "epoch": 2.8604019554589897, "grad_norm": 0.19028386760199725, "learning_rate": 2.5563607085346218e-06, "loss": 0.3353, "step": 2634 }, { "epoch": 2.861488321564367, "grad_norm": 0.19609779782548228, "learning_rate": 2.5362318840579714e-06, "loss": 0.3524, "step": 2635 }, { "epoch": 2.8625746876697447, "grad_norm": 0.19777948522789993, "learning_rate": 2.5161030595813205e-06, "loss": 0.3728, "step": 2636 }, { "epoch": 2.863661053775122, "grad_norm": 0.20670429886510813, "learning_rate": 2.49597423510467e-06, "loss": 0.3755, "step": 2637 }, { "epoch": 2.8647474198804996, "grad_norm": 0.1953592051811565, "learning_rate": 2.4758454106280193e-06, "loss": 0.348, "step": 2638 }, { "epoch": 2.865833785985877, "grad_norm": 0.19080923621687934, "learning_rate": 2.455716586151369e-06, "loss": 0.3466, "step": 2639 }, { "epoch": 2.8669201520912546, "grad_norm": 0.19890672985953106, "learning_rate": 2.4355877616747185e-06, "loss": 0.3772, "step": 2640 }, { "epoch": 2.868006518196632, "grad_norm": 0.19615872463569736, "learning_rate": 2.4154589371980677e-06, "loss": 0.3618, "step": 2641 }, { "epoch": 2.8690928843020096, "grad_norm": 0.2539145867417979, "learning_rate": 2.3953301127214173e-06, "loss": 0.3411, "step": 2642 }, { "epoch": 2.8701792504073875, "grad_norm": 0.17759964194198943, "learning_rate": 2.375201288244767e-06, "loss": 0.3175, "step": 2643 }, { "epoch": 2.8712656165127646, "grad_norm": 0.20080740195195554, "learning_rate": 2.355072463768116e-06, "loss": 0.3771, "step": 2644 }, { "epoch": 2.8723519826181425, "grad_norm": 0.19805170757156898, "learning_rate": 2.3349436392914656e-06, "loss": 0.3628, "step": 2645 }, { "epoch": 2.8734383487235196, "grad_norm": 0.18303754334301334, "learning_rate": 2.3148148148148148e-06, "loss": 0.3239, "step": 2646 }, { "epoch": 2.8745247148288975, "grad_norm": 0.20802616785133266, "learning_rate": 2.2946859903381644e-06, "loss": 0.387, "step": 2647 }, { "epoch": 2.875611080934275, "grad_norm": 0.19182774597416785, "learning_rate": 2.274557165861514e-06, "loss": 0.3579, "step": 2648 }, { "epoch": 2.8766974470396525, "grad_norm": 0.18677404240997858, "learning_rate": 2.254428341384863e-06, "loss": 0.3401, "step": 2649 }, { "epoch": 2.87778381314503, "grad_norm": 0.18834536685170705, "learning_rate": 2.2342995169082123e-06, "loss": 0.3247, "step": 2650 }, { "epoch": 2.8788701792504074, "grad_norm": 0.20688626648578157, "learning_rate": 2.2141706924315623e-06, "loss": 0.3934, "step": 2651 }, { "epoch": 2.879956545355785, "grad_norm": 0.1855598916018259, "learning_rate": 2.1940418679549115e-06, "loss": 0.349, "step": 2652 }, { "epoch": 2.8810429114611624, "grad_norm": 0.20149815485878358, "learning_rate": 2.173913043478261e-06, "loss": 0.3816, "step": 2653 }, { "epoch": 2.88212927756654, "grad_norm": 0.19315230858503965, "learning_rate": 2.1537842190016103e-06, "loss": 0.3349, "step": 2654 }, { "epoch": 2.8832156436719174, "grad_norm": 0.2313738926075436, "learning_rate": 2.13365539452496e-06, "loss": 0.3065, "step": 2655 }, { "epoch": 2.884302009777295, "grad_norm": 0.20476170490160048, "learning_rate": 2.1135265700483095e-06, "loss": 0.3702, "step": 2656 }, { "epoch": 2.8853883758826724, "grad_norm": 0.20151431249965915, "learning_rate": 2.0933977455716586e-06, "loss": 0.3852, "step": 2657 }, { "epoch": 2.88647474198805, "grad_norm": 0.19088235783996585, "learning_rate": 2.073268921095008e-06, "loss": 0.3378, "step": 2658 }, { "epoch": 2.8875611080934274, "grad_norm": 0.1978874697002367, "learning_rate": 2.053140096618358e-06, "loss": 0.3646, "step": 2659 }, { "epoch": 2.888647474198805, "grad_norm": 0.1964943736432712, "learning_rate": 2.033011272141707e-06, "loss": 0.3592, "step": 2660 }, { "epoch": 2.8897338403041823, "grad_norm": 0.19652984013600955, "learning_rate": 2.012882447665056e-06, "loss": 0.3654, "step": 2661 }, { "epoch": 2.8908202064095603, "grad_norm": 0.18010619648432705, "learning_rate": 1.9927536231884058e-06, "loss": 0.3134, "step": 2662 }, { "epoch": 2.8919065725149373, "grad_norm": 0.19937625461145408, "learning_rate": 1.9726247987117554e-06, "loss": 0.3819, "step": 2663 }, { "epoch": 2.8929929386203153, "grad_norm": 0.1909240391761414, "learning_rate": 1.952495974235105e-06, "loss": 0.3531, "step": 2664 }, { "epoch": 2.8940793047256923, "grad_norm": 0.1910830535484662, "learning_rate": 1.932367149758454e-06, "loss": 0.3458, "step": 2665 }, { "epoch": 2.8951656708310702, "grad_norm": 0.18856014891114162, "learning_rate": 1.9122383252818037e-06, "loss": 0.3459, "step": 2666 }, { "epoch": 2.8962520369364477, "grad_norm": 0.1953345494261154, "learning_rate": 1.892109500805153e-06, "loss": 0.3475, "step": 2667 }, { "epoch": 2.897338403041825, "grad_norm": 0.1802750458612982, "learning_rate": 1.8719806763285025e-06, "loss": 0.3214, "step": 2668 }, { "epoch": 2.8984247691472027, "grad_norm": 0.2064604221523391, "learning_rate": 1.8518518518518519e-06, "loss": 0.4089, "step": 2669 }, { "epoch": 2.89951113525258, "grad_norm": 0.18131915723781367, "learning_rate": 1.8317230273752015e-06, "loss": 0.3227, "step": 2670 }, { "epoch": 2.9005975013579577, "grad_norm": 0.19766081603332458, "learning_rate": 1.8115942028985508e-06, "loss": 0.3713, "step": 2671 }, { "epoch": 2.901683867463335, "grad_norm": 0.18247717096089144, "learning_rate": 1.7914653784219002e-06, "loss": 0.3395, "step": 2672 }, { "epoch": 2.9027702335687127, "grad_norm": 0.18516276563438028, "learning_rate": 1.7713365539452496e-06, "loss": 0.3377, "step": 2673 }, { "epoch": 2.90385659967409, "grad_norm": 0.19836493010810333, "learning_rate": 1.7512077294685992e-06, "loss": 0.3672, "step": 2674 }, { "epoch": 2.9049429657794676, "grad_norm": 0.18338244004547896, "learning_rate": 1.7310789049919486e-06, "loss": 0.3389, "step": 2675 }, { "epoch": 2.906029331884845, "grad_norm": 0.17706148333523447, "learning_rate": 1.710950080515298e-06, "loss": 0.2999, "step": 2676 }, { "epoch": 2.9071156979902226, "grad_norm": 0.18892113776071132, "learning_rate": 1.6908212560386474e-06, "loss": 0.3241, "step": 2677 }, { "epoch": 2.9082020640956, "grad_norm": 0.18598797026496994, "learning_rate": 1.670692431561997e-06, "loss": 0.3282, "step": 2678 }, { "epoch": 2.9092884302009776, "grad_norm": 0.2021870879904115, "learning_rate": 1.6505636070853463e-06, "loss": 0.3661, "step": 2679 }, { "epoch": 2.910374796306355, "grad_norm": 0.17884443620488769, "learning_rate": 1.6304347826086957e-06, "loss": 0.3146, "step": 2680 }, { "epoch": 2.911461162411733, "grad_norm": 0.19199208091074965, "learning_rate": 1.610305958132045e-06, "loss": 0.344, "step": 2681 }, { "epoch": 2.91254752851711, "grad_norm": 0.20284270738711976, "learning_rate": 1.5901771336553947e-06, "loss": 0.3586, "step": 2682 }, { "epoch": 2.913633894622488, "grad_norm": 0.18395854094045286, "learning_rate": 1.570048309178744e-06, "loss": 0.3322, "step": 2683 }, { "epoch": 2.914720260727865, "grad_norm": 0.19379069520156972, "learning_rate": 1.5499194847020935e-06, "loss": 0.3713, "step": 2684 }, { "epoch": 2.915806626833243, "grad_norm": 0.20055545376134942, "learning_rate": 1.5297906602254428e-06, "loss": 0.3833, "step": 2685 }, { "epoch": 2.9168929929386205, "grad_norm": 0.18029514481796385, "learning_rate": 1.5096618357487922e-06, "loss": 0.3438, "step": 2686 }, { "epoch": 2.917979359043998, "grad_norm": 0.18596613694139216, "learning_rate": 1.4895330112721418e-06, "loss": 0.3479, "step": 2687 }, { "epoch": 2.9190657251493755, "grad_norm": 0.1967110577858076, "learning_rate": 1.4694041867954912e-06, "loss": 0.3559, "step": 2688 }, { "epoch": 2.920152091254753, "grad_norm": 0.18341293722087007, "learning_rate": 1.4492753623188406e-06, "loss": 0.3471, "step": 2689 }, { "epoch": 2.9212384573601304, "grad_norm": 0.18335445020763821, "learning_rate": 1.42914653784219e-06, "loss": 0.3377, "step": 2690 }, { "epoch": 2.922324823465508, "grad_norm": 0.1933718010579775, "learning_rate": 1.4090177133655396e-06, "loss": 0.3746, "step": 2691 }, { "epoch": 2.9234111895708854, "grad_norm": 0.19031624759987922, "learning_rate": 1.388888888888889e-06, "loss": 0.359, "step": 2692 }, { "epoch": 2.924497555676263, "grad_norm": 0.20651634863557528, "learning_rate": 1.3687600644122383e-06, "loss": 0.3662, "step": 2693 }, { "epoch": 2.9255839217816404, "grad_norm": 0.19186227981561624, "learning_rate": 1.3486312399355877e-06, "loss": 0.3538, "step": 2694 }, { "epoch": 2.926670287887018, "grad_norm": 0.19018828542456506, "learning_rate": 1.3285024154589373e-06, "loss": 0.3278, "step": 2695 }, { "epoch": 2.9277566539923954, "grad_norm": 2.6125529385462456, "learning_rate": 1.3083735909822867e-06, "loss": 0.3411, "step": 2696 }, { "epoch": 2.928843020097773, "grad_norm": 0.18449776805223217, "learning_rate": 1.288244766505636e-06, "loss": 0.3228, "step": 2697 }, { "epoch": 2.9299293862031504, "grad_norm": 0.1909577660583255, "learning_rate": 1.2681159420289857e-06, "loss": 0.3743, "step": 2698 }, { "epoch": 2.931015752308528, "grad_norm": 0.18629964049047723, "learning_rate": 1.247987117552335e-06, "loss": 0.3375, "step": 2699 }, { "epoch": 2.9321021184139053, "grad_norm": 0.19264026243699875, "learning_rate": 1.2278582930756844e-06, "loss": 0.3411, "step": 2700 }, { "epoch": 2.933188484519283, "grad_norm": 0.1807769782835777, "learning_rate": 1.2077294685990338e-06, "loss": 0.3354, "step": 2701 }, { "epoch": 2.9342748506246608, "grad_norm": 0.20068668430457776, "learning_rate": 1.1876006441223834e-06, "loss": 0.3576, "step": 2702 }, { "epoch": 2.935361216730038, "grad_norm": 0.19194289673729759, "learning_rate": 1.1674718196457328e-06, "loss": 0.321, "step": 2703 }, { "epoch": 2.9364475828354157, "grad_norm": 0.19772770031108994, "learning_rate": 1.1473429951690822e-06, "loss": 0.3387, "step": 2704 }, { "epoch": 2.937533948940793, "grad_norm": 0.1936359894191471, "learning_rate": 1.1272141706924316e-06, "loss": 0.3651, "step": 2705 }, { "epoch": 2.9386203150461707, "grad_norm": 0.19383666714386596, "learning_rate": 1.1070853462157812e-06, "loss": 0.3469, "step": 2706 }, { "epoch": 2.939706681151548, "grad_norm": 0.1760614190794672, "learning_rate": 1.0869565217391306e-06, "loss": 0.3102, "step": 2707 }, { "epoch": 2.9407930472569257, "grad_norm": 0.19101407115539418, "learning_rate": 1.06682769726248e-06, "loss": 0.3622, "step": 2708 }, { "epoch": 2.941879413362303, "grad_norm": 0.200083465501516, "learning_rate": 1.0466988727858293e-06, "loss": 0.3516, "step": 2709 }, { "epoch": 2.9429657794676807, "grad_norm": 0.1890895448328312, "learning_rate": 1.026570048309179e-06, "loss": 0.359, "step": 2710 }, { "epoch": 2.944052145573058, "grad_norm": 0.20067287544910503, "learning_rate": 1.006441223832528e-06, "loss": 0.3822, "step": 2711 }, { "epoch": 2.9451385116784357, "grad_norm": 0.24489382441530963, "learning_rate": 9.863123993558777e-07, "loss": 0.3208, "step": 2712 }, { "epoch": 2.946224877783813, "grad_norm": 0.18710888350697752, "learning_rate": 9.66183574879227e-07, "loss": 0.3182, "step": 2713 }, { "epoch": 2.9473112438891906, "grad_norm": 0.18660752325309288, "learning_rate": 9.460547504025766e-07, "loss": 0.3524, "step": 2714 }, { "epoch": 2.948397609994568, "grad_norm": 0.21353310107337506, "learning_rate": 9.259259259259259e-07, "loss": 0.3465, "step": 2715 }, { "epoch": 2.9494839760999456, "grad_norm": 0.18241430187240135, "learning_rate": 9.057971014492754e-07, "loss": 0.3381, "step": 2716 }, { "epoch": 2.950570342205323, "grad_norm": 0.17664344697898302, "learning_rate": 8.856682769726248e-07, "loss": 0.3254, "step": 2717 }, { "epoch": 2.9516567083107006, "grad_norm": 0.18963467790011077, "learning_rate": 8.655394524959743e-07, "loss": 0.3291, "step": 2718 }, { "epoch": 2.952743074416078, "grad_norm": 0.18300294305628734, "learning_rate": 8.454106280193237e-07, "loss": 0.3311, "step": 2719 }, { "epoch": 2.9538294405214556, "grad_norm": 0.19666372446512495, "learning_rate": 8.252818035426732e-07, "loss": 0.3528, "step": 2720 }, { "epoch": 2.9549158066268335, "grad_norm": 0.18941366622663702, "learning_rate": 8.051529790660226e-07, "loss": 0.3181, "step": 2721 }, { "epoch": 2.9560021727322106, "grad_norm": 0.27642059558196885, "learning_rate": 7.85024154589372e-07, "loss": 0.3858, "step": 2722 }, { "epoch": 2.9570885388375885, "grad_norm": 0.18001238115059445, "learning_rate": 7.648953301127214e-07, "loss": 0.3238, "step": 2723 }, { "epoch": 2.9581749049429655, "grad_norm": 0.19128456226519494, "learning_rate": 7.447665056360709e-07, "loss": 0.361, "step": 2724 }, { "epoch": 2.9592612710483435, "grad_norm": 0.18097195857658635, "learning_rate": 7.246376811594203e-07, "loss": 0.3218, "step": 2725 }, { "epoch": 2.960347637153721, "grad_norm": 0.18703624453846635, "learning_rate": 7.045088566827698e-07, "loss": 0.3421, "step": 2726 }, { "epoch": 2.9614340032590984, "grad_norm": 0.19258921980492352, "learning_rate": 6.843800322061192e-07, "loss": 0.3673, "step": 2727 }, { "epoch": 2.962520369364476, "grad_norm": 0.18870800560468298, "learning_rate": 6.642512077294687e-07, "loss": 0.3265, "step": 2728 }, { "epoch": 2.9636067354698534, "grad_norm": 0.19064896929558806, "learning_rate": 6.44122383252818e-07, "loss": 0.3355, "step": 2729 }, { "epoch": 2.964693101575231, "grad_norm": 0.18898648686464387, "learning_rate": 6.239935587761675e-07, "loss": 0.3482, "step": 2730 }, { "epoch": 2.9657794676806084, "grad_norm": 0.20962653902520625, "learning_rate": 6.038647342995169e-07, "loss": 0.4005, "step": 2731 }, { "epoch": 2.966865833785986, "grad_norm": 0.18762052359127268, "learning_rate": 5.837359098228664e-07, "loss": 0.35, "step": 2732 }, { "epoch": 2.9679521998913634, "grad_norm": 0.1882567282235692, "learning_rate": 5.636070853462158e-07, "loss": 0.3292, "step": 2733 }, { "epoch": 2.969038565996741, "grad_norm": 0.1863188252730181, "learning_rate": 5.434782608695653e-07, "loss": 0.3264, "step": 2734 }, { "epoch": 2.9701249321021184, "grad_norm": 0.2064949454824847, "learning_rate": 5.233494363929147e-07, "loss": 0.3581, "step": 2735 }, { "epoch": 2.971211298207496, "grad_norm": 0.18721221660837847, "learning_rate": 5.03220611916264e-07, "loss": 0.324, "step": 2736 }, { "epoch": 2.9722976643128733, "grad_norm": 0.20400569789233944, "learning_rate": 4.830917874396135e-07, "loss": 0.398, "step": 2737 }, { "epoch": 2.973384030418251, "grad_norm": 0.18876175494627856, "learning_rate": 4.6296296296296297e-07, "loss": 0.3377, "step": 2738 }, { "epoch": 2.9744703965236283, "grad_norm": 0.18386223597972115, "learning_rate": 4.428341384863124e-07, "loss": 0.331, "step": 2739 }, { "epoch": 2.975556762629006, "grad_norm": 0.1942155291505249, "learning_rate": 4.2270531400966184e-07, "loss": 0.3299, "step": 2740 }, { "epoch": 2.9766431287343833, "grad_norm": 0.19197989282213052, "learning_rate": 4.025764895330113e-07, "loss": 0.3466, "step": 2741 }, { "epoch": 2.9777294948397612, "grad_norm": 0.1937078585575583, "learning_rate": 3.824476650563607e-07, "loss": 0.35, "step": 2742 }, { "epoch": 2.9788158609451383, "grad_norm": 0.1793357168984627, "learning_rate": 3.6231884057971015e-07, "loss": 0.313, "step": 2743 }, { "epoch": 2.979902227050516, "grad_norm": 0.18436778305740367, "learning_rate": 3.421900161030596e-07, "loss": 0.3491, "step": 2744 }, { "epoch": 2.9809885931558933, "grad_norm": 0.19185780907472721, "learning_rate": 3.22061191626409e-07, "loss": 0.3605, "step": 2745 }, { "epoch": 2.982074959261271, "grad_norm": 0.18830226589455795, "learning_rate": 3.0193236714975846e-07, "loss": 0.3642, "step": 2746 }, { "epoch": 2.9831613253666487, "grad_norm": 0.18369068287546522, "learning_rate": 2.818035426731079e-07, "loss": 0.3595, "step": 2747 }, { "epoch": 2.984247691472026, "grad_norm": 0.1899171473708013, "learning_rate": 2.6167471819645733e-07, "loss": 0.3488, "step": 2748 }, { "epoch": 2.9853340575774037, "grad_norm": 0.17945003494502718, "learning_rate": 2.4154589371980677e-07, "loss": 0.3139, "step": 2749 }, { "epoch": 2.986420423682781, "grad_norm": 0.18210014961659204, "learning_rate": 2.214170692431562e-07, "loss": 0.322, "step": 2750 }, { "epoch": 2.9875067897881586, "grad_norm": 0.1897853301889048, "learning_rate": 2.0128824476650564e-07, "loss": 0.3372, "step": 2751 }, { "epoch": 2.988593155893536, "grad_norm": 0.18877538789193724, "learning_rate": 1.8115942028985507e-07, "loss": 0.3065, "step": 2752 }, { "epoch": 2.9896795219989136, "grad_norm": 0.19424633419469828, "learning_rate": 1.610305958132045e-07, "loss": 0.3753, "step": 2753 }, { "epoch": 2.990765888104291, "grad_norm": 0.18120512800790176, "learning_rate": 1.4090177133655395e-07, "loss": 0.3247, "step": 2754 }, { "epoch": 2.9918522542096686, "grad_norm": 0.20112469813147693, "learning_rate": 1.2077294685990338e-07, "loss": 0.3878, "step": 2755 }, { "epoch": 2.992938620315046, "grad_norm": 0.1844942617512605, "learning_rate": 1.0064412238325282e-07, "loss": 0.3322, "step": 2756 }, { "epoch": 2.9940249864204236, "grad_norm": 0.20167100070801275, "learning_rate": 8.051529790660226e-08, "loss": 0.3593, "step": 2757 }, { "epoch": 2.995111352525801, "grad_norm": 0.1885885200639767, "learning_rate": 6.038647342995169e-08, "loss": 0.3372, "step": 2758 }, { "epoch": 2.9961977186311786, "grad_norm": 0.1849191606111283, "learning_rate": 4.025764895330113e-08, "loss": 0.3137, "step": 2759 }, { "epoch": 2.997284084736556, "grad_norm": 0.18565580591018752, "learning_rate": 2.0128824476650564e-08, "loss": 0.3396, "step": 2760 }, { "epoch": 2.997284084736556, "step": 2760, "total_flos": 3.0680510616901255e+19, "train_loss": 0.5328506597574206, "train_runtime": 86884.78, "train_samples_per_second": 0.508, "train_steps_per_second": 0.032 } ], "logging_steps": 1, "max_steps": 2760, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.0680510616901255e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }