{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3219, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009319664492078285, "grad_norm": 18.574089408545547, "learning_rate": 6.211180124223603e-08, "loss": 4.6295, "step": 1 }, { "epoch": 0.001863932898415657, "grad_norm": 18.343383668058692, "learning_rate": 1.2422360248447206e-07, "loss": 4.1939, "step": 2 }, { "epoch": 0.0027958993476234857, "grad_norm": 17.922014984298816, "learning_rate": 1.863354037267081e-07, "loss": 4.7484, "step": 3 }, { "epoch": 0.003727865796831314, "grad_norm": 19.46946649371405, "learning_rate": 2.484472049689441e-07, "loss": 5.0679, "step": 4 }, { "epoch": 0.004659832246039142, "grad_norm": 18.319109995729647, "learning_rate": 3.1055900621118013e-07, "loss": 4.9611, "step": 5 }, { "epoch": 0.005591798695246971, "grad_norm": 17.804724057833564, "learning_rate": 3.726708074534162e-07, "loss": 4.9502, "step": 6 }, { "epoch": 0.0065237651444548, "grad_norm": 19.36331949024679, "learning_rate": 4.347826086956522e-07, "loss": 5.1492, "step": 7 }, { "epoch": 0.007455731593662628, "grad_norm": 19.131843915883806, "learning_rate": 4.968944099378882e-07, "loss": 5.0172, "step": 8 }, { "epoch": 0.008387698042870456, "grad_norm": 18.63962003204135, "learning_rate": 5.590062111801243e-07, "loss": 4.9939, "step": 9 }, { "epoch": 0.009319664492078284, "grad_norm": 18.18631440401455, "learning_rate": 6.211180124223603e-07, "loss": 4.649, "step": 10 }, { "epoch": 0.010251630941286114, "grad_norm": 18.034499117525616, "learning_rate": 6.832298136645964e-07, "loss": 4.9143, "step": 11 }, { "epoch": 0.011183597390493943, "grad_norm": 18.62600975709469, "learning_rate": 7.453416149068324e-07, "loss": 5.0096, "step": 12 }, { "epoch": 0.012115563839701771, "grad_norm": 19.8479766346946, "learning_rate": 8.074534161490684e-07, "loss": 5.0506, "step": 13 }, { "epoch": 0.0130475302889096, "grad_norm": 17.73840986880824, "learning_rate": 8.695652173913044e-07, "loss": 4.6537, "step": 14 }, { "epoch": 0.013979496738117428, "grad_norm": 18.192748130143052, "learning_rate": 9.316770186335404e-07, "loss": 4.2078, "step": 15 }, { "epoch": 0.014911463187325256, "grad_norm": 18.233205387557724, "learning_rate": 9.937888198757765e-07, "loss": 5.0602, "step": 16 }, { "epoch": 0.015843429636533086, "grad_norm": 17.846263307005128, "learning_rate": 1.0559006211180126e-06, "loss": 4.5846, "step": 17 }, { "epoch": 0.016775396085740912, "grad_norm": 16.617375951186713, "learning_rate": 1.1180124223602485e-06, "loss": 4.3836, "step": 18 }, { "epoch": 0.017707362534948742, "grad_norm": 17.23349132708635, "learning_rate": 1.1801242236024846e-06, "loss": 5.0972, "step": 19 }, { "epoch": 0.01863932898415657, "grad_norm": 16.6702502319918, "learning_rate": 1.2422360248447205e-06, "loss": 4.4693, "step": 20 }, { "epoch": 0.0195712954333644, "grad_norm": 15.948738193062292, "learning_rate": 1.3043478260869566e-06, "loss": 4.9348, "step": 21 }, { "epoch": 0.02050326188257223, "grad_norm": 16.24449858542173, "learning_rate": 1.3664596273291927e-06, "loss": 4.3556, "step": 22 }, { "epoch": 0.021435228331780055, "grad_norm": 16.10274573605739, "learning_rate": 1.4285714285714286e-06, "loss": 4.3407, "step": 23 }, { "epoch": 0.022367194780987885, "grad_norm": 14.481579007026856, "learning_rate": 1.4906832298136647e-06, "loss": 4.4955, "step": 24 }, { "epoch": 0.023299161230195712, "grad_norm": 13.007823757463024, "learning_rate": 1.5527950310559006e-06, "loss": 4.3338, "step": 25 }, { "epoch": 0.024231127679403542, "grad_norm": 12.860715441103322, "learning_rate": 1.6149068322981367e-06, "loss": 3.8744, "step": 26 }, { "epoch": 0.02516309412861137, "grad_norm": 12.129005561487674, "learning_rate": 1.6770186335403729e-06, "loss": 4.6108, "step": 27 }, { "epoch": 0.0260950605778192, "grad_norm": 12.006174338787702, "learning_rate": 1.7391304347826088e-06, "loss": 4.1757, "step": 28 }, { "epoch": 0.02702702702702703, "grad_norm": 11.671706802851274, "learning_rate": 1.8012422360248449e-06, "loss": 4.3269, "step": 29 }, { "epoch": 0.027958993476234855, "grad_norm": 10.995296391573607, "learning_rate": 1.8633540372670808e-06, "loss": 3.9514, "step": 30 }, { "epoch": 0.028890959925442685, "grad_norm": 10.902493273792786, "learning_rate": 1.925465838509317e-06, "loss": 4.2675, "step": 31 }, { "epoch": 0.02982292637465051, "grad_norm": 10.906235219525374, "learning_rate": 1.987577639751553e-06, "loss": 4.175, "step": 32 }, { "epoch": 0.03075489282385834, "grad_norm": 8.85696376859057, "learning_rate": 2.049689440993789e-06, "loss": 4.2259, "step": 33 }, { "epoch": 0.03168685927306617, "grad_norm": 6.685256214702312, "learning_rate": 2.111801242236025e-06, "loss": 3.7789, "step": 34 }, { "epoch": 0.032618825722273995, "grad_norm": 6.824947806869357, "learning_rate": 2.173913043478261e-06, "loss": 4.1555, "step": 35 }, { "epoch": 0.033550792171481825, "grad_norm": 6.496600179532148, "learning_rate": 2.236024844720497e-06, "loss": 3.9308, "step": 36 }, { "epoch": 0.034482758620689655, "grad_norm": 6.238861700807826, "learning_rate": 2.298136645962733e-06, "loss": 4.0557, "step": 37 }, { "epoch": 0.035414725069897485, "grad_norm": 6.167939076908159, "learning_rate": 2.3602484472049692e-06, "loss": 4.0247, "step": 38 }, { "epoch": 0.036346691519105315, "grad_norm": 6.15352802231574, "learning_rate": 2.422360248447205e-06, "loss": 4.0924, "step": 39 }, { "epoch": 0.03727865796831314, "grad_norm": 5.935879862796458, "learning_rate": 2.484472049689441e-06, "loss": 4.0319, "step": 40 }, { "epoch": 0.03821062441752097, "grad_norm": 5.244596176050669, "learning_rate": 2.546583850931677e-06, "loss": 3.6406, "step": 41 }, { "epoch": 0.0391425908667288, "grad_norm": 6.162066650641728, "learning_rate": 2.6086956521739132e-06, "loss": 3.8968, "step": 42 }, { "epoch": 0.04007455731593663, "grad_norm": 5.247261773096669, "learning_rate": 2.670807453416149e-06, "loss": 3.8575, "step": 43 }, { "epoch": 0.04100652376514446, "grad_norm": 4.98613038458278, "learning_rate": 2.7329192546583855e-06, "loss": 3.8388, "step": 44 }, { "epoch": 0.04193849021435228, "grad_norm": 4.741676819324585, "learning_rate": 2.795031055900621e-06, "loss": 3.9067, "step": 45 }, { "epoch": 0.04287045666356011, "grad_norm": 4.2629982639436275, "learning_rate": 2.8571428571428573e-06, "loss": 3.1629, "step": 46 }, { "epoch": 0.04380242311276794, "grad_norm": 4.434796769394744, "learning_rate": 2.919254658385093e-06, "loss": 3.5441, "step": 47 }, { "epoch": 0.04473438956197577, "grad_norm": 4.074695537937748, "learning_rate": 2.9813664596273295e-06, "loss": 3.9441, "step": 48 }, { "epoch": 0.045666356011183594, "grad_norm": 4.465264468362094, "learning_rate": 3.043478260869566e-06, "loss": 3.6605, "step": 49 }, { "epoch": 0.046598322460391424, "grad_norm": 4.2931101651359125, "learning_rate": 3.1055900621118013e-06, "loss": 3.5003, "step": 50 }, { "epoch": 0.047530288909599254, "grad_norm": 5.895837152388427, "learning_rate": 3.1677018633540376e-06, "loss": 3.4957, "step": 51 }, { "epoch": 0.048462255358807084, "grad_norm": 5.413600670699295, "learning_rate": 3.2298136645962735e-06, "loss": 3.4089, "step": 52 }, { "epoch": 0.049394221808014914, "grad_norm": 3.8418846847946244, "learning_rate": 3.29192546583851e-06, "loss": 3.752, "step": 53 }, { "epoch": 0.05032618825722274, "grad_norm": 4.984184630291769, "learning_rate": 3.3540372670807457e-06, "loss": 3.6699, "step": 54 }, { "epoch": 0.05125815470643057, "grad_norm": 4.2242024884964895, "learning_rate": 3.4161490683229816e-06, "loss": 3.7871, "step": 55 }, { "epoch": 0.0521901211556384, "grad_norm": 3.453797330775885, "learning_rate": 3.4782608695652175e-06, "loss": 3.3426, "step": 56 }, { "epoch": 0.05312208760484623, "grad_norm": 7.182773667236586, "learning_rate": 3.540372670807454e-06, "loss": 3.7875, "step": 57 }, { "epoch": 0.05405405405405406, "grad_norm": 3.5593833833836572, "learning_rate": 3.6024844720496897e-06, "loss": 3.7083, "step": 58 }, { "epoch": 0.05498602050326188, "grad_norm": 3.7152015333114057, "learning_rate": 3.664596273291926e-06, "loss": 3.8052, "step": 59 }, { "epoch": 0.05591798695246971, "grad_norm": 3.8588032056870163, "learning_rate": 3.7267080745341615e-06, "loss": 3.7308, "step": 60 }, { "epoch": 0.05684995340167754, "grad_norm": 4.01035322253532, "learning_rate": 3.788819875776398e-06, "loss": 3.2201, "step": 61 }, { "epoch": 0.05778191985088537, "grad_norm": 3.875676858094562, "learning_rate": 3.850931677018634e-06, "loss": 3.4786, "step": 62 }, { "epoch": 0.05871388630009319, "grad_norm": 3.0075057574254123, "learning_rate": 3.91304347826087e-06, "loss": 2.9468, "step": 63 }, { "epoch": 0.05964585274930102, "grad_norm": 3.605311821158966, "learning_rate": 3.975155279503106e-06, "loss": 3.4185, "step": 64 }, { "epoch": 0.06057781919850885, "grad_norm": 3.114472647262959, "learning_rate": 4.037267080745342e-06, "loss": 3.5549, "step": 65 }, { "epoch": 0.06150978564771668, "grad_norm": 2.784805980735676, "learning_rate": 4.099378881987578e-06, "loss": 3.2428, "step": 66 }, { "epoch": 0.06244175209692451, "grad_norm": 3.4834366633949947, "learning_rate": 4.1614906832298145e-06, "loss": 3.2637, "step": 67 }, { "epoch": 0.06337371854613234, "grad_norm": 3.4365444464217814, "learning_rate": 4.22360248447205e-06, "loss": 3.1647, "step": 68 }, { "epoch": 0.06430568499534017, "grad_norm": 3.3917384725551614, "learning_rate": 4.2857142857142855e-06, "loss": 3.6672, "step": 69 }, { "epoch": 0.06523765144454799, "grad_norm": 3.1972178855625115, "learning_rate": 4.347826086956522e-06, "loss": 3.5035, "step": 70 }, { "epoch": 0.06616961789375582, "grad_norm": 3.0613185110381878, "learning_rate": 4.409937888198758e-06, "loss": 3.5424, "step": 71 }, { "epoch": 0.06710158434296365, "grad_norm": 3.0554657774406944, "learning_rate": 4.472049689440994e-06, "loss": 3.414, "step": 72 }, { "epoch": 0.06803355079217148, "grad_norm": 3.7357585743545916, "learning_rate": 4.534161490683231e-06, "loss": 3.5617, "step": 73 }, { "epoch": 0.06896551724137931, "grad_norm": 2.827237803715182, "learning_rate": 4.596273291925466e-06, "loss": 3.159, "step": 74 }, { "epoch": 0.06989748369058714, "grad_norm": 4.462763838164122, "learning_rate": 4.6583850931677025e-06, "loss": 3.3488, "step": 75 }, { "epoch": 0.07082945013979497, "grad_norm": 3.059565170854776, "learning_rate": 4.7204968944099384e-06, "loss": 3.2142, "step": 76 }, { "epoch": 0.0717614165890028, "grad_norm": 2.734157320023504, "learning_rate": 4.782608695652174e-06, "loss": 2.7664, "step": 77 }, { "epoch": 0.07269338303821063, "grad_norm": 3.6587733817110264, "learning_rate": 4.84472049689441e-06, "loss": 3.2643, "step": 78 }, { "epoch": 0.07362534948741846, "grad_norm": 3.3269146338144693, "learning_rate": 4.906832298136646e-06, "loss": 3.1043, "step": 79 }, { "epoch": 0.07455731593662628, "grad_norm": 2.738150009960239, "learning_rate": 4.968944099378882e-06, "loss": 3.0621, "step": 80 }, { "epoch": 0.0754892823858341, "grad_norm": 3.4869120344139963, "learning_rate": 5.031055900621118e-06, "loss": 3.2743, "step": 81 }, { "epoch": 0.07642124883504194, "grad_norm": 3.0971390514460646, "learning_rate": 5.093167701863354e-06, "loss": 3.3498, "step": 82 }, { "epoch": 0.07735321528424977, "grad_norm": 3.06197926790978, "learning_rate": 5.155279503105591e-06, "loss": 2.9556, "step": 83 }, { "epoch": 0.0782851817334576, "grad_norm": 2.041484614847774, "learning_rate": 5.2173913043478265e-06, "loss": 2.9338, "step": 84 }, { "epoch": 0.07921714818266543, "grad_norm": 3.6126272796749745, "learning_rate": 5.279503105590062e-06, "loss": 3.0102, "step": 85 }, { "epoch": 0.08014911463187326, "grad_norm": 2.4237304578285093, "learning_rate": 5.341614906832298e-06, "loss": 3.5053, "step": 86 }, { "epoch": 0.08108108108108109, "grad_norm": 2.7673114002910135, "learning_rate": 5.403726708074535e-06, "loss": 3.8122, "step": 87 }, { "epoch": 0.08201304753028892, "grad_norm": 3.080293681735572, "learning_rate": 5.465838509316771e-06, "loss": 3.3841, "step": 88 }, { "epoch": 0.08294501397949673, "grad_norm": 2.774773357802275, "learning_rate": 5.527950310559007e-06, "loss": 3.2569, "step": 89 }, { "epoch": 0.08387698042870456, "grad_norm": 2.6437546881211498, "learning_rate": 5.590062111801242e-06, "loss": 2.8321, "step": 90 }, { "epoch": 0.08480894687791239, "grad_norm": 2.352403779246417, "learning_rate": 5.652173913043479e-06, "loss": 2.9358, "step": 91 }, { "epoch": 0.08574091332712022, "grad_norm": 2.5604557087017454, "learning_rate": 5.7142857142857145e-06, "loss": 2.8585, "step": 92 }, { "epoch": 0.08667287977632805, "grad_norm": 3.366633960772098, "learning_rate": 5.77639751552795e-06, "loss": 3.1381, "step": 93 }, { "epoch": 0.08760484622553588, "grad_norm": 2.750954439238213, "learning_rate": 5.838509316770186e-06, "loss": 3.0604, "step": 94 }, { "epoch": 0.08853681267474371, "grad_norm": 2.819752106992725, "learning_rate": 5.900621118012423e-06, "loss": 3.146, "step": 95 }, { "epoch": 0.08946877912395154, "grad_norm": 2.9678482277868654, "learning_rate": 5.962732919254659e-06, "loss": 3.2837, "step": 96 }, { "epoch": 0.09040074557315937, "grad_norm": 2.3285939785273424, "learning_rate": 6.024844720496895e-06, "loss": 2.9056, "step": 97 }, { "epoch": 0.09133271202236719, "grad_norm": 2.7716808716278054, "learning_rate": 6.086956521739132e-06, "loss": 2.6162, "step": 98 }, { "epoch": 0.09226467847157502, "grad_norm": 3.0361965043974566, "learning_rate": 6.1490683229813675e-06, "loss": 3.2052, "step": 99 }, { "epoch": 0.09319664492078285, "grad_norm": 3.098390752577542, "learning_rate": 6.2111801242236025e-06, "loss": 2.9289, "step": 100 }, { "epoch": 0.09412861136999068, "grad_norm": 2.6837382120296662, "learning_rate": 6.2732919254658384e-06, "loss": 2.905, "step": 101 }, { "epoch": 0.09506057781919851, "grad_norm": 2.8348616747948503, "learning_rate": 6.335403726708075e-06, "loss": 2.9806, "step": 102 }, { "epoch": 0.09599254426840634, "grad_norm": 3.066229205949389, "learning_rate": 6.397515527950311e-06, "loss": 3.0331, "step": 103 }, { "epoch": 0.09692451071761417, "grad_norm": 2.2636657317286186, "learning_rate": 6.459627329192547e-06, "loss": 2.833, "step": 104 }, { "epoch": 0.097856477166822, "grad_norm": 3.1192657740462737, "learning_rate": 6.521739130434783e-06, "loss": 3.2033, "step": 105 }, { "epoch": 0.09878844361602983, "grad_norm": 2.5819517710852695, "learning_rate": 6.58385093167702e-06, "loss": 3.5739, "step": 106 }, { "epoch": 0.09972041006523766, "grad_norm": 2.7502633070352904, "learning_rate": 6.6459627329192555e-06, "loss": 2.8253, "step": 107 }, { "epoch": 0.10065237651444547, "grad_norm": 3.4324127825530715, "learning_rate": 6.7080745341614914e-06, "loss": 3.0911, "step": 108 }, { "epoch": 0.1015843429636533, "grad_norm": 2.258105827292967, "learning_rate": 6.7701863354037265e-06, "loss": 2.6164, "step": 109 }, { "epoch": 0.10251630941286113, "grad_norm": 2.6810685866415658, "learning_rate": 6.832298136645963e-06, "loss": 2.784, "step": 110 }, { "epoch": 0.10344827586206896, "grad_norm": 2.75708984267076, "learning_rate": 6.894409937888199e-06, "loss": 3.0053, "step": 111 }, { "epoch": 0.1043802423112768, "grad_norm": 2.8032119814940812, "learning_rate": 6.956521739130435e-06, "loss": 2.9915, "step": 112 }, { "epoch": 0.10531220876048462, "grad_norm": 3.4473246332701573, "learning_rate": 7.018633540372671e-06, "loss": 3.4611, "step": 113 }, { "epoch": 0.10624417520969245, "grad_norm": 2.9255116805176526, "learning_rate": 7.080745341614908e-06, "loss": 2.8999, "step": 114 }, { "epoch": 0.10717614165890028, "grad_norm": 2.5502384403932514, "learning_rate": 7.1428571428571436e-06, "loss": 3.8627, "step": 115 }, { "epoch": 0.10810810810810811, "grad_norm": 2.784786073118951, "learning_rate": 7.2049689440993795e-06, "loss": 2.8205, "step": 116 }, { "epoch": 0.10904007455731593, "grad_norm": 2.4580597017800465, "learning_rate": 7.267080745341616e-06, "loss": 3.2295, "step": 117 }, { "epoch": 0.10997204100652376, "grad_norm": 2.5498346257202864, "learning_rate": 7.329192546583852e-06, "loss": 3.0257, "step": 118 }, { "epoch": 0.11090400745573159, "grad_norm": 3.5436730807864305, "learning_rate": 7.391304347826087e-06, "loss": 3.12, "step": 119 }, { "epoch": 0.11183597390493942, "grad_norm": 2.144459132986669, "learning_rate": 7.453416149068323e-06, "loss": 2.7843, "step": 120 }, { "epoch": 0.11276794035414725, "grad_norm": 2.5456582969226624, "learning_rate": 7.515527950310559e-06, "loss": 3.2, "step": 121 }, { "epoch": 0.11369990680335508, "grad_norm": 4.372334184510492, "learning_rate": 7.577639751552796e-06, "loss": 2.7116, "step": 122 }, { "epoch": 0.11463187325256291, "grad_norm": 2.456758115400216, "learning_rate": 7.639751552795032e-06, "loss": 3.0824, "step": 123 }, { "epoch": 0.11556383970177074, "grad_norm": 2.962679705275665, "learning_rate": 7.701863354037268e-06, "loss": 2.8188, "step": 124 }, { "epoch": 0.11649580615097857, "grad_norm": 3.1304763829814624, "learning_rate": 7.763975155279503e-06, "loss": 3.1382, "step": 125 }, { "epoch": 0.11742777260018639, "grad_norm": 3.0565929552077686, "learning_rate": 7.82608695652174e-06, "loss": 3.2855, "step": 126 }, { "epoch": 0.11835973904939422, "grad_norm": 2.8798626007782917, "learning_rate": 7.888198757763977e-06, "loss": 3.0828, "step": 127 }, { "epoch": 0.11929170549860205, "grad_norm": 2.7352322296405744, "learning_rate": 7.950310559006212e-06, "loss": 2.969, "step": 128 }, { "epoch": 0.12022367194780988, "grad_norm": 2.840604281293738, "learning_rate": 8.012422360248447e-06, "loss": 3.1817, "step": 129 }, { "epoch": 0.1211556383970177, "grad_norm": 3.275025326138456, "learning_rate": 8.074534161490684e-06, "loss": 3.3612, "step": 130 }, { "epoch": 0.12208760484622554, "grad_norm": 2.842496619328633, "learning_rate": 8.13664596273292e-06, "loss": 3.0448, "step": 131 }, { "epoch": 0.12301957129543337, "grad_norm": 2.472714389624454, "learning_rate": 8.198757763975156e-06, "loss": 3.108, "step": 132 }, { "epoch": 0.1239515377446412, "grad_norm": 2.176532176986024, "learning_rate": 8.260869565217392e-06, "loss": 2.7294, "step": 133 }, { "epoch": 0.12488350419384903, "grad_norm": 3.2436985259321816, "learning_rate": 8.322981366459629e-06, "loss": 3.298, "step": 134 }, { "epoch": 0.12581547064305684, "grad_norm": 3.7345715885088238, "learning_rate": 8.385093167701864e-06, "loss": 3.1474, "step": 135 }, { "epoch": 0.1267474370922647, "grad_norm": 2.2295877102359847, "learning_rate": 8.4472049689441e-06, "loss": 2.6257, "step": 136 }, { "epoch": 0.1276794035414725, "grad_norm": 2.7381458401000596, "learning_rate": 8.509316770186336e-06, "loss": 2.7618, "step": 137 }, { "epoch": 0.12861136999068035, "grad_norm": 2.789708127207647, "learning_rate": 8.571428571428571e-06, "loss": 3.2215, "step": 138 }, { "epoch": 0.12954333643988816, "grad_norm": 2.6050956959891356, "learning_rate": 8.633540372670808e-06, "loss": 2.9271, "step": 139 }, { "epoch": 0.13047530288909598, "grad_norm": 2.4766654507695995, "learning_rate": 8.695652173913044e-06, "loss": 2.9141, "step": 140 }, { "epoch": 0.13140726933830382, "grad_norm": 2.1534850304171482, "learning_rate": 8.75776397515528e-06, "loss": 3.1762, "step": 141 }, { "epoch": 0.13233923578751164, "grad_norm": 2.6602695442736564, "learning_rate": 8.819875776397516e-06, "loss": 2.9563, "step": 142 }, { "epoch": 0.13327120223671948, "grad_norm": 2.9643291415924558, "learning_rate": 8.881987577639753e-06, "loss": 2.8369, "step": 143 }, { "epoch": 0.1342031686859273, "grad_norm": 3.075433016512971, "learning_rate": 8.944099378881988e-06, "loss": 2.3568, "step": 144 }, { "epoch": 0.13513513513513514, "grad_norm": 2.325085457241487, "learning_rate": 9.006211180124225e-06, "loss": 2.8025, "step": 145 }, { "epoch": 0.13606710158434296, "grad_norm": 1.975718274767468, "learning_rate": 9.068322981366461e-06, "loss": 3.1444, "step": 146 }, { "epoch": 0.1369990680335508, "grad_norm": 2.2465715436260845, "learning_rate": 9.130434782608697e-06, "loss": 3.0144, "step": 147 }, { "epoch": 0.13793103448275862, "grad_norm": 2.647063386769221, "learning_rate": 9.192546583850932e-06, "loss": 3.3568, "step": 148 }, { "epoch": 0.13886300093196646, "grad_norm": 2.488758613418054, "learning_rate": 9.254658385093168e-06, "loss": 3.2256, "step": 149 }, { "epoch": 0.13979496738117428, "grad_norm": 1.924210994195141, "learning_rate": 9.316770186335405e-06, "loss": 2.4778, "step": 150 }, { "epoch": 0.1407269338303821, "grad_norm": 2.9184564430343833, "learning_rate": 9.37888198757764e-06, "loss": 2.7937, "step": 151 }, { "epoch": 0.14165890027958994, "grad_norm": 2.2174169337600684, "learning_rate": 9.440993788819877e-06, "loss": 2.8713, "step": 152 }, { "epoch": 0.14259086672879775, "grad_norm": 2.8217456091609923, "learning_rate": 9.503105590062112e-06, "loss": 2.7998, "step": 153 }, { "epoch": 0.1435228331780056, "grad_norm": 3.048993881265487, "learning_rate": 9.565217391304349e-06, "loss": 2.9668, "step": 154 }, { "epoch": 0.14445479962721341, "grad_norm": 4.726850762987341, "learning_rate": 9.627329192546585e-06, "loss": 3.0322, "step": 155 }, { "epoch": 0.14538676607642126, "grad_norm": 2.691323357062255, "learning_rate": 9.68944099378882e-06, "loss": 2.9855, "step": 156 }, { "epoch": 0.14631873252562907, "grad_norm": 2.6747857316252484, "learning_rate": 9.751552795031056e-06, "loss": 2.7462, "step": 157 }, { "epoch": 0.14725069897483692, "grad_norm": 2.2384566742847234, "learning_rate": 9.813664596273292e-06, "loss": 2.6674, "step": 158 }, { "epoch": 0.14818266542404473, "grad_norm": 2.7553891367967487, "learning_rate": 9.875776397515529e-06, "loss": 2.9334, "step": 159 }, { "epoch": 0.14911463187325255, "grad_norm": 2.7690830176770116, "learning_rate": 9.937888198757764e-06, "loss": 3.2512, "step": 160 }, { "epoch": 0.1500465983224604, "grad_norm": 2.514045607930386, "learning_rate": 1e-05, "loss": 2.8958, "step": 161 }, { "epoch": 0.1509785647716682, "grad_norm": 2.3528238872571596, "learning_rate": 1.0062111801242236e-05, "loss": 2.9046, "step": 162 }, { "epoch": 0.15191053122087605, "grad_norm": 4.07814550970154, "learning_rate": 1.0124223602484473e-05, "loss": 3.1518, "step": 163 }, { "epoch": 0.15284249767008387, "grad_norm": 2.5962637888573843, "learning_rate": 1.0186335403726708e-05, "loss": 3.1119, "step": 164 }, { "epoch": 0.15377446411929171, "grad_norm": 2.3437030417928937, "learning_rate": 1.0248447204968946e-05, "loss": 2.6833, "step": 165 }, { "epoch": 0.15470643056849953, "grad_norm": 3.374451934102497, "learning_rate": 1.0310559006211181e-05, "loss": 2.9318, "step": 166 }, { "epoch": 0.15563839701770738, "grad_norm": 2.369050733479348, "learning_rate": 1.0372670807453418e-05, "loss": 2.9616, "step": 167 }, { "epoch": 0.1565703634669152, "grad_norm": 2.714350077061708, "learning_rate": 1.0434782608695653e-05, "loss": 2.8688, "step": 168 }, { "epoch": 0.157502329916123, "grad_norm": 2.556231381885857, "learning_rate": 1.049689440993789e-05, "loss": 2.9571, "step": 169 }, { "epoch": 0.15843429636533085, "grad_norm": 1.8260691511041147, "learning_rate": 1.0559006211180125e-05, "loss": 2.2627, "step": 170 }, { "epoch": 0.15936626281453867, "grad_norm": 2.3169773640969993, "learning_rate": 1.062111801242236e-05, "loss": 2.6989, "step": 171 }, { "epoch": 0.1602982292637465, "grad_norm": 2.4680133406795433, "learning_rate": 1.0683229813664597e-05, "loss": 3.1397, "step": 172 }, { "epoch": 0.16123019571295433, "grad_norm": 2.3744515384307654, "learning_rate": 1.0745341614906832e-05, "loss": 2.731, "step": 173 }, { "epoch": 0.16216216216216217, "grad_norm": 2.207505783989955, "learning_rate": 1.080745341614907e-05, "loss": 2.4359, "step": 174 }, { "epoch": 0.16309412861137, "grad_norm": 2.1075846889754497, "learning_rate": 1.0869565217391305e-05, "loss": 2.7746, "step": 175 }, { "epoch": 0.16402609506057783, "grad_norm": 2.8620638292819973, "learning_rate": 1.0931677018633542e-05, "loss": 3.0278, "step": 176 }, { "epoch": 0.16495806150978565, "grad_norm": 2.139900914026707, "learning_rate": 1.0993788819875777e-05, "loss": 3.1547, "step": 177 }, { "epoch": 0.16589002795899346, "grad_norm": 2.6505019076879117, "learning_rate": 1.1055900621118014e-05, "loss": 2.7343, "step": 178 }, { "epoch": 0.1668219944082013, "grad_norm": 1.9594182052476046, "learning_rate": 1.1118012422360249e-05, "loss": 2.8312, "step": 179 }, { "epoch": 0.16775396085740912, "grad_norm": 1.9896258745099056, "learning_rate": 1.1180124223602484e-05, "loss": 2.5, "step": 180 }, { "epoch": 0.16868592730661697, "grad_norm": 2.4483563850357526, "learning_rate": 1.1242236024844722e-05, "loss": 3.0155, "step": 181 }, { "epoch": 0.16961789375582478, "grad_norm": 2.4713906419338514, "learning_rate": 1.1304347826086957e-05, "loss": 2.9148, "step": 182 }, { "epoch": 0.17054986020503263, "grad_norm": 2.670625980305103, "learning_rate": 1.1366459627329194e-05, "loss": 2.9307, "step": 183 }, { "epoch": 0.17148182665424044, "grad_norm": 2.881303318548415, "learning_rate": 1.1428571428571429e-05, "loss": 2.7726, "step": 184 }, { "epoch": 0.1724137931034483, "grad_norm": 2.4615371145650324, "learning_rate": 1.1490683229813666e-05, "loss": 3.1076, "step": 185 }, { "epoch": 0.1733457595526561, "grad_norm": 2.7808558692590037, "learning_rate": 1.15527950310559e-05, "loss": 2.2638, "step": 186 }, { "epoch": 0.17427772600186392, "grad_norm": 2.310553791892974, "learning_rate": 1.161490683229814e-05, "loss": 2.843, "step": 187 }, { "epoch": 0.17520969245107176, "grad_norm": 2.0215686241935256, "learning_rate": 1.1677018633540373e-05, "loss": 2.8201, "step": 188 }, { "epoch": 0.17614165890027958, "grad_norm": 1.9417582216664866, "learning_rate": 1.1739130434782611e-05, "loss": 2.5957, "step": 189 }, { "epoch": 0.17707362534948742, "grad_norm": 1.8414981923790334, "learning_rate": 1.1801242236024846e-05, "loss": 2.5747, "step": 190 }, { "epoch": 0.17800559179869524, "grad_norm": 2.025454764001188, "learning_rate": 1.1863354037267081e-05, "loss": 2.9936, "step": 191 }, { "epoch": 0.17893755824790308, "grad_norm": 1.8829779646143852, "learning_rate": 1.1925465838509318e-05, "loss": 2.549, "step": 192 }, { "epoch": 0.1798695246971109, "grad_norm": 2.3805388699240986, "learning_rate": 1.1987577639751553e-05, "loss": 2.6599, "step": 193 }, { "epoch": 0.18080149114631874, "grad_norm": 2.4107809847033987, "learning_rate": 1.204968944099379e-05, "loss": 2.6903, "step": 194 }, { "epoch": 0.18173345759552656, "grad_norm": 2.4678242181709718, "learning_rate": 1.2111801242236025e-05, "loss": 3.0559, "step": 195 }, { "epoch": 0.18266542404473438, "grad_norm": 2.1096035584427106, "learning_rate": 1.2173913043478263e-05, "loss": 2.5939, "step": 196 }, { "epoch": 0.18359739049394222, "grad_norm": 2.1804994988106796, "learning_rate": 1.2236024844720498e-05, "loss": 2.5995, "step": 197 }, { "epoch": 0.18452935694315004, "grad_norm": 2.220490080977824, "learning_rate": 1.2298136645962735e-05, "loss": 2.9782, "step": 198 }, { "epoch": 0.18546132339235788, "grad_norm": 2.4933867368215767, "learning_rate": 1.236024844720497e-05, "loss": 2.8873, "step": 199 }, { "epoch": 0.1863932898415657, "grad_norm": 2.6728234469533594, "learning_rate": 1.2422360248447205e-05, "loss": 3.1652, "step": 200 }, { "epoch": 0.18732525629077354, "grad_norm": 2.2005476590184045, "learning_rate": 1.2484472049689442e-05, "loss": 2.8328, "step": 201 }, { "epoch": 0.18825722273998136, "grad_norm": 1.911061846835456, "learning_rate": 1.2546583850931677e-05, "loss": 2.6706, "step": 202 }, { "epoch": 0.1891891891891892, "grad_norm": 2.6030177131585663, "learning_rate": 1.2608695652173915e-05, "loss": 2.8582, "step": 203 }, { "epoch": 0.19012115563839702, "grad_norm": 2.7414551574803467, "learning_rate": 1.267080745341615e-05, "loss": 3.0519, "step": 204 }, { "epoch": 0.19105312208760486, "grad_norm": 2.4944361400893187, "learning_rate": 1.2732919254658387e-05, "loss": 2.7141, "step": 205 }, { "epoch": 0.19198508853681268, "grad_norm": 2.23991416067465, "learning_rate": 1.2795031055900622e-05, "loss": 2.9674, "step": 206 }, { "epoch": 0.1929170549860205, "grad_norm": 2.6386540125229363, "learning_rate": 1.2857142857142859e-05, "loss": 2.845, "step": 207 }, { "epoch": 0.19384902143522834, "grad_norm": 2.1962215804775203, "learning_rate": 1.2919254658385094e-05, "loss": 2.9755, "step": 208 }, { "epoch": 0.19478098788443615, "grad_norm": 2.063287869310383, "learning_rate": 1.2981366459627329e-05, "loss": 2.0295, "step": 209 }, { "epoch": 0.195712954333644, "grad_norm": 2.5116775209143523, "learning_rate": 1.3043478260869566e-05, "loss": 2.7716, "step": 210 }, { "epoch": 0.1966449207828518, "grad_norm": 1.7242559049530273, "learning_rate": 1.31055900621118e-05, "loss": 2.3065, "step": 211 }, { "epoch": 0.19757688723205966, "grad_norm": 2.187054344570464, "learning_rate": 1.316770186335404e-05, "loss": 2.8329, "step": 212 }, { "epoch": 0.19850885368126747, "grad_norm": 2.5206731334574197, "learning_rate": 1.3229813664596274e-05, "loss": 2.6833, "step": 213 }, { "epoch": 0.19944082013047532, "grad_norm": 3.083517684530792, "learning_rate": 1.3291925465838511e-05, "loss": 3.206, "step": 214 }, { "epoch": 0.20037278657968313, "grad_norm": 2.181002528148029, "learning_rate": 1.3354037267080746e-05, "loss": 2.4623, "step": 215 }, { "epoch": 0.20130475302889095, "grad_norm": 2.3097320913528114, "learning_rate": 1.3416149068322983e-05, "loss": 2.4913, "step": 216 }, { "epoch": 0.2022367194780988, "grad_norm": 2.5770694670784313, "learning_rate": 1.3478260869565218e-05, "loss": 2.7151, "step": 217 }, { "epoch": 0.2031686859273066, "grad_norm": 1.6786126701567152, "learning_rate": 1.3540372670807453e-05, "loss": 2.6813, "step": 218 }, { "epoch": 0.20410065237651445, "grad_norm": 2.1405595262738064, "learning_rate": 1.3602484472049691e-05, "loss": 2.8788, "step": 219 }, { "epoch": 0.20503261882572227, "grad_norm": 2.8677270981581433, "learning_rate": 1.3664596273291926e-05, "loss": 2.8041, "step": 220 }, { "epoch": 0.2059645852749301, "grad_norm": 2.0886350370015894, "learning_rate": 1.3726708074534163e-05, "loss": 2.8279, "step": 221 }, { "epoch": 0.20689655172413793, "grad_norm": 2.163218170543605, "learning_rate": 1.3788819875776398e-05, "loss": 2.7336, "step": 222 }, { "epoch": 0.20782851817334577, "grad_norm": 2.4243438351712623, "learning_rate": 1.3850931677018635e-05, "loss": 2.8475, "step": 223 }, { "epoch": 0.2087604846225536, "grad_norm": 2.462324251892657, "learning_rate": 1.391304347826087e-05, "loss": 2.8979, "step": 224 }, { "epoch": 0.2096924510717614, "grad_norm": 3.603934616619175, "learning_rate": 1.3975155279503107e-05, "loss": 2.9629, "step": 225 }, { "epoch": 0.21062441752096925, "grad_norm": 2.924873313924268, "learning_rate": 1.4037267080745342e-05, "loss": 2.7999, "step": 226 }, { "epoch": 0.21155638397017706, "grad_norm": 2.589976208691665, "learning_rate": 1.409937888198758e-05, "loss": 2.7906, "step": 227 }, { "epoch": 0.2124883504193849, "grad_norm": 2.2606876793926607, "learning_rate": 1.4161490683229815e-05, "loss": 2.899, "step": 228 }, { "epoch": 0.21342031686859272, "grad_norm": 2.457103318253213, "learning_rate": 1.422360248447205e-05, "loss": 3.0457, "step": 229 }, { "epoch": 0.21435228331780057, "grad_norm": 1.9399668699118051, "learning_rate": 1.4285714285714287e-05, "loss": 2.8602, "step": 230 }, { "epoch": 0.21528424976700838, "grad_norm": 2.0882827203616956, "learning_rate": 1.4347826086956522e-05, "loss": 2.3073, "step": 231 }, { "epoch": 0.21621621621621623, "grad_norm": 2.7620070547576776, "learning_rate": 1.4409937888198759e-05, "loss": 2.6395, "step": 232 }, { "epoch": 0.21714818266542404, "grad_norm": 2.1185072155358795, "learning_rate": 1.4472049689440994e-05, "loss": 3.1218, "step": 233 }, { "epoch": 0.21808014911463186, "grad_norm": 2.23555021537148, "learning_rate": 1.4534161490683232e-05, "loss": 2.5403, "step": 234 }, { "epoch": 0.2190121155638397, "grad_norm": 2.1354584096060223, "learning_rate": 1.4596273291925467e-05, "loss": 2.3692, "step": 235 }, { "epoch": 0.21994408201304752, "grad_norm": 1.7723495085361716, "learning_rate": 1.4658385093167704e-05, "loss": 2.2742, "step": 236 }, { "epoch": 0.22087604846225536, "grad_norm": 2.3331028752027168, "learning_rate": 1.472049689440994e-05, "loss": 2.4023, "step": 237 }, { "epoch": 0.22180801491146318, "grad_norm": 2.089093367997692, "learning_rate": 1.4782608695652174e-05, "loss": 2.4776, "step": 238 }, { "epoch": 0.22273998136067102, "grad_norm": 2.3106542390331133, "learning_rate": 1.4844720496894411e-05, "loss": 2.5375, "step": 239 }, { "epoch": 0.22367194780987884, "grad_norm": 2.448248542228409, "learning_rate": 1.4906832298136646e-05, "loss": 2.8846, "step": 240 }, { "epoch": 0.22460391425908668, "grad_norm": 2.459418862447774, "learning_rate": 1.4968944099378885e-05, "loss": 2.9368, "step": 241 }, { "epoch": 0.2255358807082945, "grad_norm": 2.0459119366554197, "learning_rate": 1.5031055900621118e-05, "loss": 2.8156, "step": 242 }, { "epoch": 0.22646784715750232, "grad_norm": 2.430738197974064, "learning_rate": 1.5093167701863356e-05, "loss": 2.6671, "step": 243 }, { "epoch": 0.22739981360671016, "grad_norm": 1.9962854708301216, "learning_rate": 1.5155279503105591e-05, "loss": 2.7837, "step": 244 }, { "epoch": 0.22833178005591798, "grad_norm": 2.5770063157656544, "learning_rate": 1.5217391304347828e-05, "loss": 2.8552, "step": 245 }, { "epoch": 0.22926374650512582, "grad_norm": 1.783263571105954, "learning_rate": 1.5279503105590063e-05, "loss": 2.9332, "step": 246 }, { "epoch": 0.23019571295433364, "grad_norm": 3.6003832315685034, "learning_rate": 1.5341614906832298e-05, "loss": 3.0509, "step": 247 }, { "epoch": 0.23112767940354148, "grad_norm": 2.1810465968963593, "learning_rate": 1.5403726708074537e-05, "loss": 2.6558, "step": 248 }, { "epoch": 0.2320596458527493, "grad_norm": 1.7340897108110036, "learning_rate": 1.5465838509316772e-05, "loss": 2.6122, "step": 249 }, { "epoch": 0.23299161230195714, "grad_norm": 2.0241691415761176, "learning_rate": 1.5527950310559007e-05, "loss": 2.5061, "step": 250 }, { "epoch": 0.23392357875116496, "grad_norm": 1.911462919356104, "learning_rate": 1.5590062111801242e-05, "loss": 2.8305, "step": 251 }, { "epoch": 0.23485554520037277, "grad_norm": 1.954314885008235, "learning_rate": 1.565217391304348e-05, "loss": 2.7312, "step": 252 }, { "epoch": 0.23578751164958062, "grad_norm": 2.429577220183252, "learning_rate": 1.5714285714285715e-05, "loss": 2.7075, "step": 253 }, { "epoch": 0.23671947809878843, "grad_norm": 2.3765528757981174, "learning_rate": 1.5776397515527954e-05, "loss": 2.8122, "step": 254 }, { "epoch": 0.23765144454799628, "grad_norm": 1.764688505388834, "learning_rate": 1.583850931677019e-05, "loss": 2.4317, "step": 255 }, { "epoch": 0.2385834109972041, "grad_norm": 2.1528469702378614, "learning_rate": 1.5900621118012424e-05, "loss": 2.225, "step": 256 }, { "epoch": 0.23951537744641194, "grad_norm": 2.5715022386529784, "learning_rate": 1.596273291925466e-05, "loss": 2.9411, "step": 257 }, { "epoch": 0.24044734389561975, "grad_norm": 1.9454129454187454, "learning_rate": 1.6024844720496894e-05, "loss": 2.7026, "step": 258 }, { "epoch": 0.2413793103448276, "grad_norm": 1.7047422097312759, "learning_rate": 1.6086956521739132e-05, "loss": 2.6009, "step": 259 }, { "epoch": 0.2423112767940354, "grad_norm": 1.9591791304885042, "learning_rate": 1.6149068322981367e-05, "loss": 2.6837, "step": 260 }, { "epoch": 0.24324324324324326, "grad_norm": 2.4136625412158934, "learning_rate": 1.6211180124223606e-05, "loss": 2.603, "step": 261 }, { "epoch": 0.24417520969245107, "grad_norm": 2.0800043868510754, "learning_rate": 1.627329192546584e-05, "loss": 2.4596, "step": 262 }, { "epoch": 0.2451071761416589, "grad_norm": 2.03299914101308, "learning_rate": 1.6335403726708076e-05, "loss": 2.6209, "step": 263 }, { "epoch": 0.24603914259086673, "grad_norm": 2.2481998325284636, "learning_rate": 1.639751552795031e-05, "loss": 2.6562, "step": 264 }, { "epoch": 0.24697110904007455, "grad_norm": 2.562254915847631, "learning_rate": 1.645962732919255e-05, "loss": 2.6237, "step": 265 }, { "epoch": 0.2479030754892824, "grad_norm": 1.7834247486768176, "learning_rate": 1.6521739130434785e-05, "loss": 2.5606, "step": 266 }, { "epoch": 0.2488350419384902, "grad_norm": 1.7480881861082782, "learning_rate": 1.658385093167702e-05, "loss": 2.9331, "step": 267 }, { "epoch": 0.24976700838769805, "grad_norm": 1.448776856346547, "learning_rate": 1.6645962732919258e-05, "loss": 2.0732, "step": 268 }, { "epoch": 0.2506989748369059, "grad_norm": 2.2047484189844186, "learning_rate": 1.670807453416149e-05, "loss": 2.7921, "step": 269 }, { "epoch": 0.2516309412861137, "grad_norm": 3.1270739872891693, "learning_rate": 1.6770186335403728e-05, "loss": 2.7127, "step": 270 }, { "epoch": 0.25256290773532153, "grad_norm": 2.078710541516544, "learning_rate": 1.6832298136645963e-05, "loss": 2.4783, "step": 271 }, { "epoch": 0.2534948741845294, "grad_norm": 2.215532381046001, "learning_rate": 1.68944099378882e-05, "loss": 2.7399, "step": 272 }, { "epoch": 0.25442684063373716, "grad_norm": 1.9963608601179987, "learning_rate": 1.6956521739130437e-05, "loss": 2.2149, "step": 273 }, { "epoch": 0.255358807082945, "grad_norm": 1.9355818301985732, "learning_rate": 1.7018633540372672e-05, "loss": 2.7631, "step": 274 }, { "epoch": 0.25629077353215285, "grad_norm": 1.8544911955129086, "learning_rate": 1.7080745341614907e-05, "loss": 2.9406, "step": 275 }, { "epoch": 0.2572227399813607, "grad_norm": 2.24530635601379, "learning_rate": 1.7142857142857142e-05, "loss": 2.7444, "step": 276 }, { "epoch": 0.2581547064305685, "grad_norm": 2.909795502777139, "learning_rate": 1.720496894409938e-05, "loss": 2.3416, "step": 277 }, { "epoch": 0.2590866728797763, "grad_norm": 1.9658895123069704, "learning_rate": 1.7267080745341615e-05, "loss": 2.657, "step": 278 }, { "epoch": 0.26001863932898417, "grad_norm": 2.1373157364942896, "learning_rate": 1.7329192546583854e-05, "loss": 2.2819, "step": 279 }, { "epoch": 0.26095060577819196, "grad_norm": 2.14445037687048, "learning_rate": 1.739130434782609e-05, "loss": 2.9149, "step": 280 }, { "epoch": 0.2618825722273998, "grad_norm": 2.05856450820221, "learning_rate": 1.7453416149068324e-05, "loss": 2.7498, "step": 281 }, { "epoch": 0.26281453867660765, "grad_norm": 1.7737683800278772, "learning_rate": 1.751552795031056e-05, "loss": 2.0814, "step": 282 }, { "epoch": 0.2637465051258155, "grad_norm": 2.1072893946249858, "learning_rate": 1.7577639751552797e-05, "loss": 2.4227, "step": 283 }, { "epoch": 0.2646784715750233, "grad_norm": 1.912729132463378, "learning_rate": 1.7639751552795032e-05, "loss": 2.2032, "step": 284 }, { "epoch": 0.2656104380242311, "grad_norm": 1.856668381170623, "learning_rate": 1.7701863354037267e-05, "loss": 2.8351, "step": 285 }, { "epoch": 0.26654240447343897, "grad_norm": 2.091870338612638, "learning_rate": 1.7763975155279506e-05, "loss": 2.7138, "step": 286 }, { "epoch": 0.2674743709226468, "grad_norm": 2.177090596789352, "learning_rate": 1.782608695652174e-05, "loss": 2.9056, "step": 287 }, { "epoch": 0.2684063373718546, "grad_norm": 2.0714931145439763, "learning_rate": 1.7888198757763976e-05, "loss": 2.7817, "step": 288 }, { "epoch": 0.26933830382106244, "grad_norm": 2.0945519213496504, "learning_rate": 1.795031055900621e-05, "loss": 3.0715, "step": 289 }, { "epoch": 0.2702702702702703, "grad_norm": 2.0177668024829845, "learning_rate": 1.801242236024845e-05, "loss": 2.6148, "step": 290 }, { "epoch": 0.2712022367194781, "grad_norm": 2.531662258167982, "learning_rate": 1.8074534161490685e-05, "loss": 2.0371, "step": 291 }, { "epoch": 0.2721342031686859, "grad_norm": 2.0606940124871396, "learning_rate": 1.8136645962732923e-05, "loss": 2.4224, "step": 292 }, { "epoch": 0.27306616961789376, "grad_norm": 1.753455272972178, "learning_rate": 1.8198757763975158e-05, "loss": 2.879, "step": 293 }, { "epoch": 0.2739981360671016, "grad_norm": 1.8094621654607967, "learning_rate": 1.8260869565217393e-05, "loss": 2.7234, "step": 294 }, { "epoch": 0.2749301025163094, "grad_norm": 2.0934436870243625, "learning_rate": 1.8322981366459628e-05, "loss": 2.8338, "step": 295 }, { "epoch": 0.27586206896551724, "grad_norm": 1.8481280016038062, "learning_rate": 1.8385093167701863e-05, "loss": 2.4132, "step": 296 }, { "epoch": 0.2767940354147251, "grad_norm": 1.7301668704821551, "learning_rate": 1.84472049689441e-05, "loss": 2.32, "step": 297 }, { "epoch": 0.2777260018639329, "grad_norm": 1.9957310369020944, "learning_rate": 1.8509316770186337e-05, "loss": 2.7684, "step": 298 }, { "epoch": 0.2786579683131407, "grad_norm": 2.1188701145611546, "learning_rate": 1.8571428571428575e-05, "loss": 2.6056, "step": 299 }, { "epoch": 0.27958993476234856, "grad_norm": 2.153062562879588, "learning_rate": 1.863354037267081e-05, "loss": 2.8858, "step": 300 }, { "epoch": 0.2805219012115564, "grad_norm": 1.7259700859623632, "learning_rate": 1.8695652173913045e-05, "loss": 1.9107, "step": 301 }, { "epoch": 0.2814538676607642, "grad_norm": 1.7040307861953652, "learning_rate": 1.875776397515528e-05, "loss": 2.1877, "step": 302 }, { "epoch": 0.28238583410997203, "grad_norm": 3.4280525306393574, "learning_rate": 1.881987577639752e-05, "loss": 2.7424, "step": 303 }, { "epoch": 0.2833178005591799, "grad_norm": 2.3585609262782654, "learning_rate": 1.8881987577639754e-05, "loss": 2.668, "step": 304 }, { "epoch": 0.2842497670083877, "grad_norm": 2.167642211797387, "learning_rate": 1.894409937888199e-05, "loss": 2.5894, "step": 305 }, { "epoch": 0.2851817334575955, "grad_norm": 2.5357864784848774, "learning_rate": 1.9006211180124224e-05, "loss": 2.8511, "step": 306 }, { "epoch": 0.28611369990680335, "grad_norm": 2.3988151861517393, "learning_rate": 1.906832298136646e-05, "loss": 2.6067, "step": 307 }, { "epoch": 0.2870456663560112, "grad_norm": 2.024745529390781, "learning_rate": 1.9130434782608697e-05, "loss": 2.744, "step": 308 }, { "epoch": 0.287977632805219, "grad_norm": 1.9851347702850581, "learning_rate": 1.9192546583850932e-05, "loss": 2.5781, "step": 309 }, { "epoch": 0.28890959925442683, "grad_norm": 1.6198220340848848, "learning_rate": 1.925465838509317e-05, "loss": 2.1133, "step": 310 }, { "epoch": 0.2898415657036347, "grad_norm": 2.978638965600936, "learning_rate": 1.9316770186335406e-05, "loss": 2.8758, "step": 311 }, { "epoch": 0.2907735321528425, "grad_norm": 2.0815767434513948, "learning_rate": 1.937888198757764e-05, "loss": 2.5483, "step": 312 }, { "epoch": 0.2917054986020503, "grad_norm": 1.5026167059716742, "learning_rate": 1.9440993788819876e-05, "loss": 2.1363, "step": 313 }, { "epoch": 0.29263746505125815, "grad_norm": 2.4045332305181897, "learning_rate": 1.950310559006211e-05, "loss": 2.7131, "step": 314 }, { "epoch": 0.293569431500466, "grad_norm": 2.1262767291304683, "learning_rate": 1.956521739130435e-05, "loss": 2.7894, "step": 315 }, { "epoch": 0.29450139794967384, "grad_norm": 1.7183922459574477, "learning_rate": 1.9627329192546585e-05, "loss": 2.5574, "step": 316 }, { "epoch": 0.2954333643988816, "grad_norm": 1.9502495777777566, "learning_rate": 1.9689440993788823e-05, "loss": 2.5677, "step": 317 }, { "epoch": 0.29636533084808947, "grad_norm": 1.7659911982684806, "learning_rate": 1.9751552795031058e-05, "loss": 2.5177, "step": 318 }, { "epoch": 0.2972972972972973, "grad_norm": 1.77227239060412, "learning_rate": 1.9813664596273293e-05, "loss": 2.6915, "step": 319 }, { "epoch": 0.2982292637465051, "grad_norm": 2.130385158549716, "learning_rate": 1.9875776397515528e-05, "loss": 2.6208, "step": 320 }, { "epoch": 0.29916123019571295, "grad_norm": 1.9076926220440196, "learning_rate": 1.9937888198757767e-05, "loss": 2.6331, "step": 321 }, { "epoch": 0.3000931966449208, "grad_norm": 1.5202361838746714, "learning_rate": 2e-05, "loss": 2.2782, "step": 322 }, { "epoch": 0.30102516309412863, "grad_norm": 2.3507214239956586, "learning_rate": 1.999309630652399e-05, "loss": 2.8742, "step": 323 }, { "epoch": 0.3019571295433364, "grad_norm": 1.8069630480792036, "learning_rate": 1.9986192613047983e-05, "loss": 1.8541, "step": 324 }, { "epoch": 0.30288909599254427, "grad_norm": 1.813614370316641, "learning_rate": 1.9979288919571972e-05, "loss": 2.7341, "step": 325 }, { "epoch": 0.3038210624417521, "grad_norm": 1.8603820143591556, "learning_rate": 1.997238522609596e-05, "loss": 2.3508, "step": 326 }, { "epoch": 0.3047530288909599, "grad_norm": 1.8870736274291264, "learning_rate": 1.9965481532619954e-05, "loss": 2.3656, "step": 327 }, { "epoch": 0.30568499534016774, "grad_norm": 1.76266275758064, "learning_rate": 1.9958577839143946e-05, "loss": 2.6198, "step": 328 }, { "epoch": 0.3066169617893756, "grad_norm": 2.4917457563241854, "learning_rate": 1.9951674145667935e-05, "loss": 2.6144, "step": 329 }, { "epoch": 0.30754892823858343, "grad_norm": 1.9808180224380896, "learning_rate": 1.9944770452191924e-05, "loss": 2.4945, "step": 330 }, { "epoch": 0.3084808946877912, "grad_norm": 1.767813360959999, "learning_rate": 1.9937866758715913e-05, "loss": 2.2171, "step": 331 }, { "epoch": 0.30941286113699906, "grad_norm": 1.5892900388640776, "learning_rate": 1.9930963065239906e-05, "loss": 1.9729, "step": 332 }, { "epoch": 0.3103448275862069, "grad_norm": 1.8779542266055826, "learning_rate": 1.9924059371763895e-05, "loss": 2.2829, "step": 333 }, { "epoch": 0.31127679403541475, "grad_norm": 2.2598605245595884, "learning_rate": 1.9917155678287884e-05, "loss": 2.61, "step": 334 }, { "epoch": 0.31220876048462254, "grad_norm": 1.6553840483952278, "learning_rate": 1.9910251984811876e-05, "loss": 2.212, "step": 335 }, { "epoch": 0.3131407269338304, "grad_norm": 1.8842600427669363, "learning_rate": 1.9903348291335865e-05, "loss": 2.6413, "step": 336 }, { "epoch": 0.3140726933830382, "grad_norm": 1.8860864256082979, "learning_rate": 1.9896444597859858e-05, "loss": 2.4014, "step": 337 }, { "epoch": 0.315004659832246, "grad_norm": 1.8587802821819415, "learning_rate": 1.9889540904383847e-05, "loss": 2.2576, "step": 338 }, { "epoch": 0.31593662628145386, "grad_norm": 2.5034296811940053, "learning_rate": 1.988263721090784e-05, "loss": 2.5159, "step": 339 }, { "epoch": 0.3168685927306617, "grad_norm": 1.8417227163635022, "learning_rate": 1.9875733517431828e-05, "loss": 2.761, "step": 340 }, { "epoch": 0.31780055917986955, "grad_norm": 1.9913536192715378, "learning_rate": 1.9868829823955817e-05, "loss": 2.6419, "step": 341 }, { "epoch": 0.31873252562907733, "grad_norm": 2.8100026967317127, "learning_rate": 1.9861926130479806e-05, "loss": 2.448, "step": 342 }, { "epoch": 0.3196644920782852, "grad_norm": 1.889317126727612, "learning_rate": 1.98550224370038e-05, "loss": 2.4602, "step": 343 }, { "epoch": 0.320596458527493, "grad_norm": 2.0428351463123486, "learning_rate": 1.9848118743527788e-05, "loss": 2.6181, "step": 344 }, { "epoch": 0.32152842497670087, "grad_norm": 2.3217755392220267, "learning_rate": 1.984121505005178e-05, "loss": 2.8047, "step": 345 }, { "epoch": 0.32246039142590865, "grad_norm": 1.7668632418657604, "learning_rate": 1.983431135657577e-05, "loss": 2.1762, "step": 346 }, { "epoch": 0.3233923578751165, "grad_norm": 1.9674328545598765, "learning_rate": 1.982740766309976e-05, "loss": 2.2608, "step": 347 }, { "epoch": 0.32432432432432434, "grad_norm": 1.5613606618725062, "learning_rate": 1.982050396962375e-05, "loss": 2.2267, "step": 348 }, { "epoch": 0.32525629077353213, "grad_norm": 2.2975064188860466, "learning_rate": 1.981360027614774e-05, "loss": 2.5045, "step": 349 }, { "epoch": 0.32618825722274, "grad_norm": 1.7533581810847154, "learning_rate": 1.9806696582671732e-05, "loss": 2.7367, "step": 350 }, { "epoch": 0.3271202236719478, "grad_norm": 1.8237089984157624, "learning_rate": 1.979979288919572e-05, "loss": 2.442, "step": 351 }, { "epoch": 0.32805219012115566, "grad_norm": 1.844841282869559, "learning_rate": 1.979288919571971e-05, "loss": 2.6069, "step": 352 }, { "epoch": 0.32898415657036345, "grad_norm": 2.2673754075426698, "learning_rate": 1.9785985502243702e-05, "loss": 2.919, "step": 353 }, { "epoch": 0.3299161230195713, "grad_norm": 2.092815024432731, "learning_rate": 1.977908180876769e-05, "loss": 2.5161, "step": 354 }, { "epoch": 0.33084808946877914, "grad_norm": 1.8571583124899742, "learning_rate": 1.9772178115291684e-05, "loss": 2.5125, "step": 355 }, { "epoch": 0.3317800559179869, "grad_norm": 2.2429867918148494, "learning_rate": 1.9765274421815673e-05, "loss": 2.8055, "step": 356 }, { "epoch": 0.33271202236719477, "grad_norm": 1.9063841749457864, "learning_rate": 1.9758370728339665e-05, "loss": 2.0323, "step": 357 }, { "epoch": 0.3336439888164026, "grad_norm": 2.131361379302395, "learning_rate": 1.9751467034863654e-05, "loss": 2.4797, "step": 358 }, { "epoch": 0.33457595526561046, "grad_norm": 1.8205180264268614, "learning_rate": 1.9744563341387643e-05, "loss": 2.5194, "step": 359 }, { "epoch": 0.33550792171481825, "grad_norm": 1.6450681510584355, "learning_rate": 1.9737659647911633e-05, "loss": 2.8328, "step": 360 }, { "epoch": 0.3364398881640261, "grad_norm": 2.0318670719935676, "learning_rate": 1.9730755954435625e-05, "loss": 2.6079, "step": 361 }, { "epoch": 0.33737185461323393, "grad_norm": 1.881295970058654, "learning_rate": 1.9723852260959614e-05, "loss": 1.9153, "step": 362 }, { "epoch": 0.3383038210624418, "grad_norm": 1.8901626243827332, "learning_rate": 1.9716948567483606e-05, "loss": 2.4755, "step": 363 }, { "epoch": 0.33923578751164957, "grad_norm": 1.9389949145832401, "learning_rate": 1.9710044874007595e-05, "loss": 2.8314, "step": 364 }, { "epoch": 0.3401677539608574, "grad_norm": 1.8187733791551264, "learning_rate": 1.9703141180531588e-05, "loss": 2.3422, "step": 365 }, { "epoch": 0.34109972041006525, "grad_norm": 2.164625661189727, "learning_rate": 1.9696237487055577e-05, "loss": 2.8184, "step": 366 }, { "epoch": 0.34203168685927304, "grad_norm": 1.7536040455520692, "learning_rate": 1.9689333793579566e-05, "loss": 2.7937, "step": 367 }, { "epoch": 0.3429636533084809, "grad_norm": 1.6587395914571668, "learning_rate": 1.968243010010356e-05, "loss": 2.0975, "step": 368 }, { "epoch": 0.34389561975768873, "grad_norm": 2.456607497248777, "learning_rate": 1.9675526406627547e-05, "loss": 2.4317, "step": 369 }, { "epoch": 0.3448275862068966, "grad_norm": 3.0081843835509052, "learning_rate": 1.9668622713151536e-05, "loss": 2.7433, "step": 370 }, { "epoch": 0.34575955265610436, "grad_norm": 2.265113455880847, "learning_rate": 1.9661719019675526e-05, "loss": 2.5841, "step": 371 }, { "epoch": 0.3466915191053122, "grad_norm": 1.9662200781239165, "learning_rate": 1.9654815326199518e-05, "loss": 2.5996, "step": 372 }, { "epoch": 0.34762348555452005, "grad_norm": 1.6875961122201415, "learning_rate": 1.9647911632723507e-05, "loss": 2.1905, "step": 373 }, { "epoch": 0.34855545200372784, "grad_norm": 2.1693354847564126, "learning_rate": 1.96410079392475e-05, "loss": 2.4449, "step": 374 }, { "epoch": 0.3494874184529357, "grad_norm": 1.8244109763540068, "learning_rate": 1.9634104245771492e-05, "loss": 2.6547, "step": 375 }, { "epoch": 0.3504193849021435, "grad_norm": 2.1735409754825, "learning_rate": 1.962720055229548e-05, "loss": 2.6178, "step": 376 }, { "epoch": 0.35135135135135137, "grad_norm": 1.556421456205418, "learning_rate": 1.962029685881947e-05, "loss": 2.0944, "step": 377 }, { "epoch": 0.35228331780055916, "grad_norm": 2.2124722466654623, "learning_rate": 1.961339316534346e-05, "loss": 2.5154, "step": 378 }, { "epoch": 0.353215284249767, "grad_norm": 1.922461778657558, "learning_rate": 1.960648947186745e-05, "loss": 2.3285, "step": 379 }, { "epoch": 0.35414725069897485, "grad_norm": 1.6254668809567454, "learning_rate": 1.959958577839144e-05, "loss": 2.6674, "step": 380 }, { "epoch": 0.3550792171481827, "grad_norm": 2.0648734466845857, "learning_rate": 1.959268208491543e-05, "loss": 2.3355, "step": 381 }, { "epoch": 0.3560111835973905, "grad_norm": 1.3590593599916105, "learning_rate": 1.9585778391439422e-05, "loss": 2.317, "step": 382 }, { "epoch": 0.3569431500465983, "grad_norm": 1.8413099521630905, "learning_rate": 1.957887469796341e-05, "loss": 2.675, "step": 383 }, { "epoch": 0.35787511649580617, "grad_norm": 1.3363152311049231, "learning_rate": 1.9571971004487403e-05, "loss": 2.0868, "step": 384 }, { "epoch": 0.35880708294501396, "grad_norm": 1.9307106924130417, "learning_rate": 1.9565067311011392e-05, "loss": 2.6631, "step": 385 }, { "epoch": 0.3597390493942218, "grad_norm": 1.869048166065908, "learning_rate": 1.9558163617535385e-05, "loss": 2.5508, "step": 386 }, { "epoch": 0.36067101584342964, "grad_norm": 1.728452434262955, "learning_rate": 1.9551259924059374e-05, "loss": 2.6853, "step": 387 }, { "epoch": 0.3616029822926375, "grad_norm": 1.9517670030633476, "learning_rate": 1.9544356230583363e-05, "loss": 2.7631, "step": 388 }, { "epoch": 0.3625349487418453, "grad_norm": 1.4166453705031357, "learning_rate": 1.9537452537107352e-05, "loss": 2.4687, "step": 389 }, { "epoch": 0.3634669151910531, "grad_norm": 1.5621856086031112, "learning_rate": 1.9530548843631344e-05, "loss": 2.0956, "step": 390 }, { "epoch": 0.36439888164026096, "grad_norm": 1.9678738851633801, "learning_rate": 1.9523645150155333e-05, "loss": 2.5155, "step": 391 }, { "epoch": 0.36533084808946875, "grad_norm": 5.083065465487919, "learning_rate": 1.9516741456679326e-05, "loss": 3.1459, "step": 392 }, { "epoch": 0.3662628145386766, "grad_norm": 2.179863340079931, "learning_rate": 1.9509837763203315e-05, "loss": 2.5312, "step": 393 }, { "epoch": 0.36719478098788444, "grad_norm": 1.9609116821594756, "learning_rate": 1.9502934069727307e-05, "loss": 2.4151, "step": 394 }, { "epoch": 0.3681267474370923, "grad_norm": 2.244283531654173, "learning_rate": 1.9496030376251296e-05, "loss": 2.2003, "step": 395 }, { "epoch": 0.36905871388630007, "grad_norm": 2.011616311160311, "learning_rate": 1.9489126682775285e-05, "loss": 2.8193, "step": 396 }, { "epoch": 0.3699906803355079, "grad_norm": 2.0151995827609066, "learning_rate": 1.9482222989299278e-05, "loss": 2.1413, "step": 397 }, { "epoch": 0.37092264678471576, "grad_norm": 2.1701979919371386, "learning_rate": 1.9475319295823267e-05, "loss": 2.4195, "step": 398 }, { "epoch": 0.3718546132339236, "grad_norm": 1.4976442763882378, "learning_rate": 1.9468415602347256e-05, "loss": 2.5746, "step": 399 }, { "epoch": 0.3727865796831314, "grad_norm": 2.552097267586992, "learning_rate": 1.9461511908871248e-05, "loss": 2.4135, "step": 400 }, { "epoch": 0.37371854613233924, "grad_norm": 2.2448800410387455, "learning_rate": 1.9454608215395237e-05, "loss": 2.6477, "step": 401 }, { "epoch": 0.3746505125815471, "grad_norm": 2.2639815026901, "learning_rate": 1.944770452191923e-05, "loss": 2.6398, "step": 402 }, { "epoch": 0.37558247903075487, "grad_norm": 1.8115587164903553, "learning_rate": 1.944080082844322e-05, "loss": 2.6396, "step": 403 }, { "epoch": 0.3765144454799627, "grad_norm": 1.4739160485198628, "learning_rate": 1.943389713496721e-05, "loss": 2.4133, "step": 404 }, { "epoch": 0.37744641192917056, "grad_norm": 2.0213422631735276, "learning_rate": 1.94269934414912e-05, "loss": 2.1864, "step": 405 }, { "epoch": 0.3783783783783784, "grad_norm": 1.908383836510235, "learning_rate": 1.942008974801519e-05, "loss": 2.3093, "step": 406 }, { "epoch": 0.3793103448275862, "grad_norm": 1.9558257598544295, "learning_rate": 1.941318605453918e-05, "loss": 2.3793, "step": 407 }, { "epoch": 0.38024231127679403, "grad_norm": 1.6930251959238014, "learning_rate": 1.940628236106317e-05, "loss": 2.1874, "step": 408 }, { "epoch": 0.3811742777260019, "grad_norm": 2.671466252206981, "learning_rate": 1.939937866758716e-05, "loss": 2.1689, "step": 409 }, { "epoch": 0.3821062441752097, "grad_norm": 2.30619418365082, "learning_rate": 1.9392474974111152e-05, "loss": 2.5622, "step": 410 }, { "epoch": 0.3830382106244175, "grad_norm": 1.9777600822723016, "learning_rate": 1.938557128063514e-05, "loss": 2.2856, "step": 411 }, { "epoch": 0.38397017707362535, "grad_norm": 2.158301353244853, "learning_rate": 1.9378667587159134e-05, "loss": 2.8821, "step": 412 }, { "epoch": 0.3849021435228332, "grad_norm": 2.7038969571410987, "learning_rate": 1.9371763893683123e-05, "loss": 2.7265, "step": 413 }, { "epoch": 0.385834109972041, "grad_norm": 1.5121510211285065, "learning_rate": 1.9364860200207112e-05, "loss": 2.0601, "step": 414 }, { "epoch": 0.38676607642124883, "grad_norm": 1.9787006742824198, "learning_rate": 1.9357956506731104e-05, "loss": 2.2618, "step": 415 }, { "epoch": 0.38769804287045667, "grad_norm": 1.7237826659758384, "learning_rate": 1.9351052813255093e-05, "loss": 2.2964, "step": 416 }, { "epoch": 0.3886300093196645, "grad_norm": 1.644512075846369, "learning_rate": 1.9344149119779082e-05, "loss": 2.3744, "step": 417 }, { "epoch": 0.3895619757688723, "grad_norm": 1.7037159614154869, "learning_rate": 1.933724542630307e-05, "loss": 2.4784, "step": 418 }, { "epoch": 0.39049394221808015, "grad_norm": 1.7888460264351322, "learning_rate": 1.9330341732827064e-05, "loss": 2.5624, "step": 419 }, { "epoch": 0.391425908667288, "grad_norm": 1.471603697884118, "learning_rate": 1.9323438039351056e-05, "loss": 2.616, "step": 420 }, { "epoch": 0.3923578751164958, "grad_norm": 1.6811613169936905, "learning_rate": 1.9316534345875045e-05, "loss": 2.4623, "step": 421 }, { "epoch": 0.3932898415657036, "grad_norm": 1.7375714208837856, "learning_rate": 1.9309630652399034e-05, "loss": 2.403, "step": 422 }, { "epoch": 0.39422180801491147, "grad_norm": 2.1962948004141563, "learning_rate": 1.9302726958923027e-05, "loss": 2.2623, "step": 423 }, { "epoch": 0.3951537744641193, "grad_norm": 2.2786858368810448, "learning_rate": 1.9295823265447016e-05, "loss": 2.747, "step": 424 }, { "epoch": 0.3960857409133271, "grad_norm": 1.5996447558276548, "learning_rate": 1.9288919571971005e-05, "loss": 2.0605, "step": 425 }, { "epoch": 0.39701770736253494, "grad_norm": 1.5424568131076664, "learning_rate": 1.9282015878494997e-05, "loss": 1.9166, "step": 426 }, { "epoch": 0.3979496738117428, "grad_norm": 1.7202028840297654, "learning_rate": 1.9275112185018986e-05, "loss": 2.3147, "step": 427 }, { "epoch": 0.39888164026095063, "grad_norm": 1.9674757333560762, "learning_rate": 1.9268208491542975e-05, "loss": 2.41, "step": 428 }, { "epoch": 0.3998136067101584, "grad_norm": 1.61712383097051, "learning_rate": 1.9261304798066968e-05, "loss": 2.519, "step": 429 }, { "epoch": 0.40074557315936626, "grad_norm": 1.623181708817791, "learning_rate": 1.925440110459096e-05, "loss": 2.3948, "step": 430 }, { "epoch": 0.4016775396085741, "grad_norm": 1.7474091897395596, "learning_rate": 1.924749741111495e-05, "loss": 2.7531, "step": 431 }, { "epoch": 0.4026095060577819, "grad_norm": 1.558273812501548, "learning_rate": 1.9240593717638938e-05, "loss": 2.0425, "step": 432 }, { "epoch": 0.40354147250698974, "grad_norm": 2.081286955885598, "learning_rate": 1.9233690024162927e-05, "loss": 2.8829, "step": 433 }, { "epoch": 0.4044734389561976, "grad_norm": 1.8124783823473887, "learning_rate": 1.922678633068692e-05, "loss": 2.2641, "step": 434 }, { "epoch": 0.40540540540540543, "grad_norm": 1.7291232709273883, "learning_rate": 1.921988263721091e-05, "loss": 2.2038, "step": 435 }, { "epoch": 0.4063373718546132, "grad_norm": 2.083909699377649, "learning_rate": 1.9212978943734898e-05, "loss": 2.7167, "step": 436 }, { "epoch": 0.40726933830382106, "grad_norm": 2.0126410699462585, "learning_rate": 1.920607525025889e-05, "loss": 2.5156, "step": 437 }, { "epoch": 0.4082013047530289, "grad_norm": 2.0820936285664065, "learning_rate": 1.919917155678288e-05, "loss": 2.4633, "step": 438 }, { "epoch": 0.4091332712022367, "grad_norm": 1.5427779920082096, "learning_rate": 1.919226786330687e-05, "loss": 2.2675, "step": 439 }, { "epoch": 0.41006523765144454, "grad_norm": 2.5304183693933737, "learning_rate": 1.918536416983086e-05, "loss": 2.4488, "step": 440 }, { "epoch": 0.4109972041006524, "grad_norm": 1.588171842061493, "learning_rate": 1.9178460476354853e-05, "loss": 2.4772, "step": 441 }, { "epoch": 0.4119291705498602, "grad_norm": 2.09829619237494, "learning_rate": 1.9171556782878842e-05, "loss": 2.4335, "step": 442 }, { "epoch": 0.412861136999068, "grad_norm": 1.8557085651448126, "learning_rate": 1.916465308940283e-05, "loss": 2.3189, "step": 443 }, { "epoch": 0.41379310344827586, "grad_norm": 1.9863884740801376, "learning_rate": 1.9157749395926824e-05, "loss": 2.5718, "step": 444 }, { "epoch": 0.4147250698974837, "grad_norm": 2.239627012807513, "learning_rate": 1.9150845702450813e-05, "loss": 2.5102, "step": 445 }, { "epoch": 0.41565703634669154, "grad_norm": 1.7365392957225934, "learning_rate": 1.91439420089748e-05, "loss": 2.6089, "step": 446 }, { "epoch": 0.41658900279589933, "grad_norm": 2.2757182973831105, "learning_rate": 1.9137038315498794e-05, "loss": 2.4324, "step": 447 }, { "epoch": 0.4175209692451072, "grad_norm": 1.4668568889595983, "learning_rate": 1.9130134622022783e-05, "loss": 2.1125, "step": 448 }, { "epoch": 0.418452935694315, "grad_norm": 1.7406523805415859, "learning_rate": 1.9123230928546776e-05, "loss": 2.7593, "step": 449 }, { "epoch": 0.4193849021435228, "grad_norm": 2.229158668931329, "learning_rate": 1.9116327235070765e-05, "loss": 2.5145, "step": 450 }, { "epoch": 0.42031686859273065, "grad_norm": 1.752898560291805, "learning_rate": 1.9109423541594754e-05, "loss": 2.4652, "step": 451 }, { "epoch": 0.4212488350419385, "grad_norm": 1.932474881076844, "learning_rate": 1.9102519848118746e-05, "loss": 2.5461, "step": 452 }, { "epoch": 0.42218080149114634, "grad_norm": 1.788061264351037, "learning_rate": 1.9095616154642735e-05, "loss": 2.734, "step": 453 }, { "epoch": 0.42311276794035413, "grad_norm": 1.4676391396034834, "learning_rate": 1.9088712461166724e-05, "loss": 2.2518, "step": 454 }, { "epoch": 0.424044734389562, "grad_norm": 1.561387031195951, "learning_rate": 1.9081808767690717e-05, "loss": 2.4304, "step": 455 }, { "epoch": 0.4249767008387698, "grad_norm": 1.618271655169365, "learning_rate": 1.9074905074214706e-05, "loss": 2.4623, "step": 456 }, { "epoch": 0.42590866728797766, "grad_norm": 1.8806993400548484, "learning_rate": 1.9068001380738698e-05, "loss": 2.48, "step": 457 }, { "epoch": 0.42684063373718545, "grad_norm": 1.7817716892293123, "learning_rate": 1.9061097687262687e-05, "loss": 2.276, "step": 458 }, { "epoch": 0.4277726001863933, "grad_norm": 2.8370110335007217, "learning_rate": 1.905419399378668e-05, "loss": 2.5387, "step": 459 }, { "epoch": 0.42870456663560114, "grad_norm": 2.0764439437672664, "learning_rate": 1.904729030031067e-05, "loss": 2.4688, "step": 460 }, { "epoch": 0.4296365330848089, "grad_norm": 1.8454163815569764, "learning_rate": 1.9040386606834658e-05, "loss": 2.3219, "step": 461 }, { "epoch": 0.43056849953401677, "grad_norm": 1.8376086173796036, "learning_rate": 1.9033482913358647e-05, "loss": 2.4567, "step": 462 }, { "epoch": 0.4315004659832246, "grad_norm": 1.4366261563258107, "learning_rate": 1.902657921988264e-05, "loss": 2.0204, "step": 463 }, { "epoch": 0.43243243243243246, "grad_norm": 1.8340954044810291, "learning_rate": 1.9019675526406628e-05, "loss": 2.5123, "step": 464 }, { "epoch": 0.43336439888164024, "grad_norm": 1.6313767995338604, "learning_rate": 1.901277183293062e-05, "loss": 2.6928, "step": 465 }, { "epoch": 0.4342963653308481, "grad_norm": 1.7739439343796535, "learning_rate": 1.900586813945461e-05, "loss": 2.2253, "step": 466 }, { "epoch": 0.43522833178005593, "grad_norm": 2.263123164951795, "learning_rate": 1.8998964445978602e-05, "loss": 2.2816, "step": 467 }, { "epoch": 0.4361602982292637, "grad_norm": 2.588090694120915, "learning_rate": 1.899206075250259e-05, "loss": 2.6387, "step": 468 }, { "epoch": 0.43709226467847156, "grad_norm": 1.5033746886925072, "learning_rate": 1.898515705902658e-05, "loss": 2.3344, "step": 469 }, { "epoch": 0.4380242311276794, "grad_norm": 1.7651917725323905, "learning_rate": 1.8978253365550572e-05, "loss": 2.5255, "step": 470 }, { "epoch": 0.43895619757688725, "grad_norm": 1.9755942629271488, "learning_rate": 1.897134967207456e-05, "loss": 2.6881, "step": 471 }, { "epoch": 0.43988816402609504, "grad_norm": 1.643404959860189, "learning_rate": 1.896444597859855e-05, "loss": 2.5334, "step": 472 }, { "epoch": 0.4408201304753029, "grad_norm": 1.5882324987795797, "learning_rate": 1.895754228512254e-05, "loss": 2.739, "step": 473 }, { "epoch": 0.44175209692451073, "grad_norm": 1.558956007509662, "learning_rate": 1.8950638591646532e-05, "loss": 1.9362, "step": 474 }, { "epoch": 0.4426840633737186, "grad_norm": 1.9698930575049758, "learning_rate": 1.894373489817052e-05, "loss": 2.7211, "step": 475 }, { "epoch": 0.44361602982292636, "grad_norm": 1.5024806941135869, "learning_rate": 1.8936831204694513e-05, "loss": 2.0268, "step": 476 }, { "epoch": 0.4445479962721342, "grad_norm": 1.4925489327910602, "learning_rate": 1.8929927511218506e-05, "loss": 2.3355, "step": 477 }, { "epoch": 0.44547996272134205, "grad_norm": 2.5342851785860945, "learning_rate": 1.8923023817742495e-05, "loss": 2.4001, "step": 478 }, { "epoch": 0.44641192917054984, "grad_norm": 1.617991830178864, "learning_rate": 1.8916120124266484e-05, "loss": 2.2724, "step": 479 }, { "epoch": 0.4473438956197577, "grad_norm": 1.8662275157309107, "learning_rate": 1.8909216430790473e-05, "loss": 2.1382, "step": 480 }, { "epoch": 0.4482758620689655, "grad_norm": 1.5593377065865537, "learning_rate": 1.8902312737314465e-05, "loss": 2.1076, "step": 481 }, { "epoch": 0.44920782851817337, "grad_norm": 1.7617467093849617, "learning_rate": 1.8895409043838454e-05, "loss": 2.1116, "step": 482 }, { "epoch": 0.45013979496738116, "grad_norm": 1.9206747947139504, "learning_rate": 1.8888505350362443e-05, "loss": 2.7938, "step": 483 }, { "epoch": 0.451071761416589, "grad_norm": 1.9723399223400295, "learning_rate": 1.8881601656886436e-05, "loss": 2.2548, "step": 484 }, { "epoch": 0.45200372786579684, "grad_norm": 1.701790891601717, "learning_rate": 1.8874697963410425e-05, "loss": 2.3171, "step": 485 }, { "epoch": 0.45293569431500463, "grad_norm": 1.5789129932988915, "learning_rate": 1.8867794269934417e-05, "loss": 2.3919, "step": 486 }, { "epoch": 0.4538676607642125, "grad_norm": 1.6668022142898915, "learning_rate": 1.8860890576458406e-05, "loss": 2.4587, "step": 487 }, { "epoch": 0.4547996272134203, "grad_norm": 1.7746240630162236, "learning_rate": 1.88539868829824e-05, "loss": 2.3923, "step": 488 }, { "epoch": 0.45573159366262816, "grad_norm": 1.518068529852265, "learning_rate": 1.8847083189506388e-05, "loss": 1.8592, "step": 489 }, { "epoch": 0.45666356011183595, "grad_norm": 1.980876445772171, "learning_rate": 1.8840179496030377e-05, "loss": 2.4006, "step": 490 }, { "epoch": 0.4575955265610438, "grad_norm": 1.657674691649083, "learning_rate": 1.8833275802554366e-05, "loss": 2.283, "step": 491 }, { "epoch": 0.45852749301025164, "grad_norm": 1.9199649705658026, "learning_rate": 1.882637210907836e-05, "loss": 2.6051, "step": 492 }, { "epoch": 0.4594594594594595, "grad_norm": 1.8685498968184326, "learning_rate": 1.8819468415602347e-05, "loss": 2.6472, "step": 493 }, { "epoch": 0.4603914259086673, "grad_norm": 1.6449753163604137, "learning_rate": 1.881256472212634e-05, "loss": 2.5933, "step": 494 }, { "epoch": 0.4613233923578751, "grad_norm": 1.6992386625557163, "learning_rate": 1.880566102865033e-05, "loss": 2.4904, "step": 495 }, { "epoch": 0.46225535880708296, "grad_norm": 1.783788942821385, "learning_rate": 1.879875733517432e-05, "loss": 2.4116, "step": 496 }, { "epoch": 0.46318732525629075, "grad_norm": 1.8009652855059717, "learning_rate": 1.879185364169831e-05, "loss": 2.5727, "step": 497 }, { "epoch": 0.4641192917054986, "grad_norm": 1.4309008357943231, "learning_rate": 1.87849499482223e-05, "loss": 1.7237, "step": 498 }, { "epoch": 0.46505125815470644, "grad_norm": 1.5553479819470342, "learning_rate": 1.8778046254746292e-05, "loss": 2.4031, "step": 499 }, { "epoch": 0.4659832246039143, "grad_norm": 1.8299578693967342, "learning_rate": 1.877114256127028e-05, "loss": 2.4303, "step": 500 }, { "epoch": 0.46691519105312207, "grad_norm": 2.051209554347143, "learning_rate": 1.876423886779427e-05, "loss": 2.5548, "step": 501 }, { "epoch": 0.4678471575023299, "grad_norm": 1.6355667653236006, "learning_rate": 1.8757335174318262e-05, "loss": 2.6295, "step": 502 }, { "epoch": 0.46877912395153776, "grad_norm": 2.6146443349045936, "learning_rate": 1.875043148084225e-05, "loss": 2.2944, "step": 503 }, { "epoch": 0.46971109040074555, "grad_norm": 1.5830168385125085, "learning_rate": 1.8743527787366244e-05, "loss": 2.2786, "step": 504 }, { "epoch": 0.4706430568499534, "grad_norm": 1.7309669705615185, "learning_rate": 1.8736624093890233e-05, "loss": 2.1528, "step": 505 }, { "epoch": 0.47157502329916123, "grad_norm": 1.7630520257309679, "learning_rate": 1.8729720400414225e-05, "loss": 2.0889, "step": 506 }, { "epoch": 0.4725069897483691, "grad_norm": 2.0225005171354824, "learning_rate": 1.8722816706938214e-05, "loss": 2.5834, "step": 507 }, { "epoch": 0.47343895619757687, "grad_norm": 1.4882866998531692, "learning_rate": 1.8715913013462203e-05, "loss": 2.4635, "step": 508 }, { "epoch": 0.4743709226467847, "grad_norm": 1.515579327839259, "learning_rate": 1.8709009319986192e-05, "loss": 2.3409, "step": 509 }, { "epoch": 0.47530288909599255, "grad_norm": 1.4079257829688998, "learning_rate": 1.8702105626510185e-05, "loss": 2.1216, "step": 510 }, { "epoch": 0.4762348555452004, "grad_norm": 1.668179816510298, "learning_rate": 1.8695201933034174e-05, "loss": 2.1399, "step": 511 }, { "epoch": 0.4771668219944082, "grad_norm": 1.5980601925974791, "learning_rate": 1.8688298239558166e-05, "loss": 2.1708, "step": 512 }, { "epoch": 0.47809878844361603, "grad_norm": 1.6638258508567916, "learning_rate": 1.8681394546082155e-05, "loss": 2.1404, "step": 513 }, { "epoch": 0.4790307548928239, "grad_norm": 1.5101345726403825, "learning_rate": 1.8674490852606148e-05, "loss": 2.2924, "step": 514 }, { "epoch": 0.47996272134203166, "grad_norm": 1.65001425724559, "learning_rate": 1.8667587159130137e-05, "loss": 2.3224, "step": 515 }, { "epoch": 0.4808946877912395, "grad_norm": 1.725910920160148, "learning_rate": 1.8660683465654126e-05, "loss": 2.272, "step": 516 }, { "epoch": 0.48182665424044735, "grad_norm": 1.767203751841675, "learning_rate": 1.8653779772178118e-05, "loss": 2.3482, "step": 517 }, { "epoch": 0.4827586206896552, "grad_norm": 1.7754022579605735, "learning_rate": 1.8646876078702107e-05, "loss": 2.2276, "step": 518 }, { "epoch": 0.483690587138863, "grad_norm": 1.510810581360773, "learning_rate": 1.8639972385226096e-05, "loss": 2.425, "step": 519 }, { "epoch": 0.4846225535880708, "grad_norm": 2.301140357481207, "learning_rate": 1.8633068691750085e-05, "loss": 2.4617, "step": 520 }, { "epoch": 0.48555452003727867, "grad_norm": 1.5930673457168807, "learning_rate": 1.8626164998274078e-05, "loss": 2.1748, "step": 521 }, { "epoch": 0.4864864864864865, "grad_norm": 1.4144192983025854, "learning_rate": 1.861926130479807e-05, "loss": 2.5406, "step": 522 }, { "epoch": 0.4874184529356943, "grad_norm": 1.6102553159566295, "learning_rate": 1.861235761132206e-05, "loss": 2.3527, "step": 523 }, { "epoch": 0.48835041938490215, "grad_norm": 1.6478372819955562, "learning_rate": 1.860545391784605e-05, "loss": 2.3039, "step": 524 }, { "epoch": 0.48928238583411, "grad_norm": 1.7554224784578771, "learning_rate": 1.859855022437004e-05, "loss": 2.4588, "step": 525 }, { "epoch": 0.4902143522833178, "grad_norm": 2.3798957892579513, "learning_rate": 1.859164653089403e-05, "loss": 2.2869, "step": 526 }, { "epoch": 0.4911463187325256, "grad_norm": 1.599139122786272, "learning_rate": 1.858474283741802e-05, "loss": 2.6385, "step": 527 }, { "epoch": 0.49207828518173347, "grad_norm": 1.8968805115905396, "learning_rate": 1.857783914394201e-05, "loss": 2.159, "step": 528 }, { "epoch": 0.4930102516309413, "grad_norm": 1.809917141021073, "learning_rate": 1.8570935450466e-05, "loss": 2.6141, "step": 529 }, { "epoch": 0.4939422180801491, "grad_norm": 1.7675801229587824, "learning_rate": 1.856403175698999e-05, "loss": 2.6255, "step": 530 }, { "epoch": 0.49487418452935694, "grad_norm": 2.2606400794832884, "learning_rate": 1.8557128063513982e-05, "loss": 2.5102, "step": 531 }, { "epoch": 0.4958061509785648, "grad_norm": 1.7582622317019732, "learning_rate": 1.8550224370037974e-05, "loss": 2.1288, "step": 532 }, { "epoch": 0.4967381174277726, "grad_norm": 1.8861458078904425, "learning_rate": 1.8543320676561963e-05, "loss": 2.2008, "step": 533 }, { "epoch": 0.4976700838769804, "grad_norm": 1.428045570295008, "learning_rate": 1.8536416983085952e-05, "loss": 2.6163, "step": 534 }, { "epoch": 0.49860205032618826, "grad_norm": 2.5521220732506547, "learning_rate": 1.8529513289609945e-05, "loss": 1.969, "step": 535 }, { "epoch": 0.4995340167753961, "grad_norm": 1.7554464998865007, "learning_rate": 1.8522609596133934e-05, "loss": 2.234, "step": 536 }, { "epoch": 0.5004659832246039, "grad_norm": 1.5221170061732059, "learning_rate": 1.8515705902657923e-05, "loss": 2.1597, "step": 537 }, { "epoch": 0.5013979496738118, "grad_norm": 1.6646000693439196, "learning_rate": 1.8508802209181912e-05, "loss": 2.0549, "step": 538 }, { "epoch": 0.5023299161230196, "grad_norm": 1.8169321574576747, "learning_rate": 1.8501898515705904e-05, "loss": 2.3396, "step": 539 }, { "epoch": 0.5032618825722274, "grad_norm": 1.6846499438477136, "learning_rate": 1.8494994822229893e-05, "loss": 2.5531, "step": 540 }, { "epoch": 0.5041938490214353, "grad_norm": 1.865270054947802, "learning_rate": 1.8488091128753886e-05, "loss": 2.4741, "step": 541 }, { "epoch": 0.5051258154706431, "grad_norm": 2.087122302289208, "learning_rate": 1.8481187435277875e-05, "loss": 2.4774, "step": 542 }, { "epoch": 0.5060577819198508, "grad_norm": 2.180171049283354, "learning_rate": 1.8474283741801867e-05, "loss": 2.4086, "step": 543 }, { "epoch": 0.5069897483690587, "grad_norm": 1.4468058503702064, "learning_rate": 1.8467380048325856e-05, "loss": 2.1566, "step": 544 }, { "epoch": 0.5079217148182665, "grad_norm": 2.0028251887097084, "learning_rate": 1.8460476354849845e-05, "loss": 2.3777, "step": 545 }, { "epoch": 0.5088536812674743, "grad_norm": 2.157083100096408, "learning_rate": 1.8453572661373838e-05, "loss": 2.8796, "step": 546 }, { "epoch": 0.5097856477166822, "grad_norm": 2.0221608581708184, "learning_rate": 1.8446668967897827e-05, "loss": 2.4618, "step": 547 }, { "epoch": 0.51071761416589, "grad_norm": 1.478043669450722, "learning_rate": 1.8439765274421816e-05, "loss": 2.2457, "step": 548 }, { "epoch": 0.5116495806150979, "grad_norm": 1.7603713819461808, "learning_rate": 1.8432861580945808e-05, "loss": 2.8598, "step": 549 }, { "epoch": 0.5125815470643057, "grad_norm": 1.860685695439536, "learning_rate": 1.8425957887469797e-05, "loss": 2.482, "step": 550 }, { "epoch": 0.5135135135135135, "grad_norm": 1.5646047154457254, "learning_rate": 1.841905419399379e-05, "loss": 2.3932, "step": 551 }, { "epoch": 0.5144454799627214, "grad_norm": 2.2538851673719065, "learning_rate": 1.841215050051778e-05, "loss": 2.4581, "step": 552 }, { "epoch": 0.5153774464119292, "grad_norm": 2.129390674538297, "learning_rate": 1.8405246807041768e-05, "loss": 2.3087, "step": 553 }, { "epoch": 0.516309412861137, "grad_norm": 1.9510396735919504, "learning_rate": 1.839834311356576e-05, "loss": 2.3381, "step": 554 }, { "epoch": 0.5172413793103449, "grad_norm": 2.0343147398183548, "learning_rate": 1.839143942008975e-05, "loss": 2.5663, "step": 555 }, { "epoch": 0.5181733457595527, "grad_norm": 1.756391495937355, "learning_rate": 1.8384535726613738e-05, "loss": 2.5705, "step": 556 }, { "epoch": 0.5191053122087604, "grad_norm": 1.6128902044165936, "learning_rate": 1.837763203313773e-05, "loss": 2.4191, "step": 557 }, { "epoch": 0.5200372786579683, "grad_norm": 1.8787846570576376, "learning_rate": 1.837072833966172e-05, "loss": 2.4635, "step": 558 }, { "epoch": 0.5209692451071761, "grad_norm": 1.7132272221145541, "learning_rate": 1.8363824646185712e-05, "loss": 2.0818, "step": 559 }, { "epoch": 0.5219012115563839, "grad_norm": 1.5928650969450737, "learning_rate": 1.83569209527097e-05, "loss": 2.106, "step": 560 }, { "epoch": 0.5228331780055918, "grad_norm": 1.6088478130583097, "learning_rate": 1.8350017259233694e-05, "loss": 2.2567, "step": 561 }, { "epoch": 0.5237651444547996, "grad_norm": 1.6291766938924395, "learning_rate": 1.8343113565757683e-05, "loss": 2.0544, "step": 562 }, { "epoch": 0.5246971109040075, "grad_norm": 1.529485573345341, "learning_rate": 1.833620987228167e-05, "loss": 2.3309, "step": 563 }, { "epoch": 0.5256290773532153, "grad_norm": 1.3816387205078178, "learning_rate": 1.832930617880566e-05, "loss": 1.975, "step": 564 }, { "epoch": 0.5265610438024231, "grad_norm": 1.495875069688022, "learning_rate": 1.8322402485329653e-05, "loss": 2.1401, "step": 565 }, { "epoch": 0.527493010251631, "grad_norm": 2.145367516466008, "learning_rate": 1.8315498791853642e-05, "loss": 2.3903, "step": 566 }, { "epoch": 0.5284249767008388, "grad_norm": 1.8019542401841993, "learning_rate": 1.830859509837763e-05, "loss": 2.2706, "step": 567 }, { "epoch": 0.5293569431500466, "grad_norm": 1.531290456633173, "learning_rate": 1.8301691404901624e-05, "loss": 2.2241, "step": 568 }, { "epoch": 0.5302889095992545, "grad_norm": 1.871533855647624, "learning_rate": 1.8294787711425616e-05, "loss": 2.5704, "step": 569 }, { "epoch": 0.5312208760484622, "grad_norm": 2.046485331309377, "learning_rate": 1.8287884017949605e-05, "loss": 2.5027, "step": 570 }, { "epoch": 0.53215284249767, "grad_norm": 1.6118406096085731, "learning_rate": 1.8280980324473594e-05, "loss": 1.9888, "step": 571 }, { "epoch": 0.5330848089468779, "grad_norm": 1.5602065466364712, "learning_rate": 1.8274076630997587e-05, "loss": 2.4036, "step": 572 }, { "epoch": 0.5340167753960857, "grad_norm": 1.5814724110136673, "learning_rate": 1.8267172937521576e-05, "loss": 2.6061, "step": 573 }, { "epoch": 0.5349487418452936, "grad_norm": 1.6107475910243625, "learning_rate": 1.8260269244045565e-05, "loss": 2.2097, "step": 574 }, { "epoch": 0.5358807082945014, "grad_norm": 1.6067200190885684, "learning_rate": 1.8253365550569557e-05, "loss": 2.4084, "step": 575 }, { "epoch": 0.5368126747437092, "grad_norm": 1.6134024090299897, "learning_rate": 1.8246461857093546e-05, "loss": 2.1932, "step": 576 }, { "epoch": 0.5377446411929171, "grad_norm": 1.5458794230708037, "learning_rate": 1.8239558163617535e-05, "loss": 2.3334, "step": 577 }, { "epoch": 0.5386766076421249, "grad_norm": 1.8049990917038101, "learning_rate": 1.8232654470141528e-05, "loss": 2.4757, "step": 578 }, { "epoch": 0.5396085740913327, "grad_norm": 1.5391777505682767, "learning_rate": 1.822575077666552e-05, "loss": 2.3773, "step": 579 }, { "epoch": 0.5405405405405406, "grad_norm": 1.7383137530065753, "learning_rate": 1.821884708318951e-05, "loss": 2.5384, "step": 580 }, { "epoch": 0.5414725069897484, "grad_norm": 1.341427302354314, "learning_rate": 1.8211943389713498e-05, "loss": 2.5526, "step": 581 }, { "epoch": 0.5424044734389561, "grad_norm": 1.3764119325676258, "learning_rate": 1.8205039696237487e-05, "loss": 2.236, "step": 582 }, { "epoch": 0.543336439888164, "grad_norm": 1.4033962130215378, "learning_rate": 1.819813600276148e-05, "loss": 2.0799, "step": 583 }, { "epoch": 0.5442684063373718, "grad_norm": 1.5277196701082982, "learning_rate": 1.819123230928547e-05, "loss": 2.272, "step": 584 }, { "epoch": 0.5452003727865797, "grad_norm": 1.7113767135117346, "learning_rate": 1.8184328615809458e-05, "loss": 2.2555, "step": 585 }, { "epoch": 0.5461323392357875, "grad_norm": 1.7562086947485298, "learning_rate": 1.817742492233345e-05, "loss": 1.9593, "step": 586 }, { "epoch": 0.5470643056849953, "grad_norm": 1.919775797704984, "learning_rate": 1.817052122885744e-05, "loss": 2.289, "step": 587 }, { "epoch": 0.5479962721342032, "grad_norm": 8.981615210648284, "learning_rate": 1.816361753538143e-05, "loss": 2.6117, "step": 588 }, { "epoch": 0.548928238583411, "grad_norm": 1.4369130808378567, "learning_rate": 1.815671384190542e-05, "loss": 2.4286, "step": 589 }, { "epoch": 0.5498602050326188, "grad_norm": 1.4469120153196182, "learning_rate": 1.8149810148429413e-05, "loss": 2.2959, "step": 590 }, { "epoch": 0.5507921714818267, "grad_norm": 1.5967671443388536, "learning_rate": 1.8142906454953402e-05, "loss": 2.6071, "step": 591 }, { "epoch": 0.5517241379310345, "grad_norm": 1.355017364787645, "learning_rate": 1.813600276147739e-05, "loss": 2.1381, "step": 592 }, { "epoch": 0.5526561043802423, "grad_norm": 1.7247666866324232, "learning_rate": 1.812909906800138e-05, "loss": 2.3542, "step": 593 }, { "epoch": 0.5535880708294502, "grad_norm": 1.5427574683912506, "learning_rate": 1.8122195374525372e-05, "loss": 2.6044, "step": 594 }, { "epoch": 0.554520037278658, "grad_norm": 1.6101890936501613, "learning_rate": 1.811529168104936e-05, "loss": 2.2601, "step": 595 }, { "epoch": 0.5554520037278659, "grad_norm": 1.8501920401471328, "learning_rate": 1.8108387987573354e-05, "loss": 2.2932, "step": 596 }, { "epoch": 0.5563839701770736, "grad_norm": 1.837825933291597, "learning_rate": 1.8101484294097343e-05, "loss": 2.4776, "step": 597 }, { "epoch": 0.5573159366262814, "grad_norm": 1.4953987427714137, "learning_rate": 1.8094580600621335e-05, "loss": 2.2256, "step": 598 }, { "epoch": 0.5582479030754893, "grad_norm": 1.6692244565592063, "learning_rate": 1.8087676907145324e-05, "loss": 2.6086, "step": 599 }, { "epoch": 0.5591798695246971, "grad_norm": 1.4500821304174227, "learning_rate": 1.8080773213669313e-05, "loss": 2.3948, "step": 600 }, { "epoch": 0.5601118359739049, "grad_norm": 2.1209657122418117, "learning_rate": 1.8073869520193306e-05, "loss": 2.247, "step": 601 }, { "epoch": 0.5610438024231128, "grad_norm": 1.222816873091472, "learning_rate": 1.8066965826717295e-05, "loss": 1.9996, "step": 602 }, { "epoch": 0.5619757688723206, "grad_norm": 1.6645141099072753, "learning_rate": 1.8060062133241284e-05, "loss": 2.1959, "step": 603 }, { "epoch": 0.5629077353215284, "grad_norm": 2.2306208173320243, "learning_rate": 1.8053158439765276e-05, "loss": 2.5842, "step": 604 }, { "epoch": 0.5638397017707363, "grad_norm": 1.7730857607695605, "learning_rate": 1.8046254746289265e-05, "loss": 2.4371, "step": 605 }, { "epoch": 0.5647716682199441, "grad_norm": 1.6809393420260546, "learning_rate": 1.8039351052813258e-05, "loss": 2.2509, "step": 606 }, { "epoch": 0.5657036346691519, "grad_norm": 1.622426488288895, "learning_rate": 1.8032447359337247e-05, "loss": 2.4725, "step": 607 }, { "epoch": 0.5666356011183598, "grad_norm": 2.0119226704183397, "learning_rate": 1.802554366586124e-05, "loss": 2.1862, "step": 608 }, { "epoch": 0.5675675675675675, "grad_norm": 2.154422996863913, "learning_rate": 1.801863997238523e-05, "loss": 2.3395, "step": 609 }, { "epoch": 0.5684995340167754, "grad_norm": 2.2940692814442327, "learning_rate": 1.8011736278909217e-05, "loss": 2.2249, "step": 610 }, { "epoch": 0.5694315004659832, "grad_norm": 1.7870454433686356, "learning_rate": 1.8004832585433206e-05, "loss": 2.4578, "step": 611 }, { "epoch": 0.570363466915191, "grad_norm": 1.7265871447591044, "learning_rate": 1.79979288919572e-05, "loss": 2.3309, "step": 612 }, { "epoch": 0.5712954333643989, "grad_norm": 1.8502037325415859, "learning_rate": 1.7991025198481188e-05, "loss": 2.4026, "step": 613 }, { "epoch": 0.5722273998136067, "grad_norm": 1.548080946879379, "learning_rate": 1.798412150500518e-05, "loss": 2.4717, "step": 614 }, { "epoch": 0.5731593662628145, "grad_norm": 1.468841048800912, "learning_rate": 1.797721781152917e-05, "loss": 2.0679, "step": 615 }, { "epoch": 0.5740913327120224, "grad_norm": 1.7692128129688827, "learning_rate": 1.7970314118053162e-05, "loss": 2.5583, "step": 616 }, { "epoch": 0.5750232991612302, "grad_norm": 1.5352944570004683, "learning_rate": 1.796341042457715e-05, "loss": 2.52, "step": 617 }, { "epoch": 0.575955265610438, "grad_norm": 1.5602081048732663, "learning_rate": 1.795650673110114e-05, "loss": 2.4383, "step": 618 }, { "epoch": 0.5768872320596459, "grad_norm": 1.6208899196829654, "learning_rate": 1.7949603037625132e-05, "loss": 2.2046, "step": 619 }, { "epoch": 0.5778191985088537, "grad_norm": 2.133986274433359, "learning_rate": 1.794269934414912e-05, "loss": 2.1525, "step": 620 }, { "epoch": 0.5787511649580616, "grad_norm": 1.2382255279325454, "learning_rate": 1.793579565067311e-05, "loss": 2.0222, "step": 621 }, { "epoch": 0.5796831314072693, "grad_norm": 1.4210178605015091, "learning_rate": 1.79288919571971e-05, "loss": 2.436, "step": 622 }, { "epoch": 0.5806150978564771, "grad_norm": 1.5985203421043772, "learning_rate": 1.7921988263721092e-05, "loss": 1.9814, "step": 623 }, { "epoch": 0.581547064305685, "grad_norm": 1.9023515362388261, "learning_rate": 1.7915084570245084e-05, "loss": 2.4374, "step": 624 }, { "epoch": 0.5824790307548928, "grad_norm": 1.5796250684717745, "learning_rate": 1.7908180876769073e-05, "loss": 2.512, "step": 625 }, { "epoch": 0.5834109972041006, "grad_norm": 1.6589289857885365, "learning_rate": 1.7901277183293066e-05, "loss": 2.5965, "step": 626 }, { "epoch": 0.5843429636533085, "grad_norm": 1.7484277120770635, "learning_rate": 1.7894373489817055e-05, "loss": 2.3103, "step": 627 }, { "epoch": 0.5852749301025163, "grad_norm": 1.6718295538752617, "learning_rate": 1.7887469796341044e-05, "loss": 2.3656, "step": 628 }, { "epoch": 0.5862068965517241, "grad_norm": 1.9531355302321125, "learning_rate": 1.7880566102865033e-05, "loss": 2.3221, "step": 629 }, { "epoch": 0.587138863000932, "grad_norm": 2.3042500888889137, "learning_rate": 1.7873662409389025e-05, "loss": 2.4295, "step": 630 }, { "epoch": 0.5880708294501398, "grad_norm": 1.6993622188938569, "learning_rate": 1.7866758715913014e-05, "loss": 2.3801, "step": 631 }, { "epoch": 0.5890027958993477, "grad_norm": 1.5196888552590688, "learning_rate": 1.7859855022437003e-05, "loss": 2.1939, "step": 632 }, { "epoch": 0.5899347623485555, "grad_norm": 2.048513661660252, "learning_rate": 1.7852951328960996e-05, "loss": 2.4693, "step": 633 }, { "epoch": 0.5908667287977633, "grad_norm": 1.3780059532238245, "learning_rate": 1.7846047635484988e-05, "loss": 2.346, "step": 634 }, { "epoch": 0.5917986952469712, "grad_norm": 1.5558076052099203, "learning_rate": 1.7839143942008977e-05, "loss": 2.4187, "step": 635 }, { "epoch": 0.5927306616961789, "grad_norm": 1.5780707986180345, "learning_rate": 1.7832240248532966e-05, "loss": 2.2279, "step": 636 }, { "epoch": 0.5936626281453867, "grad_norm": 1.8139715506671468, "learning_rate": 1.782533655505696e-05, "loss": 2.541, "step": 637 }, { "epoch": 0.5945945945945946, "grad_norm": 2.1883837301514903, "learning_rate": 1.7818432861580948e-05, "loss": 2.4539, "step": 638 }, { "epoch": 0.5955265610438024, "grad_norm": 1.6276138766997228, "learning_rate": 1.7811529168104937e-05, "loss": 2.4304, "step": 639 }, { "epoch": 0.5964585274930102, "grad_norm": 1.9692783560258216, "learning_rate": 1.7804625474628926e-05, "loss": 2.1985, "step": 640 }, { "epoch": 0.5973904939422181, "grad_norm": 1.6975795115503598, "learning_rate": 1.7797721781152918e-05, "loss": 2.2306, "step": 641 }, { "epoch": 0.5983224603914259, "grad_norm": 1.284344321492504, "learning_rate": 1.7790818087676907e-05, "loss": 2.2302, "step": 642 }, { "epoch": 0.5992544268406338, "grad_norm": 1.8135489214937195, "learning_rate": 1.77839143942009e-05, "loss": 2.2846, "step": 643 }, { "epoch": 0.6001863932898416, "grad_norm": 1.5026759816574176, "learning_rate": 1.777701070072489e-05, "loss": 2.2462, "step": 644 }, { "epoch": 0.6011183597390494, "grad_norm": 1.7139142184813314, "learning_rate": 1.777010700724888e-05, "loss": 2.3749, "step": 645 }, { "epoch": 0.6020503261882573, "grad_norm": 1.532548119099438, "learning_rate": 1.776320331377287e-05, "loss": 1.8971, "step": 646 }, { "epoch": 0.6029822926374651, "grad_norm": 1.9553973122274377, "learning_rate": 1.775629962029686e-05, "loss": 2.2753, "step": 647 }, { "epoch": 0.6039142590866728, "grad_norm": 1.7429405438750476, "learning_rate": 1.774939592682085e-05, "loss": 2.1508, "step": 648 }, { "epoch": 0.6048462255358807, "grad_norm": 1.881316723987194, "learning_rate": 1.774249223334484e-05, "loss": 2.0702, "step": 649 }, { "epoch": 0.6057781919850885, "grad_norm": 1.4070103820897037, "learning_rate": 1.773558853986883e-05, "loss": 1.9697, "step": 650 }, { "epoch": 0.6067101584342963, "grad_norm": 1.8771070883760774, "learning_rate": 1.7728684846392822e-05, "loss": 2.228, "step": 651 }, { "epoch": 0.6076421248835042, "grad_norm": 1.6196648973782697, "learning_rate": 1.772178115291681e-05, "loss": 2.093, "step": 652 }, { "epoch": 0.608574091332712, "grad_norm": 1.5351841679404403, "learning_rate": 1.7714877459440804e-05, "loss": 2.3512, "step": 653 }, { "epoch": 0.6095060577819198, "grad_norm": 1.4012509521357321, "learning_rate": 1.7707973765964793e-05, "loss": 2.3647, "step": 654 }, { "epoch": 0.6104380242311277, "grad_norm": 1.9374692411420407, "learning_rate": 1.7701070072488782e-05, "loss": 2.3356, "step": 655 }, { "epoch": 0.6113699906803355, "grad_norm": 1.9109813907333006, "learning_rate": 1.7694166379012774e-05, "loss": 2.2657, "step": 656 }, { "epoch": 0.6123019571295434, "grad_norm": 2.0199000845252812, "learning_rate": 1.7687262685536763e-05, "loss": 2.4394, "step": 657 }, { "epoch": 0.6132339235787512, "grad_norm": 1.5768969060599962, "learning_rate": 1.7680358992060752e-05, "loss": 2.2991, "step": 658 }, { "epoch": 0.614165890027959, "grad_norm": 1.3869425290627475, "learning_rate": 1.7673455298584745e-05, "loss": 2.0419, "step": 659 }, { "epoch": 0.6150978564771669, "grad_norm": 1.9378955766048485, "learning_rate": 1.7666551605108734e-05, "loss": 2.0141, "step": 660 }, { "epoch": 0.6160298229263746, "grad_norm": 1.8738632957969406, "learning_rate": 1.7659647911632726e-05, "loss": 2.0466, "step": 661 }, { "epoch": 0.6169617893755824, "grad_norm": 1.3541461622887194, "learning_rate": 1.7652744218156715e-05, "loss": 1.9222, "step": 662 }, { "epoch": 0.6178937558247903, "grad_norm": 1.5729850599359925, "learning_rate": 1.7645840524680708e-05, "loss": 2.0692, "step": 663 }, { "epoch": 0.6188257222739981, "grad_norm": 1.9059582657238978, "learning_rate": 1.7638936831204697e-05, "loss": 2.2621, "step": 664 }, { "epoch": 0.6197576887232059, "grad_norm": 1.4788378162578166, "learning_rate": 1.7632033137728686e-05, "loss": 2.4353, "step": 665 }, { "epoch": 0.6206896551724138, "grad_norm": 1.5998003845618698, "learning_rate": 1.7625129444252678e-05, "loss": 2.3437, "step": 666 }, { "epoch": 0.6216216216216216, "grad_norm": 1.3671709576234876, "learning_rate": 1.7618225750776667e-05, "loss": 2.5544, "step": 667 }, { "epoch": 0.6225535880708295, "grad_norm": 1.7258836476489974, "learning_rate": 1.7611322057300656e-05, "loss": 2.2137, "step": 668 }, { "epoch": 0.6234855545200373, "grad_norm": 1.9039406917198534, "learning_rate": 1.7604418363824645e-05, "loss": 2.4224, "step": 669 }, { "epoch": 0.6244175209692451, "grad_norm": 2.1113905361147878, "learning_rate": 1.7597514670348638e-05, "loss": 2.3168, "step": 670 }, { "epoch": 0.625349487418453, "grad_norm": 1.413612703403848, "learning_rate": 1.759061097687263e-05, "loss": 2.2449, "step": 671 }, { "epoch": 0.6262814538676608, "grad_norm": 1.5634538558291557, "learning_rate": 1.758370728339662e-05, "loss": 2.3374, "step": 672 }, { "epoch": 0.6272134203168686, "grad_norm": 1.4047231894952936, "learning_rate": 1.7576803589920608e-05, "loss": 2.2723, "step": 673 }, { "epoch": 0.6281453867660765, "grad_norm": 1.7868856774079254, "learning_rate": 1.75698998964446e-05, "loss": 2.2242, "step": 674 }, { "epoch": 0.6290773532152842, "grad_norm": 1.5574122840687916, "learning_rate": 1.756299620296859e-05, "loss": 2.0727, "step": 675 }, { "epoch": 0.630009319664492, "grad_norm": 1.6726764696134828, "learning_rate": 1.755609250949258e-05, "loss": 2.6072, "step": 676 }, { "epoch": 0.6309412861136999, "grad_norm": 1.7886129736788112, "learning_rate": 1.754918881601657e-05, "loss": 2.4685, "step": 677 }, { "epoch": 0.6318732525629077, "grad_norm": 1.515774820341664, "learning_rate": 1.754228512254056e-05, "loss": 2.6458, "step": 678 }, { "epoch": 0.6328052190121156, "grad_norm": 1.912014209661561, "learning_rate": 1.753538142906455e-05, "loss": 2.5313, "step": 679 }, { "epoch": 0.6337371854613234, "grad_norm": 1.7781389585944765, "learning_rate": 1.752847773558854e-05, "loss": 2.4682, "step": 680 }, { "epoch": 0.6346691519105312, "grad_norm": 1.4682984562817538, "learning_rate": 1.7521574042112534e-05, "loss": 2.4121, "step": 681 }, { "epoch": 0.6356011183597391, "grad_norm": 1.5097743486396376, "learning_rate": 1.7514670348636523e-05, "loss": 2.3813, "step": 682 }, { "epoch": 0.6365330848089469, "grad_norm": 1.481154654921394, "learning_rate": 1.7507766655160512e-05, "loss": 2.3084, "step": 683 }, { "epoch": 0.6374650512581547, "grad_norm": 1.9270825743443996, "learning_rate": 1.75008629616845e-05, "loss": 2.2657, "step": 684 }, { "epoch": 0.6383970177073626, "grad_norm": 1.5121014543239117, "learning_rate": 1.7493959268208494e-05, "loss": 2.4118, "step": 685 }, { "epoch": 0.6393289841565704, "grad_norm": 1.7044842784065863, "learning_rate": 1.7487055574732483e-05, "loss": 2.181, "step": 686 }, { "epoch": 0.6402609506057781, "grad_norm": 1.6387929451738683, "learning_rate": 1.748015188125647e-05, "loss": 2.1966, "step": 687 }, { "epoch": 0.641192917054986, "grad_norm": 1.7776339967913615, "learning_rate": 1.7473248187780464e-05, "loss": 2.2365, "step": 688 }, { "epoch": 0.6421248835041938, "grad_norm": 1.4414746355026191, "learning_rate": 1.7466344494304453e-05, "loss": 2.4607, "step": 689 }, { "epoch": 0.6430568499534017, "grad_norm": 1.7162944056816485, "learning_rate": 1.7459440800828446e-05, "loss": 2.2007, "step": 690 }, { "epoch": 0.6439888164026095, "grad_norm": 1.8082477733097304, "learning_rate": 1.7452537107352435e-05, "loss": 2.0166, "step": 691 }, { "epoch": 0.6449207828518173, "grad_norm": 1.5761752848753872, "learning_rate": 1.7445633413876427e-05, "loss": 2.0783, "step": 692 }, { "epoch": 0.6458527493010252, "grad_norm": 1.5841195720793393, "learning_rate": 1.7438729720400416e-05, "loss": 2.5398, "step": 693 }, { "epoch": 0.646784715750233, "grad_norm": 1.5238994110398034, "learning_rate": 1.7431826026924405e-05, "loss": 1.9405, "step": 694 }, { "epoch": 0.6477166821994408, "grad_norm": 1.4064850734493224, "learning_rate": 1.7424922333448394e-05, "loss": 2.3006, "step": 695 }, { "epoch": 0.6486486486486487, "grad_norm": 1.711697107494699, "learning_rate": 1.7418018639972387e-05, "loss": 2.2793, "step": 696 }, { "epoch": 0.6495806150978565, "grad_norm": 1.777269877611938, "learning_rate": 1.7411114946496376e-05, "loss": 2.0332, "step": 697 }, { "epoch": 0.6505125815470643, "grad_norm": 1.915173156972755, "learning_rate": 1.7404211253020368e-05, "loss": 2.5588, "step": 698 }, { "epoch": 0.6514445479962722, "grad_norm": 2.361375272936563, "learning_rate": 1.7397307559544357e-05, "loss": 2.468, "step": 699 }, { "epoch": 0.65237651444548, "grad_norm": 2.1690047066654707, "learning_rate": 1.739040386606835e-05, "loss": 2.3236, "step": 700 }, { "epoch": 0.6533084808946877, "grad_norm": 1.4703676974045763, "learning_rate": 1.738350017259234e-05, "loss": 2.3061, "step": 701 }, { "epoch": 0.6542404473438956, "grad_norm": 1.8672180243256902, "learning_rate": 1.7376596479116328e-05, "loss": 2.5306, "step": 702 }, { "epoch": 0.6551724137931034, "grad_norm": 1.4852599528910406, "learning_rate": 1.736969278564032e-05, "loss": 2.5097, "step": 703 }, { "epoch": 0.6561043802423113, "grad_norm": 1.7663884077203327, "learning_rate": 1.736278909216431e-05, "loss": 2.3939, "step": 704 }, { "epoch": 0.6570363466915191, "grad_norm": 1.3519291435141534, "learning_rate": 1.7355885398688298e-05, "loss": 2.2184, "step": 705 }, { "epoch": 0.6579683131407269, "grad_norm": 1.629541844849235, "learning_rate": 1.734898170521229e-05, "loss": 2.1671, "step": 706 }, { "epoch": 0.6589002795899348, "grad_norm": 1.6078151983361757, "learning_rate": 1.734207801173628e-05, "loss": 2.604, "step": 707 }, { "epoch": 0.6598322460391426, "grad_norm": 1.7515753949367625, "learning_rate": 1.7335174318260272e-05, "loss": 1.9801, "step": 708 }, { "epoch": 0.6607642124883504, "grad_norm": 1.5354220782491683, "learning_rate": 1.732827062478426e-05, "loss": 2.6224, "step": 709 }, { "epoch": 0.6616961789375583, "grad_norm": 1.4306757395484986, "learning_rate": 1.7321366931308253e-05, "loss": 2.2295, "step": 710 }, { "epoch": 0.6626281453867661, "grad_norm": 1.7335951959459026, "learning_rate": 1.7314463237832242e-05, "loss": 2.2992, "step": 711 }, { "epoch": 0.6635601118359739, "grad_norm": 1.3827327408965204, "learning_rate": 1.730755954435623e-05, "loss": 2.5596, "step": 712 }, { "epoch": 0.6644920782851818, "grad_norm": 1.7651504669153633, "learning_rate": 1.730065585088022e-05, "loss": 2.0925, "step": 713 }, { "epoch": 0.6654240447343895, "grad_norm": 1.399688525024962, "learning_rate": 1.7293752157404213e-05, "loss": 2.422, "step": 714 }, { "epoch": 0.6663560111835974, "grad_norm": 1.4117011086302813, "learning_rate": 1.7286848463928202e-05, "loss": 2.2194, "step": 715 }, { "epoch": 0.6672879776328052, "grad_norm": 1.4922869232850755, "learning_rate": 1.7279944770452194e-05, "loss": 2.0483, "step": 716 }, { "epoch": 0.668219944082013, "grad_norm": 1.367729622295672, "learning_rate": 1.7273041076976183e-05, "loss": 2.4298, "step": 717 }, { "epoch": 0.6691519105312209, "grad_norm": 1.3239652415691106, "learning_rate": 1.7266137383500176e-05, "loss": 2.2685, "step": 718 }, { "epoch": 0.6700838769804287, "grad_norm": 1.6029400988129938, "learning_rate": 1.7259233690024165e-05, "loss": 2.3839, "step": 719 }, { "epoch": 0.6710158434296365, "grad_norm": 1.5889828755111979, "learning_rate": 1.7252329996548154e-05, "loss": 2.378, "step": 720 }, { "epoch": 0.6719478098788444, "grad_norm": 1.8682334750141762, "learning_rate": 1.7245426303072146e-05, "loss": 2.166, "step": 721 }, { "epoch": 0.6728797763280522, "grad_norm": 1.4871567572172404, "learning_rate": 1.7238522609596135e-05, "loss": 2.4041, "step": 722 }, { "epoch": 0.67381174277726, "grad_norm": 1.4621888653488897, "learning_rate": 1.7231618916120124e-05, "loss": 2.1486, "step": 723 }, { "epoch": 0.6747437092264679, "grad_norm": 2.229671273672102, "learning_rate": 1.7224715222644113e-05, "loss": 2.3814, "step": 724 }, { "epoch": 0.6756756756756757, "grad_norm": 1.5782080328945083, "learning_rate": 1.7217811529168106e-05, "loss": 2.5828, "step": 725 }, { "epoch": 0.6766076421248836, "grad_norm": 2.1896633468466153, "learning_rate": 1.72109078356921e-05, "loss": 2.3489, "step": 726 }, { "epoch": 0.6775396085740913, "grad_norm": 1.5645686046725653, "learning_rate": 1.7204004142216087e-05, "loss": 2.102, "step": 727 }, { "epoch": 0.6784715750232991, "grad_norm": 1.3843170173054118, "learning_rate": 1.719710044874008e-05, "loss": 2.2497, "step": 728 }, { "epoch": 0.679403541472507, "grad_norm": 1.9189203240292994, "learning_rate": 1.719019675526407e-05, "loss": 2.0669, "step": 729 }, { "epoch": 0.6803355079217148, "grad_norm": 1.7258560033536934, "learning_rate": 1.7183293061788058e-05, "loss": 2.1433, "step": 730 }, { "epoch": 0.6812674743709226, "grad_norm": 1.4663483127541672, "learning_rate": 1.7176389368312047e-05, "loss": 2.2134, "step": 731 }, { "epoch": 0.6821994408201305, "grad_norm": 1.3972242050069832, "learning_rate": 1.716948567483604e-05, "loss": 2.3011, "step": 732 }, { "epoch": 0.6831314072693383, "grad_norm": 1.604981854227651, "learning_rate": 1.716258198136003e-05, "loss": 2.2344, "step": 733 }, { "epoch": 0.6840633737185461, "grad_norm": 1.5404561257074922, "learning_rate": 1.7155678287884017e-05, "loss": 2.4097, "step": 734 }, { "epoch": 0.684995340167754, "grad_norm": 1.8309093679496244, "learning_rate": 1.714877459440801e-05, "loss": 2.1069, "step": 735 }, { "epoch": 0.6859273066169618, "grad_norm": 1.3810479240340967, "learning_rate": 1.7141870900932002e-05, "loss": 2.225, "step": 736 }, { "epoch": 0.6868592730661697, "grad_norm": 2.2862359840204327, "learning_rate": 1.713496720745599e-05, "loss": 2.2912, "step": 737 }, { "epoch": 0.6877912395153775, "grad_norm": 2.0285066124496636, "learning_rate": 1.712806351397998e-05, "loss": 2.3886, "step": 738 }, { "epoch": 0.6887232059645852, "grad_norm": 1.554027093805671, "learning_rate": 1.7121159820503973e-05, "loss": 2.3984, "step": 739 }, { "epoch": 0.6896551724137931, "grad_norm": 1.3437735417941152, "learning_rate": 1.7114256127027962e-05, "loss": 1.9951, "step": 740 }, { "epoch": 0.6905871388630009, "grad_norm": 1.312815696064387, "learning_rate": 1.710735243355195e-05, "loss": 2.0939, "step": 741 }, { "epoch": 0.6915191053122087, "grad_norm": 1.5084488695742424, "learning_rate": 1.710044874007594e-05, "loss": 2.0755, "step": 742 }, { "epoch": 0.6924510717614166, "grad_norm": 1.3242087241344724, "learning_rate": 1.7093545046599932e-05, "loss": 2.2815, "step": 743 }, { "epoch": 0.6933830382106244, "grad_norm": 1.654088705756977, "learning_rate": 1.708664135312392e-05, "loss": 2.1714, "step": 744 }, { "epoch": 0.6943150046598322, "grad_norm": 1.5271844454225014, "learning_rate": 1.7079737659647914e-05, "loss": 2.4072, "step": 745 }, { "epoch": 0.6952469711090401, "grad_norm": 1.544883650345823, "learning_rate": 1.7072833966171903e-05, "loss": 2.2469, "step": 746 }, { "epoch": 0.6961789375582479, "grad_norm": 1.6195189671114913, "learning_rate": 1.7065930272695895e-05, "loss": 2.1304, "step": 747 }, { "epoch": 0.6971109040074557, "grad_norm": 1.505476685099248, "learning_rate": 1.7059026579219884e-05, "loss": 2.2136, "step": 748 }, { "epoch": 0.6980428704566636, "grad_norm": 1.3311592063657947, "learning_rate": 1.7052122885743873e-05, "loss": 2.2565, "step": 749 }, { "epoch": 0.6989748369058714, "grad_norm": 2.3446040449804775, "learning_rate": 1.7045219192267866e-05, "loss": 2.0339, "step": 750 }, { "epoch": 0.6999068033550793, "grad_norm": 1.3484245579906844, "learning_rate": 1.7038315498791855e-05, "loss": 2.0957, "step": 751 }, { "epoch": 0.700838769804287, "grad_norm": 2.0444493179398733, "learning_rate": 1.7031411805315844e-05, "loss": 2.179, "step": 752 }, { "epoch": 0.7017707362534948, "grad_norm": 1.5605148946645517, "learning_rate": 1.7024508111839836e-05, "loss": 2.5003, "step": 753 }, { "epoch": 0.7027027027027027, "grad_norm": 1.4910622684635757, "learning_rate": 1.7017604418363825e-05, "loss": 2.4436, "step": 754 }, { "epoch": 0.7036346691519105, "grad_norm": 1.3162963634478362, "learning_rate": 1.7010700724887818e-05, "loss": 2.4075, "step": 755 }, { "epoch": 0.7045666356011183, "grad_norm": 1.5727739072270142, "learning_rate": 1.7003797031411807e-05, "loss": 2.0266, "step": 756 }, { "epoch": 0.7054986020503262, "grad_norm": 1.844787040140555, "learning_rate": 1.69968933379358e-05, "loss": 1.9955, "step": 757 }, { "epoch": 0.706430568499534, "grad_norm": 1.8668317953660196, "learning_rate": 1.6989989644459788e-05, "loss": 2.4193, "step": 758 }, { "epoch": 0.7073625349487418, "grad_norm": 1.4533021759685798, "learning_rate": 1.6983085950983777e-05, "loss": 2.3369, "step": 759 }, { "epoch": 0.7082945013979497, "grad_norm": 1.5185652630380422, "learning_rate": 1.6976182257507766e-05, "loss": 2.1023, "step": 760 }, { "epoch": 0.7092264678471575, "grad_norm": 1.7281478319801802, "learning_rate": 1.696927856403176e-05, "loss": 2.0731, "step": 761 }, { "epoch": 0.7101584342963654, "grad_norm": 2.0145296611010273, "learning_rate": 1.6962374870555748e-05, "loss": 2.9403, "step": 762 }, { "epoch": 0.7110904007455732, "grad_norm": 1.5575443711900578, "learning_rate": 1.695547117707974e-05, "loss": 2.3835, "step": 763 }, { "epoch": 0.712022367194781, "grad_norm": 1.7462202022675617, "learning_rate": 1.694856748360373e-05, "loss": 2.3518, "step": 764 }, { "epoch": 0.7129543336439889, "grad_norm": 1.3256660980309998, "learning_rate": 1.694166379012772e-05, "loss": 2.1579, "step": 765 }, { "epoch": 0.7138863000931966, "grad_norm": 2.27908656849031, "learning_rate": 1.693476009665171e-05, "loss": 2.3773, "step": 766 }, { "epoch": 0.7148182665424044, "grad_norm": 1.6429731418374611, "learning_rate": 1.69278564031757e-05, "loss": 2.1832, "step": 767 }, { "epoch": 0.7157502329916123, "grad_norm": 1.2888014051021566, "learning_rate": 1.6920952709699692e-05, "loss": 2.2696, "step": 768 }, { "epoch": 0.7166821994408201, "grad_norm": 1.2725885549704217, "learning_rate": 1.691404901622368e-05, "loss": 2.1448, "step": 769 }, { "epoch": 0.7176141658900279, "grad_norm": 1.8338867615306629, "learning_rate": 1.690714532274767e-05, "loss": 2.0663, "step": 770 }, { "epoch": 0.7185461323392358, "grad_norm": 1.8687309756472272, "learning_rate": 1.690024162927166e-05, "loss": 2.1125, "step": 771 }, { "epoch": 0.7194780987884436, "grad_norm": 1.543723143742658, "learning_rate": 1.689333793579565e-05, "loss": 2.3186, "step": 772 }, { "epoch": 0.7204100652376515, "grad_norm": 1.3318978793080407, "learning_rate": 1.6886434242319644e-05, "loss": 2.1357, "step": 773 }, { "epoch": 0.7213420316868593, "grad_norm": 1.2950564260714106, "learning_rate": 1.6879530548843633e-05, "loss": 2.1239, "step": 774 }, { "epoch": 0.7222739981360671, "grad_norm": 1.5544478107636992, "learning_rate": 1.6872626855367622e-05, "loss": 2.5975, "step": 775 }, { "epoch": 0.723205964585275, "grad_norm": 1.647197228543481, "learning_rate": 1.6865723161891615e-05, "loss": 2.1966, "step": 776 }, { "epoch": 0.7241379310344828, "grad_norm": 1.455799215701283, "learning_rate": 1.6858819468415604e-05, "loss": 1.9044, "step": 777 }, { "epoch": 0.7250698974836906, "grad_norm": 1.8128952706180206, "learning_rate": 1.6851915774939593e-05, "loss": 1.9984, "step": 778 }, { "epoch": 0.7260018639328985, "grad_norm": 2.38766441589094, "learning_rate": 1.6845012081463585e-05, "loss": 2.4313, "step": 779 }, { "epoch": 0.7269338303821062, "grad_norm": 1.5324456441458751, "learning_rate": 1.6838108387987574e-05, "loss": 1.8972, "step": 780 }, { "epoch": 0.727865796831314, "grad_norm": 1.3850293883549827, "learning_rate": 1.6831204694511563e-05, "loss": 2.3475, "step": 781 }, { "epoch": 0.7287977632805219, "grad_norm": 1.4097839719706158, "learning_rate": 1.6824301001035556e-05, "loss": 2.204, "step": 782 }, { "epoch": 0.7297297297297297, "grad_norm": 1.2841718240402575, "learning_rate": 1.6817397307559548e-05, "loss": 1.8223, "step": 783 }, { "epoch": 0.7306616961789375, "grad_norm": 2.044478312844942, "learning_rate": 1.6810493614083537e-05, "loss": 2.0931, "step": 784 }, { "epoch": 0.7315936626281454, "grad_norm": 2.1443632220002193, "learning_rate": 1.6803589920607526e-05, "loss": 2.6574, "step": 785 }, { "epoch": 0.7325256290773532, "grad_norm": 2.027049383277873, "learning_rate": 1.6796686227131515e-05, "loss": 2.5615, "step": 786 }, { "epoch": 0.7334575955265611, "grad_norm": 1.7129224959559617, "learning_rate": 1.6789782533655508e-05, "loss": 2.1674, "step": 787 }, { "epoch": 0.7343895619757689, "grad_norm": 2.035679422586528, "learning_rate": 1.6782878840179497e-05, "loss": 2.3915, "step": 788 }, { "epoch": 0.7353215284249767, "grad_norm": 2.2387043183990856, "learning_rate": 1.6775975146703486e-05, "loss": 2.2313, "step": 789 }, { "epoch": 0.7362534948741846, "grad_norm": 1.6190554906998624, "learning_rate": 1.6769071453227478e-05, "loss": 2.4687, "step": 790 }, { "epoch": 0.7371854613233924, "grad_norm": 1.3903843480629399, "learning_rate": 1.6762167759751467e-05, "loss": 1.9912, "step": 791 }, { "epoch": 0.7381174277726001, "grad_norm": 1.522681484304724, "learning_rate": 1.675526406627546e-05, "loss": 2.0985, "step": 792 }, { "epoch": 0.739049394221808, "grad_norm": 1.7137793284777303, "learning_rate": 1.674836037279945e-05, "loss": 2.3336, "step": 793 }, { "epoch": 0.7399813606710158, "grad_norm": 1.652009592710027, "learning_rate": 1.674145667932344e-05, "loss": 2.0732, "step": 794 }, { "epoch": 0.7409133271202236, "grad_norm": 1.483307399216106, "learning_rate": 1.673455298584743e-05, "loss": 2.2186, "step": 795 }, { "epoch": 0.7418452935694315, "grad_norm": 1.339546917003664, "learning_rate": 1.672764929237142e-05, "loss": 1.8587, "step": 796 }, { "epoch": 0.7427772600186393, "grad_norm": 1.4763380274343656, "learning_rate": 1.672074559889541e-05, "loss": 2.4875, "step": 797 }, { "epoch": 0.7437092264678472, "grad_norm": 1.4901891454249077, "learning_rate": 1.67138419054194e-05, "loss": 2.1532, "step": 798 }, { "epoch": 0.744641192917055, "grad_norm": 2.498454908828917, "learning_rate": 1.670693821194339e-05, "loss": 2.2049, "step": 799 }, { "epoch": 0.7455731593662628, "grad_norm": 1.8648286865755732, "learning_rate": 1.6700034518467382e-05, "loss": 2.1583, "step": 800 }, { "epoch": 0.7465051258154707, "grad_norm": 1.5268157057258827, "learning_rate": 1.669313082499137e-05, "loss": 2.1508, "step": 801 }, { "epoch": 0.7474370922646785, "grad_norm": 1.349511616555709, "learning_rate": 1.6686227131515363e-05, "loss": 2.1191, "step": 802 }, { "epoch": 0.7483690587138863, "grad_norm": 1.546808206923036, "learning_rate": 1.6679323438039353e-05, "loss": 1.9764, "step": 803 }, { "epoch": 0.7493010251630942, "grad_norm": 1.6052437956770016, "learning_rate": 1.667241974456334e-05, "loss": 1.9927, "step": 804 }, { "epoch": 0.750232991612302, "grad_norm": 1.5099669651240004, "learning_rate": 1.6665516051087334e-05, "loss": 2.1581, "step": 805 }, { "epoch": 0.7511649580615097, "grad_norm": 2.3072909382063065, "learning_rate": 1.6658612357611323e-05, "loss": 2.171, "step": 806 }, { "epoch": 0.7520969245107176, "grad_norm": 1.6568685770484675, "learning_rate": 1.6651708664135312e-05, "loss": 2.4841, "step": 807 }, { "epoch": 0.7530288909599254, "grad_norm": 1.5241673951661432, "learning_rate": 1.6644804970659305e-05, "loss": 2.579, "step": 808 }, { "epoch": 0.7539608574091333, "grad_norm": 1.6165150736749323, "learning_rate": 1.6637901277183294e-05, "loss": 2.4153, "step": 809 }, { "epoch": 0.7548928238583411, "grad_norm": 1.3732667265303207, "learning_rate": 1.6630997583707286e-05, "loss": 2.0843, "step": 810 }, { "epoch": 0.7558247903075489, "grad_norm": 1.3451787069832382, "learning_rate": 1.6624093890231275e-05, "loss": 2.2487, "step": 811 }, { "epoch": 0.7567567567567568, "grad_norm": 1.823222298150742, "learning_rate": 1.6617190196755267e-05, "loss": 2.3287, "step": 812 }, { "epoch": 0.7576887232059646, "grad_norm": 1.4719920160425202, "learning_rate": 1.6610286503279256e-05, "loss": 2.3292, "step": 813 }, { "epoch": 0.7586206896551724, "grad_norm": 1.5665736903964165, "learning_rate": 1.6603382809803246e-05, "loss": 2.397, "step": 814 }, { "epoch": 0.7595526561043803, "grad_norm": 1.3815615399875916, "learning_rate": 1.6596479116327235e-05, "loss": 1.8915, "step": 815 }, { "epoch": 0.7604846225535881, "grad_norm": 1.5142784894971157, "learning_rate": 1.6589575422851227e-05, "loss": 1.7174, "step": 816 }, { "epoch": 0.7614165890027959, "grad_norm": 1.4790336525531065, "learning_rate": 1.6582671729375216e-05, "loss": 2.3212, "step": 817 }, { "epoch": 0.7623485554520038, "grad_norm": 1.1906092980486194, "learning_rate": 1.657576803589921e-05, "loss": 1.8652, "step": 818 }, { "epoch": 0.7632805219012115, "grad_norm": 1.3219720849974366, "learning_rate": 1.6568864342423197e-05, "loss": 2.0533, "step": 819 }, { "epoch": 0.7642124883504194, "grad_norm": 1.36969268900983, "learning_rate": 1.656196064894719e-05, "loss": 1.9924, "step": 820 }, { "epoch": 0.7651444547996272, "grad_norm": 1.3677087522260283, "learning_rate": 1.655505695547118e-05, "loss": 2.1653, "step": 821 }, { "epoch": 0.766076421248835, "grad_norm": 2.1518819027203593, "learning_rate": 1.6548153261995168e-05, "loss": 2.2021, "step": 822 }, { "epoch": 0.7670083876980429, "grad_norm": 1.874751073924987, "learning_rate": 1.654124956851916e-05, "loss": 2.368, "step": 823 }, { "epoch": 0.7679403541472507, "grad_norm": 1.6318574218579005, "learning_rate": 1.653434587504315e-05, "loss": 2.6719, "step": 824 }, { "epoch": 0.7688723205964585, "grad_norm": 1.967121716305956, "learning_rate": 1.652744218156714e-05, "loss": 2.4426, "step": 825 }, { "epoch": 0.7698042870456664, "grad_norm": 1.2495302723322859, "learning_rate": 1.6520538488091128e-05, "loss": 1.9794, "step": 826 }, { "epoch": 0.7707362534948742, "grad_norm": 2.591786501448038, "learning_rate": 1.651363479461512e-05, "loss": 2.336, "step": 827 }, { "epoch": 0.771668219944082, "grad_norm": 1.4549434095762133, "learning_rate": 1.6506731101139112e-05, "loss": 2.2633, "step": 828 }, { "epoch": 0.7726001863932899, "grad_norm": 1.6786103965974735, "learning_rate": 1.64998274076631e-05, "loss": 1.95, "step": 829 }, { "epoch": 0.7735321528424977, "grad_norm": 1.7815965850504185, "learning_rate": 1.6492923714187094e-05, "loss": 2.5001, "step": 830 }, { "epoch": 0.7744641192917054, "grad_norm": 1.262736856254316, "learning_rate": 1.6486020020711083e-05, "loss": 2.0455, "step": 831 }, { "epoch": 0.7753960857409133, "grad_norm": 1.6338226742115856, "learning_rate": 1.6479116327235072e-05, "loss": 2.0149, "step": 832 }, { "epoch": 0.7763280521901211, "grad_norm": 1.3575935682018883, "learning_rate": 1.647221263375906e-05, "loss": 2.2894, "step": 833 }, { "epoch": 0.777260018639329, "grad_norm": 1.2396266187643394, "learning_rate": 1.6465308940283053e-05, "loss": 2.1907, "step": 834 }, { "epoch": 0.7781919850885368, "grad_norm": 1.8969360191790205, "learning_rate": 1.6458405246807042e-05, "loss": 2.2832, "step": 835 }, { "epoch": 0.7791239515377446, "grad_norm": 2.4358249109867245, "learning_rate": 1.645150155333103e-05, "loss": 2.2286, "step": 836 }, { "epoch": 0.7800559179869525, "grad_norm": 1.3441121759838313, "learning_rate": 1.6444597859855024e-05, "loss": 2.3974, "step": 837 }, { "epoch": 0.7809878844361603, "grad_norm": 1.6657238598853972, "learning_rate": 1.6437694166379013e-05, "loss": 2.09, "step": 838 }, { "epoch": 0.7819198508853681, "grad_norm": 1.5475923705938621, "learning_rate": 1.6430790472903005e-05, "loss": 2.4388, "step": 839 }, { "epoch": 0.782851817334576, "grad_norm": 1.5832610215900271, "learning_rate": 1.6423886779426994e-05, "loss": 2.3025, "step": 840 }, { "epoch": 0.7837837837837838, "grad_norm": 1.74460547035744, "learning_rate": 1.6416983085950987e-05, "loss": 2.301, "step": 841 }, { "epoch": 0.7847157502329916, "grad_norm": 1.2630543099070823, "learning_rate": 1.6410079392474976e-05, "loss": 1.9669, "step": 842 }, { "epoch": 0.7856477166821995, "grad_norm": 1.7790262045837226, "learning_rate": 1.6403175698998965e-05, "loss": 2.2965, "step": 843 }, { "epoch": 0.7865796831314072, "grad_norm": 1.4801815178274522, "learning_rate": 1.6396272005522954e-05, "loss": 1.957, "step": 844 }, { "epoch": 0.7875116495806151, "grad_norm": 1.650951363721125, "learning_rate": 1.6389368312046946e-05, "loss": 2.2154, "step": 845 }, { "epoch": 0.7884436160298229, "grad_norm": 1.693708451162557, "learning_rate": 1.6382464618570935e-05, "loss": 2.3992, "step": 846 }, { "epoch": 0.7893755824790307, "grad_norm": 1.5149731298855154, "learning_rate": 1.6375560925094928e-05, "loss": 1.9005, "step": 847 }, { "epoch": 0.7903075489282386, "grad_norm": 1.4250483527641271, "learning_rate": 1.6368657231618917e-05, "loss": 1.9976, "step": 848 }, { "epoch": 0.7912395153774464, "grad_norm": 1.3801940825111652, "learning_rate": 1.636175353814291e-05, "loss": 1.8778, "step": 849 }, { "epoch": 0.7921714818266542, "grad_norm": 1.532154228370424, "learning_rate": 1.63548498446669e-05, "loss": 2.2016, "step": 850 }, { "epoch": 0.7931034482758621, "grad_norm": 1.4070350470338562, "learning_rate": 1.6347946151190887e-05, "loss": 2.5268, "step": 851 }, { "epoch": 0.7940354147250699, "grad_norm": 2.1947155266080407, "learning_rate": 1.634104245771488e-05, "loss": 2.7067, "step": 852 }, { "epoch": 0.7949673811742777, "grad_norm": 1.87530379383935, "learning_rate": 1.633413876423887e-05, "loss": 2.2238, "step": 853 }, { "epoch": 0.7958993476234856, "grad_norm": 1.6176456694506938, "learning_rate": 1.6327235070762858e-05, "loss": 2.5974, "step": 854 }, { "epoch": 0.7968313140726934, "grad_norm": 2.543041012197265, "learning_rate": 1.632033137728685e-05, "loss": 2.2327, "step": 855 }, { "epoch": 0.7977632805219013, "grad_norm": 1.7998445261266003, "learning_rate": 1.631342768381084e-05, "loss": 2.5087, "step": 856 }, { "epoch": 0.798695246971109, "grad_norm": 1.0643050823901232, "learning_rate": 1.6306523990334832e-05, "loss": 1.736, "step": 857 }, { "epoch": 0.7996272134203168, "grad_norm": 1.1899952969611327, "learning_rate": 1.629962029685882e-05, "loss": 1.8949, "step": 858 }, { "epoch": 0.8005591798695247, "grad_norm": 2.053154329196531, "learning_rate": 1.6292716603382813e-05, "loss": 2.5733, "step": 859 }, { "epoch": 0.8014911463187325, "grad_norm": 1.5119899251313127, "learning_rate": 1.6285812909906802e-05, "loss": 2.1228, "step": 860 }, { "epoch": 0.8024231127679403, "grad_norm": 1.450300745142739, "learning_rate": 1.627890921643079e-05, "loss": 2.2252, "step": 861 }, { "epoch": 0.8033550792171482, "grad_norm": 1.1634042669891311, "learning_rate": 1.627200552295478e-05, "loss": 1.9502, "step": 862 }, { "epoch": 0.804287045666356, "grad_norm": 1.4949522290296393, "learning_rate": 1.6265101829478773e-05, "loss": 2.0561, "step": 863 }, { "epoch": 0.8052190121155638, "grad_norm": 1.9185562391213686, "learning_rate": 1.6258198136002762e-05, "loss": 2.2171, "step": 864 }, { "epoch": 0.8061509785647717, "grad_norm": 2.4962454572703945, "learning_rate": 1.6251294442526754e-05, "loss": 2.279, "step": 865 }, { "epoch": 0.8070829450139795, "grad_norm": 1.525477499104841, "learning_rate": 1.6244390749050743e-05, "loss": 2.3982, "step": 866 }, { "epoch": 0.8080149114631874, "grad_norm": 1.2994414312749873, "learning_rate": 1.6237487055574736e-05, "loss": 2.1416, "step": 867 }, { "epoch": 0.8089468779123952, "grad_norm": 1.5622098368294561, "learning_rate": 1.6230583362098725e-05, "loss": 2.4714, "step": 868 }, { "epoch": 0.809878844361603, "grad_norm": 1.891858581499319, "learning_rate": 1.6223679668622714e-05, "loss": 2.414, "step": 869 }, { "epoch": 0.8108108108108109, "grad_norm": 1.5530008456514717, "learning_rate": 1.6216775975146706e-05, "loss": 2.3186, "step": 870 }, { "epoch": 0.8117427772600186, "grad_norm": 1.535418671556967, "learning_rate": 1.6209872281670695e-05, "loss": 2.0187, "step": 871 }, { "epoch": 0.8126747437092264, "grad_norm": 1.4596362996213446, "learning_rate": 1.6202968588194684e-05, "loss": 2.2028, "step": 872 }, { "epoch": 0.8136067101584343, "grad_norm": 1.909544344375068, "learning_rate": 1.6196064894718673e-05, "loss": 2.3401, "step": 873 }, { "epoch": 0.8145386766076421, "grad_norm": 1.2797234853262387, "learning_rate": 1.6189161201242666e-05, "loss": 1.8942, "step": 874 }, { "epoch": 0.8154706430568499, "grad_norm": 1.5263361465903527, "learning_rate": 1.6182257507766658e-05, "loss": 2.3393, "step": 875 }, { "epoch": 0.8164026095060578, "grad_norm": 1.4470525887372045, "learning_rate": 1.6175353814290647e-05, "loss": 2.1802, "step": 876 }, { "epoch": 0.8173345759552656, "grad_norm": 1.4454403623441434, "learning_rate": 1.6168450120814636e-05, "loss": 1.9755, "step": 877 }, { "epoch": 0.8182665424044734, "grad_norm": 1.663324066022331, "learning_rate": 1.616154642733863e-05, "loss": 2.0968, "step": 878 }, { "epoch": 0.8191985088536813, "grad_norm": 1.3007164631605606, "learning_rate": 1.6154642733862618e-05, "loss": 1.99, "step": 879 }, { "epoch": 0.8201304753028891, "grad_norm": 1.3201025168198344, "learning_rate": 1.6147739040386607e-05, "loss": 1.9308, "step": 880 }, { "epoch": 0.821062441752097, "grad_norm": 1.9799088428931761, "learning_rate": 1.61408353469106e-05, "loss": 2.2917, "step": 881 }, { "epoch": 0.8219944082013048, "grad_norm": 1.5405598446478295, "learning_rate": 1.6133931653434588e-05, "loss": 2.2507, "step": 882 }, { "epoch": 0.8229263746505125, "grad_norm": 1.3163109706867737, "learning_rate": 1.6127027959958577e-05, "loss": 2.1071, "step": 883 }, { "epoch": 0.8238583410997204, "grad_norm": 1.3309956114407366, "learning_rate": 1.612012426648257e-05, "loss": 2.0656, "step": 884 }, { "epoch": 0.8247903075489282, "grad_norm": 1.5040161889271724, "learning_rate": 1.6113220573006562e-05, "loss": 2.2913, "step": 885 }, { "epoch": 0.825722273998136, "grad_norm": 1.6305726854570588, "learning_rate": 1.610631687953055e-05, "loss": 2.0206, "step": 886 }, { "epoch": 0.8266542404473439, "grad_norm": 1.6046398550134608, "learning_rate": 1.609941318605454e-05, "loss": 1.9667, "step": 887 }, { "epoch": 0.8275862068965517, "grad_norm": 1.9850142872516423, "learning_rate": 1.6092509492578533e-05, "loss": 2.1249, "step": 888 }, { "epoch": 0.8285181733457595, "grad_norm": 1.4692458615793516, "learning_rate": 1.608560579910252e-05, "loss": 2.0872, "step": 889 }, { "epoch": 0.8294501397949674, "grad_norm": 1.737049199712905, "learning_rate": 1.607870210562651e-05, "loss": 2.2134, "step": 890 }, { "epoch": 0.8303821062441752, "grad_norm": 1.5618424577644223, "learning_rate": 1.60717984121505e-05, "loss": 2.0426, "step": 891 }, { "epoch": 0.8313140726933831, "grad_norm": 1.4719646128194963, "learning_rate": 1.6064894718674492e-05, "loss": 1.8197, "step": 892 }, { "epoch": 0.8322460391425909, "grad_norm": 1.237681727253165, "learning_rate": 1.605799102519848e-05, "loss": 2.0186, "step": 893 }, { "epoch": 0.8331780055917987, "grad_norm": 1.3701462082564075, "learning_rate": 1.6051087331722474e-05, "loss": 2.1124, "step": 894 }, { "epoch": 0.8341099720410066, "grad_norm": 1.524566748498418, "learning_rate": 1.6044183638246463e-05, "loss": 2.27, "step": 895 }, { "epoch": 0.8350419384902144, "grad_norm": 1.5717399692535163, "learning_rate": 1.6037279944770455e-05, "loss": 2.4992, "step": 896 }, { "epoch": 0.8359739049394221, "grad_norm": 1.5528457479982916, "learning_rate": 1.6030376251294444e-05, "loss": 1.8775, "step": 897 }, { "epoch": 0.83690587138863, "grad_norm": 1.2700171154509057, "learning_rate": 1.6023472557818433e-05, "loss": 1.7807, "step": 898 }, { "epoch": 0.8378378378378378, "grad_norm": 1.4822932530798336, "learning_rate": 1.6016568864342426e-05, "loss": 2.3578, "step": 899 }, { "epoch": 0.8387698042870456, "grad_norm": 1.6793814848633772, "learning_rate": 1.6009665170866415e-05, "loss": 2.3879, "step": 900 }, { "epoch": 0.8397017707362535, "grad_norm": 1.5482219837648048, "learning_rate": 1.6002761477390404e-05, "loss": 2.3264, "step": 901 }, { "epoch": 0.8406337371854613, "grad_norm": 1.3935893009905167, "learning_rate": 1.5995857783914396e-05, "loss": 2.0739, "step": 902 }, { "epoch": 0.8415657036346692, "grad_norm": 1.300806334519766, "learning_rate": 1.5988954090438385e-05, "loss": 1.8449, "step": 903 }, { "epoch": 0.842497670083877, "grad_norm": 1.2804850808249824, "learning_rate": 1.5982050396962378e-05, "loss": 2.1607, "step": 904 }, { "epoch": 0.8434296365330848, "grad_norm": 1.5058885724617979, "learning_rate": 1.5975146703486367e-05, "loss": 2.1043, "step": 905 }, { "epoch": 0.8443616029822927, "grad_norm": 1.3735899917175705, "learning_rate": 1.5968243010010356e-05, "loss": 2.2366, "step": 906 }, { "epoch": 0.8452935694315005, "grad_norm": 1.5792435757918426, "learning_rate": 1.5961339316534348e-05, "loss": 2.4593, "step": 907 }, { "epoch": 0.8462255358807083, "grad_norm": 1.3542885936304077, "learning_rate": 1.5954435623058337e-05, "loss": 2.2575, "step": 908 }, { "epoch": 0.8471575023299162, "grad_norm": 1.3740407975276896, "learning_rate": 1.5947531929582326e-05, "loss": 1.8852, "step": 909 }, { "epoch": 0.848089468779124, "grad_norm": 3.539652578340766, "learning_rate": 1.594062823610632e-05, "loss": 2.0059, "step": 910 }, { "epoch": 0.8490214352283317, "grad_norm": 1.3443098732552727, "learning_rate": 1.5933724542630308e-05, "loss": 2.169, "step": 911 }, { "epoch": 0.8499534016775396, "grad_norm": 1.3058738999289017, "learning_rate": 1.59268208491543e-05, "loss": 2.2428, "step": 912 }, { "epoch": 0.8508853681267474, "grad_norm": 1.4109177088440505, "learning_rate": 1.591991715567829e-05, "loss": 2.2242, "step": 913 }, { "epoch": 0.8518173345759553, "grad_norm": 1.4767713767838397, "learning_rate": 1.591301346220228e-05, "loss": 2.0591, "step": 914 }, { "epoch": 0.8527493010251631, "grad_norm": 2.5651440188190673, "learning_rate": 1.590610976872627e-05, "loss": 2.1055, "step": 915 }, { "epoch": 0.8536812674743709, "grad_norm": 1.3832976475589767, "learning_rate": 1.589920607525026e-05, "loss": 2.5265, "step": 916 }, { "epoch": 0.8546132339235788, "grad_norm": 1.5508674105872664, "learning_rate": 1.589230238177425e-05, "loss": 2.3678, "step": 917 }, { "epoch": 0.8555452003727866, "grad_norm": 1.2935249622456488, "learning_rate": 1.588539868829824e-05, "loss": 1.9282, "step": 918 }, { "epoch": 0.8564771668219944, "grad_norm": 1.4636258650795222, "learning_rate": 1.587849499482223e-05, "loss": 2.4413, "step": 919 }, { "epoch": 0.8574091332712023, "grad_norm": 1.3474850762589268, "learning_rate": 1.5871591301346222e-05, "loss": 2.0812, "step": 920 }, { "epoch": 0.8583410997204101, "grad_norm": 1.357015948099907, "learning_rate": 1.586468760787021e-05, "loss": 1.9283, "step": 921 }, { "epoch": 0.8592730661696178, "grad_norm": 1.498835374221067, "learning_rate": 1.5857783914394204e-05, "loss": 2.4179, "step": 922 }, { "epoch": 0.8602050326188257, "grad_norm": 1.666444296146104, "learning_rate": 1.5850880220918193e-05, "loss": 2.1226, "step": 923 }, { "epoch": 0.8611369990680335, "grad_norm": 1.3285976467365297, "learning_rate": 1.5843976527442182e-05, "loss": 2.0344, "step": 924 }, { "epoch": 0.8620689655172413, "grad_norm": 2.5994281516677433, "learning_rate": 1.5837072833966174e-05, "loss": 2.2172, "step": 925 }, { "epoch": 0.8630009319664492, "grad_norm": 1.3571739976311572, "learning_rate": 1.5830169140490164e-05, "loss": 2.2629, "step": 926 }, { "epoch": 0.863932898415657, "grad_norm": 1.3019026105299325, "learning_rate": 1.5823265447014153e-05, "loss": 1.948, "step": 927 }, { "epoch": 0.8648648648648649, "grad_norm": 1.5325181720859435, "learning_rate": 1.5816361753538145e-05, "loss": 2.4012, "step": 928 }, { "epoch": 0.8657968313140727, "grad_norm": 1.7665536684482048, "learning_rate": 1.5809458060062134e-05, "loss": 2.1266, "step": 929 }, { "epoch": 0.8667287977632805, "grad_norm": 1.6710964124566865, "learning_rate": 1.5802554366586126e-05, "loss": 2.0425, "step": 930 }, { "epoch": 0.8676607642124884, "grad_norm": 1.3534919156128489, "learning_rate": 1.5795650673110115e-05, "loss": 2.3971, "step": 931 }, { "epoch": 0.8685927306616962, "grad_norm": 3.3449136440435554, "learning_rate": 1.5788746979634108e-05, "loss": 2.2474, "step": 932 }, { "epoch": 0.869524697110904, "grad_norm": 1.3366354171478378, "learning_rate": 1.5781843286158097e-05, "loss": 1.9857, "step": 933 }, { "epoch": 0.8704566635601119, "grad_norm": 1.3413515183268125, "learning_rate": 1.5774939592682086e-05, "loss": 2.1535, "step": 934 }, { "epoch": 0.8713886300093197, "grad_norm": 1.2390765478555978, "learning_rate": 1.5768035899206075e-05, "loss": 2.0992, "step": 935 }, { "epoch": 0.8723205964585274, "grad_norm": 1.873013867193871, "learning_rate": 1.5761132205730067e-05, "loss": 2.3772, "step": 936 }, { "epoch": 0.8732525629077353, "grad_norm": 1.5818790652953838, "learning_rate": 1.5754228512254056e-05, "loss": 2.247, "step": 937 }, { "epoch": 0.8741845293569431, "grad_norm": 1.501831804715135, "learning_rate": 1.5747324818778046e-05, "loss": 1.8592, "step": 938 }, { "epoch": 0.875116495806151, "grad_norm": 1.3115426147603921, "learning_rate": 1.5740421125302038e-05, "loss": 2.1841, "step": 939 }, { "epoch": 0.8760484622553588, "grad_norm": 1.4709329228238865, "learning_rate": 1.5733517431826027e-05, "loss": 2.3627, "step": 940 }, { "epoch": 0.8769804287045666, "grad_norm": 1.758382934990756, "learning_rate": 1.572661373835002e-05, "loss": 2.3471, "step": 941 }, { "epoch": 0.8779123951537745, "grad_norm": 1.3139807492395612, "learning_rate": 1.571971004487401e-05, "loss": 2.1467, "step": 942 }, { "epoch": 0.8788443616029823, "grad_norm": 1.5374760707202433, "learning_rate": 1.5712806351398e-05, "loss": 2.1865, "step": 943 }, { "epoch": 0.8797763280521901, "grad_norm": 1.567216433807221, "learning_rate": 1.570590265792199e-05, "loss": 2.2008, "step": 944 }, { "epoch": 0.880708294501398, "grad_norm": 1.5474462308926777, "learning_rate": 1.569899896444598e-05, "loss": 1.7824, "step": 945 }, { "epoch": 0.8816402609506058, "grad_norm": 1.6959906258297794, "learning_rate": 1.5692095270969968e-05, "loss": 2.2227, "step": 946 }, { "epoch": 0.8825722273998136, "grad_norm": 1.2991959362613967, "learning_rate": 1.568519157749396e-05, "loss": 2.2576, "step": 947 }, { "epoch": 0.8835041938490215, "grad_norm": 2.390103945301576, "learning_rate": 1.567828788401795e-05, "loss": 2.4667, "step": 948 }, { "epoch": 0.8844361602982292, "grad_norm": 1.2079462930536196, "learning_rate": 1.5671384190541942e-05, "loss": 2.0007, "step": 949 }, { "epoch": 0.8853681267474371, "grad_norm": 1.7309802067923443, "learning_rate": 1.566448049706593e-05, "loss": 2.3797, "step": 950 }, { "epoch": 0.8863000931966449, "grad_norm": 1.568804724880393, "learning_rate": 1.5657576803589923e-05, "loss": 2.077, "step": 951 }, { "epoch": 0.8872320596458527, "grad_norm": 1.8565755626528961, "learning_rate": 1.5650673110113912e-05, "loss": 2.2848, "step": 952 }, { "epoch": 0.8881640260950606, "grad_norm": 1.3494185126844702, "learning_rate": 1.56437694166379e-05, "loss": 2.1667, "step": 953 }, { "epoch": 0.8890959925442684, "grad_norm": 1.5170910487670226, "learning_rate": 1.5636865723161894e-05, "loss": 2.1209, "step": 954 }, { "epoch": 0.8900279589934762, "grad_norm": 1.4954260826606767, "learning_rate": 1.5629962029685883e-05, "loss": 2.1254, "step": 955 }, { "epoch": 0.8909599254426841, "grad_norm": 1.3881573505523543, "learning_rate": 1.5623058336209872e-05, "loss": 2.1277, "step": 956 }, { "epoch": 0.8918918918918919, "grad_norm": 1.517037202615216, "learning_rate": 1.5616154642733864e-05, "loss": 2.3728, "step": 957 }, { "epoch": 0.8928238583410997, "grad_norm": 1.4206804819107688, "learning_rate": 1.5609250949257853e-05, "loss": 2.3973, "step": 958 }, { "epoch": 0.8937558247903076, "grad_norm": 1.231161134629323, "learning_rate": 1.5602347255781846e-05, "loss": 1.7461, "step": 959 }, { "epoch": 0.8946877912395154, "grad_norm": 1.393492529048295, "learning_rate": 1.5595443562305835e-05, "loss": 2.5285, "step": 960 }, { "epoch": 0.8956197576887233, "grad_norm": 1.8699477306709837, "learning_rate": 1.5588539868829827e-05, "loss": 2.3767, "step": 961 }, { "epoch": 0.896551724137931, "grad_norm": 1.3450048852599157, "learning_rate": 1.5581636175353816e-05, "loss": 2.2348, "step": 962 }, { "epoch": 0.8974836905871388, "grad_norm": 1.4407384085232542, "learning_rate": 1.5574732481877805e-05, "loss": 2.001, "step": 963 }, { "epoch": 0.8984156570363467, "grad_norm": 1.5438891063735467, "learning_rate": 1.5567828788401794e-05, "loss": 1.7672, "step": 964 }, { "epoch": 0.8993476234855545, "grad_norm": 1.5720192619745454, "learning_rate": 1.5560925094925787e-05, "loss": 2.1142, "step": 965 }, { "epoch": 0.9002795899347623, "grad_norm": 1.2928167812344047, "learning_rate": 1.5554021401449776e-05, "loss": 1.861, "step": 966 }, { "epoch": 0.9012115563839702, "grad_norm": 1.2268018718003215, "learning_rate": 1.5547117707973768e-05, "loss": 2.2228, "step": 967 }, { "epoch": 0.902143522833178, "grad_norm": 1.5960454078605646, "learning_rate": 1.5540214014497757e-05, "loss": 2.1497, "step": 968 }, { "epoch": 0.9030754892823858, "grad_norm": 1.7201831893251545, "learning_rate": 1.553331032102175e-05, "loss": 2.375, "step": 969 }, { "epoch": 0.9040074557315937, "grad_norm": 1.6787111839805804, "learning_rate": 1.552640662754574e-05, "loss": 2.3345, "step": 970 }, { "epoch": 0.9049394221808015, "grad_norm": 2.12836393032647, "learning_rate": 1.5519502934069728e-05, "loss": 2.1439, "step": 971 }, { "epoch": 0.9058713886300093, "grad_norm": 1.6134684968532131, "learning_rate": 1.551259924059372e-05, "loss": 2.2506, "step": 972 }, { "epoch": 0.9068033550792172, "grad_norm": 1.6365979270551525, "learning_rate": 1.550569554711771e-05, "loss": 1.8863, "step": 973 }, { "epoch": 0.907735321528425, "grad_norm": 1.5832250657664912, "learning_rate": 1.54987918536417e-05, "loss": 2.0236, "step": 974 }, { "epoch": 0.9086672879776329, "grad_norm": 1.9590317286806278, "learning_rate": 1.5491888160165687e-05, "loss": 2.4171, "step": 975 }, { "epoch": 0.9095992544268406, "grad_norm": 1.4973547084367913, "learning_rate": 1.548498446668968e-05, "loss": 2.3196, "step": 976 }, { "epoch": 0.9105312208760484, "grad_norm": 1.1755947947386474, "learning_rate": 1.5478080773213672e-05, "loss": 2.0991, "step": 977 }, { "epoch": 0.9114631873252563, "grad_norm": 1.697770432039578, "learning_rate": 1.547117707973766e-05, "loss": 1.5532, "step": 978 }, { "epoch": 0.9123951537744641, "grad_norm": 1.5956499978930465, "learning_rate": 1.5464273386261654e-05, "loss": 1.8829, "step": 979 }, { "epoch": 0.9133271202236719, "grad_norm": 1.4883549671851786, "learning_rate": 1.5457369692785643e-05, "loss": 2.1124, "step": 980 }, { "epoch": 0.9142590866728798, "grad_norm": 1.5876474340098954, "learning_rate": 1.5450465999309632e-05, "loss": 2.2691, "step": 981 }, { "epoch": 0.9151910531220876, "grad_norm": 1.331180271627316, "learning_rate": 1.544356230583362e-05, "loss": 2.3971, "step": 982 }, { "epoch": 0.9161230195712954, "grad_norm": 1.740724424848564, "learning_rate": 1.5436658612357613e-05, "loss": 2.1022, "step": 983 }, { "epoch": 0.9170549860205033, "grad_norm": 1.3118761224093907, "learning_rate": 1.5429754918881602e-05, "loss": 2.0955, "step": 984 }, { "epoch": 0.9179869524697111, "grad_norm": 1.4853827200372105, "learning_rate": 1.542285122540559e-05, "loss": 2.1433, "step": 985 }, { "epoch": 0.918918918918919, "grad_norm": 1.7106738712794978, "learning_rate": 1.5415947531929584e-05, "loss": 2.5712, "step": 986 }, { "epoch": 0.9198508853681268, "grad_norm": 1.4574063376183883, "learning_rate": 1.5409043838453576e-05, "loss": 1.8888, "step": 987 }, { "epoch": 0.9207828518173345, "grad_norm": 1.1518713494483976, "learning_rate": 1.5402140144977565e-05, "loss": 2.3036, "step": 988 }, { "epoch": 0.9217148182665424, "grad_norm": 1.4883103881568276, "learning_rate": 1.5395236451501554e-05, "loss": 2.2061, "step": 989 }, { "epoch": 0.9226467847157502, "grad_norm": 1.3382694045185548, "learning_rate": 1.5388332758025547e-05, "loss": 1.9916, "step": 990 }, { "epoch": 0.923578751164958, "grad_norm": 1.517955538351478, "learning_rate": 1.5381429064549536e-05, "loss": 2.3698, "step": 991 }, { "epoch": 0.9245107176141659, "grad_norm": 1.480884403658631, "learning_rate": 1.5374525371073525e-05, "loss": 2.3177, "step": 992 }, { "epoch": 0.9254426840633737, "grad_norm": 2.02692982095103, "learning_rate": 1.5367621677597514e-05, "loss": 2.3963, "step": 993 }, { "epoch": 0.9263746505125815, "grad_norm": 1.3383555727115273, "learning_rate": 1.5360717984121506e-05, "loss": 2.2937, "step": 994 }, { "epoch": 0.9273066169617894, "grad_norm": 1.6327442908640681, "learning_rate": 1.5353814290645495e-05, "loss": 2.1327, "step": 995 }, { "epoch": 0.9282385834109972, "grad_norm": 1.5589522653877599, "learning_rate": 1.5346910597169488e-05, "loss": 2.2085, "step": 996 }, { "epoch": 0.9291705498602051, "grad_norm": 1.8984312705014217, "learning_rate": 1.5340006903693477e-05, "loss": 2.2209, "step": 997 }, { "epoch": 0.9301025163094129, "grad_norm": 1.787335548102778, "learning_rate": 1.533310321021747e-05, "loss": 2.2444, "step": 998 }, { "epoch": 0.9310344827586207, "grad_norm": 1.4678817489839944, "learning_rate": 1.5326199516741458e-05, "loss": 1.9267, "step": 999 }, { "epoch": 0.9319664492078286, "grad_norm": 1.5382025049209742, "learning_rate": 1.5319295823265447e-05, "loss": 2.4115, "step": 1000 }, { "epoch": 0.9328984156570364, "grad_norm": 1.348358272929948, "learning_rate": 1.531239212978944e-05, "loss": 2.1427, "step": 1001 }, { "epoch": 0.9338303821062441, "grad_norm": 1.4644777693596271, "learning_rate": 1.530548843631343e-05, "loss": 2.2115, "step": 1002 }, { "epoch": 0.934762348555452, "grad_norm": 1.1985948132598767, "learning_rate": 1.5298584742837418e-05, "loss": 2.036, "step": 1003 }, { "epoch": 0.9356943150046598, "grad_norm": 1.3987611011279981, "learning_rate": 1.529168104936141e-05, "loss": 2.4653, "step": 1004 }, { "epoch": 0.9366262814538676, "grad_norm": 1.600476214825824, "learning_rate": 1.52847773558854e-05, "loss": 2.2513, "step": 1005 }, { "epoch": 0.9375582479030755, "grad_norm": 2.405090582973002, "learning_rate": 1.527787366240939e-05, "loss": 2.3263, "step": 1006 }, { "epoch": 0.9384902143522833, "grad_norm": 1.3777218111877885, "learning_rate": 1.527096996893338e-05, "loss": 2.254, "step": 1007 }, { "epoch": 0.9394221808014911, "grad_norm": 1.256114674999006, "learning_rate": 1.526406627545737e-05, "loss": 2.4916, "step": 1008 }, { "epoch": 0.940354147250699, "grad_norm": 1.2228343486815687, "learning_rate": 1.5257162581981362e-05, "loss": 1.8223, "step": 1009 }, { "epoch": 0.9412861136999068, "grad_norm": 1.1478595211258442, "learning_rate": 1.5250258888505351e-05, "loss": 1.7821, "step": 1010 }, { "epoch": 0.9422180801491147, "grad_norm": 1.4817978454846261, "learning_rate": 1.5243355195029342e-05, "loss": 1.754, "step": 1011 }, { "epoch": 0.9431500465983225, "grad_norm": 1.2187232787776228, "learning_rate": 1.5236451501553333e-05, "loss": 2.6883, "step": 1012 }, { "epoch": 0.9440820130475303, "grad_norm": 1.0708129345297097, "learning_rate": 1.5229547808077323e-05, "loss": 1.7325, "step": 1013 }, { "epoch": 0.9450139794967382, "grad_norm": 1.7740915841743976, "learning_rate": 1.5222644114601312e-05, "loss": 2.2192, "step": 1014 }, { "epoch": 0.9459459459459459, "grad_norm": 1.5060782906937278, "learning_rate": 1.5215740421125303e-05, "loss": 2.2776, "step": 1015 }, { "epoch": 0.9468779123951537, "grad_norm": 1.5325756310352756, "learning_rate": 1.5208836727649294e-05, "loss": 2.0769, "step": 1016 }, { "epoch": 0.9478098788443616, "grad_norm": 1.6286056495969417, "learning_rate": 1.5201933034173285e-05, "loss": 2.0686, "step": 1017 }, { "epoch": 0.9487418452935694, "grad_norm": 1.3694191966460894, "learning_rate": 1.5195029340697274e-05, "loss": 2.0579, "step": 1018 }, { "epoch": 0.9496738117427772, "grad_norm": 1.9074752686307845, "learning_rate": 1.5188125647221266e-05, "loss": 2.2155, "step": 1019 }, { "epoch": 0.9506057781919851, "grad_norm": 1.5618595793843144, "learning_rate": 1.5181221953745255e-05, "loss": 2.1978, "step": 1020 }, { "epoch": 0.9515377446411929, "grad_norm": 1.4740919691501928, "learning_rate": 1.5174318260269246e-05, "loss": 2.2792, "step": 1021 }, { "epoch": 0.9524697110904008, "grad_norm": 1.5308164493400325, "learning_rate": 1.5167414566793235e-05, "loss": 2.2494, "step": 1022 }, { "epoch": 0.9534016775396086, "grad_norm": 1.4758321748180707, "learning_rate": 1.5160510873317227e-05, "loss": 2.2924, "step": 1023 }, { "epoch": 0.9543336439888164, "grad_norm": 1.6801655132836741, "learning_rate": 1.5153607179841216e-05, "loss": 2.2038, "step": 1024 }, { "epoch": 0.9552656104380243, "grad_norm": 1.3755810243126947, "learning_rate": 1.5146703486365207e-05, "loss": 2.3222, "step": 1025 }, { "epoch": 0.9561975768872321, "grad_norm": 1.2293674866643358, "learning_rate": 1.5139799792889196e-05, "loss": 2.1553, "step": 1026 }, { "epoch": 0.9571295433364398, "grad_norm": 1.4552503533783174, "learning_rate": 1.5132896099413189e-05, "loss": 2.2699, "step": 1027 }, { "epoch": 0.9580615097856477, "grad_norm": 1.7670411402311528, "learning_rate": 1.5125992405937178e-05, "loss": 2.2636, "step": 1028 }, { "epoch": 0.9589934762348555, "grad_norm": 1.313289796702676, "learning_rate": 1.5119088712461167e-05, "loss": 2.0172, "step": 1029 }, { "epoch": 0.9599254426840633, "grad_norm": 1.4554753513031944, "learning_rate": 1.5112185018985159e-05, "loss": 2.1074, "step": 1030 }, { "epoch": 0.9608574091332712, "grad_norm": 1.3574285099716192, "learning_rate": 1.510528132550915e-05, "loss": 2.2032, "step": 1031 }, { "epoch": 0.961789375582479, "grad_norm": 1.4469317512931343, "learning_rate": 1.5098377632033139e-05, "loss": 2.2855, "step": 1032 }, { "epoch": 0.9627213420316869, "grad_norm": 1.5172875974821542, "learning_rate": 1.5091473938557128e-05, "loss": 2.136, "step": 1033 }, { "epoch": 0.9636533084808947, "grad_norm": 2.000318582193073, "learning_rate": 1.508457024508112e-05, "loss": 2.122, "step": 1034 }, { "epoch": 0.9645852749301025, "grad_norm": 1.5355967400951152, "learning_rate": 1.5077666551605111e-05, "loss": 2.4141, "step": 1035 }, { "epoch": 0.9655172413793104, "grad_norm": 3.7272362320539942, "learning_rate": 1.50707628581291e-05, "loss": 2.2106, "step": 1036 }, { "epoch": 0.9664492078285182, "grad_norm": 1.463614059080507, "learning_rate": 1.5063859164653089e-05, "loss": 1.811, "step": 1037 }, { "epoch": 0.967381174277726, "grad_norm": 1.0908255529214146, "learning_rate": 1.5056955471177081e-05, "loss": 1.9092, "step": 1038 }, { "epoch": 0.9683131407269339, "grad_norm": 1.685666207731347, "learning_rate": 1.505005177770107e-05, "loss": 2.2396, "step": 1039 }, { "epoch": 0.9692451071761417, "grad_norm": 2.004520451537105, "learning_rate": 1.5043148084225061e-05, "loss": 1.8091, "step": 1040 }, { "epoch": 0.9701770736253494, "grad_norm": 1.400267819903973, "learning_rate": 1.5036244390749054e-05, "loss": 2.0594, "step": 1041 }, { "epoch": 0.9711090400745573, "grad_norm": 2.052239189693397, "learning_rate": 1.5029340697273043e-05, "loss": 2.3719, "step": 1042 }, { "epoch": 0.9720410065237651, "grad_norm": 1.9044678876496508, "learning_rate": 1.5022437003797032e-05, "loss": 2.303, "step": 1043 }, { "epoch": 0.972972972972973, "grad_norm": 1.6590881841149172, "learning_rate": 1.5015533310321023e-05, "loss": 2.0088, "step": 1044 }, { "epoch": 0.9739049394221808, "grad_norm": 1.3598860957075474, "learning_rate": 1.5008629616845013e-05, "loss": 2.3337, "step": 1045 }, { "epoch": 0.9748369058713886, "grad_norm": 1.3271142653417962, "learning_rate": 1.5001725923369004e-05, "loss": 2.2589, "step": 1046 }, { "epoch": 0.9757688723205965, "grad_norm": 1.8531770334160829, "learning_rate": 1.4994822229892993e-05, "loss": 2.7838, "step": 1047 }, { "epoch": 0.9767008387698043, "grad_norm": 1.2945321074385416, "learning_rate": 1.4987918536416984e-05, "loss": 2.0546, "step": 1048 }, { "epoch": 0.9776328052190121, "grad_norm": 1.517192805835363, "learning_rate": 1.4981014842940974e-05, "loss": 2.4818, "step": 1049 }, { "epoch": 0.97856477166822, "grad_norm": 1.8701830361302005, "learning_rate": 1.4974111149464965e-05, "loss": 2.1476, "step": 1050 }, { "epoch": 0.9794967381174278, "grad_norm": 1.7306826056473275, "learning_rate": 1.4967207455988954e-05, "loss": 2.1072, "step": 1051 }, { "epoch": 0.9804287045666356, "grad_norm": 1.6335253363440119, "learning_rate": 1.4960303762512947e-05, "loss": 1.9924, "step": 1052 }, { "epoch": 0.9813606710158435, "grad_norm": 1.531180652446308, "learning_rate": 1.4953400069036936e-05, "loss": 2.2875, "step": 1053 }, { "epoch": 0.9822926374650512, "grad_norm": 1.3530977414794876, "learning_rate": 1.4946496375560926e-05, "loss": 2.2402, "step": 1054 }, { "epoch": 0.983224603914259, "grad_norm": 1.2651022815362098, "learning_rate": 1.4939592682084915e-05, "loss": 2.3793, "step": 1055 }, { "epoch": 0.9841565703634669, "grad_norm": 1.1813392651788475, "learning_rate": 1.4932688988608908e-05, "loss": 1.9422, "step": 1056 }, { "epoch": 0.9850885368126747, "grad_norm": 1.7091151110083123, "learning_rate": 1.4925785295132897e-05, "loss": 2.5311, "step": 1057 }, { "epoch": 0.9860205032618826, "grad_norm": 1.7574178036391357, "learning_rate": 1.4918881601656888e-05, "loss": 2.1683, "step": 1058 }, { "epoch": 0.9869524697110904, "grad_norm": 1.2082698607348785, "learning_rate": 1.4911977908180878e-05, "loss": 2.544, "step": 1059 }, { "epoch": 0.9878844361602982, "grad_norm": 1.6052315302491529, "learning_rate": 1.4905074214704869e-05, "loss": 2.0515, "step": 1060 }, { "epoch": 0.9888164026095061, "grad_norm": 1.3051482765049676, "learning_rate": 1.4898170521228858e-05, "loss": 2.2011, "step": 1061 }, { "epoch": 0.9897483690587139, "grad_norm": 1.6376181262945668, "learning_rate": 1.4891266827752849e-05, "loss": 2.1624, "step": 1062 }, { "epoch": 0.9906803355079217, "grad_norm": 1.2762891416468845, "learning_rate": 1.488436313427684e-05, "loss": 1.8866, "step": 1063 }, { "epoch": 0.9916123019571296, "grad_norm": 1.4736481356953655, "learning_rate": 1.487745944080083e-05, "loss": 2.3262, "step": 1064 }, { "epoch": 0.9925442684063374, "grad_norm": 1.2688400996468054, "learning_rate": 1.487055574732482e-05, "loss": 1.7049, "step": 1065 }, { "epoch": 0.9934762348555451, "grad_norm": 1.3975921922458945, "learning_rate": 1.486365205384881e-05, "loss": 2.1609, "step": 1066 }, { "epoch": 0.994408201304753, "grad_norm": 1.4296740126776948, "learning_rate": 1.4856748360372801e-05, "loss": 1.9085, "step": 1067 }, { "epoch": 0.9953401677539608, "grad_norm": 1.638468348861617, "learning_rate": 1.4849844666896792e-05, "loss": 2.2978, "step": 1068 }, { "epoch": 0.9962721342031687, "grad_norm": 1.4325718502859657, "learning_rate": 1.484294097342078e-05, "loss": 2.4253, "step": 1069 }, { "epoch": 0.9972041006523765, "grad_norm": 1.2824168696152, "learning_rate": 1.4836037279944773e-05, "loss": 2.0635, "step": 1070 }, { "epoch": 0.9981360671015843, "grad_norm": 1.4088806486109426, "learning_rate": 1.4829133586468762e-05, "loss": 1.9821, "step": 1071 }, { "epoch": 0.9990680335507922, "grad_norm": 1.572461586028596, "learning_rate": 1.4822229892992753e-05, "loss": 2.495, "step": 1072 }, { "epoch": 1.0, "grad_norm": 1.1913923963215751, "learning_rate": 1.4815326199516742e-05, "loss": 2.1232, "step": 1073 }, { "epoch": 1.0009319664492078, "grad_norm": 1.5284132520307974, "learning_rate": 1.4808422506040734e-05, "loss": 2.043, "step": 1074 }, { "epoch": 1.0018639328984156, "grad_norm": 1.5740077845561542, "learning_rate": 1.4801518812564723e-05, "loss": 2.2057, "step": 1075 }, { "epoch": 1.0027958993476236, "grad_norm": 1.0914591724226912, "learning_rate": 1.4794615119088714e-05, "loss": 1.7965, "step": 1076 }, { "epoch": 1.0037278657968314, "grad_norm": 1.2813862873746371, "learning_rate": 1.4787711425612703e-05, "loss": 1.817, "step": 1077 }, { "epoch": 1.0046598322460392, "grad_norm": 1.165756569381126, "learning_rate": 1.4780807732136696e-05, "loss": 2.1109, "step": 1078 }, { "epoch": 1.005591798695247, "grad_norm": 1.1252886182699955, "learning_rate": 1.4773904038660685e-05, "loss": 1.7273, "step": 1079 }, { "epoch": 1.0065237651444547, "grad_norm": 1.148417231261902, "learning_rate": 1.4767000345184674e-05, "loss": 1.6091, "step": 1080 }, { "epoch": 1.0074557315936625, "grad_norm": 1.2114524781360283, "learning_rate": 1.4760096651708666e-05, "loss": 2.1406, "step": 1081 }, { "epoch": 1.0083876980428705, "grad_norm": 1.2472100813147295, "learning_rate": 1.4753192958232657e-05, "loss": 1.8517, "step": 1082 }, { "epoch": 1.0093196644920783, "grad_norm": 1.4056329275689583, "learning_rate": 1.4746289264756646e-05, "loss": 1.9051, "step": 1083 }, { "epoch": 1.0102516309412861, "grad_norm": 1.2980876215885362, "learning_rate": 1.4739385571280635e-05, "loss": 1.7284, "step": 1084 }, { "epoch": 1.011183597390494, "grad_norm": 1.257563052019403, "learning_rate": 1.4732481877804627e-05, "loss": 2.0061, "step": 1085 }, { "epoch": 1.0121155638397017, "grad_norm": 1.2854587167066907, "learning_rate": 1.4725578184328618e-05, "loss": 1.8442, "step": 1086 }, { "epoch": 1.0130475302889097, "grad_norm": 1.1205811769650031, "learning_rate": 1.4718674490852607e-05, "loss": 1.8182, "step": 1087 }, { "epoch": 1.0139794967381175, "grad_norm": 1.3132626847327058, "learning_rate": 1.4711770797376596e-05, "loss": 1.815, "step": 1088 }, { "epoch": 1.0149114631873253, "grad_norm": 1.9771283161882611, "learning_rate": 1.4704867103900589e-05, "loss": 2.0027, "step": 1089 }, { "epoch": 1.015843429636533, "grad_norm": 1.4733120503017727, "learning_rate": 1.4697963410424578e-05, "loss": 1.9475, "step": 1090 }, { "epoch": 1.0167753960857409, "grad_norm": 1.219766620041778, "learning_rate": 1.4691059716948568e-05, "loss": 1.841, "step": 1091 }, { "epoch": 1.0177073625349486, "grad_norm": 1.2987700161251927, "learning_rate": 1.468415602347256e-05, "loss": 2.0424, "step": 1092 }, { "epoch": 1.0186393289841567, "grad_norm": 1.3228093529298481, "learning_rate": 1.467725232999655e-05, "loss": 2.1993, "step": 1093 }, { "epoch": 1.0195712954333644, "grad_norm": 1.7372298941524542, "learning_rate": 1.4670348636520539e-05, "loss": 1.9738, "step": 1094 }, { "epoch": 1.0205032618825722, "grad_norm": 1.222399369750198, "learning_rate": 1.466344494304453e-05, "loss": 1.6619, "step": 1095 }, { "epoch": 1.02143522833178, "grad_norm": 1.1177567625246612, "learning_rate": 1.465654124956852e-05, "loss": 1.7552, "step": 1096 }, { "epoch": 1.0223671947809878, "grad_norm": 1.2845986160552167, "learning_rate": 1.4649637556092511e-05, "loss": 1.817, "step": 1097 }, { "epoch": 1.0232991612301958, "grad_norm": 1.3218019262911427, "learning_rate": 1.46427338626165e-05, "loss": 1.5909, "step": 1098 }, { "epoch": 1.0242311276794036, "grad_norm": 1.610241520750157, "learning_rate": 1.463583016914049e-05, "loss": 2.1557, "step": 1099 }, { "epoch": 1.0251630941286114, "grad_norm": 1.4474859302797958, "learning_rate": 1.4628926475664481e-05, "loss": 1.8327, "step": 1100 }, { "epoch": 1.0260950605778192, "grad_norm": 1.1798721944792527, "learning_rate": 1.4622022782188472e-05, "loss": 1.7378, "step": 1101 }, { "epoch": 1.027027027027027, "grad_norm": 1.589072543569551, "learning_rate": 1.4615119088712461e-05, "loss": 2.0464, "step": 1102 }, { "epoch": 1.0279589934762348, "grad_norm": 1.5623935452812632, "learning_rate": 1.4608215395236454e-05, "loss": 1.836, "step": 1103 }, { "epoch": 1.0288909599254428, "grad_norm": 1.1807938785279581, "learning_rate": 1.4601311701760443e-05, "loss": 2.0042, "step": 1104 }, { "epoch": 1.0298229263746506, "grad_norm": 1.4923852191230853, "learning_rate": 1.4594408008284433e-05, "loss": 1.9129, "step": 1105 }, { "epoch": 1.0307548928238583, "grad_norm": 1.3834129676289786, "learning_rate": 1.4587504314808423e-05, "loss": 1.7916, "step": 1106 }, { "epoch": 1.0316868592730661, "grad_norm": 1.4391495221441082, "learning_rate": 1.4580600621332415e-05, "loss": 1.8085, "step": 1107 }, { "epoch": 1.032618825722274, "grad_norm": 1.3788157024861214, "learning_rate": 1.4573696927856404e-05, "loss": 2.0755, "step": 1108 }, { "epoch": 1.0335507921714817, "grad_norm": 1.4816463197304903, "learning_rate": 1.4566793234380395e-05, "loss": 1.7496, "step": 1109 }, { "epoch": 1.0344827586206897, "grad_norm": 1.3898086898120903, "learning_rate": 1.4559889540904385e-05, "loss": 1.8748, "step": 1110 }, { "epoch": 1.0354147250698975, "grad_norm": 1.4241362440458971, "learning_rate": 1.4552985847428376e-05, "loss": 1.6749, "step": 1111 }, { "epoch": 1.0363466915191053, "grad_norm": 1.1619947595386364, "learning_rate": 1.4546082153952365e-05, "loss": 1.9339, "step": 1112 }, { "epoch": 1.037278657968313, "grad_norm": 1.3440004992353438, "learning_rate": 1.4539178460476356e-05, "loss": 1.8169, "step": 1113 }, { "epoch": 1.0382106244175209, "grad_norm": 1.4317212022444057, "learning_rate": 1.4532274767000347e-05, "loss": 2.0559, "step": 1114 }, { "epoch": 1.0391425908667289, "grad_norm": 1.3316992332229363, "learning_rate": 1.4525371073524337e-05, "loss": 2.0391, "step": 1115 }, { "epoch": 1.0400745573159367, "grad_norm": 1.257284690668685, "learning_rate": 1.4518467380048326e-05, "loss": 1.6883, "step": 1116 }, { "epoch": 1.0410065237651445, "grad_norm": 1.3601929164543927, "learning_rate": 1.4511563686572317e-05, "loss": 1.9952, "step": 1117 }, { "epoch": 1.0419384902143523, "grad_norm": 1.5236302921923341, "learning_rate": 1.4504659993096308e-05, "loss": 2.1466, "step": 1118 }, { "epoch": 1.04287045666356, "grad_norm": 1.4609151225691976, "learning_rate": 1.4497756299620299e-05, "loss": 2.3464, "step": 1119 }, { "epoch": 1.0438024231127678, "grad_norm": 1.2745745788935385, "learning_rate": 1.4490852606144288e-05, "loss": 2.0816, "step": 1120 }, { "epoch": 1.0447343895619758, "grad_norm": 1.2236002978137959, "learning_rate": 1.448394891266828e-05, "loss": 1.7388, "step": 1121 }, { "epoch": 1.0456663560111836, "grad_norm": 1.242318900942485, "learning_rate": 1.4477045219192269e-05, "loss": 1.8264, "step": 1122 }, { "epoch": 1.0465983224603914, "grad_norm": 1.2921652688317116, "learning_rate": 1.447014152571626e-05, "loss": 2.1895, "step": 1123 }, { "epoch": 1.0475302889095992, "grad_norm": 1.2097602087353536, "learning_rate": 1.4463237832240249e-05, "loss": 2.0942, "step": 1124 }, { "epoch": 1.048462255358807, "grad_norm": 1.3725128158008144, "learning_rate": 1.4456334138764241e-05, "loss": 1.999, "step": 1125 }, { "epoch": 1.049394221808015, "grad_norm": 1.3282899111686242, "learning_rate": 1.444943044528823e-05, "loss": 2.1269, "step": 1126 }, { "epoch": 1.0503261882572228, "grad_norm": 1.136322753888604, "learning_rate": 1.4442526751812221e-05, "loss": 1.73, "step": 1127 }, { "epoch": 1.0512581547064306, "grad_norm": 1.222821724239529, "learning_rate": 1.443562305833621e-05, "loss": 1.7965, "step": 1128 }, { "epoch": 1.0521901211556384, "grad_norm": 1.2942809825296342, "learning_rate": 1.4428719364860203e-05, "loss": 2.0927, "step": 1129 }, { "epoch": 1.0531220876048462, "grad_norm": 1.6639446819246573, "learning_rate": 1.4421815671384192e-05, "loss": 1.9041, "step": 1130 }, { "epoch": 1.054054054054054, "grad_norm": 1.5575417844238448, "learning_rate": 1.441491197790818e-05, "loss": 2.2222, "step": 1131 }, { "epoch": 1.054986020503262, "grad_norm": 1.3681603676589396, "learning_rate": 1.4408008284432173e-05, "loss": 1.5392, "step": 1132 }, { "epoch": 1.0559179869524697, "grad_norm": 1.1969618978552583, "learning_rate": 1.4401104590956164e-05, "loss": 1.994, "step": 1133 }, { "epoch": 1.0568499534016775, "grad_norm": 1.1858943954365346, "learning_rate": 1.4394200897480153e-05, "loss": 1.9814, "step": 1134 }, { "epoch": 1.0577819198508853, "grad_norm": 1.5531590856605653, "learning_rate": 1.4387297204004142e-05, "loss": 1.9907, "step": 1135 }, { "epoch": 1.058713886300093, "grad_norm": 1.283370425963552, "learning_rate": 1.4380393510528134e-05, "loss": 1.8518, "step": 1136 }, { "epoch": 1.0596458527493011, "grad_norm": 1.156089107943458, "learning_rate": 1.4373489817052125e-05, "loss": 2.0584, "step": 1137 }, { "epoch": 1.060577819198509, "grad_norm": 1.4033302640262169, "learning_rate": 1.4366586123576114e-05, "loss": 2.0093, "step": 1138 }, { "epoch": 1.0615097856477167, "grad_norm": 1.2485359759131518, "learning_rate": 1.4359682430100103e-05, "loss": 1.8909, "step": 1139 }, { "epoch": 1.0624417520969245, "grad_norm": 1.8993658151179076, "learning_rate": 1.4352778736624096e-05, "loss": 2.4119, "step": 1140 }, { "epoch": 1.0633737185461323, "grad_norm": 1.220539816958848, "learning_rate": 1.4345875043148085e-05, "loss": 1.9271, "step": 1141 }, { "epoch": 1.06430568499534, "grad_norm": 1.1485819092685883, "learning_rate": 1.4338971349672075e-05, "loss": 1.457, "step": 1142 }, { "epoch": 1.065237651444548, "grad_norm": 1.1272447524856988, "learning_rate": 1.4332067656196068e-05, "loss": 1.9057, "step": 1143 }, { "epoch": 1.0661696178937559, "grad_norm": 1.3432323914742734, "learning_rate": 1.4325163962720057e-05, "loss": 2.0175, "step": 1144 }, { "epoch": 1.0671015843429636, "grad_norm": 1.3573756227626639, "learning_rate": 1.4318260269244046e-05, "loss": 1.8425, "step": 1145 }, { "epoch": 1.0680335507921714, "grad_norm": 1.36876144856026, "learning_rate": 1.4311356575768037e-05, "loss": 2.0186, "step": 1146 }, { "epoch": 1.0689655172413792, "grad_norm": 1.1759174310205556, "learning_rate": 1.4304452882292027e-05, "loss": 2.0849, "step": 1147 }, { "epoch": 1.0698974836905872, "grad_norm": 1.1779026567827648, "learning_rate": 1.4297549188816018e-05, "loss": 1.8734, "step": 1148 }, { "epoch": 1.070829450139795, "grad_norm": 1.656553202345495, "learning_rate": 1.4290645495340007e-05, "loss": 2.1045, "step": 1149 }, { "epoch": 1.0717614165890028, "grad_norm": 1.2395486384276178, "learning_rate": 1.4283741801864e-05, "loss": 1.8535, "step": 1150 }, { "epoch": 1.0726933830382106, "grad_norm": 1.733278979821276, "learning_rate": 1.4276838108387989e-05, "loss": 2.2128, "step": 1151 }, { "epoch": 1.0736253494874184, "grad_norm": 1.7769057532320371, "learning_rate": 1.426993441491198e-05, "loss": 1.9594, "step": 1152 }, { "epoch": 1.0745573159366262, "grad_norm": 1.413743003787963, "learning_rate": 1.4263030721435968e-05, "loss": 1.9637, "step": 1153 }, { "epoch": 1.0754892823858342, "grad_norm": 1.2272425614944917, "learning_rate": 1.425612702795996e-05, "loss": 1.8357, "step": 1154 }, { "epoch": 1.076421248835042, "grad_norm": 1.401439163439593, "learning_rate": 1.424922333448395e-05, "loss": 2.3007, "step": 1155 }, { "epoch": 1.0773532152842498, "grad_norm": 1.2015484674186538, "learning_rate": 1.424231964100794e-05, "loss": 2.0955, "step": 1156 }, { "epoch": 1.0782851817334576, "grad_norm": 1.2383431723924072, "learning_rate": 1.423541594753193e-05, "loss": 2.2091, "step": 1157 }, { "epoch": 1.0792171481826653, "grad_norm": 1.3854895936490514, "learning_rate": 1.4228512254055922e-05, "loss": 2.2551, "step": 1158 }, { "epoch": 1.0801491146318734, "grad_norm": 1.3801900926929027, "learning_rate": 1.4221608560579911e-05, "loss": 1.6671, "step": 1159 }, { "epoch": 1.0810810810810811, "grad_norm": 1.3061162236285346, "learning_rate": 1.4214704867103902e-05, "loss": 1.7621, "step": 1160 }, { "epoch": 1.082013047530289, "grad_norm": 1.706811263468446, "learning_rate": 1.4207801173627892e-05, "loss": 1.5631, "step": 1161 }, { "epoch": 1.0829450139794967, "grad_norm": 1.1491536619350315, "learning_rate": 1.4200897480151883e-05, "loss": 1.7197, "step": 1162 }, { "epoch": 1.0838769804287045, "grad_norm": 1.4294026696328717, "learning_rate": 1.4193993786675872e-05, "loss": 2.1901, "step": 1163 }, { "epoch": 1.0848089468779123, "grad_norm": 1.1631009997026318, "learning_rate": 1.4187090093199863e-05, "loss": 1.8648, "step": 1164 }, { "epoch": 1.0857409133271203, "grad_norm": 1.3374150644097687, "learning_rate": 1.4180186399723854e-05, "loss": 1.9153, "step": 1165 }, { "epoch": 1.086672879776328, "grad_norm": 1.1803053360444578, "learning_rate": 1.4173282706247844e-05, "loss": 1.737, "step": 1166 }, { "epoch": 1.0876048462255359, "grad_norm": 1.651794236452572, "learning_rate": 1.4166379012771833e-05, "loss": 2.1048, "step": 1167 }, { "epoch": 1.0885368126747437, "grad_norm": 1.326743676476893, "learning_rate": 1.4159475319295824e-05, "loss": 1.9869, "step": 1168 }, { "epoch": 1.0894687791239515, "grad_norm": 1.394784079300575, "learning_rate": 1.4152571625819815e-05, "loss": 2.1912, "step": 1169 }, { "epoch": 1.0904007455731595, "grad_norm": 1.0764753274324745, "learning_rate": 1.4145667932343806e-05, "loss": 1.9084, "step": 1170 }, { "epoch": 1.0913327120223673, "grad_norm": 1.3861097559962978, "learning_rate": 1.4138764238867795e-05, "loss": 1.8075, "step": 1171 }, { "epoch": 1.092264678471575, "grad_norm": 1.2856899226101066, "learning_rate": 1.4131860545391787e-05, "loss": 1.7991, "step": 1172 }, { "epoch": 1.0931966449207828, "grad_norm": 1.6037966279972395, "learning_rate": 1.4124956851915776e-05, "loss": 1.6345, "step": 1173 }, { "epoch": 1.0941286113699906, "grad_norm": 1.268598891828673, "learning_rate": 1.4118053158439767e-05, "loss": 1.6773, "step": 1174 }, { "epoch": 1.0950605778191984, "grad_norm": 1.327428442805058, "learning_rate": 1.4111149464963756e-05, "loss": 2.0749, "step": 1175 }, { "epoch": 1.0959925442684064, "grad_norm": 1.3719996293491996, "learning_rate": 1.4104245771487748e-05, "loss": 1.9458, "step": 1176 }, { "epoch": 1.0969245107176142, "grad_norm": 1.3683557362794427, "learning_rate": 1.4097342078011737e-05, "loss": 2.2369, "step": 1177 }, { "epoch": 1.097856477166822, "grad_norm": 1.370735584047544, "learning_rate": 1.4090438384535728e-05, "loss": 1.6469, "step": 1178 }, { "epoch": 1.0987884436160298, "grad_norm": 1.204526739340107, "learning_rate": 1.4083534691059717e-05, "loss": 2.0606, "step": 1179 }, { "epoch": 1.0997204100652376, "grad_norm": 1.237234412861544, "learning_rate": 1.407663099758371e-05, "loss": 1.7144, "step": 1180 }, { "epoch": 1.1006523765144456, "grad_norm": 1.4959932180532038, "learning_rate": 1.4069727304107699e-05, "loss": 1.8033, "step": 1181 }, { "epoch": 1.1015843429636534, "grad_norm": 1.1321211058687561, "learning_rate": 1.4062823610631688e-05, "loss": 1.6345, "step": 1182 }, { "epoch": 1.1025163094128612, "grad_norm": 1.28572653602034, "learning_rate": 1.405591991715568e-05, "loss": 1.8224, "step": 1183 }, { "epoch": 1.103448275862069, "grad_norm": 1.325320240276451, "learning_rate": 1.404901622367967e-05, "loss": 2.1248, "step": 1184 }, { "epoch": 1.1043802423112767, "grad_norm": 1.2076667336176403, "learning_rate": 1.404211253020366e-05, "loss": 1.8162, "step": 1185 }, { "epoch": 1.1053122087604845, "grad_norm": 1.2778516927957475, "learning_rate": 1.4035208836727649e-05, "loss": 1.9457, "step": 1186 }, { "epoch": 1.1062441752096925, "grad_norm": 1.3012928996748305, "learning_rate": 1.4028305143251641e-05, "loss": 2.0379, "step": 1187 }, { "epoch": 1.1071761416589003, "grad_norm": 1.4587708279737837, "learning_rate": 1.402140144977563e-05, "loss": 2.0333, "step": 1188 }, { "epoch": 1.1081081081081081, "grad_norm": 1.1808348619126143, "learning_rate": 1.4014497756299621e-05, "loss": 1.984, "step": 1189 }, { "epoch": 1.109040074557316, "grad_norm": 1.2710190723179686, "learning_rate": 1.400759406282361e-05, "loss": 1.656, "step": 1190 }, { "epoch": 1.1099720410065237, "grad_norm": 1.1624113753041014, "learning_rate": 1.4000690369347603e-05, "loss": 1.9534, "step": 1191 }, { "epoch": 1.1109040074557317, "grad_norm": 1.3320003224337205, "learning_rate": 1.3993786675871592e-05, "loss": 1.8632, "step": 1192 }, { "epoch": 1.1118359739049395, "grad_norm": 1.2702655172040742, "learning_rate": 1.3986882982395582e-05, "loss": 1.9767, "step": 1193 }, { "epoch": 1.1127679403541473, "grad_norm": 1.3278840351555896, "learning_rate": 1.3979979288919575e-05, "loss": 2.017, "step": 1194 }, { "epoch": 1.113699906803355, "grad_norm": 1.3099187795016887, "learning_rate": 1.3973075595443564e-05, "loss": 1.8387, "step": 1195 }, { "epoch": 1.1146318732525629, "grad_norm": 1.7609565495224238, "learning_rate": 1.3966171901967553e-05, "loss": 1.5787, "step": 1196 }, { "epoch": 1.1155638397017706, "grad_norm": 1.130860622969977, "learning_rate": 1.3959268208491544e-05, "loss": 2.2622, "step": 1197 }, { "epoch": 1.1164958061509787, "grad_norm": 1.2675278538683992, "learning_rate": 1.3952364515015534e-05, "loss": 2.3496, "step": 1198 }, { "epoch": 1.1174277726001864, "grad_norm": 1.5200501060110443, "learning_rate": 1.3945460821539525e-05, "loss": 2.0705, "step": 1199 }, { "epoch": 1.1183597390493942, "grad_norm": 1.8066908412105671, "learning_rate": 1.3938557128063514e-05, "loss": 1.7367, "step": 1200 }, { "epoch": 1.119291705498602, "grad_norm": 1.5794663441047023, "learning_rate": 1.3931653434587507e-05, "loss": 2.0529, "step": 1201 }, { "epoch": 1.1202236719478098, "grad_norm": 1.2103167701554238, "learning_rate": 1.3924749741111496e-05, "loss": 2.4334, "step": 1202 }, { "epoch": 1.1211556383970178, "grad_norm": 1.5843279460781463, "learning_rate": 1.3917846047635486e-05, "loss": 1.9864, "step": 1203 }, { "epoch": 1.1220876048462256, "grad_norm": 1.3879339265232713, "learning_rate": 1.3910942354159475e-05, "loss": 1.8208, "step": 1204 }, { "epoch": 1.1230195712954334, "grad_norm": 1.4271514711083426, "learning_rate": 1.3904038660683468e-05, "loss": 1.9725, "step": 1205 }, { "epoch": 1.1239515377446412, "grad_norm": 1.3761508654951204, "learning_rate": 1.3897134967207457e-05, "loss": 2.0557, "step": 1206 }, { "epoch": 1.124883504193849, "grad_norm": 1.3171338549737273, "learning_rate": 1.3890231273731448e-05, "loss": 1.9092, "step": 1207 }, { "epoch": 1.1258154706430568, "grad_norm": 1.3910716602434017, "learning_rate": 1.3883327580255437e-05, "loss": 2.0351, "step": 1208 }, { "epoch": 1.1267474370922648, "grad_norm": 1.4572975865426612, "learning_rate": 1.3876423886779429e-05, "loss": 2.1265, "step": 1209 }, { "epoch": 1.1276794035414726, "grad_norm": 1.586484751103784, "learning_rate": 1.3869520193303418e-05, "loss": 1.9842, "step": 1210 }, { "epoch": 1.1286113699906803, "grad_norm": 1.2919394632154748, "learning_rate": 1.3862616499827409e-05, "loss": 1.9951, "step": 1211 }, { "epoch": 1.1295433364398881, "grad_norm": 0.9792017257440596, "learning_rate": 1.38557128063514e-05, "loss": 1.4798, "step": 1212 }, { "epoch": 1.130475302889096, "grad_norm": 1.4879449547790247, "learning_rate": 1.384880911287539e-05, "loss": 2.3998, "step": 1213 }, { "epoch": 1.131407269338304, "grad_norm": 1.3901659807504596, "learning_rate": 1.384190541939938e-05, "loss": 2.04, "step": 1214 }, { "epoch": 1.1323392357875117, "grad_norm": 1.1219217850956436, "learning_rate": 1.383500172592337e-05, "loss": 1.8847, "step": 1215 }, { "epoch": 1.1332712022367195, "grad_norm": 1.4907796096315622, "learning_rate": 1.382809803244736e-05, "loss": 2.2483, "step": 1216 }, { "epoch": 1.1342031686859273, "grad_norm": 1.3710775840313953, "learning_rate": 1.3821194338971351e-05, "loss": 1.7888, "step": 1217 }, { "epoch": 1.135135135135135, "grad_norm": 1.2864559660981625, "learning_rate": 1.381429064549534e-05, "loss": 2.1176, "step": 1218 }, { "epoch": 1.1360671015843429, "grad_norm": 1.0422190892480698, "learning_rate": 1.3807386952019331e-05, "loss": 1.8659, "step": 1219 }, { "epoch": 1.1369990680335509, "grad_norm": 1.3418506661982452, "learning_rate": 1.3800483258543322e-05, "loss": 2.1052, "step": 1220 }, { "epoch": 1.1379310344827587, "grad_norm": 1.31163619429359, "learning_rate": 1.3793579565067313e-05, "loss": 1.8345, "step": 1221 }, { "epoch": 1.1388630009319665, "grad_norm": 1.3130030700733988, "learning_rate": 1.3786675871591302e-05, "loss": 1.7457, "step": 1222 }, { "epoch": 1.1397949673811743, "grad_norm": 1.2431160201886098, "learning_rate": 1.3779772178115294e-05, "loss": 2.0466, "step": 1223 }, { "epoch": 1.140726933830382, "grad_norm": 1.4965746902136556, "learning_rate": 1.3772868484639283e-05, "loss": 2.1596, "step": 1224 }, { "epoch": 1.14165890027959, "grad_norm": 1.2240347579118231, "learning_rate": 1.3765964791163274e-05, "loss": 1.8605, "step": 1225 }, { "epoch": 1.1425908667287978, "grad_norm": 1.1043580508309314, "learning_rate": 1.3759061097687263e-05, "loss": 1.7101, "step": 1226 }, { "epoch": 1.1435228331780056, "grad_norm": 1.2454331308283466, "learning_rate": 1.3752157404211255e-05, "loss": 1.5284, "step": 1227 }, { "epoch": 1.1444547996272134, "grad_norm": 2.197083067774712, "learning_rate": 1.3745253710735244e-05, "loss": 2.5952, "step": 1228 }, { "epoch": 1.1453867660764212, "grad_norm": 1.4376202327075034, "learning_rate": 1.3738350017259235e-05, "loss": 1.5449, "step": 1229 }, { "epoch": 1.146318732525629, "grad_norm": 1.4649225664778827, "learning_rate": 1.3731446323783224e-05, "loss": 2.056, "step": 1230 }, { "epoch": 1.147250698974837, "grad_norm": 1.306878310831745, "learning_rate": 1.3724542630307217e-05, "loss": 1.8843, "step": 1231 }, { "epoch": 1.1481826654240448, "grad_norm": 1.336611864232817, "learning_rate": 1.3717638936831206e-05, "loss": 2.0629, "step": 1232 }, { "epoch": 1.1491146318732526, "grad_norm": 1.4636533207972227, "learning_rate": 1.3710735243355195e-05, "loss": 1.586, "step": 1233 }, { "epoch": 1.1500465983224604, "grad_norm": 1.510694843250502, "learning_rate": 1.3703831549879187e-05, "loss": 1.8152, "step": 1234 }, { "epoch": 1.1509785647716682, "grad_norm": 1.5040325675487183, "learning_rate": 1.3696927856403178e-05, "loss": 2.025, "step": 1235 }, { "epoch": 1.1519105312208762, "grad_norm": 1.1371486928806729, "learning_rate": 1.3690024162927167e-05, "loss": 2.1034, "step": 1236 }, { "epoch": 1.152842497670084, "grad_norm": 1.2325965759292845, "learning_rate": 1.3683120469451156e-05, "loss": 2.0692, "step": 1237 }, { "epoch": 1.1537744641192917, "grad_norm": 1.2664180938935736, "learning_rate": 1.3676216775975148e-05, "loss": 2.0034, "step": 1238 }, { "epoch": 1.1547064305684995, "grad_norm": 1.3353062987793012, "learning_rate": 1.3669313082499137e-05, "loss": 1.9044, "step": 1239 }, { "epoch": 1.1556383970177073, "grad_norm": 1.5092266700478956, "learning_rate": 1.3662409389023128e-05, "loss": 1.8833, "step": 1240 }, { "epoch": 1.156570363466915, "grad_norm": 1.278641050414992, "learning_rate": 1.365550569554712e-05, "loss": 1.8999, "step": 1241 }, { "epoch": 1.157502329916123, "grad_norm": 1.4689903885023465, "learning_rate": 1.364860200207111e-05, "loss": 2.1067, "step": 1242 }, { "epoch": 1.158434296365331, "grad_norm": 1.4757273769362669, "learning_rate": 1.3641698308595099e-05, "loss": 1.85, "step": 1243 }, { "epoch": 1.1593662628145387, "grad_norm": 1.4776403108541658, "learning_rate": 1.363479461511909e-05, "loss": 2.1376, "step": 1244 }, { "epoch": 1.1602982292637465, "grad_norm": 1.2496189925122219, "learning_rate": 1.3627890921643082e-05, "loss": 1.9364, "step": 1245 }, { "epoch": 1.1612301957129543, "grad_norm": 1.0954982953941452, "learning_rate": 1.362098722816707e-05, "loss": 1.7874, "step": 1246 }, { "epoch": 1.1621621621621623, "grad_norm": 1.2125046899714458, "learning_rate": 1.361408353469106e-05, "loss": 1.9385, "step": 1247 }, { "epoch": 1.16309412861137, "grad_norm": 1.6170917527263096, "learning_rate": 1.360717984121505e-05, "loss": 1.7613, "step": 1248 }, { "epoch": 1.1640260950605779, "grad_norm": 2.220044782565357, "learning_rate": 1.3600276147739041e-05, "loss": 2.478, "step": 1249 }, { "epoch": 1.1649580615097856, "grad_norm": 1.3037968881667212, "learning_rate": 1.3593372454263032e-05, "loss": 1.9357, "step": 1250 }, { "epoch": 1.1658900279589934, "grad_norm": 1.2518740453983155, "learning_rate": 1.3586468760787021e-05, "loss": 1.8573, "step": 1251 }, { "epoch": 1.1668219944082012, "grad_norm": 1.229760956824613, "learning_rate": 1.3579565067311014e-05, "loss": 1.623, "step": 1252 }, { "epoch": 1.167753960857409, "grad_norm": 1.3294285928621559, "learning_rate": 1.3572661373835003e-05, "loss": 1.8953, "step": 1253 }, { "epoch": 1.168685927306617, "grad_norm": 1.311291498543469, "learning_rate": 1.3565757680358993e-05, "loss": 2.0004, "step": 1254 }, { "epoch": 1.1696178937558248, "grad_norm": 1.3865175193974726, "learning_rate": 1.3558853986882982e-05, "loss": 2.0137, "step": 1255 }, { "epoch": 1.1705498602050326, "grad_norm": 1.5125568238542981, "learning_rate": 1.3551950293406975e-05, "loss": 2.095, "step": 1256 }, { "epoch": 1.1714818266542404, "grad_norm": 1.1661394383564023, "learning_rate": 1.3545046599930964e-05, "loss": 2.1223, "step": 1257 }, { "epoch": 1.1724137931034484, "grad_norm": 1.215716343269265, "learning_rate": 1.3538142906454955e-05, "loss": 1.6139, "step": 1258 }, { "epoch": 1.1733457595526562, "grad_norm": 1.1327659389840996, "learning_rate": 1.3531239212978944e-05, "loss": 1.8109, "step": 1259 }, { "epoch": 1.174277726001864, "grad_norm": 1.2626022834947281, "learning_rate": 1.3524335519502936e-05, "loss": 1.8852, "step": 1260 }, { "epoch": 1.1752096924510718, "grad_norm": 1.5103419517556487, "learning_rate": 1.3517431826026925e-05, "loss": 1.8067, "step": 1261 }, { "epoch": 1.1761416589002796, "grad_norm": 1.3698297102498955, "learning_rate": 1.3510528132550916e-05, "loss": 1.8177, "step": 1262 }, { "epoch": 1.1770736253494873, "grad_norm": 1.1005102454458129, "learning_rate": 1.3503624439074907e-05, "loss": 1.8831, "step": 1263 }, { "epoch": 1.1780055917986951, "grad_norm": 1.2120078898868405, "learning_rate": 1.3496720745598897e-05, "loss": 1.9381, "step": 1264 }, { "epoch": 1.1789375582479031, "grad_norm": 1.1978237721886025, "learning_rate": 1.3489817052122886e-05, "loss": 1.9612, "step": 1265 }, { "epoch": 1.179869524697111, "grad_norm": 1.2473842552432677, "learning_rate": 1.3482913358646877e-05, "loss": 1.8071, "step": 1266 }, { "epoch": 1.1808014911463187, "grad_norm": 1.2988574217043631, "learning_rate": 1.3476009665170868e-05, "loss": 2.0012, "step": 1267 }, { "epoch": 1.1817334575955265, "grad_norm": 1.5461100173866056, "learning_rate": 1.3469105971694858e-05, "loss": 1.9819, "step": 1268 }, { "epoch": 1.1826654240447343, "grad_norm": 1.098266600764879, "learning_rate": 1.3462202278218848e-05, "loss": 1.9096, "step": 1269 }, { "epoch": 1.1835973904939423, "grad_norm": 1.2632337164152025, "learning_rate": 1.3455298584742838e-05, "loss": 1.9753, "step": 1270 }, { "epoch": 1.18452935694315, "grad_norm": 1.0893693200655334, "learning_rate": 1.3448394891266829e-05, "loss": 1.6756, "step": 1271 }, { "epoch": 1.1854613233923579, "grad_norm": 1.1640118360726186, "learning_rate": 1.344149119779082e-05, "loss": 1.6631, "step": 1272 }, { "epoch": 1.1863932898415657, "grad_norm": 1.2536107373947596, "learning_rate": 1.3434587504314809e-05, "loss": 1.7842, "step": 1273 }, { "epoch": 1.1873252562907735, "grad_norm": 1.3753005427946245, "learning_rate": 1.3427683810838801e-05, "loss": 1.8197, "step": 1274 }, { "epoch": 1.1882572227399812, "grad_norm": 1.5360745559050437, "learning_rate": 1.342078011736279e-05, "loss": 2.0452, "step": 1275 }, { "epoch": 1.1891891891891893, "grad_norm": 1.2730723877104484, "learning_rate": 1.3413876423886781e-05, "loss": 1.7122, "step": 1276 }, { "epoch": 1.190121155638397, "grad_norm": 1.3486866401616482, "learning_rate": 1.340697273041077e-05, "loss": 1.9581, "step": 1277 }, { "epoch": 1.1910531220876048, "grad_norm": 1.1154490681673224, "learning_rate": 1.3400069036934762e-05, "loss": 1.9083, "step": 1278 }, { "epoch": 1.1919850885368126, "grad_norm": 1.3811632244424363, "learning_rate": 1.3393165343458751e-05, "loss": 1.9532, "step": 1279 }, { "epoch": 1.1929170549860204, "grad_norm": 1.1544461748893298, "learning_rate": 1.3386261649982742e-05, "loss": 1.5818, "step": 1280 }, { "epoch": 1.1938490214352284, "grad_norm": 1.2539298757232062, "learning_rate": 1.3379357956506733e-05, "loss": 2.0642, "step": 1281 }, { "epoch": 1.1947809878844362, "grad_norm": 1.2964807551837256, "learning_rate": 1.3372454263030724e-05, "loss": 1.9171, "step": 1282 }, { "epoch": 1.195712954333644, "grad_norm": 1.2531411180503138, "learning_rate": 1.3365550569554713e-05, "loss": 1.6606, "step": 1283 }, { "epoch": 1.1966449207828518, "grad_norm": 1.2553506324275174, "learning_rate": 1.3358646876078702e-05, "loss": 2.0059, "step": 1284 }, { "epoch": 1.1975768872320596, "grad_norm": 1.3714839091750883, "learning_rate": 1.3351743182602694e-05, "loss": 2.084, "step": 1285 }, { "epoch": 1.1985088536812674, "grad_norm": 1.072700606734928, "learning_rate": 1.3344839489126685e-05, "loss": 1.7672, "step": 1286 }, { "epoch": 1.1994408201304754, "grad_norm": 1.3974081710884754, "learning_rate": 1.3337935795650674e-05, "loss": 1.9867, "step": 1287 }, { "epoch": 1.2003727865796832, "grad_norm": 1.1367536344003706, "learning_rate": 1.3331032102174663e-05, "loss": 1.7119, "step": 1288 }, { "epoch": 1.201304753028891, "grad_norm": 1.0636698631147705, "learning_rate": 1.3324128408698655e-05, "loss": 1.2575, "step": 1289 }, { "epoch": 1.2022367194780987, "grad_norm": 1.4507772689981455, "learning_rate": 1.3317224715222644e-05, "loss": 1.9706, "step": 1290 }, { "epoch": 1.2031686859273065, "grad_norm": 1.9170538784274738, "learning_rate": 1.3310321021746635e-05, "loss": 1.7185, "step": 1291 }, { "epoch": 1.2041006523765145, "grad_norm": 2.3027849966046383, "learning_rate": 1.3303417328270628e-05, "loss": 1.8432, "step": 1292 }, { "epoch": 1.2050326188257223, "grad_norm": 1.494718092771273, "learning_rate": 1.3296513634794617e-05, "loss": 1.9755, "step": 1293 }, { "epoch": 1.2059645852749301, "grad_norm": 1.4059501496411833, "learning_rate": 1.3289609941318606e-05, "loss": 2.3925, "step": 1294 }, { "epoch": 1.206896551724138, "grad_norm": 1.3374567272621716, "learning_rate": 1.3282706247842596e-05, "loss": 1.7479, "step": 1295 }, { "epoch": 1.2078285181733457, "grad_norm": 1.6496535729464434, "learning_rate": 1.3275802554366589e-05, "loss": 1.8413, "step": 1296 }, { "epoch": 1.2087604846225535, "grad_norm": 1.4205685648616593, "learning_rate": 1.3268898860890578e-05, "loss": 1.8559, "step": 1297 }, { "epoch": 1.2096924510717615, "grad_norm": 1.3866726943207237, "learning_rate": 1.3261995167414567e-05, "loss": 1.9108, "step": 1298 }, { "epoch": 1.2106244175209693, "grad_norm": 1.1271212944765896, "learning_rate": 1.3255091473938558e-05, "loss": 1.8432, "step": 1299 }, { "epoch": 1.211556383970177, "grad_norm": 1.0894145477438397, "learning_rate": 1.3248187780462548e-05, "loss": 1.7241, "step": 1300 }, { "epoch": 1.2124883504193849, "grad_norm": 1.2316542351655442, "learning_rate": 1.3241284086986539e-05, "loss": 2.1213, "step": 1301 }, { "epoch": 1.2134203168685926, "grad_norm": 2.656971351043641, "learning_rate": 1.3234380393510528e-05, "loss": 2.0266, "step": 1302 }, { "epoch": 1.2143522833178007, "grad_norm": 1.1933282897718829, "learning_rate": 1.322747670003452e-05, "loss": 1.7325, "step": 1303 }, { "epoch": 1.2152842497670084, "grad_norm": 1.2238833667698723, "learning_rate": 1.322057300655851e-05, "loss": 2.0257, "step": 1304 }, { "epoch": 1.2162162162162162, "grad_norm": 1.4800709594809276, "learning_rate": 1.32136693130825e-05, "loss": 2.0852, "step": 1305 }, { "epoch": 1.217148182665424, "grad_norm": 1.5487163546589853, "learning_rate": 1.320676561960649e-05, "loss": 1.8801, "step": 1306 }, { "epoch": 1.2180801491146318, "grad_norm": 1.4588056325798338, "learning_rate": 1.3199861926130482e-05, "loss": 2.0858, "step": 1307 }, { "epoch": 1.2190121155638396, "grad_norm": 1.5385883593673402, "learning_rate": 1.3192958232654471e-05, "loss": 2.0784, "step": 1308 }, { "epoch": 1.2199440820130476, "grad_norm": 1.1941283845313624, "learning_rate": 1.3186054539178462e-05, "loss": 1.6905, "step": 1309 }, { "epoch": 1.2208760484622554, "grad_norm": 1.3895057720157937, "learning_rate": 1.317915084570245e-05, "loss": 1.7296, "step": 1310 }, { "epoch": 1.2218080149114632, "grad_norm": 1.2919152575603918, "learning_rate": 1.3172247152226443e-05, "loss": 1.929, "step": 1311 }, { "epoch": 1.222739981360671, "grad_norm": 1.1799451139965964, "learning_rate": 1.3165343458750432e-05, "loss": 2.074, "step": 1312 }, { "epoch": 1.2236719478098788, "grad_norm": 1.215502060670599, "learning_rate": 1.3158439765274423e-05, "loss": 1.6795, "step": 1313 }, { "epoch": 1.2246039142590868, "grad_norm": 1.2337709145517946, "learning_rate": 1.3151536071798414e-05, "loss": 1.8454, "step": 1314 }, { "epoch": 1.2255358807082946, "grad_norm": 1.2484002039091708, "learning_rate": 1.3144632378322404e-05, "loss": 1.8602, "step": 1315 }, { "epoch": 1.2264678471575023, "grad_norm": 1.3054212931714104, "learning_rate": 1.3137728684846393e-05, "loss": 1.8727, "step": 1316 }, { "epoch": 1.2273998136067101, "grad_norm": 1.2385826099277035, "learning_rate": 1.3130824991370384e-05, "loss": 1.8756, "step": 1317 }, { "epoch": 1.228331780055918, "grad_norm": 1.188771366716739, "learning_rate": 1.3123921297894375e-05, "loss": 1.8525, "step": 1318 }, { "epoch": 1.2292637465051257, "grad_norm": 3.152365196186296, "learning_rate": 1.3117017604418366e-05, "loss": 2.0797, "step": 1319 }, { "epoch": 1.2301957129543337, "grad_norm": 1.5497479661293174, "learning_rate": 1.3110113910942355e-05, "loss": 1.9947, "step": 1320 }, { "epoch": 1.2311276794035415, "grad_norm": 1.3307697477280047, "learning_rate": 1.3103210217466345e-05, "loss": 1.8008, "step": 1321 }, { "epoch": 1.2320596458527493, "grad_norm": 1.1681998550471897, "learning_rate": 1.3096306523990336e-05, "loss": 1.6237, "step": 1322 }, { "epoch": 1.232991612301957, "grad_norm": 1.292287500766913, "learning_rate": 1.3089402830514327e-05, "loss": 1.9825, "step": 1323 }, { "epoch": 1.2339235787511649, "grad_norm": 1.2303707541241367, "learning_rate": 1.3082499137038316e-05, "loss": 1.8933, "step": 1324 }, { "epoch": 1.2348555452003729, "grad_norm": 1.347923609187294, "learning_rate": 1.3075595443562308e-05, "loss": 1.6939, "step": 1325 }, { "epoch": 1.2357875116495807, "grad_norm": 1.3717743219800653, "learning_rate": 1.3068691750086297e-05, "loss": 1.8102, "step": 1326 }, { "epoch": 1.2367194780987885, "grad_norm": 1.2505383047849008, "learning_rate": 1.3061788056610288e-05, "loss": 2.318, "step": 1327 }, { "epoch": 1.2376514445479962, "grad_norm": 1.1012076199277738, "learning_rate": 1.3054884363134277e-05, "loss": 1.614, "step": 1328 }, { "epoch": 1.238583410997204, "grad_norm": 1.2975596256687316, "learning_rate": 1.304798066965827e-05, "loss": 1.6696, "step": 1329 }, { "epoch": 1.2395153774464118, "grad_norm": 1.7432164980615548, "learning_rate": 1.3041076976182258e-05, "loss": 1.75, "step": 1330 }, { "epoch": 1.2404473438956198, "grad_norm": 1.2753427272191613, "learning_rate": 1.3034173282706248e-05, "loss": 2.1493, "step": 1331 }, { "epoch": 1.2413793103448276, "grad_norm": 1.3443112734016458, "learning_rate": 1.302726958923024e-05, "loss": 2.004, "step": 1332 }, { "epoch": 1.2423112767940354, "grad_norm": 1.354128525016168, "learning_rate": 1.302036589575423e-05, "loss": 1.7029, "step": 1333 }, { "epoch": 1.2432432432432432, "grad_norm": 1.2035138501961622, "learning_rate": 1.301346220227822e-05, "loss": 1.912, "step": 1334 }, { "epoch": 1.244175209692451, "grad_norm": 1.4242155740528515, "learning_rate": 1.3006558508802209e-05, "loss": 2.0113, "step": 1335 }, { "epoch": 1.245107176141659, "grad_norm": 1.2922736924454155, "learning_rate": 1.2999654815326201e-05, "loss": 2.2103, "step": 1336 }, { "epoch": 1.2460391425908668, "grad_norm": 1.1837101412039472, "learning_rate": 1.2992751121850192e-05, "loss": 1.8042, "step": 1337 }, { "epoch": 1.2469711090400746, "grad_norm": 1.1752667587711252, "learning_rate": 1.2985847428374181e-05, "loss": 2.0835, "step": 1338 }, { "epoch": 1.2479030754892824, "grad_norm": 1.3148099606083457, "learning_rate": 1.297894373489817e-05, "loss": 1.8041, "step": 1339 }, { "epoch": 1.2488350419384902, "grad_norm": 1.5358158079802153, "learning_rate": 1.2972040041422162e-05, "loss": 2.1344, "step": 1340 }, { "epoch": 1.249767008387698, "grad_norm": 1.3684106605259818, "learning_rate": 1.2965136347946151e-05, "loss": 2.0237, "step": 1341 }, { "epoch": 1.250698974836906, "grad_norm": 1.8277317481584103, "learning_rate": 1.2958232654470142e-05, "loss": 2.0373, "step": 1342 }, { "epoch": 1.2516309412861137, "grad_norm": 1.676842419755677, "learning_rate": 1.2951328960994135e-05, "loss": 1.6812, "step": 1343 }, { "epoch": 1.2525629077353215, "grad_norm": 1.7772904058885295, "learning_rate": 1.2944425267518124e-05, "loss": 2.0697, "step": 1344 }, { "epoch": 1.2534948741845293, "grad_norm": 1.2986542321045238, "learning_rate": 1.2937521574042113e-05, "loss": 1.8038, "step": 1345 }, { "epoch": 1.254426840633737, "grad_norm": 1.1924564320604532, "learning_rate": 1.2930617880566103e-05, "loss": 2.0335, "step": 1346 }, { "epoch": 1.2553588070829451, "grad_norm": 1.2749057779367017, "learning_rate": 1.2923714187090096e-05, "loss": 2.1651, "step": 1347 }, { "epoch": 1.256290773532153, "grad_norm": 1.3188565171996542, "learning_rate": 1.2916810493614085e-05, "loss": 1.9387, "step": 1348 }, { "epoch": 1.2572227399813607, "grad_norm": 1.035517653477476, "learning_rate": 1.2909906800138074e-05, "loss": 1.931, "step": 1349 }, { "epoch": 1.2581547064305685, "grad_norm": 1.15238307533602, "learning_rate": 1.2903003106662065e-05, "loss": 1.6289, "step": 1350 }, { "epoch": 1.2590866728797763, "grad_norm": 1.105166257296439, "learning_rate": 1.2896099413186055e-05, "loss": 1.5675, "step": 1351 }, { "epoch": 1.260018639328984, "grad_norm": 1.3562573542712215, "learning_rate": 1.2889195719710046e-05, "loss": 2.1324, "step": 1352 }, { "epoch": 1.2609506057781918, "grad_norm": 1.264900061146711, "learning_rate": 1.2882292026234035e-05, "loss": 1.9684, "step": 1353 }, { "epoch": 1.2618825722273999, "grad_norm": 1.3059302358911005, "learning_rate": 1.2875388332758028e-05, "loss": 2.1839, "step": 1354 }, { "epoch": 1.2628145386766076, "grad_norm": 1.4082953101533917, "learning_rate": 1.2868484639282017e-05, "loss": 1.8832, "step": 1355 }, { "epoch": 1.2637465051258154, "grad_norm": 1.3145963827454825, "learning_rate": 1.2861580945806007e-05, "loss": 1.9403, "step": 1356 }, { "epoch": 1.2646784715750232, "grad_norm": 1.2177940696079084, "learning_rate": 1.2854677252329996e-05, "loss": 2.0146, "step": 1357 }, { "epoch": 1.2656104380242312, "grad_norm": 1.233379363968971, "learning_rate": 1.2847773558853989e-05, "loss": 1.702, "step": 1358 }, { "epoch": 1.266542404473439, "grad_norm": 1.2545261315131269, "learning_rate": 1.2840869865377978e-05, "loss": 1.5333, "step": 1359 }, { "epoch": 1.2674743709226468, "grad_norm": 1.122467604962298, "learning_rate": 1.2833966171901969e-05, "loss": 1.8302, "step": 1360 }, { "epoch": 1.2684063373718546, "grad_norm": 1.1665754679753126, "learning_rate": 1.2827062478425958e-05, "loss": 1.9597, "step": 1361 }, { "epoch": 1.2693383038210624, "grad_norm": 1.4006369629817088, "learning_rate": 1.282015878494995e-05, "loss": 1.9311, "step": 1362 }, { "epoch": 1.2702702702702702, "grad_norm": 1.1010211148061284, "learning_rate": 1.2813255091473939e-05, "loss": 1.6075, "step": 1363 }, { "epoch": 1.271202236719478, "grad_norm": 1.2499709763069422, "learning_rate": 1.280635139799793e-05, "loss": 2.1111, "step": 1364 }, { "epoch": 1.272134203168686, "grad_norm": 1.253453930700239, "learning_rate": 1.279944770452192e-05, "loss": 1.6717, "step": 1365 }, { "epoch": 1.2730661696178938, "grad_norm": 1.4778315792861996, "learning_rate": 1.2792544011045911e-05, "loss": 1.8714, "step": 1366 }, { "epoch": 1.2739981360671015, "grad_norm": 1.4070451372820738, "learning_rate": 1.27856403175699e-05, "loss": 2.2937, "step": 1367 }, { "epoch": 1.2749301025163093, "grad_norm": 1.3007202004118041, "learning_rate": 1.2778736624093891e-05, "loss": 2.0751, "step": 1368 }, { "epoch": 1.2758620689655173, "grad_norm": 1.2507860051604491, "learning_rate": 1.2771832930617882e-05, "loss": 2.1275, "step": 1369 }, { "epoch": 1.2767940354147251, "grad_norm": 1.3673709170611286, "learning_rate": 1.2764929237141873e-05, "loss": 1.8573, "step": 1370 }, { "epoch": 1.277726001863933, "grad_norm": 1.4201393433470666, "learning_rate": 1.2758025543665862e-05, "loss": 1.9592, "step": 1371 }, { "epoch": 1.2786579683131407, "grad_norm": 1.171357896863135, "learning_rate": 1.2751121850189854e-05, "loss": 2.1442, "step": 1372 }, { "epoch": 1.2795899347623485, "grad_norm": 1.2750854471137594, "learning_rate": 1.2744218156713843e-05, "loss": 2.2239, "step": 1373 }, { "epoch": 1.2805219012115563, "grad_norm": 1.1611868242034864, "learning_rate": 1.2737314463237834e-05, "loss": 1.7502, "step": 1374 }, { "epoch": 1.281453867660764, "grad_norm": 1.0924930077548647, "learning_rate": 1.2730410769761823e-05, "loss": 2.023, "step": 1375 }, { "epoch": 1.282385834109972, "grad_norm": 1.3409216783470406, "learning_rate": 1.2723507076285815e-05, "loss": 1.753, "step": 1376 }, { "epoch": 1.2833178005591799, "grad_norm": 1.385783173673885, "learning_rate": 1.2716603382809804e-05, "loss": 1.8706, "step": 1377 }, { "epoch": 1.2842497670083877, "grad_norm": 1.3199728769419887, "learning_rate": 1.2709699689333795e-05, "loss": 1.7417, "step": 1378 }, { "epoch": 1.2851817334575955, "grad_norm": 1.5975904979216649, "learning_rate": 1.2702795995857784e-05, "loss": 1.8371, "step": 1379 }, { "epoch": 1.2861136999068035, "grad_norm": 1.40826074362029, "learning_rate": 1.2695892302381776e-05, "loss": 1.9512, "step": 1380 }, { "epoch": 1.2870456663560113, "grad_norm": 1.285953063571069, "learning_rate": 1.2688988608905766e-05, "loss": 2.1017, "step": 1381 }, { "epoch": 1.287977632805219, "grad_norm": 1.2027419363064864, "learning_rate": 1.2682084915429755e-05, "loss": 1.7036, "step": 1382 }, { "epoch": 1.2889095992544268, "grad_norm": 1.37387121794541, "learning_rate": 1.2675181221953747e-05, "loss": 1.9205, "step": 1383 }, { "epoch": 1.2898415657036346, "grad_norm": 1.389694392562109, "learning_rate": 1.2668277528477738e-05, "loss": 1.9593, "step": 1384 }, { "epoch": 1.2907735321528424, "grad_norm": 1.2691793924395967, "learning_rate": 1.2661373835001727e-05, "loss": 1.9152, "step": 1385 }, { "epoch": 1.2917054986020502, "grad_norm": 1.2899066982450558, "learning_rate": 1.2654470141525716e-05, "loss": 2.07, "step": 1386 }, { "epoch": 1.2926374650512582, "grad_norm": 1.431231624196526, "learning_rate": 1.2647566448049708e-05, "loss": 1.6666, "step": 1387 }, { "epoch": 1.293569431500466, "grad_norm": 1.2180223051018542, "learning_rate": 1.2640662754573699e-05, "loss": 1.623, "step": 1388 }, { "epoch": 1.2945013979496738, "grad_norm": 1.342568117031489, "learning_rate": 1.2633759061097688e-05, "loss": 1.7466, "step": 1389 }, { "epoch": 1.2954333643988816, "grad_norm": 1.2308945627709111, "learning_rate": 1.2626855367621677e-05, "loss": 1.8386, "step": 1390 }, { "epoch": 1.2963653308480896, "grad_norm": 1.5611106190044777, "learning_rate": 1.261995167414567e-05, "loss": 1.8765, "step": 1391 }, { "epoch": 1.2972972972972974, "grad_norm": 1.3472874190342574, "learning_rate": 1.2613047980669658e-05, "loss": 2.1245, "step": 1392 }, { "epoch": 1.2982292637465052, "grad_norm": 1.6011608420472356, "learning_rate": 1.260614428719365e-05, "loss": 1.8438, "step": 1393 }, { "epoch": 1.299161230195713, "grad_norm": 1.2264222882593008, "learning_rate": 1.2599240593717642e-05, "loss": 1.8587, "step": 1394 }, { "epoch": 1.3000931966449207, "grad_norm": 1.2326943728534256, "learning_rate": 1.259233690024163e-05, "loss": 1.8797, "step": 1395 }, { "epoch": 1.3010251630941285, "grad_norm": 1.4216916856748862, "learning_rate": 1.258543320676562e-05, "loss": 2.104, "step": 1396 }, { "epoch": 1.3019571295433363, "grad_norm": 1.2900192986640446, "learning_rate": 1.257852951328961e-05, "loss": 1.7538, "step": 1397 }, { "epoch": 1.3028890959925443, "grad_norm": 1.2706056421015304, "learning_rate": 1.2571625819813603e-05, "loss": 1.3843, "step": 1398 }, { "epoch": 1.303821062441752, "grad_norm": 1.3571414557772365, "learning_rate": 1.2564722126337592e-05, "loss": 1.857, "step": 1399 }, { "epoch": 1.30475302889096, "grad_norm": 1.0894588268684844, "learning_rate": 1.2557818432861581e-05, "loss": 1.7033, "step": 1400 }, { "epoch": 1.3056849953401677, "grad_norm": 1.4094477533249303, "learning_rate": 1.2550914739385572e-05, "loss": 1.6727, "step": 1401 }, { "epoch": 1.3066169617893757, "grad_norm": 1.1793186270707785, "learning_rate": 1.2544011045909562e-05, "loss": 1.7682, "step": 1402 }, { "epoch": 1.3075489282385835, "grad_norm": 1.2880969725057896, "learning_rate": 1.2537107352433553e-05, "loss": 1.8855, "step": 1403 }, { "epoch": 1.3084808946877913, "grad_norm": 1.3313742677686242, "learning_rate": 1.2530203658957542e-05, "loss": 2.0384, "step": 1404 }, { "epoch": 1.309412861136999, "grad_norm": 1.6317455167808899, "learning_rate": 1.2523299965481535e-05, "loss": 1.8746, "step": 1405 }, { "epoch": 1.3103448275862069, "grad_norm": 1.3025847246785711, "learning_rate": 1.2516396272005524e-05, "loss": 2.1336, "step": 1406 }, { "epoch": 1.3112767940354146, "grad_norm": 1.4744097990099896, "learning_rate": 1.2509492578529514e-05, "loss": 2.0765, "step": 1407 }, { "epoch": 1.3122087604846224, "grad_norm": 1.3289023080574909, "learning_rate": 1.2502588885053503e-05, "loss": 2.1968, "step": 1408 }, { "epoch": 1.3131407269338304, "grad_norm": 1.5794017278156827, "learning_rate": 1.2495685191577496e-05, "loss": 1.8896, "step": 1409 }, { "epoch": 1.3140726933830382, "grad_norm": 1.3698060941333559, "learning_rate": 1.2488781498101485e-05, "loss": 2.0111, "step": 1410 }, { "epoch": 1.315004659832246, "grad_norm": 1.488397938087097, "learning_rate": 1.2481877804625476e-05, "loss": 1.8815, "step": 1411 }, { "epoch": 1.3159366262814538, "grad_norm": 1.4579445348620088, "learning_rate": 1.2474974111149465e-05, "loss": 1.683, "step": 1412 }, { "epoch": 1.3168685927306618, "grad_norm": 1.4014077408960248, "learning_rate": 1.2468070417673457e-05, "loss": 2.0609, "step": 1413 }, { "epoch": 1.3178005591798696, "grad_norm": 1.7428611477734324, "learning_rate": 1.2461166724197446e-05, "loss": 1.847, "step": 1414 }, { "epoch": 1.3187325256290774, "grad_norm": 1.215047659653327, "learning_rate": 1.2454263030721437e-05, "loss": 2.1532, "step": 1415 }, { "epoch": 1.3196644920782852, "grad_norm": 1.1652669310778303, "learning_rate": 1.2447359337245428e-05, "loss": 1.8622, "step": 1416 }, { "epoch": 1.320596458527493, "grad_norm": 1.1069156469427726, "learning_rate": 1.2440455643769418e-05, "loss": 1.8234, "step": 1417 }, { "epoch": 1.3215284249767008, "grad_norm": 1.2452967259793053, "learning_rate": 1.2433551950293407e-05, "loss": 1.8919, "step": 1418 }, { "epoch": 1.3224603914259085, "grad_norm": 1.2101473373413407, "learning_rate": 1.2426648256817398e-05, "loss": 1.9974, "step": 1419 }, { "epoch": 1.3233923578751166, "grad_norm": 1.3009086493687274, "learning_rate": 1.2419744563341389e-05, "loss": 1.9832, "step": 1420 }, { "epoch": 1.3243243243243243, "grad_norm": 1.4769667487113611, "learning_rate": 1.241284086986538e-05, "loss": 2.1078, "step": 1421 }, { "epoch": 1.3252562907735321, "grad_norm": 1.1977363838429957, "learning_rate": 1.2405937176389369e-05, "loss": 1.6198, "step": 1422 }, { "epoch": 1.32618825722274, "grad_norm": 1.3102600596772147, "learning_rate": 1.2399033482913361e-05, "loss": 2.0126, "step": 1423 }, { "epoch": 1.327120223671948, "grad_norm": 1.4144902027722783, "learning_rate": 1.239212978943735e-05, "loss": 1.8292, "step": 1424 }, { "epoch": 1.3280521901211557, "grad_norm": 1.0216027465952386, "learning_rate": 1.238522609596134e-05, "loss": 1.6946, "step": 1425 }, { "epoch": 1.3289841565703635, "grad_norm": 1.3398513731421644, "learning_rate": 1.237832240248533e-05, "loss": 2.0034, "step": 1426 }, { "epoch": 1.3299161230195713, "grad_norm": 1.3258144397417604, "learning_rate": 1.2371418709009322e-05, "loss": 1.8871, "step": 1427 }, { "epoch": 1.330848089468779, "grad_norm": 1.149547499718509, "learning_rate": 1.2364515015533311e-05, "loss": 1.9607, "step": 1428 }, { "epoch": 1.3317800559179869, "grad_norm": 1.2203953995666812, "learning_rate": 1.2357611322057302e-05, "loss": 2.0372, "step": 1429 }, { "epoch": 1.3327120223671947, "grad_norm": 1.4363567123355305, "learning_rate": 1.2350707628581291e-05, "loss": 1.9891, "step": 1430 }, { "epoch": 1.3336439888164027, "grad_norm": 1.3351633868147834, "learning_rate": 1.2343803935105284e-05, "loss": 1.9463, "step": 1431 }, { "epoch": 1.3345759552656105, "grad_norm": 1.1858169228734907, "learning_rate": 1.2336900241629273e-05, "loss": 2.0922, "step": 1432 }, { "epoch": 1.3355079217148182, "grad_norm": 1.3084292288033532, "learning_rate": 1.2329996548153262e-05, "loss": 1.664, "step": 1433 }, { "epoch": 1.336439888164026, "grad_norm": 1.5866835609542613, "learning_rate": 1.2323092854677254e-05, "loss": 1.5097, "step": 1434 }, { "epoch": 1.337371854613234, "grad_norm": 1.4161272520717532, "learning_rate": 1.2316189161201245e-05, "loss": 1.8883, "step": 1435 }, { "epoch": 1.3383038210624418, "grad_norm": 1.1812073254374924, "learning_rate": 1.2309285467725234e-05, "loss": 1.9483, "step": 1436 }, { "epoch": 1.3392357875116496, "grad_norm": 1.9040104801380937, "learning_rate": 1.2302381774249223e-05, "loss": 1.8611, "step": 1437 }, { "epoch": 1.3401677539608574, "grad_norm": 1.2220112096397957, "learning_rate": 1.2295478080773215e-05, "loss": 1.7952, "step": 1438 }, { "epoch": 1.3410997204100652, "grad_norm": 1.2297188243984611, "learning_rate": 1.2288574387297206e-05, "loss": 2.1685, "step": 1439 }, { "epoch": 1.342031686859273, "grad_norm": 1.1303118060018835, "learning_rate": 1.2281670693821195e-05, "loss": 2.2517, "step": 1440 }, { "epoch": 1.3429636533084808, "grad_norm": 2.402504511173153, "learning_rate": 1.2274767000345184e-05, "loss": 2.1275, "step": 1441 }, { "epoch": 1.3438956197576888, "grad_norm": 1.3832163088439207, "learning_rate": 1.2267863306869176e-05, "loss": 1.9486, "step": 1442 }, { "epoch": 1.3448275862068966, "grad_norm": 1.0793650052100332, "learning_rate": 1.2260959613393166e-05, "loss": 1.6864, "step": 1443 }, { "epoch": 1.3457595526561044, "grad_norm": 1.5597323046532898, "learning_rate": 1.2254055919917156e-05, "loss": 1.9665, "step": 1444 }, { "epoch": 1.3466915191053122, "grad_norm": 1.165133667690362, "learning_rate": 1.2247152226441149e-05, "loss": 2.0503, "step": 1445 }, { "epoch": 1.3476234855545202, "grad_norm": 1.2242812224699748, "learning_rate": 1.2240248532965138e-05, "loss": 2.0542, "step": 1446 }, { "epoch": 1.348555452003728, "grad_norm": 1.2993359795415695, "learning_rate": 1.2233344839489127e-05, "loss": 2.0812, "step": 1447 }, { "epoch": 1.3494874184529357, "grad_norm": 1.5013265160216605, "learning_rate": 1.2226441146013117e-05, "loss": 1.7076, "step": 1448 }, { "epoch": 1.3504193849021435, "grad_norm": 1.0557062696278812, "learning_rate": 1.221953745253711e-05, "loss": 1.7992, "step": 1449 }, { "epoch": 1.3513513513513513, "grad_norm": 1.1889226229936667, "learning_rate": 1.2212633759061099e-05, "loss": 1.7142, "step": 1450 }, { "epoch": 1.352283317800559, "grad_norm": 1.4208967942422557, "learning_rate": 1.2205730065585088e-05, "loss": 1.8911, "step": 1451 }, { "epoch": 1.353215284249767, "grad_norm": 1.6323551388257511, "learning_rate": 1.2198826372109079e-05, "loss": 1.8003, "step": 1452 }, { "epoch": 1.354147250698975, "grad_norm": 1.071824422602992, "learning_rate": 1.219192267863307e-05, "loss": 1.4225, "step": 1453 }, { "epoch": 1.3550792171481827, "grad_norm": 0.9847647447907405, "learning_rate": 1.218501898515706e-05, "loss": 1.5582, "step": 1454 }, { "epoch": 1.3560111835973905, "grad_norm": 1.3664614461026094, "learning_rate": 1.217811529168105e-05, "loss": 1.7261, "step": 1455 }, { "epoch": 1.3569431500465983, "grad_norm": 1.2375581367614685, "learning_rate": 1.2171211598205042e-05, "loss": 1.728, "step": 1456 }, { "epoch": 1.3578751164958063, "grad_norm": 1.5873884782217629, "learning_rate": 1.216430790472903e-05, "loss": 1.8877, "step": 1457 }, { "epoch": 1.358807082945014, "grad_norm": 1.3773666479509696, "learning_rate": 1.2157404211253021e-05, "loss": 1.8005, "step": 1458 }, { "epoch": 1.3597390493942219, "grad_norm": 1.536587777131898, "learning_rate": 1.215050051777701e-05, "loss": 2.3013, "step": 1459 }, { "epoch": 1.3606710158434296, "grad_norm": 1.296230659976895, "learning_rate": 1.2143596824301003e-05, "loss": 2.0412, "step": 1460 }, { "epoch": 1.3616029822926374, "grad_norm": 1.0960726248250323, "learning_rate": 1.2136693130824992e-05, "loss": 1.6543, "step": 1461 }, { "epoch": 1.3625349487418452, "grad_norm": 1.7985120553675007, "learning_rate": 1.2129789437348983e-05, "loss": 1.8893, "step": 1462 }, { "epoch": 1.363466915191053, "grad_norm": 1.3450874625920135, "learning_rate": 1.2122885743872973e-05, "loss": 2.0985, "step": 1463 }, { "epoch": 1.364398881640261, "grad_norm": 1.2937900638303497, "learning_rate": 1.2115982050396964e-05, "loss": 1.6562, "step": 1464 }, { "epoch": 1.3653308480894688, "grad_norm": 1.3172815507664946, "learning_rate": 1.2109078356920953e-05, "loss": 1.8348, "step": 1465 }, { "epoch": 1.3662628145386766, "grad_norm": 1.2387381237283601, "learning_rate": 1.2102174663444944e-05, "loss": 2.142, "step": 1466 }, { "epoch": 1.3671947809878844, "grad_norm": 1.4371669901546729, "learning_rate": 1.2095270969968935e-05, "loss": 1.758, "step": 1467 }, { "epoch": 1.3681267474370924, "grad_norm": 1.308215568934512, "learning_rate": 1.2088367276492925e-05, "loss": 2.1512, "step": 1468 }, { "epoch": 1.3690587138863002, "grad_norm": 1.4398951916869127, "learning_rate": 1.2081463583016914e-05, "loss": 1.8702, "step": 1469 }, { "epoch": 1.369990680335508, "grad_norm": 1.3586567881010627, "learning_rate": 1.2074559889540905e-05, "loss": 2.322, "step": 1470 }, { "epoch": 1.3709226467847158, "grad_norm": 1.433214292797049, "learning_rate": 1.2067656196064896e-05, "loss": 1.758, "step": 1471 }, { "epoch": 1.3718546132339235, "grad_norm": 1.2433374613791002, "learning_rate": 1.2060752502588887e-05, "loss": 2.1706, "step": 1472 }, { "epoch": 1.3727865796831313, "grad_norm": 1.1442320625304256, "learning_rate": 1.2053848809112876e-05, "loss": 1.9313, "step": 1473 }, { "epoch": 1.3737185461323391, "grad_norm": 1.1757781439752661, "learning_rate": 1.2046945115636868e-05, "loss": 1.8126, "step": 1474 }, { "epoch": 1.3746505125815471, "grad_norm": 1.3922279273477163, "learning_rate": 1.2040041422160857e-05, "loss": 1.8722, "step": 1475 }, { "epoch": 1.375582479030755, "grad_norm": 1.1772733406636533, "learning_rate": 1.2033137728684848e-05, "loss": 1.8373, "step": 1476 }, { "epoch": 1.3765144454799627, "grad_norm": 1.130348165046684, "learning_rate": 1.2026234035208837e-05, "loss": 1.6383, "step": 1477 }, { "epoch": 1.3774464119291705, "grad_norm": 1.1999114930139245, "learning_rate": 1.201933034173283e-05, "loss": 2.0092, "step": 1478 }, { "epoch": 1.3783783783783785, "grad_norm": 1.1960809224075906, "learning_rate": 1.2012426648256818e-05, "loss": 1.5114, "step": 1479 }, { "epoch": 1.3793103448275863, "grad_norm": 1.0878561864373026, "learning_rate": 1.2005522954780809e-05, "loss": 2.0232, "step": 1480 }, { "epoch": 1.380242311276794, "grad_norm": 1.6107064862461977, "learning_rate": 1.1998619261304798e-05, "loss": 1.9246, "step": 1481 }, { "epoch": 1.3811742777260019, "grad_norm": 1.2326404307613816, "learning_rate": 1.199171556782879e-05, "loss": 1.9464, "step": 1482 }, { "epoch": 1.3821062441752097, "grad_norm": 1.2684677823923212, "learning_rate": 1.198481187435278e-05, "loss": 1.8877, "step": 1483 }, { "epoch": 1.3830382106244175, "grad_norm": 1.2819537451345384, "learning_rate": 1.1977908180876769e-05, "loss": 2.0975, "step": 1484 }, { "epoch": 1.3839701770736252, "grad_norm": 1.4256565460941129, "learning_rate": 1.1971004487400761e-05, "loss": 2.0241, "step": 1485 }, { "epoch": 1.3849021435228333, "grad_norm": 1.3540488058565208, "learning_rate": 1.1964100793924752e-05, "loss": 1.9314, "step": 1486 }, { "epoch": 1.385834109972041, "grad_norm": 1.2236978093789048, "learning_rate": 1.195719710044874e-05, "loss": 1.5037, "step": 1487 }, { "epoch": 1.3867660764212488, "grad_norm": 1.0978051684227874, "learning_rate": 1.195029340697273e-05, "loss": 1.656, "step": 1488 }, { "epoch": 1.3876980428704566, "grad_norm": 1.3556743424758229, "learning_rate": 1.1943389713496722e-05, "loss": 1.7856, "step": 1489 }, { "epoch": 1.3886300093196646, "grad_norm": 1.194417782576632, "learning_rate": 1.1936486020020713e-05, "loss": 2.0004, "step": 1490 }, { "epoch": 1.3895619757688724, "grad_norm": 1.19044767906045, "learning_rate": 1.1929582326544702e-05, "loss": 1.6954, "step": 1491 }, { "epoch": 1.3904939422180802, "grad_norm": 1.307069835110556, "learning_rate": 1.1922678633068691e-05, "loss": 1.9328, "step": 1492 }, { "epoch": 1.391425908667288, "grad_norm": 1.4497119471951496, "learning_rate": 1.1915774939592684e-05, "loss": 1.9657, "step": 1493 }, { "epoch": 1.3923578751164958, "grad_norm": 1.4077624971349454, "learning_rate": 1.1908871246116673e-05, "loss": 1.7878, "step": 1494 }, { "epoch": 1.3932898415657036, "grad_norm": 1.2293908785407133, "learning_rate": 1.1901967552640663e-05, "loss": 1.8601, "step": 1495 }, { "epoch": 1.3942218080149114, "grad_norm": 1.3376080741077203, "learning_rate": 1.1895063859164656e-05, "loss": 1.9392, "step": 1496 }, { "epoch": 1.3951537744641194, "grad_norm": 1.4342783861547506, "learning_rate": 1.1888160165688645e-05, "loss": 1.9556, "step": 1497 }, { "epoch": 1.3960857409133272, "grad_norm": 1.204834902700503, "learning_rate": 1.1881256472212634e-05, "loss": 1.6743, "step": 1498 }, { "epoch": 1.397017707362535, "grad_norm": 1.2134729560908184, "learning_rate": 1.1874352778736625e-05, "loss": 1.8338, "step": 1499 }, { "epoch": 1.3979496738117427, "grad_norm": 1.5496542113815162, "learning_rate": 1.1867449085260617e-05, "loss": 1.872, "step": 1500 }, { "epoch": 1.3988816402609507, "grad_norm": 1.4881037428232495, "learning_rate": 1.1860545391784606e-05, "loss": 1.7841, "step": 1501 }, { "epoch": 1.3998136067101585, "grad_norm": 1.3740244876639733, "learning_rate": 1.1853641698308595e-05, "loss": 1.8517, "step": 1502 }, { "epoch": 1.4007455731593663, "grad_norm": 1.137656478027823, "learning_rate": 1.1846738004832586e-05, "loss": 1.8719, "step": 1503 }, { "epoch": 1.401677539608574, "grad_norm": 1.2675568624083682, "learning_rate": 1.1839834311356576e-05, "loss": 1.7785, "step": 1504 }, { "epoch": 1.402609506057782, "grad_norm": 1.453565449347857, "learning_rate": 1.1832930617880567e-05, "loss": 2.1672, "step": 1505 }, { "epoch": 1.4035414725069897, "grad_norm": 1.216810202303098, "learning_rate": 1.1826026924404556e-05, "loss": 2.0674, "step": 1506 }, { "epoch": 1.4044734389561975, "grad_norm": 1.4623713033017263, "learning_rate": 1.1819123230928549e-05, "loss": 2.1653, "step": 1507 }, { "epoch": 1.4054054054054055, "grad_norm": 1.4235971561985654, "learning_rate": 1.1812219537452538e-05, "loss": 1.6756, "step": 1508 }, { "epoch": 1.4063373718546133, "grad_norm": 1.7752099756296802, "learning_rate": 1.1805315843976528e-05, "loss": 2.4021, "step": 1509 }, { "epoch": 1.407269338303821, "grad_norm": 1.2909270783452227, "learning_rate": 1.1798412150500517e-05, "loss": 1.7572, "step": 1510 }, { "epoch": 1.4082013047530288, "grad_norm": 1.3890980298629116, "learning_rate": 1.179150845702451e-05, "loss": 1.8226, "step": 1511 }, { "epoch": 1.4091332712022366, "grad_norm": 1.0683893752893945, "learning_rate": 1.1784604763548499e-05, "loss": 1.4897, "step": 1512 }, { "epoch": 1.4100652376514446, "grad_norm": 1.4050320325345365, "learning_rate": 1.177770107007249e-05, "loss": 1.9759, "step": 1513 }, { "epoch": 1.4109972041006524, "grad_norm": 1.4785520288738523, "learning_rate": 1.177079737659648e-05, "loss": 2.0845, "step": 1514 }, { "epoch": 1.4119291705498602, "grad_norm": 1.4771787865008914, "learning_rate": 1.1763893683120471e-05, "loss": 1.6236, "step": 1515 }, { "epoch": 1.412861136999068, "grad_norm": 1.5021446558346356, "learning_rate": 1.175698998964446e-05, "loss": 1.8267, "step": 1516 }, { "epoch": 1.4137931034482758, "grad_norm": 1.0434564721703672, "learning_rate": 1.1750086296168451e-05, "loss": 1.9215, "step": 1517 }, { "epoch": 1.4147250698974836, "grad_norm": 1.1477312334624479, "learning_rate": 1.1743182602692442e-05, "loss": 1.8686, "step": 1518 }, { "epoch": 1.4156570363466916, "grad_norm": 1.6484766864071105, "learning_rate": 1.1736278909216432e-05, "loss": 1.6681, "step": 1519 }, { "epoch": 1.4165890027958994, "grad_norm": 1.2785706228140183, "learning_rate": 1.1729375215740421e-05, "loss": 1.7617, "step": 1520 }, { "epoch": 1.4175209692451072, "grad_norm": 1.0885067728304532, "learning_rate": 1.1722471522264412e-05, "loss": 1.9589, "step": 1521 }, { "epoch": 1.418452935694315, "grad_norm": 2.1550544518535384, "learning_rate": 1.1715567828788403e-05, "loss": 2.2207, "step": 1522 }, { "epoch": 1.4193849021435228, "grad_norm": 1.1797966346154967, "learning_rate": 1.1708664135312394e-05, "loss": 1.777, "step": 1523 }, { "epoch": 1.4203168685927308, "grad_norm": 2.1059387650170973, "learning_rate": 1.1701760441836383e-05, "loss": 1.836, "step": 1524 }, { "epoch": 1.4212488350419386, "grad_norm": 1.4671163657378663, "learning_rate": 1.1694856748360375e-05, "loss": 1.9289, "step": 1525 }, { "epoch": 1.4221808014911463, "grad_norm": 1.5897236351496633, "learning_rate": 1.1687953054884364e-05, "loss": 2.1987, "step": 1526 }, { "epoch": 1.4231127679403541, "grad_norm": 1.3842864096173588, "learning_rate": 1.1681049361408355e-05, "loss": 1.9496, "step": 1527 }, { "epoch": 1.424044734389562, "grad_norm": 1.4903485868867357, "learning_rate": 1.1674145667932344e-05, "loss": 1.8247, "step": 1528 }, { "epoch": 1.4249767008387697, "grad_norm": 1.3502719582243954, "learning_rate": 1.1667241974456336e-05, "loss": 1.5899, "step": 1529 }, { "epoch": 1.4259086672879777, "grad_norm": 1.565111107953919, "learning_rate": 1.1660338280980325e-05, "loss": 1.941, "step": 1530 }, { "epoch": 1.4268406337371855, "grad_norm": 1.2682203273426802, "learning_rate": 1.1653434587504316e-05, "loss": 1.6997, "step": 1531 }, { "epoch": 1.4277726001863933, "grad_norm": 1.5908531771660244, "learning_rate": 1.1646530894028305e-05, "loss": 2.1759, "step": 1532 }, { "epoch": 1.428704566635601, "grad_norm": 1.466465315432323, "learning_rate": 1.1639627200552298e-05, "loss": 1.9366, "step": 1533 }, { "epoch": 1.4296365330848089, "grad_norm": 1.3086690420942706, "learning_rate": 1.1632723507076287e-05, "loss": 1.9265, "step": 1534 }, { "epoch": 1.4305684995340169, "grad_norm": 1.175794542116252, "learning_rate": 1.1625819813600276e-05, "loss": 1.8483, "step": 1535 }, { "epoch": 1.4315004659832247, "grad_norm": 1.5118764775220055, "learning_rate": 1.1618916120124268e-05, "loss": 1.5796, "step": 1536 }, { "epoch": 1.4324324324324325, "grad_norm": 1.2327165491799237, "learning_rate": 1.1612012426648259e-05, "loss": 2.0657, "step": 1537 }, { "epoch": 1.4333643988816402, "grad_norm": 1.149066183196232, "learning_rate": 1.1605108733172248e-05, "loss": 1.8362, "step": 1538 }, { "epoch": 1.434296365330848, "grad_norm": 1.297298576862864, "learning_rate": 1.1598205039696237e-05, "loss": 1.9683, "step": 1539 }, { "epoch": 1.4352283317800558, "grad_norm": 1.2119540819602375, "learning_rate": 1.159130134622023e-05, "loss": 1.855, "step": 1540 }, { "epoch": 1.4361602982292636, "grad_norm": 1.1461554285322995, "learning_rate": 1.158439765274422e-05, "loss": 1.7932, "step": 1541 }, { "epoch": 1.4370922646784716, "grad_norm": 1.5252042469229263, "learning_rate": 1.1577493959268209e-05, "loss": 1.9595, "step": 1542 }, { "epoch": 1.4380242311276794, "grad_norm": 2.5384430977130537, "learning_rate": 1.1570590265792198e-05, "loss": 2.0625, "step": 1543 }, { "epoch": 1.4389561975768872, "grad_norm": 1.2247789276295105, "learning_rate": 1.156368657231619e-05, "loss": 1.8309, "step": 1544 }, { "epoch": 1.439888164026095, "grad_norm": 1.2690564432193898, "learning_rate": 1.155678287884018e-05, "loss": 1.7186, "step": 1545 }, { "epoch": 1.440820130475303, "grad_norm": 1.2247782070710012, "learning_rate": 1.154987918536417e-05, "loss": 1.8984, "step": 1546 }, { "epoch": 1.4417520969245108, "grad_norm": 1.006398898788065, "learning_rate": 1.1542975491888163e-05, "loss": 1.676, "step": 1547 }, { "epoch": 1.4426840633737186, "grad_norm": 1.6713501996028841, "learning_rate": 1.1536071798412152e-05, "loss": 1.7754, "step": 1548 }, { "epoch": 1.4436160298229264, "grad_norm": 1.2426450272627771, "learning_rate": 1.152916810493614e-05, "loss": 1.9574, "step": 1549 }, { "epoch": 1.4445479962721341, "grad_norm": 1.3164428416161023, "learning_rate": 1.1522264411460132e-05, "loss": 1.954, "step": 1550 }, { "epoch": 1.445479962721342, "grad_norm": 1.6246293211592662, "learning_rate": 1.1515360717984124e-05, "loss": 1.9457, "step": 1551 }, { "epoch": 1.4464119291705497, "grad_norm": 1.2007017592108489, "learning_rate": 1.1508457024508113e-05, "loss": 1.7986, "step": 1552 }, { "epoch": 1.4473438956197577, "grad_norm": 1.1169157590606154, "learning_rate": 1.1501553331032102e-05, "loss": 1.6345, "step": 1553 }, { "epoch": 1.4482758620689655, "grad_norm": 1.1922559637000345, "learning_rate": 1.1494649637556094e-05, "loss": 2.0656, "step": 1554 }, { "epoch": 1.4492078285181733, "grad_norm": 1.302786769120491, "learning_rate": 1.1487745944080084e-05, "loss": 1.7273, "step": 1555 }, { "epoch": 1.450139794967381, "grad_norm": 1.1267676696602524, "learning_rate": 1.1480842250604074e-05, "loss": 2.0179, "step": 1556 }, { "epoch": 1.4510717614165891, "grad_norm": 1.0234272933294717, "learning_rate": 1.1473938557128063e-05, "loss": 2.041, "step": 1557 }, { "epoch": 1.452003727865797, "grad_norm": 1.188957949198127, "learning_rate": 1.1467034863652056e-05, "loss": 1.9598, "step": 1558 }, { "epoch": 1.4529356943150047, "grad_norm": 1.6304576871469691, "learning_rate": 1.1460131170176045e-05, "loss": 1.8077, "step": 1559 }, { "epoch": 1.4538676607642125, "grad_norm": 1.2466871358630027, "learning_rate": 1.1453227476700035e-05, "loss": 1.6692, "step": 1560 }, { "epoch": 1.4547996272134203, "grad_norm": 1.7190664061363612, "learning_rate": 1.1446323783224025e-05, "loss": 1.8699, "step": 1561 }, { "epoch": 1.455731593662628, "grad_norm": 1.2276326178797445, "learning_rate": 1.1439420089748017e-05, "loss": 1.9435, "step": 1562 }, { "epoch": 1.4566635601118358, "grad_norm": 1.6485759568820928, "learning_rate": 1.1432516396272006e-05, "loss": 1.8804, "step": 1563 }, { "epoch": 1.4575955265610439, "grad_norm": 1.4569991948717687, "learning_rate": 1.1425612702795997e-05, "loss": 2.2616, "step": 1564 }, { "epoch": 1.4585274930102516, "grad_norm": 1.2300438114063887, "learning_rate": 1.1418709009319987e-05, "loss": 1.7532, "step": 1565 }, { "epoch": 1.4594594594594594, "grad_norm": 1.245340878874392, "learning_rate": 1.1411805315843978e-05, "loss": 1.9478, "step": 1566 }, { "epoch": 1.4603914259086672, "grad_norm": 1.588876515437661, "learning_rate": 1.1404901622367967e-05, "loss": 1.7032, "step": 1567 }, { "epoch": 1.4613233923578752, "grad_norm": 1.4291536965523537, "learning_rate": 1.1397997928891958e-05, "loss": 2.0643, "step": 1568 }, { "epoch": 1.462255358807083, "grad_norm": 1.453495255747211, "learning_rate": 1.1391094235415949e-05, "loss": 2.1325, "step": 1569 }, { "epoch": 1.4631873252562908, "grad_norm": 1.463269294390816, "learning_rate": 1.138419054193994e-05, "loss": 2.0468, "step": 1570 }, { "epoch": 1.4641192917054986, "grad_norm": 1.2917525236387877, "learning_rate": 1.1377286848463928e-05, "loss": 1.7545, "step": 1571 }, { "epoch": 1.4650512581547064, "grad_norm": 1.2132775236282705, "learning_rate": 1.137038315498792e-05, "loss": 1.8839, "step": 1572 }, { "epoch": 1.4659832246039142, "grad_norm": 1.226638623946191, "learning_rate": 1.136347946151191e-05, "loss": 1.7299, "step": 1573 }, { "epoch": 1.466915191053122, "grad_norm": 1.257161293830846, "learning_rate": 1.13565757680359e-05, "loss": 1.9138, "step": 1574 }, { "epoch": 1.46784715750233, "grad_norm": 1.3581282730748743, "learning_rate": 1.134967207455989e-05, "loss": 1.9369, "step": 1575 }, { "epoch": 1.4687791239515378, "grad_norm": 1.5973739833996385, "learning_rate": 1.1342768381083882e-05, "loss": 1.8157, "step": 1576 }, { "epoch": 1.4697110904007455, "grad_norm": 1.4557459958364332, "learning_rate": 1.1335864687607871e-05, "loss": 1.9073, "step": 1577 }, { "epoch": 1.4706430568499533, "grad_norm": 1.1611655078259997, "learning_rate": 1.1328960994131862e-05, "loss": 1.9877, "step": 1578 }, { "epoch": 1.4715750232991613, "grad_norm": 1.6176724200856851, "learning_rate": 1.1322057300655851e-05, "loss": 2.1491, "step": 1579 }, { "epoch": 1.4725069897483691, "grad_norm": 1.3080297013778157, "learning_rate": 1.1315153607179843e-05, "loss": 1.8677, "step": 1580 }, { "epoch": 1.473438956197577, "grad_norm": 2.658628393011773, "learning_rate": 1.1308249913703832e-05, "loss": 1.6541, "step": 1581 }, { "epoch": 1.4743709226467847, "grad_norm": 1.592856782192119, "learning_rate": 1.1301346220227823e-05, "loss": 1.7683, "step": 1582 }, { "epoch": 1.4753028890959925, "grad_norm": 1.2717055155999568, "learning_rate": 1.1294442526751812e-05, "loss": 1.7705, "step": 1583 }, { "epoch": 1.4762348555452003, "grad_norm": 1.3580506657097033, "learning_rate": 1.1287538833275805e-05, "loss": 2.2836, "step": 1584 }, { "epoch": 1.477166821994408, "grad_norm": 1.2188916637561167, "learning_rate": 1.1280635139799794e-05, "loss": 1.8937, "step": 1585 }, { "epoch": 1.478098788443616, "grad_norm": 1.2963973588039184, "learning_rate": 1.1273731446323783e-05, "loss": 1.9836, "step": 1586 }, { "epoch": 1.4790307548928239, "grad_norm": 1.0872203234835898, "learning_rate": 1.1266827752847775e-05, "loss": 1.6942, "step": 1587 }, { "epoch": 1.4799627213420317, "grad_norm": 1.2057412330774782, "learning_rate": 1.1259924059371766e-05, "loss": 2.043, "step": 1588 }, { "epoch": 1.4808946877912395, "grad_norm": 1.2756180967482522, "learning_rate": 1.1253020365895755e-05, "loss": 1.8307, "step": 1589 }, { "epoch": 1.4818266542404475, "grad_norm": 1.17202236664837, "learning_rate": 1.1246116672419744e-05, "loss": 2.0424, "step": 1590 }, { "epoch": 1.4827586206896552, "grad_norm": 1.516262658509465, "learning_rate": 1.1239212978943736e-05, "loss": 1.9222, "step": 1591 }, { "epoch": 1.483690587138863, "grad_norm": 1.4287301041926317, "learning_rate": 1.1232309285467727e-05, "loss": 2.235, "step": 1592 }, { "epoch": 1.4846225535880708, "grad_norm": 1.0964332013572102, "learning_rate": 1.1225405591991716e-05, "loss": 1.5631, "step": 1593 }, { "epoch": 1.4855545200372786, "grad_norm": 1.1358479835061075, "learning_rate": 1.1218501898515709e-05, "loss": 2.1403, "step": 1594 }, { "epoch": 1.4864864864864864, "grad_norm": 1.3249174463971212, "learning_rate": 1.1211598205039698e-05, "loss": 1.868, "step": 1595 }, { "epoch": 1.4874184529356942, "grad_norm": 1.2912512768147117, "learning_rate": 1.1204694511563687e-05, "loss": 1.9069, "step": 1596 }, { "epoch": 1.4883504193849022, "grad_norm": 1.2675011831596354, "learning_rate": 1.1197790818087677e-05, "loss": 1.8786, "step": 1597 }, { "epoch": 1.48928238583411, "grad_norm": 1.3703102505546265, "learning_rate": 1.119088712461167e-05, "loss": 1.7384, "step": 1598 }, { "epoch": 1.4902143522833178, "grad_norm": 1.4712903615350588, "learning_rate": 1.1183983431135659e-05, "loss": 1.8789, "step": 1599 }, { "epoch": 1.4911463187325256, "grad_norm": 1.460899489361002, "learning_rate": 1.1177079737659648e-05, "loss": 1.6269, "step": 1600 }, { "epoch": 1.4920782851817336, "grad_norm": 1.2209666981996772, "learning_rate": 1.1170176044183639e-05, "loss": 2.1951, "step": 1601 }, { "epoch": 1.4930102516309414, "grad_norm": 1.1780904854264522, "learning_rate": 1.1163272350707631e-05, "loss": 1.9462, "step": 1602 }, { "epoch": 1.4939422180801492, "grad_norm": 1.2339475606170354, "learning_rate": 1.115636865723162e-05, "loss": 1.7926, "step": 1603 }, { "epoch": 1.494874184529357, "grad_norm": 1.376316249138832, "learning_rate": 1.1149464963755609e-05, "loss": 1.8559, "step": 1604 }, { "epoch": 1.4958061509785647, "grad_norm": 1.236685421256572, "learning_rate": 1.1142561270279602e-05, "loss": 2.2809, "step": 1605 }, { "epoch": 1.4967381174277725, "grad_norm": 1.4321076488163853, "learning_rate": 1.113565757680359e-05, "loss": 2.3351, "step": 1606 }, { "epoch": 1.4976700838769803, "grad_norm": 1.1987820274190368, "learning_rate": 1.1128753883327581e-05, "loss": 1.8673, "step": 1607 }, { "epoch": 1.4986020503261883, "grad_norm": 1.490660268191194, "learning_rate": 1.112185018985157e-05, "loss": 2.1378, "step": 1608 }, { "epoch": 1.499534016775396, "grad_norm": 1.0950942902240957, "learning_rate": 1.1114946496375563e-05, "loss": 1.5385, "step": 1609 }, { "epoch": 1.500465983224604, "grad_norm": 1.5131493375177574, "learning_rate": 1.1108042802899552e-05, "loss": 2.0069, "step": 1610 }, { "epoch": 1.501397949673812, "grad_norm": 1.216111626334847, "learning_rate": 1.1101139109423543e-05, "loss": 2.0526, "step": 1611 }, { "epoch": 1.5023299161230197, "grad_norm": 1.3962116215657516, "learning_rate": 1.1094235415947532e-05, "loss": 2.0284, "step": 1612 }, { "epoch": 1.5032618825722275, "grad_norm": 1.2738092145398594, "learning_rate": 1.1087331722471524e-05, "loss": 1.874, "step": 1613 }, { "epoch": 1.5041938490214353, "grad_norm": 1.423164049656284, "learning_rate": 1.1080428028995513e-05, "loss": 1.989, "step": 1614 }, { "epoch": 1.505125815470643, "grad_norm": 1.23600194873526, "learning_rate": 1.1073524335519504e-05, "loss": 1.5893, "step": 1615 }, { "epoch": 1.5060577819198508, "grad_norm": 1.2926417089802982, "learning_rate": 1.1066620642043494e-05, "loss": 1.8337, "step": 1616 }, { "epoch": 1.5069897483690586, "grad_norm": 1.274838855202429, "learning_rate": 1.1059716948567485e-05, "loss": 1.5462, "step": 1617 }, { "epoch": 1.5079217148182664, "grad_norm": 1.4921337918876527, "learning_rate": 1.1052813255091474e-05, "loss": 1.7917, "step": 1618 }, { "epoch": 1.5088536812674742, "grad_norm": 1.1135305768674382, "learning_rate": 1.1045909561615465e-05, "loss": 1.9226, "step": 1619 }, { "epoch": 1.5097856477166822, "grad_norm": 1.1960145697233313, "learning_rate": 1.1039005868139456e-05, "loss": 1.9973, "step": 1620 }, { "epoch": 1.51071761416589, "grad_norm": 1.1381079708241661, "learning_rate": 1.1032102174663446e-05, "loss": 1.963, "step": 1621 }, { "epoch": 1.511649580615098, "grad_norm": 1.1750992436809524, "learning_rate": 1.1025198481187435e-05, "loss": 1.7992, "step": 1622 }, { "epoch": 1.5125815470643058, "grad_norm": 1.5749605385181804, "learning_rate": 1.1018294787711426e-05, "loss": 1.9224, "step": 1623 }, { "epoch": 1.5135135135135136, "grad_norm": 1.3537860479584918, "learning_rate": 1.1011391094235417e-05, "loss": 1.5786, "step": 1624 }, { "epoch": 1.5144454799627214, "grad_norm": 1.1652794260154984, "learning_rate": 1.1004487400759408e-05, "loss": 1.6738, "step": 1625 }, { "epoch": 1.5153774464119292, "grad_norm": 1.4856456820869566, "learning_rate": 1.0997583707283397e-05, "loss": 1.8151, "step": 1626 }, { "epoch": 1.516309412861137, "grad_norm": 1.1335339282247316, "learning_rate": 1.0990680013807389e-05, "loss": 1.4986, "step": 1627 }, { "epoch": 1.5172413793103448, "grad_norm": 1.1324206319725032, "learning_rate": 1.0983776320331378e-05, "loss": 1.6306, "step": 1628 }, { "epoch": 1.5181733457595525, "grad_norm": 1.2357156761857753, "learning_rate": 1.0976872626855369e-05, "loss": 1.7278, "step": 1629 }, { "epoch": 1.5191053122087603, "grad_norm": 1.3900005656538843, "learning_rate": 1.0969968933379358e-05, "loss": 1.7753, "step": 1630 }, { "epoch": 1.5200372786579683, "grad_norm": 1.229236840571074, "learning_rate": 1.096306523990335e-05, "loss": 2.9072, "step": 1631 }, { "epoch": 1.5209692451071761, "grad_norm": 1.303992496056618, "learning_rate": 1.095616154642734e-05, "loss": 1.8603, "step": 1632 }, { "epoch": 1.521901211556384, "grad_norm": 1.3540853546433043, "learning_rate": 1.094925785295133e-05, "loss": 1.9173, "step": 1633 }, { "epoch": 1.522833178005592, "grad_norm": 1.4297072067839318, "learning_rate": 1.094235415947532e-05, "loss": 1.757, "step": 1634 }, { "epoch": 1.5237651444547997, "grad_norm": 1.0684243893160315, "learning_rate": 1.0935450465999312e-05, "loss": 1.8299, "step": 1635 }, { "epoch": 1.5246971109040075, "grad_norm": 1.1862175999536086, "learning_rate": 1.09285467725233e-05, "loss": 1.7214, "step": 1636 }, { "epoch": 1.5256290773532153, "grad_norm": 1.362494030989928, "learning_rate": 1.092164307904729e-05, "loss": 1.7356, "step": 1637 }, { "epoch": 1.526561043802423, "grad_norm": 1.2347265704214359, "learning_rate": 1.0914739385571282e-05, "loss": 1.5537, "step": 1638 }, { "epoch": 1.5274930102516309, "grad_norm": 1.4215163395993184, "learning_rate": 1.0907835692095273e-05, "loss": 1.9843, "step": 1639 }, { "epoch": 1.5284249767008387, "grad_norm": 1.278530803726745, "learning_rate": 1.0900931998619262e-05, "loss": 1.7473, "step": 1640 }, { "epoch": 1.5293569431500464, "grad_norm": 1.3223263490092385, "learning_rate": 1.0894028305143251e-05, "loss": 1.8971, "step": 1641 }, { "epoch": 1.5302889095992545, "grad_norm": 1.1110471791286678, "learning_rate": 1.0887124611667243e-05, "loss": 1.8583, "step": 1642 }, { "epoch": 1.5312208760484622, "grad_norm": 1.305958334544749, "learning_rate": 1.0880220918191234e-05, "loss": 1.8761, "step": 1643 }, { "epoch": 1.53215284249767, "grad_norm": 1.3434815319195046, "learning_rate": 1.0873317224715223e-05, "loss": 1.8559, "step": 1644 }, { "epoch": 1.533084808946878, "grad_norm": 1.2160198097978454, "learning_rate": 1.0866413531239216e-05, "loss": 1.9045, "step": 1645 }, { "epoch": 1.5340167753960858, "grad_norm": 1.2035413368831416, "learning_rate": 1.0859509837763205e-05, "loss": 1.7397, "step": 1646 }, { "epoch": 1.5349487418452936, "grad_norm": 1.4033341323995463, "learning_rate": 1.0852606144287194e-05, "loss": 1.9574, "step": 1647 }, { "epoch": 1.5358807082945014, "grad_norm": 1.3414605524586205, "learning_rate": 1.0845702450811184e-05, "loss": 2.0012, "step": 1648 }, { "epoch": 1.5368126747437092, "grad_norm": 1.2712100399325768, "learning_rate": 1.0838798757335177e-05, "loss": 1.9584, "step": 1649 }, { "epoch": 1.537744641192917, "grad_norm": 1.2491678547191518, "learning_rate": 1.0831895063859166e-05, "loss": 1.6194, "step": 1650 }, { "epoch": 1.5386766076421248, "grad_norm": 1.6263539025462597, "learning_rate": 1.0824991370383155e-05, "loss": 1.9842, "step": 1651 }, { "epoch": 1.5396085740913326, "grad_norm": 1.318056997937259, "learning_rate": 1.0818087676907146e-05, "loss": 2.0371, "step": 1652 }, { "epoch": 1.5405405405405406, "grad_norm": 1.3091962096378598, "learning_rate": 1.0811183983431136e-05, "loss": 1.9598, "step": 1653 }, { "epoch": 1.5414725069897484, "grad_norm": 1.3711695411443952, "learning_rate": 1.0804280289955127e-05, "loss": 1.7451, "step": 1654 }, { "epoch": 1.5424044734389561, "grad_norm": 1.6461761474307144, "learning_rate": 1.0797376596479116e-05, "loss": 1.8356, "step": 1655 }, { "epoch": 1.5433364398881642, "grad_norm": 1.324395596411948, "learning_rate": 1.0790472903003109e-05, "loss": 1.7498, "step": 1656 }, { "epoch": 1.544268406337372, "grad_norm": 1.4960527687944112, "learning_rate": 1.0783569209527098e-05, "loss": 2.2131, "step": 1657 }, { "epoch": 1.5452003727865797, "grad_norm": 1.30859072425431, "learning_rate": 1.0776665516051088e-05, "loss": 2.1275, "step": 1658 }, { "epoch": 1.5461323392357875, "grad_norm": 1.223247012331437, "learning_rate": 1.0769761822575077e-05, "loss": 1.7251, "step": 1659 }, { "epoch": 1.5470643056849953, "grad_norm": 1.1870442628092563, "learning_rate": 1.076285812909907e-05, "loss": 1.7287, "step": 1660 }, { "epoch": 1.547996272134203, "grad_norm": 1.4360633018954487, "learning_rate": 1.0755954435623059e-05, "loss": 1.9195, "step": 1661 }, { "epoch": 1.5489282385834109, "grad_norm": 1.6233278693218551, "learning_rate": 1.074905074214705e-05, "loss": 1.7844, "step": 1662 }, { "epoch": 1.5498602050326187, "grad_norm": 1.2412221642373493, "learning_rate": 1.0742147048671039e-05, "loss": 1.9233, "step": 1663 }, { "epoch": 1.5507921714818267, "grad_norm": 1.3549095645424047, "learning_rate": 1.0735243355195031e-05, "loss": 1.9127, "step": 1664 }, { "epoch": 1.5517241379310345, "grad_norm": 1.2790193556976612, "learning_rate": 1.072833966171902e-05, "loss": 1.8389, "step": 1665 }, { "epoch": 1.5526561043802423, "grad_norm": 1.1747249888296016, "learning_rate": 1.072143596824301e-05, "loss": 1.7188, "step": 1666 }, { "epoch": 1.5535880708294503, "grad_norm": 1.2756791706726018, "learning_rate": 1.0714532274767002e-05, "loss": 1.8841, "step": 1667 }, { "epoch": 1.554520037278658, "grad_norm": 1.349031641935315, "learning_rate": 1.0707628581290992e-05, "loss": 1.6724, "step": 1668 }, { "epoch": 1.5554520037278659, "grad_norm": 1.016103775740967, "learning_rate": 1.0700724887814981e-05, "loss": 1.7367, "step": 1669 }, { "epoch": 1.5563839701770736, "grad_norm": 1.4017222762439043, "learning_rate": 1.0693821194338972e-05, "loss": 1.6794, "step": 1670 }, { "epoch": 1.5573159366262814, "grad_norm": 1.2048809927648816, "learning_rate": 1.0686917500862963e-05, "loss": 1.8484, "step": 1671 }, { "epoch": 1.5582479030754892, "grad_norm": 1.3408517962256574, "learning_rate": 1.0680013807386953e-05, "loss": 2.0129, "step": 1672 }, { "epoch": 1.559179869524697, "grad_norm": 1.090353506760893, "learning_rate": 1.0673110113910943e-05, "loss": 1.6139, "step": 1673 }, { "epoch": 1.5601118359739048, "grad_norm": 1.289231351480981, "learning_rate": 1.0666206420434933e-05, "loss": 1.6556, "step": 1674 }, { "epoch": 1.5610438024231128, "grad_norm": 1.7119407344480448, "learning_rate": 1.0659302726958924e-05, "loss": 1.8735, "step": 1675 }, { "epoch": 1.5619757688723206, "grad_norm": 1.4769832141373236, "learning_rate": 1.0652399033482915e-05, "loss": 1.851, "step": 1676 }, { "epoch": 1.5629077353215284, "grad_norm": 1.5141687143316274, "learning_rate": 1.0645495340006904e-05, "loss": 1.5776, "step": 1677 }, { "epoch": 1.5638397017707364, "grad_norm": 1.1432536200735328, "learning_rate": 1.0638591646530896e-05, "loss": 1.614, "step": 1678 }, { "epoch": 1.5647716682199442, "grad_norm": 1.263648581089811, "learning_rate": 1.0631687953054885e-05, "loss": 1.684, "step": 1679 }, { "epoch": 1.565703634669152, "grad_norm": 1.2374030425160896, "learning_rate": 1.0624784259578876e-05, "loss": 2.12, "step": 1680 }, { "epoch": 1.5666356011183598, "grad_norm": 1.0871587039835644, "learning_rate": 1.0617880566102865e-05, "loss": 1.8142, "step": 1681 }, { "epoch": 1.5675675675675675, "grad_norm": 1.3392788319450044, "learning_rate": 1.0610976872626857e-05, "loss": 1.6232, "step": 1682 }, { "epoch": 1.5684995340167753, "grad_norm": 1.1571927299857405, "learning_rate": 1.0604073179150846e-05, "loss": 1.949, "step": 1683 }, { "epoch": 1.5694315004659831, "grad_norm": 1.163525430825344, "learning_rate": 1.0597169485674837e-05, "loss": 2.1258, "step": 1684 }, { "epoch": 1.570363466915191, "grad_norm": 1.2610918563582547, "learning_rate": 1.0590265792198828e-05, "loss": 2.2938, "step": 1685 }, { "epoch": 1.571295433364399, "grad_norm": 1.1849624389867506, "learning_rate": 1.0583362098722819e-05, "loss": 1.8263, "step": 1686 }, { "epoch": 1.5722273998136067, "grad_norm": 1.331050954754313, "learning_rate": 1.0576458405246808e-05, "loss": 2.0381, "step": 1687 }, { "epoch": 1.5731593662628145, "grad_norm": 1.2499191674158414, "learning_rate": 1.0569554711770797e-05, "loss": 1.7536, "step": 1688 }, { "epoch": 1.5740913327120225, "grad_norm": 1.57234496426548, "learning_rate": 1.056265101829479e-05, "loss": 1.9368, "step": 1689 }, { "epoch": 1.5750232991612303, "grad_norm": 1.107366848489834, "learning_rate": 1.055574732481878e-05, "loss": 1.7712, "step": 1690 }, { "epoch": 1.575955265610438, "grad_norm": 1.464229758105892, "learning_rate": 1.0548843631342769e-05, "loss": 2.0644, "step": 1691 }, { "epoch": 1.5768872320596459, "grad_norm": 1.158914404931978, "learning_rate": 1.0541939937866758e-05, "loss": 1.4361, "step": 1692 }, { "epoch": 1.5778191985088537, "grad_norm": 1.1838729392281782, "learning_rate": 1.053503624439075e-05, "loss": 1.9901, "step": 1693 }, { "epoch": 1.5787511649580614, "grad_norm": 1.1158313423873039, "learning_rate": 1.0528132550914741e-05, "loss": 1.7895, "step": 1694 }, { "epoch": 1.5796831314072692, "grad_norm": 1.522656630076931, "learning_rate": 1.052122885743873e-05, "loss": 1.8351, "step": 1695 }, { "epoch": 1.580615097856477, "grad_norm": 1.5250026588773038, "learning_rate": 1.0514325163962723e-05, "loss": 1.9454, "step": 1696 }, { "epoch": 1.581547064305685, "grad_norm": 1.1840511002495036, "learning_rate": 1.0507421470486712e-05, "loss": 1.6902, "step": 1697 }, { "epoch": 1.5824790307548928, "grad_norm": 1.2474302422657664, "learning_rate": 1.05005177770107e-05, "loss": 1.9057, "step": 1698 }, { "epoch": 1.5834109972041006, "grad_norm": 1.5069392676013722, "learning_rate": 1.0493614083534691e-05, "loss": 1.9129, "step": 1699 }, { "epoch": 1.5843429636533086, "grad_norm": 1.3322185798855903, "learning_rate": 1.0486710390058684e-05, "loss": 1.9379, "step": 1700 }, { "epoch": 1.5852749301025164, "grad_norm": 1.185815665130222, "learning_rate": 1.0479806696582673e-05, "loss": 1.6699, "step": 1701 }, { "epoch": 1.5862068965517242, "grad_norm": 1.5102191337738577, "learning_rate": 1.0472903003106662e-05, "loss": 1.7037, "step": 1702 }, { "epoch": 1.587138863000932, "grad_norm": 1.2481917464355383, "learning_rate": 1.0465999309630653e-05, "loss": 1.7827, "step": 1703 }, { "epoch": 1.5880708294501398, "grad_norm": 1.1580006049536833, "learning_rate": 1.0459095616154643e-05, "loss": 1.8627, "step": 1704 }, { "epoch": 1.5890027958993476, "grad_norm": 1.3902239936483936, "learning_rate": 1.0452191922678634e-05, "loss": 1.7691, "step": 1705 }, { "epoch": 1.5899347623485554, "grad_norm": 1.1874217342486282, "learning_rate": 1.0445288229202623e-05, "loss": 1.6494, "step": 1706 }, { "epoch": 1.5908667287977631, "grad_norm": 1.3070495775097728, "learning_rate": 1.0438384535726616e-05, "loss": 1.9185, "step": 1707 }, { "epoch": 1.5917986952469712, "grad_norm": 1.5965744719594603, "learning_rate": 1.0431480842250605e-05, "loss": 1.977, "step": 1708 }, { "epoch": 1.592730661696179, "grad_norm": 1.5646499691334252, "learning_rate": 1.0424577148774595e-05, "loss": 1.8331, "step": 1709 }, { "epoch": 1.5936626281453867, "grad_norm": 1.329779721964853, "learning_rate": 1.0417673455298584e-05, "loss": 1.9675, "step": 1710 }, { "epoch": 1.5945945945945947, "grad_norm": 1.342835633340383, "learning_rate": 1.0410769761822577e-05, "loss": 1.7123, "step": 1711 }, { "epoch": 1.5955265610438025, "grad_norm": 1.1027246062131322, "learning_rate": 1.0403866068346566e-05, "loss": 1.7693, "step": 1712 }, { "epoch": 1.5964585274930103, "grad_norm": 1.3084171499315003, "learning_rate": 1.0396962374870557e-05, "loss": 1.6953, "step": 1713 }, { "epoch": 1.597390493942218, "grad_norm": 1.6400099644700497, "learning_rate": 1.0390058681394546e-05, "loss": 1.9476, "step": 1714 }, { "epoch": 1.598322460391426, "grad_norm": 1.3187299979656835, "learning_rate": 1.0383154987918538e-05, "loss": 1.9104, "step": 1715 }, { "epoch": 1.5992544268406337, "grad_norm": 1.1183479618303855, "learning_rate": 1.0376251294442527e-05, "loss": 1.6919, "step": 1716 }, { "epoch": 1.6001863932898415, "grad_norm": 1.130075717696439, "learning_rate": 1.0369347600966518e-05, "loss": 1.9252, "step": 1717 }, { "epoch": 1.6011183597390493, "grad_norm": 1.375186633954898, "learning_rate": 1.0362443907490509e-05, "loss": 1.9412, "step": 1718 }, { "epoch": 1.6020503261882573, "grad_norm": 1.2334201018838806, "learning_rate": 1.03555402140145e-05, "loss": 1.9924, "step": 1719 }, { "epoch": 1.602982292637465, "grad_norm": 1.3882205894628283, "learning_rate": 1.0348636520538488e-05, "loss": 1.8948, "step": 1720 }, { "epoch": 1.6039142590866728, "grad_norm": 1.1010138144559631, "learning_rate": 1.0341732827062479e-05, "loss": 2.02, "step": 1721 }, { "epoch": 1.6048462255358809, "grad_norm": 1.0303588460557334, "learning_rate": 1.033482913358647e-05, "loss": 1.4589, "step": 1722 }, { "epoch": 1.6057781919850886, "grad_norm": 1.1642216141116426, "learning_rate": 1.032792544011046e-05, "loss": 1.5321, "step": 1723 }, { "epoch": 1.6067101584342964, "grad_norm": 1.3253258483968042, "learning_rate": 1.032102174663445e-05, "loss": 2.0126, "step": 1724 }, { "epoch": 1.6076421248835042, "grad_norm": 1.2563984401993058, "learning_rate": 1.031411805315844e-05, "loss": 1.9432, "step": 1725 }, { "epoch": 1.608574091332712, "grad_norm": 1.2695845624859579, "learning_rate": 1.0307214359682431e-05, "loss": 1.7896, "step": 1726 }, { "epoch": 1.6095060577819198, "grad_norm": 1.1692144587400877, "learning_rate": 1.0300310666206422e-05, "loss": 1.8608, "step": 1727 }, { "epoch": 1.6104380242311276, "grad_norm": 1.5359590650244352, "learning_rate": 1.029340697273041e-05, "loss": 1.8225, "step": 1728 }, { "epoch": 1.6113699906803354, "grad_norm": 1.1835986133066834, "learning_rate": 1.0286503279254403e-05, "loss": 1.6186, "step": 1729 }, { "epoch": 1.6123019571295434, "grad_norm": 1.229647649602864, "learning_rate": 1.0279599585778392e-05, "loss": 1.8813, "step": 1730 }, { "epoch": 1.6132339235787512, "grad_norm": 1.3135923069054596, "learning_rate": 1.0272695892302383e-05, "loss": 1.8571, "step": 1731 }, { "epoch": 1.614165890027959, "grad_norm": 1.2140847708438112, "learning_rate": 1.0265792198826372e-05, "loss": 1.8037, "step": 1732 }, { "epoch": 1.615097856477167, "grad_norm": 1.2202480326148322, "learning_rate": 1.0258888505350364e-05, "loss": 1.7236, "step": 1733 }, { "epoch": 1.6160298229263748, "grad_norm": 1.0063554892759952, "learning_rate": 1.0251984811874353e-05, "loss": 1.4915, "step": 1734 }, { "epoch": 1.6169617893755825, "grad_norm": 1.3649147225606455, "learning_rate": 1.0245081118398344e-05, "loss": 1.8089, "step": 1735 }, { "epoch": 1.6178937558247903, "grad_norm": 1.3320240772957757, "learning_rate": 1.0238177424922335e-05, "loss": 1.8347, "step": 1736 }, { "epoch": 1.6188257222739981, "grad_norm": 1.4071773592856018, "learning_rate": 1.0231273731446326e-05, "loss": 1.7111, "step": 1737 }, { "epoch": 1.619757688723206, "grad_norm": 1.0063199528196154, "learning_rate": 1.0224370037970315e-05, "loss": 1.643, "step": 1738 }, { "epoch": 1.6206896551724137, "grad_norm": 1.1748476805606023, "learning_rate": 1.0217466344494304e-05, "loss": 1.6029, "step": 1739 }, { "epoch": 1.6216216216216215, "grad_norm": 1.378472410587395, "learning_rate": 1.0210562651018296e-05, "loss": 1.6317, "step": 1740 }, { "epoch": 1.6225535880708295, "grad_norm": 1.240789881799917, "learning_rate": 1.0203658957542287e-05, "loss": 1.8969, "step": 1741 }, { "epoch": 1.6234855545200373, "grad_norm": 1.4832039245831172, "learning_rate": 1.0196755264066276e-05, "loss": 1.9644, "step": 1742 }, { "epoch": 1.624417520969245, "grad_norm": 1.5001769170568517, "learning_rate": 1.0189851570590265e-05, "loss": 1.9501, "step": 1743 }, { "epoch": 1.625349487418453, "grad_norm": 1.3989341458273168, "learning_rate": 1.0182947877114257e-05, "loss": 1.8502, "step": 1744 }, { "epoch": 1.6262814538676609, "grad_norm": 1.5263586259546869, "learning_rate": 1.0176044183638248e-05, "loss": 1.8703, "step": 1745 }, { "epoch": 1.6272134203168687, "grad_norm": 1.3497372027883285, "learning_rate": 1.0169140490162237e-05, "loss": 1.8125, "step": 1746 }, { "epoch": 1.6281453867660765, "grad_norm": 1.58554656143029, "learning_rate": 1.016223679668623e-05, "loss": 2.1143, "step": 1747 }, { "epoch": 1.6290773532152842, "grad_norm": 1.2229078407213771, "learning_rate": 1.0155333103210219e-05, "loss": 1.5972, "step": 1748 }, { "epoch": 1.630009319664492, "grad_norm": 1.3712678548615909, "learning_rate": 1.0148429409734208e-05, "loss": 2.2585, "step": 1749 }, { "epoch": 1.6309412861136998, "grad_norm": 1.2572774850154869, "learning_rate": 1.0141525716258198e-05, "loss": 1.8008, "step": 1750 }, { "epoch": 1.6318732525629076, "grad_norm": 1.2430463778041325, "learning_rate": 1.0134622022782191e-05, "loss": 2.0762, "step": 1751 }, { "epoch": 1.6328052190121156, "grad_norm": 1.1959987254291915, "learning_rate": 1.012771832930618e-05, "loss": 1.8403, "step": 1752 }, { "epoch": 1.6337371854613234, "grad_norm": 1.177614306036927, "learning_rate": 1.0120814635830169e-05, "loss": 1.6338, "step": 1753 }, { "epoch": 1.6346691519105312, "grad_norm": 1.2231055943823386, "learning_rate": 1.011391094235416e-05, "loss": 1.7822, "step": 1754 }, { "epoch": 1.6356011183597392, "grad_norm": 1.2657237625943984, "learning_rate": 1.010700724887815e-05, "loss": 1.5877, "step": 1755 }, { "epoch": 1.636533084808947, "grad_norm": 1.4354408807109584, "learning_rate": 1.0100103555402141e-05, "loss": 1.5153, "step": 1756 }, { "epoch": 1.6374650512581548, "grad_norm": 1.2904593000157978, "learning_rate": 1.009319986192613e-05, "loss": 1.8309, "step": 1757 }, { "epoch": 1.6383970177073626, "grad_norm": 1.1618594129263282, "learning_rate": 1.0086296168450123e-05, "loss": 1.7855, "step": 1758 }, { "epoch": 1.6393289841565704, "grad_norm": 1.3616294849805242, "learning_rate": 1.0079392474974112e-05, "loss": 2.0862, "step": 1759 }, { "epoch": 1.6402609506057781, "grad_norm": 1.279570944248118, "learning_rate": 1.0072488781498102e-05, "loss": 2.3454, "step": 1760 }, { "epoch": 1.641192917054986, "grad_norm": 1.5009438165155955, "learning_rate": 1.0065585088022091e-05, "loss": 1.6058, "step": 1761 }, { "epoch": 1.6421248835041937, "grad_norm": 1.2603802460419382, "learning_rate": 1.0058681394546084e-05, "loss": 1.7919, "step": 1762 }, { "epoch": 1.6430568499534017, "grad_norm": 1.2723426448074806, "learning_rate": 1.0051777701070073e-05, "loss": 2.1216, "step": 1763 }, { "epoch": 1.6439888164026095, "grad_norm": 1.2034271483042185, "learning_rate": 1.0044874007594064e-05, "loss": 1.6753, "step": 1764 }, { "epoch": 1.6449207828518173, "grad_norm": 1.241591782294117, "learning_rate": 1.0037970314118053e-05, "loss": 1.7193, "step": 1765 }, { "epoch": 1.6458527493010253, "grad_norm": 1.4880762415710287, "learning_rate": 1.0031066620642045e-05, "loss": 2.0934, "step": 1766 }, { "epoch": 1.646784715750233, "grad_norm": 1.301712519748899, "learning_rate": 1.0024162927166034e-05, "loss": 1.9484, "step": 1767 }, { "epoch": 1.647716682199441, "grad_norm": 1.0225039366457953, "learning_rate": 1.0017259233690025e-05, "loss": 1.8819, "step": 1768 }, { "epoch": 1.6486486486486487, "grad_norm": 1.3761707508809604, "learning_rate": 1.0010355540214016e-05, "loss": 1.8965, "step": 1769 }, { "epoch": 1.6495806150978565, "grad_norm": 1.1305521361118587, "learning_rate": 1.0003451846738006e-05, "loss": 1.8488, "step": 1770 }, { "epoch": 1.6505125815470643, "grad_norm": 1.2078542663518532, "learning_rate": 9.996548153261995e-06, "loss": 1.8267, "step": 1771 }, { "epoch": 1.651444547996272, "grad_norm": 1.2549046666427606, "learning_rate": 9.989644459785986e-06, "loss": 2.1295, "step": 1772 }, { "epoch": 1.6523765144454798, "grad_norm": 1.1074226409665255, "learning_rate": 9.982740766309977e-06, "loss": 1.5849, "step": 1773 }, { "epoch": 1.6533084808946876, "grad_norm": 1.172907960952428, "learning_rate": 9.975837072833968e-06, "loss": 1.7587, "step": 1774 }, { "epoch": 1.6542404473438956, "grad_norm": 1.2425787379680728, "learning_rate": 9.968933379357957e-06, "loss": 2.0015, "step": 1775 }, { "epoch": 1.6551724137931034, "grad_norm": 1.2433946953209374, "learning_rate": 9.962029685881947e-06, "loss": 1.6175, "step": 1776 }, { "epoch": 1.6561043802423114, "grad_norm": 1.3764262636448983, "learning_rate": 9.955125992405938e-06, "loss": 1.6539, "step": 1777 }, { "epoch": 1.6570363466915192, "grad_norm": 1.4397945374788514, "learning_rate": 9.948222298929929e-06, "loss": 1.8719, "step": 1778 }, { "epoch": 1.657968313140727, "grad_norm": 1.3776306145153032, "learning_rate": 9.94131860545392e-06, "loss": 1.8719, "step": 1779 }, { "epoch": 1.6589002795899348, "grad_norm": 1.136060424233933, "learning_rate": 9.934414911977909e-06, "loss": 1.8122, "step": 1780 }, { "epoch": 1.6598322460391426, "grad_norm": 1.0495666669995392, "learning_rate": 9.9275112185019e-06, "loss": 1.7045, "step": 1781 }, { "epoch": 1.6607642124883504, "grad_norm": 1.1842160506589996, "learning_rate": 9.92060752502589e-06, "loss": 1.8922, "step": 1782 }, { "epoch": 1.6616961789375582, "grad_norm": 1.4116354915647442, "learning_rate": 9.91370383154988e-06, "loss": 1.8659, "step": 1783 }, { "epoch": 1.662628145386766, "grad_norm": 1.258322083805929, "learning_rate": 9.90680013807387e-06, "loss": 1.6895, "step": 1784 }, { "epoch": 1.6635601118359737, "grad_norm": 1.269377285315139, "learning_rate": 9.89989644459786e-06, "loss": 2.19, "step": 1785 }, { "epoch": 1.6644920782851818, "grad_norm": 1.3768040669795454, "learning_rate": 9.892992751121851e-06, "loss": 2.3343, "step": 1786 }, { "epoch": 1.6654240447343895, "grad_norm": 1.5003527136400325, "learning_rate": 9.886089057645842e-06, "loss": 1.7382, "step": 1787 }, { "epoch": 1.6663560111835976, "grad_norm": 1.1415455032728699, "learning_rate": 9.879185364169833e-06, "loss": 1.9552, "step": 1788 }, { "epoch": 1.6672879776328053, "grad_norm": 1.137999037564749, "learning_rate": 9.872281670693822e-06, "loss": 2.0294, "step": 1789 }, { "epoch": 1.6682199440820131, "grad_norm": 1.4599166008506654, "learning_rate": 9.865377977217812e-06, "loss": 1.7964, "step": 1790 }, { "epoch": 1.669151910531221, "grad_norm": 1.194781526882688, "learning_rate": 9.858474283741803e-06, "loss": 1.7816, "step": 1791 }, { "epoch": 1.6700838769804287, "grad_norm": 1.3326423250037047, "learning_rate": 9.851570590265794e-06, "loss": 1.8504, "step": 1792 }, { "epoch": 1.6710158434296365, "grad_norm": 1.3932769570178238, "learning_rate": 9.844666896789783e-06, "loss": 2.2422, "step": 1793 }, { "epoch": 1.6719478098788443, "grad_norm": 1.5297479479274105, "learning_rate": 9.837763203313774e-06, "loss": 2.0176, "step": 1794 }, { "epoch": 1.672879776328052, "grad_norm": 1.3993445639024733, "learning_rate": 9.830859509837763e-06, "loss": 1.9685, "step": 1795 }, { "epoch": 1.6738117427772599, "grad_norm": 1.2822474734839808, "learning_rate": 9.823955816361753e-06, "loss": 1.9745, "step": 1796 }, { "epoch": 1.6747437092264679, "grad_norm": 1.2610381416858494, "learning_rate": 9.817052122885746e-06, "loss": 1.6177, "step": 1797 }, { "epoch": 1.6756756756756757, "grad_norm": 1.1873923395689823, "learning_rate": 9.810148429409735e-06, "loss": 1.9791, "step": 1798 }, { "epoch": 1.6766076421248837, "grad_norm": 1.3232805672806849, "learning_rate": 9.803244735933726e-06, "loss": 1.6448, "step": 1799 }, { "epoch": 1.6775396085740915, "grad_norm": 1.5443107877098756, "learning_rate": 9.796341042457715e-06, "loss": 1.9597, "step": 1800 }, { "epoch": 1.6784715750232992, "grad_norm": 1.430522409889889, "learning_rate": 9.789437348981705e-06, "loss": 2.0193, "step": 1801 }, { "epoch": 1.679403541472507, "grad_norm": 1.2802217286179707, "learning_rate": 9.782533655505696e-06, "loss": 1.7349, "step": 1802 }, { "epoch": 1.6803355079217148, "grad_norm": 1.1756592962907328, "learning_rate": 9.775629962029687e-06, "loss": 1.8261, "step": 1803 }, { "epoch": 1.6812674743709226, "grad_norm": 1.1769888019408365, "learning_rate": 9.768726268553676e-06, "loss": 1.8532, "step": 1804 }, { "epoch": 1.6821994408201304, "grad_norm": 1.2202371726524912, "learning_rate": 9.761822575077667e-06, "loss": 1.9338, "step": 1805 }, { "epoch": 1.6831314072693382, "grad_norm": 1.2960706226121228, "learning_rate": 9.754918881601657e-06, "loss": 1.6856, "step": 1806 }, { "epoch": 1.684063373718546, "grad_norm": 1.252223041634842, "learning_rate": 9.748015188125648e-06, "loss": 1.7467, "step": 1807 }, { "epoch": 1.684995340167754, "grad_norm": 1.2822636571119796, "learning_rate": 9.741111494649639e-06, "loss": 2.165, "step": 1808 }, { "epoch": 1.6859273066169618, "grad_norm": 1.1650386591581234, "learning_rate": 9.734207801173628e-06, "loss": 1.8429, "step": 1809 }, { "epoch": 1.6868592730661698, "grad_norm": 1.3303684322549596, "learning_rate": 9.727304107697619e-06, "loss": 1.8074, "step": 1810 }, { "epoch": 1.6877912395153776, "grad_norm": 1.2946002384450124, "learning_rate": 9.72040041422161e-06, "loss": 2.0522, "step": 1811 }, { "epoch": 1.6887232059645854, "grad_norm": 1.2191486286348399, "learning_rate": 9.7134967207456e-06, "loss": 1.6589, "step": 1812 }, { "epoch": 1.6896551724137931, "grad_norm": 1.5340462896956653, "learning_rate": 9.70659302726959e-06, "loss": 1.7957, "step": 1813 }, { "epoch": 1.690587138863001, "grad_norm": 1.2715799999332817, "learning_rate": 9.69968933379358e-06, "loss": 1.6325, "step": 1814 }, { "epoch": 1.6915191053122087, "grad_norm": 1.4682789895239237, "learning_rate": 9.69278564031757e-06, "loss": 2.3254, "step": 1815 }, { "epoch": 1.6924510717614165, "grad_norm": 1.3977068643937185, "learning_rate": 9.685881946841561e-06, "loss": 1.9734, "step": 1816 }, { "epoch": 1.6933830382106243, "grad_norm": 1.216101881507064, "learning_rate": 9.678978253365552e-06, "loss": 1.7428, "step": 1817 }, { "epoch": 1.694315004659832, "grad_norm": 0.9781957046863858, "learning_rate": 9.672074559889541e-06, "loss": 1.5523, "step": 1818 }, { "epoch": 1.69524697110904, "grad_norm": 1.5395683523550943, "learning_rate": 9.665170866413532e-06, "loss": 1.8194, "step": 1819 }, { "epoch": 1.696178937558248, "grad_norm": 1.2678931746981381, "learning_rate": 9.658267172937523e-06, "loss": 1.5937, "step": 1820 }, { "epoch": 1.6971109040074557, "grad_norm": 1.2511189325341003, "learning_rate": 9.651363479461513e-06, "loss": 1.7683, "step": 1821 }, { "epoch": 1.6980428704566637, "grad_norm": 1.5094048767086794, "learning_rate": 9.644459785985502e-06, "loss": 1.694, "step": 1822 }, { "epoch": 1.6989748369058715, "grad_norm": 1.26246952792451, "learning_rate": 9.637556092509493e-06, "loss": 1.7624, "step": 1823 }, { "epoch": 1.6999068033550793, "grad_norm": 1.4578341882327495, "learning_rate": 9.630652399033484e-06, "loss": 2.0084, "step": 1824 }, { "epoch": 1.700838769804287, "grad_norm": 1.2586657920444917, "learning_rate": 9.623748705557475e-06, "loss": 1.9837, "step": 1825 }, { "epoch": 1.7017707362534948, "grad_norm": 1.4460245376170007, "learning_rate": 9.616845012081464e-06, "loss": 2.1909, "step": 1826 }, { "epoch": 1.7027027027027026, "grad_norm": 1.253610790598955, "learning_rate": 9.609941318605454e-06, "loss": 1.8808, "step": 1827 }, { "epoch": 1.7036346691519104, "grad_norm": 1.3665924626949568, "learning_rate": 9.603037625129445e-06, "loss": 2.0774, "step": 1828 }, { "epoch": 1.7045666356011182, "grad_norm": 1.449255830857777, "learning_rate": 9.596133931653436e-06, "loss": 2.1492, "step": 1829 }, { "epoch": 1.7054986020503262, "grad_norm": 1.330882082569919, "learning_rate": 9.589230238177427e-06, "loss": 2.0665, "step": 1830 }, { "epoch": 1.706430568499534, "grad_norm": 1.1719292267581136, "learning_rate": 9.582326544701416e-06, "loss": 1.962, "step": 1831 }, { "epoch": 1.7073625349487418, "grad_norm": 1.2038433734626983, "learning_rate": 9.575422851225406e-06, "loss": 1.8137, "step": 1832 }, { "epoch": 1.7082945013979498, "grad_norm": 1.3885793368978618, "learning_rate": 9.568519157749397e-06, "loss": 1.6872, "step": 1833 }, { "epoch": 1.7092264678471576, "grad_norm": 1.1380835748299862, "learning_rate": 9.561615464273388e-06, "loss": 1.7764, "step": 1834 }, { "epoch": 1.7101584342963654, "grad_norm": 1.1101866306688504, "learning_rate": 9.554711770797377e-06, "loss": 1.4717, "step": 1835 }, { "epoch": 1.7110904007455732, "grad_norm": 1.2539773641729777, "learning_rate": 9.547808077321368e-06, "loss": 2.1548, "step": 1836 }, { "epoch": 1.712022367194781, "grad_norm": 1.3363616735999657, "learning_rate": 9.540904383845358e-06, "loss": 1.7566, "step": 1837 }, { "epoch": 1.7129543336439887, "grad_norm": 1.2889861071267088, "learning_rate": 9.534000690369349e-06, "loss": 1.751, "step": 1838 }, { "epoch": 1.7138863000931965, "grad_norm": 1.5243352112156947, "learning_rate": 9.52709699689334e-06, "loss": 1.93, "step": 1839 }, { "epoch": 1.7148182665424043, "grad_norm": 1.428806636716829, "learning_rate": 9.520193303417329e-06, "loss": 1.5951, "step": 1840 }, { "epoch": 1.7157502329916123, "grad_norm": 1.4675811108221708, "learning_rate": 9.51328960994132e-06, "loss": 1.9906, "step": 1841 }, { "epoch": 1.7166821994408201, "grad_norm": 1.4493460948599342, "learning_rate": 9.50638591646531e-06, "loss": 1.9209, "step": 1842 }, { "epoch": 1.717614165890028, "grad_norm": 1.4153435951091, "learning_rate": 9.499482222989301e-06, "loss": 1.919, "step": 1843 }, { "epoch": 1.718546132339236, "grad_norm": 1.0246039850321051, "learning_rate": 9.49257852951329e-06, "loss": 1.6396, "step": 1844 }, { "epoch": 1.7194780987884437, "grad_norm": 1.5260043002051713, "learning_rate": 9.48567483603728e-06, "loss": 1.9667, "step": 1845 }, { "epoch": 1.7204100652376515, "grad_norm": 1.29992174056194, "learning_rate": 9.47877114256127e-06, "loss": 1.8971, "step": 1846 }, { "epoch": 1.7213420316868593, "grad_norm": 1.1524387855763414, "learning_rate": 9.47186744908526e-06, "loss": 1.6586, "step": 1847 }, { "epoch": 1.722273998136067, "grad_norm": 1.420866686051684, "learning_rate": 9.464963755609253e-06, "loss": 1.8356, "step": 1848 }, { "epoch": 1.7232059645852749, "grad_norm": 1.3009253048572433, "learning_rate": 9.458060062133242e-06, "loss": 1.5076, "step": 1849 }, { "epoch": 1.7241379310344827, "grad_norm": 1.3920955934639547, "learning_rate": 9.451156368657233e-06, "loss": 1.6795, "step": 1850 }, { "epoch": 1.7250698974836904, "grad_norm": 1.2874535587976697, "learning_rate": 9.444252675181222e-06, "loss": 1.9741, "step": 1851 }, { "epoch": 1.7260018639328985, "grad_norm": 1.1947153392914216, "learning_rate": 9.437348981705212e-06, "loss": 1.7981, "step": 1852 }, { "epoch": 1.7269338303821062, "grad_norm": 1.3860765010340832, "learning_rate": 9.430445288229203e-06, "loss": 1.8288, "step": 1853 }, { "epoch": 1.727865796831314, "grad_norm": 1.0542676194004061, "learning_rate": 9.423541594753194e-06, "loss": 1.7126, "step": 1854 }, { "epoch": 1.728797763280522, "grad_norm": 1.436830180180769, "learning_rate": 9.416637901277183e-06, "loss": 1.584, "step": 1855 }, { "epoch": 1.7297297297297298, "grad_norm": 1.1009511077516623, "learning_rate": 9.409734207801174e-06, "loss": 1.6371, "step": 1856 }, { "epoch": 1.7306616961789376, "grad_norm": 1.1583832959619795, "learning_rate": 9.402830514325164e-06, "loss": 1.697, "step": 1857 }, { "epoch": 1.7315936626281454, "grad_norm": 1.1501285139000335, "learning_rate": 9.395926820849155e-06, "loss": 1.9253, "step": 1858 }, { "epoch": 1.7325256290773532, "grad_norm": 1.171316006074628, "learning_rate": 9.389023127373146e-06, "loss": 1.8527, "step": 1859 }, { "epoch": 1.733457595526561, "grad_norm": 1.2601888611123073, "learning_rate": 9.382119433897135e-06, "loss": 2.0653, "step": 1860 }, { "epoch": 1.7343895619757688, "grad_norm": 1.3213564146706627, "learning_rate": 9.375215740421126e-06, "loss": 1.8476, "step": 1861 }, { "epoch": 1.7353215284249766, "grad_norm": 1.1959667225625454, "learning_rate": 9.368312046945116e-06, "loss": 1.7302, "step": 1862 }, { "epoch": 1.7362534948741846, "grad_norm": 1.2099538375484367, "learning_rate": 9.361408353469107e-06, "loss": 1.5878, "step": 1863 }, { "epoch": 1.7371854613233924, "grad_norm": 1.524198087035741, "learning_rate": 9.354504659993096e-06, "loss": 1.8121, "step": 1864 }, { "epoch": 1.7381174277726001, "grad_norm": 1.2470330452725826, "learning_rate": 9.347600966517087e-06, "loss": 1.6367, "step": 1865 }, { "epoch": 1.7390493942218082, "grad_norm": 1.530255041547211, "learning_rate": 9.340697273041078e-06, "loss": 2.0276, "step": 1866 }, { "epoch": 1.739981360671016, "grad_norm": 1.168471053810354, "learning_rate": 9.333793579565068e-06, "loss": 1.993, "step": 1867 }, { "epoch": 1.7409133271202237, "grad_norm": 1.4126115499630094, "learning_rate": 9.326889886089059e-06, "loss": 1.7558, "step": 1868 }, { "epoch": 1.7418452935694315, "grad_norm": 1.0735583504992088, "learning_rate": 9.319986192613048e-06, "loss": 1.7515, "step": 1869 }, { "epoch": 1.7427772600186393, "grad_norm": 1.7179416426961789, "learning_rate": 9.313082499137039e-06, "loss": 2.0649, "step": 1870 }, { "epoch": 1.743709226467847, "grad_norm": 1.298369001822817, "learning_rate": 9.30617880566103e-06, "loss": 2.0734, "step": 1871 }, { "epoch": 1.7446411929170549, "grad_norm": 1.3097316777481933, "learning_rate": 9.29927511218502e-06, "loss": 1.7957, "step": 1872 }, { "epoch": 1.7455731593662627, "grad_norm": 1.4246809656307156, "learning_rate": 9.29237141870901e-06, "loss": 2.2132, "step": 1873 }, { "epoch": 1.7465051258154707, "grad_norm": 1.6695639291660378, "learning_rate": 9.285467725233e-06, "loss": 1.9054, "step": 1874 }, { "epoch": 1.7474370922646785, "grad_norm": 1.3902960698937767, "learning_rate": 9.278564031756991e-06, "loss": 2.6845, "step": 1875 }, { "epoch": 1.7483690587138863, "grad_norm": 1.2854317168898828, "learning_rate": 9.271660338280982e-06, "loss": 1.835, "step": 1876 }, { "epoch": 1.7493010251630943, "grad_norm": 1.413516461816987, "learning_rate": 9.264756644804972e-06, "loss": 1.6777, "step": 1877 }, { "epoch": 1.750232991612302, "grad_norm": 1.6143507472132441, "learning_rate": 9.257852951328961e-06, "loss": 2.1136, "step": 1878 }, { "epoch": 1.7511649580615098, "grad_norm": 1.3369654935909243, "learning_rate": 9.250949257852952e-06, "loss": 1.6623, "step": 1879 }, { "epoch": 1.7520969245107176, "grad_norm": 1.271023905175955, "learning_rate": 9.244045564376943e-06, "loss": 1.5134, "step": 1880 }, { "epoch": 1.7530288909599254, "grad_norm": 1.1322111856922208, "learning_rate": 9.237141870900934e-06, "loss": 1.9091, "step": 1881 }, { "epoch": 1.7539608574091332, "grad_norm": 1.2314972220549727, "learning_rate": 9.230238177424923e-06, "loss": 1.9259, "step": 1882 }, { "epoch": 1.754892823858341, "grad_norm": 1.230996003143571, "learning_rate": 9.223334483948913e-06, "loss": 1.5124, "step": 1883 }, { "epoch": 1.7558247903075488, "grad_norm": 1.1827881293772462, "learning_rate": 9.216430790472904e-06, "loss": 2.0596, "step": 1884 }, { "epoch": 1.7567567567567568, "grad_norm": 2.362707521187722, "learning_rate": 9.209527096996895e-06, "loss": 1.6541, "step": 1885 }, { "epoch": 1.7576887232059646, "grad_norm": 1.2768664120066302, "learning_rate": 9.202623403520884e-06, "loss": 2.0898, "step": 1886 }, { "epoch": 1.7586206896551724, "grad_norm": 1.0865633832228843, "learning_rate": 9.195719710044875e-06, "loss": 2.103, "step": 1887 }, { "epoch": 1.7595526561043804, "grad_norm": 1.4713935705476446, "learning_rate": 9.188816016568865e-06, "loss": 2.1168, "step": 1888 }, { "epoch": 1.7604846225535882, "grad_norm": 1.3995768463586988, "learning_rate": 9.181912323092856e-06, "loss": 1.9102, "step": 1889 }, { "epoch": 1.761416589002796, "grad_norm": 1.2516519234697294, "learning_rate": 9.175008629616847e-06, "loss": 1.7382, "step": 1890 }, { "epoch": 1.7623485554520038, "grad_norm": 1.7865603972421689, "learning_rate": 9.168104936140836e-06, "loss": 2.239, "step": 1891 }, { "epoch": 1.7632805219012115, "grad_norm": 1.290671225465268, "learning_rate": 9.161201242664827e-06, "loss": 2.1211, "step": 1892 }, { "epoch": 1.7642124883504193, "grad_norm": 1.419823466866071, "learning_rate": 9.154297549188816e-06, "loss": 1.6389, "step": 1893 }, { "epoch": 1.7651444547996271, "grad_norm": 1.3778700117127076, "learning_rate": 9.147393855712808e-06, "loss": 2.1069, "step": 1894 }, { "epoch": 1.766076421248835, "grad_norm": 1.4160521571400573, "learning_rate": 9.140490162236797e-06, "loss": 2.0574, "step": 1895 }, { "epoch": 1.767008387698043, "grad_norm": 1.6128221792678685, "learning_rate": 9.133586468760788e-06, "loss": 1.9096, "step": 1896 }, { "epoch": 1.7679403541472507, "grad_norm": 1.430147573632323, "learning_rate": 9.126682775284779e-06, "loss": 1.9923, "step": 1897 }, { "epoch": 1.7688723205964585, "grad_norm": 1.3373548639493458, "learning_rate": 9.119779081808768e-06, "loss": 2.2586, "step": 1898 }, { "epoch": 1.7698042870456665, "grad_norm": 1.2059774383010438, "learning_rate": 9.11287538833276e-06, "loss": 1.8722, "step": 1899 }, { "epoch": 1.7707362534948743, "grad_norm": 1.3870395144176273, "learning_rate": 9.105971694856749e-06, "loss": 1.7678, "step": 1900 }, { "epoch": 1.771668219944082, "grad_norm": 1.1691275390499818, "learning_rate": 9.09906800138074e-06, "loss": 1.7374, "step": 1901 }, { "epoch": 1.7726001863932899, "grad_norm": 1.013813470345412, "learning_rate": 9.092164307904729e-06, "loss": 1.2464, "step": 1902 }, { "epoch": 1.7735321528424977, "grad_norm": 1.2531327282143159, "learning_rate": 9.08526061442872e-06, "loss": 1.8549, "step": 1903 }, { "epoch": 1.7744641192917054, "grad_norm": 1.2557367577138485, "learning_rate": 9.07835692095271e-06, "loss": 1.7532, "step": 1904 }, { "epoch": 1.7753960857409132, "grad_norm": 1.2647506980358725, "learning_rate": 9.071453227476701e-06, "loss": 2.0998, "step": 1905 }, { "epoch": 1.776328052190121, "grad_norm": 1.2469721381108143, "learning_rate": 9.06454953400069e-06, "loss": 1.6515, "step": 1906 }, { "epoch": 1.777260018639329, "grad_norm": 1.1564045890242183, "learning_rate": 9.05764584052468e-06, "loss": 1.8693, "step": 1907 }, { "epoch": 1.7781919850885368, "grad_norm": 1.0925986297010477, "learning_rate": 9.050742147048671e-06, "loss": 1.8954, "step": 1908 }, { "epoch": 1.7791239515377446, "grad_norm": 1.3010137435372113, "learning_rate": 9.043838453572662e-06, "loss": 2.0089, "step": 1909 }, { "epoch": 1.7800559179869526, "grad_norm": 1.3298282245844524, "learning_rate": 9.036934760096653e-06, "loss": 1.9917, "step": 1910 }, { "epoch": 1.7809878844361604, "grad_norm": 1.2777002587040174, "learning_rate": 9.030031066620642e-06, "loss": 2.0971, "step": 1911 }, { "epoch": 1.7819198508853682, "grad_norm": 1.0892292127019811, "learning_rate": 9.023127373144633e-06, "loss": 1.4204, "step": 1912 }, { "epoch": 1.782851817334576, "grad_norm": 1.069656979608164, "learning_rate": 9.016223679668623e-06, "loss": 1.6634, "step": 1913 }, { "epoch": 1.7837837837837838, "grad_norm": 1.3702853627618703, "learning_rate": 9.009319986192614e-06, "loss": 1.5812, "step": 1914 }, { "epoch": 1.7847157502329916, "grad_norm": 1.4677564681694832, "learning_rate": 9.002416292716603e-06, "loss": 1.8163, "step": 1915 }, { "epoch": 1.7856477166821993, "grad_norm": 1.6716193728455115, "learning_rate": 8.995512599240594e-06, "loss": 1.8569, "step": 1916 }, { "epoch": 1.7865796831314071, "grad_norm": 1.3061881699823559, "learning_rate": 8.988608905764585e-06, "loss": 1.7265, "step": 1917 }, { "epoch": 1.7875116495806151, "grad_norm": 1.4249134341911203, "learning_rate": 8.981705212288575e-06, "loss": 2.0896, "step": 1918 }, { "epoch": 1.788443616029823, "grad_norm": 1.2502125481989186, "learning_rate": 8.974801518812566e-06, "loss": 1.766, "step": 1919 }, { "epoch": 1.7893755824790307, "grad_norm": 1.141202368014171, "learning_rate": 8.967897825336555e-06, "loss": 1.6641, "step": 1920 }, { "epoch": 1.7903075489282387, "grad_norm": 1.2918348548211411, "learning_rate": 8.960994131860546e-06, "loss": 1.6087, "step": 1921 }, { "epoch": 1.7912395153774465, "grad_norm": 1.372585251630132, "learning_rate": 8.954090438384537e-06, "loss": 1.7192, "step": 1922 }, { "epoch": 1.7921714818266543, "grad_norm": 1.354989886407914, "learning_rate": 8.947186744908527e-06, "loss": 1.8488, "step": 1923 }, { "epoch": 1.793103448275862, "grad_norm": 1.1884255500608214, "learning_rate": 8.940283051432516e-06, "loss": 1.6566, "step": 1924 }, { "epoch": 1.7940354147250699, "grad_norm": 1.5897670831716444, "learning_rate": 8.933379357956507e-06, "loss": 2.2196, "step": 1925 }, { "epoch": 1.7949673811742777, "grad_norm": 1.175600006548345, "learning_rate": 8.926475664480498e-06, "loss": 1.5683, "step": 1926 }, { "epoch": 1.7958993476234855, "grad_norm": 1.20428888696625, "learning_rate": 8.919571971004489e-06, "loss": 1.7549, "step": 1927 }, { "epoch": 1.7968313140726933, "grad_norm": 1.0806711233883488, "learning_rate": 8.91266827752848e-06, "loss": 1.7234, "step": 1928 }, { "epoch": 1.7977632805219013, "grad_norm": 1.2774193143108392, "learning_rate": 8.905764584052468e-06, "loss": 1.9283, "step": 1929 }, { "epoch": 1.798695246971109, "grad_norm": 1.1962521330146214, "learning_rate": 8.898860890576459e-06, "loss": 1.9541, "step": 1930 }, { "epoch": 1.7996272134203168, "grad_norm": 1.2545700230835706, "learning_rate": 8.89195719710045e-06, "loss": 1.7989, "step": 1931 }, { "epoch": 1.8005591798695249, "grad_norm": 1.1010322213488137, "learning_rate": 8.88505350362444e-06, "loss": 1.7218, "step": 1932 }, { "epoch": 1.8014911463187326, "grad_norm": 1.3546653103464066, "learning_rate": 8.87814981014843e-06, "loss": 1.8844, "step": 1933 }, { "epoch": 1.8024231127679404, "grad_norm": 1.276259857506546, "learning_rate": 8.87124611667242e-06, "loss": 1.8847, "step": 1934 }, { "epoch": 1.8033550792171482, "grad_norm": 1.5355043116088187, "learning_rate": 8.864342423196411e-06, "loss": 1.6915, "step": 1935 }, { "epoch": 1.804287045666356, "grad_norm": 1.1561300068168874, "learning_rate": 8.857438729720402e-06, "loss": 1.8676, "step": 1936 }, { "epoch": 1.8052190121155638, "grad_norm": 1.3331457174213568, "learning_rate": 8.850535036244391e-06, "loss": 1.8573, "step": 1937 }, { "epoch": 1.8061509785647716, "grad_norm": 1.0712409880022626, "learning_rate": 8.843631342768382e-06, "loss": 1.5492, "step": 1938 }, { "epoch": 1.8070829450139794, "grad_norm": 1.3229969484314958, "learning_rate": 8.836727649292372e-06, "loss": 1.4866, "step": 1939 }, { "epoch": 1.8080149114631874, "grad_norm": 1.3789262236923083, "learning_rate": 8.829823955816363e-06, "loss": 2.1728, "step": 1940 }, { "epoch": 1.8089468779123952, "grad_norm": 1.6293059953364275, "learning_rate": 8.822920262340354e-06, "loss": 2.0096, "step": 1941 }, { "epoch": 1.809878844361603, "grad_norm": 1.1227838591149646, "learning_rate": 8.816016568864343e-06, "loss": 1.7797, "step": 1942 }, { "epoch": 1.810810810810811, "grad_norm": 1.2616879487239652, "learning_rate": 8.809112875388334e-06, "loss": 1.6488, "step": 1943 }, { "epoch": 1.8117427772600188, "grad_norm": 1.1152482331517983, "learning_rate": 8.802209181912323e-06, "loss": 1.8126, "step": 1944 }, { "epoch": 1.8126747437092265, "grad_norm": 1.3947410986162483, "learning_rate": 8.795305488436315e-06, "loss": 2.0613, "step": 1945 }, { "epoch": 1.8136067101584343, "grad_norm": 1.1821118996471427, "learning_rate": 8.788401794960304e-06, "loss": 1.6566, "step": 1946 }, { "epoch": 1.8145386766076421, "grad_norm": 1.3260127302055316, "learning_rate": 8.781498101484295e-06, "loss": 1.8637, "step": 1947 }, { "epoch": 1.81547064305685, "grad_norm": 1.900423907936321, "learning_rate": 8.774594408008286e-06, "loss": 1.9311, "step": 1948 }, { "epoch": 1.8164026095060577, "grad_norm": 1.0798472499735106, "learning_rate": 8.767690714532275e-06, "loss": 1.6399, "step": 1949 }, { "epoch": 1.8173345759552655, "grad_norm": 1.184721805451584, "learning_rate": 8.760787021056267e-06, "loss": 1.5768, "step": 1950 }, { "epoch": 1.8182665424044733, "grad_norm": 1.2196476901379882, "learning_rate": 8.753883327580256e-06, "loss": 1.6317, "step": 1951 }, { "epoch": 1.8191985088536813, "grad_norm": 1.3087208295187782, "learning_rate": 8.746979634104247e-06, "loss": 1.6574, "step": 1952 }, { "epoch": 1.820130475302889, "grad_norm": 1.451367054212358, "learning_rate": 8.740075940628236e-06, "loss": 1.7192, "step": 1953 }, { "epoch": 1.821062441752097, "grad_norm": 1.5459374679714204, "learning_rate": 8.733172247152227e-06, "loss": 1.9459, "step": 1954 }, { "epoch": 1.8219944082013049, "grad_norm": 1.1496437557654184, "learning_rate": 8.726268553676217e-06, "loss": 1.5882, "step": 1955 }, { "epoch": 1.8229263746505127, "grad_norm": 1.4458241901790891, "learning_rate": 8.719364860200208e-06, "loss": 1.8321, "step": 1956 }, { "epoch": 1.8238583410997204, "grad_norm": 1.2275160571708033, "learning_rate": 8.712461166724197e-06, "loss": 2.1341, "step": 1957 }, { "epoch": 1.8247903075489282, "grad_norm": 1.542275035598196, "learning_rate": 8.705557473248188e-06, "loss": 1.8453, "step": 1958 }, { "epoch": 1.825722273998136, "grad_norm": 1.3502371681730256, "learning_rate": 8.698653779772179e-06, "loss": 2.1757, "step": 1959 }, { "epoch": 1.8266542404473438, "grad_norm": 1.28324086261217, "learning_rate": 8.69175008629617e-06, "loss": 2.1559, "step": 1960 }, { "epoch": 1.8275862068965516, "grad_norm": 1.0869202418410504, "learning_rate": 8.68484639282016e-06, "loss": 1.7675, "step": 1961 }, { "epoch": 1.8285181733457594, "grad_norm": 1.2693676537206617, "learning_rate": 8.677942699344149e-06, "loss": 2.1195, "step": 1962 }, { "epoch": 1.8294501397949674, "grad_norm": 1.4048745793057962, "learning_rate": 8.67103900586814e-06, "loss": 2.1733, "step": 1963 }, { "epoch": 1.8303821062441752, "grad_norm": 1.2062390883938736, "learning_rate": 8.66413531239213e-06, "loss": 1.645, "step": 1964 }, { "epoch": 1.8313140726933832, "grad_norm": 1.1112748791950986, "learning_rate": 8.657231618916121e-06, "loss": 1.9187, "step": 1965 }, { "epoch": 1.832246039142591, "grad_norm": 1.2434622423278845, "learning_rate": 8.65032792544011e-06, "loss": 1.5112, "step": 1966 }, { "epoch": 1.8331780055917988, "grad_norm": 1.2177000397225028, "learning_rate": 8.643424231964101e-06, "loss": 1.9501, "step": 1967 }, { "epoch": 1.8341099720410066, "grad_norm": 1.4735842580675225, "learning_rate": 8.636520538488092e-06, "loss": 1.9407, "step": 1968 }, { "epoch": 1.8350419384902144, "grad_norm": 1.2595844440820285, "learning_rate": 8.629616845012082e-06, "loss": 2.0837, "step": 1969 }, { "epoch": 1.8359739049394221, "grad_norm": 1.213430411328567, "learning_rate": 8.622713151536073e-06, "loss": 1.7211, "step": 1970 }, { "epoch": 1.83690587138863, "grad_norm": 1.1709300850586268, "learning_rate": 8.615809458060062e-06, "loss": 1.7466, "step": 1971 }, { "epoch": 1.8378378378378377, "grad_norm": 1.3405882373330338, "learning_rate": 8.608905764584053e-06, "loss": 1.7528, "step": 1972 }, { "epoch": 1.8387698042870455, "grad_norm": 1.255248751241592, "learning_rate": 8.602002071108044e-06, "loss": 1.8474, "step": 1973 }, { "epoch": 1.8397017707362535, "grad_norm": 1.5414974073817664, "learning_rate": 8.595098377632034e-06, "loss": 1.9407, "step": 1974 }, { "epoch": 1.8406337371854613, "grad_norm": 1.5241171132707272, "learning_rate": 8.588194684156023e-06, "loss": 1.7536, "step": 1975 }, { "epoch": 1.8415657036346693, "grad_norm": 1.21852386993819, "learning_rate": 8.581290990680014e-06, "loss": 1.4888, "step": 1976 }, { "epoch": 1.842497670083877, "grad_norm": 1.1747652719970276, "learning_rate": 8.574387297204005e-06, "loss": 1.7637, "step": 1977 }, { "epoch": 1.843429636533085, "grad_norm": 1.2174931395240522, "learning_rate": 8.567483603727996e-06, "loss": 1.7358, "step": 1978 }, { "epoch": 1.8443616029822927, "grad_norm": 1.683424532849489, "learning_rate": 8.560579910251986e-06, "loss": 1.8643, "step": 1979 }, { "epoch": 1.8452935694315005, "grad_norm": 1.1844853514293603, "learning_rate": 8.553676216775975e-06, "loss": 1.7625, "step": 1980 }, { "epoch": 1.8462255358807083, "grad_norm": 1.308745122045996, "learning_rate": 8.546772523299966e-06, "loss": 1.9435, "step": 1981 }, { "epoch": 1.847157502329916, "grad_norm": 1.2338011883490057, "learning_rate": 8.539868829823957e-06, "loss": 1.707, "step": 1982 }, { "epoch": 1.8480894687791238, "grad_norm": 1.3026183278484509, "learning_rate": 8.532965136347948e-06, "loss": 1.8012, "step": 1983 }, { "epoch": 1.8490214352283316, "grad_norm": 1.3416846952204704, "learning_rate": 8.526061442871937e-06, "loss": 1.7043, "step": 1984 }, { "epoch": 1.8499534016775396, "grad_norm": 1.3798740163709187, "learning_rate": 8.519157749395927e-06, "loss": 1.5448, "step": 1985 }, { "epoch": 1.8508853681267474, "grad_norm": 1.7152798220004306, "learning_rate": 8.512254055919918e-06, "loss": 1.8282, "step": 1986 }, { "epoch": 1.8518173345759554, "grad_norm": 1.1462362686074137, "learning_rate": 8.505350362443909e-06, "loss": 1.6986, "step": 1987 }, { "epoch": 1.8527493010251632, "grad_norm": 1.2491421616793896, "learning_rate": 8.4984466689679e-06, "loss": 1.6087, "step": 1988 }, { "epoch": 1.853681267474371, "grad_norm": 1.1893467872758745, "learning_rate": 8.491542975491889e-06, "loss": 1.8911, "step": 1989 }, { "epoch": 1.8546132339235788, "grad_norm": 1.3974399465932057, "learning_rate": 8.48463928201588e-06, "loss": 1.9502, "step": 1990 }, { "epoch": 1.8555452003727866, "grad_norm": 1.1510040182823575, "learning_rate": 8.47773558853987e-06, "loss": 1.799, "step": 1991 }, { "epoch": 1.8564771668219944, "grad_norm": 1.2869289778579933, "learning_rate": 8.47083189506386e-06, "loss": 2.1987, "step": 1992 }, { "epoch": 1.8574091332712022, "grad_norm": 1.342478116966376, "learning_rate": 8.46392820158785e-06, "loss": 2.0699, "step": 1993 }, { "epoch": 1.85834109972041, "grad_norm": 1.257110291571919, "learning_rate": 8.45702450811184e-06, "loss": 1.8652, "step": 1994 }, { "epoch": 1.8592730661696177, "grad_norm": 1.3030594176984625, "learning_rate": 8.45012081463583e-06, "loss": 1.912, "step": 1995 }, { "epoch": 1.8602050326188257, "grad_norm": 1.2107382556010624, "learning_rate": 8.443217121159822e-06, "loss": 1.8418, "step": 1996 }, { "epoch": 1.8611369990680335, "grad_norm": 1.2463657898332037, "learning_rate": 8.436313427683811e-06, "loss": 1.6027, "step": 1997 }, { "epoch": 1.8620689655172413, "grad_norm": 1.3118301545556628, "learning_rate": 8.429409734207802e-06, "loss": 1.7316, "step": 1998 }, { "epoch": 1.8630009319664493, "grad_norm": 1.2662534041726683, "learning_rate": 8.422506040731793e-06, "loss": 2.0135, "step": 1999 }, { "epoch": 1.8639328984156571, "grad_norm": 1.4451191715858185, "learning_rate": 8.415602347255782e-06, "loss": 2.403, "step": 2000 }, { "epoch": 1.864864864864865, "grad_norm": 1.474745561791836, "learning_rate": 8.408698653779774e-06, "loss": 1.7413, "step": 2001 }, { "epoch": 1.8657968313140727, "grad_norm": 1.3143004195540338, "learning_rate": 8.401794960303763e-06, "loss": 1.9426, "step": 2002 }, { "epoch": 1.8667287977632805, "grad_norm": 1.4479320152268165, "learning_rate": 8.394891266827754e-06, "loss": 1.862, "step": 2003 }, { "epoch": 1.8676607642124883, "grad_norm": 1.5915853318288593, "learning_rate": 8.387987573351743e-06, "loss": 2.1534, "step": 2004 }, { "epoch": 1.868592730661696, "grad_norm": 0.9646061142361378, "learning_rate": 8.381083879875734e-06, "loss": 1.5478, "step": 2005 }, { "epoch": 1.8695246971109039, "grad_norm": 1.5099321372372345, "learning_rate": 8.374180186399724e-06, "loss": 2.0934, "step": 2006 }, { "epoch": 1.8704566635601119, "grad_norm": 1.259685775101139, "learning_rate": 8.367276492923715e-06, "loss": 1.8553, "step": 2007 }, { "epoch": 1.8713886300093197, "grad_norm": 1.525831675501623, "learning_rate": 8.360372799447706e-06, "loss": 2.0389, "step": 2008 }, { "epoch": 1.8723205964585274, "grad_norm": 1.475545744339609, "learning_rate": 8.353469105971695e-06, "loss": 2.0247, "step": 2009 }, { "epoch": 1.8732525629077355, "grad_norm": 1.2696796915056987, "learning_rate": 8.346565412495686e-06, "loss": 1.6981, "step": 2010 }, { "epoch": 1.8741845293569432, "grad_norm": 1.3302720226013156, "learning_rate": 8.339661719019676e-06, "loss": 1.7356, "step": 2011 }, { "epoch": 1.875116495806151, "grad_norm": 1.4006419766591158, "learning_rate": 8.332758025543667e-06, "loss": 1.8328, "step": 2012 }, { "epoch": 1.8760484622553588, "grad_norm": 1.0672485985518678, "learning_rate": 8.325854332067656e-06, "loss": 1.5649, "step": 2013 }, { "epoch": 1.8769804287045666, "grad_norm": 1.3812986927216453, "learning_rate": 8.318950638591647e-06, "loss": 1.6981, "step": 2014 }, { "epoch": 1.8779123951537744, "grad_norm": 1.1592670034767574, "learning_rate": 8.312046945115638e-06, "loss": 1.6228, "step": 2015 }, { "epoch": 1.8788443616029822, "grad_norm": 1.1083787782931998, "learning_rate": 8.305143251639628e-06, "loss": 1.5473, "step": 2016 }, { "epoch": 1.87977632805219, "grad_norm": 1.5758493497826007, "learning_rate": 8.298239558163617e-06, "loss": 1.9106, "step": 2017 }, { "epoch": 1.880708294501398, "grad_norm": 1.2024095465827718, "learning_rate": 8.291335864687608e-06, "loss": 1.5781, "step": 2018 }, { "epoch": 1.8816402609506058, "grad_norm": 1.3479304002431298, "learning_rate": 8.284432171211599e-06, "loss": 2.0431, "step": 2019 }, { "epoch": 1.8825722273998136, "grad_norm": 1.2580520265682809, "learning_rate": 8.27752847773559e-06, "loss": 1.8113, "step": 2020 }, { "epoch": 1.8835041938490216, "grad_norm": 1.2989399774904213, "learning_rate": 8.27062478425958e-06, "loss": 1.9128, "step": 2021 }, { "epoch": 1.8844361602982294, "grad_norm": 1.5985186759535184, "learning_rate": 8.26372109078357e-06, "loss": 1.9199, "step": 2022 }, { "epoch": 1.8853681267474371, "grad_norm": 1.5342355609099072, "learning_rate": 8.25681739730756e-06, "loss": 1.972, "step": 2023 }, { "epoch": 1.886300093196645, "grad_norm": 1.7829153920228207, "learning_rate": 8.24991370383155e-06, "loss": 1.9092, "step": 2024 }, { "epoch": 1.8872320596458527, "grad_norm": 1.2629627557498098, "learning_rate": 8.243010010355541e-06, "loss": 1.6028, "step": 2025 }, { "epoch": 1.8881640260950605, "grad_norm": 1.2469623311367204, "learning_rate": 8.23610631687953e-06, "loss": 1.8077, "step": 2026 }, { "epoch": 1.8890959925442683, "grad_norm": 1.3626207511274586, "learning_rate": 8.229202623403521e-06, "loss": 1.6002, "step": 2027 }, { "epoch": 1.890027958993476, "grad_norm": 1.388789742148492, "learning_rate": 8.222298929927512e-06, "loss": 2.0087, "step": 2028 }, { "epoch": 1.890959925442684, "grad_norm": 1.337925913029777, "learning_rate": 8.215395236451503e-06, "loss": 1.8437, "step": 2029 }, { "epoch": 1.8918918918918919, "grad_norm": 1.2170852271135393, "learning_rate": 8.208491542975493e-06, "loss": 1.6818, "step": 2030 }, { "epoch": 1.8928238583410997, "grad_norm": 1.2185683636441877, "learning_rate": 8.201587849499482e-06, "loss": 1.7723, "step": 2031 }, { "epoch": 1.8937558247903077, "grad_norm": 1.3117992180004052, "learning_rate": 8.194684156023473e-06, "loss": 1.8004, "step": 2032 }, { "epoch": 1.8946877912395155, "grad_norm": 1.0175259847397515, "learning_rate": 8.187780462547464e-06, "loss": 1.3445, "step": 2033 }, { "epoch": 1.8956197576887233, "grad_norm": 1.2929650249044045, "learning_rate": 8.180876769071455e-06, "loss": 1.5852, "step": 2034 }, { "epoch": 1.896551724137931, "grad_norm": 1.2927375484525248, "learning_rate": 8.173973075595444e-06, "loss": 2.0418, "step": 2035 }, { "epoch": 1.8974836905871388, "grad_norm": 1.4264677032921576, "learning_rate": 8.167069382119434e-06, "loss": 1.8482, "step": 2036 }, { "epoch": 1.8984156570363466, "grad_norm": 1.064375734843864, "learning_rate": 8.160165688643425e-06, "loss": 1.4785, "step": 2037 }, { "epoch": 1.8993476234855544, "grad_norm": 1.2875831471920816, "learning_rate": 8.153261995167416e-06, "loss": 2.2123, "step": 2038 }, { "epoch": 1.9002795899347622, "grad_norm": 1.3765157708786684, "learning_rate": 8.146358301691407e-06, "loss": 2.042, "step": 2039 }, { "epoch": 1.9012115563839702, "grad_norm": 1.3280316974954507, "learning_rate": 8.139454608215396e-06, "loss": 1.5268, "step": 2040 }, { "epoch": 1.902143522833178, "grad_norm": 1.3299605355312143, "learning_rate": 8.132550914739386e-06, "loss": 1.9697, "step": 2041 }, { "epoch": 1.9030754892823858, "grad_norm": 1.2486741904642629, "learning_rate": 8.125647221263377e-06, "loss": 1.9547, "step": 2042 }, { "epoch": 1.9040074557315938, "grad_norm": 1.141160032428745, "learning_rate": 8.118743527787368e-06, "loss": 1.5967, "step": 2043 }, { "epoch": 1.9049394221808016, "grad_norm": 1.2622224863489264, "learning_rate": 8.111839834311357e-06, "loss": 2.0924, "step": 2044 }, { "epoch": 1.9058713886300094, "grad_norm": 1.473008214095865, "learning_rate": 8.104936140835348e-06, "loss": 1.8875, "step": 2045 }, { "epoch": 1.9068033550792172, "grad_norm": 1.2843750371380447, "learning_rate": 8.098032447359337e-06, "loss": 1.8109, "step": 2046 }, { "epoch": 1.907735321528425, "grad_norm": 1.217019019223209, "learning_rate": 8.091128753883329e-06, "loss": 1.6787, "step": 2047 }, { "epoch": 1.9086672879776327, "grad_norm": 1.5666874861021547, "learning_rate": 8.084225060407318e-06, "loss": 1.9846, "step": 2048 }, { "epoch": 1.9095992544268405, "grad_norm": 1.2086958465659114, "learning_rate": 8.077321366931309e-06, "loss": 1.9798, "step": 2049 }, { "epoch": 1.9105312208760483, "grad_norm": 1.2992087805244654, "learning_rate": 8.0704176734553e-06, "loss": 1.8046, "step": 2050 }, { "epoch": 1.9114631873252563, "grad_norm": 1.2796425169417527, "learning_rate": 8.063513979979289e-06, "loss": 1.9403, "step": 2051 }, { "epoch": 1.9123951537744641, "grad_norm": 1.403496039127196, "learning_rate": 8.056610286503281e-06, "loss": 1.7938, "step": 2052 }, { "epoch": 1.913327120223672, "grad_norm": 1.4579445092646288, "learning_rate": 8.04970659302727e-06, "loss": 1.7443, "step": 2053 }, { "epoch": 1.91425908667288, "grad_norm": 1.2720032196156652, "learning_rate": 8.04280289955126e-06, "loss": 1.737, "step": 2054 }, { "epoch": 1.9151910531220877, "grad_norm": 1.2506585835425823, "learning_rate": 8.03589920607525e-06, "loss": 1.9784, "step": 2055 }, { "epoch": 1.9161230195712955, "grad_norm": 1.0238023467528166, "learning_rate": 8.02899551259924e-06, "loss": 1.6841, "step": 2056 }, { "epoch": 1.9170549860205033, "grad_norm": 1.2989236959710826, "learning_rate": 8.022091819123231e-06, "loss": 2.0695, "step": 2057 }, { "epoch": 1.917986952469711, "grad_norm": 1.256876525099401, "learning_rate": 8.015188125647222e-06, "loss": 2.1085, "step": 2058 }, { "epoch": 1.9189189189189189, "grad_norm": 1.3757250876143057, "learning_rate": 8.008284432171213e-06, "loss": 1.9057, "step": 2059 }, { "epoch": 1.9198508853681266, "grad_norm": 1.5551494984489826, "learning_rate": 8.001380738695202e-06, "loss": 1.7343, "step": 2060 }, { "epoch": 1.9207828518173344, "grad_norm": 1.5674659086643015, "learning_rate": 7.994477045219193e-06, "loss": 1.9786, "step": 2061 }, { "epoch": 1.9217148182665424, "grad_norm": 1.0559307556941189, "learning_rate": 7.987573351743183e-06, "loss": 1.798, "step": 2062 }, { "epoch": 1.9226467847157502, "grad_norm": 1.2997353225662163, "learning_rate": 7.980669658267174e-06, "loss": 1.9205, "step": 2063 }, { "epoch": 1.923578751164958, "grad_norm": 1.104604726177047, "learning_rate": 7.973765964791163e-06, "loss": 1.7346, "step": 2064 }, { "epoch": 1.924510717614166, "grad_norm": 1.2573848733130675, "learning_rate": 7.966862271315154e-06, "loss": 1.8291, "step": 2065 }, { "epoch": 1.9254426840633738, "grad_norm": 1.1369630537230209, "learning_rate": 7.959958577839145e-06, "loss": 1.4423, "step": 2066 }, { "epoch": 1.9263746505125816, "grad_norm": 1.4048853339863125, "learning_rate": 7.953054884363135e-06, "loss": 1.7326, "step": 2067 }, { "epoch": 1.9273066169617894, "grad_norm": 1.229661378152866, "learning_rate": 7.946151190887124e-06, "loss": 1.8976, "step": 2068 }, { "epoch": 1.9282385834109972, "grad_norm": 1.1029080805798746, "learning_rate": 7.939247497411115e-06, "loss": 1.6052, "step": 2069 }, { "epoch": 1.929170549860205, "grad_norm": 1.1902390428672545, "learning_rate": 7.932343803935106e-06, "loss": 1.9208, "step": 2070 }, { "epoch": 1.9301025163094128, "grad_norm": 1.2005974008441822, "learning_rate": 7.925440110459096e-06, "loss": 1.8673, "step": 2071 }, { "epoch": 1.9310344827586206, "grad_norm": 1.3172943822873975, "learning_rate": 7.918536416983087e-06, "loss": 1.8526, "step": 2072 }, { "epoch": 1.9319664492078286, "grad_norm": 1.4624467508969041, "learning_rate": 7.911632723507076e-06, "loss": 1.7485, "step": 2073 }, { "epoch": 1.9328984156570364, "grad_norm": 1.3255308100067684, "learning_rate": 7.904729030031067e-06, "loss": 1.6278, "step": 2074 }, { "epoch": 1.9338303821062441, "grad_norm": 1.2387214295057174, "learning_rate": 7.897825336555058e-06, "loss": 1.8103, "step": 2075 }, { "epoch": 1.9347623485554521, "grad_norm": 1.2569627113034514, "learning_rate": 7.890921643079048e-06, "loss": 1.6732, "step": 2076 }, { "epoch": 1.93569431500466, "grad_norm": 1.0208713939926717, "learning_rate": 7.884017949603038e-06, "loss": 1.5521, "step": 2077 }, { "epoch": 1.9366262814538677, "grad_norm": 1.1510206596401016, "learning_rate": 7.877114256127028e-06, "loss": 1.9237, "step": 2078 }, { "epoch": 1.9375582479030755, "grad_norm": 1.2847840008230322, "learning_rate": 7.870210562651019e-06, "loss": 1.8025, "step": 2079 }, { "epoch": 1.9384902143522833, "grad_norm": 1.288801745805762, "learning_rate": 7.86330686917501e-06, "loss": 1.8535, "step": 2080 }, { "epoch": 1.939422180801491, "grad_norm": 2.551824503771259, "learning_rate": 7.856403175699e-06, "loss": 1.8201, "step": 2081 }, { "epoch": 1.9403541472506989, "grad_norm": 1.1247102523993067, "learning_rate": 7.84949948222299e-06, "loss": 1.8206, "step": 2082 }, { "epoch": 1.9412861136999067, "grad_norm": 1.3203253097245597, "learning_rate": 7.84259578874698e-06, "loss": 1.9327, "step": 2083 }, { "epoch": 1.9422180801491147, "grad_norm": 1.170646797613728, "learning_rate": 7.835692095270971e-06, "loss": 1.5199, "step": 2084 }, { "epoch": 1.9431500465983225, "grad_norm": 1.16068757764227, "learning_rate": 7.828788401794962e-06, "loss": 1.918, "step": 2085 }, { "epoch": 1.9440820130475303, "grad_norm": 1.3245450950362385, "learning_rate": 7.82188470831895e-06, "loss": 1.6672, "step": 2086 }, { "epoch": 1.9450139794967383, "grad_norm": 1.6840262312566443, "learning_rate": 7.814981014842941e-06, "loss": 2.2966, "step": 2087 }, { "epoch": 1.945945945945946, "grad_norm": 1.2029552619247699, "learning_rate": 7.808077321366932e-06, "loss": 1.9528, "step": 2088 }, { "epoch": 1.9468779123951538, "grad_norm": 1.2926810155295725, "learning_rate": 7.801173627890923e-06, "loss": 1.8619, "step": 2089 }, { "epoch": 1.9478098788443616, "grad_norm": 2.7983758912161023, "learning_rate": 7.794269934414914e-06, "loss": 1.7281, "step": 2090 }, { "epoch": 1.9487418452935694, "grad_norm": 1.0630604948333162, "learning_rate": 7.787366240938903e-06, "loss": 1.7824, "step": 2091 }, { "epoch": 1.9496738117427772, "grad_norm": 1.209734601263592, "learning_rate": 7.780462547462893e-06, "loss": 1.6291, "step": 2092 }, { "epoch": 1.950605778191985, "grad_norm": 1.4604918388208166, "learning_rate": 7.773558853986884e-06, "loss": 2.05, "step": 2093 }, { "epoch": 1.9515377446411928, "grad_norm": 1.09436888140609, "learning_rate": 7.766655160510875e-06, "loss": 1.7903, "step": 2094 }, { "epoch": 1.9524697110904008, "grad_norm": 1.2929318335634699, "learning_rate": 7.759751467034864e-06, "loss": 1.663, "step": 2095 }, { "epoch": 1.9534016775396086, "grad_norm": 1.2815000509164984, "learning_rate": 7.752847773558855e-06, "loss": 1.8143, "step": 2096 }, { "epoch": 1.9543336439888164, "grad_norm": 1.0694922363210133, "learning_rate": 7.745944080082844e-06, "loss": 1.7438, "step": 2097 }, { "epoch": 1.9552656104380244, "grad_norm": 1.2899006015571541, "learning_rate": 7.739040386606836e-06, "loss": 1.9191, "step": 2098 }, { "epoch": 1.9561975768872322, "grad_norm": 1.1213609008866592, "learning_rate": 7.732136693130827e-06, "loss": 1.6319, "step": 2099 }, { "epoch": 1.95712954333644, "grad_norm": 1.2151879185163208, "learning_rate": 7.725232999654816e-06, "loss": 1.9624, "step": 2100 }, { "epoch": 1.9580615097856477, "grad_norm": 1.2203518637319162, "learning_rate": 7.718329306178807e-06, "loss": 1.9316, "step": 2101 }, { "epoch": 1.9589934762348555, "grad_norm": 1.3699668332778676, "learning_rate": 7.711425612702796e-06, "loss": 1.5348, "step": 2102 }, { "epoch": 1.9599254426840633, "grad_norm": 1.1523224151073246, "learning_rate": 7.704521919226788e-06, "loss": 2.0487, "step": 2103 }, { "epoch": 1.9608574091332711, "grad_norm": 1.226596263081621, "learning_rate": 7.697618225750777e-06, "loss": 2.0215, "step": 2104 }, { "epoch": 1.961789375582479, "grad_norm": 1.3334399889487234, "learning_rate": 7.690714532274768e-06, "loss": 2.0397, "step": 2105 }, { "epoch": 1.962721342031687, "grad_norm": 1.1895443977487297, "learning_rate": 7.683810838798757e-06, "loss": 1.7097, "step": 2106 }, { "epoch": 1.9636533084808947, "grad_norm": 1.2582286799505384, "learning_rate": 7.676907145322748e-06, "loss": 1.7301, "step": 2107 }, { "epoch": 1.9645852749301025, "grad_norm": 1.2138150812222854, "learning_rate": 7.670003451846738e-06, "loss": 2.149, "step": 2108 }, { "epoch": 1.9655172413793105, "grad_norm": 1.1928121463904928, "learning_rate": 7.663099758370729e-06, "loss": 1.6582, "step": 2109 }, { "epoch": 1.9664492078285183, "grad_norm": 1.553967843501844, "learning_rate": 7.65619606489472e-06, "loss": 1.7117, "step": 2110 }, { "epoch": 1.967381174277726, "grad_norm": 1.272455790316925, "learning_rate": 7.649292371418709e-06, "loss": 1.8232, "step": 2111 }, { "epoch": 1.9683131407269339, "grad_norm": 1.3420281610357436, "learning_rate": 7.6423886779427e-06, "loss": 1.9854, "step": 2112 }, { "epoch": 1.9692451071761417, "grad_norm": 1.452699033158212, "learning_rate": 7.63548498446669e-06, "loss": 1.9149, "step": 2113 }, { "epoch": 1.9701770736253494, "grad_norm": 1.2198431036729647, "learning_rate": 7.628581290990681e-06, "loss": 1.6904, "step": 2114 }, { "epoch": 1.9711090400745572, "grad_norm": 1.1132424661798013, "learning_rate": 7.621677597514671e-06, "loss": 1.7054, "step": 2115 }, { "epoch": 1.972041006523765, "grad_norm": 1.3096851532954064, "learning_rate": 7.614773904038662e-06, "loss": 1.8992, "step": 2116 }, { "epoch": 1.972972972972973, "grad_norm": 1.268510962308587, "learning_rate": 7.6078702105626516e-06, "loss": 1.574, "step": 2117 }, { "epoch": 1.9739049394221808, "grad_norm": 1.3900000939729495, "learning_rate": 7.600966517086642e-06, "loss": 1.6881, "step": 2118 }, { "epoch": 1.9748369058713886, "grad_norm": 1.37958566394102, "learning_rate": 7.594062823610633e-06, "loss": 2.0129, "step": 2119 }, { "epoch": 1.9757688723205966, "grad_norm": 1.2337454691133112, "learning_rate": 7.587159130134623e-06, "loss": 1.8189, "step": 2120 }, { "epoch": 1.9767008387698044, "grad_norm": 1.5201938578272707, "learning_rate": 7.580255436658614e-06, "loss": 1.8467, "step": 2121 }, { "epoch": 1.9776328052190122, "grad_norm": 1.4580665015782843, "learning_rate": 7.5733517431826035e-06, "loss": 1.884, "step": 2122 }, { "epoch": 1.97856477166822, "grad_norm": 1.2584069030176375, "learning_rate": 7.566448049706594e-06, "loss": 1.9997, "step": 2123 }, { "epoch": 1.9794967381174278, "grad_norm": 1.4216426065852306, "learning_rate": 7.559544356230583e-06, "loss": 1.8942, "step": 2124 }, { "epoch": 1.9804287045666356, "grad_norm": 1.3364533731319066, "learning_rate": 7.552640662754575e-06, "loss": 2.0738, "step": 2125 }, { "epoch": 1.9813606710158433, "grad_norm": 1.184593058249597, "learning_rate": 7.545736969278564e-06, "loss": 1.9003, "step": 2126 }, { "epoch": 1.9822926374650511, "grad_norm": 1.445719510751881, "learning_rate": 7.5388332758025555e-06, "loss": 1.8543, "step": 2127 }, { "epoch": 1.983224603914259, "grad_norm": 1.150605812637957, "learning_rate": 7.5319295823265445e-06, "loss": 2.1156, "step": 2128 }, { "epoch": 1.984156570363467, "grad_norm": 1.131858508672884, "learning_rate": 7.525025888850535e-06, "loss": 1.8948, "step": 2129 }, { "epoch": 1.9850885368126747, "grad_norm": 1.1122285244863321, "learning_rate": 7.518122195374527e-06, "loss": 1.8557, "step": 2130 }, { "epoch": 1.9860205032618827, "grad_norm": 1.0980305667951313, "learning_rate": 7.511218501898516e-06, "loss": 1.5928, "step": 2131 }, { "epoch": 1.9869524697110905, "grad_norm": 1.2470767474962416, "learning_rate": 7.504314808422507e-06, "loss": 1.8114, "step": 2132 }, { "epoch": 1.9878844361602983, "grad_norm": 1.151322345689448, "learning_rate": 7.4974111149464965e-06, "loss": 1.7084, "step": 2133 }, { "epoch": 1.988816402609506, "grad_norm": 1.1998610247133996, "learning_rate": 7.490507421470487e-06, "loss": 1.7177, "step": 2134 }, { "epoch": 1.9897483690587139, "grad_norm": 1.160528648854474, "learning_rate": 7.483603727994477e-06, "loss": 1.6847, "step": 2135 }, { "epoch": 1.9906803355079217, "grad_norm": 1.24586747276362, "learning_rate": 7.476700034518468e-06, "loss": 1.8244, "step": 2136 }, { "epoch": 1.9916123019571295, "grad_norm": 1.8301253195381046, "learning_rate": 7.469796341042458e-06, "loss": 1.7156, "step": 2137 }, { "epoch": 1.9925442684063372, "grad_norm": 1.0292131271084457, "learning_rate": 7.4628926475664485e-06, "loss": 1.7582, "step": 2138 }, { "epoch": 1.993476234855545, "grad_norm": 1.1730172526344549, "learning_rate": 7.455988954090439e-06, "loss": 1.8161, "step": 2139 }, { "epoch": 1.994408201304753, "grad_norm": 1.0847001503571085, "learning_rate": 7.449085260614429e-06, "loss": 1.7081, "step": 2140 }, { "epoch": 1.9953401677539608, "grad_norm": 1.217181421988096, "learning_rate": 7.44218156713842e-06, "loss": 1.7809, "step": 2141 }, { "epoch": 1.9962721342031688, "grad_norm": 1.1490535059732583, "learning_rate": 7.43527787366241e-06, "loss": 1.7747, "step": 2142 }, { "epoch": 1.9972041006523766, "grad_norm": 1.2899653509023674, "learning_rate": 7.4283741801864004e-06, "loss": 1.7836, "step": 2143 }, { "epoch": 1.9981360671015844, "grad_norm": 1.1340559511106258, "learning_rate": 7.42147048671039e-06, "loss": 1.4816, "step": 2144 }, { "epoch": 1.9990680335507922, "grad_norm": 1.1903148555353222, "learning_rate": 7.414566793234381e-06, "loss": 1.5332, "step": 2145 }, { "epoch": 2.0, "grad_norm": 1.172224803687257, "learning_rate": 7.407663099758371e-06, "loss": 1.5417, "step": 2146 }, { "epoch": 2.000931966449208, "grad_norm": 1.0846565291913812, "learning_rate": 7.400759406282362e-06, "loss": 2.002, "step": 2147 }, { "epoch": 2.0018639328984156, "grad_norm": 1.3425075584379753, "learning_rate": 7.3938557128063516e-06, "loss": 1.6302, "step": 2148 }, { "epoch": 2.0027958993476234, "grad_norm": 1.1315325802475265, "learning_rate": 7.386952019330342e-06, "loss": 1.7122, "step": 2149 }, { "epoch": 2.003727865796831, "grad_norm": 0.9445807454359055, "learning_rate": 7.380048325854333e-06, "loss": 1.5866, "step": 2150 }, { "epoch": 2.004659832246039, "grad_norm": 1.1095815490990095, "learning_rate": 7.373144632378323e-06, "loss": 1.6341, "step": 2151 }, { "epoch": 2.005591798695247, "grad_norm": 1.1806841171962446, "learning_rate": 7.366240938902314e-06, "loss": 1.3834, "step": 2152 }, { "epoch": 2.006523765144455, "grad_norm": 1.1200195762497984, "learning_rate": 7.3593372454263035e-06, "loss": 1.511, "step": 2153 }, { "epoch": 2.0074557315936628, "grad_norm": 0.9267448090979961, "learning_rate": 7.352433551950294e-06, "loss": 1.3588, "step": 2154 }, { "epoch": 2.0083876980428705, "grad_norm": 1.026428890863551, "learning_rate": 7.345529858474284e-06, "loss": 1.595, "step": 2155 }, { "epoch": 2.0093196644920783, "grad_norm": 1.1673323292274407, "learning_rate": 7.338626164998275e-06, "loss": 1.7241, "step": 2156 }, { "epoch": 2.010251630941286, "grad_norm": 1.2625156400818158, "learning_rate": 7.331722471522265e-06, "loss": 1.6283, "step": 2157 }, { "epoch": 2.011183597390494, "grad_norm": 1.0839514393454923, "learning_rate": 7.3248187780462555e-06, "loss": 1.5698, "step": 2158 }, { "epoch": 2.0121155638397017, "grad_norm": 1.2749896994508116, "learning_rate": 7.317915084570245e-06, "loss": 1.5884, "step": 2159 }, { "epoch": 2.0130475302889095, "grad_norm": 1.3103594809554628, "learning_rate": 7.311011391094236e-06, "loss": 1.5039, "step": 2160 }, { "epoch": 2.0139794967381173, "grad_norm": 1.22003995630736, "learning_rate": 7.304107697618227e-06, "loss": 1.7905, "step": 2161 }, { "epoch": 2.014911463187325, "grad_norm": 1.35376609502147, "learning_rate": 7.297204004142217e-06, "loss": 1.3078, "step": 2162 }, { "epoch": 2.0158434296365333, "grad_norm": 1.1394990315480968, "learning_rate": 7.2903003106662075e-06, "loss": 1.8135, "step": 2163 }, { "epoch": 2.016775396085741, "grad_norm": 1.381784283924496, "learning_rate": 7.283396617190197e-06, "loss": 1.907, "step": 2164 }, { "epoch": 2.017707362534949, "grad_norm": 1.1513551888859521, "learning_rate": 7.276492923714188e-06, "loss": 1.4956, "step": 2165 }, { "epoch": 2.0186393289841567, "grad_norm": 1.2376427168860467, "learning_rate": 7.269589230238178e-06, "loss": 1.5015, "step": 2166 }, { "epoch": 2.0195712954333644, "grad_norm": 1.1630007085699932, "learning_rate": 7.262685536762169e-06, "loss": 2.0709, "step": 2167 }, { "epoch": 2.0205032618825722, "grad_norm": 1.1824261473765436, "learning_rate": 7.255781843286159e-06, "loss": 1.5631, "step": 2168 }, { "epoch": 2.02143522833178, "grad_norm": 1.0951276579997478, "learning_rate": 7.248878149810149e-06, "loss": 1.5861, "step": 2169 }, { "epoch": 2.022367194780988, "grad_norm": 1.4699115108254655, "learning_rate": 7.24197445633414e-06, "loss": 1.8531, "step": 2170 }, { "epoch": 2.0232991612301956, "grad_norm": 1.1996381484973357, "learning_rate": 7.23507076285813e-06, "loss": 1.5964, "step": 2171 }, { "epoch": 2.0242311276794034, "grad_norm": 1.1166862938101423, "learning_rate": 7.228167069382121e-06, "loss": 1.5923, "step": 2172 }, { "epoch": 2.025163094128611, "grad_norm": 0.9337897500196034, "learning_rate": 7.2212633759061106e-06, "loss": 1.4943, "step": 2173 }, { "epoch": 2.0260950605778194, "grad_norm": 1.1521951882777541, "learning_rate": 7.214359682430101e-06, "loss": 1.5024, "step": 2174 }, { "epoch": 2.027027027027027, "grad_norm": 1.306431274527095, "learning_rate": 7.20745598895409e-06, "loss": 1.9891, "step": 2175 }, { "epoch": 2.027958993476235, "grad_norm": 1.2751123722300386, "learning_rate": 7.200552295478082e-06, "loss": 1.5698, "step": 2176 }, { "epoch": 2.0288909599254428, "grad_norm": 1.2746141663562773, "learning_rate": 7.193648602002071e-06, "loss": 1.8664, "step": 2177 }, { "epoch": 2.0298229263746506, "grad_norm": 1.5922889940724645, "learning_rate": 7.1867449085260625e-06, "loss": 1.6688, "step": 2178 }, { "epoch": 2.0307548928238583, "grad_norm": 1.6301835581554949, "learning_rate": 7.1798412150500516e-06, "loss": 1.6858, "step": 2179 }, { "epoch": 2.031686859273066, "grad_norm": 0.9755621123953647, "learning_rate": 7.172937521574042e-06, "loss": 1.4196, "step": 2180 }, { "epoch": 2.032618825722274, "grad_norm": 1.1783438413318625, "learning_rate": 7.166033828098034e-06, "loss": 1.6012, "step": 2181 }, { "epoch": 2.0335507921714817, "grad_norm": 1.0739532748448979, "learning_rate": 7.159130134622023e-06, "loss": 1.265, "step": 2182 }, { "epoch": 2.0344827586206895, "grad_norm": 1.4237492950123183, "learning_rate": 7.152226441146014e-06, "loss": 1.8897, "step": 2183 }, { "epoch": 2.0354147250698973, "grad_norm": 1.7412516118390102, "learning_rate": 7.1453227476700035e-06, "loss": 1.7809, "step": 2184 }, { "epoch": 2.0363466915191055, "grad_norm": 1.3039837065124855, "learning_rate": 7.138419054193994e-06, "loss": 1.4228, "step": 2185 }, { "epoch": 2.0372786579683133, "grad_norm": 1.3227006402288104, "learning_rate": 7.131515360717984e-06, "loss": 1.5042, "step": 2186 }, { "epoch": 2.038210624417521, "grad_norm": 1.2215899317430383, "learning_rate": 7.124611667241975e-06, "loss": 1.7755, "step": 2187 }, { "epoch": 2.039142590866729, "grad_norm": 1.4154994090263293, "learning_rate": 7.117707973765965e-06, "loss": 1.8064, "step": 2188 }, { "epoch": 2.0400745573159367, "grad_norm": 1.0158726099882174, "learning_rate": 7.1108042802899555e-06, "loss": 1.3852, "step": 2189 }, { "epoch": 2.0410065237651445, "grad_norm": 1.4758080263526974, "learning_rate": 7.103900586813946e-06, "loss": 1.778, "step": 2190 }, { "epoch": 2.0419384902143523, "grad_norm": 1.388180893945507, "learning_rate": 7.096996893337936e-06, "loss": 1.6939, "step": 2191 }, { "epoch": 2.04287045666356, "grad_norm": 1.1880034409344584, "learning_rate": 7.090093199861927e-06, "loss": 1.9325, "step": 2192 }, { "epoch": 2.043802423112768, "grad_norm": 1.2738451063900649, "learning_rate": 7.083189506385917e-06, "loss": 1.5348, "step": 2193 }, { "epoch": 2.0447343895619756, "grad_norm": 1.1152865710357533, "learning_rate": 7.0762858129099075e-06, "loss": 1.834, "step": 2194 }, { "epoch": 2.0456663560111834, "grad_norm": 1.30945114668667, "learning_rate": 7.069382119433897e-06, "loss": 1.8478, "step": 2195 }, { "epoch": 2.0465983224603916, "grad_norm": 1.116140603004531, "learning_rate": 7.062478425957888e-06, "loss": 1.5151, "step": 2196 }, { "epoch": 2.0475302889095994, "grad_norm": 1.2610215520472376, "learning_rate": 7.055574732481878e-06, "loss": 1.5934, "step": 2197 }, { "epoch": 2.048462255358807, "grad_norm": 1.1152999181300942, "learning_rate": 7.048671039005869e-06, "loss": 1.2876, "step": 2198 }, { "epoch": 2.049394221808015, "grad_norm": 1.1160295213500764, "learning_rate": 7.041767345529859e-06, "loss": 1.6234, "step": 2199 }, { "epoch": 2.050326188257223, "grad_norm": 1.3156918038127754, "learning_rate": 7.034863652053849e-06, "loss": 1.9859, "step": 2200 }, { "epoch": 2.0512581547064306, "grad_norm": 1.1764085939871731, "learning_rate": 7.02795995857784e-06, "loss": 1.2822, "step": 2201 }, { "epoch": 2.0521901211556384, "grad_norm": 1.3200067092682184, "learning_rate": 7.02105626510183e-06, "loss": 1.7437, "step": 2202 }, { "epoch": 2.053122087604846, "grad_norm": 1.1487787763850374, "learning_rate": 7.014152571625821e-06, "loss": 1.5664, "step": 2203 }, { "epoch": 2.054054054054054, "grad_norm": 1.1437303741130802, "learning_rate": 7.0072488781498106e-06, "loss": 1.5433, "step": 2204 }, { "epoch": 2.0549860205032617, "grad_norm": 1.3332931097880945, "learning_rate": 7.000345184673801e-06, "loss": 1.544, "step": 2205 }, { "epoch": 2.0559179869524695, "grad_norm": 1.1147090862058473, "learning_rate": 6.993441491197791e-06, "loss": 1.5831, "step": 2206 }, { "epoch": 2.0568499534016778, "grad_norm": 1.3841283411865273, "learning_rate": 6.986537797721782e-06, "loss": 1.6996, "step": 2207 }, { "epoch": 2.0577819198508855, "grad_norm": 1.6044177531934327, "learning_rate": 6.979634104245772e-06, "loss": 1.6907, "step": 2208 }, { "epoch": 2.0587138863000933, "grad_norm": 1.042659548952643, "learning_rate": 6.9727304107697625e-06, "loss": 1.5533, "step": 2209 }, { "epoch": 2.059645852749301, "grad_norm": 0.9728912330051799, "learning_rate": 6.965826717293753e-06, "loss": 1.6563, "step": 2210 }, { "epoch": 2.060577819198509, "grad_norm": 1.3328072203782015, "learning_rate": 6.958923023817743e-06, "loss": 1.6622, "step": 2211 }, { "epoch": 2.0615097856477167, "grad_norm": 1.0358435329965954, "learning_rate": 6.952019330341734e-06, "loss": 1.4251, "step": 2212 }, { "epoch": 2.0624417520969245, "grad_norm": 1.3299322225012062, "learning_rate": 6.945115636865724e-06, "loss": 1.5526, "step": 2213 }, { "epoch": 2.0633737185461323, "grad_norm": 1.197298963336652, "learning_rate": 6.9382119433897145e-06, "loss": 1.4877, "step": 2214 }, { "epoch": 2.06430568499534, "grad_norm": 1.3568040490026059, "learning_rate": 6.931308249913704e-06, "loss": 1.6003, "step": 2215 }, { "epoch": 2.065237651444548, "grad_norm": 1.1729439532595027, "learning_rate": 6.924404556437695e-06, "loss": 1.3872, "step": 2216 }, { "epoch": 2.0661696178937556, "grad_norm": 1.2678686721123724, "learning_rate": 6.917500862961685e-06, "loss": 1.4961, "step": 2217 }, { "epoch": 2.0671015843429634, "grad_norm": 1.1490543853018627, "learning_rate": 6.910597169485676e-06, "loss": 1.7942, "step": 2218 }, { "epoch": 2.0680335507921717, "grad_norm": 1.6179560521587102, "learning_rate": 6.903693476009666e-06, "loss": 1.7168, "step": 2219 }, { "epoch": 2.0689655172413794, "grad_norm": 1.3357732784219558, "learning_rate": 6.896789782533656e-06, "loss": 1.4326, "step": 2220 }, { "epoch": 2.0698974836905872, "grad_norm": 1.2259844217739364, "learning_rate": 6.889886089057647e-06, "loss": 1.7354, "step": 2221 }, { "epoch": 2.070829450139795, "grad_norm": 1.4104911617676952, "learning_rate": 6.882982395581637e-06, "loss": 1.6957, "step": 2222 }, { "epoch": 2.071761416589003, "grad_norm": 1.355212232181636, "learning_rate": 6.876078702105628e-06, "loss": 1.5844, "step": 2223 }, { "epoch": 2.0726933830382106, "grad_norm": 1.3662181157322717, "learning_rate": 6.869175008629618e-06, "loss": 1.8558, "step": 2224 }, { "epoch": 2.0736253494874184, "grad_norm": 1.2542635394678525, "learning_rate": 6.862271315153608e-06, "loss": 1.596, "step": 2225 }, { "epoch": 2.074557315936626, "grad_norm": 1.227302316207369, "learning_rate": 6.855367621677597e-06, "loss": 1.7833, "step": 2226 }, { "epoch": 2.075489282385834, "grad_norm": 1.5192654666778536, "learning_rate": 6.848463928201589e-06, "loss": 1.6931, "step": 2227 }, { "epoch": 2.0764212488350418, "grad_norm": 1.0409862978150959, "learning_rate": 6.841560234725578e-06, "loss": 1.659, "step": 2228 }, { "epoch": 2.0773532152842495, "grad_norm": 1.5792192255904487, "learning_rate": 6.834656541249569e-06, "loss": 1.607, "step": 2229 }, { "epoch": 2.0782851817334578, "grad_norm": 1.1578229819510304, "learning_rate": 6.82775284777356e-06, "loss": 1.3992, "step": 2230 }, { "epoch": 2.0792171481826656, "grad_norm": 1.1192006939431998, "learning_rate": 6.820849154297549e-06, "loss": 1.4724, "step": 2231 }, { "epoch": 2.0801491146318734, "grad_norm": 1.3134660214039369, "learning_rate": 6.813945460821541e-06, "loss": 1.6591, "step": 2232 }, { "epoch": 2.081081081081081, "grad_norm": 1.1507125864403043, "learning_rate": 6.80704176734553e-06, "loss": 1.291, "step": 2233 }, { "epoch": 2.082013047530289, "grad_norm": 1.1283267057470805, "learning_rate": 6.800138073869521e-06, "loss": 1.709, "step": 2234 }, { "epoch": 2.0829450139794967, "grad_norm": 1.2272855987105211, "learning_rate": 6.7932343803935106e-06, "loss": 1.6265, "step": 2235 }, { "epoch": 2.0838769804287045, "grad_norm": 1.3075952667934205, "learning_rate": 6.786330686917501e-06, "loss": 1.4529, "step": 2236 }, { "epoch": 2.0848089468779123, "grad_norm": 1.3276477437879903, "learning_rate": 6.779426993441491e-06, "loss": 1.7792, "step": 2237 }, { "epoch": 2.08574091332712, "grad_norm": 1.2550757323414574, "learning_rate": 6.772523299965482e-06, "loss": 1.769, "step": 2238 }, { "epoch": 2.086672879776328, "grad_norm": 1.125560267552264, "learning_rate": 6.765619606489472e-06, "loss": 1.5652, "step": 2239 }, { "epoch": 2.0876048462255357, "grad_norm": 1.5211174141457071, "learning_rate": 6.7587159130134625e-06, "loss": 1.7637, "step": 2240 }, { "epoch": 2.088536812674744, "grad_norm": 1.346474172638907, "learning_rate": 6.751812219537453e-06, "loss": 1.7728, "step": 2241 }, { "epoch": 2.0894687791239517, "grad_norm": 1.3159674152730858, "learning_rate": 6.744908526061443e-06, "loss": 1.5971, "step": 2242 }, { "epoch": 2.0904007455731595, "grad_norm": 1.141171573885752, "learning_rate": 6.738004832585434e-06, "loss": 1.4316, "step": 2243 }, { "epoch": 2.0913327120223673, "grad_norm": 1.4694142890238706, "learning_rate": 6.731101139109424e-06, "loss": 1.5091, "step": 2244 }, { "epoch": 2.092264678471575, "grad_norm": 1.099616506622677, "learning_rate": 6.7241974456334145e-06, "loss": 1.4383, "step": 2245 }, { "epoch": 2.093196644920783, "grad_norm": 1.3561340717216515, "learning_rate": 6.717293752157404e-06, "loss": 1.5879, "step": 2246 }, { "epoch": 2.0941286113699906, "grad_norm": 1.5989772370666908, "learning_rate": 6.710390058681395e-06, "loss": 1.7148, "step": 2247 }, { "epoch": 2.0950605778191984, "grad_norm": 0.9918290379569829, "learning_rate": 6.703486365205385e-06, "loss": 1.3285, "step": 2248 }, { "epoch": 2.095992544268406, "grad_norm": 1.144637415908611, "learning_rate": 6.696582671729376e-06, "loss": 1.4594, "step": 2249 }, { "epoch": 2.096924510717614, "grad_norm": 1.4442886797283598, "learning_rate": 6.6896789782533665e-06, "loss": 1.7432, "step": 2250 }, { "epoch": 2.0978564771668218, "grad_norm": 1.3100868640335726, "learning_rate": 6.682775284777356e-06, "loss": 1.5816, "step": 2251 }, { "epoch": 2.09878844361603, "grad_norm": 1.2202173774494072, "learning_rate": 6.675871591301347e-06, "loss": 2.0411, "step": 2252 }, { "epoch": 2.099720410065238, "grad_norm": 1.2579639218121978, "learning_rate": 6.668967897825337e-06, "loss": 1.5411, "step": 2253 }, { "epoch": 2.1006523765144456, "grad_norm": 1.6821150044810778, "learning_rate": 6.662064204349328e-06, "loss": 1.8894, "step": 2254 }, { "epoch": 2.1015843429636534, "grad_norm": 1.0345650710064216, "learning_rate": 6.655160510873318e-06, "loss": 1.5439, "step": 2255 }, { "epoch": 2.102516309412861, "grad_norm": 1.3126417056108417, "learning_rate": 6.648256817397308e-06, "loss": 2.0965, "step": 2256 }, { "epoch": 2.103448275862069, "grad_norm": 1.2489962954572602, "learning_rate": 6.641353123921298e-06, "loss": 2.0545, "step": 2257 }, { "epoch": 2.1043802423112767, "grad_norm": 1.2359147407253046, "learning_rate": 6.634449430445289e-06, "loss": 1.4726, "step": 2258 }, { "epoch": 2.1053122087604845, "grad_norm": 1.1090308484560767, "learning_rate": 6.627545736969279e-06, "loss": 1.7479, "step": 2259 }, { "epoch": 2.1062441752096923, "grad_norm": 1.1856683653918274, "learning_rate": 6.6206420434932696e-06, "loss": 1.4274, "step": 2260 }, { "epoch": 2.1071761416589, "grad_norm": 1.2449709655837007, "learning_rate": 6.61373835001726e-06, "loss": 1.7698, "step": 2261 }, { "epoch": 2.108108108108108, "grad_norm": 1.38489055476726, "learning_rate": 6.60683465654125e-06, "loss": 1.8448, "step": 2262 }, { "epoch": 2.109040074557316, "grad_norm": 1.158703740002577, "learning_rate": 6.599930963065241e-06, "loss": 1.6404, "step": 2263 }, { "epoch": 2.109972041006524, "grad_norm": 1.190940524456881, "learning_rate": 6.593027269589231e-06, "loss": 1.3264, "step": 2264 }, { "epoch": 2.1109040074557317, "grad_norm": 1.1018870923773179, "learning_rate": 6.5861235761132215e-06, "loss": 1.5289, "step": 2265 }, { "epoch": 2.1118359739049395, "grad_norm": 0.9724734559973611, "learning_rate": 6.579219882637211e-06, "loss": 1.5458, "step": 2266 }, { "epoch": 2.1127679403541473, "grad_norm": 1.4585947570870545, "learning_rate": 6.572316189161202e-06, "loss": 1.9861, "step": 2267 }, { "epoch": 2.113699906803355, "grad_norm": 1.208270393792942, "learning_rate": 6.565412495685192e-06, "loss": 1.7798, "step": 2268 }, { "epoch": 2.114631873252563, "grad_norm": 1.0378254128037714, "learning_rate": 6.558508802209183e-06, "loss": 1.5799, "step": 2269 }, { "epoch": 2.1155638397017706, "grad_norm": 1.356610697592464, "learning_rate": 6.551605108733173e-06, "loss": 1.9668, "step": 2270 }, { "epoch": 2.1164958061509784, "grad_norm": 1.2150842989040525, "learning_rate": 6.544701415257163e-06, "loss": 1.6006, "step": 2271 }, { "epoch": 2.117427772600186, "grad_norm": 1.1408963618465675, "learning_rate": 6.537797721781154e-06, "loss": 1.7626, "step": 2272 }, { "epoch": 2.118359739049394, "grad_norm": 1.1292134570358505, "learning_rate": 6.530894028305144e-06, "loss": 1.4743, "step": 2273 }, { "epoch": 2.1192917054986022, "grad_norm": 1.5459677409457402, "learning_rate": 6.523990334829135e-06, "loss": 1.6999, "step": 2274 }, { "epoch": 2.12022367194781, "grad_norm": 1.0012960507135202, "learning_rate": 6.517086641353124e-06, "loss": 1.3874, "step": 2275 }, { "epoch": 2.121155638397018, "grad_norm": 1.2336385686289957, "learning_rate": 6.510182947877115e-06, "loss": 1.7171, "step": 2276 }, { "epoch": 2.1220876048462256, "grad_norm": 1.225803512342663, "learning_rate": 6.503279254401104e-06, "loss": 1.6377, "step": 2277 }, { "epoch": 2.1230195712954334, "grad_norm": 1.3706037193792011, "learning_rate": 6.496375560925096e-06, "loss": 1.7803, "step": 2278 }, { "epoch": 2.123951537744641, "grad_norm": 1.364728391033755, "learning_rate": 6.489471867449085e-06, "loss": 1.3345, "step": 2279 }, { "epoch": 2.124883504193849, "grad_norm": 0.938439138443986, "learning_rate": 6.482568173973076e-06, "loss": 1.2993, "step": 2280 }, { "epoch": 2.1258154706430568, "grad_norm": 1.3713466552626843, "learning_rate": 6.475664480497067e-06, "loss": 1.4436, "step": 2281 }, { "epoch": 2.1267474370922645, "grad_norm": 1.3648952980724651, "learning_rate": 6.468760787021056e-06, "loss": 1.5832, "step": 2282 }, { "epoch": 2.1276794035414723, "grad_norm": 1.4273029395290258, "learning_rate": 6.461857093545048e-06, "loss": 2.0403, "step": 2283 }, { "epoch": 2.12861136999068, "grad_norm": 1.6217667229508481, "learning_rate": 6.454953400069037e-06, "loss": 1.6567, "step": 2284 }, { "epoch": 2.1295433364398884, "grad_norm": 1.1035216103407919, "learning_rate": 6.448049706593028e-06, "loss": 1.3468, "step": 2285 }, { "epoch": 2.130475302889096, "grad_norm": 1.4378420112047523, "learning_rate": 6.441146013117018e-06, "loss": 2.0364, "step": 2286 }, { "epoch": 2.131407269338304, "grad_norm": 1.3406214999737196, "learning_rate": 6.434242319641008e-06, "loss": 1.6108, "step": 2287 }, { "epoch": 2.1323392357875117, "grad_norm": 1.2813872748917643, "learning_rate": 6.427338626164998e-06, "loss": 1.4672, "step": 2288 }, { "epoch": 2.1332712022367195, "grad_norm": 1.258508391717024, "learning_rate": 6.420434932688989e-06, "loss": 1.7937, "step": 2289 }, { "epoch": 2.1342031686859273, "grad_norm": 1.4402041705571413, "learning_rate": 6.413531239212979e-06, "loss": 1.7957, "step": 2290 }, { "epoch": 2.135135135135135, "grad_norm": 1.214862748142276, "learning_rate": 6.4066275457369696e-06, "loss": 1.6528, "step": 2291 }, { "epoch": 2.136067101584343, "grad_norm": 1.1657811835957594, "learning_rate": 6.39972385226096e-06, "loss": 1.3919, "step": 2292 }, { "epoch": 2.1369990680335507, "grad_norm": 1.2410039515367124, "learning_rate": 6.39282015878495e-06, "loss": 1.7785, "step": 2293 }, { "epoch": 2.1379310344827585, "grad_norm": 1.2432284301679335, "learning_rate": 6.385916465308941e-06, "loss": 1.5948, "step": 2294 }, { "epoch": 2.1388630009319662, "grad_norm": 1.3537298943945555, "learning_rate": 6.379012771832931e-06, "loss": 1.8538, "step": 2295 }, { "epoch": 2.1397949673811745, "grad_norm": 1.2435517235250448, "learning_rate": 6.3721090783569215e-06, "loss": 1.5075, "step": 2296 }, { "epoch": 2.1407269338303823, "grad_norm": 1.5683075938272846, "learning_rate": 6.365205384880911e-06, "loss": 1.82, "step": 2297 }, { "epoch": 2.14165890027959, "grad_norm": 1.1821904426144327, "learning_rate": 6.358301691404902e-06, "loss": 1.8823, "step": 2298 }, { "epoch": 2.142590866728798, "grad_norm": 1.270037510081216, "learning_rate": 6.351397997928892e-06, "loss": 1.5312, "step": 2299 }, { "epoch": 2.1435228331780056, "grad_norm": 1.4971497595532437, "learning_rate": 6.344494304452883e-06, "loss": 1.6245, "step": 2300 }, { "epoch": 2.1444547996272134, "grad_norm": 1.4130533058042942, "learning_rate": 6.3375906109768735e-06, "loss": 1.4785, "step": 2301 }, { "epoch": 2.145386766076421, "grad_norm": 1.1426416887330457, "learning_rate": 6.330686917500863e-06, "loss": 1.5423, "step": 2302 }, { "epoch": 2.146318732525629, "grad_norm": 1.1994001511838315, "learning_rate": 6.323783224024854e-06, "loss": 1.4993, "step": 2303 }, { "epoch": 2.147250698974837, "grad_norm": 1.089994811105038, "learning_rate": 6.316879530548844e-06, "loss": 1.5727, "step": 2304 }, { "epoch": 2.1481826654240446, "grad_norm": 1.2610585050330316, "learning_rate": 6.309975837072835e-06, "loss": 1.8009, "step": 2305 }, { "epoch": 2.1491146318732524, "grad_norm": 1.380234020560611, "learning_rate": 6.303072143596825e-06, "loss": 1.503, "step": 2306 }, { "epoch": 2.1500465983224606, "grad_norm": 1.0616759020867244, "learning_rate": 6.296168450120815e-06, "loss": 1.5173, "step": 2307 }, { "epoch": 2.1509785647716684, "grad_norm": 1.2936228138309585, "learning_rate": 6.289264756644805e-06, "loss": 1.7414, "step": 2308 }, { "epoch": 2.151910531220876, "grad_norm": 1.2469157406542009, "learning_rate": 6.282361063168796e-06, "loss": 1.5711, "step": 2309 }, { "epoch": 2.152842497670084, "grad_norm": 1.1697949929951335, "learning_rate": 6.275457369692786e-06, "loss": 1.5552, "step": 2310 }, { "epoch": 2.1537744641192917, "grad_norm": 1.250311887215215, "learning_rate": 6.268553676216777e-06, "loss": 1.6355, "step": 2311 }, { "epoch": 2.1547064305684995, "grad_norm": 1.4807289575075953, "learning_rate": 6.261649982740767e-06, "loss": 1.8018, "step": 2312 }, { "epoch": 2.1556383970177073, "grad_norm": 1.2705564649014425, "learning_rate": 6.254746289264757e-06, "loss": 1.8967, "step": 2313 }, { "epoch": 2.156570363466915, "grad_norm": 1.3932573009890072, "learning_rate": 6.247842595788748e-06, "loss": 1.3658, "step": 2314 }, { "epoch": 2.157502329916123, "grad_norm": 1.4027970520682331, "learning_rate": 6.240938902312738e-06, "loss": 1.5849, "step": 2315 }, { "epoch": 2.1584342963653307, "grad_norm": 1.3304339522651512, "learning_rate": 6.2340352088367286e-06, "loss": 1.9237, "step": 2316 }, { "epoch": 2.1593662628145385, "grad_norm": 1.3962446144205878, "learning_rate": 6.2271315153607184e-06, "loss": 1.5761, "step": 2317 }, { "epoch": 2.1602982292637467, "grad_norm": 1.2665326817667129, "learning_rate": 6.220227821884709e-06, "loss": 1.6237, "step": 2318 }, { "epoch": 2.1612301957129545, "grad_norm": 1.207410662145101, "learning_rate": 6.213324128408699e-06, "loss": 1.4529, "step": 2319 }, { "epoch": 2.1621621621621623, "grad_norm": 1.2327452774768055, "learning_rate": 6.20642043493269e-06, "loss": 1.6402, "step": 2320 }, { "epoch": 2.16309412861137, "grad_norm": 1.5706008756795915, "learning_rate": 6.1995167414566805e-06, "loss": 2.0381, "step": 2321 }, { "epoch": 2.164026095060578, "grad_norm": 1.153198036303549, "learning_rate": 6.19261304798067e-06, "loss": 1.6719, "step": 2322 }, { "epoch": 2.1649580615097856, "grad_norm": 1.3379900259861452, "learning_rate": 6.185709354504661e-06, "loss": 2.3098, "step": 2323 }, { "epoch": 2.1658900279589934, "grad_norm": 1.1155135554729716, "learning_rate": 6.178805661028651e-06, "loss": 1.4734, "step": 2324 }, { "epoch": 2.1668219944082012, "grad_norm": 1.0894118562934374, "learning_rate": 6.171901967552642e-06, "loss": 1.4912, "step": 2325 }, { "epoch": 2.167753960857409, "grad_norm": 1.439636192640003, "learning_rate": 6.164998274076631e-06, "loss": 1.6971, "step": 2326 }, { "epoch": 2.168685927306617, "grad_norm": 1.2820051804303465, "learning_rate": 6.158094580600622e-06, "loss": 1.5454, "step": 2327 }, { "epoch": 2.1696178937558246, "grad_norm": 1.3439264174435857, "learning_rate": 6.151190887124611e-06, "loss": 1.4419, "step": 2328 }, { "epoch": 2.170549860205033, "grad_norm": 1.2294359408935251, "learning_rate": 6.144287193648603e-06, "loss": 1.7552, "step": 2329 }, { "epoch": 2.1714818266542406, "grad_norm": 1.064747744743164, "learning_rate": 6.137383500172592e-06, "loss": 1.7737, "step": 2330 }, { "epoch": 2.1724137931034484, "grad_norm": 1.1291165052078542, "learning_rate": 6.130479806696583e-06, "loss": 1.8966, "step": 2331 }, { "epoch": 2.173345759552656, "grad_norm": 1.2702522414996449, "learning_rate": 6.123576113220574e-06, "loss": 1.7403, "step": 2332 }, { "epoch": 2.174277726001864, "grad_norm": 1.2386407777078967, "learning_rate": 6.116672419744563e-06, "loss": 1.5861, "step": 2333 }, { "epoch": 2.1752096924510718, "grad_norm": 1.2117391608605415, "learning_rate": 6.109768726268555e-06, "loss": 1.7543, "step": 2334 }, { "epoch": 2.1761416589002796, "grad_norm": 1.323665289758237, "learning_rate": 6.102865032792544e-06, "loss": 1.585, "step": 2335 }, { "epoch": 2.1770736253494873, "grad_norm": 1.317746398179385, "learning_rate": 6.095961339316535e-06, "loss": 1.766, "step": 2336 }, { "epoch": 2.178005591798695, "grad_norm": 1.3134933666379904, "learning_rate": 6.089057645840525e-06, "loss": 1.4922, "step": 2337 }, { "epoch": 2.178937558247903, "grad_norm": 1.1036944989634978, "learning_rate": 6.082153952364515e-06, "loss": 1.4757, "step": 2338 }, { "epoch": 2.1798695246971107, "grad_norm": 1.4784062856330347, "learning_rate": 6.075250258888505e-06, "loss": 1.5989, "step": 2339 }, { "epoch": 2.180801491146319, "grad_norm": 1.105554928137701, "learning_rate": 6.068346565412496e-06, "loss": 1.6021, "step": 2340 }, { "epoch": 2.1817334575955267, "grad_norm": 1.3149488394448283, "learning_rate": 6.061442871936487e-06, "loss": 1.556, "step": 2341 }, { "epoch": 2.1826654240447345, "grad_norm": 1.3363944822050295, "learning_rate": 6.054539178460477e-06, "loss": 1.7613, "step": 2342 }, { "epoch": 2.1835973904939423, "grad_norm": 1.132762070490161, "learning_rate": 6.047635484984467e-06, "loss": 1.876, "step": 2343 }, { "epoch": 2.18452935694315, "grad_norm": 1.154375909711082, "learning_rate": 6.040731791508457e-06, "loss": 1.7771, "step": 2344 }, { "epoch": 2.185461323392358, "grad_norm": 1.215073655083068, "learning_rate": 6.033828098032448e-06, "loss": 1.7222, "step": 2345 }, { "epoch": 2.1863932898415657, "grad_norm": 1.2568948820065635, "learning_rate": 6.026924404556438e-06, "loss": 1.6459, "step": 2346 }, { "epoch": 2.1873252562907735, "grad_norm": 1.3603562333812775, "learning_rate": 6.0200207110804286e-06, "loss": 1.7209, "step": 2347 }, { "epoch": 2.1882572227399812, "grad_norm": 1.1414738795277621, "learning_rate": 6.0131170176044184e-06, "loss": 1.605, "step": 2348 }, { "epoch": 2.189189189189189, "grad_norm": 1.1450782014287741, "learning_rate": 6.006213324128409e-06, "loss": 1.6842, "step": 2349 }, { "epoch": 2.190121155638397, "grad_norm": 1.3569833620808702, "learning_rate": 5.999309630652399e-06, "loss": 1.5018, "step": 2350 }, { "epoch": 2.191053122087605, "grad_norm": 0.9749537260862771, "learning_rate": 5.99240593717639e-06, "loss": 1.8161, "step": 2351 }, { "epoch": 2.191985088536813, "grad_norm": 1.3689103670212235, "learning_rate": 5.9855022437003805e-06, "loss": 2.1435, "step": 2352 }, { "epoch": 2.1929170549860206, "grad_norm": 1.4641641752346204, "learning_rate": 5.97859855022437e-06, "loss": 1.389, "step": 2353 }, { "epoch": 2.1938490214352284, "grad_norm": 1.1978923908761885, "learning_rate": 5.971694856748361e-06, "loss": 1.761, "step": 2354 }, { "epoch": 2.194780987884436, "grad_norm": 1.1516145084382545, "learning_rate": 5.964791163272351e-06, "loss": 1.4931, "step": 2355 }, { "epoch": 2.195712954333644, "grad_norm": 1.1143024602857117, "learning_rate": 5.957887469796342e-06, "loss": 1.5086, "step": 2356 }, { "epoch": 2.196644920782852, "grad_norm": 1.2729183421451062, "learning_rate": 5.950983776320332e-06, "loss": 1.9764, "step": 2357 }, { "epoch": 2.1975768872320596, "grad_norm": 1.1286539794609567, "learning_rate": 5.944080082844322e-06, "loss": 1.8657, "step": 2358 }, { "epoch": 2.1985088536812674, "grad_norm": 1.2025397745779227, "learning_rate": 5.937176389368312e-06, "loss": 1.781, "step": 2359 }, { "epoch": 2.199440820130475, "grad_norm": 1.630636249790059, "learning_rate": 5.930272695892303e-06, "loss": 1.8008, "step": 2360 }, { "epoch": 2.200372786579683, "grad_norm": 1.1538994049003501, "learning_rate": 5.923369002416293e-06, "loss": 1.7761, "step": 2361 }, { "epoch": 2.201304753028891, "grad_norm": 1.3750054614545757, "learning_rate": 5.916465308940284e-06, "loss": 1.4864, "step": 2362 }, { "epoch": 2.202236719478099, "grad_norm": 1.218140330369553, "learning_rate": 5.909561615464274e-06, "loss": 1.8258, "step": 2363 }, { "epoch": 2.2031686859273067, "grad_norm": 1.3018965139936511, "learning_rate": 5.902657921988264e-06, "loss": 1.7786, "step": 2364 }, { "epoch": 2.2041006523765145, "grad_norm": 1.3725864262002543, "learning_rate": 5.895754228512255e-06, "loss": 1.5257, "step": 2365 }, { "epoch": 2.2050326188257223, "grad_norm": 1.31606029397869, "learning_rate": 5.888850535036245e-06, "loss": 1.4418, "step": 2366 }, { "epoch": 2.20596458527493, "grad_norm": 1.2995761260711844, "learning_rate": 5.881946841560236e-06, "loss": 1.573, "step": 2367 }, { "epoch": 2.206896551724138, "grad_norm": 1.4334757372824438, "learning_rate": 5.8750431480842255e-06, "loss": 1.9436, "step": 2368 }, { "epoch": 2.2078285181733457, "grad_norm": 1.244615646389966, "learning_rate": 5.868139454608216e-06, "loss": 1.3947, "step": 2369 }, { "epoch": 2.2087604846225535, "grad_norm": 1.315633128252749, "learning_rate": 5.861235761132206e-06, "loss": 1.7269, "step": 2370 }, { "epoch": 2.2096924510717613, "grad_norm": 1.2725924456700664, "learning_rate": 5.854332067656197e-06, "loss": 1.6599, "step": 2371 }, { "epoch": 2.210624417520969, "grad_norm": 1.2301365979694354, "learning_rate": 5.8474283741801875e-06, "loss": 1.4203, "step": 2372 }, { "epoch": 2.2115563839701773, "grad_norm": 1.3255760435643242, "learning_rate": 5.8405246807041774e-06, "loss": 1.8295, "step": 2373 }, { "epoch": 2.212488350419385, "grad_norm": 1.14303636427031, "learning_rate": 5.833620987228168e-06, "loss": 1.4946, "step": 2374 }, { "epoch": 2.213420316868593, "grad_norm": 1.3129918048139912, "learning_rate": 5.826717293752158e-06, "loss": 1.5623, "step": 2375 }, { "epoch": 2.2143522833178007, "grad_norm": 1.7629395514216206, "learning_rate": 5.819813600276149e-06, "loss": 1.5312, "step": 2376 }, { "epoch": 2.2152842497670084, "grad_norm": 1.0959881582533388, "learning_rate": 5.812909906800138e-06, "loss": 1.6646, "step": 2377 }, { "epoch": 2.2162162162162162, "grad_norm": 1.1693076625460619, "learning_rate": 5.806006213324129e-06, "loss": 1.5007, "step": 2378 }, { "epoch": 2.217148182665424, "grad_norm": 1.2413263238808923, "learning_rate": 5.7991025198481184e-06, "loss": 1.663, "step": 2379 }, { "epoch": 2.218080149114632, "grad_norm": 1.1674061124669342, "learning_rate": 5.79219882637211e-06, "loss": 1.5159, "step": 2380 }, { "epoch": 2.2190121155638396, "grad_norm": 1.1320224569711435, "learning_rate": 5.785295132896099e-06, "loss": 1.825, "step": 2381 }, { "epoch": 2.2199440820130474, "grad_norm": 1.2220802022264905, "learning_rate": 5.77839143942009e-06, "loss": 1.8266, "step": 2382 }, { "epoch": 2.220876048462255, "grad_norm": 1.0492404043592052, "learning_rate": 5.771487745944081e-06, "loss": 1.3673, "step": 2383 }, { "epoch": 2.2218080149114634, "grad_norm": 1.4914678268935668, "learning_rate": 5.76458405246807e-06, "loss": 1.324, "step": 2384 }, { "epoch": 2.222739981360671, "grad_norm": 1.1281612393782763, "learning_rate": 5.757680358992062e-06, "loss": 1.4501, "step": 2385 }, { "epoch": 2.223671947809879, "grad_norm": 1.124852070459004, "learning_rate": 5.750776665516051e-06, "loss": 1.8387, "step": 2386 }, { "epoch": 2.2246039142590868, "grad_norm": 1.3613746632202612, "learning_rate": 5.743872972040042e-06, "loss": 1.7294, "step": 2387 }, { "epoch": 2.2255358807082946, "grad_norm": 1.1664033972961243, "learning_rate": 5.736969278564032e-06, "loss": 1.6225, "step": 2388 }, { "epoch": 2.2264678471575023, "grad_norm": 1.1580900895745467, "learning_rate": 5.730065585088022e-06, "loss": 1.7331, "step": 2389 }, { "epoch": 2.22739981360671, "grad_norm": 1.1720893138044302, "learning_rate": 5.723161891612012e-06, "loss": 1.3771, "step": 2390 }, { "epoch": 2.228331780055918, "grad_norm": 1.1476546289142489, "learning_rate": 5.716258198136003e-06, "loss": 1.475, "step": 2391 }, { "epoch": 2.2292637465051257, "grad_norm": 1.3158230204053434, "learning_rate": 5.709354504659994e-06, "loss": 1.5818, "step": 2392 }, { "epoch": 2.2301957129543335, "grad_norm": 1.3164243800920084, "learning_rate": 5.702450811183984e-06, "loss": 1.6547, "step": 2393 }, { "epoch": 2.2311276794035413, "grad_norm": 1.250636191670407, "learning_rate": 5.695547117707974e-06, "loss": 1.6972, "step": 2394 }, { "epoch": 2.2320596458527495, "grad_norm": 1.0686429930063914, "learning_rate": 5.688643424231964e-06, "loss": 1.3395, "step": 2395 }, { "epoch": 2.2329916123019573, "grad_norm": 1.495087860309427, "learning_rate": 5.681739730755955e-06, "loss": 1.5425, "step": 2396 }, { "epoch": 2.233923578751165, "grad_norm": 1.2490324482697241, "learning_rate": 5.674836037279945e-06, "loss": 1.5879, "step": 2397 }, { "epoch": 2.234855545200373, "grad_norm": 1.1066113553896801, "learning_rate": 5.667932343803936e-06, "loss": 1.3986, "step": 2398 }, { "epoch": 2.2357875116495807, "grad_norm": 1.2659862267188027, "learning_rate": 5.6610286503279255e-06, "loss": 1.7192, "step": 2399 }, { "epoch": 2.2367194780987885, "grad_norm": 1.2080566940673831, "learning_rate": 5.654124956851916e-06, "loss": 1.8766, "step": 2400 }, { "epoch": 2.2376514445479962, "grad_norm": 1.1606711308041766, "learning_rate": 5.647221263375906e-06, "loss": 1.867, "step": 2401 }, { "epoch": 2.238583410997204, "grad_norm": 1.0943828201518744, "learning_rate": 5.640317569899897e-06, "loss": 1.6364, "step": 2402 }, { "epoch": 2.239515377446412, "grad_norm": 1.3645724230123626, "learning_rate": 5.6334138764238875e-06, "loss": 1.6621, "step": 2403 }, { "epoch": 2.2404473438956196, "grad_norm": 1.3461966073550617, "learning_rate": 5.6265101829478774e-06, "loss": 1.2697, "step": 2404 }, { "epoch": 2.2413793103448274, "grad_norm": 1.3523661292432145, "learning_rate": 5.619606489471868e-06, "loss": 1.575, "step": 2405 }, { "epoch": 2.2423112767940356, "grad_norm": 1.2666447881710814, "learning_rate": 5.612702795995858e-06, "loss": 1.4648, "step": 2406 }, { "epoch": 2.2432432432432434, "grad_norm": 1.2122222685172435, "learning_rate": 5.605799102519849e-06, "loss": 2.0694, "step": 2407 }, { "epoch": 2.244175209692451, "grad_norm": 1.4343707904684202, "learning_rate": 5.598895409043839e-06, "loss": 1.4903, "step": 2408 }, { "epoch": 2.245107176141659, "grad_norm": 1.6575580221495778, "learning_rate": 5.591991715567829e-06, "loss": 1.5495, "step": 2409 }, { "epoch": 2.246039142590867, "grad_norm": 1.2206789412966184, "learning_rate": 5.585088022091819e-06, "loss": 1.5999, "step": 2410 }, { "epoch": 2.2469711090400746, "grad_norm": 1.6327015863855843, "learning_rate": 5.57818432861581e-06, "loss": 1.651, "step": 2411 }, { "epoch": 2.2479030754892824, "grad_norm": 1.1906145094603302, "learning_rate": 5.571280635139801e-06, "loss": 1.6067, "step": 2412 }, { "epoch": 2.24883504193849, "grad_norm": 1.0951735711934243, "learning_rate": 5.564376941663791e-06, "loss": 1.5221, "step": 2413 }, { "epoch": 2.249767008387698, "grad_norm": 1.2631163959102172, "learning_rate": 5.557473248187781e-06, "loss": 1.5705, "step": 2414 }, { "epoch": 2.2506989748369057, "grad_norm": 1.034568253573315, "learning_rate": 5.550569554711771e-06, "loss": 1.6279, "step": 2415 }, { "epoch": 2.2516309412861135, "grad_norm": 1.6636660442164373, "learning_rate": 5.543665861235762e-06, "loss": 1.7728, "step": 2416 }, { "epoch": 2.2525629077353218, "grad_norm": 1.2822656669716033, "learning_rate": 5.536762167759752e-06, "loss": 1.5236, "step": 2417 }, { "epoch": 2.2534948741845295, "grad_norm": 1.4721451788723092, "learning_rate": 5.529858474283743e-06, "loss": 1.7993, "step": 2418 }, { "epoch": 2.2544268406337373, "grad_norm": 1.1471354250540022, "learning_rate": 5.5229547808077325e-06, "loss": 1.5542, "step": 2419 }, { "epoch": 2.255358807082945, "grad_norm": 1.4377262465719995, "learning_rate": 5.516051087331723e-06, "loss": 1.5077, "step": 2420 }, { "epoch": 2.256290773532153, "grad_norm": 1.3788925363492848, "learning_rate": 5.509147393855713e-06, "loss": 1.4053, "step": 2421 }, { "epoch": 2.2572227399813607, "grad_norm": 1.0910633772745792, "learning_rate": 5.502243700379704e-06, "loss": 1.4387, "step": 2422 }, { "epoch": 2.2581547064305685, "grad_norm": 1.3068280288329095, "learning_rate": 5.4953400069036946e-06, "loss": 1.4309, "step": 2423 }, { "epoch": 2.2590866728797763, "grad_norm": 1.4513093395695607, "learning_rate": 5.4884363134276845e-06, "loss": 1.6879, "step": 2424 }, { "epoch": 2.260018639328984, "grad_norm": 1.1228085569056407, "learning_rate": 5.481532619951675e-06, "loss": 1.5653, "step": 2425 }, { "epoch": 2.260950605778192, "grad_norm": 1.5038344533606196, "learning_rate": 5.474628926475665e-06, "loss": 1.8634, "step": 2426 }, { "epoch": 2.2618825722273996, "grad_norm": 1.268323657167918, "learning_rate": 5.467725232999656e-06, "loss": 1.8375, "step": 2427 }, { "epoch": 2.262814538676608, "grad_norm": 1.1866800777243574, "learning_rate": 5.460821539523645e-06, "loss": 1.6392, "step": 2428 }, { "epoch": 2.2637465051258157, "grad_norm": 1.2439100160830028, "learning_rate": 5.4539178460476364e-06, "loss": 1.5643, "step": 2429 }, { "epoch": 2.2646784715750234, "grad_norm": 1.1086884101380476, "learning_rate": 5.4470141525716255e-06, "loss": 1.2988, "step": 2430 }, { "epoch": 2.2656104380242312, "grad_norm": 1.099753572531766, "learning_rate": 5.440110459095617e-06, "loss": 1.5308, "step": 2431 }, { "epoch": 2.266542404473439, "grad_norm": 1.064481776277818, "learning_rate": 5.433206765619608e-06, "loss": 1.6634, "step": 2432 }, { "epoch": 2.267474370922647, "grad_norm": 1.1807198713841973, "learning_rate": 5.426303072143597e-06, "loss": 1.5543, "step": 2433 }, { "epoch": 2.2684063373718546, "grad_norm": 1.2368808185322424, "learning_rate": 5.419399378667588e-06, "loss": 1.5129, "step": 2434 }, { "epoch": 2.2693383038210624, "grad_norm": 1.320597476260891, "learning_rate": 5.4124956851915774e-06, "loss": 1.7084, "step": 2435 }, { "epoch": 2.27027027027027, "grad_norm": 1.2658460877704794, "learning_rate": 5.405591991715568e-06, "loss": 1.3843, "step": 2436 }, { "epoch": 2.271202236719478, "grad_norm": 1.30655471999, "learning_rate": 5.398688298239558e-06, "loss": 1.7381, "step": 2437 }, { "epoch": 2.2721342031686858, "grad_norm": 1.1170712858114387, "learning_rate": 5.391784604763549e-06, "loss": 1.3218, "step": 2438 }, { "epoch": 2.273066169617894, "grad_norm": 1.1582227250779158, "learning_rate": 5.384880911287539e-06, "loss": 1.4366, "step": 2439 }, { "epoch": 2.2739981360671018, "grad_norm": 1.1886278880344638, "learning_rate": 5.377977217811529e-06, "loss": 1.7687, "step": 2440 }, { "epoch": 2.2749301025163096, "grad_norm": 1.2749717291899703, "learning_rate": 5.371073524335519e-06, "loss": 1.4815, "step": 2441 }, { "epoch": 2.2758620689655173, "grad_norm": 1.0587466070831622, "learning_rate": 5.36416983085951e-06, "loss": 1.6813, "step": 2442 }, { "epoch": 2.276794035414725, "grad_norm": 1.2860324937810124, "learning_rate": 5.357266137383501e-06, "loss": 1.515, "step": 2443 }, { "epoch": 2.277726001863933, "grad_norm": 1.1249291635804566, "learning_rate": 5.350362443907491e-06, "loss": 1.7391, "step": 2444 }, { "epoch": 2.2786579683131407, "grad_norm": 1.2003695754589851, "learning_rate": 5.343458750431481e-06, "loss": 1.947, "step": 2445 }, { "epoch": 2.2795899347623485, "grad_norm": 1.3065603038586144, "learning_rate": 5.336555056955471e-06, "loss": 1.7999, "step": 2446 }, { "epoch": 2.2805219012115563, "grad_norm": 1.3102650423942825, "learning_rate": 5.329651363479462e-06, "loss": 1.9169, "step": 2447 }, { "epoch": 2.281453867660764, "grad_norm": 1.284331145263835, "learning_rate": 5.322747670003452e-06, "loss": 1.9037, "step": 2448 }, { "epoch": 2.282385834109972, "grad_norm": 1.361973897358799, "learning_rate": 5.315843976527443e-06, "loss": 1.7024, "step": 2449 }, { "epoch": 2.28331780055918, "grad_norm": 1.3199261011026784, "learning_rate": 5.3089402830514325e-06, "loss": 1.9068, "step": 2450 }, { "epoch": 2.284249767008388, "grad_norm": 1.1857387175440044, "learning_rate": 5.302036589575423e-06, "loss": 1.771, "step": 2451 }, { "epoch": 2.2851817334575957, "grad_norm": 1.2723102794628267, "learning_rate": 5.295132896099414e-06, "loss": 1.5666, "step": 2452 }, { "epoch": 2.2861136999068035, "grad_norm": 1.0762394520674528, "learning_rate": 5.288229202623404e-06, "loss": 1.5056, "step": 2453 }, { "epoch": 2.2870456663560113, "grad_norm": 1.2256022826431132, "learning_rate": 5.281325509147395e-06, "loss": 1.8015, "step": 2454 }, { "epoch": 2.287977632805219, "grad_norm": 1.3233677020230343, "learning_rate": 5.2744218156713845e-06, "loss": 1.8858, "step": 2455 }, { "epoch": 2.288909599254427, "grad_norm": 1.1630814284268418, "learning_rate": 5.267518122195375e-06, "loss": 1.7429, "step": 2456 }, { "epoch": 2.2898415657036346, "grad_norm": 1.2982165581311167, "learning_rate": 5.260614428719365e-06, "loss": 1.8018, "step": 2457 }, { "epoch": 2.2907735321528424, "grad_norm": 1.1952015902732436, "learning_rate": 5.253710735243356e-06, "loss": 1.492, "step": 2458 }, { "epoch": 2.29170549860205, "grad_norm": 1.3581841745443513, "learning_rate": 5.246807041767346e-06, "loss": 1.4988, "step": 2459 }, { "epoch": 2.292637465051258, "grad_norm": 1.29272977256108, "learning_rate": 5.2399033482913364e-06, "loss": 1.555, "step": 2460 }, { "epoch": 2.293569431500466, "grad_norm": 1.0820560286890855, "learning_rate": 5.232999654815326e-06, "loss": 1.6186, "step": 2461 }, { "epoch": 2.294501397949674, "grad_norm": 1.3690553498451374, "learning_rate": 5.226095961339317e-06, "loss": 1.4665, "step": 2462 }, { "epoch": 2.295433364398882, "grad_norm": 1.4170458325757695, "learning_rate": 5.219192267863308e-06, "loss": 1.4926, "step": 2463 }, { "epoch": 2.2963653308480896, "grad_norm": 1.1058740208291495, "learning_rate": 5.212288574387298e-06, "loss": 1.5896, "step": 2464 }, { "epoch": 2.2972972972972974, "grad_norm": 1.2650856965468822, "learning_rate": 5.205384880911288e-06, "loss": 1.9542, "step": 2465 }, { "epoch": 2.298229263746505, "grad_norm": 1.1528503840918607, "learning_rate": 5.198481187435278e-06, "loss": 1.6001, "step": 2466 }, { "epoch": 2.299161230195713, "grad_norm": 1.3274820389916502, "learning_rate": 5.191577493959269e-06, "loss": 1.5077, "step": 2467 }, { "epoch": 2.3000931966449207, "grad_norm": 1.3423599250378906, "learning_rate": 5.184673800483259e-06, "loss": 1.7702, "step": 2468 }, { "epoch": 2.3010251630941285, "grad_norm": 1.0649168435272383, "learning_rate": 5.17777010700725e-06, "loss": 1.5662, "step": 2469 }, { "epoch": 2.3019571295433363, "grad_norm": 1.362725089649245, "learning_rate": 5.1708664135312395e-06, "loss": 1.7088, "step": 2470 }, { "epoch": 2.302889095992544, "grad_norm": 1.199885806538103, "learning_rate": 5.16396272005523e-06, "loss": 1.7567, "step": 2471 }, { "epoch": 2.3038210624417523, "grad_norm": 1.2150422437402206, "learning_rate": 5.15705902657922e-06, "loss": 1.7468, "step": 2472 }, { "epoch": 2.3047530288909597, "grad_norm": 1.1466830581075687, "learning_rate": 5.150155333103211e-06, "loss": 1.6315, "step": 2473 }, { "epoch": 2.305684995340168, "grad_norm": 1.335172234440541, "learning_rate": 5.143251639627202e-06, "loss": 1.5869, "step": 2474 }, { "epoch": 2.3066169617893757, "grad_norm": 1.1128180532608793, "learning_rate": 5.1363479461511915e-06, "loss": 1.6831, "step": 2475 }, { "epoch": 2.3075489282385835, "grad_norm": 1.0551870986117848, "learning_rate": 5.129444252675182e-06, "loss": 1.7014, "step": 2476 }, { "epoch": 2.3084808946877913, "grad_norm": 1.2866916330469287, "learning_rate": 5.122540559199172e-06, "loss": 1.5775, "step": 2477 }, { "epoch": 2.309412861136999, "grad_norm": 1.3362635146342712, "learning_rate": 5.115636865723163e-06, "loss": 1.6629, "step": 2478 }, { "epoch": 2.310344827586207, "grad_norm": 1.335726546842005, "learning_rate": 5.108733172247152e-06, "loss": 1.63, "step": 2479 }, { "epoch": 2.3112767940354146, "grad_norm": 1.1926202774783006, "learning_rate": 5.1018294787711435e-06, "loss": 1.3272, "step": 2480 }, { "epoch": 2.3122087604846224, "grad_norm": 1.389482151469187, "learning_rate": 5.0949257852951325e-06, "loss": 1.8917, "step": 2481 }, { "epoch": 2.31314072693383, "grad_norm": 1.2243179196607132, "learning_rate": 5.088022091819124e-06, "loss": 1.4675, "step": 2482 }, { "epoch": 2.3140726933830384, "grad_norm": 1.2935702002416716, "learning_rate": 5.081118398343115e-06, "loss": 1.5723, "step": 2483 }, { "epoch": 2.315004659832246, "grad_norm": 1.2777132600201302, "learning_rate": 5.074214704867104e-06, "loss": 1.5855, "step": 2484 }, { "epoch": 2.315936626281454, "grad_norm": 1.2545731162266462, "learning_rate": 5.0673110113910954e-06, "loss": 1.7125, "step": 2485 }, { "epoch": 2.316868592730662, "grad_norm": 1.310007230503821, "learning_rate": 5.0604073179150845e-06, "loss": 1.5754, "step": 2486 }, { "epoch": 2.3178005591798696, "grad_norm": 1.2232318993215083, "learning_rate": 5.053503624439075e-06, "loss": 1.7241, "step": 2487 }, { "epoch": 2.3187325256290774, "grad_norm": 1.2400390351767205, "learning_rate": 5.046599930963065e-06, "loss": 1.6777, "step": 2488 }, { "epoch": 2.319664492078285, "grad_norm": 1.3899195562010742, "learning_rate": 5.039696237487056e-06, "loss": 1.6233, "step": 2489 }, { "epoch": 2.320596458527493, "grad_norm": 1.2869377412094156, "learning_rate": 5.032792544011046e-06, "loss": 1.8165, "step": 2490 }, { "epoch": 2.3215284249767008, "grad_norm": 1.2979074921598936, "learning_rate": 5.0258888505350364e-06, "loss": 2.1059, "step": 2491 }, { "epoch": 2.3224603914259085, "grad_norm": 1.1607439915233069, "learning_rate": 5.018985157059026e-06, "loss": 1.8282, "step": 2492 }, { "epoch": 2.3233923578751163, "grad_norm": 1.280476888124119, "learning_rate": 5.012081463583017e-06, "loss": 1.8687, "step": 2493 }, { "epoch": 2.3243243243243246, "grad_norm": 1.270481536768073, "learning_rate": 5.005177770107008e-06, "loss": 1.8353, "step": 2494 }, { "epoch": 2.325256290773532, "grad_norm": 1.126121308015836, "learning_rate": 4.998274076630998e-06, "loss": 1.7266, "step": 2495 }, { "epoch": 2.32618825722274, "grad_norm": 1.3247343748304392, "learning_rate": 4.991370383154988e-06, "loss": 1.5509, "step": 2496 }, { "epoch": 2.327120223671948, "grad_norm": 1.2271793664624961, "learning_rate": 4.984466689678978e-06, "loss": 1.5072, "step": 2497 }, { "epoch": 2.3280521901211557, "grad_norm": 1.370941158284868, "learning_rate": 4.977562996202969e-06, "loss": 1.8748, "step": 2498 }, { "epoch": 2.3289841565703635, "grad_norm": 1.4842348002908254, "learning_rate": 4.97065930272696e-06, "loss": 1.5966, "step": 2499 }, { "epoch": 2.3299161230195713, "grad_norm": 1.2412400528501173, "learning_rate": 4.96375560925095e-06, "loss": 1.5968, "step": 2500 }, { "epoch": 2.330848089468779, "grad_norm": 1.3650412446317077, "learning_rate": 4.95685191577494e-06, "loss": 1.5964, "step": 2501 }, { "epoch": 2.331780055917987, "grad_norm": 1.4115986890601926, "learning_rate": 4.94994822229893e-06, "loss": 1.6516, "step": 2502 }, { "epoch": 2.3327120223671947, "grad_norm": 1.3066599805337111, "learning_rate": 4.943044528822921e-06, "loss": 1.7241, "step": 2503 }, { "epoch": 2.3336439888164024, "grad_norm": 1.5893284345669654, "learning_rate": 4.936140835346911e-06, "loss": 1.4581, "step": 2504 }, { "epoch": 2.3345759552656107, "grad_norm": 1.1590169101068784, "learning_rate": 4.929237141870902e-06, "loss": 1.5858, "step": 2505 }, { "epoch": 2.335507921714818, "grad_norm": 1.3798126849120758, "learning_rate": 4.9223334483948915e-06, "loss": 1.5368, "step": 2506 }, { "epoch": 2.3364398881640263, "grad_norm": 1.2706847586776897, "learning_rate": 4.915429754918881e-06, "loss": 1.624, "step": 2507 }, { "epoch": 2.337371854613234, "grad_norm": 1.4221758796659645, "learning_rate": 4.908526061442873e-06, "loss": 1.6796, "step": 2508 }, { "epoch": 2.338303821062442, "grad_norm": 1.2172802253350654, "learning_rate": 4.901622367966863e-06, "loss": 1.9422, "step": 2509 }, { "epoch": 2.3392357875116496, "grad_norm": 1.206637694022833, "learning_rate": 4.894718674490853e-06, "loss": 1.4177, "step": 2510 }, { "epoch": 2.3401677539608574, "grad_norm": 1.4604212924314992, "learning_rate": 4.8878149810148435e-06, "loss": 1.7234, "step": 2511 }, { "epoch": 2.341099720410065, "grad_norm": 1.1902039484906162, "learning_rate": 4.880911287538833e-06, "loss": 1.5251, "step": 2512 }, { "epoch": 2.342031686859273, "grad_norm": 1.228134714643214, "learning_rate": 4.874007594062824e-06, "loss": 1.4536, "step": 2513 }, { "epoch": 2.3429636533084808, "grad_norm": 1.1078902477409633, "learning_rate": 4.867103900586814e-06, "loss": 1.5905, "step": 2514 }, { "epoch": 2.3438956197576886, "grad_norm": 1.3972157310856137, "learning_rate": 4.860200207110805e-06, "loss": 1.6814, "step": 2515 }, { "epoch": 2.344827586206897, "grad_norm": 1.4645691501396059, "learning_rate": 4.853296513634795e-06, "loss": 1.637, "step": 2516 }, { "epoch": 2.345759552656104, "grad_norm": 1.1790692363197761, "learning_rate": 4.846392820158785e-06, "loss": 1.864, "step": 2517 }, { "epoch": 2.3466915191053124, "grad_norm": 1.2872688753877433, "learning_rate": 4.839489126682776e-06, "loss": 1.524, "step": 2518 }, { "epoch": 2.34762348555452, "grad_norm": 1.0551913196636764, "learning_rate": 4.832585433206766e-06, "loss": 1.5375, "step": 2519 }, { "epoch": 2.348555452003728, "grad_norm": 1.3261103486908075, "learning_rate": 4.825681739730757e-06, "loss": 1.4984, "step": 2520 }, { "epoch": 2.3494874184529357, "grad_norm": 1.2978725633356283, "learning_rate": 4.8187780462547465e-06, "loss": 1.6198, "step": 2521 }, { "epoch": 2.3504193849021435, "grad_norm": 1.1440656662025757, "learning_rate": 4.811874352778737e-06, "loss": 1.2209, "step": 2522 }, { "epoch": 2.3513513513513513, "grad_norm": 1.0801794480220335, "learning_rate": 4.804970659302727e-06, "loss": 1.975, "step": 2523 }, { "epoch": 2.352283317800559, "grad_norm": 1.2953667755501412, "learning_rate": 4.798066965826718e-06, "loss": 1.9199, "step": 2524 }, { "epoch": 2.353215284249767, "grad_norm": 1.1643667949839993, "learning_rate": 4.791163272350708e-06, "loss": 1.815, "step": 2525 }, { "epoch": 2.3541472506989747, "grad_norm": 1.188551463124254, "learning_rate": 4.7842595788746985e-06, "loss": 1.5278, "step": 2526 }, { "epoch": 2.355079217148183, "grad_norm": 1.198433484397191, "learning_rate": 4.777355885398688e-06, "loss": 1.6853, "step": 2527 }, { "epoch": 2.3560111835973903, "grad_norm": 1.2972659258907024, "learning_rate": 4.770452191922679e-06, "loss": 1.6455, "step": 2528 }, { "epoch": 2.3569431500465985, "grad_norm": 1.4563629904390374, "learning_rate": 4.76354849844667e-06, "loss": 1.8088, "step": 2529 }, { "epoch": 2.3578751164958063, "grad_norm": 1.5038597210089837, "learning_rate": 4.75664480497066e-06, "loss": 1.789, "step": 2530 }, { "epoch": 2.358807082945014, "grad_norm": 1.0690728096490625, "learning_rate": 4.7497411114946505e-06, "loss": 1.4812, "step": 2531 }, { "epoch": 2.359739049394222, "grad_norm": 1.228987920825965, "learning_rate": 4.74283741801864e-06, "loss": 1.6406, "step": 2532 }, { "epoch": 2.3606710158434296, "grad_norm": 1.260643616534314, "learning_rate": 4.73593372454263e-06, "loss": 1.5457, "step": 2533 }, { "epoch": 2.3616029822926374, "grad_norm": 1.3542013279917586, "learning_rate": 4.729030031066621e-06, "loss": 1.744, "step": 2534 }, { "epoch": 2.362534948741845, "grad_norm": 1.4172876160818728, "learning_rate": 4.722126337590611e-06, "loss": 1.3704, "step": 2535 }, { "epoch": 2.363466915191053, "grad_norm": 1.2773288878431774, "learning_rate": 4.715222644114602e-06, "loss": 1.6939, "step": 2536 }, { "epoch": 2.364398881640261, "grad_norm": 1.8209558817291949, "learning_rate": 4.7083189506385915e-06, "loss": 1.4983, "step": 2537 }, { "epoch": 2.3653308480894686, "grad_norm": 1.4004450900376961, "learning_rate": 4.701415257162582e-06, "loss": 1.5482, "step": 2538 }, { "epoch": 2.3662628145386764, "grad_norm": 1.219815183463194, "learning_rate": 4.694511563686573e-06, "loss": 1.6449, "step": 2539 }, { "epoch": 2.3671947809878846, "grad_norm": 1.103927428597681, "learning_rate": 4.687607870210563e-06, "loss": 1.6809, "step": 2540 }, { "epoch": 2.3681267474370924, "grad_norm": 1.1768596578273607, "learning_rate": 4.6807041767345536e-06, "loss": 2.2263, "step": 2541 }, { "epoch": 2.3690587138863, "grad_norm": 1.2171350903240779, "learning_rate": 4.6738004832585435e-06, "loss": 1.3959, "step": 2542 }, { "epoch": 2.369990680335508, "grad_norm": 1.4903819586738467, "learning_rate": 4.666896789782534e-06, "loss": 1.8302, "step": 2543 }, { "epoch": 2.3709226467847158, "grad_norm": 1.2325442917282818, "learning_rate": 4.659993096306524e-06, "loss": 1.667, "step": 2544 }, { "epoch": 2.3718546132339235, "grad_norm": 1.3592006339358325, "learning_rate": 4.653089402830515e-06, "loss": 1.4034, "step": 2545 }, { "epoch": 2.3727865796831313, "grad_norm": 1.3164119410532729, "learning_rate": 4.646185709354505e-06, "loss": 1.6073, "step": 2546 }, { "epoch": 2.373718546132339, "grad_norm": 1.1957440691039007, "learning_rate": 4.6392820158784954e-06, "loss": 1.4396, "step": 2547 }, { "epoch": 2.374650512581547, "grad_norm": 1.1148926885482782, "learning_rate": 4.632378322402486e-06, "loss": 1.4022, "step": 2548 }, { "epoch": 2.3755824790307547, "grad_norm": 1.350291406839152, "learning_rate": 4.625474628926476e-06, "loss": 1.7385, "step": 2549 }, { "epoch": 2.3765144454799625, "grad_norm": 1.1586298325546884, "learning_rate": 4.618570935450467e-06, "loss": 1.3633, "step": 2550 }, { "epoch": 2.3774464119291707, "grad_norm": 1.2797365467954573, "learning_rate": 4.611667241974457e-06, "loss": 1.6112, "step": 2551 }, { "epoch": 2.3783783783783785, "grad_norm": 1.36404821938763, "learning_rate": 4.604763548498447e-06, "loss": 1.4757, "step": 2552 }, { "epoch": 2.3793103448275863, "grad_norm": 1.146939321761056, "learning_rate": 4.597859855022437e-06, "loss": 1.6499, "step": 2553 }, { "epoch": 2.380242311276794, "grad_norm": 1.3009710920172113, "learning_rate": 4.590956161546428e-06, "loss": 1.7324, "step": 2554 }, { "epoch": 2.381174277726002, "grad_norm": 1.361348868194213, "learning_rate": 4.584052468070418e-06, "loss": 1.6387, "step": 2555 }, { "epoch": 2.3821062441752097, "grad_norm": 1.1215023765911263, "learning_rate": 4.577148774594408e-06, "loss": 1.5834, "step": 2556 }, { "epoch": 2.3830382106244175, "grad_norm": 1.28738682458157, "learning_rate": 4.5702450811183985e-06, "loss": 1.84, "step": 2557 }, { "epoch": 2.3839701770736252, "grad_norm": 1.3652954340712227, "learning_rate": 4.563341387642389e-06, "loss": 1.5501, "step": 2558 }, { "epoch": 2.384902143522833, "grad_norm": 1.2672783474680605, "learning_rate": 4.55643769416638e-06, "loss": 1.5196, "step": 2559 }, { "epoch": 2.385834109972041, "grad_norm": 1.4829888669888827, "learning_rate": 4.54953400069037e-06, "loss": 1.5431, "step": 2560 }, { "epoch": 2.3867660764212486, "grad_norm": 1.4206744537231215, "learning_rate": 4.54263030721436e-06, "loss": 1.7582, "step": 2561 }, { "epoch": 2.387698042870457, "grad_norm": 1.108342744076147, "learning_rate": 4.5357266137383505e-06, "loss": 1.3477, "step": 2562 }, { "epoch": 2.3886300093196646, "grad_norm": 1.2610114755075883, "learning_rate": 4.52882292026234e-06, "loss": 1.6409, "step": 2563 }, { "epoch": 2.3895619757688724, "grad_norm": 1.3644494198277026, "learning_rate": 4.521919226786331e-06, "loss": 1.7309, "step": 2564 }, { "epoch": 2.39049394221808, "grad_norm": 1.2647408485093825, "learning_rate": 4.515015533310321e-06, "loss": 1.6114, "step": 2565 }, { "epoch": 2.391425908667288, "grad_norm": 1.2407739330233731, "learning_rate": 4.508111839834312e-06, "loss": 1.8773, "step": 2566 }, { "epoch": 2.392357875116496, "grad_norm": 1.0932349416047435, "learning_rate": 4.501208146358302e-06, "loss": 1.36, "step": 2567 }, { "epoch": 2.3932898415657036, "grad_norm": 1.3091602221497507, "learning_rate": 4.494304452882292e-06, "loss": 1.6744, "step": 2568 }, { "epoch": 2.3942218080149114, "grad_norm": 1.3264153462428185, "learning_rate": 4.487400759406283e-06, "loss": 1.818, "step": 2569 }, { "epoch": 2.395153774464119, "grad_norm": 1.2527715505430934, "learning_rate": 4.480497065930273e-06, "loss": 1.5128, "step": 2570 }, { "epoch": 2.396085740913327, "grad_norm": 1.184530200865261, "learning_rate": 4.473593372454264e-06, "loss": 1.7413, "step": 2571 }, { "epoch": 2.3970177073625347, "grad_norm": 1.2026715816430542, "learning_rate": 4.466689678978254e-06, "loss": 1.8416, "step": 2572 }, { "epoch": 2.397949673811743, "grad_norm": 1.476595130851792, "learning_rate": 4.459785985502244e-06, "loss": 1.3523, "step": 2573 }, { "epoch": 2.3988816402609507, "grad_norm": 1.3858857347237667, "learning_rate": 4.452882292026234e-06, "loss": 1.5524, "step": 2574 }, { "epoch": 2.3998136067101585, "grad_norm": 1.3285807713450777, "learning_rate": 4.445978598550225e-06, "loss": 1.6774, "step": 2575 }, { "epoch": 2.4007455731593663, "grad_norm": 1.1680876949850756, "learning_rate": 4.439074905074215e-06, "loss": 1.595, "step": 2576 }, { "epoch": 2.401677539608574, "grad_norm": 1.2336213488818277, "learning_rate": 4.4321712115982055e-06, "loss": 1.9731, "step": 2577 }, { "epoch": 2.402609506057782, "grad_norm": 1.3083681148325332, "learning_rate": 4.4252675181221954e-06, "loss": 1.5874, "step": 2578 }, { "epoch": 2.4035414725069897, "grad_norm": 1.297850405282885, "learning_rate": 4.418363824646186e-06, "loss": 1.6355, "step": 2579 }, { "epoch": 2.4044734389561975, "grad_norm": 1.5905439591089026, "learning_rate": 4.411460131170177e-06, "loss": 1.792, "step": 2580 }, { "epoch": 2.4054054054054053, "grad_norm": 1.087669197453055, "learning_rate": 4.404556437694167e-06, "loss": 1.2703, "step": 2581 }, { "epoch": 2.406337371854613, "grad_norm": 1.1763904697301724, "learning_rate": 4.3976527442181575e-06, "loss": 1.4658, "step": 2582 }, { "epoch": 2.407269338303821, "grad_norm": 1.3461868536146047, "learning_rate": 4.390749050742147e-06, "loss": 1.6229, "step": 2583 }, { "epoch": 2.408201304753029, "grad_norm": 1.4402056899555415, "learning_rate": 4.383845357266137e-06, "loss": 1.5419, "step": 2584 }, { "epoch": 2.409133271202237, "grad_norm": 1.3612024681728045, "learning_rate": 4.376941663790128e-06, "loss": 1.6709, "step": 2585 }, { "epoch": 2.4100652376514446, "grad_norm": 1.2653936744492358, "learning_rate": 4.370037970314118e-06, "loss": 1.5802, "step": 2586 }, { "epoch": 2.4109972041006524, "grad_norm": 1.1905970825591004, "learning_rate": 4.363134276838109e-06, "loss": 1.7223, "step": 2587 }, { "epoch": 2.4119291705498602, "grad_norm": 1.1719250926667586, "learning_rate": 4.3562305833620985e-06, "loss": 1.4497, "step": 2588 }, { "epoch": 2.412861136999068, "grad_norm": 1.1568654093106245, "learning_rate": 4.349326889886089e-06, "loss": 1.5593, "step": 2589 }, { "epoch": 2.413793103448276, "grad_norm": 1.3273854856774154, "learning_rate": 4.34242319641008e-06, "loss": 1.4075, "step": 2590 }, { "epoch": 2.4147250698974836, "grad_norm": 1.3632418248236915, "learning_rate": 4.33551950293407e-06, "loss": 1.748, "step": 2591 }, { "epoch": 2.4156570363466914, "grad_norm": 1.3235715103436132, "learning_rate": 4.328615809458061e-06, "loss": 1.471, "step": 2592 }, { "epoch": 2.416589002795899, "grad_norm": 1.1376707949496705, "learning_rate": 4.3217121159820505e-06, "loss": 1.8422, "step": 2593 }, { "epoch": 2.417520969245107, "grad_norm": 1.1940290064320414, "learning_rate": 4.314808422506041e-06, "loss": 1.4596, "step": 2594 }, { "epoch": 2.418452935694315, "grad_norm": 1.3252177752521581, "learning_rate": 4.307904729030031e-06, "loss": 1.9405, "step": 2595 }, { "epoch": 2.419384902143523, "grad_norm": 1.186333122802773, "learning_rate": 4.301001035554022e-06, "loss": 1.3935, "step": 2596 }, { "epoch": 2.4203168685927308, "grad_norm": 1.0758063488044651, "learning_rate": 4.294097342078012e-06, "loss": 1.7023, "step": 2597 }, { "epoch": 2.4212488350419386, "grad_norm": 1.3747390026434716, "learning_rate": 4.2871936486020025e-06, "loss": 1.7683, "step": 2598 }, { "epoch": 2.4221808014911463, "grad_norm": 1.3169262890950584, "learning_rate": 4.280289955125993e-06, "loss": 1.536, "step": 2599 }, { "epoch": 2.423112767940354, "grad_norm": 1.2649374493483374, "learning_rate": 4.273386261649983e-06, "loss": 1.6689, "step": 2600 }, { "epoch": 2.424044734389562, "grad_norm": 1.938766242489805, "learning_rate": 4.266482568173974e-06, "loss": 1.8972, "step": 2601 }, { "epoch": 2.4249767008387697, "grad_norm": 1.2626295260096814, "learning_rate": 4.259578874697964e-06, "loss": 1.6334, "step": 2602 }, { "epoch": 2.4259086672879775, "grad_norm": 1.1004428038797964, "learning_rate": 4.2526751812219544e-06, "loss": 1.4204, "step": 2603 }, { "epoch": 2.4268406337371853, "grad_norm": 1.202601637393502, "learning_rate": 4.245771487745944e-06, "loss": 1.5086, "step": 2604 }, { "epoch": 2.427772600186393, "grad_norm": 1.18552820791386, "learning_rate": 4.238867794269935e-06, "loss": 1.6803, "step": 2605 }, { "epoch": 2.4287045666356013, "grad_norm": 1.3554240343238206, "learning_rate": 4.231964100793925e-06, "loss": 1.7957, "step": 2606 }, { "epoch": 2.429636533084809, "grad_norm": 1.2346635062439826, "learning_rate": 4.225060407317915e-06, "loss": 1.6391, "step": 2607 }, { "epoch": 2.430568499534017, "grad_norm": 1.530543342932955, "learning_rate": 4.2181567138419055e-06, "loss": 1.7998, "step": 2608 }, { "epoch": 2.4315004659832247, "grad_norm": 1.1113184301336592, "learning_rate": 4.211253020365896e-06, "loss": 1.4627, "step": 2609 }, { "epoch": 2.4324324324324325, "grad_norm": 1.3881544366428855, "learning_rate": 4.204349326889887e-06, "loss": 1.7586, "step": 2610 }, { "epoch": 2.4333643988816402, "grad_norm": 1.2218285558623474, "learning_rate": 4.197445633413877e-06, "loss": 1.7946, "step": 2611 }, { "epoch": 2.434296365330848, "grad_norm": 1.2610820600840145, "learning_rate": 4.190541939937867e-06, "loss": 1.6399, "step": 2612 }, { "epoch": 2.435228331780056, "grad_norm": 1.2316827386793088, "learning_rate": 4.1836382464618575e-06, "loss": 1.382, "step": 2613 }, { "epoch": 2.4361602982292636, "grad_norm": 1.2185786429073568, "learning_rate": 4.176734552985847e-06, "loss": 1.4759, "step": 2614 }, { "epoch": 2.4370922646784714, "grad_norm": 1.3050056720116356, "learning_rate": 4.169830859509838e-06, "loss": 1.6198, "step": 2615 }, { "epoch": 2.438024231127679, "grad_norm": 1.2483790741252754, "learning_rate": 4.162927166033828e-06, "loss": 1.3774, "step": 2616 }, { "epoch": 2.4389561975768874, "grad_norm": 1.4128052469389782, "learning_rate": 4.156023472557819e-06, "loss": 1.825, "step": 2617 }, { "epoch": 2.439888164026095, "grad_norm": 1.2736638184764286, "learning_rate": 4.149119779081809e-06, "loss": 1.7587, "step": 2618 }, { "epoch": 2.440820130475303, "grad_norm": 1.3703698754461913, "learning_rate": 4.142216085605799e-06, "loss": 1.7174, "step": 2619 }, { "epoch": 2.441752096924511, "grad_norm": 1.3424691945534226, "learning_rate": 4.13531239212979e-06, "loss": 1.5821, "step": 2620 }, { "epoch": 2.4426840633737186, "grad_norm": 1.160109382711457, "learning_rate": 4.12840869865378e-06, "loss": 1.6103, "step": 2621 }, { "epoch": 2.4436160298229264, "grad_norm": 1.2096461868248871, "learning_rate": 4.121505005177771e-06, "loss": 1.5765, "step": 2622 }, { "epoch": 2.444547996272134, "grad_norm": 1.2263806491872866, "learning_rate": 4.114601311701761e-06, "loss": 1.6364, "step": 2623 }, { "epoch": 2.445479962721342, "grad_norm": 1.1322620917975679, "learning_rate": 4.107697618225751e-06, "loss": 1.3784, "step": 2624 }, { "epoch": 2.4464119291705497, "grad_norm": 1.3231191363313393, "learning_rate": 4.100793924749741e-06, "loss": 1.8004, "step": 2625 }, { "epoch": 2.4473438956197575, "grad_norm": 1.1347018282478032, "learning_rate": 4.093890231273732e-06, "loss": 1.5523, "step": 2626 }, { "epoch": 2.4482758620689653, "grad_norm": 1.4019577461245079, "learning_rate": 4.086986537797722e-06, "loss": 1.7225, "step": 2627 }, { "epoch": 2.4492078285181735, "grad_norm": 1.3475334475529062, "learning_rate": 4.0800828443217126e-06, "loss": 1.4471, "step": 2628 }, { "epoch": 2.4501397949673813, "grad_norm": 1.189536693033125, "learning_rate": 4.073179150845703e-06, "loss": 1.426, "step": 2629 }, { "epoch": 2.451071761416589, "grad_norm": 1.3560154478826183, "learning_rate": 4.066275457369693e-06, "loss": 1.8033, "step": 2630 }, { "epoch": 2.452003727865797, "grad_norm": 1.1900208474303093, "learning_rate": 4.059371763893684e-06, "loss": 1.7699, "step": 2631 }, { "epoch": 2.4529356943150047, "grad_norm": 1.1895516402618194, "learning_rate": 4.052468070417674e-06, "loss": 1.4648, "step": 2632 }, { "epoch": 2.4538676607642125, "grad_norm": 1.3106235507455601, "learning_rate": 4.0455643769416645e-06, "loss": 1.7045, "step": 2633 }, { "epoch": 2.4547996272134203, "grad_norm": 1.0797361620001469, "learning_rate": 4.0386606834656544e-06, "loss": 1.5972, "step": 2634 }, { "epoch": 2.455731593662628, "grad_norm": 1.2729882433554076, "learning_rate": 4.031756989989644e-06, "loss": 1.6456, "step": 2635 }, { "epoch": 2.456663560111836, "grad_norm": 1.320775613217009, "learning_rate": 4.024853296513635e-06, "loss": 1.9111, "step": 2636 }, { "epoch": 2.4575955265610436, "grad_norm": 1.4439057013827779, "learning_rate": 4.017949603037625e-06, "loss": 1.6613, "step": 2637 }, { "epoch": 2.4585274930102514, "grad_norm": 1.6117030116250288, "learning_rate": 4.011045909561616e-06, "loss": 1.4613, "step": 2638 }, { "epoch": 2.4594594594594597, "grad_norm": 1.0339110584181397, "learning_rate": 4.004142216085606e-06, "loss": 1.5986, "step": 2639 }, { "epoch": 2.4603914259086674, "grad_norm": 1.4201258462864352, "learning_rate": 3.997238522609596e-06, "loss": 1.5642, "step": 2640 }, { "epoch": 2.4613233923578752, "grad_norm": 1.1593646386859815, "learning_rate": 3.990334829133587e-06, "loss": 1.7135, "step": 2641 }, { "epoch": 2.462255358807083, "grad_norm": 1.230983067271517, "learning_rate": 3.983431135657577e-06, "loss": 1.4288, "step": 2642 }, { "epoch": 2.463187325256291, "grad_norm": 1.075839337800139, "learning_rate": 3.976527442181568e-06, "loss": 1.5956, "step": 2643 }, { "epoch": 2.4641192917054986, "grad_norm": 1.1060992502672329, "learning_rate": 3.9696237487055575e-06, "loss": 1.6247, "step": 2644 }, { "epoch": 2.4650512581547064, "grad_norm": 1.4033588049026444, "learning_rate": 3.962720055229548e-06, "loss": 1.6562, "step": 2645 }, { "epoch": 2.465983224603914, "grad_norm": 1.3070817628271254, "learning_rate": 3.955816361753538e-06, "loss": 1.5182, "step": 2646 }, { "epoch": 2.466915191053122, "grad_norm": 1.0605700483771545, "learning_rate": 3.948912668277529e-06, "loss": 1.5294, "step": 2647 }, { "epoch": 2.4678471575023297, "grad_norm": 1.361953805566055, "learning_rate": 3.942008974801519e-06, "loss": 1.441, "step": 2648 }, { "epoch": 2.4687791239515375, "grad_norm": 1.3112964944129972, "learning_rate": 3.9351052813255095e-06, "loss": 1.5739, "step": 2649 }, { "epoch": 2.4697110904007458, "grad_norm": 1.164960918623768, "learning_rate": 3.9282015878495e-06, "loss": 1.3853, "step": 2650 }, { "epoch": 2.4706430568499536, "grad_norm": 1.419977665698657, "learning_rate": 3.92129789437349e-06, "loss": 1.6228, "step": 2651 }, { "epoch": 2.4715750232991613, "grad_norm": 1.1697427580656856, "learning_rate": 3.914394200897481e-06, "loss": 1.6677, "step": 2652 }, { "epoch": 2.472506989748369, "grad_norm": 1.2296407022755274, "learning_rate": 3.907490507421471e-06, "loss": 1.6676, "step": 2653 }, { "epoch": 2.473438956197577, "grad_norm": 1.3036613285527945, "learning_rate": 3.9005868139454615e-06, "loss": 1.5312, "step": 2654 }, { "epoch": 2.4743709226467847, "grad_norm": 1.1575575501433268, "learning_rate": 3.893683120469451e-06, "loss": 1.9516, "step": 2655 }, { "epoch": 2.4753028890959925, "grad_norm": 1.366584563949125, "learning_rate": 3.886779426993442e-06, "loss": 2.2419, "step": 2656 }, { "epoch": 2.4762348555452003, "grad_norm": 1.5463966167403498, "learning_rate": 3.879875733517432e-06, "loss": 1.3531, "step": 2657 }, { "epoch": 2.477166821994408, "grad_norm": 1.416146390605021, "learning_rate": 3.872972040041422e-06, "loss": 1.659, "step": 2658 }, { "epoch": 2.478098788443616, "grad_norm": 1.3079255659418443, "learning_rate": 3.866068346565413e-06, "loss": 1.6218, "step": 2659 }, { "epoch": 2.4790307548928237, "grad_norm": 1.2061825299252602, "learning_rate": 3.859164653089403e-06, "loss": 1.7704, "step": 2660 }, { "epoch": 2.479962721342032, "grad_norm": 1.3412281731015716, "learning_rate": 3.852260959613394e-06, "loss": 1.5915, "step": 2661 }, { "epoch": 2.4808946877912397, "grad_norm": 1.6368202694153244, "learning_rate": 3.845357266137384e-06, "loss": 1.7165, "step": 2662 }, { "epoch": 2.4818266542404475, "grad_norm": 1.5813783346026766, "learning_rate": 3.838453572661374e-06, "loss": 1.5708, "step": 2663 }, { "epoch": 2.4827586206896552, "grad_norm": 1.2031965912491744, "learning_rate": 3.8315498791853645e-06, "loss": 1.6643, "step": 2664 }, { "epoch": 2.483690587138863, "grad_norm": 1.103459946305633, "learning_rate": 3.8246461857093544e-06, "loss": 1.3201, "step": 2665 }, { "epoch": 2.484622553588071, "grad_norm": 1.3951125380419473, "learning_rate": 3.817742492233345e-06, "loss": 1.6184, "step": 2666 }, { "epoch": 2.4855545200372786, "grad_norm": 1.3743102348357592, "learning_rate": 3.8108387987573355e-06, "loss": 1.5835, "step": 2667 }, { "epoch": 2.4864864864864864, "grad_norm": 1.1602156996576514, "learning_rate": 3.8039351052813258e-06, "loss": 1.6126, "step": 2668 }, { "epoch": 2.487418452935694, "grad_norm": 1.2802807065147563, "learning_rate": 3.7970314118053165e-06, "loss": 1.7838, "step": 2669 }, { "epoch": 2.488350419384902, "grad_norm": 1.0460181632663856, "learning_rate": 3.790127718329307e-06, "loss": 1.5418, "step": 2670 }, { "epoch": 2.4892823858341098, "grad_norm": 1.1655327879890238, "learning_rate": 3.783224024853297e-06, "loss": 1.7361, "step": 2671 }, { "epoch": 2.490214352283318, "grad_norm": 1.0780500153876897, "learning_rate": 3.7763203313772874e-06, "loss": 1.3741, "step": 2672 }, { "epoch": 2.491146318732526, "grad_norm": 1.2286350197801572, "learning_rate": 3.7694166379012777e-06, "loss": 1.8092, "step": 2673 }, { "epoch": 2.4920782851817336, "grad_norm": 1.1786254591070908, "learning_rate": 3.7625129444252676e-06, "loss": 1.6136, "step": 2674 }, { "epoch": 2.4930102516309414, "grad_norm": 1.3306145060429186, "learning_rate": 3.755609250949258e-06, "loss": 1.6473, "step": 2675 }, { "epoch": 2.493942218080149, "grad_norm": 1.364192781209155, "learning_rate": 3.7487055574732483e-06, "loss": 1.7703, "step": 2676 }, { "epoch": 2.494874184529357, "grad_norm": 1.1500520771759266, "learning_rate": 3.7418018639972386e-06, "loss": 1.7442, "step": 2677 }, { "epoch": 2.4958061509785647, "grad_norm": 1.1119668055236134, "learning_rate": 3.734898170521229e-06, "loss": 1.2935, "step": 2678 }, { "epoch": 2.4967381174277725, "grad_norm": 1.0239842012807074, "learning_rate": 3.7279944770452196e-06, "loss": 1.7991, "step": 2679 }, { "epoch": 2.4976700838769803, "grad_norm": 1.2646244848650834, "learning_rate": 3.72109078356921e-06, "loss": 1.524, "step": 2680 }, { "epoch": 2.498602050326188, "grad_norm": 1.3143567854658336, "learning_rate": 3.7141870900932002e-06, "loss": 1.5913, "step": 2681 }, { "epoch": 2.499534016775396, "grad_norm": 1.15076019920635, "learning_rate": 3.7072833966171905e-06, "loss": 1.7509, "step": 2682 }, { "epoch": 2.500465983224604, "grad_norm": 1.2696357533162668, "learning_rate": 3.700379703141181e-06, "loss": 1.7296, "step": 2683 }, { "epoch": 2.501397949673812, "grad_norm": 1.2799902344271894, "learning_rate": 3.693476009665171e-06, "loss": 1.6736, "step": 2684 }, { "epoch": 2.5023299161230197, "grad_norm": 1.148160654438656, "learning_rate": 3.6865723161891615e-06, "loss": 1.754, "step": 2685 }, { "epoch": 2.5032618825722275, "grad_norm": 1.112828336285625, "learning_rate": 3.6796686227131518e-06, "loss": 1.5347, "step": 2686 }, { "epoch": 2.5041938490214353, "grad_norm": 1.363598495769184, "learning_rate": 3.672764929237142e-06, "loss": 1.8406, "step": 2687 }, { "epoch": 2.505125815470643, "grad_norm": 1.2286247279232863, "learning_rate": 3.6658612357611324e-06, "loss": 1.5013, "step": 2688 }, { "epoch": 2.506057781919851, "grad_norm": 1.1124716453987837, "learning_rate": 3.6589575422851227e-06, "loss": 1.447, "step": 2689 }, { "epoch": 2.5069897483690586, "grad_norm": 1.3263049753680918, "learning_rate": 3.6520538488091134e-06, "loss": 1.7544, "step": 2690 }, { "epoch": 2.5079217148182664, "grad_norm": 1.2778210633452947, "learning_rate": 3.6451501553331037e-06, "loss": 1.7816, "step": 2691 }, { "epoch": 2.508853681267474, "grad_norm": 1.428186260334897, "learning_rate": 3.638246461857094e-06, "loss": 1.8191, "step": 2692 }, { "epoch": 2.509785647716682, "grad_norm": 1.2141813747962753, "learning_rate": 3.6313427683810844e-06, "loss": 1.6695, "step": 2693 }, { "epoch": 2.5107176141658902, "grad_norm": 1.2313963479432444, "learning_rate": 3.6244390749050747e-06, "loss": 1.6111, "step": 2694 }, { "epoch": 2.511649580615098, "grad_norm": 1.184933803754177, "learning_rate": 3.617535381429065e-06, "loss": 1.9158, "step": 2695 }, { "epoch": 2.512581547064306, "grad_norm": 1.1790984961938198, "learning_rate": 3.6106316879530553e-06, "loss": 1.6976, "step": 2696 }, { "epoch": 2.5135135135135136, "grad_norm": 1.3515475297629855, "learning_rate": 3.603727994477045e-06, "loss": 1.355, "step": 2697 }, { "epoch": 2.5144454799627214, "grad_norm": 2.0093339846086096, "learning_rate": 3.5968243010010355e-06, "loss": 1.996, "step": 2698 }, { "epoch": 2.515377446411929, "grad_norm": 1.2337760665285677, "learning_rate": 3.5899206075250258e-06, "loss": 1.4238, "step": 2699 }, { "epoch": 2.516309412861137, "grad_norm": 1.5612006358091466, "learning_rate": 3.583016914049017e-06, "loss": 1.5617, "step": 2700 }, { "epoch": 2.5172413793103448, "grad_norm": 1.4665001990628912, "learning_rate": 3.576113220573007e-06, "loss": 1.7552, "step": 2701 }, { "epoch": 2.5181733457595525, "grad_norm": 1.1173362954867276, "learning_rate": 3.569209527096997e-06, "loss": 1.6021, "step": 2702 }, { "epoch": 2.5191053122087603, "grad_norm": 1.3172647991279518, "learning_rate": 3.5623058336209874e-06, "loss": 1.6873, "step": 2703 }, { "epoch": 2.520037278657968, "grad_norm": 1.2579299230127756, "learning_rate": 3.5554021401449777e-06, "loss": 1.4673, "step": 2704 }, { "epoch": 2.5209692451071763, "grad_norm": 1.4930656488918275, "learning_rate": 3.548498446668968e-06, "loss": 1.5364, "step": 2705 }, { "epoch": 2.5219012115563837, "grad_norm": 1.4573130350667332, "learning_rate": 3.5415947531929584e-06, "loss": 1.944, "step": 2706 }, { "epoch": 2.522833178005592, "grad_norm": 1.2991495097609236, "learning_rate": 3.5346910597169487e-06, "loss": 1.5041, "step": 2707 }, { "epoch": 2.5237651444547997, "grad_norm": 1.285254088446499, "learning_rate": 3.527787366240939e-06, "loss": 1.6334, "step": 2708 }, { "epoch": 2.5246971109040075, "grad_norm": 1.3703349572554984, "learning_rate": 3.5208836727649293e-06, "loss": 1.7187, "step": 2709 }, { "epoch": 2.5256290773532153, "grad_norm": 1.5043286395208777, "learning_rate": 3.51397997928892e-06, "loss": 1.7929, "step": 2710 }, { "epoch": 2.526561043802423, "grad_norm": 1.8265349678246967, "learning_rate": 3.5070762858129103e-06, "loss": 1.8831, "step": 2711 }, { "epoch": 2.527493010251631, "grad_norm": 1.307962405779119, "learning_rate": 3.5001725923369006e-06, "loss": 1.4578, "step": 2712 }, { "epoch": 2.5284249767008387, "grad_norm": 1.2195326136075795, "learning_rate": 3.493268898860891e-06, "loss": 1.8715, "step": 2713 }, { "epoch": 2.5293569431500464, "grad_norm": 1.347605989764987, "learning_rate": 3.4863652053848813e-06, "loss": 1.8986, "step": 2714 }, { "epoch": 2.5302889095992542, "grad_norm": 1.3953627675979132, "learning_rate": 3.4794615119088716e-06, "loss": 1.6429, "step": 2715 }, { "epoch": 2.5312208760484625, "grad_norm": 1.2091525355268034, "learning_rate": 3.472557818432862e-06, "loss": 1.7911, "step": 2716 }, { "epoch": 2.53215284249767, "grad_norm": 1.2851530951123944, "learning_rate": 3.465654124956852e-06, "loss": 1.5773, "step": 2717 }, { "epoch": 2.533084808946878, "grad_norm": 1.3518025807433558, "learning_rate": 3.4587504314808425e-06, "loss": 1.6033, "step": 2718 }, { "epoch": 2.534016775396086, "grad_norm": 1.257678691306038, "learning_rate": 3.451846738004833e-06, "loss": 1.4774, "step": 2719 }, { "epoch": 2.5349487418452936, "grad_norm": 1.0986952458890886, "learning_rate": 3.4449430445288235e-06, "loss": 1.6357, "step": 2720 }, { "epoch": 2.5358807082945014, "grad_norm": 1.5115972411036303, "learning_rate": 3.438039351052814e-06, "loss": 1.653, "step": 2721 }, { "epoch": 2.536812674743709, "grad_norm": 1.2065146046468476, "learning_rate": 3.431135657576804e-06, "loss": 1.3816, "step": 2722 }, { "epoch": 2.537744641192917, "grad_norm": 1.2435919050753883, "learning_rate": 3.4242319641007945e-06, "loss": 1.6872, "step": 2723 }, { "epoch": 2.5386766076421248, "grad_norm": 1.2830569868129942, "learning_rate": 3.4173282706247844e-06, "loss": 1.5744, "step": 2724 }, { "epoch": 2.5396085740913326, "grad_norm": 1.2552229216783966, "learning_rate": 3.4104245771487747e-06, "loss": 1.4212, "step": 2725 }, { "epoch": 2.5405405405405403, "grad_norm": 1.4480980833652553, "learning_rate": 3.403520883672765e-06, "loss": 1.7574, "step": 2726 }, { "epoch": 2.5414725069897486, "grad_norm": 1.2051886424218712, "learning_rate": 3.3966171901967553e-06, "loss": 1.8037, "step": 2727 }, { "epoch": 2.542404473438956, "grad_norm": 1.2563055309329039, "learning_rate": 3.3897134967207456e-06, "loss": 2.0016, "step": 2728 }, { "epoch": 2.543336439888164, "grad_norm": 1.6150396447357704, "learning_rate": 3.382809803244736e-06, "loss": 1.6592, "step": 2729 }, { "epoch": 2.544268406337372, "grad_norm": 1.2737609300063757, "learning_rate": 3.3759061097687266e-06, "loss": 1.6704, "step": 2730 }, { "epoch": 2.5452003727865797, "grad_norm": 1.4311621465402133, "learning_rate": 3.369002416292717e-06, "loss": 1.8728, "step": 2731 }, { "epoch": 2.5461323392357875, "grad_norm": 1.2790318792739488, "learning_rate": 3.3620987228167072e-06, "loss": 1.5938, "step": 2732 }, { "epoch": 2.5470643056849953, "grad_norm": 1.027272667541589, "learning_rate": 3.3551950293406976e-06, "loss": 1.5356, "step": 2733 }, { "epoch": 2.547996272134203, "grad_norm": 1.4008774101672938, "learning_rate": 3.348291335864688e-06, "loss": 1.6148, "step": 2734 }, { "epoch": 2.548928238583411, "grad_norm": 1.1656853705808243, "learning_rate": 3.341387642388678e-06, "loss": 1.6511, "step": 2735 }, { "epoch": 2.5498602050326187, "grad_norm": 1.3560861273949056, "learning_rate": 3.3344839489126685e-06, "loss": 1.5541, "step": 2736 }, { "epoch": 2.5507921714818265, "grad_norm": 1.4410678473525533, "learning_rate": 3.327580255436659e-06, "loss": 1.8877, "step": 2737 }, { "epoch": 2.5517241379310347, "grad_norm": 1.043042349072189, "learning_rate": 3.320676561960649e-06, "loss": 1.4836, "step": 2738 }, { "epoch": 2.552656104380242, "grad_norm": 1.0776766206396762, "learning_rate": 3.3137728684846394e-06, "loss": 1.5305, "step": 2739 }, { "epoch": 2.5535880708294503, "grad_norm": 1.4344052435276906, "learning_rate": 3.30686917500863e-06, "loss": 1.6827, "step": 2740 }, { "epoch": 2.554520037278658, "grad_norm": 1.1160000272955226, "learning_rate": 3.2999654815326205e-06, "loss": 1.6613, "step": 2741 }, { "epoch": 2.555452003727866, "grad_norm": 1.3393227569797337, "learning_rate": 3.2930617880566108e-06, "loss": 1.5231, "step": 2742 }, { "epoch": 2.5563839701770736, "grad_norm": 1.1673620647306648, "learning_rate": 3.286158094580601e-06, "loss": 1.5404, "step": 2743 }, { "epoch": 2.5573159366262814, "grad_norm": 1.229781080632606, "learning_rate": 3.2792544011045914e-06, "loss": 1.7319, "step": 2744 }, { "epoch": 2.558247903075489, "grad_norm": 2.4917892736463756, "learning_rate": 3.2723507076285817e-06, "loss": 1.4433, "step": 2745 }, { "epoch": 2.559179869524697, "grad_norm": 1.383514231560548, "learning_rate": 3.265447014152572e-06, "loss": 1.8355, "step": 2746 }, { "epoch": 2.560111835973905, "grad_norm": 1.3046074356606248, "learning_rate": 3.258543320676562e-06, "loss": 1.844, "step": 2747 }, { "epoch": 2.5610438024231126, "grad_norm": 1.0356034954672346, "learning_rate": 3.251639627200552e-06, "loss": 1.634, "step": 2748 }, { "epoch": 2.561975768872321, "grad_norm": 1.2623284636261072, "learning_rate": 3.2447359337245425e-06, "loss": 1.6583, "step": 2749 }, { "epoch": 2.562907735321528, "grad_norm": 1.165930026432629, "learning_rate": 3.2378322402485337e-06, "loss": 1.5788, "step": 2750 }, { "epoch": 2.5638397017707364, "grad_norm": 1.049965590536978, "learning_rate": 3.230928546772524e-06, "loss": 1.4523, "step": 2751 }, { "epoch": 2.564771668219944, "grad_norm": 1.3072204697002443, "learning_rate": 3.224024853296514e-06, "loss": 1.4345, "step": 2752 }, { "epoch": 2.565703634669152, "grad_norm": 1.1120864808834654, "learning_rate": 3.217121159820504e-06, "loss": 1.6211, "step": 2753 }, { "epoch": 2.5666356011183598, "grad_norm": 1.176311115850646, "learning_rate": 3.2102174663444945e-06, "loss": 1.6637, "step": 2754 }, { "epoch": 2.5675675675675675, "grad_norm": 1.1024601163564103, "learning_rate": 3.2033137728684848e-06, "loss": 1.5538, "step": 2755 }, { "epoch": 2.5684995340167753, "grad_norm": 1.2059656212178882, "learning_rate": 3.196410079392475e-06, "loss": 1.7297, "step": 2756 }, { "epoch": 2.569431500465983, "grad_norm": 1.1598881997766728, "learning_rate": 3.1895063859164654e-06, "loss": 1.3773, "step": 2757 }, { "epoch": 2.570363466915191, "grad_norm": 1.4322353820725502, "learning_rate": 3.1826026924404557e-06, "loss": 1.5629, "step": 2758 }, { "epoch": 2.5712954333643987, "grad_norm": 1.0930838139567602, "learning_rate": 3.175698998964446e-06, "loss": 1.4922, "step": 2759 }, { "epoch": 2.572227399813607, "grad_norm": 1.2481382895761646, "learning_rate": 3.1687953054884367e-06, "loss": 1.4283, "step": 2760 }, { "epoch": 2.5731593662628143, "grad_norm": 1.425681765817794, "learning_rate": 3.161891612012427e-06, "loss": 1.5459, "step": 2761 }, { "epoch": 2.5740913327120225, "grad_norm": 1.293297206468883, "learning_rate": 3.1549879185364174e-06, "loss": 1.674, "step": 2762 }, { "epoch": 2.5750232991612303, "grad_norm": 1.1088291009263267, "learning_rate": 3.1480842250604077e-06, "loss": 1.4685, "step": 2763 }, { "epoch": 2.575955265610438, "grad_norm": 1.2168747156644797, "learning_rate": 3.141180531584398e-06, "loss": 1.7492, "step": 2764 }, { "epoch": 2.576887232059646, "grad_norm": 1.6120821516690211, "learning_rate": 3.1342768381083883e-06, "loss": 2.0081, "step": 2765 }, { "epoch": 2.5778191985088537, "grad_norm": 1.162959288215759, "learning_rate": 3.1273731446323786e-06, "loss": 1.317, "step": 2766 }, { "epoch": 2.5787511649580614, "grad_norm": 1.0858981987915806, "learning_rate": 3.120469451156369e-06, "loss": 1.2311, "step": 2767 }, { "epoch": 2.5796831314072692, "grad_norm": 1.36998319104503, "learning_rate": 3.1135657576803592e-06, "loss": 1.6359, "step": 2768 }, { "epoch": 2.580615097856477, "grad_norm": 1.2033489654198886, "learning_rate": 3.1066620642043495e-06, "loss": 1.6608, "step": 2769 }, { "epoch": 2.581547064305685, "grad_norm": 1.2149805842619723, "learning_rate": 3.0997583707283403e-06, "loss": 1.7543, "step": 2770 }, { "epoch": 2.582479030754893, "grad_norm": 1.202953477851513, "learning_rate": 3.0928546772523306e-06, "loss": 1.6403, "step": 2771 }, { "epoch": 2.5834109972041004, "grad_norm": 1.120742725461214, "learning_rate": 3.085950983776321e-06, "loss": 1.6665, "step": 2772 }, { "epoch": 2.5843429636533086, "grad_norm": 0.9816844921843304, "learning_rate": 3.079047290300311e-06, "loss": 1.4819, "step": 2773 }, { "epoch": 2.5852749301025164, "grad_norm": 1.1549012211129317, "learning_rate": 3.0721435968243015e-06, "loss": 1.41, "step": 2774 }, { "epoch": 2.586206896551724, "grad_norm": 1.1239205030395147, "learning_rate": 3.0652399033482914e-06, "loss": 1.441, "step": 2775 }, { "epoch": 2.587138863000932, "grad_norm": 1.0460786128025397, "learning_rate": 3.0583362098722817e-06, "loss": 1.5067, "step": 2776 }, { "epoch": 2.5880708294501398, "grad_norm": 1.3296147288058577, "learning_rate": 3.051432516396272e-06, "loss": 1.7114, "step": 2777 }, { "epoch": 2.5890027958993476, "grad_norm": 1.331247773550261, "learning_rate": 3.0445288229202623e-06, "loss": 1.6618, "step": 2778 }, { "epoch": 2.5899347623485554, "grad_norm": 1.2017704865902856, "learning_rate": 3.0376251294442526e-06, "loss": 1.6067, "step": 2779 }, { "epoch": 2.590866728797763, "grad_norm": 1.0519293505729592, "learning_rate": 3.0307214359682433e-06, "loss": 1.5082, "step": 2780 }, { "epoch": 2.591798695246971, "grad_norm": 1.4536405367065148, "learning_rate": 3.0238177424922337e-06, "loss": 1.7094, "step": 2781 }, { "epoch": 2.592730661696179, "grad_norm": 1.3053322886452776, "learning_rate": 3.016914049016224e-06, "loss": 1.4553, "step": 2782 }, { "epoch": 2.5936626281453865, "grad_norm": 1.169036911325362, "learning_rate": 3.0100103555402143e-06, "loss": 1.7862, "step": 2783 }, { "epoch": 2.5945945945945947, "grad_norm": 1.2987905952384435, "learning_rate": 3.0031066620642046e-06, "loss": 1.7255, "step": 2784 }, { "epoch": 2.5955265610438025, "grad_norm": 1.160135583505425, "learning_rate": 2.996202968588195e-06, "loss": 1.4305, "step": 2785 }, { "epoch": 2.5964585274930103, "grad_norm": 1.3737307150372065, "learning_rate": 2.989299275112185e-06, "loss": 1.7512, "step": 2786 }, { "epoch": 2.597390493942218, "grad_norm": 1.1347352111142077, "learning_rate": 2.9823955816361755e-06, "loss": 1.7643, "step": 2787 }, { "epoch": 2.598322460391426, "grad_norm": 1.0606085179371854, "learning_rate": 2.975491888160166e-06, "loss": 1.6026, "step": 2788 }, { "epoch": 2.5992544268406337, "grad_norm": 1.4858892540230586, "learning_rate": 2.968588194684156e-06, "loss": 1.6376, "step": 2789 }, { "epoch": 2.6001863932898415, "grad_norm": 1.387753912701863, "learning_rate": 2.9616845012081464e-06, "loss": 1.5779, "step": 2790 }, { "epoch": 2.6011183597390493, "grad_norm": 1.3001356146080443, "learning_rate": 2.954780807732137e-06, "loss": 1.6233, "step": 2791 }, { "epoch": 2.602050326188257, "grad_norm": 1.1612633015493856, "learning_rate": 2.9478771142561275e-06, "loss": 1.6807, "step": 2792 }, { "epoch": 2.6029822926374653, "grad_norm": 1.390989412067465, "learning_rate": 2.940973420780118e-06, "loss": 1.8021, "step": 2793 }, { "epoch": 2.6039142590866726, "grad_norm": 1.449669665479545, "learning_rate": 2.934069727304108e-06, "loss": 1.5996, "step": 2794 }, { "epoch": 2.604846225535881, "grad_norm": 1.1295578982727617, "learning_rate": 2.9271660338280984e-06, "loss": 1.5287, "step": 2795 }, { "epoch": 2.6057781919850886, "grad_norm": 1.5487311282628489, "learning_rate": 2.9202623403520887e-06, "loss": 1.7952, "step": 2796 }, { "epoch": 2.6067101584342964, "grad_norm": 1.0571316720833257, "learning_rate": 2.913358646876079e-06, "loss": 1.6104, "step": 2797 }, { "epoch": 2.607642124883504, "grad_norm": 1.3475349676186106, "learning_rate": 2.906454953400069e-06, "loss": 1.8531, "step": 2798 }, { "epoch": 2.608574091332712, "grad_norm": 1.4187478028514395, "learning_rate": 2.8995512599240592e-06, "loss": 1.5267, "step": 2799 }, { "epoch": 2.60950605778192, "grad_norm": 1.2583819803585024, "learning_rate": 2.8926475664480495e-06, "loss": 1.6129, "step": 2800 }, { "epoch": 2.6104380242311276, "grad_norm": 1.2608214408750535, "learning_rate": 2.8857438729720407e-06, "loss": 1.6901, "step": 2801 }, { "epoch": 2.6113699906803354, "grad_norm": 1.4918327326468714, "learning_rate": 2.878840179496031e-06, "loss": 1.5244, "step": 2802 }, { "epoch": 2.612301957129543, "grad_norm": 1.2692102928579858, "learning_rate": 2.871936486020021e-06, "loss": 1.5035, "step": 2803 }, { "epoch": 2.6132339235787514, "grad_norm": 1.3899561652604862, "learning_rate": 2.865032792544011e-06, "loss": 1.6521, "step": 2804 }, { "epoch": 2.6141658900279587, "grad_norm": 1.1970633963221387, "learning_rate": 2.8581290990680015e-06, "loss": 1.5552, "step": 2805 }, { "epoch": 2.615097856477167, "grad_norm": 1.3750681870428034, "learning_rate": 2.851225405591992e-06, "loss": 2.0943, "step": 2806 }, { "epoch": 2.6160298229263748, "grad_norm": 1.0677532536147187, "learning_rate": 2.844321712115982e-06, "loss": 1.4537, "step": 2807 }, { "epoch": 2.6169617893755825, "grad_norm": 1.2486177398720402, "learning_rate": 2.8374180186399724e-06, "loss": 1.6367, "step": 2808 }, { "epoch": 2.6178937558247903, "grad_norm": 1.1327431095252043, "learning_rate": 2.8305143251639627e-06, "loss": 1.893, "step": 2809 }, { "epoch": 2.618825722273998, "grad_norm": 1.0791151770938168, "learning_rate": 2.823610631687953e-06, "loss": 1.2803, "step": 2810 }, { "epoch": 2.619757688723206, "grad_norm": 1.651820514615709, "learning_rate": 2.8167069382119438e-06, "loss": 1.5076, "step": 2811 }, { "epoch": 2.6206896551724137, "grad_norm": 1.1282153255666862, "learning_rate": 2.809803244735934e-06, "loss": 1.3819, "step": 2812 }, { "epoch": 2.6216216216216215, "grad_norm": 1.1621705471308994, "learning_rate": 2.8028995512599244e-06, "loss": 1.1621, "step": 2813 }, { "epoch": 2.6225535880708293, "grad_norm": 1.1011598442762935, "learning_rate": 2.7959958577839147e-06, "loss": 1.7044, "step": 2814 }, { "epoch": 2.6234855545200375, "grad_norm": 1.0826287106968637, "learning_rate": 2.789092164307905e-06, "loss": 1.5428, "step": 2815 }, { "epoch": 2.624417520969245, "grad_norm": 1.1186235633242185, "learning_rate": 2.7821884708318953e-06, "loss": 1.4829, "step": 2816 }, { "epoch": 2.625349487418453, "grad_norm": 1.2500760622374811, "learning_rate": 2.7752847773558856e-06, "loss": 1.7435, "step": 2817 }, { "epoch": 2.626281453867661, "grad_norm": 1.2994934908694784, "learning_rate": 2.768381083879876e-06, "loss": 1.6445, "step": 2818 }, { "epoch": 2.6272134203168687, "grad_norm": 1.1484358379790058, "learning_rate": 2.7614773904038662e-06, "loss": 1.4308, "step": 2819 }, { "epoch": 2.6281453867660765, "grad_norm": 1.4017568692575648, "learning_rate": 2.7545736969278566e-06, "loss": 1.4355, "step": 2820 }, { "epoch": 2.6290773532152842, "grad_norm": 1.3842884547936394, "learning_rate": 2.7476700034518473e-06, "loss": 1.9214, "step": 2821 }, { "epoch": 2.630009319664492, "grad_norm": 1.3054191142535922, "learning_rate": 2.7407663099758376e-06, "loss": 1.7206, "step": 2822 }, { "epoch": 2.6309412861137, "grad_norm": 1.2383194298459432, "learning_rate": 2.733862616499828e-06, "loss": 1.8779, "step": 2823 }, { "epoch": 2.6318732525629076, "grad_norm": 1.244535832920069, "learning_rate": 2.7269589230238182e-06, "loss": 1.7112, "step": 2824 }, { "epoch": 2.6328052190121154, "grad_norm": 1.1952831139895166, "learning_rate": 2.7200552295478085e-06, "loss": 1.5076, "step": 2825 }, { "epoch": 2.6337371854613236, "grad_norm": 1.2074263412042288, "learning_rate": 2.7131515360717984e-06, "loss": 1.6024, "step": 2826 }, { "epoch": 2.634669151910531, "grad_norm": 1.243034427284513, "learning_rate": 2.7062478425957887e-06, "loss": 1.8555, "step": 2827 }, { "epoch": 2.635601118359739, "grad_norm": 1.4137410529104522, "learning_rate": 2.699344149119779e-06, "loss": 1.7824, "step": 2828 }, { "epoch": 2.636533084808947, "grad_norm": 1.5252248284039205, "learning_rate": 2.6924404556437693e-06, "loss": 1.4223, "step": 2829 }, { "epoch": 2.637465051258155, "grad_norm": 1.4859532422004182, "learning_rate": 2.6855367621677596e-06, "loss": 2.0373, "step": 2830 }, { "epoch": 2.6383970177073626, "grad_norm": 1.3351363835989565, "learning_rate": 2.6786330686917504e-06, "loss": 1.547, "step": 2831 }, { "epoch": 2.6393289841565704, "grad_norm": 1.5437530867227356, "learning_rate": 2.6717293752157407e-06, "loss": 1.6909, "step": 2832 }, { "epoch": 2.640260950605778, "grad_norm": 1.1695458509751735, "learning_rate": 2.664825681739731e-06, "loss": 1.4345, "step": 2833 }, { "epoch": 2.641192917054986, "grad_norm": 1.149926203501616, "learning_rate": 2.6579219882637213e-06, "loss": 1.8681, "step": 2834 }, { "epoch": 2.6421248835041937, "grad_norm": 1.2080776017897736, "learning_rate": 2.6510182947877116e-06, "loss": 1.4231, "step": 2835 }, { "epoch": 2.6430568499534015, "grad_norm": 1.2535965424228754, "learning_rate": 2.644114601311702e-06, "loss": 1.4088, "step": 2836 }, { "epoch": 2.6439888164026097, "grad_norm": 1.2214275028400272, "learning_rate": 2.6372109078356922e-06, "loss": 1.8279, "step": 2837 }, { "epoch": 2.644920782851817, "grad_norm": 1.0707217024028814, "learning_rate": 2.6303072143596825e-06, "loss": 1.649, "step": 2838 }, { "epoch": 2.6458527493010253, "grad_norm": 1.2236780709788484, "learning_rate": 2.623403520883673e-06, "loss": 1.4965, "step": 2839 }, { "epoch": 2.646784715750233, "grad_norm": 1.1583245380041445, "learning_rate": 2.616499827407663e-06, "loss": 1.6023, "step": 2840 }, { "epoch": 2.647716682199441, "grad_norm": 1.1339372700374186, "learning_rate": 2.609596133931654e-06, "loss": 1.5431, "step": 2841 }, { "epoch": 2.6486486486486487, "grad_norm": 1.2673621391975898, "learning_rate": 2.602692440455644e-06, "loss": 2.0369, "step": 2842 }, { "epoch": 2.6495806150978565, "grad_norm": 1.1422552427641235, "learning_rate": 2.5957887469796345e-06, "loss": 1.6393, "step": 2843 }, { "epoch": 2.6505125815470643, "grad_norm": 1.3566926084209543, "learning_rate": 2.588885053503625e-06, "loss": 1.4896, "step": 2844 }, { "epoch": 2.651444547996272, "grad_norm": 1.1411869433570612, "learning_rate": 2.581981360027615e-06, "loss": 1.5957, "step": 2845 }, { "epoch": 2.65237651444548, "grad_norm": 1.2199508332190352, "learning_rate": 2.5750776665516054e-06, "loss": 1.7229, "step": 2846 }, { "epoch": 2.6533084808946876, "grad_norm": 1.3059242538671865, "learning_rate": 2.5681739730755957e-06, "loss": 1.8733, "step": 2847 }, { "epoch": 2.654240447343896, "grad_norm": 1.2219751381962305, "learning_rate": 2.561270279599586e-06, "loss": 1.5902, "step": 2848 }, { "epoch": 2.655172413793103, "grad_norm": 1.282300439468855, "learning_rate": 2.554366586123576e-06, "loss": 1.8989, "step": 2849 }, { "epoch": 2.6561043802423114, "grad_norm": 1.2249554374331022, "learning_rate": 2.5474628926475662e-06, "loss": 1.7449, "step": 2850 }, { "epoch": 2.6570363466915192, "grad_norm": 1.4708466583345066, "learning_rate": 2.5405591991715574e-06, "loss": 1.5363, "step": 2851 }, { "epoch": 2.657968313140727, "grad_norm": 1.2290139929940822, "learning_rate": 2.5336555056955477e-06, "loss": 1.4373, "step": 2852 }, { "epoch": 2.658900279589935, "grad_norm": 1.319267789005102, "learning_rate": 2.5267518122195376e-06, "loss": 1.6198, "step": 2853 }, { "epoch": 2.6598322460391426, "grad_norm": 1.2712336316320403, "learning_rate": 2.519848118743528e-06, "loss": 1.7388, "step": 2854 }, { "epoch": 2.6607642124883504, "grad_norm": 1.1494520035919413, "learning_rate": 2.5129444252675182e-06, "loss": 1.6247, "step": 2855 }, { "epoch": 2.661696178937558, "grad_norm": 0.9856891168204324, "learning_rate": 2.5060407317915085e-06, "loss": 1.69, "step": 2856 }, { "epoch": 2.662628145386766, "grad_norm": 1.2656183486961299, "learning_rate": 2.499137038315499e-06, "loss": 1.5384, "step": 2857 }, { "epoch": 2.6635601118359737, "grad_norm": 1.2019263060592753, "learning_rate": 2.492233344839489e-06, "loss": 1.5838, "step": 2858 }, { "epoch": 2.664492078285182, "grad_norm": 1.5179542938561013, "learning_rate": 2.48532965136348e-06, "loss": 1.7167, "step": 2859 }, { "epoch": 2.6654240447343893, "grad_norm": 1.264258815841328, "learning_rate": 2.47842595788747e-06, "loss": 1.7385, "step": 2860 }, { "epoch": 2.6663560111835976, "grad_norm": 1.4360963201465833, "learning_rate": 2.4715222644114605e-06, "loss": 1.4166, "step": 2861 }, { "epoch": 2.6672879776328053, "grad_norm": 1.2640020545889803, "learning_rate": 2.464618570935451e-06, "loss": 1.7337, "step": 2862 }, { "epoch": 2.668219944082013, "grad_norm": 1.213056389976662, "learning_rate": 2.4577148774594407e-06, "loss": 1.7234, "step": 2863 }, { "epoch": 2.669151910531221, "grad_norm": 1.1188426681864267, "learning_rate": 2.4508111839834314e-06, "loss": 1.5245, "step": 2864 }, { "epoch": 2.6700838769804287, "grad_norm": 1.3439302755181894, "learning_rate": 2.4439074905074217e-06, "loss": 2.1232, "step": 2865 }, { "epoch": 2.6710158434296365, "grad_norm": 1.1741486177366218, "learning_rate": 2.437003797031412e-06, "loss": 1.8484, "step": 2866 }, { "epoch": 2.6719478098788443, "grad_norm": 1.1992389396304186, "learning_rate": 2.4301001035554023e-06, "loss": 1.7632, "step": 2867 }, { "epoch": 2.672879776328052, "grad_norm": 1.147679487945916, "learning_rate": 2.4231964100793927e-06, "loss": 1.5081, "step": 2868 }, { "epoch": 2.67381174277726, "grad_norm": 1.0842503526739538, "learning_rate": 2.416292716603383e-06, "loss": 1.2697, "step": 2869 }, { "epoch": 2.674743709226468, "grad_norm": 1.1923853528998551, "learning_rate": 2.4093890231273733e-06, "loss": 1.5491, "step": 2870 }, { "epoch": 2.6756756756756754, "grad_norm": 1.3266075303768885, "learning_rate": 2.4024853296513636e-06, "loss": 1.7065, "step": 2871 }, { "epoch": 2.6766076421248837, "grad_norm": 1.2356909908101883, "learning_rate": 2.395581636175354e-06, "loss": 1.2544, "step": 2872 }, { "epoch": 2.6775396085740915, "grad_norm": 1.353084614926184, "learning_rate": 2.388677942699344e-06, "loss": 1.8275, "step": 2873 }, { "epoch": 2.6784715750232992, "grad_norm": 1.3301926304164868, "learning_rate": 2.381774249223335e-06, "loss": 1.8445, "step": 2874 }, { "epoch": 2.679403541472507, "grad_norm": 1.2385565842522162, "learning_rate": 2.3748705557473252e-06, "loss": 1.7445, "step": 2875 }, { "epoch": 2.680335507921715, "grad_norm": 1.5530630776753713, "learning_rate": 2.367966862271315e-06, "loss": 1.5743, "step": 2876 }, { "epoch": 2.6812674743709226, "grad_norm": 1.1606398869899053, "learning_rate": 2.3610631687953054e-06, "loss": 1.3775, "step": 2877 }, { "epoch": 2.6821994408201304, "grad_norm": 1.1685611199147858, "learning_rate": 2.3541594753192957e-06, "loss": 1.5774, "step": 2878 }, { "epoch": 2.683131407269338, "grad_norm": 1.1501696571940854, "learning_rate": 2.3472557818432865e-06, "loss": 1.4451, "step": 2879 }, { "epoch": 2.684063373718546, "grad_norm": 1.1675941474059204, "learning_rate": 2.3403520883672768e-06, "loss": 1.6072, "step": 2880 }, { "epoch": 2.684995340167754, "grad_norm": 1.2241367504259237, "learning_rate": 2.333448394891267e-06, "loss": 1.8337, "step": 2881 }, { "epoch": 2.6859273066169616, "grad_norm": 1.3130642071556988, "learning_rate": 2.3265447014152574e-06, "loss": 1.6822, "step": 2882 }, { "epoch": 2.68685927306617, "grad_norm": 1.07427395726844, "learning_rate": 2.3196410079392477e-06, "loss": 1.8442, "step": 2883 }, { "epoch": 2.6877912395153776, "grad_norm": 1.38142213348654, "learning_rate": 2.312737314463238e-06, "loss": 1.6256, "step": 2884 }, { "epoch": 2.6887232059645854, "grad_norm": 1.0091571435211886, "learning_rate": 2.3058336209872283e-06, "loss": 1.4264, "step": 2885 }, { "epoch": 2.689655172413793, "grad_norm": 1.1196204014177285, "learning_rate": 2.2989299275112186e-06, "loss": 1.6032, "step": 2886 }, { "epoch": 2.690587138863001, "grad_norm": 1.1183715188447207, "learning_rate": 2.292026234035209e-06, "loss": 1.4605, "step": 2887 }, { "epoch": 2.6915191053122087, "grad_norm": 1.339003908655625, "learning_rate": 2.2851225405591993e-06, "loss": 1.4947, "step": 2888 }, { "epoch": 2.6924510717614165, "grad_norm": 1.1913038349279546, "learning_rate": 2.27821884708319e-06, "loss": 1.5325, "step": 2889 }, { "epoch": 2.6933830382106243, "grad_norm": 1.105114474476193, "learning_rate": 2.27131515360718e-06, "loss": 1.6449, "step": 2890 }, { "epoch": 2.694315004659832, "grad_norm": 1.1909400842467506, "learning_rate": 2.26441146013117e-06, "loss": 1.4555, "step": 2891 }, { "epoch": 2.6952469711090403, "grad_norm": 1.1745174035204762, "learning_rate": 2.2575077666551605e-06, "loss": 1.5606, "step": 2892 }, { "epoch": 2.6961789375582477, "grad_norm": 1.091826581200835, "learning_rate": 2.250604073179151e-06, "loss": 1.6035, "step": 2893 }, { "epoch": 2.697110904007456, "grad_norm": 1.2899511904896104, "learning_rate": 2.2437003797031415e-06, "loss": 1.6062, "step": 2894 }, { "epoch": 2.6980428704566637, "grad_norm": 1.1098947514122985, "learning_rate": 2.236796686227132e-06, "loss": 1.5894, "step": 2895 }, { "epoch": 2.6989748369058715, "grad_norm": 1.3305228993621436, "learning_rate": 2.229892992751122e-06, "loss": 2.1464, "step": 2896 }, { "epoch": 2.6999068033550793, "grad_norm": 1.1009172350511953, "learning_rate": 2.2229892992751125e-06, "loss": 1.4986, "step": 2897 }, { "epoch": 2.700838769804287, "grad_norm": 1.4378399950106082, "learning_rate": 2.2160856057991028e-06, "loss": 1.9169, "step": 2898 }, { "epoch": 2.701770736253495, "grad_norm": 1.212783868889214, "learning_rate": 2.209181912323093e-06, "loss": 1.549, "step": 2899 }, { "epoch": 2.7027027027027026, "grad_norm": 1.163010368852381, "learning_rate": 2.2022782188470834e-06, "loss": 1.5698, "step": 2900 }, { "epoch": 2.7036346691519104, "grad_norm": 1.4799276679412379, "learning_rate": 2.1953745253710737e-06, "loss": 1.4377, "step": 2901 }, { "epoch": 2.704566635601118, "grad_norm": 1.2035479858041882, "learning_rate": 2.188470831895064e-06, "loss": 1.5863, "step": 2902 }, { "epoch": 2.7054986020503264, "grad_norm": 1.2538191158051146, "learning_rate": 2.1815671384190543e-06, "loss": 1.3187, "step": 2903 }, { "epoch": 2.706430568499534, "grad_norm": 1.4034771113928868, "learning_rate": 2.1746634449430446e-06, "loss": 1.9529, "step": 2904 }, { "epoch": 2.707362534948742, "grad_norm": 1.1142192955525554, "learning_rate": 2.167759751467035e-06, "loss": 1.5109, "step": 2905 }, { "epoch": 2.70829450139795, "grad_norm": 1.3399645710158745, "learning_rate": 2.1608560579910252e-06, "loss": 1.7358, "step": 2906 }, { "epoch": 2.7092264678471576, "grad_norm": 1.9871310213679814, "learning_rate": 2.1539523645150156e-06, "loss": 1.6684, "step": 2907 }, { "epoch": 2.7101584342963654, "grad_norm": 1.3213050790953627, "learning_rate": 2.147048671039006e-06, "loss": 1.4417, "step": 2908 }, { "epoch": 2.711090400745573, "grad_norm": 1.2369238336656934, "learning_rate": 2.1401449775629966e-06, "loss": 1.4939, "step": 2909 }, { "epoch": 2.712022367194781, "grad_norm": 1.6322717979417791, "learning_rate": 2.133241284086987e-06, "loss": 1.8461, "step": 2910 }, { "epoch": 2.7129543336439887, "grad_norm": 1.1742189714155646, "learning_rate": 2.1263375906109772e-06, "loss": 1.6272, "step": 2911 }, { "epoch": 2.7138863000931965, "grad_norm": 1.3700869153603807, "learning_rate": 2.1194338971349675e-06, "loss": 1.6422, "step": 2912 }, { "epoch": 2.7148182665424043, "grad_norm": 1.1192746629487293, "learning_rate": 2.1125302036589574e-06, "loss": 1.5017, "step": 2913 }, { "epoch": 2.7157502329916126, "grad_norm": 0.9802072619320469, "learning_rate": 2.105626510182948e-06, "loss": 1.3152, "step": 2914 }, { "epoch": 2.71668219944082, "grad_norm": 1.5604793558155807, "learning_rate": 2.0987228167069384e-06, "loss": 1.6795, "step": 2915 }, { "epoch": 2.717614165890028, "grad_norm": 1.1431755586573276, "learning_rate": 2.0918191232309288e-06, "loss": 1.5804, "step": 2916 }, { "epoch": 2.718546132339236, "grad_norm": 1.1836442667969314, "learning_rate": 2.084915429754919e-06, "loss": 1.5053, "step": 2917 }, { "epoch": 2.7194780987884437, "grad_norm": 1.2539923541085176, "learning_rate": 2.0780117362789094e-06, "loss": 1.8682, "step": 2918 }, { "epoch": 2.7204100652376515, "grad_norm": 1.2052054004572612, "learning_rate": 2.0711080428028997e-06, "loss": 1.6716, "step": 2919 }, { "epoch": 2.7213420316868593, "grad_norm": 1.5731602710949988, "learning_rate": 2.06420434932689e-06, "loss": 1.7441, "step": 2920 }, { "epoch": 2.722273998136067, "grad_norm": 1.1853211180801277, "learning_rate": 2.0573006558508803e-06, "loss": 1.5861, "step": 2921 }, { "epoch": 2.723205964585275, "grad_norm": 1.1981398595856583, "learning_rate": 2.0503969623748706e-06, "loss": 1.5729, "step": 2922 }, { "epoch": 2.7241379310344827, "grad_norm": 1.3804685561404288, "learning_rate": 2.043493268898861e-06, "loss": 1.6554, "step": 2923 }, { "epoch": 2.7250698974836904, "grad_norm": 1.358180851284877, "learning_rate": 2.0365895754228517e-06, "loss": 1.6147, "step": 2924 }, { "epoch": 2.7260018639328987, "grad_norm": 1.3007249432778643, "learning_rate": 2.029685881946842e-06, "loss": 1.6621, "step": 2925 }, { "epoch": 2.726933830382106, "grad_norm": 1.1488206021003893, "learning_rate": 2.0227821884708323e-06, "loss": 1.4253, "step": 2926 }, { "epoch": 2.7278657968313142, "grad_norm": 1.5037061218928667, "learning_rate": 2.015878494994822e-06, "loss": 1.6328, "step": 2927 }, { "epoch": 2.728797763280522, "grad_norm": 1.0667533398945213, "learning_rate": 2.0089748015188125e-06, "loss": 1.4541, "step": 2928 }, { "epoch": 2.72972972972973, "grad_norm": 1.2478489983848255, "learning_rate": 2.002071108042803e-06, "loss": 1.3975, "step": 2929 }, { "epoch": 2.7306616961789376, "grad_norm": 1.14763512135669, "learning_rate": 1.9951674145667935e-06, "loss": 1.643, "step": 2930 }, { "epoch": 2.7315936626281454, "grad_norm": 1.3531001362997317, "learning_rate": 1.988263721090784e-06, "loss": 1.7724, "step": 2931 }, { "epoch": 2.732525629077353, "grad_norm": 1.050236770154482, "learning_rate": 1.981360027614774e-06, "loss": 1.3511, "step": 2932 }, { "epoch": 2.733457595526561, "grad_norm": 1.5291212072954714, "learning_rate": 1.9744563341387644e-06, "loss": 1.9961, "step": 2933 }, { "epoch": 2.7343895619757688, "grad_norm": 1.4702532650709368, "learning_rate": 1.9675526406627547e-06, "loss": 2.0647, "step": 2934 }, { "epoch": 2.7353215284249766, "grad_norm": 1.0752190770354189, "learning_rate": 1.960648947186745e-06, "loss": 1.2367, "step": 2935 }, { "epoch": 2.736253494874185, "grad_norm": 1.3135489216112497, "learning_rate": 1.9537452537107354e-06, "loss": 1.7688, "step": 2936 }, { "epoch": 2.737185461323392, "grad_norm": 1.022630523799684, "learning_rate": 1.9468415602347257e-06, "loss": 1.4261, "step": 2937 }, { "epoch": 2.7381174277726004, "grad_norm": 1.35459887614155, "learning_rate": 1.939937866758716e-06, "loss": 1.788, "step": 2938 }, { "epoch": 2.739049394221808, "grad_norm": 1.319632509529697, "learning_rate": 1.9330341732827067e-06, "loss": 1.6636, "step": 2939 }, { "epoch": 2.739981360671016, "grad_norm": 0.9891799514049192, "learning_rate": 1.926130479806697e-06, "loss": 1.2794, "step": 2940 }, { "epoch": 2.7409133271202237, "grad_norm": 1.3845498033552293, "learning_rate": 1.919226786330687e-06, "loss": 1.7286, "step": 2941 }, { "epoch": 2.7418452935694315, "grad_norm": 1.2269095062559463, "learning_rate": 1.9123230928546772e-06, "loss": 1.5031, "step": 2942 }, { "epoch": 2.7427772600186393, "grad_norm": 1.1967135914499876, "learning_rate": 1.9054193993786677e-06, "loss": 1.6674, "step": 2943 }, { "epoch": 2.743709226467847, "grad_norm": 1.1294071501679175, "learning_rate": 1.8985157059026583e-06, "loss": 1.5418, "step": 2944 }, { "epoch": 2.744641192917055, "grad_norm": 1.039454594252878, "learning_rate": 1.8916120124266486e-06, "loss": 1.3146, "step": 2945 }, { "epoch": 2.7455731593662627, "grad_norm": 1.260119866790315, "learning_rate": 1.8847083189506389e-06, "loss": 1.6927, "step": 2946 }, { "epoch": 2.746505125815471, "grad_norm": 1.2402349423253038, "learning_rate": 1.877804625474629e-06, "loss": 1.6845, "step": 2947 }, { "epoch": 2.7474370922646782, "grad_norm": 1.103215624933358, "learning_rate": 1.8709009319986193e-06, "loss": 1.507, "step": 2948 }, { "epoch": 2.7483690587138865, "grad_norm": 1.033587903394416, "learning_rate": 1.8639972385226098e-06, "loss": 1.5373, "step": 2949 }, { "epoch": 2.7493010251630943, "grad_norm": 1.0768790866545364, "learning_rate": 1.8570935450466001e-06, "loss": 1.518, "step": 2950 }, { "epoch": 2.750232991612302, "grad_norm": 1.209033601210394, "learning_rate": 1.8501898515705904e-06, "loss": 1.5189, "step": 2951 }, { "epoch": 2.75116495806151, "grad_norm": 1.1917110452684074, "learning_rate": 1.8432861580945807e-06, "loss": 1.4101, "step": 2952 }, { "epoch": 2.7520969245107176, "grad_norm": 1.1194418629514378, "learning_rate": 1.836382464618571e-06, "loss": 1.5411, "step": 2953 }, { "epoch": 2.7530288909599254, "grad_norm": 1.3869601342631777, "learning_rate": 1.8294787711425613e-06, "loss": 1.7204, "step": 2954 }, { "epoch": 2.753960857409133, "grad_norm": 1.4490405229061396, "learning_rate": 1.8225750776665519e-06, "loss": 1.6867, "step": 2955 }, { "epoch": 2.754892823858341, "grad_norm": 1.1571971912072212, "learning_rate": 1.8156713841905422e-06, "loss": 1.7046, "step": 2956 }, { "epoch": 2.755824790307549, "grad_norm": 1.4745861261095399, "learning_rate": 1.8087676907145325e-06, "loss": 1.4099, "step": 2957 }, { "epoch": 2.756756756756757, "grad_norm": 1.298262494286628, "learning_rate": 1.8018639972385226e-06, "loss": 1.7127, "step": 2958 }, { "epoch": 2.7576887232059644, "grad_norm": 1.3805170096376609, "learning_rate": 1.7949603037625129e-06, "loss": 1.6881, "step": 2959 }, { "epoch": 2.7586206896551726, "grad_norm": 1.424424769219356, "learning_rate": 1.7880566102865034e-06, "loss": 1.5198, "step": 2960 }, { "epoch": 2.7595526561043804, "grad_norm": 1.502576442771367, "learning_rate": 1.7811529168104937e-06, "loss": 1.8432, "step": 2961 }, { "epoch": 2.760484622553588, "grad_norm": 1.2429014928425874, "learning_rate": 1.774249223334484e-06, "loss": 1.6767, "step": 2962 }, { "epoch": 2.761416589002796, "grad_norm": 1.3306724271741084, "learning_rate": 1.7673455298584743e-06, "loss": 1.6263, "step": 2963 }, { "epoch": 2.7623485554520038, "grad_norm": 1.3194804107102285, "learning_rate": 1.7604418363824646e-06, "loss": 1.7923, "step": 2964 }, { "epoch": 2.7632805219012115, "grad_norm": 1.213899673951874, "learning_rate": 1.7535381429064552e-06, "loss": 1.6308, "step": 2965 }, { "epoch": 2.7642124883504193, "grad_norm": 1.1598268185551326, "learning_rate": 1.7466344494304455e-06, "loss": 1.6964, "step": 2966 }, { "epoch": 2.765144454799627, "grad_norm": 1.1722560696888356, "learning_rate": 1.7397307559544358e-06, "loss": 1.2123, "step": 2967 }, { "epoch": 2.766076421248835, "grad_norm": 1.3179409902481167, "learning_rate": 1.732827062478426e-06, "loss": 1.6411, "step": 2968 }, { "epoch": 2.767008387698043, "grad_norm": 1.0702492046818908, "learning_rate": 1.7259233690024164e-06, "loss": 1.815, "step": 2969 }, { "epoch": 2.7679403541472505, "grad_norm": 1.0384127915433177, "learning_rate": 1.719019675526407e-06, "loss": 1.3263, "step": 2970 }, { "epoch": 2.7688723205964587, "grad_norm": 1.2413111728833508, "learning_rate": 1.7121159820503972e-06, "loss": 1.5282, "step": 2971 }, { "epoch": 2.7698042870456665, "grad_norm": 1.2001225576791814, "learning_rate": 1.7052122885743873e-06, "loss": 2.1532, "step": 2972 }, { "epoch": 2.7707362534948743, "grad_norm": 1.2954288318140974, "learning_rate": 1.6983085950983776e-06, "loss": 1.8376, "step": 2973 }, { "epoch": 2.771668219944082, "grad_norm": 1.0300275987017449, "learning_rate": 1.691404901622368e-06, "loss": 1.6383, "step": 2974 }, { "epoch": 2.77260018639329, "grad_norm": 1.2033444252874093, "learning_rate": 1.6845012081463585e-06, "loss": 1.5171, "step": 2975 }, { "epoch": 2.7735321528424977, "grad_norm": 1.3025269614640052, "learning_rate": 1.6775975146703488e-06, "loss": 1.4792, "step": 2976 }, { "epoch": 2.7744641192917054, "grad_norm": 1.1910877671464901, "learning_rate": 1.670693821194339e-06, "loss": 1.6007, "step": 2977 }, { "epoch": 2.7753960857409132, "grad_norm": 1.4744437808948472, "learning_rate": 1.6637901277183294e-06, "loss": 1.9395, "step": 2978 }, { "epoch": 2.776328052190121, "grad_norm": 1.3125126070920918, "learning_rate": 1.6568864342423197e-06, "loss": 1.532, "step": 2979 }, { "epoch": 2.7772600186393293, "grad_norm": 1.308692222026942, "learning_rate": 1.6499827407663102e-06, "loss": 1.7067, "step": 2980 }, { "epoch": 2.7781919850885366, "grad_norm": 1.351703010923721, "learning_rate": 1.6430790472903005e-06, "loss": 1.7805, "step": 2981 }, { "epoch": 2.779123951537745, "grad_norm": 1.9284619655486572, "learning_rate": 1.6361753538142908e-06, "loss": 2.1141, "step": 2982 }, { "epoch": 2.7800559179869526, "grad_norm": 1.1945091239618857, "learning_rate": 1.629271660338281e-06, "loss": 1.2798, "step": 2983 }, { "epoch": 2.7809878844361604, "grad_norm": 1.1542633347379092, "learning_rate": 1.6223679668622713e-06, "loss": 1.6988, "step": 2984 }, { "epoch": 2.781919850885368, "grad_norm": 1.1677445670498827, "learning_rate": 1.615464273386262e-06, "loss": 1.5612, "step": 2985 }, { "epoch": 2.782851817334576, "grad_norm": 1.4657426127613602, "learning_rate": 1.608560579910252e-06, "loss": 1.4118, "step": 2986 }, { "epoch": 2.7837837837837838, "grad_norm": 1.3276088481898722, "learning_rate": 1.6016568864342424e-06, "loss": 1.5631, "step": 2987 }, { "epoch": 2.7847157502329916, "grad_norm": 1.213730217153991, "learning_rate": 1.5947531929582327e-06, "loss": 1.7305, "step": 2988 }, { "epoch": 2.7856477166821993, "grad_norm": 1.3956510532420934, "learning_rate": 1.587849499482223e-06, "loss": 1.5327, "step": 2989 }, { "epoch": 2.786579683131407, "grad_norm": 1.27668127082559, "learning_rate": 1.5809458060062135e-06, "loss": 1.492, "step": 2990 }, { "epoch": 2.7875116495806154, "grad_norm": 1.1519522336253594, "learning_rate": 1.5740421125302038e-06, "loss": 1.2907, "step": 2991 }, { "epoch": 2.7884436160298227, "grad_norm": 1.1549930043720957, "learning_rate": 1.5671384190541941e-06, "loss": 1.5229, "step": 2992 }, { "epoch": 2.789375582479031, "grad_norm": 1.2144137720038455, "learning_rate": 1.5602347255781845e-06, "loss": 1.6535, "step": 2993 }, { "epoch": 2.7903075489282387, "grad_norm": 1.3636848556078343, "learning_rate": 1.5533310321021748e-06, "loss": 1.5232, "step": 2994 }, { "epoch": 2.7912395153774465, "grad_norm": 1.150862003856907, "learning_rate": 1.5464273386261653e-06, "loss": 1.6439, "step": 2995 }, { "epoch": 2.7921714818266543, "grad_norm": 1.3268333145134166, "learning_rate": 1.5395236451501556e-06, "loss": 1.4679, "step": 2996 }, { "epoch": 2.793103448275862, "grad_norm": 1.2148677566107233, "learning_rate": 1.5326199516741457e-06, "loss": 1.4067, "step": 2997 }, { "epoch": 2.79403541472507, "grad_norm": 1.1941494779821324, "learning_rate": 1.525716258198136e-06, "loss": 1.6382, "step": 2998 }, { "epoch": 2.7949673811742777, "grad_norm": 1.2521218110784493, "learning_rate": 1.5188125647221263e-06, "loss": 1.8864, "step": 2999 }, { "epoch": 2.7958993476234855, "grad_norm": 1.0996502361019314, "learning_rate": 1.5119088712461168e-06, "loss": 1.6032, "step": 3000 }, { "epoch": 2.7968313140726933, "grad_norm": 1.1500372834355714, "learning_rate": 1.5050051777701071e-06, "loss": 1.4463, "step": 3001 }, { "epoch": 2.7977632805219015, "grad_norm": 1.0702240568514927, "learning_rate": 1.4981014842940974e-06, "loss": 1.7026, "step": 3002 }, { "epoch": 2.798695246971109, "grad_norm": 1.1349907388940494, "learning_rate": 1.4911977908180878e-06, "loss": 1.4902, "step": 3003 }, { "epoch": 2.799627213420317, "grad_norm": 1.4561374127253985, "learning_rate": 1.484294097342078e-06, "loss": 1.4494, "step": 3004 }, { "epoch": 2.800559179869525, "grad_norm": 1.2662970943394654, "learning_rate": 1.4773904038660686e-06, "loss": 1.5678, "step": 3005 }, { "epoch": 2.8014911463187326, "grad_norm": 1.231746070965575, "learning_rate": 1.470486710390059e-06, "loss": 1.6288, "step": 3006 }, { "epoch": 2.8024231127679404, "grad_norm": 1.211258028123518, "learning_rate": 1.4635830169140492e-06, "loss": 1.7522, "step": 3007 }, { "epoch": 2.803355079217148, "grad_norm": 1.245591120181558, "learning_rate": 1.4566793234380395e-06, "loss": 1.8788, "step": 3008 }, { "epoch": 2.804287045666356, "grad_norm": 1.2647674776486995, "learning_rate": 1.4497756299620296e-06, "loss": 1.768, "step": 3009 }, { "epoch": 2.805219012115564, "grad_norm": 1.1895752087380234, "learning_rate": 1.4428719364860203e-06, "loss": 1.5918, "step": 3010 }, { "epoch": 2.8061509785647716, "grad_norm": 1.188832308552258, "learning_rate": 1.4359682430100104e-06, "loss": 1.5988, "step": 3011 }, { "epoch": 2.8070829450139794, "grad_norm": 1.0248275560629045, "learning_rate": 1.4290645495340007e-06, "loss": 1.2905, "step": 3012 }, { "epoch": 2.8080149114631876, "grad_norm": 1.2163379859889898, "learning_rate": 1.422160856057991e-06, "loss": 1.6999, "step": 3013 }, { "epoch": 2.808946877912395, "grad_norm": 1.3611085743236395, "learning_rate": 1.4152571625819814e-06, "loss": 1.4907, "step": 3014 }, { "epoch": 2.809878844361603, "grad_norm": 1.2269038761624638, "learning_rate": 1.4083534691059719e-06, "loss": 1.714, "step": 3015 }, { "epoch": 2.810810810810811, "grad_norm": 1.23693654545557, "learning_rate": 1.4014497756299622e-06, "loss": 1.4473, "step": 3016 }, { "epoch": 2.8117427772600188, "grad_norm": 1.1909902178340617, "learning_rate": 1.3945460821539525e-06, "loss": 1.5815, "step": 3017 }, { "epoch": 2.8126747437092265, "grad_norm": 1.6328225898385405, "learning_rate": 1.3876423886779428e-06, "loss": 1.8454, "step": 3018 }, { "epoch": 2.8136067101584343, "grad_norm": 1.1169140735901872, "learning_rate": 1.3807386952019331e-06, "loss": 1.5538, "step": 3019 }, { "epoch": 2.814538676607642, "grad_norm": 1.3205944809812022, "learning_rate": 1.3738350017259236e-06, "loss": 1.5131, "step": 3020 }, { "epoch": 2.81547064305685, "grad_norm": 1.5086387066898501, "learning_rate": 1.366931308249914e-06, "loss": 1.8218, "step": 3021 }, { "epoch": 2.8164026095060577, "grad_norm": 1.3224605607992663, "learning_rate": 1.3600276147739043e-06, "loss": 1.6907, "step": 3022 }, { "epoch": 2.8173345759552655, "grad_norm": 1.0679908593907164, "learning_rate": 1.3531239212978944e-06, "loss": 1.4309, "step": 3023 }, { "epoch": 2.8182665424044733, "grad_norm": 1.2286588670773664, "learning_rate": 1.3462202278218847e-06, "loss": 1.4526, "step": 3024 }, { "epoch": 2.819198508853681, "grad_norm": 1.330140813707747, "learning_rate": 1.3393165343458752e-06, "loss": 1.8109, "step": 3025 }, { "epoch": 2.8201304753028893, "grad_norm": 1.3425996105955769, "learning_rate": 1.3324128408698655e-06, "loss": 1.5225, "step": 3026 }, { "epoch": 2.821062441752097, "grad_norm": 1.388463056255122, "learning_rate": 1.3255091473938558e-06, "loss": 1.903, "step": 3027 }, { "epoch": 2.821994408201305, "grad_norm": 1.3663833341034184, "learning_rate": 1.3186054539178461e-06, "loss": 1.5654, "step": 3028 }, { "epoch": 2.8229263746505127, "grad_norm": 1.0739250777452898, "learning_rate": 1.3117017604418364e-06, "loss": 1.5014, "step": 3029 }, { "epoch": 2.8238583410997204, "grad_norm": 1.2494481985713064, "learning_rate": 1.304798066965827e-06, "loss": 1.548, "step": 3030 }, { "epoch": 2.8247903075489282, "grad_norm": 1.3151865747634446, "learning_rate": 1.2978943734898173e-06, "loss": 1.4689, "step": 3031 }, { "epoch": 2.825722273998136, "grad_norm": 1.465474554261152, "learning_rate": 1.2909906800138076e-06, "loss": 1.61, "step": 3032 }, { "epoch": 2.826654240447344, "grad_norm": 1.2504983460977122, "learning_rate": 1.2840869865377979e-06, "loss": 1.4437, "step": 3033 }, { "epoch": 2.8275862068965516, "grad_norm": 1.1241937314249033, "learning_rate": 1.277183293061788e-06, "loss": 1.3762, "step": 3034 }, { "epoch": 2.8285181733457594, "grad_norm": 1.8420356319489946, "learning_rate": 1.2702795995857787e-06, "loss": 1.8522, "step": 3035 }, { "epoch": 2.829450139794967, "grad_norm": 1.2601795206746205, "learning_rate": 1.2633759061097688e-06, "loss": 1.5262, "step": 3036 }, { "epoch": 2.8303821062441754, "grad_norm": 1.779299925593406, "learning_rate": 1.2564722126337591e-06, "loss": 1.7985, "step": 3037 }, { "epoch": 2.831314072693383, "grad_norm": 1.2834451620025211, "learning_rate": 1.2495685191577494e-06, "loss": 1.7445, "step": 3038 }, { "epoch": 2.832246039142591, "grad_norm": 1.3467324385216208, "learning_rate": 1.24266482568174e-06, "loss": 1.6472, "step": 3039 }, { "epoch": 2.8331780055917988, "grad_norm": 1.2212713142683527, "learning_rate": 1.2357611322057302e-06, "loss": 1.5808, "step": 3040 }, { "epoch": 2.8341099720410066, "grad_norm": 1.2412023378450923, "learning_rate": 1.2288574387297203e-06, "loss": 1.5204, "step": 3041 }, { "epoch": 2.8350419384902144, "grad_norm": 1.170533856379214, "learning_rate": 1.2219537452537109e-06, "loss": 1.4667, "step": 3042 }, { "epoch": 2.835973904939422, "grad_norm": 1.0382077949611705, "learning_rate": 1.2150500517777012e-06, "loss": 1.6113, "step": 3043 }, { "epoch": 2.83690587138863, "grad_norm": 1.346172124962084, "learning_rate": 1.2081463583016915e-06, "loss": 1.4681, "step": 3044 }, { "epoch": 2.8378378378378377, "grad_norm": 1.2117889782937268, "learning_rate": 1.2012426648256818e-06, "loss": 1.4858, "step": 3045 }, { "epoch": 2.8387698042870455, "grad_norm": 1.1163224657733584, "learning_rate": 1.194338971349672e-06, "loss": 1.7077, "step": 3046 }, { "epoch": 2.8397017707362533, "grad_norm": 1.1016134081895401, "learning_rate": 1.1874352778736626e-06, "loss": 1.5192, "step": 3047 }, { "epoch": 2.8406337371854615, "grad_norm": 1.2751829544759197, "learning_rate": 1.1805315843976527e-06, "loss": 1.3015, "step": 3048 }, { "epoch": 2.8415657036346693, "grad_norm": 1.458405780112519, "learning_rate": 1.1736278909216432e-06, "loss": 1.5439, "step": 3049 }, { "epoch": 2.842497670083877, "grad_norm": 1.4409058904611647, "learning_rate": 1.1667241974456335e-06, "loss": 1.524, "step": 3050 }, { "epoch": 2.843429636533085, "grad_norm": 1.542544088436309, "learning_rate": 1.1598205039696239e-06, "loss": 1.5451, "step": 3051 }, { "epoch": 2.8443616029822927, "grad_norm": 1.527510848961027, "learning_rate": 1.1529168104936142e-06, "loss": 1.528, "step": 3052 }, { "epoch": 2.8452935694315005, "grad_norm": 1.2051869269224889, "learning_rate": 1.1460131170176045e-06, "loss": 1.4814, "step": 3053 }, { "epoch": 2.8462255358807083, "grad_norm": 1.2601062573022035, "learning_rate": 1.139109423541595e-06, "loss": 1.5648, "step": 3054 }, { "epoch": 2.847157502329916, "grad_norm": 1.5570972004797667, "learning_rate": 1.132205730065585e-06, "loss": 1.6854, "step": 3055 }, { "epoch": 2.848089468779124, "grad_norm": 1.3243050160612195, "learning_rate": 1.1253020365895754e-06, "loss": 1.6391, "step": 3056 }, { "epoch": 2.8490214352283316, "grad_norm": 1.2893503398107466, "learning_rate": 1.118398343113566e-06, "loss": 1.8099, "step": 3057 }, { "epoch": 2.8499534016775394, "grad_norm": 1.4808075535280456, "learning_rate": 1.1114946496375562e-06, "loss": 1.7428, "step": 3058 }, { "epoch": 2.8508853681267476, "grad_norm": 1.4073389475232145, "learning_rate": 1.1045909561615465e-06, "loss": 1.7754, "step": 3059 }, { "epoch": 2.8518173345759554, "grad_norm": 1.3692408296812368, "learning_rate": 1.0976872626855368e-06, "loss": 1.5545, "step": 3060 }, { "epoch": 2.852749301025163, "grad_norm": 1.2329926299536929, "learning_rate": 1.0907835692095272e-06, "loss": 1.4694, "step": 3061 }, { "epoch": 2.853681267474371, "grad_norm": 1.329729043505873, "learning_rate": 1.0838798757335175e-06, "loss": 1.5731, "step": 3062 }, { "epoch": 2.854613233923579, "grad_norm": 1.4114045279779444, "learning_rate": 1.0769761822575078e-06, "loss": 1.7133, "step": 3063 }, { "epoch": 2.8555452003727866, "grad_norm": 1.2624952087426697, "learning_rate": 1.0700724887814983e-06, "loss": 1.5782, "step": 3064 }, { "epoch": 2.8564771668219944, "grad_norm": 1.3803834381724722, "learning_rate": 1.0631687953054886e-06, "loss": 1.5985, "step": 3065 }, { "epoch": 2.857409133271202, "grad_norm": 1.289333074329554, "learning_rate": 1.0562651018294787e-06, "loss": 1.7942, "step": 3066 }, { "epoch": 2.85834109972041, "grad_norm": 1.2321385350345033, "learning_rate": 1.0493614083534692e-06, "loss": 1.6617, "step": 3067 }, { "epoch": 2.8592730661696177, "grad_norm": 1.182954716276999, "learning_rate": 1.0424577148774595e-06, "loss": 1.4825, "step": 3068 }, { "epoch": 2.8602050326188255, "grad_norm": 1.045472935795335, "learning_rate": 1.0355540214014498e-06, "loss": 1.3614, "step": 3069 }, { "epoch": 2.8611369990680338, "grad_norm": 1.473110924024392, "learning_rate": 1.0286503279254402e-06, "loss": 2.0827, "step": 3070 }, { "epoch": 2.862068965517241, "grad_norm": 1.3318164888181065, "learning_rate": 1.0217466344494305e-06, "loss": 1.7092, "step": 3071 }, { "epoch": 2.8630009319664493, "grad_norm": 1.2474740784556695, "learning_rate": 1.014842940973421e-06, "loss": 1.6772, "step": 3072 }, { "epoch": 2.863932898415657, "grad_norm": 1.5075227375208324, "learning_rate": 1.007939247497411e-06, "loss": 1.8685, "step": 3073 }, { "epoch": 2.864864864864865, "grad_norm": 1.295407630098996, "learning_rate": 1.0010355540214016e-06, "loss": 1.5106, "step": 3074 }, { "epoch": 2.8657968313140727, "grad_norm": 1.2196270810112706, "learning_rate": 9.94131860545392e-07, "loss": 1.5897, "step": 3075 }, { "epoch": 2.8667287977632805, "grad_norm": 1.2618717702748568, "learning_rate": 9.872281670693822e-07, "loss": 1.6776, "step": 3076 }, { "epoch": 2.8676607642124883, "grad_norm": 1.1638290726074185, "learning_rate": 9.803244735933725e-07, "loss": 1.6899, "step": 3077 }, { "epoch": 2.868592730661696, "grad_norm": 1.3155684232181066, "learning_rate": 9.734207801173628e-07, "loss": 1.842, "step": 3078 }, { "epoch": 2.869524697110904, "grad_norm": 1.0643593300549403, "learning_rate": 9.665170866413534e-07, "loss": 1.2708, "step": 3079 }, { "epoch": 2.8704566635601116, "grad_norm": 1.2671197671375696, "learning_rate": 9.596133931653435e-07, "loss": 1.8358, "step": 3080 }, { "epoch": 2.87138863000932, "grad_norm": 1.2787188698099317, "learning_rate": 9.527096996893339e-07, "loss": 1.7013, "step": 3081 }, { "epoch": 2.872320596458527, "grad_norm": 1.2571556509827886, "learning_rate": 9.458060062133243e-07, "loss": 1.4286, "step": 3082 }, { "epoch": 2.8732525629077355, "grad_norm": 1.3145222020190677, "learning_rate": 9.389023127373145e-07, "loss": 1.4886, "step": 3083 }, { "epoch": 2.8741845293569432, "grad_norm": 1.1881960481207523, "learning_rate": 9.319986192613049e-07, "loss": 1.6129, "step": 3084 }, { "epoch": 2.875116495806151, "grad_norm": 1.0362529281635995, "learning_rate": 9.250949257852952e-07, "loss": 1.2898, "step": 3085 }, { "epoch": 2.876048462255359, "grad_norm": 1.363678676654329, "learning_rate": 9.181912323092855e-07, "loss": 1.6933, "step": 3086 }, { "epoch": 2.8769804287045666, "grad_norm": 1.2969455625973116, "learning_rate": 9.112875388332759e-07, "loss": 1.4369, "step": 3087 }, { "epoch": 2.8779123951537744, "grad_norm": 1.1193232976824865, "learning_rate": 9.043838453572662e-07, "loss": 1.709, "step": 3088 }, { "epoch": 2.878844361602982, "grad_norm": 1.2940335192724381, "learning_rate": 8.974801518812564e-07, "loss": 1.4495, "step": 3089 }, { "epoch": 2.87977632805219, "grad_norm": 1.186580112050351, "learning_rate": 8.905764584052469e-07, "loss": 1.4948, "step": 3090 }, { "epoch": 2.8807082945013978, "grad_norm": 1.098896055723584, "learning_rate": 8.836727649292372e-07, "loss": 1.613, "step": 3091 }, { "epoch": 2.881640260950606, "grad_norm": 1.321959942734422, "learning_rate": 8.767690714532276e-07, "loss": 1.4618, "step": 3092 }, { "epoch": 2.8825722273998133, "grad_norm": 1.111291811697763, "learning_rate": 8.698653779772179e-07, "loss": 1.5155, "step": 3093 }, { "epoch": 2.8835041938490216, "grad_norm": 1.1841162529072455, "learning_rate": 8.629616845012082e-07, "loss": 1.3896, "step": 3094 }, { "epoch": 2.8844361602982294, "grad_norm": 1.1443055206943198, "learning_rate": 8.560579910251986e-07, "loss": 1.6567, "step": 3095 }, { "epoch": 2.885368126747437, "grad_norm": 1.3061602783841795, "learning_rate": 8.491542975491888e-07, "loss": 1.4434, "step": 3096 }, { "epoch": 2.886300093196645, "grad_norm": 1.4106621104380945, "learning_rate": 8.422506040731792e-07, "loss": 1.762, "step": 3097 }, { "epoch": 2.8872320596458527, "grad_norm": 1.1066722901885073, "learning_rate": 8.353469105971695e-07, "loss": 1.5628, "step": 3098 }, { "epoch": 2.8881640260950605, "grad_norm": 1.1775792578113777, "learning_rate": 8.284432171211599e-07, "loss": 1.5239, "step": 3099 }, { "epoch": 2.8890959925442683, "grad_norm": 1.2214570652271883, "learning_rate": 8.215395236451503e-07, "loss": 1.5568, "step": 3100 }, { "epoch": 2.890027958993476, "grad_norm": 1.303323951427333, "learning_rate": 8.146358301691405e-07, "loss": 1.346, "step": 3101 }, { "epoch": 2.890959925442684, "grad_norm": 1.5696093098805723, "learning_rate": 8.07732136693131e-07, "loss": 1.7019, "step": 3102 }, { "epoch": 2.891891891891892, "grad_norm": 1.2266433630233, "learning_rate": 8.008284432171212e-07, "loss": 1.4549, "step": 3103 }, { "epoch": 2.8928238583410995, "grad_norm": 1.3905341029539244, "learning_rate": 7.939247497411115e-07, "loss": 1.6549, "step": 3104 }, { "epoch": 2.8937558247903077, "grad_norm": 0.9072410566756401, "learning_rate": 7.870210562651019e-07, "loss": 1.4254, "step": 3105 }, { "epoch": 2.8946877912395155, "grad_norm": 1.1743376555182705, "learning_rate": 7.801173627890922e-07, "loss": 1.5991, "step": 3106 }, { "epoch": 2.8956197576887233, "grad_norm": 0.9608680813478613, "learning_rate": 7.732136693130826e-07, "loss": 1.3766, "step": 3107 }, { "epoch": 2.896551724137931, "grad_norm": 1.402178287672745, "learning_rate": 7.663099758370728e-07, "loss": 1.9311, "step": 3108 }, { "epoch": 2.897483690587139, "grad_norm": 1.276408993608351, "learning_rate": 7.594062823610632e-07, "loss": 1.6436, "step": 3109 }, { "epoch": 2.8984156570363466, "grad_norm": 1.123412495730991, "learning_rate": 7.525025888850536e-07, "loss": 1.5674, "step": 3110 }, { "epoch": 2.8993476234855544, "grad_norm": 1.2630219523459656, "learning_rate": 7.455988954090439e-07, "loss": 1.4041, "step": 3111 }, { "epoch": 2.900279589934762, "grad_norm": 1.0999540193358024, "learning_rate": 7.386952019330343e-07, "loss": 1.2111, "step": 3112 }, { "epoch": 2.90121155638397, "grad_norm": 1.270356017383426, "learning_rate": 7.317915084570246e-07, "loss": 1.6976, "step": 3113 }, { "epoch": 2.9021435228331782, "grad_norm": 1.22630864210884, "learning_rate": 7.248878149810148e-07, "loss": 1.4189, "step": 3114 }, { "epoch": 2.9030754892823856, "grad_norm": 1.296930130673201, "learning_rate": 7.179841215050052e-07, "loss": 1.6623, "step": 3115 }, { "epoch": 2.904007455731594, "grad_norm": 1.0966903675611113, "learning_rate": 7.110804280289955e-07, "loss": 1.7152, "step": 3116 }, { "epoch": 2.9049394221808016, "grad_norm": 1.7782203683267634, "learning_rate": 7.041767345529859e-07, "loss": 1.546, "step": 3117 }, { "epoch": 2.9058713886300094, "grad_norm": 1.1434921706580763, "learning_rate": 6.972730410769763e-07, "loss": 1.6646, "step": 3118 }, { "epoch": 2.906803355079217, "grad_norm": 1.156124695060269, "learning_rate": 6.903693476009666e-07, "loss": 1.5657, "step": 3119 }, { "epoch": 2.907735321528425, "grad_norm": 1.4866425244675394, "learning_rate": 6.83465654124957e-07, "loss": 1.6871, "step": 3120 }, { "epoch": 2.9086672879776327, "grad_norm": 1.3288980683674338, "learning_rate": 6.765619606489472e-07, "loss": 1.6856, "step": 3121 }, { "epoch": 2.9095992544268405, "grad_norm": 1.2449103568423943, "learning_rate": 6.696582671729376e-07, "loss": 1.4978, "step": 3122 }, { "epoch": 2.9105312208760483, "grad_norm": 1.2751904299330974, "learning_rate": 6.627545736969279e-07, "loss": 1.6246, "step": 3123 }, { "epoch": 2.911463187325256, "grad_norm": 1.270303939818626, "learning_rate": 6.558508802209182e-07, "loss": 1.5463, "step": 3124 }, { "epoch": 2.9123951537744643, "grad_norm": 1.39547525827444, "learning_rate": 6.489471867449086e-07, "loss": 1.5206, "step": 3125 }, { "epoch": 2.9133271202236717, "grad_norm": 1.124150791228938, "learning_rate": 6.420434932688989e-07, "loss": 1.3244, "step": 3126 }, { "epoch": 2.91425908667288, "grad_norm": 1.2323357414426046, "learning_rate": 6.351397997928894e-07, "loss": 1.7258, "step": 3127 }, { "epoch": 2.9151910531220877, "grad_norm": 1.3539319814437896, "learning_rate": 6.282361063168796e-07, "loss": 1.5814, "step": 3128 }, { "epoch": 2.9161230195712955, "grad_norm": 1.4809633019920294, "learning_rate": 6.2133241284087e-07, "loss": 1.4695, "step": 3129 }, { "epoch": 2.9170549860205033, "grad_norm": 1.2930367766124453, "learning_rate": 6.144287193648602e-07, "loss": 1.7461, "step": 3130 }, { "epoch": 2.917986952469711, "grad_norm": 1.3586980249002658, "learning_rate": 6.075250258888506e-07, "loss": 1.4435, "step": 3131 }, { "epoch": 2.918918918918919, "grad_norm": 1.19184451409291, "learning_rate": 6.006213324128409e-07, "loss": 1.6299, "step": 3132 }, { "epoch": 2.9198508853681266, "grad_norm": 1.2295617606335296, "learning_rate": 5.937176389368313e-07, "loss": 1.7809, "step": 3133 }, { "epoch": 2.9207828518173344, "grad_norm": 1.3275826446191645, "learning_rate": 5.868139454608216e-07, "loss": 1.5154, "step": 3134 }, { "epoch": 2.9217148182665422, "grad_norm": 1.250790843750568, "learning_rate": 5.799102519848119e-07, "loss": 1.5736, "step": 3135 }, { "epoch": 2.9226467847157505, "grad_norm": 1.0883746956182798, "learning_rate": 5.730065585088022e-07, "loss": 1.2012, "step": 3136 }, { "epoch": 2.923578751164958, "grad_norm": 1.1049823320520311, "learning_rate": 5.661028650327925e-07, "loss": 1.2572, "step": 3137 }, { "epoch": 2.924510717614166, "grad_norm": 1.2171387950425245, "learning_rate": 5.59199171556783e-07, "loss": 1.5266, "step": 3138 }, { "epoch": 2.925442684063374, "grad_norm": 1.05475215005269, "learning_rate": 5.522954780807733e-07, "loss": 1.4974, "step": 3139 }, { "epoch": 2.9263746505125816, "grad_norm": 1.128441121892153, "learning_rate": 5.453917846047636e-07, "loss": 1.34, "step": 3140 }, { "epoch": 2.9273066169617894, "grad_norm": 1.2294195131773176, "learning_rate": 5.384880911287539e-07, "loss": 1.7653, "step": 3141 }, { "epoch": 2.928238583410997, "grad_norm": 1.3518335575844753, "learning_rate": 5.315843976527443e-07, "loss": 1.8217, "step": 3142 }, { "epoch": 2.929170549860205, "grad_norm": 1.0641324193690394, "learning_rate": 5.246807041767346e-07, "loss": 1.4185, "step": 3143 }, { "epoch": 2.9301025163094128, "grad_norm": 1.3303154227823966, "learning_rate": 5.177770107007249e-07, "loss": 1.736, "step": 3144 }, { "epoch": 2.9310344827586206, "grad_norm": 1.4301963527485055, "learning_rate": 5.108733172247152e-07, "loss": 1.8173, "step": 3145 }, { "epoch": 2.9319664492078283, "grad_norm": 1.2376127921833548, "learning_rate": 5.039696237487055e-07, "loss": 1.8056, "step": 3146 }, { "epoch": 2.9328984156570366, "grad_norm": 1.1882465181001467, "learning_rate": 4.97065930272696e-07, "loss": 1.4582, "step": 3147 }, { "epoch": 2.933830382106244, "grad_norm": 1.076925924647531, "learning_rate": 4.901622367966863e-07, "loss": 1.8384, "step": 3148 }, { "epoch": 2.934762348555452, "grad_norm": 1.0707352266045158, "learning_rate": 4.832585433206767e-07, "loss": 1.6928, "step": 3149 }, { "epoch": 2.93569431500466, "grad_norm": 1.4331309236434564, "learning_rate": 4.7635484984466693e-07, "loss": 1.6317, "step": 3150 }, { "epoch": 2.9366262814538677, "grad_norm": 1.1311212214934763, "learning_rate": 4.6945115636865724e-07, "loss": 1.5932, "step": 3151 }, { "epoch": 2.9375582479030755, "grad_norm": 1.3244627482282911, "learning_rate": 4.625474628926476e-07, "loss": 1.3918, "step": 3152 }, { "epoch": 2.9384902143522833, "grad_norm": 1.6183581845439698, "learning_rate": 4.5564376941663797e-07, "loss": 1.7938, "step": 3153 }, { "epoch": 2.939422180801491, "grad_norm": 1.2121722163315558, "learning_rate": 4.487400759406282e-07, "loss": 1.3561, "step": 3154 }, { "epoch": 2.940354147250699, "grad_norm": 1.1981774450398865, "learning_rate": 4.418363824646186e-07, "loss": 1.8647, "step": 3155 }, { "epoch": 2.9412861136999067, "grad_norm": 1.3436486462653758, "learning_rate": 4.3493268898860895e-07, "loss": 1.4267, "step": 3156 }, { "epoch": 2.9422180801491145, "grad_norm": 1.6170335010281438, "learning_rate": 4.280289955125993e-07, "loss": 1.9532, "step": 3157 }, { "epoch": 2.9431500465983227, "grad_norm": 1.1549793616256423, "learning_rate": 4.211253020365896e-07, "loss": 1.5022, "step": 3158 }, { "epoch": 2.94408201304753, "grad_norm": 1.1676450460264085, "learning_rate": 4.142216085605799e-07, "loss": 1.7674, "step": 3159 }, { "epoch": 2.9450139794967383, "grad_norm": 1.4976740768520194, "learning_rate": 4.0731791508457024e-07, "loss": 1.4137, "step": 3160 }, { "epoch": 2.945945945945946, "grad_norm": 1.3721325255574996, "learning_rate": 4.004142216085606e-07, "loss": 1.6484, "step": 3161 }, { "epoch": 2.946877912395154, "grad_norm": 1.3310556905226518, "learning_rate": 3.9351052813255096e-07, "loss": 1.5015, "step": 3162 }, { "epoch": 2.9478098788443616, "grad_norm": 1.5567446157278473, "learning_rate": 3.866068346565413e-07, "loss": 1.585, "step": 3163 }, { "epoch": 2.9487418452935694, "grad_norm": 1.652426803608317, "learning_rate": 3.797031411805316e-07, "loss": 1.7801, "step": 3164 }, { "epoch": 2.949673811742777, "grad_norm": 1.5337670009306243, "learning_rate": 3.7279944770452194e-07, "loss": 1.569, "step": 3165 }, { "epoch": 2.950605778191985, "grad_norm": 1.239247079596767, "learning_rate": 3.658957542285123e-07, "loss": 1.5114, "step": 3166 }, { "epoch": 2.951537744641193, "grad_norm": 1.1828409756735856, "learning_rate": 3.589920607525026e-07, "loss": 1.56, "step": 3167 }, { "epoch": 2.9524697110904006, "grad_norm": 1.2004777292719062, "learning_rate": 3.5208836727649297e-07, "loss": 1.7124, "step": 3168 }, { "epoch": 2.953401677539609, "grad_norm": 1.209066711815018, "learning_rate": 3.451846738004833e-07, "loss": 1.7726, "step": 3169 }, { "epoch": 2.954333643988816, "grad_norm": 1.2118268764714075, "learning_rate": 3.382809803244736e-07, "loss": 1.74, "step": 3170 }, { "epoch": 2.9552656104380244, "grad_norm": 1.117537220924492, "learning_rate": 3.3137728684846395e-07, "loss": 1.5236, "step": 3171 }, { "epoch": 2.956197576887232, "grad_norm": 1.1161590014733713, "learning_rate": 3.244735933724543e-07, "loss": 1.581, "step": 3172 }, { "epoch": 2.95712954333644, "grad_norm": 1.0813250746311518, "learning_rate": 3.175698998964447e-07, "loss": 1.7426, "step": 3173 }, { "epoch": 2.9580615097856477, "grad_norm": 1.3161139123224188, "learning_rate": 3.10666206420435e-07, "loss": 1.9653, "step": 3174 }, { "epoch": 2.9589934762348555, "grad_norm": 1.0368056135057315, "learning_rate": 3.037625129444253e-07, "loss": 1.3656, "step": 3175 }, { "epoch": 2.9599254426840633, "grad_norm": 1.0605268583363212, "learning_rate": 2.9685881946841566e-07, "loss": 1.3453, "step": 3176 }, { "epoch": 2.960857409133271, "grad_norm": 1.149362113483493, "learning_rate": 2.8995512599240596e-07, "loss": 1.5529, "step": 3177 }, { "epoch": 2.961789375582479, "grad_norm": 1.4659469522837247, "learning_rate": 2.8305143251639627e-07, "loss": 1.638, "step": 3178 }, { "epoch": 2.9627213420316867, "grad_norm": 1.5051977203504923, "learning_rate": 2.7614773904038664e-07, "loss": 1.7464, "step": 3179 }, { "epoch": 2.963653308480895, "grad_norm": 1.1602373034353302, "learning_rate": 2.6924404556437694e-07, "loss": 1.5732, "step": 3180 }, { "epoch": 2.9645852749301023, "grad_norm": 1.7618488680967979, "learning_rate": 2.623403520883673e-07, "loss": 1.867, "step": 3181 }, { "epoch": 2.9655172413793105, "grad_norm": 1.1116946043798572, "learning_rate": 2.554366586123576e-07, "loss": 1.62, "step": 3182 }, { "epoch": 2.9664492078285183, "grad_norm": 1.2884034858415911, "learning_rate": 2.48532965136348e-07, "loss": 1.4903, "step": 3183 }, { "epoch": 2.967381174277726, "grad_norm": 1.2556884257761751, "learning_rate": 2.4162927166033834e-07, "loss": 1.8042, "step": 3184 }, { "epoch": 2.968313140726934, "grad_norm": 1.3483683865615157, "learning_rate": 2.3472557818432862e-07, "loss": 1.4815, "step": 3185 }, { "epoch": 2.9692451071761417, "grad_norm": 1.5147312630498522, "learning_rate": 2.2782188470831898e-07, "loss": 1.7343, "step": 3186 }, { "epoch": 2.9701770736253494, "grad_norm": 1.5094953114824075, "learning_rate": 2.209181912323093e-07, "loss": 1.5979, "step": 3187 }, { "epoch": 2.9711090400745572, "grad_norm": 1.280132651440011, "learning_rate": 2.1401449775629965e-07, "loss": 1.6705, "step": 3188 }, { "epoch": 2.972041006523765, "grad_norm": 1.3682051737265528, "learning_rate": 2.0711080428028996e-07, "loss": 1.4633, "step": 3189 }, { "epoch": 2.972972972972973, "grad_norm": 1.17751362037371, "learning_rate": 2.002071108042803e-07, "loss": 1.3865, "step": 3190 }, { "epoch": 2.973904939422181, "grad_norm": 1.6668180421045413, "learning_rate": 1.9330341732827066e-07, "loss": 1.6726, "step": 3191 }, { "epoch": 2.9748369058713884, "grad_norm": 1.3210509014548562, "learning_rate": 1.8639972385226097e-07, "loss": 1.7054, "step": 3192 }, { "epoch": 2.9757688723205966, "grad_norm": 1.1260943585445293, "learning_rate": 1.794960303762513e-07, "loss": 1.552, "step": 3193 }, { "epoch": 2.9767008387698044, "grad_norm": 1.1873252469606035, "learning_rate": 1.7259233690024164e-07, "loss": 1.6809, "step": 3194 }, { "epoch": 2.977632805219012, "grad_norm": 1.2887349262714152, "learning_rate": 1.6568864342423198e-07, "loss": 1.4453, "step": 3195 }, { "epoch": 2.97856477166822, "grad_norm": 1.2729549076176496, "learning_rate": 1.5878494994822234e-07, "loss": 1.6854, "step": 3196 }, { "epoch": 2.9794967381174278, "grad_norm": 1.1653667495883187, "learning_rate": 1.5188125647221265e-07, "loss": 1.6106, "step": 3197 }, { "epoch": 2.9804287045666356, "grad_norm": 1.4666536973157192, "learning_rate": 1.4497756299620298e-07, "loss": 1.7098, "step": 3198 }, { "epoch": 2.9813606710158433, "grad_norm": 1.2533777042407834, "learning_rate": 1.3807386952019332e-07, "loss": 1.5154, "step": 3199 }, { "epoch": 2.982292637465051, "grad_norm": 1.3144978327865302, "learning_rate": 1.3117017604418365e-07, "loss": 1.7547, "step": 3200 }, { "epoch": 2.983224603914259, "grad_norm": 1.2059099411963659, "learning_rate": 1.24266482568174e-07, "loss": 1.6268, "step": 3201 }, { "epoch": 2.984156570363467, "grad_norm": 1.3709983892542337, "learning_rate": 1.1736278909216431e-07, "loss": 1.6563, "step": 3202 }, { "epoch": 2.9850885368126745, "grad_norm": 1.2170203314272832, "learning_rate": 1.1045909561615465e-07, "loss": 1.7583, "step": 3203 }, { "epoch": 2.9860205032618827, "grad_norm": 1.1040993271710535, "learning_rate": 1.0355540214014498e-07, "loss": 1.4563, "step": 3204 }, { "epoch": 2.9869524697110905, "grad_norm": 1.410092603650611, "learning_rate": 9.665170866413533e-08, "loss": 1.808, "step": 3205 }, { "epoch": 2.9878844361602983, "grad_norm": 1.1682696134255228, "learning_rate": 8.974801518812565e-08, "loss": 1.2697, "step": 3206 }, { "epoch": 2.988816402609506, "grad_norm": 1.0640424422927857, "learning_rate": 8.284432171211599e-08, "loss": 1.4842, "step": 3207 }, { "epoch": 2.989748369058714, "grad_norm": 1.4230193189845222, "learning_rate": 7.594062823610632e-08, "loss": 1.7719, "step": 3208 }, { "epoch": 2.9906803355079217, "grad_norm": 1.4609066834891316, "learning_rate": 6.903693476009666e-08, "loss": 1.6712, "step": 3209 }, { "epoch": 2.9916123019571295, "grad_norm": 1.0249876686782056, "learning_rate": 6.2133241284087e-08, "loss": 1.5605, "step": 3210 }, { "epoch": 2.9925442684063372, "grad_norm": 1.354026625990758, "learning_rate": 5.522954780807732e-08, "loss": 1.9358, "step": 3211 }, { "epoch": 2.993476234855545, "grad_norm": 1.5203499320432863, "learning_rate": 4.8325854332067665e-08, "loss": 1.4898, "step": 3212 }, { "epoch": 2.9944082013047533, "grad_norm": 1.3226198874299009, "learning_rate": 4.1422160856057994e-08, "loss": 1.7614, "step": 3213 }, { "epoch": 2.9953401677539606, "grad_norm": 1.1626883815316542, "learning_rate": 3.451846738004833e-08, "loss": 1.7051, "step": 3214 }, { "epoch": 2.996272134203169, "grad_norm": 1.1026395069471828, "learning_rate": 2.761477390403866e-08, "loss": 1.3621, "step": 3215 }, { "epoch": 2.9972041006523766, "grad_norm": 1.3879088475079218, "learning_rate": 2.0711080428028997e-08, "loss": 1.3684, "step": 3216 }, { "epoch": 2.9981360671015844, "grad_norm": 1.2362201926614713, "learning_rate": 1.380738695201933e-08, "loss": 1.518, "step": 3217 }, { "epoch": 2.999068033550792, "grad_norm": 1.2267821358763562, "learning_rate": 6.903693476009665e-09, "loss": 1.5445, "step": 3218 }, { "epoch": 3.0, "grad_norm": 1.0050115762016163, "learning_rate": 0.0, "loss": 1.1672, "step": 3219 }, { "epoch": 3.0, "step": 3219, "total_flos": 0.0, "train_loss": 2.010534495143055, "train_runtime": 85470.1229, "train_samples_per_second": 0.603, "train_steps_per_second": 0.038 } ], "logging_steps": 1, "max_steps": 3219, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }