{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998873493297286, "eval_steps": 500, "global_step": 2219, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004506026810859525, "grad_norm": 4.815864771779815, "learning_rate": 4.504504504504505e-08, "loss": 0.366, "step": 1 }, { "epoch": 0.000901205362171905, "grad_norm": 5.195074053996181, "learning_rate": 9.00900900900901e-08, "loss": 0.3833, "step": 2 }, { "epoch": 0.0013518080432578573, "grad_norm": 5.109775812530038, "learning_rate": 1.3513513513513515e-07, "loss": 0.3872, "step": 3 }, { "epoch": 0.00180241072434381, "grad_norm": 4.912948106635599, "learning_rate": 1.801801801801802e-07, "loss": 0.374, "step": 4 }, { "epoch": 0.002253013405429762, "grad_norm": 4.96137274114766, "learning_rate": 2.2522522522522524e-07, "loss": 0.3723, "step": 5 }, { "epoch": 0.0027036160865157146, "grad_norm": 4.898137124077649, "learning_rate": 2.702702702702703e-07, "loss": 0.3896, "step": 6 }, { "epoch": 0.0031542187676016674, "grad_norm": 4.794314493950962, "learning_rate": 3.153153153153153e-07, "loss": 0.4154, "step": 7 }, { "epoch": 0.00360482144868762, "grad_norm": 5.105069245772989, "learning_rate": 3.603603603603604e-07, "loss": 0.3943, "step": 8 }, { "epoch": 0.004055424129773572, "grad_norm": 4.437918536689456, "learning_rate": 4.0540540540540546e-07, "loss": 0.3531, "step": 9 }, { "epoch": 0.004506026810859524, "grad_norm": 4.369828896152025, "learning_rate": 4.504504504504505e-07, "loss": 0.3822, "step": 10 }, { "epoch": 0.004956629491945477, "grad_norm": 4.647490729965825, "learning_rate": 4.954954954954956e-07, "loss": 0.3663, "step": 11 }, { "epoch": 0.005407232173031429, "grad_norm": 4.816738574275561, "learning_rate": 5.405405405405406e-07, "loss": 0.3648, "step": 12 }, { "epoch": 0.005857834854117382, "grad_norm": 3.969135638580672, "learning_rate": 5.855855855855856e-07, "loss": 0.3598, "step": 13 }, { "epoch": 0.006308437535203335, "grad_norm": 3.6447362042128044, "learning_rate": 6.306306306306306e-07, "loss": 0.3544, "step": 14 }, { "epoch": 0.006759040216289287, "grad_norm": 2.8706345170518266, "learning_rate": 6.756756756756758e-07, "loss": 0.3397, "step": 15 }, { "epoch": 0.00720964289737524, "grad_norm": 2.778442530150813, "learning_rate": 7.207207207207208e-07, "loss": 0.3503, "step": 16 }, { "epoch": 0.007660245578461192, "grad_norm": 3.0191413167735655, "learning_rate": 7.657657657657658e-07, "loss": 0.3786, "step": 17 }, { "epoch": 0.008110848259547145, "grad_norm": 2.6863762331965004, "learning_rate": 8.108108108108109e-07, "loss": 0.3487, "step": 18 }, { "epoch": 0.008561450940633097, "grad_norm": 2.500154127846048, "learning_rate": 8.55855855855856e-07, "loss": 0.3555, "step": 19 }, { "epoch": 0.009012053621719049, "grad_norm": 2.3648275264818084, "learning_rate": 9.00900900900901e-07, "loss": 0.3476, "step": 20 }, { "epoch": 0.009462656302805001, "grad_norm": 2.6069567739452872, "learning_rate": 9.459459459459461e-07, "loss": 0.3431, "step": 21 }, { "epoch": 0.009913258983890954, "grad_norm": 2.6546117604315937, "learning_rate": 9.909909909909911e-07, "loss": 0.3011, "step": 22 }, { "epoch": 0.010363861664976907, "grad_norm": 2.429078931197282, "learning_rate": 1.0360360360360361e-06, "loss": 0.3117, "step": 23 }, { "epoch": 0.010814464346062858, "grad_norm": 1.9589677923898288, "learning_rate": 1.0810810810810812e-06, "loss": 0.327, "step": 24 }, { "epoch": 0.011265067027148811, "grad_norm": 1.8522547214325198, "learning_rate": 1.1261261261261262e-06, "loss": 0.3097, "step": 25 }, { "epoch": 0.011715669708234764, "grad_norm": 1.5812319133542672, "learning_rate": 1.1711711711711712e-06, "loss": 0.2835, "step": 26 }, { "epoch": 0.012166272389320717, "grad_norm": 1.478551103830084, "learning_rate": 1.2162162162162164e-06, "loss": 0.3027, "step": 27 }, { "epoch": 0.01261687507040667, "grad_norm": 1.6047585590317004, "learning_rate": 1.2612612612612613e-06, "loss": 0.2851, "step": 28 }, { "epoch": 0.01306747775149262, "grad_norm": 1.8154984583588165, "learning_rate": 1.3063063063063065e-06, "loss": 0.2757, "step": 29 }, { "epoch": 0.013518080432578574, "grad_norm": 2.027167277213967, "learning_rate": 1.3513513513513515e-06, "loss": 0.3006, "step": 30 }, { "epoch": 0.013968683113664527, "grad_norm": 1.840440668341431, "learning_rate": 1.3963963963963963e-06, "loss": 0.2894, "step": 31 }, { "epoch": 0.01441928579475048, "grad_norm": 1.8547802317028712, "learning_rate": 1.4414414414414416e-06, "loss": 0.3104, "step": 32 }, { "epoch": 0.01486988847583643, "grad_norm": 1.6017226824079185, "learning_rate": 1.4864864864864868e-06, "loss": 0.2825, "step": 33 }, { "epoch": 0.015320491156922383, "grad_norm": 1.618932965986627, "learning_rate": 1.5315315315315316e-06, "loss": 0.2897, "step": 34 }, { "epoch": 0.015771093838008336, "grad_norm": 1.725889184071538, "learning_rate": 1.5765765765765766e-06, "loss": 0.2492, "step": 35 }, { "epoch": 0.01622169651909429, "grad_norm": 1.7545616100435062, "learning_rate": 1.6216216216216219e-06, "loss": 0.2885, "step": 36 }, { "epoch": 0.016672299200180242, "grad_norm": 1.443938382974907, "learning_rate": 1.6666666666666667e-06, "loss": 0.2847, "step": 37 }, { "epoch": 0.017122901881266195, "grad_norm": 1.3439469209062533, "learning_rate": 1.711711711711712e-06, "loss": 0.273, "step": 38 }, { "epoch": 0.017573504562352148, "grad_norm": 1.4919993368852724, "learning_rate": 1.756756756756757e-06, "loss": 0.2724, "step": 39 }, { "epoch": 0.018024107243438097, "grad_norm": 1.646749223197058, "learning_rate": 1.801801801801802e-06, "loss": 0.2597, "step": 40 }, { "epoch": 0.01847470992452405, "grad_norm": 1.280899828503881, "learning_rate": 1.846846846846847e-06, "loss": 0.2483, "step": 41 }, { "epoch": 0.018925312605610003, "grad_norm": 1.297992220461041, "learning_rate": 1.8918918918918922e-06, "loss": 0.2659, "step": 42 }, { "epoch": 0.019375915286695956, "grad_norm": 1.45364906642596, "learning_rate": 1.9369369369369372e-06, "loss": 0.2777, "step": 43 }, { "epoch": 0.01982651796778191, "grad_norm": 1.7350773523889338, "learning_rate": 1.9819819819819822e-06, "loss": 0.2699, "step": 44 }, { "epoch": 0.02027712064886786, "grad_norm": 1.3800904283438646, "learning_rate": 2.0270270270270273e-06, "loss": 0.2543, "step": 45 }, { "epoch": 0.020727723329953814, "grad_norm": 1.4181131950530033, "learning_rate": 2.0720720720720723e-06, "loss": 0.2576, "step": 46 }, { "epoch": 0.021178326011039767, "grad_norm": 1.535738819405899, "learning_rate": 2.1171171171171173e-06, "loss": 0.2432, "step": 47 }, { "epoch": 0.021628928692125717, "grad_norm": 1.2566148211560952, "learning_rate": 2.1621621621621623e-06, "loss": 0.2581, "step": 48 }, { "epoch": 0.02207953137321167, "grad_norm": 1.3215876440850718, "learning_rate": 2.2072072072072073e-06, "loss": 0.2544, "step": 49 }, { "epoch": 0.022530134054297622, "grad_norm": 1.3222957733533551, "learning_rate": 2.2522522522522524e-06, "loss": 0.2383, "step": 50 }, { "epoch": 0.022980736735383575, "grad_norm": 1.3558975651467386, "learning_rate": 2.297297297297298e-06, "loss": 0.2623, "step": 51 }, { "epoch": 0.023431339416469528, "grad_norm": 1.180665933720355, "learning_rate": 2.3423423423423424e-06, "loss": 0.2358, "step": 52 }, { "epoch": 0.02388194209755548, "grad_norm": 1.3377467432333017, "learning_rate": 2.3873873873873874e-06, "loss": 0.2674, "step": 53 }, { "epoch": 0.024332544778641434, "grad_norm": 1.429078367789538, "learning_rate": 2.432432432432433e-06, "loss": 0.2539, "step": 54 }, { "epoch": 0.024783147459727387, "grad_norm": 1.29497107657176, "learning_rate": 2.4774774774774775e-06, "loss": 0.2431, "step": 55 }, { "epoch": 0.02523375014081334, "grad_norm": 1.4338057761852927, "learning_rate": 2.5225225225225225e-06, "loss": 0.2477, "step": 56 }, { "epoch": 0.02568435282189929, "grad_norm": 1.4626114161947357, "learning_rate": 2.5675675675675675e-06, "loss": 0.2573, "step": 57 }, { "epoch": 0.02613495550298524, "grad_norm": 1.3908318133820947, "learning_rate": 2.612612612612613e-06, "loss": 0.2592, "step": 58 }, { "epoch": 0.026585558184071195, "grad_norm": 1.2880400988089797, "learning_rate": 2.657657657657658e-06, "loss": 0.2446, "step": 59 }, { "epoch": 0.027036160865157147, "grad_norm": 1.3643355524484657, "learning_rate": 2.702702702702703e-06, "loss": 0.2463, "step": 60 }, { "epoch": 0.0274867635462431, "grad_norm": 1.3930601643893419, "learning_rate": 2.747747747747748e-06, "loss": 0.247, "step": 61 }, { "epoch": 0.027937366227329053, "grad_norm": 1.3674157761140657, "learning_rate": 2.7927927927927926e-06, "loss": 0.239, "step": 62 }, { "epoch": 0.028387968908415006, "grad_norm": 1.2947244011903152, "learning_rate": 2.837837837837838e-06, "loss": 0.245, "step": 63 }, { "epoch": 0.02883857158950096, "grad_norm": 1.3182407851781994, "learning_rate": 2.882882882882883e-06, "loss": 0.2519, "step": 64 }, { "epoch": 0.02928917427058691, "grad_norm": 1.2817823833464046, "learning_rate": 2.927927927927928e-06, "loss": 0.254, "step": 65 }, { "epoch": 0.02973977695167286, "grad_norm": 1.3451925823541346, "learning_rate": 2.9729729729729736e-06, "loss": 0.2402, "step": 66 }, { "epoch": 0.030190379632758814, "grad_norm": 1.2943481824061323, "learning_rate": 3.0180180180180186e-06, "loss": 0.2359, "step": 67 }, { "epoch": 0.030640982313844767, "grad_norm": 1.3423425334342036, "learning_rate": 3.063063063063063e-06, "loss": 0.248, "step": 68 }, { "epoch": 0.03109158499493072, "grad_norm": 1.2049652231573809, "learning_rate": 3.1081081081081082e-06, "loss": 0.2301, "step": 69 }, { "epoch": 0.03154218767601667, "grad_norm": 1.3344500076487555, "learning_rate": 3.1531531531531532e-06, "loss": 0.2342, "step": 70 }, { "epoch": 0.031992790357102625, "grad_norm": 1.1874737435413407, "learning_rate": 3.1981981981981987e-06, "loss": 0.2209, "step": 71 }, { "epoch": 0.03244339303818858, "grad_norm": 1.3779776082559232, "learning_rate": 3.2432432432432437e-06, "loss": 0.2361, "step": 72 }, { "epoch": 0.03289399571927453, "grad_norm": 1.2969763953943954, "learning_rate": 3.2882882882882887e-06, "loss": 0.253, "step": 73 }, { "epoch": 0.033344598400360484, "grad_norm": 1.3469669191031444, "learning_rate": 3.3333333333333333e-06, "loss": 0.2411, "step": 74 }, { "epoch": 0.03379520108144644, "grad_norm": 1.3704530488229245, "learning_rate": 3.3783783783783788e-06, "loss": 0.234, "step": 75 }, { "epoch": 0.03424580376253239, "grad_norm": 1.223619143070724, "learning_rate": 3.423423423423424e-06, "loss": 0.2437, "step": 76 }, { "epoch": 0.03469640644361834, "grad_norm": 1.399539495833997, "learning_rate": 3.468468468468469e-06, "loss": 0.2564, "step": 77 }, { "epoch": 0.035147009124704295, "grad_norm": 1.2277545572758175, "learning_rate": 3.513513513513514e-06, "loss": 0.2293, "step": 78 }, { "epoch": 0.03559761180579024, "grad_norm": 1.3602232090766564, "learning_rate": 3.5585585585585584e-06, "loss": 0.2521, "step": 79 }, { "epoch": 0.036048214486876194, "grad_norm": 1.355790485541872, "learning_rate": 3.603603603603604e-06, "loss": 0.2429, "step": 80 }, { "epoch": 0.03649881716796215, "grad_norm": 1.1987258690046945, "learning_rate": 3.648648648648649e-06, "loss": 0.245, "step": 81 }, { "epoch": 0.0369494198490481, "grad_norm": 1.6169310703051463, "learning_rate": 3.693693693693694e-06, "loss": 0.2494, "step": 82 }, { "epoch": 0.03740002253013405, "grad_norm": 1.5187379828279406, "learning_rate": 3.7387387387387394e-06, "loss": 0.2346, "step": 83 }, { "epoch": 0.037850625211220006, "grad_norm": 1.3308952951835935, "learning_rate": 3.7837837837837844e-06, "loss": 0.2396, "step": 84 }, { "epoch": 0.03830122789230596, "grad_norm": 1.5888441541358347, "learning_rate": 3.828828828828829e-06, "loss": 0.249, "step": 85 }, { "epoch": 0.03875183057339191, "grad_norm": 1.2796516821327377, "learning_rate": 3.8738738738738744e-06, "loss": 0.2381, "step": 86 }, { "epoch": 0.039202433254477864, "grad_norm": 1.291791666163532, "learning_rate": 3.918918918918919e-06, "loss": 0.2422, "step": 87 }, { "epoch": 0.03965303593556382, "grad_norm": 1.381850806769349, "learning_rate": 3.9639639639639645e-06, "loss": 0.2558, "step": 88 }, { "epoch": 0.04010363861664977, "grad_norm": 1.3086943706864933, "learning_rate": 4.009009009009009e-06, "loss": 0.2497, "step": 89 }, { "epoch": 0.04055424129773572, "grad_norm": 1.2999519245744664, "learning_rate": 4.0540540540540545e-06, "loss": 0.2241, "step": 90 }, { "epoch": 0.041004843978821676, "grad_norm": 1.183224660888697, "learning_rate": 4.099099099099099e-06, "loss": 0.2327, "step": 91 }, { "epoch": 0.04145544665990763, "grad_norm": 1.2998101785174705, "learning_rate": 4.1441441441441446e-06, "loss": 0.2282, "step": 92 }, { "epoch": 0.04190604934099358, "grad_norm": 1.2410993409820663, "learning_rate": 4.189189189189189e-06, "loss": 0.2478, "step": 93 }, { "epoch": 0.042356652022079534, "grad_norm": 1.2645254985527676, "learning_rate": 4.234234234234235e-06, "loss": 0.2479, "step": 94 }, { "epoch": 0.04280725470316549, "grad_norm": 1.2459528417226233, "learning_rate": 4.27927927927928e-06, "loss": 0.2314, "step": 95 }, { "epoch": 0.04325785738425143, "grad_norm": 1.249112068397591, "learning_rate": 4.324324324324325e-06, "loss": 0.2255, "step": 96 }, { "epoch": 0.043708460065337386, "grad_norm": 1.2728802945422262, "learning_rate": 4.369369369369369e-06, "loss": 0.2387, "step": 97 }, { "epoch": 0.04415906274642334, "grad_norm": 1.4177091138057598, "learning_rate": 4.414414414414415e-06, "loss": 0.2218, "step": 98 }, { "epoch": 0.04460966542750929, "grad_norm": 1.296078758285749, "learning_rate": 4.45945945945946e-06, "loss": 0.2293, "step": 99 }, { "epoch": 0.045060268108595244, "grad_norm": 1.46261721387381, "learning_rate": 4.504504504504505e-06, "loss": 0.2279, "step": 100 }, { "epoch": 0.0455108707896812, "grad_norm": 1.3107218066814923, "learning_rate": 4.54954954954955e-06, "loss": 0.2289, "step": 101 }, { "epoch": 0.04596147347076715, "grad_norm": 1.275172277252069, "learning_rate": 4.594594594594596e-06, "loss": 0.2418, "step": 102 }, { "epoch": 0.0464120761518531, "grad_norm": 1.464017582981461, "learning_rate": 4.63963963963964e-06, "loss": 0.2424, "step": 103 }, { "epoch": 0.046862678832939056, "grad_norm": 1.3024312968634821, "learning_rate": 4.684684684684685e-06, "loss": 0.2242, "step": 104 }, { "epoch": 0.04731328151402501, "grad_norm": 1.363653722681725, "learning_rate": 4.72972972972973e-06, "loss": 0.2418, "step": 105 }, { "epoch": 0.04776388419511096, "grad_norm": 1.1701030694099703, "learning_rate": 4.774774774774775e-06, "loss": 0.2311, "step": 106 }, { "epoch": 0.048214486876196914, "grad_norm": 1.3797624934807715, "learning_rate": 4.81981981981982e-06, "loss": 0.2374, "step": 107 }, { "epoch": 0.04866508955728287, "grad_norm": 1.2136932937379246, "learning_rate": 4.864864864864866e-06, "loss": 0.2239, "step": 108 }, { "epoch": 0.04911569223836882, "grad_norm": 1.2629715016785896, "learning_rate": 4.90990990990991e-06, "loss": 0.2251, "step": 109 }, { "epoch": 0.04956629491945477, "grad_norm": 1.4317438257489863, "learning_rate": 4.954954954954955e-06, "loss": 0.2366, "step": 110 }, { "epoch": 0.050016897600540726, "grad_norm": 1.3241056157209627, "learning_rate": 5e-06, "loss": 0.233, "step": 111 }, { "epoch": 0.05046750028162668, "grad_norm": 1.290038853019236, "learning_rate": 4.999997223686756e-06, "loss": 0.2421, "step": 112 }, { "epoch": 0.050918102962712625, "grad_norm": 1.365409438767111, "learning_rate": 4.999988894753189e-06, "loss": 0.2393, "step": 113 }, { "epoch": 0.05136870564379858, "grad_norm": 1.2940027464844308, "learning_rate": 4.999975013217796e-06, "loss": 0.2351, "step": 114 }, { "epoch": 0.05181930832488453, "grad_norm": 1.5242296190272844, "learning_rate": 4.999955579111413e-06, "loss": 0.2224, "step": 115 }, { "epoch": 0.05226991100597048, "grad_norm": 1.2441974279995303, "learning_rate": 4.9999305924772e-06, "loss": 0.2346, "step": 116 }, { "epoch": 0.052720513687056436, "grad_norm": 1.2318049884586677, "learning_rate": 4.999900053370657e-06, "loss": 0.2197, "step": 117 }, { "epoch": 0.05317111636814239, "grad_norm": 1.378129834491324, "learning_rate": 4.99986396185961e-06, "loss": 0.2396, "step": 118 }, { "epoch": 0.05362171904922834, "grad_norm": 1.2120301657823338, "learning_rate": 4.999822318024222e-06, "loss": 0.2265, "step": 119 }, { "epoch": 0.054072321730314295, "grad_norm": 1.332984287268552, "learning_rate": 4.9997751219569844e-06, "loss": 0.2448, "step": 120 }, { "epoch": 0.05452292441140025, "grad_norm": 1.2389274937015193, "learning_rate": 4.999722373762725e-06, "loss": 0.224, "step": 121 }, { "epoch": 0.0549735270924862, "grad_norm": 1.204117312589835, "learning_rate": 4.999664073558596e-06, "loss": 0.2204, "step": 122 }, { "epoch": 0.05542412977357215, "grad_norm": 1.3101533213641092, "learning_rate": 4.999600221474089e-06, "loss": 0.2188, "step": 123 }, { "epoch": 0.055874732454658106, "grad_norm": 1.355729531514012, "learning_rate": 4.99953081765102e-06, "loss": 0.219, "step": 124 }, { "epoch": 0.05632533513574406, "grad_norm": 1.3145496397104341, "learning_rate": 4.999455862243539e-06, "loss": 0.2416, "step": 125 }, { "epoch": 0.05677593781683001, "grad_norm": 1.2549927498664672, "learning_rate": 4.999375355418128e-06, "loss": 0.2419, "step": 126 }, { "epoch": 0.057226540497915965, "grad_norm": 1.3163216207252515, "learning_rate": 4.999289297353593e-06, "loss": 0.2318, "step": 127 }, { "epoch": 0.05767714317900192, "grad_norm": 1.221692049141614, "learning_rate": 4.999197688241076e-06, "loss": 0.2329, "step": 128 }, { "epoch": 0.05812774586008787, "grad_norm": 1.2504114744133756, "learning_rate": 4.999100528284045e-06, "loss": 0.246, "step": 129 }, { "epoch": 0.05857834854117382, "grad_norm": 1.1644125285534819, "learning_rate": 4.998997817698298e-06, "loss": 0.2203, "step": 130 }, { "epoch": 0.05902895122225977, "grad_norm": 1.1744114893394246, "learning_rate": 4.998889556711958e-06, "loss": 0.2328, "step": 131 }, { "epoch": 0.05947955390334572, "grad_norm": 1.2414638105224192, "learning_rate": 4.998775745565479e-06, "loss": 0.2465, "step": 132 }, { "epoch": 0.059930156584431675, "grad_norm": 1.249926424886369, "learning_rate": 4.998656384511643e-06, "loss": 0.2264, "step": 133 }, { "epoch": 0.06038075926551763, "grad_norm": 1.2572203072207218, "learning_rate": 4.9985314738155545e-06, "loss": 0.2345, "step": 134 }, { "epoch": 0.06083136194660358, "grad_norm": 1.1950518724866406, "learning_rate": 4.9984010137546475e-06, "loss": 0.231, "step": 135 }, { "epoch": 0.061281964627689534, "grad_norm": 1.2768234243037746, "learning_rate": 4.998265004618682e-06, "loss": 0.2213, "step": 136 }, { "epoch": 0.061732567308775486, "grad_norm": 1.2224411308170113, "learning_rate": 4.998123446709739e-06, "loss": 0.228, "step": 137 }, { "epoch": 0.06218316998986144, "grad_norm": 1.282263129739265, "learning_rate": 4.997976340342226e-06, "loss": 0.2167, "step": 138 }, { "epoch": 0.06263377267094739, "grad_norm": 1.1557240536583024, "learning_rate": 4.997823685842875e-06, "loss": 0.2076, "step": 139 }, { "epoch": 0.06308437535203335, "grad_norm": 1.284456691992294, "learning_rate": 4.997665483550739e-06, "loss": 0.2416, "step": 140 }, { "epoch": 0.06353497803311929, "grad_norm": 1.1028932126513764, "learning_rate": 4.997501733817191e-06, "loss": 0.2031, "step": 141 }, { "epoch": 0.06398558071420525, "grad_norm": 1.1975966518750552, "learning_rate": 4.997332437005932e-06, "loss": 0.2201, "step": 142 }, { "epoch": 0.0644361833952912, "grad_norm": 1.3778575705019447, "learning_rate": 4.997157593492974e-06, "loss": 0.2289, "step": 143 }, { "epoch": 0.06488678607637716, "grad_norm": 1.212901982108371, "learning_rate": 4.996977203666657e-06, "loss": 0.2263, "step": 144 }, { "epoch": 0.0653373887574631, "grad_norm": 1.2524708726219558, "learning_rate": 4.996791267927632e-06, "loss": 0.2418, "step": 145 }, { "epoch": 0.06578799143854906, "grad_norm": 1.1251179828824371, "learning_rate": 4.996599786688876e-06, "loss": 0.2262, "step": 146 }, { "epoch": 0.06623859411963501, "grad_norm": 1.1644234192309482, "learning_rate": 4.996402760375676e-06, "loss": 0.2223, "step": 147 }, { "epoch": 0.06668919680072097, "grad_norm": 1.2518098382195715, "learning_rate": 4.996200189425638e-06, "loss": 0.2304, "step": 148 }, { "epoch": 0.06713979948180691, "grad_norm": 1.268646829769258, "learning_rate": 4.9959920742886815e-06, "loss": 0.2381, "step": 149 }, { "epoch": 0.06759040216289287, "grad_norm": 1.1373995313396938, "learning_rate": 4.995778415427042e-06, "loss": 0.2295, "step": 150 }, { "epoch": 0.06804100484397882, "grad_norm": 1.4124184544565843, "learning_rate": 4.995559213315267e-06, "loss": 0.2319, "step": 151 }, { "epoch": 0.06849160752506478, "grad_norm": 1.1988625116568261, "learning_rate": 4.995334468440213e-06, "loss": 0.2239, "step": 152 }, { "epoch": 0.06894221020615073, "grad_norm": 1.2400456487781812, "learning_rate": 4.995104181301052e-06, "loss": 0.2364, "step": 153 }, { "epoch": 0.06939281288723669, "grad_norm": 1.169373197319975, "learning_rate": 4.994868352409263e-06, "loss": 0.2209, "step": 154 }, { "epoch": 0.06984341556832263, "grad_norm": 1.2105436784965689, "learning_rate": 4.9946269822886335e-06, "loss": 0.2294, "step": 155 }, { "epoch": 0.07029401824940859, "grad_norm": 1.1146096056941384, "learning_rate": 4.9943800714752586e-06, "loss": 0.214, "step": 156 }, { "epoch": 0.07074462093049454, "grad_norm": 1.107431297406545, "learning_rate": 4.9941276205175405e-06, "loss": 0.2153, "step": 157 }, { "epoch": 0.07119522361158048, "grad_norm": 1.1493108111633201, "learning_rate": 4.9938696299761856e-06, "loss": 0.2127, "step": 158 }, { "epoch": 0.07164582629266644, "grad_norm": 1.1357095289262957, "learning_rate": 4.993606100424202e-06, "loss": 0.2089, "step": 159 }, { "epoch": 0.07209642897375239, "grad_norm": 1.2497593534516505, "learning_rate": 4.9933370324469045e-06, "loss": 0.2326, "step": 160 }, { "epoch": 0.07254703165483835, "grad_norm": 1.119865007378002, "learning_rate": 4.993062426641906e-06, "loss": 0.2084, "step": 161 }, { "epoch": 0.0729976343359243, "grad_norm": 1.1584209347845942, "learning_rate": 4.9927822836191185e-06, "loss": 0.2124, "step": 162 }, { "epoch": 0.07344823701701025, "grad_norm": 1.1154381951406347, "learning_rate": 4.992496604000756e-06, "loss": 0.2123, "step": 163 }, { "epoch": 0.0738988396980962, "grad_norm": 1.2442046545527665, "learning_rate": 4.992205388421326e-06, "loss": 0.2253, "step": 164 }, { "epoch": 0.07434944237918216, "grad_norm": 1.300606227018797, "learning_rate": 4.991908637527634e-06, "loss": 0.2472, "step": 165 }, { "epoch": 0.0748000450602681, "grad_norm": 1.2450291533554696, "learning_rate": 4.9916063519787775e-06, "loss": 0.2369, "step": 166 }, { "epoch": 0.07525064774135407, "grad_norm": 1.2066695787715223, "learning_rate": 4.991298532446149e-06, "loss": 0.2242, "step": 167 }, { "epoch": 0.07570125042244001, "grad_norm": 1.0901339065424078, "learning_rate": 4.990985179613431e-06, "loss": 0.2155, "step": 168 }, { "epoch": 0.07615185310352597, "grad_norm": 1.2221127283502966, "learning_rate": 4.990666294176596e-06, "loss": 0.2231, "step": 169 }, { "epoch": 0.07660245578461192, "grad_norm": 1.1620496889627006, "learning_rate": 4.990341876843904e-06, "loss": 0.2211, "step": 170 }, { "epoch": 0.07705305846569788, "grad_norm": 1.3071221084528932, "learning_rate": 4.9900119283359025e-06, "loss": 0.2164, "step": 171 }, { "epoch": 0.07750366114678382, "grad_norm": 1.206047751336447, "learning_rate": 4.989676449385426e-06, "loss": 0.2281, "step": 172 }, { "epoch": 0.07795426382786978, "grad_norm": 1.4597757942350271, "learning_rate": 4.989335440737587e-06, "loss": 0.2506, "step": 173 }, { "epoch": 0.07840486650895573, "grad_norm": 1.379958299076823, "learning_rate": 4.988988903149784e-06, "loss": 0.2189, "step": 174 }, { "epoch": 0.07885546919004167, "grad_norm": 1.20920858944242, "learning_rate": 4.988636837391696e-06, "loss": 0.2314, "step": 175 }, { "epoch": 0.07930607187112763, "grad_norm": 1.2313142738643095, "learning_rate": 4.988279244245278e-06, "loss": 0.2205, "step": 176 }, { "epoch": 0.07975667455221358, "grad_norm": 1.3661071015991848, "learning_rate": 4.987916124504761e-06, "loss": 0.2306, "step": 177 }, { "epoch": 0.08020727723329954, "grad_norm": 1.1046734494634796, "learning_rate": 4.987547478976655e-06, "loss": 0.2081, "step": 178 }, { "epoch": 0.08065787991438549, "grad_norm": 1.3121004896955752, "learning_rate": 4.987173308479738e-06, "loss": 0.2515, "step": 179 }, { "epoch": 0.08110848259547145, "grad_norm": 1.0664036431852173, "learning_rate": 4.9867936138450635e-06, "loss": 0.2124, "step": 180 }, { "epoch": 0.08155908527655739, "grad_norm": 1.1905058604489476, "learning_rate": 4.98640839591595e-06, "loss": 0.2087, "step": 181 }, { "epoch": 0.08200968795764335, "grad_norm": 1.1400198695770818, "learning_rate": 4.986017655547989e-06, "loss": 0.2155, "step": 182 }, { "epoch": 0.0824602906387293, "grad_norm": 1.1958747980717914, "learning_rate": 4.985621393609032e-06, "loss": 0.2299, "step": 183 }, { "epoch": 0.08291089331981526, "grad_norm": 1.1612767983501473, "learning_rate": 4.9852196109792e-06, "loss": 0.2128, "step": 184 }, { "epoch": 0.0833614960009012, "grad_norm": 1.09026246469268, "learning_rate": 4.984812308550869e-06, "loss": 0.2099, "step": 185 }, { "epoch": 0.08381209868198716, "grad_norm": 1.235643788294065, "learning_rate": 4.98439948722868e-06, "loss": 0.2101, "step": 186 }, { "epoch": 0.08426270136307311, "grad_norm": 1.0869085522856416, "learning_rate": 4.98398114792953e-06, "loss": 0.2193, "step": 187 }, { "epoch": 0.08471330404415907, "grad_norm": 1.1675783445811345, "learning_rate": 4.983557291582572e-06, "loss": 0.2398, "step": 188 }, { "epoch": 0.08516390672524501, "grad_norm": 1.2182948524762915, "learning_rate": 4.9831279191292114e-06, "loss": 0.2315, "step": 189 }, { "epoch": 0.08561450940633097, "grad_norm": 1.1259138226408614, "learning_rate": 4.982693031523107e-06, "loss": 0.2234, "step": 190 }, { "epoch": 0.08606511208741692, "grad_norm": 1.117678078505707, "learning_rate": 4.982252629730167e-06, "loss": 0.2308, "step": 191 }, { "epoch": 0.08651571476850287, "grad_norm": 1.0912133687527328, "learning_rate": 4.981806714728543e-06, "loss": 0.205, "step": 192 }, { "epoch": 0.08696631744958883, "grad_norm": 1.1608197726702825, "learning_rate": 4.981355287508638e-06, "loss": 0.2223, "step": 193 }, { "epoch": 0.08741692013067477, "grad_norm": 1.1867139699465552, "learning_rate": 4.980898349073094e-06, "loss": 0.2356, "step": 194 }, { "epoch": 0.08786752281176073, "grad_norm": 1.1699794880063066, "learning_rate": 4.980435900436793e-06, "loss": 0.212, "step": 195 }, { "epoch": 0.08831812549284668, "grad_norm": 1.162570000485188, "learning_rate": 4.9799679426268575e-06, "loss": 0.2385, "step": 196 }, { "epoch": 0.08876872817393264, "grad_norm": 1.1699257911021248, "learning_rate": 4.979494476682647e-06, "loss": 0.2215, "step": 197 }, { "epoch": 0.08921933085501858, "grad_norm": 1.142705953177672, "learning_rate": 4.979015503655751e-06, "loss": 0.2426, "step": 198 }, { "epoch": 0.08966993353610454, "grad_norm": 1.1079034543480484, "learning_rate": 4.978531024609994e-06, "loss": 0.1941, "step": 199 }, { "epoch": 0.09012053621719049, "grad_norm": 1.3549506501630537, "learning_rate": 4.978041040621428e-06, "loss": 0.2129, "step": 200 }, { "epoch": 0.09057113889827645, "grad_norm": 1.2443686889827277, "learning_rate": 4.977545552778333e-06, "loss": 0.2423, "step": 201 }, { "epoch": 0.0910217415793624, "grad_norm": 1.3448778534838146, "learning_rate": 4.977044562181212e-06, "loss": 0.2263, "step": 202 }, { "epoch": 0.09147234426044835, "grad_norm": 1.3809990244237884, "learning_rate": 4.9765380699427905e-06, "loss": 0.2332, "step": 203 }, { "epoch": 0.0919229469415343, "grad_norm": 1.194186222769067, "learning_rate": 4.976026077188013e-06, "loss": 0.2123, "step": 204 }, { "epoch": 0.09237354962262026, "grad_norm": 1.2171969561529559, "learning_rate": 4.9755085850540426e-06, "loss": 0.2176, "step": 205 }, { "epoch": 0.0928241523037062, "grad_norm": 1.2441216278856086, "learning_rate": 4.974985594690255e-06, "loss": 0.219, "step": 206 }, { "epoch": 0.09327475498479217, "grad_norm": 1.1897634301147553, "learning_rate": 4.9744571072582365e-06, "loss": 0.2118, "step": 207 }, { "epoch": 0.09372535766587811, "grad_norm": 1.2669159238364731, "learning_rate": 4.973923123931786e-06, "loss": 0.2289, "step": 208 }, { "epoch": 0.09417596034696406, "grad_norm": 1.0996102317785694, "learning_rate": 4.973383645896908e-06, "loss": 0.2184, "step": 209 }, { "epoch": 0.09462656302805002, "grad_norm": 1.2185753643269248, "learning_rate": 4.97283867435181e-06, "loss": 0.2207, "step": 210 }, { "epoch": 0.09507716570913596, "grad_norm": 1.138433632415245, "learning_rate": 4.972288210506902e-06, "loss": 0.212, "step": 211 }, { "epoch": 0.09552776839022192, "grad_norm": 1.132998283546536, "learning_rate": 4.971732255584789e-06, "loss": 0.2367, "step": 212 }, { "epoch": 0.09597837107130787, "grad_norm": 1.0780010142719922, "learning_rate": 4.971170810820279e-06, "loss": 0.2113, "step": 213 }, { "epoch": 0.09642897375239383, "grad_norm": 1.1546441401328669, "learning_rate": 4.970603877460367e-06, "loss": 0.2252, "step": 214 }, { "epoch": 0.09687957643347977, "grad_norm": 1.1087587222673596, "learning_rate": 4.970031456764242e-06, "loss": 0.2202, "step": 215 }, { "epoch": 0.09733017911456573, "grad_norm": 1.0767581515636935, "learning_rate": 4.969453550003277e-06, "loss": 0.2074, "step": 216 }, { "epoch": 0.09778078179565168, "grad_norm": 1.2355538300715063, "learning_rate": 4.9688701584610345e-06, "loss": 0.2331, "step": 217 }, { "epoch": 0.09823138447673764, "grad_norm": 1.0345630784935012, "learning_rate": 4.968281283433256e-06, "loss": 0.2089, "step": 218 }, { "epoch": 0.09868198715782359, "grad_norm": 1.1203797258878334, "learning_rate": 4.967686926227862e-06, "loss": 0.1997, "step": 219 }, { "epoch": 0.09913258983890955, "grad_norm": 1.1144022340348496, "learning_rate": 4.967087088164951e-06, "loss": 0.205, "step": 220 }, { "epoch": 0.09958319251999549, "grad_norm": 1.0263017472875062, "learning_rate": 4.966481770576793e-06, "loss": 0.1965, "step": 221 }, { "epoch": 0.10003379520108145, "grad_norm": 1.1372226739430344, "learning_rate": 4.965870974807829e-06, "loss": 0.2166, "step": 222 }, { "epoch": 0.1004843978821674, "grad_norm": 1.1239275991584654, "learning_rate": 4.965254702214668e-06, "loss": 0.2011, "step": 223 }, { "epoch": 0.10093500056325336, "grad_norm": 1.2744989782535152, "learning_rate": 4.964632954166081e-06, "loss": 0.2139, "step": 224 }, { "epoch": 0.1013856032443393, "grad_norm": 1.1648813557027615, "learning_rate": 4.964005732043003e-06, "loss": 0.2103, "step": 225 }, { "epoch": 0.10183620592542525, "grad_norm": 1.0781643061072272, "learning_rate": 4.963373037238527e-06, "loss": 0.2101, "step": 226 }, { "epoch": 0.10228680860651121, "grad_norm": 1.2248828989051597, "learning_rate": 4.9627348711578996e-06, "loss": 0.2174, "step": 227 }, { "epoch": 0.10273741128759716, "grad_norm": 1.1754380302621135, "learning_rate": 4.962091235218518e-06, "loss": 0.207, "step": 228 }, { "epoch": 0.10318801396868311, "grad_norm": 1.1695992674462652, "learning_rate": 4.961442130849933e-06, "loss": 0.2207, "step": 229 }, { "epoch": 0.10363861664976906, "grad_norm": 1.1412176861499799, "learning_rate": 4.960787559493836e-06, "loss": 0.2327, "step": 230 }, { "epoch": 0.10408921933085502, "grad_norm": 1.2884964764354325, "learning_rate": 4.960127522604065e-06, "loss": 0.2151, "step": 231 }, { "epoch": 0.10453982201194097, "grad_norm": 1.1575013412061594, "learning_rate": 4.959462021646593e-06, "loss": 0.2082, "step": 232 }, { "epoch": 0.10499042469302693, "grad_norm": 1.041034111903912, "learning_rate": 4.958791058099533e-06, "loss": 0.2302, "step": 233 }, { "epoch": 0.10544102737411287, "grad_norm": 1.2510411794297505, "learning_rate": 4.95811463345313e-06, "loss": 0.2197, "step": 234 }, { "epoch": 0.10589163005519883, "grad_norm": 1.1556663492126247, "learning_rate": 4.957432749209755e-06, "loss": 0.2086, "step": 235 }, { "epoch": 0.10634223273628478, "grad_norm": 1.2247058461119267, "learning_rate": 4.956745406883909e-06, "loss": 0.2032, "step": 236 }, { "epoch": 0.10679283541737074, "grad_norm": 1.192347886801293, "learning_rate": 4.956052608002212e-06, "loss": 0.2254, "step": 237 }, { "epoch": 0.10724343809845668, "grad_norm": 1.223611134683997, "learning_rate": 4.9553543541034086e-06, "loss": 0.2285, "step": 238 }, { "epoch": 0.10769404077954264, "grad_norm": 1.249263593149513, "learning_rate": 4.954650646738354e-06, "loss": 0.2073, "step": 239 }, { "epoch": 0.10814464346062859, "grad_norm": 1.2193624268365642, "learning_rate": 4.953941487470017e-06, "loss": 0.224, "step": 240 }, { "epoch": 0.10859524614171455, "grad_norm": 1.1481178521059008, "learning_rate": 4.953226877873479e-06, "loss": 0.2107, "step": 241 }, { "epoch": 0.1090458488228005, "grad_norm": 1.10926361952047, "learning_rate": 4.952506819535922e-06, "loss": 0.2134, "step": 242 }, { "epoch": 0.10949645150388645, "grad_norm": 1.0597226992856483, "learning_rate": 4.951781314056633e-06, "loss": 0.2056, "step": 243 }, { "epoch": 0.1099470541849724, "grad_norm": 1.1791047972354118, "learning_rate": 4.951050363046995e-06, "loss": 0.232, "step": 244 }, { "epoch": 0.11039765686605835, "grad_norm": 1.234202693540785, "learning_rate": 4.950313968130488e-06, "loss": 0.241, "step": 245 }, { "epoch": 0.1108482595471443, "grad_norm": 1.1783658842486509, "learning_rate": 4.949572130942683e-06, "loss": 0.2186, "step": 246 }, { "epoch": 0.11129886222823025, "grad_norm": 1.1081432331819334, "learning_rate": 4.948824853131237e-06, "loss": 0.1974, "step": 247 }, { "epoch": 0.11174946490931621, "grad_norm": 1.122692447305528, "learning_rate": 4.948072136355892e-06, "loss": 0.2176, "step": 248 }, { "epoch": 0.11220006759040216, "grad_norm": 1.1112537761570087, "learning_rate": 4.94731398228847e-06, "loss": 0.2141, "step": 249 }, { "epoch": 0.11265067027148812, "grad_norm": 1.2229695315887537, "learning_rate": 4.94655039261287e-06, "loss": 0.2324, "step": 250 }, { "epoch": 0.11310127295257406, "grad_norm": 1.1468824200816254, "learning_rate": 4.9457813690250635e-06, "loss": 0.2212, "step": 251 }, { "epoch": 0.11355187563366002, "grad_norm": 1.1102275136522628, "learning_rate": 4.94500691323309e-06, "loss": 0.218, "step": 252 }, { "epoch": 0.11400247831474597, "grad_norm": 1.0561632156543708, "learning_rate": 4.9442270269570545e-06, "loss": 0.1878, "step": 253 }, { "epoch": 0.11445308099583193, "grad_norm": 1.0005190577253862, "learning_rate": 4.943441711929126e-06, "loss": 0.1999, "step": 254 }, { "epoch": 0.11490368367691788, "grad_norm": 1.2093067612482298, "learning_rate": 4.942650969893527e-06, "loss": 0.2288, "step": 255 }, { "epoch": 0.11535428635800384, "grad_norm": 1.1937432006596116, "learning_rate": 4.941854802606537e-06, "loss": 0.2274, "step": 256 }, { "epoch": 0.11580488903908978, "grad_norm": 1.1330892183935246, "learning_rate": 4.941053211836482e-06, "loss": 0.2198, "step": 257 }, { "epoch": 0.11625549172017574, "grad_norm": 1.0548114873532413, "learning_rate": 4.940246199363737e-06, "loss": 0.2115, "step": 258 }, { "epoch": 0.11670609440126169, "grad_norm": 1.0442025208171648, "learning_rate": 4.939433766980717e-06, "loss": 0.1878, "step": 259 }, { "epoch": 0.11715669708234765, "grad_norm": 1.1337795310606045, "learning_rate": 4.9386159164918764e-06, "loss": 0.2234, "step": 260 }, { "epoch": 0.11760729976343359, "grad_norm": 1.14992228918911, "learning_rate": 4.937792649713701e-06, "loss": 0.2237, "step": 261 }, { "epoch": 0.11805790244451954, "grad_norm": 1.14839235156934, "learning_rate": 4.9369639684747095e-06, "loss": 0.2221, "step": 262 }, { "epoch": 0.1185085051256055, "grad_norm": 1.0767897369879735, "learning_rate": 4.936129874615443e-06, "loss": 0.2039, "step": 263 }, { "epoch": 0.11895910780669144, "grad_norm": 0.9979796809326557, "learning_rate": 4.935290369988468e-06, "loss": 0.2034, "step": 264 }, { "epoch": 0.1194097104877774, "grad_norm": 1.0530126787897642, "learning_rate": 4.934445456458366e-06, "loss": 0.2211, "step": 265 }, { "epoch": 0.11986031316886335, "grad_norm": 1.1032116938045, "learning_rate": 4.933595135901733e-06, "loss": 0.2063, "step": 266 }, { "epoch": 0.12031091584994931, "grad_norm": 1.168793383146328, "learning_rate": 4.932739410207172e-06, "loss": 0.2217, "step": 267 }, { "epoch": 0.12076151853103526, "grad_norm": 1.1618233220895118, "learning_rate": 4.931878281275296e-06, "loss": 0.195, "step": 268 }, { "epoch": 0.12121212121212122, "grad_norm": 1.1834676718267456, "learning_rate": 4.931011751018715e-06, "loss": 0.2276, "step": 269 }, { "epoch": 0.12166272389320716, "grad_norm": 1.1934211671010873, "learning_rate": 4.930139821362036e-06, "loss": 0.2232, "step": 270 }, { "epoch": 0.12211332657429312, "grad_norm": 1.0812911650522306, "learning_rate": 4.929262494241859e-06, "loss": 0.2235, "step": 271 }, { "epoch": 0.12256392925537907, "grad_norm": 1.0858861894225589, "learning_rate": 4.928379771606773e-06, "loss": 0.2024, "step": 272 }, { "epoch": 0.12301453193646503, "grad_norm": 0.9937957531157218, "learning_rate": 4.927491655417347e-06, "loss": 0.2064, "step": 273 }, { "epoch": 0.12346513461755097, "grad_norm": 1.02867764239494, "learning_rate": 4.926598147646134e-06, "loss": 0.1913, "step": 274 }, { "epoch": 0.12391573729863693, "grad_norm": 1.1906053232315303, "learning_rate": 4.9256992502776605e-06, "loss": 0.2348, "step": 275 }, { "epoch": 0.12436633997972288, "grad_norm": 1.1698084508619528, "learning_rate": 4.924794965308421e-06, "loss": 0.2167, "step": 276 }, { "epoch": 0.12481694266080884, "grad_norm": 1.0742190591584848, "learning_rate": 4.9238852947468796e-06, "loss": 0.2228, "step": 277 }, { "epoch": 0.12526754534189477, "grad_norm": 1.2705248393755768, "learning_rate": 4.922970240613461e-06, "loss": 0.218, "step": 278 }, { "epoch": 0.12571814802298073, "grad_norm": 1.056558181907751, "learning_rate": 4.922049804940546e-06, "loss": 0.2061, "step": 279 }, { "epoch": 0.1261687507040667, "grad_norm": 1.1196465892653233, "learning_rate": 4.9211239897724685e-06, "loss": 0.2195, "step": 280 }, { "epoch": 0.12661935338515265, "grad_norm": 1.1621109193661827, "learning_rate": 4.920192797165511e-06, "loss": 0.2256, "step": 281 }, { "epoch": 0.12706995606623858, "grad_norm": 1.300028969200385, "learning_rate": 4.9192562291879e-06, "loss": 0.2344, "step": 282 }, { "epoch": 0.12752055874732454, "grad_norm": 1.1955520034272398, "learning_rate": 4.9183142879198e-06, "loss": 0.211, "step": 283 }, { "epoch": 0.1279711614284105, "grad_norm": 1.155931030252852, "learning_rate": 4.917366975453311e-06, "loss": 0.216, "step": 284 }, { "epoch": 0.12842176410949646, "grad_norm": 1.083861187612339, "learning_rate": 4.9164142938924595e-06, "loss": 0.2222, "step": 285 }, { "epoch": 0.1288723667905824, "grad_norm": 1.1248471153486164, "learning_rate": 4.915456245353202e-06, "loss": 0.2093, "step": 286 }, { "epoch": 0.12932296947166835, "grad_norm": 1.221633030384653, "learning_rate": 4.914492831963411e-06, "loss": 0.2086, "step": 287 }, { "epoch": 0.1297735721527543, "grad_norm": 1.2342848534915776, "learning_rate": 4.9135240558628786e-06, "loss": 0.224, "step": 288 }, { "epoch": 0.13022417483384027, "grad_norm": 1.0704411714834268, "learning_rate": 4.9125499192033035e-06, "loss": 0.1953, "step": 289 }, { "epoch": 0.1306747775149262, "grad_norm": 1.1195374340985238, "learning_rate": 4.911570424148293e-06, "loss": 0.2052, "step": 290 }, { "epoch": 0.13112538019601216, "grad_norm": 1.2051951166308854, "learning_rate": 4.910585572873355e-06, "loss": 0.2335, "step": 291 }, { "epoch": 0.13157598287709812, "grad_norm": 1.1643319404848254, "learning_rate": 4.9095953675658945e-06, "loss": 0.2146, "step": 292 }, { "epoch": 0.13202658555818408, "grad_norm": 1.0772311574842537, "learning_rate": 4.908599810425208e-06, "loss": 0.2192, "step": 293 }, { "epoch": 0.13247718823927002, "grad_norm": 1.0844966317512954, "learning_rate": 4.907598903662477e-06, "loss": 0.2222, "step": 294 }, { "epoch": 0.13292779092035598, "grad_norm": 1.0524330356877105, "learning_rate": 4.906592649500767e-06, "loss": 0.2306, "step": 295 }, { "epoch": 0.13337839360144194, "grad_norm": 1.1527168046897904, "learning_rate": 4.9055810501750205e-06, "loss": 0.2085, "step": 296 }, { "epoch": 0.13382899628252787, "grad_norm": 1.093657074566182, "learning_rate": 4.904564107932048e-06, "loss": 0.2107, "step": 297 }, { "epoch": 0.13427959896361383, "grad_norm": 1.0682596809770557, "learning_rate": 4.9035418250305314e-06, "loss": 0.2092, "step": 298 }, { "epoch": 0.1347302016446998, "grad_norm": 1.0844093558079646, "learning_rate": 4.902514203741013e-06, "loss": 0.2102, "step": 299 }, { "epoch": 0.13518080432578575, "grad_norm": 1.1018401031463165, "learning_rate": 4.9014812463458905e-06, "loss": 0.2192, "step": 300 }, { "epoch": 0.13563140700687168, "grad_norm": 1.1860232064810416, "learning_rate": 4.9004429551394155e-06, "loss": 0.2195, "step": 301 }, { "epoch": 0.13608200968795764, "grad_norm": 1.0961452643370815, "learning_rate": 4.899399332427685e-06, "loss": 0.214, "step": 302 }, { "epoch": 0.1365326123690436, "grad_norm": 1.1694339378011251, "learning_rate": 4.898350380528638e-06, "loss": 0.2242, "step": 303 }, { "epoch": 0.13698321505012956, "grad_norm": 1.1381527321950415, "learning_rate": 4.89729610177205e-06, "loss": 0.2129, "step": 304 }, { "epoch": 0.1374338177312155, "grad_norm": 1.0391770844547668, "learning_rate": 4.896236498499526e-06, "loss": 0.2175, "step": 305 }, { "epoch": 0.13788442041230145, "grad_norm": 1.195827567002921, "learning_rate": 4.8951715730645e-06, "loss": 0.2103, "step": 306 }, { "epoch": 0.1383350230933874, "grad_norm": 1.0423117479504835, "learning_rate": 4.894101327832225e-06, "loss": 0.2097, "step": 307 }, { "epoch": 0.13878562577447337, "grad_norm": 1.0259954075618385, "learning_rate": 4.89302576517977e-06, "loss": 0.1999, "step": 308 }, { "epoch": 0.1392362284555593, "grad_norm": 1.0589967702762215, "learning_rate": 4.891944887496013e-06, "loss": 0.22, "step": 309 }, { "epoch": 0.13968683113664526, "grad_norm": 1.1349746682270179, "learning_rate": 4.890858697181638e-06, "loss": 0.2167, "step": 310 }, { "epoch": 0.14013743381773122, "grad_norm": 1.1205630914176257, "learning_rate": 4.8897671966491315e-06, "loss": 0.2113, "step": 311 }, { "epoch": 0.14058803649881718, "grad_norm": 1.106503889764021, "learning_rate": 4.888670388322768e-06, "loss": 0.2199, "step": 312 }, { "epoch": 0.1410386391799031, "grad_norm": 1.1486711288247042, "learning_rate": 4.887568274638616e-06, "loss": 0.2196, "step": 313 }, { "epoch": 0.14148924186098907, "grad_norm": 1.0386088355319454, "learning_rate": 4.886460858044524e-06, "loss": 0.2107, "step": 314 }, { "epoch": 0.14193984454207503, "grad_norm": 1.1205610283336147, "learning_rate": 4.8853481410001225e-06, "loss": 0.2114, "step": 315 }, { "epoch": 0.14239044722316097, "grad_norm": 1.0635378718798172, "learning_rate": 4.884230125976812e-06, "loss": 0.215, "step": 316 }, { "epoch": 0.14284104990424693, "grad_norm": 1.1079718736074389, "learning_rate": 4.883106815457758e-06, "loss": 0.2083, "step": 317 }, { "epoch": 0.14329165258533288, "grad_norm": 1.093694928005843, "learning_rate": 4.881978211937895e-06, "loss": 0.208, "step": 318 }, { "epoch": 0.14374225526641884, "grad_norm": 1.189116563826118, "learning_rate": 4.8808443179239025e-06, "loss": 0.2353, "step": 319 }, { "epoch": 0.14419285794750478, "grad_norm": 1.0418971408472701, "learning_rate": 4.87970513593422e-06, "loss": 0.194, "step": 320 }, { "epoch": 0.14464346062859074, "grad_norm": 1.2566564381502143, "learning_rate": 4.878560668499029e-06, "loss": 0.2026, "step": 321 }, { "epoch": 0.1450940633096767, "grad_norm": 1.1717876183709541, "learning_rate": 4.877410918160247e-06, "loss": 0.2263, "step": 322 }, { "epoch": 0.14554466599076266, "grad_norm": 1.0480720367468714, "learning_rate": 4.87625588747153e-06, "loss": 0.1973, "step": 323 }, { "epoch": 0.1459952686718486, "grad_norm": 1.124387310287164, "learning_rate": 4.875095578998258e-06, "loss": 0.2079, "step": 324 }, { "epoch": 0.14644587135293455, "grad_norm": 1.0591920566737216, "learning_rate": 4.873929995317535e-06, "loss": 0.2112, "step": 325 }, { "epoch": 0.1468964740340205, "grad_norm": 1.0816650484251327, "learning_rate": 4.872759139018183e-06, "loss": 0.1993, "step": 326 }, { "epoch": 0.14734707671510647, "grad_norm": 1.048432719751396, "learning_rate": 4.87158301270073e-06, "loss": 0.2012, "step": 327 }, { "epoch": 0.1477976793961924, "grad_norm": 1.0772249510553387, "learning_rate": 4.870401618977415e-06, "loss": 0.2111, "step": 328 }, { "epoch": 0.14824828207727836, "grad_norm": 1.10268941823853, "learning_rate": 4.869214960472172e-06, "loss": 0.1897, "step": 329 }, { "epoch": 0.14869888475836432, "grad_norm": 1.1182951938865848, "learning_rate": 4.868023039820629e-06, "loss": 0.2033, "step": 330 }, { "epoch": 0.14914948743945025, "grad_norm": 1.1372072508746598, "learning_rate": 4.8668258596701035e-06, "loss": 0.22, "step": 331 }, { "epoch": 0.1496000901205362, "grad_norm": 1.2455087803794016, "learning_rate": 4.865623422679593e-06, "loss": 0.2272, "step": 332 }, { "epoch": 0.15005069280162217, "grad_norm": 1.0882360090058665, "learning_rate": 4.864415731519769e-06, "loss": 0.2232, "step": 333 }, { "epoch": 0.15050129548270813, "grad_norm": 1.1280177553587423, "learning_rate": 4.8632027888729765e-06, "loss": 0.2105, "step": 334 }, { "epoch": 0.15095189816379406, "grad_norm": 1.112279326112284, "learning_rate": 4.861984597433223e-06, "loss": 0.2189, "step": 335 }, { "epoch": 0.15140250084488002, "grad_norm": 0.9537982452845898, "learning_rate": 4.860761159906171e-06, "loss": 0.1827, "step": 336 }, { "epoch": 0.15185310352596598, "grad_norm": 1.0957786431292713, "learning_rate": 4.859532479009138e-06, "loss": 0.2112, "step": 337 }, { "epoch": 0.15230370620705194, "grad_norm": 1.0489404604521035, "learning_rate": 4.858298557471089e-06, "loss": 0.2092, "step": 338 }, { "epoch": 0.15275430888813787, "grad_norm": 1.0606372053262079, "learning_rate": 4.857059398032622e-06, "loss": 0.2042, "step": 339 }, { "epoch": 0.15320491156922383, "grad_norm": 1.0656352397918851, "learning_rate": 4.855815003445975e-06, "loss": 0.2111, "step": 340 }, { "epoch": 0.1536555142503098, "grad_norm": 1.1368796379748434, "learning_rate": 4.8545653764750125e-06, "loss": 0.2239, "step": 341 }, { "epoch": 0.15410611693139575, "grad_norm": 1.1472947549450268, "learning_rate": 4.853310519895217e-06, "loss": 0.2022, "step": 342 }, { "epoch": 0.15455671961248169, "grad_norm": 1.0772677119769347, "learning_rate": 4.85205043649369e-06, "loss": 0.2068, "step": 343 }, { "epoch": 0.15500732229356765, "grad_norm": 1.0646966878370703, "learning_rate": 4.850785129069139e-06, "loss": 0.2005, "step": 344 }, { "epoch": 0.1554579249746536, "grad_norm": 1.1191305904999096, "learning_rate": 4.849514600431877e-06, "loss": 0.2006, "step": 345 }, { "epoch": 0.15590852765573956, "grad_norm": 1.1166711817286015, "learning_rate": 4.848238853403813e-06, "loss": 0.2368, "step": 346 }, { "epoch": 0.1563591303368255, "grad_norm": 1.0823358127817442, "learning_rate": 4.846957890818444e-06, "loss": 0.2174, "step": 347 }, { "epoch": 0.15680973301791146, "grad_norm": 1.1055726640147139, "learning_rate": 4.845671715520853e-06, "loss": 0.1982, "step": 348 }, { "epoch": 0.15726033569899742, "grad_norm": 1.2381768961440425, "learning_rate": 4.844380330367701e-06, "loss": 0.2163, "step": 349 }, { "epoch": 0.15771093838008335, "grad_norm": 1.150113639278982, "learning_rate": 4.84308373822722e-06, "loss": 0.2211, "step": 350 }, { "epoch": 0.1581615410611693, "grad_norm": 1.0537141133450023, "learning_rate": 4.841781941979207e-06, "loss": 0.2055, "step": 351 }, { "epoch": 0.15861214374225527, "grad_norm": 1.213212670501889, "learning_rate": 4.840474944515017e-06, "loss": 0.2377, "step": 352 }, { "epoch": 0.15906274642334123, "grad_norm": 1.1778409627654554, "learning_rate": 4.839162748737556e-06, "loss": 0.2274, "step": 353 }, { "epoch": 0.15951334910442716, "grad_norm": 1.1285072146679263, "learning_rate": 4.8378453575612785e-06, "loss": 0.2235, "step": 354 }, { "epoch": 0.15996395178551312, "grad_norm": 1.0263423089549462, "learning_rate": 4.836522773912178e-06, "loss": 0.2088, "step": 355 }, { "epoch": 0.16041455446659908, "grad_norm": 1.1673268220763229, "learning_rate": 4.835195000727778e-06, "loss": 0.2186, "step": 356 }, { "epoch": 0.16086515714768504, "grad_norm": 1.010037043801117, "learning_rate": 4.83386204095713e-06, "loss": 0.2196, "step": 357 }, { "epoch": 0.16131575982877097, "grad_norm": 1.038204264055667, "learning_rate": 4.832523897560806e-06, "loss": 0.2085, "step": 358 }, { "epoch": 0.16176636250985693, "grad_norm": 1.014200195964716, "learning_rate": 4.83118057351089e-06, "loss": 0.2022, "step": 359 }, { "epoch": 0.1622169651909429, "grad_norm": 1.044154989517685, "learning_rate": 4.829832071790972e-06, "loss": 0.2096, "step": 360 }, { "epoch": 0.16266756787202885, "grad_norm": 1.0855706592511223, "learning_rate": 4.828478395396143e-06, "loss": 0.2167, "step": 361 }, { "epoch": 0.16311817055311478, "grad_norm": 1.144346091551533, "learning_rate": 4.827119547332988e-06, "loss": 0.2092, "step": 362 }, { "epoch": 0.16356877323420074, "grad_norm": 1.0894953489918704, "learning_rate": 4.825755530619576e-06, "loss": 0.1952, "step": 363 }, { "epoch": 0.1640193759152867, "grad_norm": 1.0974545315300446, "learning_rate": 4.824386348285456e-06, "loss": 0.2099, "step": 364 }, { "epoch": 0.16446997859637263, "grad_norm": 1.1097692923627165, "learning_rate": 4.8230120033716525e-06, "loss": 0.2006, "step": 365 }, { "epoch": 0.1649205812774586, "grad_norm": 1.065810253027864, "learning_rate": 4.821632498930656e-06, "loss": 0.2029, "step": 366 }, { "epoch": 0.16537118395854455, "grad_norm": 1.1383355108751643, "learning_rate": 4.820247838026414e-06, "loss": 0.2212, "step": 367 }, { "epoch": 0.16582178663963051, "grad_norm": 1.0915403947722182, "learning_rate": 4.81885802373433e-06, "loss": 0.2087, "step": 368 }, { "epoch": 0.16627238932071645, "grad_norm": 1.1135779035259634, "learning_rate": 4.8174630591412495e-06, "loss": 0.2038, "step": 369 }, { "epoch": 0.1667229920018024, "grad_norm": 1.0641064873717077, "learning_rate": 4.816062947345462e-06, "loss": 0.1974, "step": 370 }, { "epoch": 0.16717359468288837, "grad_norm": 1.101177412661662, "learning_rate": 4.814657691456685e-06, "loss": 0.2086, "step": 371 }, { "epoch": 0.16762419736397433, "grad_norm": 1.025754827090159, "learning_rate": 4.813247294596065e-06, "loss": 0.1984, "step": 372 }, { "epoch": 0.16807480004506026, "grad_norm": 1.0645510570530907, "learning_rate": 4.8118317598961625e-06, "loss": 0.2132, "step": 373 }, { "epoch": 0.16852540272614622, "grad_norm": 1.1139680760636317, "learning_rate": 4.810411090500952e-06, "loss": 0.2032, "step": 374 }, { "epoch": 0.16897600540723218, "grad_norm": 1.0717001593600692, "learning_rate": 4.808985289565813e-06, "loss": 0.1972, "step": 375 }, { "epoch": 0.16942660808831814, "grad_norm": 1.1069937243229389, "learning_rate": 4.807554360257522e-06, "loss": 0.2119, "step": 376 }, { "epoch": 0.16987721076940407, "grad_norm": 1.1217259717457204, "learning_rate": 4.8061183057542424e-06, "loss": 0.2078, "step": 377 }, { "epoch": 0.17032781345049003, "grad_norm": 1.0468196713561457, "learning_rate": 4.804677129245527e-06, "loss": 0.2038, "step": 378 }, { "epoch": 0.170778416131576, "grad_norm": 1.034643233619461, "learning_rate": 4.803230833932302e-06, "loss": 0.2051, "step": 379 }, { "epoch": 0.17122901881266195, "grad_norm": 1.0967312312186066, "learning_rate": 4.80177942302686e-06, "loss": 0.2101, "step": 380 }, { "epoch": 0.17167962149374788, "grad_norm": 1.1600308475491419, "learning_rate": 4.800322899752859e-06, "loss": 0.2215, "step": 381 }, { "epoch": 0.17213022417483384, "grad_norm": 1.0126108557101248, "learning_rate": 4.798861267345312e-06, "loss": 0.1999, "step": 382 }, { "epoch": 0.1725808268559198, "grad_norm": 0.9515557584883886, "learning_rate": 4.797394529050577e-06, "loss": 0.1771, "step": 383 }, { "epoch": 0.17303142953700573, "grad_norm": 1.1155117108927086, "learning_rate": 4.795922688126355e-06, "loss": 0.2343, "step": 384 }, { "epoch": 0.1734820322180917, "grad_norm": 0.9933539774509211, "learning_rate": 4.794445747841679e-06, "loss": 0.2018, "step": 385 }, { "epoch": 0.17393263489917765, "grad_norm": 1.0253085493855625, "learning_rate": 4.792963711476908e-06, "loss": 0.2109, "step": 386 }, { "epoch": 0.1743832375802636, "grad_norm": 1.0937444150984799, "learning_rate": 4.791476582323719e-06, "loss": 0.203, "step": 387 }, { "epoch": 0.17483384026134954, "grad_norm": 1.0994215912796548, "learning_rate": 4.7899843636851014e-06, "loss": 0.2066, "step": 388 }, { "epoch": 0.1752844429424355, "grad_norm": 1.0391589566973922, "learning_rate": 4.78848705887535e-06, "loss": 0.1987, "step": 389 }, { "epoch": 0.17573504562352146, "grad_norm": 1.025549842249766, "learning_rate": 4.786984671220053e-06, "loss": 0.2023, "step": 390 }, { "epoch": 0.17618564830460742, "grad_norm": 1.1952860424909204, "learning_rate": 4.785477204056089e-06, "loss": 0.2218, "step": 391 }, { "epoch": 0.17663625098569335, "grad_norm": 1.0996497512784118, "learning_rate": 4.78396466073162e-06, "loss": 0.2119, "step": 392 }, { "epoch": 0.17708685366677931, "grad_norm": 1.225758035064243, "learning_rate": 4.78244704460608e-06, "loss": 0.22, "step": 393 }, { "epoch": 0.17753745634786527, "grad_norm": 1.1385526357533455, "learning_rate": 4.7809243590501725e-06, "loss": 0.2062, "step": 394 }, { "epoch": 0.17798805902895123, "grad_norm": 1.0395990820077765, "learning_rate": 4.779396607445858e-06, "loss": 0.2056, "step": 395 }, { "epoch": 0.17843866171003717, "grad_norm": 1.1634899736709214, "learning_rate": 4.777863793186351e-06, "loss": 0.2025, "step": 396 }, { "epoch": 0.17888926439112313, "grad_norm": 1.2017617562113168, "learning_rate": 4.776325919676109e-06, "loss": 0.2005, "step": 397 }, { "epoch": 0.17933986707220909, "grad_norm": 1.0989936285862003, "learning_rate": 4.774782990330828e-06, "loss": 0.2109, "step": 398 }, { "epoch": 0.17979046975329505, "grad_norm": 1.0609884089685264, "learning_rate": 4.77323500857743e-06, "loss": 0.2122, "step": 399 }, { "epoch": 0.18024107243438098, "grad_norm": 0.9956576638871627, "learning_rate": 4.771681977854062e-06, "loss": 0.2017, "step": 400 }, { "epoch": 0.18069167511546694, "grad_norm": 1.0886524252129588, "learning_rate": 4.770123901610085e-06, "loss": 0.2165, "step": 401 }, { "epoch": 0.1811422777965529, "grad_norm": 1.049117446897747, "learning_rate": 4.768560783306064e-06, "loss": 0.2129, "step": 402 }, { "epoch": 0.18159288047763883, "grad_norm": 1.1728204295468192, "learning_rate": 4.7669926264137625e-06, "loss": 0.2196, "step": 403 }, { "epoch": 0.1820434831587248, "grad_norm": 1.0530113188774437, "learning_rate": 4.765419434416138e-06, "loss": 0.1988, "step": 404 }, { "epoch": 0.18249408583981075, "grad_norm": 1.1006552707986152, "learning_rate": 4.763841210807329e-06, "loss": 0.1969, "step": 405 }, { "epoch": 0.1829446885208967, "grad_norm": 1.1056839598789527, "learning_rate": 4.762257959092651e-06, "loss": 0.2165, "step": 406 }, { "epoch": 0.18339529120198264, "grad_norm": 1.166339387634052, "learning_rate": 4.760669682788584e-06, "loss": 0.2025, "step": 407 }, { "epoch": 0.1838458938830686, "grad_norm": 1.1845695122450088, "learning_rate": 4.759076385422773e-06, "loss": 0.2288, "step": 408 }, { "epoch": 0.18429649656415456, "grad_norm": 1.0314361289522471, "learning_rate": 4.7574780705340094e-06, "loss": 0.212, "step": 409 }, { "epoch": 0.18474709924524052, "grad_norm": 1.0060908332453342, "learning_rate": 4.755874741672233e-06, "loss": 0.1893, "step": 410 }, { "epoch": 0.18519770192632645, "grad_norm": 0.9813435104532242, "learning_rate": 4.754266402398517e-06, "loss": 0.2055, "step": 411 }, { "epoch": 0.1856483046074124, "grad_norm": 1.0197480260218619, "learning_rate": 4.752653056285066e-06, "loss": 0.2134, "step": 412 }, { "epoch": 0.18609890728849837, "grad_norm": 1.0464587885926853, "learning_rate": 4.7510347069152015e-06, "loss": 0.2224, "step": 413 }, { "epoch": 0.18654950996958433, "grad_norm": 1.1570694502589038, "learning_rate": 4.74941135788336e-06, "loss": 0.2171, "step": 414 }, { "epoch": 0.18700011265067026, "grad_norm": 1.0691121161245825, "learning_rate": 4.747783012795083e-06, "loss": 0.1931, "step": 415 }, { "epoch": 0.18745071533175622, "grad_norm": 1.0451364800748664, "learning_rate": 4.746149675267005e-06, "loss": 0.193, "step": 416 }, { "epoch": 0.18790131801284218, "grad_norm": 1.135060265323339, "learning_rate": 4.744511348926855e-06, "loss": 0.2188, "step": 417 }, { "epoch": 0.18835192069392812, "grad_norm": 1.0453942068357704, "learning_rate": 4.742868037413435e-06, "loss": 0.1882, "step": 418 }, { "epoch": 0.18880252337501408, "grad_norm": 1.1036284009807769, "learning_rate": 4.741219744376624e-06, "loss": 0.2077, "step": 419 }, { "epoch": 0.18925312605610004, "grad_norm": 1.0896714127730052, "learning_rate": 4.739566473477365e-06, "loss": 0.1957, "step": 420 }, { "epoch": 0.189703728737186, "grad_norm": 1.093929205439995, "learning_rate": 4.737908228387656e-06, "loss": 0.2204, "step": 421 }, { "epoch": 0.19015433141827193, "grad_norm": 0.957604866376255, "learning_rate": 4.736245012790543e-06, "loss": 0.1875, "step": 422 }, { "epoch": 0.1906049340993579, "grad_norm": 1.1117554592845362, "learning_rate": 4.734576830380113e-06, "loss": 0.2054, "step": 423 }, { "epoch": 0.19105553678044385, "grad_norm": 1.0285882348865756, "learning_rate": 4.732903684861482e-06, "loss": 0.1886, "step": 424 }, { "epoch": 0.1915061394615298, "grad_norm": 1.0396359377134285, "learning_rate": 4.731225579950791e-06, "loss": 0.2, "step": 425 }, { "epoch": 0.19195674214261574, "grad_norm": 1.0661040136052777, "learning_rate": 4.7295425193751974e-06, "loss": 0.2043, "step": 426 }, { "epoch": 0.1924073448237017, "grad_norm": 1.1689089296608768, "learning_rate": 4.727854506872863e-06, "loss": 0.2203, "step": 427 }, { "epoch": 0.19285794750478766, "grad_norm": 1.0981202998096562, "learning_rate": 4.726161546192949e-06, "loss": 0.2138, "step": 428 }, { "epoch": 0.19330855018587362, "grad_norm": 1.0895700511789579, "learning_rate": 4.724463641095606e-06, "loss": 0.2092, "step": 429 }, { "epoch": 0.19375915286695955, "grad_norm": 1.1454103133342515, "learning_rate": 4.7227607953519686e-06, "loss": 0.2123, "step": 430 }, { "epoch": 0.1942097555480455, "grad_norm": 1.081115059551756, "learning_rate": 4.721053012744142e-06, "loss": 0.2004, "step": 431 }, { "epoch": 0.19466035822913147, "grad_norm": 1.047592461395092, "learning_rate": 4.719340297065198e-06, "loss": 0.2012, "step": 432 }, { "epoch": 0.19511096091021743, "grad_norm": 1.0496672476197972, "learning_rate": 4.717622652119166e-06, "loss": 0.2096, "step": 433 }, { "epoch": 0.19556156359130336, "grad_norm": 1.1483209659869804, "learning_rate": 4.715900081721021e-06, "loss": 0.2035, "step": 434 }, { "epoch": 0.19601216627238932, "grad_norm": 1.0567737050713122, "learning_rate": 4.71417258969668e-06, "loss": 0.2093, "step": 435 }, { "epoch": 0.19646276895347528, "grad_norm": 1.006874897763615, "learning_rate": 4.712440179882989e-06, "loss": 0.1982, "step": 436 }, { "epoch": 0.1969133716345612, "grad_norm": 1.0512955804872948, "learning_rate": 4.710702856127718e-06, "loss": 0.2128, "step": 437 }, { "epoch": 0.19736397431564717, "grad_norm": 1.0421413033250877, "learning_rate": 4.708960622289552e-06, "loss": 0.196, "step": 438 }, { "epoch": 0.19781457699673313, "grad_norm": 1.0393078215848977, "learning_rate": 4.70721348223808e-06, "loss": 0.2003, "step": 439 }, { "epoch": 0.1982651796778191, "grad_norm": 1.0652170410294124, "learning_rate": 4.705461439853789e-06, "loss": 0.2038, "step": 440 }, { "epoch": 0.19871578235890502, "grad_norm": 1.148884011184797, "learning_rate": 4.703704499028052e-06, "loss": 0.2105, "step": 441 }, { "epoch": 0.19916638503999098, "grad_norm": 1.1182902562395605, "learning_rate": 4.701942663663126e-06, "loss": 0.2115, "step": 442 }, { "epoch": 0.19961698772107694, "grad_norm": 1.0273097092583952, "learning_rate": 4.700175937672134e-06, "loss": 0.2096, "step": 443 }, { "epoch": 0.2000675904021629, "grad_norm": 1.0524293589683538, "learning_rate": 4.698404324979066e-06, "loss": 0.2152, "step": 444 }, { "epoch": 0.20051819308324884, "grad_norm": 1.0461725590043855, "learning_rate": 4.696627829518761e-06, "loss": 0.2095, "step": 445 }, { "epoch": 0.2009687957643348, "grad_norm": 1.1052123985360112, "learning_rate": 4.6948464552369075e-06, "loss": 0.2008, "step": 446 }, { "epoch": 0.20141939844542076, "grad_norm": 1.0488786380047408, "learning_rate": 4.693060206090028e-06, "loss": 0.203, "step": 447 }, { "epoch": 0.20187000112650672, "grad_norm": 1.1015751617793508, "learning_rate": 4.69126908604547e-06, "loss": 0.2022, "step": 448 }, { "epoch": 0.20232060380759265, "grad_norm": 1.0223628260749347, "learning_rate": 4.689473099081403e-06, "loss": 0.2077, "step": 449 }, { "epoch": 0.2027712064886786, "grad_norm": 1.1696606518556552, "learning_rate": 4.687672249186805e-06, "loss": 0.2236, "step": 450 }, { "epoch": 0.20322180916976457, "grad_norm": 1.0789601171063559, "learning_rate": 4.685866540361456e-06, "loss": 0.1996, "step": 451 }, { "epoch": 0.2036724118508505, "grad_norm": 1.1208502934238764, "learning_rate": 4.684055976615924e-06, "loss": 0.2077, "step": 452 }, { "epoch": 0.20412301453193646, "grad_norm": 1.1152284615506332, "learning_rate": 4.682240561971565e-06, "loss": 0.2188, "step": 453 }, { "epoch": 0.20457361721302242, "grad_norm": 1.06037564984265, "learning_rate": 4.680420300460505e-06, "loss": 0.2118, "step": 454 }, { "epoch": 0.20502421989410838, "grad_norm": 1.0336715129306815, "learning_rate": 4.678595196125638e-06, "loss": 0.2088, "step": 455 }, { "epoch": 0.2054748225751943, "grad_norm": 1.0917414927010969, "learning_rate": 4.676765253020613e-06, "loss": 0.2027, "step": 456 }, { "epoch": 0.20592542525628027, "grad_norm": 1.115251383762241, "learning_rate": 4.674930475209827e-06, "loss": 0.2171, "step": 457 }, { "epoch": 0.20637602793736623, "grad_norm": 1.0157487847463096, "learning_rate": 4.673090866768412e-06, "loss": 0.2004, "step": 458 }, { "epoch": 0.2068266306184522, "grad_norm": 1.0532015695939707, "learning_rate": 4.671246431782234e-06, "loss": 0.2017, "step": 459 }, { "epoch": 0.20727723329953812, "grad_norm": 1.1298334121350209, "learning_rate": 4.669397174347874e-06, "loss": 0.2153, "step": 460 }, { "epoch": 0.20772783598062408, "grad_norm": 1.1027909606697068, "learning_rate": 4.667543098572627e-06, "loss": 0.206, "step": 461 }, { "epoch": 0.20817843866171004, "grad_norm": 1.088777757241232, "learning_rate": 4.665684208574492e-06, "loss": 0.2272, "step": 462 }, { "epoch": 0.208629041342796, "grad_norm": 1.189137049814458, "learning_rate": 4.6638205084821544e-06, "loss": 0.2178, "step": 463 }, { "epoch": 0.20907964402388193, "grad_norm": 1.0758757066228637, "learning_rate": 4.661952002434988e-06, "loss": 0.2166, "step": 464 }, { "epoch": 0.2095302467049679, "grad_norm": 1.0313498928838842, "learning_rate": 4.660078694583037e-06, "loss": 0.2157, "step": 465 }, { "epoch": 0.20998084938605385, "grad_norm": 1.023803389318251, "learning_rate": 4.658200589087016e-06, "loss": 0.2086, "step": 466 }, { "epoch": 0.2104314520671398, "grad_norm": 0.9904629162792291, "learning_rate": 4.656317690118291e-06, "loss": 0.188, "step": 467 }, { "epoch": 0.21088205474822574, "grad_norm": 1.0719361560549883, "learning_rate": 4.654430001858874e-06, "loss": 0.1975, "step": 468 }, { "epoch": 0.2113326574293117, "grad_norm": 1.0159140827322095, "learning_rate": 4.6525375285014195e-06, "loss": 0.2041, "step": 469 }, { "epoch": 0.21178326011039766, "grad_norm": 1.057379808185924, "learning_rate": 4.650640274249205e-06, "loss": 0.1936, "step": 470 }, { "epoch": 0.2122338627914836, "grad_norm": 1.0517236533065415, "learning_rate": 4.648738243316128e-06, "loss": 0.2069, "step": 471 }, { "epoch": 0.21268446547256956, "grad_norm": 1.283567142637064, "learning_rate": 4.646831439926696e-06, "loss": 0.2162, "step": 472 }, { "epoch": 0.21313506815365552, "grad_norm": 1.0710202659430694, "learning_rate": 4.644919868316014e-06, "loss": 0.2083, "step": 473 }, { "epoch": 0.21358567083474148, "grad_norm": 1.082095821155301, "learning_rate": 4.643003532729783e-06, "loss": 0.2109, "step": 474 }, { "epoch": 0.2140362735158274, "grad_norm": 1.0884595291687782, "learning_rate": 4.641082437424277e-06, "loss": 0.205, "step": 475 }, { "epoch": 0.21448687619691337, "grad_norm": 1.0536622056317448, "learning_rate": 4.639156586666349e-06, "loss": 0.2103, "step": 476 }, { "epoch": 0.21493747887799933, "grad_norm": 1.0232598149017575, "learning_rate": 4.63722598473341e-06, "loss": 0.2066, "step": 477 }, { "epoch": 0.2153880815590853, "grad_norm": 1.033764557474931, "learning_rate": 4.635290635913425e-06, "loss": 0.1858, "step": 478 }, { "epoch": 0.21583868424017122, "grad_norm": 1.0772507958771667, "learning_rate": 4.633350544504899e-06, "loss": 0.2181, "step": 479 }, { "epoch": 0.21628928692125718, "grad_norm": 0.970929042699419, "learning_rate": 4.6314057148168765e-06, "loss": 0.2022, "step": 480 }, { "epoch": 0.21673988960234314, "grad_norm": 1.0328732636920106, "learning_rate": 4.629456151168921e-06, "loss": 0.2078, "step": 481 }, { "epoch": 0.2171904922834291, "grad_norm": 0.9803373687650507, "learning_rate": 4.627501857891113e-06, "loss": 0.1961, "step": 482 }, { "epoch": 0.21764109496451503, "grad_norm": 0.9604150843019789, "learning_rate": 4.625542839324036e-06, "loss": 0.1949, "step": 483 }, { "epoch": 0.218091697645601, "grad_norm": 1.075511254310141, "learning_rate": 4.623579099818769e-06, "loss": 0.2203, "step": 484 }, { "epoch": 0.21854230032668695, "grad_norm": 1.0782563669494776, "learning_rate": 4.621610643736878e-06, "loss": 0.2148, "step": 485 }, { "epoch": 0.2189929030077729, "grad_norm": 1.03653667368632, "learning_rate": 4.6196374754504024e-06, "loss": 0.1953, "step": 486 }, { "epoch": 0.21944350568885884, "grad_norm": 1.0098473894676974, "learning_rate": 4.617659599341849e-06, "loss": 0.1939, "step": 487 }, { "epoch": 0.2198941083699448, "grad_norm": 1.130106809561545, "learning_rate": 4.615677019804182e-06, "loss": 0.2112, "step": 488 }, { "epoch": 0.22034471105103076, "grad_norm": 0.9678884435582474, "learning_rate": 4.6136897412408084e-06, "loss": 0.1814, "step": 489 }, { "epoch": 0.2207953137321167, "grad_norm": 1.013409423056004, "learning_rate": 4.611697768065577e-06, "loss": 0.1911, "step": 490 }, { "epoch": 0.22124591641320265, "grad_norm": 1.075379836070552, "learning_rate": 4.609701104702759e-06, "loss": 0.202, "step": 491 }, { "epoch": 0.2216965190942886, "grad_norm": 1.0091142238608348, "learning_rate": 4.607699755587046e-06, "loss": 0.2074, "step": 492 }, { "epoch": 0.22214712177537457, "grad_norm": 1.0663643831417986, "learning_rate": 4.605693725163536e-06, "loss": 0.1986, "step": 493 }, { "epoch": 0.2225977244564605, "grad_norm": 1.145112521709734, "learning_rate": 4.603683017887722e-06, "loss": 0.2005, "step": 494 }, { "epoch": 0.22304832713754646, "grad_norm": 1.0339947461099595, "learning_rate": 4.6016676382254895e-06, "loss": 0.2077, "step": 495 }, { "epoch": 0.22349892981863242, "grad_norm": 1.0179207841068676, "learning_rate": 4.5996475906530955e-06, "loss": 0.2023, "step": 496 }, { "epoch": 0.22394953249971838, "grad_norm": 1.0543173957144043, "learning_rate": 4.597622879657171e-06, "loss": 0.1885, "step": 497 }, { "epoch": 0.22440013518080432, "grad_norm": 1.052136332232682, "learning_rate": 4.595593509734699e-06, "loss": 0.2052, "step": 498 }, { "epoch": 0.22485073786189028, "grad_norm": 1.0295283490102973, "learning_rate": 4.593559485393015e-06, "loss": 0.199, "step": 499 }, { "epoch": 0.22530134054297624, "grad_norm": 1.0855825905904153, "learning_rate": 4.591520811149787e-06, "loss": 0.2103, "step": 500 }, { "epoch": 0.22530134054297624, "eval_loss": 0.20520994067192078, "eval_runtime": 59.4657, "eval_samples_per_second": 24.132, "eval_steps_per_second": 3.027, "step": 500 }, { "epoch": 0.2257519432240622, "grad_norm": 1.0302681861841023, "learning_rate": 4.589477491533016e-06, "loss": 0.186, "step": 501 }, { "epoch": 0.22620254590514813, "grad_norm": 1.061738831547382, "learning_rate": 4.587429531081019e-06, "loss": 0.2081, "step": 502 }, { "epoch": 0.2266531485862341, "grad_norm": 0.9794242056013339, "learning_rate": 4.585376934342418e-06, "loss": 0.1995, "step": 503 }, { "epoch": 0.22710375126732005, "grad_norm": 0.9686601035730674, "learning_rate": 4.583319705876133e-06, "loss": 0.1873, "step": 504 }, { "epoch": 0.22755435394840598, "grad_norm": 0.9664239720091494, "learning_rate": 4.581257850251376e-06, "loss": 0.1876, "step": 505 }, { "epoch": 0.22800495662949194, "grad_norm": 1.0649310131480245, "learning_rate": 4.579191372047631e-06, "loss": 0.1981, "step": 506 }, { "epoch": 0.2284555593105779, "grad_norm": 1.054654976580904, "learning_rate": 4.577120275854649e-06, "loss": 0.2, "step": 507 }, { "epoch": 0.22890616199166386, "grad_norm": 1.0938236596107724, "learning_rate": 4.5750445662724426e-06, "loss": 0.2133, "step": 508 }, { "epoch": 0.2293567646727498, "grad_norm": 0.9920852942754904, "learning_rate": 4.572964247911265e-06, "loss": 0.1971, "step": 509 }, { "epoch": 0.22980736735383575, "grad_norm": 1.044583342143946, "learning_rate": 4.5708793253916104e-06, "loss": 0.1967, "step": 510 }, { "epoch": 0.2302579700349217, "grad_norm": 1.1506941684890593, "learning_rate": 4.568789803344196e-06, "loss": 0.2156, "step": 511 }, { "epoch": 0.23070857271600767, "grad_norm": 0.9861261159798321, "learning_rate": 4.566695686409957e-06, "loss": 0.1931, "step": 512 }, { "epoch": 0.2311591753970936, "grad_norm": 1.0039176366458205, "learning_rate": 4.564596979240031e-06, "loss": 0.1922, "step": 513 }, { "epoch": 0.23160977807817956, "grad_norm": 1.0205337354605422, "learning_rate": 4.562493686495756e-06, "loss": 0.2001, "step": 514 }, { "epoch": 0.23206038075926552, "grad_norm": 0.9917793017218205, "learning_rate": 4.56038581284865e-06, "loss": 0.2012, "step": 515 }, { "epoch": 0.23251098344035148, "grad_norm": 1.0321116954854799, "learning_rate": 4.558273362980406e-06, "loss": 0.2173, "step": 516 }, { "epoch": 0.23296158612143741, "grad_norm": 1.0540135978614726, "learning_rate": 4.556156341582884e-06, "loss": 0.2176, "step": 517 }, { "epoch": 0.23341218880252337, "grad_norm": 1.026187489177222, "learning_rate": 4.5540347533580935e-06, "loss": 0.1982, "step": 518 }, { "epoch": 0.23386279148360933, "grad_norm": 1.0683916408326868, "learning_rate": 4.551908603018191e-06, "loss": 0.1916, "step": 519 }, { "epoch": 0.2343133941646953, "grad_norm": 0.9800651964726533, "learning_rate": 4.549777895285464e-06, "loss": 0.1928, "step": 520 }, { "epoch": 0.23476399684578123, "grad_norm": 1.0102074038951585, "learning_rate": 4.547642634892321e-06, "loss": 0.2039, "step": 521 }, { "epoch": 0.23521459952686719, "grad_norm": 1.1222993821068836, "learning_rate": 4.545502826581284e-06, "loss": 0.2234, "step": 522 }, { "epoch": 0.23566520220795314, "grad_norm": 1.084335390939449, "learning_rate": 4.543358475104975e-06, "loss": 0.2147, "step": 523 }, { "epoch": 0.23611580488903908, "grad_norm": 1.1110169688938027, "learning_rate": 4.541209585226109e-06, "loss": 0.1997, "step": 524 }, { "epoch": 0.23656640757012504, "grad_norm": 0.9454783484088987, "learning_rate": 4.539056161717477e-06, "loss": 0.1906, "step": 525 }, { "epoch": 0.237017010251211, "grad_norm": 1.0979245101280635, "learning_rate": 4.536898209361942e-06, "loss": 0.2097, "step": 526 }, { "epoch": 0.23746761293229696, "grad_norm": 1.0218511577120122, "learning_rate": 4.5347357329524254e-06, "loss": 0.1942, "step": 527 }, { "epoch": 0.2379182156133829, "grad_norm": 1.1530219699070952, "learning_rate": 4.532568737291898e-06, "loss": 0.2013, "step": 528 }, { "epoch": 0.23836881829446885, "grad_norm": 1.0873274217971285, "learning_rate": 4.530397227193365e-06, "loss": 0.207, "step": 529 }, { "epoch": 0.2388194209755548, "grad_norm": 1.0476737638086648, "learning_rate": 4.528221207479862e-06, "loss": 0.1988, "step": 530 }, { "epoch": 0.23927002365664077, "grad_norm": 1.096263795148243, "learning_rate": 4.5260406829844364e-06, "loss": 0.2012, "step": 531 }, { "epoch": 0.2397206263377267, "grad_norm": 1.030880422031526, "learning_rate": 4.523855658550146e-06, "loss": 0.2027, "step": 532 }, { "epoch": 0.24017122901881266, "grad_norm": 1.0067117254618483, "learning_rate": 4.521666139030039e-06, "loss": 0.1951, "step": 533 }, { "epoch": 0.24062183169989862, "grad_norm": 1.0747100498903996, "learning_rate": 4.51947212928715e-06, "loss": 0.1859, "step": 534 }, { "epoch": 0.24107243438098458, "grad_norm": 1.2309094629093367, "learning_rate": 4.5172736341944845e-06, "loss": 0.222, "step": 535 }, { "epoch": 0.2415230370620705, "grad_norm": 1.103494575925489, "learning_rate": 4.515070658635013e-06, "loss": 0.2129, "step": 536 }, { "epoch": 0.24197363974315647, "grad_norm": 1.0397715812091397, "learning_rate": 4.512863207501654e-06, "loss": 0.1839, "step": 537 }, { "epoch": 0.24242424242424243, "grad_norm": 1.0560302158424433, "learning_rate": 4.510651285697269e-06, "loss": 0.1952, "step": 538 }, { "epoch": 0.24287484510532836, "grad_norm": 1.1120154366274742, "learning_rate": 4.5084348981346495e-06, "loss": 0.2071, "step": 539 }, { "epoch": 0.24332544778641432, "grad_norm": 1.0545051119809872, "learning_rate": 4.506214049736502e-06, "loss": 0.2219, "step": 540 }, { "epoch": 0.24377605046750028, "grad_norm": 1.0352049466055129, "learning_rate": 4.503988745435443e-06, "loss": 0.1836, "step": 541 }, { "epoch": 0.24422665314858624, "grad_norm": 1.0809072576838936, "learning_rate": 4.5017589901739885e-06, "loss": 0.2045, "step": 542 }, { "epoch": 0.24467725582967217, "grad_norm": 1.110072290591729, "learning_rate": 4.499524788904537e-06, "loss": 0.2059, "step": 543 }, { "epoch": 0.24512785851075813, "grad_norm": 0.9665808924396558, "learning_rate": 4.497286146589361e-06, "loss": 0.2011, "step": 544 }, { "epoch": 0.2455784611918441, "grad_norm": 1.0639169166435112, "learning_rate": 4.4950430682005995e-06, "loss": 0.1952, "step": 545 }, { "epoch": 0.24602906387293005, "grad_norm": 1.1497485140437143, "learning_rate": 4.492795558720242e-06, "loss": 0.2067, "step": 546 }, { "epoch": 0.24647966655401599, "grad_norm": 1.0118960522373834, "learning_rate": 4.490543623140123e-06, "loss": 0.1972, "step": 547 }, { "epoch": 0.24693026923510195, "grad_norm": 1.100344333423072, "learning_rate": 4.488287266461904e-06, "loss": 0.2132, "step": 548 }, { "epoch": 0.2473808719161879, "grad_norm": 1.059434492217794, "learning_rate": 4.486026493697067e-06, "loss": 0.1949, "step": 549 }, { "epoch": 0.24783147459727387, "grad_norm": 1.1281625090194913, "learning_rate": 4.483761309866902e-06, "loss": 0.2168, "step": 550 }, { "epoch": 0.2482820772783598, "grad_norm": 1.0960219014600239, "learning_rate": 4.481491720002499e-06, "loss": 0.1988, "step": 551 }, { "epoch": 0.24873267995944576, "grad_norm": 1.0403322611009544, "learning_rate": 4.479217729144731e-06, "loss": 0.2003, "step": 552 }, { "epoch": 0.24918328264053172, "grad_norm": 1.0318746166370159, "learning_rate": 4.476939342344246e-06, "loss": 0.1929, "step": 553 }, { "epoch": 0.24963388532161768, "grad_norm": 1.0679148861170464, "learning_rate": 4.474656564661458e-06, "loss": 0.183, "step": 554 }, { "epoch": 0.2500844880027036, "grad_norm": 1.0258442730168609, "learning_rate": 4.472369401166531e-06, "loss": 0.1892, "step": 555 }, { "epoch": 0.25053509068378954, "grad_norm": 1.065168544541267, "learning_rate": 4.47007785693937e-06, "loss": 0.1855, "step": 556 }, { "epoch": 0.25098569336487553, "grad_norm": 1.0375402257867379, "learning_rate": 4.467781937069611e-06, "loss": 0.1934, "step": 557 }, { "epoch": 0.25143629604596146, "grad_norm": 1.0182166840713762, "learning_rate": 4.465481646656608e-06, "loss": 0.2082, "step": 558 }, { "epoch": 0.25188689872704745, "grad_norm": 0.9643240424365269, "learning_rate": 4.463176990809423e-06, "loss": 0.1913, "step": 559 }, { "epoch": 0.2523375014081334, "grad_norm": 1.049581595009854, "learning_rate": 4.460867974646814e-06, "loss": 0.2022, "step": 560 }, { "epoch": 0.2527881040892193, "grad_norm": 0.9918991445137725, "learning_rate": 4.45855460329722e-06, "loss": 0.1845, "step": 561 }, { "epoch": 0.2532387067703053, "grad_norm": 1.0873121965600965, "learning_rate": 4.45623688189876e-06, "loss": 0.2085, "step": 562 }, { "epoch": 0.25368930945139123, "grad_norm": 0.987846557405897, "learning_rate": 4.453914815599206e-06, "loss": 0.1953, "step": 563 }, { "epoch": 0.25413991213247716, "grad_norm": 1.0298405968391158, "learning_rate": 4.451588409555988e-06, "loss": 0.1962, "step": 564 }, { "epoch": 0.25459051481356315, "grad_norm": 1.0326677560841386, "learning_rate": 4.4492576689361705e-06, "loss": 0.1948, "step": 565 }, { "epoch": 0.2550411174946491, "grad_norm": 1.0519711817977115, "learning_rate": 4.446922598916445e-06, "loss": 0.204, "step": 566 }, { "epoch": 0.25549172017573507, "grad_norm": 1.027262799140467, "learning_rate": 4.444583204683123e-06, "loss": 0.1938, "step": 567 }, { "epoch": 0.255942322856821, "grad_norm": 1.0965809840354055, "learning_rate": 4.4422394914321145e-06, "loss": 0.2047, "step": 568 }, { "epoch": 0.25639292553790693, "grad_norm": 1.049829610661597, "learning_rate": 4.439891464368927e-06, "loss": 0.2064, "step": 569 }, { "epoch": 0.2568435282189929, "grad_norm": 0.9765801119075459, "learning_rate": 4.437539128708647e-06, "loss": 0.183, "step": 570 }, { "epoch": 0.25729413090007885, "grad_norm": 1.058406240013727, "learning_rate": 4.435182489675931e-06, "loss": 0.2043, "step": 571 }, { "epoch": 0.2577447335811648, "grad_norm": 0.9556853971932254, "learning_rate": 4.432821552504994e-06, "loss": 0.1949, "step": 572 }, { "epoch": 0.2581953362622508, "grad_norm": 1.0499296701467336, "learning_rate": 4.430456322439596e-06, "loss": 0.2013, "step": 573 }, { "epoch": 0.2586459389433367, "grad_norm": 1.0824220937552163, "learning_rate": 4.4280868047330325e-06, "loss": 0.2068, "step": 574 }, { "epoch": 0.25909654162442264, "grad_norm": 1.060056337801298, "learning_rate": 4.425713004648123e-06, "loss": 0.2064, "step": 575 }, { "epoch": 0.2595471443055086, "grad_norm": 1.0606451489538105, "learning_rate": 4.423334927457198e-06, "loss": 0.2046, "step": 576 }, { "epoch": 0.25999774698659456, "grad_norm": 1.091280452882288, "learning_rate": 4.420952578442086e-06, "loss": 0.2156, "step": 577 }, { "epoch": 0.26044834966768055, "grad_norm": 0.9696606076862494, "learning_rate": 4.4185659628941054e-06, "loss": 0.1856, "step": 578 }, { "epoch": 0.2608989523487665, "grad_norm": 1.0364418339411665, "learning_rate": 4.416175086114049e-06, "loss": 0.2086, "step": 579 }, { "epoch": 0.2613495550298524, "grad_norm": 1.0784544410291654, "learning_rate": 4.4137799534121785e-06, "loss": 0.1933, "step": 580 }, { "epoch": 0.2618001577109384, "grad_norm": 1.01841478080086, "learning_rate": 4.4113805701082e-06, "loss": 0.1914, "step": 581 }, { "epoch": 0.26225076039202433, "grad_norm": 1.165117484518737, "learning_rate": 4.408976941531269e-06, "loss": 0.2226, "step": 582 }, { "epoch": 0.26270136307311026, "grad_norm": 0.952663941356781, "learning_rate": 4.406569073019965e-06, "loss": 0.2004, "step": 583 }, { "epoch": 0.26315196575419625, "grad_norm": 1.0687364462500708, "learning_rate": 4.404156969922284e-06, "loss": 0.2009, "step": 584 }, { "epoch": 0.2636025684352822, "grad_norm": 0.9840008293163256, "learning_rate": 4.401740637595633e-06, "loss": 0.1878, "step": 585 }, { "epoch": 0.26405317111636817, "grad_norm": 0.9605772910240837, "learning_rate": 4.3993200814068035e-06, "loss": 0.1893, "step": 586 }, { "epoch": 0.2645037737974541, "grad_norm": 0.9941565744960191, "learning_rate": 4.396895306731978e-06, "loss": 0.193, "step": 587 }, { "epoch": 0.26495437647854003, "grad_norm": 0.9451043951281098, "learning_rate": 4.394466318956701e-06, "loss": 0.1861, "step": 588 }, { "epoch": 0.265404979159626, "grad_norm": 0.9357557601128812, "learning_rate": 4.392033123475876e-06, "loss": 0.1928, "step": 589 }, { "epoch": 0.26585558184071195, "grad_norm": 1.211949872669835, "learning_rate": 4.389595725693756e-06, "loss": 0.1953, "step": 590 }, { "epoch": 0.2663061845217979, "grad_norm": 0.9744058315459057, "learning_rate": 4.387154131023924e-06, "loss": 0.1953, "step": 591 }, { "epoch": 0.26675678720288387, "grad_norm": 0.9240075427887562, "learning_rate": 4.384708344889285e-06, "loss": 0.1762, "step": 592 }, { "epoch": 0.2672073898839698, "grad_norm": 0.9646315497774892, "learning_rate": 4.382258372722054e-06, "loss": 0.1925, "step": 593 }, { "epoch": 0.26765799256505574, "grad_norm": 1.1246869317376789, "learning_rate": 4.379804219963742e-06, "loss": 0.2179, "step": 594 }, { "epoch": 0.2681085952461417, "grad_norm": 0.9989771573424515, "learning_rate": 4.377345892065149e-06, "loss": 0.1947, "step": 595 }, { "epoch": 0.26855919792722766, "grad_norm": 1.0192448988756184, "learning_rate": 4.374883394486343e-06, "loss": 0.183, "step": 596 }, { "epoch": 0.26900980060831364, "grad_norm": 1.1840938286800815, "learning_rate": 4.3724167326966575e-06, "loss": 0.2031, "step": 597 }, { "epoch": 0.2694604032893996, "grad_norm": 0.9756787429230731, "learning_rate": 4.3699459121746726e-06, "loss": 0.206, "step": 598 }, { "epoch": 0.2699110059704855, "grad_norm": 1.1107694091005171, "learning_rate": 4.367470938408204e-06, "loss": 0.1869, "step": 599 }, { "epoch": 0.2703616086515715, "grad_norm": 0.9232888391273938, "learning_rate": 4.364991816894296e-06, "loss": 0.1788, "step": 600 }, { "epoch": 0.2708122113326574, "grad_norm": 1.0025886336195018, "learning_rate": 4.362508553139203e-06, "loss": 0.2023, "step": 601 }, { "epoch": 0.27126281401374336, "grad_norm": 1.1433010440724132, "learning_rate": 4.360021152658378e-06, "loss": 0.1994, "step": 602 }, { "epoch": 0.27171341669482935, "grad_norm": 0.9895897645038598, "learning_rate": 4.357529620976463e-06, "loss": 0.1883, "step": 603 }, { "epoch": 0.2721640193759153, "grad_norm": 1.0296556364889053, "learning_rate": 4.355033963627277e-06, "loss": 0.2105, "step": 604 }, { "epoch": 0.27261462205700127, "grad_norm": 1.1099119086432157, "learning_rate": 4.352534186153802e-06, "loss": 0.1953, "step": 605 }, { "epoch": 0.2730652247380872, "grad_norm": 1.0549408039081614, "learning_rate": 4.3500302941081685e-06, "loss": 0.1977, "step": 606 }, { "epoch": 0.27351582741917313, "grad_norm": 1.039314334394156, "learning_rate": 4.3475222930516484e-06, "loss": 0.2092, "step": 607 }, { "epoch": 0.2739664301002591, "grad_norm": 1.1203701706687121, "learning_rate": 4.345010188554638e-06, "loss": 0.1932, "step": 608 }, { "epoch": 0.27441703278134505, "grad_norm": 1.0628458411287045, "learning_rate": 4.34249398619665e-06, "loss": 0.1841, "step": 609 }, { "epoch": 0.274867635462431, "grad_norm": 1.1398229296350408, "learning_rate": 4.339973691566297e-06, "loss": 0.2031, "step": 610 }, { "epoch": 0.27531823814351697, "grad_norm": 0.9514446506895067, "learning_rate": 4.337449310261279e-06, "loss": 0.1942, "step": 611 }, { "epoch": 0.2757688408246029, "grad_norm": 1.061471271498229, "learning_rate": 4.334920847888376e-06, "loss": 0.217, "step": 612 }, { "epoch": 0.27621944350568883, "grad_norm": 1.0214906979222622, "learning_rate": 4.332388310063431e-06, "loss": 0.1981, "step": 613 }, { "epoch": 0.2766700461867748, "grad_norm": 1.0346404733819128, "learning_rate": 4.329851702411339e-06, "loss": 0.1946, "step": 614 }, { "epoch": 0.27712064886786075, "grad_norm": 1.002298040924044, "learning_rate": 4.327311030566033e-06, "loss": 0.1962, "step": 615 }, { "epoch": 0.27757125154894674, "grad_norm": 1.0160693312542004, "learning_rate": 4.324766300170473e-06, "loss": 0.2045, "step": 616 }, { "epoch": 0.2780218542300327, "grad_norm": 1.0621190326896313, "learning_rate": 4.322217516876635e-06, "loss": 0.1997, "step": 617 }, { "epoch": 0.2784724569111186, "grad_norm": 1.044710760150364, "learning_rate": 4.3196646863454975e-06, "loss": 0.1975, "step": 618 }, { "epoch": 0.2789230595922046, "grad_norm": 1.0565566812202085, "learning_rate": 4.317107814247022e-06, "loss": 0.2179, "step": 619 }, { "epoch": 0.2793736622732905, "grad_norm": 1.0822662595100792, "learning_rate": 4.314546906260156e-06, "loss": 0.2171, "step": 620 }, { "epoch": 0.27982426495437646, "grad_norm": 0.9847692683483855, "learning_rate": 4.3119819680728e-06, "loss": 0.1789, "step": 621 }, { "epoch": 0.28027486763546244, "grad_norm": 1.0377605692374061, "learning_rate": 4.3094130053818164e-06, "loss": 0.2069, "step": 622 }, { "epoch": 0.2807254703165484, "grad_norm": 1.064720287531545, "learning_rate": 4.306840023892998e-06, "loss": 0.1854, "step": 623 }, { "epoch": 0.28117607299763436, "grad_norm": 1.033767676273761, "learning_rate": 4.304263029321069e-06, "loss": 0.1931, "step": 624 }, { "epoch": 0.2816266756787203, "grad_norm": 0.8815106297337543, "learning_rate": 4.301682027389663e-06, "loss": 0.1676, "step": 625 }, { "epoch": 0.2820772783598062, "grad_norm": 1.014281098690602, "learning_rate": 4.299097023831318e-06, "loss": 0.1952, "step": 626 }, { "epoch": 0.2825278810408922, "grad_norm": 1.0085581349510135, "learning_rate": 4.2965080243874555e-06, "loss": 0.1873, "step": 627 }, { "epoch": 0.28297848372197815, "grad_norm": 1.030623931702184, "learning_rate": 4.293915034808376e-06, "loss": 0.1922, "step": 628 }, { "epoch": 0.2834290864030641, "grad_norm": 1.0188262260920022, "learning_rate": 4.29131806085324e-06, "loss": 0.1966, "step": 629 }, { "epoch": 0.28387968908415007, "grad_norm": 1.0037022727298737, "learning_rate": 4.288717108290056e-06, "loss": 0.2009, "step": 630 }, { "epoch": 0.284330291765236, "grad_norm": 1.0271567668441708, "learning_rate": 4.2861121828956745e-06, "loss": 0.2024, "step": 631 }, { "epoch": 0.28478089444632193, "grad_norm": 1.012705939797216, "learning_rate": 4.283503290455765e-06, "loss": 0.1863, "step": 632 }, { "epoch": 0.2852314971274079, "grad_norm": 1.0628715517829384, "learning_rate": 4.28089043676481e-06, "loss": 0.2175, "step": 633 }, { "epoch": 0.28568209980849385, "grad_norm": 1.0106381943649927, "learning_rate": 4.27827362762609e-06, "loss": 0.2087, "step": 634 }, { "epoch": 0.28613270248957984, "grad_norm": 1.0630146592405614, "learning_rate": 4.275652868851669e-06, "loss": 0.1938, "step": 635 }, { "epoch": 0.28658330517066577, "grad_norm": 1.0499418639693274, "learning_rate": 4.2730281662623866e-06, "loss": 0.1864, "step": 636 }, { "epoch": 0.2870339078517517, "grad_norm": 1.0016998697053148, "learning_rate": 4.270399525687839e-06, "loss": 0.1993, "step": 637 }, { "epoch": 0.2874845105328377, "grad_norm": 1.0127517858972737, "learning_rate": 4.267766952966369e-06, "loss": 0.1863, "step": 638 }, { "epoch": 0.2879351132139236, "grad_norm": 1.0939363436908969, "learning_rate": 4.265130453945056e-06, "loss": 0.1922, "step": 639 }, { "epoch": 0.28838571589500955, "grad_norm": 1.0282674433305248, "learning_rate": 4.262490034479697e-06, "loss": 0.1974, "step": 640 }, { "epoch": 0.28883631857609554, "grad_norm": 1.0713994069774757, "learning_rate": 4.259845700434797e-06, "loss": 0.2178, "step": 641 }, { "epoch": 0.2892869212571815, "grad_norm": 1.0095365158524388, "learning_rate": 4.257197457683556e-06, "loss": 0.1984, "step": 642 }, { "epoch": 0.2897375239382674, "grad_norm": 0.9517358327424785, "learning_rate": 4.254545312107854e-06, "loss": 0.1995, "step": 643 }, { "epoch": 0.2901881266193534, "grad_norm": 0.9988737112663147, "learning_rate": 4.251889269598241e-06, "loss": 0.201, "step": 644 }, { "epoch": 0.2906387293004393, "grad_norm": 0.9491077816555287, "learning_rate": 4.249229336053924e-06, "loss": 0.1948, "step": 645 }, { "epoch": 0.2910893319815253, "grad_norm": 0.9177074871325192, "learning_rate": 4.2465655173827465e-06, "loss": 0.1875, "step": 646 }, { "epoch": 0.29153993466261124, "grad_norm": 1.0676678310448093, "learning_rate": 4.243897819501187e-06, "loss": 0.2117, "step": 647 }, { "epoch": 0.2919905373436972, "grad_norm": 0.9838253108409748, "learning_rate": 4.241226248334335e-06, "loss": 0.1998, "step": 648 }, { "epoch": 0.29244114002478316, "grad_norm": 0.9928941143354909, "learning_rate": 4.238550809815889e-06, "loss": 0.1895, "step": 649 }, { "epoch": 0.2928917427058691, "grad_norm": 1.0128875324516162, "learning_rate": 4.23587150988813e-06, "loss": 0.2203, "step": 650 }, { "epoch": 0.29334234538695503, "grad_norm": 1.028303567499585, "learning_rate": 4.233188354501921e-06, "loss": 0.206, "step": 651 }, { "epoch": 0.293792948068041, "grad_norm": 1.1145425080945328, "learning_rate": 4.230501349616683e-06, "loss": 0.2088, "step": 652 }, { "epoch": 0.29424355074912695, "grad_norm": 1.1234390517892725, "learning_rate": 4.227810501200393e-06, "loss": 0.1889, "step": 653 }, { "epoch": 0.29469415343021294, "grad_norm": 0.9793683475808044, "learning_rate": 4.225115815229559e-06, "loss": 0.1878, "step": 654 }, { "epoch": 0.29514475611129887, "grad_norm": 1.0115361523498685, "learning_rate": 4.222417297689217e-06, "loss": 0.1919, "step": 655 }, { "epoch": 0.2955953587923848, "grad_norm": 1.0416286762299183, "learning_rate": 4.219714954572909e-06, "loss": 0.1994, "step": 656 }, { "epoch": 0.2960459614734708, "grad_norm": 0.9552050906912731, "learning_rate": 4.217008791882678e-06, "loss": 0.1893, "step": 657 }, { "epoch": 0.2964965641545567, "grad_norm": 1.0577253087294156, "learning_rate": 4.214298815629046e-06, "loss": 0.1884, "step": 658 }, { "epoch": 0.29694716683564265, "grad_norm": 0.9647606522962644, "learning_rate": 4.211585031831007e-06, "loss": 0.1953, "step": 659 }, { "epoch": 0.29739776951672864, "grad_norm": 0.9562447824803063, "learning_rate": 4.208867446516015e-06, "loss": 0.1855, "step": 660 }, { "epoch": 0.29784837219781457, "grad_norm": 0.972853425024187, "learning_rate": 4.206146065719963e-06, "loss": 0.1819, "step": 661 }, { "epoch": 0.2982989748789005, "grad_norm": 0.9787638447089178, "learning_rate": 4.203420895487175e-06, "loss": 0.2029, "step": 662 }, { "epoch": 0.2987495775599865, "grad_norm": 0.9759659661884763, "learning_rate": 4.200691941870392e-06, "loss": 0.2023, "step": 663 }, { "epoch": 0.2992001802410724, "grad_norm": 0.9943286482597673, "learning_rate": 4.197959210930759e-06, "loss": 0.1978, "step": 664 }, { "epoch": 0.2996507829221584, "grad_norm": 0.9801408645055518, "learning_rate": 4.195222708737809e-06, "loss": 0.2013, "step": 665 }, { "epoch": 0.30010138560324434, "grad_norm": 1.0539478116036578, "learning_rate": 4.192482441369451e-06, "loss": 0.2035, "step": 666 }, { "epoch": 0.3005519882843303, "grad_norm": 1.0997940239624653, "learning_rate": 4.189738414911959e-06, "loss": 0.22, "step": 667 }, { "epoch": 0.30100259096541626, "grad_norm": 0.9853856855339789, "learning_rate": 4.186990635459954e-06, "loss": 0.2016, "step": 668 }, { "epoch": 0.3014531936465022, "grad_norm": 1.1197830097140093, "learning_rate": 4.184239109116393e-06, "loss": 0.2102, "step": 669 }, { "epoch": 0.3019037963275881, "grad_norm": 1.0300352624715958, "learning_rate": 4.181483841992556e-06, "loss": 0.2018, "step": 670 }, { "epoch": 0.3023543990086741, "grad_norm": 1.0137327368224471, "learning_rate": 4.178724840208029e-06, "loss": 0.1986, "step": 671 }, { "epoch": 0.30280500168976004, "grad_norm": 0.9204806350681561, "learning_rate": 4.175962109890697e-06, "loss": 0.1813, "step": 672 }, { "epoch": 0.30325560437084603, "grad_norm": 1.0673745916166968, "learning_rate": 4.1731956571767215e-06, "loss": 0.1998, "step": 673 }, { "epoch": 0.30370620705193196, "grad_norm": 1.0160656530915944, "learning_rate": 4.170425488210534e-06, "loss": 0.207, "step": 674 }, { "epoch": 0.3041568097330179, "grad_norm": 1.054838095733939, "learning_rate": 4.167651609144822e-06, "loss": 0.1942, "step": 675 }, { "epoch": 0.3046074124141039, "grad_norm": 1.077041886917572, "learning_rate": 4.164874026140511e-06, "loss": 0.2014, "step": 676 }, { "epoch": 0.3050580150951898, "grad_norm": 1.0857669925430111, "learning_rate": 4.1620927453667515e-06, "loss": 0.2099, "step": 677 }, { "epoch": 0.30550861777627575, "grad_norm": 1.0608069365987411, "learning_rate": 4.159307773000909e-06, "loss": 0.1873, "step": 678 }, { "epoch": 0.30595922045736174, "grad_norm": 0.957307574538134, "learning_rate": 4.15651911522855e-06, "loss": 0.1796, "step": 679 }, { "epoch": 0.30640982313844767, "grad_norm": 1.0234422261068838, "learning_rate": 4.153726778243422e-06, "loss": 0.1903, "step": 680 }, { "epoch": 0.3068604258195336, "grad_norm": 1.03627065798202, "learning_rate": 4.150930768247449e-06, "loss": 0.1859, "step": 681 }, { "epoch": 0.3073110285006196, "grad_norm": 1.0456496616827005, "learning_rate": 4.148131091450709e-06, "loss": 0.1975, "step": 682 }, { "epoch": 0.3077616311817055, "grad_norm": 1.0630130936439768, "learning_rate": 4.145327754071427e-06, "loss": 0.2077, "step": 683 }, { "epoch": 0.3082122338627915, "grad_norm": 1.065688708892473, "learning_rate": 4.142520762335957e-06, "loss": 0.1849, "step": 684 }, { "epoch": 0.30866283654387744, "grad_norm": 1.1263818513906516, "learning_rate": 4.13971012247877e-06, "loss": 0.2117, "step": 685 }, { "epoch": 0.30911343922496337, "grad_norm": 1.0666870069787027, "learning_rate": 4.136895840742437e-06, "loss": 0.2082, "step": 686 }, { "epoch": 0.30956404190604936, "grad_norm": 1.1195428726866035, "learning_rate": 4.134077923377622e-06, "loss": 0.2128, "step": 687 }, { "epoch": 0.3100146445871353, "grad_norm": 0.9706588710375661, "learning_rate": 4.131256376643062e-06, "loss": 0.194, "step": 688 }, { "epoch": 0.3104652472682212, "grad_norm": 0.9841129879832418, "learning_rate": 4.128431206805556e-06, "loss": 0.191, "step": 689 }, { "epoch": 0.3109158499493072, "grad_norm": 0.964224745997007, "learning_rate": 4.125602420139947e-06, "loss": 0.1909, "step": 690 }, { "epoch": 0.31136645263039314, "grad_norm": 0.9875736538745279, "learning_rate": 4.122770022929114e-06, "loss": 0.1862, "step": 691 }, { "epoch": 0.31181705531147913, "grad_norm": 1.0739578378289165, "learning_rate": 4.119934021463956e-06, "loss": 0.2165, "step": 692 }, { "epoch": 0.31226765799256506, "grad_norm": 0.9672865623771372, "learning_rate": 4.117094422043374e-06, "loss": 0.1858, "step": 693 }, { "epoch": 0.312718260673651, "grad_norm": 1.0671701635171398, "learning_rate": 4.114251230974263e-06, "loss": 0.2117, "step": 694 }, { "epoch": 0.313168863354737, "grad_norm": 1.0091868850822507, "learning_rate": 4.1114044545714935e-06, "loss": 0.2034, "step": 695 }, { "epoch": 0.3136194660358229, "grad_norm": 1.025473722449347, "learning_rate": 4.1085540991579e-06, "loss": 0.2114, "step": 696 }, { "epoch": 0.31407006871690885, "grad_norm": 0.9791997644517443, "learning_rate": 4.105700171064267e-06, "loss": 0.1877, "step": 697 }, { "epoch": 0.31452067139799483, "grad_norm": 1.0485648239190921, "learning_rate": 4.102842676629313e-06, "loss": 0.2098, "step": 698 }, { "epoch": 0.31497127407908077, "grad_norm": 0.9932204868367267, "learning_rate": 4.0999816221996755e-06, "loss": 0.2016, "step": 699 }, { "epoch": 0.3154218767601667, "grad_norm": 1.0052350903793839, "learning_rate": 4.097117014129903e-06, "loss": 0.2031, "step": 700 }, { "epoch": 0.3158724794412527, "grad_norm": 1.0588148231889312, "learning_rate": 4.094248858782436e-06, "loss": 0.2059, "step": 701 }, { "epoch": 0.3163230821223386, "grad_norm": 0.9062525913899364, "learning_rate": 4.091377162527592e-06, "loss": 0.1751, "step": 702 }, { "epoch": 0.3167736848034246, "grad_norm": 1.033952410424911, "learning_rate": 4.088501931743551e-06, "loss": 0.1969, "step": 703 }, { "epoch": 0.31722428748451054, "grad_norm": 1.0173227691023725, "learning_rate": 4.08562317281635e-06, "loss": 0.1939, "step": 704 }, { "epoch": 0.31767489016559647, "grad_norm": 0.9377764057356622, "learning_rate": 4.082740892139856e-06, "loss": 0.1808, "step": 705 }, { "epoch": 0.31812549284668246, "grad_norm": 1.0885942813084413, "learning_rate": 4.07985509611576e-06, "loss": 0.1933, "step": 706 }, { "epoch": 0.3185760955277684, "grad_norm": 1.0460613454763035, "learning_rate": 4.076965791153562e-06, "loss": 0.1853, "step": 707 }, { "epoch": 0.3190266982088543, "grad_norm": 1.0112840919837836, "learning_rate": 4.074072983670555e-06, "loss": 0.1992, "step": 708 }, { "epoch": 0.3194773008899403, "grad_norm": 1.0035685646513803, "learning_rate": 4.071176680091809e-06, "loss": 0.1914, "step": 709 }, { "epoch": 0.31992790357102624, "grad_norm": 0.9455942563033818, "learning_rate": 4.068276886850162e-06, "loss": 0.1977, "step": 710 }, { "epoch": 0.3203785062521122, "grad_norm": 1.0248608488543998, "learning_rate": 4.065373610386201e-06, "loss": 0.2098, "step": 711 }, { "epoch": 0.32082910893319816, "grad_norm": 1.0833442868468053, "learning_rate": 4.06246685714825e-06, "loss": 0.1971, "step": 712 }, { "epoch": 0.3212797116142841, "grad_norm": 1.0376988954502842, "learning_rate": 4.059556633592356e-06, "loss": 0.2036, "step": 713 }, { "epoch": 0.3217303142953701, "grad_norm": 0.9076161254666845, "learning_rate": 4.056642946182271e-06, "loss": 0.181, "step": 714 }, { "epoch": 0.322180916976456, "grad_norm": 1.4291617734098425, "learning_rate": 4.0537258013894434e-06, "loss": 0.1913, "step": 715 }, { "epoch": 0.32263151965754194, "grad_norm": 1.0678912008243973, "learning_rate": 4.0508052056929995e-06, "loss": 0.2181, "step": 716 }, { "epoch": 0.32308212233862793, "grad_norm": 1.007985475433121, "learning_rate": 4.047881165579729e-06, "loss": 0.1888, "step": 717 }, { "epoch": 0.32353272501971386, "grad_norm": 0.9951732450674088, "learning_rate": 4.044953687544074e-06, "loss": 0.1886, "step": 718 }, { "epoch": 0.3239833277007998, "grad_norm": 1.0522626864012918, "learning_rate": 4.042022778088111e-06, "loss": 0.1925, "step": 719 }, { "epoch": 0.3244339303818858, "grad_norm": 1.1114385816164356, "learning_rate": 4.039088443721538e-06, "loss": 0.2056, "step": 720 }, { "epoch": 0.3248845330629717, "grad_norm": 1.1086070930098275, "learning_rate": 4.03615069096166e-06, "loss": 0.201, "step": 721 }, { "epoch": 0.3253351357440577, "grad_norm": 1.1126787479550855, "learning_rate": 4.033209526333375e-06, "loss": 0.1989, "step": 722 }, { "epoch": 0.32578573842514363, "grad_norm": 1.1134234887380865, "learning_rate": 4.030264956369158e-06, "loss": 0.1872, "step": 723 }, { "epoch": 0.32623634110622957, "grad_norm": 0.9526105310356239, "learning_rate": 4.0273169876090475e-06, "loss": 0.1684, "step": 724 }, { "epoch": 0.32668694378731555, "grad_norm": 1.0535295444698698, "learning_rate": 4.024365626600632e-06, "loss": 0.1918, "step": 725 }, { "epoch": 0.3271375464684015, "grad_norm": 1.0045054252904533, "learning_rate": 4.021410879899035e-06, "loss": 0.1906, "step": 726 }, { "epoch": 0.3275881491494874, "grad_norm": 0.9547446035752759, "learning_rate": 4.018452754066895e-06, "loss": 0.1938, "step": 727 }, { "epoch": 0.3280387518305734, "grad_norm": 0.9450343638165539, "learning_rate": 4.015491255674362e-06, "loss": 0.1848, "step": 728 }, { "epoch": 0.32848935451165934, "grad_norm": 0.9879269597269819, "learning_rate": 4.012526391299073e-06, "loss": 0.1902, "step": 729 }, { "epoch": 0.32893995719274527, "grad_norm": 1.038235237988296, "learning_rate": 4.0095581675261405e-06, "loss": 0.184, "step": 730 }, { "epoch": 0.32939055987383126, "grad_norm": 1.0063041190990036, "learning_rate": 4.006586590948141e-06, "loss": 0.1944, "step": 731 }, { "epoch": 0.3298411625549172, "grad_norm": 1.0159279699242378, "learning_rate": 4.003611668165097e-06, "loss": 0.2028, "step": 732 }, { "epoch": 0.3302917652360032, "grad_norm": 0.963739146223804, "learning_rate": 4.000633405784461e-06, "loss": 0.1932, "step": 733 }, { "epoch": 0.3307423679170891, "grad_norm": 1.1048402537588942, "learning_rate": 3.997651810421106e-06, "loss": 0.1941, "step": 734 }, { "epoch": 0.33119297059817504, "grad_norm": 1.159578027342481, "learning_rate": 3.994666888697304e-06, "loss": 0.2027, "step": 735 }, { "epoch": 0.33164357327926103, "grad_norm": 1.1470168711955169, "learning_rate": 3.991678647242719e-06, "loss": 0.1942, "step": 736 }, { "epoch": 0.33209417596034696, "grad_norm": 0.9693885269072713, "learning_rate": 3.988687092694386e-06, "loss": 0.1848, "step": 737 }, { "epoch": 0.3325447786414329, "grad_norm": 1.0257260940401318, "learning_rate": 3.985692231696699e-06, "loss": 0.1778, "step": 738 }, { "epoch": 0.3329953813225189, "grad_norm": 1.0951165232433586, "learning_rate": 3.982694070901396e-06, "loss": 0.1892, "step": 739 }, { "epoch": 0.3334459840036048, "grad_norm": 1.0589651968870157, "learning_rate": 3.979692616967543e-06, "loss": 0.2101, "step": 740 }, { "epoch": 0.3338965866846908, "grad_norm": 1.0310320306160434, "learning_rate": 3.976687876561523e-06, "loss": 0.1834, "step": 741 }, { "epoch": 0.33434718936577673, "grad_norm": 1.0199776554567461, "learning_rate": 3.973679856357014e-06, "loss": 0.2089, "step": 742 }, { "epoch": 0.33479779204686266, "grad_norm": 1.0407462383516777, "learning_rate": 3.970668563034982e-06, "loss": 0.206, "step": 743 }, { "epoch": 0.33524839472794865, "grad_norm": 0.9809522973667985, "learning_rate": 3.967654003283662e-06, "loss": 0.1841, "step": 744 }, { "epoch": 0.3356989974090346, "grad_norm": 1.0533085891523293, "learning_rate": 3.9646361837985435e-06, "loss": 0.1955, "step": 745 }, { "epoch": 0.3361496000901205, "grad_norm": 0.9753893521548701, "learning_rate": 3.961615111282357e-06, "loss": 0.2019, "step": 746 }, { "epoch": 0.3366002027712065, "grad_norm": 0.9561918974139727, "learning_rate": 3.958590792445057e-06, "loss": 0.1904, "step": 747 }, { "epoch": 0.33705080545229243, "grad_norm": 0.9983077525053788, "learning_rate": 3.9555632340038075e-06, "loss": 0.2079, "step": 748 }, { "epoch": 0.33750140813337837, "grad_norm": 0.9907836859881152, "learning_rate": 3.9525324426829716e-06, "loss": 0.2009, "step": 749 }, { "epoch": 0.33795201081446435, "grad_norm": 1.02928274413019, "learning_rate": 3.949498425214088e-06, "loss": 0.1992, "step": 750 }, { "epoch": 0.3384026134955503, "grad_norm": 0.971530439209323, "learning_rate": 3.946461188335863e-06, "loss": 0.2045, "step": 751 }, { "epoch": 0.3388532161766363, "grad_norm": 0.9787331384874378, "learning_rate": 3.943420738794153e-06, "loss": 0.1934, "step": 752 }, { "epoch": 0.3393038188577222, "grad_norm": 1.0152487571688742, "learning_rate": 3.9403770833419535e-06, "loss": 0.1958, "step": 753 }, { "epoch": 0.33975442153880814, "grad_norm": 1.0198547090516574, "learning_rate": 3.937330228739374e-06, "loss": 0.1917, "step": 754 }, { "epoch": 0.3402050242198941, "grad_norm": 0.9536730400178354, "learning_rate": 3.934280181753634e-06, "loss": 0.1938, "step": 755 }, { "epoch": 0.34065562690098006, "grad_norm": 0.9630484450286109, "learning_rate": 3.931226949159041e-06, "loss": 0.2046, "step": 756 }, { "epoch": 0.341106229582066, "grad_norm": 0.9695068944165888, "learning_rate": 3.9281705377369814e-06, "loss": 0.1848, "step": 757 }, { "epoch": 0.341556832263152, "grad_norm": 1.0305403392456742, "learning_rate": 3.925110954275897e-06, "loss": 0.2004, "step": 758 }, { "epoch": 0.3420074349442379, "grad_norm": 1.0173098492440267, "learning_rate": 3.922048205571279e-06, "loss": 0.2033, "step": 759 }, { "epoch": 0.3424580376253239, "grad_norm": 1.005538120788323, "learning_rate": 3.918982298425647e-06, "loss": 0.1996, "step": 760 }, { "epoch": 0.34290864030640983, "grad_norm": 0.9893565295690564, "learning_rate": 3.915913239648535e-06, "loss": 0.1974, "step": 761 }, { "epoch": 0.34335924298749576, "grad_norm": 0.9822976359231225, "learning_rate": 3.91284103605648e-06, "loss": 0.195, "step": 762 }, { "epoch": 0.34380984566858175, "grad_norm": 0.9662221221778032, "learning_rate": 3.909765694473e-06, "loss": 0.2002, "step": 763 }, { "epoch": 0.3442604483496677, "grad_norm": 0.9521020154940343, "learning_rate": 3.906687221728583e-06, "loss": 0.2094, "step": 764 }, { "epoch": 0.3447110510307536, "grad_norm": 1.0427912181383632, "learning_rate": 3.903605624660676e-06, "loss": 0.2075, "step": 765 }, { "epoch": 0.3451616537118396, "grad_norm": 1.0258973989098399, "learning_rate": 3.900520910113659e-06, "loss": 0.1995, "step": 766 }, { "epoch": 0.34561225639292553, "grad_norm": 0.9627094669670759, "learning_rate": 3.897433084938841e-06, "loss": 0.1969, "step": 767 }, { "epoch": 0.34606285907401146, "grad_norm": 1.013620020316575, "learning_rate": 3.894342155994437e-06, "loss": 0.2025, "step": 768 }, { "epoch": 0.34651346175509745, "grad_norm": 1.0600168839850888, "learning_rate": 3.891248130145556e-06, "loss": 0.1916, "step": 769 }, { "epoch": 0.3469640644361834, "grad_norm": 1.0029113853836964, "learning_rate": 3.888151014264189e-06, "loss": 0.2109, "step": 770 }, { "epoch": 0.34741466711726937, "grad_norm": 0.940613635155343, "learning_rate": 3.885050815229182e-06, "loss": 0.1868, "step": 771 }, { "epoch": 0.3478652697983553, "grad_norm": 1.032773679790588, "learning_rate": 3.881947539926239e-06, "loss": 0.1952, "step": 772 }, { "epoch": 0.34831587247944124, "grad_norm": 1.0247046836218952, "learning_rate": 3.878841195247888e-06, "loss": 0.2031, "step": 773 }, { "epoch": 0.3487664751605272, "grad_norm": 1.0044205882937856, "learning_rate": 3.875731788093478e-06, "loss": 0.2025, "step": 774 }, { "epoch": 0.34921707784161315, "grad_norm": 1.0192088758913846, "learning_rate": 3.872619325369162e-06, "loss": 0.2009, "step": 775 }, { "epoch": 0.3496676805226991, "grad_norm": 1.041923818288308, "learning_rate": 3.869503813987876e-06, "loss": 0.2059, "step": 776 }, { "epoch": 0.3501182832037851, "grad_norm": 0.9724353251647057, "learning_rate": 3.866385260869327e-06, "loss": 0.1917, "step": 777 }, { "epoch": 0.350568885884871, "grad_norm": 1.0820177279797962, "learning_rate": 3.8632636729399815e-06, "loss": 0.1858, "step": 778 }, { "epoch": 0.351019488565957, "grad_norm": 0.9825284921649515, "learning_rate": 3.860139057133042e-06, "loss": 0.1897, "step": 779 }, { "epoch": 0.3514700912470429, "grad_norm": 1.0056511814944353, "learning_rate": 3.85701142038844e-06, "loss": 0.1978, "step": 780 }, { "epoch": 0.35192069392812886, "grad_norm": 0.9430992622095392, "learning_rate": 3.853880769652815e-06, "loss": 0.1803, "step": 781 }, { "epoch": 0.35237129660921485, "grad_norm": 1.0122134922297075, "learning_rate": 3.850747111879499e-06, "loss": 0.2087, "step": 782 }, { "epoch": 0.3528218992903008, "grad_norm": 0.9881164713411805, "learning_rate": 3.8476104540285054e-06, "loss": 0.1887, "step": 783 }, { "epoch": 0.3532725019713867, "grad_norm": 0.9507298728508743, "learning_rate": 3.84447080306651e-06, "loss": 0.1912, "step": 784 }, { "epoch": 0.3537231046524727, "grad_norm": 1.022312185699959, "learning_rate": 3.841328165966837e-06, "loss": 0.188, "step": 785 }, { "epoch": 0.35417370733355863, "grad_norm": 0.9611389218860413, "learning_rate": 3.838182549709442e-06, "loss": 0.1833, "step": 786 }, { "epoch": 0.35462431001464456, "grad_norm": 0.8927961208592826, "learning_rate": 3.835033961280898e-06, "loss": 0.1792, "step": 787 }, { "epoch": 0.35507491269573055, "grad_norm": 0.9756392838597563, "learning_rate": 3.831882407674379e-06, "loss": 0.2036, "step": 788 }, { "epoch": 0.3555255153768165, "grad_norm": 1.057959342523254, "learning_rate": 3.828727895889644e-06, "loss": 0.2099, "step": 789 }, { "epoch": 0.35597611805790247, "grad_norm": 1.0103033156169718, "learning_rate": 3.825570432933026e-06, "loss": 0.2006, "step": 790 }, { "epoch": 0.3564267207389884, "grad_norm": 1.0889593061950777, "learning_rate": 3.8224100258174066e-06, "loss": 0.2034, "step": 791 }, { "epoch": 0.35687732342007433, "grad_norm": 1.0371619150509108, "learning_rate": 3.819246681562212e-06, "loss": 0.1879, "step": 792 }, { "epoch": 0.3573279261011603, "grad_norm": 1.0877887937676791, "learning_rate": 3.81608040719339e-06, "loss": 0.2106, "step": 793 }, { "epoch": 0.35777852878224625, "grad_norm": 1.1276852463298828, "learning_rate": 3.812911209743395e-06, "loss": 0.1981, "step": 794 }, { "epoch": 0.3582291314633322, "grad_norm": 1.0746615401972357, "learning_rate": 3.809739096251176e-06, "loss": 0.207, "step": 795 }, { "epoch": 0.35867973414441817, "grad_norm": 1.1233647909244424, "learning_rate": 3.8065640737621566e-06, "loss": 0.2118, "step": 796 }, { "epoch": 0.3591303368255041, "grad_norm": 0.932081242806347, "learning_rate": 3.803386149328223e-06, "loss": 0.1762, "step": 797 }, { "epoch": 0.3595809395065901, "grad_norm": 0.9944199466037139, "learning_rate": 3.8002053300077056e-06, "loss": 0.2005, "step": 798 }, { "epoch": 0.360031542187676, "grad_norm": 0.9892400702131467, "learning_rate": 3.7970216228653667e-06, "loss": 0.2013, "step": 799 }, { "epoch": 0.36048214486876196, "grad_norm": 1.0418347804871562, "learning_rate": 3.7938350349723784e-06, "loss": 0.2153, "step": 800 }, { "epoch": 0.36093274754984794, "grad_norm": 0.967000729688822, "learning_rate": 3.7906455734063156e-06, "loss": 0.1836, "step": 801 }, { "epoch": 0.3613833502309339, "grad_norm": 1.0134046794785294, "learning_rate": 3.7874532452511324e-06, "loss": 0.194, "step": 802 }, { "epoch": 0.3618339529120198, "grad_norm": 0.963474210883665, "learning_rate": 3.7842580575971533e-06, "loss": 0.1878, "step": 803 }, { "epoch": 0.3622845555931058, "grad_norm": 0.9863618310097502, "learning_rate": 3.7810600175410493e-06, "loss": 0.2038, "step": 804 }, { "epoch": 0.3627351582741917, "grad_norm": 1.0560263462424802, "learning_rate": 3.77785913218583e-06, "loss": 0.2048, "step": 805 }, { "epoch": 0.36318576095527766, "grad_norm": 0.9669821875912328, "learning_rate": 3.7746554086408245e-06, "loss": 0.1921, "step": 806 }, { "epoch": 0.36363636363636365, "grad_norm": 1.0005888075873781, "learning_rate": 3.7714488540216637e-06, "loss": 0.1807, "step": 807 }, { "epoch": 0.3640869663174496, "grad_norm": 1.0506705011418698, "learning_rate": 3.7682394754502687e-06, "loss": 0.2082, "step": 808 }, { "epoch": 0.36453756899853557, "grad_norm": 1.0058206144540291, "learning_rate": 3.7650272800548316e-06, "loss": 0.1972, "step": 809 }, { "epoch": 0.3649881716796215, "grad_norm": 1.015804371446931, "learning_rate": 3.7618122749697993e-06, "loss": 0.1856, "step": 810 }, { "epoch": 0.36543877436070743, "grad_norm": 1.00510305109926, "learning_rate": 3.7585944673358632e-06, "loss": 0.1879, "step": 811 }, { "epoch": 0.3658893770417934, "grad_norm": 0.9593373407976435, "learning_rate": 3.7553738642999354e-06, "loss": 0.2011, "step": 812 }, { "epoch": 0.36633997972287935, "grad_norm": 1.0581373680068094, "learning_rate": 3.7521504730151382e-06, "loss": 0.1945, "step": 813 }, { "epoch": 0.3667905824039653, "grad_norm": 1.0159773971934891, "learning_rate": 3.748924300640787e-06, "loss": 0.2079, "step": 814 }, { "epoch": 0.36724118508505127, "grad_norm": 0.9934744368057637, "learning_rate": 3.745695354342374e-06, "loss": 0.1884, "step": 815 }, { "epoch": 0.3676917877661372, "grad_norm": 0.966842175173914, "learning_rate": 3.742463641291552e-06, "loss": 0.1866, "step": 816 }, { "epoch": 0.36814239044722313, "grad_norm": 0.9893481101549668, "learning_rate": 3.73922916866612e-06, "loss": 0.1812, "step": 817 }, { "epoch": 0.3685929931283091, "grad_norm": 0.9005100733960006, "learning_rate": 3.7359919436500038e-06, "loss": 0.1809, "step": 818 }, { "epoch": 0.36904359580939505, "grad_norm": 1.0286666188930826, "learning_rate": 3.7327519734332453e-06, "loss": 0.2104, "step": 819 }, { "epoch": 0.36949419849048104, "grad_norm": 1.0335040765709167, "learning_rate": 3.7295092652119815e-06, "loss": 0.1893, "step": 820 }, { "epoch": 0.369944801171567, "grad_norm": 0.9711351753344023, "learning_rate": 3.726263826188432e-06, "loss": 0.181, "step": 821 }, { "epoch": 0.3703954038526529, "grad_norm": 1.0233958256343774, "learning_rate": 3.7230156635708815e-06, "loss": 0.2092, "step": 822 }, { "epoch": 0.3708460065337389, "grad_norm": 1.0021813440100957, "learning_rate": 3.7197647845736616e-06, "loss": 0.1829, "step": 823 }, { "epoch": 0.3712966092148248, "grad_norm": 1.0745683687708758, "learning_rate": 3.7165111964171407e-06, "loss": 0.1905, "step": 824 }, { "epoch": 0.37174721189591076, "grad_norm": 1.0130321630619799, "learning_rate": 3.7132549063277033e-06, "loss": 0.1907, "step": 825 }, { "epoch": 0.37219781457699674, "grad_norm": 0.9754258549582402, "learning_rate": 3.7099959215377325e-06, "loss": 0.1881, "step": 826 }, { "epoch": 0.3726484172580827, "grad_norm": 1.0538670773156578, "learning_rate": 3.7067342492855997e-06, "loss": 0.1743, "step": 827 }, { "epoch": 0.37309901993916866, "grad_norm": 0.9937882758720834, "learning_rate": 3.7034698968156434e-06, "loss": 0.1768, "step": 828 }, { "epoch": 0.3735496226202546, "grad_norm": 1.0726219314072167, "learning_rate": 3.700202871378156e-06, "loss": 0.2057, "step": 829 }, { "epoch": 0.3740002253013405, "grad_norm": 1.0479897737139336, "learning_rate": 3.696933180229366e-06, "loss": 0.1936, "step": 830 }, { "epoch": 0.3744508279824265, "grad_norm": 1.0252496809768523, "learning_rate": 3.6936608306314227e-06, "loss": 0.2142, "step": 831 }, { "epoch": 0.37490143066351245, "grad_norm": 1.0155015291771505, "learning_rate": 3.690385829852381e-06, "loss": 0.2065, "step": 832 }, { "epoch": 0.3753520333445984, "grad_norm": 1.0121077503776583, "learning_rate": 3.6871081851661825e-06, "loss": 0.2129, "step": 833 }, { "epoch": 0.37580263602568437, "grad_norm": 1.001133609252554, "learning_rate": 3.6838279038526427e-06, "loss": 0.1742, "step": 834 }, { "epoch": 0.3762532387067703, "grad_norm": 1.024247819419461, "learning_rate": 3.6805449931974313e-06, "loss": 0.1944, "step": 835 }, { "epoch": 0.37670384138785623, "grad_norm": 0.9805878653120293, "learning_rate": 3.67725946049206e-06, "loss": 0.193, "step": 836 }, { "epoch": 0.3771544440689422, "grad_norm": 0.9405085224910799, "learning_rate": 3.6739713130338617e-06, "loss": 0.1848, "step": 837 }, { "epoch": 0.37760504675002815, "grad_norm": 0.9600181235407473, "learning_rate": 3.6706805581259807e-06, "loss": 0.1664, "step": 838 }, { "epoch": 0.37805564943111414, "grad_norm": 1.089790825383409, "learning_rate": 3.6673872030773473e-06, "loss": 0.1956, "step": 839 }, { "epoch": 0.37850625211220007, "grad_norm": 1.0607218450180482, "learning_rate": 3.664091255202672e-06, "loss": 0.19, "step": 840 }, { "epoch": 0.378956854793286, "grad_norm": 0.9635783809020961, "learning_rate": 3.66079272182242e-06, "loss": 0.1844, "step": 841 }, { "epoch": 0.379407457474372, "grad_norm": 0.9742041910839031, "learning_rate": 3.657491610262802e-06, "loss": 0.1826, "step": 842 }, { "epoch": 0.3798580601554579, "grad_norm": 1.0457625000926623, "learning_rate": 3.654187927855754e-06, "loss": 0.1853, "step": 843 }, { "epoch": 0.38030866283654385, "grad_norm": 0.9692868387633684, "learning_rate": 3.6508816819389216e-06, "loss": 0.1834, "step": 844 }, { "epoch": 0.38075926551762984, "grad_norm": 1.0098777109203647, "learning_rate": 3.6475728798556426e-06, "loss": 0.1971, "step": 845 }, { "epoch": 0.3812098681987158, "grad_norm": 1.0007350741530037, "learning_rate": 3.6442615289549354e-06, "loss": 0.2078, "step": 846 }, { "epoch": 0.38166047087980176, "grad_norm": 1.0018669288063613, "learning_rate": 3.6409476365914786e-06, "loss": 0.1927, "step": 847 }, { "epoch": 0.3821110735608877, "grad_norm": 0.9336398545199618, "learning_rate": 3.6376312101255934e-06, "loss": 0.1829, "step": 848 }, { "epoch": 0.3825616762419736, "grad_norm": 0.9529685621807624, "learning_rate": 3.6343122569232313e-06, "loss": 0.182, "step": 849 }, { "epoch": 0.3830122789230596, "grad_norm": 0.9503579977057526, "learning_rate": 3.6309907843559542e-06, "loss": 0.1889, "step": 850 }, { "epoch": 0.38346288160414554, "grad_norm": 1.0280068475319317, "learning_rate": 3.6276667998009242e-06, "loss": 0.1921, "step": 851 }, { "epoch": 0.3839134842852315, "grad_norm": 1.095055906870189, "learning_rate": 3.624340310640875e-06, "loss": 0.1965, "step": 852 }, { "epoch": 0.38436408696631746, "grad_norm": 0.9701378921242254, "learning_rate": 3.621011324264109e-06, "loss": 0.1992, "step": 853 }, { "epoch": 0.3848146896474034, "grad_norm": 0.9833259317334301, "learning_rate": 3.617679848064474e-06, "loss": 0.1856, "step": 854 }, { "epoch": 0.38526529232848933, "grad_norm": 1.0079759165427837, "learning_rate": 3.6143458894413463e-06, "loss": 0.194, "step": 855 }, { "epoch": 0.3857158950095753, "grad_norm": 1.0679285294794556, "learning_rate": 3.611009455799617e-06, "loss": 0.1969, "step": 856 }, { "epoch": 0.38616649769066125, "grad_norm": 1.0490902535539468, "learning_rate": 3.6076705545496743e-06, "loss": 0.1909, "step": 857 }, { "epoch": 0.38661710037174724, "grad_norm": 0.9517268188133337, "learning_rate": 3.604329193107386e-06, "loss": 0.1799, "step": 858 }, { "epoch": 0.38706770305283317, "grad_norm": 0.9713171651660856, "learning_rate": 3.600985378894086e-06, "loss": 0.1796, "step": 859 }, { "epoch": 0.3875183057339191, "grad_norm": 1.0165864377870517, "learning_rate": 3.5976391193365544e-06, "loss": 0.2003, "step": 860 }, { "epoch": 0.3879689084150051, "grad_norm": 0.9665093569343551, "learning_rate": 3.5942904218670025e-06, "loss": 0.168, "step": 861 }, { "epoch": 0.388419511096091, "grad_norm": 0.9587625653826859, "learning_rate": 3.590939293923058e-06, "loss": 0.1758, "step": 862 }, { "epoch": 0.38887011377717695, "grad_norm": 0.9738142413965079, "learning_rate": 3.5875857429477447e-06, "loss": 0.1818, "step": 863 }, { "epoch": 0.38932071645826294, "grad_norm": 0.9544055095909694, "learning_rate": 3.584229776389468e-06, "loss": 0.1776, "step": 864 }, { "epoch": 0.38977131913934887, "grad_norm": 1.0326363275288095, "learning_rate": 3.580871401702002e-06, "loss": 0.1977, "step": 865 }, { "epoch": 0.39022192182043486, "grad_norm": 1.0214661988145164, "learning_rate": 3.5775106263444644e-06, "loss": 0.1899, "step": 866 }, { "epoch": 0.3906725245015208, "grad_norm": 0.9328284154481535, "learning_rate": 3.5741474577813086e-06, "loss": 0.1908, "step": 867 }, { "epoch": 0.3911231271826067, "grad_norm": 0.9729363140620655, "learning_rate": 3.570781903482302e-06, "loss": 0.1794, "step": 868 }, { "epoch": 0.3915737298636927, "grad_norm": 1.0659796965374109, "learning_rate": 3.5674139709225104e-06, "loss": 0.2057, "step": 869 }, { "epoch": 0.39202433254477864, "grad_norm": 0.9969607922349554, "learning_rate": 3.5640436675822833e-06, "loss": 0.1924, "step": 870 }, { "epoch": 0.3924749352258646, "grad_norm": 0.9635216162411704, "learning_rate": 3.5606710009472335e-06, "loss": 0.1955, "step": 871 }, { "epoch": 0.39292553790695056, "grad_norm": 1.0526216128071109, "learning_rate": 3.5572959785082264e-06, "loss": 0.1956, "step": 872 }, { "epoch": 0.3933761405880365, "grad_norm": 0.9289014978788578, "learning_rate": 3.5539186077613562e-06, "loss": 0.1821, "step": 873 }, { "epoch": 0.3938267432691224, "grad_norm": 1.0324049051398465, "learning_rate": 3.5505388962079337e-06, "loss": 0.1986, "step": 874 }, { "epoch": 0.3942773459502084, "grad_norm": 1.02273364347944, "learning_rate": 3.54715685135447e-06, "loss": 0.1918, "step": 875 }, { "epoch": 0.39472794863129435, "grad_norm": 0.9961316154921143, "learning_rate": 3.5437724807126583e-06, "loss": 0.1851, "step": 876 }, { "epoch": 0.39517855131238033, "grad_norm": 0.9794174446801692, "learning_rate": 3.5403857917993554e-06, "loss": 0.186, "step": 877 }, { "epoch": 0.39562915399346626, "grad_norm": 1.0200190502541928, "learning_rate": 3.5369967921365718e-06, "loss": 0.1769, "step": 878 }, { "epoch": 0.3960797566745522, "grad_norm": 0.9950100030363491, "learning_rate": 3.5336054892514437e-06, "loss": 0.1856, "step": 879 }, { "epoch": 0.3965303593556382, "grad_norm": 1.0393570190944585, "learning_rate": 3.530211890676229e-06, "loss": 0.1926, "step": 880 }, { "epoch": 0.3969809620367241, "grad_norm": 0.998482884761975, "learning_rate": 3.52681600394828e-06, "loss": 0.1846, "step": 881 }, { "epoch": 0.39743156471781005, "grad_norm": 1.0263931600593703, "learning_rate": 3.5234178366100343e-06, "loss": 0.1866, "step": 882 }, { "epoch": 0.39788216739889604, "grad_norm": 0.9802252111597417, "learning_rate": 3.520017396208993e-06, "loss": 0.1735, "step": 883 }, { "epoch": 0.39833277007998197, "grad_norm": 1.0025582938431925, "learning_rate": 3.5166146902977055e-06, "loss": 0.1762, "step": 884 }, { "epoch": 0.39878337276106796, "grad_norm": 1.0495303756517684, "learning_rate": 3.5132097264337546e-06, "loss": 0.19, "step": 885 }, { "epoch": 0.3992339754421539, "grad_norm": 0.9685699335532568, "learning_rate": 3.5098025121797375e-06, "loss": 0.1731, "step": 886 }, { "epoch": 0.3996845781232398, "grad_norm": 0.9810351192503403, "learning_rate": 3.5063930551032494e-06, "loss": 0.1903, "step": 887 }, { "epoch": 0.4001351808043258, "grad_norm": 1.1162636062796865, "learning_rate": 3.5029813627768665e-06, "loss": 0.2147, "step": 888 }, { "epoch": 0.40058578348541174, "grad_norm": 1.0342032669702461, "learning_rate": 3.499567442778131e-06, "loss": 0.1809, "step": 889 }, { "epoch": 0.40103638616649767, "grad_norm": 0.9844265403732025, "learning_rate": 3.49615130268953e-06, "loss": 0.2001, "step": 890 }, { "epoch": 0.40148698884758366, "grad_norm": 0.9606329542973896, "learning_rate": 3.4927329500984857e-06, "loss": 0.1743, "step": 891 }, { "epoch": 0.4019375915286696, "grad_norm": 1.0172110372775587, "learning_rate": 3.489312392597331e-06, "loss": 0.1924, "step": 892 }, { "epoch": 0.4023881942097555, "grad_norm": 1.0223046255744688, "learning_rate": 3.4858896377832966e-06, "loss": 0.2013, "step": 893 }, { "epoch": 0.4028387968908415, "grad_norm": 1.0198574264570306, "learning_rate": 3.482464693258496e-06, "loss": 0.1931, "step": 894 }, { "epoch": 0.40328939957192744, "grad_norm": 0.9771282166871073, "learning_rate": 3.4790375666299026e-06, "loss": 0.1896, "step": 895 }, { "epoch": 0.40374000225301343, "grad_norm": 0.9854569207671147, "learning_rate": 3.4756082655093387e-06, "loss": 0.1974, "step": 896 }, { "epoch": 0.40419060493409936, "grad_norm": 0.9855416122374189, "learning_rate": 3.4721767975134557e-06, "loss": 0.1961, "step": 897 }, { "epoch": 0.4046412076151853, "grad_norm": 1.108058477150135, "learning_rate": 3.4687431702637165e-06, "loss": 0.1965, "step": 898 }, { "epoch": 0.4050918102962713, "grad_norm": 0.9481181010928448, "learning_rate": 3.465307391386383e-06, "loss": 0.1869, "step": 899 }, { "epoch": 0.4055424129773572, "grad_norm": 1.0237695974057641, "learning_rate": 3.4618694685124927e-06, "loss": 0.2043, "step": 900 }, { "epoch": 0.40599301565844315, "grad_norm": 1.0884923581804866, "learning_rate": 3.458429409277846e-06, "loss": 0.1971, "step": 901 }, { "epoch": 0.40644361833952913, "grad_norm": 1.0271173038047998, "learning_rate": 3.454987221322989e-06, "loss": 0.1933, "step": 902 }, { "epoch": 0.40689422102061507, "grad_norm": 0.9107687670560624, "learning_rate": 3.4515429122931955e-06, "loss": 0.1685, "step": 903 }, { "epoch": 0.407344823701701, "grad_norm": 0.9680902887861618, "learning_rate": 3.4480964898384495e-06, "loss": 0.1869, "step": 904 }, { "epoch": 0.407795426382787, "grad_norm": 1.037144807969508, "learning_rate": 3.44464796161343e-06, "loss": 0.1981, "step": 905 }, { "epoch": 0.4082460290638729, "grad_norm": 0.974989893773787, "learning_rate": 3.4411973352774917e-06, "loss": 0.1862, "step": 906 }, { "epoch": 0.4086966317449589, "grad_norm": 1.0336123109595516, "learning_rate": 3.437744618494653e-06, "loss": 0.2073, "step": 907 }, { "epoch": 0.40914723442604484, "grad_norm": 0.95994206370728, "learning_rate": 3.4342898189335692e-06, "loss": 0.1902, "step": 908 }, { "epoch": 0.40959783710713077, "grad_norm": 1.0055854015029335, "learning_rate": 3.4308329442675276e-06, "loss": 0.1962, "step": 909 }, { "epoch": 0.41004843978821676, "grad_norm": 0.9612475566675484, "learning_rate": 3.42737400217442e-06, "loss": 0.1923, "step": 910 }, { "epoch": 0.4104990424693027, "grad_norm": 0.9549762987559874, "learning_rate": 3.423913000336732e-06, "loss": 0.1839, "step": 911 }, { "epoch": 0.4109496451503886, "grad_norm": 0.9058879282248115, "learning_rate": 3.4204499464415253e-06, "loss": 0.1776, "step": 912 }, { "epoch": 0.4114002478314746, "grad_norm": 0.9557686317220591, "learning_rate": 3.4169848481804165e-06, "loss": 0.1753, "step": 913 }, { "epoch": 0.41185085051256054, "grad_norm": 1.0083873515901405, "learning_rate": 3.4135177132495632e-06, "loss": 0.1961, "step": 914 }, { "epoch": 0.4123014531936465, "grad_norm": 0.9948199841105111, "learning_rate": 3.41004854934965e-06, "loss": 0.1921, "step": 915 }, { "epoch": 0.41275205587473246, "grad_norm": 1.064213580358891, "learning_rate": 3.406577364185864e-06, "loss": 0.1931, "step": 916 }, { "epoch": 0.4132026585558184, "grad_norm": 0.9679940604781346, "learning_rate": 3.403104165467883e-06, "loss": 0.1998, "step": 917 }, { "epoch": 0.4136532612369044, "grad_norm": 0.9808223597712648, "learning_rate": 3.399628960909857e-06, "loss": 0.19, "step": 918 }, { "epoch": 0.4141038639179903, "grad_norm": 0.9916725836748305, "learning_rate": 3.3961517582303916e-06, "loss": 0.1904, "step": 919 }, { "epoch": 0.41455446659907624, "grad_norm": 0.970381514220033, "learning_rate": 3.39267256515253e-06, "loss": 0.1871, "step": 920 }, { "epoch": 0.41500506928016223, "grad_norm": 0.9112522410429553, "learning_rate": 3.3891913894037354e-06, "loss": 0.18, "step": 921 }, { "epoch": 0.41545567196124816, "grad_norm": 0.9689677718277616, "learning_rate": 3.385708238715876e-06, "loss": 0.1822, "step": 922 }, { "epoch": 0.4159062746423341, "grad_norm": 0.9579319965915192, "learning_rate": 3.3822231208252053e-06, "loss": 0.1839, "step": 923 }, { "epoch": 0.4163568773234201, "grad_norm": 1.0343627135702063, "learning_rate": 3.3787360434723466e-06, "loss": 0.1905, "step": 924 }, { "epoch": 0.416807480004506, "grad_norm": 0.9307878163914602, "learning_rate": 3.3752470144022745e-06, "loss": 0.1672, "step": 925 }, { "epoch": 0.417258082685592, "grad_norm": 0.9749987708402977, "learning_rate": 3.371756041364301e-06, "loss": 0.1977, "step": 926 }, { "epoch": 0.41770868536667793, "grad_norm": 1.0281325905465875, "learning_rate": 3.3682631321120507e-06, "loss": 0.2033, "step": 927 }, { "epoch": 0.41815928804776387, "grad_norm": 0.9271935858859663, "learning_rate": 3.364768294403455e-06, "loss": 0.1642, "step": 928 }, { "epoch": 0.41860989072884985, "grad_norm": 0.9740269407929583, "learning_rate": 3.361271536000723e-06, "loss": 0.1839, "step": 929 }, { "epoch": 0.4190604934099358, "grad_norm": 0.9661329367186603, "learning_rate": 3.3577728646703335e-06, "loss": 0.1789, "step": 930 }, { "epoch": 0.4195110960910217, "grad_norm": 1.0041765071251951, "learning_rate": 3.354272288183012e-06, "loss": 0.1957, "step": 931 }, { "epoch": 0.4199616987721077, "grad_norm": 0.984166508205924, "learning_rate": 3.3507698143137157e-06, "loss": 0.1852, "step": 932 }, { "epoch": 0.42041230145319364, "grad_norm": 1.0382196872498093, "learning_rate": 3.3472654508416157e-06, "loss": 0.1881, "step": 933 }, { "epoch": 0.4208629041342796, "grad_norm": 1.0415012014327485, "learning_rate": 3.3437592055500825e-06, "loss": 0.1798, "step": 934 }, { "epoch": 0.42131350681536556, "grad_norm": 1.0009036129479045, "learning_rate": 3.340251086226663e-06, "loss": 0.189, "step": 935 }, { "epoch": 0.4217641094964515, "grad_norm": 0.9821502289558824, "learning_rate": 3.3367411006630677e-06, "loss": 0.2014, "step": 936 }, { "epoch": 0.4222147121775375, "grad_norm": 0.8914293408127837, "learning_rate": 3.333229256655153e-06, "loss": 0.1873, "step": 937 }, { "epoch": 0.4226653148586234, "grad_norm": 1.0219408028076342, "learning_rate": 3.3297155620029e-06, "loss": 0.1913, "step": 938 }, { "epoch": 0.42311591753970934, "grad_norm": 1.0374615443742041, "learning_rate": 3.326200024510405e-06, "loss": 0.2085, "step": 939 }, { "epoch": 0.42356652022079533, "grad_norm": 1.0704603560066228, "learning_rate": 3.3226826519858526e-06, "loss": 0.1789, "step": 940 }, { "epoch": 0.42401712290188126, "grad_norm": 0.9484752054686993, "learning_rate": 3.3191634522415064e-06, "loss": 0.2042, "step": 941 }, { "epoch": 0.4244677255829672, "grad_norm": 0.9627020313640821, "learning_rate": 3.315642433093686e-06, "loss": 0.2018, "step": 942 }, { "epoch": 0.4249183282640532, "grad_norm": 1.0542705175273657, "learning_rate": 3.3121196023627543e-06, "loss": 0.2026, "step": 943 }, { "epoch": 0.4253689309451391, "grad_norm": 1.0358385566299095, "learning_rate": 3.3085949678730953e-06, "loss": 0.1942, "step": 944 }, { "epoch": 0.4258195336262251, "grad_norm": 0.9995141966693947, "learning_rate": 3.305068537453102e-06, "loss": 0.188, "step": 945 }, { "epoch": 0.42627013630731103, "grad_norm": 0.9300383955846249, "learning_rate": 3.3015403189351536e-06, "loss": 0.1782, "step": 946 }, { "epoch": 0.42672073898839696, "grad_norm": 0.9899587938284999, "learning_rate": 3.2980103201556023e-06, "loss": 0.1884, "step": 947 }, { "epoch": 0.42717134166948295, "grad_norm": 0.9620887217479097, "learning_rate": 3.2944785489547544e-06, "loss": 0.1795, "step": 948 }, { "epoch": 0.4276219443505689, "grad_norm": 0.9312121794699554, "learning_rate": 3.290945013176852e-06, "loss": 0.1852, "step": 949 }, { "epoch": 0.4280725470316548, "grad_norm": 1.118845733713994, "learning_rate": 3.2874097206700566e-06, "loss": 0.2082, "step": 950 }, { "epoch": 0.4285231497127408, "grad_norm": 1.0087779710378257, "learning_rate": 3.2838726792864315e-06, "loss": 0.1922, "step": 951 }, { "epoch": 0.42897375239382673, "grad_norm": 0.9053213847907385, "learning_rate": 3.2803338968819264e-06, "loss": 0.1706, "step": 952 }, { "epoch": 0.4294243550749127, "grad_norm": 0.94550246667197, "learning_rate": 3.2767933813163542e-06, "loss": 0.1878, "step": 953 }, { "epoch": 0.42987495775599865, "grad_norm": 1.0667255593304628, "learning_rate": 3.2732511404533797e-06, "loss": 0.1851, "step": 954 }, { "epoch": 0.4303255604370846, "grad_norm": 0.9771340881855369, "learning_rate": 3.2697071821604986e-06, "loss": 0.1882, "step": 955 }, { "epoch": 0.4307761631181706, "grad_norm": 0.9550554995820587, "learning_rate": 3.266161514309023e-06, "loss": 0.1739, "step": 956 }, { "epoch": 0.4312267657992565, "grad_norm": 1.0298010176964945, "learning_rate": 3.262614144774059e-06, "loss": 0.1776, "step": 957 }, { "epoch": 0.43167736848034244, "grad_norm": 1.0111021112344831, "learning_rate": 3.259065081434495e-06, "loss": 0.1827, "step": 958 }, { "epoch": 0.4321279711614284, "grad_norm": 1.0110533511752027, "learning_rate": 3.255514332172979e-06, "loss": 0.1921, "step": 959 }, { "epoch": 0.43257857384251436, "grad_norm": 1.0569579356257515, "learning_rate": 3.2519619048759056e-06, "loss": 0.199, "step": 960 }, { "epoch": 0.4330291765236003, "grad_norm": 1.0231127453347497, "learning_rate": 3.248407807433396e-06, "loss": 0.1816, "step": 961 }, { "epoch": 0.4334797792046863, "grad_norm": 1.0261507249808781, "learning_rate": 3.2448520477392788e-06, "loss": 0.1998, "step": 962 }, { "epoch": 0.4339303818857722, "grad_norm": 1.0499398268465796, "learning_rate": 3.2412946336910778e-06, "loss": 0.1879, "step": 963 }, { "epoch": 0.4343809845668582, "grad_norm": 0.9178380998321612, "learning_rate": 3.237735573189989e-06, "loss": 0.1749, "step": 964 }, { "epoch": 0.43483158724794413, "grad_norm": 1.0622394272309097, "learning_rate": 3.234174874140866e-06, "loss": 0.1816, "step": 965 }, { "epoch": 0.43528218992903006, "grad_norm": 1.0657095360369357, "learning_rate": 3.230612544452202e-06, "loss": 0.1993, "step": 966 }, { "epoch": 0.43573279261011605, "grad_norm": 0.9392726541767252, "learning_rate": 3.2270485920361093e-06, "loss": 0.1816, "step": 967 }, { "epoch": 0.436183395291202, "grad_norm": 1.024166929104129, "learning_rate": 3.2234830248083095e-06, "loss": 0.1845, "step": 968 }, { "epoch": 0.4366339979722879, "grad_norm": 1.0403201840799952, "learning_rate": 3.219915850688106e-06, "loss": 0.1871, "step": 969 }, { "epoch": 0.4370846006533739, "grad_norm": 1.0511669741921026, "learning_rate": 3.2163470775983733e-06, "loss": 0.2009, "step": 970 }, { "epoch": 0.43753520333445983, "grad_norm": 0.9891281098274592, "learning_rate": 3.2127767134655374e-06, "loss": 0.1888, "step": 971 }, { "epoch": 0.4379858060155458, "grad_norm": 1.0288193967999293, "learning_rate": 3.209204766219558e-06, "loss": 0.1897, "step": 972 }, { "epoch": 0.43843640869663175, "grad_norm": 0.9114869638159263, "learning_rate": 3.205631243793909e-06, "loss": 0.1746, "step": 973 }, { "epoch": 0.4388870113777177, "grad_norm": 0.9351530196812792, "learning_rate": 3.202056154125567e-06, "loss": 0.1821, "step": 974 }, { "epoch": 0.43933761405880367, "grad_norm": 0.8879072088338338, "learning_rate": 3.198479505154984e-06, "loss": 0.1681, "step": 975 }, { "epoch": 0.4397882167398896, "grad_norm": 0.9869879230018886, "learning_rate": 3.1949013048260813e-06, "loss": 0.1813, "step": 976 }, { "epoch": 0.44023881942097554, "grad_norm": 0.9639695729445393, "learning_rate": 3.1913215610862208e-06, "loss": 0.1739, "step": 977 }, { "epoch": 0.4406894221020615, "grad_norm": 0.9647122784620339, "learning_rate": 3.1877402818861954e-06, "loss": 0.1771, "step": 978 }, { "epoch": 0.44114002478314746, "grad_norm": 0.9353403144241794, "learning_rate": 3.184157475180208e-06, "loss": 0.182, "step": 979 }, { "epoch": 0.4415906274642334, "grad_norm": 1.0466040073099763, "learning_rate": 3.1805731489258516e-06, "loss": 0.2019, "step": 980 }, { "epoch": 0.4420412301453194, "grad_norm": 0.9855274074373859, "learning_rate": 3.1769873110840977e-06, "loss": 0.195, "step": 981 }, { "epoch": 0.4424918328264053, "grad_norm": 0.9571646517475396, "learning_rate": 3.1733999696192736e-06, "loss": 0.1923, "step": 982 }, { "epoch": 0.4429424355074913, "grad_norm": 0.9290347394228827, "learning_rate": 3.1698111324990454e-06, "loss": 0.1716, "step": 983 }, { "epoch": 0.4433930381885772, "grad_norm": 0.9607777203317588, "learning_rate": 3.1662208076944027e-06, "loss": 0.1693, "step": 984 }, { "epoch": 0.44384364086966316, "grad_norm": 0.9973605790758735, "learning_rate": 3.162629003179638e-06, "loss": 0.2023, "step": 985 }, { "epoch": 0.44429424355074915, "grad_norm": 1.0225259641771225, "learning_rate": 3.1590357269323312e-06, "loss": 0.2018, "step": 986 }, { "epoch": 0.4447448462318351, "grad_norm": 0.9810464652010766, "learning_rate": 3.15544098693333e-06, "loss": 0.19, "step": 987 }, { "epoch": 0.445195448912921, "grad_norm": 1.032890935446246, "learning_rate": 3.151844791166735e-06, "loss": 0.1913, "step": 988 }, { "epoch": 0.445646051594007, "grad_norm": 1.0218822651561286, "learning_rate": 3.1482471476198784e-06, "loss": 0.1998, "step": 989 }, { "epoch": 0.44609665427509293, "grad_norm": 0.9838750989611218, "learning_rate": 3.1446480642833077e-06, "loss": 0.1879, "step": 990 }, { "epoch": 0.44654725695617886, "grad_norm": 0.9533897607350283, "learning_rate": 3.14104754915077e-06, "loss": 0.1718, "step": 991 }, { "epoch": 0.44699785963726485, "grad_norm": 1.0097739680223856, "learning_rate": 3.137445610219192e-06, "loss": 0.1856, "step": 992 }, { "epoch": 0.4474484623183508, "grad_norm": 0.975630130381292, "learning_rate": 3.133842255488661e-06, "loss": 0.2084, "step": 993 }, { "epoch": 0.44789906499943677, "grad_norm": 0.9277306719461578, "learning_rate": 3.130237492962411e-06, "loss": 0.1828, "step": 994 }, { "epoch": 0.4483496676805227, "grad_norm": 0.9568578595773181, "learning_rate": 3.1266313306468018e-06, "loss": 0.193, "step": 995 }, { "epoch": 0.44880027036160863, "grad_norm": 1.0000320220613919, "learning_rate": 3.1230237765513023e-06, "loss": 0.1813, "step": 996 }, { "epoch": 0.4492508730426946, "grad_norm": 0.9982728552087634, "learning_rate": 3.119414838688473e-06, "loss": 0.1962, "step": 997 }, { "epoch": 0.44970147572378055, "grad_norm": 0.9273058649051045, "learning_rate": 3.1158045250739473e-06, "loss": 0.1845, "step": 998 }, { "epoch": 0.4501520784048665, "grad_norm": 0.9096138394901124, "learning_rate": 3.1121928437264138e-06, "loss": 0.1715, "step": 999 }, { "epoch": 0.4506026810859525, "grad_norm": 1.0183371982803404, "learning_rate": 3.1085798026676e-06, "loss": 0.1811, "step": 1000 }, { "epoch": 0.4506026810859525, "eval_loss": 0.189128577709198, "eval_runtime": 59.4414, "eval_samples_per_second": 24.141, "eval_steps_per_second": 3.028, "step": 1000 }, { "epoch": 0.4510532837670384, "grad_norm": 0.9671572024242282, "learning_rate": 3.1049654099222542e-06, "loss": 0.1912, "step": 1001 }, { "epoch": 0.4515038864481244, "grad_norm": 0.9162211305759308, "learning_rate": 3.1013496735181232e-06, "loss": 0.176, "step": 1002 }, { "epoch": 0.4519544891292103, "grad_norm": 1.0090932364505272, "learning_rate": 3.0977326014859415e-06, "loss": 0.2013, "step": 1003 }, { "epoch": 0.45240509181029626, "grad_norm": 0.8951383914420861, "learning_rate": 3.09411420185941e-06, "loss": 0.1749, "step": 1004 }, { "epoch": 0.45285569449138224, "grad_norm": 0.9397408568438251, "learning_rate": 3.090494482675176e-06, "loss": 0.1804, "step": 1005 }, { "epoch": 0.4533062971724682, "grad_norm": 0.9022667632823632, "learning_rate": 3.0868734519728194e-06, "loss": 0.1788, "step": 1006 }, { "epoch": 0.4537568998535541, "grad_norm": 1.0031529885463817, "learning_rate": 3.0832511177948326e-06, "loss": 0.2066, "step": 1007 }, { "epoch": 0.4542075025346401, "grad_norm": 1.0303330103237227, "learning_rate": 3.0796274881866034e-06, "loss": 0.1992, "step": 1008 }, { "epoch": 0.454658105215726, "grad_norm": 1.0239442642225518, "learning_rate": 3.0760025711963964e-06, "loss": 0.1913, "step": 1009 }, { "epoch": 0.45510870789681196, "grad_norm": 1.0115757759674917, "learning_rate": 3.0723763748753354e-06, "loss": 0.181, "step": 1010 }, { "epoch": 0.45555931057789795, "grad_norm": 1.024472357217328, "learning_rate": 3.0687489072773864e-06, "loss": 0.1949, "step": 1011 }, { "epoch": 0.4560099132589839, "grad_norm": 1.0663046028032832, "learning_rate": 3.0651201764593375e-06, "loss": 0.2073, "step": 1012 }, { "epoch": 0.45646051594006987, "grad_norm": 1.0420504549269418, "learning_rate": 3.0614901904807836e-06, "loss": 0.1842, "step": 1013 }, { "epoch": 0.4569111186211558, "grad_norm": 1.1242567350514105, "learning_rate": 3.0578589574041097e-06, "loss": 0.2077, "step": 1014 }, { "epoch": 0.45736172130224173, "grad_norm": 0.9648102387148972, "learning_rate": 3.0542264852944635e-06, "loss": 0.182, "step": 1015 }, { "epoch": 0.4578123239833277, "grad_norm": 0.9493708361147652, "learning_rate": 3.0505927822197533e-06, "loss": 0.1685, "step": 1016 }, { "epoch": 0.45826292666441365, "grad_norm": 0.9934079363031364, "learning_rate": 3.0469578562506165e-06, "loss": 0.1935, "step": 1017 }, { "epoch": 0.4587135293454996, "grad_norm": 0.9805921173422116, "learning_rate": 3.0433217154604067e-06, "loss": 0.1908, "step": 1018 }, { "epoch": 0.45916413202658557, "grad_norm": 1.065514609639169, "learning_rate": 3.0396843679251777e-06, "loss": 0.2006, "step": 1019 }, { "epoch": 0.4596147347076715, "grad_norm": 0.9809771886572722, "learning_rate": 3.0360458217236604e-06, "loss": 0.1815, "step": 1020 }, { "epoch": 0.4600653373887575, "grad_norm": 1.023903500208084, "learning_rate": 3.0324060849372526e-06, "loss": 0.1962, "step": 1021 }, { "epoch": 0.4605159400698434, "grad_norm": 1.0217469739032734, "learning_rate": 3.028765165649992e-06, "loss": 0.1985, "step": 1022 }, { "epoch": 0.46096654275092935, "grad_norm": 1.0221731312641227, "learning_rate": 3.0251230719485465e-06, "loss": 0.1992, "step": 1023 }, { "epoch": 0.46141714543201534, "grad_norm": 1.0164802191447049, "learning_rate": 3.0214798119221884e-06, "loss": 0.1994, "step": 1024 }, { "epoch": 0.4618677481131013, "grad_norm": 0.9419980545476748, "learning_rate": 3.0178353936627835e-06, "loss": 0.1812, "step": 1025 }, { "epoch": 0.4623183507941872, "grad_norm": 0.956154108614536, "learning_rate": 3.0141898252647682e-06, "loss": 0.1887, "step": 1026 }, { "epoch": 0.4627689534752732, "grad_norm": 1.0955572002986789, "learning_rate": 3.0105431148251364e-06, "loss": 0.1871, "step": 1027 }, { "epoch": 0.4632195561563591, "grad_norm": 1.0245926161342613, "learning_rate": 3.0068952704434145e-06, "loss": 0.186, "step": 1028 }, { "epoch": 0.46367015883744506, "grad_norm": 0.9618126188953687, "learning_rate": 3.0032463002216504e-06, "loss": 0.1867, "step": 1029 }, { "epoch": 0.46412076151853104, "grad_norm": 1.1893777161990908, "learning_rate": 2.999596212264392e-06, "loss": 0.2028, "step": 1030 }, { "epoch": 0.464571364199617, "grad_norm": 0.9948596042457155, "learning_rate": 2.9959450146786674e-06, "loss": 0.1914, "step": 1031 }, { "epoch": 0.46502196688070296, "grad_norm": 1.0487962174424645, "learning_rate": 2.9922927155739737e-06, "loss": 0.2008, "step": 1032 }, { "epoch": 0.4654725695617889, "grad_norm": 1.065771087796345, "learning_rate": 2.9886393230622507e-06, "loss": 0.1951, "step": 1033 }, { "epoch": 0.46592317224287483, "grad_norm": 1.0549580408530124, "learning_rate": 2.984984845257868e-06, "loss": 0.2018, "step": 1034 }, { "epoch": 0.4663737749239608, "grad_norm": 0.9777277229138798, "learning_rate": 2.981329290277605e-06, "loss": 0.1827, "step": 1035 }, { "epoch": 0.46682437760504675, "grad_norm": 0.9528614427706873, "learning_rate": 2.977672666240636e-06, "loss": 0.1826, "step": 1036 }, { "epoch": 0.4672749802861327, "grad_norm": 0.893984600025219, "learning_rate": 2.974014981268507e-06, "loss": 0.1708, "step": 1037 }, { "epoch": 0.46772558296721867, "grad_norm": 0.94876165184311, "learning_rate": 2.9703562434851218e-06, "loss": 0.1789, "step": 1038 }, { "epoch": 0.4681761856483046, "grad_norm": 1.0414212143930934, "learning_rate": 2.966696461016721e-06, "loss": 0.2034, "step": 1039 }, { "epoch": 0.4686267883293906, "grad_norm": 0.9462011872528657, "learning_rate": 2.9630356419918682e-06, "loss": 0.1719, "step": 1040 }, { "epoch": 0.4690773910104765, "grad_norm": 0.9266863274496223, "learning_rate": 2.9593737945414264e-06, "loss": 0.1813, "step": 1041 }, { "epoch": 0.46952799369156245, "grad_norm": 1.0780107256167346, "learning_rate": 2.9557109267985445e-06, "loss": 0.1891, "step": 1042 }, { "epoch": 0.46997859637264844, "grad_norm": 0.9718365354652926, "learning_rate": 2.952047046898637e-06, "loss": 0.1778, "step": 1043 }, { "epoch": 0.47042919905373437, "grad_norm": 0.9854537626498936, "learning_rate": 2.9483821629793673e-06, "loss": 0.1855, "step": 1044 }, { "epoch": 0.4708798017348203, "grad_norm": 0.9509490677167141, "learning_rate": 2.9447162831806275e-06, "loss": 0.1871, "step": 1045 }, { "epoch": 0.4713304044159063, "grad_norm": 0.9835264138568843, "learning_rate": 2.941049415644522e-06, "loss": 0.181, "step": 1046 }, { "epoch": 0.4717810070969922, "grad_norm": 0.976177751134452, "learning_rate": 2.9373815685153485e-06, "loss": 0.1745, "step": 1047 }, { "epoch": 0.47223160977807815, "grad_norm": 1.0395516553789195, "learning_rate": 2.933712749939582e-06, "loss": 0.2009, "step": 1048 }, { "epoch": 0.47268221245916414, "grad_norm": 0.9508188955276239, "learning_rate": 2.9300429680658538e-06, "loss": 0.192, "step": 1049 }, { "epoch": 0.4731328151402501, "grad_norm": 1.030095865559414, "learning_rate": 2.9263722310449353e-06, "loss": 0.2087, "step": 1050 }, { "epoch": 0.47358341782133606, "grad_norm": 1.0090146660033465, "learning_rate": 2.9227005470297194e-06, "loss": 0.198, "step": 1051 }, { "epoch": 0.474034020502422, "grad_norm": 1.01886158123234, "learning_rate": 2.919027924175201e-06, "loss": 0.1982, "step": 1052 }, { "epoch": 0.4744846231835079, "grad_norm": 0.9858334972569566, "learning_rate": 2.915354370638462e-06, "loss": 0.1833, "step": 1053 }, { "epoch": 0.4749352258645939, "grad_norm": 0.9198114480645141, "learning_rate": 2.9116798945786515e-06, "loss": 0.1779, "step": 1054 }, { "epoch": 0.47538582854567984, "grad_norm": 1.0202969087409115, "learning_rate": 2.9080045041569647e-06, "loss": 0.1925, "step": 1055 }, { "epoch": 0.4758364312267658, "grad_norm": 0.9323881824017032, "learning_rate": 2.904328207536632e-06, "loss": 0.1896, "step": 1056 }, { "epoch": 0.47628703390785176, "grad_norm": 0.9957756137102491, "learning_rate": 2.900651012882893e-06, "loss": 0.1846, "step": 1057 }, { "epoch": 0.4767376365889377, "grad_norm": 0.921082253021773, "learning_rate": 2.896972928362983e-06, "loss": 0.1858, "step": 1058 }, { "epoch": 0.4771882392700237, "grad_norm": 0.9220354600553122, "learning_rate": 2.893293962146114e-06, "loss": 0.1695, "step": 1059 }, { "epoch": 0.4776388419511096, "grad_norm": 1.045451312341842, "learning_rate": 2.8896141224034554e-06, "loss": 0.199, "step": 1060 }, { "epoch": 0.47808944463219555, "grad_norm": 0.9166173855182119, "learning_rate": 2.885933417308118e-06, "loss": 0.1632, "step": 1061 }, { "epoch": 0.47854004731328154, "grad_norm": 0.9995930807668175, "learning_rate": 2.8822518550351356e-06, "loss": 0.1839, "step": 1062 }, { "epoch": 0.47899064999436747, "grad_norm": 0.9823258706934731, "learning_rate": 2.878569443761442e-06, "loss": 0.1845, "step": 1063 }, { "epoch": 0.4794412526754534, "grad_norm": 1.0462859714933759, "learning_rate": 2.87488619166586e-06, "loss": 0.2096, "step": 1064 }, { "epoch": 0.4798918553565394, "grad_norm": 0.9484837330036915, "learning_rate": 2.8712021069290786e-06, "loss": 0.1748, "step": 1065 }, { "epoch": 0.4803424580376253, "grad_norm": 0.943429390904282, "learning_rate": 2.8675171977336357e-06, "loss": 0.1777, "step": 1066 }, { "epoch": 0.48079306071871125, "grad_norm": 0.965224271549074, "learning_rate": 2.863831472263904e-06, "loss": 0.1924, "step": 1067 }, { "epoch": 0.48124366339979724, "grad_norm": 1.0724951204071391, "learning_rate": 2.8601449387060622e-06, "loss": 0.2004, "step": 1068 }, { "epoch": 0.48169426608088317, "grad_norm": 1.0527358404221765, "learning_rate": 2.8564576052480895e-06, "loss": 0.1992, "step": 1069 }, { "epoch": 0.48214486876196916, "grad_norm": 0.9835497307888469, "learning_rate": 2.8527694800797417e-06, "loss": 0.1818, "step": 1070 }, { "epoch": 0.4825954714430551, "grad_norm": 0.9246506213040395, "learning_rate": 2.8490805713925298e-06, "loss": 0.1791, "step": 1071 }, { "epoch": 0.483046074124141, "grad_norm": 0.9701524857121642, "learning_rate": 2.845390887379706e-06, "loss": 0.1793, "step": 1072 }, { "epoch": 0.483496676805227, "grad_norm": 1.01141868910757, "learning_rate": 2.8417004362362465e-06, "loss": 0.1831, "step": 1073 }, { "epoch": 0.48394727948631294, "grad_norm": 0.9620211674176621, "learning_rate": 2.838009226158829e-06, "loss": 0.1934, "step": 1074 }, { "epoch": 0.4843978821673989, "grad_norm": 0.9797701458800275, "learning_rate": 2.8343172653458194e-06, "loss": 0.1898, "step": 1075 }, { "epoch": 0.48484848484848486, "grad_norm": 0.9517023202475261, "learning_rate": 2.8306245619972476e-06, "loss": 0.1795, "step": 1076 }, { "epoch": 0.4852990875295708, "grad_norm": 0.9797022376101314, "learning_rate": 2.826931124314796e-06, "loss": 0.181, "step": 1077 }, { "epoch": 0.4857496902106567, "grad_norm": 0.9979686434950376, "learning_rate": 2.8232369605017757e-06, "loss": 0.1806, "step": 1078 }, { "epoch": 0.4862002928917427, "grad_norm": 1.022734670572374, "learning_rate": 2.8195420787631113e-06, "loss": 0.2038, "step": 1079 }, { "epoch": 0.48665089557282865, "grad_norm": 1.0508347141889778, "learning_rate": 2.8158464873053236e-06, "loss": 0.1871, "step": 1080 }, { "epoch": 0.48710149825391463, "grad_norm": 1.0117363738105383, "learning_rate": 2.8121501943365066e-06, "loss": 0.1844, "step": 1081 }, { "epoch": 0.48755210093500057, "grad_norm": 0.9731494514961763, "learning_rate": 2.808453208066314e-06, "loss": 0.1796, "step": 1082 }, { "epoch": 0.4880027036160865, "grad_norm": 0.9779594049370961, "learning_rate": 2.8047555367059404e-06, "loss": 0.1842, "step": 1083 }, { "epoch": 0.4884533062971725, "grad_norm": 1.0236799385043172, "learning_rate": 2.8010571884681004e-06, "loss": 0.1994, "step": 1084 }, { "epoch": 0.4889039089782584, "grad_norm": 0.9975674985465199, "learning_rate": 2.7973581715670124e-06, "loss": 0.1939, "step": 1085 }, { "epoch": 0.48935451165934435, "grad_norm": 0.9722700577518006, "learning_rate": 2.7936584942183804e-06, "loss": 0.1864, "step": 1086 }, { "epoch": 0.48980511434043034, "grad_norm": 0.9792037443912968, "learning_rate": 2.7899581646393746e-06, "loss": 0.1741, "step": 1087 }, { "epoch": 0.49025571702151627, "grad_norm": 0.925245177190625, "learning_rate": 2.7862571910486148e-06, "loss": 0.1793, "step": 1088 }, { "epoch": 0.49070631970260226, "grad_norm": 0.9677656832455358, "learning_rate": 2.7825555816661503e-06, "loss": 0.1824, "step": 1089 }, { "epoch": 0.4911569223836882, "grad_norm": 0.9340259731910094, "learning_rate": 2.778853344713443e-06, "loss": 0.1846, "step": 1090 }, { "epoch": 0.4916075250647741, "grad_norm": 0.9199222980083757, "learning_rate": 2.7751504884133484e-06, "loss": 0.1725, "step": 1091 }, { "epoch": 0.4920581277458601, "grad_norm": 1.0357457170145339, "learning_rate": 2.7714470209900974e-06, "loss": 0.2018, "step": 1092 }, { "epoch": 0.49250873042694604, "grad_norm": 1.0034531580348318, "learning_rate": 2.7677429506692788e-06, "loss": 0.1843, "step": 1093 }, { "epoch": 0.49295933310803197, "grad_norm": 0.9227250756376808, "learning_rate": 2.76403828567782e-06, "loss": 0.1703, "step": 1094 }, { "epoch": 0.49340993578911796, "grad_norm": 1.1108052954539598, "learning_rate": 2.7603330342439686e-06, "loss": 0.1943, "step": 1095 }, { "epoch": 0.4938605384702039, "grad_norm": 1.0136992592552376, "learning_rate": 2.7566272045972777e-06, "loss": 0.2009, "step": 1096 }, { "epoch": 0.4943111411512898, "grad_norm": 1.0125863121371006, "learning_rate": 2.752920804968581e-06, "loss": 0.1933, "step": 1097 }, { "epoch": 0.4947617438323758, "grad_norm": 0.9430089748065448, "learning_rate": 2.7492138435899794e-06, "loss": 0.1862, "step": 1098 }, { "epoch": 0.49521234651346174, "grad_norm": 0.9233452523390339, "learning_rate": 2.745506328694822e-06, "loss": 0.1728, "step": 1099 }, { "epoch": 0.49566294919454773, "grad_norm": 0.9918009786809843, "learning_rate": 2.741798268517687e-06, "loss": 0.182, "step": 1100 }, { "epoch": 0.49611355187563366, "grad_norm": 0.9232139062649801, "learning_rate": 2.738089671294364e-06, "loss": 0.1838, "step": 1101 }, { "epoch": 0.4965641545567196, "grad_norm": 0.9492452596399719, "learning_rate": 2.734380545261835e-06, "loss": 0.1889, "step": 1102 }, { "epoch": 0.4970147572378056, "grad_norm": 0.9370507529188418, "learning_rate": 2.730670898658255e-06, "loss": 0.1852, "step": 1103 }, { "epoch": 0.4974653599188915, "grad_norm": 0.9675402055012017, "learning_rate": 2.726960739722939e-06, "loss": 0.1858, "step": 1104 }, { "epoch": 0.49791596259997745, "grad_norm": 0.9610111983594906, "learning_rate": 2.7232500766963373e-06, "loss": 0.1664, "step": 1105 }, { "epoch": 0.49836656528106343, "grad_norm": 0.9763726053294081, "learning_rate": 2.7195389178200194e-06, "loss": 0.1883, "step": 1106 }, { "epoch": 0.49881716796214937, "grad_norm": 0.8905104609381886, "learning_rate": 2.7158272713366573e-06, "loss": 0.1648, "step": 1107 }, { "epoch": 0.49926777064323535, "grad_norm": 0.9757044444911063, "learning_rate": 2.7121151454900048e-06, "loss": 0.1829, "step": 1108 }, { "epoch": 0.4997183733243213, "grad_norm": 0.9268012060057049, "learning_rate": 2.7084025485248827e-06, "loss": 0.1772, "step": 1109 }, { "epoch": 0.5001689760054072, "grad_norm": 0.96130114765164, "learning_rate": 2.7046894886871564e-06, "loss": 0.1813, "step": 1110 }, { "epoch": 0.5006195786864932, "grad_norm": 0.9083792140560981, "learning_rate": 2.700975974223719e-06, "loss": 0.1688, "step": 1111 }, { "epoch": 0.5010701813675791, "grad_norm": 0.9850647912366456, "learning_rate": 2.6972620133824745e-06, "loss": 0.1824, "step": 1112 }, { "epoch": 0.5015207840486651, "grad_norm": 0.9354381030521021, "learning_rate": 2.6935476144123173e-06, "loss": 0.1722, "step": 1113 }, { "epoch": 0.5019713867297511, "grad_norm": 0.9619636972861941, "learning_rate": 2.689832785563116e-06, "loss": 0.1833, "step": 1114 }, { "epoch": 0.502421989410837, "grad_norm": 1.0638894566317683, "learning_rate": 2.6861175350856937e-06, "loss": 0.1972, "step": 1115 }, { "epoch": 0.5028725920919229, "grad_norm": 1.0561930996704842, "learning_rate": 2.6824018712318084e-06, "loss": 0.1976, "step": 1116 }, { "epoch": 0.5033231947730089, "grad_norm": 0.9432639175030324, "learning_rate": 2.6786858022541385e-06, "loss": 0.1802, "step": 1117 }, { "epoch": 0.5037737974540949, "grad_norm": 1.0134235760379509, "learning_rate": 2.674969336406262e-06, "loss": 0.1984, "step": 1118 }, { "epoch": 0.5042244001351808, "grad_norm": 1.000797354765843, "learning_rate": 2.6712524819426355e-06, "loss": 0.2042, "step": 1119 }, { "epoch": 0.5046750028162668, "grad_norm": 0.9847284359031943, "learning_rate": 2.6675352471185824e-06, "loss": 0.1993, "step": 1120 }, { "epoch": 0.5051256054973527, "grad_norm": 0.9365176593909826, "learning_rate": 2.6638176401902693e-06, "loss": 0.1624, "step": 1121 }, { "epoch": 0.5055762081784386, "grad_norm": 0.9596545364462479, "learning_rate": 2.6600996694146876e-06, "loss": 0.1835, "step": 1122 }, { "epoch": 0.5060268108595246, "grad_norm": 0.9683771795353776, "learning_rate": 2.656381343049641e-06, "loss": 0.1707, "step": 1123 }, { "epoch": 0.5064774135406106, "grad_norm": 0.916376052612141, "learning_rate": 2.652662669353719e-06, "loss": 0.1723, "step": 1124 }, { "epoch": 0.5069280162216965, "grad_norm": 1.027462308260936, "learning_rate": 2.648943656586284e-06, "loss": 0.1814, "step": 1125 }, { "epoch": 0.5073786189027825, "grad_norm": 0.9776859126929065, "learning_rate": 2.6452243130074523e-06, "loss": 0.193, "step": 1126 }, { "epoch": 0.5078292215838685, "grad_norm": 0.9894419131466585, "learning_rate": 2.6415046468780726e-06, "loss": 0.173, "step": 1127 }, { "epoch": 0.5082798242649543, "grad_norm": 1.1271817158890742, "learning_rate": 2.637784666459714e-06, "loss": 0.1994, "step": 1128 }, { "epoch": 0.5087304269460403, "grad_norm": 1.1120470123407524, "learning_rate": 2.6340643800146387e-06, "loss": 0.2024, "step": 1129 }, { "epoch": 0.5091810296271263, "grad_norm": 0.9944937657896967, "learning_rate": 2.6303437958057932e-06, "loss": 0.1974, "step": 1130 }, { "epoch": 0.5096316323082122, "grad_norm": 1.0567641371999308, "learning_rate": 2.626622922096782e-06, "loss": 0.1935, "step": 1131 }, { "epoch": 0.5100822349892982, "grad_norm": 0.9953924398661689, "learning_rate": 2.622901767151855e-06, "loss": 0.1708, "step": 1132 }, { "epoch": 0.5105328376703842, "grad_norm": 0.9867187399054106, "learning_rate": 2.619180339235884e-06, "loss": 0.1738, "step": 1133 }, { "epoch": 0.5109834403514701, "grad_norm": 0.9951517194658875, "learning_rate": 2.6154586466143495e-06, "loss": 0.1813, "step": 1134 }, { "epoch": 0.511434043032556, "grad_norm": 1.0909408491594175, "learning_rate": 2.6117366975533187e-06, "loss": 0.1938, "step": 1135 }, { "epoch": 0.511884645713642, "grad_norm": 0.9980885855116574, "learning_rate": 2.60801450031943e-06, "loss": 0.1783, "step": 1136 }, { "epoch": 0.512335248394728, "grad_norm": 0.9713276180645412, "learning_rate": 2.604292063179871e-06, "loss": 0.187, "step": 1137 }, { "epoch": 0.5127858510758139, "grad_norm": 0.9996504568137589, "learning_rate": 2.600569394402363e-06, "loss": 0.1817, "step": 1138 }, { "epoch": 0.5132364537568999, "grad_norm": 1.079413432325811, "learning_rate": 2.596846502255142e-06, "loss": 0.1814, "step": 1139 }, { "epoch": 0.5136870564379858, "grad_norm": 0.9937278770929203, "learning_rate": 2.5931233950069385e-06, "loss": 0.1817, "step": 1140 }, { "epoch": 0.5141376591190717, "grad_norm": 0.9741045869421943, "learning_rate": 2.589400080926964e-06, "loss": 0.1811, "step": 1141 }, { "epoch": 0.5145882618001577, "grad_norm": 0.9975149063522775, "learning_rate": 2.585676568284886e-06, "loss": 0.1828, "step": 1142 }, { "epoch": 0.5150388644812437, "grad_norm": 0.942867421002799, "learning_rate": 2.581952865350815e-06, "loss": 0.1763, "step": 1143 }, { "epoch": 0.5154894671623296, "grad_norm": 0.9346474421337058, "learning_rate": 2.578228980395283e-06, "loss": 0.1752, "step": 1144 }, { "epoch": 0.5159400698434156, "grad_norm": 0.9362968071159, "learning_rate": 2.5745049216892286e-06, "loss": 0.1844, "step": 1145 }, { "epoch": 0.5163906725245015, "grad_norm": 0.9218340206816221, "learning_rate": 2.570780697503973e-06, "loss": 0.1731, "step": 1146 }, { "epoch": 0.5168412752055874, "grad_norm": 0.9672691329516838, "learning_rate": 2.5670563161112073e-06, "loss": 0.175, "step": 1147 }, { "epoch": 0.5172918778866734, "grad_norm": 1.0414538127693016, "learning_rate": 2.56333178578297e-06, "loss": 0.1918, "step": 1148 }, { "epoch": 0.5177424805677594, "grad_norm": 0.9228840524787749, "learning_rate": 2.5596071147916325e-06, "loss": 0.1855, "step": 1149 }, { "epoch": 0.5181930832488453, "grad_norm": 0.9483825423269993, "learning_rate": 2.555882311409878e-06, "loss": 0.1808, "step": 1150 }, { "epoch": 0.5186436859299313, "grad_norm": 0.943622736088532, "learning_rate": 2.5521573839106815e-06, "loss": 0.1624, "step": 1151 }, { "epoch": 0.5190942886110173, "grad_norm": 0.9228808632463539, "learning_rate": 2.5484323405672965e-06, "loss": 0.1651, "step": 1152 }, { "epoch": 0.5195448912921032, "grad_norm": 0.9450047308942011, "learning_rate": 2.544707189653233e-06, "loss": 0.1828, "step": 1153 }, { "epoch": 0.5199954939731891, "grad_norm": 1.0551519234183033, "learning_rate": 2.5409819394422386e-06, "loss": 0.1832, "step": 1154 }, { "epoch": 0.5204460966542751, "grad_norm": 0.9628349705620093, "learning_rate": 2.5372565982082843e-06, "loss": 0.1779, "step": 1155 }, { "epoch": 0.5208966993353611, "grad_norm": 0.9919441111964074, "learning_rate": 2.5335311742255392e-06, "loss": 0.176, "step": 1156 }, { "epoch": 0.521347302016447, "grad_norm": 1.000419350534717, "learning_rate": 2.5298056757683604e-06, "loss": 0.1931, "step": 1157 }, { "epoch": 0.521797904697533, "grad_norm": 0.9758752044237246, "learning_rate": 2.5260801111112677e-06, "loss": 0.1807, "step": 1158 }, { "epoch": 0.5222485073786189, "grad_norm": 1.0378540918761032, "learning_rate": 2.5223544885289287e-06, "loss": 0.1716, "step": 1159 }, { "epoch": 0.5226991100597048, "grad_norm": 0.9893420988758019, "learning_rate": 2.518628816296139e-06, "loss": 0.1878, "step": 1160 }, { "epoch": 0.5231497127407908, "grad_norm": 1.0141282352076857, "learning_rate": 2.5149031026878063e-06, "loss": 0.1768, "step": 1161 }, { "epoch": 0.5236003154218768, "grad_norm": 1.0045365971161448, "learning_rate": 2.5111773559789277e-06, "loss": 0.1845, "step": 1162 }, { "epoch": 0.5240509181029627, "grad_norm": 0.9600097817814024, "learning_rate": 2.5074515844445774e-06, "loss": 0.1658, "step": 1163 }, { "epoch": 0.5245015207840487, "grad_norm": 0.9917555687212044, "learning_rate": 2.50372579635988e-06, "loss": 0.1892, "step": 1164 }, { "epoch": 0.5249521234651346, "grad_norm": 0.932460946654997, "learning_rate": 2.5e-06, "loss": 0.1808, "step": 1165 }, { "epoch": 0.5254027261462205, "grad_norm": 0.9802496388899974, "learning_rate": 2.4962742036401213e-06, "loss": 0.1967, "step": 1166 }, { "epoch": 0.5258533288273065, "grad_norm": 0.963276152099357, "learning_rate": 2.4925484155554235e-06, "loss": 0.1861, "step": 1167 }, { "epoch": 0.5263039315083925, "grad_norm": 1.007968690345994, "learning_rate": 2.4888226440210723e-06, "loss": 0.1948, "step": 1168 }, { "epoch": 0.5267545341894784, "grad_norm": 1.0312240246625133, "learning_rate": 2.4850968973121945e-06, "loss": 0.1878, "step": 1169 }, { "epoch": 0.5272051368705644, "grad_norm": 0.969518162593365, "learning_rate": 2.481371183703862e-06, "loss": 0.1786, "step": 1170 }, { "epoch": 0.5276557395516503, "grad_norm": 0.9951428379491346, "learning_rate": 2.477645511471073e-06, "loss": 0.1922, "step": 1171 }, { "epoch": 0.5281063422327363, "grad_norm": 0.9458709445109998, "learning_rate": 2.473919888888733e-06, "loss": 0.1909, "step": 1172 }, { "epoch": 0.5285569449138222, "grad_norm": 0.9502320144205059, "learning_rate": 2.4701943242316405e-06, "loss": 0.1845, "step": 1173 }, { "epoch": 0.5290075475949082, "grad_norm": 1.0147978070284416, "learning_rate": 2.466468825774461e-06, "loss": 0.1909, "step": 1174 }, { "epoch": 0.5294581502759942, "grad_norm": 0.9538740775918518, "learning_rate": 2.462743401791716e-06, "loss": 0.1719, "step": 1175 }, { "epoch": 0.5299087529570801, "grad_norm": 1.0356645892445344, "learning_rate": 2.4590180605577614e-06, "loss": 0.1772, "step": 1176 }, { "epoch": 0.530359355638166, "grad_norm": 0.9667337701658915, "learning_rate": 2.4552928103467677e-06, "loss": 0.1851, "step": 1177 }, { "epoch": 0.530809958319252, "grad_norm": 1.0215511619289341, "learning_rate": 2.4515676594327035e-06, "loss": 0.1862, "step": 1178 }, { "epoch": 0.5312605610003379, "grad_norm": 1.080239159714636, "learning_rate": 2.4478426160893197e-06, "loss": 0.1938, "step": 1179 }, { "epoch": 0.5317111636814239, "grad_norm": 0.978810820359603, "learning_rate": 2.4441176885901234e-06, "loss": 0.1929, "step": 1180 }, { "epoch": 0.5321617663625099, "grad_norm": 1.0027349214556345, "learning_rate": 2.440392885208368e-06, "loss": 0.1827, "step": 1181 }, { "epoch": 0.5326123690435958, "grad_norm": 0.9746023894623473, "learning_rate": 2.436668214217031e-06, "loss": 0.1741, "step": 1182 }, { "epoch": 0.5330629717246818, "grad_norm": 0.9550186212617013, "learning_rate": 2.4329436838887936e-06, "loss": 0.1636, "step": 1183 }, { "epoch": 0.5335135744057677, "grad_norm": 1.052266512449798, "learning_rate": 2.4292193024960275e-06, "loss": 0.193, "step": 1184 }, { "epoch": 0.5339641770868536, "grad_norm": 1.0365594108241232, "learning_rate": 2.425495078310772e-06, "loss": 0.1916, "step": 1185 }, { "epoch": 0.5344147797679396, "grad_norm": 1.0742016996862758, "learning_rate": 2.4217710196047166e-06, "loss": 0.1945, "step": 1186 }, { "epoch": 0.5348653824490256, "grad_norm": 0.9998451715585931, "learning_rate": 2.4180471346491864e-06, "loss": 0.1799, "step": 1187 }, { "epoch": 0.5353159851301115, "grad_norm": 1.0481115869339797, "learning_rate": 2.414323431715115e-06, "loss": 0.2012, "step": 1188 }, { "epoch": 0.5357665878111975, "grad_norm": 0.9740939934965689, "learning_rate": 2.410599919073037e-06, "loss": 0.1907, "step": 1189 }, { "epoch": 0.5362171904922834, "grad_norm": 0.9782525842641908, "learning_rate": 2.4068766049930623e-06, "loss": 0.1829, "step": 1190 }, { "epoch": 0.5366677931733694, "grad_norm": 0.926823673464016, "learning_rate": 2.403153497744859e-06, "loss": 0.1731, "step": 1191 }, { "epoch": 0.5371183958544553, "grad_norm": 0.9317322933086251, "learning_rate": 2.3994306055976374e-06, "loss": 0.1766, "step": 1192 }, { "epoch": 0.5375689985355413, "grad_norm": 0.906705256516563, "learning_rate": 2.3957079368201293e-06, "loss": 0.1637, "step": 1193 }, { "epoch": 0.5380196012166273, "grad_norm": 0.9111380070644578, "learning_rate": 2.39198549968057e-06, "loss": 0.1797, "step": 1194 }, { "epoch": 0.5384702038977132, "grad_norm": 1.0235451500244883, "learning_rate": 2.3882633024466813e-06, "loss": 0.1902, "step": 1195 }, { "epoch": 0.5389208065787991, "grad_norm": 0.919670741033484, "learning_rate": 2.3845413533856517e-06, "loss": 0.1713, "step": 1196 }, { "epoch": 0.5393714092598851, "grad_norm": 1.0088308399238464, "learning_rate": 2.3808196607641176e-06, "loss": 0.194, "step": 1197 }, { "epoch": 0.539822011940971, "grad_norm": 0.92840775528469, "learning_rate": 2.3770982328481464e-06, "loss": 0.1841, "step": 1198 }, { "epoch": 0.540272614622057, "grad_norm": 0.9848018352185613, "learning_rate": 2.3733770779032185e-06, "loss": 0.1635, "step": 1199 }, { "epoch": 0.540723217303143, "grad_norm": 0.9327976302812879, "learning_rate": 2.3696562041942076e-06, "loss": 0.1804, "step": 1200 }, { "epoch": 0.5411738199842289, "grad_norm": 0.9655142952949034, "learning_rate": 2.3659356199853617e-06, "loss": 0.1825, "step": 1201 }, { "epoch": 0.5416244226653149, "grad_norm": 1.0219936814274773, "learning_rate": 2.362215333540287e-06, "loss": 0.1968, "step": 1202 }, { "epoch": 0.5420750253464008, "grad_norm": 0.8904345856244417, "learning_rate": 2.3584953531219278e-06, "loss": 0.1662, "step": 1203 }, { "epoch": 0.5425256280274867, "grad_norm": 0.9514585972491365, "learning_rate": 2.3547756869925485e-06, "loss": 0.1766, "step": 1204 }, { "epoch": 0.5429762307085727, "grad_norm": 1.0241101215031847, "learning_rate": 2.3510563434137175e-06, "loss": 0.1767, "step": 1205 }, { "epoch": 0.5434268333896587, "grad_norm": 0.9578253213796663, "learning_rate": 2.347337330646282e-06, "loss": 0.1772, "step": 1206 }, { "epoch": 0.5438774360707446, "grad_norm": 0.9817433827617003, "learning_rate": 2.3436186569503598e-06, "loss": 0.1936, "step": 1207 }, { "epoch": 0.5443280387518306, "grad_norm": 0.9864355407353985, "learning_rate": 2.339900330585313e-06, "loss": 0.1932, "step": 1208 }, { "epoch": 0.5447786414329165, "grad_norm": 0.9722800161647528, "learning_rate": 2.3361823598097316e-06, "loss": 0.1852, "step": 1209 }, { "epoch": 0.5452292441140025, "grad_norm": 1.0342230857446506, "learning_rate": 2.332464752881418e-06, "loss": 0.1955, "step": 1210 }, { "epoch": 0.5456798467950884, "grad_norm": 0.9414693753247567, "learning_rate": 2.3287475180573653e-06, "loss": 0.182, "step": 1211 }, { "epoch": 0.5461304494761744, "grad_norm": 0.8860482726242138, "learning_rate": 2.3250306635937385e-06, "loss": 0.1634, "step": 1212 }, { "epoch": 0.5465810521572604, "grad_norm": 0.9469723822144591, "learning_rate": 2.3213141977458615e-06, "loss": 0.184, "step": 1213 }, { "epoch": 0.5470316548383463, "grad_norm": 0.8631153012390324, "learning_rate": 2.3175981287681924e-06, "loss": 0.1573, "step": 1214 }, { "epoch": 0.5474822575194322, "grad_norm": 0.9624737172159589, "learning_rate": 2.3138824649143076e-06, "loss": 0.1709, "step": 1215 }, { "epoch": 0.5479328602005182, "grad_norm": 0.981564862048761, "learning_rate": 2.310167214436885e-06, "loss": 0.1905, "step": 1216 }, { "epoch": 0.5483834628816041, "grad_norm": 0.9860990048581212, "learning_rate": 2.306452385587683e-06, "loss": 0.1893, "step": 1217 }, { "epoch": 0.5488340655626901, "grad_norm": 0.9997846222056105, "learning_rate": 2.3027379866175263e-06, "loss": 0.1816, "step": 1218 }, { "epoch": 0.5492846682437761, "grad_norm": 0.9839070512185661, "learning_rate": 2.2990240257762817e-06, "loss": 0.1789, "step": 1219 }, { "epoch": 0.549735270924862, "grad_norm": 1.010734431680887, "learning_rate": 2.295310511312844e-06, "loss": 0.1811, "step": 1220 }, { "epoch": 0.550185873605948, "grad_norm": 0.9636102309687704, "learning_rate": 2.2915974514751173e-06, "loss": 0.1811, "step": 1221 }, { "epoch": 0.5506364762870339, "grad_norm": 0.9112518930451056, "learning_rate": 2.287884854509995e-06, "loss": 0.17, "step": 1222 }, { "epoch": 0.5510870789681198, "grad_norm": 0.9724041065846214, "learning_rate": 2.2841727286633444e-06, "loss": 0.1809, "step": 1223 }, { "epoch": 0.5515376816492058, "grad_norm": 0.8976637885767892, "learning_rate": 2.280461082179982e-06, "loss": 0.1531, "step": 1224 }, { "epoch": 0.5519882843302918, "grad_norm": 0.9710139357320663, "learning_rate": 2.2767499233036635e-06, "loss": 0.1715, "step": 1225 }, { "epoch": 0.5524388870113777, "grad_norm": 1.022286767055151, "learning_rate": 2.2730392602770617e-06, "loss": 0.1831, "step": 1226 }, { "epoch": 0.5528894896924637, "grad_norm": 0.9629754668562253, "learning_rate": 2.269329101341745e-06, "loss": 0.1729, "step": 1227 }, { "epoch": 0.5533400923735496, "grad_norm": 0.9665296459469704, "learning_rate": 2.265619454738166e-06, "loss": 0.1732, "step": 1228 }, { "epoch": 0.5537906950546356, "grad_norm": 1.0583946774908262, "learning_rate": 2.2619103287056366e-06, "loss": 0.1897, "step": 1229 }, { "epoch": 0.5542412977357215, "grad_norm": 1.0533800740583372, "learning_rate": 2.2582017314823135e-06, "loss": 0.1825, "step": 1230 }, { "epoch": 0.5546919004168075, "grad_norm": 0.9304386750010083, "learning_rate": 2.254493671305179e-06, "loss": 0.1746, "step": 1231 }, { "epoch": 0.5551425030978935, "grad_norm": 0.9594946709485886, "learning_rate": 2.250786156410022e-06, "loss": 0.1714, "step": 1232 }, { "epoch": 0.5555931057789794, "grad_norm": 1.0124409711112767, "learning_rate": 2.24707919503142e-06, "loss": 0.1887, "step": 1233 }, { "epoch": 0.5560437084600653, "grad_norm": 1.110284743291757, "learning_rate": 2.2433727954027227e-06, "loss": 0.1724, "step": 1234 }, { "epoch": 0.5564943111411513, "grad_norm": 0.9968654870523563, "learning_rate": 2.239666965756032e-06, "loss": 0.1838, "step": 1235 }, { "epoch": 0.5569449138222372, "grad_norm": 1.0236544356271942, "learning_rate": 2.2359617143221805e-06, "loss": 0.1798, "step": 1236 }, { "epoch": 0.5573955165033232, "grad_norm": 1.0263790451012669, "learning_rate": 2.232257049330722e-06, "loss": 0.1988, "step": 1237 }, { "epoch": 0.5578461191844092, "grad_norm": 0.9338162135659887, "learning_rate": 2.2285529790099034e-06, "loss": 0.1749, "step": 1238 }, { "epoch": 0.5582967218654951, "grad_norm": 1.0169422018470666, "learning_rate": 2.224849511586652e-06, "loss": 0.1837, "step": 1239 }, { "epoch": 0.558747324546581, "grad_norm": 1.045923343357873, "learning_rate": 2.221146655286558e-06, "loss": 0.1913, "step": 1240 }, { "epoch": 0.559197927227667, "grad_norm": 0.9631444964816668, "learning_rate": 2.21744441833385e-06, "loss": 0.1861, "step": 1241 }, { "epoch": 0.5596485299087529, "grad_norm": 0.9529062092810353, "learning_rate": 2.2137428089513857e-06, "loss": 0.1811, "step": 1242 }, { "epoch": 0.5600991325898389, "grad_norm": 1.0154141341033607, "learning_rate": 2.2100418353606262e-06, "loss": 0.1794, "step": 1243 }, { "epoch": 0.5605497352709249, "grad_norm": 0.9766059599452342, "learning_rate": 2.20634150578162e-06, "loss": 0.1819, "step": 1244 }, { "epoch": 0.5610003379520108, "grad_norm": 0.932965228493953, "learning_rate": 2.202641828432988e-06, "loss": 0.1748, "step": 1245 }, { "epoch": 0.5614509406330968, "grad_norm": 0.9702059483136485, "learning_rate": 2.1989428115319005e-06, "loss": 0.1839, "step": 1246 }, { "epoch": 0.5619015433141827, "grad_norm": 1.0785932501489395, "learning_rate": 2.19524446329406e-06, "loss": 0.187, "step": 1247 }, { "epoch": 0.5623521459952687, "grad_norm": 0.9990242640449595, "learning_rate": 2.1915467919336862e-06, "loss": 0.1842, "step": 1248 }, { "epoch": 0.5628027486763546, "grad_norm": 1.0202625100556078, "learning_rate": 2.1878498056634946e-06, "loss": 0.1795, "step": 1249 }, { "epoch": 0.5632533513574406, "grad_norm": 1.0275123282984282, "learning_rate": 2.1841535126946777e-06, "loss": 0.1832, "step": 1250 }, { "epoch": 0.5637039540385266, "grad_norm": 1.0132575380436037, "learning_rate": 2.180457921236889e-06, "loss": 0.1765, "step": 1251 }, { "epoch": 0.5641545567196125, "grad_norm": 1.003409258685893, "learning_rate": 2.176763039498225e-06, "loss": 0.1879, "step": 1252 }, { "epoch": 0.5646051594006984, "grad_norm": 0.9874249442484657, "learning_rate": 2.1730688756852046e-06, "loss": 0.195, "step": 1253 }, { "epoch": 0.5650557620817844, "grad_norm": 0.9868641897122264, "learning_rate": 2.1693754380027533e-06, "loss": 0.177, "step": 1254 }, { "epoch": 0.5655063647628703, "grad_norm": 1.0504175835437466, "learning_rate": 2.165682734654181e-06, "loss": 0.1825, "step": 1255 }, { "epoch": 0.5659569674439563, "grad_norm": 1.0619516029800902, "learning_rate": 2.161990773841171e-06, "loss": 0.1922, "step": 1256 }, { "epoch": 0.5664075701250423, "grad_norm": 1.0344653262001975, "learning_rate": 2.1582995637637543e-06, "loss": 0.1783, "step": 1257 }, { "epoch": 0.5668581728061282, "grad_norm": 0.9973482494131128, "learning_rate": 2.1546091126202955e-06, "loss": 0.1778, "step": 1258 }, { "epoch": 0.5673087754872141, "grad_norm": 0.9561514880489893, "learning_rate": 2.150919428607472e-06, "loss": 0.1726, "step": 1259 }, { "epoch": 0.5677593781683001, "grad_norm": 1.0191580883847822, "learning_rate": 2.147230519920259e-06, "loss": 0.1741, "step": 1260 }, { "epoch": 0.568209980849386, "grad_norm": 1.0366037060072417, "learning_rate": 2.143542394751911e-06, "loss": 0.1833, "step": 1261 }, { "epoch": 0.568660583530472, "grad_norm": 1.0454064471673492, "learning_rate": 2.139855061293939e-06, "loss": 0.1896, "step": 1262 }, { "epoch": 0.569111186211558, "grad_norm": 1.00899648660538, "learning_rate": 2.1361685277360973e-06, "loss": 0.1747, "step": 1263 }, { "epoch": 0.5695617888926439, "grad_norm": 1.0075600486399183, "learning_rate": 2.132482802266364e-06, "loss": 0.1892, "step": 1264 }, { "epoch": 0.5700123915737298, "grad_norm": 0.9291801587696538, "learning_rate": 2.128797893070922e-06, "loss": 0.1563, "step": 1265 }, { "epoch": 0.5704629942548158, "grad_norm": 0.9408535292316663, "learning_rate": 2.1251138083341404e-06, "loss": 0.1759, "step": 1266 }, { "epoch": 0.5709135969359018, "grad_norm": 0.9543537844054255, "learning_rate": 2.1214305562385592e-06, "loss": 0.1833, "step": 1267 }, { "epoch": 0.5713641996169877, "grad_norm": 1.0435966860876906, "learning_rate": 2.117748144964865e-06, "loss": 0.1857, "step": 1268 }, { "epoch": 0.5718148022980737, "grad_norm": 1.0360293503534796, "learning_rate": 2.1140665826918823e-06, "loss": 0.1922, "step": 1269 }, { "epoch": 0.5722654049791597, "grad_norm": 0.9475430842301149, "learning_rate": 2.1103858775965455e-06, "loss": 0.1717, "step": 1270 }, { "epoch": 0.5727160076602456, "grad_norm": 0.9873360818255287, "learning_rate": 2.106706037853887e-06, "loss": 0.1766, "step": 1271 }, { "epoch": 0.5731666103413315, "grad_norm": 0.9988576739952401, "learning_rate": 2.103027071637018e-06, "loss": 0.1917, "step": 1272 }, { "epoch": 0.5736172130224175, "grad_norm": 0.9794579225845605, "learning_rate": 2.099348987117108e-06, "loss": 0.1652, "step": 1273 }, { "epoch": 0.5740678157035034, "grad_norm": 0.9049004556537857, "learning_rate": 2.095671792463368e-06, "loss": 0.1735, "step": 1274 }, { "epoch": 0.5745184183845894, "grad_norm": 0.9161327176570143, "learning_rate": 2.0919954958430357e-06, "loss": 0.1733, "step": 1275 }, { "epoch": 0.5749690210656754, "grad_norm": 0.9504311110927137, "learning_rate": 2.0883201054213493e-06, "loss": 0.1822, "step": 1276 }, { "epoch": 0.5754196237467613, "grad_norm": 1.0141813177166452, "learning_rate": 2.0846456293615384e-06, "loss": 0.1872, "step": 1277 }, { "epoch": 0.5758702264278472, "grad_norm": 0.9481128040139777, "learning_rate": 2.0809720758247997e-06, "loss": 0.1764, "step": 1278 }, { "epoch": 0.5763208291089332, "grad_norm": 0.9956268281053715, "learning_rate": 2.077299452970282e-06, "loss": 0.1839, "step": 1279 }, { "epoch": 0.5767714317900191, "grad_norm": 0.9221100295953925, "learning_rate": 2.0736277689550655e-06, "loss": 0.1696, "step": 1280 }, { "epoch": 0.5772220344711051, "grad_norm": 0.9992405251980395, "learning_rate": 2.069957031934147e-06, "loss": 0.174, "step": 1281 }, { "epoch": 0.5776726371521911, "grad_norm": 0.975709935305381, "learning_rate": 2.066287250060418e-06, "loss": 0.1685, "step": 1282 }, { "epoch": 0.578123239833277, "grad_norm": 1.0276498478372256, "learning_rate": 2.062618431484652e-06, "loss": 0.168, "step": 1283 }, { "epoch": 0.578573842514363, "grad_norm": 0.9649717673625273, "learning_rate": 2.05895058435548e-06, "loss": 0.1735, "step": 1284 }, { "epoch": 0.5790244451954489, "grad_norm": 1.0183890478781983, "learning_rate": 2.0552837168193738e-06, "loss": 0.1793, "step": 1285 }, { "epoch": 0.5794750478765348, "grad_norm": 0.9693380530197512, "learning_rate": 2.051617837020633e-06, "loss": 0.175, "step": 1286 }, { "epoch": 0.5799256505576208, "grad_norm": 1.0035692278148098, "learning_rate": 2.047952953101363e-06, "loss": 0.1716, "step": 1287 }, { "epoch": 0.5803762532387068, "grad_norm": 0.9543283088936171, "learning_rate": 2.0442890732014563e-06, "loss": 0.1782, "step": 1288 }, { "epoch": 0.5808268559197928, "grad_norm": 0.9856491679341663, "learning_rate": 2.040626205458574e-06, "loss": 0.1926, "step": 1289 }, { "epoch": 0.5812774586008786, "grad_norm": 0.9368718149030635, "learning_rate": 2.0369643580081326e-06, "loss": 0.1745, "step": 1290 }, { "epoch": 0.5817280612819646, "grad_norm": 1.054142375462708, "learning_rate": 2.0333035389832795e-06, "loss": 0.1881, "step": 1291 }, { "epoch": 0.5821786639630506, "grad_norm": 1.0454295548530987, "learning_rate": 2.0296437565148786e-06, "loss": 0.1927, "step": 1292 }, { "epoch": 0.5826292666441365, "grad_norm": 0.9451479577628555, "learning_rate": 2.025985018731494e-06, "loss": 0.1698, "step": 1293 }, { "epoch": 0.5830798693252225, "grad_norm": 1.0436155713832889, "learning_rate": 2.0223273337593647e-06, "loss": 0.1911, "step": 1294 }, { "epoch": 0.5835304720063085, "grad_norm": 0.9725657216642605, "learning_rate": 2.0186707097223952e-06, "loss": 0.1693, "step": 1295 }, { "epoch": 0.5839810746873944, "grad_norm": 0.9906222847408853, "learning_rate": 2.0150151547421333e-06, "loss": 0.1912, "step": 1296 }, { "epoch": 0.5844316773684803, "grad_norm": 0.9044813618430019, "learning_rate": 2.0113606769377497e-06, "loss": 0.1733, "step": 1297 }, { "epoch": 0.5848822800495663, "grad_norm": 0.9667967965279892, "learning_rate": 2.0077072844260267e-06, "loss": 0.1761, "step": 1298 }, { "epoch": 0.5853328827306522, "grad_norm": 0.9802438293326249, "learning_rate": 2.0040549853213326e-06, "loss": 0.1676, "step": 1299 }, { "epoch": 0.5857834854117382, "grad_norm": 0.972410342837984, "learning_rate": 2.0004037877356085e-06, "loss": 0.1904, "step": 1300 }, { "epoch": 0.5862340880928242, "grad_norm": 0.9597522742672756, "learning_rate": 1.9967536997783495e-06, "loss": 0.1834, "step": 1301 }, { "epoch": 0.5866846907739101, "grad_norm": 0.9814692842912235, "learning_rate": 1.9931047295565863e-06, "loss": 0.1732, "step": 1302 }, { "epoch": 0.587135293454996, "grad_norm": 0.9533163314110267, "learning_rate": 1.989456885174865e-06, "loss": 0.1723, "step": 1303 }, { "epoch": 0.587585896136082, "grad_norm": 1.0013117897122807, "learning_rate": 1.9858101747352326e-06, "loss": 0.1831, "step": 1304 }, { "epoch": 0.5880364988171679, "grad_norm": 1.0237539849809265, "learning_rate": 1.9821646063372174e-06, "loss": 0.1924, "step": 1305 }, { "epoch": 0.5884871014982539, "grad_norm": 1.0430023568792837, "learning_rate": 1.978520188077813e-06, "loss": 0.1798, "step": 1306 }, { "epoch": 0.5889377041793399, "grad_norm": 0.9744176987681425, "learning_rate": 1.9748769280514544e-06, "loss": 0.1777, "step": 1307 }, { "epoch": 0.5893883068604259, "grad_norm": 0.9218200088489699, "learning_rate": 1.971234834350008e-06, "loss": 0.1732, "step": 1308 }, { "epoch": 0.5898389095415117, "grad_norm": 0.9739010233717921, "learning_rate": 1.967593915062748e-06, "loss": 0.1836, "step": 1309 }, { "epoch": 0.5902895122225977, "grad_norm": 0.9373857688708352, "learning_rate": 1.96395417827634e-06, "loss": 0.172, "step": 1310 }, { "epoch": 0.5907401149036837, "grad_norm": 1.0305890099141217, "learning_rate": 1.960315632074824e-06, "loss": 0.1891, "step": 1311 }, { "epoch": 0.5911907175847696, "grad_norm": 0.9677534466523053, "learning_rate": 1.9566782845395945e-06, "loss": 0.1771, "step": 1312 }, { "epoch": 0.5916413202658556, "grad_norm": 0.8941128931209235, "learning_rate": 1.9530421437493843e-06, "loss": 0.1695, "step": 1313 }, { "epoch": 0.5920919229469416, "grad_norm": 0.9555412056280095, "learning_rate": 1.949407217780247e-06, "loss": 0.1672, "step": 1314 }, { "epoch": 0.5925425256280274, "grad_norm": 0.9689284207826656, "learning_rate": 1.945773514705537e-06, "loss": 0.1622, "step": 1315 }, { "epoch": 0.5929931283091134, "grad_norm": 0.9930091192518737, "learning_rate": 1.9421410425958915e-06, "loss": 0.1744, "step": 1316 }, { "epoch": 0.5934437309901994, "grad_norm": 0.9963914708795829, "learning_rate": 1.938509809519216e-06, "loss": 0.1822, "step": 1317 }, { "epoch": 0.5938943336712853, "grad_norm": 1.0169006281779729, "learning_rate": 1.934879823540663e-06, "loss": 0.1743, "step": 1318 }, { "epoch": 0.5943449363523713, "grad_norm": 0.9630659155754449, "learning_rate": 1.931251092722615e-06, "loss": 0.1734, "step": 1319 }, { "epoch": 0.5947955390334573, "grad_norm": 1.038229967298548, "learning_rate": 1.9276236251246655e-06, "loss": 0.1952, "step": 1320 }, { "epoch": 0.5952461417145432, "grad_norm": 1.0316333623222642, "learning_rate": 1.9239974288036044e-06, "loss": 0.1758, "step": 1321 }, { "epoch": 0.5956967443956291, "grad_norm": 0.9971216934509304, "learning_rate": 1.920372511813397e-06, "loss": 0.1866, "step": 1322 }, { "epoch": 0.5961473470767151, "grad_norm": 0.9896930008148693, "learning_rate": 1.916748882205168e-06, "loss": 0.1864, "step": 1323 }, { "epoch": 0.596597949757801, "grad_norm": 0.9268799065373754, "learning_rate": 1.913126548027181e-06, "loss": 0.1754, "step": 1324 }, { "epoch": 0.597048552438887, "grad_norm": 0.9682132912782482, "learning_rate": 1.909505517324825e-06, "loss": 0.1792, "step": 1325 }, { "epoch": 0.597499155119973, "grad_norm": 0.9930817836872204, "learning_rate": 1.905885798140591e-06, "loss": 0.1713, "step": 1326 }, { "epoch": 0.597949757801059, "grad_norm": 0.9429726025467023, "learning_rate": 1.9022673985140585e-06, "loss": 0.1676, "step": 1327 }, { "epoch": 0.5984003604821448, "grad_norm": 0.9461399995953571, "learning_rate": 1.8986503264818785e-06, "loss": 0.1687, "step": 1328 }, { "epoch": 0.5988509631632308, "grad_norm": 0.9023598770727506, "learning_rate": 1.895034590077747e-06, "loss": 0.1723, "step": 1329 }, { "epoch": 0.5993015658443168, "grad_norm": 0.9853066110059597, "learning_rate": 1.8914201973324004e-06, "loss": 0.1651, "step": 1330 }, { "epoch": 0.5997521685254027, "grad_norm": 1.0013857184348935, "learning_rate": 1.8878071562735873e-06, "loss": 0.1876, "step": 1331 }, { "epoch": 0.6002027712064887, "grad_norm": 1.0148699485876223, "learning_rate": 1.8841954749260535e-06, "loss": 0.1723, "step": 1332 }, { "epoch": 0.6006533738875747, "grad_norm": 0.9383847419000086, "learning_rate": 1.8805851613115278e-06, "loss": 0.1698, "step": 1333 }, { "epoch": 0.6011039765686605, "grad_norm": 0.9219094194233376, "learning_rate": 1.8769762234486982e-06, "loss": 0.1752, "step": 1334 }, { "epoch": 0.6015545792497465, "grad_norm": 0.9551193816669125, "learning_rate": 1.8733686693531986e-06, "loss": 0.1876, "step": 1335 }, { "epoch": 0.6020051819308325, "grad_norm": 0.9149029805316468, "learning_rate": 1.8697625070375893e-06, "loss": 0.1771, "step": 1336 }, { "epoch": 0.6024557846119184, "grad_norm": 0.965925307004654, "learning_rate": 1.8661577445113399e-06, "loss": 0.1779, "step": 1337 }, { "epoch": 0.6029063872930044, "grad_norm": 0.9551150150198514, "learning_rate": 1.8625543897808094e-06, "loss": 0.181, "step": 1338 }, { "epoch": 0.6033569899740904, "grad_norm": 0.898041242799122, "learning_rate": 1.8589524508492308e-06, "loss": 0.1739, "step": 1339 }, { "epoch": 0.6038075926551763, "grad_norm": 0.9870301856111156, "learning_rate": 1.8553519357166927e-06, "loss": 0.1762, "step": 1340 }, { "epoch": 0.6042581953362622, "grad_norm": 0.9238496565541336, "learning_rate": 1.8517528523801226e-06, "loss": 0.1631, "step": 1341 }, { "epoch": 0.6047087980173482, "grad_norm": 0.9021602741648262, "learning_rate": 1.8481552088332656e-06, "loss": 0.1776, "step": 1342 }, { "epoch": 0.6051594006984341, "grad_norm": 0.9457868006313538, "learning_rate": 1.84455901306667e-06, "loss": 0.1815, "step": 1343 }, { "epoch": 0.6056100033795201, "grad_norm": 0.9644972060860808, "learning_rate": 1.8409642730676692e-06, "loss": 0.1953, "step": 1344 }, { "epoch": 0.6060606060606061, "grad_norm": 0.9569847068003844, "learning_rate": 1.8373709968203624e-06, "loss": 0.1861, "step": 1345 }, { "epoch": 0.6065112087416921, "grad_norm": 1.0258866036102268, "learning_rate": 1.8337791923055983e-06, "loss": 0.1874, "step": 1346 }, { "epoch": 0.6069618114227779, "grad_norm": 0.9898107412694851, "learning_rate": 1.8301888675009554e-06, "loss": 0.1753, "step": 1347 }, { "epoch": 0.6074124141038639, "grad_norm": 0.9746476143849012, "learning_rate": 1.8266000303807272e-06, "loss": 0.1873, "step": 1348 }, { "epoch": 0.6078630167849499, "grad_norm": 0.9905149297692676, "learning_rate": 1.8230126889159027e-06, "loss": 0.1897, "step": 1349 }, { "epoch": 0.6083136194660358, "grad_norm": 0.9566033353505723, "learning_rate": 1.8194268510741493e-06, "loss": 0.1896, "step": 1350 }, { "epoch": 0.6087642221471218, "grad_norm": 0.9952340005510767, "learning_rate": 1.8158425248197931e-06, "loss": 0.1772, "step": 1351 }, { "epoch": 0.6092148248282078, "grad_norm": 0.9738307656928912, "learning_rate": 1.812259718113805e-06, "loss": 0.1736, "step": 1352 }, { "epoch": 0.6096654275092936, "grad_norm": 0.9621906341557562, "learning_rate": 1.8086784389137796e-06, "loss": 0.1754, "step": 1353 }, { "epoch": 0.6101160301903796, "grad_norm": 0.891429745139326, "learning_rate": 1.8050986951739201e-06, "loss": 0.1486, "step": 1354 }, { "epoch": 0.6105666328714656, "grad_norm": 1.0068295759803783, "learning_rate": 1.8015204948450166e-06, "loss": 0.1941, "step": 1355 }, { "epoch": 0.6110172355525515, "grad_norm": 0.9585690106550426, "learning_rate": 1.7979438458744343e-06, "loss": 0.1841, "step": 1356 }, { "epoch": 0.6114678382336375, "grad_norm": 0.9641696373258147, "learning_rate": 1.7943687562060919e-06, "loss": 0.1789, "step": 1357 }, { "epoch": 0.6119184409147235, "grad_norm": 1.0316316029702115, "learning_rate": 1.7907952337804429e-06, "loss": 0.1873, "step": 1358 }, { "epoch": 0.6123690435958093, "grad_norm": 0.9616240258140892, "learning_rate": 1.787223286534463e-06, "loss": 0.1641, "step": 1359 }, { "epoch": 0.6128196462768953, "grad_norm": 0.9791093566130695, "learning_rate": 1.783652922401627e-06, "loss": 0.1685, "step": 1360 }, { "epoch": 0.6132702489579813, "grad_norm": 0.9810794464604121, "learning_rate": 1.7800841493118942e-06, "loss": 0.1765, "step": 1361 }, { "epoch": 0.6137208516390672, "grad_norm": 0.9590742488463256, "learning_rate": 1.776516975191691e-06, "loss": 0.1679, "step": 1362 }, { "epoch": 0.6141714543201532, "grad_norm": 0.988237012807688, "learning_rate": 1.7729514079638915e-06, "loss": 0.1876, "step": 1363 }, { "epoch": 0.6146220570012392, "grad_norm": 1.0202288172318874, "learning_rate": 1.7693874555477996e-06, "loss": 0.1981, "step": 1364 }, { "epoch": 0.6150726596823252, "grad_norm": 0.9592437603221992, "learning_rate": 1.7658251258591352e-06, "loss": 0.1809, "step": 1365 }, { "epoch": 0.615523262363411, "grad_norm": 0.9248493524713802, "learning_rate": 1.7622644268100116e-06, "loss": 0.1781, "step": 1366 }, { "epoch": 0.615973865044497, "grad_norm": 1.0127059943444199, "learning_rate": 1.7587053663089233e-06, "loss": 0.1826, "step": 1367 }, { "epoch": 0.616424467725583, "grad_norm": 1.0484692118625194, "learning_rate": 1.755147952260722e-06, "loss": 0.1981, "step": 1368 }, { "epoch": 0.6168750704066689, "grad_norm": 1.0462986733395399, "learning_rate": 1.7515921925666053e-06, "loss": 0.189, "step": 1369 }, { "epoch": 0.6173256730877549, "grad_norm": 0.9569914429667652, "learning_rate": 1.748038095124095e-06, "loss": 0.1797, "step": 1370 }, { "epoch": 0.6177762757688409, "grad_norm": 0.8934663502251415, "learning_rate": 1.7444856678270218e-06, "loss": 0.167, "step": 1371 }, { "epoch": 0.6182268784499267, "grad_norm": 0.9411554465371383, "learning_rate": 1.7409349185655067e-06, "loss": 0.1857, "step": 1372 }, { "epoch": 0.6186774811310127, "grad_norm": 0.9232735692254691, "learning_rate": 1.7373858552259421e-06, "loss": 0.1827, "step": 1373 }, { "epoch": 0.6191280838120987, "grad_norm": 0.9723262002837848, "learning_rate": 1.733838485690978e-06, "loss": 0.1818, "step": 1374 }, { "epoch": 0.6195786864931846, "grad_norm": 0.9639076957534941, "learning_rate": 1.7302928178395018e-06, "loss": 0.1914, "step": 1375 }, { "epoch": 0.6200292891742706, "grad_norm": 0.9430657640039586, "learning_rate": 1.726748859546621e-06, "loss": 0.1733, "step": 1376 }, { "epoch": 0.6204798918553566, "grad_norm": 0.9399938223372823, "learning_rate": 1.723206618683646e-06, "loss": 0.1815, "step": 1377 }, { "epoch": 0.6209304945364424, "grad_norm": 0.950436274242313, "learning_rate": 1.7196661031180738e-06, "loss": 0.1771, "step": 1378 }, { "epoch": 0.6213810972175284, "grad_norm": 0.9333762226180593, "learning_rate": 1.716127320713568e-06, "loss": 0.1779, "step": 1379 }, { "epoch": 0.6218316998986144, "grad_norm": 0.9819914710831309, "learning_rate": 1.7125902793299434e-06, "loss": 0.1833, "step": 1380 }, { "epoch": 0.6222823025797003, "grad_norm": 0.9524398256739168, "learning_rate": 1.7090549868231492e-06, "loss": 0.1712, "step": 1381 }, { "epoch": 0.6227329052607863, "grad_norm": 0.9914782024254578, "learning_rate": 1.7055214510452462e-06, "loss": 0.1699, "step": 1382 }, { "epoch": 0.6231835079418723, "grad_norm": 0.9496482485318583, "learning_rate": 1.7019896798443984e-06, "loss": 0.1608, "step": 1383 }, { "epoch": 0.6236341106229583, "grad_norm": 1.001098465203453, "learning_rate": 1.6984596810648475e-06, "loss": 0.1825, "step": 1384 }, { "epoch": 0.6240847133040441, "grad_norm": 0.9402250107639619, "learning_rate": 1.6949314625468985e-06, "loss": 0.1608, "step": 1385 }, { "epoch": 0.6245353159851301, "grad_norm": 1.0011483352758226, "learning_rate": 1.6914050321269049e-06, "loss": 0.1806, "step": 1386 }, { "epoch": 0.6249859186662161, "grad_norm": 1.0503798519563663, "learning_rate": 1.6878803976372465e-06, "loss": 0.1964, "step": 1387 }, { "epoch": 0.625436521347302, "grad_norm": 0.9440187921912822, "learning_rate": 1.6843575669063142e-06, "loss": 0.1738, "step": 1388 }, { "epoch": 0.625887124028388, "grad_norm": 0.9326821497482316, "learning_rate": 1.6808365477584953e-06, "loss": 0.1648, "step": 1389 }, { "epoch": 0.626337726709474, "grad_norm": 0.9424691557698677, "learning_rate": 1.6773173480141487e-06, "loss": 0.1652, "step": 1390 }, { "epoch": 0.6267883293905598, "grad_norm": 0.9281771608858566, "learning_rate": 1.6737999754895965e-06, "loss": 0.1777, "step": 1391 }, { "epoch": 0.6272389320716458, "grad_norm": 0.9752991219227313, "learning_rate": 1.6702844379971012e-06, "loss": 0.1697, "step": 1392 }, { "epoch": 0.6276895347527318, "grad_norm": 0.960553601801517, "learning_rate": 1.6667707433448482e-06, "loss": 0.1752, "step": 1393 }, { "epoch": 0.6281401374338177, "grad_norm": 0.9489921824793127, "learning_rate": 1.663258899336933e-06, "loss": 0.1745, "step": 1394 }, { "epoch": 0.6285907401149037, "grad_norm": 0.9941610654543575, "learning_rate": 1.6597489137733377e-06, "loss": 0.1692, "step": 1395 }, { "epoch": 0.6290413427959897, "grad_norm": 0.9797241069847795, "learning_rate": 1.6562407944499175e-06, "loss": 0.1707, "step": 1396 }, { "epoch": 0.6294919454770755, "grad_norm": 0.9293994462914644, "learning_rate": 1.652734549158384e-06, "loss": 0.1665, "step": 1397 }, { "epoch": 0.6299425481581615, "grad_norm": 1.03038629799781, "learning_rate": 1.6492301856862855e-06, "loss": 0.1695, "step": 1398 }, { "epoch": 0.6303931508392475, "grad_norm": 0.9622243992637415, "learning_rate": 1.6457277118169893e-06, "loss": 0.1781, "step": 1399 }, { "epoch": 0.6308437535203334, "grad_norm": 1.002772156035792, "learning_rate": 1.6422271353296675e-06, "loss": 0.1891, "step": 1400 }, { "epoch": 0.6312943562014194, "grad_norm": 1.0526800841741413, "learning_rate": 1.6387284639992773e-06, "loss": 0.1777, "step": 1401 }, { "epoch": 0.6317449588825054, "grad_norm": 1.0387407303203702, "learning_rate": 1.6352317055965458e-06, "loss": 0.1875, "step": 1402 }, { "epoch": 0.6321955615635914, "grad_norm": 1.0162282724569995, "learning_rate": 1.6317368678879497e-06, "loss": 0.1841, "step": 1403 }, { "epoch": 0.6326461642446772, "grad_norm": 0.9823874974583439, "learning_rate": 1.6282439586356999e-06, "loss": 0.1769, "step": 1404 }, { "epoch": 0.6330967669257632, "grad_norm": 0.9863386832219366, "learning_rate": 1.6247529855977256e-06, "loss": 0.1704, "step": 1405 }, { "epoch": 0.6335473696068492, "grad_norm": 1.0528860491393666, "learning_rate": 1.6212639565276538e-06, "loss": 0.174, "step": 1406 }, { "epoch": 0.6339979722879351, "grad_norm": 0.9737102543598695, "learning_rate": 1.6177768791747957e-06, "loss": 0.184, "step": 1407 }, { "epoch": 0.6344485749690211, "grad_norm": 1.0153204654045858, "learning_rate": 1.6142917612841252e-06, "loss": 0.1849, "step": 1408 }, { "epoch": 0.6348991776501071, "grad_norm": 0.8991885233662639, "learning_rate": 1.610808610596265e-06, "loss": 0.1632, "step": 1409 }, { "epoch": 0.6353497803311929, "grad_norm": 0.9713420971163291, "learning_rate": 1.607327434847471e-06, "loss": 0.173, "step": 1410 }, { "epoch": 0.6358003830122789, "grad_norm": 0.9842018617155793, "learning_rate": 1.6038482417696095e-06, "loss": 0.1681, "step": 1411 }, { "epoch": 0.6362509856933649, "grad_norm": 1.0444753107781046, "learning_rate": 1.6003710390901434e-06, "loss": 0.2003, "step": 1412 }, { "epoch": 0.6367015883744508, "grad_norm": 1.009201486730144, "learning_rate": 1.5968958345321178e-06, "loss": 0.1866, "step": 1413 }, { "epoch": 0.6371521910555368, "grad_norm": 0.9793931412926407, "learning_rate": 1.5934226358141368e-06, "loss": 0.1865, "step": 1414 }, { "epoch": 0.6376027937366228, "grad_norm": 1.0086558938363293, "learning_rate": 1.5899514506503499e-06, "loss": 0.188, "step": 1415 }, { "epoch": 0.6380533964177086, "grad_norm": 1.012525844933204, "learning_rate": 1.5864822867504376e-06, "loss": 0.1644, "step": 1416 }, { "epoch": 0.6385039990987946, "grad_norm": 0.9497102281955796, "learning_rate": 1.5830151518195846e-06, "loss": 0.1638, "step": 1417 }, { "epoch": 0.6389546017798806, "grad_norm": 1.0713360411193678, "learning_rate": 1.5795500535584758e-06, "loss": 0.1864, "step": 1418 }, { "epoch": 0.6394052044609665, "grad_norm": 0.9669969719861, "learning_rate": 1.5760869996632685e-06, "loss": 0.1667, "step": 1419 }, { "epoch": 0.6398558071420525, "grad_norm": 0.9016232986127477, "learning_rate": 1.572625997825581e-06, "loss": 0.1694, "step": 1420 }, { "epoch": 0.6403064098231385, "grad_norm": 1.0014722298530099, "learning_rate": 1.5691670557324734e-06, "loss": 0.1896, "step": 1421 }, { "epoch": 0.6407570125042245, "grad_norm": 0.9885424898642345, "learning_rate": 1.5657101810664314e-06, "loss": 0.1759, "step": 1422 }, { "epoch": 0.6412076151853103, "grad_norm": 1.0163722000157043, "learning_rate": 1.5622553815053476e-06, "loss": 0.1814, "step": 1423 }, { "epoch": 0.6416582178663963, "grad_norm": 1.0470342847013532, "learning_rate": 1.558802664722508e-06, "loss": 0.1803, "step": 1424 }, { "epoch": 0.6421088205474823, "grad_norm": 0.9986140861027836, "learning_rate": 1.555352038386571e-06, "loss": 0.166, "step": 1425 }, { "epoch": 0.6425594232285682, "grad_norm": 0.962847713888351, "learning_rate": 1.5519035101615518e-06, "loss": 0.186, "step": 1426 }, { "epoch": 0.6430100259096542, "grad_norm": 0.93191438242337, "learning_rate": 1.5484570877068055e-06, "loss": 0.1728, "step": 1427 }, { "epoch": 0.6434606285907402, "grad_norm": 0.9810361262173285, "learning_rate": 1.5450127786770116e-06, "loss": 0.1704, "step": 1428 }, { "epoch": 0.643911231271826, "grad_norm": 0.9952257906091154, "learning_rate": 1.5415705907221545e-06, "loss": 0.178, "step": 1429 }, { "epoch": 0.644361833952912, "grad_norm": 0.9429605864762918, "learning_rate": 1.5381305314875084e-06, "loss": 0.1738, "step": 1430 }, { "epoch": 0.644812436633998, "grad_norm": 1.0299359367615164, "learning_rate": 1.5346926086136171e-06, "loss": 0.1779, "step": 1431 }, { "epoch": 0.6452630393150839, "grad_norm": 1.0049274983457543, "learning_rate": 1.5312568297362834e-06, "loss": 0.1746, "step": 1432 }, { "epoch": 0.6457136419961699, "grad_norm": 0.9967559026461587, "learning_rate": 1.5278232024865458e-06, "loss": 0.1818, "step": 1433 }, { "epoch": 0.6461642446772559, "grad_norm": 0.97845506443779, "learning_rate": 1.5243917344906625e-06, "loss": 0.1889, "step": 1434 }, { "epoch": 0.6466148473583417, "grad_norm": 0.9698446326236689, "learning_rate": 1.5209624333700985e-06, "loss": 0.1698, "step": 1435 }, { "epoch": 0.6470654500394277, "grad_norm": 0.9755719784670406, "learning_rate": 1.517535306741505e-06, "loss": 0.1772, "step": 1436 }, { "epoch": 0.6475160527205137, "grad_norm": 0.9745589060736123, "learning_rate": 1.5141103622167042e-06, "loss": 0.1709, "step": 1437 }, { "epoch": 0.6479666554015996, "grad_norm": 0.9965046320724716, "learning_rate": 1.51068760740267e-06, "loss": 0.1813, "step": 1438 }, { "epoch": 0.6484172580826856, "grad_norm": 0.9464651783958671, "learning_rate": 1.5072670499015151e-06, "loss": 0.177, "step": 1439 }, { "epoch": 0.6488678607637716, "grad_norm": 0.9722466387532496, "learning_rate": 1.5038486973104704e-06, "loss": 0.1521, "step": 1440 }, { "epoch": 0.6493184634448576, "grad_norm": 1.012247646444772, "learning_rate": 1.5004325572218698e-06, "loss": 0.1747, "step": 1441 }, { "epoch": 0.6497690661259434, "grad_norm": 0.9999964959383242, "learning_rate": 1.4970186372231347e-06, "loss": 0.1601, "step": 1442 }, { "epoch": 0.6502196688070294, "grad_norm": 0.9028737511479311, "learning_rate": 1.493606944896751e-06, "loss": 0.1688, "step": 1443 }, { "epoch": 0.6506702714881154, "grad_norm": 1.0231645677089356, "learning_rate": 1.490197487820263e-06, "loss": 0.185, "step": 1444 }, { "epoch": 0.6511208741692013, "grad_norm": 0.9358664932252706, "learning_rate": 1.486790273566246e-06, "loss": 0.1744, "step": 1445 }, { "epoch": 0.6515714768502873, "grad_norm": 0.9300778756974287, "learning_rate": 1.483385309702295e-06, "loss": 0.1703, "step": 1446 }, { "epoch": 0.6520220795313733, "grad_norm": 0.9169957045442526, "learning_rate": 1.4799826037910082e-06, "loss": 0.1662, "step": 1447 }, { "epoch": 0.6524726822124591, "grad_norm": 1.0311445049370658, "learning_rate": 1.4765821633899663e-06, "loss": 0.1766, "step": 1448 }, { "epoch": 0.6529232848935451, "grad_norm": 1.0370975219569198, "learning_rate": 1.4731839960517202e-06, "loss": 0.1952, "step": 1449 }, { "epoch": 0.6533738875746311, "grad_norm": 1.0233401948492444, "learning_rate": 1.4697881093237714e-06, "loss": 0.1923, "step": 1450 }, { "epoch": 0.653824490255717, "grad_norm": 0.9461622243859847, "learning_rate": 1.4663945107485567e-06, "loss": 0.1782, "step": 1451 }, { "epoch": 0.654275092936803, "grad_norm": 1.055504555070659, "learning_rate": 1.4630032078634293e-06, "loss": 0.1815, "step": 1452 }, { "epoch": 0.654725695617889, "grad_norm": 0.9234637697624012, "learning_rate": 1.4596142082006448e-06, "loss": 0.1752, "step": 1453 }, { "epoch": 0.6551762982989748, "grad_norm": 0.9507817258448786, "learning_rate": 1.456227519287343e-06, "loss": 0.1827, "step": 1454 }, { "epoch": 0.6556269009800608, "grad_norm": 0.9759081998164142, "learning_rate": 1.4528431486455311e-06, "loss": 0.1773, "step": 1455 }, { "epoch": 0.6560775036611468, "grad_norm": 0.9777184016717716, "learning_rate": 1.4494611037920667e-06, "loss": 0.1747, "step": 1456 }, { "epoch": 0.6565281063422327, "grad_norm": 0.9271892542931569, "learning_rate": 1.4460813922386446e-06, "loss": 0.1652, "step": 1457 }, { "epoch": 0.6569787090233187, "grad_norm": 0.9337910757457696, "learning_rate": 1.4427040214917742e-06, "loss": 0.1689, "step": 1458 }, { "epoch": 0.6574293117044047, "grad_norm": 0.9282004178300622, "learning_rate": 1.4393289990527665e-06, "loss": 0.1751, "step": 1459 }, { "epoch": 0.6578799143854905, "grad_norm": 0.8970983420937423, "learning_rate": 1.4359563324177176e-06, "loss": 0.1638, "step": 1460 }, { "epoch": 0.6583305170665765, "grad_norm": 1.0074464827163818, "learning_rate": 1.43258602907749e-06, "loss": 0.1826, "step": 1461 }, { "epoch": 0.6587811197476625, "grad_norm": 0.990811575091218, "learning_rate": 1.429218096517699e-06, "loss": 0.1814, "step": 1462 }, { "epoch": 0.6592317224287485, "grad_norm": 0.9480008770331717, "learning_rate": 1.425852542218692e-06, "loss": 0.1772, "step": 1463 }, { "epoch": 0.6596823251098344, "grad_norm": 0.9752983222946117, "learning_rate": 1.4224893736555364e-06, "loss": 0.1749, "step": 1464 }, { "epoch": 0.6601329277909204, "grad_norm": 0.91785989621098, "learning_rate": 1.4191285982979992e-06, "loss": 0.1677, "step": 1465 }, { "epoch": 0.6605835304720064, "grad_norm": 0.9753765831245645, "learning_rate": 1.4157702236105326e-06, "loss": 0.1647, "step": 1466 }, { "epoch": 0.6610341331530922, "grad_norm": 1.001355344074237, "learning_rate": 1.412414257052256e-06, "loss": 0.1877, "step": 1467 }, { "epoch": 0.6614847358341782, "grad_norm": 0.9502520083816269, "learning_rate": 1.4090607060769423e-06, "loss": 0.1702, "step": 1468 }, { "epoch": 0.6619353385152642, "grad_norm": 0.9227004266546573, "learning_rate": 1.4057095781329983e-06, "loss": 0.1779, "step": 1469 }, { "epoch": 0.6623859411963501, "grad_norm": 1.0407222566367818, "learning_rate": 1.402360880663447e-06, "loss": 0.1956, "step": 1470 }, { "epoch": 0.6628365438774361, "grad_norm": 0.9579131430837633, "learning_rate": 1.3990146211059141e-06, "loss": 0.1688, "step": 1471 }, { "epoch": 0.6632871465585221, "grad_norm": 0.9511338591956713, "learning_rate": 1.3956708068926141e-06, "loss": 0.1818, "step": 1472 }, { "epoch": 0.6637377492396079, "grad_norm": 0.8779448931101682, "learning_rate": 1.3923294454503263e-06, "loss": 0.1546, "step": 1473 }, { "epoch": 0.6641883519206939, "grad_norm": 0.9808281780180107, "learning_rate": 1.3889905442003836e-06, "loss": 0.1874, "step": 1474 }, { "epoch": 0.6646389546017799, "grad_norm": 0.9098712770252587, "learning_rate": 1.3856541105586545e-06, "loss": 0.1512, "step": 1475 }, { "epoch": 0.6650895572828658, "grad_norm": 0.9049853200687668, "learning_rate": 1.382320151935527e-06, "loss": 0.1561, "step": 1476 }, { "epoch": 0.6655401599639518, "grad_norm": 0.9485229447896064, "learning_rate": 1.3789886757358916e-06, "loss": 0.1728, "step": 1477 }, { "epoch": 0.6659907626450378, "grad_norm": 0.9490990019665922, "learning_rate": 1.375659689359126e-06, "loss": 0.1599, "step": 1478 }, { "epoch": 0.6664413653261236, "grad_norm": 0.9711185150006841, "learning_rate": 1.3723332001990774e-06, "loss": 0.1786, "step": 1479 }, { "epoch": 0.6668919680072096, "grad_norm": 0.9973765149760946, "learning_rate": 1.369009215644046e-06, "loss": 0.1818, "step": 1480 }, { "epoch": 0.6673425706882956, "grad_norm": 0.9644391158864567, "learning_rate": 1.36568774307677e-06, "loss": 0.18, "step": 1481 }, { "epoch": 0.6677931733693816, "grad_norm": 0.8888734345308115, "learning_rate": 1.362368789874407e-06, "loss": 0.1647, "step": 1482 }, { "epoch": 0.6682437760504675, "grad_norm": 0.9481091827652459, "learning_rate": 1.3590523634085218e-06, "loss": 0.1747, "step": 1483 }, { "epoch": 0.6686943787315535, "grad_norm": 1.0330690689616744, "learning_rate": 1.3557384710450644e-06, "loss": 0.1889, "step": 1484 }, { "epoch": 0.6691449814126395, "grad_norm": 0.960954850888663, "learning_rate": 1.3524271201443578e-06, "loss": 0.1719, "step": 1485 }, { "epoch": 0.6695955840937253, "grad_norm": 1.0055793871460201, "learning_rate": 1.3491183180610807e-06, "loss": 0.1871, "step": 1486 }, { "epoch": 0.6700461867748113, "grad_norm": 1.0545932802846039, "learning_rate": 1.3458120721442464e-06, "loss": 0.1799, "step": 1487 }, { "epoch": 0.6704967894558973, "grad_norm": 0.962979895049587, "learning_rate": 1.3425083897371983e-06, "loss": 0.1765, "step": 1488 }, { "epoch": 0.6709473921369832, "grad_norm": 0.9431647526868862, "learning_rate": 1.3392072781775806e-06, "loss": 0.167, "step": 1489 }, { "epoch": 0.6713979948180692, "grad_norm": 0.9559904240284464, "learning_rate": 1.335908744797329e-06, "loss": 0.1769, "step": 1490 }, { "epoch": 0.6718485974991552, "grad_norm": 1.048775633545965, "learning_rate": 1.3326127969226535e-06, "loss": 0.1846, "step": 1491 }, { "epoch": 0.672299200180241, "grad_norm": 0.9744426542397633, "learning_rate": 1.3293194418740207e-06, "loss": 0.1817, "step": 1492 }, { "epoch": 0.672749802861327, "grad_norm": 0.9623650677514825, "learning_rate": 1.3260286869661378e-06, "loss": 0.17, "step": 1493 }, { "epoch": 0.673200405542413, "grad_norm": 0.9659168187588352, "learning_rate": 1.32274053950794e-06, "loss": 0.1821, "step": 1494 }, { "epoch": 0.6736510082234989, "grad_norm": 0.9368891207747331, "learning_rate": 1.3194550068025697e-06, "loss": 0.1672, "step": 1495 }, { "epoch": 0.6741016109045849, "grad_norm": 0.938184877171042, "learning_rate": 1.3161720961473583e-06, "loss": 0.1706, "step": 1496 }, { "epoch": 0.6745522135856709, "grad_norm": 0.9467721060732873, "learning_rate": 1.3128918148338183e-06, "loss": 0.177, "step": 1497 }, { "epoch": 0.6750028162667567, "grad_norm": 0.9552196388275247, "learning_rate": 1.3096141701476189e-06, "loss": 0.1658, "step": 1498 }, { "epoch": 0.6754534189478427, "grad_norm": 0.9321703673762389, "learning_rate": 1.3063391693685773e-06, "loss": 0.1758, "step": 1499 }, { "epoch": 0.6759040216289287, "grad_norm": 0.967964781154864, "learning_rate": 1.3030668197706347e-06, "loss": 0.1809, "step": 1500 }, { "epoch": 0.6759040216289287, "eval_loss": 0.1767047941684723, "eval_runtime": 59.4127, "eval_samples_per_second": 24.153, "eval_steps_per_second": 3.03, "step": 1500 }, { "epoch": 0.6763546243100147, "grad_norm": 0.9101930612729155, "learning_rate": 1.2997971286218448e-06, "loss": 0.1727, "step": 1501 }, { "epoch": 0.6768052269911006, "grad_norm": 1.0473837945881148, "learning_rate": 1.2965301031843574e-06, "loss": 0.1747, "step": 1502 }, { "epoch": 0.6772558296721866, "grad_norm": 0.9865409373557444, "learning_rate": 1.2932657507144014e-06, "loss": 0.1817, "step": 1503 }, { "epoch": 0.6777064323532725, "grad_norm": 0.9575822375259619, "learning_rate": 1.2900040784622686e-06, "loss": 0.1861, "step": 1504 }, { "epoch": 0.6781570350343584, "grad_norm": 0.9333033180300786, "learning_rate": 1.286745093672298e-06, "loss": 0.1716, "step": 1505 }, { "epoch": 0.6786076377154444, "grad_norm": 0.9589865720767485, "learning_rate": 1.2834888035828597e-06, "loss": 0.1697, "step": 1506 }, { "epoch": 0.6790582403965304, "grad_norm": 0.9599746633693138, "learning_rate": 1.2802352154263392e-06, "loss": 0.1649, "step": 1507 }, { "epoch": 0.6795088430776163, "grad_norm": 0.9459104304885603, "learning_rate": 1.2769843364291202e-06, "loss": 0.1758, "step": 1508 }, { "epoch": 0.6799594457587023, "grad_norm": 0.9237537511212347, "learning_rate": 1.2737361738115681e-06, "loss": 0.1712, "step": 1509 }, { "epoch": 0.6804100484397883, "grad_norm": 0.9363027728902537, "learning_rate": 1.2704907347880185e-06, "loss": 0.1672, "step": 1510 }, { "epoch": 0.6808606511208741, "grad_norm": 0.9624914550683219, "learning_rate": 1.2672480265667553e-06, "loss": 0.1701, "step": 1511 }, { "epoch": 0.6813112538019601, "grad_norm": 1.0049184213593736, "learning_rate": 1.2640080563499977e-06, "loss": 0.1684, "step": 1512 }, { "epoch": 0.6817618564830461, "grad_norm": 0.9386797820514148, "learning_rate": 1.2607708313338818e-06, "loss": 0.1669, "step": 1513 }, { "epoch": 0.682212459164132, "grad_norm": 0.9281750049626383, "learning_rate": 1.2575363587084486e-06, "loss": 0.163, "step": 1514 }, { "epoch": 0.682663061845218, "grad_norm": 0.9465979108054715, "learning_rate": 1.2543046456576267e-06, "loss": 0.1835, "step": 1515 }, { "epoch": 0.683113664526304, "grad_norm": 1.0080017997547543, "learning_rate": 1.2510756993592138e-06, "loss": 0.1846, "step": 1516 }, { "epoch": 0.6835642672073898, "grad_norm": 0.9580231845014326, "learning_rate": 1.2478495269848626e-06, "loss": 0.1726, "step": 1517 }, { "epoch": 0.6840148698884758, "grad_norm": 0.943782011616573, "learning_rate": 1.2446261357000655e-06, "loss": 0.1757, "step": 1518 }, { "epoch": 0.6844654725695618, "grad_norm": 0.9322388978426438, "learning_rate": 1.2414055326641378e-06, "loss": 0.1729, "step": 1519 }, { "epoch": 0.6849160752506478, "grad_norm": 0.9737181786806623, "learning_rate": 1.2381877250302002e-06, "loss": 0.1711, "step": 1520 }, { "epoch": 0.6853666779317337, "grad_norm": 0.9187221744081951, "learning_rate": 1.2349727199451696e-06, "loss": 0.1546, "step": 1521 }, { "epoch": 0.6858172806128197, "grad_norm": 0.9164566553499499, "learning_rate": 1.2317605245497324e-06, "loss": 0.161, "step": 1522 }, { "epoch": 0.6862678832939056, "grad_norm": 0.9373259647717872, "learning_rate": 1.2285511459783373e-06, "loss": 0.1666, "step": 1523 }, { "epoch": 0.6867184859749915, "grad_norm": 0.9556482155420707, "learning_rate": 1.225344591359177e-06, "loss": 0.1769, "step": 1524 }, { "epoch": 0.6871690886560775, "grad_norm": 0.9826183613099548, "learning_rate": 1.2221408678141702e-06, "loss": 0.1678, "step": 1525 }, { "epoch": 0.6876196913371635, "grad_norm": 0.9866834247082366, "learning_rate": 1.2189399824589513e-06, "loss": 0.1692, "step": 1526 }, { "epoch": 0.6880702940182494, "grad_norm": 0.9346005849361148, "learning_rate": 1.2157419424028473e-06, "loss": 0.1713, "step": 1527 }, { "epoch": 0.6885208966993354, "grad_norm": 0.9740865834584703, "learning_rate": 1.2125467547488676e-06, "loss": 0.1719, "step": 1528 }, { "epoch": 0.6889714993804213, "grad_norm": 0.9326077413958622, "learning_rate": 1.2093544265936848e-06, "loss": 0.1695, "step": 1529 }, { "epoch": 0.6894221020615072, "grad_norm": 0.9579890239329361, "learning_rate": 1.206164965027622e-06, "loss": 0.1631, "step": 1530 }, { "epoch": 0.6898727047425932, "grad_norm": 0.9193027867511092, "learning_rate": 1.2029783771346344e-06, "loss": 0.1513, "step": 1531 }, { "epoch": 0.6903233074236792, "grad_norm": 1.0313184201825583, "learning_rate": 1.1997946699922946e-06, "loss": 0.1861, "step": 1532 }, { "epoch": 0.6907739101047651, "grad_norm": 1.0699790891833825, "learning_rate": 1.1966138506717776e-06, "loss": 0.198, "step": 1533 }, { "epoch": 0.6912245127858511, "grad_norm": 0.9909657945142227, "learning_rate": 1.1934359262378443e-06, "loss": 0.1824, "step": 1534 }, { "epoch": 0.691675115466937, "grad_norm": 1.0552297480751036, "learning_rate": 1.190260903748825e-06, "loss": 0.1876, "step": 1535 }, { "epoch": 0.6921257181480229, "grad_norm": 0.9639349014155145, "learning_rate": 1.187088790256605e-06, "loss": 0.1847, "step": 1536 }, { "epoch": 0.6925763208291089, "grad_norm": 0.9933982369645189, "learning_rate": 1.1839195928066101e-06, "loss": 0.178, "step": 1537 }, { "epoch": 0.6930269235101949, "grad_norm": 0.9881179962964661, "learning_rate": 1.1807533184377882e-06, "loss": 0.1867, "step": 1538 }, { "epoch": 0.6934775261912809, "grad_norm": 0.9497248106372381, "learning_rate": 1.1775899741825947e-06, "loss": 0.1656, "step": 1539 }, { "epoch": 0.6939281288723668, "grad_norm": 0.9751954371033619, "learning_rate": 1.1744295670669752e-06, "loss": 0.1732, "step": 1540 }, { "epoch": 0.6943787315534528, "grad_norm": 0.9850014562760682, "learning_rate": 1.171272104110356e-06, "loss": 0.1709, "step": 1541 }, { "epoch": 0.6948293342345387, "grad_norm": 0.9356887222349297, "learning_rate": 1.168117592325622e-06, "loss": 0.1709, "step": 1542 }, { "epoch": 0.6952799369156246, "grad_norm": 0.9789522658468988, "learning_rate": 1.1649660387191027e-06, "loss": 0.1763, "step": 1543 }, { "epoch": 0.6957305395967106, "grad_norm": 0.97631716881953, "learning_rate": 1.1618174502905586e-06, "loss": 0.1798, "step": 1544 }, { "epoch": 0.6961811422777966, "grad_norm": 0.958842559879847, "learning_rate": 1.1586718340331634e-06, "loss": 0.1659, "step": 1545 }, { "epoch": 0.6966317449588825, "grad_norm": 0.9418656914716783, "learning_rate": 1.1555291969334907e-06, "loss": 0.1764, "step": 1546 }, { "epoch": 0.6970823476399685, "grad_norm": 0.9320216201816975, "learning_rate": 1.1523895459714948e-06, "loss": 0.1666, "step": 1547 }, { "epoch": 0.6975329503210544, "grad_norm": 0.9956915927235741, "learning_rate": 1.1492528881205027e-06, "loss": 0.1702, "step": 1548 }, { "epoch": 0.6979835530021403, "grad_norm": 0.9832844212602442, "learning_rate": 1.146119230347187e-06, "loss": 0.1921, "step": 1549 }, { "epoch": 0.6984341556832263, "grad_norm": 0.9762699484093021, "learning_rate": 1.142988579611561e-06, "loss": 0.1702, "step": 1550 }, { "epoch": 0.6988847583643123, "grad_norm": 0.9540162501795935, "learning_rate": 1.1398609428669582e-06, "loss": 0.1734, "step": 1551 }, { "epoch": 0.6993353610453982, "grad_norm": 0.9471062331433902, "learning_rate": 1.136736327060019e-06, "loss": 0.1631, "step": 1552 }, { "epoch": 0.6997859637264842, "grad_norm": 0.960608892965562, "learning_rate": 1.133614739130673e-06, "loss": 0.1811, "step": 1553 }, { "epoch": 0.7002365664075701, "grad_norm": 1.0008247062038094, "learning_rate": 1.1304961860121246e-06, "loss": 0.1847, "step": 1554 }, { "epoch": 0.700687169088656, "grad_norm": 0.9546410817103299, "learning_rate": 1.127380674630838e-06, "loss": 0.189, "step": 1555 }, { "epoch": 0.701137771769742, "grad_norm": 0.9665656588679624, "learning_rate": 1.1242682119065217e-06, "loss": 0.1801, "step": 1556 }, { "epoch": 0.701588374450828, "grad_norm": 0.9687500702022401, "learning_rate": 1.121158804752113e-06, "loss": 0.1778, "step": 1557 }, { "epoch": 0.702038977131914, "grad_norm": 0.9262435022683353, "learning_rate": 1.1180524600737624e-06, "loss": 0.1672, "step": 1558 }, { "epoch": 0.7024895798129999, "grad_norm": 0.9802604827010679, "learning_rate": 1.1149491847708186e-06, "loss": 0.1797, "step": 1559 }, { "epoch": 0.7029401824940859, "grad_norm": 0.9429318887072621, "learning_rate": 1.1118489857358129e-06, "loss": 0.1686, "step": 1560 }, { "epoch": 0.7033907851751718, "grad_norm": 0.9530986895583247, "learning_rate": 1.1087518698544444e-06, "loss": 0.1515, "step": 1561 }, { "epoch": 0.7038413878562577, "grad_norm": 0.9803536176709677, "learning_rate": 1.1056578440055631e-06, "loss": 0.1829, "step": 1562 }, { "epoch": 0.7042919905373437, "grad_norm": 1.0034683246802105, "learning_rate": 1.1025669150611594e-06, "loss": 0.1861, "step": 1563 }, { "epoch": 0.7047425932184297, "grad_norm": 0.9442434238308353, "learning_rate": 1.0994790898863409e-06, "loss": 0.164, "step": 1564 }, { "epoch": 0.7051931958995156, "grad_norm": 0.9803487037575894, "learning_rate": 1.0963943753393252e-06, "loss": 0.1745, "step": 1565 }, { "epoch": 0.7056437985806016, "grad_norm": 1.0057243604822155, "learning_rate": 1.0933127782714175e-06, "loss": 0.1892, "step": 1566 }, { "epoch": 0.7060944012616875, "grad_norm": 0.9489071650027512, "learning_rate": 1.0902343055270006e-06, "loss": 0.1769, "step": 1567 }, { "epoch": 0.7065450039427734, "grad_norm": 0.9722597903746137, "learning_rate": 1.0871589639435204e-06, "loss": 0.173, "step": 1568 }, { "epoch": 0.7069956066238594, "grad_norm": 0.9839533611583738, "learning_rate": 1.0840867603514648e-06, "loss": 0.1784, "step": 1569 }, { "epoch": 0.7074462093049454, "grad_norm": 0.9817529379115976, "learning_rate": 1.0810177015743536e-06, "loss": 0.185, "step": 1570 }, { "epoch": 0.7078968119860313, "grad_norm": 0.9029027680731931, "learning_rate": 1.0779517944287216e-06, "loss": 0.1555, "step": 1571 }, { "epoch": 0.7083474146671173, "grad_norm": 0.9949792727511889, "learning_rate": 1.0748890457241037e-06, "loss": 0.1941, "step": 1572 }, { "epoch": 0.7087980173482032, "grad_norm": 0.9906999802115285, "learning_rate": 1.0718294622630188e-06, "loss": 0.169, "step": 1573 }, { "epoch": 0.7092486200292891, "grad_norm": 0.9126582765867913, "learning_rate": 1.0687730508409594e-06, "loss": 0.1614, "step": 1574 }, { "epoch": 0.7096992227103751, "grad_norm": 1.0667875398406488, "learning_rate": 1.065719818246367e-06, "loss": 0.1809, "step": 1575 }, { "epoch": 0.7101498253914611, "grad_norm": 0.9828684797908929, "learning_rate": 1.062669771260627e-06, "loss": 0.175, "step": 1576 }, { "epoch": 0.7106004280725471, "grad_norm": 1.0258993060364723, "learning_rate": 1.0596229166580477e-06, "loss": 0.1743, "step": 1577 }, { "epoch": 0.711051030753633, "grad_norm": 0.8864350336490167, "learning_rate": 1.0565792612058462e-06, "loss": 0.1545, "step": 1578 }, { "epoch": 0.711501633434719, "grad_norm": 1.014170295448755, "learning_rate": 1.0535388116641376e-06, "loss": 0.1828, "step": 1579 }, { "epoch": 0.7119522361158049, "grad_norm": 0.9755717840362919, "learning_rate": 1.050501574785913e-06, "loss": 0.1764, "step": 1580 }, { "epoch": 0.7124028387968908, "grad_norm": 0.9763870999531946, "learning_rate": 1.0474675573170293e-06, "loss": 0.1722, "step": 1581 }, { "epoch": 0.7128534414779768, "grad_norm": 1.01133859060459, "learning_rate": 1.0444367659961927e-06, "loss": 0.1807, "step": 1582 }, { "epoch": 0.7133040441590628, "grad_norm": 0.976304679761672, "learning_rate": 1.041409207554944e-06, "loss": 0.1633, "step": 1583 }, { "epoch": 0.7137546468401487, "grad_norm": 1.0054771418264985, "learning_rate": 1.0383848887176437e-06, "loss": 0.1789, "step": 1584 }, { "epoch": 0.7142052495212347, "grad_norm": 0.9916973069684641, "learning_rate": 1.035363816201457e-06, "loss": 0.1589, "step": 1585 }, { "epoch": 0.7146558522023206, "grad_norm": 0.9446406731337711, "learning_rate": 1.032345996716339e-06, "loss": 0.1724, "step": 1586 }, { "epoch": 0.7151064548834065, "grad_norm": 0.9517791019473585, "learning_rate": 1.0293314369650193e-06, "loss": 0.1657, "step": 1587 }, { "epoch": 0.7155570575644925, "grad_norm": 0.9264839629163255, "learning_rate": 1.0263201436429873e-06, "loss": 0.1623, "step": 1588 }, { "epoch": 0.7160076602455785, "grad_norm": 1.0208241763086796, "learning_rate": 1.0233121234384777e-06, "loss": 0.1776, "step": 1589 }, { "epoch": 0.7164582629266644, "grad_norm": 0.962412964362255, "learning_rate": 1.0203073830324566e-06, "loss": 0.1753, "step": 1590 }, { "epoch": 0.7169088656077504, "grad_norm": 0.9392689838924189, "learning_rate": 1.0173059290986048e-06, "loss": 0.1731, "step": 1591 }, { "epoch": 0.7173594682888363, "grad_norm": 0.9674414745912845, "learning_rate": 1.0143077683033017e-06, "loss": 0.1853, "step": 1592 }, { "epoch": 0.7178100709699222, "grad_norm": 0.9463688929691734, "learning_rate": 1.0113129073056149e-06, "loss": 0.1771, "step": 1593 }, { "epoch": 0.7182606736510082, "grad_norm": 1.0074875241387953, "learning_rate": 1.008321352757281e-06, "loss": 0.1954, "step": 1594 }, { "epoch": 0.7187112763320942, "grad_norm": 0.9871282729852052, "learning_rate": 1.0053331113026962e-06, "loss": 0.1695, "step": 1595 }, { "epoch": 0.7191618790131802, "grad_norm": 0.9518387475350171, "learning_rate": 1.002348189578895e-06, "loss": 0.1811, "step": 1596 }, { "epoch": 0.7196124816942661, "grad_norm": 0.9170341529828044, "learning_rate": 9.993665942155395e-07, "loss": 0.1687, "step": 1597 }, { "epoch": 0.720063084375352, "grad_norm": 0.9957667942869256, "learning_rate": 9.963883318349039e-07, "loss": 0.1792, "step": 1598 }, { "epoch": 0.720513687056438, "grad_norm": 0.9548282910026167, "learning_rate": 9.934134090518593e-07, "loss": 0.171, "step": 1599 }, { "epoch": 0.7209642897375239, "grad_norm": 0.9649719991717198, "learning_rate": 9.904418324738605e-07, "loss": 0.1859, "step": 1600 }, { "epoch": 0.7214148924186099, "grad_norm": 0.9461436353544868, "learning_rate": 9.874736087009285e-07, "loss": 0.1708, "step": 1601 }, { "epoch": 0.7218654950996959, "grad_norm": 1.0458030537435585, "learning_rate": 9.84508744325639e-07, "loss": 0.1967, "step": 1602 }, { "epoch": 0.7223160977807818, "grad_norm": 0.9635061887687748, "learning_rate": 9.815472459331061e-07, "loss": 0.1655, "step": 1603 }, { "epoch": 0.7227667004618678, "grad_norm": 0.9821847851496524, "learning_rate": 9.785891201009667e-07, "loss": 0.1729, "step": 1604 }, { "epoch": 0.7232173031429537, "grad_norm": 0.9389984597940151, "learning_rate": 9.756343733993679e-07, "loss": 0.1713, "step": 1605 }, { "epoch": 0.7236679058240396, "grad_norm": 0.9598950848804205, "learning_rate": 9.726830123909527e-07, "loss": 0.1719, "step": 1606 }, { "epoch": 0.7241185085051256, "grad_norm": 0.9662287057802987, "learning_rate": 9.697350436308428e-07, "loss": 0.1884, "step": 1607 }, { "epoch": 0.7245691111862116, "grad_norm": 0.8983592840074917, "learning_rate": 9.667904736666258e-07, "loss": 0.159, "step": 1608 }, { "epoch": 0.7250197138672975, "grad_norm": 1.0072194197806417, "learning_rate": 9.638493090383408e-07, "loss": 0.1801, "step": 1609 }, { "epoch": 0.7254703165483835, "grad_norm": 0.9665513942121345, "learning_rate": 9.60911556278463e-07, "loss": 0.1607, "step": 1610 }, { "epoch": 0.7259209192294694, "grad_norm": 0.8889613238958063, "learning_rate": 9.579772219118899e-07, "loss": 0.1582, "step": 1611 }, { "epoch": 0.7263715219105553, "grad_norm": 0.9675942815591192, "learning_rate": 9.550463124559267e-07, "loss": 0.1889, "step": 1612 }, { "epoch": 0.7268221245916413, "grad_norm": 0.9684044848937015, "learning_rate": 9.521188344202717e-07, "loss": 0.1828, "step": 1613 }, { "epoch": 0.7272727272727273, "grad_norm": 0.9711203932408818, "learning_rate": 9.491947943070015e-07, "loss": 0.1636, "step": 1614 }, { "epoch": 0.7277233299538133, "grad_norm": 0.9589346438716095, "learning_rate": 9.462741986105573e-07, "loss": 0.1728, "step": 1615 }, { "epoch": 0.7281739326348992, "grad_norm": 0.9509324960345422, "learning_rate": 9.433570538177289e-07, "loss": 0.1808, "step": 1616 }, { "epoch": 0.7286245353159851, "grad_norm": 0.9226311502358819, "learning_rate": 9.404433664076442e-07, "loss": 0.15, "step": 1617 }, { "epoch": 0.7290751379970711, "grad_norm": 0.9459518608458831, "learning_rate": 9.375331428517506e-07, "loss": 0.1619, "step": 1618 }, { "epoch": 0.729525740678157, "grad_norm": 0.9204979082818124, "learning_rate": 9.346263896138e-07, "loss": 0.1636, "step": 1619 }, { "epoch": 0.729976343359243, "grad_norm": 0.9540585406156679, "learning_rate": 9.317231131498383e-07, "loss": 0.1745, "step": 1620 }, { "epoch": 0.730426946040329, "grad_norm": 0.9859865659794163, "learning_rate": 9.288233199081914e-07, "loss": 0.1924, "step": 1621 }, { "epoch": 0.7308775487214149, "grad_norm": 1.0188292649434882, "learning_rate": 9.259270163294457e-07, "loss": 0.1768, "step": 1622 }, { "epoch": 0.7313281514025008, "grad_norm": 0.9572221089304336, "learning_rate": 9.230342088464381e-07, "loss": 0.1639, "step": 1623 }, { "epoch": 0.7317787540835868, "grad_norm": 1.0402117668520487, "learning_rate": 9.201449038842403e-07, "loss": 0.1764, "step": 1624 }, { "epoch": 0.7322293567646727, "grad_norm": 0.9431140245076933, "learning_rate": 9.172591078601448e-07, "loss": 0.1771, "step": 1625 }, { "epoch": 0.7326799594457587, "grad_norm": 0.96634616013845, "learning_rate": 9.143768271836506e-07, "loss": 0.1783, "step": 1626 }, { "epoch": 0.7331305621268447, "grad_norm": 0.9303321472969261, "learning_rate": 9.114980682564492e-07, "loss": 0.1751, "step": 1627 }, { "epoch": 0.7335811648079306, "grad_norm": 0.9357908307152987, "learning_rate": 9.086228374724096e-07, "loss": 0.171, "step": 1628 }, { "epoch": 0.7340317674890166, "grad_norm": 0.9427016806703898, "learning_rate": 9.057511412175646e-07, "loss": 0.1683, "step": 1629 }, { "epoch": 0.7344823701701025, "grad_norm": 1.0535091981818019, "learning_rate": 9.028829858700974e-07, "loss": 0.1728, "step": 1630 }, { "epoch": 0.7349329728511884, "grad_norm": 1.015879638374581, "learning_rate": 9.000183778003246e-07, "loss": 0.1815, "step": 1631 }, { "epoch": 0.7353835755322744, "grad_norm": 0.9704576831219224, "learning_rate": 8.971573233706881e-07, "loss": 0.1719, "step": 1632 }, { "epoch": 0.7358341782133604, "grad_norm": 0.9348539870408025, "learning_rate": 8.942998289357333e-07, "loss": 0.1669, "step": 1633 }, { "epoch": 0.7362847808944463, "grad_norm": 1.0153054184600172, "learning_rate": 8.914459008421e-07, "loss": 0.1905, "step": 1634 }, { "epoch": 0.7367353835755323, "grad_norm": 1.022740942947041, "learning_rate": 8.885955454285078e-07, "loss": 0.178, "step": 1635 }, { "epoch": 0.7371859862566182, "grad_norm": 0.9811667045250413, "learning_rate": 8.857487690257374e-07, "loss": 0.1666, "step": 1636 }, { "epoch": 0.7376365889377042, "grad_norm": 0.9817560207242689, "learning_rate": 8.829055779566262e-07, "loss": 0.1717, "step": 1637 }, { "epoch": 0.7380871916187901, "grad_norm": 0.9468531084095793, "learning_rate": 8.800659785360444e-07, "loss": 0.1699, "step": 1638 }, { "epoch": 0.7385377942998761, "grad_norm": 0.9261482499229096, "learning_rate": 8.772299770708859e-07, "loss": 0.1703, "step": 1639 }, { "epoch": 0.7389883969809621, "grad_norm": 0.9865460071470947, "learning_rate": 8.743975798600535e-07, "loss": 0.1649, "step": 1640 }, { "epoch": 0.739438999662048, "grad_norm": 0.9327954533317464, "learning_rate": 8.71568793194445e-07, "loss": 0.1595, "step": 1641 }, { "epoch": 0.739889602343134, "grad_norm": 0.9404578144721737, "learning_rate": 8.687436233569375e-07, "loss": 0.1518, "step": 1642 }, { "epoch": 0.7403402050242199, "grad_norm": 0.9621250147866677, "learning_rate": 8.659220766223778e-07, "loss": 0.1818, "step": 1643 }, { "epoch": 0.7407908077053058, "grad_norm": 0.914458397158177, "learning_rate": 8.631041592575643e-07, "loss": 0.1702, "step": 1644 }, { "epoch": 0.7412414103863918, "grad_norm": 0.9466787090846438, "learning_rate": 8.602898775212317e-07, "loss": 0.1731, "step": 1645 }, { "epoch": 0.7416920130674778, "grad_norm": 0.9196682383255834, "learning_rate": 8.57479237664044e-07, "loss": 0.1659, "step": 1646 }, { "epoch": 0.7421426157485637, "grad_norm": 0.9387110760030573, "learning_rate": 8.546722459285727e-07, "loss": 0.1699, "step": 1647 }, { "epoch": 0.7425932184296496, "grad_norm": 0.9798617561875471, "learning_rate": 8.518689085492909e-07, "loss": 0.1697, "step": 1648 }, { "epoch": 0.7430438211107356, "grad_norm": 0.8746481602186569, "learning_rate": 8.490692317525514e-07, "loss": 0.1519, "step": 1649 }, { "epoch": 0.7434944237918215, "grad_norm": 0.9623227355213652, "learning_rate": 8.462732217565783e-07, "loss": 0.162, "step": 1650 }, { "epoch": 0.7439450264729075, "grad_norm": 0.8927891324002865, "learning_rate": 8.434808847714512e-07, "loss": 0.1702, "step": 1651 }, { "epoch": 0.7443956291539935, "grad_norm": 0.9417444618831278, "learning_rate": 8.406922269990917e-07, "loss": 0.1644, "step": 1652 }, { "epoch": 0.7448462318350794, "grad_norm": 0.9919400797788069, "learning_rate": 8.379072546332498e-07, "loss": 0.1844, "step": 1653 }, { "epoch": 0.7452968345161654, "grad_norm": 1.0314878215406893, "learning_rate": 8.351259738594902e-07, "loss": 0.1797, "step": 1654 }, { "epoch": 0.7457474371972513, "grad_norm": 0.974844209179654, "learning_rate": 8.323483908551783e-07, "loss": 0.1814, "step": 1655 }, { "epoch": 0.7461980398783373, "grad_norm": 0.9434763418229525, "learning_rate": 8.29574511789466e-07, "loss": 0.1765, "step": 1656 }, { "epoch": 0.7466486425594232, "grad_norm": 0.9667369016142698, "learning_rate": 8.268043428232798e-07, "loss": 0.1686, "step": 1657 }, { "epoch": 0.7470992452405092, "grad_norm": 0.9442150411912407, "learning_rate": 8.240378901093035e-07, "loss": 0.1703, "step": 1658 }, { "epoch": 0.7475498479215952, "grad_norm": 0.9185798422832856, "learning_rate": 8.212751597919708e-07, "loss": 0.1804, "step": 1659 }, { "epoch": 0.748000450602681, "grad_norm": 0.9150873838854694, "learning_rate": 8.185161580074444e-07, "loss": 0.1588, "step": 1660 }, { "epoch": 0.748451053283767, "grad_norm": 0.9401507535925011, "learning_rate": 8.157608908836071e-07, "loss": 0.1583, "step": 1661 }, { "epoch": 0.748901655964853, "grad_norm": 0.901302230754303, "learning_rate": 8.130093645400469e-07, "loss": 0.1677, "step": 1662 }, { "epoch": 0.7493522586459389, "grad_norm": 0.9197142291521521, "learning_rate": 8.102615850880413e-07, "loss": 0.1724, "step": 1663 }, { "epoch": 0.7498028613270249, "grad_norm": 0.9382328996600817, "learning_rate": 8.075175586305492e-07, "loss": 0.1726, "step": 1664 }, { "epoch": 0.7502534640081109, "grad_norm": 0.90462702708837, "learning_rate": 8.047772912621921e-07, "loss": 0.1636, "step": 1665 }, { "epoch": 0.7507040666891968, "grad_norm": 0.9711524591154302, "learning_rate": 8.020407890692419e-07, "loss": 0.1716, "step": 1666 }, { "epoch": 0.7511546693702827, "grad_norm": 0.9434890487318909, "learning_rate": 7.993080581296087e-07, "loss": 0.1784, "step": 1667 }, { "epoch": 0.7516052720513687, "grad_norm": 0.9312101653818623, "learning_rate": 7.96579104512826e-07, "loss": 0.1644, "step": 1668 }, { "epoch": 0.7520558747324546, "grad_norm": 0.9848096739117619, "learning_rate": 7.938539342800373e-07, "loss": 0.1783, "step": 1669 }, { "epoch": 0.7525064774135406, "grad_norm": 0.9327157230766141, "learning_rate": 7.911325534839851e-07, "loss": 0.1664, "step": 1670 }, { "epoch": 0.7529570800946266, "grad_norm": 0.9750163171110814, "learning_rate": 7.884149681689937e-07, "loss": 0.1858, "step": 1671 }, { "epoch": 0.7534076827757125, "grad_norm": 0.9094999286751292, "learning_rate": 7.857011843709559e-07, "loss": 0.1582, "step": 1672 }, { "epoch": 0.7538582854567984, "grad_norm": 0.9650020997875528, "learning_rate": 7.829912081173238e-07, "loss": 0.1689, "step": 1673 }, { "epoch": 0.7543088881378844, "grad_norm": 0.9982763186279041, "learning_rate": 7.802850454270913e-07, "loss": 0.1758, "step": 1674 }, { "epoch": 0.7547594908189704, "grad_norm": 0.9579696747409793, "learning_rate": 7.775827023107835e-07, "loss": 0.1679, "step": 1675 }, { "epoch": 0.7552100935000563, "grad_norm": 1.0668093753827521, "learning_rate": 7.74884184770441e-07, "loss": 0.1969, "step": 1676 }, { "epoch": 0.7556606961811423, "grad_norm": 0.9626572194119604, "learning_rate": 7.721894987996076e-07, "loss": 0.1733, "step": 1677 }, { "epoch": 0.7561112988622283, "grad_norm": 0.9744882640905289, "learning_rate": 7.694986503833171e-07, "loss": 0.1801, "step": 1678 }, { "epoch": 0.7565619015433142, "grad_norm": 0.9403313127374262, "learning_rate": 7.668116454980804e-07, "loss": 0.1734, "step": 1679 }, { "epoch": 0.7570125042244001, "grad_norm": 0.9897631979152582, "learning_rate": 7.641284901118703e-07, "loss": 0.1855, "step": 1680 }, { "epoch": 0.7574631069054861, "grad_norm": 0.917940041749129, "learning_rate": 7.614491901841118e-07, "loss": 0.1649, "step": 1681 }, { "epoch": 0.757913709586572, "grad_norm": 0.9028036690911325, "learning_rate": 7.587737516656651e-07, "loss": 0.164, "step": 1682 }, { "epoch": 0.758364312267658, "grad_norm": 0.8959138677827185, "learning_rate": 7.561021804988141e-07, "loss": 0.1613, "step": 1683 }, { "epoch": 0.758814914948744, "grad_norm": 0.9453945367159504, "learning_rate": 7.534344826172546e-07, "loss": 0.1812, "step": 1684 }, { "epoch": 0.7592655176298299, "grad_norm": 0.9475739534763087, "learning_rate": 7.507706639460768e-07, "loss": 0.1774, "step": 1685 }, { "epoch": 0.7597161203109158, "grad_norm": 0.9284456020009094, "learning_rate": 7.481107304017588e-07, "loss": 0.1757, "step": 1686 }, { "epoch": 0.7601667229920018, "grad_norm": 0.9481356302559381, "learning_rate": 7.454546878921465e-07, "loss": 0.1717, "step": 1687 }, { "epoch": 0.7606173256730877, "grad_norm": 1.0275463149896271, "learning_rate": 7.428025423164456e-07, "loss": 0.1844, "step": 1688 }, { "epoch": 0.7610679283541737, "grad_norm": 0.9794885029503064, "learning_rate": 7.401542995652033e-07, "loss": 0.1798, "step": 1689 }, { "epoch": 0.7615185310352597, "grad_norm": 0.9444897520966486, "learning_rate": 7.375099655203033e-07, "loss": 0.1651, "step": 1690 }, { "epoch": 0.7619691337163456, "grad_norm": 0.9188622310083528, "learning_rate": 7.348695460549443e-07, "loss": 0.168, "step": 1691 }, { "epoch": 0.7624197363974315, "grad_norm": 0.9874990105238635, "learning_rate": 7.322330470336314e-07, "loss": 0.1838, "step": 1692 }, { "epoch": 0.7628703390785175, "grad_norm": 0.9549127415217946, "learning_rate": 7.296004743121627e-07, "loss": 0.1685, "step": 1693 }, { "epoch": 0.7633209417596035, "grad_norm": 0.9834011052029592, "learning_rate": 7.26971833737615e-07, "loss": 0.1752, "step": 1694 }, { "epoch": 0.7637715444406894, "grad_norm": 0.9292459980595047, "learning_rate": 7.243471311483322e-07, "loss": 0.1686, "step": 1695 }, { "epoch": 0.7642221471217754, "grad_norm": 0.9744168782441898, "learning_rate": 7.217263723739107e-07, "loss": 0.1864, "step": 1696 }, { "epoch": 0.7646727498028614, "grad_norm": 0.9696008836681465, "learning_rate": 7.191095632351908e-07, "loss": 0.1776, "step": 1697 }, { "epoch": 0.7651233524839472, "grad_norm": 0.96774867165191, "learning_rate": 7.164967095442357e-07, "loss": 0.1759, "step": 1698 }, { "epoch": 0.7655739551650332, "grad_norm": 0.9884684198285855, "learning_rate": 7.138878171043262e-07, "loss": 0.1732, "step": 1699 }, { "epoch": 0.7660245578461192, "grad_norm": 1.0068448942639325, "learning_rate": 7.112828917099438e-07, "loss": 0.1739, "step": 1700 }, { "epoch": 0.7664751605272051, "grad_norm": 0.9987811192949849, "learning_rate": 7.086819391467612e-07, "loss": 0.1831, "step": 1701 }, { "epoch": 0.7669257632082911, "grad_norm": 1.0017086323571975, "learning_rate": 7.060849651916244e-07, "loss": 0.1709, "step": 1702 }, { "epoch": 0.7673763658893771, "grad_norm": 0.9373080593200287, "learning_rate": 7.034919756125447e-07, "loss": 0.1696, "step": 1703 }, { "epoch": 0.767826968570463, "grad_norm": 0.9490094333958343, "learning_rate": 7.009029761686825e-07, "loss": 0.167, "step": 1704 }, { "epoch": 0.7682775712515489, "grad_norm": 0.9750208874260144, "learning_rate": 6.98317972610337e-07, "loss": 0.1604, "step": 1705 }, { "epoch": 0.7687281739326349, "grad_norm": 0.9624993279869102, "learning_rate": 6.957369706789319e-07, "loss": 0.1608, "step": 1706 }, { "epoch": 0.7691787766137208, "grad_norm": 0.9787624995991694, "learning_rate": 6.931599761070027e-07, "loss": 0.1691, "step": 1707 }, { "epoch": 0.7696293792948068, "grad_norm": 1.007340459210081, "learning_rate": 6.905869946181848e-07, "loss": 0.1812, "step": 1708 }, { "epoch": 0.7700799819758928, "grad_norm": 0.9808544623527553, "learning_rate": 6.880180319272006e-07, "loss": 0.1764, "step": 1709 }, { "epoch": 0.7705305846569787, "grad_norm": 0.9657443098275049, "learning_rate": 6.854530937398459e-07, "loss": 0.1787, "step": 1710 }, { "epoch": 0.7709811873380646, "grad_norm": 0.9448605482477895, "learning_rate": 6.828921857529774e-07, "loss": 0.1602, "step": 1711 }, { "epoch": 0.7714317900191506, "grad_norm": 0.9353366127853433, "learning_rate": 6.803353136545033e-07, "loss": 0.1814, "step": 1712 }, { "epoch": 0.7718823927002366, "grad_norm": 1.0082830443026907, "learning_rate": 6.777824831233645e-07, "loss": 0.1843, "step": 1713 }, { "epoch": 0.7723329953813225, "grad_norm": 1.0012175555664218, "learning_rate": 6.752336998295281e-07, "loss": 0.1656, "step": 1714 }, { "epoch": 0.7727835980624085, "grad_norm": 0.9510344959155705, "learning_rate": 6.726889694339689e-07, "loss": 0.1735, "step": 1715 }, { "epoch": 0.7732342007434945, "grad_norm": 0.9607739674740182, "learning_rate": 6.701482975886617e-07, "loss": 0.1761, "step": 1716 }, { "epoch": 0.7736848034245803, "grad_norm": 0.9266629906847302, "learning_rate": 6.676116899365692e-07, "loss": 0.1649, "step": 1717 }, { "epoch": 0.7741354061056663, "grad_norm": 0.9105923017777625, "learning_rate": 6.650791521116243e-07, "loss": 0.1663, "step": 1718 }, { "epoch": 0.7745860087867523, "grad_norm": 0.9094686688201417, "learning_rate": 6.625506897387215e-07, "loss": 0.1626, "step": 1719 }, { "epoch": 0.7750366114678382, "grad_norm": 1.0543916388352368, "learning_rate": 6.600263084337041e-07, "loss": 0.1732, "step": 1720 }, { "epoch": 0.7754872141489242, "grad_norm": 0.9678813090969567, "learning_rate": 6.575060138033504e-07, "loss": 0.1778, "step": 1721 }, { "epoch": 0.7759378168300102, "grad_norm": 0.9860219233564725, "learning_rate": 6.549898114453615e-07, "loss": 0.1761, "step": 1722 }, { "epoch": 0.776388419511096, "grad_norm": 0.9359894577473049, "learning_rate": 6.524777069483526e-07, "loss": 0.1663, "step": 1723 }, { "epoch": 0.776839022192182, "grad_norm": 1.0095972151252641, "learning_rate": 6.499697058918326e-07, "loss": 0.1748, "step": 1724 }, { "epoch": 0.777289624873268, "grad_norm": 0.9426089610465914, "learning_rate": 6.474658138461992e-07, "loss": 0.173, "step": 1725 }, { "epoch": 0.7777402275543539, "grad_norm": 0.9234669569269677, "learning_rate": 6.449660363727236e-07, "loss": 0.1663, "step": 1726 }, { "epoch": 0.7781908302354399, "grad_norm": 0.9405863138790215, "learning_rate": 6.424703790235374e-07, "loss": 0.1692, "step": 1727 }, { "epoch": 0.7786414329165259, "grad_norm": 0.9335835838722892, "learning_rate": 6.399788473416229e-07, "loss": 0.166, "step": 1728 }, { "epoch": 0.7790920355976118, "grad_norm": 0.9511706860844333, "learning_rate": 6.374914468607976e-07, "loss": 0.1791, "step": 1729 }, { "epoch": 0.7795426382786977, "grad_norm": 0.9852564601610345, "learning_rate": 6.35008183105704e-07, "loss": 0.1628, "step": 1730 }, { "epoch": 0.7799932409597837, "grad_norm": 1.0096784064849857, "learning_rate": 6.325290615917961e-07, "loss": 0.187, "step": 1731 }, { "epoch": 0.7804438436408697, "grad_norm": 0.9287036327371591, "learning_rate": 6.300540878253286e-07, "loss": 0.1689, "step": 1732 }, { "epoch": 0.7808944463219556, "grad_norm": 0.9645312989670097, "learning_rate": 6.27583267303343e-07, "loss": 0.1629, "step": 1733 }, { "epoch": 0.7813450490030416, "grad_norm": 0.9098747848669801, "learning_rate": 6.251166055136573e-07, "loss": 0.1663, "step": 1734 }, { "epoch": 0.7817956516841276, "grad_norm": 0.9672682550620934, "learning_rate": 6.226541079348517e-07, "loss": 0.1824, "step": 1735 }, { "epoch": 0.7822462543652134, "grad_norm": 0.9033211880815116, "learning_rate": 6.201957800362579e-07, "loss": 0.1572, "step": 1736 }, { "epoch": 0.7826968570462994, "grad_norm": 0.9729803882075092, "learning_rate": 6.177416272779468e-07, "loss": 0.1789, "step": 1737 }, { "epoch": 0.7831474597273854, "grad_norm": 1.0067811549313692, "learning_rate": 6.152916551107149e-07, "loss": 0.1843, "step": 1738 }, { "epoch": 0.7835980624084713, "grad_norm": 0.9130863572610125, "learning_rate": 6.12845868976076e-07, "loss": 0.1524, "step": 1739 }, { "epoch": 0.7840486650895573, "grad_norm": 0.9814494497101688, "learning_rate": 6.104042743062439e-07, "loss": 0.175, "step": 1740 }, { "epoch": 0.7844992677706433, "grad_norm": 0.9493598919925331, "learning_rate": 6.079668765241248e-07, "loss": 0.1685, "step": 1741 }, { "epoch": 0.7849498704517291, "grad_norm": 0.9674465998413315, "learning_rate": 6.05533681043301e-07, "loss": 0.1731, "step": 1742 }, { "epoch": 0.7854004731328151, "grad_norm": 0.9144085528498724, "learning_rate": 6.031046932680229e-07, "loss": 0.1577, "step": 1743 }, { "epoch": 0.7858510758139011, "grad_norm": 0.9306591229731381, "learning_rate": 6.006799185931964e-07, "loss": 0.1738, "step": 1744 }, { "epoch": 0.786301678494987, "grad_norm": 0.9666823036771994, "learning_rate": 5.982593624043682e-07, "loss": 0.1656, "step": 1745 }, { "epoch": 0.786752281176073, "grad_norm": 1.0216256666717884, "learning_rate": 5.958430300777157e-07, "loss": 0.1815, "step": 1746 }, { "epoch": 0.787202883857159, "grad_norm": 0.9414221341205914, "learning_rate": 5.934309269800359e-07, "loss": 0.1635, "step": 1747 }, { "epoch": 0.7876534865382449, "grad_norm": 1.073218436568168, "learning_rate": 5.910230584687316e-07, "loss": 0.1747, "step": 1748 }, { "epoch": 0.7881040892193308, "grad_norm": 1.022405588004278, "learning_rate": 5.886194298917994e-07, "loss": 0.1834, "step": 1749 }, { "epoch": 0.7885546919004168, "grad_norm": 0.9334120421015962, "learning_rate": 5.862200465878228e-07, "loss": 0.1603, "step": 1750 }, { "epoch": 0.7890052945815028, "grad_norm": 0.9383559818428457, "learning_rate": 5.838249138859509e-07, "loss": 0.1651, "step": 1751 }, { "epoch": 0.7894558972625887, "grad_norm": 0.8996153804259718, "learning_rate": 5.814340371058957e-07, "loss": 0.1596, "step": 1752 }, { "epoch": 0.7899064999436747, "grad_norm": 0.9662819628561679, "learning_rate": 5.79047421557915e-07, "loss": 0.1701, "step": 1753 }, { "epoch": 0.7903571026247607, "grad_norm": 0.9821115878303758, "learning_rate": 5.766650725428027e-07, "loss": 0.1721, "step": 1754 }, { "epoch": 0.7908077053058465, "grad_norm": 0.9530481652480088, "learning_rate": 5.742869953518773e-07, "loss": 0.1698, "step": 1755 }, { "epoch": 0.7912583079869325, "grad_norm": 0.9347238724778862, "learning_rate": 5.719131952669679e-07, "loss": 0.1746, "step": 1756 }, { "epoch": 0.7917089106680185, "grad_norm": 0.9974845092899297, "learning_rate": 5.695436775604049e-07, "loss": 0.1872, "step": 1757 }, { "epoch": 0.7921595133491044, "grad_norm": 0.9823704981517646, "learning_rate": 5.671784474950068e-07, "loss": 0.1646, "step": 1758 }, { "epoch": 0.7926101160301904, "grad_norm": 0.9528761781150609, "learning_rate": 5.648175103240694e-07, "loss": 0.1678, "step": 1759 }, { "epoch": 0.7930607187112764, "grad_norm": 0.9137049047059816, "learning_rate": 5.624608712913531e-07, "loss": 0.1702, "step": 1760 }, { "epoch": 0.7935113213923622, "grad_norm": 0.9307972970588964, "learning_rate": 5.601085356310734e-07, "loss": 0.176, "step": 1761 }, { "epoch": 0.7939619240734482, "grad_norm": 0.9444469133716082, "learning_rate": 5.577605085678858e-07, "loss": 0.1694, "step": 1762 }, { "epoch": 0.7944125267545342, "grad_norm": 0.9333704389867957, "learning_rate": 5.554167953168779e-07, "loss": 0.1675, "step": 1763 }, { "epoch": 0.7948631294356201, "grad_norm": 0.993750845065701, "learning_rate": 5.530774010835552e-07, "loss": 0.1819, "step": 1764 }, { "epoch": 0.7953137321167061, "grad_norm": 0.9348594108562784, "learning_rate": 5.507423310638299e-07, "loss": 0.1681, "step": 1765 }, { "epoch": 0.7957643347977921, "grad_norm": 0.9708757301258297, "learning_rate": 5.48411590444012e-07, "loss": 0.164, "step": 1766 }, { "epoch": 0.796214937478878, "grad_norm": 0.9431276887221065, "learning_rate": 5.460851844007945e-07, "loss": 0.1701, "step": 1767 }, { "epoch": 0.7966655401599639, "grad_norm": 0.9369828225416942, "learning_rate": 5.437631181012415e-07, "loss": 0.1725, "step": 1768 }, { "epoch": 0.7971161428410499, "grad_norm": 0.9742268350847076, "learning_rate": 5.414453967027797e-07, "loss": 0.1763, "step": 1769 }, { "epoch": 0.7975667455221359, "grad_norm": 0.9691529149171444, "learning_rate": 5.391320253531868e-07, "loss": 0.1813, "step": 1770 }, { "epoch": 0.7980173482032218, "grad_norm": 0.8867961718282267, "learning_rate": 5.368230091905774e-07, "loss": 0.1616, "step": 1771 }, { "epoch": 0.7984679508843078, "grad_norm": 0.9198921470166019, "learning_rate": 5.345183533433926e-07, "loss": 0.1619, "step": 1772 }, { "epoch": 0.7989185535653938, "grad_norm": 0.9747081060897254, "learning_rate": 5.322180629303902e-07, "loss": 0.1806, "step": 1773 }, { "epoch": 0.7993691562464796, "grad_norm": 0.9760127339119377, "learning_rate": 5.299221430606313e-07, "loss": 0.1758, "step": 1774 }, { "epoch": 0.7998197589275656, "grad_norm": 0.9701776719091275, "learning_rate": 5.276305988334701e-07, "loss": 0.1806, "step": 1775 }, { "epoch": 0.8002703616086516, "grad_norm": 0.988371999259503, "learning_rate": 5.253434353385422e-07, "loss": 0.1687, "step": 1776 }, { "epoch": 0.8007209642897375, "grad_norm": 0.8598044634557204, "learning_rate": 5.23060657655754e-07, "loss": 0.1523, "step": 1777 }, { "epoch": 0.8011715669708235, "grad_norm": 0.9071673557398994, "learning_rate": 5.207822708552695e-07, "loss": 0.1655, "step": 1778 }, { "epoch": 0.8016221696519095, "grad_norm": 0.9513483134209044, "learning_rate": 5.185082799975013e-07, "loss": 0.1738, "step": 1779 }, { "epoch": 0.8020727723329953, "grad_norm": 1.0075511177548173, "learning_rate": 5.162386901330977e-07, "loss": 0.161, "step": 1780 }, { "epoch": 0.8025233750140813, "grad_norm": 0.9359270752654021, "learning_rate": 5.139735063029338e-07, "loss": 0.1771, "step": 1781 }, { "epoch": 0.8029739776951673, "grad_norm": 0.9189830873891193, "learning_rate": 5.117127335380967e-07, "loss": 0.1623, "step": 1782 }, { "epoch": 0.8034245803762532, "grad_norm": 0.9536212390806852, "learning_rate": 5.094563768598773e-07, "loss": 0.1765, "step": 1783 }, { "epoch": 0.8038751830573392, "grad_norm": 0.892559123324922, "learning_rate": 5.07204441279758e-07, "loss": 0.1595, "step": 1784 }, { "epoch": 0.8043257857384252, "grad_norm": 1.0071090882494602, "learning_rate": 5.049569317994013e-07, "loss": 0.1775, "step": 1785 }, { "epoch": 0.804776388419511, "grad_norm": 0.908277153855911, "learning_rate": 5.027138534106399e-07, "loss": 0.1548, "step": 1786 }, { "epoch": 0.805226991100597, "grad_norm": 0.9223046738550524, "learning_rate": 5.004752110954642e-07, "loss": 0.1693, "step": 1787 }, { "epoch": 0.805677593781683, "grad_norm": 0.9814309602126301, "learning_rate": 4.982410098260118e-07, "loss": 0.173, "step": 1788 }, { "epoch": 0.806128196462769, "grad_norm": 0.949733318801411, "learning_rate": 4.96011254564557e-07, "loss": 0.1552, "step": 1789 }, { "epoch": 0.8065787991438549, "grad_norm": 0.9276630676010313, "learning_rate": 4.937859502634992e-07, "loss": 0.1684, "step": 1790 }, { "epoch": 0.8070294018249409, "grad_norm": 0.9493296870395656, "learning_rate": 4.915651018653511e-07, "loss": 0.1708, "step": 1791 }, { "epoch": 0.8074800045060269, "grad_norm": 0.9831480782450759, "learning_rate": 4.893487143027307e-07, "loss": 0.1716, "step": 1792 }, { "epoch": 0.8079306071871127, "grad_norm": 0.9536954872523767, "learning_rate": 4.871367924983458e-07, "loss": 0.1657, "step": 1793 }, { "epoch": 0.8083812098681987, "grad_norm": 0.9577400855458258, "learning_rate": 4.84929341364988e-07, "loss": 0.1695, "step": 1794 }, { "epoch": 0.8088318125492847, "grad_norm": 1.0109618072957722, "learning_rate": 4.827263658055161e-07, "loss": 0.176, "step": 1795 }, { "epoch": 0.8092824152303706, "grad_norm": 0.9240733937478894, "learning_rate": 4.805278707128505e-07, "loss": 0.1633, "step": 1796 }, { "epoch": 0.8097330179114566, "grad_norm": 0.9175205272410301, "learning_rate": 4.783338609699614e-07, "loss": 0.1639, "step": 1797 }, { "epoch": 0.8101836205925426, "grad_norm": 0.9832145863528448, "learning_rate": 4.7614434144985486e-07, "loss": 0.1839, "step": 1798 }, { "epoch": 0.8106342232736284, "grad_norm": 0.9250138705305986, "learning_rate": 4.73959317015564e-07, "loss": 0.1607, "step": 1799 }, { "epoch": 0.8110848259547144, "grad_norm": 0.9988990989007607, "learning_rate": 4.7177879252013945e-07, "loss": 0.1963, "step": 1800 }, { "epoch": 0.8115354286358004, "grad_norm": 0.9884011429992995, "learning_rate": 4.6960277280663574e-07, "loss": 0.1747, "step": 1801 }, { "epoch": 0.8119860313168863, "grad_norm": 0.9687937064883984, "learning_rate": 4.674312627081032e-07, "loss": 0.1746, "step": 1802 }, { "epoch": 0.8124366339979723, "grad_norm": 0.9680030187172702, "learning_rate": 4.6526426704757545e-07, "loss": 0.1683, "step": 1803 }, { "epoch": 0.8128872366790583, "grad_norm": 0.9128681785542643, "learning_rate": 4.6310179063805916e-07, "loss": 0.1583, "step": 1804 }, { "epoch": 0.8133378393601441, "grad_norm": 0.9503424077025497, "learning_rate": 4.609438382825246e-07, "loss": 0.168, "step": 1805 }, { "epoch": 0.8137884420412301, "grad_norm": 0.9854433552645877, "learning_rate": 4.587904147738925e-07, "loss": 0.1722, "step": 1806 }, { "epoch": 0.8142390447223161, "grad_norm": 0.9535187246316041, "learning_rate": 4.566415248950251e-07, "loss": 0.1714, "step": 1807 }, { "epoch": 0.814689647403402, "grad_norm": 0.9730755908109525, "learning_rate": 4.5449717341871646e-07, "loss": 0.1631, "step": 1808 }, { "epoch": 0.815140250084488, "grad_norm": 0.9486061901445837, "learning_rate": 4.5235736510767957e-07, "loss": 0.1631, "step": 1809 }, { "epoch": 0.815590852765574, "grad_norm": 0.9516956179529896, "learning_rate": 4.5022210471453664e-07, "loss": 0.177, "step": 1810 }, { "epoch": 0.81604145544666, "grad_norm": 0.9337467064350875, "learning_rate": 4.480913969818099e-07, "loss": 0.1738, "step": 1811 }, { "epoch": 0.8164920581277458, "grad_norm": 0.9557630287598873, "learning_rate": 4.4596524664190674e-07, "loss": 0.1564, "step": 1812 }, { "epoch": 0.8169426608088318, "grad_norm": 0.9106928433789534, "learning_rate": 4.4384365841711684e-07, "loss": 0.1682, "step": 1813 }, { "epoch": 0.8173932634899178, "grad_norm": 0.8929215982521911, "learning_rate": 4.417266370195944e-07, "loss": 0.1635, "step": 1814 }, { "epoch": 0.8178438661710037, "grad_norm": 0.9601587758950517, "learning_rate": 4.3961418715135097e-07, "loss": 0.1759, "step": 1815 }, { "epoch": 0.8182944688520897, "grad_norm": 0.9622107099235466, "learning_rate": 4.3750631350424456e-07, "loss": 0.1727, "step": 1816 }, { "epoch": 0.8187450715331757, "grad_norm": 0.9675048072605508, "learning_rate": 4.354030207599691e-07, "loss": 0.1744, "step": 1817 }, { "epoch": 0.8191956742142615, "grad_norm": 0.9853236739076215, "learning_rate": 4.333043135900436e-07, "loss": 0.1686, "step": 1818 }, { "epoch": 0.8196462768953475, "grad_norm": 0.9781020890619168, "learning_rate": 4.312101966558044e-07, "loss": 0.1729, "step": 1819 }, { "epoch": 0.8200968795764335, "grad_norm": 0.9115554590765232, "learning_rate": 4.2912067460839066e-07, "loss": 0.1581, "step": 1820 }, { "epoch": 0.8205474822575194, "grad_norm": 0.9616917821289626, "learning_rate": 4.2703575208873585e-07, "loss": 0.1565, "step": 1821 }, { "epoch": 0.8209980849386054, "grad_norm": 0.922130671794463, "learning_rate": 4.2495543372755854e-07, "loss": 0.1595, "step": 1822 }, { "epoch": 0.8214486876196914, "grad_norm": 0.9683579853431844, "learning_rate": 4.2287972414535084e-07, "loss": 0.1673, "step": 1823 }, { "epoch": 0.8218992903007772, "grad_norm": 0.961905725310261, "learning_rate": 4.208086279523699e-07, "loss": 0.1741, "step": 1824 }, { "epoch": 0.8223498929818632, "grad_norm": 0.9119598816657689, "learning_rate": 4.1874214974862436e-07, "loss": 0.1695, "step": 1825 }, { "epoch": 0.8228004956629492, "grad_norm": 0.973526812761145, "learning_rate": 4.1668029412386677e-07, "loss": 0.1733, "step": 1826 }, { "epoch": 0.8232510983440351, "grad_norm": 0.9599913535559322, "learning_rate": 4.146230656575831e-07, "loss": 0.1792, "step": 1827 }, { "epoch": 0.8237017010251211, "grad_norm": 0.9636632216252867, "learning_rate": 4.125704689189819e-07, "loss": 0.1761, "step": 1828 }, { "epoch": 0.8241523037062071, "grad_norm": 0.9087918090205587, "learning_rate": 4.105225084669839e-07, "loss": 0.1551, "step": 1829 }, { "epoch": 0.824602906387293, "grad_norm": 0.9532950453959921, "learning_rate": 4.084791888502135e-07, "loss": 0.1562, "step": 1830 }, { "epoch": 0.8250535090683789, "grad_norm": 0.9171866851813432, "learning_rate": 4.0644051460698634e-07, "loss": 0.1711, "step": 1831 }, { "epoch": 0.8255041117494649, "grad_norm": 0.9644212760081424, "learning_rate": 4.0440649026530166e-07, "loss": 0.1763, "step": 1832 }, { "epoch": 0.8259547144305509, "grad_norm": 0.9135624183632747, "learning_rate": 4.0237712034283004e-07, "loss": 0.1699, "step": 1833 }, { "epoch": 0.8264053171116368, "grad_norm": 0.9700605328629466, "learning_rate": 4.003524093469041e-07, "loss": 0.1795, "step": 1834 }, { "epoch": 0.8268559197927228, "grad_norm": 0.9494632861193675, "learning_rate": 3.983323617745111e-07, "loss": 0.166, "step": 1835 }, { "epoch": 0.8273065224738088, "grad_norm": 0.9689443716041818, "learning_rate": 3.963169821122778e-07, "loss": 0.1667, "step": 1836 }, { "epoch": 0.8277571251548946, "grad_norm": 0.9494816847933051, "learning_rate": 3.943062748364651e-07, "loss": 0.1864, "step": 1837 }, { "epoch": 0.8282077278359806, "grad_norm": 0.961626291646709, "learning_rate": 3.9230024441295394e-07, "loss": 0.1713, "step": 1838 }, { "epoch": 0.8286583305170666, "grad_norm": 0.8826493854231543, "learning_rate": 3.9029889529724113e-07, "loss": 0.1637, "step": 1839 }, { "epoch": 0.8291089331981525, "grad_norm": 0.91167006225646, "learning_rate": 3.8830223193442345e-07, "loss": 0.1588, "step": 1840 }, { "epoch": 0.8295595358792385, "grad_norm": 0.9264347983159924, "learning_rate": 3.863102587591919e-07, "loss": 0.16, "step": 1841 }, { "epoch": 0.8300101385603245, "grad_norm": 0.951334590876306, "learning_rate": 3.84322980195819e-07, "loss": 0.1683, "step": 1842 }, { "epoch": 0.8304607412414103, "grad_norm": 0.9659033816073591, "learning_rate": 3.823404006581513e-07, "loss": 0.1703, "step": 1843 }, { "epoch": 0.8309113439224963, "grad_norm": 0.9209942273315058, "learning_rate": 3.8036252454959844e-07, "loss": 0.1662, "step": 1844 }, { "epoch": 0.8313619466035823, "grad_norm": 0.9281131852365946, "learning_rate": 3.7838935626312246e-07, "loss": 0.1646, "step": 1845 }, { "epoch": 0.8318125492846682, "grad_norm": 0.9439773737834088, "learning_rate": 3.764209001812316e-07, "loss": 0.1748, "step": 1846 }, { "epoch": 0.8322631519657542, "grad_norm": 0.9266237935276338, "learning_rate": 3.7445716067596506e-07, "loss": 0.1731, "step": 1847 }, { "epoch": 0.8327137546468402, "grad_norm": 0.924011205749328, "learning_rate": 3.72498142108888e-07, "loss": 0.166, "step": 1848 }, { "epoch": 0.8331643573279262, "grad_norm": 0.9975906771664418, "learning_rate": 3.705438488310792e-07, "loss": 0.175, "step": 1849 }, { "epoch": 0.833614960009012, "grad_norm": 0.9870663130220605, "learning_rate": 3.6859428518312394e-07, "loss": 0.177, "step": 1850 }, { "epoch": 0.834065562690098, "grad_norm": 1.0589620302719522, "learning_rate": 3.666494554951014e-07, "loss": 0.1731, "step": 1851 }, { "epoch": 0.834516165371184, "grad_norm": 0.9792308554611852, "learning_rate": 3.6470936408657647e-07, "loss": 0.1544, "step": 1852 }, { "epoch": 0.8349667680522699, "grad_norm": 1.0004784542843967, "learning_rate": 3.6277401526659067e-07, "loss": 0.1729, "step": 1853 }, { "epoch": 0.8354173707333559, "grad_norm": 1.1253575928698794, "learning_rate": 3.6084341333365135e-07, "loss": 0.1718, "step": 1854 }, { "epoch": 0.8358679734144419, "grad_norm": 0.9624239214809235, "learning_rate": 3.58917562575723e-07, "loss": 0.1791, "step": 1855 }, { "epoch": 0.8363185760955277, "grad_norm": 0.9606501614996009, "learning_rate": 3.569964672702178e-07, "loss": 0.1714, "step": 1856 }, { "epoch": 0.8367691787766137, "grad_norm": 0.9759429493736111, "learning_rate": 3.550801316839858e-07, "loss": 0.1805, "step": 1857 }, { "epoch": 0.8372197814576997, "grad_norm": 0.9783279630837232, "learning_rate": 3.531685600733051e-07, "loss": 0.1681, "step": 1858 }, { "epoch": 0.8376703841387856, "grad_norm": 0.9918620830896449, "learning_rate": 3.5126175668387275e-07, "loss": 0.1843, "step": 1859 }, { "epoch": 0.8381209868198716, "grad_norm": 1.0541404809751138, "learning_rate": 3.4935972575079524e-07, "loss": 0.1868, "step": 1860 }, { "epoch": 0.8385715895009576, "grad_norm": 0.8875615125272613, "learning_rate": 3.474624714985805e-07, "loss": 0.1542, "step": 1861 }, { "epoch": 0.8390221921820434, "grad_norm": 0.947000239574121, "learning_rate": 3.455699981411259e-07, "loss": 0.1639, "step": 1862 }, { "epoch": 0.8394727948631294, "grad_norm": 0.9871624452171542, "learning_rate": 3.436823098817102e-07, "loss": 0.1717, "step": 1863 }, { "epoch": 0.8399233975442154, "grad_norm": 0.9771759407766166, "learning_rate": 3.417994109129852e-07, "loss": 0.1602, "step": 1864 }, { "epoch": 0.8403740002253013, "grad_norm": 0.9195044006987618, "learning_rate": 3.3992130541696336e-07, "loss": 0.1652, "step": 1865 }, { "epoch": 0.8408246029063873, "grad_norm": 0.9445742965717538, "learning_rate": 3.3804799756501335e-07, "loss": 0.1721, "step": 1866 }, { "epoch": 0.8412752055874733, "grad_norm": 0.9362275436972677, "learning_rate": 3.3617949151784623e-07, "loss": 0.1703, "step": 1867 }, { "epoch": 0.8417258082685593, "grad_norm": 1.014145441841758, "learning_rate": 3.343157914255085e-07, "loss": 0.1819, "step": 1868 }, { "epoch": 0.8421764109496451, "grad_norm": 0.9314737099222552, "learning_rate": 3.3245690142737236e-07, "loss": 0.1756, "step": 1869 }, { "epoch": 0.8426270136307311, "grad_norm": 0.9722306559540486, "learning_rate": 3.306028256521265e-07, "loss": 0.1699, "step": 1870 }, { "epoch": 0.8430776163118171, "grad_norm": 0.9672581166734934, "learning_rate": 3.287535682177667e-07, "loss": 0.1799, "step": 1871 }, { "epoch": 0.843528218992903, "grad_norm": 0.9858629820179523, "learning_rate": 3.2690913323158795e-07, "loss": 0.167, "step": 1872 }, { "epoch": 0.843978821673989, "grad_norm": 0.9771711710226948, "learning_rate": 3.2506952479017417e-07, "loss": 0.1755, "step": 1873 }, { "epoch": 0.844429424355075, "grad_norm": 0.9702797005479236, "learning_rate": 3.2323474697938727e-07, "loss": 0.1807, "step": 1874 }, { "epoch": 0.8448800270361608, "grad_norm": 0.9728669390673415, "learning_rate": 3.214048038743622e-07, "loss": 0.1735, "step": 1875 }, { "epoch": 0.8453306297172468, "grad_norm": 0.9032972459456295, "learning_rate": 3.1957969953949506e-07, "loss": 0.1514, "step": 1876 }, { "epoch": 0.8457812323983328, "grad_norm": 1.0259745438344021, "learning_rate": 3.1775943802843546e-07, "loss": 0.1889, "step": 1877 }, { "epoch": 0.8462318350794187, "grad_norm": 0.9278308411260581, "learning_rate": 3.1594402338407633e-07, "loss": 0.1619, "step": 1878 }, { "epoch": 0.8466824377605047, "grad_norm": 0.9486461271581993, "learning_rate": 3.141334596385448e-07, "loss": 0.1701, "step": 1879 }, { "epoch": 0.8471330404415907, "grad_norm": 0.9264937573351875, "learning_rate": 3.12327750813195e-07, "loss": 0.16, "step": 1880 }, { "epoch": 0.8475836431226765, "grad_norm": 0.9063889096458639, "learning_rate": 3.105269009185974e-07, "loss": 0.153, "step": 1881 }, { "epoch": 0.8480342458037625, "grad_norm": 0.9345172159415808, "learning_rate": 3.087309139545311e-07, "loss": 0.1706, "step": 1882 }, { "epoch": 0.8484848484848485, "grad_norm": 0.884600902211049, "learning_rate": 3.0693979390997333e-07, "loss": 0.1579, "step": 1883 }, { "epoch": 0.8489354511659344, "grad_norm": 0.9278539998900202, "learning_rate": 3.0515354476309293e-07, "loss": 0.1617, "step": 1884 }, { "epoch": 0.8493860538470204, "grad_norm": 0.9492452968446602, "learning_rate": 3.033721704812395e-07, "loss": 0.1676, "step": 1885 }, { "epoch": 0.8498366565281064, "grad_norm": 0.9832023339183169, "learning_rate": 3.0159567502093535e-07, "loss": 0.179, "step": 1886 }, { "epoch": 0.8502872592091923, "grad_norm": 0.9782668678333437, "learning_rate": 2.9982406232786614e-07, "loss": 0.1742, "step": 1887 }, { "epoch": 0.8507378618902782, "grad_norm": 0.9701650943366985, "learning_rate": 2.9805733633687467e-07, "loss": 0.175, "step": 1888 }, { "epoch": 0.8511884645713642, "grad_norm": 0.9651583163077686, "learning_rate": 2.9629550097194787e-07, "loss": 0.1782, "step": 1889 }, { "epoch": 0.8516390672524502, "grad_norm": 0.9024669829464759, "learning_rate": 2.9453856014621224e-07, "loss": 0.1623, "step": 1890 }, { "epoch": 0.8520896699335361, "grad_norm": 1.0192621240776767, "learning_rate": 2.9278651776192073e-07, "loss": 0.1936, "step": 1891 }, { "epoch": 0.8525402726146221, "grad_norm": 0.9918084930237235, "learning_rate": 2.910393777104481e-07, "loss": 0.1751, "step": 1892 }, { "epoch": 0.852990875295708, "grad_norm": 0.9650070719897159, "learning_rate": 2.892971438722822e-07, "loss": 0.1772, "step": 1893 }, { "epoch": 0.8534414779767939, "grad_norm": 0.9685100244625674, "learning_rate": 2.8755982011701183e-07, "loss": 0.1767, "step": 1894 }, { "epoch": 0.8538920806578799, "grad_norm": 0.9556958216529681, "learning_rate": 2.8582741030332095e-07, "loss": 0.1779, "step": 1895 }, { "epoch": 0.8543426833389659, "grad_norm": 0.9757391441636464, "learning_rate": 2.840999182789797e-07, "loss": 0.1679, "step": 1896 }, { "epoch": 0.8547932860200518, "grad_norm": 0.9672040065820859, "learning_rate": 2.823773478808348e-07, "loss": 0.1762, "step": 1897 }, { "epoch": 0.8552438887011378, "grad_norm": 0.9500345661356947, "learning_rate": 2.806597029348018e-07, "loss": 0.1616, "step": 1898 }, { "epoch": 0.8556944913822238, "grad_norm": 0.8833689073169025, "learning_rate": 2.7894698725585866e-07, "loss": 0.1587, "step": 1899 }, { "epoch": 0.8561450940633096, "grad_norm": 0.9467995781424163, "learning_rate": 2.772392046480324e-07, "loss": 0.1674, "step": 1900 }, { "epoch": 0.8565956967443956, "grad_norm": 0.9205995331832161, "learning_rate": 2.755363589043944e-07, "loss": 0.1634, "step": 1901 }, { "epoch": 0.8570462994254816, "grad_norm": 0.9563426840948828, "learning_rate": 2.738384538070518e-07, "loss": 0.1741, "step": 1902 }, { "epoch": 0.8574969021065675, "grad_norm": 0.9669901785480284, "learning_rate": 2.7214549312713723e-07, "loss": 0.1659, "step": 1903 }, { "epoch": 0.8579475047876535, "grad_norm": 0.9743277211364876, "learning_rate": 2.7045748062480254e-07, "loss": 0.1732, "step": 1904 }, { "epoch": 0.8583981074687395, "grad_norm": 0.9411003828673956, "learning_rate": 2.6877442004920873e-07, "loss": 0.1645, "step": 1905 }, { "epoch": 0.8588487101498254, "grad_norm": 0.9734653439920148, "learning_rate": 2.6709631513851834e-07, "loss": 0.1778, "step": 1906 }, { "epoch": 0.8592993128309113, "grad_norm": 0.9312433368537456, "learning_rate": 2.654231696198878e-07, "loss": 0.164, "step": 1907 }, { "epoch": 0.8597499155119973, "grad_norm": 0.9692174115782383, "learning_rate": 2.6375498720945717e-07, "loss": 0.1718, "step": 1908 }, { "epoch": 0.8602005181930833, "grad_norm": 0.8951997876415702, "learning_rate": 2.620917716123444e-07, "loss": 0.1693, "step": 1909 }, { "epoch": 0.8606511208741692, "grad_norm": 0.9778603788324381, "learning_rate": 2.604335265226354e-07, "loss": 0.1827, "step": 1910 }, { "epoch": 0.8611017235552552, "grad_norm": 1.0493504549340391, "learning_rate": 2.587802556233765e-07, "loss": 0.1845, "step": 1911 }, { "epoch": 0.8615523262363411, "grad_norm": 0.9252258432234339, "learning_rate": 2.571319625865662e-07, "loss": 0.1636, "step": 1912 }, { "epoch": 0.862002928917427, "grad_norm": 1.015522402651877, "learning_rate": 2.5548865107314606e-07, "loss": 0.1794, "step": 1913 }, { "epoch": 0.862453531598513, "grad_norm": 0.9275781007378919, "learning_rate": 2.5385032473299433e-07, "loss": 0.1751, "step": 1914 }, { "epoch": 0.862904134279599, "grad_norm": 0.9128078845993659, "learning_rate": 2.522169872049174e-07, "loss": 0.1589, "step": 1915 }, { "epoch": 0.8633547369606849, "grad_norm": 0.8879718361876989, "learning_rate": 2.5058864211664064e-07, "loss": 0.1706, "step": 1916 }, { "epoch": 0.8638053396417709, "grad_norm": 0.9372693909164671, "learning_rate": 2.4896529308479966e-07, "loss": 0.1698, "step": 1917 }, { "epoch": 0.8642559423228569, "grad_norm": 1.0087108719862292, "learning_rate": 2.4734694371493507e-07, "loss": 0.1661, "step": 1918 }, { "epoch": 0.8647065450039427, "grad_norm": 0.9844600152081066, "learning_rate": 2.4573359760148354e-07, "loss": 0.1842, "step": 1919 }, { "epoch": 0.8651571476850287, "grad_norm": 1.0349326975424789, "learning_rate": 2.441252583277678e-07, "loss": 0.1811, "step": 1920 }, { "epoch": 0.8656077503661147, "grad_norm": 0.9821007975011048, "learning_rate": 2.425219294659908e-07, "loss": 0.1825, "step": 1921 }, { "epoch": 0.8660583530472006, "grad_norm": 0.9446142750611491, "learning_rate": 2.409236145772276e-07, "loss": 0.1714, "step": 1922 }, { "epoch": 0.8665089557282866, "grad_norm": 0.9596367414516151, "learning_rate": 2.393303172114159e-07, "loss": 0.1732, "step": 1923 }, { "epoch": 0.8669595584093726, "grad_norm": 0.9551816035996794, "learning_rate": 2.377420409073497e-07, "loss": 0.1671, "step": 1924 }, { "epoch": 0.8674101610904585, "grad_norm": 0.8993318779686909, "learning_rate": 2.3615878919267116e-07, "loss": 0.171, "step": 1925 }, { "epoch": 0.8678607637715444, "grad_norm": 1.021525400453317, "learning_rate": 2.345805655838626e-07, "loss": 0.1914, "step": 1926 }, { "epoch": 0.8683113664526304, "grad_norm": 0.9314956686766991, "learning_rate": 2.3300737358623843e-07, "loss": 0.1746, "step": 1927 }, { "epoch": 0.8687619691337164, "grad_norm": 0.9964051166260204, "learning_rate": 2.314392166939375e-07, "loss": 0.1773, "step": 1928 }, { "epoch": 0.8692125718148023, "grad_norm": 1.0162505058504328, "learning_rate": 2.2987609838991536e-07, "loss": 0.1867, "step": 1929 }, { "epoch": 0.8696631744958883, "grad_norm": 0.9257283800837275, "learning_rate": 2.2831802214593774e-07, "loss": 0.1731, "step": 1930 }, { "epoch": 0.8701137771769742, "grad_norm": 0.9677329540924063, "learning_rate": 2.2676499142257002e-07, "loss": 0.1735, "step": 1931 }, { "epoch": 0.8705643798580601, "grad_norm": 1.000711690360383, "learning_rate": 2.2521700966917276e-07, "loss": 0.1754, "step": 1932 }, { "epoch": 0.8710149825391461, "grad_norm": 0.987391302910459, "learning_rate": 2.23674080323891e-07, "loss": 0.1743, "step": 1933 }, { "epoch": 0.8714655852202321, "grad_norm": 0.9747308865230088, "learning_rate": 2.221362068136493e-07, "loss": 0.1608, "step": 1934 }, { "epoch": 0.871916187901318, "grad_norm": 0.9376304181926186, "learning_rate": 2.2060339255414232e-07, "loss": 0.1628, "step": 1935 }, { "epoch": 0.872366790582404, "grad_norm": 0.9334000321457883, "learning_rate": 2.190756409498282e-07, "loss": 0.1597, "step": 1936 }, { "epoch": 0.87281739326349, "grad_norm": 0.9008641202830003, "learning_rate": 2.175529553939204e-07, "loss": 0.1596, "step": 1937 }, { "epoch": 0.8732679959445758, "grad_norm": 0.9367208891666635, "learning_rate": 2.1603533926838088e-07, "loss": 0.1693, "step": 1938 }, { "epoch": 0.8737185986256618, "grad_norm": 0.939496478640009, "learning_rate": 2.1452279594391167e-07, "loss": 0.1668, "step": 1939 }, { "epoch": 0.8741692013067478, "grad_norm": 0.9209952395233147, "learning_rate": 2.1301532877994747e-07, "loss": 0.1676, "step": 1940 }, { "epoch": 0.8746198039878337, "grad_norm": 0.9103353045426509, "learning_rate": 2.1151294112464997e-07, "loss": 0.1691, "step": 1941 }, { "epoch": 0.8750704066689197, "grad_norm": 0.8665460511350714, "learning_rate": 2.1001563631489807e-07, "loss": 0.1487, "step": 1942 }, { "epoch": 0.8755210093500057, "grad_norm": 0.9679891948748914, "learning_rate": 2.0852341767628182e-07, "loss": 0.169, "step": 1943 }, { "epoch": 0.8759716120310916, "grad_norm": 0.9496329960942475, "learning_rate": 2.0703628852309336e-07, "loss": 0.1699, "step": 1944 }, { "epoch": 0.8764222147121775, "grad_norm": 0.9084124393445112, "learning_rate": 2.0555425215832176e-07, "loss": 0.1448, "step": 1945 }, { "epoch": 0.8768728173932635, "grad_norm": 0.9325294190123458, "learning_rate": 2.0407731187364556e-07, "loss": 0.1611, "step": 1946 }, { "epoch": 0.8773234200743495, "grad_norm": 0.9608168170515962, "learning_rate": 2.026054709494235e-07, "loss": 0.1637, "step": 1947 }, { "epoch": 0.8777740227554354, "grad_norm": 0.9508933509137032, "learning_rate": 2.0113873265468875e-07, "loss": 0.1637, "step": 1948 }, { "epoch": 0.8782246254365214, "grad_norm": 0.9627415360121703, "learning_rate": 1.996771002471415e-07, "loss": 0.171, "step": 1949 }, { "epoch": 0.8786752281176073, "grad_norm": 0.9269279051004062, "learning_rate": 1.9822057697314102e-07, "loss": 0.158, "step": 1950 }, { "epoch": 0.8791258307986932, "grad_norm": 0.9975744188618978, "learning_rate": 1.9676916606769874e-07, "loss": 0.1733, "step": 1951 }, { "epoch": 0.8795764334797792, "grad_norm": 0.9907918900931711, "learning_rate": 1.9532287075447325e-07, "loss": 0.1717, "step": 1952 }, { "epoch": 0.8800270361608652, "grad_norm": 0.9029717491407483, "learning_rate": 1.9388169424575802e-07, "loss": 0.1526, "step": 1953 }, { "epoch": 0.8804776388419511, "grad_norm": 0.9924216757155009, "learning_rate": 1.9244563974247953e-07, "loss": 0.1871, "step": 1954 }, { "epoch": 0.8809282415230371, "grad_norm": 1.0046395234515613, "learning_rate": 1.910147104341875e-07, "loss": 0.1677, "step": 1955 }, { "epoch": 0.881378844204123, "grad_norm": 0.8428374340697592, "learning_rate": 1.8958890949904802e-07, "loss": 0.1428, "step": 1956 }, { "epoch": 0.8818294468852089, "grad_norm": 0.8916062368929274, "learning_rate": 1.881682401038379e-07, "loss": 0.1627, "step": 1957 }, { "epoch": 0.8822800495662949, "grad_norm": 0.9133925219285579, "learning_rate": 1.8675270540393532e-07, "loss": 0.1634, "step": 1958 }, { "epoch": 0.8827306522473809, "grad_norm": 0.9603725651655006, "learning_rate": 1.8534230854331454e-07, "loss": 0.1589, "step": 1959 }, { "epoch": 0.8831812549284668, "grad_norm": 0.8832900144509541, "learning_rate": 1.8393705265453838e-07, "loss": 0.1526, "step": 1960 }, { "epoch": 0.8836318576095528, "grad_norm": 0.9656796733108457, "learning_rate": 1.8253694085875047e-07, "loss": 0.1632, "step": 1961 }, { "epoch": 0.8840824602906387, "grad_norm": 1.0103093827826655, "learning_rate": 1.8114197626567105e-07, "loss": 0.1801, "step": 1962 }, { "epoch": 0.8845330629717247, "grad_norm": 0.9605960303059552, "learning_rate": 1.7975216197358648e-07, "loss": 0.174, "step": 1963 }, { "epoch": 0.8849836656528106, "grad_norm": 1.0141937286324803, "learning_rate": 1.7836750106934475e-07, "loss": 0.1691, "step": 1964 }, { "epoch": 0.8854342683338966, "grad_norm": 0.9434775531410926, "learning_rate": 1.7698799662834776e-07, "loss": 0.1602, "step": 1965 }, { "epoch": 0.8858848710149826, "grad_norm": 0.9267902111682871, "learning_rate": 1.7561365171454488e-07, "loss": 0.1601, "step": 1966 }, { "epoch": 0.8863354736960685, "grad_norm": 0.9224140721738877, "learning_rate": 1.7424446938042517e-07, "loss": 0.1638, "step": 1967 }, { "epoch": 0.8867860763771545, "grad_norm": 0.979554116800001, "learning_rate": 1.7288045266701247e-07, "loss": 0.178, "step": 1968 }, { "epoch": 0.8872366790582404, "grad_norm": 0.9415277153144294, "learning_rate": 1.7152160460385703e-07, "loss": 0.1578, "step": 1969 }, { "epoch": 0.8876872817393263, "grad_norm": 0.9751633874844109, "learning_rate": 1.701679282090285e-07, "loss": 0.1648, "step": 1970 }, { "epoch": 0.8881378844204123, "grad_norm": 0.9433218610454983, "learning_rate": 1.6881942648911077e-07, "loss": 0.1517, "step": 1971 }, { "epoch": 0.8885884871014983, "grad_norm": 0.9790183372335409, "learning_rate": 1.6747610243919437e-07, "loss": 0.1855, "step": 1972 }, { "epoch": 0.8890390897825842, "grad_norm": 0.9729615081723881, "learning_rate": 1.661379590428705e-07, "loss": 0.1692, "step": 1973 }, { "epoch": 0.8894896924636702, "grad_norm": 1.0050367822916513, "learning_rate": 1.6480499927222283e-07, "loss": 0.1759, "step": 1974 }, { "epoch": 0.8899402951447561, "grad_norm": 0.9976121317605783, "learning_rate": 1.6347722608782284e-07, "loss": 0.1761, "step": 1975 }, { "epoch": 0.890390897825842, "grad_norm": 0.9559950177435562, "learning_rate": 1.6215464243872186e-07, "loss": 0.1637, "step": 1976 }, { "epoch": 0.890841500506928, "grad_norm": 0.9723990375383502, "learning_rate": 1.608372512624448e-07, "loss": 0.1687, "step": 1977 }, { "epoch": 0.891292103188014, "grad_norm": 0.9655332499571794, "learning_rate": 1.595250554849842e-07, "loss": 0.1677, "step": 1978 }, { "epoch": 0.8917427058690999, "grad_norm": 0.9214762769979025, "learning_rate": 1.5821805802079343e-07, "loss": 0.1608, "step": 1979 }, { "epoch": 0.8921933085501859, "grad_norm": 0.9136099662881566, "learning_rate": 1.5691626177277986e-07, "loss": 0.1781, "step": 1980 }, { "epoch": 0.8926439112312718, "grad_norm": 0.9262022249621232, "learning_rate": 1.5561966963229925e-07, "loss": 0.1755, "step": 1981 }, { "epoch": 0.8930945139123577, "grad_norm": 0.9415300421061215, "learning_rate": 1.5432828447914743e-07, "loss": 0.1671, "step": 1982 }, { "epoch": 0.8935451165934437, "grad_norm": 0.9750913944440511, "learning_rate": 1.5304210918155677e-07, "loss": 0.1818, "step": 1983 }, { "epoch": 0.8939957192745297, "grad_norm": 0.916736823632913, "learning_rate": 1.5176114659618796e-07, "loss": 0.1506, "step": 1984 }, { "epoch": 0.8944463219556157, "grad_norm": 0.9672707724576233, "learning_rate": 1.5048539956812324e-07, "loss": 0.1663, "step": 1985 }, { "epoch": 0.8948969246367016, "grad_norm": 0.9182721543696385, "learning_rate": 1.4921487093086134e-07, "loss": 0.1699, "step": 1986 }, { "epoch": 0.8953475273177876, "grad_norm": 0.8891077554280398, "learning_rate": 1.4794956350631106e-07, "loss": 0.1663, "step": 1987 }, { "epoch": 0.8957981299988735, "grad_norm": 0.9034991158889414, "learning_rate": 1.4668948010478358e-07, "loss": 0.168, "step": 1988 }, { "epoch": 0.8962487326799594, "grad_norm": 1.0096026558664806, "learning_rate": 1.4543462352498844e-07, "loss": 0.171, "step": 1989 }, { "epoch": 0.8966993353610454, "grad_norm": 0.9336033398750972, "learning_rate": 1.4418499655402512e-07, "loss": 0.1723, "step": 1990 }, { "epoch": 0.8971499380421314, "grad_norm": 0.9866388504700837, "learning_rate": 1.4294060196737874e-07, "loss": 0.1649, "step": 1991 }, { "epoch": 0.8976005407232173, "grad_norm": 0.9743774072891516, "learning_rate": 1.417014425289126e-07, "loss": 0.1625, "step": 1992 }, { "epoch": 0.8980511434043033, "grad_norm": 0.90538670491052, "learning_rate": 1.4046752099086236e-07, "loss": 0.1538, "step": 1993 }, { "epoch": 0.8985017460853892, "grad_norm": 0.9378839919331537, "learning_rate": 1.3923884009382994e-07, "loss": 0.1532, "step": 1994 }, { "epoch": 0.8989523487664751, "grad_norm": 0.961745818647385, "learning_rate": 1.380154025667782e-07, "loss": 0.1649, "step": 1995 }, { "epoch": 0.8994029514475611, "grad_norm": 0.9689054481835896, "learning_rate": 1.367972111270241e-07, "loss": 0.1724, "step": 1996 }, { "epoch": 0.8998535541286471, "grad_norm": 0.9601026525696907, "learning_rate": 1.3558426848023165e-07, "loss": 0.1674, "step": 1997 }, { "epoch": 0.900304156809733, "grad_norm": 0.9116251944923741, "learning_rate": 1.3437657732040783e-07, "loss": 0.1675, "step": 1998 }, { "epoch": 0.900754759490819, "grad_norm": 0.9712578312105717, "learning_rate": 1.3317414032989668e-07, "loss": 0.1795, "step": 1999 }, { "epoch": 0.901205362171905, "grad_norm": 0.9513325556706582, "learning_rate": 1.3197696017937106e-07, "loss": 0.1841, "step": 2000 }, { "epoch": 0.901205362171905, "eval_loss": 0.17061425745487213, "eval_runtime": 59.2934, "eval_samples_per_second": 24.202, "eval_steps_per_second": 3.036, "step": 2000 }, { "epoch": 0.9016559648529908, "grad_norm": 0.9117132984829037, "learning_rate": 1.3078503952782845e-07, "loss": 0.1618, "step": 2001 }, { "epoch": 0.9021065675340768, "grad_norm": 0.9472154091042211, "learning_rate": 1.2959838102258537e-07, "loss": 0.1862, "step": 2002 }, { "epoch": 0.9025571702151628, "grad_norm": 0.9494247550660043, "learning_rate": 1.2841698729927022e-07, "loss": 0.1557, "step": 2003 }, { "epoch": 0.9030077728962488, "grad_norm": 0.9157731618730593, "learning_rate": 1.272408609818182e-07, "loss": 0.1603, "step": 2004 }, { "epoch": 0.9034583755773347, "grad_norm": 0.950573888351368, "learning_rate": 1.2607000468246533e-07, "loss": 0.1678, "step": 2005 }, { "epoch": 0.9039089782584206, "grad_norm": 0.961300360072569, "learning_rate": 1.2490442100174278e-07, "loss": 0.1777, "step": 2006 }, { "epoch": 0.9043595809395066, "grad_norm": 0.8913003543296754, "learning_rate": 1.237441125284708e-07, "loss": 0.1678, "step": 2007 }, { "epoch": 0.9048101836205925, "grad_norm": 0.9608134144134601, "learning_rate": 1.2258908183975322e-07, "loss": 0.1775, "step": 2008 }, { "epoch": 0.9052607863016785, "grad_norm": 0.9596706472769803, "learning_rate": 1.2143933150097154e-07, "loss": 0.165, "step": 2009 }, { "epoch": 0.9057113889827645, "grad_norm": 0.9328106883096551, "learning_rate": 1.2029486406577972e-07, "loss": 0.1687, "step": 2010 }, { "epoch": 0.9061619916638504, "grad_norm": 0.9509999156498098, "learning_rate": 1.191556820760978e-07, "loss": 0.1635, "step": 2011 }, { "epoch": 0.9066125943449364, "grad_norm": 0.986143322145186, "learning_rate": 1.1802178806210624e-07, "loss": 0.1646, "step": 2012 }, { "epoch": 0.9070631970260223, "grad_norm": 0.9252282348305862, "learning_rate": 1.1689318454224191e-07, "loss": 0.1594, "step": 2013 }, { "epoch": 0.9075137997071082, "grad_norm": 0.9665747400156657, "learning_rate": 1.1576987402318884e-07, "loss": 0.1686, "step": 2014 }, { "epoch": 0.9079644023881942, "grad_norm": 0.9242552281663089, "learning_rate": 1.1465185899987797e-07, "loss": 0.1653, "step": 2015 }, { "epoch": 0.9084150050692802, "grad_norm": 0.9338454099056986, "learning_rate": 1.1353914195547655e-07, "loss": 0.1664, "step": 2016 }, { "epoch": 0.9088656077503661, "grad_norm": 0.9488715062833334, "learning_rate": 1.1243172536138547e-07, "loss": 0.167, "step": 2017 }, { "epoch": 0.909316210431452, "grad_norm": 0.9192528840903248, "learning_rate": 1.1132961167723305e-07, "loss": 0.1602, "step": 2018 }, { "epoch": 0.909766813112538, "grad_norm": 0.9358983791063447, "learning_rate": 1.1023280335086956e-07, "loss": 0.1786, "step": 2019 }, { "epoch": 0.9102174157936239, "grad_norm": 0.8800096012948293, "learning_rate": 1.091413028183616e-07, "loss": 0.1551, "step": 2020 }, { "epoch": 0.9106680184747099, "grad_norm": 0.9819759586485249, "learning_rate": 1.0805511250398748e-07, "loss": 0.178, "step": 2021 }, { "epoch": 0.9111186211557959, "grad_norm": 0.9300351124591425, "learning_rate": 1.06974234820231e-07, "loss": 0.1677, "step": 2022 }, { "epoch": 0.9115692238368819, "grad_norm": 0.9574289668603256, "learning_rate": 1.0589867216777544e-07, "loss": 0.1783, "step": 2023 }, { "epoch": 0.9120198265179678, "grad_norm": 0.9480950786557066, "learning_rate": 1.0482842693550044e-07, "loss": 0.1598, "step": 2024 }, { "epoch": 0.9124704291990537, "grad_norm": 0.9389594010578164, "learning_rate": 1.0376350150047427e-07, "loss": 0.1748, "step": 2025 }, { "epoch": 0.9129210318801397, "grad_norm": 0.9902783035113683, "learning_rate": 1.0270389822795073e-07, "loss": 0.1759, "step": 2026 }, { "epoch": 0.9133716345612256, "grad_norm": 0.9010340950200544, "learning_rate": 1.0164961947136232e-07, "loss": 0.1666, "step": 2027 }, { "epoch": 0.9138222372423116, "grad_norm": 0.9640758949806947, "learning_rate": 1.0060066757231535e-07, "loss": 0.1617, "step": 2028 }, { "epoch": 0.9142728399233976, "grad_norm": 0.9466267617525821, "learning_rate": 9.955704486058482e-08, "loss": 0.1728, "step": 2029 }, { "epoch": 0.9147234426044835, "grad_norm": 0.9468720807805653, "learning_rate": 9.85187536541099e-08, "loss": 0.1762, "step": 2030 }, { "epoch": 0.9151740452855694, "grad_norm": 0.9519779843009848, "learning_rate": 9.748579625898758e-08, "loss": 0.159, "step": 2031 }, { "epoch": 0.9156246479666554, "grad_norm": 0.935520409067934, "learning_rate": 9.645817496946902e-08, "loss": 0.1534, "step": 2032 }, { "epoch": 0.9160752506477413, "grad_norm": 0.9398750660029521, "learning_rate": 9.54358920679524e-08, "loss": 0.1585, "step": 2033 }, { "epoch": 0.9165258533288273, "grad_norm": 0.97514422818835, "learning_rate": 9.441894982498035e-08, "loss": 0.19, "step": 2034 }, { "epoch": 0.9169764560099133, "grad_norm": 0.901524756172699, "learning_rate": 9.340735049923277e-08, "loss": 0.1549, "step": 2035 }, { "epoch": 0.9174270586909992, "grad_norm": 0.8978882505519804, "learning_rate": 9.24010963375227e-08, "loss": 0.1613, "step": 2036 }, { "epoch": 0.9178776613720852, "grad_norm": 0.943693740183377, "learning_rate": 9.140018957479236e-08, "loss": 0.1756, "step": 2037 }, { "epoch": 0.9183282640531711, "grad_norm": 0.9409050633896883, "learning_rate": 9.040463243410541e-08, "loss": 0.1705, "step": 2038 }, { "epoch": 0.918778866734257, "grad_norm": 0.9264646847635797, "learning_rate": 8.941442712664561e-08, "loss": 0.1578, "step": 2039 }, { "epoch": 0.919229469415343, "grad_norm": 0.9371489557681785, "learning_rate": 8.842957585170814e-08, "loss": 0.1686, "step": 2040 }, { "epoch": 0.919680072096429, "grad_norm": 0.9836891585497494, "learning_rate": 8.745008079669742e-08, "loss": 0.1756, "step": 2041 }, { "epoch": 0.920130674777515, "grad_norm": 0.9805837221537438, "learning_rate": 8.647594413712212e-08, "loss": 0.1618, "step": 2042 }, { "epoch": 0.9205812774586009, "grad_norm": 0.9261986023438442, "learning_rate": 8.550716803658904e-08, "loss": 0.1686, "step": 2043 }, { "epoch": 0.9210318801396868, "grad_norm": 0.9734722095132714, "learning_rate": 8.454375464679865e-08, "loss": 0.1695, "step": 2044 }, { "epoch": 0.9214824828207728, "grad_norm": 0.9188884072317375, "learning_rate": 8.358570610754097e-08, "loss": 0.1517, "step": 2045 }, { "epoch": 0.9219330855018587, "grad_norm": 1.004333840199922, "learning_rate": 8.263302454669025e-08, "loss": 0.1833, "step": 2046 }, { "epoch": 0.9223836881829447, "grad_norm": 0.9829396979168501, "learning_rate": 8.168571208020032e-08, "loss": 0.1644, "step": 2047 }, { "epoch": 0.9228342908640307, "grad_norm": 0.9260687624875473, "learning_rate": 8.074377081210033e-08, "loss": 0.1681, "step": 2048 }, { "epoch": 0.9232848935451166, "grad_norm": 0.9609051945697021, "learning_rate": 7.980720283448957e-08, "loss": 0.1677, "step": 2049 }, { "epoch": 0.9237354962262025, "grad_norm": 1.0120669225710806, "learning_rate": 7.887601022753238e-08, "loss": 0.1834, "step": 2050 }, { "epoch": 0.9241860989072885, "grad_norm": 0.9511276526179403, "learning_rate": 7.795019505945495e-08, "loss": 0.177, "step": 2051 }, { "epoch": 0.9246367015883744, "grad_norm": 0.9227281855415796, "learning_rate": 7.702975938653934e-08, "loss": 0.1644, "step": 2052 }, { "epoch": 0.9250873042694604, "grad_norm": 1.0033217399635819, "learning_rate": 7.611470525312054e-08, "loss": 0.1726, "step": 2053 }, { "epoch": 0.9255379069505464, "grad_norm": 0.9349013994174448, "learning_rate": 7.520503469157947e-08, "loss": 0.1631, "step": 2054 }, { "epoch": 0.9259885096316323, "grad_norm": 1.0181422908202642, "learning_rate": 7.430074972234053e-08, "loss": 0.1814, "step": 2055 }, { "epoch": 0.9264391123127182, "grad_norm": 0.9334383021799829, "learning_rate": 7.340185235386627e-08, "loss": 0.176, "step": 2056 }, { "epoch": 0.9268897149938042, "grad_norm": 1.0492598395611263, "learning_rate": 7.250834458265355e-08, "loss": 0.1847, "step": 2057 }, { "epoch": 0.9273403176748901, "grad_norm": 0.9613018614069104, "learning_rate": 7.162022839322824e-08, "loss": 0.1794, "step": 2058 }, { "epoch": 0.9277909203559761, "grad_norm": 0.91255311271636, "learning_rate": 7.073750575814136e-08, "loss": 0.1607, "step": 2059 }, { "epoch": 0.9282415230370621, "grad_norm": 0.9397419477936786, "learning_rate": 6.986017863796435e-08, "loss": 0.1695, "step": 2060 }, { "epoch": 0.9286921257181481, "grad_norm": 0.9247421158046627, "learning_rate": 6.898824898128515e-08, "loss": 0.16, "step": 2061 }, { "epoch": 0.929142728399234, "grad_norm": 0.9261537295041204, "learning_rate": 6.81217187247038e-08, "loss": 0.1561, "step": 2062 }, { "epoch": 0.9295933310803199, "grad_norm": 0.9764265206005728, "learning_rate": 6.726058979282774e-08, "loss": 0.1677, "step": 2063 }, { "epoch": 0.9300439337614059, "grad_norm": 0.902753626414874, "learning_rate": 6.640486409826785e-08, "loss": 0.1512, "step": 2064 }, { "epoch": 0.9304945364424918, "grad_norm": 0.8803283338825151, "learning_rate": 6.555454354163437e-08, "loss": 0.1474, "step": 2065 }, { "epoch": 0.9309451391235778, "grad_norm": 0.9548943830388827, "learning_rate": 6.470963001153268e-08, "loss": 0.1755, "step": 2066 }, { "epoch": 0.9313957418046638, "grad_norm": 0.941676196449646, "learning_rate": 6.387012538455723e-08, "loss": 0.1706, "step": 2067 }, { "epoch": 0.9318463444857497, "grad_norm": 0.9066120881953356, "learning_rate": 6.303603152529119e-08, "loss": 0.1603, "step": 2068 }, { "epoch": 0.9322969471668356, "grad_norm": 0.8975701997942634, "learning_rate": 6.220735028629937e-08, "loss": 0.1762, "step": 2069 }, { "epoch": 0.9327475498479216, "grad_norm": 0.937967144997872, "learning_rate": 6.13840835081242e-08, "loss": 0.1662, "step": 2070 }, { "epoch": 0.9331981525290075, "grad_norm": 0.9603182403140018, "learning_rate": 6.056623301928327e-08, "loss": 0.1778, "step": 2071 }, { "epoch": 0.9336487552100935, "grad_norm": 0.9593460237620045, "learning_rate": 5.975380063626356e-08, "loss": 0.1628, "step": 2072 }, { "epoch": 0.9340993578911795, "grad_norm": 0.9396155709094206, "learning_rate": 5.894678816351862e-08, "loss": 0.1759, "step": 2073 }, { "epoch": 0.9345499605722654, "grad_norm": 0.9063173270817928, "learning_rate": 5.8145197393463806e-08, "loss": 0.1632, "step": 2074 }, { "epoch": 0.9350005632533513, "grad_norm": 0.9198235629026493, "learning_rate": 5.73490301064733e-08, "loss": 0.1589, "step": 2075 }, { "epoch": 0.9354511659344373, "grad_norm": 0.9202608244954159, "learning_rate": 5.6558288070874544e-08, "loss": 0.1597, "step": 2076 }, { "epoch": 0.9359017686155232, "grad_norm": 0.9398303538897654, "learning_rate": 5.577297304294543e-08, "loss": 0.1715, "step": 2077 }, { "epoch": 0.9363523712966092, "grad_norm": 0.91695514632765, "learning_rate": 5.4993086766910733e-08, "loss": 0.1502, "step": 2078 }, { "epoch": 0.9368029739776952, "grad_norm": 0.9207099117368464, "learning_rate": 5.421863097493707e-08, "loss": 0.1644, "step": 2079 }, { "epoch": 0.9372535766587812, "grad_norm": 0.9549078634315331, "learning_rate": 5.344960738713018e-08, "loss": 0.1555, "step": 2080 }, { "epoch": 0.937704179339867, "grad_norm": 0.9115254832418993, "learning_rate": 5.268601771153042e-08, "loss": 0.1665, "step": 2081 }, { "epoch": 0.938154782020953, "grad_norm": 0.9002114451702619, "learning_rate": 5.192786364410868e-08, "loss": 0.1578, "step": 2082 }, { "epoch": 0.938605384702039, "grad_norm": 1.0061200857124815, "learning_rate": 5.117514686876379e-08, "loss": 0.1755, "step": 2083 }, { "epoch": 0.9390559873831249, "grad_norm": 0.8938090934337861, "learning_rate": 5.0427869057317894e-08, "loss": 0.1538, "step": 2084 }, { "epoch": 0.9395065900642109, "grad_norm": 0.977029876716849, "learning_rate": 4.9686031869512486e-08, "loss": 0.1787, "step": 2085 }, { "epoch": 0.9399571927452969, "grad_norm": 0.9161657090501041, "learning_rate": 4.89496369530057e-08, "loss": 0.1528, "step": 2086 }, { "epoch": 0.9404077954263828, "grad_norm": 0.9095643914608285, "learning_rate": 4.8218685943368094e-08, "loss": 0.1727, "step": 2087 }, { "epoch": 0.9408583981074687, "grad_norm": 0.9587057997902817, "learning_rate": 4.7493180464078246e-08, "loss": 0.1644, "step": 2088 }, { "epoch": 0.9413090007885547, "grad_norm": 0.9629604422897137, "learning_rate": 4.677312212652108e-08, "loss": 0.1703, "step": 2089 }, { "epoch": 0.9417596034696406, "grad_norm": 1.0003390614928875, "learning_rate": 4.605851252998256e-08, "loss": 0.1744, "step": 2090 }, { "epoch": 0.9422102061507266, "grad_norm": 0.9286037501169877, "learning_rate": 4.5349353261646414e-08, "loss": 0.1596, "step": 2091 }, { "epoch": 0.9426608088318126, "grad_norm": 1.0118829837532903, "learning_rate": 4.464564589659187e-08, "loss": 0.1692, "step": 2092 }, { "epoch": 0.9431114115128985, "grad_norm": 0.9425679544226208, "learning_rate": 4.3947391997787857e-08, "loss": 0.1627, "step": 2093 }, { "epoch": 0.9435620141939844, "grad_norm": 0.9436041869876578, "learning_rate": 4.325459311609187e-08, "loss": 0.1603, "step": 2094 }, { "epoch": 0.9440126168750704, "grad_norm": 0.8886903189454783, "learning_rate": 4.256725079024554e-08, "loss": 0.1574, "step": 2095 }, { "epoch": 0.9444632195561563, "grad_norm": 0.9329423795551214, "learning_rate": 4.1885366546870754e-08, "loss": 0.1596, "step": 2096 }, { "epoch": 0.9449138222372423, "grad_norm": 0.9293096433954279, "learning_rate": 4.120894190046687e-08, "loss": 0.1559, "step": 2097 }, { "epoch": 0.9453644249183283, "grad_norm": 0.911463989327341, "learning_rate": 4.053797835340739e-08, "loss": 0.1593, "step": 2098 }, { "epoch": 0.9458150275994143, "grad_norm": 0.9566973300551312, "learning_rate": 3.987247739593636e-08, "loss": 0.1778, "step": 2099 }, { "epoch": 0.9462656302805001, "grad_norm": 0.8613712769085644, "learning_rate": 3.9212440506164465e-08, "loss": 0.1573, "step": 2100 }, { "epoch": 0.9467162329615861, "grad_norm": 0.9437761617537445, "learning_rate": 3.855786915006793e-08, "loss": 0.1709, "step": 2101 }, { "epoch": 0.9471668356426721, "grad_norm": 0.9124133143204082, "learning_rate": 3.790876478148242e-08, "loss": 0.1643, "step": 2102 }, { "epoch": 0.947617438323758, "grad_norm": 0.9108542684114203, "learning_rate": 3.726512884210165e-08, "loss": 0.1609, "step": 2103 }, { "epoch": 0.948068041004844, "grad_norm": 0.9720712434506643, "learning_rate": 3.6626962761473205e-08, "loss": 0.1733, "step": 2104 }, { "epoch": 0.94851864368593, "grad_norm": 1.0000646581857153, "learning_rate": 3.599426795699662e-08, "loss": 0.1889, "step": 2105 }, { "epoch": 0.9489692463670159, "grad_norm": 0.9370680977666221, "learning_rate": 3.53670458339192e-08, "loss": 0.1576, "step": 2106 }, { "epoch": 0.9494198490481018, "grad_norm": 0.9063791405664849, "learning_rate": 3.474529778533298e-08, "loss": 0.1577, "step": 2107 }, { "epoch": 0.9498704517291878, "grad_norm": 0.9593311899211125, "learning_rate": 3.412902519217137e-08, "loss": 0.1713, "step": 2108 }, { "epoch": 0.9503210544102737, "grad_norm": 1.0327217247594411, "learning_rate": 3.351822942320754e-08, "loss": 0.1905, "step": 2109 }, { "epoch": 0.9507716570913597, "grad_norm": 0.9764468754314656, "learning_rate": 3.2912911835049634e-08, "loss": 0.1704, "step": 2110 }, { "epoch": 0.9512222597724457, "grad_norm": 0.8945964138702499, "learning_rate": 3.231307377213833e-08, "loss": 0.1689, "step": 2111 }, { "epoch": 0.9516728624535316, "grad_norm": 0.8908152247664284, "learning_rate": 3.171871656674458e-08, "loss": 0.1544, "step": 2112 }, { "epoch": 0.9521234651346175, "grad_norm": 0.9630683493999175, "learning_rate": 3.112984153896603e-08, "loss": 0.1631, "step": 2113 }, { "epoch": 0.9525740678157035, "grad_norm": 0.9599669484647724, "learning_rate": 3.0546449996723404e-08, "loss": 0.1672, "step": 2114 }, { "epoch": 0.9530246704967894, "grad_norm": 0.9241733201124437, "learning_rate": 2.996854323575937e-08, "loss": 0.1605, "step": 2115 }, { "epoch": 0.9534752731778754, "grad_norm": 0.9710764623086404, "learning_rate": 2.939612253963331e-08, "loss": 0.1812, "step": 2116 }, { "epoch": 0.9539258758589614, "grad_norm": 0.908121707533848, "learning_rate": 2.8829189179721552e-08, "loss": 0.1649, "step": 2117 }, { "epoch": 0.9543764785400474, "grad_norm": 0.9083640662527406, "learning_rate": 2.8267744415211296e-08, "loss": 0.1656, "step": 2118 }, { "epoch": 0.9548270812211332, "grad_norm": 0.8951491282974102, "learning_rate": 2.7711789493099495e-08, "loss": 0.1528, "step": 2119 }, { "epoch": 0.9552776839022192, "grad_norm": 0.9383039096921281, "learning_rate": 2.716132564819035e-08, "loss": 0.1665, "step": 2120 }, { "epoch": 0.9557282865833052, "grad_norm": 0.9518623239811919, "learning_rate": 2.661635410309199e-08, "loss": 0.1708, "step": 2121 }, { "epoch": 0.9561788892643911, "grad_norm": 0.9378862678041917, "learning_rate": 2.6076876068213965e-08, "loss": 0.1741, "step": 2122 }, { "epoch": 0.9566294919454771, "grad_norm": 0.9278206810862306, "learning_rate": 2.554289274176419e-08, "loss": 0.159, "step": 2123 }, { "epoch": 0.9570800946265631, "grad_norm": 0.9577788372059376, "learning_rate": 2.5014405309746193e-08, "loss": 0.1698, "step": 2124 }, { "epoch": 0.957530697307649, "grad_norm": 1.025839380977837, "learning_rate": 2.449141494595797e-08, "loss": 0.1655, "step": 2125 }, { "epoch": 0.9579812999887349, "grad_norm": 0.9433126152166391, "learning_rate": 2.3973922811987295e-08, "loss": 0.1747, "step": 2126 }, { "epoch": 0.9584319026698209, "grad_norm": 0.903691513339492, "learning_rate": 2.3461930057210037e-08, "loss": 0.167, "step": 2127 }, { "epoch": 0.9588825053509068, "grad_norm": 1.0058632683573403, "learning_rate": 2.2955437818788508e-08, "loss": 0.1899, "step": 2128 }, { "epoch": 0.9593331080319928, "grad_norm": 0.9399743408942693, "learning_rate": 2.2454447221667563e-08, "loss": 0.1595, "step": 2129 }, { "epoch": 0.9597837107130788, "grad_norm": 0.9594553246397292, "learning_rate": 2.1958959378572398e-08, "loss": 0.177, "step": 2130 }, { "epoch": 0.9602343133941647, "grad_norm": 0.8830875917954497, "learning_rate": 2.1468975390006587e-08, "loss": 0.1569, "step": 2131 }, { "epoch": 0.9606849160752506, "grad_norm": 0.9507601854325585, "learning_rate": 2.0984496344249596e-08, "loss": 0.16, "step": 2132 }, { "epoch": 0.9611355187563366, "grad_norm": 0.9559829111178594, "learning_rate": 2.0505523317353727e-08, "loss": 0.1765, "step": 2133 }, { "epoch": 0.9615861214374225, "grad_norm": 0.9352989883382946, "learning_rate": 2.0032057373142453e-08, "loss": 0.1586, "step": 2134 }, { "epoch": 0.9620367241185085, "grad_norm": 0.9464894185394025, "learning_rate": 1.956409956320737e-08, "loss": 0.1724, "step": 2135 }, { "epoch": 0.9624873267995945, "grad_norm": 0.9775886909433427, "learning_rate": 1.91016509269068e-08, "loss": 0.1733, "step": 2136 }, { "epoch": 0.9629379294806804, "grad_norm": 0.9707150276281535, "learning_rate": 1.864471249136218e-08, "loss": 0.1702, "step": 2137 }, { "epoch": 0.9633885321617663, "grad_norm": 0.9579207813847991, "learning_rate": 1.819328527145725e-08, "loss": 0.1612, "step": 2138 }, { "epoch": 0.9638391348428523, "grad_norm": 0.9901610730307316, "learning_rate": 1.774737026983414e-08, "loss": 0.1693, "step": 2139 }, { "epoch": 0.9642897375239383, "grad_norm": 0.9252177839544279, "learning_rate": 1.7306968476893393e-08, "loss": 0.1632, "step": 2140 }, { "epoch": 0.9647403402050242, "grad_norm": 0.9930268546277804, "learning_rate": 1.6872080870788955e-08, "loss": 0.1708, "step": 2141 }, { "epoch": 0.9651909428861102, "grad_norm": 0.9284837696552151, "learning_rate": 1.6442708417428732e-08, "loss": 0.1619, "step": 2142 }, { "epoch": 0.9656415455671962, "grad_norm": 0.9169153943890256, "learning_rate": 1.6018852070470437e-08, "loss": 0.1686, "step": 2143 }, { "epoch": 0.966092148248282, "grad_norm": 0.9161979724962043, "learning_rate": 1.5600512771320462e-08, "loss": 0.1611, "step": 2144 }, { "epoch": 0.966542750929368, "grad_norm": 0.9529401144797984, "learning_rate": 1.518769144913168e-08, "loss": 0.1756, "step": 2145 }, { "epoch": 0.966993353610454, "grad_norm": 0.9619057195005012, "learning_rate": 1.4780389020800923e-08, "loss": 0.1588, "step": 2146 }, { "epoch": 0.9674439562915399, "grad_norm": 0.9314450454764756, "learning_rate": 1.4378606390967609e-08, "loss": 0.1735, "step": 2147 }, { "epoch": 0.9678945589726259, "grad_norm": 1.047548802018552, "learning_rate": 1.3982344452011242e-08, "loss": 0.1904, "step": 2148 }, { "epoch": 0.9683451616537119, "grad_norm": 0.9194561078192867, "learning_rate": 1.3591604084049747e-08, "loss": 0.1647, "step": 2149 }, { "epoch": 0.9687957643347977, "grad_norm": 0.9575900898195624, "learning_rate": 1.3206386154937245e-08, "loss": 0.1735, "step": 2150 }, { "epoch": 0.9692463670158837, "grad_norm": 0.9411881280016939, "learning_rate": 1.2826691520262114e-08, "loss": 0.1534, "step": 2151 }, { "epoch": 0.9696969696969697, "grad_norm": 0.9482488682440647, "learning_rate": 1.2452521023345598e-08, "loss": 0.1721, "step": 2152 }, { "epoch": 0.9701475723780556, "grad_norm": 0.9573010716579112, "learning_rate": 1.2083875495238761e-08, "loss": 0.1812, "step": 2153 }, { "epoch": 0.9705981750591416, "grad_norm": 0.9727335771556579, "learning_rate": 1.1720755754722757e-08, "loss": 0.1724, "step": 2154 }, { "epoch": 0.9710487777402276, "grad_norm": 0.9349543815438219, "learning_rate": 1.1363162608304112e-08, "loss": 0.1732, "step": 2155 }, { "epoch": 0.9714993804213135, "grad_norm": 0.9566156847154865, "learning_rate": 1.1011096850215842e-08, "loss": 0.1753, "step": 2156 }, { "epoch": 0.9719499831023994, "grad_norm": 0.9612547178015083, "learning_rate": 1.0664559262413831e-08, "loss": 0.1676, "step": 2157 }, { "epoch": 0.9724005857834854, "grad_norm": 0.8969285709349176, "learning_rate": 1.0323550614574907e-08, "loss": 0.1726, "step": 2158 }, { "epoch": 0.9728511884645714, "grad_norm": 0.92332833314932, "learning_rate": 9.988071664097376e-09, "loss": 0.1712, "step": 2159 }, { "epoch": 0.9733017911456573, "grad_norm": 0.9574569632858295, "learning_rate": 9.658123156096599e-09, "loss": 0.1739, "step": 2160 }, { "epoch": 0.9737523938267433, "grad_norm": 1.017729151344142, "learning_rate": 9.333705823404981e-09, "loss": 0.1643, "step": 2161 }, { "epoch": 0.9742029965078293, "grad_norm": 0.9393637507775909, "learning_rate": 9.014820386569756e-09, "loss": 0.1517, "step": 2162 }, { "epoch": 0.9746535991889151, "grad_norm": 0.961731341583023, "learning_rate": 8.701467553851317e-09, "loss": 0.1694, "step": 2163 }, { "epoch": 0.9751042018700011, "grad_norm": 0.9826839522611126, "learning_rate": 8.393648021222666e-09, "loss": 0.1748, "step": 2164 }, { "epoch": 0.9755548045510871, "grad_norm": 0.907644786835214, "learning_rate": 8.09136247236636e-09, "loss": 0.1717, "step": 2165 }, { "epoch": 0.976005407232173, "grad_norm": 1.0061970372794153, "learning_rate": 7.79461157867395e-09, "loss": 0.1799, "step": 2166 }, { "epoch": 0.976456009913259, "grad_norm": 0.9357599268796304, "learning_rate": 7.503395999244045e-09, "loss": 0.1647, "step": 2167 }, { "epoch": 0.976906612594345, "grad_norm": 0.9357608624110657, "learning_rate": 7.217716380881479e-09, "loss": 0.1711, "step": 2168 }, { "epoch": 0.9773572152754308, "grad_norm": 0.962898995983811, "learning_rate": 6.937573358094529e-09, "loss": 0.1752, "step": 2169 }, { "epoch": 0.9778078179565168, "grad_norm": 0.9687006598048918, "learning_rate": 6.662967553095756e-09, "loss": 0.1674, "step": 2170 }, { "epoch": 0.9782584206376028, "grad_norm": 0.9104077045031495, "learning_rate": 6.3938995757981125e-09, "loss": 0.1609, "step": 2171 }, { "epoch": 0.9787090233186887, "grad_norm": 0.9437695260385421, "learning_rate": 6.1303700238152245e-09, "loss": 0.1705, "step": 2172 }, { "epoch": 0.9791596259997747, "grad_norm": 1.0098665718813014, "learning_rate": 5.8723794824597226e-09, "loss": 0.169, "step": 2173 }, { "epoch": 0.9796102286808607, "grad_norm": 0.9436186302476126, "learning_rate": 5.6199285247415805e-09, "loss": 0.1725, "step": 2174 }, { "epoch": 0.9800608313619465, "grad_norm": 0.9388694365891636, "learning_rate": 5.373017711367001e-09, "loss": 0.1674, "step": 2175 }, { "epoch": 0.9805114340430325, "grad_norm": 0.937859556500245, "learning_rate": 5.131647590737587e-09, "loss": 0.1694, "step": 2176 }, { "epoch": 0.9809620367241185, "grad_norm": 1.0038011437378807, "learning_rate": 4.895818698948396e-09, "loss": 0.1753, "step": 2177 }, { "epoch": 0.9814126394052045, "grad_norm": 0.934190879507068, "learning_rate": 4.6655315597876615e-09, "loss": 0.1548, "step": 2178 }, { "epoch": 0.9818632420862904, "grad_norm": 0.9931696688123586, "learning_rate": 4.440786684734577e-09, "loss": 0.1596, "step": 2179 }, { "epoch": 0.9823138447673764, "grad_norm": 0.9113361170344727, "learning_rate": 4.221584572958737e-09, "loss": 0.1569, "step": 2180 }, { "epoch": 0.9827644474484624, "grad_norm": 0.9184095650294223, "learning_rate": 4.0079257113190275e-09, "loss": 0.1563, "step": 2181 }, { "epoch": 0.9832150501295482, "grad_norm": 0.9809381433522103, "learning_rate": 3.799810574363072e-09, "loss": 0.18, "step": 2182 }, { "epoch": 0.9836656528106342, "grad_norm": 0.9200844447422045, "learning_rate": 3.597239624325011e-09, "loss": 0.1636, "step": 2183 }, { "epoch": 0.9841162554917202, "grad_norm": 0.9477262368387944, "learning_rate": 3.4002133111246673e-09, "loss": 0.163, "step": 2184 }, { "epoch": 0.9845668581728061, "grad_norm": 0.9829302587131702, "learning_rate": 3.208732072368104e-09, "loss": 0.1828, "step": 2185 }, { "epoch": 0.9850174608538921, "grad_norm": 0.9323177390240451, "learning_rate": 3.022796333344291e-09, "loss": 0.1686, "step": 2186 }, { "epoch": 0.9854680635349781, "grad_norm": 0.97959980859569, "learning_rate": 2.8424065070262186e-09, "loss": 0.1721, "step": 2187 }, { "epoch": 0.9859186662160639, "grad_norm": 0.9360035720240707, "learning_rate": 2.6675629940689508e-09, "loss": 0.1661, "step": 2188 }, { "epoch": 0.9863692688971499, "grad_norm": 0.9829560482882546, "learning_rate": 2.4982661828085175e-09, "loss": 0.1745, "step": 2189 }, { "epoch": 0.9868198715782359, "grad_norm": 0.8791817829948829, "learning_rate": 2.3345164492616367e-09, "loss": 0.1481, "step": 2190 }, { "epoch": 0.9872704742593218, "grad_norm": 0.9195574805142371, "learning_rate": 2.1763141571248813e-09, "loss": 0.173, "step": 2191 }, { "epoch": 0.9877210769404078, "grad_norm": 0.9689298058051112, "learning_rate": 2.0236596577738466e-09, "loss": 0.1746, "step": 2192 }, { "epoch": 0.9881716796214938, "grad_norm": 0.9370644186173055, "learning_rate": 1.876553290261207e-09, "loss": 0.1744, "step": 2193 }, { "epoch": 0.9886222823025796, "grad_norm": 0.905510873781387, "learning_rate": 1.7349953813183828e-09, "loss": 0.1543, "step": 2194 }, { "epoch": 0.9890728849836656, "grad_norm": 0.9101236065698919, "learning_rate": 1.5989862453522075e-09, "loss": 0.1578, "step": 2195 }, { "epoch": 0.9895234876647516, "grad_norm": 0.8847140026088114, "learning_rate": 1.468526184445762e-09, "loss": 0.1527, "step": 2196 }, { "epoch": 0.9899740903458376, "grad_norm": 0.9682107112247759, "learning_rate": 1.343615488357819e-09, "loss": 0.1662, "step": 2197 }, { "epoch": 0.9904246930269235, "grad_norm": 0.9471219244863741, "learning_rate": 1.2242544345211772e-09, "loss": 0.1595, "step": 2198 }, { "epoch": 0.9908752957080095, "grad_norm": 0.9464719657257396, "learning_rate": 1.1104432880429394e-09, "loss": 0.1605, "step": 2199 }, { "epoch": 0.9913258983890955, "grad_norm": 0.952522471678984, "learning_rate": 1.0021823017028475e-09, "loss": 0.1706, "step": 2200 }, { "epoch": 0.9917765010701813, "grad_norm": 0.9166081061347666, "learning_rate": 8.994717159546695e-10, "loss": 0.1585, "step": 2201 }, { "epoch": 0.9922271037512673, "grad_norm": 0.894856070306113, "learning_rate": 8.023117589237017e-10, "loss": 0.1594, "step": 2202 }, { "epoch": 0.9926777064323533, "grad_norm": 0.8818944106764158, "learning_rate": 7.10702646406769e-10, "loss": 0.156, "step": 2203 }, { "epoch": 0.9931283091134392, "grad_norm": 0.9621081747527447, "learning_rate": 6.246445818727798e-10, "loss": 0.1742, "step": 2204 }, { "epoch": 0.9935789117945252, "grad_norm": 0.9263306240100847, "learning_rate": 5.44137756460783e-10, "loss": 0.1645, "step": 2205 }, { "epoch": 0.9940295144756112, "grad_norm": 1.0081667902258236, "learning_rate": 4.691823489805236e-10, "loss": 0.1873, "step": 2206 }, { "epoch": 0.994480117156697, "grad_norm": 0.8830293356060712, "learning_rate": 3.9977852591188694e-10, "loss": 0.1563, "step": 2207 }, { "epoch": 0.994930719837783, "grad_norm": 0.9731367845524682, "learning_rate": 3.3592644140434393e-10, "loss": 0.1622, "step": 2208 }, { "epoch": 0.995381322518869, "grad_norm": 0.9199241966535081, "learning_rate": 2.776262372761185e-10, "loss": 0.1696, "step": 2209 }, { "epoch": 0.9958319251999549, "grad_norm": 0.960822046253473, "learning_rate": 2.2487804301557503e-10, "loss": 0.1634, "step": 2210 }, { "epoch": 0.9962825278810409, "grad_norm": 0.9356621441742621, "learning_rate": 1.776819757787207e-10, "loss": 0.164, "step": 2211 }, { "epoch": 0.9967331305621269, "grad_norm": 1.0160315804220252, "learning_rate": 1.3603814039031547e-10, "loss": 0.1824, "step": 2212 }, { "epoch": 0.9971837332432127, "grad_norm": 0.9952641254925227, "learning_rate": 9.994662934387223e-11, "loss": 0.1648, "step": 2213 }, { "epoch": 0.9976343359242987, "grad_norm": 0.9693799712923902, "learning_rate": 6.94075227999913e-11, "loss": 0.1677, "step": 2214 }, { "epoch": 0.9980849386053847, "grad_norm": 1.0026671968102894, "learning_rate": 4.44208885877484e-11, "loss": 0.1815, "step": 2215 }, { "epoch": 0.9985355412864707, "grad_norm": 0.9105412050504266, "learning_rate": 2.498678220386186e-11, "loss": 0.1594, "step": 2216 }, { "epoch": 0.9989861439675566, "grad_norm": 0.9882824663570642, "learning_rate": 1.1105246812137538e-11, "loss": 0.1816, "step": 2217 }, { "epoch": 0.9994367466486426, "grad_norm": 0.9497467838011637, "learning_rate": 2.7763132445790543e-12, "loss": 0.1736, "step": 2218 }, { "epoch": 0.9998873493297286, "grad_norm": 1.0189201478595424, "learning_rate": 0.0, "loss": 0.1765, "step": 2219 }, { "epoch": 0.9998873493297286, "step": 2219, "total_flos": 369431437639680.0, "train_loss": 0.19088607980840752, "train_runtime": 22854.2492, "train_samples_per_second": 6.214, "train_steps_per_second": 0.097 } ], "logging_steps": 1, "max_steps": 2219, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 369431437639680.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }