{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1901, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005260389268805891, "grad_norm": 4.7434234619140625, "learning_rate": 5.0000000000000004e-08, "loss": 1.7896, "step": 1 }, { "epoch": 0.0010520778537611783, "grad_norm": 4.893940448760986, "learning_rate": 1.0000000000000001e-07, "loss": 1.8423, "step": 2 }, { "epoch": 0.0015781167806417674, "grad_norm": 5.008203029632568, "learning_rate": 1.5000000000000002e-07, "loss": 1.7775, "step": 3 }, { "epoch": 0.0021041557075223566, "grad_norm": 4.682094097137451, "learning_rate": 2.0000000000000002e-07, "loss": 1.7132, "step": 4 }, { "epoch": 0.0026301946344029457, "grad_norm": 5.076476097106934, "learning_rate": 2.5000000000000004e-07, "loss": 1.7946, "step": 5 }, { "epoch": 0.003156233561283535, "grad_norm": 5.164911270141602, "learning_rate": 3.0000000000000004e-07, "loss": 1.7562, "step": 6 }, { "epoch": 0.003682272488164124, "grad_norm": 5.532482624053955, "learning_rate": 3.5000000000000004e-07, "loss": 1.9173, "step": 7 }, { "epoch": 0.004208311415044713, "grad_norm": 4.994466304779053, "learning_rate": 4.0000000000000003e-07, "loss": 1.8048, "step": 8 }, { "epoch": 0.004734350341925302, "grad_norm": 4.728099822998047, "learning_rate": 4.5000000000000003e-07, "loss": 1.8313, "step": 9 }, { "epoch": 0.0052603892688058915, "grad_norm": 4.757445335388184, "learning_rate": 5.000000000000001e-07, "loss": 1.7745, "step": 10 }, { "epoch": 0.005786428195686481, "grad_norm": 4.926065444946289, "learning_rate": 5.5e-07, "loss": 1.8448, "step": 11 }, { "epoch": 0.00631246712256707, "grad_norm": 4.987133979797363, "learning_rate": 6.000000000000001e-07, "loss": 1.7755, "step": 12 }, { "epoch": 0.006838506049447659, "grad_norm": 4.783141613006592, "learning_rate": 6.5e-07, "loss": 1.7815, "step": 13 }, { "epoch": 0.007364544976328248, "grad_norm": 4.668217182159424, "learning_rate": 7.000000000000001e-07, "loss": 1.754, "step": 14 }, { "epoch": 0.007890583903208837, "grad_norm": 4.673665523529053, "learning_rate": 7.5e-07, "loss": 1.7608, "step": 15 }, { "epoch": 0.008416622830089426, "grad_norm": 4.452486991882324, "learning_rate": 8.000000000000001e-07, "loss": 1.7222, "step": 16 }, { "epoch": 0.008942661756970016, "grad_norm": 4.257665157318115, "learning_rate": 8.500000000000001e-07, "loss": 1.7556, "step": 17 }, { "epoch": 0.009468700683850605, "grad_norm": 4.1270432472229, "learning_rate": 9.000000000000001e-07, "loss": 1.7121, "step": 18 }, { "epoch": 0.009994739610731194, "grad_norm": 4.321215629577637, "learning_rate": 9.500000000000001e-07, "loss": 1.7584, "step": 19 }, { "epoch": 0.010520778537611783, "grad_norm": 3.8703970909118652, "learning_rate": 1.0000000000000002e-06, "loss": 1.6611, "step": 20 }, { "epoch": 0.011046817464492372, "grad_norm": 4.07947301864624, "learning_rate": 1.0500000000000001e-06, "loss": 1.7914, "step": 21 }, { "epoch": 0.011572856391372961, "grad_norm": 3.9068686962127686, "learning_rate": 1.1e-06, "loss": 1.7848, "step": 22 }, { "epoch": 0.01209889531825355, "grad_norm": 3.7697386741638184, "learning_rate": 1.1500000000000002e-06, "loss": 1.6694, "step": 23 }, { "epoch": 0.01262493424513414, "grad_norm": 3.795276641845703, "learning_rate": 1.2000000000000002e-06, "loss": 1.759, "step": 24 }, { "epoch": 0.013150973172014729, "grad_norm": 3.331472396850586, "learning_rate": 1.25e-06, "loss": 1.7053, "step": 25 }, { "epoch": 0.013677012098895318, "grad_norm": 3.381592035293579, "learning_rate": 1.3e-06, "loss": 1.683, "step": 26 }, { "epoch": 0.014203051025775907, "grad_norm": 3.2494184970855713, "learning_rate": 1.3500000000000002e-06, "loss": 1.5756, "step": 27 }, { "epoch": 0.014729089952656496, "grad_norm": 3.124213695526123, "learning_rate": 1.4000000000000001e-06, "loss": 1.7102, "step": 28 }, { "epoch": 0.015255128879537085, "grad_norm": 2.9148762226104736, "learning_rate": 1.45e-06, "loss": 1.6007, "step": 29 }, { "epoch": 0.015781167806417674, "grad_norm": 2.886734962463379, "learning_rate": 1.5e-06, "loss": 1.7086, "step": 30 }, { "epoch": 0.016307206733298264, "grad_norm": 2.6898605823516846, "learning_rate": 1.5500000000000002e-06, "loss": 1.5655, "step": 31 }, { "epoch": 0.016833245660178853, "grad_norm": 2.6458981037139893, "learning_rate": 1.6000000000000001e-06, "loss": 1.4881, "step": 32 }, { "epoch": 0.017359284587059442, "grad_norm": 2.481387138366699, "learning_rate": 1.6500000000000003e-06, "loss": 1.5608, "step": 33 }, { "epoch": 0.01788532351394003, "grad_norm": 2.743023633956909, "learning_rate": 1.7000000000000002e-06, "loss": 1.5705, "step": 34 }, { "epoch": 0.01841136244082062, "grad_norm": 2.7273406982421875, "learning_rate": 1.75e-06, "loss": 1.5819, "step": 35 }, { "epoch": 0.01893740136770121, "grad_norm": 2.7253308296203613, "learning_rate": 1.8000000000000001e-06, "loss": 1.5201, "step": 36 }, { "epoch": 0.0194634402945818, "grad_norm": 2.8794732093811035, "learning_rate": 1.85e-06, "loss": 1.4743, "step": 37 }, { "epoch": 0.019989479221462388, "grad_norm": 2.767172336578369, "learning_rate": 1.9000000000000002e-06, "loss": 1.5366, "step": 38 }, { "epoch": 0.020515518148342977, "grad_norm": 2.84169864654541, "learning_rate": 1.9500000000000004e-06, "loss": 1.5635, "step": 39 }, { "epoch": 0.021041557075223566, "grad_norm": 2.6982147693634033, "learning_rate": 2.0000000000000003e-06, "loss": 1.49, "step": 40 }, { "epoch": 0.021567596002104155, "grad_norm": 2.597731590270996, "learning_rate": 2.05e-06, "loss": 1.5189, "step": 41 }, { "epoch": 0.022093634928984744, "grad_norm": 2.4286556243896484, "learning_rate": 2.1000000000000002e-06, "loss": 1.4439, "step": 42 }, { "epoch": 0.022619673855865333, "grad_norm": 2.6267499923706055, "learning_rate": 2.15e-06, "loss": 1.3522, "step": 43 }, { "epoch": 0.023145712782745922, "grad_norm": 2.2576816082000732, "learning_rate": 2.2e-06, "loss": 1.4713, "step": 44 }, { "epoch": 0.02367175170962651, "grad_norm": 2.406381368637085, "learning_rate": 2.25e-06, "loss": 1.47, "step": 45 }, { "epoch": 0.0241977906365071, "grad_norm": 2.2341415882110596, "learning_rate": 2.3000000000000004e-06, "loss": 1.4041, "step": 46 }, { "epoch": 0.02472382956338769, "grad_norm": 2.5055644512176514, "learning_rate": 2.35e-06, "loss": 1.4321, "step": 47 }, { "epoch": 0.02524986849026828, "grad_norm": 2.2131927013397217, "learning_rate": 2.4000000000000003e-06, "loss": 1.3631, "step": 48 }, { "epoch": 0.025775907417148868, "grad_norm": 2.3399457931518555, "learning_rate": 2.4500000000000003e-06, "loss": 1.4055, "step": 49 }, { "epoch": 0.026301946344029457, "grad_norm": 2.2194554805755615, "learning_rate": 2.5e-06, "loss": 1.3722, "step": 50 }, { "epoch": 0.026827985270910047, "grad_norm": 2.196530342102051, "learning_rate": 2.55e-06, "loss": 1.4126, "step": 51 }, { "epoch": 0.027354024197790636, "grad_norm": 2.401376485824585, "learning_rate": 2.6e-06, "loss": 1.4174, "step": 52 }, { "epoch": 0.027880063124671225, "grad_norm": 2.2509777545928955, "learning_rate": 2.6500000000000005e-06, "loss": 1.3725, "step": 53 }, { "epoch": 0.028406102051551814, "grad_norm": 2.2538340091705322, "learning_rate": 2.7000000000000004e-06, "loss": 1.4274, "step": 54 }, { "epoch": 0.028932140978432403, "grad_norm": 2.218494176864624, "learning_rate": 2.7500000000000004e-06, "loss": 1.4518, "step": 55 }, { "epoch": 0.029458179905312992, "grad_norm": 2.06544828414917, "learning_rate": 2.8000000000000003e-06, "loss": 1.3547, "step": 56 }, { "epoch": 0.02998421883219358, "grad_norm": 2.014075994491577, "learning_rate": 2.85e-06, "loss": 1.2274, "step": 57 }, { "epoch": 0.03051025775907417, "grad_norm": 2.187418222427368, "learning_rate": 2.9e-06, "loss": 1.3663, "step": 58 }, { "epoch": 0.03103629668595476, "grad_norm": 1.993913173675537, "learning_rate": 2.95e-06, "loss": 1.3357, "step": 59 }, { "epoch": 0.03156233561283535, "grad_norm": 2.1067426204681396, "learning_rate": 3e-06, "loss": 1.3627, "step": 60 }, { "epoch": 0.03208837453971594, "grad_norm": 2.0144565105438232, "learning_rate": 3.05e-06, "loss": 1.394, "step": 61 }, { "epoch": 0.03261441346659653, "grad_norm": 2.2240288257598877, "learning_rate": 3.1000000000000004e-06, "loss": 1.3657, "step": 62 }, { "epoch": 0.03314045239347712, "grad_norm": 2.0080718994140625, "learning_rate": 3.1500000000000003e-06, "loss": 1.2954, "step": 63 }, { "epoch": 0.033666491320357705, "grad_norm": 2.1592211723327637, "learning_rate": 3.2000000000000003e-06, "loss": 1.363, "step": 64 }, { "epoch": 0.0341925302472383, "grad_norm": 2.1390435695648193, "learning_rate": 3.2500000000000002e-06, "loss": 1.3329, "step": 65 }, { "epoch": 0.034718569174118884, "grad_norm": 2.309795379638672, "learning_rate": 3.3000000000000006e-06, "loss": 1.3378, "step": 66 }, { "epoch": 0.035244608100999476, "grad_norm": 2.0283970832824707, "learning_rate": 3.3500000000000005e-06, "loss": 1.2707, "step": 67 }, { "epoch": 0.03577064702788006, "grad_norm": 2.3350703716278076, "learning_rate": 3.4000000000000005e-06, "loss": 1.3149, "step": 68 }, { "epoch": 0.036296685954760655, "grad_norm": 2.1374268531799316, "learning_rate": 3.45e-06, "loss": 1.3181, "step": 69 }, { "epoch": 0.03682272488164124, "grad_norm": 2.1340744495391846, "learning_rate": 3.5e-06, "loss": 1.2968, "step": 70 }, { "epoch": 0.03734876380852183, "grad_norm": 2.212939500808716, "learning_rate": 3.5500000000000003e-06, "loss": 1.3285, "step": 71 }, { "epoch": 0.03787480273540242, "grad_norm": 2.0891077518463135, "learning_rate": 3.6000000000000003e-06, "loss": 1.3142, "step": 72 }, { "epoch": 0.03840084166228301, "grad_norm": 2.0146496295928955, "learning_rate": 3.65e-06, "loss": 1.2932, "step": 73 }, { "epoch": 0.0389268805891636, "grad_norm": 2.2315266132354736, "learning_rate": 3.7e-06, "loss": 1.3515, "step": 74 }, { "epoch": 0.03945291951604419, "grad_norm": 2.0311717987060547, "learning_rate": 3.7500000000000005e-06, "loss": 1.2601, "step": 75 }, { "epoch": 0.039978958442924775, "grad_norm": 1.9522899389266968, "learning_rate": 3.8000000000000005e-06, "loss": 1.3521, "step": 76 }, { "epoch": 0.04050499736980537, "grad_norm": 2.0501742362976074, "learning_rate": 3.85e-06, "loss": 1.3243, "step": 77 }, { "epoch": 0.041031036296685953, "grad_norm": 2.136033535003662, "learning_rate": 3.900000000000001e-06, "loss": 1.3373, "step": 78 }, { "epoch": 0.041557075223566546, "grad_norm": 2.328866958618164, "learning_rate": 3.95e-06, "loss": 1.2864, "step": 79 }, { "epoch": 0.04208311415044713, "grad_norm": 2.0889344215393066, "learning_rate": 4.000000000000001e-06, "loss": 1.2692, "step": 80 }, { "epoch": 0.042609153077327724, "grad_norm": 2.088667631149292, "learning_rate": 4.05e-06, "loss": 1.2232, "step": 81 }, { "epoch": 0.04313519200420831, "grad_norm": 2.0293898582458496, "learning_rate": 4.1e-06, "loss": 1.2505, "step": 82 }, { "epoch": 0.0436612309310889, "grad_norm": 2.240025281906128, "learning_rate": 4.15e-06, "loss": 1.3107, "step": 83 }, { "epoch": 0.04418726985796949, "grad_norm": 2.123445987701416, "learning_rate": 4.2000000000000004e-06, "loss": 1.1674, "step": 84 }, { "epoch": 0.04471330878485008, "grad_norm": 2.1865620613098145, "learning_rate": 4.25e-06, "loss": 1.3257, "step": 85 }, { "epoch": 0.04523934771173067, "grad_norm": 2.1336405277252197, "learning_rate": 4.3e-06, "loss": 1.2968, "step": 86 }, { "epoch": 0.04576538663861126, "grad_norm": 2.117763042449951, "learning_rate": 4.350000000000001e-06, "loss": 1.2294, "step": 87 }, { "epoch": 0.046291425565491845, "grad_norm": 1.9969348907470703, "learning_rate": 4.4e-06, "loss": 1.2621, "step": 88 }, { "epoch": 0.04681746449237244, "grad_norm": 2.24861741065979, "learning_rate": 4.450000000000001e-06, "loss": 1.2909, "step": 89 }, { "epoch": 0.04734350341925302, "grad_norm": 2.08335542678833, "learning_rate": 4.5e-06, "loss": 1.2691, "step": 90 }, { "epoch": 0.047869542346133616, "grad_norm": 2.1306045055389404, "learning_rate": 4.5500000000000005e-06, "loss": 1.3248, "step": 91 }, { "epoch": 0.0483955812730142, "grad_norm": 2.2251298427581787, "learning_rate": 4.600000000000001e-06, "loss": 1.2391, "step": 92 }, { "epoch": 0.048921620199894794, "grad_norm": 2.1604959964752197, "learning_rate": 4.65e-06, "loss": 1.2169, "step": 93 }, { "epoch": 0.04944765912677538, "grad_norm": 2.0155038833618164, "learning_rate": 4.7e-06, "loss": 1.2533, "step": 94 }, { "epoch": 0.04997369805365597, "grad_norm": 1.9579726457595825, "learning_rate": 4.75e-06, "loss": 1.2228, "step": 95 }, { "epoch": 0.05049973698053656, "grad_norm": 2.129992961883545, "learning_rate": 4.800000000000001e-06, "loss": 1.2573, "step": 96 }, { "epoch": 0.05102577590741715, "grad_norm": 2.0832459926605225, "learning_rate": 4.85e-06, "loss": 1.241, "step": 97 }, { "epoch": 0.051551814834297736, "grad_norm": 2.278550148010254, "learning_rate": 4.9000000000000005e-06, "loss": 1.2565, "step": 98 }, { "epoch": 0.05207785376117833, "grad_norm": 2.0997259616851807, "learning_rate": 4.95e-06, "loss": 1.2445, "step": 99 }, { "epoch": 0.052603892688058915, "grad_norm": 2.127976417541504, "learning_rate": 5e-06, "loss": 1.2605, "step": 100 }, { "epoch": 0.05312993161493951, "grad_norm": 2.1200127601623535, "learning_rate": 4.9999999034856715e-06, "loss": 1.3057, "step": 101 }, { "epoch": 0.05365597054182009, "grad_norm": 2.456881046295166, "learning_rate": 4.999999613942694e-06, "loss": 1.2741, "step": 102 }, { "epoch": 0.054182009468700686, "grad_norm": 2.189507484436035, "learning_rate": 4.9999991313710884e-06, "loss": 1.2399, "step": 103 }, { "epoch": 0.05470804839558127, "grad_norm": 2.258619785308838, "learning_rate": 4.9999984557708936e-06, "loss": 1.2161, "step": 104 }, { "epoch": 0.055234087322461864, "grad_norm": 1.983225703239441, "learning_rate": 4.999997587142161e-06, "loss": 1.2027, "step": 105 }, { "epoch": 0.05576012624934245, "grad_norm": 2.1400973796844482, "learning_rate": 4.999996525484957e-06, "loss": 1.2685, "step": 106 }, { "epoch": 0.05628616517622304, "grad_norm": 1.9494950771331787, "learning_rate": 4.999995270799365e-06, "loss": 1.2604, "step": 107 }, { "epoch": 0.05681220410310363, "grad_norm": 2.1203386783599854, "learning_rate": 4.9999938230854814e-06, "loss": 1.2345, "step": 108 }, { "epoch": 0.05733824302998422, "grad_norm": 2.131884813308716, "learning_rate": 4.999992182343417e-06, "loss": 1.2097, "step": 109 }, { "epoch": 0.057864281956864806, "grad_norm": 2.136289119720459, "learning_rate": 4.9999903485732996e-06, "loss": 1.2617, "step": 110 }, { "epoch": 0.0583903208837454, "grad_norm": 2.025071144104004, "learning_rate": 4.9999883217752705e-06, "loss": 1.2004, "step": 111 }, { "epoch": 0.058916359810625984, "grad_norm": 2.513960838317871, "learning_rate": 4.999986101949486e-06, "loss": 1.2399, "step": 112 }, { "epoch": 0.05944239873750658, "grad_norm": 2.2483277320861816, "learning_rate": 4.999983689096117e-06, "loss": 1.2265, "step": 113 }, { "epoch": 0.05996843766438716, "grad_norm": 2.0863187313079834, "learning_rate": 4.999981083215352e-06, "loss": 1.1969, "step": 114 }, { "epoch": 0.060494476591267755, "grad_norm": 2.1240596771240234, "learning_rate": 4.99997828430739e-06, "loss": 1.275, "step": 115 }, { "epoch": 0.06102051551814834, "grad_norm": 2.3810060024261475, "learning_rate": 4.9999752923724465e-06, "loss": 1.3054, "step": 116 }, { "epoch": 0.061546554445028934, "grad_norm": 2.1266205310821533, "learning_rate": 4.999972107410754e-06, "loss": 1.1933, "step": 117 }, { "epoch": 0.06207259337190952, "grad_norm": 2.039619207382202, "learning_rate": 4.999968729422559e-06, "loss": 1.1886, "step": 118 }, { "epoch": 0.0625986322987901, "grad_norm": 2.024503707885742, "learning_rate": 4.999965158408122e-06, "loss": 1.2008, "step": 119 }, { "epoch": 0.0631246712256707, "grad_norm": 2.058926582336426, "learning_rate": 4.999961394367717e-06, "loss": 1.1772, "step": 120 }, { "epoch": 0.06365071015255129, "grad_norm": 1.989399790763855, "learning_rate": 4.999957437301637e-06, "loss": 1.1869, "step": 121 }, { "epoch": 0.06417674907943188, "grad_norm": 2.0462567806243896, "learning_rate": 4.999953287210185e-06, "loss": 1.1944, "step": 122 }, { "epoch": 0.06470278800631246, "grad_norm": 2.258549213409424, "learning_rate": 4.999948944093683e-06, "loss": 1.2304, "step": 123 }, { "epoch": 0.06522882693319305, "grad_norm": 2.115344285964966, "learning_rate": 4.999944407952467e-06, "loss": 1.1901, "step": 124 }, { "epoch": 0.06575486586007365, "grad_norm": 2.082406997680664, "learning_rate": 4.999939678786886e-06, "loss": 1.2481, "step": 125 }, { "epoch": 0.06628090478695424, "grad_norm": 2.5095906257629395, "learning_rate": 4.999934756597305e-06, "loss": 1.2526, "step": 126 }, { "epoch": 0.06680694371383482, "grad_norm": 1.989524483680725, "learning_rate": 4.999929641384105e-06, "loss": 1.2298, "step": 127 }, { "epoch": 0.06733298264071541, "grad_norm": 2.3429722785949707, "learning_rate": 4.999924333147681e-06, "loss": 1.2511, "step": 128 }, { "epoch": 0.067859021567596, "grad_norm": 2.064497232437134, "learning_rate": 4.999918831888441e-06, "loss": 1.2041, "step": 129 }, { "epoch": 0.0683850604944766, "grad_norm": 2.099992513656616, "learning_rate": 4.999913137606813e-06, "loss": 1.2256, "step": 130 }, { "epoch": 0.06891109942135717, "grad_norm": 2.188778877258301, "learning_rate": 4.999907250303234e-06, "loss": 1.2009, "step": 131 }, { "epoch": 0.06943713834823777, "grad_norm": 2.154895067214966, "learning_rate": 4.999901169978158e-06, "loss": 1.273, "step": 132 }, { "epoch": 0.06996317727511836, "grad_norm": 2.457084894180298, "learning_rate": 4.999894896632058e-06, "loss": 1.2003, "step": 133 }, { "epoch": 0.07048921620199895, "grad_norm": 2.0455472469329834, "learning_rate": 4.999888430265415e-06, "loss": 1.1909, "step": 134 }, { "epoch": 0.07101525512887953, "grad_norm": 2.3690097332000732, "learning_rate": 4.99988177087873e-06, "loss": 1.2414, "step": 135 }, { "epoch": 0.07154129405576012, "grad_norm": 2.0194432735443115, "learning_rate": 4.999874918472516e-06, "loss": 1.2072, "step": 136 }, { "epoch": 0.07206733298264072, "grad_norm": 2.0639989376068115, "learning_rate": 4.999867873047303e-06, "loss": 1.1853, "step": 137 }, { "epoch": 0.07259337190952131, "grad_norm": 2.1263129711151123, "learning_rate": 4.999860634603635e-06, "loss": 1.1915, "step": 138 }, { "epoch": 0.07311941083640189, "grad_norm": 1.9768770933151245, "learning_rate": 4.99985320314207e-06, "loss": 1.1623, "step": 139 }, { "epoch": 0.07364544976328248, "grad_norm": 2.4466986656188965, "learning_rate": 4.9998455786631835e-06, "loss": 1.2549, "step": 140 }, { "epoch": 0.07417148869016307, "grad_norm": 2.482954263687134, "learning_rate": 4.999837761167563e-06, "loss": 1.1503, "step": 141 }, { "epoch": 0.07469752761704367, "grad_norm": 2.1949164867401123, "learning_rate": 4.9998297506558116e-06, "loss": 1.2515, "step": 142 }, { "epoch": 0.07522356654392424, "grad_norm": 2.3435401916503906, "learning_rate": 4.9998215471285486e-06, "loss": 1.2231, "step": 143 }, { "epoch": 0.07574960547080484, "grad_norm": 2.2442994117736816, "learning_rate": 4.9998131505864064e-06, "loss": 1.2472, "step": 144 }, { "epoch": 0.07627564439768543, "grad_norm": 2.4117157459259033, "learning_rate": 4.999804561030036e-06, "loss": 1.2303, "step": 145 }, { "epoch": 0.07680168332456602, "grad_norm": 2.263303279876709, "learning_rate": 4.999795778460097e-06, "loss": 1.2435, "step": 146 }, { "epoch": 0.0773277222514466, "grad_norm": 2.174962282180786, "learning_rate": 4.99978680287727e-06, "loss": 1.2074, "step": 147 }, { "epoch": 0.0778537611783272, "grad_norm": 2.1498875617980957, "learning_rate": 4.999777634282248e-06, "loss": 1.1665, "step": 148 }, { "epoch": 0.07837980010520779, "grad_norm": 2.0245747566223145, "learning_rate": 4.999768272675737e-06, "loss": 1.169, "step": 149 }, { "epoch": 0.07890583903208838, "grad_norm": 2.03243350982666, "learning_rate": 4.999758718058462e-06, "loss": 1.2113, "step": 150 }, { "epoch": 0.07943187795896896, "grad_norm": 2.104052782058716, "learning_rate": 4.9997489704311586e-06, "loss": 1.1792, "step": 151 }, { "epoch": 0.07995791688584955, "grad_norm": 2.16056752204895, "learning_rate": 4.999739029794581e-06, "loss": 1.2183, "step": 152 }, { "epoch": 0.08048395581273014, "grad_norm": 2.1418581008911133, "learning_rate": 4.9997288961494975e-06, "loss": 1.2024, "step": 153 }, { "epoch": 0.08100999473961074, "grad_norm": 2.235917329788208, "learning_rate": 4.999718569496688e-06, "loss": 1.2234, "step": 154 }, { "epoch": 0.08153603366649131, "grad_norm": 2.0039474964141846, "learning_rate": 4.999708049836952e-06, "loss": 1.1164, "step": 155 }, { "epoch": 0.08206207259337191, "grad_norm": 2.0888242721557617, "learning_rate": 4.9996973371710995e-06, "loss": 1.1935, "step": 156 }, { "epoch": 0.0825881115202525, "grad_norm": 2.245558500289917, "learning_rate": 4.999686431499961e-06, "loss": 1.1438, "step": 157 }, { "epoch": 0.08311415044713309, "grad_norm": 2.351905345916748, "learning_rate": 4.999675332824376e-06, "loss": 1.2208, "step": 158 }, { "epoch": 0.08364018937401367, "grad_norm": 2.0418808460235596, "learning_rate": 4.999664041145201e-06, "loss": 1.1537, "step": 159 }, { "epoch": 0.08416622830089426, "grad_norm": 2.194399118423462, "learning_rate": 4.99965255646331e-06, "loss": 1.1602, "step": 160 }, { "epoch": 0.08469226722777486, "grad_norm": 2.4853098392486572, "learning_rate": 4.999640878779588e-06, "loss": 1.1981, "step": 161 }, { "epoch": 0.08521830615465545, "grad_norm": 2.1702558994293213, "learning_rate": 4.9996290080949386e-06, "loss": 1.1682, "step": 162 }, { "epoch": 0.08574434508153603, "grad_norm": 2.150707960128784, "learning_rate": 4.999616944410276e-06, "loss": 1.2123, "step": 163 }, { "epoch": 0.08627038400841662, "grad_norm": 2.166897773742676, "learning_rate": 4.9996046877265325e-06, "loss": 1.1855, "step": 164 }, { "epoch": 0.08679642293529721, "grad_norm": 2.1538188457489014, "learning_rate": 4.999592238044655e-06, "loss": 1.1797, "step": 165 }, { "epoch": 0.0873224618621778, "grad_norm": 2.222170114517212, "learning_rate": 4.999579595365604e-06, "loss": 1.1606, "step": 166 }, { "epoch": 0.08784850078905838, "grad_norm": 2.264437437057495, "learning_rate": 4.999566759690356e-06, "loss": 1.1662, "step": 167 }, { "epoch": 0.08837453971593898, "grad_norm": 2.2306337356567383, "learning_rate": 4.999553731019903e-06, "loss": 1.1933, "step": 168 }, { "epoch": 0.08890057864281957, "grad_norm": 2.2025609016418457, "learning_rate": 4.9995405093552495e-06, "loss": 1.2241, "step": 169 }, { "epoch": 0.08942661756970016, "grad_norm": 2.3908772468566895, "learning_rate": 4.999527094697418e-06, "loss": 1.1954, "step": 170 }, { "epoch": 0.08995265649658074, "grad_norm": 2.1161653995513916, "learning_rate": 4.999513487047442e-06, "loss": 1.2315, "step": 171 }, { "epoch": 0.09047869542346133, "grad_norm": 2.0984017848968506, "learning_rate": 4.9994996864063735e-06, "loss": 1.2413, "step": 172 }, { "epoch": 0.09100473435034193, "grad_norm": 2.205087900161743, "learning_rate": 4.999485692775279e-06, "loss": 1.2267, "step": 173 }, { "epoch": 0.09153077327722252, "grad_norm": 2.224553108215332, "learning_rate": 4.9994715061552365e-06, "loss": 1.1613, "step": 174 }, { "epoch": 0.0920568122041031, "grad_norm": 2.191676139831543, "learning_rate": 4.999457126547344e-06, "loss": 1.168, "step": 175 }, { "epoch": 0.09258285113098369, "grad_norm": 2.2432751655578613, "learning_rate": 4.99944255395271e-06, "loss": 1.218, "step": 176 }, { "epoch": 0.09310889005786428, "grad_norm": 2.1327083110809326, "learning_rate": 4.999427788372461e-06, "loss": 1.1994, "step": 177 }, { "epoch": 0.09363492898474488, "grad_norm": 2.146256923675537, "learning_rate": 4.999412829807735e-06, "loss": 1.1387, "step": 178 }, { "epoch": 0.09416096791162545, "grad_norm": 2.377356767654419, "learning_rate": 4.999397678259689e-06, "loss": 1.1901, "step": 179 }, { "epoch": 0.09468700683850605, "grad_norm": 2.192535638809204, "learning_rate": 4.999382333729492e-06, "loss": 1.2079, "step": 180 }, { "epoch": 0.09521304576538664, "grad_norm": 2.0958621501922607, "learning_rate": 4.999366796218329e-06, "loss": 1.1663, "step": 181 }, { "epoch": 0.09573908469226723, "grad_norm": 2.1492772102355957, "learning_rate": 4.9993510657274e-06, "loss": 1.1877, "step": 182 }, { "epoch": 0.09626512361914781, "grad_norm": 2.366111993789673, "learning_rate": 4.999335142257919e-06, "loss": 1.1849, "step": 183 }, { "epoch": 0.0967911625460284, "grad_norm": 2.144526243209839, "learning_rate": 4.999319025811116e-06, "loss": 1.1739, "step": 184 }, { "epoch": 0.097317201472909, "grad_norm": 2.3407647609710693, "learning_rate": 4.999302716388234e-06, "loss": 1.1987, "step": 185 }, { "epoch": 0.09784324039978959, "grad_norm": 2.3771328926086426, "learning_rate": 4.999286213990534e-06, "loss": 1.2024, "step": 186 }, { "epoch": 0.09836927932667017, "grad_norm": 2.2484753131866455, "learning_rate": 4.99926951861929e-06, "loss": 1.2087, "step": 187 }, { "epoch": 0.09889531825355076, "grad_norm": 2.276099681854248, "learning_rate": 4.99925263027579e-06, "loss": 1.1696, "step": 188 }, { "epoch": 0.09942135718043135, "grad_norm": 2.1576876640319824, "learning_rate": 4.999235548961338e-06, "loss": 1.1404, "step": 189 }, { "epoch": 0.09994739610731194, "grad_norm": 2.1412558555603027, "learning_rate": 4.999218274677254e-06, "loss": 1.1279, "step": 190 }, { "epoch": 0.10047343503419252, "grad_norm": 2.1507153511047363, "learning_rate": 4.999200807424871e-06, "loss": 1.1841, "step": 191 }, { "epoch": 0.10099947396107312, "grad_norm": 2.236116886138916, "learning_rate": 4.999183147205538e-06, "loss": 1.208, "step": 192 }, { "epoch": 0.10152551288795371, "grad_norm": 2.1643691062927246, "learning_rate": 4.9991652940206185e-06, "loss": 1.1325, "step": 193 }, { "epoch": 0.1020515518148343, "grad_norm": 2.11639142036438, "learning_rate": 4.999147247871491e-06, "loss": 1.2073, "step": 194 }, { "epoch": 0.10257759074171488, "grad_norm": 1.9682193994522095, "learning_rate": 4.9991290087595475e-06, "loss": 1.1447, "step": 195 }, { "epoch": 0.10310362966859547, "grad_norm": 1.9927830696105957, "learning_rate": 4.9991105766861996e-06, "loss": 1.1694, "step": 196 }, { "epoch": 0.10362966859547607, "grad_norm": 2.0124592781066895, "learning_rate": 4.999091951652867e-06, "loss": 1.152, "step": 197 }, { "epoch": 0.10415570752235666, "grad_norm": 2.1793248653411865, "learning_rate": 4.99907313366099e-06, "loss": 1.228, "step": 198 }, { "epoch": 0.10468174644923724, "grad_norm": 2.1615028381347656, "learning_rate": 4.99905412271202e-06, "loss": 1.2106, "step": 199 }, { "epoch": 0.10520778537611783, "grad_norm": 1.9827650785446167, "learning_rate": 4.999034918807425e-06, "loss": 1.1829, "step": 200 }, { "epoch": 0.10573382430299842, "grad_norm": 2.1772680282592773, "learning_rate": 4.999015521948689e-06, "loss": 1.13, "step": 201 }, { "epoch": 0.10625986322987901, "grad_norm": 2.257385492324829, "learning_rate": 4.99899593213731e-06, "loss": 1.2144, "step": 202 }, { "epoch": 0.1067859021567596, "grad_norm": 2.104809045791626, "learning_rate": 4.998976149374799e-06, "loss": 1.1715, "step": 203 }, { "epoch": 0.10731194108364019, "grad_norm": 2.116504430770874, "learning_rate": 4.998956173662683e-06, "loss": 1.1442, "step": 204 }, { "epoch": 0.10783798001052078, "grad_norm": 2.2018845081329346, "learning_rate": 4.998936005002507e-06, "loss": 1.1327, "step": 205 }, { "epoch": 0.10836401893740137, "grad_norm": 2.2733311653137207, "learning_rate": 4.998915643395826e-06, "loss": 1.1821, "step": 206 }, { "epoch": 0.10889005786428195, "grad_norm": 2.0005805492401123, "learning_rate": 4.998895088844212e-06, "loss": 1.0955, "step": 207 }, { "epoch": 0.10941609679116254, "grad_norm": 2.0851638317108154, "learning_rate": 4.998874341349253e-06, "loss": 1.1851, "step": 208 }, { "epoch": 0.10994213571804314, "grad_norm": 2.032989501953125, "learning_rate": 4.998853400912552e-06, "loss": 1.1069, "step": 209 }, { "epoch": 0.11046817464492373, "grad_norm": 2.295994520187378, "learning_rate": 4.9988322675357235e-06, "loss": 1.1511, "step": 210 }, { "epoch": 0.1109942135718043, "grad_norm": 1.9963881969451904, "learning_rate": 4.9988109412204015e-06, "loss": 1.1497, "step": 211 }, { "epoch": 0.1115202524986849, "grad_norm": 2.6223835945129395, "learning_rate": 4.998789421968231e-06, "loss": 1.1692, "step": 212 }, { "epoch": 0.11204629142556549, "grad_norm": 2.1924188137054443, "learning_rate": 4.998767709780873e-06, "loss": 1.1659, "step": 213 }, { "epoch": 0.11257233035244608, "grad_norm": 2.4124836921691895, "learning_rate": 4.998745804660005e-06, "loss": 1.1965, "step": 214 }, { "epoch": 0.11309836927932668, "grad_norm": 2.15348482131958, "learning_rate": 4.99872370660732e-06, "loss": 1.1337, "step": 215 }, { "epoch": 0.11362440820620726, "grad_norm": 2.3462562561035156, "learning_rate": 4.9987014156245215e-06, "loss": 1.1793, "step": 216 }, { "epoch": 0.11415044713308785, "grad_norm": 2.1864969730377197, "learning_rate": 4.998678931713331e-06, "loss": 1.1139, "step": 217 }, { "epoch": 0.11467648605996844, "grad_norm": 2.1411378383636475, "learning_rate": 4.998656254875486e-06, "loss": 1.1582, "step": 218 }, { "epoch": 0.11520252498684903, "grad_norm": 2.2826247215270996, "learning_rate": 4.998633385112737e-06, "loss": 1.1779, "step": 219 }, { "epoch": 0.11572856391372961, "grad_norm": 2.0697169303894043, "learning_rate": 4.998610322426848e-06, "loss": 1.1775, "step": 220 }, { "epoch": 0.1162546028406102, "grad_norm": 2.153381824493408, "learning_rate": 4.998587066819602e-06, "loss": 1.2244, "step": 221 }, { "epoch": 0.1167806417674908, "grad_norm": 2.151595115661621, "learning_rate": 4.998563618292793e-06, "loss": 1.1562, "step": 222 }, { "epoch": 0.11730668069437139, "grad_norm": 2.1102607250213623, "learning_rate": 4.998539976848233e-06, "loss": 1.1326, "step": 223 }, { "epoch": 0.11783271962125197, "grad_norm": 2.3099205493927, "learning_rate": 4.998516142487746e-06, "loss": 1.1934, "step": 224 }, { "epoch": 0.11835875854813256, "grad_norm": 2.0830485820770264, "learning_rate": 4.998492115213173e-06, "loss": 1.105, "step": 225 }, { "epoch": 0.11888479747501315, "grad_norm": 1.965256929397583, "learning_rate": 4.998467895026369e-06, "loss": 1.1496, "step": 226 }, { "epoch": 0.11941083640189375, "grad_norm": 2.060734272003174, "learning_rate": 4.9984434819292036e-06, "loss": 1.1256, "step": 227 }, { "epoch": 0.11993687532877433, "grad_norm": 2.278106927871704, "learning_rate": 4.998418875923563e-06, "loss": 1.1557, "step": 228 }, { "epoch": 0.12046291425565492, "grad_norm": 2.562490463256836, "learning_rate": 4.998394077011346e-06, "loss": 1.1579, "step": 229 }, { "epoch": 0.12098895318253551, "grad_norm": 2.20798921585083, "learning_rate": 4.998369085194468e-06, "loss": 1.181, "step": 230 }, { "epoch": 0.1215149921094161, "grad_norm": 2.3529961109161377, "learning_rate": 4.998343900474858e-06, "loss": 1.1514, "step": 231 }, { "epoch": 0.12204103103629668, "grad_norm": 2.2413651943206787, "learning_rate": 4.998318522854461e-06, "loss": 1.1317, "step": 232 }, { "epoch": 0.12256706996317727, "grad_norm": 2.2179031372070312, "learning_rate": 4.998292952335236e-06, "loss": 1.1784, "step": 233 }, { "epoch": 0.12309310889005787, "grad_norm": 2.2591211795806885, "learning_rate": 4.998267188919158e-06, "loss": 1.1587, "step": 234 }, { "epoch": 0.12361914781693846, "grad_norm": 2.4820573329925537, "learning_rate": 4.998241232608216e-06, "loss": 1.1448, "step": 235 }, { "epoch": 0.12414518674381904, "grad_norm": 2.202066659927368, "learning_rate": 4.998215083404414e-06, "loss": 1.1859, "step": 236 }, { "epoch": 0.12467122567069963, "grad_norm": 2.246918201446533, "learning_rate": 4.9981887413097705e-06, "loss": 1.1778, "step": 237 }, { "epoch": 0.1251972645975802, "grad_norm": 2.166926145553589, "learning_rate": 4.9981622063263205e-06, "loss": 1.16, "step": 238 }, { "epoch": 0.12572330352446082, "grad_norm": 2.2850661277770996, "learning_rate": 4.998135478456112e-06, "loss": 1.1522, "step": 239 }, { "epoch": 0.1262493424513414, "grad_norm": 2.1694653034210205, "learning_rate": 4.9981085577012095e-06, "loss": 1.1394, "step": 240 }, { "epoch": 0.126775381378222, "grad_norm": 2.061791181564331, "learning_rate": 4.998081444063691e-06, "loss": 1.1551, "step": 241 }, { "epoch": 0.12730142030510258, "grad_norm": 2.1517114639282227, "learning_rate": 4.998054137545649e-06, "loss": 1.1487, "step": 242 }, { "epoch": 0.12782745923198316, "grad_norm": 2.118903398513794, "learning_rate": 4.9980266381491935e-06, "loss": 1.1871, "step": 243 }, { "epoch": 0.12835349815886377, "grad_norm": 2.271512508392334, "learning_rate": 4.997998945876448e-06, "loss": 1.21, "step": 244 }, { "epoch": 0.12887953708574434, "grad_norm": 2.199542760848999, "learning_rate": 4.997971060729549e-06, "loss": 1.17, "step": 245 }, { "epoch": 0.12940557601262492, "grad_norm": 2.213566303253174, "learning_rate": 4.997942982710651e-06, "loss": 1.1521, "step": 246 }, { "epoch": 0.12993161493950553, "grad_norm": 2.291456699371338, "learning_rate": 4.997914711821921e-06, "loss": 1.1671, "step": 247 }, { "epoch": 0.1304576538663861, "grad_norm": 2.017871856689453, "learning_rate": 4.997886248065542e-06, "loss": 1.1522, "step": 248 }, { "epoch": 0.13098369279326671, "grad_norm": 2.1125521659851074, "learning_rate": 4.9978575914437115e-06, "loss": 1.1335, "step": 249 }, { "epoch": 0.1315097317201473, "grad_norm": 2.262874126434326, "learning_rate": 4.997828741958643e-06, "loss": 1.1697, "step": 250 }, { "epoch": 0.13203577064702787, "grad_norm": 2.450192451477051, "learning_rate": 4.997799699612563e-06, "loss": 1.1329, "step": 251 }, { "epoch": 0.13256180957390848, "grad_norm": 2.0831351280212402, "learning_rate": 4.997770464407715e-06, "loss": 1.1711, "step": 252 }, { "epoch": 0.13308784850078906, "grad_norm": 2.2078895568847656, "learning_rate": 4.997741036346357e-06, "loss": 1.1998, "step": 253 }, { "epoch": 0.13361388742766964, "grad_norm": 2.175858497619629, "learning_rate": 4.997711415430759e-06, "loss": 1.1083, "step": 254 }, { "epoch": 0.13413992635455024, "grad_norm": 2.203817129135132, "learning_rate": 4.997681601663207e-06, "loss": 1.088, "step": 255 }, { "epoch": 0.13466596528143082, "grad_norm": 2.0065557956695557, "learning_rate": 4.997651595046007e-06, "loss": 1.1584, "step": 256 }, { "epoch": 0.13519200420831143, "grad_norm": 2.299633264541626, "learning_rate": 4.997621395581474e-06, "loss": 1.2102, "step": 257 }, { "epoch": 0.135718043135192, "grad_norm": 2.2972707748413086, "learning_rate": 4.997591003271938e-06, "loss": 1.1821, "step": 258 }, { "epoch": 0.13624408206207259, "grad_norm": 2.399705171585083, "learning_rate": 4.997560418119749e-06, "loss": 1.1325, "step": 259 }, { "epoch": 0.1367701209889532, "grad_norm": 2.2461678981781006, "learning_rate": 4.997529640127266e-06, "loss": 1.2361, "step": 260 }, { "epoch": 0.13729615991583377, "grad_norm": 2.236917495727539, "learning_rate": 4.997498669296865e-06, "loss": 1.1159, "step": 261 }, { "epoch": 0.13782219884271435, "grad_norm": 2.2851338386535645, "learning_rate": 4.99746750563094e-06, "loss": 1.1688, "step": 262 }, { "epoch": 0.13834823776959496, "grad_norm": 2.1499626636505127, "learning_rate": 4.997436149131894e-06, "loss": 1.1478, "step": 263 }, { "epoch": 0.13887427669647553, "grad_norm": 2.0969858169555664, "learning_rate": 4.997404599802151e-06, "loss": 1.1102, "step": 264 }, { "epoch": 0.13940031562335614, "grad_norm": 2.5635933876037598, "learning_rate": 4.997372857644146e-06, "loss": 1.1173, "step": 265 }, { "epoch": 0.13992635455023672, "grad_norm": 2.1076197624206543, "learning_rate": 4.997340922660329e-06, "loss": 1.1321, "step": 266 }, { "epoch": 0.1404523934771173, "grad_norm": 2.179189443588257, "learning_rate": 4.997308794853165e-06, "loss": 1.1325, "step": 267 }, { "epoch": 0.1409784324039979, "grad_norm": 2.0838067531585693, "learning_rate": 4.9972764742251375e-06, "loss": 1.1243, "step": 268 }, { "epoch": 0.14150447133087848, "grad_norm": 2.1462979316711426, "learning_rate": 4.9972439607787405e-06, "loss": 1.1251, "step": 269 }, { "epoch": 0.14203051025775906, "grad_norm": 2.144658088684082, "learning_rate": 4.997211254516484e-06, "loss": 1.1879, "step": 270 }, { "epoch": 0.14255654918463967, "grad_norm": 2.118098020553589, "learning_rate": 4.997178355440892e-06, "loss": 1.1635, "step": 271 }, { "epoch": 0.14308258811152025, "grad_norm": 2.284640312194824, "learning_rate": 4.99714526355451e-06, "loss": 1.1181, "step": 272 }, { "epoch": 0.14360862703840085, "grad_norm": 2.2020652294158936, "learning_rate": 4.997111978859886e-06, "loss": 1.1234, "step": 273 }, { "epoch": 0.14413466596528143, "grad_norm": 2.164998769760132, "learning_rate": 4.997078501359595e-06, "loss": 1.1723, "step": 274 }, { "epoch": 0.144660704892162, "grad_norm": 2.1917877197265625, "learning_rate": 4.9970448310562196e-06, "loss": 1.1222, "step": 275 }, { "epoch": 0.14518674381904262, "grad_norm": 2.314770221710205, "learning_rate": 4.99701096795236e-06, "loss": 1.183, "step": 276 }, { "epoch": 0.1457127827459232, "grad_norm": 2.217176675796509, "learning_rate": 4.996976912050632e-06, "loss": 1.1509, "step": 277 }, { "epoch": 0.14623882167280378, "grad_norm": 2.253232002258301, "learning_rate": 4.996942663353663e-06, "loss": 1.1733, "step": 278 }, { "epoch": 0.14676486059968438, "grad_norm": 2.091414213180542, "learning_rate": 4.996908221864099e-06, "loss": 1.1479, "step": 279 }, { "epoch": 0.14729089952656496, "grad_norm": 2.391035556793213, "learning_rate": 4.996873587584599e-06, "loss": 1.1646, "step": 280 }, { "epoch": 0.14781693845344557, "grad_norm": 1.941179871559143, "learning_rate": 4.996838760517836e-06, "loss": 1.1362, "step": 281 }, { "epoch": 0.14834297738032615, "grad_norm": 2.3869614601135254, "learning_rate": 4.9968037406665e-06, "loss": 1.1455, "step": 282 }, { "epoch": 0.14886901630720673, "grad_norm": 2.2253477573394775, "learning_rate": 4.9967685280332955e-06, "loss": 1.1934, "step": 283 }, { "epoch": 0.14939505523408733, "grad_norm": 2.235481023788452, "learning_rate": 4.99673312262094e-06, "loss": 1.1457, "step": 284 }, { "epoch": 0.1499210941609679, "grad_norm": 2.1756770610809326, "learning_rate": 4.996697524432169e-06, "loss": 1.1874, "step": 285 }, { "epoch": 0.1504471330878485, "grad_norm": 1.9890838861465454, "learning_rate": 4.99666173346973e-06, "loss": 1.1381, "step": 286 }, { "epoch": 0.1509731720147291, "grad_norm": 2.032940149307251, "learning_rate": 4.996625749736386e-06, "loss": 1.1408, "step": 287 }, { "epoch": 0.15149921094160967, "grad_norm": 2.38653564453125, "learning_rate": 4.996589573234915e-06, "loss": 1.1137, "step": 288 }, { "epoch": 0.15202524986849028, "grad_norm": 2.5009000301361084, "learning_rate": 4.9965532039681116e-06, "loss": 1.1404, "step": 289 }, { "epoch": 0.15255128879537086, "grad_norm": 2.113600969314575, "learning_rate": 4.996516641938784e-06, "loss": 1.0764, "step": 290 }, { "epoch": 0.15307732772225144, "grad_norm": 2.2645368576049805, "learning_rate": 4.996479887149754e-06, "loss": 1.1499, "step": 291 }, { "epoch": 0.15360336664913204, "grad_norm": 2.015124559402466, "learning_rate": 4.99644293960386e-06, "loss": 1.0487, "step": 292 }, { "epoch": 0.15412940557601262, "grad_norm": 2.121588706970215, "learning_rate": 4.996405799303955e-06, "loss": 1.1119, "step": 293 }, { "epoch": 0.1546554445028932, "grad_norm": 2.3707003593444824, "learning_rate": 4.996368466252907e-06, "loss": 1.1797, "step": 294 }, { "epoch": 0.1551814834297738, "grad_norm": 2.3027000427246094, "learning_rate": 4.996330940453598e-06, "loss": 1.1228, "step": 295 }, { "epoch": 0.1557075223566544, "grad_norm": 2.0909178256988525, "learning_rate": 4.996293221908925e-06, "loss": 1.0932, "step": 296 }, { "epoch": 0.156233561283535, "grad_norm": 2.362823486328125, "learning_rate": 4.996255310621801e-06, "loss": 1.1507, "step": 297 }, { "epoch": 0.15675960021041557, "grad_norm": 2.080667495727539, "learning_rate": 4.996217206595153e-06, "loss": 1.1158, "step": 298 }, { "epoch": 0.15728563913729615, "grad_norm": 2.0508742332458496, "learning_rate": 4.996178909831922e-06, "loss": 1.1326, "step": 299 }, { "epoch": 0.15781167806417676, "grad_norm": 2.1632707118988037, "learning_rate": 4.996140420335068e-06, "loss": 1.0946, "step": 300 }, { "epoch": 0.15833771699105734, "grad_norm": 1.9084789752960205, "learning_rate": 4.996101738107559e-06, "loss": 1.0939, "step": 301 }, { "epoch": 0.15886375591793792, "grad_norm": 1.9817906618118286, "learning_rate": 4.996062863152385e-06, "loss": 1.1013, "step": 302 }, { "epoch": 0.15938979484481852, "grad_norm": 1.9947365522384644, "learning_rate": 4.9960237954725446e-06, "loss": 1.0635, "step": 303 }, { "epoch": 0.1599158337716991, "grad_norm": 2.0908870697021484, "learning_rate": 4.995984535071056e-06, "loss": 1.0914, "step": 304 }, { "epoch": 0.1604418726985797, "grad_norm": 2.1920530796051025, "learning_rate": 4.995945081950952e-06, "loss": 1.1816, "step": 305 }, { "epoch": 0.16096791162546029, "grad_norm": 2.250007152557373, "learning_rate": 4.995905436115276e-06, "loss": 1.1543, "step": 306 }, { "epoch": 0.16149395055234086, "grad_norm": 2.3157906532287598, "learning_rate": 4.995865597567091e-06, "loss": 1.1349, "step": 307 }, { "epoch": 0.16201998947922147, "grad_norm": 2.816443681716919, "learning_rate": 4.995825566309471e-06, "loss": 1.1154, "step": 308 }, { "epoch": 0.16254602840610205, "grad_norm": 2.3194282054901123, "learning_rate": 4.995785342345509e-06, "loss": 1.1547, "step": 309 }, { "epoch": 0.16307206733298263, "grad_norm": 2.1249098777770996, "learning_rate": 4.99574492567831e-06, "loss": 1.0995, "step": 310 }, { "epoch": 0.16359810625986324, "grad_norm": 2.100315809249878, "learning_rate": 4.995704316310994e-06, "loss": 1.1662, "step": 311 }, { "epoch": 0.16412414518674381, "grad_norm": 2.1664323806762695, "learning_rate": 4.995663514246697e-06, "loss": 1.1466, "step": 312 }, { "epoch": 0.16465018411362442, "grad_norm": 2.217438220977783, "learning_rate": 4.9956225194885704e-06, "loss": 1.1908, "step": 313 }, { "epoch": 0.165176223040505, "grad_norm": 2.3328514099121094, "learning_rate": 4.995581332039778e-06, "loss": 1.0809, "step": 314 }, { "epoch": 0.16570226196738558, "grad_norm": 2.088467836380005, "learning_rate": 4.9955399519035e-06, "loss": 1.0908, "step": 315 }, { "epoch": 0.16622830089426618, "grad_norm": 2.2554612159729004, "learning_rate": 4.995498379082932e-06, "loss": 1.1702, "step": 316 }, { "epoch": 0.16675433982114676, "grad_norm": 2.2798142433166504, "learning_rate": 4.995456613581284e-06, "loss": 1.107, "step": 317 }, { "epoch": 0.16728037874802734, "grad_norm": 2.4394755363464355, "learning_rate": 4.9954146554017816e-06, "loss": 1.0881, "step": 318 }, { "epoch": 0.16780641767490795, "grad_norm": 2.1176295280456543, "learning_rate": 4.995372504547662e-06, "loss": 1.1177, "step": 319 }, { "epoch": 0.16833245660178853, "grad_norm": 2.141923189163208, "learning_rate": 4.995330161022181e-06, "loss": 1.1321, "step": 320 }, { "epoch": 0.16885849552866913, "grad_norm": 2.273068428039551, "learning_rate": 4.9952876248286086e-06, "loss": 1.1832, "step": 321 }, { "epoch": 0.1693845344555497, "grad_norm": 2.267636299133301, "learning_rate": 4.995244895970228e-06, "loss": 1.1058, "step": 322 }, { "epoch": 0.1699105733824303, "grad_norm": 2.133772850036621, "learning_rate": 4.99520197445034e-06, "loss": 1.1478, "step": 323 }, { "epoch": 0.1704366123093109, "grad_norm": 2.2782862186431885, "learning_rate": 4.995158860272257e-06, "loss": 1.1074, "step": 324 }, { "epoch": 0.17096265123619148, "grad_norm": 2.544316053390503, "learning_rate": 4.995115553439308e-06, "loss": 1.0583, "step": 325 }, { "epoch": 0.17148869016307206, "grad_norm": 2.2900187969207764, "learning_rate": 4.995072053954838e-06, "loss": 1.1933, "step": 326 }, { "epoch": 0.17201472908995266, "grad_norm": 2.190380811691284, "learning_rate": 4.995028361822206e-06, "loss": 1.135, "step": 327 }, { "epoch": 0.17254076801683324, "grad_norm": 2.4495794773101807, "learning_rate": 4.9949844770447834e-06, "loss": 1.1214, "step": 328 }, { "epoch": 0.17306680694371385, "grad_norm": 2.332644462585449, "learning_rate": 4.994940399625959e-06, "loss": 1.1017, "step": 329 }, { "epoch": 0.17359284587059443, "grad_norm": 2.0709457397460938, "learning_rate": 4.994896129569138e-06, "loss": 1.1073, "step": 330 }, { "epoch": 0.174118884797475, "grad_norm": 2.8817923069000244, "learning_rate": 4.994851666877736e-06, "loss": 1.0758, "step": 331 }, { "epoch": 0.1746449237243556, "grad_norm": 2.2557790279388428, "learning_rate": 4.994807011555189e-06, "loss": 1.173, "step": 332 }, { "epoch": 0.1751709626512362, "grad_norm": 2.2412662506103516, "learning_rate": 4.994762163604942e-06, "loss": 1.1357, "step": 333 }, { "epoch": 0.17569700157811677, "grad_norm": 2.1749277114868164, "learning_rate": 4.9947171230304595e-06, "loss": 1.0988, "step": 334 }, { "epoch": 0.17622304050499737, "grad_norm": 2.4530062675476074, "learning_rate": 4.994671889835218e-06, "loss": 1.1377, "step": 335 }, { "epoch": 0.17674907943187795, "grad_norm": 2.2602410316467285, "learning_rate": 4.994626464022711e-06, "loss": 1.0799, "step": 336 }, { "epoch": 0.17727511835875856, "grad_norm": 2.0797061920166016, "learning_rate": 4.994580845596446e-06, "loss": 1.1214, "step": 337 }, { "epoch": 0.17780115728563914, "grad_norm": 2.1437630653381348, "learning_rate": 4.994535034559945e-06, "loss": 1.1794, "step": 338 }, { "epoch": 0.17832719621251972, "grad_norm": 2.0809285640716553, "learning_rate": 4.994489030916745e-06, "loss": 1.1331, "step": 339 }, { "epoch": 0.17885323513940032, "grad_norm": 2.31193208694458, "learning_rate": 4.994442834670397e-06, "loss": 1.1425, "step": 340 }, { "epoch": 0.1793792740662809, "grad_norm": 2.0348451137542725, "learning_rate": 4.99439644582447e-06, "loss": 1.1149, "step": 341 }, { "epoch": 0.17990531299316148, "grad_norm": 2.2816810607910156, "learning_rate": 4.994349864382544e-06, "loss": 1.1509, "step": 342 }, { "epoch": 0.1804313519200421, "grad_norm": 2.08492374420166, "learning_rate": 4.994303090348217e-06, "loss": 1.0854, "step": 343 }, { "epoch": 0.18095739084692267, "grad_norm": 2.0389866828918457, "learning_rate": 4.994256123725098e-06, "loss": 1.1195, "step": 344 }, { "epoch": 0.18148342977380327, "grad_norm": 2.2040510177612305, "learning_rate": 4.9942089645168175e-06, "loss": 1.1112, "step": 345 }, { "epoch": 0.18200946870068385, "grad_norm": 2.058849811553955, "learning_rate": 4.994161612727013e-06, "loss": 1.1462, "step": 346 }, { "epoch": 0.18253550762756443, "grad_norm": 2.2940948009490967, "learning_rate": 4.994114068359343e-06, "loss": 1.2183, "step": 347 }, { "epoch": 0.18306154655444504, "grad_norm": 2.0303874015808105, "learning_rate": 4.9940663314174756e-06, "loss": 1.1136, "step": 348 }, { "epoch": 0.18358758548132562, "grad_norm": 2.208289861679077, "learning_rate": 4.9940184019051e-06, "loss": 1.1507, "step": 349 }, { "epoch": 0.1841136244082062, "grad_norm": 2.438228130340576, "learning_rate": 4.993970279825915e-06, "loss": 1.1619, "step": 350 }, { "epoch": 0.1846396633350868, "grad_norm": 2.1701645851135254, "learning_rate": 4.993921965183636e-06, "loss": 1.1057, "step": 351 }, { "epoch": 0.18516570226196738, "grad_norm": 2.345054864883423, "learning_rate": 4.9938734579819944e-06, "loss": 1.1758, "step": 352 }, { "epoch": 0.185691741188848, "grad_norm": 2.3761768341064453, "learning_rate": 4.9938247582247345e-06, "loss": 1.1093, "step": 353 }, { "epoch": 0.18621778011572857, "grad_norm": 2.2209126949310303, "learning_rate": 4.993775865915618e-06, "loss": 1.0882, "step": 354 }, { "epoch": 0.18674381904260914, "grad_norm": 2.093406915664673, "learning_rate": 4.993726781058419e-06, "loss": 1.1621, "step": 355 }, { "epoch": 0.18726985796948975, "grad_norm": 2.509725332260132, "learning_rate": 4.993677503656927e-06, "loss": 1.1411, "step": 356 }, { "epoch": 0.18779589689637033, "grad_norm": 2.2245242595672607, "learning_rate": 4.993628033714947e-06, "loss": 1.1042, "step": 357 }, { "epoch": 0.1883219358232509, "grad_norm": 1.838408350944519, "learning_rate": 4.9935783712363e-06, "loss": 1.0204, "step": 358 }, { "epoch": 0.18884797475013151, "grad_norm": 2.0559537410736084, "learning_rate": 4.993528516224818e-06, "loss": 1.0681, "step": 359 }, { "epoch": 0.1893740136770121, "grad_norm": 2.084890604019165, "learning_rate": 4.993478468684352e-06, "loss": 1.1149, "step": 360 }, { "epoch": 0.1899000526038927, "grad_norm": 2.179478168487549, "learning_rate": 4.993428228618767e-06, "loss": 1.1342, "step": 361 }, { "epoch": 0.19042609153077328, "grad_norm": 2.082578182220459, "learning_rate": 4.99337779603194e-06, "loss": 1.1293, "step": 362 }, { "epoch": 0.19095213045765386, "grad_norm": 2.031831979751587, "learning_rate": 4.993327170927766e-06, "loss": 1.0728, "step": 363 }, { "epoch": 0.19147816938453446, "grad_norm": 2.1939597129821777, "learning_rate": 4.993276353310155e-06, "loss": 1.1252, "step": 364 }, { "epoch": 0.19200420831141504, "grad_norm": 2.031350612640381, "learning_rate": 4.9932253431830295e-06, "loss": 1.1039, "step": 365 }, { "epoch": 0.19253024723829562, "grad_norm": 2.3367671966552734, "learning_rate": 4.993174140550327e-06, "loss": 1.1211, "step": 366 }, { "epoch": 0.19305628616517623, "grad_norm": 2.2768945693969727, "learning_rate": 4.993122745416003e-06, "loss": 1.1119, "step": 367 }, { "epoch": 0.1935823250920568, "grad_norm": 2.220766544342041, "learning_rate": 4.993071157784025e-06, "loss": 1.1451, "step": 368 }, { "epoch": 0.1941083640189374, "grad_norm": 2.3694369792938232, "learning_rate": 4.993019377658376e-06, "loss": 1.1156, "step": 369 }, { "epoch": 0.194634402945818, "grad_norm": 2.245237350463867, "learning_rate": 4.9929674050430535e-06, "loss": 1.1316, "step": 370 }, { "epoch": 0.19516044187269857, "grad_norm": 2.720625400543213, "learning_rate": 4.992915239942071e-06, "loss": 1.1092, "step": 371 }, { "epoch": 0.19568648079957918, "grad_norm": 2.115727424621582, "learning_rate": 4.992862882359457e-06, "loss": 1.1769, "step": 372 }, { "epoch": 0.19621251972645976, "grad_norm": 2.235677480697632, "learning_rate": 4.992810332299253e-06, "loss": 1.1786, "step": 373 }, { "epoch": 0.19673855865334033, "grad_norm": 2.539433002471924, "learning_rate": 4.992757589765516e-06, "loss": 1.1251, "step": 374 }, { "epoch": 0.19726459758022094, "grad_norm": 5.042508602142334, "learning_rate": 4.99270465476232e-06, "loss": 1.0706, "step": 375 }, { "epoch": 0.19779063650710152, "grad_norm": 2.1171703338623047, "learning_rate": 4.9926515272937516e-06, "loss": 1.1287, "step": 376 }, { "epoch": 0.19831667543398213, "grad_norm": 2.4587223529815674, "learning_rate": 4.992598207363912e-06, "loss": 1.053, "step": 377 }, { "epoch": 0.1988427143608627, "grad_norm": 2.1502695083618164, "learning_rate": 4.9925446949769184e-06, "loss": 1.0837, "step": 378 }, { "epoch": 0.19936875328774328, "grad_norm": 2.139822483062744, "learning_rate": 4.992490990136903e-06, "loss": 1.1358, "step": 379 }, { "epoch": 0.1998947922146239, "grad_norm": 2.4914610385894775, "learning_rate": 4.992437092848012e-06, "loss": 1.1053, "step": 380 }, { "epoch": 0.20042083114150447, "grad_norm": 2.24576735496521, "learning_rate": 4.992383003114408e-06, "loss": 1.1034, "step": 381 }, { "epoch": 0.20094687006838505, "grad_norm": 2.1979477405548096, "learning_rate": 4.992328720940266e-06, "loss": 1.0839, "step": 382 }, { "epoch": 0.20147290899526565, "grad_norm": 2.1680850982666016, "learning_rate": 4.992274246329778e-06, "loss": 1.1011, "step": 383 }, { "epoch": 0.20199894792214623, "grad_norm": 2.3214027881622314, "learning_rate": 4.9922195792871495e-06, "loss": 1.03, "step": 384 }, { "epoch": 0.20252498684902684, "grad_norm": 2.162393808364868, "learning_rate": 4.9921647198166014e-06, "loss": 1.0466, "step": 385 }, { "epoch": 0.20305102577590742, "grad_norm": 2.184163808822632, "learning_rate": 4.99210966792237e-06, "loss": 1.1379, "step": 386 }, { "epoch": 0.203577064702788, "grad_norm": 2.3308913707733154, "learning_rate": 4.992054423608706e-06, "loss": 1.1751, "step": 387 }, { "epoch": 0.2041031036296686, "grad_norm": 2.123298168182373, "learning_rate": 4.991998986879874e-06, "loss": 1.1079, "step": 388 }, { "epoch": 0.20462914255654918, "grad_norm": 2.229844331741333, "learning_rate": 4.991943357740155e-06, "loss": 1.1242, "step": 389 }, { "epoch": 0.20515518148342976, "grad_norm": 2.1815683841705322, "learning_rate": 4.991887536193845e-06, "loss": 1.0949, "step": 390 }, { "epoch": 0.20568122041031037, "grad_norm": 2.4636261463165283, "learning_rate": 4.991831522245253e-06, "loss": 1.1118, "step": 391 }, { "epoch": 0.20620725933719095, "grad_norm": 2.0095014572143555, "learning_rate": 4.991775315898703e-06, "loss": 1.0197, "step": 392 }, { "epoch": 0.20673329826407155, "grad_norm": 2.1244406700134277, "learning_rate": 4.991718917158538e-06, "loss": 1.1081, "step": 393 }, { "epoch": 0.20725933719095213, "grad_norm": 1.9773920774459839, "learning_rate": 4.991662326029109e-06, "loss": 1.0657, "step": 394 }, { "epoch": 0.2077853761178327, "grad_norm": 2.204554796218872, "learning_rate": 4.9916055425147874e-06, "loss": 1.1434, "step": 395 }, { "epoch": 0.20831141504471332, "grad_norm": 2.068147659301758, "learning_rate": 4.991548566619957e-06, "loss": 1.1281, "step": 396 }, { "epoch": 0.2088374539715939, "grad_norm": 2.1518101692199707, "learning_rate": 4.991491398349017e-06, "loss": 1.0977, "step": 397 }, { "epoch": 0.20936349289847447, "grad_norm": 2.091654062271118, "learning_rate": 4.991434037706382e-06, "loss": 1.1033, "step": 398 }, { "epoch": 0.20988953182535508, "grad_norm": 2.8754067420959473, "learning_rate": 4.9913764846964805e-06, "loss": 1.1237, "step": 399 }, { "epoch": 0.21041557075223566, "grad_norm": 2.2165675163269043, "learning_rate": 4.991318739323757e-06, "loss": 1.1298, "step": 400 }, { "epoch": 0.21094160967911627, "grad_norm": 2.1219065189361572, "learning_rate": 4.991260801592668e-06, "loss": 1.0795, "step": 401 }, { "epoch": 0.21146764860599684, "grad_norm": 2.132737159729004, "learning_rate": 4.9912026715076885e-06, "loss": 1.0546, "step": 402 }, { "epoch": 0.21199368753287742, "grad_norm": 2.228076457977295, "learning_rate": 4.9911443490733075e-06, "loss": 1.1759, "step": 403 }, { "epoch": 0.21251972645975803, "grad_norm": 2.1305177211761475, "learning_rate": 4.991085834294027e-06, "loss": 1.0865, "step": 404 }, { "epoch": 0.2130457653866386, "grad_norm": 2.1550936698913574, "learning_rate": 4.991027127174365e-06, "loss": 1.1027, "step": 405 }, { "epoch": 0.2135718043135192, "grad_norm": 2.3489346504211426, "learning_rate": 4.990968227718854e-06, "loss": 1.184, "step": 406 }, { "epoch": 0.2140978432403998, "grad_norm": 2.2208189964294434, "learning_rate": 4.9909091359320434e-06, "loss": 1.1476, "step": 407 }, { "epoch": 0.21462388216728037, "grad_norm": 2.230978012084961, "learning_rate": 4.990849851818494e-06, "loss": 1.1125, "step": 408 }, { "epoch": 0.21514992109416098, "grad_norm": 2.294647216796875, "learning_rate": 4.990790375382784e-06, "loss": 1.1526, "step": 409 }, { "epoch": 0.21567596002104156, "grad_norm": 2.160446882247925, "learning_rate": 4.990730706629507e-06, "loss": 1.1569, "step": 410 }, { "epoch": 0.21620199894792214, "grad_norm": 2.1352434158325195, "learning_rate": 4.990670845563268e-06, "loss": 1.049, "step": 411 }, { "epoch": 0.21672803787480274, "grad_norm": 2.0740866661071777, "learning_rate": 4.99061079218869e-06, "loss": 1.104, "step": 412 }, { "epoch": 0.21725407680168332, "grad_norm": 2.302877426147461, "learning_rate": 4.990550546510408e-06, "loss": 1.0942, "step": 413 }, { "epoch": 0.2177801157285639, "grad_norm": 2.270836353302002, "learning_rate": 4.990490108533076e-06, "loss": 1.107, "step": 414 }, { "epoch": 0.2183061546554445, "grad_norm": 2.05703067779541, "learning_rate": 4.99042947826136e-06, "loss": 1.1284, "step": 415 }, { "epoch": 0.21883219358232509, "grad_norm": 2.3524155616760254, "learning_rate": 4.990368655699941e-06, "loss": 1.068, "step": 416 }, { "epoch": 0.2193582325092057, "grad_norm": 2.5300350189208984, "learning_rate": 4.9903076408535145e-06, "loss": 1.0993, "step": 417 }, { "epoch": 0.21988427143608627, "grad_norm": 2.1858162879943848, "learning_rate": 4.990246433726793e-06, "loss": 1.1398, "step": 418 }, { "epoch": 0.22041031036296685, "grad_norm": 1.9856489896774292, "learning_rate": 4.990185034324501e-06, "loss": 1.0671, "step": 419 }, { "epoch": 0.22093634928984746, "grad_norm": 2.177152156829834, "learning_rate": 4.99012344265138e-06, "loss": 1.1673, "step": 420 }, { "epoch": 0.22146238821672803, "grad_norm": 2.128787040710449, "learning_rate": 4.990061658712186e-06, "loss": 1.1629, "step": 421 }, { "epoch": 0.2219884271436086, "grad_norm": 2.1840457916259766, "learning_rate": 4.989999682511688e-06, "loss": 1.0739, "step": 422 }, { "epoch": 0.22251446607048922, "grad_norm": 2.37825608253479, "learning_rate": 4.989937514054673e-06, "loss": 1.1179, "step": 423 }, { "epoch": 0.2230405049973698, "grad_norm": 2.2746498584747314, "learning_rate": 4.98987515334594e-06, "loss": 1.1117, "step": 424 }, { "epoch": 0.2235665439242504, "grad_norm": 2.441087007522583, "learning_rate": 4.989812600390304e-06, "loss": 1.134, "step": 425 }, { "epoch": 0.22409258285113098, "grad_norm": 1.9548932313919067, "learning_rate": 4.989749855192596e-06, "loss": 1.0962, "step": 426 }, { "epoch": 0.22461862177801156, "grad_norm": 2.382025957107544, "learning_rate": 4.989686917757659e-06, "loss": 1.1233, "step": 427 }, { "epoch": 0.22514466070489217, "grad_norm": 2.1739771366119385, "learning_rate": 4.989623788090353e-06, "loss": 1.0665, "step": 428 }, { "epoch": 0.22567069963177275, "grad_norm": 2.3246262073516846, "learning_rate": 4.989560466195553e-06, "loss": 1.0834, "step": 429 }, { "epoch": 0.22619673855865335, "grad_norm": 2.1649882793426514, "learning_rate": 4.9894969520781475e-06, "loss": 1.1144, "step": 430 }, { "epoch": 0.22672277748553393, "grad_norm": 2.307199001312256, "learning_rate": 4.98943324574304e-06, "loss": 1.2195, "step": 431 }, { "epoch": 0.2272488164124145, "grad_norm": 2.2414958477020264, "learning_rate": 4.989369347195151e-06, "loss": 1.0549, "step": 432 }, { "epoch": 0.22777485533929512, "grad_norm": 2.12762713432312, "learning_rate": 4.989305256439413e-06, "loss": 1.1185, "step": 433 }, { "epoch": 0.2283008942661757, "grad_norm": 2.1503520011901855, "learning_rate": 4.989240973480774e-06, "loss": 1.1294, "step": 434 }, { "epoch": 0.22882693319305628, "grad_norm": 2.1283833980560303, "learning_rate": 4.9891764983242e-06, "loss": 1.1154, "step": 435 }, { "epoch": 0.22935297211993688, "grad_norm": 2.239828109741211, "learning_rate": 4.9891118309746666e-06, "loss": 1.073, "step": 436 }, { "epoch": 0.22987901104681746, "grad_norm": 2.396672248840332, "learning_rate": 4.989046971437167e-06, "loss": 1.0916, "step": 437 }, { "epoch": 0.23040504997369807, "grad_norm": 2.1172304153442383, "learning_rate": 4.98898191971671e-06, "loss": 1.1001, "step": 438 }, { "epoch": 0.23093108890057865, "grad_norm": 2.1714346408843994, "learning_rate": 4.98891667581832e-06, "loss": 1.1672, "step": 439 }, { "epoch": 0.23145712782745922, "grad_norm": 2.058523178100586, "learning_rate": 4.98885123974703e-06, "loss": 1.0842, "step": 440 }, { "epoch": 0.23198316675433983, "grad_norm": 2.4147160053253174, "learning_rate": 4.988785611507896e-06, "loss": 1.0755, "step": 441 }, { "epoch": 0.2325092056812204, "grad_norm": 2.274296283721924, "learning_rate": 4.988719791105985e-06, "loss": 1.1141, "step": 442 }, { "epoch": 0.233035244608101, "grad_norm": 2.178182363510132, "learning_rate": 4.988653778546379e-06, "loss": 1.212, "step": 443 }, { "epoch": 0.2335612835349816, "grad_norm": 2.200793743133545, "learning_rate": 4.988587573834173e-06, "loss": 1.0992, "step": 444 }, { "epoch": 0.23408732246186217, "grad_norm": 1.9726881980895996, "learning_rate": 4.98852117697448e-06, "loss": 1.1165, "step": 445 }, { "epoch": 0.23461336138874278, "grad_norm": 2.1173300743103027, "learning_rate": 4.988454587972428e-06, "loss": 1.1162, "step": 446 }, { "epoch": 0.23513940031562336, "grad_norm": 2.1428768634796143, "learning_rate": 4.9883878068331556e-06, "loss": 1.1343, "step": 447 }, { "epoch": 0.23566543924250394, "grad_norm": 2.00190806388855, "learning_rate": 4.988320833561822e-06, "loss": 1.0873, "step": 448 }, { "epoch": 0.23619147816938454, "grad_norm": 2.2472777366638184, "learning_rate": 4.988253668163596e-06, "loss": 1.1209, "step": 449 }, { "epoch": 0.23671751709626512, "grad_norm": 2.0522475242614746, "learning_rate": 4.988186310643666e-06, "loss": 1.0912, "step": 450 }, { "epoch": 0.2372435560231457, "grad_norm": 2.1521215438842773, "learning_rate": 4.98811876100723e-06, "loss": 1.0971, "step": 451 }, { "epoch": 0.2377695949500263, "grad_norm": 2.1117734909057617, "learning_rate": 4.988051019259505e-06, "loss": 1.1247, "step": 452 }, { "epoch": 0.2382956338769069, "grad_norm": 2.1884706020355225, "learning_rate": 4.987983085405722e-06, "loss": 1.1255, "step": 453 }, { "epoch": 0.2388216728037875, "grad_norm": 2.138962984085083, "learning_rate": 4.9879149594511245e-06, "loss": 1.0787, "step": 454 }, { "epoch": 0.23934771173066807, "grad_norm": 2.553452730178833, "learning_rate": 4.987846641400974e-06, "loss": 1.1178, "step": 455 }, { "epoch": 0.23987375065754865, "grad_norm": 2.5340464115142822, "learning_rate": 4.987778131260546e-06, "loss": 1.1577, "step": 456 }, { "epoch": 0.24039978958442926, "grad_norm": 2.2375919818878174, "learning_rate": 4.987709429035128e-06, "loss": 1.0711, "step": 457 }, { "epoch": 0.24092582851130984, "grad_norm": 2.35756254196167, "learning_rate": 4.987640534730027e-06, "loss": 1.1031, "step": 458 }, { "epoch": 0.24145186743819042, "grad_norm": 2.03385591506958, "learning_rate": 4.987571448350561e-06, "loss": 1.0869, "step": 459 }, { "epoch": 0.24197790636507102, "grad_norm": 2.662584066390991, "learning_rate": 4.987502169902065e-06, "loss": 1.0909, "step": 460 }, { "epoch": 0.2425039452919516, "grad_norm": 2.2569165229797363, "learning_rate": 4.987432699389888e-06, "loss": 1.1576, "step": 461 }, { "epoch": 0.2430299842188322, "grad_norm": 1.9718097448349, "learning_rate": 4.987363036819393e-06, "loss": 1.0577, "step": 462 }, { "epoch": 0.24355602314571279, "grad_norm": 2.2083537578582764, "learning_rate": 4.987293182195959e-06, "loss": 1.1328, "step": 463 }, { "epoch": 0.24408206207259336, "grad_norm": 2.2045726776123047, "learning_rate": 4.987223135524981e-06, "loss": 1.0908, "step": 464 }, { "epoch": 0.24460810099947397, "grad_norm": 2.213714122772217, "learning_rate": 4.987152896811866e-06, "loss": 1.124, "step": 465 }, { "epoch": 0.24513413992635455, "grad_norm": 4.030746936798096, "learning_rate": 4.987082466062038e-06, "loss": 1.0855, "step": 466 }, { "epoch": 0.24566017885323513, "grad_norm": 2.1142022609710693, "learning_rate": 4.987011843280934e-06, "loss": 1.1305, "step": 467 }, { "epoch": 0.24618621778011573, "grad_norm": 2.1746232509613037, "learning_rate": 4.986941028474009e-06, "loss": 1.0846, "step": 468 }, { "epoch": 0.2467122567069963, "grad_norm": 2.038947820663452, "learning_rate": 4.986870021646728e-06, "loss": 1.0907, "step": 469 }, { "epoch": 0.24723829563387692, "grad_norm": 12.261099815368652, "learning_rate": 4.986798822804576e-06, "loss": 1.1012, "step": 470 }, { "epoch": 0.2477643345607575, "grad_norm": 2.020077705383301, "learning_rate": 4.986727431953048e-06, "loss": 1.097, "step": 471 }, { "epoch": 0.24829037348763808, "grad_norm": 2.070114850997925, "learning_rate": 4.986655849097658e-06, "loss": 1.175, "step": 472 }, { "epoch": 0.24881641241451868, "grad_norm": 2.0364394187927246, "learning_rate": 4.986584074243932e-06, "loss": 1.0892, "step": 473 }, { "epoch": 0.24934245134139926, "grad_norm": 2.1961004734039307, "learning_rate": 4.986512107397413e-06, "loss": 1.0867, "step": 474 }, { "epoch": 0.24986849026827984, "grad_norm": 3.1488072872161865, "learning_rate": 4.986439948563656e-06, "loss": 1.0276, "step": 475 }, { "epoch": 0.2503945291951604, "grad_norm": 2.3070068359375, "learning_rate": 4.986367597748235e-06, "loss": 1.0897, "step": 476 }, { "epoch": 0.25092056812204105, "grad_norm": 2.0328757762908936, "learning_rate": 4.986295054956733e-06, "loss": 1.0573, "step": 477 }, { "epoch": 0.25144660704892163, "grad_norm": 2.4608747959136963, "learning_rate": 4.986222320194754e-06, "loss": 1.1343, "step": 478 }, { "epoch": 0.2519726459758022, "grad_norm": 2.249994993209839, "learning_rate": 4.986149393467913e-06, "loss": 1.0771, "step": 479 }, { "epoch": 0.2524986849026828, "grad_norm": 2.1573803424835205, "learning_rate": 4.98607627478184e-06, "loss": 1.0795, "step": 480 }, { "epoch": 0.25302472382956337, "grad_norm": 2.6239383220672607, "learning_rate": 4.986002964142182e-06, "loss": 1.0874, "step": 481 }, { "epoch": 0.253550762756444, "grad_norm": 2.0815794467926025, "learning_rate": 4.985929461554597e-06, "loss": 1.0729, "step": 482 }, { "epoch": 0.2540768016833246, "grad_norm": 2.156259059906006, "learning_rate": 4.985855767024763e-06, "loss": 1.0912, "step": 483 }, { "epoch": 0.25460284061020516, "grad_norm": 2.4136252403259277, "learning_rate": 4.985781880558369e-06, "loss": 1.1365, "step": 484 }, { "epoch": 0.25512887953708574, "grad_norm": 2.265622854232788, "learning_rate": 4.98570780216112e-06, "loss": 1.1218, "step": 485 }, { "epoch": 0.2556549184639663, "grad_norm": 2.1097841262817383, "learning_rate": 4.985633531838735e-06, "loss": 1.1238, "step": 486 }, { "epoch": 0.2561809573908469, "grad_norm": 2.205012083053589, "learning_rate": 4.985559069596949e-06, "loss": 1.0664, "step": 487 }, { "epoch": 0.25670699631772753, "grad_norm": 2.1896169185638428, "learning_rate": 4.9854844154415115e-06, "loss": 1.0374, "step": 488 }, { "epoch": 0.2572330352446081, "grad_norm": 2.0652949810028076, "learning_rate": 4.985409569378187e-06, "loss": 1.1016, "step": 489 }, { "epoch": 0.2577590741714887, "grad_norm": 2.1278676986694336, "learning_rate": 4.985334531412754e-06, "loss": 1.147, "step": 490 }, { "epoch": 0.25828511309836927, "grad_norm": 2.2769057750701904, "learning_rate": 4.985259301551005e-06, "loss": 1.1389, "step": 491 }, { "epoch": 0.25881115202524985, "grad_norm": 2.0440104007720947, "learning_rate": 4.985183879798751e-06, "loss": 1.0826, "step": 492 }, { "epoch": 0.2593371909521305, "grad_norm": 2.4153213500976562, "learning_rate": 4.985108266161815e-06, "loss": 1.105, "step": 493 }, { "epoch": 0.25986322987901106, "grad_norm": 2.3863043785095215, "learning_rate": 4.985032460646033e-06, "loss": 1.1023, "step": 494 }, { "epoch": 0.26038926880589164, "grad_norm": 2.2597336769104004, "learning_rate": 4.98495646325726e-06, "loss": 1.1046, "step": 495 }, { "epoch": 0.2609153077327722, "grad_norm": 2.541444778442383, "learning_rate": 4.984880274001364e-06, "loss": 1.1149, "step": 496 }, { "epoch": 0.2614413466596528, "grad_norm": 2.3011064529418945, "learning_rate": 4.984803892884227e-06, "loss": 1.0757, "step": 497 }, { "epoch": 0.26196738558653343, "grad_norm": 2.116774797439575, "learning_rate": 4.9847273199117475e-06, "loss": 1.1151, "step": 498 }, { "epoch": 0.262493424513414, "grad_norm": 2.2372357845306396, "learning_rate": 4.984650555089836e-06, "loss": 1.1107, "step": 499 }, { "epoch": 0.2630194634402946, "grad_norm": 2.0782155990600586, "learning_rate": 4.984573598424421e-06, "loss": 1.1174, "step": 500 }, { "epoch": 0.26354550236717517, "grad_norm": 2.0625476837158203, "learning_rate": 4.984496449921444e-06, "loss": 1.0965, "step": 501 }, { "epoch": 0.26407154129405574, "grad_norm": 2.142184019088745, "learning_rate": 4.9844191095868615e-06, "loss": 1.0678, "step": 502 }, { "epoch": 0.2645975802209363, "grad_norm": 2.1218082904815674, "learning_rate": 4.984341577426646e-06, "loss": 1.0661, "step": 503 }, { "epoch": 0.26512361914781696, "grad_norm": 2.2910757064819336, "learning_rate": 4.984263853446783e-06, "loss": 1.1111, "step": 504 }, { "epoch": 0.26564965807469754, "grad_norm": 2.0604546070098877, "learning_rate": 4.984185937653274e-06, "loss": 1.0614, "step": 505 }, { "epoch": 0.2661756970015781, "grad_norm": 2.1210556030273438, "learning_rate": 4.984107830052134e-06, "loss": 1.0925, "step": 506 }, { "epoch": 0.2667017359284587, "grad_norm": 2.535501003265381, "learning_rate": 4.984029530649396e-06, "loss": 1.1238, "step": 507 }, { "epoch": 0.2672277748553393, "grad_norm": 2.2978546619415283, "learning_rate": 4.9839510394511035e-06, "loss": 1.1615, "step": 508 }, { "epoch": 0.2677538137822199, "grad_norm": 2.0443382263183594, "learning_rate": 4.983872356463318e-06, "loss": 1.1087, "step": 509 }, { "epoch": 0.2682798527091005, "grad_norm": 2.216139316558838, "learning_rate": 4.983793481692114e-06, "loss": 1.1431, "step": 510 }, { "epoch": 0.26880589163598106, "grad_norm": 1.9255571365356445, "learning_rate": 4.983714415143583e-06, "loss": 1.0204, "step": 511 }, { "epoch": 0.26933193056286164, "grad_norm": 2.103969097137451, "learning_rate": 4.9836351568238286e-06, "loss": 1.0855, "step": 512 }, { "epoch": 0.2698579694897422, "grad_norm": 2.5458972454071045, "learning_rate": 4.98355570673897e-06, "loss": 1.0747, "step": 513 }, { "epoch": 0.27038400841662286, "grad_norm": 2.023601531982422, "learning_rate": 4.983476064895143e-06, "loss": 1.0471, "step": 514 }, { "epoch": 0.27091004734350344, "grad_norm": 2.0976908206939697, "learning_rate": 4.983396231298496e-06, "loss": 1.0658, "step": 515 }, { "epoch": 0.271436086270384, "grad_norm": 2.4051074981689453, "learning_rate": 4.9833162059551936e-06, "loss": 1.0624, "step": 516 }, { "epoch": 0.2719621251972646, "grad_norm": 2.0524230003356934, "learning_rate": 4.983235988871414e-06, "loss": 1.1261, "step": 517 }, { "epoch": 0.27248816412414517, "grad_norm": 2.1440162658691406, "learning_rate": 4.983155580053351e-06, "loss": 0.9893, "step": 518 }, { "epoch": 0.27301420305102575, "grad_norm": 2.1923670768737793, "learning_rate": 4.983074979507213e-06, "loss": 1.1066, "step": 519 }, { "epoch": 0.2735402419779064, "grad_norm": 2.2967565059661865, "learning_rate": 4.982994187239225e-06, "loss": 1.1256, "step": 520 }, { "epoch": 0.27406628090478696, "grad_norm": 2.0392587184906006, "learning_rate": 4.982913203255623e-06, "loss": 1.1026, "step": 521 }, { "epoch": 0.27459231983166754, "grad_norm": 2.371121644973755, "learning_rate": 4.9828320275626605e-06, "loss": 1.0607, "step": 522 }, { "epoch": 0.2751183587585481, "grad_norm": 2.082239866256714, "learning_rate": 4.982750660166606e-06, "loss": 1.0749, "step": 523 }, { "epoch": 0.2756443976854287, "grad_norm": 2.2039687633514404, "learning_rate": 4.98266910107374e-06, "loss": 1.0769, "step": 524 }, { "epoch": 0.27617043661230933, "grad_norm": 2.087859869003296, "learning_rate": 4.9825873502903625e-06, "loss": 1.1575, "step": 525 }, { "epoch": 0.2766964755391899, "grad_norm": 2.1991021633148193, "learning_rate": 4.982505407822783e-06, "loss": 1.1149, "step": 526 }, { "epoch": 0.2772225144660705, "grad_norm": 2.2656140327453613, "learning_rate": 4.98242327367733e-06, "loss": 1.0948, "step": 527 }, { "epoch": 0.27774855339295107, "grad_norm": 2.1107430458068848, "learning_rate": 4.982340947860344e-06, "loss": 1.0289, "step": 528 }, { "epoch": 0.27827459231983165, "grad_norm": 2.2510344982147217, "learning_rate": 4.982258430378184e-06, "loss": 1.0694, "step": 529 }, { "epoch": 0.2788006312467123, "grad_norm": 2.252258062362671, "learning_rate": 4.982175721237218e-06, "loss": 1.0435, "step": 530 }, { "epoch": 0.27932667017359286, "grad_norm": 2.12455677986145, "learning_rate": 4.982092820443834e-06, "loss": 1.0202, "step": 531 }, { "epoch": 0.27985270910047344, "grad_norm": 2.3654651641845703, "learning_rate": 4.982009728004433e-06, "loss": 1.1282, "step": 532 }, { "epoch": 0.280378748027354, "grad_norm": 2.3759138584136963, "learning_rate": 4.981926443925431e-06, "loss": 1.1557, "step": 533 }, { "epoch": 0.2809047869542346, "grad_norm": 1.9874821901321411, "learning_rate": 4.981842968213256e-06, "loss": 1.0723, "step": 534 }, { "epoch": 0.2814308258811152, "grad_norm": 2.154383897781372, "learning_rate": 4.981759300874356e-06, "loss": 1.0786, "step": 535 }, { "epoch": 0.2819568648079958, "grad_norm": 2.1774797439575195, "learning_rate": 4.9816754419151906e-06, "loss": 1.0457, "step": 536 }, { "epoch": 0.2824829037348764, "grad_norm": 2.206082820892334, "learning_rate": 4.981591391342233e-06, "loss": 1.0216, "step": 537 }, { "epoch": 0.28300894266175697, "grad_norm": 2.008676528930664, "learning_rate": 4.981507149161975e-06, "loss": 1.0297, "step": 538 }, { "epoch": 0.28353498158863755, "grad_norm": 2.0553462505340576, "learning_rate": 4.981422715380919e-06, "loss": 1.0967, "step": 539 }, { "epoch": 0.2840610205155181, "grad_norm": 2.047567844390869, "learning_rate": 4.981338090005586e-06, "loss": 1.0524, "step": 540 }, { "epoch": 0.28458705944239876, "grad_norm": 2.2144312858581543, "learning_rate": 4.981253273042509e-06, "loss": 1.1178, "step": 541 }, { "epoch": 0.28511309836927934, "grad_norm": 2.388124465942383, "learning_rate": 4.981168264498238e-06, "loss": 1.0728, "step": 542 }, { "epoch": 0.2856391372961599, "grad_norm": 2.152280807495117, "learning_rate": 4.981083064379335e-06, "loss": 1.1146, "step": 543 }, { "epoch": 0.2861651762230405, "grad_norm": 2.1481564044952393, "learning_rate": 4.98099767269238e-06, "loss": 1.1376, "step": 544 }, { "epoch": 0.2866912151499211, "grad_norm": 2.060664415359497, "learning_rate": 4.980912089443966e-06, "loss": 1.0961, "step": 545 }, { "epoch": 0.2872172540768017, "grad_norm": 2.032557964324951, "learning_rate": 4.9808263146406985e-06, "loss": 1.1055, "step": 546 }, { "epoch": 0.2877432930036823, "grad_norm": 2.0957093238830566, "learning_rate": 4.980740348289204e-06, "loss": 1.0444, "step": 547 }, { "epoch": 0.28826933193056287, "grad_norm": 2.0774853229522705, "learning_rate": 4.980654190396118e-06, "loss": 1.0963, "step": 548 }, { "epoch": 0.28879537085744345, "grad_norm": 2.0808207988739014, "learning_rate": 4.980567840968094e-06, "loss": 1.0634, "step": 549 }, { "epoch": 0.289321409784324, "grad_norm": 2.2924559116363525, "learning_rate": 4.980481300011797e-06, "loss": 1.0805, "step": 550 }, { "epoch": 0.2898474487112046, "grad_norm": 2.041088104248047, "learning_rate": 4.980394567533911e-06, "loss": 1.0983, "step": 551 }, { "epoch": 0.29037348763808524, "grad_norm": 2.030073881149292, "learning_rate": 4.980307643541132e-06, "loss": 1.1334, "step": 552 }, { "epoch": 0.2908995265649658, "grad_norm": 2.15849232673645, "learning_rate": 4.980220528040172e-06, "loss": 1.0906, "step": 553 }, { "epoch": 0.2914255654918464, "grad_norm": 2.094135284423828, "learning_rate": 4.9801332210377574e-06, "loss": 1.0644, "step": 554 }, { "epoch": 0.291951604418727, "grad_norm": 2.193941354751587, "learning_rate": 4.980045722540628e-06, "loss": 1.0819, "step": 555 }, { "epoch": 0.29247764334560755, "grad_norm": 2.2015504837036133, "learning_rate": 4.979958032555542e-06, "loss": 1.0759, "step": 556 }, { "epoch": 0.2930036822724882, "grad_norm": 2.1240222454071045, "learning_rate": 4.979870151089267e-06, "loss": 1.1268, "step": 557 }, { "epoch": 0.29352972119936876, "grad_norm": 2.0243959426879883, "learning_rate": 4.9797820781485905e-06, "loss": 1.0449, "step": 558 }, { "epoch": 0.29405576012624934, "grad_norm": 2.2300705909729004, "learning_rate": 4.979693813740313e-06, "loss": 1.0493, "step": 559 }, { "epoch": 0.2945817990531299, "grad_norm": 2.1185836791992188, "learning_rate": 4.979605357871249e-06, "loss": 1.0921, "step": 560 }, { "epoch": 0.2951078379800105, "grad_norm": 2.091691732406616, "learning_rate": 4.979516710548227e-06, "loss": 1.1025, "step": 561 }, { "epoch": 0.29563387690689114, "grad_norm": 2.1666178703308105, "learning_rate": 4.979427871778094e-06, "loss": 1.1245, "step": 562 }, { "epoch": 0.2961599158337717, "grad_norm": 2.6985056400299072, "learning_rate": 4.9793388415677066e-06, "loss": 1.1398, "step": 563 }, { "epoch": 0.2966859547606523, "grad_norm": 2.118074655532837, "learning_rate": 4.979249619923942e-06, "loss": 1.0897, "step": 564 }, { "epoch": 0.29721199368753287, "grad_norm": 2.246856927871704, "learning_rate": 4.979160206853687e-06, "loss": 1.0714, "step": 565 }, { "epoch": 0.29773803261441345, "grad_norm": 2.201953887939453, "learning_rate": 4.979070602363846e-06, "loss": 1.1466, "step": 566 }, { "epoch": 0.29826407154129403, "grad_norm": 2.048617362976074, "learning_rate": 4.9789808064613375e-06, "loss": 1.1368, "step": 567 }, { "epoch": 0.29879011046817466, "grad_norm": 2.1507785320281982, "learning_rate": 4.978890819153095e-06, "loss": 1.1499, "step": 568 }, { "epoch": 0.29931614939505524, "grad_norm": 1.9633440971374512, "learning_rate": 4.978800640446066e-06, "loss": 1.0667, "step": 569 }, { "epoch": 0.2998421883219358, "grad_norm": 2.1089606285095215, "learning_rate": 4.978710270347214e-06, "loss": 1.0611, "step": 570 }, { "epoch": 0.3003682272488164, "grad_norm": 2.170901298522949, "learning_rate": 4.9786197088635145e-06, "loss": 1.1524, "step": 571 }, { "epoch": 0.300894266175697, "grad_norm": 2.165510892868042, "learning_rate": 4.978528956001964e-06, "loss": 1.0987, "step": 572 }, { "epoch": 0.3014203051025776, "grad_norm": 2.0415878295898438, "learning_rate": 4.978438011769565e-06, "loss": 1.1582, "step": 573 }, { "epoch": 0.3019463440294582, "grad_norm": 2.110260248184204, "learning_rate": 4.978346876173342e-06, "loss": 1.0587, "step": 574 }, { "epoch": 0.30247238295633877, "grad_norm": 2.253488063812256, "learning_rate": 4.9782555492203334e-06, "loss": 1.1038, "step": 575 }, { "epoch": 0.30299842188321935, "grad_norm": 2.0166091918945312, "learning_rate": 4.978164030917587e-06, "loss": 1.0367, "step": 576 }, { "epoch": 0.3035244608100999, "grad_norm": 2.2842600345611572, "learning_rate": 4.978072321272171e-06, "loss": 1.0996, "step": 577 }, { "epoch": 0.30405049973698056, "grad_norm": 2.0563907623291016, "learning_rate": 4.977980420291166e-06, "loss": 1.1219, "step": 578 }, { "epoch": 0.30457653866386114, "grad_norm": 2.059800863265991, "learning_rate": 4.977888327981668e-06, "loss": 1.1193, "step": 579 }, { "epoch": 0.3051025775907417, "grad_norm": 2.242919921875, "learning_rate": 4.977796044350788e-06, "loss": 1.0701, "step": 580 }, { "epoch": 0.3056286165176223, "grad_norm": 1.9749282598495483, "learning_rate": 4.977703569405651e-06, "loss": 1.0771, "step": 581 }, { "epoch": 0.3061546554445029, "grad_norm": 2.2251386642456055, "learning_rate": 4.977610903153397e-06, "loss": 1.084, "step": 582 }, { "epoch": 0.30668069437138346, "grad_norm": 2.0289855003356934, "learning_rate": 4.97751804560118e-06, "loss": 1.0732, "step": 583 }, { "epoch": 0.3072067332982641, "grad_norm": 2.152841806411743, "learning_rate": 4.977424996756171e-06, "loss": 1.0712, "step": 584 }, { "epoch": 0.30773277222514467, "grad_norm": 2.3243937492370605, "learning_rate": 4.977331756625555e-06, "loss": 1.0197, "step": 585 }, { "epoch": 0.30825881115202525, "grad_norm": 2.293274402618408, "learning_rate": 4.97723832521653e-06, "loss": 1.1121, "step": 586 }, { "epoch": 0.3087848500789058, "grad_norm": 2.139958143234253, "learning_rate": 4.97714470253631e-06, "loss": 1.0799, "step": 587 }, { "epoch": 0.3093108890057864, "grad_norm": 2.269357442855835, "learning_rate": 4.977050888592123e-06, "loss": 1.0872, "step": 588 }, { "epoch": 0.30983692793266704, "grad_norm": 2.268691301345825, "learning_rate": 4.976956883391215e-06, "loss": 1.1079, "step": 589 }, { "epoch": 0.3103629668595476, "grad_norm": 2.127131223678589, "learning_rate": 4.976862686940842e-06, "loss": 1.1217, "step": 590 }, { "epoch": 0.3108890057864282, "grad_norm": 2.0126006603240967, "learning_rate": 4.976768299248278e-06, "loss": 1.0719, "step": 591 }, { "epoch": 0.3114150447133088, "grad_norm": 1.965903639793396, "learning_rate": 4.97667372032081e-06, "loss": 1.0843, "step": 592 }, { "epoch": 0.31194108364018935, "grad_norm": 2.1280322074890137, "learning_rate": 4.976578950165742e-06, "loss": 1.0676, "step": 593 }, { "epoch": 0.31246712256707, "grad_norm": 2.2355756759643555, "learning_rate": 4.976483988790391e-06, "loss": 1.0855, "step": 594 }, { "epoch": 0.31299316149395057, "grad_norm": 2.153095245361328, "learning_rate": 4.976388836202088e-06, "loss": 1.0357, "step": 595 }, { "epoch": 0.31351920042083115, "grad_norm": 2.023137092590332, "learning_rate": 4.97629349240818e-06, "loss": 1.0381, "step": 596 }, { "epoch": 0.3140452393477117, "grad_norm": 2.2524759769439697, "learning_rate": 4.97619795741603e-06, "loss": 1.0911, "step": 597 }, { "epoch": 0.3145712782745923, "grad_norm": 2.1904008388519287, "learning_rate": 4.9761022312330135e-06, "loss": 1.047, "step": 598 }, { "epoch": 0.3150973172014729, "grad_norm": 2.3166565895080566, "learning_rate": 4.976006313866521e-06, "loss": 1.0663, "step": 599 }, { "epoch": 0.3156233561283535, "grad_norm": 2.11413836479187, "learning_rate": 4.975910205323959e-06, "loss": 1.0843, "step": 600 }, { "epoch": 0.3161493950552341, "grad_norm": 2.1609344482421875, "learning_rate": 4.975813905612749e-06, "loss": 1.1344, "step": 601 }, { "epoch": 0.3166754339821147, "grad_norm": 2.055330276489258, "learning_rate": 4.975717414740326e-06, "loss": 1.0663, "step": 602 }, { "epoch": 0.31720147290899525, "grad_norm": 2.2735755443573, "learning_rate": 4.975620732714139e-06, "loss": 1.1061, "step": 603 }, { "epoch": 0.31772751183587583, "grad_norm": 2.1966300010681152, "learning_rate": 4.975523859541654e-06, "loss": 1.1498, "step": 604 }, { "epoch": 0.31825355076275647, "grad_norm": 2.20951247215271, "learning_rate": 4.975426795230351e-06, "loss": 1.1057, "step": 605 }, { "epoch": 0.31877958968963704, "grad_norm": 2.0706050395965576, "learning_rate": 4.975329539787725e-06, "loss": 1.0906, "step": 606 }, { "epoch": 0.3193056286165176, "grad_norm": 2.0394089221954346, "learning_rate": 4.975232093221284e-06, "loss": 1.0514, "step": 607 }, { "epoch": 0.3198316675433982, "grad_norm": 2.1639111042022705, "learning_rate": 4.975134455538551e-06, "loss": 1.0787, "step": 608 }, { "epoch": 0.3203577064702788, "grad_norm": 2.025575876235962, "learning_rate": 4.975036626747067e-06, "loss": 1.0451, "step": 609 }, { "epoch": 0.3208837453971594, "grad_norm": 2.060215950012207, "learning_rate": 4.974938606854384e-06, "loss": 1.0821, "step": 610 }, { "epoch": 0.32140978432404, "grad_norm": 2.265155792236328, "learning_rate": 4.974840395868073e-06, "loss": 1.1341, "step": 611 }, { "epoch": 0.32193582325092057, "grad_norm": 2.22503924369812, "learning_rate": 4.974741993795712e-06, "loss": 1.1643, "step": 612 }, { "epoch": 0.32246186217780115, "grad_norm": 2.11155104637146, "learning_rate": 4.9746434006449034e-06, "loss": 1.0548, "step": 613 }, { "epoch": 0.32298790110468173, "grad_norm": 2.0055696964263916, "learning_rate": 4.974544616423258e-06, "loss": 1.0769, "step": 614 }, { "epoch": 0.3235139400315623, "grad_norm": 2.0843770503997803, "learning_rate": 4.974445641138403e-06, "loss": 1.0701, "step": 615 }, { "epoch": 0.32403997895844294, "grad_norm": 2.0580337047576904, "learning_rate": 4.9743464747979785e-06, "loss": 1.0465, "step": 616 }, { "epoch": 0.3245660178853235, "grad_norm": 2.3719844818115234, "learning_rate": 4.974247117409645e-06, "loss": 1.1498, "step": 617 }, { "epoch": 0.3250920568122041, "grad_norm": 1.9926241636276245, "learning_rate": 4.974147568981072e-06, "loss": 1.081, "step": 618 }, { "epoch": 0.3256180957390847, "grad_norm": 2.029318332672119, "learning_rate": 4.974047829519946e-06, "loss": 1.139, "step": 619 }, { "epoch": 0.32614413466596526, "grad_norm": 2.0171804428100586, "learning_rate": 4.973947899033969e-06, "loss": 1.0887, "step": 620 }, { "epoch": 0.3266701735928459, "grad_norm": 2.3209071159362793, "learning_rate": 4.973847777530854e-06, "loss": 1.1156, "step": 621 }, { "epoch": 0.32719621251972647, "grad_norm": 2.360849142074585, "learning_rate": 4.973747465018334e-06, "loss": 1.1305, "step": 622 }, { "epoch": 0.32772225144660705, "grad_norm": 2.1828086376190186, "learning_rate": 4.973646961504154e-06, "loss": 1.091, "step": 623 }, { "epoch": 0.32824829037348763, "grad_norm": 1.9628446102142334, "learning_rate": 4.973546266996074e-06, "loss": 1.0932, "step": 624 }, { "epoch": 0.3287743293003682, "grad_norm": 2.0040283203125, "learning_rate": 4.973445381501868e-06, "loss": 1.0723, "step": 625 }, { "epoch": 0.32930036822724884, "grad_norm": 2.289292097091675, "learning_rate": 4.973344305029326e-06, "loss": 1.1526, "step": 626 }, { "epoch": 0.3298264071541294, "grad_norm": 2.1106910705566406, "learning_rate": 4.973243037586252e-06, "loss": 1.1327, "step": 627 }, { "epoch": 0.33035244608101, "grad_norm": 2.326677083969116, "learning_rate": 4.9731415791804655e-06, "loss": 1.0898, "step": 628 }, { "epoch": 0.3308784850078906, "grad_norm": 2.086299180984497, "learning_rate": 4.9730399298198e-06, "loss": 1.0842, "step": 629 }, { "epoch": 0.33140452393477116, "grad_norm": 2.045738935470581, "learning_rate": 4.972938089512104e-06, "loss": 1.0156, "step": 630 }, { "epoch": 0.33193056286165173, "grad_norm": 2.038058280944824, "learning_rate": 4.97283605826524e-06, "loss": 1.0545, "step": 631 }, { "epoch": 0.33245660178853237, "grad_norm": 2.0892717838287354, "learning_rate": 4.972733836087088e-06, "loss": 1.099, "step": 632 }, { "epoch": 0.33298264071541295, "grad_norm": 2.2152934074401855, "learning_rate": 4.972631422985538e-06, "loss": 1.0775, "step": 633 }, { "epoch": 0.3335086796422935, "grad_norm": 2.3605494499206543, "learning_rate": 4.9725288189685e-06, "loss": 1.0682, "step": 634 }, { "epoch": 0.3340347185691741, "grad_norm": 2.076491117477417, "learning_rate": 4.9724260240438945e-06, "loss": 1.063, "step": 635 }, { "epoch": 0.3345607574960547, "grad_norm": 3.2677767276763916, "learning_rate": 4.97232303821966e-06, "loss": 1.1173, "step": 636 }, { "epoch": 0.3350867964229353, "grad_norm": 2.110320568084717, "learning_rate": 4.972219861503746e-06, "loss": 1.0264, "step": 637 }, { "epoch": 0.3356128353498159, "grad_norm": 2.101353406906128, "learning_rate": 4.972116493904121e-06, "loss": 1.0806, "step": 638 }, { "epoch": 0.3361388742766965, "grad_norm": 2.247091293334961, "learning_rate": 4.972012935428765e-06, "loss": 1.1178, "step": 639 }, { "epoch": 0.33666491320357705, "grad_norm": 2.183757781982422, "learning_rate": 4.971909186085675e-06, "loss": 1.0615, "step": 640 }, { "epoch": 0.33719095213045763, "grad_norm": 2.0801236629486084, "learning_rate": 4.97180524588286e-06, "loss": 1.0441, "step": 641 }, { "epoch": 0.33771699105733827, "grad_norm": 1.9939873218536377, "learning_rate": 4.9717011148283455e-06, "loss": 1.0853, "step": 642 }, { "epoch": 0.33824302998421885, "grad_norm": 2.13399338722229, "learning_rate": 4.971596792930174e-06, "loss": 0.9943, "step": 643 }, { "epoch": 0.3387690689110994, "grad_norm": 2.1221766471862793, "learning_rate": 4.971492280196397e-06, "loss": 1.0088, "step": 644 }, { "epoch": 0.33929510783798, "grad_norm": 2.023320436477661, "learning_rate": 4.971387576635087e-06, "loss": 1.0449, "step": 645 }, { "epoch": 0.3398211467648606, "grad_norm": 2.1422126293182373, "learning_rate": 4.971282682254327e-06, "loss": 1.0987, "step": 646 }, { "epoch": 0.3403471856917412, "grad_norm": 2.136868715286255, "learning_rate": 4.971177597062215e-06, "loss": 1.0983, "step": 647 }, { "epoch": 0.3408732246186218, "grad_norm": 2.1036930084228516, "learning_rate": 4.971072321066868e-06, "loss": 1.1284, "step": 648 }, { "epoch": 0.3413992635455024, "grad_norm": 2.147191286087036, "learning_rate": 4.970966854276411e-06, "loss": 1.1165, "step": 649 }, { "epoch": 0.34192530247238295, "grad_norm": 2.1734893321990967, "learning_rate": 4.970861196698988e-06, "loss": 1.0834, "step": 650 }, { "epoch": 0.34245134139926353, "grad_norm": 2.038435459136963, "learning_rate": 4.97075534834276e-06, "loss": 1.0193, "step": 651 }, { "epoch": 0.3429773803261441, "grad_norm": 2.077822208404541, "learning_rate": 4.970649309215895e-06, "loss": 1.0697, "step": 652 }, { "epoch": 0.34350341925302474, "grad_norm": 2.056907892227173, "learning_rate": 4.970543079326584e-06, "loss": 1.0593, "step": 653 }, { "epoch": 0.3440294581799053, "grad_norm": 2.7795369625091553, "learning_rate": 4.9704366586830275e-06, "loss": 1.122, "step": 654 }, { "epoch": 0.3445554971067859, "grad_norm": 2.0807559490203857, "learning_rate": 4.970330047293443e-06, "loss": 1.0225, "step": 655 }, { "epoch": 0.3450815360336665, "grad_norm": 2.219024658203125, "learning_rate": 4.970223245166062e-06, "loss": 1.1506, "step": 656 }, { "epoch": 0.34560757496054706, "grad_norm": 2.1809475421905518, "learning_rate": 4.970116252309131e-06, "loss": 1.1094, "step": 657 }, { "epoch": 0.3461336138874277, "grad_norm": 2.243777275085449, "learning_rate": 4.970009068730911e-06, "loss": 1.0942, "step": 658 }, { "epoch": 0.3466596528143083, "grad_norm": 2.106391191482544, "learning_rate": 4.969901694439677e-06, "loss": 1.0899, "step": 659 }, { "epoch": 0.34718569174118885, "grad_norm": 2.1109979152679443, "learning_rate": 4.96979412944372e-06, "loss": 1.0622, "step": 660 }, { "epoch": 0.34771173066806943, "grad_norm": 2.292466163635254, "learning_rate": 4.969686373751347e-06, "loss": 1.1081, "step": 661 }, { "epoch": 0.34823776959495, "grad_norm": 1.9919096231460571, "learning_rate": 4.9695784273708755e-06, "loss": 1.0774, "step": 662 }, { "epoch": 0.34876380852183064, "grad_norm": 2.2421789169311523, "learning_rate": 4.969470290310641e-06, "loss": 1.0958, "step": 663 }, { "epoch": 0.3492898474487112, "grad_norm": 2.069939613342285, "learning_rate": 4.969361962578994e-06, "loss": 1.0758, "step": 664 }, { "epoch": 0.3498158863755918, "grad_norm": 2.0892951488494873, "learning_rate": 4.969253444184297e-06, "loss": 1.105, "step": 665 }, { "epoch": 0.3503419253024724, "grad_norm": 2.1536753177642822, "learning_rate": 4.969144735134929e-06, "loss": 1.0655, "step": 666 }, { "epoch": 0.35086796422935296, "grad_norm": 2.031996250152588, "learning_rate": 4.969035835439284e-06, "loss": 1.1107, "step": 667 }, { "epoch": 0.35139400315623354, "grad_norm": 2.068693161010742, "learning_rate": 4.9689267451057714e-06, "loss": 1.0293, "step": 668 }, { "epoch": 0.35192004208311417, "grad_norm": 2.1489906311035156, "learning_rate": 4.9688174641428136e-06, "loss": 1.0656, "step": 669 }, { "epoch": 0.35244608100999475, "grad_norm": 2.5132720470428467, "learning_rate": 4.9687079925588475e-06, "loss": 1.0558, "step": 670 }, { "epoch": 0.35297211993687533, "grad_norm": 1.9639642238616943, "learning_rate": 4.968598330362326e-06, "loss": 1.0498, "step": 671 }, { "epoch": 0.3534981588637559, "grad_norm": 2.2413175106048584, "learning_rate": 4.968488477561716e-06, "loss": 0.986, "step": 672 }, { "epoch": 0.3540241977906365, "grad_norm": 2.0109381675720215, "learning_rate": 4.968378434165501e-06, "loss": 1.1112, "step": 673 }, { "epoch": 0.3545502367175171, "grad_norm": 2.1863934993743896, "learning_rate": 4.968268200182175e-06, "loss": 1.0843, "step": 674 }, { "epoch": 0.3550762756443977, "grad_norm": 2.262173652648926, "learning_rate": 4.968157775620252e-06, "loss": 1.0938, "step": 675 }, { "epoch": 0.3556023145712783, "grad_norm": 2.261918067932129, "learning_rate": 4.968047160488256e-06, "loss": 1.1004, "step": 676 }, { "epoch": 0.35612835349815886, "grad_norm": 2.13324236869812, "learning_rate": 4.967936354794728e-06, "loss": 1.0881, "step": 677 }, { "epoch": 0.35665439242503943, "grad_norm": 2.271207809448242, "learning_rate": 4.967825358548225e-06, "loss": 1.0967, "step": 678 }, { "epoch": 0.35718043135192007, "grad_norm": 2.177339553833008, "learning_rate": 4.967714171757315e-06, "loss": 1.1131, "step": 679 }, { "epoch": 0.35770647027880065, "grad_norm": 2.1329848766326904, "learning_rate": 4.967602794430585e-06, "loss": 1.112, "step": 680 }, { "epoch": 0.3582325092056812, "grad_norm": 2.0018250942230225, "learning_rate": 4.967491226576634e-06, "loss": 1.0853, "step": 681 }, { "epoch": 0.3587585481325618, "grad_norm": 2.06925106048584, "learning_rate": 4.967379468204075e-06, "loss": 1.1405, "step": 682 }, { "epoch": 0.3592845870594424, "grad_norm": 2.0437614917755127, "learning_rate": 4.967267519321538e-06, "loss": 1.1165, "step": 683 }, { "epoch": 0.35981062598632296, "grad_norm": 2.043297290802002, "learning_rate": 4.9671553799376685e-06, "loss": 1.0438, "step": 684 }, { "epoch": 0.3603366649132036, "grad_norm": 2.060760259628296, "learning_rate": 4.967043050061121e-06, "loss": 1.0401, "step": 685 }, { "epoch": 0.3608627038400842, "grad_norm": 2.3929009437561035, "learning_rate": 4.966930529700572e-06, "loss": 1.0812, "step": 686 }, { "epoch": 0.36138874276696475, "grad_norm": 2.2057461738586426, "learning_rate": 4.966817818864708e-06, "loss": 1.0499, "step": 687 }, { "epoch": 0.36191478169384533, "grad_norm": 2.0358550548553467, "learning_rate": 4.966704917562231e-06, "loss": 1.1603, "step": 688 }, { "epoch": 0.3624408206207259, "grad_norm": 2.0840682983398438, "learning_rate": 4.966591825801859e-06, "loss": 1.0967, "step": 689 }, { "epoch": 0.36296685954760655, "grad_norm": 2.0170061588287354, "learning_rate": 4.9664785435923255e-06, "loss": 1.0573, "step": 690 }, { "epoch": 0.3634928984744871, "grad_norm": 2.1349408626556396, "learning_rate": 4.966365070942375e-06, "loss": 1.0665, "step": 691 }, { "epoch": 0.3640189374013677, "grad_norm": 2.1616368293762207, "learning_rate": 4.966251407860769e-06, "loss": 1.0306, "step": 692 }, { "epoch": 0.3645449763282483, "grad_norm": 2.2529335021972656, "learning_rate": 4.966137554356285e-06, "loss": 1.0445, "step": 693 }, { "epoch": 0.36507101525512886, "grad_norm": 2.041102170944214, "learning_rate": 4.966023510437713e-06, "loss": 1.0395, "step": 694 }, { "epoch": 0.3655970541820095, "grad_norm": 2.0450620651245117, "learning_rate": 4.9659092761138585e-06, "loss": 1.064, "step": 695 }, { "epoch": 0.3661230931088901, "grad_norm": 2.163081407546997, "learning_rate": 4.965794851393541e-06, "loss": 1.0729, "step": 696 }, { "epoch": 0.36664913203577065, "grad_norm": 2.1602089405059814, "learning_rate": 4.965680236285596e-06, "loss": 1.0707, "step": 697 }, { "epoch": 0.36717517096265123, "grad_norm": 2.3263938426971436, "learning_rate": 4.965565430798875e-06, "loss": 1.0146, "step": 698 }, { "epoch": 0.3677012098895318, "grad_norm": 2.0192365646362305, "learning_rate": 4.965450434942238e-06, "loss": 1.0751, "step": 699 }, { "epoch": 0.3682272488164124, "grad_norm": 2.0557174682617188, "learning_rate": 4.965335248724568e-06, "loss": 1.0749, "step": 700 }, { "epoch": 0.368753287743293, "grad_norm": 2.29679799079895, "learning_rate": 4.965219872154757e-06, "loss": 1.0516, "step": 701 }, { "epoch": 0.3692793266701736, "grad_norm": 2.2303829193115234, "learning_rate": 4.965104305241713e-06, "loss": 1.1586, "step": 702 }, { "epoch": 0.3698053655970542, "grad_norm": 2.112283706665039, "learning_rate": 4.964988547994361e-06, "loss": 1.0833, "step": 703 }, { "epoch": 0.37033140452393476, "grad_norm": 2.1807613372802734, "learning_rate": 4.9648726004216354e-06, "loss": 1.0786, "step": 704 }, { "epoch": 0.37085744345081534, "grad_norm": 2.0990889072418213, "learning_rate": 4.964756462532492e-06, "loss": 1.0555, "step": 705 }, { "epoch": 0.371383482377696, "grad_norm": 2.2034318447113037, "learning_rate": 4.964640134335896e-06, "loss": 1.0696, "step": 706 }, { "epoch": 0.37190952130457655, "grad_norm": 2.207235813140869, "learning_rate": 4.964523615840831e-06, "loss": 1.0897, "step": 707 }, { "epoch": 0.37243556023145713, "grad_norm": 1.8820483684539795, "learning_rate": 4.964406907056291e-06, "loss": 1.0822, "step": 708 }, { "epoch": 0.3729615991583377, "grad_norm": 2.2243785858154297, "learning_rate": 4.964290007991291e-06, "loss": 1.0958, "step": 709 }, { "epoch": 0.3734876380852183, "grad_norm": 2.208770990371704, "learning_rate": 4.964172918654854e-06, "loss": 1.0803, "step": 710 }, { "epoch": 0.3740136770120989, "grad_norm": 2.1083521842956543, "learning_rate": 4.96405563905602e-06, "loss": 1.0513, "step": 711 }, { "epoch": 0.3745397159389795, "grad_norm": 2.0161774158477783, "learning_rate": 4.963938169203847e-06, "loss": 1.0775, "step": 712 }, { "epoch": 0.3750657548658601, "grad_norm": 2.1578962802886963, "learning_rate": 4.963820509107403e-06, "loss": 1.0695, "step": 713 }, { "epoch": 0.37559179379274066, "grad_norm": 2.1972339153289795, "learning_rate": 4.963702658775774e-06, "loss": 1.0703, "step": 714 }, { "epoch": 0.37611783271962124, "grad_norm": 2.338205575942993, "learning_rate": 4.9635846182180594e-06, "loss": 1.0756, "step": 715 }, { "epoch": 0.3766438716465018, "grad_norm": 2.281242847442627, "learning_rate": 4.963466387443372e-06, "loss": 1.1177, "step": 716 }, { "epoch": 0.37716991057338245, "grad_norm": 2.092036724090576, "learning_rate": 4.963347966460841e-06, "loss": 1.1004, "step": 717 }, { "epoch": 0.37769594950026303, "grad_norm": 2.148244857788086, "learning_rate": 4.963229355279611e-06, "loss": 1.1157, "step": 718 }, { "epoch": 0.3782219884271436, "grad_norm": 1.9961777925491333, "learning_rate": 4.963110553908838e-06, "loss": 1.0703, "step": 719 }, { "epoch": 0.3787480273540242, "grad_norm": 2.299091339111328, "learning_rate": 4.962991562357697e-06, "loss": 1.1265, "step": 720 }, { "epoch": 0.37927406628090476, "grad_norm": 2.1055006980895996, "learning_rate": 4.962872380635374e-06, "loss": 1.0361, "step": 721 }, { "epoch": 0.3798001052077854, "grad_norm": 2.1554667949676514, "learning_rate": 4.9627530087510725e-06, "loss": 1.0603, "step": 722 }, { "epoch": 0.380326144134666, "grad_norm": 2.1003949642181396, "learning_rate": 4.962633446714009e-06, "loss": 1.0714, "step": 723 }, { "epoch": 0.38085218306154656, "grad_norm": 2.1850736141204834, "learning_rate": 4.962513694533414e-06, "loss": 1.0795, "step": 724 }, { "epoch": 0.38137822198842714, "grad_norm": 2.0440175533294678, "learning_rate": 4.962393752218535e-06, "loss": 1.0882, "step": 725 }, { "epoch": 0.3819042609153077, "grad_norm": 2.2579755783081055, "learning_rate": 4.962273619778632e-06, "loss": 1.1066, "step": 726 }, { "epoch": 0.38243029984218835, "grad_norm": 2.0210318565368652, "learning_rate": 4.962153297222981e-06, "loss": 1.0843, "step": 727 }, { "epoch": 0.3829563387690689, "grad_norm": 2.1218135356903076, "learning_rate": 4.962032784560873e-06, "loss": 1.1039, "step": 728 }, { "epoch": 0.3834823776959495, "grad_norm": 2.2498831748962402, "learning_rate": 4.961912081801612e-06, "loss": 1.0389, "step": 729 }, { "epoch": 0.3840084166228301, "grad_norm": 2.6789276599884033, "learning_rate": 4.9617911889545175e-06, "loss": 1.0772, "step": 730 }, { "epoch": 0.38453445554971066, "grad_norm": 1.9847339391708374, "learning_rate": 4.961670106028924e-06, "loss": 1.0804, "step": 731 }, { "epoch": 0.38506049447659124, "grad_norm": 2.048737049102783, "learning_rate": 4.9615488330341814e-06, "loss": 1.1089, "step": 732 }, { "epoch": 0.3855865334034719, "grad_norm": 2.2241313457489014, "learning_rate": 4.961427369979652e-06, "loss": 1.0618, "step": 733 }, { "epoch": 0.38611257233035245, "grad_norm": 1.9084025621414185, "learning_rate": 4.961305716874716e-06, "loss": 1.0316, "step": 734 }, { "epoch": 0.38663861125723303, "grad_norm": 2.0064773559570312, "learning_rate": 4.9611838737287646e-06, "loss": 1.0289, "step": 735 }, { "epoch": 0.3871646501841136, "grad_norm": 2.386962652206421, "learning_rate": 4.961061840551205e-06, "loss": 1.1488, "step": 736 }, { "epoch": 0.3876906891109942, "grad_norm": 2.0626862049102783, "learning_rate": 4.960939617351462e-06, "loss": 1.0793, "step": 737 }, { "epoch": 0.3882167280378748, "grad_norm": 2.1622767448425293, "learning_rate": 4.960817204138971e-06, "loss": 1.0923, "step": 738 }, { "epoch": 0.3887427669647554, "grad_norm": 2.049163818359375, "learning_rate": 4.9606946009231834e-06, "loss": 1.0423, "step": 739 }, { "epoch": 0.389268805891636, "grad_norm": 2.0196399688720703, "learning_rate": 4.960571807713568e-06, "loss": 0.9832, "step": 740 }, { "epoch": 0.38979484481851656, "grad_norm": 1.982647180557251, "learning_rate": 4.960448824519602e-06, "loss": 1.0424, "step": 741 }, { "epoch": 0.39032088374539714, "grad_norm": 2.0468926429748535, "learning_rate": 4.960325651350784e-06, "loss": 1.074, "step": 742 }, { "epoch": 0.3908469226722778, "grad_norm": 2.402381181716919, "learning_rate": 4.960202288216624e-06, "loss": 1.058, "step": 743 }, { "epoch": 0.39137296159915835, "grad_norm": 2.065232753753662, "learning_rate": 4.960078735126646e-06, "loss": 1.0985, "step": 744 }, { "epoch": 0.39189900052603893, "grad_norm": 2.1949756145477295, "learning_rate": 4.95995499209039e-06, "loss": 1.0791, "step": 745 }, { "epoch": 0.3924250394529195, "grad_norm": 2.121232271194458, "learning_rate": 4.959831059117411e-06, "loss": 1.0606, "step": 746 }, { "epoch": 0.3929510783798001, "grad_norm": 2.247145652770996, "learning_rate": 4.959706936217278e-06, "loss": 1.0991, "step": 747 }, { "epoch": 0.39347711730668067, "grad_norm": 2.0540339946746826, "learning_rate": 4.9595826233995735e-06, "loss": 1.0835, "step": 748 }, { "epoch": 0.3940031562335613, "grad_norm": 2.173257350921631, "learning_rate": 4.959458120673898e-06, "loss": 1.0588, "step": 749 }, { "epoch": 0.3945291951604419, "grad_norm": 2.1530778408050537, "learning_rate": 4.959333428049862e-06, "loss": 1.0395, "step": 750 }, { "epoch": 0.39505523408732246, "grad_norm": 2.0705490112304688, "learning_rate": 4.959208545537095e-06, "loss": 1.071, "step": 751 }, { "epoch": 0.39558127301420304, "grad_norm": 1.9439338445663452, "learning_rate": 4.95908347314524e-06, "loss": 1.0224, "step": 752 }, { "epoch": 0.3961073119410836, "grad_norm": 2.1683454513549805, "learning_rate": 4.958958210883952e-06, "loss": 1.0745, "step": 753 }, { "epoch": 0.39663335086796425, "grad_norm": 2.2809042930603027, "learning_rate": 4.958832758762903e-06, "loss": 1.0887, "step": 754 }, { "epoch": 0.39715938979484483, "grad_norm": 2.161447048187256, "learning_rate": 4.9587071167917814e-06, "loss": 1.1447, "step": 755 }, { "epoch": 0.3976854287217254, "grad_norm": 2.1375932693481445, "learning_rate": 4.958581284980285e-06, "loss": 1.0295, "step": 756 }, { "epoch": 0.398211467648606, "grad_norm": 2.0431041717529297, "learning_rate": 4.958455263338133e-06, "loss": 1.0567, "step": 757 }, { "epoch": 0.39873750657548657, "grad_norm": 2.0288238525390625, "learning_rate": 4.958329051875053e-06, "loss": 1.0736, "step": 758 }, { "epoch": 0.3992635455023672, "grad_norm": 2.146132230758667, "learning_rate": 4.958202650600791e-06, "loss": 1.0744, "step": 759 }, { "epoch": 0.3997895844292478, "grad_norm": 2.1740963459014893, "learning_rate": 4.958076059525107e-06, "loss": 1.0263, "step": 760 }, { "epoch": 0.40031562335612836, "grad_norm": 2.1219875812530518, "learning_rate": 4.957949278657773e-06, "loss": 1.0508, "step": 761 }, { "epoch": 0.40084166228300894, "grad_norm": 2.0742340087890625, "learning_rate": 4.9578223080085815e-06, "loss": 1.0455, "step": 762 }, { "epoch": 0.4013677012098895, "grad_norm": 2.1779415607452393, "learning_rate": 4.957695147587334e-06, "loss": 1.1079, "step": 763 }, { "epoch": 0.4018937401367701, "grad_norm": 2.151047706604004, "learning_rate": 4.957567797403848e-06, "loss": 1.0893, "step": 764 }, { "epoch": 0.40241977906365073, "grad_norm": 2.1728570461273193, "learning_rate": 4.9574402574679594e-06, "loss": 1.0726, "step": 765 }, { "epoch": 0.4029458179905313, "grad_norm": 1.982230305671692, "learning_rate": 4.957312527789512e-06, "loss": 1.0629, "step": 766 }, { "epoch": 0.4034718569174119, "grad_norm": 1.953464150428772, "learning_rate": 4.95718460837837e-06, "loss": 1.1093, "step": 767 }, { "epoch": 0.40399789584429247, "grad_norm": 1.9718215465545654, "learning_rate": 4.9570564992444116e-06, "loss": 1.1018, "step": 768 }, { "epoch": 0.40452393477117304, "grad_norm": 2.067629337310791, "learning_rate": 4.956928200397526e-06, "loss": 1.0364, "step": 769 }, { "epoch": 0.4050499736980537, "grad_norm": 2.1172022819519043, "learning_rate": 4.956799711847619e-06, "loss": 1.0693, "step": 770 }, { "epoch": 0.40557601262493426, "grad_norm": 2.0539615154266357, "learning_rate": 4.956671033604613e-06, "loss": 1.0034, "step": 771 }, { "epoch": 0.40610205155181484, "grad_norm": 1.9780375957489014, "learning_rate": 4.956542165678443e-06, "loss": 1.0515, "step": 772 }, { "epoch": 0.4066280904786954, "grad_norm": 2.0974819660186768, "learning_rate": 4.95641310807906e-06, "loss": 1.0754, "step": 773 }, { "epoch": 0.407154129405576, "grad_norm": 2.1018221378326416, "learning_rate": 4.956283860816427e-06, "loss": 1.1102, "step": 774 }, { "epoch": 0.4076801683324566, "grad_norm": 2.3969085216522217, "learning_rate": 4.9561544239005235e-06, "loss": 1.0455, "step": 775 }, { "epoch": 0.4082062072593372, "grad_norm": 2.2645649909973145, "learning_rate": 4.956024797341345e-06, "loss": 0.9724, "step": 776 }, { "epoch": 0.4087322461862178, "grad_norm": 2.3406150341033936, "learning_rate": 4.955894981148898e-06, "loss": 1.1341, "step": 777 }, { "epoch": 0.40925828511309836, "grad_norm": 2.0782880783081055, "learning_rate": 4.955764975333208e-06, "loss": 1.0116, "step": 778 }, { "epoch": 0.40978432403997894, "grad_norm": 2.1269314289093018, "learning_rate": 4.955634779904312e-06, "loss": 1.0967, "step": 779 }, { "epoch": 0.4103103629668595, "grad_norm": 2.198559522628784, "learning_rate": 4.9555043948722625e-06, "loss": 1.0815, "step": 780 }, { "epoch": 0.41083640189374016, "grad_norm": 2.2189719676971436, "learning_rate": 4.9553738202471264e-06, "loss": 1.0559, "step": 781 }, { "epoch": 0.41136244082062073, "grad_norm": 2.2313179969787598, "learning_rate": 4.955243056038986e-06, "loss": 1.046, "step": 782 }, { "epoch": 0.4118884797475013, "grad_norm": 1.9563003778457642, "learning_rate": 4.955112102257939e-06, "loss": 1.0735, "step": 783 }, { "epoch": 0.4124145186743819, "grad_norm": 1.99479341506958, "learning_rate": 4.954980958914093e-06, "loss": 1.0657, "step": 784 }, { "epoch": 0.41294055760126247, "grad_norm": 2.029634714126587, "learning_rate": 4.954849626017577e-06, "loss": 1.0811, "step": 785 }, { "epoch": 0.4134665965281431, "grad_norm": 2.2947723865509033, "learning_rate": 4.9547181035785314e-06, "loss": 1.0807, "step": 786 }, { "epoch": 0.4139926354550237, "grad_norm": 2.0323445796966553, "learning_rate": 4.9545863916071094e-06, "loss": 1.0715, "step": 787 }, { "epoch": 0.41451867438190426, "grad_norm": 2.0068464279174805, "learning_rate": 4.954454490113482e-06, "loss": 1.0447, "step": 788 }, { "epoch": 0.41504471330878484, "grad_norm": 2.132549285888672, "learning_rate": 4.954322399107833e-06, "loss": 1.0454, "step": 789 }, { "epoch": 0.4155707522356654, "grad_norm": 2.0086755752563477, "learning_rate": 4.954190118600361e-06, "loss": 1.0724, "step": 790 }, { "epoch": 0.41609679116254605, "grad_norm": 2.1461241245269775, "learning_rate": 4.95405764860128e-06, "loss": 1.0391, "step": 791 }, { "epoch": 0.41662283008942663, "grad_norm": 2.1352107524871826, "learning_rate": 4.953924989120818e-06, "loss": 0.9898, "step": 792 }, { "epoch": 0.4171488690163072, "grad_norm": 2.0694406032562256, "learning_rate": 4.953792140169219e-06, "loss": 1.0819, "step": 793 }, { "epoch": 0.4176749079431878, "grad_norm": 2.088433027267456, "learning_rate": 4.953659101756739e-06, "loss": 1.0833, "step": 794 }, { "epoch": 0.41820094687006837, "grad_norm": 2.1760306358337402, "learning_rate": 4.95352587389365e-06, "loss": 1.0535, "step": 795 }, { "epoch": 0.41872698579694895, "grad_norm": 2.2031099796295166, "learning_rate": 4.95339245659024e-06, "loss": 1.0389, "step": 796 }, { "epoch": 0.4192530247238296, "grad_norm": 2.247276782989502, "learning_rate": 4.953258849856809e-06, "loss": 1.0839, "step": 797 }, { "epoch": 0.41977906365071016, "grad_norm": 2.24357271194458, "learning_rate": 4.953125053703674e-06, "loss": 1.0666, "step": 798 }, { "epoch": 0.42030510257759074, "grad_norm": 2.240151882171631, "learning_rate": 4.952991068141165e-06, "loss": 1.1009, "step": 799 }, { "epoch": 0.4208311415044713, "grad_norm": 2.2172327041625977, "learning_rate": 4.952856893179628e-06, "loss": 1.0928, "step": 800 }, { "epoch": 0.4213571804313519, "grad_norm": 2.377336025238037, "learning_rate": 4.952722528829422e-06, "loss": 1.0968, "step": 801 }, { "epoch": 0.42188321935823253, "grad_norm": 2.466841459274292, "learning_rate": 4.9525879751009205e-06, "loss": 1.0631, "step": 802 }, { "epoch": 0.4224092582851131, "grad_norm": 2.035644054412842, "learning_rate": 4.952453232004516e-06, "loss": 1.0609, "step": 803 }, { "epoch": 0.4229352972119937, "grad_norm": 2.2472054958343506, "learning_rate": 4.952318299550608e-06, "loss": 1.0613, "step": 804 }, { "epoch": 0.42346133613887427, "grad_norm": 2.175999879837036, "learning_rate": 4.952183177749618e-06, "loss": 1.0954, "step": 805 }, { "epoch": 0.42398737506575485, "grad_norm": 2.206052303314209, "learning_rate": 4.952047866611978e-06, "loss": 1.0965, "step": 806 }, { "epoch": 0.4245134139926355, "grad_norm": 1.9550546407699585, "learning_rate": 4.951912366148135e-06, "loss": 1.0835, "step": 807 }, { "epoch": 0.42503945291951606, "grad_norm": 2.194734811782837, "learning_rate": 4.951776676368552e-06, "loss": 1.1179, "step": 808 }, { "epoch": 0.42556549184639664, "grad_norm": 2.094862222671509, "learning_rate": 4.951640797283704e-06, "loss": 1.0634, "step": 809 }, { "epoch": 0.4260915307732772, "grad_norm": 1.980043888092041, "learning_rate": 4.951504728904085e-06, "loss": 1.0874, "step": 810 }, { "epoch": 0.4266175697001578, "grad_norm": 2.2654919624328613, "learning_rate": 4.9513684712402e-06, "loss": 1.057, "step": 811 }, { "epoch": 0.4271436086270384, "grad_norm": 2.197120189666748, "learning_rate": 4.951232024302569e-06, "loss": 1.1114, "step": 812 }, { "epoch": 0.427669647553919, "grad_norm": 2.143324375152588, "learning_rate": 4.9510953881017275e-06, "loss": 1.07, "step": 813 }, { "epoch": 0.4281956864807996, "grad_norm": 2.1920077800750732, "learning_rate": 4.950958562648226e-06, "loss": 1.0373, "step": 814 }, { "epoch": 0.42872172540768017, "grad_norm": 2.0401923656463623, "learning_rate": 4.950821547952629e-06, "loss": 1.1111, "step": 815 }, { "epoch": 0.42924776433456074, "grad_norm": 1.9541674852371216, "learning_rate": 4.950684344025515e-06, "loss": 1.0153, "step": 816 }, { "epoch": 0.4297738032614413, "grad_norm": 3.5096704959869385, "learning_rate": 4.9505469508774776e-06, "loss": 1.0435, "step": 817 }, { "epoch": 0.43029984218832196, "grad_norm": 2.0304462909698486, "learning_rate": 4.9504093685191255e-06, "loss": 1.0786, "step": 818 }, { "epoch": 0.43082588111520254, "grad_norm": 2.115224599838257, "learning_rate": 4.950271596961082e-06, "loss": 1.0854, "step": 819 }, { "epoch": 0.4313519200420831, "grad_norm": 2.176621913909912, "learning_rate": 4.950133636213984e-06, "loss": 0.9909, "step": 820 }, { "epoch": 0.4318779589689637, "grad_norm": 2.2046449184417725, "learning_rate": 4.949995486288484e-06, "loss": 1.0688, "step": 821 }, { "epoch": 0.43240399789584427, "grad_norm": 2.1462888717651367, "learning_rate": 4.949857147195249e-06, "loss": 1.0644, "step": 822 }, { "epoch": 0.4329300368227249, "grad_norm": 2.0735347270965576, "learning_rate": 4.94971861894496e-06, "loss": 1.022, "step": 823 }, { "epoch": 0.4334560757496055, "grad_norm": 2.086724042892456, "learning_rate": 4.949579901548312e-06, "loss": 1.02, "step": 824 }, { "epoch": 0.43398211467648606, "grad_norm": 2.078622341156006, "learning_rate": 4.949440995016018e-06, "loss": 1.0653, "step": 825 }, { "epoch": 0.43450815360336664, "grad_norm": 2.1504440307617188, "learning_rate": 4.949301899358801e-06, "loss": 1.0708, "step": 826 }, { "epoch": 0.4350341925302472, "grad_norm": 2.2340216636657715, "learning_rate": 4.949162614587401e-06, "loss": 1.0688, "step": 827 }, { "epoch": 0.4355602314571278, "grad_norm": 2.2017569541931152, "learning_rate": 4.949023140712574e-06, "loss": 1.0935, "step": 828 }, { "epoch": 0.43608627038400843, "grad_norm": 2.117745876312256, "learning_rate": 4.948883477745088e-06, "loss": 1.0868, "step": 829 }, { "epoch": 0.436612309310889, "grad_norm": 2.0983524322509766, "learning_rate": 4.948743625695726e-06, "loss": 1.0695, "step": 830 }, { "epoch": 0.4371383482377696, "grad_norm": 2.205693244934082, "learning_rate": 4.948603584575287e-06, "loss": 1.0541, "step": 831 }, { "epoch": 0.43766438716465017, "grad_norm": 1.9967527389526367, "learning_rate": 4.948463354394583e-06, "loss": 0.9933, "step": 832 }, { "epoch": 0.43819042609153075, "grad_norm": 2.113577127456665, "learning_rate": 4.948322935164442e-06, "loss": 1.0199, "step": 833 }, { "epoch": 0.4387164650184114, "grad_norm": 2.0825533866882324, "learning_rate": 4.948182326895705e-06, "loss": 1.0446, "step": 834 }, { "epoch": 0.43924250394529196, "grad_norm": 2.0186421871185303, "learning_rate": 4.94804152959923e-06, "loss": 1.0798, "step": 835 }, { "epoch": 0.43976854287217254, "grad_norm": 2.3025147914886475, "learning_rate": 4.947900543285888e-06, "loss": 0.9977, "step": 836 }, { "epoch": 0.4402945817990531, "grad_norm": 2.1662867069244385, "learning_rate": 4.947759367966564e-06, "loss": 1.048, "step": 837 }, { "epoch": 0.4408206207259337, "grad_norm": 2.0708656311035156, "learning_rate": 4.947618003652158e-06, "loss": 1.0715, "step": 838 }, { "epoch": 0.44134665965281433, "grad_norm": 2.2494263648986816, "learning_rate": 4.947476450353586e-06, "loss": 1.0901, "step": 839 }, { "epoch": 0.4418726985796949, "grad_norm": 2.3319430351257324, "learning_rate": 4.947334708081777e-06, "loss": 1.0308, "step": 840 }, { "epoch": 0.4423987375065755, "grad_norm": 2.134620428085327, "learning_rate": 4.947192776847676e-06, "loss": 1.0459, "step": 841 }, { "epoch": 0.44292477643345607, "grad_norm": 2.075429916381836, "learning_rate": 4.94705065666224e-06, "loss": 1.0733, "step": 842 }, { "epoch": 0.44345081536033665, "grad_norm": 2.173069953918457, "learning_rate": 4.946908347536444e-06, "loss": 1.1092, "step": 843 }, { "epoch": 0.4439768542872172, "grad_norm": 2.1481893062591553, "learning_rate": 4.946765849481274e-06, "loss": 1.0822, "step": 844 }, { "epoch": 0.44450289321409786, "grad_norm": 2.247277021408081, "learning_rate": 4.9466231625077354e-06, "loss": 1.0777, "step": 845 }, { "epoch": 0.44502893214097844, "grad_norm": 2.1181042194366455, "learning_rate": 4.946480286626842e-06, "loss": 1.1139, "step": 846 }, { "epoch": 0.445554971067859, "grad_norm": 2.05195951461792, "learning_rate": 4.946337221849628e-06, "loss": 1.0738, "step": 847 }, { "epoch": 0.4460810099947396, "grad_norm": 2.122732639312744, "learning_rate": 4.946193968187139e-06, "loss": 1.061, "step": 848 }, { "epoch": 0.4466070489216202, "grad_norm": 1.8827515840530396, "learning_rate": 4.946050525650434e-06, "loss": 1.061, "step": 849 }, { "epoch": 0.4471330878485008, "grad_norm": 2.3874471187591553, "learning_rate": 4.945906894250591e-06, "loss": 1.0667, "step": 850 }, { "epoch": 0.4476591267753814, "grad_norm": 2.274724006652832, "learning_rate": 4.945763073998699e-06, "loss": 1.0559, "step": 851 }, { "epoch": 0.44818516570226197, "grad_norm": 2.2730906009674072, "learning_rate": 4.945619064905861e-06, "loss": 1.0952, "step": 852 }, { "epoch": 0.44871120462914255, "grad_norm": 2.190969944000244, "learning_rate": 4.945474866983199e-06, "loss": 1.0816, "step": 853 }, { "epoch": 0.4492372435560231, "grad_norm": 3.6214282512664795, "learning_rate": 4.945330480241844e-06, "loss": 1.09, "step": 854 }, { "epoch": 0.44976328248290376, "grad_norm": 2.0487356185913086, "learning_rate": 4.945185904692946e-06, "loss": 1.0279, "step": 855 }, { "epoch": 0.45028932140978434, "grad_norm": 2.074282646179199, "learning_rate": 4.945041140347669e-06, "loss": 1.0514, "step": 856 }, { "epoch": 0.4508153603366649, "grad_norm": 2.126495838165283, "learning_rate": 4.944896187217187e-06, "loss": 1.0819, "step": 857 }, { "epoch": 0.4513413992635455, "grad_norm": 2.0265605449676514, "learning_rate": 4.944751045312695e-06, "loss": 1.0282, "step": 858 }, { "epoch": 0.4518674381904261, "grad_norm": 2.0557355880737305, "learning_rate": 4.944605714645399e-06, "loss": 1.1052, "step": 859 }, { "epoch": 0.4523934771173067, "grad_norm": 2.026393175125122, "learning_rate": 4.944460195226519e-06, "loss": 0.982, "step": 860 }, { "epoch": 0.4529195160441873, "grad_norm": 2.1781463623046875, "learning_rate": 4.9443144870672925e-06, "loss": 1.1251, "step": 861 }, { "epoch": 0.45344555497106787, "grad_norm": 2.053683042526245, "learning_rate": 4.944168590178968e-06, "loss": 1.0766, "step": 862 }, { "epoch": 0.45397159389794844, "grad_norm": 2.1147496700286865, "learning_rate": 4.944022504572811e-06, "loss": 1.0174, "step": 863 }, { "epoch": 0.454497632824829, "grad_norm": 2.06046199798584, "learning_rate": 4.943876230260102e-06, "loss": 1.0836, "step": 864 }, { "epoch": 0.4550236717517096, "grad_norm": 2.171419382095337, "learning_rate": 4.9437297672521345e-06, "loss": 1.0695, "step": 865 }, { "epoch": 0.45554971067859024, "grad_norm": 2.064301013946533, "learning_rate": 4.943583115560217e-06, "loss": 1.0147, "step": 866 }, { "epoch": 0.4560757496054708, "grad_norm": 2.6638195514678955, "learning_rate": 4.943436275195673e-06, "loss": 1.0565, "step": 867 }, { "epoch": 0.4566017885323514, "grad_norm": 3.9418976306915283, "learning_rate": 4.943289246169839e-06, "loss": 1.0768, "step": 868 }, { "epoch": 0.457127827459232, "grad_norm": 2.114297389984131, "learning_rate": 4.943142028494069e-06, "loss": 1.0687, "step": 869 }, { "epoch": 0.45765386638611255, "grad_norm": 2.139803171157837, "learning_rate": 4.942994622179729e-06, "loss": 1.0464, "step": 870 }, { "epoch": 0.4581799053129932, "grad_norm": 2.011474370956421, "learning_rate": 4.942847027238201e-06, "loss": 1.0181, "step": 871 }, { "epoch": 0.45870594423987376, "grad_norm": 2.1592113971710205, "learning_rate": 4.94269924368088e-06, "loss": 1.0699, "step": 872 }, { "epoch": 0.45923198316675434, "grad_norm": 2.0230283737182617, "learning_rate": 4.942551271519178e-06, "loss": 1.075, "step": 873 }, { "epoch": 0.4597580220936349, "grad_norm": 2.286768913269043, "learning_rate": 4.942403110764518e-06, "loss": 1.0604, "step": 874 }, { "epoch": 0.4602840610205155, "grad_norm": 2.305375337600708, "learning_rate": 4.942254761428343e-06, "loss": 1.0067, "step": 875 }, { "epoch": 0.46081009994739613, "grad_norm": 2.416245698928833, "learning_rate": 4.942106223522104e-06, "loss": 1.1109, "step": 876 }, { "epoch": 0.4613361388742767, "grad_norm": 2.1339962482452393, "learning_rate": 4.941957497057272e-06, "loss": 1.0708, "step": 877 }, { "epoch": 0.4618621778011573, "grad_norm": 1.9983795881271362, "learning_rate": 4.941808582045329e-06, "loss": 1.0032, "step": 878 }, { "epoch": 0.46238821672803787, "grad_norm": 2.1115024089813232, "learning_rate": 4.9416594784977735e-06, "loss": 1.0272, "step": 879 }, { "epoch": 0.46291425565491845, "grad_norm": 2.2785818576812744, "learning_rate": 4.941510186426118e-06, "loss": 1.0538, "step": 880 }, { "epoch": 0.46344029458179903, "grad_norm": 2.009938955307007, "learning_rate": 4.94136070584189e-06, "loss": 1.0432, "step": 881 }, { "epoch": 0.46396633350867966, "grad_norm": 2.119264841079712, "learning_rate": 4.94121103675663e-06, "loss": 1.063, "step": 882 }, { "epoch": 0.46449237243556024, "grad_norm": 2.267575979232788, "learning_rate": 4.941061179181896e-06, "loss": 1.0698, "step": 883 }, { "epoch": 0.4650184113624408, "grad_norm": 2.2345592975616455, "learning_rate": 4.940911133129257e-06, "loss": 1.0898, "step": 884 }, { "epoch": 0.4655444502893214, "grad_norm": 2.175180673599243, "learning_rate": 4.940760898610299e-06, "loss": 1.0915, "step": 885 }, { "epoch": 0.466070489216202, "grad_norm": 2.036628246307373, "learning_rate": 4.940610475636621e-06, "loss": 1.0981, "step": 886 }, { "epoch": 0.4665965281430826, "grad_norm": 2.193129539489746, "learning_rate": 4.9404598642198386e-06, "loss": 1.1237, "step": 887 }, { "epoch": 0.4671225670699632, "grad_norm": 1.920074462890625, "learning_rate": 4.9403090643715804e-06, "loss": 1.0358, "step": 888 }, { "epoch": 0.46764860599684377, "grad_norm": 2.0745346546173096, "learning_rate": 4.940158076103489e-06, "loss": 1.0487, "step": 889 }, { "epoch": 0.46817464492372435, "grad_norm": 1.9645469188690186, "learning_rate": 4.940006899427225e-06, "loss": 1.0256, "step": 890 }, { "epoch": 0.4687006838506049, "grad_norm": 1.9696778059005737, "learning_rate": 4.939855534354458e-06, "loss": 1.0302, "step": 891 }, { "epoch": 0.46922672277748556, "grad_norm": 2.1893057823181152, "learning_rate": 4.939703980896875e-06, "loss": 1.0391, "step": 892 }, { "epoch": 0.46975276170436614, "grad_norm": 2.0537021160125732, "learning_rate": 4.93955223906618e-06, "loss": 1.0498, "step": 893 }, { "epoch": 0.4702788006312467, "grad_norm": 2.4528138637542725, "learning_rate": 4.9394003088740875e-06, "loss": 1.0393, "step": 894 }, { "epoch": 0.4708048395581273, "grad_norm": 2.2085723876953125, "learning_rate": 4.93924819033233e-06, "loss": 1.0789, "step": 895 }, { "epoch": 0.4713308784850079, "grad_norm": 2.0029642581939697, "learning_rate": 4.9390958834526504e-06, "loss": 1.0621, "step": 896 }, { "epoch": 0.47185691741188845, "grad_norm": 2.0400004386901855, "learning_rate": 4.93894338824681e-06, "loss": 1.0426, "step": 897 }, { "epoch": 0.4723829563387691, "grad_norm": 2.3174595832824707, "learning_rate": 4.9387907047265825e-06, "loss": 1.0273, "step": 898 }, { "epoch": 0.47290899526564967, "grad_norm": 1.998889446258545, "learning_rate": 4.938637832903758e-06, "loss": 1.0401, "step": 899 }, { "epoch": 0.47343503419253025, "grad_norm": 2.0847246646881104, "learning_rate": 4.93848477279014e-06, "loss": 1.0677, "step": 900 }, { "epoch": 0.4739610731194108, "grad_norm": 2.086249351501465, "learning_rate": 4.938331524397544e-06, "loss": 1.043, "step": 901 }, { "epoch": 0.4744871120462914, "grad_norm": 2.1909382343292236, "learning_rate": 4.938178087737805e-06, "loss": 0.9977, "step": 902 }, { "epoch": 0.47501315097317204, "grad_norm": 2.066394567489624, "learning_rate": 4.938024462822769e-06, "loss": 1.044, "step": 903 }, { "epoch": 0.4755391899000526, "grad_norm": 2.1768858432769775, "learning_rate": 4.937870649664299e-06, "loss": 0.9886, "step": 904 }, { "epoch": 0.4760652288269332, "grad_norm": 2.0450236797332764, "learning_rate": 4.937716648274269e-06, "loss": 1.0471, "step": 905 }, { "epoch": 0.4765912677538138, "grad_norm": 2.218719720840454, "learning_rate": 4.937562458664571e-06, "loss": 1.0324, "step": 906 }, { "epoch": 0.47711730668069435, "grad_norm": 2.2519423961639404, "learning_rate": 4.937408080847109e-06, "loss": 1.0899, "step": 907 }, { "epoch": 0.477643345607575, "grad_norm": 2.045959234237671, "learning_rate": 4.9372535148338055e-06, "loss": 1.0383, "step": 908 }, { "epoch": 0.47816938453445557, "grad_norm": 2.1137306690216064, "learning_rate": 4.937098760636591e-06, "loss": 1.0223, "step": 909 }, { "epoch": 0.47869542346133614, "grad_norm": 2.2585835456848145, "learning_rate": 4.936943818267418e-06, "loss": 1.027, "step": 910 }, { "epoch": 0.4792214623882167, "grad_norm": 2.161625862121582, "learning_rate": 4.936788687738247e-06, "loss": 1.0318, "step": 911 }, { "epoch": 0.4797475013150973, "grad_norm": 2.0743277072906494, "learning_rate": 4.936633369061057e-06, "loss": 1.1014, "step": 912 }, { "epoch": 0.4802735402419779, "grad_norm": 2.1271307468414307, "learning_rate": 4.936477862247841e-06, "loss": 1.0403, "step": 913 }, { "epoch": 0.4807995791688585, "grad_norm": 2.0820491313934326, "learning_rate": 4.9363221673106046e-06, "loss": 1.069, "step": 914 }, { "epoch": 0.4813256180957391, "grad_norm": 2.0069093704223633, "learning_rate": 4.936166284261369e-06, "loss": 1.0752, "step": 915 }, { "epoch": 0.4818516570226197, "grad_norm": 2.2541720867156982, "learning_rate": 4.936010213112172e-06, "loss": 1.0309, "step": 916 }, { "epoch": 0.48237769594950025, "grad_norm": 2.155980110168457, "learning_rate": 4.9358539538750636e-06, "loss": 1.0078, "step": 917 }, { "epoch": 0.48290373487638083, "grad_norm": 2.217339038848877, "learning_rate": 4.935697506562107e-06, "loss": 1.0522, "step": 918 }, { "epoch": 0.48342977380326146, "grad_norm": 1.963270902633667, "learning_rate": 4.935540871185384e-06, "loss": 1.0692, "step": 919 }, { "epoch": 0.48395581273014204, "grad_norm": 1.9923917055130005, "learning_rate": 4.935384047756987e-06, "loss": 1.0926, "step": 920 }, { "epoch": 0.4844818516570226, "grad_norm": 2.177624464035034, "learning_rate": 4.935227036289026e-06, "loss": 1.0727, "step": 921 }, { "epoch": 0.4850078905839032, "grad_norm": 2.022496461868286, "learning_rate": 4.935069836793622e-06, "loss": 1.0267, "step": 922 }, { "epoch": 0.4855339295107838, "grad_norm": 2.0110666751861572, "learning_rate": 4.9349124492829155e-06, "loss": 1.0911, "step": 923 }, { "epoch": 0.4860599684376644, "grad_norm": 2.1780877113342285, "learning_rate": 4.934754873769057e-06, "loss": 1.0494, "step": 924 }, { "epoch": 0.486586007364545, "grad_norm": 2.0291390419006348, "learning_rate": 4.934597110264212e-06, "loss": 1.0485, "step": 925 }, { "epoch": 0.48711204629142557, "grad_norm": 1.947896957397461, "learning_rate": 4.9344391587805626e-06, "loss": 1.0789, "step": 926 }, { "epoch": 0.48763808521830615, "grad_norm": 1.9520971775054932, "learning_rate": 4.934281019330305e-06, "loss": 1.0644, "step": 927 }, { "epoch": 0.48816412414518673, "grad_norm": 2.0348432064056396, "learning_rate": 4.93412269192565e-06, "loss": 1.0471, "step": 928 }, { "epoch": 0.4886901630720673, "grad_norm": 2.214876651763916, "learning_rate": 4.93396417657882e-06, "loss": 1.0921, "step": 929 }, { "epoch": 0.48921620199894794, "grad_norm": 1.9910991191864014, "learning_rate": 4.933805473302057e-06, "loss": 1.0962, "step": 930 }, { "epoch": 0.4897422409258285, "grad_norm": 2.0497536659240723, "learning_rate": 4.933646582107612e-06, "loss": 1.0502, "step": 931 }, { "epoch": 0.4902682798527091, "grad_norm": 2.102994203567505, "learning_rate": 4.933487503007756e-06, "loss": 1.0676, "step": 932 }, { "epoch": 0.4907943187795897, "grad_norm": 1.885666012763977, "learning_rate": 4.933328236014768e-06, "loss": 1.0005, "step": 933 }, { "epoch": 0.49132035770647026, "grad_norm": 2.1525766849517822, "learning_rate": 4.933168781140949e-06, "loss": 1.0997, "step": 934 }, { "epoch": 0.4918463966333509, "grad_norm": 2.0346620082855225, "learning_rate": 4.9330091383986086e-06, "loss": 1.0651, "step": 935 }, { "epoch": 0.49237243556023147, "grad_norm": 2.0436878204345703, "learning_rate": 4.932849307800074e-06, "loss": 1.0539, "step": 936 }, { "epoch": 0.49289847448711205, "grad_norm": 2.1023032665252686, "learning_rate": 4.932689289357686e-06, "loss": 1.0583, "step": 937 }, { "epoch": 0.4934245134139926, "grad_norm": 2.0781443119049072, "learning_rate": 4.932529083083798e-06, "loss": 1.0753, "step": 938 }, { "epoch": 0.4939505523408732, "grad_norm": 2.0385992527008057, "learning_rate": 4.932368688990783e-06, "loss": 1.0165, "step": 939 }, { "epoch": 0.49447659126775384, "grad_norm": 2.350186586380005, "learning_rate": 4.932208107091022e-06, "loss": 1.0834, "step": 940 }, { "epoch": 0.4950026301946344, "grad_norm": 2.2009286880493164, "learning_rate": 4.932047337396917e-06, "loss": 1.0975, "step": 941 }, { "epoch": 0.495528669121515, "grad_norm": 2.389380931854248, "learning_rate": 4.931886379920878e-06, "loss": 1.0853, "step": 942 }, { "epoch": 0.4960547080483956, "grad_norm": 2.016162157058716, "learning_rate": 4.931725234675334e-06, "loss": 1.039, "step": 943 }, { "epoch": 0.49658074697527615, "grad_norm": 2.116718292236328, "learning_rate": 4.9315639016727286e-06, "loss": 1.0182, "step": 944 }, { "epoch": 0.49710678590215673, "grad_norm": 2.1381125450134277, "learning_rate": 4.931402380925517e-06, "loss": 1.1051, "step": 945 }, { "epoch": 0.49763282482903737, "grad_norm": 2.0954737663269043, "learning_rate": 4.931240672446171e-06, "loss": 1.038, "step": 946 }, { "epoch": 0.49815886375591795, "grad_norm": 2.167865037918091, "learning_rate": 4.931078776247176e-06, "loss": 1.0998, "step": 947 }, { "epoch": 0.4986849026827985, "grad_norm": 2.1278021335601807, "learning_rate": 4.930916692341034e-06, "loss": 1.0374, "step": 948 }, { "epoch": 0.4992109416096791, "grad_norm": 2.088512420654297, "learning_rate": 4.9307544207402565e-06, "loss": 1.0954, "step": 949 }, { "epoch": 0.4997369805365597, "grad_norm": 2.015916109085083, "learning_rate": 4.930591961457375e-06, "loss": 1.0163, "step": 950 }, { "epoch": 0.5002630194634403, "grad_norm": 2.0662143230438232, "learning_rate": 4.930429314504933e-06, "loss": 1.0968, "step": 951 }, { "epoch": 0.5007890583903208, "grad_norm": 2.0692410469055176, "learning_rate": 4.930266479895488e-06, "loss": 1.0772, "step": 952 }, { "epoch": 0.5013150973172015, "grad_norm": 2.0734803676605225, "learning_rate": 4.930103457641613e-06, "loss": 1.1096, "step": 953 }, { "epoch": 0.5018411362440821, "grad_norm": 2.167228937149048, "learning_rate": 4.929940247755896e-06, "loss": 1.0608, "step": 954 }, { "epoch": 0.5023671751709626, "grad_norm": 2.272087574005127, "learning_rate": 4.929776850250937e-06, "loss": 1.0825, "step": 955 }, { "epoch": 0.5028932140978433, "grad_norm": 2.0937726497650146, "learning_rate": 4.929613265139354e-06, "loss": 1.0651, "step": 956 }, { "epoch": 0.5034192530247238, "grad_norm": 2.168090343475342, "learning_rate": 4.929449492433777e-06, "loss": 1.0821, "step": 957 }, { "epoch": 0.5039452919516044, "grad_norm": 2.0708675384521484, "learning_rate": 4.92928553214685e-06, "loss": 1.0655, "step": 958 }, { "epoch": 0.5044713308784851, "grad_norm": 2.067678689956665, "learning_rate": 4.929121384291234e-06, "loss": 1.05, "step": 959 }, { "epoch": 0.5049973698053656, "grad_norm": 1.9181219339370728, "learning_rate": 4.928957048879602e-06, "loss": 0.9935, "step": 960 }, { "epoch": 0.5055234087322462, "grad_norm": 2.217785358428955, "learning_rate": 4.928792525924644e-06, "loss": 0.97, "step": 961 }, { "epoch": 0.5060494476591267, "grad_norm": 2.084656238555908, "learning_rate": 4.928627815439062e-06, "loss": 1.0541, "step": 962 }, { "epoch": 0.5065754865860074, "grad_norm": 2.035367727279663, "learning_rate": 4.928462917435574e-06, "loss": 1.0694, "step": 963 }, { "epoch": 0.507101525512888, "grad_norm": 2.001654624938965, "learning_rate": 4.928297831926912e-06, "loss": 1.0232, "step": 964 }, { "epoch": 0.5076275644397685, "grad_norm": 2.57733154296875, "learning_rate": 4.928132558925822e-06, "loss": 1.0664, "step": 965 }, { "epoch": 0.5081536033666492, "grad_norm": 2.1757423877716064, "learning_rate": 4.927967098445066e-06, "loss": 1.1119, "step": 966 }, { "epoch": 0.5086796422935297, "grad_norm": 2.089594602584839, "learning_rate": 4.927801450497417e-06, "loss": 1.0212, "step": 967 }, { "epoch": 0.5092056812204103, "grad_norm": 2.078519821166992, "learning_rate": 4.927635615095668e-06, "loss": 1.0381, "step": 968 }, { "epoch": 0.5097317201472908, "grad_norm": 2.0807132720947266, "learning_rate": 4.927469592252621e-06, "loss": 1.0272, "step": 969 }, { "epoch": 0.5102577590741715, "grad_norm": 2.1806020736694336, "learning_rate": 4.927303381981098e-06, "loss": 1.0846, "step": 970 }, { "epoch": 0.5107837980010521, "grad_norm": 2.1434948444366455, "learning_rate": 4.927136984293928e-06, "loss": 1.0775, "step": 971 }, { "epoch": 0.5113098369279326, "grad_norm": 2.000924825668335, "learning_rate": 4.926970399203962e-06, "loss": 1.0272, "step": 972 }, { "epoch": 0.5118358758548133, "grad_norm": 2.1742711067199707, "learning_rate": 4.926803626724062e-06, "loss": 1.0253, "step": 973 }, { "epoch": 0.5123619147816938, "grad_norm": 2.1074674129486084, "learning_rate": 4.926636666867103e-06, "loss": 1.0146, "step": 974 }, { "epoch": 0.5128879537085744, "grad_norm": 2.1562392711639404, "learning_rate": 4.926469519645976e-06, "loss": 1.0364, "step": 975 }, { "epoch": 0.5134139926354551, "grad_norm": 2.4177775382995605, "learning_rate": 4.926302185073591e-06, "loss": 1.0658, "step": 976 }, { "epoch": 0.5139400315623356, "grad_norm": 2.322571277618408, "learning_rate": 4.9261346631628635e-06, "loss": 1.0489, "step": 977 }, { "epoch": 0.5144660704892162, "grad_norm": 2.0937836170196533, "learning_rate": 4.925966953926729e-06, "loss": 1.0535, "step": 978 }, { "epoch": 0.5149921094160967, "grad_norm": 2.065213680267334, "learning_rate": 4.925799057378139e-06, "loss": 1.0097, "step": 979 }, { "epoch": 0.5155181483429774, "grad_norm": 2.0844249725341797, "learning_rate": 4.925630973530054e-06, "loss": 1.0719, "step": 980 }, { "epoch": 0.516044187269858, "grad_norm": 2.4148666858673096, "learning_rate": 4.925462702395454e-06, "loss": 1.0906, "step": 981 }, { "epoch": 0.5165702261967385, "grad_norm": 2.071423053741455, "learning_rate": 4.925294243987331e-06, "loss": 1.03, "step": 982 }, { "epoch": 0.5170962651236192, "grad_norm": 2.2397513389587402, "learning_rate": 4.9251255983186915e-06, "loss": 1.0412, "step": 983 }, { "epoch": 0.5176223040504997, "grad_norm": 2.171269655227661, "learning_rate": 4.924956765402557e-06, "loss": 1.1, "step": 984 }, { "epoch": 0.5181483429773803, "grad_norm": 2.1705877780914307, "learning_rate": 4.924787745251963e-06, "loss": 1.0534, "step": 985 }, { "epoch": 0.518674381904261, "grad_norm": 2.178514003753662, "learning_rate": 4.924618537879961e-06, "loss": 1.0759, "step": 986 }, { "epoch": 0.5192004208311415, "grad_norm": 2.072097063064575, "learning_rate": 4.924449143299614e-06, "loss": 1.0321, "step": 987 }, { "epoch": 0.5197264597580221, "grad_norm": 2.124030351638794, "learning_rate": 4.924279561524004e-06, "loss": 1.0465, "step": 988 }, { "epoch": 0.5202524986849026, "grad_norm": 2.0927019119262695, "learning_rate": 4.924109792566222e-06, "loss": 1.0716, "step": 989 }, { "epoch": 0.5207785376117833, "grad_norm": 2.0673232078552246, "learning_rate": 4.923939836439377e-06, "loss": 1.0628, "step": 990 }, { "epoch": 0.5213045765386639, "grad_norm": 2.2538528442382812, "learning_rate": 4.92376969315659e-06, "loss": 1.0687, "step": 991 }, { "epoch": 0.5218306154655444, "grad_norm": 2.120530366897583, "learning_rate": 4.923599362731001e-06, "loss": 1.0893, "step": 992 }, { "epoch": 0.5223566543924251, "grad_norm": 2.0750699043273926, "learning_rate": 4.92342884517576e-06, "loss": 1.0016, "step": 993 }, { "epoch": 0.5228826933193056, "grad_norm": 1.9984569549560547, "learning_rate": 4.923258140504032e-06, "loss": 1.0326, "step": 994 }, { "epoch": 0.5234087322461862, "grad_norm": 2.201758861541748, "learning_rate": 4.923087248729e-06, "loss": 1.0413, "step": 995 }, { "epoch": 0.5239347711730669, "grad_norm": 2.1322648525238037, "learning_rate": 4.922916169863855e-06, "loss": 1.0505, "step": 996 }, { "epoch": 0.5244608100999474, "grad_norm": 2.0557119846343994, "learning_rate": 4.922744903921809e-06, "loss": 0.9761, "step": 997 }, { "epoch": 0.524986849026828, "grad_norm": 2.0989720821380615, "learning_rate": 4.922573450916086e-06, "loss": 1.0436, "step": 998 }, { "epoch": 0.5255128879537085, "grad_norm": 2.152665138244629, "learning_rate": 4.922401810859922e-06, "loss": 1.0567, "step": 999 }, { "epoch": 0.5260389268805892, "grad_norm": 1.959796667098999, "learning_rate": 4.922229983766571e-06, "loss": 1.0694, "step": 1000 }, { "epoch": 0.5265649658074697, "grad_norm": 2.121493101119995, "learning_rate": 4.9220579696493e-06, "loss": 1.1024, "step": 1001 }, { "epoch": 0.5270910047343503, "grad_norm": 1.9629384279251099, "learning_rate": 4.92188576852139e-06, "loss": 1.0538, "step": 1002 }, { "epoch": 0.527617043661231, "grad_norm": 2.396224021911621, "learning_rate": 4.921713380396137e-06, "loss": 1.0711, "step": 1003 }, { "epoch": 0.5281430825881115, "grad_norm": 2.1571781635284424, "learning_rate": 4.921540805286852e-06, "loss": 1.0663, "step": 1004 }, { "epoch": 0.5286691215149921, "grad_norm": 2.032282590866089, "learning_rate": 4.921368043206858e-06, "loss": 1.0658, "step": 1005 }, { "epoch": 0.5291951604418726, "grad_norm": 1.9589232206344604, "learning_rate": 4.921195094169496e-06, "loss": 0.9755, "step": 1006 }, { "epoch": 0.5297211993687533, "grad_norm": 1.9304051399230957, "learning_rate": 4.92102195818812e-06, "loss": 1.011, "step": 1007 }, { "epoch": 0.5302472382956339, "grad_norm": 2.306674003601074, "learning_rate": 4.920848635276096e-06, "loss": 1.0626, "step": 1008 }, { "epoch": 0.5307732772225144, "grad_norm": 2.156906843185425, "learning_rate": 4.920675125446809e-06, "loss": 1.0107, "step": 1009 }, { "epoch": 0.5312993161493951, "grad_norm": 2.2959272861480713, "learning_rate": 4.9205014287136535e-06, "loss": 1.0527, "step": 1010 }, { "epoch": 0.5318253550762756, "grad_norm": 2.046900510787964, "learning_rate": 4.9203275450900426e-06, "loss": 1.0154, "step": 1011 }, { "epoch": 0.5323513940031562, "grad_norm": 1.9947476387023926, "learning_rate": 4.920153474589401e-06, "loss": 1.0456, "step": 1012 }, { "epoch": 0.5328774329300369, "grad_norm": 2.3516438007354736, "learning_rate": 4.919979217225169e-06, "loss": 1.0982, "step": 1013 }, { "epoch": 0.5334034718569174, "grad_norm": 2.5909998416900635, "learning_rate": 4.919804773010802e-06, "loss": 1.0436, "step": 1014 }, { "epoch": 0.533929510783798, "grad_norm": 2.5206117630004883, "learning_rate": 4.91963014195977e-06, "loss": 1.0405, "step": 1015 }, { "epoch": 0.5344555497106785, "grad_norm": 2.21992826461792, "learning_rate": 4.919455324085554e-06, "loss": 1.0594, "step": 1016 }, { "epoch": 0.5349815886375592, "grad_norm": 2.2773711681365967, "learning_rate": 4.919280319401654e-06, "loss": 1.0501, "step": 1017 }, { "epoch": 0.5355076275644398, "grad_norm": 2.075962543487549, "learning_rate": 4.919105127921582e-06, "loss": 1.0052, "step": 1018 }, { "epoch": 0.5360336664913203, "grad_norm": 2.108670473098755, "learning_rate": 4.9189297496588636e-06, "loss": 1.0675, "step": 1019 }, { "epoch": 0.536559705418201, "grad_norm": 2.125927209854126, "learning_rate": 4.918754184627041e-06, "loss": 1.0912, "step": 1020 }, { "epoch": 0.5370857443450815, "grad_norm": 2.1099467277526855, "learning_rate": 4.91857843283967e-06, "loss": 1.0424, "step": 1021 }, { "epoch": 0.5376117832719621, "grad_norm": 2.0880467891693115, "learning_rate": 4.918402494310319e-06, "loss": 1.061, "step": 1022 }, { "epoch": 0.5381378221988428, "grad_norm": 2.1544101238250732, "learning_rate": 4.918226369052575e-06, "loss": 1.0608, "step": 1023 }, { "epoch": 0.5386638611257233, "grad_norm": 2.213214635848999, "learning_rate": 4.918050057080036e-06, "loss": 1.1368, "step": 1024 }, { "epoch": 0.5391899000526039, "grad_norm": 2.062903642654419, "learning_rate": 4.917873558406315e-06, "loss": 1.0861, "step": 1025 }, { "epoch": 0.5397159389794844, "grad_norm": 1.9643436670303345, "learning_rate": 4.917696873045039e-06, "loss": 1.0008, "step": 1026 }, { "epoch": 0.5402419779063651, "grad_norm": 2.276639699935913, "learning_rate": 4.917520001009851e-06, "loss": 0.9812, "step": 1027 }, { "epoch": 0.5407680168332457, "grad_norm": 2.1487631797790527, "learning_rate": 4.917342942314407e-06, "loss": 1.0603, "step": 1028 }, { "epoch": 0.5412940557601262, "grad_norm": 2.1040542125701904, "learning_rate": 4.917165696972379e-06, "loss": 1.0425, "step": 1029 }, { "epoch": 0.5418200946870069, "grad_norm": 2.214475154876709, "learning_rate": 4.916988264997452e-06, "loss": 1.032, "step": 1030 }, { "epoch": 0.5423461336138874, "grad_norm": 2.154320001602173, "learning_rate": 4.916810646403325e-06, "loss": 1.0371, "step": 1031 }, { "epoch": 0.542872172540768, "grad_norm": 2.1565327644348145, "learning_rate": 4.916632841203714e-06, "loss": 1.0866, "step": 1032 }, { "epoch": 0.5433982114676486, "grad_norm": 2.197402238845825, "learning_rate": 4.916454849412344e-06, "loss": 1.0531, "step": 1033 }, { "epoch": 0.5439242503945292, "grad_norm": 2.0249993801116943, "learning_rate": 4.916276671042962e-06, "loss": 1.0485, "step": 1034 }, { "epoch": 0.5444502893214098, "grad_norm": 2.077765703201294, "learning_rate": 4.916098306109323e-06, "loss": 1.0731, "step": 1035 }, { "epoch": 0.5449763282482903, "grad_norm": 2.0669186115264893, "learning_rate": 4.915919754625199e-06, "loss": 1.0912, "step": 1036 }, { "epoch": 0.545502367175171, "grad_norm": 2.160076379776001, "learning_rate": 4.915741016604378e-06, "loss": 1.0523, "step": 1037 }, { "epoch": 0.5460284061020515, "grad_norm": 1.8992373943328857, "learning_rate": 4.915562092060659e-06, "loss": 1.0185, "step": 1038 }, { "epoch": 0.5465544450289321, "grad_norm": 2.0712900161743164, "learning_rate": 4.915382981007857e-06, "loss": 1.0581, "step": 1039 }, { "epoch": 0.5470804839558128, "grad_norm": 2.2600317001342773, "learning_rate": 4.915203683459802e-06, "loss": 1.0154, "step": 1040 }, { "epoch": 0.5476065228826933, "grad_norm": 2.050366163253784, "learning_rate": 4.915024199430338e-06, "loss": 1.0371, "step": 1041 }, { "epoch": 0.5481325618095739, "grad_norm": 2.208393096923828, "learning_rate": 4.914844528933322e-06, "loss": 1.0767, "step": 1042 }, { "epoch": 0.5486586007364544, "grad_norm": 2.1388466358184814, "learning_rate": 4.914664671982629e-06, "loss": 1.1074, "step": 1043 }, { "epoch": 0.5491846396633351, "grad_norm": 2.253007411956787, "learning_rate": 4.914484628592144e-06, "loss": 1.0455, "step": 1044 }, { "epoch": 0.5497106785902157, "grad_norm": 2.2380669116973877, "learning_rate": 4.9143043987757684e-06, "loss": 1.0581, "step": 1045 }, { "epoch": 0.5502367175170962, "grad_norm": 2.136256456375122, "learning_rate": 4.914123982547419e-06, "loss": 1.0588, "step": 1046 }, { "epoch": 0.5507627564439769, "grad_norm": 2.0044257640838623, "learning_rate": 4.913943379921025e-06, "loss": 0.9918, "step": 1047 }, { "epoch": 0.5512887953708574, "grad_norm": 2.089315414428711, "learning_rate": 4.913762590910533e-06, "loss": 1.0675, "step": 1048 }, { "epoch": 0.551814834297738, "grad_norm": 2.048976182937622, "learning_rate": 4.9135816155298985e-06, "loss": 1.0259, "step": 1049 }, { "epoch": 0.5523408732246187, "grad_norm": 2.273501396179199, "learning_rate": 4.913400453793098e-06, "loss": 1.0743, "step": 1050 }, { "epoch": 0.5528669121514992, "grad_norm": 2.0761802196502686, "learning_rate": 4.913219105714117e-06, "loss": 1.0199, "step": 1051 }, { "epoch": 0.5533929510783798, "grad_norm": 1.9552183151245117, "learning_rate": 4.913037571306961e-06, "loss": 1.0582, "step": 1052 }, { "epoch": 0.5539189900052603, "grad_norm": 2.0593061447143555, "learning_rate": 4.9128558505856425e-06, "loss": 0.9626, "step": 1053 }, { "epoch": 0.554445028932141, "grad_norm": 2.026820659637451, "learning_rate": 4.9126739435641955e-06, "loss": 1.0253, "step": 1054 }, { "epoch": 0.5549710678590216, "grad_norm": 2.22835111618042, "learning_rate": 4.9124918502566635e-06, "loss": 1.0176, "step": 1055 }, { "epoch": 0.5554971067859021, "grad_norm": 1.9653559923171997, "learning_rate": 4.9123095706771064e-06, "loss": 0.9886, "step": 1056 }, { "epoch": 0.5560231457127828, "grad_norm": 2.083310842514038, "learning_rate": 4.912127104839599e-06, "loss": 1.0105, "step": 1057 }, { "epoch": 0.5565491846396633, "grad_norm": 2.1681482791900635, "learning_rate": 4.91194445275823e-06, "loss": 1.0359, "step": 1058 }, { "epoch": 0.5570752235665439, "grad_norm": 1.990717887878418, "learning_rate": 4.911761614447101e-06, "loss": 1.0369, "step": 1059 }, { "epoch": 0.5576012624934246, "grad_norm": 2.159813642501831, "learning_rate": 4.91157858992033e-06, "loss": 1.0341, "step": 1060 }, { "epoch": 0.5581273014203051, "grad_norm": 1.9474655389785767, "learning_rate": 4.911395379192048e-06, "loss": 1.0432, "step": 1061 }, { "epoch": 0.5586533403471857, "grad_norm": 2.140634536743164, "learning_rate": 4.911211982276402e-06, "loss": 1.0485, "step": 1062 }, { "epoch": 0.5591793792740662, "grad_norm": 2.2925636768341064, "learning_rate": 4.911028399187552e-06, "loss": 1.0648, "step": 1063 }, { "epoch": 0.5597054182009469, "grad_norm": 2.037755250930786, "learning_rate": 4.910844629939672e-06, "loss": 1.0568, "step": 1064 }, { "epoch": 0.5602314571278275, "grad_norm": 1.997471809387207, "learning_rate": 4.910660674546951e-06, "loss": 1.0109, "step": 1065 }, { "epoch": 0.560757496054708, "grad_norm": 2.109219551086426, "learning_rate": 4.910476533023593e-06, "loss": 1.0658, "step": 1066 }, { "epoch": 0.5612835349815887, "grad_norm": 2.10469388961792, "learning_rate": 4.9102922053838175e-06, "loss": 1.0612, "step": 1067 }, { "epoch": 0.5618095739084692, "grad_norm": 2.2748658657073975, "learning_rate": 4.9101076916418535e-06, "loss": 1.0422, "step": 1068 }, { "epoch": 0.5623356128353498, "grad_norm": 2.0472326278686523, "learning_rate": 4.90992299181195e-06, "loss": 1.0339, "step": 1069 }, { "epoch": 0.5628616517622304, "grad_norm": 2.0694494247436523, "learning_rate": 4.909738105908367e-06, "loss": 1.0083, "step": 1070 }, { "epoch": 0.563387690689111, "grad_norm": 2.0032498836517334, "learning_rate": 4.909553033945379e-06, "loss": 0.9889, "step": 1071 }, { "epoch": 0.5639137296159916, "grad_norm": 2.091393232345581, "learning_rate": 4.909367775937278e-06, "loss": 1.0856, "step": 1072 }, { "epoch": 0.5644397685428721, "grad_norm": 2.0542173385620117, "learning_rate": 4.909182331898366e-06, "loss": 1.0422, "step": 1073 }, { "epoch": 0.5649658074697528, "grad_norm": 2.009228467941284, "learning_rate": 4.908996701842962e-06, "loss": 1.0594, "step": 1074 }, { "epoch": 0.5654918463966333, "grad_norm": 1.9546911716461182, "learning_rate": 4.9088108857853985e-06, "loss": 1.0691, "step": 1075 }, { "epoch": 0.5660178853235139, "grad_norm": 2.1657440662384033, "learning_rate": 4.908624883740023e-06, "loss": 1.0252, "step": 1076 }, { "epoch": 0.5665439242503946, "grad_norm": 2.151035785675049, "learning_rate": 4.9084386957211975e-06, "loss": 1.0587, "step": 1077 }, { "epoch": 0.5670699631772751, "grad_norm": 2.299673557281494, "learning_rate": 4.908252321743296e-06, "loss": 1.0221, "step": 1078 }, { "epoch": 0.5675960021041557, "grad_norm": 2.0144848823547363, "learning_rate": 4.908065761820711e-06, "loss": 1.0256, "step": 1079 }, { "epoch": 0.5681220410310363, "grad_norm": 2.172971725463867, "learning_rate": 4.907879015967846e-06, "loss": 1.0231, "step": 1080 }, { "epoch": 0.5686480799579169, "grad_norm": 2.0427041053771973, "learning_rate": 4.907692084199119e-06, "loss": 1.0433, "step": 1081 }, { "epoch": 0.5691741188847975, "grad_norm": 2.1561834812164307, "learning_rate": 4.907504966528966e-06, "loss": 1.0478, "step": 1082 }, { "epoch": 0.569700157811678, "grad_norm": 2.012385606765747, "learning_rate": 4.907317662971831e-06, "loss": 1.0703, "step": 1083 }, { "epoch": 0.5702261967385587, "grad_norm": 2.137075424194336, "learning_rate": 4.907130173542179e-06, "loss": 1.0527, "step": 1084 }, { "epoch": 0.5707522356654392, "grad_norm": 2.039424180984497, "learning_rate": 4.906942498254485e-06, "loss": 0.9969, "step": 1085 }, { "epoch": 0.5712782745923198, "grad_norm": 2.0207748413085938, "learning_rate": 4.90675463712324e-06, "loss": 1.0157, "step": 1086 }, { "epoch": 0.5718043135192005, "grad_norm": 2.024454116821289, "learning_rate": 4.906566590162949e-06, "loss": 1.0699, "step": 1087 }, { "epoch": 0.572330352446081, "grad_norm": 2.256537675857544, "learning_rate": 4.90637835738813e-06, "loss": 1.1083, "step": 1088 }, { "epoch": 0.5728563913729616, "grad_norm": 2.099698543548584, "learning_rate": 4.90618993881332e-06, "loss": 1.0242, "step": 1089 }, { "epoch": 0.5733824302998421, "grad_norm": 2.0367214679718018, "learning_rate": 4.906001334453064e-06, "loss": 1.0088, "step": 1090 }, { "epoch": 0.5739084692267228, "grad_norm": 1.9988690614700317, "learning_rate": 4.9058125443219245e-06, "loss": 1.044, "step": 1091 }, { "epoch": 0.5744345081536034, "grad_norm": 1.9970273971557617, "learning_rate": 4.9056235684344805e-06, "loss": 1.0847, "step": 1092 }, { "epoch": 0.5749605470804839, "grad_norm": 2.152602434158325, "learning_rate": 4.905434406805322e-06, "loss": 1.0931, "step": 1093 }, { "epoch": 0.5754865860073646, "grad_norm": 2.0728707313537598, "learning_rate": 4.905245059449053e-06, "loss": 1.0401, "step": 1094 }, { "epoch": 0.5760126249342451, "grad_norm": 1.94095778465271, "learning_rate": 4.9050555263802954e-06, "loss": 1.0262, "step": 1095 }, { "epoch": 0.5765386638611257, "grad_norm": 2.126347780227661, "learning_rate": 4.904865807613683e-06, "loss": 1.0678, "step": 1096 }, { "epoch": 0.5770647027880064, "grad_norm": 2.085378646850586, "learning_rate": 4.904675903163864e-06, "loss": 1.0665, "step": 1097 }, { "epoch": 0.5775907417148869, "grad_norm": 2.2276804447174072, "learning_rate": 4.9044858130454995e-06, "loss": 1.0718, "step": 1098 }, { "epoch": 0.5781167806417675, "grad_norm": 2.2318899631500244, "learning_rate": 4.904295537273269e-06, "loss": 1.0663, "step": 1099 }, { "epoch": 0.578642819568648, "grad_norm": 2.0555522441864014, "learning_rate": 4.904105075861864e-06, "loss": 0.9989, "step": 1100 }, { "epoch": 0.5791688584955287, "grad_norm": 2.094501256942749, "learning_rate": 4.9039144288259876e-06, "loss": 1.0802, "step": 1101 }, { "epoch": 0.5796948974224092, "grad_norm": 2.7403769493103027, "learning_rate": 4.903723596180363e-06, "loss": 1.0024, "step": 1102 }, { "epoch": 0.5802209363492898, "grad_norm": 2.1775436401367188, "learning_rate": 4.9035325779397225e-06, "loss": 1.0234, "step": 1103 }, { "epoch": 0.5807469752761705, "grad_norm": 2.2489676475524902, "learning_rate": 4.903341374118816e-06, "loss": 1.0188, "step": 1104 }, { "epoch": 0.581273014203051, "grad_norm": 2.2214367389678955, "learning_rate": 4.903149984732407e-06, "loss": 1.0835, "step": 1105 }, { "epoch": 0.5817990531299316, "grad_norm": 2.203273296356201, "learning_rate": 4.902958409795272e-06, "loss": 1.0547, "step": 1106 }, { "epoch": 0.5823250920568122, "grad_norm": 2.1076622009277344, "learning_rate": 4.902766649322204e-06, "loss": 1.0571, "step": 1107 }, { "epoch": 0.5828511309836928, "grad_norm": 2.1270394325256348, "learning_rate": 4.902574703328007e-06, "loss": 0.9863, "step": 1108 }, { "epoch": 0.5833771699105734, "grad_norm": 2.1030006408691406, "learning_rate": 4.902382571827503e-06, "loss": 1.0404, "step": 1109 }, { "epoch": 0.583903208837454, "grad_norm": 2.1046831607818604, "learning_rate": 4.9021902548355275e-06, "loss": 1.018, "step": 1110 }, { "epoch": 0.5844292477643346, "grad_norm": 2.0193376541137695, "learning_rate": 4.901997752366927e-06, "loss": 1.0035, "step": 1111 }, { "epoch": 0.5849552866912151, "grad_norm": 2.0812923908233643, "learning_rate": 4.9018050644365675e-06, "loss": 0.9928, "step": 1112 }, { "epoch": 0.5854813256180957, "grad_norm": 2.035750150680542, "learning_rate": 4.901612191059325e-06, "loss": 1.0658, "step": 1113 }, { "epoch": 0.5860073645449764, "grad_norm": 2.093606948852539, "learning_rate": 4.901419132250093e-06, "loss": 1.0019, "step": 1114 }, { "epoch": 0.5865334034718569, "grad_norm": 2.4018402099609375, "learning_rate": 4.901225888023776e-06, "loss": 1.0785, "step": 1115 }, { "epoch": 0.5870594423987375, "grad_norm": 2.1731529235839844, "learning_rate": 4.901032458395296e-06, "loss": 1.0437, "step": 1116 }, { "epoch": 0.587585481325618, "grad_norm": 2.085692882537842, "learning_rate": 4.900838843379588e-06, "loss": 1.0122, "step": 1117 }, { "epoch": 0.5881115202524987, "grad_norm": 2.272787094116211, "learning_rate": 4.900645042991601e-06, "loss": 1.0708, "step": 1118 }, { "epoch": 0.5886375591793793, "grad_norm": 2.197758913040161, "learning_rate": 4.900451057246298e-06, "loss": 1.037, "step": 1119 }, { "epoch": 0.5891635981062598, "grad_norm": 2.228980779647827, "learning_rate": 4.900256886158658e-06, "loss": 1.0306, "step": 1120 }, { "epoch": 0.5896896370331405, "grad_norm": 2.010698080062866, "learning_rate": 4.900062529743672e-06, "loss": 1.0777, "step": 1121 }, { "epoch": 0.590215675960021, "grad_norm": 2.0015103816986084, "learning_rate": 4.899867988016348e-06, "loss": 0.9991, "step": 1122 }, { "epoch": 0.5907417148869016, "grad_norm": 1.9307256937026978, "learning_rate": 4.899673260991706e-06, "loss": 1.0655, "step": 1123 }, { "epoch": 0.5912677538137823, "grad_norm": 2.339930295944214, "learning_rate": 4.899478348684782e-06, "loss": 1.0177, "step": 1124 }, { "epoch": 0.5917937927406628, "grad_norm": 2.000337839126587, "learning_rate": 4.899283251110624e-06, "loss": 1.036, "step": 1125 }, { "epoch": 0.5923198316675434, "grad_norm": 2.0116374492645264, "learning_rate": 4.899087968284297e-06, "loss": 0.9666, "step": 1126 }, { "epoch": 0.592845870594424, "grad_norm": 2.27270245552063, "learning_rate": 4.898892500220878e-06, "loss": 1.0526, "step": 1127 }, { "epoch": 0.5933719095213046, "grad_norm": 2.1844749450683594, "learning_rate": 4.89869684693546e-06, "loss": 1.0606, "step": 1128 }, { "epoch": 0.5938979484481852, "grad_norm": 2.112031936645508, "learning_rate": 4.898501008443151e-06, "loss": 1.0846, "step": 1129 }, { "epoch": 0.5944239873750657, "grad_norm": 2.251878499984741, "learning_rate": 4.898304984759069e-06, "loss": 1.023, "step": 1130 }, { "epoch": 0.5949500263019464, "grad_norm": 2.064732074737549, "learning_rate": 4.898108775898351e-06, "loss": 1.066, "step": 1131 }, { "epoch": 0.5954760652288269, "grad_norm": 2.10412335395813, "learning_rate": 4.897912381876147e-06, "loss": 1.0476, "step": 1132 }, { "epoch": 0.5960021041557075, "grad_norm": 2.1343259811401367, "learning_rate": 4.897715802707621e-06, "loss": 1.0264, "step": 1133 }, { "epoch": 0.5965281430825881, "grad_norm": 2.3453173637390137, "learning_rate": 4.89751903840795e-06, "loss": 1.076, "step": 1134 }, { "epoch": 0.5970541820094687, "grad_norm": 2.040123462677002, "learning_rate": 4.897322088992326e-06, "loss": 1.0494, "step": 1135 }, { "epoch": 0.5975802209363493, "grad_norm": 2.070585012435913, "learning_rate": 4.897124954475958e-06, "loss": 1.0904, "step": 1136 }, { "epoch": 0.5981062598632298, "grad_norm": 2.048081159591675, "learning_rate": 4.896927634874065e-06, "loss": 0.9855, "step": 1137 }, { "epoch": 0.5986322987901105, "grad_norm": 2.07633113861084, "learning_rate": 4.896730130201883e-06, "loss": 1.0848, "step": 1138 }, { "epoch": 0.599158337716991, "grad_norm": 2.233821153640747, "learning_rate": 4.8965324404746624e-06, "loss": 1.0419, "step": 1139 }, { "epoch": 0.5996843766438716, "grad_norm": 2.1806929111480713, "learning_rate": 4.896334565707666e-06, "loss": 1.0377, "step": 1140 }, { "epoch": 0.6002104155707523, "grad_norm": 2.056483268737793, "learning_rate": 4.896136505916174e-06, "loss": 1.0269, "step": 1141 }, { "epoch": 0.6007364544976328, "grad_norm": 1.9446007013320923, "learning_rate": 4.895938261115476e-06, "loss": 0.9958, "step": 1142 }, { "epoch": 0.6012624934245134, "grad_norm": 1.9170737266540527, "learning_rate": 4.8957398313208795e-06, "loss": 1.0083, "step": 1143 }, { "epoch": 0.601788532351394, "grad_norm": 2.0455801486968994, "learning_rate": 4.895541216547707e-06, "loss": 1.0819, "step": 1144 }, { "epoch": 0.6023145712782746, "grad_norm": 2.410231828689575, "learning_rate": 4.8953424168112925e-06, "loss": 1.0265, "step": 1145 }, { "epoch": 0.6028406102051552, "grad_norm": 2.0946412086486816, "learning_rate": 4.895143432126986e-06, "loss": 1.014, "step": 1146 }, { "epoch": 0.6033666491320357, "grad_norm": 1.9825836420059204, "learning_rate": 4.894944262510152e-06, "loss": 0.9721, "step": 1147 }, { "epoch": 0.6038926880589164, "grad_norm": 2.1228606700897217, "learning_rate": 4.8947449079761685e-06, "loss": 1.0971, "step": 1148 }, { "epoch": 0.6044187269857969, "grad_norm": 2.1443943977355957, "learning_rate": 4.894545368540427e-06, "loss": 0.9956, "step": 1149 }, { "epoch": 0.6049447659126775, "grad_norm": 1.9651165008544922, "learning_rate": 4.894345644218335e-06, "loss": 1.0103, "step": 1150 }, { "epoch": 0.6054708048395582, "grad_norm": 1.9829816818237305, "learning_rate": 4.8941457350253134e-06, "loss": 1.0425, "step": 1151 }, { "epoch": 0.6059968437664387, "grad_norm": 2.122873067855835, "learning_rate": 4.893945640976798e-06, "loss": 1.0532, "step": 1152 }, { "epoch": 0.6065228826933193, "grad_norm": 2.0714738368988037, "learning_rate": 4.8937453620882365e-06, "loss": 1.0307, "step": 1153 }, { "epoch": 0.6070489216201999, "grad_norm": 1.9049363136291504, "learning_rate": 4.893544898375096e-06, "loss": 0.9805, "step": 1154 }, { "epoch": 0.6075749605470805, "grad_norm": 2.432041645050049, "learning_rate": 4.893344249852851e-06, "loss": 1.0833, "step": 1155 }, { "epoch": 0.6081009994739611, "grad_norm": 2.055748224258423, "learning_rate": 4.893143416536997e-06, "loss": 1.0315, "step": 1156 }, { "epoch": 0.6086270384008416, "grad_norm": 1.9813153743743896, "learning_rate": 4.892942398443037e-06, "loss": 1.0786, "step": 1157 }, { "epoch": 0.6091530773277223, "grad_norm": 2.2038941383361816, "learning_rate": 4.892741195586496e-06, "loss": 1.0604, "step": 1158 }, { "epoch": 0.6096791162546028, "grad_norm": 2.0015673637390137, "learning_rate": 4.892539807982906e-06, "loss": 0.9863, "step": 1159 }, { "epoch": 0.6102051551814834, "grad_norm": 2.0392401218414307, "learning_rate": 4.892338235647818e-06, "loss": 1.0218, "step": 1160 }, { "epoch": 0.6107311941083641, "grad_norm": 2.0060133934020996, "learning_rate": 4.892136478596796e-06, "loss": 1.0134, "step": 1161 }, { "epoch": 0.6112572330352446, "grad_norm": 1.9645148515701294, "learning_rate": 4.8919345368454164e-06, "loss": 1.0206, "step": 1162 }, { "epoch": 0.6117832719621252, "grad_norm": 1.9299581050872803, "learning_rate": 4.8917324104092725e-06, "loss": 1.0243, "step": 1163 }, { "epoch": 0.6123093108890058, "grad_norm": 2.071143388748169, "learning_rate": 4.891530099303971e-06, "loss": 1.0466, "step": 1164 }, { "epoch": 0.6128353498158864, "grad_norm": 2.122020959854126, "learning_rate": 4.891327603545132e-06, "loss": 1.0886, "step": 1165 }, { "epoch": 0.6133613887427669, "grad_norm": 2.0861775875091553, "learning_rate": 4.891124923148391e-06, "loss": 1.0481, "step": 1166 }, { "epoch": 0.6138874276696475, "grad_norm": 2.053553581237793, "learning_rate": 4.890922058129396e-06, "loss": 1.0332, "step": 1167 }, { "epoch": 0.6144134665965282, "grad_norm": 2.0698556900024414, "learning_rate": 4.890719008503813e-06, "loss": 0.9913, "step": 1168 }, { "epoch": 0.6149395055234087, "grad_norm": 2.0626866817474365, "learning_rate": 4.890515774287317e-06, "loss": 1.0383, "step": 1169 }, { "epoch": 0.6154655444502893, "grad_norm": 2.001122236251831, "learning_rate": 4.890312355495602e-06, "loss": 0.997, "step": 1170 }, { "epoch": 0.6159915833771699, "grad_norm": 2.141261577606201, "learning_rate": 4.890108752144373e-06, "loss": 1.0139, "step": 1171 }, { "epoch": 0.6165176223040505, "grad_norm": 2.0430335998535156, "learning_rate": 4.8899049642493514e-06, "loss": 1.0177, "step": 1172 }, { "epoch": 0.6170436612309311, "grad_norm": 2.0376110076904297, "learning_rate": 4.889700991826271e-06, "loss": 1.0306, "step": 1173 }, { "epoch": 0.6175697001578117, "grad_norm": 2.0546419620513916, "learning_rate": 4.889496834890882e-06, "loss": 1.0379, "step": 1174 }, { "epoch": 0.6180957390846923, "grad_norm": 2.004117012023926, "learning_rate": 4.889292493458947e-06, "loss": 1.1014, "step": 1175 }, { "epoch": 0.6186217780115728, "grad_norm": 2.1904101371765137, "learning_rate": 4.889087967546243e-06, "loss": 1.0252, "step": 1176 }, { "epoch": 0.6191478169384534, "grad_norm": 2.2026965618133545, "learning_rate": 4.8888832571685626e-06, "loss": 1.0309, "step": 1177 }, { "epoch": 0.6196738558653341, "grad_norm": 1.9925811290740967, "learning_rate": 4.888678362341711e-06, "loss": 1.0157, "step": 1178 }, { "epoch": 0.6201998947922146, "grad_norm": 2.4098422527313232, "learning_rate": 4.88847328308151e-06, "loss": 0.9825, "step": 1179 }, { "epoch": 0.6207259337190952, "grad_norm": 1.9352220296859741, "learning_rate": 4.888268019403792e-06, "loss": 1.0235, "step": 1180 }, { "epoch": 0.6212519726459758, "grad_norm": 1.9798966646194458, "learning_rate": 4.888062571324407e-06, "loss": 1.0124, "step": 1181 }, { "epoch": 0.6217780115728564, "grad_norm": 1.9737377166748047, "learning_rate": 4.887856938859218e-06, "loss": 1.005, "step": 1182 }, { "epoch": 0.622304050499737, "grad_norm": 2.2528250217437744, "learning_rate": 4.887651122024102e-06, "loss": 1.0207, "step": 1183 }, { "epoch": 0.6228300894266176, "grad_norm": 2.01436185836792, "learning_rate": 4.887445120834949e-06, "loss": 1.0368, "step": 1184 }, { "epoch": 0.6233561283534982, "grad_norm": 2.0212924480438232, "learning_rate": 4.887238935307667e-06, "loss": 1.0136, "step": 1185 }, { "epoch": 0.6238821672803787, "grad_norm": 2.080514669418335, "learning_rate": 4.887032565458174e-06, "loss": 1.0012, "step": 1186 }, { "epoch": 0.6244082062072593, "grad_norm": 2.220168113708496, "learning_rate": 4.886826011302406e-06, "loss": 1.0055, "step": 1187 }, { "epoch": 0.62493424513414, "grad_norm": 2.042325258255005, "learning_rate": 4.886619272856309e-06, "loss": 1.0793, "step": 1188 }, { "epoch": 0.6254602840610205, "grad_norm": 2.0139427185058594, "learning_rate": 4.886412350135848e-06, "loss": 1.0853, "step": 1189 }, { "epoch": 0.6259863229879011, "grad_norm": 2.072531223297119, "learning_rate": 4.886205243156998e-06, "loss": 1.0611, "step": 1190 }, { "epoch": 0.6265123619147817, "grad_norm": 2.1070992946624756, "learning_rate": 4.8859979519357505e-06, "loss": 1.0171, "step": 1191 }, { "epoch": 0.6270384008416623, "grad_norm": 1.9750585556030273, "learning_rate": 4.885790476488111e-06, "loss": 1.01, "step": 1192 }, { "epoch": 0.6275644397685429, "grad_norm": 1.9221036434173584, "learning_rate": 4.885582816830099e-06, "loss": 1.0173, "step": 1193 }, { "epoch": 0.6280904786954234, "grad_norm": 2.0700929164886475, "learning_rate": 4.885374972977748e-06, "loss": 1.0469, "step": 1194 }, { "epoch": 0.6286165176223041, "grad_norm": 2.1358914375305176, "learning_rate": 4.885166944947106e-06, "loss": 1.0144, "step": 1195 }, { "epoch": 0.6291425565491846, "grad_norm": 2.0657570362091064, "learning_rate": 4.884958732754236e-06, "loss": 1.0278, "step": 1196 }, { "epoch": 0.6296685954760652, "grad_norm": 2.050619125366211, "learning_rate": 4.884750336415213e-06, "loss": 1.0401, "step": 1197 }, { "epoch": 0.6301946344029458, "grad_norm": 2.029069423675537, "learning_rate": 4.884541755946127e-06, "loss": 1.0265, "step": 1198 }, { "epoch": 0.6307206733298264, "grad_norm": 2.2242050170898438, "learning_rate": 4.884332991363086e-06, "loss": 1.043, "step": 1199 }, { "epoch": 0.631246712256707, "grad_norm": 1.9235576391220093, "learning_rate": 4.8841240426822056e-06, "loss": 1.0323, "step": 1200 }, { "epoch": 0.6317727511835876, "grad_norm": 2.0110039710998535, "learning_rate": 4.88391490991962e-06, "loss": 0.9861, "step": 1201 }, { "epoch": 0.6322987901104682, "grad_norm": 1.9583542346954346, "learning_rate": 4.883705593091478e-06, "loss": 1.0907, "step": 1202 }, { "epoch": 0.6328248290373487, "grad_norm": 2.046147346496582, "learning_rate": 4.88349609221394e-06, "loss": 1.0264, "step": 1203 }, { "epoch": 0.6333508679642293, "grad_norm": 2.072329521179199, "learning_rate": 4.8832864073031826e-06, "loss": 1.0273, "step": 1204 }, { "epoch": 0.63387690689111, "grad_norm": 2.163562774658203, "learning_rate": 4.883076538375395e-06, "loss": 0.9729, "step": 1205 }, { "epoch": 0.6344029458179905, "grad_norm": 2.018745183944702, "learning_rate": 4.8828664854467825e-06, "loss": 1.0349, "step": 1206 }, { "epoch": 0.6349289847448711, "grad_norm": 1.9641830921173096, "learning_rate": 4.882656248533562e-06, "loss": 1.0254, "step": 1207 }, { "epoch": 0.6354550236717517, "grad_norm": 2.189903736114502, "learning_rate": 4.8824458276519676e-06, "loss": 1.0347, "step": 1208 }, { "epoch": 0.6359810625986323, "grad_norm": 1.9000815153121948, "learning_rate": 4.882235222818245e-06, "loss": 1.0068, "step": 1209 }, { "epoch": 0.6365071015255129, "grad_norm": 2.008253335952759, "learning_rate": 4.882024434048658e-06, "loss": 0.9951, "step": 1210 }, { "epoch": 0.6370331404523935, "grad_norm": 2.254880905151367, "learning_rate": 4.881813461359479e-06, "loss": 1.0254, "step": 1211 }, { "epoch": 0.6375591793792741, "grad_norm": 2.079281806945801, "learning_rate": 4.881602304766999e-06, "loss": 1.0138, "step": 1212 }, { "epoch": 0.6380852183061546, "grad_norm": 1.9515445232391357, "learning_rate": 4.881390964287521e-06, "loss": 0.9896, "step": 1213 }, { "epoch": 0.6386112572330352, "grad_norm": 2.118746757507324, "learning_rate": 4.881179439937363e-06, "loss": 1.0554, "step": 1214 }, { "epoch": 0.6391372961599159, "grad_norm": 1.9809492826461792, "learning_rate": 4.8809677317328574e-06, "loss": 1.0327, "step": 1215 }, { "epoch": 0.6396633350867964, "grad_norm": 2.0196714401245117, "learning_rate": 4.88075583969035e-06, "loss": 1.0072, "step": 1216 }, { "epoch": 0.640189374013677, "grad_norm": 2.075596570968628, "learning_rate": 4.8805437638262024e-06, "loss": 1.0088, "step": 1217 }, { "epoch": 0.6407154129405576, "grad_norm": 1.919331431388855, "learning_rate": 4.880331504156788e-06, "loss": 0.9561, "step": 1218 }, { "epoch": 0.6412414518674382, "grad_norm": 2.1209754943847656, "learning_rate": 4.8801190606984974e-06, "loss": 1.0436, "step": 1219 }, { "epoch": 0.6417674907943188, "grad_norm": 2.1692416667938232, "learning_rate": 4.879906433467731e-06, "loss": 1.0596, "step": 1220 }, { "epoch": 0.6422935297211994, "grad_norm": 2.127383232116699, "learning_rate": 4.879693622480908e-06, "loss": 1.0527, "step": 1221 }, { "epoch": 0.64281956864808, "grad_norm": 2.0686752796173096, "learning_rate": 4.87948062775446e-06, "loss": 1.0161, "step": 1222 }, { "epoch": 0.6433456075749605, "grad_norm": 1.9912559986114502, "learning_rate": 4.879267449304831e-06, "loss": 1.0246, "step": 1223 }, { "epoch": 0.6438716465018411, "grad_norm": 1.9714523553848267, "learning_rate": 4.879054087148483e-06, "loss": 1.0669, "step": 1224 }, { "epoch": 0.6443976854287218, "grad_norm": 2.0122146606445312, "learning_rate": 4.878840541301888e-06, "loss": 1.0383, "step": 1225 }, { "epoch": 0.6449237243556023, "grad_norm": 2.191110134124756, "learning_rate": 4.878626811781536e-06, "loss": 1.0832, "step": 1226 }, { "epoch": 0.6454497632824829, "grad_norm": 2.018800735473633, "learning_rate": 4.8784128986039274e-06, "loss": 1.0588, "step": 1227 }, { "epoch": 0.6459758022093635, "grad_norm": 2.0812923908233643, "learning_rate": 4.87819880178558e-06, "loss": 1.0221, "step": 1228 }, { "epoch": 0.6465018411362441, "grad_norm": 2.110596179962158, "learning_rate": 4.877984521343025e-06, "loss": 1.0252, "step": 1229 }, { "epoch": 0.6470278800631246, "grad_norm": 2.2176296710968018, "learning_rate": 4.877770057292806e-06, "loss": 1.0575, "step": 1230 }, { "epoch": 0.6475539189900053, "grad_norm": 2.0294981002807617, "learning_rate": 4.8775554096514836e-06, "loss": 0.9862, "step": 1231 }, { "epoch": 0.6480799579168859, "grad_norm": 2.03635573387146, "learning_rate": 4.8773405784356285e-06, "loss": 1.0229, "step": 1232 }, { "epoch": 0.6486059968437664, "grad_norm": 2.2391481399536133, "learning_rate": 4.877125563661831e-06, "loss": 1.1258, "step": 1233 }, { "epoch": 0.649132035770647, "grad_norm": 2.1449427604675293, "learning_rate": 4.876910365346691e-06, "loss": 1.039, "step": 1234 }, { "epoch": 0.6496580746975276, "grad_norm": 2.075510025024414, "learning_rate": 4.876694983506826e-06, "loss": 1.047, "step": 1235 }, { "epoch": 0.6501841136244082, "grad_norm": 1.9154462814331055, "learning_rate": 4.876479418158862e-06, "loss": 0.9906, "step": 1236 }, { "epoch": 0.6507101525512888, "grad_norm": 2.2096331119537354, "learning_rate": 4.876263669319449e-06, "loss": 1.0843, "step": 1237 }, { "epoch": 0.6512361914781694, "grad_norm": 2.0682895183563232, "learning_rate": 4.87604773700524e-06, "loss": 1.0262, "step": 1238 }, { "epoch": 0.65176223040505, "grad_norm": 2.0859344005584717, "learning_rate": 4.8758316212329106e-06, "loss": 1.02, "step": 1239 }, { "epoch": 0.6522882693319305, "grad_norm": 2.060521364212036, "learning_rate": 4.875615322019146e-06, "loss": 1.0455, "step": 1240 }, { "epoch": 0.6528143082588111, "grad_norm": 2.049457311630249, "learning_rate": 4.875398839380647e-06, "loss": 1.0763, "step": 1241 }, { "epoch": 0.6533403471856918, "grad_norm": 2.2475039958953857, "learning_rate": 4.875182173334129e-06, "loss": 1.0599, "step": 1242 }, { "epoch": 0.6538663861125723, "grad_norm": 1.9375535249710083, "learning_rate": 4.874965323896321e-06, "loss": 0.9758, "step": 1243 }, { "epoch": 0.6543924250394529, "grad_norm": 2.0157570838928223, "learning_rate": 4.874748291083967e-06, "loss": 1.0491, "step": 1244 }, { "epoch": 0.6549184639663335, "grad_norm": 2.1339237689971924, "learning_rate": 4.874531074913823e-06, "loss": 0.9634, "step": 1245 }, { "epoch": 0.6554445028932141, "grad_norm": 1.946191430091858, "learning_rate": 4.874313675402662e-06, "loss": 1.0407, "step": 1246 }, { "epoch": 0.6559705418200947, "grad_norm": 1.9623258113861084, "learning_rate": 4.874096092567268e-06, "loss": 1.0662, "step": 1247 }, { "epoch": 0.6564965807469753, "grad_norm": 2.092224359512329, "learning_rate": 4.873878326424443e-06, "loss": 1.0802, "step": 1248 }, { "epoch": 0.6570226196738559, "grad_norm": 1.863853931427002, "learning_rate": 4.873660376990999e-06, "loss": 1.0789, "step": 1249 }, { "epoch": 0.6575486586007364, "grad_norm": 2.146857976913452, "learning_rate": 4.8734422442837655e-06, "loss": 1.0132, "step": 1250 }, { "epoch": 0.658074697527617, "grad_norm": 2.022573232650757, "learning_rate": 4.8732239283195844e-06, "loss": 1.0252, "step": 1251 }, { "epoch": 0.6586007364544977, "grad_norm": 2.160632848739624, "learning_rate": 4.873005429115312e-06, "loss": 1.0235, "step": 1252 }, { "epoch": 0.6591267753813782, "grad_norm": 2.0909252166748047, "learning_rate": 4.87278674668782e-06, "loss": 1.0671, "step": 1253 }, { "epoch": 0.6596528143082588, "grad_norm": 1.9689445495605469, "learning_rate": 4.872567881053991e-06, "loss": 1.0323, "step": 1254 }, { "epoch": 0.6601788532351394, "grad_norm": 2.141439914703369, "learning_rate": 4.872348832230727e-06, "loss": 1.0019, "step": 1255 }, { "epoch": 0.66070489216202, "grad_norm": 1.9927963018417358, "learning_rate": 4.872129600234938e-06, "loss": 1.0262, "step": 1256 }, { "epoch": 0.6612309310889006, "grad_norm": 2.1227667331695557, "learning_rate": 4.871910185083554e-06, "loss": 1.0341, "step": 1257 }, { "epoch": 0.6617569700157812, "grad_norm": 2.0554583072662354, "learning_rate": 4.871690586793514e-06, "loss": 1.0458, "step": 1258 }, { "epoch": 0.6622830089426618, "grad_norm": 1.9936654567718506, "learning_rate": 4.871470805381775e-06, "loss": 1.0125, "step": 1259 }, { "epoch": 0.6628090478695423, "grad_norm": 2.0953080654144287, "learning_rate": 4.871250840865306e-06, "loss": 1.0518, "step": 1260 }, { "epoch": 0.663335086796423, "grad_norm": 1.9445053339004517, "learning_rate": 4.871030693261091e-06, "loss": 0.9892, "step": 1261 }, { "epoch": 0.6638611257233035, "grad_norm": 2.054898500442505, "learning_rate": 4.870810362586127e-06, "loss": 1.0712, "step": 1262 }, { "epoch": 0.6643871646501841, "grad_norm": 2.158090114593506, "learning_rate": 4.870589848857428e-06, "loss": 0.9874, "step": 1263 }, { "epoch": 0.6649132035770647, "grad_norm": 2.081550121307373, "learning_rate": 4.870369152092019e-06, "loss": 1.0299, "step": 1264 }, { "epoch": 0.6654392425039453, "grad_norm": 1.9839400053024292, "learning_rate": 4.87014827230694e-06, "loss": 0.9997, "step": 1265 }, { "epoch": 0.6659652814308259, "grad_norm": 2.0596096515655518, "learning_rate": 4.869927209519246e-06, "loss": 1.0655, "step": 1266 }, { "epoch": 0.6664913203577064, "grad_norm": 2.3403422832489014, "learning_rate": 4.8697059637460055e-06, "loss": 1.0551, "step": 1267 }, { "epoch": 0.667017359284587, "grad_norm": 2.072814702987671, "learning_rate": 4.8694845350043004e-06, "loss": 1.0454, "step": 1268 }, { "epoch": 0.6675433982114677, "grad_norm": 2.2819271087646484, "learning_rate": 4.86926292331123e-06, "loss": 1.0076, "step": 1269 }, { "epoch": 0.6680694371383482, "grad_norm": 2.162179708480835, "learning_rate": 4.8690411286839024e-06, "loss": 1.0145, "step": 1270 }, { "epoch": 0.6685954760652288, "grad_norm": 2.1072568893432617, "learning_rate": 4.868819151139443e-06, "loss": 1.0936, "step": 1271 }, { "epoch": 0.6691215149921094, "grad_norm": 2.113056182861328, "learning_rate": 4.868596990694994e-06, "loss": 1.044, "step": 1272 }, { "epoch": 0.66964755391899, "grad_norm": 1.9856184720993042, "learning_rate": 4.868374647367705e-06, "loss": 1.0119, "step": 1273 }, { "epoch": 0.6701735928458706, "grad_norm": 2.013106346130371, "learning_rate": 4.868152121174746e-06, "loss": 1.0913, "step": 1274 }, { "epoch": 0.6706996317727512, "grad_norm": 1.8831686973571777, "learning_rate": 4.867929412133297e-06, "loss": 1.0077, "step": 1275 }, { "epoch": 0.6712256706996318, "grad_norm": 2.035214424133301, "learning_rate": 4.867706520260554e-06, "loss": 0.9683, "step": 1276 }, { "epoch": 0.6717517096265123, "grad_norm": 2.0336945056915283, "learning_rate": 4.867483445573727e-06, "loss": 1.0583, "step": 1277 }, { "epoch": 0.672277748553393, "grad_norm": 1.9241890907287598, "learning_rate": 4.867260188090041e-06, "loss": 1.0162, "step": 1278 }, { "epoch": 0.6728037874802736, "grad_norm": 2.122288942337036, "learning_rate": 4.8670367478267335e-06, "loss": 1.0633, "step": 1279 }, { "epoch": 0.6733298264071541, "grad_norm": 1.964282512664795, "learning_rate": 4.8668131248010555e-06, "loss": 1.0009, "step": 1280 }, { "epoch": 0.6738558653340347, "grad_norm": 2.075181722640991, "learning_rate": 4.866589319030273e-06, "loss": 1.0535, "step": 1281 }, { "epoch": 0.6743819042609153, "grad_norm": 2.086574077606201, "learning_rate": 4.866365330531668e-06, "loss": 1.0125, "step": 1282 }, { "epoch": 0.6749079431877959, "grad_norm": 2.176712989807129, "learning_rate": 4.866141159322535e-06, "loss": 1.0883, "step": 1283 }, { "epoch": 0.6754339821146765, "grad_norm": 2.4133596420288086, "learning_rate": 4.865916805420181e-06, "loss": 1.1115, "step": 1284 }, { "epoch": 0.6759600210415571, "grad_norm": 1.9632985591888428, "learning_rate": 4.865692268841931e-06, "loss": 0.9837, "step": 1285 }, { "epoch": 0.6764860599684377, "grad_norm": 2.320810556411743, "learning_rate": 4.865467549605119e-06, "loss": 1.0307, "step": 1286 }, { "epoch": 0.6770120988953182, "grad_norm": 2.259291172027588, "learning_rate": 4.865242647727097e-06, "loss": 1.0125, "step": 1287 }, { "epoch": 0.6775381378221988, "grad_norm": 2.069227695465088, "learning_rate": 4.8650175632252314e-06, "loss": 1.0348, "step": 1288 }, { "epoch": 0.6780641767490795, "grad_norm": 2.093912363052368, "learning_rate": 4.8647922961169e-06, "loss": 1.0628, "step": 1289 }, { "epoch": 0.67859021567596, "grad_norm": 2.0842857360839844, "learning_rate": 4.864566846419497e-06, "loss": 1.0296, "step": 1290 }, { "epoch": 0.6791162546028406, "grad_norm": 2.1448631286621094, "learning_rate": 4.864341214150428e-06, "loss": 1.0344, "step": 1291 }, { "epoch": 0.6796422935297212, "grad_norm": 2.173478841781616, "learning_rate": 4.864115399327115e-06, "loss": 1.0662, "step": 1292 }, { "epoch": 0.6801683324566018, "grad_norm": 2.1156740188598633, "learning_rate": 4.863889401966995e-06, "loss": 1.0568, "step": 1293 }, { "epoch": 0.6806943713834824, "grad_norm": 2.0641050338745117, "learning_rate": 4.863663222087515e-06, "loss": 1.0508, "step": 1294 }, { "epoch": 0.681220410310363, "grad_norm": 2.050645112991333, "learning_rate": 4.863436859706141e-06, "loss": 1.0198, "step": 1295 }, { "epoch": 0.6817464492372436, "grad_norm": 1.9624086618423462, "learning_rate": 4.86321031484035e-06, "loss": 1.012, "step": 1296 }, { "epoch": 0.6822724881641241, "grad_norm": 2.2763307094573975, "learning_rate": 4.8629835875076325e-06, "loss": 1.0208, "step": 1297 }, { "epoch": 0.6827985270910047, "grad_norm": 1.952094316482544, "learning_rate": 4.862756677725496e-06, "loss": 0.9912, "step": 1298 }, { "epoch": 0.6833245660178853, "grad_norm": 1.9964386224746704, "learning_rate": 4.862529585511461e-06, "loss": 1.0216, "step": 1299 }, { "epoch": 0.6838506049447659, "grad_norm": 2.0915441513061523, "learning_rate": 4.862302310883061e-06, "loss": 1.028, "step": 1300 }, { "epoch": 0.6843766438716465, "grad_norm": 2.239182233810425, "learning_rate": 4.862074853857843e-06, "loss": 1.1119, "step": 1301 }, { "epoch": 0.6849026827985271, "grad_norm": 2.120128870010376, "learning_rate": 4.861847214453371e-06, "loss": 1.0811, "step": 1302 }, { "epoch": 0.6854287217254077, "grad_norm": 1.8495033979415894, "learning_rate": 4.86161939268722e-06, "loss": 0.9559, "step": 1303 }, { "epoch": 0.6859547606522882, "grad_norm": 1.9767253398895264, "learning_rate": 4.861391388576982e-06, "loss": 0.9942, "step": 1304 }, { "epoch": 0.6864807995791689, "grad_norm": 1.9148463010787964, "learning_rate": 4.8611632021402605e-06, "loss": 1.0152, "step": 1305 }, { "epoch": 0.6870068385060495, "grad_norm": 2.036726474761963, "learning_rate": 4.860934833394674e-06, "loss": 1.0692, "step": 1306 }, { "epoch": 0.68753287743293, "grad_norm": 2.03383731842041, "learning_rate": 4.860706282357856e-06, "loss": 1.0429, "step": 1307 }, { "epoch": 0.6880589163598106, "grad_norm": 1.986863374710083, "learning_rate": 4.860477549047452e-06, "loss": 0.9737, "step": 1308 }, { "epoch": 0.6885849552866912, "grad_norm": 1.9917157888412476, "learning_rate": 4.860248633481124e-06, "loss": 0.9808, "step": 1309 }, { "epoch": 0.6891109942135718, "grad_norm": 1.9868308305740356, "learning_rate": 4.860019535676546e-06, "loss": 1.0001, "step": 1310 }, { "epoch": 0.6896370331404524, "grad_norm": 1.9900240898132324, "learning_rate": 4.859790255651408e-06, "loss": 1.0561, "step": 1311 }, { "epoch": 0.690163072067333, "grad_norm": 1.987703800201416, "learning_rate": 4.859560793423412e-06, "loss": 1.013, "step": 1312 }, { "epoch": 0.6906891109942136, "grad_norm": 1.9851711988449097, "learning_rate": 4.859331149010276e-06, "loss": 1.0727, "step": 1313 }, { "epoch": 0.6912151499210941, "grad_norm": 1.9733060598373413, "learning_rate": 4.8591013224297304e-06, "loss": 0.9924, "step": 1314 }, { "epoch": 0.6917411888479748, "grad_norm": 1.9737035036087036, "learning_rate": 4.85887131369952e-06, "loss": 1.0131, "step": 1315 }, { "epoch": 0.6922672277748554, "grad_norm": 2.176969528198242, "learning_rate": 4.858641122837407e-06, "loss": 1.0382, "step": 1316 }, { "epoch": 0.6927932667017359, "grad_norm": 1.951177716255188, "learning_rate": 4.858410749861161e-06, "loss": 1.011, "step": 1317 }, { "epoch": 0.6933193056286165, "grad_norm": 2.009986639022827, "learning_rate": 4.858180194788572e-06, "loss": 1.0999, "step": 1318 }, { "epoch": 0.6938453445554971, "grad_norm": 2.0470845699310303, "learning_rate": 4.857949457637441e-06, "loss": 1.0477, "step": 1319 }, { "epoch": 0.6943713834823777, "grad_norm": 2.163547992706299, "learning_rate": 4.857718538425582e-06, "loss": 1.0229, "step": 1320 }, { "epoch": 0.6948974224092583, "grad_norm": 2.0979368686676025, "learning_rate": 4.857487437170827e-06, "loss": 1.0686, "step": 1321 }, { "epoch": 0.6954234613361389, "grad_norm": 2.0388388633728027, "learning_rate": 4.857256153891017e-06, "loss": 0.991, "step": 1322 }, { "epoch": 0.6959495002630195, "grad_norm": 2.136115312576294, "learning_rate": 4.8570246886040124e-06, "loss": 1.0249, "step": 1323 }, { "epoch": 0.6964755391899, "grad_norm": 2.0932974815368652, "learning_rate": 4.8567930413276835e-06, "loss": 1.0649, "step": 1324 }, { "epoch": 0.6970015781167807, "grad_norm": 2.0559682846069336, "learning_rate": 4.856561212079916e-06, "loss": 0.9931, "step": 1325 }, { "epoch": 0.6975276170436613, "grad_norm": 1.9723689556121826, "learning_rate": 4.856329200878611e-06, "loss": 0.9628, "step": 1326 }, { "epoch": 0.6980536559705418, "grad_norm": 2.054049253463745, "learning_rate": 4.8560970077416805e-06, "loss": 1.0322, "step": 1327 }, { "epoch": 0.6985796948974224, "grad_norm": 2.100574254989624, "learning_rate": 4.855864632687055e-06, "loss": 1.0941, "step": 1328 }, { "epoch": 0.699105733824303, "grad_norm": 2.1415367126464844, "learning_rate": 4.8556320757326735e-06, "loss": 1.0341, "step": 1329 }, { "epoch": 0.6996317727511836, "grad_norm": 1.988004207611084, "learning_rate": 4.855399336896495e-06, "loss": 1.0357, "step": 1330 }, { "epoch": 0.7001578116780641, "grad_norm": 2.0249714851379395, "learning_rate": 4.855166416196487e-06, "loss": 1.0489, "step": 1331 }, { "epoch": 0.7006838506049448, "grad_norm": 1.9197039604187012, "learning_rate": 4.8549333136506356e-06, "loss": 1.0094, "step": 1332 }, { "epoch": 0.7012098895318254, "grad_norm": 2.153716564178467, "learning_rate": 4.854700029276938e-06, "loss": 1.0613, "step": 1333 }, { "epoch": 0.7017359284587059, "grad_norm": 1.9626339673995972, "learning_rate": 4.854466563093407e-06, "loss": 1.024, "step": 1334 }, { "epoch": 0.7022619673855865, "grad_norm": 2.0288281440734863, "learning_rate": 4.854232915118068e-06, "loss": 0.9778, "step": 1335 }, { "epoch": 0.7027880063124671, "grad_norm": 1.9677989482879639, "learning_rate": 4.853999085368963e-06, "loss": 0.9802, "step": 1336 }, { "epoch": 0.7033140452393477, "grad_norm": 2.054617404937744, "learning_rate": 4.853765073864144e-06, "loss": 0.9523, "step": 1337 }, { "epoch": 0.7038400841662283, "grad_norm": 2.0509955883026123, "learning_rate": 4.853530880621681e-06, "loss": 1.0324, "step": 1338 }, { "epoch": 0.7043661230931089, "grad_norm": 2.224724054336548, "learning_rate": 4.853296505659657e-06, "loss": 1.0965, "step": 1339 }, { "epoch": 0.7048921620199895, "grad_norm": 1.9698208570480347, "learning_rate": 4.8530619489961664e-06, "loss": 1.0486, "step": 1340 }, { "epoch": 0.70541820094687, "grad_norm": 2.129383087158203, "learning_rate": 4.85282721064932e-06, "loss": 1.0857, "step": 1341 }, { "epoch": 0.7059442398737507, "grad_norm": 2.2943053245544434, "learning_rate": 4.852592290637244e-06, "loss": 1.0628, "step": 1342 }, { "epoch": 0.7064702788006313, "grad_norm": 2.0792641639709473, "learning_rate": 4.852357188978075e-06, "loss": 1.0604, "step": 1343 }, { "epoch": 0.7069963177275118, "grad_norm": 2.0224812030792236, "learning_rate": 4.852121905689968e-06, "loss": 1.0687, "step": 1344 }, { "epoch": 0.7075223566543924, "grad_norm": 2.4030919075012207, "learning_rate": 4.851886440791087e-06, "loss": 1.0942, "step": 1345 }, { "epoch": 0.708048395581273, "grad_norm": 2.190215826034546, "learning_rate": 4.851650794299614e-06, "loss": 1.0393, "step": 1346 }, { "epoch": 0.7085744345081536, "grad_norm": 2.1099565029144287, "learning_rate": 4.851414966233743e-06, "loss": 1.0452, "step": 1347 }, { "epoch": 0.7091004734350342, "grad_norm": 2.156395673751831, "learning_rate": 4.851178956611682e-06, "loss": 1.0625, "step": 1348 }, { "epoch": 0.7096265123619148, "grad_norm": 2.1840314865112305, "learning_rate": 4.850942765451655e-06, "loss": 1.0467, "step": 1349 }, { "epoch": 0.7101525512887954, "grad_norm": 2.0080723762512207, "learning_rate": 4.850706392771899e-06, "loss": 1.0187, "step": 1350 }, { "epoch": 0.7106785902156759, "grad_norm": 2.1242828369140625, "learning_rate": 4.850469838590664e-06, "loss": 1.0459, "step": 1351 }, { "epoch": 0.7112046291425566, "grad_norm": 1.9652162790298462, "learning_rate": 4.8502331029262125e-06, "loss": 1.0404, "step": 1352 }, { "epoch": 0.7117306680694372, "grad_norm": 2.2363545894622803, "learning_rate": 4.849996185796827e-06, "loss": 1.0182, "step": 1353 }, { "epoch": 0.7122567069963177, "grad_norm": 2.028017044067383, "learning_rate": 4.849759087220798e-06, "loss": 1.0213, "step": 1354 }, { "epoch": 0.7127827459231983, "grad_norm": 2.265037775039673, "learning_rate": 4.849521807216432e-06, "loss": 1.0316, "step": 1355 }, { "epoch": 0.7133087848500789, "grad_norm": 2.083799362182617, "learning_rate": 4.849284345802051e-06, "loss": 1.0133, "step": 1356 }, { "epoch": 0.7138348237769595, "grad_norm": 1.9307647943496704, "learning_rate": 4.8490467029959895e-06, "loss": 1.0023, "step": 1357 }, { "epoch": 0.7143608627038401, "grad_norm": 2.1079766750335693, "learning_rate": 4.848808878816595e-06, "loss": 1.0208, "step": 1358 }, { "epoch": 0.7148869016307207, "grad_norm": 2.0214877128601074, "learning_rate": 4.8485708732822315e-06, "loss": 0.9904, "step": 1359 }, { "epoch": 0.7154129405576013, "grad_norm": 2.150768756866455, "learning_rate": 4.848332686411276e-06, "loss": 0.9969, "step": 1360 }, { "epoch": 0.7159389794844818, "grad_norm": 2.0330607891082764, "learning_rate": 4.8480943182221184e-06, "loss": 0.9865, "step": 1361 }, { "epoch": 0.7164650184113625, "grad_norm": 1.973970651626587, "learning_rate": 4.847855768733163e-06, "loss": 0.9815, "step": 1362 }, { "epoch": 0.716991057338243, "grad_norm": 2.074868679046631, "learning_rate": 4.84761703796283e-06, "loss": 1.0499, "step": 1363 }, { "epoch": 0.7175170962651236, "grad_norm": 1.9750478267669678, "learning_rate": 4.8473781259295514e-06, "loss": 0.9797, "step": 1364 }, { "epoch": 0.7180431351920042, "grad_norm": 1.971375823020935, "learning_rate": 4.847139032651774e-06, "loss": 0.9805, "step": 1365 }, { "epoch": 0.7185691741188848, "grad_norm": 2.0710880756378174, "learning_rate": 4.846899758147958e-06, "loss": 1.0143, "step": 1366 }, { "epoch": 0.7190952130457654, "grad_norm": 1.9696688652038574, "learning_rate": 4.8466603024365785e-06, "loss": 0.9869, "step": 1367 }, { "epoch": 0.7196212519726459, "grad_norm": 2.1022462844848633, "learning_rate": 4.846420665536126e-06, "loss": 1.0048, "step": 1368 }, { "epoch": 0.7201472908995266, "grad_norm": 2.164783000946045, "learning_rate": 4.8461808474651e-06, "loss": 1.0114, "step": 1369 }, { "epoch": 0.7206733298264072, "grad_norm": 2.0148744583129883, "learning_rate": 4.845940848242019e-06, "loss": 1.0232, "step": 1370 }, { "epoch": 0.7211993687532877, "grad_norm": 2.0193605422973633, "learning_rate": 4.845700667885414e-06, "loss": 0.9764, "step": 1371 }, { "epoch": 0.7217254076801684, "grad_norm": 2.005157232284546, "learning_rate": 4.845460306413829e-06, "loss": 1.0242, "step": 1372 }, { "epoch": 0.7222514466070489, "grad_norm": 2.128805637359619, "learning_rate": 4.845219763845823e-06, "loss": 0.9964, "step": 1373 }, { "epoch": 0.7227774855339295, "grad_norm": 1.9924060106277466, "learning_rate": 4.844979040199968e-06, "loss": 1.0185, "step": 1374 }, { "epoch": 0.7233035244608101, "grad_norm": 2.2126121520996094, "learning_rate": 4.844738135494851e-06, "loss": 1.0013, "step": 1375 }, { "epoch": 0.7238295633876907, "grad_norm": 2.0494630336761475, "learning_rate": 4.844497049749073e-06, "loss": 1.0628, "step": 1376 }, { "epoch": 0.7243556023145713, "grad_norm": 2.4115402698516846, "learning_rate": 4.844255782981249e-06, "loss": 1.0623, "step": 1377 }, { "epoch": 0.7248816412414518, "grad_norm": 2.062485933303833, "learning_rate": 4.8440143352100054e-06, "loss": 1.0115, "step": 1378 }, { "epoch": 0.7254076801683325, "grad_norm": 2.995894432067871, "learning_rate": 4.843772706453988e-06, "loss": 1.0805, "step": 1379 }, { "epoch": 0.7259337190952131, "grad_norm": 1.9974204301834106, "learning_rate": 4.84353089673185e-06, "loss": 1.0221, "step": 1380 }, { "epoch": 0.7264597580220936, "grad_norm": 2.1927318572998047, "learning_rate": 4.843288906062264e-06, "loss": 1.0273, "step": 1381 }, { "epoch": 0.7269857969489742, "grad_norm": 2.0213675498962402, "learning_rate": 4.8430467344639136e-06, "loss": 0.968, "step": 1382 }, { "epoch": 0.7275118358758548, "grad_norm": 2.2534306049346924, "learning_rate": 4.842804381955497e-06, "loss": 1.0457, "step": 1383 }, { "epoch": 0.7280378748027354, "grad_norm": 2.003638505935669, "learning_rate": 4.842561848555728e-06, "loss": 1.0471, "step": 1384 }, { "epoch": 0.728563913729616, "grad_norm": 2.217237949371338, "learning_rate": 4.842319134283331e-06, "loss": 1.0348, "step": 1385 }, { "epoch": 0.7290899526564966, "grad_norm": 2.1162800788879395, "learning_rate": 4.842076239157047e-06, "loss": 1.0548, "step": 1386 }, { "epoch": 0.7296159915833772, "grad_norm": 2.043252944946289, "learning_rate": 4.8418331631956325e-06, "loss": 1.0931, "step": 1387 }, { "epoch": 0.7301420305102577, "grad_norm": 2.099283218383789, "learning_rate": 4.841589906417853e-06, "loss": 1.0059, "step": 1388 }, { "epoch": 0.7306680694371384, "grad_norm": 1.9934890270233154, "learning_rate": 4.8413464688424904e-06, "loss": 1.0327, "step": 1389 }, { "epoch": 0.731194108364019, "grad_norm": 1.868202567100525, "learning_rate": 4.841102850488343e-06, "loss": 0.9622, "step": 1390 }, { "epoch": 0.7317201472908995, "grad_norm": 1.9592076539993286, "learning_rate": 4.84085905137422e-06, "loss": 1.0413, "step": 1391 }, { "epoch": 0.7322461862177801, "grad_norm": 2.0478546619415283, "learning_rate": 4.840615071518946e-06, "loss": 1.0343, "step": 1392 }, { "epoch": 0.7327722251446607, "grad_norm": 2.4996554851531982, "learning_rate": 4.840370910941358e-06, "loss": 1.1106, "step": 1393 }, { "epoch": 0.7332982640715413, "grad_norm": 2.0023233890533447, "learning_rate": 4.8401265696603085e-06, "loss": 1.0273, "step": 1394 }, { "epoch": 0.7338243029984218, "grad_norm": 2.0366029739379883, "learning_rate": 4.8398820476946625e-06, "loss": 1.0092, "step": 1395 }, { "epoch": 0.7343503419253025, "grad_norm": 2.2142248153686523, "learning_rate": 4.839637345063302e-06, "loss": 0.9884, "step": 1396 }, { "epoch": 0.7348763808521831, "grad_norm": 1.9955226182937622, "learning_rate": 4.839392461785119e-06, "loss": 1.054, "step": 1397 }, { "epoch": 0.7354024197790636, "grad_norm": 2.0607223510742188, "learning_rate": 4.839147397879023e-06, "loss": 0.9826, "step": 1398 }, { "epoch": 0.7359284587059443, "grad_norm": 2.054483652114868, "learning_rate": 4.8389021533639345e-06, "loss": 1.0738, "step": 1399 }, { "epoch": 0.7364544976328248, "grad_norm": 2.1066908836364746, "learning_rate": 4.8386567282587886e-06, "loss": 1.0937, "step": 1400 }, { "epoch": 0.7369805365597054, "grad_norm": 2.018155097961426, "learning_rate": 4.8384111225825355e-06, "loss": 0.9767, "step": 1401 }, { "epoch": 0.737506575486586, "grad_norm": 2.152189016342163, "learning_rate": 4.83816533635414e-06, "loss": 1.0062, "step": 1402 }, { "epoch": 0.7380326144134666, "grad_norm": 1.9946335554122925, "learning_rate": 4.8379193695925785e-06, "loss": 1.0724, "step": 1403 }, { "epoch": 0.7385586533403472, "grad_norm": 2.077017307281494, "learning_rate": 4.837673222316843e-06, "loss": 1.0991, "step": 1404 }, { "epoch": 0.7390846922672277, "grad_norm": 2.0850563049316406, "learning_rate": 4.837426894545938e-06, "loss": 1.0527, "step": 1405 }, { "epoch": 0.7396107311941084, "grad_norm": 1.9786406755447388, "learning_rate": 4.837180386298883e-06, "loss": 0.9666, "step": 1406 }, { "epoch": 0.740136770120989, "grad_norm": 2.0060155391693115, "learning_rate": 4.836933697594711e-06, "loss": 1.0795, "step": 1407 }, { "epoch": 0.7406628090478695, "grad_norm": 2.086906909942627, "learning_rate": 4.836686828452471e-06, "loss": 0.9925, "step": 1408 }, { "epoch": 0.7411888479747502, "grad_norm": 2.0125632286071777, "learning_rate": 4.836439778891223e-06, "loss": 0.9706, "step": 1409 }, { "epoch": 0.7417148869016307, "grad_norm": 1.8921434879302979, "learning_rate": 4.836192548930041e-06, "loss": 1.0237, "step": 1410 }, { "epoch": 0.7422409258285113, "grad_norm": 1.9400858879089355, "learning_rate": 4.835945138588015e-06, "loss": 1.0444, "step": 1411 }, { "epoch": 0.742766964755392, "grad_norm": 2.083749294281006, "learning_rate": 4.835697547884248e-06, "loss": 1.0136, "step": 1412 }, { "epoch": 0.7432930036822725, "grad_norm": 2.0750844478607178, "learning_rate": 4.8354497768378575e-06, "loss": 1.0863, "step": 1413 }, { "epoch": 0.7438190426091531, "grad_norm": 2.137214183807373, "learning_rate": 4.835201825467973e-06, "loss": 1.0095, "step": 1414 }, { "epoch": 0.7443450815360336, "grad_norm": 2.06549072265625, "learning_rate": 4.834953693793739e-06, "loss": 1.0449, "step": 1415 }, { "epoch": 0.7448711204629143, "grad_norm": 2.0396728515625, "learning_rate": 4.834705381834315e-06, "loss": 1.0093, "step": 1416 }, { "epoch": 0.7453971593897949, "grad_norm": 1.993697166442871, "learning_rate": 4.834456889608874e-06, "loss": 1.0075, "step": 1417 }, { "epoch": 0.7459231983166754, "grad_norm": 2.1017816066741943, "learning_rate": 4.834208217136601e-06, "loss": 1.0687, "step": 1418 }, { "epoch": 0.746449237243556, "grad_norm": 2.0740413665771484, "learning_rate": 4.833959364436698e-06, "loss": 0.9777, "step": 1419 }, { "epoch": 0.7469752761704366, "grad_norm": 2.0858206748962402, "learning_rate": 4.833710331528377e-06, "loss": 1.044, "step": 1420 }, { "epoch": 0.7475013150973172, "grad_norm": 2.33298921585083, "learning_rate": 4.833461118430869e-06, "loss": 1.0602, "step": 1421 }, { "epoch": 0.7480273540241978, "grad_norm": 2.1458897590637207, "learning_rate": 4.833211725163414e-06, "loss": 0.9903, "step": 1422 }, { "epoch": 0.7485533929510784, "grad_norm": 2.15071177482605, "learning_rate": 4.8329621517452685e-06, "loss": 1.011, "step": 1423 }, { "epoch": 0.749079431877959, "grad_norm": 2.0375895500183105, "learning_rate": 4.8327123981957025e-06, "loss": 1.0021, "step": 1424 }, { "epoch": 0.7496054708048395, "grad_norm": 1.9808685779571533, "learning_rate": 4.832462464534e-06, "loss": 1.025, "step": 1425 }, { "epoch": 0.7501315097317202, "grad_norm": 2.046558380126953, "learning_rate": 4.832212350779459e-06, "loss": 1.0435, "step": 1426 }, { "epoch": 0.7506575486586007, "grad_norm": 2.0020248889923096, "learning_rate": 4.831962056951392e-06, "loss": 1.0207, "step": 1427 }, { "epoch": 0.7511835875854813, "grad_norm": 1.9901740550994873, "learning_rate": 4.831711583069122e-06, "loss": 1.0505, "step": 1428 }, { "epoch": 0.751709626512362, "grad_norm": 2.112236738204956, "learning_rate": 4.83146092915199e-06, "loss": 1.0353, "step": 1429 }, { "epoch": 0.7522356654392425, "grad_norm": 2.0244028568267822, "learning_rate": 4.831210095219349e-06, "loss": 1.0169, "step": 1430 }, { "epoch": 0.7527617043661231, "grad_norm": 2.298645257949829, "learning_rate": 4.830959081290567e-06, "loss": 1.0498, "step": 1431 }, { "epoch": 0.7532877432930036, "grad_norm": 2.1593234539031982, "learning_rate": 4.8307078873850244e-06, "loss": 1.0954, "step": 1432 }, { "epoch": 0.7538137822198843, "grad_norm": 1.9387123584747314, "learning_rate": 4.830456513522117e-06, "loss": 0.9784, "step": 1433 }, { "epoch": 0.7543398211467649, "grad_norm": 2.1634531021118164, "learning_rate": 4.830204959721253e-06, "loss": 1.0516, "step": 1434 }, { "epoch": 0.7548658600736454, "grad_norm": 1.9310704469680786, "learning_rate": 4.829953226001855e-06, "loss": 0.9648, "step": 1435 }, { "epoch": 0.7553918990005261, "grad_norm": 2.0547149181365967, "learning_rate": 4.8297013123833605e-06, "loss": 1.0734, "step": 1436 }, { "epoch": 0.7559179379274066, "grad_norm": 2.222872734069824, "learning_rate": 4.829449218885219e-06, "loss": 0.9645, "step": 1437 }, { "epoch": 0.7564439768542872, "grad_norm": 2.128120183944702, "learning_rate": 4.829196945526897e-06, "loss": 1.046, "step": 1438 }, { "epoch": 0.7569700157811678, "grad_norm": 2.0309526920318604, "learning_rate": 4.828944492327872e-06, "loss": 1.0596, "step": 1439 }, { "epoch": 0.7574960547080484, "grad_norm": 2.0946176052093506, "learning_rate": 4.828691859307635e-06, "loss": 1.0134, "step": 1440 }, { "epoch": 0.758022093634929, "grad_norm": 1.9159823656082153, "learning_rate": 4.828439046485693e-06, "loss": 1.0081, "step": 1441 }, { "epoch": 0.7585481325618095, "grad_norm": 2.203627586364746, "learning_rate": 4.828186053881566e-06, "loss": 1.0451, "step": 1442 }, { "epoch": 0.7590741714886902, "grad_norm": 2.065521240234375, "learning_rate": 4.8279328815147895e-06, "loss": 1.0289, "step": 1443 }, { "epoch": 0.7596002104155708, "grad_norm": 2.1597719192504883, "learning_rate": 4.827679529404909e-06, "loss": 1.0373, "step": 1444 }, { "epoch": 0.7601262493424513, "grad_norm": 2.6100237369537354, "learning_rate": 4.827425997571488e-06, "loss": 1.0254, "step": 1445 }, { "epoch": 0.760652288269332, "grad_norm": 2.1975550651550293, "learning_rate": 4.8271722860341e-06, "loss": 1.0254, "step": 1446 }, { "epoch": 0.7611783271962125, "grad_norm": 2.019261360168457, "learning_rate": 4.826918394812336e-06, "loss": 1.0823, "step": 1447 }, { "epoch": 0.7617043661230931, "grad_norm": 1.9351961612701416, "learning_rate": 4.8266643239257996e-06, "loss": 1.0248, "step": 1448 }, { "epoch": 0.7622304050499737, "grad_norm": 1.9437129497528076, "learning_rate": 4.826410073394106e-06, "loss": 0.984, "step": 1449 }, { "epoch": 0.7627564439768543, "grad_norm": 2.277479887008667, "learning_rate": 4.826155643236889e-06, "loss": 1.0264, "step": 1450 }, { "epoch": 0.7632824829037349, "grad_norm": 2.2033772468566895, "learning_rate": 4.825901033473791e-06, "loss": 1.0249, "step": 1451 }, { "epoch": 0.7638085218306154, "grad_norm": 2.1912593841552734, "learning_rate": 4.825646244124472e-06, "loss": 1.0366, "step": 1452 }, { "epoch": 0.7643345607574961, "grad_norm": 2.0046746730804443, "learning_rate": 4.825391275208606e-06, "loss": 1.0411, "step": 1453 }, { "epoch": 0.7648605996843767, "grad_norm": 2.0601322650909424, "learning_rate": 4.825136126745877e-06, "loss": 1.052, "step": 1454 }, { "epoch": 0.7653866386112572, "grad_norm": 2.148794651031494, "learning_rate": 4.824880798755986e-06, "loss": 1.04, "step": 1455 }, { "epoch": 0.7659126775381379, "grad_norm": 2.027374505996704, "learning_rate": 4.824625291258649e-06, "loss": 1.005, "step": 1456 }, { "epoch": 0.7664387164650184, "grad_norm": 2.0703351497650146, "learning_rate": 4.824369604273592e-06, "loss": 1.0157, "step": 1457 }, { "epoch": 0.766964755391899, "grad_norm": 2.1002986431121826, "learning_rate": 4.8241137378205575e-06, "loss": 1.0355, "step": 1458 }, { "epoch": 0.7674907943187795, "grad_norm": 1.9970546960830688, "learning_rate": 4.823857691919302e-06, "loss": 0.9833, "step": 1459 }, { "epoch": 0.7680168332456602, "grad_norm": 2.0489771366119385, "learning_rate": 4.823601466589595e-06, "loss": 1.0351, "step": 1460 }, { "epoch": 0.7685428721725408, "grad_norm": 2.0190834999084473, "learning_rate": 4.823345061851219e-06, "loss": 1.0406, "step": 1461 }, { "epoch": 0.7690689110994213, "grad_norm": 2.0567877292633057, "learning_rate": 4.823088477723973e-06, "loss": 1.0593, "step": 1462 }, { "epoch": 0.769594950026302, "grad_norm": 1.883132815361023, "learning_rate": 4.822831714227667e-06, "loss": 1.0055, "step": 1463 }, { "epoch": 0.7701209889531825, "grad_norm": 1.9520277976989746, "learning_rate": 4.822574771382127e-06, "loss": 0.9831, "step": 1464 }, { "epoch": 0.7706470278800631, "grad_norm": 2.0123813152313232, "learning_rate": 4.822317649207191e-06, "loss": 0.9841, "step": 1465 }, { "epoch": 0.7711730668069438, "grad_norm": 2.089940309524536, "learning_rate": 4.8220603477227124e-06, "loss": 1.0121, "step": 1466 }, { "epoch": 0.7716991057338243, "grad_norm": 1.9485499858856201, "learning_rate": 4.8218028669485585e-06, "loss": 0.9744, "step": 1467 }, { "epoch": 0.7722251446607049, "grad_norm": 2.2764859199523926, "learning_rate": 4.821545206904608e-06, "loss": 1.0018, "step": 1468 }, { "epoch": 0.7727511835875854, "grad_norm": 2.039769411087036, "learning_rate": 4.821287367610756e-06, "loss": 1.0256, "step": 1469 }, { "epoch": 0.7732772225144661, "grad_norm": 2.0036065578460693, "learning_rate": 4.821029349086911e-06, "loss": 1.0399, "step": 1470 }, { "epoch": 0.7738032614413467, "grad_norm": 2.056286573410034, "learning_rate": 4.820771151352996e-06, "loss": 1.0077, "step": 1471 }, { "epoch": 0.7743293003682272, "grad_norm": 2.0001938343048096, "learning_rate": 4.820512774428944e-06, "loss": 1.0109, "step": 1472 }, { "epoch": 0.7748553392951079, "grad_norm": 2.007289409637451, "learning_rate": 4.820254218334707e-06, "loss": 1.0223, "step": 1473 }, { "epoch": 0.7753813782219884, "grad_norm": 2.079768657684326, "learning_rate": 4.8199954830902465e-06, "loss": 1.0565, "step": 1474 }, { "epoch": 0.775907417148869, "grad_norm": 2.030198097229004, "learning_rate": 4.819736568715543e-06, "loss": 1.033, "step": 1475 }, { "epoch": 0.7764334560757497, "grad_norm": 2.6482961177825928, "learning_rate": 4.819477475230584e-06, "loss": 1.0595, "step": 1476 }, { "epoch": 0.7769594950026302, "grad_norm": 2.160472869873047, "learning_rate": 4.8192182026553775e-06, "loss": 1.0214, "step": 1477 }, { "epoch": 0.7774855339295108, "grad_norm": 2.1956963539123535, "learning_rate": 4.818958751009941e-06, "loss": 1.0647, "step": 1478 }, { "epoch": 0.7780115728563913, "grad_norm": 2.346040725708008, "learning_rate": 4.818699120314306e-06, "loss": 1.0289, "step": 1479 }, { "epoch": 0.778537611783272, "grad_norm": 2.049593448638916, "learning_rate": 4.818439310588521e-06, "loss": 1.0188, "step": 1480 }, { "epoch": 0.7790636507101526, "grad_norm": 1.9567065238952637, "learning_rate": 4.818179321852646e-06, "loss": 1.0645, "step": 1481 }, { "epoch": 0.7795896896370331, "grad_norm": 2.0995101928710938, "learning_rate": 4.817919154126753e-06, "loss": 1.0283, "step": 1482 }, { "epoch": 0.7801157285639138, "grad_norm": 2.117649555206299, "learning_rate": 4.817658807430933e-06, "loss": 0.9973, "step": 1483 }, { "epoch": 0.7806417674907943, "grad_norm": 2.058525800704956, "learning_rate": 4.817398281785286e-06, "loss": 1.0278, "step": 1484 }, { "epoch": 0.7811678064176749, "grad_norm": 1.9914313554763794, "learning_rate": 4.817137577209927e-06, "loss": 0.9591, "step": 1485 }, { "epoch": 0.7816938453445555, "grad_norm": 1.9432276487350464, "learning_rate": 4.816876693724987e-06, "loss": 0.9964, "step": 1486 }, { "epoch": 0.7822198842714361, "grad_norm": 2.011399507522583, "learning_rate": 4.816615631350608e-06, "loss": 0.9963, "step": 1487 }, { "epoch": 0.7827459231983167, "grad_norm": 1.9606966972351074, "learning_rate": 4.816354390106947e-06, "loss": 0.9756, "step": 1488 }, { "epoch": 0.7832719621251972, "grad_norm": 2.011887788772583, "learning_rate": 4.816092970014176e-06, "loss": 1.0194, "step": 1489 }, { "epoch": 0.7837980010520779, "grad_norm": 2.0520918369293213, "learning_rate": 4.815831371092478e-06, "loss": 1.02, "step": 1490 }, { "epoch": 0.7843240399789585, "grad_norm": 2.018293619155884, "learning_rate": 4.815569593362053e-06, "loss": 1.0289, "step": 1491 }, { "epoch": 0.784850078905839, "grad_norm": 2.016738176345825, "learning_rate": 4.815307636843112e-06, "loss": 1.0523, "step": 1492 }, { "epoch": 0.7853761178327197, "grad_norm": 2.063619375228882, "learning_rate": 4.815045501555882e-06, "loss": 1.0099, "step": 1493 }, { "epoch": 0.7859021567596002, "grad_norm": 2.122360944747925, "learning_rate": 4.814783187520602e-06, "loss": 1.0346, "step": 1494 }, { "epoch": 0.7864281956864808, "grad_norm": 2.040095329284668, "learning_rate": 4.814520694757526e-06, "loss": 1.0017, "step": 1495 }, { "epoch": 0.7869542346133613, "grad_norm": 2.003471612930298, "learning_rate": 4.814258023286922e-06, "loss": 0.975, "step": 1496 }, { "epoch": 0.787480273540242, "grad_norm": 1.905517816543579, "learning_rate": 4.81399517312907e-06, "loss": 0.9899, "step": 1497 }, { "epoch": 0.7880063124671226, "grad_norm": 2.047112226486206, "learning_rate": 4.813732144304266e-06, "loss": 0.9558, "step": 1498 }, { "epoch": 0.7885323513940031, "grad_norm": 1.9621355533599854, "learning_rate": 4.8134689368328194e-06, "loss": 1.0668, "step": 1499 }, { "epoch": 0.7890583903208838, "grad_norm": 1.9221957921981812, "learning_rate": 4.813205550735052e-06, "loss": 1.0082, "step": 1500 }, { "epoch": 0.7895844292477643, "grad_norm": 2.002659797668457, "learning_rate": 4.812941986031299e-06, "loss": 1.0192, "step": 1501 }, { "epoch": 0.7901104681746449, "grad_norm": 2.1077136993408203, "learning_rate": 4.812678242741913e-06, "loss": 1.0316, "step": 1502 }, { "epoch": 0.7906365071015256, "grad_norm": 2.0782320499420166, "learning_rate": 4.812414320887256e-06, "loss": 1.058, "step": 1503 }, { "epoch": 0.7911625460284061, "grad_norm": 2.049888849258423, "learning_rate": 4.812150220487708e-06, "loss": 1.0033, "step": 1504 }, { "epoch": 0.7916885849552867, "grad_norm": 2.025468587875366, "learning_rate": 4.811885941563659e-06, "loss": 1.0066, "step": 1505 }, { "epoch": 0.7922146238821672, "grad_norm": 2.0612878799438477, "learning_rate": 4.8116214841355145e-06, "loss": 0.9783, "step": 1506 }, { "epoch": 0.7927406628090479, "grad_norm": 1.9370075464248657, "learning_rate": 4.811356848223693e-06, "loss": 1.0171, "step": 1507 }, { "epoch": 0.7932667017359285, "grad_norm": 2.069326877593994, "learning_rate": 4.8110920338486285e-06, "loss": 1.0283, "step": 1508 }, { "epoch": 0.793792740662809, "grad_norm": 2.076786518096924, "learning_rate": 4.810827041030768e-06, "loss": 0.9942, "step": 1509 }, { "epoch": 0.7943187795896897, "grad_norm": 1.8861708641052246, "learning_rate": 4.810561869790571e-06, "loss": 0.9909, "step": 1510 }, { "epoch": 0.7948448185165702, "grad_norm": 2.064493417739868, "learning_rate": 4.810296520148513e-06, "loss": 1.0302, "step": 1511 }, { "epoch": 0.7953708574434508, "grad_norm": 2.0212459564208984, "learning_rate": 4.810030992125081e-06, "loss": 0.9912, "step": 1512 }, { "epoch": 0.7958968963703315, "grad_norm": 2.047384023666382, "learning_rate": 4.809765285740776e-06, "loss": 1.0165, "step": 1513 }, { "epoch": 0.796422935297212, "grad_norm": 2.2222740650177, "learning_rate": 4.809499401016115e-06, "loss": 1.0295, "step": 1514 }, { "epoch": 0.7969489742240926, "grad_norm": 1.9516112804412842, "learning_rate": 4.809233337971627e-06, "loss": 0.9562, "step": 1515 }, { "epoch": 0.7974750131509731, "grad_norm": 2.0002121925354004, "learning_rate": 4.808967096627855e-06, "loss": 1.0076, "step": 1516 }, { "epoch": 0.7980010520778538, "grad_norm": 2.182039260864258, "learning_rate": 4.808700677005357e-06, "loss": 0.9925, "step": 1517 }, { "epoch": 0.7985270910047344, "grad_norm": 2.0578761100769043, "learning_rate": 4.808434079124701e-06, "loss": 0.9831, "step": 1518 }, { "epoch": 0.7990531299316149, "grad_norm": 1.8856642246246338, "learning_rate": 4.8081673030064735e-06, "loss": 1.0309, "step": 1519 }, { "epoch": 0.7995791688584956, "grad_norm": 2.1273880004882812, "learning_rate": 4.807900348671272e-06, "loss": 1.0581, "step": 1520 }, { "epoch": 0.8001052077853761, "grad_norm": 2.0696675777435303, "learning_rate": 4.8076332161397085e-06, "loss": 1.0402, "step": 1521 }, { "epoch": 0.8006312467122567, "grad_norm": 2.034176826477051, "learning_rate": 4.80736590543241e-06, "loss": 1.018, "step": 1522 }, { "epoch": 0.8011572856391374, "grad_norm": 1.9405510425567627, "learning_rate": 4.807098416570014e-06, "loss": 1.037, "step": 1523 }, { "epoch": 0.8016833245660179, "grad_norm": 2.0185844898223877, "learning_rate": 4.806830749573174e-06, "loss": 1.0817, "step": 1524 }, { "epoch": 0.8022093634928985, "grad_norm": 2.0617692470550537, "learning_rate": 4.806562904462559e-06, "loss": 0.989, "step": 1525 }, { "epoch": 0.802735402419779, "grad_norm": 2.022000789642334, "learning_rate": 4.806294881258846e-06, "loss": 1.0245, "step": 1526 }, { "epoch": 0.8032614413466597, "grad_norm": 2.189361572265625, "learning_rate": 4.806026679982733e-06, "loss": 1.0537, "step": 1527 }, { "epoch": 0.8037874802735402, "grad_norm": 2.0094563961029053, "learning_rate": 4.805758300654926e-06, "loss": 1.0437, "step": 1528 }, { "epoch": 0.8043135192004208, "grad_norm": 1.8940585851669312, "learning_rate": 4.805489743296148e-06, "loss": 0.9811, "step": 1529 }, { "epoch": 0.8048395581273015, "grad_norm": 2.0169241428375244, "learning_rate": 4.805221007927134e-06, "loss": 1.0354, "step": 1530 }, { "epoch": 0.805365597054182, "grad_norm": 2.1269545555114746, "learning_rate": 4.804952094568635e-06, "loss": 1.0439, "step": 1531 }, { "epoch": 0.8058916359810626, "grad_norm": 1.99850332736969, "learning_rate": 4.804683003241413e-06, "loss": 1.0313, "step": 1532 }, { "epoch": 0.8064176749079431, "grad_norm": 2.0577683448791504, "learning_rate": 4.804413733966244e-06, "loss": 1.0319, "step": 1533 }, { "epoch": 0.8069437138348238, "grad_norm": 1.993945837020874, "learning_rate": 4.804144286763921e-06, "loss": 1.0571, "step": 1534 }, { "epoch": 0.8074697527617044, "grad_norm": 2.00144624710083, "learning_rate": 4.803874661655246e-06, "loss": 1.0136, "step": 1535 }, { "epoch": 0.8079957916885849, "grad_norm": 2.114583969116211, "learning_rate": 4.8036048586610394e-06, "loss": 0.9996, "step": 1536 }, { "epoch": 0.8085218306154656, "grad_norm": 2.019767999649048, "learning_rate": 4.803334877802131e-06, "loss": 0.9812, "step": 1537 }, { "epoch": 0.8090478695423461, "grad_norm": 2.1253437995910645, "learning_rate": 4.803064719099368e-06, "loss": 1.041, "step": 1538 }, { "epoch": 0.8095739084692267, "grad_norm": 2.055514335632324, "learning_rate": 4.802794382573609e-06, "loss": 0.9733, "step": 1539 }, { "epoch": 0.8100999473961074, "grad_norm": 2.0274434089660645, "learning_rate": 4.802523868245727e-06, "loss": 1.0222, "step": 1540 }, { "epoch": 0.8106259863229879, "grad_norm": 2.1663291454315186, "learning_rate": 4.80225317613661e-06, "loss": 1.0308, "step": 1541 }, { "epoch": 0.8111520252498685, "grad_norm": 1.8864918947219849, "learning_rate": 4.801982306267156e-06, "loss": 0.9551, "step": 1542 }, { "epoch": 0.811678064176749, "grad_norm": 2.1302011013031006, "learning_rate": 4.801711258658281e-06, "loss": 1.0188, "step": 1543 }, { "epoch": 0.8122041031036297, "grad_norm": 1.9002829790115356, "learning_rate": 4.801440033330914e-06, "loss": 1.0278, "step": 1544 }, { "epoch": 0.8127301420305103, "grad_norm": 2.1114113330841064, "learning_rate": 4.801168630305995e-06, "loss": 1.0616, "step": 1545 }, { "epoch": 0.8132561809573908, "grad_norm": 1.9383304119110107, "learning_rate": 4.800897049604479e-06, "loss": 0.9977, "step": 1546 }, { "epoch": 0.8137822198842715, "grad_norm": 1.9206221103668213, "learning_rate": 4.800625291247338e-06, "loss": 0.9758, "step": 1547 }, { "epoch": 0.814308258811152, "grad_norm": 1.9258513450622559, "learning_rate": 4.800353355255552e-06, "loss": 0.985, "step": 1548 }, { "epoch": 0.8148342977380326, "grad_norm": 1.9767898321151733, "learning_rate": 4.800081241650117e-06, "loss": 0.9802, "step": 1549 }, { "epoch": 0.8153603366649133, "grad_norm": 1.9899487495422363, "learning_rate": 4.799808950452047e-06, "loss": 1.0104, "step": 1550 }, { "epoch": 0.8158863755917938, "grad_norm": 1.9970616102218628, "learning_rate": 4.799536481682362e-06, "loss": 1.0125, "step": 1551 }, { "epoch": 0.8164124145186744, "grad_norm": 1.9914542436599731, "learning_rate": 4.799263835362103e-06, "loss": 1.0458, "step": 1552 }, { "epoch": 0.8169384534455549, "grad_norm": 2.072939157485962, "learning_rate": 4.798991011512319e-06, "loss": 1.0663, "step": 1553 }, { "epoch": 0.8174644923724356, "grad_norm": 1.9783833026885986, "learning_rate": 4.798718010154076e-06, "loss": 1.0281, "step": 1554 }, { "epoch": 0.8179905312993162, "grad_norm": 2.4431405067443848, "learning_rate": 4.798444831308454e-06, "loss": 1.0667, "step": 1555 }, { "epoch": 0.8185165702261967, "grad_norm": 2.1270408630371094, "learning_rate": 4.798171474996543e-06, "loss": 1.0217, "step": 1556 }, { "epoch": 0.8190426091530774, "grad_norm": 2.091042995452881, "learning_rate": 4.797897941239452e-06, "loss": 1.0126, "step": 1557 }, { "epoch": 0.8195686480799579, "grad_norm": 2.016575336456299, "learning_rate": 4.797624230058299e-06, "loss": 1.0269, "step": 1558 }, { "epoch": 0.8200946870068385, "grad_norm": 2.1780738830566406, "learning_rate": 4.797350341474218e-06, "loss": 1.0405, "step": 1559 }, { "epoch": 0.820620725933719, "grad_norm": 2.0331525802612305, "learning_rate": 4.797076275508358e-06, "loss": 1.0452, "step": 1560 }, { "epoch": 0.8211467648605997, "grad_norm": 2.0023865699768066, "learning_rate": 4.796802032181877e-06, "loss": 0.9752, "step": 1561 }, { "epoch": 0.8216728037874803, "grad_norm": 2.11030912399292, "learning_rate": 4.796527611515952e-06, "loss": 1.0675, "step": 1562 }, { "epoch": 0.8221988427143608, "grad_norm": 2.0733113288879395, "learning_rate": 4.7962530135317705e-06, "loss": 1.0511, "step": 1563 }, { "epoch": 0.8227248816412415, "grad_norm": 2.0920655727386475, "learning_rate": 4.795978238250535e-06, "loss": 1.0797, "step": 1564 }, { "epoch": 0.823250920568122, "grad_norm": 2.218693256378174, "learning_rate": 4.795703285693461e-06, "loss": 1.0385, "step": 1565 }, { "epoch": 0.8237769594950026, "grad_norm": 1.9661623239517212, "learning_rate": 4.795428155881779e-06, "loss": 1.001, "step": 1566 }, { "epoch": 0.8243029984218833, "grad_norm": 2.1669209003448486, "learning_rate": 4.795152848836731e-06, "loss": 1.0317, "step": 1567 }, { "epoch": 0.8248290373487638, "grad_norm": 1.9323532581329346, "learning_rate": 4.794877364579573e-06, "loss": 1.0182, "step": 1568 }, { "epoch": 0.8253550762756444, "grad_norm": 1.9551295042037964, "learning_rate": 4.794601703131579e-06, "loss": 1.0048, "step": 1569 }, { "epoch": 0.8258811152025249, "grad_norm": 1.9809366464614868, "learning_rate": 4.7943258645140285e-06, "loss": 1.0377, "step": 1570 }, { "epoch": 0.8264071541294056, "grad_norm": 2.0074756145477295, "learning_rate": 4.794049848748224e-06, "loss": 1.0218, "step": 1571 }, { "epoch": 0.8269331930562862, "grad_norm": 2.0177736282348633, "learning_rate": 4.793773655855474e-06, "loss": 1.0402, "step": 1572 }, { "epoch": 0.8274592319831667, "grad_norm": 2.0348360538482666, "learning_rate": 4.7934972858571035e-06, "loss": 1.0312, "step": 1573 }, { "epoch": 0.8279852709100474, "grad_norm": 2.097808599472046, "learning_rate": 4.793220738774455e-06, "loss": 1.0618, "step": 1574 }, { "epoch": 0.8285113098369279, "grad_norm": 2.061023473739624, "learning_rate": 4.792944014628877e-06, "loss": 1.0464, "step": 1575 }, { "epoch": 0.8290373487638085, "grad_norm": 2.1510798931121826, "learning_rate": 4.792667113441738e-06, "loss": 1.0102, "step": 1576 }, { "epoch": 0.8295633876906892, "grad_norm": 2.1446409225463867, "learning_rate": 4.7923900352344185e-06, "loss": 1.0577, "step": 1577 }, { "epoch": 0.8300894266175697, "grad_norm": 2.2582831382751465, "learning_rate": 4.79211278002831e-06, "loss": 1.1042, "step": 1578 }, { "epoch": 0.8306154655444503, "grad_norm": 2.0069401264190674, "learning_rate": 4.791835347844821e-06, "loss": 0.9835, "step": 1579 }, { "epoch": 0.8311415044713308, "grad_norm": 2.0074360370635986, "learning_rate": 4.791557738705372e-06, "loss": 1.0596, "step": 1580 }, { "epoch": 0.8316675433982115, "grad_norm": 2.2237892150878906, "learning_rate": 4.791279952631399e-06, "loss": 1.0162, "step": 1581 }, { "epoch": 0.8321935823250921, "grad_norm": 2.0037453174591064, "learning_rate": 4.791001989644349e-06, "loss": 0.9879, "step": 1582 }, { "epoch": 0.8327196212519726, "grad_norm": 1.994869351387024, "learning_rate": 4.790723849765684e-06, "loss": 0.9908, "step": 1583 }, { "epoch": 0.8332456601788533, "grad_norm": 2.1808955669403076, "learning_rate": 4.790445533016879e-06, "loss": 0.9896, "step": 1584 }, { "epoch": 0.8337716991057338, "grad_norm": 1.9274131059646606, "learning_rate": 4.790167039419424e-06, "loss": 0.9383, "step": 1585 }, { "epoch": 0.8342977380326144, "grad_norm": 2.0095322132110596, "learning_rate": 4.789888368994823e-06, "loss": 1.0282, "step": 1586 }, { "epoch": 0.8348237769594951, "grad_norm": 1.957546353340149, "learning_rate": 4.7896095217645895e-06, "loss": 0.9559, "step": 1587 }, { "epoch": 0.8353498158863756, "grad_norm": 2.1231918334960938, "learning_rate": 4.789330497750258e-06, "loss": 1.0414, "step": 1588 }, { "epoch": 0.8358758548132562, "grad_norm": 2.0618984699249268, "learning_rate": 4.789051296973368e-06, "loss": 0.9931, "step": 1589 }, { "epoch": 0.8364018937401367, "grad_norm": 2.023416042327881, "learning_rate": 4.78877191945548e-06, "loss": 0.963, "step": 1590 }, { "epoch": 0.8369279326670174, "grad_norm": 2.0902810096740723, "learning_rate": 4.788492365218164e-06, "loss": 1.076, "step": 1591 }, { "epoch": 0.8374539715938979, "grad_norm": 1.9094164371490479, "learning_rate": 4.788212634283005e-06, "loss": 0.9444, "step": 1592 }, { "epoch": 0.8379800105207785, "grad_norm": 1.9887592792510986, "learning_rate": 4.7879327266716e-06, "loss": 1.0364, "step": 1593 }, { "epoch": 0.8385060494476592, "grad_norm": 2.0019707679748535, "learning_rate": 4.787652642405564e-06, "loss": 1.0544, "step": 1594 }, { "epoch": 0.8390320883745397, "grad_norm": 2.0776329040527344, "learning_rate": 4.787372381506521e-06, "loss": 0.9949, "step": 1595 }, { "epoch": 0.8395581273014203, "grad_norm": 2.0091662406921387, "learning_rate": 4.7870919439961094e-06, "loss": 1.0165, "step": 1596 }, { "epoch": 0.8400841662283008, "grad_norm": 2.0458288192749023, "learning_rate": 4.786811329895984e-06, "loss": 1.0341, "step": 1597 }, { "epoch": 0.8406102051551815, "grad_norm": 2.0741751194000244, "learning_rate": 4.78653053922781e-06, "loss": 1.0509, "step": 1598 }, { "epoch": 0.8411362440820621, "grad_norm": 2.141406774520874, "learning_rate": 4.7862495720132695e-06, "loss": 1.0665, "step": 1599 }, { "epoch": 0.8416622830089426, "grad_norm": 2.2400975227355957, "learning_rate": 4.785968428274055e-06, "loss": 0.93, "step": 1600 }, { "epoch": 0.8421883219358233, "grad_norm": 1.929742455482483, "learning_rate": 4.785687108031875e-06, "loss": 1.0339, "step": 1601 }, { "epoch": 0.8427143608627038, "grad_norm": 2.012728452682495, "learning_rate": 4.785405611308448e-06, "loss": 0.9945, "step": 1602 }, { "epoch": 0.8432403997895844, "grad_norm": 2.0826306343078613, "learning_rate": 4.785123938125511e-06, "loss": 1.0322, "step": 1603 }, { "epoch": 0.8437664387164651, "grad_norm": 2.0303595066070557, "learning_rate": 4.784842088504813e-06, "loss": 1.0304, "step": 1604 }, { "epoch": 0.8442924776433456, "grad_norm": 2.0710513591766357, "learning_rate": 4.7845600624681145e-06, "loss": 1.0358, "step": 1605 }, { "epoch": 0.8448185165702262, "grad_norm": 2.052515983581543, "learning_rate": 4.784277860037192e-06, "loss": 1.0316, "step": 1606 }, { "epoch": 0.8453445554971067, "grad_norm": 2.1331636905670166, "learning_rate": 4.783995481233835e-06, "loss": 1.0139, "step": 1607 }, { "epoch": 0.8458705944239874, "grad_norm": 1.9738709926605225, "learning_rate": 4.783712926079846e-06, "loss": 1.034, "step": 1608 }, { "epoch": 0.846396633350868, "grad_norm": 2.059412956237793, "learning_rate": 4.78343019459704e-06, "loss": 1.0468, "step": 1609 }, { "epoch": 0.8469226722777485, "grad_norm": 2.027773141860962, "learning_rate": 4.783147286807249e-06, "loss": 1.0028, "step": 1610 }, { "epoch": 0.8474487112046292, "grad_norm": 2.1288933753967285, "learning_rate": 4.782864202732317e-06, "loss": 1.0177, "step": 1611 }, { "epoch": 0.8479747501315097, "grad_norm": 2.160947322845459, "learning_rate": 4.7825809423941e-06, "loss": 0.9814, "step": 1612 }, { "epoch": 0.8485007890583903, "grad_norm": 2.021970272064209, "learning_rate": 4.782297505814469e-06, "loss": 1.0198, "step": 1613 }, { "epoch": 0.849026827985271, "grad_norm": 1.9154043197631836, "learning_rate": 4.7820138930153106e-06, "loss": 1.0044, "step": 1614 }, { "epoch": 0.8495528669121515, "grad_norm": 2.0858964920043945, "learning_rate": 4.781730104018521e-06, "loss": 0.9932, "step": 1615 }, { "epoch": 0.8500789058390321, "grad_norm": 2.236711025238037, "learning_rate": 4.7814461388460105e-06, "loss": 1.0495, "step": 1616 }, { "epoch": 0.8506049447659126, "grad_norm": 2.0810344219207764, "learning_rate": 4.781161997519707e-06, "loss": 1.0617, "step": 1617 }, { "epoch": 0.8511309836927933, "grad_norm": 2.224187135696411, "learning_rate": 4.780877680061551e-06, "loss": 0.9911, "step": 1618 }, { "epoch": 0.8516570226196739, "grad_norm": 1.8846218585968018, "learning_rate": 4.780593186493491e-06, "loss": 1.0185, "step": 1619 }, { "epoch": 0.8521830615465544, "grad_norm": 2.0876333713531494, "learning_rate": 4.780308516837495e-06, "loss": 1.0173, "step": 1620 }, { "epoch": 0.8527091004734351, "grad_norm": 1.942492961883545, "learning_rate": 4.780023671115544e-06, "loss": 1.0154, "step": 1621 }, { "epoch": 0.8532351394003156, "grad_norm": 1.9483400583267212, "learning_rate": 4.779738649349629e-06, "loss": 1.0492, "step": 1622 }, { "epoch": 0.8537611783271962, "grad_norm": 1.8866205215454102, "learning_rate": 4.7794534515617586e-06, "loss": 0.9896, "step": 1623 }, { "epoch": 0.8542872172540767, "grad_norm": 2.146117687225342, "learning_rate": 4.779168077773953e-06, "loss": 1.0391, "step": 1624 }, { "epoch": 0.8548132561809574, "grad_norm": 2.099858283996582, "learning_rate": 4.778882528008245e-06, "loss": 1.0185, "step": 1625 }, { "epoch": 0.855339295107838, "grad_norm": 2.0597662925720215, "learning_rate": 4.7785968022866846e-06, "loss": 1.0373, "step": 1626 }, { "epoch": 0.8558653340347185, "grad_norm": 2.0234663486480713, "learning_rate": 4.7783109006313316e-06, "loss": 1.0471, "step": 1627 }, { "epoch": 0.8563913729615992, "grad_norm": 1.9113049507141113, "learning_rate": 4.778024823064261e-06, "loss": 1.01, "step": 1628 }, { "epoch": 0.8569174118884797, "grad_norm": 2.4924910068511963, "learning_rate": 4.777738569607562e-06, "loss": 1.0267, "step": 1629 }, { "epoch": 0.8574434508153603, "grad_norm": 1.9605613946914673, "learning_rate": 4.777452140283336e-06, "loss": 1.0237, "step": 1630 }, { "epoch": 0.857969489742241, "grad_norm": 2.1404225826263428, "learning_rate": 4.7771655351136996e-06, "loss": 1.0353, "step": 1631 }, { "epoch": 0.8584955286691215, "grad_norm": 2.1174509525299072, "learning_rate": 4.776878754120781e-06, "loss": 1.0517, "step": 1632 }, { "epoch": 0.8590215675960021, "grad_norm": 1.895843267440796, "learning_rate": 4.7765917973267226e-06, "loss": 0.9479, "step": 1633 }, { "epoch": 0.8595476065228826, "grad_norm": 2.080152988433838, "learning_rate": 4.776304664753682e-06, "loss": 1.0642, "step": 1634 }, { "epoch": 0.8600736454497633, "grad_norm": 1.9730490446090698, "learning_rate": 4.776017356423827e-06, "loss": 1.0059, "step": 1635 }, { "epoch": 0.8605996843766439, "grad_norm": 2.19085693359375, "learning_rate": 4.775729872359343e-06, "loss": 1.0368, "step": 1636 }, { "epoch": 0.8611257233035244, "grad_norm": 2.14911150932312, "learning_rate": 4.775442212582428e-06, "loss": 1.0583, "step": 1637 }, { "epoch": 0.8616517622304051, "grad_norm": 1.9603419303894043, "learning_rate": 4.775154377115291e-06, "loss": 1.0336, "step": 1638 }, { "epoch": 0.8621778011572856, "grad_norm": 1.9417442083358765, "learning_rate": 4.774866365980156e-06, "loss": 0.9885, "step": 1639 }, { "epoch": 0.8627038400841662, "grad_norm": 2.092170000076294, "learning_rate": 4.774578179199261e-06, "loss": 1.0496, "step": 1640 }, { "epoch": 0.8632298790110469, "grad_norm": 2.0614163875579834, "learning_rate": 4.774289816794858e-06, "loss": 1.0011, "step": 1641 }, { "epoch": 0.8637559179379274, "grad_norm": 2.168977975845337, "learning_rate": 4.774001278789211e-06, "loss": 1.0342, "step": 1642 }, { "epoch": 0.864281956864808, "grad_norm": 2.0560708045959473, "learning_rate": 4.773712565204599e-06, "loss": 1.0239, "step": 1643 }, { "epoch": 0.8648079957916885, "grad_norm": 1.9980727434158325, "learning_rate": 4.773423676063314e-06, "loss": 1.0312, "step": 1644 }, { "epoch": 0.8653340347185692, "grad_norm": 2.0650413036346436, "learning_rate": 4.773134611387661e-06, "loss": 1.0468, "step": 1645 }, { "epoch": 0.8658600736454498, "grad_norm": 1.954148530960083, "learning_rate": 4.77284537119996e-06, "loss": 1.0138, "step": 1646 }, { "epoch": 0.8663861125723303, "grad_norm": 2.092515468597412, "learning_rate": 4.772555955522543e-06, "loss": 0.987, "step": 1647 }, { "epoch": 0.866912151499211, "grad_norm": 2.007941246032715, "learning_rate": 4.772266364377757e-06, "loss": 0.9918, "step": 1648 }, { "epoch": 0.8674381904260915, "grad_norm": 1.9608757495880127, "learning_rate": 4.77197659778796e-06, "loss": 1.0502, "step": 1649 }, { "epoch": 0.8679642293529721, "grad_norm": 2.0067436695098877, "learning_rate": 4.771686655775527e-06, "loss": 1.0335, "step": 1650 }, { "epoch": 0.8684902682798528, "grad_norm": 2.079745292663574, "learning_rate": 4.771396538362845e-06, "loss": 1.043, "step": 1651 }, { "epoch": 0.8690163072067333, "grad_norm": 1.9542405605316162, "learning_rate": 4.771106245572313e-06, "loss": 0.984, "step": 1652 }, { "epoch": 0.8695423461336139, "grad_norm": 2.028416872024536, "learning_rate": 4.770815777426346e-06, "loss": 0.9933, "step": 1653 }, { "epoch": 0.8700683850604944, "grad_norm": 1.9436818361282349, "learning_rate": 4.77052513394737e-06, "loss": 1.0118, "step": 1654 }, { "epoch": 0.8705944239873751, "grad_norm": 2.028409004211426, "learning_rate": 4.770234315157828e-06, "loss": 1.0494, "step": 1655 }, { "epoch": 0.8711204629142556, "grad_norm": 2.0709540843963623, "learning_rate": 4.769943321080174e-06, "loss": 1.0542, "step": 1656 }, { "epoch": 0.8716465018411362, "grad_norm": 2.0256619453430176, "learning_rate": 4.7696521517368755e-06, "loss": 1.0011, "step": 1657 }, { "epoch": 0.8721725407680169, "grad_norm": 2.0937297344207764, "learning_rate": 4.769360807150414e-06, "loss": 0.9974, "step": 1658 }, { "epoch": 0.8726985796948974, "grad_norm": 2.2346062660217285, "learning_rate": 4.769069287343285e-06, "loss": 1.0128, "step": 1659 }, { "epoch": 0.873224618621778, "grad_norm": 2.1082491874694824, "learning_rate": 4.7687775923379975e-06, "loss": 1.0321, "step": 1660 }, { "epoch": 0.8737506575486585, "grad_norm": 2.0769453048706055, "learning_rate": 4.768485722157074e-06, "loss": 0.973, "step": 1661 }, { "epoch": 0.8742766964755392, "grad_norm": 2.0329558849334717, "learning_rate": 4.768193676823048e-06, "loss": 1.0102, "step": 1662 }, { "epoch": 0.8748027354024198, "grad_norm": 2.0758261680603027, "learning_rate": 4.767901456358471e-06, "loss": 1.0125, "step": 1663 }, { "epoch": 0.8753287743293003, "grad_norm": 2.12320613861084, "learning_rate": 4.767609060785905e-06, "loss": 1.0294, "step": 1664 }, { "epoch": 0.875854813256181, "grad_norm": 1.9771841764450073, "learning_rate": 4.767316490127927e-06, "loss": 0.9886, "step": 1665 }, { "epoch": 0.8763808521830615, "grad_norm": 1.9373329877853394, "learning_rate": 4.7670237444071255e-06, "loss": 0.994, "step": 1666 }, { "epoch": 0.8769068911099421, "grad_norm": 2.0343801975250244, "learning_rate": 4.766730823646105e-06, "loss": 1.0352, "step": 1667 }, { "epoch": 0.8774329300368228, "grad_norm": 2.020343542098999, "learning_rate": 4.766437727867481e-06, "loss": 0.979, "step": 1668 }, { "epoch": 0.8779589689637033, "grad_norm": 2.107820510864258, "learning_rate": 4.766144457093886e-06, "loss": 1.0296, "step": 1669 }, { "epoch": 0.8784850078905839, "grad_norm": 2.1452198028564453, "learning_rate": 4.765851011347962e-06, "loss": 1.0438, "step": 1670 }, { "epoch": 0.8790110468174644, "grad_norm": 2.087686777114868, "learning_rate": 4.7655573906523665e-06, "loss": 0.9788, "step": 1671 }, { "epoch": 0.8795370857443451, "grad_norm": 2.083097457885742, "learning_rate": 4.765263595029771e-06, "loss": 0.9921, "step": 1672 }, { "epoch": 0.8800631246712257, "grad_norm": 2.0001168251037598, "learning_rate": 4.76496962450286e-06, "loss": 0.9784, "step": 1673 }, { "epoch": 0.8805891635981062, "grad_norm": 1.9493898153305054, "learning_rate": 4.7646754790943315e-06, "loss": 1.0145, "step": 1674 }, { "epoch": 0.8811152025249869, "grad_norm": 2.140746831893921, "learning_rate": 4.764381158826896e-06, "loss": 1.0286, "step": 1675 }, { "epoch": 0.8816412414518674, "grad_norm": 2.0411407947540283, "learning_rate": 4.764086663723278e-06, "loss": 1.0297, "step": 1676 }, { "epoch": 0.882167280378748, "grad_norm": 2.164043664932251, "learning_rate": 4.763791993806218e-06, "loss": 1.0246, "step": 1677 }, { "epoch": 0.8826933193056287, "grad_norm": 2.0231616497039795, "learning_rate": 4.7634971490984675e-06, "loss": 0.9692, "step": 1678 }, { "epoch": 0.8832193582325092, "grad_norm": 2.0884130001068115, "learning_rate": 4.763202129622789e-06, "loss": 1.0441, "step": 1679 }, { "epoch": 0.8837453971593898, "grad_norm": 1.959078311920166, "learning_rate": 4.7629069354019654e-06, "loss": 1.0166, "step": 1680 }, { "epoch": 0.8842714360862703, "grad_norm": 1.836121916770935, "learning_rate": 4.762611566458786e-06, "loss": 1.0347, "step": 1681 }, { "epoch": 0.884797475013151, "grad_norm": 2.099907398223877, "learning_rate": 4.762316022816058e-06, "loss": 1.0309, "step": 1682 }, { "epoch": 0.8853235139400316, "grad_norm": 1.941465139389038, "learning_rate": 4.7620203044966004e-06, "loss": 1.0203, "step": 1683 }, { "epoch": 0.8858495528669121, "grad_norm": 1.893522024154663, "learning_rate": 4.761724411523247e-06, "loss": 0.9769, "step": 1684 }, { "epoch": 0.8863755917937928, "grad_norm": 1.9919662475585938, "learning_rate": 4.7614283439188426e-06, "loss": 1.0116, "step": 1685 }, { "epoch": 0.8869016307206733, "grad_norm": 1.9670614004135132, "learning_rate": 4.761132101706249e-06, "loss": 0.9719, "step": 1686 }, { "epoch": 0.8874276696475539, "grad_norm": 1.9545384645462036, "learning_rate": 4.760835684908337e-06, "loss": 0.9986, "step": 1687 }, { "epoch": 0.8879537085744345, "grad_norm": 1.9402283430099487, "learning_rate": 4.7605390935479946e-06, "loss": 0.9911, "step": 1688 }, { "epoch": 0.8884797475013151, "grad_norm": 1.954526424407959, "learning_rate": 4.760242327648122e-06, "loss": 1.0021, "step": 1689 }, { "epoch": 0.8890057864281957, "grad_norm": 1.9458253383636475, "learning_rate": 4.759945387231633e-06, "loss": 1.0346, "step": 1690 }, { "epoch": 0.8895318253550762, "grad_norm": 1.9583990573883057, "learning_rate": 4.7596482723214565e-06, "loss": 1.0509, "step": 1691 }, { "epoch": 0.8900578642819569, "grad_norm": 2.0227482318878174, "learning_rate": 4.75935098294053e-06, "loss": 1.0651, "step": 1692 }, { "epoch": 0.8905839032088374, "grad_norm": 1.977971076965332, "learning_rate": 4.7590535191118096e-06, "loss": 1.0609, "step": 1693 }, { "epoch": 0.891109942135718, "grad_norm": 2.0564186573028564, "learning_rate": 4.758755880858262e-06, "loss": 1.0125, "step": 1694 }, { "epoch": 0.8916359810625987, "grad_norm": 1.9081783294677734, "learning_rate": 4.75845806820287e-06, "loss": 1.007, "step": 1695 }, { "epoch": 0.8921620199894792, "grad_norm": 2.0456745624542236, "learning_rate": 4.758160081168626e-06, "loss": 1.0116, "step": 1696 }, { "epoch": 0.8926880589163598, "grad_norm": 1.9237746000289917, "learning_rate": 4.757861919778539e-06, "loss": 1.0023, "step": 1697 }, { "epoch": 0.8932140978432404, "grad_norm": 1.9402356147766113, "learning_rate": 4.75756358405563e-06, "loss": 1.0264, "step": 1698 }, { "epoch": 0.893740136770121, "grad_norm": 1.9538573026657104, "learning_rate": 4.757265074022935e-06, "loss": 0.9582, "step": 1699 }, { "epoch": 0.8942661756970016, "grad_norm": 2.09053897857666, "learning_rate": 4.756966389703501e-06, "loss": 1.0245, "step": 1700 }, { "epoch": 0.8947922146238821, "grad_norm": 2.071685552597046, "learning_rate": 4.756667531120391e-06, "loss": 1.0124, "step": 1701 }, { "epoch": 0.8953182535507628, "grad_norm": 2.0141103267669678, "learning_rate": 4.75636849829668e-06, "loss": 0.9852, "step": 1702 }, { "epoch": 0.8958442924776433, "grad_norm": 1.9167203903198242, "learning_rate": 4.756069291255456e-06, "loss": 1.0194, "step": 1703 }, { "epoch": 0.8963703314045239, "grad_norm": 2.011918067932129, "learning_rate": 4.755769910019823e-06, "loss": 1.0029, "step": 1704 }, { "epoch": 0.8968963703314046, "grad_norm": 2.1252031326293945, "learning_rate": 4.755470354612895e-06, "loss": 1.0071, "step": 1705 }, { "epoch": 0.8974224092582851, "grad_norm": 2.0214016437530518, "learning_rate": 4.755170625057801e-06, "loss": 1.0371, "step": 1706 }, { "epoch": 0.8979484481851657, "grad_norm": 2.4289193153381348, "learning_rate": 4.754870721377685e-06, "loss": 1.0581, "step": 1707 }, { "epoch": 0.8984744871120462, "grad_norm": 2.1093404293060303, "learning_rate": 4.754570643595702e-06, "loss": 1.0017, "step": 1708 }, { "epoch": 0.8990005260389269, "grad_norm": 2.0420546531677246, "learning_rate": 4.7542703917350215e-06, "loss": 1.0642, "step": 1709 }, { "epoch": 0.8995265649658075, "grad_norm": 1.9818446636199951, "learning_rate": 4.753969965818827e-06, "loss": 1.0313, "step": 1710 }, { "epoch": 0.900052603892688, "grad_norm": 1.897628664970398, "learning_rate": 4.753669365870313e-06, "loss": 0.9875, "step": 1711 }, { "epoch": 0.9005786428195687, "grad_norm": 2.0208487510681152, "learning_rate": 4.753368591912693e-06, "loss": 1.0271, "step": 1712 }, { "epoch": 0.9011046817464492, "grad_norm": 1.9346519708633423, "learning_rate": 4.753067643969186e-06, "loss": 1.0352, "step": 1713 }, { "epoch": 0.9016307206733298, "grad_norm": 2.0617661476135254, "learning_rate": 4.75276652206303e-06, "loss": 0.9806, "step": 1714 }, { "epoch": 0.9021567596002105, "grad_norm": 1.8809938430786133, "learning_rate": 4.752465226217477e-06, "loss": 1.0333, "step": 1715 }, { "epoch": 0.902682798527091, "grad_norm": 2.047309398651123, "learning_rate": 4.752163756455789e-06, "loss": 1.0614, "step": 1716 }, { "epoch": 0.9032088374539716, "grad_norm": 2.1308083534240723, "learning_rate": 4.751862112801242e-06, "loss": 1.0229, "step": 1717 }, { "epoch": 0.9037348763808521, "grad_norm": 2.0333852767944336, "learning_rate": 4.751560295277127e-06, "loss": 1.0077, "step": 1718 }, { "epoch": 0.9042609153077328, "grad_norm": 1.9486128091812134, "learning_rate": 4.7512583039067485e-06, "loss": 1.0026, "step": 1719 }, { "epoch": 0.9047869542346134, "grad_norm": 2.004258394241333, "learning_rate": 4.750956138713424e-06, "loss": 0.986, "step": 1720 }, { "epoch": 0.9053129931614939, "grad_norm": 2.5763192176818848, "learning_rate": 4.750653799720483e-06, "loss": 0.979, "step": 1721 }, { "epoch": 0.9058390320883746, "grad_norm": 2.1086039543151855, "learning_rate": 4.750351286951269e-06, "loss": 1.0368, "step": 1722 }, { "epoch": 0.9063650710152551, "grad_norm": 2.0445361137390137, "learning_rate": 4.750048600429141e-06, "loss": 0.9756, "step": 1723 }, { "epoch": 0.9068911099421357, "grad_norm": 1.8900635242462158, "learning_rate": 4.7497457401774694e-06, "loss": 0.8947, "step": 1724 }, { "epoch": 0.9074171488690163, "grad_norm": 2.116900682449341, "learning_rate": 4.749442706219638e-06, "loss": 1.0502, "step": 1725 }, { "epoch": 0.9079431877958969, "grad_norm": 2.1096391677856445, "learning_rate": 4.749139498579044e-06, "loss": 1.0089, "step": 1726 }, { "epoch": 0.9084692267227775, "grad_norm": 2.2117018699645996, "learning_rate": 4.7488361172791005e-06, "loss": 1.056, "step": 1727 }, { "epoch": 0.908995265649658, "grad_norm": 2.0012335777282715, "learning_rate": 4.748532562343231e-06, "loss": 0.916, "step": 1728 }, { "epoch": 0.9095213045765387, "grad_norm": 1.8673421144485474, "learning_rate": 4.748228833794872e-06, "loss": 0.9844, "step": 1729 }, { "epoch": 0.9100473435034192, "grad_norm": 1.9152559041976929, "learning_rate": 4.747924931657477e-06, "loss": 0.9619, "step": 1730 }, { "epoch": 0.9105733824302998, "grad_norm": 2.107985496520996, "learning_rate": 4.7476208559545104e-06, "loss": 1.017, "step": 1731 }, { "epoch": 0.9110994213571805, "grad_norm": 2.162464141845703, "learning_rate": 4.7473166067094474e-06, "loss": 1.0197, "step": 1732 }, { "epoch": 0.911625460284061, "grad_norm": 2.085958480834961, "learning_rate": 4.747012183945784e-06, "loss": 1.0166, "step": 1733 }, { "epoch": 0.9121514992109416, "grad_norm": 2.0198309421539307, "learning_rate": 4.746707587687022e-06, "loss": 0.9883, "step": 1734 }, { "epoch": 0.9126775381378222, "grad_norm": 2.013784646987915, "learning_rate": 4.746402817956681e-06, "loss": 0.9775, "step": 1735 }, { "epoch": 0.9132035770647028, "grad_norm": 2.1442627906799316, "learning_rate": 4.746097874778293e-06, "loss": 1.0358, "step": 1736 }, { "epoch": 0.9137296159915834, "grad_norm": 2.143627643585205, "learning_rate": 4.745792758175402e-06, "loss": 0.9537, "step": 1737 }, { "epoch": 0.914255654918464, "grad_norm": 1.9581515789031982, "learning_rate": 4.745487468171566e-06, "loss": 0.9756, "step": 1738 }, { "epoch": 0.9147816938453446, "grad_norm": 1.9869537353515625, "learning_rate": 4.74518200479036e-06, "loss": 0.995, "step": 1739 }, { "epoch": 0.9153077327722251, "grad_norm": 1.9129465818405151, "learning_rate": 4.744876368055365e-06, "loss": 1.0088, "step": 1740 }, { "epoch": 0.9158337716991057, "grad_norm": 1.957229733467102, "learning_rate": 4.744570557990183e-06, "loss": 0.9832, "step": 1741 }, { "epoch": 0.9163598106259864, "grad_norm": 2.061002492904663, "learning_rate": 4.744264574618425e-06, "loss": 1.0338, "step": 1742 }, { "epoch": 0.9168858495528669, "grad_norm": 2.0439558029174805, "learning_rate": 4.743958417963715e-06, "loss": 1.0678, "step": 1743 }, { "epoch": 0.9174118884797475, "grad_norm": 2.0407450199127197, "learning_rate": 4.743652088049695e-06, "loss": 1.0219, "step": 1744 }, { "epoch": 0.917937927406628, "grad_norm": 2.2696166038513184, "learning_rate": 4.743345584900014e-06, "loss": 0.9909, "step": 1745 }, { "epoch": 0.9184639663335087, "grad_norm": 1.9783145189285278, "learning_rate": 4.74303890853834e-06, "loss": 0.9423, "step": 1746 }, { "epoch": 0.9189900052603893, "grad_norm": 2.019179344177246, "learning_rate": 4.74273205898835e-06, "loss": 0.9985, "step": 1747 }, { "epoch": 0.9195160441872698, "grad_norm": 1.966417670249939, "learning_rate": 4.742425036273737e-06, "loss": 1.0605, "step": 1748 }, { "epoch": 0.9200420831141505, "grad_norm": 1.9425163269042969, "learning_rate": 4.742117840418207e-06, "loss": 0.9855, "step": 1749 }, { "epoch": 0.920568122041031, "grad_norm": 1.9825159311294556, "learning_rate": 4.741810471445478e-06, "loss": 1.0214, "step": 1750 }, { "epoch": 0.9210941609679116, "grad_norm": 1.9764158725738525, "learning_rate": 4.741502929379284e-06, "loss": 1.0249, "step": 1751 }, { "epoch": 0.9216201998947923, "grad_norm": 2.0177724361419678, "learning_rate": 4.74119521424337e-06, "loss": 1.0434, "step": 1752 }, { "epoch": 0.9221462388216728, "grad_norm": 2.0949506759643555, "learning_rate": 4.740887326061495e-06, "loss": 1.0331, "step": 1753 }, { "epoch": 0.9226722777485534, "grad_norm": 1.9468920230865479, "learning_rate": 4.740579264857431e-06, "loss": 0.9212, "step": 1754 }, { "epoch": 0.923198316675434, "grad_norm": 2.2116925716400146, "learning_rate": 4.740271030654965e-06, "loss": 1.0241, "step": 1755 }, { "epoch": 0.9237243556023146, "grad_norm": 1.9227603673934937, "learning_rate": 4.739962623477896e-06, "loss": 0.98, "step": 1756 }, { "epoch": 0.9242503945291951, "grad_norm": 2.013141632080078, "learning_rate": 4.739654043350036e-06, "loss": 1.0321, "step": 1757 }, { "epoch": 0.9247764334560757, "grad_norm": 2.1053218841552734, "learning_rate": 4.739345290295211e-06, "loss": 1.0359, "step": 1758 }, { "epoch": 0.9253024723829564, "grad_norm": 2.072932243347168, "learning_rate": 4.739036364337261e-06, "loss": 0.9826, "step": 1759 }, { "epoch": 0.9258285113098369, "grad_norm": 2.104072093963623, "learning_rate": 4.738727265500037e-06, "loss": 1.0239, "step": 1760 }, { "epoch": 0.9263545502367175, "grad_norm": 2.0704009532928467, "learning_rate": 4.738417993807407e-06, "loss": 1.0235, "step": 1761 }, { "epoch": 0.9268805891635981, "grad_norm": 1.9992990493774414, "learning_rate": 4.738108549283249e-06, "loss": 0.988, "step": 1762 }, { "epoch": 0.9274066280904787, "grad_norm": 2.150501251220703, "learning_rate": 4.737798931951456e-06, "loss": 1.0574, "step": 1763 }, { "epoch": 0.9279326670173593, "grad_norm": 1.906421184539795, "learning_rate": 4.7374891418359345e-06, "loss": 1.0479, "step": 1764 }, { "epoch": 0.9284587059442398, "grad_norm": 1.8720351457595825, "learning_rate": 4.737179178960603e-06, "loss": 1.038, "step": 1765 }, { "epoch": 0.9289847448711205, "grad_norm": 1.9185991287231445, "learning_rate": 4.736869043349394e-06, "loss": 1.0632, "step": 1766 }, { "epoch": 0.929510783798001, "grad_norm": 2.040290594100952, "learning_rate": 4.736558735026255e-06, "loss": 0.9857, "step": 1767 }, { "epoch": 0.9300368227248816, "grad_norm": 1.9188529253005981, "learning_rate": 4.7362482540151445e-06, "loss": 1.0115, "step": 1768 }, { "epoch": 0.9305628616517623, "grad_norm": 2.092855215072632, "learning_rate": 4.7359376003400345e-06, "loss": 1.0318, "step": 1769 }, { "epoch": 0.9310889005786428, "grad_norm": 1.9537826776504517, "learning_rate": 4.735626774024912e-06, "loss": 1.0005, "step": 1770 }, { "epoch": 0.9316149395055234, "grad_norm": 1.8022964000701904, "learning_rate": 4.735315775093775e-06, "loss": 0.9696, "step": 1771 }, { "epoch": 0.932140978432404, "grad_norm": 2.0534324645996094, "learning_rate": 4.735004603570639e-06, "loss": 1.0647, "step": 1772 }, { "epoch": 0.9326670173592846, "grad_norm": 2.082421064376831, "learning_rate": 4.734693259479527e-06, "loss": 1.0168, "step": 1773 }, { "epoch": 0.9331930562861652, "grad_norm": 2.2331955432891846, "learning_rate": 4.734381742844481e-06, "loss": 1.0288, "step": 1774 }, { "epoch": 0.9337190952130457, "grad_norm": 1.9978649616241455, "learning_rate": 4.73407005368955e-06, "loss": 0.9542, "step": 1775 }, { "epoch": 0.9342451341399264, "grad_norm": 2.054856061935425, "learning_rate": 4.733758192038804e-06, "loss": 1.0457, "step": 1776 }, { "epoch": 0.9347711730668069, "grad_norm": 2.1446175575256348, "learning_rate": 4.733446157916319e-06, "loss": 1.0767, "step": 1777 }, { "epoch": 0.9352972119936875, "grad_norm": 2.149594783782959, "learning_rate": 4.7331339513461905e-06, "loss": 0.9975, "step": 1778 }, { "epoch": 0.9358232509205682, "grad_norm": 2.0066800117492676, "learning_rate": 4.732821572352522e-06, "loss": 1.0296, "step": 1779 }, { "epoch": 0.9363492898474487, "grad_norm": 2.4036574363708496, "learning_rate": 4.732509020959434e-06, "loss": 0.9726, "step": 1780 }, { "epoch": 0.9368753287743293, "grad_norm": 2.0901482105255127, "learning_rate": 4.73219629719106e-06, "loss": 1.0748, "step": 1781 }, { "epoch": 0.9374013677012099, "grad_norm": 2.093503713607788, "learning_rate": 4.731883401071543e-06, "loss": 1.0413, "step": 1782 }, { "epoch": 0.9379274066280905, "grad_norm": 2.1437647342681885, "learning_rate": 4.731570332625044e-06, "loss": 1.0624, "step": 1783 }, { "epoch": 0.9384534455549711, "grad_norm": 2.141866445541382, "learning_rate": 4.731257091875736e-06, "loss": 0.9547, "step": 1784 }, { "epoch": 0.9389794844818516, "grad_norm": 2.138530731201172, "learning_rate": 4.730943678847804e-06, "loss": 1.0498, "step": 1785 }, { "epoch": 0.9395055234087323, "grad_norm": 2.192941188812256, "learning_rate": 4.730630093565447e-06, "loss": 1.0426, "step": 1786 }, { "epoch": 0.9400315623356128, "grad_norm": 1.9256808757781982, "learning_rate": 4.730316336052877e-06, "loss": 0.9864, "step": 1787 }, { "epoch": 0.9405576012624934, "grad_norm": 2.1694893836975098, "learning_rate": 4.730002406334321e-06, "loss": 0.9926, "step": 1788 }, { "epoch": 0.941083640189374, "grad_norm": 1.9891979694366455, "learning_rate": 4.729688304434017e-06, "loss": 0.9835, "step": 1789 }, { "epoch": 0.9416096791162546, "grad_norm": 2.112396240234375, "learning_rate": 4.729374030376217e-06, "loss": 1.0131, "step": 1790 }, { "epoch": 0.9421357180431352, "grad_norm": 2.049139976501465, "learning_rate": 4.729059584185187e-06, "loss": 1.0176, "step": 1791 }, { "epoch": 0.9426617569700158, "grad_norm": 2.259706497192383, "learning_rate": 4.728744965885207e-06, "loss": 1.0566, "step": 1792 }, { "epoch": 0.9431877958968964, "grad_norm": 1.9924520254135132, "learning_rate": 4.728430175500567e-06, "loss": 0.9912, "step": 1793 }, { "epoch": 0.9437138348237769, "grad_norm": 2.1724114418029785, "learning_rate": 4.728115213055573e-06, "loss": 0.9919, "step": 1794 }, { "epoch": 0.9442398737506575, "grad_norm": 2.083853244781494, "learning_rate": 4.7278000785745445e-06, "loss": 1.0368, "step": 1795 }, { "epoch": 0.9447659126775382, "grad_norm": 2.089245080947876, "learning_rate": 4.727484772081814e-06, "loss": 1.0471, "step": 1796 }, { "epoch": 0.9452919516044187, "grad_norm": 1.9880348443984985, "learning_rate": 4.727169293601725e-06, "loss": 0.9752, "step": 1797 }, { "epoch": 0.9458179905312993, "grad_norm": 2.0518887042999268, "learning_rate": 4.7268536431586375e-06, "loss": 0.977, "step": 1798 }, { "epoch": 0.9463440294581799, "grad_norm": 2.3292527198791504, "learning_rate": 4.726537820776922e-06, "loss": 0.9696, "step": 1799 }, { "epoch": 0.9468700683850605, "grad_norm": 2.093759775161743, "learning_rate": 4.7262218264809656e-06, "loss": 1.028, "step": 1800 }, { "epoch": 0.9473961073119411, "grad_norm": 1.9579375982284546, "learning_rate": 4.7259056602951644e-06, "loss": 0.9797, "step": 1801 }, { "epoch": 0.9479221462388217, "grad_norm": 2.1174583435058594, "learning_rate": 4.725589322243932e-06, "loss": 0.9993, "step": 1802 }, { "epoch": 0.9484481851657023, "grad_norm": 2.167732000350952, "learning_rate": 4.725272812351692e-06, "loss": 1.0031, "step": 1803 }, { "epoch": 0.9489742240925828, "grad_norm": 2.1166253089904785, "learning_rate": 4.724956130642883e-06, "loss": 1.0029, "step": 1804 }, { "epoch": 0.9495002630194634, "grad_norm": 2.0212886333465576, "learning_rate": 4.724639277141957e-06, "loss": 1.0202, "step": 1805 }, { "epoch": 0.9500263019463441, "grad_norm": 2.1849446296691895, "learning_rate": 4.7243222518733775e-06, "loss": 0.9847, "step": 1806 }, { "epoch": 0.9505523408732246, "grad_norm": 2.019671678543091, "learning_rate": 4.724005054861623e-06, "loss": 1.0141, "step": 1807 }, { "epoch": 0.9510783798001052, "grad_norm": 2.0654826164245605, "learning_rate": 4.723687686131186e-06, "loss": 1.0266, "step": 1808 }, { "epoch": 0.9516044187269858, "grad_norm": 2.0668342113494873, "learning_rate": 4.7233701457065694e-06, "loss": 1.0249, "step": 1809 }, { "epoch": 0.9521304576538664, "grad_norm": 1.9022929668426514, "learning_rate": 4.723052433612292e-06, "loss": 1.0092, "step": 1810 }, { "epoch": 0.952656496580747, "grad_norm": 2.0411059856414795, "learning_rate": 4.722734549872884e-06, "loss": 0.9896, "step": 1811 }, { "epoch": 0.9531825355076275, "grad_norm": 2.0354626178741455, "learning_rate": 4.722416494512889e-06, "loss": 0.9529, "step": 1812 }, { "epoch": 0.9537085744345082, "grad_norm": 1.866688847541809, "learning_rate": 4.722098267556867e-06, "loss": 0.971, "step": 1813 }, { "epoch": 0.9542346133613887, "grad_norm": 1.9963386058807373, "learning_rate": 4.721779869029387e-06, "loss": 0.9931, "step": 1814 }, { "epoch": 0.9547606522882693, "grad_norm": 1.9810550212860107, "learning_rate": 4.721461298955033e-06, "loss": 1.0335, "step": 1815 }, { "epoch": 0.95528669121515, "grad_norm": 2.0094194412231445, "learning_rate": 4.721142557358402e-06, "loss": 1.0248, "step": 1816 }, { "epoch": 0.9558127301420305, "grad_norm": 2.110318183898926, "learning_rate": 4.720823644264106e-06, "loss": 0.9726, "step": 1817 }, { "epoch": 0.9563387690689111, "grad_norm": 2.051914691925049, "learning_rate": 4.720504559696768e-06, "loss": 1.0205, "step": 1818 }, { "epoch": 0.9568648079957917, "grad_norm": 2.0969302654266357, "learning_rate": 4.7201853036810245e-06, "loss": 1.0313, "step": 1819 }, { "epoch": 0.9573908469226723, "grad_norm": 2.098721742630005, "learning_rate": 4.719865876241525e-06, "loss": 1.0276, "step": 1820 }, { "epoch": 0.9579168858495528, "grad_norm": 1.9741021394729614, "learning_rate": 4.719546277402936e-06, "loss": 1.0142, "step": 1821 }, { "epoch": 0.9584429247764334, "grad_norm": 2.1097187995910645, "learning_rate": 4.71922650718993e-06, "loss": 0.9812, "step": 1822 }, { "epoch": 0.9589689637033141, "grad_norm": 2.1343348026275635, "learning_rate": 4.718906565627201e-06, "loss": 1.0126, "step": 1823 }, { "epoch": 0.9594950026301946, "grad_norm": 2.089698553085327, "learning_rate": 4.71858645273945e-06, "loss": 0.9982, "step": 1824 }, { "epoch": 0.9600210415570752, "grad_norm": 2.1942148208618164, "learning_rate": 4.7182661685513925e-06, "loss": 1.0781, "step": 1825 }, { "epoch": 0.9605470804839558, "grad_norm": 1.92880380153656, "learning_rate": 4.7179457130877605e-06, "loss": 1.0214, "step": 1826 }, { "epoch": 0.9610731194108364, "grad_norm": 2.093219518661499, "learning_rate": 4.717625086373295e-06, "loss": 1.0411, "step": 1827 }, { "epoch": 0.961599158337717, "grad_norm": 1.9406787157058716, "learning_rate": 4.7173042884327525e-06, "loss": 1.0296, "step": 1828 }, { "epoch": 0.9621251972645976, "grad_norm": 1.9737564325332642, "learning_rate": 4.7169833192909025e-06, "loss": 1.0119, "step": 1829 }, { "epoch": 0.9626512361914782, "grad_norm": 1.9281796216964722, "learning_rate": 4.7166621789725276e-06, "loss": 1.0203, "step": 1830 }, { "epoch": 0.9631772751183587, "grad_norm": 2.128120183944702, "learning_rate": 4.716340867502424e-06, "loss": 1.087, "step": 1831 }, { "epoch": 0.9637033140452393, "grad_norm": 2.1313352584838867, "learning_rate": 4.716019384905399e-06, "loss": 1.0049, "step": 1832 }, { "epoch": 0.96422935297212, "grad_norm": 1.882323980331421, "learning_rate": 4.715697731206275e-06, "loss": 1.052, "step": 1833 }, { "epoch": 0.9647553918990005, "grad_norm": 1.902729868888855, "learning_rate": 4.71537590642989e-06, "loss": 1.013, "step": 1834 }, { "epoch": 0.9652814308258811, "grad_norm": 1.9752705097198486, "learning_rate": 4.715053910601089e-06, "loss": 0.9964, "step": 1835 }, { "epoch": 0.9658074697527617, "grad_norm": 2.2092044353485107, "learning_rate": 4.714731743744736e-06, "loss": 1.0142, "step": 1836 }, { "epoch": 0.9663335086796423, "grad_norm": 1.9738699197769165, "learning_rate": 4.714409405885706e-06, "loss": 1.0431, "step": 1837 }, { "epoch": 0.9668595476065229, "grad_norm": 1.94752836227417, "learning_rate": 4.714086897048886e-06, "loss": 0.9776, "step": 1838 }, { "epoch": 0.9673855865334035, "grad_norm": 2.044384717941284, "learning_rate": 4.713764217259178e-06, "loss": 0.9428, "step": 1839 }, { "epoch": 0.9679116254602841, "grad_norm": 2.067378520965576, "learning_rate": 4.713441366541497e-06, "loss": 1.0222, "step": 1840 }, { "epoch": 0.9684376643871646, "grad_norm": 2.0729427337646484, "learning_rate": 4.71311834492077e-06, "loss": 1.0244, "step": 1841 }, { "epoch": 0.9689637033140452, "grad_norm": 1.9986896514892578, "learning_rate": 4.712795152421938e-06, "loss": 1.0246, "step": 1842 }, { "epoch": 0.9694897422409259, "grad_norm": 2.134274482727051, "learning_rate": 4.712471789069956e-06, "loss": 1.0317, "step": 1843 }, { "epoch": 0.9700157811678064, "grad_norm": 2.116116762161255, "learning_rate": 4.7121482548897896e-06, "loss": 1.0431, "step": 1844 }, { "epoch": 0.970541820094687, "grad_norm": 2.146329164505005, "learning_rate": 4.7118245499064205e-06, "loss": 1.0185, "step": 1845 }, { "epoch": 0.9710678590215676, "grad_norm": 2.2587080001831055, "learning_rate": 4.711500674144844e-06, "loss": 1.0172, "step": 1846 }, { "epoch": 0.9715938979484482, "grad_norm": 2.133565902709961, "learning_rate": 4.7111766276300645e-06, "loss": 1.0887, "step": 1847 }, { "epoch": 0.9721199368753288, "grad_norm": 2.4180047512054443, "learning_rate": 4.710852410387103e-06, "loss": 1.0686, "step": 1848 }, { "epoch": 0.9726459758022094, "grad_norm": 1.9758679866790771, "learning_rate": 4.7105280224409936e-06, "loss": 0.9851, "step": 1849 }, { "epoch": 0.97317201472909, "grad_norm": 2.0190632343292236, "learning_rate": 4.710203463816782e-06, "loss": 0.9967, "step": 1850 }, { "epoch": 0.9736980536559705, "grad_norm": 2.0636117458343506, "learning_rate": 4.709878734539527e-06, "loss": 1.0209, "step": 1851 }, { "epoch": 0.9742240925828511, "grad_norm": 2.0756478309631348, "learning_rate": 4.709553834634303e-06, "loss": 0.9793, "step": 1852 }, { "epoch": 0.9747501315097317, "grad_norm": 1.94191312789917, "learning_rate": 4.709228764126195e-06, "loss": 0.9697, "step": 1853 }, { "epoch": 0.9752761704366123, "grad_norm": 2.057345390319824, "learning_rate": 4.708903523040303e-06, "loss": 0.938, "step": 1854 }, { "epoch": 0.9758022093634929, "grad_norm": 2.1611337661743164, "learning_rate": 4.7085781114017384e-06, "loss": 1.0464, "step": 1855 }, { "epoch": 0.9763282482903735, "grad_norm": 1.9461411237716675, "learning_rate": 4.708252529235627e-06, "loss": 0.9934, "step": 1856 }, { "epoch": 0.9768542872172541, "grad_norm": 1.9107236862182617, "learning_rate": 4.707926776567108e-06, "loss": 0.9895, "step": 1857 }, { "epoch": 0.9773803261441346, "grad_norm": 2.0953640937805176, "learning_rate": 4.707600853421332e-06, "loss": 1.0009, "step": 1858 }, { "epoch": 0.9779063650710152, "grad_norm": 2.126648187637329, "learning_rate": 4.707274759823466e-06, "loss": 0.9801, "step": 1859 }, { "epoch": 0.9784324039978959, "grad_norm": 2.0868916511535645, "learning_rate": 4.706948495798687e-06, "loss": 0.9765, "step": 1860 }, { "epoch": 0.9789584429247764, "grad_norm": 2.0332181453704834, "learning_rate": 4.706622061372185e-06, "loss": 1.0216, "step": 1861 }, { "epoch": 0.979484481851657, "grad_norm": 2.05155348777771, "learning_rate": 4.706295456569167e-06, "loss": 1.0594, "step": 1862 }, { "epoch": 0.9800105207785376, "grad_norm": 2.1178739070892334, "learning_rate": 4.7059686814148485e-06, "loss": 1.0463, "step": 1863 }, { "epoch": 0.9805365597054182, "grad_norm": 1.9961886405944824, "learning_rate": 4.705641735934462e-06, "loss": 0.9658, "step": 1864 }, { "epoch": 0.9810625986322988, "grad_norm": 1.9905188083648682, "learning_rate": 4.705314620153251e-06, "loss": 0.9677, "step": 1865 }, { "epoch": 0.9815886375591794, "grad_norm": 1.9200838804244995, "learning_rate": 4.704987334096471e-06, "loss": 1.0011, "step": 1866 }, { "epoch": 0.98211467648606, "grad_norm": 2.069359302520752, "learning_rate": 4.704659877789395e-06, "loss": 1.01, "step": 1867 }, { "epoch": 0.9826407154129405, "grad_norm": 1.8069074153900146, "learning_rate": 4.704332251257304e-06, "loss": 1.037, "step": 1868 }, { "epoch": 0.9831667543398211, "grad_norm": 1.9900349378585815, "learning_rate": 4.704004454525496e-06, "loss": 1.0035, "step": 1869 }, { "epoch": 0.9836927932667018, "grad_norm": 1.902032494544983, "learning_rate": 4.70367648761928e-06, "loss": 1.0001, "step": 1870 }, { "epoch": 0.9842188321935823, "grad_norm": 2.5718839168548584, "learning_rate": 4.703348350563978e-06, "loss": 1.002, "step": 1871 }, { "epoch": 0.9847448711204629, "grad_norm": 1.90852952003479, "learning_rate": 4.703020043384927e-06, "loss": 1.0338, "step": 1872 }, { "epoch": 0.9852709100473435, "grad_norm": 2.0179872512817383, "learning_rate": 4.702691566107477e-06, "loss": 0.9724, "step": 1873 }, { "epoch": 0.9857969489742241, "grad_norm": 2.0315425395965576, "learning_rate": 4.702362918756988e-06, "loss": 1.0256, "step": 1874 }, { "epoch": 0.9863229879011047, "grad_norm": 1.898896336555481, "learning_rate": 4.702034101358837e-06, "loss": 0.9695, "step": 1875 }, { "epoch": 0.9868490268279853, "grad_norm": 2.1176962852478027, "learning_rate": 4.701705113938411e-06, "loss": 1.0217, "step": 1876 }, { "epoch": 0.9873750657548659, "grad_norm": 1.94914972782135, "learning_rate": 4.701375956521113e-06, "loss": 1.0081, "step": 1877 }, { "epoch": 0.9879011046817464, "grad_norm": 1.9665032625198364, "learning_rate": 4.701046629132358e-06, "loss": 1.0174, "step": 1878 }, { "epoch": 0.988427143608627, "grad_norm": 2.005793571472168, "learning_rate": 4.700717131797573e-06, "loss": 0.9653, "step": 1879 }, { "epoch": 0.9889531825355077, "grad_norm": 2.0769705772399902, "learning_rate": 4.700387464542199e-06, "loss": 1.0142, "step": 1880 }, { "epoch": 0.9894792214623882, "grad_norm": 1.9945422410964966, "learning_rate": 4.700057627391689e-06, "loss": 1.0225, "step": 1881 }, { "epoch": 0.9900052603892688, "grad_norm": 2.1121349334716797, "learning_rate": 4.699727620371513e-06, "loss": 1.0056, "step": 1882 }, { "epoch": 0.9905312993161494, "grad_norm": 2.156942844390869, "learning_rate": 4.699397443507148e-06, "loss": 1.0049, "step": 1883 }, { "epoch": 0.99105733824303, "grad_norm": 2.065075159072876, "learning_rate": 4.699067096824091e-06, "loss": 0.9694, "step": 1884 }, { "epoch": 0.9915833771699105, "grad_norm": 2.12490177154541, "learning_rate": 4.698736580347845e-06, "loss": 1.0268, "step": 1885 }, { "epoch": 0.9921094160967912, "grad_norm": 2.039874792098999, "learning_rate": 4.698405894103932e-06, "loss": 1.0122, "step": 1886 }, { "epoch": 0.9926354550236718, "grad_norm": 2.0004734992980957, "learning_rate": 4.698075038117884e-06, "loss": 0.9996, "step": 1887 }, { "epoch": 0.9931614939505523, "grad_norm": 1.996697187423706, "learning_rate": 4.697744012415248e-06, "loss": 1.0658, "step": 1888 }, { "epoch": 0.9936875328774329, "grad_norm": 1.9783189296722412, "learning_rate": 4.69741281702158e-06, "loss": 0.9799, "step": 1889 }, { "epoch": 0.9942135718043135, "grad_norm": 2.054898738861084, "learning_rate": 4.697081451962456e-06, "loss": 1.0302, "step": 1890 }, { "epoch": 0.9947396107311941, "grad_norm": 1.953337550163269, "learning_rate": 4.696749917263458e-06, "loss": 0.9634, "step": 1891 }, { "epoch": 0.9952656496580747, "grad_norm": 2.6126086711883545, "learning_rate": 4.6964182129501855e-06, "loss": 0.9659, "step": 1892 }, { "epoch": 0.9957916885849553, "grad_norm": 1.931026816368103, "learning_rate": 4.69608633904825e-06, "loss": 1.0456, "step": 1893 }, { "epoch": 0.9963177275118359, "grad_norm": 1.9246487617492676, "learning_rate": 4.695754295583276e-06, "loss": 1.0057, "step": 1894 }, { "epoch": 0.9968437664387164, "grad_norm": 1.9731547832489014, "learning_rate": 4.695422082580901e-06, "loss": 0.9619, "step": 1895 }, { "epoch": 0.997369805365597, "grad_norm": 2.1975600719451904, "learning_rate": 4.695089700066776e-06, "loss": 0.9667, "step": 1896 }, { "epoch": 0.9978958442924777, "grad_norm": 1.9038164615631104, "learning_rate": 4.6947571480665636e-06, "loss": 0.9564, "step": 1897 }, { "epoch": 0.9984218832193582, "grad_norm": 1.9997332096099854, "learning_rate": 4.694424426605942e-06, "loss": 0.9717, "step": 1898 }, { "epoch": 0.9989479221462388, "grad_norm": 2.0790839195251465, "learning_rate": 4.6940915357106e-06, "loss": 1.044, "step": 1899 }, { "epoch": 0.9994739610731194, "grad_norm": 2.0779690742492676, "learning_rate": 4.693758475406241e-06, "loss": 1.052, "step": 1900 }, { "epoch": 1.0, "grad_norm": 2.3423078060150146, "learning_rate": 4.693425245718581e-06, "loss": 0.9887, "step": 1901 } ], "logging_steps": 1, "max_steps": 11406, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 1901, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.801364251367178e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }