{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002105263157894737, "grad_norm": 6.217544016659503, "learning_rate": 9.999972660400536e-06, "loss": 0.3989, "step": 1 }, { "epoch": 0.004210526315789474, "grad_norm": 4.110718040113299, "learning_rate": 9.999890641901124e-06, "loss": 0.3669, "step": 2 }, { "epoch": 0.00631578947368421, "grad_norm": 4.338497302088012, "learning_rate": 9.999753945398704e-06, "loss": 0.312, "step": 3 }, { "epoch": 0.008421052631578947, "grad_norm": 5.282542189044485, "learning_rate": 9.99956257238817e-06, "loss": 0.3437, "step": 4 }, { "epoch": 0.010526315789473684, "grad_norm": 3.834449844048163, "learning_rate": 9.999316524962347e-06, "loss": 0.2323, "step": 5 }, { "epoch": 0.01263157894736842, "grad_norm": 3.8748984755674143, "learning_rate": 9.999015805811965e-06, "loss": 0.3381, "step": 6 }, { "epoch": 0.014736842105263158, "grad_norm": 3.0258910678432107, "learning_rate": 9.998660418225645e-06, "loss": 0.2191, "step": 7 }, { "epoch": 0.016842105263157894, "grad_norm": 3.569909197687586, "learning_rate": 9.998250366089848e-06, "loss": 0.2458, "step": 8 }, { "epoch": 0.018947368421052633, "grad_norm": 5.072654329244835, "learning_rate": 9.997785653888835e-06, "loss": 0.308, "step": 9 }, { "epoch": 0.021052631578947368, "grad_norm": 3.23335292769489, "learning_rate": 9.99726628670463e-06, "loss": 0.2266, "step": 10 }, { "epoch": 0.023157894736842106, "grad_norm": 3.287012798491171, "learning_rate": 9.996692270216946e-06, "loss": 0.2592, "step": 11 }, { "epoch": 0.02526315789473684, "grad_norm": 4.0856720065160745, "learning_rate": 9.996063610703138e-06, "loss": 0.2674, "step": 12 }, { "epoch": 0.02736842105263158, "grad_norm": 3.454246688370746, "learning_rate": 9.995380315038119e-06, "loss": 0.2565, "step": 13 }, { "epoch": 0.029473684210526315, "grad_norm": 3.2274320728380066, "learning_rate": 9.994642390694308e-06, "loss": 0.1976, "step": 14 }, { "epoch": 0.031578947368421054, "grad_norm": 4.260565705891167, "learning_rate": 9.993849845741525e-06, "loss": 0.245, "step": 15 }, { "epoch": 0.03368421052631579, "grad_norm": 3.4341257739977102, "learning_rate": 9.993002688846913e-06, "loss": 0.2833, "step": 16 }, { "epoch": 0.035789473684210524, "grad_norm": 3.62754868119338, "learning_rate": 9.992100929274848e-06, "loss": 0.2459, "step": 17 }, { "epoch": 0.037894736842105266, "grad_norm": 3.3599215508623788, "learning_rate": 9.991144576886824e-06, "loss": 0.2597, "step": 18 }, { "epoch": 0.04, "grad_norm": 2.759190741252958, "learning_rate": 9.990133642141359e-06, "loss": 0.2007, "step": 19 }, { "epoch": 0.042105263157894736, "grad_norm": 3.1433858959494656, "learning_rate": 9.989068136093873e-06, "loss": 0.2126, "step": 20 }, { "epoch": 0.04421052631578947, "grad_norm": 4.780930240687513, "learning_rate": 9.987948070396572e-06, "loss": 0.2567, "step": 21 }, { "epoch": 0.04631578947368421, "grad_norm": 3.1361116248862704, "learning_rate": 9.986773457298311e-06, "loss": 0.218, "step": 22 }, { "epoch": 0.04842105263157895, "grad_norm": 4.660863559826474, "learning_rate": 9.985544309644474e-06, "loss": 0.3171, "step": 23 }, { "epoch": 0.05052631578947368, "grad_norm": 3.83833154370056, "learning_rate": 9.984260640876821e-06, "loss": 0.2453, "step": 24 }, { "epoch": 0.05263157894736842, "grad_norm": 4.394031981209525, "learning_rate": 9.98292246503335e-06, "loss": 0.3022, "step": 25 }, { "epoch": 0.05473684210526316, "grad_norm": 3.815312734391198, "learning_rate": 9.981529796748135e-06, "loss": 0.2566, "step": 26 }, { "epoch": 0.056842105263157895, "grad_norm": 3.751567223965873, "learning_rate": 9.980082651251175e-06, "loss": 0.2057, "step": 27 }, { "epoch": 0.05894736842105263, "grad_norm": 3.5703892124307886, "learning_rate": 9.97858104436822e-06, "loss": 0.2611, "step": 28 }, { "epoch": 0.061052631578947365, "grad_norm": 4.882927650965578, "learning_rate": 9.977024992520604e-06, "loss": 0.2957, "step": 29 }, { "epoch": 0.06315789473684211, "grad_norm": 3.5773603919322614, "learning_rate": 9.975414512725058e-06, "loss": 0.2483, "step": 30 }, { "epoch": 0.06526315789473684, "grad_norm": 3.19691324527535, "learning_rate": 9.973749622593534e-06, "loss": 0.199, "step": 31 }, { "epoch": 0.06736842105263158, "grad_norm": 3.285608727213878, "learning_rate": 9.972030340333e-06, "loss": 0.2476, "step": 32 }, { "epoch": 0.06947368421052631, "grad_norm": 3.9448084183142202, "learning_rate": 9.970256684745258e-06, "loss": 0.2584, "step": 33 }, { "epoch": 0.07157894736842105, "grad_norm": 3.093609102136492, "learning_rate": 9.968428675226714e-06, "loss": 0.1965, "step": 34 }, { "epoch": 0.07368421052631578, "grad_norm": 2.866766273599304, "learning_rate": 9.966546331768192e-06, "loss": 0.235, "step": 35 }, { "epoch": 0.07578947368421053, "grad_norm": 3.1680983150404862, "learning_rate": 9.964609674954696e-06, "loss": 0.2179, "step": 36 }, { "epoch": 0.07789473684210527, "grad_norm": 2.531925390508716, "learning_rate": 9.962618725965196e-06, "loss": 0.1638, "step": 37 }, { "epoch": 0.08, "grad_norm": 3.479913242409686, "learning_rate": 9.960573506572391e-06, "loss": 0.2607, "step": 38 }, { "epoch": 0.08210526315789474, "grad_norm": 3.453241583441747, "learning_rate": 9.95847403914247e-06, "loss": 0.2846, "step": 39 }, { "epoch": 0.08421052631578947, "grad_norm": 3.7499588076121415, "learning_rate": 9.956320346634877e-06, "loss": 0.2474, "step": 40 }, { "epoch": 0.0863157894736842, "grad_norm": 3.7513628477476626, "learning_rate": 9.954112452602045e-06, "loss": 0.257, "step": 41 }, { "epoch": 0.08842105263157894, "grad_norm": 3.249974357345021, "learning_rate": 9.951850381189152e-06, "loss": 0.2342, "step": 42 }, { "epoch": 0.09052631578947369, "grad_norm": 3.5256183668310053, "learning_rate": 9.949534157133844e-06, "loss": 0.254, "step": 43 }, { "epoch": 0.09263157894736843, "grad_norm": 3.0367239176760554, "learning_rate": 9.94716380576598e-06, "loss": 0.2181, "step": 44 }, { "epoch": 0.09473684210526316, "grad_norm": 2.680983187953939, "learning_rate": 9.944739353007344e-06, "loss": 0.2092, "step": 45 }, { "epoch": 0.0968421052631579, "grad_norm": 3.172342567980907, "learning_rate": 9.942260825371359e-06, "loss": 0.1665, "step": 46 }, { "epoch": 0.09894736842105263, "grad_norm": 3.683978184159089, "learning_rate": 9.939728249962808e-06, "loss": 0.2671, "step": 47 }, { "epoch": 0.10105263157894737, "grad_norm": 3.6089008339524664, "learning_rate": 9.937141654477529e-06, "loss": 0.2557, "step": 48 }, { "epoch": 0.1031578947368421, "grad_norm": 3.1128433475033224, "learning_rate": 9.934501067202117e-06, "loss": 0.2298, "step": 49 }, { "epoch": 0.10526315789473684, "grad_norm": 3.837339505212247, "learning_rate": 9.931806517013612e-06, "loss": 0.2121, "step": 50 }, { "epoch": 0.10736842105263159, "grad_norm": 3.1188404843719986, "learning_rate": 9.929058033379181e-06, "loss": 0.2449, "step": 51 }, { "epoch": 0.10947368421052632, "grad_norm": 3.499970578608811, "learning_rate": 9.926255646355804e-06, "loss": 0.2362, "step": 52 }, { "epoch": 0.11157894736842106, "grad_norm": 3.8814562389117127, "learning_rate": 9.923399386589933e-06, "loss": 0.2403, "step": 53 }, { "epoch": 0.11368421052631579, "grad_norm": 3.887530063847657, "learning_rate": 9.920489285317169e-06, "loss": 0.2276, "step": 54 }, { "epoch": 0.11578947368421053, "grad_norm": 3.6293508472876455, "learning_rate": 9.917525374361913e-06, "loss": 0.2577, "step": 55 }, { "epoch": 0.11789473684210526, "grad_norm": 3.865568740211283, "learning_rate": 9.91450768613702e-06, "loss": 0.2416, "step": 56 }, { "epoch": 0.12, "grad_norm": 3.7137281964397095, "learning_rate": 9.911436253643445e-06, "loss": 0.2411, "step": 57 }, { "epoch": 0.12210526315789473, "grad_norm": 3.6900754543193153, "learning_rate": 9.908311110469881e-06, "loss": 0.267, "step": 58 }, { "epoch": 0.12421052631578948, "grad_norm": 3.4314998812484165, "learning_rate": 9.905132290792395e-06, "loss": 0.2415, "step": 59 }, { "epoch": 0.12631578947368421, "grad_norm": 3.5441260133809154, "learning_rate": 9.901899829374048e-06, "loss": 0.2649, "step": 60 }, { "epoch": 0.12842105263157894, "grad_norm": 2.969279579151304, "learning_rate": 9.89861376156452e-06, "loss": 0.2342, "step": 61 }, { "epoch": 0.13052631578947368, "grad_norm": 3.0270242630571644, "learning_rate": 9.895274123299724e-06, "loss": 0.215, "step": 62 }, { "epoch": 0.13263157894736843, "grad_norm": 3.2468251690158945, "learning_rate": 9.891880951101407e-06, "loss": 0.2645, "step": 63 }, { "epoch": 0.13473684210526315, "grad_norm": 3.7405952840185255, "learning_rate": 9.888434282076759e-06, "loss": 0.224, "step": 64 }, { "epoch": 0.1368421052631579, "grad_norm": 3.7386890942637736, "learning_rate": 9.884934153917998e-06, "loss": 0.2338, "step": 65 }, { "epoch": 0.13894736842105262, "grad_norm": 3.6559224501234655, "learning_rate": 9.881380604901964e-06, "loss": 0.2674, "step": 66 }, { "epoch": 0.14105263157894737, "grad_norm": 3.2126708403650723, "learning_rate": 9.877773673889702e-06, "loss": 0.257, "step": 67 }, { "epoch": 0.1431578947368421, "grad_norm": 3.3544140835288387, "learning_rate": 9.874113400326031e-06, "loss": 0.2644, "step": 68 }, { "epoch": 0.14526315789473684, "grad_norm": 3.2197549480894305, "learning_rate": 9.870399824239116e-06, "loss": 0.2337, "step": 69 }, { "epoch": 0.14736842105263157, "grad_norm": 3.0956092032854787, "learning_rate": 9.86663298624003e-06, "loss": 0.2018, "step": 70 }, { "epoch": 0.14947368421052631, "grad_norm": 2.728914245630825, "learning_rate": 9.86281292752231e-06, "loss": 0.1902, "step": 71 }, { "epoch": 0.15157894736842106, "grad_norm": 2.728563361219932, "learning_rate": 9.858939689861506e-06, "loss": 0.1998, "step": 72 }, { "epoch": 0.15368421052631578, "grad_norm": 3.6065260596056428, "learning_rate": 9.855013315614725e-06, "loss": 0.2589, "step": 73 }, { "epoch": 0.15578947368421053, "grad_norm": 3.9437466543223016, "learning_rate": 9.851033847720167e-06, "loss": 0.2839, "step": 74 }, { "epoch": 0.15789473684210525, "grad_norm": 2.6492053943266973, "learning_rate": 9.847001329696653e-06, "loss": 0.1926, "step": 75 }, { "epoch": 0.16, "grad_norm": 3.4928074094432975, "learning_rate": 9.842915805643156e-06, "loss": 0.2567, "step": 76 }, { "epoch": 0.16210526315789472, "grad_norm": 3.745761767248173, "learning_rate": 9.838777320238312e-06, "loss": 0.2473, "step": 77 }, { "epoch": 0.16421052631578947, "grad_norm": 4.662473350343442, "learning_rate": 9.834585918739936e-06, "loss": 0.2534, "step": 78 }, { "epoch": 0.16631578947368422, "grad_norm": 3.5985866535045092, "learning_rate": 9.830341646984521e-06, "loss": 0.2375, "step": 79 }, { "epoch": 0.16842105263157894, "grad_norm": 3.3095318865144323, "learning_rate": 9.826044551386743e-06, "loss": 0.2179, "step": 80 }, { "epoch": 0.1705263157894737, "grad_norm": 3.218832777420868, "learning_rate": 9.821694678938954e-06, "loss": 0.2245, "step": 81 }, { "epoch": 0.1726315789473684, "grad_norm": 3.4749041260361326, "learning_rate": 9.817292077210658e-06, "loss": 0.2451, "step": 82 }, { "epoch": 0.17473684210526316, "grad_norm": 3.6052413099966376, "learning_rate": 9.812836794348005e-06, "loss": 0.2132, "step": 83 }, { "epoch": 0.17684210526315788, "grad_norm": 3.819893458132905, "learning_rate": 9.808328879073251e-06, "loss": 0.2518, "step": 84 }, { "epoch": 0.17894736842105263, "grad_norm": 3.447449141237711, "learning_rate": 9.803768380684242e-06, "loss": 0.2832, "step": 85 }, { "epoch": 0.18105263157894738, "grad_norm": 3.357478738557209, "learning_rate": 9.79915534905385e-06, "loss": 0.2568, "step": 86 }, { "epoch": 0.1831578947368421, "grad_norm": 3.7920160087811476, "learning_rate": 9.794489834629457e-06, "loss": 0.263, "step": 87 }, { "epoch": 0.18526315789473685, "grad_norm": 3.182104834724342, "learning_rate": 9.789771888432375e-06, "loss": 0.2245, "step": 88 }, { "epoch": 0.18736842105263157, "grad_norm": 3.4674212312793813, "learning_rate": 9.785001562057311e-06, "loss": 0.2417, "step": 89 }, { "epoch": 0.18947368421052632, "grad_norm": 4.117467872656145, "learning_rate": 9.780178907671788e-06, "loss": 0.2768, "step": 90 }, { "epoch": 0.19157894736842104, "grad_norm": 3.631475929836605, "learning_rate": 9.775303978015585e-06, "loss": 0.2437, "step": 91 }, { "epoch": 0.1936842105263158, "grad_norm": 3.3413603143822335, "learning_rate": 9.77037682640015e-06, "loss": 0.2642, "step": 92 }, { "epoch": 0.1957894736842105, "grad_norm": 3.9842190799219876, "learning_rate": 9.765397506708023e-06, "loss": 0.3169, "step": 93 }, { "epoch": 0.19789473684210526, "grad_norm": 3.7032684341350173, "learning_rate": 9.760366073392246e-06, "loss": 0.1791, "step": 94 }, { "epoch": 0.2, "grad_norm": 3.987149618467848, "learning_rate": 9.755282581475769e-06, "loss": 0.3039, "step": 95 }, { "epoch": 0.20210526315789473, "grad_norm": 3.2139873581817486, "learning_rate": 9.750147086550843e-06, "loss": 0.2504, "step": 96 }, { "epoch": 0.20421052631578948, "grad_norm": 3.566561813353208, "learning_rate": 9.744959644778422e-06, "loss": 0.2863, "step": 97 }, { "epoch": 0.2063157894736842, "grad_norm": 3.7268559206075946, "learning_rate": 9.739720312887536e-06, "loss": 0.2167, "step": 98 }, { "epoch": 0.20842105263157895, "grad_norm": 2.9695827733722595, "learning_rate": 9.734429148174676e-06, "loss": 0.2393, "step": 99 }, { "epoch": 0.21052631578947367, "grad_norm": 3.6108040436142823, "learning_rate": 9.729086208503174e-06, "loss": 0.295, "step": 100 }, { "epoch": 0.21263157894736842, "grad_norm": 3.925500913610178, "learning_rate": 9.723691552302563e-06, "loss": 0.2467, "step": 101 }, { "epoch": 0.21473684210526317, "grad_norm": 3.5695343047388666, "learning_rate": 9.718245238567939e-06, "loss": 0.2242, "step": 102 }, { "epoch": 0.2168421052631579, "grad_norm": 3.3235918183280866, "learning_rate": 9.712747326859316e-06, "loss": 0.2278, "step": 103 }, { "epoch": 0.21894736842105264, "grad_norm": 3.764042590013744, "learning_rate": 9.707197877300974e-06, "loss": 0.2921, "step": 104 }, { "epoch": 0.22105263157894736, "grad_norm": 2.903454636343328, "learning_rate": 9.701596950580807e-06, "loss": 0.2165, "step": 105 }, { "epoch": 0.2231578947368421, "grad_norm": 2.5462228191934124, "learning_rate": 9.69594460794965e-06, "loss": 0.1913, "step": 106 }, { "epoch": 0.22526315789473683, "grad_norm": 3.048612041076824, "learning_rate": 9.690240911220618e-06, "loss": 0.1913, "step": 107 }, { "epoch": 0.22736842105263158, "grad_norm": 2.7190276333100885, "learning_rate": 9.684485922768422e-06, "loss": 0.1846, "step": 108 }, { "epoch": 0.2294736842105263, "grad_norm": 3.3279879332903164, "learning_rate": 9.678679705528699e-06, "loss": 0.2444, "step": 109 }, { "epoch": 0.23157894736842105, "grad_norm": 3.086182493388614, "learning_rate": 9.672822322997305e-06, "loss": 0.1827, "step": 110 }, { "epoch": 0.2336842105263158, "grad_norm": 3.0198656142842433, "learning_rate": 9.666913839229639e-06, "loss": 0.2064, "step": 111 }, { "epoch": 0.23578947368421052, "grad_norm": 3.871643739742935, "learning_rate": 9.660954318839934e-06, "loss": 0.2537, "step": 112 }, { "epoch": 0.23789473684210527, "grad_norm": 4.034332856853841, "learning_rate": 9.654943827000548e-06, "loss": 0.2499, "step": 113 }, { "epoch": 0.24, "grad_norm": 3.7628273980242515, "learning_rate": 9.648882429441258e-06, "loss": 0.2557, "step": 114 }, { "epoch": 0.24210526315789474, "grad_norm": 3.5786660920291493, "learning_rate": 9.642770192448537e-06, "loss": 0.2677, "step": 115 }, { "epoch": 0.24421052631578946, "grad_norm": 4.532586938593248, "learning_rate": 9.636607182864828e-06, "loss": 0.2685, "step": 116 }, { "epoch": 0.2463157894736842, "grad_norm": 3.0674072329356856, "learning_rate": 9.630393468087818e-06, "loss": 0.1846, "step": 117 }, { "epoch": 0.24842105263157896, "grad_norm": 4.2865579808258945, "learning_rate": 9.624129116069695e-06, "loss": 0.342, "step": 118 }, { "epoch": 0.2505263157894737, "grad_norm": 3.8921150967122156, "learning_rate": 9.61781419531641e-06, "loss": 0.2634, "step": 119 }, { "epoch": 0.25263157894736843, "grad_norm": 3.3675053578108978, "learning_rate": 9.611448774886925e-06, "loss": 0.2273, "step": 120 }, { "epoch": 0.25473684210526315, "grad_norm": 4.105187040991947, "learning_rate": 9.605032924392457e-06, "loss": 0.242, "step": 121 }, { "epoch": 0.25684210526315787, "grad_norm": 3.2229116365485924, "learning_rate": 9.598566713995718e-06, "loss": 0.2471, "step": 122 }, { "epoch": 0.25894736842105265, "grad_norm": 2.8700645053873126, "learning_rate": 9.592050214410152e-06, "loss": 0.2465, "step": 123 }, { "epoch": 0.26105263157894737, "grad_norm": 3.6312759588775783, "learning_rate": 9.585483496899151e-06, "loss": 0.24, "step": 124 }, { "epoch": 0.2631578947368421, "grad_norm": 2.9630698799183226, "learning_rate": 9.578866633275289e-06, "loss": 0.2054, "step": 125 }, { "epoch": 0.26526315789473687, "grad_norm": 3.544581712241485, "learning_rate": 9.572199695899522e-06, "loss": 0.2314, "step": 126 }, { "epoch": 0.2673684210526316, "grad_norm": 3.255776996164575, "learning_rate": 9.565482757680415e-06, "loss": 0.2785, "step": 127 }, { "epoch": 0.2694736842105263, "grad_norm": 2.8952918607035363, "learning_rate": 9.558715892073324e-06, "loss": 0.2218, "step": 128 }, { "epoch": 0.27157894736842103, "grad_norm": 3.4853221427011065, "learning_rate": 9.551899173079607e-06, "loss": 0.2862, "step": 129 }, { "epoch": 0.2736842105263158, "grad_norm": 4.128929611734161, "learning_rate": 9.545032675245814e-06, "loss": 0.3055, "step": 130 }, { "epoch": 0.27578947368421053, "grad_norm": 2.974600343932656, "learning_rate": 9.538116473662862e-06, "loss": 0.215, "step": 131 }, { "epoch": 0.27789473684210525, "grad_norm": 2.7669257916596823, "learning_rate": 9.531150643965224e-06, "loss": 0.2182, "step": 132 }, { "epoch": 0.28, "grad_norm": 3.97273403473512, "learning_rate": 9.524135262330098e-06, "loss": 0.2658, "step": 133 }, { "epoch": 0.28210526315789475, "grad_norm": 3.6696654833766895, "learning_rate": 9.517070405476575e-06, "loss": 0.2305, "step": 134 }, { "epoch": 0.28421052631578947, "grad_norm": 2.5127237232679667, "learning_rate": 9.509956150664796e-06, "loss": 0.1511, "step": 135 }, { "epoch": 0.2863157894736842, "grad_norm": 3.371290504994853, "learning_rate": 9.502792575695112e-06, "loss": 0.26, "step": 136 }, { "epoch": 0.28842105263157897, "grad_norm": 3.0689625597079684, "learning_rate": 9.495579758907231e-06, "loss": 0.2524, "step": 137 }, { "epoch": 0.2905263157894737, "grad_norm": 3.183088939033141, "learning_rate": 9.48831777917936e-06, "loss": 0.2122, "step": 138 }, { "epoch": 0.2926315789473684, "grad_norm": 3.8034642187376035, "learning_rate": 9.481006715927352e-06, "loss": 0.2593, "step": 139 }, { "epoch": 0.29473684210526313, "grad_norm": 3.8705009381157343, "learning_rate": 9.473646649103819e-06, "loss": 0.2594, "step": 140 }, { "epoch": 0.2968421052631579, "grad_norm": 3.0318361821750286, "learning_rate": 9.466237659197271e-06, "loss": 0.2254, "step": 141 }, { "epoch": 0.29894736842105263, "grad_norm": 3.615169294903516, "learning_rate": 9.458779827231237e-06, "loss": 0.2096, "step": 142 }, { "epoch": 0.30105263157894735, "grad_norm": 3.2733885578132313, "learning_rate": 9.451273234763372e-06, "loss": 0.2431, "step": 143 }, { "epoch": 0.3031578947368421, "grad_norm": 3.2273667507387533, "learning_rate": 9.443717963884568e-06, "loss": 0.2228, "step": 144 }, { "epoch": 0.30526315789473685, "grad_norm": 3.632106743266242, "learning_rate": 9.43611409721806e-06, "loss": 0.248, "step": 145 }, { "epoch": 0.30736842105263157, "grad_norm": 3.320763426450409, "learning_rate": 9.428461717918512e-06, "loss": 0.2878, "step": 146 }, { "epoch": 0.3094736842105263, "grad_norm": 3.549945610201063, "learning_rate": 9.420760909671119e-06, "loss": 0.231, "step": 147 }, { "epoch": 0.31157894736842107, "grad_norm": 3.282653593524781, "learning_rate": 9.413011756690686e-06, "loss": 0.2659, "step": 148 }, { "epoch": 0.3136842105263158, "grad_norm": 3.261626438744862, "learning_rate": 9.405214343720708e-06, "loss": 0.2586, "step": 149 }, { "epoch": 0.3157894736842105, "grad_norm": 2.924567282994091, "learning_rate": 9.397368756032445e-06, "loss": 0.1778, "step": 150 }, { "epoch": 0.3178947368421053, "grad_norm": 3.0558746792231464, "learning_rate": 9.389475079423988e-06, "loss": 0.2471, "step": 151 }, { "epoch": 0.32, "grad_norm": 3.7586909960856207, "learning_rate": 9.381533400219319e-06, "loss": 0.258, "step": 152 }, { "epoch": 0.32210526315789473, "grad_norm": 3.392179309145632, "learning_rate": 9.373543805267367e-06, "loss": 0.249, "step": 153 }, { "epoch": 0.32421052631578945, "grad_norm": 4.083035200586394, "learning_rate": 9.365506381941066e-06, "loss": 0.2789, "step": 154 }, { "epoch": 0.3263157894736842, "grad_norm": 3.19777370952952, "learning_rate": 9.357421218136387e-06, "loss": 0.2062, "step": 155 }, { "epoch": 0.32842105263157895, "grad_norm": 3.4456582477689928, "learning_rate": 9.349288402271387e-06, "loss": 0.2382, "step": 156 }, { "epoch": 0.33052631578947367, "grad_norm": 2.765215943542346, "learning_rate": 9.341108023285239e-06, "loss": 0.1827, "step": 157 }, { "epoch": 0.33263157894736844, "grad_norm": 2.883818519531558, "learning_rate": 9.332880170637252e-06, "loss": 0.1995, "step": 158 }, { "epoch": 0.33473684210526317, "grad_norm": 3.6487183139434234, "learning_rate": 9.324604934305911e-06, "loss": 0.2598, "step": 159 }, { "epoch": 0.3368421052631579, "grad_norm": 3.9308083666697344, "learning_rate": 9.31628240478787e-06, "loss": 0.2412, "step": 160 }, { "epoch": 0.3389473684210526, "grad_norm": 3.5970617830856773, "learning_rate": 9.30791267309698e-06, "loss": 0.2851, "step": 161 }, { "epoch": 0.3410526315789474, "grad_norm": 3.467839501820664, "learning_rate": 9.299495830763285e-06, "loss": 0.2853, "step": 162 }, { "epoch": 0.3431578947368421, "grad_norm": 3.602755193669457, "learning_rate": 9.291031969832026e-06, "loss": 0.2225, "step": 163 }, { "epoch": 0.3452631578947368, "grad_norm": 3.0886925699452985, "learning_rate": 9.28252118286263e-06, "loss": 0.1903, "step": 164 }, { "epoch": 0.3473684210526316, "grad_norm": 3.439033801554011, "learning_rate": 9.273963562927695e-06, "loss": 0.2287, "step": 165 }, { "epoch": 0.3494736842105263, "grad_norm": 3.3882150165690783, "learning_rate": 9.265359203611988e-06, "loss": 0.2904, "step": 166 }, { "epoch": 0.35157894736842105, "grad_norm": 3.3452062359089507, "learning_rate": 9.256708199011402e-06, "loss": 0.2339, "step": 167 }, { "epoch": 0.35368421052631577, "grad_norm": 3.700729392048823, "learning_rate": 9.248010643731936e-06, "loss": 0.2796, "step": 168 }, { "epoch": 0.35578947368421054, "grad_norm": 3.1210284485776874, "learning_rate": 9.23926663288866e-06, "loss": 0.2126, "step": 169 }, { "epoch": 0.35789473684210527, "grad_norm": 3.6574312237344992, "learning_rate": 9.230476262104678e-06, "loss": 0.2493, "step": 170 }, { "epoch": 0.36, "grad_norm": 3.679409049213219, "learning_rate": 9.221639627510076e-06, "loss": 0.2551, "step": 171 }, { "epoch": 0.36210526315789476, "grad_norm": 3.3167727556758693, "learning_rate": 9.212756825740874e-06, "loss": 0.2482, "step": 172 }, { "epoch": 0.3642105263157895, "grad_norm": 2.401115724431016, "learning_rate": 9.203827953937969e-06, "loss": 0.1881, "step": 173 }, { "epoch": 0.3663157894736842, "grad_norm": 3.4427049239845533, "learning_rate": 9.194853109746073e-06, "loss": 0.2088, "step": 174 }, { "epoch": 0.3684210526315789, "grad_norm": 3.2573463355993826, "learning_rate": 9.185832391312644e-06, "loss": 0.2495, "step": 175 }, { "epoch": 0.3705263157894737, "grad_norm": 3.2797367783671234, "learning_rate": 9.176765897286812e-06, "loss": 0.2435, "step": 176 }, { "epoch": 0.3726315789473684, "grad_norm": 3.6588831382550806, "learning_rate": 9.167653726818305e-06, "loss": 0.2293, "step": 177 }, { "epoch": 0.37473684210526315, "grad_norm": 2.7755595814174363, "learning_rate": 9.15849597955636e-06, "loss": 0.2044, "step": 178 }, { "epoch": 0.37684210526315787, "grad_norm": 3.5962245807858255, "learning_rate": 9.149292755648631e-06, "loss": 0.2214, "step": 179 }, { "epoch": 0.37894736842105264, "grad_norm": 3.4357076470231305, "learning_rate": 9.140044155740102e-06, "loss": 0.2402, "step": 180 }, { "epoch": 0.38105263157894737, "grad_norm": 3.4123688743753853, "learning_rate": 9.130750280971978e-06, "loss": 0.2553, "step": 181 }, { "epoch": 0.3831578947368421, "grad_norm": 2.741940954433975, "learning_rate": 9.121411232980589e-06, "loss": 0.1907, "step": 182 }, { "epoch": 0.38526315789473686, "grad_norm": 3.6400973883721384, "learning_rate": 9.112027113896262e-06, "loss": 0.2616, "step": 183 }, { "epoch": 0.3873684210526316, "grad_norm": 3.5161597871058277, "learning_rate": 9.102598026342223e-06, "loss": 0.2114, "step": 184 }, { "epoch": 0.3894736842105263, "grad_norm": 3.5917090120879904, "learning_rate": 9.093124073433464e-06, "loss": 0.2304, "step": 185 }, { "epoch": 0.391578947368421, "grad_norm": 3.1502787194480897, "learning_rate": 9.083605358775612e-06, "loss": 0.2032, "step": 186 }, { "epoch": 0.3936842105263158, "grad_norm": 3.9729218899091063, "learning_rate": 9.074041986463808e-06, "loss": 0.2325, "step": 187 }, { "epoch": 0.3957894736842105, "grad_norm": 3.0185052960999523, "learning_rate": 9.064434061081562e-06, "loss": 0.1981, "step": 188 }, { "epoch": 0.39789473684210525, "grad_norm": 4.209209496600263, "learning_rate": 9.0547816876996e-06, "loss": 0.2586, "step": 189 }, { "epoch": 0.4, "grad_norm": 3.967209050356877, "learning_rate": 9.045084971874738e-06, "loss": 0.2946, "step": 190 }, { "epoch": 0.40210526315789474, "grad_norm": 3.017106050384066, "learning_rate": 9.035344019648701e-06, "loss": 0.2465, "step": 191 }, { "epoch": 0.40421052631578946, "grad_norm": 3.917356737129771, "learning_rate": 9.025558937546987e-06, "loss": 0.3207, "step": 192 }, { "epoch": 0.4063157894736842, "grad_norm": 3.2403291834347767, "learning_rate": 9.015729832577681e-06, "loss": 0.233, "step": 193 }, { "epoch": 0.40842105263157896, "grad_norm": 3.322798306669591, "learning_rate": 9.005856812230304e-06, "loss": 0.1899, "step": 194 }, { "epoch": 0.4105263157894737, "grad_norm": 3.430365209049047, "learning_rate": 8.995939984474624e-06, "loss": 0.2304, "step": 195 }, { "epoch": 0.4126315789473684, "grad_norm": 3.39458107073051, "learning_rate": 8.98597945775948e-06, "loss": 0.2357, "step": 196 }, { "epoch": 0.4147368421052632, "grad_norm": 3.3413958584715475, "learning_rate": 8.975975341011595e-06, "loss": 0.2855, "step": 197 }, { "epoch": 0.4168421052631579, "grad_norm": 2.5726994940315415, "learning_rate": 8.96592774363439e-06, "loss": 0.1901, "step": 198 }, { "epoch": 0.4189473684210526, "grad_norm": 4.335519110486464, "learning_rate": 8.955836775506776e-06, "loss": 0.2933, "step": 199 }, { "epoch": 0.42105263157894735, "grad_norm": 4.19815390177116, "learning_rate": 8.94570254698197e-06, "loss": 0.2688, "step": 200 }, { "epoch": 0.42105263157894735, "eval_loss": 0.2162427455186844, "eval_runtime": 0.9508, "eval_samples_per_second": 41.017, "eval_steps_per_second": 10.517, "step": 200 }, { "epoch": 0.4231578947368421, "grad_norm": 3.269776118271859, "learning_rate": 8.935525168886263e-06, "loss": 0.2096, "step": 201 }, { "epoch": 0.42526315789473684, "grad_norm": 4.04123612262294, "learning_rate": 8.92530475251784e-06, "loss": 0.2568, "step": 202 }, { "epoch": 0.42736842105263156, "grad_norm": 3.8368271309479933, "learning_rate": 8.91504140964553e-06, "loss": 0.2657, "step": 203 }, { "epoch": 0.42947368421052634, "grad_norm": 3.1272371621856037, "learning_rate": 8.90473525250761e-06, "loss": 0.2268, "step": 204 }, { "epoch": 0.43157894736842106, "grad_norm": 3.093955290257307, "learning_rate": 8.894386393810563e-06, "loss": 0.2042, "step": 205 }, { "epoch": 0.4336842105263158, "grad_norm": 2.6581659343898543, "learning_rate": 8.883994946727848e-06, "loss": 0.1746, "step": 206 }, { "epoch": 0.4357894736842105, "grad_norm": 3.955613588917937, "learning_rate": 8.873561024898668e-06, "loss": 0.1996, "step": 207 }, { "epoch": 0.4378947368421053, "grad_norm": 2.7835005086015903, "learning_rate": 8.863084742426719e-06, "loss": 0.192, "step": 208 }, { "epoch": 0.44, "grad_norm": 3.347640148688381, "learning_rate": 8.852566213878947e-06, "loss": 0.1955, "step": 209 }, { "epoch": 0.4421052631578947, "grad_norm": 3.6781001625254643, "learning_rate": 8.842005554284296e-06, "loss": 0.2583, "step": 210 }, { "epoch": 0.4442105263157895, "grad_norm": 3.3060488425103416, "learning_rate": 8.831402879132447e-06, "loss": 0.2273, "step": 211 }, { "epoch": 0.4463157894736842, "grad_norm": 3.924014440413263, "learning_rate": 8.820758304372557e-06, "loss": 0.2294, "step": 212 }, { "epoch": 0.44842105263157894, "grad_norm": 3.7994401024720066, "learning_rate": 8.810071946411989e-06, "loss": 0.2199, "step": 213 }, { "epoch": 0.45052631578947366, "grad_norm": 3.376294637610717, "learning_rate": 8.799343922115045e-06, "loss": 0.2433, "step": 214 }, { "epoch": 0.45263157894736844, "grad_norm": 3.3193795798150165, "learning_rate": 8.788574348801676e-06, "loss": 0.209, "step": 215 }, { "epoch": 0.45473684210526316, "grad_norm": 3.0915010534262795, "learning_rate": 8.777763344246209e-06, "loss": 0.179, "step": 216 }, { "epoch": 0.4568421052631579, "grad_norm": 2.8659181552677375, "learning_rate": 8.766911026676063e-06, "loss": 0.1811, "step": 217 }, { "epoch": 0.4589473684210526, "grad_norm": 3.45215463473198, "learning_rate": 8.756017514770444e-06, "loss": 0.2281, "step": 218 }, { "epoch": 0.4610526315789474, "grad_norm": 3.1257499399451394, "learning_rate": 8.745082927659048e-06, "loss": 0.2184, "step": 219 }, { "epoch": 0.4631578947368421, "grad_norm": 3.8271139734522945, "learning_rate": 8.734107384920771e-06, "loss": 0.2623, "step": 220 }, { "epoch": 0.4652631578947368, "grad_norm": 2.835561102259285, "learning_rate": 8.72309100658239e-06, "loss": 0.1964, "step": 221 }, { "epoch": 0.4673684210526316, "grad_norm": 3.3688712713428766, "learning_rate": 8.71203391311725e-06, "loss": 0.2168, "step": 222 }, { "epoch": 0.4694736842105263, "grad_norm": 3.7240976383868736, "learning_rate": 8.700936225443958e-06, "loss": 0.2518, "step": 223 }, { "epoch": 0.47157894736842104, "grad_norm": 2.96476521824005, "learning_rate": 8.689798064925049e-06, "loss": 0.2378, "step": 224 }, { "epoch": 0.47368421052631576, "grad_norm": 2.7984591391533953, "learning_rate": 8.67861955336566e-06, "loss": 0.2252, "step": 225 }, { "epoch": 0.47578947368421054, "grad_norm": 2.7976795282629254, "learning_rate": 8.6674008130122e-06, "loss": 0.1755, "step": 226 }, { "epoch": 0.47789473684210526, "grad_norm": 3.33023467809358, "learning_rate": 8.65614196655102e-06, "loss": 0.2361, "step": 227 }, { "epoch": 0.48, "grad_norm": 2.966759381413828, "learning_rate": 8.644843137107058e-06, "loss": 0.2027, "step": 228 }, { "epoch": 0.48210526315789476, "grad_norm": 3.1104223364393535, "learning_rate": 8.633504448242504e-06, "loss": 0.1961, "step": 229 }, { "epoch": 0.4842105263157895, "grad_norm": 2.787274197616676, "learning_rate": 8.622126023955446e-06, "loss": 0.2031, "step": 230 }, { "epoch": 0.4863157894736842, "grad_norm": 3.3738049865267925, "learning_rate": 8.610707988678504e-06, "loss": 0.2533, "step": 231 }, { "epoch": 0.4884210526315789, "grad_norm": 3.407815533241093, "learning_rate": 8.599250467277483e-06, "loss": 0.2524, "step": 232 }, { "epoch": 0.4905263157894737, "grad_norm": 3.296831884586839, "learning_rate": 8.587753585050004e-06, "loss": 0.2396, "step": 233 }, { "epoch": 0.4926315789473684, "grad_norm": 2.8560599820160073, "learning_rate": 8.576217467724129e-06, "loss": 0.2416, "step": 234 }, { "epoch": 0.49473684210526314, "grad_norm": 2.9054696528766524, "learning_rate": 8.564642241456986e-06, "loss": 0.1973, "step": 235 }, { "epoch": 0.4968421052631579, "grad_norm": 2.8181421804733358, "learning_rate": 8.553028032833397e-06, "loss": 0.179, "step": 236 }, { "epoch": 0.49894736842105264, "grad_norm": 2.7050097156036284, "learning_rate": 8.541374968864486e-06, "loss": 0.2037, "step": 237 }, { "epoch": 0.5010526315789474, "grad_norm": 2.585908271011497, "learning_rate": 8.529683176986295e-06, "loss": 0.1633, "step": 238 }, { "epoch": 0.5031578947368421, "grad_norm": 3.6063087447245414, "learning_rate": 8.517952785058385e-06, "loss": 0.2354, "step": 239 }, { "epoch": 0.5052631578947369, "grad_norm": 2.8004827647319073, "learning_rate": 8.506183921362443e-06, "loss": 0.1783, "step": 240 }, { "epoch": 0.5073684210526316, "grad_norm": 3.0924391138448777, "learning_rate": 8.494376714600878e-06, "loss": 0.2086, "step": 241 }, { "epoch": 0.5094736842105263, "grad_norm": 3.28651564075383, "learning_rate": 8.482531293895412e-06, "loss": 0.2345, "step": 242 }, { "epoch": 0.511578947368421, "grad_norm": 3.2830296016413056, "learning_rate": 8.470647788785665e-06, "loss": 0.2149, "step": 243 }, { "epoch": 0.5136842105263157, "grad_norm": 3.546287405553885, "learning_rate": 8.458726329227748e-06, "loss": 0.2261, "step": 244 }, { "epoch": 0.5157894736842106, "grad_norm": 3.394923024937159, "learning_rate": 8.446767045592829e-06, "loss": 0.2468, "step": 245 }, { "epoch": 0.5178947368421053, "grad_norm": 3.864701196963864, "learning_rate": 8.434770068665723e-06, "loss": 0.2638, "step": 246 }, { "epoch": 0.52, "grad_norm": 3.4189011314403976, "learning_rate": 8.422735529643445e-06, "loss": 0.2219, "step": 247 }, { "epoch": 0.5221052631578947, "grad_norm": 3.4940583139796497, "learning_rate": 8.410663560133784e-06, "loss": 0.2055, "step": 248 }, { "epoch": 0.5242105263157895, "grad_norm": 2.9563885540382717, "learning_rate": 8.398554292153866e-06, "loss": 0.2063, "step": 249 }, { "epoch": 0.5263157894736842, "grad_norm": 3.856575711945962, "learning_rate": 8.386407858128707e-06, "loss": 0.2493, "step": 250 }, { "epoch": 0.5284210526315789, "grad_norm": 2.963714344149301, "learning_rate": 8.37422439088976e-06, "loss": 0.2173, "step": 251 }, { "epoch": 0.5305263157894737, "grad_norm": 3.5084770315497953, "learning_rate": 8.362004023673473e-06, "loss": 0.2637, "step": 252 }, { "epoch": 0.5326315789473685, "grad_norm": 3.2627548109310545, "learning_rate": 8.349746890119826e-06, "loss": 0.2059, "step": 253 }, { "epoch": 0.5347368421052632, "grad_norm": 3.537857944594144, "learning_rate": 8.337453124270864e-06, "loss": 0.2064, "step": 254 }, { "epoch": 0.5368421052631579, "grad_norm": 3.203619307633033, "learning_rate": 8.325122860569241e-06, "loss": 0.1859, "step": 255 }, { "epoch": 0.5389473684210526, "grad_norm": 2.8427156228829946, "learning_rate": 8.31275623385675e-06, "loss": 0.1781, "step": 256 }, { "epoch": 0.5410526315789473, "grad_norm": 3.4548444256099495, "learning_rate": 8.300353379372834e-06, "loss": 0.2253, "step": 257 }, { "epoch": 0.5431578947368421, "grad_norm": 3.316389585769609, "learning_rate": 8.287914432753123e-06, "loss": 0.2496, "step": 258 }, { "epoch": 0.5452631578947369, "grad_norm": 3.925056071030507, "learning_rate": 8.275439530027948e-06, "loss": 0.2259, "step": 259 }, { "epoch": 0.5473684210526316, "grad_norm": 3.992456726752316, "learning_rate": 8.262928807620843e-06, "loss": 0.2566, "step": 260 }, { "epoch": 0.5494736842105263, "grad_norm": 3.432001698331824, "learning_rate": 8.250382402347066e-06, "loss": 0.2084, "step": 261 }, { "epoch": 0.5515789473684211, "grad_norm": 3.4259679677663843, "learning_rate": 8.237800451412095e-06, "loss": 0.2381, "step": 262 }, { "epoch": 0.5536842105263158, "grad_norm": 3.1299226563183193, "learning_rate": 8.225183092410128e-06, "loss": 0.2374, "step": 263 }, { "epoch": 0.5557894736842105, "grad_norm": 3.2234103937622924, "learning_rate": 8.212530463322584e-06, "loss": 0.2192, "step": 264 }, { "epoch": 0.5578947368421052, "grad_norm": 3.840611086800957, "learning_rate": 8.199842702516584e-06, "loss": 0.2349, "step": 265 }, { "epoch": 0.56, "grad_norm": 3.090365309566825, "learning_rate": 8.18711994874345e-06, "loss": 0.2441, "step": 266 }, { "epoch": 0.5621052631578948, "grad_norm": 3.5041886865116783, "learning_rate": 8.174362341137177e-06, "loss": 0.2659, "step": 267 }, { "epoch": 0.5642105263157895, "grad_norm": 3.0931593729585516, "learning_rate": 8.161570019212921e-06, "loss": 0.2308, "step": 268 }, { "epoch": 0.5663157894736842, "grad_norm": 3.6356498976901332, "learning_rate": 8.148743122865463e-06, "loss": 0.2534, "step": 269 }, { "epoch": 0.5684210526315789, "grad_norm": 3.408126383096958, "learning_rate": 8.135881792367686e-06, "loss": 0.2321, "step": 270 }, { "epoch": 0.5705263157894737, "grad_norm": 2.6458628263496284, "learning_rate": 8.12298616836904e-06, "loss": 0.1978, "step": 271 }, { "epoch": 0.5726315789473684, "grad_norm": 3.1483733395983595, "learning_rate": 8.110056391894005e-06, "loss": 0.2172, "step": 272 }, { "epoch": 0.5747368421052632, "grad_norm": 3.467397710095167, "learning_rate": 8.097092604340543e-06, "loss": 0.2394, "step": 273 }, { "epoch": 0.5768421052631579, "grad_norm": 3.8996216518849454, "learning_rate": 8.084094947478556e-06, "loss": 0.2731, "step": 274 }, { "epoch": 0.5789473684210527, "grad_norm": 3.0037248186783936, "learning_rate": 8.071063563448341e-06, "loss": 0.1767, "step": 275 }, { "epoch": 0.5810526315789474, "grad_norm": 2.5277085823211864, "learning_rate": 8.057998594759022e-06, "loss": 0.1814, "step": 276 }, { "epoch": 0.5831578947368421, "grad_norm": 3.3543130599108255, "learning_rate": 8.044900184287007e-06, "loss": 0.2266, "step": 277 }, { "epoch": 0.5852631578947368, "grad_norm": 3.1857375439158266, "learning_rate": 8.031768475274412e-06, "loss": 0.2343, "step": 278 }, { "epoch": 0.5873684210526315, "grad_norm": 3.055157108563214, "learning_rate": 8.018603611327505e-06, "loss": 0.227, "step": 279 }, { "epoch": 0.5894736842105263, "grad_norm": 3.2243095637150927, "learning_rate": 8.005405736415127e-06, "loss": 0.1937, "step": 280 }, { "epoch": 0.5915789473684211, "grad_norm": 3.250488849370332, "learning_rate": 7.992174994867124e-06, "loss": 0.2374, "step": 281 }, { "epoch": 0.5936842105263158, "grad_norm": 3.0167916103746912, "learning_rate": 7.978911531372764e-06, "loss": 0.225, "step": 282 }, { "epoch": 0.5957894736842105, "grad_norm": 3.2651532548799374, "learning_rate": 7.965615490979165e-06, "loss": 0.2337, "step": 283 }, { "epoch": 0.5978947368421053, "grad_norm": 3.896346456849055, "learning_rate": 7.952287019089686e-06, "loss": 0.2748, "step": 284 }, { "epoch": 0.6, "grad_norm": 3.5822792888425803, "learning_rate": 7.938926261462366e-06, "loss": 0.211, "step": 285 }, { "epoch": 0.6021052631578947, "grad_norm": 3.444306149909226, "learning_rate": 7.925533364208308e-06, "loss": 0.1983, "step": 286 }, { "epoch": 0.6042105263157894, "grad_norm": 4.1948069859545445, "learning_rate": 7.912108473790092e-06, "loss": 0.2328, "step": 287 }, { "epoch": 0.6063157894736843, "grad_norm": 3.4747320472234517, "learning_rate": 7.898651737020166e-06, "loss": 0.265, "step": 288 }, { "epoch": 0.608421052631579, "grad_norm": 3.240236939628344, "learning_rate": 7.885163301059251e-06, "loss": 0.2105, "step": 289 }, { "epoch": 0.6105263157894737, "grad_norm": 3.721836217869373, "learning_rate": 7.871643313414718e-06, "loss": 0.2183, "step": 290 }, { "epoch": 0.6126315789473684, "grad_norm": 3.326881302452429, "learning_rate": 7.858091921938989e-06, "loss": 0.2394, "step": 291 }, { "epoch": 0.6147368421052631, "grad_norm": 4.006855011965986, "learning_rate": 7.844509274827907e-06, "loss": 0.2294, "step": 292 }, { "epoch": 0.6168421052631579, "grad_norm": 2.977288794276405, "learning_rate": 7.830895520619129e-06, "loss": 0.1943, "step": 293 }, { "epoch": 0.6189473684210526, "grad_norm": 3.503869431295621, "learning_rate": 7.817250808190483e-06, "loss": 0.2271, "step": 294 }, { "epoch": 0.6210526315789474, "grad_norm": 2.397881273267794, "learning_rate": 7.803575286758365e-06, "loss": 0.1522, "step": 295 }, { "epoch": 0.6231578947368421, "grad_norm": 3.1498677204648855, "learning_rate": 7.789869105876083e-06, "loss": 0.2223, "step": 296 }, { "epoch": 0.6252631578947369, "grad_norm": 3.532048573053879, "learning_rate": 7.776132415432234e-06, "loss": 0.2548, "step": 297 }, { "epoch": 0.6273684210526316, "grad_norm": 2.9494325963626777, "learning_rate": 7.762365365649068e-06, "loss": 0.2047, "step": 298 }, { "epoch": 0.6294736842105263, "grad_norm": 3.1322331545957707, "learning_rate": 7.748568107080831e-06, "loss": 0.2239, "step": 299 }, { "epoch": 0.631578947368421, "grad_norm": 2.996031382032748, "learning_rate": 7.734740790612137e-06, "loss": 0.177, "step": 300 }, { "epoch": 0.6336842105263157, "grad_norm": 3.6318074014394135, "learning_rate": 7.720883567456299e-06, "loss": 0.2797, "step": 301 }, { "epoch": 0.6357894736842106, "grad_norm": 3.5126271433689817, "learning_rate": 7.70699658915369e-06, "loss": 0.2965, "step": 302 }, { "epoch": 0.6378947368421053, "grad_norm": 3.067374146183351, "learning_rate": 7.693080007570084e-06, "loss": 0.2311, "step": 303 }, { "epoch": 0.64, "grad_norm": 2.8467013786071735, "learning_rate": 7.679133974894984e-06, "loss": 0.1952, "step": 304 }, { "epoch": 0.6421052631578947, "grad_norm": 3.298916474796445, "learning_rate": 7.66515864363997e-06, "loss": 0.2233, "step": 305 }, { "epoch": 0.6442105263157895, "grad_norm": 4.447954496664178, "learning_rate": 7.651154166637025e-06, "loss": 0.3085, "step": 306 }, { "epoch": 0.6463157894736842, "grad_norm": 3.0739296320424736, "learning_rate": 7.637120697036866e-06, "loss": 0.1874, "step": 307 }, { "epoch": 0.6484210526315789, "grad_norm": 2.672772402397274, "learning_rate": 7.62305838830727e-06, "loss": 0.2168, "step": 308 }, { "epoch": 0.6505263157894737, "grad_norm": 3.5823577010326844, "learning_rate": 7.608967394231387e-06, "loss": 0.2523, "step": 309 }, { "epoch": 0.6526315789473685, "grad_norm": 3.363408010518267, "learning_rate": 7.594847868906076e-06, "loss": 0.213, "step": 310 }, { "epoch": 0.6547368421052632, "grad_norm": 3.0932376636426238, "learning_rate": 7.580699966740201e-06, "loss": 0.2267, "step": 311 }, { "epoch": 0.6568421052631579, "grad_norm": 3.483318561632507, "learning_rate": 7.566523842452958e-06, "loss": 0.256, "step": 312 }, { "epoch": 0.6589473684210526, "grad_norm": 2.7912893670301484, "learning_rate": 7.552319651072164e-06, "loss": 0.2106, "step": 313 }, { "epoch": 0.6610526315789473, "grad_norm": 3.4981450541010704, "learning_rate": 7.5380875479325855e-06, "loss": 0.2547, "step": 314 }, { "epoch": 0.6631578947368421, "grad_norm": 3.124883447115098, "learning_rate": 7.52382768867422e-06, "loss": 0.1939, "step": 315 }, { "epoch": 0.6652631578947369, "grad_norm": 4.620680045339017, "learning_rate": 7.509540229240601e-06, "loss": 0.2953, "step": 316 }, { "epoch": 0.6673684210526316, "grad_norm": 3.2282886161755786, "learning_rate": 7.4952253258771036e-06, "loss": 0.2112, "step": 317 }, { "epoch": 0.6694736842105263, "grad_norm": 3.047727830370946, "learning_rate": 7.480883135129211e-06, "loss": 0.2086, "step": 318 }, { "epoch": 0.671578947368421, "grad_norm": 2.584859580905444, "learning_rate": 7.4665138138408255e-06, "loss": 0.2119, "step": 319 }, { "epoch": 0.6736842105263158, "grad_norm": 3.316066265356493, "learning_rate": 7.452117519152542e-06, "loss": 0.2489, "step": 320 }, { "epoch": 0.6757894736842105, "grad_norm": 3.2406113992536136, "learning_rate": 7.437694408499932e-06, "loss": 0.1915, "step": 321 }, { "epoch": 0.6778947368421052, "grad_norm": 2.956072384698419, "learning_rate": 7.4232446396118265e-06, "loss": 0.2141, "step": 322 }, { "epoch": 0.68, "grad_norm": 2.911407487056924, "learning_rate": 7.408768370508577e-06, "loss": 0.2149, "step": 323 }, { "epoch": 0.6821052631578948, "grad_norm": 2.8116902443594016, "learning_rate": 7.394265759500348e-06, "loss": 0.1691, "step": 324 }, { "epoch": 0.6842105263157895, "grad_norm": 3.276193445347204, "learning_rate": 7.379736965185369e-06, "loss": 0.2003, "step": 325 }, { "epoch": 0.6863157894736842, "grad_norm": 2.980429816982403, "learning_rate": 7.365182146448205e-06, "loss": 0.2071, "step": 326 }, { "epoch": 0.6884210526315789, "grad_norm": 3.168944857843924, "learning_rate": 7.350601462458025e-06, "loss": 0.2249, "step": 327 }, { "epoch": 0.6905263157894737, "grad_norm": 3.2312005808906608, "learning_rate": 7.335995072666848e-06, "loss": 0.1985, "step": 328 }, { "epoch": 0.6926315789473684, "grad_norm": 3.0522979756884236, "learning_rate": 7.3213631368078196e-06, "loss": 0.2025, "step": 329 }, { "epoch": 0.6947368421052632, "grad_norm": 2.787658703366056, "learning_rate": 7.30670581489344e-06, "loss": 0.1983, "step": 330 }, { "epoch": 0.6968421052631579, "grad_norm": 4.3667882707177625, "learning_rate": 7.292023267213836e-06, "loss": 0.2243, "step": 331 }, { "epoch": 0.6989473684210527, "grad_norm": 5.1674527899722085, "learning_rate": 7.2773156543349965e-06, "loss": 0.2317, "step": 332 }, { "epoch": 0.7010526315789474, "grad_norm": 2.7521986848960216, "learning_rate": 7.262583137097019e-06, "loss": 0.1964, "step": 333 }, { "epoch": 0.7031578947368421, "grad_norm": 2.8301069192286445, "learning_rate": 7.247825876612353e-06, "loss": 0.2043, "step": 334 }, { "epoch": 0.7052631578947368, "grad_norm": 3.770631339460926, "learning_rate": 7.233044034264034e-06, "loss": 0.1965, "step": 335 }, { "epoch": 0.7073684210526315, "grad_norm": 2.8548456329448872, "learning_rate": 7.218237771703921e-06, "loss": 0.1819, "step": 336 }, { "epoch": 0.7094736842105264, "grad_norm": 3.6843919985708173, "learning_rate": 7.203407250850929e-06, "loss": 0.2101, "step": 337 }, { "epoch": 0.7115789473684211, "grad_norm": 2.481860597568968, "learning_rate": 7.18855263388926e-06, "loss": 0.1619, "step": 338 }, { "epoch": 0.7136842105263158, "grad_norm": 2.8454463712055653, "learning_rate": 7.173674083266624e-06, "loss": 0.1548, "step": 339 }, { "epoch": 0.7157894736842105, "grad_norm": 3.1220177562190297, "learning_rate": 7.158771761692464e-06, "loss": 0.1873, "step": 340 }, { "epoch": 0.7178947368421053, "grad_norm": 3.1026746108893204, "learning_rate": 7.143845832136188e-06, "loss": 0.1708, "step": 341 }, { "epoch": 0.72, "grad_norm": 3.613177488828585, "learning_rate": 7.128896457825364e-06, "loss": 0.2051, "step": 342 }, { "epoch": 0.7221052631578947, "grad_norm": 4.023734813281506, "learning_rate": 7.113923802243957e-06, "loss": 0.2371, "step": 343 }, { "epoch": 0.7242105263157895, "grad_norm": 2.4891706091722283, "learning_rate": 7.098928029130529e-06, "loss": 0.1585, "step": 344 }, { "epoch": 0.7263157894736842, "grad_norm": 3.625956257810872, "learning_rate": 7.083909302476453e-06, "loss": 0.2379, "step": 345 }, { "epoch": 0.728421052631579, "grad_norm": 3.409493884604401, "learning_rate": 7.068867786524116e-06, "loss": 0.1783, "step": 346 }, { "epoch": 0.7305263157894737, "grad_norm": 3.0090022256319866, "learning_rate": 7.053803645765128e-06, "loss": 0.1831, "step": 347 }, { "epoch": 0.7326315789473684, "grad_norm": 3.5360587589584127, "learning_rate": 7.038717044938519e-06, "loss": 0.2413, "step": 348 }, { "epoch": 0.7347368421052631, "grad_norm": 3.4382217950294236, "learning_rate": 7.023608149028936e-06, "loss": 0.2155, "step": 349 }, { "epoch": 0.7368421052631579, "grad_norm": 4.004045022458863, "learning_rate": 7.008477123264849e-06, "loss": 0.2836, "step": 350 }, { "epoch": 0.7389473684210527, "grad_norm": 3.3203295306272196, "learning_rate": 6.993324133116726e-06, "loss": 0.2658, "step": 351 }, { "epoch": 0.7410526315789474, "grad_norm": 2.548964384681694, "learning_rate": 6.978149344295242e-06, "loss": 0.1785, "step": 352 }, { "epoch": 0.7431578947368421, "grad_norm": 3.4483832833571912, "learning_rate": 6.9629529227494575e-06, "loss": 0.2214, "step": 353 }, { "epoch": 0.7452631578947368, "grad_norm": 3.4383584987113274, "learning_rate": 6.9477350346650016e-06, "loss": 0.192, "step": 354 }, { "epoch": 0.7473684210526316, "grad_norm": 3.691018189312247, "learning_rate": 6.932495846462262e-06, "loss": 0.2435, "step": 355 }, { "epoch": 0.7494736842105263, "grad_norm": 3.385770493095089, "learning_rate": 6.9172355247945586e-06, "loss": 0.205, "step": 356 }, { "epoch": 0.751578947368421, "grad_norm": 2.662810311197674, "learning_rate": 6.901954236546324e-06, "loss": 0.1659, "step": 357 }, { "epoch": 0.7536842105263157, "grad_norm": 4.168405645399794, "learning_rate": 6.88665214883128e-06, "loss": 0.2934, "step": 358 }, { "epoch": 0.7557894736842106, "grad_norm": 3.5383114057012843, "learning_rate": 6.871329428990602e-06, "loss": 0.2157, "step": 359 }, { "epoch": 0.7578947368421053, "grad_norm": 2.8894956368254103, "learning_rate": 6.855986244591104e-06, "loss": 0.1912, "step": 360 }, { "epoch": 0.76, "grad_norm": 2.787711973501566, "learning_rate": 6.840622763423391e-06, "loss": 0.1706, "step": 361 }, { "epoch": 0.7621052631578947, "grad_norm": 2.72901738571353, "learning_rate": 6.825239153500029e-06, "loss": 0.164, "step": 362 }, { "epoch": 0.7642105263157895, "grad_norm": 3.189665352469265, "learning_rate": 6.809835583053716e-06, "loss": 0.1764, "step": 363 }, { "epoch": 0.7663157894736842, "grad_norm": 3.1275848607099133, "learning_rate": 6.794412220535426e-06, "loss": 0.2197, "step": 364 }, { "epoch": 0.7684210526315789, "grad_norm": 3.5188488634301263, "learning_rate": 6.778969234612583e-06, "loss": 0.2439, "step": 365 }, { "epoch": 0.7705263157894737, "grad_norm": 2.62111339980637, "learning_rate": 6.763506794167207e-06, "loss": 0.1879, "step": 366 }, { "epoch": 0.7726315789473684, "grad_norm": 2.8407752570746005, "learning_rate": 6.748025068294067e-06, "loss": 0.179, "step": 367 }, { "epoch": 0.7747368421052632, "grad_norm": 3.230423148951695, "learning_rate": 6.732524226298841e-06, "loss": 0.1906, "step": 368 }, { "epoch": 0.7768421052631579, "grad_norm": 3.9240082867236974, "learning_rate": 6.717004437696249e-06, "loss": 0.2593, "step": 369 }, { "epoch": 0.7789473684210526, "grad_norm": 2.949281736906227, "learning_rate": 6.701465872208216e-06, "loss": 0.1767, "step": 370 }, { "epoch": 0.7810526315789473, "grad_norm": 3.4699155102688293, "learning_rate": 6.685908699762003e-06, "loss": 0.2495, "step": 371 }, { "epoch": 0.783157894736842, "grad_norm": 3.441878628446404, "learning_rate": 6.670333090488357e-06, "loss": 0.2499, "step": 372 }, { "epoch": 0.7852631578947369, "grad_norm": 3.1405985518052772, "learning_rate": 6.654739214719642e-06, "loss": 0.2127, "step": 373 }, { "epoch": 0.7873684210526316, "grad_norm": 2.593987567673624, "learning_rate": 6.6391272429879886e-06, "loss": 0.1835, "step": 374 }, { "epoch": 0.7894736842105263, "grad_norm": 3.276693821618827, "learning_rate": 6.6234973460234184e-06, "loss": 0.2027, "step": 375 }, { "epoch": 0.791578947368421, "grad_norm": 2.995174038901829, "learning_rate": 6.607849694751978e-06, "loss": 0.2003, "step": 376 }, { "epoch": 0.7936842105263158, "grad_norm": 2.6846031430529567, "learning_rate": 6.592184460293878e-06, "loss": 0.1421, "step": 377 }, { "epoch": 0.7957894736842105, "grad_norm": 3.312415514283232, "learning_rate": 6.576501813961609e-06, "loss": 0.1863, "step": 378 }, { "epoch": 0.7978947368421052, "grad_norm": 3.775675123728028, "learning_rate": 6.560801927258081e-06, "loss": 0.1958, "step": 379 }, { "epoch": 0.8, "grad_norm": 2.5927726340982264, "learning_rate": 6.545084971874738e-06, "loss": 0.1625, "step": 380 }, { "epoch": 0.8021052631578948, "grad_norm": 3.1587922841231917, "learning_rate": 6.529351119689687e-06, "loss": 0.1965, "step": 381 }, { "epoch": 0.8042105263157895, "grad_norm": 3.1769362735899356, "learning_rate": 6.513600542765816e-06, "loss": 0.2057, "step": 382 }, { "epoch": 0.8063157894736842, "grad_norm": 3.808162384466138, "learning_rate": 6.49783341334891e-06, "loss": 0.2042, "step": 383 }, { "epoch": 0.8084210526315789, "grad_norm": 3.3063478217630107, "learning_rate": 6.4820499038657695e-06, "loss": 0.1916, "step": 384 }, { "epoch": 0.8105263157894737, "grad_norm": 3.043905617430906, "learning_rate": 6.466250186922325e-06, "loss": 0.1944, "step": 385 }, { "epoch": 0.8126315789473684, "grad_norm": 4.168593975170044, "learning_rate": 6.450434435301751e-06, "loss": 0.2748, "step": 386 }, { "epoch": 0.8147368421052632, "grad_norm": 4.274013610158174, "learning_rate": 6.434602821962571e-06, "loss": 0.2494, "step": 387 }, { "epoch": 0.8168421052631579, "grad_norm": 3.5963573539929463, "learning_rate": 6.418755520036775e-06, "loss": 0.2013, "step": 388 }, { "epoch": 0.8189473684210526, "grad_norm": 2.9666962047426666, "learning_rate": 6.402892702827916e-06, "loss": 0.187, "step": 389 }, { "epoch": 0.8210526315789474, "grad_norm": 2.9643994270594884, "learning_rate": 6.387014543809224e-06, "loss": 0.2049, "step": 390 }, { "epoch": 0.8231578947368421, "grad_norm": 2.3657391759758397, "learning_rate": 6.371121216621698e-06, "loss": 0.1751, "step": 391 }, { "epoch": 0.8252631578947368, "grad_norm": 3.3529253765458167, "learning_rate": 6.355212895072223e-06, "loss": 0.2193, "step": 392 }, { "epoch": 0.8273684210526315, "grad_norm": 3.1720607901606206, "learning_rate": 6.339289753131649e-06, "loss": 0.2148, "step": 393 }, { "epoch": 0.8294736842105264, "grad_norm": 3.3584897742031834, "learning_rate": 6.323351964932909e-06, "loss": 0.2302, "step": 394 }, { "epoch": 0.8315789473684211, "grad_norm": 4.380475651099131, "learning_rate": 6.3073997047691e-06, "loss": 0.2887, "step": 395 }, { "epoch": 0.8336842105263158, "grad_norm": 3.289882212635633, "learning_rate": 6.291433147091583e-06, "loss": 0.2106, "step": 396 }, { "epoch": 0.8357894736842105, "grad_norm": 2.9345166529972952, "learning_rate": 6.275452466508076e-06, "loss": 0.2063, "step": 397 }, { "epoch": 0.8378947368421052, "grad_norm": 3.6273888701243355, "learning_rate": 6.259457837780741e-06, "loss": 0.2245, "step": 398 }, { "epoch": 0.84, "grad_norm": 3.6790816473406847, "learning_rate": 6.243449435824276e-06, "loss": 0.193, "step": 399 }, { "epoch": 0.8421052631578947, "grad_norm": 3.1914433056812426, "learning_rate": 6.227427435703997e-06, "loss": 0.2164, "step": 400 }, { "epoch": 0.8421052631578947, "eval_loss": 0.19365844130516052, "eval_runtime": 0.9303, "eval_samples_per_second": 41.923, "eval_steps_per_second": 10.749, "step": 400 }, { "epoch": 0.8442105263157895, "grad_norm": 3.0422393517644095, "learning_rate": 6.211392012633932e-06, "loss": 0.1945, "step": 401 }, { "epoch": 0.8463157894736842, "grad_norm": 3.3896222895957204, "learning_rate": 6.1953433419748995e-06, "loss": 0.2183, "step": 402 }, { "epoch": 0.848421052631579, "grad_norm": 2.8202481621645226, "learning_rate": 6.179281599232592e-06, "loss": 0.222, "step": 403 }, { "epoch": 0.8505263157894737, "grad_norm": 2.7904065123537545, "learning_rate": 6.163206960055652e-06, "loss": 0.1965, "step": 404 }, { "epoch": 0.8526315789473684, "grad_norm": 3.318994535797195, "learning_rate": 6.147119600233758e-06, "loss": 0.2116, "step": 405 }, { "epoch": 0.8547368421052631, "grad_norm": 3.787907520422109, "learning_rate": 6.131019695695702e-06, "loss": 0.2441, "step": 406 }, { "epoch": 0.8568421052631578, "grad_norm": 2.6044409986603947, "learning_rate": 6.114907422507459e-06, "loss": 0.1696, "step": 407 }, { "epoch": 0.8589473684210527, "grad_norm": 3.1123186046200577, "learning_rate": 6.098782956870266e-06, "loss": 0.1714, "step": 408 }, { "epoch": 0.8610526315789474, "grad_norm": 3.5641698976572886, "learning_rate": 6.0826464751187e-06, "loss": 0.2129, "step": 409 }, { "epoch": 0.8631578947368421, "grad_norm": 3.4449729307238397, "learning_rate": 6.066498153718735e-06, "loss": 0.2059, "step": 410 }, { "epoch": 0.8652631578947368, "grad_norm": 3.091646410008194, "learning_rate": 6.0503381692658305e-06, "loss": 0.2244, "step": 411 }, { "epoch": 0.8673684210526316, "grad_norm": 3.426356919246921, "learning_rate": 6.034166698482984e-06, "loss": 0.2493, "step": 412 }, { "epoch": 0.8694736842105263, "grad_norm": 3.009157338394937, "learning_rate": 6.0179839182188125e-06, "loss": 0.1769, "step": 413 }, { "epoch": 0.871578947368421, "grad_norm": 2.68571377740786, "learning_rate": 6.001790005445607e-06, "loss": 0.1801, "step": 414 }, { "epoch": 0.8736842105263158, "grad_norm": 3.13266305671967, "learning_rate": 5.985585137257401e-06, "loss": 0.2552, "step": 415 }, { "epoch": 0.8757894736842106, "grad_norm": 3.118129327899299, "learning_rate": 5.969369490868042e-06, "loss": 0.2213, "step": 416 }, { "epoch": 0.8778947368421053, "grad_norm": 3.1170850548476428, "learning_rate": 5.953143243609235e-06, "loss": 0.2228, "step": 417 }, { "epoch": 0.88, "grad_norm": 3.4825948598222136, "learning_rate": 5.936906572928625e-06, "loss": 0.2319, "step": 418 }, { "epoch": 0.8821052631578947, "grad_norm": 3.364021447031936, "learning_rate": 5.920659656387836e-06, "loss": 0.1935, "step": 419 }, { "epoch": 0.8842105263157894, "grad_norm": 2.7683123292862497, "learning_rate": 5.904402671660551e-06, "loss": 0.1622, "step": 420 }, { "epoch": 0.8863157894736842, "grad_norm": 3.089059939046834, "learning_rate": 5.8881357965305444e-06, "loss": 0.1677, "step": 421 }, { "epoch": 0.888421052631579, "grad_norm": 3.1348448785512675, "learning_rate": 5.871859208889759e-06, "loss": 0.1814, "step": 422 }, { "epoch": 0.8905263157894737, "grad_norm": 3.230597062554221, "learning_rate": 5.855573086736351e-06, "loss": 0.2091, "step": 423 }, { "epoch": 0.8926315789473684, "grad_norm": 2.883110792133594, "learning_rate": 5.839277608172739e-06, "loss": 0.1836, "step": 424 }, { "epoch": 0.8947368421052632, "grad_norm": 4.415508413152931, "learning_rate": 5.82297295140367e-06, "loss": 0.3021, "step": 425 }, { "epoch": 0.8968421052631579, "grad_norm": 2.953180474766528, "learning_rate": 5.806659294734256e-06, "loss": 0.1912, "step": 426 }, { "epoch": 0.8989473684210526, "grad_norm": 2.5058106964907814, "learning_rate": 5.790336816568033e-06, "loss": 0.1418, "step": 427 }, { "epoch": 0.9010526315789473, "grad_norm": 2.784908569571114, "learning_rate": 5.774005695405008e-06, "loss": 0.1733, "step": 428 }, { "epoch": 0.9031578947368422, "grad_norm": 3.2074914294258643, "learning_rate": 5.7576661098397024e-06, "loss": 0.217, "step": 429 }, { "epoch": 0.9052631578947369, "grad_norm": 3.8184949532629955, "learning_rate": 5.74131823855921e-06, "loss": 0.1928, "step": 430 }, { "epoch": 0.9073684210526316, "grad_norm": 2.884763048980032, "learning_rate": 5.72496226034123e-06, "loss": 0.179, "step": 431 }, { "epoch": 0.9094736842105263, "grad_norm": 3.131007686373488, "learning_rate": 5.708598354052122e-06, "loss": 0.2092, "step": 432 }, { "epoch": 0.911578947368421, "grad_norm": 3.600180991489015, "learning_rate": 5.692226698644938e-06, "loss": 0.1771, "step": 433 }, { "epoch": 0.9136842105263158, "grad_norm": 2.6092430120715386, "learning_rate": 5.675847473157485e-06, "loss": 0.1505, "step": 434 }, { "epoch": 0.9157894736842105, "grad_norm": 3.758561821727175, "learning_rate": 5.659460856710346e-06, "loss": 0.2449, "step": 435 }, { "epoch": 0.9178947368421052, "grad_norm": 3.005737007201367, "learning_rate": 5.643067028504931e-06, "loss": 0.1706, "step": 436 }, { "epoch": 0.92, "grad_norm": 2.9179364125259557, "learning_rate": 5.626666167821522e-06, "loss": 0.1812, "step": 437 }, { "epoch": 0.9221052631578948, "grad_norm": 3.1976728733646738, "learning_rate": 5.610258454017301e-06, "loss": 0.2345, "step": 438 }, { "epoch": 0.9242105263157895, "grad_norm": 3.475521355404778, "learning_rate": 5.593844066524401e-06, "loss": 0.254, "step": 439 }, { "epoch": 0.9263157894736842, "grad_norm": 3.5995093334761963, "learning_rate": 5.577423184847932e-06, "loss": 0.2348, "step": 440 }, { "epoch": 0.9284210526315789, "grad_norm": 2.835624142601258, "learning_rate": 5.560995988564023e-06, "loss": 0.1802, "step": 441 }, { "epoch": 0.9305263157894736, "grad_norm": 3.8989119467413613, "learning_rate": 5.544562657317863e-06, "loss": 0.2229, "step": 442 }, { "epoch": 0.9326315789473684, "grad_norm": 3.62544713638484, "learning_rate": 5.52812337082173e-06, "loss": 0.2153, "step": 443 }, { "epoch": 0.9347368421052632, "grad_norm": 3.392283457067749, "learning_rate": 5.5116783088530255e-06, "loss": 0.1824, "step": 444 }, { "epoch": 0.9368421052631579, "grad_norm": 4.303709047671292, "learning_rate": 5.495227651252315e-06, "loss": 0.298, "step": 445 }, { "epoch": 0.9389473684210526, "grad_norm": 2.830025115217364, "learning_rate": 5.478771577921351e-06, "loss": 0.1657, "step": 446 }, { "epoch": 0.9410526315789474, "grad_norm": 3.2810223083748826, "learning_rate": 5.4623102688211186e-06, "loss": 0.2494, "step": 447 }, { "epoch": 0.9431578947368421, "grad_norm": 3.4438213790356444, "learning_rate": 5.445843903969854e-06, "loss": 0.2062, "step": 448 }, { "epoch": 0.9452631578947368, "grad_norm": 2.879757240077144, "learning_rate": 5.429372663441086e-06, "loss": 0.2002, "step": 449 }, { "epoch": 0.9473684210526315, "grad_norm": 2.8548701745465563, "learning_rate": 5.412896727361663e-06, "loss": 0.1942, "step": 450 }, { "epoch": 0.9494736842105264, "grad_norm": 3.3673638986518872, "learning_rate": 5.396416275909779e-06, "loss": 0.2442, "step": 451 }, { "epoch": 0.9515789473684211, "grad_norm": 3.151677859424395, "learning_rate": 5.379931489313016e-06, "loss": 0.1857, "step": 452 }, { "epoch": 0.9536842105263158, "grad_norm": 2.3401970680752653, "learning_rate": 5.363442547846356e-06, "loss": 0.1574, "step": 453 }, { "epoch": 0.9557894736842105, "grad_norm": 3.171440734498741, "learning_rate": 5.346949631830221e-06, "loss": 0.1858, "step": 454 }, { "epoch": 0.9578947368421052, "grad_norm": 3.572091487862273, "learning_rate": 5.3304529216284974e-06, "loss": 0.233, "step": 455 }, { "epoch": 0.96, "grad_norm": 3.3362097570655704, "learning_rate": 5.3139525976465675e-06, "loss": 0.1577, "step": 456 }, { "epoch": 0.9621052631578947, "grad_norm": 3.521394981695169, "learning_rate": 5.2974488403293285e-06, "loss": 0.2165, "step": 457 }, { "epoch": 0.9642105263157895, "grad_norm": 3.5537369585027876, "learning_rate": 5.280941830159228e-06, "loss": 0.2035, "step": 458 }, { "epoch": 0.9663157894736842, "grad_norm": 2.6967873973758336, "learning_rate": 5.264431747654284e-06, "loss": 0.1903, "step": 459 }, { "epoch": 0.968421052631579, "grad_norm": 3.451224252952003, "learning_rate": 5.247918773366112e-06, "loss": 0.2189, "step": 460 }, { "epoch": 0.9705263157894737, "grad_norm": 3.3703738535372305, "learning_rate": 5.231403087877955e-06, "loss": 0.1658, "step": 461 }, { "epoch": 0.9726315789473684, "grad_norm": 2.850165218926584, "learning_rate": 5.214884871802703e-06, "loss": 0.1932, "step": 462 }, { "epoch": 0.9747368421052631, "grad_norm": 3.37619966686572, "learning_rate": 5.198364305780922e-06, "loss": 0.1988, "step": 463 }, { "epoch": 0.9768421052631578, "grad_norm": 2.960765636480082, "learning_rate": 5.1818415704788725e-06, "loss": 0.1904, "step": 464 }, { "epoch": 0.9789473684210527, "grad_norm": 2.7214682076892354, "learning_rate": 5.165316846586541e-06, "loss": 0.2017, "step": 465 }, { "epoch": 0.9810526315789474, "grad_norm": 2.4388957400236624, "learning_rate": 5.148790314815662e-06, "loss": 0.1764, "step": 466 }, { "epoch": 0.9831578947368421, "grad_norm": 2.8678613327792184, "learning_rate": 5.132262155897739e-06, "loss": 0.1778, "step": 467 }, { "epoch": 0.9852631578947368, "grad_norm": 3.210155912400773, "learning_rate": 5.11573255058207e-06, "loss": 0.2211, "step": 468 }, { "epoch": 0.9873684210526316, "grad_norm": 3.5663187219986074, "learning_rate": 5.099201679633769e-06, "loss": 0.2235, "step": 469 }, { "epoch": 0.9894736842105263, "grad_norm": 2.956548038927285, "learning_rate": 5.082669723831793e-06, "loss": 0.1466, "step": 470 }, { "epoch": 0.991578947368421, "grad_norm": 3.930944163022198, "learning_rate": 5.066136863966963e-06, "loss": 0.2018, "step": 471 }, { "epoch": 0.9936842105263158, "grad_norm": 3.031263337005746, "learning_rate": 5.049603280839982e-06, "loss": 0.2197, "step": 472 }, { "epoch": 0.9957894736842106, "grad_norm": 3.721525482445003, "learning_rate": 5.033069155259471e-06, "loss": 0.2175, "step": 473 }, { "epoch": 0.9978947368421053, "grad_norm": 2.238401391190845, "learning_rate": 5.016534668039976e-06, "loss": 0.1057, "step": 474 }, { "epoch": 1.0, "grad_norm": 2.700095513199168, "learning_rate": 5e-06, "loss": 0.1825, "step": 475 }, { "epoch": 1.0021052631578948, "grad_norm": 2.102327993497151, "learning_rate": 4.983465331960025e-06, "loss": 0.0885, "step": 476 }, { "epoch": 1.0042105263157894, "grad_norm": 2.2164179180128127, "learning_rate": 4.96693084474053e-06, "loss": 0.101, "step": 477 }, { "epoch": 1.0063157894736843, "grad_norm": 2.591907392954552, "learning_rate": 4.950396719160019e-06, "loss": 0.1016, "step": 478 }, { "epoch": 1.0084210526315789, "grad_norm": 2.3536825700723707, "learning_rate": 4.93386313603304e-06, "loss": 0.12, "step": 479 }, { "epoch": 1.0105263157894737, "grad_norm": 2.7777745052862106, "learning_rate": 4.917330276168208e-06, "loss": 0.102, "step": 480 }, { "epoch": 1.0126315789473683, "grad_norm": 2.4751195672854704, "learning_rate": 4.900798320366233e-06, "loss": 0.0989, "step": 481 }, { "epoch": 1.0147368421052632, "grad_norm": 1.889820583657195, "learning_rate": 4.884267449417932e-06, "loss": 0.0762, "step": 482 }, { "epoch": 1.016842105263158, "grad_norm": 2.2064010641296155, "learning_rate": 4.867737844102261e-06, "loss": 0.0932, "step": 483 }, { "epoch": 1.0189473684210526, "grad_norm": 2.9805461448209556, "learning_rate": 4.851209685184339e-06, "loss": 0.092, "step": 484 }, { "epoch": 1.0210526315789474, "grad_norm": 2.2601627421875032, "learning_rate": 4.8346831534134595e-06, "loss": 0.09, "step": 485 }, { "epoch": 1.023157894736842, "grad_norm": 2.3939195754809055, "learning_rate": 4.818158429521129e-06, "loss": 0.1179, "step": 486 }, { "epoch": 1.0252631578947369, "grad_norm": 2.3451597644966573, "learning_rate": 4.801635694219079e-06, "loss": 0.08, "step": 487 }, { "epoch": 1.0273684210526315, "grad_norm": 2.6640782365642592, "learning_rate": 4.785115128197298e-06, "loss": 0.1017, "step": 488 }, { "epoch": 1.0294736842105263, "grad_norm": 2.1177638500079365, "learning_rate": 4.768596912122046e-06, "loss": 0.0731, "step": 489 }, { "epoch": 1.0315789473684212, "grad_norm": 3.3240436401313618, "learning_rate": 4.752081226633888e-06, "loss": 0.0919, "step": 490 }, { "epoch": 1.0336842105263158, "grad_norm": 2.2384781946355794, "learning_rate": 4.735568252345718e-06, "loss": 0.0719, "step": 491 }, { "epoch": 1.0357894736842106, "grad_norm": 2.983441854897483, "learning_rate": 4.719058169840773e-06, "loss": 0.0745, "step": 492 }, { "epoch": 1.0378947368421052, "grad_norm": 2.6556422035702045, "learning_rate": 4.702551159670672e-06, "loss": 0.0734, "step": 493 }, { "epoch": 1.04, "grad_norm": 3.219998720581149, "learning_rate": 4.686047402353433e-06, "loss": 0.0775, "step": 494 }, { "epoch": 1.0421052631578946, "grad_norm": 2.6391239908163233, "learning_rate": 4.669547078371503e-06, "loss": 0.0787, "step": 495 }, { "epoch": 1.0442105263157895, "grad_norm": 3.041237149660994, "learning_rate": 4.65305036816978e-06, "loss": 0.089, "step": 496 }, { "epoch": 1.0463157894736843, "grad_norm": 3.687880198514741, "learning_rate": 4.636557452153645e-06, "loss": 0.0831, "step": 497 }, { "epoch": 1.048421052631579, "grad_norm": 5.095705229375661, "learning_rate": 4.620068510686985e-06, "loss": 0.0804, "step": 498 }, { "epoch": 1.0505263157894738, "grad_norm": 3.197432814925761, "learning_rate": 4.60358372409022e-06, "loss": 0.0574, "step": 499 }, { "epoch": 1.0526315789473684, "grad_norm": 3.09106465983814, "learning_rate": 4.587103272638339e-06, "loss": 0.0823, "step": 500 }, { "epoch": 1.0547368421052632, "grad_norm": 3.4094205016943193, "learning_rate": 4.570627336558915e-06, "loss": 0.077, "step": 501 }, { "epoch": 1.0568421052631578, "grad_norm": 3.2700532893266723, "learning_rate": 4.554156096030149e-06, "loss": 0.0888, "step": 502 }, { "epoch": 1.0589473684210526, "grad_norm": 3.8444997651481274, "learning_rate": 4.537689731178883e-06, "loss": 0.0995, "step": 503 }, { "epoch": 1.0610526315789475, "grad_norm": 3.460457464328528, "learning_rate": 4.5212284220786495e-06, "loss": 0.0852, "step": 504 }, { "epoch": 1.063157894736842, "grad_norm": 3.5197825821543844, "learning_rate": 4.504772348747687e-06, "loss": 0.089, "step": 505 }, { "epoch": 1.065263157894737, "grad_norm": 2.9315058365098148, "learning_rate": 4.488321691146975e-06, "loss": 0.0917, "step": 506 }, { "epoch": 1.0673684210526315, "grad_norm": 2.959097650131179, "learning_rate": 4.471876629178273e-06, "loss": 0.0927, "step": 507 }, { "epoch": 1.0694736842105264, "grad_norm": 2.9752640084242543, "learning_rate": 4.4554373426821375e-06, "loss": 0.0754, "step": 508 }, { "epoch": 1.071578947368421, "grad_norm": 3.6867032363293633, "learning_rate": 4.439004011435979e-06, "loss": 0.0931, "step": 509 }, { "epoch": 1.0736842105263158, "grad_norm": 3.9162282663094437, "learning_rate": 4.42257681515207e-06, "loss": 0.0915, "step": 510 }, { "epoch": 1.0757894736842106, "grad_norm": 3.245956552006904, "learning_rate": 4.406155933475599e-06, "loss": 0.0825, "step": 511 }, { "epoch": 1.0778947368421052, "grad_norm": 3.3407176706701303, "learning_rate": 4.3897415459827e-06, "loss": 0.0833, "step": 512 }, { "epoch": 1.08, "grad_norm": 3.2605562952641325, "learning_rate": 4.373333832178478e-06, "loss": 0.0836, "step": 513 }, { "epoch": 1.0821052631578947, "grad_norm": 3.022186927091034, "learning_rate": 4.356932971495071e-06, "loss": 0.0893, "step": 514 }, { "epoch": 1.0842105263157895, "grad_norm": 2.570982657234066, "learning_rate": 4.340539143289655e-06, "loss": 0.0691, "step": 515 }, { "epoch": 1.0863157894736841, "grad_norm": 2.6495275768322966, "learning_rate": 4.324152526842517e-06, "loss": 0.0703, "step": 516 }, { "epoch": 1.088421052631579, "grad_norm": 2.9529381342049357, "learning_rate": 4.307773301355063e-06, "loss": 0.0878, "step": 517 }, { "epoch": 1.0905263157894738, "grad_norm": 3.2139910027789913, "learning_rate": 4.291401645947879e-06, "loss": 0.0858, "step": 518 }, { "epoch": 1.0926315789473684, "grad_norm": 3.359687231677775, "learning_rate": 4.275037739658771e-06, "loss": 0.0991, "step": 519 }, { "epoch": 1.0947368421052632, "grad_norm": 2.7257961811651867, "learning_rate": 4.25868176144079e-06, "loss": 0.0636, "step": 520 }, { "epoch": 1.0968421052631578, "grad_norm": 3.0300299027782205, "learning_rate": 4.242333890160299e-06, "loss": 0.0744, "step": 521 }, { "epoch": 1.0989473684210527, "grad_norm": 2.673076324741469, "learning_rate": 4.225994304594994e-06, "loss": 0.0733, "step": 522 }, { "epoch": 1.1010526315789473, "grad_norm": 3.434548397420313, "learning_rate": 4.209663183431969e-06, "loss": 0.0894, "step": 523 }, { "epoch": 1.1031578947368421, "grad_norm": 2.656738290688316, "learning_rate": 4.193340705265746e-06, "loss": 0.0816, "step": 524 }, { "epoch": 1.1052631578947367, "grad_norm": 2.752642253228426, "learning_rate": 4.17702704859633e-06, "loss": 0.0737, "step": 525 }, { "epoch": 1.1073684210526316, "grad_norm": 2.8240215409779204, "learning_rate": 4.160722391827262e-06, "loss": 0.0946, "step": 526 }, { "epoch": 1.1094736842105264, "grad_norm": 2.6310984451059523, "learning_rate": 4.14442691326365e-06, "loss": 0.075, "step": 527 }, { "epoch": 1.111578947368421, "grad_norm": 3.3487552916421097, "learning_rate": 4.128140791110243e-06, "loss": 0.0904, "step": 528 }, { "epoch": 1.1136842105263158, "grad_norm": 3.071402311195297, "learning_rate": 4.111864203469457e-06, "loss": 0.079, "step": 529 }, { "epoch": 1.1157894736842104, "grad_norm": 2.8739724428152993, "learning_rate": 4.0955973283394525e-06, "loss": 0.0844, "step": 530 }, { "epoch": 1.1178947368421053, "grad_norm": 2.7822422708634442, "learning_rate": 4.079340343612165e-06, "loss": 0.0943, "step": 531 }, { "epoch": 1.12, "grad_norm": 2.5902291543214364, "learning_rate": 4.063093427071376e-06, "loss": 0.0827, "step": 532 }, { "epoch": 1.1221052631578947, "grad_norm": 3.084348236278604, "learning_rate": 4.046856756390767e-06, "loss": 0.0892, "step": 533 }, { "epoch": 1.1242105263157895, "grad_norm": 2.95461042174687, "learning_rate": 4.03063050913196e-06, "loss": 0.0816, "step": 534 }, { "epoch": 1.1263157894736842, "grad_norm": 2.7009483055282892, "learning_rate": 4.0144148627426e-06, "loss": 0.063, "step": 535 }, { "epoch": 1.128421052631579, "grad_norm": 3.2167472489705062, "learning_rate": 3.998209994554395e-06, "loss": 0.0993, "step": 536 }, { "epoch": 1.1305263157894736, "grad_norm": 3.164155379501995, "learning_rate": 3.982016081781189e-06, "loss": 0.0928, "step": 537 }, { "epoch": 1.1326315789473684, "grad_norm": 2.6712684161255873, "learning_rate": 3.965833301517017e-06, "loss": 0.0792, "step": 538 }, { "epoch": 1.134736842105263, "grad_norm": 3.590217130090868, "learning_rate": 3.949661830734172e-06, "loss": 0.1122, "step": 539 }, { "epoch": 1.1368421052631579, "grad_norm": 2.757855187593266, "learning_rate": 3.9335018462812664e-06, "loss": 0.0732, "step": 540 }, { "epoch": 1.1389473684210527, "grad_norm": 3.536487052728721, "learning_rate": 3.9173535248813026e-06, "loss": 0.0678, "step": 541 }, { "epoch": 1.1410526315789473, "grad_norm": 2.4862366546978483, "learning_rate": 3.901217043129735e-06, "loss": 0.0728, "step": 542 }, { "epoch": 1.1431578947368422, "grad_norm": 3.2902474007718907, "learning_rate": 3.885092577492543e-06, "loss": 0.1086, "step": 543 }, { "epoch": 1.1452631578947368, "grad_norm": 3.451017932646852, "learning_rate": 3.8689803043043e-06, "loss": 0.0868, "step": 544 }, { "epoch": 1.1473684210526316, "grad_norm": 2.8980165245573692, "learning_rate": 3.852880399766243e-06, "loss": 0.0829, "step": 545 }, { "epoch": 1.1494736842105264, "grad_norm": 2.8916674632956134, "learning_rate": 3.8367930399443495e-06, "loss": 0.0782, "step": 546 }, { "epoch": 1.151578947368421, "grad_norm": 2.505026430566736, "learning_rate": 3.820718400767409e-06, "loss": 0.0763, "step": 547 }, { "epoch": 1.1536842105263159, "grad_norm": 3.7982142015035305, "learning_rate": 3.8046566580251e-06, "loss": 0.0895, "step": 548 }, { "epoch": 1.1557894736842105, "grad_norm": 2.550264620010391, "learning_rate": 3.7886079873660693e-06, "loss": 0.085, "step": 549 }, { "epoch": 1.1578947368421053, "grad_norm": 3.2823943298060483, "learning_rate": 3.7725725642960047e-06, "loss": 0.0838, "step": 550 }, { "epoch": 1.16, "grad_norm": 3.091694066417572, "learning_rate": 3.756550564175727e-06, "loss": 0.0945, "step": 551 }, { "epoch": 1.1621052631578948, "grad_norm": 2.6667880955040855, "learning_rate": 3.7405421622192607e-06, "loss": 0.067, "step": 552 }, { "epoch": 1.1642105263157894, "grad_norm": 3.2831274480460055, "learning_rate": 3.7245475334919246e-06, "loss": 0.0994, "step": 553 }, { "epoch": 1.1663157894736842, "grad_norm": 2.2115339073168903, "learning_rate": 3.7085668529084183e-06, "loss": 0.0609, "step": 554 }, { "epoch": 1.168421052631579, "grad_norm": 3.103922834249276, "learning_rate": 3.6926002952309015e-06, "loss": 0.0705, "step": 555 }, { "epoch": 1.1705263157894736, "grad_norm": 3.1379466258374413, "learning_rate": 3.676648035067093e-06, "loss": 0.0755, "step": 556 }, { "epoch": 1.1726315789473685, "grad_norm": 3.144558129851556, "learning_rate": 3.6607102468683524e-06, "loss": 0.0906, "step": 557 }, { "epoch": 1.174736842105263, "grad_norm": 2.843620211669143, "learning_rate": 3.64478710492778e-06, "loss": 0.0752, "step": 558 }, { "epoch": 1.176842105263158, "grad_norm": 2.737119577196797, "learning_rate": 3.628878783378302e-06, "loss": 0.0855, "step": 559 }, { "epoch": 1.1789473684210527, "grad_norm": 3.38606655435301, "learning_rate": 3.6129854561907786e-06, "loss": 0.1073, "step": 560 }, { "epoch": 1.1810526315789474, "grad_norm": 3.022666071905334, "learning_rate": 3.5971072971720844e-06, "loss": 0.096, "step": 561 }, { "epoch": 1.1831578947368422, "grad_norm": 3.013844862309235, "learning_rate": 3.581244479963225e-06, "loss": 0.0699, "step": 562 }, { "epoch": 1.1852631578947368, "grad_norm": 2.4616484667093532, "learning_rate": 3.56539717803743e-06, "loss": 0.0686, "step": 563 }, { "epoch": 1.1873684210526316, "grad_norm": 3.092537315474559, "learning_rate": 3.5495655646982506e-06, "loss": 0.1022, "step": 564 }, { "epoch": 1.1894736842105262, "grad_norm": 2.781358161394791, "learning_rate": 3.533749813077677e-06, "loss": 0.0804, "step": 565 }, { "epoch": 1.191578947368421, "grad_norm": 2.453888657239943, "learning_rate": 3.517950096134232e-06, "loss": 0.0577, "step": 566 }, { "epoch": 1.1936842105263157, "grad_norm": 2.935143310735812, "learning_rate": 3.5021665866510924e-06, "loss": 0.0905, "step": 567 }, { "epoch": 1.1957894736842105, "grad_norm": 2.6509666167907726, "learning_rate": 3.4863994572341845e-06, "loss": 0.0854, "step": 568 }, { "epoch": 1.1978947368421053, "grad_norm": 2.955009635915876, "learning_rate": 3.470648880310313e-06, "loss": 0.0883, "step": 569 }, { "epoch": 1.2, "grad_norm": 3.2167621312639794, "learning_rate": 3.4549150281252635e-06, "loss": 0.072, "step": 570 }, { "epoch": 1.2021052631578948, "grad_norm": 2.7638395907473225, "learning_rate": 3.4391980727419206e-06, "loss": 0.082, "step": 571 }, { "epoch": 1.2042105263157894, "grad_norm": 3.4412996909683806, "learning_rate": 3.423498186038393e-06, "loss": 0.1093, "step": 572 }, { "epoch": 1.2063157894736842, "grad_norm": 2.6520865262952515, "learning_rate": 3.4078155397061243e-06, "loss": 0.07, "step": 573 }, { "epoch": 1.208421052631579, "grad_norm": 2.376283277388007, "learning_rate": 3.3921503052480243e-06, "loss": 0.0748, "step": 574 }, { "epoch": 1.2105263157894737, "grad_norm": 2.242871891939484, "learning_rate": 3.3765026539765832e-06, "loss": 0.0681, "step": 575 }, { "epoch": 1.2126315789473685, "grad_norm": 4.4744073948915535, "learning_rate": 3.3608727570120114e-06, "loss": 0.0928, "step": 576 }, { "epoch": 1.2147368421052631, "grad_norm": 2.7537788196508903, "learning_rate": 3.3452607852803585e-06, "loss": 0.0966, "step": 577 }, { "epoch": 1.216842105263158, "grad_norm": 3.1307439683337517, "learning_rate": 3.3296669095116454e-06, "loss": 0.0778, "step": 578 }, { "epoch": 1.2189473684210526, "grad_norm": 2.7177020729876253, "learning_rate": 3.3140913002379993e-06, "loss": 0.0697, "step": 579 }, { "epoch": 1.2210526315789474, "grad_norm": 2.4210212110003484, "learning_rate": 3.298534127791785e-06, "loss": 0.0898, "step": 580 }, { "epoch": 1.223157894736842, "grad_norm": 2.486998916941089, "learning_rate": 3.2829955623037536e-06, "loss": 0.0906, "step": 581 }, { "epoch": 1.2252631578947368, "grad_norm": 2.398945212379016, "learning_rate": 3.267475773701161e-06, "loss": 0.072, "step": 582 }, { "epoch": 1.2273684210526317, "grad_norm": 2.98342220040315, "learning_rate": 3.251974931705933e-06, "loss": 0.0884, "step": 583 }, { "epoch": 1.2294736842105263, "grad_norm": 2.471587722526877, "learning_rate": 3.236493205832795e-06, "loss": 0.0803, "step": 584 }, { "epoch": 1.231578947368421, "grad_norm": 3.3871586088205277, "learning_rate": 3.2210307653874175e-06, "loss": 0.0905, "step": 585 }, { "epoch": 1.2336842105263157, "grad_norm": 3.1548244792093727, "learning_rate": 3.205587779464576e-06, "loss": 0.0807, "step": 586 }, { "epoch": 1.2357894736842105, "grad_norm": 2.751888064247698, "learning_rate": 3.1901644169462854e-06, "loss": 0.1001, "step": 587 }, { "epoch": 1.2378947368421054, "grad_norm": 2.953100733477854, "learning_rate": 3.1747608464999723e-06, "loss": 0.0859, "step": 588 }, { "epoch": 1.24, "grad_norm": 3.0915284845137383, "learning_rate": 3.1593772365766107e-06, "loss": 0.0916, "step": 589 }, { "epoch": 1.2421052631578948, "grad_norm": 3.212692328218412, "learning_rate": 3.1440137554088957e-06, "loss": 0.087, "step": 590 }, { "epoch": 1.2442105263157894, "grad_norm": 3.6741186905601673, "learning_rate": 3.128670571009399e-06, "loss": 0.0918, "step": 591 }, { "epoch": 1.2463157894736843, "grad_norm": 3.052869956993313, "learning_rate": 3.1133478511687217e-06, "loss": 0.0882, "step": 592 }, { "epoch": 1.2484210526315789, "grad_norm": 2.7038650862339026, "learning_rate": 3.0980457634536775e-06, "loss": 0.0694, "step": 593 }, { "epoch": 1.2505263157894737, "grad_norm": 3.018334207884892, "learning_rate": 3.082764475205442e-06, "loss": 0.0858, "step": 594 }, { "epoch": 1.2526315789473683, "grad_norm": 2.569495801078813, "learning_rate": 3.06750415353774e-06, "loss": 0.0782, "step": 595 }, { "epoch": 1.2547368421052632, "grad_norm": 3.3984298834388835, "learning_rate": 3.052264965335e-06, "loss": 0.109, "step": 596 }, { "epoch": 1.256842105263158, "grad_norm": 2.3590945955494416, "learning_rate": 3.0370470772505433e-06, "loss": 0.071, "step": 597 }, { "epoch": 1.2589473684210526, "grad_norm": 2.234877038235419, "learning_rate": 3.02185065570476e-06, "loss": 0.0692, "step": 598 }, { "epoch": 1.2610526315789474, "grad_norm": 4.052354805427412, "learning_rate": 3.0066758668832752e-06, "loss": 0.0948, "step": 599 }, { "epoch": 1.263157894736842, "grad_norm": 2.742079337401893, "learning_rate": 2.991522876735154e-06, "loss": 0.0969, "step": 600 }, { "epoch": 1.263157894736842, "eval_loss": 0.20044729113578796, "eval_runtime": 0.929, "eval_samples_per_second": 41.983, "eval_steps_per_second": 10.765, "step": 600 }, { "epoch": 1.2652631578947369, "grad_norm": 2.8741805199859205, "learning_rate": 2.9763918509710647e-06, "loss": 0.0963, "step": 601 }, { "epoch": 1.2673684210526317, "grad_norm": 2.737388943410033, "learning_rate": 2.9612829550614836e-06, "loss": 0.0826, "step": 602 }, { "epoch": 1.2694736842105263, "grad_norm": 2.717582073137317, "learning_rate": 2.9461963542348737e-06, "loss": 0.0681, "step": 603 }, { "epoch": 1.271578947368421, "grad_norm": 3.3716699599065123, "learning_rate": 2.931132213475884e-06, "loss": 0.101, "step": 604 }, { "epoch": 1.2736842105263158, "grad_norm": 2.439989476563021, "learning_rate": 2.9160906975235493e-06, "loss": 0.0732, "step": 605 }, { "epoch": 1.2757894736842106, "grad_norm": 3.092634953355724, "learning_rate": 2.9010719708694724e-06, "loss": 0.0744, "step": 606 }, { "epoch": 1.2778947368421052, "grad_norm": 2.638312682828106, "learning_rate": 2.8860761977560435e-06, "loss": 0.0757, "step": 607 }, { "epoch": 1.28, "grad_norm": 2.3219077212278494, "learning_rate": 2.871103542174637e-06, "loss": 0.0941, "step": 608 }, { "epoch": 1.2821052631578946, "grad_norm": 2.7468019529994607, "learning_rate": 2.8561541678638145e-06, "loss": 0.0679, "step": 609 }, { "epoch": 1.2842105263157895, "grad_norm": 2.592555737944712, "learning_rate": 2.8412282383075362e-06, "loss": 0.072, "step": 610 }, { "epoch": 1.2863157894736843, "grad_norm": 2.5748600678466493, "learning_rate": 2.826325916733378e-06, "loss": 0.078, "step": 611 }, { "epoch": 1.288421052631579, "grad_norm": 3.272935966473756, "learning_rate": 2.811447366110741e-06, "loss": 0.0985, "step": 612 }, { "epoch": 1.2905263157894737, "grad_norm": 2.718391171117533, "learning_rate": 2.796592749149071e-06, "loss": 0.0856, "step": 613 }, { "epoch": 1.2926315789473684, "grad_norm": 2.895251659895727, "learning_rate": 2.7817622282960816e-06, "loss": 0.0814, "step": 614 }, { "epoch": 1.2947368421052632, "grad_norm": 3.098118941203153, "learning_rate": 2.766955965735968e-06, "loss": 0.1, "step": 615 }, { "epoch": 1.296842105263158, "grad_norm": 2.4925459738486078, "learning_rate": 2.7521741233876496e-06, "loss": 0.066, "step": 616 }, { "epoch": 1.2989473684210526, "grad_norm": 3.2668047779458447, "learning_rate": 2.7374168629029814e-06, "loss": 0.0662, "step": 617 }, { "epoch": 1.3010526315789472, "grad_norm": 2.365373318259492, "learning_rate": 2.722684345665004e-06, "loss": 0.0568, "step": 618 }, { "epoch": 1.303157894736842, "grad_norm": 2.674596379228086, "learning_rate": 2.707976732786166e-06, "loss": 0.0716, "step": 619 }, { "epoch": 1.305263157894737, "grad_norm": 2.629799474227148, "learning_rate": 2.693294185106562e-06, "loss": 0.0708, "step": 620 }, { "epoch": 1.3073684210526315, "grad_norm": 2.7427790950830917, "learning_rate": 2.678636863192184e-06, "loss": 0.0819, "step": 621 }, { "epoch": 1.3094736842105263, "grad_norm": 2.355303580669883, "learning_rate": 2.6640049273331516e-06, "loss": 0.0682, "step": 622 }, { "epoch": 1.311578947368421, "grad_norm": 2.5202135513477595, "learning_rate": 2.649398537541978e-06, "loss": 0.0592, "step": 623 }, { "epoch": 1.3136842105263158, "grad_norm": 3.1053392641729056, "learning_rate": 2.6348178535517967e-06, "loss": 0.0815, "step": 624 }, { "epoch": 1.3157894736842106, "grad_norm": 2.287461098881695, "learning_rate": 2.6202630348146323e-06, "loss": 0.0809, "step": 625 }, { "epoch": 1.3178947368421052, "grad_norm": 2.443760626345547, "learning_rate": 2.605734240499652e-06, "loss": 0.0604, "step": 626 }, { "epoch": 1.32, "grad_norm": 2.845171834877243, "learning_rate": 2.5912316294914232e-06, "loss": 0.0744, "step": 627 }, { "epoch": 1.3221052631578947, "grad_norm": 2.1955991785027127, "learning_rate": 2.576755360388177e-06, "loss": 0.0592, "step": 628 }, { "epoch": 1.3242105263157895, "grad_norm": 3.8739275751093456, "learning_rate": 2.562305591500069e-06, "loss": 0.1056, "step": 629 }, { "epoch": 1.3263157894736843, "grad_norm": 2.6932111092229234, "learning_rate": 2.5478824808474613e-06, "loss": 0.0762, "step": 630 }, { "epoch": 1.328421052631579, "grad_norm": 2.3653261808302393, "learning_rate": 2.5334861861591753e-06, "loss": 0.072, "step": 631 }, { "epoch": 1.3305263157894736, "grad_norm": 3.0167769884448057, "learning_rate": 2.5191168648707888e-06, "loss": 0.0896, "step": 632 }, { "epoch": 1.3326315789473684, "grad_norm": 2.4444541248066796, "learning_rate": 2.5047746741228977e-06, "loss": 0.0679, "step": 633 }, { "epoch": 1.3347368421052632, "grad_norm": 3.1319838501376056, "learning_rate": 2.490459770759398e-06, "loss": 0.0794, "step": 634 }, { "epoch": 1.3368421052631578, "grad_norm": 2.4160632314580583, "learning_rate": 2.476172311325783e-06, "loss": 0.057, "step": 635 }, { "epoch": 1.3389473684210527, "grad_norm": 2.8056259509770083, "learning_rate": 2.461912452067415e-06, "loss": 0.0788, "step": 636 }, { "epoch": 1.3410526315789473, "grad_norm": 3.5606250812923994, "learning_rate": 2.447680348927837e-06, "loss": 0.0991, "step": 637 }, { "epoch": 1.343157894736842, "grad_norm": 2.939424365838173, "learning_rate": 2.433476157547044e-06, "loss": 0.0791, "step": 638 }, { "epoch": 1.345263157894737, "grad_norm": 2.3944747127012547, "learning_rate": 2.4193000332597984e-06, "loss": 0.0776, "step": 639 }, { "epoch": 1.3473684210526315, "grad_norm": 2.4097140332528144, "learning_rate": 2.4051521310939258e-06, "loss": 0.0548, "step": 640 }, { "epoch": 1.3494736842105264, "grad_norm": 2.484876677592921, "learning_rate": 2.391032605768613e-06, "loss": 0.0639, "step": 641 }, { "epoch": 1.351578947368421, "grad_norm": 2.9569313033101023, "learning_rate": 2.3769416116927335e-06, "loss": 0.0702, "step": 642 }, { "epoch": 1.3536842105263158, "grad_norm": 2.3154837496863268, "learning_rate": 2.3628793029631353e-06, "loss": 0.0696, "step": 643 }, { "epoch": 1.3557894736842107, "grad_norm": 4.0950527559872345, "learning_rate": 2.3488458333629777e-06, "loss": 0.0988, "step": 644 }, { "epoch": 1.3578947368421053, "grad_norm": 3.2898299985671953, "learning_rate": 2.3348413563600324e-06, "loss": 0.0998, "step": 645 }, { "epoch": 1.3599999999999999, "grad_norm": 3.5565589465236402, "learning_rate": 2.320866025105016e-06, "loss": 0.0748, "step": 646 }, { "epoch": 1.3621052631578947, "grad_norm": 2.9838579493260142, "learning_rate": 2.3069199924299175e-06, "loss": 0.0781, "step": 647 }, { "epoch": 1.3642105263157895, "grad_norm": 3.109898543479839, "learning_rate": 2.29300341084631e-06, "loss": 0.0702, "step": 648 }, { "epoch": 1.3663157894736842, "grad_norm": 2.736969851304859, "learning_rate": 2.2791164325437047e-06, "loss": 0.0792, "step": 649 }, { "epoch": 1.368421052631579, "grad_norm": 3.3868564041377973, "learning_rate": 2.265259209387867e-06, "loss": 0.0899, "step": 650 }, { "epoch": 1.3705263157894736, "grad_norm": 2.7082372890636375, "learning_rate": 2.2514318929191707e-06, "loss": 0.0752, "step": 651 }, { "epoch": 1.3726315789473684, "grad_norm": 3.274685542658562, "learning_rate": 2.2376346343509343e-06, "loss": 0.0789, "step": 652 }, { "epoch": 1.3747368421052633, "grad_norm": 3.3918195389906436, "learning_rate": 2.2238675845677663e-06, "loss": 0.0811, "step": 653 }, { "epoch": 1.3768421052631579, "grad_norm": 2.436153684588233, "learning_rate": 2.2101308941239204e-06, "loss": 0.0694, "step": 654 }, { "epoch": 1.3789473684210527, "grad_norm": 3.2956597758884816, "learning_rate": 2.1964247132416373e-06, "loss": 0.0845, "step": 655 }, { "epoch": 1.3810526315789473, "grad_norm": 2.891107537325035, "learning_rate": 2.182749191809518e-06, "loss": 0.0806, "step": 656 }, { "epoch": 1.3831578947368421, "grad_norm": 3.174882207717556, "learning_rate": 2.1691044793808734e-06, "loss": 0.0766, "step": 657 }, { "epoch": 1.385263157894737, "grad_norm": 3.239310623984899, "learning_rate": 2.1554907251720947e-06, "loss": 0.1132, "step": 658 }, { "epoch": 1.3873684210526316, "grad_norm": 3.0000793459288797, "learning_rate": 2.1419080780610123e-06, "loss": 0.0779, "step": 659 }, { "epoch": 1.3894736842105262, "grad_norm": 3.0156978277959166, "learning_rate": 2.1283566865852824e-06, "loss": 0.074, "step": 660 }, { "epoch": 1.391578947368421, "grad_norm": 2.6551132879995007, "learning_rate": 2.11483669894075e-06, "loss": 0.0746, "step": 661 }, { "epoch": 1.3936842105263159, "grad_norm": 2.3673083937385875, "learning_rate": 2.1013482629798334e-06, "loss": 0.0714, "step": 662 }, { "epoch": 1.3957894736842105, "grad_norm": 3.2553558351574288, "learning_rate": 2.08789152620991e-06, "loss": 0.1127, "step": 663 }, { "epoch": 1.3978947368421053, "grad_norm": 3.1793358246388386, "learning_rate": 2.0744666357916925e-06, "loss": 0.1027, "step": 664 }, { "epoch": 1.4, "grad_norm": 2.3044156367485766, "learning_rate": 2.061073738537635e-06, "loss": 0.079, "step": 665 }, { "epoch": 1.4021052631578947, "grad_norm": 3.038506581721067, "learning_rate": 2.0477129809103147e-06, "loss": 0.078, "step": 666 }, { "epoch": 1.4042105263157896, "grad_norm": 3.2134574970286254, "learning_rate": 2.034384509020837e-06, "loss": 0.0787, "step": 667 }, { "epoch": 1.4063157894736842, "grad_norm": 2.8833109727229593, "learning_rate": 2.021088468627237e-06, "loss": 0.0945, "step": 668 }, { "epoch": 1.408421052631579, "grad_norm": 2.6681060575893922, "learning_rate": 2.0078250051328783e-06, "loss": 0.0785, "step": 669 }, { "epoch": 1.4105263157894736, "grad_norm": 3.309105305859723, "learning_rate": 1.9945942635848745e-06, "loss": 0.0932, "step": 670 }, { "epoch": 1.4126315789473685, "grad_norm": 2.7557375649742992, "learning_rate": 1.981396388672496e-06, "loss": 0.0704, "step": 671 }, { "epoch": 1.4147368421052633, "grad_norm": 2.780789111855544, "learning_rate": 1.9682315247255897e-06, "loss": 0.0681, "step": 672 }, { "epoch": 1.416842105263158, "grad_norm": 2.891129671769762, "learning_rate": 1.9550998157129946e-06, "loss": 0.0689, "step": 673 }, { "epoch": 1.4189473684210525, "grad_norm": 2.8438288324834136, "learning_rate": 1.9420014052409793e-06, "loss": 0.0948, "step": 674 }, { "epoch": 1.4210526315789473, "grad_norm": 2.8648199763393363, "learning_rate": 1.928936436551661e-06, "loss": 0.0852, "step": 675 }, { "epoch": 1.4231578947368422, "grad_norm": 2.4983276715177802, "learning_rate": 1.915905052521445e-06, "loss": 0.0691, "step": 676 }, { "epoch": 1.4252631578947368, "grad_norm": 2.6685238310395167, "learning_rate": 1.9029073956594607e-06, "loss": 0.0902, "step": 677 }, { "epoch": 1.4273684210526316, "grad_norm": 2.6981688261841623, "learning_rate": 1.8899436081059974e-06, "loss": 0.0626, "step": 678 }, { "epoch": 1.4294736842105262, "grad_norm": 3.4512093051473287, "learning_rate": 1.877013831630961e-06, "loss": 0.0873, "step": 679 }, { "epoch": 1.431578947368421, "grad_norm": 3.136937795473418, "learning_rate": 1.864118207632315e-06, "loss": 0.0817, "step": 680 }, { "epoch": 1.433684210526316, "grad_norm": 2.845589565177414, "learning_rate": 1.851256877134538e-06, "loss": 0.084, "step": 681 }, { "epoch": 1.4357894736842105, "grad_norm": 2.6247730269493634, "learning_rate": 1.838429980787081e-06, "loss": 0.0868, "step": 682 }, { "epoch": 1.4378947368421053, "grad_norm": 2.212932444663422, "learning_rate": 1.825637658862824e-06, "loss": 0.056, "step": 683 }, { "epoch": 1.44, "grad_norm": 3.0708180874395525, "learning_rate": 1.8128800512565514e-06, "loss": 0.0819, "step": 684 }, { "epoch": 1.4421052631578948, "grad_norm": 3.191848306499893, "learning_rate": 1.8001572974834169e-06, "loss": 0.0874, "step": 685 }, { "epoch": 1.4442105263157896, "grad_norm": 3.118644672611863, "learning_rate": 1.7874695366774191e-06, "loss": 0.0703, "step": 686 }, { "epoch": 1.4463157894736842, "grad_norm": 3.3640943896050577, "learning_rate": 1.774816907589873e-06, "loss": 0.0792, "step": 687 }, { "epoch": 1.4484210526315788, "grad_norm": 2.4258203150187994, "learning_rate": 1.7621995485879062e-06, "loss": 0.0759, "step": 688 }, { "epoch": 1.4505263157894737, "grad_norm": 2.581622498733916, "learning_rate": 1.749617597652934e-06, "loss": 0.063, "step": 689 }, { "epoch": 1.4526315789473685, "grad_norm": 2.946473481196987, "learning_rate": 1.7370711923791567e-06, "loss": 0.0822, "step": 690 }, { "epoch": 1.454736842105263, "grad_norm": 2.4753384481559055, "learning_rate": 1.7245604699720536e-06, "loss": 0.0598, "step": 691 }, { "epoch": 1.456842105263158, "grad_norm": 3.416152120993626, "learning_rate": 1.7120855672468779e-06, "loss": 0.0907, "step": 692 }, { "epoch": 1.4589473684210525, "grad_norm": 2.695879145021625, "learning_rate": 1.6996466206271679e-06, "loss": 0.0612, "step": 693 }, { "epoch": 1.4610526315789474, "grad_norm": 2.9873858087756635, "learning_rate": 1.6872437661432518e-06, "loss": 0.0811, "step": 694 }, { "epoch": 1.4631578947368422, "grad_norm": 3.4960013543762125, "learning_rate": 1.6748771394307584e-06, "loss": 0.0813, "step": 695 }, { "epoch": 1.4652631578947368, "grad_norm": 2.277927581675807, "learning_rate": 1.6625468757291379e-06, "loss": 0.0561, "step": 696 }, { "epoch": 1.4673684210526317, "grad_norm": 2.3801716966637105, "learning_rate": 1.6502531098801756e-06, "loss": 0.0672, "step": 697 }, { "epoch": 1.4694736842105263, "grad_norm": 3.4687383026734837, "learning_rate": 1.6379959763265268e-06, "loss": 0.0876, "step": 698 }, { "epoch": 1.471578947368421, "grad_norm": 3.2466329337423874, "learning_rate": 1.62577560911024e-06, "loss": 0.0778, "step": 699 }, { "epoch": 1.4736842105263157, "grad_norm": 2.904526223024363, "learning_rate": 1.6135921418712959e-06, "loss": 0.0938, "step": 700 }, { "epoch": 1.4757894736842105, "grad_norm": 2.686553988217244, "learning_rate": 1.6014457078461354e-06, "loss": 0.0643, "step": 701 }, { "epoch": 1.4778947368421052, "grad_norm": 3.5299540645067444, "learning_rate": 1.5893364398662175e-06, "loss": 0.0936, "step": 702 }, { "epoch": 1.48, "grad_norm": 2.9376299944007855, "learning_rate": 1.5772644703565564e-06, "loss": 0.0853, "step": 703 }, { "epoch": 1.4821052631578948, "grad_norm": 2.688100953143273, "learning_rate": 1.5652299313342772e-06, "loss": 0.0792, "step": 704 }, { "epoch": 1.4842105263157894, "grad_norm": 3.4048347453623857, "learning_rate": 1.5532329544071712e-06, "loss": 0.083, "step": 705 }, { "epoch": 1.4863157894736843, "grad_norm": 2.0479188515220623, "learning_rate": 1.5412736707722537e-06, "loss": 0.0483, "step": 706 }, { "epoch": 1.4884210526315789, "grad_norm": 2.2750434748340935, "learning_rate": 1.5293522112143371e-06, "loss": 0.0619, "step": 707 }, { "epoch": 1.4905263157894737, "grad_norm": 2.7200793140054103, "learning_rate": 1.517468706104589e-06, "loss": 0.0727, "step": 708 }, { "epoch": 1.4926315789473685, "grad_norm": 2.2546054623423335, "learning_rate": 1.505623285399121e-06, "loss": 0.0488, "step": 709 }, { "epoch": 1.4947368421052631, "grad_norm": 2.7989024442399435, "learning_rate": 1.4938160786375571e-06, "loss": 0.0921, "step": 710 }, { "epoch": 1.496842105263158, "grad_norm": 2.6090857807395023, "learning_rate": 1.4820472149416153e-06, "loss": 0.074, "step": 711 }, { "epoch": 1.4989473684210526, "grad_norm": 2.5952217582934756, "learning_rate": 1.4703168230137072e-06, "loss": 0.0531, "step": 712 }, { "epoch": 1.5010526315789474, "grad_norm": 2.854421948263307, "learning_rate": 1.4586250311355132e-06, "loss": 0.0706, "step": 713 }, { "epoch": 1.5031578947368422, "grad_norm": 2.491393713483, "learning_rate": 1.4469719671666043e-06, "loss": 0.0712, "step": 714 }, { "epoch": 1.5052631578947369, "grad_norm": 3.0941426529266085, "learning_rate": 1.4353577585430152e-06, "loss": 0.1008, "step": 715 }, { "epoch": 1.5073684210526315, "grad_norm": 2.1188906422201153, "learning_rate": 1.4237825322758735e-06, "loss": 0.053, "step": 716 }, { "epoch": 1.5094736842105263, "grad_norm": 2.499562810650923, "learning_rate": 1.412246414949997e-06, "loss": 0.0773, "step": 717 }, { "epoch": 1.5115789473684211, "grad_norm": 2.686352758516756, "learning_rate": 1.4007495327225162e-06, "loss": 0.0803, "step": 718 }, { "epoch": 1.5136842105263157, "grad_norm": 3.003166198685424, "learning_rate": 1.389292011321498e-06, "loss": 0.0942, "step": 719 }, { "epoch": 1.5157894736842106, "grad_norm": 3.276518434334761, "learning_rate": 1.3778739760445552e-06, "loss": 0.0822, "step": 720 }, { "epoch": 1.5178947368421052, "grad_norm": 2.5581478643854147, "learning_rate": 1.3664955517574967e-06, "loss": 0.0656, "step": 721 }, { "epoch": 1.52, "grad_norm": 2.9505141669864203, "learning_rate": 1.3551568628929434e-06, "loss": 0.0695, "step": 722 }, { "epoch": 1.5221052631578948, "grad_norm": 2.8839627500523632, "learning_rate": 1.343858033448982e-06, "loss": 0.0731, "step": 723 }, { "epoch": 1.5242105263157895, "grad_norm": 2.7109817535795693, "learning_rate": 1.3325991869878013e-06, "loss": 0.0648, "step": 724 }, { "epoch": 1.526315789473684, "grad_norm": 2.9195695673715063, "learning_rate": 1.321380446634342e-06, "loss": 0.0825, "step": 725 }, { "epoch": 1.528421052631579, "grad_norm": 2.857165833663471, "learning_rate": 1.3102019350749528e-06, "loss": 0.062, "step": 726 }, { "epoch": 1.5305263157894737, "grad_norm": 3.120768459996078, "learning_rate": 1.2990637745560418e-06, "loss": 0.0638, "step": 727 }, { "epoch": 1.5326315789473686, "grad_norm": 3.601845356286033, "learning_rate": 1.2879660868827508e-06, "loss": 0.0606, "step": 728 }, { "epoch": 1.5347368421052632, "grad_norm": 2.754264101684756, "learning_rate": 1.2769089934176126e-06, "loss": 0.061, "step": 729 }, { "epoch": 1.5368421052631578, "grad_norm": 3.763355031418207, "learning_rate": 1.2658926150792321e-06, "loss": 0.0853, "step": 730 }, { "epoch": 1.5389473684210526, "grad_norm": 2.435184182108376, "learning_rate": 1.2549170723409548e-06, "loss": 0.068, "step": 731 }, { "epoch": 1.5410526315789475, "grad_norm": 3.1384502245840205, "learning_rate": 1.243982485229559e-06, "loss": 0.0839, "step": 732 }, { "epoch": 1.543157894736842, "grad_norm": 2.915181764624064, "learning_rate": 1.233088973323937e-06, "loss": 0.0932, "step": 733 }, { "epoch": 1.545263157894737, "grad_norm": 3.4631955921569824, "learning_rate": 1.2222366557537911e-06, "loss": 0.0902, "step": 734 }, { "epoch": 1.5473684210526315, "grad_norm": 3.012931176367388, "learning_rate": 1.2114256511983274e-06, "loss": 0.0887, "step": 735 }, { "epoch": 1.5494736842105263, "grad_norm": 3.1207818685791144, "learning_rate": 1.200656077884958e-06, "loss": 0.1018, "step": 736 }, { "epoch": 1.5515789473684212, "grad_norm": 2.4612609560279877, "learning_rate": 1.189928053588012e-06, "loss": 0.0822, "step": 737 }, { "epoch": 1.5536842105263158, "grad_norm": 3.8256380681691797, "learning_rate": 1.1792416956274443e-06, "loss": 0.0885, "step": 738 }, { "epoch": 1.5557894736842104, "grad_norm": 2.3636768015398557, "learning_rate": 1.1685971208675539e-06, "loss": 0.0459, "step": 739 }, { "epoch": 1.5578947368421052, "grad_norm": 2.6401608062517825, "learning_rate": 1.157994445715706e-06, "loss": 0.0828, "step": 740 }, { "epoch": 1.56, "grad_norm": 2.3934171850503727, "learning_rate": 1.1474337861210543e-06, "loss": 0.0678, "step": 741 }, { "epoch": 1.5621052631578949, "grad_norm": 2.337161351456917, "learning_rate": 1.1369152575732823e-06, "loss": 0.0514, "step": 742 }, { "epoch": 1.5642105263157895, "grad_norm": 2.7480915820160154, "learning_rate": 1.1264389751013326e-06, "loss": 0.0881, "step": 743 }, { "epoch": 1.566315789473684, "grad_norm": 2.3256118886994943, "learning_rate": 1.1160050532721527e-06, "loss": 0.0733, "step": 744 }, { "epoch": 1.568421052631579, "grad_norm": 2.798649624701208, "learning_rate": 1.1056136061894386e-06, "loss": 0.0833, "step": 745 }, { "epoch": 1.5705263157894738, "grad_norm": 3.3074889617939847, "learning_rate": 1.095264747492391e-06, "loss": 0.0854, "step": 746 }, { "epoch": 1.5726315789473684, "grad_norm": 3.246747529237332, "learning_rate": 1.0849585903544707e-06, "loss": 0.09, "step": 747 }, { "epoch": 1.5747368421052632, "grad_norm": 2.268435499902229, "learning_rate": 1.0746952474821615e-06, "loss": 0.0635, "step": 748 }, { "epoch": 1.5768421052631578, "grad_norm": 3.048564605805993, "learning_rate": 1.0644748311137377e-06, "loss": 0.0736, "step": 749 }, { "epoch": 1.5789473684210527, "grad_norm": 2.729660101843144, "learning_rate": 1.0542974530180327e-06, "loss": 0.0815, "step": 750 }, { "epoch": 1.5810526315789475, "grad_norm": 3.846053704689222, "learning_rate": 1.0441632244932238e-06, "loss": 0.0855, "step": 751 }, { "epoch": 1.583157894736842, "grad_norm": 3.242518428961916, "learning_rate": 1.0340722563656109e-06, "loss": 0.0617, "step": 752 }, { "epoch": 1.5852631578947367, "grad_norm": 3.288508939138202, "learning_rate": 1.0240246589884046e-06, "loss": 0.0627, "step": 753 }, { "epoch": 1.5873684210526315, "grad_norm": 2.6583829662575753, "learning_rate": 1.0140205422405213e-06, "loss": 0.0668, "step": 754 }, { "epoch": 1.5894736842105264, "grad_norm": 3.1324991437039547, "learning_rate": 1.0040600155253766e-06, "loss": 0.0875, "step": 755 }, { "epoch": 1.5915789473684212, "grad_norm": 2.1752967431606436, "learning_rate": 9.941431877696955e-07, "loss": 0.0625, "step": 756 }, { "epoch": 1.5936842105263158, "grad_norm": 3.153051450237148, "learning_rate": 9.842701674223187e-07, "loss": 0.0916, "step": 757 }, { "epoch": 1.5957894736842104, "grad_norm": 3.0851678174272656, "learning_rate": 9.744410624530148e-07, "loss": 0.0682, "step": 758 }, { "epoch": 1.5978947368421053, "grad_norm": 3.6017843313201627, "learning_rate": 9.646559803512995e-07, "loss": 0.0718, "step": 759 }, { "epoch": 1.6, "grad_norm": 2.6419335189350717, "learning_rate": 9.549150281252633e-07, "loss": 0.0771, "step": 760 }, { "epoch": 1.6021052631578947, "grad_norm": 2.672014578151709, "learning_rate": 9.452183123003999e-07, "loss": 0.0664, "step": 761 }, { "epoch": 1.6042105263157893, "grad_norm": 2.7604515088486776, "learning_rate": 9.355659389184396e-07, "loss": 0.0802, "step": 762 }, { "epoch": 1.6063157894736841, "grad_norm": 2.4989218985147827, "learning_rate": 9.259580135361929e-07, "loss": 0.0712, "step": 763 }, { "epoch": 1.608421052631579, "grad_norm": 2.761122397313585, "learning_rate": 9.163946412243896e-07, "loss": 0.0776, "step": 764 }, { "epoch": 1.6105263157894738, "grad_norm": 2.502582479061757, "learning_rate": 9.068759265665384e-07, "loss": 0.0579, "step": 765 }, { "epoch": 1.6126315789473684, "grad_norm": 2.564496551486698, "learning_rate": 8.974019736577777e-07, "loss": 0.067, "step": 766 }, { "epoch": 1.614736842105263, "grad_norm": 2.6370265688116774, "learning_rate": 8.879728861037385e-07, "loss": 0.0851, "step": 767 }, { "epoch": 1.6168421052631579, "grad_norm": 2.711692043610178, "learning_rate": 8.785887670194137e-07, "loss": 0.0503, "step": 768 }, { "epoch": 1.6189473684210527, "grad_norm": 2.9973543080867993, "learning_rate": 8.692497190280225e-07, "loss": 0.084, "step": 769 }, { "epoch": 1.6210526315789475, "grad_norm": 2.797781874617319, "learning_rate": 8.599558442598998e-07, "loss": 0.0772, "step": 770 }, { "epoch": 1.6231578947368421, "grad_norm": 3.008160474882518, "learning_rate": 8.507072443513703e-07, "loss": 0.0718, "step": 771 }, { "epoch": 1.6252631578947367, "grad_norm": 2.569308456782316, "learning_rate": 8.415040204436426e-07, "loss": 0.0566, "step": 772 }, { "epoch": 1.6273684210526316, "grad_norm": 2.605706053568482, "learning_rate": 8.323462731816962e-07, "loss": 0.0572, "step": 773 }, { "epoch": 1.6294736842105264, "grad_norm": 2.326197374578758, "learning_rate": 8.232341027131885e-07, "loss": 0.0627, "step": 774 }, { "epoch": 1.631578947368421, "grad_norm": 2.5642000583273283, "learning_rate": 8.141676086873574e-07, "loss": 0.0751, "step": 775 }, { "epoch": 1.6336842105263156, "grad_norm": 2.191527946956071, "learning_rate": 8.051468902539272e-07, "loss": 0.0383, "step": 776 }, { "epoch": 1.6357894736842105, "grad_norm": 2.4051907305716265, "learning_rate": 7.961720460620321e-07, "loss": 0.0627, "step": 777 }, { "epoch": 1.6378947368421053, "grad_norm": 2.5165851730543114, "learning_rate": 7.872431742591268e-07, "loss": 0.0579, "step": 778 }, { "epoch": 1.6400000000000001, "grad_norm": 3.389030001143065, "learning_rate": 7.783603724899258e-07, "loss": 0.0897, "step": 779 }, { "epoch": 1.6421052631578947, "grad_norm": 3.150859694485845, "learning_rate": 7.695237378953224e-07, "loss": 0.0889, "step": 780 }, { "epoch": 1.6442105263157893, "grad_norm": 2.5782746782491577, "learning_rate": 7.607333671113409e-07, "loss": 0.0691, "step": 781 }, { "epoch": 1.6463157894736842, "grad_norm": 2.932437547604155, "learning_rate": 7.519893562680663e-07, "loss": 0.0747, "step": 782 }, { "epoch": 1.648421052631579, "grad_norm": 2.7289448649430486, "learning_rate": 7.432918009885997e-07, "loss": 0.0894, "step": 783 }, { "epoch": 1.6505263157894738, "grad_norm": 2.622139736136532, "learning_rate": 7.346407963880137e-07, "loss": 0.0657, "step": 784 }, { "epoch": 1.6526315789473685, "grad_norm": 2.844401593647993, "learning_rate": 7.260364370723044e-07, "loss": 0.0678, "step": 785 }, { "epoch": 1.654736842105263, "grad_norm": 2.46801387849074, "learning_rate": 7.174788171373731e-07, "loss": 0.0698, "step": 786 }, { "epoch": 1.656842105263158, "grad_norm": 3.2459197537543103, "learning_rate": 7.089680301679752e-07, "loss": 0.0763, "step": 787 }, { "epoch": 1.6589473684210527, "grad_norm": 2.1306283714857694, "learning_rate": 7.005041692367154e-07, "loss": 0.0516, "step": 788 }, { "epoch": 1.6610526315789473, "grad_norm": 2.6775897413319028, "learning_rate": 6.92087326903022e-07, "loss": 0.0619, "step": 789 }, { "epoch": 1.663157894736842, "grad_norm": 2.524367526847338, "learning_rate": 6.837175952121305e-07, "loss": 0.0688, "step": 790 }, { "epoch": 1.6652631578947368, "grad_norm": 2.708587596728961, "learning_rate": 6.753950656940905e-07, "loss": 0.0703, "step": 791 }, { "epoch": 1.6673684210526316, "grad_norm": 2.897104706239707, "learning_rate": 6.671198293627479e-07, "loss": 0.0621, "step": 792 }, { "epoch": 1.6694736842105264, "grad_norm": 3.359155853905581, "learning_rate": 6.58891976714764e-07, "loss": 0.0843, "step": 793 }, { "epoch": 1.671578947368421, "grad_norm": 3.1572683992293564, "learning_rate": 6.507115977286144e-07, "loss": 0.0631, "step": 794 }, { "epoch": 1.6736842105263157, "grad_norm": 2.5427050293849613, "learning_rate": 6.425787818636131e-07, "loss": 0.0789, "step": 795 }, { "epoch": 1.6757894736842105, "grad_norm": 3.066625121124378, "learning_rate": 6.34493618058935e-07, "loss": 0.0686, "step": 796 }, { "epoch": 1.6778947368421053, "grad_norm": 2.822755036635395, "learning_rate": 6.264561947326331e-07, "loss": 0.0684, "step": 797 }, { "epoch": 1.6800000000000002, "grad_norm": 2.8703507999231035, "learning_rate": 6.184665997806832e-07, "loss": 0.0747, "step": 798 }, { "epoch": 1.6821052631578948, "grad_norm": 3.333171116783649, "learning_rate": 6.105249205760128e-07, "loss": 0.089, "step": 799 }, { "epoch": 1.6842105263157894, "grad_norm": 2.2878271103214316, "learning_rate": 6.026312439675553e-07, "loss": 0.0622, "step": 800 }, { "epoch": 1.6842105263157894, "eval_loss": 0.1972101926803589, "eval_runtime": 0.9281, "eval_samples_per_second": 42.023, "eval_steps_per_second": 10.775, "step": 800 }, { "epoch": 1.6863157894736842, "grad_norm": 2.812568484004764, "learning_rate": 5.947856562792926e-07, "loss": 0.0654, "step": 801 }, { "epoch": 1.688421052631579, "grad_norm": 2.7128006161210108, "learning_rate": 5.869882433093154e-07, "loss": 0.0864, "step": 802 }, { "epoch": 1.6905263157894737, "grad_norm": 3.0041687010640983, "learning_rate": 5.79239090328883e-07, "loss": 0.0747, "step": 803 }, { "epoch": 1.6926315789473683, "grad_norm": 2.9138447859502095, "learning_rate": 5.715382820814885e-07, "loss": 0.0802, "step": 804 }, { "epoch": 1.694736842105263, "grad_norm": 2.5166922519171506, "learning_rate": 5.63885902781941e-07, "loss": 0.0675, "step": 805 }, { "epoch": 1.696842105263158, "grad_norm": 2.8497481916116896, "learning_rate": 5.562820361154315e-07, "loss": 0.0696, "step": 806 }, { "epoch": 1.6989473684210528, "grad_norm": 3.4754839940102, "learning_rate": 5.487267652366291e-07, "loss": 0.0852, "step": 807 }, { "epoch": 1.7010526315789474, "grad_norm": 3.2547037998817596, "learning_rate": 5.412201727687644e-07, "loss": 0.0862, "step": 808 }, { "epoch": 1.703157894736842, "grad_norm": 2.414633712271825, "learning_rate": 5.337623408027293e-07, "loss": 0.061, "step": 809 }, { "epoch": 1.7052631578947368, "grad_norm": 3.401374616115059, "learning_rate": 5.263533508961827e-07, "loss": 0.0952, "step": 810 }, { "epoch": 1.7073684210526316, "grad_norm": 2.73608904289166, "learning_rate": 5.189932840726486e-07, "loss": 0.0679, "step": 811 }, { "epoch": 1.7094736842105265, "grad_norm": 2.808779656242632, "learning_rate": 5.116822208206396e-07, "loss": 0.0636, "step": 812 }, { "epoch": 1.711578947368421, "grad_norm": 2.8278948519331046, "learning_rate": 5.044202410927707e-07, "loss": 0.0757, "step": 813 }, { "epoch": 1.7136842105263157, "grad_norm": 2.5103777723289054, "learning_rate": 4.972074243048896e-07, "loss": 0.0603, "step": 814 }, { "epoch": 1.7157894736842105, "grad_norm": 3.1933744646672135, "learning_rate": 4.900438493352056e-07, "loss": 0.0682, "step": 815 }, { "epoch": 1.7178947368421054, "grad_norm": 3.1503659721543706, "learning_rate": 4.829295945234258e-07, "loss": 0.072, "step": 816 }, { "epoch": 1.72, "grad_norm": 2.747297586956878, "learning_rate": 4.758647376699033e-07, "loss": 0.0528, "step": 817 }, { "epoch": 1.7221052631578946, "grad_norm": 2.2158924674953777, "learning_rate": 4.6884935603477733e-07, "loss": 0.0565, "step": 818 }, { "epoch": 1.7242105263157894, "grad_norm": 2.8793571493416823, "learning_rate": 4.6188352633713964e-07, "loss": 0.072, "step": 819 }, { "epoch": 1.7263157894736842, "grad_norm": 3.1524529277439055, "learning_rate": 4.549673247541875e-07, "loss": 0.0759, "step": 820 }, { "epoch": 1.728421052631579, "grad_norm": 2.5386910299707854, "learning_rate": 4.48100826920394e-07, "loss": 0.0715, "step": 821 }, { "epoch": 1.7305263157894737, "grad_norm": 3.012214075182323, "learning_rate": 4.412841079266778e-07, "loss": 0.0613, "step": 822 }, { "epoch": 1.7326315789473683, "grad_norm": 2.708221561942324, "learning_rate": 4.345172423195865e-07, "loss": 0.0977, "step": 823 }, { "epoch": 1.7347368421052631, "grad_norm": 3.115720504229963, "learning_rate": 4.27800304100478e-07, "loss": 0.0688, "step": 824 }, { "epoch": 1.736842105263158, "grad_norm": 1.987345247381263, "learning_rate": 4.211333667247125e-07, "loss": 0.0569, "step": 825 }, { "epoch": 1.7389473684210528, "grad_norm": 3.3300694952948593, "learning_rate": 4.1451650310085076e-07, "loss": 0.0718, "step": 826 }, { "epoch": 1.7410526315789474, "grad_norm": 2.6073178578766365, "learning_rate": 4.079497855898501e-07, "loss": 0.0651, "step": 827 }, { "epoch": 1.743157894736842, "grad_norm": 2.520132180808996, "learning_rate": 4.01433286004283e-07, "loss": 0.0746, "step": 828 }, { "epoch": 1.7452631578947368, "grad_norm": 4.216004495536436, "learning_rate": 3.949670756075447e-07, "loss": 0.0986, "step": 829 }, { "epoch": 1.7473684210526317, "grad_norm": 3.0641794864142313, "learning_rate": 3.885512251130763e-07, "loss": 0.0694, "step": 830 }, { "epoch": 1.7494736842105263, "grad_norm": 2.3436697576482493, "learning_rate": 3.8218580468359136e-07, "loss": 0.0677, "step": 831 }, { "epoch": 1.751578947368421, "grad_norm": 2.5525226853131406, "learning_rate": 3.7587088393030604e-07, "loss": 0.07, "step": 832 }, { "epoch": 1.7536842105263157, "grad_norm": 2.557138885671911, "learning_rate": 3.6960653191218333e-07, "loss": 0.0715, "step": 833 }, { "epoch": 1.7557894736842106, "grad_norm": 2.7336947284134907, "learning_rate": 3.6339281713517304e-07, "loss": 0.0766, "step": 834 }, { "epoch": 1.7578947368421054, "grad_norm": 2.7665277428264687, "learning_rate": 3.572298075514652e-07, "loss": 0.0892, "step": 835 }, { "epoch": 1.76, "grad_norm": 3.7811259313548935, "learning_rate": 3.511175705587433e-07, "loss": 0.0848, "step": 836 }, { "epoch": 1.7621052631578946, "grad_norm": 3.144574327722061, "learning_rate": 3.450561729994534e-07, "loss": 0.0738, "step": 837 }, { "epoch": 1.7642105263157895, "grad_norm": 2.857062951752684, "learning_rate": 3.390456811600673e-07, "loss": 0.0733, "step": 838 }, { "epoch": 1.7663157894736843, "grad_norm": 3.655857210909399, "learning_rate": 3.3308616077036113e-07, "loss": 0.1221, "step": 839 }, { "epoch": 1.768421052631579, "grad_norm": 2.1902799347040944, "learning_rate": 3.271776770026963e-07, "loss": 0.0592, "step": 840 }, { "epoch": 1.7705263157894737, "grad_norm": 3.4061442220418305, "learning_rate": 3.213202944713023e-07, "loss": 0.0959, "step": 841 }, { "epoch": 1.7726315789473683, "grad_norm": 2.1231955359369286, "learning_rate": 3.1551407723157734e-07, "loss": 0.065, "step": 842 }, { "epoch": 1.7747368421052632, "grad_norm": 2.8471057173295273, "learning_rate": 3.0975908877938277e-07, "loss": 0.0744, "step": 843 }, { "epoch": 1.776842105263158, "grad_norm": 3.2337983555019605, "learning_rate": 3.040553920503503e-07, "loss": 0.0905, "step": 844 }, { "epoch": 1.7789473684210526, "grad_norm": 2.542206604869396, "learning_rate": 2.984030494191942e-07, "loss": 0.0689, "step": 845 }, { "epoch": 1.7810526315789472, "grad_norm": 2.980410819168702, "learning_rate": 2.928021226990263e-07, "loss": 0.0698, "step": 846 }, { "epoch": 1.783157894736842, "grad_norm": 2.117615427082996, "learning_rate": 2.8725267314068496e-07, "loss": 0.0611, "step": 847 }, { "epoch": 1.7852631578947369, "grad_norm": 2.3322615491429475, "learning_rate": 2.817547614320615e-07, "loss": 0.0606, "step": 848 }, { "epoch": 1.7873684210526317, "grad_norm": 2.1217132504251626, "learning_rate": 2.763084476974376e-07, "loss": 0.0677, "step": 849 }, { "epoch": 1.7894736842105263, "grad_norm": 2.635485392016026, "learning_rate": 2.7091379149682683e-07, "loss": 0.0654, "step": 850 }, { "epoch": 1.791578947368421, "grad_norm": 2.9790312851694876, "learning_rate": 2.655708518253258e-07, "loss": 0.0677, "step": 851 }, { "epoch": 1.7936842105263158, "grad_norm": 2.3973095715711317, "learning_rate": 2.602796871124663e-07, "loss": 0.0504, "step": 852 }, { "epoch": 1.7957894736842106, "grad_norm": 2.7082756582731973, "learning_rate": 2.5504035522157853e-07, "loss": 0.0647, "step": 853 }, { "epoch": 1.7978947368421052, "grad_norm": 2.9121449991346062, "learning_rate": 2.4985291344915675e-07, "loss": 0.0903, "step": 854 }, { "epoch": 1.8, "grad_norm": 2.9867534692660245, "learning_rate": 2.447174185242324e-07, "loss": 0.074, "step": 855 }, { "epoch": 1.8021052631578947, "grad_norm": 2.8737059550127455, "learning_rate": 2.3963392660775576e-07, "loss": 0.084, "step": 856 }, { "epoch": 1.8042105263157895, "grad_norm": 3.2375432000666216, "learning_rate": 2.3460249329197825e-07, "loss": 0.0939, "step": 857 }, { "epoch": 1.8063157894736843, "grad_norm": 3.286934040592846, "learning_rate": 2.296231735998511e-07, "loss": 0.0756, "step": 858 }, { "epoch": 1.808421052631579, "grad_norm": 3.06301786931153, "learning_rate": 2.2469602198441575e-07, "loss": 0.0777, "step": 859 }, { "epoch": 1.8105263157894735, "grad_norm": 3.1645723085351123, "learning_rate": 2.198210923282118e-07, "loss": 0.073, "step": 860 }, { "epoch": 1.8126315789473684, "grad_norm": 2.493058988989823, "learning_rate": 2.149984379426906e-07, "loss": 0.0764, "step": 861 }, { "epoch": 1.8147368421052632, "grad_norm": 2.881863955432623, "learning_rate": 2.102281115676258e-07, "loss": 0.0809, "step": 862 }, { "epoch": 1.816842105263158, "grad_norm": 2.7694248049652974, "learning_rate": 2.0551016537054492e-07, "loss": 0.0627, "step": 863 }, { "epoch": 1.8189473684210526, "grad_norm": 2.879825150355328, "learning_rate": 2.008446509461498e-07, "loss": 0.0651, "step": 864 }, { "epoch": 1.8210526315789473, "grad_norm": 2.6298059648937135, "learning_rate": 1.962316193157593e-07, "loss": 0.077, "step": 865 }, { "epoch": 1.823157894736842, "grad_norm": 2.302690351699349, "learning_rate": 1.91671120926748e-07, "loss": 0.055, "step": 866 }, { "epoch": 1.825263157894737, "grad_norm": 2.954098530395544, "learning_rate": 1.871632056519962e-07, "loss": 0.0876, "step": 867 }, { "epoch": 1.8273684210526315, "grad_norm": 2.7636736978855594, "learning_rate": 1.8270792278934302e-07, "loss": 0.0915, "step": 868 }, { "epoch": 1.8294736842105264, "grad_norm": 2.7545832309484988, "learning_rate": 1.7830532106104747e-07, "loss": 0.0667, "step": 869 }, { "epoch": 1.831578947368421, "grad_norm": 2.543153382990055, "learning_rate": 1.7395544861325718e-07, "loss": 0.0584, "step": 870 }, { "epoch": 1.8336842105263158, "grad_norm": 2.807508111947715, "learning_rate": 1.696583530154794e-07, "loss": 0.0797, "step": 871 }, { "epoch": 1.8357894736842106, "grad_norm": 2.8192335775327795, "learning_rate": 1.6541408126006464e-07, "loss": 0.0872, "step": 872 }, { "epoch": 1.8378947368421052, "grad_norm": 3.209795989098829, "learning_rate": 1.6122267976168783e-07, "loss": 0.0999, "step": 873 }, { "epoch": 1.8399999999999999, "grad_norm": 3.4761511974704264, "learning_rate": 1.5708419435684463e-07, "loss": 0.0922, "step": 874 }, { "epoch": 1.8421052631578947, "grad_norm": 3.1674697236400813, "learning_rate": 1.5299867030334815e-07, "loss": 0.0635, "step": 875 }, { "epoch": 1.8442105263157895, "grad_norm": 3.6723508356857484, "learning_rate": 1.4896615227983468e-07, "loss": 0.0602, "step": 876 }, { "epoch": 1.8463157894736844, "grad_norm": 2.911727155785081, "learning_rate": 1.4498668438527597e-07, "loss": 0.0529, "step": 877 }, { "epoch": 1.848421052631579, "grad_norm": 2.3359023762817825, "learning_rate": 1.4106031013849498e-07, "loss": 0.0514, "step": 878 }, { "epoch": 1.8505263157894736, "grad_norm": 2.409615553053479, "learning_rate": 1.3718707247769137e-07, "loss": 0.0555, "step": 879 }, { "epoch": 1.8526315789473684, "grad_norm": 2.8117387336774855, "learning_rate": 1.333670137599713e-07, "loss": 0.0916, "step": 880 }, { "epoch": 1.8547368421052632, "grad_norm": 2.0682295293481077, "learning_rate": 1.2960017576088445e-07, "loss": 0.0582, "step": 881 }, { "epoch": 1.8568421052631578, "grad_norm": 2.551562601977345, "learning_rate": 1.2588659967396998e-07, "loss": 0.0732, "step": 882 }, { "epoch": 1.8589473684210527, "grad_norm": 3.2003699780502597, "learning_rate": 1.222263261102985e-07, "loss": 0.0711, "step": 883 }, { "epoch": 1.8610526315789473, "grad_norm": 3.486580924102614, "learning_rate": 1.1861939509803688e-07, "loss": 0.0748, "step": 884 }, { "epoch": 1.8631578947368421, "grad_norm": 2.2455491108015613, "learning_rate": 1.1506584608200366e-07, "loss": 0.0727, "step": 885 }, { "epoch": 1.865263157894737, "grad_norm": 2.4398853127583284, "learning_rate": 1.1156571792324212e-07, "loss": 0.0529, "step": 886 }, { "epoch": 1.8673684210526316, "grad_norm": 2.624358820662373, "learning_rate": 1.0811904889859337e-07, "loss": 0.0796, "step": 887 }, { "epoch": 1.8694736842105262, "grad_norm": 2.984398704743138, "learning_rate": 1.0472587670027678e-07, "loss": 0.0853, "step": 888 }, { "epoch": 1.871578947368421, "grad_norm": 3.4342828404155736, "learning_rate": 1.0138623843548078e-07, "loss": 0.0807, "step": 889 }, { "epoch": 1.8736842105263158, "grad_norm": 2.135864268441869, "learning_rate": 9.810017062595322e-08, "loss": 0.053, "step": 890 }, { "epoch": 1.8757894736842107, "grad_norm": 2.915254739899194, "learning_rate": 9.486770920760668e-08, "loss": 0.0751, "step": 891 }, { "epoch": 1.8778947368421053, "grad_norm": 2.867686868160426, "learning_rate": 9.16888895301199e-08, "loss": 0.0695, "step": 892 }, { "epoch": 1.88, "grad_norm": 2.4403837817102008, "learning_rate": 8.856374635655696e-08, "loss": 0.0552, "step": 893 }, { "epoch": 1.8821052631578947, "grad_norm": 2.96278001070486, "learning_rate": 8.549231386298151e-08, "loss": 0.0745, "step": 894 }, { "epoch": 1.8842105263157896, "grad_norm": 2.2812297751359134, "learning_rate": 8.247462563808816e-08, "loss": 0.058, "step": 895 }, { "epoch": 1.8863157894736842, "grad_norm": 2.8257873863079532, "learning_rate": 7.951071468283166e-08, "loss": 0.0799, "step": 896 }, { "epoch": 1.888421052631579, "grad_norm": 3.826719667852147, "learning_rate": 7.660061341006719e-08, "loss": 0.0713, "step": 897 }, { "epoch": 1.8905263157894736, "grad_norm": 3.7405575996641156, "learning_rate": 7.374435364419675e-08, "loss": 0.0948, "step": 898 }, { "epoch": 1.8926315789473684, "grad_norm": 3.4882300697302058, "learning_rate": 7.094196662081832e-08, "loss": 0.0778, "step": 899 }, { "epoch": 1.8947368421052633, "grad_norm": 2.7491143569308525, "learning_rate": 6.819348298638839e-08, "loss": 0.0549, "step": 900 }, { "epoch": 1.8968421052631579, "grad_norm": 2.255741826050675, "learning_rate": 6.549893279788278e-08, "loss": 0.0581, "step": 901 }, { "epoch": 1.8989473684210525, "grad_norm": 2.8944471113447774, "learning_rate": 6.285834552247127e-08, "loss": 0.101, "step": 902 }, { "epoch": 1.9010526315789473, "grad_norm": 3.516368002515928, "learning_rate": 6.027175003719354e-08, "loss": 0.0843, "step": 903 }, { "epoch": 1.9031578947368422, "grad_norm": 2.8055662943099717, "learning_rate": 5.773917462864265e-08, "loss": 0.0718, "step": 904 }, { "epoch": 1.905263157894737, "grad_norm": 2.8528278587154317, "learning_rate": 5.526064699265754e-08, "loss": 0.0771, "step": 905 }, { "epoch": 1.9073684210526316, "grad_norm": 2.4442597669717374, "learning_rate": 5.2836194234019976e-08, "loss": 0.0578, "step": 906 }, { "epoch": 1.9094736842105262, "grad_norm": 3.328401624060881, "learning_rate": 5.0465842866156965e-08, "loss": 0.0834, "step": 907 }, { "epoch": 1.911578947368421, "grad_norm": 3.1255146432363095, "learning_rate": 4.8149618810850454e-08, "loss": 0.0775, "step": 908 }, { "epoch": 1.9136842105263159, "grad_norm": 2.6888604157650615, "learning_rate": 4.588754739795587e-08, "loss": 0.0697, "step": 909 }, { "epoch": 1.9157894736842105, "grad_norm": 4.068321596359929, "learning_rate": 4.367965336512403e-08, "loss": 0.0959, "step": 910 }, { "epoch": 1.917894736842105, "grad_norm": 2.432792122907068, "learning_rate": 4.1525960857530244e-08, "loss": 0.0583, "step": 911 }, { "epoch": 1.92, "grad_norm": 2.77665437029982, "learning_rate": 3.9426493427611177e-08, "loss": 0.0672, "step": 912 }, { "epoch": 1.9221052631578948, "grad_norm": 3.087230249032823, "learning_rate": 3.738127403480507e-08, "loss": 0.0699, "step": 913 }, { "epoch": 1.9242105263157896, "grad_norm": 2.6241279675480054, "learning_rate": 3.5390325045304704e-08, "loss": 0.0649, "step": 914 }, { "epoch": 1.9263157894736842, "grad_norm": 2.7148617633550507, "learning_rate": 3.345366823180929e-08, "loss": 0.0677, "step": 915 }, { "epoch": 1.9284210526315788, "grad_norm": 2.5474852778498316, "learning_rate": 3.1571324773286284e-08, "loss": 0.0639, "step": 916 }, { "epoch": 1.9305263157894736, "grad_norm": 2.1409033097729813, "learning_rate": 2.9743315254743834e-08, "loss": 0.041, "step": 917 }, { "epoch": 1.9326315789473685, "grad_norm": 3.275485354161904, "learning_rate": 2.7969659666999273e-08, "loss": 0.0881, "step": 918 }, { "epoch": 1.9347368421052633, "grad_norm": 2.6681527860380396, "learning_rate": 2.625037740646763e-08, "loss": 0.0772, "step": 919 }, { "epoch": 1.936842105263158, "grad_norm": 2.3495006124196576, "learning_rate": 2.4585487274942922e-08, "loss": 0.0557, "step": 920 }, { "epoch": 1.9389473684210525, "grad_norm": 2.5228819355573235, "learning_rate": 2.2975007479397736e-08, "loss": 0.0507, "step": 921 }, { "epoch": 1.9410526315789474, "grad_norm": 3.756140115268572, "learning_rate": 2.1418955631781203e-08, "loss": 0.1053, "step": 922 }, { "epoch": 1.9431578947368422, "grad_norm": 2.960441030514824, "learning_rate": 1.9917348748826337e-08, "loss": 0.0708, "step": 923 }, { "epoch": 1.9452631578947368, "grad_norm": 2.8382521221794863, "learning_rate": 1.847020325186577e-08, "loss": 0.0531, "step": 924 }, { "epoch": 1.9473684210526314, "grad_norm": 3.0227161901258914, "learning_rate": 1.7077534966650767e-08, "loss": 0.0752, "step": 925 }, { "epoch": 1.9494736842105262, "grad_norm": 2.666050724013321, "learning_rate": 1.5739359123178587e-08, "loss": 0.0606, "step": 926 }, { "epoch": 1.951578947368421, "grad_norm": 3.1791911217585054, "learning_rate": 1.4455690355525964e-08, "loss": 0.0657, "step": 927 }, { "epoch": 1.953684210526316, "grad_norm": 2.9100920884957406, "learning_rate": 1.3226542701689215e-08, "loss": 0.0674, "step": 928 }, { "epoch": 1.9557894736842105, "grad_norm": 2.903965005728698, "learning_rate": 1.2051929603428824e-08, "loss": 0.0801, "step": 929 }, { "epoch": 1.9578947368421051, "grad_norm": 3.3785772789177706, "learning_rate": 1.0931863906127327e-08, "loss": 0.0813, "step": 930 }, { "epoch": 1.96, "grad_norm": 2.8464427553618235, "learning_rate": 9.866357858642206e-09, "loss": 0.0773, "step": 931 }, { "epoch": 1.9621052631578948, "grad_norm": 2.683834178385762, "learning_rate": 8.855423113177664e-09, "loss": 0.0878, "step": 932 }, { "epoch": 1.9642105263157896, "grad_norm": 2.293423722958731, "learning_rate": 7.899070725153612e-09, "loss": 0.0574, "step": 933 }, { "epoch": 1.9663157894736842, "grad_norm": 2.9921722390971985, "learning_rate": 6.997311153086883e-09, "loss": 0.0786, "step": 934 }, { "epoch": 1.9684210526315788, "grad_norm": 3.0905052771040102, "learning_rate": 6.150154258476315e-09, "loss": 0.0795, "step": 935 }, { "epoch": 1.9705263157894737, "grad_norm": 2.91652752925557, "learning_rate": 5.357609305692291e-09, "loss": 0.0869, "step": 936 }, { "epoch": 1.9726315789473685, "grad_norm": 2.664322796790042, "learning_rate": 4.619684961881255e-09, "loss": 0.0603, "step": 937 }, { "epoch": 1.9747368421052631, "grad_norm": 2.5227019769378507, "learning_rate": 3.936389296864129e-09, "loss": 0.0691, "step": 938 }, { "epoch": 1.9768421052631577, "grad_norm": 3.070188327105861, "learning_rate": 3.307729783054159e-09, "loss": 0.0701, "step": 939 }, { "epoch": 1.9789473684210526, "grad_norm": 2.527356227943579, "learning_rate": 2.7337132953697555e-09, "loss": 0.0543, "step": 940 }, { "epoch": 1.9810526315789474, "grad_norm": 3.2854112794071066, "learning_rate": 2.214346111164556e-09, "loss": 0.0843, "step": 941 }, { "epoch": 1.9831578947368422, "grad_norm": 2.4190595898058014, "learning_rate": 1.749633910153592e-09, "loss": 0.0584, "step": 942 }, { "epoch": 1.9852631578947368, "grad_norm": 2.924977300546306, "learning_rate": 1.3395817743561135e-09, "loss": 0.0867, "step": 943 }, { "epoch": 1.9873684210526315, "grad_norm": 2.892067107131255, "learning_rate": 9.841941880361917e-10, "loss": 0.0712, "step": 944 }, { "epoch": 1.9894736842105263, "grad_norm": 2.507649725691115, "learning_rate": 6.834750376549793e-10, "loss": 0.0695, "step": 945 }, { "epoch": 1.9915789473684211, "grad_norm": 3.029465353157538, "learning_rate": 4.374276118301879e-10, "loss": 0.0804, "step": 946 }, { "epoch": 1.993684210526316, "grad_norm": 3.759725367800689, "learning_rate": 2.4605460129556446e-10, "loss": 0.0821, "step": 947 }, { "epoch": 1.9957894736842106, "grad_norm": 2.500823320563419, "learning_rate": 1.0935809887702154e-10, "loss": 0.0678, "step": 948 }, { "epoch": 1.9978947368421052, "grad_norm": 3.6772672072650834, "learning_rate": 2.733959946432663e-11, "loss": 0.0815, "step": 949 }, { "epoch": 2.0, "grad_norm": 2.6395246965587384, "learning_rate": 0.0, "loss": 0.0596, "step": 950 }, { "epoch": 2.0, "step": 950, "total_flos": 1598198317056.0, "train_loss": 0.15143464744875307, "train_runtime": 436.4524, "train_samples_per_second": 17.409, "train_steps_per_second": 2.177 } ], "logging_steps": 1, "max_steps": 950, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1598198317056.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }