{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.993178717598909, "eval_steps": 500, "global_step": 915, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005457025920873124, "grad_norm": 8.40548507464911, "learning_rate": 4.347826086956522e-07, "loss": 1.4358, "step": 1 }, { "epoch": 0.010914051841746248, "grad_norm": 8.27768117898077, "learning_rate": 8.695652173913044e-07, "loss": 1.4226, "step": 2 }, { "epoch": 0.01637107776261937, "grad_norm": 8.34817859423518, "learning_rate": 1.3043478260869566e-06, "loss": 1.4277, "step": 3 }, { "epoch": 0.021828103683492497, "grad_norm": 8.241415349829516, "learning_rate": 1.7391304347826088e-06, "loss": 1.4422, "step": 4 }, { "epoch": 0.027285129604365622, "grad_norm": 7.847219720997312, "learning_rate": 2.173913043478261e-06, "loss": 1.4021, "step": 5 }, { "epoch": 0.03274215552523874, "grad_norm": 6.490709031031551, "learning_rate": 2.6086956521739132e-06, "loss": 1.385, "step": 6 }, { "epoch": 0.03819918144611187, "grad_norm": 5.970054048772479, "learning_rate": 3.043478260869566e-06, "loss": 1.3497, "step": 7 }, { "epoch": 0.04365620736698499, "grad_norm": 3.386624409374888, "learning_rate": 3.4782608695652175e-06, "loss": 1.3161, "step": 8 }, { "epoch": 0.04911323328785812, "grad_norm": 2.764488361103231, "learning_rate": 3.91304347826087e-06, "loss": 1.3038, "step": 9 }, { "epoch": 0.054570259208731244, "grad_norm": 4.202620684729429, "learning_rate": 4.347826086956522e-06, "loss": 1.302, "step": 10 }, { "epoch": 0.06002728512960437, "grad_norm": 6.086560136767305, "learning_rate": 4.782608695652174e-06, "loss": 1.3128, "step": 11 }, { "epoch": 0.06548431105047749, "grad_norm": 6.073370326241115, "learning_rate": 5.2173913043478265e-06, "loss": 1.3088, "step": 12 }, { "epoch": 0.07094133697135062, "grad_norm": 5.6470857345830945, "learning_rate": 5.652173913043479e-06, "loss": 1.2941, "step": 13 }, { "epoch": 0.07639836289222374, "grad_norm": 4.138470108037422, "learning_rate": 6.086956521739132e-06, "loss": 1.2619, "step": 14 }, { "epoch": 0.08185538881309687, "grad_norm": 3.7821082022725294, "learning_rate": 6.521739130434783e-06, "loss": 1.2451, "step": 15 }, { "epoch": 0.08731241473396999, "grad_norm": 2.805651263629899, "learning_rate": 6.956521739130435e-06, "loss": 1.229, "step": 16 }, { "epoch": 0.0927694406548431, "grad_norm": 1.760678215208644, "learning_rate": 7.391304347826087e-06, "loss": 1.1933, "step": 17 }, { "epoch": 0.09822646657571624, "grad_norm": 1.471427773413438, "learning_rate": 7.82608695652174e-06, "loss": 1.2011, "step": 18 }, { "epoch": 0.10368349249658936, "grad_norm": 1.6385745213912601, "learning_rate": 8.260869565217392e-06, "loss": 1.1761, "step": 19 }, { "epoch": 0.10914051841746249, "grad_norm": 1.8107444245592996, "learning_rate": 8.695652173913044e-06, "loss": 1.1718, "step": 20 }, { "epoch": 0.1145975443383356, "grad_norm": 1.5828548709190915, "learning_rate": 9.130434782608697e-06, "loss": 1.1686, "step": 21 }, { "epoch": 0.12005457025920874, "grad_norm": 1.0833660495626252, "learning_rate": 9.565217391304349e-06, "loss": 1.1517, "step": 22 }, { "epoch": 0.12551159618008187, "grad_norm": 1.0067497148092934, "learning_rate": 1e-05, "loss": 1.1596, "step": 23 }, { "epoch": 0.13096862210095497, "grad_norm": 1.1511016542466355, "learning_rate": 1.0434782608695653e-05, "loss": 1.1185, "step": 24 }, { "epoch": 0.1364256480218281, "grad_norm": 1.1129912962754631, "learning_rate": 1.0869565217391305e-05, "loss": 1.1044, "step": 25 }, { "epoch": 0.14188267394270124, "grad_norm": 0.982424004614232, "learning_rate": 1.1304347826086957e-05, "loss": 1.1154, "step": 26 }, { "epoch": 0.14733969986357434, "grad_norm": 0.8935088021503061, "learning_rate": 1.1739130434782611e-05, "loss": 1.0855, "step": 27 }, { "epoch": 0.15279672578444747, "grad_norm": 0.9536160463503343, "learning_rate": 1.2173913043478263e-05, "loss": 1.0844, "step": 28 }, { "epoch": 0.1582537517053206, "grad_norm": 0.7998848521277707, "learning_rate": 1.2608695652173915e-05, "loss": 1.0866, "step": 29 }, { "epoch": 0.16371077762619374, "grad_norm": 0.7673685776489387, "learning_rate": 1.3043478260869566e-05, "loss": 1.1062, "step": 30 }, { "epoch": 0.16916780354706684, "grad_norm": 0.9634656988616612, "learning_rate": 1.3478260869565218e-05, "loss": 1.0713, "step": 31 }, { "epoch": 0.17462482946793997, "grad_norm": 0.7656758207809929, "learning_rate": 1.391304347826087e-05, "loss": 1.0804, "step": 32 }, { "epoch": 0.1800818553888131, "grad_norm": 0.8925856020905516, "learning_rate": 1.4347826086956522e-05, "loss": 1.0791, "step": 33 }, { "epoch": 0.1855388813096862, "grad_norm": 0.9604475518815286, "learning_rate": 1.4782608695652174e-05, "loss": 1.0743, "step": 34 }, { "epoch": 0.19099590723055934, "grad_norm": 0.6673627257965253, "learning_rate": 1.5217391304347828e-05, "loss": 1.0827, "step": 35 }, { "epoch": 0.19645293315143247, "grad_norm": 0.6466823830941191, "learning_rate": 1.565217391304348e-05, "loss": 1.074, "step": 36 }, { "epoch": 0.2019099590723056, "grad_norm": 0.8141432611114343, "learning_rate": 1.6086956521739132e-05, "loss": 1.0935, "step": 37 }, { "epoch": 0.2073669849931787, "grad_norm": 0.7303516492961905, "learning_rate": 1.6521739130434785e-05, "loss": 1.0733, "step": 38 }, { "epoch": 0.21282401091405184, "grad_norm": 0.6688305682070583, "learning_rate": 1.6956521739130437e-05, "loss": 1.0424, "step": 39 }, { "epoch": 0.21828103683492497, "grad_norm": 0.6750314584251758, "learning_rate": 1.739130434782609e-05, "loss": 1.039, "step": 40 }, { "epoch": 0.22373806275579808, "grad_norm": 0.6414610193667182, "learning_rate": 1.782608695652174e-05, "loss": 1.0673, "step": 41 }, { "epoch": 0.2291950886766712, "grad_norm": 0.6004774591873056, "learning_rate": 1.8260869565217393e-05, "loss": 1.0472, "step": 42 }, { "epoch": 0.23465211459754434, "grad_norm": 1.001192284108013, "learning_rate": 1.8695652173913045e-05, "loss": 1.0668, "step": 43 }, { "epoch": 0.24010914051841747, "grad_norm": 1.2347279701133878, "learning_rate": 1.9130434782608697e-05, "loss": 1.0515, "step": 44 }, { "epoch": 0.24556616643929058, "grad_norm": 0.7545298666968991, "learning_rate": 1.956521739130435e-05, "loss": 1.0118, "step": 45 }, { "epoch": 0.25102319236016374, "grad_norm": 1.7910029989682172, "learning_rate": 2e-05, "loss": 1.0403, "step": 46 }, { "epoch": 0.25648021828103684, "grad_norm": 0.6646975296811077, "learning_rate": 2.0434782608695657e-05, "loss": 1.0404, "step": 47 }, { "epoch": 0.26193724420190995, "grad_norm": 1.8544620284122977, "learning_rate": 2.0869565217391306e-05, "loss": 1.0452, "step": 48 }, { "epoch": 0.2673942701227831, "grad_norm": 0.9159008525844143, "learning_rate": 2.1304347826086958e-05, "loss": 1.0352, "step": 49 }, { "epoch": 0.2728512960436562, "grad_norm": 1.8651550264621868, "learning_rate": 2.173913043478261e-05, "loss": 1.0478, "step": 50 }, { "epoch": 0.2783083219645293, "grad_norm": 1.2948016317604922, "learning_rate": 2.2173913043478262e-05, "loss": 1.0309, "step": 51 }, { "epoch": 0.2837653478854025, "grad_norm": 1.5798033957703332, "learning_rate": 2.2608695652173914e-05, "loss": 1.0342, "step": 52 }, { "epoch": 0.2892223738062756, "grad_norm": 1.6098370993374367, "learning_rate": 2.3043478260869567e-05, "loss": 1.0329, "step": 53 }, { "epoch": 0.2946793997271487, "grad_norm": 1.2453365308049091, "learning_rate": 2.3478260869565222e-05, "loss": 1.0392, "step": 54 }, { "epoch": 0.30013642564802184, "grad_norm": 1.5844016545843662, "learning_rate": 2.391304347826087e-05, "loss": 1.0125, "step": 55 }, { "epoch": 0.30559345156889495, "grad_norm": 1.4627626883931912, "learning_rate": 2.4347826086956526e-05, "loss": 1.0499, "step": 56 }, { "epoch": 0.31105047748976805, "grad_norm": 1.4143561739785215, "learning_rate": 2.4782608695652175e-05, "loss": 1.0281, "step": 57 }, { "epoch": 0.3165075034106412, "grad_norm": 1.4070850348598627, "learning_rate": 2.521739130434783e-05, "loss": 1.0195, "step": 58 }, { "epoch": 0.3219645293315143, "grad_norm": 1.078315144035433, "learning_rate": 2.565217391304348e-05, "loss": 1.0216, "step": 59 }, { "epoch": 0.3274215552523875, "grad_norm": 1.1643154896766894, "learning_rate": 2.608695652173913e-05, "loss": 1.0097, "step": 60 }, { "epoch": 0.3328785811732606, "grad_norm": 1.2461170132940949, "learning_rate": 2.6521739130434784e-05, "loss": 1.0209, "step": 61 }, { "epoch": 0.3383356070941337, "grad_norm": 2.008669853014052, "learning_rate": 2.6956521739130436e-05, "loss": 1.0392, "step": 62 }, { "epoch": 0.34379263301500684, "grad_norm": 1.1644314565358613, "learning_rate": 2.739130434782609e-05, "loss": 1.0117, "step": 63 }, { "epoch": 0.34924965893587995, "grad_norm": 1.799658123734067, "learning_rate": 2.782608695652174e-05, "loss": 1.0083, "step": 64 }, { "epoch": 0.35470668485675305, "grad_norm": 1.3978781458275338, "learning_rate": 2.8260869565217396e-05, "loss": 1.0082, "step": 65 }, { "epoch": 0.3601637107776262, "grad_norm": 2.0511544765413583, "learning_rate": 2.8695652173913044e-05, "loss": 1.0248, "step": 66 }, { "epoch": 0.3656207366984993, "grad_norm": 1.2539457961483822, "learning_rate": 2.91304347826087e-05, "loss": 1.021, "step": 67 }, { "epoch": 0.3710777626193724, "grad_norm": 2.2336673036604777, "learning_rate": 2.956521739130435e-05, "loss": 0.9919, "step": 68 }, { "epoch": 0.3765347885402456, "grad_norm": 1.9545303539529588, "learning_rate": 3.0000000000000004e-05, "loss": 1.0026, "step": 69 }, { "epoch": 0.3819918144611187, "grad_norm": 1.715545568872597, "learning_rate": 3.0434782608695656e-05, "loss": 1.015, "step": 70 }, { "epoch": 0.3874488403819918, "grad_norm": 1.9877889103452786, "learning_rate": 3.086956521739131e-05, "loss": 1.0161, "step": 71 }, { "epoch": 0.39290586630286495, "grad_norm": 1.403829736078723, "learning_rate": 3.130434782608696e-05, "loss": 1.0094, "step": 72 }, { "epoch": 0.39836289222373805, "grad_norm": 1.2911183724867115, "learning_rate": 3.173913043478261e-05, "loss": 1.0188, "step": 73 }, { "epoch": 0.4038199181446112, "grad_norm": 1.725635724089668, "learning_rate": 3.2173913043478265e-05, "loss": 1.0233, "step": 74 }, { "epoch": 0.4092769440654843, "grad_norm": 1.5524172183602378, "learning_rate": 3.260869565217392e-05, "loss": 1.0153, "step": 75 }, { "epoch": 0.4147339699863574, "grad_norm": 1.6937536176639882, "learning_rate": 3.304347826086957e-05, "loss": 1.0087, "step": 76 }, { "epoch": 0.4201909959072306, "grad_norm": 1.031762197478065, "learning_rate": 3.347826086956522e-05, "loss": 0.996, "step": 77 }, { "epoch": 0.4256480218281037, "grad_norm": 1.2214301629403135, "learning_rate": 3.391304347826087e-05, "loss": 0.9952, "step": 78 }, { "epoch": 0.4311050477489768, "grad_norm": 1.5276490268514877, "learning_rate": 3.4347826086956526e-05, "loss": 1.0261, "step": 79 }, { "epoch": 0.43656207366984995, "grad_norm": 1.5857383204051316, "learning_rate": 3.478260869565218e-05, "loss": 1.0023, "step": 80 }, { "epoch": 0.44201909959072305, "grad_norm": 1.7303601564744104, "learning_rate": 3.521739130434783e-05, "loss": 0.9994, "step": 81 }, { "epoch": 0.44747612551159616, "grad_norm": 1.0635597436256417, "learning_rate": 3.565217391304348e-05, "loss": 1.0098, "step": 82 }, { "epoch": 0.4529331514324693, "grad_norm": 3.079665829601424, "learning_rate": 3.6086956521739134e-05, "loss": 1.0082, "step": 83 }, { "epoch": 0.4583901773533424, "grad_norm": 1.8491839328742012, "learning_rate": 3.6521739130434786e-05, "loss": 0.9895, "step": 84 }, { "epoch": 0.4638472032742155, "grad_norm": 3.00445118120071, "learning_rate": 3.695652173913044e-05, "loss": 1.0142, "step": 85 }, { "epoch": 0.4693042291950887, "grad_norm": 2.9140568598663514, "learning_rate": 3.739130434782609e-05, "loss": 1.0107, "step": 86 }, { "epoch": 0.4747612551159618, "grad_norm": 2.411767008345035, "learning_rate": 3.782608695652174e-05, "loss": 1.017, "step": 87 }, { "epoch": 0.48021828103683495, "grad_norm": 2.6524907076537505, "learning_rate": 3.8260869565217395e-05, "loss": 1.0203, "step": 88 }, { "epoch": 0.48567530695770805, "grad_norm": 2.133767717181375, "learning_rate": 3.869565217391305e-05, "loss": 0.9971, "step": 89 }, { "epoch": 0.49113233287858116, "grad_norm": 2.281609780939416, "learning_rate": 3.91304347826087e-05, "loss": 1.0142, "step": 90 }, { "epoch": 0.4965893587994543, "grad_norm": 1.6150193464799012, "learning_rate": 3.956521739130435e-05, "loss": 0.9959, "step": 91 }, { "epoch": 0.5020463847203275, "grad_norm": 2.1774944953837663, "learning_rate": 4e-05, "loss": 1.0178, "step": 92 }, { "epoch": 0.5075034106412005, "grad_norm": 1.8522257265922395, "learning_rate": 3.9999854286581316e-05, "loss": 0.9939, "step": 93 }, { "epoch": 0.5129604365620737, "grad_norm": 2.0976106058434145, "learning_rate": 3.999941714844849e-05, "loss": 0.9989, "step": 94 }, { "epoch": 0.5184174624829468, "grad_norm": 2.033288450566452, "learning_rate": 3.999868859197122e-05, "loss": 1.014, "step": 95 }, { "epoch": 0.5238744884038199, "grad_norm": 1.64201358795121, "learning_rate": 3.999766862776556e-05, "loss": 0.9962, "step": 96 }, { "epoch": 0.529331514324693, "grad_norm": 1.9801453443085917, "learning_rate": 3.999635727069373e-05, "loss": 0.9898, "step": 97 }, { "epoch": 0.5347885402455662, "grad_norm": 1.6558664282997086, "learning_rate": 3.9994754539863984e-05, "loss": 0.9937, "step": 98 }, { "epoch": 0.5402455661664393, "grad_norm": 1.702235161068369, "learning_rate": 3.999286045863026e-05, "loss": 0.9922, "step": 99 }, { "epoch": 0.5457025920873124, "grad_norm": 1.6798920660304613, "learning_rate": 3.999067505459185e-05, "loss": 0.9922, "step": 100 }, { "epoch": 0.5511596180081856, "grad_norm": 1.14927571951454, "learning_rate": 3.998819835959304e-05, "loss": 0.959, "step": 101 }, { "epoch": 0.5566166439290586, "grad_norm": 1.994460504643915, "learning_rate": 3.998543040972259e-05, "loss": 0.9896, "step": 102 }, { "epoch": 0.5620736698499318, "grad_norm": 2.225475662267806, "learning_rate": 3.998237124531324e-05, "loss": 0.9838, "step": 103 }, { "epoch": 0.567530695770805, "grad_norm": 0.9844824291875355, "learning_rate": 3.9979020910941135e-05, "loss": 0.9851, "step": 104 }, { "epoch": 0.572987721691678, "grad_norm": 2.4932772876759817, "learning_rate": 3.9975379455425126e-05, "loss": 0.9843, "step": 105 }, { "epoch": 0.5784447476125512, "grad_norm": 1.8029107577553645, "learning_rate": 3.9971446931826116e-05, "loss": 0.9991, "step": 106 }, { "epoch": 0.5839017735334243, "grad_norm": 2.290841965389965, "learning_rate": 3.996722339744625e-05, "loss": 1.0061, "step": 107 }, { "epoch": 0.5893587994542974, "grad_norm": 1.8729223351387532, "learning_rate": 3.9962708913828086e-05, "loss": 0.9968, "step": 108 }, { "epoch": 0.5948158253751705, "grad_norm": 2.1962460148515826, "learning_rate": 3.995790354675372e-05, "loss": 1.0082, "step": 109 }, { "epoch": 0.6002728512960437, "grad_norm": 1.9722134818162933, "learning_rate": 3.995280736624378e-05, "loss": 0.9975, "step": 110 }, { "epoch": 0.6057298772169167, "grad_norm": 2.0118864615891394, "learning_rate": 3.994742044655647e-05, "loss": 0.9889, "step": 111 }, { "epoch": 0.6111869031377899, "grad_norm": 2.090834428592416, "learning_rate": 3.994174286618643e-05, "loss": 1.0247, "step": 112 }, { "epoch": 0.616643929058663, "grad_norm": 1.5840918067308427, "learning_rate": 3.993577470786363e-05, "loss": 0.9859, "step": 113 }, { "epoch": 0.6221009549795361, "grad_norm": 1.4515746682829112, "learning_rate": 3.9929516058552143e-05, "loss": 0.9761, "step": 114 }, { "epoch": 0.6275579809004093, "grad_norm": 1.504559871894639, "learning_rate": 3.992296700944889e-05, "loss": 0.975, "step": 115 }, { "epoch": 0.6330150068212824, "grad_norm": 1.5927280628411824, "learning_rate": 3.99161276559823e-05, "loss": 0.9979, "step": 116 }, { "epoch": 0.6384720327421555, "grad_norm": 1.60127479724257, "learning_rate": 3.990899809781093e-05, "loss": 0.9743, "step": 117 }, { "epoch": 0.6439290586630286, "grad_norm": 1.991280239304608, "learning_rate": 3.990157843882202e-05, "loss": 0.981, "step": 118 }, { "epoch": 0.6493860845839018, "grad_norm": 1.1853516129644286, "learning_rate": 3.989386878712994e-05, "loss": 0.9767, "step": 119 }, { "epoch": 0.654843110504775, "grad_norm": 1.3174203496686017, "learning_rate": 3.9885869255074674e-05, "loss": 0.9904, "step": 120 }, { "epoch": 0.660300136425648, "grad_norm": 1.5305788456446745, "learning_rate": 3.987757995922014e-05, "loss": 0.9925, "step": 121 }, { "epoch": 0.6657571623465212, "grad_norm": 1.881540433498657, "learning_rate": 3.9869001020352484e-05, "loss": 1.0114, "step": 122 }, { "epoch": 0.6712141882673943, "grad_norm": 1.1981839977942124, "learning_rate": 3.9860132563478384e-05, "loss": 0.9883, "step": 123 }, { "epoch": 0.6766712141882674, "grad_norm": 2.0059502915759264, "learning_rate": 3.985097471782313e-05, "loss": 0.9939, "step": 124 }, { "epoch": 0.6821282401091405, "grad_norm": 1.4590850776551136, "learning_rate": 3.984152761682884e-05, "loss": 0.9831, "step": 125 }, { "epoch": 0.6875852660300137, "grad_norm": 1.1239926129461735, "learning_rate": 3.983179139815245e-05, "loss": 1.0005, "step": 126 }, { "epoch": 0.6930422919508867, "grad_norm": 2.1543984625500836, "learning_rate": 3.982176620366372e-05, "loss": 0.9639, "step": 127 }, { "epoch": 0.6984993178717599, "grad_norm": 1.8429479992055005, "learning_rate": 3.98114521794432e-05, "loss": 0.9941, "step": 128 }, { "epoch": 0.703956343792633, "grad_norm": 1.0868587312281466, "learning_rate": 3.9800849475780054e-05, "loss": 1.0049, "step": 129 }, { "epoch": 0.7094133697135061, "grad_norm": 2.118679896726006, "learning_rate": 3.97899582471699e-05, "loss": 0.9724, "step": 130 }, { "epoch": 0.7148703956343793, "grad_norm": 1.1508635163751133, "learning_rate": 3.977877865231256e-05, "loss": 0.9917, "step": 131 }, { "epoch": 0.7203274215552524, "grad_norm": 2.0430027109665905, "learning_rate": 3.976731085410974e-05, "loss": 0.9862, "step": 132 }, { "epoch": 0.7257844474761255, "grad_norm": 1.4515209852628121, "learning_rate": 3.975555501966263e-05, "loss": 0.9895, "step": 133 }, { "epoch": 0.7312414733969986, "grad_norm": 1.6287273596496654, "learning_rate": 3.974351132026952e-05, "loss": 0.9763, "step": 134 }, { "epoch": 0.7366984993178718, "grad_norm": 1.5473296478105147, "learning_rate": 3.973117993142327e-05, "loss": 0.9817, "step": 135 }, { "epoch": 0.7421555252387448, "grad_norm": 1.403531094420758, "learning_rate": 3.9718561032808774e-05, "loss": 0.9928, "step": 136 }, { "epoch": 0.747612551159618, "grad_norm": 1.2777940256720086, "learning_rate": 3.97056548083003e-05, "loss": 0.9654, "step": 137 }, { "epoch": 0.7530695770804912, "grad_norm": 2.2605652171854183, "learning_rate": 3.9692461445958876e-05, "loss": 0.98, "step": 138 }, { "epoch": 0.7585266030013642, "grad_norm": 1.5555447148375732, "learning_rate": 3.967898113802948e-05, "loss": 0.973, "step": 139 }, { "epoch": 0.7639836289222374, "grad_norm": 1.5712062419592667, "learning_rate": 3.9665214080938294e-05, "loss": 0.9837, "step": 140 }, { "epoch": 0.7694406548431105, "grad_norm": 1.87211562183804, "learning_rate": 3.9651160475289805e-05, "loss": 1.0069, "step": 141 }, { "epoch": 0.7748976807639836, "grad_norm": 1.684924662180551, "learning_rate": 3.963682052586392e-05, "loss": 0.9854, "step": 142 }, { "epoch": 0.7803547066848567, "grad_norm": 1.1116249825439455, "learning_rate": 3.962219444161294e-05, "loss": 0.9808, "step": 143 }, { "epoch": 0.7858117326057299, "grad_norm": 1.794929259692561, "learning_rate": 3.960728243565853e-05, "loss": 0.9826, "step": 144 }, { "epoch": 0.791268758526603, "grad_norm": 1.4024768691530294, "learning_rate": 3.959208472528863e-05, "loss": 0.97, "step": 145 }, { "epoch": 0.7967257844474761, "grad_norm": 1.5359858110261895, "learning_rate": 3.957660153195428e-05, "loss": 1.0029, "step": 146 }, { "epoch": 0.8021828103683493, "grad_norm": 1.8187808557656198, "learning_rate": 3.956083308126638e-05, "loss": 0.9576, "step": 147 }, { "epoch": 0.8076398362892224, "grad_norm": 1.4399907834108585, "learning_rate": 3.954477960299241e-05, "loss": 0.9612, "step": 148 }, { "epoch": 0.8130968622100955, "grad_norm": 1.1465593393044229, "learning_rate": 3.95284413310531e-05, "loss": 0.9936, "step": 149 }, { "epoch": 0.8185538881309686, "grad_norm": 1.458918663058527, "learning_rate": 3.9511818503518985e-05, "loss": 0.9813, "step": 150 }, { "epoch": 0.8240109140518418, "grad_norm": 2.6076181813742476, "learning_rate": 3.949491136260698e-05, "loss": 0.9798, "step": 151 }, { "epoch": 0.8294679399727148, "grad_norm": 1.0393193084437864, "learning_rate": 3.9477720154676806e-05, "loss": 0.9722, "step": 152 }, { "epoch": 0.834924965893588, "grad_norm": 3.9060717284201085, "learning_rate": 3.9460245130227435e-05, "loss": 0.9727, "step": 153 }, { "epoch": 0.8403819918144612, "grad_norm": 3.3082527760716767, "learning_rate": 3.9442486543893424e-05, "loss": 0.9794, "step": 154 }, { "epoch": 0.8458390177353342, "grad_norm": 2.4057404986106485, "learning_rate": 3.94244446544412e-05, "loss": 0.9837, "step": 155 }, { "epoch": 0.8512960436562074, "grad_norm": 1.8476216743035543, "learning_rate": 3.94061197247653e-05, "loss": 0.978, "step": 156 }, { "epoch": 0.8567530695770805, "grad_norm": 2.400979620356147, "learning_rate": 3.9387512021884555e-05, "loss": 0.981, "step": 157 }, { "epoch": 0.8622100954979536, "grad_norm": 2.1733630004298643, "learning_rate": 3.936862181693815e-05, "loss": 0.9776, "step": 158 }, { "epoch": 0.8676671214188267, "grad_norm": 1.8102603434505127, "learning_rate": 3.934944938518172e-05, "loss": 0.9937, "step": 159 }, { "epoch": 0.8731241473396999, "grad_norm": 2.0904632664136913, "learning_rate": 3.932999500598333e-05, "loss": 0.9577, "step": 160 }, { "epoch": 0.878581173260573, "grad_norm": 1.72487012815194, "learning_rate": 3.931025896281939e-05, "loss": 0.9885, "step": 161 }, { "epoch": 0.8840381991814461, "grad_norm": 1.859132027046651, "learning_rate": 3.929024154327052e-05, "loss": 0.9768, "step": 162 }, { "epoch": 0.8894952251023193, "grad_norm": 2.043990751240127, "learning_rate": 3.926994303901739e-05, "loss": 0.988, "step": 163 }, { "epoch": 0.8949522510231923, "grad_norm": 1.2949639926877792, "learning_rate": 3.9249363745836453e-05, "loss": 0.9803, "step": 164 }, { "epoch": 0.9004092769440655, "grad_norm": 1.95004872308144, "learning_rate": 3.922850396359562e-05, "loss": 0.9681, "step": 165 }, { "epoch": 0.9058663028649386, "grad_norm": 1.5438513810678176, "learning_rate": 3.92073639962499e-05, "loss": 0.9832, "step": 166 }, { "epoch": 0.9113233287858117, "grad_norm": 0.8915095612184046, "learning_rate": 3.9185944151837006e-05, "loss": 0.9933, "step": 167 }, { "epoch": 0.9167803547066848, "grad_norm": 1.7381086459322714, "learning_rate": 3.9164244742472795e-05, "loss": 0.9923, "step": 168 }, { "epoch": 0.922237380627558, "grad_norm": 1.5006202521018344, "learning_rate": 3.914226608434678e-05, "loss": 0.9803, "step": 169 }, { "epoch": 0.927694406548431, "grad_norm": 1.7809759035226784, "learning_rate": 3.912000849771751e-05, "loss": 0.9845, "step": 170 }, { "epoch": 0.9331514324693042, "grad_norm": 1.416880011606568, "learning_rate": 3.909747230690789e-05, "loss": 0.9813, "step": 171 }, { "epoch": 0.9386084583901774, "grad_norm": 1.2752605112134887, "learning_rate": 3.907465784030045e-05, "loss": 0.979, "step": 172 }, { "epoch": 0.9440654843110505, "grad_norm": 1.8931991472592369, "learning_rate": 3.90515654303326e-05, "loss": 0.9651, "step": 173 }, { "epoch": 0.9495225102319236, "grad_norm": 1.0457088342185985, "learning_rate": 3.902819541349171e-05, "loss": 0.9575, "step": 174 }, { "epoch": 0.9549795361527967, "grad_norm": 1.9658747343963177, "learning_rate": 3.900454813031032e-05, "loss": 0.9709, "step": 175 }, { "epoch": 0.9604365620736699, "grad_norm": 1.5573294008142207, "learning_rate": 3.898062392536106e-05, "loss": 0.9852, "step": 176 }, { "epoch": 0.965893587994543, "grad_norm": 1.7467537921928091, "learning_rate": 3.895642314725169e-05, "loss": 0.9671, "step": 177 }, { "epoch": 0.9713506139154161, "grad_norm": 1.6127230465883864, "learning_rate": 3.893194614862005e-05, "loss": 0.969, "step": 178 }, { "epoch": 0.9768076398362893, "grad_norm": 1.6603200328670693, "learning_rate": 3.890719328612882e-05, "loss": 0.9795, "step": 179 }, { "epoch": 0.9822646657571623, "grad_norm": 1.6320378665613324, "learning_rate": 3.888216492046045e-05, "loss": 0.9553, "step": 180 }, { "epoch": 0.9877216916780355, "grad_norm": 0.928699164443798, "learning_rate": 3.88568614163118e-05, "loss": 0.9844, "step": 181 }, { "epoch": 0.9931787175989086, "grad_norm": 1.2989789969103307, "learning_rate": 3.883128314238888e-05, "loss": 0.9633, "step": 182 }, { "epoch": 0.9986357435197817, "grad_norm": 1.5050415954099332, "learning_rate": 3.880543047140146e-05, "loss": 0.9832, "step": 183 }, { "epoch": 1.004092769440655, "grad_norm": 3.1493232961865725, "learning_rate": 3.877930378005766e-05, "loss": 1.6761, "step": 184 }, { "epoch": 1.009549795361528, "grad_norm": 1.045095816055446, "learning_rate": 3.8752903449058414e-05, "loss": 0.9363, "step": 185 }, { "epoch": 1.015006821282401, "grad_norm": 1.7070253819059258, "learning_rate": 3.872622986309198e-05, "loss": 0.9788, "step": 186 }, { "epoch": 1.0204638472032743, "grad_norm": 1.5326319060129026, "learning_rate": 3.8699283410828304e-05, "loss": 0.9738, "step": 187 }, { "epoch": 1.0259208731241474, "grad_norm": 1.1087556695241214, "learning_rate": 3.867206448491335e-05, "loss": 0.974, "step": 188 }, { "epoch": 1.0313778990450204, "grad_norm": 1.4845940458146507, "learning_rate": 3.8644573481963386e-05, "loss": 0.9676, "step": 189 }, { "epoch": 1.0368349249658937, "grad_norm": 1.4362719095357956, "learning_rate": 3.861681080255922e-05, "loss": 0.9382, "step": 190 }, { "epoch": 1.0422919508867667, "grad_norm": 1.4674385107699772, "learning_rate": 3.858877685124034e-05, "loss": 0.94, "step": 191 }, { "epoch": 1.0477489768076398, "grad_norm": 1.084446006406934, "learning_rate": 3.8560472036499044e-05, "loss": 0.9548, "step": 192 }, { "epoch": 1.053206002728513, "grad_norm": 1.7424024173389683, "learning_rate": 3.8531896770774454e-05, "loss": 0.966, "step": 193 }, { "epoch": 1.058663028649386, "grad_norm": 1.7927777941962322, "learning_rate": 3.8503051470446544e-05, "loss": 0.9371, "step": 194 }, { "epoch": 1.0641200545702592, "grad_norm": 0.8728719723252784, "learning_rate": 3.847393655583004e-05, "loss": 0.9778, "step": 195 }, { "epoch": 1.0695770804911324, "grad_norm": 1.5459212237514233, "learning_rate": 3.844455245116832e-05, "loss": 0.9714, "step": 196 }, { "epoch": 1.0750341064120055, "grad_norm": 1.723318009783005, "learning_rate": 3.8414899584627223e-05, "loss": 0.9483, "step": 197 }, { "epoch": 1.0804911323328785, "grad_norm": 1.6105441502277638, "learning_rate": 3.838497838828879e-05, "loss": 0.9529, "step": 198 }, { "epoch": 1.0859481582537518, "grad_norm": 1.235861043156412, "learning_rate": 3.835478929814502e-05, "loss": 0.9714, "step": 199 }, { "epoch": 1.0914051841746248, "grad_norm": 1.5553009472910362, "learning_rate": 3.8324332754091447e-05, "loss": 0.9499, "step": 200 }, { "epoch": 1.096862210095498, "grad_norm": 1.9631947357338404, "learning_rate": 3.82936091999208e-05, "loss": 0.9481, "step": 201 }, { "epoch": 1.1023192360163712, "grad_norm": 0.771286766088072, "learning_rate": 3.826261908331649e-05, "loss": 0.9528, "step": 202 }, { "epoch": 1.1077762619372442, "grad_norm": 1.8335561541725196, "learning_rate": 3.8231362855846105e-05, "loss": 0.9498, "step": 203 }, { "epoch": 1.1132332878581173, "grad_norm": 1.8424106742867963, "learning_rate": 3.8199840972954806e-05, "loss": 0.9476, "step": 204 }, { "epoch": 1.1186903137789905, "grad_norm": 0.7950788375956499, "learning_rate": 3.816805389395873e-05, "loss": 0.9422, "step": 205 }, { "epoch": 1.1241473396998636, "grad_norm": 1.6715342615720261, "learning_rate": 3.813600208203828e-05, "loss": 0.9652, "step": 206 }, { "epoch": 1.1296043656207366, "grad_norm": 1.0978850847460873, "learning_rate": 3.810368600423136e-05, "loss": 0.9578, "step": 207 }, { "epoch": 1.13506139154161, "grad_norm": 2.252408921193313, "learning_rate": 3.8071106131426586e-05, "loss": 0.9667, "step": 208 }, { "epoch": 1.140518417462483, "grad_norm": 1.1643241254847931, "learning_rate": 3.803826293835642e-05, "loss": 0.9514, "step": 209 }, { "epoch": 1.145975443383356, "grad_norm": 1.9506655247258313, "learning_rate": 3.8005156903590265e-05, "loss": 0.9436, "step": 210 }, { "epoch": 1.1514324693042293, "grad_norm": 1.6736581284768521, "learning_rate": 3.797178850952747e-05, "loss": 0.9563, "step": 211 }, { "epoch": 1.1568894952251023, "grad_norm": 1.698498967382254, "learning_rate": 3.79381582423903e-05, "loss": 0.96, "step": 212 }, { "epoch": 1.1623465211459754, "grad_norm": 1.4463473539957177, "learning_rate": 3.790426659221689e-05, "loss": 0.9583, "step": 213 }, { "epoch": 1.1678035470668486, "grad_norm": 1.996119225700199, "learning_rate": 3.7870114052854056e-05, "loss": 0.9686, "step": 214 }, { "epoch": 1.1732605729877217, "grad_norm": 1.2453858458138212, "learning_rate": 3.783570112195013e-05, "loss": 0.9476, "step": 215 }, { "epoch": 1.1787175989085947, "grad_norm": 1.9429791252993835, "learning_rate": 3.780102830094768e-05, "loss": 0.9633, "step": 216 }, { "epoch": 1.184174624829468, "grad_norm": 1.7144005781733527, "learning_rate": 3.7766096095076236e-05, "loss": 0.9452, "step": 217 }, { "epoch": 1.189631650750341, "grad_norm": 1.2919220781788054, "learning_rate": 3.7730905013344925e-05, "loss": 0.9505, "step": 218 }, { "epoch": 1.195088676671214, "grad_norm": 1.7283120463695893, "learning_rate": 3.7695455568535015e-05, "loss": 0.9583, "step": 219 }, { "epoch": 1.2005457025920874, "grad_norm": 1.2984823063070836, "learning_rate": 3.76597482771925e-05, "loss": 0.925, "step": 220 }, { "epoch": 1.2060027285129604, "grad_norm": 1.2101553255952835, "learning_rate": 3.7623783659620515e-05, "loss": 0.9671, "step": 221 }, { "epoch": 1.2114597544338335, "grad_norm": 1.9193420409227075, "learning_rate": 3.7587562239871804e-05, "loss": 0.9713, "step": 222 }, { "epoch": 1.2169167803547067, "grad_norm": 1.145139436855805, "learning_rate": 3.755108454574107e-05, "loss": 0.9688, "step": 223 }, { "epoch": 1.2223738062755798, "grad_norm": 2.3369999491203814, "learning_rate": 3.751435110875724e-05, "loss": 0.966, "step": 224 }, { "epoch": 1.2278308321964528, "grad_norm": 1.6283559400501786, "learning_rate": 3.7477362464175794e-05, "loss": 0.9629, "step": 225 }, { "epoch": 1.233287858117326, "grad_norm": 2.1896432971296447, "learning_rate": 3.7440119150970924e-05, "loss": 0.967, "step": 226 }, { "epoch": 1.2387448840381992, "grad_norm": 1.4314027126167852, "learning_rate": 3.7402621711827675e-05, "loss": 0.9391, "step": 227 }, { "epoch": 1.2442019099590724, "grad_norm": 2.448680005865948, "learning_rate": 3.7364870693134044e-05, "loss": 0.9791, "step": 228 }, { "epoch": 1.2496589358799455, "grad_norm": 1.988787930308905, "learning_rate": 3.732686664497304e-05, "loss": 0.9678, "step": 229 }, { "epoch": 1.2551159618008185, "grad_norm": 2.063824899367631, "learning_rate": 3.7288610121114634e-05, "loss": 0.9617, "step": 230 }, { "epoch": 1.2605729877216918, "grad_norm": 1.7243515110002714, "learning_rate": 3.725010167900772e-05, "loss": 0.9533, "step": 231 }, { "epoch": 1.2660300136425648, "grad_norm": 1.8647332677788166, "learning_rate": 3.721134187977197e-05, "loss": 0.9563, "step": 232 }, { "epoch": 1.271487039563438, "grad_norm": 1.636320006353433, "learning_rate": 3.7172331288189667e-05, "loss": 0.9568, "step": 233 }, { "epoch": 1.2769440654843112, "grad_norm": 1.7187722452357803, "learning_rate": 3.713307047269748e-05, "loss": 0.9538, "step": 234 }, { "epoch": 1.2824010914051842, "grad_norm": 1.5589845753526528, "learning_rate": 3.7093560005378175e-05, "loss": 0.9426, "step": 235 }, { "epoch": 1.2878581173260573, "grad_norm": 1.8373924763108647, "learning_rate": 3.705380046195228e-05, "loss": 0.9244, "step": 236 }, { "epoch": 1.2933151432469305, "grad_norm": 1.3882254378197982, "learning_rate": 3.701379242176969e-05, "loss": 0.9498, "step": 237 }, { "epoch": 1.2987721691678036, "grad_norm": 1.7021142374331253, "learning_rate": 3.697353646780124e-05, "loss": 0.9434, "step": 238 }, { "epoch": 1.3042291950886766, "grad_norm": 1.3543258636289206, "learning_rate": 3.693303318663019e-05, "loss": 0.9543, "step": 239 }, { "epoch": 1.30968622100955, "grad_norm": 1.6810213439521031, "learning_rate": 3.689228316844371e-05, "loss": 0.9462, "step": 240 }, { "epoch": 1.315143246930423, "grad_norm": 1.3377038870303093, "learning_rate": 3.685128700702423e-05, "loss": 0.9576, "step": 241 }, { "epoch": 1.320600272851296, "grad_norm": 1.5727626762086575, "learning_rate": 3.681004529974085e-05, "loss": 0.9583, "step": 242 }, { "epoch": 1.3260572987721693, "grad_norm": 1.2786793127927039, "learning_rate": 3.676855864754057e-05, "loss": 0.9357, "step": 243 }, { "epoch": 1.3315143246930423, "grad_norm": 1.648396462433026, "learning_rate": 3.67268276549396e-05, "loss": 0.9735, "step": 244 }, { "epoch": 1.3369713506139154, "grad_norm": 1.2216794004695668, "learning_rate": 3.668485293001448e-05, "loss": 0.9741, "step": 245 }, { "epoch": 1.3424283765347886, "grad_norm": 1.5971696430835944, "learning_rate": 3.664263508439329e-05, "loss": 0.9484, "step": 246 }, { "epoch": 1.3478854024556617, "grad_norm": 1.3024833094157782, "learning_rate": 3.660017473324669e-05, "loss": 0.9406, "step": 247 }, { "epoch": 1.3533424283765347, "grad_norm": 1.5316788751229022, "learning_rate": 3.655747249527897e-05, "loss": 0.9601, "step": 248 }, { "epoch": 1.358799454297408, "grad_norm": 1.5547319797496317, "learning_rate": 3.6514528992719044e-05, "loss": 0.9474, "step": 249 }, { "epoch": 1.364256480218281, "grad_norm": 1.206667830823351, "learning_rate": 3.6471344851311356e-05, "loss": 0.9502, "step": 250 }, { "epoch": 1.369713506139154, "grad_norm": 1.2600525155706597, "learning_rate": 3.64279207003068e-05, "loss": 0.9452, "step": 251 }, { "epoch": 1.3751705320600274, "grad_norm": 1.3484101306757132, "learning_rate": 3.638425717245353e-05, "loss": 0.9502, "step": 252 }, { "epoch": 1.3806275579809004, "grad_norm": 1.2235801669480915, "learning_rate": 3.634035490398774e-05, "loss": 0.9384, "step": 253 }, { "epoch": 1.3860845839017735, "grad_norm": 1.5485793543675035, "learning_rate": 3.629621453462438e-05, "loss": 0.959, "step": 254 }, { "epoch": 1.3915416098226467, "grad_norm": 1.4002101413586943, "learning_rate": 3.625183670754787e-05, "loss": 0.9472, "step": 255 }, { "epoch": 1.3969986357435198, "grad_norm": 0.9434127178746972, "learning_rate": 3.6207222069402696e-05, "loss": 0.9455, "step": 256 }, { "epoch": 1.4024556616643928, "grad_norm": 0.9858801112297753, "learning_rate": 3.6162371270284004e-05, "loss": 0.9436, "step": 257 }, { "epoch": 1.407912687585266, "grad_norm": 1.3469345939907027, "learning_rate": 3.611728496372813e-05, "loss": 0.9368, "step": 258 }, { "epoch": 1.4133697135061392, "grad_norm": 1.8149253369471827, "learning_rate": 3.6071963806703054e-05, "loss": 0.9427, "step": 259 }, { "epoch": 1.4188267394270122, "grad_norm": 0.7473132379864426, "learning_rate": 3.6026408459598844e-05, "loss": 0.9638, "step": 260 }, { "epoch": 1.4242837653478855, "grad_norm": 1.6128737568835454, "learning_rate": 3.598061958621804e-05, "loss": 0.9557, "step": 261 }, { "epoch": 1.4297407912687585, "grad_norm": 1.4020351576310623, "learning_rate": 3.593459785376597e-05, "loss": 0.9421, "step": 262 }, { "epoch": 1.4351978171896316, "grad_norm": 1.2945719219835932, "learning_rate": 3.5888343932841035e-05, "loss": 0.9532, "step": 263 }, { "epoch": 1.4406548431105048, "grad_norm": 1.2851599043172512, "learning_rate": 3.584185849742492e-05, "loss": 0.9307, "step": 264 }, { "epoch": 1.446111869031378, "grad_norm": 1.2427656903613609, "learning_rate": 3.579514222487281e-05, "loss": 0.9538, "step": 265 }, { "epoch": 1.451568894952251, "grad_norm": 1.2877332071545373, "learning_rate": 3.5748195795903474e-05, "loss": 0.9339, "step": 266 }, { "epoch": 1.4570259208731242, "grad_norm": 1.198006739181478, "learning_rate": 3.5701019894589376e-05, "loss": 0.9512, "step": 267 }, { "epoch": 1.4624829467939973, "grad_norm": 1.5795004337836194, "learning_rate": 3.565361520834671e-05, "loss": 0.9448, "step": 268 }, { "epoch": 1.4679399727148703, "grad_norm": 1.1556792865151078, "learning_rate": 3.5605982427925356e-05, "loss": 0.9332, "step": 269 }, { "epoch": 1.4733969986357436, "grad_norm": 0.5930547881100959, "learning_rate": 3.555812224739884e-05, "loss": 0.9613, "step": 270 }, { "epoch": 1.4788540245566166, "grad_norm": 1.4579608488740115, "learning_rate": 3.5510035364154236e-05, "loss": 0.957, "step": 271 }, { "epoch": 1.4843110504774897, "grad_norm": 0.9399997272018373, "learning_rate": 3.5461722478881935e-05, "loss": 0.9362, "step": 272 }, { "epoch": 1.489768076398363, "grad_norm": 1.181780640902133, "learning_rate": 3.541318429556552e-05, "loss": 0.9304, "step": 273 }, { "epoch": 1.495225102319236, "grad_norm": 2.438002638433228, "learning_rate": 3.5364421521471443e-05, "loss": 0.9539, "step": 274 }, { "epoch": 1.500682128240109, "grad_norm": 0.9264166142215685, "learning_rate": 3.531543486713877e-05, "loss": 0.9592, "step": 275 }, { "epoch": 1.5061391541609823, "grad_norm": 4.380791625672203, "learning_rate": 3.5266225046368765e-05, "loss": 0.9625, "step": 276 }, { "epoch": 1.5115961800818554, "grad_norm": 4.119745847530299, "learning_rate": 3.521679277621457e-05, "loss": 0.9811, "step": 277 }, { "epoch": 1.5170532060027284, "grad_norm": 1.3888384210153164, "learning_rate": 3.5167138776970686e-05, "loss": 0.9344, "step": 278 }, { "epoch": 1.5225102319236017, "grad_norm": 3.242363274569884, "learning_rate": 3.5117263772162515e-05, "loss": 0.9699, "step": 279 }, { "epoch": 1.5279672578444747, "grad_norm": 2.131900747816542, "learning_rate": 3.5067168488535794e-05, "loss": 0.9899, "step": 280 }, { "epoch": 1.5334242837653478, "grad_norm": 3.1589070088722515, "learning_rate": 3.501685365604604e-05, "loss": 0.9546, "step": 281 }, { "epoch": 1.538881309686221, "grad_norm": 2.6438273574397404, "learning_rate": 3.496632000784787e-05, "loss": 0.9694, "step": 282 }, { "epoch": 1.544338335607094, "grad_norm": 2.0669427502395594, "learning_rate": 3.4915568280284335e-05, "loss": 0.9452, "step": 283 }, { "epoch": 1.5497953615279672, "grad_norm": 2.1718089915480014, "learning_rate": 3.4864599212876234e-05, "loss": 0.9454, "step": 284 }, { "epoch": 1.5552523874488404, "grad_norm": 2.0439265869282193, "learning_rate": 3.481341354831125e-05, "loss": 0.9266, "step": 285 }, { "epoch": 1.5607094133697135, "grad_norm": 1.7375290887295285, "learning_rate": 3.476201203243322e-05, "loss": 0.9461, "step": 286 }, { "epoch": 1.5661664392905865, "grad_norm": 1.7370946125028597, "learning_rate": 3.4710395414231195e-05, "loss": 0.9657, "step": 287 }, { "epoch": 1.5716234652114598, "grad_norm": 1.403531131584409, "learning_rate": 3.465856444582856e-05, "loss": 0.9495, "step": 288 }, { "epoch": 1.5770804911323328, "grad_norm": 1.4819115235994536, "learning_rate": 3.460651988247208e-05, "loss": 0.9617, "step": 289 }, { "epoch": 1.5825375170532059, "grad_norm": 1.761856728208756, "learning_rate": 3.4554262482520875e-05, "loss": 0.921, "step": 290 }, { "epoch": 1.5879945429740792, "grad_norm": 1.0191878209582437, "learning_rate": 3.4501793007435394e-05, "loss": 0.9447, "step": 291 }, { "epoch": 1.5934515688949522, "grad_norm": 2.274348027783054, "learning_rate": 3.444911222176629e-05, "loss": 0.9497, "step": 292 }, { "epoch": 1.5989085948158253, "grad_norm": 1.5339383301336882, "learning_rate": 3.43962208931433e-05, "loss": 0.9669, "step": 293 }, { "epoch": 1.6043656207366985, "grad_norm": 2.550276251211631, "learning_rate": 3.434311979226406e-05, "loss": 0.956, "step": 294 }, { "epoch": 1.6098226466575716, "grad_norm": 1.7875909094899942, "learning_rate": 3.428980969288287e-05, "loss": 0.9495, "step": 295 }, { "epoch": 1.6152796725784446, "grad_norm": 2.823228050378481, "learning_rate": 3.42362913717994e-05, "loss": 0.9362, "step": 296 }, { "epoch": 1.620736698499318, "grad_norm": 2.4678216750780857, "learning_rate": 3.41825656088474e-05, "loss": 0.9386, "step": 297 }, { "epoch": 1.626193724420191, "grad_norm": 1.9114157924579258, "learning_rate": 3.4128633186883346e-05, "loss": 0.9576, "step": 298 }, { "epoch": 1.631650750341064, "grad_norm": 1.8379349077219813, "learning_rate": 3.407449489177499e-05, "loss": 0.9479, "step": 299 }, { "epoch": 1.6371077762619373, "grad_norm": 1.852909901213652, "learning_rate": 3.4020151512389924e-05, "loss": 0.9279, "step": 300 }, { "epoch": 1.6425648021828103, "grad_norm": 1.3420457124335345, "learning_rate": 3.396560384058413e-05, "loss": 0.9298, "step": 301 }, { "epoch": 1.6480218281036834, "grad_norm": 2.1617773929000172, "learning_rate": 3.391085267119037e-05, "loss": 0.9225, "step": 302 }, { "epoch": 1.6534788540245566, "grad_norm": 1.316355967958462, "learning_rate": 3.3855898802006644e-05, "loss": 0.9342, "step": 303 }, { "epoch": 1.65893587994543, "grad_norm": 2.453815979459407, "learning_rate": 3.380074303378458e-05, "loss": 0.9394, "step": 304 }, { "epoch": 1.6643929058663027, "grad_norm": 1.748815933891966, "learning_rate": 3.374538617021773e-05, "loss": 0.9315, "step": 305 }, { "epoch": 1.669849931787176, "grad_norm": 2.5597232277901973, "learning_rate": 3.3689829017929875e-05, "loss": 0.9573, "step": 306 }, { "epoch": 1.6753069577080493, "grad_norm": 2.368134432470627, "learning_rate": 3.363407238646327e-05, "loss": 0.9494, "step": 307 }, { "epoch": 1.680763983628922, "grad_norm": 1.724634315811694, "learning_rate": 3.357811708826686e-05, "loss": 0.9407, "step": 308 }, { "epoch": 1.6862210095497954, "grad_norm": 1.8226179705374004, "learning_rate": 3.352196393868442e-05, "loss": 0.9495, "step": 309 }, { "epoch": 1.6916780354706686, "grad_norm": 1.6945951192803632, "learning_rate": 3.34656137559427e-05, "loss": 0.9402, "step": 310 }, { "epoch": 1.6971350613915415, "grad_norm": 1.402641679011377, "learning_rate": 3.3409067361139464e-05, "loss": 0.9191, "step": 311 }, { "epoch": 1.7025920873124147, "grad_norm": 1.3467589645615918, "learning_rate": 3.3352325578231565e-05, "loss": 0.9636, "step": 312 }, { "epoch": 1.708049113233288, "grad_norm": 1.25752862289665, "learning_rate": 3.329538923402293e-05, "loss": 0.9554, "step": 313 }, { "epoch": 1.7135061391541608, "grad_norm": 0.986547181961436, "learning_rate": 3.323825915815248e-05, "loss": 0.9305, "step": 314 }, { "epoch": 1.718963165075034, "grad_norm": 1.4979513167093783, "learning_rate": 3.31809361830821e-05, "loss": 0.9567, "step": 315 }, { "epoch": 1.7244201909959074, "grad_norm": 0.7937925026119881, "learning_rate": 3.312342114408444e-05, "loss": 0.9458, "step": 316 }, { "epoch": 1.7298772169167802, "grad_norm": 1.8876612539551143, "learning_rate": 3.30657148792308e-05, "loss": 0.9649, "step": 317 }, { "epoch": 1.7353342428376535, "grad_norm": 1.226595551778844, "learning_rate": 3.3007818229378896e-05, "loss": 0.9643, "step": 318 }, { "epoch": 1.7407912687585267, "grad_norm": 2.213786521631912, "learning_rate": 3.29497320381606e-05, "loss": 0.9584, "step": 319 }, { "epoch": 1.7462482946793996, "grad_norm": 2.1570819482352235, "learning_rate": 3.2891457151969675e-05, "loss": 0.9531, "step": 320 }, { "epoch": 1.7517053206002728, "grad_norm": 1.4381280543608101, "learning_rate": 3.2832994419949393e-05, "loss": 0.9421, "step": 321 }, { "epoch": 1.7571623465211461, "grad_norm": 1.737184951842976, "learning_rate": 3.277434469398022e-05, "loss": 0.9416, "step": 322 }, { "epoch": 1.762619372442019, "grad_norm": 1.632657953412784, "learning_rate": 3.2715508828667366e-05, "loss": 0.9321, "step": 323 }, { "epoch": 1.7680763983628922, "grad_norm": 1.488744578094212, "learning_rate": 3.265648768132834e-05, "loss": 0.9365, "step": 324 }, { "epoch": 1.7735334242837655, "grad_norm": 1.6336003571844502, "learning_rate": 3.2597282111980444e-05, "loss": 0.9515, "step": 325 }, { "epoch": 1.7789904502046383, "grad_norm": 1.4154118064331849, "learning_rate": 3.253789298332828e-05, "loss": 0.9641, "step": 326 }, { "epoch": 1.7844474761255116, "grad_norm": 1.5366612747550772, "learning_rate": 3.2478321160751134e-05, "loss": 0.9456, "step": 327 }, { "epoch": 1.7899045020463848, "grad_norm": 1.3450928156923279, "learning_rate": 3.241856751229041e-05, "loss": 0.9486, "step": 328 }, { "epoch": 1.795361527967258, "grad_norm": 1.2765561802175178, "learning_rate": 3.2358632908636955e-05, "loss": 0.9567, "step": 329 }, { "epoch": 1.800818553888131, "grad_norm": 1.162610958798, "learning_rate": 3.229851822311834e-05, "loss": 0.9288, "step": 330 }, { "epoch": 1.8062755798090042, "grad_norm": 1.1625836925107373, "learning_rate": 3.223822433168623e-05, "loss": 0.9263, "step": 331 }, { "epoch": 1.8117326057298773, "grad_norm": 0.8071251992329053, "learning_rate": 3.217775211290351e-05, "loss": 0.9482, "step": 332 }, { "epoch": 1.8171896316507503, "grad_norm": 0.89790755928994, "learning_rate": 3.211710244793156e-05, "loss": 0.9173, "step": 333 }, { "epoch": 1.8226466575716236, "grad_norm": 0.8356390239967052, "learning_rate": 3.205627622051738e-05, "loss": 0.9504, "step": 334 }, { "epoch": 1.8281036834924966, "grad_norm": 0.6998885337784212, "learning_rate": 3.199527431698073e-05, "loss": 0.9459, "step": 335 }, { "epoch": 1.8335607094133697, "grad_norm": 0.8727569735519537, "learning_rate": 3.19340976262012e-05, "loss": 0.9435, "step": 336 }, { "epoch": 1.839017735334243, "grad_norm": 0.6362860972023866, "learning_rate": 3.187274703960526e-05, "loss": 0.9406, "step": 337 }, { "epoch": 1.844474761255116, "grad_norm": 0.8761738610839735, "learning_rate": 3.181122345115329e-05, "loss": 0.9353, "step": 338 }, { "epoch": 1.849931787175989, "grad_norm": 0.7208261657101167, "learning_rate": 3.174952775732651e-05, "loss": 0.9368, "step": 339 }, { "epoch": 1.8553888130968623, "grad_norm": 0.8342099154714143, "learning_rate": 3.1687660857114e-05, "loss": 0.9515, "step": 340 }, { "epoch": 1.8608458390177354, "grad_norm": 0.7588834066746923, "learning_rate": 3.1625623651999485e-05, "loss": 0.946, "step": 341 }, { "epoch": 1.8663028649386084, "grad_norm": 0.7261790084313842, "learning_rate": 3.1563417045948295e-05, "loss": 0.9332, "step": 342 }, { "epoch": 1.8717598908594817, "grad_norm": 0.5170313983982283, "learning_rate": 3.150104194539417e-05, "loss": 0.9305, "step": 343 }, { "epoch": 1.8772169167803547, "grad_norm": 0.7727261576998418, "learning_rate": 3.1438499259226e-05, "loss": 0.9437, "step": 344 }, { "epoch": 1.8826739427012278, "grad_norm": 1.0590324797396327, "learning_rate": 3.137578989877466e-05, "loss": 0.9496, "step": 345 }, { "epoch": 1.888130968622101, "grad_norm": 0.7511992016971163, "learning_rate": 3.131291477779968e-05, "loss": 0.9556, "step": 346 }, { "epoch": 1.893587994542974, "grad_norm": 1.081487500255035, "learning_rate": 3.124987481247594e-05, "loss": 0.9479, "step": 347 }, { "epoch": 1.8990450204638472, "grad_norm": 1.4968005117001788, "learning_rate": 3.118667092138033e-05, "loss": 0.9214, "step": 348 }, { "epoch": 1.9045020463847204, "grad_norm": 0.6464116981961434, "learning_rate": 3.112330402547834e-05, "loss": 0.9599, "step": 349 }, { "epoch": 1.9099590723055935, "grad_norm": 1.1571751705071633, "learning_rate": 3.10597750481107e-05, "loss": 0.9438, "step": 350 }, { "epoch": 1.9154160982264665, "grad_norm": 1.383173192553895, "learning_rate": 3.099608491497983e-05, "loss": 0.9369, "step": 351 }, { "epoch": 1.9208731241473398, "grad_norm": 1.0142077195831358, "learning_rate": 3.093223455413645e-05, "loss": 0.9181, "step": 352 }, { "epoch": 1.9263301500682128, "grad_norm": 1.146163334987763, "learning_rate": 3.0868224895965996e-05, "loss": 0.9396, "step": 353 }, { "epoch": 1.931787175989086, "grad_norm": 0.6987837846263671, "learning_rate": 3.080405687317507e-05, "loss": 0.9303, "step": 354 }, { "epoch": 1.9372442019099592, "grad_norm": 1.3380093833598752, "learning_rate": 3.073973142077788e-05, "loss": 0.9462, "step": 355 }, { "epoch": 1.9427012278308322, "grad_norm": 0.6049244168030435, "learning_rate": 3.067524947608258e-05, "loss": 0.9187, "step": 356 }, { "epoch": 1.9481582537517053, "grad_norm": 0.8098504286256158, "learning_rate": 3.061061197867763e-05, "loss": 0.9162, "step": 357 }, { "epoch": 1.9536152796725785, "grad_norm": 0.7357777980477844, "learning_rate": 3.05458198704181e-05, "loss": 0.9344, "step": 358 }, { "epoch": 1.9590723055934516, "grad_norm": 0.5713529931575109, "learning_rate": 3.0480874095411946e-05, "loss": 0.9515, "step": 359 }, { "epoch": 1.9645293315143246, "grad_norm": 0.8373330331353604, "learning_rate": 3.0415775600006267e-05, "loss": 0.9546, "step": 360 }, { "epoch": 1.969986357435198, "grad_norm": 0.6868147137493235, "learning_rate": 3.035052533277349e-05, "loss": 0.907, "step": 361 }, { "epoch": 1.975443383356071, "grad_norm": 0.47372940490854243, "learning_rate": 3.0285124244497576e-05, "loss": 0.9246, "step": 362 }, { "epoch": 1.980900409276944, "grad_norm": 0.6977343075907223, "learning_rate": 3.0219573288160128e-05, "loss": 0.9562, "step": 363 }, { "epoch": 1.9863574351978173, "grad_norm": 0.6563089786155916, "learning_rate": 3.0153873418926543e-05, "loss": 0.9344, "step": 364 }, { "epoch": 1.9918144611186903, "grad_norm": 0.7033335661318982, "learning_rate": 3.0088025594132086e-05, "loss": 0.9479, "step": 365 }, { "epoch": 1.9972714870395634, "grad_norm": 1.1633808323873716, "learning_rate": 3.0022030773267908e-05, "loss": 0.935, "step": 366 }, { "epoch": 2.0027285129604366, "grad_norm": 2.256649667221531, "learning_rate": 2.9955889917967114e-05, "loss": 1.6487, "step": 367 }, { "epoch": 2.00818553888131, "grad_norm": 0.9257803693615221, "learning_rate": 2.9889603991990718e-05, "loss": 0.9194, "step": 368 }, { "epoch": 2.0136425648021827, "grad_norm": 0.8374064842179173, "learning_rate": 2.9823173961213614e-05, "loss": 0.936, "step": 369 }, { "epoch": 2.019099590723056, "grad_norm": 0.6888393857507884, "learning_rate": 2.9756600793610477e-05, "loss": 0.9069, "step": 370 }, { "epoch": 2.0245566166439293, "grad_norm": 0.6078836940762362, "learning_rate": 2.9689885459241705e-05, "loss": 0.9181, "step": 371 }, { "epoch": 2.030013642564802, "grad_norm": 0.6540715623371649, "learning_rate": 2.9623028930239234e-05, "loss": 0.9365, "step": 372 }, { "epoch": 2.0354706684856754, "grad_norm": 0.6022481328295576, "learning_rate": 2.955603218079241e-05, "loss": 0.923, "step": 373 }, { "epoch": 2.0409276944065486, "grad_norm": 0.7165752848226464, "learning_rate": 2.9488896187133767e-05, "loss": 0.9181, "step": 374 }, { "epoch": 2.0463847203274215, "grad_norm": 0.8352826439816641, "learning_rate": 2.942162192752483e-05, "loss": 0.9236, "step": 375 }, { "epoch": 2.0518417462482947, "grad_norm": 1.124128627018019, "learning_rate": 2.935421038224182e-05, "loss": 0.919, "step": 376 }, { "epoch": 2.057298772169168, "grad_norm": 1.0339665065551706, "learning_rate": 2.9286662533561423e-05, "loss": 0.9367, "step": 377 }, { "epoch": 2.062755798090041, "grad_norm": 1.2298783039098067, "learning_rate": 2.9218979365746426e-05, "loss": 0.9456, "step": 378 }, { "epoch": 2.068212824010914, "grad_norm": 0.8183361526417724, "learning_rate": 2.9151161865031414e-05, "loss": 0.9444, "step": 379 }, { "epoch": 2.0736698499317874, "grad_norm": 0.484619834541414, "learning_rate": 2.908321101960837e-05, "loss": 0.9085, "step": 380 }, { "epoch": 2.07912687585266, "grad_norm": 0.3810542728807868, "learning_rate": 2.9015127819612292e-05, "loss": 0.8991, "step": 381 }, { "epoch": 2.0845839017735335, "grad_norm": 0.4925827663184475, "learning_rate": 2.894691325710677e-05, "loss": 0.9218, "step": 382 }, { "epoch": 2.0900409276944067, "grad_norm": 0.7465936328564935, "learning_rate": 2.8878568326069494e-05, "loss": 0.93, "step": 383 }, { "epoch": 2.0954979536152796, "grad_norm": 1.0199914288512335, "learning_rate": 2.8810094022377842e-05, "loss": 0.9388, "step": 384 }, { "epoch": 2.100954979536153, "grad_norm": 1.4039532764332685, "learning_rate": 2.8741491343794296e-05, "loss": 0.9205, "step": 385 }, { "epoch": 2.106412005457026, "grad_norm": 0.6570765199675046, "learning_rate": 2.867276128995193e-05, "loss": 0.9472, "step": 386 }, { "epoch": 2.111869031377899, "grad_norm": 0.47805545813863976, "learning_rate": 2.860390486233987e-05, "loss": 0.9213, "step": 387 }, { "epoch": 2.117326057298772, "grad_norm": 0.9100198379548127, "learning_rate": 2.8534923064288652e-05, "loss": 0.9185, "step": 388 }, { "epoch": 2.1227830832196455, "grad_norm": 1.359999448910369, "learning_rate": 2.8465816900955635e-05, "loss": 0.9103, "step": 389 }, { "epoch": 2.1282401091405183, "grad_norm": 0.7267473662850902, "learning_rate": 2.8396587379310366e-05, "loss": 0.9263, "step": 390 }, { "epoch": 2.1336971350613916, "grad_norm": 0.6852106225837414, "learning_rate": 2.8327235508119854e-05, "loss": 0.9056, "step": 391 }, { "epoch": 2.139154160982265, "grad_norm": 0.6935707651834161, "learning_rate": 2.8257762297933927e-05, "loss": 0.9279, "step": 392 }, { "epoch": 2.1446111869031377, "grad_norm": 0.8762210438590792, "learning_rate": 2.81881687610705e-05, "loss": 0.9069, "step": 393 }, { "epoch": 2.150068212824011, "grad_norm": 1.1906568951863223, "learning_rate": 2.8118455911600767e-05, "loss": 0.929, "step": 394 }, { "epoch": 2.155525238744884, "grad_norm": 0.980254177026494, "learning_rate": 2.8048624765334502e-05, "loss": 0.9323, "step": 395 }, { "epoch": 2.160982264665757, "grad_norm": 1.0373134164423028, "learning_rate": 2.7978676339805208e-05, "loss": 0.9208, "step": 396 }, { "epoch": 2.1664392905866303, "grad_norm": 1.0207154812500114, "learning_rate": 2.79086116542553e-05, "loss": 0.9096, "step": 397 }, { "epoch": 2.1718963165075036, "grad_norm": 1.1988463269854843, "learning_rate": 2.783843172962128e-05, "loss": 0.9402, "step": 398 }, { "epoch": 2.1773533424283764, "grad_norm": 0.7969790707530212, "learning_rate": 2.7768137588518807e-05, "loss": 0.908, "step": 399 }, { "epoch": 2.1828103683492497, "grad_norm": 0.4748645421435369, "learning_rate": 2.769773025522785e-05, "loss": 0.914, "step": 400 }, { "epoch": 2.188267394270123, "grad_norm": 0.45121822491331515, "learning_rate": 2.7627210755677733e-05, "loss": 0.9307, "step": 401 }, { "epoch": 2.193724420190996, "grad_norm": 0.8118676469523863, "learning_rate": 2.7556580117432185e-05, "loss": 0.9102, "step": 402 }, { "epoch": 2.199181446111869, "grad_norm": 1.1207703065447276, "learning_rate": 2.7485839369674384e-05, "loss": 0.9231, "step": 403 }, { "epoch": 2.2046384720327423, "grad_norm": 0.9740106870010401, "learning_rate": 2.7414989543191964e-05, "loss": 0.9087, "step": 404 }, { "epoch": 2.210095497953615, "grad_norm": 0.9634686443072049, "learning_rate": 2.734403167036195e-05, "loss": 0.9082, "step": 405 }, { "epoch": 2.2155525238744884, "grad_norm": 0.9832162277660468, "learning_rate": 2.727296678513577e-05, "loss": 0.9241, "step": 406 }, { "epoch": 2.2210095497953617, "grad_norm": 1.0746452821377297, "learning_rate": 2.720179592302417e-05, "loss": 0.9407, "step": 407 }, { "epoch": 2.2264665757162345, "grad_norm": 0.8835118585227068, "learning_rate": 2.71305201210821e-05, "loss": 0.906, "step": 408 }, { "epoch": 2.231923601637108, "grad_norm": 0.806040386235616, "learning_rate": 2.7059140417893645e-05, "loss": 0.9142, "step": 409 }, { "epoch": 2.237380627557981, "grad_norm": 0.7956258201623788, "learning_rate": 2.6987657853556864e-05, "loss": 0.8814, "step": 410 }, { "epoch": 2.242837653478854, "grad_norm": 0.7155012234587093, "learning_rate": 2.6916073469668633e-05, "loss": 0.9408, "step": 411 }, { "epoch": 2.248294679399727, "grad_norm": 0.745980798963711, "learning_rate": 2.6844388309309494e-05, "loss": 0.9334, "step": 412 }, { "epoch": 2.2537517053206004, "grad_norm": 0.8718383779341066, "learning_rate": 2.6772603417028408e-05, "loss": 0.9244, "step": 413 }, { "epoch": 2.2592087312414733, "grad_norm": 0.8697224939003284, "learning_rate": 2.6700719838827595e-05, "loss": 0.9132, "step": 414 }, { "epoch": 2.2646657571623465, "grad_norm": 0.7800957792385944, "learning_rate": 2.662873862214724e-05, "loss": 0.9253, "step": 415 }, { "epoch": 2.27012278308322, "grad_norm": 0.8009973379664055, "learning_rate": 2.655666081585027e-05, "loss": 0.9, "step": 416 }, { "epoch": 2.2755798090040926, "grad_norm": 0.8649005822972493, "learning_rate": 2.6484487470207035e-05, "loss": 0.9204, "step": 417 }, { "epoch": 2.281036834924966, "grad_norm": 0.8818657424466958, "learning_rate": 2.641221963688002e-05, "loss": 0.9155, "step": 418 }, { "epoch": 2.286493860845839, "grad_norm": 0.5647385759805507, "learning_rate": 2.633985836890854e-05, "loss": 0.9206, "step": 419 }, { "epoch": 2.291950886766712, "grad_norm": 0.5034679857244327, "learning_rate": 2.6267404720693375e-05, "loss": 0.9204, "step": 420 }, { "epoch": 2.2974079126875853, "grad_norm": 0.710256150433762, "learning_rate": 2.6194859747981385e-05, "loss": 0.9191, "step": 421 }, { "epoch": 2.3028649386084585, "grad_norm": 0.5706543763177601, "learning_rate": 2.6122224507850182e-05, "loss": 0.9185, "step": 422 }, { "epoch": 2.3083219645293314, "grad_norm": 0.6833880125599795, "learning_rate": 2.604950005869268e-05, "loss": 0.9213, "step": 423 }, { "epoch": 2.3137789904502046, "grad_norm": 0.8483843690019908, "learning_rate": 2.5976687460201683e-05, "loss": 0.9126, "step": 424 }, { "epoch": 2.319236016371078, "grad_norm": 0.8129051361925009, "learning_rate": 2.5903787773354463e-05, "loss": 0.9188, "step": 425 }, { "epoch": 2.3246930422919507, "grad_norm": 0.5996381128568273, "learning_rate": 2.583080206039728e-05, "loss": 0.9096, "step": 426 }, { "epoch": 2.330150068212824, "grad_norm": 0.41863958371356735, "learning_rate": 2.57577313848299e-05, "loss": 0.9432, "step": 427 }, { "epoch": 2.3356070941336973, "grad_norm": 0.34060059093315503, "learning_rate": 2.5684576811390125e-05, "loss": 0.9137, "step": 428 }, { "epoch": 2.34106412005457, "grad_norm": 0.5069480306429284, "learning_rate": 2.5611339406038257e-05, "loss": 0.9124, "step": 429 }, { "epoch": 2.3465211459754434, "grad_norm": 0.5427881229277935, "learning_rate": 2.5538020235941552e-05, "loss": 0.9166, "step": 430 }, { "epoch": 2.3519781718963166, "grad_norm": 0.543245106400598, "learning_rate": 2.5464620369458724e-05, "loss": 0.9197, "step": 431 }, { "epoch": 2.3574351978171895, "grad_norm": 0.5487542346479996, "learning_rate": 2.5391140876124305e-05, "loss": 0.9203, "step": 432 }, { "epoch": 2.3628922237380627, "grad_norm": 0.504474417772234, "learning_rate": 2.531758282663311e-05, "loss": 0.9139, "step": 433 }, { "epoch": 2.368349249658936, "grad_norm": 0.3570671212002871, "learning_rate": 2.524394729282464e-05, "loss": 0.9227, "step": 434 }, { "epoch": 2.373806275579809, "grad_norm": 0.33080967390463245, "learning_rate": 2.5170235347667425e-05, "loss": 0.9298, "step": 435 }, { "epoch": 2.379263301500682, "grad_norm": 0.2629370339698779, "learning_rate": 2.5096448065243415e-05, "loss": 0.9222, "step": 436 }, { "epoch": 2.3847203274215554, "grad_norm": 0.32467107495565267, "learning_rate": 2.5022586520732334e-05, "loss": 0.9092, "step": 437 }, { "epoch": 2.390177353342428, "grad_norm": 0.27556269692287366, "learning_rate": 2.494865179039599e-05, "loss": 0.8993, "step": 438 }, { "epoch": 2.3956343792633015, "grad_norm": 0.279539516282507, "learning_rate": 2.4874644951562618e-05, "loss": 0.9019, "step": 439 }, { "epoch": 2.4010914051841747, "grad_norm": 0.33354360728490134, "learning_rate": 2.4800567082611165e-05, "loss": 0.9152, "step": 440 }, { "epoch": 2.4065484311050476, "grad_norm": 0.33169175944263035, "learning_rate": 2.4726419262955595e-05, "loss": 0.9091, "step": 441 }, { "epoch": 2.412005457025921, "grad_norm": 0.3587055937970976, "learning_rate": 2.465220257302913e-05, "loss": 0.9202, "step": 442 }, { "epoch": 2.417462482946794, "grad_norm": 0.40441219606068757, "learning_rate": 2.4577918094268523e-05, "loss": 0.9226, "step": 443 }, { "epoch": 2.422919508867667, "grad_norm": 0.4865996215311924, "learning_rate": 2.4503566909098318e-05, "loss": 0.9093, "step": 444 }, { "epoch": 2.42837653478854, "grad_norm": 0.38008820904475854, "learning_rate": 2.4429150100915054e-05, "loss": 0.9322, "step": 445 }, { "epoch": 2.4338335607094135, "grad_norm": 0.41170329827458135, "learning_rate": 2.435466875407148e-05, "loss": 0.9324, "step": 446 }, { "epoch": 2.4392905866302863, "grad_norm": 0.3622800817675993, "learning_rate": 2.4280123953860767e-05, "loss": 0.9001, "step": 447 }, { "epoch": 2.4447476125511596, "grad_norm": 0.2682950261173189, "learning_rate": 2.4205516786500684e-05, "loss": 0.9314, "step": 448 }, { "epoch": 2.450204638472033, "grad_norm": 0.2805378098796358, "learning_rate": 2.4130848339117766e-05, "loss": 0.9341, "step": 449 }, { "epoch": 2.4556616643929057, "grad_norm": 0.26782126481321455, "learning_rate": 2.4056119699731495e-05, "loss": 0.9077, "step": 450 }, { "epoch": 2.461118690313779, "grad_norm": 0.37285051812558306, "learning_rate": 2.3981331957238414e-05, "loss": 0.9235, "step": 451 }, { "epoch": 2.466575716234652, "grad_norm": 0.3129713500376212, "learning_rate": 2.3906486201396287e-05, "loss": 0.9213, "step": 452 }, { "epoch": 2.472032742155525, "grad_norm": 0.36665287480858777, "learning_rate": 2.3831583522808224e-05, "loss": 0.917, "step": 453 }, { "epoch": 2.4774897680763983, "grad_norm": 0.3443704371520464, "learning_rate": 2.375662501290675e-05, "loss": 0.9189, "step": 454 }, { "epoch": 2.4829467939972716, "grad_norm": 0.31197899443616667, "learning_rate": 2.368161176393793e-05, "loss": 0.9127, "step": 455 }, { "epoch": 2.488403819918145, "grad_norm": 0.35012014939390956, "learning_rate": 2.360654486894548e-05, "loss": 0.9113, "step": 456 }, { "epoch": 2.4938608458390177, "grad_norm": 0.35258642719846595, "learning_rate": 2.3531425421754782e-05, "loss": 0.9137, "step": 457 }, { "epoch": 2.499317871759891, "grad_norm": 0.4818508820401416, "learning_rate": 2.3456254516956973e-05, "loss": 0.9322, "step": 458 }, { "epoch": 2.504774897680764, "grad_norm": 0.41831055845919374, "learning_rate": 2.3381033249893007e-05, "loss": 0.9358, "step": 459 }, { "epoch": 2.510231923601637, "grad_norm": 0.46003166070829415, "learning_rate": 2.3305762716637696e-05, "loss": 0.9134, "step": 460 }, { "epoch": 2.5156889495225103, "grad_norm": 0.34405667621405894, "learning_rate": 2.32304440139837e-05, "loss": 0.914, "step": 461 }, { "epoch": 2.5211459754433836, "grad_norm": 0.30837605247167627, "learning_rate": 2.315507823942559e-05, "loss": 0.8906, "step": 462 }, { "epoch": 2.5266030013642564, "grad_norm": 0.35159469224889583, "learning_rate": 2.3079666491143827e-05, "loss": 0.9291, "step": 463 }, { "epoch": 2.5320600272851297, "grad_norm": 0.3797916060475412, "learning_rate": 2.3004209867988783e-05, "loss": 0.9087, "step": 464 }, { "epoch": 2.5375170532060025, "grad_norm": 0.40916286067612617, "learning_rate": 2.2928709469464705e-05, "loss": 0.9158, "step": 465 }, { "epoch": 2.542974079126876, "grad_norm": 0.28077601639148303, "learning_rate": 2.2853166395713715e-05, "loss": 0.908, "step": 466 }, { "epoch": 2.548431105047749, "grad_norm": 0.30535476691189556, "learning_rate": 2.2777581747499767e-05, "loss": 0.9288, "step": 467 }, { "epoch": 2.5538881309686223, "grad_norm": 0.2741959984551279, "learning_rate": 2.2701956626192603e-05, "loss": 0.9123, "step": 468 }, { "epoch": 2.559345156889495, "grad_norm": 0.29160243799401836, "learning_rate": 2.262629213375173e-05, "loss": 0.9153, "step": 469 }, { "epoch": 2.5648021828103684, "grad_norm": 0.31211888825075323, "learning_rate": 2.255058937271032e-05, "loss": 0.9019, "step": 470 }, { "epoch": 2.5702592087312413, "grad_norm": 0.24605091808209184, "learning_rate": 2.2474849446159193e-05, "loss": 0.9041, "step": 471 }, { "epoch": 2.5757162346521145, "grad_norm": 0.296940058046894, "learning_rate": 2.2399073457730723e-05, "loss": 0.8933, "step": 472 }, { "epoch": 2.581173260572988, "grad_norm": 0.39017704428903854, "learning_rate": 2.2323262511582726e-05, "loss": 0.9219, "step": 473 }, { "epoch": 2.586630286493861, "grad_norm": 0.26845683489444067, "learning_rate": 2.2247417712382423e-05, "loss": 0.9072, "step": 474 }, { "epoch": 2.592087312414734, "grad_norm": 0.29710964002091833, "learning_rate": 2.217154016529031e-05, "loss": 0.9254, "step": 475 }, { "epoch": 2.597544338335607, "grad_norm": 0.2773002611218211, "learning_rate": 2.2095630975944068e-05, "loss": 0.9196, "step": 476 }, { "epoch": 2.60300136425648, "grad_norm": 0.27685282385866905, "learning_rate": 2.2019691250442442e-05, "loss": 0.9048, "step": 477 }, { "epoch": 2.6084583901773533, "grad_norm": 0.35014690047193237, "learning_rate": 2.1943722095329138e-05, "loss": 0.9113, "step": 478 }, { "epoch": 2.6139154160982265, "grad_norm": 0.2596786590850847, "learning_rate": 2.1867724617576685e-05, "loss": 0.9161, "step": 479 }, { "epoch": 2.6193724420191, "grad_norm": 0.3426543130719377, "learning_rate": 2.1791699924570313e-05, "loss": 0.8926, "step": 480 }, { "epoch": 2.6248294679399726, "grad_norm": 0.3078282469487072, "learning_rate": 2.1715649124091814e-05, "loss": 0.9183, "step": 481 }, { "epoch": 2.630286493860846, "grad_norm": 0.22901258390983542, "learning_rate": 2.16395733243034e-05, "loss": 0.9344, "step": 482 }, { "epoch": 2.6357435197817187, "grad_norm": 0.371108470895669, "learning_rate": 2.156347363373156e-05, "loss": 0.9192, "step": 483 }, { "epoch": 2.641200545702592, "grad_norm": 0.3675376564769477, "learning_rate": 2.14873511612509e-05, "loss": 0.914, "step": 484 }, { "epoch": 2.6466575716234653, "grad_norm": 0.47791366315200284, "learning_rate": 2.141120701606799e-05, "loss": 0.9078, "step": 485 }, { "epoch": 2.6521145975443385, "grad_norm": 0.4222978650582422, "learning_rate": 2.1335042307705206e-05, "loss": 0.9099, "step": 486 }, { "epoch": 2.6575716234652114, "grad_norm": 0.3556115683063452, "learning_rate": 2.125885814598454e-05, "loss": 0.9064, "step": 487 }, { "epoch": 2.6630286493860846, "grad_norm": 0.356222691019892, "learning_rate": 2.1182655641011468e-05, "loss": 0.9109, "step": 488 }, { "epoch": 2.6684856753069575, "grad_norm": 0.2950967727936582, "learning_rate": 2.1106435903158734e-05, "loss": 0.907, "step": 489 }, { "epoch": 2.6739427012278307, "grad_norm": 0.2589049008249365, "learning_rate": 2.10302000430502e-05, "loss": 0.9167, "step": 490 }, { "epoch": 2.679399727148704, "grad_norm": 0.2679428400644797, "learning_rate": 2.0953949171544646e-05, "loss": 0.9029, "step": 491 }, { "epoch": 2.6848567530695773, "grad_norm": 0.30000226534532, "learning_rate": 2.0877684399719596e-05, "loss": 0.902, "step": 492 }, { "epoch": 2.69031377899045, "grad_norm": 0.31357462517216056, "learning_rate": 2.0801406838855095e-05, "loss": 0.9151, "step": 493 }, { "epoch": 2.6957708049113234, "grad_norm": 0.2692910544239183, "learning_rate": 2.0725117600417572e-05, "loss": 0.9218, "step": 494 }, { "epoch": 2.701227830832196, "grad_norm": 0.30151763927530156, "learning_rate": 2.0648817796043598e-05, "loss": 0.9198, "step": 495 }, { "epoch": 2.7066848567530695, "grad_norm": 0.2758793028048215, "learning_rate": 2.0572508537523705e-05, "loss": 0.8979, "step": 496 }, { "epoch": 2.7121418826739427, "grad_norm": 0.2812105414991479, "learning_rate": 2.0496190936786196e-05, "loss": 0.9131, "step": 497 }, { "epoch": 2.717598908594816, "grad_norm": 0.2963610249601614, "learning_rate": 2.041986610588091e-05, "loss": 0.9377, "step": 498 }, { "epoch": 2.723055934515689, "grad_norm": 0.3097919911404899, "learning_rate": 2.0343535156963057e-05, "loss": 0.9262, "step": 499 }, { "epoch": 2.728512960436562, "grad_norm": 0.34847730033316476, "learning_rate": 2.026719920227699e-05, "loss": 0.8998, "step": 500 }, { "epoch": 2.733969986357435, "grad_norm": 0.30531935495612433, "learning_rate": 2.0190859354139994e-05, "loss": 0.9269, "step": 501 }, { "epoch": 2.739427012278308, "grad_norm": 0.2669945648424582, "learning_rate": 2.0114516724926103e-05, "loss": 0.9455, "step": 502 }, { "epoch": 2.7448840381991815, "grad_norm": 0.2785334692894501, "learning_rate": 2.0038172427049862e-05, "loss": 0.912, "step": 503 }, { "epoch": 2.7503410641200547, "grad_norm": 0.3445461005907961, "learning_rate": 1.9961827572950138e-05, "loss": 0.9163, "step": 504 }, { "epoch": 2.7557980900409276, "grad_norm": 0.39296279811877044, "learning_rate": 1.98854832750739e-05, "loss": 0.9369, "step": 505 }, { "epoch": 2.761255115961801, "grad_norm": 0.39702351389810686, "learning_rate": 1.9809140645860013e-05, "loss": 0.891, "step": 506 }, { "epoch": 2.7667121418826737, "grad_norm": 0.2512865215587987, "learning_rate": 1.9732800797723018e-05, "loss": 0.9115, "step": 507 }, { "epoch": 2.772169167803547, "grad_norm": 0.2820633130771331, "learning_rate": 1.965646484303695e-05, "loss": 0.9212, "step": 508 }, { "epoch": 2.77762619372442, "grad_norm": 0.32145777353057775, "learning_rate": 1.9580133894119098e-05, "loss": 0.9207, "step": 509 }, { "epoch": 2.7830832196452935, "grad_norm": 0.33762112618327617, "learning_rate": 1.9503809063213807e-05, "loss": 0.8845, "step": 510 }, { "epoch": 2.7885402455661663, "grad_norm": 0.24634508212661455, "learning_rate": 1.9427491462476295e-05, "loss": 0.9156, "step": 511 }, { "epoch": 2.7939972714870396, "grad_norm": 0.3457860742517539, "learning_rate": 1.9351182203956405e-05, "loss": 0.9106, "step": 512 }, { "epoch": 2.799454297407913, "grad_norm": 0.3810319883859794, "learning_rate": 1.927488239958243e-05, "loss": 0.8924, "step": 513 }, { "epoch": 2.8049113233287857, "grad_norm": 0.37285981835585597, "learning_rate": 1.919859316114491e-05, "loss": 0.906, "step": 514 }, { "epoch": 2.810368349249659, "grad_norm": 0.24108156149639062, "learning_rate": 1.9122315600280418e-05, "loss": 0.9175, "step": 515 }, { "epoch": 2.815825375170532, "grad_norm": 0.3943374958725155, "learning_rate": 1.904605082845536e-05, "loss": 0.9078, "step": 516 }, { "epoch": 2.821282401091405, "grad_norm": 0.3145717053046707, "learning_rate": 1.89697999569498e-05, "loss": 0.9135, "step": 517 }, { "epoch": 2.8267394270122783, "grad_norm": 0.22533549622277005, "learning_rate": 1.8893564096841273e-05, "loss": 0.909, "step": 518 }, { "epoch": 2.8321964529331516, "grad_norm": 0.23525731404627342, "learning_rate": 1.881734435898854e-05, "loss": 0.9299, "step": 519 }, { "epoch": 2.8376534788540244, "grad_norm": 0.2512060708918993, "learning_rate": 1.8741141854015468e-05, "loss": 0.8893, "step": 520 }, { "epoch": 2.8431105047748977, "grad_norm": 0.19994216173059465, "learning_rate": 1.8664957692294808e-05, "loss": 0.9221, "step": 521 }, { "epoch": 2.848567530695771, "grad_norm": 0.20556264949760783, "learning_rate": 1.858879298393202e-05, "loss": 0.9316, "step": 522 }, { "epoch": 2.854024556616644, "grad_norm": 0.20256542941627978, "learning_rate": 1.8512648838749105e-05, "loss": 0.9093, "step": 523 }, { "epoch": 2.859481582537517, "grad_norm": 0.22020875876934895, "learning_rate": 1.8436526366268444e-05, "loss": 0.9049, "step": 524 }, { "epoch": 2.8649386084583903, "grad_norm": 0.20768534379511697, "learning_rate": 1.8360426675696606e-05, "loss": 0.9144, "step": 525 }, { "epoch": 2.870395634379263, "grad_norm": 0.28896251352128466, "learning_rate": 1.828435087590819e-05, "loss": 0.9145, "step": 526 }, { "epoch": 2.8758526603001364, "grad_norm": 0.3131376106100284, "learning_rate": 1.8208300075429693e-05, "loss": 0.9308, "step": 527 }, { "epoch": 2.8813096862210097, "grad_norm": 0.24876481284966392, "learning_rate": 1.8132275382423325e-05, "loss": 0.9115, "step": 528 }, { "epoch": 2.8867667121418825, "grad_norm": 0.2530867014542135, "learning_rate": 1.8056277904670865e-05, "loss": 0.8851, "step": 529 }, { "epoch": 2.892223738062756, "grad_norm": 0.2592890449900578, "learning_rate": 1.798030874955756e-05, "loss": 0.9058, "step": 530 }, { "epoch": 2.897680763983629, "grad_norm": 0.22039748569474332, "learning_rate": 1.7904369024055942e-05, "loss": 0.9176, "step": 531 }, { "epoch": 2.903137789904502, "grad_norm": 0.2209833356939442, "learning_rate": 1.7828459834709694e-05, "loss": 0.917, "step": 532 }, { "epoch": 2.908594815825375, "grad_norm": 0.23766546854501655, "learning_rate": 1.7752582287617583e-05, "loss": 0.8989, "step": 533 }, { "epoch": 2.9140518417462484, "grad_norm": 0.2376537458371181, "learning_rate": 1.767673748841728e-05, "loss": 0.8946, "step": 534 }, { "epoch": 2.9195088676671213, "grad_norm": 0.262071528071461, "learning_rate": 1.7600926542269277e-05, "loss": 0.9231, "step": 535 }, { "epoch": 2.9249658935879945, "grad_norm": 0.29376545282596106, "learning_rate": 1.7525150553840806e-05, "loss": 0.8938, "step": 536 }, { "epoch": 2.930422919508868, "grad_norm": 0.3134884408737219, "learning_rate": 1.7449410627289687e-05, "loss": 0.9168, "step": 537 }, { "epoch": 2.9358799454297406, "grad_norm": 0.2712354478643755, "learning_rate": 1.7373707866248278e-05, "loss": 0.933, "step": 538 }, { "epoch": 2.941336971350614, "grad_norm": 0.24553201691764942, "learning_rate": 1.7298043373807404e-05, "loss": 0.9159, "step": 539 }, { "epoch": 2.946793997271487, "grad_norm": 0.3030078675065205, "learning_rate": 1.7222418252500243e-05, "loss": 0.9062, "step": 540 }, { "epoch": 2.9522510231923604, "grad_norm": 0.23890406347684276, "learning_rate": 1.7146833604286295e-05, "loss": 0.8945, "step": 541 }, { "epoch": 2.9577080491132333, "grad_norm": 0.2670091183635565, "learning_rate": 1.7071290530535298e-05, "loss": 0.909, "step": 542 }, { "epoch": 2.9631650750341065, "grad_norm": 0.23126297362235826, "learning_rate": 1.6995790132011223e-05, "loss": 0.9143, "step": 543 }, { "epoch": 2.9686221009549794, "grad_norm": 0.31050871509494943, "learning_rate": 1.6920333508856176e-05, "loss": 0.8994, "step": 544 }, { "epoch": 2.9740791268758526, "grad_norm": 0.22661046923902323, "learning_rate": 1.6844921760574417e-05, "loss": 0.9294, "step": 545 }, { "epoch": 2.979536152796726, "grad_norm": 0.3118001086032258, "learning_rate": 1.676955598601631e-05, "loss": 0.9041, "step": 546 }, { "epoch": 2.984993178717599, "grad_norm": 0.23665950368215852, "learning_rate": 1.6694237283362314e-05, "loss": 0.9038, "step": 547 }, { "epoch": 2.990450204638472, "grad_norm": 0.24492951232429386, "learning_rate": 1.6618966750106996e-05, "loss": 0.916, "step": 548 }, { "epoch": 2.9959072305593453, "grad_norm": 0.25300337782976023, "learning_rate": 1.6543745483043037e-05, "loss": 0.9083, "step": 549 }, { "epoch": 3.001364256480218, "grad_norm": 0.584103118759897, "learning_rate": 1.6468574578245225e-05, "loss": 1.6082, "step": 550 }, { "epoch": 3.0068212824010914, "grad_norm": 0.671101312579536, "learning_rate": 1.639345513105452e-05, "loss": 0.8859, "step": 551 }, { "epoch": 3.0122783083219646, "grad_norm": 0.3787017346934449, "learning_rate": 1.6318388236062072e-05, "loss": 0.8951, "step": 552 }, { "epoch": 3.0177353342428375, "grad_norm": 0.42606514302989157, "learning_rate": 1.624337498709326e-05, "loss": 0.8877, "step": 553 }, { "epoch": 3.0231923601637107, "grad_norm": 0.35542176787821733, "learning_rate": 1.616841647719178e-05, "loss": 0.8895, "step": 554 }, { "epoch": 3.028649386084584, "grad_norm": 0.35418748629561114, "learning_rate": 1.6093513798603713e-05, "loss": 0.8968, "step": 555 }, { "epoch": 3.034106412005457, "grad_norm": 0.4142394476010708, "learning_rate": 1.6018668042761593e-05, "loss": 0.8855, "step": 556 }, { "epoch": 3.03956343792633, "grad_norm": 0.26285840734342447, "learning_rate": 1.594388030026851e-05, "loss": 0.8685, "step": 557 }, { "epoch": 3.0450204638472034, "grad_norm": 0.3399484818274934, "learning_rate": 1.586915166088224e-05, "loss": 0.908, "step": 558 }, { "epoch": 3.050477489768076, "grad_norm": 0.3389204352265327, "learning_rate": 1.5794483213499326e-05, "loss": 0.8911, "step": 559 }, { "epoch": 3.0559345156889495, "grad_norm": 0.33188066961256374, "learning_rate": 1.5719876046139243e-05, "loss": 0.9147, "step": 560 }, { "epoch": 3.0613915416098227, "grad_norm": 0.3377610682449399, "learning_rate": 1.564533124592852e-05, "loss": 0.8949, "step": 561 }, { "epoch": 3.0668485675306956, "grad_norm": 0.2957318174966501, "learning_rate": 1.557084989908495e-05, "loss": 0.8986, "step": 562 }, { "epoch": 3.072305593451569, "grad_norm": 0.430673617485615, "learning_rate": 1.5496433090901685e-05, "loss": 0.8949, "step": 563 }, { "epoch": 3.077762619372442, "grad_norm": 0.2554433088355423, "learning_rate": 1.5422081905731484e-05, "loss": 0.8882, "step": 564 }, { "epoch": 3.083219645293315, "grad_norm": 0.32618011312611783, "learning_rate": 1.534779742697088e-05, "loss": 0.9174, "step": 565 }, { "epoch": 3.088676671214188, "grad_norm": 0.31352014509777587, "learning_rate": 1.5273580737044416e-05, "loss": 0.8918, "step": 566 }, { "epoch": 3.0941336971350615, "grad_norm": 0.2557790089027306, "learning_rate": 1.5199432917388835e-05, "loss": 0.9007, "step": 567 }, { "epoch": 3.0995907230559343, "grad_norm": 0.27540644472124487, "learning_rate": 1.5125355048437389e-05, "loss": 0.884, "step": 568 }, { "epoch": 3.1050477489768076, "grad_norm": 0.34235171994492863, "learning_rate": 1.5051348209604016e-05, "loss": 0.8686, "step": 569 }, { "epoch": 3.110504774897681, "grad_norm": 0.25008950788915946, "learning_rate": 1.4977413479267675e-05, "loss": 0.9026, "step": 570 }, { "epoch": 3.1159618008185537, "grad_norm": 0.3964129492366135, "learning_rate": 1.4903551934756592e-05, "loss": 0.8992, "step": 571 }, { "epoch": 3.121418826739427, "grad_norm": 0.40686134783523276, "learning_rate": 1.4829764652332585e-05, "loss": 0.9209, "step": 572 }, { "epoch": 3.1268758526603, "grad_norm": 0.2670447009105334, "learning_rate": 1.4756052707175361e-05, "loss": 0.9153, "step": 573 }, { "epoch": 3.132332878581173, "grad_norm": 0.4402126815582449, "learning_rate": 1.4682417173366892e-05, "loss": 0.907, "step": 574 }, { "epoch": 3.1377899045020463, "grad_norm": 0.23815050001596294, "learning_rate": 1.4608859123875703e-05, "loss": 0.9038, "step": 575 }, { "epoch": 3.1432469304229196, "grad_norm": 0.3030117101013267, "learning_rate": 1.4535379630541284e-05, "loss": 0.9065, "step": 576 }, { "epoch": 3.148703956343793, "grad_norm": 0.37381110214711166, "learning_rate": 1.4461979764058454e-05, "loss": 0.9096, "step": 577 }, { "epoch": 3.1541609822646657, "grad_norm": 0.24485627188888226, "learning_rate": 1.4388660593961756e-05, "loss": 0.8858, "step": 578 }, { "epoch": 3.159618008185539, "grad_norm": 0.25282112926237954, "learning_rate": 1.4315423188609878e-05, "loss": 0.8905, "step": 579 }, { "epoch": 3.1650750341064118, "grad_norm": 0.24907017187679334, "learning_rate": 1.4242268615170106e-05, "loss": 0.9068, "step": 580 }, { "epoch": 3.170532060027285, "grad_norm": 0.2129960819490356, "learning_rate": 1.4169197939602723e-05, "loss": 0.8912, "step": 581 }, { "epoch": 3.1759890859481583, "grad_norm": 0.24279078285844446, "learning_rate": 1.409621222664554e-05, "loss": 0.8838, "step": 582 }, { "epoch": 3.1814461118690316, "grad_norm": 0.23381673434042413, "learning_rate": 1.4023312539798322e-05, "loss": 0.8896, "step": 583 }, { "epoch": 3.1869031377899044, "grad_norm": 0.22227554143448716, "learning_rate": 1.3950499941307332e-05, "loss": 0.8826, "step": 584 }, { "epoch": 3.1923601637107777, "grad_norm": 0.22806009027283225, "learning_rate": 1.3877775492149828e-05, "loss": 0.899, "step": 585 }, { "epoch": 3.197817189631651, "grad_norm": 0.25047196400087585, "learning_rate": 1.3805140252018618e-05, "loss": 0.8954, "step": 586 }, { "epoch": 3.203274215552524, "grad_norm": 0.2118062936691214, "learning_rate": 1.373259527930663e-05, "loss": 0.8966, "step": 587 }, { "epoch": 3.208731241473397, "grad_norm": 0.2730005282503477, "learning_rate": 1.366014163109146e-05, "loss": 0.8795, "step": 588 }, { "epoch": 3.2141882673942703, "grad_norm": 0.29613230912460564, "learning_rate": 1.3587780363119986e-05, "loss": 0.8796, "step": 589 }, { "epoch": 3.219645293315143, "grad_norm": 0.23990776796738883, "learning_rate": 1.3515512529792978e-05, "loss": 0.9071, "step": 590 }, { "epoch": 3.2251023192360164, "grad_norm": 0.2538388076227864, "learning_rate": 1.3443339184149739e-05, "loss": 0.9036, "step": 591 }, { "epoch": 3.2305593451568897, "grad_norm": 0.24743496996389577, "learning_rate": 1.337126137785276e-05, "loss": 0.8861, "step": 592 }, { "epoch": 3.2360163710777625, "grad_norm": 0.20121450134982874, "learning_rate": 1.329928016117241e-05, "loss": 0.8939, "step": 593 }, { "epoch": 3.241473396998636, "grad_norm": 0.2869931420078408, "learning_rate": 1.3227396582971594e-05, "loss": 0.8906, "step": 594 }, { "epoch": 3.246930422919509, "grad_norm": 0.1908364191371087, "learning_rate": 1.3155611690690515e-05, "loss": 0.886, "step": 595 }, { "epoch": 3.252387448840382, "grad_norm": 0.3472699144561854, "learning_rate": 1.3083926530331372e-05, "loss": 0.9158, "step": 596 }, { "epoch": 3.257844474761255, "grad_norm": 0.22549962507966057, "learning_rate": 1.3012342146443144e-05, "loss": 0.8764, "step": 597 }, { "epoch": 3.2633015006821284, "grad_norm": 0.26789532061692434, "learning_rate": 1.2940859582106357e-05, "loss": 0.8841, "step": 598 }, { "epoch": 3.2687585266030013, "grad_norm": 0.2522357843484046, "learning_rate": 1.2869479878917904e-05, "loss": 0.8819, "step": 599 }, { "epoch": 3.2742155525238745, "grad_norm": 0.21493911054710754, "learning_rate": 1.2798204076975835e-05, "loss": 0.92, "step": 600 }, { "epoch": 3.279672578444748, "grad_norm": 0.2945646091669156, "learning_rate": 1.2727033214864233e-05, "loss": 0.8838, "step": 601 }, { "epoch": 3.2851296043656206, "grad_norm": 0.2829300287180026, "learning_rate": 1.265596832963806e-05, "loss": 0.8755, "step": 602 }, { "epoch": 3.290586630286494, "grad_norm": 0.2536303900570064, "learning_rate": 1.2585010456808046e-05, "loss": 0.8904, "step": 603 }, { "epoch": 3.296043656207367, "grad_norm": 0.3585519781803995, "learning_rate": 1.2514160630325617e-05, "loss": 0.8922, "step": 604 }, { "epoch": 3.30150068212824, "grad_norm": 0.2792945795336993, "learning_rate": 1.2443419882567821e-05, "loss": 0.8771, "step": 605 }, { "epoch": 3.3069577080491133, "grad_norm": 0.35260384633142106, "learning_rate": 1.2372789244322272e-05, "loss": 0.901, "step": 606 }, { "epoch": 3.3124147339699865, "grad_norm": 0.31364366488160306, "learning_rate": 1.2302269744772155e-05, "loss": 0.8818, "step": 607 }, { "epoch": 3.3178717598908594, "grad_norm": 0.23743622737062894, "learning_rate": 1.22318624114812e-05, "loss": 0.9072, "step": 608 }, { "epoch": 3.3233287858117326, "grad_norm": 0.3642214485244677, "learning_rate": 1.216156827037873e-05, "loss": 0.8833, "step": 609 }, { "epoch": 3.328785811732606, "grad_norm": 0.2925427624739931, "learning_rate": 1.2091388345744703e-05, "loss": 0.911, "step": 610 }, { "epoch": 3.3342428376534787, "grad_norm": 0.2377203948239386, "learning_rate": 1.2021323660194798e-05, "loss": 0.8965, "step": 611 }, { "epoch": 3.339699863574352, "grad_norm": 0.2706687731608815, "learning_rate": 1.1951375234665501e-05, "loss": 0.9036, "step": 612 }, { "epoch": 3.3451568894952253, "grad_norm": 0.2679343617436159, "learning_rate": 1.1881544088399237e-05, "loss": 0.8939, "step": 613 }, { "epoch": 3.350613915416098, "grad_norm": 0.22617857543228842, "learning_rate": 1.1811831238929508e-05, "loss": 0.9021, "step": 614 }, { "epoch": 3.3560709413369714, "grad_norm": 0.2904617911241792, "learning_rate": 1.1742237702066074e-05, "loss": 0.8863, "step": 615 }, { "epoch": 3.3615279672578446, "grad_norm": 0.22733511585309843, "learning_rate": 1.1672764491880153e-05, "loss": 0.9143, "step": 616 }, { "epoch": 3.3669849931787175, "grad_norm": 0.256013923198982, "learning_rate": 1.1603412620689637e-05, "loss": 0.899, "step": 617 }, { "epoch": 3.3724420190995907, "grad_norm": 0.25205210893149643, "learning_rate": 1.1534183099044363e-05, "loss": 0.8853, "step": 618 }, { "epoch": 3.377899045020464, "grad_norm": 0.23143271683735414, "learning_rate": 1.1465076935711355e-05, "loss": 0.8947, "step": 619 }, { "epoch": 3.383356070941337, "grad_norm": 0.22370756793978866, "learning_rate": 1.1396095137660134e-05, "loss": 0.8785, "step": 620 }, { "epoch": 3.38881309686221, "grad_norm": 0.21290283764682943, "learning_rate": 1.1327238710048075e-05, "loss": 0.9032, "step": 621 }, { "epoch": 3.3942701227830834, "grad_norm": 0.2685069204258351, "learning_rate": 1.1258508656205715e-05, "loss": 0.8941, "step": 622 }, { "epoch": 3.399727148703956, "grad_norm": 0.20912948755324795, "learning_rate": 1.118990597762216e-05, "loss": 0.8913, "step": 623 }, { "epoch": 3.4051841746248295, "grad_norm": 0.24827347077451523, "learning_rate": 1.1121431673930509e-05, "loss": 0.883, "step": 624 }, { "epoch": 3.4106412005457027, "grad_norm": 0.22274674891516377, "learning_rate": 1.1053086742893244e-05, "loss": 0.9017, "step": 625 }, { "epoch": 3.4160982264665756, "grad_norm": 0.23575151807168895, "learning_rate": 1.0984872180387715e-05, "loss": 0.8988, "step": 626 }, { "epoch": 3.421555252387449, "grad_norm": 0.21353314466163129, "learning_rate": 1.0916788980391633e-05, "loss": 0.9098, "step": 627 }, { "epoch": 3.427012278308322, "grad_norm": 0.22040517357317185, "learning_rate": 1.0848838134968589e-05, "loss": 0.884, "step": 628 }, { "epoch": 3.432469304229195, "grad_norm": 0.22910802159215685, "learning_rate": 1.0781020634253579e-05, "loss": 0.8833, "step": 629 }, { "epoch": 3.437926330150068, "grad_norm": 0.21849412085599912, "learning_rate": 1.0713337466438578e-05, "loss": 0.8839, "step": 630 }, { "epoch": 3.4433833560709415, "grad_norm": 0.21965410678288466, "learning_rate": 1.0645789617758181e-05, "loss": 0.9005, "step": 631 }, { "epoch": 3.4488403819918143, "grad_norm": 0.204035562242123, "learning_rate": 1.057837807247518e-05, "loss": 0.892, "step": 632 }, { "epoch": 3.4542974079126876, "grad_norm": 0.16983377384281073, "learning_rate": 1.0511103812866238e-05, "loss": 0.8812, "step": 633 }, { "epoch": 3.459754433833561, "grad_norm": 0.2042102923266645, "learning_rate": 1.0443967819207602e-05, "loss": 0.88, "step": 634 }, { "epoch": 3.4652114597544337, "grad_norm": 0.18518985041839892, "learning_rate": 1.0376971069760774e-05, "loss": 0.9172, "step": 635 }, { "epoch": 3.470668485675307, "grad_norm": 0.19653140995159937, "learning_rate": 1.0310114540758298e-05, "loss": 0.895, "step": 636 }, { "epoch": 3.47612551159618, "grad_norm": 0.22830479434165665, "learning_rate": 1.0243399206389527e-05, "loss": 0.9044, "step": 637 }, { "epoch": 3.481582537517053, "grad_norm": 0.19206764620071587, "learning_rate": 1.0176826038786394e-05, "loss": 0.8818, "step": 638 }, { "epoch": 3.4870395634379263, "grad_norm": 0.21389623128712906, "learning_rate": 1.011039600800928e-05, "loss": 0.8956, "step": 639 }, { "epoch": 3.4924965893587996, "grad_norm": 0.21993143291851755, "learning_rate": 1.004411008203289e-05, "loss": 0.8927, "step": 640 }, { "epoch": 3.4979536152796724, "grad_norm": 0.1894006892821513, "learning_rate": 9.977969226732099e-06, "loss": 0.8771, "step": 641 }, { "epoch": 3.5034106412005457, "grad_norm": 0.19959640202420684, "learning_rate": 9.911974405867917e-06, "loss": 0.8912, "step": 642 }, { "epoch": 3.508867667121419, "grad_norm": 0.14759174219062646, "learning_rate": 9.846126581073457e-06, "loss": 0.8992, "step": 643 }, { "epoch": 3.5143246930422922, "grad_norm": 0.20035668476318763, "learning_rate": 9.780426711839877e-06, "loss": 0.9006, "step": 644 }, { "epoch": 3.519781718963165, "grad_norm": 0.16797091670116737, "learning_rate": 9.714875755502429e-06, "loss": 0.8873, "step": 645 }, { "epoch": 3.5252387448840383, "grad_norm": 0.189909496119316, "learning_rate": 9.649474667226513e-06, "loss": 0.9186, "step": 646 }, { "epoch": 3.530695770804911, "grad_norm": 0.1662855707845877, "learning_rate": 9.58422439999374e-06, "loss": 0.9061, "step": 647 }, { "epoch": 3.5361527967257844, "grad_norm": 0.1877435970889167, "learning_rate": 9.519125904588059e-06, "loss": 0.9124, "step": 648 }, { "epoch": 3.5416098226466577, "grad_norm": 0.18966972578830213, "learning_rate": 9.45418012958191e-06, "loss": 0.9002, "step": 649 }, { "epoch": 3.547066848567531, "grad_norm": 0.18521500133290328, "learning_rate": 9.389388021322381e-06, "loss": 0.8921, "step": 650 }, { "epoch": 3.552523874488404, "grad_norm": 0.20655179032846327, "learning_rate": 9.32475052391742e-06, "loss": 0.8975, "step": 651 }, { "epoch": 3.557980900409277, "grad_norm": 0.1819692294620117, "learning_rate": 9.26026857922212e-06, "loss": 0.9082, "step": 652 }, { "epoch": 3.56343792633015, "grad_norm": 0.18675168504713038, "learning_rate": 9.19594312682493e-06, "loss": 0.9045, "step": 653 }, { "epoch": 3.568894952251023, "grad_norm": 0.16349611233292402, "learning_rate": 9.131775104034009e-06, "loss": 0.8907, "step": 654 }, { "epoch": 3.5743519781718964, "grad_norm": 0.17657868890026518, "learning_rate": 9.067765445863545e-06, "loss": 0.8777, "step": 655 }, { "epoch": 3.5798090040927697, "grad_norm": 0.1520862113066698, "learning_rate": 9.00391508502017e-06, "loss": 0.8761, "step": 656 }, { "epoch": 3.5852660300136425, "grad_norm": 0.16877815138189672, "learning_rate": 8.940224951889304e-06, "loss": 0.869, "step": 657 }, { "epoch": 3.590723055934516, "grad_norm": 0.16925000281087574, "learning_rate": 8.876695974521659e-06, "loss": 0.9011, "step": 658 }, { "epoch": 3.5961800818553886, "grad_norm": 0.16759697258423073, "learning_rate": 8.813329078619679e-06, "loss": 0.9045, "step": 659 }, { "epoch": 3.601637107776262, "grad_norm": 0.1896922083229097, "learning_rate": 8.750125187524068e-06, "loss": 0.86, "step": 660 }, { "epoch": 3.607094133697135, "grad_norm": 0.17884520359215278, "learning_rate": 8.687085222200323e-06, "loss": 0.9095, "step": 661 }, { "epoch": 3.6125511596180084, "grad_norm": 0.176877762158684, "learning_rate": 8.624210101225343e-06, "loss": 0.8985, "step": 662 }, { "epoch": 3.6180081855388813, "grad_norm": 0.2002369650449839, "learning_rate": 8.561500740774008e-06, "loss": 0.8929, "step": 663 }, { "epoch": 3.6234652114597545, "grad_norm": 0.17592875629565122, "learning_rate": 8.498958054605837e-06, "loss": 0.8778, "step": 664 }, { "epoch": 3.6289222373806274, "grad_norm": 0.21757591177018767, "learning_rate": 8.436582954051707e-06, "loss": 0.9046, "step": 665 }, { "epoch": 3.6343792633015006, "grad_norm": 0.16964570321715836, "learning_rate": 8.374376348000523e-06, "loss": 0.8766, "step": 666 }, { "epoch": 3.639836289222374, "grad_norm": 0.20816910485872794, "learning_rate": 8.312339142886003e-06, "loss": 0.8948, "step": 667 }, { "epoch": 3.645293315143247, "grad_norm": 0.21318859663355175, "learning_rate": 8.250472242673486e-06, "loss": 0.9035, "step": 668 }, { "epoch": 3.65075034106412, "grad_norm": 0.17223582052559827, "learning_rate": 8.188776548846717e-06, "loss": 0.8914, "step": 669 }, { "epoch": 3.6562073669849933, "grad_norm": 0.20492759686497783, "learning_rate": 8.127252960394744e-06, "loss": 0.8871, "step": 670 }, { "epoch": 3.661664392905866, "grad_norm": 0.17660213480793235, "learning_rate": 8.065902373798808e-06, "loss": 0.8658, "step": 671 }, { "epoch": 3.6671214188267394, "grad_norm": 0.18013543727863568, "learning_rate": 8.004725683019276e-06, "loss": 0.9016, "step": 672 }, { "epoch": 3.6725784447476126, "grad_norm": 0.1844280666804985, "learning_rate": 7.943723779482628e-06, "loss": 0.9034, "step": 673 }, { "epoch": 3.678035470668486, "grad_norm": 0.14933482527632957, "learning_rate": 7.882897552068447e-06, "loss": 0.9044, "step": 674 }, { "epoch": 3.6834924965893587, "grad_norm": 0.180577120421336, "learning_rate": 7.822247887096499e-06, "loss": 0.8987, "step": 675 }, { "epoch": 3.688949522510232, "grad_norm": 0.18976867015358279, "learning_rate": 7.761775668313775e-06, "loss": 0.9055, "step": 676 }, { "epoch": 3.694406548431105, "grad_norm": 0.14380655448071636, "learning_rate": 7.70148177688166e-06, "loss": 0.8819, "step": 677 }, { "epoch": 3.699863574351978, "grad_norm": 0.1605511243289739, "learning_rate": 7.641367091363056e-06, "loss": 0.8765, "step": 678 }, { "epoch": 3.7053206002728514, "grad_norm": 0.16966229691015783, "learning_rate": 7.581432487709595e-06, "loss": 0.8956, "step": 679 }, { "epoch": 3.7107776261937246, "grad_norm": 0.15825612639259118, "learning_rate": 7.521678839248867e-06, "loss": 0.8757, "step": 680 }, { "epoch": 3.7162346521145975, "grad_norm": 0.15905765650755102, "learning_rate": 7.462107016671727e-06, "loss": 0.9021, "step": 681 }, { "epoch": 3.7216916780354707, "grad_norm": 0.1678589543544254, "learning_rate": 7.402717888019561e-06, "loss": 0.9037, "step": 682 }, { "epoch": 3.7271487039563436, "grad_norm": 0.16250907925377683, "learning_rate": 7.343512318671668e-06, "loss": 0.8996, "step": 683 }, { "epoch": 3.732605729877217, "grad_norm": 0.1796362073897607, "learning_rate": 7.284491171332637e-06, "loss": 0.9044, "step": 684 }, { "epoch": 3.73806275579809, "grad_norm": 0.15668011051829173, "learning_rate": 7.225655306019783e-06, "loss": 0.888, "step": 685 }, { "epoch": 3.7435197817189634, "grad_norm": 0.1668930240876366, "learning_rate": 7.167005580050608e-06, "loss": 0.9017, "step": 686 }, { "epoch": 3.748976807639836, "grad_norm": 0.18870659107182658, "learning_rate": 7.108542848030333e-06, "loss": 0.8767, "step": 687 }, { "epoch": 3.7544338335607095, "grad_norm": 0.15696986217820777, "learning_rate": 7.050267961839407e-06, "loss": 0.8909, "step": 688 }, { "epoch": 3.7598908594815823, "grad_norm": 0.18431028719776638, "learning_rate": 6.992181770621109e-06, "loss": 0.8868, "step": 689 }, { "epoch": 3.7653478854024556, "grad_norm": 0.16154837397895874, "learning_rate": 6.934285120769206e-06, "loss": 0.8994, "step": 690 }, { "epoch": 3.770804911323329, "grad_norm": 0.1608522865427035, "learning_rate": 6.87657885591557e-06, "loss": 0.9054, "step": 691 }, { "epoch": 3.776261937244202, "grad_norm": 0.17546410153871858, "learning_rate": 6.819063816917904e-06, "loss": 0.8771, "step": 692 }, { "epoch": 3.781718963165075, "grad_norm": 0.17779343503619688, "learning_rate": 6.761740841847517e-06, "loss": 0.8828, "step": 693 }, { "epoch": 3.787175989085948, "grad_norm": 0.1620894791729856, "learning_rate": 6.704610765977073e-06, "loss": 0.8896, "step": 694 }, { "epoch": 3.792633015006821, "grad_norm": 0.16551990476440234, "learning_rate": 6.647674421768435e-06, "loss": 0.8885, "step": 695 }, { "epoch": 3.7980900409276943, "grad_norm": 0.17247511398164073, "learning_rate": 6.590932638860543e-06, "loss": 0.9229, "step": 696 }, { "epoch": 3.8035470668485676, "grad_norm": 0.17229017961388754, "learning_rate": 6.5343862440573095e-06, "loss": 0.8809, "step": 697 }, { "epoch": 3.809004092769441, "grad_norm": 0.15732362181652573, "learning_rate": 6.478036061315587e-06, "loss": 0.903, "step": 698 }, { "epoch": 3.8144611186903137, "grad_norm": 0.14793725507686076, "learning_rate": 6.421882911733146e-06, "loss": 0.9084, "step": 699 }, { "epoch": 3.819918144611187, "grad_norm": 0.18160474710129887, "learning_rate": 6.365927613536737e-06, "loss": 0.8833, "step": 700 }, { "epoch": 3.8253751705320598, "grad_norm": 0.16205271433369595, "learning_rate": 6.310170982070132e-06, "loss": 0.903, "step": 701 }, { "epoch": 3.830832196452933, "grad_norm": 0.1755196814184644, "learning_rate": 6.254613829782274e-06, "loss": 0.8866, "step": 702 }, { "epoch": 3.8362892223738063, "grad_norm": 0.16947891319556294, "learning_rate": 6.199256966215423e-06, "loss": 0.9072, "step": 703 }, { "epoch": 3.8417462482946796, "grad_norm": 0.1598029992685231, "learning_rate": 6.1441011979933615e-06, "loss": 0.8965, "step": 704 }, { "epoch": 3.8472032742155524, "grad_norm": 0.17633255200544773, "learning_rate": 6.089147328809637e-06, "loss": 0.9213, "step": 705 }, { "epoch": 3.8526603001364257, "grad_norm": 0.14858434315925467, "learning_rate": 6.034396159415874e-06, "loss": 0.9057, "step": 706 }, { "epoch": 3.8581173260572985, "grad_norm": 0.1359593564440916, "learning_rate": 5.979848487610078e-06, "loss": 0.9002, "step": 707 }, { "epoch": 3.863574351978172, "grad_norm": 0.1546596886497959, "learning_rate": 5.92550510822502e-06, "loss": 0.881, "step": 708 }, { "epoch": 3.869031377899045, "grad_norm": 0.1553240834204749, "learning_rate": 5.871366813116661e-06, "loss": 0.9015, "step": 709 }, { "epoch": 3.8744884038199183, "grad_norm": 0.14118959880699977, "learning_rate": 5.817434391152605e-06, "loss": 0.8907, "step": 710 }, { "epoch": 3.879945429740791, "grad_norm": 0.14059851937404533, "learning_rate": 5.763708628200609e-06, "loss": 0.8891, "step": 711 }, { "epoch": 3.8854024556616644, "grad_norm": 0.15427945771110663, "learning_rate": 5.710190307117138e-06, "loss": 0.8951, "step": 712 }, { "epoch": 3.8908594815825372, "grad_norm": 0.1445538887040146, "learning_rate": 5.656880207735938e-06, "loss": 0.8877, "step": 713 }, { "epoch": 3.8963165075034105, "grad_norm": 0.15649585838748734, "learning_rate": 5.603779106856699e-06, "loss": 0.9074, "step": 714 }, { "epoch": 3.901773533424284, "grad_norm": 0.13648774675224182, "learning_rate": 5.550887778233713e-06, "loss": 0.8941, "step": 715 }, { "epoch": 3.907230559345157, "grad_norm": 0.15565409065858304, "learning_rate": 5.498206992564612e-06, "loss": 0.9173, "step": 716 }, { "epoch": 3.91268758526603, "grad_norm": 0.13922969052192785, "learning_rate": 5.4457375174791325e-06, "loss": 0.8893, "step": 717 }, { "epoch": 3.918144611186903, "grad_norm": 0.15294676839534935, "learning_rate": 5.3934801175279276e-06, "loss": 0.9154, "step": 718 }, { "epoch": 3.923601637107776, "grad_norm": 0.15092879808147422, "learning_rate": 5.341435554171448e-06, "loss": 0.8827, "step": 719 }, { "epoch": 3.9290586630286493, "grad_norm": 0.14825666022997366, "learning_rate": 5.289604585768813e-06, "loss": 0.8848, "step": 720 }, { "epoch": 3.9345156889495225, "grad_norm": 0.1606715610763504, "learning_rate": 5.237987967566787e-06, "loss": 0.8772, "step": 721 }, { "epoch": 3.939972714870396, "grad_norm": 0.16522816411905664, "learning_rate": 5.1865864516887535e-06, "loss": 0.8976, "step": 722 }, { "epoch": 3.9454297407912686, "grad_norm": 0.15958019587002623, "learning_rate": 5.1354007871237765e-06, "loss": 0.906, "step": 723 }, { "epoch": 3.950886766712142, "grad_norm": 0.150449287740693, "learning_rate": 5.084431719715668e-06, "loss": 0.8925, "step": 724 }, { "epoch": 3.956343792633015, "grad_norm": 0.1654448721490872, "learning_rate": 5.033679992152143e-06, "loss": 0.8949, "step": 725 }, { "epoch": 3.961800818553888, "grad_norm": 0.15862344300369557, "learning_rate": 4.983146343953964e-06, "loss": 0.8802, "step": 726 }, { "epoch": 3.9672578444747613, "grad_norm": 0.13976420034767134, "learning_rate": 4.932831511464206e-06, "loss": 0.887, "step": 727 }, { "epoch": 3.9727148703956345, "grad_norm": 0.18682370943191948, "learning_rate": 4.88273622783749e-06, "loss": 0.8953, "step": 728 }, { "epoch": 3.9781718963165074, "grad_norm": 0.142893917159586, "learning_rate": 4.83286122302932e-06, "loss": 0.8823, "step": 729 }, { "epoch": 3.9836289222373806, "grad_norm": 0.1501981132875881, "learning_rate": 4.783207223785431e-06, "loss": 0.8964, "step": 730 }, { "epoch": 3.989085948158254, "grad_norm": 0.15657458729040308, "learning_rate": 4.733774953631238e-06, "loss": 0.8979, "step": 731 }, { "epoch": 3.9945429740791267, "grad_norm": 0.13982230103959686, "learning_rate": 4.68456513286124e-06, "loss": 0.8923, "step": 732 }, { "epoch": 4.0, "grad_norm": 0.27858828514063777, "learning_rate": 4.6355784785285615e-06, "loss": 1.5566, "step": 733 }, { "epoch": 4.005457025920873, "grad_norm": 0.17089905607735426, "learning_rate": 4.586815704434488e-06, "loss": 0.887, "step": 734 }, { "epoch": 4.0109140518417465, "grad_norm": 0.14705477825085042, "learning_rate": 4.538277521118071e-06, "loss": 0.8841, "step": 735 }, { "epoch": 4.01637107776262, "grad_norm": 0.1636792606316968, "learning_rate": 4.489964635845769e-06, "loss": 0.8899, "step": 736 }, { "epoch": 4.021828103683492, "grad_norm": 0.15198479944975976, "learning_rate": 4.44187775260116e-06, "loss": 0.8881, "step": 737 }, { "epoch": 4.0272851296043655, "grad_norm": 0.13291652839803894, "learning_rate": 4.3940175720746494e-06, "loss": 0.8696, "step": 738 }, { "epoch": 4.032742155525239, "grad_norm": 0.1490103061507369, "learning_rate": 4.346384791653298e-06, "loss": 0.8984, "step": 739 }, { "epoch": 4.038199181446112, "grad_norm": 0.17175563522601708, "learning_rate": 4.2989801054106305e-06, "loss": 0.8665, "step": 740 }, { "epoch": 4.043656207366985, "grad_norm": 0.1499319026668514, "learning_rate": 4.251804204096535e-06, "loss": 0.8779, "step": 741 }, { "epoch": 4.0491132332878585, "grad_norm": 0.16227165418614628, "learning_rate": 4.204857775127198e-06, "loss": 0.8755, "step": 742 }, { "epoch": 4.054570259208731, "grad_norm": 0.1581981145043867, "learning_rate": 4.1581415025750795e-06, "loss": 0.8895, "step": 743 }, { "epoch": 4.060027285129604, "grad_norm": 0.15513935379525345, "learning_rate": 4.111656067158971e-06, "loss": 0.8974, "step": 744 }, { "epoch": 4.0654843110504775, "grad_norm": 0.14535697871945671, "learning_rate": 4.065402146234034e-06, "loss": 0.8485, "step": 745 }, { "epoch": 4.070941336971351, "grad_norm": 0.1297532212724062, "learning_rate": 4.019380413781968e-06, "loss": 0.885, "step": 746 }, { "epoch": 4.076398362892224, "grad_norm": 0.1488778393601588, "learning_rate": 3.973591540401165e-06, "loss": 0.9015, "step": 747 }, { "epoch": 4.081855388813097, "grad_norm": 0.13978030494695767, "learning_rate": 3.928036193296958e-06, "loss": 0.8887, "step": 748 }, { "epoch": 4.08731241473397, "grad_norm": 0.14411923483228978, "learning_rate": 3.882715036271874e-06, "loss": 0.8734, "step": 749 }, { "epoch": 4.092769440654843, "grad_norm": 0.139081525574305, "learning_rate": 3.837628729715994e-06, "loss": 0.8781, "step": 750 }, { "epoch": 4.098226466575716, "grad_norm": 0.14858634817778646, "learning_rate": 3.7927779305973066e-06, "loss": 0.8708, "step": 751 }, { "epoch": 4.1036834924965895, "grad_norm": 0.1364174899816674, "learning_rate": 3.7481632924521383e-06, "loss": 0.8741, "step": 752 }, { "epoch": 4.109140518417463, "grad_norm": 0.13932957227692389, "learning_rate": 3.7037854653756287e-06, "loss": 0.8921, "step": 753 }, { "epoch": 4.114597544338336, "grad_norm": 0.14304788451278092, "learning_rate": 3.65964509601227e-06, "loss": 0.8765, "step": 754 }, { "epoch": 4.120054570259208, "grad_norm": 0.1629674318472855, "learning_rate": 3.6157428275464713e-06, "loss": 0.8865, "step": 755 }, { "epoch": 4.125511596180082, "grad_norm": 0.1363245824129024, "learning_rate": 3.572079299693201e-06, "loss": 0.9084, "step": 756 }, { "epoch": 4.130968622100955, "grad_norm": 0.1508822391111529, "learning_rate": 3.528655148688649e-06, "loss": 0.8851, "step": 757 }, { "epoch": 4.136425648021828, "grad_norm": 0.14303462818362056, "learning_rate": 3.485471007280965e-06, "loss": 0.8758, "step": 758 }, { "epoch": 4.1418826739427015, "grad_norm": 0.14526482585748274, "learning_rate": 3.4425275047210337e-06, "loss": 0.8888, "step": 759 }, { "epoch": 4.147339699863575, "grad_norm": 0.13337908689426514, "learning_rate": 3.399825266753316e-06, "loss": 0.8996, "step": 760 }, { "epoch": 4.152796725784447, "grad_norm": 0.12506301079113333, "learning_rate": 3.357364915606711e-06, "loss": 0.8817, "step": 761 }, { "epoch": 4.15825375170532, "grad_norm": 0.13808109230175336, "learning_rate": 3.3151470699855226e-06, "loss": 0.8784, "step": 762 }, { "epoch": 4.163710777626194, "grad_norm": 0.1322332471365764, "learning_rate": 3.2731723450604047e-06, "loss": 0.8905, "step": 763 }, { "epoch": 4.169167803547067, "grad_norm": 0.13389925768742733, "learning_rate": 3.23144135245943e-06, "loss": 0.8952, "step": 764 }, { "epoch": 4.17462482946794, "grad_norm": 0.13683886240243162, "learning_rate": 3.1899547002591548e-06, "loss": 0.8755, "step": 765 }, { "epoch": 4.1800818553888135, "grad_norm": 0.12558097802450152, "learning_rate": 3.148712992975773e-06, "loss": 0.8579, "step": 766 }, { "epoch": 4.185538881309686, "grad_norm": 0.13782990446140714, "learning_rate": 3.107716831556298e-06, "loss": 0.8929, "step": 767 }, { "epoch": 4.190995907230559, "grad_norm": 0.13743942655956906, "learning_rate": 3.0669668133698114e-06, "loss": 0.8627, "step": 768 }, { "epoch": 4.196452933151432, "grad_norm": 0.12989089669107465, "learning_rate": 3.026463532198767e-06, "loss": 0.8799, "step": 769 }, { "epoch": 4.201909959072306, "grad_norm": 0.15662525453225684, "learning_rate": 2.9862075782303155e-06, "loss": 0.8731, "step": 770 }, { "epoch": 4.207366984993179, "grad_norm": 0.1350418186415897, "learning_rate": 2.946199538047727e-06, "loss": 0.8602, "step": 771 }, { "epoch": 4.212824010914052, "grad_norm": 0.12498595424209477, "learning_rate": 2.9064399946218304e-06, "loss": 0.868, "step": 772 }, { "epoch": 4.218281036834925, "grad_norm": 0.21098257229096243, "learning_rate": 2.866929527302522e-06, "loss": 0.8883, "step": 773 }, { "epoch": 4.223738062755798, "grad_norm": 0.133123941008207, "learning_rate": 2.8276687118103384e-06, "loss": 0.8878, "step": 774 }, { "epoch": 4.229195088676671, "grad_norm": 0.1418691768230737, "learning_rate": 2.7886581202280338e-06, "loss": 0.8978, "step": 775 }, { "epoch": 4.234652114597544, "grad_norm": 0.14622777292208364, "learning_rate": 2.749898320992286e-06, "loss": 0.8855, "step": 776 }, { "epoch": 4.240109140518418, "grad_norm": 0.13868949813718004, "learning_rate": 2.711389878885371e-06, "loss": 0.8782, "step": 777 }, { "epoch": 4.245566166439291, "grad_norm": 0.12620162322262743, "learning_rate": 2.673133355026969e-06, "loss": 0.8742, "step": 778 }, { "epoch": 4.251023192360163, "grad_norm": 0.1271015484185532, "learning_rate": 2.6351293068659643e-06, "loss": 0.8748, "step": 779 }, { "epoch": 4.256480218281037, "grad_norm": 0.18196702435356202, "learning_rate": 2.597378288172332e-06, "loss": 0.8851, "step": 780 }, { "epoch": 4.26193724420191, "grad_norm": 0.16872955546686272, "learning_rate": 2.559880849029079e-06, "loss": 0.8802, "step": 781 }, { "epoch": 4.267394270122783, "grad_norm": 0.13072466845715314, "learning_rate": 2.5226375358242085e-06, "loss": 0.8877, "step": 782 }, { "epoch": 4.272851296043656, "grad_norm": 0.14754876950071485, "learning_rate": 2.485648891242767e-06, "loss": 0.8904, "step": 783 }, { "epoch": 4.27830832196453, "grad_norm": 0.1590434780138768, "learning_rate": 2.448915454258942e-06, "loss": 0.9032, "step": 784 }, { "epoch": 4.283765347885402, "grad_norm": 0.14151048099974572, "learning_rate": 2.412437760128199e-06, "loss": 0.8918, "step": 785 }, { "epoch": 4.289222373806275, "grad_norm": 0.13620855260975054, "learning_rate": 2.376216340379489e-06, "loss": 0.8845, "step": 786 }, { "epoch": 4.294679399727149, "grad_norm": 0.1901936677421411, "learning_rate": 2.3402517228075073e-06, "loss": 0.8851, "step": 787 }, { "epoch": 4.300136425648022, "grad_norm": 0.19671986170174766, "learning_rate": 2.3045444314649856e-06, "loss": 0.8678, "step": 788 }, { "epoch": 4.305593451568895, "grad_norm": 0.14899305081412742, "learning_rate": 2.2690949866550803e-06, "loss": 0.8893, "step": 789 }, { "epoch": 4.311050477489768, "grad_norm": 0.16143293820225038, "learning_rate": 2.2339039049237687e-06, "loss": 0.9024, "step": 790 }, { "epoch": 4.316507503410641, "grad_norm": 0.17932782006553405, "learning_rate": 2.19897169905233e-06, "loss": 0.8929, "step": 791 }, { "epoch": 4.321964529331514, "grad_norm": 0.14806731199839362, "learning_rate": 2.164298878049882e-06, "loss": 0.8662, "step": 792 }, { "epoch": 4.327421555252387, "grad_norm": 0.1289649779602983, "learning_rate": 2.1298859471459443e-06, "loss": 0.8813, "step": 793 }, { "epoch": 4.332878581173261, "grad_norm": 0.19801253886238948, "learning_rate": 2.0957334077831115e-06, "loss": 0.9005, "step": 794 }, { "epoch": 4.338335607094134, "grad_norm": 0.19694630717701755, "learning_rate": 2.0618417576097016e-06, "loss": 0.9052, "step": 795 }, { "epoch": 4.343792633015007, "grad_norm": 0.13211759110481675, "learning_rate": 2.028211490472538e-06, "loss": 0.8727, "step": 796 }, { "epoch": 4.34924965893588, "grad_norm": 0.16942182286893248, "learning_rate": 1.99484309640974e-06, "loss": 0.8939, "step": 797 }, { "epoch": 4.354706684856753, "grad_norm": 0.15525627631122169, "learning_rate": 1.9617370616435827e-06, "loss": 0.8769, "step": 798 }, { "epoch": 4.360163710777626, "grad_norm": 0.14568495391925143, "learning_rate": 1.9288938685734206e-06, "loss": 0.8801, "step": 799 }, { "epoch": 4.365620736698499, "grad_norm": 0.1482638689148959, "learning_rate": 1.8963139957686439e-06, "loss": 0.8865, "step": 800 }, { "epoch": 4.371077762619373, "grad_norm": 0.1698206452526069, "learning_rate": 1.863997917961724e-06, "loss": 0.8756, "step": 801 }, { "epoch": 4.376534788540246, "grad_norm": 0.15989049182819062, "learning_rate": 1.8319461060412735e-06, "loss": 0.8827, "step": 802 }, { "epoch": 4.381991814461118, "grad_norm": 0.12598713099232536, "learning_rate": 1.8001590270452007e-06, "loss": 0.8955, "step": 803 }, { "epoch": 4.387448840381992, "grad_norm": 0.13029377486709406, "learning_rate": 1.7686371441539041e-06, "loss": 0.8964, "step": 804 }, { "epoch": 4.392905866302865, "grad_norm": 0.15126612488881352, "learning_rate": 1.7373809166835131e-06, "loss": 0.8838, "step": 805 }, { "epoch": 4.398362892223738, "grad_norm": 0.13445739913334448, "learning_rate": 1.7063908000791984e-06, "loss": 0.8958, "step": 806 }, { "epoch": 4.403819918144611, "grad_norm": 0.16664103966071625, "learning_rate": 1.6756672459085565e-06, "loss": 0.8826, "step": 807 }, { "epoch": 4.409276944065485, "grad_norm": 0.1762539743129894, "learning_rate": 1.645210701854989e-06, "loss": 0.8785, "step": 808 }, { "epoch": 4.414733969986357, "grad_norm": 0.15934966334590775, "learning_rate": 1.615021611711216e-06, "loss": 0.8854, "step": 809 }, { "epoch": 4.42019099590723, "grad_norm": 0.124968014984558, "learning_rate": 1.5851004153727845e-06, "loss": 0.8788, "step": 810 }, { "epoch": 4.425648021828104, "grad_norm": 0.1385331750299138, "learning_rate": 1.5554475488316812e-06, "loss": 0.8916, "step": 811 }, { "epoch": 4.431105047748977, "grad_norm": 0.13707927403446576, "learning_rate": 1.5260634441699585e-06, "loss": 0.8742, "step": 812 }, { "epoch": 4.43656207366985, "grad_norm": 0.12627498054063097, "learning_rate": 1.496948529553457e-06, "loss": 0.887, "step": 813 }, { "epoch": 4.442019099590723, "grad_norm": 0.1486808085420501, "learning_rate": 1.468103229225546e-06, "loss": 0.8808, "step": 814 }, { "epoch": 4.447476125511596, "grad_norm": 0.14605688992873062, "learning_rate": 1.4395279635009595e-06, "loss": 0.8708, "step": 815 }, { "epoch": 4.452933151432469, "grad_norm": 0.13906895719296147, "learning_rate": 1.4112231487596618e-06, "loss": 0.8649, "step": 816 }, { "epoch": 4.458390177353342, "grad_norm": 0.11788044087197277, "learning_rate": 1.3831891974407862e-06, "loss": 0.8783, "step": 817 }, { "epoch": 4.463847203274216, "grad_norm": 0.11778162515868901, "learning_rate": 1.3554265180366177e-06, "loss": 0.91, "step": 818 }, { "epoch": 4.469304229195089, "grad_norm": 0.14670430911084376, "learning_rate": 1.3279355150866536e-06, "loss": 0.8694, "step": 819 }, { "epoch": 4.474761255115962, "grad_norm": 0.12231810737886735, "learning_rate": 1.3007165891716978e-06, "loss": 0.8519, "step": 820 }, { "epoch": 4.480218281036835, "grad_norm": 0.1271770535078628, "learning_rate": 1.2737701369080213e-06, "loss": 0.9097, "step": 821 }, { "epoch": 4.485675306957708, "grad_norm": 0.1385648556146423, "learning_rate": 1.2470965509415911e-06, "loss": 0.8968, "step": 822 }, { "epoch": 4.491132332878581, "grad_norm": 0.152826890949677, "learning_rate": 1.2206962199423478e-06, "loss": 0.8831, "step": 823 }, { "epoch": 4.496589358799454, "grad_norm": 0.12805457871619716, "learning_rate": 1.1945695285985437e-06, "loss": 0.9114, "step": 824 }, { "epoch": 4.502046384720328, "grad_norm": 0.1200667730156898, "learning_rate": 1.1687168576111251e-06, "loss": 0.897, "step": 825 }, { "epoch": 4.507503410641201, "grad_norm": 0.14528129682089547, "learning_rate": 1.1431385836882058e-06, "loss": 0.8645, "step": 826 }, { "epoch": 4.512960436562073, "grad_norm": 0.11909725664621254, "learning_rate": 1.1178350795395553e-06, "loss": 0.875, "step": 827 }, { "epoch": 4.5184174624829465, "grad_norm": 0.140122744341455, "learning_rate": 1.0928067138711817e-06, "loss": 0.8825, "step": 828 }, { "epoch": 4.52387448840382, "grad_norm": 0.15162334185887835, "learning_rate": 1.06805385137996e-06, "loss": 0.8794, "step": 829 }, { "epoch": 4.529331514324693, "grad_norm": 0.14150098143714812, "learning_rate": 1.0435768527483114e-06, "loss": 0.8937, "step": 830 }, { "epoch": 4.534788540245566, "grad_norm": 0.1260117766174468, "learning_rate": 1.019376074638949e-06, "loss": 0.8815, "step": 831 }, { "epoch": 4.54024556616644, "grad_norm": 0.12409915203852431, "learning_rate": 9.954518696896854e-07, "loss": 0.8834, "step": 832 }, { "epoch": 4.545702592087313, "grad_norm": 0.136871178123947, "learning_rate": 9.718045865082914e-07, "loss": 0.8793, "step": 833 }, { "epoch": 4.551159618008185, "grad_norm": 0.14107625789727535, "learning_rate": 9.484345696674135e-07, "loss": 0.9022, "step": 834 }, { "epoch": 4.5566166439290585, "grad_norm": 0.15144418956800026, "learning_rate": 9.253421596995538e-07, "loss": 0.8668, "step": 835 }, { "epoch": 4.562073669849932, "grad_norm": 0.14652192158608265, "learning_rate": 9.025276930921168e-07, "loss": 0.8952, "step": 836 }, { "epoch": 4.567530695770805, "grad_norm": 0.1414596872944082, "learning_rate": 8.799915022824912e-07, "loss": 0.89, "step": 837 }, { "epoch": 4.572987721691678, "grad_norm": 0.11762183453991368, "learning_rate": 8.577339156532228e-07, "loss": 0.8891, "step": 838 }, { "epoch": 4.578444747612551, "grad_norm": 0.11696302332812643, "learning_rate": 8.35755257527211e-07, "loss": 0.8865, "step": 839 }, { "epoch": 4.583901773533424, "grad_norm": 0.14096367257161824, "learning_rate": 8.140558481629978e-07, "loss": 0.883, "step": 840 }, { "epoch": 4.589358799454297, "grad_norm": 0.13025076800848065, "learning_rate": 7.92636003750098e-07, "loss": 0.861, "step": 841 }, { "epoch": 4.5948158253751705, "grad_norm": 0.13357496841366284, "learning_rate": 7.714960364043844e-07, "loss": 0.8917, "step": 842 }, { "epoch": 4.600272851296044, "grad_norm": 0.12340312706447396, "learning_rate": 7.506362541635482e-07, "loss": 0.8899, "step": 843 }, { "epoch": 4.605729877216917, "grad_norm": 0.14740826905136978, "learning_rate": 7.300569609826103e-07, "loss": 0.9164, "step": 844 }, { "epoch": 4.61118690313779, "grad_norm": 0.12349556901637186, "learning_rate": 7.097584567294858e-07, "loss": 0.9002, "step": 845 }, { "epoch": 4.616643929058663, "grad_norm": 0.12502508254095465, "learning_rate": 6.897410371806202e-07, "loss": 0.8966, "step": 846 }, { "epoch": 4.622100954979536, "grad_norm": 0.1211951289522415, "learning_rate": 6.70004994016673e-07, "loss": 0.8834, "step": 847 }, { "epoch": 4.627557980900409, "grad_norm": 0.12281440993768762, "learning_rate": 6.505506148182816e-07, "loss": 0.8871, "step": 848 }, { "epoch": 4.6330150068212825, "grad_norm": 0.12502962964548078, "learning_rate": 6.313781830618549e-07, "loss": 0.8767, "step": 849 }, { "epoch": 4.638472032742156, "grad_norm": 0.12943087930152467, "learning_rate": 6.124879781154458e-07, "loss": 0.875, "step": 850 }, { "epoch": 4.643929058663028, "grad_norm": 0.14031219133585143, "learning_rate": 5.938802752346972e-07, "loss": 0.8927, "step": 851 }, { "epoch": 4.6493860845839015, "grad_norm": 0.12542619019610873, "learning_rate": 5.755553455588025e-07, "loss": 0.8876, "step": 852 }, { "epoch": 4.654843110504775, "grad_norm": 0.12638050340038925, "learning_rate": 5.575134561065798e-07, "loss": 0.8665, "step": 853 }, { "epoch": 4.660300136425648, "grad_norm": 0.12333899862571804, "learning_rate": 5.397548697725686e-07, "loss": 0.8903, "step": 854 }, { "epoch": 4.665757162346521, "grad_norm": 0.1346635192449993, "learning_rate": 5.22279845323197e-07, "loss": 0.8725, "step": 855 }, { "epoch": 4.6712141882673945, "grad_norm": 0.13647426558512074, "learning_rate": 5.050886373930231e-07, "loss": 0.8875, "step": 856 }, { "epoch": 4.676671214188268, "grad_norm": 0.11566165671981071, "learning_rate": 4.881814964810172e-07, "loss": 0.8749, "step": 857 }, { "epoch": 4.68212824010914, "grad_norm": 0.12363326959711636, "learning_rate": 4.715586689469054e-07, "loss": 0.8769, "step": 858 }, { "epoch": 4.6875852660300135, "grad_norm": 0.11764168862039581, "learning_rate": 4.552203970075941e-07, "loss": 0.8918, "step": 859 }, { "epoch": 4.693042291950887, "grad_norm": 0.11426043356029422, "learning_rate": 4.391669187336267e-07, "loss": 0.89, "step": 860 }, { "epoch": 4.69849931787176, "grad_norm": 0.11108719415747546, "learning_rate": 4.2339846804572596e-07, "loss": 0.8804, "step": 861 }, { "epoch": 4.703956343792633, "grad_norm": 0.12913366570368975, "learning_rate": 4.079152747113746e-07, "loss": 0.8803, "step": 862 }, { "epoch": 4.709413369713506, "grad_norm": 0.11303274704940805, "learning_rate": 3.9271756434147825e-07, "loss": 0.8707, "step": 863 }, { "epoch": 4.714870395634379, "grad_norm": 0.12668292386761498, "learning_rate": 3.778055583870677e-07, "loss": 0.8615, "step": 864 }, { "epoch": 4.720327421555252, "grad_norm": 0.12160210451896335, "learning_rate": 3.631794741360839e-07, "loss": 0.8749, "step": 865 }, { "epoch": 4.7257844474761255, "grad_norm": 0.12255220583052599, "learning_rate": 3.4883952471019833e-07, "loss": 0.8656, "step": 866 }, { "epoch": 4.731241473396999, "grad_norm": 0.13097406090149366, "learning_rate": 3.347859190617153e-07, "loss": 0.9104, "step": 867 }, { "epoch": 4.736698499317872, "grad_norm": 0.13879798939381358, "learning_rate": 3.210188619705257e-07, "loss": 0.8932, "step": 868 }, { "epoch": 4.742155525238745, "grad_norm": 0.10869652399193062, "learning_rate": 3.0753855404112907e-07, "loss": 0.8617, "step": 869 }, { "epoch": 4.747612551159618, "grad_norm": 0.14405688232051542, "learning_rate": 2.943451916997009e-07, "loss": 0.8849, "step": 870 }, { "epoch": 4.753069577080491, "grad_norm": 0.10950574849744894, "learning_rate": 2.814389671912321e-07, "loss": 0.8894, "step": 871 }, { "epoch": 4.758526603001364, "grad_norm": 0.12549196376105284, "learning_rate": 2.6882006857672946e-07, "loss": 0.8666, "step": 872 }, { "epoch": 4.7639836289222375, "grad_norm": 0.12049040986742628, "learning_rate": 2.564886797304844e-07, "loss": 0.8925, "step": 873 }, { "epoch": 4.769440654843111, "grad_norm": 0.13628451786758633, "learning_rate": 2.444449803373772e-07, "loss": 0.8736, "step": 874 }, { "epoch": 4.774897680763983, "grad_norm": 0.11819888541785435, "learning_rate": 2.3268914589026582e-07, "loss": 0.876, "step": 875 }, { "epoch": 4.780354706684856, "grad_norm": 0.13249771798161303, "learning_rate": 2.212213476874392e-07, "loss": 0.8721, "step": 876 }, { "epoch": 4.78581173260573, "grad_norm": 0.16019402654757406, "learning_rate": 2.100417528301013e-07, "loss": 0.8574, "step": 877 }, { "epoch": 4.791268758526603, "grad_norm": 0.12709978273924896, "learning_rate": 1.9915052421995095e-07, "loss": 0.8788, "step": 878 }, { "epoch": 4.796725784447476, "grad_norm": 0.1269380086439239, "learning_rate": 1.8854782055680588e-07, "loss": 0.8856, "step": 879 }, { "epoch": 4.8021828103683495, "grad_norm": 0.12816104723044472, "learning_rate": 1.7823379633628236e-07, "loss": 0.8682, "step": 880 }, { "epoch": 4.807639836289223, "grad_norm": 0.11897350073492513, "learning_rate": 1.6820860184755705e-07, "loss": 0.8893, "step": 881 }, { "epoch": 4.813096862210095, "grad_norm": 0.1676441632472798, "learning_rate": 1.584723831711621e-07, "loss": 0.8827, "step": 882 }, { "epoch": 4.818553888130968, "grad_norm": 0.11260862448875701, "learning_rate": 1.4902528217687339e-07, "loss": 0.8668, "step": 883 }, { "epoch": 4.824010914051842, "grad_norm": 0.11073774182348436, "learning_rate": 1.398674365216235e-07, "loss": 0.8985, "step": 884 }, { "epoch": 4.829467939972715, "grad_norm": 0.11457097434689421, "learning_rate": 1.309989796475164e-07, "loss": 0.8671, "step": 885 }, { "epoch": 4.834924965893588, "grad_norm": 0.11278616103798808, "learning_rate": 1.22420040779867e-07, "loss": 0.8627, "step": 886 }, { "epoch": 4.8403819918144615, "grad_norm": 0.11658568907087213, "learning_rate": 1.1413074492532927e-07, "loss": 0.8698, "step": 887 }, { "epoch": 4.845839017735334, "grad_norm": 0.12477104191258748, "learning_rate": 1.06131212870062e-07, "loss": 0.8972, "step": 888 }, { "epoch": 4.851296043656207, "grad_norm": 0.11569142614675672, "learning_rate": 9.842156117798817e-08, "loss": 0.8808, "step": 889 }, { "epoch": 4.85675306957708, "grad_norm": 0.15639110653049954, "learning_rate": 9.10019021890718e-08, "loss": 0.8757, "step": 890 }, { "epoch": 4.862210095497954, "grad_norm": 0.1414742721261049, "learning_rate": 8.387234401770361e-08, "loss": 0.884, "step": 891 }, { "epoch": 4.867667121418827, "grad_norm": 0.11635396600358816, "learning_rate": 7.703299055111357e-08, "loss": 0.9047, "step": 892 }, { "epoch": 4.8731241473397, "grad_norm": 0.1168433074137953, "learning_rate": 7.048394144785863e-08, "loss": 0.8669, "step": 893 }, { "epoch": 4.878581173260573, "grad_norm": 0.11125431182410457, "learning_rate": 6.422529213637063e-08, "loss": 0.8713, "step": 894 }, { "epoch": 4.884038199181446, "grad_norm": 0.11391519788296704, "learning_rate": 5.8257133813570675e-08, "loss": 0.8851, "step": 895 }, { "epoch": 4.889495225102319, "grad_norm": 0.1050397723513658, "learning_rate": 5.257955344353471e-08, "loss": 0.8742, "step": 896 }, { "epoch": 4.894952251023192, "grad_norm": 0.110886347004846, "learning_rate": 4.71926337562234e-08, "loss": 0.8835, "step": 897 }, { "epoch": 4.900409276944066, "grad_norm": 0.13022649928545438, "learning_rate": 4.2096453246287526e-08, "loss": 0.8798, "step": 898 }, { "epoch": 4.905866302864939, "grad_norm": 0.11447299895739564, "learning_rate": 3.729108617191557e-08, "loss": 0.8915, "step": 899 }, { "epoch": 4.911323328785811, "grad_norm": 0.1226591978951474, "learning_rate": 3.277660255375237e-08, "loss": 0.9051, "step": 900 }, { "epoch": 4.916780354706685, "grad_norm": 0.11938229502321866, "learning_rate": 2.855306817388659e-08, "loss": 0.8961, "step": 901 }, { "epoch": 4.922237380627558, "grad_norm": 0.13559091937945114, "learning_rate": 2.462054457487595e-08, "loss": 0.8778, "step": 902 }, { "epoch": 4.927694406548431, "grad_norm": 0.12486180567731954, "learning_rate": 2.097908905887014e-08, "loss": 0.8877, "step": 903 }, { "epoch": 4.933151432469304, "grad_norm": 0.12698382896355306, "learning_rate": 1.7628754686760397e-08, "loss": 0.8837, "step": 904 }, { "epoch": 4.938608458390178, "grad_norm": 0.10840982827247776, "learning_rate": 1.4569590277413447e-08, "loss": 0.8738, "step": 905 }, { "epoch": 4.94406548431105, "grad_norm": 0.11341918799763352, "learning_rate": 1.1801640406963188e-08, "loss": 0.8731, "step": 906 }, { "epoch": 4.949522510231923, "grad_norm": 0.11594299163597076, "learning_rate": 9.32494540815121e-09, "loss": 0.8704, "step": 907 }, { "epoch": 4.954979536152797, "grad_norm": 0.10946767908645595, "learning_rate": 7.13954136974504e-09, "loss": 0.8916, "step": 908 }, { "epoch": 4.96043656207367, "grad_norm": 0.1131154133991995, "learning_rate": 5.245460136018565e-09, "loss": 0.8931, "step": 909 }, { "epoch": 4.965893587994543, "grad_norm": 0.11676272543288499, "learning_rate": 3.6427293062724077e-09, "loss": 0.8906, "step": 910 }, { "epoch": 4.971350613915416, "grad_norm": 0.11538578325055797, "learning_rate": 2.3313722344497914e-09, "loss": 0.8779, "step": 911 }, { "epoch": 4.97680763983629, "grad_norm": 0.1264648783936699, "learning_rate": 1.3114080287790488e-09, "loss": 0.8652, "step": 912 }, { "epoch": 4.982264665757162, "grad_norm": 0.11491863538487673, "learning_rate": 5.828515515116096e-10, "loss": 0.8722, "step": 913 }, { "epoch": 4.987721691678035, "grad_norm": 0.1086827877742504, "learning_rate": 1.457134186866327e-10, "loss": 0.9013, "step": 914 }, { "epoch": 4.993178717598909, "grad_norm": 0.11831005135790107, "learning_rate": 0.0, "loss": 0.8894, "step": 915 }, { "epoch": 4.993178717598909, "step": 915, "total_flos": 1.883960626772548e+19, "train_loss": 0.9399711781512192, "train_runtime": 49360.7108, "train_samples_per_second": 9.497, "train_steps_per_second": 0.019 } ], "logging_steps": 1.0, "max_steps": 915, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.883960626772548e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }