{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.95475113122172, "eval_steps": 500, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01809954751131222, "grad_norm": 1.6741957199641677, "learning_rate": 8.333333333333333e-07, "loss": 0.392, "step": 1 }, { "epoch": 0.03619909502262444, "grad_norm": 1.526970859287005, "learning_rate": 1.6666666666666667e-06, "loss": 0.3479, "step": 2 }, { "epoch": 0.05429864253393665, "grad_norm": 1.8103690939719148, "learning_rate": 2.5e-06, "loss": 0.363, "step": 3 }, { "epoch": 0.07239819004524888, "grad_norm": 1.568077888738942, "learning_rate": 3.3333333333333333e-06, "loss": 0.3513, "step": 4 }, { "epoch": 0.09049773755656108, "grad_norm": 1.668945098216231, "learning_rate": 4.166666666666667e-06, "loss": 0.3759, "step": 5 }, { "epoch": 0.1085972850678733, "grad_norm": 1.3864660758192329, "learning_rate": 5e-06, "loss": 0.3525, "step": 6 }, { "epoch": 0.12669683257918551, "grad_norm": 1.538592504007101, "learning_rate": 4.99995831202958e-06, "loss": 0.3904, "step": 7 }, { "epoch": 0.14479638009049775, "grad_norm": 1.2047351614977708, "learning_rate": 4.999833249508629e-06, "loss": 0.3924, "step": 8 }, { "epoch": 0.16289592760180996, "grad_norm": 1.0640124047316322, "learning_rate": 4.999624816608027e-06, "loss": 0.375, "step": 9 }, { "epoch": 0.18099547511312217, "grad_norm": 0.7966517341350207, "learning_rate": 4.999333020279094e-06, "loss": 0.356, "step": 10 }, { "epoch": 0.19909502262443438, "grad_norm": 0.4554353875165799, "learning_rate": 4.998957870253344e-06, "loss": 0.3598, "step": 11 }, { "epoch": 0.2171945701357466, "grad_norm": 0.6557533564712539, "learning_rate": 4.998499379042172e-06, "loss": 0.3392, "step": 12 }, { "epoch": 0.23529411764705882, "grad_norm": 0.7936629840883419, "learning_rate": 4.997957561936433e-06, "loss": 0.3691, "step": 13 }, { "epoch": 0.25339366515837103, "grad_norm": 0.7547277609627707, "learning_rate": 4.997332437005932e-06, "loss": 0.352, "step": 14 }, { "epoch": 0.27149321266968324, "grad_norm": 0.8087501558896228, "learning_rate": 4.996624025098819e-06, "loss": 0.3449, "step": 15 }, { "epoch": 0.2895927601809955, "grad_norm": 0.7820896976667914, "learning_rate": 4.9958323498409e-06, "loss": 0.3401, "step": 16 }, { "epoch": 0.3076923076923077, "grad_norm": 0.7431270073814646, "learning_rate": 4.99495743763484e-06, "loss": 0.3567, "step": 17 }, { "epoch": 0.3257918552036199, "grad_norm": 0.6777032410783791, "learning_rate": 4.993999317659293e-06, "loss": 0.3585, "step": 18 }, { "epoch": 0.3438914027149321, "grad_norm": 0.6196369624534765, "learning_rate": 4.9929580218679195e-06, "loss": 0.3293, "step": 19 }, { "epoch": 0.36199095022624433, "grad_norm": 0.5604472586874513, "learning_rate": 4.991833584988326e-06, "loss": 0.3437, "step": 20 }, { "epoch": 0.38009049773755654, "grad_norm": 0.5137629265098744, "learning_rate": 4.990626044520905e-06, "loss": 0.3249, "step": 21 }, { "epoch": 0.39819004524886875, "grad_norm": 0.547237003947588, "learning_rate": 4.989335440737587e-06, "loss": 0.3532, "step": 22 }, { "epoch": 0.416289592760181, "grad_norm": 0.4164415963578454, "learning_rate": 4.987961816680493e-06, "loss": 0.3533, "step": 23 }, { "epoch": 0.4343891402714932, "grad_norm": 0.35699522892651586, "learning_rate": 4.986505218160502e-06, "loss": 0.3268, "step": 24 }, { "epoch": 0.45248868778280543, "grad_norm": 0.4026088211790661, "learning_rate": 4.984965693755723e-06, "loss": 0.3332, "step": 25 }, { "epoch": 0.47058823529411764, "grad_norm": 0.35057192166080064, "learning_rate": 4.983343294809875e-06, "loss": 0.3245, "step": 26 }, { "epoch": 0.48868778280542985, "grad_norm": 0.3639947181438965, "learning_rate": 4.981638075430572e-06, "loss": 0.3199, "step": 27 }, { "epoch": 0.5067873303167421, "grad_norm": 0.3387354723957761, "learning_rate": 4.979850092487525e-06, "loss": 0.3282, "step": 28 }, { "epoch": 0.5248868778280543, "grad_norm": 0.3528078697583281, "learning_rate": 4.977979405610635e-06, "loss": 0.337, "step": 29 }, { "epoch": 0.5429864253393665, "grad_norm": 0.3126032062813636, "learning_rate": 4.976026077188013e-06, "loss": 0.3265, "step": 30 }, { "epoch": 0.5610859728506787, "grad_norm": 0.3584209299955196, "learning_rate": 4.973990172363899e-06, "loss": 0.3568, "step": 31 }, { "epoch": 0.579185520361991, "grad_norm": 0.4239503710543474, "learning_rate": 4.9718717590364855e-06, "loss": 0.3287, "step": 32 }, { "epoch": 0.5972850678733032, "grad_norm": 0.41156579276283284, "learning_rate": 4.969670907855651e-06, "loss": 0.3267, "step": 33 }, { "epoch": 0.6153846153846154, "grad_norm": 0.33536968371087267, "learning_rate": 4.967387692220615e-06, "loss": 0.3367, "step": 34 }, { "epoch": 0.6334841628959276, "grad_norm": 0.30272106018319034, "learning_rate": 4.965022188277474e-06, "loss": 0.3236, "step": 35 }, { "epoch": 0.6515837104072398, "grad_norm": 0.28697723150322, "learning_rate": 4.962574474916678e-06, "loss": 0.3236, "step": 36 }, { "epoch": 0.669683257918552, "grad_norm": 0.21062422377276369, "learning_rate": 4.960044633770387e-06, "loss": 0.3295, "step": 37 }, { "epoch": 0.6877828054298643, "grad_norm": 0.28283155334950705, "learning_rate": 4.957432749209755e-06, "loss": 0.3453, "step": 38 }, { "epoch": 0.7058823529411765, "grad_norm": 0.2161778814999892, "learning_rate": 4.954738908342116e-06, "loss": 0.3645, "step": 39 }, { "epoch": 0.7239819004524887, "grad_norm": 0.2354424408008659, "learning_rate": 4.9519632010080765e-06, "loss": 0.3372, "step": 40 }, { "epoch": 0.7420814479638009, "grad_norm": 0.26054770828411217, "learning_rate": 4.9491057197785205e-06, "loss": 0.3349, "step": 41 }, { "epoch": 0.7601809954751131, "grad_norm": 0.2596310001381547, "learning_rate": 4.946166559951523e-06, "loss": 0.3174, "step": 42 }, { "epoch": 0.7782805429864253, "grad_norm": 0.2763815562688228, "learning_rate": 4.943145819549169e-06, "loss": 0.3464, "step": 43 }, { "epoch": 0.7963800904977375, "grad_norm": 0.2508801820692124, "learning_rate": 4.9400435993142895e-06, "loss": 0.3277, "step": 44 }, { "epoch": 0.8144796380090498, "grad_norm": 0.25823275674527674, "learning_rate": 4.936860002707096e-06, "loss": 0.343, "step": 45 }, { "epoch": 0.832579185520362, "grad_norm": 0.23862916529933217, "learning_rate": 4.933595135901733e-06, "loss": 0.3425, "step": 46 }, { "epoch": 0.8506787330316742, "grad_norm": 0.2377285409864031, "learning_rate": 4.9302491077827366e-06, "loss": 0.3345, "step": 47 }, { "epoch": 0.8687782805429864, "grad_norm": 0.2054263655021643, "learning_rate": 4.926822029941406e-06, "loss": 0.3599, "step": 48 }, { "epoch": 0.8868778280542986, "grad_norm": 0.21857378026560212, "learning_rate": 4.923314016672075e-06, "loss": 0.3293, "step": 49 }, { "epoch": 0.9049773755656109, "grad_norm": 0.20834775020466292, "learning_rate": 4.919725184968307e-06, "loss": 0.3231, "step": 50 }, { "epoch": 0.9230769230769231, "grad_norm": 0.2000139484905926, "learning_rate": 4.9160556545189895e-06, "loss": 0.3248, "step": 51 }, { "epoch": 0.9411764705882353, "grad_norm": 0.24124485812118368, "learning_rate": 4.9123055477043454e-06, "loss": 0.3314, "step": 52 }, { "epoch": 0.9592760180995475, "grad_norm": 0.26803109191751107, "learning_rate": 4.908474989591846e-06, "loss": 0.3341, "step": 53 }, { "epoch": 0.9773755656108597, "grad_norm": 0.21490833872159623, "learning_rate": 4.904564107932048e-06, "loss": 0.3189, "step": 54 }, { "epoch": 0.995475113122172, "grad_norm": 0.22738113980709365, "learning_rate": 4.900573033154325e-06, "loss": 0.3198, "step": 55 }, { "epoch": 1.0135746606334841, "grad_norm": 0.1860953606536629, "learning_rate": 4.8965018983625245e-06, "loss": 0.3273, "step": 56 }, { "epoch": 1.0316742081447963, "grad_norm": 0.2170252756734204, "learning_rate": 4.8923508393305224e-06, "loss": 0.3058, "step": 57 }, { "epoch": 1.0497737556561086, "grad_norm": 0.19753070998453712, "learning_rate": 4.888119994497701e-06, "loss": 0.2949, "step": 58 }, { "epoch": 1.0678733031674208, "grad_norm": 0.21040212719480175, "learning_rate": 4.883809504964325e-06, "loss": 0.298, "step": 59 }, { "epoch": 1.085972850678733, "grad_norm": 0.20799415615187367, "learning_rate": 4.879419514486846e-06, "loss": 0.3201, "step": 60 }, { "epoch": 1.1040723981900453, "grad_norm": 0.19784508945913667, "learning_rate": 4.874950169473097e-06, "loss": 0.3338, "step": 61 }, { "epoch": 1.1221719457013575, "grad_norm": 0.20898074744097636, "learning_rate": 4.870401618977415e-06, "loss": 0.3053, "step": 62 }, { "epoch": 1.1402714932126696, "grad_norm": 0.21530409824217756, "learning_rate": 4.8657740146956724e-06, "loss": 0.3346, "step": 63 }, { "epoch": 1.1583710407239818, "grad_norm": 0.21656570481740497, "learning_rate": 4.8610675109602135e-06, "loss": 0.3175, "step": 64 }, { "epoch": 1.1764705882352942, "grad_norm": 0.18916438407683134, "learning_rate": 4.856282264734708e-06, "loss": 0.2973, "step": 65 }, { "epoch": 1.1945701357466063, "grad_norm": 0.19298959302885896, "learning_rate": 4.851418435608919e-06, "loss": 0.3328, "step": 66 }, { "epoch": 1.2126696832579185, "grad_norm": 0.19382840884955524, "learning_rate": 4.84647618579338e-06, "loss": 0.3233, "step": 67 }, { "epoch": 1.2307692307692308, "grad_norm": 0.22308099956654967, "learning_rate": 4.841455680113979e-06, "loss": 0.3401, "step": 68 }, { "epoch": 1.248868778280543, "grad_norm": 0.1908581730308582, "learning_rate": 4.836357086006471e-06, "loss": 0.3199, "step": 69 }, { "epoch": 1.2669683257918551, "grad_norm": 0.1900661127768816, "learning_rate": 4.83118057351089e-06, "loss": 0.3193, "step": 70 }, { "epoch": 1.2850678733031673, "grad_norm": 0.1842083788274683, "learning_rate": 4.825926315265874e-06, "loss": 0.3093, "step": 71 }, { "epoch": 1.3031674208144797, "grad_norm": 0.19304820753044424, "learning_rate": 4.820594486502913e-06, "loss": 0.3147, "step": 72 }, { "epoch": 1.3212669683257918, "grad_norm": 0.1865184743330753, "learning_rate": 4.815185265040504e-06, "loss": 0.3371, "step": 73 }, { "epoch": 1.3393665158371042, "grad_norm": 0.21257371675686554, "learning_rate": 4.809698831278217e-06, "loss": 0.3556, "step": 74 }, { "epoch": 1.3574660633484164, "grad_norm": 0.19738810108074692, "learning_rate": 4.804135368190684e-06, "loss": 0.3098, "step": 75 }, { "epoch": 1.3755656108597285, "grad_norm": 0.20419379710110824, "learning_rate": 4.798495061321492e-06, "loss": 0.3037, "step": 76 }, { "epoch": 1.3936651583710407, "grad_norm": 0.21182701854581448, "learning_rate": 4.792778098776997e-06, "loss": 0.3046, "step": 77 }, { "epoch": 1.4117647058823528, "grad_norm": 0.20966701782750055, "learning_rate": 4.786984671220053e-06, "loss": 0.3146, "step": 78 }, { "epoch": 1.4298642533936652, "grad_norm": 0.2228994463496351, "learning_rate": 4.7811149718636475e-06, "loss": 0.3133, "step": 79 }, { "epoch": 1.4479638009049773, "grad_norm": 0.2125517747018847, "learning_rate": 4.7751691964644655e-06, "loss": 0.3181, "step": 80 }, { "epoch": 1.4660633484162897, "grad_norm": 0.18774294015726306, "learning_rate": 4.7691475433163515e-06, "loss": 0.3107, "step": 81 }, { "epoch": 1.4841628959276019, "grad_norm": 0.2105655304494509, "learning_rate": 4.763050213243705e-06, "loss": 0.3193, "step": 82 }, { "epoch": 1.502262443438914, "grad_norm": 0.2101302949838479, "learning_rate": 4.7568774095947804e-06, "loss": 0.3372, "step": 83 }, { "epoch": 1.5203619909502262, "grad_norm": 0.1761520660073366, "learning_rate": 4.7506293382349e-06, "loss": 0.3058, "step": 84 }, { "epoch": 1.5384615384615383, "grad_norm": 0.20214706457289192, "learning_rate": 4.744306207539595e-06, "loss": 0.34, "step": 85 }, { "epoch": 1.5565610859728507, "grad_norm": 0.21608846929666756, "learning_rate": 4.737908228387656e-06, "loss": 0.3285, "step": 86 }, { "epoch": 1.5746606334841629, "grad_norm": 0.19692503921435273, "learning_rate": 4.731435614154094e-06, "loss": 0.3134, "step": 87 }, { "epoch": 1.5927601809954752, "grad_norm": 0.19107736826101185, "learning_rate": 4.72488858070303e-06, "loss": 0.305, "step": 88 }, { "epoch": 1.6108597285067874, "grad_norm": 0.19148405595657123, "learning_rate": 4.718267346380492e-06, "loss": 0.3157, "step": 89 }, { "epoch": 1.6289592760180995, "grad_norm": 0.19180277215162053, "learning_rate": 4.711572132007139e-06, "loss": 0.3124, "step": 90 }, { "epoch": 1.6470588235294117, "grad_norm": 0.19539080957269014, "learning_rate": 4.704803160870888e-06, "loss": 0.3306, "step": 91 }, { "epoch": 1.6651583710407238, "grad_norm": 0.21052797618402563, "learning_rate": 4.697960658719475e-06, "loss": 0.3061, "step": 92 }, { "epoch": 1.6832579185520362, "grad_norm": 0.20191616959818315, "learning_rate": 4.69104485375292e-06, "loss": 0.3098, "step": 93 }, { "epoch": 1.7013574660633484, "grad_norm": 0.2159013380308242, "learning_rate": 4.684055976615924e-06, "loss": 0.3088, "step": 94 }, { "epoch": 1.7194570135746607, "grad_norm": 0.18904626555927467, "learning_rate": 4.676994260390168e-06, "loss": 0.2912, "step": 95 }, { "epoch": 1.737556561085973, "grad_norm": 0.19467640291002175, "learning_rate": 4.6698599405865465e-06, "loss": 0.303, "step": 96 }, { "epoch": 1.755656108597285, "grad_norm": 0.2880548104749461, "learning_rate": 4.662653255137308e-06, "loss": 0.3348, "step": 97 }, { "epoch": 1.7737556561085972, "grad_norm": 0.2019155699824381, "learning_rate": 4.655374444388127e-06, "loss": 0.327, "step": 98 }, { "epoch": 1.7918552036199094, "grad_norm": 0.2592156259533593, "learning_rate": 4.648023751090079e-06, "loss": 0.3363, "step": 99 }, { "epoch": 1.8099547511312217, "grad_norm": 0.2180192099378802, "learning_rate": 4.640601420391554e-06, "loss": 0.3113, "step": 100 }, { "epoch": 1.8280542986425339, "grad_norm": 0.20679493678747934, "learning_rate": 4.633107699830073e-06, "loss": 0.3148, "step": 101 }, { "epoch": 1.8461538461538463, "grad_norm": 0.2053440368213778, "learning_rate": 4.625542839324036e-06, "loss": 0.2967, "step": 102 }, { "epoch": 1.8642533936651584, "grad_norm": 0.19200611510261656, "learning_rate": 4.617907091164389e-06, "loss": 0.3188, "step": 103 }, { "epoch": 1.8823529411764706, "grad_norm": 0.2302101510970096, "learning_rate": 4.610200710006206e-06, "loss": 0.3121, "step": 104 }, { "epoch": 1.9004524886877827, "grad_norm": 0.2221804604843677, "learning_rate": 4.602423952860199e-06, "loss": 0.3146, "step": 105 }, { "epoch": 1.9185520361990949, "grad_norm": 0.21983834708053807, "learning_rate": 4.594577079084146e-06, "loss": 0.3405, "step": 106 }, { "epoch": 1.9366515837104072, "grad_norm": 0.21085636909889235, "learning_rate": 4.58666035037424e-06, "loss": 0.3089, "step": 107 }, { "epoch": 1.9547511312217196, "grad_norm": 0.2016884181282795, "learning_rate": 4.578674030756364e-06, "loss": 0.3229, "step": 108 }, { "epoch": 1.9728506787330318, "grad_norm": 0.19657023773974253, "learning_rate": 4.57061838657728e-06, "loss": 0.3237, "step": 109 }, { "epoch": 1.990950226244344, "grad_norm": 0.20813455436587358, "learning_rate": 4.562493686495756e-06, "loss": 0.3255, "step": 110 }, { "epoch": 2.009049773755656, "grad_norm": 0.18872409832307335, "learning_rate": 4.5543002014735955e-06, "loss": 0.2988, "step": 111 }, { "epoch": 2.0271493212669682, "grad_norm": 0.19594421270270285, "learning_rate": 4.546038204766609e-06, "loss": 0.3109, "step": 112 }, { "epoch": 2.0452488687782804, "grad_norm": 0.22355285614452686, "learning_rate": 4.537707971915495e-06, "loss": 0.3066, "step": 113 }, { "epoch": 2.0633484162895925, "grad_norm": 0.2017792264758022, "learning_rate": 4.529309780736654e-06, "loss": 0.2939, "step": 114 }, { "epoch": 2.081447963800905, "grad_norm": 0.20223483022494018, "learning_rate": 4.520843911312922e-06, "loss": 0.294, "step": 115 }, { "epoch": 2.0995475113122173, "grad_norm": 0.20322098664858632, "learning_rate": 4.512310645984231e-06, "loss": 0.2984, "step": 116 }, { "epoch": 2.1176470588235294, "grad_norm": 0.20705072743185104, "learning_rate": 4.503710269338191e-06, "loss": 0.2694, "step": 117 }, { "epoch": 2.1357466063348416, "grad_norm": 0.18442012590242893, "learning_rate": 4.4950430682005995e-06, "loss": 0.2979, "step": 118 }, { "epoch": 2.1538461538461537, "grad_norm": 0.2076635780792367, "learning_rate": 4.486309331625877e-06, "loss": 0.2874, "step": 119 }, { "epoch": 2.171945701357466, "grad_norm": 0.19968964517363474, "learning_rate": 4.477509350887424e-06, "loss": 0.291, "step": 120 }, { "epoch": 2.1900452488687785, "grad_norm": 0.18959726179400077, "learning_rate": 4.468643419467909e-06, "loss": 0.2921, "step": 121 }, { "epoch": 2.2081447963800906, "grad_norm": 0.2388780187488927, "learning_rate": 4.459711833049485e-06, "loss": 0.3061, "step": 122 }, { "epoch": 2.226244343891403, "grad_norm": 0.22092548916393367, "learning_rate": 4.4507148895039165e-06, "loss": 0.2765, "step": 123 }, { "epoch": 2.244343891402715, "grad_norm": 0.21070917452514223, "learning_rate": 4.4416528888826595e-06, "loss": 0.2969, "step": 124 }, { "epoch": 2.262443438914027, "grad_norm": 0.19807472108481627, "learning_rate": 4.432526133406843e-06, "loss": 0.3044, "step": 125 }, { "epoch": 2.2805429864253393, "grad_norm": 0.1910225174641335, "learning_rate": 4.423334927457198e-06, "loss": 0.3132, "step": 126 }, { "epoch": 2.2986425339366514, "grad_norm": 0.2203923882052516, "learning_rate": 4.414079577563901e-06, "loss": 0.3032, "step": 127 }, { "epoch": 2.3167420814479636, "grad_norm": 0.21331518168793756, "learning_rate": 4.404760392396355e-06, "loss": 0.3033, "step": 128 }, { "epoch": 2.334841628959276, "grad_norm": 0.21461268917839496, "learning_rate": 4.3953776827528925e-06, "loss": 0.3039, "step": 129 }, { "epoch": 2.3529411764705883, "grad_norm": 0.1862241130798519, "learning_rate": 4.385931761550411e-06, "loss": 0.2793, "step": 130 }, { "epoch": 2.3710407239819005, "grad_norm": 0.19779667332990994, "learning_rate": 4.376422943813936e-06, "loss": 0.2849, "step": 131 }, { "epoch": 2.3891402714932126, "grad_norm": 0.20538470648954774, "learning_rate": 4.366851546666118e-06, "loss": 0.3129, "step": 132 }, { "epoch": 2.4072398190045248, "grad_norm": 0.20067043214876432, "learning_rate": 4.357217889316657e-06, "loss": 0.3041, "step": 133 }, { "epoch": 2.425339366515837, "grad_norm": 0.1997136625573991, "learning_rate": 4.3475222930516484e-06, "loss": 0.2839, "step": 134 }, { "epoch": 2.4434389140271495, "grad_norm": 0.20004099038403145, "learning_rate": 4.3377650812228765e-06, "loss": 0.3014, "step": 135 }, { "epoch": 2.4615384615384617, "grad_norm": 0.19311135694466858, "learning_rate": 4.327946579237028e-06, "loss": 0.2834, "step": 136 }, { "epoch": 2.479638009049774, "grad_norm": 0.21078445039076968, "learning_rate": 4.318067114544838e-06, "loss": 0.2796, "step": 137 }, { "epoch": 2.497737556561086, "grad_norm": 0.21975365365759061, "learning_rate": 4.308127016630176e-06, "loss": 0.2972, "step": 138 }, { "epoch": 2.515837104072398, "grad_norm": 0.21203142423348517, "learning_rate": 4.2981266169990436e-06, "loss": 0.3196, "step": 139 }, { "epoch": 2.5339366515837103, "grad_norm": 0.20131092451024465, "learning_rate": 4.2880662491685345e-06, "loss": 0.3003, "step": 140 }, { "epoch": 2.5520361990950224, "grad_norm": 0.22294798360675439, "learning_rate": 4.277946248655701e-06, "loss": 0.2947, "step": 141 }, { "epoch": 2.5701357466063346, "grad_norm": 0.22859386995564024, "learning_rate": 4.267766952966369e-06, "loss": 0.2958, "step": 142 }, { "epoch": 2.588235294117647, "grad_norm": 0.19567845392715985, "learning_rate": 4.257528701583882e-06, "loss": 0.2998, "step": 143 }, { "epoch": 2.6063348416289593, "grad_norm": 0.19741413456031112, "learning_rate": 4.247231835957773e-06, "loss": 0.3408, "step": 144 }, { "epoch": 2.6244343891402715, "grad_norm": 0.19905612890447116, "learning_rate": 4.236876699492391e-06, "loss": 0.3117, "step": 145 }, { "epoch": 2.6425339366515836, "grad_norm": 0.1942385041095113, "learning_rate": 4.226463637535429e-06, "loss": 0.3152, "step": 146 }, { "epoch": 2.660633484162896, "grad_norm": 0.22327732166814804, "learning_rate": 4.215992997366425e-06, "loss": 0.3142, "step": 147 }, { "epoch": 2.6787330316742084, "grad_norm": 0.1935161282714164, "learning_rate": 4.2054651281851685e-06, "loss": 0.3081, "step": 148 }, { "epoch": 2.6968325791855206, "grad_norm": 0.23957566926280122, "learning_rate": 4.1948803811000585e-06, "loss": 0.2894, "step": 149 }, { "epoch": 2.7149321266968327, "grad_norm": 0.18805009890662516, "learning_rate": 4.184239109116393e-06, "loss": 0.2984, "step": 150 }, { "epoch": 2.733031674208145, "grad_norm": 0.212580814141281, "learning_rate": 4.173541667124599e-06, "loss": 0.3097, "step": 151 }, { "epoch": 2.751131221719457, "grad_norm": 0.19712271093008257, "learning_rate": 4.1627884118883925e-06, "loss": 0.3177, "step": 152 }, { "epoch": 2.769230769230769, "grad_norm": 0.2278946892968003, "learning_rate": 4.1519797020328815e-06, "loss": 0.3101, "step": 153 }, { "epoch": 2.7873303167420813, "grad_norm": 0.21064766645861627, "learning_rate": 4.141115898032607e-06, "loss": 0.274, "step": 154 }, { "epoch": 2.8054298642533935, "grad_norm": 0.20995612915210915, "learning_rate": 4.130197362199521e-06, "loss": 0.2926, "step": 155 }, { "epoch": 2.8235294117647056, "grad_norm": 0.21633523471290103, "learning_rate": 4.119224458670905e-06, "loss": 0.2875, "step": 156 }, { "epoch": 2.841628959276018, "grad_norm": 0.21266765467202223, "learning_rate": 4.1081975533972185e-06, "loss": 0.2947, "step": 157 }, { "epoch": 2.8597285067873304, "grad_norm": 0.19506346084116072, "learning_rate": 4.097117014129903e-06, "loss": 0.296, "step": 158 }, { "epoch": 2.8778280542986425, "grad_norm": 0.1986031276610744, "learning_rate": 4.085983210409114e-06, "loss": 0.2988, "step": 159 }, { "epoch": 2.8959276018099547, "grad_norm": 0.22662474336309782, "learning_rate": 4.074796513551395e-06, "loss": 0.2952, "step": 160 }, { "epoch": 2.914027149321267, "grad_norm": 0.21721813582738397, "learning_rate": 4.063557296637295e-06, "loss": 0.3099, "step": 161 }, { "epoch": 2.9321266968325794, "grad_norm": 0.2133328804989817, "learning_rate": 4.052265934498929e-06, "loss": 0.2974, "step": 162 }, { "epoch": 2.9502262443438916, "grad_norm": 0.1960218423105953, "learning_rate": 4.040922803707474e-06, "loss": 0.3065, "step": 163 }, { "epoch": 2.9683257918552037, "grad_norm": 0.22167341080572722, "learning_rate": 4.029528282560609e-06, "loss": 0.2886, "step": 164 }, { "epoch": 2.986425339366516, "grad_norm": 0.20386239234209946, "learning_rate": 4.018082751069904e-06, "loss": 0.3076, "step": 165 }, { "epoch": 3.004524886877828, "grad_norm": 0.23748187918298697, "learning_rate": 4.006586590948141e-06, "loss": 0.2985, "step": 166 }, { "epoch": 3.02262443438914, "grad_norm": 0.22617083435609797, "learning_rate": 3.995040185596588e-06, "loss": 0.2754, "step": 167 }, { "epoch": 3.0407239819004523, "grad_norm": 0.23986769037952196, "learning_rate": 3.983443920092206e-06, "loss": 0.2854, "step": 168 }, { "epoch": 3.0588235294117645, "grad_norm": 0.20150185345396, "learning_rate": 3.971798181174816e-06, "loss": 0.2832, "step": 169 }, { "epoch": 3.076923076923077, "grad_norm": 0.20913884879987113, "learning_rate": 3.960103357234192e-06, "loss": 0.2986, "step": 170 }, { "epoch": 3.0950226244343892, "grad_norm": 0.20477890710932672, "learning_rate": 3.948359838297115e-06, "loss": 0.2876, "step": 171 }, { "epoch": 3.1131221719457014, "grad_norm": 0.2031405516319826, "learning_rate": 3.9365680160143595e-06, "loss": 0.2971, "step": 172 }, { "epoch": 3.1312217194570136, "grad_norm": 0.185634722744017, "learning_rate": 3.924728283647638e-06, "loss": 0.279, "step": 173 }, { "epoch": 3.1493212669683257, "grad_norm": 0.20745314489894484, "learning_rate": 3.91284103605648e-06, "loss": 0.2903, "step": 174 }, { "epoch": 3.167420814479638, "grad_norm": 0.20741649089082642, "learning_rate": 3.9009066696850664e-06, "loss": 0.2964, "step": 175 }, { "epoch": 3.1855203619909505, "grad_norm": 0.20883578071304365, "learning_rate": 3.888925582549006e-06, "loss": 0.2946, "step": 176 }, { "epoch": 3.2036199095022626, "grad_norm": 0.21451927304435986, "learning_rate": 3.8768981742220646e-06, "loss": 0.2811, "step": 177 }, { "epoch": 3.2217194570135748, "grad_norm": 0.21080586953456093, "learning_rate": 3.864824845822837e-06, "loss": 0.2825, "step": 178 }, { "epoch": 3.239819004524887, "grad_norm": 0.20609867837665838, "learning_rate": 3.852706000001367e-06, "loss": 0.2903, "step": 179 }, { "epoch": 3.257918552036199, "grad_norm": 0.1972989252518201, "learning_rate": 3.840542040925725e-06, "loss": 0.2626, "step": 180 }, { "epoch": 3.276018099547511, "grad_norm": 0.21568160972522105, "learning_rate": 3.828333374268523e-06, "loss": 0.2906, "step": 181 }, { "epoch": 3.2941176470588234, "grad_norm": 0.1997067887507601, "learning_rate": 3.81608040719339e-06, "loss": 0.2862, "step": 182 }, { "epoch": 3.3122171945701355, "grad_norm": 0.2017980016690952, "learning_rate": 3.8037835483413877e-06, "loss": 0.2855, "step": 183 }, { "epoch": 3.330316742081448, "grad_norm": 0.20121264738949698, "learning_rate": 3.7914432078173867e-06, "loss": 0.2795, "step": 184 }, { "epoch": 3.3484162895927603, "grad_norm": 0.22611700851822947, "learning_rate": 3.7790597971763892e-06, "loss": 0.2836, "step": 185 }, { "epoch": 3.3665158371040724, "grad_norm": 0.2353941218093955, "learning_rate": 3.7666337294097987e-06, "loss": 0.288, "step": 186 }, { "epoch": 3.3846153846153846, "grad_norm": 0.18605988505854537, "learning_rate": 3.7541654189316525e-06, "loss": 0.275, "step": 187 }, { "epoch": 3.4027149321266967, "grad_norm": 0.22628052198695675, "learning_rate": 3.741655281564796e-06, "loss": 0.2966, "step": 188 }, { "epoch": 3.420814479638009, "grad_norm": 0.21236583352079183, "learning_rate": 3.72910373452702e-06, "loss": 0.2702, "step": 189 }, { "epoch": 3.4389140271493215, "grad_norm": 0.22886628365130654, "learning_rate": 3.7165111964171407e-06, "loss": 0.2718, "step": 190 }, { "epoch": 3.4570135746606336, "grad_norm": 0.19177205299999378, "learning_rate": 3.703878087201044e-06, "loss": 0.2785, "step": 191 }, { "epoch": 3.475113122171946, "grad_norm": 0.21164213919986813, "learning_rate": 3.6912048281976764e-06, "loss": 0.2991, "step": 192 }, { "epoch": 3.493212669683258, "grad_norm": 0.20082392739954888, "learning_rate": 3.6784918420649952e-06, "loss": 0.2814, "step": 193 }, { "epoch": 3.51131221719457, "grad_norm": 0.21531730826425216, "learning_rate": 3.66573955278587e-06, "loss": 0.2719, "step": 194 }, { "epoch": 3.5294117647058822, "grad_norm": 0.20053264760640085, "learning_rate": 3.6529483856539512e-06, "loss": 0.2639, "step": 195 }, { "epoch": 3.5475113122171944, "grad_norm": 0.18714903727677973, "learning_rate": 3.640118767259474e-06, "loss": 0.2712, "step": 196 }, { "epoch": 3.5656108597285066, "grad_norm": 0.19923843024788357, "learning_rate": 3.6272511254750403e-06, "loss": 0.2825, "step": 197 }, { "epoch": 3.583710407239819, "grad_norm": 0.2016875130706868, "learning_rate": 3.6143458894413463e-06, "loss": 0.2977, "step": 198 }, { "epoch": 3.6018099547511313, "grad_norm": 0.21861290041385015, "learning_rate": 3.6014034895528705e-06, "loss": 0.284, "step": 199 }, { "epoch": 3.6199095022624435, "grad_norm": 0.16879798551287897, "learning_rate": 3.588424357443521e-06, "loss": 0.2782, "step": 200 }, { "epoch": 3.6380090497737556, "grad_norm": 0.22087168375536256, "learning_rate": 3.5754089259722365e-06, "loss": 0.2902, "step": 201 }, { "epoch": 3.6561085972850678, "grad_norm": 0.2219253724635141, "learning_rate": 3.5623576292085555e-06, "loss": 0.294, "step": 202 }, { "epoch": 3.6742081447963804, "grad_norm": 0.19173446074813308, "learning_rate": 3.549270902418136e-06, "loss": 0.2715, "step": 203 }, { "epoch": 3.6923076923076925, "grad_norm": 0.20017688015075918, "learning_rate": 3.536149182048243e-06, "loss": 0.2823, "step": 204 }, { "epoch": 3.7104072398190047, "grad_norm": 0.19825967232402708, "learning_rate": 3.5229929057131877e-06, "loss": 0.2881, "step": 205 }, { "epoch": 3.728506787330317, "grad_norm": 0.218766824892692, "learning_rate": 3.5098025121797375e-06, "loss": 0.2999, "step": 206 }, { "epoch": 3.746606334841629, "grad_norm": 0.18895340916933914, "learning_rate": 3.496578441352481e-06, "loss": 0.2687, "step": 207 }, { "epoch": 3.764705882352941, "grad_norm": 0.19002028251920547, "learning_rate": 3.4833211342591565e-06, "loss": 0.2866, "step": 208 }, { "epoch": 3.7828054298642533, "grad_norm": 0.20990408194136284, "learning_rate": 3.4700310330359456e-06, "loss": 0.2805, "step": 209 }, { "epoch": 3.8009049773755654, "grad_norm": 0.1929810608863491, "learning_rate": 3.4567085809127247e-06, "loss": 0.2864, "step": 210 }, { "epoch": 3.8190045248868776, "grad_norm": 0.2026264509793902, "learning_rate": 3.4433542221982863e-06, "loss": 0.2847, "step": 211 }, { "epoch": 3.83710407239819, "grad_norm": 0.20527047718646593, "learning_rate": 3.4299684022655196e-06, "loss": 0.285, "step": 212 }, { "epoch": 3.8552036199095023, "grad_norm": 0.2004805974514508, "learning_rate": 3.4165515675365558e-06, "loss": 0.2862, "step": 213 }, { "epoch": 3.8733031674208145, "grad_norm": 0.18650100732919933, "learning_rate": 3.403104165467883e-06, "loss": 0.2748, "step": 214 }, { "epoch": 3.8914027149321266, "grad_norm": 0.21391027704520638, "learning_rate": 3.3896266445354208e-06, "loss": 0.2875, "step": 215 }, { "epoch": 3.909502262443439, "grad_norm": 0.19076214648617013, "learning_rate": 3.376119454219565e-06, "loss": 0.2811, "step": 216 }, { "epoch": 3.9276018099547514, "grad_norm": 0.22812272390771685, "learning_rate": 3.362583044990195e-06, "loss": 0.2923, "step": 217 }, { "epoch": 3.9457013574660635, "grad_norm": 0.2114461488141671, "learning_rate": 3.3490178682916534e-06, "loss": 0.2784, "step": 218 }, { "epoch": 3.9638009049773757, "grad_norm": 0.20971069504695025, "learning_rate": 3.335424376527688e-06, "loss": 0.2796, "step": 219 }, { "epoch": 3.981900452488688, "grad_norm": 0.20721944316747504, "learning_rate": 3.321803023046366e-06, "loss": 0.2855, "step": 220 }, { "epoch": 4.0, "grad_norm": 0.19529166110448204, "learning_rate": 3.3081542621249503e-06, "loss": 0.2722, "step": 221 }, { "epoch": 4.018099547511312, "grad_norm": 0.2181909689708081, "learning_rate": 3.2944785489547544e-06, "loss": 0.2769, "step": 222 }, { "epoch": 4.036199095022624, "grad_norm": 0.2041261831519089, "learning_rate": 3.2807763396259597e-06, "loss": 0.2755, "step": 223 }, { "epoch": 4.0542986425339365, "grad_norm": 0.17317381953746722, "learning_rate": 3.2670480911124045e-06, "loss": 0.2457, "step": 224 }, { "epoch": 4.072398190045249, "grad_norm": 0.20985257213280492, "learning_rate": 3.2532942612563436e-06, "loss": 0.3084, "step": 225 }, { "epoch": 4.090497737556561, "grad_norm": 0.1805568892682367, "learning_rate": 3.2395153087531767e-06, "loss": 0.2688, "step": 226 }, { "epoch": 4.108597285067873, "grad_norm": 0.20212288478471152, "learning_rate": 3.225711693136156e-06, "loss": 0.2678, "step": 227 }, { "epoch": 4.126696832579185, "grad_norm": 0.20073111869372287, "learning_rate": 3.211883874761058e-06, "loss": 0.2636, "step": 228 }, { "epoch": 4.144796380090498, "grad_norm": 0.21913185170065114, "learning_rate": 3.19803231479083e-06, "loss": 0.282, "step": 229 }, { "epoch": 4.16289592760181, "grad_norm": 0.20273970778843858, "learning_rate": 3.184157475180208e-06, "loss": 0.2689, "step": 230 }, { "epoch": 4.180995475113122, "grad_norm": 0.17451394799617262, "learning_rate": 3.1702598186603152e-06, "loss": 0.2583, "step": 231 }, { "epoch": 4.199095022624435, "grad_norm": 0.1918998406915714, "learning_rate": 3.1563398087232265e-06, "loss": 0.2795, "step": 232 }, { "epoch": 4.217194570135747, "grad_norm": 0.1970262585975004, "learning_rate": 3.1423979096065134e-06, "loss": 0.2605, "step": 233 }, { "epoch": 4.235294117647059, "grad_norm": 0.18769426018045784, "learning_rate": 3.1284345862777572e-06, "loss": 0.2592, "step": 234 }, { "epoch": 4.253393665158371, "grad_norm": 0.18870339061291633, "learning_rate": 3.1144503044190456e-06, "loss": 0.2642, "step": 235 }, { "epoch": 4.271493212669683, "grad_norm": 0.18389039727257753, "learning_rate": 3.100445530411442e-06, "loss": 0.2376, "step": 236 }, { "epoch": 4.289592760180995, "grad_norm": 0.20812476367192412, "learning_rate": 3.086420731319429e-06, "loss": 0.2708, "step": 237 }, { "epoch": 4.3076923076923075, "grad_norm": 0.197363611944904, "learning_rate": 3.0723763748753354e-06, "loss": 0.2844, "step": 238 }, { "epoch": 4.32579185520362, "grad_norm": 0.20806148893145612, "learning_rate": 3.0583129294637342e-06, "loss": 0.2487, "step": 239 }, { "epoch": 4.343891402714932, "grad_norm": 0.1903831131540733, "learning_rate": 3.044230864105821e-06, "loss": 0.256, "step": 240 }, { "epoch": 4.361990950226244, "grad_norm": 0.19042569669796638, "learning_rate": 3.030130648443777e-06, "loss": 0.2788, "step": 241 }, { "epoch": 4.380090497737557, "grad_norm": 0.18850059553704934, "learning_rate": 3.0160127527250993e-06, "loss": 0.2808, "step": 242 }, { "epoch": 4.398190045248869, "grad_norm": 0.20047462331384677, "learning_rate": 3.0018776477869244e-06, "loss": 0.2654, "step": 243 }, { "epoch": 4.416289592760181, "grad_norm": 0.20580882731270267, "learning_rate": 2.9877258050403214e-06, "loss": 0.2753, "step": 244 }, { "epoch": 4.4343891402714934, "grad_norm": 0.18933219589587963, "learning_rate": 2.973557696454571e-06, "loss": 0.2627, "step": 245 }, { "epoch": 4.452488687782806, "grad_norm": 0.2019766662866527, "learning_rate": 2.9593737945414264e-06, "loss": 0.2779, "step": 246 }, { "epoch": 4.470588235294118, "grad_norm": 0.21814847735110302, "learning_rate": 2.9451745723393547e-06, "loss": 0.2747, "step": 247 }, { "epoch": 4.48868778280543, "grad_norm": 0.2044800165411035, "learning_rate": 2.930960503397761e-06, "loss": 0.2726, "step": 248 }, { "epoch": 4.506787330316742, "grad_norm": 0.21142216034006506, "learning_rate": 2.916732061761192e-06, "loss": 0.2646, "step": 249 }, { "epoch": 4.524886877828054, "grad_norm": 0.20150189042920258, "learning_rate": 2.9024897219535326e-06, "loss": 0.279, "step": 250 }, { "epoch": 4.542986425339366, "grad_norm": 0.19100387279656014, "learning_rate": 2.8882339589621742e-06, "loss": 0.2795, "step": 251 }, { "epoch": 4.5610859728506785, "grad_norm": 0.18930829794972215, "learning_rate": 2.873965248222178e-06, "loss": 0.2672, "step": 252 }, { "epoch": 4.579185520361991, "grad_norm": 0.18814066866007795, "learning_rate": 2.859684065600417e-06, "loss": 0.2478, "step": 253 }, { "epoch": 4.597285067873303, "grad_norm": 0.19644782065692218, "learning_rate": 2.845390887379706e-06, "loss": 0.2639, "step": 254 }, { "epoch": 4.615384615384615, "grad_norm": 0.20255200557130154, "learning_rate": 2.8310861902429176e-06, "loss": 0.2725, "step": 255 }, { "epoch": 4.633484162895927, "grad_norm": 0.19849695081074425, "learning_rate": 2.816770451257085e-06, "loss": 0.2685, "step": 256 }, { "epoch": 4.65158371040724, "grad_norm": 0.20106804584886076, "learning_rate": 2.80244414785749e-06, "loss": 0.2572, "step": 257 }, { "epoch": 4.669683257918552, "grad_norm": 0.2021059922332257, "learning_rate": 2.7881077578317445e-06, "loss": 0.2924, "step": 258 }, { "epoch": 4.6877828054298645, "grad_norm": 0.21055754243512417, "learning_rate": 2.7737617593038493e-06, "loss": 0.2714, "step": 259 }, { "epoch": 4.705882352941177, "grad_norm": 0.18496100638206028, "learning_rate": 2.759406630718255e-06, "loss": 0.2609, "step": 260 }, { "epoch": 4.723981900452489, "grad_norm": 0.18437211430441194, "learning_rate": 2.7450428508239024e-06, "loss": 0.2662, "step": 261 }, { "epoch": 4.742081447963801, "grad_norm": 0.18828985621936872, "learning_rate": 2.730670898658255e-06, "loss": 0.2549, "step": 262 }, { "epoch": 4.760180995475113, "grad_norm": 0.19972804362365068, "learning_rate": 2.716291253531329e-06, "loss": 0.2873, "step": 263 }, { "epoch": 4.778280542986425, "grad_norm": 0.2092834898971349, "learning_rate": 2.7019043950096992e-06, "loss": 0.2674, "step": 264 }, { "epoch": 4.796380090497737, "grad_norm": 0.19131496744019671, "learning_rate": 2.6875108029005113e-06, "loss": 0.2724, "step": 265 }, { "epoch": 4.8144796380090495, "grad_norm": 0.21255643670178404, "learning_rate": 2.6731109572354795e-06, "loss": 0.2684, "step": 266 }, { "epoch": 4.832579185520362, "grad_norm": 0.18562764869110326, "learning_rate": 2.658705338254876e-06, "loss": 0.271, "step": 267 }, { "epoch": 4.850678733031674, "grad_norm": 0.21207337609644833, "learning_rate": 2.6442944263915153e-06, "loss": 0.2719, "step": 268 }, { "epoch": 4.868778280542987, "grad_norm": 0.2129223213748196, "learning_rate": 2.6298787022547317e-06, "loss": 0.2666, "step": 269 }, { "epoch": 4.886877828054299, "grad_norm": 0.18692841429903953, "learning_rate": 2.6154586466143495e-06, "loss": 0.2755, "step": 270 }, { "epoch": 4.904977375565611, "grad_norm": 0.19199436687113453, "learning_rate": 2.6010347403846508e-06, "loss": 0.2864, "step": 271 }, { "epoch": 4.923076923076923, "grad_norm": 0.19327034490069303, "learning_rate": 2.5866074646083385e-06, "loss": 0.2694, "step": 272 }, { "epoch": 4.9411764705882355, "grad_norm": 0.26379305562686184, "learning_rate": 2.572177300440487e-06, "loss": 0.2597, "step": 273 }, { "epoch": 4.959276018099548, "grad_norm": 0.1894366168665776, "learning_rate": 2.557744729132503e-06, "loss": 0.2825, "step": 274 }, { "epoch": 4.97737556561086, "grad_norm": 0.19519701404452072, "learning_rate": 2.5433102320160713e-06, "loss": 0.2893, "step": 275 }, { "epoch": 4.995475113122172, "grad_norm": 0.19163121413004777, "learning_rate": 2.528874290487102e-06, "loss": 0.2508, "step": 276 }, { "epoch": 5.013574660633484, "grad_norm": 0.18512352279959782, "learning_rate": 2.5144373859896792e-06, "loss": 0.2589, "step": 277 }, { "epoch": 5.031674208144796, "grad_norm": 0.18339390733870273, "learning_rate": 2.5e-06, "loss": 0.2621, "step": 278 }, { "epoch": 5.049773755656108, "grad_norm": 0.1942547479998011, "learning_rate": 2.4855626140103216e-06, "loss": 0.245, "step": 279 }, { "epoch": 5.067873303167421, "grad_norm": 0.201133955927992, "learning_rate": 2.4711257095128987e-06, "loss": 0.2428, "step": 280 }, { "epoch": 5.085972850678733, "grad_norm": 0.19802266448824934, "learning_rate": 2.4566897679839295e-06, "loss": 0.2756, "step": 281 }, { "epoch": 5.104072398190045, "grad_norm": 0.19714188235491836, "learning_rate": 2.4422552708674977e-06, "loss": 0.2626, "step": 282 }, { "epoch": 5.122171945701357, "grad_norm": 0.18710363733656865, "learning_rate": 2.427822699559514e-06, "loss": 0.2616, "step": 283 }, { "epoch": 5.14027149321267, "grad_norm": 0.18029896729988643, "learning_rate": 2.413392535391663e-06, "loss": 0.2671, "step": 284 }, { "epoch": 5.158371040723982, "grad_norm": 0.19353123935666788, "learning_rate": 2.3989652596153496e-06, "loss": 0.2518, "step": 285 }, { "epoch": 5.176470588235294, "grad_norm": 0.1999507247304982, "learning_rate": 2.3845413533856517e-06, "loss": 0.2691, "step": 286 }, { "epoch": 5.1945701357466065, "grad_norm": 0.1802458898092889, "learning_rate": 2.3701212977452683e-06, "loss": 0.2662, "step": 287 }, { "epoch": 5.212669683257919, "grad_norm": 0.20005237780106283, "learning_rate": 2.3557055736084847e-06, "loss": 0.2706, "step": 288 }, { "epoch": 5.230769230769231, "grad_norm": 0.20349821320072675, "learning_rate": 2.3412946617451242e-06, "loss": 0.2651, "step": 289 }, { "epoch": 5.248868778280543, "grad_norm": 0.19275858047883396, "learning_rate": 2.3268890427645213e-06, "loss": 0.2809, "step": 290 }, { "epoch": 5.266968325791855, "grad_norm": 0.19491454590375834, "learning_rate": 2.312489197099489e-06, "loss": 0.242, "step": 291 }, { "epoch": 5.285067873303167, "grad_norm": 0.17860701410760396, "learning_rate": 2.298095604990302e-06, "loss": 0.252, "step": 292 }, { "epoch": 5.3031674208144794, "grad_norm": 0.18166338870243837, "learning_rate": 2.283708746468672e-06, "loss": 0.2687, "step": 293 }, { "epoch": 5.321266968325792, "grad_norm": 0.20860085100238554, "learning_rate": 2.269329101341745e-06, "loss": 0.2749, "step": 294 }, { "epoch": 5.339366515837104, "grad_norm": 0.18128543910141529, "learning_rate": 2.2549571491760985e-06, "loss": 0.2423, "step": 295 }, { "epoch": 5.357466063348416, "grad_norm": 0.23828035104300602, "learning_rate": 2.2405933692817458e-06, "loss": 0.2582, "step": 296 }, { "epoch": 5.375565610859729, "grad_norm": 0.19867583702537983, "learning_rate": 2.226238240696151e-06, "loss": 0.2505, "step": 297 }, { "epoch": 5.393665158371041, "grad_norm": 0.2238993077156904, "learning_rate": 2.2118922421682563e-06, "loss": 0.2547, "step": 298 }, { "epoch": 5.411764705882353, "grad_norm": 0.18659890730168405, "learning_rate": 2.1975558521425106e-06, "loss": 0.2541, "step": 299 }, { "epoch": 5.429864253393665, "grad_norm": 0.2086208336638683, "learning_rate": 2.183229548742916e-06, "loss": 0.2449, "step": 300 }, { "epoch": 5.447963800904978, "grad_norm": 0.19744096649329249, "learning_rate": 2.1689138097570832e-06, "loss": 0.2529, "step": 301 }, { "epoch": 5.46606334841629, "grad_norm": 0.1905137878945102, "learning_rate": 2.1546091126202955e-06, "loss": 0.2549, "step": 302 }, { "epoch": 5.484162895927602, "grad_norm": 0.18724152511382108, "learning_rate": 2.1403159343995845e-06, "loss": 0.2544, "step": 303 }, { "epoch": 5.502262443438914, "grad_norm": 0.18137306072412968, "learning_rate": 2.1260347517778223e-06, "loss": 0.2472, "step": 304 }, { "epoch": 5.520361990950226, "grad_norm": 0.21137486256539126, "learning_rate": 2.111766041037826e-06, "loss": 0.2663, "step": 305 }, { "epoch": 5.538461538461538, "grad_norm": 0.18969561601900994, "learning_rate": 2.0975102780464674e-06, "loss": 0.2654, "step": 306 }, { "epoch": 5.5565610859728505, "grad_norm": 0.18687293378459552, "learning_rate": 2.083267938238808e-06, "loss": 0.2521, "step": 307 }, { "epoch": 5.574660633484163, "grad_norm": 0.18563465250651875, "learning_rate": 2.0690394966022397e-06, "loss": 0.2599, "step": 308 }, { "epoch": 5.592760180995475, "grad_norm": 0.18961353982721652, "learning_rate": 2.0548254276606457e-06, "loss": 0.253, "step": 309 }, { "epoch": 5.610859728506787, "grad_norm": 0.19358594701649867, "learning_rate": 2.040626205458574e-06, "loss": 0.268, "step": 310 }, { "epoch": 5.628959276018099, "grad_norm": 0.18903082550740266, "learning_rate": 2.02644230354543e-06, "loss": 0.2794, "step": 311 }, { "epoch": 5.647058823529412, "grad_norm": 0.18955280198715693, "learning_rate": 2.01227419495968e-06, "loss": 0.2466, "step": 312 }, { "epoch": 5.665158371040724, "grad_norm": 0.21673963839382857, "learning_rate": 1.9981223522130764e-06, "loss": 0.2646, "step": 313 }, { "epoch": 5.683257918552036, "grad_norm": 0.18658355423161882, "learning_rate": 1.9839872472749016e-06, "loss": 0.2524, "step": 314 }, { "epoch": 5.701357466063349, "grad_norm": 0.18351414151686257, "learning_rate": 1.9698693515562235e-06, "loss": 0.2484, "step": 315 }, { "epoch": 5.719457013574661, "grad_norm": 0.19521700431845607, "learning_rate": 1.9557691358941796e-06, "loss": 0.241, "step": 316 }, { "epoch": 5.737556561085973, "grad_norm": 0.18325038007655156, "learning_rate": 1.941687070536267e-06, "loss": 0.2834, "step": 317 }, { "epoch": 5.755656108597285, "grad_norm": 0.2024434466335083, "learning_rate": 1.9276236251246655e-06, "loss": 0.2617, "step": 318 }, { "epoch": 5.773755656108597, "grad_norm": 0.19282545684546182, "learning_rate": 1.913579268680572e-06, "loss": 0.251, "step": 319 }, { "epoch": 5.791855203619909, "grad_norm": 0.1985416405665436, "learning_rate": 1.8995544695885593e-06, "loss": 0.2528, "step": 320 }, { "epoch": 5.8099547511312215, "grad_norm": 0.19180458814723977, "learning_rate": 1.8855496955809546e-06, "loss": 0.2623, "step": 321 }, { "epoch": 5.828054298642534, "grad_norm": 0.19714720164607588, "learning_rate": 1.8715654137222434e-06, "loss": 0.2603, "step": 322 }, { "epoch": 5.846153846153846, "grad_norm": 0.17915913395978303, "learning_rate": 1.8576020903934872e-06, "loss": 0.2461, "step": 323 }, { "epoch": 5.864253393665159, "grad_norm": 0.1872517611416961, "learning_rate": 1.8436601912767737e-06, "loss": 0.2443, "step": 324 }, { "epoch": 5.882352941176471, "grad_norm": 0.2088265937495008, "learning_rate": 1.8297401813396854e-06, "loss": 0.2606, "step": 325 }, { "epoch": 5.900452488687783, "grad_norm": 0.20072778739580704, "learning_rate": 1.8158425248197931e-06, "loss": 0.2683, "step": 326 }, { "epoch": 5.918552036199095, "grad_norm": 0.20162018571475668, "learning_rate": 1.801967685209171e-06, "loss": 0.2674, "step": 327 }, { "epoch": 5.9366515837104075, "grad_norm": 0.19962010438752759, "learning_rate": 1.7881161252389423e-06, "loss": 0.2518, "step": 328 }, { "epoch": 5.95475113122172, "grad_norm": 0.1924016139723619, "learning_rate": 1.7742883068638447e-06, "loss": 0.2332, "step": 329 }, { "epoch": 5.972850678733032, "grad_norm": 0.19688732396260147, "learning_rate": 1.7604846912468243e-06, "loss": 0.2758, "step": 330 }, { "epoch": 5.990950226244344, "grad_norm": 0.21367643724553775, "learning_rate": 1.7467057387436577e-06, "loss": 0.2722, "step": 331 }, { "epoch": 6.009049773755656, "grad_norm": 0.18143686535639186, "learning_rate": 1.7329519088875959e-06, "loss": 0.2505, "step": 332 }, { "epoch": 6.027149321266968, "grad_norm": 0.19884601017939751, "learning_rate": 1.719223660374041e-06, "loss": 0.2406, "step": 333 }, { "epoch": 6.04524886877828, "grad_norm": 0.19790104231314157, "learning_rate": 1.7055214510452462e-06, "loss": 0.2459, "step": 334 }, { "epoch": 6.0633484162895925, "grad_norm": 0.21259902967676111, "learning_rate": 1.6918457378750511e-06, "loss": 0.256, "step": 335 }, { "epoch": 6.081447963800905, "grad_norm": 0.213170628627418, "learning_rate": 1.6781969769536356e-06, "loss": 0.2606, "step": 336 }, { "epoch": 6.099547511312217, "grad_norm": 0.18867147575952214, "learning_rate": 1.6645756234723127e-06, "loss": 0.2445, "step": 337 }, { "epoch": 6.117647058823529, "grad_norm": 0.18694162673757048, "learning_rate": 1.6509821317083466e-06, "loss": 0.2346, "step": 338 }, { "epoch": 6.135746606334842, "grad_norm": 0.19692152056487713, "learning_rate": 1.6374169550098052e-06, "loss": 0.2645, "step": 339 }, { "epoch": 6.153846153846154, "grad_norm": 0.18900423846777845, "learning_rate": 1.6238805457804353e-06, "loss": 0.2409, "step": 340 }, { "epoch": 6.171945701357466, "grad_norm": 0.19281737146761763, "learning_rate": 1.6103733554645794e-06, "loss": 0.2511, "step": 341 }, { "epoch": 6.1900452488687785, "grad_norm": 0.18576535863582108, "learning_rate": 1.5968958345321178e-06, "loss": 0.2562, "step": 342 }, { "epoch": 6.208144796380091, "grad_norm": 0.1937616575487202, "learning_rate": 1.5834484324634453e-06, "loss": 0.2558, "step": 343 }, { "epoch": 6.226244343891403, "grad_norm": 0.20266025820130834, "learning_rate": 1.5700315977344813e-06, "loss": 0.2619, "step": 344 }, { "epoch": 6.244343891402715, "grad_norm": 0.19244645126328583, "learning_rate": 1.5566457778017141e-06, "loss": 0.2357, "step": 345 }, { "epoch": 6.262443438914027, "grad_norm": 0.19529354957198908, "learning_rate": 1.5432914190872757e-06, "loss": 0.2547, "step": 346 }, { "epoch": 6.280542986425339, "grad_norm": 0.1977639183994923, "learning_rate": 1.529968966964055e-06, "loss": 0.253, "step": 347 }, { "epoch": 6.298642533936651, "grad_norm": 0.19407931113719454, "learning_rate": 1.5166788657408441e-06, "loss": 0.2632, "step": 348 }, { "epoch": 6.316742081447964, "grad_norm": 0.19011112857943221, "learning_rate": 1.5034215586475194e-06, "loss": 0.2647, "step": 349 }, { "epoch": 6.334841628959276, "grad_norm": 0.2186558043805355, "learning_rate": 1.490197487820263e-06, "loss": 0.2395, "step": 350 }, { "epoch": 6.352941176470588, "grad_norm": 0.18367578824384137, "learning_rate": 1.477007094286813e-06, "loss": 0.2516, "step": 351 }, { "epoch": 6.371040723981901, "grad_norm": 0.18371310311269254, "learning_rate": 1.4638508179517583e-06, "loss": 0.2709, "step": 352 }, { "epoch": 6.389140271493213, "grad_norm": 0.19750798322441557, "learning_rate": 1.4507290975818648e-06, "loss": 0.2497, "step": 353 }, { "epoch": 6.407239819004525, "grad_norm": 0.17489326087119314, "learning_rate": 1.4376423707914462e-06, "loss": 0.2518, "step": 354 }, { "epoch": 6.425339366515837, "grad_norm": 0.19109685375971255, "learning_rate": 1.4245910740277642e-06, "loss": 0.2464, "step": 355 }, { "epoch": 6.4434389140271495, "grad_norm": 0.18732644035351217, "learning_rate": 1.4115756425564798e-06, "loss": 0.2554, "step": 356 }, { "epoch": 6.461538461538462, "grad_norm": 0.2042904942174333, "learning_rate": 1.39859651044713e-06, "loss": 0.2677, "step": 357 }, { "epoch": 6.479638009049774, "grad_norm": 0.20346012347129977, "learning_rate": 1.3856541105586545e-06, "loss": 0.2433, "step": 358 }, { "epoch": 6.497737556561086, "grad_norm": 0.18096207448536866, "learning_rate": 1.372748874524961e-06, "loss": 0.248, "step": 359 }, { "epoch": 6.515837104072398, "grad_norm": 0.18311281316650868, "learning_rate": 1.3598812327405274e-06, "loss": 0.2433, "step": 360 }, { "epoch": 6.53393665158371, "grad_norm": 0.19877832010020277, "learning_rate": 1.3470516143460494e-06, "loss": 0.2419, "step": 361 }, { "epoch": 6.552036199095022, "grad_norm": 0.19411009696243373, "learning_rate": 1.3342604472141296e-06, "loss": 0.2485, "step": 362 }, { "epoch": 6.570135746606335, "grad_norm": 0.18775697820498174, "learning_rate": 1.3215081579350058e-06, "loss": 0.2514, "step": 363 }, { "epoch": 6.588235294117647, "grad_norm": 0.1974485040630947, "learning_rate": 1.308795171802324e-06, "loss": 0.2623, "step": 364 }, { "epoch": 6.606334841628959, "grad_norm": 0.20195192192796554, "learning_rate": 1.2961219127989562e-06, "loss": 0.2523, "step": 365 }, { "epoch": 6.624434389140271, "grad_norm": 0.1867586520187508, "learning_rate": 1.2834888035828597e-06, "loss": 0.2434, "step": 366 }, { "epoch": 6.642533936651584, "grad_norm": 0.19535767032905008, "learning_rate": 1.2708962654729812e-06, "loss": 0.2246, "step": 367 }, { "epoch": 6.660633484162896, "grad_norm": 0.17951796660986621, "learning_rate": 1.258344718435205e-06, "loss": 0.2548, "step": 368 }, { "epoch": 6.678733031674208, "grad_norm": 0.1838076745236157, "learning_rate": 1.2458345810683492e-06, "loss": 0.2517, "step": 369 }, { "epoch": 6.6968325791855206, "grad_norm": 0.1987502629500275, "learning_rate": 1.233366270590202e-06, "loss": 0.2373, "step": 370 }, { "epoch": 6.714932126696833, "grad_norm": 0.1921556070273265, "learning_rate": 1.2209402028236114e-06, "loss": 0.2444, "step": 371 }, { "epoch": 6.733031674208145, "grad_norm": 0.18753751737041122, "learning_rate": 1.2085567921826128e-06, "loss": 0.2429, "step": 372 }, { "epoch": 6.751131221719457, "grad_norm": 0.17267111610692507, "learning_rate": 1.1962164516586123e-06, "loss": 0.2408, "step": 373 }, { "epoch": 6.769230769230769, "grad_norm": 0.1785397882614972, "learning_rate": 1.1839195928066101e-06, "loss": 0.2364, "step": 374 }, { "epoch": 6.787330316742081, "grad_norm": 0.1974641160114867, "learning_rate": 1.171666625731477e-06, "loss": 0.2502, "step": 375 }, { "epoch": 6.8054298642533935, "grad_norm": 0.1936200917713445, "learning_rate": 1.1594579590742758e-06, "loss": 0.2495, "step": 376 }, { "epoch": 6.823529411764706, "grad_norm": 0.20474767855899034, "learning_rate": 1.1472939999986338e-06, "loss": 0.2444, "step": 377 }, { "epoch": 6.841628959276018, "grad_norm": 0.21747609011178112, "learning_rate": 1.1351751541771644e-06, "loss": 0.2423, "step": 378 }, { "epoch": 6.859728506787331, "grad_norm": 0.2024534108733349, "learning_rate": 1.1231018257779363e-06, "loss": 0.2641, "step": 379 }, { "epoch": 6.877828054298643, "grad_norm": 0.19486585090979294, "learning_rate": 1.1110744174509952e-06, "loss": 0.2463, "step": 380 }, { "epoch": 6.895927601809955, "grad_norm": 0.17849040364534344, "learning_rate": 1.0990933303149342e-06, "loss": 0.2631, "step": 381 }, { "epoch": 6.914027149321267, "grad_norm": 0.19002926125887049, "learning_rate": 1.0871589639435204e-06, "loss": 0.2481, "step": 382 }, { "epoch": 6.932126696832579, "grad_norm": 0.18083592050616315, "learning_rate": 1.0752717163523623e-06, "loss": 0.241, "step": 383 }, { "epoch": 6.950226244343892, "grad_norm": 0.19496492930938145, "learning_rate": 1.0634319839856407e-06, "loss": 0.2527, "step": 384 }, { "epoch": 6.968325791855204, "grad_norm": 0.19417699707230154, "learning_rate": 1.0516401617028863e-06, "loss": 0.2322, "step": 385 }, { "epoch": 6.986425339366516, "grad_norm": 0.18003217148044237, "learning_rate": 1.0398966427658091e-06, "loss": 0.2357, "step": 386 }, { "epoch": 7.004524886877828, "grad_norm": 0.18246799637458713, "learning_rate": 1.0282018188251854e-06, "loss": 0.2568, "step": 387 }, { "epoch": 7.02262443438914, "grad_norm": 0.18781508356688068, "learning_rate": 1.0165560799077952e-06, "loss": 0.2387, "step": 388 }, { "epoch": 7.040723981900452, "grad_norm": 0.17588577341825412, "learning_rate": 1.004959814403413e-06, "loss": 0.262, "step": 389 }, { "epoch": 7.0588235294117645, "grad_norm": 0.19676767898186667, "learning_rate": 9.934134090518593e-07, "loss": 0.2374, "step": 390 }, { "epoch": 7.076923076923077, "grad_norm": 0.19345676011938345, "learning_rate": 9.81917248930096e-07, "loss": 0.2162, "step": 391 }, { "epoch": 7.095022624434389, "grad_norm": 0.2178742299523153, "learning_rate": 9.704717174393912e-07, "loss": 0.2495, "step": 392 }, { "epoch": 7.113122171945701, "grad_norm": 0.18628703610003405, "learning_rate": 9.590771962925272e-07, "loss": 0.2596, "step": 393 }, { "epoch": 7.131221719457014, "grad_norm": 0.18042019029734135, "learning_rate": 9.477340655010717e-07, "loss": 0.2465, "step": 394 }, { "epoch": 7.149321266968326, "grad_norm": 0.1924619560299915, "learning_rate": 9.36442703362706e-07, "loss": 0.2395, "step": 395 }, { "epoch": 7.167420814479638, "grad_norm": 0.18162050443390207, "learning_rate": 9.252034864486062e-07, "loss": 0.2425, "step": 396 }, { "epoch": 7.1855203619909505, "grad_norm": 0.1725352404799184, "learning_rate": 9.140167895908867e-07, "loss": 0.2257, "step": 397 }, { "epoch": 7.203619909502263, "grad_norm": 0.17850869622337964, "learning_rate": 9.028829858700974e-07, "loss": 0.2313, "step": 398 }, { "epoch": 7.221719457013575, "grad_norm": 0.1896145123389741, "learning_rate": 8.918024466027822e-07, "loss": 0.2462, "step": 399 }, { "epoch": 7.239819004524887, "grad_norm": 0.1878899849862918, "learning_rate": 8.807755413290953e-07, "loss": 0.2502, "step": 400 }, { "epoch": 7.257918552036199, "grad_norm": 0.19070595484051797, "learning_rate": 8.698026378004787e-07, "loss": 0.2433, "step": 401 }, { "epoch": 7.276018099547511, "grad_norm": 0.17359356109341043, "learning_rate": 8.588841019673938e-07, "loss": 0.2604, "step": 402 }, { "epoch": 7.294117647058823, "grad_norm": 0.20358309076003017, "learning_rate": 8.480202979671201e-07, "loss": 0.2327, "step": 403 }, { "epoch": 7.3122171945701355, "grad_norm": 0.1835516820557226, "learning_rate": 8.372115881116089e-07, "loss": 0.2409, "step": 404 }, { "epoch": 7.330316742081448, "grad_norm": 0.18238130931189853, "learning_rate": 8.264583328754017e-07, "loss": 0.2393, "step": 405 }, { "epoch": 7.34841628959276, "grad_norm": 0.17542601825119047, "learning_rate": 8.157608908836071e-07, "loss": 0.2312, "step": 406 }, { "epoch": 7.366515837104072, "grad_norm": 0.18257023212771115, "learning_rate": 8.051196188999425e-07, "loss": 0.2503, "step": 407 }, { "epoch": 7.384615384615385, "grad_norm": 0.1967778738312882, "learning_rate": 7.945348718148324e-07, "loss": 0.2419, "step": 408 }, { "epoch": 7.402714932126697, "grad_norm": 0.18755379540882788, "learning_rate": 7.840070026335758e-07, "loss": 0.2332, "step": 409 }, { "epoch": 7.420814479638009, "grad_norm": 0.1911070489817504, "learning_rate": 7.735363624645712e-07, "loss": 0.2484, "step": 410 }, { "epoch": 7.4389140271493215, "grad_norm": 0.1882055636984676, "learning_rate": 7.6312330050761e-07, "loss": 0.2404, "step": 411 }, { "epoch": 7.457013574660634, "grad_norm": 0.20190668623593286, "learning_rate": 7.527681640422265e-07, "loss": 0.2526, "step": 412 }, { "epoch": 7.475113122171946, "grad_norm": 0.1974234563343766, "learning_rate": 7.424712984161192e-07, "loss": 0.2688, "step": 413 }, { "epoch": 7.493212669683258, "grad_norm": 0.17631879313649837, "learning_rate": 7.322330470336314e-07, "loss": 0.2508, "step": 414 }, { "epoch": 7.51131221719457, "grad_norm": 0.18714884817468105, "learning_rate": 7.220537513442999e-07, "loss": 0.2486, "step": 415 }, { "epoch": 7.529411764705882, "grad_norm": 0.19399653562175878, "learning_rate": 7.11933750831467e-07, "loss": 0.2618, "step": 416 }, { "epoch": 7.547511312217194, "grad_norm": 0.1881943799081702, "learning_rate": 7.018733830009578e-07, "loss": 0.2745, "step": 417 }, { "epoch": 7.5656108597285066, "grad_norm": 0.19410422423302068, "learning_rate": 6.91872983369826e-07, "loss": 0.2575, "step": 418 }, { "epoch": 7.583710407239819, "grad_norm": 0.19139908757724744, "learning_rate": 6.819328854551619e-07, "loss": 0.2431, "step": 419 }, { "epoch": 7.601809954751131, "grad_norm": 0.19407692138480465, "learning_rate": 6.720534207629731e-07, "loss": 0.2612, "step": 420 }, { "epoch": 7.619909502262443, "grad_norm": 0.19077609905815648, "learning_rate": 6.622349187771246e-07, "loss": 0.2363, "step": 421 }, { "epoch": 7.638009049773755, "grad_norm": 0.19785590661298624, "learning_rate": 6.524777069483526e-07, "loss": 0.2165, "step": 422 }, { "epoch": 7.656108597285068, "grad_norm": 0.18170589381863933, "learning_rate": 6.427821106833429e-07, "loss": 0.2518, "step": 423 }, { "epoch": 7.67420814479638, "grad_norm": 0.19082550580582264, "learning_rate": 6.33148453333881e-07, "loss": 0.2497, "step": 424 }, { "epoch": 7.6923076923076925, "grad_norm": 0.2010429672996338, "learning_rate": 6.235770561860646e-07, "loss": 0.2735, "step": 425 }, { "epoch": 7.710407239819005, "grad_norm": 0.20631621699435826, "learning_rate": 6.140682384495902e-07, "loss": 0.2638, "step": 426 }, { "epoch": 7.728506787330317, "grad_norm": 0.18857883979117615, "learning_rate": 6.046223172471083e-07, "loss": 0.2511, "step": 427 }, { "epoch": 7.746606334841629, "grad_norm": 0.19438107603701976, "learning_rate": 5.952396076036457e-07, "loss": 0.2411, "step": 428 }, { "epoch": 7.764705882352941, "grad_norm": 0.18435853585586434, "learning_rate": 5.85920422436099e-07, "loss": 0.2337, "step": 429 }, { "epoch": 7.782805429864253, "grad_norm": 0.19759361458272545, "learning_rate": 5.766650725428027e-07, "loss": 0.2304, "step": 430 }, { "epoch": 7.800904977375565, "grad_norm": 0.17820786715247264, "learning_rate": 5.674738665931575e-07, "loss": 0.2302, "step": 431 }, { "epoch": 7.819004524886878, "grad_norm": 0.18336638108510472, "learning_rate": 5.583471111173414e-07, "loss": 0.2415, "step": 432 }, { "epoch": 7.83710407239819, "grad_norm": 0.1861341218211825, "learning_rate": 5.492851104960839e-07, "loss": 0.2347, "step": 433 }, { "epoch": 7.855203619909502, "grad_norm": 0.18671520221803245, "learning_rate": 5.402881669505164e-07, "loss": 0.2433, "step": 434 }, { "epoch": 7.873303167420815, "grad_norm": 0.18470916369258913, "learning_rate": 5.313565805320914e-07, "loss": 0.2392, "step": 435 }, { "epoch": 7.891402714932127, "grad_norm": 0.18145209957770228, "learning_rate": 5.224906491125778e-07, "loss": 0.2491, "step": 436 }, { "epoch": 7.909502262443439, "grad_norm": 0.1841316864472566, "learning_rate": 5.13690668374125e-07, "loss": 0.2374, "step": 437 }, { "epoch": 7.927601809954751, "grad_norm": 0.16991217903448427, "learning_rate": 5.049569317994013e-07, "loss": 0.2222, "step": 438 }, { "epoch": 7.9457013574660635, "grad_norm": 0.18977292588230824, "learning_rate": 4.962897306618101e-07, "loss": 0.2413, "step": 439 }, { "epoch": 7.963800904977376, "grad_norm": 0.2034200762540194, "learning_rate": 4.876893540157692e-07, "loss": 0.2526, "step": 440 }, { "epoch": 7.981900452488688, "grad_norm": 0.18561076018112563, "learning_rate": 4.791560886870786e-07, "loss": 0.2505, "step": 441 }, { "epoch": 8.0, "grad_norm": 0.1808509581648577, "learning_rate": 4.70690219263347e-07, "loss": 0.2397, "step": 442 }, { "epoch": 8.018099547511312, "grad_norm": 0.1983786803651098, "learning_rate": 4.6229202808450587e-07, "loss": 0.2384, "step": 443 }, { "epoch": 8.036199095022624, "grad_norm": 0.19613362321076386, "learning_rate": 4.539617952333913e-07, "loss": 0.2396, "step": 444 }, { "epoch": 8.054298642533936, "grad_norm": 0.18104571677229486, "learning_rate": 4.4569979852640444e-07, "loss": 0.2481, "step": 445 }, { "epoch": 8.072398190045249, "grad_norm": 0.18894956462902818, "learning_rate": 4.3750631350424456e-07, "loss": 0.2331, "step": 446 }, { "epoch": 8.09049773755656, "grad_norm": 0.1856642703057781, "learning_rate": 4.2938161342272024e-07, "loss": 0.2398, "step": 447 }, { "epoch": 8.108597285067873, "grad_norm": 0.19509279291436657, "learning_rate": 4.2132596924363666e-07, "loss": 0.2396, "step": 448 }, { "epoch": 8.126696832579185, "grad_norm": 0.18583235612820456, "learning_rate": 4.1333964962575995e-07, "loss": 0.2457, "step": 449 }, { "epoch": 8.144796380090497, "grad_norm": 0.19414831334323818, "learning_rate": 4.0542292091585447e-07, "loss": 0.2557, "step": 450 }, { "epoch": 8.16289592760181, "grad_norm": 0.1948999434614907, "learning_rate": 3.975760471398013e-07, "loss": 0.2346, "step": 451 }, { "epoch": 8.180995475113122, "grad_norm": 0.18223819061827173, "learning_rate": 3.89799289993795e-07, "loss": 0.2176, "step": 452 }, { "epoch": 8.199095022624434, "grad_norm": 0.19449644313553408, "learning_rate": 3.8209290883561205e-07, "loss": 0.247, "step": 453 }, { "epoch": 8.217194570135746, "grad_norm": 0.1930258214779179, "learning_rate": 3.7445716067596506e-07, "loss": 0.2298, "step": 454 }, { "epoch": 8.235294117647058, "grad_norm": 0.18628969575946702, "learning_rate": 3.668923001699284e-07, "loss": 0.2385, "step": 455 }, { "epoch": 8.25339366515837, "grad_norm": 0.18169941514755078, "learning_rate": 3.593985796084468e-07, "loss": 0.2519, "step": 456 }, { "epoch": 8.271493212669684, "grad_norm": 0.1837119269988211, "learning_rate": 3.519762489099207e-07, "loss": 0.2602, "step": 457 }, { "epoch": 8.289592760180996, "grad_norm": 0.1953248401558189, "learning_rate": 3.446255556118736e-07, "loss": 0.2567, "step": 458 }, { "epoch": 8.307692307692308, "grad_norm": 0.17837155536528138, "learning_rate": 3.373467448626916e-07, "loss": 0.2332, "step": 459 }, { "epoch": 8.32579185520362, "grad_norm": 0.1879124674324348, "learning_rate": 3.3014005941345406e-07, "loss": 0.2357, "step": 460 }, { "epoch": 8.343891402714933, "grad_norm": 0.19669583622722217, "learning_rate": 3.230057396098321e-07, "loss": 0.2188, "step": 461 }, { "epoch": 8.361990950226245, "grad_norm": 0.19436805306375338, "learning_rate": 3.1594402338407633e-07, "loss": 0.2595, "step": 462 }, { "epoch": 8.380090497737557, "grad_norm": 0.1731035690780127, "learning_rate": 3.0895514624707994e-07, "loss": 0.2293, "step": 463 }, { "epoch": 8.39819004524887, "grad_norm": 0.19086125694967881, "learning_rate": 3.020393412805259e-07, "loss": 0.2305, "step": 464 }, { "epoch": 8.416289592760181, "grad_norm": 0.18779406733198983, "learning_rate": 2.9519683912911267e-07, "loss": 0.2596, "step": 465 }, { "epoch": 8.434389140271493, "grad_norm": 0.18546808477280827, "learning_rate": 2.8842786799286204e-07, "loss": 0.2435, "step": 466 }, { "epoch": 8.452488687782806, "grad_norm": 0.1896684936541315, "learning_rate": 2.8173265361950837e-07, "loss": 0.2386, "step": 467 }, { "epoch": 8.470588235294118, "grad_norm": 0.17852233356583405, "learning_rate": 2.751114192969709e-07, "loss": 0.231, "step": 468 }, { "epoch": 8.48868778280543, "grad_norm": 0.18399543647963754, "learning_rate": 2.685643858459064e-07, "loss": 0.2477, "step": 469 }, { "epoch": 8.506787330316742, "grad_norm": 0.18054851239071437, "learning_rate": 2.620917716123444e-07, "loss": 0.2504, "step": 470 }, { "epoch": 8.524886877828054, "grad_norm": 0.19308936407562874, "learning_rate": 2.55693792460405e-07, "loss": 0.2545, "step": 471 }, { "epoch": 8.542986425339366, "grad_norm": 0.19847333989927235, "learning_rate": 2.4937066176510123e-07, "loss": 0.2462, "step": 472 }, { "epoch": 8.561085972850679, "grad_norm": 0.20082127472743996, "learning_rate": 2.4312259040522093e-07, "loss": 0.2449, "step": 473 }, { "epoch": 8.57918552036199, "grad_norm": 0.1843637491284879, "learning_rate": 2.3694978675629476e-07, "loss": 0.2422, "step": 474 }, { "epoch": 8.597285067873303, "grad_norm": 0.18297796260401825, "learning_rate": 2.3085245668364897e-07, "loss": 0.2492, "step": 475 }, { "epoch": 8.615384615384615, "grad_norm": 0.18214698681303781, "learning_rate": 2.2483080353553537e-07, "loss": 0.2435, "step": 476 }, { "epoch": 8.633484162895927, "grad_norm": 0.1932187580551005, "learning_rate": 2.1888502813635276e-07, "loss": 0.2471, "step": 477 }, { "epoch": 8.65158371040724, "grad_norm": 0.1862160611593082, "learning_rate": 2.1301532877994747e-07, "loss": 0.2367, "step": 478 }, { "epoch": 8.669683257918551, "grad_norm": 0.1853161129752053, "learning_rate": 2.0722190122300311e-07, "loss": 0.2344, "step": 479 }, { "epoch": 8.687782805429864, "grad_norm": 0.18442104500106515, "learning_rate": 2.0150493867850867e-07, "loss": 0.2394, "step": 480 }, { "epoch": 8.705882352941176, "grad_norm": 0.1836768530394557, "learning_rate": 1.9586463180931658e-07, "loss": 0.242, "step": 481 }, { "epoch": 8.723981900452488, "grad_norm": 0.18225478866484207, "learning_rate": 1.9030116872178317e-07, "loss": 0.2571, "step": 482 }, { "epoch": 8.742081447963802, "grad_norm": 0.19072644512673081, "learning_rate": 1.848147349594967e-07, "loss": 0.2457, "step": 483 }, { "epoch": 8.760180995475114, "grad_norm": 0.18223857901348137, "learning_rate": 1.7940551349708734e-07, "loss": 0.2351, "step": 484 }, { "epoch": 8.778280542986426, "grad_norm": 0.23050285345657223, "learning_rate": 1.7407368473412678e-07, "loss": 0.2355, "step": 485 }, { "epoch": 8.796380090497738, "grad_norm": 0.18880764635155572, "learning_rate": 1.6881942648911077e-07, "loss": 0.2287, "step": 486 }, { "epoch": 8.81447963800905, "grad_norm": 0.1830117965150596, "learning_rate": 1.6364291399352916e-07, "loss": 0.2447, "step": 487 }, { "epoch": 8.832579185520363, "grad_norm": 0.18803881671915923, "learning_rate": 1.5854431988602175e-07, "loss": 0.2431, "step": 488 }, { "epoch": 8.850678733031675, "grad_norm": 0.18013778534000302, "learning_rate": 1.5352381420662144e-07, "loss": 0.2397, "step": 489 }, { "epoch": 8.868778280542987, "grad_norm": 0.2003292008190993, "learning_rate": 1.4858156439108097e-07, "loss": 0.2291, "step": 490 }, { "epoch": 8.886877828054299, "grad_norm": 0.1780640301175049, "learning_rate": 1.4371773526529216e-07, "loss": 0.2138, "step": 491 }, { "epoch": 8.904977375565611, "grad_norm": 0.1858049004037094, "learning_rate": 1.3893248903978695e-07, "loss": 0.2248, "step": 492 }, { "epoch": 8.923076923076923, "grad_norm": 0.1870658138910751, "learning_rate": 1.342259853043279e-07, "loss": 0.2628, "step": 493 }, { "epoch": 8.941176470588236, "grad_norm": 0.1837618747915919, "learning_rate": 1.2959838102258537e-07, "loss": 0.2369, "step": 494 }, { "epoch": 8.959276018099548, "grad_norm": 0.1825018533847707, "learning_rate": 1.2504983052690406e-07, "loss": 0.2371, "step": 495 }, { "epoch": 8.97737556561086, "grad_norm": 0.18050085376698732, "learning_rate": 1.2058048551315455e-07, "loss": 0.2364, "step": 496 }, { "epoch": 8.995475113122172, "grad_norm": 0.17972618184239006, "learning_rate": 1.1619049503567486e-07, "loss": 0.2473, "step": 497 }, { "epoch": 9.013574660633484, "grad_norm": 0.1830792217516428, "learning_rate": 1.1188000550230005e-07, "loss": 0.2352, "step": 498 }, { "epoch": 9.031674208144796, "grad_norm": 0.17879744556952354, "learning_rate": 1.0764916066947795e-07, "loss": 0.2641, "step": 499 }, { "epoch": 9.049773755656108, "grad_norm": 0.18166675174635316, "learning_rate": 1.0349810163747587e-07, "loss": 0.2324, "step": 500 }, { "epoch": 9.06787330316742, "grad_norm": 0.169470120760864, "learning_rate": 9.942696684567488e-08, "loss": 0.2433, "step": 501 }, { "epoch": 9.085972850678733, "grad_norm": 0.18110948245786077, "learning_rate": 9.54358920679524e-08, "loss": 0.2374, "step": 502 }, { "epoch": 9.104072398190045, "grad_norm": 0.18319694777040335, "learning_rate": 9.152501040815442e-08, "loss": 0.254, "step": 503 }, { "epoch": 9.122171945701357, "grad_norm": 0.1915504535166829, "learning_rate": 8.769445229565549e-08, "loss": 0.2325, "step": 504 }, { "epoch": 9.14027149321267, "grad_norm": 0.17665350982157665, "learning_rate": 8.394434548101099e-08, "loss": 0.2251, "step": 505 }, { "epoch": 9.158371040723981, "grad_norm": 0.17427434868030764, "learning_rate": 8.027481503169371e-08, "loss": 0.2345, "step": 506 }, { "epoch": 9.176470588235293, "grad_norm": 0.1787409835322033, "learning_rate": 7.66859833279257e-08, "loss": 0.2389, "step": 507 }, { "epoch": 9.194570135746606, "grad_norm": 0.18100016492103735, "learning_rate": 7.317797005859467e-08, "loss": 0.2519, "step": 508 }, { "epoch": 9.212669683257918, "grad_norm": 0.17821751417293089, "learning_rate": 6.97508922172635e-08, "loss": 0.2287, "step": 509 }, { "epoch": 9.23076923076923, "grad_norm": 0.20843753394336795, "learning_rate": 6.640486409826785e-08, "loss": 0.2444, "step": 510 }, { "epoch": 9.248868778280542, "grad_norm": 0.17620285125559612, "learning_rate": 6.313999729290476e-08, "loss": 0.2601, "step": 511 }, { "epoch": 9.266968325791856, "grad_norm": 0.18672849956899618, "learning_rate": 5.99564006857109e-08, "loss": 0.2247, "step": 512 }, { "epoch": 9.285067873303168, "grad_norm": 0.18049275292301087, "learning_rate": 5.685418045083102e-08, "loss": 0.2511, "step": 513 }, { "epoch": 9.30316742081448, "grad_norm": 0.17415682650124498, "learning_rate": 5.383344004847774e-08, "loss": 0.2122, "step": 514 }, { "epoch": 9.321266968325792, "grad_norm": 0.18556393996618256, "learning_rate": 5.0894280221479855e-08, "loss": 0.2294, "step": 515 }, { "epoch": 9.339366515837105, "grad_norm": 0.1838789712871206, "learning_rate": 4.8036798991923925e-08, "loss": 0.2223, "step": 516 }, { "epoch": 9.357466063348417, "grad_norm": 0.19715964425866056, "learning_rate": 4.526109165788439e-08, "loss": 0.2381, "step": 517 }, { "epoch": 9.375565610859729, "grad_norm": 0.1855854696991745, "learning_rate": 4.256725079024554e-08, "loss": 0.2342, "step": 518 }, { "epoch": 9.393665158371041, "grad_norm": 0.18048725239749752, "learning_rate": 3.995536622961399e-08, "loss": 0.2524, "step": 519 }, { "epoch": 9.411764705882353, "grad_norm": 0.19277390554384807, "learning_rate": 3.7425525083322755e-08, "loss": 0.2488, "step": 520 }, { "epoch": 9.429864253393665, "grad_norm": 0.18825292413778436, "learning_rate": 3.4977811722526065e-08, "loss": 0.2263, "step": 521 }, { "epoch": 9.447963800904978, "grad_norm": 0.17855716822938666, "learning_rate": 3.261230777938607e-08, "loss": 0.2549, "step": 522 }, { "epoch": 9.46606334841629, "grad_norm": 0.18271161254439716, "learning_rate": 3.032909214434887e-08, "loss": 0.2062, "step": 523 }, { "epoch": 9.484162895927602, "grad_norm": 0.1985253721454189, "learning_rate": 2.8128240963515574e-08, "loss": 0.2395, "step": 524 }, { "epoch": 9.502262443438914, "grad_norm": 0.17838320881574793, "learning_rate": 2.600982763610094e-08, "loss": 0.2526, "step": 525 }, { "epoch": 9.520361990950226, "grad_norm": 0.18314903249677716, "learning_rate": 2.3973922811987295e-08, "loss": 0.2264, "step": 526 }, { "epoch": 9.538461538461538, "grad_norm": 0.19459399624660845, "learning_rate": 2.202059438936588e-08, "loss": 0.2589, "step": 527 }, { "epoch": 9.55656108597285, "grad_norm": 0.1904697767266005, "learning_rate": 2.0149907512475585e-08, "loss": 0.2515, "step": 528 }, { "epoch": 9.574660633484163, "grad_norm": 0.19523143039480956, "learning_rate": 1.8361924569427204e-08, "loss": 0.2525, "step": 529 }, { "epoch": 9.592760180995475, "grad_norm": 0.17856164334939217, "learning_rate": 1.6656705190125078e-08, "loss": 0.2276, "step": 530 }, { "epoch": 9.610859728506787, "grad_norm": 0.18321195430667842, "learning_rate": 1.5034306244277042e-08, "loss": 0.2418, "step": 531 }, { "epoch": 9.628959276018099, "grad_norm": 0.17787016407899692, "learning_rate": 1.3494781839498428e-08, "loss": 0.2342, "step": 532 }, { "epoch": 9.647058823529411, "grad_norm": 0.19126723554650038, "learning_rate": 1.2038183319507957e-08, "loss": 0.2469, "step": 533 }, { "epoch": 9.665158371040723, "grad_norm": 0.1892990291817674, "learning_rate": 1.0664559262413831e-08, "loss": 0.2549, "step": 534 }, { "epoch": 9.683257918552036, "grad_norm": 0.1793510043645716, "learning_rate": 9.373955479095587e-09, "loss": 0.2299, "step": 535 }, { "epoch": 9.701357466063348, "grad_norm": 0.18123530213186048, "learning_rate": 8.166415011675032e-09, "loss": 0.238, "step": 536 }, { "epoch": 9.71945701357466, "grad_norm": 0.19155362352898522, "learning_rate": 7.041978132081295e-09, "loss": 0.2505, "step": 537 }, { "epoch": 9.737556561085974, "grad_norm": 0.19166375374826475, "learning_rate": 6.00068234070772e-09, "loss": 0.2486, "step": 538 }, { "epoch": 9.755656108597286, "grad_norm": 0.19649835123518228, "learning_rate": 5.042562365160375e-09, "loss": 0.2339, "step": 539 }, { "epoch": 9.773755656108598, "grad_norm": 0.17975277337095447, "learning_rate": 4.167650159100922e-09, "loss": 0.2386, "step": 540 }, { "epoch": 9.79185520361991, "grad_norm": 0.19853806613153782, "learning_rate": 3.375974901181356e-09, "loss": 0.2651, "step": 541 }, { "epoch": 9.809954751131222, "grad_norm": 0.17741051186070012, "learning_rate": 2.6675629940689508e-09, "loss": 0.2345, "step": 542 }, { "epoch": 9.828054298642535, "grad_norm": 0.21010956050591995, "learning_rate": 2.0424380635675202e-09, "loss": 0.2433, "step": 543 }, { "epoch": 9.846153846153847, "grad_norm": 0.1925677170949037, "learning_rate": 1.5006209578286024e-09, "loss": 0.2442, "step": 544 }, { "epoch": 9.864253393665159, "grad_norm": 0.18453673405456344, "learning_rate": 1.0421297466570169e-09, "loss": 0.2302, "step": 545 }, { "epoch": 9.882352941176471, "grad_norm": 0.20600991870093216, "learning_rate": 6.669797209069018e-10, "loss": 0.2338, "step": 546 }, { "epoch": 9.900452488687783, "grad_norm": 0.19783049619088353, "learning_rate": 3.7518339197267774e-10, "loss": 0.2584, "step": 547 }, { "epoch": 9.918552036199095, "grad_norm": 0.2143160581746704, "learning_rate": 1.6675049137188094e-10, "loss": 0.2481, "step": 548 }, { "epoch": 9.936651583710407, "grad_norm": 0.18644523978656508, "learning_rate": 4.1687970420423165e-11, "loss": 0.2456, "step": 549 }, { "epoch": 9.95475113122172, "grad_norm": 0.19744141932163012, "learning_rate": 0.0, "loss": 0.2503, "step": 550 }, { "epoch": 9.95475113122172, "step": 550, "total_flos": 9.907464757911224e+17, "train_loss": 0.2748682842471383, "train_runtime": 89439.6385, "train_samples_per_second": 0.395, "train_steps_per_second": 0.006 } ], "logging_steps": 1.0, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.907464757911224e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }