diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3892 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.95475113122172, + "eval_steps": 500, + "global_step": 550, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01809954751131222, + "grad_norm": 1.6741957199641677, + "learning_rate": 8.333333333333333e-07, + "loss": 0.392, + "step": 1 + }, + { + "epoch": 0.03619909502262444, + "grad_norm": 1.526970859287005, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.3479, + "step": 2 + }, + { + "epoch": 0.05429864253393665, + "grad_norm": 1.8103690939719148, + "learning_rate": 2.5e-06, + "loss": 0.363, + "step": 3 + }, + { + "epoch": 0.07239819004524888, + "grad_norm": 1.568077888738942, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.3513, + "step": 4 + }, + { + "epoch": 0.09049773755656108, + "grad_norm": 1.668945098216231, + "learning_rate": 4.166666666666667e-06, + "loss": 0.3759, + "step": 5 + }, + { + "epoch": 0.1085972850678733, + "grad_norm": 1.3864660758192329, + "learning_rate": 5e-06, + "loss": 0.3525, + "step": 6 + }, + { + "epoch": 0.12669683257918551, + "grad_norm": 1.538592504007101, + "learning_rate": 4.99995831202958e-06, + "loss": 0.3904, + "step": 7 + }, + { + "epoch": 0.14479638009049775, + "grad_norm": 1.2047351614977708, + "learning_rate": 4.999833249508629e-06, + "loss": 0.3924, + "step": 8 + }, + { + "epoch": 0.16289592760180996, + "grad_norm": 1.0640124047316322, + "learning_rate": 4.999624816608027e-06, + "loss": 0.375, + "step": 9 + }, + { + "epoch": 0.18099547511312217, + "grad_norm": 0.7966517341350207, + "learning_rate": 4.999333020279094e-06, + "loss": 0.356, + "step": 10 + }, + { + "epoch": 0.19909502262443438, + "grad_norm": 0.4554353875165799, + "learning_rate": 4.998957870253344e-06, + "loss": 0.3598, + "step": 11 + }, + { + "epoch": 0.2171945701357466, + "grad_norm": 0.6557533564712539, + "learning_rate": 4.998499379042172e-06, + "loss": 0.3392, + "step": 12 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.7936629840883419, + "learning_rate": 4.997957561936433e-06, + "loss": 0.3691, + "step": 13 + }, + { + "epoch": 0.25339366515837103, + "grad_norm": 0.7547277609627707, + "learning_rate": 4.997332437005932e-06, + "loss": 0.352, + "step": 14 + }, + { + "epoch": 0.27149321266968324, + "grad_norm": 0.8087501558896228, + "learning_rate": 4.996624025098819e-06, + "loss": 0.3449, + "step": 15 + }, + { + "epoch": 0.2895927601809955, + "grad_norm": 0.7820896976667914, + "learning_rate": 4.9958323498409e-06, + "loss": 0.3401, + "step": 16 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.7431270073814646, + "learning_rate": 4.99495743763484e-06, + "loss": 0.3567, + "step": 17 + }, + { + "epoch": 0.3257918552036199, + "grad_norm": 0.6777032410783791, + "learning_rate": 4.993999317659293e-06, + "loss": 0.3585, + "step": 18 + }, + { + "epoch": 0.3438914027149321, + "grad_norm": 0.6196369624534765, + "learning_rate": 4.9929580218679195e-06, + "loss": 0.3293, + "step": 19 + }, + { + "epoch": 0.36199095022624433, + "grad_norm": 0.5604472586874513, + "learning_rate": 4.991833584988326e-06, + "loss": 0.3437, + "step": 20 + }, + { + "epoch": 0.38009049773755654, + "grad_norm": 0.5137629265098744, + "learning_rate": 4.990626044520905e-06, + "loss": 0.3249, + "step": 21 + }, + { + "epoch": 0.39819004524886875, + "grad_norm": 0.547237003947588, + "learning_rate": 4.989335440737587e-06, + "loss": 0.3532, + "step": 22 + }, + { + "epoch": 0.416289592760181, + "grad_norm": 0.4164415963578454, + "learning_rate": 4.987961816680493e-06, + "loss": 0.3533, + "step": 23 + }, + { + "epoch": 0.4343891402714932, + "grad_norm": 0.35699522892651586, + "learning_rate": 4.986505218160502e-06, + "loss": 0.3268, + "step": 24 + }, + { + "epoch": 0.45248868778280543, + "grad_norm": 0.4026088211790661, + "learning_rate": 4.984965693755723e-06, + "loss": 0.3332, + "step": 25 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.35057192166080064, + "learning_rate": 4.983343294809875e-06, + "loss": 0.3245, + "step": 26 + }, + { + "epoch": 0.48868778280542985, + "grad_norm": 0.3639947181438965, + "learning_rate": 4.981638075430572e-06, + "loss": 0.3199, + "step": 27 + }, + { + "epoch": 0.5067873303167421, + "grad_norm": 0.3387354723957761, + "learning_rate": 4.979850092487525e-06, + "loss": 0.3282, + "step": 28 + }, + { + "epoch": 0.5248868778280543, + "grad_norm": 0.3528078697583281, + "learning_rate": 4.977979405610635e-06, + "loss": 0.337, + "step": 29 + }, + { + "epoch": 0.5429864253393665, + "grad_norm": 0.3126032062813636, + "learning_rate": 4.976026077188013e-06, + "loss": 0.3265, + "step": 30 + }, + { + "epoch": 0.5610859728506787, + "grad_norm": 0.3584209299955196, + "learning_rate": 4.973990172363899e-06, + "loss": 0.3568, + "step": 31 + }, + { + "epoch": 0.579185520361991, + "grad_norm": 0.4239503710543474, + "learning_rate": 4.9718717590364855e-06, + "loss": 0.3287, + "step": 32 + }, + { + "epoch": 0.5972850678733032, + "grad_norm": 0.41156579276283284, + "learning_rate": 4.969670907855651e-06, + "loss": 0.3267, + "step": 33 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.33536968371087267, + "learning_rate": 4.967387692220615e-06, + "loss": 0.3367, + "step": 34 + }, + { + "epoch": 0.6334841628959276, + "grad_norm": 0.30272106018319034, + "learning_rate": 4.965022188277474e-06, + "loss": 0.3236, + "step": 35 + }, + { + "epoch": 0.6515837104072398, + "grad_norm": 0.28697723150322, + "learning_rate": 4.962574474916678e-06, + "loss": 0.3236, + "step": 36 + }, + { + "epoch": 0.669683257918552, + "grad_norm": 0.21062422377276369, + "learning_rate": 4.960044633770387e-06, + "loss": 0.3295, + "step": 37 + }, + { + "epoch": 0.6877828054298643, + "grad_norm": 0.28283155334950705, + "learning_rate": 4.957432749209755e-06, + "loss": 0.3453, + "step": 38 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.2161778814999892, + "learning_rate": 4.954738908342116e-06, + "loss": 0.3645, + "step": 39 + }, + { + "epoch": 0.7239819004524887, + "grad_norm": 0.2354424408008659, + "learning_rate": 4.9519632010080765e-06, + "loss": 0.3372, + "step": 40 + }, + { + "epoch": 0.7420814479638009, + "grad_norm": 0.26054770828411217, + "learning_rate": 4.9491057197785205e-06, + "loss": 0.3349, + "step": 41 + }, + { + "epoch": 0.7601809954751131, + "grad_norm": 0.2596310001381547, + "learning_rate": 4.946166559951523e-06, + "loss": 0.3174, + "step": 42 + }, + { + "epoch": 0.7782805429864253, + "grad_norm": 0.2763815562688228, + "learning_rate": 4.943145819549169e-06, + "loss": 0.3464, + "step": 43 + }, + { + "epoch": 0.7963800904977375, + "grad_norm": 0.2508801820692124, + "learning_rate": 4.9400435993142895e-06, + "loss": 0.3277, + "step": 44 + }, + { + "epoch": 0.8144796380090498, + "grad_norm": 0.25823275674527674, + "learning_rate": 4.936860002707096e-06, + "loss": 0.343, + "step": 45 + }, + { + "epoch": 0.832579185520362, + "grad_norm": 0.23862916529933217, + "learning_rate": 4.933595135901733e-06, + "loss": 0.3425, + "step": 46 + }, + { + "epoch": 0.8506787330316742, + "grad_norm": 0.2377285409864031, + "learning_rate": 4.9302491077827366e-06, + "loss": 0.3345, + "step": 47 + }, + { + "epoch": 0.8687782805429864, + "grad_norm": 0.2054263655021643, + "learning_rate": 4.926822029941406e-06, + "loss": 0.3599, + "step": 48 + }, + { + "epoch": 0.8868778280542986, + "grad_norm": 0.21857378026560212, + "learning_rate": 4.923314016672075e-06, + "loss": 0.3293, + "step": 49 + }, + { + "epoch": 0.9049773755656109, + "grad_norm": 0.20834775020466292, + "learning_rate": 4.919725184968307e-06, + "loss": 0.3231, + "step": 50 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.2000139484905926, + "learning_rate": 4.9160556545189895e-06, + "loss": 0.3248, + "step": 51 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.24124485812118368, + "learning_rate": 4.9123055477043454e-06, + "loss": 0.3314, + "step": 52 + }, + { + "epoch": 0.9592760180995475, + "grad_norm": 0.26803109191751107, + "learning_rate": 4.908474989591846e-06, + "loss": 0.3341, + "step": 53 + }, + { + "epoch": 0.9773755656108597, + "grad_norm": 0.21490833872159623, + "learning_rate": 4.904564107932048e-06, + "loss": 0.3189, + "step": 54 + }, + { + "epoch": 0.995475113122172, + "grad_norm": 0.22738113980709365, + "learning_rate": 4.900573033154325e-06, + "loss": 0.3198, + "step": 55 + }, + { + "epoch": 1.0135746606334841, + "grad_norm": 0.1860953606536629, + "learning_rate": 4.8965018983625245e-06, + "loss": 0.3273, + "step": 56 + }, + { + "epoch": 1.0316742081447963, + "grad_norm": 0.2170252756734204, + "learning_rate": 4.8923508393305224e-06, + "loss": 0.3058, + "step": 57 + }, + { + "epoch": 1.0497737556561086, + "grad_norm": 0.19753070998453712, + "learning_rate": 4.888119994497701e-06, + "loss": 0.2949, + "step": 58 + }, + { + "epoch": 1.0678733031674208, + "grad_norm": 0.21040212719480175, + "learning_rate": 4.883809504964325e-06, + "loss": 0.298, + "step": 59 + }, + { + "epoch": 1.085972850678733, + "grad_norm": 0.20799415615187367, + "learning_rate": 4.879419514486846e-06, + "loss": 0.3201, + "step": 60 + }, + { + "epoch": 1.1040723981900453, + "grad_norm": 0.19784508945913667, + "learning_rate": 4.874950169473097e-06, + "loss": 0.3338, + "step": 61 + }, + { + "epoch": 1.1221719457013575, + "grad_norm": 0.20898074744097636, + "learning_rate": 4.870401618977415e-06, + "loss": 0.3053, + "step": 62 + }, + { + "epoch": 1.1402714932126696, + "grad_norm": 0.21530409824217756, + "learning_rate": 4.8657740146956724e-06, + "loss": 0.3346, + "step": 63 + }, + { + "epoch": 1.1583710407239818, + "grad_norm": 0.21656570481740497, + "learning_rate": 4.8610675109602135e-06, + "loss": 0.3175, + "step": 64 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.18916438407683134, + "learning_rate": 4.856282264734708e-06, + "loss": 0.2973, + "step": 65 + }, + { + "epoch": 1.1945701357466063, + "grad_norm": 0.19298959302885896, + "learning_rate": 4.851418435608919e-06, + "loss": 0.3328, + "step": 66 + }, + { + "epoch": 1.2126696832579185, + "grad_norm": 0.19382840884955524, + "learning_rate": 4.84647618579338e-06, + "loss": 0.3233, + "step": 67 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.22308099956654967, + "learning_rate": 4.841455680113979e-06, + "loss": 0.3401, + "step": 68 + }, + { + "epoch": 1.248868778280543, + "grad_norm": 0.1908581730308582, + "learning_rate": 4.836357086006471e-06, + "loss": 0.3199, + "step": 69 + }, + { + "epoch": 1.2669683257918551, + "grad_norm": 0.1900661127768816, + "learning_rate": 4.83118057351089e-06, + "loss": 0.3193, + "step": 70 + }, + { + "epoch": 1.2850678733031673, + "grad_norm": 0.1842083788274683, + "learning_rate": 4.825926315265874e-06, + "loss": 0.3093, + "step": 71 + }, + { + "epoch": 1.3031674208144797, + "grad_norm": 0.19304820753044424, + "learning_rate": 4.820594486502913e-06, + "loss": 0.3147, + "step": 72 + }, + { + "epoch": 1.3212669683257918, + "grad_norm": 0.1865184743330753, + "learning_rate": 4.815185265040504e-06, + "loss": 0.3371, + "step": 73 + }, + { + "epoch": 1.3393665158371042, + "grad_norm": 0.21257371675686554, + "learning_rate": 4.809698831278217e-06, + "loss": 0.3556, + "step": 74 + }, + { + "epoch": 1.3574660633484164, + "grad_norm": 0.19738810108074692, + "learning_rate": 4.804135368190684e-06, + "loss": 0.3098, + "step": 75 + }, + { + "epoch": 1.3755656108597285, + "grad_norm": 0.20419379710110824, + "learning_rate": 4.798495061321492e-06, + "loss": 0.3037, + "step": 76 + }, + { + "epoch": 1.3936651583710407, + "grad_norm": 0.21182701854581448, + "learning_rate": 4.792778098776997e-06, + "loss": 0.3046, + "step": 77 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.20966701782750055, + "learning_rate": 4.786984671220053e-06, + "loss": 0.3146, + "step": 78 + }, + { + "epoch": 1.4298642533936652, + "grad_norm": 0.2228994463496351, + "learning_rate": 4.7811149718636475e-06, + "loss": 0.3133, + "step": 79 + }, + { + "epoch": 1.4479638009049773, + "grad_norm": 0.2125517747018847, + "learning_rate": 4.7751691964644655e-06, + "loss": 0.3181, + "step": 80 + }, + { + "epoch": 1.4660633484162897, + "grad_norm": 0.18774294015726306, + "learning_rate": 4.7691475433163515e-06, + "loss": 0.3107, + "step": 81 + }, + { + "epoch": 1.4841628959276019, + "grad_norm": 0.2105655304494509, + "learning_rate": 4.763050213243705e-06, + "loss": 0.3193, + "step": 82 + }, + { + "epoch": 1.502262443438914, + "grad_norm": 0.2101302949838479, + "learning_rate": 4.7568774095947804e-06, + "loss": 0.3372, + "step": 83 + }, + { + "epoch": 1.5203619909502262, + "grad_norm": 0.1761520660073366, + "learning_rate": 4.7506293382349e-06, + "loss": 0.3058, + "step": 84 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.20214706457289192, + "learning_rate": 4.744306207539595e-06, + "loss": 0.34, + "step": 85 + }, + { + "epoch": 1.5565610859728507, + "grad_norm": 0.21608846929666756, + "learning_rate": 4.737908228387656e-06, + "loss": 0.3285, + "step": 86 + }, + { + "epoch": 1.5746606334841629, + "grad_norm": 0.19692503921435273, + "learning_rate": 4.731435614154094e-06, + "loss": 0.3134, + "step": 87 + }, + { + "epoch": 1.5927601809954752, + "grad_norm": 0.19107736826101185, + "learning_rate": 4.72488858070303e-06, + "loss": 0.305, + "step": 88 + }, + { + "epoch": 1.6108597285067874, + "grad_norm": 0.19148405595657123, + "learning_rate": 4.718267346380492e-06, + "loss": 0.3157, + "step": 89 + }, + { + "epoch": 1.6289592760180995, + "grad_norm": 0.19180277215162053, + "learning_rate": 4.711572132007139e-06, + "loss": 0.3124, + "step": 90 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.19539080957269014, + "learning_rate": 4.704803160870888e-06, + "loss": 0.3306, + "step": 91 + }, + { + "epoch": 1.6651583710407238, + "grad_norm": 0.21052797618402563, + "learning_rate": 4.697960658719475e-06, + "loss": 0.3061, + "step": 92 + }, + { + "epoch": 1.6832579185520362, + "grad_norm": 0.20191616959818315, + "learning_rate": 4.69104485375292e-06, + "loss": 0.3098, + "step": 93 + }, + { + "epoch": 1.7013574660633484, + "grad_norm": 0.2159013380308242, + "learning_rate": 4.684055976615924e-06, + "loss": 0.3088, + "step": 94 + }, + { + "epoch": 1.7194570135746607, + "grad_norm": 0.18904626555927467, + "learning_rate": 4.676994260390168e-06, + "loss": 0.2912, + "step": 95 + }, + { + "epoch": 1.737556561085973, + "grad_norm": 0.19467640291002175, + "learning_rate": 4.6698599405865465e-06, + "loss": 0.303, + "step": 96 + }, + { + "epoch": 1.755656108597285, + "grad_norm": 0.2880548104749461, + "learning_rate": 4.662653255137308e-06, + "loss": 0.3348, + "step": 97 + }, + { + "epoch": 1.7737556561085972, + "grad_norm": 0.2019155699824381, + "learning_rate": 4.655374444388127e-06, + "loss": 0.327, + "step": 98 + }, + { + "epoch": 1.7918552036199094, + "grad_norm": 0.2592156259533593, + "learning_rate": 4.648023751090079e-06, + "loss": 0.3363, + "step": 99 + }, + { + "epoch": 1.8099547511312217, + "grad_norm": 0.2180192099378802, + "learning_rate": 4.640601420391554e-06, + "loss": 0.3113, + "step": 100 + }, + { + "epoch": 1.8280542986425339, + "grad_norm": 0.20679493678747934, + "learning_rate": 4.633107699830073e-06, + "loss": 0.3148, + "step": 101 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.2053440368213778, + "learning_rate": 4.625542839324036e-06, + "loss": 0.2967, + "step": 102 + }, + { + "epoch": 1.8642533936651584, + "grad_norm": 0.19200611510261656, + "learning_rate": 4.617907091164389e-06, + "loss": 0.3188, + "step": 103 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.2302101510970096, + "learning_rate": 4.610200710006206e-06, + "loss": 0.3121, + "step": 104 + }, + { + "epoch": 1.9004524886877827, + "grad_norm": 0.2221804604843677, + "learning_rate": 4.602423952860199e-06, + "loss": 0.3146, + "step": 105 + }, + { + "epoch": 1.9185520361990949, + "grad_norm": 0.21983834708053807, + "learning_rate": 4.594577079084146e-06, + "loss": 0.3405, + "step": 106 + }, + { + "epoch": 1.9366515837104072, + "grad_norm": 0.21085636909889235, + "learning_rate": 4.58666035037424e-06, + "loss": 0.3089, + "step": 107 + }, + { + "epoch": 1.9547511312217196, + "grad_norm": 0.2016884181282795, + "learning_rate": 4.578674030756364e-06, + "loss": 0.3229, + "step": 108 + }, + { + "epoch": 1.9728506787330318, + "grad_norm": 0.19657023773974253, + "learning_rate": 4.57061838657728e-06, + "loss": 0.3237, + "step": 109 + }, + { + "epoch": 1.990950226244344, + "grad_norm": 0.20813455436587358, + "learning_rate": 4.562493686495756e-06, + "loss": 0.3255, + "step": 110 + }, + { + "epoch": 2.009049773755656, + "grad_norm": 0.18872409832307335, + "learning_rate": 4.5543002014735955e-06, + "loss": 0.2988, + "step": 111 + }, + { + "epoch": 2.0271493212669682, + "grad_norm": 0.19594421270270285, + "learning_rate": 4.546038204766609e-06, + "loss": 0.3109, + "step": 112 + }, + { + "epoch": 2.0452488687782804, + "grad_norm": 0.22355285614452686, + "learning_rate": 4.537707971915495e-06, + "loss": 0.3066, + "step": 113 + }, + { + "epoch": 2.0633484162895925, + "grad_norm": 0.2017792264758022, + "learning_rate": 4.529309780736654e-06, + "loss": 0.2939, + "step": 114 + }, + { + "epoch": 2.081447963800905, + "grad_norm": 0.20223483022494018, + "learning_rate": 4.520843911312922e-06, + "loss": 0.294, + "step": 115 + }, + { + "epoch": 2.0995475113122173, + "grad_norm": 0.20322098664858632, + "learning_rate": 4.512310645984231e-06, + "loss": 0.2984, + "step": 116 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.20705072743185104, + "learning_rate": 4.503710269338191e-06, + "loss": 0.2694, + "step": 117 + }, + { + "epoch": 2.1357466063348416, + "grad_norm": 0.18442012590242893, + "learning_rate": 4.4950430682005995e-06, + "loss": 0.2979, + "step": 118 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 0.2076635780792367, + "learning_rate": 4.486309331625877e-06, + "loss": 0.2874, + "step": 119 + }, + { + "epoch": 2.171945701357466, + "grad_norm": 0.19968964517363474, + "learning_rate": 4.477509350887424e-06, + "loss": 0.291, + "step": 120 + }, + { + "epoch": 2.1900452488687785, + "grad_norm": 0.18959726179400077, + "learning_rate": 4.468643419467909e-06, + "loss": 0.2921, + "step": 121 + }, + { + "epoch": 2.2081447963800906, + "grad_norm": 0.2388780187488927, + "learning_rate": 4.459711833049485e-06, + "loss": 0.3061, + "step": 122 + }, + { + "epoch": 2.226244343891403, + "grad_norm": 0.22092548916393367, + "learning_rate": 4.4507148895039165e-06, + "loss": 0.2765, + "step": 123 + }, + { + "epoch": 2.244343891402715, + "grad_norm": 0.21070917452514223, + "learning_rate": 4.4416528888826595e-06, + "loss": 0.2969, + "step": 124 + }, + { + "epoch": 2.262443438914027, + "grad_norm": 0.19807472108481627, + "learning_rate": 4.432526133406843e-06, + "loss": 0.3044, + "step": 125 + }, + { + "epoch": 2.2805429864253393, + "grad_norm": 0.1910225174641335, + "learning_rate": 4.423334927457198e-06, + "loss": 0.3132, + "step": 126 + }, + { + "epoch": 2.2986425339366514, + "grad_norm": 0.2203923882052516, + "learning_rate": 4.414079577563901e-06, + "loss": 0.3032, + "step": 127 + }, + { + "epoch": 2.3167420814479636, + "grad_norm": 0.21331518168793756, + "learning_rate": 4.404760392396355e-06, + "loss": 0.3033, + "step": 128 + }, + { + "epoch": 2.334841628959276, + "grad_norm": 0.21461268917839496, + "learning_rate": 4.3953776827528925e-06, + "loss": 0.3039, + "step": 129 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.1862241130798519, + "learning_rate": 4.385931761550411e-06, + "loss": 0.2793, + "step": 130 + }, + { + "epoch": 2.3710407239819005, + "grad_norm": 0.19779667332990994, + "learning_rate": 4.376422943813936e-06, + "loss": 0.2849, + "step": 131 + }, + { + "epoch": 2.3891402714932126, + "grad_norm": 0.20538470648954774, + "learning_rate": 4.366851546666118e-06, + "loss": 0.3129, + "step": 132 + }, + { + "epoch": 2.4072398190045248, + "grad_norm": 0.20067043214876432, + "learning_rate": 4.357217889316657e-06, + "loss": 0.3041, + "step": 133 + }, + { + "epoch": 2.425339366515837, + "grad_norm": 0.1997136625573991, + "learning_rate": 4.3475222930516484e-06, + "loss": 0.2839, + "step": 134 + }, + { + "epoch": 2.4434389140271495, + "grad_norm": 0.20004099038403145, + "learning_rate": 4.3377650812228765e-06, + "loss": 0.3014, + "step": 135 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 0.19311135694466858, + "learning_rate": 4.327946579237028e-06, + "loss": 0.2834, + "step": 136 + }, + { + "epoch": 2.479638009049774, + "grad_norm": 0.21078445039076968, + "learning_rate": 4.318067114544838e-06, + "loss": 0.2796, + "step": 137 + }, + { + "epoch": 2.497737556561086, + "grad_norm": 0.21975365365759061, + "learning_rate": 4.308127016630176e-06, + "loss": 0.2972, + "step": 138 + }, + { + "epoch": 2.515837104072398, + "grad_norm": 0.21203142423348517, + "learning_rate": 4.2981266169990436e-06, + "loss": 0.3196, + "step": 139 + }, + { + "epoch": 2.5339366515837103, + "grad_norm": 0.20131092451024465, + "learning_rate": 4.2880662491685345e-06, + "loss": 0.3003, + "step": 140 + }, + { + "epoch": 2.5520361990950224, + "grad_norm": 0.22294798360675439, + "learning_rate": 4.277946248655701e-06, + "loss": 0.2947, + "step": 141 + }, + { + "epoch": 2.5701357466063346, + "grad_norm": 0.22859386995564024, + "learning_rate": 4.267766952966369e-06, + "loss": 0.2958, + "step": 142 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.19567845392715985, + "learning_rate": 4.257528701583882e-06, + "loss": 0.2998, + "step": 143 + }, + { + "epoch": 2.6063348416289593, + "grad_norm": 0.19741413456031112, + "learning_rate": 4.247231835957773e-06, + "loss": 0.3408, + "step": 144 + }, + { + "epoch": 2.6244343891402715, + "grad_norm": 0.19905612890447116, + "learning_rate": 4.236876699492391e-06, + "loss": 0.3117, + "step": 145 + }, + { + "epoch": 2.6425339366515836, + "grad_norm": 0.1942385041095113, + "learning_rate": 4.226463637535429e-06, + "loss": 0.3152, + "step": 146 + }, + { + "epoch": 2.660633484162896, + "grad_norm": 0.22327732166814804, + "learning_rate": 4.215992997366425e-06, + "loss": 0.3142, + "step": 147 + }, + { + "epoch": 2.6787330316742084, + "grad_norm": 0.1935161282714164, + "learning_rate": 4.2054651281851685e-06, + "loss": 0.3081, + "step": 148 + }, + { + "epoch": 2.6968325791855206, + "grad_norm": 0.23957566926280122, + "learning_rate": 4.1948803811000585e-06, + "loss": 0.2894, + "step": 149 + }, + { + "epoch": 2.7149321266968327, + "grad_norm": 0.18805009890662516, + "learning_rate": 4.184239109116393e-06, + "loss": 0.2984, + "step": 150 + }, + { + "epoch": 2.733031674208145, + "grad_norm": 0.212580814141281, + "learning_rate": 4.173541667124599e-06, + "loss": 0.3097, + "step": 151 + }, + { + "epoch": 2.751131221719457, + "grad_norm": 0.19712271093008257, + "learning_rate": 4.1627884118883925e-06, + "loss": 0.3177, + "step": 152 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 0.2278946892968003, + "learning_rate": 4.1519797020328815e-06, + "loss": 0.3101, + "step": 153 + }, + { + "epoch": 2.7873303167420813, + "grad_norm": 0.21064766645861627, + "learning_rate": 4.141115898032607e-06, + "loss": 0.274, + "step": 154 + }, + { + "epoch": 2.8054298642533935, + "grad_norm": 0.20995612915210915, + "learning_rate": 4.130197362199521e-06, + "loss": 0.2926, + "step": 155 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.21633523471290103, + "learning_rate": 4.119224458670905e-06, + "loss": 0.2875, + "step": 156 + }, + { + "epoch": 2.841628959276018, + "grad_norm": 0.21266765467202223, + "learning_rate": 4.1081975533972185e-06, + "loss": 0.2947, + "step": 157 + }, + { + "epoch": 2.8597285067873304, + "grad_norm": 0.19506346084116072, + "learning_rate": 4.097117014129903e-06, + "loss": 0.296, + "step": 158 + }, + { + "epoch": 2.8778280542986425, + "grad_norm": 0.1986031276610744, + "learning_rate": 4.085983210409114e-06, + "loss": 0.2988, + "step": 159 + }, + { + "epoch": 2.8959276018099547, + "grad_norm": 0.22662474336309782, + "learning_rate": 4.074796513551395e-06, + "loss": 0.2952, + "step": 160 + }, + { + "epoch": 2.914027149321267, + "grad_norm": 0.21721813582738397, + "learning_rate": 4.063557296637295e-06, + "loss": 0.3099, + "step": 161 + }, + { + "epoch": 2.9321266968325794, + "grad_norm": 0.2133328804989817, + "learning_rate": 4.052265934498929e-06, + "loss": 0.2974, + "step": 162 + }, + { + "epoch": 2.9502262443438916, + "grad_norm": 0.1960218423105953, + "learning_rate": 4.040922803707474e-06, + "loss": 0.3065, + "step": 163 + }, + { + "epoch": 2.9683257918552037, + "grad_norm": 0.22167341080572722, + "learning_rate": 4.029528282560609e-06, + "loss": 0.2886, + "step": 164 + }, + { + "epoch": 2.986425339366516, + "grad_norm": 0.20386239234209946, + "learning_rate": 4.018082751069904e-06, + "loss": 0.3076, + "step": 165 + }, + { + "epoch": 3.004524886877828, + "grad_norm": 0.23748187918298697, + "learning_rate": 4.006586590948141e-06, + "loss": 0.2985, + "step": 166 + }, + { + "epoch": 3.02262443438914, + "grad_norm": 0.22617083435609797, + "learning_rate": 3.995040185596588e-06, + "loss": 0.2754, + "step": 167 + }, + { + "epoch": 3.0407239819004523, + "grad_norm": 0.23986769037952196, + "learning_rate": 3.983443920092206e-06, + "loss": 0.2854, + "step": 168 + }, + { + "epoch": 3.0588235294117645, + "grad_norm": 0.20150185345396, + "learning_rate": 3.971798181174816e-06, + "loss": 0.2832, + "step": 169 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.20913884879987113, + "learning_rate": 3.960103357234192e-06, + "loss": 0.2986, + "step": 170 + }, + { + "epoch": 3.0950226244343892, + "grad_norm": 0.20477890710932672, + "learning_rate": 3.948359838297115e-06, + "loss": 0.2876, + "step": 171 + }, + { + "epoch": 3.1131221719457014, + "grad_norm": 0.2031405516319826, + "learning_rate": 3.9365680160143595e-06, + "loss": 0.2971, + "step": 172 + }, + { + "epoch": 3.1312217194570136, + "grad_norm": 0.185634722744017, + "learning_rate": 3.924728283647638e-06, + "loss": 0.279, + "step": 173 + }, + { + "epoch": 3.1493212669683257, + "grad_norm": 0.20745314489894484, + "learning_rate": 3.91284103605648e-06, + "loss": 0.2903, + "step": 174 + }, + { + "epoch": 3.167420814479638, + "grad_norm": 0.20741649089082642, + "learning_rate": 3.9009066696850664e-06, + "loss": 0.2964, + "step": 175 + }, + { + "epoch": 3.1855203619909505, + "grad_norm": 0.20883578071304365, + "learning_rate": 3.888925582549006e-06, + "loss": 0.2946, + "step": 176 + }, + { + "epoch": 3.2036199095022626, + "grad_norm": 0.21451927304435986, + "learning_rate": 3.8768981742220646e-06, + "loss": 0.2811, + "step": 177 + }, + { + "epoch": 3.2217194570135748, + "grad_norm": 0.21080586953456093, + "learning_rate": 3.864824845822837e-06, + "loss": 0.2825, + "step": 178 + }, + { + "epoch": 3.239819004524887, + "grad_norm": 0.20609867837665838, + "learning_rate": 3.852706000001367e-06, + "loss": 0.2903, + "step": 179 + }, + { + "epoch": 3.257918552036199, + "grad_norm": 0.1972989252518201, + "learning_rate": 3.840542040925725e-06, + "loss": 0.2626, + "step": 180 + }, + { + "epoch": 3.276018099547511, + "grad_norm": 0.21568160972522105, + "learning_rate": 3.828333374268523e-06, + "loss": 0.2906, + "step": 181 + }, + { + "epoch": 3.2941176470588234, + "grad_norm": 0.1997067887507601, + "learning_rate": 3.81608040719339e-06, + "loss": 0.2862, + "step": 182 + }, + { + "epoch": 3.3122171945701355, + "grad_norm": 0.2017980016690952, + "learning_rate": 3.8037835483413877e-06, + "loss": 0.2855, + "step": 183 + }, + { + "epoch": 3.330316742081448, + "grad_norm": 0.20121264738949698, + "learning_rate": 3.7914432078173867e-06, + "loss": 0.2795, + "step": 184 + }, + { + "epoch": 3.3484162895927603, + "grad_norm": 0.22611700851822947, + "learning_rate": 3.7790597971763892e-06, + "loss": 0.2836, + "step": 185 + }, + { + "epoch": 3.3665158371040724, + "grad_norm": 0.2353941218093955, + "learning_rate": 3.7666337294097987e-06, + "loss": 0.288, + "step": 186 + }, + { + "epoch": 3.3846153846153846, + "grad_norm": 0.18605988505854537, + "learning_rate": 3.7541654189316525e-06, + "loss": 0.275, + "step": 187 + }, + { + "epoch": 3.4027149321266967, + "grad_norm": 0.22628052198695675, + "learning_rate": 3.741655281564796e-06, + "loss": 0.2966, + "step": 188 + }, + { + "epoch": 3.420814479638009, + "grad_norm": 0.21236583352079183, + "learning_rate": 3.72910373452702e-06, + "loss": 0.2702, + "step": 189 + }, + { + "epoch": 3.4389140271493215, + "grad_norm": 0.22886628365130654, + "learning_rate": 3.7165111964171407e-06, + "loss": 0.2718, + "step": 190 + }, + { + "epoch": 3.4570135746606336, + "grad_norm": 0.19177205299999378, + "learning_rate": 3.703878087201044e-06, + "loss": 0.2785, + "step": 191 + }, + { + "epoch": 3.475113122171946, + "grad_norm": 0.21164213919986813, + "learning_rate": 3.6912048281976764e-06, + "loss": 0.2991, + "step": 192 + }, + { + "epoch": 3.493212669683258, + "grad_norm": 0.20082392739954888, + "learning_rate": 3.6784918420649952e-06, + "loss": 0.2814, + "step": 193 + }, + { + "epoch": 3.51131221719457, + "grad_norm": 0.21531730826425216, + "learning_rate": 3.66573955278587e-06, + "loss": 0.2719, + "step": 194 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.20053264760640085, + "learning_rate": 3.6529483856539512e-06, + "loss": 0.2639, + "step": 195 + }, + { + "epoch": 3.5475113122171944, + "grad_norm": 0.18714903727677973, + "learning_rate": 3.640118767259474e-06, + "loss": 0.2712, + "step": 196 + }, + { + "epoch": 3.5656108597285066, + "grad_norm": 0.19923843024788357, + "learning_rate": 3.6272511254750403e-06, + "loss": 0.2825, + "step": 197 + }, + { + "epoch": 3.583710407239819, + "grad_norm": 0.2016875130706868, + "learning_rate": 3.6143458894413463e-06, + "loss": 0.2977, + "step": 198 + }, + { + "epoch": 3.6018099547511313, + "grad_norm": 0.21861290041385015, + "learning_rate": 3.6014034895528705e-06, + "loss": 0.284, + "step": 199 + }, + { + "epoch": 3.6199095022624435, + "grad_norm": 0.16879798551287897, + "learning_rate": 3.588424357443521e-06, + "loss": 0.2782, + "step": 200 + }, + { + "epoch": 3.6380090497737556, + "grad_norm": 0.22087168375536256, + "learning_rate": 3.5754089259722365e-06, + "loss": 0.2902, + "step": 201 + }, + { + "epoch": 3.6561085972850678, + "grad_norm": 0.2219253724635141, + "learning_rate": 3.5623576292085555e-06, + "loss": 0.294, + "step": 202 + }, + { + "epoch": 3.6742081447963804, + "grad_norm": 0.19173446074813308, + "learning_rate": 3.549270902418136e-06, + "loss": 0.2715, + "step": 203 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 0.20017688015075918, + "learning_rate": 3.536149182048243e-06, + "loss": 0.2823, + "step": 204 + }, + { + "epoch": 3.7104072398190047, + "grad_norm": 0.19825967232402708, + "learning_rate": 3.5229929057131877e-06, + "loss": 0.2881, + "step": 205 + }, + { + "epoch": 3.728506787330317, + "grad_norm": 0.218766824892692, + "learning_rate": 3.5098025121797375e-06, + "loss": 0.2999, + "step": 206 + }, + { + "epoch": 3.746606334841629, + "grad_norm": 0.18895340916933914, + "learning_rate": 3.496578441352481e-06, + "loss": 0.2687, + "step": 207 + }, + { + "epoch": 3.764705882352941, + "grad_norm": 0.19002028251920547, + "learning_rate": 3.4833211342591565e-06, + "loss": 0.2866, + "step": 208 + }, + { + "epoch": 3.7828054298642533, + "grad_norm": 0.20990408194136284, + "learning_rate": 3.4700310330359456e-06, + "loss": 0.2805, + "step": 209 + }, + { + "epoch": 3.8009049773755654, + "grad_norm": 0.1929810608863491, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.2864, + "step": 210 + }, + { + "epoch": 3.8190045248868776, + "grad_norm": 0.2026264509793902, + "learning_rate": 3.4433542221982863e-06, + "loss": 0.2847, + "step": 211 + }, + { + "epoch": 3.83710407239819, + "grad_norm": 0.20527047718646593, + "learning_rate": 3.4299684022655196e-06, + "loss": 0.285, + "step": 212 + }, + { + "epoch": 3.8552036199095023, + "grad_norm": 0.2004805974514508, + "learning_rate": 3.4165515675365558e-06, + "loss": 0.2862, + "step": 213 + }, + { + "epoch": 3.8733031674208145, + "grad_norm": 0.18650100732919933, + "learning_rate": 3.403104165467883e-06, + "loss": 0.2748, + "step": 214 + }, + { + "epoch": 3.8914027149321266, + "grad_norm": 0.21391027704520638, + "learning_rate": 3.3896266445354208e-06, + "loss": 0.2875, + "step": 215 + }, + { + "epoch": 3.909502262443439, + "grad_norm": 0.19076214648617013, + "learning_rate": 3.376119454219565e-06, + "loss": 0.2811, + "step": 216 + }, + { + "epoch": 3.9276018099547514, + "grad_norm": 0.22812272390771685, + "learning_rate": 3.362583044990195e-06, + "loss": 0.2923, + "step": 217 + }, + { + "epoch": 3.9457013574660635, + "grad_norm": 0.2114461488141671, + "learning_rate": 3.3490178682916534e-06, + "loss": 0.2784, + "step": 218 + }, + { + "epoch": 3.9638009049773757, + "grad_norm": 0.20971069504695025, + "learning_rate": 3.335424376527688e-06, + "loss": 0.2796, + "step": 219 + }, + { + "epoch": 3.981900452488688, + "grad_norm": 0.20721944316747504, + "learning_rate": 3.321803023046366e-06, + "loss": 0.2855, + "step": 220 + }, + { + "epoch": 4.0, + "grad_norm": 0.19529166110448204, + "learning_rate": 3.3081542621249503e-06, + "loss": 0.2722, + "step": 221 + }, + { + "epoch": 4.018099547511312, + "grad_norm": 0.2181909689708081, + "learning_rate": 3.2944785489547544e-06, + "loss": 0.2769, + "step": 222 + }, + { + "epoch": 4.036199095022624, + "grad_norm": 0.2041261831519089, + "learning_rate": 3.2807763396259597e-06, + "loss": 0.2755, + "step": 223 + }, + { + "epoch": 4.0542986425339365, + "grad_norm": 0.17317381953746722, + "learning_rate": 3.2670480911124045e-06, + "loss": 0.2457, + "step": 224 + }, + { + "epoch": 4.072398190045249, + "grad_norm": 0.20985257213280492, + "learning_rate": 3.2532942612563436e-06, + "loss": 0.3084, + "step": 225 + }, + { + "epoch": 4.090497737556561, + "grad_norm": 0.1805568892682367, + "learning_rate": 3.2395153087531767e-06, + "loss": 0.2688, + "step": 226 + }, + { + "epoch": 4.108597285067873, + "grad_norm": 0.20212288478471152, + "learning_rate": 3.225711693136156e-06, + "loss": 0.2678, + "step": 227 + }, + { + "epoch": 4.126696832579185, + "grad_norm": 0.20073111869372287, + "learning_rate": 3.211883874761058e-06, + "loss": 0.2636, + "step": 228 + }, + { + "epoch": 4.144796380090498, + "grad_norm": 0.21913185170065114, + "learning_rate": 3.19803231479083e-06, + "loss": 0.282, + "step": 229 + }, + { + "epoch": 4.16289592760181, + "grad_norm": 0.20273970778843858, + "learning_rate": 3.184157475180208e-06, + "loss": 0.2689, + "step": 230 + }, + { + "epoch": 4.180995475113122, + "grad_norm": 0.17451394799617262, + "learning_rate": 3.1702598186603152e-06, + "loss": 0.2583, + "step": 231 + }, + { + "epoch": 4.199095022624435, + "grad_norm": 0.1918998406915714, + "learning_rate": 3.1563398087232265e-06, + "loss": 0.2795, + "step": 232 + }, + { + "epoch": 4.217194570135747, + "grad_norm": 0.1970262585975004, + "learning_rate": 3.1423979096065134e-06, + "loss": 0.2605, + "step": 233 + }, + { + "epoch": 4.235294117647059, + "grad_norm": 0.18769426018045784, + "learning_rate": 3.1284345862777572e-06, + "loss": 0.2592, + "step": 234 + }, + { + "epoch": 4.253393665158371, + "grad_norm": 0.18870339061291633, + "learning_rate": 3.1144503044190456e-06, + "loss": 0.2642, + "step": 235 + }, + { + "epoch": 4.271493212669683, + "grad_norm": 0.18389039727257753, + "learning_rate": 3.100445530411442e-06, + "loss": 0.2376, + "step": 236 + }, + { + "epoch": 4.289592760180995, + "grad_norm": 0.20812476367192412, + "learning_rate": 3.086420731319429e-06, + "loss": 0.2708, + "step": 237 + }, + { + "epoch": 4.3076923076923075, + "grad_norm": 0.197363611944904, + "learning_rate": 3.0723763748753354e-06, + "loss": 0.2844, + "step": 238 + }, + { + "epoch": 4.32579185520362, + "grad_norm": 0.20806148893145612, + "learning_rate": 3.0583129294637342e-06, + "loss": 0.2487, + "step": 239 + }, + { + "epoch": 4.343891402714932, + "grad_norm": 0.1903831131540733, + "learning_rate": 3.044230864105821e-06, + "loss": 0.256, + "step": 240 + }, + { + "epoch": 4.361990950226244, + "grad_norm": 0.19042569669796638, + "learning_rate": 3.030130648443777e-06, + "loss": 0.2788, + "step": 241 + }, + { + "epoch": 4.380090497737557, + "grad_norm": 0.18850059553704934, + "learning_rate": 3.0160127527250993e-06, + "loss": 0.2808, + "step": 242 + }, + { + "epoch": 4.398190045248869, + "grad_norm": 0.20047462331384677, + "learning_rate": 3.0018776477869244e-06, + "loss": 0.2654, + "step": 243 + }, + { + "epoch": 4.416289592760181, + "grad_norm": 0.20580882731270267, + "learning_rate": 2.9877258050403214e-06, + "loss": 0.2753, + "step": 244 + }, + { + "epoch": 4.4343891402714934, + "grad_norm": 0.18933219589587963, + "learning_rate": 2.973557696454571e-06, + "loss": 0.2627, + "step": 245 + }, + { + "epoch": 4.452488687782806, + "grad_norm": 0.2019766662866527, + "learning_rate": 2.9593737945414264e-06, + "loss": 0.2779, + "step": 246 + }, + { + "epoch": 4.470588235294118, + "grad_norm": 0.21814847735110302, + "learning_rate": 2.9451745723393547e-06, + "loss": 0.2747, + "step": 247 + }, + { + "epoch": 4.48868778280543, + "grad_norm": 0.2044800165411035, + "learning_rate": 2.930960503397761e-06, + "loss": 0.2726, + "step": 248 + }, + { + "epoch": 4.506787330316742, + "grad_norm": 0.21142216034006506, + "learning_rate": 2.916732061761192e-06, + "loss": 0.2646, + "step": 249 + }, + { + "epoch": 4.524886877828054, + "grad_norm": 0.20150189042920258, + "learning_rate": 2.9024897219535326e-06, + "loss": 0.279, + "step": 250 + }, + { + "epoch": 4.542986425339366, + "grad_norm": 0.19100387279656014, + "learning_rate": 2.8882339589621742e-06, + "loss": 0.2795, + "step": 251 + }, + { + "epoch": 4.5610859728506785, + "grad_norm": 0.18930829794972215, + "learning_rate": 2.873965248222178e-06, + "loss": 0.2672, + "step": 252 + }, + { + "epoch": 4.579185520361991, + "grad_norm": 0.18814066866007795, + "learning_rate": 2.859684065600417e-06, + "loss": 0.2478, + "step": 253 + }, + { + "epoch": 4.597285067873303, + "grad_norm": 0.19644782065692218, + "learning_rate": 2.845390887379706e-06, + "loss": 0.2639, + "step": 254 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.20255200557130154, + "learning_rate": 2.8310861902429176e-06, + "loss": 0.2725, + "step": 255 + }, + { + "epoch": 4.633484162895927, + "grad_norm": 0.19849695081074425, + "learning_rate": 2.816770451257085e-06, + "loss": 0.2685, + "step": 256 + }, + { + "epoch": 4.65158371040724, + "grad_norm": 0.20106804584886076, + "learning_rate": 2.80244414785749e-06, + "loss": 0.2572, + "step": 257 + }, + { + "epoch": 4.669683257918552, + "grad_norm": 0.2021059922332257, + "learning_rate": 2.7881077578317445e-06, + "loss": 0.2924, + "step": 258 + }, + { + "epoch": 4.6877828054298645, + "grad_norm": 0.21055754243512417, + "learning_rate": 2.7737617593038493e-06, + "loss": 0.2714, + "step": 259 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 0.18496100638206028, + "learning_rate": 2.759406630718255e-06, + "loss": 0.2609, + "step": 260 + }, + { + "epoch": 4.723981900452489, + "grad_norm": 0.18437211430441194, + "learning_rate": 2.7450428508239024e-06, + "loss": 0.2662, + "step": 261 + }, + { + "epoch": 4.742081447963801, + "grad_norm": 0.18828985621936872, + "learning_rate": 2.730670898658255e-06, + "loss": 0.2549, + "step": 262 + }, + { + "epoch": 4.760180995475113, + "grad_norm": 0.19972804362365068, + "learning_rate": 2.716291253531329e-06, + "loss": 0.2873, + "step": 263 + }, + { + "epoch": 4.778280542986425, + "grad_norm": 0.2092834898971349, + "learning_rate": 2.7019043950096992e-06, + "loss": 0.2674, + "step": 264 + }, + { + "epoch": 4.796380090497737, + "grad_norm": 0.19131496744019671, + "learning_rate": 2.6875108029005113e-06, + "loss": 0.2724, + "step": 265 + }, + { + "epoch": 4.8144796380090495, + "grad_norm": 0.21255643670178404, + "learning_rate": 2.6731109572354795e-06, + "loss": 0.2684, + "step": 266 + }, + { + "epoch": 4.832579185520362, + "grad_norm": 0.18562764869110326, + "learning_rate": 2.658705338254876e-06, + "loss": 0.271, + "step": 267 + }, + { + "epoch": 4.850678733031674, + "grad_norm": 0.21207337609644833, + "learning_rate": 2.6442944263915153e-06, + "loss": 0.2719, + "step": 268 + }, + { + "epoch": 4.868778280542987, + "grad_norm": 0.2129223213748196, + "learning_rate": 2.6298787022547317e-06, + "loss": 0.2666, + "step": 269 + }, + { + "epoch": 4.886877828054299, + "grad_norm": 0.18692841429903953, + "learning_rate": 2.6154586466143495e-06, + "loss": 0.2755, + "step": 270 + }, + { + "epoch": 4.904977375565611, + "grad_norm": 0.19199436687113453, + "learning_rate": 2.6010347403846508e-06, + "loss": 0.2864, + "step": 271 + }, + { + "epoch": 4.923076923076923, + "grad_norm": 0.19327034490069303, + "learning_rate": 2.5866074646083385e-06, + "loss": 0.2694, + "step": 272 + }, + { + "epoch": 4.9411764705882355, + "grad_norm": 0.26379305562686184, + "learning_rate": 2.572177300440487e-06, + "loss": 0.2597, + "step": 273 + }, + { + "epoch": 4.959276018099548, + "grad_norm": 0.1894366168665776, + "learning_rate": 2.557744729132503e-06, + "loss": 0.2825, + "step": 274 + }, + { + "epoch": 4.97737556561086, + "grad_norm": 0.19519701404452072, + "learning_rate": 2.5433102320160713e-06, + "loss": 0.2893, + "step": 275 + }, + { + "epoch": 4.995475113122172, + "grad_norm": 0.19163121413004777, + "learning_rate": 2.528874290487102e-06, + "loss": 0.2508, + "step": 276 + }, + { + "epoch": 5.013574660633484, + "grad_norm": 0.18512352279959782, + "learning_rate": 2.5144373859896792e-06, + "loss": 0.2589, + "step": 277 + }, + { + "epoch": 5.031674208144796, + "grad_norm": 0.18339390733870273, + "learning_rate": 2.5e-06, + "loss": 0.2621, + "step": 278 + }, + { + "epoch": 5.049773755656108, + "grad_norm": 0.1942547479998011, + "learning_rate": 2.4855626140103216e-06, + "loss": 0.245, + "step": 279 + }, + { + "epoch": 5.067873303167421, + "grad_norm": 0.201133955927992, + "learning_rate": 2.4711257095128987e-06, + "loss": 0.2428, + "step": 280 + }, + { + "epoch": 5.085972850678733, + "grad_norm": 0.19802266448824934, + "learning_rate": 2.4566897679839295e-06, + "loss": 0.2756, + "step": 281 + }, + { + "epoch": 5.104072398190045, + "grad_norm": 0.19714188235491836, + "learning_rate": 2.4422552708674977e-06, + "loss": 0.2626, + "step": 282 + }, + { + "epoch": 5.122171945701357, + "grad_norm": 0.18710363733656865, + "learning_rate": 2.427822699559514e-06, + "loss": 0.2616, + "step": 283 + }, + { + "epoch": 5.14027149321267, + "grad_norm": 0.18029896729988643, + "learning_rate": 2.413392535391663e-06, + "loss": 0.2671, + "step": 284 + }, + { + "epoch": 5.158371040723982, + "grad_norm": 0.19353123935666788, + "learning_rate": 2.3989652596153496e-06, + "loss": 0.2518, + "step": 285 + }, + { + "epoch": 5.176470588235294, + "grad_norm": 0.1999507247304982, + "learning_rate": 2.3845413533856517e-06, + "loss": 0.2691, + "step": 286 + }, + { + "epoch": 5.1945701357466065, + "grad_norm": 0.1802458898092889, + "learning_rate": 2.3701212977452683e-06, + "loss": 0.2662, + "step": 287 + }, + { + "epoch": 5.212669683257919, + "grad_norm": 0.20005237780106283, + "learning_rate": 2.3557055736084847e-06, + "loss": 0.2706, + "step": 288 + }, + { + "epoch": 5.230769230769231, + "grad_norm": 0.20349821320072675, + "learning_rate": 2.3412946617451242e-06, + "loss": 0.2651, + "step": 289 + }, + { + "epoch": 5.248868778280543, + "grad_norm": 0.19275858047883396, + "learning_rate": 2.3268890427645213e-06, + "loss": 0.2809, + "step": 290 + }, + { + "epoch": 5.266968325791855, + "grad_norm": 0.19491454590375834, + "learning_rate": 2.312489197099489e-06, + "loss": 0.242, + "step": 291 + }, + { + "epoch": 5.285067873303167, + "grad_norm": 0.17860701410760396, + "learning_rate": 2.298095604990302e-06, + "loss": 0.252, + "step": 292 + }, + { + "epoch": 5.3031674208144794, + "grad_norm": 0.18166338870243837, + "learning_rate": 2.283708746468672e-06, + "loss": 0.2687, + "step": 293 + }, + { + "epoch": 5.321266968325792, + "grad_norm": 0.20860085100238554, + "learning_rate": 2.269329101341745e-06, + "loss": 0.2749, + "step": 294 + }, + { + "epoch": 5.339366515837104, + "grad_norm": 0.18128543910141529, + "learning_rate": 2.2549571491760985e-06, + "loss": 0.2423, + "step": 295 + }, + { + "epoch": 5.357466063348416, + "grad_norm": 0.23828035104300602, + "learning_rate": 2.2405933692817458e-06, + "loss": 0.2582, + "step": 296 + }, + { + "epoch": 5.375565610859729, + "grad_norm": 0.19867583702537983, + "learning_rate": 2.226238240696151e-06, + "loss": 0.2505, + "step": 297 + }, + { + "epoch": 5.393665158371041, + "grad_norm": 0.2238993077156904, + "learning_rate": 2.2118922421682563e-06, + "loss": 0.2547, + "step": 298 + }, + { + "epoch": 5.411764705882353, + "grad_norm": 0.18659890730168405, + "learning_rate": 2.1975558521425106e-06, + "loss": 0.2541, + "step": 299 + }, + { + "epoch": 5.429864253393665, + "grad_norm": 0.2086208336638683, + "learning_rate": 2.183229548742916e-06, + "loss": 0.2449, + "step": 300 + }, + { + "epoch": 5.447963800904978, + "grad_norm": 0.19744096649329249, + "learning_rate": 2.1689138097570832e-06, + "loss": 0.2529, + "step": 301 + }, + { + "epoch": 5.46606334841629, + "grad_norm": 0.1905137878945102, + "learning_rate": 2.1546091126202955e-06, + "loss": 0.2549, + "step": 302 + }, + { + "epoch": 5.484162895927602, + "grad_norm": 0.18724152511382108, + "learning_rate": 2.1403159343995845e-06, + "loss": 0.2544, + "step": 303 + }, + { + "epoch": 5.502262443438914, + "grad_norm": 0.18137306072412968, + "learning_rate": 2.1260347517778223e-06, + "loss": 0.2472, + "step": 304 + }, + { + "epoch": 5.520361990950226, + "grad_norm": 0.21137486256539126, + "learning_rate": 2.111766041037826e-06, + "loss": 0.2663, + "step": 305 + }, + { + "epoch": 5.538461538461538, + "grad_norm": 0.18969561601900994, + "learning_rate": 2.0975102780464674e-06, + "loss": 0.2654, + "step": 306 + }, + { + "epoch": 5.5565610859728505, + "grad_norm": 0.18687293378459552, + "learning_rate": 2.083267938238808e-06, + "loss": 0.2521, + "step": 307 + }, + { + "epoch": 5.574660633484163, + "grad_norm": 0.18563465250651875, + "learning_rate": 2.0690394966022397e-06, + "loss": 0.2599, + "step": 308 + }, + { + "epoch": 5.592760180995475, + "grad_norm": 0.18961353982721652, + "learning_rate": 2.0548254276606457e-06, + "loss": 0.253, + "step": 309 + }, + { + "epoch": 5.610859728506787, + "grad_norm": 0.19358594701649867, + "learning_rate": 2.040626205458574e-06, + "loss": 0.268, + "step": 310 + }, + { + "epoch": 5.628959276018099, + "grad_norm": 0.18903082550740266, + "learning_rate": 2.02644230354543e-06, + "loss": 0.2794, + "step": 311 + }, + { + "epoch": 5.647058823529412, + "grad_norm": 0.18955280198715693, + "learning_rate": 2.01227419495968e-06, + "loss": 0.2466, + "step": 312 + }, + { + "epoch": 5.665158371040724, + "grad_norm": 0.21673963839382857, + "learning_rate": 1.9981223522130764e-06, + "loss": 0.2646, + "step": 313 + }, + { + "epoch": 5.683257918552036, + "grad_norm": 0.18658355423161882, + "learning_rate": 1.9839872472749016e-06, + "loss": 0.2524, + "step": 314 + }, + { + "epoch": 5.701357466063349, + "grad_norm": 0.18351414151686257, + "learning_rate": 1.9698693515562235e-06, + "loss": 0.2484, + "step": 315 + }, + { + "epoch": 5.719457013574661, + "grad_norm": 0.19521700431845607, + "learning_rate": 1.9557691358941796e-06, + "loss": 0.241, + "step": 316 + }, + { + "epoch": 5.737556561085973, + "grad_norm": 0.18325038007655156, + "learning_rate": 1.941687070536267e-06, + "loss": 0.2834, + "step": 317 + }, + { + "epoch": 5.755656108597285, + "grad_norm": 0.2024434466335083, + "learning_rate": 1.9276236251246655e-06, + "loss": 0.2617, + "step": 318 + }, + { + "epoch": 5.773755656108597, + "grad_norm": 0.19282545684546182, + "learning_rate": 1.913579268680572e-06, + "loss": 0.251, + "step": 319 + }, + { + "epoch": 5.791855203619909, + "grad_norm": 0.1985416405665436, + "learning_rate": 1.8995544695885593e-06, + "loss": 0.2528, + "step": 320 + }, + { + "epoch": 5.8099547511312215, + "grad_norm": 0.19180458814723977, + "learning_rate": 1.8855496955809546e-06, + "loss": 0.2623, + "step": 321 + }, + { + "epoch": 5.828054298642534, + "grad_norm": 0.19714720164607588, + "learning_rate": 1.8715654137222434e-06, + "loss": 0.2603, + "step": 322 + }, + { + "epoch": 5.846153846153846, + "grad_norm": 0.17915913395978303, + "learning_rate": 1.8576020903934872e-06, + "loss": 0.2461, + "step": 323 + }, + { + "epoch": 5.864253393665159, + "grad_norm": 0.1872517611416961, + "learning_rate": 1.8436601912767737e-06, + "loss": 0.2443, + "step": 324 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 0.2088265937495008, + "learning_rate": 1.8297401813396854e-06, + "loss": 0.2606, + "step": 325 + }, + { + "epoch": 5.900452488687783, + "grad_norm": 0.20072778739580704, + "learning_rate": 1.8158425248197931e-06, + "loss": 0.2683, + "step": 326 + }, + { + "epoch": 5.918552036199095, + "grad_norm": 0.20162018571475668, + "learning_rate": 1.801967685209171e-06, + "loss": 0.2674, + "step": 327 + }, + { + "epoch": 5.9366515837104075, + "grad_norm": 0.19962010438752759, + "learning_rate": 1.7881161252389423e-06, + "loss": 0.2518, + "step": 328 + }, + { + "epoch": 5.95475113122172, + "grad_norm": 0.1924016139723619, + "learning_rate": 1.7742883068638447e-06, + "loss": 0.2332, + "step": 329 + }, + { + "epoch": 5.972850678733032, + "grad_norm": 0.19688732396260147, + "learning_rate": 1.7604846912468243e-06, + "loss": 0.2758, + "step": 330 + }, + { + "epoch": 5.990950226244344, + "grad_norm": 0.21367643724553775, + "learning_rate": 1.7467057387436577e-06, + "loss": 0.2722, + "step": 331 + }, + { + "epoch": 6.009049773755656, + "grad_norm": 0.18143686535639186, + "learning_rate": 1.7329519088875959e-06, + "loss": 0.2505, + "step": 332 + }, + { + "epoch": 6.027149321266968, + "grad_norm": 0.19884601017939751, + "learning_rate": 1.719223660374041e-06, + "loss": 0.2406, + "step": 333 + }, + { + "epoch": 6.04524886877828, + "grad_norm": 0.19790104231314157, + "learning_rate": 1.7055214510452462e-06, + "loss": 0.2459, + "step": 334 + }, + { + "epoch": 6.0633484162895925, + "grad_norm": 0.21259902967676111, + "learning_rate": 1.6918457378750511e-06, + "loss": 0.256, + "step": 335 + }, + { + "epoch": 6.081447963800905, + "grad_norm": 0.213170628627418, + "learning_rate": 1.6781969769536356e-06, + "loss": 0.2606, + "step": 336 + }, + { + "epoch": 6.099547511312217, + "grad_norm": 0.18867147575952214, + "learning_rate": 1.6645756234723127e-06, + "loss": 0.2445, + "step": 337 + }, + { + "epoch": 6.117647058823529, + "grad_norm": 0.18694162673757048, + "learning_rate": 1.6509821317083466e-06, + "loss": 0.2346, + "step": 338 + }, + { + "epoch": 6.135746606334842, + "grad_norm": 0.19692152056487713, + "learning_rate": 1.6374169550098052e-06, + "loss": 0.2645, + "step": 339 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 0.18900423846777845, + "learning_rate": 1.6238805457804353e-06, + "loss": 0.2409, + "step": 340 + }, + { + "epoch": 6.171945701357466, + "grad_norm": 0.19281737146761763, + "learning_rate": 1.6103733554645794e-06, + "loss": 0.2511, + "step": 341 + }, + { + "epoch": 6.1900452488687785, + "grad_norm": 0.18576535863582108, + "learning_rate": 1.5968958345321178e-06, + "loss": 0.2562, + "step": 342 + }, + { + "epoch": 6.208144796380091, + "grad_norm": 0.1937616575487202, + "learning_rate": 1.5834484324634453e-06, + "loss": 0.2558, + "step": 343 + }, + { + "epoch": 6.226244343891403, + "grad_norm": 0.20266025820130834, + "learning_rate": 1.5700315977344813e-06, + "loss": 0.2619, + "step": 344 + }, + { + "epoch": 6.244343891402715, + "grad_norm": 0.19244645126328583, + "learning_rate": 1.5566457778017141e-06, + "loss": 0.2357, + "step": 345 + }, + { + "epoch": 6.262443438914027, + "grad_norm": 0.19529354957198908, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.2547, + "step": 346 + }, + { + "epoch": 6.280542986425339, + "grad_norm": 0.1977639183994923, + "learning_rate": 1.529968966964055e-06, + "loss": 0.253, + "step": 347 + }, + { + "epoch": 6.298642533936651, + "grad_norm": 0.19407931113719454, + "learning_rate": 1.5166788657408441e-06, + "loss": 0.2632, + "step": 348 + }, + { + "epoch": 6.316742081447964, + "grad_norm": 0.19011112857943221, + "learning_rate": 1.5034215586475194e-06, + "loss": 0.2647, + "step": 349 + }, + { + "epoch": 6.334841628959276, + "grad_norm": 0.2186558043805355, + "learning_rate": 1.490197487820263e-06, + "loss": 0.2395, + "step": 350 + }, + { + "epoch": 6.352941176470588, + "grad_norm": 0.18367578824384137, + "learning_rate": 1.477007094286813e-06, + "loss": 0.2516, + "step": 351 + }, + { + "epoch": 6.371040723981901, + "grad_norm": 0.18371310311269254, + "learning_rate": 1.4638508179517583e-06, + "loss": 0.2709, + "step": 352 + }, + { + "epoch": 6.389140271493213, + "grad_norm": 0.19750798322441557, + "learning_rate": 1.4507290975818648e-06, + "loss": 0.2497, + "step": 353 + }, + { + "epoch": 6.407239819004525, + "grad_norm": 0.17489326087119314, + "learning_rate": 1.4376423707914462e-06, + "loss": 0.2518, + "step": 354 + }, + { + "epoch": 6.425339366515837, + "grad_norm": 0.19109685375971255, + "learning_rate": 1.4245910740277642e-06, + "loss": 0.2464, + "step": 355 + }, + { + "epoch": 6.4434389140271495, + "grad_norm": 0.18732644035351217, + "learning_rate": 1.4115756425564798e-06, + "loss": 0.2554, + "step": 356 + }, + { + "epoch": 6.461538461538462, + "grad_norm": 0.2042904942174333, + "learning_rate": 1.39859651044713e-06, + "loss": 0.2677, + "step": 357 + }, + { + "epoch": 6.479638009049774, + "grad_norm": 0.20346012347129977, + "learning_rate": 1.3856541105586545e-06, + "loss": 0.2433, + "step": 358 + }, + { + "epoch": 6.497737556561086, + "grad_norm": 0.18096207448536866, + "learning_rate": 1.372748874524961e-06, + "loss": 0.248, + "step": 359 + }, + { + "epoch": 6.515837104072398, + "grad_norm": 0.18311281316650868, + "learning_rate": 1.3598812327405274e-06, + "loss": 0.2433, + "step": 360 + }, + { + "epoch": 6.53393665158371, + "grad_norm": 0.19877832010020277, + "learning_rate": 1.3470516143460494e-06, + "loss": 0.2419, + "step": 361 + }, + { + "epoch": 6.552036199095022, + "grad_norm": 0.19411009696243373, + "learning_rate": 1.3342604472141296e-06, + "loss": 0.2485, + "step": 362 + }, + { + "epoch": 6.570135746606335, + "grad_norm": 0.18775697820498174, + "learning_rate": 1.3215081579350058e-06, + "loss": 0.2514, + "step": 363 + }, + { + "epoch": 6.588235294117647, + "grad_norm": 0.1974485040630947, + "learning_rate": 1.308795171802324e-06, + "loss": 0.2623, + "step": 364 + }, + { + "epoch": 6.606334841628959, + "grad_norm": 0.20195192192796554, + "learning_rate": 1.2961219127989562e-06, + "loss": 0.2523, + "step": 365 + }, + { + "epoch": 6.624434389140271, + "grad_norm": 0.1867586520187508, + "learning_rate": 1.2834888035828597e-06, + "loss": 0.2434, + "step": 366 + }, + { + "epoch": 6.642533936651584, + "grad_norm": 0.19535767032905008, + "learning_rate": 1.2708962654729812e-06, + "loss": 0.2246, + "step": 367 + }, + { + "epoch": 6.660633484162896, + "grad_norm": 0.17951796660986621, + "learning_rate": 1.258344718435205e-06, + "loss": 0.2548, + "step": 368 + }, + { + "epoch": 6.678733031674208, + "grad_norm": 0.1838076745236157, + "learning_rate": 1.2458345810683492e-06, + "loss": 0.2517, + "step": 369 + }, + { + "epoch": 6.6968325791855206, + "grad_norm": 0.1987502629500275, + "learning_rate": 1.233366270590202e-06, + "loss": 0.2373, + "step": 370 + }, + { + "epoch": 6.714932126696833, + "grad_norm": 0.1921556070273265, + "learning_rate": 1.2209402028236114e-06, + "loss": 0.2444, + "step": 371 + }, + { + "epoch": 6.733031674208145, + "grad_norm": 0.18753751737041122, + "learning_rate": 1.2085567921826128e-06, + "loss": 0.2429, + "step": 372 + }, + { + "epoch": 6.751131221719457, + "grad_norm": 0.17267111610692507, + "learning_rate": 1.1962164516586123e-06, + "loss": 0.2408, + "step": 373 + }, + { + "epoch": 6.769230769230769, + "grad_norm": 0.1785397882614972, + "learning_rate": 1.1839195928066101e-06, + "loss": 0.2364, + "step": 374 + }, + { + "epoch": 6.787330316742081, + "grad_norm": 0.1974641160114867, + "learning_rate": 1.171666625731477e-06, + "loss": 0.2502, + "step": 375 + }, + { + "epoch": 6.8054298642533935, + "grad_norm": 0.1936200917713445, + "learning_rate": 1.1594579590742758e-06, + "loss": 0.2495, + "step": 376 + }, + { + "epoch": 6.823529411764706, + "grad_norm": 0.20474767855899034, + "learning_rate": 1.1472939999986338e-06, + "loss": 0.2444, + "step": 377 + }, + { + "epoch": 6.841628959276018, + "grad_norm": 0.21747609011178112, + "learning_rate": 1.1351751541771644e-06, + "loss": 0.2423, + "step": 378 + }, + { + "epoch": 6.859728506787331, + "grad_norm": 0.2024534108733349, + "learning_rate": 1.1231018257779363e-06, + "loss": 0.2641, + "step": 379 + }, + { + "epoch": 6.877828054298643, + "grad_norm": 0.19486585090979294, + "learning_rate": 1.1110744174509952e-06, + "loss": 0.2463, + "step": 380 + }, + { + "epoch": 6.895927601809955, + "grad_norm": 0.17849040364534344, + "learning_rate": 1.0990933303149342e-06, + "loss": 0.2631, + "step": 381 + }, + { + "epoch": 6.914027149321267, + "grad_norm": 0.19002926125887049, + "learning_rate": 1.0871589639435204e-06, + "loss": 0.2481, + "step": 382 + }, + { + "epoch": 6.932126696832579, + "grad_norm": 0.18083592050616315, + "learning_rate": 1.0752717163523623e-06, + "loss": 0.241, + "step": 383 + }, + { + "epoch": 6.950226244343892, + "grad_norm": 0.19496492930938145, + "learning_rate": 1.0634319839856407e-06, + "loss": 0.2527, + "step": 384 + }, + { + "epoch": 6.968325791855204, + "grad_norm": 0.19417699707230154, + "learning_rate": 1.0516401617028863e-06, + "loss": 0.2322, + "step": 385 + }, + { + "epoch": 6.986425339366516, + "grad_norm": 0.18003217148044237, + "learning_rate": 1.0398966427658091e-06, + "loss": 0.2357, + "step": 386 + }, + { + "epoch": 7.004524886877828, + "grad_norm": 0.18246799637458713, + "learning_rate": 1.0282018188251854e-06, + "loss": 0.2568, + "step": 387 + }, + { + "epoch": 7.02262443438914, + "grad_norm": 0.18781508356688068, + "learning_rate": 1.0165560799077952e-06, + "loss": 0.2387, + "step": 388 + }, + { + "epoch": 7.040723981900452, + "grad_norm": 0.17588577341825412, + "learning_rate": 1.004959814403413e-06, + "loss": 0.262, + "step": 389 + }, + { + "epoch": 7.0588235294117645, + "grad_norm": 0.19676767898186667, + "learning_rate": 9.934134090518593e-07, + "loss": 0.2374, + "step": 390 + }, + { + "epoch": 7.076923076923077, + "grad_norm": 0.19345676011938345, + "learning_rate": 9.81917248930096e-07, + "loss": 0.2162, + "step": 391 + }, + { + "epoch": 7.095022624434389, + "grad_norm": 0.2178742299523153, + "learning_rate": 9.704717174393912e-07, + "loss": 0.2495, + "step": 392 + }, + { + "epoch": 7.113122171945701, + "grad_norm": 0.18628703610003405, + "learning_rate": 9.590771962925272e-07, + "loss": 0.2596, + "step": 393 + }, + { + "epoch": 7.131221719457014, + "grad_norm": 0.18042019029734135, + "learning_rate": 9.477340655010717e-07, + "loss": 0.2465, + "step": 394 + }, + { + "epoch": 7.149321266968326, + "grad_norm": 0.1924619560299915, + "learning_rate": 9.36442703362706e-07, + "loss": 0.2395, + "step": 395 + }, + { + "epoch": 7.167420814479638, + "grad_norm": 0.18162050443390207, + "learning_rate": 9.252034864486062e-07, + "loss": 0.2425, + "step": 396 + }, + { + "epoch": 7.1855203619909505, + "grad_norm": 0.1725352404799184, + "learning_rate": 9.140167895908867e-07, + "loss": 0.2257, + "step": 397 + }, + { + "epoch": 7.203619909502263, + "grad_norm": 0.17850869622337964, + "learning_rate": 9.028829858700974e-07, + "loss": 0.2313, + "step": 398 + }, + { + "epoch": 7.221719457013575, + "grad_norm": 0.1896145123389741, + "learning_rate": 8.918024466027822e-07, + "loss": 0.2462, + "step": 399 + }, + { + "epoch": 7.239819004524887, + "grad_norm": 0.1878899849862918, + "learning_rate": 8.807755413290953e-07, + "loss": 0.2502, + "step": 400 + }, + { + "epoch": 7.257918552036199, + "grad_norm": 0.19070595484051797, + "learning_rate": 8.698026378004787e-07, + "loss": 0.2433, + "step": 401 + }, + { + "epoch": 7.276018099547511, + "grad_norm": 0.17359356109341043, + "learning_rate": 8.588841019673938e-07, + "loss": 0.2604, + "step": 402 + }, + { + "epoch": 7.294117647058823, + "grad_norm": 0.20358309076003017, + "learning_rate": 8.480202979671201e-07, + "loss": 0.2327, + "step": 403 + }, + { + "epoch": 7.3122171945701355, + "grad_norm": 0.1835516820557226, + "learning_rate": 8.372115881116089e-07, + "loss": 0.2409, + "step": 404 + }, + { + "epoch": 7.330316742081448, + "grad_norm": 0.18238130931189853, + "learning_rate": 8.264583328754017e-07, + "loss": 0.2393, + "step": 405 + }, + { + "epoch": 7.34841628959276, + "grad_norm": 0.17542601825119047, + "learning_rate": 8.157608908836071e-07, + "loss": 0.2312, + "step": 406 + }, + { + "epoch": 7.366515837104072, + "grad_norm": 0.18257023212771115, + "learning_rate": 8.051196188999425e-07, + "loss": 0.2503, + "step": 407 + }, + { + "epoch": 7.384615384615385, + "grad_norm": 0.1967778738312882, + "learning_rate": 7.945348718148324e-07, + "loss": 0.2419, + "step": 408 + }, + { + "epoch": 7.402714932126697, + "grad_norm": 0.18755379540882788, + "learning_rate": 7.840070026335758e-07, + "loss": 0.2332, + "step": 409 + }, + { + "epoch": 7.420814479638009, + "grad_norm": 0.1911070489817504, + "learning_rate": 7.735363624645712e-07, + "loss": 0.2484, + "step": 410 + }, + { + "epoch": 7.4389140271493215, + "grad_norm": 0.1882055636984676, + "learning_rate": 7.6312330050761e-07, + "loss": 0.2404, + "step": 411 + }, + { + "epoch": 7.457013574660634, + "grad_norm": 0.20190668623593286, + "learning_rate": 7.527681640422265e-07, + "loss": 0.2526, + "step": 412 + }, + { + "epoch": 7.475113122171946, + "grad_norm": 0.1974234563343766, + "learning_rate": 7.424712984161192e-07, + "loss": 0.2688, + "step": 413 + }, + { + "epoch": 7.493212669683258, + "grad_norm": 0.17631879313649837, + "learning_rate": 7.322330470336314e-07, + "loss": 0.2508, + "step": 414 + }, + { + "epoch": 7.51131221719457, + "grad_norm": 0.18714884817468105, + "learning_rate": 7.220537513442999e-07, + "loss": 0.2486, + "step": 415 + }, + { + "epoch": 7.529411764705882, + "grad_norm": 0.19399653562175878, + "learning_rate": 7.11933750831467e-07, + "loss": 0.2618, + "step": 416 + }, + { + "epoch": 7.547511312217194, + "grad_norm": 0.1881943799081702, + "learning_rate": 7.018733830009578e-07, + "loss": 0.2745, + "step": 417 + }, + { + "epoch": 7.5656108597285066, + "grad_norm": 0.19410422423302068, + "learning_rate": 6.91872983369826e-07, + "loss": 0.2575, + "step": 418 + }, + { + "epoch": 7.583710407239819, + "grad_norm": 0.19139908757724744, + "learning_rate": 6.819328854551619e-07, + "loss": 0.2431, + "step": 419 + }, + { + "epoch": 7.601809954751131, + "grad_norm": 0.19407692138480465, + "learning_rate": 6.720534207629731e-07, + "loss": 0.2612, + "step": 420 + }, + { + "epoch": 7.619909502262443, + "grad_norm": 0.19077609905815648, + "learning_rate": 6.622349187771246e-07, + "loss": 0.2363, + "step": 421 + }, + { + "epoch": 7.638009049773755, + "grad_norm": 0.19785590661298624, + "learning_rate": 6.524777069483526e-07, + "loss": 0.2165, + "step": 422 + }, + { + "epoch": 7.656108597285068, + "grad_norm": 0.18170589381863933, + "learning_rate": 6.427821106833429e-07, + "loss": 0.2518, + "step": 423 + }, + { + "epoch": 7.67420814479638, + "grad_norm": 0.19082550580582264, + "learning_rate": 6.33148453333881e-07, + "loss": 0.2497, + "step": 424 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 0.2010429672996338, + "learning_rate": 6.235770561860646e-07, + "loss": 0.2735, + "step": 425 + }, + { + "epoch": 7.710407239819005, + "grad_norm": 0.20631621699435826, + "learning_rate": 6.140682384495902e-07, + "loss": 0.2638, + "step": 426 + }, + { + "epoch": 7.728506787330317, + "grad_norm": 0.18857883979117615, + "learning_rate": 6.046223172471083e-07, + "loss": 0.2511, + "step": 427 + }, + { + "epoch": 7.746606334841629, + "grad_norm": 0.19438107603701976, + "learning_rate": 5.952396076036457e-07, + "loss": 0.2411, + "step": 428 + }, + { + "epoch": 7.764705882352941, + "grad_norm": 0.18435853585586434, + "learning_rate": 5.85920422436099e-07, + "loss": 0.2337, + "step": 429 + }, + { + "epoch": 7.782805429864253, + "grad_norm": 0.19759361458272545, + "learning_rate": 5.766650725428027e-07, + "loss": 0.2304, + "step": 430 + }, + { + "epoch": 7.800904977375565, + "grad_norm": 0.17820786715247264, + "learning_rate": 5.674738665931575e-07, + "loss": 0.2302, + "step": 431 + }, + { + "epoch": 7.819004524886878, + "grad_norm": 0.18336638108510472, + "learning_rate": 5.583471111173414e-07, + "loss": 0.2415, + "step": 432 + }, + { + "epoch": 7.83710407239819, + "grad_norm": 0.1861341218211825, + "learning_rate": 5.492851104960839e-07, + "loss": 0.2347, + "step": 433 + }, + { + "epoch": 7.855203619909502, + "grad_norm": 0.18671520221803245, + "learning_rate": 5.402881669505164e-07, + "loss": 0.2433, + "step": 434 + }, + { + "epoch": 7.873303167420815, + "grad_norm": 0.18470916369258913, + "learning_rate": 5.313565805320914e-07, + "loss": 0.2392, + "step": 435 + }, + { + "epoch": 7.891402714932127, + "grad_norm": 0.18145209957770228, + "learning_rate": 5.224906491125778e-07, + "loss": 0.2491, + "step": 436 + }, + { + "epoch": 7.909502262443439, + "grad_norm": 0.1841316864472566, + "learning_rate": 5.13690668374125e-07, + "loss": 0.2374, + "step": 437 + }, + { + "epoch": 7.927601809954751, + "grad_norm": 0.16991217903448427, + "learning_rate": 5.049569317994013e-07, + "loss": 0.2222, + "step": 438 + }, + { + "epoch": 7.9457013574660635, + "grad_norm": 0.18977292588230824, + "learning_rate": 4.962897306618101e-07, + "loss": 0.2413, + "step": 439 + }, + { + "epoch": 7.963800904977376, + "grad_norm": 0.2034200762540194, + "learning_rate": 4.876893540157692e-07, + "loss": 0.2526, + "step": 440 + }, + { + "epoch": 7.981900452488688, + "grad_norm": 0.18561076018112563, + "learning_rate": 4.791560886870786e-07, + "loss": 0.2505, + "step": 441 + }, + { + "epoch": 8.0, + "grad_norm": 0.1808509581648577, + "learning_rate": 4.70690219263347e-07, + "loss": 0.2397, + "step": 442 + }, + { + "epoch": 8.018099547511312, + "grad_norm": 0.1983786803651098, + "learning_rate": 4.6229202808450587e-07, + "loss": 0.2384, + "step": 443 + }, + { + "epoch": 8.036199095022624, + "grad_norm": 0.19613362321076386, + "learning_rate": 4.539617952333913e-07, + "loss": 0.2396, + "step": 444 + }, + { + "epoch": 8.054298642533936, + "grad_norm": 0.18104571677229486, + "learning_rate": 4.4569979852640444e-07, + "loss": 0.2481, + "step": 445 + }, + { + "epoch": 8.072398190045249, + "grad_norm": 0.18894956462902818, + "learning_rate": 4.3750631350424456e-07, + "loss": 0.2331, + "step": 446 + }, + { + "epoch": 8.09049773755656, + "grad_norm": 0.1856642703057781, + "learning_rate": 4.2938161342272024e-07, + "loss": 0.2398, + "step": 447 + }, + { + "epoch": 8.108597285067873, + "grad_norm": 0.19509279291436657, + "learning_rate": 4.2132596924363666e-07, + "loss": 0.2396, + "step": 448 + }, + { + "epoch": 8.126696832579185, + "grad_norm": 0.18583235612820456, + "learning_rate": 4.1333964962575995e-07, + "loss": 0.2457, + "step": 449 + }, + { + "epoch": 8.144796380090497, + "grad_norm": 0.19414831334323818, + "learning_rate": 4.0542292091585447e-07, + "loss": 0.2557, + "step": 450 + }, + { + "epoch": 8.16289592760181, + "grad_norm": 0.1948999434614907, + "learning_rate": 3.975760471398013e-07, + "loss": 0.2346, + "step": 451 + }, + { + "epoch": 8.180995475113122, + "grad_norm": 0.18223819061827173, + "learning_rate": 3.89799289993795e-07, + "loss": 0.2176, + "step": 452 + }, + { + "epoch": 8.199095022624434, + "grad_norm": 0.19449644313553408, + "learning_rate": 3.8209290883561205e-07, + "loss": 0.247, + "step": 453 + }, + { + "epoch": 8.217194570135746, + "grad_norm": 0.1930258214779179, + "learning_rate": 3.7445716067596506e-07, + "loss": 0.2298, + "step": 454 + }, + { + "epoch": 8.235294117647058, + "grad_norm": 0.18628969575946702, + "learning_rate": 3.668923001699284e-07, + "loss": 0.2385, + "step": 455 + }, + { + "epoch": 8.25339366515837, + "grad_norm": 0.18169941514755078, + "learning_rate": 3.593985796084468e-07, + "loss": 0.2519, + "step": 456 + }, + { + "epoch": 8.271493212669684, + "grad_norm": 0.1837119269988211, + "learning_rate": 3.519762489099207e-07, + "loss": 0.2602, + "step": 457 + }, + { + "epoch": 8.289592760180996, + "grad_norm": 0.1953248401558189, + "learning_rate": 3.446255556118736e-07, + "loss": 0.2567, + "step": 458 + }, + { + "epoch": 8.307692307692308, + "grad_norm": 0.17837155536528138, + "learning_rate": 3.373467448626916e-07, + "loss": 0.2332, + "step": 459 + }, + { + "epoch": 8.32579185520362, + "grad_norm": 0.1879124674324348, + "learning_rate": 3.3014005941345406e-07, + "loss": 0.2357, + "step": 460 + }, + { + "epoch": 8.343891402714933, + "grad_norm": 0.19669583622722217, + "learning_rate": 3.230057396098321e-07, + "loss": 0.2188, + "step": 461 + }, + { + "epoch": 8.361990950226245, + "grad_norm": 0.19436805306375338, + "learning_rate": 3.1594402338407633e-07, + "loss": 0.2595, + "step": 462 + }, + { + "epoch": 8.380090497737557, + "grad_norm": 0.1731035690780127, + "learning_rate": 3.0895514624707994e-07, + "loss": 0.2293, + "step": 463 + }, + { + "epoch": 8.39819004524887, + "grad_norm": 0.19086125694967881, + "learning_rate": 3.020393412805259e-07, + "loss": 0.2305, + "step": 464 + }, + { + "epoch": 8.416289592760181, + "grad_norm": 0.18779406733198983, + "learning_rate": 2.9519683912911267e-07, + "loss": 0.2596, + "step": 465 + }, + { + "epoch": 8.434389140271493, + "grad_norm": 0.18546808477280827, + "learning_rate": 2.8842786799286204e-07, + "loss": 0.2435, + "step": 466 + }, + { + "epoch": 8.452488687782806, + "grad_norm": 0.1896684936541315, + "learning_rate": 2.8173265361950837e-07, + "loss": 0.2386, + "step": 467 + }, + { + "epoch": 8.470588235294118, + "grad_norm": 0.17852233356583405, + "learning_rate": 2.751114192969709e-07, + "loss": 0.231, + "step": 468 + }, + { + "epoch": 8.48868778280543, + "grad_norm": 0.18399543647963754, + "learning_rate": 2.685643858459064e-07, + "loss": 0.2477, + "step": 469 + }, + { + "epoch": 8.506787330316742, + "grad_norm": 0.18054851239071437, + "learning_rate": 2.620917716123444e-07, + "loss": 0.2504, + "step": 470 + }, + { + "epoch": 8.524886877828054, + "grad_norm": 0.19308936407562874, + "learning_rate": 2.55693792460405e-07, + "loss": 0.2545, + "step": 471 + }, + { + "epoch": 8.542986425339366, + "grad_norm": 0.19847333989927235, + "learning_rate": 2.4937066176510123e-07, + "loss": 0.2462, + "step": 472 + }, + { + "epoch": 8.561085972850679, + "grad_norm": 0.20082127472743996, + "learning_rate": 2.4312259040522093e-07, + "loss": 0.2449, + "step": 473 + }, + { + "epoch": 8.57918552036199, + "grad_norm": 0.1843637491284879, + "learning_rate": 2.3694978675629476e-07, + "loss": 0.2422, + "step": 474 + }, + { + "epoch": 8.597285067873303, + "grad_norm": 0.18297796260401825, + "learning_rate": 2.3085245668364897e-07, + "loss": 0.2492, + "step": 475 + }, + { + "epoch": 8.615384615384615, + "grad_norm": 0.18214698681303781, + "learning_rate": 2.2483080353553537e-07, + "loss": 0.2435, + "step": 476 + }, + { + "epoch": 8.633484162895927, + "grad_norm": 0.1932187580551005, + "learning_rate": 2.1888502813635276e-07, + "loss": 0.2471, + "step": 477 + }, + { + "epoch": 8.65158371040724, + "grad_norm": 0.1862160611593082, + "learning_rate": 2.1301532877994747e-07, + "loss": 0.2367, + "step": 478 + }, + { + "epoch": 8.669683257918551, + "grad_norm": 0.1853161129752053, + "learning_rate": 2.0722190122300311e-07, + "loss": 0.2344, + "step": 479 + }, + { + "epoch": 8.687782805429864, + "grad_norm": 0.18442104500106515, + "learning_rate": 2.0150493867850867e-07, + "loss": 0.2394, + "step": 480 + }, + { + "epoch": 8.705882352941176, + "grad_norm": 0.1836768530394557, + "learning_rate": 1.9586463180931658e-07, + "loss": 0.242, + "step": 481 + }, + { + "epoch": 8.723981900452488, + "grad_norm": 0.18225478866484207, + "learning_rate": 1.9030116872178317e-07, + "loss": 0.2571, + "step": 482 + }, + { + "epoch": 8.742081447963802, + "grad_norm": 0.19072644512673081, + "learning_rate": 1.848147349594967e-07, + "loss": 0.2457, + "step": 483 + }, + { + "epoch": 8.760180995475114, + "grad_norm": 0.18223857901348137, + "learning_rate": 1.7940551349708734e-07, + "loss": 0.2351, + "step": 484 + }, + { + "epoch": 8.778280542986426, + "grad_norm": 0.23050285345657223, + "learning_rate": 1.7407368473412678e-07, + "loss": 0.2355, + "step": 485 + }, + { + "epoch": 8.796380090497738, + "grad_norm": 0.18880764635155572, + "learning_rate": 1.6881942648911077e-07, + "loss": 0.2287, + "step": 486 + }, + { + "epoch": 8.81447963800905, + "grad_norm": 0.1830117965150596, + "learning_rate": 1.6364291399352916e-07, + "loss": 0.2447, + "step": 487 + }, + { + "epoch": 8.832579185520363, + "grad_norm": 0.18803881671915923, + "learning_rate": 1.5854431988602175e-07, + "loss": 0.2431, + "step": 488 + }, + { + "epoch": 8.850678733031675, + "grad_norm": 0.18013778534000302, + "learning_rate": 1.5352381420662144e-07, + "loss": 0.2397, + "step": 489 + }, + { + "epoch": 8.868778280542987, + "grad_norm": 0.2003292008190993, + "learning_rate": 1.4858156439108097e-07, + "loss": 0.2291, + "step": 490 + }, + { + "epoch": 8.886877828054299, + "grad_norm": 0.1780640301175049, + "learning_rate": 1.4371773526529216e-07, + "loss": 0.2138, + "step": 491 + }, + { + "epoch": 8.904977375565611, + "grad_norm": 0.1858049004037094, + "learning_rate": 1.3893248903978695e-07, + "loss": 0.2248, + "step": 492 + }, + { + "epoch": 8.923076923076923, + "grad_norm": 0.1870658138910751, + "learning_rate": 1.342259853043279e-07, + "loss": 0.2628, + "step": 493 + }, + { + "epoch": 8.941176470588236, + "grad_norm": 0.1837618747915919, + "learning_rate": 1.2959838102258537e-07, + "loss": 0.2369, + "step": 494 + }, + { + "epoch": 8.959276018099548, + "grad_norm": 0.1825018533847707, + "learning_rate": 1.2504983052690406e-07, + "loss": 0.2371, + "step": 495 + }, + { + "epoch": 8.97737556561086, + "grad_norm": 0.18050085376698732, + "learning_rate": 1.2058048551315455e-07, + "loss": 0.2364, + "step": 496 + }, + { + "epoch": 8.995475113122172, + "grad_norm": 0.17972618184239006, + "learning_rate": 1.1619049503567486e-07, + "loss": 0.2473, + "step": 497 + }, + { + "epoch": 9.013574660633484, + "grad_norm": 0.1830792217516428, + "learning_rate": 1.1188000550230005e-07, + "loss": 0.2352, + "step": 498 + }, + { + "epoch": 9.031674208144796, + "grad_norm": 0.17879744556952354, + "learning_rate": 1.0764916066947795e-07, + "loss": 0.2641, + "step": 499 + }, + { + "epoch": 9.049773755656108, + "grad_norm": 0.18166675174635316, + "learning_rate": 1.0349810163747587e-07, + "loss": 0.2324, + "step": 500 + }, + { + "epoch": 9.06787330316742, + "grad_norm": 0.169470120760864, + "learning_rate": 9.942696684567488e-08, + "loss": 0.2433, + "step": 501 + }, + { + "epoch": 9.085972850678733, + "grad_norm": 0.18110948245786077, + "learning_rate": 9.54358920679524e-08, + "loss": 0.2374, + "step": 502 + }, + { + "epoch": 9.104072398190045, + "grad_norm": 0.18319694777040335, + "learning_rate": 9.152501040815442e-08, + "loss": 0.254, + "step": 503 + }, + { + "epoch": 9.122171945701357, + "grad_norm": 0.1915504535166829, + "learning_rate": 8.769445229565549e-08, + "loss": 0.2325, + "step": 504 + }, + { + "epoch": 9.14027149321267, + "grad_norm": 0.17665350982157665, + "learning_rate": 8.394434548101099e-08, + "loss": 0.2251, + "step": 505 + }, + { + "epoch": 9.158371040723981, + "grad_norm": 0.17427434868030764, + "learning_rate": 8.027481503169371e-08, + "loss": 0.2345, + "step": 506 + }, + { + "epoch": 9.176470588235293, + "grad_norm": 0.1787409835322033, + "learning_rate": 7.66859833279257e-08, + "loss": 0.2389, + "step": 507 + }, + { + "epoch": 9.194570135746606, + "grad_norm": 0.18100016492103735, + "learning_rate": 7.317797005859467e-08, + "loss": 0.2519, + "step": 508 + }, + { + "epoch": 9.212669683257918, + "grad_norm": 0.17821751417293089, + "learning_rate": 6.97508922172635e-08, + "loss": 0.2287, + "step": 509 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 0.20843753394336795, + "learning_rate": 6.640486409826785e-08, + "loss": 0.2444, + "step": 510 + }, + { + "epoch": 9.248868778280542, + "grad_norm": 0.17620285125559612, + "learning_rate": 6.313999729290476e-08, + "loss": 0.2601, + "step": 511 + }, + { + "epoch": 9.266968325791856, + "grad_norm": 0.18672849956899618, + "learning_rate": 5.99564006857109e-08, + "loss": 0.2247, + "step": 512 + }, + { + "epoch": 9.285067873303168, + "grad_norm": 0.18049275292301087, + "learning_rate": 5.685418045083102e-08, + "loss": 0.2511, + "step": 513 + }, + { + "epoch": 9.30316742081448, + "grad_norm": 0.17415682650124498, + "learning_rate": 5.383344004847774e-08, + "loss": 0.2122, + "step": 514 + }, + { + "epoch": 9.321266968325792, + "grad_norm": 0.18556393996618256, + "learning_rate": 5.0894280221479855e-08, + "loss": 0.2294, + "step": 515 + }, + { + "epoch": 9.339366515837105, + "grad_norm": 0.1838789712871206, + "learning_rate": 4.8036798991923925e-08, + "loss": 0.2223, + "step": 516 + }, + { + "epoch": 9.357466063348417, + "grad_norm": 0.19715964425866056, + "learning_rate": 4.526109165788439e-08, + "loss": 0.2381, + "step": 517 + }, + { + "epoch": 9.375565610859729, + "grad_norm": 0.1855854696991745, + "learning_rate": 4.256725079024554e-08, + "loss": 0.2342, + "step": 518 + }, + { + "epoch": 9.393665158371041, + "grad_norm": 0.18048725239749752, + "learning_rate": 3.995536622961399e-08, + "loss": 0.2524, + "step": 519 + }, + { + "epoch": 9.411764705882353, + "grad_norm": 0.19277390554384807, + "learning_rate": 3.7425525083322755e-08, + "loss": 0.2488, + "step": 520 + }, + { + "epoch": 9.429864253393665, + "grad_norm": 0.18825292413778436, + "learning_rate": 3.4977811722526065e-08, + "loss": 0.2263, + "step": 521 + }, + { + "epoch": 9.447963800904978, + "grad_norm": 0.17855716822938666, + "learning_rate": 3.261230777938607e-08, + "loss": 0.2549, + "step": 522 + }, + { + "epoch": 9.46606334841629, + "grad_norm": 0.18271161254439716, + "learning_rate": 3.032909214434887e-08, + "loss": 0.2062, + "step": 523 + }, + { + "epoch": 9.484162895927602, + "grad_norm": 0.1985253721454189, + "learning_rate": 2.8128240963515574e-08, + "loss": 0.2395, + "step": 524 + }, + { + "epoch": 9.502262443438914, + "grad_norm": 0.17838320881574793, + "learning_rate": 2.600982763610094e-08, + "loss": 0.2526, + "step": 525 + }, + { + "epoch": 9.520361990950226, + "grad_norm": 0.18314903249677716, + "learning_rate": 2.3973922811987295e-08, + "loss": 0.2264, + "step": 526 + }, + { + "epoch": 9.538461538461538, + "grad_norm": 0.19459399624660845, + "learning_rate": 2.202059438936588e-08, + "loss": 0.2589, + "step": 527 + }, + { + "epoch": 9.55656108597285, + "grad_norm": 0.1904697767266005, + "learning_rate": 2.0149907512475585e-08, + "loss": 0.2515, + "step": 528 + }, + { + "epoch": 9.574660633484163, + "grad_norm": 0.19523143039480956, + "learning_rate": 1.8361924569427204e-08, + "loss": 0.2525, + "step": 529 + }, + { + "epoch": 9.592760180995475, + "grad_norm": 0.17856164334939217, + "learning_rate": 1.6656705190125078e-08, + "loss": 0.2276, + "step": 530 + }, + { + "epoch": 9.610859728506787, + "grad_norm": 0.18321195430667842, + "learning_rate": 1.5034306244277042e-08, + "loss": 0.2418, + "step": 531 + }, + { + "epoch": 9.628959276018099, + "grad_norm": 0.17787016407899692, + "learning_rate": 1.3494781839498428e-08, + "loss": 0.2342, + "step": 532 + }, + { + "epoch": 9.647058823529411, + "grad_norm": 0.19126723554650038, + "learning_rate": 1.2038183319507957e-08, + "loss": 0.2469, + "step": 533 + }, + { + "epoch": 9.665158371040723, + "grad_norm": 0.1892990291817674, + "learning_rate": 1.0664559262413831e-08, + "loss": 0.2549, + "step": 534 + }, + { + "epoch": 9.683257918552036, + "grad_norm": 0.1793510043645716, + "learning_rate": 9.373955479095587e-09, + "loss": 0.2299, + "step": 535 + }, + { + "epoch": 9.701357466063348, + "grad_norm": 0.18123530213186048, + "learning_rate": 8.166415011675032e-09, + "loss": 0.238, + "step": 536 + }, + { + "epoch": 9.71945701357466, + "grad_norm": 0.19155362352898522, + "learning_rate": 7.041978132081295e-09, + "loss": 0.2505, + "step": 537 + }, + { + "epoch": 9.737556561085974, + "grad_norm": 0.19166375374826475, + "learning_rate": 6.00068234070772e-09, + "loss": 0.2486, + "step": 538 + }, + { + "epoch": 9.755656108597286, + "grad_norm": 0.19649835123518228, + "learning_rate": 5.042562365160375e-09, + "loss": 0.2339, + "step": 539 + }, + { + "epoch": 9.773755656108598, + "grad_norm": 0.17975277337095447, + "learning_rate": 4.167650159100922e-09, + "loss": 0.2386, + "step": 540 + }, + { + "epoch": 9.79185520361991, + "grad_norm": 0.19853806613153782, + "learning_rate": 3.375974901181356e-09, + "loss": 0.2651, + "step": 541 + }, + { + "epoch": 9.809954751131222, + "grad_norm": 0.17741051186070012, + "learning_rate": 2.6675629940689508e-09, + "loss": 0.2345, + "step": 542 + }, + { + "epoch": 9.828054298642535, + "grad_norm": 0.21010956050591995, + "learning_rate": 2.0424380635675202e-09, + "loss": 0.2433, + "step": 543 + }, + { + "epoch": 9.846153846153847, + "grad_norm": 0.1925677170949037, + "learning_rate": 1.5006209578286024e-09, + "loss": 0.2442, + "step": 544 + }, + { + "epoch": 9.864253393665159, + "grad_norm": 0.18453673405456344, + "learning_rate": 1.0421297466570169e-09, + "loss": 0.2302, + "step": 545 + }, + { + "epoch": 9.882352941176471, + "grad_norm": 0.20600991870093216, + "learning_rate": 6.669797209069018e-10, + "loss": 0.2338, + "step": 546 + }, + { + "epoch": 9.900452488687783, + "grad_norm": 0.19783049619088353, + "learning_rate": 3.7518339197267774e-10, + "loss": 0.2584, + "step": 547 + }, + { + "epoch": 9.918552036199095, + "grad_norm": 0.2143160581746704, + "learning_rate": 1.6675049137188094e-10, + "loss": 0.2481, + "step": 548 + }, + { + "epoch": 9.936651583710407, + "grad_norm": 0.18644523978656508, + "learning_rate": 4.1687970420423165e-11, + "loss": 0.2456, + "step": 549 + }, + { + "epoch": 9.95475113122172, + "grad_norm": 0.19744141932163012, + "learning_rate": 0.0, + "loss": 0.2503, + "step": 550 + }, + { + "epoch": 9.95475113122172, + "step": 550, + "total_flos": 9.907464757911224e+17, + "train_loss": 0.2748682842471383, + "train_runtime": 89439.6385, + "train_samples_per_second": 0.395, + "train_steps_per_second": 0.006 + } + ], + "logging_steps": 1.0, + "max_steps": 550, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.907464757911224e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}