{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 513, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005847953216374269, "grad_norm": 17.388755230326108, "learning_rate": 3.846153846153847e-07, "loss": 2.1955, "step": 1 }, { "epoch": 0.011695906432748537, "grad_norm": 17.472331789941457, "learning_rate": 7.692307692307694e-07, "loss": 2.3172, "step": 2 }, { "epoch": 0.017543859649122806, "grad_norm": 16.918399664716798, "learning_rate": 1.153846153846154e-06, "loss": 2.369, "step": 3 }, { "epoch": 0.023391812865497075, "grad_norm": 18.225976223813337, "learning_rate": 1.5384615384615387e-06, "loss": 2.3503, "step": 4 }, { "epoch": 0.029239766081871343, "grad_norm": 17.290350784392576, "learning_rate": 1.9230769230769234e-06, "loss": 2.206, "step": 5 }, { "epoch": 0.03508771929824561, "grad_norm": 17.713013786969835, "learning_rate": 2.307692307692308e-06, "loss": 2.1757, "step": 6 }, { "epoch": 0.04093567251461988, "grad_norm": 16.957525841766792, "learning_rate": 2.6923076923076923e-06, "loss": 2.263, "step": 7 }, { "epoch": 0.04678362573099415, "grad_norm": 15.274517120371355, "learning_rate": 3.0769230769230774e-06, "loss": 2.0823, "step": 8 }, { "epoch": 0.05263157894736842, "grad_norm": 16.17819052550626, "learning_rate": 3.4615384615384617e-06, "loss": 2.1592, "step": 9 }, { "epoch": 0.05847953216374269, "grad_norm": 14.878611384619472, "learning_rate": 3.846153846153847e-06, "loss": 2.0778, "step": 10 }, { "epoch": 0.06432748538011696, "grad_norm": 11.028350456358304, "learning_rate": 4.230769230769231e-06, "loss": 1.7525, "step": 11 }, { "epoch": 0.07017543859649122, "grad_norm": 10.604424239205104, "learning_rate": 4.615384615384616e-06, "loss": 1.8686, "step": 12 }, { "epoch": 0.07602339181286549, "grad_norm": 9.83197777453571, "learning_rate": 5e-06, "loss": 1.6995, "step": 13 }, { "epoch": 0.08187134502923976, "grad_norm": 9.28612318800235, "learning_rate": 5.384615384615385e-06, "loss": 1.7511, "step": 14 }, { "epoch": 0.08771929824561403, "grad_norm": 3.6938751947460333, "learning_rate": 5.769230769230769e-06, "loss": 1.4354, "step": 15 }, { "epoch": 0.0935672514619883, "grad_norm": 3.64251741494419, "learning_rate": 6.153846153846155e-06, "loss": 1.4634, "step": 16 }, { "epoch": 0.09941520467836257, "grad_norm": 3.249845410537068, "learning_rate": 6.538461538461539e-06, "loss": 1.4062, "step": 17 }, { "epoch": 0.10526315789473684, "grad_norm": 3.0197933728284476, "learning_rate": 6.923076923076923e-06, "loss": 1.4268, "step": 18 }, { "epoch": 0.1111111111111111, "grad_norm": 2.5032405922437087, "learning_rate": 7.307692307692308e-06, "loss": 1.3546, "step": 19 }, { "epoch": 0.11695906432748537, "grad_norm": 1.6914458221673982, "learning_rate": 7.692307692307694e-06, "loss": 1.2072, "step": 20 }, { "epoch": 0.12280701754385964, "grad_norm": 1.633209041430983, "learning_rate": 8.076923076923077e-06, "loss": 1.1706, "step": 21 }, { "epoch": 0.1286549707602339, "grad_norm": 1.62333800604462, "learning_rate": 8.461538461538462e-06, "loss": 1.2573, "step": 22 }, { "epoch": 0.13450292397660818, "grad_norm": 1.2572597783261759, "learning_rate": 8.846153846153847e-06, "loss": 1.1549, "step": 23 }, { "epoch": 0.14035087719298245, "grad_norm": 1.0892793835477907, "learning_rate": 9.230769230769232e-06, "loss": 1.1367, "step": 24 }, { "epoch": 0.14619883040935672, "grad_norm": 0.9726760103698124, "learning_rate": 9.615384615384616e-06, "loss": 1.1664, "step": 25 }, { "epoch": 0.15204678362573099, "grad_norm": 0.8399835297943901, "learning_rate": 1e-05, "loss": 1.0771, "step": 26 }, { "epoch": 0.15789473684210525, "grad_norm": 0.756344388475637, "learning_rate": 1.0384615384615386e-05, "loss": 1.0361, "step": 27 }, { "epoch": 0.16374269005847952, "grad_norm": 0.6916203141074345, "learning_rate": 1.076923076923077e-05, "loss": 1.0276, "step": 28 }, { "epoch": 0.1695906432748538, "grad_norm": 0.6795075377629257, "learning_rate": 1.1153846153846154e-05, "loss": 1.0305, "step": 29 }, { "epoch": 0.17543859649122806, "grad_norm": 0.7397958603300506, "learning_rate": 1.1538461538461538e-05, "loss": 1.036, "step": 30 }, { "epoch": 0.18128654970760233, "grad_norm": 0.5914063886870811, "learning_rate": 1.1923076923076925e-05, "loss": 1.0643, "step": 31 }, { "epoch": 0.1871345029239766, "grad_norm": 0.558807526586334, "learning_rate": 1.230769230769231e-05, "loss": 0.9457, "step": 32 }, { "epoch": 0.19298245614035087, "grad_norm": 0.4962345963320037, "learning_rate": 1.2692307692307693e-05, "loss": 0.9556, "step": 33 }, { "epoch": 0.19883040935672514, "grad_norm": 0.5368004540999115, "learning_rate": 1.3076923076923078e-05, "loss": 1.0031, "step": 34 }, { "epoch": 0.2046783625730994, "grad_norm": 0.5193693046254093, "learning_rate": 1.3461538461538463e-05, "loss": 0.937, "step": 35 }, { "epoch": 0.21052631578947367, "grad_norm": 0.42294351955291465, "learning_rate": 1.3846153846153847e-05, "loss": 0.8972, "step": 36 }, { "epoch": 0.21637426900584794, "grad_norm": 0.39791430214156615, "learning_rate": 1.4230769230769232e-05, "loss": 0.9484, "step": 37 }, { "epoch": 0.2222222222222222, "grad_norm": 0.42681451896746464, "learning_rate": 1.4615384615384615e-05, "loss": 0.942, "step": 38 }, { "epoch": 0.22807017543859648, "grad_norm": 0.39243989614880825, "learning_rate": 1.5000000000000002e-05, "loss": 0.9379, "step": 39 }, { "epoch": 0.23391812865497075, "grad_norm": 0.4195184915021303, "learning_rate": 1.5384615384615387e-05, "loss": 0.9327, "step": 40 }, { "epoch": 0.23976608187134502, "grad_norm": 0.3544937192321327, "learning_rate": 1.576923076923077e-05, "loss": 0.851, "step": 41 }, { "epoch": 0.24561403508771928, "grad_norm": 0.3416373732580841, "learning_rate": 1.6153846153846154e-05, "loss": 0.8644, "step": 42 }, { "epoch": 0.25146198830409355, "grad_norm": 0.4128427286910145, "learning_rate": 1.653846153846154e-05, "loss": 0.9002, "step": 43 }, { "epoch": 0.2573099415204678, "grad_norm": 0.4386903858466522, "learning_rate": 1.6923076923076924e-05, "loss": 0.8995, "step": 44 }, { "epoch": 0.2631578947368421, "grad_norm": 0.3894766430305266, "learning_rate": 1.730769230769231e-05, "loss": 0.8796, "step": 45 }, { "epoch": 0.26900584795321636, "grad_norm": 0.33237410703928805, "learning_rate": 1.7692307692307694e-05, "loss": 0.887, "step": 46 }, { "epoch": 0.27485380116959063, "grad_norm": 0.3287665841977238, "learning_rate": 1.807692307692308e-05, "loss": 0.8444, "step": 47 }, { "epoch": 0.2807017543859649, "grad_norm": 0.3109160417844228, "learning_rate": 1.8461538461538465e-05, "loss": 0.8708, "step": 48 }, { "epoch": 0.28654970760233917, "grad_norm": 0.30795401416756046, "learning_rate": 1.8846153846153846e-05, "loss": 0.8434, "step": 49 }, { "epoch": 0.29239766081871343, "grad_norm": 0.3549208935855604, "learning_rate": 1.923076923076923e-05, "loss": 0.8526, "step": 50 }, { "epoch": 0.2982456140350877, "grad_norm": 0.2755256325053317, "learning_rate": 1.9615384615384617e-05, "loss": 0.7736, "step": 51 }, { "epoch": 0.30409356725146197, "grad_norm": 0.43817461634852256, "learning_rate": 2e-05, "loss": 0.8534, "step": 52 }, { "epoch": 0.30994152046783624, "grad_norm": 0.3377814673600554, "learning_rate": 1.995661605206074e-05, "loss": 0.8019, "step": 53 }, { "epoch": 0.3157894736842105, "grad_norm": 0.34154032226288855, "learning_rate": 1.9913232104121476e-05, "loss": 0.8458, "step": 54 }, { "epoch": 0.3216374269005848, "grad_norm": 0.33647765891218867, "learning_rate": 1.9869848156182215e-05, "loss": 0.8527, "step": 55 }, { "epoch": 0.32748538011695905, "grad_norm": 0.2933887985818341, "learning_rate": 1.9826464208242954e-05, "loss": 0.8182, "step": 56 }, { "epoch": 0.3333333333333333, "grad_norm": 0.28265837293209917, "learning_rate": 1.978308026030369e-05, "loss": 0.8355, "step": 57 }, { "epoch": 0.3391812865497076, "grad_norm": 0.2632405163872988, "learning_rate": 1.973969631236443e-05, "loss": 0.7543, "step": 58 }, { "epoch": 0.34502923976608185, "grad_norm": 0.38062672853122476, "learning_rate": 1.9696312364425164e-05, "loss": 0.8183, "step": 59 }, { "epoch": 0.3508771929824561, "grad_norm": 0.26245608696685946, "learning_rate": 1.96529284164859e-05, "loss": 0.8004, "step": 60 }, { "epoch": 0.3567251461988304, "grad_norm": 0.32799401692804, "learning_rate": 1.960954446854664e-05, "loss": 0.8405, "step": 61 }, { "epoch": 0.36257309941520466, "grad_norm": 0.29066553024128605, "learning_rate": 1.9566160520607378e-05, "loss": 0.8492, "step": 62 }, { "epoch": 0.3684210526315789, "grad_norm": 0.28501467209616904, "learning_rate": 1.9522776572668113e-05, "loss": 0.8207, "step": 63 }, { "epoch": 0.3742690058479532, "grad_norm": 0.2525036458551552, "learning_rate": 1.9479392624728852e-05, "loss": 0.779, "step": 64 }, { "epoch": 0.38011695906432746, "grad_norm": 0.2920718950928194, "learning_rate": 1.9436008676789588e-05, "loss": 0.7937, "step": 65 }, { "epoch": 0.38596491228070173, "grad_norm": 0.27183550316859734, "learning_rate": 1.9392624728850327e-05, "loss": 0.8344, "step": 66 }, { "epoch": 0.391812865497076, "grad_norm": 0.272325024687968, "learning_rate": 1.9349240780911066e-05, "loss": 0.7577, "step": 67 }, { "epoch": 0.39766081871345027, "grad_norm": 0.2761663772793096, "learning_rate": 1.93058568329718e-05, "loss": 0.8253, "step": 68 }, { "epoch": 0.40350877192982454, "grad_norm": 0.3577604398976665, "learning_rate": 1.926247288503254e-05, "loss": 0.871, "step": 69 }, { "epoch": 0.4093567251461988, "grad_norm": 0.3054954243342987, "learning_rate": 1.921908893709328e-05, "loss": 0.8485, "step": 70 }, { "epoch": 0.4152046783625731, "grad_norm": 0.2295446772431491, "learning_rate": 1.9175704989154015e-05, "loss": 0.775, "step": 71 }, { "epoch": 0.42105263157894735, "grad_norm": 0.27441930221043814, "learning_rate": 1.9132321041214754e-05, "loss": 0.7984, "step": 72 }, { "epoch": 0.4269005847953216, "grad_norm": 0.25560502683198316, "learning_rate": 1.908893709327549e-05, "loss": 0.8089, "step": 73 }, { "epoch": 0.4327485380116959, "grad_norm": 0.27391446302846595, "learning_rate": 1.9045553145336228e-05, "loss": 0.8194, "step": 74 }, { "epoch": 0.43859649122807015, "grad_norm": 0.25049008602661516, "learning_rate": 1.9002169197396964e-05, "loss": 0.7685, "step": 75 }, { "epoch": 0.4444444444444444, "grad_norm": 0.32703190034733925, "learning_rate": 1.8958785249457703e-05, "loss": 0.8045, "step": 76 }, { "epoch": 0.4502923976608187, "grad_norm": 0.2461722936867296, "learning_rate": 1.8915401301518438e-05, "loss": 0.7747, "step": 77 }, { "epoch": 0.45614035087719296, "grad_norm": 0.3049860315464052, "learning_rate": 1.8872017353579177e-05, "loss": 0.8265, "step": 78 }, { "epoch": 0.4619883040935672, "grad_norm": 0.2769624138638705, "learning_rate": 1.8828633405639916e-05, "loss": 0.8186, "step": 79 }, { "epoch": 0.4678362573099415, "grad_norm": 0.22632052204690653, "learning_rate": 1.878524945770065e-05, "loss": 0.7426, "step": 80 }, { "epoch": 0.47368421052631576, "grad_norm": 0.2538308819987603, "learning_rate": 1.874186550976139e-05, "loss": 0.7849, "step": 81 }, { "epoch": 0.47953216374269003, "grad_norm": 0.3146181235378422, "learning_rate": 1.869848156182213e-05, "loss": 0.8087, "step": 82 }, { "epoch": 0.4853801169590643, "grad_norm": 0.22831617588223724, "learning_rate": 1.8655097613882865e-05, "loss": 0.7431, "step": 83 }, { "epoch": 0.49122807017543857, "grad_norm": 0.24832072861713958, "learning_rate": 1.8611713665943604e-05, "loss": 0.7807, "step": 84 }, { "epoch": 0.49707602339181284, "grad_norm": 0.28945761508471823, "learning_rate": 1.856832971800434e-05, "loss": 0.8025, "step": 85 }, { "epoch": 0.5029239766081871, "grad_norm": 0.24882286573309492, "learning_rate": 1.852494577006508e-05, "loss": 0.8041, "step": 86 }, { "epoch": 0.5087719298245614, "grad_norm": 0.2569507918826724, "learning_rate": 1.8481561822125814e-05, "loss": 0.8097, "step": 87 }, { "epoch": 0.5146198830409356, "grad_norm": 0.2660930480772777, "learning_rate": 1.8438177874186553e-05, "loss": 0.7199, "step": 88 }, { "epoch": 0.52046783625731, "grad_norm": 0.26945118834678633, "learning_rate": 1.839479392624729e-05, "loss": 0.8035, "step": 89 }, { "epoch": 0.5263157894736842, "grad_norm": 0.2748667946921001, "learning_rate": 1.8351409978308028e-05, "loss": 0.8062, "step": 90 }, { "epoch": 0.5321637426900585, "grad_norm": 0.2363367636075127, "learning_rate": 1.8308026030368763e-05, "loss": 0.7497, "step": 91 }, { "epoch": 0.5380116959064327, "grad_norm": 0.2194408996520716, "learning_rate": 1.8264642082429502e-05, "loss": 0.7582, "step": 92 }, { "epoch": 0.543859649122807, "grad_norm": 0.2479217006944137, "learning_rate": 1.822125813449024e-05, "loss": 0.7816, "step": 93 }, { "epoch": 0.5497076023391813, "grad_norm": 0.24365954457591307, "learning_rate": 1.8177874186550977e-05, "loss": 0.7951, "step": 94 }, { "epoch": 0.5555555555555556, "grad_norm": 0.2480572301895391, "learning_rate": 1.8134490238611715e-05, "loss": 0.7808, "step": 95 }, { "epoch": 0.5614035087719298, "grad_norm": 0.24464048645651124, "learning_rate": 1.8091106290672454e-05, "loss": 0.7153, "step": 96 }, { "epoch": 0.5672514619883041, "grad_norm": 0.23776979402481216, "learning_rate": 1.804772234273319e-05, "loss": 0.7168, "step": 97 }, { "epoch": 0.5730994152046783, "grad_norm": 0.2779826898090206, "learning_rate": 1.800433839479393e-05, "loss": 0.784, "step": 98 }, { "epoch": 0.5789473684210527, "grad_norm": 0.2625471662464305, "learning_rate": 1.7960954446854664e-05, "loss": 0.7575, "step": 99 }, { "epoch": 0.5847953216374269, "grad_norm": 0.24973722791738373, "learning_rate": 1.7917570498915403e-05, "loss": 0.7604, "step": 100 }, { "epoch": 0.5906432748538012, "grad_norm": 0.24882129597326091, "learning_rate": 1.787418655097614e-05, "loss": 0.7571, "step": 101 }, { "epoch": 0.5964912280701754, "grad_norm": 0.2490465646513338, "learning_rate": 1.7830802603036878e-05, "loss": 0.7728, "step": 102 }, { "epoch": 0.6023391812865497, "grad_norm": 0.2803127473261744, "learning_rate": 1.7787418655097614e-05, "loss": 0.7486, "step": 103 }, { "epoch": 0.6081871345029239, "grad_norm": 0.30875931205277196, "learning_rate": 1.7744034707158352e-05, "loss": 0.7747, "step": 104 }, { "epoch": 0.6140350877192983, "grad_norm": 0.249801739956383, "learning_rate": 1.770065075921909e-05, "loss": 0.7719, "step": 105 }, { "epoch": 0.6198830409356725, "grad_norm": 0.2493900745685089, "learning_rate": 1.7657266811279827e-05, "loss": 0.7517, "step": 106 }, { "epoch": 0.6257309941520468, "grad_norm": 0.2217608176730444, "learning_rate": 1.7613882863340566e-05, "loss": 0.7385, "step": 107 }, { "epoch": 0.631578947368421, "grad_norm": 0.23151529808146598, "learning_rate": 1.7570498915401305e-05, "loss": 0.7092, "step": 108 }, { "epoch": 0.6374269005847953, "grad_norm": 0.2648606357036367, "learning_rate": 1.752711496746204e-05, "loss": 0.7748, "step": 109 }, { "epoch": 0.6432748538011696, "grad_norm": 0.22637593754873542, "learning_rate": 1.748373101952278e-05, "loss": 0.7594, "step": 110 }, { "epoch": 0.6491228070175439, "grad_norm": 0.24569329004133555, "learning_rate": 1.7440347071583515e-05, "loss": 0.7662, "step": 111 }, { "epoch": 0.6549707602339181, "grad_norm": 0.23086082605618571, "learning_rate": 1.7396963123644254e-05, "loss": 0.7291, "step": 112 }, { "epoch": 0.6608187134502924, "grad_norm": 0.23164513757355204, "learning_rate": 1.735357917570499e-05, "loss": 0.761, "step": 113 }, { "epoch": 0.6666666666666666, "grad_norm": 0.2341951309434963, "learning_rate": 1.731019522776573e-05, "loss": 0.7707, "step": 114 }, { "epoch": 0.672514619883041, "grad_norm": 0.2294815579241083, "learning_rate": 1.7266811279826464e-05, "loss": 0.7307, "step": 115 }, { "epoch": 0.6783625730994152, "grad_norm": 0.2425767445634441, "learning_rate": 1.7223427331887203e-05, "loss": 0.7573, "step": 116 }, { "epoch": 0.6842105263157895, "grad_norm": 0.22967591410278537, "learning_rate": 1.718004338394794e-05, "loss": 0.7513, "step": 117 }, { "epoch": 0.6900584795321637, "grad_norm": 0.26903092877754314, "learning_rate": 1.7136659436008677e-05, "loss": 0.7858, "step": 118 }, { "epoch": 0.695906432748538, "grad_norm": 0.2571480378610959, "learning_rate": 1.7093275488069416e-05, "loss": 0.7736, "step": 119 }, { "epoch": 0.7017543859649122, "grad_norm": 0.23273043019862788, "learning_rate": 1.7049891540130152e-05, "loss": 0.7669, "step": 120 }, { "epoch": 0.7076023391812866, "grad_norm": 0.23314091686361454, "learning_rate": 1.700650759219089e-05, "loss": 0.7699, "step": 121 }, { "epoch": 0.7134502923976608, "grad_norm": 0.26268224212689045, "learning_rate": 1.696312364425163e-05, "loss": 0.7832, "step": 122 }, { "epoch": 0.7192982456140351, "grad_norm": 0.26423904380170976, "learning_rate": 1.6919739696312365e-05, "loss": 0.7595, "step": 123 }, { "epoch": 0.7251461988304093, "grad_norm": 0.21495414583172803, "learning_rate": 1.6876355748373104e-05, "loss": 0.7106, "step": 124 }, { "epoch": 0.7309941520467836, "grad_norm": 0.2111254963997244, "learning_rate": 1.6832971800433843e-05, "loss": 0.7455, "step": 125 }, { "epoch": 0.7368421052631579, "grad_norm": 0.2156942153910527, "learning_rate": 1.678958785249458e-05, "loss": 0.69, "step": 126 }, { "epoch": 0.7426900584795322, "grad_norm": 0.20057578031019538, "learning_rate": 1.6746203904555314e-05, "loss": 0.7253, "step": 127 }, { "epoch": 0.7485380116959064, "grad_norm": 0.258323958272931, "learning_rate": 1.6702819956616053e-05, "loss": 0.7156, "step": 128 }, { "epoch": 0.7543859649122807, "grad_norm": 0.23301112011268071, "learning_rate": 1.665943600867679e-05, "loss": 0.7562, "step": 129 }, { "epoch": 0.7602339181286549, "grad_norm": 0.27354281471105707, "learning_rate": 1.6616052060737528e-05, "loss": 0.7494, "step": 130 }, { "epoch": 0.7660818713450293, "grad_norm": 0.25737706341844985, "learning_rate": 1.6572668112798267e-05, "loss": 0.7471, "step": 131 }, { "epoch": 0.7719298245614035, "grad_norm": 0.2112391813708006, "learning_rate": 1.6529284164859002e-05, "loss": 0.7296, "step": 132 }, { "epoch": 0.7777777777777778, "grad_norm": 0.2066541279425585, "learning_rate": 1.648590021691974e-05, "loss": 0.7427, "step": 133 }, { "epoch": 0.783625730994152, "grad_norm": 0.21492978047244818, "learning_rate": 1.644251626898048e-05, "loss": 0.6956, "step": 134 }, { "epoch": 0.7894736842105263, "grad_norm": 0.22539724372329056, "learning_rate": 1.6399132321041216e-05, "loss": 0.7358, "step": 135 }, { "epoch": 0.7953216374269005, "grad_norm": 0.223824231061946, "learning_rate": 1.6355748373101955e-05, "loss": 0.747, "step": 136 }, { "epoch": 0.8011695906432749, "grad_norm": 0.22433634692844312, "learning_rate": 1.631236442516269e-05, "loss": 0.7478, "step": 137 }, { "epoch": 0.8070175438596491, "grad_norm": 0.2355525364186235, "learning_rate": 1.626898047722343e-05, "loss": 0.7539, "step": 138 }, { "epoch": 0.8128654970760234, "grad_norm": 0.22774103617994296, "learning_rate": 1.6225596529284168e-05, "loss": 0.7227, "step": 139 }, { "epoch": 0.8187134502923976, "grad_norm": 0.24837995707152566, "learning_rate": 1.6182212581344904e-05, "loss": 0.7048, "step": 140 }, { "epoch": 0.8245614035087719, "grad_norm": 0.2165941656087455, "learning_rate": 1.613882863340564e-05, "loss": 0.7095, "step": 141 }, { "epoch": 0.8304093567251462, "grad_norm": 0.24496476577766357, "learning_rate": 1.609544468546638e-05, "loss": 0.731, "step": 142 }, { "epoch": 0.8362573099415205, "grad_norm": 0.2275760050109454, "learning_rate": 1.6052060737527114e-05, "loss": 0.7159, "step": 143 }, { "epoch": 0.8421052631578947, "grad_norm": 0.203518916790755, "learning_rate": 1.6008676789587853e-05, "loss": 0.6462, "step": 144 }, { "epoch": 0.847953216374269, "grad_norm": 0.24268384602078139, "learning_rate": 1.5965292841648592e-05, "loss": 0.7206, "step": 145 }, { "epoch": 0.8538011695906432, "grad_norm": 0.2911481588164572, "learning_rate": 1.5921908893709327e-05, "loss": 0.766, "step": 146 }, { "epoch": 0.8596491228070176, "grad_norm": 0.25277324694147335, "learning_rate": 1.5878524945770066e-05, "loss": 0.7501, "step": 147 }, { "epoch": 0.8654970760233918, "grad_norm": 0.2372457450088363, "learning_rate": 1.5835140997830805e-05, "loss": 0.712, "step": 148 }, { "epoch": 0.8713450292397661, "grad_norm": 0.19877008506291952, "learning_rate": 1.579175704989154e-05, "loss": 0.7146, "step": 149 }, { "epoch": 0.8771929824561403, "grad_norm": 0.27732708769815756, "learning_rate": 1.574837310195228e-05, "loss": 0.7347, "step": 150 }, { "epoch": 0.8830409356725146, "grad_norm": 0.20303134209612006, "learning_rate": 1.570498915401302e-05, "loss": 0.71, "step": 151 }, { "epoch": 0.8888888888888888, "grad_norm": 0.23703380426454326, "learning_rate": 1.5661605206073754e-05, "loss": 0.6915, "step": 152 }, { "epoch": 0.8947368421052632, "grad_norm": 0.23526601753982804, "learning_rate": 1.5618221258134493e-05, "loss": 0.72, "step": 153 }, { "epoch": 0.9005847953216374, "grad_norm": 0.2408627140057496, "learning_rate": 1.557483731019523e-05, "loss": 0.7206, "step": 154 }, { "epoch": 0.9064327485380117, "grad_norm": 0.22070261442759123, "learning_rate": 1.5531453362255964e-05, "loss": 0.7019, "step": 155 }, { "epoch": 0.9122807017543859, "grad_norm": 0.236776997470983, "learning_rate": 1.5488069414316703e-05, "loss": 0.7314, "step": 156 }, { "epoch": 0.9181286549707602, "grad_norm": 0.25431877096559957, "learning_rate": 1.5444685466377442e-05, "loss": 0.7663, "step": 157 }, { "epoch": 0.9239766081871345, "grad_norm": 0.2934790109300597, "learning_rate": 1.5401301518438178e-05, "loss": 0.7388, "step": 158 }, { "epoch": 0.9298245614035088, "grad_norm": 0.2287254855752223, "learning_rate": 1.5357917570498917e-05, "loss": 0.703, "step": 159 }, { "epoch": 0.935672514619883, "grad_norm": 0.21116594695108679, "learning_rate": 1.5314533622559656e-05, "loss": 0.7228, "step": 160 }, { "epoch": 0.9415204678362573, "grad_norm": 0.25825565901072856, "learning_rate": 1.527114967462039e-05, "loss": 0.7791, "step": 161 }, { "epoch": 0.9473684210526315, "grad_norm": 0.23103746722781796, "learning_rate": 1.522776572668113e-05, "loss": 0.6951, "step": 162 }, { "epoch": 0.9532163742690059, "grad_norm": 0.2580198439201409, "learning_rate": 1.5184381778741866e-05, "loss": 0.741, "step": 163 }, { "epoch": 0.9590643274853801, "grad_norm": 0.2974306573786225, "learning_rate": 1.5140997830802605e-05, "loss": 0.725, "step": 164 }, { "epoch": 0.9649122807017544, "grad_norm": 0.26570078456731205, "learning_rate": 1.5097613882863342e-05, "loss": 0.7467, "step": 165 }, { "epoch": 0.9707602339181286, "grad_norm": 0.4533476269871839, "learning_rate": 1.5054229934924078e-05, "loss": 0.6971, "step": 166 }, { "epoch": 0.9766081871345029, "grad_norm": 0.23895703831919585, "learning_rate": 1.5010845986984816e-05, "loss": 0.7341, "step": 167 }, { "epoch": 0.9824561403508771, "grad_norm": 0.20339364928582415, "learning_rate": 1.4967462039045555e-05, "loss": 0.6757, "step": 168 }, { "epoch": 0.9883040935672515, "grad_norm": 0.23049708494261534, "learning_rate": 1.4924078091106291e-05, "loss": 0.7236, "step": 169 }, { "epoch": 0.9941520467836257, "grad_norm": 0.2026976413223512, "learning_rate": 1.488069414316703e-05, "loss": 0.7006, "step": 170 }, { "epoch": 1.0, "grad_norm": 0.22355532686958146, "learning_rate": 1.4837310195227767e-05, "loss": 0.701, "step": 171 }, { "epoch": 1.0058479532163742, "grad_norm": 0.20282760221622007, "learning_rate": 1.4793926247288504e-05, "loss": 0.7168, "step": 172 }, { "epoch": 1.0116959064327484, "grad_norm": 0.19949275591479185, "learning_rate": 1.4750542299349242e-05, "loss": 0.6984, "step": 173 }, { "epoch": 1.0175438596491229, "grad_norm": 0.21384886986850865, "learning_rate": 1.470715835140998e-05, "loss": 0.7272, "step": 174 }, { "epoch": 1.023391812865497, "grad_norm": 0.2085869267067017, "learning_rate": 1.4663774403470716e-05, "loss": 0.7044, "step": 175 }, { "epoch": 1.0292397660818713, "grad_norm": 0.20631353790684379, "learning_rate": 1.4620390455531455e-05, "loss": 0.7181, "step": 176 }, { "epoch": 1.0350877192982457, "grad_norm": 0.24538509221900098, "learning_rate": 1.4577006507592192e-05, "loss": 0.7731, "step": 177 }, { "epoch": 1.04093567251462, "grad_norm": 0.23156823897416134, "learning_rate": 1.453362255965293e-05, "loss": 0.7129, "step": 178 }, { "epoch": 1.0467836257309941, "grad_norm": 0.20155082532453575, "learning_rate": 1.4490238611713667e-05, "loss": 0.7037, "step": 179 }, { "epoch": 1.0526315789473684, "grad_norm": 0.19242380310026896, "learning_rate": 1.4446854663774406e-05, "loss": 0.7026, "step": 180 }, { "epoch": 1.0584795321637426, "grad_norm": 0.21376599859201403, "learning_rate": 1.4403470715835141e-05, "loss": 0.7021, "step": 181 }, { "epoch": 1.064327485380117, "grad_norm": 0.21321842835439078, "learning_rate": 1.436008676789588e-05, "loss": 0.7186, "step": 182 }, { "epoch": 1.0701754385964912, "grad_norm": 0.23152992175479814, "learning_rate": 1.4316702819956618e-05, "loss": 0.7262, "step": 183 }, { "epoch": 1.0760233918128654, "grad_norm": 0.20707778685395156, "learning_rate": 1.4273318872017355e-05, "loss": 0.742, "step": 184 }, { "epoch": 1.0818713450292399, "grad_norm": 0.21284401184030297, "learning_rate": 1.4229934924078092e-05, "loss": 0.683, "step": 185 }, { "epoch": 1.087719298245614, "grad_norm": 0.21105448131636317, "learning_rate": 1.418655097613883e-05, "loss": 0.7218, "step": 186 }, { "epoch": 1.0935672514619883, "grad_norm": 0.23854659151648439, "learning_rate": 1.4143167028199567e-05, "loss": 0.707, "step": 187 }, { "epoch": 1.0994152046783625, "grad_norm": 0.1979900232322942, "learning_rate": 1.4099783080260306e-05, "loss": 0.6793, "step": 188 }, { "epoch": 1.1052631578947367, "grad_norm": 0.19940118749588795, "learning_rate": 1.4056399132321041e-05, "loss": 0.6793, "step": 189 }, { "epoch": 1.1111111111111112, "grad_norm": 0.2216608207413802, "learning_rate": 1.401301518438178e-05, "loss": 0.7183, "step": 190 }, { "epoch": 1.1169590643274854, "grad_norm": 0.19705996044476262, "learning_rate": 1.3969631236442517e-05, "loss": 0.692, "step": 191 }, { "epoch": 1.1228070175438596, "grad_norm": 0.18840081658391272, "learning_rate": 1.3926247288503255e-05, "loss": 0.69, "step": 192 }, { "epoch": 1.128654970760234, "grad_norm": 0.22778993399760028, "learning_rate": 1.3882863340563992e-05, "loss": 0.7458, "step": 193 }, { "epoch": 1.1345029239766082, "grad_norm": 0.19922962343898284, "learning_rate": 1.3839479392624731e-05, "loss": 0.6935, "step": 194 }, { "epoch": 1.1403508771929824, "grad_norm": 0.17961965737395658, "learning_rate": 1.3796095444685466e-05, "loss": 0.6902, "step": 195 }, { "epoch": 1.1461988304093567, "grad_norm": 0.20117480573787744, "learning_rate": 1.3752711496746205e-05, "loss": 0.6966, "step": 196 }, { "epoch": 1.1520467836257309, "grad_norm": 0.20576287270564314, "learning_rate": 1.3709327548806943e-05, "loss": 0.6626, "step": 197 }, { "epoch": 1.1578947368421053, "grad_norm": 0.20954364596102132, "learning_rate": 1.366594360086768e-05, "loss": 0.712, "step": 198 }, { "epoch": 1.1637426900584795, "grad_norm": 0.18682996007735939, "learning_rate": 1.3622559652928417e-05, "loss": 0.7075, "step": 199 }, { "epoch": 1.1695906432748537, "grad_norm": 0.20043695366127617, "learning_rate": 1.3579175704989156e-05, "loss": 0.688, "step": 200 }, { "epoch": 1.1754385964912282, "grad_norm": 0.19280097802899304, "learning_rate": 1.3535791757049892e-05, "loss": 0.7177, "step": 201 }, { "epoch": 1.1812865497076024, "grad_norm": 0.1857970119964957, "learning_rate": 1.349240780911063e-05, "loss": 0.6463, "step": 202 }, { "epoch": 1.1871345029239766, "grad_norm": 0.1825176976963816, "learning_rate": 1.3449023861171368e-05, "loss": 0.6673, "step": 203 }, { "epoch": 1.1929824561403508, "grad_norm": 0.22051713697050027, "learning_rate": 1.3405639913232105e-05, "loss": 0.7145, "step": 204 }, { "epoch": 1.198830409356725, "grad_norm": 0.18423219666459137, "learning_rate": 1.3362255965292842e-05, "loss": 0.6528, "step": 205 }, { "epoch": 1.2046783625730995, "grad_norm": 0.19618225427002017, "learning_rate": 1.3318872017353581e-05, "loss": 0.6668, "step": 206 }, { "epoch": 1.2105263157894737, "grad_norm": 0.20587148922859191, "learning_rate": 1.3275488069414317e-05, "loss": 0.7067, "step": 207 }, { "epoch": 1.2163742690058479, "grad_norm": 0.2090448851687986, "learning_rate": 1.3232104121475056e-05, "loss": 0.7029, "step": 208 }, { "epoch": 1.2222222222222223, "grad_norm": 0.19626708957217415, "learning_rate": 1.3188720173535795e-05, "loss": 0.6662, "step": 209 }, { "epoch": 1.2280701754385965, "grad_norm": 0.18762036234283117, "learning_rate": 1.314533622559653e-05, "loss": 0.6874, "step": 210 }, { "epoch": 1.2339181286549707, "grad_norm": 0.19417670023667025, "learning_rate": 1.3101952277657268e-05, "loss": 0.6683, "step": 211 }, { "epoch": 1.239766081871345, "grad_norm": 0.20177458796436643, "learning_rate": 1.3058568329718005e-05, "loss": 0.685, "step": 212 }, { "epoch": 1.2456140350877192, "grad_norm": 0.22040877827401095, "learning_rate": 1.3015184381778742e-05, "loss": 0.7254, "step": 213 }, { "epoch": 1.2514619883040936, "grad_norm": 0.19637215780432019, "learning_rate": 1.2971800433839481e-05, "loss": 0.6897, "step": 214 }, { "epoch": 1.2573099415204678, "grad_norm": 0.199110748095673, "learning_rate": 1.2928416485900217e-05, "loss": 0.6854, "step": 215 }, { "epoch": 1.263157894736842, "grad_norm": 0.21819712890299467, "learning_rate": 1.2885032537960956e-05, "loss": 0.6986, "step": 216 }, { "epoch": 1.2690058479532165, "grad_norm": 0.21142557814635124, "learning_rate": 1.2841648590021693e-05, "loss": 0.7203, "step": 217 }, { "epoch": 1.2748538011695907, "grad_norm": 0.18250187399866635, "learning_rate": 1.279826464208243e-05, "loss": 0.6785, "step": 218 }, { "epoch": 1.280701754385965, "grad_norm": 0.19755959536466466, "learning_rate": 1.2754880694143167e-05, "loss": 0.6706, "step": 219 }, { "epoch": 1.286549707602339, "grad_norm": 0.19529246308103604, "learning_rate": 1.2711496746203906e-05, "loss": 0.6998, "step": 220 }, { "epoch": 1.2923976608187133, "grad_norm": 0.1936160811683211, "learning_rate": 1.2668112798264642e-05, "loss": 0.6896, "step": 221 }, { "epoch": 1.2982456140350878, "grad_norm": 0.1845218398315034, "learning_rate": 1.262472885032538e-05, "loss": 0.6568, "step": 222 }, { "epoch": 1.304093567251462, "grad_norm": 0.20772884369385505, "learning_rate": 1.258134490238612e-05, "loss": 0.6625, "step": 223 }, { "epoch": 1.3099415204678362, "grad_norm": 0.229042568059284, "learning_rate": 1.2537960954446855e-05, "loss": 0.6861, "step": 224 }, { "epoch": 1.3157894736842106, "grad_norm": 0.20350171741172374, "learning_rate": 1.2494577006507593e-05, "loss": 0.6478, "step": 225 }, { "epoch": 1.3216374269005848, "grad_norm": 0.19144221585747292, "learning_rate": 1.2451193058568331e-05, "loss": 0.6764, "step": 226 }, { "epoch": 1.327485380116959, "grad_norm": 0.21913738701924326, "learning_rate": 1.2407809110629067e-05, "loss": 0.695, "step": 227 }, { "epoch": 1.3333333333333333, "grad_norm": 0.2020711158267139, "learning_rate": 1.2364425162689806e-05, "loss": 0.7062, "step": 228 }, { "epoch": 1.3391812865497075, "grad_norm": 0.21899620359258645, "learning_rate": 1.2321041214750545e-05, "loss": 0.7145, "step": 229 }, { "epoch": 1.345029239766082, "grad_norm": 0.18931923720637447, "learning_rate": 1.227765726681128e-05, "loss": 0.6956, "step": 230 }, { "epoch": 1.3508771929824561, "grad_norm": 0.1916810843880607, "learning_rate": 1.223427331887202e-05, "loss": 0.668, "step": 231 }, { "epoch": 1.3567251461988303, "grad_norm": 0.19261705533668297, "learning_rate": 1.2190889370932757e-05, "loss": 0.653, "step": 232 }, { "epoch": 1.3625730994152048, "grad_norm": 0.20814835626639575, "learning_rate": 1.2147505422993492e-05, "loss": 0.7068, "step": 233 }, { "epoch": 1.368421052631579, "grad_norm": 0.2076525513781835, "learning_rate": 1.2104121475054231e-05, "loss": 0.6989, "step": 234 }, { "epoch": 1.3742690058479532, "grad_norm": 0.1911948741286201, "learning_rate": 1.2060737527114967e-05, "loss": 0.6774, "step": 235 }, { "epoch": 1.3801169590643274, "grad_norm": 0.2100123955547407, "learning_rate": 1.2017353579175706e-05, "loss": 0.6997, "step": 236 }, { "epoch": 1.3859649122807016, "grad_norm": 0.31584573390504456, "learning_rate": 1.1973969631236445e-05, "loss": 0.7052, "step": 237 }, { "epoch": 1.391812865497076, "grad_norm": 0.18688166233524203, "learning_rate": 1.193058568329718e-05, "loss": 0.6526, "step": 238 }, { "epoch": 1.3976608187134503, "grad_norm": 0.22026356851753442, "learning_rate": 1.1887201735357918e-05, "loss": 0.6454, "step": 239 }, { "epoch": 1.4035087719298245, "grad_norm": 0.19323076025261185, "learning_rate": 1.1843817787418656e-05, "loss": 0.6594, "step": 240 }, { "epoch": 1.409356725146199, "grad_norm": 0.19902277064282112, "learning_rate": 1.1800433839479392e-05, "loss": 0.7244, "step": 241 }, { "epoch": 1.4152046783625731, "grad_norm": 0.1908671762046153, "learning_rate": 1.1757049891540131e-05, "loss": 0.6681, "step": 242 }, { "epoch": 1.4210526315789473, "grad_norm": 0.19560133699568794, "learning_rate": 1.171366594360087e-05, "loss": 0.6731, "step": 243 }, { "epoch": 1.4269005847953216, "grad_norm": 0.2094438443568091, "learning_rate": 1.1670281995661605e-05, "loss": 0.701, "step": 244 }, { "epoch": 1.4327485380116958, "grad_norm": 0.19053569086952576, "learning_rate": 1.1626898047722344e-05, "loss": 0.6104, "step": 245 }, { "epoch": 1.4385964912280702, "grad_norm": 0.19913609339747246, "learning_rate": 1.1583514099783082e-05, "loss": 0.6573, "step": 246 }, { "epoch": 1.4444444444444444, "grad_norm": 0.20956879358597585, "learning_rate": 1.1540130151843817e-05, "loss": 0.662, "step": 247 }, { "epoch": 1.4502923976608186, "grad_norm": 0.20216430625120646, "learning_rate": 1.1496746203904556e-05, "loss": 0.6505, "step": 248 }, { "epoch": 1.456140350877193, "grad_norm": 0.2061734262125184, "learning_rate": 1.1453362255965295e-05, "loss": 0.6786, "step": 249 }, { "epoch": 1.4619883040935673, "grad_norm": 0.22574876209542377, "learning_rate": 1.140997830802603e-05, "loss": 0.7325, "step": 250 }, { "epoch": 1.4678362573099415, "grad_norm": 0.1772735034592302, "learning_rate": 1.136659436008677e-05, "loss": 0.636, "step": 251 }, { "epoch": 1.4736842105263157, "grad_norm": 0.2073376585966582, "learning_rate": 1.1323210412147507e-05, "loss": 0.6791, "step": 252 }, { "epoch": 1.47953216374269, "grad_norm": 0.18918995778508665, "learning_rate": 1.1279826464208244e-05, "loss": 0.6406, "step": 253 }, { "epoch": 1.4853801169590644, "grad_norm": 0.20195402902912296, "learning_rate": 1.1236442516268981e-05, "loss": 0.6625, "step": 254 }, { "epoch": 1.4912280701754386, "grad_norm": 0.18582829458374092, "learning_rate": 1.119305856832972e-05, "loss": 0.6831, "step": 255 }, { "epoch": 1.4970760233918128, "grad_norm": 0.18667034513926425, "learning_rate": 1.1149674620390456e-05, "loss": 0.6819, "step": 256 }, { "epoch": 1.5029239766081872, "grad_norm": 0.1884977125227984, "learning_rate": 1.1106290672451195e-05, "loss": 0.6515, "step": 257 }, { "epoch": 1.5087719298245614, "grad_norm": 0.19917650464796147, "learning_rate": 1.1062906724511932e-05, "loss": 0.672, "step": 258 }, { "epoch": 1.5146198830409356, "grad_norm": 0.20496434407592237, "learning_rate": 1.101952277657267e-05, "loss": 0.6538, "step": 259 }, { "epoch": 1.52046783625731, "grad_norm": 0.18169707048812828, "learning_rate": 1.0976138828633407e-05, "loss": 0.661, "step": 260 }, { "epoch": 1.526315789473684, "grad_norm": 0.22056891228087572, "learning_rate": 1.0932754880694142e-05, "loss": 0.6929, "step": 261 }, { "epoch": 1.5321637426900585, "grad_norm": 0.2085232928793704, "learning_rate": 1.0889370932754881e-05, "loss": 0.6954, "step": 262 }, { "epoch": 1.5380116959064327, "grad_norm": 0.20789260798479195, "learning_rate": 1.084598698481562e-05, "loss": 0.7011, "step": 263 }, { "epoch": 1.543859649122807, "grad_norm": 0.1849807776906847, "learning_rate": 1.0802603036876356e-05, "loss": 0.686, "step": 264 }, { "epoch": 1.5497076023391814, "grad_norm": 0.18518274667657642, "learning_rate": 1.0759219088937095e-05, "loss": 0.6636, "step": 265 }, { "epoch": 1.5555555555555556, "grad_norm": 0.19333183404204385, "learning_rate": 1.0715835140997832e-05, "loss": 0.7133, "step": 266 }, { "epoch": 1.5614035087719298, "grad_norm": 0.19922629243071752, "learning_rate": 1.0672451193058569e-05, "loss": 0.6745, "step": 267 }, { "epoch": 1.5672514619883042, "grad_norm": 0.1895519362467185, "learning_rate": 1.0629067245119306e-05, "loss": 0.6648, "step": 268 }, { "epoch": 1.5730994152046782, "grad_norm": 0.1871315579144127, "learning_rate": 1.0585683297180045e-05, "loss": 0.6721, "step": 269 }, { "epoch": 1.5789473684210527, "grad_norm": 0.18380449023430315, "learning_rate": 1.0542299349240781e-05, "loss": 0.6697, "step": 270 }, { "epoch": 1.5847953216374269, "grad_norm": 0.2562545867136886, "learning_rate": 1.049891540130152e-05, "loss": 0.6969, "step": 271 }, { "epoch": 1.590643274853801, "grad_norm": 0.1740952081571547, "learning_rate": 1.0455531453362257e-05, "loss": 0.6282, "step": 272 }, { "epoch": 1.5964912280701755, "grad_norm": 0.1800104491661315, "learning_rate": 1.0412147505422994e-05, "loss": 0.666, "step": 273 }, { "epoch": 1.6023391812865497, "grad_norm": 0.21004999295392382, "learning_rate": 1.0368763557483732e-05, "loss": 0.6849, "step": 274 }, { "epoch": 1.608187134502924, "grad_norm": 0.1787656466284205, "learning_rate": 1.032537960954447e-05, "loss": 0.6723, "step": 275 }, { "epoch": 1.6140350877192984, "grad_norm": 0.21871948943154398, "learning_rate": 1.0281995661605206e-05, "loss": 0.6901, "step": 276 }, { "epoch": 1.6198830409356724, "grad_norm": 0.18361595886864504, "learning_rate": 1.0238611713665945e-05, "loss": 0.6421, "step": 277 }, { "epoch": 1.6257309941520468, "grad_norm": 0.18916065927378428, "learning_rate": 1.0195227765726682e-05, "loss": 0.6511, "step": 278 }, { "epoch": 1.631578947368421, "grad_norm": 0.1979017696376173, "learning_rate": 1.015184381778742e-05, "loss": 0.7018, "step": 279 }, { "epoch": 1.6374269005847952, "grad_norm": 0.18969323017848436, "learning_rate": 1.0108459869848157e-05, "loss": 0.6677, "step": 280 }, { "epoch": 1.6432748538011697, "grad_norm": 0.18594561560924552, "learning_rate": 1.0065075921908896e-05, "loss": 0.6584, "step": 281 }, { "epoch": 1.6491228070175439, "grad_norm": 0.17998449840117228, "learning_rate": 1.0021691973969631e-05, "loss": 0.6616, "step": 282 }, { "epoch": 1.654970760233918, "grad_norm": 0.18463090829340062, "learning_rate": 9.97830802603037e-06, "loss": 0.6718, "step": 283 }, { "epoch": 1.6608187134502925, "grad_norm": 0.1941128688320993, "learning_rate": 9.934924078091108e-06, "loss": 0.7071, "step": 284 }, { "epoch": 1.6666666666666665, "grad_norm": 0.19238570224026413, "learning_rate": 9.891540130151845e-06, "loss": 0.6732, "step": 285 }, { "epoch": 1.672514619883041, "grad_norm": 0.19518291083148756, "learning_rate": 9.848156182212582e-06, "loss": 0.6761, "step": 286 }, { "epoch": 1.6783625730994152, "grad_norm": 0.18202289710684016, "learning_rate": 9.80477223427332e-06, "loss": 0.6577, "step": 287 }, { "epoch": 1.6842105263157894, "grad_norm": 0.18625725493648193, "learning_rate": 9.761388286334057e-06, "loss": 0.7017, "step": 288 }, { "epoch": 1.6900584795321638, "grad_norm": 0.2100099826676321, "learning_rate": 9.718004338394794e-06, "loss": 0.6858, "step": 289 }, { "epoch": 1.695906432748538, "grad_norm": 0.2084953033980061, "learning_rate": 9.674620390455533e-06, "loss": 0.6674, "step": 290 }, { "epoch": 1.7017543859649122, "grad_norm": 0.18596234796611538, "learning_rate": 9.63123644251627e-06, "loss": 0.6496, "step": 291 }, { "epoch": 1.7076023391812867, "grad_norm": 0.1978635671319887, "learning_rate": 9.587852494577007e-06, "loss": 0.6642, "step": 292 }, { "epoch": 1.7134502923976607, "grad_norm": 0.1979317376200934, "learning_rate": 9.544468546637745e-06, "loss": 0.7282, "step": 293 }, { "epoch": 1.719298245614035, "grad_norm": 0.1792470289825809, "learning_rate": 9.501084598698482e-06, "loss": 0.661, "step": 294 }, { "epoch": 1.7251461988304093, "grad_norm": 0.18979635817761115, "learning_rate": 9.457700650759219e-06, "loss": 0.6911, "step": 295 }, { "epoch": 1.7309941520467835, "grad_norm": 0.21977929643672667, "learning_rate": 9.414316702819958e-06, "loss": 0.6636, "step": 296 }, { "epoch": 1.736842105263158, "grad_norm": 0.19142793950578896, "learning_rate": 9.370932754880695e-06, "loss": 0.6652, "step": 297 }, { "epoch": 1.7426900584795322, "grad_norm": 0.1872314527946603, "learning_rate": 9.327548806941433e-06, "loss": 0.7241, "step": 298 }, { "epoch": 1.7485380116959064, "grad_norm": 0.21294716763423086, "learning_rate": 9.28416485900217e-06, "loss": 0.6505, "step": 299 }, { "epoch": 1.7543859649122808, "grad_norm": 0.20627848491038323, "learning_rate": 9.240780911062907e-06, "loss": 0.6839, "step": 300 }, { "epoch": 1.7602339181286548, "grad_norm": 0.1913775128261492, "learning_rate": 9.197396963123644e-06, "loss": 0.7072, "step": 301 }, { "epoch": 1.7660818713450293, "grad_norm": 0.18287999729259147, "learning_rate": 9.154013015184382e-06, "loss": 0.6571, "step": 302 }, { "epoch": 1.7719298245614035, "grad_norm": 0.1743048128512118, "learning_rate": 9.11062906724512e-06, "loss": 0.6404, "step": 303 }, { "epoch": 1.7777777777777777, "grad_norm": 0.18832199972802494, "learning_rate": 9.067245119305858e-06, "loss": 0.6853, "step": 304 }, { "epoch": 1.7836257309941521, "grad_norm": 0.20655204935711033, "learning_rate": 9.023861171366595e-06, "loss": 0.7093, "step": 305 }, { "epoch": 1.7894736842105263, "grad_norm": 0.19209280973506734, "learning_rate": 8.980477223427332e-06, "loss": 0.6548, "step": 306 }, { "epoch": 1.7953216374269005, "grad_norm": 0.1885931981782652, "learning_rate": 8.93709327548807e-06, "loss": 0.6558, "step": 307 }, { "epoch": 1.801169590643275, "grad_norm": 0.1962953890386984, "learning_rate": 8.893709327548807e-06, "loss": 0.6586, "step": 308 }, { "epoch": 1.807017543859649, "grad_norm": 0.19945775782899686, "learning_rate": 8.850325379609546e-06, "loss": 0.6636, "step": 309 }, { "epoch": 1.8128654970760234, "grad_norm": 0.1941326419111805, "learning_rate": 8.806941431670283e-06, "loss": 0.6615, "step": 310 }, { "epoch": 1.8187134502923976, "grad_norm": 0.18927283838641645, "learning_rate": 8.76355748373102e-06, "loss": 0.6722, "step": 311 }, { "epoch": 1.8245614035087718, "grad_norm": 0.18432693872655953, "learning_rate": 8.720173535791757e-06, "loss": 0.6522, "step": 312 }, { "epoch": 1.8304093567251463, "grad_norm": 0.1971710237782894, "learning_rate": 8.676789587852495e-06, "loss": 0.6996, "step": 313 }, { "epoch": 1.8362573099415205, "grad_norm": 0.1809013320142788, "learning_rate": 8.633405639913232e-06, "loss": 0.6476, "step": 314 }, { "epoch": 1.8421052631578947, "grad_norm": 0.17273470066786814, "learning_rate": 8.59002169197397e-06, "loss": 0.6205, "step": 315 }, { "epoch": 1.8479532163742691, "grad_norm": 0.1973244932241699, "learning_rate": 8.546637744034708e-06, "loss": 0.7028, "step": 316 }, { "epoch": 1.8538011695906431, "grad_norm": 0.18443943998865936, "learning_rate": 8.503253796095445e-06, "loss": 0.6821, "step": 317 }, { "epoch": 1.8596491228070176, "grad_norm": 0.19742863809842442, "learning_rate": 8.459869848156183e-06, "loss": 0.696, "step": 318 }, { "epoch": 1.8654970760233918, "grad_norm": 0.19602002536800328, "learning_rate": 8.416485900216922e-06, "loss": 0.6643, "step": 319 }, { "epoch": 1.871345029239766, "grad_norm": 0.18322608246185332, "learning_rate": 8.373101952277657e-06, "loss": 0.6877, "step": 320 }, { "epoch": 1.8771929824561404, "grad_norm": 0.20312293700355982, "learning_rate": 8.329718004338394e-06, "loss": 0.6779, "step": 321 }, { "epoch": 1.8830409356725146, "grad_norm": 0.18955838414122606, "learning_rate": 8.286334056399133e-06, "loss": 0.6889, "step": 322 }, { "epoch": 1.8888888888888888, "grad_norm": 0.18608685857531174, "learning_rate": 8.24295010845987e-06, "loss": 0.7066, "step": 323 }, { "epoch": 1.8947368421052633, "grad_norm": 0.19324997721387963, "learning_rate": 8.199566160520608e-06, "loss": 0.6721, "step": 324 }, { "epoch": 1.9005847953216373, "grad_norm": 0.18269525520661356, "learning_rate": 8.156182212581345e-06, "loss": 0.6606, "step": 325 }, { "epoch": 1.9064327485380117, "grad_norm": 0.17879213689825307, "learning_rate": 8.112798264642084e-06, "loss": 0.6195, "step": 326 }, { "epoch": 1.912280701754386, "grad_norm": 0.19572563149944922, "learning_rate": 8.06941431670282e-06, "loss": 0.6553, "step": 327 }, { "epoch": 1.9181286549707601, "grad_norm": 0.20455740497972336, "learning_rate": 8.026030368763557e-06, "loss": 0.7073, "step": 328 }, { "epoch": 1.9239766081871346, "grad_norm": 0.20379817717927606, "learning_rate": 7.982646420824296e-06, "loss": 0.6656, "step": 329 }, { "epoch": 1.9298245614035088, "grad_norm": 0.18816989178876325, "learning_rate": 7.939262472885033e-06, "loss": 0.6599, "step": 330 }, { "epoch": 1.935672514619883, "grad_norm": 0.19040798822146188, "learning_rate": 7.89587852494577e-06, "loss": 0.6872, "step": 331 }, { "epoch": 1.9415204678362574, "grad_norm": 0.2060421681157549, "learning_rate": 7.85249457700651e-06, "loss": 0.6634, "step": 332 }, { "epoch": 1.9473684210526314, "grad_norm": 0.1841817001629427, "learning_rate": 7.809110629067247e-06, "loss": 0.6249, "step": 333 }, { "epoch": 1.9532163742690059, "grad_norm": 0.19185741242924698, "learning_rate": 7.765726681127982e-06, "loss": 0.6603, "step": 334 }, { "epoch": 1.95906432748538, "grad_norm": 0.17490775565813746, "learning_rate": 7.722342733188721e-06, "loss": 0.649, "step": 335 }, { "epoch": 1.9649122807017543, "grad_norm": 0.18154097192716664, "learning_rate": 7.678958785249458e-06, "loss": 0.6869, "step": 336 }, { "epoch": 1.9707602339181287, "grad_norm": 0.2171151900817146, "learning_rate": 7.635574837310196e-06, "loss": 0.6806, "step": 337 }, { "epoch": 1.976608187134503, "grad_norm": 0.20056475561893633, "learning_rate": 7.592190889370933e-06, "loss": 0.6143, "step": 338 }, { "epoch": 1.9824561403508771, "grad_norm": 0.1859196448723673, "learning_rate": 7.548806941431671e-06, "loss": 0.6565, "step": 339 }, { "epoch": 1.9883040935672516, "grad_norm": 0.18291788926738473, "learning_rate": 7.505422993492408e-06, "loss": 0.6656, "step": 340 }, { "epoch": 1.9941520467836256, "grad_norm": 0.1851247551589902, "learning_rate": 7.4620390455531455e-06, "loss": 0.658, "step": 341 }, { "epoch": 2.0, "grad_norm": 0.19091041161918318, "learning_rate": 7.418655097613884e-06, "loss": 0.675, "step": 342 }, { "epoch": 2.0058479532163744, "grad_norm": 0.18034530461971127, "learning_rate": 7.375271149674621e-06, "loss": 0.6371, "step": 343 }, { "epoch": 2.0116959064327484, "grad_norm": 0.17929321132321624, "learning_rate": 7.331887201735358e-06, "loss": 0.6238, "step": 344 }, { "epoch": 2.017543859649123, "grad_norm": 0.1805743053336667, "learning_rate": 7.288503253796096e-06, "loss": 0.676, "step": 345 }, { "epoch": 2.023391812865497, "grad_norm": 0.18134202268639932, "learning_rate": 7.2451193058568335e-06, "loss": 0.6926, "step": 346 }, { "epoch": 2.0292397660818713, "grad_norm": 0.16664489258040083, "learning_rate": 7.201735357917571e-06, "loss": 0.635, "step": 347 }, { "epoch": 2.0350877192982457, "grad_norm": 0.17418680651725119, "learning_rate": 7.158351409978309e-06, "loss": 0.6625, "step": 348 }, { "epoch": 2.0409356725146197, "grad_norm": 0.16806000135863103, "learning_rate": 7.114967462039046e-06, "loss": 0.656, "step": 349 }, { "epoch": 2.046783625730994, "grad_norm": 0.1766385026508446, "learning_rate": 7.071583514099783e-06, "loss": 0.644, "step": 350 }, { "epoch": 2.0526315789473686, "grad_norm": 0.18299281472851398, "learning_rate": 7.028199566160521e-06, "loss": 0.6609, "step": 351 }, { "epoch": 2.0584795321637426, "grad_norm": 0.20986189876178032, "learning_rate": 6.984815618221259e-06, "loss": 0.6113, "step": 352 }, { "epoch": 2.064327485380117, "grad_norm": 0.17241912699938555, "learning_rate": 6.941431670281996e-06, "loss": 0.622, "step": 353 }, { "epoch": 2.0701754385964914, "grad_norm": 0.17175110508577335, "learning_rate": 6.898047722342733e-06, "loss": 0.6475, "step": 354 }, { "epoch": 2.0760233918128654, "grad_norm": 0.17952837380953865, "learning_rate": 6.854663774403471e-06, "loss": 0.624, "step": 355 }, { "epoch": 2.08187134502924, "grad_norm": 0.16440737350129503, "learning_rate": 6.8112798264642086e-06, "loss": 0.6216, "step": 356 }, { "epoch": 2.087719298245614, "grad_norm": 0.19647840255348978, "learning_rate": 6.767895878524946e-06, "loss": 0.6685, "step": 357 }, { "epoch": 2.0935672514619883, "grad_norm": 0.1696642474859097, "learning_rate": 6.724511930585684e-06, "loss": 0.6513, "step": 358 }, { "epoch": 2.0994152046783627, "grad_norm": 0.16781192390446642, "learning_rate": 6.681127982646421e-06, "loss": 0.6316, "step": 359 }, { "epoch": 2.1052631578947367, "grad_norm": 0.17665396661182975, "learning_rate": 6.6377440347071584e-06, "loss": 0.6444, "step": 360 }, { "epoch": 2.111111111111111, "grad_norm": 0.17026024356498806, "learning_rate": 6.594360086767897e-06, "loss": 0.6369, "step": 361 }, { "epoch": 2.116959064327485, "grad_norm": 0.1771238959431666, "learning_rate": 6.550976138828634e-06, "loss": 0.6363, "step": 362 }, { "epoch": 2.1228070175438596, "grad_norm": 0.18074195829403725, "learning_rate": 6.507592190889371e-06, "loss": 0.6295, "step": 363 }, { "epoch": 2.128654970760234, "grad_norm": 0.17590315483807462, "learning_rate": 6.464208242950108e-06, "loss": 0.6352, "step": 364 }, { "epoch": 2.134502923976608, "grad_norm": 0.1833679378524948, "learning_rate": 6.420824295010846e-06, "loss": 0.668, "step": 365 }, { "epoch": 2.1403508771929824, "grad_norm": 0.17426945543091085, "learning_rate": 6.377440347071584e-06, "loss": 0.6309, "step": 366 }, { "epoch": 2.146198830409357, "grad_norm": 0.17558570852982017, "learning_rate": 6.334056399132321e-06, "loss": 0.6183, "step": 367 }, { "epoch": 2.152046783625731, "grad_norm": 0.18869020603808476, "learning_rate": 6.29067245119306e-06, "loss": 0.6743, "step": 368 }, { "epoch": 2.1578947368421053, "grad_norm": 0.16860328391840887, "learning_rate": 6.247288503253796e-06, "loss": 0.6272, "step": 369 }, { "epoch": 2.1637426900584797, "grad_norm": 0.1787201818661304, "learning_rate": 6.2039045553145335e-06, "loss": 0.6536, "step": 370 }, { "epoch": 2.1695906432748537, "grad_norm": 0.17123056998213806, "learning_rate": 6.1605206073752725e-06, "loss": 0.6382, "step": 371 }, { "epoch": 2.175438596491228, "grad_norm": 0.1687316996284582, "learning_rate": 6.11713665943601e-06, "loss": 0.6212, "step": 372 }, { "epoch": 2.181286549707602, "grad_norm": 0.1891269844696612, "learning_rate": 6.073752711496746e-06, "loss": 0.6585, "step": 373 }, { "epoch": 2.1871345029239766, "grad_norm": 0.1725455615706422, "learning_rate": 6.030368763557483e-06, "loss": 0.6559, "step": 374 }, { "epoch": 2.192982456140351, "grad_norm": 0.16915435536877974, "learning_rate": 5.986984815618222e-06, "loss": 0.6324, "step": 375 }, { "epoch": 2.198830409356725, "grad_norm": 0.17215684923648952, "learning_rate": 5.943600867678959e-06, "loss": 0.6539, "step": 376 }, { "epoch": 2.2046783625730995, "grad_norm": 0.1954313719903045, "learning_rate": 5.900216919739696e-06, "loss": 0.6866, "step": 377 }, { "epoch": 2.2105263157894735, "grad_norm": 0.17042598764998235, "learning_rate": 5.856832971800435e-06, "loss": 0.6558, "step": 378 }, { "epoch": 2.216374269005848, "grad_norm": 0.17192364297534282, "learning_rate": 5.813449023861172e-06, "loss": 0.6378, "step": 379 }, { "epoch": 2.2222222222222223, "grad_norm": 0.1739599234963019, "learning_rate": 5.770065075921909e-06, "loss": 0.6194, "step": 380 }, { "epoch": 2.2280701754385963, "grad_norm": 0.17013107466272653, "learning_rate": 5.7266811279826476e-06, "loss": 0.6071, "step": 381 }, { "epoch": 2.2339181286549707, "grad_norm": 0.1848300211606863, "learning_rate": 5.683297180043385e-06, "loss": 0.6859, "step": 382 }, { "epoch": 2.239766081871345, "grad_norm": 0.17752768182741563, "learning_rate": 5.639913232104122e-06, "loss": 0.6674, "step": 383 }, { "epoch": 2.245614035087719, "grad_norm": 0.17268014916608854, "learning_rate": 5.59652928416486e-06, "loss": 0.6447, "step": 384 }, { "epoch": 2.2514619883040936, "grad_norm": 0.1975248493024482, "learning_rate": 5.5531453362255974e-06, "loss": 0.6877, "step": 385 }, { "epoch": 2.257309941520468, "grad_norm": 0.1854455256428647, "learning_rate": 5.509761388286335e-06, "loss": 0.6663, "step": 386 }, { "epoch": 2.263157894736842, "grad_norm": 0.18048830972034413, "learning_rate": 5.466377440347071e-06, "loss": 0.6515, "step": 387 }, { "epoch": 2.2690058479532165, "grad_norm": 0.18529428469214002, "learning_rate": 5.42299349240781e-06, "loss": 0.6742, "step": 388 }, { "epoch": 2.2748538011695905, "grad_norm": 0.1953029253712016, "learning_rate": 5.379609544468547e-06, "loss": 0.6715, "step": 389 }, { "epoch": 2.280701754385965, "grad_norm": 0.18506576413704273, "learning_rate": 5.3362255965292846e-06, "loss": 0.6441, "step": 390 }, { "epoch": 2.2865497076023393, "grad_norm": 0.20519359995385428, "learning_rate": 5.292841648590023e-06, "loss": 0.6324, "step": 391 }, { "epoch": 2.2923976608187133, "grad_norm": 0.1812910105371836, "learning_rate": 5.24945770065076e-06, "loss": 0.6151, "step": 392 }, { "epoch": 2.2982456140350878, "grad_norm": 0.16615863932290006, "learning_rate": 5.206073752711497e-06, "loss": 0.6178, "step": 393 }, { "epoch": 2.3040935672514617, "grad_norm": 0.1867312948806079, "learning_rate": 5.162689804772235e-06, "loss": 0.6844, "step": 394 }, { "epoch": 2.309941520467836, "grad_norm": 0.17482246796590165, "learning_rate": 5.1193058568329725e-06, "loss": 0.6316, "step": 395 }, { "epoch": 2.3157894736842106, "grad_norm": 0.18919167148846638, "learning_rate": 5.07592190889371e-06, "loss": 0.6929, "step": 396 }, { "epoch": 2.3216374269005846, "grad_norm": 0.17135779399399315, "learning_rate": 5.032537960954448e-06, "loss": 0.6507, "step": 397 }, { "epoch": 2.327485380116959, "grad_norm": 0.16589752923541318, "learning_rate": 4.989154013015185e-06, "loss": 0.6169, "step": 398 }, { "epoch": 2.3333333333333335, "grad_norm": 0.17836212191167625, "learning_rate": 4.945770065075922e-06, "loss": 0.6534, "step": 399 }, { "epoch": 2.3391812865497075, "grad_norm": 0.17486989043138282, "learning_rate": 4.90238611713666e-06, "loss": 0.6229, "step": 400 }, { "epoch": 2.345029239766082, "grad_norm": 0.18358705375708667, "learning_rate": 4.859002169197397e-06, "loss": 0.6806, "step": 401 }, { "epoch": 2.3508771929824563, "grad_norm": 0.17755890153992399, "learning_rate": 4.815618221258135e-06, "loss": 0.6835, "step": 402 }, { "epoch": 2.3567251461988303, "grad_norm": 0.1796432140151646, "learning_rate": 4.772234273318872e-06, "loss": 0.6643, "step": 403 }, { "epoch": 2.3625730994152048, "grad_norm": 0.16924652263187123, "learning_rate": 4.7288503253796095e-06, "loss": 0.6157, "step": 404 }, { "epoch": 2.3684210526315788, "grad_norm": 0.1726890776222668, "learning_rate": 4.685466377440348e-06, "loss": 0.6557, "step": 405 }, { "epoch": 2.374269005847953, "grad_norm": 0.1780550008345725, "learning_rate": 4.642082429501085e-06, "loss": 0.6554, "step": 406 }, { "epoch": 2.3801169590643276, "grad_norm": 0.17128524860089567, "learning_rate": 4.598698481561822e-06, "loss": 0.6768, "step": 407 }, { "epoch": 2.3859649122807016, "grad_norm": 0.16558703660041527, "learning_rate": 4.55531453362256e-06, "loss": 0.6492, "step": 408 }, { "epoch": 2.391812865497076, "grad_norm": 0.17532697429039085, "learning_rate": 4.5119305856832975e-06, "loss": 0.6572, "step": 409 }, { "epoch": 2.39766081871345, "grad_norm": 0.16937166076280238, "learning_rate": 4.468546637744035e-06, "loss": 0.6628, "step": 410 }, { "epoch": 2.4035087719298245, "grad_norm": 0.18312431667652093, "learning_rate": 4.425162689804773e-06, "loss": 0.6477, "step": 411 }, { "epoch": 2.409356725146199, "grad_norm": 0.1695999967647095, "learning_rate": 4.38177874186551e-06, "loss": 0.6603, "step": 412 }, { "epoch": 2.415204678362573, "grad_norm": 0.1720525919637609, "learning_rate": 4.338394793926247e-06, "loss": 0.6452, "step": 413 }, { "epoch": 2.4210526315789473, "grad_norm": 0.16726491752955858, "learning_rate": 4.295010845986985e-06, "loss": 0.6199, "step": 414 }, { "epoch": 2.426900584795322, "grad_norm": 0.2059763044769876, "learning_rate": 4.251626898047723e-06, "loss": 0.6783, "step": 415 }, { "epoch": 2.4327485380116958, "grad_norm": 0.1731042493041403, "learning_rate": 4.208242950108461e-06, "loss": 0.6289, "step": 416 }, { "epoch": 2.43859649122807, "grad_norm": 0.17595716841396303, "learning_rate": 4.164859002169197e-06, "loss": 0.6607, "step": 417 }, { "epoch": 2.4444444444444446, "grad_norm": 0.17232699533316642, "learning_rate": 4.121475054229935e-06, "loss": 0.6203, "step": 418 }, { "epoch": 2.4502923976608186, "grad_norm": 0.17550156147686838, "learning_rate": 4.078091106290673e-06, "loss": 0.6584, "step": 419 }, { "epoch": 2.456140350877193, "grad_norm": 0.18080214333436065, "learning_rate": 4.03470715835141e-06, "loss": 0.6031, "step": 420 }, { "epoch": 2.461988304093567, "grad_norm": 0.18048583412947314, "learning_rate": 3.991323210412148e-06, "loss": 0.6354, "step": 421 }, { "epoch": 2.4678362573099415, "grad_norm": 0.18253929691844767, "learning_rate": 3.947939262472885e-06, "loss": 0.6502, "step": 422 }, { "epoch": 2.473684210526316, "grad_norm": 0.1697304593286738, "learning_rate": 3.904555314533623e-06, "loss": 0.6332, "step": 423 }, { "epoch": 2.47953216374269, "grad_norm": 0.17269048510291535, "learning_rate": 3.8611713665943606e-06, "loss": 0.6095, "step": 424 }, { "epoch": 2.4853801169590644, "grad_norm": 0.16216619960446743, "learning_rate": 3.817787418655098e-06, "loss": 0.6112, "step": 425 }, { "epoch": 2.4912280701754383, "grad_norm": 0.17239216132714047, "learning_rate": 3.7744034707158355e-06, "loss": 0.6715, "step": 426 }, { "epoch": 2.497076023391813, "grad_norm": 0.1715509924251108, "learning_rate": 3.7310195227765728e-06, "loss": 0.6459, "step": 427 }, { "epoch": 2.502923976608187, "grad_norm": 0.1674736258064931, "learning_rate": 3.6876355748373104e-06, "loss": 0.6355, "step": 428 }, { "epoch": 2.5087719298245617, "grad_norm": 0.16465926005700326, "learning_rate": 3.644251626898048e-06, "loss": 0.6381, "step": 429 }, { "epoch": 2.5146198830409356, "grad_norm": 0.1766218788353798, "learning_rate": 3.6008676789587854e-06, "loss": 0.67, "step": 430 }, { "epoch": 2.52046783625731, "grad_norm": 0.17349720246234343, "learning_rate": 3.557483731019523e-06, "loss": 0.6493, "step": 431 }, { "epoch": 2.526315789473684, "grad_norm": 0.167194985421623, "learning_rate": 3.5140997830802603e-06, "loss": 0.628, "step": 432 }, { "epoch": 2.5321637426900585, "grad_norm": 0.1704752632069036, "learning_rate": 3.470715835140998e-06, "loss": 0.6431, "step": 433 }, { "epoch": 2.538011695906433, "grad_norm": 0.18481707817941734, "learning_rate": 3.4273318872017357e-06, "loss": 0.6715, "step": 434 }, { "epoch": 2.543859649122807, "grad_norm": 0.1953699500403843, "learning_rate": 3.383947939262473e-06, "loss": 0.6845, "step": 435 }, { "epoch": 2.5497076023391814, "grad_norm": 0.16379216515216974, "learning_rate": 3.3405639913232106e-06, "loss": 0.6455, "step": 436 }, { "epoch": 2.5555555555555554, "grad_norm": 0.1980120403081147, "learning_rate": 3.2971800433839487e-06, "loss": 0.6695, "step": 437 }, { "epoch": 2.56140350877193, "grad_norm": 0.16118979174422027, "learning_rate": 3.2537960954446855e-06, "loss": 0.5928, "step": 438 }, { "epoch": 2.5672514619883042, "grad_norm": 0.1657791823109499, "learning_rate": 3.210412147505423e-06, "loss": 0.645, "step": 439 }, { "epoch": 2.573099415204678, "grad_norm": 0.17132915146971192, "learning_rate": 3.1670281995661605e-06, "loss": 0.6847, "step": 440 }, { "epoch": 2.5789473684210527, "grad_norm": 0.16324444549230824, "learning_rate": 3.123644251626898e-06, "loss": 0.6413, "step": 441 }, { "epoch": 2.5847953216374266, "grad_norm": 0.17488238495665867, "learning_rate": 3.0802603036876362e-06, "loss": 0.6321, "step": 442 }, { "epoch": 2.590643274853801, "grad_norm": 0.17634328132329954, "learning_rate": 3.036876355748373e-06, "loss": 0.6784, "step": 443 }, { "epoch": 2.5964912280701755, "grad_norm": 0.17868073636307982, "learning_rate": 2.993492407809111e-06, "loss": 0.678, "step": 444 }, { "epoch": 2.60233918128655, "grad_norm": 0.1632381806494582, "learning_rate": 2.950108459869848e-06, "loss": 0.6266, "step": 445 }, { "epoch": 2.608187134502924, "grad_norm": 0.16547418794872898, "learning_rate": 2.906724511930586e-06, "loss": 0.6328, "step": 446 }, { "epoch": 2.6140350877192984, "grad_norm": 0.17622874984246908, "learning_rate": 2.8633405639913238e-06, "loss": 0.661, "step": 447 }, { "epoch": 2.6198830409356724, "grad_norm": 0.16541694161777937, "learning_rate": 2.819956616052061e-06, "loss": 0.6236, "step": 448 }, { "epoch": 2.625730994152047, "grad_norm": 0.1662936609526993, "learning_rate": 2.7765726681127987e-06, "loss": 0.6159, "step": 449 }, { "epoch": 2.6315789473684212, "grad_norm": 0.16669675160496522, "learning_rate": 2.7331887201735356e-06, "loss": 0.6505, "step": 450 }, { "epoch": 2.6374269005847952, "grad_norm": 0.16549634014330256, "learning_rate": 2.6898047722342737e-06, "loss": 0.6508, "step": 451 }, { "epoch": 2.6432748538011697, "grad_norm": 0.1810240612515184, "learning_rate": 2.6464208242950113e-06, "loss": 0.6705, "step": 452 }, { "epoch": 2.6491228070175437, "grad_norm": 0.17384206262358587, "learning_rate": 2.6030368763557486e-06, "loss": 0.6376, "step": 453 }, { "epoch": 2.654970760233918, "grad_norm": 0.17845392301327417, "learning_rate": 2.5596529284164863e-06, "loss": 0.6726, "step": 454 }, { "epoch": 2.6608187134502925, "grad_norm": 0.1998313342763234, "learning_rate": 2.516268980477224e-06, "loss": 0.7018, "step": 455 }, { "epoch": 2.6666666666666665, "grad_norm": 0.16980658137466279, "learning_rate": 2.472885032537961e-06, "loss": 0.649, "step": 456 }, { "epoch": 2.672514619883041, "grad_norm": 0.18303799471242801, "learning_rate": 2.4295010845986985e-06, "loss": 0.6807, "step": 457 }, { "epoch": 2.678362573099415, "grad_norm": 0.16687064769711984, "learning_rate": 2.386117136659436e-06, "loss": 0.625, "step": 458 }, { "epoch": 2.6842105263157894, "grad_norm": 0.17675345144700674, "learning_rate": 2.342733188720174e-06, "loss": 0.6877, "step": 459 }, { "epoch": 2.690058479532164, "grad_norm": 0.18401049432140446, "learning_rate": 2.299349240780911e-06, "loss": 0.6887, "step": 460 }, { "epoch": 2.6959064327485383, "grad_norm": 0.1744979749607572, "learning_rate": 2.2559652928416487e-06, "loss": 0.6589, "step": 461 }, { "epoch": 2.7017543859649122, "grad_norm": 0.1746641852105471, "learning_rate": 2.2125813449023864e-06, "loss": 0.6495, "step": 462 }, { "epoch": 2.7076023391812867, "grad_norm": 0.16657675516372344, "learning_rate": 2.1691973969631237e-06, "loss": 0.662, "step": 463 }, { "epoch": 2.7134502923976607, "grad_norm": 0.17198446823209654, "learning_rate": 2.1258134490238614e-06, "loss": 0.6732, "step": 464 }, { "epoch": 2.719298245614035, "grad_norm": 0.1666041499402243, "learning_rate": 2.0824295010845986e-06, "loss": 0.6812, "step": 465 }, { "epoch": 2.7251461988304095, "grad_norm": 0.17396505588176064, "learning_rate": 2.0390455531453363e-06, "loss": 0.6591, "step": 466 }, { "epoch": 2.7309941520467835, "grad_norm": 0.17207201652443582, "learning_rate": 1.995661605206074e-06, "loss": 0.6278, "step": 467 }, { "epoch": 2.736842105263158, "grad_norm": 0.16767533054287867, "learning_rate": 1.9522776572668117e-06, "loss": 0.6508, "step": 468 }, { "epoch": 2.742690058479532, "grad_norm": 0.17199489358502026, "learning_rate": 1.908893709327549e-06, "loss": 0.6652, "step": 469 }, { "epoch": 2.7485380116959064, "grad_norm": 0.15742337242113655, "learning_rate": 1.8655097613882864e-06, "loss": 0.6281, "step": 470 }, { "epoch": 2.754385964912281, "grad_norm": 0.16549888305173557, "learning_rate": 1.822125813449024e-06, "loss": 0.6257, "step": 471 }, { "epoch": 2.760233918128655, "grad_norm": 0.17228844722867567, "learning_rate": 1.7787418655097615e-06, "loss": 0.6897, "step": 472 }, { "epoch": 2.7660818713450293, "grad_norm": 0.16656984900009209, "learning_rate": 1.735357917570499e-06, "loss": 0.6576, "step": 473 }, { "epoch": 2.7719298245614032, "grad_norm": 0.1617090427960584, "learning_rate": 1.6919739696312365e-06, "loss": 0.6375, "step": 474 }, { "epoch": 2.7777777777777777, "grad_norm": 0.17066915492008342, "learning_rate": 1.6485900216919743e-06, "loss": 0.6434, "step": 475 }, { "epoch": 2.783625730994152, "grad_norm": 0.17283365217712324, "learning_rate": 1.6052060737527116e-06, "loss": 0.6404, "step": 476 }, { "epoch": 2.7894736842105265, "grad_norm": 0.16377562920106029, "learning_rate": 1.561822125813449e-06, "loss": 0.5996, "step": 477 }, { "epoch": 2.7953216374269005, "grad_norm": 0.16639432488486533, "learning_rate": 1.5184381778741865e-06, "loss": 0.5969, "step": 478 }, { "epoch": 2.801169590643275, "grad_norm": 0.16980646505093647, "learning_rate": 1.475054229934924e-06, "loss": 0.6769, "step": 479 }, { "epoch": 2.807017543859649, "grad_norm": 0.1628222318868079, "learning_rate": 1.4316702819956619e-06, "loss": 0.6508, "step": 480 }, { "epoch": 2.8128654970760234, "grad_norm": 0.18172158119006254, "learning_rate": 1.3882863340563994e-06, "loss": 0.6604, "step": 481 }, { "epoch": 2.818713450292398, "grad_norm": 0.16423487898529526, "learning_rate": 1.3449023861171368e-06, "loss": 0.6228, "step": 482 }, { "epoch": 2.824561403508772, "grad_norm": 0.17478062902651836, "learning_rate": 1.3015184381778743e-06, "loss": 0.6251, "step": 483 }, { "epoch": 2.8304093567251463, "grad_norm": 0.1726032282493946, "learning_rate": 1.258134490238612e-06, "loss": 0.6735, "step": 484 }, { "epoch": 2.8362573099415203, "grad_norm": 0.16790264066853555, "learning_rate": 1.2147505422993492e-06, "loss": 0.6434, "step": 485 }, { "epoch": 2.8421052631578947, "grad_norm": 0.1671571499569638, "learning_rate": 1.171366594360087e-06, "loss": 0.6578, "step": 486 }, { "epoch": 2.847953216374269, "grad_norm": 0.16863160149729373, "learning_rate": 1.1279826464208244e-06, "loss": 0.6242, "step": 487 }, { "epoch": 2.853801169590643, "grad_norm": 0.161190538585518, "learning_rate": 1.0845986984815618e-06, "loss": 0.6342, "step": 488 }, { "epoch": 2.8596491228070176, "grad_norm": 0.16562131765046972, "learning_rate": 1.0412147505422993e-06, "loss": 0.6346, "step": 489 }, { "epoch": 2.8654970760233915, "grad_norm": 0.16478891417223968, "learning_rate": 9.97830802603037e-07, "loss": 0.629, "step": 490 }, { "epoch": 2.871345029239766, "grad_norm": 0.1652066649082407, "learning_rate": 9.544468546637745e-07, "loss": 0.6512, "step": 491 }, { "epoch": 2.8771929824561404, "grad_norm": 0.1808259238987679, "learning_rate": 9.11062906724512e-07, "loss": 0.6501, "step": 492 }, { "epoch": 2.883040935672515, "grad_norm": 0.16595306747518687, "learning_rate": 8.676789587852495e-07, "loss": 0.6187, "step": 493 }, { "epoch": 2.888888888888889, "grad_norm": 0.16577185891507523, "learning_rate": 8.242950108459872e-07, "loss": 0.6608, "step": 494 }, { "epoch": 2.8947368421052633, "grad_norm": 0.17578227817996883, "learning_rate": 7.809110629067245e-07, "loss": 0.6522, "step": 495 }, { "epoch": 2.9005847953216373, "grad_norm": 0.16712846714191626, "learning_rate": 7.37527114967462e-07, "loss": 0.6741, "step": 496 }, { "epoch": 2.9064327485380117, "grad_norm": 0.1695052637928444, "learning_rate": 6.941431670281997e-07, "loss": 0.6697, "step": 497 }, { "epoch": 2.912280701754386, "grad_norm": 0.16523586320140343, "learning_rate": 6.507592190889371e-07, "loss": 0.6359, "step": 498 }, { "epoch": 2.91812865497076, "grad_norm": 0.17250068186561412, "learning_rate": 6.073752711496746e-07, "loss": 0.635, "step": 499 }, { "epoch": 2.9239766081871346, "grad_norm": 0.15377259520433112, "learning_rate": 5.639913232104122e-07, "loss": 0.5981, "step": 500 }, { "epoch": 2.9298245614035086, "grad_norm": 0.1658156010520523, "learning_rate": 5.206073752711497e-07, "loss": 0.6501, "step": 501 }, { "epoch": 2.935672514619883, "grad_norm": 0.16408478791637582, "learning_rate": 4.772234273318872e-07, "loss": 0.6142, "step": 502 }, { "epoch": 2.9415204678362574, "grad_norm": 0.16388275974704705, "learning_rate": 4.3383947939262475e-07, "loss": 0.6752, "step": 503 }, { "epoch": 2.9473684210526314, "grad_norm": 0.16114934023396965, "learning_rate": 3.9045553145336227e-07, "loss": 0.6342, "step": 504 }, { "epoch": 2.953216374269006, "grad_norm": 0.16383179577586124, "learning_rate": 3.4707158351409984e-07, "loss": 0.6357, "step": 505 }, { "epoch": 2.95906432748538, "grad_norm": 0.15975403048273223, "learning_rate": 3.036876355748373e-07, "loss": 0.6188, "step": 506 }, { "epoch": 2.9649122807017543, "grad_norm": 0.1676357353298206, "learning_rate": 2.6030368763557483e-07, "loss": 0.6311, "step": 507 }, { "epoch": 2.9707602339181287, "grad_norm": 0.15959844257029077, "learning_rate": 2.1691973969631237e-07, "loss": 0.6397, "step": 508 }, { "epoch": 2.976608187134503, "grad_norm": 0.1748528110908195, "learning_rate": 1.7353579175704992e-07, "loss": 0.6354, "step": 509 }, { "epoch": 2.982456140350877, "grad_norm": 0.16899094676604337, "learning_rate": 1.3015184381778741e-07, "loss": 0.6553, "step": 510 }, { "epoch": 2.9883040935672516, "grad_norm": 0.18419361328324801, "learning_rate": 8.676789587852496e-08, "loss": 0.6327, "step": 511 }, { "epoch": 2.9941520467836256, "grad_norm": 0.1710576834882864, "learning_rate": 4.338394793926248e-08, "loss": 0.6288, "step": 512 }, { "epoch": 3.0, "grad_norm": 0.1608724141438565, "learning_rate": 0.0, "loss": 0.6311, "step": 513 }, { "epoch": 3.0, "step": 513, "total_flos": 233827146399744.0, "train_loss": 0.7522663490349322, "train_runtime": 28313.6625, "train_samples_per_second": 0.289, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 513, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 233827146399744.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }