{ "best_metric": 0.6652334928512573, "best_model_checkpoint": "/l/users/visionlanguage/mostafa_ciai/hf_checkpoints_code_ciai_gemma2/checkpoint-1700", "epoch": 5.994075260208167, "eval_steps": 50, "global_step": 1752, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006832132372564718, "grad_norm": 93.82548522949219, "learning_rate": 2.777777777777778e-06, "loss": 208.4052, "step": 2 }, { "epoch": 0.013664264745129436, "grad_norm": 65.51689147949219, "learning_rate": 5.555555555555556e-06, "loss": 194.4831, "step": 4 }, { "epoch": 0.020496397117694156, "grad_norm": 30.816993713378906, "learning_rate": 8.333333333333334e-06, "loss": 159.6516, "step": 6 }, { "epoch": 0.027328529490258872, "grad_norm": 30.113662719726562, "learning_rate": 1.1111111111111112e-05, "loss": 145.5557, "step": 8 }, { "epoch": 0.03416066186282359, "grad_norm": 22.37295150756836, "learning_rate": 1.388888888888889e-05, "loss": 128.5444, "step": 10 }, { "epoch": 0.04099279423538831, "grad_norm": 22.287870407104492, "learning_rate": 1.6666666666666667e-05, "loss": 116.2723, "step": 12 }, { "epoch": 0.04782492660795303, "grad_norm": 16.027904510498047, "learning_rate": 1.9444444444444445e-05, "loss": 107.5451, "step": 14 }, { "epoch": 0.054657058980517745, "grad_norm": 17.97212791442871, "learning_rate": 2.2222222222222223e-05, "loss": 100.7136, "step": 16 }, { "epoch": 0.061489191353082465, "grad_norm": 15.427449226379395, "learning_rate": 2.5e-05, "loss": 96.4422, "step": 18 }, { "epoch": 0.06832132372564718, "grad_norm": 11.836018562316895, "learning_rate": 2.777777777777778e-05, "loss": 89.9874, "step": 20 }, { "epoch": 0.0751534560982119, "grad_norm": 13.170073509216309, "learning_rate": 3.055555555555556e-05, "loss": 90.5263, "step": 22 }, { "epoch": 0.08198558847077662, "grad_norm": 12.781464576721191, "learning_rate": 3.3333333333333335e-05, "loss": 87.3144, "step": 24 }, { "epoch": 0.08881772084334134, "grad_norm": 11.460458755493164, "learning_rate": 3.611111111111111e-05, "loss": 85.6209, "step": 26 }, { "epoch": 0.09564985321590606, "grad_norm": 10.382000923156738, "learning_rate": 3.888888888888889e-05, "loss": 88.2803, "step": 28 }, { "epoch": 0.10248198558847077, "grad_norm": 10.578895568847656, "learning_rate": 4.166666666666667e-05, "loss": 80.589, "step": 30 }, { "epoch": 0.10931411796103549, "grad_norm": 10.231274604797363, "learning_rate": 4.4444444444444447e-05, "loss": 83.0791, "step": 32 }, { "epoch": 0.11614625033360021, "grad_norm": 13.121459007263184, "learning_rate": 4.722222222222222e-05, "loss": 81.0775, "step": 34 }, { "epoch": 0.12297838270616493, "grad_norm": 11.594988822937012, "learning_rate": 5e-05, "loss": 79.3985, "step": 36 }, { "epoch": 0.12981051507872965, "grad_norm": 10.554534912109375, "learning_rate": 4.9999832415172185e-05, "loss": 78.9732, "step": 38 }, { "epoch": 0.13664264745129437, "grad_norm": 9.661481857299805, "learning_rate": 4.9999329662935534e-05, "loss": 77.5229, "step": 40 }, { "epoch": 0.1434747798238591, "grad_norm": 11.10251235961914, "learning_rate": 4.9998491750030315e-05, "loss": 77.7747, "step": 42 }, { "epoch": 0.1503069121964238, "grad_norm": 9.058899879455566, "learning_rate": 4.999731868769027e-05, "loss": 79.2141, "step": 44 }, { "epoch": 0.15713904456898853, "grad_norm": 9.254643440246582, "learning_rate": 4.999581049164237e-05, "loss": 77.5962, "step": 46 }, { "epoch": 0.16397117694155325, "grad_norm": 10.37578010559082, "learning_rate": 4.99939671821067e-05, "loss": 76.6356, "step": 48 }, { "epoch": 0.17080330931411797, "grad_norm": 9.983922004699707, "learning_rate": 4.999178878379611e-05, "loss": 76.0763, "step": 50 }, { "epoch": 0.17080330931411797, "eval_loss": 1.20554518699646, "eval_runtime": 119.3115, "eval_samples_per_second": 33.065, "eval_steps_per_second": 8.272, "step": 50 }, { "epoch": 0.1776354416866827, "grad_norm": 9.109485626220703, "learning_rate": 4.998927532591592e-05, "loss": 75.2524, "step": 52 }, { "epoch": 0.1844675740592474, "grad_norm": 8.939992904663086, "learning_rate": 4.9986426842163515e-05, "loss": 75.8614, "step": 54 }, { "epoch": 0.19129970643181213, "grad_norm": 8.342733383178711, "learning_rate": 4.9983243370727914e-05, "loss": 72.864, "step": 56 }, { "epoch": 0.19813183880437685, "grad_norm": 7.625518321990967, "learning_rate": 4.9979724954289244e-05, "loss": 75.7165, "step": 58 }, { "epoch": 0.20496397117694154, "grad_norm": 6.545467853546143, "learning_rate": 4.9975871640018154e-05, "loss": 72.337, "step": 60 }, { "epoch": 0.21179610354950626, "grad_norm": 8.73936939239502, "learning_rate": 4.99716834795752e-05, "loss": 73.0804, "step": 62 }, { "epoch": 0.21862823592207098, "grad_norm": 7.599481105804443, "learning_rate": 4.996716052911017e-05, "loss": 71.3494, "step": 64 }, { "epoch": 0.2254603682946357, "grad_norm": 8.88508415222168, "learning_rate": 4.996230284926128e-05, "loss": 73.4886, "step": 66 }, { "epoch": 0.23229250066720042, "grad_norm": 7.141696453094482, "learning_rate": 4.99571105051544e-05, "loss": 73.0934, "step": 68 }, { "epoch": 0.23912463303976514, "grad_norm": 8.946745872497559, "learning_rate": 4.99515835664022e-05, "loss": 70.5761, "step": 70 }, { "epoch": 0.24595676541232986, "grad_norm": 7.428682804107666, "learning_rate": 4.994572210710315e-05, "loss": 69.8488, "step": 72 }, { "epoch": 0.2527888977848946, "grad_norm": 10.490913391113281, "learning_rate": 4.993952620584058e-05, "loss": 72.1602, "step": 74 }, { "epoch": 0.2596210301574593, "grad_norm": 6.010617733001709, "learning_rate": 4.993299594568163e-05, "loss": 70.0962, "step": 76 }, { "epoch": 0.26645316253002405, "grad_norm": 5.207183361053467, "learning_rate": 4.992613141417608e-05, "loss": 70.6436, "step": 78 }, { "epoch": 0.27328529490258874, "grad_norm": 7.816757678985596, "learning_rate": 4.9918932703355256e-05, "loss": 68.9464, "step": 80 }, { "epoch": 0.28011742727515343, "grad_norm": 6.2263383865356445, "learning_rate": 4.9911399909730714e-05, "loss": 68.8249, "step": 82 }, { "epoch": 0.2869495596477182, "grad_norm": 6.726258754730225, "learning_rate": 4.990353313429303e-05, "loss": 68.7637, "step": 84 }, { "epoch": 0.29378169202028287, "grad_norm": 5.4038543701171875, "learning_rate": 4.989533248251037e-05, "loss": 68.7726, "step": 86 }, { "epoch": 0.3006138243928476, "grad_norm": 9.256815910339355, "learning_rate": 4.988679806432712e-05, "loss": 68.2967, "step": 88 }, { "epoch": 0.3074459567654123, "grad_norm": 7.765486717224121, "learning_rate": 4.98779299941624e-05, "loss": 70.6181, "step": 90 }, { "epoch": 0.31427808913797706, "grad_norm": 7.625786304473877, "learning_rate": 4.9868728390908526e-05, "loss": 68.5738, "step": 92 }, { "epoch": 0.32111022151054175, "grad_norm": 7.776100158691406, "learning_rate": 4.985919337792944e-05, "loss": 65.0074, "step": 94 }, { "epoch": 0.3279423538831065, "grad_norm": 6.496335029602051, "learning_rate": 4.9849325083059e-05, "loss": 66.7343, "step": 96 }, { "epoch": 0.3347744862556712, "grad_norm": 6.616697311401367, "learning_rate": 4.983912363859935e-05, "loss": 69.292, "step": 98 }, { "epoch": 0.34160661862823594, "grad_norm": 7.259242057800293, "learning_rate": 4.982858918131906e-05, "loss": 66.8941, "step": 100 }, { "epoch": 0.34160661862823594, "eval_loss": 1.0700218677520752, "eval_runtime": 119.6843, "eval_samples_per_second": 32.962, "eval_steps_per_second": 8.247, "step": 100 }, { "epoch": 0.34843875100080063, "grad_norm": 7.206521987915039, "learning_rate": 4.981772185245135e-05, "loss": 68.3145, "step": 102 }, { "epoch": 0.3552708833733654, "grad_norm": 6.332549095153809, "learning_rate": 4.980652179769218e-05, "loss": 67.5062, "step": 104 }, { "epoch": 0.36210301574593007, "grad_norm": 8.422966957092285, "learning_rate": 4.979498916719828e-05, "loss": 69.0426, "step": 106 }, { "epoch": 0.3689351481184948, "grad_norm": 4.5074357986450195, "learning_rate": 4.978312411558518e-05, "loss": 66.0764, "step": 108 }, { "epoch": 0.3757672804910595, "grad_norm": 6.847994327545166, "learning_rate": 4.977092680192507e-05, "loss": 68.0597, "step": 110 }, { "epoch": 0.38259941286362426, "grad_norm": 9.010295867919922, "learning_rate": 4.9758397389744734e-05, "loss": 66.7856, "step": 112 }, { "epoch": 0.38943154523618895, "grad_norm": 8.793087005615234, "learning_rate": 4.9745536047023324e-05, "loss": 66.6415, "step": 114 }, { "epoch": 0.3962636776087537, "grad_norm": 6.820159912109375, "learning_rate": 4.973234294619011e-05, "loss": 66.8668, "step": 116 }, { "epoch": 0.4030958099813184, "grad_norm": 10.739355087280273, "learning_rate": 4.971881826412218e-05, "loss": 64.5842, "step": 118 }, { "epoch": 0.4099279423538831, "grad_norm": 6.451905727386475, "learning_rate": 4.9704962182142044e-05, "loss": 64.2948, "step": 120 }, { "epoch": 0.4167600747264478, "grad_norm": 6.998046398162842, "learning_rate": 4.9690774886015244e-05, "loss": 66.095, "step": 122 }, { "epoch": 0.4235922070990125, "grad_norm": 6.946700096130371, "learning_rate": 4.967625656594782e-05, "loss": 66.6205, "step": 124 }, { "epoch": 0.43042433947157727, "grad_norm": 7.656089782714844, "learning_rate": 4.966140741658379e-05, "loss": 65.2253, "step": 126 }, { "epoch": 0.43725647184414196, "grad_norm": 8.242254257202148, "learning_rate": 4.9646227637002515e-05, "loss": 65.4466, "step": 128 }, { "epoch": 0.4440886042167067, "grad_norm": 6.5599894523620605, "learning_rate": 4.963071743071607e-05, "loss": 64.5302, "step": 130 }, { "epoch": 0.4509207365892714, "grad_norm": 5.671536922454834, "learning_rate": 4.961487700566646e-05, "loss": 64.9711, "step": 132 }, { "epoch": 0.45775286896183615, "grad_norm": 6.317226886749268, "learning_rate": 4.9598706574222886e-05, "loss": 66.1428, "step": 134 }, { "epoch": 0.46458500133440084, "grad_norm": 7.731470584869385, "learning_rate": 4.958220635317886e-05, "loss": 65.6398, "step": 136 }, { "epoch": 0.4714171337069656, "grad_norm": 7.070956230163574, "learning_rate": 4.956537656374933e-05, "loss": 64.027, "step": 138 }, { "epoch": 0.4782492660795303, "grad_norm": 5.216205596923828, "learning_rate": 4.9548217431567665e-05, "loss": 64.9929, "step": 140 }, { "epoch": 0.485081398452095, "grad_norm": 6.5882344245910645, "learning_rate": 4.95307291866827e-05, "loss": 66.2789, "step": 142 }, { "epoch": 0.4919135308246597, "grad_norm": 5.5962934494018555, "learning_rate": 4.95129120635556e-05, "loss": 65.4516, "step": 144 }, { "epoch": 0.49874566319722446, "grad_norm": 7.341054916381836, "learning_rate": 4.949476630105669e-05, "loss": 64.339, "step": 146 }, { "epoch": 0.5055777955697892, "grad_norm": 7.5083441734313965, "learning_rate": 4.9476292142462374e-05, "loss": 62.7076, "step": 148 }, { "epoch": 0.5124099279423538, "grad_norm": 5.081834316253662, "learning_rate": 4.945748983545172e-05, "loss": 64.2066, "step": 150 }, { "epoch": 0.5124099279423538, "eval_loss": 0.9920685291290283, "eval_runtime": 120.1858, "eval_samples_per_second": 32.824, "eval_steps_per_second": 8.212, "step": 150 }, { "epoch": 0.5192420603149186, "grad_norm": 6.279696464538574, "learning_rate": 4.943835963210324e-05, "loss": 63.3412, "step": 152 }, { "epoch": 0.5260741926874833, "grad_norm": 6.806802749633789, "learning_rate": 4.941890178889149e-05, "loss": 63.2038, "step": 154 }, { "epoch": 0.5329063250600481, "grad_norm": 8.012312889099121, "learning_rate": 4.939911656668361e-05, "loss": 63.4725, "step": 156 }, { "epoch": 0.5397384574326127, "grad_norm": 6.68613338470459, "learning_rate": 4.937900423073585e-05, "loss": 62.8267, "step": 158 }, { "epoch": 0.5465705898051775, "grad_norm": 6.391062259674072, "learning_rate": 4.9358565050689985e-05, "loss": 63.4099, "step": 160 }, { "epoch": 0.5534027221777422, "grad_norm": 6.4117817878723145, "learning_rate": 4.933779930056975e-05, "loss": 62.475, "step": 162 }, { "epoch": 0.5602348545503069, "grad_norm": 10.238900184631348, "learning_rate": 4.93167072587771e-05, "loss": 62.3929, "step": 164 }, { "epoch": 0.5670669869228716, "grad_norm": 6.800478935241699, "learning_rate": 4.929528920808854e-05, "loss": 63.4465, "step": 166 }, { "epoch": 0.5738991192954364, "grad_norm": 6.688059329986572, "learning_rate": 4.92735454356513e-05, "loss": 62.3017, "step": 168 }, { "epoch": 0.5807312516680011, "grad_norm": 5.010741710662842, "learning_rate": 4.925147623297949e-05, "loss": 61.5306, "step": 170 }, { "epoch": 0.5875633840405657, "grad_norm": 6.061219215393066, "learning_rate": 4.922908189595018e-05, "loss": 63.5529, "step": 172 }, { "epoch": 0.5943955164131305, "grad_norm": 7.6835126876831055, "learning_rate": 4.920636272479946e-05, "loss": 64.4077, "step": 174 }, { "epoch": 0.6012276487856952, "grad_norm": 5.945671558380127, "learning_rate": 4.9183319024118415e-05, "loss": 64.3411, "step": 176 }, { "epoch": 0.60805978115826, "grad_norm": 4.983694076538086, "learning_rate": 4.915995110284901e-05, "loss": 63.5529, "step": 178 }, { "epoch": 0.6148919135308246, "grad_norm": 5.736062049865723, "learning_rate": 4.9136259274279955e-05, "loss": 63.7282, "step": 180 }, { "epoch": 0.6217240459033894, "grad_norm": 6.8453545570373535, "learning_rate": 4.911224385604255e-05, "loss": 63.5027, "step": 182 }, { "epoch": 0.6285561782759541, "grad_norm": 5.9253668785095215, "learning_rate": 4.908790517010636e-05, "loss": 60.5142, "step": 184 }, { "epoch": 0.6353883106485189, "grad_norm": 5.743585586547852, "learning_rate": 4.906324354277495e-05, "loss": 62.4935, "step": 186 }, { "epoch": 0.6422204430210835, "grad_norm": 4.686921119689941, "learning_rate": 4.903825930468149e-05, "loss": 60.8045, "step": 188 }, { "epoch": 0.6490525753936482, "grad_norm": 5.350888729095459, "learning_rate": 4.901295279078431e-05, "loss": 62.3775, "step": 190 }, { "epoch": 0.655884707766213, "grad_norm": 5.417562961578369, "learning_rate": 4.898732434036244e-05, "loss": 60.1095, "step": 192 }, { "epoch": 0.6627168401387777, "grad_norm": 5.238453388214111, "learning_rate": 4.896137429701102e-05, "loss": 62.8943, "step": 194 }, { "epoch": 0.6695489725113424, "grad_norm": 6.252527713775635, "learning_rate": 4.893510300863676e-05, "loss": 61.1666, "step": 196 }, { "epoch": 0.6763811048839071, "grad_norm": 5.860842704772949, "learning_rate": 4.890851082745319e-05, "loss": 62.6643, "step": 198 }, { "epoch": 0.6832132372564719, "grad_norm": 6.3946099281311035, "learning_rate": 4.8881598109976004e-05, "loss": 61.939, "step": 200 }, { "epoch": 0.6832132372564719, "eval_loss": 0.9664058685302734, "eval_runtime": 119.3157, "eval_samples_per_second": 33.064, "eval_steps_per_second": 8.272, "step": 200 }, { "epoch": 0.6900453696290365, "grad_norm": 5.909948825836182, "learning_rate": 4.885436521701824e-05, "loss": 63.9172, "step": 202 }, { "epoch": 0.6968775020016013, "grad_norm": 6.600235462188721, "learning_rate": 4.8826812513685487e-05, "loss": 60.6396, "step": 204 }, { "epoch": 0.703709634374166, "grad_norm": 5.97224235534668, "learning_rate": 4.8798940369370944e-05, "loss": 61.1365, "step": 206 }, { "epoch": 0.7105417667467308, "grad_norm": 5.521954536437988, "learning_rate": 4.877074915775049e-05, "loss": 61.9178, "step": 208 }, { "epoch": 0.7173738991192954, "grad_norm": 4.756962299346924, "learning_rate": 4.8742239256777674e-05, "loss": 60.0003, "step": 210 }, { "epoch": 0.7242060314918601, "grad_norm": 7.966216564178467, "learning_rate": 4.8713411048678635e-05, "loss": 60.3937, "step": 212 }, { "epoch": 0.7310381638644249, "grad_norm": 5.864863872528076, "learning_rate": 4.868426491994702e-05, "loss": 60.5208, "step": 214 }, { "epoch": 0.7378702962369896, "grad_norm": 4.952422142028809, "learning_rate": 4.865480126133872e-05, "loss": 61.4458, "step": 216 }, { "epoch": 0.7447024286095543, "grad_norm": 4.522135257720947, "learning_rate": 4.862502046786671e-05, "loss": 62.5035, "step": 218 }, { "epoch": 0.751534560982119, "grad_norm": 4.29464054107666, "learning_rate": 4.859492293879574e-05, "loss": 61.5825, "step": 220 }, { "epoch": 0.7583666933546838, "grad_norm": 5.789974212646484, "learning_rate": 4.856450907763693e-05, "loss": 59.9352, "step": 222 }, { "epoch": 0.7651988257272485, "grad_norm": 6.44216251373291, "learning_rate": 4.853377929214243e-05, "loss": 59.1637, "step": 224 }, { "epoch": 0.7720309580998131, "grad_norm": 4.520390033721924, "learning_rate": 4.85027339942999e-05, "loss": 60.4813, "step": 226 }, { "epoch": 0.7788630904723779, "grad_norm": 6.058870315551758, "learning_rate": 4.8471373600326996e-05, "loss": 60.2968, "step": 228 }, { "epoch": 0.7856952228449426, "grad_norm": 5.945502281188965, "learning_rate": 4.843969853066584e-05, "loss": 58.2098, "step": 230 }, { "epoch": 0.7925273552175074, "grad_norm": 4.318876266479492, "learning_rate": 4.8407709209977305e-05, "loss": 58.4711, "step": 232 }, { "epoch": 0.799359487590072, "grad_norm": 5.385821342468262, "learning_rate": 4.837540606713538e-05, "loss": 59.5379, "step": 234 }, { "epoch": 0.8061916199626368, "grad_norm": 6.59214973449707, "learning_rate": 4.834278953522138e-05, "loss": 58.4163, "step": 236 }, { "epoch": 0.8130237523352015, "grad_norm": 5.087238311767578, "learning_rate": 4.8309860051518204e-05, "loss": 60.5546, "step": 238 }, { "epoch": 0.8198558847077662, "grad_norm": 6.804642200469971, "learning_rate": 4.8276618057504376e-05, "loss": 59.0874, "step": 240 }, { "epoch": 0.8266880170803309, "grad_norm": 5.035391330718994, "learning_rate": 4.824306399884822e-05, "loss": 59.9545, "step": 242 }, { "epoch": 0.8335201494528957, "grad_norm": 5.837290287017822, "learning_rate": 4.8209198325401815e-05, "loss": 59.5963, "step": 244 }, { "epoch": 0.8403522818254604, "grad_norm": 4.17293643951416, "learning_rate": 4.817502149119502e-05, "loss": 59.7065, "step": 246 }, { "epoch": 0.847184414198025, "grad_norm": 4.964944362640381, "learning_rate": 4.8140533954429327e-05, "loss": 59.5358, "step": 248 }, { "epoch": 0.8540165465705898, "grad_norm": 6.021297931671143, "learning_rate": 4.810573617747178e-05, "loss": 60.6391, "step": 250 }, { "epoch": 0.8540165465705898, "eval_loss": 0.9407148361206055, "eval_runtime": 119.9595, "eval_samples_per_second": 32.886, "eval_steps_per_second": 8.228, "step": 250 }, { "epoch": 0.8608486789431545, "grad_norm": 5.707021713256836, "learning_rate": 4.8070628626848735e-05, "loss": 61.5872, "step": 252 }, { "epoch": 0.8676808113157193, "grad_norm": 4.725375652313232, "learning_rate": 4.803521177323962e-05, "loss": 59.2192, "step": 254 }, { "epoch": 0.8745129436882839, "grad_norm": 23.445714950561523, "learning_rate": 4.799948609147061e-05, "loss": 60.1762, "step": 256 }, { "epoch": 0.8813450760608487, "grad_norm": 5.503020286560059, "learning_rate": 4.796345206050829e-05, "loss": 62.2226, "step": 258 }, { "epoch": 0.8881772084334134, "grad_norm": 6.558228015899658, "learning_rate": 4.792711016345321e-05, "loss": 62.089, "step": 260 }, { "epoch": 0.8950093408059782, "grad_norm": 8.109895706176758, "learning_rate": 4.7890460887533417e-05, "loss": 60.7872, "step": 262 }, { "epoch": 0.9018414731785428, "grad_norm": 5.230234622955322, "learning_rate": 4.785350472409792e-05, "loss": 57.9312, "step": 264 }, { "epoch": 0.9086736055511075, "grad_norm": 6.669562339782715, "learning_rate": 4.7816242168610093e-05, "loss": 61.7966, "step": 266 }, { "epoch": 0.9155057379236723, "grad_norm": 5.428192615509033, "learning_rate": 4.777867372064105e-05, "loss": 58.4551, "step": 268 }, { "epoch": 0.922337870296237, "grad_norm": 5.6168131828308105, "learning_rate": 4.774079988386296e-05, "loss": 59.9015, "step": 270 }, { "epoch": 0.9291700026688017, "grad_norm": 5.785460948944092, "learning_rate": 4.770262116604224e-05, "loss": 59.723, "step": 272 }, { "epoch": 0.9360021350413664, "grad_norm": 8.77035140991211, "learning_rate": 4.76641380790328e-05, "loss": 60.8996, "step": 274 }, { "epoch": 0.9428342674139312, "grad_norm": 4.000178813934326, "learning_rate": 4.762535113876917e-05, "loss": 59.2908, "step": 276 }, { "epoch": 0.9496663997864959, "grad_norm": 5.8565826416015625, "learning_rate": 4.758626086525956e-05, "loss": 59.296, "step": 278 }, { "epoch": 0.9564985321590606, "grad_norm": 6.792466163635254, "learning_rate": 4.754686778257891e-05, "loss": 58.351, "step": 280 }, { "epoch": 0.9633306645316253, "grad_norm": 6.484628677368164, "learning_rate": 4.750717241886185e-05, "loss": 58.46, "step": 282 }, { "epoch": 0.97016279690419, "grad_norm": 5.421430587768555, "learning_rate": 4.7467175306295655e-05, "loss": 59.0205, "step": 284 }, { "epoch": 0.9769949292767547, "grad_norm": 4.550335884094238, "learning_rate": 4.7426876981113044e-05, "loss": 60.8234, "step": 286 }, { "epoch": 0.9838270616493194, "grad_norm": 5.412383079528809, "learning_rate": 4.738627798358506e-05, "loss": 57.3651, "step": 288 }, { "epoch": 0.9906591940218842, "grad_norm": 5.225856781005859, "learning_rate": 4.7345378858013776e-05, "loss": 58.8522, "step": 290 }, { "epoch": 0.9974913263944489, "grad_norm": 3.856189250946045, "learning_rate": 4.730418015272503e-05, "loss": 59.7945, "step": 292 }, { "epoch": 1.0034160661862823, "grad_norm": 6.19010066986084, "learning_rate": 4.726268242006106e-05, "loss": 50.2722, "step": 294 }, { "epoch": 1.0102481985588472, "grad_norm": 5.333181858062744, "learning_rate": 4.722088621637309e-05, "loss": 58.7285, "step": 296 }, { "epoch": 1.0170803309314118, "grad_norm": 5.93973970413208, "learning_rate": 4.717879210201389e-05, "loss": 57.2823, "step": 298 }, { "epoch": 1.0239124633039765, "grad_norm": 4.59360408782959, "learning_rate": 4.713640064133025e-05, "loss": 58.4687, "step": 300 }, { "epoch": 1.0239124633039765, "eval_loss": 0.9195547699928284, "eval_runtime": 119.3076, "eval_samples_per_second": 33.066, "eval_steps_per_second": 8.273, "step": 300 }, { "epoch": 1.0307445956765413, "grad_norm": 5.437332630157471, "learning_rate": 4.7093712402655427e-05, "loss": 57.7491, "step": 302 }, { "epoch": 1.037576728049106, "grad_norm": 4.938009738922119, "learning_rate": 4.7050727958301506e-05, "loss": 58.2642, "step": 304 }, { "epoch": 1.0444088604216706, "grad_norm": 5.104777812957764, "learning_rate": 4.7007447884551745e-05, "loss": 56.1312, "step": 306 }, { "epoch": 1.0512409927942354, "grad_norm": 5.78248405456543, "learning_rate": 4.6963872761652835e-05, "loss": 56.9488, "step": 308 }, { "epoch": 1.0580731251668, "grad_norm": 4.8224287033081055, "learning_rate": 4.692000317380715e-05, "loss": 56.6993, "step": 310 }, { "epoch": 1.064905257539365, "grad_norm": 4.517540454864502, "learning_rate": 4.687583970916487e-05, "loss": 58.8636, "step": 312 }, { "epoch": 1.0717373899119296, "grad_norm": 5.353949069976807, "learning_rate": 4.683138295981611e-05, "loss": 58.6762, "step": 314 }, { "epoch": 1.0785695222844942, "grad_norm": 6.164919376373291, "learning_rate": 4.678663352178301e-05, "loss": 57.9218, "step": 316 }, { "epoch": 1.085401654657059, "grad_norm": 4.577470302581787, "learning_rate": 4.674159199501173e-05, "loss": 58.1644, "step": 318 }, { "epoch": 1.0922337870296237, "grad_norm": 6.5861592292785645, "learning_rate": 4.6696258983364385e-05, "loss": 57.3447, "step": 320 }, { "epoch": 1.0990659194021883, "grad_norm": 4.327467918395996, "learning_rate": 4.665063509461097e-05, "loss": 57.2627, "step": 322 }, { "epoch": 1.1058980517747532, "grad_norm": 7.534716606140137, "learning_rate": 4.660472094042121e-05, "loss": 57.2099, "step": 324 }, { "epoch": 1.1127301841473178, "grad_norm": 5.549008369445801, "learning_rate": 4.655851713635635e-05, "loss": 58.4564, "step": 326 }, { "epoch": 1.1195623165198825, "grad_norm": 4.385070323944092, "learning_rate": 4.651202430186092e-05, "loss": 57.0019, "step": 328 }, { "epoch": 1.1263944488924473, "grad_norm": 4.763044357299805, "learning_rate": 4.6465243060254415e-05, "loss": 55.7849, "step": 330 }, { "epoch": 1.133226581265012, "grad_norm": 3.9461379051208496, "learning_rate": 4.641817403872293e-05, "loss": 56.2399, "step": 332 }, { "epoch": 1.1400587136375768, "grad_norm": 4.946137428283691, "learning_rate": 4.637081786831079e-05, "loss": 56.7089, "step": 334 }, { "epoch": 1.1468908460101415, "grad_norm": 5.664731025695801, "learning_rate": 4.6323175183912024e-05, "loss": 57.1022, "step": 336 }, { "epoch": 1.153722978382706, "grad_norm": 5.261230945587158, "learning_rate": 4.627524662426194e-05, "loss": 56.3552, "step": 338 }, { "epoch": 1.160555110755271, "grad_norm": 4.166741847991943, "learning_rate": 4.6227032831928484e-05, "loss": 56.888, "step": 340 }, { "epoch": 1.1673872431278356, "grad_norm": 6.015218734741211, "learning_rate": 4.6178534453303666e-05, "loss": 57.3006, "step": 342 }, { "epoch": 1.1742193755004002, "grad_norm": 6.349710941314697, "learning_rate": 4.6129752138594874e-05, "loss": 57.0208, "step": 344 }, { "epoch": 1.181051507872965, "grad_norm": 5.403022766113281, "learning_rate": 4.608068654181617e-05, "loss": 57.0645, "step": 346 }, { "epoch": 1.1878836402455297, "grad_norm": 6.523670673370361, "learning_rate": 4.6031338320779534e-05, "loss": 58.2164, "step": 348 }, { "epoch": 1.1947157726180944, "grad_norm": 6.369359970092773, "learning_rate": 4.5981708137086e-05, "loss": 56.7965, "step": 350 }, { "epoch": 1.1947157726180944, "eval_loss": 0.8986765146255493, "eval_runtime": 119.0222, "eval_samples_per_second": 33.145, "eval_steps_per_second": 8.293, "step": 350 }, { "epoch": 1.2015479049906592, "grad_norm": 5.050749778747559, "learning_rate": 4.5931796656116846e-05, "loss": 56.7828, "step": 352 }, { "epoch": 1.2083800373632239, "grad_norm": 5.341484069824219, "learning_rate": 4.588160454702462e-05, "loss": 57.4058, "step": 354 }, { "epoch": 1.2152121697357887, "grad_norm": 4.554074287414551, "learning_rate": 4.5831132482724195e-05, "loss": 57.6257, "step": 356 }, { "epoch": 1.2220443021083534, "grad_norm": 4.951889514923096, "learning_rate": 4.578038113988376e-05, "loss": 56.0608, "step": 358 }, { "epoch": 1.228876434480918, "grad_norm": 4.2526421546936035, "learning_rate": 4.572935119891571e-05, "loss": 55.8586, "step": 360 }, { "epoch": 1.2357085668534828, "grad_norm": 4.805353164672852, "learning_rate": 4.5678043343967554e-05, "loss": 59.2427, "step": 362 }, { "epoch": 1.2425406992260475, "grad_norm": 4.9927978515625, "learning_rate": 4.5626458262912745e-05, "loss": 55.1494, "step": 364 }, { "epoch": 1.2493728315986123, "grad_norm": 5.778275012969971, "learning_rate": 4.557459664734141e-05, "loss": 55.9791, "step": 366 }, { "epoch": 1.256204963971177, "grad_norm": 4.41555643081665, "learning_rate": 4.552245919255117e-05, "loss": 57.3123, "step": 368 }, { "epoch": 1.2630370963437416, "grad_norm": 5.230330944061279, "learning_rate": 4.5470046597537735e-05, "loss": 55.9031, "step": 370 }, { "epoch": 1.2698692287163063, "grad_norm": 3.9548189640045166, "learning_rate": 4.541735956498554e-05, "loss": 56.6997, "step": 372 }, { "epoch": 1.2767013610888711, "grad_norm": 5.017361640930176, "learning_rate": 4.5364398801258396e-05, "loss": 57.3268, "step": 374 }, { "epoch": 1.2835334934614357, "grad_norm": 5.562941074371338, "learning_rate": 4.5311165016389916e-05, "loss": 55.6271, "step": 376 }, { "epoch": 1.2903656258340006, "grad_norm": 6.675297737121582, "learning_rate": 4.525765892407409e-05, "loss": 55.9593, "step": 378 }, { "epoch": 1.2971977582065652, "grad_norm": 6.47582483291626, "learning_rate": 4.5203881241655644e-05, "loss": 57.0788, "step": 380 }, { "epoch": 1.3040298905791299, "grad_norm": 5.157675743103027, "learning_rate": 4.514983269012049e-05, "loss": 56.3623, "step": 382 }, { "epoch": 1.3108620229516947, "grad_norm": 8.075702667236328, "learning_rate": 4.509551399408598e-05, "loss": 55.6531, "step": 384 }, { "epoch": 1.3176941553242594, "grad_norm": 3.849310874938965, "learning_rate": 4.504092588179128e-05, "loss": 58.7546, "step": 386 }, { "epoch": 1.3245262876968242, "grad_norm": 3.6027579307556152, "learning_rate": 4.498606908508754e-05, "loss": 57.7153, "step": 388 }, { "epoch": 1.3313584200693889, "grad_norm": 5.139729976654053, "learning_rate": 4.4930944339428085e-05, "loss": 56.4532, "step": 390 }, { "epoch": 1.3381905524419535, "grad_norm": 5.337704181671143, "learning_rate": 4.487555238385862e-05, "loss": 54.2958, "step": 392 }, { "epoch": 1.3450226848145181, "grad_norm": 3.3229618072509766, "learning_rate": 4.481989396100724e-05, "loss": 54.2046, "step": 394 }, { "epoch": 1.351854817187083, "grad_norm": 5.2183074951171875, "learning_rate": 4.476396981707453e-05, "loss": 56.0147, "step": 396 }, { "epoch": 1.3586869495596476, "grad_norm": 5.028941631317139, "learning_rate": 4.470778070182353e-05, "loss": 54.3446, "step": 398 }, { "epoch": 1.3655190819322125, "grad_norm": 6.347212791442871, "learning_rate": 4.465132736856969e-05, "loss": 56.7659, "step": 400 }, { "epoch": 1.3655190819322125, "eval_loss": 0.8771227598190308, "eval_runtime": 118.9477, "eval_samples_per_second": 33.166, "eval_steps_per_second": 8.298, "step": 400 }, { "epoch": 1.3723512143047771, "grad_norm": 9.381309509277344, "learning_rate": 4.459461057417078e-05, "loss": 56.8099, "step": 402 }, { "epoch": 1.3791833466773418, "grad_norm": 5.657813549041748, "learning_rate": 4.453763107901675e-05, "loss": 56.3326, "step": 404 }, { "epoch": 1.3860154790499066, "grad_norm": 4.476396083831787, "learning_rate": 4.4480389647019505e-05, "loss": 57.3978, "step": 406 }, { "epoch": 1.3928476114224713, "grad_norm": 5.402798652648926, "learning_rate": 4.442288704560268e-05, "loss": 55.7143, "step": 408 }, { "epoch": 1.3996797437950361, "grad_norm": 4.367002010345459, "learning_rate": 4.436512404569136e-05, "loss": 55.7044, "step": 410 }, { "epoch": 1.4065118761676008, "grad_norm": 5.653073310852051, "learning_rate": 4.430710142170176e-05, "loss": 55.7266, "step": 412 }, { "epoch": 1.4133440085401654, "grad_norm": 7.221829414367676, "learning_rate": 4.424881995153076e-05, "loss": 56.4174, "step": 414 }, { "epoch": 1.4201761409127303, "grad_norm": 5.465057373046875, "learning_rate": 4.419028041654559e-05, "loss": 56.9093, "step": 416 }, { "epoch": 1.427008273285295, "grad_norm": 8.383552551269531, "learning_rate": 4.4131483601573285e-05, "loss": 56.0841, "step": 418 }, { "epoch": 1.4338404056578598, "grad_norm": 4.208652973175049, "learning_rate": 4.4072430294890174e-05, "loss": 57.5786, "step": 420 }, { "epoch": 1.4406725380304244, "grad_norm": 5.773376941680908, "learning_rate": 4.4013121288211307e-05, "loss": 55.8851, "step": 422 }, { "epoch": 1.447504670402989, "grad_norm": 5.354812145233154, "learning_rate": 4.3953557376679856e-05, "loss": 55.1571, "step": 424 }, { "epoch": 1.4543368027755537, "grad_norm": 4.6360039710998535, "learning_rate": 4.389373935885646e-05, "loss": 54.0095, "step": 426 }, { "epoch": 1.4611689351481185, "grad_norm": 7.125521183013916, "learning_rate": 4.383366803670849e-05, "loss": 56.645, "step": 428 }, { "epoch": 1.4680010675206832, "grad_norm": 6.071737766265869, "learning_rate": 4.377334421559932e-05, "loss": 55.3209, "step": 430 }, { "epoch": 1.474833199893248, "grad_norm": 4.569766998291016, "learning_rate": 4.371276870427753e-05, "loss": 54.6604, "step": 432 }, { "epoch": 1.4816653322658127, "grad_norm": 5.426764965057373, "learning_rate": 4.365194231486604e-05, "loss": 56.4116, "step": 434 }, { "epoch": 1.4884974646383773, "grad_norm": 5.6092023849487305, "learning_rate": 4.359086586285127e-05, "loss": 56.0268, "step": 436 }, { "epoch": 1.4953295970109421, "grad_norm": 6.140939712524414, "learning_rate": 4.3529540167072126e-05, "loss": 54.886, "step": 438 }, { "epoch": 1.5021617293835068, "grad_norm": 4.043739318847656, "learning_rate": 4.346796604970912e-05, "loss": 56.6431, "step": 440 }, { "epoch": 1.5089938617560716, "grad_norm": 3.8898212909698486, "learning_rate": 4.340614433627328e-05, "loss": 55.6492, "step": 442 }, { "epoch": 1.5158259941286363, "grad_norm": 6.158950328826904, "learning_rate": 4.3344075855595104e-05, "loss": 55.6869, "step": 444 }, { "epoch": 1.522658126501201, "grad_norm": 3.874180316925049, "learning_rate": 4.328176143981343e-05, "loss": 53.7981, "step": 446 }, { "epoch": 1.5294902588737656, "grad_norm": 4.068581581115723, "learning_rate": 4.321920192436433e-05, "loss": 54.6618, "step": 448 }, { "epoch": 1.5363223912463304, "grad_norm": 4.552149295806885, "learning_rate": 4.315639814796983e-05, "loss": 55.1642, "step": 450 }, { "epoch": 1.5363223912463304, "eval_loss": 0.8704175353050232, "eval_runtime": 119.5049, "eval_samples_per_second": 33.011, "eval_steps_per_second": 8.259, "step": 450 }, { "epoch": 1.5431545236188953, "grad_norm": 4.1831374168396, "learning_rate": 4.309335095262676e-05, "loss": 53.2926, "step": 452 }, { "epoch": 1.54998665599146, "grad_norm": 4.456052780151367, "learning_rate": 4.303006118359537e-05, "loss": 53.6038, "step": 454 }, { "epoch": 1.5568187883640245, "grad_norm": 17.7099609375, "learning_rate": 4.296652968938807e-05, "loss": 54.9325, "step": 456 }, { "epoch": 1.5636509207365892, "grad_norm": 8.005233764648438, "learning_rate": 4.2902757321758016e-05, "loss": 53.7884, "step": 458 }, { "epoch": 1.570483053109154, "grad_norm": 5.034004211425781, "learning_rate": 4.283874493568772e-05, "loss": 53.2575, "step": 460 }, { "epoch": 1.5773151854817187, "grad_norm": 4.005930423736572, "learning_rate": 4.2774493389377545e-05, "loss": 55.4554, "step": 462 }, { "epoch": 1.5841473178542835, "grad_norm": 5.812296390533447, "learning_rate": 4.271000354423426e-05, "loss": 56.7008, "step": 464 }, { "epoch": 1.5909794502268482, "grad_norm": 6.425695896148682, "learning_rate": 4.2645276264859394e-05, "loss": 56.8804, "step": 466 }, { "epoch": 1.5978115825994128, "grad_norm": 4.44102144241333, "learning_rate": 4.258031241903778e-05, "loss": 54.2011, "step": 468 }, { "epoch": 1.6046437149719774, "grad_norm": 4.444553852081299, "learning_rate": 4.251511287772579e-05, "loss": 54.9826, "step": 470 }, { "epoch": 1.6114758473445423, "grad_norm": 3.8157808780670166, "learning_rate": 4.2449678515039747e-05, "loss": 55.2601, "step": 472 }, { "epoch": 1.6183079797171072, "grad_norm": 6.47904634475708, "learning_rate": 4.238401020824416e-05, "loss": 54.5978, "step": 474 }, { "epoch": 1.6251401120896718, "grad_norm": 5.010526180267334, "learning_rate": 4.231810883773999e-05, "loss": 56.0995, "step": 476 }, { "epoch": 1.6319722444622364, "grad_norm": 5.843505382537842, "learning_rate": 4.2251975287052804e-05, "loss": 54.0241, "step": 478 }, { "epoch": 1.638804376834801, "grad_norm": 4.549996852874756, "learning_rate": 4.218561044282099e-05, "loss": 56.3071, "step": 480 }, { "epoch": 1.645636509207366, "grad_norm": 4.20985221862793, "learning_rate": 4.211901519478382e-05, "loss": 54.3977, "step": 482 }, { "epoch": 1.6524686415799306, "grad_norm": 5.491010665893555, "learning_rate": 4.2052190435769554e-05, "loss": 53.1375, "step": 484 }, { "epoch": 1.6593007739524954, "grad_norm": 4.417302131652832, "learning_rate": 4.198513706168345e-05, "loss": 53.959, "step": 486 }, { "epoch": 1.66613290632506, "grad_norm": 5.39029598236084, "learning_rate": 4.191785597149577e-05, "loss": 54.5638, "step": 488 }, { "epoch": 1.6729650386976247, "grad_norm": 4.233526229858398, "learning_rate": 4.1850348067229696e-05, "loss": 54.6384, "step": 490 }, { "epoch": 1.6797971710701893, "grad_norm": 6.301634311676025, "learning_rate": 4.178261425394926e-05, "loss": 55.1738, "step": 492 }, { "epoch": 1.6866293034427542, "grad_norm": 5.9507246017456055, "learning_rate": 4.171465543974723e-05, "loss": 54.7009, "step": 494 }, { "epoch": 1.693461435815319, "grad_norm": 5.033243656158447, "learning_rate": 4.1646472535732895e-05, "loss": 54.3154, "step": 496 }, { "epoch": 1.7002935681878837, "grad_norm": 4.675721168518066, "learning_rate": 4.157806645601988e-05, "loss": 54.1507, "step": 498 }, { "epoch": 1.7071257005604483, "grad_norm": 3.5945537090301514, "learning_rate": 4.1509438117713866e-05, "loss": 52.2103, "step": 500 }, { "epoch": 1.7071257005604483, "eval_loss": 0.8516557216644287, "eval_runtime": 119.4754, "eval_samples_per_second": 33.019, "eval_steps_per_second": 8.261, "step": 500 }, { "epoch": 1.713957832933013, "grad_norm": 4.187085151672363, "learning_rate": 4.144058844090032e-05, "loss": 54.1474, "step": 502 }, { "epoch": 1.7207899653055778, "grad_norm": 3.818648099899292, "learning_rate": 4.137151834863213e-05, "loss": 55.5711, "step": 504 }, { "epoch": 1.7276220976781427, "grad_norm": 5.919620513916016, "learning_rate": 4.130222876691726e-05, "loss": 54.3803, "step": 506 }, { "epoch": 1.7344542300507073, "grad_norm": 5.772305011749268, "learning_rate": 4.123272062470633e-05, "loss": 53.9454, "step": 508 }, { "epoch": 1.741286362423272, "grad_norm": 4.569563865661621, "learning_rate": 4.116299485388014e-05, "loss": 53.5009, "step": 510 }, { "epoch": 1.7481184947958366, "grad_norm": 4.183293342590332, "learning_rate": 4.109305238923718e-05, "loss": 52.9927, "step": 512 }, { "epoch": 1.7549506271684012, "grad_norm": 4.4316301345825195, "learning_rate": 4.102289416848114e-05, "loss": 54.5023, "step": 514 }, { "epoch": 1.761782759540966, "grad_norm": 14.234251976013184, "learning_rate": 4.095252113220827e-05, "loss": 53.1473, "step": 516 }, { "epoch": 1.768614891913531, "grad_norm": 4.889795780181885, "learning_rate": 4.088193422389484e-05, "loss": 53.7265, "step": 518 }, { "epoch": 1.7754470242860956, "grad_norm": 3.02785325050354, "learning_rate": 4.0811134389884433e-05, "loss": 52.5917, "step": 520 }, { "epoch": 1.7822791566586602, "grad_norm": 5.794788360595703, "learning_rate": 4.0740122579375286e-05, "loss": 55.4619, "step": 522 }, { "epoch": 1.7891112890312248, "grad_norm": 4.442338466644287, "learning_rate": 4.066889974440757e-05, "loss": 53.7709, "step": 524 }, { "epoch": 1.7959434214037897, "grad_norm": 4.7714715003967285, "learning_rate": 4.0597466839850595e-05, "loss": 54.16, "step": 526 }, { "epoch": 1.8027755537763546, "grad_norm": 4.7263569831848145, "learning_rate": 4.0525824823390045e-05, "loss": 55.9749, "step": 528 }, { "epoch": 1.8096076861489192, "grad_norm": 4.258271217346191, "learning_rate": 4.045397465551513e-05, "loss": 52.5445, "step": 530 }, { "epoch": 1.8164398185214838, "grad_norm": 4.56829309463501, "learning_rate": 4.038191729950569e-05, "loss": 53.8703, "step": 532 }, { "epoch": 1.8232719508940485, "grad_norm": 8.888167381286621, "learning_rate": 4.030965372141927e-05, "loss": 52.7209, "step": 534 }, { "epoch": 1.8301040832666133, "grad_norm": 4.5087175369262695, "learning_rate": 4.0237184890078245e-05, "loss": 54.591, "step": 536 }, { "epoch": 1.836936215639178, "grad_norm": 4.460638523101807, "learning_rate": 4.0164511777056725e-05, "loss": 54.8662, "step": 538 }, { "epoch": 1.8437683480117428, "grad_norm": 3.5958664417266846, "learning_rate": 4.009163535666761e-05, "loss": 53.423, "step": 540 }, { "epoch": 1.8506004803843075, "grad_norm": 4.3935418128967285, "learning_rate": 4.001855660594948e-05, "loss": 53.9048, "step": 542 }, { "epoch": 1.857432612756872, "grad_norm": 5.473939895629883, "learning_rate": 3.994527650465352e-05, "loss": 52.9295, "step": 544 }, { "epoch": 1.8642647451294367, "grad_norm": 4.8625922203063965, "learning_rate": 3.98717960352304e-05, "loss": 51.8002, "step": 546 }, { "epoch": 1.8710968775020016, "grad_norm": 4.244052886962891, "learning_rate": 3.979811618281706e-05, "loss": 53.6904, "step": 548 }, { "epoch": 1.8779290098745665, "grad_norm": 4.050732612609863, "learning_rate": 3.972423793522352e-05, "loss": 54.7441, "step": 550 }, { "epoch": 1.8779290098745665, "eval_loss": 0.8419561982154846, "eval_runtime": 119.6757, "eval_samples_per_second": 32.964, "eval_steps_per_second": 8.247, "step": 550 }, { "epoch": 1.884761142247131, "grad_norm": 5.255309104919434, "learning_rate": 3.9650162282919655e-05, "loss": 53.6842, "step": 552 }, { "epoch": 1.8915932746196957, "grad_norm": 5.483623504638672, "learning_rate": 3.957589021902191e-05, "loss": 54.0004, "step": 554 }, { "epoch": 1.8984254069922604, "grad_norm": 4.224212169647217, "learning_rate": 3.9501422739279956e-05, "loss": 51.7289, "step": 556 }, { "epoch": 1.9052575393648252, "grad_norm": 5.061962127685547, "learning_rate": 3.942676084206338e-05, "loss": 53.4457, "step": 558 }, { "epoch": 1.9120896717373899, "grad_norm": 3.8694398403167725, "learning_rate": 3.9351905528348285e-05, "loss": 51.8595, "step": 560 }, { "epoch": 1.9189218041099547, "grad_norm": 4.149620056152344, "learning_rate": 3.927685780170385e-05, "loss": 51.8196, "step": 562 }, { "epoch": 1.9257539364825194, "grad_norm": 6.877647399902344, "learning_rate": 3.920161866827889e-05, "loss": 52.7279, "step": 564 }, { "epoch": 1.932586068855084, "grad_norm": 4.069815635681152, "learning_rate": 3.9126189136788416e-05, "loss": 51.1502, "step": 566 }, { "epoch": 1.9394182012276486, "grad_norm": 6.629972457885742, "learning_rate": 3.90505702185e-05, "loss": 52.6793, "step": 568 }, { "epoch": 1.9462503336002135, "grad_norm": 4.475677013397217, "learning_rate": 3.897476292722034e-05, "loss": 51.4329, "step": 570 }, { "epoch": 1.9530824659727783, "grad_norm": 5.370522499084473, "learning_rate": 3.889876827928156e-05, "loss": 53.1101, "step": 572 }, { "epoch": 1.959914598345343, "grad_norm": 5.481414794921875, "learning_rate": 3.882258729352768e-05, "loss": 53.3684, "step": 574 }, { "epoch": 1.9667467307179076, "grad_norm": 6.393594741821289, "learning_rate": 3.874622099130087e-05, "loss": 52.7341, "step": 576 }, { "epoch": 1.9735788630904723, "grad_norm": 3.9178807735443115, "learning_rate": 3.866967039642784e-05, "loss": 51.5249, "step": 578 }, { "epoch": 1.9804109954630371, "grad_norm": 9.721770286560059, "learning_rate": 3.859293653520604e-05, "loss": 51.2705, "step": 580 }, { "epoch": 1.987243127835602, "grad_norm": 4.619483470916748, "learning_rate": 3.851602043638994e-05, "loss": 51.7596, "step": 582 }, { "epoch": 1.9940752602081666, "grad_norm": 4.899592399597168, "learning_rate": 3.843892313117724e-05, "loss": 54.7586, "step": 584 }, { "epoch": 2.0, "grad_norm": 3.8423385620117188, "learning_rate": 3.8361645653195026e-05, "loss": 44.9497, "step": 586 }, { "epoch": 2.0068321323725646, "grad_norm": 4.93556022644043, "learning_rate": 3.8284189038485936e-05, "loss": 53.1383, "step": 588 }, { "epoch": 2.0136642647451293, "grad_norm": 6.575899124145508, "learning_rate": 3.8206554325494225e-05, "loss": 52.1373, "step": 590 }, { "epoch": 2.0204963971176944, "grad_norm": 3.5134201049804688, "learning_rate": 3.812874255505191e-05, "loss": 50.8711, "step": 592 }, { "epoch": 2.027328529490259, "grad_norm": 4.761475086212158, "learning_rate": 3.805075477036476e-05, "loss": 52.0756, "step": 594 }, { "epoch": 2.0341606618628236, "grad_norm": 3.7381017208099365, "learning_rate": 3.797259201699833e-05, "loss": 51.0594, "step": 596 }, { "epoch": 2.0409927942353883, "grad_norm": 5.102145671844482, "learning_rate": 3.789425534286394e-05, "loss": 52.1454, "step": 598 }, { "epoch": 2.047824926607953, "grad_norm": 4.762547969818115, "learning_rate": 3.781574579820464e-05, "loss": 50.3373, "step": 600 }, { "epoch": 2.047824926607953, "eval_loss": 0.8283991813659668, "eval_runtime": 119.5704, "eval_samples_per_second": 32.993, "eval_steps_per_second": 8.255, "step": 600 }, { "epoch": 2.0546570589805175, "grad_norm": 4.646745681762695, "learning_rate": 3.773706443558111e-05, "loss": 51.0792, "step": 602 }, { "epoch": 2.0614891913530826, "grad_norm": 5.648324012756348, "learning_rate": 3.765821230985758e-05, "loss": 50.6017, "step": 604 }, { "epoch": 2.0683213237256473, "grad_norm": 4.703359603881836, "learning_rate": 3.75791904781876e-05, "loss": 52.4212, "step": 606 }, { "epoch": 2.075153456098212, "grad_norm": 4.082385540008545, "learning_rate": 3.7500000000000003e-05, "loss": 51.9666, "step": 608 }, { "epoch": 2.0819855884707765, "grad_norm": 4.6461687088012695, "learning_rate": 3.74206419369846e-05, "loss": 51.6205, "step": 610 }, { "epoch": 2.088817720843341, "grad_norm": 3.9972918033599854, "learning_rate": 3.7341117353077966e-05, "loss": 52.6521, "step": 612 }, { "epoch": 2.0956498532159062, "grad_norm": 5.636791229248047, "learning_rate": 3.726142731444921e-05, "loss": 52.6811, "step": 614 }, { "epoch": 2.102481985588471, "grad_norm": 6.055325508117676, "learning_rate": 3.718157288948563e-05, "loss": 51.2952, "step": 616 }, { "epoch": 2.1093141179610355, "grad_norm": 5.317610740661621, "learning_rate": 3.710155514877844e-05, "loss": 52.4443, "step": 618 }, { "epoch": 2.1161462503336, "grad_norm": 4.979522705078125, "learning_rate": 3.702137516510838e-05, "loss": 51.3593, "step": 620 }, { "epoch": 2.122978382706165, "grad_norm": 7.410902500152588, "learning_rate": 3.694103401343136e-05, "loss": 51.5919, "step": 622 }, { "epoch": 2.12981051507873, "grad_norm": 4.962103366851807, "learning_rate": 3.686053277086401e-05, "loss": 51.272, "step": 624 }, { "epoch": 2.1366426474512945, "grad_norm": 4.0044426918029785, "learning_rate": 3.6779872516669295e-05, "loss": 51.6362, "step": 626 }, { "epoch": 2.143474779823859, "grad_norm": 5.016703128814697, "learning_rate": 3.669905433224199e-05, "loss": 51.7369, "step": 628 }, { "epoch": 2.150306912196424, "grad_norm": 4.700343132019043, "learning_rate": 3.6618079301094216e-05, "loss": 50.9454, "step": 630 }, { "epoch": 2.1571390445689884, "grad_norm": 8.11246395111084, "learning_rate": 3.653694850884091e-05, "loss": 50.4605, "step": 632 }, { "epoch": 2.163971176941553, "grad_norm": 3.8724536895751953, "learning_rate": 3.645566304318526e-05, "loss": 52.4849, "step": 634 }, { "epoch": 2.170803309314118, "grad_norm": 3.699873208999634, "learning_rate": 3.637422399390413e-05, "loss": 49.8017, "step": 636 }, { "epoch": 2.1776354416866828, "grad_norm": 4.757104873657227, "learning_rate": 3.6292632452833436e-05, "loss": 52.0966, "step": 638 }, { "epoch": 2.1844675740592474, "grad_norm": 5.273576736450195, "learning_rate": 3.621088951385353e-05, "loss": 49.5201, "step": 640 }, { "epoch": 2.191299706431812, "grad_norm": 4.152122497558594, "learning_rate": 3.612899627287452e-05, "loss": 51.121, "step": 642 }, { "epoch": 2.1981318388043767, "grad_norm": 4.448339939117432, "learning_rate": 3.604695382782159e-05, "loss": 51.5833, "step": 644 }, { "epoch": 2.2049639711769418, "grad_norm": 3.272676706314087, "learning_rate": 3.596476327862024e-05, "loss": 50.4036, "step": 646 }, { "epoch": 2.2117961035495064, "grad_norm": 4.293691158294678, "learning_rate": 3.588242572718162e-05, "loss": 50.4138, "step": 648 }, { "epoch": 2.218628235922071, "grad_norm": 6.384798049926758, "learning_rate": 3.579994227738767e-05, "loss": 49.0042, "step": 650 }, { "epoch": 2.218628235922071, "eval_loss": 0.8110712170600891, "eval_runtime": 119.0744, "eval_samples_per_second": 33.131, "eval_steps_per_second": 8.289, "step": 650 }, { "epoch": 2.2254603682946357, "grad_norm": 4.501573085784912, "learning_rate": 3.5717314035076355e-05, "loss": 49.7713, "step": 652 }, { "epoch": 2.2322925006672003, "grad_norm": 4.808114051818848, "learning_rate": 3.5634542108026876e-05, "loss": 50.6265, "step": 654 }, { "epoch": 2.239124633039765, "grad_norm": 5.616351127624512, "learning_rate": 3.5551627605944745e-05, "loss": 52.1332, "step": 656 }, { "epoch": 2.24595676541233, "grad_norm": 7.0716071128845215, "learning_rate": 3.5468571640446994e-05, "loss": 50.7825, "step": 658 }, { "epoch": 2.2527888977848947, "grad_norm": 4.64641809463501, "learning_rate": 3.5385375325047166e-05, "loss": 50.3092, "step": 660 }, { "epoch": 2.2596210301574593, "grad_norm": 4.058784008026123, "learning_rate": 3.5302039775140486e-05, "loss": 51.7827, "step": 662 }, { "epoch": 2.266453162530024, "grad_norm": 4.011864185333252, "learning_rate": 3.521856610798887e-05, "loss": 51.4194, "step": 664 }, { "epoch": 2.2732852949025886, "grad_norm": 3.89857816696167, "learning_rate": 3.513495544270592e-05, "loss": 50.7032, "step": 666 }, { "epoch": 2.2801174272751537, "grad_norm": 4.966712951660156, "learning_rate": 3.505120890024195e-05, "loss": 49.925, "step": 668 }, { "epoch": 2.2869495596477183, "grad_norm": 4.181141376495361, "learning_rate": 3.496732760336895e-05, "loss": 49.5112, "step": 670 }, { "epoch": 2.293781692020283, "grad_norm": 4.761594772338867, "learning_rate": 3.4883312676665536e-05, "loss": 49.6545, "step": 672 }, { "epoch": 2.3006138243928476, "grad_norm": 3.97501802444458, "learning_rate": 3.479916524650188e-05, "loss": 51.1862, "step": 674 }, { "epoch": 2.307445956765412, "grad_norm": 5.200672149658203, "learning_rate": 3.4714886441024574e-05, "loss": 49.9163, "step": 676 }, { "epoch": 2.314278089137977, "grad_norm": 4.147047519683838, "learning_rate": 3.4630477390141556e-05, "loss": 48.6138, "step": 678 }, { "epoch": 2.321110221510542, "grad_norm": 4.9791693687438965, "learning_rate": 3.4545939225506934e-05, "loss": 51.4538, "step": 680 }, { "epoch": 2.3279423538831066, "grad_norm": 4.929348945617676, "learning_rate": 3.4461273080505793e-05, "loss": 51.2735, "step": 682 }, { "epoch": 2.334774486255671, "grad_norm": 4.98499059677124, "learning_rate": 3.437648009023905e-05, "loss": 48.5889, "step": 684 }, { "epoch": 2.341606618628236, "grad_norm": 4.354183673858643, "learning_rate": 3.4291561391508185e-05, "loss": 51.7768, "step": 686 }, { "epoch": 2.3484387510008005, "grad_norm": 3.482697010040283, "learning_rate": 3.420651812280006e-05, "loss": 48.9966, "step": 688 }, { "epoch": 2.3552708833733655, "grad_norm": 4.613458156585693, "learning_rate": 3.4121351424271594e-05, "loss": 50.8534, "step": 690 }, { "epoch": 2.36210301574593, "grad_norm": 3.93235182762146, "learning_rate": 3.4036062437734484e-05, "loss": 50.9164, "step": 692 }, { "epoch": 2.368935148118495, "grad_norm": 5.348623275756836, "learning_rate": 3.395065230663996e-05, "loss": 49.6679, "step": 694 }, { "epoch": 2.3757672804910595, "grad_norm": 5.050134181976318, "learning_rate": 3.386512217606339e-05, "loss": 48.0534, "step": 696 }, { "epoch": 2.382599412863624, "grad_norm": 3.7587573528289795, "learning_rate": 3.3779473192688954e-05, "loss": 50.3013, "step": 698 }, { "epoch": 2.3894315452361887, "grad_norm": 5.177303314208984, "learning_rate": 3.369370650479425e-05, "loss": 48.8704, "step": 700 }, { "epoch": 2.3894315452361887, "eval_loss": 0.7940448522567749, "eval_runtime": 119.8708, "eval_samples_per_second": 32.91, "eval_steps_per_second": 8.234, "step": 700 }, { "epoch": 2.396263677608754, "grad_norm": 4.268886089324951, "learning_rate": 3.360782326223493e-05, "loss": 50.0788, "step": 702 }, { "epoch": 2.4030958099813184, "grad_norm": 4.847851276397705, "learning_rate": 3.3521824616429285e-05, "loss": 50.5298, "step": 704 }, { "epoch": 2.409927942353883, "grad_norm": 4.221863746643066, "learning_rate": 3.3435711720342764e-05, "loss": 51.0571, "step": 706 }, { "epoch": 2.4167600747264477, "grad_norm": 5.5122528076171875, "learning_rate": 3.3349485728472535e-05, "loss": 48.3266, "step": 708 }, { "epoch": 2.4235922070990124, "grad_norm": 3.7766902446746826, "learning_rate": 3.326314779683207e-05, "loss": 49.9334, "step": 710 }, { "epoch": 2.4304243394715774, "grad_norm": 4.093820571899414, "learning_rate": 3.3176699082935545e-05, "loss": 48.4746, "step": 712 }, { "epoch": 2.437256471844142, "grad_norm": 4.116121292114258, "learning_rate": 3.3090140745782396e-05, "loss": 48.5131, "step": 714 }, { "epoch": 2.4440886042167067, "grad_norm": 5.181516647338867, "learning_rate": 3.300347394584172e-05, "loss": 50.4981, "step": 716 }, { "epoch": 2.4509207365892713, "grad_norm": 4.464053630828857, "learning_rate": 3.2916699845036816e-05, "loss": 50.2301, "step": 718 }, { "epoch": 2.457752868961836, "grad_norm": 4.229206562042236, "learning_rate": 3.282981960672948e-05, "loss": 50.1858, "step": 720 }, { "epoch": 2.4645850013344006, "grad_norm": 3.8356049060821533, "learning_rate": 3.2742834395704486e-05, "loss": 48.9147, "step": 722 }, { "epoch": 2.4714171337069657, "grad_norm": 3.9584670066833496, "learning_rate": 3.265574537815398e-05, "loss": 48.6574, "step": 724 }, { "epoch": 2.4782492660795303, "grad_norm": 4.802350997924805, "learning_rate": 3.25685537216618e-05, "loss": 48.9724, "step": 726 }, { "epoch": 2.485081398452095, "grad_norm": 4.078526020050049, "learning_rate": 3.248126059518785e-05, "loss": 47.7639, "step": 728 }, { "epoch": 2.4919135308246596, "grad_norm": 3.8187856674194336, "learning_rate": 3.2393867169052385e-05, "loss": 48.2195, "step": 730 }, { "epoch": 2.4987456631972247, "grad_norm": 5.273796081542969, "learning_rate": 3.230637461492043e-05, "loss": 49.7512, "step": 732 }, { "epoch": 2.5055777955697893, "grad_norm": 4.126491069793701, "learning_rate": 3.221878410578593e-05, "loss": 49.0844, "step": 734 }, { "epoch": 2.512409927942354, "grad_norm": 4.665433406829834, "learning_rate": 3.213109681595612e-05, "loss": 48.7829, "step": 736 }, { "epoch": 2.5192420603149186, "grad_norm": 4.897470951080322, "learning_rate": 3.2043313921035743e-05, "loss": 49.5252, "step": 738 }, { "epoch": 2.5260741926874832, "grad_norm": 5.257498264312744, "learning_rate": 3.195543659791132e-05, "loss": 50.4767, "step": 740 }, { "epoch": 2.532906325060048, "grad_norm": 3.754957914352417, "learning_rate": 3.186746602473533e-05, "loss": 49.4055, "step": 742 }, { "epoch": 2.5397384574326125, "grad_norm": 3.994774341583252, "learning_rate": 3.177940338091043e-05, "loss": 49.3039, "step": 744 }, { "epoch": 2.5465705898051776, "grad_norm": 4.923650741577148, "learning_rate": 3.169124984707367e-05, "loss": 48.6568, "step": 746 }, { "epoch": 2.5534027221777422, "grad_norm": 6.377063274383545, "learning_rate": 3.160300660508064e-05, "loss": 48.7655, "step": 748 }, { "epoch": 2.560234854550307, "grad_norm": 3.7124524116516113, "learning_rate": 3.151467483798961e-05, "loss": 48.0997, "step": 750 }, { "epoch": 2.560234854550307, "eval_loss": 0.7798339128494263, "eval_runtime": 119.2173, "eval_samples_per_second": 33.091, "eval_steps_per_second": 8.279, "step": 750 }, { "epoch": 2.5670669869228715, "grad_norm": 4.752464294433594, "learning_rate": 3.14262557300457e-05, "loss": 48.422, "step": 752 }, { "epoch": 2.5738991192954366, "grad_norm": 4.635769844055176, "learning_rate": 3.1337750466665e-05, "loss": 48.9177, "step": 754 }, { "epoch": 2.580731251668001, "grad_norm": 4.357526779174805, "learning_rate": 3.124916023441865e-05, "loss": 49.4801, "step": 756 }, { "epoch": 2.587563384040566, "grad_norm": 16.189651489257812, "learning_rate": 3.116048622101694e-05, "loss": 49.275, "step": 758 }, { "epoch": 2.5943955164131305, "grad_norm": 3.983285903930664, "learning_rate": 3.107172961529343e-05, "loss": 47.968, "step": 760 }, { "epoch": 2.601227648785695, "grad_norm": 4.357701301574707, "learning_rate": 3.098289160718895e-05, "loss": 47.8592, "step": 762 }, { "epoch": 2.6080597811582598, "grad_norm": 3.9686052799224854, "learning_rate": 3.0893973387735687e-05, "loss": 49.5191, "step": 764 }, { "epoch": 2.6148919135308244, "grad_norm": 3.9062581062316895, "learning_rate": 3.0804976149041195e-05, "loss": 48.5485, "step": 766 }, { "epoch": 2.6217240459033895, "grad_norm": 4.7290143966674805, "learning_rate": 3.071590108427244e-05, "loss": 49.2073, "step": 768 }, { "epoch": 2.628556178275954, "grad_norm": 4.57703161239624, "learning_rate": 3.062674938763976e-05, "loss": 49.7624, "step": 770 }, { "epoch": 2.6353883106485188, "grad_norm": 4.4061737060546875, "learning_rate": 3.0537522254380905e-05, "loss": 49.0566, "step": 772 }, { "epoch": 2.6422204430210834, "grad_norm": 4.166697978973389, "learning_rate": 3.044822088074496e-05, "loss": 49.3193, "step": 774 }, { "epoch": 2.6490525753936485, "grad_norm": 3.5513172149658203, "learning_rate": 3.0358846463976372e-05, "loss": 48.9675, "step": 776 }, { "epoch": 2.655884707766213, "grad_norm": 4.9701995849609375, "learning_rate": 3.026940020229882e-05, "loss": 49.6229, "step": 778 }, { "epoch": 2.6627168401387777, "grad_norm": 4.223094463348389, "learning_rate": 3.017988329489923e-05, "loss": 47.1613, "step": 780 }, { "epoch": 2.6695489725113424, "grad_norm": 4.849906921386719, "learning_rate": 3.0090296941911633e-05, "loss": 47.5764, "step": 782 }, { "epoch": 2.676381104883907, "grad_norm": 3.507953643798828, "learning_rate": 3.0000642344401113e-05, "loss": 47.1944, "step": 784 }, { "epoch": 2.6832132372564717, "grad_norm": 4.040694713592529, "learning_rate": 2.9910920704347696e-05, "loss": 48.6472, "step": 786 }, { "epoch": 2.6900453696290363, "grad_norm": 5.141117095947266, "learning_rate": 2.9821133224630226e-05, "loss": 47.177, "step": 788 }, { "epoch": 2.6968775020016014, "grad_norm": 4.463181018829346, "learning_rate": 2.9731281109010256e-05, "loss": 47.4283, "step": 790 }, { "epoch": 2.703709634374166, "grad_norm": 3.586456060409546, "learning_rate": 2.9641365562115887e-05, "loss": 48.9784, "step": 792 }, { "epoch": 2.7105417667467306, "grad_norm": 3.9780969619750977, "learning_rate": 2.9551387789425638e-05, "loss": 48.601, "step": 794 }, { "epoch": 2.7173738991192953, "grad_norm": 4.445759296417236, "learning_rate": 2.9461348997252265e-05, "loss": 49.9106, "step": 796 }, { "epoch": 2.7242060314918604, "grad_norm": 4.416858673095703, "learning_rate": 2.9371250392726614e-05, "loss": 48.3298, "step": 798 }, { "epoch": 2.731038163864425, "grad_norm": 4.36728572845459, "learning_rate": 2.9281093183781403e-05, "loss": 48.6063, "step": 800 }, { "epoch": 2.731038163864425, "eval_loss": 0.7699871063232422, "eval_runtime": 119.5951, "eval_samples_per_second": 32.986, "eval_steps_per_second": 8.253, "step": 800 }, { "epoch": 2.7378702962369896, "grad_norm": 5.540378570556641, "learning_rate": 2.919087857913508e-05, "loss": 49.4323, "step": 802 }, { "epoch": 2.7447024286095543, "grad_norm": 3.73681640625, "learning_rate": 2.9100607788275545e-05, "loss": 49.0439, "step": 804 }, { "epoch": 2.751534560982119, "grad_norm": 4.437684535980225, "learning_rate": 2.9010282021444008e-05, "loss": 48.8682, "step": 806 }, { "epoch": 2.7583666933546835, "grad_norm": 4.933871746063232, "learning_rate": 2.891990248961871e-05, "loss": 48.0791, "step": 808 }, { "epoch": 2.7651988257272486, "grad_norm": 4.351380825042725, "learning_rate": 2.8829470404498697e-05, "loss": 47.0584, "step": 810 }, { "epoch": 2.7720309580998133, "grad_norm": 4.953640937805176, "learning_rate": 2.8738986978487625e-05, "loss": 50.0531, "step": 812 }, { "epoch": 2.778863090472378, "grad_norm": 3.676950216293335, "learning_rate": 2.8648453424677434e-05, "loss": 46.9994, "step": 814 }, { "epoch": 2.7856952228449425, "grad_norm": 4.177380084991455, "learning_rate": 2.8557870956832132e-05, "loss": 48.3932, "step": 816 }, { "epoch": 2.7925273552175076, "grad_norm": 4.177119731903076, "learning_rate": 2.846724078937149e-05, "loss": 48.2385, "step": 818 }, { "epoch": 2.7993594875900722, "grad_norm": 4.261831283569336, "learning_rate": 2.8376564137354795e-05, "loss": 48.813, "step": 820 }, { "epoch": 2.806191619962637, "grad_norm": 3.7779037952423096, "learning_rate": 2.8285842216464543e-05, "loss": 48.801, "step": 822 }, { "epoch": 2.8130237523352015, "grad_norm": 5.378250598907471, "learning_rate": 2.8195076242990122e-05, "loss": 45.9584, "step": 824 }, { "epoch": 2.819855884707766, "grad_norm": 3.5369153022766113, "learning_rate": 2.8104267433811533e-05, "loss": 46.97, "step": 826 }, { "epoch": 2.826688017080331, "grad_norm": 3.493602991104126, "learning_rate": 2.8013417006383076e-05, "loss": 46.7352, "step": 828 }, { "epoch": 2.8335201494528954, "grad_norm": 5.41981840133667, "learning_rate": 2.7922526178717017e-05, "loss": 48.4586, "step": 830 }, { "epoch": 2.8403522818254605, "grad_norm": 4.6053948402404785, "learning_rate": 2.783159616936723e-05, "loss": 46.5008, "step": 832 }, { "epoch": 2.847184414198025, "grad_norm": 4.136333465576172, "learning_rate": 2.774062819741293e-05, "loss": 47.3448, "step": 834 }, { "epoch": 2.85401654657059, "grad_norm": 3.927877187728882, "learning_rate": 2.764962348244228e-05, "loss": 46.7369, "step": 836 }, { "epoch": 2.8608486789431544, "grad_norm": 4.283491611480713, "learning_rate": 2.7558583244536007e-05, "loss": 48.098, "step": 838 }, { "epoch": 2.8676808113157195, "grad_norm": 3.802030563354492, "learning_rate": 2.7467508704251137e-05, "loss": 48.2908, "step": 840 }, { "epoch": 2.874512943688284, "grad_norm": 5.212815761566162, "learning_rate": 2.7376401082604564e-05, "loss": 47.8921, "step": 842 }, { "epoch": 2.8813450760608488, "grad_norm": 4.39296293258667, "learning_rate": 2.7285261601056698e-05, "loss": 48.2491, "step": 844 }, { "epoch": 2.8881772084334134, "grad_norm": 5.428844928741455, "learning_rate": 2.7194091481495076e-05, "loss": 49.1209, "step": 846 }, { "epoch": 2.895009340805978, "grad_norm": 3.9836559295654297, "learning_rate": 2.7102891946217994e-05, "loss": 47.0515, "step": 848 }, { "epoch": 2.9018414731785427, "grad_norm": 3.1067824363708496, "learning_rate": 2.7011664217918154e-05, "loss": 46.0087, "step": 850 }, { "epoch": 2.9018414731785427, "eval_loss": 0.760260820388794, "eval_runtime": 119.6698, "eval_samples_per_second": 32.966, "eval_steps_per_second": 8.248, "step": 850 }, { "epoch": 2.9086736055511073, "grad_norm": 4.688024997711182, "learning_rate": 2.6920409519666174e-05, "loss": 47.0489, "step": 852 }, { "epoch": 2.9155057379236724, "grad_norm": 4.777935981750488, "learning_rate": 2.6829129074894304e-05, "loss": 48.1153, "step": 854 }, { "epoch": 2.922337870296237, "grad_norm": 4.912516117095947, "learning_rate": 2.6737824107379948e-05, "loss": 48.0798, "step": 856 }, { "epoch": 2.9291700026688017, "grad_norm": 4.066973686218262, "learning_rate": 2.6646495841229287e-05, "loss": 46.9194, "step": 858 }, { "epoch": 2.9360021350413663, "grad_norm": 4.499208927154541, "learning_rate": 2.655514550086086e-05, "loss": 48.3087, "step": 860 }, { "epoch": 2.9428342674139314, "grad_norm": 4.891952991485596, "learning_rate": 2.6463774310989154e-05, "loss": 46.8565, "step": 862 }, { "epoch": 2.949666399786496, "grad_norm": 3.8262720108032227, "learning_rate": 2.637238349660819e-05, "loss": 46.7596, "step": 864 }, { "epoch": 2.9564985321590607, "grad_norm": 5.6072492599487305, "learning_rate": 2.6280974282975063e-05, "loss": 45.254, "step": 866 }, { "epoch": 2.9633306645316253, "grad_norm": 3.9889800548553467, "learning_rate": 2.6189547895593562e-05, "loss": 46.754, "step": 868 }, { "epoch": 2.97016279690419, "grad_norm": 3.7260525226593018, "learning_rate": 2.6098105560197722e-05, "loss": 46.6516, "step": 870 }, { "epoch": 2.9769949292767546, "grad_norm": 4.090394973754883, "learning_rate": 2.600664850273538e-05, "loss": 47.2404, "step": 872 }, { "epoch": 2.983827061649319, "grad_norm": 3.6287267208099365, "learning_rate": 2.5915177949351765e-05, "loss": 46.3821, "step": 874 }, { "epoch": 2.9906591940218843, "grad_norm": 3.5229976177215576, "learning_rate": 2.582369512637302e-05, "loss": 46.8471, "step": 876 }, { "epoch": 2.997491326394449, "grad_norm": 3.532615900039673, "learning_rate": 2.5732201260289806e-05, "loss": 47.0364, "step": 878 }, { "epoch": 3.0034160661862823, "grad_norm": 3.482403039932251, "learning_rate": 2.564069757774082e-05, "loss": 40.3241, "step": 880 }, { "epoch": 3.010248198558847, "grad_norm": 3.94649600982666, "learning_rate": 2.554918530549637e-05, "loss": 46.7226, "step": 882 }, { "epoch": 3.0170803309314116, "grad_norm": 4.395301818847656, "learning_rate": 2.545766567044194e-05, "loss": 45.266, "step": 884 }, { "epoch": 3.0239124633039767, "grad_norm": 4.813998699188232, "learning_rate": 2.5366139899561696e-05, "loss": 46.8651, "step": 886 }, { "epoch": 3.0307445956765413, "grad_norm": 5.5799174308776855, "learning_rate": 2.527460921992209e-05, "loss": 46.5727, "step": 888 }, { "epoch": 3.037576728049106, "grad_norm": 6.693199634552002, "learning_rate": 2.518307485865538e-05, "loss": 47.987, "step": 890 }, { "epoch": 3.0444088604216706, "grad_norm": 6.33953332901001, "learning_rate": 2.509153804294318e-05, "loss": 45.7221, "step": 892 }, { "epoch": 3.051240992794235, "grad_norm": 4.887784957885742, "learning_rate": 2.5e-05, "loss": 44.5186, "step": 894 }, { "epoch": 3.0580731251668003, "grad_norm": 4.337290287017822, "learning_rate": 2.490846195705683e-05, "loss": 46.394, "step": 896 }, { "epoch": 3.064905257539365, "grad_norm": 3.7094030380249023, "learning_rate": 2.4816925141344623e-05, "loss": 45.122, "step": 898 }, { "epoch": 3.0717373899119296, "grad_norm": 3.71903920173645, "learning_rate": 2.4725390780077908e-05, "loss": 44.7121, "step": 900 }, { "epoch": 3.0717373899119296, "eval_loss": 0.7495905160903931, "eval_runtime": 119.7503, "eval_samples_per_second": 32.944, "eval_steps_per_second": 8.242, "step": 900 }, { "epoch": 3.078569522284494, "grad_norm": 4.690406799316406, "learning_rate": 2.4633860100438316e-05, "loss": 45.6299, "step": 902 }, { "epoch": 3.085401654657059, "grad_norm": 4.29756498336792, "learning_rate": 2.4542334329558077e-05, "loss": 48.2504, "step": 904 }, { "epoch": 3.092233787029624, "grad_norm": 5.62404727935791, "learning_rate": 2.4450814694503636e-05, "loss": 47.6091, "step": 906 }, { "epoch": 3.0990659194021886, "grad_norm": 3.726529836654663, "learning_rate": 2.435930242225919e-05, "loss": 46.4755, "step": 908 }, { "epoch": 3.105898051774753, "grad_norm": 6.04416036605835, "learning_rate": 2.4267798739710203e-05, "loss": 46.9715, "step": 910 }, { "epoch": 3.112730184147318, "grad_norm": 3.8375885486602783, "learning_rate": 2.4176304873626985e-05, "loss": 47.9794, "step": 912 }, { "epoch": 3.1195623165198825, "grad_norm": 3.296687602996826, "learning_rate": 2.4084822050648237e-05, "loss": 45.0776, "step": 914 }, { "epoch": 3.126394448892447, "grad_norm": 3.546963930130005, "learning_rate": 2.399335149726463e-05, "loss": 44.6584, "step": 916 }, { "epoch": 3.133226581265012, "grad_norm": 3.896601676940918, "learning_rate": 2.390189443980229e-05, "loss": 47.0284, "step": 918 }, { "epoch": 3.140058713637577, "grad_norm": 3.570570468902588, "learning_rate": 2.3810452104406444e-05, "loss": 46.4413, "step": 920 }, { "epoch": 3.1468908460101415, "grad_norm": 4.160488605499268, "learning_rate": 2.3719025717024946e-05, "loss": 47.1564, "step": 922 }, { "epoch": 3.153722978382706, "grad_norm": 5.714613914489746, "learning_rate": 2.3627616503391814e-05, "loss": 48.2275, "step": 924 }, { "epoch": 3.1605551107552707, "grad_norm": 4.362124919891357, "learning_rate": 2.3536225689010845e-05, "loss": 47.0592, "step": 926 }, { "epoch": 3.167387243127836, "grad_norm": 6.478647708892822, "learning_rate": 2.3444854499139142e-05, "loss": 47.4139, "step": 928 }, { "epoch": 3.1742193755004005, "grad_norm": 3.713979721069336, "learning_rate": 2.3353504158770722e-05, "loss": 47.7301, "step": 930 }, { "epoch": 3.181051507872965, "grad_norm": 3.875537872314453, "learning_rate": 2.3262175892620065e-05, "loss": 45.6112, "step": 932 }, { "epoch": 3.1878836402455297, "grad_norm": 5.328731536865234, "learning_rate": 2.3170870925105702e-05, "loss": 46.6125, "step": 934 }, { "epoch": 3.1947157726180944, "grad_norm": 5.152383327484131, "learning_rate": 2.307959048033383e-05, "loss": 45.6076, "step": 936 }, { "epoch": 3.201547904990659, "grad_norm": 4.689112186431885, "learning_rate": 2.2988335782081855e-05, "loss": 45.648, "step": 938 }, { "epoch": 3.208380037363224, "grad_norm": 3.3412325382232666, "learning_rate": 2.2897108053782e-05, "loss": 44.4993, "step": 940 }, { "epoch": 3.2152121697357887, "grad_norm": 11.583976745605469, "learning_rate": 2.280590851850493e-05, "loss": 46.3174, "step": 942 }, { "epoch": 3.2220443021083534, "grad_norm": 4.012174606323242, "learning_rate": 2.271473839894331e-05, "loss": 46.3054, "step": 944 }, { "epoch": 3.228876434480918, "grad_norm": 6.315187931060791, "learning_rate": 2.2623598917395438e-05, "loss": 44.3273, "step": 946 }, { "epoch": 3.2357085668534826, "grad_norm": 5.612927436828613, "learning_rate": 2.253249129574887e-05, "loss": 46.8669, "step": 948 }, { "epoch": 3.2425406992260477, "grad_norm": 3.7026705741882324, "learning_rate": 2.2441416755463995e-05, "loss": 46.4012, "step": 950 }, { "epoch": 3.2425406992260477, "eval_loss": 0.7383518218994141, "eval_runtime": 118.6959, "eval_samples_per_second": 33.236, "eval_steps_per_second": 8.315, "step": 950 }, { "epoch": 3.2493728315986123, "grad_norm": 4.251457214355469, "learning_rate": 2.2350376517557727e-05, "loss": 47.1319, "step": 952 }, { "epoch": 3.256204963971177, "grad_norm": 4.500071048736572, "learning_rate": 2.2259371802587068e-05, "loss": 47.0883, "step": 954 }, { "epoch": 3.2630370963437416, "grad_norm": 4.684493064880371, "learning_rate": 2.216840383063277e-05, "loss": 45.0587, "step": 956 }, { "epoch": 3.2698692287163063, "grad_norm": 3.853529453277588, "learning_rate": 2.2077473821282996e-05, "loss": 46.3262, "step": 958 }, { "epoch": 3.276701361088871, "grad_norm": 5.501523971557617, "learning_rate": 2.1986582993616926e-05, "loss": 44.8375, "step": 960 }, { "epoch": 3.283533493461436, "grad_norm": 15.540706634521484, "learning_rate": 2.1895732566188476e-05, "loss": 45.117, "step": 962 }, { "epoch": 3.2903656258340006, "grad_norm": 2.6855862140655518, "learning_rate": 2.1804923757009884e-05, "loss": 45.9567, "step": 964 }, { "epoch": 3.2971977582065652, "grad_norm": 4.529240131378174, "learning_rate": 2.1714157783535463e-05, "loss": 44.7532, "step": 966 }, { "epoch": 3.30402989057913, "grad_norm": 4.690282344818115, "learning_rate": 2.1623435862645204e-05, "loss": 45.8376, "step": 968 }, { "epoch": 3.3108620229516945, "grad_norm": 5.309507846832275, "learning_rate": 2.153275921062851e-05, "loss": 46.1757, "step": 970 }, { "epoch": 3.3176941553242596, "grad_norm": 4.278385639190674, "learning_rate": 2.1442129043167874e-05, "loss": 46.6388, "step": 972 }, { "epoch": 3.3245262876968242, "grad_norm": 4.2424516677856445, "learning_rate": 2.1351546575322572e-05, "loss": 45.1695, "step": 974 }, { "epoch": 3.331358420069389, "grad_norm": 3.695155143737793, "learning_rate": 2.126101302151238e-05, "loss": 45.9417, "step": 976 }, { "epoch": 3.3381905524419535, "grad_norm": 4.2003374099731445, "learning_rate": 2.1170529595501305e-05, "loss": 44.4002, "step": 978 }, { "epoch": 3.345022684814518, "grad_norm": 4.378734588623047, "learning_rate": 2.1080097510381298e-05, "loss": 45.4517, "step": 980 }, { "epoch": 3.351854817187083, "grad_norm": 3.96730637550354, "learning_rate": 2.098971797855599e-05, "loss": 43.9996, "step": 982 }, { "epoch": 3.358686949559648, "grad_norm": 3.6162188053131104, "learning_rate": 2.089939221172446e-05, "loss": 43.9178, "step": 984 }, { "epoch": 3.3655190819322125, "grad_norm": 4.3834099769592285, "learning_rate": 2.0809121420864923e-05, "loss": 46.2701, "step": 986 }, { "epoch": 3.372351214304777, "grad_norm": 4.271561145782471, "learning_rate": 2.07189068162186e-05, "loss": 45.7546, "step": 988 }, { "epoch": 3.3791833466773418, "grad_norm": 3.5791757106781006, "learning_rate": 2.0628749607273396e-05, "loss": 45.3079, "step": 990 }, { "epoch": 3.3860154790499064, "grad_norm": 4.5101318359375, "learning_rate": 2.0538651002747744e-05, "loss": 46.5476, "step": 992 }, { "epoch": 3.3928476114224715, "grad_norm": 5.944687366485596, "learning_rate": 2.0448612210574365e-05, "loss": 44.0355, "step": 994 }, { "epoch": 3.399679743795036, "grad_norm": 4.936254501342773, "learning_rate": 2.0358634437884112e-05, "loss": 46.0717, "step": 996 }, { "epoch": 3.4065118761676008, "grad_norm": 4.114757537841797, "learning_rate": 2.0268718890989753e-05, "loss": 44.5295, "step": 998 }, { "epoch": 3.4133440085401654, "grad_norm": 8.12585735321045, "learning_rate": 2.0178866775369777e-05, "loss": 45.0747, "step": 1000 }, { "epoch": 3.4133440085401654, "eval_loss": 0.7275528907775879, "eval_runtime": 119.5885, "eval_samples_per_second": 32.988, "eval_steps_per_second": 8.253, "step": 1000 }, { "epoch": 3.4304243394715774, "grad_norm": 4.9336113929748535, "learning_rate": 2.0089079295652306e-05, "loss": 45.5736, "step": 1002 }, { "epoch": 3.437256471844142, "grad_norm": 5.042412757873535, "learning_rate": 1.9999357655598893e-05, "loss": 45.6651, "step": 1004 }, { "epoch": 3.4440886042167067, "grad_norm": 3.9377660751342773, "learning_rate": 1.9909703058088376e-05, "loss": 44.5559, "step": 1006 }, { "epoch": 3.4509207365892713, "grad_norm": 4.054321765899658, "learning_rate": 1.9820116705100777e-05, "loss": 45.1868, "step": 1008 }, { "epoch": 3.457752868961836, "grad_norm": 4.860738277435303, "learning_rate": 1.9730599797701177e-05, "loss": 44.6737, "step": 1010 }, { "epoch": 3.4645850013344006, "grad_norm": 3.950925827026367, "learning_rate": 1.9641153536023644e-05, "loss": 43.7733, "step": 1012 }, { "epoch": 3.4714171337069657, "grad_norm": 3.831669569015503, "learning_rate": 1.9551779119255043e-05, "loss": 43.7403, "step": 1014 }, { "epoch": 3.4782492660795303, "grad_norm": 4.114947319030762, "learning_rate": 1.9462477745619108e-05, "loss": 45.5074, "step": 1016 }, { "epoch": 3.485081398452095, "grad_norm": 3.405243158340454, "learning_rate": 1.9373250612360246e-05, "loss": 46.4417, "step": 1018 }, { "epoch": 3.4919135308246596, "grad_norm": 4.80495023727417, "learning_rate": 1.928409891572757e-05, "loss": 44.9758, "step": 1020 }, { "epoch": 3.4987456631972247, "grad_norm": 4.239831447601318, "learning_rate": 1.919502385095881e-05, "loss": 44.6174, "step": 1022 }, { "epoch": 3.5055777955697893, "grad_norm": 4.724026203155518, "learning_rate": 1.9106026612264316e-05, "loss": 44.7325, "step": 1024 }, { "epoch": 3.512409927942354, "grad_norm": 3.4634554386138916, "learning_rate": 1.9017108392811065e-05, "loss": 43.7796, "step": 1026 }, { "epoch": 3.5192420603149186, "grad_norm": 4.715716361999512, "learning_rate": 1.8928270384706584e-05, "loss": 45.2777, "step": 1028 }, { "epoch": 3.5260741926874832, "grad_norm": 5.100541114807129, "learning_rate": 1.8839513778983066e-05, "loss": 46.4359, "step": 1030 }, { "epoch": 3.532906325060048, "grad_norm": 4.475189685821533, "learning_rate": 1.875083976558136e-05, "loss": 44.0298, "step": 1032 }, { "epoch": 3.5397384574326125, "grad_norm": 4.431650161743164, "learning_rate": 1.8662249533335003e-05, "loss": 44.2631, "step": 1034 }, { "epoch": 3.5465705898051776, "grad_norm": 4.561038970947266, "learning_rate": 1.8573744269954298e-05, "loss": 43.9968, "step": 1036 }, { "epoch": 3.5534027221777422, "grad_norm": 3.4181675910949707, "learning_rate": 1.848532516201039e-05, "loss": 43.372, "step": 1038 }, { "epoch": 3.560234854550307, "grad_norm": 4.05961799621582, "learning_rate": 1.8396993394919372e-05, "loss": 43.5887, "step": 1040 }, { "epoch": 3.5670669869228715, "grad_norm": 4.183586597442627, "learning_rate": 1.8308750152926337e-05, "loss": 43.1976, "step": 1042 }, { "epoch": 3.5738991192954366, "grad_norm": 4.6883745193481445, "learning_rate": 1.8220596619089576e-05, "loss": 44.4463, "step": 1044 }, { "epoch": 3.580731251668001, "grad_norm": 4.490588665008545, "learning_rate": 1.8132533975264682e-05, "loss": 44.3332, "step": 1046 }, { "epoch": 3.587563384040566, "grad_norm": 4.937854766845703, "learning_rate": 1.8044563402088684e-05, "loss": 45.1199, "step": 1048 }, { "epoch": 3.5943955164131305, "grad_norm": 3.8182907104492188, "learning_rate": 1.795668607896426e-05, "loss": 45.2035, "step": 1050 }, { "epoch": 3.5943955164131305, "eval_loss": 0.7135393619537354, "eval_runtime": 130.7813, "eval_samples_per_second": 30.165, "eval_steps_per_second": 7.547, "step": 1050 }, { "epoch": 3.601227648785695, "grad_norm": 3.3739826679229736, "learning_rate": 1.7868903184043887e-05, "loss": 43.5257, "step": 1052 }, { "epoch": 3.6080597811582598, "grad_norm": 3.8119192123413086, "learning_rate": 1.7781215894214078e-05, "loss": 44.9718, "step": 1054 }, { "epoch": 3.6148919135308244, "grad_norm": 3.6780483722686768, "learning_rate": 1.7693625385079577e-05, "loss": 44.496, "step": 1056 }, { "epoch": 3.6217240459033895, "grad_norm": 4.625596523284912, "learning_rate": 1.7606132830947614e-05, "loss": 43.6496, "step": 1058 }, { "epoch": 3.628556178275954, "grad_norm": 5.467988967895508, "learning_rate": 1.7518739404812155e-05, "loss": 45.3773, "step": 1060 }, { "epoch": 3.6353883106485188, "grad_norm": 3.7848103046417236, "learning_rate": 1.7431446278338197e-05, "loss": 43.6622, "step": 1062 }, { "epoch": 3.6422204430210834, "grad_norm": 6.2495222091674805, "learning_rate": 1.7344254621846016e-05, "loss": 44.7325, "step": 1064 }, { "epoch": 3.6490525753936485, "grad_norm": 4.541433811187744, "learning_rate": 1.7257165604295513e-05, "loss": 45.7111, "step": 1066 }, { "epoch": 3.655884707766213, "grad_norm": 3.6900789737701416, "learning_rate": 1.7170180393270532e-05, "loss": 46.2799, "step": 1068 }, { "epoch": 3.6627168401387777, "grad_norm": 3.999112129211426, "learning_rate": 1.7083300154963193e-05, "loss": 44.9348, "step": 1070 }, { "epoch": 3.6695489725113424, "grad_norm": 4.940526008605957, "learning_rate": 1.699652605415828e-05, "loss": 45.9208, "step": 1072 }, { "epoch": 3.676381104883907, "grad_norm": 3.8536486625671387, "learning_rate": 1.6909859254217613e-05, "loss": 45.3559, "step": 1074 }, { "epoch": 3.6832132372564717, "grad_norm": 5.941255569458008, "learning_rate": 1.682330091706446e-05, "loss": 44.2183, "step": 1076 }, { "epoch": 3.6900453696290363, "grad_norm": 4.6851091384887695, "learning_rate": 1.6736852203167935e-05, "loss": 45.0132, "step": 1078 }, { "epoch": 3.6968775020016014, "grad_norm": 6.338913917541504, "learning_rate": 1.6650514271527468e-05, "loss": 44.5087, "step": 1080 }, { "epoch": 3.703709634374166, "grad_norm": 6.134509086608887, "learning_rate": 1.6564288279657252e-05, "loss": 44.5929, "step": 1082 }, { "epoch": 3.7105417667467306, "grad_norm": 3.0185976028442383, "learning_rate": 1.647817538357072e-05, "loss": 44.4708, "step": 1084 }, { "epoch": 3.7173738991192953, "grad_norm": 4.479791641235352, "learning_rate": 1.639217673776507e-05, "loss": 44.4799, "step": 1086 }, { "epoch": 3.7242060314918604, "grad_norm": 3.9354395866394043, "learning_rate": 1.630629349520576e-05, "loss": 43.3393, "step": 1088 }, { "epoch": 3.731038163864425, "grad_norm": 4.530430316925049, "learning_rate": 1.622052680731105e-05, "loss": 43.1996, "step": 1090 }, { "epoch": 3.7378702962369896, "grad_norm": 4.594604015350342, "learning_rate": 1.613487782393661e-05, "loss": 43.6473, "step": 1092 }, { "epoch": 3.7447024286095543, "grad_norm": 4.38798713684082, "learning_rate": 1.604934769336004e-05, "loss": 43.1229, "step": 1094 }, { "epoch": 3.751534560982119, "grad_norm": 4.350236415863037, "learning_rate": 1.5963937562265525e-05, "loss": 44.7883, "step": 1096 }, { "epoch": 3.7583666933546835, "grad_norm": 4.064984321594238, "learning_rate": 1.587864857572842e-05, "loss": 44.1865, "step": 1098 }, { "epoch": 3.7651988257272486, "grad_norm": 4.607226848602295, "learning_rate": 1.5793481877199946e-05, "loss": 44.6176, "step": 1100 }, { "epoch": 3.7651988257272486, "eval_loss": 0.7090520858764648, "eval_runtime": 136.3013, "eval_samples_per_second": 28.943, "eval_steps_per_second": 7.241, "step": 1100 }, { "epoch": 3.7720309580998133, "grad_norm": 4.4557719230651855, "learning_rate": 1.5708438608491814e-05, "loss": 42.0453, "step": 1102 }, { "epoch": 3.778863090472378, "grad_norm": 5.199422359466553, "learning_rate": 1.5623519909760954e-05, "loss": 42.589, "step": 1104 }, { "epoch": 3.7856952228449425, "grad_norm": 3.632471799850464, "learning_rate": 1.5538726919494206e-05, "loss": 43.7924, "step": 1106 }, { "epoch": 3.7925273552175076, "grad_norm": 4.203450679779053, "learning_rate": 1.5454060774493068e-05, "loss": 45.02, "step": 1108 }, { "epoch": 3.7993594875900722, "grad_norm": 5.149316310882568, "learning_rate": 1.5369522609858446e-05, "loss": 44.2724, "step": 1110 }, { "epoch": 3.806191619962637, "grad_norm": 3.5306341648101807, "learning_rate": 1.528511355897543e-05, "loss": 44.2268, "step": 1112 }, { "epoch": 3.8130237523352015, "grad_norm": 4.296536445617676, "learning_rate": 1.5200834753498128e-05, "loss": 44.0479, "step": 1114 }, { "epoch": 3.819855884707766, "grad_norm": 2.969525098800659, "learning_rate": 1.5116687323334467e-05, "loss": 43.5543, "step": 1116 }, { "epoch": 3.826688017080331, "grad_norm": 4.044551849365234, "learning_rate": 1.5032672396631056e-05, "loss": 45.7925, "step": 1118 }, { "epoch": 3.8335201494528954, "grad_norm": 5.003629207611084, "learning_rate": 1.4948791099758052e-05, "loss": 44.2037, "step": 1120 }, { "epoch": 3.8403522818254605, "grad_norm": 3.4248318672180176, "learning_rate": 1.486504455729408e-05, "loss": 43.9243, "step": 1122 }, { "epoch": 3.847184414198025, "grad_norm": 4.228148937225342, "learning_rate": 1.4781433892011131e-05, "loss": 44.7779, "step": 1124 }, { "epoch": 3.85401654657059, "grad_norm": 4.345002174377441, "learning_rate": 1.4697960224859513e-05, "loss": 43.0617, "step": 1126 }, { "epoch": 3.8608486789431544, "grad_norm": 4.824610233306885, "learning_rate": 1.4614624674952842e-05, "loss": 43.2687, "step": 1128 }, { "epoch": 3.8676808113157195, "grad_norm": 5.528540134429932, "learning_rate": 1.4531428359553017e-05, "loss": 43.5145, "step": 1130 }, { "epoch": 3.874512943688284, "grad_norm": 3.7578537464141846, "learning_rate": 1.4448372394055249e-05, "loss": 43.2377, "step": 1132 }, { "epoch": 3.8813450760608488, "grad_norm": 3.191563367843628, "learning_rate": 1.436545789197313e-05, "loss": 43.493, "step": 1134 }, { "epoch": 3.8881772084334134, "grad_norm": 3.1072089672088623, "learning_rate": 1.4282685964923642e-05, "loss": 44.5567, "step": 1136 }, { "epoch": 3.895009340805978, "grad_norm": 4.651160717010498, "learning_rate": 1.4200057722612336e-05, "loss": 42.7739, "step": 1138 }, { "epoch": 3.9018414731785427, "grad_norm": 3.203441858291626, "learning_rate": 1.4117574272818388e-05, "loss": 43.1438, "step": 1140 }, { "epoch": 3.9086736055511073, "grad_norm": 4.5728349685668945, "learning_rate": 1.4035236721379757e-05, "loss": 44.305, "step": 1142 }, { "epoch": 3.9155057379236724, "grad_norm": 6.874294757843018, "learning_rate": 1.3953046172178414e-05, "loss": 42.8162, "step": 1144 }, { "epoch": 3.922337870296237, "grad_norm": 5.198761463165283, "learning_rate": 1.387100372712548e-05, "loss": 44.2441, "step": 1146 }, { "epoch": 3.9291700026688017, "grad_norm": 3.9007508754730225, "learning_rate": 1.378911048614647e-05, "loss": 43.0147, "step": 1148 }, { "epoch": 3.9360021350413663, "grad_norm": 3.7035725116729736, "learning_rate": 1.3707367547166569e-05, "loss": 45.0733, "step": 1150 }, { "epoch": 3.9360021350413663, "eval_loss": 0.7048025131225586, "eval_runtime": 132.7997, "eval_samples_per_second": 29.706, "eval_steps_per_second": 7.432, "step": 1150 }, { "epoch": 3.9428342674139314, "grad_norm": 5.101466655731201, "learning_rate": 1.3625776006095881e-05, "loss": 42.4982, "step": 1152 }, { "epoch": 3.949666399786496, "grad_norm": 4.983183860778809, "learning_rate": 1.354433695681474e-05, "loss": 43.3568, "step": 1154 }, { "epoch": 3.9564985321590607, "grad_norm": 3.6875593662261963, "learning_rate": 1.3463051491159096e-05, "loss": 45.16, "step": 1156 }, { "epoch": 3.9633306645316253, "grad_norm": 4.482807636260986, "learning_rate": 1.3381920698905787e-05, "loss": 42.8545, "step": 1158 }, { "epoch": 3.97016279690419, "grad_norm": 3.858903646469116, "learning_rate": 1.3300945667758014e-05, "loss": 42.5779, "step": 1160 }, { "epoch": 3.9769949292767546, "grad_norm": 5.07602596282959, "learning_rate": 1.3220127483330713e-05, "loss": 43.8678, "step": 1162 }, { "epoch": 3.983827061649319, "grad_norm": 5.183884620666504, "learning_rate": 1.3139467229135999e-05, "loss": 44.2575, "step": 1164 }, { "epoch": 3.9906591940218843, "grad_norm": 5.44564962387085, "learning_rate": 1.3058965986568648e-05, "loss": 42.0898, "step": 1166 }, { "epoch": 3.997491326394449, "grad_norm": 3.4175875186920166, "learning_rate": 1.2978624834891628e-05, "loss": 43.526, "step": 1168 }, { "epoch": 4.006832132372565, "grad_norm": 5.1483588218688965, "learning_rate": 1.2898444851221565e-05, "loss": 60.1634, "step": 1170 }, { "epoch": 4.013664264745129, "grad_norm": 4.452287673950195, "learning_rate": 1.281842711051438e-05, "loss": 41.7569, "step": 1172 }, { "epoch": 4.020496397117694, "grad_norm": 4.024214267730713, "learning_rate": 1.2738572685550799e-05, "loss": 44.7667, "step": 1174 }, { "epoch": 4.0273285294902585, "grad_norm": 5.533107757568359, "learning_rate": 1.2658882646922034e-05, "loss": 43.7144, "step": 1176 }, { "epoch": 4.034160661862823, "grad_norm": 4.520675182342529, "learning_rate": 1.2579358063015418e-05, "loss": 43.3862, "step": 1178 }, { "epoch": 4.040992794235389, "grad_norm": 4.086079120635986, "learning_rate": 1.2500000000000006e-05, "loss": 44.268, "step": 1180 }, { "epoch": 4.047824926607953, "grad_norm": 3.335569381713867, "learning_rate": 1.2420809521812404e-05, "loss": 43.1871, "step": 1182 }, { "epoch": 4.054657058980518, "grad_norm": 4.651849746704102, "learning_rate": 1.2341787690142437e-05, "loss": 43.4785, "step": 1184 }, { "epoch": 4.061489191353083, "grad_norm": 3.9412457942962646, "learning_rate": 1.2262935564418886e-05, "loss": 42.1075, "step": 1186 }, { "epoch": 4.068321323725647, "grad_norm": 5.621413230895996, "learning_rate": 1.2184254201795365e-05, "loss": 44.5849, "step": 1188 }, { "epoch": 4.075153456098212, "grad_norm": 4.291881084442139, "learning_rate": 1.2105744657136064e-05, "loss": 42.9562, "step": 1190 }, { "epoch": 4.0819855884707765, "grad_norm": 3.730132818222046, "learning_rate": 1.2027407983001681e-05, "loss": 44.0838, "step": 1192 }, { "epoch": 4.088817720843341, "grad_norm": 3.540987968444824, "learning_rate": 1.1949245229635245e-05, "loss": 43.4705, "step": 1194 }, { "epoch": 4.095649853215906, "grad_norm": 3.0649805068969727, "learning_rate": 1.1871257444948098e-05, "loss": 43.0996, "step": 1196 }, { "epoch": 4.10248198558847, "grad_norm": 3.2024762630462646, "learning_rate": 1.1793445674505776e-05, "loss": 42.772, "step": 1198 }, { "epoch": 4.109314117961035, "grad_norm": 3.462251663208008, "learning_rate": 1.1715810961514073e-05, "loss": 43.2502, "step": 1200 }, { "epoch": 4.109314117961035, "eval_loss": 0.7009151577949524, "eval_runtime": 133.1765, "eval_samples_per_second": 29.622, "eval_steps_per_second": 7.411, "step": 1200 }, { "epoch": 4.116146250333601, "grad_norm": 4.633735656738281, "learning_rate": 1.1638354346804971e-05, "loss": 42.8239, "step": 1202 }, { "epoch": 4.122978382706165, "grad_norm": 3.758700132369995, "learning_rate": 1.1561076868822756e-05, "loss": 43.3475, "step": 1204 }, { "epoch": 4.12981051507873, "grad_norm": 4.143715858459473, "learning_rate": 1.148397956361007e-05, "loss": 44.0, "step": 1206 }, { "epoch": 4.1366426474512945, "grad_norm": 5.201571941375732, "learning_rate": 1.1407063464793966e-05, "loss": 42.5036, "step": 1208 }, { "epoch": 4.143474779823859, "grad_norm": 3.4282047748565674, "learning_rate": 1.133032960357216e-05, "loss": 43.0577, "step": 1210 }, { "epoch": 4.150306912196424, "grad_norm": 4.114802837371826, "learning_rate": 1.1253779008699131e-05, "loss": 43.3517, "step": 1212 }, { "epoch": 4.157139044568988, "grad_norm": 3.979163408279419, "learning_rate": 1.1177412706472321e-05, "loss": 42.5044, "step": 1214 }, { "epoch": 4.163971176941553, "grad_norm": 4.363109588623047, "learning_rate": 1.1101231720718442e-05, "loss": 43.8954, "step": 1216 }, { "epoch": 4.170803309314118, "grad_norm": 4.6219401359558105, "learning_rate": 1.1025237072779663e-05, "loss": 43.413, "step": 1218 }, { "epoch": 4.177635441686682, "grad_norm": 4.945540904998779, "learning_rate": 1.09494297815e-05, "loss": 43.9628, "step": 1220 }, { "epoch": 4.184467574059248, "grad_norm": 4.4585747718811035, "learning_rate": 1.0873810863211595e-05, "loss": 42.6454, "step": 1222 }, { "epoch": 4.1912997064318125, "grad_norm": 4.659883499145508, "learning_rate": 1.0798381331721109e-05, "loss": 42.5656, "step": 1224 }, { "epoch": 4.198131838804377, "grad_norm": 4.411434650421143, "learning_rate": 1.0723142198296155e-05, "loss": 41.2252, "step": 1226 }, { "epoch": 4.204963971176942, "grad_norm": 4.985414028167725, "learning_rate": 1.0648094471651724e-05, "loss": 42.05, "step": 1228 }, { "epoch": 4.211796103549506, "grad_norm": 5.09487771987915, "learning_rate": 1.0573239157936619e-05, "loss": 42.9917, "step": 1230 }, { "epoch": 4.218628235922071, "grad_norm": 4.299539089202881, "learning_rate": 1.049857726072005e-05, "loss": 42.7934, "step": 1232 }, { "epoch": 4.225460368294636, "grad_norm": 4.075766086578369, "learning_rate": 1.0424109780978103e-05, "loss": 41.0067, "step": 1234 }, { "epoch": 4.2322925006672, "grad_norm": 4.9132232666015625, "learning_rate": 1.034983771708035e-05, "loss": 43.6556, "step": 1236 }, { "epoch": 4.239124633039765, "grad_norm": 4.45914888381958, "learning_rate": 1.0275762064776492e-05, "loss": 42.588, "step": 1238 }, { "epoch": 4.24595676541233, "grad_norm": 3.7621419429779053, "learning_rate": 1.020188381718295e-05, "loss": 41.7435, "step": 1240 }, { "epoch": 4.252788897784894, "grad_norm": 2.9593658447265625, "learning_rate": 1.0128203964769601e-05, "loss": 43.7138, "step": 1242 }, { "epoch": 4.25962103015746, "grad_norm": 4.333788871765137, "learning_rate": 1.0054723495346482e-05, "loss": 42.7332, "step": 1244 }, { "epoch": 4.266453162530024, "grad_norm": 4.040637493133545, "learning_rate": 9.981443394050525e-06, "loss": 43.0547, "step": 1246 }, { "epoch": 4.273285294902589, "grad_norm": 5.255796432495117, "learning_rate": 9.908364643332399e-06, "loss": 42.1078, "step": 1248 }, { "epoch": 4.280117427275154, "grad_norm": 3.434884786605835, "learning_rate": 9.835488222943285e-06, "loss": 42.6684, "step": 1250 }, { "epoch": 4.280117427275154, "eval_loss": 0.6948874592781067, "eval_runtime": 138.5111, "eval_samples_per_second": 28.481, "eval_steps_per_second": 7.126, "step": 1250 }, { "epoch": 4.286949559647718, "grad_norm": 4.761016368865967, "learning_rate": 9.762815109921761e-06, "loss": 43.8, "step": 1252 }, { "epoch": 4.293781692020283, "grad_norm": 5.999067783355713, "learning_rate": 9.690346278580726e-06, "loss": 42.8654, "step": 1254 }, { "epoch": 4.300613824392848, "grad_norm": 4.777903079986572, "learning_rate": 9.618082700494319e-06, "loss": 42.3409, "step": 1256 }, { "epoch": 4.307445956765412, "grad_norm": 4.543084144592285, "learning_rate": 9.546025344484869e-06, "loss": 43.6205, "step": 1258 }, { "epoch": 4.314278089137977, "grad_norm": 3.6853065490722656, "learning_rate": 9.474175176609956e-06, "loss": 43.9045, "step": 1260 }, { "epoch": 4.3211102215105415, "grad_norm": 4.3578338623046875, "learning_rate": 9.402533160149416e-06, "loss": 41.781, "step": 1262 }, { "epoch": 4.327942353883106, "grad_norm": 4.191073894500732, "learning_rate": 9.331100255592437e-06, "loss": 42.5713, "step": 1264 }, { "epoch": 4.334774486255672, "grad_norm": 5.591835021972656, "learning_rate": 9.259877420624721e-06, "loss": 42.9316, "step": 1266 }, { "epoch": 4.341606618628236, "grad_norm": 4.916292667388916, "learning_rate": 9.18886561011557e-06, "loss": 42.9316, "step": 1268 }, { "epoch": 4.348438751000801, "grad_norm": 3.4310858249664307, "learning_rate": 9.118065776105159e-06, "loss": 42.0445, "step": 1270 }, { "epoch": 4.3552708833733655, "grad_norm": 3.6645348072052, "learning_rate": 9.047478867791732e-06, "loss": 41.5698, "step": 1272 }, { "epoch": 4.36210301574593, "grad_norm": 4.118466854095459, "learning_rate": 8.977105831518864e-06, "loss": 41.7493, "step": 1274 }, { "epoch": 4.368935148118495, "grad_norm": 4.731881141662598, "learning_rate": 8.906947610762825e-06, "loss": 41.2277, "step": 1276 }, { "epoch": 4.3757672804910595, "grad_norm": 4.580758571624756, "learning_rate": 8.837005146119872e-06, "loss": 42.3467, "step": 1278 }, { "epoch": 4.382599412863624, "grad_norm": 5.310960292816162, "learning_rate": 8.767279375293672e-06, "loss": 43.1447, "step": 1280 }, { "epoch": 4.389431545236189, "grad_norm": 4.382359027862549, "learning_rate": 8.697771233082744e-06, "loss": 42.4424, "step": 1282 }, { "epoch": 4.396263677608753, "grad_norm": 3.6488263607025146, "learning_rate": 8.628481651367876e-06, "loss": 43.8516, "step": 1284 }, { "epoch": 4.403095809981318, "grad_norm": 3.2983975410461426, "learning_rate": 8.55941155909968e-06, "loss": 43.3322, "step": 1286 }, { "epoch": 4.4099279423538835, "grad_norm": 3.5116684436798096, "learning_rate": 8.490561882286136e-06, "loss": 41.4651, "step": 1288 }, { "epoch": 4.416760074726448, "grad_norm": 3.5123932361602783, "learning_rate": 8.421933543980126e-06, "loss": 43.1034, "step": 1290 }, { "epoch": 4.423592207099013, "grad_norm": 4.123583793640137, "learning_rate": 8.353527464267104e-06, "loss": 43.566, "step": 1292 }, { "epoch": 4.430424339471577, "grad_norm": 3.6427931785583496, "learning_rate": 8.285344560252777e-06, "loss": 42.0333, "step": 1294 }, { "epoch": 4.437256471844142, "grad_norm": 3.8917388916015625, "learning_rate": 8.217385746050742e-06, "loss": 42.0382, "step": 1296 }, { "epoch": 4.444088604216707, "grad_norm": 4.964122772216797, "learning_rate": 8.149651932770308e-06, "loss": 43.6584, "step": 1298 }, { "epoch": 4.450920736589271, "grad_norm": 4.227240085601807, "learning_rate": 8.082144028504233e-06, "loss": 42.4086, "step": 1300 }, { "epoch": 4.450920736589271, "eval_loss": 0.6897044777870178, "eval_runtime": 131.8148, "eval_samples_per_second": 29.928, "eval_steps_per_second": 7.488, "step": 1300 }, { "epoch": 4.457752868961836, "grad_norm": 4.605757713317871, "learning_rate": 8.014862938316542e-06, "loss": 43.7962, "step": 1302 }, { "epoch": 4.464585001334401, "grad_norm": 4.2398176193237305, "learning_rate": 7.947809564230445e-06, "loss": 42.3544, "step": 1304 }, { "epoch": 4.471417133706965, "grad_norm": 5.234216213226318, "learning_rate": 7.880984805216185e-06, "loss": 41.9833, "step": 1306 }, { "epoch": 4.47824926607953, "grad_norm": 3.9220240116119385, "learning_rate": 7.814389557179017e-06, "loss": 42.0345, "step": 1308 }, { "epoch": 4.485081398452095, "grad_norm": 5.44996976852417, "learning_rate": 7.748024712947205e-06, "loss": 42.0309, "step": 1310 }, { "epoch": 4.49191353082466, "grad_norm": 5.07472038269043, "learning_rate": 7.681891162260015e-06, "loss": 42.6996, "step": 1312 }, { "epoch": 4.498745663197225, "grad_norm": 3.818120241165161, "learning_rate": 7.615989791755834e-06, "loss": 42.8775, "step": 1314 }, { "epoch": 4.505577795569789, "grad_norm": 4.252802848815918, "learning_rate": 7.5503214849602516e-06, "loss": 42.4118, "step": 1316 }, { "epoch": 4.512409927942354, "grad_norm": 4.17697286605835, "learning_rate": 7.484887122274215e-06, "loss": 41.2153, "step": 1318 }, { "epoch": 4.519242060314919, "grad_norm": 3.7324466705322266, "learning_rate": 7.419687580962223e-06, "loss": 42.3343, "step": 1320 }, { "epoch": 4.526074192687483, "grad_norm": 3.870089054107666, "learning_rate": 7.354723735140609e-06, "loss": 42.0028, "step": 1322 }, { "epoch": 4.532906325060048, "grad_norm": 3.6424801349639893, "learning_rate": 7.289996455765749e-06, "loss": 43.5842, "step": 1324 }, { "epoch": 4.5397384574326125, "grad_norm": 4.695961952209473, "learning_rate": 7.225506610622456e-06, "loss": 42.0951, "step": 1326 }, { "epoch": 4.546570589805177, "grad_norm": 4.842666149139404, "learning_rate": 7.161255064312283e-06, "loss": 43.8668, "step": 1328 }, { "epoch": 4.553402722177742, "grad_norm": 4.4085822105407715, "learning_rate": 7.0972426782419884e-06, "loss": 43.7836, "step": 1330 }, { "epoch": 4.560234854550307, "grad_norm": 3.606607437133789, "learning_rate": 7.033470310611945e-06, "loss": 41.4304, "step": 1332 }, { "epoch": 4.567066986922872, "grad_norm": 4.789222717285156, "learning_rate": 6.969938816404639e-06, "loss": 41.6355, "step": 1334 }, { "epoch": 4.573899119295437, "grad_norm": 4.463109493255615, "learning_rate": 6.906649047373246e-06, "loss": 43.4969, "step": 1336 }, { "epoch": 4.580731251668001, "grad_norm": 4.483322620391846, "learning_rate": 6.843601852030171e-06, "loss": 42.4094, "step": 1338 }, { "epoch": 4.587563384040566, "grad_norm": 4.021024703979492, "learning_rate": 6.780798075635675e-06, "loss": 42.2893, "step": 1340 }, { "epoch": 4.5943955164131305, "grad_norm": 3.9479868412017822, "learning_rate": 6.718238560186571e-06, "loss": 40.8073, "step": 1342 }, { "epoch": 4.601227648785695, "grad_norm": 4.778145790100098, "learning_rate": 6.655924144404907e-06, "loss": 42.0845, "step": 1344 }, { "epoch": 4.60805978115826, "grad_norm": 3.555271863937378, "learning_rate": 6.593855663726722e-06, "loss": 41.1015, "step": 1346 }, { "epoch": 4.614891913530824, "grad_norm": 4.007204532623291, "learning_rate": 6.532033950290886e-06, "loss": 42.9137, "step": 1348 }, { "epoch": 4.621724045903389, "grad_norm": 4.328546524047852, "learning_rate": 6.470459832927881e-06, "loss": 41.274, "step": 1350 }, { "epoch": 4.621724045903389, "eval_loss": 0.6830974221229553, "eval_runtime": 135.2812, "eval_samples_per_second": 29.161, "eval_steps_per_second": 7.296, "step": 1350 }, { "epoch": 4.628556178275954, "grad_norm": 4.948083877563477, "learning_rate": 6.409134137148737e-06, "loss": 43.0462, "step": 1352 }, { "epoch": 4.635388310648519, "grad_norm": 4.637773036956787, "learning_rate": 6.3480576851339625e-06, "loss": 42.6268, "step": 1354 }, { "epoch": 4.642220443021084, "grad_norm": 3.72841215133667, "learning_rate": 6.28723129572247e-06, "loss": 41.0574, "step": 1356 }, { "epoch": 4.6490525753936485, "grad_norm": 4.539714813232422, "learning_rate": 6.226655784400684e-06, "loss": 43.5752, "step": 1358 }, { "epoch": 4.655884707766213, "grad_norm": 5.519583225250244, "learning_rate": 6.166331963291519e-06, "loss": 43.3111, "step": 1360 }, { "epoch": 4.662716840138778, "grad_norm": 4.942199230194092, "learning_rate": 6.106260641143546e-06, "loss": 43.6514, "step": 1362 }, { "epoch": 4.669548972511342, "grad_norm": 5.164299011230469, "learning_rate": 6.046442623320145e-06, "loss": 40.8611, "step": 1364 }, { "epoch": 4.676381104883907, "grad_norm": 4.309698581695557, "learning_rate": 5.986878711788702e-06, "loss": 41.3937, "step": 1366 }, { "epoch": 4.683213237256472, "grad_norm": 4.105101585388184, "learning_rate": 5.927569705109828e-06, "loss": 40.3001, "step": 1368 }, { "epoch": 4.690045369629036, "grad_norm": 3.571514368057251, "learning_rate": 5.868516398426716e-06, "loss": 41.6858, "step": 1370 }, { "epoch": 4.696877502001601, "grad_norm": 5.120858192443848, "learning_rate": 5.809719583454415e-06, "loss": 41.4156, "step": 1372 }, { "epoch": 4.703709634374166, "grad_norm": 4.679799556732178, "learning_rate": 5.751180048469243e-06, "loss": 43.1858, "step": 1374 }, { "epoch": 4.710541766746731, "grad_norm": 3.0465521812438965, "learning_rate": 5.692898578298253e-06, "loss": 41.213, "step": 1376 }, { "epoch": 4.717373899119296, "grad_norm": 4.835347652435303, "learning_rate": 5.634875954308638e-06, "loss": 44.0938, "step": 1378 }, { "epoch": 4.72420603149186, "grad_norm": 6.645193099975586, "learning_rate": 5.577112954397321e-06, "loss": 41.7528, "step": 1380 }, { "epoch": 4.731038163864425, "grad_norm": 4.592052936553955, "learning_rate": 5.519610352980501e-06, "loss": 42.566, "step": 1382 }, { "epoch": 4.73787029623699, "grad_norm": 3.7620317935943604, "learning_rate": 5.462368920983249e-06, "loss": 41.7184, "step": 1384 }, { "epoch": 4.744702428609554, "grad_norm": 4.0445027351379395, "learning_rate": 5.405389425829219e-06, "loss": 41.6249, "step": 1386 }, { "epoch": 4.751534560982119, "grad_norm": 3.744433641433716, "learning_rate": 5.348672631430318e-06, "loss": 43.0626, "step": 1388 }, { "epoch": 4.7583666933546835, "grad_norm": 3.12141489982605, "learning_rate": 5.292219298176476e-06, "loss": 42.1533, "step": 1390 }, { "epoch": 4.765198825727248, "grad_norm": 6.73304557800293, "learning_rate": 5.236030182925475e-06, "loss": 41.6015, "step": 1392 }, { "epoch": 4.772030958099813, "grad_norm": 4.076465129852295, "learning_rate": 5.1801060389927606e-06, "loss": 43.2645, "step": 1394 }, { "epoch": 4.7788630904723775, "grad_norm": 4.178272247314453, "learning_rate": 5.124447616141381e-06, "loss": 43.0354, "step": 1396 }, { "epoch": 4.785695222844943, "grad_norm": 4.555927276611328, "learning_rate": 5.06905566057192e-06, "loss": 42.1086, "step": 1398 }, { "epoch": 4.792527355217508, "grad_norm": 4.799075126647949, "learning_rate": 5.013930914912476e-06, "loss": 40.7555, "step": 1400 }, { "epoch": 4.792527355217508, "eval_loss": 0.6814665198326111, "eval_runtime": 134.9461, "eval_samples_per_second": 29.234, "eval_steps_per_second": 7.314, "step": 1400 }, { "epoch": 4.799359487590072, "grad_norm": 3.7408673763275146, "learning_rate": 4.959074118208726e-06, "loss": 40.9295, "step": 1402 }, { "epoch": 4.806191619962637, "grad_norm": 3.9520747661590576, "learning_rate": 4.9044860059140275e-06, "loss": 43.4186, "step": 1404 }, { "epoch": 4.8130237523352015, "grad_norm": 4.115049839019775, "learning_rate": 4.850167309879519e-06, "loss": 42.2491, "step": 1406 }, { "epoch": 4.819855884707766, "grad_norm": 5.181631088256836, "learning_rate": 4.796118758344354e-06, "loss": 41.583, "step": 1408 }, { "epoch": 4.826688017080331, "grad_norm": 3.838186740875244, "learning_rate": 4.742341075925916e-06, "loss": 43.3278, "step": 1410 }, { "epoch": 4.833520149452895, "grad_norm": 3.6494245529174805, "learning_rate": 4.6888349836100825e-06, "loss": 41.3961, "step": 1412 }, { "epoch": 4.84035228182546, "grad_norm": 4.139842510223389, "learning_rate": 4.6356011987416075e-06, "loss": 43.4135, "step": 1414 }, { "epoch": 4.847184414198025, "grad_norm": 4.385437965393066, "learning_rate": 4.58264043501446e-06, "loss": 42.1478, "step": 1416 }, { "epoch": 4.854016546570589, "grad_norm": 3.691343307495117, "learning_rate": 4.52995340246227e-06, "loss": 42.4175, "step": 1418 }, { "epoch": 4.860848678943155, "grad_norm": 4.149899482727051, "learning_rate": 4.477540807448832e-06, "loss": 42.4116, "step": 1420 }, { "epoch": 4.8676808113157195, "grad_norm": 3.8960561752319336, "learning_rate": 4.425403352658591e-06, "loss": 41.2306, "step": 1422 }, { "epoch": 4.874512943688284, "grad_norm": 3.6276168823242188, "learning_rate": 4.373541737087264e-06, "loss": 42.7317, "step": 1424 }, { "epoch": 4.881345076060849, "grad_norm": 4.214303016662598, "learning_rate": 4.32195665603245e-06, "loss": 41.6166, "step": 1426 }, { "epoch": 4.888177208433413, "grad_norm": 4.3136210441589355, "learning_rate": 4.270648801084296e-06, "loss": 42.3309, "step": 1428 }, { "epoch": 4.895009340805978, "grad_norm": 5.340824604034424, "learning_rate": 4.219618860116242e-06, "loss": 40.6249, "step": 1430 }, { "epoch": 4.901841473178543, "grad_norm": 3.750943183898926, "learning_rate": 4.1688675172758064e-06, "loss": 42.0754, "step": 1432 }, { "epoch": 4.908673605551107, "grad_norm": 3.8021140098571777, "learning_rate": 4.118395452975382e-06, "loss": 42.8221, "step": 1434 }, { "epoch": 4.915505737923672, "grad_norm": 5.09911584854126, "learning_rate": 4.068203343883159e-06, "loss": 42.3164, "step": 1436 }, { "epoch": 4.9223378702962375, "grad_norm": 3.590981960296631, "learning_rate": 4.018291862914001e-06, "loss": 41.0773, "step": 1438 }, { "epoch": 4.929170002668801, "grad_norm": 4.474262714385986, "learning_rate": 3.968661679220468e-06, "loss": 41.1827, "step": 1440 }, { "epoch": 4.936002135041367, "grad_norm": 3.780853748321533, "learning_rate": 3.919313458183838e-06, "loss": 41.9009, "step": 1442 }, { "epoch": 4.942834267413931, "grad_norm": 4.165524482727051, "learning_rate": 3.8702478614051355e-06, "loss": 41.6988, "step": 1444 }, { "epoch": 4.949666399786496, "grad_norm": 4.537020683288574, "learning_rate": 3.821465546696337e-06, "loss": 42.6527, "step": 1446 }, { "epoch": 4.956498532159061, "grad_norm": 5.992898941040039, "learning_rate": 3.772967168071517e-06, "loss": 42.3257, "step": 1448 }, { "epoch": 4.963330664531625, "grad_norm": 5.681396007537842, "learning_rate": 3.7247533757380603e-06, "loss": 42.5366, "step": 1450 }, { "epoch": 4.963330664531625, "eval_loss": 0.6770752668380737, "eval_runtime": 133.8871, "eval_samples_per_second": 29.465, "eval_steps_per_second": 7.372, "step": 1450 }, { "epoch": 4.97016279690419, "grad_norm": 4.46541166305542, "learning_rate": 3.6768248160879787e-06, "loss": 41.0476, "step": 1452 }, { "epoch": 4.976994929276755, "grad_norm": 4.15000057220459, "learning_rate": 3.6291821316892184e-06, "loss": 40.7134, "step": 1454 }, { "epoch": 4.983827061649319, "grad_norm": 4.230960369110107, "learning_rate": 3.5818259612770744e-06, "loss": 43.5967, "step": 1456 }, { "epoch": 4.990659194021884, "grad_norm": 4.932849884033203, "learning_rate": 3.53475693974559e-06, "loss": 43.2516, "step": 1458 }, { "epoch": 4.997491326394449, "grad_norm": 4.316704273223877, "learning_rate": 3.487975698139084e-06, "loss": 42.3811, "step": 1460 }, { "epoch": 5.003416066186283, "grad_norm": 4.146729469299316, "learning_rate": 3.4414828636436525e-06, "loss": 36.1288, "step": 1462 }, { "epoch": 5.010248198558847, "grad_norm": 5.610274791717529, "learning_rate": 3.3952790595787987e-06, "loss": 40.6556, "step": 1464 }, { "epoch": 5.017080330931412, "grad_norm": 6.292807102203369, "learning_rate": 3.3493649053890326e-06, "loss": 42.2675, "step": 1466 }, { "epoch": 5.023912463303977, "grad_norm": 4.371929168701172, "learning_rate": 3.3037410166356143e-06, "loss": 41.1544, "step": 1468 }, { "epoch": 5.030744595676541, "grad_norm": 3.275562047958374, "learning_rate": 3.258408004988278e-06, "loss": 42.7401, "step": 1470 }, { "epoch": 5.037576728049106, "grad_norm": 5.2857666015625, "learning_rate": 3.2133664782169948e-06, "loss": 39.4961, "step": 1472 }, { "epoch": 5.044408860421671, "grad_norm": 3.9162814617156982, "learning_rate": 3.168617040183897e-06, "loss": 42.7691, "step": 1474 }, { "epoch": 5.051240992794235, "grad_norm": 4.741237640380859, "learning_rate": 3.1241602908351404e-06, "loss": 39.9539, "step": 1476 }, { "epoch": 5.0580731251668, "grad_norm": 4.904325008392334, "learning_rate": 3.079996826192849e-06, "loss": 40.999, "step": 1478 }, { "epoch": 5.0649052575393645, "grad_norm": 3.9396679401397705, "learning_rate": 3.036127238347164e-06, "loss": 41.8233, "step": 1480 }, { "epoch": 5.071737389911929, "grad_norm": 3.5699760913848877, "learning_rate": 2.992552115448258e-06, "loss": 41.4895, "step": 1482 }, { "epoch": 5.078569522284495, "grad_norm": 4.227250099182129, "learning_rate": 2.9492720416985e-06, "loss": 41.7825, "step": 1484 }, { "epoch": 5.085401654657059, "grad_norm": 3.8788514137268066, "learning_rate": 2.9062875973445813e-06, "loss": 41.4301, "step": 1486 }, { "epoch": 5.092233787029624, "grad_norm": 3.7242729663848877, "learning_rate": 2.8635993586697553e-06, "loss": 40.2917, "step": 1488 }, { "epoch": 5.099065919402189, "grad_norm": 5.645269870758057, "learning_rate": 2.821207897986114e-06, "loss": 41.1435, "step": 1490 }, { "epoch": 5.105898051774753, "grad_norm": 3.9231839179992676, "learning_rate": 2.779113783626916e-06, "loss": 41.5506, "step": 1492 }, { "epoch": 5.112730184147318, "grad_norm": 4.276205062866211, "learning_rate": 2.7373175799389415e-06, "loss": 40.4141, "step": 1494 }, { "epoch": 5.1195623165198825, "grad_norm": 6.223433971405029, "learning_rate": 2.6958198472749717e-06, "loss": 42.1149, "step": 1496 }, { "epoch": 5.126394448892447, "grad_norm": 4.167882442474365, "learning_rate": 2.65462114198623e-06, "loss": 40.7711, "step": 1498 }, { "epoch": 5.133226581265012, "grad_norm": 3.588376998901367, "learning_rate": 2.6137220164149435e-06, "loss": 42.5513, "step": 1500 }, { "epoch": 5.133226581265012, "eval_loss": 0.6761642694473267, "eval_runtime": 137.9512, "eval_samples_per_second": 28.597, "eval_steps_per_second": 7.155, "step": 1500 }, { "epoch": 5.140058713637576, "grad_norm": 4.149092674255371, "learning_rate": 2.573123018886961e-06, "loss": 40.5633, "step": 1502 }, { "epoch": 5.146890846010141, "grad_norm": 3.9322760105133057, "learning_rate": 2.5328246937043526e-06, "loss": 41.3711, "step": 1504 }, { "epoch": 5.1537229783827065, "grad_norm": 4.557422161102295, "learning_rate": 2.492827581138149e-06, "loss": 39.5696, "step": 1506 }, { "epoch": 5.160555110755271, "grad_norm": 3.772927761077881, "learning_rate": 2.4531322174210975e-06, "loss": 42.9544, "step": 1508 }, { "epoch": 5.167387243127836, "grad_norm": 4.051291465759277, "learning_rate": 2.4137391347404476e-06, "loss": 40.978, "step": 1510 }, { "epoch": 5.1742193755004005, "grad_norm": 3.6557424068450928, "learning_rate": 2.37464886123083e-06, "loss": 41.606, "step": 1512 }, { "epoch": 5.181051507872965, "grad_norm": 4.801413536071777, "learning_rate": 2.3358619209672e-06, "loss": 41.5917, "step": 1514 }, { "epoch": 5.18788364024553, "grad_norm": 4.2001423835754395, "learning_rate": 2.2973788339577613e-06, "loss": 43.0596, "step": 1516 }, { "epoch": 5.194715772618094, "grad_norm": 5.291867256164551, "learning_rate": 2.2592001161370392e-06, "loss": 40.3588, "step": 1518 }, { "epoch": 5.201547904990659, "grad_norm": 3.7930984497070312, "learning_rate": 2.2213262793589484e-06, "loss": 42.0758, "step": 1520 }, { "epoch": 5.208380037363224, "grad_norm": 4.888052940368652, "learning_rate": 2.1837578313899098e-06, "loss": 39.7415, "step": 1522 }, { "epoch": 5.215212169735788, "grad_norm": 4.963688850402832, "learning_rate": 2.1464952759020855e-06, "loss": 42.05, "step": 1524 }, { "epoch": 5.222044302108353, "grad_norm": 4.556923866271973, "learning_rate": 2.109539112466588e-06, "loss": 40.5828, "step": 1526 }, { "epoch": 5.228876434480918, "grad_norm": 3.550285577774048, "learning_rate": 2.0728898365467903e-06, "loss": 41.4201, "step": 1528 }, { "epoch": 5.235708566853483, "grad_norm": 4.290851593017578, "learning_rate": 2.0365479394917147e-06, "loss": 41.1988, "step": 1530 }, { "epoch": 5.242540699226048, "grad_norm": 4.436618804931641, "learning_rate": 2.0005139085293945e-06, "loss": 41.1016, "step": 1532 }, { "epoch": 5.249372831598612, "grad_norm": 6.221188068389893, "learning_rate": 1.9647882267603862e-06, "loss": 42.1538, "step": 1534 }, { "epoch": 5.256204963971177, "grad_norm": 4.712629795074463, "learning_rate": 1.9293713731512673e-06, "loss": 41.1176, "step": 1536 }, { "epoch": 5.263037096343742, "grad_norm": 4.693170070648193, "learning_rate": 1.894263822528225e-06, "loss": 41.3687, "step": 1538 }, { "epoch": 5.269869228716306, "grad_norm": 4.854535102844238, "learning_rate": 1.8594660455706763e-06, "loss": 41.6856, "step": 1540 }, { "epoch": 5.276701361088871, "grad_norm": 3.5167202949523926, "learning_rate": 1.8249785088049893e-06, "loss": 42.5848, "step": 1542 }, { "epoch": 5.2835334934614355, "grad_norm": 4.029543399810791, "learning_rate": 1.790801674598186e-06, "loss": 41.8932, "step": 1544 }, { "epoch": 5.290365625834, "grad_norm": 4.217826843261719, "learning_rate": 1.7569360011517848e-06, "loss": 41.478, "step": 1546 }, { "epoch": 5.297197758206565, "grad_norm": 3.8237998485565186, "learning_rate": 1.7233819424956248e-06, "loss": 42.5394, "step": 1548 }, { "epoch": 5.30402989057913, "grad_norm": 5.044140338897705, "learning_rate": 1.6901399484818004e-06, "loss": 41.0466, "step": 1550 }, { "epoch": 5.30402989057913, "eval_loss": 0.6723917722702026, "eval_runtime": 132.3674, "eval_samples_per_second": 29.803, "eval_steps_per_second": 7.457, "step": 1550 }, { "epoch": 5.310862022951695, "grad_norm": 4.023882865905762, "learning_rate": 1.6572104647786247e-06, "loss": 40.4515, "step": 1552 }, { "epoch": 5.31769415532426, "grad_norm": 5.667575836181641, "learning_rate": 1.624593932864632e-06, "loss": 42.2196, "step": 1554 }, { "epoch": 5.324526287696824, "grad_norm": 3.771815299987793, "learning_rate": 1.5922907900227018e-06, "loss": 41.1018, "step": 1556 }, { "epoch": 5.331358420069389, "grad_norm": 4.044847011566162, "learning_rate": 1.5603014693341662e-06, "loss": 40.8528, "step": 1558 }, { "epoch": 5.3381905524419535, "grad_norm": 4.64625358581543, "learning_rate": 1.5286263996730026e-06, "loss": 41.612, "step": 1560 }, { "epoch": 5.345022684814518, "grad_norm": 5.102336406707764, "learning_rate": 1.497266005700107e-06, "loss": 40.965, "step": 1562 }, { "epoch": 5.351854817187083, "grad_norm": 3.1535797119140625, "learning_rate": 1.4662207078575684e-06, "loss": 40.5264, "step": 1564 }, { "epoch": 5.358686949559647, "grad_norm": 3.740694522857666, "learning_rate": 1.4354909223630669e-06, "loss": 41.5863, "step": 1566 }, { "epoch": 5.365519081932212, "grad_norm": 4.79527473449707, "learning_rate": 1.40507706120426e-06, "loss": 41.3632, "step": 1568 }, { "epoch": 5.372351214304777, "grad_norm": 4.936699867248535, "learning_rate": 1.3749795321332887e-06, "loss": 41.898, "step": 1570 }, { "epoch": 5.379183346677342, "grad_norm": 6.228104114532471, "learning_rate": 1.3451987386612851e-06, "loss": 41.3327, "step": 1572 }, { "epoch": 5.386015479049907, "grad_norm": 3.9607808589935303, "learning_rate": 1.3157350800529878e-06, "loss": 39.3806, "step": 1574 }, { "epoch": 5.3928476114224715, "grad_norm": 3.2485790252685547, "learning_rate": 1.286588951321363e-06, "loss": 39.292, "step": 1576 }, { "epoch": 5.399679743795036, "grad_norm": 4.702234745025635, "learning_rate": 1.2577607432223276e-06, "loss": 40.3127, "step": 1578 }, { "epoch": 5.406511876167601, "grad_norm": 4.465649127960205, "learning_rate": 1.2292508422495158e-06, "loss": 41.7889, "step": 1580 }, { "epoch": 5.413344008540165, "grad_norm": 4.618641376495361, "learning_rate": 1.2010596306290589e-06, "loss": 41.2257, "step": 1582 }, { "epoch": 5.42017614091273, "grad_norm": 4.093713283538818, "learning_rate": 1.1731874863145143e-06, "loss": 41.7067, "step": 1584 }, { "epoch": 5.427008273285295, "grad_norm": 5.642305374145508, "learning_rate": 1.145634782981761e-06, "loss": 41.1947, "step": 1586 }, { "epoch": 5.433840405657859, "grad_norm": 3.9637906551361084, "learning_rate": 1.1184018900240011e-06, "loss": 41.5425, "step": 1588 }, { "epoch": 5.440672538030424, "grad_norm": 4.328593730926514, "learning_rate": 1.0914891725468141e-06, "loss": 41.7915, "step": 1590 }, { "epoch": 5.4475046704029895, "grad_norm": 4.559619903564453, "learning_rate": 1.06489699136324e-06, "loss": 39.5462, "step": 1592 }, { "epoch": 5.454336802775554, "grad_norm": 4.174973011016846, "learning_rate": 1.0386257029889768e-06, "loss": 40.6458, "step": 1594 }, { "epoch": 5.461168935148119, "grad_norm": 3.249431610107422, "learning_rate": 1.0126756596375686e-06, "loss": 41.4128, "step": 1596 }, { "epoch": 5.468001067520683, "grad_norm": 4.598479747772217, "learning_rate": 9.87047209215694e-07, "loss": 41.7854, "step": 1598 }, { "epoch": 5.474833199893248, "grad_norm": 3.558709144592285, "learning_rate": 9.617406953185138e-07, "loss": 41.9632, "step": 1600 }, { "epoch": 5.474833199893248, "eval_loss": 0.6698766350746155, "eval_runtime": 133.9539, "eval_samples_per_second": 29.45, "eval_steps_per_second": 7.368, "step": 1600 }, { "epoch": 5.481665332265813, "grad_norm": 5.397751331329346, "learning_rate": 9.36756457225052e-07, "loss": 40.2635, "step": 1602 }, { "epoch": 5.488497464638377, "grad_norm": 5.443418502807617, "learning_rate": 9.120948298936421e-07, "loss": 40.6923, "step": 1604 }, { "epoch": 5.495329597010942, "grad_norm": 3.991673707962036, "learning_rate": 8.87756143957455e-07, "loss": 40.0543, "step": 1606 }, { "epoch": 5.502161729383507, "grad_norm": 4.649523735046387, "learning_rate": 8.637407257200497e-07, "loss": 41.3534, "step": 1608 }, { "epoch": 5.508993861756071, "grad_norm": 4.675793170928955, "learning_rate": 8.400488971509968e-07, "loss": 39.8315, "step": 1610 }, { "epoch": 5.515825994128637, "grad_norm": 3.273359775543213, "learning_rate": 8.166809758815896e-07, "loss": 39.9979, "step": 1612 }, { "epoch": 5.5226581265012005, "grad_norm": 4.165469169616699, "learning_rate": 7.936372752005399e-07, "loss": 39.3362, "step": 1614 }, { "epoch": 5.529490258873766, "grad_norm": 4.015806674957275, "learning_rate": 7.709181040498254e-07, "loss": 40.7772, "step": 1616 }, { "epoch": 5.536322391246331, "grad_norm": 6.13747501373291, "learning_rate": 7.485237670205175e-07, "loss": 40.8463, "step": 1618 }, { "epoch": 5.543154523618895, "grad_norm": 3.6014761924743652, "learning_rate": 7.264545643486997e-07, "loss": 40.231, "step": 1620 }, { "epoch": 5.54998665599146, "grad_norm": 4.055222034454346, "learning_rate": 7.047107919114588e-07, "loss": 42.5435, "step": 1622 }, { "epoch": 5.5568187883640245, "grad_norm": 5.444411277770996, "learning_rate": 6.832927412229018e-07, "loss": 41.0914, "step": 1624 }, { "epoch": 5.563650920736589, "grad_norm": 3.4832520484924316, "learning_rate": 6.622006994302543e-07, "loss": 42.297, "step": 1626 }, { "epoch": 5.570483053109154, "grad_norm": 5.123753547668457, "learning_rate": 6.41434949310013e-07, "loss": 40.4283, "step": 1628 }, { "epoch": 5.5773151854817185, "grad_norm": 5.2065277099609375, "learning_rate": 6.209957692641544e-07, "loss": 40.5581, "step": 1630 }, { "epoch": 5.584147317854283, "grad_norm": 4.573667049407959, "learning_rate": 6.008834333163876e-07, "loss": 39.4126, "step": 1632 }, { "epoch": 5.590979450226849, "grad_norm": 5.208593368530273, "learning_rate": 5.810982111085106e-07, "loss": 40.7202, "step": 1634 }, { "epoch": 5.597811582599413, "grad_norm": 4.341737747192383, "learning_rate": 5.616403678967624e-07, "loss": 40.9683, "step": 1636 }, { "epoch": 5.604643714971978, "grad_norm": 4.836015701293945, "learning_rate": 5.42510164548285e-07, "loss": 40.4273, "step": 1638 }, { "epoch": 5.6114758473445425, "grad_norm": 4.308472633361816, "learning_rate": 5.237078575376336e-07, "loss": 41.0492, "step": 1640 }, { "epoch": 5.618307979717107, "grad_norm": 4.316090106964111, "learning_rate": 5.052336989433082e-07, "loss": 40.6806, "step": 1642 }, { "epoch": 5.625140112089672, "grad_norm": 3.6825830936431885, "learning_rate": 4.870879364444109e-07, "loss": 40.5467, "step": 1644 }, { "epoch": 5.631972244462236, "grad_norm": 5.199794769287109, "learning_rate": 4.692708133172991e-07, "loss": 39.4587, "step": 1646 }, { "epoch": 5.638804376834801, "grad_norm": 3.3388471603393555, "learning_rate": 4.517825684323324e-07, "loss": 39.1098, "step": 1648 }, { "epoch": 5.645636509207366, "grad_norm": 4.200729846954346, "learning_rate": 4.346234362506724e-07, "loss": 40.122, "step": 1650 }, { "epoch": 5.645636509207366, "eval_loss": 0.6662212014198303, "eval_runtime": 137.6293, "eval_samples_per_second": 28.664, "eval_steps_per_second": 7.171, "step": 1650 }, { "epoch": 5.65246864157993, "grad_norm": 3.9246127605438232, "learning_rate": 4.1779364682113796e-07, "loss": 40.0725, "step": 1652 }, { "epoch": 5.659300773952495, "grad_norm": 4.904084205627441, "learning_rate": 4.012934257771134e-07, "loss": 40.0188, "step": 1654 }, { "epoch": 5.6661329063250605, "grad_norm": 4.436688423156738, "learning_rate": 3.851229943335394e-07, "loss": 39.9216, "step": 1656 }, { "epoch": 5.672965038697625, "grad_norm": 4.027088642120361, "learning_rate": 3.6928256928393247e-07, "loss": 41.4124, "step": 1658 }, { "epoch": 5.67979717107019, "grad_norm": 3.796221971511841, "learning_rate": 3.537723629974815e-07, "loss": 39.8851, "step": 1660 }, { "epoch": 5.686629303442754, "grad_norm": 4.7540130615234375, "learning_rate": 3.3859258341621125e-07, "loss": 40.1716, "step": 1662 }, { "epoch": 5.693461435815319, "grad_norm": 4.521333694458008, "learning_rate": 3.237434340521789e-07, "loss": 41.4182, "step": 1664 }, { "epoch": 5.700293568187884, "grad_norm": 4.776477336883545, "learning_rate": 3.0922511398475683e-07, "loss": 41.2698, "step": 1666 }, { "epoch": 5.707125700560448, "grad_norm": 4.749114990234375, "learning_rate": 2.9503781785795713e-07, "loss": 42.4175, "step": 1668 }, { "epoch": 5.713957832933013, "grad_norm": 4.831925392150879, "learning_rate": 2.8118173587782516e-07, "loss": 40.593, "step": 1670 }, { "epoch": 5.720789965305578, "grad_norm": 4.17523193359375, "learning_rate": 2.6765705380989437e-07, "loss": 39.8755, "step": 1672 }, { "epoch": 5.727622097678142, "grad_norm": 4.183824062347412, "learning_rate": 2.544639529766829e-07, "loss": 40.7682, "step": 1674 }, { "epoch": 5.734454230050707, "grad_norm": 4.203549385070801, "learning_rate": 2.416026102552732e-07, "loss": 40.1932, "step": 1676 }, { "epoch": 5.741286362423272, "grad_norm": 4.252909183502197, "learning_rate": 2.290731980749361e-07, "loss": 41.4024, "step": 1678 }, { "epoch": 5.748118494795837, "grad_norm": 4.110680103302002, "learning_rate": 2.168758844148272e-07, "loss": 40.8089, "step": 1680 }, { "epoch": 5.754950627168402, "grad_norm": 4.860687732696533, "learning_rate": 2.050108328017164e-07, "loss": 41.278, "step": 1682 }, { "epoch": 5.761782759540966, "grad_norm": 7.037466526031494, "learning_rate": 1.93478202307823e-07, "loss": 42.0162, "step": 1684 }, { "epoch": 5.768614891913531, "grad_norm": 4.048498630523682, "learning_rate": 1.8227814754865068e-07, "loss": 41.2187, "step": 1686 }, { "epoch": 5.775447024286096, "grad_norm": 3.721379518508911, "learning_rate": 1.7141081868094212e-07, "loss": 41.8383, "step": 1688 }, { "epoch": 5.78227915665866, "grad_norm": 6.793107509613037, "learning_rate": 1.6087636140065532e-07, "loss": 40.5894, "step": 1690 }, { "epoch": 5.789111289031225, "grad_norm": 4.424513339996338, "learning_rate": 1.5067491694100154e-07, "loss": 41.2666, "step": 1692 }, { "epoch": 5.7959434214037895, "grad_norm": 4.707203388214111, "learning_rate": 1.4080662207056894e-07, "loss": 41.2405, "step": 1694 }, { "epoch": 5.802775553776354, "grad_norm": 2.994469165802002, "learning_rate": 1.3127160909147672e-07, "loss": 42.6466, "step": 1696 }, { "epoch": 5.809607686148919, "grad_norm": 3.029481887817383, "learning_rate": 1.220700058376073e-07, "loss": 40.642, "step": 1698 }, { "epoch": 5.816439818521484, "grad_norm": 3.4690332412719727, "learning_rate": 1.1320193567288529e-07, "loss": 41.02, "step": 1700 }, { "epoch": 5.816439818521484, "eval_loss": 0.6652334928512573, "eval_runtime": 134.4616, "eval_samples_per_second": 29.339, "eval_steps_per_second": 7.34, "step": 1700 }, { "epoch": 5.823271950894049, "grad_norm": 5.008721828460693, "learning_rate": 1.0466751748963444e-07, "loss": 40.1855, "step": 1702 }, { "epoch": 5.830104083266614, "grad_norm": 5.638387680053711, "learning_rate": 9.646686570697061e-08, "loss": 40.6194, "step": 1704 }, { "epoch": 5.836936215639178, "grad_norm": 5.234898567199707, "learning_rate": 8.860009026928629e-08, "loss": 40.6608, "step": 1706 }, { "epoch": 5.843768348011743, "grad_norm": 4.212846279144287, "learning_rate": 8.106729664475176e-08, "loss": 41.4097, "step": 1708 }, { "epoch": 5.8506004803843075, "grad_norm": 3.5884008407592773, "learning_rate": 7.386858582392187e-08, "loss": 39.4515, "step": 1710 }, { "epoch": 5.857432612756872, "grad_norm": 4.441662788391113, "learning_rate": 6.700405431837587e-08, "loss": 41.8026, "step": 1712 }, { "epoch": 5.864264745129437, "grad_norm": 5.290170192718506, "learning_rate": 6.047379415941856e-08, "loss": 40.8839, "step": 1714 }, { "epoch": 5.871096877502001, "grad_norm": 3.4507861137390137, "learning_rate": 5.4277892896853476e-08, "loss": 40.574, "step": 1716 }, { "epoch": 5.877929009874566, "grad_norm": 3.869871139526367, "learning_rate": 4.8416433597803234e-08, "loss": 41.8288, "step": 1718 }, { "epoch": 5.884761142247131, "grad_norm": 4.644185543060303, "learning_rate": 4.2889494845599344e-08, "loss": 41.318, "step": 1720 }, { "epoch": 5.891593274619696, "grad_norm": 3.191018581390381, "learning_rate": 3.769715073872748e-08, "loss": 41.1112, "step": 1722 }, { "epoch": 5.898425406992261, "grad_norm": 3.394134998321533, "learning_rate": 3.283947088983663e-08, "loss": 41.9932, "step": 1724 }, { "epoch": 5.9052575393648254, "grad_norm": 4.62444543838501, "learning_rate": 2.831652042480093e-08, "loss": 39.9583, "step": 1726 }, { "epoch": 5.91208967173739, "grad_norm": 4.27966833114624, "learning_rate": 2.4128359981850924e-08, "loss": 39.915, "step": 1728 }, { "epoch": 5.918921804109955, "grad_norm": 3.7036333084106445, "learning_rate": 2.0275045710760334e-08, "loss": 40.0384, "step": 1730 }, { "epoch": 5.925753936482519, "grad_norm": 5.249677658081055, "learning_rate": 1.6756629272085545e-08, "loss": 40.1564, "step": 1732 }, { "epoch": 5.932586068855084, "grad_norm": 4.477707862854004, "learning_rate": 1.3573157836485606e-08, "loss": 40.6008, "step": 1734 }, { "epoch": 5.939418201227649, "grad_norm": 4.939481258392334, "learning_rate": 1.0724674084083841e-08, "loss": 40.9639, "step": 1736 }, { "epoch": 5.946250333600213, "grad_norm": 2.9428999423980713, "learning_rate": 8.211216203890537e-09, "loss": 40.9722, "step": 1738 }, { "epoch": 5.953082465972778, "grad_norm": 4.589330673217773, "learning_rate": 6.032817893297793e-09, "loss": 41.4832, "step": 1740 }, { "epoch": 5.9599145983453425, "grad_norm": 5.4429450035095215, "learning_rate": 4.1895083576271035e-09, "loss": 41.8059, "step": 1742 }, { "epoch": 5.966746730717908, "grad_norm": 3.5152432918548584, "learning_rate": 2.681312309735229e-09, "loss": 41.2228, "step": 1744 }, { "epoch": 5.973578863090473, "grad_norm": 4.573424339294434, "learning_rate": 1.5082499696839059e-09, "loss": 41.9849, "step": 1746 }, { "epoch": 5.980410995463037, "grad_norm": 4.099581718444824, "learning_rate": 6.703370644706164e-10, "loss": 40.6948, "step": 1748 }, { "epoch": 5.987243127835602, "grad_norm": 4.090056896209717, "learning_rate": 1.6758482781209507e-10, "loss": 40.9226, "step": 1750 }, { "epoch": 5.987243127835602, "eval_loss": 0.6658891439437866, "eval_runtime": 134.1369, "eval_samples_per_second": 29.41, "eval_steps_per_second": 7.358, "step": 1750 }, { "epoch": 5.994075260208167, "grad_norm": 4.494061470031738, "learning_rate": 0.0, "loss": 41.0993, "step": 1752 } ], "logging_steps": 2, "max_steps": 1752, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.616163439072248e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }