diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6454 @@ +{ + "best_metric": 0.6652334928512573, + "best_model_checkpoint": "/l/users/visionlanguage/mostafa_ciai/hf_checkpoints_code_ciai_gemma2/checkpoint-1700", + "epoch": 5.994075260208167, + "eval_steps": 50, + "global_step": 1752, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006832132372564718, + "grad_norm": 93.82548522949219, + "learning_rate": 2.777777777777778e-06, + "loss": 208.4052, + "step": 2 + }, + { + "epoch": 0.013664264745129436, + "grad_norm": 65.51689147949219, + "learning_rate": 5.555555555555556e-06, + "loss": 194.4831, + "step": 4 + }, + { + "epoch": 0.020496397117694156, + "grad_norm": 30.816993713378906, + "learning_rate": 8.333333333333334e-06, + "loss": 159.6516, + "step": 6 + }, + { + "epoch": 0.027328529490258872, + "grad_norm": 30.113662719726562, + "learning_rate": 1.1111111111111112e-05, + "loss": 145.5557, + "step": 8 + }, + { + "epoch": 0.03416066186282359, + "grad_norm": 22.37295150756836, + "learning_rate": 1.388888888888889e-05, + "loss": 128.5444, + "step": 10 + }, + { + "epoch": 0.04099279423538831, + "grad_norm": 22.287870407104492, + "learning_rate": 1.6666666666666667e-05, + "loss": 116.2723, + "step": 12 + }, + { + "epoch": 0.04782492660795303, + "grad_norm": 16.027904510498047, + "learning_rate": 1.9444444444444445e-05, + "loss": 107.5451, + "step": 14 + }, + { + "epoch": 0.054657058980517745, + "grad_norm": 17.97212791442871, + "learning_rate": 2.2222222222222223e-05, + "loss": 100.7136, + "step": 16 + }, + { + "epoch": 0.061489191353082465, + "grad_norm": 15.427449226379395, + "learning_rate": 2.5e-05, + "loss": 96.4422, + "step": 18 + }, + { + "epoch": 0.06832132372564718, + "grad_norm": 11.836018562316895, + "learning_rate": 2.777777777777778e-05, + "loss": 89.9874, + "step": 20 + }, + { + "epoch": 0.0751534560982119, + "grad_norm": 13.170073509216309, + "learning_rate": 3.055555555555556e-05, + "loss": 90.5263, + "step": 22 + }, + { + "epoch": 0.08198558847077662, + "grad_norm": 12.781464576721191, + "learning_rate": 3.3333333333333335e-05, + "loss": 87.3144, + "step": 24 + }, + { + "epoch": 0.08881772084334134, + "grad_norm": 11.460458755493164, + "learning_rate": 3.611111111111111e-05, + "loss": 85.6209, + "step": 26 + }, + { + "epoch": 0.09564985321590606, + "grad_norm": 10.382000923156738, + "learning_rate": 3.888888888888889e-05, + "loss": 88.2803, + "step": 28 + }, + { + "epoch": 0.10248198558847077, + "grad_norm": 10.578895568847656, + "learning_rate": 4.166666666666667e-05, + "loss": 80.589, + "step": 30 + }, + { + "epoch": 0.10931411796103549, + "grad_norm": 10.231274604797363, + "learning_rate": 4.4444444444444447e-05, + "loss": 83.0791, + "step": 32 + }, + { + "epoch": 0.11614625033360021, + "grad_norm": 13.121459007263184, + "learning_rate": 4.722222222222222e-05, + "loss": 81.0775, + "step": 34 + }, + { + "epoch": 0.12297838270616493, + "grad_norm": 11.594988822937012, + "learning_rate": 5e-05, + "loss": 79.3985, + "step": 36 + }, + { + "epoch": 0.12981051507872965, + "grad_norm": 10.554534912109375, + "learning_rate": 4.9999832415172185e-05, + "loss": 78.9732, + "step": 38 + }, + { + "epoch": 0.13664264745129437, + "grad_norm": 9.661481857299805, + "learning_rate": 4.9999329662935534e-05, + "loss": 77.5229, + "step": 40 + }, + { + "epoch": 0.1434747798238591, + "grad_norm": 11.10251235961914, + "learning_rate": 4.9998491750030315e-05, + "loss": 77.7747, + "step": 42 + }, + { + "epoch": 0.1503069121964238, + "grad_norm": 9.058899879455566, + "learning_rate": 4.999731868769027e-05, + "loss": 79.2141, + "step": 44 + }, + { + "epoch": 0.15713904456898853, + "grad_norm": 9.254643440246582, + "learning_rate": 4.999581049164237e-05, + "loss": 77.5962, + "step": 46 + }, + { + "epoch": 0.16397117694155325, + "grad_norm": 10.37578010559082, + "learning_rate": 4.99939671821067e-05, + "loss": 76.6356, + "step": 48 + }, + { + "epoch": 0.17080330931411797, + "grad_norm": 9.983922004699707, + "learning_rate": 4.999178878379611e-05, + "loss": 76.0763, + "step": 50 + }, + { + "epoch": 0.17080330931411797, + "eval_loss": 1.20554518699646, + "eval_runtime": 119.3115, + "eval_samples_per_second": 33.065, + "eval_steps_per_second": 8.272, + "step": 50 + }, + { + "epoch": 0.1776354416866827, + "grad_norm": 9.109485626220703, + "learning_rate": 4.998927532591592e-05, + "loss": 75.2524, + "step": 52 + }, + { + "epoch": 0.1844675740592474, + "grad_norm": 8.939992904663086, + "learning_rate": 4.9986426842163515e-05, + "loss": 75.8614, + "step": 54 + }, + { + "epoch": 0.19129970643181213, + "grad_norm": 8.342733383178711, + "learning_rate": 4.9983243370727914e-05, + "loss": 72.864, + "step": 56 + }, + { + "epoch": 0.19813183880437685, + "grad_norm": 7.625518321990967, + "learning_rate": 4.9979724954289244e-05, + "loss": 75.7165, + "step": 58 + }, + { + "epoch": 0.20496397117694154, + "grad_norm": 6.545467853546143, + "learning_rate": 4.9975871640018154e-05, + "loss": 72.337, + "step": 60 + }, + { + "epoch": 0.21179610354950626, + "grad_norm": 8.73936939239502, + "learning_rate": 4.99716834795752e-05, + "loss": 73.0804, + "step": 62 + }, + { + "epoch": 0.21862823592207098, + "grad_norm": 7.599481105804443, + "learning_rate": 4.996716052911017e-05, + "loss": 71.3494, + "step": 64 + }, + { + "epoch": 0.2254603682946357, + "grad_norm": 8.88508415222168, + "learning_rate": 4.996230284926128e-05, + "loss": 73.4886, + "step": 66 + }, + { + "epoch": 0.23229250066720042, + "grad_norm": 7.141696453094482, + "learning_rate": 4.99571105051544e-05, + "loss": 73.0934, + "step": 68 + }, + { + "epoch": 0.23912463303976514, + "grad_norm": 8.946745872497559, + "learning_rate": 4.99515835664022e-05, + "loss": 70.5761, + "step": 70 + }, + { + "epoch": 0.24595676541232986, + "grad_norm": 7.428682804107666, + "learning_rate": 4.994572210710315e-05, + "loss": 69.8488, + "step": 72 + }, + { + "epoch": 0.2527888977848946, + "grad_norm": 10.490913391113281, + "learning_rate": 4.993952620584058e-05, + "loss": 72.1602, + "step": 74 + }, + { + "epoch": 0.2596210301574593, + "grad_norm": 6.010617733001709, + "learning_rate": 4.993299594568163e-05, + "loss": 70.0962, + "step": 76 + }, + { + "epoch": 0.26645316253002405, + "grad_norm": 5.207183361053467, + "learning_rate": 4.992613141417608e-05, + "loss": 70.6436, + "step": 78 + }, + { + "epoch": 0.27328529490258874, + "grad_norm": 7.816757678985596, + "learning_rate": 4.9918932703355256e-05, + "loss": 68.9464, + "step": 80 + }, + { + "epoch": 0.28011742727515343, + "grad_norm": 6.2263383865356445, + "learning_rate": 4.9911399909730714e-05, + "loss": 68.8249, + "step": 82 + }, + { + "epoch": 0.2869495596477182, + "grad_norm": 6.726258754730225, + "learning_rate": 4.990353313429303e-05, + "loss": 68.7637, + "step": 84 + }, + { + "epoch": 0.29378169202028287, + "grad_norm": 5.4038543701171875, + "learning_rate": 4.989533248251037e-05, + "loss": 68.7726, + "step": 86 + }, + { + "epoch": 0.3006138243928476, + "grad_norm": 9.256815910339355, + "learning_rate": 4.988679806432712e-05, + "loss": 68.2967, + "step": 88 + }, + { + "epoch": 0.3074459567654123, + "grad_norm": 7.765486717224121, + "learning_rate": 4.98779299941624e-05, + "loss": 70.6181, + "step": 90 + }, + { + "epoch": 0.31427808913797706, + "grad_norm": 7.625786304473877, + "learning_rate": 4.9868728390908526e-05, + "loss": 68.5738, + "step": 92 + }, + { + "epoch": 0.32111022151054175, + "grad_norm": 7.776100158691406, + "learning_rate": 4.985919337792944e-05, + "loss": 65.0074, + "step": 94 + }, + { + "epoch": 0.3279423538831065, + "grad_norm": 6.496335029602051, + "learning_rate": 4.9849325083059e-05, + "loss": 66.7343, + "step": 96 + }, + { + "epoch": 0.3347744862556712, + "grad_norm": 6.616697311401367, + "learning_rate": 4.983912363859935e-05, + "loss": 69.292, + "step": 98 + }, + { + "epoch": 0.34160661862823594, + "grad_norm": 7.259242057800293, + "learning_rate": 4.982858918131906e-05, + "loss": 66.8941, + "step": 100 + }, + { + "epoch": 0.34160661862823594, + "eval_loss": 1.0700218677520752, + "eval_runtime": 119.6843, + "eval_samples_per_second": 32.962, + "eval_steps_per_second": 8.247, + "step": 100 + }, + { + "epoch": 0.34843875100080063, + "grad_norm": 7.206521987915039, + "learning_rate": 4.981772185245135e-05, + "loss": 68.3145, + "step": 102 + }, + { + "epoch": 0.3552708833733654, + "grad_norm": 6.332549095153809, + "learning_rate": 4.980652179769218e-05, + "loss": 67.5062, + "step": 104 + }, + { + "epoch": 0.36210301574593007, + "grad_norm": 8.422966957092285, + "learning_rate": 4.979498916719828e-05, + "loss": 69.0426, + "step": 106 + }, + { + "epoch": 0.3689351481184948, + "grad_norm": 4.5074357986450195, + "learning_rate": 4.978312411558518e-05, + "loss": 66.0764, + "step": 108 + }, + { + "epoch": 0.3757672804910595, + "grad_norm": 6.847994327545166, + "learning_rate": 4.977092680192507e-05, + "loss": 68.0597, + "step": 110 + }, + { + "epoch": 0.38259941286362426, + "grad_norm": 9.010295867919922, + "learning_rate": 4.9758397389744734e-05, + "loss": 66.7856, + "step": 112 + }, + { + "epoch": 0.38943154523618895, + "grad_norm": 8.793087005615234, + "learning_rate": 4.9745536047023324e-05, + "loss": 66.6415, + "step": 114 + }, + { + "epoch": 0.3962636776087537, + "grad_norm": 6.820159912109375, + "learning_rate": 4.973234294619011e-05, + "loss": 66.8668, + "step": 116 + }, + { + "epoch": 0.4030958099813184, + "grad_norm": 10.739355087280273, + "learning_rate": 4.971881826412218e-05, + "loss": 64.5842, + "step": 118 + }, + { + "epoch": 0.4099279423538831, + "grad_norm": 6.451905727386475, + "learning_rate": 4.9704962182142044e-05, + "loss": 64.2948, + "step": 120 + }, + { + "epoch": 0.4167600747264478, + "grad_norm": 6.998046398162842, + "learning_rate": 4.9690774886015244e-05, + "loss": 66.095, + "step": 122 + }, + { + "epoch": 0.4235922070990125, + "grad_norm": 6.946700096130371, + "learning_rate": 4.967625656594782e-05, + "loss": 66.6205, + "step": 124 + }, + { + "epoch": 0.43042433947157727, + "grad_norm": 7.656089782714844, + "learning_rate": 4.966140741658379e-05, + "loss": 65.2253, + "step": 126 + }, + { + "epoch": 0.43725647184414196, + "grad_norm": 8.242254257202148, + "learning_rate": 4.9646227637002515e-05, + "loss": 65.4466, + "step": 128 + }, + { + "epoch": 0.4440886042167067, + "grad_norm": 6.5599894523620605, + "learning_rate": 4.963071743071607e-05, + "loss": 64.5302, + "step": 130 + }, + { + "epoch": 0.4509207365892714, + "grad_norm": 5.671536922454834, + "learning_rate": 4.961487700566646e-05, + "loss": 64.9711, + "step": 132 + }, + { + "epoch": 0.45775286896183615, + "grad_norm": 6.317226886749268, + "learning_rate": 4.9598706574222886e-05, + "loss": 66.1428, + "step": 134 + }, + { + "epoch": 0.46458500133440084, + "grad_norm": 7.731470584869385, + "learning_rate": 4.958220635317886e-05, + "loss": 65.6398, + "step": 136 + }, + { + "epoch": 0.4714171337069656, + "grad_norm": 7.070956230163574, + "learning_rate": 4.956537656374933e-05, + "loss": 64.027, + "step": 138 + }, + { + "epoch": 0.4782492660795303, + "grad_norm": 5.216205596923828, + "learning_rate": 4.9548217431567665e-05, + "loss": 64.9929, + "step": 140 + }, + { + "epoch": 0.485081398452095, + "grad_norm": 6.5882344245910645, + "learning_rate": 4.95307291866827e-05, + "loss": 66.2789, + "step": 142 + }, + { + "epoch": 0.4919135308246597, + "grad_norm": 5.5962934494018555, + "learning_rate": 4.95129120635556e-05, + "loss": 65.4516, + "step": 144 + }, + { + "epoch": 0.49874566319722446, + "grad_norm": 7.341054916381836, + "learning_rate": 4.949476630105669e-05, + "loss": 64.339, + "step": 146 + }, + { + "epoch": 0.5055777955697892, + "grad_norm": 7.5083441734313965, + "learning_rate": 4.9476292142462374e-05, + "loss": 62.7076, + "step": 148 + }, + { + "epoch": 0.5124099279423538, + "grad_norm": 5.081834316253662, + "learning_rate": 4.945748983545172e-05, + "loss": 64.2066, + "step": 150 + }, + { + "epoch": 0.5124099279423538, + "eval_loss": 0.9920685291290283, + "eval_runtime": 120.1858, + "eval_samples_per_second": 32.824, + "eval_steps_per_second": 8.212, + "step": 150 + }, + { + "epoch": 0.5192420603149186, + "grad_norm": 6.279696464538574, + "learning_rate": 4.943835963210324e-05, + "loss": 63.3412, + "step": 152 + }, + { + "epoch": 0.5260741926874833, + "grad_norm": 6.806802749633789, + "learning_rate": 4.941890178889149e-05, + "loss": 63.2038, + "step": 154 + }, + { + "epoch": 0.5329063250600481, + "grad_norm": 8.012312889099121, + "learning_rate": 4.939911656668361e-05, + "loss": 63.4725, + "step": 156 + }, + { + "epoch": 0.5397384574326127, + "grad_norm": 6.68613338470459, + "learning_rate": 4.937900423073585e-05, + "loss": 62.8267, + "step": 158 + }, + { + "epoch": 0.5465705898051775, + "grad_norm": 6.391062259674072, + "learning_rate": 4.9358565050689985e-05, + "loss": 63.4099, + "step": 160 + }, + { + "epoch": 0.5534027221777422, + "grad_norm": 6.4117817878723145, + "learning_rate": 4.933779930056975e-05, + "loss": 62.475, + "step": 162 + }, + { + "epoch": 0.5602348545503069, + "grad_norm": 10.238900184631348, + "learning_rate": 4.93167072587771e-05, + "loss": 62.3929, + "step": 164 + }, + { + "epoch": 0.5670669869228716, + "grad_norm": 6.800478935241699, + "learning_rate": 4.929528920808854e-05, + "loss": 63.4465, + "step": 166 + }, + { + "epoch": 0.5738991192954364, + "grad_norm": 6.688059329986572, + "learning_rate": 4.92735454356513e-05, + "loss": 62.3017, + "step": 168 + }, + { + "epoch": 0.5807312516680011, + "grad_norm": 5.010741710662842, + "learning_rate": 4.925147623297949e-05, + "loss": 61.5306, + "step": 170 + }, + { + "epoch": 0.5875633840405657, + "grad_norm": 6.061219215393066, + "learning_rate": 4.922908189595018e-05, + "loss": 63.5529, + "step": 172 + }, + { + "epoch": 0.5943955164131305, + "grad_norm": 7.6835126876831055, + "learning_rate": 4.920636272479946e-05, + "loss": 64.4077, + "step": 174 + }, + { + "epoch": 0.6012276487856952, + "grad_norm": 5.945671558380127, + "learning_rate": 4.9183319024118415e-05, + "loss": 64.3411, + "step": 176 + }, + { + "epoch": 0.60805978115826, + "grad_norm": 4.983694076538086, + "learning_rate": 4.915995110284901e-05, + "loss": 63.5529, + "step": 178 + }, + { + "epoch": 0.6148919135308246, + "grad_norm": 5.736062049865723, + "learning_rate": 4.9136259274279955e-05, + "loss": 63.7282, + "step": 180 + }, + { + "epoch": 0.6217240459033894, + "grad_norm": 6.8453545570373535, + "learning_rate": 4.911224385604255e-05, + "loss": 63.5027, + "step": 182 + }, + { + "epoch": 0.6285561782759541, + "grad_norm": 5.9253668785095215, + "learning_rate": 4.908790517010636e-05, + "loss": 60.5142, + "step": 184 + }, + { + "epoch": 0.6353883106485189, + "grad_norm": 5.743585586547852, + "learning_rate": 4.906324354277495e-05, + "loss": 62.4935, + "step": 186 + }, + { + "epoch": 0.6422204430210835, + "grad_norm": 4.686921119689941, + "learning_rate": 4.903825930468149e-05, + "loss": 60.8045, + "step": 188 + }, + { + "epoch": 0.6490525753936482, + "grad_norm": 5.350888729095459, + "learning_rate": 4.901295279078431e-05, + "loss": 62.3775, + "step": 190 + }, + { + "epoch": 0.655884707766213, + "grad_norm": 5.417562961578369, + "learning_rate": 4.898732434036244e-05, + "loss": 60.1095, + "step": 192 + }, + { + "epoch": 0.6627168401387777, + "grad_norm": 5.238453388214111, + "learning_rate": 4.896137429701102e-05, + "loss": 62.8943, + "step": 194 + }, + { + "epoch": 0.6695489725113424, + "grad_norm": 6.252527713775635, + "learning_rate": 4.893510300863676e-05, + "loss": 61.1666, + "step": 196 + }, + { + "epoch": 0.6763811048839071, + "grad_norm": 5.860842704772949, + "learning_rate": 4.890851082745319e-05, + "loss": 62.6643, + "step": 198 + }, + { + "epoch": 0.6832132372564719, + "grad_norm": 6.3946099281311035, + "learning_rate": 4.8881598109976004e-05, + "loss": 61.939, + "step": 200 + }, + { + "epoch": 0.6832132372564719, + "eval_loss": 0.9664058685302734, + "eval_runtime": 119.3157, + "eval_samples_per_second": 33.064, + "eval_steps_per_second": 8.272, + "step": 200 + }, + { + "epoch": 0.6900453696290365, + "grad_norm": 5.909948825836182, + "learning_rate": 4.885436521701824e-05, + "loss": 63.9172, + "step": 202 + }, + { + "epoch": 0.6968775020016013, + "grad_norm": 6.600235462188721, + "learning_rate": 4.8826812513685487e-05, + "loss": 60.6396, + "step": 204 + }, + { + "epoch": 0.703709634374166, + "grad_norm": 5.97224235534668, + "learning_rate": 4.8798940369370944e-05, + "loss": 61.1365, + "step": 206 + }, + { + "epoch": 0.7105417667467308, + "grad_norm": 5.521954536437988, + "learning_rate": 4.877074915775049e-05, + "loss": 61.9178, + "step": 208 + }, + { + "epoch": 0.7173738991192954, + "grad_norm": 4.756962299346924, + "learning_rate": 4.8742239256777674e-05, + "loss": 60.0003, + "step": 210 + }, + { + "epoch": 0.7242060314918601, + "grad_norm": 7.966216564178467, + "learning_rate": 4.8713411048678635e-05, + "loss": 60.3937, + "step": 212 + }, + { + "epoch": 0.7310381638644249, + "grad_norm": 5.864863872528076, + "learning_rate": 4.868426491994702e-05, + "loss": 60.5208, + "step": 214 + }, + { + "epoch": 0.7378702962369896, + "grad_norm": 4.952422142028809, + "learning_rate": 4.865480126133872e-05, + "loss": 61.4458, + "step": 216 + }, + { + "epoch": 0.7447024286095543, + "grad_norm": 4.522135257720947, + "learning_rate": 4.862502046786671e-05, + "loss": 62.5035, + "step": 218 + }, + { + "epoch": 0.751534560982119, + "grad_norm": 4.29464054107666, + "learning_rate": 4.859492293879574e-05, + "loss": 61.5825, + "step": 220 + }, + { + "epoch": 0.7583666933546838, + "grad_norm": 5.789974212646484, + "learning_rate": 4.856450907763693e-05, + "loss": 59.9352, + "step": 222 + }, + { + "epoch": 0.7651988257272485, + "grad_norm": 6.44216251373291, + "learning_rate": 4.853377929214243e-05, + "loss": 59.1637, + "step": 224 + }, + { + "epoch": 0.7720309580998131, + "grad_norm": 4.520390033721924, + "learning_rate": 4.85027339942999e-05, + "loss": 60.4813, + "step": 226 + }, + { + "epoch": 0.7788630904723779, + "grad_norm": 6.058870315551758, + "learning_rate": 4.8471373600326996e-05, + "loss": 60.2968, + "step": 228 + }, + { + "epoch": 0.7856952228449426, + "grad_norm": 5.945502281188965, + "learning_rate": 4.843969853066584e-05, + "loss": 58.2098, + "step": 230 + }, + { + "epoch": 0.7925273552175074, + "grad_norm": 4.318876266479492, + "learning_rate": 4.8407709209977305e-05, + "loss": 58.4711, + "step": 232 + }, + { + "epoch": 0.799359487590072, + "grad_norm": 5.385821342468262, + "learning_rate": 4.837540606713538e-05, + "loss": 59.5379, + "step": 234 + }, + { + "epoch": 0.8061916199626368, + "grad_norm": 6.59214973449707, + "learning_rate": 4.834278953522138e-05, + "loss": 58.4163, + "step": 236 + }, + { + "epoch": 0.8130237523352015, + "grad_norm": 5.087238311767578, + "learning_rate": 4.8309860051518204e-05, + "loss": 60.5546, + "step": 238 + }, + { + "epoch": 0.8198558847077662, + "grad_norm": 6.804642200469971, + "learning_rate": 4.8276618057504376e-05, + "loss": 59.0874, + "step": 240 + }, + { + "epoch": 0.8266880170803309, + "grad_norm": 5.035391330718994, + "learning_rate": 4.824306399884822e-05, + "loss": 59.9545, + "step": 242 + }, + { + "epoch": 0.8335201494528957, + "grad_norm": 5.837290287017822, + "learning_rate": 4.8209198325401815e-05, + "loss": 59.5963, + "step": 244 + }, + { + "epoch": 0.8403522818254604, + "grad_norm": 4.17293643951416, + "learning_rate": 4.817502149119502e-05, + "loss": 59.7065, + "step": 246 + }, + { + "epoch": 0.847184414198025, + "grad_norm": 4.964944362640381, + "learning_rate": 4.8140533954429327e-05, + "loss": 59.5358, + "step": 248 + }, + { + "epoch": 0.8540165465705898, + "grad_norm": 6.021297931671143, + "learning_rate": 4.810573617747178e-05, + "loss": 60.6391, + "step": 250 + }, + { + "epoch": 0.8540165465705898, + "eval_loss": 0.9407148361206055, + "eval_runtime": 119.9595, + "eval_samples_per_second": 32.886, + "eval_steps_per_second": 8.228, + "step": 250 + }, + { + "epoch": 0.8608486789431545, + "grad_norm": 5.707021713256836, + "learning_rate": 4.8070628626848735e-05, + "loss": 61.5872, + "step": 252 + }, + { + "epoch": 0.8676808113157193, + "grad_norm": 4.725375652313232, + "learning_rate": 4.803521177323962e-05, + "loss": 59.2192, + "step": 254 + }, + { + "epoch": 0.8745129436882839, + "grad_norm": 23.445714950561523, + "learning_rate": 4.799948609147061e-05, + "loss": 60.1762, + "step": 256 + }, + { + "epoch": 0.8813450760608487, + "grad_norm": 5.503020286560059, + "learning_rate": 4.796345206050829e-05, + "loss": 62.2226, + "step": 258 + }, + { + "epoch": 0.8881772084334134, + "grad_norm": 6.558228015899658, + "learning_rate": 4.792711016345321e-05, + "loss": 62.089, + "step": 260 + }, + { + "epoch": 0.8950093408059782, + "grad_norm": 8.109895706176758, + "learning_rate": 4.7890460887533417e-05, + "loss": 60.7872, + "step": 262 + }, + { + "epoch": 0.9018414731785428, + "grad_norm": 5.230234622955322, + "learning_rate": 4.785350472409792e-05, + "loss": 57.9312, + "step": 264 + }, + { + "epoch": 0.9086736055511075, + "grad_norm": 6.669562339782715, + "learning_rate": 4.7816242168610093e-05, + "loss": 61.7966, + "step": 266 + }, + { + "epoch": 0.9155057379236723, + "grad_norm": 5.428192615509033, + "learning_rate": 4.777867372064105e-05, + "loss": 58.4551, + "step": 268 + }, + { + "epoch": 0.922337870296237, + "grad_norm": 5.6168131828308105, + "learning_rate": 4.774079988386296e-05, + "loss": 59.9015, + "step": 270 + }, + { + "epoch": 0.9291700026688017, + "grad_norm": 5.785460948944092, + "learning_rate": 4.770262116604224e-05, + "loss": 59.723, + "step": 272 + }, + { + "epoch": 0.9360021350413664, + "grad_norm": 8.77035140991211, + "learning_rate": 4.76641380790328e-05, + "loss": 60.8996, + "step": 274 + }, + { + "epoch": 0.9428342674139312, + "grad_norm": 4.000178813934326, + "learning_rate": 4.762535113876917e-05, + "loss": 59.2908, + "step": 276 + }, + { + "epoch": 0.9496663997864959, + "grad_norm": 5.8565826416015625, + "learning_rate": 4.758626086525956e-05, + "loss": 59.296, + "step": 278 + }, + { + "epoch": 0.9564985321590606, + "grad_norm": 6.792466163635254, + "learning_rate": 4.754686778257891e-05, + "loss": 58.351, + "step": 280 + }, + { + "epoch": 0.9633306645316253, + "grad_norm": 6.484628677368164, + "learning_rate": 4.750717241886185e-05, + "loss": 58.46, + "step": 282 + }, + { + "epoch": 0.97016279690419, + "grad_norm": 5.421430587768555, + "learning_rate": 4.7467175306295655e-05, + "loss": 59.0205, + "step": 284 + }, + { + "epoch": 0.9769949292767547, + "grad_norm": 4.550335884094238, + "learning_rate": 4.7426876981113044e-05, + "loss": 60.8234, + "step": 286 + }, + { + "epoch": 0.9838270616493194, + "grad_norm": 5.412383079528809, + "learning_rate": 4.738627798358506e-05, + "loss": 57.3651, + "step": 288 + }, + { + "epoch": 0.9906591940218842, + "grad_norm": 5.225856781005859, + "learning_rate": 4.7345378858013776e-05, + "loss": 58.8522, + "step": 290 + }, + { + "epoch": 0.9974913263944489, + "grad_norm": 3.856189250946045, + "learning_rate": 4.730418015272503e-05, + "loss": 59.7945, + "step": 292 + }, + { + "epoch": 1.0034160661862823, + "grad_norm": 6.19010066986084, + "learning_rate": 4.726268242006106e-05, + "loss": 50.2722, + "step": 294 + }, + { + "epoch": 1.0102481985588472, + "grad_norm": 5.333181858062744, + "learning_rate": 4.722088621637309e-05, + "loss": 58.7285, + "step": 296 + }, + { + "epoch": 1.0170803309314118, + "grad_norm": 5.93973970413208, + "learning_rate": 4.717879210201389e-05, + "loss": 57.2823, + "step": 298 + }, + { + "epoch": 1.0239124633039765, + "grad_norm": 4.59360408782959, + "learning_rate": 4.713640064133025e-05, + "loss": 58.4687, + "step": 300 + }, + { + "epoch": 1.0239124633039765, + "eval_loss": 0.9195547699928284, + "eval_runtime": 119.3076, + "eval_samples_per_second": 33.066, + "eval_steps_per_second": 8.273, + "step": 300 + }, + { + "epoch": 1.0307445956765413, + "grad_norm": 5.437332630157471, + "learning_rate": 4.7093712402655427e-05, + "loss": 57.7491, + "step": 302 + }, + { + "epoch": 1.037576728049106, + "grad_norm": 4.938009738922119, + "learning_rate": 4.7050727958301506e-05, + "loss": 58.2642, + "step": 304 + }, + { + "epoch": 1.0444088604216706, + "grad_norm": 5.104777812957764, + "learning_rate": 4.7007447884551745e-05, + "loss": 56.1312, + "step": 306 + }, + { + "epoch": 1.0512409927942354, + "grad_norm": 5.78248405456543, + "learning_rate": 4.6963872761652835e-05, + "loss": 56.9488, + "step": 308 + }, + { + "epoch": 1.0580731251668, + "grad_norm": 4.8224287033081055, + "learning_rate": 4.692000317380715e-05, + "loss": 56.6993, + "step": 310 + }, + { + "epoch": 1.064905257539365, + "grad_norm": 4.517540454864502, + "learning_rate": 4.687583970916487e-05, + "loss": 58.8636, + "step": 312 + }, + { + "epoch": 1.0717373899119296, + "grad_norm": 5.353949069976807, + "learning_rate": 4.683138295981611e-05, + "loss": 58.6762, + "step": 314 + }, + { + "epoch": 1.0785695222844942, + "grad_norm": 6.164919376373291, + "learning_rate": 4.678663352178301e-05, + "loss": 57.9218, + "step": 316 + }, + { + "epoch": 1.085401654657059, + "grad_norm": 4.577470302581787, + "learning_rate": 4.674159199501173e-05, + "loss": 58.1644, + "step": 318 + }, + { + "epoch": 1.0922337870296237, + "grad_norm": 6.5861592292785645, + "learning_rate": 4.6696258983364385e-05, + "loss": 57.3447, + "step": 320 + }, + { + "epoch": 1.0990659194021883, + "grad_norm": 4.327467918395996, + "learning_rate": 4.665063509461097e-05, + "loss": 57.2627, + "step": 322 + }, + { + "epoch": 1.1058980517747532, + "grad_norm": 7.534716606140137, + "learning_rate": 4.660472094042121e-05, + "loss": 57.2099, + "step": 324 + }, + { + "epoch": 1.1127301841473178, + "grad_norm": 5.549008369445801, + "learning_rate": 4.655851713635635e-05, + "loss": 58.4564, + "step": 326 + }, + { + "epoch": 1.1195623165198825, + "grad_norm": 4.385070323944092, + "learning_rate": 4.651202430186092e-05, + "loss": 57.0019, + "step": 328 + }, + { + "epoch": 1.1263944488924473, + "grad_norm": 4.763044357299805, + "learning_rate": 4.6465243060254415e-05, + "loss": 55.7849, + "step": 330 + }, + { + "epoch": 1.133226581265012, + "grad_norm": 3.9461379051208496, + "learning_rate": 4.641817403872293e-05, + "loss": 56.2399, + "step": 332 + }, + { + "epoch": 1.1400587136375768, + "grad_norm": 4.946137428283691, + "learning_rate": 4.637081786831079e-05, + "loss": 56.7089, + "step": 334 + }, + { + "epoch": 1.1468908460101415, + "grad_norm": 5.664731025695801, + "learning_rate": 4.6323175183912024e-05, + "loss": 57.1022, + "step": 336 + }, + { + "epoch": 1.153722978382706, + "grad_norm": 5.261230945587158, + "learning_rate": 4.627524662426194e-05, + "loss": 56.3552, + "step": 338 + }, + { + "epoch": 1.160555110755271, + "grad_norm": 4.166741847991943, + "learning_rate": 4.6227032831928484e-05, + "loss": 56.888, + "step": 340 + }, + { + "epoch": 1.1673872431278356, + "grad_norm": 6.015218734741211, + "learning_rate": 4.6178534453303666e-05, + "loss": 57.3006, + "step": 342 + }, + { + "epoch": 1.1742193755004002, + "grad_norm": 6.349710941314697, + "learning_rate": 4.6129752138594874e-05, + "loss": 57.0208, + "step": 344 + }, + { + "epoch": 1.181051507872965, + "grad_norm": 5.403022766113281, + "learning_rate": 4.608068654181617e-05, + "loss": 57.0645, + "step": 346 + }, + { + "epoch": 1.1878836402455297, + "grad_norm": 6.523670673370361, + "learning_rate": 4.6031338320779534e-05, + "loss": 58.2164, + "step": 348 + }, + { + "epoch": 1.1947157726180944, + "grad_norm": 6.369359970092773, + "learning_rate": 4.5981708137086e-05, + "loss": 56.7965, + "step": 350 + }, + { + "epoch": 1.1947157726180944, + "eval_loss": 0.8986765146255493, + "eval_runtime": 119.0222, + "eval_samples_per_second": 33.145, + "eval_steps_per_second": 8.293, + "step": 350 + }, + { + "epoch": 1.2015479049906592, + "grad_norm": 5.050749778747559, + "learning_rate": 4.5931796656116846e-05, + "loss": 56.7828, + "step": 352 + }, + { + "epoch": 1.2083800373632239, + "grad_norm": 5.341484069824219, + "learning_rate": 4.588160454702462e-05, + "loss": 57.4058, + "step": 354 + }, + { + "epoch": 1.2152121697357887, + "grad_norm": 4.554074287414551, + "learning_rate": 4.5831132482724195e-05, + "loss": 57.6257, + "step": 356 + }, + { + "epoch": 1.2220443021083534, + "grad_norm": 4.951889514923096, + "learning_rate": 4.578038113988376e-05, + "loss": 56.0608, + "step": 358 + }, + { + "epoch": 1.228876434480918, + "grad_norm": 4.2526421546936035, + "learning_rate": 4.572935119891571e-05, + "loss": 55.8586, + "step": 360 + }, + { + "epoch": 1.2357085668534828, + "grad_norm": 4.805353164672852, + "learning_rate": 4.5678043343967554e-05, + "loss": 59.2427, + "step": 362 + }, + { + "epoch": 1.2425406992260475, + "grad_norm": 4.9927978515625, + "learning_rate": 4.5626458262912745e-05, + "loss": 55.1494, + "step": 364 + }, + { + "epoch": 1.2493728315986123, + "grad_norm": 5.778275012969971, + "learning_rate": 4.557459664734141e-05, + "loss": 55.9791, + "step": 366 + }, + { + "epoch": 1.256204963971177, + "grad_norm": 4.41555643081665, + "learning_rate": 4.552245919255117e-05, + "loss": 57.3123, + "step": 368 + }, + { + "epoch": 1.2630370963437416, + "grad_norm": 5.230330944061279, + "learning_rate": 4.5470046597537735e-05, + "loss": 55.9031, + "step": 370 + }, + { + "epoch": 1.2698692287163063, + "grad_norm": 3.9548189640045166, + "learning_rate": 4.541735956498554e-05, + "loss": 56.6997, + "step": 372 + }, + { + "epoch": 1.2767013610888711, + "grad_norm": 5.017361640930176, + "learning_rate": 4.5364398801258396e-05, + "loss": 57.3268, + "step": 374 + }, + { + "epoch": 1.2835334934614357, + "grad_norm": 5.562941074371338, + "learning_rate": 4.5311165016389916e-05, + "loss": 55.6271, + "step": 376 + }, + { + "epoch": 1.2903656258340006, + "grad_norm": 6.675297737121582, + "learning_rate": 4.525765892407409e-05, + "loss": 55.9593, + "step": 378 + }, + { + "epoch": 1.2971977582065652, + "grad_norm": 6.47582483291626, + "learning_rate": 4.5203881241655644e-05, + "loss": 57.0788, + "step": 380 + }, + { + "epoch": 1.3040298905791299, + "grad_norm": 5.157675743103027, + "learning_rate": 4.514983269012049e-05, + "loss": 56.3623, + "step": 382 + }, + { + "epoch": 1.3108620229516947, + "grad_norm": 8.075702667236328, + "learning_rate": 4.509551399408598e-05, + "loss": 55.6531, + "step": 384 + }, + { + "epoch": 1.3176941553242594, + "grad_norm": 3.849310874938965, + "learning_rate": 4.504092588179128e-05, + "loss": 58.7546, + "step": 386 + }, + { + "epoch": 1.3245262876968242, + "grad_norm": 3.6027579307556152, + "learning_rate": 4.498606908508754e-05, + "loss": 57.7153, + "step": 388 + }, + { + "epoch": 1.3313584200693889, + "grad_norm": 5.139729976654053, + "learning_rate": 4.4930944339428085e-05, + "loss": 56.4532, + "step": 390 + }, + { + "epoch": 1.3381905524419535, + "grad_norm": 5.337704181671143, + "learning_rate": 4.487555238385862e-05, + "loss": 54.2958, + "step": 392 + }, + { + "epoch": 1.3450226848145181, + "grad_norm": 3.3229618072509766, + "learning_rate": 4.481989396100724e-05, + "loss": 54.2046, + "step": 394 + }, + { + "epoch": 1.351854817187083, + "grad_norm": 5.2183074951171875, + "learning_rate": 4.476396981707453e-05, + "loss": 56.0147, + "step": 396 + }, + { + "epoch": 1.3586869495596476, + "grad_norm": 5.028941631317139, + "learning_rate": 4.470778070182353e-05, + "loss": 54.3446, + "step": 398 + }, + { + "epoch": 1.3655190819322125, + "grad_norm": 6.347212791442871, + "learning_rate": 4.465132736856969e-05, + "loss": 56.7659, + "step": 400 + }, + { + "epoch": 1.3655190819322125, + "eval_loss": 0.8771227598190308, + "eval_runtime": 118.9477, + "eval_samples_per_second": 33.166, + "eval_steps_per_second": 8.298, + "step": 400 + }, + { + "epoch": 1.3723512143047771, + "grad_norm": 9.381309509277344, + "learning_rate": 4.459461057417078e-05, + "loss": 56.8099, + "step": 402 + }, + { + "epoch": 1.3791833466773418, + "grad_norm": 5.657813549041748, + "learning_rate": 4.453763107901675e-05, + "loss": 56.3326, + "step": 404 + }, + { + "epoch": 1.3860154790499066, + "grad_norm": 4.476396083831787, + "learning_rate": 4.4480389647019505e-05, + "loss": 57.3978, + "step": 406 + }, + { + "epoch": 1.3928476114224713, + "grad_norm": 5.402798652648926, + "learning_rate": 4.442288704560268e-05, + "loss": 55.7143, + "step": 408 + }, + { + "epoch": 1.3996797437950361, + "grad_norm": 4.367002010345459, + "learning_rate": 4.436512404569136e-05, + "loss": 55.7044, + "step": 410 + }, + { + "epoch": 1.4065118761676008, + "grad_norm": 5.653073310852051, + "learning_rate": 4.430710142170176e-05, + "loss": 55.7266, + "step": 412 + }, + { + "epoch": 1.4133440085401654, + "grad_norm": 7.221829414367676, + "learning_rate": 4.424881995153076e-05, + "loss": 56.4174, + "step": 414 + }, + { + "epoch": 1.4201761409127303, + "grad_norm": 5.465057373046875, + "learning_rate": 4.419028041654559e-05, + "loss": 56.9093, + "step": 416 + }, + { + "epoch": 1.427008273285295, + "grad_norm": 8.383552551269531, + "learning_rate": 4.4131483601573285e-05, + "loss": 56.0841, + "step": 418 + }, + { + "epoch": 1.4338404056578598, + "grad_norm": 4.208652973175049, + "learning_rate": 4.4072430294890174e-05, + "loss": 57.5786, + "step": 420 + }, + { + "epoch": 1.4406725380304244, + "grad_norm": 5.773376941680908, + "learning_rate": 4.4013121288211307e-05, + "loss": 55.8851, + "step": 422 + }, + { + "epoch": 1.447504670402989, + "grad_norm": 5.354812145233154, + "learning_rate": 4.3953557376679856e-05, + "loss": 55.1571, + "step": 424 + }, + { + "epoch": 1.4543368027755537, + "grad_norm": 4.6360039710998535, + "learning_rate": 4.389373935885646e-05, + "loss": 54.0095, + "step": 426 + }, + { + "epoch": 1.4611689351481185, + "grad_norm": 7.125521183013916, + "learning_rate": 4.383366803670849e-05, + "loss": 56.645, + "step": 428 + }, + { + "epoch": 1.4680010675206832, + "grad_norm": 6.071737766265869, + "learning_rate": 4.377334421559932e-05, + "loss": 55.3209, + "step": 430 + }, + { + "epoch": 1.474833199893248, + "grad_norm": 4.569766998291016, + "learning_rate": 4.371276870427753e-05, + "loss": 54.6604, + "step": 432 + }, + { + "epoch": 1.4816653322658127, + "grad_norm": 5.426764965057373, + "learning_rate": 4.365194231486604e-05, + "loss": 56.4116, + "step": 434 + }, + { + "epoch": 1.4884974646383773, + "grad_norm": 5.6092023849487305, + "learning_rate": 4.359086586285127e-05, + "loss": 56.0268, + "step": 436 + }, + { + "epoch": 1.4953295970109421, + "grad_norm": 6.140939712524414, + "learning_rate": 4.3529540167072126e-05, + "loss": 54.886, + "step": 438 + }, + { + "epoch": 1.5021617293835068, + "grad_norm": 4.043739318847656, + "learning_rate": 4.346796604970912e-05, + "loss": 56.6431, + "step": 440 + }, + { + "epoch": 1.5089938617560716, + "grad_norm": 3.8898212909698486, + "learning_rate": 4.340614433627328e-05, + "loss": 55.6492, + "step": 442 + }, + { + "epoch": 1.5158259941286363, + "grad_norm": 6.158950328826904, + "learning_rate": 4.3344075855595104e-05, + "loss": 55.6869, + "step": 444 + }, + { + "epoch": 1.522658126501201, + "grad_norm": 3.874180316925049, + "learning_rate": 4.328176143981343e-05, + "loss": 53.7981, + "step": 446 + }, + { + "epoch": 1.5294902588737656, + "grad_norm": 4.068581581115723, + "learning_rate": 4.321920192436433e-05, + "loss": 54.6618, + "step": 448 + }, + { + "epoch": 1.5363223912463304, + "grad_norm": 4.552149295806885, + "learning_rate": 4.315639814796983e-05, + "loss": 55.1642, + "step": 450 + }, + { + "epoch": 1.5363223912463304, + "eval_loss": 0.8704175353050232, + "eval_runtime": 119.5049, + "eval_samples_per_second": 33.011, + "eval_steps_per_second": 8.259, + "step": 450 + }, + { + "epoch": 1.5431545236188953, + "grad_norm": 4.1831374168396, + "learning_rate": 4.309335095262676e-05, + "loss": 53.2926, + "step": 452 + }, + { + "epoch": 1.54998665599146, + "grad_norm": 4.456052780151367, + "learning_rate": 4.303006118359537e-05, + "loss": 53.6038, + "step": 454 + }, + { + "epoch": 1.5568187883640245, + "grad_norm": 17.7099609375, + "learning_rate": 4.296652968938807e-05, + "loss": 54.9325, + "step": 456 + }, + { + "epoch": 1.5636509207365892, + "grad_norm": 8.005233764648438, + "learning_rate": 4.2902757321758016e-05, + "loss": 53.7884, + "step": 458 + }, + { + "epoch": 1.570483053109154, + "grad_norm": 5.034004211425781, + "learning_rate": 4.283874493568772e-05, + "loss": 53.2575, + "step": 460 + }, + { + "epoch": 1.5773151854817187, + "grad_norm": 4.005930423736572, + "learning_rate": 4.2774493389377545e-05, + "loss": 55.4554, + "step": 462 + }, + { + "epoch": 1.5841473178542835, + "grad_norm": 5.812296390533447, + "learning_rate": 4.271000354423426e-05, + "loss": 56.7008, + "step": 464 + }, + { + "epoch": 1.5909794502268482, + "grad_norm": 6.425695896148682, + "learning_rate": 4.2645276264859394e-05, + "loss": 56.8804, + "step": 466 + }, + { + "epoch": 1.5978115825994128, + "grad_norm": 4.44102144241333, + "learning_rate": 4.258031241903778e-05, + "loss": 54.2011, + "step": 468 + }, + { + "epoch": 1.6046437149719774, + "grad_norm": 4.444553852081299, + "learning_rate": 4.251511287772579e-05, + "loss": 54.9826, + "step": 470 + }, + { + "epoch": 1.6114758473445423, + "grad_norm": 3.8157808780670166, + "learning_rate": 4.2449678515039747e-05, + "loss": 55.2601, + "step": 472 + }, + { + "epoch": 1.6183079797171072, + "grad_norm": 6.47904634475708, + "learning_rate": 4.238401020824416e-05, + "loss": 54.5978, + "step": 474 + }, + { + "epoch": 1.6251401120896718, + "grad_norm": 5.010526180267334, + "learning_rate": 4.231810883773999e-05, + "loss": 56.0995, + "step": 476 + }, + { + "epoch": 1.6319722444622364, + "grad_norm": 5.843505382537842, + "learning_rate": 4.2251975287052804e-05, + "loss": 54.0241, + "step": 478 + }, + { + "epoch": 1.638804376834801, + "grad_norm": 4.549996852874756, + "learning_rate": 4.218561044282099e-05, + "loss": 56.3071, + "step": 480 + }, + { + "epoch": 1.645636509207366, + "grad_norm": 4.20985221862793, + "learning_rate": 4.211901519478382e-05, + "loss": 54.3977, + "step": 482 + }, + { + "epoch": 1.6524686415799306, + "grad_norm": 5.491010665893555, + "learning_rate": 4.2052190435769554e-05, + "loss": 53.1375, + "step": 484 + }, + { + "epoch": 1.6593007739524954, + "grad_norm": 4.417302131652832, + "learning_rate": 4.198513706168345e-05, + "loss": 53.959, + "step": 486 + }, + { + "epoch": 1.66613290632506, + "grad_norm": 5.39029598236084, + "learning_rate": 4.191785597149577e-05, + "loss": 54.5638, + "step": 488 + }, + { + "epoch": 1.6729650386976247, + "grad_norm": 4.233526229858398, + "learning_rate": 4.1850348067229696e-05, + "loss": 54.6384, + "step": 490 + }, + { + "epoch": 1.6797971710701893, + "grad_norm": 6.301634311676025, + "learning_rate": 4.178261425394926e-05, + "loss": 55.1738, + "step": 492 + }, + { + "epoch": 1.6866293034427542, + "grad_norm": 5.9507246017456055, + "learning_rate": 4.171465543974723e-05, + "loss": 54.7009, + "step": 494 + }, + { + "epoch": 1.693461435815319, + "grad_norm": 5.033243656158447, + "learning_rate": 4.1646472535732895e-05, + "loss": 54.3154, + "step": 496 + }, + { + "epoch": 1.7002935681878837, + "grad_norm": 4.675721168518066, + "learning_rate": 4.157806645601988e-05, + "loss": 54.1507, + "step": 498 + }, + { + "epoch": 1.7071257005604483, + "grad_norm": 3.5945537090301514, + "learning_rate": 4.1509438117713866e-05, + "loss": 52.2103, + "step": 500 + }, + { + "epoch": 1.7071257005604483, + "eval_loss": 0.8516557216644287, + "eval_runtime": 119.4754, + "eval_samples_per_second": 33.019, + "eval_steps_per_second": 8.261, + "step": 500 + }, + { + "epoch": 1.713957832933013, + "grad_norm": 4.187085151672363, + "learning_rate": 4.144058844090032e-05, + "loss": 54.1474, + "step": 502 + }, + { + "epoch": 1.7207899653055778, + "grad_norm": 3.818648099899292, + "learning_rate": 4.137151834863213e-05, + "loss": 55.5711, + "step": 504 + }, + { + "epoch": 1.7276220976781427, + "grad_norm": 5.919620513916016, + "learning_rate": 4.130222876691726e-05, + "loss": 54.3803, + "step": 506 + }, + { + "epoch": 1.7344542300507073, + "grad_norm": 5.772305011749268, + "learning_rate": 4.123272062470633e-05, + "loss": 53.9454, + "step": 508 + }, + { + "epoch": 1.741286362423272, + "grad_norm": 4.569563865661621, + "learning_rate": 4.116299485388014e-05, + "loss": 53.5009, + "step": 510 + }, + { + "epoch": 1.7481184947958366, + "grad_norm": 4.183293342590332, + "learning_rate": 4.109305238923718e-05, + "loss": 52.9927, + "step": 512 + }, + { + "epoch": 1.7549506271684012, + "grad_norm": 4.4316301345825195, + "learning_rate": 4.102289416848114e-05, + "loss": 54.5023, + "step": 514 + }, + { + "epoch": 1.761782759540966, + "grad_norm": 14.234251976013184, + "learning_rate": 4.095252113220827e-05, + "loss": 53.1473, + "step": 516 + }, + { + "epoch": 1.768614891913531, + "grad_norm": 4.889795780181885, + "learning_rate": 4.088193422389484e-05, + "loss": 53.7265, + "step": 518 + }, + { + "epoch": 1.7754470242860956, + "grad_norm": 3.02785325050354, + "learning_rate": 4.0811134389884433e-05, + "loss": 52.5917, + "step": 520 + }, + { + "epoch": 1.7822791566586602, + "grad_norm": 5.794788360595703, + "learning_rate": 4.0740122579375286e-05, + "loss": 55.4619, + "step": 522 + }, + { + "epoch": 1.7891112890312248, + "grad_norm": 4.442338466644287, + "learning_rate": 4.066889974440757e-05, + "loss": 53.7709, + "step": 524 + }, + { + "epoch": 1.7959434214037897, + "grad_norm": 4.7714715003967285, + "learning_rate": 4.0597466839850595e-05, + "loss": 54.16, + "step": 526 + }, + { + "epoch": 1.8027755537763546, + "grad_norm": 4.7263569831848145, + "learning_rate": 4.0525824823390045e-05, + "loss": 55.9749, + "step": 528 + }, + { + "epoch": 1.8096076861489192, + "grad_norm": 4.258271217346191, + "learning_rate": 4.045397465551513e-05, + "loss": 52.5445, + "step": 530 + }, + { + "epoch": 1.8164398185214838, + "grad_norm": 4.56829309463501, + "learning_rate": 4.038191729950569e-05, + "loss": 53.8703, + "step": 532 + }, + { + "epoch": 1.8232719508940485, + "grad_norm": 8.888167381286621, + "learning_rate": 4.030965372141927e-05, + "loss": 52.7209, + "step": 534 + }, + { + "epoch": 1.8301040832666133, + "grad_norm": 4.5087175369262695, + "learning_rate": 4.0237184890078245e-05, + "loss": 54.591, + "step": 536 + }, + { + "epoch": 1.836936215639178, + "grad_norm": 4.460638523101807, + "learning_rate": 4.0164511777056725e-05, + "loss": 54.8662, + "step": 538 + }, + { + "epoch": 1.8437683480117428, + "grad_norm": 3.5958664417266846, + "learning_rate": 4.009163535666761e-05, + "loss": 53.423, + "step": 540 + }, + { + "epoch": 1.8506004803843075, + "grad_norm": 4.3935418128967285, + "learning_rate": 4.001855660594948e-05, + "loss": 53.9048, + "step": 542 + }, + { + "epoch": 1.857432612756872, + "grad_norm": 5.473939895629883, + "learning_rate": 3.994527650465352e-05, + "loss": 52.9295, + "step": 544 + }, + { + "epoch": 1.8642647451294367, + "grad_norm": 4.8625922203063965, + "learning_rate": 3.98717960352304e-05, + "loss": 51.8002, + "step": 546 + }, + { + "epoch": 1.8710968775020016, + "grad_norm": 4.244052886962891, + "learning_rate": 3.979811618281706e-05, + "loss": 53.6904, + "step": 548 + }, + { + "epoch": 1.8779290098745665, + "grad_norm": 4.050732612609863, + "learning_rate": 3.972423793522352e-05, + "loss": 54.7441, + "step": 550 + }, + { + "epoch": 1.8779290098745665, + "eval_loss": 0.8419561982154846, + "eval_runtime": 119.6757, + "eval_samples_per_second": 32.964, + "eval_steps_per_second": 8.247, + "step": 550 + }, + { + "epoch": 1.884761142247131, + "grad_norm": 5.255309104919434, + "learning_rate": 3.9650162282919655e-05, + "loss": 53.6842, + "step": 552 + }, + { + "epoch": 1.8915932746196957, + "grad_norm": 5.483623504638672, + "learning_rate": 3.957589021902191e-05, + "loss": 54.0004, + "step": 554 + }, + { + "epoch": 1.8984254069922604, + "grad_norm": 4.224212169647217, + "learning_rate": 3.9501422739279956e-05, + "loss": 51.7289, + "step": 556 + }, + { + "epoch": 1.9052575393648252, + "grad_norm": 5.061962127685547, + "learning_rate": 3.942676084206338e-05, + "loss": 53.4457, + "step": 558 + }, + { + "epoch": 1.9120896717373899, + "grad_norm": 3.8694398403167725, + "learning_rate": 3.9351905528348285e-05, + "loss": 51.8595, + "step": 560 + }, + { + "epoch": 1.9189218041099547, + "grad_norm": 4.149620056152344, + "learning_rate": 3.927685780170385e-05, + "loss": 51.8196, + "step": 562 + }, + { + "epoch": 1.9257539364825194, + "grad_norm": 6.877647399902344, + "learning_rate": 3.920161866827889e-05, + "loss": 52.7279, + "step": 564 + }, + { + "epoch": 1.932586068855084, + "grad_norm": 4.069815635681152, + "learning_rate": 3.9126189136788416e-05, + "loss": 51.1502, + "step": 566 + }, + { + "epoch": 1.9394182012276486, + "grad_norm": 6.629972457885742, + "learning_rate": 3.90505702185e-05, + "loss": 52.6793, + "step": 568 + }, + { + "epoch": 1.9462503336002135, + "grad_norm": 4.475677013397217, + "learning_rate": 3.897476292722034e-05, + "loss": 51.4329, + "step": 570 + }, + { + "epoch": 1.9530824659727783, + "grad_norm": 5.370522499084473, + "learning_rate": 3.889876827928156e-05, + "loss": 53.1101, + "step": 572 + }, + { + "epoch": 1.959914598345343, + "grad_norm": 5.481414794921875, + "learning_rate": 3.882258729352768e-05, + "loss": 53.3684, + "step": 574 + }, + { + "epoch": 1.9667467307179076, + "grad_norm": 6.393594741821289, + "learning_rate": 3.874622099130087e-05, + "loss": 52.7341, + "step": 576 + }, + { + "epoch": 1.9735788630904723, + "grad_norm": 3.9178807735443115, + "learning_rate": 3.866967039642784e-05, + "loss": 51.5249, + "step": 578 + }, + { + "epoch": 1.9804109954630371, + "grad_norm": 9.721770286560059, + "learning_rate": 3.859293653520604e-05, + "loss": 51.2705, + "step": 580 + }, + { + "epoch": 1.987243127835602, + "grad_norm": 4.619483470916748, + "learning_rate": 3.851602043638994e-05, + "loss": 51.7596, + "step": 582 + }, + { + "epoch": 1.9940752602081666, + "grad_norm": 4.899592399597168, + "learning_rate": 3.843892313117724e-05, + "loss": 54.7586, + "step": 584 + }, + { + "epoch": 2.0, + "grad_norm": 3.8423385620117188, + "learning_rate": 3.8361645653195026e-05, + "loss": 44.9497, + "step": 586 + }, + { + "epoch": 2.0068321323725646, + "grad_norm": 4.93556022644043, + "learning_rate": 3.8284189038485936e-05, + "loss": 53.1383, + "step": 588 + }, + { + "epoch": 2.0136642647451293, + "grad_norm": 6.575899124145508, + "learning_rate": 3.8206554325494225e-05, + "loss": 52.1373, + "step": 590 + }, + { + "epoch": 2.0204963971176944, + "grad_norm": 3.5134201049804688, + "learning_rate": 3.812874255505191e-05, + "loss": 50.8711, + "step": 592 + }, + { + "epoch": 2.027328529490259, + "grad_norm": 4.761475086212158, + "learning_rate": 3.805075477036476e-05, + "loss": 52.0756, + "step": 594 + }, + { + "epoch": 2.0341606618628236, + "grad_norm": 3.7381017208099365, + "learning_rate": 3.797259201699833e-05, + "loss": 51.0594, + "step": 596 + }, + { + "epoch": 2.0409927942353883, + "grad_norm": 5.102145671844482, + "learning_rate": 3.789425534286394e-05, + "loss": 52.1454, + "step": 598 + }, + { + "epoch": 2.047824926607953, + "grad_norm": 4.762547969818115, + "learning_rate": 3.781574579820464e-05, + "loss": 50.3373, + "step": 600 + }, + { + "epoch": 2.047824926607953, + "eval_loss": 0.8283991813659668, + "eval_runtime": 119.5704, + "eval_samples_per_second": 32.993, + "eval_steps_per_second": 8.255, + "step": 600 + }, + { + "epoch": 2.0546570589805175, + "grad_norm": 4.646745681762695, + "learning_rate": 3.773706443558111e-05, + "loss": 51.0792, + "step": 602 + }, + { + "epoch": 2.0614891913530826, + "grad_norm": 5.648324012756348, + "learning_rate": 3.765821230985758e-05, + "loss": 50.6017, + "step": 604 + }, + { + "epoch": 2.0683213237256473, + "grad_norm": 4.703359603881836, + "learning_rate": 3.75791904781876e-05, + "loss": 52.4212, + "step": 606 + }, + { + "epoch": 2.075153456098212, + "grad_norm": 4.082385540008545, + "learning_rate": 3.7500000000000003e-05, + "loss": 51.9666, + "step": 608 + }, + { + "epoch": 2.0819855884707765, + "grad_norm": 4.6461687088012695, + "learning_rate": 3.74206419369846e-05, + "loss": 51.6205, + "step": 610 + }, + { + "epoch": 2.088817720843341, + "grad_norm": 3.9972918033599854, + "learning_rate": 3.7341117353077966e-05, + "loss": 52.6521, + "step": 612 + }, + { + "epoch": 2.0956498532159062, + "grad_norm": 5.636791229248047, + "learning_rate": 3.726142731444921e-05, + "loss": 52.6811, + "step": 614 + }, + { + "epoch": 2.102481985588471, + "grad_norm": 6.055325508117676, + "learning_rate": 3.718157288948563e-05, + "loss": 51.2952, + "step": 616 + }, + { + "epoch": 2.1093141179610355, + "grad_norm": 5.317610740661621, + "learning_rate": 3.710155514877844e-05, + "loss": 52.4443, + "step": 618 + }, + { + "epoch": 2.1161462503336, + "grad_norm": 4.979522705078125, + "learning_rate": 3.702137516510838e-05, + "loss": 51.3593, + "step": 620 + }, + { + "epoch": 2.122978382706165, + "grad_norm": 7.410902500152588, + "learning_rate": 3.694103401343136e-05, + "loss": 51.5919, + "step": 622 + }, + { + "epoch": 2.12981051507873, + "grad_norm": 4.962103366851807, + "learning_rate": 3.686053277086401e-05, + "loss": 51.272, + "step": 624 + }, + { + "epoch": 2.1366426474512945, + "grad_norm": 4.0044426918029785, + "learning_rate": 3.6779872516669295e-05, + "loss": 51.6362, + "step": 626 + }, + { + "epoch": 2.143474779823859, + "grad_norm": 5.016703128814697, + "learning_rate": 3.669905433224199e-05, + "loss": 51.7369, + "step": 628 + }, + { + "epoch": 2.150306912196424, + "grad_norm": 4.700343132019043, + "learning_rate": 3.6618079301094216e-05, + "loss": 50.9454, + "step": 630 + }, + { + "epoch": 2.1571390445689884, + "grad_norm": 8.11246395111084, + "learning_rate": 3.653694850884091e-05, + "loss": 50.4605, + "step": 632 + }, + { + "epoch": 2.163971176941553, + "grad_norm": 3.8724536895751953, + "learning_rate": 3.645566304318526e-05, + "loss": 52.4849, + "step": 634 + }, + { + "epoch": 2.170803309314118, + "grad_norm": 3.699873208999634, + "learning_rate": 3.637422399390413e-05, + "loss": 49.8017, + "step": 636 + }, + { + "epoch": 2.1776354416866828, + "grad_norm": 4.757104873657227, + "learning_rate": 3.6292632452833436e-05, + "loss": 52.0966, + "step": 638 + }, + { + "epoch": 2.1844675740592474, + "grad_norm": 5.273576736450195, + "learning_rate": 3.621088951385353e-05, + "loss": 49.5201, + "step": 640 + }, + { + "epoch": 2.191299706431812, + "grad_norm": 4.152122497558594, + "learning_rate": 3.612899627287452e-05, + "loss": 51.121, + "step": 642 + }, + { + "epoch": 2.1981318388043767, + "grad_norm": 4.448339939117432, + "learning_rate": 3.604695382782159e-05, + "loss": 51.5833, + "step": 644 + }, + { + "epoch": 2.2049639711769418, + "grad_norm": 3.272676706314087, + "learning_rate": 3.596476327862024e-05, + "loss": 50.4036, + "step": 646 + }, + { + "epoch": 2.2117961035495064, + "grad_norm": 4.293691158294678, + "learning_rate": 3.588242572718162e-05, + "loss": 50.4138, + "step": 648 + }, + { + "epoch": 2.218628235922071, + "grad_norm": 6.384798049926758, + "learning_rate": 3.579994227738767e-05, + "loss": 49.0042, + "step": 650 + }, + { + "epoch": 2.218628235922071, + "eval_loss": 0.8110712170600891, + "eval_runtime": 119.0744, + "eval_samples_per_second": 33.131, + "eval_steps_per_second": 8.289, + "step": 650 + }, + { + "epoch": 2.2254603682946357, + "grad_norm": 4.501573085784912, + "learning_rate": 3.5717314035076355e-05, + "loss": 49.7713, + "step": 652 + }, + { + "epoch": 2.2322925006672003, + "grad_norm": 4.808114051818848, + "learning_rate": 3.5634542108026876e-05, + "loss": 50.6265, + "step": 654 + }, + { + "epoch": 2.239124633039765, + "grad_norm": 5.616351127624512, + "learning_rate": 3.5551627605944745e-05, + "loss": 52.1332, + "step": 656 + }, + { + "epoch": 2.24595676541233, + "grad_norm": 7.0716071128845215, + "learning_rate": 3.5468571640446994e-05, + "loss": 50.7825, + "step": 658 + }, + { + "epoch": 2.2527888977848947, + "grad_norm": 4.64641809463501, + "learning_rate": 3.5385375325047166e-05, + "loss": 50.3092, + "step": 660 + }, + { + "epoch": 2.2596210301574593, + "grad_norm": 4.058784008026123, + "learning_rate": 3.5302039775140486e-05, + "loss": 51.7827, + "step": 662 + }, + { + "epoch": 2.266453162530024, + "grad_norm": 4.011864185333252, + "learning_rate": 3.521856610798887e-05, + "loss": 51.4194, + "step": 664 + }, + { + "epoch": 2.2732852949025886, + "grad_norm": 3.89857816696167, + "learning_rate": 3.513495544270592e-05, + "loss": 50.7032, + "step": 666 + }, + { + "epoch": 2.2801174272751537, + "grad_norm": 4.966712951660156, + "learning_rate": 3.505120890024195e-05, + "loss": 49.925, + "step": 668 + }, + { + "epoch": 2.2869495596477183, + "grad_norm": 4.181141376495361, + "learning_rate": 3.496732760336895e-05, + "loss": 49.5112, + "step": 670 + }, + { + "epoch": 2.293781692020283, + "grad_norm": 4.761594772338867, + "learning_rate": 3.4883312676665536e-05, + "loss": 49.6545, + "step": 672 + }, + { + "epoch": 2.3006138243928476, + "grad_norm": 3.97501802444458, + "learning_rate": 3.479916524650188e-05, + "loss": 51.1862, + "step": 674 + }, + { + "epoch": 2.307445956765412, + "grad_norm": 5.200672149658203, + "learning_rate": 3.4714886441024574e-05, + "loss": 49.9163, + "step": 676 + }, + { + "epoch": 2.314278089137977, + "grad_norm": 4.147047519683838, + "learning_rate": 3.4630477390141556e-05, + "loss": 48.6138, + "step": 678 + }, + { + "epoch": 2.321110221510542, + "grad_norm": 4.9791693687438965, + "learning_rate": 3.4545939225506934e-05, + "loss": 51.4538, + "step": 680 + }, + { + "epoch": 2.3279423538831066, + "grad_norm": 4.929348945617676, + "learning_rate": 3.4461273080505793e-05, + "loss": 51.2735, + "step": 682 + }, + { + "epoch": 2.334774486255671, + "grad_norm": 4.98499059677124, + "learning_rate": 3.437648009023905e-05, + "loss": 48.5889, + "step": 684 + }, + { + "epoch": 2.341606618628236, + "grad_norm": 4.354183673858643, + "learning_rate": 3.4291561391508185e-05, + "loss": 51.7768, + "step": 686 + }, + { + "epoch": 2.3484387510008005, + "grad_norm": 3.482697010040283, + "learning_rate": 3.420651812280006e-05, + "loss": 48.9966, + "step": 688 + }, + { + "epoch": 2.3552708833733655, + "grad_norm": 4.613458156585693, + "learning_rate": 3.4121351424271594e-05, + "loss": 50.8534, + "step": 690 + }, + { + "epoch": 2.36210301574593, + "grad_norm": 3.93235182762146, + "learning_rate": 3.4036062437734484e-05, + "loss": 50.9164, + "step": 692 + }, + { + "epoch": 2.368935148118495, + "grad_norm": 5.348623275756836, + "learning_rate": 3.395065230663996e-05, + "loss": 49.6679, + "step": 694 + }, + { + "epoch": 2.3757672804910595, + "grad_norm": 5.050134181976318, + "learning_rate": 3.386512217606339e-05, + "loss": 48.0534, + "step": 696 + }, + { + "epoch": 2.382599412863624, + "grad_norm": 3.7587573528289795, + "learning_rate": 3.3779473192688954e-05, + "loss": 50.3013, + "step": 698 + }, + { + "epoch": 2.3894315452361887, + "grad_norm": 5.177303314208984, + "learning_rate": 3.369370650479425e-05, + "loss": 48.8704, + "step": 700 + }, + { + "epoch": 2.3894315452361887, + "eval_loss": 0.7940448522567749, + "eval_runtime": 119.8708, + "eval_samples_per_second": 32.91, + "eval_steps_per_second": 8.234, + "step": 700 + }, + { + "epoch": 2.396263677608754, + "grad_norm": 4.268886089324951, + "learning_rate": 3.360782326223493e-05, + "loss": 50.0788, + "step": 702 + }, + { + "epoch": 2.4030958099813184, + "grad_norm": 4.847851276397705, + "learning_rate": 3.3521824616429285e-05, + "loss": 50.5298, + "step": 704 + }, + { + "epoch": 2.409927942353883, + "grad_norm": 4.221863746643066, + "learning_rate": 3.3435711720342764e-05, + "loss": 51.0571, + "step": 706 + }, + { + "epoch": 2.4167600747264477, + "grad_norm": 5.5122528076171875, + "learning_rate": 3.3349485728472535e-05, + "loss": 48.3266, + "step": 708 + }, + { + "epoch": 2.4235922070990124, + "grad_norm": 3.7766902446746826, + "learning_rate": 3.326314779683207e-05, + "loss": 49.9334, + "step": 710 + }, + { + "epoch": 2.4304243394715774, + "grad_norm": 4.093820571899414, + "learning_rate": 3.3176699082935545e-05, + "loss": 48.4746, + "step": 712 + }, + { + "epoch": 2.437256471844142, + "grad_norm": 4.116121292114258, + "learning_rate": 3.3090140745782396e-05, + "loss": 48.5131, + "step": 714 + }, + { + "epoch": 2.4440886042167067, + "grad_norm": 5.181516647338867, + "learning_rate": 3.300347394584172e-05, + "loss": 50.4981, + "step": 716 + }, + { + "epoch": 2.4509207365892713, + "grad_norm": 4.464053630828857, + "learning_rate": 3.2916699845036816e-05, + "loss": 50.2301, + "step": 718 + }, + { + "epoch": 2.457752868961836, + "grad_norm": 4.229206562042236, + "learning_rate": 3.282981960672948e-05, + "loss": 50.1858, + "step": 720 + }, + { + "epoch": 2.4645850013344006, + "grad_norm": 3.8356049060821533, + "learning_rate": 3.2742834395704486e-05, + "loss": 48.9147, + "step": 722 + }, + { + "epoch": 2.4714171337069657, + "grad_norm": 3.9584670066833496, + "learning_rate": 3.265574537815398e-05, + "loss": 48.6574, + "step": 724 + }, + { + "epoch": 2.4782492660795303, + "grad_norm": 4.802350997924805, + "learning_rate": 3.25685537216618e-05, + "loss": 48.9724, + "step": 726 + }, + { + "epoch": 2.485081398452095, + "grad_norm": 4.078526020050049, + "learning_rate": 3.248126059518785e-05, + "loss": 47.7639, + "step": 728 + }, + { + "epoch": 2.4919135308246596, + "grad_norm": 3.8187856674194336, + "learning_rate": 3.2393867169052385e-05, + "loss": 48.2195, + "step": 730 + }, + { + "epoch": 2.4987456631972247, + "grad_norm": 5.273796081542969, + "learning_rate": 3.230637461492043e-05, + "loss": 49.7512, + "step": 732 + }, + { + "epoch": 2.5055777955697893, + "grad_norm": 4.126491069793701, + "learning_rate": 3.221878410578593e-05, + "loss": 49.0844, + "step": 734 + }, + { + "epoch": 2.512409927942354, + "grad_norm": 4.665433406829834, + "learning_rate": 3.213109681595612e-05, + "loss": 48.7829, + "step": 736 + }, + { + "epoch": 2.5192420603149186, + "grad_norm": 4.897470951080322, + "learning_rate": 3.2043313921035743e-05, + "loss": 49.5252, + "step": 738 + }, + { + "epoch": 2.5260741926874832, + "grad_norm": 5.257498264312744, + "learning_rate": 3.195543659791132e-05, + "loss": 50.4767, + "step": 740 + }, + { + "epoch": 2.532906325060048, + "grad_norm": 3.754957914352417, + "learning_rate": 3.186746602473533e-05, + "loss": 49.4055, + "step": 742 + }, + { + "epoch": 2.5397384574326125, + "grad_norm": 3.994774341583252, + "learning_rate": 3.177940338091043e-05, + "loss": 49.3039, + "step": 744 + }, + { + "epoch": 2.5465705898051776, + "grad_norm": 4.923650741577148, + "learning_rate": 3.169124984707367e-05, + "loss": 48.6568, + "step": 746 + }, + { + "epoch": 2.5534027221777422, + "grad_norm": 6.377063274383545, + "learning_rate": 3.160300660508064e-05, + "loss": 48.7655, + "step": 748 + }, + { + "epoch": 2.560234854550307, + "grad_norm": 3.7124524116516113, + "learning_rate": 3.151467483798961e-05, + "loss": 48.0997, + "step": 750 + }, + { + "epoch": 2.560234854550307, + "eval_loss": 0.7798339128494263, + "eval_runtime": 119.2173, + "eval_samples_per_second": 33.091, + "eval_steps_per_second": 8.279, + "step": 750 + }, + { + "epoch": 2.5670669869228715, + "grad_norm": 4.752464294433594, + "learning_rate": 3.14262557300457e-05, + "loss": 48.422, + "step": 752 + }, + { + "epoch": 2.5738991192954366, + "grad_norm": 4.635769844055176, + "learning_rate": 3.1337750466665e-05, + "loss": 48.9177, + "step": 754 + }, + { + "epoch": 2.580731251668001, + "grad_norm": 4.357526779174805, + "learning_rate": 3.124916023441865e-05, + "loss": 49.4801, + "step": 756 + }, + { + "epoch": 2.587563384040566, + "grad_norm": 16.189651489257812, + "learning_rate": 3.116048622101694e-05, + "loss": 49.275, + "step": 758 + }, + { + "epoch": 2.5943955164131305, + "grad_norm": 3.983285903930664, + "learning_rate": 3.107172961529343e-05, + "loss": 47.968, + "step": 760 + }, + { + "epoch": 2.601227648785695, + "grad_norm": 4.357701301574707, + "learning_rate": 3.098289160718895e-05, + "loss": 47.8592, + "step": 762 + }, + { + "epoch": 2.6080597811582598, + "grad_norm": 3.9686052799224854, + "learning_rate": 3.0893973387735687e-05, + "loss": 49.5191, + "step": 764 + }, + { + "epoch": 2.6148919135308244, + "grad_norm": 3.9062581062316895, + "learning_rate": 3.0804976149041195e-05, + "loss": 48.5485, + "step": 766 + }, + { + "epoch": 2.6217240459033895, + "grad_norm": 4.7290143966674805, + "learning_rate": 3.071590108427244e-05, + "loss": 49.2073, + "step": 768 + }, + { + "epoch": 2.628556178275954, + "grad_norm": 4.57703161239624, + "learning_rate": 3.062674938763976e-05, + "loss": 49.7624, + "step": 770 + }, + { + "epoch": 2.6353883106485188, + "grad_norm": 4.4061737060546875, + "learning_rate": 3.0537522254380905e-05, + "loss": 49.0566, + "step": 772 + }, + { + "epoch": 2.6422204430210834, + "grad_norm": 4.166697978973389, + "learning_rate": 3.044822088074496e-05, + "loss": 49.3193, + "step": 774 + }, + { + "epoch": 2.6490525753936485, + "grad_norm": 3.5513172149658203, + "learning_rate": 3.0358846463976372e-05, + "loss": 48.9675, + "step": 776 + }, + { + "epoch": 2.655884707766213, + "grad_norm": 4.9701995849609375, + "learning_rate": 3.026940020229882e-05, + "loss": 49.6229, + "step": 778 + }, + { + "epoch": 2.6627168401387777, + "grad_norm": 4.223094463348389, + "learning_rate": 3.017988329489923e-05, + "loss": 47.1613, + "step": 780 + }, + { + "epoch": 2.6695489725113424, + "grad_norm": 4.849906921386719, + "learning_rate": 3.0090296941911633e-05, + "loss": 47.5764, + "step": 782 + }, + { + "epoch": 2.676381104883907, + "grad_norm": 3.507953643798828, + "learning_rate": 3.0000642344401113e-05, + "loss": 47.1944, + "step": 784 + }, + { + "epoch": 2.6832132372564717, + "grad_norm": 4.040694713592529, + "learning_rate": 2.9910920704347696e-05, + "loss": 48.6472, + "step": 786 + }, + { + "epoch": 2.6900453696290363, + "grad_norm": 5.141117095947266, + "learning_rate": 2.9821133224630226e-05, + "loss": 47.177, + "step": 788 + }, + { + "epoch": 2.6968775020016014, + "grad_norm": 4.463181018829346, + "learning_rate": 2.9731281109010256e-05, + "loss": 47.4283, + "step": 790 + }, + { + "epoch": 2.703709634374166, + "grad_norm": 3.586456060409546, + "learning_rate": 2.9641365562115887e-05, + "loss": 48.9784, + "step": 792 + }, + { + "epoch": 2.7105417667467306, + "grad_norm": 3.9780969619750977, + "learning_rate": 2.9551387789425638e-05, + "loss": 48.601, + "step": 794 + }, + { + "epoch": 2.7173738991192953, + "grad_norm": 4.445759296417236, + "learning_rate": 2.9461348997252265e-05, + "loss": 49.9106, + "step": 796 + }, + { + "epoch": 2.7242060314918604, + "grad_norm": 4.416858673095703, + "learning_rate": 2.9371250392726614e-05, + "loss": 48.3298, + "step": 798 + }, + { + "epoch": 2.731038163864425, + "grad_norm": 4.36728572845459, + "learning_rate": 2.9281093183781403e-05, + "loss": 48.6063, + "step": 800 + }, + { + "epoch": 2.731038163864425, + "eval_loss": 0.7699871063232422, + "eval_runtime": 119.5951, + "eval_samples_per_second": 32.986, + "eval_steps_per_second": 8.253, + "step": 800 + }, + { + "epoch": 2.7378702962369896, + "grad_norm": 5.540378570556641, + "learning_rate": 2.919087857913508e-05, + "loss": 49.4323, + "step": 802 + }, + { + "epoch": 2.7447024286095543, + "grad_norm": 3.73681640625, + "learning_rate": 2.9100607788275545e-05, + "loss": 49.0439, + "step": 804 + }, + { + "epoch": 2.751534560982119, + "grad_norm": 4.437684535980225, + "learning_rate": 2.9010282021444008e-05, + "loss": 48.8682, + "step": 806 + }, + { + "epoch": 2.7583666933546835, + "grad_norm": 4.933871746063232, + "learning_rate": 2.891990248961871e-05, + "loss": 48.0791, + "step": 808 + }, + { + "epoch": 2.7651988257272486, + "grad_norm": 4.351380825042725, + "learning_rate": 2.8829470404498697e-05, + "loss": 47.0584, + "step": 810 + }, + { + "epoch": 2.7720309580998133, + "grad_norm": 4.953640937805176, + "learning_rate": 2.8738986978487625e-05, + "loss": 50.0531, + "step": 812 + }, + { + "epoch": 2.778863090472378, + "grad_norm": 3.676950216293335, + "learning_rate": 2.8648453424677434e-05, + "loss": 46.9994, + "step": 814 + }, + { + "epoch": 2.7856952228449425, + "grad_norm": 4.177380084991455, + "learning_rate": 2.8557870956832132e-05, + "loss": 48.3932, + "step": 816 + }, + { + "epoch": 2.7925273552175076, + "grad_norm": 4.177119731903076, + "learning_rate": 2.846724078937149e-05, + "loss": 48.2385, + "step": 818 + }, + { + "epoch": 2.7993594875900722, + "grad_norm": 4.261831283569336, + "learning_rate": 2.8376564137354795e-05, + "loss": 48.813, + "step": 820 + }, + { + "epoch": 2.806191619962637, + "grad_norm": 3.7779037952423096, + "learning_rate": 2.8285842216464543e-05, + "loss": 48.801, + "step": 822 + }, + { + "epoch": 2.8130237523352015, + "grad_norm": 5.378250598907471, + "learning_rate": 2.8195076242990122e-05, + "loss": 45.9584, + "step": 824 + }, + { + "epoch": 2.819855884707766, + "grad_norm": 3.5369153022766113, + "learning_rate": 2.8104267433811533e-05, + "loss": 46.97, + "step": 826 + }, + { + "epoch": 2.826688017080331, + "grad_norm": 3.493602991104126, + "learning_rate": 2.8013417006383076e-05, + "loss": 46.7352, + "step": 828 + }, + { + "epoch": 2.8335201494528954, + "grad_norm": 5.41981840133667, + "learning_rate": 2.7922526178717017e-05, + "loss": 48.4586, + "step": 830 + }, + { + "epoch": 2.8403522818254605, + "grad_norm": 4.6053948402404785, + "learning_rate": 2.783159616936723e-05, + "loss": 46.5008, + "step": 832 + }, + { + "epoch": 2.847184414198025, + "grad_norm": 4.136333465576172, + "learning_rate": 2.774062819741293e-05, + "loss": 47.3448, + "step": 834 + }, + { + "epoch": 2.85401654657059, + "grad_norm": 3.927877187728882, + "learning_rate": 2.764962348244228e-05, + "loss": 46.7369, + "step": 836 + }, + { + "epoch": 2.8608486789431544, + "grad_norm": 4.283491611480713, + "learning_rate": 2.7558583244536007e-05, + "loss": 48.098, + "step": 838 + }, + { + "epoch": 2.8676808113157195, + "grad_norm": 3.802030563354492, + "learning_rate": 2.7467508704251137e-05, + "loss": 48.2908, + "step": 840 + }, + { + "epoch": 2.874512943688284, + "grad_norm": 5.212815761566162, + "learning_rate": 2.7376401082604564e-05, + "loss": 47.8921, + "step": 842 + }, + { + "epoch": 2.8813450760608488, + "grad_norm": 4.39296293258667, + "learning_rate": 2.7285261601056698e-05, + "loss": 48.2491, + "step": 844 + }, + { + "epoch": 2.8881772084334134, + "grad_norm": 5.428844928741455, + "learning_rate": 2.7194091481495076e-05, + "loss": 49.1209, + "step": 846 + }, + { + "epoch": 2.895009340805978, + "grad_norm": 3.9836559295654297, + "learning_rate": 2.7102891946217994e-05, + "loss": 47.0515, + "step": 848 + }, + { + "epoch": 2.9018414731785427, + "grad_norm": 3.1067824363708496, + "learning_rate": 2.7011664217918154e-05, + "loss": 46.0087, + "step": 850 + }, + { + "epoch": 2.9018414731785427, + "eval_loss": 0.760260820388794, + "eval_runtime": 119.6698, + "eval_samples_per_second": 32.966, + "eval_steps_per_second": 8.248, + "step": 850 + }, + { + "epoch": 2.9086736055511073, + "grad_norm": 4.688024997711182, + "learning_rate": 2.6920409519666174e-05, + "loss": 47.0489, + "step": 852 + }, + { + "epoch": 2.9155057379236724, + "grad_norm": 4.777935981750488, + "learning_rate": 2.6829129074894304e-05, + "loss": 48.1153, + "step": 854 + }, + { + "epoch": 2.922337870296237, + "grad_norm": 4.912516117095947, + "learning_rate": 2.6737824107379948e-05, + "loss": 48.0798, + "step": 856 + }, + { + "epoch": 2.9291700026688017, + "grad_norm": 4.066973686218262, + "learning_rate": 2.6646495841229287e-05, + "loss": 46.9194, + "step": 858 + }, + { + "epoch": 2.9360021350413663, + "grad_norm": 4.499208927154541, + "learning_rate": 2.655514550086086e-05, + "loss": 48.3087, + "step": 860 + }, + { + "epoch": 2.9428342674139314, + "grad_norm": 4.891952991485596, + "learning_rate": 2.6463774310989154e-05, + "loss": 46.8565, + "step": 862 + }, + { + "epoch": 2.949666399786496, + "grad_norm": 3.8262720108032227, + "learning_rate": 2.637238349660819e-05, + "loss": 46.7596, + "step": 864 + }, + { + "epoch": 2.9564985321590607, + "grad_norm": 5.6072492599487305, + "learning_rate": 2.6280974282975063e-05, + "loss": 45.254, + "step": 866 + }, + { + "epoch": 2.9633306645316253, + "grad_norm": 3.9889800548553467, + "learning_rate": 2.6189547895593562e-05, + "loss": 46.754, + "step": 868 + }, + { + "epoch": 2.97016279690419, + "grad_norm": 3.7260525226593018, + "learning_rate": 2.6098105560197722e-05, + "loss": 46.6516, + "step": 870 + }, + { + "epoch": 2.9769949292767546, + "grad_norm": 4.090394973754883, + "learning_rate": 2.600664850273538e-05, + "loss": 47.2404, + "step": 872 + }, + { + "epoch": 2.983827061649319, + "grad_norm": 3.6287267208099365, + "learning_rate": 2.5915177949351765e-05, + "loss": 46.3821, + "step": 874 + }, + { + "epoch": 2.9906591940218843, + "grad_norm": 3.5229976177215576, + "learning_rate": 2.582369512637302e-05, + "loss": 46.8471, + "step": 876 + }, + { + "epoch": 2.997491326394449, + "grad_norm": 3.532615900039673, + "learning_rate": 2.5732201260289806e-05, + "loss": 47.0364, + "step": 878 + }, + { + "epoch": 3.0034160661862823, + "grad_norm": 3.482403039932251, + "learning_rate": 2.564069757774082e-05, + "loss": 40.3241, + "step": 880 + }, + { + "epoch": 3.010248198558847, + "grad_norm": 3.94649600982666, + "learning_rate": 2.554918530549637e-05, + "loss": 46.7226, + "step": 882 + }, + { + "epoch": 3.0170803309314116, + "grad_norm": 4.395301818847656, + "learning_rate": 2.545766567044194e-05, + "loss": 45.266, + "step": 884 + }, + { + "epoch": 3.0239124633039767, + "grad_norm": 4.813998699188232, + "learning_rate": 2.5366139899561696e-05, + "loss": 46.8651, + "step": 886 + }, + { + "epoch": 3.0307445956765413, + "grad_norm": 5.5799174308776855, + "learning_rate": 2.527460921992209e-05, + "loss": 46.5727, + "step": 888 + }, + { + "epoch": 3.037576728049106, + "grad_norm": 6.693199634552002, + "learning_rate": 2.518307485865538e-05, + "loss": 47.987, + "step": 890 + }, + { + "epoch": 3.0444088604216706, + "grad_norm": 6.33953332901001, + "learning_rate": 2.509153804294318e-05, + "loss": 45.7221, + "step": 892 + }, + { + "epoch": 3.051240992794235, + "grad_norm": 4.887784957885742, + "learning_rate": 2.5e-05, + "loss": 44.5186, + "step": 894 + }, + { + "epoch": 3.0580731251668003, + "grad_norm": 4.337290287017822, + "learning_rate": 2.490846195705683e-05, + "loss": 46.394, + "step": 896 + }, + { + "epoch": 3.064905257539365, + "grad_norm": 3.7094030380249023, + "learning_rate": 2.4816925141344623e-05, + "loss": 45.122, + "step": 898 + }, + { + "epoch": 3.0717373899119296, + "grad_norm": 3.71903920173645, + "learning_rate": 2.4725390780077908e-05, + "loss": 44.7121, + "step": 900 + }, + { + "epoch": 3.0717373899119296, + "eval_loss": 0.7495905160903931, + "eval_runtime": 119.7503, + "eval_samples_per_second": 32.944, + "eval_steps_per_second": 8.242, + "step": 900 + }, + { + "epoch": 3.078569522284494, + "grad_norm": 4.690406799316406, + "learning_rate": 2.4633860100438316e-05, + "loss": 45.6299, + "step": 902 + }, + { + "epoch": 3.085401654657059, + "grad_norm": 4.29756498336792, + "learning_rate": 2.4542334329558077e-05, + "loss": 48.2504, + "step": 904 + }, + { + "epoch": 3.092233787029624, + "grad_norm": 5.62404727935791, + "learning_rate": 2.4450814694503636e-05, + "loss": 47.6091, + "step": 906 + }, + { + "epoch": 3.0990659194021886, + "grad_norm": 3.726529836654663, + "learning_rate": 2.435930242225919e-05, + "loss": 46.4755, + "step": 908 + }, + { + "epoch": 3.105898051774753, + "grad_norm": 6.04416036605835, + "learning_rate": 2.4267798739710203e-05, + "loss": 46.9715, + "step": 910 + }, + { + "epoch": 3.112730184147318, + "grad_norm": 3.8375885486602783, + "learning_rate": 2.4176304873626985e-05, + "loss": 47.9794, + "step": 912 + }, + { + "epoch": 3.1195623165198825, + "grad_norm": 3.296687602996826, + "learning_rate": 2.4084822050648237e-05, + "loss": 45.0776, + "step": 914 + }, + { + "epoch": 3.126394448892447, + "grad_norm": 3.546963930130005, + "learning_rate": 2.399335149726463e-05, + "loss": 44.6584, + "step": 916 + }, + { + "epoch": 3.133226581265012, + "grad_norm": 3.896601676940918, + "learning_rate": 2.390189443980229e-05, + "loss": 47.0284, + "step": 918 + }, + { + "epoch": 3.140058713637577, + "grad_norm": 3.570570468902588, + "learning_rate": 2.3810452104406444e-05, + "loss": 46.4413, + "step": 920 + }, + { + "epoch": 3.1468908460101415, + "grad_norm": 4.160488605499268, + "learning_rate": 2.3719025717024946e-05, + "loss": 47.1564, + "step": 922 + }, + { + "epoch": 3.153722978382706, + "grad_norm": 5.714613914489746, + "learning_rate": 2.3627616503391814e-05, + "loss": 48.2275, + "step": 924 + }, + { + "epoch": 3.1605551107552707, + "grad_norm": 4.362124919891357, + "learning_rate": 2.3536225689010845e-05, + "loss": 47.0592, + "step": 926 + }, + { + "epoch": 3.167387243127836, + "grad_norm": 6.478647708892822, + "learning_rate": 2.3444854499139142e-05, + "loss": 47.4139, + "step": 928 + }, + { + "epoch": 3.1742193755004005, + "grad_norm": 3.713979721069336, + "learning_rate": 2.3353504158770722e-05, + "loss": 47.7301, + "step": 930 + }, + { + "epoch": 3.181051507872965, + "grad_norm": 3.875537872314453, + "learning_rate": 2.3262175892620065e-05, + "loss": 45.6112, + "step": 932 + }, + { + "epoch": 3.1878836402455297, + "grad_norm": 5.328731536865234, + "learning_rate": 2.3170870925105702e-05, + "loss": 46.6125, + "step": 934 + }, + { + "epoch": 3.1947157726180944, + "grad_norm": 5.152383327484131, + "learning_rate": 2.307959048033383e-05, + "loss": 45.6076, + "step": 936 + }, + { + "epoch": 3.201547904990659, + "grad_norm": 4.689112186431885, + "learning_rate": 2.2988335782081855e-05, + "loss": 45.648, + "step": 938 + }, + { + "epoch": 3.208380037363224, + "grad_norm": 3.3412325382232666, + "learning_rate": 2.2897108053782e-05, + "loss": 44.4993, + "step": 940 + }, + { + "epoch": 3.2152121697357887, + "grad_norm": 11.583976745605469, + "learning_rate": 2.280590851850493e-05, + "loss": 46.3174, + "step": 942 + }, + { + "epoch": 3.2220443021083534, + "grad_norm": 4.012174606323242, + "learning_rate": 2.271473839894331e-05, + "loss": 46.3054, + "step": 944 + }, + { + "epoch": 3.228876434480918, + "grad_norm": 6.315187931060791, + "learning_rate": 2.2623598917395438e-05, + "loss": 44.3273, + "step": 946 + }, + { + "epoch": 3.2357085668534826, + "grad_norm": 5.612927436828613, + "learning_rate": 2.253249129574887e-05, + "loss": 46.8669, + "step": 948 + }, + { + "epoch": 3.2425406992260477, + "grad_norm": 3.7026705741882324, + "learning_rate": 2.2441416755463995e-05, + "loss": 46.4012, + "step": 950 + }, + { + "epoch": 3.2425406992260477, + "eval_loss": 0.7383518218994141, + "eval_runtime": 118.6959, + "eval_samples_per_second": 33.236, + "eval_steps_per_second": 8.315, + "step": 950 + }, + { + "epoch": 3.2493728315986123, + "grad_norm": 4.251457214355469, + "learning_rate": 2.2350376517557727e-05, + "loss": 47.1319, + "step": 952 + }, + { + "epoch": 3.256204963971177, + "grad_norm": 4.500071048736572, + "learning_rate": 2.2259371802587068e-05, + "loss": 47.0883, + "step": 954 + }, + { + "epoch": 3.2630370963437416, + "grad_norm": 4.684493064880371, + "learning_rate": 2.216840383063277e-05, + "loss": 45.0587, + "step": 956 + }, + { + "epoch": 3.2698692287163063, + "grad_norm": 3.853529453277588, + "learning_rate": 2.2077473821282996e-05, + "loss": 46.3262, + "step": 958 + }, + { + "epoch": 3.276701361088871, + "grad_norm": 5.501523971557617, + "learning_rate": 2.1986582993616926e-05, + "loss": 44.8375, + "step": 960 + }, + { + "epoch": 3.283533493461436, + "grad_norm": 15.540706634521484, + "learning_rate": 2.1895732566188476e-05, + "loss": 45.117, + "step": 962 + }, + { + "epoch": 3.2903656258340006, + "grad_norm": 2.6855862140655518, + "learning_rate": 2.1804923757009884e-05, + "loss": 45.9567, + "step": 964 + }, + { + "epoch": 3.2971977582065652, + "grad_norm": 4.529240131378174, + "learning_rate": 2.1714157783535463e-05, + "loss": 44.7532, + "step": 966 + }, + { + "epoch": 3.30402989057913, + "grad_norm": 4.690282344818115, + "learning_rate": 2.1623435862645204e-05, + "loss": 45.8376, + "step": 968 + }, + { + "epoch": 3.3108620229516945, + "grad_norm": 5.309507846832275, + "learning_rate": 2.153275921062851e-05, + "loss": 46.1757, + "step": 970 + }, + { + "epoch": 3.3176941553242596, + "grad_norm": 4.278385639190674, + "learning_rate": 2.1442129043167874e-05, + "loss": 46.6388, + "step": 972 + }, + { + "epoch": 3.3245262876968242, + "grad_norm": 4.2424516677856445, + "learning_rate": 2.1351546575322572e-05, + "loss": 45.1695, + "step": 974 + }, + { + "epoch": 3.331358420069389, + "grad_norm": 3.695155143737793, + "learning_rate": 2.126101302151238e-05, + "loss": 45.9417, + "step": 976 + }, + { + "epoch": 3.3381905524419535, + "grad_norm": 4.2003374099731445, + "learning_rate": 2.1170529595501305e-05, + "loss": 44.4002, + "step": 978 + }, + { + "epoch": 3.345022684814518, + "grad_norm": 4.378734588623047, + "learning_rate": 2.1080097510381298e-05, + "loss": 45.4517, + "step": 980 + }, + { + "epoch": 3.351854817187083, + "grad_norm": 3.96730637550354, + "learning_rate": 2.098971797855599e-05, + "loss": 43.9996, + "step": 982 + }, + { + "epoch": 3.358686949559648, + "grad_norm": 3.6162188053131104, + "learning_rate": 2.089939221172446e-05, + "loss": 43.9178, + "step": 984 + }, + { + "epoch": 3.3655190819322125, + "grad_norm": 4.3834099769592285, + "learning_rate": 2.0809121420864923e-05, + "loss": 46.2701, + "step": 986 + }, + { + "epoch": 3.372351214304777, + "grad_norm": 4.271561145782471, + "learning_rate": 2.07189068162186e-05, + "loss": 45.7546, + "step": 988 + }, + { + "epoch": 3.3791833466773418, + "grad_norm": 3.5791757106781006, + "learning_rate": 2.0628749607273396e-05, + "loss": 45.3079, + "step": 990 + }, + { + "epoch": 3.3860154790499064, + "grad_norm": 4.5101318359375, + "learning_rate": 2.0538651002747744e-05, + "loss": 46.5476, + "step": 992 + }, + { + "epoch": 3.3928476114224715, + "grad_norm": 5.944687366485596, + "learning_rate": 2.0448612210574365e-05, + "loss": 44.0355, + "step": 994 + }, + { + "epoch": 3.399679743795036, + "grad_norm": 4.936254501342773, + "learning_rate": 2.0358634437884112e-05, + "loss": 46.0717, + "step": 996 + }, + { + "epoch": 3.4065118761676008, + "grad_norm": 4.114757537841797, + "learning_rate": 2.0268718890989753e-05, + "loss": 44.5295, + "step": 998 + }, + { + "epoch": 3.4133440085401654, + "grad_norm": 8.12585735321045, + "learning_rate": 2.0178866775369777e-05, + "loss": 45.0747, + "step": 1000 + }, + { + "epoch": 3.4133440085401654, + "eval_loss": 0.7275528907775879, + "eval_runtime": 119.5885, + "eval_samples_per_second": 32.988, + "eval_steps_per_second": 8.253, + "step": 1000 + }, + { + "epoch": 3.4304243394715774, + "grad_norm": 4.9336113929748535, + "learning_rate": 2.0089079295652306e-05, + "loss": 45.5736, + "step": 1002 + }, + { + "epoch": 3.437256471844142, + "grad_norm": 5.042412757873535, + "learning_rate": 1.9999357655598893e-05, + "loss": 45.6651, + "step": 1004 + }, + { + "epoch": 3.4440886042167067, + "grad_norm": 3.9377660751342773, + "learning_rate": 1.9909703058088376e-05, + "loss": 44.5559, + "step": 1006 + }, + { + "epoch": 3.4509207365892713, + "grad_norm": 4.054321765899658, + "learning_rate": 1.9820116705100777e-05, + "loss": 45.1868, + "step": 1008 + }, + { + "epoch": 3.457752868961836, + "grad_norm": 4.860738277435303, + "learning_rate": 1.9730599797701177e-05, + "loss": 44.6737, + "step": 1010 + }, + { + "epoch": 3.4645850013344006, + "grad_norm": 3.950925827026367, + "learning_rate": 1.9641153536023644e-05, + "loss": 43.7733, + "step": 1012 + }, + { + "epoch": 3.4714171337069657, + "grad_norm": 3.831669569015503, + "learning_rate": 1.9551779119255043e-05, + "loss": 43.7403, + "step": 1014 + }, + { + "epoch": 3.4782492660795303, + "grad_norm": 4.114947319030762, + "learning_rate": 1.9462477745619108e-05, + "loss": 45.5074, + "step": 1016 + }, + { + "epoch": 3.485081398452095, + "grad_norm": 3.405243158340454, + "learning_rate": 1.9373250612360246e-05, + "loss": 46.4417, + "step": 1018 + }, + { + "epoch": 3.4919135308246596, + "grad_norm": 4.80495023727417, + "learning_rate": 1.928409891572757e-05, + "loss": 44.9758, + "step": 1020 + }, + { + "epoch": 3.4987456631972247, + "grad_norm": 4.239831447601318, + "learning_rate": 1.919502385095881e-05, + "loss": 44.6174, + "step": 1022 + }, + { + "epoch": 3.5055777955697893, + "grad_norm": 4.724026203155518, + "learning_rate": 1.9106026612264316e-05, + "loss": 44.7325, + "step": 1024 + }, + { + "epoch": 3.512409927942354, + "grad_norm": 3.4634554386138916, + "learning_rate": 1.9017108392811065e-05, + "loss": 43.7796, + "step": 1026 + }, + { + "epoch": 3.5192420603149186, + "grad_norm": 4.715716361999512, + "learning_rate": 1.8928270384706584e-05, + "loss": 45.2777, + "step": 1028 + }, + { + "epoch": 3.5260741926874832, + "grad_norm": 5.100541114807129, + "learning_rate": 1.8839513778983066e-05, + "loss": 46.4359, + "step": 1030 + }, + { + "epoch": 3.532906325060048, + "grad_norm": 4.475189685821533, + "learning_rate": 1.875083976558136e-05, + "loss": 44.0298, + "step": 1032 + }, + { + "epoch": 3.5397384574326125, + "grad_norm": 4.431650161743164, + "learning_rate": 1.8662249533335003e-05, + "loss": 44.2631, + "step": 1034 + }, + { + "epoch": 3.5465705898051776, + "grad_norm": 4.561038970947266, + "learning_rate": 1.8573744269954298e-05, + "loss": 43.9968, + "step": 1036 + }, + { + "epoch": 3.5534027221777422, + "grad_norm": 3.4181675910949707, + "learning_rate": 1.848532516201039e-05, + "loss": 43.372, + "step": 1038 + }, + { + "epoch": 3.560234854550307, + "grad_norm": 4.05961799621582, + "learning_rate": 1.8396993394919372e-05, + "loss": 43.5887, + "step": 1040 + }, + { + "epoch": 3.5670669869228715, + "grad_norm": 4.183586597442627, + "learning_rate": 1.8308750152926337e-05, + "loss": 43.1976, + "step": 1042 + }, + { + "epoch": 3.5738991192954366, + "grad_norm": 4.6883745193481445, + "learning_rate": 1.8220596619089576e-05, + "loss": 44.4463, + "step": 1044 + }, + { + "epoch": 3.580731251668001, + "grad_norm": 4.490588665008545, + "learning_rate": 1.8132533975264682e-05, + "loss": 44.3332, + "step": 1046 + }, + { + "epoch": 3.587563384040566, + "grad_norm": 4.937854766845703, + "learning_rate": 1.8044563402088684e-05, + "loss": 45.1199, + "step": 1048 + }, + { + "epoch": 3.5943955164131305, + "grad_norm": 3.8182907104492188, + "learning_rate": 1.795668607896426e-05, + "loss": 45.2035, + "step": 1050 + }, + { + "epoch": 3.5943955164131305, + "eval_loss": 0.7135393619537354, + "eval_runtime": 130.7813, + "eval_samples_per_second": 30.165, + "eval_steps_per_second": 7.547, + "step": 1050 + }, + { + "epoch": 3.601227648785695, + "grad_norm": 3.3739826679229736, + "learning_rate": 1.7868903184043887e-05, + "loss": 43.5257, + "step": 1052 + }, + { + "epoch": 3.6080597811582598, + "grad_norm": 3.8119192123413086, + "learning_rate": 1.7781215894214078e-05, + "loss": 44.9718, + "step": 1054 + }, + { + "epoch": 3.6148919135308244, + "grad_norm": 3.6780483722686768, + "learning_rate": 1.7693625385079577e-05, + "loss": 44.496, + "step": 1056 + }, + { + "epoch": 3.6217240459033895, + "grad_norm": 4.625596523284912, + "learning_rate": 1.7606132830947614e-05, + "loss": 43.6496, + "step": 1058 + }, + { + "epoch": 3.628556178275954, + "grad_norm": 5.467988967895508, + "learning_rate": 1.7518739404812155e-05, + "loss": 45.3773, + "step": 1060 + }, + { + "epoch": 3.6353883106485188, + "grad_norm": 3.7848103046417236, + "learning_rate": 1.7431446278338197e-05, + "loss": 43.6622, + "step": 1062 + }, + { + "epoch": 3.6422204430210834, + "grad_norm": 6.2495222091674805, + "learning_rate": 1.7344254621846016e-05, + "loss": 44.7325, + "step": 1064 + }, + { + "epoch": 3.6490525753936485, + "grad_norm": 4.541433811187744, + "learning_rate": 1.7257165604295513e-05, + "loss": 45.7111, + "step": 1066 + }, + { + "epoch": 3.655884707766213, + "grad_norm": 3.6900789737701416, + "learning_rate": 1.7170180393270532e-05, + "loss": 46.2799, + "step": 1068 + }, + { + "epoch": 3.6627168401387777, + "grad_norm": 3.999112129211426, + "learning_rate": 1.7083300154963193e-05, + "loss": 44.9348, + "step": 1070 + }, + { + "epoch": 3.6695489725113424, + "grad_norm": 4.940526008605957, + "learning_rate": 1.699652605415828e-05, + "loss": 45.9208, + "step": 1072 + }, + { + "epoch": 3.676381104883907, + "grad_norm": 3.8536486625671387, + "learning_rate": 1.6909859254217613e-05, + "loss": 45.3559, + "step": 1074 + }, + { + "epoch": 3.6832132372564717, + "grad_norm": 5.941255569458008, + "learning_rate": 1.682330091706446e-05, + "loss": 44.2183, + "step": 1076 + }, + { + "epoch": 3.6900453696290363, + "grad_norm": 4.6851091384887695, + "learning_rate": 1.6736852203167935e-05, + "loss": 45.0132, + "step": 1078 + }, + { + "epoch": 3.6968775020016014, + "grad_norm": 6.338913917541504, + "learning_rate": 1.6650514271527468e-05, + "loss": 44.5087, + "step": 1080 + }, + { + "epoch": 3.703709634374166, + "grad_norm": 6.134509086608887, + "learning_rate": 1.6564288279657252e-05, + "loss": 44.5929, + "step": 1082 + }, + { + "epoch": 3.7105417667467306, + "grad_norm": 3.0185976028442383, + "learning_rate": 1.647817538357072e-05, + "loss": 44.4708, + "step": 1084 + }, + { + "epoch": 3.7173738991192953, + "grad_norm": 4.479791641235352, + "learning_rate": 1.639217673776507e-05, + "loss": 44.4799, + "step": 1086 + }, + { + "epoch": 3.7242060314918604, + "grad_norm": 3.9354395866394043, + "learning_rate": 1.630629349520576e-05, + "loss": 43.3393, + "step": 1088 + }, + { + "epoch": 3.731038163864425, + "grad_norm": 4.530430316925049, + "learning_rate": 1.622052680731105e-05, + "loss": 43.1996, + "step": 1090 + }, + { + "epoch": 3.7378702962369896, + "grad_norm": 4.594604015350342, + "learning_rate": 1.613487782393661e-05, + "loss": 43.6473, + "step": 1092 + }, + { + "epoch": 3.7447024286095543, + "grad_norm": 4.38798713684082, + "learning_rate": 1.604934769336004e-05, + "loss": 43.1229, + "step": 1094 + }, + { + "epoch": 3.751534560982119, + "grad_norm": 4.350236415863037, + "learning_rate": 1.5963937562265525e-05, + "loss": 44.7883, + "step": 1096 + }, + { + "epoch": 3.7583666933546835, + "grad_norm": 4.064984321594238, + "learning_rate": 1.587864857572842e-05, + "loss": 44.1865, + "step": 1098 + }, + { + "epoch": 3.7651988257272486, + "grad_norm": 4.607226848602295, + "learning_rate": 1.5793481877199946e-05, + "loss": 44.6176, + "step": 1100 + }, + { + "epoch": 3.7651988257272486, + "eval_loss": 0.7090520858764648, + "eval_runtime": 136.3013, + "eval_samples_per_second": 28.943, + "eval_steps_per_second": 7.241, + "step": 1100 + }, + { + "epoch": 3.7720309580998133, + "grad_norm": 4.4557719230651855, + "learning_rate": 1.5708438608491814e-05, + "loss": 42.0453, + "step": 1102 + }, + { + "epoch": 3.778863090472378, + "grad_norm": 5.199422359466553, + "learning_rate": 1.5623519909760954e-05, + "loss": 42.589, + "step": 1104 + }, + { + "epoch": 3.7856952228449425, + "grad_norm": 3.632471799850464, + "learning_rate": 1.5538726919494206e-05, + "loss": 43.7924, + "step": 1106 + }, + { + "epoch": 3.7925273552175076, + "grad_norm": 4.203450679779053, + "learning_rate": 1.5454060774493068e-05, + "loss": 45.02, + "step": 1108 + }, + { + "epoch": 3.7993594875900722, + "grad_norm": 5.149316310882568, + "learning_rate": 1.5369522609858446e-05, + "loss": 44.2724, + "step": 1110 + }, + { + "epoch": 3.806191619962637, + "grad_norm": 3.5306341648101807, + "learning_rate": 1.528511355897543e-05, + "loss": 44.2268, + "step": 1112 + }, + { + "epoch": 3.8130237523352015, + "grad_norm": 4.296536445617676, + "learning_rate": 1.5200834753498128e-05, + "loss": 44.0479, + "step": 1114 + }, + { + "epoch": 3.819855884707766, + "grad_norm": 2.969525098800659, + "learning_rate": 1.5116687323334467e-05, + "loss": 43.5543, + "step": 1116 + }, + { + "epoch": 3.826688017080331, + "grad_norm": 4.044551849365234, + "learning_rate": 1.5032672396631056e-05, + "loss": 45.7925, + "step": 1118 + }, + { + "epoch": 3.8335201494528954, + "grad_norm": 5.003629207611084, + "learning_rate": 1.4948791099758052e-05, + "loss": 44.2037, + "step": 1120 + }, + { + "epoch": 3.8403522818254605, + "grad_norm": 3.4248318672180176, + "learning_rate": 1.486504455729408e-05, + "loss": 43.9243, + "step": 1122 + }, + { + "epoch": 3.847184414198025, + "grad_norm": 4.228148937225342, + "learning_rate": 1.4781433892011131e-05, + "loss": 44.7779, + "step": 1124 + }, + { + "epoch": 3.85401654657059, + "grad_norm": 4.345002174377441, + "learning_rate": 1.4697960224859513e-05, + "loss": 43.0617, + "step": 1126 + }, + { + "epoch": 3.8608486789431544, + "grad_norm": 4.824610233306885, + "learning_rate": 1.4614624674952842e-05, + "loss": 43.2687, + "step": 1128 + }, + { + "epoch": 3.8676808113157195, + "grad_norm": 5.528540134429932, + "learning_rate": 1.4531428359553017e-05, + "loss": 43.5145, + "step": 1130 + }, + { + "epoch": 3.874512943688284, + "grad_norm": 3.7578537464141846, + "learning_rate": 1.4448372394055249e-05, + "loss": 43.2377, + "step": 1132 + }, + { + "epoch": 3.8813450760608488, + "grad_norm": 3.191563367843628, + "learning_rate": 1.436545789197313e-05, + "loss": 43.493, + "step": 1134 + }, + { + "epoch": 3.8881772084334134, + "grad_norm": 3.1072089672088623, + "learning_rate": 1.4282685964923642e-05, + "loss": 44.5567, + "step": 1136 + }, + { + "epoch": 3.895009340805978, + "grad_norm": 4.651160717010498, + "learning_rate": 1.4200057722612336e-05, + "loss": 42.7739, + "step": 1138 + }, + { + "epoch": 3.9018414731785427, + "grad_norm": 3.203441858291626, + "learning_rate": 1.4117574272818388e-05, + "loss": 43.1438, + "step": 1140 + }, + { + "epoch": 3.9086736055511073, + "grad_norm": 4.5728349685668945, + "learning_rate": 1.4035236721379757e-05, + "loss": 44.305, + "step": 1142 + }, + { + "epoch": 3.9155057379236724, + "grad_norm": 6.874294757843018, + "learning_rate": 1.3953046172178414e-05, + "loss": 42.8162, + "step": 1144 + }, + { + "epoch": 3.922337870296237, + "grad_norm": 5.198761463165283, + "learning_rate": 1.387100372712548e-05, + "loss": 44.2441, + "step": 1146 + }, + { + "epoch": 3.9291700026688017, + "grad_norm": 3.9007508754730225, + "learning_rate": 1.378911048614647e-05, + "loss": 43.0147, + "step": 1148 + }, + { + "epoch": 3.9360021350413663, + "grad_norm": 3.7035725116729736, + "learning_rate": 1.3707367547166569e-05, + "loss": 45.0733, + "step": 1150 + }, + { + "epoch": 3.9360021350413663, + "eval_loss": 0.7048025131225586, + "eval_runtime": 132.7997, + "eval_samples_per_second": 29.706, + "eval_steps_per_second": 7.432, + "step": 1150 + }, + { + "epoch": 3.9428342674139314, + "grad_norm": 5.101466655731201, + "learning_rate": 1.3625776006095881e-05, + "loss": 42.4982, + "step": 1152 + }, + { + "epoch": 3.949666399786496, + "grad_norm": 4.983183860778809, + "learning_rate": 1.354433695681474e-05, + "loss": 43.3568, + "step": 1154 + }, + { + "epoch": 3.9564985321590607, + "grad_norm": 3.6875593662261963, + "learning_rate": 1.3463051491159096e-05, + "loss": 45.16, + "step": 1156 + }, + { + "epoch": 3.9633306645316253, + "grad_norm": 4.482807636260986, + "learning_rate": 1.3381920698905787e-05, + "loss": 42.8545, + "step": 1158 + }, + { + "epoch": 3.97016279690419, + "grad_norm": 3.858903646469116, + "learning_rate": 1.3300945667758014e-05, + "loss": 42.5779, + "step": 1160 + }, + { + "epoch": 3.9769949292767546, + "grad_norm": 5.07602596282959, + "learning_rate": 1.3220127483330713e-05, + "loss": 43.8678, + "step": 1162 + }, + { + "epoch": 3.983827061649319, + "grad_norm": 5.183884620666504, + "learning_rate": 1.3139467229135999e-05, + "loss": 44.2575, + "step": 1164 + }, + { + "epoch": 3.9906591940218843, + "grad_norm": 5.44564962387085, + "learning_rate": 1.3058965986568648e-05, + "loss": 42.0898, + "step": 1166 + }, + { + "epoch": 3.997491326394449, + "grad_norm": 3.4175875186920166, + "learning_rate": 1.2978624834891628e-05, + "loss": 43.526, + "step": 1168 + }, + { + "epoch": 4.006832132372565, + "grad_norm": 5.1483588218688965, + "learning_rate": 1.2898444851221565e-05, + "loss": 60.1634, + "step": 1170 + }, + { + "epoch": 4.013664264745129, + "grad_norm": 4.452287673950195, + "learning_rate": 1.281842711051438e-05, + "loss": 41.7569, + "step": 1172 + }, + { + "epoch": 4.020496397117694, + "grad_norm": 4.024214267730713, + "learning_rate": 1.2738572685550799e-05, + "loss": 44.7667, + "step": 1174 + }, + { + "epoch": 4.0273285294902585, + "grad_norm": 5.533107757568359, + "learning_rate": 1.2658882646922034e-05, + "loss": 43.7144, + "step": 1176 + }, + { + "epoch": 4.034160661862823, + "grad_norm": 4.520675182342529, + "learning_rate": 1.2579358063015418e-05, + "loss": 43.3862, + "step": 1178 + }, + { + "epoch": 4.040992794235389, + "grad_norm": 4.086079120635986, + "learning_rate": 1.2500000000000006e-05, + "loss": 44.268, + "step": 1180 + }, + { + "epoch": 4.047824926607953, + "grad_norm": 3.335569381713867, + "learning_rate": 1.2420809521812404e-05, + "loss": 43.1871, + "step": 1182 + }, + { + "epoch": 4.054657058980518, + "grad_norm": 4.651849746704102, + "learning_rate": 1.2341787690142437e-05, + "loss": 43.4785, + "step": 1184 + }, + { + "epoch": 4.061489191353083, + "grad_norm": 3.9412457942962646, + "learning_rate": 1.2262935564418886e-05, + "loss": 42.1075, + "step": 1186 + }, + { + "epoch": 4.068321323725647, + "grad_norm": 5.621413230895996, + "learning_rate": 1.2184254201795365e-05, + "loss": 44.5849, + "step": 1188 + }, + { + "epoch": 4.075153456098212, + "grad_norm": 4.291881084442139, + "learning_rate": 1.2105744657136064e-05, + "loss": 42.9562, + "step": 1190 + }, + { + "epoch": 4.0819855884707765, + "grad_norm": 3.730132818222046, + "learning_rate": 1.2027407983001681e-05, + "loss": 44.0838, + "step": 1192 + }, + { + "epoch": 4.088817720843341, + "grad_norm": 3.540987968444824, + "learning_rate": 1.1949245229635245e-05, + "loss": 43.4705, + "step": 1194 + }, + { + "epoch": 4.095649853215906, + "grad_norm": 3.0649805068969727, + "learning_rate": 1.1871257444948098e-05, + "loss": 43.0996, + "step": 1196 + }, + { + "epoch": 4.10248198558847, + "grad_norm": 3.2024762630462646, + "learning_rate": 1.1793445674505776e-05, + "loss": 42.772, + "step": 1198 + }, + { + "epoch": 4.109314117961035, + "grad_norm": 3.462251663208008, + "learning_rate": 1.1715810961514073e-05, + "loss": 43.2502, + "step": 1200 + }, + { + "epoch": 4.109314117961035, + "eval_loss": 0.7009151577949524, + "eval_runtime": 133.1765, + "eval_samples_per_second": 29.622, + "eval_steps_per_second": 7.411, + "step": 1200 + }, + { + "epoch": 4.116146250333601, + "grad_norm": 4.633735656738281, + "learning_rate": 1.1638354346804971e-05, + "loss": 42.8239, + "step": 1202 + }, + { + "epoch": 4.122978382706165, + "grad_norm": 3.758700132369995, + "learning_rate": 1.1561076868822756e-05, + "loss": 43.3475, + "step": 1204 + }, + { + "epoch": 4.12981051507873, + "grad_norm": 4.143715858459473, + "learning_rate": 1.148397956361007e-05, + "loss": 44.0, + "step": 1206 + }, + { + "epoch": 4.1366426474512945, + "grad_norm": 5.201571941375732, + "learning_rate": 1.1407063464793966e-05, + "loss": 42.5036, + "step": 1208 + }, + { + "epoch": 4.143474779823859, + "grad_norm": 3.4282047748565674, + "learning_rate": 1.133032960357216e-05, + "loss": 43.0577, + "step": 1210 + }, + { + "epoch": 4.150306912196424, + "grad_norm": 4.114802837371826, + "learning_rate": 1.1253779008699131e-05, + "loss": 43.3517, + "step": 1212 + }, + { + "epoch": 4.157139044568988, + "grad_norm": 3.979163408279419, + "learning_rate": 1.1177412706472321e-05, + "loss": 42.5044, + "step": 1214 + }, + { + "epoch": 4.163971176941553, + "grad_norm": 4.363109588623047, + "learning_rate": 1.1101231720718442e-05, + "loss": 43.8954, + "step": 1216 + }, + { + "epoch": 4.170803309314118, + "grad_norm": 4.6219401359558105, + "learning_rate": 1.1025237072779663e-05, + "loss": 43.413, + "step": 1218 + }, + { + "epoch": 4.177635441686682, + "grad_norm": 4.945540904998779, + "learning_rate": 1.09494297815e-05, + "loss": 43.9628, + "step": 1220 + }, + { + "epoch": 4.184467574059248, + "grad_norm": 4.4585747718811035, + "learning_rate": 1.0873810863211595e-05, + "loss": 42.6454, + "step": 1222 + }, + { + "epoch": 4.1912997064318125, + "grad_norm": 4.659883499145508, + "learning_rate": 1.0798381331721109e-05, + "loss": 42.5656, + "step": 1224 + }, + { + "epoch": 4.198131838804377, + "grad_norm": 4.411434650421143, + "learning_rate": 1.0723142198296155e-05, + "loss": 41.2252, + "step": 1226 + }, + { + "epoch": 4.204963971176942, + "grad_norm": 4.985414028167725, + "learning_rate": 1.0648094471651724e-05, + "loss": 42.05, + "step": 1228 + }, + { + "epoch": 4.211796103549506, + "grad_norm": 5.09487771987915, + "learning_rate": 1.0573239157936619e-05, + "loss": 42.9917, + "step": 1230 + }, + { + "epoch": 4.218628235922071, + "grad_norm": 4.299539089202881, + "learning_rate": 1.049857726072005e-05, + "loss": 42.7934, + "step": 1232 + }, + { + "epoch": 4.225460368294636, + "grad_norm": 4.075766086578369, + "learning_rate": 1.0424109780978103e-05, + "loss": 41.0067, + "step": 1234 + }, + { + "epoch": 4.2322925006672, + "grad_norm": 4.9132232666015625, + "learning_rate": 1.034983771708035e-05, + "loss": 43.6556, + "step": 1236 + }, + { + "epoch": 4.239124633039765, + "grad_norm": 4.45914888381958, + "learning_rate": 1.0275762064776492e-05, + "loss": 42.588, + "step": 1238 + }, + { + "epoch": 4.24595676541233, + "grad_norm": 3.7621419429779053, + "learning_rate": 1.020188381718295e-05, + "loss": 41.7435, + "step": 1240 + }, + { + "epoch": 4.252788897784894, + "grad_norm": 2.9593658447265625, + "learning_rate": 1.0128203964769601e-05, + "loss": 43.7138, + "step": 1242 + }, + { + "epoch": 4.25962103015746, + "grad_norm": 4.333788871765137, + "learning_rate": 1.0054723495346482e-05, + "loss": 42.7332, + "step": 1244 + }, + { + "epoch": 4.266453162530024, + "grad_norm": 4.040637493133545, + "learning_rate": 9.981443394050525e-06, + "loss": 43.0547, + "step": 1246 + }, + { + "epoch": 4.273285294902589, + "grad_norm": 5.255796432495117, + "learning_rate": 9.908364643332399e-06, + "loss": 42.1078, + "step": 1248 + }, + { + "epoch": 4.280117427275154, + "grad_norm": 3.434884786605835, + "learning_rate": 9.835488222943285e-06, + "loss": 42.6684, + "step": 1250 + }, + { + "epoch": 4.280117427275154, + "eval_loss": 0.6948874592781067, + "eval_runtime": 138.5111, + "eval_samples_per_second": 28.481, + "eval_steps_per_second": 7.126, + "step": 1250 + }, + { + "epoch": 4.286949559647718, + "grad_norm": 4.761016368865967, + "learning_rate": 9.762815109921761e-06, + "loss": 43.8, + "step": 1252 + }, + { + "epoch": 4.293781692020283, + "grad_norm": 5.999067783355713, + "learning_rate": 9.690346278580726e-06, + "loss": 42.8654, + "step": 1254 + }, + { + "epoch": 4.300613824392848, + "grad_norm": 4.777903079986572, + "learning_rate": 9.618082700494319e-06, + "loss": 42.3409, + "step": 1256 + }, + { + "epoch": 4.307445956765412, + "grad_norm": 4.543084144592285, + "learning_rate": 9.546025344484869e-06, + "loss": 43.6205, + "step": 1258 + }, + { + "epoch": 4.314278089137977, + "grad_norm": 3.6853065490722656, + "learning_rate": 9.474175176609956e-06, + "loss": 43.9045, + "step": 1260 + }, + { + "epoch": 4.3211102215105415, + "grad_norm": 4.3578338623046875, + "learning_rate": 9.402533160149416e-06, + "loss": 41.781, + "step": 1262 + }, + { + "epoch": 4.327942353883106, + "grad_norm": 4.191073894500732, + "learning_rate": 9.331100255592437e-06, + "loss": 42.5713, + "step": 1264 + }, + { + "epoch": 4.334774486255672, + "grad_norm": 5.591835021972656, + "learning_rate": 9.259877420624721e-06, + "loss": 42.9316, + "step": 1266 + }, + { + "epoch": 4.341606618628236, + "grad_norm": 4.916292667388916, + "learning_rate": 9.18886561011557e-06, + "loss": 42.9316, + "step": 1268 + }, + { + "epoch": 4.348438751000801, + "grad_norm": 3.4310858249664307, + "learning_rate": 9.118065776105159e-06, + "loss": 42.0445, + "step": 1270 + }, + { + "epoch": 4.3552708833733655, + "grad_norm": 3.6645348072052, + "learning_rate": 9.047478867791732e-06, + "loss": 41.5698, + "step": 1272 + }, + { + "epoch": 4.36210301574593, + "grad_norm": 4.118466854095459, + "learning_rate": 8.977105831518864e-06, + "loss": 41.7493, + "step": 1274 + }, + { + "epoch": 4.368935148118495, + "grad_norm": 4.731881141662598, + "learning_rate": 8.906947610762825e-06, + "loss": 41.2277, + "step": 1276 + }, + { + "epoch": 4.3757672804910595, + "grad_norm": 4.580758571624756, + "learning_rate": 8.837005146119872e-06, + "loss": 42.3467, + "step": 1278 + }, + { + "epoch": 4.382599412863624, + "grad_norm": 5.310960292816162, + "learning_rate": 8.767279375293672e-06, + "loss": 43.1447, + "step": 1280 + }, + { + "epoch": 4.389431545236189, + "grad_norm": 4.382359027862549, + "learning_rate": 8.697771233082744e-06, + "loss": 42.4424, + "step": 1282 + }, + { + "epoch": 4.396263677608753, + "grad_norm": 3.6488263607025146, + "learning_rate": 8.628481651367876e-06, + "loss": 43.8516, + "step": 1284 + }, + { + "epoch": 4.403095809981318, + "grad_norm": 3.2983975410461426, + "learning_rate": 8.55941155909968e-06, + "loss": 43.3322, + "step": 1286 + }, + { + "epoch": 4.4099279423538835, + "grad_norm": 3.5116684436798096, + "learning_rate": 8.490561882286136e-06, + "loss": 41.4651, + "step": 1288 + }, + { + "epoch": 4.416760074726448, + "grad_norm": 3.5123932361602783, + "learning_rate": 8.421933543980126e-06, + "loss": 43.1034, + "step": 1290 + }, + { + "epoch": 4.423592207099013, + "grad_norm": 4.123583793640137, + "learning_rate": 8.353527464267104e-06, + "loss": 43.566, + "step": 1292 + }, + { + "epoch": 4.430424339471577, + "grad_norm": 3.6427931785583496, + "learning_rate": 8.285344560252777e-06, + "loss": 42.0333, + "step": 1294 + }, + { + "epoch": 4.437256471844142, + "grad_norm": 3.8917388916015625, + "learning_rate": 8.217385746050742e-06, + "loss": 42.0382, + "step": 1296 + }, + { + "epoch": 4.444088604216707, + "grad_norm": 4.964122772216797, + "learning_rate": 8.149651932770308e-06, + "loss": 43.6584, + "step": 1298 + }, + { + "epoch": 4.450920736589271, + "grad_norm": 4.227240085601807, + "learning_rate": 8.082144028504233e-06, + "loss": 42.4086, + "step": 1300 + }, + { + "epoch": 4.450920736589271, + "eval_loss": 0.6897044777870178, + "eval_runtime": 131.8148, + "eval_samples_per_second": 29.928, + "eval_steps_per_second": 7.488, + "step": 1300 + }, + { + "epoch": 4.457752868961836, + "grad_norm": 4.605757713317871, + "learning_rate": 8.014862938316542e-06, + "loss": 43.7962, + "step": 1302 + }, + { + "epoch": 4.464585001334401, + "grad_norm": 4.2398176193237305, + "learning_rate": 7.947809564230445e-06, + "loss": 42.3544, + "step": 1304 + }, + { + "epoch": 4.471417133706965, + "grad_norm": 5.234216213226318, + "learning_rate": 7.880984805216185e-06, + "loss": 41.9833, + "step": 1306 + }, + { + "epoch": 4.47824926607953, + "grad_norm": 3.9220240116119385, + "learning_rate": 7.814389557179017e-06, + "loss": 42.0345, + "step": 1308 + }, + { + "epoch": 4.485081398452095, + "grad_norm": 5.44996976852417, + "learning_rate": 7.748024712947205e-06, + "loss": 42.0309, + "step": 1310 + }, + { + "epoch": 4.49191353082466, + "grad_norm": 5.07472038269043, + "learning_rate": 7.681891162260015e-06, + "loss": 42.6996, + "step": 1312 + }, + { + "epoch": 4.498745663197225, + "grad_norm": 3.818120241165161, + "learning_rate": 7.615989791755834e-06, + "loss": 42.8775, + "step": 1314 + }, + { + "epoch": 4.505577795569789, + "grad_norm": 4.252802848815918, + "learning_rate": 7.5503214849602516e-06, + "loss": 42.4118, + "step": 1316 + }, + { + "epoch": 4.512409927942354, + "grad_norm": 4.17697286605835, + "learning_rate": 7.484887122274215e-06, + "loss": 41.2153, + "step": 1318 + }, + { + "epoch": 4.519242060314919, + "grad_norm": 3.7324466705322266, + "learning_rate": 7.419687580962223e-06, + "loss": 42.3343, + "step": 1320 + }, + { + "epoch": 4.526074192687483, + "grad_norm": 3.870089054107666, + "learning_rate": 7.354723735140609e-06, + "loss": 42.0028, + "step": 1322 + }, + { + "epoch": 4.532906325060048, + "grad_norm": 3.6424801349639893, + "learning_rate": 7.289996455765749e-06, + "loss": 43.5842, + "step": 1324 + }, + { + "epoch": 4.5397384574326125, + "grad_norm": 4.695961952209473, + "learning_rate": 7.225506610622456e-06, + "loss": 42.0951, + "step": 1326 + }, + { + "epoch": 4.546570589805177, + "grad_norm": 4.842666149139404, + "learning_rate": 7.161255064312283e-06, + "loss": 43.8668, + "step": 1328 + }, + { + "epoch": 4.553402722177742, + "grad_norm": 4.4085822105407715, + "learning_rate": 7.0972426782419884e-06, + "loss": 43.7836, + "step": 1330 + }, + { + "epoch": 4.560234854550307, + "grad_norm": 3.606607437133789, + "learning_rate": 7.033470310611945e-06, + "loss": 41.4304, + "step": 1332 + }, + { + "epoch": 4.567066986922872, + "grad_norm": 4.789222717285156, + "learning_rate": 6.969938816404639e-06, + "loss": 41.6355, + "step": 1334 + }, + { + "epoch": 4.573899119295437, + "grad_norm": 4.463109493255615, + "learning_rate": 6.906649047373246e-06, + "loss": 43.4969, + "step": 1336 + }, + { + "epoch": 4.580731251668001, + "grad_norm": 4.483322620391846, + "learning_rate": 6.843601852030171e-06, + "loss": 42.4094, + "step": 1338 + }, + { + "epoch": 4.587563384040566, + "grad_norm": 4.021024703979492, + "learning_rate": 6.780798075635675e-06, + "loss": 42.2893, + "step": 1340 + }, + { + "epoch": 4.5943955164131305, + "grad_norm": 3.9479868412017822, + "learning_rate": 6.718238560186571e-06, + "loss": 40.8073, + "step": 1342 + }, + { + "epoch": 4.601227648785695, + "grad_norm": 4.778145790100098, + "learning_rate": 6.655924144404907e-06, + "loss": 42.0845, + "step": 1344 + }, + { + "epoch": 4.60805978115826, + "grad_norm": 3.555271863937378, + "learning_rate": 6.593855663726722e-06, + "loss": 41.1015, + "step": 1346 + }, + { + "epoch": 4.614891913530824, + "grad_norm": 4.007204532623291, + "learning_rate": 6.532033950290886e-06, + "loss": 42.9137, + "step": 1348 + }, + { + "epoch": 4.621724045903389, + "grad_norm": 4.328546524047852, + "learning_rate": 6.470459832927881e-06, + "loss": 41.274, + "step": 1350 + }, + { + "epoch": 4.621724045903389, + "eval_loss": 0.6830974221229553, + "eval_runtime": 135.2812, + "eval_samples_per_second": 29.161, + "eval_steps_per_second": 7.296, + "step": 1350 + }, + { + "epoch": 4.628556178275954, + "grad_norm": 4.948083877563477, + "learning_rate": 6.409134137148737e-06, + "loss": 43.0462, + "step": 1352 + }, + { + "epoch": 4.635388310648519, + "grad_norm": 4.637773036956787, + "learning_rate": 6.3480576851339625e-06, + "loss": 42.6268, + "step": 1354 + }, + { + "epoch": 4.642220443021084, + "grad_norm": 3.72841215133667, + "learning_rate": 6.28723129572247e-06, + "loss": 41.0574, + "step": 1356 + }, + { + "epoch": 4.6490525753936485, + "grad_norm": 4.539714813232422, + "learning_rate": 6.226655784400684e-06, + "loss": 43.5752, + "step": 1358 + }, + { + "epoch": 4.655884707766213, + "grad_norm": 5.519583225250244, + "learning_rate": 6.166331963291519e-06, + "loss": 43.3111, + "step": 1360 + }, + { + "epoch": 4.662716840138778, + "grad_norm": 4.942199230194092, + "learning_rate": 6.106260641143546e-06, + "loss": 43.6514, + "step": 1362 + }, + { + "epoch": 4.669548972511342, + "grad_norm": 5.164299011230469, + "learning_rate": 6.046442623320145e-06, + "loss": 40.8611, + "step": 1364 + }, + { + "epoch": 4.676381104883907, + "grad_norm": 4.309698581695557, + "learning_rate": 5.986878711788702e-06, + "loss": 41.3937, + "step": 1366 + }, + { + "epoch": 4.683213237256472, + "grad_norm": 4.105101585388184, + "learning_rate": 5.927569705109828e-06, + "loss": 40.3001, + "step": 1368 + }, + { + "epoch": 4.690045369629036, + "grad_norm": 3.571514368057251, + "learning_rate": 5.868516398426716e-06, + "loss": 41.6858, + "step": 1370 + }, + { + "epoch": 4.696877502001601, + "grad_norm": 5.120858192443848, + "learning_rate": 5.809719583454415e-06, + "loss": 41.4156, + "step": 1372 + }, + { + "epoch": 4.703709634374166, + "grad_norm": 4.679799556732178, + "learning_rate": 5.751180048469243e-06, + "loss": 43.1858, + "step": 1374 + }, + { + "epoch": 4.710541766746731, + "grad_norm": 3.0465521812438965, + "learning_rate": 5.692898578298253e-06, + "loss": 41.213, + "step": 1376 + }, + { + "epoch": 4.717373899119296, + "grad_norm": 4.835347652435303, + "learning_rate": 5.634875954308638e-06, + "loss": 44.0938, + "step": 1378 + }, + { + "epoch": 4.72420603149186, + "grad_norm": 6.645193099975586, + "learning_rate": 5.577112954397321e-06, + "loss": 41.7528, + "step": 1380 + }, + { + "epoch": 4.731038163864425, + "grad_norm": 4.592052936553955, + "learning_rate": 5.519610352980501e-06, + "loss": 42.566, + "step": 1382 + }, + { + "epoch": 4.73787029623699, + "grad_norm": 3.7620317935943604, + "learning_rate": 5.462368920983249e-06, + "loss": 41.7184, + "step": 1384 + }, + { + "epoch": 4.744702428609554, + "grad_norm": 4.0445027351379395, + "learning_rate": 5.405389425829219e-06, + "loss": 41.6249, + "step": 1386 + }, + { + "epoch": 4.751534560982119, + "grad_norm": 3.744433641433716, + "learning_rate": 5.348672631430318e-06, + "loss": 43.0626, + "step": 1388 + }, + { + "epoch": 4.7583666933546835, + "grad_norm": 3.12141489982605, + "learning_rate": 5.292219298176476e-06, + "loss": 42.1533, + "step": 1390 + }, + { + "epoch": 4.765198825727248, + "grad_norm": 6.73304557800293, + "learning_rate": 5.236030182925475e-06, + "loss": 41.6015, + "step": 1392 + }, + { + "epoch": 4.772030958099813, + "grad_norm": 4.076465129852295, + "learning_rate": 5.1801060389927606e-06, + "loss": 43.2645, + "step": 1394 + }, + { + "epoch": 4.7788630904723775, + "grad_norm": 4.178272247314453, + "learning_rate": 5.124447616141381e-06, + "loss": 43.0354, + "step": 1396 + }, + { + "epoch": 4.785695222844943, + "grad_norm": 4.555927276611328, + "learning_rate": 5.06905566057192e-06, + "loss": 42.1086, + "step": 1398 + }, + { + "epoch": 4.792527355217508, + "grad_norm": 4.799075126647949, + "learning_rate": 5.013930914912476e-06, + "loss": 40.7555, + "step": 1400 + }, + { + "epoch": 4.792527355217508, + "eval_loss": 0.6814665198326111, + "eval_runtime": 134.9461, + "eval_samples_per_second": 29.234, + "eval_steps_per_second": 7.314, + "step": 1400 + }, + { + "epoch": 4.799359487590072, + "grad_norm": 3.7408673763275146, + "learning_rate": 4.959074118208726e-06, + "loss": 40.9295, + "step": 1402 + }, + { + "epoch": 4.806191619962637, + "grad_norm": 3.9520747661590576, + "learning_rate": 4.9044860059140275e-06, + "loss": 43.4186, + "step": 1404 + }, + { + "epoch": 4.8130237523352015, + "grad_norm": 4.115049839019775, + "learning_rate": 4.850167309879519e-06, + "loss": 42.2491, + "step": 1406 + }, + { + "epoch": 4.819855884707766, + "grad_norm": 5.181631088256836, + "learning_rate": 4.796118758344354e-06, + "loss": 41.583, + "step": 1408 + }, + { + "epoch": 4.826688017080331, + "grad_norm": 3.838186740875244, + "learning_rate": 4.742341075925916e-06, + "loss": 43.3278, + "step": 1410 + }, + { + "epoch": 4.833520149452895, + "grad_norm": 3.6494245529174805, + "learning_rate": 4.6888349836100825e-06, + "loss": 41.3961, + "step": 1412 + }, + { + "epoch": 4.84035228182546, + "grad_norm": 4.139842510223389, + "learning_rate": 4.6356011987416075e-06, + "loss": 43.4135, + "step": 1414 + }, + { + "epoch": 4.847184414198025, + "grad_norm": 4.385437965393066, + "learning_rate": 4.58264043501446e-06, + "loss": 42.1478, + "step": 1416 + }, + { + "epoch": 4.854016546570589, + "grad_norm": 3.691343307495117, + "learning_rate": 4.52995340246227e-06, + "loss": 42.4175, + "step": 1418 + }, + { + "epoch": 4.860848678943155, + "grad_norm": 4.149899482727051, + "learning_rate": 4.477540807448832e-06, + "loss": 42.4116, + "step": 1420 + }, + { + "epoch": 4.8676808113157195, + "grad_norm": 3.8960561752319336, + "learning_rate": 4.425403352658591e-06, + "loss": 41.2306, + "step": 1422 + }, + { + "epoch": 4.874512943688284, + "grad_norm": 3.6276168823242188, + "learning_rate": 4.373541737087264e-06, + "loss": 42.7317, + "step": 1424 + }, + { + "epoch": 4.881345076060849, + "grad_norm": 4.214303016662598, + "learning_rate": 4.32195665603245e-06, + "loss": 41.6166, + "step": 1426 + }, + { + "epoch": 4.888177208433413, + "grad_norm": 4.3136210441589355, + "learning_rate": 4.270648801084296e-06, + "loss": 42.3309, + "step": 1428 + }, + { + "epoch": 4.895009340805978, + "grad_norm": 5.340824604034424, + "learning_rate": 4.219618860116242e-06, + "loss": 40.6249, + "step": 1430 + }, + { + "epoch": 4.901841473178543, + "grad_norm": 3.750943183898926, + "learning_rate": 4.1688675172758064e-06, + "loss": 42.0754, + "step": 1432 + }, + { + "epoch": 4.908673605551107, + "grad_norm": 3.8021140098571777, + "learning_rate": 4.118395452975382e-06, + "loss": 42.8221, + "step": 1434 + }, + { + "epoch": 4.915505737923672, + "grad_norm": 5.09911584854126, + "learning_rate": 4.068203343883159e-06, + "loss": 42.3164, + "step": 1436 + }, + { + "epoch": 4.9223378702962375, + "grad_norm": 3.590981960296631, + "learning_rate": 4.018291862914001e-06, + "loss": 41.0773, + "step": 1438 + }, + { + "epoch": 4.929170002668801, + "grad_norm": 4.474262714385986, + "learning_rate": 3.968661679220468e-06, + "loss": 41.1827, + "step": 1440 + }, + { + "epoch": 4.936002135041367, + "grad_norm": 3.780853748321533, + "learning_rate": 3.919313458183838e-06, + "loss": 41.9009, + "step": 1442 + }, + { + "epoch": 4.942834267413931, + "grad_norm": 4.165524482727051, + "learning_rate": 3.8702478614051355e-06, + "loss": 41.6988, + "step": 1444 + }, + { + "epoch": 4.949666399786496, + "grad_norm": 4.537020683288574, + "learning_rate": 3.821465546696337e-06, + "loss": 42.6527, + "step": 1446 + }, + { + "epoch": 4.956498532159061, + "grad_norm": 5.992898941040039, + "learning_rate": 3.772967168071517e-06, + "loss": 42.3257, + "step": 1448 + }, + { + "epoch": 4.963330664531625, + "grad_norm": 5.681396007537842, + "learning_rate": 3.7247533757380603e-06, + "loss": 42.5366, + "step": 1450 + }, + { + "epoch": 4.963330664531625, + "eval_loss": 0.6770752668380737, + "eval_runtime": 133.8871, + "eval_samples_per_second": 29.465, + "eval_steps_per_second": 7.372, + "step": 1450 + }, + { + "epoch": 4.97016279690419, + "grad_norm": 4.46541166305542, + "learning_rate": 3.6768248160879787e-06, + "loss": 41.0476, + "step": 1452 + }, + { + "epoch": 4.976994929276755, + "grad_norm": 4.15000057220459, + "learning_rate": 3.6291821316892184e-06, + "loss": 40.7134, + "step": 1454 + }, + { + "epoch": 4.983827061649319, + "grad_norm": 4.230960369110107, + "learning_rate": 3.5818259612770744e-06, + "loss": 43.5967, + "step": 1456 + }, + { + "epoch": 4.990659194021884, + "grad_norm": 4.932849884033203, + "learning_rate": 3.53475693974559e-06, + "loss": 43.2516, + "step": 1458 + }, + { + "epoch": 4.997491326394449, + "grad_norm": 4.316704273223877, + "learning_rate": 3.487975698139084e-06, + "loss": 42.3811, + "step": 1460 + }, + { + "epoch": 5.003416066186283, + "grad_norm": 4.146729469299316, + "learning_rate": 3.4414828636436525e-06, + "loss": 36.1288, + "step": 1462 + }, + { + "epoch": 5.010248198558847, + "grad_norm": 5.610274791717529, + "learning_rate": 3.3952790595787987e-06, + "loss": 40.6556, + "step": 1464 + }, + { + "epoch": 5.017080330931412, + "grad_norm": 6.292807102203369, + "learning_rate": 3.3493649053890326e-06, + "loss": 42.2675, + "step": 1466 + }, + { + "epoch": 5.023912463303977, + "grad_norm": 4.371929168701172, + "learning_rate": 3.3037410166356143e-06, + "loss": 41.1544, + "step": 1468 + }, + { + "epoch": 5.030744595676541, + "grad_norm": 3.275562047958374, + "learning_rate": 3.258408004988278e-06, + "loss": 42.7401, + "step": 1470 + }, + { + "epoch": 5.037576728049106, + "grad_norm": 5.2857666015625, + "learning_rate": 3.2133664782169948e-06, + "loss": 39.4961, + "step": 1472 + }, + { + "epoch": 5.044408860421671, + "grad_norm": 3.9162814617156982, + "learning_rate": 3.168617040183897e-06, + "loss": 42.7691, + "step": 1474 + }, + { + "epoch": 5.051240992794235, + "grad_norm": 4.741237640380859, + "learning_rate": 3.1241602908351404e-06, + "loss": 39.9539, + "step": 1476 + }, + { + "epoch": 5.0580731251668, + "grad_norm": 4.904325008392334, + "learning_rate": 3.079996826192849e-06, + "loss": 40.999, + "step": 1478 + }, + { + "epoch": 5.0649052575393645, + "grad_norm": 3.9396679401397705, + "learning_rate": 3.036127238347164e-06, + "loss": 41.8233, + "step": 1480 + }, + { + "epoch": 5.071737389911929, + "grad_norm": 3.5699760913848877, + "learning_rate": 2.992552115448258e-06, + "loss": 41.4895, + "step": 1482 + }, + { + "epoch": 5.078569522284495, + "grad_norm": 4.227250099182129, + "learning_rate": 2.9492720416985e-06, + "loss": 41.7825, + "step": 1484 + }, + { + "epoch": 5.085401654657059, + "grad_norm": 3.8788514137268066, + "learning_rate": 2.9062875973445813e-06, + "loss": 41.4301, + "step": 1486 + }, + { + "epoch": 5.092233787029624, + "grad_norm": 3.7242729663848877, + "learning_rate": 2.8635993586697553e-06, + "loss": 40.2917, + "step": 1488 + }, + { + "epoch": 5.099065919402189, + "grad_norm": 5.645269870758057, + "learning_rate": 2.821207897986114e-06, + "loss": 41.1435, + "step": 1490 + }, + { + "epoch": 5.105898051774753, + "grad_norm": 3.9231839179992676, + "learning_rate": 2.779113783626916e-06, + "loss": 41.5506, + "step": 1492 + }, + { + "epoch": 5.112730184147318, + "grad_norm": 4.276205062866211, + "learning_rate": 2.7373175799389415e-06, + "loss": 40.4141, + "step": 1494 + }, + { + "epoch": 5.1195623165198825, + "grad_norm": 6.223433971405029, + "learning_rate": 2.6958198472749717e-06, + "loss": 42.1149, + "step": 1496 + }, + { + "epoch": 5.126394448892447, + "grad_norm": 4.167882442474365, + "learning_rate": 2.65462114198623e-06, + "loss": 40.7711, + "step": 1498 + }, + { + "epoch": 5.133226581265012, + "grad_norm": 3.588376998901367, + "learning_rate": 2.6137220164149435e-06, + "loss": 42.5513, + "step": 1500 + }, + { + "epoch": 5.133226581265012, + "eval_loss": 0.6761642694473267, + "eval_runtime": 137.9512, + "eval_samples_per_second": 28.597, + "eval_steps_per_second": 7.155, + "step": 1500 + }, + { + "epoch": 5.140058713637576, + "grad_norm": 4.149092674255371, + "learning_rate": 2.573123018886961e-06, + "loss": 40.5633, + "step": 1502 + }, + { + "epoch": 5.146890846010141, + "grad_norm": 3.9322760105133057, + "learning_rate": 2.5328246937043526e-06, + "loss": 41.3711, + "step": 1504 + }, + { + "epoch": 5.1537229783827065, + "grad_norm": 4.557422161102295, + "learning_rate": 2.492827581138149e-06, + "loss": 39.5696, + "step": 1506 + }, + { + "epoch": 5.160555110755271, + "grad_norm": 3.772927761077881, + "learning_rate": 2.4531322174210975e-06, + "loss": 42.9544, + "step": 1508 + }, + { + "epoch": 5.167387243127836, + "grad_norm": 4.051291465759277, + "learning_rate": 2.4137391347404476e-06, + "loss": 40.978, + "step": 1510 + }, + { + "epoch": 5.1742193755004005, + "grad_norm": 3.6557424068450928, + "learning_rate": 2.37464886123083e-06, + "loss": 41.606, + "step": 1512 + }, + { + "epoch": 5.181051507872965, + "grad_norm": 4.801413536071777, + "learning_rate": 2.3358619209672e-06, + "loss": 41.5917, + "step": 1514 + }, + { + "epoch": 5.18788364024553, + "grad_norm": 4.2001423835754395, + "learning_rate": 2.2973788339577613e-06, + "loss": 43.0596, + "step": 1516 + }, + { + "epoch": 5.194715772618094, + "grad_norm": 5.291867256164551, + "learning_rate": 2.2592001161370392e-06, + "loss": 40.3588, + "step": 1518 + }, + { + "epoch": 5.201547904990659, + "grad_norm": 3.7930984497070312, + "learning_rate": 2.2213262793589484e-06, + "loss": 42.0758, + "step": 1520 + }, + { + "epoch": 5.208380037363224, + "grad_norm": 4.888052940368652, + "learning_rate": 2.1837578313899098e-06, + "loss": 39.7415, + "step": 1522 + }, + { + "epoch": 5.215212169735788, + "grad_norm": 4.963688850402832, + "learning_rate": 2.1464952759020855e-06, + "loss": 42.05, + "step": 1524 + }, + { + "epoch": 5.222044302108353, + "grad_norm": 4.556923866271973, + "learning_rate": 2.109539112466588e-06, + "loss": 40.5828, + "step": 1526 + }, + { + "epoch": 5.228876434480918, + "grad_norm": 3.550285577774048, + "learning_rate": 2.0728898365467903e-06, + "loss": 41.4201, + "step": 1528 + }, + { + "epoch": 5.235708566853483, + "grad_norm": 4.290851593017578, + "learning_rate": 2.0365479394917147e-06, + "loss": 41.1988, + "step": 1530 + }, + { + "epoch": 5.242540699226048, + "grad_norm": 4.436618804931641, + "learning_rate": 2.0005139085293945e-06, + "loss": 41.1016, + "step": 1532 + }, + { + "epoch": 5.249372831598612, + "grad_norm": 6.221188068389893, + "learning_rate": 1.9647882267603862e-06, + "loss": 42.1538, + "step": 1534 + }, + { + "epoch": 5.256204963971177, + "grad_norm": 4.712629795074463, + "learning_rate": 1.9293713731512673e-06, + "loss": 41.1176, + "step": 1536 + }, + { + "epoch": 5.263037096343742, + "grad_norm": 4.693170070648193, + "learning_rate": 1.894263822528225e-06, + "loss": 41.3687, + "step": 1538 + }, + { + "epoch": 5.269869228716306, + "grad_norm": 4.854535102844238, + "learning_rate": 1.8594660455706763e-06, + "loss": 41.6856, + "step": 1540 + }, + { + "epoch": 5.276701361088871, + "grad_norm": 3.5167202949523926, + "learning_rate": 1.8249785088049893e-06, + "loss": 42.5848, + "step": 1542 + }, + { + "epoch": 5.2835334934614355, + "grad_norm": 4.029543399810791, + "learning_rate": 1.790801674598186e-06, + "loss": 41.8932, + "step": 1544 + }, + { + "epoch": 5.290365625834, + "grad_norm": 4.217826843261719, + "learning_rate": 1.7569360011517848e-06, + "loss": 41.478, + "step": 1546 + }, + { + "epoch": 5.297197758206565, + "grad_norm": 3.8237998485565186, + "learning_rate": 1.7233819424956248e-06, + "loss": 42.5394, + "step": 1548 + }, + { + "epoch": 5.30402989057913, + "grad_norm": 5.044140338897705, + "learning_rate": 1.6901399484818004e-06, + "loss": 41.0466, + "step": 1550 + }, + { + "epoch": 5.30402989057913, + "eval_loss": 0.6723917722702026, + "eval_runtime": 132.3674, + "eval_samples_per_second": 29.803, + "eval_steps_per_second": 7.457, + "step": 1550 + }, + { + "epoch": 5.310862022951695, + "grad_norm": 4.023882865905762, + "learning_rate": 1.6572104647786247e-06, + "loss": 40.4515, + "step": 1552 + }, + { + "epoch": 5.31769415532426, + "grad_norm": 5.667575836181641, + "learning_rate": 1.624593932864632e-06, + "loss": 42.2196, + "step": 1554 + }, + { + "epoch": 5.324526287696824, + "grad_norm": 3.771815299987793, + "learning_rate": 1.5922907900227018e-06, + "loss": 41.1018, + "step": 1556 + }, + { + "epoch": 5.331358420069389, + "grad_norm": 4.044847011566162, + "learning_rate": 1.5603014693341662e-06, + "loss": 40.8528, + "step": 1558 + }, + { + "epoch": 5.3381905524419535, + "grad_norm": 4.64625358581543, + "learning_rate": 1.5286263996730026e-06, + "loss": 41.612, + "step": 1560 + }, + { + "epoch": 5.345022684814518, + "grad_norm": 5.102336406707764, + "learning_rate": 1.497266005700107e-06, + "loss": 40.965, + "step": 1562 + }, + { + "epoch": 5.351854817187083, + "grad_norm": 3.1535797119140625, + "learning_rate": 1.4662207078575684e-06, + "loss": 40.5264, + "step": 1564 + }, + { + "epoch": 5.358686949559647, + "grad_norm": 3.740694522857666, + "learning_rate": 1.4354909223630669e-06, + "loss": 41.5863, + "step": 1566 + }, + { + "epoch": 5.365519081932212, + "grad_norm": 4.79527473449707, + "learning_rate": 1.40507706120426e-06, + "loss": 41.3632, + "step": 1568 + }, + { + "epoch": 5.372351214304777, + "grad_norm": 4.936699867248535, + "learning_rate": 1.3749795321332887e-06, + "loss": 41.898, + "step": 1570 + }, + { + "epoch": 5.379183346677342, + "grad_norm": 6.228104114532471, + "learning_rate": 1.3451987386612851e-06, + "loss": 41.3327, + "step": 1572 + }, + { + "epoch": 5.386015479049907, + "grad_norm": 3.9607808589935303, + "learning_rate": 1.3157350800529878e-06, + "loss": 39.3806, + "step": 1574 + }, + { + "epoch": 5.3928476114224715, + "grad_norm": 3.2485790252685547, + "learning_rate": 1.286588951321363e-06, + "loss": 39.292, + "step": 1576 + }, + { + "epoch": 5.399679743795036, + "grad_norm": 4.702234745025635, + "learning_rate": 1.2577607432223276e-06, + "loss": 40.3127, + "step": 1578 + }, + { + "epoch": 5.406511876167601, + "grad_norm": 4.465649127960205, + "learning_rate": 1.2292508422495158e-06, + "loss": 41.7889, + "step": 1580 + }, + { + "epoch": 5.413344008540165, + "grad_norm": 4.618641376495361, + "learning_rate": 1.2010596306290589e-06, + "loss": 41.2257, + "step": 1582 + }, + { + "epoch": 5.42017614091273, + "grad_norm": 4.093713283538818, + "learning_rate": 1.1731874863145143e-06, + "loss": 41.7067, + "step": 1584 + }, + { + "epoch": 5.427008273285295, + "grad_norm": 5.642305374145508, + "learning_rate": 1.145634782981761e-06, + "loss": 41.1947, + "step": 1586 + }, + { + "epoch": 5.433840405657859, + "grad_norm": 3.9637906551361084, + "learning_rate": 1.1184018900240011e-06, + "loss": 41.5425, + "step": 1588 + }, + { + "epoch": 5.440672538030424, + "grad_norm": 4.328593730926514, + "learning_rate": 1.0914891725468141e-06, + "loss": 41.7915, + "step": 1590 + }, + { + "epoch": 5.4475046704029895, + "grad_norm": 4.559619903564453, + "learning_rate": 1.06489699136324e-06, + "loss": 39.5462, + "step": 1592 + }, + { + "epoch": 5.454336802775554, + "grad_norm": 4.174973011016846, + "learning_rate": 1.0386257029889768e-06, + "loss": 40.6458, + "step": 1594 + }, + { + "epoch": 5.461168935148119, + "grad_norm": 3.249431610107422, + "learning_rate": 1.0126756596375686e-06, + "loss": 41.4128, + "step": 1596 + }, + { + "epoch": 5.468001067520683, + "grad_norm": 4.598479747772217, + "learning_rate": 9.87047209215694e-07, + "loss": 41.7854, + "step": 1598 + }, + { + "epoch": 5.474833199893248, + "grad_norm": 3.558709144592285, + "learning_rate": 9.617406953185138e-07, + "loss": 41.9632, + "step": 1600 + }, + { + "epoch": 5.474833199893248, + "eval_loss": 0.6698766350746155, + "eval_runtime": 133.9539, + "eval_samples_per_second": 29.45, + "eval_steps_per_second": 7.368, + "step": 1600 + }, + { + "epoch": 5.481665332265813, + "grad_norm": 5.397751331329346, + "learning_rate": 9.36756457225052e-07, + "loss": 40.2635, + "step": 1602 + }, + { + "epoch": 5.488497464638377, + "grad_norm": 5.443418502807617, + "learning_rate": 9.120948298936421e-07, + "loss": 40.6923, + "step": 1604 + }, + { + "epoch": 5.495329597010942, + "grad_norm": 3.991673707962036, + "learning_rate": 8.87756143957455e-07, + "loss": 40.0543, + "step": 1606 + }, + { + "epoch": 5.502161729383507, + "grad_norm": 4.649523735046387, + "learning_rate": 8.637407257200497e-07, + "loss": 41.3534, + "step": 1608 + }, + { + "epoch": 5.508993861756071, + "grad_norm": 4.675793170928955, + "learning_rate": 8.400488971509968e-07, + "loss": 39.8315, + "step": 1610 + }, + { + "epoch": 5.515825994128637, + "grad_norm": 3.273359775543213, + "learning_rate": 8.166809758815896e-07, + "loss": 39.9979, + "step": 1612 + }, + { + "epoch": 5.5226581265012005, + "grad_norm": 4.165469169616699, + "learning_rate": 7.936372752005399e-07, + "loss": 39.3362, + "step": 1614 + }, + { + "epoch": 5.529490258873766, + "grad_norm": 4.015806674957275, + "learning_rate": 7.709181040498254e-07, + "loss": 40.7772, + "step": 1616 + }, + { + "epoch": 5.536322391246331, + "grad_norm": 6.13747501373291, + "learning_rate": 7.485237670205175e-07, + "loss": 40.8463, + "step": 1618 + }, + { + "epoch": 5.543154523618895, + "grad_norm": 3.6014761924743652, + "learning_rate": 7.264545643486997e-07, + "loss": 40.231, + "step": 1620 + }, + { + "epoch": 5.54998665599146, + "grad_norm": 4.055222034454346, + "learning_rate": 7.047107919114588e-07, + "loss": 42.5435, + "step": 1622 + }, + { + "epoch": 5.5568187883640245, + "grad_norm": 5.444411277770996, + "learning_rate": 6.832927412229018e-07, + "loss": 41.0914, + "step": 1624 + }, + { + "epoch": 5.563650920736589, + "grad_norm": 3.4832520484924316, + "learning_rate": 6.622006994302543e-07, + "loss": 42.297, + "step": 1626 + }, + { + "epoch": 5.570483053109154, + "grad_norm": 5.123753547668457, + "learning_rate": 6.41434949310013e-07, + "loss": 40.4283, + "step": 1628 + }, + { + "epoch": 5.5773151854817185, + "grad_norm": 5.2065277099609375, + "learning_rate": 6.209957692641544e-07, + "loss": 40.5581, + "step": 1630 + }, + { + "epoch": 5.584147317854283, + "grad_norm": 4.573667049407959, + "learning_rate": 6.008834333163876e-07, + "loss": 39.4126, + "step": 1632 + }, + { + "epoch": 5.590979450226849, + "grad_norm": 5.208593368530273, + "learning_rate": 5.810982111085106e-07, + "loss": 40.7202, + "step": 1634 + }, + { + "epoch": 5.597811582599413, + "grad_norm": 4.341737747192383, + "learning_rate": 5.616403678967624e-07, + "loss": 40.9683, + "step": 1636 + }, + { + "epoch": 5.604643714971978, + "grad_norm": 4.836015701293945, + "learning_rate": 5.42510164548285e-07, + "loss": 40.4273, + "step": 1638 + }, + { + "epoch": 5.6114758473445425, + "grad_norm": 4.308472633361816, + "learning_rate": 5.237078575376336e-07, + "loss": 41.0492, + "step": 1640 + }, + { + "epoch": 5.618307979717107, + "grad_norm": 4.316090106964111, + "learning_rate": 5.052336989433082e-07, + "loss": 40.6806, + "step": 1642 + }, + { + "epoch": 5.625140112089672, + "grad_norm": 3.6825830936431885, + "learning_rate": 4.870879364444109e-07, + "loss": 40.5467, + "step": 1644 + }, + { + "epoch": 5.631972244462236, + "grad_norm": 5.199794769287109, + "learning_rate": 4.692708133172991e-07, + "loss": 39.4587, + "step": 1646 + }, + { + "epoch": 5.638804376834801, + "grad_norm": 3.3388471603393555, + "learning_rate": 4.517825684323324e-07, + "loss": 39.1098, + "step": 1648 + }, + { + "epoch": 5.645636509207366, + "grad_norm": 4.200729846954346, + "learning_rate": 4.346234362506724e-07, + "loss": 40.122, + "step": 1650 + }, + { + "epoch": 5.645636509207366, + "eval_loss": 0.6662212014198303, + "eval_runtime": 137.6293, + "eval_samples_per_second": 28.664, + "eval_steps_per_second": 7.171, + "step": 1650 + }, + { + "epoch": 5.65246864157993, + "grad_norm": 3.9246127605438232, + "learning_rate": 4.1779364682113796e-07, + "loss": 40.0725, + "step": 1652 + }, + { + "epoch": 5.659300773952495, + "grad_norm": 4.904084205627441, + "learning_rate": 4.012934257771134e-07, + "loss": 40.0188, + "step": 1654 + }, + { + "epoch": 5.6661329063250605, + "grad_norm": 4.436688423156738, + "learning_rate": 3.851229943335394e-07, + "loss": 39.9216, + "step": 1656 + }, + { + "epoch": 5.672965038697625, + "grad_norm": 4.027088642120361, + "learning_rate": 3.6928256928393247e-07, + "loss": 41.4124, + "step": 1658 + }, + { + "epoch": 5.67979717107019, + "grad_norm": 3.796221971511841, + "learning_rate": 3.537723629974815e-07, + "loss": 39.8851, + "step": 1660 + }, + { + "epoch": 5.686629303442754, + "grad_norm": 4.7540130615234375, + "learning_rate": 3.3859258341621125e-07, + "loss": 40.1716, + "step": 1662 + }, + { + "epoch": 5.693461435815319, + "grad_norm": 4.521333694458008, + "learning_rate": 3.237434340521789e-07, + "loss": 41.4182, + "step": 1664 + }, + { + "epoch": 5.700293568187884, + "grad_norm": 4.776477336883545, + "learning_rate": 3.0922511398475683e-07, + "loss": 41.2698, + "step": 1666 + }, + { + "epoch": 5.707125700560448, + "grad_norm": 4.749114990234375, + "learning_rate": 2.9503781785795713e-07, + "loss": 42.4175, + "step": 1668 + }, + { + "epoch": 5.713957832933013, + "grad_norm": 4.831925392150879, + "learning_rate": 2.8118173587782516e-07, + "loss": 40.593, + "step": 1670 + }, + { + "epoch": 5.720789965305578, + "grad_norm": 4.17523193359375, + "learning_rate": 2.6765705380989437e-07, + "loss": 39.8755, + "step": 1672 + }, + { + "epoch": 5.727622097678142, + "grad_norm": 4.183824062347412, + "learning_rate": 2.544639529766829e-07, + "loss": 40.7682, + "step": 1674 + }, + { + "epoch": 5.734454230050707, + "grad_norm": 4.203549385070801, + "learning_rate": 2.416026102552732e-07, + "loss": 40.1932, + "step": 1676 + }, + { + "epoch": 5.741286362423272, + "grad_norm": 4.252909183502197, + "learning_rate": 2.290731980749361e-07, + "loss": 41.4024, + "step": 1678 + }, + { + "epoch": 5.748118494795837, + "grad_norm": 4.110680103302002, + "learning_rate": 2.168758844148272e-07, + "loss": 40.8089, + "step": 1680 + }, + { + "epoch": 5.754950627168402, + "grad_norm": 4.860687732696533, + "learning_rate": 2.050108328017164e-07, + "loss": 41.278, + "step": 1682 + }, + { + "epoch": 5.761782759540966, + "grad_norm": 7.037466526031494, + "learning_rate": 1.93478202307823e-07, + "loss": 42.0162, + "step": 1684 + }, + { + "epoch": 5.768614891913531, + "grad_norm": 4.048498630523682, + "learning_rate": 1.8227814754865068e-07, + "loss": 41.2187, + "step": 1686 + }, + { + "epoch": 5.775447024286096, + "grad_norm": 3.721379518508911, + "learning_rate": 1.7141081868094212e-07, + "loss": 41.8383, + "step": 1688 + }, + { + "epoch": 5.78227915665866, + "grad_norm": 6.793107509613037, + "learning_rate": 1.6087636140065532e-07, + "loss": 40.5894, + "step": 1690 + }, + { + "epoch": 5.789111289031225, + "grad_norm": 4.424513339996338, + "learning_rate": 1.5067491694100154e-07, + "loss": 41.2666, + "step": 1692 + }, + { + "epoch": 5.7959434214037895, + "grad_norm": 4.707203388214111, + "learning_rate": 1.4080662207056894e-07, + "loss": 41.2405, + "step": 1694 + }, + { + "epoch": 5.802775553776354, + "grad_norm": 2.994469165802002, + "learning_rate": 1.3127160909147672e-07, + "loss": 42.6466, + "step": 1696 + }, + { + "epoch": 5.809607686148919, + "grad_norm": 3.029481887817383, + "learning_rate": 1.220700058376073e-07, + "loss": 40.642, + "step": 1698 + }, + { + "epoch": 5.816439818521484, + "grad_norm": 3.4690332412719727, + "learning_rate": 1.1320193567288529e-07, + "loss": 41.02, + "step": 1700 + }, + { + "epoch": 5.816439818521484, + "eval_loss": 0.6652334928512573, + "eval_runtime": 134.4616, + "eval_samples_per_second": 29.339, + "eval_steps_per_second": 7.34, + "step": 1700 + }, + { + "epoch": 5.823271950894049, + "grad_norm": 5.008721828460693, + "learning_rate": 1.0466751748963444e-07, + "loss": 40.1855, + "step": 1702 + }, + { + "epoch": 5.830104083266614, + "grad_norm": 5.638387680053711, + "learning_rate": 9.646686570697061e-08, + "loss": 40.6194, + "step": 1704 + }, + { + "epoch": 5.836936215639178, + "grad_norm": 5.234898567199707, + "learning_rate": 8.860009026928629e-08, + "loss": 40.6608, + "step": 1706 + }, + { + "epoch": 5.843768348011743, + "grad_norm": 4.212846279144287, + "learning_rate": 8.106729664475176e-08, + "loss": 41.4097, + "step": 1708 + }, + { + "epoch": 5.8506004803843075, + "grad_norm": 3.5884008407592773, + "learning_rate": 7.386858582392187e-08, + "loss": 39.4515, + "step": 1710 + }, + { + "epoch": 5.857432612756872, + "grad_norm": 4.441662788391113, + "learning_rate": 6.700405431837587e-08, + "loss": 41.8026, + "step": 1712 + }, + { + "epoch": 5.864264745129437, + "grad_norm": 5.290170192718506, + "learning_rate": 6.047379415941856e-08, + "loss": 40.8839, + "step": 1714 + }, + { + "epoch": 5.871096877502001, + "grad_norm": 3.4507861137390137, + "learning_rate": 5.4277892896853476e-08, + "loss": 40.574, + "step": 1716 + }, + { + "epoch": 5.877929009874566, + "grad_norm": 3.869871139526367, + "learning_rate": 4.8416433597803234e-08, + "loss": 41.8288, + "step": 1718 + }, + { + "epoch": 5.884761142247131, + "grad_norm": 4.644185543060303, + "learning_rate": 4.2889494845599344e-08, + "loss": 41.318, + "step": 1720 + }, + { + "epoch": 5.891593274619696, + "grad_norm": 3.191018581390381, + "learning_rate": 3.769715073872748e-08, + "loss": 41.1112, + "step": 1722 + }, + { + "epoch": 5.898425406992261, + "grad_norm": 3.394134998321533, + "learning_rate": 3.283947088983663e-08, + "loss": 41.9932, + "step": 1724 + }, + { + "epoch": 5.9052575393648254, + "grad_norm": 4.62444543838501, + "learning_rate": 2.831652042480093e-08, + "loss": 39.9583, + "step": 1726 + }, + { + "epoch": 5.91208967173739, + "grad_norm": 4.27966833114624, + "learning_rate": 2.4128359981850924e-08, + "loss": 39.915, + "step": 1728 + }, + { + "epoch": 5.918921804109955, + "grad_norm": 3.7036333084106445, + "learning_rate": 2.0275045710760334e-08, + "loss": 40.0384, + "step": 1730 + }, + { + "epoch": 5.925753936482519, + "grad_norm": 5.249677658081055, + "learning_rate": 1.6756629272085545e-08, + "loss": 40.1564, + "step": 1732 + }, + { + "epoch": 5.932586068855084, + "grad_norm": 4.477707862854004, + "learning_rate": 1.3573157836485606e-08, + "loss": 40.6008, + "step": 1734 + }, + { + "epoch": 5.939418201227649, + "grad_norm": 4.939481258392334, + "learning_rate": 1.0724674084083841e-08, + "loss": 40.9639, + "step": 1736 + }, + { + "epoch": 5.946250333600213, + "grad_norm": 2.9428999423980713, + "learning_rate": 8.211216203890537e-09, + "loss": 40.9722, + "step": 1738 + }, + { + "epoch": 5.953082465972778, + "grad_norm": 4.589330673217773, + "learning_rate": 6.032817893297793e-09, + "loss": 41.4832, + "step": 1740 + }, + { + "epoch": 5.9599145983453425, + "grad_norm": 5.4429450035095215, + "learning_rate": 4.1895083576271035e-09, + "loss": 41.8059, + "step": 1742 + }, + { + "epoch": 5.966746730717908, + "grad_norm": 3.5152432918548584, + "learning_rate": 2.681312309735229e-09, + "loss": 41.2228, + "step": 1744 + }, + { + "epoch": 5.973578863090473, + "grad_norm": 4.573424339294434, + "learning_rate": 1.5082499696839059e-09, + "loss": 41.9849, + "step": 1746 + }, + { + "epoch": 5.980410995463037, + "grad_norm": 4.099581718444824, + "learning_rate": 6.703370644706164e-10, + "loss": 40.6948, + "step": 1748 + }, + { + "epoch": 5.987243127835602, + "grad_norm": 4.090056896209717, + "learning_rate": 1.6758482781209507e-10, + "loss": 40.9226, + "step": 1750 + }, + { + "epoch": 5.987243127835602, + "eval_loss": 0.6658891439437866, + "eval_runtime": 134.1369, + "eval_samples_per_second": 29.41, + "eval_steps_per_second": 7.358, + "step": 1750 + }, + { + "epoch": 5.994075260208167, + "grad_norm": 4.494061470031738, + "learning_rate": 0.0, + "loss": 41.0993, + "step": 1752 + } + ], + "logging_steps": 2, + "max_steps": 1752, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 50, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 3, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 1 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.616163439072248e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}