diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,35490 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9991116375481197, + "eval_steps": 500, + "global_step": 5064, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005922416345869114, + "grad_norm": 18.717661840154292, + "learning_rate": 1.3157894736842107e-07, + "loss": 1.1434, + "step": 1 + }, + { + "epoch": 0.0011844832691738228, + "grad_norm": 15.975211698366682, + "learning_rate": 2.6315789473684213e-07, + "loss": 1.1722, + "step": 2 + }, + { + "epoch": 0.0017767249037607344, + "grad_norm": 13.436630281888167, + "learning_rate": 3.9473684210526315e-07, + "loss": 1.2216, + "step": 3 + }, + { + "epoch": 0.0023689665383476457, + "grad_norm": 14.141974273595771, + "learning_rate": 5.263157894736843e-07, + "loss": 1.2084, + "step": 4 + }, + { + "epoch": 0.002961208172934557, + "grad_norm": 12.817546549612011, + "learning_rate": 6.578947368421053e-07, + "loss": 1.1613, + "step": 5 + }, + { + "epoch": 0.0035534498075214687, + "grad_norm": 15.101738254046671, + "learning_rate": 7.894736842105263e-07, + "loss": 1.2291, + "step": 6 + }, + { + "epoch": 0.00414569144210838, + "grad_norm": 21.872300636479174, + "learning_rate": 9.210526315789474e-07, + "loss": 1.2358, + "step": 7 + }, + { + "epoch": 0.004737933076695291, + "grad_norm": 18.219829859695743, + "learning_rate": 1.0526315789473685e-06, + "loss": 1.2111, + "step": 8 + }, + { + "epoch": 0.005330174711282203, + "grad_norm": 11.651800626605043, + "learning_rate": 1.1842105263157894e-06, + "loss": 1.1699, + "step": 9 + }, + { + "epoch": 0.005922416345869114, + "grad_norm": 14.428397731094817, + "learning_rate": 1.3157894736842106e-06, + "loss": 1.2005, + "step": 10 + }, + { + "epoch": 0.006514657980456026, + "grad_norm": 11.505996928681398, + "learning_rate": 1.4473684210526317e-06, + "loss": 1.1428, + "step": 11 + }, + { + "epoch": 0.0071068996150429374, + "grad_norm": 9.258825562856831, + "learning_rate": 1.5789473684210526e-06, + "loss": 1.1801, + "step": 12 + }, + { + "epoch": 0.007699141249629849, + "grad_norm": 11.003890134146193, + "learning_rate": 1.710526315789474e-06, + "loss": 1.1603, + "step": 13 + }, + { + "epoch": 0.00829138288421676, + "grad_norm": 8.53365801142655, + "learning_rate": 1.8421052631578948e-06, + "loss": 1.1299, + "step": 14 + }, + { + "epoch": 0.008883624518803672, + "grad_norm": 9.898879357555042, + "learning_rate": 1.973684210526316e-06, + "loss": 1.1691, + "step": 15 + }, + { + "epoch": 0.009475866153390583, + "grad_norm": 10.549302142463828, + "learning_rate": 2.105263157894737e-06, + "loss": 1.1276, + "step": 16 + }, + { + "epoch": 0.010068107787977496, + "grad_norm": 13.735637524962215, + "learning_rate": 2.236842105263158e-06, + "loss": 1.0958, + "step": 17 + }, + { + "epoch": 0.010660349422564407, + "grad_norm": 9.075762665514619, + "learning_rate": 2.368421052631579e-06, + "loss": 1.1352, + "step": 18 + }, + { + "epoch": 0.011252591057151318, + "grad_norm": 8.125019295709604, + "learning_rate": 2.5e-06, + "loss": 1.0965, + "step": 19 + }, + { + "epoch": 0.011844832691738229, + "grad_norm": 6.28753990985354, + "learning_rate": 2.631578947368421e-06, + "loss": 1.0706, + "step": 20 + }, + { + "epoch": 0.01243707432632514, + "grad_norm": 9.479465602728467, + "learning_rate": 2.7631578947368424e-06, + "loss": 1.1351, + "step": 21 + }, + { + "epoch": 0.013029315960912053, + "grad_norm": 10.298865353341801, + "learning_rate": 2.8947368421052634e-06, + "loss": 1.1403, + "step": 22 + }, + { + "epoch": 0.013621557595498964, + "grad_norm": 37.987752673055155, + "learning_rate": 3.0263157894736843e-06, + "loss": 1.0901, + "step": 23 + }, + { + "epoch": 0.014213799230085875, + "grad_norm": 9.457834823064632, + "learning_rate": 3.157894736842105e-06, + "loss": 1.1241, + "step": 24 + }, + { + "epoch": 0.014806040864672786, + "grad_norm": 6.66281436022819, + "learning_rate": 3.289473684210527e-06, + "loss": 1.0351, + "step": 25 + }, + { + "epoch": 0.015398282499259699, + "grad_norm": 6.610062092413164, + "learning_rate": 3.421052631578948e-06, + "loss": 1.0715, + "step": 26 + }, + { + "epoch": 0.01599052413384661, + "grad_norm": 6.998988378675071, + "learning_rate": 3.5526315789473687e-06, + "loss": 1.0664, + "step": 27 + }, + { + "epoch": 0.01658276576843352, + "grad_norm": 6.559109520573562, + "learning_rate": 3.6842105263157896e-06, + "loss": 1.0371, + "step": 28 + }, + { + "epoch": 0.017175007403020432, + "grad_norm": 8.1722379764705, + "learning_rate": 3.815789473684211e-06, + "loss": 1.0594, + "step": 29 + }, + { + "epoch": 0.017767249037607343, + "grad_norm": 9.992648704415512, + "learning_rate": 3.947368421052632e-06, + "loss": 1.0802, + "step": 30 + }, + { + "epoch": 0.018359490672194254, + "grad_norm": 6.773662148667347, + "learning_rate": 4.078947368421053e-06, + "loss": 0.9767, + "step": 31 + }, + { + "epoch": 0.018951732306781165, + "grad_norm": 5.160580354167821, + "learning_rate": 4.210526315789474e-06, + "loss": 1.0125, + "step": 32 + }, + { + "epoch": 0.019543973941368076, + "grad_norm": 7.601356319162516, + "learning_rate": 4.342105263157895e-06, + "loss": 1.0109, + "step": 33 + }, + { + "epoch": 0.02013621557595499, + "grad_norm": 6.835933036484314, + "learning_rate": 4.473684210526316e-06, + "loss": 1.0355, + "step": 34 + }, + { + "epoch": 0.020728457210541902, + "grad_norm": 14.435112479372032, + "learning_rate": 4.605263157894737e-06, + "loss": 1.0154, + "step": 35 + }, + { + "epoch": 0.021320698845128813, + "grad_norm": 7.829860179549529, + "learning_rate": 4.736842105263158e-06, + "loss": 1.04, + "step": 36 + }, + { + "epoch": 0.021912940479715724, + "grad_norm": 7.169740079807402, + "learning_rate": 4.8684210526315795e-06, + "loss": 0.9694, + "step": 37 + }, + { + "epoch": 0.022505182114302635, + "grad_norm": 8.83069550799535, + "learning_rate": 5e-06, + "loss": 0.9532, + "step": 38 + }, + { + "epoch": 0.023097423748889546, + "grad_norm": 9.131263259747842, + "learning_rate": 5.131578947368422e-06, + "loss": 0.9806, + "step": 39 + }, + { + "epoch": 0.023689665383476458, + "grad_norm": 10.809549787669706, + "learning_rate": 5.263157894736842e-06, + "loss": 1.0129, + "step": 40 + }, + { + "epoch": 0.02428190701806337, + "grad_norm": 8.804492257877177, + "learning_rate": 5.394736842105264e-06, + "loss": 1.0173, + "step": 41 + }, + { + "epoch": 0.02487414865265028, + "grad_norm": 20.71245515215828, + "learning_rate": 5.526315789473685e-06, + "loss": 0.9932, + "step": 42 + }, + { + "epoch": 0.025466390287237194, + "grad_norm": 9.655846535170545, + "learning_rate": 5.657894736842106e-06, + "loss": 0.9879, + "step": 43 + }, + { + "epoch": 0.026058631921824105, + "grad_norm": 6.537124531460736, + "learning_rate": 5.789473684210527e-06, + "loss": 1.0188, + "step": 44 + }, + { + "epoch": 0.026650873556411016, + "grad_norm": 13.091149382863613, + "learning_rate": 5.921052631578948e-06, + "loss": 0.9964, + "step": 45 + }, + { + "epoch": 0.027243115190997928, + "grad_norm": 8.006244083133073, + "learning_rate": 6.0526315789473685e-06, + "loss": 0.9726, + "step": 46 + }, + { + "epoch": 0.02783535682558484, + "grad_norm": 7.668734764475963, + "learning_rate": 6.18421052631579e-06, + "loss": 1.0153, + "step": 47 + }, + { + "epoch": 0.02842759846017175, + "grad_norm": 8.137447421123358, + "learning_rate": 6.31578947368421e-06, + "loss": 1.0091, + "step": 48 + }, + { + "epoch": 0.02901984009475866, + "grad_norm": 7.606771295715497, + "learning_rate": 6.447368421052632e-06, + "loss": 0.9991, + "step": 49 + }, + { + "epoch": 0.029612081729345572, + "grad_norm": 7.345858891479215, + "learning_rate": 6.578947368421054e-06, + "loss": 0.9509, + "step": 50 + }, + { + "epoch": 0.030204323363932483, + "grad_norm": 7.643466506554061, + "learning_rate": 6.710526315789474e-06, + "loss": 0.9572, + "step": 51 + }, + { + "epoch": 0.030796564998519398, + "grad_norm": 13.10135554836337, + "learning_rate": 6.842105263157896e-06, + "loss": 0.9797, + "step": 52 + }, + { + "epoch": 0.03138880663310631, + "grad_norm": 63.002320155093024, + "learning_rate": 6.973684210526316e-06, + "loss": 0.9633, + "step": 53 + }, + { + "epoch": 0.03198104826769322, + "grad_norm": 6.071766418833052, + "learning_rate": 7.1052631578947375e-06, + "loss": 0.9146, + "step": 54 + }, + { + "epoch": 0.03257328990228013, + "grad_norm": 15.636734262160939, + "learning_rate": 7.236842105263158e-06, + "loss": 1.0414, + "step": 55 + }, + { + "epoch": 0.03316553153686704, + "grad_norm": 22.37892911472697, + "learning_rate": 7.368421052631579e-06, + "loss": 0.9515, + "step": 56 + }, + { + "epoch": 0.03375777317145395, + "grad_norm": 11.914023755244859, + "learning_rate": 7.500000000000001e-06, + "loss": 0.9538, + "step": 57 + }, + { + "epoch": 0.034350014806040864, + "grad_norm": 8.586686698837346, + "learning_rate": 7.631578947368423e-06, + "loss": 0.9835, + "step": 58 + }, + { + "epoch": 0.034942256440627775, + "grad_norm": 27.707875718601368, + "learning_rate": 7.763157894736843e-06, + "loss": 1.0073, + "step": 59 + }, + { + "epoch": 0.035534498075214686, + "grad_norm": 24.521633695176586, + "learning_rate": 7.894736842105265e-06, + "loss": 0.961, + "step": 60 + }, + { + "epoch": 0.0361267397098016, + "grad_norm": 100.54943552048488, + "learning_rate": 8.026315789473685e-06, + "loss": 1.0181, + "step": 61 + }, + { + "epoch": 0.03671898134438851, + "grad_norm": 9.987380890435652, + "learning_rate": 8.157894736842106e-06, + "loss": 0.9711, + "step": 62 + }, + { + "epoch": 0.03731122297897542, + "grad_norm": 9.222884295954952, + "learning_rate": 8.289473684210526e-06, + "loss": 0.9489, + "step": 63 + }, + { + "epoch": 0.03790346461356233, + "grad_norm": 19.82352747253693, + "learning_rate": 8.421052631578948e-06, + "loss": 0.9661, + "step": 64 + }, + { + "epoch": 0.03849570624814924, + "grad_norm": 8.36286785237061, + "learning_rate": 8.552631578947368e-06, + "loss": 0.9554, + "step": 65 + }, + { + "epoch": 0.03908794788273615, + "grad_norm": 14.58438374105295, + "learning_rate": 8.68421052631579e-06, + "loss": 0.9321, + "step": 66 + }, + { + "epoch": 0.03968018951732307, + "grad_norm": 12.929431130450986, + "learning_rate": 8.81578947368421e-06, + "loss": 0.9036, + "step": 67 + }, + { + "epoch": 0.04027243115190998, + "grad_norm": 16.089145436097088, + "learning_rate": 8.947368421052632e-06, + "loss": 0.9221, + "step": 68 + }, + { + "epoch": 0.04086467278649689, + "grad_norm": 9.617533163896221, + "learning_rate": 9.078947368421054e-06, + "loss": 0.9167, + "step": 69 + }, + { + "epoch": 0.041456914421083804, + "grad_norm": 7.694991032785801, + "learning_rate": 9.210526315789474e-06, + "loss": 0.9336, + "step": 70 + }, + { + "epoch": 0.042049156055670715, + "grad_norm": 14.08369539185404, + "learning_rate": 9.342105263157895e-06, + "loss": 0.9538, + "step": 71 + }, + { + "epoch": 0.042641397690257626, + "grad_norm": 7.803725255120761, + "learning_rate": 9.473684210526315e-06, + "loss": 0.9534, + "step": 72 + }, + { + "epoch": 0.04323363932484454, + "grad_norm": 10.048093652946386, + "learning_rate": 9.605263157894737e-06, + "loss": 1.0111, + "step": 73 + }, + { + "epoch": 0.04382588095943145, + "grad_norm": 8.06770139647209, + "learning_rate": 9.736842105263159e-06, + "loss": 1.0268, + "step": 74 + }, + { + "epoch": 0.04441812259401836, + "grad_norm": 26.383451642079425, + "learning_rate": 9.868421052631579e-06, + "loss": 0.9725, + "step": 75 + }, + { + "epoch": 0.04501036422860527, + "grad_norm": 10.64378030761482, + "learning_rate": 1e-05, + "loss": 0.9351, + "step": 76 + }, + { + "epoch": 0.04560260586319218, + "grad_norm": 17.542208054609976, + "learning_rate": 1.0131578947368421e-05, + "loss": 0.9737, + "step": 77 + }, + { + "epoch": 0.04619484749777909, + "grad_norm": 4.904257105145783, + "learning_rate": 1.0263157894736844e-05, + "loss": 0.9626, + "step": 78 + }, + { + "epoch": 0.046787089132366004, + "grad_norm": 11.772948651444537, + "learning_rate": 1.0394736842105264e-05, + "loss": 0.9473, + "step": 79 + }, + { + "epoch": 0.047379330766952915, + "grad_norm": 5.711100010911588, + "learning_rate": 1.0526315789473684e-05, + "loss": 0.9313, + "step": 80 + }, + { + "epoch": 0.047971572401539826, + "grad_norm": 17.763080812645768, + "learning_rate": 1.0657894736842108e-05, + "loss": 0.9619, + "step": 81 + }, + { + "epoch": 0.04856381403612674, + "grad_norm": 48.7479319965714, + "learning_rate": 1.0789473684210528e-05, + "loss": 0.9389, + "step": 82 + }, + { + "epoch": 0.04915605567071365, + "grad_norm": 12.652307160279797, + "learning_rate": 1.0921052631578948e-05, + "loss": 0.9623, + "step": 83 + }, + { + "epoch": 0.04974829730530056, + "grad_norm": 5.463588308594221, + "learning_rate": 1.105263157894737e-05, + "loss": 0.9464, + "step": 84 + }, + { + "epoch": 0.05034053893988748, + "grad_norm": 24.19528693762655, + "learning_rate": 1.1184210526315792e-05, + "loss": 0.8953, + "step": 85 + }, + { + "epoch": 0.05093278057447439, + "grad_norm": 6.29193429817291, + "learning_rate": 1.1315789473684212e-05, + "loss": 0.9005, + "step": 86 + }, + { + "epoch": 0.0515250222090613, + "grad_norm": 13.36573014121912, + "learning_rate": 1.1447368421052632e-05, + "loss": 0.9075, + "step": 87 + }, + { + "epoch": 0.05211726384364821, + "grad_norm": 10.377086501174952, + "learning_rate": 1.1578947368421053e-05, + "loss": 0.9572, + "step": 88 + }, + { + "epoch": 0.05270950547823512, + "grad_norm": 8.69662787671184, + "learning_rate": 1.1710526315789475e-05, + "loss": 0.934, + "step": 89 + }, + { + "epoch": 0.05330174711282203, + "grad_norm": 12.914962830795819, + "learning_rate": 1.1842105263157895e-05, + "loss": 0.9381, + "step": 90 + }, + { + "epoch": 0.053893988747408944, + "grad_norm": 5.960162225544222, + "learning_rate": 1.1973684210526317e-05, + "loss": 0.8615, + "step": 91 + }, + { + "epoch": 0.054486230381995855, + "grad_norm": 14.528434970541078, + "learning_rate": 1.2105263157894737e-05, + "loss": 0.9648, + "step": 92 + }, + { + "epoch": 0.055078472016582766, + "grad_norm": 10.384675365673601, + "learning_rate": 1.2236842105263159e-05, + "loss": 0.912, + "step": 93 + }, + { + "epoch": 0.05567071365116968, + "grad_norm": 7.33465251957887, + "learning_rate": 1.236842105263158e-05, + "loss": 0.9293, + "step": 94 + }, + { + "epoch": 0.05626295528575659, + "grad_norm": 4.727007339545679, + "learning_rate": 1.25e-05, + "loss": 0.9342, + "step": 95 + }, + { + "epoch": 0.0568551969203435, + "grad_norm": 5.324071485836242, + "learning_rate": 1.263157894736842e-05, + "loss": 0.9379, + "step": 96 + }, + { + "epoch": 0.05744743855493041, + "grad_norm": 7.490618326236765, + "learning_rate": 1.2763157894736844e-05, + "loss": 0.8976, + "step": 97 + }, + { + "epoch": 0.05803968018951732, + "grad_norm": 10.570705779798235, + "learning_rate": 1.2894736842105264e-05, + "loss": 0.9339, + "step": 98 + }, + { + "epoch": 0.05863192182410423, + "grad_norm": 6.638002466312831, + "learning_rate": 1.3026315789473684e-05, + "loss": 0.9004, + "step": 99 + }, + { + "epoch": 0.059224163458691144, + "grad_norm": 6.082259049790152, + "learning_rate": 1.3157894736842108e-05, + "loss": 0.9741, + "step": 100 + }, + { + "epoch": 0.059816405093278055, + "grad_norm": 27.384068364009153, + "learning_rate": 1.3289473684210528e-05, + "loss": 0.9887, + "step": 101 + }, + { + "epoch": 0.060408646727864966, + "grad_norm": 7.414470248625567, + "learning_rate": 1.3421052631578948e-05, + "loss": 0.9548, + "step": 102 + }, + { + "epoch": 0.06100088836245188, + "grad_norm": 12.181747128869292, + "learning_rate": 1.3552631578947371e-05, + "loss": 0.9342, + "step": 103 + }, + { + "epoch": 0.061593129997038795, + "grad_norm": 18.814533980462837, + "learning_rate": 1.3684210526315791e-05, + "loss": 0.8821, + "step": 104 + }, + { + "epoch": 0.062185371631625706, + "grad_norm": 11.37830438605211, + "learning_rate": 1.3815789473684211e-05, + "loss": 0.9453, + "step": 105 + }, + { + "epoch": 0.06277761326621262, + "grad_norm": 7.700584289573716, + "learning_rate": 1.3947368421052631e-05, + "loss": 0.9137, + "step": 106 + }, + { + "epoch": 0.06336985490079952, + "grad_norm": 24.239676785791914, + "learning_rate": 1.4078947368421055e-05, + "loss": 0.9604, + "step": 107 + }, + { + "epoch": 0.06396209653538644, + "grad_norm": 15.296887074877906, + "learning_rate": 1.4210526315789475e-05, + "loss": 0.9241, + "step": 108 + }, + { + "epoch": 0.06455433816997334, + "grad_norm": 5.911563958223732, + "learning_rate": 1.4342105263157895e-05, + "loss": 0.8995, + "step": 109 + }, + { + "epoch": 0.06514657980456026, + "grad_norm": 6.472036308433493, + "learning_rate": 1.4473684210526317e-05, + "loss": 0.9123, + "step": 110 + }, + { + "epoch": 0.06573882143914717, + "grad_norm": 8.838225442941134, + "learning_rate": 1.4605263157894739e-05, + "loss": 0.866, + "step": 111 + }, + { + "epoch": 0.06633106307373408, + "grad_norm": 6.179553262811797, + "learning_rate": 1.4736842105263159e-05, + "loss": 0.9363, + "step": 112 + }, + { + "epoch": 0.06692330470832099, + "grad_norm": 8.05988074037456, + "learning_rate": 1.486842105263158e-05, + "loss": 0.8737, + "step": 113 + }, + { + "epoch": 0.0675155463429079, + "grad_norm": 8.800587055109485, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.9535, + "step": 114 + }, + { + "epoch": 0.06810778797749482, + "grad_norm": 14.297791370718492, + "learning_rate": 1.5131578947368422e-05, + "loss": 0.9063, + "step": 115 + }, + { + "epoch": 0.06870002961208173, + "grad_norm": 12.532931475700789, + "learning_rate": 1.5263157894736846e-05, + "loss": 0.9646, + "step": 116 + }, + { + "epoch": 0.06929227124666865, + "grad_norm": 10.961935762322105, + "learning_rate": 1.5394736842105264e-05, + "loss": 0.9074, + "step": 117 + }, + { + "epoch": 0.06988451288125555, + "grad_norm": 16.197406867612408, + "learning_rate": 1.5526315789473686e-05, + "loss": 0.9213, + "step": 118 + }, + { + "epoch": 0.07047675451584247, + "grad_norm": 4.618325199175887, + "learning_rate": 1.5657894736842107e-05, + "loss": 0.9723, + "step": 119 + }, + { + "epoch": 0.07106899615042937, + "grad_norm": 12.33412742358314, + "learning_rate": 1.578947368421053e-05, + "loss": 0.9582, + "step": 120 + }, + { + "epoch": 0.07166123778501629, + "grad_norm": 8.051122431463254, + "learning_rate": 1.5921052631578948e-05, + "loss": 0.9051, + "step": 121 + }, + { + "epoch": 0.0722534794196032, + "grad_norm": 247.48175705261465, + "learning_rate": 1.605263157894737e-05, + "loss": 0.9365, + "step": 122 + }, + { + "epoch": 0.07284572105419011, + "grad_norm": 4.117202388731062, + "learning_rate": 1.618421052631579e-05, + "loss": 0.919, + "step": 123 + }, + { + "epoch": 0.07343796268877702, + "grad_norm": 4.4180789641519755, + "learning_rate": 1.6315789473684213e-05, + "loss": 0.9452, + "step": 124 + }, + { + "epoch": 0.07403020432336394, + "grad_norm": 10.408859163405296, + "learning_rate": 1.644736842105263e-05, + "loss": 0.9171, + "step": 125 + }, + { + "epoch": 0.07462244595795084, + "grad_norm": 5.812635898712064, + "learning_rate": 1.6578947368421053e-05, + "loss": 0.9144, + "step": 126 + }, + { + "epoch": 0.07521468759253776, + "grad_norm": 16.681339227878798, + "learning_rate": 1.6710526315789475e-05, + "loss": 0.9183, + "step": 127 + }, + { + "epoch": 0.07580692922712466, + "grad_norm": 5.4477969668787445, + "learning_rate": 1.6842105263157896e-05, + "loss": 0.9358, + "step": 128 + }, + { + "epoch": 0.07639917086171158, + "grad_norm": 6.20964288619415, + "learning_rate": 1.6973684210526318e-05, + "loss": 0.9687, + "step": 129 + }, + { + "epoch": 0.07699141249629848, + "grad_norm": 16.245213640484007, + "learning_rate": 1.7105263157894737e-05, + "loss": 0.9135, + "step": 130 + }, + { + "epoch": 0.0775836541308854, + "grad_norm": 11.924195905537667, + "learning_rate": 1.723684210526316e-05, + "loss": 0.9306, + "step": 131 + }, + { + "epoch": 0.0781758957654723, + "grad_norm": 234.49611616589422, + "learning_rate": 1.736842105263158e-05, + "loss": 0.95, + "step": 132 + }, + { + "epoch": 0.07876813740005922, + "grad_norm": 5.235845225399804, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.9042, + "step": 133 + }, + { + "epoch": 0.07936037903464614, + "grad_norm": 7.032132390324782, + "learning_rate": 1.763157894736842e-05, + "loss": 0.958, + "step": 134 + }, + { + "epoch": 0.07995262066923305, + "grad_norm": 9.064372001479308, + "learning_rate": 1.7763157894736845e-05, + "loss": 0.8924, + "step": 135 + }, + { + "epoch": 0.08054486230381996, + "grad_norm": 6.104808834461892, + "learning_rate": 1.7894736842105264e-05, + "loss": 0.9176, + "step": 136 + }, + { + "epoch": 0.08113710393840687, + "grad_norm": 8.886467300119726, + "learning_rate": 1.8026315789473685e-05, + "loss": 0.8794, + "step": 137 + }, + { + "epoch": 0.08172934557299379, + "grad_norm": 7.344989153314532, + "learning_rate": 1.8157894736842107e-05, + "loss": 0.9546, + "step": 138 + }, + { + "epoch": 0.08232158720758069, + "grad_norm": 3.97659060586214, + "learning_rate": 1.828947368421053e-05, + "loss": 0.9321, + "step": 139 + }, + { + "epoch": 0.08291382884216761, + "grad_norm": 11.103481278827003, + "learning_rate": 1.8421052631578947e-05, + "loss": 0.9627, + "step": 140 + }, + { + "epoch": 0.08350607047675451, + "grad_norm": 4.6592325848802565, + "learning_rate": 1.8552631578947373e-05, + "loss": 0.9245, + "step": 141 + }, + { + "epoch": 0.08409831211134143, + "grad_norm": 4.7813770731736245, + "learning_rate": 1.868421052631579e-05, + "loss": 0.9103, + "step": 142 + }, + { + "epoch": 0.08469055374592833, + "grad_norm": 3.923276266087163, + "learning_rate": 1.8815789473684213e-05, + "loss": 0.9362, + "step": 143 + }, + { + "epoch": 0.08528279538051525, + "grad_norm": 5.389802023966097, + "learning_rate": 1.894736842105263e-05, + "loss": 0.8964, + "step": 144 + }, + { + "epoch": 0.08587503701510216, + "grad_norm": 3.416809050980661, + "learning_rate": 1.9078947368421056e-05, + "loss": 0.9409, + "step": 145 + }, + { + "epoch": 0.08646727864968907, + "grad_norm": 7.72766710731213, + "learning_rate": 1.9210526315789474e-05, + "loss": 0.9519, + "step": 146 + }, + { + "epoch": 0.08705952028427598, + "grad_norm": 4.149089365478041, + "learning_rate": 1.9342105263157896e-05, + "loss": 0.8684, + "step": 147 + }, + { + "epoch": 0.0876517619188629, + "grad_norm": 3.6666693738461795, + "learning_rate": 1.9473684210526318e-05, + "loss": 0.9186, + "step": 148 + }, + { + "epoch": 0.0882440035534498, + "grad_norm": 4.7322097295129195, + "learning_rate": 1.960526315789474e-05, + "loss": 0.942, + "step": 149 + }, + { + "epoch": 0.08883624518803672, + "grad_norm": 4.525436493311952, + "learning_rate": 1.9736842105263158e-05, + "loss": 0.9201, + "step": 150 + }, + { + "epoch": 0.08942848682262364, + "grad_norm": 4.785414884022584, + "learning_rate": 1.986842105263158e-05, + "loss": 0.9087, + "step": 151 + }, + { + "epoch": 0.09002072845721054, + "grad_norm": 6.846600500963897, + "learning_rate": 2e-05, + "loss": 0.9397, + "step": 152 + }, + { + "epoch": 0.09061297009179746, + "grad_norm": 10.702916565838894, + "learning_rate": 1.9999997954718838e-05, + "loss": 0.9056, + "step": 153 + }, + { + "epoch": 0.09120521172638436, + "grad_norm": 5.25139552872273, + "learning_rate": 1.9999991818876183e-05, + "loss": 0.9303, + "step": 154 + }, + { + "epoch": 0.09179745336097128, + "grad_norm": 3.289991163103779, + "learning_rate": 1.999998159247455e-05, + "loss": 0.8763, + "step": 155 + }, + { + "epoch": 0.09238969499555819, + "grad_norm": 4.456341363473166, + "learning_rate": 1.9999967275518118e-05, + "loss": 0.8785, + "step": 156 + }, + { + "epoch": 0.0929819366301451, + "grad_norm": 3.149442922056867, + "learning_rate": 1.9999948868012743e-05, + "loss": 0.9021, + "step": 157 + }, + { + "epoch": 0.09357417826473201, + "grad_norm": 2.4588136993301704, + "learning_rate": 1.999992636996596e-05, + "loss": 0.8785, + "step": 158 + }, + { + "epoch": 0.09416641989931893, + "grad_norm": 4.5661756454164255, + "learning_rate": 1.9999899781386968e-05, + "loss": 0.8916, + "step": 159 + }, + { + "epoch": 0.09475866153390583, + "grad_norm": 2.30357391498451, + "learning_rate": 1.9999869102286638e-05, + "loss": 0.9262, + "step": 160 + }, + { + "epoch": 0.09535090316849275, + "grad_norm": 2.6065419794469467, + "learning_rate": 1.9999834332677534e-05, + "loss": 0.9292, + "step": 161 + }, + { + "epoch": 0.09594314480307965, + "grad_norm": 2.8765449701366164, + "learning_rate": 1.9999795472573865e-05, + "loss": 0.9565, + "step": 162 + }, + { + "epoch": 0.09653538643766657, + "grad_norm": 2.049749661177928, + "learning_rate": 1.9999752521991535e-05, + "loss": 0.934, + "step": 163 + }, + { + "epoch": 0.09712762807225347, + "grad_norm": 2.143243330636517, + "learning_rate": 1.9999705480948107e-05, + "loss": 0.9237, + "step": 164 + }, + { + "epoch": 0.09771986970684039, + "grad_norm": 4.902266517773235, + "learning_rate": 1.9999654349462828e-05, + "loss": 0.9111, + "step": 165 + }, + { + "epoch": 0.0983121113414273, + "grad_norm": 3.7313193755804397, + "learning_rate": 1.9999599127556614e-05, + "loss": 0.8766, + "step": 166 + }, + { + "epoch": 0.09890435297601421, + "grad_norm": 2.121600886528297, + "learning_rate": 1.9999539815252048e-05, + "loss": 0.8929, + "step": 167 + }, + { + "epoch": 0.09949659461060112, + "grad_norm": 3.3403412194087374, + "learning_rate": 1.99994764125734e-05, + "loss": 0.9495, + "step": 168 + }, + { + "epoch": 0.10008883624518804, + "grad_norm": 2.468640773279232, + "learning_rate": 1.99994089195466e-05, + "loss": 0.926, + "step": 169 + }, + { + "epoch": 0.10068107787977496, + "grad_norm": 6.834707950691288, + "learning_rate": 1.9999337336199257e-05, + "loss": 0.8343, + "step": 170 + }, + { + "epoch": 0.10127331951436186, + "grad_norm": 8.374475093505808, + "learning_rate": 1.9999261662560657e-05, + "loss": 0.8807, + "step": 171 + }, + { + "epoch": 0.10186556114894878, + "grad_norm": 3.7723195507143905, + "learning_rate": 1.999918189866175e-05, + "loss": 0.9351, + "step": 172 + }, + { + "epoch": 0.10245780278353568, + "grad_norm": 5.716189938070129, + "learning_rate": 1.9999098044535164e-05, + "loss": 0.9152, + "step": 173 + }, + { + "epoch": 0.1030500444181226, + "grad_norm": 9.612635045762634, + "learning_rate": 1.9999010100215202e-05, + "loss": 0.8843, + "step": 174 + }, + { + "epoch": 0.1036422860527095, + "grad_norm": 4.703911264627151, + "learning_rate": 1.999891806573784e-05, + "loss": 0.9066, + "step": 175 + }, + { + "epoch": 0.10423452768729642, + "grad_norm": 4.8130705468771335, + "learning_rate": 1.9998821941140716e-05, + "loss": 0.9073, + "step": 176 + }, + { + "epoch": 0.10482676932188333, + "grad_norm": 4.204520297054476, + "learning_rate": 1.999872172646316e-05, + "loss": 0.8713, + "step": 177 + }, + { + "epoch": 0.10541901095647024, + "grad_norm": 9.243056732431207, + "learning_rate": 1.9998617421746166e-05, + "loss": 0.899, + "step": 178 + }, + { + "epoch": 0.10601125259105715, + "grad_norm": 7.819090907308756, + "learning_rate": 1.9998509027032392e-05, + "loss": 0.9286, + "step": 179 + }, + { + "epoch": 0.10660349422564407, + "grad_norm": 3.3915991356633426, + "learning_rate": 1.9998396542366188e-05, + "loss": 0.9553, + "step": 180 + }, + { + "epoch": 0.10719573586023097, + "grad_norm": 6.875890351613609, + "learning_rate": 1.9998279967793558e-05, + "loss": 0.9107, + "step": 181 + }, + { + "epoch": 0.10778797749481789, + "grad_norm": 18.641411497663324, + "learning_rate": 1.9998159303362193e-05, + "loss": 0.9007, + "step": 182 + }, + { + "epoch": 0.10838021912940479, + "grad_norm": 2.8778263671228794, + "learning_rate": 1.9998034549121445e-05, + "loss": 0.9288, + "step": 183 + }, + { + "epoch": 0.10897246076399171, + "grad_norm": 7.460744753488814, + "learning_rate": 1.9997905705122352e-05, + "loss": 0.9221, + "step": 184 + }, + { + "epoch": 0.10956470239857861, + "grad_norm": 38.271685018077065, + "learning_rate": 1.9997772771417615e-05, + "loss": 0.8964, + "step": 185 + }, + { + "epoch": 0.11015694403316553, + "grad_norm": 5.046727664405057, + "learning_rate": 1.9997635748061615e-05, + "loss": 0.9444, + "step": 186 + }, + { + "epoch": 0.11074918566775244, + "grad_norm": 8.624881305642129, + "learning_rate": 1.99974946351104e-05, + "loss": 0.9131, + "step": 187 + }, + { + "epoch": 0.11134142730233935, + "grad_norm": 8.837284064529285, + "learning_rate": 1.999734943262169e-05, + "loss": 0.906, + "step": 188 + }, + { + "epoch": 0.11193366893692627, + "grad_norm": 10.041268687119777, + "learning_rate": 1.999720014065489e-05, + "loss": 0.901, + "step": 189 + }, + { + "epoch": 0.11252591057151318, + "grad_norm": 2.6502443902067765, + "learning_rate": 1.9997046759271055e-05, + "loss": 0.9126, + "step": 190 + }, + { + "epoch": 0.1131181522061001, + "grad_norm": 5.889543292461257, + "learning_rate": 1.999688928853294e-05, + "loss": 0.9007, + "step": 191 + }, + { + "epoch": 0.113710393840687, + "grad_norm": 3.5995531544601578, + "learning_rate": 1.999672772850495e-05, + "loss": 0.8766, + "step": 192 + }, + { + "epoch": 0.11430263547527392, + "grad_norm": 1.796533079320344, + "learning_rate": 1.9996562079253177e-05, + "loss": 0.9259, + "step": 193 + }, + { + "epoch": 0.11489487710986082, + "grad_norm": 3.417277613043803, + "learning_rate": 1.999639234084538e-05, + "loss": 0.8797, + "step": 194 + }, + { + "epoch": 0.11548711874444774, + "grad_norm": 4.546208064925828, + "learning_rate": 1.999621851335099e-05, + "loss": 0.9833, + "step": 195 + }, + { + "epoch": 0.11607936037903464, + "grad_norm": 1.8321385514505888, + "learning_rate": 1.9996040596841118e-05, + "loss": 0.8972, + "step": 196 + }, + { + "epoch": 0.11667160201362156, + "grad_norm": 4.278863948371289, + "learning_rate": 1.9995858591388532e-05, + "loss": 0.9177, + "step": 197 + }, + { + "epoch": 0.11726384364820847, + "grad_norm": 2.5684660110656488, + "learning_rate": 1.999567249706769e-05, + "loss": 0.9269, + "step": 198 + }, + { + "epoch": 0.11785608528279538, + "grad_norm": 2.6222054577372784, + "learning_rate": 1.9995482313954713e-05, + "loss": 0.9453, + "step": 199 + }, + { + "epoch": 0.11844832691738229, + "grad_norm": 2.4017263015983557, + "learning_rate": 1.9995288042127396e-05, + "loss": 0.9334, + "step": 200 + }, + { + "epoch": 0.1190405685519692, + "grad_norm": 3.4487920247999244, + "learning_rate": 1.999508968166521e-05, + "loss": 0.8751, + "step": 201 + }, + { + "epoch": 0.11963281018655611, + "grad_norm": 4.923653355915247, + "learning_rate": 1.999488723264929e-05, + "loss": 0.9157, + "step": 202 + }, + { + "epoch": 0.12022505182114303, + "grad_norm": 2.210102296979183, + "learning_rate": 1.9994680695162453e-05, + "loss": 0.9256, + "step": 203 + }, + { + "epoch": 0.12081729345572993, + "grad_norm": 3.8316888952990786, + "learning_rate": 1.999447006928918e-05, + "loss": 0.8524, + "step": 204 + }, + { + "epoch": 0.12140953509031685, + "grad_norm": 1.703827800390735, + "learning_rate": 1.999425535511564e-05, + "loss": 0.8918, + "step": 205 + }, + { + "epoch": 0.12200177672490375, + "grad_norm": 3.5473701576710623, + "learning_rate": 1.999403655272965e-05, + "loss": 0.8804, + "step": 206 + }, + { + "epoch": 0.12259401835949067, + "grad_norm": 2.2496153840955415, + "learning_rate": 1.999381366222072e-05, + "loss": 0.9551, + "step": 207 + }, + { + "epoch": 0.12318625999407759, + "grad_norm": 2.126988606794187, + "learning_rate": 1.999358668368002e-05, + "loss": 0.8835, + "step": 208 + }, + { + "epoch": 0.1237785016286645, + "grad_norm": 2.660694810075823, + "learning_rate": 1.9993355617200404e-05, + "loss": 0.8951, + "step": 209 + }, + { + "epoch": 0.12437074326325141, + "grad_norm": 4.602299174530589, + "learning_rate": 1.9993120462876385e-05, + "loss": 0.9475, + "step": 210 + }, + { + "epoch": 0.12496298489783832, + "grad_norm": 3.931184146620322, + "learning_rate": 1.9992881220804157e-05, + "loss": 0.8922, + "step": 211 + }, + { + "epoch": 0.12555522653242523, + "grad_norm": 2.6749433222616292, + "learning_rate": 1.9992637891081585e-05, + "loss": 0.9381, + "step": 212 + }, + { + "epoch": 0.12614746816701214, + "grad_norm": 3.4189957335214105, + "learning_rate": 1.9992390473808195e-05, + "loss": 0.8404, + "step": 213 + }, + { + "epoch": 0.12673970980159904, + "grad_norm": 2.1866401943466, + "learning_rate": 1.999213896908521e-05, + "loss": 0.8585, + "step": 214 + }, + { + "epoch": 0.12733195143618598, + "grad_norm": 1.6522576171627141, + "learning_rate": 1.9991883377015497e-05, + "loss": 0.8548, + "step": 215 + }, + { + "epoch": 0.12792419307077288, + "grad_norm": 2.2509610072884074, + "learning_rate": 1.9991623697703613e-05, + "loss": 0.9282, + "step": 216 + }, + { + "epoch": 0.12851643470535978, + "grad_norm": 4.028256239000191, + "learning_rate": 1.9991359931255782e-05, + "loss": 0.9042, + "step": 217 + }, + { + "epoch": 0.1291086763399467, + "grad_norm": 1.8685142977539413, + "learning_rate": 1.9991092077779895e-05, + "loss": 0.9028, + "step": 218 + }, + { + "epoch": 0.12970091797453362, + "grad_norm": 2.87573942157228, + "learning_rate": 1.9990820137385525e-05, + "loss": 0.8854, + "step": 219 + }, + { + "epoch": 0.13029315960912052, + "grad_norm": 2.4941325943289323, + "learning_rate": 1.9990544110183907e-05, + "loss": 0.9238, + "step": 220 + }, + { + "epoch": 0.13088540124370743, + "grad_norm": 3.4805013559739235, + "learning_rate": 1.999026399628795e-05, + "loss": 0.8405, + "step": 221 + }, + { + "epoch": 0.13147764287829433, + "grad_norm": 1.8670578876428667, + "learning_rate": 1.998997979581224e-05, + "loss": 0.8898, + "step": 222 + }, + { + "epoch": 0.13206988451288126, + "grad_norm": 2.2661788538769043, + "learning_rate": 1.9989691508873032e-05, + "loss": 0.9229, + "step": 223 + }, + { + "epoch": 0.13266212614746817, + "grad_norm": 2.499121551088041, + "learning_rate": 1.9989399135588246e-05, + "loss": 0.9035, + "step": 224 + }, + { + "epoch": 0.13325436778205507, + "grad_norm": 3.4358794596174334, + "learning_rate": 1.9989102676077484e-05, + "loss": 0.9143, + "step": 225 + }, + { + "epoch": 0.13384660941664198, + "grad_norm": 3.4214432168583353, + "learning_rate": 1.9988802130462017e-05, + "loss": 0.9384, + "step": 226 + }, + { + "epoch": 0.1344388510512289, + "grad_norm": 2.544662580599115, + "learning_rate": 1.9988497498864776e-05, + "loss": 0.8979, + "step": 227 + }, + { + "epoch": 0.1350310926858158, + "grad_norm": 2.087938927884032, + "learning_rate": 1.9988188781410377e-05, + "loss": 0.8918, + "step": 228 + }, + { + "epoch": 0.13562333432040272, + "grad_norm": 3.1166292965067925, + "learning_rate": 1.9987875978225107e-05, + "loss": 0.9169, + "step": 229 + }, + { + "epoch": 0.13621557595498965, + "grad_norm": 2.9358196092245383, + "learning_rate": 1.9987559089436917e-05, + "loss": 0.9254, + "step": 230 + }, + { + "epoch": 0.13680781758957655, + "grad_norm": 4.473321811652426, + "learning_rate": 1.9987238115175428e-05, + "loss": 0.9227, + "step": 231 + }, + { + "epoch": 0.13740005922416346, + "grad_norm": 3.988884973768667, + "learning_rate": 1.998691305557194e-05, + "loss": 0.8807, + "step": 232 + }, + { + "epoch": 0.13799230085875036, + "grad_norm": 3.5180020915988632, + "learning_rate": 1.9986583910759427e-05, + "loss": 0.9089, + "step": 233 + }, + { + "epoch": 0.1385845424933373, + "grad_norm": 4.311583066844405, + "learning_rate": 1.9986250680872515e-05, + "loss": 0.8793, + "step": 234 + }, + { + "epoch": 0.1391767841279242, + "grad_norm": 4.094646789970844, + "learning_rate": 1.9985913366047524e-05, + "loss": 0.8717, + "step": 235 + }, + { + "epoch": 0.1397690257625111, + "grad_norm": 5.37580338032932, + "learning_rate": 1.998557196642243e-05, + "loss": 0.9662, + "step": 236 + }, + { + "epoch": 0.140361267397098, + "grad_norm": 5.374652282579761, + "learning_rate": 1.9985226482136887e-05, + "loss": 0.9267, + "step": 237 + }, + { + "epoch": 0.14095350903168494, + "grad_norm": 8.012244025293068, + "learning_rate": 1.9984876913332215e-05, + "loss": 0.9403, + "step": 238 + }, + { + "epoch": 0.14154575066627184, + "grad_norm": 4.000814675128233, + "learning_rate": 1.998452326015141e-05, + "loss": 0.9192, + "step": 239 + }, + { + "epoch": 0.14213799230085875, + "grad_norm": 2.743822493310051, + "learning_rate": 1.9984165522739135e-05, + "loss": 0.876, + "step": 240 + }, + { + "epoch": 0.14273023393544565, + "grad_norm": 3.584538417944988, + "learning_rate": 1.9983803701241723e-05, + "loss": 0.9261, + "step": 241 + }, + { + "epoch": 0.14332247557003258, + "grad_norm": 7.9272167764769845, + "learning_rate": 1.998343779580718e-05, + "loss": 0.8728, + "step": 242 + }, + { + "epoch": 0.14391471720461949, + "grad_norm": 2.481390145790585, + "learning_rate": 1.9983067806585184e-05, + "loss": 0.8885, + "step": 243 + }, + { + "epoch": 0.1445069588392064, + "grad_norm": 6.220923433805533, + "learning_rate": 1.998269373372708e-05, + "loss": 0.8931, + "step": 244 + }, + { + "epoch": 0.1450992004737933, + "grad_norm": 4.045398395801386, + "learning_rate": 1.9982315577385885e-05, + "loss": 0.8867, + "step": 245 + }, + { + "epoch": 0.14569144210838023, + "grad_norm": 5.776802690732297, + "learning_rate": 1.9981933337716288e-05, + "loss": 0.8379, + "step": 246 + }, + { + "epoch": 0.14628368374296713, + "grad_norm": 14.36864250446338, + "learning_rate": 1.998154701487464e-05, + "loss": 0.9106, + "step": 247 + }, + { + "epoch": 0.14687592537755403, + "grad_norm": 3.967462127445808, + "learning_rate": 1.9981156609018977e-05, + "loss": 0.9349, + "step": 248 + }, + { + "epoch": 0.14746816701214097, + "grad_norm": 3.265424009929576, + "learning_rate": 1.998076212030899e-05, + "loss": 0.8946, + "step": 249 + }, + { + "epoch": 0.14806040864672787, + "grad_norm": 5.252463182631547, + "learning_rate": 1.9980363548906056e-05, + "loss": 0.9346, + "step": 250 + }, + { + "epoch": 0.14865265028131477, + "grad_norm": 3.913170557844579, + "learning_rate": 1.9979960894973202e-05, + "loss": 0.9643, + "step": 251 + }, + { + "epoch": 0.14924489191590168, + "grad_norm": 10.316912742558536, + "learning_rate": 1.9979554158675145e-05, + "loss": 0.8396, + "step": 252 + }, + { + "epoch": 0.1498371335504886, + "grad_norm": 3.808990535792799, + "learning_rate": 1.9979143340178258e-05, + "loss": 0.9148, + "step": 253 + }, + { + "epoch": 0.15042937518507551, + "grad_norm": 12.348960229120108, + "learning_rate": 1.997872843965059e-05, + "loss": 0.8809, + "step": 254 + }, + { + "epoch": 0.15102161681966242, + "grad_norm": 7.004456279640057, + "learning_rate": 1.997830945726186e-05, + "loss": 0.8835, + "step": 255 + }, + { + "epoch": 0.15161385845424932, + "grad_norm": 7.66590001684898, + "learning_rate": 1.9977886393183454e-05, + "loss": 0.8604, + "step": 256 + }, + { + "epoch": 0.15220610008883625, + "grad_norm": 3.272971996729421, + "learning_rate": 1.997745924758843e-05, + "loss": 0.9124, + "step": 257 + }, + { + "epoch": 0.15279834172342316, + "grad_norm": 3.163402751353713, + "learning_rate": 1.9977028020651516e-05, + "loss": 0.9089, + "step": 258 + }, + { + "epoch": 0.15339058335801006, + "grad_norm": 9.234252191086176, + "learning_rate": 1.9976592712549102e-05, + "loss": 0.9354, + "step": 259 + }, + { + "epoch": 0.15398282499259697, + "grad_norm": 3.471952759539684, + "learning_rate": 1.9976153323459262e-05, + "loss": 0.9084, + "step": 260 + }, + { + "epoch": 0.1545750666271839, + "grad_norm": 19.265326355686057, + "learning_rate": 1.9975709853561725e-05, + "loss": 0.8811, + "step": 261 + }, + { + "epoch": 0.1551673082617708, + "grad_norm": 25.404153390256994, + "learning_rate": 1.9975262303037896e-05, + "loss": 0.9463, + "step": 262 + }, + { + "epoch": 0.1557595498963577, + "grad_norm": 1.790796295391984, + "learning_rate": 1.997481067207085e-05, + "loss": 0.9076, + "step": 263 + }, + { + "epoch": 0.1563517915309446, + "grad_norm": 1.5429882740755128, + "learning_rate": 1.9974354960845326e-05, + "loss": 0.8787, + "step": 264 + }, + { + "epoch": 0.15694403316553154, + "grad_norm": 1.8706636553302713, + "learning_rate": 1.997389516954774e-05, + "loss": 0.9265, + "step": 265 + }, + { + "epoch": 0.15753627480011845, + "grad_norm": 2.730000619216694, + "learning_rate": 1.997343129836617e-05, + "loss": 0.9074, + "step": 266 + }, + { + "epoch": 0.15812851643470535, + "grad_norm": 6.628653976491072, + "learning_rate": 1.9972963347490366e-05, + "loss": 0.8885, + "step": 267 + }, + { + "epoch": 0.15872075806929228, + "grad_norm": 1.9178468971516744, + "learning_rate": 1.9972491317111745e-05, + "loss": 0.8867, + "step": 268 + }, + { + "epoch": 0.1593129997038792, + "grad_norm": 3.3558537963254267, + "learning_rate": 1.9972015207423396e-05, + "loss": 0.844, + "step": 269 + }, + { + "epoch": 0.1599052413384661, + "grad_norm": 4.932473991808671, + "learning_rate": 1.997153501862007e-05, + "loss": 0.9451, + "step": 270 + }, + { + "epoch": 0.160497482973053, + "grad_norm": 1.7319648821727869, + "learning_rate": 1.99710507508982e-05, + "loss": 0.8596, + "step": 271 + }, + { + "epoch": 0.16108972460763993, + "grad_norm": 3.1569769697720558, + "learning_rate": 1.9970562404455872e-05, + "loss": 0.89, + "step": 272 + }, + { + "epoch": 0.16168196624222683, + "grad_norm": 3.6061121751456753, + "learning_rate": 1.9970069979492846e-05, + "loss": 0.8865, + "step": 273 + }, + { + "epoch": 0.16227420787681374, + "grad_norm": 19.08626640686028, + "learning_rate": 1.9969573476210558e-05, + "loss": 0.9106, + "step": 274 + }, + { + "epoch": 0.16286644951140064, + "grad_norm": 1.5734768192780313, + "learning_rate": 1.99690728948121e-05, + "loss": 0.9331, + "step": 275 + }, + { + "epoch": 0.16345869114598757, + "grad_norm": 1.8545986500276248, + "learning_rate": 1.996856823550224e-05, + "loss": 0.8472, + "step": 276 + }, + { + "epoch": 0.16405093278057448, + "grad_norm": 2.3229100358592563, + "learning_rate": 1.9968059498487415e-05, + "loss": 0.8914, + "step": 277 + }, + { + "epoch": 0.16464317441516138, + "grad_norm": 1.736892427592765, + "learning_rate": 1.996754668397572e-05, + "loss": 0.9359, + "step": 278 + }, + { + "epoch": 0.16523541604974828, + "grad_norm": 3.6486468336727444, + "learning_rate": 1.9967029792176932e-05, + "loss": 0.8875, + "step": 279 + }, + { + "epoch": 0.16582765768433522, + "grad_norm": 1.7408683315370481, + "learning_rate": 1.9966508823302484e-05, + "loss": 0.877, + "step": 280 + }, + { + "epoch": 0.16641989931892212, + "grad_norm": 1.8687430980001674, + "learning_rate": 1.9965983777565483e-05, + "loss": 0.8905, + "step": 281 + }, + { + "epoch": 0.16701214095350903, + "grad_norm": 3.904642504972808, + "learning_rate": 1.9965454655180704e-05, + "loss": 0.852, + "step": 282 + }, + { + "epoch": 0.16760438258809596, + "grad_norm": 1.9130632597634436, + "learning_rate": 1.9964921456364584e-05, + "loss": 0.9199, + "step": 283 + }, + { + "epoch": 0.16819662422268286, + "grad_norm": 2.675290531374258, + "learning_rate": 1.9964384181335237e-05, + "loss": 0.9072, + "step": 284 + }, + { + "epoch": 0.16878886585726977, + "grad_norm": 1.5847471748990105, + "learning_rate": 1.9963842830312434e-05, + "loss": 0.9185, + "step": 285 + }, + { + "epoch": 0.16938110749185667, + "grad_norm": 10.321315145183203, + "learning_rate": 1.996329740351762e-05, + "loss": 0.8816, + "step": 286 + }, + { + "epoch": 0.1699733491264436, + "grad_norm": 10.276955241455104, + "learning_rate": 1.9962747901173904e-05, + "loss": 0.9097, + "step": 287 + }, + { + "epoch": 0.1705655907610305, + "grad_norm": 2.3615243550709155, + "learning_rate": 1.9962194323506064e-05, + "loss": 0.928, + "step": 288 + }, + { + "epoch": 0.1711578323956174, + "grad_norm": 1.1933250899884889, + "learning_rate": 1.9961636670740546e-05, + "loss": 0.9304, + "step": 289 + }, + { + "epoch": 0.1717500740302043, + "grad_norm": 3.5837635071883396, + "learning_rate": 1.9961074943105457e-05, + "loss": 0.9264, + "step": 290 + }, + { + "epoch": 0.17234231566479125, + "grad_norm": 1.4231322142951308, + "learning_rate": 1.996050914083058e-05, + "loss": 0.915, + "step": 291 + }, + { + "epoch": 0.17293455729937815, + "grad_norm": 2.3576679543712724, + "learning_rate": 1.9959939264147355e-05, + "loss": 0.9305, + "step": 292 + }, + { + "epoch": 0.17352679893396505, + "grad_norm": 2.292913955194318, + "learning_rate": 1.99593653132889e-05, + "loss": 0.9098, + "step": 293 + }, + { + "epoch": 0.17411904056855196, + "grad_norm": 1.9398072904004944, + "learning_rate": 1.9958787288489983e-05, + "loss": 0.9107, + "step": 294 + }, + { + "epoch": 0.1747112822031389, + "grad_norm": 11.08115964739409, + "learning_rate": 1.9958205189987066e-05, + "loss": 0.8475, + "step": 295 + }, + { + "epoch": 0.1753035238377258, + "grad_norm": 2.6304317819640852, + "learning_rate": 1.9957619018018243e-05, + "loss": 0.8905, + "step": 296 + }, + { + "epoch": 0.1758957654723127, + "grad_norm": 3.3173060476207477, + "learning_rate": 1.99570287728233e-05, + "loss": 0.8585, + "step": 297 + }, + { + "epoch": 0.1764880071068996, + "grad_norm": 1.7335753531753704, + "learning_rate": 1.9956434454643675e-05, + "loss": 0.8835, + "step": 298 + }, + { + "epoch": 0.17708024874148653, + "grad_norm": 1.5781518168435433, + "learning_rate": 1.995583606372248e-05, + "loss": 0.8636, + "step": 299 + }, + { + "epoch": 0.17767249037607344, + "grad_norm": 3.186713245410372, + "learning_rate": 1.9955233600304496e-05, + "loss": 0.8857, + "step": 300 + }, + { + "epoch": 0.17826473201066034, + "grad_norm": 2.4742503690715254, + "learning_rate": 1.9954627064636157e-05, + "loss": 0.9306, + "step": 301 + }, + { + "epoch": 0.17885697364524727, + "grad_norm": 2.1188838286209513, + "learning_rate": 1.995401645696557e-05, + "loss": 0.8874, + "step": 302 + }, + { + "epoch": 0.17944921527983418, + "grad_norm": 1.979483491889314, + "learning_rate": 1.9953401777542517e-05, + "loss": 0.9408, + "step": 303 + }, + { + "epoch": 0.18004145691442108, + "grad_norm": 1.285049652875338, + "learning_rate": 1.9952783026618424e-05, + "loss": 0.8872, + "step": 304 + }, + { + "epoch": 0.180633698549008, + "grad_norm": 1.9180889335059634, + "learning_rate": 1.9952160204446404e-05, + "loss": 0.8832, + "step": 305 + }, + { + "epoch": 0.18122594018359492, + "grad_norm": 2.556643811478596, + "learning_rate": 1.995153331128122e-05, + "loss": 0.8946, + "step": 306 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 2.4449018469180723, + "learning_rate": 1.9950902347379316e-05, + "loss": 0.9229, + "step": 307 + }, + { + "epoch": 0.18241042345276873, + "grad_norm": 2.436648980404225, + "learning_rate": 1.9950267312998783e-05, + "loss": 0.9155, + "step": 308 + }, + { + "epoch": 0.18300266508735563, + "grad_norm": 2.1396571184843793, + "learning_rate": 1.9949628208399388e-05, + "loss": 0.9097, + "step": 309 + }, + { + "epoch": 0.18359490672194256, + "grad_norm": 1.8397633591444138, + "learning_rate": 1.994898503384256e-05, + "loss": 0.9461, + "step": 310 + }, + { + "epoch": 0.18418714835652947, + "grad_norm": 5.288654703997796, + "learning_rate": 1.9948337789591396e-05, + "loss": 0.9367, + "step": 311 + }, + { + "epoch": 0.18477938999111637, + "grad_norm": 1.8505284885364202, + "learning_rate": 1.9947686475910656e-05, + "loss": 0.8994, + "step": 312 + }, + { + "epoch": 0.18537163162570328, + "grad_norm": 1.655781456697678, + "learning_rate": 1.9947031093066758e-05, + "loss": 0.8772, + "step": 313 + }, + { + "epoch": 0.1859638732602902, + "grad_norm": 2.0037040325972835, + "learning_rate": 1.9946371641327794e-05, + "loss": 0.9276, + "step": 314 + }, + { + "epoch": 0.1865561148948771, + "grad_norm": 4.111557030910506, + "learning_rate": 1.994570812096352e-05, + "loss": 0.9274, + "step": 315 + }, + { + "epoch": 0.18714835652946402, + "grad_norm": 1.6261247430270642, + "learning_rate": 1.9945040532245352e-05, + "loss": 0.9014, + "step": 316 + }, + { + "epoch": 0.18774059816405092, + "grad_norm": 1.6162415339185936, + "learning_rate": 1.9944368875446363e-05, + "loss": 0.9511, + "step": 317 + }, + { + "epoch": 0.18833283979863785, + "grad_norm": 1.5148471582795817, + "learning_rate": 1.9943693150841312e-05, + "loss": 0.8605, + "step": 318 + }, + { + "epoch": 0.18892508143322476, + "grad_norm": 1.540594634489607, + "learning_rate": 1.99430133587066e-05, + "loss": 0.8838, + "step": 319 + }, + { + "epoch": 0.18951732306781166, + "grad_norm": 2.049032648950319, + "learning_rate": 1.9942329499320298e-05, + "loss": 0.9253, + "step": 320 + }, + { + "epoch": 0.1901095647023986, + "grad_norm": 1.977962711697837, + "learning_rate": 1.994164157296215e-05, + "loss": 0.921, + "step": 321 + }, + { + "epoch": 0.1907018063369855, + "grad_norm": 5.4655707773851265, + "learning_rate": 1.994094957991355e-05, + "loss": 0.8246, + "step": 322 + }, + { + "epoch": 0.1912940479715724, + "grad_norm": 1.264635984959813, + "learning_rate": 1.994025352045757e-05, + "loss": 0.8664, + "step": 323 + }, + { + "epoch": 0.1918862896061593, + "grad_norm": 2.1323498253950275, + "learning_rate": 1.9939553394878926e-05, + "loss": 0.9058, + "step": 324 + }, + { + "epoch": 0.19247853124074624, + "grad_norm": 1.9659140774542014, + "learning_rate": 1.9938849203464023e-05, + "loss": 0.8865, + "step": 325 + }, + { + "epoch": 0.19307077287533314, + "grad_norm": 1.9108336737757217, + "learning_rate": 1.99381409465009e-05, + "loss": 0.9114, + "step": 326 + }, + { + "epoch": 0.19366301450992005, + "grad_norm": 1.9649788148181904, + "learning_rate": 1.9937428624279284e-05, + "loss": 0.8786, + "step": 327 + }, + { + "epoch": 0.19425525614450695, + "grad_norm": 1.999999763126164, + "learning_rate": 1.9936712237090554e-05, + "loss": 0.8931, + "step": 328 + }, + { + "epoch": 0.19484749777909388, + "grad_norm": 2.6307075206759976, + "learning_rate": 1.993599178522775e-05, + "loss": 0.8924, + "step": 329 + }, + { + "epoch": 0.19543973941368079, + "grad_norm": 1.6672297479747191, + "learning_rate": 1.9935267268985577e-05, + "loss": 0.8818, + "step": 330 + }, + { + "epoch": 0.1960319810482677, + "grad_norm": 1.9888771189937435, + "learning_rate": 1.9934538688660403e-05, + "loss": 0.9191, + "step": 331 + }, + { + "epoch": 0.1966242226828546, + "grad_norm": 2.656316010840173, + "learning_rate": 1.9933806044550262e-05, + "loss": 0.8806, + "step": 332 + }, + { + "epoch": 0.19721646431744153, + "grad_norm": 4.286512255009421, + "learning_rate": 1.9933069336954842e-05, + "loss": 0.8969, + "step": 333 + }, + { + "epoch": 0.19780870595202843, + "grad_norm": 3.055765430366512, + "learning_rate": 1.99323285661755e-05, + "loss": 0.9115, + "step": 334 + }, + { + "epoch": 0.19840094758661533, + "grad_norm": 1.7964416584438196, + "learning_rate": 1.9931583732515252e-05, + "loss": 0.8734, + "step": 335 + }, + { + "epoch": 0.19899318922120224, + "grad_norm": 1.8519391859954, + "learning_rate": 1.993083483627878e-05, + "loss": 0.9265, + "step": 336 + }, + { + "epoch": 0.19958543085578917, + "grad_norm": 1.9971914645128772, + "learning_rate": 1.993008187777242e-05, + "loss": 0.8784, + "step": 337 + }, + { + "epoch": 0.20017767249037607, + "grad_norm": 2.4232990241295522, + "learning_rate": 1.9929324857304175e-05, + "loss": 0.8837, + "step": 338 + }, + { + "epoch": 0.20076991412496298, + "grad_norm": 1.4178062062472774, + "learning_rate": 1.9928563775183713e-05, + "loss": 0.8635, + "step": 339 + }, + { + "epoch": 0.2013621557595499, + "grad_norm": 4.756076969204727, + "learning_rate": 1.9927798631722353e-05, + "loss": 0.9008, + "step": 340 + }, + { + "epoch": 0.20195439739413681, + "grad_norm": 5.072255729219359, + "learning_rate": 1.992702942723309e-05, + "loss": 0.8908, + "step": 341 + }, + { + "epoch": 0.20254663902872372, + "grad_norm": 3.724264153241877, + "learning_rate": 1.9926256162030564e-05, + "loss": 0.893, + "step": 342 + }, + { + "epoch": 0.20313888066331062, + "grad_norm": 2.284558332966103, + "learning_rate": 1.992547883643109e-05, + "loss": 0.8823, + "step": 343 + }, + { + "epoch": 0.20373112229789755, + "grad_norm": 7.51141744653435, + "learning_rate": 1.9924697450752636e-05, + "loss": 0.8618, + "step": 344 + }, + { + "epoch": 0.20432336393248446, + "grad_norm": 2.7293707676417487, + "learning_rate": 1.9923912005314827e-05, + "loss": 0.8983, + "step": 345 + }, + { + "epoch": 0.20491560556707136, + "grad_norm": 3.2471120301938776, + "learning_rate": 1.9923122500438964e-05, + "loss": 0.9105, + "step": 346 + }, + { + "epoch": 0.20550784720165827, + "grad_norm": 6.07549147250831, + "learning_rate": 1.9922328936447992e-05, + "loss": 0.8483, + "step": 347 + }, + { + "epoch": 0.2061000888362452, + "grad_norm": 2.791357973212501, + "learning_rate": 1.9921531313666526e-05, + "loss": 0.8484, + "step": 348 + }, + { + "epoch": 0.2066923304708321, + "grad_norm": 2.076877360733566, + "learning_rate": 1.992072963242084e-05, + "loss": 0.9362, + "step": 349 + }, + { + "epoch": 0.207284572105419, + "grad_norm": 2.7096944494505024, + "learning_rate": 1.9919923893038863e-05, + "loss": 0.8252, + "step": 350 + }, + { + "epoch": 0.2078768137400059, + "grad_norm": 2.355203227286866, + "learning_rate": 1.991911409585019e-05, + "loss": 0.92, + "step": 351 + }, + { + "epoch": 0.20846905537459284, + "grad_norm": 3.436005203104444, + "learning_rate": 1.991830024118607e-05, + "loss": 0.8602, + "step": 352 + }, + { + "epoch": 0.20906129700917975, + "grad_norm": 2.3276079656629856, + "learning_rate": 1.991748232937942e-05, + "loss": 0.8665, + "step": 353 + }, + { + "epoch": 0.20965353864376665, + "grad_norm": 3.406633969476107, + "learning_rate": 1.991666036076481e-05, + "loss": 0.8774, + "step": 354 + }, + { + "epoch": 0.21024578027835356, + "grad_norm": 3.2519752361578043, + "learning_rate": 1.9915834335678476e-05, + "loss": 0.938, + "step": 355 + }, + { + "epoch": 0.2108380219129405, + "grad_norm": 1.5571411069585908, + "learning_rate": 1.99150042544583e-05, + "loss": 0.8665, + "step": 356 + }, + { + "epoch": 0.2114302635475274, + "grad_norm": 1.797561656093368, + "learning_rate": 1.9914170117443837e-05, + "loss": 0.8467, + "step": 357 + }, + { + "epoch": 0.2120225051821143, + "grad_norm": 2.173677269835551, + "learning_rate": 1.9913331924976295e-05, + "loss": 0.8809, + "step": 358 + }, + { + "epoch": 0.21261474681670123, + "grad_norm": 1.4857064069360264, + "learning_rate": 1.9912489677398543e-05, + "loss": 0.8524, + "step": 359 + }, + { + "epoch": 0.21320698845128813, + "grad_norm": 1.5451577112951609, + "learning_rate": 1.991164337505511e-05, + "loss": 0.8625, + "step": 360 + }, + { + "epoch": 0.21379923008587504, + "grad_norm": 4.152036575903902, + "learning_rate": 1.9910793018292168e-05, + "loss": 0.891, + "step": 361 + }, + { + "epoch": 0.21439147172046194, + "grad_norm": 1.7176492006424662, + "learning_rate": 1.990993860745758e-05, + "loss": 0.8393, + "step": 362 + }, + { + "epoch": 0.21498371335504887, + "grad_norm": 16.044944435431123, + "learning_rate": 1.9909080142900835e-05, + "loss": 0.8994, + "step": 363 + }, + { + "epoch": 0.21557595498963578, + "grad_norm": 3.070726746879335, + "learning_rate": 1.99082176249731e-05, + "loss": 0.9275, + "step": 364 + }, + { + "epoch": 0.21616819662422268, + "grad_norm": 11.949735878042022, + "learning_rate": 1.9907351054027188e-05, + "loss": 0.8794, + "step": 365 + }, + { + "epoch": 0.21676043825880958, + "grad_norm": 4.743677400565406, + "learning_rate": 1.9906480430417575e-05, + "loss": 0.8665, + "step": 366 + }, + { + "epoch": 0.21735267989339652, + "grad_norm": 6.192596337832938, + "learning_rate": 1.99056057545004e-05, + "loss": 0.8838, + "step": 367 + }, + { + "epoch": 0.21794492152798342, + "grad_norm": 2.1298876915873426, + "learning_rate": 1.9904727026633453e-05, + "loss": 0.8655, + "step": 368 + }, + { + "epoch": 0.21853716316257032, + "grad_norm": 2.872112183326634, + "learning_rate": 1.9903844247176182e-05, + "loss": 0.8857, + "step": 369 + }, + { + "epoch": 0.21912940479715723, + "grad_norm": 2.116013503348104, + "learning_rate": 1.9902957416489693e-05, + "loss": 0.8675, + "step": 370 + }, + { + "epoch": 0.21972164643174416, + "grad_norm": 1.9650140489362669, + "learning_rate": 1.990206653493675e-05, + "loss": 0.8901, + "step": 371 + }, + { + "epoch": 0.22031388806633107, + "grad_norm": 2.1533395141766847, + "learning_rate": 1.9901171602881778e-05, + "loss": 0.8268, + "step": 372 + }, + { + "epoch": 0.22090612970091797, + "grad_norm": 1.8679750441425857, + "learning_rate": 1.990027262069085e-05, + "loss": 0.8956, + "step": 373 + }, + { + "epoch": 0.22149837133550487, + "grad_norm": 1.876927910343411, + "learning_rate": 1.9899369588731697e-05, + "loss": 0.915, + "step": 374 + }, + { + "epoch": 0.2220906129700918, + "grad_norm": 1.966063586945644, + "learning_rate": 1.9898462507373713e-05, + "loss": 0.85, + "step": 375 + }, + { + "epoch": 0.2226828546046787, + "grad_norm": 5.175814315545093, + "learning_rate": 1.9897551376987948e-05, + "loss": 0.8877, + "step": 376 + }, + { + "epoch": 0.2232750962392656, + "grad_norm": 3.3647238312551293, + "learning_rate": 1.9896636197947104e-05, + "loss": 0.9013, + "step": 377 + }, + { + "epoch": 0.22386733787385255, + "grad_norm": 3.0764967820975113, + "learning_rate": 1.9895716970625544e-05, + "loss": 0.9351, + "step": 378 + }, + { + "epoch": 0.22445957950843945, + "grad_norm": 2.601494620918347, + "learning_rate": 1.9894793695399276e-05, + "loss": 0.8853, + "step": 379 + }, + { + "epoch": 0.22505182114302635, + "grad_norm": 2.4828022845602735, + "learning_rate": 1.9893866372645975e-05, + "loss": 0.8749, + "step": 380 + }, + { + "epoch": 0.22564406277761326, + "grad_norm": 2.0904211550387846, + "learning_rate": 1.989293500274497e-05, + "loss": 0.8613, + "step": 381 + }, + { + "epoch": 0.2262363044122002, + "grad_norm": 2.594547419908662, + "learning_rate": 1.989199958607724e-05, + "loss": 0.8853, + "step": 382 + }, + { + "epoch": 0.2268285460467871, + "grad_norm": 3.3824116389291046, + "learning_rate": 1.9891060123025427e-05, + "loss": 0.8458, + "step": 383 + }, + { + "epoch": 0.227420787681374, + "grad_norm": 2.388543281852487, + "learning_rate": 1.9890116613973822e-05, + "loss": 0.8779, + "step": 384 + }, + { + "epoch": 0.2280130293159609, + "grad_norm": 2.3270199405136425, + "learning_rate": 1.9889169059308374e-05, + "loss": 0.9211, + "step": 385 + }, + { + "epoch": 0.22860527095054783, + "grad_norm": 3.4392018992585625, + "learning_rate": 1.9888217459416685e-05, + "loss": 0.8896, + "step": 386 + }, + { + "epoch": 0.22919751258513474, + "grad_norm": 2.9845110896198865, + "learning_rate": 1.9887261814688017e-05, + "loss": 0.8656, + "step": 387 + }, + { + "epoch": 0.22978975421972164, + "grad_norm": 1.7722551264363466, + "learning_rate": 1.9886302125513276e-05, + "loss": 0.9233, + "step": 388 + }, + { + "epoch": 0.23038199585430855, + "grad_norm": 2.3468863928571864, + "learning_rate": 1.9885338392285032e-05, + "loss": 0.9028, + "step": 389 + }, + { + "epoch": 0.23097423748889548, + "grad_norm": 2.2088822196102136, + "learning_rate": 1.9884370615397507e-05, + "loss": 0.876, + "step": 390 + }, + { + "epoch": 0.23156647912348238, + "grad_norm": 2.954411705632498, + "learning_rate": 1.9883398795246577e-05, + "loss": 0.9139, + "step": 391 + }, + { + "epoch": 0.2321587207580693, + "grad_norm": 2.2449696353833963, + "learning_rate": 1.9882422932229765e-05, + "loss": 0.8882, + "step": 392 + }, + { + "epoch": 0.2327509623926562, + "grad_norm": 1.911476833050929, + "learning_rate": 1.988144302674626e-05, + "loss": 0.8947, + "step": 393 + }, + { + "epoch": 0.23334320402724312, + "grad_norm": 2.0157438479335625, + "learning_rate": 1.9880459079196898e-05, + "loss": 0.8994, + "step": 394 + }, + { + "epoch": 0.23393544566183003, + "grad_norm": 2.2549219273884655, + "learning_rate": 1.9879471089984168e-05, + "loss": 0.8613, + "step": 395 + }, + { + "epoch": 0.23452768729641693, + "grad_norm": 2.369416456819026, + "learning_rate": 1.9878479059512212e-05, + "loss": 0.8987, + "step": 396 + }, + { + "epoch": 0.23511992893100386, + "grad_norm": 2.065708029987891, + "learning_rate": 1.9877482988186825e-05, + "loss": 0.8903, + "step": 397 + }, + { + "epoch": 0.23571217056559077, + "grad_norm": 1.7370389947259615, + "learning_rate": 1.987648287641546e-05, + "loss": 0.9043, + "step": 398 + }, + { + "epoch": 0.23630441220017767, + "grad_norm": 3.4678559229085546, + "learning_rate": 1.987547872460722e-05, + "loss": 0.867, + "step": 399 + }, + { + "epoch": 0.23689665383476458, + "grad_norm": 2.1054707516143694, + "learning_rate": 1.987447053317285e-05, + "loss": 0.863, + "step": 400 + }, + { + "epoch": 0.2374888954693515, + "grad_norm": 2.820738998283495, + "learning_rate": 1.9873458302524767e-05, + "loss": 0.9357, + "step": 401 + }, + { + "epoch": 0.2380811371039384, + "grad_norm": 4.024264238834477, + "learning_rate": 1.9872442033077027e-05, + "loss": 0.8398, + "step": 402 + }, + { + "epoch": 0.23867337873852532, + "grad_norm": 2.7872478316820755, + "learning_rate": 1.9871421725245342e-05, + "loss": 0.8621, + "step": 403 + }, + { + "epoch": 0.23926562037311222, + "grad_norm": 2.641334532671031, + "learning_rate": 1.9870397379447074e-05, + "loss": 0.9111, + "step": 404 + }, + { + "epoch": 0.23985786200769915, + "grad_norm": 11.371926019038037, + "learning_rate": 1.9869368996101238e-05, + "loss": 0.863, + "step": 405 + }, + { + "epoch": 0.24045010364228606, + "grad_norm": 2.7635300627133925, + "learning_rate": 1.98683365756285e-05, + "loss": 0.866, + "step": 406 + }, + { + "epoch": 0.24104234527687296, + "grad_norm": 2.7794994837778084, + "learning_rate": 1.986730011845118e-05, + "loss": 0.8632, + "step": 407 + }, + { + "epoch": 0.24163458691145986, + "grad_norm": 3.08645053943537, + "learning_rate": 1.9866259624993246e-05, + "loss": 0.9136, + "step": 408 + }, + { + "epoch": 0.2422268285460468, + "grad_norm": 2.6869600181928255, + "learning_rate": 1.9865215095680322e-05, + "loss": 0.8791, + "step": 409 + }, + { + "epoch": 0.2428190701806337, + "grad_norm": 2.772950886041293, + "learning_rate": 1.986416653093967e-05, + "loss": 0.9567, + "step": 410 + }, + { + "epoch": 0.2434113118152206, + "grad_norm": 4.210601184028655, + "learning_rate": 1.986311393120022e-05, + "loss": 0.8685, + "step": 411 + }, + { + "epoch": 0.2440035534498075, + "grad_norm": 9.174595783532942, + "learning_rate": 1.9862057296892546e-05, + "loss": 0.888, + "step": 412 + }, + { + "epoch": 0.24459579508439444, + "grad_norm": 4.247915627932793, + "learning_rate": 1.9860996628448866e-05, + "loss": 0.8967, + "step": 413 + }, + { + "epoch": 0.24518803671898134, + "grad_norm": 2.835218121637848, + "learning_rate": 1.985993192630305e-05, + "loss": 0.8579, + "step": 414 + }, + { + "epoch": 0.24578027835356825, + "grad_norm": 2.9565335495075638, + "learning_rate": 1.985886319089063e-05, + "loss": 0.9049, + "step": 415 + }, + { + "epoch": 0.24637251998815518, + "grad_norm": 3.778725444535079, + "learning_rate": 1.9857790422648774e-05, + "loss": 0.8764, + "step": 416 + }, + { + "epoch": 0.24696476162274208, + "grad_norm": 3.1572313052180685, + "learning_rate": 1.9856713622016305e-05, + "loss": 0.8345, + "step": 417 + }, + { + "epoch": 0.247557003257329, + "grad_norm": 2.24801735688883, + "learning_rate": 1.9855632789433695e-05, + "loss": 0.8645, + "step": 418 + }, + { + "epoch": 0.2481492448919159, + "grad_norm": 23.485871617876615, + "learning_rate": 1.985454792534306e-05, + "loss": 0.8602, + "step": 419 + }, + { + "epoch": 0.24874148652650283, + "grad_norm": 6.005824269857163, + "learning_rate": 1.9853459030188183e-05, + "loss": 0.8688, + "step": 420 + }, + { + "epoch": 0.24933372816108973, + "grad_norm": 2.1616583205115796, + "learning_rate": 1.985236610441447e-05, + "loss": 0.907, + "step": 421 + }, + { + "epoch": 0.24992596979567663, + "grad_norm": 2.7443618096985993, + "learning_rate": 1.9851269148468998e-05, + "loss": 0.8735, + "step": 422 + }, + { + "epoch": 0.25051821143026354, + "grad_norm": 1.9467808502303237, + "learning_rate": 1.9850168162800482e-05, + "loss": 0.88, + "step": 423 + }, + { + "epoch": 0.25111045306485047, + "grad_norm": 2.293568091632234, + "learning_rate": 1.9849063147859282e-05, + "loss": 0.8251, + "step": 424 + }, + { + "epoch": 0.25170269469943735, + "grad_norm": 6.45680121195122, + "learning_rate": 1.9847954104097416e-05, + "loss": 0.8671, + "step": 425 + }, + { + "epoch": 0.2522949363340243, + "grad_norm": 2.6574165173210362, + "learning_rate": 1.9846841031968545e-05, + "loss": 0.8692, + "step": 426 + }, + { + "epoch": 0.2528871779686112, + "grad_norm": 3.3705354945908645, + "learning_rate": 1.9845723931927975e-05, + "loss": 0.8723, + "step": 427 + }, + { + "epoch": 0.2534794196031981, + "grad_norm": 2.039139055549686, + "learning_rate": 1.9844602804432667e-05, + "loss": 0.8749, + "step": 428 + }, + { + "epoch": 0.254071661237785, + "grad_norm": 2.3388722575578482, + "learning_rate": 1.9843477649941223e-05, + "loss": 0.8881, + "step": 429 + }, + { + "epoch": 0.25466390287237195, + "grad_norm": 2.3766640178995257, + "learning_rate": 1.9842348468913895e-05, + "loss": 0.8715, + "step": 430 + }, + { + "epoch": 0.2552561445069588, + "grad_norm": 2.114898403529378, + "learning_rate": 1.9841215261812578e-05, + "loss": 0.8633, + "step": 431 + }, + { + "epoch": 0.25584838614154576, + "grad_norm": 3.219697021874039, + "learning_rate": 1.9840078029100826e-05, + "loss": 0.8636, + "step": 432 + }, + { + "epoch": 0.25644062777613263, + "grad_norm": 2.73094050276702, + "learning_rate": 1.9838936771243823e-05, + "loss": 0.8747, + "step": 433 + }, + { + "epoch": 0.25703286941071957, + "grad_norm": 3.2677036829236252, + "learning_rate": 1.983779148870841e-05, + "loss": 0.8409, + "step": 434 + }, + { + "epoch": 0.2576251110453065, + "grad_norm": 3.691432854617829, + "learning_rate": 1.9836642181963074e-05, + "loss": 0.9213, + "step": 435 + }, + { + "epoch": 0.2582173526798934, + "grad_norm": 1.913831109953419, + "learning_rate": 1.9835488851477943e-05, + "loss": 0.878, + "step": 436 + }, + { + "epoch": 0.2588095943144803, + "grad_norm": 1.362390320852231, + "learning_rate": 1.9834331497724795e-05, + "loss": 0.8654, + "step": 437 + }, + { + "epoch": 0.25940183594906724, + "grad_norm": 4.080901803953802, + "learning_rate": 1.983317012117705e-05, + "loss": 0.8479, + "step": 438 + }, + { + "epoch": 0.2599940775836541, + "grad_norm": 2.235992282079437, + "learning_rate": 1.983200472230979e-05, + "loss": 0.8802, + "step": 439 + }, + { + "epoch": 0.26058631921824105, + "grad_norm": 2.171461179516376, + "learning_rate": 1.983083530159971e-05, + "loss": 0.9045, + "step": 440 + }, + { + "epoch": 0.261178560852828, + "grad_norm": 2.7476770749878145, + "learning_rate": 1.9829661859525176e-05, + "loss": 0.9307, + "step": 441 + }, + { + "epoch": 0.26177080248741486, + "grad_norm": 4.118541834493762, + "learning_rate": 1.9828484396566197e-05, + "loss": 0.9237, + "step": 442 + }, + { + "epoch": 0.2623630441220018, + "grad_norm": 1.827944414998417, + "learning_rate": 1.982730291320442e-05, + "loss": 0.8942, + "step": 443 + }, + { + "epoch": 0.26295528575658866, + "grad_norm": 1.8056252338200751, + "learning_rate": 1.982611740992313e-05, + "loss": 0.8737, + "step": 444 + }, + { + "epoch": 0.2635475273911756, + "grad_norm": 2.1706398680744328, + "learning_rate": 1.982492788720727e-05, + "loss": 0.8847, + "step": 445 + }, + { + "epoch": 0.2641397690257625, + "grad_norm": 2.0469770330929933, + "learning_rate": 1.9823734345543422e-05, + "loss": 0.8588, + "step": 446 + }, + { + "epoch": 0.2647320106603494, + "grad_norm": 1.86522114555931, + "learning_rate": 1.9822536785419815e-05, + "loss": 0.8934, + "step": 447 + }, + { + "epoch": 0.26532425229493634, + "grad_norm": 1.5844917473664697, + "learning_rate": 1.982133520732631e-05, + "loss": 0.8978, + "step": 448 + }, + { + "epoch": 0.26591649392952327, + "grad_norm": 2.8109169891194394, + "learning_rate": 1.9820129611754428e-05, + "loss": 0.889, + "step": 449 + }, + { + "epoch": 0.26650873556411014, + "grad_norm": 2.176184810578095, + "learning_rate": 1.981891999919732e-05, + "loss": 0.8648, + "step": 450 + }, + { + "epoch": 0.2671009771986971, + "grad_norm": 2.343556853907917, + "learning_rate": 1.981770637014979e-05, + "loss": 0.8753, + "step": 451 + }, + { + "epoch": 0.26769321883328395, + "grad_norm": 2.961712486256478, + "learning_rate": 1.981648872510828e-05, + "loss": 0.8267, + "step": 452 + }, + { + "epoch": 0.2682854604678709, + "grad_norm": 1.4529926233743575, + "learning_rate": 1.981526706457087e-05, + "loss": 0.8387, + "step": 453 + }, + { + "epoch": 0.2688777021024578, + "grad_norm": 1.9505755733893613, + "learning_rate": 1.9814041389037292e-05, + "loss": 0.8853, + "step": 454 + }, + { + "epoch": 0.2694699437370447, + "grad_norm": 2.200797769974991, + "learning_rate": 1.981281169900892e-05, + "loss": 0.9057, + "step": 455 + }, + { + "epoch": 0.2700621853716316, + "grad_norm": 2.1870312653366137, + "learning_rate": 1.9811577994988755e-05, + "loss": 0.872, + "step": 456 + }, + { + "epoch": 0.27065442700621856, + "grad_norm": 1.2993955402450537, + "learning_rate": 1.9810340277481463e-05, + "loss": 0.8613, + "step": 457 + }, + { + "epoch": 0.27124666864080543, + "grad_norm": 2.425202483291421, + "learning_rate": 1.9809098546993333e-05, + "loss": 0.9174, + "step": 458 + }, + { + "epoch": 0.27183891027539236, + "grad_norm": 1.83353264512604, + "learning_rate": 1.9807852804032306e-05, + "loss": 0.8911, + "step": 459 + }, + { + "epoch": 0.2724311519099793, + "grad_norm": 1.8314326605476354, + "learning_rate": 1.980660304910796e-05, + "loss": 0.9341, + "step": 460 + }, + { + "epoch": 0.2730233935445662, + "grad_norm": 1.6973761426278493, + "learning_rate": 1.9805349282731513e-05, + "loss": 0.9077, + "step": 461 + }, + { + "epoch": 0.2736156351791531, + "grad_norm": 1.695076266085342, + "learning_rate": 1.9804091505415833e-05, + "loss": 0.8425, + "step": 462 + }, + { + "epoch": 0.27420787681374, + "grad_norm": 1.6422303345636073, + "learning_rate": 1.9802829717675413e-05, + "loss": 0.8885, + "step": 463 + }, + { + "epoch": 0.2748001184483269, + "grad_norm": 2.8627350079091456, + "learning_rate": 1.98015639200264e-05, + "loss": 0.8507, + "step": 464 + }, + { + "epoch": 0.27539236008291385, + "grad_norm": 1.3298065268134922, + "learning_rate": 1.980029411298657e-05, + "loss": 0.8834, + "step": 465 + }, + { + "epoch": 0.2759846017175007, + "grad_norm": 1.3923498378130503, + "learning_rate": 1.979902029707536e-05, + "loss": 0.8122, + "step": 466 + }, + { + "epoch": 0.27657684335208765, + "grad_norm": 2.558317417775508, + "learning_rate": 1.9797742472813815e-05, + "loss": 0.9, + "step": 467 + }, + { + "epoch": 0.2771690849866746, + "grad_norm": 2.184421600404926, + "learning_rate": 1.9796460640724646e-05, + "loss": 0.8803, + "step": 468 + }, + { + "epoch": 0.27776132662126146, + "grad_norm": 1.5965091803938036, + "learning_rate": 1.9795174801332195e-05, + "loss": 0.8966, + "step": 469 + }, + { + "epoch": 0.2783535682558484, + "grad_norm": 1.6260839104501157, + "learning_rate": 1.9793884955162442e-05, + "loss": 0.867, + "step": 470 + }, + { + "epoch": 0.27894580989043527, + "grad_norm": 1.7420396047568658, + "learning_rate": 1.9792591102743006e-05, + "loss": 0.8878, + "step": 471 + }, + { + "epoch": 0.2795380515250222, + "grad_norm": 1.306769898643035, + "learning_rate": 1.979129324460314e-05, + "loss": 0.8346, + "step": 472 + }, + { + "epoch": 0.28013029315960913, + "grad_norm": 2.108370805335531, + "learning_rate": 1.978999138127375e-05, + "loss": 0.8706, + "step": 473 + }, + { + "epoch": 0.280722534794196, + "grad_norm": 4.654961995545459, + "learning_rate": 1.9788685513287368e-05, + "loss": 0.8544, + "step": 474 + }, + { + "epoch": 0.28131477642878294, + "grad_norm": 5.160042089413392, + "learning_rate": 1.9787375641178162e-05, + "loss": 0.8469, + "step": 475 + }, + { + "epoch": 0.2819070180633699, + "grad_norm": 2.0855599172877337, + "learning_rate": 1.9786061765481954e-05, + "loss": 0.9022, + "step": 476 + }, + { + "epoch": 0.28249925969795675, + "grad_norm": 1.621103726662229, + "learning_rate": 1.9784743886736185e-05, + "loss": 0.8852, + "step": 477 + }, + { + "epoch": 0.2830915013325437, + "grad_norm": 1.959405094759062, + "learning_rate": 1.9783422005479942e-05, + "loss": 0.8392, + "step": 478 + }, + { + "epoch": 0.2836837429671306, + "grad_norm": 2.3454164384529523, + "learning_rate": 1.978209612225395e-05, + "loss": 0.8649, + "step": 479 + }, + { + "epoch": 0.2842759846017175, + "grad_norm": 2.3325577065069933, + "learning_rate": 1.9780766237600574e-05, + "loss": 0.8854, + "step": 480 + }, + { + "epoch": 0.2848682262363044, + "grad_norm": 2.937131824507951, + "learning_rate": 1.9779432352063806e-05, + "loss": 0.9174, + "step": 481 + }, + { + "epoch": 0.2854604678708913, + "grad_norm": 2.1825918189828744, + "learning_rate": 1.977809446618928e-05, + "loss": 0.8332, + "step": 482 + }, + { + "epoch": 0.28605270950547823, + "grad_norm": 2.6555766398144494, + "learning_rate": 1.9776752580524268e-05, + "loss": 0.8701, + "step": 483 + }, + { + "epoch": 0.28664495114006516, + "grad_norm": 2.1721573688476927, + "learning_rate": 1.9775406695617677e-05, + "loss": 0.9111, + "step": 484 + }, + { + "epoch": 0.28723719277465204, + "grad_norm": 2.605674968341765, + "learning_rate": 1.977405681202005e-05, + "loss": 0.9035, + "step": 485 + }, + { + "epoch": 0.28782943440923897, + "grad_norm": 1.8150442904058317, + "learning_rate": 1.977270293028357e-05, + "loss": 0.8809, + "step": 486 + }, + { + "epoch": 0.2884216760438259, + "grad_norm": 2.126220077493787, + "learning_rate": 1.977134505096204e-05, + "loss": 0.8752, + "step": 487 + }, + { + "epoch": 0.2890139176784128, + "grad_norm": 3.4772636452956593, + "learning_rate": 1.9769983174610918e-05, + "loss": 0.8583, + "step": 488 + }, + { + "epoch": 0.2896061593129997, + "grad_norm": 2.44750945614034, + "learning_rate": 1.9768617301787284e-05, + "loss": 0.9164, + "step": 489 + }, + { + "epoch": 0.2901984009475866, + "grad_norm": 2.0538731894956586, + "learning_rate": 1.9767247433049858e-05, + "loss": 0.8891, + "step": 490 + }, + { + "epoch": 0.2907906425821735, + "grad_norm": 2.315988950710287, + "learning_rate": 1.9765873568958996e-05, + "loss": 0.868, + "step": 491 + }, + { + "epoch": 0.29138288421676045, + "grad_norm": 2.6759070473500586, + "learning_rate": 1.9764495710076678e-05, + "loss": 0.8976, + "step": 492 + }, + { + "epoch": 0.29197512585134733, + "grad_norm": 2.8515729411143624, + "learning_rate": 1.9763113856966532e-05, + "loss": 0.8445, + "step": 493 + }, + { + "epoch": 0.29256736748593426, + "grad_norm": 5.706902531350699, + "learning_rate": 1.9761728010193812e-05, + "loss": 0.8302, + "step": 494 + }, + { + "epoch": 0.2931596091205212, + "grad_norm": 2.3398897793110813, + "learning_rate": 1.9760338170325405e-05, + "loss": 0.8355, + "step": 495 + }, + { + "epoch": 0.29375185075510807, + "grad_norm": 3.290716851896505, + "learning_rate": 1.975894433792984e-05, + "loss": 0.9306, + "step": 496 + }, + { + "epoch": 0.294344092389695, + "grad_norm": 2.140513045045442, + "learning_rate": 1.975754651357727e-05, + "loss": 0.8518, + "step": 497 + }, + { + "epoch": 0.29493633402428193, + "grad_norm": 2.1673411170485193, + "learning_rate": 1.9756144697839477e-05, + "loss": 0.8967, + "step": 498 + }, + { + "epoch": 0.2955285756588688, + "grad_norm": 3.750627708947347, + "learning_rate": 1.975473889128989e-05, + "loss": 0.8229, + "step": 499 + }, + { + "epoch": 0.29612081729345574, + "grad_norm": 2.3819533335097303, + "learning_rate": 1.9753329094503563e-05, + "loss": 0.8417, + "step": 500 + }, + { + "epoch": 0.2967130589280426, + "grad_norm": 2.1932086970836924, + "learning_rate": 1.9751915308057176e-05, + "loss": 0.8552, + "step": 501 + }, + { + "epoch": 0.29730530056262955, + "grad_norm": 3.717240525518049, + "learning_rate": 1.9750497532529053e-05, + "loss": 0.8384, + "step": 502 + }, + { + "epoch": 0.2978975421972165, + "grad_norm": 3.828126525639594, + "learning_rate": 1.9749075768499148e-05, + "loss": 0.8433, + "step": 503 + }, + { + "epoch": 0.29848978383180336, + "grad_norm": 2.6956236004584277, + "learning_rate": 1.974765001654903e-05, + "loss": 0.8441, + "step": 504 + }, + { + "epoch": 0.2990820254663903, + "grad_norm": 3.676685156782072, + "learning_rate": 1.974622027726192e-05, + "loss": 0.8943, + "step": 505 + }, + { + "epoch": 0.2996742671009772, + "grad_norm": 2.590931432467633, + "learning_rate": 1.9744786551222658e-05, + "loss": 0.8639, + "step": 506 + }, + { + "epoch": 0.3002665087355641, + "grad_norm": 3.9335673574606935, + "learning_rate": 1.9743348839017728e-05, + "loss": 0.8656, + "step": 507 + }, + { + "epoch": 0.30085875037015103, + "grad_norm": 2.2776114718789904, + "learning_rate": 1.974190714123522e-05, + "loss": 0.8451, + "step": 508 + }, + { + "epoch": 0.3014509920047379, + "grad_norm": 4.638920612322161, + "learning_rate": 1.974046145846488e-05, + "loss": 0.8153, + "step": 509 + }, + { + "epoch": 0.30204323363932484, + "grad_norm": 6.980939867299674, + "learning_rate": 1.9739011791298073e-05, + "loss": 0.8504, + "step": 510 + }, + { + "epoch": 0.30263547527391177, + "grad_norm": 4.408663359040869, + "learning_rate": 1.973755814032779e-05, + "loss": 0.8419, + "step": 511 + }, + { + "epoch": 0.30322771690849865, + "grad_norm": 3.0168512612679113, + "learning_rate": 1.9736100506148657e-05, + "loss": 0.8906, + "step": 512 + }, + { + "epoch": 0.3038199585430856, + "grad_norm": 3.541981364911518, + "learning_rate": 1.973463888935693e-05, + "loss": 0.8498, + "step": 513 + }, + { + "epoch": 0.3044122001776725, + "grad_norm": 2.628539633705203, + "learning_rate": 1.9733173290550494e-05, + "loss": 0.8722, + "step": 514 + }, + { + "epoch": 0.3050044418122594, + "grad_norm": 8.965610022562394, + "learning_rate": 1.973170371032886e-05, + "loss": 0.888, + "step": 515 + }, + { + "epoch": 0.3055966834468463, + "grad_norm": 3.339091960381498, + "learning_rate": 1.9730230149293167e-05, + "loss": 0.8096, + "step": 516 + }, + { + "epoch": 0.30618892508143325, + "grad_norm": 2.91895416655151, + "learning_rate": 1.9728752608046184e-05, + "loss": 0.8708, + "step": 517 + }, + { + "epoch": 0.3067811667160201, + "grad_norm": 2.8857357994218216, + "learning_rate": 1.9727271087192312e-05, + "loss": 0.8614, + "step": 518 + }, + { + "epoch": 0.30737340835060706, + "grad_norm": 3.8897082719903815, + "learning_rate": 1.9725785587337574e-05, + "loss": 0.8364, + "step": 519 + }, + { + "epoch": 0.30796564998519393, + "grad_norm": 3.0809821310404657, + "learning_rate": 1.9724296109089623e-05, + "loss": 0.8634, + "step": 520 + }, + { + "epoch": 0.30855789161978087, + "grad_norm": 4.31589497235874, + "learning_rate": 1.972280265305774e-05, + "loss": 0.8938, + "step": 521 + }, + { + "epoch": 0.3091501332543678, + "grad_norm": 3.4659891853357174, + "learning_rate": 1.9721305219852833e-05, + "loss": 0.8612, + "step": 522 + }, + { + "epoch": 0.3097423748889547, + "grad_norm": 3.1684707972088106, + "learning_rate": 1.9719803810087436e-05, + "loss": 0.8838, + "step": 523 + }, + { + "epoch": 0.3103346165235416, + "grad_norm": 5.701615286531985, + "learning_rate": 1.971829842437571e-05, + "loss": 0.8652, + "step": 524 + }, + { + "epoch": 0.31092685815812854, + "grad_norm": 3.4465521972170987, + "learning_rate": 1.971678906333344e-05, + "loss": 0.8845, + "step": 525 + }, + { + "epoch": 0.3115190997927154, + "grad_norm": 1.6217244261703954, + "learning_rate": 1.971527572757804e-05, + "loss": 0.8227, + "step": 526 + }, + { + "epoch": 0.31211134142730235, + "grad_norm": 2.3507099292364897, + "learning_rate": 1.9713758417728555e-05, + "loss": 0.863, + "step": 527 + }, + { + "epoch": 0.3127035830618892, + "grad_norm": 2.281502522601501, + "learning_rate": 1.971223713440564e-05, + "loss": 0.8567, + "step": 528 + }, + { + "epoch": 0.31329582469647616, + "grad_norm": 3.1146189997548976, + "learning_rate": 1.97107118782316e-05, + "loss": 0.8631, + "step": 529 + }, + { + "epoch": 0.3138880663310631, + "grad_norm": 1.9410780864006059, + "learning_rate": 1.970918264983034e-05, + "loss": 0.8465, + "step": 530 + }, + { + "epoch": 0.31448030796564996, + "grad_norm": 3.224337429475279, + "learning_rate": 1.97076494498274e-05, + "loss": 0.846, + "step": 531 + }, + { + "epoch": 0.3150725496002369, + "grad_norm": 6.086192190115328, + "learning_rate": 1.970611227884995e-05, + "loss": 0.8496, + "step": 532 + }, + { + "epoch": 0.3156647912348238, + "grad_norm": 2.644702902924804, + "learning_rate": 1.9704571137526775e-05, + "loss": 0.7961, + "step": 533 + }, + { + "epoch": 0.3162570328694107, + "grad_norm": 5.673305681652432, + "learning_rate": 1.9703026026488288e-05, + "loss": 0.8546, + "step": 534 + }, + { + "epoch": 0.31684927450399764, + "grad_norm": 2.5921802921198767, + "learning_rate": 1.9701476946366533e-05, + "loss": 0.8838, + "step": 535 + }, + { + "epoch": 0.31744151613858457, + "grad_norm": 2.008362256981687, + "learning_rate": 1.9699923897795165e-05, + "loss": 0.9067, + "step": 536 + }, + { + "epoch": 0.31803375777317144, + "grad_norm": 3.335473379056157, + "learning_rate": 1.969836688140947e-05, + "loss": 0.8315, + "step": 537 + }, + { + "epoch": 0.3186259994077584, + "grad_norm": 1.6713798386358392, + "learning_rate": 1.9696805897846353e-05, + "loss": 0.8996, + "step": 538 + }, + { + "epoch": 0.31921824104234525, + "grad_norm": 1.6247895343380683, + "learning_rate": 1.9695240947744345e-05, + "loss": 0.9091, + "step": 539 + }, + { + "epoch": 0.3198104826769322, + "grad_norm": 2.0514048927331685, + "learning_rate": 1.9693672031743604e-05, + "loss": 0.8807, + "step": 540 + }, + { + "epoch": 0.3204027243115191, + "grad_norm": 3.2346881141878923, + "learning_rate": 1.9692099150485897e-05, + "loss": 0.8744, + "step": 541 + }, + { + "epoch": 0.320994965946106, + "grad_norm": 1.993896753955855, + "learning_rate": 1.9690522304614624e-05, + "loss": 0.8935, + "step": 542 + }, + { + "epoch": 0.3215872075806929, + "grad_norm": 1.6718977284924557, + "learning_rate": 1.9688941494774807e-05, + "loss": 0.871, + "step": 543 + }, + { + "epoch": 0.32217944921527986, + "grad_norm": 1.635671027032527, + "learning_rate": 1.9687356721613084e-05, + "loss": 0.8915, + "step": 544 + }, + { + "epoch": 0.32277169084986673, + "grad_norm": 1.873327797284262, + "learning_rate": 1.968576798577771e-05, + "loss": 0.8994, + "step": 545 + }, + { + "epoch": 0.32336393248445366, + "grad_norm": 1.2751536161018124, + "learning_rate": 1.9684175287918576e-05, + "loss": 0.8471, + "step": 546 + }, + { + "epoch": 0.32395617411904054, + "grad_norm": 2.7974266868919426, + "learning_rate": 1.9682578628687183e-05, + "loss": 0.8911, + "step": 547 + }, + { + "epoch": 0.3245484157536275, + "grad_norm": 1.6428003910383486, + "learning_rate": 1.968097800873665e-05, + "loss": 0.877, + "step": 548 + }, + { + "epoch": 0.3251406573882144, + "grad_norm": 5.184321377796686, + "learning_rate": 1.9679373428721728e-05, + "loss": 0.8149, + "step": 549 + }, + { + "epoch": 0.3257328990228013, + "grad_norm": 3.034456419245674, + "learning_rate": 1.9677764889298775e-05, + "loss": 0.9059, + "step": 550 + }, + { + "epoch": 0.3263251406573882, + "grad_norm": 1.824534137714233, + "learning_rate": 1.9676152391125773e-05, + "loss": 0.8729, + "step": 551 + }, + { + "epoch": 0.32691738229197514, + "grad_norm": 2.2247918235652526, + "learning_rate": 1.9674535934862327e-05, + "loss": 0.899, + "step": 552 + }, + { + "epoch": 0.327509623926562, + "grad_norm": 2.0674047220831655, + "learning_rate": 1.967291552116966e-05, + "loss": 0.8418, + "step": 553 + }, + { + "epoch": 0.32810186556114895, + "grad_norm": 1.8961950219733992, + "learning_rate": 1.967129115071061e-05, + "loss": 0.8938, + "step": 554 + }, + { + "epoch": 0.3286941071957359, + "grad_norm": 1.5716138089131724, + "learning_rate": 1.9669662824149632e-05, + "loss": 0.8675, + "step": 555 + }, + { + "epoch": 0.32928634883032276, + "grad_norm": 2.505642914698843, + "learning_rate": 1.966803054215281e-05, + "loss": 0.8575, + "step": 556 + }, + { + "epoch": 0.3298785904649097, + "grad_norm": 3.1345316813906745, + "learning_rate": 1.966639430538784e-05, + "loss": 0.8786, + "step": 557 + }, + { + "epoch": 0.33047083209949657, + "grad_norm": 2.56591314502249, + "learning_rate": 1.966475411452403e-05, + "loss": 0.8637, + "step": 558 + }, + { + "epoch": 0.3310630737340835, + "grad_norm": 2.7664296698481032, + "learning_rate": 1.966310997023231e-05, + "loss": 0.9181, + "step": 559 + }, + { + "epoch": 0.33165531536867043, + "grad_norm": 1.5231814543200226, + "learning_rate": 1.966146187318523e-05, + "loss": 0.8748, + "step": 560 + }, + { + "epoch": 0.3322475570032573, + "grad_norm": 4.815465527837311, + "learning_rate": 1.9659809824056954e-05, + "loss": 0.8408, + "step": 561 + }, + { + "epoch": 0.33283979863784424, + "grad_norm": 1.6014447979733908, + "learning_rate": 1.9658153823523262e-05, + "loss": 0.8672, + "step": 562 + }, + { + "epoch": 0.3334320402724312, + "grad_norm": 2.2744955163709006, + "learning_rate": 1.9656493872261554e-05, + "loss": 0.9016, + "step": 563 + }, + { + "epoch": 0.33402428190701805, + "grad_norm": 1.781058566595245, + "learning_rate": 1.9654829970950838e-05, + "loss": 0.8764, + "step": 564 + }, + { + "epoch": 0.334616523541605, + "grad_norm": 1.7582008304848717, + "learning_rate": 1.9653162120271746e-05, + "loss": 0.877, + "step": 565 + }, + { + "epoch": 0.3352087651761919, + "grad_norm": 1.9319354513096203, + "learning_rate": 1.965149032090653e-05, + "loss": 0.8599, + "step": 566 + }, + { + "epoch": 0.3358010068107788, + "grad_norm": 1.6123157537046895, + "learning_rate": 1.9649814573539037e-05, + "loss": 0.8374, + "step": 567 + }, + { + "epoch": 0.3363932484453657, + "grad_norm": 2.605187020230856, + "learning_rate": 1.9648134878854747e-05, + "loss": 0.8699, + "step": 568 + }, + { + "epoch": 0.3369854900799526, + "grad_norm": 2.4840368350540016, + "learning_rate": 1.9646451237540756e-05, + "loss": 0.8727, + "step": 569 + }, + { + "epoch": 0.33757773171453953, + "grad_norm": 3.6970475679434465, + "learning_rate": 1.9644763650285758e-05, + "loss": 0.8354, + "step": 570 + }, + { + "epoch": 0.33816997334912646, + "grad_norm": 1.804745820267029, + "learning_rate": 1.964307211778008e-05, + "loss": 0.8231, + "step": 571 + }, + { + "epoch": 0.33876221498371334, + "grad_norm": 1.7814414820272138, + "learning_rate": 1.9641376640715646e-05, + "loss": 0.8829, + "step": 572 + }, + { + "epoch": 0.33935445661830027, + "grad_norm": 2.246104894860363, + "learning_rate": 1.963967721978601e-05, + "loss": 0.8279, + "step": 573 + }, + { + "epoch": 0.3399466982528872, + "grad_norm": 1.5276776993277508, + "learning_rate": 1.963797385568632e-05, + "loss": 0.8421, + "step": 574 + }, + { + "epoch": 0.3405389398874741, + "grad_norm": 3.984819253438696, + "learning_rate": 1.9636266549113358e-05, + "loss": 0.8715, + "step": 575 + }, + { + "epoch": 0.341131181522061, + "grad_norm": 1.6707727313470635, + "learning_rate": 1.96345553007655e-05, + "loss": 0.868, + "step": 576 + }, + { + "epoch": 0.3417234231566479, + "grad_norm": 1.8434045564594856, + "learning_rate": 1.9632840111342747e-05, + "loss": 0.8946, + "step": 577 + }, + { + "epoch": 0.3423156647912348, + "grad_norm": 2.5166196457952386, + "learning_rate": 1.9631120981546713e-05, + "loss": 0.8169, + "step": 578 + }, + { + "epoch": 0.34290790642582175, + "grad_norm": 1.7256723725893435, + "learning_rate": 1.962939791208061e-05, + "loss": 0.8983, + "step": 579 + }, + { + "epoch": 0.3435001480604086, + "grad_norm": 1.537497453982475, + "learning_rate": 1.9627670903649273e-05, + "loss": 0.8535, + "step": 580 + }, + { + "epoch": 0.34409238969499556, + "grad_norm": 2.0982710201843346, + "learning_rate": 1.9625939956959146e-05, + "loss": 0.8352, + "step": 581 + }, + { + "epoch": 0.3446846313295825, + "grad_norm": 1.6615651140303191, + "learning_rate": 1.9624205072718285e-05, + "loss": 0.8424, + "step": 582 + }, + { + "epoch": 0.34527687296416937, + "grad_norm": 2.2732967556610144, + "learning_rate": 1.9622466251636352e-05, + "loss": 0.8548, + "step": 583 + }, + { + "epoch": 0.3458691145987563, + "grad_norm": 1.9163123664346449, + "learning_rate": 1.9620723494424627e-05, + "loss": 0.8903, + "step": 584 + }, + { + "epoch": 0.34646135623334323, + "grad_norm": 1.8941032464371603, + "learning_rate": 1.961897680179599e-05, + "loss": 0.856, + "step": 585 + }, + { + "epoch": 0.3470535978679301, + "grad_norm": 1.851962175877676, + "learning_rate": 1.9617226174464945e-05, + "loss": 0.8798, + "step": 586 + }, + { + "epoch": 0.34764583950251704, + "grad_norm": 1.4058738019152999, + "learning_rate": 1.961547161314759e-05, + "loss": 0.862, + "step": 587 + }, + { + "epoch": 0.3482380811371039, + "grad_norm": 1.9819045004335991, + "learning_rate": 1.9613713118561638e-05, + "loss": 0.8779, + "step": 588 + }, + { + "epoch": 0.34883032277169085, + "grad_norm": 1.6060471271059031, + "learning_rate": 1.961195069142642e-05, + "loss": 0.843, + "step": 589 + }, + { + "epoch": 0.3494225644062778, + "grad_norm": 1.5640055985461452, + "learning_rate": 1.961018433246286e-05, + "loss": 0.8379, + "step": 590 + }, + { + "epoch": 0.35001480604086466, + "grad_norm": 3.0209204466667905, + "learning_rate": 1.9608414042393503e-05, + "loss": 0.825, + "step": 591 + }, + { + "epoch": 0.3506070476754516, + "grad_norm": 1.6509809673195002, + "learning_rate": 1.9606639821942496e-05, + "loss": 0.858, + "step": 592 + }, + { + "epoch": 0.3511992893100385, + "grad_norm": 1.8338700183391534, + "learning_rate": 1.9604861671835593e-05, + "loss": 0.9081, + "step": 593 + }, + { + "epoch": 0.3517915309446254, + "grad_norm": 1.6832166078423174, + "learning_rate": 1.9603079592800157e-05, + "loss": 0.8525, + "step": 594 + }, + { + "epoch": 0.35238377257921233, + "grad_norm": 3.6998428943163715, + "learning_rate": 1.960129358556516e-05, + "loss": 0.9052, + "step": 595 + }, + { + "epoch": 0.3529760142137992, + "grad_norm": 1.8174759599966348, + "learning_rate": 1.9599503650861183e-05, + "loss": 0.8153, + "step": 596 + }, + { + "epoch": 0.35356825584838614, + "grad_norm": 3.5284496969394277, + "learning_rate": 1.9597709789420404e-05, + "loss": 0.8488, + "step": 597 + }, + { + "epoch": 0.35416049748297307, + "grad_norm": 8.653779192107086, + "learning_rate": 1.959591200197662e-05, + "loss": 0.8625, + "step": 598 + }, + { + "epoch": 0.35475273911755995, + "grad_norm": 1.8759995754403318, + "learning_rate": 1.9594110289265218e-05, + "loss": 0.8618, + "step": 599 + }, + { + "epoch": 0.3553449807521469, + "grad_norm": 1.9485225987035864, + "learning_rate": 1.9592304652023208e-05, + "loss": 0.8226, + "step": 600 + }, + { + "epoch": 0.3559372223867338, + "grad_norm": 3.686184834700953, + "learning_rate": 1.959049509098919e-05, + "loss": 0.8781, + "step": 601 + }, + { + "epoch": 0.3565294640213207, + "grad_norm": 2.8230171313225783, + "learning_rate": 1.9588681606903385e-05, + "loss": 0.8445, + "step": 602 + }, + { + "epoch": 0.3571217056559076, + "grad_norm": 2.7255745530386193, + "learning_rate": 1.95868642005076e-05, + "loss": 0.8965, + "step": 603 + }, + { + "epoch": 0.35771394729049455, + "grad_norm": 3.1116205729926385, + "learning_rate": 1.9585042872545266e-05, + "loss": 0.8434, + "step": 604 + }, + { + "epoch": 0.3583061889250814, + "grad_norm": 3.8126207167526154, + "learning_rate": 1.9583217623761404e-05, + "loss": 0.8651, + "step": 605 + }, + { + "epoch": 0.35889843055966836, + "grad_norm": 1.658269096943048, + "learning_rate": 1.958138845490264e-05, + "loss": 0.8434, + "step": 606 + }, + { + "epoch": 0.35949067219425523, + "grad_norm": 2.040025083348231, + "learning_rate": 1.9579555366717214e-05, + "loss": 0.8795, + "step": 607 + }, + { + "epoch": 0.36008291382884217, + "grad_norm": 4.804650864705654, + "learning_rate": 1.9577718359954955e-05, + "loss": 0.8943, + "step": 608 + }, + { + "epoch": 0.3606751554634291, + "grad_norm": 1.7709023659733834, + "learning_rate": 1.957587743536731e-05, + "loss": 0.8878, + "step": 609 + }, + { + "epoch": 0.361267397098016, + "grad_norm": 2.25150632588813, + "learning_rate": 1.9574032593707314e-05, + "loss": 0.8498, + "step": 610 + }, + { + "epoch": 0.3618596387326029, + "grad_norm": 7.9605607465838695, + "learning_rate": 1.9572183835729613e-05, + "loss": 0.9201, + "step": 611 + }, + { + "epoch": 0.36245188036718984, + "grad_norm": 2.5393697091836627, + "learning_rate": 1.957033116219045e-05, + "loss": 0.8733, + "step": 612 + }, + { + "epoch": 0.3630441220017767, + "grad_norm": 1.6878302312671156, + "learning_rate": 1.956847457384768e-05, + "loss": 0.8936, + "step": 613 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.868290304790888, + "learning_rate": 1.956661407146075e-05, + "loss": 0.8469, + "step": 614 + }, + { + "epoch": 0.3642286052709505, + "grad_norm": 1.7524149989576479, + "learning_rate": 1.9564749655790706e-05, + "loss": 0.8834, + "step": 615 + }, + { + "epoch": 0.36482084690553745, + "grad_norm": 3.268017562205005, + "learning_rate": 1.9562881327600197e-05, + "loss": 0.8549, + "step": 616 + }, + { + "epoch": 0.3654130885401244, + "grad_norm": 1.976076053518677, + "learning_rate": 1.9561009087653483e-05, + "loss": 0.8568, + "step": 617 + }, + { + "epoch": 0.36600533017471126, + "grad_norm": 2.637521424974901, + "learning_rate": 1.955913293671641e-05, + "loss": 0.8918, + "step": 618 + }, + { + "epoch": 0.3665975718092982, + "grad_norm": 3.314444680794765, + "learning_rate": 1.9557252875556428e-05, + "loss": 0.8414, + "step": 619 + }, + { + "epoch": 0.3671898134438851, + "grad_norm": 2.3746791597113766, + "learning_rate": 1.9555368904942593e-05, + "loss": 0.8457, + "step": 620 + }, + { + "epoch": 0.367782055078472, + "grad_norm": 2.6011871499667523, + "learning_rate": 1.9553481025645545e-05, + "loss": 0.8873, + "step": 621 + }, + { + "epoch": 0.36837429671305894, + "grad_norm": 6.819452995677599, + "learning_rate": 1.9551589238437546e-05, + "loss": 0.8907, + "step": 622 + }, + { + "epoch": 0.36896653834764587, + "grad_norm": 2.048515035382417, + "learning_rate": 1.954969354409243e-05, + "loss": 0.8392, + "step": 623 + }, + { + "epoch": 0.36955877998223274, + "grad_norm": 2.3431036995490855, + "learning_rate": 1.954779394338566e-05, + "loss": 0.8916, + "step": 624 + }, + { + "epoch": 0.3701510216168197, + "grad_norm": 3.0084271040530073, + "learning_rate": 1.954589043709426e-05, + "loss": 0.904, + "step": 625 + }, + { + "epoch": 0.37074326325140655, + "grad_norm": 1.5517306350499644, + "learning_rate": 1.954398302599688e-05, + "loss": 0.8716, + "step": 626 + }, + { + "epoch": 0.3713355048859935, + "grad_norm": 4.467132411617738, + "learning_rate": 1.954207171087376e-05, + "loss": 0.8593, + "step": 627 + }, + { + "epoch": 0.3719277465205804, + "grad_norm": 2.292921810220871, + "learning_rate": 1.9540156492506734e-05, + "loss": 0.8766, + "step": 628 + }, + { + "epoch": 0.3725199881551673, + "grad_norm": 1.9559474188452068, + "learning_rate": 1.9538237371679233e-05, + "loss": 0.8705, + "step": 629 + }, + { + "epoch": 0.3731122297897542, + "grad_norm": 1.6930579045607708, + "learning_rate": 1.9536314349176288e-05, + "loss": 0.8455, + "step": 630 + }, + { + "epoch": 0.37370447142434116, + "grad_norm": 5.371918547587357, + "learning_rate": 1.9534387425784518e-05, + "loss": 0.8998, + "step": 631 + }, + { + "epoch": 0.37429671305892803, + "grad_norm": 1.7900417336112107, + "learning_rate": 1.9532456602292148e-05, + "loss": 0.8481, + "step": 632 + }, + { + "epoch": 0.37488895469351496, + "grad_norm": 2.2958736067272416, + "learning_rate": 1.9530521879488993e-05, + "loss": 0.9105, + "step": 633 + }, + { + "epoch": 0.37548119632810184, + "grad_norm": 2.638086764469702, + "learning_rate": 1.952858325816646e-05, + "loss": 0.8381, + "step": 634 + }, + { + "epoch": 0.3760734379626888, + "grad_norm": 2.439133220470199, + "learning_rate": 1.9526640739117555e-05, + "loss": 0.8751, + "step": 635 + }, + { + "epoch": 0.3766656795972757, + "grad_norm": 1.8675503716373216, + "learning_rate": 1.9524694323136883e-05, + "loss": 0.883, + "step": 636 + }, + { + "epoch": 0.3772579212318626, + "grad_norm": 4.742878698850084, + "learning_rate": 1.952274401102063e-05, + "loss": 0.888, + "step": 637 + }, + { + "epoch": 0.3778501628664495, + "grad_norm": 2.420961821182289, + "learning_rate": 1.952078980356659e-05, + "loss": 0.8812, + "step": 638 + }, + { + "epoch": 0.37844240450103644, + "grad_norm": 2.237864761118068, + "learning_rate": 1.9518831701574136e-05, + "loss": 0.8906, + "step": 639 + }, + { + "epoch": 0.3790346461356233, + "grad_norm": 3.5814294479841915, + "learning_rate": 1.951686970584425e-05, + "loss": 0.8762, + "step": 640 + }, + { + "epoch": 0.37962688777021025, + "grad_norm": 3.322882097775164, + "learning_rate": 1.951490381717949e-05, + "loss": 0.8855, + "step": 641 + }, + { + "epoch": 0.3802191294047972, + "grad_norm": 2.2781530112916104, + "learning_rate": 1.9512934036384026e-05, + "loss": 0.8861, + "step": 642 + }, + { + "epoch": 0.38081137103938406, + "grad_norm": 3.1293049656485166, + "learning_rate": 1.95109603642636e-05, + "loss": 0.8684, + "step": 643 + }, + { + "epoch": 0.381403612673971, + "grad_norm": 2.2366175666301626, + "learning_rate": 1.9508982801625557e-05, + "loss": 0.8209, + "step": 644 + }, + { + "epoch": 0.38199585430855787, + "grad_norm": 2.0870198727029776, + "learning_rate": 1.9507001349278834e-05, + "loss": 0.8726, + "step": 645 + }, + { + "epoch": 0.3825880959431448, + "grad_norm": 5.789578141166538, + "learning_rate": 1.9505016008033953e-05, + "loss": 0.8065, + "step": 646 + }, + { + "epoch": 0.38318033757773173, + "grad_norm": 3.0810399663606183, + "learning_rate": 1.9503026778703034e-05, + "loss": 0.8686, + "step": 647 + }, + { + "epoch": 0.3837725792123186, + "grad_norm": 3.6634728348627528, + "learning_rate": 1.950103366209978e-05, + "loss": 0.8719, + "step": 648 + }, + { + "epoch": 0.38436482084690554, + "grad_norm": 2.8736691268816874, + "learning_rate": 1.949903665903949e-05, + "loss": 0.8767, + "step": 649 + }, + { + "epoch": 0.3849570624814925, + "grad_norm": 5.882320046408276, + "learning_rate": 1.949703577033905e-05, + "loss": 0.8671, + "step": 650 + }, + { + "epoch": 0.38554930411607935, + "grad_norm": 2.9528476437318925, + "learning_rate": 1.9495030996816932e-05, + "loss": 0.9013, + "step": 651 + }, + { + "epoch": 0.3861415457506663, + "grad_norm": 2.6363739674663247, + "learning_rate": 1.9493022339293207e-05, + "loss": 0.8833, + "step": 652 + }, + { + "epoch": 0.38673378738525316, + "grad_norm": 2.17785246783246, + "learning_rate": 1.949100979858953e-05, + "loss": 0.8866, + "step": 653 + }, + { + "epoch": 0.3873260290198401, + "grad_norm": 5.128470877775802, + "learning_rate": 1.9488993375529137e-05, + "loss": 0.9099, + "step": 654 + }, + { + "epoch": 0.387918270654427, + "grad_norm": 2.804392512181631, + "learning_rate": 1.9486973070936862e-05, + "loss": 0.8498, + "step": 655 + }, + { + "epoch": 0.3885105122890139, + "grad_norm": 2.9434325093740905, + "learning_rate": 1.9484948885639122e-05, + "loss": 0.8449, + "step": 656 + }, + { + "epoch": 0.38910275392360083, + "grad_norm": 4.154023552286942, + "learning_rate": 1.9482920820463923e-05, + "loss": 0.8563, + "step": 657 + }, + { + "epoch": 0.38969499555818776, + "grad_norm": 3.148511348448834, + "learning_rate": 1.948088887624086e-05, + "loss": 0.8056, + "step": 658 + }, + { + "epoch": 0.39028723719277464, + "grad_norm": 3.926327591148058, + "learning_rate": 1.947885305380111e-05, + "loss": 0.8888, + "step": 659 + }, + { + "epoch": 0.39087947882736157, + "grad_norm": 5.980099519975163, + "learning_rate": 1.9476813353977442e-05, + "loss": 0.8454, + "step": 660 + }, + { + "epoch": 0.3914717204619485, + "grad_norm": 3.3367209292691, + "learning_rate": 1.9474769777604198e-05, + "loss": 0.8548, + "step": 661 + }, + { + "epoch": 0.3920639620965354, + "grad_norm": 2.968203146893538, + "learning_rate": 1.947272232551733e-05, + "loss": 0.84, + "step": 662 + }, + { + "epoch": 0.3926562037311223, + "grad_norm": 39.05789203370318, + "learning_rate": 1.9470670998554352e-05, + "loss": 0.8914, + "step": 663 + }, + { + "epoch": 0.3932484453657092, + "grad_norm": 3.4148463229844506, + "learning_rate": 1.9468615797554374e-05, + "loss": 0.8605, + "step": 664 + }, + { + "epoch": 0.3938406870002961, + "grad_norm": 6.8180037854052316, + "learning_rate": 1.9466556723358084e-05, + "loss": 0.9065, + "step": 665 + }, + { + "epoch": 0.39443292863488305, + "grad_norm": 2.1152979135979484, + "learning_rate": 1.946449377680777e-05, + "loss": 0.8731, + "step": 666 + }, + { + "epoch": 0.3950251702694699, + "grad_norm": 2.4870435911253033, + "learning_rate": 1.9462426958747285e-05, + "loss": 0.8652, + "step": 667 + }, + { + "epoch": 0.39561741190405686, + "grad_norm": 2.6453498019333277, + "learning_rate": 1.9460356270022073e-05, + "loss": 0.898, + "step": 668 + }, + { + "epoch": 0.3962096535386438, + "grad_norm": 3.0508069490190066, + "learning_rate": 1.945828171147917e-05, + "loss": 0.9055, + "step": 669 + }, + { + "epoch": 0.39680189517323067, + "grad_norm": 1.6241048949900765, + "learning_rate": 1.945620328396718e-05, + "loss": 0.9111, + "step": 670 + }, + { + "epoch": 0.3973941368078176, + "grad_norm": 1.2381656243162942, + "learning_rate": 1.9454120988336297e-05, + "loss": 0.9074, + "step": 671 + }, + { + "epoch": 0.3979863784424045, + "grad_norm": 1.8466695435602942, + "learning_rate": 1.9452034825438302e-05, + "loss": 0.9597, + "step": 672 + }, + { + "epoch": 0.3985786200769914, + "grad_norm": 1.2562124363063687, + "learning_rate": 1.9449944796126547e-05, + "loss": 0.9371, + "step": 673 + }, + { + "epoch": 0.39917086171157834, + "grad_norm": 1.7231260645325064, + "learning_rate": 1.9447850901255975e-05, + "loss": 0.8556, + "step": 674 + }, + { + "epoch": 0.3997631033461652, + "grad_norm": 1.6192363719486302, + "learning_rate": 1.9445753141683107e-05, + "loss": 0.8885, + "step": 675 + }, + { + "epoch": 0.40035534498075215, + "grad_norm": 1.675728205658867, + "learning_rate": 1.9443651518266044e-05, + "loss": 0.8692, + "step": 676 + }, + { + "epoch": 0.4009475866153391, + "grad_norm": 2.04303793153201, + "learning_rate": 1.9441546031864467e-05, + "loss": 0.8552, + "step": 677 + }, + { + "epoch": 0.40153982824992596, + "grad_norm": 1.9133034698626181, + "learning_rate": 1.943943668333964e-05, + "loss": 0.882, + "step": 678 + }, + { + "epoch": 0.4021320698845129, + "grad_norm": 2.1370541384543342, + "learning_rate": 1.9437323473554404e-05, + "loss": 0.8968, + "step": 679 + }, + { + "epoch": 0.4027243115190998, + "grad_norm": 1.5028317313676052, + "learning_rate": 1.943520640337318e-05, + "loss": 0.8582, + "step": 680 + }, + { + "epoch": 0.4033165531536867, + "grad_norm": 3.9594823865775868, + "learning_rate": 1.943308547366197e-05, + "loss": 0.8111, + "step": 681 + }, + { + "epoch": 0.40390879478827363, + "grad_norm": 3.417942000806396, + "learning_rate": 1.9430960685288355e-05, + "loss": 0.8385, + "step": 682 + }, + { + "epoch": 0.4045010364228605, + "grad_norm": 1.6652473512379486, + "learning_rate": 1.9428832039121487e-05, + "loss": 0.8686, + "step": 683 + }, + { + "epoch": 0.40509327805744744, + "grad_norm": 3.8546279443523663, + "learning_rate": 1.942669953603211e-05, + "loss": 0.8697, + "step": 684 + }, + { + "epoch": 0.40568551969203437, + "grad_norm": 1.9412501113108496, + "learning_rate": 1.9424563176892534e-05, + "loss": 0.8235, + "step": 685 + }, + { + "epoch": 0.40627776132662125, + "grad_norm": 1.8569982182714377, + "learning_rate": 1.9422422962576646e-05, + "loss": 0.9344, + "step": 686 + }, + { + "epoch": 0.4068700029612082, + "grad_norm": 2.1750669740257824, + "learning_rate": 1.9420278893959922e-05, + "loss": 0.868, + "step": 687 + }, + { + "epoch": 0.4074622445957951, + "grad_norm": 2.4649163202529607, + "learning_rate": 1.94181309719194e-05, + "loss": 0.8585, + "step": 688 + }, + { + "epoch": 0.408054486230382, + "grad_norm": 2.5516191944482443, + "learning_rate": 1.9415979197333704e-05, + "loss": 0.866, + "step": 689 + }, + { + "epoch": 0.4086467278649689, + "grad_norm": 2.140588212831023, + "learning_rate": 1.941382357108303e-05, + "loss": 0.8736, + "step": 690 + }, + { + "epoch": 0.4092389694995558, + "grad_norm": 2.6369171422119417, + "learning_rate": 1.941166409404915e-05, + "loss": 0.8362, + "step": 691 + }, + { + "epoch": 0.4098312111341427, + "grad_norm": 3.0607023188332074, + "learning_rate": 1.9409500767115414e-05, + "loss": 0.8399, + "step": 692 + }, + { + "epoch": 0.41042345276872966, + "grad_norm": 3.0627901913227924, + "learning_rate": 1.940733359116674e-05, + "loss": 0.879, + "step": 693 + }, + { + "epoch": 0.41101569440331653, + "grad_norm": 3.2361644739692292, + "learning_rate": 1.9405162567089627e-05, + "loss": 0.8678, + "step": 694 + }, + { + "epoch": 0.41160793603790347, + "grad_norm": 18.463939120851368, + "learning_rate": 1.940298769577215e-05, + "loss": 0.9256, + "step": 695 + }, + { + "epoch": 0.4122001776724904, + "grad_norm": 10.368145595990683, + "learning_rate": 1.9400808978103948e-05, + "loss": 0.8188, + "step": 696 + }, + { + "epoch": 0.4127924193070773, + "grad_norm": 2.4187494879479745, + "learning_rate": 1.939862641497624e-05, + "loss": 0.8745, + "step": 697 + }, + { + "epoch": 0.4133846609416642, + "grad_norm": 1.806561112554444, + "learning_rate": 1.939644000728182e-05, + "loss": 0.8622, + "step": 698 + }, + { + "epoch": 0.41397690257625114, + "grad_norm": 2.376096051071182, + "learning_rate": 1.9394249755915047e-05, + "loss": 0.8613, + "step": 699 + }, + { + "epoch": 0.414569144210838, + "grad_norm": 2.0606522501522724, + "learning_rate": 1.939205566177186e-05, + "loss": 0.8889, + "step": 700 + }, + { + "epoch": 0.41516138584542495, + "grad_norm": 1.6424519963083875, + "learning_rate": 1.9389857725749767e-05, + "loss": 0.924, + "step": 701 + }, + { + "epoch": 0.4157536274800118, + "grad_norm": 2.5660132236892608, + "learning_rate": 1.938765594874785e-05, + "loss": 0.8472, + "step": 702 + }, + { + "epoch": 0.41634586911459875, + "grad_norm": 2.8325159916694598, + "learning_rate": 1.9385450331666754e-05, + "loss": 0.8887, + "step": 703 + }, + { + "epoch": 0.4169381107491857, + "grad_norm": 1.7008840901930986, + "learning_rate": 1.93832408754087e-05, + "loss": 0.8748, + "step": 704 + }, + { + "epoch": 0.41753035238377256, + "grad_norm": 1.9216283469656232, + "learning_rate": 1.9381027580877486e-05, + "loss": 0.8415, + "step": 705 + }, + { + "epoch": 0.4181225940183595, + "grad_norm": 2.026996390561362, + "learning_rate": 1.937881044897847e-05, + "loss": 0.8416, + "step": 706 + }, + { + "epoch": 0.4187148356529464, + "grad_norm": 1.423893980632455, + "learning_rate": 1.9376589480618583e-05, + "loss": 0.8505, + "step": 707 + }, + { + "epoch": 0.4193070772875333, + "grad_norm": 1.8333091580377612, + "learning_rate": 1.937436467670633e-05, + "loss": 0.8715, + "step": 708 + }, + { + "epoch": 0.41989931892212023, + "grad_norm": 2.2298036589587658, + "learning_rate": 1.937213603815178e-05, + "loss": 0.8767, + "step": 709 + }, + { + "epoch": 0.4204915605567071, + "grad_norm": 3.440326466071599, + "learning_rate": 1.9369903565866565e-05, + "loss": 0.8322, + "step": 710 + }, + { + "epoch": 0.42108380219129404, + "grad_norm": 1.6529181371554102, + "learning_rate": 1.9367667260763898e-05, + "loss": 0.899, + "step": 711 + }, + { + "epoch": 0.421676043825881, + "grad_norm": 1.811748367956541, + "learning_rate": 1.936542712375855e-05, + "loss": 0.805, + "step": 712 + }, + { + "epoch": 0.42226828546046785, + "grad_norm": 1.4423225369050077, + "learning_rate": 1.9363183155766867e-05, + "loss": 0.9193, + "step": 713 + }, + { + "epoch": 0.4228605270950548, + "grad_norm": 3.711540769404225, + "learning_rate": 1.9360935357706756e-05, + "loss": 0.8143, + "step": 714 + }, + { + "epoch": 0.4234527687296417, + "grad_norm": 1.6013652544896453, + "learning_rate": 1.9358683730497695e-05, + "loss": 0.8883, + "step": 715 + }, + { + "epoch": 0.4240450103642286, + "grad_norm": 4.765502585221681, + "learning_rate": 1.9356428275060722e-05, + "loss": 0.845, + "step": 716 + }, + { + "epoch": 0.4246372519988155, + "grad_norm": 3.1849530858655983, + "learning_rate": 1.9354168992318448e-05, + "loss": 0.9136, + "step": 717 + }, + { + "epoch": 0.42522949363340246, + "grad_norm": 1.62933697491851, + "learning_rate": 1.9351905883195044e-05, + "loss": 0.8887, + "step": 718 + }, + { + "epoch": 0.42582173526798933, + "grad_norm": 1.8945696646562518, + "learning_rate": 1.9349638948616253e-05, + "loss": 0.8268, + "step": 719 + }, + { + "epoch": 0.42641397690257626, + "grad_norm": 2.4172838989795125, + "learning_rate": 1.934736818950937e-05, + "loss": 0.8683, + "step": 720 + }, + { + "epoch": 0.42700621853716314, + "grad_norm": 1.884065405882965, + "learning_rate": 1.9345093606803276e-05, + "loss": 0.8832, + "step": 721 + }, + { + "epoch": 0.42759846017175007, + "grad_norm": 1.77943938389625, + "learning_rate": 1.9342815201428394e-05, + "loss": 0.8842, + "step": 722 + }, + { + "epoch": 0.428190701806337, + "grad_norm": 1.4357452116165423, + "learning_rate": 1.9340532974316727e-05, + "loss": 0.8274, + "step": 723 + }, + { + "epoch": 0.4287829434409239, + "grad_norm": 1.4379221296115967, + "learning_rate": 1.9338246926401828e-05, + "loss": 0.8388, + "step": 724 + }, + { + "epoch": 0.4293751850755108, + "grad_norm": 2.07752636716361, + "learning_rate": 1.933595705861882e-05, + "loss": 0.833, + "step": 725 + }, + { + "epoch": 0.42996742671009774, + "grad_norm": 2.0105999576116322, + "learning_rate": 1.9333663371904388e-05, + "loss": 0.8473, + "step": 726 + }, + { + "epoch": 0.4305596683446846, + "grad_norm": 3.956301063546753, + "learning_rate": 1.933136586719678e-05, + "loss": 0.8496, + "step": 727 + }, + { + "epoch": 0.43115190997927155, + "grad_norm": 1.3549992353009284, + "learning_rate": 1.9329064545435803e-05, + "loss": 0.8747, + "step": 728 + }, + { + "epoch": 0.43174415161385843, + "grad_norm": 2.227577277579618, + "learning_rate": 1.932675940756283e-05, + "loss": 0.8362, + "step": 729 + }, + { + "epoch": 0.43233639324844536, + "grad_norm": 1.5147909801235255, + "learning_rate": 1.932445045452079e-05, + "loss": 0.8688, + "step": 730 + }, + { + "epoch": 0.4329286348830323, + "grad_norm": 1.920697239844712, + "learning_rate": 1.9322137687254175e-05, + "loss": 0.8608, + "step": 731 + }, + { + "epoch": 0.43352087651761917, + "grad_norm": 1.9550148078238179, + "learning_rate": 1.931982110670904e-05, + "loss": 0.8438, + "step": 732 + }, + { + "epoch": 0.4341131181522061, + "grad_norm": 1.9357461615720977, + "learning_rate": 1.9317500713832987e-05, + "loss": 0.8289, + "step": 733 + }, + { + "epoch": 0.43470535978679303, + "grad_norm": 2.436470304212335, + "learning_rate": 1.9315176509575196e-05, + "loss": 0.8505, + "step": 734 + }, + { + "epoch": 0.4352976014213799, + "grad_norm": 2.0394551856453633, + "learning_rate": 1.93128484948864e-05, + "loss": 0.8595, + "step": 735 + }, + { + "epoch": 0.43588984305596684, + "grad_norm": 16.717645135664075, + "learning_rate": 1.9310516670718877e-05, + "loss": 0.8358, + "step": 736 + }, + { + "epoch": 0.4364820846905538, + "grad_norm": 1.9423408377644282, + "learning_rate": 1.930818103802648e-05, + "loss": 0.861, + "step": 737 + }, + { + "epoch": 0.43707432632514065, + "grad_norm": 4.812386875770368, + "learning_rate": 1.9305841597764615e-05, + "loss": 0.8801, + "step": 738 + }, + { + "epoch": 0.4376665679597276, + "grad_norm": 1.4672305025896835, + "learning_rate": 1.9303498350890246e-05, + "loss": 0.8782, + "step": 739 + }, + { + "epoch": 0.43825880959431446, + "grad_norm": 2.2508593137348676, + "learning_rate": 1.9301151298361887e-05, + "loss": 0.877, + "step": 740 + }, + { + "epoch": 0.4388510512289014, + "grad_norm": 1.7135079210199697, + "learning_rate": 1.9298800441139623e-05, + "loss": 0.8972, + "step": 741 + }, + { + "epoch": 0.4394432928634883, + "grad_norm": 1.4490997536994275, + "learning_rate": 1.9296445780185077e-05, + "loss": 0.8224, + "step": 742 + }, + { + "epoch": 0.4400355344980752, + "grad_norm": 1.770609855482205, + "learning_rate": 1.9294087316461446e-05, + "loss": 0.8803, + "step": 743 + }, + { + "epoch": 0.44062777613266213, + "grad_norm": 4.232242865631687, + "learning_rate": 1.929172505093347e-05, + "loss": 0.8834, + "step": 744 + }, + { + "epoch": 0.44122001776724906, + "grad_norm": 2.0717939954326625, + "learning_rate": 1.9289358984567446e-05, + "loss": 0.8896, + "step": 745 + }, + { + "epoch": 0.44181225940183594, + "grad_norm": 4.304229428729412, + "learning_rate": 1.928698911833123e-05, + "loss": 0.8621, + "step": 746 + }, + { + "epoch": 0.44240450103642287, + "grad_norm": 2.200899929703116, + "learning_rate": 1.928461545319424e-05, + "loss": 0.8648, + "step": 747 + }, + { + "epoch": 0.44299674267100975, + "grad_norm": 1.6834184926932727, + "learning_rate": 1.9282237990127425e-05, + "loss": 0.8714, + "step": 748 + }, + { + "epoch": 0.4435889843055967, + "grad_norm": 2.104833385776733, + "learning_rate": 1.927985673010331e-05, + "loss": 0.8172, + "step": 749 + }, + { + "epoch": 0.4441812259401836, + "grad_norm": 3.6197381435137324, + "learning_rate": 1.9277471674095952e-05, + "loss": 0.8392, + "step": 750 + }, + { + "epoch": 0.4447734675747705, + "grad_norm": 2.11958620090702, + "learning_rate": 1.9275082823080992e-05, + "loss": 0.8735, + "step": 751 + }, + { + "epoch": 0.4453657092093574, + "grad_norm": 3.3384124014666874, + "learning_rate": 1.927269017803559e-05, + "loss": 0.9202, + "step": 752 + }, + { + "epoch": 0.44595795084394435, + "grad_norm": 2.705834935266405, + "learning_rate": 1.9270293739938477e-05, + "loss": 0.8619, + "step": 753 + }, + { + "epoch": 0.4465501924785312, + "grad_norm": 2.7094042650834527, + "learning_rate": 1.9267893509769927e-05, + "loss": 0.8918, + "step": 754 + }, + { + "epoch": 0.44714243411311816, + "grad_norm": 1.7854900824015136, + "learning_rate": 1.926548948851178e-05, + "loss": 0.8564, + "step": 755 + }, + { + "epoch": 0.4477346757477051, + "grad_norm": 1.6005502953829145, + "learning_rate": 1.92630816771474e-05, + "loss": 0.8412, + "step": 756 + }, + { + "epoch": 0.44832691738229197, + "grad_norm": 7.492700831605615, + "learning_rate": 1.9260670076661735e-05, + "loss": 0.8718, + "step": 757 + }, + { + "epoch": 0.4489191590168789, + "grad_norm": 5.427630233101737, + "learning_rate": 1.925825468804125e-05, + "loss": 0.8778, + "step": 758 + }, + { + "epoch": 0.4495114006514658, + "grad_norm": 1.8888166698278774, + "learning_rate": 1.9255835512273982e-05, + "loss": 0.8596, + "step": 759 + }, + { + "epoch": 0.4501036422860527, + "grad_norm": 2.1812538978009055, + "learning_rate": 1.9253412550349507e-05, + "loss": 0.8691, + "step": 760 + }, + { + "epoch": 0.45069588392063964, + "grad_norm": 2.924997761771959, + "learning_rate": 1.9250985803258957e-05, + "loss": 0.8579, + "step": 761 + }, + { + "epoch": 0.4512881255552265, + "grad_norm": 2.1898518842107424, + "learning_rate": 1.9248555271995006e-05, + "loss": 0.9125, + "step": 762 + }, + { + "epoch": 0.45188036718981345, + "grad_norm": 2.1171881776125234, + "learning_rate": 1.924612095755188e-05, + "loss": 0.818, + "step": 763 + }, + { + "epoch": 0.4524726088244004, + "grad_norm": 1.860211579546916, + "learning_rate": 1.924368286092534e-05, + "loss": 0.8827, + "step": 764 + }, + { + "epoch": 0.45306485045898726, + "grad_norm": 1.6407946441138859, + "learning_rate": 1.9241240983112718e-05, + "loss": 0.8979, + "step": 765 + }, + { + "epoch": 0.4536570920935742, + "grad_norm": 1.8118381222826243, + "learning_rate": 1.9238795325112867e-05, + "loss": 0.8969, + "step": 766 + }, + { + "epoch": 0.45424933372816106, + "grad_norm": 1.5722837226227442, + "learning_rate": 1.9236345887926215e-05, + "loss": 0.8211, + "step": 767 + }, + { + "epoch": 0.454841575362748, + "grad_norm": 1.690859845921261, + "learning_rate": 1.92338926725547e-05, + "loss": 0.9023, + "step": 768 + }, + { + "epoch": 0.45543381699733493, + "grad_norm": 1.3414797354584787, + "learning_rate": 1.9231435680001844e-05, + "loss": 0.8433, + "step": 769 + }, + { + "epoch": 0.4560260586319218, + "grad_norm": 2.9314823394828884, + "learning_rate": 1.9228974911272682e-05, + "loss": 0.8694, + "step": 770 + }, + { + "epoch": 0.45661830026650874, + "grad_norm": 1.6345887387544766, + "learning_rate": 1.9226510367373812e-05, + "loss": 0.9751, + "step": 771 + }, + { + "epoch": 0.45721054190109567, + "grad_norm": 1.4992911219830736, + "learning_rate": 1.922404204931337e-05, + "loss": 0.8129, + "step": 772 + }, + { + "epoch": 0.45780278353568254, + "grad_norm": 1.4628930115387142, + "learning_rate": 1.9221569958101038e-05, + "loss": 0.8945, + "step": 773 + }, + { + "epoch": 0.4583950251702695, + "grad_norm": 2.723334540080781, + "learning_rate": 1.9219094094748035e-05, + "loss": 0.8383, + "step": 774 + }, + { + "epoch": 0.4589872668048564, + "grad_norm": 1.9492331431341308, + "learning_rate": 1.9216614460267132e-05, + "loss": 0.8664, + "step": 775 + }, + { + "epoch": 0.4595795084394433, + "grad_norm": 1.8901483017176506, + "learning_rate": 1.9214131055672648e-05, + "loss": 0.9114, + "step": 776 + }, + { + "epoch": 0.4601717500740302, + "grad_norm": 1.5947480549984965, + "learning_rate": 1.921164388198042e-05, + "loss": 0.8619, + "step": 777 + }, + { + "epoch": 0.4607639917086171, + "grad_norm": 2.944268928829697, + "learning_rate": 1.9209152940207846e-05, + "loss": 0.8841, + "step": 778 + }, + { + "epoch": 0.461356233343204, + "grad_norm": 1.9959401062119286, + "learning_rate": 1.920665823137387e-05, + "loss": 0.8942, + "step": 779 + }, + { + "epoch": 0.46194847497779096, + "grad_norm": 2.055054417491741, + "learning_rate": 1.920415975649896e-05, + "loss": 0.8985, + "step": 780 + }, + { + "epoch": 0.46254071661237783, + "grad_norm": 1.43509961761953, + "learning_rate": 1.9201657516605136e-05, + "loss": 0.8409, + "step": 781 + }, + { + "epoch": 0.46313295824696477, + "grad_norm": 2.0964064977562464, + "learning_rate": 1.9199151512715948e-05, + "loss": 0.8603, + "step": 782 + }, + { + "epoch": 0.4637251998815517, + "grad_norm": 2.0471728931299933, + "learning_rate": 1.9196641745856502e-05, + "loss": 0.8195, + "step": 783 + }, + { + "epoch": 0.4643174415161386, + "grad_norm": 1.9479876262651854, + "learning_rate": 1.919412821705343e-05, + "loss": 0.892, + "step": 784 + }, + { + "epoch": 0.4649096831507255, + "grad_norm": 1.8429301406660341, + "learning_rate": 1.9191610927334905e-05, + "loss": 0.8517, + "step": 785 + }, + { + "epoch": 0.4655019247853124, + "grad_norm": 1.6626463990171203, + "learning_rate": 1.918908987773064e-05, + "loss": 0.8422, + "step": 786 + }, + { + "epoch": 0.4660941664198993, + "grad_norm": 2.484399506113593, + "learning_rate": 1.918656506927189e-05, + "loss": 0.8349, + "step": 787 + }, + { + "epoch": 0.46668640805448625, + "grad_norm": 2.2394704693255325, + "learning_rate": 1.918403650299144e-05, + "loss": 0.8584, + "step": 788 + }, + { + "epoch": 0.4672786496890731, + "grad_norm": 2.7696631299861494, + "learning_rate": 1.9181504179923622e-05, + "loss": 0.8702, + "step": 789 + }, + { + "epoch": 0.46787089132366005, + "grad_norm": 2.3702451262189603, + "learning_rate": 1.9178968101104285e-05, + "loss": 0.8812, + "step": 790 + }, + { + "epoch": 0.468463132958247, + "grad_norm": 3.347228840796043, + "learning_rate": 1.9176428267570842e-05, + "loss": 0.8925, + "step": 791 + }, + { + "epoch": 0.46905537459283386, + "grad_norm": 1.8592907890766055, + "learning_rate": 1.917388468036222e-05, + "loss": 0.859, + "step": 792 + }, + { + "epoch": 0.4696476162274208, + "grad_norm": 1.8720866828747977, + "learning_rate": 1.917133734051889e-05, + "loss": 0.8683, + "step": 793 + }, + { + "epoch": 0.4702398578620077, + "grad_norm": 2.3125042184278684, + "learning_rate": 1.9168786249082862e-05, + "loss": 0.8256, + "step": 794 + }, + { + "epoch": 0.4708320994965946, + "grad_norm": 1.719445243382972, + "learning_rate": 1.916623140709767e-05, + "loss": 0.9095, + "step": 795 + }, + { + "epoch": 0.47142434113118153, + "grad_norm": 3.0132158639000695, + "learning_rate": 1.9163672815608392e-05, + "loss": 0.8053, + "step": 796 + }, + { + "epoch": 0.4720165827657684, + "grad_norm": 1.933417130240812, + "learning_rate": 1.9161110475661634e-05, + "loss": 0.8653, + "step": 797 + }, + { + "epoch": 0.47260882440035534, + "grad_norm": 2.001369020588165, + "learning_rate": 1.9158544388305534e-05, + "loss": 0.8821, + "step": 798 + }, + { + "epoch": 0.4732010660349423, + "grad_norm": 2.236004511554376, + "learning_rate": 1.9155974554589774e-05, + "loss": 0.8531, + "step": 799 + }, + { + "epoch": 0.47379330766952915, + "grad_norm": 3.7355229765335785, + "learning_rate": 1.915340097556555e-05, + "loss": 0.8913, + "step": 800 + }, + { + "epoch": 0.4743855493041161, + "grad_norm": 6.246619019749031, + "learning_rate": 1.915082365228561e-05, + "loss": 0.9032, + "step": 801 + }, + { + "epoch": 0.474977790938703, + "grad_norm": 1.8815129005316729, + "learning_rate": 1.914824258580422e-05, + "loss": 0.8619, + "step": 802 + }, + { + "epoch": 0.4755700325732899, + "grad_norm": 1.662450993675011, + "learning_rate": 1.9145657777177186e-05, + "loss": 0.8465, + "step": 803 + }, + { + "epoch": 0.4761622742078768, + "grad_norm": 1.8295549474984136, + "learning_rate": 1.914306922746183e-05, + "loss": 0.8569, + "step": 804 + }, + { + "epoch": 0.4767545158424637, + "grad_norm": 10.419046505396, + "learning_rate": 1.914047693771702e-05, + "loss": 0.8476, + "step": 805 + }, + { + "epoch": 0.47734675747705063, + "grad_norm": 4.545366161612398, + "learning_rate": 1.9137880909003155e-05, + "loss": 0.8815, + "step": 806 + }, + { + "epoch": 0.47793899911163756, + "grad_norm": 2.3239626037301266, + "learning_rate": 1.9135281142382147e-05, + "loss": 0.812, + "step": 807 + }, + { + "epoch": 0.47853124074622444, + "grad_norm": 3.2052660355799256, + "learning_rate": 1.913267763891745e-05, + "loss": 0.8444, + "step": 808 + }, + { + "epoch": 0.47912348238081137, + "grad_norm": 2.661447353848109, + "learning_rate": 1.913007039967404e-05, + "loss": 0.8676, + "step": 809 + }, + { + "epoch": 0.4797157240153983, + "grad_norm": 3.1975284655785043, + "learning_rate": 1.912745942571843e-05, + "loss": 0.8973, + "step": 810 + }, + { + "epoch": 0.4803079656499852, + "grad_norm": 8.817742135689066, + "learning_rate": 1.9124844718118657e-05, + "loss": 0.8338, + "step": 811 + }, + { + "epoch": 0.4809002072845721, + "grad_norm": 6.647384676362516, + "learning_rate": 1.9122226277944276e-05, + "loss": 0.8034, + "step": 812 + }, + { + "epoch": 0.48149244891915904, + "grad_norm": 3.0068701129220665, + "learning_rate": 1.911960410626638e-05, + "loss": 0.8558, + "step": 813 + }, + { + "epoch": 0.4820846905537459, + "grad_norm": 2.4470838917289055, + "learning_rate": 1.9116978204157583e-05, + "loss": 0.8781, + "step": 814 + }, + { + "epoch": 0.48267693218833285, + "grad_norm": 3.568749125351664, + "learning_rate": 1.911434857269203e-05, + "loss": 0.8366, + "step": 815 + }, + { + "epoch": 0.48326917382291973, + "grad_norm": 2.788154497136242, + "learning_rate": 1.9111715212945384e-05, + "loss": 0.8179, + "step": 816 + }, + { + "epoch": 0.48386141545750666, + "grad_norm": 2.561601471580433, + "learning_rate": 1.9109078125994843e-05, + "loss": 0.8388, + "step": 817 + }, + { + "epoch": 0.4844536570920936, + "grad_norm": 13.67055884933429, + "learning_rate": 1.9106437312919116e-05, + "loss": 0.851, + "step": 818 + }, + { + "epoch": 0.48504589872668047, + "grad_norm": 3.884383319733934, + "learning_rate": 1.910379277479845e-05, + "loss": 0.8999, + "step": 819 + }, + { + "epoch": 0.4856381403612674, + "grad_norm": 4.2385883179797625, + "learning_rate": 1.910114451271461e-05, + "loss": 0.8981, + "step": 820 + }, + { + "epoch": 0.48623038199585433, + "grad_norm": 2.8064585598908356, + "learning_rate": 1.909849252775088e-05, + "loss": 0.8633, + "step": 821 + }, + { + "epoch": 0.4868226236304412, + "grad_norm": 3.4012309556283227, + "learning_rate": 1.9095836820992074e-05, + "loss": 0.8425, + "step": 822 + }, + { + "epoch": 0.48741486526502814, + "grad_norm": 1.8123186335536858, + "learning_rate": 1.9093177393524524e-05, + "loss": 0.8615, + "step": 823 + }, + { + "epoch": 0.488007106899615, + "grad_norm": 3.468928077402017, + "learning_rate": 1.9090514246436085e-05, + "loss": 0.8995, + "step": 824 + }, + { + "epoch": 0.48859934853420195, + "grad_norm": 1.8021817875541082, + "learning_rate": 1.908784738081614e-05, + "loss": 0.8284, + "step": 825 + }, + { + "epoch": 0.4891915901687889, + "grad_norm": 2.4825340026394374, + "learning_rate": 1.9085176797755575e-05, + "loss": 0.8585, + "step": 826 + }, + { + "epoch": 0.48978383180337576, + "grad_norm": 2.47602922744696, + "learning_rate": 1.908250249834682e-05, + "loss": 0.8805, + "step": 827 + }, + { + "epoch": 0.4903760734379627, + "grad_norm": 1.9033817119185694, + "learning_rate": 1.9079824483683808e-05, + "loss": 0.9176, + "step": 828 + }, + { + "epoch": 0.4909683150725496, + "grad_norm": 1.5335390856862279, + "learning_rate": 1.9077142754862e-05, + "loss": 0.8394, + "step": 829 + }, + { + "epoch": 0.4915605567071365, + "grad_norm": 1.8269219296715522, + "learning_rate": 1.9074457312978373e-05, + "loss": 0.8565, + "step": 830 + }, + { + "epoch": 0.49215279834172343, + "grad_norm": 1.8745343150998126, + "learning_rate": 1.907176815913142e-05, + "loss": 0.8292, + "step": 831 + }, + { + "epoch": 0.49274503997631036, + "grad_norm": 3.137235022218658, + "learning_rate": 1.9069075294421163e-05, + "loss": 0.8564, + "step": 832 + }, + { + "epoch": 0.49333728161089724, + "grad_norm": 4.136469645798408, + "learning_rate": 1.906637871994913e-05, + "loss": 0.8611, + "step": 833 + }, + { + "epoch": 0.49392952324548417, + "grad_norm": 2.105315946203615, + "learning_rate": 1.9063678436818372e-05, + "loss": 0.8306, + "step": 834 + }, + { + "epoch": 0.49452176488007105, + "grad_norm": 1.6169111636378513, + "learning_rate": 1.906097444613346e-05, + "loss": 0.8065, + "step": 835 + }, + { + "epoch": 0.495114006514658, + "grad_norm": 2.1563275408886495, + "learning_rate": 1.905826674900048e-05, + "loss": 0.8578, + "step": 836 + }, + { + "epoch": 0.4957062481492449, + "grad_norm": 1.6035367865321255, + "learning_rate": 1.9055555346527024e-05, + "loss": 0.8172, + "step": 837 + }, + { + "epoch": 0.4962984897838318, + "grad_norm": 2.7262844772305868, + "learning_rate": 1.9052840239822218e-05, + "loss": 0.8535, + "step": 838 + }, + { + "epoch": 0.4968907314184187, + "grad_norm": 5.363131725816022, + "learning_rate": 1.9050121429996682e-05, + "loss": 0.8659, + "step": 839 + }, + { + "epoch": 0.49748297305300565, + "grad_norm": 8.968200408901346, + "learning_rate": 1.904739891816257e-05, + "loss": 0.8318, + "step": 840 + }, + { + "epoch": 0.4980752146875925, + "grad_norm": 1.7248247185116594, + "learning_rate": 1.904467270543354e-05, + "loss": 0.8957, + "step": 841 + }, + { + "epoch": 0.49866745632217946, + "grad_norm": 1.9562698832994654, + "learning_rate": 1.9041942792924767e-05, + "loss": 0.8345, + "step": 842 + }, + { + "epoch": 0.49925969795676634, + "grad_norm": 3.9947226743168156, + "learning_rate": 1.9039209181752942e-05, + "loss": 0.8652, + "step": 843 + }, + { + "epoch": 0.49985193959135327, + "grad_norm": 3.1571777018867677, + "learning_rate": 1.903647187303626e-05, + "loss": 0.8515, + "step": 844 + }, + { + "epoch": 0.5004441812259401, + "grad_norm": 2.1169391797594668, + "learning_rate": 1.9033730867894436e-05, + "loss": 0.8119, + "step": 845 + }, + { + "epoch": 0.5010364228605271, + "grad_norm": 2.7287026350418806, + "learning_rate": 1.9030986167448696e-05, + "loss": 0.9056, + "step": 846 + }, + { + "epoch": 0.501628664495114, + "grad_norm": 4.03173653799457, + "learning_rate": 1.9028237772821778e-05, + "loss": 0.835, + "step": 847 + }, + { + "epoch": 0.5022209061297009, + "grad_norm": 3.2102449210627246, + "learning_rate": 1.902548568513793e-05, + "loss": 0.869, + "step": 848 + }, + { + "epoch": 0.5028131477642879, + "grad_norm": 2.243822048790965, + "learning_rate": 1.9022729905522906e-05, + "loss": 0.8895, + "step": 849 + }, + { + "epoch": 0.5034053893988747, + "grad_norm": 2.4533940450724314, + "learning_rate": 1.9019970435103978e-05, + "loss": 0.8636, + "step": 850 + }, + { + "epoch": 0.5039976310334616, + "grad_norm": 2.848746378952194, + "learning_rate": 1.9017207275009925e-05, + "loss": 0.892, + "step": 851 + }, + { + "epoch": 0.5045898726680486, + "grad_norm": 2.3326834597031105, + "learning_rate": 1.9014440426371034e-05, + "loss": 0.7968, + "step": 852 + }, + { + "epoch": 0.5051821143026355, + "grad_norm": 2.745519544350419, + "learning_rate": 1.9011669890319104e-05, + "loss": 0.8378, + "step": 853 + }, + { + "epoch": 0.5057743559372224, + "grad_norm": 2.3271640191288183, + "learning_rate": 1.9008895667987434e-05, + "loss": 0.9074, + "step": 854 + }, + { + "epoch": 0.5063665975718094, + "grad_norm": 2.06754274248316, + "learning_rate": 1.9006117760510846e-05, + "loss": 0.8939, + "step": 855 + }, + { + "epoch": 0.5069588392063962, + "grad_norm": 4.335317472534966, + "learning_rate": 1.9003336169025655e-05, + "loss": 0.8571, + "step": 856 + }, + { + "epoch": 0.5075510808409831, + "grad_norm": 2.494123585371266, + "learning_rate": 1.9000550894669686e-05, + "loss": 0.8623, + "step": 857 + }, + { + "epoch": 0.50814332247557, + "grad_norm": 2.8756951470707577, + "learning_rate": 1.8997761938582277e-05, + "loss": 0.8292, + "step": 858 + }, + { + "epoch": 0.508735564110157, + "grad_norm": 2.761198259777182, + "learning_rate": 1.8994969301904266e-05, + "loss": 0.868, + "step": 859 + }, + { + "epoch": 0.5093278057447439, + "grad_norm": 6.139448459726971, + "learning_rate": 1.8992172985778002e-05, + "loss": 0.8189, + "step": 860 + }, + { + "epoch": 0.5099200473793307, + "grad_norm": 2.3982620378137, + "learning_rate": 1.898937299134733e-05, + "loss": 0.8413, + "step": 861 + }, + { + "epoch": 0.5105122890139177, + "grad_norm": 2.4360272672092758, + "learning_rate": 1.8986569319757605e-05, + "loss": 0.8573, + "step": 862 + }, + { + "epoch": 0.5111045306485046, + "grad_norm": 2.978466515723497, + "learning_rate": 1.898376197215569e-05, + "loss": 0.8423, + "step": 863 + }, + { + "epoch": 0.5116967722830915, + "grad_norm": 3.3896178499592464, + "learning_rate": 1.8980950949689952e-05, + "loss": 0.8559, + "step": 864 + }, + { + "epoch": 0.5122890139176784, + "grad_norm": 3.3035441398858993, + "learning_rate": 1.8978136253510248e-05, + "loss": 0.8613, + "step": 865 + }, + { + "epoch": 0.5128812555522653, + "grad_norm": 2.8070122645483035, + "learning_rate": 1.897531788476795e-05, + "loss": 0.8941, + "step": 866 + }, + { + "epoch": 0.5134734971868522, + "grad_norm": 3.414743000811395, + "learning_rate": 1.8972495844615933e-05, + "loss": 0.8576, + "step": 867 + }, + { + "epoch": 0.5140657388214391, + "grad_norm": 4.129209851440052, + "learning_rate": 1.896967013420857e-05, + "loss": 0.8392, + "step": 868 + }, + { + "epoch": 0.5146579804560261, + "grad_norm": 2.7614250927203696, + "learning_rate": 1.896684075470173e-05, + "loss": 0.8044, + "step": 869 + }, + { + "epoch": 0.515250222090613, + "grad_norm": 5.7190450126340275, + "learning_rate": 1.896400770725279e-05, + "loss": 0.8782, + "step": 870 + }, + { + "epoch": 0.5158424637251999, + "grad_norm": 3.264689428254472, + "learning_rate": 1.896117099302063e-05, + "loss": 0.8781, + "step": 871 + }, + { + "epoch": 0.5164347053597867, + "grad_norm": 2.3052957981729123, + "learning_rate": 1.8958330613165622e-05, + "loss": 0.7944, + "step": 872 + }, + { + "epoch": 0.5170269469943737, + "grad_norm": 3.6741300739615745, + "learning_rate": 1.895548656884964e-05, + "loss": 0.846, + "step": 873 + }, + { + "epoch": 0.5176191886289606, + "grad_norm": 2.633170158045045, + "learning_rate": 1.8952638861236066e-05, + "loss": 0.837, + "step": 874 + }, + { + "epoch": 0.5182114302635475, + "grad_norm": 3.933048892917864, + "learning_rate": 1.894978749148976e-05, + "loss": 0.8227, + "step": 875 + }, + { + "epoch": 0.5188036718981345, + "grad_norm": 3.415883244982847, + "learning_rate": 1.8946932460777105e-05, + "loss": 0.8591, + "step": 876 + }, + { + "epoch": 0.5193959135327213, + "grad_norm": 2.1556681217465905, + "learning_rate": 1.8944073770265958e-05, + "loss": 0.8476, + "step": 877 + }, + { + "epoch": 0.5199881551673082, + "grad_norm": 3.3058563739280307, + "learning_rate": 1.894121142112569e-05, + "loss": 0.8595, + "step": 878 + }, + { + "epoch": 0.5205803968018952, + "grad_norm": 1.6701206007058242, + "learning_rate": 1.8938345414527165e-05, + "loss": 0.8418, + "step": 879 + }, + { + "epoch": 0.5211726384364821, + "grad_norm": 2.8666163842304786, + "learning_rate": 1.8935475751642736e-05, + "loss": 0.8302, + "step": 880 + }, + { + "epoch": 0.521764880071069, + "grad_norm": 2.448842057434439, + "learning_rate": 1.893260243364626e-05, + "loss": 0.8475, + "step": 881 + }, + { + "epoch": 0.522357121705656, + "grad_norm": 2.8805816085789355, + "learning_rate": 1.8929725461713083e-05, + "loss": 0.8595, + "step": 882 + }, + { + "epoch": 0.5229493633402428, + "grad_norm": 2.190759226441507, + "learning_rate": 1.892684483702005e-05, + "loss": 0.8447, + "step": 883 + }, + { + "epoch": 0.5235416049748297, + "grad_norm": 2.7950857557738464, + "learning_rate": 1.8923960560745495e-05, + "loss": 0.8345, + "step": 884 + }, + { + "epoch": 0.5241338466094166, + "grad_norm": 2.2234121325584213, + "learning_rate": 1.8921072634069255e-05, + "loss": 0.8267, + "step": 885 + }, + { + "epoch": 0.5247260882440036, + "grad_norm": 1.457185905309736, + "learning_rate": 1.891818105817265e-05, + "loss": 0.8214, + "step": 886 + }, + { + "epoch": 0.5253183298785905, + "grad_norm": 3.4034755252325515, + "learning_rate": 1.8915285834238498e-05, + "loss": 0.8626, + "step": 887 + }, + { + "epoch": 0.5259105715131773, + "grad_norm": 3.715757488653005, + "learning_rate": 1.891238696345111e-05, + "loss": 0.8602, + "step": 888 + }, + { + "epoch": 0.5265028131477643, + "grad_norm": 1.8540026574196378, + "learning_rate": 1.890948444699629e-05, + "loss": 0.8251, + "step": 889 + }, + { + "epoch": 0.5270950547823512, + "grad_norm": 2.1775940247813343, + "learning_rate": 1.8906578286061325e-05, + "loss": 0.8593, + "step": 890 + }, + { + "epoch": 0.5276872964169381, + "grad_norm": 2.6307649588163047, + "learning_rate": 1.8903668481834996e-05, + "loss": 0.8612, + "step": 891 + }, + { + "epoch": 0.528279538051525, + "grad_norm": 3.989337506531906, + "learning_rate": 1.890075503550758e-05, + "loss": 0.8391, + "step": 892 + }, + { + "epoch": 0.528871779686112, + "grad_norm": 8.95506433868868, + "learning_rate": 1.889783794827085e-05, + "loss": 0.8594, + "step": 893 + }, + { + "epoch": 0.5294640213206988, + "grad_norm": 2.6218182036375026, + "learning_rate": 1.8894917221318038e-05, + "loss": 0.8828, + "step": 894 + }, + { + "epoch": 0.5300562629552857, + "grad_norm": 1.8329700035472636, + "learning_rate": 1.8891992855843902e-05, + "loss": 0.8307, + "step": 895 + }, + { + "epoch": 0.5306485045898727, + "grad_norm": 2.011555405209645, + "learning_rate": 1.888906485304467e-05, + "loss": 0.8204, + "step": 896 + }, + { + "epoch": 0.5312407462244596, + "grad_norm": 2.8138211953978933, + "learning_rate": 1.8886133214118053e-05, + "loss": 0.8544, + "step": 897 + }, + { + "epoch": 0.5318329878590465, + "grad_norm": 2.7779050915507666, + "learning_rate": 1.888319794026326e-05, + "loss": 0.8776, + "step": 898 + }, + { + "epoch": 0.5324252294936334, + "grad_norm": 2.867628838160467, + "learning_rate": 1.8880259032680985e-05, + "loss": 0.8714, + "step": 899 + }, + { + "epoch": 0.5330174711282203, + "grad_norm": 2.645795842168525, + "learning_rate": 1.88773164925734e-05, + "loss": 0.8673, + "step": 900 + }, + { + "epoch": 0.5336097127628072, + "grad_norm": 1.988822701440226, + "learning_rate": 1.887437032114418e-05, + "loss": 0.8058, + "step": 901 + }, + { + "epoch": 0.5342019543973942, + "grad_norm": 1.6378154166647196, + "learning_rate": 1.887142051959847e-05, + "loss": 0.8464, + "step": 902 + }, + { + "epoch": 0.5347941960319811, + "grad_norm": 2.2058743781314036, + "learning_rate": 1.8868467089142893e-05, + "loss": 0.8751, + "step": 903 + }, + { + "epoch": 0.5353864376665679, + "grad_norm": 2.5282653299151527, + "learning_rate": 1.8865510030985588e-05, + "loss": 0.8621, + "step": 904 + }, + { + "epoch": 0.5359786793011548, + "grad_norm": 27.31034341693317, + "learning_rate": 1.8862549346336144e-05, + "loss": 0.8408, + "step": 905 + }, + { + "epoch": 0.5365709209357418, + "grad_norm": 2.8532802195059985, + "learning_rate": 1.8859585036405653e-05, + "loss": 0.8242, + "step": 906 + }, + { + "epoch": 0.5371631625703287, + "grad_norm": 2.0534557906487887, + "learning_rate": 1.8856617102406685e-05, + "loss": 0.8428, + "step": 907 + }, + { + "epoch": 0.5377554042049156, + "grad_norm": 1.5482048628272183, + "learning_rate": 1.885364554555329e-05, + "loss": 0.8626, + "step": 908 + }, + { + "epoch": 0.5383476458395026, + "grad_norm": 2.1575688293112507, + "learning_rate": 1.8850670367061003e-05, + "loss": 0.8558, + "step": 909 + }, + { + "epoch": 0.5389398874740894, + "grad_norm": 2.102806685634922, + "learning_rate": 1.884769156814684e-05, + "loss": 0.8687, + "step": 910 + }, + { + "epoch": 0.5395321291086763, + "grad_norm": 1.4463393941880638, + "learning_rate": 1.884470915002929e-05, + "loss": 0.8818, + "step": 911 + }, + { + "epoch": 0.5401243707432632, + "grad_norm": 1.4609406531497033, + "learning_rate": 1.884172311392834e-05, + "loss": 0.8548, + "step": 912 + }, + { + "epoch": 0.5407166123778502, + "grad_norm": 1.8329647817569432, + "learning_rate": 1.883873346106544e-05, + "loss": 0.858, + "step": 913 + }, + { + "epoch": 0.5413088540124371, + "grad_norm": 3.9152641529549856, + "learning_rate": 1.883574019266353e-05, + "loss": 0.8783, + "step": 914 + }, + { + "epoch": 0.5419010956470239, + "grad_norm": 1.4892888981468162, + "learning_rate": 1.8832743309947026e-05, + "loss": 0.8504, + "step": 915 + }, + { + "epoch": 0.5424933372816109, + "grad_norm": 2.1974930679295963, + "learning_rate": 1.8829742814141813e-05, + "loss": 0.7888, + "step": 916 + }, + { + "epoch": 0.5430855789161978, + "grad_norm": 2.960377295236711, + "learning_rate": 1.8826738706475275e-05, + "loss": 0.9234, + "step": 917 + }, + { + "epoch": 0.5436778205507847, + "grad_norm": 2.413195078205313, + "learning_rate": 1.882373098817625e-05, + "loss": 0.8354, + "step": 918 + }, + { + "epoch": 0.5442700621853717, + "grad_norm": 2.29920506736285, + "learning_rate": 1.882071966047507e-05, + "loss": 0.8359, + "step": 919 + }, + { + "epoch": 0.5448623038199586, + "grad_norm": 2.2127645514863916, + "learning_rate": 1.8817704724603536e-05, + "loss": 0.8713, + "step": 920 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 2.5056883347384074, + "learning_rate": 1.8814686181794927e-05, + "loss": 0.8836, + "step": 921 + }, + { + "epoch": 0.5460467870891323, + "grad_norm": 2.319729335193821, + "learning_rate": 1.8811664033283993e-05, + "loss": 0.8758, + "step": 922 + }, + { + "epoch": 0.5466390287237193, + "grad_norm": 2.4646633874772603, + "learning_rate": 1.880863828030697e-05, + "loss": 0.8338, + "step": 923 + }, + { + "epoch": 0.5472312703583062, + "grad_norm": 3.5967734066777624, + "learning_rate": 1.880560892410155e-05, + "loss": 0.8142, + "step": 924 + }, + { + "epoch": 0.5478235119928931, + "grad_norm": 1.931083687931415, + "learning_rate": 1.8802575965906923e-05, + "loss": 0.8624, + "step": 925 + }, + { + "epoch": 0.54841575362748, + "grad_norm": 1.3658707686323024, + "learning_rate": 1.879953940696373e-05, + "loss": 0.82, + "step": 926 + }, + { + "epoch": 0.5490079952620669, + "grad_norm": 1.5399776635493545, + "learning_rate": 1.87964992485141e-05, + "loss": 0.8752, + "step": 927 + }, + { + "epoch": 0.5496002368966538, + "grad_norm": 1.2888985969973952, + "learning_rate": 1.8793455491801623e-05, + "loss": 0.8258, + "step": 928 + }, + { + "epoch": 0.5501924785312408, + "grad_norm": 1.2235740279377274, + "learning_rate": 1.8790408138071372e-05, + "loss": 0.7807, + "step": 929 + }, + { + "epoch": 0.5507847201658277, + "grad_norm": 1.3948987520636114, + "learning_rate": 1.878735718856988e-05, + "loss": 0.85, + "step": 930 + }, + { + "epoch": 0.5513769618004146, + "grad_norm": 1.1530555013033301, + "learning_rate": 1.8784302644545165e-05, + "loss": 0.8434, + "step": 931 + }, + { + "epoch": 0.5519692034350014, + "grad_norm": 1.1001632567365147, + "learning_rate": 1.8781244507246706e-05, + "loss": 0.8357, + "step": 932 + }, + { + "epoch": 0.5525614450695884, + "grad_norm": 1.1272615396567933, + "learning_rate": 1.8778182777925447e-05, + "loss": 0.8673, + "step": 933 + }, + { + "epoch": 0.5531536867041753, + "grad_norm": 1.5817714863387373, + "learning_rate": 1.877511745783381e-05, + "loss": 0.8628, + "step": 934 + }, + { + "epoch": 0.5537459283387622, + "grad_norm": 1.5329765777416222, + "learning_rate": 1.8772048548225684e-05, + "loss": 0.8412, + "step": 935 + }, + { + "epoch": 0.5543381699733492, + "grad_norm": 1.15668478337925, + "learning_rate": 1.8768976050356428e-05, + "loss": 0.8673, + "step": 936 + }, + { + "epoch": 0.554930411607936, + "grad_norm": 1.4034733165443383, + "learning_rate": 1.8765899965482858e-05, + "loss": 0.8063, + "step": 937 + }, + { + "epoch": 0.5555226532425229, + "grad_norm": 1.278219411060756, + "learning_rate": 1.876282029486328e-05, + "loss": 0.8432, + "step": 938 + }, + { + "epoch": 0.5561148948771099, + "grad_norm": 1.434098032436352, + "learning_rate": 1.8759737039757437e-05, + "loss": 0.8616, + "step": 939 + }, + { + "epoch": 0.5567071365116968, + "grad_norm": 1.3706128629162078, + "learning_rate": 1.8756650201426565e-05, + "loss": 0.8397, + "step": 940 + }, + { + "epoch": 0.5572993781462837, + "grad_norm": 1.4524914917856544, + "learning_rate": 1.875355978113335e-05, + "loss": 0.8423, + "step": 941 + }, + { + "epoch": 0.5578916197808705, + "grad_norm": 1.6212056588454582, + "learning_rate": 1.8750465780141946e-05, + "loss": 0.8259, + "step": 942 + }, + { + "epoch": 0.5584838614154575, + "grad_norm": 1.5764324461444237, + "learning_rate": 1.8747368199717977e-05, + "loss": 0.8723, + "step": 943 + }, + { + "epoch": 0.5590761030500444, + "grad_norm": 1.5399903048769072, + "learning_rate": 1.8744267041128528e-05, + "loss": 0.8563, + "step": 944 + }, + { + "epoch": 0.5596683446846313, + "grad_norm": 1.4178813586778414, + "learning_rate": 1.874116230564214e-05, + "loss": 0.8551, + "step": 945 + }, + { + "epoch": 0.5602605863192183, + "grad_norm": 1.2554205520730095, + "learning_rate": 1.8738053994528835e-05, + "loss": 0.84, + "step": 946 + }, + { + "epoch": 0.5608528279538052, + "grad_norm": 1.436381834061602, + "learning_rate": 1.873494210906008e-05, + "loss": 0.8633, + "step": 947 + }, + { + "epoch": 0.561445069588392, + "grad_norm": 1.572609783293438, + "learning_rate": 1.8731826650508812e-05, + "loss": 0.8575, + "step": 948 + }, + { + "epoch": 0.562037311222979, + "grad_norm": 1.9911513522446955, + "learning_rate": 1.872870762014943e-05, + "loss": 0.8699, + "step": 949 + }, + { + "epoch": 0.5626295528575659, + "grad_norm": 1.7221797946035173, + "learning_rate": 1.8725585019257794e-05, + "loss": 0.8288, + "step": 950 + }, + { + "epoch": 0.5632217944921528, + "grad_norm": 1.6316667513165497, + "learning_rate": 1.8722458849111218e-05, + "loss": 0.8476, + "step": 951 + }, + { + "epoch": 0.5638140361267397, + "grad_norm": 1.8817617861124867, + "learning_rate": 1.8719329110988487e-05, + "loss": 0.8812, + "step": 952 + }, + { + "epoch": 0.5644062777613266, + "grad_norm": 1.225383750075909, + "learning_rate": 1.871619580616984e-05, + "loss": 0.8646, + "step": 953 + }, + { + "epoch": 0.5649985193959135, + "grad_norm": 4.355740946012875, + "learning_rate": 1.871305893593697e-05, + "loss": 0.8008, + "step": 954 + }, + { + "epoch": 0.5655907610305004, + "grad_norm": 2.1927471549874564, + "learning_rate": 1.8709918501573038e-05, + "loss": 0.8762, + "step": 955 + }, + { + "epoch": 0.5661830026650874, + "grad_norm": 3.055980708697755, + "learning_rate": 1.8706774504362655e-05, + "loss": 0.8584, + "step": 956 + }, + { + "epoch": 0.5667752442996743, + "grad_norm": 1.5384769512510463, + "learning_rate": 1.8703626945591895e-05, + "loss": 0.874, + "step": 957 + }, + { + "epoch": 0.5673674859342612, + "grad_norm": 1.593507898135259, + "learning_rate": 1.8700475826548285e-05, + "loss": 0.8539, + "step": 958 + }, + { + "epoch": 0.567959727568848, + "grad_norm": 2.4658738412503185, + "learning_rate": 1.8697321148520812e-05, + "loss": 0.8464, + "step": 959 + }, + { + "epoch": 0.568551969203435, + "grad_norm": 2.3647390172875213, + "learning_rate": 1.8694162912799917e-05, + "loss": 0.8487, + "step": 960 + }, + { + "epoch": 0.5691442108380219, + "grad_norm": 1.8822227155497309, + "learning_rate": 1.869100112067749e-05, + "loss": 0.924, + "step": 961 + }, + { + "epoch": 0.5697364524726088, + "grad_norm": 3.1476849757648258, + "learning_rate": 1.868783577344689e-05, + "loss": 0.8641, + "step": 962 + }, + { + "epoch": 0.5703286941071958, + "grad_norm": 2.391640628039692, + "learning_rate": 1.8684666872402913e-05, + "loss": 0.8189, + "step": 963 + }, + { + "epoch": 0.5709209357417826, + "grad_norm": 1.9209124622082125, + "learning_rate": 1.8681494418841825e-05, + "loss": 0.9141, + "step": 964 + }, + { + "epoch": 0.5715131773763695, + "grad_norm": 1.883444300120634, + "learning_rate": 1.8678318414061336e-05, + "loss": 0.8244, + "step": 965 + }, + { + "epoch": 0.5721054190109565, + "grad_norm": 1.8683731975284492, + "learning_rate": 1.867513885936061e-05, + "loss": 0.8039, + "step": 966 + }, + { + "epoch": 0.5726976606455434, + "grad_norm": 2.186169687223295, + "learning_rate": 1.8671955756040264e-05, + "loss": 0.8481, + "step": 967 + }, + { + "epoch": 0.5732899022801303, + "grad_norm": 2.4126100375538746, + "learning_rate": 1.8668769105402366e-05, + "loss": 0.8283, + "step": 968 + }, + { + "epoch": 0.5738821439147173, + "grad_norm": 5.881790876957725, + "learning_rate": 1.8665578908750437e-05, + "loss": 0.8004, + "step": 969 + }, + { + "epoch": 0.5744743855493041, + "grad_norm": 2.1957974263362963, + "learning_rate": 1.8662385167389443e-05, + "loss": 0.8125, + "step": 970 + }, + { + "epoch": 0.575066627183891, + "grad_norm": 2.122100726977036, + "learning_rate": 1.8659187882625807e-05, + "loss": 0.8323, + "step": 971 + }, + { + "epoch": 0.5756588688184779, + "grad_norm": 2.0089182722346512, + "learning_rate": 1.8655987055767396e-05, + "loss": 0.8107, + "step": 972 + }, + { + "epoch": 0.5762511104530649, + "grad_norm": 3.4788064271268677, + "learning_rate": 1.8652782688123535e-05, + "loss": 0.8357, + "step": 973 + }, + { + "epoch": 0.5768433520876518, + "grad_norm": 1.9971681598642872, + "learning_rate": 1.864957478100498e-05, + "loss": 0.8487, + "step": 974 + }, + { + "epoch": 0.5774355937222386, + "grad_norm": 2.7963971479792096, + "learning_rate": 1.8646363335723952e-05, + "loss": 0.9236, + "step": 975 + }, + { + "epoch": 0.5780278353568256, + "grad_norm": 2.6501420919148706, + "learning_rate": 1.864314835359411e-05, + "loss": 0.8643, + "step": 976 + }, + { + "epoch": 0.5786200769914125, + "grad_norm": 2.5379734296943273, + "learning_rate": 1.863992983593056e-05, + "loss": 0.8564, + "step": 977 + }, + { + "epoch": 0.5792123186259994, + "grad_norm": 3.4686271540396936, + "learning_rate": 1.8636707784049867e-05, + "loss": 0.8665, + "step": 978 + }, + { + "epoch": 0.5798045602605864, + "grad_norm": 1.9639248298320788, + "learning_rate": 1.8633482199270025e-05, + "loss": 0.8785, + "step": 979 + }, + { + "epoch": 0.5803968018951732, + "grad_norm": 3.009335654283905, + "learning_rate": 1.8630253082910473e-05, + "loss": 0.8629, + "step": 980 + }, + { + "epoch": 0.5809890435297601, + "grad_norm": 2.5913527017658535, + "learning_rate": 1.862702043629211e-05, + "loss": 0.8483, + "step": 981 + }, + { + "epoch": 0.581581285164347, + "grad_norm": 2.3301173927122636, + "learning_rate": 1.8623784260737272e-05, + "loss": 0.8872, + "step": 982 + }, + { + "epoch": 0.582173526798934, + "grad_norm": 2.1728733813785954, + "learning_rate": 1.862054455756973e-05, + "loss": 0.8851, + "step": 983 + }, + { + "epoch": 0.5827657684335209, + "grad_norm": 2.6022654651479153, + "learning_rate": 1.8617301328114704e-05, + "loss": 0.853, + "step": 984 + }, + { + "epoch": 0.5833580100681078, + "grad_norm": 5.499406695169115, + "learning_rate": 1.8614054573698867e-05, + "loss": 0.8533, + "step": 985 + }, + { + "epoch": 0.5839502517026947, + "grad_norm": 7.358085625855621, + "learning_rate": 1.861080429565031e-05, + "loss": 0.8609, + "step": 986 + }, + { + "epoch": 0.5845424933372816, + "grad_norm": 2.5901934912509406, + "learning_rate": 1.8607550495298594e-05, + "loss": 0.8123, + "step": 987 + }, + { + "epoch": 0.5851347349718685, + "grad_norm": 3.577130582550033, + "learning_rate": 1.8604293173974694e-05, + "loss": 0.8424, + "step": 988 + }, + { + "epoch": 0.5857269766064555, + "grad_norm": 6.266496305800451, + "learning_rate": 1.8601032333011045e-05, + "loss": 0.8278, + "step": 989 + }, + { + "epoch": 0.5863192182410424, + "grad_norm": 3.688636492554437, + "learning_rate": 1.8597767973741514e-05, + "loss": 0.8391, + "step": 990 + }, + { + "epoch": 0.5869114598756292, + "grad_norm": 3.4278947346345556, + "learning_rate": 1.8594500097501403e-05, + "loss": 0.8925, + "step": 991 + }, + { + "epoch": 0.5875037015102161, + "grad_norm": 2.798245409340499, + "learning_rate": 1.8591228705627464e-05, + "loss": 0.8683, + "step": 992 + }, + { + "epoch": 0.5880959431448031, + "grad_norm": 2.6101217239588674, + "learning_rate": 1.858795379945787e-05, + "loss": 0.7957, + "step": 993 + }, + { + "epoch": 0.58868818477939, + "grad_norm": 1.9767054625050304, + "learning_rate": 1.8584675380332254e-05, + "loss": 0.8868, + "step": 994 + }, + { + "epoch": 0.5892804264139769, + "grad_norm": 3.561966477734429, + "learning_rate": 1.8581393449591667e-05, + "loss": 0.8462, + "step": 995 + }, + { + "epoch": 0.5898726680485639, + "grad_norm": 1.6872502667794813, + "learning_rate": 1.8578108008578603e-05, + "loss": 0.8334, + "step": 996 + }, + { + "epoch": 0.5904649096831507, + "grad_norm": 2.4287274023717615, + "learning_rate": 1.8574819058636993e-05, + "loss": 0.8921, + "step": 997 + }, + { + "epoch": 0.5910571513177376, + "grad_norm": 1.7111015618500356, + "learning_rate": 1.8571526601112202e-05, + "loss": 0.8314, + "step": 998 + }, + { + "epoch": 0.5916493929523245, + "grad_norm": 2.5502109328589233, + "learning_rate": 1.856823063735103e-05, + "loss": 0.8725, + "step": 999 + }, + { + "epoch": 0.5922416345869115, + "grad_norm": 2.040139228442696, + "learning_rate": 1.8564931168701713e-05, + "loss": 0.8347, + "step": 1000 + }, + { + "epoch": 0.5928338762214984, + "grad_norm": 2.5055813610082303, + "learning_rate": 1.8561628196513923e-05, + "loss": 0.8585, + "step": 1001 + }, + { + "epoch": 0.5934261178560852, + "grad_norm": 2.1524220648045427, + "learning_rate": 1.855832172213875e-05, + "loss": 0.8474, + "step": 1002 + }, + { + "epoch": 0.5940183594906722, + "grad_norm": 1.619731922084106, + "learning_rate": 1.8555011746928738e-05, + "loss": 0.8179, + "step": 1003 + }, + { + "epoch": 0.5946106011252591, + "grad_norm": 2.8662321460118805, + "learning_rate": 1.855169827223785e-05, + "loss": 0.8484, + "step": 1004 + }, + { + "epoch": 0.595202842759846, + "grad_norm": 2.449255266934718, + "learning_rate": 1.8548381299421486e-05, + "loss": 0.8856, + "step": 1005 + }, + { + "epoch": 0.595795084394433, + "grad_norm": 1.8813446878490585, + "learning_rate": 1.854506082983647e-05, + "loss": 0.8173, + "step": 1006 + }, + { + "epoch": 0.5963873260290199, + "grad_norm": 2.5469931424576586, + "learning_rate": 1.8541736864841064e-05, + "loss": 0.8532, + "step": 1007 + }, + { + "epoch": 0.5969795676636067, + "grad_norm": 2.641308461614614, + "learning_rate": 1.8538409405794952e-05, + "loss": 0.8217, + "step": 1008 + }, + { + "epoch": 0.5975718092981936, + "grad_norm": 1.4950837736543847, + "learning_rate": 1.8535078454059256e-05, + "loss": 0.8381, + "step": 1009 + }, + { + "epoch": 0.5981640509327806, + "grad_norm": 2.338715560487363, + "learning_rate": 1.8531744010996525e-05, + "loss": 0.8573, + "step": 1010 + }, + { + "epoch": 0.5987562925673675, + "grad_norm": 1.6059242683751407, + "learning_rate": 1.8528406077970725e-05, + "loss": 0.8527, + "step": 1011 + }, + { + "epoch": 0.5993485342019544, + "grad_norm": 12.361811261173756, + "learning_rate": 1.8525064656347265e-05, + "loss": 0.8298, + "step": 1012 + }, + { + "epoch": 0.5999407758365413, + "grad_norm": 1.1511502155917168, + "learning_rate": 1.8521719747492974e-05, + "loss": 0.8212, + "step": 1013 + }, + { + "epoch": 0.6005330174711282, + "grad_norm": 1.6632441653678172, + "learning_rate": 1.8518371352776107e-05, + "loss": 0.8415, + "step": 1014 + }, + { + "epoch": 0.6011252591057151, + "grad_norm": 4.116651414311895, + "learning_rate": 1.8515019473566346e-05, + "loss": 0.8252, + "step": 1015 + }, + { + "epoch": 0.6017175007403021, + "grad_norm": 1.8732938641491954, + "learning_rate": 1.85116641112348e-05, + "loss": 0.8283, + "step": 1016 + }, + { + "epoch": 0.602309742374889, + "grad_norm": 1.8390615077922212, + "learning_rate": 1.8508305267153992e-05, + "loss": 0.8132, + "step": 1017 + }, + { + "epoch": 0.6029019840094758, + "grad_norm": 2.633892399081572, + "learning_rate": 1.850494294269789e-05, + "loss": 0.8758, + "step": 1018 + }, + { + "epoch": 0.6034942256440627, + "grad_norm": 2.3622253842023335, + "learning_rate": 1.8501577139241865e-05, + "loss": 0.829, + "step": 1019 + }, + { + "epoch": 0.6040864672786497, + "grad_norm": 2.037756142332905, + "learning_rate": 1.8498207858162724e-05, + "loss": 0.867, + "step": 1020 + }, + { + "epoch": 0.6046787089132366, + "grad_norm": 4.528232825533869, + "learning_rate": 1.8494835100838693e-05, + "loss": 0.8648, + "step": 1021 + }, + { + "epoch": 0.6052709505478235, + "grad_norm": 2.8907054221222, + "learning_rate": 1.8491458868649417e-05, + "loss": 0.8574, + "step": 1022 + }, + { + "epoch": 0.6058631921824105, + "grad_norm": 2.182754237899523, + "learning_rate": 1.8488079162975965e-05, + "loss": 0.8716, + "step": 1023 + }, + { + "epoch": 0.6064554338169973, + "grad_norm": 3.6488297829005316, + "learning_rate": 1.8484695985200832e-05, + "loss": 0.8888, + "step": 1024 + }, + { + "epoch": 0.6070476754515842, + "grad_norm": 2.1360401343086375, + "learning_rate": 1.848130933670792e-05, + "loss": 0.853, + "step": 1025 + }, + { + "epoch": 0.6076399170861712, + "grad_norm": 2.7634831832013074, + "learning_rate": 1.847791921888256e-05, + "loss": 0.8444, + "step": 1026 + }, + { + "epoch": 0.6082321587207581, + "grad_norm": 2.548946934028294, + "learning_rate": 1.847452563311151e-05, + "loss": 0.8552, + "step": 1027 + }, + { + "epoch": 0.608824400355345, + "grad_norm": 2.00472769722451, + "learning_rate": 1.8471128580782923e-05, + "loss": 0.7907, + "step": 1028 + }, + { + "epoch": 0.6094166419899318, + "grad_norm": 7.112058400031312, + "learning_rate": 1.8467728063286398e-05, + "loss": 0.811, + "step": 1029 + }, + { + "epoch": 0.6100088836245188, + "grad_norm": 5.904134501716097, + "learning_rate": 1.8464324082012926e-05, + "loss": 0.8236, + "step": 1030 + }, + { + "epoch": 0.6106011252591057, + "grad_norm": 2.677789919762913, + "learning_rate": 1.8460916638354934e-05, + "loss": 0.8855, + "step": 1031 + }, + { + "epoch": 0.6111933668936926, + "grad_norm": 2.979499541128458, + "learning_rate": 1.845750573370626e-05, + "loss": 0.81, + "step": 1032 + }, + { + "epoch": 0.6117856085282796, + "grad_norm": 4.395198347599419, + "learning_rate": 1.845409136946215e-05, + "loss": 0.8649, + "step": 1033 + }, + { + "epoch": 0.6123778501628665, + "grad_norm": 1.9418555795640626, + "learning_rate": 1.8450673547019273e-05, + "loss": 0.8106, + "step": 1034 + }, + { + "epoch": 0.6129700917974533, + "grad_norm": 2.50689095850713, + "learning_rate": 1.844725226777571e-05, + "loss": 0.8387, + "step": 1035 + }, + { + "epoch": 0.6135623334320403, + "grad_norm": 2.812993440172385, + "learning_rate": 1.844382753313096e-05, + "loss": 0.8102, + "step": 1036 + }, + { + "epoch": 0.6141545750666272, + "grad_norm": 2.6604257345575206, + "learning_rate": 1.844039934448593e-05, + "loss": 0.8277, + "step": 1037 + }, + { + "epoch": 0.6147468167012141, + "grad_norm": 2.561145533436182, + "learning_rate": 1.8436967703242938e-05, + "loss": 0.8557, + "step": 1038 + }, + { + "epoch": 0.615339058335801, + "grad_norm": 3.2075888751958, + "learning_rate": 1.8433532610805724e-05, + "loss": 0.8399, + "step": 1039 + }, + { + "epoch": 0.6159312999703879, + "grad_norm": 7.7990669355472475, + "learning_rate": 1.843009406857943e-05, + "loss": 0.8473, + "step": 1040 + }, + { + "epoch": 0.6165235416049748, + "grad_norm": 3.132725554622429, + "learning_rate": 1.8426652077970616e-05, + "loss": 0.8058, + "step": 1041 + }, + { + "epoch": 0.6171157832395617, + "grad_norm": 2.1854341940737187, + "learning_rate": 1.842320664038725e-05, + "loss": 0.8126, + "step": 1042 + }, + { + "epoch": 0.6177080248741487, + "grad_norm": 2.9704427255656825, + "learning_rate": 1.841975775723871e-05, + "loss": 0.8319, + "step": 1043 + }, + { + "epoch": 0.6183002665087356, + "grad_norm": 2.2725465194986367, + "learning_rate": 1.8416305429935776e-05, + "loss": 0.8729, + "step": 1044 + }, + { + "epoch": 0.6188925081433225, + "grad_norm": 2.5307221134117674, + "learning_rate": 1.8412849659890652e-05, + "loss": 0.8412, + "step": 1045 + }, + { + "epoch": 0.6194847497779093, + "grad_norm": 1.810981613810124, + "learning_rate": 1.840939044851694e-05, + "loss": 0.8673, + "step": 1046 + }, + { + "epoch": 0.6200769914124963, + "grad_norm": 2.4443325681183907, + "learning_rate": 1.840592779722965e-05, + "loss": 0.7945, + "step": 1047 + }, + { + "epoch": 0.6206692330470832, + "grad_norm": 2.8376072263141485, + "learning_rate": 1.8402461707445206e-05, + "loss": 0.8427, + "step": 1048 + }, + { + "epoch": 0.6212614746816701, + "grad_norm": 3.6210685733471397, + "learning_rate": 1.8398992180581427e-05, + "loss": 0.826, + "step": 1049 + }, + { + "epoch": 0.6218537163162571, + "grad_norm": 1.561394208799311, + "learning_rate": 1.839551921805755e-05, + "loss": 0.8572, + "step": 1050 + }, + { + "epoch": 0.6224459579508439, + "grad_norm": 3.103802020893097, + "learning_rate": 1.839204282129421e-05, + "loss": 0.8646, + "step": 1051 + }, + { + "epoch": 0.6230381995854308, + "grad_norm": 4.425061232798762, + "learning_rate": 1.8388562991713447e-05, + "loss": 0.8143, + "step": 1052 + }, + { + "epoch": 0.6236304412200178, + "grad_norm": 2.191428810992078, + "learning_rate": 1.838507973073871e-05, + "loss": 0.8502, + "step": 1053 + }, + { + "epoch": 0.6242226828546047, + "grad_norm": 4.95011030076338, + "learning_rate": 1.8381593039794846e-05, + "loss": 0.864, + "step": 1054 + }, + { + "epoch": 0.6248149244891916, + "grad_norm": 2.7797880624241516, + "learning_rate": 1.837810292030811e-05, + "loss": 0.859, + "step": 1055 + }, + { + "epoch": 0.6254071661237784, + "grad_norm": 4.869981957929177, + "learning_rate": 1.8374609373706156e-05, + "loss": 0.8444, + "step": 1056 + }, + { + "epoch": 0.6259994077583654, + "grad_norm": 2.5746569296165522, + "learning_rate": 1.8371112401418042e-05, + "loss": 0.8364, + "step": 1057 + }, + { + "epoch": 0.6265916493929523, + "grad_norm": 2.0625606665572653, + "learning_rate": 1.8367612004874224e-05, + "loss": 0.8906, + "step": 1058 + }, + { + "epoch": 0.6271838910275392, + "grad_norm": 2.676271947186104, + "learning_rate": 1.8364108185506563e-05, + "loss": 0.8459, + "step": 1059 + }, + { + "epoch": 0.6277761326621262, + "grad_norm": 1.5883481724433661, + "learning_rate": 1.8360600944748316e-05, + "loss": 0.7969, + "step": 1060 + }, + { + "epoch": 0.6283683742967131, + "grad_norm": 1.695724970760026, + "learning_rate": 1.8357090284034145e-05, + "loss": 0.8488, + "step": 1061 + }, + { + "epoch": 0.6289606159312999, + "grad_norm": 5.99104068302577, + "learning_rate": 1.8353576204800106e-05, + "loss": 0.8584, + "step": 1062 + }, + { + "epoch": 0.6295528575658869, + "grad_norm": 1.4158669500557108, + "learning_rate": 1.8350058708483654e-05, + "loss": 0.8109, + "step": 1063 + }, + { + "epoch": 0.6301450992004738, + "grad_norm": 4.619972925464011, + "learning_rate": 1.8346537796523643e-05, + "loss": 0.8387, + "step": 1064 + }, + { + "epoch": 0.6307373408350607, + "grad_norm": 1.840623636179062, + "learning_rate": 1.834301347036033e-05, + "loss": 0.7773, + "step": 1065 + }, + { + "epoch": 0.6313295824696477, + "grad_norm": 3.0573832313039073, + "learning_rate": 1.833948573143535e-05, + "loss": 0.8626, + "step": 1066 + }, + { + "epoch": 0.6319218241042345, + "grad_norm": 2.518622510039811, + "learning_rate": 1.8335954581191758e-05, + "loss": 0.8783, + "step": 1067 + }, + { + "epoch": 0.6325140657388214, + "grad_norm": 9.493928447056057, + "learning_rate": 1.8332420021073992e-05, + "loss": 0.8544, + "step": 1068 + }, + { + "epoch": 0.6331063073734083, + "grad_norm": 1.720905288086446, + "learning_rate": 1.8328882052527878e-05, + "loss": 0.8282, + "step": 1069 + }, + { + "epoch": 0.6336985490079953, + "grad_norm": 1.8020645303459293, + "learning_rate": 1.8325340677000646e-05, + "loss": 0.8923, + "step": 1070 + }, + { + "epoch": 0.6342907906425822, + "grad_norm": 2.8749501095196517, + "learning_rate": 1.8321795895940925e-05, + "loss": 0.8544, + "step": 1071 + }, + { + "epoch": 0.6348830322771691, + "grad_norm": 2.008670621988076, + "learning_rate": 1.8318247710798728e-05, + "loss": 0.8462, + "step": 1072 + }, + { + "epoch": 0.635475273911756, + "grad_norm": 1.8563596582266815, + "learning_rate": 1.8314696123025456e-05, + "loss": 0.8324, + "step": 1073 + }, + { + "epoch": 0.6360675155463429, + "grad_norm": 2.228469119107762, + "learning_rate": 1.831114113407391e-05, + "loss": 0.8325, + "step": 1074 + }, + { + "epoch": 0.6366597571809298, + "grad_norm": 2.5028199204197596, + "learning_rate": 1.8307582745398282e-05, + "loss": 0.7938, + "step": 1075 + }, + { + "epoch": 0.6372519988155168, + "grad_norm": 2.563042207855269, + "learning_rate": 1.8304020958454156e-05, + "loss": 0.8539, + "step": 1076 + }, + { + "epoch": 0.6378442404501037, + "grad_norm": 2.748425207975687, + "learning_rate": 1.83004557746985e-05, + "loss": 0.9026, + "step": 1077 + }, + { + "epoch": 0.6384364820846905, + "grad_norm": 2.198078366256531, + "learning_rate": 1.8296887195589678e-05, + "loss": 0.836, + "step": 1078 + }, + { + "epoch": 0.6390287237192774, + "grad_norm": 2.978264957639652, + "learning_rate": 1.829331522258743e-05, + "loss": 0.8374, + "step": 1079 + }, + { + "epoch": 0.6396209653538644, + "grad_norm": 3.0902393566945694, + "learning_rate": 1.8289739857152903e-05, + "loss": 0.8522, + "step": 1080 + }, + { + "epoch": 0.6402132069884513, + "grad_norm": 2.5510293746898793, + "learning_rate": 1.828616110074862e-05, + "loss": 0.8959, + "step": 1081 + }, + { + "epoch": 0.6408054486230382, + "grad_norm": 2.4920369060953, + "learning_rate": 1.8282578954838493e-05, + "loss": 0.8635, + "step": 1082 + }, + { + "epoch": 0.6413976902576252, + "grad_norm": 3.8584939608718174, + "learning_rate": 1.8278993420887822e-05, + "loss": 0.897, + "step": 1083 + }, + { + "epoch": 0.641989931892212, + "grad_norm": 1.6943535494324076, + "learning_rate": 1.8275404500363293e-05, + "loss": 0.8555, + "step": 1084 + }, + { + "epoch": 0.6425821735267989, + "grad_norm": 3.625435056497657, + "learning_rate": 1.8271812194732972e-05, + "loss": 0.8555, + "step": 1085 + }, + { + "epoch": 0.6431744151613858, + "grad_norm": 9.548040429248383, + "learning_rate": 1.8268216505466318e-05, + "loss": 0.9243, + "step": 1086 + }, + { + "epoch": 0.6437666567959728, + "grad_norm": 6.204666207240246, + "learning_rate": 1.8264617434034168e-05, + "loss": 0.8679, + "step": 1087 + }, + { + "epoch": 0.6443588984305597, + "grad_norm": 2.3034334204515767, + "learning_rate": 1.826101498190875e-05, + "loss": 0.8646, + "step": 1088 + }, + { + "epoch": 0.6449511400651465, + "grad_norm": 4.514126342622965, + "learning_rate": 1.825740915056366e-05, + "loss": 0.8552, + "step": 1089 + }, + { + "epoch": 0.6455433816997335, + "grad_norm": 2.4208151622387604, + "learning_rate": 1.8253799941473894e-05, + "loss": 0.8319, + "step": 1090 + }, + { + "epoch": 0.6461356233343204, + "grad_norm": 3.3019190869678736, + "learning_rate": 1.8250187356115817e-05, + "loss": 0.8964, + "step": 1091 + }, + { + "epoch": 0.6467278649689073, + "grad_norm": 6.537722710331606, + "learning_rate": 1.824657139596718e-05, + "loss": 0.8184, + "step": 1092 + }, + { + "epoch": 0.6473201066034943, + "grad_norm": 5.245807404499727, + "learning_rate": 1.8242952062507115e-05, + "loss": 0.8338, + "step": 1093 + }, + { + "epoch": 0.6479123482380811, + "grad_norm": 3.8290099537018167, + "learning_rate": 1.8239329357216135e-05, + "loss": 0.7772, + "step": 1094 + }, + { + "epoch": 0.648504589872668, + "grad_norm": 4.36239109924843, + "learning_rate": 1.8235703281576127e-05, + "loss": 0.8034, + "step": 1095 + }, + { + "epoch": 0.649096831507255, + "grad_norm": 3.266500591644286, + "learning_rate": 1.823207383707036e-05, + "loss": 0.8272, + "step": 1096 + }, + { + "epoch": 0.6496890731418419, + "grad_norm": 1.9798959268550758, + "learning_rate": 1.822844102518348e-05, + "loss": 0.843, + "step": 1097 + }, + { + "epoch": 0.6502813147764288, + "grad_norm": 2.4272575236741343, + "learning_rate": 1.8224804847401518e-05, + "loss": 0.8178, + "step": 1098 + }, + { + "epoch": 0.6508735564110157, + "grad_norm": 2.643309325700988, + "learning_rate": 1.822116530521187e-05, + "loss": 0.8607, + "step": 1099 + }, + { + "epoch": 0.6514657980456026, + "grad_norm": 2.7517207819440466, + "learning_rate": 1.821752240010331e-05, + "loss": 0.9101, + "step": 1100 + }, + { + "epoch": 0.6520580396801895, + "grad_norm": 2.409184915147776, + "learning_rate": 1.8213876133565996e-05, + "loss": 0.894, + "step": 1101 + }, + { + "epoch": 0.6526502813147764, + "grad_norm": 1.8447833000009395, + "learning_rate": 1.8210226507091454e-05, + "loss": 0.8607, + "step": 1102 + }, + { + "epoch": 0.6532425229493634, + "grad_norm": 2.9713191095410174, + "learning_rate": 1.820657352217259e-05, + "loss": 0.834, + "step": 1103 + }, + { + "epoch": 0.6538347645839503, + "grad_norm": 2.5876456677398956, + "learning_rate": 1.8202917180303673e-05, + "loss": 0.866, + "step": 1104 + }, + { + "epoch": 0.6544270062185371, + "grad_norm": 2.3509085863918284, + "learning_rate": 1.8199257482980358e-05, + "loss": 0.8582, + "step": 1105 + }, + { + "epoch": 0.655019247853124, + "grad_norm": 1.5013962529594334, + "learning_rate": 1.819559443169967e-05, + "loss": 0.8376, + "step": 1106 + }, + { + "epoch": 0.655611489487711, + "grad_norm": 2.604248538807811, + "learning_rate": 1.8191928027959996e-05, + "loss": 0.8598, + "step": 1107 + }, + { + "epoch": 0.6562037311222979, + "grad_norm": 1.8138214242680486, + "learning_rate": 1.8188258273261104e-05, + "loss": 0.8527, + "step": 1108 + }, + { + "epoch": 0.6567959727568848, + "grad_norm": 2.716863323227044, + "learning_rate": 1.818458516910413e-05, + "loss": 0.8676, + "step": 1109 + }, + { + "epoch": 0.6573882143914718, + "grad_norm": 1.9705395914912947, + "learning_rate": 1.818090871699158e-05, + "loss": 0.8832, + "step": 1110 + }, + { + "epoch": 0.6579804560260586, + "grad_norm": 4.001918700211936, + "learning_rate": 1.817722891842733e-05, + "loss": 0.858, + "step": 1111 + }, + { + "epoch": 0.6585726976606455, + "grad_norm": 1.7504239586578514, + "learning_rate": 1.8173545774916628e-05, + "loss": 0.8335, + "step": 1112 + }, + { + "epoch": 0.6591649392952325, + "grad_norm": 2.103180849415543, + "learning_rate": 1.816985928796608e-05, + "loss": 0.8058, + "step": 1113 + }, + { + "epoch": 0.6597571809298194, + "grad_norm": 2.042550989077482, + "learning_rate": 1.8166169459083673e-05, + "loss": 0.8622, + "step": 1114 + }, + { + "epoch": 0.6603494225644063, + "grad_norm": 2.895471180154479, + "learning_rate": 1.8162476289778745e-05, + "loss": 0.8349, + "step": 1115 + }, + { + "epoch": 0.6609416641989931, + "grad_norm": 2.1623953611204163, + "learning_rate": 1.8158779781562022e-05, + "loss": 0.8047, + "step": 1116 + }, + { + "epoch": 0.6615339058335801, + "grad_norm": 4.82625679912337, + "learning_rate": 1.8155079935945577e-05, + "loss": 0.8058, + "step": 1117 + }, + { + "epoch": 0.662126147468167, + "grad_norm": 2.1209114049054163, + "learning_rate": 1.8151376754442856e-05, + "loss": 0.8345, + "step": 1118 + }, + { + "epoch": 0.6627183891027539, + "grad_norm": 3.202247450296245, + "learning_rate": 1.8147670238568666e-05, + "loss": 0.8264, + "step": 1119 + }, + { + "epoch": 0.6633106307373409, + "grad_norm": 4.683917620146975, + "learning_rate": 1.8143960389839184e-05, + "loss": 0.8997, + "step": 1120 + }, + { + "epoch": 0.6639028723719278, + "grad_norm": 1.5146727248882648, + "learning_rate": 1.8140247209771946e-05, + "loss": 0.8525, + "step": 1121 + }, + { + "epoch": 0.6644951140065146, + "grad_norm": 2.4296936822760746, + "learning_rate": 1.8136530699885852e-05, + "loss": 0.8435, + "step": 1122 + }, + { + "epoch": 0.6650873556411016, + "grad_norm": 3.833524070202994, + "learning_rate": 1.813281086170116e-05, + "loss": 0.9018, + "step": 1123 + }, + { + "epoch": 0.6656795972756885, + "grad_norm": 2.0040513437046648, + "learning_rate": 1.8129087696739497e-05, + "loss": 0.8821, + "step": 1124 + }, + { + "epoch": 0.6662718389102754, + "grad_norm": 1.4186690415848557, + "learning_rate": 1.8125361206523845e-05, + "loss": 0.8447, + "step": 1125 + }, + { + "epoch": 0.6668640805448623, + "grad_norm": 1.4088568380399054, + "learning_rate": 1.8121631392578545e-05, + "loss": 0.8548, + "step": 1126 + }, + { + "epoch": 0.6674563221794492, + "grad_norm": 1.8279521308810265, + "learning_rate": 1.811789825642931e-05, + "loss": 0.8499, + "step": 1127 + }, + { + "epoch": 0.6680485638140361, + "grad_norm": 1.7623710462745776, + "learning_rate": 1.8114161799603195e-05, + "loss": 0.8746, + "step": 1128 + }, + { + "epoch": 0.668640805448623, + "grad_norm": 1.6841484340107102, + "learning_rate": 1.8110422023628623e-05, + "loss": 0.8287, + "step": 1129 + }, + { + "epoch": 0.66923304708321, + "grad_norm": 1.5460477858044241, + "learning_rate": 1.810667893003537e-05, + "loss": 0.8403, + "step": 1130 + }, + { + "epoch": 0.6698252887177969, + "grad_norm": 2.904407513681924, + "learning_rate": 1.8102932520354572e-05, + "loss": 0.8535, + "step": 1131 + }, + { + "epoch": 0.6704175303523838, + "grad_norm": 2.3502598935192682, + "learning_rate": 1.8099182796118727e-05, + "loss": 0.9069, + "step": 1132 + }, + { + "epoch": 0.6710097719869706, + "grad_norm": 2.267045869299457, + "learning_rate": 1.8095429758861682e-05, + "loss": 0.8095, + "step": 1133 + }, + { + "epoch": 0.6716020136215576, + "grad_norm": 1.5438780262047374, + "learning_rate": 1.8091673410118633e-05, + "loss": 0.8264, + "step": 1134 + }, + { + "epoch": 0.6721942552561445, + "grad_norm": 2.956645508277099, + "learning_rate": 1.8087913751426142e-05, + "loss": 0.8709, + "step": 1135 + }, + { + "epoch": 0.6727864968907314, + "grad_norm": 2.2889022888507284, + "learning_rate": 1.8084150784322123e-05, + "loss": 0.836, + "step": 1136 + }, + { + "epoch": 0.6733787385253184, + "grad_norm": 1.4850766527643833, + "learning_rate": 1.8080384510345838e-05, + "loss": 0.8603, + "step": 1137 + }, + { + "epoch": 0.6739709801599052, + "grad_norm": 2.1026164044539315, + "learning_rate": 1.8076614931037908e-05, + "loss": 0.8917, + "step": 1138 + }, + { + "epoch": 0.6745632217944921, + "grad_norm": 1.620826120946746, + "learning_rate": 1.80728420479403e-05, + "loss": 0.8327, + "step": 1139 + }, + { + "epoch": 0.6751554634290791, + "grad_norm": 1.5100307488301057, + "learning_rate": 1.8069065862596338e-05, + "loss": 0.872, + "step": 1140 + }, + { + "epoch": 0.675747705063666, + "grad_norm": 1.3534401313900613, + "learning_rate": 1.8065286376550692e-05, + "loss": 0.8426, + "step": 1141 + }, + { + "epoch": 0.6763399466982529, + "grad_norm": 12.092384661716544, + "learning_rate": 1.8061503591349386e-05, + "loss": 0.8416, + "step": 1142 + }, + { + "epoch": 0.6769321883328397, + "grad_norm": 1.208766260196838, + "learning_rate": 1.8057717508539786e-05, + "loss": 0.8475, + "step": 1143 + }, + { + "epoch": 0.6775244299674267, + "grad_norm": 1.2451969427779477, + "learning_rate": 1.8053928129670624e-05, + "loss": 0.8306, + "step": 1144 + }, + { + "epoch": 0.6781166716020136, + "grad_norm": 1.799182555450414, + "learning_rate": 1.805013545629196e-05, + "loss": 0.8296, + "step": 1145 + }, + { + "epoch": 0.6787089132366005, + "grad_norm": 1.3830942844154197, + "learning_rate": 1.8046339489955214e-05, + "loss": 0.8246, + "step": 1146 + }, + { + "epoch": 0.6793011548711875, + "grad_norm": 1.6078573859008807, + "learning_rate": 1.804254023221315e-05, + "loss": 0.8071, + "step": 1147 + }, + { + "epoch": 0.6798933965057744, + "grad_norm": 1.335961933557725, + "learning_rate": 1.8038737684619874e-05, + "loss": 0.8593, + "step": 1148 + }, + { + "epoch": 0.6804856381403612, + "grad_norm": 1.546298180666286, + "learning_rate": 1.8034931848730846e-05, + "loss": 0.8249, + "step": 1149 + }, + { + "epoch": 0.6810778797749482, + "grad_norm": 1.635069545524243, + "learning_rate": 1.8031122726102868e-05, + "loss": 0.877, + "step": 1150 + }, + { + "epoch": 0.6816701214095351, + "grad_norm": 1.2691447098723208, + "learning_rate": 1.802731031829408e-05, + "loss": 0.8395, + "step": 1151 + }, + { + "epoch": 0.682262363044122, + "grad_norm": 1.869539168329112, + "learning_rate": 1.8023494626863976e-05, + "loss": 0.813, + "step": 1152 + }, + { + "epoch": 0.682854604678709, + "grad_norm": 4.282750886399974, + "learning_rate": 1.8019675653373387e-05, + "loss": 0.8337, + "step": 1153 + }, + { + "epoch": 0.6834468463132958, + "grad_norm": 1.5109022276624124, + "learning_rate": 1.8015853399384488e-05, + "loss": 0.8654, + "step": 1154 + }, + { + "epoch": 0.6840390879478827, + "grad_norm": 3.215342397054156, + "learning_rate": 1.8012027866460797e-05, + "loss": 0.9001, + "step": 1155 + }, + { + "epoch": 0.6846313295824696, + "grad_norm": 1.311513238293177, + "learning_rate": 1.8008199056167167e-05, + "loss": 0.8889, + "step": 1156 + }, + { + "epoch": 0.6852235712170566, + "grad_norm": 1.4742093712676303, + "learning_rate": 1.80043669700698e-05, + "loss": 0.8175, + "step": 1157 + }, + { + "epoch": 0.6858158128516435, + "grad_norm": 1.2459495798186484, + "learning_rate": 1.8000531609736236e-05, + "loss": 0.8096, + "step": 1158 + }, + { + "epoch": 0.6864080544862304, + "grad_norm": 1.6603825601369882, + "learning_rate": 1.799669297673535e-05, + "loss": 0.8859, + "step": 1159 + }, + { + "epoch": 0.6870002961208173, + "grad_norm": 1.8179070682239424, + "learning_rate": 1.7992851072637366e-05, + "loss": 0.8541, + "step": 1160 + }, + { + "epoch": 0.6875925377554042, + "grad_norm": 1.2406374896101122, + "learning_rate": 1.7989005899013828e-05, + "loss": 0.8413, + "step": 1161 + }, + { + "epoch": 0.6881847793899911, + "grad_norm": 1.5721991668978201, + "learning_rate": 1.798515745743764e-05, + "loss": 0.8636, + "step": 1162 + }, + { + "epoch": 0.688777021024578, + "grad_norm": 1.8095072497988238, + "learning_rate": 1.798130574948302e-05, + "loss": 0.8892, + "step": 1163 + }, + { + "epoch": 0.689369262659165, + "grad_norm": 1.8048696574204277, + "learning_rate": 1.797745077672554e-05, + "loss": 0.8207, + "step": 1164 + }, + { + "epoch": 0.6899615042937518, + "grad_norm": 1.9168478766964387, + "learning_rate": 1.7973592540742095e-05, + "loss": 0.8593, + "step": 1165 + }, + { + "epoch": 0.6905537459283387, + "grad_norm": 1.7351792742264414, + "learning_rate": 1.7969731043110928e-05, + "loss": 0.845, + "step": 1166 + }, + { + "epoch": 0.6911459875629257, + "grad_norm": 1.9689492610583388, + "learning_rate": 1.79658662854116e-05, + "loss": 0.8399, + "step": 1167 + }, + { + "epoch": 0.6917382291975126, + "grad_norm": 2.378890204311825, + "learning_rate": 1.7961998269225024e-05, + "loss": 0.836, + "step": 1168 + }, + { + "epoch": 0.6923304708320995, + "grad_norm": 1.7234996114622456, + "learning_rate": 1.7958126996133427e-05, + "loss": 0.8726, + "step": 1169 + }, + { + "epoch": 0.6929227124666865, + "grad_norm": 12.35334136288384, + "learning_rate": 1.7954252467720386e-05, + "loss": 0.8219, + "step": 1170 + }, + { + "epoch": 0.6935149541012733, + "grad_norm": 1.5256431304922908, + "learning_rate": 1.7950374685570794e-05, + "loss": 0.8227, + "step": 1171 + }, + { + "epoch": 0.6941071957358602, + "grad_norm": 2.490658777946983, + "learning_rate": 1.7946493651270883e-05, + "loss": 0.8785, + "step": 1172 + }, + { + "epoch": 0.6946994373704471, + "grad_norm": 2.143120458120011, + "learning_rate": 1.794260936640822e-05, + "loss": 0.7914, + "step": 1173 + }, + { + "epoch": 0.6952916790050341, + "grad_norm": 1.961814139735007, + "learning_rate": 1.7938721832571688e-05, + "loss": 0.84, + "step": 1174 + }, + { + "epoch": 0.695883920639621, + "grad_norm": 1.8808018807507807, + "learning_rate": 1.7934831051351513e-05, + "loss": 0.8379, + "step": 1175 + }, + { + "epoch": 0.6964761622742078, + "grad_norm": 3.8564691984179915, + "learning_rate": 1.793093702433924e-05, + "loss": 0.8931, + "step": 1176 + }, + { + "epoch": 0.6970684039087948, + "grad_norm": 2.991169714160442, + "learning_rate": 1.792703975312774e-05, + "loss": 0.8591, + "step": 1177 + }, + { + "epoch": 0.6976606455433817, + "grad_norm": 2.0564470226192775, + "learning_rate": 1.792313923931123e-05, + "loss": 0.8484, + "step": 1178 + }, + { + "epoch": 0.6982528871779686, + "grad_norm": 2.7676305573686486, + "learning_rate": 1.791923548448523e-05, + "loss": 0.9096, + "step": 1179 + }, + { + "epoch": 0.6988451288125556, + "grad_norm": 2.313908537479878, + "learning_rate": 1.7915328490246594e-05, + "loss": 0.8442, + "step": 1180 + }, + { + "epoch": 0.6994373704471424, + "grad_norm": 2.6234620243010967, + "learning_rate": 1.7911418258193503e-05, + "loss": 0.8982, + "step": 1181 + }, + { + "epoch": 0.7000296120817293, + "grad_norm": 1.8042865323547594, + "learning_rate": 1.7907504789925473e-05, + "loss": 0.8913, + "step": 1182 + }, + { + "epoch": 0.7006218537163162, + "grad_norm": 2.7891631926045948, + "learning_rate": 1.7903588087043314e-05, + "loss": 0.833, + "step": 1183 + }, + { + "epoch": 0.7012140953509032, + "grad_norm": 1.8602348121875936, + "learning_rate": 1.789966815114919e-05, + "loss": 0.8298, + "step": 1184 + }, + { + "epoch": 0.7018063369854901, + "grad_norm": 2.8372707284352074, + "learning_rate": 1.7895744983846575e-05, + "loss": 0.8192, + "step": 1185 + }, + { + "epoch": 0.702398578620077, + "grad_norm": 1.8134109667963114, + "learning_rate": 1.789181858674026e-05, + "loss": 0.8336, + "step": 1186 + }, + { + "epoch": 0.7029908202546639, + "grad_norm": 3.3699318930231392, + "learning_rate": 1.7887888961436367e-05, + "loss": 0.837, + "step": 1187 + }, + { + "epoch": 0.7035830618892508, + "grad_norm": 3.5279273185363933, + "learning_rate": 1.788395610954233e-05, + "loss": 0.8584, + "step": 1188 + }, + { + "epoch": 0.7041753035238377, + "grad_norm": 10.545341229489324, + "learning_rate": 1.7880020032666906e-05, + "loss": 0.8366, + "step": 1189 + }, + { + "epoch": 0.7047675451584247, + "grad_norm": 1.8317791616806671, + "learning_rate": 1.7876080732420176e-05, + "loss": 0.8156, + "step": 1190 + }, + { + "epoch": 0.7053597867930116, + "grad_norm": 2.8166796146116635, + "learning_rate": 1.7872138210413533e-05, + "loss": 0.8591, + "step": 1191 + }, + { + "epoch": 0.7059520284275984, + "grad_norm": 2.3506890987289837, + "learning_rate": 1.7868192468259686e-05, + "loss": 0.8696, + "step": 1192 + }, + { + "epoch": 0.7065442700621853, + "grad_norm": 1.9915077835053718, + "learning_rate": 1.7864243507572678e-05, + "loss": 0.8078, + "step": 1193 + }, + { + "epoch": 0.7071365116967723, + "grad_norm": 2.4789943921714728, + "learning_rate": 1.7860291329967842e-05, + "loss": 0.8549, + "step": 1194 + }, + { + "epoch": 0.7077287533313592, + "grad_norm": 3.5203512739721114, + "learning_rate": 1.7856335937061843e-05, + "loss": 0.8259, + "step": 1195 + }, + { + "epoch": 0.7083209949659461, + "grad_norm": 2.7626514330217207, + "learning_rate": 1.7852377330472668e-05, + "loss": 0.8024, + "step": 1196 + }, + { + "epoch": 0.7089132366005331, + "grad_norm": 2.9434175694356286, + "learning_rate": 1.7848415511819602e-05, + "loss": 0.8561, + "step": 1197 + }, + { + "epoch": 0.7095054782351199, + "grad_norm": 2.938160794113487, + "learning_rate": 1.7844450482723258e-05, + "loss": 0.8348, + "step": 1198 + }, + { + "epoch": 0.7100977198697068, + "grad_norm": 7.531717951600264, + "learning_rate": 1.7840482244805546e-05, + "loss": 0.8257, + "step": 1199 + }, + { + "epoch": 0.7106899615042938, + "grad_norm": 2.5108516098902345, + "learning_rate": 1.783651079968971e-05, + "loss": 0.8359, + "step": 1200 + }, + { + "epoch": 0.7112822031388807, + "grad_norm": 6.9259734842190515, + "learning_rate": 1.7832536149000283e-05, + "loss": 0.8493, + "step": 1201 + }, + { + "epoch": 0.7118744447734676, + "grad_norm": 3.604438919377138, + "learning_rate": 1.782855829436313e-05, + "loss": 0.9005, + "step": 1202 + }, + { + "epoch": 0.7124666864080544, + "grad_norm": 6.432790286739704, + "learning_rate": 1.782457723740541e-05, + "loss": 0.8502, + "step": 1203 + }, + { + "epoch": 0.7130589280426414, + "grad_norm": 2.309337652505556, + "learning_rate": 1.7820592979755605e-05, + "loss": 0.8378, + "step": 1204 + }, + { + "epoch": 0.7136511696772283, + "grad_norm": 4.86010637400942, + "learning_rate": 1.78166055230435e-05, + "loss": 0.8571, + "step": 1205 + }, + { + "epoch": 0.7142434113118152, + "grad_norm": 2.4643862263555003, + "learning_rate": 1.7812614868900185e-05, + "loss": 0.8026, + "step": 1206 + }, + { + "epoch": 0.7148356529464022, + "grad_norm": 4.488518458955996, + "learning_rate": 1.7808621018958063e-05, + "loss": 0.8474, + "step": 1207 + }, + { + "epoch": 0.7154278945809891, + "grad_norm": 2.142014115136399, + "learning_rate": 1.7804623974850844e-05, + "loss": 0.8327, + "step": 1208 + }, + { + "epoch": 0.7160201362155759, + "grad_norm": 3.059145127668178, + "learning_rate": 1.7800623738213544e-05, + "loss": 0.8184, + "step": 1209 + }, + { + "epoch": 0.7166123778501629, + "grad_norm": 3.1694873538825927, + "learning_rate": 1.779662031068249e-05, + "loss": 0.8609, + "step": 1210 + }, + { + "epoch": 0.7172046194847498, + "grad_norm": 1.8086628702754877, + "learning_rate": 1.7792613693895298e-05, + "loss": 0.8932, + "step": 1211 + }, + { + "epoch": 0.7177968611193367, + "grad_norm": 3.4357386272756254, + "learning_rate": 1.7788603889490907e-05, + "loss": 0.825, + "step": 1212 + }, + { + "epoch": 0.7183891027539236, + "grad_norm": 5.825010332449306, + "learning_rate": 1.7784590899109554e-05, + "loss": 0.8121, + "step": 1213 + }, + { + "epoch": 0.7189813443885105, + "grad_norm": 4.868145531572263, + "learning_rate": 1.778057472439277e-05, + "loss": 0.8699, + "step": 1214 + }, + { + "epoch": 0.7195735860230974, + "grad_norm": 2.4243315664795104, + "learning_rate": 1.7776555366983403e-05, + "loss": 0.8094, + "step": 1215 + }, + { + "epoch": 0.7201658276576843, + "grad_norm": 2.2560480044043554, + "learning_rate": 1.7772532828525593e-05, + "loss": 0.8866, + "step": 1216 + }, + { + "epoch": 0.7207580692922713, + "grad_norm": 2.7080185493295432, + "learning_rate": 1.7768507110664787e-05, + "loss": 0.8779, + "step": 1217 + }, + { + "epoch": 0.7213503109268582, + "grad_norm": 2.1434635867346246, + "learning_rate": 1.7764478215047725e-05, + "loss": 0.8646, + "step": 1218 + }, + { + "epoch": 0.721942552561445, + "grad_norm": 3.651352165498139, + "learning_rate": 1.776044614332246e-05, + "loss": 0.8161, + "step": 1219 + }, + { + "epoch": 0.722534794196032, + "grad_norm": 3.1506750177909093, + "learning_rate": 1.7756410897138326e-05, + "loss": 0.8791, + "step": 1220 + }, + { + "epoch": 0.7231270358306189, + "grad_norm": 3.4301185321356424, + "learning_rate": 1.7752372478145975e-05, + "loss": 0.8843, + "step": 1221 + }, + { + "epoch": 0.7237192774652058, + "grad_norm": 2.985509123858241, + "learning_rate": 1.7748330887997344e-05, + "loss": 0.8521, + "step": 1222 + }, + { + "epoch": 0.7243115190997927, + "grad_norm": 2.1395675990186964, + "learning_rate": 1.774428612834567e-05, + "loss": 0.8504, + "step": 1223 + }, + { + "epoch": 0.7249037607343797, + "grad_norm": 1.8208520277656095, + "learning_rate": 1.7740238200845485e-05, + "loss": 0.8359, + "step": 1224 + }, + { + "epoch": 0.7254960023689665, + "grad_norm": 2.5196284085738583, + "learning_rate": 1.773618710715262e-05, + "loss": 0.8758, + "step": 1225 + }, + { + "epoch": 0.7260882440035534, + "grad_norm": 1.9151106256797734, + "learning_rate": 1.7732132848924206e-05, + "loss": 0.8402, + "step": 1226 + }, + { + "epoch": 0.7266804856381404, + "grad_norm": 2.8093175656971927, + "learning_rate": 1.7728075427818658e-05, + "loss": 0.8215, + "step": 1227 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 3.506653025025842, + "learning_rate": 1.7724014845495684e-05, + "loss": 0.8823, + "step": 1228 + }, + { + "epoch": 0.7278649689073142, + "grad_norm": 3.033233001673288, + "learning_rate": 1.77199511036163e-05, + "loss": 0.8542, + "step": 1229 + }, + { + "epoch": 0.728457210541901, + "grad_norm": 1.474917032531982, + "learning_rate": 1.77158842038428e-05, + "loss": 0.8106, + "step": 1230 + }, + { + "epoch": 0.729049452176488, + "grad_norm": 5.616912789611092, + "learning_rate": 1.7711814147838776e-05, + "loss": 0.8468, + "step": 1231 + }, + { + "epoch": 0.7296416938110749, + "grad_norm": 2.2432187424799217, + "learning_rate": 1.7707740937269108e-05, + "loss": 0.7949, + "step": 1232 + }, + { + "epoch": 0.7302339354456618, + "grad_norm": 3.5975642635267424, + "learning_rate": 1.770366457379997e-05, + "loss": 0.8752, + "step": 1233 + }, + { + "epoch": 0.7308261770802488, + "grad_norm": 2.0011627880439096, + "learning_rate": 1.769958505909882e-05, + "loss": 0.8102, + "step": 1234 + }, + { + "epoch": 0.7314184187148357, + "grad_norm": 2.8280112394549604, + "learning_rate": 1.7695502394834416e-05, + "loss": 0.8692, + "step": 1235 + }, + { + "epoch": 0.7320106603494225, + "grad_norm": 5.669137853206307, + "learning_rate": 1.7691416582676792e-05, + "loss": 0.8747, + "step": 1236 + }, + { + "epoch": 0.7326029019840095, + "grad_norm": 2.1063941357764935, + "learning_rate": 1.768732762429728e-05, + "loss": 0.7983, + "step": 1237 + }, + { + "epoch": 0.7331951436185964, + "grad_norm": 2.4339492898585253, + "learning_rate": 1.7683235521368484e-05, + "loss": 0.8579, + "step": 1238 + }, + { + "epoch": 0.7337873852531833, + "grad_norm": 3.432527823235171, + "learning_rate": 1.7679140275564315e-05, + "loss": 0.814, + "step": 1239 + }, + { + "epoch": 0.7343796268877703, + "grad_norm": 2.4768439625101286, + "learning_rate": 1.7675041888559952e-05, + "loss": 0.8373, + "step": 1240 + }, + { + "epoch": 0.7349718685223571, + "grad_norm": 3.360300015627485, + "learning_rate": 1.767094036203187e-05, + "loss": 0.8396, + "step": 1241 + }, + { + "epoch": 0.735564110156944, + "grad_norm": 3.705948689891925, + "learning_rate": 1.7666835697657824e-05, + "loss": 0.8773, + "step": 1242 + }, + { + "epoch": 0.7361563517915309, + "grad_norm": 2.29301370449494, + "learning_rate": 1.7662727897116843e-05, + "loss": 0.8446, + "step": 1243 + }, + { + "epoch": 0.7367485934261179, + "grad_norm": 5.029628549833236, + "learning_rate": 1.7658616962089262e-05, + "loss": 0.7834, + "step": 1244 + }, + { + "epoch": 0.7373408350607048, + "grad_norm": 3.1057311683988056, + "learning_rate": 1.765450289425668e-05, + "loss": 0.832, + "step": 1245 + }, + { + "epoch": 0.7379330766952917, + "grad_norm": 3.097561035378791, + "learning_rate": 1.765038569530198e-05, + "loss": 0.8543, + "step": 1246 + }, + { + "epoch": 0.7385253183298786, + "grad_norm": 3.034406664943384, + "learning_rate": 1.7646265366909324e-05, + "loss": 0.8386, + "step": 1247 + }, + { + "epoch": 0.7391175599644655, + "grad_norm": 2.6367359298667985, + "learning_rate": 1.7642141910764164e-05, + "loss": 0.8551, + "step": 1248 + }, + { + "epoch": 0.7397098015990524, + "grad_norm": 2.3375256950022014, + "learning_rate": 1.763801532855323e-05, + "loss": 0.8636, + "step": 1249 + }, + { + "epoch": 0.7403020432336394, + "grad_norm": 2.518186118551617, + "learning_rate": 1.7633885621964516e-05, + "loss": 0.8382, + "step": 1250 + }, + { + "epoch": 0.7408942848682263, + "grad_norm": 2.8355859671570003, + "learning_rate": 1.762975279268731e-05, + "loss": 0.8467, + "step": 1251 + }, + { + "epoch": 0.7414865265028131, + "grad_norm": 3.038664124903949, + "learning_rate": 1.7625616842412166e-05, + "loss": 0.8035, + "step": 1252 + }, + { + "epoch": 0.7420787681374, + "grad_norm": 3.4648343011990033, + "learning_rate": 1.7621477772830927e-05, + "loss": 0.8167, + "step": 1253 + }, + { + "epoch": 0.742671009771987, + "grad_norm": 2.5985099036690764, + "learning_rate": 1.76173355856367e-05, + "loss": 0.852, + "step": 1254 + }, + { + "epoch": 0.7432632514065739, + "grad_norm": 2.043804821692882, + "learning_rate": 1.761319028252388e-05, + "loss": 0.8029, + "step": 1255 + }, + { + "epoch": 0.7438554930411608, + "grad_norm": 3.499822034990886, + "learning_rate": 1.7609041865188122e-05, + "loss": 0.8653, + "step": 1256 + }, + { + "epoch": 0.7444477346757477, + "grad_norm": 2.351751327143647, + "learning_rate": 1.7604890335326362e-05, + "loss": 0.8544, + "step": 1257 + }, + { + "epoch": 0.7450399763103346, + "grad_norm": 1.989760304041313, + "learning_rate": 1.7600735694636814e-05, + "loss": 0.809, + "step": 1258 + }, + { + "epoch": 0.7456322179449215, + "grad_norm": 3.2403502877273676, + "learning_rate": 1.7596577944818954e-05, + "loss": 0.8615, + "step": 1259 + }, + { + "epoch": 0.7462244595795084, + "grad_norm": 1.5631166920770456, + "learning_rate": 1.759241708757354e-05, + "loss": 0.8369, + "step": 1260 + }, + { + "epoch": 0.7468167012140954, + "grad_norm": 5.042322573986611, + "learning_rate": 1.7588253124602596e-05, + "loss": 0.7944, + "step": 1261 + }, + { + "epoch": 0.7474089428486823, + "grad_norm": 4.732786628166864, + "learning_rate": 1.7584086057609413e-05, + "loss": 0.8202, + "step": 1262 + }, + { + "epoch": 0.7480011844832691, + "grad_norm": 2.275400890981689, + "learning_rate": 1.757991588829856e-05, + "loss": 0.8174, + "step": 1263 + }, + { + "epoch": 0.7485934261178561, + "grad_norm": 2.669086920815046, + "learning_rate": 1.757574261837587e-05, + "loss": 0.867, + "step": 1264 + }, + { + "epoch": 0.749185667752443, + "grad_norm": 2.6052774063562865, + "learning_rate": 1.7571566249548446e-05, + "loss": 0.8538, + "step": 1265 + }, + { + "epoch": 0.7497779093870299, + "grad_norm": 2.405180495723794, + "learning_rate": 1.7567386783524655e-05, + "loss": 0.8009, + "step": 1266 + }, + { + "epoch": 0.7503701510216169, + "grad_norm": 3.3676019565827886, + "learning_rate": 1.756320422201413e-05, + "loss": 0.8042, + "step": 1267 + }, + { + "epoch": 0.7509623926562037, + "grad_norm": 2.966741525614473, + "learning_rate": 1.7559018566727788e-05, + "loss": 0.8026, + "step": 1268 + }, + { + "epoch": 0.7515546342907906, + "grad_norm": 3.2543032515193993, + "learning_rate": 1.755482981937778e-05, + "loss": 0.8627, + "step": 1269 + }, + { + "epoch": 0.7521468759253775, + "grad_norm": 2.0946507733084165, + "learning_rate": 1.755063798167755e-05, + "loss": 0.8687, + "step": 1270 + }, + { + "epoch": 0.7527391175599645, + "grad_norm": 1.3831223141092206, + "learning_rate": 1.754644305534179e-05, + "loss": 0.8339, + "step": 1271 + }, + { + "epoch": 0.7533313591945514, + "grad_norm": 2.0665293112108465, + "learning_rate": 1.754224504208647e-05, + "loss": 0.7964, + "step": 1272 + }, + { + "epoch": 0.7539236008291383, + "grad_norm": 13.182638808399124, + "learning_rate": 1.7538043943628803e-05, + "loss": 0.851, + "step": 1273 + }, + { + "epoch": 0.7545158424637252, + "grad_norm": 1.7213978689349443, + "learning_rate": 1.7533839761687278e-05, + "loss": 0.848, + "step": 1274 + }, + { + "epoch": 0.7551080840983121, + "grad_norm": 4.2008316501367435, + "learning_rate": 1.7529632497981644e-05, + "loss": 0.7775, + "step": 1275 + }, + { + "epoch": 0.755700325732899, + "grad_norm": 1.7693985409902604, + "learning_rate": 1.7525422154232906e-05, + "loss": 0.8109, + "step": 1276 + }, + { + "epoch": 0.756292567367486, + "grad_norm": 2.2297535857353155, + "learning_rate": 1.752120873216333e-05, + "loss": 0.9018, + "step": 1277 + }, + { + "epoch": 0.7568848090020729, + "grad_norm": 2.7453149628984215, + "learning_rate": 1.7516992233496443e-05, + "loss": 0.8978, + "step": 1278 + }, + { + "epoch": 0.7574770506366597, + "grad_norm": 1.6904372827667524, + "learning_rate": 1.7512772659957037e-05, + "loss": 0.875, + "step": 1279 + }, + { + "epoch": 0.7580692922712466, + "grad_norm": 2.943908693131239, + "learning_rate": 1.7508550013271146e-05, + "loss": 0.8365, + "step": 1280 + }, + { + "epoch": 0.7586615339058336, + "grad_norm": 2.5437925239618684, + "learning_rate": 1.7504324295166073e-05, + "loss": 0.8528, + "step": 1281 + }, + { + "epoch": 0.7592537755404205, + "grad_norm": 1.6253829546363479, + "learning_rate": 1.7500095507370376e-05, + "loss": 0.8533, + "step": 1282 + }, + { + "epoch": 0.7598460171750074, + "grad_norm": 2.534930764133692, + "learning_rate": 1.7495863651613866e-05, + "loss": 0.8562, + "step": 1283 + }, + { + "epoch": 0.7604382588095944, + "grad_norm": 1.6060087765012423, + "learning_rate": 1.749162872962761e-05, + "loss": 0.8215, + "step": 1284 + }, + { + "epoch": 0.7610305004441812, + "grad_norm": 1.7368003536795598, + "learning_rate": 1.7487390743143927e-05, + "loss": 0.8376, + "step": 1285 + }, + { + "epoch": 0.7616227420787681, + "grad_norm": 3.382650140379708, + "learning_rate": 1.7483149693896396e-05, + "loss": 0.8282, + "step": 1286 + }, + { + "epoch": 0.762214983713355, + "grad_norm": 3.2394560487893074, + "learning_rate": 1.747890558361984e-05, + "loss": 0.8334, + "step": 1287 + }, + { + "epoch": 0.762807225347942, + "grad_norm": 2.1963133524390295, + "learning_rate": 1.7474658414050344e-05, + "loss": 0.8512, + "step": 1288 + }, + { + "epoch": 0.7633994669825289, + "grad_norm": 2.7546045395326986, + "learning_rate": 1.7470408186925233e-05, + "loss": 0.846, + "step": 1289 + }, + { + "epoch": 0.7639917086171157, + "grad_norm": 2.4453730617259004, + "learning_rate": 1.7466154903983092e-05, + "loss": 0.8403, + "step": 1290 + }, + { + "epoch": 0.7645839502517027, + "grad_norm": 3.391610937118782, + "learning_rate": 1.7461898566963754e-05, + "loss": 0.844, + "step": 1291 + }, + { + "epoch": 0.7651761918862896, + "grad_norm": 2.9168088797780345, + "learning_rate": 1.74576391776083e-05, + "loss": 0.8823, + "step": 1292 + }, + { + "epoch": 0.7657684335208765, + "grad_norm": 2.199990393304276, + "learning_rate": 1.745337673765906e-05, + "loss": 0.8407, + "step": 1293 + }, + { + "epoch": 0.7663606751554635, + "grad_norm": 1.9287995776670166, + "learning_rate": 1.744911124885961e-05, + "loss": 0.8179, + "step": 1294 + }, + { + "epoch": 0.7669529167900503, + "grad_norm": 2.2102693326836964, + "learning_rate": 1.7444842712954778e-05, + "loss": 0.7751, + "step": 1295 + }, + { + "epoch": 0.7675451584246372, + "grad_norm": 2.388498039600017, + "learning_rate": 1.7440571131690626e-05, + "loss": 0.8686, + "step": 1296 + }, + { + "epoch": 0.7681374000592242, + "grad_norm": 2.6036812285499322, + "learning_rate": 1.7436296506814483e-05, + "loss": 0.8603, + "step": 1297 + }, + { + "epoch": 0.7687296416938111, + "grad_norm": 2.9525736440709425, + "learning_rate": 1.7432018840074905e-05, + "loss": 0.8569, + "step": 1298 + }, + { + "epoch": 0.769321883328398, + "grad_norm": 2.301471909050177, + "learning_rate": 1.7427738133221694e-05, + "loss": 0.8967, + "step": 1299 + }, + { + "epoch": 0.769914124962985, + "grad_norm": 2.399067499647489, + "learning_rate": 1.742345438800591e-05, + "loss": 0.886, + "step": 1300 + }, + { + "epoch": 0.7705063665975718, + "grad_norm": 1.6895013555018883, + "learning_rate": 1.7419167606179837e-05, + "loss": 0.8087, + "step": 1301 + }, + { + "epoch": 0.7710986082321587, + "grad_norm": 3.155550562789463, + "learning_rate": 1.7414877789497017e-05, + "loss": 0.8124, + "step": 1302 + }, + { + "epoch": 0.7716908498667456, + "grad_norm": 4.009429752899135, + "learning_rate": 1.741058493971222e-05, + "loss": 0.835, + "step": 1303 + }, + { + "epoch": 0.7722830915013326, + "grad_norm": 2.7343050132154483, + "learning_rate": 1.7406289058581466e-05, + "loss": 0.8687, + "step": 1304 + }, + { + "epoch": 0.7728753331359195, + "grad_norm": 2.511148964673554, + "learning_rate": 1.7401990147862008e-05, + "loss": 0.8697, + "step": 1305 + }, + { + "epoch": 0.7734675747705063, + "grad_norm": 1.6005516598974556, + "learning_rate": 1.739768820931235e-05, + "loss": 0.8558, + "step": 1306 + }, + { + "epoch": 0.7740598164050932, + "grad_norm": 5.650942014927322, + "learning_rate": 1.7393383244692218e-05, + "loss": 0.8184, + "step": 1307 + }, + { + "epoch": 0.7746520580396802, + "grad_norm": 3.5550381737938634, + "learning_rate": 1.7389075255762592e-05, + "loss": 0.8144, + "step": 1308 + }, + { + "epoch": 0.7752442996742671, + "grad_norm": 2.764917013421342, + "learning_rate": 1.738476424428568e-05, + "loss": 0.8719, + "step": 1309 + }, + { + "epoch": 0.775836541308854, + "grad_norm": 3.587759059350319, + "learning_rate": 1.7380450212024924e-05, + "loss": 0.861, + "step": 1310 + }, + { + "epoch": 0.776428782943441, + "grad_norm": 2.253441060117673, + "learning_rate": 1.737613316074501e-05, + "loss": 0.8321, + "step": 1311 + }, + { + "epoch": 0.7770210245780278, + "grad_norm": 1.4280085430434677, + "learning_rate": 1.737181309221185e-05, + "loss": 0.8575, + "step": 1312 + }, + { + "epoch": 0.7776132662126147, + "grad_norm": 2.035397944647251, + "learning_rate": 1.73674900081926e-05, + "loss": 0.8998, + "step": 1313 + }, + { + "epoch": 0.7782055078472017, + "grad_norm": 1.519998519535366, + "learning_rate": 1.7363163910455646e-05, + "loss": 0.7885, + "step": 1314 + }, + { + "epoch": 0.7787977494817886, + "grad_norm": 3.650699183495285, + "learning_rate": 1.73588348007706e-05, + "loss": 0.8609, + "step": 1315 + }, + { + "epoch": 0.7793899911163755, + "grad_norm": 5.6510846760274775, + "learning_rate": 1.735450268090831e-05, + "loss": 0.8084, + "step": 1316 + }, + { + "epoch": 0.7799822327509623, + "grad_norm": 2.09775440341691, + "learning_rate": 1.735016755264086e-05, + "loss": 0.8738, + "step": 1317 + }, + { + "epoch": 0.7805744743855493, + "grad_norm": 1.931648307541478, + "learning_rate": 1.7345829417741564e-05, + "loss": 0.815, + "step": 1318 + }, + { + "epoch": 0.7811667160201362, + "grad_norm": 3.1720176685267183, + "learning_rate": 1.734148827798496e-05, + "loss": 0.8635, + "step": 1319 + }, + { + "epoch": 0.7817589576547231, + "grad_norm": 3.202514276793904, + "learning_rate": 1.7337144135146818e-05, + "loss": 0.8523, + "step": 1320 + }, + { + "epoch": 0.7823511992893101, + "grad_norm": 2.3467580376278767, + "learning_rate": 1.7332796991004137e-05, + "loss": 0.8929, + "step": 1321 + }, + { + "epoch": 0.782943440923897, + "grad_norm": 1.7193074019861174, + "learning_rate": 1.7328446847335142e-05, + "loss": 0.8243, + "step": 1322 + }, + { + "epoch": 0.7835356825584838, + "grad_norm": 1.93789230603662, + "learning_rate": 1.7324093705919288e-05, + "loss": 0.8041, + "step": 1323 + }, + { + "epoch": 0.7841279241930708, + "grad_norm": 2.813798090440131, + "learning_rate": 1.731973756853726e-05, + "loss": 0.8083, + "step": 1324 + }, + { + "epoch": 0.7847201658276577, + "grad_norm": 5.447428960254784, + "learning_rate": 1.7315378436970952e-05, + "loss": 0.8272, + "step": 1325 + }, + { + "epoch": 0.7853124074622446, + "grad_norm": 1.5252289352217188, + "learning_rate": 1.73110163130035e-05, + "loss": 0.9191, + "step": 1326 + }, + { + "epoch": 0.7859046490968316, + "grad_norm": 4.098459400505394, + "learning_rate": 1.730665119841926e-05, + "loss": 0.8249, + "step": 1327 + }, + { + "epoch": 0.7864968907314184, + "grad_norm": 1.3507732532666377, + "learning_rate": 1.7302283095003807e-05, + "loss": 0.8448, + "step": 1328 + }, + { + "epoch": 0.7870891323660053, + "grad_norm": 1.8647038075327456, + "learning_rate": 1.729791200454394e-05, + "loss": 0.8271, + "step": 1329 + }, + { + "epoch": 0.7876813740005922, + "grad_norm": 6.007731309930798, + "learning_rate": 1.729353792882768e-05, + "loss": 0.8679, + "step": 1330 + }, + { + "epoch": 0.7882736156351792, + "grad_norm": 1.730951437129075, + "learning_rate": 1.7289160869644273e-05, + "loss": 0.8049, + "step": 1331 + }, + { + "epoch": 0.7888658572697661, + "grad_norm": 2.5126410391090825, + "learning_rate": 1.728478082878418e-05, + "loss": 0.854, + "step": 1332 + }, + { + "epoch": 0.7894580989043529, + "grad_norm": 17.97525643692378, + "learning_rate": 1.7280397808039087e-05, + "loss": 0.8243, + "step": 1333 + }, + { + "epoch": 0.7900503405389399, + "grad_norm": 3.5691587966526512, + "learning_rate": 1.7276011809201896e-05, + "loss": 0.8118, + "step": 1334 + }, + { + "epoch": 0.7906425821735268, + "grad_norm": 1.8559040171020436, + "learning_rate": 1.7271622834066722e-05, + "loss": 0.8886, + "step": 1335 + }, + { + "epoch": 0.7912348238081137, + "grad_norm": 1.6994200843830343, + "learning_rate": 1.7267230884428905e-05, + "loss": 0.8215, + "step": 1336 + }, + { + "epoch": 0.7918270654427007, + "grad_norm": 2.6015274500336343, + "learning_rate": 1.7262835962085e-05, + "loss": 0.7956, + "step": 1337 + }, + { + "epoch": 0.7924193070772876, + "grad_norm": 1.8824140694819236, + "learning_rate": 1.725843806883278e-05, + "loss": 0.8642, + "step": 1338 + }, + { + "epoch": 0.7930115487118744, + "grad_norm": 2.14510961510416, + "learning_rate": 1.7254037206471226e-05, + "loss": 0.7958, + "step": 1339 + }, + { + "epoch": 0.7936037903464613, + "grad_norm": 1.978972467508621, + "learning_rate": 1.7249633376800542e-05, + "loss": 0.7886, + "step": 1340 + }, + { + "epoch": 0.7941960319810483, + "grad_norm": 1.797453382477477, + "learning_rate": 1.724522658162214e-05, + "loss": 0.8117, + "step": 1341 + }, + { + "epoch": 0.7947882736156352, + "grad_norm": 1.9495032921229507, + "learning_rate": 1.7240816822738646e-05, + "loss": 0.7986, + "step": 1342 + }, + { + "epoch": 0.7953805152502221, + "grad_norm": 2.420632731466788, + "learning_rate": 1.72364041019539e-05, + "loss": 0.8428, + "step": 1343 + }, + { + "epoch": 0.795972756884809, + "grad_norm": 1.430427148223359, + "learning_rate": 1.7231988421072957e-05, + "loss": 0.8502, + "step": 1344 + }, + { + "epoch": 0.7965649985193959, + "grad_norm": 2.0260574149321755, + "learning_rate": 1.7227569781902073e-05, + "loss": 0.819, + "step": 1345 + }, + { + "epoch": 0.7971572401539828, + "grad_norm": 1.8930044657278047, + "learning_rate": 1.722314818624872e-05, + "loss": 0.8378, + "step": 1346 + }, + { + "epoch": 0.7977494817885697, + "grad_norm": 1.368043792921863, + "learning_rate": 1.7218723635921587e-05, + "loss": 0.8363, + "step": 1347 + }, + { + "epoch": 0.7983417234231567, + "grad_norm": 1.6583322562133502, + "learning_rate": 1.7214296132730555e-05, + "loss": 0.8587, + "step": 1348 + }, + { + "epoch": 0.7989339650577436, + "grad_norm": 2.6952971468399842, + "learning_rate": 1.7209865678486727e-05, + "loss": 0.8388, + "step": 1349 + }, + { + "epoch": 0.7995262066923304, + "grad_norm": 1.603441854661535, + "learning_rate": 1.7205432275002403e-05, + "loss": 0.8303, + "step": 1350 + }, + { + "epoch": 0.8001184483269174, + "grad_norm": 1.3660414196014246, + "learning_rate": 1.7200995924091102e-05, + "loss": 0.8137, + "step": 1351 + }, + { + "epoch": 0.8007106899615043, + "grad_norm": 1.3428573586009116, + "learning_rate": 1.719655662756753e-05, + "loss": 0.8324, + "step": 1352 + }, + { + "epoch": 0.8013029315960912, + "grad_norm": 3.4551970616956185, + "learning_rate": 1.719211438724762e-05, + "loss": 0.8495, + "step": 1353 + }, + { + "epoch": 0.8018951732306782, + "grad_norm": 3.699225184360079, + "learning_rate": 1.7187669204948495e-05, + "loss": 0.8617, + "step": 1354 + }, + { + "epoch": 0.802487414865265, + "grad_norm": 1.6762625181042976, + "learning_rate": 1.718322108248848e-05, + "loss": 0.8473, + "step": 1355 + }, + { + "epoch": 0.8030796564998519, + "grad_norm": 1.3717529222149383, + "learning_rate": 1.7178770021687113e-05, + "loss": 0.834, + "step": 1356 + }, + { + "epoch": 0.8036718981344388, + "grad_norm": 2.315814133619738, + "learning_rate": 1.7174316024365123e-05, + "loss": 0.8763, + "step": 1357 + }, + { + "epoch": 0.8042641397690258, + "grad_norm": 4.886422808842332, + "learning_rate": 1.7169859092344448e-05, + "loss": 0.8444, + "step": 1358 + }, + { + "epoch": 0.8048563814036127, + "grad_norm": 1.7525565651645565, + "learning_rate": 1.7165399227448222e-05, + "loss": 0.8134, + "step": 1359 + }, + { + "epoch": 0.8054486230381996, + "grad_norm": 2.806549546737031, + "learning_rate": 1.7160936431500785e-05, + "loss": 0.9172, + "step": 1360 + }, + { + "epoch": 0.8060408646727865, + "grad_norm": 1.5747965592763717, + "learning_rate": 1.7156470706327665e-05, + "loss": 0.8433, + "step": 1361 + }, + { + "epoch": 0.8066331063073734, + "grad_norm": 2.128730321422943, + "learning_rate": 1.7152002053755604e-05, + "loss": 0.8361, + "step": 1362 + }, + { + "epoch": 0.8072253479419603, + "grad_norm": 2.327065092345198, + "learning_rate": 1.7147530475612524e-05, + "loss": 0.8628, + "step": 1363 + }, + { + "epoch": 0.8078175895765473, + "grad_norm": 2.380546899275701, + "learning_rate": 1.714305597372755e-05, + "loss": 0.7691, + "step": 1364 + }, + { + "epoch": 0.8084098312111342, + "grad_norm": 2.5637775941669063, + "learning_rate": 1.7138578549931013e-05, + "loss": 0.7957, + "step": 1365 + }, + { + "epoch": 0.809002072845721, + "grad_norm": 2.4241927150678175, + "learning_rate": 1.713409820605443e-05, + "loss": 0.8272, + "step": 1366 + }, + { + "epoch": 0.8095943144803079, + "grad_norm": 1.8511672871361389, + "learning_rate": 1.7129614943930505e-05, + "loss": 0.8062, + "step": 1367 + }, + { + "epoch": 0.8101865561148949, + "grad_norm": 5.52637450036081, + "learning_rate": 1.7125128765393157e-05, + "loss": 0.8065, + "step": 1368 + }, + { + "epoch": 0.8107787977494818, + "grad_norm": 1.5937260830934799, + "learning_rate": 1.7120639672277474e-05, + "loss": 0.8085, + "step": 1369 + }, + { + "epoch": 0.8113710393840687, + "grad_norm": 1.4639078667885639, + "learning_rate": 1.7116147666419755e-05, + "loss": 0.8631, + "step": 1370 + }, + { + "epoch": 0.8119632810186556, + "grad_norm": 2.4168601194929518, + "learning_rate": 1.7111652749657473e-05, + "loss": 0.816, + "step": 1371 + }, + { + "epoch": 0.8125555226532425, + "grad_norm": 7.289050587362146, + "learning_rate": 1.7107154923829317e-05, + "loss": 0.8525, + "step": 1372 + }, + { + "epoch": 0.8131477642878294, + "grad_norm": 2.360028584777416, + "learning_rate": 1.710265419077514e-05, + "loss": 0.8313, + "step": 1373 + }, + { + "epoch": 0.8137400059224164, + "grad_norm": 2.0945779721069457, + "learning_rate": 1.7098150552335997e-05, + "loss": 0.8624, + "step": 1374 + }, + { + "epoch": 0.8143322475570033, + "grad_norm": 3.0412982148200283, + "learning_rate": 1.709364401035413e-05, + "loss": 0.8574, + "step": 1375 + }, + { + "epoch": 0.8149244891915902, + "grad_norm": 1.5284655725565341, + "learning_rate": 1.708913456667297e-05, + "loss": 0.8844, + "step": 1376 + }, + { + "epoch": 0.815516730826177, + "grad_norm": 3.2308336643923368, + "learning_rate": 1.7084622223137128e-05, + "loss": 0.8091, + "step": 1377 + }, + { + "epoch": 0.816108972460764, + "grad_norm": 1.830650660023485, + "learning_rate": 1.7080106981592407e-05, + "loss": 0.8669, + "step": 1378 + }, + { + "epoch": 0.8167012140953509, + "grad_norm": 2.814611843926401, + "learning_rate": 1.70755888438858e-05, + "loss": 0.8626, + "step": 1379 + }, + { + "epoch": 0.8172934557299378, + "grad_norm": 2.183664883059378, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.8077, + "step": 1380 + }, + { + "epoch": 0.8178856973645248, + "grad_norm": 3.4013315528606594, + "learning_rate": 1.7066543887380793e-05, + "loss": 0.8089, + "step": 1381 + }, + { + "epoch": 0.8184779389991116, + "grad_norm": 2.299860689598534, + "learning_rate": 1.7062017072282285e-05, + "loss": 0.8607, + "step": 1382 + }, + { + "epoch": 0.8190701806336985, + "grad_norm": 2.268163956608553, + "learning_rate": 1.7057487368421685e-05, + "loss": 0.8744, + "step": 1383 + }, + { + "epoch": 0.8196624222682855, + "grad_norm": 3.0103061900949166, + "learning_rate": 1.705295477765188e-05, + "loss": 0.7831, + "step": 1384 + }, + { + "epoch": 0.8202546639028724, + "grad_norm": 1.9846928716505232, + "learning_rate": 1.7048419301826973e-05, + "loss": 0.8019, + "step": 1385 + }, + { + "epoch": 0.8208469055374593, + "grad_norm": 2.1436359754808434, + "learning_rate": 1.7043880942802212e-05, + "loss": 0.7726, + "step": 1386 + }, + { + "epoch": 0.8214391471720462, + "grad_norm": 15.99919570552301, + "learning_rate": 1.7039339702434057e-05, + "loss": 0.8581, + "step": 1387 + }, + { + "epoch": 0.8220313888066331, + "grad_norm": 1.9788780361165421, + "learning_rate": 1.7034795582580118e-05, + "loss": 0.8926, + "step": 1388 + }, + { + "epoch": 0.82262363044122, + "grad_norm": 1.9979870871622687, + "learning_rate": 1.70302485850992e-05, + "loss": 0.8445, + "step": 1389 + }, + { + "epoch": 0.8232158720758069, + "grad_norm": 3.3526689648805106, + "learning_rate": 1.7025698711851283e-05, + "loss": 0.8779, + "step": 1390 + }, + { + "epoch": 0.8238081137103939, + "grad_norm": 3.2663903535745553, + "learning_rate": 1.7021145964697518e-05, + "loss": 0.8397, + "step": 1391 + }, + { + "epoch": 0.8244003553449808, + "grad_norm": 2.4623605879732278, + "learning_rate": 1.701659034550023e-05, + "loss": 0.8613, + "step": 1392 + }, + { + "epoch": 0.8249925969795676, + "grad_norm": 4.055785244924466, + "learning_rate": 1.7012031856122936e-05, + "loss": 0.8329, + "step": 1393 + }, + { + "epoch": 0.8255848386141545, + "grad_norm": 2.546760379245732, + "learning_rate": 1.70074704984303e-05, + "loss": 0.8129, + "step": 1394 + }, + { + "epoch": 0.8261770802487415, + "grad_norm": 2.004660636750862, + "learning_rate": 1.7002906274288187e-05, + "loss": 0.8649, + "step": 1395 + }, + { + "epoch": 0.8267693218833284, + "grad_norm": 3.63927143622832, + "learning_rate": 1.6998339185563614e-05, + "loss": 0.8651, + "step": 1396 + }, + { + "epoch": 0.8273615635179153, + "grad_norm": 2.6389702902480354, + "learning_rate": 1.6993769234124777e-05, + "loss": 0.8868, + "step": 1397 + }, + { + "epoch": 0.8279538051525023, + "grad_norm": 3.2789514048802326, + "learning_rate": 1.6989196421841045e-05, + "loss": 0.8329, + "step": 1398 + }, + { + "epoch": 0.8285460467870891, + "grad_norm": 2.9698551323065248, + "learning_rate": 1.6984620750582957e-05, + "loss": 0.8506, + "step": 1399 + }, + { + "epoch": 0.829138288421676, + "grad_norm": 4.493208712376449, + "learning_rate": 1.6980042222222216e-05, + "loss": 0.8512, + "step": 1400 + }, + { + "epoch": 0.829730530056263, + "grad_norm": 3.2446475490162587, + "learning_rate": 1.69754608386317e-05, + "loss": 0.8283, + "step": 1401 + }, + { + "epoch": 0.8303227716908499, + "grad_norm": 3.8139641340470707, + "learning_rate": 1.697087660168545e-05, + "loss": 0.8369, + "step": 1402 + }, + { + "epoch": 0.8309150133254368, + "grad_norm": 2.537763008254762, + "learning_rate": 1.6966289513258683e-05, + "loss": 0.8677, + "step": 1403 + }, + { + "epoch": 0.8315072549600236, + "grad_norm": 2.97503941123126, + "learning_rate": 1.6961699575227767e-05, + "loss": 0.8321, + "step": 1404 + }, + { + "epoch": 0.8320994965946106, + "grad_norm": 5.054631443547838, + "learning_rate": 1.6957106789470252e-05, + "loss": 0.8819, + "step": 1405 + }, + { + "epoch": 0.8326917382291975, + "grad_norm": 3.2685313483268086, + "learning_rate": 1.695251115786484e-05, + "loss": 0.8571, + "step": 1406 + }, + { + "epoch": 0.8332839798637844, + "grad_norm": 2.6791989072294466, + "learning_rate": 1.6947912682291412e-05, + "loss": 0.82, + "step": 1407 + }, + { + "epoch": 0.8338762214983714, + "grad_norm": 6.388461734354429, + "learning_rate": 1.694331136463099e-05, + "loss": 0.8654, + "step": 1408 + }, + { + "epoch": 0.8344684631329582, + "grad_norm": 5.378746896229759, + "learning_rate": 1.6938707206765784e-05, + "loss": 0.8756, + "step": 1409 + }, + { + "epoch": 0.8350607047675451, + "grad_norm": 2.382113562740471, + "learning_rate": 1.6934100210579144e-05, + "loss": 0.8521, + "step": 1410 + }, + { + "epoch": 0.8356529464021321, + "grad_norm": 2.3110802302573936, + "learning_rate": 1.69294903779556e-05, + "loss": 0.8355, + "step": 1411 + }, + { + "epoch": 0.836245188036719, + "grad_norm": 2.217801291758983, + "learning_rate": 1.6924877710780818e-05, + "loss": 0.8798, + "step": 1412 + }, + { + "epoch": 0.8368374296713059, + "grad_norm": 2.0560538956481036, + "learning_rate": 1.6920262210941657e-05, + "loss": 0.8128, + "step": 1413 + }, + { + "epoch": 0.8374296713058929, + "grad_norm": 6.864186095594025, + "learning_rate": 1.69156438803261e-05, + "loss": 0.8533, + "step": 1414 + }, + { + "epoch": 0.8380219129404797, + "grad_norm": 2.136582758478113, + "learning_rate": 1.6911022720823315e-05, + "loss": 0.8236, + "step": 1415 + }, + { + "epoch": 0.8386141545750666, + "grad_norm": 2.499403080840201, + "learning_rate": 1.690639873432361e-05, + "loss": 0.8663, + "step": 1416 + }, + { + "epoch": 0.8392063962096535, + "grad_norm": 6.035882530883013, + "learning_rate": 1.6901771922718453e-05, + "loss": 0.8783, + "step": 1417 + }, + { + "epoch": 0.8397986378442405, + "grad_norm": 4.243343370936844, + "learning_rate": 1.6897142287900477e-05, + "loss": 0.8794, + "step": 1418 + }, + { + "epoch": 0.8403908794788274, + "grad_norm": 6.601930358453118, + "learning_rate": 1.6892509831763467e-05, + "loss": 0.8406, + "step": 1419 + }, + { + "epoch": 0.8409831211134142, + "grad_norm": 2.546863435034202, + "learning_rate": 1.6887874556202342e-05, + "loss": 0.8262, + "step": 1420 + }, + { + "epoch": 0.8415753627480012, + "grad_norm": 1.626447021048131, + "learning_rate": 1.6883236463113207e-05, + "loss": 0.8047, + "step": 1421 + }, + { + "epoch": 0.8421676043825881, + "grad_norm": 16.71050322079958, + "learning_rate": 1.687859555439329e-05, + "loss": 0.8308, + "step": 1422 + }, + { + "epoch": 0.842759846017175, + "grad_norm": 2.0493773260165353, + "learning_rate": 1.6873951831940993e-05, + "loss": 0.8338, + "step": 1423 + }, + { + "epoch": 0.843352087651762, + "grad_norm": 2.4162309237205988, + "learning_rate": 1.686930529765585e-05, + "loss": 0.8567, + "step": 1424 + }, + { + "epoch": 0.8439443292863489, + "grad_norm": 2.4878273609774753, + "learning_rate": 1.6864655953438563e-05, + "loss": 0.8029, + "step": 1425 + }, + { + "epoch": 0.8445365709209357, + "grad_norm": 3.2572445643608003, + "learning_rate": 1.6860003801190975e-05, + "loss": 0.8457, + "step": 1426 + }, + { + "epoch": 0.8451288125555226, + "grad_norm": 5.916107483028511, + "learning_rate": 1.6855348842816074e-05, + "loss": 0.8797, + "step": 1427 + }, + { + "epoch": 0.8457210541901096, + "grad_norm": 2.2527335241769166, + "learning_rate": 1.6850691080218e-05, + "loss": 0.8791, + "step": 1428 + }, + { + "epoch": 0.8463132958246965, + "grad_norm": 5.083451401380974, + "learning_rate": 1.6846030515302044e-05, + "loss": 0.8085, + "step": 1429 + }, + { + "epoch": 0.8469055374592834, + "grad_norm": 3.1992848879644495, + "learning_rate": 1.6841367149974638e-05, + "loss": 0.8713, + "step": 1430 + }, + { + "epoch": 0.8474977790938703, + "grad_norm": 2.1404065000354775, + "learning_rate": 1.6836700986143354e-05, + "loss": 0.8767, + "step": 1431 + }, + { + "epoch": 0.8480900207284572, + "grad_norm": 3.2502047486489327, + "learning_rate": 1.683203202571692e-05, + "loss": 0.825, + "step": 1432 + }, + { + "epoch": 0.8486822623630441, + "grad_norm": 4.062783383833649, + "learning_rate": 1.682736027060521e-05, + "loss": 0.7975, + "step": 1433 + }, + { + "epoch": 0.849274503997631, + "grad_norm": 3.1346761905120077, + "learning_rate": 1.6822685722719224e-05, + "loss": 0.8598, + "step": 1434 + }, + { + "epoch": 0.849866745632218, + "grad_norm": 4.335065021022113, + "learning_rate": 1.681800838397112e-05, + "loss": 0.7865, + "step": 1435 + }, + { + "epoch": 0.8504589872668049, + "grad_norm": 2.017703357099882, + "learning_rate": 1.681332825627419e-05, + "loss": 0.8488, + "step": 1436 + }, + { + "epoch": 0.8510512289013917, + "grad_norm": 4.753567787732221, + "learning_rate": 1.680864534154287e-05, + "loss": 0.8702, + "step": 1437 + }, + { + "epoch": 0.8516434705359787, + "grad_norm": 2.438375125937311, + "learning_rate": 1.680395964169274e-05, + "loss": 0.8571, + "step": 1438 + }, + { + "epoch": 0.8522357121705656, + "grad_norm": 2.9251937289705916, + "learning_rate": 1.6799271158640517e-05, + "loss": 0.822, + "step": 1439 + }, + { + "epoch": 0.8528279538051525, + "grad_norm": 2.2312157866256412, + "learning_rate": 1.6794579894304043e-05, + "loss": 0.8243, + "step": 1440 + }, + { + "epoch": 0.8534201954397395, + "grad_norm": 7.074372608081394, + "learning_rate": 1.678988585060231e-05, + "loss": 0.8624, + "step": 1441 + }, + { + "epoch": 0.8540124370743263, + "grad_norm": 2.7556705357562357, + "learning_rate": 1.6785189029455455e-05, + "loss": 0.8614, + "step": 1442 + }, + { + "epoch": 0.8546046787089132, + "grad_norm": 1.9746989976797515, + "learning_rate": 1.6780489432784738e-05, + "loss": 0.8229, + "step": 1443 + }, + { + "epoch": 0.8551969203435001, + "grad_norm": 1.9413368148573928, + "learning_rate": 1.6775787062512557e-05, + "loss": 0.8694, + "step": 1444 + }, + { + "epoch": 0.8557891619780871, + "grad_norm": 2.2060934638723206, + "learning_rate": 1.6771081920562445e-05, + "loss": 0.8598, + "step": 1445 + }, + { + "epoch": 0.856381403612674, + "grad_norm": 3.5024309364669444, + "learning_rate": 1.676637400885907e-05, + "loss": 0.8485, + "step": 1446 + }, + { + "epoch": 0.8569736452472608, + "grad_norm": 4.018519154408224, + "learning_rate": 1.676166332932824e-05, + "loss": 0.8383, + "step": 1447 + }, + { + "epoch": 0.8575658868818478, + "grad_norm": 1.7615281242175937, + "learning_rate": 1.6756949883896874e-05, + "loss": 0.8035, + "step": 1448 + }, + { + "epoch": 0.8581581285164347, + "grad_norm": 2.1966167616847403, + "learning_rate": 1.675223367449305e-05, + "loss": 0.831, + "step": 1449 + }, + { + "epoch": 0.8587503701510216, + "grad_norm": 2.6550583205151685, + "learning_rate": 1.6747514703045952e-05, + "loss": 0.8216, + "step": 1450 + }, + { + "epoch": 0.8593426117856086, + "grad_norm": 2.478511653719764, + "learning_rate": 1.6742792971485912e-05, + "loss": 0.8828, + "step": 1451 + }, + { + "epoch": 0.8599348534201955, + "grad_norm": 5.962171814245029, + "learning_rate": 1.673806848174438e-05, + "loss": 0.8454, + "step": 1452 + }, + { + "epoch": 0.8605270950547823, + "grad_norm": 2.083540364486756, + "learning_rate": 1.6733341235753938e-05, + "loss": 0.8842, + "step": 1453 + }, + { + "epoch": 0.8611193366893692, + "grad_norm": 2.063306619099545, + "learning_rate": 1.67286112354483e-05, + "loss": 0.8155, + "step": 1454 + }, + { + "epoch": 0.8617115783239562, + "grad_norm": 3.43994740810729, + "learning_rate": 1.6723878482762296e-05, + "loss": 0.8795, + "step": 1455 + }, + { + "epoch": 0.8623038199585431, + "grad_norm": 3.1123310742355312, + "learning_rate": 1.671914297963189e-05, + "loss": 0.8512, + "step": 1456 + }, + { + "epoch": 0.86289606159313, + "grad_norm": 3.1237187679340455, + "learning_rate": 1.671440472799417e-05, + "loss": 0.8313, + "step": 1457 + }, + { + "epoch": 0.8634883032277169, + "grad_norm": 2.450839707155073, + "learning_rate": 1.670966372978735e-05, + "loss": 0.8421, + "step": 1458 + }, + { + "epoch": 0.8640805448623038, + "grad_norm": 10.847110912043622, + "learning_rate": 1.6704919986950757e-05, + "loss": 0.9009, + "step": 1459 + }, + { + "epoch": 0.8646727864968907, + "grad_norm": 3.949302192305963, + "learning_rate": 1.670017350142486e-05, + "loss": 0.8754, + "step": 1460 + }, + { + "epoch": 0.8652650281314777, + "grad_norm": 1.8975667299053336, + "learning_rate": 1.6695424275151228e-05, + "loss": 0.819, + "step": 1461 + }, + { + "epoch": 0.8658572697660646, + "grad_norm": 3.3380137589905337, + "learning_rate": 1.669067231007256e-05, + "loss": 0.85, + "step": 1462 + }, + { + "epoch": 0.8664495114006515, + "grad_norm": 2.2925444313034133, + "learning_rate": 1.668591760813269e-05, + "loss": 0.8283, + "step": 1463 + }, + { + "epoch": 0.8670417530352383, + "grad_norm": 1.7424162303718678, + "learning_rate": 1.668116017127655e-05, + "loss": 0.8352, + "step": 1464 + }, + { + "epoch": 0.8676339946698253, + "grad_norm": 2.3289766280717425, + "learning_rate": 1.66764000014502e-05, + "loss": 0.8662, + "step": 1465 + }, + { + "epoch": 0.8682262363044122, + "grad_norm": 2.501537435831264, + "learning_rate": 1.667163710060082e-05, + "loss": 0.8126, + "step": 1466 + }, + { + "epoch": 0.8688184779389991, + "grad_norm": 2.9512357732950405, + "learning_rate": 1.6666871470676692e-05, + "loss": 0.8318, + "step": 1467 + }, + { + "epoch": 0.8694107195735861, + "grad_norm": 1.7783090866183635, + "learning_rate": 1.6662103113627246e-05, + "loss": 0.8499, + "step": 1468 + }, + { + "epoch": 0.8700029612081729, + "grad_norm": 2.5732417344764875, + "learning_rate": 1.6657332031402992e-05, + "loss": 0.8511, + "step": 1469 + }, + { + "epoch": 0.8705952028427598, + "grad_norm": 2.270239747001915, + "learning_rate": 1.6652558225955582e-05, + "loss": 0.8056, + "step": 1470 + }, + { + "epoch": 0.8711874444773468, + "grad_norm": 2.0703557879671104, + "learning_rate": 1.6647781699237765e-05, + "loss": 0.8341, + "step": 1471 + }, + { + "epoch": 0.8717796861119337, + "grad_norm": 3.563658846641916, + "learning_rate": 1.6643002453203405e-05, + "loss": 0.8619, + "step": 1472 + }, + { + "epoch": 0.8723719277465206, + "grad_norm": 2.8049555007770453, + "learning_rate": 1.6638220489807495e-05, + "loss": 0.8093, + "step": 1473 + }, + { + "epoch": 0.8729641693811075, + "grad_norm": 5.679515019474668, + "learning_rate": 1.6633435811006117e-05, + "loss": 0.8665, + "step": 1474 + }, + { + "epoch": 0.8735564110156944, + "grad_norm": 5.038477719863848, + "learning_rate": 1.6628648418756474e-05, + "loss": 0.8382, + "step": 1475 + }, + { + "epoch": 0.8741486526502813, + "grad_norm": 8.615254589030032, + "learning_rate": 1.662385831501688e-05, + "loss": 0.7959, + "step": 1476 + }, + { + "epoch": 0.8747408942848682, + "grad_norm": 8.49226125726425, + "learning_rate": 1.6619065501746762e-05, + "loss": 0.8569, + "step": 1477 + }, + { + "epoch": 0.8753331359194552, + "grad_norm": 2.216292664316629, + "learning_rate": 1.661426998090664e-05, + "loss": 0.8056, + "step": 1478 + }, + { + "epoch": 0.8759253775540421, + "grad_norm": 3.1706560681895177, + "learning_rate": 1.6609471754458163e-05, + "loss": 0.8421, + "step": 1479 + }, + { + "epoch": 0.8765176191886289, + "grad_norm": 2.596125359352298, + "learning_rate": 1.6604670824364067e-05, + "loss": 0.8084, + "step": 1480 + }, + { + "epoch": 0.8771098608232158, + "grad_norm": 3.599816985420332, + "learning_rate": 1.6599867192588207e-05, + "loss": 0.8211, + "step": 1481 + }, + { + "epoch": 0.8777021024578028, + "grad_norm": 5.1170671887832, + "learning_rate": 1.6595060861095534e-05, + "loss": 0.8538, + "step": 1482 + }, + { + "epoch": 0.8782943440923897, + "grad_norm": 3.911881349947541, + "learning_rate": 1.6590251831852113e-05, + "loss": 0.8509, + "step": 1483 + }, + { + "epoch": 0.8788865857269766, + "grad_norm": 3.6771517985702125, + "learning_rate": 1.6585440106825107e-05, + "loss": 0.7833, + "step": 1484 + }, + { + "epoch": 0.8794788273615635, + "grad_norm": 2.5698713996771105, + "learning_rate": 1.6580625687982776e-05, + "loss": 0.814, + "step": 1485 + }, + { + "epoch": 0.8800710689961504, + "grad_norm": 10.101577187451042, + "learning_rate": 1.6575808577294492e-05, + "loss": 0.819, + "step": 1486 + }, + { + "epoch": 0.8806633106307373, + "grad_norm": 2.2073990472582476, + "learning_rate": 1.657098877673073e-05, + "loss": 0.885, + "step": 1487 + }, + { + "epoch": 0.8812555522653243, + "grad_norm": 4.739398445699331, + "learning_rate": 1.6566166288263046e-05, + "loss": 0.8629, + "step": 1488 + }, + { + "epoch": 0.8818477938999112, + "grad_norm": 3.1974524924806107, + "learning_rate": 1.656134111386412e-05, + "loss": 0.8511, + "step": 1489 + }, + { + "epoch": 0.8824400355344981, + "grad_norm": 2.5663036406803608, + "learning_rate": 1.6556513255507714e-05, + "loss": 0.816, + "step": 1490 + }, + { + "epoch": 0.883032277169085, + "grad_norm": 1.9680923900191523, + "learning_rate": 1.65516827151687e-05, + "loss": 0.8843, + "step": 1491 + }, + { + "epoch": 0.8836245188036719, + "grad_norm": 2.414157535179823, + "learning_rate": 1.6546849494823037e-05, + "loss": 0.8753, + "step": 1492 + }, + { + "epoch": 0.8842167604382588, + "grad_norm": 2.4575703751091065, + "learning_rate": 1.654201359644778e-05, + "loss": 0.8654, + "step": 1493 + }, + { + "epoch": 0.8848090020728457, + "grad_norm": 2.992592460545843, + "learning_rate": 1.653717502202109e-05, + "loss": 0.8567, + "step": 1494 + }, + { + "epoch": 0.8854012437074327, + "grad_norm": 2.2703367582766525, + "learning_rate": 1.653233377352221e-05, + "loss": 0.8335, + "step": 1495 + }, + { + "epoch": 0.8859934853420195, + "grad_norm": 20.27278103196212, + "learning_rate": 1.652748985293149e-05, + "loss": 0.883, + "step": 1496 + }, + { + "epoch": 0.8865857269766064, + "grad_norm": 1.8516469130504767, + "learning_rate": 1.652264326223036e-05, + "loss": 0.7949, + "step": 1497 + }, + { + "epoch": 0.8871779686111934, + "grad_norm": 4.524574044226397, + "learning_rate": 1.6517794003401345e-05, + "loss": 0.8369, + "step": 1498 + }, + { + "epoch": 0.8877702102457803, + "grad_norm": 2.3066187010469865, + "learning_rate": 1.6512942078428072e-05, + "loss": 0.8298, + "step": 1499 + }, + { + "epoch": 0.8883624518803672, + "grad_norm": 3.9255893035367437, + "learning_rate": 1.650808748929525e-05, + "loss": 0.8436, + "step": 1500 + }, + { + "epoch": 0.8889546935149542, + "grad_norm": 2.475550455756812, + "learning_rate": 1.6503230237988676e-05, + "loss": 0.7807, + "step": 1501 + }, + { + "epoch": 0.889546935149541, + "grad_norm": 1.9064164754303783, + "learning_rate": 1.6498370326495242e-05, + "loss": 0.8097, + "step": 1502 + }, + { + "epoch": 0.8901391767841279, + "grad_norm": 2.9818131068437967, + "learning_rate": 1.649350775680292e-05, + "loss": 0.8664, + "step": 1503 + }, + { + "epoch": 0.8907314184187148, + "grad_norm": 1.928535242218003, + "learning_rate": 1.648864253090078e-05, + "loss": 0.8444, + "step": 1504 + }, + { + "epoch": 0.8913236600533018, + "grad_norm": 2.4186278041533753, + "learning_rate": 1.6483774650778973e-05, + "loss": 0.8225, + "step": 1505 + }, + { + "epoch": 0.8919159016878887, + "grad_norm": 6.329817463922851, + "learning_rate": 1.6478904118428735e-05, + "loss": 0.8687, + "step": 1506 + }, + { + "epoch": 0.8925081433224755, + "grad_norm": 2.406656080352376, + "learning_rate": 1.647403093584238e-05, + "loss": 0.8689, + "step": 1507 + }, + { + "epoch": 0.8931003849570625, + "grad_norm": 1.698392307205128, + "learning_rate": 1.6469155105013324e-05, + "loss": 0.8136, + "step": 1508 + }, + { + "epoch": 0.8936926265916494, + "grad_norm": 1.7222065175556518, + "learning_rate": 1.646427662793605e-05, + "loss": 0.8621, + "step": 1509 + }, + { + "epoch": 0.8942848682262363, + "grad_norm": 1.835429252270241, + "learning_rate": 1.6459395506606133e-05, + "loss": 0.8463, + "step": 1510 + }, + { + "epoch": 0.8948771098608232, + "grad_norm": 2.02354089579679, + "learning_rate": 1.6454511743020222e-05, + "loss": 0.822, + "step": 1511 + }, + { + "epoch": 0.8954693514954102, + "grad_norm": 2.178325990673068, + "learning_rate": 1.6449625339176056e-05, + "loss": 0.9125, + "step": 1512 + }, + { + "epoch": 0.896061593129997, + "grad_norm": 1.6566812304799738, + "learning_rate": 1.6444736297072446e-05, + "loss": 0.8098, + "step": 1513 + }, + { + "epoch": 0.8966538347645839, + "grad_norm": 2.7595341729881344, + "learning_rate": 1.6439844618709285e-05, + "loss": 0.8959, + "step": 1514 + }, + { + "epoch": 0.8972460763991709, + "grad_norm": 1.992051362483181, + "learning_rate": 1.6434950306087544e-05, + "loss": 0.8598, + "step": 1515 + }, + { + "epoch": 0.8978383180337578, + "grad_norm": 1.4714721824680785, + "learning_rate": 1.6430053361209274e-05, + "loss": 0.8236, + "step": 1516 + }, + { + "epoch": 0.8984305596683447, + "grad_norm": 1.6591161099805962, + "learning_rate": 1.6425153786077598e-05, + "loss": 0.8749, + "step": 1517 + }, + { + "epoch": 0.8990228013029316, + "grad_norm": 1.2831071701329722, + "learning_rate": 1.642025158269672e-05, + "loss": 0.8128, + "step": 1518 + }, + { + "epoch": 0.8996150429375185, + "grad_norm": 1.4872368851201596, + "learning_rate": 1.641534675307192e-05, + "loss": 0.7701, + "step": 1519 + }, + { + "epoch": 0.9002072845721054, + "grad_norm": 1.756308879818358, + "learning_rate": 1.641043929920954e-05, + "loss": 0.8523, + "step": 1520 + }, + { + "epoch": 0.9007995262066923, + "grad_norm": 2.161708847108366, + "learning_rate": 1.6405529223117013e-05, + "loss": 0.8145, + "step": 1521 + }, + { + "epoch": 0.9013917678412793, + "grad_norm": 1.585827703784451, + "learning_rate": 1.6400616526802835e-05, + "loss": 0.8352, + "step": 1522 + }, + { + "epoch": 0.9019840094758661, + "grad_norm": 1.823999072218306, + "learning_rate": 1.6395701212276573e-05, + "loss": 0.8103, + "step": 1523 + }, + { + "epoch": 0.902576251110453, + "grad_norm": 1.7892988299933283, + "learning_rate": 1.6390783281548865e-05, + "loss": 0.8468, + "step": 1524 + }, + { + "epoch": 0.90316849274504, + "grad_norm": 1.828224641878733, + "learning_rate": 1.638586273663143e-05, + "loss": 0.8189, + "step": 1525 + }, + { + "epoch": 0.9037607343796269, + "grad_norm": 1.5516682705105311, + "learning_rate": 1.6380939579537033e-05, + "loss": 0.8347, + "step": 1526 + }, + { + "epoch": 0.9043529760142138, + "grad_norm": 3.834168570081106, + "learning_rate": 1.6376013812279534e-05, + "loss": 0.8573, + "step": 1527 + }, + { + "epoch": 0.9049452176488008, + "grad_norm": 2.1526452193356547, + "learning_rate": 1.6371085436873847e-05, + "loss": 0.8469, + "step": 1528 + }, + { + "epoch": 0.9055374592833876, + "grad_norm": 1.525680512787262, + "learning_rate": 1.636615445533595e-05, + "loss": 0.8192, + "step": 1529 + }, + { + "epoch": 0.9061297009179745, + "grad_norm": 2.288883878412789, + "learning_rate": 1.6361220869682896e-05, + "loss": 0.8153, + "step": 1530 + }, + { + "epoch": 0.9067219425525614, + "grad_norm": 4.621322518818992, + "learning_rate": 1.63562846819328e-05, + "loss": 0.8144, + "step": 1531 + }, + { + "epoch": 0.9073141841871484, + "grad_norm": 1.6212241602499209, + "learning_rate": 1.635134589410483e-05, + "loss": 0.8537, + "step": 1532 + }, + { + "epoch": 0.9079064258217353, + "grad_norm": 1.750883625265076, + "learning_rate": 1.6346404508219244e-05, + "loss": 0.8252, + "step": 1533 + }, + { + "epoch": 0.9084986674563221, + "grad_norm": 1.8359912026959841, + "learning_rate": 1.6341460526297335e-05, + "loss": 0.8425, + "step": 1534 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.4533966344737113, + "learning_rate": 1.6336513950361474e-05, + "loss": 0.8562, + "step": 1535 + }, + { + "epoch": 0.909683150725496, + "grad_norm": 2.2110168582468552, + "learning_rate": 1.6331564782435087e-05, + "loss": 0.7873, + "step": 1536 + }, + { + "epoch": 0.9102753923600829, + "grad_norm": 2.4584163208420913, + "learning_rate": 1.6326613024542667e-05, + "loss": 0.8297, + "step": 1537 + }, + { + "epoch": 0.9108676339946699, + "grad_norm": 2.304101358316969, + "learning_rate": 1.6321658678709752e-05, + "loss": 0.8304, + "step": 1538 + }, + { + "epoch": 0.9114598756292568, + "grad_norm": 1.5235885803236862, + "learning_rate": 1.6316701746962956e-05, + "loss": 0.8285, + "step": 1539 + }, + { + "epoch": 0.9120521172638436, + "grad_norm": 2.0501459681351517, + "learning_rate": 1.6311742231329936e-05, + "loss": 0.8338, + "step": 1540 + }, + { + "epoch": 0.9126443588984305, + "grad_norm": 1.7456834254936182, + "learning_rate": 1.630678013383942e-05, + "loss": 0.8232, + "step": 1541 + }, + { + "epoch": 0.9132366005330175, + "grad_norm": 2.017939224936834, + "learning_rate": 1.6301815456521185e-05, + "loss": 0.8093, + "step": 1542 + }, + { + "epoch": 0.9138288421676044, + "grad_norm": 1.7066296321756638, + "learning_rate": 1.629684820140606e-05, + "loss": 0.8545, + "step": 1543 + }, + { + "epoch": 0.9144210838021913, + "grad_norm": 5.4294245310007, + "learning_rate": 1.6291878370525925e-05, + "loss": 0.828, + "step": 1544 + }, + { + "epoch": 0.9150133254367782, + "grad_norm": 2.6780329423128446, + "learning_rate": 1.6286905965913732e-05, + "loss": 0.8682, + "step": 1545 + }, + { + "epoch": 0.9156055670713651, + "grad_norm": 2.5467678711002364, + "learning_rate": 1.6281930989603466e-05, + "loss": 0.7832, + "step": 1546 + }, + { + "epoch": 0.916197808705952, + "grad_norm": 2.230883357213969, + "learning_rate": 1.627695344363018e-05, + "loss": 0.8445, + "step": 1547 + }, + { + "epoch": 0.916790050340539, + "grad_norm": 4.2585630086605315, + "learning_rate": 1.627197333002996e-05, + "loss": 0.7631, + "step": 1548 + }, + { + "epoch": 0.9173822919751259, + "grad_norm": 2.018426386994712, + "learning_rate": 1.6266990650839965e-05, + "loss": 0.8065, + "step": 1549 + }, + { + "epoch": 0.9179745336097128, + "grad_norm": 5.5918944687522405, + "learning_rate": 1.6262005408098378e-05, + "loss": 0.8405, + "step": 1550 + }, + { + "epoch": 0.9185667752442996, + "grad_norm": 3.0051047010536234, + "learning_rate": 1.6257017603844452e-05, + "loss": 0.8329, + "step": 1551 + }, + { + "epoch": 0.9191590168788866, + "grad_norm": 1.6536747992743932, + "learning_rate": 1.6252027240118472e-05, + "loss": 0.8, + "step": 1552 + }, + { + "epoch": 0.9197512585134735, + "grad_norm": 2.3069408419926978, + "learning_rate": 1.6247034318961788e-05, + "loss": 0.7958, + "step": 1553 + }, + { + "epoch": 0.9203435001480604, + "grad_norm": 2.5997053054107244, + "learning_rate": 1.624203884241678e-05, + "loss": 0.8594, + "step": 1554 + }, + { + "epoch": 0.9209357417826474, + "grad_norm": 2.63780427340796, + "learning_rate": 1.6237040812526875e-05, + "loss": 0.84, + "step": 1555 + }, + { + "epoch": 0.9215279834172342, + "grad_norm": 2.4212885909450206, + "learning_rate": 1.6232040231336556e-05, + "loss": 0.8491, + "step": 1556 + }, + { + "epoch": 0.9221202250518211, + "grad_norm": 2.2044096639579815, + "learning_rate": 1.6227037100891335e-05, + "loss": 0.8598, + "step": 1557 + }, + { + "epoch": 0.922712466686408, + "grad_norm": 1.6328122663952063, + "learning_rate": 1.6222031423237776e-05, + "loss": 0.8353, + "step": 1558 + }, + { + "epoch": 0.923304708320995, + "grad_norm": 2.3168890361561263, + "learning_rate": 1.6217023200423483e-05, + "loss": 0.8548, + "step": 1559 + }, + { + "epoch": 0.9238969499555819, + "grad_norm": 1.7424095619475186, + "learning_rate": 1.6212012434497103e-05, + "loss": 0.7988, + "step": 1560 + }, + { + "epoch": 0.9244891915901687, + "grad_norm": 1.6126116008758955, + "learning_rate": 1.6206999127508318e-05, + "loss": 0.85, + "step": 1561 + }, + { + "epoch": 0.9250814332247557, + "grad_norm": 1.4774563212178027, + "learning_rate": 1.620198328150785e-05, + "loss": 0.7911, + "step": 1562 + }, + { + "epoch": 0.9256736748593426, + "grad_norm": 1.3520460937622927, + "learning_rate": 1.6196964898547474e-05, + "loss": 0.8204, + "step": 1563 + }, + { + "epoch": 0.9262659164939295, + "grad_norm": 1.5690304650328533, + "learning_rate": 1.6191943980679975e-05, + "loss": 0.7729, + "step": 1564 + }, + { + "epoch": 0.9268581581285165, + "grad_norm": 1.7240591652512482, + "learning_rate": 1.61869205299592e-05, + "loss": 0.8504, + "step": 1565 + }, + { + "epoch": 0.9274503997631034, + "grad_norm": 1.629961765323752, + "learning_rate": 1.6181894548440022e-05, + "loss": 0.8158, + "step": 1566 + }, + { + "epoch": 0.9280426413976902, + "grad_norm": 1.664765209050695, + "learning_rate": 1.6176866038178348e-05, + "loss": 0.8291, + "step": 1567 + }, + { + "epoch": 0.9286348830322771, + "grad_norm": 1.3306049204265953, + "learning_rate": 1.617183500123112e-05, + "loss": 0.7634, + "step": 1568 + }, + { + "epoch": 0.9292271246668641, + "grad_norm": 1.7500132992266746, + "learning_rate": 1.6166801439656322e-05, + "loss": 0.8743, + "step": 1569 + }, + { + "epoch": 0.929819366301451, + "grad_norm": 1.5049905349441384, + "learning_rate": 1.6161765355512958e-05, + "loss": 0.8252, + "step": 1570 + }, + { + "epoch": 0.9304116079360379, + "grad_norm": 1.4564671298739658, + "learning_rate": 1.615672675086107e-05, + "loss": 0.8371, + "step": 1571 + }, + { + "epoch": 0.9310038495706248, + "grad_norm": 1.3839434144832101, + "learning_rate": 1.615168562776173e-05, + "loss": 0.8293, + "step": 1572 + }, + { + "epoch": 0.9315960912052117, + "grad_norm": 1.4540537733669165, + "learning_rate": 1.6146641988277044e-05, + "loss": 0.8373, + "step": 1573 + }, + { + "epoch": 0.9321883328397986, + "grad_norm": 5.541810457275405, + "learning_rate": 1.6141595834470142e-05, + "loss": 0.8454, + "step": 1574 + }, + { + "epoch": 0.9327805744743856, + "grad_norm": 1.5031210400984882, + "learning_rate": 1.6136547168405185e-05, + "loss": 0.8377, + "step": 1575 + }, + { + "epoch": 0.9333728161089725, + "grad_norm": 2.397459980734325, + "learning_rate": 1.6131495992147363e-05, + "loss": 0.8208, + "step": 1576 + }, + { + "epoch": 0.9339650577435594, + "grad_norm": 4.008252947997453, + "learning_rate": 1.6126442307762886e-05, + "loss": 0.8473, + "step": 1577 + }, + { + "epoch": 0.9345572993781462, + "grad_norm": 2.148720748325416, + "learning_rate": 1.6121386117319e-05, + "loss": 0.8426, + "step": 1578 + }, + { + "epoch": 0.9351495410127332, + "grad_norm": 4.17679478258533, + "learning_rate": 1.611632742288397e-05, + "loss": 0.7861, + "step": 1579 + }, + { + "epoch": 0.9357417826473201, + "grad_norm": 1.458220294326226, + "learning_rate": 1.6111266226527086e-05, + "loss": 0.797, + "step": 1580 + }, + { + "epoch": 0.936334024281907, + "grad_norm": 2.424871404734393, + "learning_rate": 1.6106202530318662e-05, + "loss": 0.7973, + "step": 1581 + }, + { + "epoch": 0.936926265916494, + "grad_norm": 1.2771446682809802, + "learning_rate": 1.6101136336330037e-05, + "loss": 0.8365, + "step": 1582 + }, + { + "epoch": 0.9375185075510808, + "grad_norm": 1.1198521871087879, + "learning_rate": 1.6096067646633568e-05, + "loss": 0.8309, + "step": 1583 + }, + { + "epoch": 0.9381107491856677, + "grad_norm": 1.4872636584914338, + "learning_rate": 1.609099646330263e-05, + "loss": 0.8144, + "step": 1584 + }, + { + "epoch": 0.9387029908202547, + "grad_norm": 1.2530678270199203, + "learning_rate": 1.6085922788411625e-05, + "loss": 0.8321, + "step": 1585 + }, + { + "epoch": 0.9392952324548416, + "grad_norm": 3.1068270835989167, + "learning_rate": 1.6080846624035972e-05, + "loss": 0.8287, + "step": 1586 + }, + { + "epoch": 0.9398874740894285, + "grad_norm": 1.4244796316413282, + "learning_rate": 1.6075767972252107e-05, + "loss": 0.827, + "step": 1587 + }, + { + "epoch": 0.9404797157240155, + "grad_norm": 1.8678497643123235, + "learning_rate": 1.6070686835137484e-05, + "loss": 0.8164, + "step": 1588 + }, + { + "epoch": 0.9410719573586023, + "grad_norm": 2.0794440551572744, + "learning_rate": 1.6065603214770576e-05, + "loss": 0.8368, + "step": 1589 + }, + { + "epoch": 0.9416641989931892, + "grad_norm": 1.4310283817792016, + "learning_rate": 1.6060517113230866e-05, + "loss": 0.8414, + "step": 1590 + }, + { + "epoch": 0.9422564406277761, + "grad_norm": 1.1400352112299277, + "learning_rate": 1.605542853259886e-05, + "loss": 0.8632, + "step": 1591 + }, + { + "epoch": 0.9428486822623631, + "grad_norm": 2.2260369027177402, + "learning_rate": 1.605033747495607e-05, + "loss": 0.8193, + "step": 1592 + }, + { + "epoch": 0.94344092389695, + "grad_norm": 1.8613131439464332, + "learning_rate": 1.6045243942385026e-05, + "loss": 0.8796, + "step": 1593 + }, + { + "epoch": 0.9440331655315368, + "grad_norm": 1.6025848171890213, + "learning_rate": 1.6040147936969263e-05, + "loss": 0.8037, + "step": 1594 + }, + { + "epoch": 0.9446254071661238, + "grad_norm": 1.9429865966947775, + "learning_rate": 1.6035049460793346e-05, + "loss": 0.8611, + "step": 1595 + }, + { + "epoch": 0.9452176488007107, + "grad_norm": 1.4125880162885809, + "learning_rate": 1.602994851594283e-05, + "loss": 0.8218, + "step": 1596 + }, + { + "epoch": 0.9458098904352976, + "grad_norm": 1.4496484290895701, + "learning_rate": 1.6024845104504295e-05, + "loss": 0.831, + "step": 1597 + }, + { + "epoch": 0.9464021320698845, + "grad_norm": 1.0965352833215942, + "learning_rate": 1.6019739228565314e-05, + "loss": 0.7957, + "step": 1598 + }, + { + "epoch": 0.9469943737044714, + "grad_norm": 2.5680253220532525, + "learning_rate": 1.6014630890214483e-05, + "loss": 0.8338, + "step": 1599 + }, + { + "epoch": 0.9475866153390583, + "grad_norm": 1.4680992447699333, + "learning_rate": 1.6009520091541403e-05, + "loss": 0.8043, + "step": 1600 + }, + { + "epoch": 0.9481788569736452, + "grad_norm": 1.808937748229738, + "learning_rate": 1.600440683463667e-05, + "loss": 0.8217, + "step": 1601 + }, + { + "epoch": 0.9487710986082322, + "grad_norm": 1.0990585324473146, + "learning_rate": 1.5999291121591894e-05, + "loss": 0.8004, + "step": 1602 + }, + { + "epoch": 0.9493633402428191, + "grad_norm": 1.3712584097732767, + "learning_rate": 1.59941729544997e-05, + "loss": 0.8557, + "step": 1603 + }, + { + "epoch": 0.949955581877406, + "grad_norm": 1.8683892068199166, + "learning_rate": 1.5989052335453695e-05, + "loss": 0.8331, + "step": 1604 + }, + { + "epoch": 0.9505478235119929, + "grad_norm": 1.6272782502704557, + "learning_rate": 1.598392926654851e-05, + "loss": 0.8554, + "step": 1605 + }, + { + "epoch": 0.9511400651465798, + "grad_norm": 6.533052707890747, + "learning_rate": 1.5978803749879754e-05, + "loss": 0.8254, + "step": 1606 + }, + { + "epoch": 0.9517323067811667, + "grad_norm": 1.5123671105986136, + "learning_rate": 1.5973675787544062e-05, + "loss": 0.8491, + "step": 1607 + }, + { + "epoch": 0.9523245484157536, + "grad_norm": 2.956300129012484, + "learning_rate": 1.596854538163906e-05, + "loss": 0.8472, + "step": 1608 + }, + { + "epoch": 0.9529167900503406, + "grad_norm": 1.1789264958256336, + "learning_rate": 1.5963412534263368e-05, + "loss": 0.8399, + "step": 1609 + }, + { + "epoch": 0.9535090316849274, + "grad_norm": 2.116364944004953, + "learning_rate": 1.595827724751661e-05, + "loss": 0.8087, + "step": 1610 + }, + { + "epoch": 0.9541012733195143, + "grad_norm": 1.0294899321599131, + "learning_rate": 1.5953139523499407e-05, + "loss": 0.8103, + "step": 1611 + }, + { + "epoch": 0.9546935149541013, + "grad_norm": 1.2277020777634202, + "learning_rate": 1.5947999364313378e-05, + "loss": 0.8414, + "step": 1612 + }, + { + "epoch": 0.9552857565886882, + "grad_norm": 2.0861925635013807, + "learning_rate": 1.594285677206114e-05, + "loss": 0.8432, + "step": 1613 + }, + { + "epoch": 0.9558779982232751, + "grad_norm": 1.6204140200740225, + "learning_rate": 1.5937711748846292e-05, + "loss": 0.8855, + "step": 1614 + }, + { + "epoch": 0.9564702398578621, + "grad_norm": 1.5551835590922503, + "learning_rate": 1.5932564296773452e-05, + "loss": 0.8211, + "step": 1615 + }, + { + "epoch": 0.9570624814924489, + "grad_norm": 1.338127305643877, + "learning_rate": 1.5927414417948205e-05, + "loss": 0.8845, + "step": 1616 + }, + { + "epoch": 0.9576547231270358, + "grad_norm": 1.2387865170884367, + "learning_rate": 1.592226211447715e-05, + "loss": 0.7897, + "step": 1617 + }, + { + "epoch": 0.9582469647616227, + "grad_norm": 1.226435037795598, + "learning_rate": 1.5917107388467866e-05, + "loss": 0.842, + "step": 1618 + }, + { + "epoch": 0.9588392063962097, + "grad_norm": 1.2457195951382203, + "learning_rate": 1.5911950242028924e-05, + "loss": 0.804, + "step": 1619 + }, + { + "epoch": 0.9594314480307966, + "grad_norm": 1.6797866491993712, + "learning_rate": 1.5906790677269887e-05, + "loss": 0.8548, + "step": 1620 + }, + { + "epoch": 0.9600236896653834, + "grad_norm": 1.202377183866836, + "learning_rate": 1.590162869630131e-05, + "loss": 0.8212, + "step": 1621 + }, + { + "epoch": 0.9606159312999704, + "grad_norm": 1.6157690809746326, + "learning_rate": 1.589646430123473e-05, + "loss": 0.8579, + "step": 1622 + }, + { + "epoch": 0.9612081729345573, + "grad_norm": 1.0841272568029858, + "learning_rate": 1.5891297494182677e-05, + "loss": 0.8165, + "step": 1623 + }, + { + "epoch": 0.9618004145691442, + "grad_norm": 1.0296129071614393, + "learning_rate": 1.5886128277258665e-05, + "loss": 0.8144, + "step": 1624 + }, + { + "epoch": 0.9623926562037312, + "grad_norm": 8.702037235312739, + "learning_rate": 1.5880956652577194e-05, + "loss": 0.8223, + "step": 1625 + }, + { + "epoch": 0.9629848978383181, + "grad_norm": 1.1958806153516213, + "learning_rate": 1.587578262225375e-05, + "loss": 0.8464, + "step": 1626 + }, + { + "epoch": 0.9635771394729049, + "grad_norm": 3.794741977806126, + "learning_rate": 1.5870606188404803e-05, + "loss": 0.7858, + "step": 1627 + }, + { + "epoch": 0.9641693811074918, + "grad_norm": 2.0476194711292743, + "learning_rate": 1.5865427353147805e-05, + "loss": 0.8624, + "step": 1628 + }, + { + "epoch": 0.9647616227420788, + "grad_norm": 1.1041034856721486, + "learning_rate": 1.586024611860119e-05, + "loss": 0.827, + "step": 1629 + }, + { + "epoch": 0.9653538643766657, + "grad_norm": 1.2670169849566177, + "learning_rate": 1.5855062486884377e-05, + "loss": 0.8591, + "step": 1630 + }, + { + "epoch": 0.9659461060112526, + "grad_norm": 1.4173702686257748, + "learning_rate": 1.5849876460117756e-05, + "loss": 0.8619, + "step": 1631 + }, + { + "epoch": 0.9665383476458395, + "grad_norm": 3.026025776762052, + "learning_rate": 1.5844688040422714e-05, + "loss": 0.8098, + "step": 1632 + }, + { + "epoch": 0.9671305892804264, + "grad_norm": 1.4053556345848581, + "learning_rate": 1.5839497229921596e-05, + "loss": 0.807, + "step": 1633 + }, + { + "epoch": 0.9677228309150133, + "grad_norm": 1.7031188178538146, + "learning_rate": 1.5834304030737744e-05, + "loss": 0.7775, + "step": 1634 + }, + { + "epoch": 0.9683150725496003, + "grad_norm": 2.9127557245302187, + "learning_rate": 1.5829108444995463e-05, + "loss": 0.7833, + "step": 1635 + }, + { + "epoch": 0.9689073141841872, + "grad_norm": 1.180567784206561, + "learning_rate": 1.582391047482004e-05, + "loss": 0.8379, + "step": 1636 + }, + { + "epoch": 0.969499555818774, + "grad_norm": 1.6229999823880097, + "learning_rate": 1.581871012233774e-05, + "loss": 0.8217, + "step": 1637 + }, + { + "epoch": 0.9700917974533609, + "grad_norm": 1.208083563753721, + "learning_rate": 1.5813507389675796e-05, + "loss": 0.8457, + "step": 1638 + }, + { + "epoch": 0.9706840390879479, + "grad_norm": 2.1554849675219407, + "learning_rate": 1.5808302278962425e-05, + "loss": 0.8622, + "step": 1639 + }, + { + "epoch": 0.9712762807225348, + "grad_norm": 2.64337198259556, + "learning_rate": 1.58030947923268e-05, + "loss": 0.7997, + "step": 1640 + }, + { + "epoch": 0.9718685223571217, + "grad_norm": 1.5576104708781473, + "learning_rate": 1.5797884931899085e-05, + "loss": 0.8174, + "step": 1641 + }, + { + "epoch": 0.9724607639917087, + "grad_norm": 2.038533162623759, + "learning_rate": 1.57926726998104e-05, + "loss": 0.7796, + "step": 1642 + }, + { + "epoch": 0.9730530056262955, + "grad_norm": 2.9263975793725403, + "learning_rate": 1.5787458098192846e-05, + "loss": 0.8364, + "step": 1643 + }, + { + "epoch": 0.9736452472608824, + "grad_norm": 1.4490134642361368, + "learning_rate": 1.5782241129179482e-05, + "loss": 0.8124, + "step": 1644 + }, + { + "epoch": 0.9742374888954694, + "grad_norm": 1.533779399794962, + "learning_rate": 1.5777021794904347e-05, + "loss": 0.8359, + "step": 1645 + }, + { + "epoch": 0.9748297305300563, + "grad_norm": 1.3549406554518657, + "learning_rate": 1.5771800097502437e-05, + "loss": 0.7979, + "step": 1646 + }, + { + "epoch": 0.9754219721646432, + "grad_norm": 1.5156946411040901, + "learning_rate": 1.5766576039109727e-05, + "loss": 0.8221, + "step": 1647 + }, + { + "epoch": 0.97601421379923, + "grad_norm": 1.0074291473613406, + "learning_rate": 1.5761349621863145e-05, + "loss": 0.7519, + "step": 1648 + }, + { + "epoch": 0.976606455433817, + "grad_norm": 1.749198076091115, + "learning_rate": 1.575612084790059e-05, + "loss": 0.8644, + "step": 1649 + }, + { + "epoch": 0.9771986970684039, + "grad_norm": 1.576797332615058, + "learning_rate": 1.5750889719360927e-05, + "loss": 0.8243, + "step": 1650 + }, + { + "epoch": 0.9777909387029908, + "grad_norm": 1.1947271437764446, + "learning_rate": 1.5745656238383983e-05, + "loss": 0.8444, + "step": 1651 + }, + { + "epoch": 0.9783831803375778, + "grad_norm": 1.4690410500886613, + "learning_rate": 1.574042040711054e-05, + "loss": 0.8234, + "step": 1652 + }, + { + "epoch": 0.9789754219721647, + "grad_norm": 1.3832210724891452, + "learning_rate": 1.5735182227682353e-05, + "loss": 0.8242, + "step": 1653 + }, + { + "epoch": 0.9795676636067515, + "grad_norm": 1.0700735949850522, + "learning_rate": 1.572994170224213e-05, + "loss": 0.738, + "step": 1654 + }, + { + "epoch": 0.9801599052413384, + "grad_norm": 1.048484464353663, + "learning_rate": 1.572469883293354e-05, + "loss": 0.8888, + "step": 1655 + }, + { + "epoch": 0.9807521468759254, + "grad_norm": 1.2979362621765205, + "learning_rate": 1.571945362190121e-05, + "loss": 0.8947, + "step": 1656 + }, + { + "epoch": 0.9813443885105123, + "grad_norm": 1.5547020809846688, + "learning_rate": 1.571420607129073e-05, + "loss": 0.8581, + "step": 1657 + }, + { + "epoch": 0.9819366301450992, + "grad_norm": 2.0365430398639117, + "learning_rate": 1.5708956183248644e-05, + "loss": 0.856, + "step": 1658 + }, + { + "epoch": 0.9825288717796861, + "grad_norm": 1.3434352080990184, + "learning_rate": 1.5703703959922444e-05, + "loss": 0.84, + "step": 1659 + }, + { + "epoch": 0.983121113414273, + "grad_norm": 1.5105070318212588, + "learning_rate": 1.5698449403460593e-05, + "loss": 0.8324, + "step": 1660 + }, + { + "epoch": 0.9837133550488599, + "grad_norm": 1.2274682794404093, + "learning_rate": 1.5693192516012497e-05, + "loss": 0.7942, + "step": 1661 + }, + { + "epoch": 0.9843055966834469, + "grad_norm": 1.5114527110798501, + "learning_rate": 1.5687933299728517e-05, + "loss": 0.9006, + "step": 1662 + }, + { + "epoch": 0.9848978383180338, + "grad_norm": 1.118678025977753, + "learning_rate": 1.568267175675997e-05, + "loss": 0.8235, + "step": 1663 + }, + { + "epoch": 0.9854900799526207, + "grad_norm": 0.9278952601780849, + "learning_rate": 1.567740788925912e-05, + "loss": 0.8125, + "step": 1664 + }, + { + "epoch": 0.9860823215872075, + "grad_norm": 1.2284653816674624, + "learning_rate": 1.5672141699379195e-05, + "loss": 0.7883, + "step": 1665 + }, + { + "epoch": 0.9866745632217945, + "grad_norm": 1.5199993795896831, + "learning_rate": 1.5666873189274344e-05, + "loss": 0.8336, + "step": 1666 + }, + { + "epoch": 0.9872668048563814, + "grad_norm": 1.2261415724075908, + "learning_rate": 1.56616023610997e-05, + "loss": 0.8881, + "step": 1667 + }, + { + "epoch": 0.9878590464909683, + "grad_norm": 1.0819795359579585, + "learning_rate": 1.5656329217011322e-05, + "loss": 0.782, + "step": 1668 + }, + { + "epoch": 0.9884512881255553, + "grad_norm": 1.3268966177305566, + "learning_rate": 1.565105375916623e-05, + "loss": 0.8229, + "step": 1669 + }, + { + "epoch": 0.9890435297601421, + "grad_norm": 1.2761359970445783, + "learning_rate": 1.5645775989722366e-05, + "loss": 0.8224, + "step": 1670 + }, + { + "epoch": 0.989635771394729, + "grad_norm": 1.194769993158141, + "learning_rate": 1.5640495910838652e-05, + "loss": 0.8327, + "step": 1671 + }, + { + "epoch": 0.990228013029316, + "grad_norm": 1.2775750866147921, + "learning_rate": 1.563521352467493e-05, + "loss": 0.8399, + "step": 1672 + }, + { + "epoch": 0.9908202546639029, + "grad_norm": 1.0439693594751094, + "learning_rate": 1.562992883339199e-05, + "loss": 0.838, + "step": 1673 + }, + { + "epoch": 0.9914124962984898, + "grad_norm": 1.4853858307458851, + "learning_rate": 1.562464183915157e-05, + "loss": 0.8309, + "step": 1674 + }, + { + "epoch": 0.9920047379330766, + "grad_norm": 1.2338933063602229, + "learning_rate": 1.5619352544116354e-05, + "loss": 0.7848, + "step": 1675 + }, + { + "epoch": 0.9925969795676636, + "grad_norm": 1.1283006710007517, + "learning_rate": 1.5614060950449948e-05, + "loss": 0.8699, + "step": 1676 + }, + { + "epoch": 0.9931892212022505, + "grad_norm": 1.0265880293302552, + "learning_rate": 1.5608767060316927e-05, + "loss": 0.7997, + "step": 1677 + }, + { + "epoch": 0.9937814628368374, + "grad_norm": 8.09364358410445, + "learning_rate": 1.560347087588278e-05, + "loss": 0.8407, + "step": 1678 + }, + { + "epoch": 0.9943737044714244, + "grad_norm": 1.1900974681831218, + "learning_rate": 1.5598172399313946e-05, + "loss": 0.8168, + "step": 1679 + }, + { + "epoch": 0.9949659461060113, + "grad_norm": 1.4239135447597135, + "learning_rate": 1.5592871632777798e-05, + "loss": 0.8455, + "step": 1680 + }, + { + "epoch": 0.9955581877405981, + "grad_norm": 1.0018040213544295, + "learning_rate": 1.5587568578442654e-05, + "loss": 0.8209, + "step": 1681 + }, + { + "epoch": 0.996150429375185, + "grad_norm": 0.9312885357262206, + "learning_rate": 1.5582263238477753e-05, + "loss": 0.8201, + "step": 1682 + }, + { + "epoch": 0.996742671009772, + "grad_norm": 4.822722321751858, + "learning_rate": 1.5576955615053283e-05, + "loss": 0.826, + "step": 1683 + }, + { + "epoch": 0.9973349126443589, + "grad_norm": 1.3705883900182685, + "learning_rate": 1.557164571034036e-05, + "loss": 0.8634, + "step": 1684 + }, + { + "epoch": 0.9979271542789458, + "grad_norm": 1.2331718885455818, + "learning_rate": 1.5566333526511032e-05, + "loss": 0.836, + "step": 1685 + }, + { + "epoch": 0.9985193959135327, + "grad_norm": 1.7535753725578784, + "learning_rate": 1.5561019065738282e-05, + "loss": 0.7917, + "step": 1686 + }, + { + "epoch": 0.9991116375481196, + "grad_norm": 1.141962242390919, + "learning_rate": 1.5555702330196024e-05, + "loss": 0.8084, + "step": 1687 + }, + { + "epoch": 0.9997038791827065, + "grad_norm": 0.9912371715304618, + "learning_rate": 1.55503833220591e-05, + "loss": 0.8038, + "step": 1688 + }, + { + "epoch": 1.0002961208172934, + "grad_norm": 1.1500062970774998, + "learning_rate": 1.5545062043503284e-05, + "loss": 0.7195, + "step": 1689 + }, + { + "epoch": 1.0008883624518803, + "grad_norm": 1.162441466138965, + "learning_rate": 1.5539738496705277e-05, + "loss": 0.7466, + "step": 1690 + }, + { + "epoch": 1.0014806040864672, + "grad_norm": 2.452972866337579, + "learning_rate": 1.553441268384271e-05, + "loss": 0.7629, + "step": 1691 + }, + { + "epoch": 1.0020728457210542, + "grad_norm": 1.1676699232346432, + "learning_rate": 1.5529084607094144e-05, + "loss": 0.7211, + "step": 1692 + }, + { + "epoch": 1.002665087355641, + "grad_norm": 1.1249391475584591, + "learning_rate": 1.5523754268639053e-05, + "loss": 0.6833, + "step": 1693 + }, + { + "epoch": 1.003257328990228, + "grad_norm": 0.9431636171309491, + "learning_rate": 1.5518421670657856e-05, + "loss": 0.6873, + "step": 1694 + }, + { + "epoch": 1.003849570624815, + "grad_norm": 1.2138310832226853, + "learning_rate": 1.5513086815331876e-05, + "loss": 0.7482, + "step": 1695 + }, + { + "epoch": 1.0044418122594019, + "grad_norm": 1.3175202159803, + "learning_rate": 1.550774970484337e-05, + "loss": 0.7627, + "step": 1696 + }, + { + "epoch": 1.0050340538939888, + "grad_norm": 1.2998459350824845, + "learning_rate": 1.5502410341375525e-05, + "loss": 0.7568, + "step": 1697 + }, + { + "epoch": 1.0056262955285757, + "grad_norm": 1.3077350264029703, + "learning_rate": 1.5497068727112435e-05, + "loss": 0.7064, + "step": 1698 + }, + { + "epoch": 1.0062185371631627, + "grad_norm": 1.295716875999854, + "learning_rate": 1.5491724864239116e-05, + "loss": 0.7511, + "step": 1699 + }, + { + "epoch": 1.0068107787977494, + "grad_norm": 1.0973356675970865, + "learning_rate": 1.5486378754941514e-05, + "loss": 0.7318, + "step": 1700 + }, + { + "epoch": 1.0074030204323363, + "grad_norm": 1.064599007825042, + "learning_rate": 1.5481030401406486e-05, + "loss": 0.67, + "step": 1701 + }, + { + "epoch": 1.0079952620669232, + "grad_norm": 1.7373649804796532, + "learning_rate": 1.5475679805821814e-05, + "loss": 0.7182, + "step": 1702 + }, + { + "epoch": 1.0085875037015102, + "grad_norm": 1.1434410016960337, + "learning_rate": 1.547032697037619e-05, + "loss": 0.7471, + "step": 1703 + }, + { + "epoch": 1.0091797453360971, + "grad_norm": 1.027785100761493, + "learning_rate": 1.546497189725922e-05, + "loss": 0.7033, + "step": 1704 + }, + { + "epoch": 1.009771986970684, + "grad_norm": 1.4750262197277808, + "learning_rate": 1.5459614588661435e-05, + "loss": 0.6917, + "step": 1705 + }, + { + "epoch": 1.010364228605271, + "grad_norm": 1.5320626210105739, + "learning_rate": 1.5454255046774273e-05, + "loss": 0.7396, + "step": 1706 + }, + { + "epoch": 1.010956470239858, + "grad_norm": 1.2399324918591237, + "learning_rate": 1.5448893273790093e-05, + "loss": 0.7728, + "step": 1707 + }, + { + "epoch": 1.0115487118744448, + "grad_norm": 1.2465148539400586, + "learning_rate": 1.5443529271902155e-05, + "loss": 0.7421, + "step": 1708 + }, + { + "epoch": 1.0121409535090318, + "grad_norm": 1.0519427663128549, + "learning_rate": 1.543816304330464e-05, + "loss": 0.744, + "step": 1709 + }, + { + "epoch": 1.0127331951436187, + "grad_norm": 1.1516952366005058, + "learning_rate": 1.543279459019264e-05, + "loss": 0.7283, + "step": 1710 + }, + { + "epoch": 1.0133254367782054, + "grad_norm": 1.1650321254394644, + "learning_rate": 1.542742391476215e-05, + "loss": 0.7158, + "step": 1711 + }, + { + "epoch": 1.0139176784127923, + "grad_norm": 1.0202804013315474, + "learning_rate": 1.5422051019210082e-05, + "loss": 0.7283, + "step": 1712 + }, + { + "epoch": 1.0145099200473793, + "grad_norm": 1.923691433363424, + "learning_rate": 1.5416675905734247e-05, + "loss": 0.7629, + "step": 1713 + }, + { + "epoch": 1.0151021616819662, + "grad_norm": 1.5605996887729816, + "learning_rate": 1.5411298576533376e-05, + "loss": 0.7222, + "step": 1714 + }, + { + "epoch": 1.0156944033165531, + "grad_norm": 1.181981096535394, + "learning_rate": 1.540591903380709e-05, + "loss": 0.7437, + "step": 1715 + }, + { + "epoch": 1.01628664495114, + "grad_norm": 1.2141712016450033, + "learning_rate": 1.5400537279755935e-05, + "loss": 0.7403, + "step": 1716 + }, + { + "epoch": 1.016878886585727, + "grad_norm": 1.4181664222010109, + "learning_rate": 1.539515331658134e-05, + "loss": 0.7366, + "step": 1717 + }, + { + "epoch": 1.017471128220314, + "grad_norm": 1.5554806786394424, + "learning_rate": 1.538976714648566e-05, + "loss": 0.7507, + "step": 1718 + }, + { + "epoch": 1.0180633698549009, + "grad_norm": 1.2281864029513376, + "learning_rate": 1.5384378771672132e-05, + "loss": 0.7295, + "step": 1719 + }, + { + "epoch": 1.0186556114894878, + "grad_norm": 1.8180155521778687, + "learning_rate": 1.5378988194344913e-05, + "loss": 0.6907, + "step": 1720 + }, + { + "epoch": 1.0192478531240745, + "grad_norm": 1.3836146904218154, + "learning_rate": 1.537359541670904e-05, + "loss": 0.7357, + "step": 1721 + }, + { + "epoch": 1.0198400947586614, + "grad_norm": 1.2844050388131933, + "learning_rate": 1.5368200440970478e-05, + "loss": 0.7001, + "step": 1722 + }, + { + "epoch": 1.0204323363932484, + "grad_norm": 1.0399087505464837, + "learning_rate": 1.5362803269336063e-05, + "loss": 0.707, + "step": 1723 + }, + { + "epoch": 1.0210245780278353, + "grad_norm": 1.2549348887269711, + "learning_rate": 1.5357403904013546e-05, + "loss": 0.7467, + "step": 1724 + }, + { + "epoch": 1.0216168196624222, + "grad_norm": 1.1036457909371953, + "learning_rate": 1.535200234721157e-05, + "loss": 0.7386, + "step": 1725 + }, + { + "epoch": 1.0222090612970092, + "grad_norm": 1.5796396509435675, + "learning_rate": 1.5346598601139677e-05, + "loss": 0.698, + "step": 1726 + }, + { + "epoch": 1.022801302931596, + "grad_norm": 1.172115897988437, + "learning_rate": 1.5341192668008305e-05, + "loss": 0.7232, + "step": 1727 + }, + { + "epoch": 1.023393544566183, + "grad_norm": 1.2380517696446258, + "learning_rate": 1.533578455002878e-05, + "loss": 0.7418, + "step": 1728 + }, + { + "epoch": 1.02398578620077, + "grad_norm": 1.0709459361086207, + "learning_rate": 1.5330374249413327e-05, + "loss": 0.7005, + "step": 1729 + }, + { + "epoch": 1.024578027835357, + "grad_norm": 1.7356966783518266, + "learning_rate": 1.5324961768375065e-05, + "loss": 0.7308, + "step": 1730 + }, + { + "epoch": 1.0251702694699438, + "grad_norm": 1.9959691714404597, + "learning_rate": 1.5319547109128e-05, + "loss": 0.6944, + "step": 1731 + }, + { + "epoch": 1.0257625111045305, + "grad_norm": 1.1724773521675533, + "learning_rate": 1.531413027388704e-05, + "loss": 0.7128, + "step": 1732 + }, + { + "epoch": 1.0263547527391175, + "grad_norm": 1.3876472736084153, + "learning_rate": 1.5308711264867966e-05, + "loss": 0.7233, + "step": 1733 + }, + { + "epoch": 1.0269469943737044, + "grad_norm": 0.9552747056186334, + "learning_rate": 1.5303290084287465e-05, + "loss": 0.7608, + "step": 1734 + }, + { + "epoch": 1.0275392360082913, + "grad_norm": 1.4703684447974952, + "learning_rate": 1.52978667343631e-05, + "loss": 0.7286, + "step": 1735 + }, + { + "epoch": 1.0281314776428783, + "grad_norm": 1.295631529067337, + "learning_rate": 1.5292441217313324e-05, + "loss": 0.7348, + "step": 1736 + }, + { + "epoch": 1.0287237192774652, + "grad_norm": 1.7649342180752203, + "learning_rate": 1.5287013535357488e-05, + "loss": 0.7281, + "step": 1737 + }, + { + "epoch": 1.0293159609120521, + "grad_norm": 1.0923294902447087, + "learning_rate": 1.5281583690715805e-05, + "loss": 0.6939, + "step": 1738 + }, + { + "epoch": 1.029908202546639, + "grad_norm": 1.499563132831407, + "learning_rate": 1.52761516856094e-05, + "loss": 0.6817, + "step": 1739 + }, + { + "epoch": 1.030500444181226, + "grad_norm": 1.0171580664278979, + "learning_rate": 1.5270717522260264e-05, + "loss": 0.7234, + "step": 1740 + }, + { + "epoch": 1.031092685815813, + "grad_norm": 1.0354234657563626, + "learning_rate": 1.526528120289127e-05, + "loss": 0.7194, + "step": 1741 + }, + { + "epoch": 1.0316849274503999, + "grad_norm": 1.2340045067110246, + "learning_rate": 1.5259842729726186e-05, + "loss": 0.7582, + "step": 1742 + }, + { + "epoch": 1.0322771690849866, + "grad_norm": 1.1262612916228454, + "learning_rate": 1.5254402104989652e-05, + "loss": 0.7168, + "step": 1743 + }, + { + "epoch": 1.0328694107195735, + "grad_norm": 1.2118168057016097, + "learning_rate": 1.5248959330907186e-05, + "loss": 0.743, + "step": 1744 + }, + { + "epoch": 1.0334616523541604, + "grad_norm": 0.988865475204544, + "learning_rate": 1.5243514409705187e-05, + "loss": 0.7318, + "step": 1745 + }, + { + "epoch": 1.0340538939887474, + "grad_norm": 1.1771979349040396, + "learning_rate": 1.5238067343610943e-05, + "loss": 0.7278, + "step": 1746 + }, + { + "epoch": 1.0346461356233343, + "grad_norm": 1.3840039027011963, + "learning_rate": 1.5232618134852598e-05, + "loss": 0.7461, + "step": 1747 + }, + { + "epoch": 1.0352383772579212, + "grad_norm": 6.602016761729066, + "learning_rate": 1.52271667856592e-05, + "loss": 0.7317, + "step": 1748 + }, + { + "epoch": 1.0358306188925082, + "grad_norm": 1.0065104505442493, + "learning_rate": 1.522171329826064e-05, + "loss": 0.7306, + "step": 1749 + }, + { + "epoch": 1.036422860527095, + "grad_norm": 1.0752528953543894, + "learning_rate": 1.5216257674887718e-05, + "loss": 0.683, + "step": 1750 + }, + { + "epoch": 1.037015102161682, + "grad_norm": 1.0970642144292997, + "learning_rate": 1.5210799917772076e-05, + "loss": 0.7274, + "step": 1751 + }, + { + "epoch": 1.037607343796269, + "grad_norm": 2.4805051543151233, + "learning_rate": 1.5205340029146256e-05, + "loss": 0.7554, + "step": 1752 + }, + { + "epoch": 1.0381995854308559, + "grad_norm": 1.7822116435903128, + "learning_rate": 1.5199878011243647e-05, + "loss": 0.7089, + "step": 1753 + }, + { + "epoch": 1.0387918270654426, + "grad_norm": 0.8928687106650833, + "learning_rate": 1.5194413866298536e-05, + "loss": 0.7456, + "step": 1754 + }, + { + "epoch": 1.0393840687000295, + "grad_norm": 1.19053453703619, + "learning_rate": 1.5188947596546053e-05, + "loss": 0.7291, + "step": 1755 + }, + { + "epoch": 1.0399763103346165, + "grad_norm": 1.4988527163029588, + "learning_rate": 1.5183479204222216e-05, + "loss": 0.7221, + "step": 1756 + }, + { + "epoch": 1.0405685519692034, + "grad_norm": 1.7373245033776372, + "learning_rate": 1.5178008691563902e-05, + "loss": 0.7289, + "step": 1757 + }, + { + "epoch": 1.0411607936037903, + "grad_norm": 1.838119156554147, + "learning_rate": 1.5172536060808857e-05, + "loss": 0.726, + "step": 1758 + }, + { + "epoch": 1.0417530352383773, + "grad_norm": 1.3624164343579135, + "learning_rate": 1.5167061314195702e-05, + "loss": 0.7325, + "step": 1759 + }, + { + "epoch": 1.0423452768729642, + "grad_norm": 1.6612931900677586, + "learning_rate": 1.5161584453963908e-05, + "loss": 0.7035, + "step": 1760 + }, + { + "epoch": 1.0429375185075511, + "grad_norm": 1.4616110065160015, + "learning_rate": 1.5156105482353827e-05, + "loss": 0.7442, + "step": 1761 + }, + { + "epoch": 1.043529760142138, + "grad_norm": 2.0905521686727737, + "learning_rate": 1.5150624401606658e-05, + "loss": 0.7269, + "step": 1762 + }, + { + "epoch": 1.044122001776725, + "grad_norm": 1.7611278415862754, + "learning_rate": 1.5145141213964479e-05, + "loss": 0.696, + "step": 1763 + }, + { + "epoch": 1.044714243411312, + "grad_norm": 2.1466849185074683, + "learning_rate": 1.5139655921670213e-05, + "loss": 0.6893, + "step": 1764 + }, + { + "epoch": 1.0453064850458986, + "grad_norm": 3.86853781945047, + "learning_rate": 1.5134168526967661e-05, + "loss": 0.7508, + "step": 1765 + }, + { + "epoch": 1.0458987266804856, + "grad_norm": 3.5972940752882137, + "learning_rate": 1.5128679032101472e-05, + "loss": 0.7366, + "step": 1766 + }, + { + "epoch": 1.0464909683150725, + "grad_norm": 3.6857544393967347, + "learning_rate": 1.5123187439317159e-05, + "loss": 0.7766, + "step": 1767 + }, + { + "epoch": 1.0470832099496594, + "grad_norm": 1.6678885841692537, + "learning_rate": 1.5117693750861096e-05, + "loss": 0.7361, + "step": 1768 + }, + { + "epoch": 1.0476754515842464, + "grad_norm": 1.672518635910601, + "learning_rate": 1.5112197968980503e-05, + "loss": 0.6881, + "step": 1769 + }, + { + "epoch": 1.0482676932188333, + "grad_norm": 1.5591607166759955, + "learning_rate": 1.5106700095923471e-05, + "loss": 0.693, + "step": 1770 + }, + { + "epoch": 1.0488599348534202, + "grad_norm": 2.257758886616912, + "learning_rate": 1.5101200133938933e-05, + "loss": 0.6811, + "step": 1771 + }, + { + "epoch": 1.0494521764880071, + "grad_norm": 2.0363297195079912, + "learning_rate": 1.5095698085276692e-05, + "loss": 0.7407, + "step": 1772 + }, + { + "epoch": 1.050044418122594, + "grad_norm": 2.830719217097853, + "learning_rate": 1.5090193952187382e-05, + "loss": 0.7523, + "step": 1773 + }, + { + "epoch": 1.050636659757181, + "grad_norm": 1.671928706296085, + "learning_rate": 1.5084687736922514e-05, + "loss": 0.7253, + "step": 1774 + }, + { + "epoch": 1.051228901391768, + "grad_norm": 2.790885707597884, + "learning_rate": 1.5079179441734435e-05, + "loss": 0.7218, + "step": 1775 + }, + { + "epoch": 1.0518211430263547, + "grad_norm": 1.7819284353063456, + "learning_rate": 1.5073669068876348e-05, + "loss": 0.73, + "step": 1776 + }, + { + "epoch": 1.0524133846609416, + "grad_norm": 2.7217998288857794, + "learning_rate": 1.5068156620602303e-05, + "loss": 0.7359, + "step": 1777 + }, + { + "epoch": 1.0530056262955285, + "grad_norm": 5.101430064202783, + "learning_rate": 1.5062642099167208e-05, + "loss": 0.7175, + "step": 1778 + }, + { + "epoch": 1.0535978679301155, + "grad_norm": 2.4798155357352902, + "learning_rate": 1.5057125506826806e-05, + "loss": 0.7212, + "step": 1779 + }, + { + "epoch": 1.0541901095647024, + "grad_norm": 1.4306493055601683, + "learning_rate": 1.5051606845837699e-05, + "loss": 0.7508, + "step": 1780 + }, + { + "epoch": 1.0547823511992893, + "grad_norm": 1.9617350912249327, + "learning_rate": 1.5046086118457325e-05, + "loss": 0.7302, + "step": 1781 + }, + { + "epoch": 1.0553745928338762, + "grad_norm": 3.2294694912122766, + "learning_rate": 1.5040563326943974e-05, + "loss": 0.7164, + "step": 1782 + }, + { + "epoch": 1.0559668344684632, + "grad_norm": 2.159249108038316, + "learning_rate": 1.5035038473556776e-05, + "loss": 0.7142, + "step": 1783 + }, + { + "epoch": 1.05655907610305, + "grad_norm": 1.810542509614688, + "learning_rate": 1.5029511560555707e-05, + "loss": 0.7212, + "step": 1784 + }, + { + "epoch": 1.057151317737637, + "grad_norm": 4.37550667109893, + "learning_rate": 1.5023982590201586e-05, + "loss": 0.6615, + "step": 1785 + }, + { + "epoch": 1.057743559372224, + "grad_norm": 1.4873887560427135, + "learning_rate": 1.5018451564756078e-05, + "loss": 0.6939, + "step": 1786 + }, + { + "epoch": 1.0583358010068107, + "grad_norm": 3.2445272993387335, + "learning_rate": 1.5012918486481677e-05, + "loss": 0.7296, + "step": 1787 + }, + { + "epoch": 1.0589280426413976, + "grad_norm": 1.6958901351830118, + "learning_rate": 1.5007383357641723e-05, + "loss": 0.7386, + "step": 1788 + }, + { + "epoch": 1.0595202842759845, + "grad_norm": 1.6693502713038142, + "learning_rate": 1.5001846180500399e-05, + "loss": 0.7317, + "step": 1789 + }, + { + "epoch": 1.0601125259105715, + "grad_norm": 1.4628738488051636, + "learning_rate": 1.499630695732272e-05, + "loss": 0.7039, + "step": 1790 + }, + { + "epoch": 1.0607047675451584, + "grad_norm": 1.4961438926110393, + "learning_rate": 1.4990765690374537e-05, + "loss": 0.7023, + "step": 1791 + }, + { + "epoch": 1.0612970091797453, + "grad_norm": 2.1544637844797125, + "learning_rate": 1.4985222381922543e-05, + "loss": 0.7214, + "step": 1792 + }, + { + "epoch": 1.0618892508143323, + "grad_norm": 8.28217196999516, + "learning_rate": 1.4979677034234265e-05, + "loss": 0.717, + "step": 1793 + }, + { + "epoch": 1.0624814924489192, + "grad_norm": 2.501132894894816, + "learning_rate": 1.4974129649578058e-05, + "loss": 0.7356, + "step": 1794 + }, + { + "epoch": 1.0630737340835061, + "grad_norm": 1.16138368637112, + "learning_rate": 1.4968580230223112e-05, + "loss": 0.7435, + "step": 1795 + }, + { + "epoch": 1.063665975718093, + "grad_norm": 1.5162128926090968, + "learning_rate": 1.496302877843946e-05, + "loss": 0.7392, + "step": 1796 + }, + { + "epoch": 1.06425821735268, + "grad_norm": 1.2373419739712945, + "learning_rate": 1.4957475296497953e-05, + "loss": 0.7, + "step": 1797 + }, + { + "epoch": 1.0648504589872667, + "grad_norm": 1.9258973289098624, + "learning_rate": 1.4951919786670274e-05, + "loss": 0.7231, + "step": 1798 + }, + { + "epoch": 1.0654427006218536, + "grad_norm": 1.2248509462971935, + "learning_rate": 1.4946362251228943e-05, + "loss": 0.7276, + "step": 1799 + }, + { + "epoch": 1.0660349422564406, + "grad_norm": 1.5808585539599749, + "learning_rate": 1.4940802692447306e-05, + "loss": 0.7257, + "step": 1800 + }, + { + "epoch": 1.0666271838910275, + "grad_norm": 1.3982040276801868, + "learning_rate": 1.493524111259953e-05, + "loss": 0.7563, + "step": 1801 + }, + { + "epoch": 1.0672194255256144, + "grad_norm": 3.7095487179566713, + "learning_rate": 1.4929677513960621e-05, + "loss": 0.7217, + "step": 1802 + }, + { + "epoch": 1.0678116671602014, + "grad_norm": 2.0014255531666936, + "learning_rate": 1.4924111898806395e-05, + "loss": 0.7425, + "step": 1803 + }, + { + "epoch": 1.0684039087947883, + "grad_norm": 1.4939878195406038, + "learning_rate": 1.4918544269413511e-05, + "loss": 0.727, + "step": 1804 + }, + { + "epoch": 1.0689961504293752, + "grad_norm": 1.74927476414085, + "learning_rate": 1.4912974628059433e-05, + "loss": 0.7637, + "step": 1805 + }, + { + "epoch": 1.0695883920639622, + "grad_norm": 1.658292058667489, + "learning_rate": 1.4907402977022465e-05, + "loss": 0.7623, + "step": 1806 + }, + { + "epoch": 1.070180633698549, + "grad_norm": 1.4315694213602883, + "learning_rate": 1.4901829318581722e-05, + "loss": 0.7211, + "step": 1807 + }, + { + "epoch": 1.070772875333136, + "grad_norm": 1.704429216725604, + "learning_rate": 1.4896253655017146e-05, + "loss": 0.7316, + "step": 1808 + }, + { + "epoch": 1.0713651169677227, + "grad_norm": 1.3312378206473277, + "learning_rate": 1.4890675988609493e-05, + "loss": 0.7199, + "step": 1809 + }, + { + "epoch": 1.0719573586023097, + "grad_norm": 3.97929527512917, + "learning_rate": 1.4885096321640346e-05, + "loss": 0.725, + "step": 1810 + }, + { + "epoch": 1.0725496002368966, + "grad_norm": 1.50003812253246, + "learning_rate": 1.48795146563921e-05, + "loss": 0.7149, + "step": 1811 + }, + { + "epoch": 1.0731418418714835, + "grad_norm": 1.67906861392692, + "learning_rate": 1.4873930995147971e-05, + "loss": 0.7448, + "step": 1812 + }, + { + "epoch": 1.0737340835060705, + "grad_norm": 1.6617287890649148, + "learning_rate": 1.4868345340191992e-05, + "loss": 0.7207, + "step": 1813 + }, + { + "epoch": 1.0743263251406574, + "grad_norm": 1.3825842739615184, + "learning_rate": 1.4862757693809009e-05, + "loss": 0.7454, + "step": 1814 + }, + { + "epoch": 1.0749185667752443, + "grad_norm": 1.4144163977489808, + "learning_rate": 1.485716805828468e-05, + "loss": 0.7158, + "step": 1815 + }, + { + "epoch": 1.0755108084098313, + "grad_norm": 1.8101168890390542, + "learning_rate": 1.4851576435905489e-05, + "loss": 0.7072, + "step": 1816 + }, + { + "epoch": 1.0761030500444182, + "grad_norm": 1.872316239101291, + "learning_rate": 1.4845982828958713e-05, + "loss": 0.7332, + "step": 1817 + }, + { + "epoch": 1.0766952916790051, + "grad_norm": 1.3296117156310565, + "learning_rate": 1.484038723973246e-05, + "loss": 0.7099, + "step": 1818 + }, + { + "epoch": 1.077287533313592, + "grad_norm": 1.4154353624795581, + "learning_rate": 1.4834789670515637e-05, + "loss": 0.7188, + "step": 1819 + }, + { + "epoch": 1.0778797749481788, + "grad_norm": 1.7457686235458405, + "learning_rate": 1.4829190123597965e-05, + "loss": 0.7769, + "step": 1820 + }, + { + "epoch": 1.0784720165827657, + "grad_norm": 1.7634196004341314, + "learning_rate": 1.4823588601269973e-05, + "loss": 0.7494, + "step": 1821 + }, + { + "epoch": 1.0790642582173526, + "grad_norm": 1.2440518586809848, + "learning_rate": 1.4817985105823003e-05, + "loss": 0.6979, + "step": 1822 + }, + { + "epoch": 1.0796564998519396, + "grad_norm": 2.2385183360083953, + "learning_rate": 1.4812379639549194e-05, + "loss": 0.7202, + "step": 1823 + }, + { + "epoch": 1.0802487414865265, + "grad_norm": 1.2215652923766105, + "learning_rate": 1.4806772204741503e-05, + "loss": 0.7091, + "step": 1824 + }, + { + "epoch": 1.0808409831211134, + "grad_norm": 1.4449642609762843, + "learning_rate": 1.4801162803693676e-05, + "loss": 0.7442, + "step": 1825 + }, + { + "epoch": 1.0814332247557004, + "grad_norm": 1.105763165926035, + "learning_rate": 1.4795551438700283e-05, + "loss": 0.7515, + "step": 1826 + }, + { + "epoch": 1.0820254663902873, + "grad_norm": 1.6473343255705488, + "learning_rate": 1.4789938112056683e-05, + "loss": 0.7527, + "step": 1827 + }, + { + "epoch": 1.0826177080248742, + "grad_norm": 1.618781048885639, + "learning_rate": 1.4784322826059048e-05, + "loss": 0.7249, + "step": 1828 + }, + { + "epoch": 1.0832099496594612, + "grad_norm": 1.652228341540137, + "learning_rate": 1.4778705583004338e-05, + "loss": 0.7447, + "step": 1829 + }, + { + "epoch": 1.083802191294048, + "grad_norm": 2.4386960209217343, + "learning_rate": 1.4773086385190328e-05, + "loss": 0.7448, + "step": 1830 + }, + { + "epoch": 1.0843944329286348, + "grad_norm": 1.259552923914992, + "learning_rate": 1.4767465234915577e-05, + "loss": 0.7799, + "step": 1831 + }, + { + "epoch": 1.0849866745632217, + "grad_norm": 3.1470024337770446, + "learning_rate": 1.4761842134479463e-05, + "loss": 0.7429, + "step": 1832 + }, + { + "epoch": 1.0855789161978087, + "grad_norm": 1.2003867622349509, + "learning_rate": 1.4756217086182142e-05, + "loss": 0.7326, + "step": 1833 + }, + { + "epoch": 1.0861711578323956, + "grad_norm": 2.4507353417893314, + "learning_rate": 1.4750590092324579e-05, + "loss": 0.7183, + "step": 1834 + }, + { + "epoch": 1.0867633994669825, + "grad_norm": 1.1444638541124244, + "learning_rate": 1.474496115520853e-05, + "loss": 0.7436, + "step": 1835 + }, + { + "epoch": 1.0873556411015695, + "grad_norm": 1.9216426351330924, + "learning_rate": 1.4739330277136546e-05, + "loss": 0.7302, + "step": 1836 + }, + { + "epoch": 1.0879478827361564, + "grad_norm": 1.7272854817225367, + "learning_rate": 1.4733697460411973e-05, + "loss": 0.7041, + "step": 1837 + }, + { + "epoch": 1.0885401243707433, + "grad_norm": 1.1808892975599021, + "learning_rate": 1.4728062707338949e-05, + "loss": 0.7214, + "step": 1838 + }, + { + "epoch": 1.0891323660053303, + "grad_norm": 1.2060838830500324, + "learning_rate": 1.4722426020222406e-05, + "loss": 0.7387, + "step": 1839 + }, + { + "epoch": 1.0897246076399172, + "grad_norm": 1.2045178599251487, + "learning_rate": 1.4716787401368067e-05, + "loss": 0.708, + "step": 1840 + }, + { + "epoch": 1.090316849274504, + "grad_norm": 1.4796202285269815, + "learning_rate": 1.4711146853082443e-05, + "loss": 0.7453, + "step": 1841 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 1.2637405534511554, + "learning_rate": 1.4705504377672834e-05, + "loss": 0.7001, + "step": 1842 + }, + { + "epoch": 1.0915013325436778, + "grad_norm": 0.9993594395369135, + "learning_rate": 1.4699859977447335e-05, + "loss": 0.7461, + "step": 1843 + }, + { + "epoch": 1.0920935741782647, + "grad_norm": 1.3269419928105537, + "learning_rate": 1.4694213654714816e-05, + "loss": 0.7391, + "step": 1844 + }, + { + "epoch": 1.0926858158128516, + "grad_norm": 1.4669810778171086, + "learning_rate": 1.4688565411784943e-05, + "loss": 0.7218, + "step": 1845 + }, + { + "epoch": 1.0932780574474386, + "grad_norm": 3.385399768271858, + "learning_rate": 1.4682915250968169e-05, + "loss": 0.7459, + "step": 1846 + }, + { + "epoch": 1.0938702990820255, + "grad_norm": 1.6211356022303685, + "learning_rate": 1.4677263174575723e-05, + "loss": 0.7276, + "step": 1847 + }, + { + "epoch": 1.0944625407166124, + "grad_norm": 1.3469501603849017, + "learning_rate": 1.4671609184919622e-05, + "loss": 0.7409, + "step": 1848 + }, + { + "epoch": 1.0950547823511994, + "grad_norm": 1.3639447724050635, + "learning_rate": 1.4665953284312668e-05, + "loss": 0.7392, + "step": 1849 + }, + { + "epoch": 1.0956470239857863, + "grad_norm": 1.2226531191169623, + "learning_rate": 1.4660295475068443e-05, + "loss": 0.7384, + "step": 1850 + }, + { + "epoch": 1.0962392656203732, + "grad_norm": 1.5816714452127887, + "learning_rate": 1.4654635759501306e-05, + "loss": 0.7145, + "step": 1851 + }, + { + "epoch": 1.09683150725496, + "grad_norm": 1.3398064935081868, + "learning_rate": 1.4648974139926403e-05, + "loss": 0.7341, + "step": 1852 + }, + { + "epoch": 1.0974237488895469, + "grad_norm": 1.3669350828600808, + "learning_rate": 1.4643310618659646e-05, + "loss": 0.7229, + "step": 1853 + }, + { + "epoch": 1.0980159905241338, + "grad_norm": 1.7617904906850115, + "learning_rate": 1.4637645198017745e-05, + "loss": 0.7067, + "step": 1854 + }, + { + "epoch": 1.0986082321587207, + "grad_norm": 3.706221805008793, + "learning_rate": 1.463197788031817e-05, + "loss": 0.7075, + "step": 1855 + }, + { + "epoch": 1.0992004737933077, + "grad_norm": 1.6435750167047545, + "learning_rate": 1.4626308667879175e-05, + "loss": 0.758, + "step": 1856 + }, + { + "epoch": 1.0997927154278946, + "grad_norm": 1.7817244713218505, + "learning_rate": 1.462063756301978e-05, + "loss": 0.7666, + "step": 1857 + }, + { + "epoch": 1.1003849570624815, + "grad_norm": 3.183409165734301, + "learning_rate": 1.4614964568059795e-05, + "loss": 0.7431, + "step": 1858 + }, + { + "epoch": 1.1009771986970684, + "grad_norm": 1.5588746699566092, + "learning_rate": 1.4609289685319788e-05, + "loss": 0.6989, + "step": 1859 + }, + { + "epoch": 1.1015694403316554, + "grad_norm": 1.4715880184354002, + "learning_rate": 1.4603612917121107e-05, + "loss": 0.7476, + "step": 1860 + }, + { + "epoch": 1.1021616819662423, + "grad_norm": 1.5483999283147984, + "learning_rate": 1.4597934265785868e-05, + "loss": 0.7095, + "step": 1861 + }, + { + "epoch": 1.102753923600829, + "grad_norm": 1.8136131665101558, + "learning_rate": 1.4592253733636961e-05, + "loss": 0.7218, + "step": 1862 + }, + { + "epoch": 1.103346165235416, + "grad_norm": 1.898459387929092, + "learning_rate": 1.4586571322998041e-05, + "loss": 0.6928, + "step": 1863 + }, + { + "epoch": 1.1039384068700029, + "grad_norm": 1.1332820859860129, + "learning_rate": 1.4580887036193539e-05, + "loss": 0.7379, + "step": 1864 + }, + { + "epoch": 1.1045306485045898, + "grad_norm": 1.1910399671887877, + "learning_rate": 1.4575200875548639e-05, + "loss": 0.7093, + "step": 1865 + }, + { + "epoch": 1.1051228901391768, + "grad_norm": 1.7135675328136981, + "learning_rate": 1.4569512843389306e-05, + "loss": 0.7452, + "step": 1866 + }, + { + "epoch": 1.1057151317737637, + "grad_norm": 1.3131715172492713, + "learning_rate": 1.4563822942042264e-05, + "loss": 0.7512, + "step": 1867 + }, + { + "epoch": 1.1063073734083506, + "grad_norm": 1.301975931331709, + "learning_rate": 1.4558131173835002e-05, + "loss": 0.724, + "step": 1868 + }, + { + "epoch": 1.1068996150429375, + "grad_norm": 1.9451185830687892, + "learning_rate": 1.4552437541095774e-05, + "loss": 0.7658, + "step": 1869 + }, + { + "epoch": 1.1074918566775245, + "grad_norm": 2.5902094254473274, + "learning_rate": 1.4546742046153596e-05, + "loss": 0.7389, + "step": 1870 + }, + { + "epoch": 1.1080840983121114, + "grad_norm": 1.8005401821027935, + "learning_rate": 1.4541044691338246e-05, + "loss": 0.7113, + "step": 1871 + }, + { + "epoch": 1.1086763399466983, + "grad_norm": 1.3297390320707363, + "learning_rate": 1.453534547898026e-05, + "loss": 0.7192, + "step": 1872 + }, + { + "epoch": 1.109268581581285, + "grad_norm": 1.616999402923775, + "learning_rate": 1.452964441141094e-05, + "loss": 0.703, + "step": 1873 + }, + { + "epoch": 1.109860823215872, + "grad_norm": 3.5826588522082607, + "learning_rate": 1.4523941490962342e-05, + "loss": 0.7405, + "step": 1874 + }, + { + "epoch": 1.110453064850459, + "grad_norm": 1.5079201259207546, + "learning_rate": 1.451823671996728e-05, + "loss": 0.739, + "step": 1875 + }, + { + "epoch": 1.1110453064850458, + "grad_norm": 2.719011118153245, + "learning_rate": 1.451253010075933e-05, + "loss": 0.7482, + "step": 1876 + }, + { + "epoch": 1.1116375481196328, + "grad_norm": 2.508076865789526, + "learning_rate": 1.450682163567281e-05, + "loss": 0.7225, + "step": 1877 + }, + { + "epoch": 1.1122297897542197, + "grad_norm": 1.6745532655484892, + "learning_rate": 1.4501111327042817e-05, + "loss": 0.7662, + "step": 1878 + }, + { + "epoch": 1.1128220313888066, + "grad_norm": 1.617999952278557, + "learning_rate": 1.4495399177205177e-05, + "loss": 0.7055, + "step": 1879 + }, + { + "epoch": 1.1134142730233936, + "grad_norm": 1.783576483289883, + "learning_rate": 1.4489685188496488e-05, + "loss": 0.7175, + "step": 1880 + }, + { + "epoch": 1.1140065146579805, + "grad_norm": 1.8113648186864728, + "learning_rate": 1.4483969363254085e-05, + "loss": 0.7221, + "step": 1881 + }, + { + "epoch": 1.1145987562925674, + "grad_norm": 4.466398490357165, + "learning_rate": 1.447825170381607e-05, + "loss": 0.731, + "step": 1882 + }, + { + "epoch": 1.1151909979271544, + "grad_norm": 2.3544006277235776, + "learning_rate": 1.4472532212521281e-05, + "loss": 0.7599, + "step": 1883 + }, + { + "epoch": 1.115783239561741, + "grad_norm": 1.5430309293543931, + "learning_rate": 1.446681089170932e-05, + "loss": 0.7418, + "step": 1884 + }, + { + "epoch": 1.116375481196328, + "grad_norm": 2.547169019555421, + "learning_rate": 1.4461087743720519e-05, + "loss": 0.7436, + "step": 1885 + }, + { + "epoch": 1.116967722830915, + "grad_norm": 2.4575093456645174, + "learning_rate": 1.4455362770895976e-05, + "loss": 0.7316, + "step": 1886 + }, + { + "epoch": 1.1175599644655019, + "grad_norm": 1.5479754084135895, + "learning_rate": 1.444963597557752e-05, + "loss": 0.6865, + "step": 1887 + }, + { + "epoch": 1.1181522061000888, + "grad_norm": 1.5331344869939385, + "learning_rate": 1.4443907360107734e-05, + "loss": 0.7007, + "step": 1888 + }, + { + "epoch": 1.1187444477346757, + "grad_norm": 2.1333020071731896, + "learning_rate": 1.4438176926829944e-05, + "loss": 0.727, + "step": 1889 + }, + { + "epoch": 1.1193366893692627, + "grad_norm": 2.758567222418793, + "learning_rate": 1.4432444678088222e-05, + "loss": 0.7018, + "step": 1890 + }, + { + "epoch": 1.1199289310038496, + "grad_norm": 1.7694989327792088, + "learning_rate": 1.4426710616227377e-05, + "loss": 0.7148, + "step": 1891 + }, + { + "epoch": 1.1205211726384365, + "grad_norm": 1.674707947535516, + "learning_rate": 1.4420974743592964e-05, + "loss": 0.7497, + "step": 1892 + }, + { + "epoch": 1.1211134142730235, + "grad_norm": 1.7836099010979485, + "learning_rate": 1.4415237062531277e-05, + "loss": 0.7138, + "step": 1893 + }, + { + "epoch": 1.1217056559076104, + "grad_norm": 1.2259620137684928, + "learning_rate": 1.4409497575389352e-05, + "loss": 0.6871, + "step": 1894 + }, + { + "epoch": 1.122297897542197, + "grad_norm": 3.7532717279318066, + "learning_rate": 1.4403756284514962e-05, + "loss": 0.7662, + "step": 1895 + }, + { + "epoch": 1.122890139176784, + "grad_norm": 1.6907860579891794, + "learning_rate": 1.4398013192256615e-05, + "loss": 0.7459, + "step": 1896 + }, + { + "epoch": 1.123482380811371, + "grad_norm": 2.1021110669948166, + "learning_rate": 1.439226830096356e-05, + "loss": 0.7327, + "step": 1897 + }, + { + "epoch": 1.124074622445958, + "grad_norm": 3.754781447508906, + "learning_rate": 1.438652161298578e-05, + "loss": 0.7541, + "step": 1898 + }, + { + "epoch": 1.1246668640805448, + "grad_norm": 1.6070012623831906, + "learning_rate": 1.4380773130673994e-05, + "loss": 0.7569, + "step": 1899 + }, + { + "epoch": 1.1252591057151318, + "grad_norm": 7.979250377670888, + "learning_rate": 1.4375022856379657e-05, + "loss": 0.766, + "step": 1900 + }, + { + "epoch": 1.1258513473497187, + "grad_norm": 1.3313809617715828, + "learning_rate": 1.436927079245495e-05, + "loss": 0.7326, + "step": 1901 + }, + { + "epoch": 1.1264435889843056, + "grad_norm": 1.9341986555985717, + "learning_rate": 1.4363516941252795e-05, + "loss": 0.7408, + "step": 1902 + }, + { + "epoch": 1.1270358306188926, + "grad_norm": 1.6237139068552036, + "learning_rate": 1.4357761305126836e-05, + "loss": 0.7064, + "step": 1903 + }, + { + "epoch": 1.1276280722534795, + "grad_norm": 1.2456602536397703, + "learning_rate": 1.4352003886431459e-05, + "loss": 0.7131, + "step": 1904 + }, + { + "epoch": 1.1282203138880664, + "grad_norm": 1.8462015297534495, + "learning_rate": 1.4346244687521761e-05, + "loss": 0.719, + "step": 1905 + }, + { + "epoch": 1.1288125555226531, + "grad_norm": 2.308176089812671, + "learning_rate": 1.434048371075359e-05, + "loss": 0.6911, + "step": 1906 + }, + { + "epoch": 1.12940479715724, + "grad_norm": 13.436596947136401, + "learning_rate": 1.43347209584835e-05, + "loss": 0.7109, + "step": 1907 + }, + { + "epoch": 1.129997038791827, + "grad_norm": 2.4217564676011896, + "learning_rate": 1.4328956433068789e-05, + "loss": 0.7412, + "step": 1908 + }, + { + "epoch": 1.130589280426414, + "grad_norm": 1.917753337825582, + "learning_rate": 1.4323190136867464e-05, + "loss": 0.753, + "step": 1909 + }, + { + "epoch": 1.1311815220610009, + "grad_norm": 1.411783160540838, + "learning_rate": 1.4317422072238271e-05, + "loss": 0.7423, + "step": 1910 + }, + { + "epoch": 1.1317737636955878, + "grad_norm": 1.6181997909355976, + "learning_rate": 1.4311652241540668e-05, + "loss": 0.7672, + "step": 1911 + }, + { + "epoch": 1.1323660053301747, + "grad_norm": 1.8668388500696302, + "learning_rate": 1.4305880647134847e-05, + "loss": 0.7447, + "step": 1912 + }, + { + "epoch": 1.1329582469647617, + "grad_norm": 1.6503141304547309, + "learning_rate": 1.4300107291381704e-05, + "loss": 0.7519, + "step": 1913 + }, + { + "epoch": 1.1335504885993486, + "grad_norm": 1.8711353472468055, + "learning_rate": 1.4294332176642875e-05, + "loss": 0.7117, + "step": 1914 + }, + { + "epoch": 1.1341427302339355, + "grad_norm": 1.5597160551862157, + "learning_rate": 1.4288555305280702e-05, + "loss": 0.7413, + "step": 1915 + }, + { + "epoch": 1.1347349718685225, + "grad_norm": 1.9438549888259047, + "learning_rate": 1.4282776679658255e-05, + "loss": 0.7131, + "step": 1916 + }, + { + "epoch": 1.1353272135031092, + "grad_norm": 3.576441284877924, + "learning_rate": 1.4276996302139312e-05, + "loss": 0.7173, + "step": 1917 + }, + { + "epoch": 1.135919455137696, + "grad_norm": 4.139553745005125, + "learning_rate": 1.4271214175088374e-05, + "loss": 0.7347, + "step": 1918 + }, + { + "epoch": 1.136511696772283, + "grad_norm": 2.094913208691617, + "learning_rate": 1.4265430300870656e-05, + "loss": 0.7003, + "step": 1919 + }, + { + "epoch": 1.13710393840687, + "grad_norm": 1.961827791646241, + "learning_rate": 1.425964468185209e-05, + "loss": 0.7393, + "step": 1920 + }, + { + "epoch": 1.137696180041457, + "grad_norm": 1.2595516677616476, + "learning_rate": 1.4253857320399316e-05, + "loss": 0.7199, + "step": 1921 + }, + { + "epoch": 1.1382884216760438, + "grad_norm": 1.76264901153827, + "learning_rate": 1.4248068218879691e-05, + "loss": 0.7213, + "step": 1922 + }, + { + "epoch": 1.1388806633106308, + "grad_norm": 1.8110656077220395, + "learning_rate": 1.4242277379661286e-05, + "loss": 0.7134, + "step": 1923 + }, + { + "epoch": 1.1394729049452177, + "grad_norm": 1.3189613973752465, + "learning_rate": 1.4236484805112878e-05, + "loss": 0.7287, + "step": 1924 + }, + { + "epoch": 1.1400651465798046, + "grad_norm": 2.039277540030174, + "learning_rate": 1.4230690497603955e-05, + "loss": 0.716, + "step": 1925 + }, + { + "epoch": 1.1406573882143916, + "grad_norm": 2.20069189081546, + "learning_rate": 1.4224894459504717e-05, + "loss": 0.7297, + "step": 1926 + }, + { + "epoch": 1.1412496298489785, + "grad_norm": 1.342282720151037, + "learning_rate": 1.4219096693186065e-05, + "loss": 0.7599, + "step": 1927 + }, + { + "epoch": 1.1418418714835652, + "grad_norm": 2.128575546767518, + "learning_rate": 1.4213297201019618e-05, + "loss": 0.727, + "step": 1928 + }, + { + "epoch": 1.1424341131181521, + "grad_norm": 5.679213605059689, + "learning_rate": 1.4207495985377687e-05, + "loss": 0.7075, + "step": 1929 + }, + { + "epoch": 1.143026354752739, + "grad_norm": 1.4477866874730978, + "learning_rate": 1.4201693048633302e-05, + "loss": 0.7705, + "step": 1930 + }, + { + "epoch": 1.143618596387326, + "grad_norm": 1.3689716380240244, + "learning_rate": 1.4195888393160183e-05, + "loss": 0.7145, + "step": 1931 + }, + { + "epoch": 1.144210838021913, + "grad_norm": 2.1302801407154153, + "learning_rate": 1.419008202133277e-05, + "loss": 0.7193, + "step": 1932 + }, + { + "epoch": 1.1448030796564999, + "grad_norm": 1.300453821118966, + "learning_rate": 1.4184273935526184e-05, + "loss": 0.7635, + "step": 1933 + }, + { + "epoch": 1.1453953212910868, + "grad_norm": 1.9849938187340435, + "learning_rate": 1.4178464138116272e-05, + "loss": 0.7525, + "step": 1934 + }, + { + "epoch": 1.1459875629256737, + "grad_norm": 1.7532705143201168, + "learning_rate": 1.4172652631479558e-05, + "loss": 0.7159, + "step": 1935 + }, + { + "epoch": 1.1465798045602607, + "grad_norm": 1.993678172399548, + "learning_rate": 1.4166839417993281e-05, + "loss": 0.7287, + "step": 1936 + }, + { + "epoch": 1.1471720461948476, + "grad_norm": 1.6612921095486382, + "learning_rate": 1.4161024500035364e-05, + "loss": 0.7116, + "step": 1937 + }, + { + "epoch": 1.1477642878294345, + "grad_norm": 2.704022908452418, + "learning_rate": 1.4155207879984447e-05, + "loss": 0.7005, + "step": 1938 + }, + { + "epoch": 1.1483565294640212, + "grad_norm": 1.5244879998598664, + "learning_rate": 1.4149389560219846e-05, + "loss": 0.7588, + "step": 1939 + }, + { + "epoch": 1.1489487710986082, + "grad_norm": 3.6604754255800307, + "learning_rate": 1.414356954312158e-05, + "loss": 0.749, + "step": 1940 + }, + { + "epoch": 1.149541012733195, + "grad_norm": 1.9096638139679427, + "learning_rate": 1.4137747831070371e-05, + "loss": 0.7127, + "step": 1941 + }, + { + "epoch": 1.150133254367782, + "grad_norm": 1.5220531333094949, + "learning_rate": 1.4131924426447621e-05, + "loss": 0.7561, + "step": 1942 + }, + { + "epoch": 1.150725496002369, + "grad_norm": 2.3373596861886456, + "learning_rate": 1.412609933163543e-05, + "loss": 0.7053, + "step": 1943 + }, + { + "epoch": 1.1513177376369559, + "grad_norm": 1.4526913515699773, + "learning_rate": 1.4120272549016591e-05, + "loss": 0.7197, + "step": 1944 + }, + { + "epoch": 1.1519099792715428, + "grad_norm": 1.7959045608328434, + "learning_rate": 1.4114444080974585e-05, + "loss": 0.7105, + "step": 1945 + }, + { + "epoch": 1.1525022209061297, + "grad_norm": 2.2273046483347834, + "learning_rate": 1.4108613929893586e-05, + "loss": 0.7099, + "step": 1946 + }, + { + "epoch": 1.1530944625407167, + "grad_norm": 1.851272958075549, + "learning_rate": 1.410278209815845e-05, + "loss": 0.6989, + "step": 1947 + }, + { + "epoch": 1.1536867041753036, + "grad_norm": 1.4743710555973606, + "learning_rate": 1.4096948588154723e-05, + "loss": 0.7193, + "step": 1948 + }, + { + "epoch": 1.1542789458098905, + "grad_norm": 1.8040748031208247, + "learning_rate": 1.4091113402268644e-05, + "loss": 0.7769, + "step": 1949 + }, + { + "epoch": 1.1548711874444773, + "grad_norm": 4.56346912920917, + "learning_rate": 1.4085276542887128e-05, + "loss": 0.7509, + "step": 1950 + }, + { + "epoch": 1.1554634290790642, + "grad_norm": 1.83204117485103, + "learning_rate": 1.4079438012397777e-05, + "loss": 0.7835, + "step": 1951 + }, + { + "epoch": 1.1560556707136511, + "grad_norm": 1.7404819759752297, + "learning_rate": 1.4073597813188884e-05, + "loss": 0.7317, + "step": 1952 + }, + { + "epoch": 1.156647912348238, + "grad_norm": 2.5984937357601803, + "learning_rate": 1.4067755947649416e-05, + "loss": 0.735, + "step": 1953 + }, + { + "epoch": 1.157240153982825, + "grad_norm": 1.6054589075238175, + "learning_rate": 1.4061912418169024e-05, + "loss": 0.7258, + "step": 1954 + }, + { + "epoch": 1.157832395617412, + "grad_norm": 1.9010263960704716, + "learning_rate": 1.405606722713804e-05, + "loss": 0.7318, + "step": 1955 + }, + { + "epoch": 1.1584246372519988, + "grad_norm": 1.859336257713254, + "learning_rate": 1.405022037694748e-05, + "loss": 0.721, + "step": 1956 + }, + { + "epoch": 1.1590168788865858, + "grad_norm": 1.815312595979808, + "learning_rate": 1.4044371869989024e-05, + "loss": 0.7329, + "step": 1957 + }, + { + "epoch": 1.1596091205211727, + "grad_norm": 2.126985969178438, + "learning_rate": 1.4038521708655054e-05, + "loss": 0.7508, + "step": 1958 + }, + { + "epoch": 1.1602013621557596, + "grad_norm": 1.5244816275828186, + "learning_rate": 1.4032669895338602e-05, + "loss": 0.7254, + "step": 1959 + }, + { + "epoch": 1.1607936037903466, + "grad_norm": 2.1945138390122834, + "learning_rate": 1.40268164324334e-05, + "loss": 0.7529, + "step": 1960 + }, + { + "epoch": 1.1613858454249333, + "grad_norm": 2.5793485490868235, + "learning_rate": 1.4020961322333833e-05, + "loss": 0.7489, + "step": 1961 + }, + { + "epoch": 1.1619780870595202, + "grad_norm": 1.6233997764262025, + "learning_rate": 1.4015104567434981e-05, + "loss": 0.7458, + "step": 1962 + }, + { + "epoch": 1.1625703286941071, + "grad_norm": 2.1892983877595875, + "learning_rate": 1.4009246170132575e-05, + "loss": 0.7234, + "step": 1963 + }, + { + "epoch": 1.163162570328694, + "grad_norm": 2.2188497921341654, + "learning_rate": 1.400338613282304e-05, + "loss": 0.7235, + "step": 1964 + }, + { + "epoch": 1.163754811963281, + "grad_norm": 2.6410164085265233, + "learning_rate": 1.3997524457903455e-05, + "loss": 0.7395, + "step": 1965 + }, + { + "epoch": 1.164347053597868, + "grad_norm": 2.8137504546585395, + "learning_rate": 1.3991661147771574e-05, + "loss": 0.725, + "step": 1966 + }, + { + "epoch": 1.1649392952324549, + "grad_norm": 4.3683730385339965, + "learning_rate": 1.398579620482582e-05, + "loss": 0.7569, + "step": 1967 + }, + { + "epoch": 1.1655315368670418, + "grad_norm": 3.000428917939229, + "learning_rate": 1.3979929631465286e-05, + "loss": 0.703, + "step": 1968 + }, + { + "epoch": 1.1661237785016287, + "grad_norm": 1.8828332802791297, + "learning_rate": 1.3974061430089731e-05, + "loss": 0.7377, + "step": 1969 + }, + { + "epoch": 1.1667160201362157, + "grad_norm": 1.6402504327655474, + "learning_rate": 1.396819160309958e-05, + "loss": 0.7423, + "step": 1970 + }, + { + "epoch": 1.1673082617708026, + "grad_norm": 6.3377427586159625, + "learning_rate": 1.396232015289592e-05, + "loss": 0.7424, + "step": 1971 + }, + { + "epoch": 1.1679005034053893, + "grad_norm": 2.2909334335027407, + "learning_rate": 1.3956447081880506e-05, + "loss": 0.7274, + "step": 1972 + }, + { + "epoch": 1.1684927450399762, + "grad_norm": 1.7468340066260577, + "learning_rate": 1.3950572392455755e-05, + "loss": 0.7087, + "step": 1973 + }, + { + "epoch": 1.1690849866745632, + "grad_norm": 3.9317619791484644, + "learning_rate": 1.394469608702474e-05, + "loss": 0.7178, + "step": 1974 + }, + { + "epoch": 1.16967722830915, + "grad_norm": 1.8428851547057905, + "learning_rate": 1.3938818167991208e-05, + "loss": 0.7067, + "step": 1975 + }, + { + "epoch": 1.170269469943737, + "grad_norm": 3.147154685015463, + "learning_rate": 1.3932938637759555e-05, + "loss": 0.7499, + "step": 1976 + }, + { + "epoch": 1.170861711578324, + "grad_norm": 5.221103236332467, + "learning_rate": 1.3927057498734837e-05, + "loss": 0.7523, + "step": 1977 + }, + { + "epoch": 1.171453953212911, + "grad_norm": 2.367076528325581, + "learning_rate": 1.3921174753322775e-05, + "loss": 0.739, + "step": 1978 + }, + { + "epoch": 1.1720461948474978, + "grad_norm": 2.216413750813569, + "learning_rate": 1.3915290403929738e-05, + "loss": 0.732, + "step": 1979 + }, + { + "epoch": 1.1726384364820848, + "grad_norm": 2.7444238440546584, + "learning_rate": 1.390940445296276e-05, + "loss": 0.7295, + "step": 1980 + }, + { + "epoch": 1.1732306781166717, + "grad_norm": 3.5442659708991076, + "learning_rate": 1.3903516902829525e-05, + "loss": 0.7312, + "step": 1981 + }, + { + "epoch": 1.1738229197512586, + "grad_norm": 2.4148636767736478, + "learning_rate": 1.3897627755938372e-05, + "loss": 0.7149, + "step": 1982 + }, + { + "epoch": 1.1744151613858453, + "grad_norm": 1.686105294543067, + "learning_rate": 1.389173701469829e-05, + "loss": 0.7449, + "step": 1983 + }, + { + "epoch": 1.1750074030204323, + "grad_norm": 2.18226411659928, + "learning_rate": 1.388584468151893e-05, + "loss": 0.7074, + "step": 1984 + }, + { + "epoch": 1.1755996446550192, + "grad_norm": 1.6462875388280893, + "learning_rate": 1.3879950758810577e-05, + "loss": 0.7186, + "step": 1985 + }, + { + "epoch": 1.1761918862896061, + "grad_norm": 1.9514852968058827, + "learning_rate": 1.3874055248984191e-05, + "loss": 0.7708, + "step": 1986 + }, + { + "epoch": 1.176784127924193, + "grad_norm": 2.5287146246879058, + "learning_rate": 1.3868158154451354e-05, + "loss": 0.7552, + "step": 1987 + }, + { + "epoch": 1.17737636955878, + "grad_norm": 5.354876267561049, + "learning_rate": 1.3862259477624317e-05, + "loss": 0.7096, + "step": 1988 + }, + { + "epoch": 1.177968611193367, + "grad_norm": 2.080724856383468, + "learning_rate": 1.3856359220915967e-05, + "loss": 0.7306, + "step": 1989 + }, + { + "epoch": 1.1785608528279539, + "grad_norm": 1.9442702346562672, + "learning_rate": 1.3850457386739846e-05, + "loss": 0.7006, + "step": 1990 + }, + { + "epoch": 1.1791530944625408, + "grad_norm": 2.217412566401062, + "learning_rate": 1.3844553977510127e-05, + "loss": 0.7313, + "step": 1991 + }, + { + "epoch": 1.1797453360971275, + "grad_norm": 2.3005541144417094, + "learning_rate": 1.3838648995641645e-05, + "loss": 0.7372, + "step": 1992 + }, + { + "epoch": 1.1803375777317147, + "grad_norm": 1.5633590022176485, + "learning_rate": 1.3832742443549865e-05, + "loss": 0.7273, + "step": 1993 + }, + { + "epoch": 1.1809298193663014, + "grad_norm": 2.177628183648929, + "learning_rate": 1.3826834323650899e-05, + "loss": 0.7433, + "step": 1994 + }, + { + "epoch": 1.1815220610008883, + "grad_norm": 2.1872512165965596, + "learning_rate": 1.3820924638361501e-05, + "loss": 0.7884, + "step": 1995 + }, + { + "epoch": 1.1821143026354752, + "grad_norm": 2.7491390741203183, + "learning_rate": 1.3815013390099068e-05, + "loss": 0.77, + "step": 1996 + }, + { + "epoch": 1.1827065442700622, + "grad_norm": 2.2399320892471595, + "learning_rate": 1.380910058128163e-05, + "loss": 0.7449, + "step": 1997 + }, + { + "epoch": 1.183298785904649, + "grad_norm": 1.8717444666929153, + "learning_rate": 1.3803186214327852e-05, + "loss": 0.7234, + "step": 1998 + }, + { + "epoch": 1.183891027539236, + "grad_norm": 2.3196057634912934, + "learning_rate": 1.3797270291657056e-05, + "loss": 0.7311, + "step": 1999 + }, + { + "epoch": 1.184483269173823, + "grad_norm": 1.5216917870082234, + "learning_rate": 1.3791352815689174e-05, + "loss": 0.7444, + "step": 2000 + }, + { + "epoch": 1.18507551080841, + "grad_norm": 4.333885469046637, + "learning_rate": 1.37854337888448e-05, + "loss": 0.7638, + "step": 2001 + }, + { + "epoch": 1.1856677524429968, + "grad_norm": 2.019243585845638, + "learning_rate": 1.3779513213545132e-05, + "loss": 0.7491, + "step": 2002 + }, + { + "epoch": 1.1862599940775835, + "grad_norm": 2.577233313193749, + "learning_rate": 1.3773591092212035e-05, + "loss": 0.7296, + "step": 2003 + }, + { + "epoch": 1.1868522357121707, + "grad_norm": 1.9298566377245834, + "learning_rate": 1.3767667427267976e-05, + "loss": 0.7447, + "step": 2004 + }, + { + "epoch": 1.1874444773467574, + "grad_norm": 2.427978631614327, + "learning_rate": 1.3761742221136078e-05, + "loss": 0.7494, + "step": 2005 + }, + { + "epoch": 1.1880367189813443, + "grad_norm": 5.679942493223697, + "learning_rate": 1.3755815476240076e-05, + "loss": 0.7124, + "step": 2006 + }, + { + "epoch": 1.1886289606159313, + "grad_norm": 2.087484384962119, + "learning_rate": 1.3749887195004345e-05, + "loss": 0.7371, + "step": 2007 + }, + { + "epoch": 1.1892212022505182, + "grad_norm": 2.329160941125, + "learning_rate": 1.3743957379853885e-05, + "loss": 0.7031, + "step": 2008 + }, + { + "epoch": 1.1898134438851051, + "grad_norm": 2.8641168542308226, + "learning_rate": 1.3738026033214323e-05, + "loss": 0.7713, + "step": 2009 + }, + { + "epoch": 1.190405685519692, + "grad_norm": 2.7900911750772703, + "learning_rate": 1.3732093157511914e-05, + "loss": 0.753, + "step": 2010 + }, + { + "epoch": 1.190997927154279, + "grad_norm": 1.8472081752565688, + "learning_rate": 1.372615875517354e-05, + "loss": 0.7305, + "step": 2011 + }, + { + "epoch": 1.191590168788866, + "grad_norm": 1.9016156353449258, + "learning_rate": 1.3720222828626699e-05, + "loss": 0.7529, + "step": 2012 + }, + { + "epoch": 1.1921824104234529, + "grad_norm": 1.9348033443818209, + "learning_rate": 1.3714285380299525e-05, + "loss": 0.7213, + "step": 2013 + }, + { + "epoch": 1.1927746520580396, + "grad_norm": 1.371298962671068, + "learning_rate": 1.3708346412620768e-05, + "loss": 0.7334, + "step": 2014 + }, + { + "epoch": 1.1933668936926265, + "grad_norm": 1.4004771260654063, + "learning_rate": 1.3702405928019795e-05, + "loss": 0.715, + "step": 2015 + }, + { + "epoch": 1.1939591353272134, + "grad_norm": 1.6163665195760184, + "learning_rate": 1.3696463928926602e-05, + "loss": 0.727, + "step": 2016 + }, + { + "epoch": 1.1945513769618004, + "grad_norm": 1.8911450014106126, + "learning_rate": 1.36905204177718e-05, + "loss": 0.7406, + "step": 2017 + }, + { + "epoch": 1.1951436185963873, + "grad_norm": 2.6466552053840413, + "learning_rate": 1.3684575396986622e-05, + "loss": 0.7064, + "step": 2018 + }, + { + "epoch": 1.1957358602309742, + "grad_norm": 1.7980339830263137, + "learning_rate": 1.3678628869002908e-05, + "loss": 0.6718, + "step": 2019 + }, + { + "epoch": 1.1963281018655612, + "grad_norm": 2.18736908567566, + "learning_rate": 1.3672680836253129e-05, + "loss": 0.7137, + "step": 2020 + }, + { + "epoch": 1.196920343500148, + "grad_norm": 6.371103342127131, + "learning_rate": 1.3666731301170364e-05, + "loss": 0.7216, + "step": 2021 + }, + { + "epoch": 1.197512585134735, + "grad_norm": 1.7740748237595747, + "learning_rate": 1.3660780266188306e-05, + "loss": 0.7277, + "step": 2022 + }, + { + "epoch": 1.198104826769322, + "grad_norm": 1.437742129128349, + "learning_rate": 1.3654827733741263e-05, + "loss": 0.7204, + "step": 2023 + }, + { + "epoch": 1.1986970684039089, + "grad_norm": 2.9002638880137597, + "learning_rate": 1.3648873706264159e-05, + "loss": 0.6936, + "step": 2024 + }, + { + "epoch": 1.1992893100384956, + "grad_norm": 1.6559528300674338, + "learning_rate": 1.3642918186192521e-05, + "loss": 0.7051, + "step": 2025 + }, + { + "epoch": 1.1998815516730825, + "grad_norm": 2.2195218190520465, + "learning_rate": 1.363696117596249e-05, + "loss": 0.7208, + "step": 2026 + }, + { + "epoch": 1.2004737933076695, + "grad_norm": 1.8165964870897855, + "learning_rate": 1.363100267801083e-05, + "loss": 0.7576, + "step": 2027 + }, + { + "epoch": 1.2010660349422564, + "grad_norm": 2.3943878965553655, + "learning_rate": 1.3625042694774886e-05, + "loss": 0.7357, + "step": 2028 + }, + { + "epoch": 1.2016582765768433, + "grad_norm": 1.7673693572256555, + "learning_rate": 1.3619081228692639e-05, + "loss": 0.7445, + "step": 2029 + }, + { + "epoch": 1.2022505182114303, + "grad_norm": 2.3818913908134585, + "learning_rate": 1.3613118282202653e-05, + "loss": 0.773, + "step": 2030 + }, + { + "epoch": 1.2028427598460172, + "grad_norm": 1.3134819673961666, + "learning_rate": 1.360715385774412e-05, + "loss": 0.7121, + "step": 2031 + }, + { + "epoch": 1.2034350014806041, + "grad_norm": 1.4209296259531388, + "learning_rate": 1.3601187957756814e-05, + "loss": 0.7476, + "step": 2032 + }, + { + "epoch": 1.204027243115191, + "grad_norm": 1.5555865610475328, + "learning_rate": 1.3595220584681132e-05, + "loss": 0.7267, + "step": 2033 + }, + { + "epoch": 1.204619484749778, + "grad_norm": 1.7478667815196771, + "learning_rate": 1.358925174095806e-05, + "loss": 0.736, + "step": 2034 + }, + { + "epoch": 1.205211726384365, + "grad_norm": 1.5866637854107661, + "learning_rate": 1.3583281429029197e-05, + "loss": 0.734, + "step": 2035 + }, + { + "epoch": 1.2058039680189516, + "grad_norm": 1.7725107135676927, + "learning_rate": 1.3577309651336728e-05, + "loss": 0.7305, + "step": 2036 + }, + { + "epoch": 1.2063962096535386, + "grad_norm": 1.623392832320982, + "learning_rate": 1.3571336410323448e-05, + "loss": 0.7349, + "step": 2037 + }, + { + "epoch": 1.2069884512881255, + "grad_norm": 1.6230978180494768, + "learning_rate": 1.3565361708432754e-05, + "loss": 0.7369, + "step": 2038 + }, + { + "epoch": 1.2075806929227124, + "grad_norm": 1.3893660474733283, + "learning_rate": 1.3559385548108628e-05, + "loss": 0.777, + "step": 2039 + }, + { + "epoch": 1.2081729345572993, + "grad_norm": 1.8002057697729996, + "learning_rate": 1.3553407931795662e-05, + "loss": 0.7434, + "step": 2040 + }, + { + "epoch": 1.2087651761918863, + "grad_norm": 2.208673349342069, + "learning_rate": 1.3547428861939031e-05, + "loss": 0.7277, + "step": 2041 + }, + { + "epoch": 1.2093574178264732, + "grad_norm": 2.130797939268362, + "learning_rate": 1.3541448340984516e-05, + "loss": 0.7517, + "step": 2042 + }, + { + "epoch": 1.2099496594610601, + "grad_norm": 1.6356617427047075, + "learning_rate": 1.3535466371378483e-05, + "loss": 0.7003, + "step": 2043 + }, + { + "epoch": 1.210541901095647, + "grad_norm": 1.41442682311907, + "learning_rate": 1.3529482955567896e-05, + "loss": 0.7375, + "step": 2044 + }, + { + "epoch": 1.211134142730234, + "grad_norm": 1.31478939013615, + "learning_rate": 1.352349809600031e-05, + "loss": 0.7268, + "step": 2045 + }, + { + "epoch": 1.211726384364821, + "grad_norm": 2.630006601001849, + "learning_rate": 1.3517511795123864e-05, + "loss": 0.7314, + "step": 2046 + }, + { + "epoch": 1.2123186259994077, + "grad_norm": 2.771345319898994, + "learning_rate": 1.3511524055387293e-05, + "loss": 0.738, + "step": 2047 + }, + { + "epoch": 1.2129108676339946, + "grad_norm": 1.4849713723280222, + "learning_rate": 1.3505534879239923e-05, + "loss": 0.7009, + "step": 2048 + }, + { + "epoch": 1.2135031092685815, + "grad_norm": 1.6458441198708356, + "learning_rate": 1.3499544269131662e-05, + "loss": 0.7104, + "step": 2049 + }, + { + "epoch": 1.2140953509031684, + "grad_norm": 1.7131561551779049, + "learning_rate": 1.3493552227513007e-05, + "loss": 0.7311, + "step": 2050 + }, + { + "epoch": 1.2146875925377554, + "grad_norm": 1.3615653142107633, + "learning_rate": 1.3487558756835037e-05, + "loss": 0.7221, + "step": 2051 + }, + { + "epoch": 1.2152798341723423, + "grad_norm": 1.2965127926167885, + "learning_rate": 1.348156385954942e-05, + "loss": 0.7176, + "step": 2052 + }, + { + "epoch": 1.2158720758069292, + "grad_norm": 1.2844748989522476, + "learning_rate": 1.347556753810841e-05, + "loss": 0.7511, + "step": 2053 + }, + { + "epoch": 1.2164643174415162, + "grad_norm": 1.530448235101446, + "learning_rate": 1.3469569794964832e-05, + "loss": 0.7388, + "step": 2054 + }, + { + "epoch": 1.217056559076103, + "grad_norm": 1.958572967417298, + "learning_rate": 1.346357063257211e-05, + "loss": 0.706, + "step": 2055 + }, + { + "epoch": 1.21764880071069, + "grad_norm": 2.5624134832322514, + "learning_rate": 1.3457570053384225e-05, + "loss": 0.7516, + "step": 2056 + }, + { + "epoch": 1.218241042345277, + "grad_norm": 1.4244459644738547, + "learning_rate": 1.3451568059855769e-05, + "loss": 0.7369, + "step": 2057 + }, + { + "epoch": 1.2188332839798637, + "grad_norm": 1.3769283070535534, + "learning_rate": 1.3445564654441879e-05, + "loss": 0.7497, + "step": 2058 + }, + { + "epoch": 1.2194255256144506, + "grad_norm": 2.074590247262327, + "learning_rate": 1.3439559839598296e-05, + "loss": 0.7637, + "step": 2059 + }, + { + "epoch": 1.2200177672490375, + "grad_norm": 2.0719993078058923, + "learning_rate": 1.3433553617781318e-05, + "loss": 0.7669, + "step": 2060 + }, + { + "epoch": 1.2206100088836245, + "grad_norm": 2.3062477000075248, + "learning_rate": 1.3427545991447838e-05, + "loss": 0.7561, + "step": 2061 + }, + { + "epoch": 1.2212022505182114, + "grad_norm": 1.45455392841872, + "learning_rate": 1.3421536963055304e-05, + "loss": 0.7921, + "step": 2062 + }, + { + "epoch": 1.2217944921527983, + "grad_norm": 1.8743454895734402, + "learning_rate": 1.341552653506175e-05, + "loss": 0.7586, + "step": 2063 + }, + { + "epoch": 1.2223867337873853, + "grad_norm": 1.6979098890708813, + "learning_rate": 1.3409514709925777e-05, + "loss": 0.741, + "step": 2064 + }, + { + "epoch": 1.2229789754219722, + "grad_norm": 3.0439938556284045, + "learning_rate": 1.3403501490106562e-05, + "loss": 0.753, + "step": 2065 + }, + { + "epoch": 1.2235712170565591, + "grad_norm": 1.4161385510159186, + "learning_rate": 1.3397486878063852e-05, + "loss": 0.7204, + "step": 2066 + }, + { + "epoch": 1.224163458691146, + "grad_norm": 1.4943178672108177, + "learning_rate": 1.3391470876257957e-05, + "loss": 0.7726, + "step": 2067 + }, + { + "epoch": 1.224755700325733, + "grad_norm": 9.496258794249528, + "learning_rate": 1.3385453487149765e-05, + "loss": 0.7337, + "step": 2068 + }, + { + "epoch": 1.2253479419603197, + "grad_norm": 2.31673978824299, + "learning_rate": 1.3379434713200719e-05, + "loss": 0.7277, + "step": 2069 + }, + { + "epoch": 1.2259401835949066, + "grad_norm": 4.688994620362354, + "learning_rate": 1.3373414556872844e-05, + "loss": 0.7385, + "step": 2070 + }, + { + "epoch": 1.2265324252294936, + "grad_norm": 2.7363030213596686, + "learning_rate": 1.3367393020628716e-05, + "loss": 0.7473, + "step": 2071 + }, + { + "epoch": 1.2271246668640805, + "grad_norm": 2.074298161390372, + "learning_rate": 1.3361370106931486e-05, + "loss": 0.7227, + "step": 2072 + }, + { + "epoch": 1.2277169084986674, + "grad_norm": 1.790196274702212, + "learning_rate": 1.3355345818244864e-05, + "loss": 0.7003, + "step": 2073 + }, + { + "epoch": 1.2283091501332544, + "grad_norm": 2.2647881453530636, + "learning_rate": 1.3349320157033121e-05, + "loss": 0.6884, + "step": 2074 + }, + { + "epoch": 1.2289013917678413, + "grad_norm": 2.009616238480604, + "learning_rate": 1.3343293125761095e-05, + "loss": 0.739, + "step": 2075 + }, + { + "epoch": 1.2294936334024282, + "grad_norm": 1.5327846716374411, + "learning_rate": 1.3337264726894175e-05, + "loss": 0.765, + "step": 2076 + }, + { + "epoch": 1.2300858750370152, + "grad_norm": 1.5570663444419837, + "learning_rate": 1.333123496289832e-05, + "loss": 0.8065, + "step": 2077 + }, + { + "epoch": 1.230678116671602, + "grad_norm": 9.590583102694426, + "learning_rate": 1.3325203836240039e-05, + "loss": 0.7578, + "step": 2078 + }, + { + "epoch": 1.231270358306189, + "grad_norm": 1.945591127870425, + "learning_rate": 1.3319171349386407e-05, + "loss": 0.7356, + "step": 2079 + }, + { + "epoch": 1.2318625999407757, + "grad_norm": 1.9607332954570817, + "learning_rate": 1.3313137504805042e-05, + "loss": 0.72, + "step": 2080 + }, + { + "epoch": 1.2324548415753627, + "grad_norm": 2.1351386740486387, + "learning_rate": 1.3307102304964137e-05, + "loss": 0.7345, + "step": 2081 + }, + { + "epoch": 1.2330470832099496, + "grad_norm": 2.0536248837102336, + "learning_rate": 1.3301065752332415e-05, + "loss": 0.7216, + "step": 2082 + }, + { + "epoch": 1.2336393248445365, + "grad_norm": 2.055956373479122, + "learning_rate": 1.329502784937918e-05, + "loss": 0.734, + "step": 2083 + }, + { + "epoch": 1.2342315664791235, + "grad_norm": 1.335091728441026, + "learning_rate": 1.328898859857426e-05, + "loss": 0.6955, + "step": 2084 + }, + { + "epoch": 1.2348238081137104, + "grad_norm": 3.0796986220464118, + "learning_rate": 1.328294800238806e-05, + "loss": 0.6923, + "step": 2085 + }, + { + "epoch": 1.2354160497482973, + "grad_norm": 1.4601736651102073, + "learning_rate": 1.3276906063291511e-05, + "loss": 0.7418, + "step": 2086 + }, + { + "epoch": 1.2360082913828843, + "grad_norm": 1.9846393822643258, + "learning_rate": 1.327086278375612e-05, + "loss": 0.7092, + "step": 2087 + }, + { + "epoch": 1.2366005330174712, + "grad_norm": 1.5795884904364481, + "learning_rate": 1.3264818166253917e-05, + "loss": 0.7119, + "step": 2088 + }, + { + "epoch": 1.2371927746520581, + "grad_norm": 3.1635752576713663, + "learning_rate": 1.3258772213257493e-05, + "loss": 0.724, + "step": 2089 + }, + { + "epoch": 1.237785016286645, + "grad_norm": 2.0561446054723573, + "learning_rate": 1.3252724927239986e-05, + "loss": 0.7008, + "step": 2090 + }, + { + "epoch": 1.2383772579212318, + "grad_norm": 1.977573696753446, + "learning_rate": 1.3246676310675076e-05, + "loss": 0.6925, + "step": 2091 + }, + { + "epoch": 1.2389694995558187, + "grad_norm": 1.459694866102705, + "learning_rate": 1.3240626366036982e-05, + "loss": 0.7235, + "step": 2092 + }, + { + "epoch": 1.2395617411904056, + "grad_norm": 1.7170760478119698, + "learning_rate": 1.3234575095800477e-05, + "loss": 0.672, + "step": 2093 + }, + { + "epoch": 1.2401539828249926, + "grad_norm": 2.4802114399082673, + "learning_rate": 1.3228522502440868e-05, + "loss": 0.7352, + "step": 2094 + }, + { + "epoch": 1.2407462244595795, + "grad_norm": 2.0363567883714713, + "learning_rate": 1.3222468588434007e-05, + "loss": 0.7406, + "step": 2095 + }, + { + "epoch": 1.2413384660941664, + "grad_norm": 1.6732345457040516, + "learning_rate": 1.3216413356256286e-05, + "loss": 0.7671, + "step": 2096 + }, + { + "epoch": 1.2419307077287534, + "grad_norm": 2.0841489770262744, + "learning_rate": 1.3210356808384634e-05, + "loss": 0.7337, + "step": 2097 + }, + { + "epoch": 1.2425229493633403, + "grad_norm": 2.033101647120512, + "learning_rate": 1.3204298947296521e-05, + "loss": 0.7288, + "step": 2098 + }, + { + "epoch": 1.2431151909979272, + "grad_norm": 7.475913929909522, + "learning_rate": 1.3198239775469953e-05, + "loss": 0.7209, + "step": 2099 + }, + { + "epoch": 1.2437074326325142, + "grad_norm": 2.093757868519759, + "learning_rate": 1.319217929538347e-05, + "loss": 0.7292, + "step": 2100 + }, + { + "epoch": 1.244299674267101, + "grad_norm": 1.9048288578338362, + "learning_rate": 1.318611750951615e-05, + "loss": 0.7345, + "step": 2101 + }, + { + "epoch": 1.2448919159016878, + "grad_norm": 2.054573126945815, + "learning_rate": 1.3180054420347603e-05, + "loss": 0.7389, + "step": 2102 + }, + { + "epoch": 1.2454841575362747, + "grad_norm": 1.8305207739909872, + "learning_rate": 1.317399003035798e-05, + "loss": 0.7297, + "step": 2103 + }, + { + "epoch": 1.2460763991708617, + "grad_norm": 1.9690959794278322, + "learning_rate": 1.3167924342027947e-05, + "loss": 0.7013, + "step": 2104 + }, + { + "epoch": 1.2466686408054486, + "grad_norm": 2.0772164992233715, + "learning_rate": 1.316185735783872e-05, + "loss": 0.7077, + "step": 2105 + }, + { + "epoch": 1.2472608824400355, + "grad_norm": 2.034723235828488, + "learning_rate": 1.315578908027203e-05, + "loss": 0.7378, + "step": 2106 + }, + { + "epoch": 1.2478531240746225, + "grad_norm": 1.5869955061233847, + "learning_rate": 1.3149719511810152e-05, + "loss": 0.7371, + "step": 2107 + }, + { + "epoch": 1.2484453657092094, + "grad_norm": 3.66716265133577, + "learning_rate": 1.3143648654935875e-05, + "loss": 0.7766, + "step": 2108 + }, + { + "epoch": 1.2490376073437963, + "grad_norm": 2.1574547752973707, + "learning_rate": 1.3137576512132524e-05, + "loss": 0.7387, + "step": 2109 + }, + { + "epoch": 1.2496298489783833, + "grad_norm": 2.0549597904621426, + "learning_rate": 1.313150308588394e-05, + "loss": 0.7119, + "step": 2110 + }, + { + "epoch": 1.25022209061297, + "grad_norm": 1.766968422618495, + "learning_rate": 1.3125428378674507e-05, + "loss": 0.7619, + "step": 2111 + }, + { + "epoch": 1.2508143322475571, + "grad_norm": 1.6504903814709435, + "learning_rate": 1.311935239298911e-05, + "loss": 0.7339, + "step": 2112 + }, + { + "epoch": 1.2514065738821438, + "grad_norm": 2.3755573371765846, + "learning_rate": 1.3113275131313179e-05, + "loss": 0.7706, + "step": 2113 + }, + { + "epoch": 1.2519988155167308, + "grad_norm": 6.419415940537187, + "learning_rate": 1.310719659613265e-05, + "loss": 0.7298, + "step": 2114 + }, + { + "epoch": 1.2525910571513177, + "grad_norm": 4.777728911657151, + "learning_rate": 1.3101116789933988e-05, + "loss": 0.7351, + "step": 2115 + }, + { + "epoch": 1.2531832987859046, + "grad_norm": 1.9830147062841872, + "learning_rate": 1.3095035715204171e-05, + "loss": 0.7352, + "step": 2116 + }, + { + "epoch": 1.2537755404204916, + "grad_norm": 3.8969606332634212, + "learning_rate": 1.3088953374430709e-05, + "loss": 0.7565, + "step": 2117 + }, + { + "epoch": 1.2543677820550785, + "grad_norm": 3.895878218256781, + "learning_rate": 1.3082869770101613e-05, + "loss": 0.737, + "step": 2118 + }, + { + "epoch": 1.2549600236896654, + "grad_norm": 3.0246769337442725, + "learning_rate": 1.3076784904705426e-05, + "loss": 0.7413, + "step": 2119 + }, + { + "epoch": 1.2555522653242523, + "grad_norm": 6.0066054184240105, + "learning_rate": 1.3070698780731194e-05, + "loss": 0.7304, + "step": 2120 + }, + { + "epoch": 1.2561445069588393, + "grad_norm": 3.00887439367028, + "learning_rate": 1.306461140066849e-05, + "loss": 0.7248, + "step": 2121 + }, + { + "epoch": 1.256736748593426, + "grad_norm": 2.3778171464319318, + "learning_rate": 1.305852276700739e-05, + "loss": 0.7199, + "step": 2122 + }, + { + "epoch": 1.2573289902280131, + "grad_norm": 3.933106640277816, + "learning_rate": 1.3052432882238487e-05, + "loss": 0.7539, + "step": 2123 + }, + { + "epoch": 1.2579212318625999, + "grad_norm": 12.537942854482242, + "learning_rate": 1.304634174885289e-05, + "loss": 0.7503, + "step": 2124 + }, + { + "epoch": 1.2585134734971868, + "grad_norm": 3.6650070710135174, + "learning_rate": 1.3040249369342215e-05, + "loss": 0.7451, + "step": 2125 + }, + { + "epoch": 1.2591057151317737, + "grad_norm": 3.320172986670575, + "learning_rate": 1.3034155746198588e-05, + "loss": 0.692, + "step": 2126 + }, + { + "epoch": 1.2596979567663606, + "grad_norm": 2.414366162856636, + "learning_rate": 1.3028060881914639e-05, + "loss": 0.7276, + "step": 2127 + }, + { + "epoch": 1.2602901984009476, + "grad_norm": 1.845695162557533, + "learning_rate": 1.3021964778983513e-05, + "loss": 0.7083, + "step": 2128 + }, + { + "epoch": 1.2608824400355345, + "grad_norm": 3.8714818295986957, + "learning_rate": 1.3015867439898862e-05, + "loss": 0.7238, + "step": 2129 + }, + { + "epoch": 1.2614746816701214, + "grad_norm": 3.0251438217693205, + "learning_rate": 1.3009768867154834e-05, + "loss": 0.7342, + "step": 2130 + }, + { + "epoch": 1.2620669233047084, + "grad_norm": 3.7078771253414717, + "learning_rate": 1.3003669063246096e-05, + "loss": 0.7253, + "step": 2131 + }, + { + "epoch": 1.2626591649392953, + "grad_norm": 1.6876561495268882, + "learning_rate": 1.2997568030667802e-05, + "loss": 0.7286, + "step": 2132 + }, + { + "epoch": 1.263251406573882, + "grad_norm": 2.3907642362721835, + "learning_rate": 1.2991465771915626e-05, + "loss": 0.7579, + "step": 2133 + }, + { + "epoch": 1.2638436482084692, + "grad_norm": 2.5589770747316867, + "learning_rate": 1.2985362289485728e-05, + "loss": 0.7411, + "step": 2134 + }, + { + "epoch": 1.2644358898430559, + "grad_norm": 1.670540837036476, + "learning_rate": 1.2979257585874782e-05, + "loss": 0.7457, + "step": 2135 + }, + { + "epoch": 1.2650281314776428, + "grad_norm": 2.1619277987058525, + "learning_rate": 1.2973151663579948e-05, + "loss": 0.7158, + "step": 2136 + }, + { + "epoch": 1.2656203731122297, + "grad_norm": 4.927979400324849, + "learning_rate": 1.2967044525098897e-05, + "loss": 0.7082, + "step": 2137 + }, + { + "epoch": 1.2662126147468167, + "grad_norm": 1.7102954049147032, + "learning_rate": 1.296093617292979e-05, + "loss": 0.731, + "step": 2138 + }, + { + "epoch": 1.2668048563814036, + "grad_norm": 4.087043152351807, + "learning_rate": 1.295482660957129e-05, + "loss": 0.7433, + "step": 2139 + }, + { + "epoch": 1.2673970980159905, + "grad_norm": 7.571384572693805, + "learning_rate": 1.2948715837522542e-05, + "loss": 0.7354, + "step": 2140 + }, + { + "epoch": 1.2679893396505775, + "grad_norm": 2.3629194134848928, + "learning_rate": 1.2942603859283207e-05, + "loss": 0.7327, + "step": 2141 + }, + { + "epoch": 1.2685815812851644, + "grad_norm": 2.8525406546001415, + "learning_rate": 1.2936490677353422e-05, + "loss": 0.7268, + "step": 2142 + }, + { + "epoch": 1.2691738229197513, + "grad_norm": 14.104534903708966, + "learning_rate": 1.2930376294233821e-05, + "loss": 0.723, + "step": 2143 + }, + { + "epoch": 1.269766064554338, + "grad_norm": 2.488060816626756, + "learning_rate": 1.2924260712425536e-05, + "loss": 0.7055, + "step": 2144 + }, + { + "epoch": 1.2703583061889252, + "grad_norm": 2.281444459835155, + "learning_rate": 1.2918143934430178e-05, + "loss": 0.7358, + "step": 2145 + }, + { + "epoch": 1.270950547823512, + "grad_norm": 2.863071025708633, + "learning_rate": 1.2912025962749856e-05, + "loss": 0.7511, + "step": 2146 + }, + { + "epoch": 1.2715427894580988, + "grad_norm": 3.4445132717535145, + "learning_rate": 1.2905906799887164e-05, + "loss": 0.7211, + "step": 2147 + }, + { + "epoch": 1.2721350310926858, + "grad_norm": 2.992321062019045, + "learning_rate": 1.2899786448345186e-05, + "loss": 0.7323, + "step": 2148 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 2.7754937679502767, + "learning_rate": 1.2893664910627486e-05, + "loss": 0.6995, + "step": 2149 + }, + { + "epoch": 1.2733195143618596, + "grad_norm": 4.043526562299961, + "learning_rate": 1.288754218923812e-05, + "loss": 0.7038, + "step": 2150 + }, + { + "epoch": 1.2739117559964466, + "grad_norm": 2.601133025890244, + "learning_rate": 1.2881418286681622e-05, + "loss": 0.7592, + "step": 2151 + }, + { + "epoch": 1.2745039976310335, + "grad_norm": 3.228479699181939, + "learning_rate": 1.2875293205463018e-05, + "loss": 0.7457, + "step": 2152 + }, + { + "epoch": 1.2750962392656204, + "grad_norm": 5.282129958437338, + "learning_rate": 1.2869166948087804e-05, + "loss": 0.7341, + "step": 2153 + }, + { + "epoch": 1.2756884809002074, + "grad_norm": 12.516944032609397, + "learning_rate": 1.2863039517061968e-05, + "loss": 0.7495, + "step": 2154 + }, + { + "epoch": 1.276280722534794, + "grad_norm": 4.462130212698797, + "learning_rate": 1.2856910914891973e-05, + "loss": 0.7348, + "step": 2155 + }, + { + "epoch": 1.2768729641693812, + "grad_norm": 4.499459533831295, + "learning_rate": 1.2850781144084763e-05, + "loss": 0.7377, + "step": 2156 + }, + { + "epoch": 1.277465205803968, + "grad_norm": 4.366118773756862, + "learning_rate": 1.284465020714776e-05, + "loss": 0.7213, + "step": 2157 + }, + { + "epoch": 1.2780574474385549, + "grad_norm": 5.796358264091171, + "learning_rate": 1.2838518106588856e-05, + "loss": 0.7296, + "step": 2158 + }, + { + "epoch": 1.2786496890731418, + "grad_norm": 3.051441847538537, + "learning_rate": 1.2832384844916433e-05, + "loss": 0.7283, + "step": 2159 + }, + { + "epoch": 1.2792419307077287, + "grad_norm": 2.3238935491908865, + "learning_rate": 1.2826250424639329e-05, + "loss": 0.7258, + "step": 2160 + }, + { + "epoch": 1.2798341723423157, + "grad_norm": 10.583266939609347, + "learning_rate": 1.282011484826688e-05, + "loss": 0.6906, + "step": 2161 + }, + { + "epoch": 1.2804264139769026, + "grad_norm": 1.9613859782854648, + "learning_rate": 1.2813978118308872e-05, + "loss": 0.7381, + "step": 2162 + }, + { + "epoch": 1.2810186556114895, + "grad_norm": 1.5522438888542716, + "learning_rate": 1.2807840237275578e-05, + "loss": 0.7759, + "step": 2163 + }, + { + "epoch": 1.2816108972460765, + "grad_norm": 2.0620688237603866, + "learning_rate": 1.2801701207677731e-05, + "loss": 0.7275, + "step": 2164 + }, + { + "epoch": 1.2822031388806634, + "grad_norm": 4.033929396881479, + "learning_rate": 1.2795561032026546e-05, + "loss": 0.7161, + "step": 2165 + }, + { + "epoch": 1.28279538051525, + "grad_norm": 2.2174066380945128, + "learning_rate": 1.2789419712833698e-05, + "loss": 0.7347, + "step": 2166 + }, + { + "epoch": 1.2833876221498373, + "grad_norm": 2.989193169042572, + "learning_rate": 1.2783277252611326e-05, + "loss": 0.744, + "step": 2167 + }, + { + "epoch": 1.283979863784424, + "grad_norm": 3.7142286905163817, + "learning_rate": 1.277713365387205e-05, + "loss": 0.711, + "step": 2168 + }, + { + "epoch": 1.284572105419011, + "grad_norm": 5.139644730862864, + "learning_rate": 1.2770988919128943e-05, + "loss": 0.7386, + "step": 2169 + }, + { + "epoch": 1.2851643470535978, + "grad_norm": 6.589804111626187, + "learning_rate": 1.2764843050895548e-05, + "loss": 0.7376, + "step": 2170 + }, + { + "epoch": 1.2857565886881848, + "grad_norm": 7.048157778907931, + "learning_rate": 1.275869605168587e-05, + "loss": 0.726, + "step": 2171 + }, + { + "epoch": 1.2863488303227717, + "grad_norm": 4.519497731902214, + "learning_rate": 1.2752547924014378e-05, + "loss": 0.7176, + "step": 2172 + }, + { + "epoch": 1.2869410719573586, + "grad_norm": 4.560564674061755, + "learning_rate": 1.2746398670396003e-05, + "loss": 0.7548, + "step": 2173 + }, + { + "epoch": 1.2875333135919456, + "grad_norm": 2.476033120505578, + "learning_rate": 1.2740248293346134e-05, + "loss": 0.6937, + "step": 2174 + }, + { + "epoch": 1.2881255552265325, + "grad_norm": 1.711083220222386, + "learning_rate": 1.2734096795380619e-05, + "loss": 0.7451, + "step": 2175 + }, + { + "epoch": 1.2887177968611194, + "grad_norm": 2.308003756437765, + "learning_rate": 1.2727944179015773e-05, + "loss": 0.7071, + "step": 2176 + }, + { + "epoch": 1.2893100384957061, + "grad_norm": 1.9018295125948241, + "learning_rate": 1.2721790446768355e-05, + "loss": 0.7064, + "step": 2177 + }, + { + "epoch": 1.2899022801302933, + "grad_norm": 3.3632041500633663, + "learning_rate": 1.271563560115559e-05, + "loss": 0.756, + "step": 2178 + }, + { + "epoch": 1.29049452176488, + "grad_norm": 9.199019371454177, + "learning_rate": 1.2709479644695157e-05, + "loss": 0.7183, + "step": 2179 + }, + { + "epoch": 1.291086763399467, + "grad_norm": 3.5389831036172223, + "learning_rate": 1.2703322579905191e-05, + "loss": 0.717, + "step": 2180 + }, + { + "epoch": 1.2916790050340539, + "grad_norm": 7.54002229069042, + "learning_rate": 1.2697164409304273e-05, + "loss": 0.7707, + "step": 2181 + }, + { + "epoch": 1.2922712466686408, + "grad_norm": 1.7866494430336994, + "learning_rate": 1.269100513541144e-05, + "loss": 0.7525, + "step": 2182 + }, + { + "epoch": 1.2928634883032277, + "grad_norm": 1.4683566571838764, + "learning_rate": 1.2684844760746188e-05, + "loss": 0.741, + "step": 2183 + }, + { + "epoch": 1.2934557299378147, + "grad_norm": 1.8765066274808877, + "learning_rate": 1.2678683287828451e-05, + "loss": 0.7657, + "step": 2184 + }, + { + "epoch": 1.2940479715724016, + "grad_norm": 3.013366849613377, + "learning_rate": 1.267252071917862e-05, + "loss": 0.7016, + "step": 2185 + }, + { + "epoch": 1.2946402132069885, + "grad_norm": 1.7177077320985006, + "learning_rate": 1.266635705731753e-05, + "loss": 0.692, + "step": 2186 + }, + { + "epoch": 1.2952324548415755, + "grad_norm": 3.9718676101648813, + "learning_rate": 1.266019230476647e-05, + "loss": 0.6845, + "step": 2187 + }, + { + "epoch": 1.2958246964761622, + "grad_norm": 7.981490810275518, + "learning_rate": 1.2654026464047165e-05, + "loss": 0.6967, + "step": 2188 + }, + { + "epoch": 1.2964169381107493, + "grad_norm": 1.6653196455250276, + "learning_rate": 1.2647859537681794e-05, + "loss": 0.7264, + "step": 2189 + }, + { + "epoch": 1.297009179745336, + "grad_norm": 3.229577009236302, + "learning_rate": 1.2641691528192976e-05, + "loss": 0.7175, + "step": 2190 + }, + { + "epoch": 1.297601421379923, + "grad_norm": 1.8793833759264493, + "learning_rate": 1.2635522438103775e-05, + "loss": 0.7383, + "step": 2191 + }, + { + "epoch": 1.29819366301451, + "grad_norm": 6.074008068005565, + "learning_rate": 1.262935226993769e-05, + "loss": 0.716, + "step": 2192 + }, + { + "epoch": 1.2987859046490968, + "grad_norm": 3.365277587578852, + "learning_rate": 1.2623181026218676e-05, + "loss": 0.7079, + "step": 2193 + }, + { + "epoch": 1.2993781462836838, + "grad_norm": 3.8730032760836712, + "learning_rate": 1.261700870947111e-05, + "loss": 0.7094, + "step": 2194 + }, + { + "epoch": 1.2999703879182707, + "grad_norm": 2.4954320743950205, + "learning_rate": 1.2610835322219822e-05, + "loss": 0.7249, + "step": 2195 + }, + { + "epoch": 1.3005626295528576, + "grad_norm": 2.328817696663847, + "learning_rate": 1.2604660866990072e-05, + "loss": 0.7027, + "step": 2196 + }, + { + "epoch": 1.3011548711874446, + "grad_norm": 1.7371517673037984, + "learning_rate": 1.259848534630756e-05, + "loss": 0.7134, + "step": 2197 + }, + { + "epoch": 1.3017471128220315, + "grad_norm": 2.4321062518581096, + "learning_rate": 1.2592308762698422e-05, + "loss": 0.7529, + "step": 2198 + }, + { + "epoch": 1.3023393544566182, + "grad_norm": 2.551030855888438, + "learning_rate": 1.2586131118689229e-05, + "loss": 0.7395, + "step": 2199 + }, + { + "epoch": 1.3029315960912053, + "grad_norm": 2.5192997738692253, + "learning_rate": 1.257995241680698e-05, + "loss": 0.767, + "step": 2200 + }, + { + "epoch": 1.303523837725792, + "grad_norm": 1.871844395912189, + "learning_rate": 1.2573772659579118e-05, + "loss": 0.7456, + "step": 2201 + }, + { + "epoch": 1.304116079360379, + "grad_norm": 2.1558955925829903, + "learning_rate": 1.2567591849533507e-05, + "loss": 0.7714, + "step": 2202 + }, + { + "epoch": 1.304708320994966, + "grad_norm": 1.5443144366947106, + "learning_rate": 1.2561409989198445e-05, + "loss": 0.7153, + "step": 2203 + }, + { + "epoch": 1.3053005626295529, + "grad_norm": 2.893987768096241, + "learning_rate": 1.2555227081102663e-05, + "loss": 0.7501, + "step": 2204 + }, + { + "epoch": 1.3058928042641398, + "grad_norm": 2.9864438149165293, + "learning_rate": 1.2549043127775317e-05, + "loss": 0.7392, + "step": 2205 + }, + { + "epoch": 1.3064850458987267, + "grad_norm": 2.2495043135725044, + "learning_rate": 1.2542858131745997e-05, + "loss": 0.7344, + "step": 2206 + }, + { + "epoch": 1.3070772875333136, + "grad_norm": 1.8073617958458001, + "learning_rate": 1.2536672095544705e-05, + "loss": 0.739, + "step": 2207 + }, + { + "epoch": 1.3076695291679006, + "grad_norm": 5.482475430635642, + "learning_rate": 1.253048502170188e-05, + "loss": 0.7081, + "step": 2208 + }, + { + "epoch": 1.3082617708024875, + "grad_norm": 1.6369830148297229, + "learning_rate": 1.2524296912748391e-05, + "loss": 0.743, + "step": 2209 + }, + { + "epoch": 1.3088540124370742, + "grad_norm": 2.3037478748956457, + "learning_rate": 1.2518107771215511e-05, + "loss": 0.7013, + "step": 2210 + }, + { + "epoch": 1.3094462540716614, + "grad_norm": 2.2512579980860727, + "learning_rate": 1.2511917599634957e-05, + "loss": 0.7202, + "step": 2211 + }, + { + "epoch": 1.310038495706248, + "grad_norm": 3.811836598474259, + "learning_rate": 1.2505726400538849e-05, + "loss": 0.735, + "step": 2212 + }, + { + "epoch": 1.310630737340835, + "grad_norm": 3.888905349606496, + "learning_rate": 1.2499534176459742e-05, + "loss": 0.7649, + "step": 2213 + }, + { + "epoch": 1.311222978975422, + "grad_norm": 2.6409628828714062, + "learning_rate": 1.24933409299306e-05, + "loss": 0.7387, + "step": 2214 + }, + { + "epoch": 1.3118152206100089, + "grad_norm": 1.1361250179762734, + "learning_rate": 1.2487146663484811e-05, + "loss": 0.7357, + "step": 2215 + }, + { + "epoch": 1.3124074622445958, + "grad_norm": 3.0048958781591812, + "learning_rate": 1.2480951379656175e-05, + "loss": 0.7016, + "step": 2216 + }, + { + "epoch": 1.3129997038791827, + "grad_norm": 3.618139046477133, + "learning_rate": 1.247475508097892e-05, + "loss": 0.7279, + "step": 2217 + }, + { + "epoch": 1.3135919455137697, + "grad_norm": 5.253731958808833, + "learning_rate": 1.2468557769987667e-05, + "loss": 0.749, + "step": 2218 + }, + { + "epoch": 1.3141841871483566, + "grad_norm": 1.6888707222430752, + "learning_rate": 1.2462359449217478e-05, + "loss": 0.7063, + "step": 2219 + }, + { + "epoch": 1.3147764287829435, + "grad_norm": 1.8167018428081732, + "learning_rate": 1.2456160121203808e-05, + "loss": 0.6972, + "step": 2220 + }, + { + "epoch": 1.3153686704175303, + "grad_norm": 3.091651135895501, + "learning_rate": 1.244995978848253e-05, + "loss": 0.7003, + "step": 2221 + }, + { + "epoch": 1.3159609120521172, + "grad_norm": 1.5525088892116699, + "learning_rate": 1.2443758453589934e-05, + "loss": 0.722, + "step": 2222 + }, + { + "epoch": 1.3165531536867041, + "grad_norm": 2.7538503380997894, + "learning_rate": 1.2437556119062707e-05, + "loss": 0.7218, + "step": 2223 + }, + { + "epoch": 1.317145395321291, + "grad_norm": 2.232065009961151, + "learning_rate": 1.243135278743796e-05, + "loss": 0.7338, + "step": 2224 + }, + { + "epoch": 1.317737636955878, + "grad_norm": 1.4109391466878773, + "learning_rate": 1.24251484612532e-05, + "loss": 0.7339, + "step": 2225 + }, + { + "epoch": 1.318329878590465, + "grad_norm": 5.387146562198588, + "learning_rate": 1.2418943143046346e-05, + "loss": 0.7163, + "step": 2226 + }, + { + "epoch": 1.3189221202250518, + "grad_norm": 7.336494868885874, + "learning_rate": 1.2412736835355725e-05, + "loss": 0.7206, + "step": 2227 + }, + { + "epoch": 1.3195143618596388, + "grad_norm": 5.47208025176641, + "learning_rate": 1.2406529540720063e-05, + "loss": 0.7154, + "step": 2228 + }, + { + "epoch": 1.3201066034942257, + "grad_norm": 2.6695992583732946, + "learning_rate": 1.2400321261678492e-05, + "loss": 0.7236, + "step": 2229 + }, + { + "epoch": 1.3206988451288126, + "grad_norm": 2.195132600070081, + "learning_rate": 1.239411200077055e-05, + "loss": 0.7574, + "step": 2230 + }, + { + "epoch": 1.3212910867633996, + "grad_norm": 11.165845514321358, + "learning_rate": 1.238790176053617e-05, + "loss": 0.7546, + "step": 2231 + }, + { + "epoch": 1.3218833283979863, + "grad_norm": 2.9312352374249864, + "learning_rate": 1.2381690543515692e-05, + "loss": 0.7301, + "step": 2232 + }, + { + "epoch": 1.3224755700325732, + "grad_norm": 1.6545166318992972, + "learning_rate": 1.2375478352249854e-05, + "loss": 0.7211, + "step": 2233 + }, + { + "epoch": 1.3230678116671601, + "grad_norm": 2.1861482000851518, + "learning_rate": 1.236926518927979e-05, + "loss": 0.6936, + "step": 2234 + }, + { + "epoch": 1.323660053301747, + "grad_norm": 1.8651361150526538, + "learning_rate": 1.2363051057147036e-05, + "loss": 0.7116, + "step": 2235 + }, + { + "epoch": 1.324252294936334, + "grad_norm": 2.1494204464514923, + "learning_rate": 1.2356835958393513e-05, + "loss": 0.746, + "step": 2236 + }, + { + "epoch": 1.324844536570921, + "grad_norm": 1.5645161950911375, + "learning_rate": 1.2350619895561557e-05, + "loss": 0.7028, + "step": 2237 + }, + { + "epoch": 1.3254367782055079, + "grad_norm": 1.7327580503628912, + "learning_rate": 1.2344402871193876e-05, + "loss": 0.7459, + "step": 2238 + }, + { + "epoch": 1.3260290198400948, + "grad_norm": 2.9366360287029085, + "learning_rate": 1.2338184887833595e-05, + "loss": 0.7929, + "step": 2239 + }, + { + "epoch": 1.3266212614746817, + "grad_norm": 2.234776359408874, + "learning_rate": 1.2331965948024209e-05, + "loss": 0.741, + "step": 2240 + }, + { + "epoch": 1.3272135031092684, + "grad_norm": 16.615421551346895, + "learning_rate": 1.232574605430962e-05, + "loss": 0.7348, + "step": 2241 + }, + { + "epoch": 1.3278057447438556, + "grad_norm": 1.836805999142044, + "learning_rate": 1.2319525209234109e-05, + "loss": 0.7431, + "step": 2242 + }, + { + "epoch": 1.3283979863784423, + "grad_norm": 3.6059103156218217, + "learning_rate": 1.2313303415342358e-05, + "loss": 0.702, + "step": 2243 + }, + { + "epoch": 1.3289902280130292, + "grad_norm": 2.494436178191213, + "learning_rate": 1.230708067517942e-05, + "loss": 0.7407, + "step": 2244 + }, + { + "epoch": 1.3295824696476162, + "grad_norm": 3.946731442600297, + "learning_rate": 1.230085699129076e-05, + "loss": 0.6811, + "step": 2245 + }, + { + "epoch": 1.330174711282203, + "grad_norm": 4.20508246282007, + "learning_rate": 1.2294632366222201e-05, + "loss": 0.7007, + "step": 2246 + }, + { + "epoch": 1.33076695291679, + "grad_norm": 6.980679537862269, + "learning_rate": 1.2288406802519974e-05, + "loss": 0.6982, + "step": 2247 + }, + { + "epoch": 1.331359194551377, + "grad_norm": 2.0063581908355195, + "learning_rate": 1.2282180302730683e-05, + "loss": 0.7171, + "step": 2248 + }, + { + "epoch": 1.331951436185964, + "grad_norm": 2.812205651172888, + "learning_rate": 1.2275952869401311e-05, + "loss": 0.7062, + "step": 2249 + }, + { + "epoch": 1.3325436778205508, + "grad_norm": 4.260513166725431, + "learning_rate": 1.2269724505079234e-05, + "loss": 0.7677, + "step": 2250 + }, + { + "epoch": 1.3331359194551378, + "grad_norm": 2.395786745837272, + "learning_rate": 1.2263495212312202e-05, + "loss": 0.7321, + "step": 2251 + }, + { + "epoch": 1.3337281610897245, + "grad_norm": 3.4523356684471933, + "learning_rate": 1.2257264993648345e-05, + "loss": 0.7213, + "step": 2252 + }, + { + "epoch": 1.3343204027243116, + "grad_norm": 1.6680835833538643, + "learning_rate": 1.2251033851636174e-05, + "loss": 0.7316, + "step": 2253 + }, + { + "epoch": 1.3349126443588983, + "grad_norm": 4.066851683497322, + "learning_rate": 1.2244801788824577e-05, + "loss": 0.7494, + "step": 2254 + }, + { + "epoch": 1.3355048859934853, + "grad_norm": 2.2024623170572575, + "learning_rate": 1.2238568807762813e-05, + "loss": 0.7078, + "step": 2255 + }, + { + "epoch": 1.3360971276280722, + "grad_norm": 4.096509077552267, + "learning_rate": 1.223233491100053e-05, + "loss": 0.6927, + "step": 2256 + }, + { + "epoch": 1.3366893692626591, + "grad_norm": 1.7054951183155915, + "learning_rate": 1.2226100101087737e-05, + "loss": 0.744, + "step": 2257 + }, + { + "epoch": 1.337281610897246, + "grad_norm": 2.256752220268974, + "learning_rate": 1.2219864380574822e-05, + "loss": 0.7134, + "step": 2258 + }, + { + "epoch": 1.337873852531833, + "grad_norm": 2.6221532584973217, + "learning_rate": 1.2213627752012547e-05, + "loss": 0.7147, + "step": 2259 + }, + { + "epoch": 1.33846609416642, + "grad_norm": 3.3038467185814953, + "learning_rate": 1.2207390217952044e-05, + "loss": 0.7271, + "step": 2260 + }, + { + "epoch": 1.3390583358010069, + "grad_norm": 1.9956526626820097, + "learning_rate": 1.2201151780944813e-05, + "loss": 0.7499, + "step": 2261 + }, + { + "epoch": 1.3396505774355938, + "grad_norm": 2.58029133331588, + "learning_rate": 1.2194912443542728e-05, + "loss": 0.7362, + "step": 2262 + }, + { + "epoch": 1.3402428190701805, + "grad_norm": 1.3093228436379185, + "learning_rate": 1.2188672208298028e-05, + "loss": 0.7464, + "step": 2263 + }, + { + "epoch": 1.3408350607047677, + "grad_norm": 2.2716810173689943, + "learning_rate": 1.2182431077763317e-05, + "loss": 0.7151, + "step": 2264 + }, + { + "epoch": 1.3414273023393544, + "grad_norm": 3.7636447060204112, + "learning_rate": 1.2176189054491576e-05, + "loss": 0.7064, + "step": 2265 + }, + { + "epoch": 1.3420195439739413, + "grad_norm": 1.6344445874345244, + "learning_rate": 1.2169946141036133e-05, + "loss": 0.7248, + "step": 2266 + }, + { + "epoch": 1.3426117856085282, + "grad_norm": 1.660994198119721, + "learning_rate": 1.2163702339950702e-05, + "loss": 0.7344, + "step": 2267 + }, + { + "epoch": 1.3432040272431152, + "grad_norm": 2.3481480865911912, + "learning_rate": 1.2157457653789337e-05, + "loss": 0.7358, + "step": 2268 + }, + { + "epoch": 1.343796268877702, + "grad_norm": 2.085597969173187, + "learning_rate": 1.2151212085106478e-05, + "loss": 0.7227, + "step": 2269 + }, + { + "epoch": 1.344388510512289, + "grad_norm": 1.6707958861915149, + "learning_rate": 1.2144965636456903e-05, + "loss": 0.7298, + "step": 2270 + }, + { + "epoch": 1.344980752146876, + "grad_norm": 2.7788857483382206, + "learning_rate": 1.213871831039577e-05, + "loss": 0.6869, + "step": 2271 + }, + { + "epoch": 1.345572993781463, + "grad_norm": 2.382424211273845, + "learning_rate": 1.2132470109478577e-05, + "loss": 0.6929, + "step": 2272 + }, + { + "epoch": 1.3461652354160498, + "grad_norm": 3.2307082039534047, + "learning_rate": 1.21262210362612e-05, + "loss": 0.716, + "step": 2273 + }, + { + "epoch": 1.3467574770506365, + "grad_norm": 2.304898917214821, + "learning_rate": 1.2119971093299852e-05, + "loss": 0.7246, + "step": 2274 + }, + { + "epoch": 1.3473497186852237, + "grad_norm": 4.297876675540321, + "learning_rate": 1.2113720283151115e-05, + "loss": 0.7579, + "step": 2275 + }, + { + "epoch": 1.3479419603198104, + "grad_norm": 2.19405196855775, + "learning_rate": 1.2107468608371924e-05, + "loss": 0.7604, + "step": 2276 + }, + { + "epoch": 1.3485342019543973, + "grad_norm": 2.236962795916548, + "learning_rate": 1.2101216071519561e-05, + "loss": 0.7458, + "step": 2277 + }, + { + "epoch": 1.3491264435889843, + "grad_norm": 3.9633695711684975, + "learning_rate": 1.209496267515167e-05, + "loss": 0.7194, + "step": 2278 + }, + { + "epoch": 1.3497186852235712, + "grad_norm": 2.4068817921340333, + "learning_rate": 1.2088708421826238e-05, + "loss": 0.7136, + "step": 2279 + }, + { + "epoch": 1.3503109268581581, + "grad_norm": 1.5247334800083856, + "learning_rate": 1.2082453314101607e-05, + "loss": 0.6947, + "step": 2280 + }, + { + "epoch": 1.350903168492745, + "grad_norm": 1.673058636797337, + "learning_rate": 1.2076197354536472e-05, + "loss": 0.7307, + "step": 2281 + }, + { + "epoch": 1.351495410127332, + "grad_norm": 2.7089039748271495, + "learning_rate": 1.2069940545689867e-05, + "loss": 0.7495, + "step": 2282 + }, + { + "epoch": 1.352087651761919, + "grad_norm": 1.4724209313434784, + "learning_rate": 1.2063682890121178e-05, + "loss": 0.7076, + "step": 2283 + }, + { + "epoch": 1.3526798933965059, + "grad_norm": 1.4347383550436956, + "learning_rate": 1.2057424390390141e-05, + "loss": 0.7122, + "step": 2284 + }, + { + "epoch": 1.3532721350310926, + "grad_norm": 1.6233887203509207, + "learning_rate": 1.2051165049056836e-05, + "loss": 0.7273, + "step": 2285 + }, + { + "epoch": 1.3538643766656797, + "grad_norm": 2.2293249045490504, + "learning_rate": 1.2044904868681684e-05, + "loss": 0.7501, + "step": 2286 + }, + { + "epoch": 1.3544566183002664, + "grad_norm": 2.3431634966622115, + "learning_rate": 1.2038643851825449e-05, + "loss": 0.7621, + "step": 2287 + }, + { + "epoch": 1.3550488599348534, + "grad_norm": 2.132037815874192, + "learning_rate": 1.203238200104924e-05, + "loss": 0.7268, + "step": 2288 + }, + { + "epoch": 1.3556411015694403, + "grad_norm": 2.230932140756579, + "learning_rate": 1.2026119318914507e-05, + "loss": 0.735, + "step": 2289 + }, + { + "epoch": 1.3562333432040272, + "grad_norm": 2.313542593223687, + "learning_rate": 1.2019855807983036e-05, + "loss": 0.7422, + "step": 2290 + }, + { + "epoch": 1.3568255848386142, + "grad_norm": 1.9178912189104353, + "learning_rate": 1.201359147081696e-05, + "loss": 0.73, + "step": 2291 + }, + { + "epoch": 1.357417826473201, + "grad_norm": 1.5513535945386705, + "learning_rate": 1.200732630997874e-05, + "loss": 0.7273, + "step": 2292 + }, + { + "epoch": 1.358010068107788, + "grad_norm": 2.3144421995406512, + "learning_rate": 1.2001060328031185e-05, + "loss": 0.7184, + "step": 2293 + }, + { + "epoch": 1.358602309742375, + "grad_norm": 1.8962622817640602, + "learning_rate": 1.1994793527537427e-05, + "loss": 0.7409, + "step": 2294 + }, + { + "epoch": 1.3591945513769619, + "grad_norm": 1.9723792893075207, + "learning_rate": 1.198852591106095e-05, + "loss": 0.7409, + "step": 2295 + }, + { + "epoch": 1.3597867930115486, + "grad_norm": 1.6685845910473676, + "learning_rate": 1.1982257481165547e-05, + "loss": 0.6975, + "step": 2296 + }, + { + "epoch": 1.3603790346461357, + "grad_norm": 1.8774532039410945, + "learning_rate": 1.1975988240415373e-05, + "loss": 0.7433, + "step": 2297 + }, + { + "epoch": 1.3609712762807225, + "grad_norm": 4.187478803679397, + "learning_rate": 1.1969718191374888e-05, + "loss": 0.7295, + "step": 2298 + }, + { + "epoch": 1.3615635179153094, + "grad_norm": 2.2037953249975315, + "learning_rate": 1.1963447336608906e-05, + "loss": 0.7087, + "step": 2299 + }, + { + "epoch": 1.3621557595498963, + "grad_norm": 3.372329419689456, + "learning_rate": 1.1957175678682548e-05, + "loss": 0.737, + "step": 2300 + }, + { + "epoch": 1.3627480011844832, + "grad_norm": 1.7609785926431862, + "learning_rate": 1.1950903220161286e-05, + "loss": 0.7049, + "step": 2301 + }, + { + "epoch": 1.3633402428190702, + "grad_norm": 10.575058480914846, + "learning_rate": 1.1944629963610897e-05, + "loss": 0.732, + "step": 2302 + }, + { + "epoch": 1.3639324844536571, + "grad_norm": 1.7154442069759137, + "learning_rate": 1.1938355911597503e-05, + "loss": 0.7155, + "step": 2303 + }, + { + "epoch": 1.364524726088244, + "grad_norm": 2.2620417290126946, + "learning_rate": 1.1932081066687544e-05, + "loss": 0.7689, + "step": 2304 + }, + { + "epoch": 1.365116967722831, + "grad_norm": 5.085468174057115, + "learning_rate": 1.1925805431447779e-05, + "loss": 0.7161, + "step": 2305 + }, + { + "epoch": 1.365709209357418, + "grad_norm": 2.4608886800316285, + "learning_rate": 1.1919529008445302e-05, + "loss": 0.7487, + "step": 2306 + }, + { + "epoch": 1.3663014509920046, + "grad_norm": 2.196016033444546, + "learning_rate": 1.1913251800247515e-05, + "loss": 0.7473, + "step": 2307 + }, + { + "epoch": 1.3668936926265918, + "grad_norm": 2.3351921735739136, + "learning_rate": 1.1906973809422163e-05, + "loss": 0.7114, + "step": 2308 + }, + { + "epoch": 1.3674859342611785, + "grad_norm": 3.712094735632108, + "learning_rate": 1.1900695038537283e-05, + "loss": 0.7126, + "step": 2309 + }, + { + "epoch": 1.3680781758957654, + "grad_norm": 3.0445067803167434, + "learning_rate": 1.1894415490161253e-05, + "loss": 0.7616, + "step": 2310 + }, + { + "epoch": 1.3686704175303523, + "grad_norm": 5.092443503760028, + "learning_rate": 1.1888135166862756e-05, + "loss": 0.7598, + "step": 2311 + }, + { + "epoch": 1.3692626591649393, + "grad_norm": 1.7515743364833825, + "learning_rate": 1.1881854071210805e-05, + "loss": 0.7519, + "step": 2312 + }, + { + "epoch": 1.3698549007995262, + "grad_norm": 1.4595659443345792, + "learning_rate": 1.1875572205774712e-05, + "loss": 0.6944, + "step": 2313 + }, + { + "epoch": 1.3704471424341131, + "grad_norm": 2.0276065761611566, + "learning_rate": 1.1869289573124125e-05, + "loss": 0.7117, + "step": 2314 + }, + { + "epoch": 1.3710393840687, + "grad_norm": 1.6385422571466461, + "learning_rate": 1.1863006175828984e-05, + "loss": 0.7348, + "step": 2315 + }, + { + "epoch": 1.371631625703287, + "grad_norm": 1.2986389722977396, + "learning_rate": 1.1856722016459554e-05, + "loss": 0.7225, + "step": 2316 + }, + { + "epoch": 1.372223867337874, + "grad_norm": 2.5907128138808284, + "learning_rate": 1.1850437097586412e-05, + "loss": 0.7053, + "step": 2317 + }, + { + "epoch": 1.3728161089724606, + "grad_norm": 1.72659341320089, + "learning_rate": 1.1844151421780442e-05, + "loss": 0.7747, + "step": 2318 + }, + { + "epoch": 1.3734083506070478, + "grad_norm": 2.3936636379326326, + "learning_rate": 1.1837864991612839e-05, + "loss": 0.746, + "step": 2319 + }, + { + "epoch": 1.3740005922416345, + "grad_norm": 3.268948302001786, + "learning_rate": 1.1831577809655105e-05, + "loss": 0.7287, + "step": 2320 + }, + { + "epoch": 1.3745928338762214, + "grad_norm": 1.8358669402385104, + "learning_rate": 1.1825289878479054e-05, + "loss": 0.7323, + "step": 2321 + }, + { + "epoch": 1.3751850755108084, + "grad_norm": 6.659136666043565, + "learning_rate": 1.18190012006568e-05, + "loss": 0.781, + "step": 2322 + }, + { + "epoch": 1.3757773171453953, + "grad_norm": 1.8103702241279684, + "learning_rate": 1.1812711778760768e-05, + "loss": 0.7562, + "step": 2323 + }, + { + "epoch": 1.3763695587799822, + "grad_norm": 3.4205148227314486, + "learning_rate": 1.1806421615363685e-05, + "loss": 0.7057, + "step": 2324 + }, + { + "epoch": 1.3769618004145692, + "grad_norm": 5.3299975994261874, + "learning_rate": 1.1800130713038582e-05, + "loss": 0.7554, + "step": 2325 + }, + { + "epoch": 1.377554042049156, + "grad_norm": 1.5067357582677707, + "learning_rate": 1.179383907435879e-05, + "loss": 0.728, + "step": 2326 + }, + { + "epoch": 1.378146283683743, + "grad_norm": 1.5937327086086364, + "learning_rate": 1.1787546701897947e-05, + "loss": 0.6923, + "step": 2327 + }, + { + "epoch": 1.37873852531833, + "grad_norm": 2.9084759352815137, + "learning_rate": 1.1781253598229982e-05, + "loss": 0.6867, + "step": 2328 + }, + { + "epoch": 1.3793307669529167, + "grad_norm": 1.8065003892691773, + "learning_rate": 1.177495976592913e-05, + "loss": 0.7264, + "step": 2329 + }, + { + "epoch": 1.3799230085875038, + "grad_norm": 1.374778675210592, + "learning_rate": 1.1768665207569922e-05, + "loss": 0.763, + "step": 2330 + }, + { + "epoch": 1.3805152502220905, + "grad_norm": 5.135815860167982, + "learning_rate": 1.176236992572719e-05, + "loss": 0.7287, + "step": 2331 + }, + { + "epoch": 1.3811074918566775, + "grad_norm": 1.9451779116322265, + "learning_rate": 1.1756073922976056e-05, + "loss": 0.7603, + "step": 2332 + }, + { + "epoch": 1.3816997334912644, + "grad_norm": 4.102609143725733, + "learning_rate": 1.1749777201891937e-05, + "loss": 0.6891, + "step": 2333 + }, + { + "epoch": 1.3822919751258513, + "grad_norm": 2.703163169605834, + "learning_rate": 1.1743479765050549e-05, + "loss": 0.7238, + "step": 2334 + }, + { + "epoch": 1.3828842167604383, + "grad_norm": 1.4161656206435274, + "learning_rate": 1.1737181615027894e-05, + "loss": 0.7072, + "step": 2335 + }, + { + "epoch": 1.3834764583950252, + "grad_norm": 2.111623863485621, + "learning_rate": 1.1730882754400274e-05, + "loss": 0.6903, + "step": 2336 + }, + { + "epoch": 1.3840687000296121, + "grad_norm": 1.9893098347865092, + "learning_rate": 1.172458318574427e-05, + "loss": 0.7353, + "step": 2337 + }, + { + "epoch": 1.384660941664199, + "grad_norm": 3.119953828491785, + "learning_rate": 1.1718282911636774e-05, + "loss": 0.7126, + "step": 2338 + }, + { + "epoch": 1.385253183298786, + "grad_norm": 2.0807212215419937, + "learning_rate": 1.1711981934654937e-05, + "loss": 0.7486, + "step": 2339 + }, + { + "epoch": 1.3858454249333727, + "grad_norm": 2.1409029627127025, + "learning_rate": 1.1705680257376224e-05, + "loss": 0.736, + "step": 2340 + }, + { + "epoch": 1.3864376665679599, + "grad_norm": 1.7678639437104227, + "learning_rate": 1.1699377882378367e-05, + "loss": 0.6811, + "step": 2341 + }, + { + "epoch": 1.3870299082025466, + "grad_norm": 1.1969173608840782, + "learning_rate": 1.1693074812239397e-05, + "loss": 0.7072, + "step": 2342 + }, + { + "epoch": 1.3876221498371335, + "grad_norm": 1.7602686350385015, + "learning_rate": 1.1686771049537621e-05, + "loss": 0.6984, + "step": 2343 + }, + { + "epoch": 1.3882143914717204, + "grad_norm": 1.4360466219925514, + "learning_rate": 1.1680466596851635e-05, + "loss": 0.7065, + "step": 2344 + }, + { + "epoch": 1.3888066331063074, + "grad_norm": 4.263733785806742, + "learning_rate": 1.1674161456760314e-05, + "loss": 0.7292, + "step": 2345 + }, + { + "epoch": 1.3893988747408943, + "grad_norm": 2.471335440136607, + "learning_rate": 1.1667855631842815e-05, + "loss": 0.7671, + "step": 2346 + }, + { + "epoch": 1.3899911163754812, + "grad_norm": 4.172593773758679, + "learning_rate": 1.1661549124678573e-05, + "loss": 0.751, + "step": 2347 + }, + { + "epoch": 1.3905833580100682, + "grad_norm": 2.5493693541099667, + "learning_rate": 1.1655241937847305e-05, + "loss": 0.7763, + "step": 2348 + }, + { + "epoch": 1.391175599644655, + "grad_norm": 12.734269954324368, + "learning_rate": 1.1648934073929008e-05, + "loss": 0.7433, + "step": 2349 + }, + { + "epoch": 1.391767841279242, + "grad_norm": 2.0172453728072015, + "learning_rate": 1.164262553550395e-05, + "loss": 0.7186, + "step": 2350 + }, + { + "epoch": 1.3923600829138287, + "grad_norm": 1.6347819754535515, + "learning_rate": 1.1636316325152678e-05, + "loss": 0.7313, + "step": 2351 + }, + { + "epoch": 1.3929523245484159, + "grad_norm": 1.6197215465398926, + "learning_rate": 1.1630006445456015e-05, + "loss": 0.7162, + "step": 2352 + }, + { + "epoch": 1.3935445661830026, + "grad_norm": 1.3419132909074647, + "learning_rate": 1.1623695898995057e-05, + "loss": 0.6812, + "step": 2353 + }, + { + "epoch": 1.3941368078175895, + "grad_norm": 1.999373727061114, + "learning_rate": 1.161738468835117e-05, + "loss": 0.6904, + "step": 2354 + }, + { + "epoch": 1.3947290494521765, + "grad_norm": 2.0145229640481106, + "learning_rate": 1.1611072816105995e-05, + "loss": 0.7258, + "step": 2355 + }, + { + "epoch": 1.3953212910867634, + "grad_norm": 1.6691310278707019, + "learning_rate": 1.1604760284841446e-05, + "loss": 0.7245, + "step": 2356 + }, + { + "epoch": 1.3959135327213503, + "grad_norm": 4.128799120066488, + "learning_rate": 1.1598447097139698e-05, + "loss": 0.7285, + "step": 2357 + }, + { + "epoch": 1.3965057743559373, + "grad_norm": 2.0773835779838663, + "learning_rate": 1.1592133255583204e-05, + "loss": 0.7516, + "step": 2358 + }, + { + "epoch": 1.3970980159905242, + "grad_norm": 1.3942800893003757, + "learning_rate": 1.1585818762754678e-05, + "loss": 0.731, + "step": 2359 + }, + { + "epoch": 1.3976902576251111, + "grad_norm": 1.8775327674483524, + "learning_rate": 1.1579503621237102e-05, + "loss": 0.7454, + "step": 2360 + }, + { + "epoch": 1.398282499259698, + "grad_norm": 3.027896060244714, + "learning_rate": 1.1573187833613723e-05, + "loss": 0.7275, + "step": 2361 + }, + { + "epoch": 1.3988747408942848, + "grad_norm": 1.4716030596428373, + "learning_rate": 1.156687140246806e-05, + "loss": 0.7367, + "step": 2362 + }, + { + "epoch": 1.399466982528872, + "grad_norm": 1.7820811493950572, + "learning_rate": 1.1560554330383881e-05, + "loss": 0.7146, + "step": 2363 + }, + { + "epoch": 1.4000592241634586, + "grad_norm": 2.7037191130979323, + "learning_rate": 1.1554236619945229e-05, + "loss": 0.7242, + "step": 2364 + }, + { + "epoch": 1.4006514657980456, + "grad_norm": 1.7688945832299963, + "learning_rate": 1.1547918273736397e-05, + "loss": 0.7575, + "step": 2365 + }, + { + "epoch": 1.4012437074326325, + "grad_norm": 1.7948443959599745, + "learning_rate": 1.1541599294341952e-05, + "loss": 0.7385, + "step": 2366 + }, + { + "epoch": 1.4018359490672194, + "grad_norm": 2.5494232660163414, + "learning_rate": 1.1535279684346702e-05, + "loss": 0.7321, + "step": 2367 + }, + { + "epoch": 1.4024281907018064, + "grad_norm": 1.131792892745796, + "learning_rate": 1.1528959446335735e-05, + "loss": 0.7251, + "step": 2368 + }, + { + "epoch": 1.4030204323363933, + "grad_norm": 1.7242196863607848, + "learning_rate": 1.1522638582894372e-05, + "loss": 0.7357, + "step": 2369 + }, + { + "epoch": 1.4036126739709802, + "grad_norm": 1.7494101461566682, + "learning_rate": 1.1516317096608207e-05, + "loss": 0.7277, + "step": 2370 + }, + { + "epoch": 1.4042049156055671, + "grad_norm": 2.45681555550249, + "learning_rate": 1.1509994990063085e-05, + "loss": 0.6967, + "step": 2371 + }, + { + "epoch": 1.404797157240154, + "grad_norm": 2.9094052043800223, + "learning_rate": 1.1503672265845098e-05, + "loss": 0.7258, + "step": 2372 + }, + { + "epoch": 1.4053893988747408, + "grad_norm": 2.4462949935085616, + "learning_rate": 1.1497348926540602e-05, + "loss": 0.7472, + "step": 2373 + }, + { + "epoch": 1.4059816405093277, + "grad_norm": 1.5009612756951904, + "learning_rate": 1.1491024974736191e-05, + "loss": 0.6824, + "step": 2374 + }, + { + "epoch": 1.4065738821439147, + "grad_norm": 3.095173697672372, + "learning_rate": 1.1484700413018724e-05, + "loss": 0.7173, + "step": 2375 + }, + { + "epoch": 1.4071661237785016, + "grad_norm": 2.4512057822789606, + "learning_rate": 1.1478375243975298e-05, + "loss": 0.7297, + "step": 2376 + }, + { + "epoch": 1.4077583654130885, + "grad_norm": 1.9421393264777882, + "learning_rate": 1.1472049470193263e-05, + "loss": 0.7504, + "step": 2377 + }, + { + "epoch": 1.4083506070476755, + "grad_norm": 2.4342391043313047, + "learning_rate": 1.1465723094260219e-05, + "loss": 0.7148, + "step": 2378 + }, + { + "epoch": 1.4089428486822624, + "grad_norm": 1.2678552133839875, + "learning_rate": 1.1459396118764007e-05, + "loss": 0.6937, + "step": 2379 + }, + { + "epoch": 1.4095350903168493, + "grad_norm": 1.6681069042211958, + "learning_rate": 1.1453068546292718e-05, + "loss": 0.7413, + "step": 2380 + }, + { + "epoch": 1.4101273319514362, + "grad_norm": 2.0274279135928706, + "learning_rate": 1.1446740379434681e-05, + "loss": 0.7516, + "step": 2381 + }, + { + "epoch": 1.4107195735860232, + "grad_norm": 1.4001394906264233, + "learning_rate": 1.1440411620778478e-05, + "loss": 0.7393, + "step": 2382 + }, + { + "epoch": 1.41131181522061, + "grad_norm": 3.0338701312366445, + "learning_rate": 1.1434082272912923e-05, + "loss": 0.7396, + "step": 2383 + }, + { + "epoch": 1.4119040568551968, + "grad_norm": 1.786481375473841, + "learning_rate": 1.1427752338427075e-05, + "loss": 0.7079, + "step": 2384 + }, + { + "epoch": 1.4124962984897838, + "grad_norm": 1.1712156976339714, + "learning_rate": 1.1421421819910235e-05, + "loss": 0.7258, + "step": 2385 + }, + { + "epoch": 1.4130885401243707, + "grad_norm": 2.996800157879306, + "learning_rate": 1.141509071995194e-05, + "loss": 0.7282, + "step": 2386 + }, + { + "epoch": 1.4136807817589576, + "grad_norm": 1.3892970824112951, + "learning_rate": 1.1408759041141963e-05, + "loss": 0.7463, + "step": 2387 + }, + { + "epoch": 1.4142730233935445, + "grad_norm": 1.6106966314025877, + "learning_rate": 1.1402426786070326e-05, + "loss": 0.7159, + "step": 2388 + }, + { + "epoch": 1.4148652650281315, + "grad_norm": 2.391973281082433, + "learning_rate": 1.1396093957327266e-05, + "loss": 0.6767, + "step": 2389 + }, + { + "epoch": 1.4154575066627184, + "grad_norm": 1.8064771701538858, + "learning_rate": 1.1389760557503275e-05, + "loss": 0.7618, + "step": 2390 + }, + { + "epoch": 1.4160497482973053, + "grad_norm": 1.9436742635325912, + "learning_rate": 1.1383426589189062e-05, + "loss": 0.7507, + "step": 2391 + }, + { + "epoch": 1.4166419899318923, + "grad_norm": 1.4722158504330858, + "learning_rate": 1.1377092054975586e-05, + "loss": 0.7029, + "step": 2392 + }, + { + "epoch": 1.417234231566479, + "grad_norm": 1.2331100621292894, + "learning_rate": 1.1370756957454015e-05, + "loss": 0.6877, + "step": 2393 + }, + { + "epoch": 1.4178264732010661, + "grad_norm": 1.8467647418967628, + "learning_rate": 1.1364421299215773e-05, + "loss": 0.734, + "step": 2394 + }, + { + "epoch": 1.4184187148356528, + "grad_norm": 2.0082819089866293, + "learning_rate": 1.135808508285249e-05, + "loss": 0.719, + "step": 2395 + }, + { + "epoch": 1.4190109564702398, + "grad_norm": 1.461710287319451, + "learning_rate": 1.135174831095604e-05, + "loss": 0.7138, + "step": 2396 + }, + { + "epoch": 1.4196031981048267, + "grad_norm": 1.519799568496123, + "learning_rate": 1.134541098611852e-05, + "loss": 0.7084, + "step": 2397 + }, + { + "epoch": 1.4201954397394136, + "grad_norm": 2.315400648418732, + "learning_rate": 1.133907311093225e-05, + "loss": 0.7295, + "step": 2398 + }, + { + "epoch": 1.4207876813740006, + "grad_norm": 1.8933519527297487, + "learning_rate": 1.133273468798978e-05, + "loss": 0.7078, + "step": 2399 + }, + { + "epoch": 1.4213799230085875, + "grad_norm": 1.91221104798189, + "learning_rate": 1.1326395719883876e-05, + "loss": 0.6819, + "step": 2400 + }, + { + "epoch": 1.4219721646431744, + "grad_norm": 1.979878107597815, + "learning_rate": 1.1320056209207538e-05, + "loss": 0.71, + "step": 2401 + }, + { + "epoch": 1.4225644062777614, + "grad_norm": 2.977882494720392, + "learning_rate": 1.1313716158553978e-05, + "loss": 0.7408, + "step": 2402 + }, + { + "epoch": 1.4231566479123483, + "grad_norm": 1.6024476014527083, + "learning_rate": 1.1307375570516637e-05, + "loss": 0.7167, + "step": 2403 + }, + { + "epoch": 1.423748889546935, + "grad_norm": 1.714605953833717, + "learning_rate": 1.130103444768917e-05, + "loss": 0.7197, + "step": 2404 + }, + { + "epoch": 1.4243411311815222, + "grad_norm": 2.5811940978711068, + "learning_rate": 1.1294692792665452e-05, + "loss": 0.6986, + "step": 2405 + }, + { + "epoch": 1.4249333728161089, + "grad_norm": 5.955582148467331, + "learning_rate": 1.1288350608039577e-05, + "loss": 0.7285, + "step": 2406 + }, + { + "epoch": 1.4255256144506958, + "grad_norm": 1.580417338453988, + "learning_rate": 1.1282007896405858e-05, + "loss": 0.7571, + "step": 2407 + }, + { + "epoch": 1.4261178560852827, + "grad_norm": 1.7271693274308317, + "learning_rate": 1.1275664660358818e-05, + "loss": 0.7756, + "step": 2408 + }, + { + "epoch": 1.4267100977198697, + "grad_norm": 1.5963788064488735, + "learning_rate": 1.1269320902493199e-05, + "loss": 0.7245, + "step": 2409 + }, + { + "epoch": 1.4273023393544566, + "grad_norm": 2.2232952704603828, + "learning_rate": 1.1262976625403954e-05, + "loss": 0.7156, + "step": 2410 + }, + { + "epoch": 1.4278945809890435, + "grad_norm": 4.047457885454327, + "learning_rate": 1.1256631831686245e-05, + "loss": 0.7397, + "step": 2411 + }, + { + "epoch": 1.4284868226236305, + "grad_norm": 2.079661525227277, + "learning_rate": 1.1250286523935456e-05, + "loss": 0.6906, + "step": 2412 + }, + { + "epoch": 1.4290790642582174, + "grad_norm": 2.3170134187542666, + "learning_rate": 1.1243940704747169e-05, + "loss": 0.7696, + "step": 2413 + }, + { + "epoch": 1.4296713058928043, + "grad_norm": 4.447126564702576, + "learning_rate": 1.1237594376717188e-05, + "loss": 0.7215, + "step": 2414 + }, + { + "epoch": 1.430263547527391, + "grad_norm": 1.487487996677641, + "learning_rate": 1.1231247542441507e-05, + "loss": 0.7387, + "step": 2415 + }, + { + "epoch": 1.4308557891619782, + "grad_norm": 2.7450360360522925, + "learning_rate": 1.122490020451635e-05, + "loss": 0.7184, + "step": 2416 + }, + { + "epoch": 1.431448030796565, + "grad_norm": 4.220383960594748, + "learning_rate": 1.1218552365538129e-05, + "loss": 0.7528, + "step": 2417 + }, + { + "epoch": 1.4320402724311518, + "grad_norm": 8.716959325200545, + "learning_rate": 1.1212204028103469e-05, + "loss": 0.7771, + "step": 2418 + }, + { + "epoch": 1.4326325140657388, + "grad_norm": 1.6083934310732948, + "learning_rate": 1.1205855194809191e-05, + "loss": 0.7349, + "step": 2419 + }, + { + "epoch": 1.4332247557003257, + "grad_norm": 2.098687005967919, + "learning_rate": 1.1199505868252336e-05, + "loss": 0.7355, + "step": 2420 + }, + { + "epoch": 1.4338169973349126, + "grad_norm": 1.6917431368313898, + "learning_rate": 1.1193156051030128e-05, + "loss": 0.6713, + "step": 2421 + }, + { + "epoch": 1.4344092389694996, + "grad_norm": 1.9121827759052907, + "learning_rate": 1.1186805745739999e-05, + "loss": 0.7065, + "step": 2422 + }, + { + "epoch": 1.4350014806040865, + "grad_norm": 1.7303129056373097, + "learning_rate": 1.1180454954979583e-05, + "loss": 0.6621, + "step": 2423 + }, + { + "epoch": 1.4355937222386734, + "grad_norm": 1.4653359584564212, + "learning_rate": 1.1174103681346711e-05, + "loss": 0.7288, + "step": 2424 + }, + { + "epoch": 1.4361859638732604, + "grad_norm": 1.6242879043550942, + "learning_rate": 1.1167751927439407e-05, + "loss": 0.7364, + "step": 2425 + }, + { + "epoch": 1.436778205507847, + "grad_norm": 1.879062950337473, + "learning_rate": 1.1161399695855903e-05, + "loss": 0.7296, + "step": 2426 + }, + { + "epoch": 1.4373704471424342, + "grad_norm": 2.2495686189430617, + "learning_rate": 1.1155046989194613e-05, + "loss": 0.775, + "step": 2427 + }, + { + "epoch": 1.437962688777021, + "grad_norm": 1.667428421117216, + "learning_rate": 1.1148693810054152e-05, + "loss": 0.7023, + "step": 2428 + }, + { + "epoch": 1.4385549304116079, + "grad_norm": 1.3747252199835087, + "learning_rate": 1.1142340161033331e-05, + "loss": 0.7057, + "step": 2429 + }, + { + "epoch": 1.4391471720461948, + "grad_norm": 2.463314444705448, + "learning_rate": 1.1135986044731144e-05, + "loss": 0.6567, + "step": 2430 + }, + { + "epoch": 1.4397394136807817, + "grad_norm": 1.5810081020557705, + "learning_rate": 1.1129631463746789e-05, + "loss": 0.6708, + "step": 2431 + }, + { + "epoch": 1.4403316553153687, + "grad_norm": 1.8170531880544196, + "learning_rate": 1.112327642067964e-05, + "loss": 0.7159, + "step": 2432 + }, + { + "epoch": 1.4409238969499556, + "grad_norm": 5.805209899239579, + "learning_rate": 1.1116920918129271e-05, + "loss": 0.7091, + "step": 2433 + }, + { + "epoch": 1.4415161385845425, + "grad_norm": 1.9375366699904837, + "learning_rate": 1.111056495869544e-05, + "loss": 0.7578, + "step": 2434 + }, + { + "epoch": 1.4421083802191295, + "grad_norm": 2.8102088648986525, + "learning_rate": 1.110420854497809e-05, + "loss": 0.7223, + "step": 2435 + }, + { + "epoch": 1.4427006218537164, + "grad_norm": 2.1131020428074456, + "learning_rate": 1.1097851679577351e-05, + "loss": 0.7278, + "step": 2436 + }, + { + "epoch": 1.443292863488303, + "grad_norm": 2.3630646661630004, + "learning_rate": 1.1091494365093542e-05, + "loss": 0.751, + "step": 2437 + }, + { + "epoch": 1.4438851051228903, + "grad_norm": 3.8773111167009624, + "learning_rate": 1.1085136604127161e-05, + "loss": 0.7495, + "step": 2438 + }, + { + "epoch": 1.444477346757477, + "grad_norm": 2.7572277967036456, + "learning_rate": 1.1078778399278885e-05, + "loss": 0.749, + "step": 2439 + }, + { + "epoch": 1.445069588392064, + "grad_norm": 1.469912383127116, + "learning_rate": 1.1072419753149585e-05, + "loss": 0.7198, + "step": 2440 + }, + { + "epoch": 1.4456618300266508, + "grad_norm": 2.777349956968284, + "learning_rate": 1.1066060668340298e-05, + "loss": 0.6924, + "step": 2441 + }, + { + "epoch": 1.4462540716612378, + "grad_norm": 1.4767013976366694, + "learning_rate": 1.105970114745225e-05, + "loss": 0.7943, + "step": 2442 + }, + { + "epoch": 1.4468463132958247, + "grad_norm": 3.0907295250096785, + "learning_rate": 1.1053341193086844e-05, + "loss": 0.6945, + "step": 2443 + }, + { + "epoch": 1.4474385549304116, + "grad_norm": 1.473657931484691, + "learning_rate": 1.104698080784566e-05, + "loss": 0.7246, + "step": 2444 + }, + { + "epoch": 1.4480307965649986, + "grad_norm": 1.5243657541187539, + "learning_rate": 1.1040619994330446e-05, + "loss": 0.7208, + "step": 2445 + }, + { + "epoch": 1.4486230381995855, + "grad_norm": 1.4396515407272992, + "learning_rate": 1.1034258755143141e-05, + "loss": 0.7082, + "step": 2446 + }, + { + "epoch": 1.4492152798341724, + "grad_norm": 4.67485917339473, + "learning_rate": 1.1027897092885846e-05, + "loss": 0.7418, + "step": 2447 + }, + { + "epoch": 1.4498075214687591, + "grad_norm": 1.8934539767167, + "learning_rate": 1.1021535010160838e-05, + "loss": 0.7048, + "step": 2448 + }, + { + "epoch": 1.4503997631033463, + "grad_norm": 1.7934976398624312, + "learning_rate": 1.1015172509570567e-05, + "loss": 0.7438, + "step": 2449 + }, + { + "epoch": 1.450992004737933, + "grad_norm": 3.1616972894112574, + "learning_rate": 1.1008809593717653e-05, + "loss": 0.7354, + "step": 2450 + }, + { + "epoch": 1.45158424637252, + "grad_norm": 2.5005464211548216, + "learning_rate": 1.1002446265204887e-05, + "loss": 0.7263, + "step": 2451 + }, + { + "epoch": 1.4521764880071069, + "grad_norm": 3.4868223181586875, + "learning_rate": 1.0996082526635227e-05, + "loss": 0.7188, + "step": 2452 + }, + { + "epoch": 1.4527687296416938, + "grad_norm": 1.6735246025817165, + "learning_rate": 1.0989718380611805e-05, + "loss": 0.6973, + "step": 2453 + }, + { + "epoch": 1.4533609712762807, + "grad_norm": 2.147911052347242, + "learning_rate": 1.0983353829737909e-05, + "loss": 0.6972, + "step": 2454 + }, + { + "epoch": 1.4539532129108677, + "grad_norm": 1.1261985891425348, + "learning_rate": 1.0976988876616998e-05, + "loss": 0.7311, + "step": 2455 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 1.3716117307279665, + "learning_rate": 1.0970623523852699e-05, + "loss": 0.7525, + "step": 2456 + }, + { + "epoch": 1.4551376961800415, + "grad_norm": 2.1195272638552165, + "learning_rate": 1.0964257774048799e-05, + "loss": 0.7471, + "step": 2457 + }, + { + "epoch": 1.4557299378146284, + "grad_norm": 1.9959343877258, + "learning_rate": 1.0957891629809248e-05, + "loss": 0.761, + "step": 2458 + }, + { + "epoch": 1.4563221794492152, + "grad_norm": 2.2291293933510095, + "learning_rate": 1.0951525093738151e-05, + "loss": 0.7153, + "step": 2459 + }, + { + "epoch": 1.4569144210838023, + "grad_norm": 1.5622339941082886, + "learning_rate": 1.0945158168439786e-05, + "loss": 0.7644, + "step": 2460 + }, + { + "epoch": 1.457506662718389, + "grad_norm": 2.3239762154569554, + "learning_rate": 1.0938790856518582e-05, + "loss": 0.7644, + "step": 2461 + }, + { + "epoch": 1.458098904352976, + "grad_norm": 2.650172654861937, + "learning_rate": 1.0932423160579126e-05, + "loss": 0.7249, + "step": 2462 + }, + { + "epoch": 1.4586911459875629, + "grad_norm": 1.717625601814134, + "learning_rate": 1.0926055083226166e-05, + "loss": 0.7325, + "step": 2463 + }, + { + "epoch": 1.4592833876221498, + "grad_norm": 3.9491611186011384, + "learning_rate": 1.0919686627064603e-05, + "loss": 0.7214, + "step": 2464 + }, + { + "epoch": 1.4598756292567368, + "grad_norm": 1.2001329473694484, + "learning_rate": 1.091331779469949e-05, + "loss": 0.7511, + "step": 2465 + }, + { + "epoch": 1.4604678708913237, + "grad_norm": 1.742997197381938, + "learning_rate": 1.0906948588736044e-05, + "loss": 0.7016, + "step": 2466 + }, + { + "epoch": 1.4610601125259106, + "grad_norm": 1.2448970057704796, + "learning_rate": 1.0900579011779622e-05, + "loss": 0.7431, + "step": 2467 + }, + { + "epoch": 1.4616523541604975, + "grad_norm": 1.3860151218112158, + "learning_rate": 1.0894209066435746e-05, + "loss": 0.715, + "step": 2468 + }, + { + "epoch": 1.4622445957950845, + "grad_norm": 16.17995815293019, + "learning_rate": 1.0887838755310072e-05, + "loss": 0.7018, + "step": 2469 + }, + { + "epoch": 1.4628368374296712, + "grad_norm": 1.7009020003527942, + "learning_rate": 1.0881468081008428e-05, + "loss": 0.7178, + "step": 2470 + }, + { + "epoch": 1.4634290790642583, + "grad_norm": 3.5437807058153696, + "learning_rate": 1.0875097046136764e-05, + "loss": 0.7219, + "step": 2471 + }, + { + "epoch": 1.464021320698845, + "grad_norm": 1.3944824507804543, + "learning_rate": 1.0868725653301206e-05, + "loss": 0.7535, + "step": 2472 + }, + { + "epoch": 1.464613562333432, + "grad_norm": 1.3126484653094563, + "learning_rate": 1.0862353905108002e-05, + "loss": 0.7229, + "step": 2473 + }, + { + "epoch": 1.465205803968019, + "grad_norm": 2.458706343995586, + "learning_rate": 1.085598180416356e-05, + "loss": 0.7186, + "step": 2474 + }, + { + "epoch": 1.4657980456026058, + "grad_norm": 4.852881691527256, + "learning_rate": 1.0849609353074423e-05, + "loss": 0.7252, + "step": 2475 + }, + { + "epoch": 1.4663902872371928, + "grad_norm": 2.036540439548759, + "learning_rate": 1.0843236554447288e-05, + "loss": 0.7141, + "step": 2476 + }, + { + "epoch": 1.4669825288717797, + "grad_norm": 2.066706696048189, + "learning_rate": 1.0836863410888983e-05, + "loss": 0.7087, + "step": 2477 + }, + { + "epoch": 1.4675747705063666, + "grad_norm": 1.7357600707145933, + "learning_rate": 1.0830489925006485e-05, + "loss": 0.7375, + "step": 2478 + }, + { + "epoch": 1.4681670121409536, + "grad_norm": 2.55498272013775, + "learning_rate": 1.0824116099406905e-05, + "loss": 0.7577, + "step": 2479 + }, + { + "epoch": 1.4687592537755405, + "grad_norm": 1.3261417160385482, + "learning_rate": 1.0817741936697499e-05, + "loss": 0.7181, + "step": 2480 + }, + { + "epoch": 1.4693514954101272, + "grad_norm": 2.4364379575015587, + "learning_rate": 1.0811367439485658e-05, + "loss": 0.7276, + "step": 2481 + }, + { + "epoch": 1.4699437370447144, + "grad_norm": 4.800149895264068, + "learning_rate": 1.0804992610378907e-05, + "loss": 0.7436, + "step": 2482 + }, + { + "epoch": 1.470535978679301, + "grad_norm": 5.046483999796505, + "learning_rate": 1.0798617451984912e-05, + "loss": 0.7433, + "step": 2483 + }, + { + "epoch": 1.471128220313888, + "grad_norm": 1.9692087823077262, + "learning_rate": 1.0792241966911472e-05, + "loss": 0.6953, + "step": 2484 + }, + { + "epoch": 1.471720461948475, + "grad_norm": 2.128256332160169, + "learning_rate": 1.0785866157766515e-05, + "loss": 0.7532, + "step": 2485 + }, + { + "epoch": 1.4723127035830619, + "grad_norm": 1.6170247391820483, + "learning_rate": 1.077949002715811e-05, + "loss": 0.747, + "step": 2486 + }, + { + "epoch": 1.4729049452176488, + "grad_norm": 2.938634854754934, + "learning_rate": 1.0773113577694452e-05, + "loss": 0.7153, + "step": 2487 + }, + { + "epoch": 1.4734971868522357, + "grad_norm": 4.000268288506294, + "learning_rate": 1.0766736811983864e-05, + "loss": 0.7287, + "step": 2488 + }, + { + "epoch": 1.4740894284868227, + "grad_norm": 3.613730312963113, + "learning_rate": 1.0760359732634806e-05, + "loss": 0.7625, + "step": 2489 + }, + { + "epoch": 1.4746816701214096, + "grad_norm": 5.433532028459535, + "learning_rate": 1.0753982342255863e-05, + "loss": 0.7634, + "step": 2490 + }, + { + "epoch": 1.4752739117559965, + "grad_norm": 3.32345033735752, + "learning_rate": 1.0747604643455735e-05, + "loss": 0.7312, + "step": 2491 + }, + { + "epoch": 1.4758661533905832, + "grad_norm": 1.5311040334516028, + "learning_rate": 1.0741226638843276e-05, + "loss": 0.7367, + "step": 2492 + }, + { + "epoch": 1.4764583950251704, + "grad_norm": 2.7319357660771355, + "learning_rate": 1.0734848331027437e-05, + "loss": 0.7359, + "step": 2493 + }, + { + "epoch": 1.477050636659757, + "grad_norm": 18.11777663466048, + "learning_rate": 1.072846972261731e-05, + "loss": 0.7704, + "step": 2494 + }, + { + "epoch": 1.477642878294344, + "grad_norm": 2.4993251519063895, + "learning_rate": 1.07220908162221e-05, + "loss": 0.7497, + "step": 2495 + }, + { + "epoch": 1.478235119928931, + "grad_norm": 1.8417261729994114, + "learning_rate": 1.0715711614451146e-05, + "loss": 0.7485, + "step": 2496 + }, + { + "epoch": 1.478827361563518, + "grad_norm": 2.7762420879853558, + "learning_rate": 1.0709332119913889e-05, + "loss": 0.7371, + "step": 2497 + }, + { + "epoch": 1.4794196031981048, + "grad_norm": 2.8156204757494394, + "learning_rate": 1.0702952335219912e-05, + "loss": 0.7295, + "step": 2498 + }, + { + "epoch": 1.4800118448326918, + "grad_norm": 3.4764320543388645, + "learning_rate": 1.0696572262978897e-05, + "loss": 0.7117, + "step": 2499 + }, + { + "epoch": 1.4806040864672787, + "grad_norm": 2.613928689321406, + "learning_rate": 1.0690191905800659e-05, + "loss": 0.7364, + "step": 2500 + }, + { + "epoch": 1.4811963281018656, + "grad_norm": 3.310002223294027, + "learning_rate": 1.0683811266295122e-05, + "loss": 0.7071, + "step": 2501 + }, + { + "epoch": 1.4817885697364526, + "grad_norm": 1.8031901176722984, + "learning_rate": 1.067743034707232e-05, + "loss": 0.7363, + "step": 2502 + }, + { + "epoch": 1.4823808113710393, + "grad_norm": 2.736654320785856, + "learning_rate": 1.0671049150742414e-05, + "loss": 0.7292, + "step": 2503 + }, + { + "epoch": 1.4829730530056264, + "grad_norm": 6.165916524526913, + "learning_rate": 1.066466767991567e-05, + "loss": 0.7259, + "step": 2504 + }, + { + "epoch": 1.4835652946402131, + "grad_norm": 3.2740591856838286, + "learning_rate": 1.065828593720247e-05, + "loss": 0.7333, + "step": 2505 + }, + { + "epoch": 1.4841575362748, + "grad_norm": 8.686085817094266, + "learning_rate": 1.0651903925213304e-05, + "loss": 0.694, + "step": 2506 + }, + { + "epoch": 1.484749777909387, + "grad_norm": 3.2639051263293744, + "learning_rate": 1.0645521646558774e-05, + "loss": 0.7347, + "step": 2507 + }, + { + "epoch": 1.485342019543974, + "grad_norm": 2.5554270332110334, + "learning_rate": 1.0639139103849591e-05, + "loss": 0.7331, + "step": 2508 + }, + { + "epoch": 1.4859342611785609, + "grad_norm": 2.926347424292201, + "learning_rate": 1.0632756299696576e-05, + "loss": 0.7181, + "step": 2509 + }, + { + "epoch": 1.4865265028131478, + "grad_norm": 2.3947137814149047, + "learning_rate": 1.062637323671065e-05, + "loss": 0.7079, + "step": 2510 + }, + { + "epoch": 1.4871187444477347, + "grad_norm": 2.6108737168898264, + "learning_rate": 1.061998991750285e-05, + "loss": 0.7863, + "step": 2511 + }, + { + "epoch": 1.4877109860823217, + "grad_norm": 2.7866821674655755, + "learning_rate": 1.0613606344684309e-05, + "loss": 0.7435, + "step": 2512 + }, + { + "epoch": 1.4883032277169086, + "grad_norm": 2.1621024720841935, + "learning_rate": 1.0607222520866268e-05, + "loss": 0.7383, + "step": 2513 + }, + { + "epoch": 1.4888954693514953, + "grad_norm": 1.4595498878836146, + "learning_rate": 1.060083844866007e-05, + "loss": 0.7215, + "step": 2514 + }, + { + "epoch": 1.4894877109860825, + "grad_norm": 2.4323235608077267, + "learning_rate": 1.0594454130677159e-05, + "loss": 0.7737, + "step": 2515 + }, + { + "epoch": 1.4900799526206692, + "grad_norm": 2.0951574345823167, + "learning_rate": 1.0588069569529085e-05, + "loss": 0.751, + "step": 2516 + }, + { + "epoch": 1.490672194255256, + "grad_norm": 2.7057235512306264, + "learning_rate": 1.0581684767827483e-05, + "loss": 0.7164, + "step": 2517 + }, + { + "epoch": 1.491264435889843, + "grad_norm": 2.3050182970407462, + "learning_rate": 1.0575299728184105e-05, + "loss": 0.7266, + "step": 2518 + }, + { + "epoch": 1.49185667752443, + "grad_norm": 2.658571871840395, + "learning_rate": 1.0568914453210784e-05, + "loss": 0.729, + "step": 2519 + }, + { + "epoch": 1.492448919159017, + "grad_norm": 2.6694118505347046, + "learning_rate": 1.0562528945519463e-05, + "loss": 0.7099, + "step": 2520 + }, + { + "epoch": 1.4930411607936038, + "grad_norm": 2.7419506378224257, + "learning_rate": 1.0556143207722167e-05, + "loss": 0.7001, + "step": 2521 + }, + { + "epoch": 1.4936334024281908, + "grad_norm": 1.5146904472068337, + "learning_rate": 1.0549757242431032e-05, + "loss": 0.7012, + "step": 2522 + }, + { + "epoch": 1.4942256440627777, + "grad_norm": 2.187917848241182, + "learning_rate": 1.0543371052258262e-05, + "loss": 0.7006, + "step": 2523 + }, + { + "epoch": 1.4948178856973646, + "grad_norm": 3.0173641599728307, + "learning_rate": 1.0536984639816183e-05, + "loss": 0.7009, + "step": 2524 + }, + { + "epoch": 1.4954101273319513, + "grad_norm": 4.342321069678089, + "learning_rate": 1.0530598007717188e-05, + "loss": 0.7325, + "step": 2525 + }, + { + "epoch": 1.4960023689665383, + "grad_norm": 2.168797900177454, + "learning_rate": 1.0524211158573772e-05, + "loss": 0.7077, + "step": 2526 + }, + { + "epoch": 1.4965946106011252, + "grad_norm": 2.459366128626486, + "learning_rate": 1.0517824094998514e-05, + "loss": 0.7149, + "step": 2527 + }, + { + "epoch": 1.4971868522357121, + "grad_norm": 3.5713298768798794, + "learning_rate": 1.0511436819604082e-05, + "loss": 0.734, + "step": 2528 + }, + { + "epoch": 1.497779093870299, + "grad_norm": 1.9883923998857207, + "learning_rate": 1.050504933500323e-05, + "loss": 0.6835, + "step": 2529 + }, + { + "epoch": 1.498371335504886, + "grad_norm": 2.939625720286527, + "learning_rate": 1.0498661643808801e-05, + "loss": 0.6833, + "step": 2530 + }, + { + "epoch": 1.498963577139473, + "grad_norm": 3.2278416983726586, + "learning_rate": 1.0492273748633718e-05, + "loss": 0.7615, + "step": 2531 + }, + { + "epoch": 1.4995558187740599, + "grad_norm": 2.134003837207207, + "learning_rate": 1.0485885652090992e-05, + "loss": 0.7285, + "step": 2532 + }, + { + "epoch": 1.5001480604086468, + "grad_norm": 1.4289709457674888, + "learning_rate": 1.0479497356793708e-05, + "loss": 0.7301, + "step": 2533 + }, + { + "epoch": 1.5007403020432335, + "grad_norm": 2.2718039122799936, + "learning_rate": 1.0473108865355046e-05, + "loss": 0.6829, + "step": 2534 + }, + { + "epoch": 1.5013325436778207, + "grad_norm": 2.1362613649289837, + "learning_rate": 1.046672018038825e-05, + "loss": 0.7585, + "step": 2535 + }, + { + "epoch": 1.5019247853124074, + "grad_norm": 4.9287065917580035, + "learning_rate": 1.0460331304506658e-05, + "loss": 0.6854, + "step": 2536 + }, + { + "epoch": 1.5025170269469945, + "grad_norm": 2.71738703423648, + "learning_rate": 1.0453942240323676e-05, + "loss": 0.6972, + "step": 2537 + }, + { + "epoch": 1.5031092685815812, + "grad_norm": 3.3571433608705967, + "learning_rate": 1.044755299045279e-05, + "loss": 0.7208, + "step": 2538 + }, + { + "epoch": 1.5037015102161682, + "grad_norm": 2.2306864648628104, + "learning_rate": 1.0441163557507565e-05, + "loss": 0.7, + "step": 2539 + }, + { + "epoch": 1.504293751850755, + "grad_norm": 2.127715558269432, + "learning_rate": 1.0434773944101637e-05, + "loss": 0.7028, + "step": 2540 + }, + { + "epoch": 1.504885993485342, + "grad_norm": 3.283305061966627, + "learning_rate": 1.0428384152848716e-05, + "loss": 0.7194, + "step": 2541 + }, + { + "epoch": 1.505478235119929, + "grad_norm": 1.7421560789370591, + "learning_rate": 1.0421994186362591e-05, + "loss": 0.7146, + "step": 2542 + }, + { + "epoch": 1.5060704767545159, + "grad_norm": 2.1998312820966035, + "learning_rate": 1.0415604047257108e-05, + "loss": 0.7055, + "step": 2543 + }, + { + "epoch": 1.5066627183891028, + "grad_norm": 2.8243735302280797, + "learning_rate": 1.0409213738146207e-05, + "loss": 0.7101, + "step": 2544 + }, + { + "epoch": 1.5072549600236895, + "grad_norm": 2.064730143131007, + "learning_rate": 1.0402823261643869e-05, + "loss": 0.7449, + "step": 2545 + }, + { + "epoch": 1.5078472016582767, + "grad_norm": 2.1534779272441016, + "learning_rate": 1.039643262036417e-05, + "loss": 0.7249, + "step": 2546 + }, + { + "epoch": 1.5084394432928634, + "grad_norm": 1.9313484117848871, + "learning_rate": 1.039004181692123e-05, + "loss": 0.7402, + "step": 2547 + }, + { + "epoch": 1.5090316849274505, + "grad_norm": 2.075887908145325, + "learning_rate": 1.0383650853929261e-05, + "loss": 0.7791, + "step": 2548 + }, + { + "epoch": 1.5096239265620373, + "grad_norm": 3.2492748129540985, + "learning_rate": 1.0377259734002514e-05, + "loss": 0.7529, + "step": 2549 + }, + { + "epoch": 1.5102161681966242, + "grad_norm": 1.900962739440968, + "learning_rate": 1.0370868459755325e-05, + "loss": 0.7034, + "step": 2550 + }, + { + "epoch": 1.5108084098312111, + "grad_norm": 1.477785833608308, + "learning_rate": 1.0364477033802079e-05, + "loss": 0.6937, + "step": 2551 + }, + { + "epoch": 1.511400651465798, + "grad_norm": 3.631747743457973, + "learning_rate": 1.0358085458757233e-05, + "loss": 0.7212, + "step": 2552 + }, + { + "epoch": 1.511992893100385, + "grad_norm": 3.5138447414359173, + "learning_rate": 1.0351693737235296e-05, + "loss": 0.7277, + "step": 2553 + }, + { + "epoch": 1.512585134734972, + "grad_norm": 1.3857492580523203, + "learning_rate": 1.0345301871850843e-05, + "loss": 0.7219, + "step": 2554 + }, + { + "epoch": 1.5131773763695588, + "grad_norm": 2.6281307473566673, + "learning_rate": 1.0338909865218509e-05, + "loss": 0.7223, + "step": 2555 + }, + { + "epoch": 1.5137696180041456, + "grad_norm": 11.099970381589284, + "learning_rate": 1.0332517719952982e-05, + "loss": 0.7396, + "step": 2556 + }, + { + "epoch": 1.5143618596387327, + "grad_norm": 5.440070964115479, + "learning_rate": 1.0326125438669008e-05, + "loss": 0.7263, + "step": 2557 + }, + { + "epoch": 1.5149541012733194, + "grad_norm": 3.966364535381863, + "learning_rate": 1.0319733023981392e-05, + "loss": 0.7212, + "step": 2558 + }, + { + "epoch": 1.5155463429079066, + "grad_norm": 9.420534733675245, + "learning_rate": 1.031334047850499e-05, + "loss": 0.7458, + "step": 2559 + }, + { + "epoch": 1.5161385845424933, + "grad_norm": 2.6027648620090997, + "learning_rate": 1.030694780485471e-05, + "loss": 0.7141, + "step": 2560 + }, + { + "epoch": 1.5167308261770802, + "grad_norm": 1.4387286301673126, + "learning_rate": 1.030055500564552e-05, + "loss": 0.7644, + "step": 2561 + }, + { + "epoch": 1.5173230678116671, + "grad_norm": 1.956401956469561, + "learning_rate": 1.0294162083492429e-05, + "loss": 0.7265, + "step": 2562 + }, + { + "epoch": 1.517915309446254, + "grad_norm": 1.9974664355897158, + "learning_rate": 1.0287769041010506e-05, + "loss": 0.752, + "step": 2563 + }, + { + "epoch": 1.518507551080841, + "grad_norm": 2.317095214652806, + "learning_rate": 1.0281375880814864e-05, + "loss": 0.7235, + "step": 2564 + }, + { + "epoch": 1.519099792715428, + "grad_norm": 2.301255206281844, + "learning_rate": 1.0274982605520662e-05, + "loss": 0.7452, + "step": 2565 + }, + { + "epoch": 1.5196920343500149, + "grad_norm": 3.1153435591558387, + "learning_rate": 1.0268589217743114e-05, + "loss": 0.7009, + "step": 2566 + }, + { + "epoch": 1.5202842759846016, + "grad_norm": 8.104588610609271, + "learning_rate": 1.0262195720097472e-05, + "loss": 0.7581, + "step": 2567 + }, + { + "epoch": 1.5208765176191887, + "grad_norm": 1.629901500067734, + "learning_rate": 1.0255802115199034e-05, + "loss": 0.746, + "step": 2568 + }, + { + "epoch": 1.5214687592537754, + "grad_norm": 2.5849788803020193, + "learning_rate": 1.0249408405663148e-05, + "loss": 0.7137, + "step": 2569 + }, + { + "epoch": 1.5220610008883626, + "grad_norm": 1.7706404764967858, + "learning_rate": 1.0243014594105201e-05, + "loss": 0.6867, + "step": 2570 + }, + { + "epoch": 1.5226532425229493, + "grad_norm": 3.0042524859021564, + "learning_rate": 1.0236620683140616e-05, + "loss": 0.6803, + "step": 2571 + }, + { + "epoch": 1.5232454841575362, + "grad_norm": 1.7908570112056044, + "learning_rate": 1.0230226675384868e-05, + "loss": 0.7065, + "step": 2572 + }, + { + "epoch": 1.5238377257921232, + "grad_norm": 2.8915396186908393, + "learning_rate": 1.0223832573453463e-05, + "loss": 0.7225, + "step": 2573 + }, + { + "epoch": 1.52442996742671, + "grad_norm": 10.14931172158288, + "learning_rate": 1.021743837996195e-05, + "loss": 0.7015, + "step": 2574 + }, + { + "epoch": 1.525022209061297, + "grad_norm": 2.3225645983111307, + "learning_rate": 1.0211044097525908e-05, + "loss": 0.7365, + "step": 2575 + }, + { + "epoch": 1.525614450695884, + "grad_norm": 1.8117439188164521, + "learning_rate": 1.0204649728760969e-05, + "loss": 0.7822, + "step": 2576 + }, + { + "epoch": 1.526206692330471, + "grad_norm": 2.4639010150094647, + "learning_rate": 1.0198255276282778e-05, + "loss": 0.7265, + "step": 2577 + }, + { + "epoch": 1.5267989339650576, + "grad_norm": 1.866652089335267, + "learning_rate": 1.0191860742707034e-05, + "loss": 0.7577, + "step": 2578 + }, + { + "epoch": 1.5273911755996448, + "grad_norm": 2.434779909956098, + "learning_rate": 1.0185466130649455e-05, + "loss": 0.7516, + "step": 2579 + }, + { + "epoch": 1.5279834172342315, + "grad_norm": 2.0991586945316416, + "learning_rate": 1.0179071442725801e-05, + "loss": 0.7407, + "step": 2580 + }, + { + "epoch": 1.5285756588688186, + "grad_norm": 2.5982167009442847, + "learning_rate": 1.0172676681551857e-05, + "loss": 0.7136, + "step": 2581 + }, + { + "epoch": 1.5291679005034053, + "grad_norm": 4.3978929309621835, + "learning_rate": 1.0166281849743438e-05, + "loss": 0.7124, + "step": 2582 + }, + { + "epoch": 1.5297601421379923, + "grad_norm": 1.4422002498656223, + "learning_rate": 1.0159886949916394e-05, + "loss": 0.7052, + "step": 2583 + }, + { + "epoch": 1.5303523837725792, + "grad_norm": 2.917865956798247, + "learning_rate": 1.0153491984686595e-05, + "loss": 0.7503, + "step": 2584 + }, + { + "epoch": 1.5309446254071661, + "grad_norm": 1.5687207674041863, + "learning_rate": 1.0147096956669945e-05, + "loss": 0.7344, + "step": 2585 + }, + { + "epoch": 1.531536867041753, + "grad_norm": 2.2546664576414828, + "learning_rate": 1.0140701868482365e-05, + "loss": 0.7531, + "step": 2586 + }, + { + "epoch": 1.53212910867634, + "grad_norm": 1.4308336411744214, + "learning_rate": 1.013430672273981e-05, + "loss": 0.7343, + "step": 2587 + }, + { + "epoch": 1.532721350310927, + "grad_norm": 2.396679670787782, + "learning_rate": 1.0127911522058256e-05, + "loss": 0.7603, + "step": 2588 + }, + { + "epoch": 1.5333135919455136, + "grad_norm": 2.149803137842192, + "learning_rate": 1.0121516269053693e-05, + "loss": 0.715, + "step": 2589 + }, + { + "epoch": 1.5339058335801008, + "grad_norm": 1.5562821807570975, + "learning_rate": 1.0115120966342145e-05, + "loss": 0.7023, + "step": 2590 + }, + { + "epoch": 1.5344980752146875, + "grad_norm": 3.920771426341168, + "learning_rate": 1.0108725616539648e-05, + "loss": 0.7465, + "step": 2591 + }, + { + "epoch": 1.5350903168492747, + "grad_norm": 2.6406023004555945, + "learning_rate": 1.0102330222262257e-05, + "loss": 0.6824, + "step": 2592 + }, + { + "epoch": 1.5356825584838614, + "grad_norm": 1.999159078713688, + "learning_rate": 1.0095934786126055e-05, + "loss": 0.7184, + "step": 2593 + }, + { + "epoch": 1.5362748001184483, + "grad_norm": 2.6618576577531012, + "learning_rate": 1.0089539310747127e-05, + "loss": 0.6993, + "step": 2594 + }, + { + "epoch": 1.5368670417530352, + "grad_norm": 2.088264527488616, + "learning_rate": 1.0083143798741587e-05, + "loss": 0.7093, + "step": 2595 + }, + { + "epoch": 1.5374592833876222, + "grad_norm": 1.2592517848012976, + "learning_rate": 1.007674825272556e-05, + "loss": 0.7187, + "step": 2596 + }, + { + "epoch": 1.538051525022209, + "grad_norm": 2.4043437880802703, + "learning_rate": 1.007035267531518e-05, + "loss": 0.7162, + "step": 2597 + }, + { + "epoch": 1.5386437666567958, + "grad_norm": 1.3595141476229315, + "learning_rate": 1.0063957069126602e-05, + "loss": 0.7122, + "step": 2598 + }, + { + "epoch": 1.539236008291383, + "grad_norm": 1.5459899337725218, + "learning_rate": 1.0057561436775982e-05, + "loss": 0.7631, + "step": 2599 + }, + { + "epoch": 1.5398282499259697, + "grad_norm": 1.6231837605639283, + "learning_rate": 1.0051165780879503e-05, + "loss": 0.7219, + "step": 2600 + }, + { + "epoch": 1.5404204915605568, + "grad_norm": 2.0281678385057265, + "learning_rate": 1.0044770104053336e-05, + "loss": 0.7281, + "step": 2601 + }, + { + "epoch": 1.5410127331951435, + "grad_norm": 1.761034640996413, + "learning_rate": 1.0038374408913684e-05, + "loss": 0.7147, + "step": 2602 + }, + { + "epoch": 1.5416049748297307, + "grad_norm": 2.4171957792447216, + "learning_rate": 1.0031978698076738e-05, + "loss": 0.7625, + "step": 2603 + }, + { + "epoch": 1.5421972164643174, + "grad_norm": 2.4766490230586955, + "learning_rate": 1.002558297415871e-05, + "loss": 0.7588, + "step": 2604 + }, + { + "epoch": 1.5427894580989043, + "grad_norm": 1.6895743443704907, + "learning_rate": 1.00191872397758e-05, + "loss": 0.7303, + "step": 2605 + }, + { + "epoch": 1.5433816997334913, + "grad_norm": 3.162304535278318, + "learning_rate": 1.0012791497544238e-05, + "loss": 0.7232, + "step": 2606 + }, + { + "epoch": 1.5439739413680782, + "grad_norm": 1.6202855977215762, + "learning_rate": 1.000639575008023e-05, + "loss": 0.7778, + "step": 2607 + }, + { + "epoch": 1.5445661830026651, + "grad_norm": 2.9346216243576317, + "learning_rate": 1e-05, + "loss": 0.7523, + "step": 2608 + }, + { + "epoch": 1.5451584246372518, + "grad_norm": 2.5997819364592805, + "learning_rate": 9.993604249919773e-06, + "loss": 0.7149, + "step": 2609 + }, + { + "epoch": 1.545750666271839, + "grad_norm": 2.2690927538275956, + "learning_rate": 9.987208502455767e-06, + "loss": 0.7492, + "step": 2610 + }, + { + "epoch": 1.5463429079064257, + "grad_norm": 2.0044261798649456, + "learning_rate": 9.980812760224202e-06, + "loss": 0.7161, + "step": 2611 + }, + { + "epoch": 1.5469351495410129, + "grad_norm": 1.7008339193116702, + "learning_rate": 9.974417025841293e-06, + "loss": 0.7075, + "step": 2612 + }, + { + "epoch": 1.5475273911755996, + "grad_norm": 2.802313549002366, + "learning_rate": 9.968021301923264e-06, + "loss": 0.7673, + "step": 2613 + }, + { + "epoch": 1.5481196328101867, + "grad_norm": 5.540492098493376, + "learning_rate": 9.961625591086321e-06, + "loss": 0.6843, + "step": 2614 + }, + { + "epoch": 1.5487118744447734, + "grad_norm": 1.7826155088059201, + "learning_rate": 9.955229895946666e-06, + "loss": 0.6854, + "step": 2615 + }, + { + "epoch": 1.5493041160793604, + "grad_norm": 3.214394133267478, + "learning_rate": 9.9488342191205e-06, + "loss": 0.7126, + "step": 2616 + }, + { + "epoch": 1.5498963577139473, + "grad_norm": 3.977691609819524, + "learning_rate": 9.942438563224018e-06, + "loss": 0.7541, + "step": 2617 + }, + { + "epoch": 1.5504885993485342, + "grad_norm": 2.7729164430172393, + "learning_rate": 9.936042930873403e-06, + "loss": 0.7343, + "step": 2618 + }, + { + "epoch": 1.5510808409831212, + "grad_norm": 3.2382422721234945, + "learning_rate": 9.929647324684823e-06, + "loss": 0.6899, + "step": 2619 + }, + { + "epoch": 1.5516730826177079, + "grad_norm": 1.712687804564103, + "learning_rate": 9.923251747274441e-06, + "loss": 0.7011, + "step": 2620 + }, + { + "epoch": 1.552265324252295, + "grad_norm": 2.1866276642869127, + "learning_rate": 9.916856201258413e-06, + "loss": 0.7067, + "step": 2621 + }, + { + "epoch": 1.5528575658868817, + "grad_norm": 1.4026718170049086, + "learning_rate": 9.910460689252876e-06, + "loss": 0.7055, + "step": 2622 + }, + { + "epoch": 1.5534498075214689, + "grad_norm": 1.3836603781583152, + "learning_rate": 9.904065213873949e-06, + "loss": 0.7652, + "step": 2623 + }, + { + "epoch": 1.5540420491560556, + "grad_norm": 2.193906980580802, + "learning_rate": 9.897669777737745e-06, + "loss": 0.7304, + "step": 2624 + }, + { + "epoch": 1.5546342907906427, + "grad_norm": 7.203396030176273, + "learning_rate": 9.891274383460354e-06, + "loss": 0.7812, + "step": 2625 + }, + { + "epoch": 1.5552265324252295, + "grad_norm": 1.8434299981876887, + "learning_rate": 9.884879033657859e-06, + "loss": 0.748, + "step": 2626 + }, + { + "epoch": 1.5558187740598164, + "grad_norm": 1.821123721515522, + "learning_rate": 9.878483730946308e-06, + "loss": 0.6896, + "step": 2627 + }, + { + "epoch": 1.5564110156944033, + "grad_norm": 1.7271066903414265, + "learning_rate": 9.872088477941748e-06, + "loss": 0.7252, + "step": 2628 + }, + { + "epoch": 1.5570032573289903, + "grad_norm": 1.925482681465068, + "learning_rate": 9.86569327726019e-06, + "loss": 0.7099, + "step": 2629 + }, + { + "epoch": 1.5575954989635772, + "grad_norm": 1.6134202565322096, + "learning_rate": 9.859298131517639e-06, + "loss": 0.7507, + "step": 2630 + }, + { + "epoch": 1.558187740598164, + "grad_norm": 3.513343627530898, + "learning_rate": 9.852903043330059e-06, + "loss": 0.7638, + "step": 2631 + }, + { + "epoch": 1.558779982232751, + "grad_norm": 1.442402404811214, + "learning_rate": 9.846508015313407e-06, + "loss": 0.7419, + "step": 2632 + }, + { + "epoch": 1.5593722238673378, + "grad_norm": 2.2944058513698544, + "learning_rate": 9.84011305008361e-06, + "loss": 0.7012, + "step": 2633 + }, + { + "epoch": 1.559964465501925, + "grad_norm": 4.277848662731923, + "learning_rate": 9.833718150256567e-06, + "loss": 0.7286, + "step": 2634 + }, + { + "epoch": 1.5605567071365116, + "grad_norm": 2.2320066876585987, + "learning_rate": 9.827323318448148e-06, + "loss": 0.7014, + "step": 2635 + }, + { + "epoch": 1.5611489487710986, + "grad_norm": 2.592881982866087, + "learning_rate": 9.820928557274202e-06, + "loss": 0.7711, + "step": 2636 + }, + { + "epoch": 1.5617411904056855, + "grad_norm": 2.3317654184305923, + "learning_rate": 9.814533869350547e-06, + "loss": 0.6949, + "step": 2637 + }, + { + "epoch": 1.5623334320402724, + "grad_norm": 3.3958407254332323, + "learning_rate": 9.808139257292971e-06, + "loss": 0.7517, + "step": 2638 + }, + { + "epoch": 1.5629256736748594, + "grad_norm": 1.8868331278047943, + "learning_rate": 9.801744723717225e-06, + "loss": 0.7492, + "step": 2639 + }, + { + "epoch": 1.5635179153094463, + "grad_norm": 1.899167075781751, + "learning_rate": 9.795350271239034e-06, + "loss": 0.7545, + "step": 2640 + }, + { + "epoch": 1.5641101569440332, + "grad_norm": 1.6299887369278279, + "learning_rate": 9.78895590247409e-06, + "loss": 0.7296, + "step": 2641 + }, + { + "epoch": 1.56470239857862, + "grad_norm": 86.37994019353313, + "learning_rate": 9.782561620038055e-06, + "loss": 0.7508, + "step": 2642 + }, + { + "epoch": 1.565294640213207, + "grad_norm": 2.0626173331144932, + "learning_rate": 9.77616742654654e-06, + "loss": 0.7579, + "step": 2643 + }, + { + "epoch": 1.5658868818477938, + "grad_norm": 3.3316806059009325, + "learning_rate": 9.769773324615133e-06, + "loss": 0.7028, + "step": 2644 + }, + { + "epoch": 1.566479123482381, + "grad_norm": 1.7459776374665734, + "learning_rate": 9.763379316859386e-06, + "loss": 0.711, + "step": 2645 + }, + { + "epoch": 1.5670713651169677, + "grad_norm": 4.838875211558403, + "learning_rate": 9.756985405894802e-06, + "loss": 0.7106, + "step": 2646 + }, + { + "epoch": 1.5676636067515546, + "grad_norm": 1.3863030191233248, + "learning_rate": 9.750591594336854e-06, + "loss": 0.7431, + "step": 2647 + }, + { + "epoch": 1.5682558483861415, + "grad_norm": 1.734491733463122, + "learning_rate": 9.744197884800968e-06, + "loss": 0.7221, + "step": 2648 + }, + { + "epoch": 1.5688480900207284, + "grad_norm": 1.4554333705299076, + "learning_rate": 9.73780427990253e-06, + "loss": 0.7297, + "step": 2649 + }, + { + "epoch": 1.5694403316553154, + "grad_norm": 3.2911736777923952, + "learning_rate": 9.731410782256889e-06, + "loss": 0.7361, + "step": 2650 + }, + { + "epoch": 1.5700325732899023, + "grad_norm": 1.3472702183611696, + "learning_rate": 9.72501739447934e-06, + "loss": 0.7268, + "step": 2651 + }, + { + "epoch": 1.5706248149244892, + "grad_norm": 1.7709537123753905, + "learning_rate": 9.718624119185138e-06, + "loss": 0.7226, + "step": 2652 + }, + { + "epoch": 1.571217056559076, + "grad_norm": 2.0914175608541434, + "learning_rate": 9.712230958989494e-06, + "loss": 0.7413, + "step": 2653 + }, + { + "epoch": 1.571809298193663, + "grad_norm": 1.1748662544040405, + "learning_rate": 9.705837916507575e-06, + "loss": 0.7256, + "step": 2654 + }, + { + "epoch": 1.5724015398282498, + "grad_norm": 1.9165526544435088, + "learning_rate": 9.699444994354483e-06, + "loss": 0.7225, + "step": 2655 + }, + { + "epoch": 1.572993781462837, + "grad_norm": 1.4887726373278507, + "learning_rate": 9.693052195145292e-06, + "loss": 0.7339, + "step": 2656 + }, + { + "epoch": 1.5735860230974237, + "grad_norm": 2.8971799022670233, + "learning_rate": 9.68665952149501e-06, + "loss": 0.7242, + "step": 2657 + }, + { + "epoch": 1.5741782647320106, + "grad_norm": 2.3325271582358895, + "learning_rate": 9.680266976018613e-06, + "loss": 0.7538, + "step": 2658 + }, + { + "epoch": 1.5747705063665975, + "grad_norm": 1.4580873725563652, + "learning_rate": 9.673874561330994e-06, + "loss": 0.7313, + "step": 2659 + }, + { + "epoch": 1.5753627480011845, + "grad_norm": 1.498906817025249, + "learning_rate": 9.66748228004702e-06, + "loss": 0.7552, + "step": 2660 + }, + { + "epoch": 1.5759549896357714, + "grad_norm": 1.0553079729268877, + "learning_rate": 9.661090134781493e-06, + "loss": 0.7426, + "step": 2661 + }, + { + "epoch": 1.5765472312703583, + "grad_norm": 1.3421402504246218, + "learning_rate": 9.654698128149162e-06, + "loss": 0.7538, + "step": 2662 + }, + { + "epoch": 1.5771394729049453, + "grad_norm": 2.3722370387111322, + "learning_rate": 9.648306262764708e-06, + "loss": 0.7284, + "step": 2663 + }, + { + "epoch": 1.577731714539532, + "grad_norm": 1.80136298232067, + "learning_rate": 9.64191454124277e-06, + "loss": 0.7216, + "step": 2664 + }, + { + "epoch": 1.5783239561741191, + "grad_norm": 2.4789404827141723, + "learning_rate": 9.635522966197923e-06, + "loss": 0.6985, + "step": 2665 + }, + { + "epoch": 1.5789161978087058, + "grad_norm": 1.6133875143981933, + "learning_rate": 9.62913154024468e-06, + "loss": 0.7065, + "step": 2666 + }, + { + "epoch": 1.579508439443293, + "grad_norm": 1.878564085742203, + "learning_rate": 9.622740265997488e-06, + "loss": 0.7201, + "step": 2667 + }, + { + "epoch": 1.5801006810778797, + "grad_norm": 1.561377075762865, + "learning_rate": 9.61634914607074e-06, + "loss": 0.7158, + "step": 2668 + }, + { + "epoch": 1.5806929227124666, + "grad_norm": 1.7864895574938302, + "learning_rate": 9.60995818307877e-06, + "loss": 0.729, + "step": 2669 + }, + { + "epoch": 1.5812851643470536, + "grad_norm": 1.661603731541404, + "learning_rate": 9.603567379635836e-06, + "loss": 0.7625, + "step": 2670 + }, + { + "epoch": 1.5818774059816405, + "grad_norm": 1.299944010246059, + "learning_rate": 9.597176738356134e-06, + "loss": 0.7081, + "step": 2671 + }, + { + "epoch": 1.5824696476162274, + "grad_norm": 2.1529097270049706, + "learning_rate": 9.590786261853798e-06, + "loss": 0.6694, + "step": 2672 + }, + { + "epoch": 1.5830618892508144, + "grad_norm": 1.4703201941101165, + "learning_rate": 9.584395952742892e-06, + "loss": 0.696, + "step": 2673 + }, + { + "epoch": 1.5836541308854013, + "grad_norm": 1.9214984260880197, + "learning_rate": 9.578005813637414e-06, + "loss": 0.7194, + "step": 2674 + }, + { + "epoch": 1.584246372519988, + "grad_norm": 1.3154793348516993, + "learning_rate": 9.571615847151287e-06, + "loss": 0.7264, + "step": 2675 + }, + { + "epoch": 1.5848386141545752, + "grad_norm": 2.0731383642895658, + "learning_rate": 9.565226055898366e-06, + "loss": 0.7398, + "step": 2676 + }, + { + "epoch": 1.5854308557891619, + "grad_norm": 1.269718722070663, + "learning_rate": 9.558836442492437e-06, + "loss": 0.7618, + "step": 2677 + }, + { + "epoch": 1.586023097423749, + "grad_norm": 2.326022738330298, + "learning_rate": 9.552447009547214e-06, + "loss": 0.7153, + "step": 2678 + }, + { + "epoch": 1.5866153390583357, + "grad_norm": 2.108051114609614, + "learning_rate": 9.546057759676328e-06, + "loss": 0.6837, + "step": 2679 + }, + { + "epoch": 1.5872075806929227, + "grad_norm": 1.4247839172166672, + "learning_rate": 9.539668695493344e-06, + "loss": 0.7276, + "step": 2680 + }, + { + "epoch": 1.5877998223275096, + "grad_norm": 1.0542131540553648, + "learning_rate": 9.53327981961175e-06, + "loss": 0.7446, + "step": 2681 + }, + { + "epoch": 1.5883920639620965, + "grad_norm": 1.8059660461442077, + "learning_rate": 9.52689113464496e-06, + "loss": 0.731, + "step": 2682 + }, + { + "epoch": 1.5889843055966835, + "grad_norm": 1.568683168663255, + "learning_rate": 9.520502643206293e-06, + "loss": 0.712, + "step": 2683 + }, + { + "epoch": 1.5895765472312704, + "grad_norm": 3.4501788436091254, + "learning_rate": 9.514114347909011e-06, + "loss": 0.7132, + "step": 2684 + }, + { + "epoch": 1.5901687888658573, + "grad_norm": 2.0059923334239707, + "learning_rate": 9.507726251366283e-06, + "loss": 0.7292, + "step": 2685 + }, + { + "epoch": 1.590761030500444, + "grad_norm": 3.8951776491133745, + "learning_rate": 9.501338356191204e-06, + "loss": 0.7396, + "step": 2686 + }, + { + "epoch": 1.5913532721350312, + "grad_norm": 2.4788086356042216, + "learning_rate": 9.494950664996771e-06, + "loss": 0.7391, + "step": 2687 + }, + { + "epoch": 1.591945513769618, + "grad_norm": 2.9671138222012954, + "learning_rate": 9.488563180395922e-06, + "loss": 0.7501, + "step": 2688 + }, + { + "epoch": 1.592537755404205, + "grad_norm": 1.7325289372388857, + "learning_rate": 9.482175905001489e-06, + "loss": 0.7592, + "step": 2689 + }, + { + "epoch": 1.5931299970387918, + "grad_norm": 2.5802186860418552, + "learning_rate": 9.475788841426232e-06, + "loss": 0.6955, + "step": 2690 + }, + { + "epoch": 1.5937222386733787, + "grad_norm": 2.804837641485897, + "learning_rate": 9.469401992282817e-06, + "loss": 0.7185, + "step": 2691 + }, + { + "epoch": 1.5943144803079656, + "grad_norm": 2.4503401391313893, + "learning_rate": 9.463015360183819e-06, + "loss": 0.7131, + "step": 2692 + }, + { + "epoch": 1.5949067219425526, + "grad_norm": 2.1268807947669606, + "learning_rate": 9.456628947741738e-06, + "loss": 0.7144, + "step": 2693 + }, + { + "epoch": 1.5954989635771395, + "grad_norm": 2.110094339237852, + "learning_rate": 9.450242757568975e-06, + "loss": 0.7282, + "step": 2694 + }, + { + "epoch": 1.5960912052117264, + "grad_norm": 1.7700346930524404, + "learning_rate": 9.443856792277836e-06, + "loss": 0.7554, + "step": 2695 + }, + { + "epoch": 1.5966834468463134, + "grad_norm": 1.0280242622620175, + "learning_rate": 9.43747105448054e-06, + "loss": 0.7444, + "step": 2696 + }, + { + "epoch": 1.5972756884809, + "grad_norm": 1.3872664321744306, + "learning_rate": 9.431085546789218e-06, + "loss": 0.7354, + "step": 2697 + }, + { + "epoch": 1.5978679301154872, + "grad_norm": 1.2479775404016484, + "learning_rate": 9.424700271815901e-06, + "loss": 0.7271, + "step": 2698 + }, + { + "epoch": 1.598460171750074, + "grad_norm": 1.6017123862586502, + "learning_rate": 9.41831523217252e-06, + "loss": 0.7117, + "step": 2699 + }, + { + "epoch": 1.599052413384661, + "grad_norm": 3.210802359158483, + "learning_rate": 9.41193043047092e-06, + "loss": 0.7564, + "step": 2700 + }, + { + "epoch": 1.5996446550192478, + "grad_norm": 3.6114823786582515, + "learning_rate": 9.405545869322843e-06, + "loss": 0.7498, + "step": 2701 + }, + { + "epoch": 1.6002368966538347, + "grad_norm": 1.6710996935533942, + "learning_rate": 9.399161551339933e-06, + "loss": 0.7275, + "step": 2702 + }, + { + "epoch": 1.6008291382884217, + "grad_norm": 1.6984346113201354, + "learning_rate": 9.392777479133736e-06, + "loss": 0.7232, + "step": 2703 + }, + { + "epoch": 1.6014213799230086, + "grad_norm": 1.9419725358546698, + "learning_rate": 9.386393655315696e-06, + "loss": 0.7397, + "step": 2704 + }, + { + "epoch": 1.6020136215575955, + "grad_norm": 3.9234250963518043, + "learning_rate": 9.380010082497152e-06, + "loss": 0.7185, + "step": 2705 + }, + { + "epoch": 1.6026058631921825, + "grad_norm": 1.6194229418389212, + "learning_rate": 9.373626763289352e-06, + "loss": 0.7287, + "step": 2706 + }, + { + "epoch": 1.6031981048267694, + "grad_norm": 1.3815447699004129, + "learning_rate": 9.367243700303427e-06, + "loss": 0.7354, + "step": 2707 + }, + { + "epoch": 1.603790346461356, + "grad_norm": 2.310394866629508, + "learning_rate": 9.36086089615041e-06, + "loss": 0.7429, + "step": 2708 + }, + { + "epoch": 1.6043825880959433, + "grad_norm": 24.261527338301693, + "learning_rate": 9.354478353441226e-06, + "loss": 0.7592, + "step": 2709 + }, + { + "epoch": 1.60497482973053, + "grad_norm": 2.170178314510858, + "learning_rate": 9.3480960747867e-06, + "loss": 0.7337, + "step": 2710 + }, + { + "epoch": 1.6055670713651171, + "grad_norm": 3.053942328323577, + "learning_rate": 9.341714062797533e-06, + "loss": 0.7174, + "step": 2711 + }, + { + "epoch": 1.6061593129997038, + "grad_norm": 1.3735266945797255, + "learning_rate": 9.335332320084331e-06, + "loss": 0.7538, + "step": 2712 + }, + { + "epoch": 1.6067515546342908, + "grad_norm": 1.8525060260331747, + "learning_rate": 9.32895084925759e-06, + "loss": 0.7206, + "step": 2713 + }, + { + "epoch": 1.6073437962688777, + "grad_norm": 1.6312829687512218, + "learning_rate": 9.322569652927685e-06, + "loss": 0.7047, + "step": 2714 + }, + { + "epoch": 1.6079360379034646, + "grad_norm": 1.5960390029649218, + "learning_rate": 9.316188733704883e-06, + "loss": 0.7373, + "step": 2715 + }, + { + "epoch": 1.6085282795380516, + "grad_norm": 1.6939174639156191, + "learning_rate": 9.309808094199343e-06, + "loss": 0.7343, + "step": 2716 + }, + { + "epoch": 1.6091205211726385, + "grad_norm": 5.647495987042836, + "learning_rate": 9.303427737021105e-06, + "loss": 0.7441, + "step": 2717 + }, + { + "epoch": 1.6097127628072254, + "grad_norm": 1.0834842055792864, + "learning_rate": 9.297047664780093e-06, + "loss": 0.7104, + "step": 2718 + }, + { + "epoch": 1.6103050044418121, + "grad_norm": 1.3533184427520821, + "learning_rate": 9.290667880086115e-06, + "loss": 0.7428, + "step": 2719 + }, + { + "epoch": 1.6108972460763993, + "grad_norm": 1.1679095552719452, + "learning_rate": 9.284288385548858e-06, + "loss": 0.7262, + "step": 2720 + }, + { + "epoch": 1.611489487710986, + "grad_norm": 8.140610596435375, + "learning_rate": 9.2779091837779e-06, + "loss": 0.7327, + "step": 2721 + }, + { + "epoch": 1.6120817293455731, + "grad_norm": 1.2293837136922272, + "learning_rate": 9.271530277382695e-06, + "loss": 0.7129, + "step": 2722 + }, + { + "epoch": 1.6126739709801599, + "grad_norm": 1.1470847973636114, + "learning_rate": 9.265151668972566e-06, + "loss": 0.6852, + "step": 2723 + }, + { + "epoch": 1.6132662126147468, + "grad_norm": 2.4405096525740038, + "learning_rate": 9.258773361156725e-06, + "loss": 0.725, + "step": 2724 + }, + { + "epoch": 1.6138584542493337, + "grad_norm": 1.336420955664339, + "learning_rate": 9.252395356544263e-06, + "loss": 0.6653, + "step": 2725 + }, + { + "epoch": 1.6144506958839207, + "grad_norm": 3.435337274510319, + "learning_rate": 9.246017657744142e-06, + "loss": 0.7157, + "step": 2726 + }, + { + "epoch": 1.6150429375185076, + "grad_norm": 1.7205900312208158, + "learning_rate": 9.239640267365197e-06, + "loss": 0.6767, + "step": 2727 + }, + { + "epoch": 1.6156351791530945, + "grad_norm": 0.9404428582680933, + "learning_rate": 9.233263188016138e-06, + "loss": 0.7388, + "step": 2728 + }, + { + "epoch": 1.6162274207876814, + "grad_norm": 1.643384773023064, + "learning_rate": 9.22688642230555e-06, + "loss": 0.7729, + "step": 2729 + }, + { + "epoch": 1.6168196624222682, + "grad_norm": 0.9226766736351931, + "learning_rate": 9.220509972841893e-06, + "loss": 0.7097, + "step": 2730 + }, + { + "epoch": 1.6174119040568553, + "grad_norm": 1.292558562901911, + "learning_rate": 9.214133842233486e-06, + "loss": 0.7638, + "step": 2731 + }, + { + "epoch": 1.618004145691442, + "grad_norm": 1.268236145418197, + "learning_rate": 9.207758033088533e-06, + "loss": 0.7443, + "step": 2732 + }, + { + "epoch": 1.6185963873260292, + "grad_norm": 1.152636990406385, + "learning_rate": 9.20138254801509e-06, + "loss": 0.7477, + "step": 2733 + }, + { + "epoch": 1.6191886289606159, + "grad_norm": 2.325992339134092, + "learning_rate": 9.195007389621098e-06, + "loss": 0.7597, + "step": 2734 + }, + { + "epoch": 1.6197808705952028, + "grad_norm": 1.3096504733315142, + "learning_rate": 9.188632560514345e-06, + "loss": 0.7028, + "step": 2735 + }, + { + "epoch": 1.6203731122297897, + "grad_norm": 1.0546949833315633, + "learning_rate": 9.182258063302504e-06, + "loss": 0.7131, + "step": 2736 + }, + { + "epoch": 1.6209653538643767, + "grad_norm": 0.9210797626479261, + "learning_rate": 9.175883900593095e-06, + "loss": 0.7453, + "step": 2737 + }, + { + "epoch": 1.6215575954989636, + "grad_norm": 1.5075058070737895, + "learning_rate": 9.16951007499352e-06, + "loss": 0.7248, + "step": 2738 + }, + { + "epoch": 1.6221498371335505, + "grad_norm": 1.169767212334899, + "learning_rate": 9.163136589111019e-06, + "loss": 0.732, + "step": 2739 + }, + { + "epoch": 1.6227420787681375, + "grad_norm": 1.615392214615148, + "learning_rate": 9.156763445552714e-06, + "loss": 0.7017, + "step": 2740 + }, + { + "epoch": 1.6233343204027242, + "grad_norm": 1.0292405348020153, + "learning_rate": 9.150390646925578e-06, + "loss": 0.69, + "step": 2741 + }, + { + "epoch": 1.6239265620373113, + "grad_norm": 1.8762128663128925, + "learning_rate": 9.144018195836445e-06, + "loss": 0.7546, + "step": 2742 + }, + { + "epoch": 1.624518803671898, + "grad_norm": 0.8711466459720059, + "learning_rate": 9.137646094892e-06, + "loss": 0.7039, + "step": 2743 + }, + { + "epoch": 1.6251110453064852, + "grad_norm": 0.9463455729109375, + "learning_rate": 9.131274346698797e-06, + "loss": 0.7569, + "step": 2744 + }, + { + "epoch": 1.625703286941072, + "grad_norm": 0.9011458492231906, + "learning_rate": 9.124902953863235e-06, + "loss": 0.7567, + "step": 2745 + }, + { + "epoch": 1.6262955285756588, + "grad_norm": 2.289091980573055, + "learning_rate": 9.118531918991578e-06, + "loss": 0.749, + "step": 2746 + }, + { + "epoch": 1.6268877702102458, + "grad_norm": 1.2146535737974766, + "learning_rate": 9.112161244689931e-06, + "loss": 0.7254, + "step": 2747 + }, + { + "epoch": 1.6274800118448327, + "grad_norm": 2.4627393306018646, + "learning_rate": 9.105790933564259e-06, + "loss": 0.7543, + "step": 2748 + }, + { + "epoch": 1.6280722534794196, + "grad_norm": 1.002159085261613, + "learning_rate": 9.09942098822038e-06, + "loss": 0.7394, + "step": 2749 + }, + { + "epoch": 1.6286644951140063, + "grad_norm": 1.639564743885887, + "learning_rate": 9.09305141126396e-06, + "loss": 0.7178, + "step": 2750 + }, + { + "epoch": 1.6292567367485935, + "grad_norm": 0.9813191864395472, + "learning_rate": 9.086682205300512e-06, + "loss": 0.7354, + "step": 2751 + }, + { + "epoch": 1.6298489783831802, + "grad_norm": 2.1747632589697155, + "learning_rate": 9.080313372935399e-06, + "loss": 0.7072, + "step": 2752 + }, + { + "epoch": 1.6304412200177674, + "grad_norm": 0.8902643816753388, + "learning_rate": 9.073944916773835e-06, + "loss": 0.7272, + "step": 2753 + }, + { + "epoch": 1.631033461652354, + "grad_norm": 0.883321085183472, + "learning_rate": 9.067576839420876e-06, + "loss": 0.7433, + "step": 2754 + }, + { + "epoch": 1.6316257032869412, + "grad_norm": 1.191509270778969, + "learning_rate": 9.06120914348142e-06, + "loss": 0.7682, + "step": 2755 + }, + { + "epoch": 1.632217944921528, + "grad_norm": 0.8931773470587896, + "learning_rate": 9.054841831560216e-06, + "loss": 0.7217, + "step": 2756 + }, + { + "epoch": 1.6328101865561149, + "grad_norm": 0.9206149509553022, + "learning_rate": 9.04847490626185e-06, + "loss": 0.7244, + "step": 2757 + }, + { + "epoch": 1.6334024281907018, + "grad_norm": 0.9602886650943193, + "learning_rate": 9.042108370190757e-06, + "loss": 0.7327, + "step": 2758 + }, + { + "epoch": 1.6339946698252887, + "grad_norm": 1.0731812685624111, + "learning_rate": 9.035742225951203e-06, + "loss": 0.7128, + "step": 2759 + }, + { + "epoch": 1.6345869114598757, + "grad_norm": 1.0001996418004009, + "learning_rate": 9.029376476147303e-06, + "loss": 0.7283, + "step": 2760 + }, + { + "epoch": 1.6351791530944624, + "grad_norm": 1.0101272999135227, + "learning_rate": 9.023011123383002e-06, + "loss": 0.7698, + "step": 2761 + }, + { + "epoch": 1.6357713947290495, + "grad_norm": 7.445183865475451, + "learning_rate": 9.016646170262096e-06, + "loss": 0.7248, + "step": 2762 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 1.0354333262038447, + "learning_rate": 9.010281619388198e-06, + "loss": 0.7165, + "step": 2763 + }, + { + "epoch": 1.6369558779982234, + "grad_norm": 1.0303790501337338, + "learning_rate": 9.003917473364774e-06, + "loss": 0.773, + "step": 2764 + }, + { + "epoch": 1.63754811963281, + "grad_norm": 1.0048176835409444, + "learning_rate": 8.997553734795115e-06, + "loss": 0.7446, + "step": 2765 + }, + { + "epoch": 1.6381403612673973, + "grad_norm": 0.8644074880461828, + "learning_rate": 8.991190406282352e-06, + "loss": 0.7565, + "step": 2766 + }, + { + "epoch": 1.638732602901984, + "grad_norm": 1.484278848736965, + "learning_rate": 8.984827490429437e-06, + "loss": 0.7242, + "step": 2767 + }, + { + "epoch": 1.639324844536571, + "grad_norm": 1.2307819145316636, + "learning_rate": 8.978464989839165e-06, + "loss": 0.7396, + "step": 2768 + }, + { + "epoch": 1.6399170861711578, + "grad_norm": 1.2176803902884394, + "learning_rate": 8.972102907114157e-06, + "loss": 0.745, + "step": 2769 + }, + { + "epoch": 1.6405093278057448, + "grad_norm": 1.0887898932988884, + "learning_rate": 8.965741244856864e-06, + "loss": 0.6943, + "step": 2770 + }, + { + "epoch": 1.6411015694403317, + "grad_norm": 0.988196915871078, + "learning_rate": 8.959380005669559e-06, + "loss": 0.7085, + "step": 2771 + }, + { + "epoch": 1.6416938110749184, + "grad_norm": 0.9375850317494164, + "learning_rate": 8.953019192154344e-06, + "loss": 0.7181, + "step": 2772 + }, + { + "epoch": 1.6422860527095056, + "grad_norm": 1.0916414927857014, + "learning_rate": 8.946658806913158e-06, + "loss": 0.7182, + "step": 2773 + }, + { + "epoch": 1.6428782943440923, + "grad_norm": 0.9559469293114459, + "learning_rate": 8.940298852547753e-06, + "loss": 0.6924, + "step": 2774 + }, + { + "epoch": 1.6434705359786794, + "grad_norm": 0.9122282528732937, + "learning_rate": 8.933939331659707e-06, + "loss": 0.6802, + "step": 2775 + }, + { + "epoch": 1.6440627776132661, + "grad_norm": 1.2827645783241866, + "learning_rate": 8.927580246850418e-06, + "loss": 0.7219, + "step": 2776 + }, + { + "epoch": 1.6446550192478533, + "grad_norm": 1.0856897372943664, + "learning_rate": 8.921221600721115e-06, + "loss": 0.7749, + "step": 2777 + }, + { + "epoch": 1.64524726088244, + "grad_norm": 0.9976872404160881, + "learning_rate": 8.914863395872844e-06, + "loss": 0.6998, + "step": 2778 + }, + { + "epoch": 1.645839502517027, + "grad_norm": 0.9187575917239228, + "learning_rate": 8.908505634906461e-06, + "loss": 0.6714, + "step": 2779 + }, + { + "epoch": 1.6464317441516139, + "grad_norm": 0.8887264946359307, + "learning_rate": 8.90214832042265e-06, + "loss": 0.7127, + "step": 2780 + }, + { + "epoch": 1.6470239857862008, + "grad_norm": 0.8836649493280779, + "learning_rate": 8.895791455021912e-06, + "loss": 0.716, + "step": 2781 + }, + { + "epoch": 1.6476162274207877, + "grad_norm": 0.9011624604992315, + "learning_rate": 8.889435041304565e-06, + "loss": 0.723, + "step": 2782 + }, + { + "epoch": 1.6482084690553744, + "grad_norm": 1.2551214513482096, + "learning_rate": 8.88307908187073e-06, + "loss": 0.7301, + "step": 2783 + }, + { + "epoch": 1.6488007106899616, + "grad_norm": 1.030561508128019, + "learning_rate": 8.876723579320363e-06, + "loss": 0.6991, + "step": 2784 + }, + { + "epoch": 1.6493929523245483, + "grad_norm": 0.9450181046757289, + "learning_rate": 8.870368536253213e-06, + "loss": 0.7391, + "step": 2785 + }, + { + "epoch": 1.6499851939591355, + "grad_norm": 1.0028902776005366, + "learning_rate": 8.86401395526886e-06, + "loss": 0.7461, + "step": 2786 + }, + { + "epoch": 1.6505774355937222, + "grad_norm": 1.0849709154792493, + "learning_rate": 8.857659838966672e-06, + "loss": 0.7124, + "step": 2787 + }, + { + "epoch": 1.651169677228309, + "grad_norm": 1.418277931700241, + "learning_rate": 8.85130618994585e-06, + "loss": 0.7166, + "step": 2788 + }, + { + "epoch": 1.651761918862896, + "grad_norm": 1.0799657812191223, + "learning_rate": 8.844953010805388e-06, + "loss": 0.6659, + "step": 2789 + }, + { + "epoch": 1.652354160497483, + "grad_norm": 0.9047344447663203, + "learning_rate": 8.838600304144102e-06, + "loss": 0.7237, + "step": 2790 + }, + { + "epoch": 1.65294640213207, + "grad_norm": 0.8954991208705074, + "learning_rate": 8.832248072560594e-06, + "loss": 0.7404, + "step": 2791 + }, + { + "epoch": 1.6535386437666568, + "grad_norm": 1.255225726733933, + "learning_rate": 8.825896318653294e-06, + "loss": 0.718, + "step": 2792 + }, + { + "epoch": 1.6541308854012438, + "grad_norm": 1.8935384231370476, + "learning_rate": 8.81954504502042e-06, + "loss": 0.741, + "step": 2793 + }, + { + "epoch": 1.6547231270358305, + "grad_norm": 0.8395781786009291, + "learning_rate": 8.813194254260006e-06, + "loss": 0.7077, + "step": 2794 + }, + { + "epoch": 1.6553153686704176, + "grad_norm": 1.1680365597009021, + "learning_rate": 8.806843948969875e-06, + "loss": 0.7082, + "step": 2795 + }, + { + "epoch": 1.6559076103050043, + "grad_norm": 0.912567977351368, + "learning_rate": 8.800494131747667e-06, + "loss": 0.7654, + "step": 2796 + }, + { + "epoch": 1.6564998519395915, + "grad_norm": 2.4309897712239166, + "learning_rate": 8.794144805190809e-06, + "loss": 0.7536, + "step": 2797 + }, + { + "epoch": 1.6570920935741782, + "grad_norm": 1.0403551057860285, + "learning_rate": 8.787795971896536e-06, + "loss": 0.7459, + "step": 2798 + }, + { + "epoch": 1.6576843352087651, + "grad_norm": 1.5383317536648582, + "learning_rate": 8.781447634461874e-06, + "loss": 0.722, + "step": 2799 + }, + { + "epoch": 1.658276576843352, + "grad_norm": 0.9088453416120332, + "learning_rate": 8.775099795483651e-06, + "loss": 0.7737, + "step": 2800 + }, + { + "epoch": 1.658868818477939, + "grad_norm": 1.042835871852608, + "learning_rate": 8.768752457558492e-06, + "loss": 0.7334, + "step": 2801 + }, + { + "epoch": 1.659461060112526, + "grad_norm": 1.0274279053461044, + "learning_rate": 8.762405623282817e-06, + "loss": 0.6733, + "step": 2802 + }, + { + "epoch": 1.6600533017471129, + "grad_norm": 0.9918727284685397, + "learning_rate": 8.756059295252833e-06, + "loss": 0.7416, + "step": 2803 + }, + { + "epoch": 1.6606455433816998, + "grad_norm": 1.306893253891884, + "learning_rate": 8.749713476064547e-06, + "loss": 0.7739, + "step": 2804 + }, + { + "epoch": 1.6612377850162865, + "grad_norm": 1.2615372668097604, + "learning_rate": 8.743368168313757e-06, + "loss": 0.7619, + "step": 2805 + }, + { + "epoch": 1.6618300266508736, + "grad_norm": 0.905731661272337, + "learning_rate": 8.737023374596051e-06, + "loss": 0.73, + "step": 2806 + }, + { + "epoch": 1.6624222682854604, + "grad_norm": 1.609306011429277, + "learning_rate": 8.730679097506804e-06, + "loss": 0.7174, + "step": 2807 + }, + { + "epoch": 1.6630145099200475, + "grad_norm": 1.294248981532454, + "learning_rate": 8.724335339641185e-06, + "loss": 0.7094, + "step": 2808 + }, + { + "epoch": 1.6636067515546342, + "grad_norm": 0.939745188957189, + "learning_rate": 8.717992103594142e-06, + "loss": 0.6777, + "step": 2809 + }, + { + "epoch": 1.6641989931892212, + "grad_norm": 1.3256782257737456, + "learning_rate": 8.711649391960424e-06, + "loss": 0.6926, + "step": 2810 + }, + { + "epoch": 1.664791234823808, + "grad_norm": 0.8735845886685419, + "learning_rate": 8.705307207334552e-06, + "loss": 0.7247, + "step": 2811 + }, + { + "epoch": 1.665383476458395, + "grad_norm": 1.0270651510259472, + "learning_rate": 8.698965552310834e-06, + "loss": 0.7213, + "step": 2812 + }, + { + "epoch": 1.665975718092982, + "grad_norm": 0.8887392623106478, + "learning_rate": 8.692624429483364e-06, + "loss": 0.7645, + "step": 2813 + }, + { + "epoch": 1.6665679597275689, + "grad_norm": 1.1018807997801454, + "learning_rate": 8.686283841446027e-06, + "loss": 0.7248, + "step": 2814 + }, + { + "epoch": 1.6671602013621558, + "grad_norm": 0.9580382702616205, + "learning_rate": 8.679943790792466e-06, + "loss": 0.704, + "step": 2815 + }, + { + "epoch": 1.6677524429967425, + "grad_norm": 1.775823863507453, + "learning_rate": 8.673604280116127e-06, + "loss": 0.7412, + "step": 2816 + }, + { + "epoch": 1.6683446846313297, + "grad_norm": 0.8788118750792545, + "learning_rate": 8.667265312010224e-06, + "loss": 0.7352, + "step": 2817 + }, + { + "epoch": 1.6689369262659164, + "grad_norm": 1.55508648257972, + "learning_rate": 8.660926889067753e-06, + "loss": 0.7357, + "step": 2818 + }, + { + "epoch": 1.6695291679005035, + "grad_norm": 1.062045887571016, + "learning_rate": 8.654589013881481e-06, + "loss": 0.7618, + "step": 2819 + }, + { + "epoch": 1.6701214095350903, + "grad_norm": 0.8983582875998769, + "learning_rate": 8.648251689043961e-06, + "loss": 0.7351, + "step": 2820 + }, + { + "epoch": 1.6707136511696772, + "grad_norm": 1.03505281944072, + "learning_rate": 8.641914917147512e-06, + "loss": 0.7224, + "step": 2821 + }, + { + "epoch": 1.6713058928042641, + "grad_norm": 0.9169880645190774, + "learning_rate": 8.635578700784232e-06, + "loss": 0.7079, + "step": 2822 + }, + { + "epoch": 1.671898134438851, + "grad_norm": 1.2057396000296163, + "learning_rate": 8.629243042545989e-06, + "loss": 0.7403, + "step": 2823 + }, + { + "epoch": 1.672490376073438, + "grad_norm": 1.0465851255608474, + "learning_rate": 8.622907945024418e-06, + "loss": 0.7507, + "step": 2824 + }, + { + "epoch": 1.673082617708025, + "grad_norm": 0.9218356861943021, + "learning_rate": 8.616573410810938e-06, + "loss": 0.6975, + "step": 2825 + }, + { + "epoch": 1.6736748593426118, + "grad_norm": 1.1722796718213668, + "learning_rate": 8.61023944249673e-06, + "loss": 0.6915, + "step": 2826 + }, + { + "epoch": 1.6742671009771986, + "grad_norm": 1.0126608768047232, + "learning_rate": 8.603906042672738e-06, + "loss": 0.7349, + "step": 2827 + }, + { + "epoch": 1.6748593426117857, + "grad_norm": 0.8864132486291652, + "learning_rate": 8.597573213929677e-06, + "loss": 0.7473, + "step": 2828 + }, + { + "epoch": 1.6754515842463724, + "grad_norm": 0.8297726828669127, + "learning_rate": 8.591240958858036e-06, + "loss": 0.6998, + "step": 2829 + }, + { + "epoch": 1.6760438258809596, + "grad_norm": 1.0399711698669063, + "learning_rate": 8.584909280048064e-06, + "loss": 0.6883, + "step": 2830 + }, + { + "epoch": 1.6766360675155463, + "grad_norm": 1.4494905018858395, + "learning_rate": 8.57857818008977e-06, + "loss": 0.7302, + "step": 2831 + }, + { + "epoch": 1.6772283091501332, + "grad_norm": 2.2622801471005656, + "learning_rate": 8.572247661572926e-06, + "loss": 0.7195, + "step": 2832 + }, + { + "epoch": 1.6778205507847201, + "grad_norm": 0.9090049659373163, + "learning_rate": 8.565917727087078e-06, + "loss": 0.6993, + "step": 2833 + }, + { + "epoch": 1.678412792419307, + "grad_norm": 1.0718271118863054, + "learning_rate": 8.559588379221525e-06, + "loss": 0.7332, + "step": 2834 + }, + { + "epoch": 1.679005034053894, + "grad_norm": 1.3513550671215728, + "learning_rate": 8.55325962056532e-06, + "loss": 0.7081, + "step": 2835 + }, + { + "epoch": 1.679597275688481, + "grad_norm": 1.2868537736190298, + "learning_rate": 8.546931453707285e-06, + "loss": 0.7261, + "step": 2836 + }, + { + "epoch": 1.6801895173230679, + "grad_norm": 0.8991809072064495, + "learning_rate": 8.540603881235993e-06, + "loss": 0.7581, + "step": 2837 + }, + { + "epoch": 1.6807817589576546, + "grad_norm": 0.9833047762425767, + "learning_rate": 8.534276905739783e-06, + "loss": 0.7371, + "step": 2838 + }, + { + "epoch": 1.6813740005922417, + "grad_norm": 1.3090020706900172, + "learning_rate": 8.527950529806739e-06, + "loss": 0.7371, + "step": 2839 + }, + { + "epoch": 1.6819662422268284, + "grad_norm": 1.3428401139365922, + "learning_rate": 8.521624756024706e-06, + "loss": 0.7124, + "step": 2840 + }, + { + "epoch": 1.6825584838614156, + "grad_norm": 0.9822516800617229, + "learning_rate": 8.515299586981278e-06, + "loss": 0.7122, + "step": 2841 + }, + { + "epoch": 1.6831507254960023, + "grad_norm": 1.5404692861197815, + "learning_rate": 8.508975025263814e-06, + "loss": 0.7303, + "step": 2842 + }, + { + "epoch": 1.6837429671305892, + "grad_norm": 0.946622538768465, + "learning_rate": 8.502651073459403e-06, + "loss": 0.7374, + "step": 2843 + }, + { + "epoch": 1.6843352087651762, + "grad_norm": 1.1084910982075946, + "learning_rate": 8.496327734154905e-06, + "loss": 0.6946, + "step": 2844 + }, + { + "epoch": 1.684927450399763, + "grad_norm": 0.8603924254623376, + "learning_rate": 8.490005009936918e-06, + "loss": 0.7159, + "step": 2845 + }, + { + "epoch": 1.68551969203435, + "grad_norm": 1.0846672515700364, + "learning_rate": 8.483682903391796e-06, + "loss": 0.7591, + "step": 2846 + }, + { + "epoch": 1.686111933668937, + "grad_norm": 1.1750404325244188, + "learning_rate": 8.477361417105631e-06, + "loss": 0.7344, + "step": 2847 + }, + { + "epoch": 1.686704175303524, + "grad_norm": 1.0501143321182151, + "learning_rate": 8.471040553664269e-06, + "loss": 0.7063, + "step": 2848 + }, + { + "epoch": 1.6872964169381106, + "grad_norm": 1.2763094400193624, + "learning_rate": 8.464720315653298e-06, + "loss": 0.6708, + "step": 2849 + }, + { + "epoch": 1.6878886585726978, + "grad_norm": 1.111124108113765, + "learning_rate": 8.458400705658051e-06, + "loss": 0.7201, + "step": 2850 + }, + { + "epoch": 1.6884809002072845, + "grad_norm": 0.9062869224487132, + "learning_rate": 8.452081726263604e-06, + "loss": 0.7048, + "step": 2851 + }, + { + "epoch": 1.6890731418418716, + "grad_norm": 1.2248497508498524, + "learning_rate": 8.445763380054773e-06, + "loss": 0.7171, + "step": 2852 + }, + { + "epoch": 1.6896653834764583, + "grad_norm": 1.6317101564085743, + "learning_rate": 8.43944566961612e-06, + "loss": 0.6921, + "step": 2853 + }, + { + "epoch": 1.6902576251110453, + "grad_norm": 0.8594789130404841, + "learning_rate": 8.433128597531943e-06, + "loss": 0.7166, + "step": 2854 + }, + { + "epoch": 1.6908498667456322, + "grad_norm": 5.253886253022391, + "learning_rate": 8.426812166386278e-06, + "loss": 0.6991, + "step": 2855 + }, + { + "epoch": 1.6914421083802191, + "grad_norm": 1.0371352611636877, + "learning_rate": 8.420496378762901e-06, + "loss": 0.7239, + "step": 2856 + }, + { + "epoch": 1.692034350014806, + "grad_norm": 0.8825325351256947, + "learning_rate": 8.414181237245324e-06, + "loss": 0.7318, + "step": 2857 + }, + { + "epoch": 1.692626591649393, + "grad_norm": 1.0387049681798395, + "learning_rate": 8.407866744416801e-06, + "loss": 0.7132, + "step": 2858 + }, + { + "epoch": 1.69321883328398, + "grad_norm": 1.0193831157896063, + "learning_rate": 8.401552902860306e-06, + "loss": 0.7193, + "step": 2859 + }, + { + "epoch": 1.6938110749185666, + "grad_norm": 1.2765032613192198, + "learning_rate": 8.395239715158558e-06, + "loss": 0.7425, + "step": 2860 + }, + { + "epoch": 1.6944033165531538, + "grad_norm": 1.0423155817988132, + "learning_rate": 8.388927183894005e-06, + "loss": 0.7557, + "step": 2861 + }, + { + "epoch": 1.6949955581877405, + "grad_norm": 1.2949288338272016, + "learning_rate": 8.382615311648833e-06, + "loss": 0.7196, + "step": 2862 + }, + { + "epoch": 1.6955877998223277, + "grad_norm": 1.203764230621225, + "learning_rate": 8.376304101004946e-06, + "loss": 0.7589, + "step": 2863 + }, + { + "epoch": 1.6961800414569144, + "grad_norm": 1.1439950575179505, + "learning_rate": 8.369993554543987e-06, + "loss": 0.7518, + "step": 2864 + }, + { + "epoch": 1.6967722830915013, + "grad_norm": 1.768457656980784, + "learning_rate": 8.363683674847323e-06, + "loss": 0.7382, + "step": 2865 + }, + { + "epoch": 1.6973645247260882, + "grad_norm": 1.0904038835740486, + "learning_rate": 8.357374464496056e-06, + "loss": 0.7482, + "step": 2866 + }, + { + "epoch": 1.6979567663606752, + "grad_norm": 1.4275207295309171, + "learning_rate": 8.351065926070994e-06, + "loss": 0.7192, + "step": 2867 + }, + { + "epoch": 1.698549007995262, + "grad_norm": 0.8975759750455716, + "learning_rate": 8.344758062152696e-06, + "loss": 0.7208, + "step": 2868 + }, + { + "epoch": 1.699141249629849, + "grad_norm": 0.9962272566707975, + "learning_rate": 8.338450875321428e-06, + "loss": 0.7174, + "step": 2869 + }, + { + "epoch": 1.699733491264436, + "grad_norm": 0.9458627764840206, + "learning_rate": 8.332144368157192e-06, + "loss": 0.7004, + "step": 2870 + }, + { + "epoch": 1.7003257328990227, + "grad_norm": 0.9243786772003465, + "learning_rate": 8.325838543239688e-06, + "loss": 0.7212, + "step": 2871 + }, + { + "epoch": 1.7009179745336098, + "grad_norm": 1.095152314018298, + "learning_rate": 8.319533403148368e-06, + "loss": 0.7413, + "step": 2872 + }, + { + "epoch": 1.7015102161681965, + "grad_norm": 1.179387764922565, + "learning_rate": 8.31322895046238e-06, + "loss": 0.7317, + "step": 2873 + }, + { + "epoch": 1.7021024578027837, + "grad_norm": 1.0825336416478861, + "learning_rate": 8.306925187760608e-06, + "loss": 0.7383, + "step": 2874 + }, + { + "epoch": 1.7026946994373704, + "grad_norm": 1.1435368543161897, + "learning_rate": 8.300622117621634e-06, + "loss": 0.7225, + "step": 2875 + }, + { + "epoch": 1.7032869410719573, + "grad_norm": 1.0508503526183521, + "learning_rate": 8.29431974262378e-06, + "loss": 0.7231, + "step": 2876 + }, + { + "epoch": 1.7038791827065443, + "grad_norm": 2.6026780734301576, + "learning_rate": 8.288018065345063e-06, + "loss": 0.7578, + "step": 2877 + }, + { + "epoch": 1.7044714243411312, + "grad_norm": 1.036737957257567, + "learning_rate": 8.28171708836323e-06, + "loss": 0.727, + "step": 2878 + }, + { + "epoch": 1.7050636659757181, + "grad_norm": 1.2395033785247513, + "learning_rate": 8.275416814255731e-06, + "loss": 0.7323, + "step": 2879 + }, + { + "epoch": 1.705655907610305, + "grad_norm": 5.3070838585377915, + "learning_rate": 8.269117245599729e-06, + "loss": 0.7276, + "step": 2880 + }, + { + "epoch": 1.706248149244892, + "grad_norm": 1.9790319279693616, + "learning_rate": 8.262818384972108e-06, + "loss": 0.7232, + "step": 2881 + }, + { + "epoch": 1.7068403908794787, + "grad_norm": 0.9027953973682145, + "learning_rate": 8.256520234949456e-06, + "loss": 0.7473, + "step": 2882 + }, + { + "epoch": 1.7074326325140659, + "grad_norm": 1.0026460097696173, + "learning_rate": 8.250222798108068e-06, + "loss": 0.6998, + "step": 2883 + }, + { + "epoch": 1.7080248741486526, + "grad_norm": 1.4657670953683113, + "learning_rate": 8.243926077023945e-06, + "loss": 0.663, + "step": 2884 + }, + { + "epoch": 1.7086171157832397, + "grad_norm": 1.0969379355734514, + "learning_rate": 8.23763007427281e-06, + "loss": 0.7078, + "step": 2885 + }, + { + "epoch": 1.7092093574178264, + "grad_norm": 0.967745297644781, + "learning_rate": 8.23133479243008e-06, + "loss": 0.7153, + "step": 2886 + }, + { + "epoch": 1.7098015990524134, + "grad_norm": 1.2060481495888675, + "learning_rate": 8.225040234070873e-06, + "loss": 0.7624, + "step": 2887 + }, + { + "epoch": 1.7103938406870003, + "grad_norm": 2.120123919682014, + "learning_rate": 8.218746401770021e-06, + "loss": 0.7574, + "step": 2888 + }, + { + "epoch": 1.7109860823215872, + "grad_norm": 1.1153560787494186, + "learning_rate": 8.212453298102054e-06, + "loss": 0.7589, + "step": 2889 + }, + { + "epoch": 1.7115783239561742, + "grad_norm": 1.5107438969118132, + "learning_rate": 8.206160925641211e-06, + "loss": 0.7536, + "step": 2890 + }, + { + "epoch": 1.712170565590761, + "grad_norm": 1.054181917285995, + "learning_rate": 8.19986928696142e-06, + "loss": 0.7439, + "step": 2891 + }, + { + "epoch": 1.712762807225348, + "grad_norm": 1.530532649227264, + "learning_rate": 8.193578384636317e-06, + "loss": 0.7304, + "step": 2892 + }, + { + "epoch": 1.7133550488599347, + "grad_norm": 1.4100039607836896, + "learning_rate": 8.187288221239232e-06, + "loss": 0.7115, + "step": 2893 + }, + { + "epoch": 1.7139472904945219, + "grad_norm": 1.3653865280730575, + "learning_rate": 8.180998799343203e-06, + "loss": 0.6963, + "step": 2894 + }, + { + "epoch": 1.7145395321291086, + "grad_norm": 1.3761845598375588, + "learning_rate": 8.17471012152095e-06, + "loss": 0.7766, + "step": 2895 + }, + { + "epoch": 1.7151317737636957, + "grad_norm": 0.9896735131841189, + "learning_rate": 8.168422190344896e-06, + "loss": 0.7104, + "step": 2896 + }, + { + "epoch": 1.7157240153982825, + "grad_norm": 1.8180792395206247, + "learning_rate": 8.162135008387164e-06, + "loss": 0.7355, + "step": 2897 + }, + { + "epoch": 1.7163162570328694, + "grad_norm": 1.2917397756929976, + "learning_rate": 8.155848578219563e-06, + "loss": 0.7736, + "step": 2898 + }, + { + "epoch": 1.7169084986674563, + "grad_norm": 1.2049090200411698, + "learning_rate": 8.14956290241359e-06, + "loss": 0.7332, + "step": 2899 + }, + { + "epoch": 1.7175007403020432, + "grad_norm": 2.413612490345052, + "learning_rate": 8.14327798354045e-06, + "loss": 0.7142, + "step": 2900 + }, + { + "epoch": 1.7180929819366302, + "grad_norm": 1.0571402249668937, + "learning_rate": 8.136993824171019e-06, + "loss": 0.7277, + "step": 2901 + }, + { + "epoch": 1.718685223571217, + "grad_norm": 1.2201789441380906, + "learning_rate": 8.130710426875881e-06, + "loss": 0.7176, + "step": 2902 + }, + { + "epoch": 1.719277465205804, + "grad_norm": 1.041688994211538, + "learning_rate": 8.12442779422529e-06, + "loss": 0.6969, + "step": 2903 + }, + { + "epoch": 1.7198697068403908, + "grad_norm": 1.1679307028654387, + "learning_rate": 8.118145928789198e-06, + "loss": 0.6998, + "step": 2904 + }, + { + "epoch": 1.720461948474978, + "grad_norm": 2.7733038021139875, + "learning_rate": 8.111864833137246e-06, + "loss": 0.6915, + "step": 2905 + }, + { + "epoch": 1.7210541901095646, + "grad_norm": 1.5759312229276083, + "learning_rate": 8.105584509838754e-06, + "loss": 0.7227, + "step": 2906 + }, + { + "epoch": 1.7216464317441518, + "grad_norm": 1.3336882101988723, + "learning_rate": 8.099304961462722e-06, + "loss": 0.741, + "step": 2907 + }, + { + "epoch": 1.7222386733787385, + "grad_norm": 1.2123410041999985, + "learning_rate": 8.093026190577839e-06, + "loss": 0.7105, + "step": 2908 + }, + { + "epoch": 1.7228309150133254, + "grad_norm": 1.0373687336456456, + "learning_rate": 8.086748199752483e-06, + "loss": 0.6718, + "step": 2909 + }, + { + "epoch": 1.7234231566479123, + "grad_norm": 2.863392250283223, + "learning_rate": 8.080470991554703e-06, + "loss": 0.7303, + "step": 2910 + }, + { + "epoch": 1.7240153982824993, + "grad_norm": 29.798561463464726, + "learning_rate": 8.074194568552224e-06, + "loss": 0.7341, + "step": 2911 + }, + { + "epoch": 1.7246076399170862, + "grad_norm": 1.3629065680833, + "learning_rate": 8.067918933312459e-06, + "loss": 0.7307, + "step": 2912 + }, + { + "epoch": 1.725199881551673, + "grad_norm": 1.0966158414516722, + "learning_rate": 8.061644088402499e-06, + "loss": 0.7015, + "step": 2913 + }, + { + "epoch": 1.72579212318626, + "grad_norm": 1.270462849175571, + "learning_rate": 8.055370036389105e-06, + "loss": 0.7317, + "step": 2914 + }, + { + "epoch": 1.7263843648208468, + "grad_norm": 1.1763045807935812, + "learning_rate": 8.04909677983872e-06, + "loss": 0.7091, + "step": 2915 + }, + { + "epoch": 1.726976606455434, + "grad_norm": 3.4592157510743173, + "learning_rate": 8.042824321317453e-06, + "loss": 0.7266, + "step": 2916 + }, + { + "epoch": 1.7275688480900206, + "grad_norm": 1.1717886947291323, + "learning_rate": 8.036552663391099e-06, + "loss": 0.7113, + "step": 2917 + }, + { + "epoch": 1.7281610897246078, + "grad_norm": 4.1475058496080965, + "learning_rate": 8.030281808625114e-06, + "loss": 0.7183, + "step": 2918 + }, + { + "epoch": 1.7287533313591945, + "grad_norm": 1.5657993451062597, + "learning_rate": 8.02401175958463e-06, + "loss": 0.7218, + "step": 2919 + }, + { + "epoch": 1.7293455729937814, + "grad_norm": 1.070325978428891, + "learning_rate": 8.017742518834454e-06, + "loss": 0.723, + "step": 2920 + }, + { + "epoch": 1.7299378146283684, + "grad_norm": 1.1119220102928689, + "learning_rate": 8.011474088939056e-06, + "loss": 0.6992, + "step": 2921 + }, + { + "epoch": 1.7305300562629553, + "grad_norm": 1.3076317154786299, + "learning_rate": 8.005206472462576e-06, + "loss": 0.7237, + "step": 2922 + }, + { + "epoch": 1.7311222978975422, + "grad_norm": 1.2441146503285951, + "learning_rate": 7.998939671968817e-06, + "loss": 0.7263, + "step": 2923 + }, + { + "epoch": 1.731714539532129, + "grad_norm": 1.3530915910623935, + "learning_rate": 7.99267369002126e-06, + "loss": 0.7264, + "step": 2924 + }, + { + "epoch": 1.732306781166716, + "grad_norm": 1.1749639635147615, + "learning_rate": 7.986408529183045e-06, + "loss": 0.7354, + "step": 2925 + }, + { + "epoch": 1.7328990228013028, + "grad_norm": 0.9443164977742067, + "learning_rate": 7.980144192016967e-06, + "loss": 0.7564, + "step": 2926 + }, + { + "epoch": 1.73349126443589, + "grad_norm": 1.2969219620095116, + "learning_rate": 7.973880681085495e-06, + "loss": 0.7822, + "step": 2927 + }, + { + "epoch": 1.7340835060704767, + "grad_norm": 1.3340663571052536, + "learning_rate": 7.967617998950762e-06, + "loss": 0.7495, + "step": 2928 + }, + { + "epoch": 1.7346757477050638, + "grad_norm": 1.2273379932145483, + "learning_rate": 7.961356148174554e-06, + "loss": 0.7072, + "step": 2929 + }, + { + "epoch": 1.7352679893396505, + "grad_norm": 1.3141531019330541, + "learning_rate": 7.955095131318319e-06, + "loss": 0.7258, + "step": 2930 + }, + { + "epoch": 1.7358602309742375, + "grad_norm": 1.203363496166313, + "learning_rate": 7.948834950943165e-06, + "loss": 0.721, + "step": 2931 + }, + { + "epoch": 1.7364524726088244, + "grad_norm": 1.1742529128764918, + "learning_rate": 7.942575609609857e-06, + "loss": 0.7185, + "step": 2932 + }, + { + "epoch": 1.7370447142434113, + "grad_norm": 1.428016405572957, + "learning_rate": 7.936317109878824e-06, + "loss": 0.7647, + "step": 2933 + }, + { + "epoch": 1.7376369558779983, + "grad_norm": 1.2790352849117015, + "learning_rate": 7.930059454310138e-06, + "loss": 0.7581, + "step": 2934 + }, + { + "epoch": 1.738229197512585, + "grad_norm": 0.9761646608961596, + "learning_rate": 7.923802645463532e-06, + "loss": 0.7217, + "step": 2935 + }, + { + "epoch": 1.7388214391471721, + "grad_norm": 1.5751448044306215, + "learning_rate": 7.917546685898393e-06, + "loss": 0.6937, + "step": 2936 + }, + { + "epoch": 1.7394136807817588, + "grad_norm": 1.1640978244444167, + "learning_rate": 7.911291578173767e-06, + "loss": 0.6968, + "step": 2937 + }, + { + "epoch": 1.740005922416346, + "grad_norm": 1.4304427410478942, + "learning_rate": 7.905037324848334e-06, + "loss": 0.7239, + "step": 2938 + }, + { + "epoch": 1.7405981640509327, + "grad_norm": 0.9786975945632999, + "learning_rate": 7.898783928480442e-06, + "loss": 0.744, + "step": 2939 + }, + { + "epoch": 1.7411904056855196, + "grad_norm": 1.206680722391421, + "learning_rate": 7.89253139162808e-06, + "loss": 0.732, + "step": 2940 + }, + { + "epoch": 1.7417826473201066, + "grad_norm": 1.9134032463262014, + "learning_rate": 7.88627971684889e-06, + "loss": 0.7539, + "step": 2941 + }, + { + "epoch": 1.7423748889546935, + "grad_norm": 1.7876317020070605, + "learning_rate": 7.880028906700153e-06, + "loss": 0.7261, + "step": 2942 + }, + { + "epoch": 1.7429671305892804, + "grad_norm": 1.2450546265182965, + "learning_rate": 7.873778963738806e-06, + "loss": 0.692, + "step": 2943 + }, + { + "epoch": 1.7435593722238674, + "grad_norm": 1.273671537217201, + "learning_rate": 7.867529890521424e-06, + "loss": 0.6972, + "step": 2944 + }, + { + "epoch": 1.7441516138584543, + "grad_norm": 0.9478991818498798, + "learning_rate": 7.861281689604237e-06, + "loss": 0.7267, + "step": 2945 + }, + { + "epoch": 1.744743855493041, + "grad_norm": 0.9418695622023932, + "learning_rate": 7.8550343635431e-06, + "loss": 0.7253, + "step": 2946 + }, + { + "epoch": 1.7453360971276282, + "grad_norm": 1.5972561604224107, + "learning_rate": 7.848787914893525e-06, + "loss": 0.719, + "step": 2947 + }, + { + "epoch": 1.7459283387622149, + "grad_norm": 1.6316651832184046, + "learning_rate": 7.842542346210663e-06, + "loss": 0.6981, + "step": 2948 + }, + { + "epoch": 1.746520580396802, + "grad_norm": 1.1436403440246294, + "learning_rate": 7.836297660049303e-06, + "loss": 0.7392, + "step": 2949 + }, + { + "epoch": 1.7471128220313887, + "grad_norm": 1.1760242256445133, + "learning_rate": 7.83005385896387e-06, + "loss": 0.7368, + "step": 2950 + }, + { + "epoch": 1.7477050636659757, + "grad_norm": 1.0420980744058013, + "learning_rate": 7.823810945508427e-06, + "loss": 0.7202, + "step": 2951 + }, + { + "epoch": 1.7482973053005626, + "grad_norm": 1.236600456703946, + "learning_rate": 7.817568922236683e-06, + "loss": 0.7232, + "step": 2952 + }, + { + "epoch": 1.7488895469351495, + "grad_norm": 1.1503525583737613, + "learning_rate": 7.811327791701977e-06, + "loss": 0.7134, + "step": 2953 + }, + { + "epoch": 1.7494817885697365, + "grad_norm": 2.7772710614120597, + "learning_rate": 7.805087556457275e-06, + "loss": 0.6838, + "step": 2954 + }, + { + "epoch": 1.7500740302043234, + "grad_norm": 1.5536873224508294, + "learning_rate": 7.798848219055189e-06, + "loss": 0.7342, + "step": 2955 + }, + { + "epoch": 1.7506662718389103, + "grad_norm": 1.3445549621095612, + "learning_rate": 7.792609782047958e-06, + "loss": 0.7372, + "step": 2956 + }, + { + "epoch": 1.751258513473497, + "grad_norm": 1.0834091497432776, + "learning_rate": 7.786372247987454e-06, + "loss": 0.6987, + "step": 2957 + }, + { + "epoch": 1.7518507551080842, + "grad_norm": 3.789898070049763, + "learning_rate": 7.78013561942518e-06, + "loss": 0.7085, + "step": 2958 + }, + { + "epoch": 1.752442996742671, + "grad_norm": 1.2154346445934634, + "learning_rate": 7.773899898912266e-06, + "loss": 0.7192, + "step": 2959 + }, + { + "epoch": 1.753035238377258, + "grad_norm": 1.3478249479015387, + "learning_rate": 7.76766508899947e-06, + "loss": 0.7343, + "step": 2960 + }, + { + "epoch": 1.7536274800118448, + "grad_norm": 1.4976847760098018, + "learning_rate": 7.761431192237192e-06, + "loss": 0.7646, + "step": 2961 + }, + { + "epoch": 1.7542197216464317, + "grad_norm": 1.81752309263868, + "learning_rate": 7.755198211175428e-06, + "loss": 0.7278, + "step": 2962 + }, + { + "epoch": 1.7548119632810186, + "grad_norm": 1.1597990895135137, + "learning_rate": 7.74896614836383e-06, + "loss": 0.7343, + "step": 2963 + }, + { + "epoch": 1.7554042049156056, + "grad_norm": 1.5225622446141434, + "learning_rate": 7.742735006351656e-06, + "loss": 0.7068, + "step": 2964 + }, + { + "epoch": 1.7559964465501925, + "grad_norm": 1.1131198166318628, + "learning_rate": 7.736504787687804e-06, + "loss": 0.6979, + "step": 2965 + }, + { + "epoch": 1.7565886881847794, + "grad_norm": 1.236323876399398, + "learning_rate": 7.73027549492077e-06, + "loss": 0.7224, + "step": 2966 + }, + { + "epoch": 1.7571809298193664, + "grad_norm": 1.2400368319010493, + "learning_rate": 7.724047130598692e-06, + "loss": 0.6906, + "step": 2967 + }, + { + "epoch": 1.757773171453953, + "grad_norm": 1.28829954217693, + "learning_rate": 7.717819697269322e-06, + "loss": 0.6975, + "step": 2968 + }, + { + "epoch": 1.7583654130885402, + "grad_norm": 0.9178925212400618, + "learning_rate": 7.711593197480031e-06, + "loss": 0.7548, + "step": 2969 + }, + { + "epoch": 1.758957654723127, + "grad_norm": 1.3227709395440883, + "learning_rate": 7.7053676337778e-06, + "loss": 0.7114, + "step": 2970 + }, + { + "epoch": 1.759549896357714, + "grad_norm": 0.9702755831253254, + "learning_rate": 7.699143008709245e-06, + "loss": 0.7199, + "step": 2971 + }, + { + "epoch": 1.7601421379923008, + "grad_norm": 0.9961523350834506, + "learning_rate": 7.69291932482058e-06, + "loss": 0.7949, + "step": 2972 + }, + { + "epoch": 1.7607343796268877, + "grad_norm": 1.2207456078535526, + "learning_rate": 7.686696584657649e-06, + "loss": 0.743, + "step": 2973 + }, + { + "epoch": 1.7613266212614747, + "grad_norm": 1.3380369368359777, + "learning_rate": 7.680474790765895e-06, + "loss": 0.6948, + "step": 2974 + }, + { + "epoch": 1.7619188628960616, + "grad_norm": 0.8789763724986712, + "learning_rate": 7.674253945690383e-06, + "loss": 0.6813, + "step": 2975 + }, + { + "epoch": 1.7625111045306485, + "grad_norm": 1.0544272650985724, + "learning_rate": 7.668034051975793e-06, + "loss": 0.6844, + "step": 2976 + }, + { + "epoch": 1.7631033461652355, + "grad_norm": 1.257614243686341, + "learning_rate": 7.661815112166408e-06, + "loss": 0.7071, + "step": 2977 + }, + { + "epoch": 1.7636955877998224, + "grad_norm": 1.0368071883700545, + "learning_rate": 7.655597128806125e-06, + "loss": 0.7018, + "step": 2978 + }, + { + "epoch": 1.764287829434409, + "grad_norm": 1.5446506502619204, + "learning_rate": 7.649380104438446e-06, + "loss": 0.7544, + "step": 2979 + }, + { + "epoch": 1.7648800710689962, + "grad_norm": 0.990885528622176, + "learning_rate": 7.643164041606489e-06, + "loss": 0.7179, + "step": 2980 + }, + { + "epoch": 1.765472312703583, + "grad_norm": 0.9953972053282248, + "learning_rate": 7.63694894285297e-06, + "loss": 0.7065, + "step": 2981 + }, + { + "epoch": 1.7660645543381701, + "grad_norm": 6.360092353723965, + "learning_rate": 7.630734810720212e-06, + "loss": 0.708, + "step": 2982 + }, + { + "epoch": 1.7666567959727568, + "grad_norm": 2.422095125571542, + "learning_rate": 7.624521647750149e-06, + "loss": 0.7443, + "step": 2983 + }, + { + "epoch": 1.7672490376073438, + "grad_norm": 1.0870002901322164, + "learning_rate": 7.618309456484309e-06, + "loss": 0.7297, + "step": 2984 + }, + { + "epoch": 1.7678412792419307, + "grad_norm": 2.3370356729061803, + "learning_rate": 7.612098239463833e-06, + "loss": 0.718, + "step": 2985 + }, + { + "epoch": 1.7684335208765176, + "grad_norm": 1.2080348447951503, + "learning_rate": 7.605887999229454e-06, + "loss": 0.72, + "step": 2986 + }, + { + "epoch": 1.7690257625111045, + "grad_norm": 1.0232344898785648, + "learning_rate": 7.599678738321512e-06, + "loss": 0.7236, + "step": 2987 + }, + { + "epoch": 1.7696180041456915, + "grad_norm": 1.347583694196258, + "learning_rate": 7.593470459279939e-06, + "loss": 0.7651, + "step": 2988 + }, + { + "epoch": 1.7702102457802784, + "grad_norm": 1.4355786367789864, + "learning_rate": 7.58726316464428e-06, + "loss": 0.7125, + "step": 2989 + }, + { + "epoch": 1.7708024874148651, + "grad_norm": 1.0074909388188276, + "learning_rate": 7.581056856953656e-06, + "loss": 0.7373, + "step": 2990 + }, + { + "epoch": 1.7713947290494523, + "grad_norm": 1.06347341429192, + "learning_rate": 7.574851538746802e-06, + "loss": 0.7273, + "step": 2991 + }, + { + "epoch": 1.771986970684039, + "grad_norm": 1.4128916933234186, + "learning_rate": 7.568647212562043e-06, + "loss": 0.7169, + "step": 2992 + }, + { + "epoch": 1.7725792123186261, + "grad_norm": 0.9450378936333973, + "learning_rate": 7.562443880937297e-06, + "loss": 0.7388, + "step": 2993 + }, + { + "epoch": 1.7731714539532129, + "grad_norm": 0.9163136638470174, + "learning_rate": 7.55624154641007e-06, + "loss": 0.7219, + "step": 2994 + }, + { + "epoch": 1.7737636955877998, + "grad_norm": 1.324578116509981, + "learning_rate": 7.550040211517472e-06, + "loss": 0.7602, + "step": 2995 + }, + { + "epoch": 1.7743559372223867, + "grad_norm": 1.1787072674151278, + "learning_rate": 7.543839878796195e-06, + "loss": 0.7183, + "step": 2996 + }, + { + "epoch": 1.7749481788569736, + "grad_norm": 3.8914286195953993, + "learning_rate": 7.537640550782527e-06, + "loss": 0.7204, + "step": 2997 + }, + { + "epoch": 1.7755404204915606, + "grad_norm": 1.0219901640955456, + "learning_rate": 7.531442230012336e-06, + "loss": 0.7438, + "step": 2998 + }, + { + "epoch": 1.7761326621261475, + "grad_norm": 1.2855857922477254, + "learning_rate": 7.525244919021084e-06, + "loss": 0.738, + "step": 2999 + }, + { + "epoch": 1.7767249037607344, + "grad_norm": 2.344182476915904, + "learning_rate": 7.519048620343825e-06, + "loss": 0.7248, + "step": 3000 + }, + { + "epoch": 1.7773171453953212, + "grad_norm": 1.1756563315618664, + "learning_rate": 7.512853336515193e-06, + "loss": 0.7267, + "step": 3001 + }, + { + "epoch": 1.7779093870299083, + "grad_norm": 0.9661265454991339, + "learning_rate": 7.506659070069404e-06, + "loss": 0.6421, + "step": 3002 + }, + { + "epoch": 1.778501628664495, + "grad_norm": 2.2188327107977934, + "learning_rate": 7.5004658235402594e-06, + "loss": 0.7092, + "step": 3003 + }, + { + "epoch": 1.7790938702990822, + "grad_norm": 2.792499182936962, + "learning_rate": 7.494273599461153e-06, + "loss": 0.7787, + "step": 3004 + }, + { + "epoch": 1.7796861119336689, + "grad_norm": 1.4377298749803848, + "learning_rate": 7.4880824003650475e-06, + "loss": 0.7323, + "step": 3005 + }, + { + "epoch": 1.7802783535682558, + "grad_norm": 1.1084690396732224, + "learning_rate": 7.481892228784491e-06, + "loss": 0.728, + "step": 3006 + }, + { + "epoch": 1.7808705952028427, + "grad_norm": 1.375063278971446, + "learning_rate": 7.475703087251611e-06, + "loss": 0.7102, + "step": 3007 + }, + { + "epoch": 1.7814628368374297, + "grad_norm": 1.251032911053362, + "learning_rate": 7.469514978298119e-06, + "loss": 0.7353, + "step": 3008 + }, + { + "epoch": 1.7820550784720166, + "grad_norm": 0.8237693730719295, + "learning_rate": 7.463327904455299e-06, + "loss": 0.6709, + "step": 3009 + }, + { + "epoch": 1.7826473201066035, + "grad_norm": 1.3285542970533006, + "learning_rate": 7.457141868254007e-06, + "loss": 0.7046, + "step": 3010 + }, + { + "epoch": 1.7832395617411905, + "grad_norm": 2.4110688518280003, + "learning_rate": 7.450956872224684e-06, + "loss": 0.6963, + "step": 3011 + }, + { + "epoch": 1.7838318033757772, + "grad_norm": 1.0311760681825095, + "learning_rate": 7.444772918897336e-06, + "loss": 0.7458, + "step": 3012 + }, + { + "epoch": 1.7844240450103643, + "grad_norm": 3.042245299524146, + "learning_rate": 7.438590010801558e-06, + "loss": 0.7007, + "step": 3013 + }, + { + "epoch": 1.785016286644951, + "grad_norm": 1.1638757581822774, + "learning_rate": 7.432408150466497e-06, + "loss": 0.727, + "step": 3014 + }, + { + "epoch": 1.7856085282795382, + "grad_norm": 1.8489857858233996, + "learning_rate": 7.426227340420886e-06, + "loss": 0.7187, + "step": 3015 + }, + { + "epoch": 1.786200769914125, + "grad_norm": 1.8469830538002985, + "learning_rate": 7.42004758319302e-06, + "loss": 0.7172, + "step": 3016 + }, + { + "epoch": 1.7867930115487118, + "grad_norm": 1.411130583258253, + "learning_rate": 7.413868881310778e-06, + "loss": 0.7044, + "step": 3017 + }, + { + "epoch": 1.7873852531832988, + "grad_norm": 1.2920931978075625, + "learning_rate": 7.40769123730158e-06, + "loss": 0.7222, + "step": 3018 + }, + { + "epoch": 1.7879774948178857, + "grad_norm": 1.3026695316857428, + "learning_rate": 7.401514653692442e-06, + "loss": 0.7283, + "step": 3019 + }, + { + "epoch": 1.7885697364524726, + "grad_norm": 1.002137271655342, + "learning_rate": 7.395339133009931e-06, + "loss": 0.7016, + "step": 3020 + }, + { + "epoch": 1.7891619780870596, + "grad_norm": 1.755038606150744, + "learning_rate": 7.3891646777801826e-06, + "loss": 0.7114, + "step": 3021 + }, + { + "epoch": 1.7897542197216465, + "grad_norm": 1.0069551996461956, + "learning_rate": 7.382991290528892e-06, + "loss": 0.7261, + "step": 3022 + }, + { + "epoch": 1.7903464613562332, + "grad_norm": 1.144265082942235, + "learning_rate": 7.376818973781328e-06, + "loss": 0.6922, + "step": 3023 + }, + { + "epoch": 1.7909387029908204, + "grad_norm": 0.9697054263146545, + "learning_rate": 7.370647730062311e-06, + "loss": 0.7872, + "step": 3024 + }, + { + "epoch": 1.791530944625407, + "grad_norm": 1.0689276075884193, + "learning_rate": 7.364477561896231e-06, + "loss": 0.6867, + "step": 3025 + }, + { + "epoch": 1.7921231862599942, + "grad_norm": 0.9895555328791956, + "learning_rate": 7.358308471807028e-06, + "loss": 0.7047, + "step": 3026 + }, + { + "epoch": 1.792715427894581, + "grad_norm": 1.426959406213989, + "learning_rate": 7.3521404623182065e-06, + "loss": 0.6768, + "step": 3027 + }, + { + "epoch": 1.7933076695291679, + "grad_norm": 1.6038922032905296, + "learning_rate": 7.3459735359528366e-06, + "loss": 0.7169, + "step": 3028 + }, + { + "epoch": 1.7938999111637548, + "grad_norm": 1.2100286028193288, + "learning_rate": 7.339807695233534e-06, + "loss": 0.732, + "step": 3029 + }, + { + "epoch": 1.7944921527983417, + "grad_norm": 2.091644826171607, + "learning_rate": 7.333642942682473e-06, + "loss": 0.7597, + "step": 3030 + }, + { + "epoch": 1.7950843944329287, + "grad_norm": 0.9734538843147006, + "learning_rate": 7.327479280821381e-06, + "loss": 0.6885, + "step": 3031 + }, + { + "epoch": 1.7956766360675156, + "grad_norm": 0.9935775103575984, + "learning_rate": 7.3213167121715514e-06, + "loss": 0.7143, + "step": 3032 + }, + { + "epoch": 1.7962688777021025, + "grad_norm": 1.2604409769783473, + "learning_rate": 7.315155239253815e-06, + "loss": 0.7302, + "step": 3033 + }, + { + "epoch": 1.7968611193366892, + "grad_norm": 1.1543012632612109, + "learning_rate": 7.308994864588562e-06, + "loss": 0.7388, + "step": 3034 + }, + { + "epoch": 1.7974533609712764, + "grad_norm": 1.242912338801917, + "learning_rate": 7.302835590695731e-06, + "loss": 0.732, + "step": 3035 + }, + { + "epoch": 1.798045602605863, + "grad_norm": 0.9047047765521495, + "learning_rate": 7.296677420094811e-06, + "loss": 0.6763, + "step": 3036 + }, + { + "epoch": 1.7986378442404503, + "grad_norm": 1.0900389528678076, + "learning_rate": 7.290520355304844e-06, + "loss": 0.7187, + "step": 3037 + }, + { + "epoch": 1.799230085875037, + "grad_norm": 1.1766868265659325, + "learning_rate": 7.284364398844412e-06, + "loss": 0.7759, + "step": 3038 + }, + { + "epoch": 1.799822327509624, + "grad_norm": 0.969905582098005, + "learning_rate": 7.2782095532316486e-06, + "loss": 0.6934, + "step": 3039 + }, + { + "epoch": 1.8004145691442108, + "grad_norm": 1.0483421180595986, + "learning_rate": 7.27205582098423e-06, + "loss": 0.6879, + "step": 3040 + }, + { + "epoch": 1.8010068107787978, + "grad_norm": 1.4279823666510263, + "learning_rate": 7.265903204619386e-06, + "loss": 0.6903, + "step": 3041 + }, + { + "epoch": 1.8015990524133847, + "grad_norm": 1.036318973747682, + "learning_rate": 7.25975170665387e-06, + "loss": 0.7208, + "step": 3042 + }, + { + "epoch": 1.8021912940479716, + "grad_norm": 1.1321513164832115, + "learning_rate": 7.253601329604001e-06, + "loss": 0.7162, + "step": 3043 + }, + { + "epoch": 1.8027835356825586, + "grad_norm": 7.227215013383004, + "learning_rate": 7.247452075985622e-06, + "loss": 0.7006, + "step": 3044 + }, + { + "epoch": 1.8033757773171453, + "grad_norm": 0.9690712025929449, + "learning_rate": 7.241303948314135e-06, + "loss": 0.7178, + "step": 3045 + }, + { + "epoch": 1.8039680189517324, + "grad_norm": 2.051690522200872, + "learning_rate": 7.235156949104455e-06, + "loss": 0.6948, + "step": 3046 + }, + { + "epoch": 1.8045602605863191, + "grad_norm": 0.975264804113868, + "learning_rate": 7.22901108087106e-06, + "loss": 0.7484, + "step": 3047 + }, + { + "epoch": 1.8051525022209063, + "grad_norm": 1.286263931105427, + "learning_rate": 7.222866346127952e-06, + "loss": 0.7124, + "step": 3048 + }, + { + "epoch": 1.805744743855493, + "grad_norm": 1.5309931798364385, + "learning_rate": 7.216722747388678e-06, + "loss": 0.7074, + "step": 3049 + }, + { + "epoch": 1.80633698549008, + "grad_norm": 1.7076934436600835, + "learning_rate": 7.210580287166307e-06, + "loss": 0.7149, + "step": 3050 + }, + { + "epoch": 1.8069292271246669, + "grad_norm": 1.1300533995630113, + "learning_rate": 7.2044389679734564e-06, + "loss": 0.7234, + "step": 3051 + }, + { + "epoch": 1.8075214687592538, + "grad_norm": 1.0540182120458212, + "learning_rate": 7.198298792322271e-06, + "loss": 0.7289, + "step": 3052 + }, + { + "epoch": 1.8081137103938407, + "grad_norm": 1.4987232839307902, + "learning_rate": 7.192159762724427e-06, + "loss": 0.7248, + "step": 3053 + }, + { + "epoch": 1.8087059520284274, + "grad_norm": 1.0497678988574621, + "learning_rate": 7.186021881691132e-06, + "loss": 0.6714, + "step": 3054 + }, + { + "epoch": 1.8092981936630146, + "grad_norm": 1.0903848686159592, + "learning_rate": 7.179885151733124e-06, + "loss": 0.6885, + "step": 3055 + }, + { + "epoch": 1.8098904352976013, + "grad_norm": 1.0133138934590773, + "learning_rate": 7.173749575360671e-06, + "loss": 0.7397, + "step": 3056 + }, + { + "epoch": 1.8104826769321885, + "grad_norm": 1.2805524494775062, + "learning_rate": 7.167615155083574e-06, + "loss": 0.6995, + "step": 3057 + }, + { + "epoch": 1.8110749185667752, + "grad_norm": 1.353348949222249, + "learning_rate": 7.1614818934111475e-06, + "loss": 0.704, + "step": 3058 + }, + { + "epoch": 1.8116671602013623, + "grad_norm": 1.7884004913807032, + "learning_rate": 7.155349792852242e-06, + "loss": 0.7066, + "step": 3059 + }, + { + "epoch": 1.812259401835949, + "grad_norm": 1.1372875659844779, + "learning_rate": 7.1492188559152364e-06, + "loss": 0.7382, + "step": 3060 + }, + { + "epoch": 1.812851643470536, + "grad_norm": 7.427167503376271, + "learning_rate": 7.143089085108028e-06, + "loss": 0.7224, + "step": 3061 + }, + { + "epoch": 1.8134438851051229, + "grad_norm": 1.602444693750489, + "learning_rate": 7.136960482938035e-06, + "loss": 0.7293, + "step": 3062 + }, + { + "epoch": 1.8140361267397098, + "grad_norm": 4.355339095528307, + "learning_rate": 7.130833051912198e-06, + "loss": 0.7455, + "step": 3063 + }, + { + "epoch": 1.8146283683742968, + "grad_norm": 1.2740183492816513, + "learning_rate": 7.124706794536984e-06, + "loss": 0.7043, + "step": 3064 + }, + { + "epoch": 1.8152206100088835, + "grad_norm": 1.0624879073095552, + "learning_rate": 7.11858171331838e-06, + "loss": 0.7437, + "step": 3065 + }, + { + "epoch": 1.8158128516434706, + "grad_norm": 1.1249819254474553, + "learning_rate": 7.112457810761883e-06, + "loss": 0.7151, + "step": 3066 + }, + { + "epoch": 1.8164050932780573, + "grad_norm": 2.4801786068161737, + "learning_rate": 7.106335089372517e-06, + "loss": 0.7494, + "step": 3067 + }, + { + "epoch": 1.8169973349126445, + "grad_norm": 1.7960579111312531, + "learning_rate": 7.100213551654816e-06, + "loss": 0.6956, + "step": 3068 + }, + { + "epoch": 1.8175895765472312, + "grad_norm": 1.206786649404624, + "learning_rate": 7.0940932001128395e-06, + "loss": 0.7599, + "step": 3069 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.3862079077433727, + "learning_rate": 7.087974037250146e-06, + "loss": 0.7371, + "step": 3070 + }, + { + "epoch": 1.818774059816405, + "grad_norm": 1.9789719604629894, + "learning_rate": 7.0818560655698246e-06, + "loss": 0.7472, + "step": 3071 + }, + { + "epoch": 1.819366301450992, + "grad_norm": 1.4121044579900446, + "learning_rate": 7.075739287574467e-06, + "loss": 0.7147, + "step": 3072 + }, + { + "epoch": 1.819958543085579, + "grad_norm": 2.1510778746002455, + "learning_rate": 7.069623705766182e-06, + "loss": 0.7472, + "step": 3073 + }, + { + "epoch": 1.8205507847201658, + "grad_norm": 1.0077829214521765, + "learning_rate": 7.063509322646581e-06, + "loss": 0.6905, + "step": 3074 + }, + { + "epoch": 1.8211430263547528, + "grad_norm": 1.3469407014130284, + "learning_rate": 7.057396140716796e-06, + "loss": 0.7024, + "step": 3075 + }, + { + "epoch": 1.8217352679893395, + "grad_norm": 1.876760028468839, + "learning_rate": 7.051284162477459e-06, + "loss": 0.7303, + "step": 3076 + }, + { + "epoch": 1.8223275096239266, + "grad_norm": 1.049747547083592, + "learning_rate": 7.0451733904287166e-06, + "loss": 0.7227, + "step": 3077 + }, + { + "epoch": 1.8229197512585134, + "grad_norm": 1.0058819206164897, + "learning_rate": 7.039063827070214e-06, + "loss": 0.713, + "step": 3078 + }, + { + "epoch": 1.8235119928931005, + "grad_norm": 3.6076196015946236, + "learning_rate": 7.0329554749011045e-06, + "loss": 0.7295, + "step": 3079 + }, + { + "epoch": 1.8241042345276872, + "grad_norm": 1.0057618249589486, + "learning_rate": 7.026848336420053e-06, + "loss": 0.7129, + "step": 3080 + }, + { + "epoch": 1.8246964761622744, + "grad_norm": 1.7386425971569412, + "learning_rate": 7.020742414125223e-06, + "loss": 0.6903, + "step": 3081 + }, + { + "epoch": 1.825288717796861, + "grad_norm": 2.473617947321098, + "learning_rate": 7.014637710514274e-06, + "loss": 0.7285, + "step": 3082 + }, + { + "epoch": 1.825880959431448, + "grad_norm": 0.9327751534341746, + "learning_rate": 7.008534228084376e-06, + "loss": 0.7183, + "step": 3083 + }, + { + "epoch": 1.826473201066035, + "grad_norm": 1.0584729282570624, + "learning_rate": 7.002431969332197e-06, + "loss": 0.7485, + "step": 3084 + }, + { + "epoch": 1.8270654427006219, + "grad_norm": 0.9797461090483691, + "learning_rate": 6.996330936753907e-06, + "loss": 0.7756, + "step": 3085 + }, + { + "epoch": 1.8276576843352088, + "grad_norm": 1.1221799073488412, + "learning_rate": 6.990231132845169e-06, + "loss": 0.7499, + "step": 3086 + }, + { + "epoch": 1.8282499259697955, + "grad_norm": 0.9306758935450702, + "learning_rate": 6.984132560101143e-06, + "loss": 0.7304, + "step": 3087 + }, + { + "epoch": 1.8288421676043827, + "grad_norm": 1.0887511525648792, + "learning_rate": 6.978035221016487e-06, + "loss": 0.7267, + "step": 3088 + }, + { + "epoch": 1.8294344092389694, + "grad_norm": 1.1273648529558058, + "learning_rate": 6.971939118085365e-06, + "loss": 0.6752, + "step": 3089 + }, + { + "epoch": 1.8300266508735565, + "grad_norm": 1.0281268202300238, + "learning_rate": 6.965844253801416e-06, + "loss": 0.723, + "step": 3090 + }, + { + "epoch": 1.8306188925081432, + "grad_norm": 1.0970082995953543, + "learning_rate": 6.959750630657787e-06, + "loss": 0.7901, + "step": 3091 + }, + { + "epoch": 1.8312111341427302, + "grad_norm": 1.0998649102940627, + "learning_rate": 6.953658251147109e-06, + "loss": 0.7464, + "step": 3092 + }, + { + "epoch": 1.8318033757773171, + "grad_norm": 1.6332086008816058, + "learning_rate": 6.947567117761517e-06, + "loss": 0.7037, + "step": 3093 + }, + { + "epoch": 1.832395617411904, + "grad_norm": 2.4583445925569114, + "learning_rate": 6.941477232992614e-06, + "loss": 0.698, + "step": 3094 + }, + { + "epoch": 1.832987859046491, + "grad_norm": 1.1349251674267553, + "learning_rate": 6.935388599331514e-06, + "loss": 0.6615, + "step": 3095 + }, + { + "epoch": 1.833580100681078, + "grad_norm": 1.0921356725030107, + "learning_rate": 6.929301219268806e-06, + "loss": 0.7015, + "step": 3096 + }, + { + "epoch": 1.8341723423156648, + "grad_norm": 1.534601523832624, + "learning_rate": 6.92321509529458e-06, + "loss": 0.6909, + "step": 3097 + }, + { + "epoch": 1.8347645839502515, + "grad_norm": 1.2237585847130814, + "learning_rate": 6.917130229898387e-06, + "loss": 0.6804, + "step": 3098 + }, + { + "epoch": 1.8353568255848387, + "grad_norm": 1.6141007678469457, + "learning_rate": 6.911046625569293e-06, + "loss": 0.7228, + "step": 3099 + }, + { + "epoch": 1.8359490672194254, + "grad_norm": 0.9997505945449734, + "learning_rate": 6.90496428479583e-06, + "loss": 0.7101, + "step": 3100 + }, + { + "epoch": 1.8365413088540126, + "grad_norm": 1.0339643235502096, + "learning_rate": 6.898883210066018e-06, + "loss": 0.7141, + "step": 3101 + }, + { + "epoch": 1.8371335504885993, + "grad_norm": 0.9214598557673855, + "learning_rate": 6.892803403867352e-06, + "loss": 0.7178, + "step": 3102 + }, + { + "epoch": 1.8377257921231862, + "grad_norm": 1.2899691433939153, + "learning_rate": 6.886724868686823e-06, + "loss": 0.7322, + "step": 3103 + }, + { + "epoch": 1.8383180337577731, + "grad_norm": 1.0878616869188478, + "learning_rate": 6.8806476070108905e-06, + "loss": 0.7229, + "step": 3104 + }, + { + "epoch": 1.83891027539236, + "grad_norm": 2.1890307846418455, + "learning_rate": 6.874571621325498e-06, + "loss": 0.7179, + "step": 3105 + }, + { + "epoch": 1.839502517026947, + "grad_norm": 1.2042122977243805, + "learning_rate": 6.868496914116063e-06, + "loss": 0.7208, + "step": 3106 + }, + { + "epoch": 1.840094758661534, + "grad_norm": 1.1067009260330778, + "learning_rate": 6.86242348786748e-06, + "loss": 0.7104, + "step": 3107 + }, + { + "epoch": 1.8406870002961209, + "grad_norm": 1.977612629506757, + "learning_rate": 6.856351345064127e-06, + "loss": 0.7083, + "step": 3108 + }, + { + "epoch": 1.8412792419307076, + "grad_norm": 1.183996644590801, + "learning_rate": 6.850280488189851e-06, + "loss": 0.7136, + "step": 3109 + }, + { + "epoch": 1.8418714835652947, + "grad_norm": 1.1336099389139767, + "learning_rate": 6.844210919727971e-06, + "loss": 0.7121, + "step": 3110 + }, + { + "epoch": 1.8424637251998814, + "grad_norm": 0.9915840493395013, + "learning_rate": 6.838142642161283e-06, + "loss": 0.7146, + "step": 3111 + }, + { + "epoch": 1.8430559668344686, + "grad_norm": 1.0672864776594517, + "learning_rate": 6.8320756579720545e-06, + "loss": 0.7185, + "step": 3112 + }, + { + "epoch": 1.8436482084690553, + "grad_norm": 1.2611537685575909, + "learning_rate": 6.826009969642027e-06, + "loss": 0.7399, + "step": 3113 + }, + { + "epoch": 1.8442404501036422, + "grad_norm": 3.728635542061314, + "learning_rate": 6.819945579652401e-06, + "loss": 0.7557, + "step": 3114 + }, + { + "epoch": 1.8448326917382292, + "grad_norm": 1.3284266412732086, + "learning_rate": 6.813882490483854e-06, + "loss": 0.6961, + "step": 3115 + }, + { + "epoch": 1.845424933372816, + "grad_norm": 19.468979938411216, + "learning_rate": 6.807820704616532e-06, + "loss": 0.7411, + "step": 3116 + }, + { + "epoch": 1.846017175007403, + "grad_norm": 0.8689154585001878, + "learning_rate": 6.801760224530052e-06, + "loss": 0.7244, + "step": 3117 + }, + { + "epoch": 1.84660941664199, + "grad_norm": 1.068439800318271, + "learning_rate": 6.795701052703482e-06, + "loss": 0.7105, + "step": 3118 + }, + { + "epoch": 1.847201658276577, + "grad_norm": 0.9526703958307156, + "learning_rate": 6.7896431916153684e-06, + "loss": 0.7437, + "step": 3119 + }, + { + "epoch": 1.8477938999111636, + "grad_norm": 1.0630490772363779, + "learning_rate": 6.783586643743714e-06, + "loss": 0.7221, + "step": 3120 + }, + { + "epoch": 1.8483861415457508, + "grad_norm": 3.5896361583850567, + "learning_rate": 6.777531411565996e-06, + "loss": 0.7202, + "step": 3121 + }, + { + "epoch": 1.8489783831803375, + "grad_norm": 1.7343409372636471, + "learning_rate": 6.7714774975591335e-06, + "loss": 0.7145, + "step": 3122 + }, + { + "epoch": 1.8495706248149246, + "grad_norm": 1.0396615169001493, + "learning_rate": 6.7654249041995256e-06, + "loss": 0.7005, + "step": 3123 + }, + { + "epoch": 1.8501628664495113, + "grad_norm": 1.642370057671462, + "learning_rate": 6.75937363396302e-06, + "loss": 0.7372, + "step": 3124 + }, + { + "epoch": 1.8507551080840983, + "grad_norm": 1.1074855634634668, + "learning_rate": 6.753323689324931e-06, + "loss": 0.7431, + "step": 3125 + }, + { + "epoch": 1.8513473497186852, + "grad_norm": 0.9324152655129814, + "learning_rate": 6.7472750727600155e-06, + "loss": 0.7135, + "step": 3126 + }, + { + "epoch": 1.8519395913532721, + "grad_norm": 0.8914241503209785, + "learning_rate": 6.741227786742509e-06, + "loss": 0.7181, + "step": 3127 + }, + { + "epoch": 1.852531832987859, + "grad_norm": 1.1891158801295643, + "learning_rate": 6.735181833746087e-06, + "loss": 0.6945, + "step": 3128 + }, + { + "epoch": 1.853124074622446, + "grad_norm": 1.0727481880906515, + "learning_rate": 6.729137216243886e-06, + "loss": 0.6876, + "step": 3129 + }, + { + "epoch": 1.853716316257033, + "grad_norm": 1.0797517345200798, + "learning_rate": 6.7230939367084915e-06, + "loss": 0.7165, + "step": 3130 + }, + { + "epoch": 1.8543085578916196, + "grad_norm": 1.4893641209790685, + "learning_rate": 6.717051997611944e-06, + "loss": 0.7433, + "step": 3131 + }, + { + "epoch": 1.8549007995262068, + "grad_norm": 1.0413984547576909, + "learning_rate": 6.711011401425741e-06, + "loss": 0.6766, + "step": 3132 + }, + { + "epoch": 1.8554930411607935, + "grad_norm": 1.1532788341257012, + "learning_rate": 6.704972150620825e-06, + "loss": 0.6584, + "step": 3133 + }, + { + "epoch": 1.8560852827953807, + "grad_norm": 1.680364023670173, + "learning_rate": 6.698934247667587e-06, + "loss": 0.7095, + "step": 3134 + }, + { + "epoch": 1.8566775244299674, + "grad_norm": 1.5274601112160506, + "learning_rate": 6.692897695035866e-06, + "loss": 0.6719, + "step": 3135 + }, + { + "epoch": 1.8572697660645543, + "grad_norm": 1.4344751525084347, + "learning_rate": 6.686862495194958e-06, + "loss": 0.7164, + "step": 3136 + }, + { + "epoch": 1.8578620076991412, + "grad_norm": 1.305338228972161, + "learning_rate": 6.6808286506135975e-06, + "loss": 0.7633, + "step": 3137 + }, + { + "epoch": 1.8584542493337282, + "grad_norm": 1.1939810781229667, + "learning_rate": 6.6747961637599645e-06, + "loss": 0.7098, + "step": 3138 + }, + { + "epoch": 1.859046490968315, + "grad_norm": 1.454501033733436, + "learning_rate": 6.668765037101682e-06, + "loss": 0.7129, + "step": 3139 + }, + { + "epoch": 1.859638732602902, + "grad_norm": 0.9270487154278511, + "learning_rate": 6.662735273105827e-06, + "loss": 0.7298, + "step": 3140 + }, + { + "epoch": 1.860230974237489, + "grad_norm": 0.887328217763018, + "learning_rate": 6.656706874238909e-06, + "loss": 0.6998, + "step": 3141 + }, + { + "epoch": 1.8608232158720757, + "grad_norm": 0.9240842979460729, + "learning_rate": 6.650679842966881e-06, + "loss": 0.7242, + "step": 3142 + }, + { + "epoch": 1.8614154575066628, + "grad_norm": 1.4560811731823002, + "learning_rate": 6.644654181755139e-06, + "loss": 0.7467, + "step": 3143 + }, + { + "epoch": 1.8620076991412495, + "grad_norm": 1.8642002876762704, + "learning_rate": 6.638629893068516e-06, + "loss": 0.7094, + "step": 3144 + }, + { + "epoch": 1.8625999407758367, + "grad_norm": 1.3637576327896082, + "learning_rate": 6.632606979371287e-06, + "loss": 0.7453, + "step": 3145 + }, + { + "epoch": 1.8631921824104234, + "grad_norm": 1.318364652725098, + "learning_rate": 6.62658544312716e-06, + "loss": 0.6683, + "step": 3146 + }, + { + "epoch": 1.8637844240450103, + "grad_norm": 1.457651252213059, + "learning_rate": 6.620565286799283e-06, + "loss": 0.7037, + "step": 3147 + }, + { + "epoch": 1.8643766656795973, + "grad_norm": 1.7273054218815638, + "learning_rate": 6.614546512850237e-06, + "loss": 0.7301, + "step": 3148 + }, + { + "epoch": 1.8649689073141842, + "grad_norm": 1.2049671686395615, + "learning_rate": 6.608529123742047e-06, + "loss": 0.728, + "step": 3149 + }, + { + "epoch": 1.8655611489487711, + "grad_norm": 1.17602916283294, + "learning_rate": 6.6025131219361505e-06, + "loss": 0.7274, + "step": 3150 + }, + { + "epoch": 1.866153390583358, + "grad_norm": 2.03746704987476, + "learning_rate": 6.596498509893438e-06, + "loss": 0.7063, + "step": 3151 + }, + { + "epoch": 1.866745632217945, + "grad_norm": 1.1900210213393627, + "learning_rate": 6.590485290074224e-06, + "loss": 0.721, + "step": 3152 + }, + { + "epoch": 1.8673378738525317, + "grad_norm": 2.1534721717971452, + "learning_rate": 6.584473464938257e-06, + "loss": 0.7324, + "step": 3153 + }, + { + "epoch": 1.8679301154871188, + "grad_norm": 2.9498552630407606, + "learning_rate": 6.5784630369447e-06, + "loss": 0.7615, + "step": 3154 + }, + { + "epoch": 1.8685223571217056, + "grad_norm": 1.2185800168618288, + "learning_rate": 6.572454008552166e-06, + "loss": 0.7466, + "step": 3155 + }, + { + "epoch": 1.8691145987562927, + "grad_norm": 1.1826866438599948, + "learning_rate": 6.566446382218683e-06, + "loss": 0.745, + "step": 3156 + }, + { + "epoch": 1.8697068403908794, + "grad_norm": 1.6999039368124431, + "learning_rate": 6.5604401604017095e-06, + "loss": 0.6985, + "step": 3157 + }, + { + "epoch": 1.8702990820254664, + "grad_norm": 1.244053260256039, + "learning_rate": 6.5544353455581245e-06, + "loss": 0.6739, + "step": 3158 + }, + { + "epoch": 1.8708913236600533, + "grad_norm": 1.718673210214095, + "learning_rate": 6.5484319401442346e-06, + "loss": 0.7268, + "step": 3159 + }, + { + "epoch": 1.8714835652946402, + "grad_norm": 1.9410211268230035, + "learning_rate": 6.542429946615774e-06, + "loss": 0.7235, + "step": 3160 + }, + { + "epoch": 1.8720758069292271, + "grad_norm": 1.1807820843110362, + "learning_rate": 6.536429367427896e-06, + "loss": 0.7566, + "step": 3161 + }, + { + "epoch": 1.872668048563814, + "grad_norm": 1.4651639003519918, + "learning_rate": 6.53043020503517e-06, + "loss": 0.7173, + "step": 3162 + }, + { + "epoch": 1.873260290198401, + "grad_norm": 2.002968615540737, + "learning_rate": 6.5244324618915925e-06, + "loss": 0.7393, + "step": 3163 + }, + { + "epoch": 1.8738525318329877, + "grad_norm": 1.1224074176105243, + "learning_rate": 6.5184361404505795e-06, + "loss": 0.7074, + "step": 3164 + }, + { + "epoch": 1.8744447734675749, + "grad_norm": 1.5396485670035198, + "learning_rate": 6.512441243164967e-06, + "loss": 0.7426, + "step": 3165 + }, + { + "epoch": 1.8750370151021616, + "grad_norm": 1.1447433665685038, + "learning_rate": 6.506447772486997e-06, + "loss": 0.7526, + "step": 3166 + }, + { + "epoch": 1.8756292567367487, + "grad_norm": 1.943211358510783, + "learning_rate": 6.50045573086834e-06, + "loss": 0.7328, + "step": 3167 + }, + { + "epoch": 1.8762214983713354, + "grad_norm": 1.8487347809196877, + "learning_rate": 6.4944651207600765e-06, + "loss": 0.7435, + "step": 3168 + }, + { + "epoch": 1.8768137400059224, + "grad_norm": 0.9270621419455923, + "learning_rate": 6.488475944612709e-06, + "loss": 0.7267, + "step": 3169 + }, + { + "epoch": 1.8774059816405093, + "grad_norm": 1.4497042883892517, + "learning_rate": 6.4824882048761406e-06, + "loss": 0.7384, + "step": 3170 + }, + { + "epoch": 1.8779982232750962, + "grad_norm": 1.7243801348273786, + "learning_rate": 6.476501903999695e-06, + "loss": 0.7276, + "step": 3171 + }, + { + "epoch": 1.8785904649096832, + "grad_norm": 2.212944195283541, + "learning_rate": 6.470517044432104e-06, + "loss": 0.7246, + "step": 3172 + }, + { + "epoch": 1.87918270654427, + "grad_norm": 2.9708159494514095, + "learning_rate": 6.464533628621521e-06, + "loss": 0.7134, + "step": 3173 + }, + { + "epoch": 1.879774948178857, + "grad_norm": 1.1598634102463192, + "learning_rate": 6.458551659015486e-06, + "loss": 0.6919, + "step": 3174 + }, + { + "epoch": 1.8803671898134438, + "grad_norm": 1.0582572327884274, + "learning_rate": 6.452571138060971e-06, + "loss": 0.7249, + "step": 3175 + }, + { + "epoch": 1.880959431448031, + "grad_norm": 1.2040138425826092, + "learning_rate": 6.446592068204341e-06, + "loss": 0.7295, + "step": 3176 + }, + { + "epoch": 1.8815516730826176, + "grad_norm": 2.157200215176374, + "learning_rate": 6.440614451891376e-06, + "loss": 0.7359, + "step": 3177 + }, + { + "epoch": 1.8821439147172048, + "grad_norm": 1.6977745304387328, + "learning_rate": 6.43463829156725e-06, + "loss": 0.7341, + "step": 3178 + }, + { + "epoch": 1.8827361563517915, + "grad_norm": 1.5684451007006281, + "learning_rate": 6.428663589676554e-06, + "loss": 0.72, + "step": 3179 + }, + { + "epoch": 1.8833283979863784, + "grad_norm": 1.1271225922581531, + "learning_rate": 6.422690348663276e-06, + "loss": 0.7196, + "step": 3180 + }, + { + "epoch": 1.8839206396209653, + "grad_norm": 1.8620701322408784, + "learning_rate": 6.41671857097081e-06, + "loss": 0.6995, + "step": 3181 + }, + { + "epoch": 1.8845128812555523, + "grad_norm": 1.7096834779094625, + "learning_rate": 6.410748259041941e-06, + "loss": 0.7097, + "step": 3182 + }, + { + "epoch": 1.8851051228901392, + "grad_norm": 1.4823000392501897, + "learning_rate": 6.40477941531887e-06, + "loss": 0.7037, + "step": 3183 + }, + { + "epoch": 1.8856973645247261, + "grad_norm": 1.226315830076985, + "learning_rate": 6.398812042243187e-06, + "loss": 0.728, + "step": 3184 + }, + { + "epoch": 1.886289606159313, + "grad_norm": 0.9904003037669937, + "learning_rate": 6.392846142255886e-06, + "loss": 0.6943, + "step": 3185 + }, + { + "epoch": 1.8868818477938998, + "grad_norm": 1.2611406463421255, + "learning_rate": 6.3868817177973505e-06, + "loss": 0.7045, + "step": 3186 + }, + { + "epoch": 1.887474089428487, + "grad_norm": 1.4663012894424847, + "learning_rate": 6.380918771307365e-06, + "loss": 0.6833, + "step": 3187 + }, + { + "epoch": 1.8880663310630736, + "grad_norm": 1.3127393848518931, + "learning_rate": 6.3749573052251155e-06, + "loss": 0.7176, + "step": 3188 + }, + { + "epoch": 1.8886585726976608, + "grad_norm": 1.4212019430587715, + "learning_rate": 6.368997321989176e-06, + "loss": 0.7226, + "step": 3189 + }, + { + "epoch": 1.8892508143322475, + "grad_norm": 1.2265922099963948, + "learning_rate": 6.363038824037511e-06, + "loss": 0.7691, + "step": 3190 + }, + { + "epoch": 1.8898430559668344, + "grad_norm": 2.4233051026500196, + "learning_rate": 6.357081813807482e-06, + "loss": 0.736, + "step": 3191 + }, + { + "epoch": 1.8904352976014214, + "grad_norm": 1.3155312645080448, + "learning_rate": 6.351126293735843e-06, + "loss": 0.7628, + "step": 3192 + }, + { + "epoch": 1.8910275392360083, + "grad_norm": 1.411204552132623, + "learning_rate": 6.345172266258739e-06, + "loss": 0.7293, + "step": 3193 + }, + { + "epoch": 1.8916197808705952, + "grad_norm": 1.5791438885255882, + "learning_rate": 6.339219733811697e-06, + "loss": 0.6975, + "step": 3194 + }, + { + "epoch": 1.8922120225051822, + "grad_norm": 1.4220630376908199, + "learning_rate": 6.333268698829639e-06, + "loss": 0.7482, + "step": 3195 + }, + { + "epoch": 1.892804264139769, + "grad_norm": 1.3250303025534889, + "learning_rate": 6.327319163746871e-06, + "loss": 0.7284, + "step": 3196 + }, + { + "epoch": 1.8933965057743558, + "grad_norm": 1.0527611253118723, + "learning_rate": 6.321371130997095e-06, + "loss": 0.6912, + "step": 3197 + }, + { + "epoch": 1.893988747408943, + "grad_norm": 1.1033965769900478, + "learning_rate": 6.315424603013382e-06, + "loss": 0.7213, + "step": 3198 + }, + { + "epoch": 1.8945809890435297, + "grad_norm": 1.2063474598713093, + "learning_rate": 6.309479582228202e-06, + "loss": 0.7157, + "step": 3199 + }, + { + "epoch": 1.8951732306781168, + "grad_norm": 1.269742008185876, + "learning_rate": 6.303536071073397e-06, + "loss": 0.7374, + "step": 3200 + }, + { + "epoch": 1.8957654723127035, + "grad_norm": 1.8184812717508338, + "learning_rate": 6.297594071980208e-06, + "loss": 0.6878, + "step": 3201 + }, + { + "epoch": 1.8963577139472905, + "grad_norm": 1.511960235444543, + "learning_rate": 6.291653587379236e-06, + "loss": 0.7505, + "step": 3202 + }, + { + "epoch": 1.8969499555818774, + "grad_norm": 1.2751220645522783, + "learning_rate": 6.2857146197004755e-06, + "loss": 0.8061, + "step": 3203 + }, + { + "epoch": 1.8975421972164643, + "grad_norm": 1.1058079431721495, + "learning_rate": 6.2797771713733025e-06, + "loss": 0.7353, + "step": 3204 + }, + { + "epoch": 1.8981344388510513, + "grad_norm": 0.9715354260971804, + "learning_rate": 6.273841244826466e-06, + "loss": 0.7067, + "step": 3205 + }, + { + "epoch": 1.898726680485638, + "grad_norm": 1.3513816230607942, + "learning_rate": 6.267906842488088e-06, + "loss": 0.7182, + "step": 3206 + }, + { + "epoch": 1.8993189221202251, + "grad_norm": 2.4611133660776714, + "learning_rate": 6.261973966785679e-06, + "loss": 0.7578, + "step": 3207 + }, + { + "epoch": 1.8999111637548118, + "grad_norm": 3.4200966306498968, + "learning_rate": 6.256042620146119e-06, + "loss": 0.7041, + "step": 3208 + }, + { + "epoch": 1.900503405389399, + "grad_norm": 1.0636075328664951, + "learning_rate": 6.250112804995661e-06, + "loss": 0.7919, + "step": 3209 + }, + { + "epoch": 1.9010956470239857, + "grad_norm": 1.1252808954511786, + "learning_rate": 6.2441845237599285e-06, + "loss": 0.7849, + "step": 3210 + }, + { + "epoch": 1.9016878886585729, + "grad_norm": 0.9806450437275593, + "learning_rate": 6.238257778863925e-06, + "loss": 0.7062, + "step": 3211 + }, + { + "epoch": 1.9022801302931596, + "grad_norm": 1.6392036224674011, + "learning_rate": 6.232332572732025e-06, + "loss": 0.7196, + "step": 3212 + }, + { + "epoch": 1.9028723719277465, + "grad_norm": 1.5302573409478557, + "learning_rate": 6.226408907787971e-06, + "loss": 0.7276, + "step": 3213 + }, + { + "epoch": 1.9034646135623334, + "grad_norm": 3.974467334315002, + "learning_rate": 6.22048678645487e-06, + "loss": 0.6877, + "step": 3214 + }, + { + "epoch": 1.9040568551969204, + "grad_norm": 1.2222534462798487, + "learning_rate": 6.2145662111552045e-06, + "loss": 0.7145, + "step": 3215 + }, + { + "epoch": 1.9046490968315073, + "grad_norm": 0.8412923141136364, + "learning_rate": 6.208647184310826e-06, + "loss": 0.7067, + "step": 3216 + }, + { + "epoch": 1.905241338466094, + "grad_norm": 1.8567585845983157, + "learning_rate": 6.202729708342948e-06, + "loss": 0.7041, + "step": 3217 + }, + { + "epoch": 1.9058335801006812, + "grad_norm": 1.3812265601103704, + "learning_rate": 6.196813785672149e-06, + "loss": 0.7327, + "step": 3218 + }, + { + "epoch": 1.9064258217352679, + "grad_norm": 1.490042310350291, + "learning_rate": 6.1908994187183726e-06, + "loss": 0.7277, + "step": 3219 + }, + { + "epoch": 1.907018063369855, + "grad_norm": 1.954642417359045, + "learning_rate": 6.184986609900934e-06, + "loss": 0.6789, + "step": 3220 + }, + { + "epoch": 1.9076103050044417, + "grad_norm": 1.837812310692102, + "learning_rate": 6.1790753616385e-06, + "loss": 0.7116, + "step": 3221 + }, + { + "epoch": 1.9082025466390289, + "grad_norm": 1.352234369694832, + "learning_rate": 6.173165676349103e-06, + "loss": 0.6998, + "step": 3222 + }, + { + "epoch": 1.9087947882736156, + "grad_norm": 1.353210285362207, + "learning_rate": 6.167257556450139e-06, + "loss": 0.7536, + "step": 3223 + }, + { + "epoch": 1.9093870299082025, + "grad_norm": 2.00212211868531, + "learning_rate": 6.16135100435836e-06, + "loss": 0.6977, + "step": 3224 + }, + { + "epoch": 1.9099792715427895, + "grad_norm": 1.5184121891582234, + "learning_rate": 6.155446022489877e-06, + "loss": 0.7228, + "step": 3225 + }, + { + "epoch": 1.9105715131773764, + "grad_norm": 1.7192454857712378, + "learning_rate": 6.149542613260157e-06, + "loss": 0.7029, + "step": 3226 + }, + { + "epoch": 1.9111637548119633, + "grad_norm": 0.948075216205065, + "learning_rate": 6.143640779084035e-06, + "loss": 0.6953, + "step": 3227 + }, + { + "epoch": 1.91175599644655, + "grad_norm": 1.6132188424013671, + "learning_rate": 6.137740522375687e-06, + "loss": 0.7213, + "step": 3228 + }, + { + "epoch": 1.9123482380811372, + "grad_norm": 3.072529086497967, + "learning_rate": 6.13184184554865e-06, + "loss": 0.7483, + "step": 3229 + }, + { + "epoch": 1.912940479715724, + "grad_norm": 2.1449484779319925, + "learning_rate": 6.1259447510158136e-06, + "loss": 0.7394, + "step": 3230 + }, + { + "epoch": 1.913532721350311, + "grad_norm": 1.2064970456317576, + "learning_rate": 6.120049241189423e-06, + "loss": 0.7599, + "step": 3231 + }, + { + "epoch": 1.9141249629848978, + "grad_norm": 1.5178757032575456, + "learning_rate": 6.114155318481076e-06, + "loss": 0.7253, + "step": 3232 + }, + { + "epoch": 1.914717204619485, + "grad_norm": 1.7616627788897157, + "learning_rate": 6.108262985301714e-06, + "loss": 0.713, + "step": 3233 + }, + { + "epoch": 1.9153094462540716, + "grad_norm": 1.5277812357867129, + "learning_rate": 6.102372244061631e-06, + "loss": 0.7322, + "step": 3234 + }, + { + "epoch": 1.9159016878886586, + "grad_norm": 1.241728693166342, + "learning_rate": 6.0964830971704755e-06, + "loss": 0.7061, + "step": 3235 + }, + { + "epoch": 1.9164939295232455, + "grad_norm": 8.016340143006273, + "learning_rate": 6.090595547037242e-06, + "loss": 0.7629, + "step": 3236 + }, + { + "epoch": 1.9170861711578324, + "grad_norm": 3.098312029179306, + "learning_rate": 6.084709596070264e-06, + "loss": 0.7099, + "step": 3237 + }, + { + "epoch": 1.9176784127924194, + "grad_norm": 1.570777816333163, + "learning_rate": 6.078825246677229e-06, + "loss": 0.6777, + "step": 3238 + }, + { + "epoch": 1.918270654427006, + "grad_norm": 1.1666760383543127, + "learning_rate": 6.072942501265164e-06, + "loss": 0.7347, + "step": 3239 + }, + { + "epoch": 1.9188628960615932, + "grad_norm": 1.1072653690074732, + "learning_rate": 6.06706136224045e-06, + "loss": 0.7289, + "step": 3240 + }, + { + "epoch": 1.91945513769618, + "grad_norm": 1.534029814565199, + "learning_rate": 6.061181832008795e-06, + "loss": 0.7361, + "step": 3241 + }, + { + "epoch": 1.920047379330767, + "grad_norm": 11.474053636259848, + "learning_rate": 6.055303912975261e-06, + "loss": 0.7246, + "step": 3242 + }, + { + "epoch": 1.9206396209653538, + "grad_norm": 1.1182633924025185, + "learning_rate": 6.049427607544247e-06, + "loss": 0.716, + "step": 3243 + }, + { + "epoch": 1.9212318625999407, + "grad_norm": 1.7977091796689737, + "learning_rate": 6.0435529181195e-06, + "loss": 0.691, + "step": 3244 + }, + { + "epoch": 1.9218241042345277, + "grad_norm": 1.2523074350306502, + "learning_rate": 6.0376798471040835e-06, + "loss": 0.7398, + "step": 3245 + }, + { + "epoch": 1.9224163458691146, + "grad_norm": 1.379470202336427, + "learning_rate": 6.031808396900422e-06, + "loss": 0.7037, + "step": 3246 + }, + { + "epoch": 1.9230085875037015, + "grad_norm": 1.1064953536693023, + "learning_rate": 6.025938569910271e-06, + "loss": 0.7371, + "step": 3247 + }, + { + "epoch": 1.9236008291382884, + "grad_norm": 1.3352312831811772, + "learning_rate": 6.020070368534719e-06, + "loss": 0.7153, + "step": 3248 + }, + { + "epoch": 1.9241930707728754, + "grad_norm": 6.84844144864244, + "learning_rate": 6.0142037951741824e-06, + "loss": 0.6729, + "step": 3249 + }, + { + "epoch": 1.924785312407462, + "grad_norm": 1.2959405118553768, + "learning_rate": 6.00833885222843e-06, + "loss": 0.6972, + "step": 3250 + }, + { + "epoch": 1.9253775540420492, + "grad_norm": 1.6921129797399264, + "learning_rate": 6.002475542096548e-06, + "loss": 0.7132, + "step": 3251 + }, + { + "epoch": 1.925969795676636, + "grad_norm": 1.899108534796794, + "learning_rate": 5.996613867176964e-06, + "loss": 0.7072, + "step": 3252 + }, + { + "epoch": 1.926562037311223, + "grad_norm": 1.6781426823646384, + "learning_rate": 5.9907538298674265e-06, + "loss": 0.7478, + "step": 3253 + }, + { + "epoch": 1.9271542789458098, + "grad_norm": 1.3313919012515292, + "learning_rate": 5.984895432565022e-06, + "loss": 0.7394, + "step": 3254 + }, + { + "epoch": 1.9277465205803967, + "grad_norm": 1.556101241994347, + "learning_rate": 5.979038677666167e-06, + "loss": 0.7304, + "step": 3255 + }, + { + "epoch": 1.9283387622149837, + "grad_norm": 1.434032373329083, + "learning_rate": 5.973183567566605e-06, + "loss": 0.7178, + "step": 3256 + }, + { + "epoch": 1.9289310038495706, + "grad_norm": 2.078218024535206, + "learning_rate": 5.967330104661402e-06, + "loss": 0.6953, + "step": 3257 + }, + { + "epoch": 1.9295232454841575, + "grad_norm": 1.9496636146502733, + "learning_rate": 5.96147829134495e-06, + "loss": 0.6895, + "step": 3258 + }, + { + "epoch": 1.9301154871187445, + "grad_norm": 1.7924248358414332, + "learning_rate": 5.955628130010977e-06, + "loss": 0.7637, + "step": 3259 + }, + { + "epoch": 1.9307077287533314, + "grad_norm": 1.6764468299545592, + "learning_rate": 5.949779623052526e-06, + "loss": 0.6733, + "step": 3260 + }, + { + "epoch": 1.9312999703879181, + "grad_norm": 1.3373116114928854, + "learning_rate": 5.9439327728619634e-06, + "loss": 0.6799, + "step": 3261 + }, + { + "epoch": 1.9318922120225053, + "grad_norm": 1.6119800507160023, + "learning_rate": 5.9380875818309805e-06, + "loss": 0.6576, + "step": 3262 + }, + { + "epoch": 1.932484453657092, + "grad_norm": 2.318030452874941, + "learning_rate": 5.932244052350585e-06, + "loss": 0.6927, + "step": 3263 + }, + { + "epoch": 1.9330766952916791, + "grad_norm": 1.498808583130641, + "learning_rate": 5.926402186811118e-06, + "loss": 0.6914, + "step": 3264 + }, + { + "epoch": 1.9336689369262658, + "grad_norm": 1.1247024199131759, + "learning_rate": 5.920561987602224e-06, + "loss": 0.721, + "step": 3265 + }, + { + "epoch": 1.9342611785608528, + "grad_norm": 2.2083931482923993, + "learning_rate": 5.914723457112877e-06, + "loss": 0.6962, + "step": 3266 + }, + { + "epoch": 1.9348534201954397, + "grad_norm": 2.703573235446084, + "learning_rate": 5.908886597731358e-06, + "loss": 0.6997, + "step": 3267 + }, + { + "epoch": 1.9354456618300266, + "grad_norm": 1.4305912748643144, + "learning_rate": 5.903051411845282e-06, + "loss": 0.7246, + "step": 3268 + }, + { + "epoch": 1.9360379034646136, + "grad_norm": 1.4530135063532168, + "learning_rate": 5.897217901841554e-06, + "loss": 0.6948, + "step": 3269 + }, + { + "epoch": 1.9366301450992005, + "grad_norm": 1.4638908118569964, + "learning_rate": 5.8913860701064175e-06, + "loss": 0.7293, + "step": 3270 + }, + { + "epoch": 1.9372223867337874, + "grad_norm": 1.3397680960176233, + "learning_rate": 5.885555919025414e-06, + "loss": 0.7114, + "step": 3271 + }, + { + "epoch": 1.9378146283683741, + "grad_norm": 1.4925051038532378, + "learning_rate": 5.879727450983412e-06, + "loss": 0.6961, + "step": 3272 + }, + { + "epoch": 1.9384068700029613, + "grad_norm": 1.2079008891186787, + "learning_rate": 5.873900668364572e-06, + "loss": 0.7337, + "step": 3273 + }, + { + "epoch": 1.938999111637548, + "grad_norm": 1.1990174795782194, + "learning_rate": 5.868075573552383e-06, + "loss": 0.7555, + "step": 3274 + }, + { + "epoch": 1.9395913532721352, + "grad_norm": 1.473905606977259, + "learning_rate": 5.862252168929632e-06, + "loss": 0.7142, + "step": 3275 + }, + { + "epoch": 1.9401835949067219, + "grad_norm": 2.1885900149709694, + "learning_rate": 5.856430456878424e-06, + "loss": 0.7396, + "step": 3276 + }, + { + "epoch": 1.9407758365413088, + "grad_norm": 1.9165088813619322, + "learning_rate": 5.850610439780158e-06, + "loss": 0.7055, + "step": 3277 + }, + { + "epoch": 1.9413680781758957, + "grad_norm": 1.1598467306680342, + "learning_rate": 5.844792120015556e-06, + "loss": 0.7183, + "step": 3278 + }, + { + "epoch": 1.9419603198104827, + "grad_norm": 1.4065104988302588, + "learning_rate": 5.838975499964636e-06, + "loss": 0.6838, + "step": 3279 + }, + { + "epoch": 1.9425525614450696, + "grad_norm": 1.3241034015935482, + "learning_rate": 5.833160582006722e-06, + "loss": 0.7029, + "step": 3280 + }, + { + "epoch": 1.9431448030796565, + "grad_norm": 1.5431286389019339, + "learning_rate": 5.827347368520444e-06, + "loss": 0.7218, + "step": 3281 + }, + { + "epoch": 1.9437370447142435, + "grad_norm": 1.4239447287632976, + "learning_rate": 5.821535861883729e-06, + "loss": 0.7142, + "step": 3282 + }, + { + "epoch": 1.9443292863488302, + "grad_norm": 1.4592705812164701, + "learning_rate": 5.815726064473812e-06, + "loss": 0.7701, + "step": 3283 + }, + { + "epoch": 1.9449215279834173, + "grad_norm": 1.449449394771617, + "learning_rate": 5.8099179786672365e-06, + "loss": 0.7341, + "step": 3284 + }, + { + "epoch": 1.945513769618004, + "grad_norm": 1.131051802200009, + "learning_rate": 5.80411160683982e-06, + "loss": 0.7324, + "step": 3285 + }, + { + "epoch": 1.9461060112525912, + "grad_norm": 1.2255859547243508, + "learning_rate": 5.798306951366701e-06, + "loss": 0.7226, + "step": 3286 + }, + { + "epoch": 1.946698252887178, + "grad_norm": 1.8162184207193348, + "learning_rate": 5.7925040146223155e-06, + "loss": 0.6954, + "step": 3287 + }, + { + "epoch": 1.9472904945217648, + "grad_norm": 1.2603648716863414, + "learning_rate": 5.786702798980388e-06, + "loss": 0.7035, + "step": 3288 + }, + { + "epoch": 1.9478827361563518, + "grad_norm": 2.0203946682183487, + "learning_rate": 5.780903306813937e-06, + "loss": 0.696, + "step": 3289 + }, + { + "epoch": 1.9484749777909387, + "grad_norm": 2.638924315398722, + "learning_rate": 5.775105540495284e-06, + "loss": 0.6752, + "step": 3290 + }, + { + "epoch": 1.9490672194255256, + "grad_norm": 4.143046528582363, + "learning_rate": 5.769309502396046e-06, + "loss": 0.7165, + "step": 3291 + }, + { + "epoch": 1.9496594610601126, + "grad_norm": 1.9906732930148696, + "learning_rate": 5.763515194887126e-06, + "loss": 0.7242, + "step": 3292 + }, + { + "epoch": 1.9502517026946995, + "grad_norm": 1.5769279032518237, + "learning_rate": 5.757722620338715e-06, + "loss": 0.7076, + "step": 3293 + }, + { + "epoch": 1.9508439443292862, + "grad_norm": 1.4341838760864, + "learning_rate": 5.751931781120308e-06, + "loss": 0.7051, + "step": 3294 + }, + { + "epoch": 1.9514361859638734, + "grad_norm": 1.4434054947134414, + "learning_rate": 5.746142679600687e-06, + "loss": 0.7102, + "step": 3295 + }, + { + "epoch": 1.95202842759846, + "grad_norm": 2.0084791627967924, + "learning_rate": 5.740355318147916e-06, + "loss": 0.7052, + "step": 3296 + }, + { + "epoch": 1.9526206692330472, + "grad_norm": 1.3474318765956996, + "learning_rate": 5.734569699129347e-06, + "loss": 0.7527, + "step": 3297 + }, + { + "epoch": 1.953212910867634, + "grad_norm": 1.9311183919130495, + "learning_rate": 5.728785824911627e-06, + "loss": 0.7138, + "step": 3298 + }, + { + "epoch": 1.9538051525022209, + "grad_norm": 1.8418123316642743, + "learning_rate": 5.723003697860692e-06, + "loss": 0.6989, + "step": 3299 + }, + { + "epoch": 1.9543973941368078, + "grad_norm": 1.8187532795120556, + "learning_rate": 5.717223320341751e-06, + "loss": 0.6974, + "step": 3300 + }, + { + "epoch": 1.9549896357713947, + "grad_norm": 1.355389837624177, + "learning_rate": 5.711444694719299e-06, + "loss": 0.7415, + "step": 3301 + }, + { + "epoch": 1.9555818774059817, + "grad_norm": 1.4523243460694153, + "learning_rate": 5.70566782335713e-06, + "loss": 0.7046, + "step": 3302 + }, + { + "epoch": 1.9561741190405686, + "grad_norm": 1.3091809995414065, + "learning_rate": 5.699892708618297e-06, + "loss": 0.7168, + "step": 3303 + }, + { + "epoch": 1.9567663606751555, + "grad_norm": 2.142139867997893, + "learning_rate": 5.69411935286516e-06, + "loss": 0.7549, + "step": 3304 + }, + { + "epoch": 1.9573586023097422, + "grad_norm": 2.5024110696824455, + "learning_rate": 5.6883477584593325e-06, + "loss": 0.7202, + "step": 3305 + }, + { + "epoch": 1.9579508439443294, + "grad_norm": 2.2513293693234813, + "learning_rate": 5.682577927761732e-06, + "loss": 0.7172, + "step": 3306 + }, + { + "epoch": 1.958543085578916, + "grad_norm": 1.2507660474168645, + "learning_rate": 5.676809863132537e-06, + "loss": 0.711, + "step": 3307 + }, + { + "epoch": 1.9591353272135033, + "grad_norm": 1.2539660203271004, + "learning_rate": 5.671043566931216e-06, + "loss": 0.7007, + "step": 3308 + }, + { + "epoch": 1.95972756884809, + "grad_norm": 1.9143243645347823, + "learning_rate": 5.665279041516501e-06, + "loss": 0.7193, + "step": 3309 + }, + { + "epoch": 1.960319810482677, + "grad_norm": 6.37124555621677, + "learning_rate": 5.659516289246414e-06, + "loss": 0.7501, + "step": 3310 + }, + { + "epoch": 1.9609120521172638, + "grad_norm": 1.4767774435352106, + "learning_rate": 5.6537553124782395e-06, + "loss": 0.7479, + "step": 3311 + }, + { + "epoch": 1.9615042937518508, + "grad_norm": 1.6017773415211547, + "learning_rate": 5.647996113568547e-06, + "loss": 0.7181, + "step": 3312 + }, + { + "epoch": 1.9620965353864377, + "grad_norm": 2.1133632644301747, + "learning_rate": 5.642238694873165e-06, + "loss": 0.7301, + "step": 3313 + }, + { + "epoch": 1.9626887770210246, + "grad_norm": 1.4351193282078467, + "learning_rate": 5.636483058747209e-06, + "loss": 0.6863, + "step": 3314 + }, + { + "epoch": 1.9632810186556116, + "grad_norm": 1.7463182715444514, + "learning_rate": 5.63072920754505e-06, + "loss": 0.6823, + "step": 3315 + }, + { + "epoch": 1.9638732602901983, + "grad_norm": 1.9641501791574498, + "learning_rate": 5.624977143620347e-06, + "loss": 0.7183, + "step": 3316 + }, + { + "epoch": 1.9644655019247854, + "grad_norm": 1.5984745729773937, + "learning_rate": 5.619226869326006e-06, + "loss": 0.7001, + "step": 3317 + }, + { + "epoch": 1.9650577435593721, + "grad_norm": 1.504200180863183, + "learning_rate": 5.613478387014223e-06, + "loss": 0.7457, + "step": 3318 + }, + { + "epoch": 1.9656499851939593, + "grad_norm": 1.165932682338627, + "learning_rate": 5.6077316990364415e-06, + "loss": 0.7008, + "step": 3319 + }, + { + "epoch": 1.966242226828546, + "grad_norm": 1.2741156158971194, + "learning_rate": 5.601986807743388e-06, + "loss": 0.724, + "step": 3320 + }, + { + "epoch": 1.966834468463133, + "grad_norm": 1.376216878070863, + "learning_rate": 5.5962437154850434e-06, + "loss": 0.738, + "step": 3321 + }, + { + "epoch": 1.9674267100977199, + "grad_norm": 2.0904006798570567, + "learning_rate": 5.5905024246106485e-06, + "loss": 0.7218, + "step": 3322 + }, + { + "epoch": 1.9680189517323068, + "grad_norm": 1.0737572438567344, + "learning_rate": 5.584762937468722e-06, + "loss": 0.7327, + "step": 3323 + }, + { + "epoch": 1.9686111933668937, + "grad_norm": 2.387906025872129, + "learning_rate": 5.579025256407038e-06, + "loss": 0.7739, + "step": 3324 + }, + { + "epoch": 1.9692034350014807, + "grad_norm": 4.887193226453433, + "learning_rate": 5.573289383772628e-06, + "loss": 0.7045, + "step": 3325 + }, + { + "epoch": 1.9697956766360676, + "grad_norm": 1.2124274832546234, + "learning_rate": 5.567555321911782e-06, + "loss": 0.6857, + "step": 3326 + }, + { + "epoch": 1.9703879182706543, + "grad_norm": 1.2868902005451346, + "learning_rate": 5.561823073170056e-06, + "loss": 0.7062, + "step": 3327 + }, + { + "epoch": 1.9709801599052414, + "grad_norm": 1.4155299558301717, + "learning_rate": 5.55609263989227e-06, + "loss": 0.6972, + "step": 3328 + }, + { + "epoch": 1.9715724015398282, + "grad_norm": 1.990767811294212, + "learning_rate": 5.550364024422486e-06, + "loss": 0.742, + "step": 3329 + }, + { + "epoch": 1.9721646431744153, + "grad_norm": 1.2244919516098602, + "learning_rate": 5.544637229104027e-06, + "loss": 0.7393, + "step": 3330 + }, + { + "epoch": 1.972756884809002, + "grad_norm": 1.5478990157497663, + "learning_rate": 5.538912256279479e-06, + "loss": 0.7008, + "step": 3331 + }, + { + "epoch": 1.973349126443589, + "grad_norm": 1.248386021207724, + "learning_rate": 5.533189108290682e-06, + "loss": 0.708, + "step": 3332 + }, + { + "epoch": 1.9739413680781759, + "grad_norm": 1.1494240153147253, + "learning_rate": 5.5274677874787195e-06, + "loss": 0.7127, + "step": 3333 + }, + { + "epoch": 1.9745336097127628, + "grad_norm": 2.0642350332580097, + "learning_rate": 5.5217482961839305e-06, + "loss": 0.7059, + "step": 3334 + }, + { + "epoch": 1.9751258513473497, + "grad_norm": 1.4089725977182308, + "learning_rate": 5.516030636745914e-06, + "loss": 0.6773, + "step": 3335 + }, + { + "epoch": 1.9757180929819367, + "grad_norm": 1.9085909255822204, + "learning_rate": 5.51031481150352e-06, + "loss": 0.7134, + "step": 3336 + }, + { + "epoch": 1.9763103346165236, + "grad_norm": 2.0113642440749877, + "learning_rate": 5.504600822794827e-06, + "loss": 0.7131, + "step": 3337 + }, + { + "epoch": 1.9769025762511103, + "grad_norm": 1.0613716617183546, + "learning_rate": 5.498888672957187e-06, + "loss": 0.7234, + "step": 3338 + }, + { + "epoch": 1.9774948178856975, + "grad_norm": 1.9231732798281154, + "learning_rate": 5.493178364327192e-06, + "loss": 0.7543, + "step": 3339 + }, + { + "epoch": 1.9780870595202842, + "grad_norm": 1.2516392211577612, + "learning_rate": 5.487469899240678e-06, + "loss": 0.6811, + "step": 3340 + }, + { + "epoch": 1.9786793011548713, + "grad_norm": 1.767242904928232, + "learning_rate": 5.481763280032722e-06, + "loss": 0.7349, + "step": 3341 + }, + { + "epoch": 1.979271542789458, + "grad_norm": 1.6839167459983475, + "learning_rate": 5.476058509037658e-06, + "loss": 0.7343, + "step": 3342 + }, + { + "epoch": 1.979863784424045, + "grad_norm": 2.2075665620689353, + "learning_rate": 5.470355588589063e-06, + "loss": 0.7268, + "step": 3343 + }, + { + "epoch": 1.980456026058632, + "grad_norm": 2.8495458199508636, + "learning_rate": 5.4646545210197435e-06, + "loss": 0.761, + "step": 3344 + }, + { + "epoch": 1.9810482676932188, + "grad_norm": 1.7526127261012925, + "learning_rate": 5.458955308661758e-06, + "loss": 0.7317, + "step": 3345 + }, + { + "epoch": 1.9816405093278058, + "grad_norm": 1.953204597527354, + "learning_rate": 5.453257953846405e-06, + "loss": 0.7254, + "step": 3346 + }, + { + "epoch": 1.9822327509623927, + "grad_norm": 2.6955617878160893, + "learning_rate": 5.447562458904227e-06, + "loss": 0.7285, + "step": 3347 + }, + { + "epoch": 1.9828249925969796, + "grad_norm": 1.6118188469147372, + "learning_rate": 5.441868826165002e-06, + "loss": 0.7359, + "step": 3348 + }, + { + "epoch": 1.9834172342315664, + "grad_norm": 1.9770522714718386, + "learning_rate": 5.436177057957739e-06, + "loss": 0.7308, + "step": 3349 + }, + { + "epoch": 1.9840094758661535, + "grad_norm": 1.629247494410455, + "learning_rate": 5.430487156610695e-06, + "loss": 0.7142, + "step": 3350 + }, + { + "epoch": 1.9846017175007402, + "grad_norm": 1.2839049398259734, + "learning_rate": 5.4247991244513635e-06, + "loss": 0.7203, + "step": 3351 + }, + { + "epoch": 1.9851939591353274, + "grad_norm": 1.864717196615187, + "learning_rate": 5.419112963806468e-06, + "loss": 0.6983, + "step": 3352 + }, + { + "epoch": 1.985786200769914, + "grad_norm": 1.4608471929423679, + "learning_rate": 5.4134286770019595e-06, + "loss": 0.6954, + "step": 3353 + }, + { + "epoch": 1.986378442404501, + "grad_norm": 2.368513304613487, + "learning_rate": 5.407746266363039e-06, + "loss": 0.7199, + "step": 3354 + }, + { + "epoch": 1.986970684039088, + "grad_norm": 2.3133080313204375, + "learning_rate": 5.402065734214135e-06, + "loss": 0.7527, + "step": 3355 + }, + { + "epoch": 1.9875629256736749, + "grad_norm": 1.236304503675414, + "learning_rate": 5.3963870828788975e-06, + "loss": 0.6784, + "step": 3356 + }, + { + "epoch": 1.9881551673082618, + "grad_norm": 1.6483424083328333, + "learning_rate": 5.390710314680214e-06, + "loss": 0.7044, + "step": 3357 + }, + { + "epoch": 1.9887474089428485, + "grad_norm": 6.5521490398349735, + "learning_rate": 5.3850354319402095e-06, + "loss": 0.6524, + "step": 3358 + }, + { + "epoch": 1.9893396505774357, + "grad_norm": 1.2554308783934172, + "learning_rate": 5.379362436980222e-06, + "loss": 0.7141, + "step": 3359 + }, + { + "epoch": 1.9899318922120224, + "grad_norm": 1.8973904162055975, + "learning_rate": 5.373691332120832e-06, + "loss": 0.71, + "step": 3360 + }, + { + "epoch": 1.9905241338466095, + "grad_norm": 1.3579617035026215, + "learning_rate": 5.368022119681833e-06, + "loss": 0.7219, + "step": 3361 + }, + { + "epoch": 1.9911163754811962, + "grad_norm": 3.758540120537383, + "learning_rate": 5.362354801982259e-06, + "loss": 0.7086, + "step": 3362 + }, + { + "epoch": 1.9917086171157834, + "grad_norm": 4.086844855136015, + "learning_rate": 5.356689381340354e-06, + "loss": 0.7156, + "step": 3363 + }, + { + "epoch": 1.99230085875037, + "grad_norm": 2.7000807650636185, + "learning_rate": 5.351025860073604e-06, + "loss": 0.7186, + "step": 3364 + }, + { + "epoch": 1.992893100384957, + "grad_norm": 1.302878544865362, + "learning_rate": 5.345364240498696e-06, + "loss": 0.7177, + "step": 3365 + }, + { + "epoch": 1.993485342019544, + "grad_norm": 1.341163287967209, + "learning_rate": 5.3397045249315615e-06, + "loss": 0.747, + "step": 3366 + }, + { + "epoch": 1.994077583654131, + "grad_norm": 1.8874536605468482, + "learning_rate": 5.334046715687334e-06, + "loss": 0.736, + "step": 3367 + }, + { + "epoch": 1.9946698252887178, + "grad_norm": 1.1757321216705952, + "learning_rate": 5.328390815080381e-06, + "loss": 0.707, + "step": 3368 + }, + { + "epoch": 1.9952620669233045, + "grad_norm": 2.40844448714098, + "learning_rate": 5.32273682542428e-06, + "loss": 0.7085, + "step": 3369 + }, + { + "epoch": 1.9958543085578917, + "grad_norm": 1.7338073370473768, + "learning_rate": 5.317084749031835e-06, + "loss": 0.6737, + "step": 3370 + }, + { + "epoch": 1.9964465501924784, + "grad_norm": 1.602486533397963, + "learning_rate": 5.311434588215057e-06, + "loss": 0.7334, + "step": 3371 + }, + { + "epoch": 1.9970387918270656, + "grad_norm": 1.8450498725558269, + "learning_rate": 5.3057863452851875e-06, + "loss": 0.7309, + "step": 3372 + }, + { + "epoch": 1.9976310334616523, + "grad_norm": 1.1497170574169295, + "learning_rate": 5.300140022552671e-06, + "loss": 0.7375, + "step": 3373 + }, + { + "epoch": 1.9982232750962394, + "grad_norm": 1.5753751548871004, + "learning_rate": 5.294495622327167e-06, + "loss": 0.7498, + "step": 3374 + }, + { + "epoch": 1.9988155167308261, + "grad_norm": 7.073970313653954, + "learning_rate": 5.288853146917557e-06, + "loss": 0.7538, + "step": 3375 + }, + { + "epoch": 1.999407758365413, + "grad_norm": 1.2888380404329518, + "learning_rate": 5.283212598631935e-06, + "loss": 0.7392, + "step": 3376 + }, + { + "epoch": 2.0, + "grad_norm": 1.1340654280485976, + "learning_rate": 5.277573979777597e-06, + "loss": 0.6702, + "step": 3377 + }, + { + "epoch": 2.0005922416345867, + "grad_norm": 1.1704057850103424, + "learning_rate": 5.271937292661054e-06, + "loss": 0.6327, + "step": 3378 + }, + { + "epoch": 2.001184483269174, + "grad_norm": 1.2928344272015833, + "learning_rate": 5.266302539588029e-06, + "loss": 0.651, + "step": 3379 + }, + { + "epoch": 2.0017767249037606, + "grad_norm": 1.8183856023739275, + "learning_rate": 5.260669722863457e-06, + "loss": 0.6276, + "step": 3380 + }, + { + "epoch": 2.0023689665383477, + "grad_norm": 1.232591682636013, + "learning_rate": 5.255038844791475e-06, + "loss": 0.655, + "step": 3381 + }, + { + "epoch": 2.0029612081729344, + "grad_norm": 1.439614040820058, + "learning_rate": 5.249409907675422e-06, + "loss": 0.6478, + "step": 3382 + }, + { + "epoch": 2.0035534498075216, + "grad_norm": 1.2378098354139504, + "learning_rate": 5.243782913817858e-06, + "loss": 0.5961, + "step": 3383 + }, + { + "epoch": 2.0041456914421083, + "grad_norm": 1.3329846166899064, + "learning_rate": 5.238157865520539e-06, + "loss": 0.6206, + "step": 3384 + }, + { + "epoch": 2.0047379330766955, + "grad_norm": 2.1513887798109037, + "learning_rate": 5.232534765084425e-06, + "loss": 0.6225, + "step": 3385 + }, + { + "epoch": 2.005330174711282, + "grad_norm": 1.4341250498776674, + "learning_rate": 5.226913614809677e-06, + "loss": 0.6466, + "step": 3386 + }, + { + "epoch": 2.0059224163458693, + "grad_norm": 2.65394252956428, + "learning_rate": 5.221294416995661e-06, + "loss": 0.6219, + "step": 3387 + }, + { + "epoch": 2.006514657980456, + "grad_norm": 1.939125905672404, + "learning_rate": 5.215677173940959e-06, + "loss": 0.6105, + "step": 3388 + }, + { + "epoch": 2.0071068996150427, + "grad_norm": 1.6103895073133687, + "learning_rate": 5.210061887943318e-06, + "loss": 0.5788, + "step": 3389 + }, + { + "epoch": 2.00769914124963, + "grad_norm": 5.940476923941293, + "learning_rate": 5.204448561299718e-06, + "loss": 0.608, + "step": 3390 + }, + { + "epoch": 2.0082913828842166, + "grad_norm": 3.4355880637707736, + "learning_rate": 5.1988371963063235e-06, + "loss": 0.6455, + "step": 3391 + }, + { + "epoch": 2.0088836245188038, + "grad_norm": 1.709718605173615, + "learning_rate": 5.193227795258505e-06, + "loss": 0.6509, + "step": 3392 + }, + { + "epoch": 2.0094758661533905, + "grad_norm": 1.521862412293069, + "learning_rate": 5.187620360450809e-06, + "loss": 0.6062, + "step": 3393 + }, + { + "epoch": 2.0100681077879776, + "grad_norm": 2.304842968308541, + "learning_rate": 5.182014894176999e-06, + "loss": 0.6268, + "step": 3394 + }, + { + "epoch": 2.0106603494225643, + "grad_norm": 1.6981834404747536, + "learning_rate": 5.176411398730028e-06, + "loss": 0.6083, + "step": 3395 + }, + { + "epoch": 2.0112525910571515, + "grad_norm": 1.5414486726862469, + "learning_rate": 5.170809876402039e-06, + "loss": 0.5742, + "step": 3396 + }, + { + "epoch": 2.011844832691738, + "grad_norm": 1.8324966309812314, + "learning_rate": 5.165210329484366e-06, + "loss": 0.6401, + "step": 3397 + }, + { + "epoch": 2.0124370743263253, + "grad_norm": 1.5280075849514587, + "learning_rate": 5.159612760267541e-06, + "loss": 0.612, + "step": 3398 + }, + { + "epoch": 2.013029315960912, + "grad_norm": 1.7053593517597887, + "learning_rate": 5.154017171041289e-06, + "loss": 0.6234, + "step": 3399 + }, + { + "epoch": 2.0136215575954988, + "grad_norm": 2.384672543167793, + "learning_rate": 5.148423564094517e-06, + "loss": 0.5944, + "step": 3400 + }, + { + "epoch": 2.014213799230086, + "grad_norm": 1.7192733690837867, + "learning_rate": 5.142831941715321e-06, + "loss": 0.6438, + "step": 3401 + }, + { + "epoch": 2.0148060408646726, + "grad_norm": 1.2298645317583594, + "learning_rate": 5.137242306190991e-06, + "loss": 0.6059, + "step": 3402 + }, + { + "epoch": 2.01539828249926, + "grad_norm": 1.421035277118102, + "learning_rate": 5.13165465980801e-06, + "loss": 0.6251, + "step": 3403 + }, + { + "epoch": 2.0159905241338465, + "grad_norm": 1.3393515437383583, + "learning_rate": 5.126069004852033e-06, + "loss": 0.6224, + "step": 3404 + }, + { + "epoch": 2.0165827657684336, + "grad_norm": 1.7027652434897638, + "learning_rate": 5.120485343607901e-06, + "loss": 0.6055, + "step": 3405 + }, + { + "epoch": 2.0171750074030204, + "grad_norm": 1.1354965871126625, + "learning_rate": 5.114903678359655e-06, + "loss": 0.6418, + "step": 3406 + }, + { + "epoch": 2.0177672490376075, + "grad_norm": 2.17653820273378, + "learning_rate": 5.10932401139051e-06, + "loss": 0.6267, + "step": 3407 + }, + { + "epoch": 2.0183594906721942, + "grad_norm": 1.1887528683987758, + "learning_rate": 5.103746344982859e-06, + "loss": 0.603, + "step": 3408 + }, + { + "epoch": 2.0189517323067814, + "grad_norm": 1.2166339620141267, + "learning_rate": 5.09817068141828e-06, + "loss": 0.6271, + "step": 3409 + }, + { + "epoch": 2.019543973941368, + "grad_norm": 1.4946678433457767, + "learning_rate": 5.092597022977539e-06, + "loss": 0.6135, + "step": 3410 + }, + { + "epoch": 2.020136215575955, + "grad_norm": 2.181670296560735, + "learning_rate": 5.087025371940568e-06, + "loss": 0.5295, + "step": 3411 + }, + { + "epoch": 2.020728457210542, + "grad_norm": 1.1109403860055909, + "learning_rate": 5.081455730586495e-06, + "loss": 0.6452, + "step": 3412 + }, + { + "epoch": 2.0213206988451287, + "grad_norm": 1.5294096669192148, + "learning_rate": 5.075888101193605e-06, + "loss": 0.6353, + "step": 3413 + }, + { + "epoch": 2.021912940479716, + "grad_norm": 1.2375562922087688, + "learning_rate": 5.070322486039383e-06, + "loss": 0.6426, + "step": 3414 + }, + { + "epoch": 2.0225051821143025, + "grad_norm": 1.1958259044194008, + "learning_rate": 5.06475888740047e-06, + "loss": 0.6161, + "step": 3415 + }, + { + "epoch": 2.0230974237488897, + "grad_norm": 1.3364125839027878, + "learning_rate": 5.059197307552698e-06, + "loss": 0.6045, + "step": 3416 + }, + { + "epoch": 2.0236896653834764, + "grad_norm": 1.471484617701428, + "learning_rate": 5.053637748771058e-06, + "loss": 0.6244, + "step": 3417 + }, + { + "epoch": 2.0242819070180635, + "grad_norm": 1.9697646130426174, + "learning_rate": 5.048080213329729e-06, + "loss": 0.6303, + "step": 3418 + }, + { + "epoch": 2.0248741486526503, + "grad_norm": 1.2533508619798654, + "learning_rate": 5.04252470350205e-06, + "loss": 0.5929, + "step": 3419 + }, + { + "epoch": 2.0254663902872374, + "grad_norm": 1.249684000829904, + "learning_rate": 5.036971221560543e-06, + "loss": 0.6563, + "step": 3420 + }, + { + "epoch": 2.026058631921824, + "grad_norm": 1.7695633483618758, + "learning_rate": 5.031419769776887e-06, + "loss": 0.6099, + "step": 3421 + }, + { + "epoch": 2.026650873556411, + "grad_norm": 1.5156337584508315, + "learning_rate": 5.025870350421945e-06, + "loss": 0.6385, + "step": 3422 + }, + { + "epoch": 2.027243115190998, + "grad_norm": 1.1787473479254456, + "learning_rate": 5.020322965765736e-06, + "loss": 0.6373, + "step": 3423 + }, + { + "epoch": 2.0278353568255847, + "grad_norm": 1.2307377200251604, + "learning_rate": 5.0147776180774575e-06, + "loss": 0.6469, + "step": 3424 + }, + { + "epoch": 2.028427598460172, + "grad_norm": 1.2005680095700468, + "learning_rate": 5.009234309625467e-06, + "loss": 0.5903, + "step": 3425 + }, + { + "epoch": 2.0290198400947586, + "grad_norm": 1.8848824259084624, + "learning_rate": 5.003693042677283e-06, + "loss": 0.5888, + "step": 3426 + }, + { + "epoch": 2.0296120817293457, + "grad_norm": 1.4702739853724482, + "learning_rate": 4.998153819499601e-06, + "loss": 0.5864, + "step": 3427 + }, + { + "epoch": 2.0302043233639324, + "grad_norm": 1.3620518239621033, + "learning_rate": 4.992616642358279e-06, + "loss": 0.617, + "step": 3428 + }, + { + "epoch": 2.0307965649985196, + "grad_norm": 1.1287181683000989, + "learning_rate": 4.9870815135183285e-06, + "loss": 0.605, + "step": 3429 + }, + { + "epoch": 2.0313888066331063, + "grad_norm": 1.8477536160925232, + "learning_rate": 4.9815484352439244e-06, + "loss": 0.6376, + "step": 3430 + }, + { + "epoch": 2.0319810482676934, + "grad_norm": 1.2689842522920158, + "learning_rate": 4.976017409798413e-06, + "loss": 0.5849, + "step": 3431 + }, + { + "epoch": 2.03257328990228, + "grad_norm": 2.2267139790954498, + "learning_rate": 4.970488439444296e-06, + "loss": 0.6276, + "step": 3432 + }, + { + "epoch": 2.033165531536867, + "grad_norm": 2.263989349996733, + "learning_rate": 4.964961526443231e-06, + "loss": 0.6187, + "step": 3433 + }, + { + "epoch": 2.033757773171454, + "grad_norm": 1.1002565279108922, + "learning_rate": 4.95943667305603e-06, + "loss": 0.6165, + "step": 3434 + }, + { + "epoch": 2.0343500148060407, + "grad_norm": 1.1437907896314625, + "learning_rate": 4.953913881542677e-06, + "loss": 0.6687, + "step": 3435 + }, + { + "epoch": 2.034942256440628, + "grad_norm": 1.4291661707165033, + "learning_rate": 4.948393154162303e-06, + "loss": 0.6324, + "step": 3436 + }, + { + "epoch": 2.0355344980752146, + "grad_norm": 1.2447703838850679, + "learning_rate": 4.9428744931731965e-06, + "loss": 0.5916, + "step": 3437 + }, + { + "epoch": 2.0361267397098017, + "grad_norm": 1.3396172149707037, + "learning_rate": 4.937357900832793e-06, + "loss": 0.6117, + "step": 3438 + }, + { + "epoch": 2.0367189813443884, + "grad_norm": 1.327487438241329, + "learning_rate": 4.931843379397695e-06, + "loss": 0.6275, + "step": 3439 + }, + { + "epoch": 2.0373112229789756, + "grad_norm": 5.634667309355351, + "learning_rate": 4.926330931123659e-06, + "loss": 0.5808, + "step": 3440 + }, + { + "epoch": 2.0379034646135623, + "grad_norm": 1.712596745761093, + "learning_rate": 4.920820558265569e-06, + "loss": 0.6298, + "step": 3441 + }, + { + "epoch": 2.038495706248149, + "grad_norm": 1.6350123351542276, + "learning_rate": 4.915312263077488e-06, + "loss": 0.6246, + "step": 3442 + }, + { + "epoch": 2.039087947882736, + "grad_norm": 1.4377006668550185, + "learning_rate": 4.909806047812617e-06, + "loss": 0.6212, + "step": 3443 + }, + { + "epoch": 2.039680189517323, + "grad_norm": 1.3626893907998683, + "learning_rate": 4.904301914723315e-06, + "loss": 0.5958, + "step": 3444 + }, + { + "epoch": 2.04027243115191, + "grad_norm": 1.881609654716602, + "learning_rate": 4.898799866061068e-06, + "loss": 0.6389, + "step": 3445 + }, + { + "epoch": 2.0408646727864967, + "grad_norm": 1.0881012049516798, + "learning_rate": 4.89329990407653e-06, + "loss": 0.6571, + "step": 3446 + }, + { + "epoch": 2.041456914421084, + "grad_norm": 1.4033919456100592, + "learning_rate": 4.887802031019498e-06, + "loss": 0.631, + "step": 3447 + }, + { + "epoch": 2.0420491560556706, + "grad_norm": 3.113208111035224, + "learning_rate": 4.882306249138909e-06, + "loss": 0.6045, + "step": 3448 + }, + { + "epoch": 2.0426413976902578, + "grad_norm": 1.2724517686667787, + "learning_rate": 4.876812560682842e-06, + "loss": 0.6156, + "step": 3449 + }, + { + "epoch": 2.0432336393248445, + "grad_norm": 1.472100349180877, + "learning_rate": 4.871320967898528e-06, + "loss": 0.6171, + "step": 3450 + }, + { + "epoch": 2.0438258809594316, + "grad_norm": 1.7554861202098981, + "learning_rate": 4.865831473032342e-06, + "loss": 0.6485, + "step": 3451 + }, + { + "epoch": 2.0444181225940183, + "grad_norm": 0.892915766556675, + "learning_rate": 4.860344078329791e-06, + "loss": 0.5951, + "step": 3452 + }, + { + "epoch": 2.045010364228605, + "grad_norm": 1.2477613762328605, + "learning_rate": 4.8548587860355255e-06, + "loss": 0.6422, + "step": 3453 + }, + { + "epoch": 2.045602605863192, + "grad_norm": 1.3476862158178415, + "learning_rate": 4.849375598393342e-06, + "loss": 0.6421, + "step": 3454 + }, + { + "epoch": 2.046194847497779, + "grad_norm": 1.3151784488318263, + "learning_rate": 4.843894517646176e-06, + "loss": 0.6417, + "step": 3455 + }, + { + "epoch": 2.046787089132366, + "grad_norm": 2.0867879393200934, + "learning_rate": 4.838415546036095e-06, + "loss": 0.6006, + "step": 3456 + }, + { + "epoch": 2.0473793307669528, + "grad_norm": 1.528214198655404, + "learning_rate": 4.8329386858043005e-06, + "loss": 0.6131, + "step": 3457 + }, + { + "epoch": 2.04797157240154, + "grad_norm": 1.6703970326441546, + "learning_rate": 4.827463939191141e-06, + "loss": 0.6009, + "step": 3458 + }, + { + "epoch": 2.0485638140361266, + "grad_norm": 1.084600846636688, + "learning_rate": 4.821991308436102e-06, + "loss": 0.5974, + "step": 3459 + }, + { + "epoch": 2.049156055670714, + "grad_norm": 1.4736165984007512, + "learning_rate": 4.816520795777789e-06, + "loss": 0.5891, + "step": 3460 + }, + { + "epoch": 2.0497482973053005, + "grad_norm": 1.4108193712127584, + "learning_rate": 4.811052403453949e-06, + "loss": 0.5992, + "step": 3461 + }, + { + "epoch": 2.0503405389398877, + "grad_norm": 1.2902478597441098, + "learning_rate": 4.805586133701468e-06, + "loss": 0.652, + "step": 3462 + }, + { + "epoch": 2.0509327805744744, + "grad_norm": 1.4732401085829676, + "learning_rate": 4.800121988756352e-06, + "loss": 0.6209, + "step": 3463 + }, + { + "epoch": 2.051525022209061, + "grad_norm": 1.7492854759727523, + "learning_rate": 4.7946599708537485e-06, + "loss": 0.5899, + "step": 3464 + }, + { + "epoch": 2.0521172638436482, + "grad_norm": 1.3793707141008569, + "learning_rate": 4.789200082227924e-06, + "loss": 0.6281, + "step": 3465 + }, + { + "epoch": 2.052709505478235, + "grad_norm": 1.5203850237041459, + "learning_rate": 4.783742325112286e-06, + "loss": 0.6517, + "step": 3466 + }, + { + "epoch": 2.053301747112822, + "grad_norm": 1.5248222664948767, + "learning_rate": 4.7782867017393585e-06, + "loss": 0.6476, + "step": 3467 + }, + { + "epoch": 2.053893988747409, + "grad_norm": 1.2625856377787512, + "learning_rate": 4.772833214340805e-06, + "loss": 0.6413, + "step": 3468 + }, + { + "epoch": 2.054486230381996, + "grad_norm": 1.2851858362812827, + "learning_rate": 4.7673818651474e-06, + "loss": 0.629, + "step": 3469 + }, + { + "epoch": 2.0550784720165827, + "grad_norm": 1.296157068335424, + "learning_rate": 4.761932656389061e-06, + "loss": 0.6189, + "step": 3470 + }, + { + "epoch": 2.05567071365117, + "grad_norm": 1.4886765308566954, + "learning_rate": 4.756485590294813e-06, + "loss": 0.6472, + "step": 3471 + }, + { + "epoch": 2.0562629552857565, + "grad_norm": 1.1373661307548197, + "learning_rate": 4.751040669092819e-06, + "loss": 0.5558, + "step": 3472 + }, + { + "epoch": 2.0568551969203437, + "grad_norm": 1.3633758311544337, + "learning_rate": 4.745597895010351e-06, + "loss": 0.604, + "step": 3473 + }, + { + "epoch": 2.0574474385549304, + "grad_norm": 1.1929108488372469, + "learning_rate": 4.740157270273816e-06, + "loss": 0.6514, + "step": 3474 + }, + { + "epoch": 2.058039680189517, + "grad_norm": 3.1263654625674975, + "learning_rate": 4.7347187971087294e-06, + "loss": 0.6156, + "step": 3475 + }, + { + "epoch": 2.0586319218241043, + "grad_norm": 2.327298864991031, + "learning_rate": 4.729282477739741e-06, + "loss": 0.6476, + "step": 3476 + }, + { + "epoch": 2.059224163458691, + "grad_norm": 1.5846789608099434, + "learning_rate": 4.723848314390604e-06, + "loss": 0.6529, + "step": 3477 + }, + { + "epoch": 2.059816405093278, + "grad_norm": 1.4670029209129003, + "learning_rate": 4.718416309284196e-06, + "loss": 0.6004, + "step": 3478 + }, + { + "epoch": 2.060408646727865, + "grad_norm": 4.684220098207152, + "learning_rate": 4.712986464642515e-06, + "loss": 0.5856, + "step": 3479 + }, + { + "epoch": 2.061000888362452, + "grad_norm": 1.0163949128750598, + "learning_rate": 4.707558782686677e-06, + "loss": 0.5652, + "step": 3480 + }, + { + "epoch": 2.0615931299970387, + "grad_norm": 1.9902642913838888, + "learning_rate": 4.702133265636905e-06, + "loss": 0.595, + "step": 3481 + }, + { + "epoch": 2.062185371631626, + "grad_norm": 0.976503070725885, + "learning_rate": 4.6967099157125384e-06, + "loss": 0.5816, + "step": 3482 + }, + { + "epoch": 2.0627776132662126, + "grad_norm": 1.1674854697438484, + "learning_rate": 4.6912887351320336e-06, + "loss": 0.6192, + "step": 3483 + }, + { + "epoch": 2.0633698549007997, + "grad_norm": 1.080121092579032, + "learning_rate": 4.685869726112963e-06, + "loss": 0.5851, + "step": 3484 + }, + { + "epoch": 2.0639620965353864, + "grad_norm": 1.1715927470368515, + "learning_rate": 4.680452890872003e-06, + "loss": 0.6208, + "step": 3485 + }, + { + "epoch": 2.064554338169973, + "grad_norm": 2.3721843450455093, + "learning_rate": 4.675038231624939e-06, + "loss": 0.6333, + "step": 3486 + }, + { + "epoch": 2.0651465798045603, + "grad_norm": 1.4695028389780564, + "learning_rate": 4.669625750586675e-06, + "loss": 0.5885, + "step": 3487 + }, + { + "epoch": 2.065738821439147, + "grad_norm": 6.3301735183432, + "learning_rate": 4.664215449971225e-06, + "loss": 0.6118, + "step": 3488 + }, + { + "epoch": 2.066331063073734, + "grad_norm": 0.9900749638912005, + "learning_rate": 4.658807331991702e-06, + "loss": 0.5987, + "step": 3489 + }, + { + "epoch": 2.066923304708321, + "grad_norm": 1.1137652898298571, + "learning_rate": 4.653401398860324e-06, + "loss": 0.6017, + "step": 3490 + }, + { + "epoch": 2.067515546342908, + "grad_norm": 1.2870621869899241, + "learning_rate": 4.64799765278843e-06, + "loss": 0.6256, + "step": 3491 + }, + { + "epoch": 2.0681077879774947, + "grad_norm": 1.8663809342938509, + "learning_rate": 4.6425960959864556e-06, + "loss": 0.6282, + "step": 3492 + }, + { + "epoch": 2.068700029612082, + "grad_norm": 1.1936625573372333, + "learning_rate": 4.637196730663941e-06, + "loss": 0.6056, + "step": 3493 + }, + { + "epoch": 2.0692922712466686, + "grad_norm": 1.1774196803926553, + "learning_rate": 4.631799559029524e-06, + "loss": 0.6547, + "step": 3494 + }, + { + "epoch": 2.0698845128812557, + "grad_norm": 1.0847314619442951, + "learning_rate": 4.626404583290956e-06, + "loss": 0.6047, + "step": 3495 + }, + { + "epoch": 2.0704767545158425, + "grad_norm": 1.8732417061022584, + "learning_rate": 4.621011805655093e-06, + "loss": 0.6184, + "step": 3496 + }, + { + "epoch": 2.071068996150429, + "grad_norm": 1.0566441622558125, + "learning_rate": 4.615621228327869e-06, + "loss": 0.6037, + "step": 3497 + }, + { + "epoch": 2.0716612377850163, + "grad_norm": 1.4098753153815882, + "learning_rate": 4.61023285351434e-06, + "loss": 0.6616, + "step": 3498 + }, + { + "epoch": 2.072253479419603, + "grad_norm": 1.0888925606822704, + "learning_rate": 4.60484668341866e-06, + "loss": 0.6057, + "step": 3499 + }, + { + "epoch": 2.07284572105419, + "grad_norm": 1.6235022191865385, + "learning_rate": 4.599462720244071e-06, + "loss": 0.615, + "step": 3500 + }, + { + "epoch": 2.073437962688777, + "grad_norm": 1.268826339976644, + "learning_rate": 4.594080966192912e-06, + "loss": 0.6819, + "step": 3501 + }, + { + "epoch": 2.074030204323364, + "grad_norm": 1.188326581207744, + "learning_rate": 4.5887014234666275e-06, + "loss": 0.5907, + "step": 3502 + }, + { + "epoch": 2.0746224459579508, + "grad_norm": 1.6688453348930554, + "learning_rate": 4.583324094265757e-06, + "loss": 0.6598, + "step": 3503 + }, + { + "epoch": 2.075214687592538, + "grad_norm": 1.3153998492810253, + "learning_rate": 4.577948980789924e-06, + "loss": 0.6265, + "step": 3504 + }, + { + "epoch": 2.0758069292271246, + "grad_norm": 1.2858231765513355, + "learning_rate": 4.572576085237853e-06, + "loss": 0.5932, + "step": 3505 + }, + { + "epoch": 2.0763991708617118, + "grad_norm": 0.9882491735932337, + "learning_rate": 4.567205409807362e-06, + "loss": 0.5705, + "step": 3506 + }, + { + "epoch": 2.0769914124962985, + "grad_norm": 1.731064255571183, + "learning_rate": 4.561836956695362e-06, + "loss": 0.6653, + "step": 3507 + }, + { + "epoch": 2.077583654130885, + "grad_norm": 2.457969135297415, + "learning_rate": 4.556470728097849e-06, + "loss": 0.6452, + "step": 3508 + }, + { + "epoch": 2.0781758957654723, + "grad_norm": 1.9842081906939593, + "learning_rate": 4.55110672620991e-06, + "loss": 0.6041, + "step": 3509 + }, + { + "epoch": 2.078768137400059, + "grad_norm": 1.9749587351865325, + "learning_rate": 4.545744953225726e-06, + "loss": 0.5994, + "step": 3510 + }, + { + "epoch": 2.079360379034646, + "grad_norm": 3.1505013963136412, + "learning_rate": 4.540385411338567e-06, + "loss": 0.6332, + "step": 3511 + }, + { + "epoch": 2.079952620669233, + "grad_norm": 1.275314172819857, + "learning_rate": 4.535028102740785e-06, + "loss": 0.6609, + "step": 3512 + }, + { + "epoch": 2.08054486230382, + "grad_norm": 1.4137589146456349, + "learning_rate": 4.529673029623815e-06, + "loss": 0.626, + "step": 3513 + }, + { + "epoch": 2.081137103938407, + "grad_norm": 3.083605720052062, + "learning_rate": 4.524320194178189e-06, + "loss": 0.607, + "step": 3514 + }, + { + "epoch": 2.081729345572994, + "grad_norm": 4.411371237327307, + "learning_rate": 4.518969598593515e-06, + "loss": 0.6471, + "step": 3515 + }, + { + "epoch": 2.0823215872075806, + "grad_norm": 1.3266195802580654, + "learning_rate": 4.5136212450584895e-06, + "loss": 0.6135, + "step": 3516 + }, + { + "epoch": 2.082913828842168, + "grad_norm": 1.4653968395659105, + "learning_rate": 4.508275135760887e-06, + "loss": 0.6175, + "step": 3517 + }, + { + "epoch": 2.0835060704767545, + "grad_norm": 0.990190635939963, + "learning_rate": 4.502931272887572e-06, + "loss": 0.6221, + "step": 3518 + }, + { + "epoch": 2.0840983121113412, + "grad_norm": 1.8306688688850197, + "learning_rate": 4.497589658624477e-06, + "loss": 0.5972, + "step": 3519 + }, + { + "epoch": 2.0846905537459284, + "grad_norm": 1.2748246161237164, + "learning_rate": 4.492250295156632e-06, + "loss": 0.6064, + "step": 3520 + }, + { + "epoch": 2.085282795380515, + "grad_norm": 3.1717060924349134, + "learning_rate": 4.486913184668127e-06, + "loss": 0.6155, + "step": 3521 + }, + { + "epoch": 2.0858750370151022, + "grad_norm": 1.6215550053410732, + "learning_rate": 4.481578329342149e-06, + "loss": 0.6155, + "step": 3522 + }, + { + "epoch": 2.086467278649689, + "grad_norm": 1.8412996687036585, + "learning_rate": 4.476245731360947e-06, + "loss": 0.6175, + "step": 3523 + }, + { + "epoch": 2.087059520284276, + "grad_norm": 1.6036947552013425, + "learning_rate": 4.470915392905862e-06, + "loss": 0.6359, + "step": 3524 + }, + { + "epoch": 2.087651761918863, + "grad_norm": 1.2185010704003587, + "learning_rate": 4.465587316157291e-06, + "loss": 0.648, + "step": 3525 + }, + { + "epoch": 2.08824400355345, + "grad_norm": 2.117675570470455, + "learning_rate": 4.460261503294726e-06, + "loss": 0.5918, + "step": 3526 + }, + { + "epoch": 2.0888362451880367, + "grad_norm": 1.26985997913265, + "learning_rate": 4.4549379564967174e-06, + "loss": 0.6935, + "step": 3527 + }, + { + "epoch": 2.089428486822624, + "grad_norm": 1.421007208158808, + "learning_rate": 4.449616677940904e-06, + "loss": 0.5882, + "step": 3528 + }, + { + "epoch": 2.0900207284572105, + "grad_norm": 2.5687635296741846, + "learning_rate": 4.444297669803981e-06, + "loss": 0.5908, + "step": 3529 + }, + { + "epoch": 2.0906129700917973, + "grad_norm": 1.6681381199853806, + "learning_rate": 4.4389809342617195e-06, + "loss": 0.6284, + "step": 3530 + }, + { + "epoch": 2.0912052117263844, + "grad_norm": 1.6690216078536573, + "learning_rate": 4.433666473488971e-06, + "loss": 0.6287, + "step": 3531 + }, + { + "epoch": 2.091797453360971, + "grad_norm": 1.231182818609419, + "learning_rate": 4.428354289659641e-06, + "loss": 0.5921, + "step": 3532 + }, + { + "epoch": 2.0923896949955583, + "grad_norm": 1.1422701274620846, + "learning_rate": 4.423044384946719e-06, + "loss": 0.5949, + "step": 3533 + }, + { + "epoch": 2.092981936630145, + "grad_norm": 1.5358988678809373, + "learning_rate": 4.417736761522249e-06, + "loss": 0.5869, + "step": 3534 + }, + { + "epoch": 2.093574178264732, + "grad_norm": 1.8994263747170306, + "learning_rate": 4.412431421557351e-06, + "loss": 0.6305, + "step": 3535 + }, + { + "epoch": 2.094166419899319, + "grad_norm": 1.0574749107241654, + "learning_rate": 4.407128367222203e-06, + "loss": 0.5842, + "step": 3536 + }, + { + "epoch": 2.094758661533906, + "grad_norm": 1.756684625648516, + "learning_rate": 4.401827600686059e-06, + "loss": 0.6044, + "step": 3537 + }, + { + "epoch": 2.0953509031684927, + "grad_norm": 1.1516953678691502, + "learning_rate": 4.396529124117223e-06, + "loss": 0.6213, + "step": 3538 + }, + { + "epoch": 2.09594314480308, + "grad_norm": 1.208360469786924, + "learning_rate": 4.391232939683077e-06, + "loss": 0.6195, + "step": 3539 + }, + { + "epoch": 2.0965353864376666, + "grad_norm": 1.7581006133683124, + "learning_rate": 4.38593904955005e-06, + "loss": 0.609, + "step": 3540 + }, + { + "epoch": 2.0971276280722533, + "grad_norm": 1.370855782157487, + "learning_rate": 4.380647455883651e-06, + "loss": 0.5826, + "step": 3541 + }, + { + "epoch": 2.0977198697068404, + "grad_norm": 2.388671548513241, + "learning_rate": 4.37535816084843e-06, + "loss": 0.6563, + "step": 3542 + }, + { + "epoch": 2.098312111341427, + "grad_norm": 1.6360347895846583, + "learning_rate": 4.3700711666080135e-06, + "loss": 0.6808, + "step": 3543 + }, + { + "epoch": 2.0989043529760143, + "grad_norm": 1.4588269440103798, + "learning_rate": 4.364786475325072e-06, + "loss": 0.6258, + "step": 3544 + }, + { + "epoch": 2.099496594610601, + "grad_norm": 1.4736283479225063, + "learning_rate": 4.359504089161351e-06, + "loss": 0.6395, + "step": 3545 + }, + { + "epoch": 2.100088836245188, + "grad_norm": 1.1782359902227395, + "learning_rate": 4.354224010277632e-06, + "loss": 0.6292, + "step": 3546 + }, + { + "epoch": 2.100681077879775, + "grad_norm": 1.1179803013332563, + "learning_rate": 4.348946240833774e-06, + "loss": 0.5898, + "step": 3547 + }, + { + "epoch": 2.101273319514362, + "grad_norm": 1.4815157595037098, + "learning_rate": 4.343670782988679e-06, + "loss": 0.6271, + "step": 3548 + }, + { + "epoch": 2.1018655611489487, + "grad_norm": 1.291616988218737, + "learning_rate": 4.338397638900301e-06, + "loss": 0.5947, + "step": 3549 + }, + { + "epoch": 2.102457802783536, + "grad_norm": 1.2765987298770318, + "learning_rate": 4.333126810725655e-06, + "loss": 0.5707, + "step": 3550 + }, + { + "epoch": 2.1030500444181226, + "grad_norm": 1.375312819341615, + "learning_rate": 4.32785830062081e-06, + "loss": 0.6335, + "step": 3551 + }, + { + "epoch": 2.1036422860527093, + "grad_norm": 1.6256621838008019, + "learning_rate": 4.322592110740882e-06, + "loss": 0.6424, + "step": 3552 + }, + { + "epoch": 2.1042345276872965, + "grad_norm": 1.289927765078695, + "learning_rate": 4.317328243240032e-06, + "loss": 0.5974, + "step": 3553 + }, + { + "epoch": 2.104826769321883, + "grad_norm": 1.2729987598808552, + "learning_rate": 4.312066700271483e-06, + "loss": 0.6065, + "step": 3554 + }, + { + "epoch": 2.1054190109564703, + "grad_norm": 1.5025651892089609, + "learning_rate": 4.306807483987505e-06, + "loss": 0.6541, + "step": 3555 + }, + { + "epoch": 2.106011252591057, + "grad_norm": 4.027247679133015, + "learning_rate": 4.30155059653941e-06, + "loss": 0.6414, + "step": 3556 + }, + { + "epoch": 2.106603494225644, + "grad_norm": 1.3103868133396175, + "learning_rate": 4.296296040077557e-06, + "loss": 0.6134, + "step": 3557 + }, + { + "epoch": 2.107195735860231, + "grad_norm": 1.6626367225184855, + "learning_rate": 4.291043816751357e-06, + "loss": 0.5832, + "step": 3558 + }, + { + "epoch": 2.107787977494818, + "grad_norm": 1.8842126011304945, + "learning_rate": 4.285793928709271e-06, + "loss": 0.6293, + "step": 3559 + }, + { + "epoch": 2.1083802191294048, + "grad_norm": 1.5670020537319092, + "learning_rate": 4.280546378098792e-06, + "loss": 0.6197, + "step": 3560 + }, + { + "epoch": 2.108972460763992, + "grad_norm": 1.4217647311568198, + "learning_rate": 4.2753011670664625e-06, + "loss": 0.6005, + "step": 3561 + }, + { + "epoch": 2.1095647023985786, + "grad_norm": 1.4382610743287825, + "learning_rate": 4.270058297757871e-06, + "loss": 0.6199, + "step": 3562 + }, + { + "epoch": 2.1101569440331653, + "grad_norm": 1.5200269742920969, + "learning_rate": 4.264817772317653e-06, + "loss": 0.6163, + "step": 3563 + }, + { + "epoch": 2.1107491856677525, + "grad_norm": 2.061967748457973, + "learning_rate": 4.259579592889464e-06, + "loss": 0.6036, + "step": 3564 + }, + { + "epoch": 2.111341427302339, + "grad_norm": 1.193306955598674, + "learning_rate": 4.25434376161602e-06, + "loss": 0.5957, + "step": 3565 + }, + { + "epoch": 2.1119336689369264, + "grad_norm": 1.15551634693182, + "learning_rate": 4.249110280639076e-06, + "loss": 0.5846, + "step": 3566 + }, + { + "epoch": 2.112525910571513, + "grad_norm": 1.9325467571821633, + "learning_rate": 4.243879152099415e-06, + "loss": 0.5902, + "step": 3567 + }, + { + "epoch": 2.1131181522061, + "grad_norm": 1.4819027125682924, + "learning_rate": 4.238650378136859e-06, + "loss": 0.5652, + "step": 3568 + }, + { + "epoch": 2.113710393840687, + "grad_norm": 1.221063734006737, + "learning_rate": 4.233423960890275e-06, + "loss": 0.6213, + "step": 3569 + }, + { + "epoch": 2.114302635475274, + "grad_norm": 1.2311749892527528, + "learning_rate": 4.228199902497565e-06, + "loss": 0.621, + "step": 3570 + }, + { + "epoch": 2.114894877109861, + "grad_norm": 1.6157414496724527, + "learning_rate": 4.222978205095659e-06, + "loss": 0.6328, + "step": 3571 + }, + { + "epoch": 2.115487118744448, + "grad_norm": 2.1238378827778654, + "learning_rate": 4.217758870820522e-06, + "loss": 0.6247, + "step": 3572 + }, + { + "epoch": 2.1160793603790347, + "grad_norm": 1.6576077793873196, + "learning_rate": 4.2125419018071565e-06, + "loss": 0.5893, + "step": 3573 + }, + { + "epoch": 2.1166716020136214, + "grad_norm": 7.289033310054799, + "learning_rate": 4.207327300189602e-06, + "loss": 0.6229, + "step": 3574 + }, + { + "epoch": 2.1172638436482085, + "grad_norm": 1.9096206920777807, + "learning_rate": 4.202115068100919e-06, + "loss": 0.6134, + "step": 3575 + }, + { + "epoch": 2.1178560852827952, + "grad_norm": 1.3294555847995595, + "learning_rate": 4.196905207673201e-06, + "loss": 0.6541, + "step": 3576 + }, + { + "epoch": 2.1184483269173824, + "grad_norm": 2.478570928826208, + "learning_rate": 4.191697721037577e-06, + "loss": 0.64, + "step": 3577 + }, + { + "epoch": 2.119040568551969, + "grad_norm": 1.3384108708761757, + "learning_rate": 4.186492610324204e-06, + "loss": 0.6313, + "step": 3578 + }, + { + "epoch": 2.1196328101865562, + "grad_norm": 2.059122205094186, + "learning_rate": 4.181289877662263e-06, + "loss": 0.6483, + "step": 3579 + }, + { + "epoch": 2.120225051821143, + "grad_norm": 2.210349418550527, + "learning_rate": 4.176089525179961e-06, + "loss": 0.6482, + "step": 3580 + }, + { + "epoch": 2.12081729345573, + "grad_norm": 1.2899545185125494, + "learning_rate": 4.1708915550045385e-06, + "loss": 0.6031, + "step": 3581 + }, + { + "epoch": 2.121409535090317, + "grad_norm": 1.0636519968386704, + "learning_rate": 4.165695969262259e-06, + "loss": 0.6238, + "step": 3582 + }, + { + "epoch": 2.122001776724904, + "grad_norm": 1.8256192982659298, + "learning_rate": 4.160502770078407e-06, + "loss": 0.6507, + "step": 3583 + }, + { + "epoch": 2.1225940183594907, + "grad_norm": 1.3887604621578677, + "learning_rate": 4.155311959577289e-06, + "loss": 0.5671, + "step": 3584 + }, + { + "epoch": 2.1231862599940774, + "grad_norm": 1.0622378454165742, + "learning_rate": 4.150123539882246e-06, + "loss": 0.5976, + "step": 3585 + }, + { + "epoch": 2.1237785016286646, + "grad_norm": 1.3574063696461265, + "learning_rate": 4.144937513115627e-06, + "loss": 0.6602, + "step": 3586 + }, + { + "epoch": 2.1243707432632513, + "grad_norm": 1.4538940676417582, + "learning_rate": 4.139753881398813e-06, + "loss": 0.5957, + "step": 3587 + }, + { + "epoch": 2.1249629848978384, + "grad_norm": 2.264238665767258, + "learning_rate": 4.134572646852196e-06, + "loss": 0.5999, + "step": 3588 + }, + { + "epoch": 2.125555226532425, + "grad_norm": 1.2966266388996097, + "learning_rate": 4.1293938115952e-06, + "loss": 0.6102, + "step": 3589 + }, + { + "epoch": 2.1261474681670123, + "grad_norm": 33.071460074035244, + "learning_rate": 4.124217377746251e-06, + "loss": 0.566, + "step": 3590 + }, + { + "epoch": 2.126739709801599, + "grad_norm": 1.1430460336126136, + "learning_rate": 4.11904334742281e-06, + "loss": 0.6487, + "step": 3591 + }, + { + "epoch": 2.127331951436186, + "grad_norm": 1.340637625601947, + "learning_rate": 4.113871722741337e-06, + "loss": 0.614, + "step": 3592 + }, + { + "epoch": 2.127924193070773, + "grad_norm": 4.321691288846663, + "learning_rate": 4.108702505817327e-06, + "loss": 0.6521, + "step": 3593 + }, + { + "epoch": 2.12851643470536, + "grad_norm": 1.3840804956759036, + "learning_rate": 4.103535698765272e-06, + "loss": 0.6095, + "step": 3594 + }, + { + "epoch": 2.1291086763399467, + "grad_norm": 1.5085339460848268, + "learning_rate": 4.098371303698694e-06, + "loss": 0.5992, + "step": 3595 + }, + { + "epoch": 2.1297009179745334, + "grad_norm": 2.0820647527427854, + "learning_rate": 4.093209322730114e-06, + "loss": 0.5808, + "step": 3596 + }, + { + "epoch": 2.1302931596091206, + "grad_norm": 1.2113506268447882, + "learning_rate": 4.08804975797108e-06, + "loss": 0.6079, + "step": 3597 + }, + { + "epoch": 2.1308854012437073, + "grad_norm": 1.450986037398985, + "learning_rate": 4.082892611532136e-06, + "loss": 0.634, + "step": 3598 + }, + { + "epoch": 2.1314776428782944, + "grad_norm": 1.8781499277404807, + "learning_rate": 4.077737885522852e-06, + "loss": 0.6374, + "step": 3599 + }, + { + "epoch": 2.132069884512881, + "grad_norm": 1.7864320630965398, + "learning_rate": 4.072585582051798e-06, + "loss": 0.627, + "step": 3600 + }, + { + "epoch": 2.1326621261474683, + "grad_norm": 1.4956135203030114, + "learning_rate": 4.067435703226552e-06, + "loss": 0.6108, + "step": 3601 + }, + { + "epoch": 2.133254367782055, + "grad_norm": 2.2180019781234, + "learning_rate": 4.0622882511537076e-06, + "loss": 0.5863, + "step": 3602 + }, + { + "epoch": 2.133846609416642, + "grad_norm": 4.119143056595805, + "learning_rate": 4.057143227938866e-06, + "loss": 0.6186, + "step": 3603 + }, + { + "epoch": 2.134438851051229, + "grad_norm": 1.5204561646279764, + "learning_rate": 4.052000635686627e-06, + "loss": 0.5892, + "step": 3604 + }, + { + "epoch": 2.135031092685816, + "grad_norm": 4.47672704923902, + "learning_rate": 4.046860476500596e-06, + "loss": 0.6405, + "step": 3605 + }, + { + "epoch": 2.1356233343204027, + "grad_norm": 1.7257594440244672, + "learning_rate": 4.0417227524833925e-06, + "loss": 0.6228, + "step": 3606 + }, + { + "epoch": 2.1362155759549895, + "grad_norm": 1.0415334302691686, + "learning_rate": 4.036587465736635e-06, + "loss": 0.6577, + "step": 3607 + }, + { + "epoch": 2.1368078175895766, + "grad_norm": 1.265219398940147, + "learning_rate": 4.031454618360945e-06, + "loss": 0.6531, + "step": 3608 + }, + { + "epoch": 2.1374000592241633, + "grad_norm": 1.2617611265687647, + "learning_rate": 4.02632421245594e-06, + "loss": 0.5512, + "step": 3609 + }, + { + "epoch": 2.1379923008587505, + "grad_norm": 1.241378144123858, + "learning_rate": 4.021196250120248e-06, + "loss": 0.6223, + "step": 3610 + }, + { + "epoch": 2.138584542493337, + "grad_norm": 1.8843558424320774, + "learning_rate": 4.016070733451496e-06, + "loss": 0.6307, + "step": 3611 + }, + { + "epoch": 2.1391767841279243, + "grad_norm": 1.152700464562051, + "learning_rate": 4.0109476645463076e-06, + "loss": 0.6149, + "step": 3612 + }, + { + "epoch": 2.139769025762511, + "grad_norm": 1.646886331579089, + "learning_rate": 4.005827045500301e-06, + "loss": 0.6022, + "step": 3613 + }, + { + "epoch": 2.140361267397098, + "grad_norm": 1.4285955128879986, + "learning_rate": 4.000708878408103e-06, + "loss": 0.5811, + "step": 3614 + }, + { + "epoch": 2.140953509031685, + "grad_norm": 1.8299679014912458, + "learning_rate": 3.9955931653633365e-06, + "loss": 0.6211, + "step": 3615 + }, + { + "epoch": 2.141545750666272, + "grad_norm": 1.4096002284217504, + "learning_rate": 3.990479908458602e-06, + "loss": 0.6149, + "step": 3616 + }, + { + "epoch": 2.1421379923008588, + "grad_norm": 2.69120669503384, + "learning_rate": 3.985369109785516e-06, + "loss": 0.6343, + "step": 3617 + }, + { + "epoch": 2.1427302339354455, + "grad_norm": 1.1754488546881272, + "learning_rate": 3.980260771434685e-06, + "loss": 0.6071, + "step": 3618 + }, + { + "epoch": 2.1433224755700326, + "grad_norm": 1.6113726977096838, + "learning_rate": 3.975154895495711e-06, + "loss": 0.5882, + "step": 3619 + }, + { + "epoch": 2.1439147172046193, + "grad_norm": 2.791107478639297, + "learning_rate": 3.970051484057171e-06, + "loss": 0.635, + "step": 3620 + }, + { + "epoch": 2.1445069588392065, + "grad_norm": 1.5913773207532422, + "learning_rate": 3.9649505392066544e-06, + "loss": 0.5979, + "step": 3621 + }, + { + "epoch": 2.145099200473793, + "grad_norm": 1.5408491987361748, + "learning_rate": 3.959852063030738e-06, + "loss": 0.6264, + "step": 3622 + }, + { + "epoch": 2.1456914421083804, + "grad_norm": 1.4228715868052781, + "learning_rate": 3.9547560576149815e-06, + "loss": 0.6112, + "step": 3623 + }, + { + "epoch": 2.146283683742967, + "grad_norm": 1.894084960567437, + "learning_rate": 3.949662525043935e-06, + "loss": 0.6981, + "step": 3624 + }, + { + "epoch": 2.1468759253775542, + "grad_norm": 29.083821486159245, + "learning_rate": 3.944571467401142e-06, + "loss": 0.5861, + "step": 3625 + }, + { + "epoch": 2.147468167012141, + "grad_norm": 1.778409125554096, + "learning_rate": 3.939482886769136e-06, + "loss": 0.6135, + "step": 3626 + }, + { + "epoch": 2.148060408646728, + "grad_norm": 1.7787742540030977, + "learning_rate": 3.934396785229429e-06, + "loss": 0.5599, + "step": 3627 + }, + { + "epoch": 2.148652650281315, + "grad_norm": 1.9679765606795838, + "learning_rate": 3.929313164862518e-06, + "loss": 0.637, + "step": 3628 + }, + { + "epoch": 2.1492448919159015, + "grad_norm": 1.5563281004545866, + "learning_rate": 3.924232027747894e-06, + "loss": 0.6389, + "step": 3629 + }, + { + "epoch": 2.1498371335504887, + "grad_norm": 1.2422466217603876, + "learning_rate": 3.919153375964032e-06, + "loss": 0.567, + "step": 3630 + }, + { + "epoch": 2.1504293751850754, + "grad_norm": 2.00976111403471, + "learning_rate": 3.91407721158838e-06, + "loss": 0.5458, + "step": 3631 + }, + { + "epoch": 2.1510216168196625, + "grad_norm": 1.6951508776864774, + "learning_rate": 3.909003536697374e-06, + "loss": 0.605, + "step": 3632 + }, + { + "epoch": 2.1516138584542492, + "grad_norm": 1.558169994124345, + "learning_rate": 3.903932353366435e-06, + "loss": 0.595, + "step": 3633 + }, + { + "epoch": 2.1522061000888364, + "grad_norm": 1.6276022870031455, + "learning_rate": 3.898863663669965e-06, + "loss": 0.6334, + "step": 3634 + }, + { + "epoch": 2.152798341723423, + "grad_norm": 1.6198572914267186, + "learning_rate": 3.8937974696813405e-06, + "loss": 0.5621, + "step": 3635 + }, + { + "epoch": 2.1533905833580103, + "grad_norm": 1.9472727059260369, + "learning_rate": 3.888733773472916e-06, + "loss": 0.5736, + "step": 3636 + }, + { + "epoch": 2.153982824992597, + "grad_norm": 2.552984173348855, + "learning_rate": 3.883672577116035e-06, + "loss": 0.6166, + "step": 3637 + }, + { + "epoch": 2.154575066627184, + "grad_norm": 1.9217704150949895, + "learning_rate": 3.878613882681002e-06, + "loss": 0.6168, + "step": 3638 + }, + { + "epoch": 2.155167308261771, + "grad_norm": 1.5327012862258527, + "learning_rate": 3.873557692237119e-06, + "loss": 0.5939, + "step": 3639 + }, + { + "epoch": 2.1557595498963575, + "grad_norm": 1.4830748099663695, + "learning_rate": 3.868504007852641e-06, + "loss": 0.636, + "step": 3640 + }, + { + "epoch": 2.1563517915309447, + "grad_norm": 1.4934300428699974, + "learning_rate": 3.86345283159482e-06, + "loss": 0.5907, + "step": 3641 + }, + { + "epoch": 2.1569440331655314, + "grad_norm": 1.8303353542056642, + "learning_rate": 3.8584041655298606e-06, + "loss": 0.6217, + "step": 3642 + }, + { + "epoch": 2.1575362748001186, + "grad_norm": 2.6188924474056203, + "learning_rate": 3.853358011722961e-06, + "loss": 0.6247, + "step": 3643 + }, + { + "epoch": 2.1581285164347053, + "grad_norm": 1.9829717288285205, + "learning_rate": 3.848314372238272e-06, + "loss": 0.5971, + "step": 3644 + }, + { + "epoch": 2.1587207580692924, + "grad_norm": 2.9821607547301348, + "learning_rate": 3.8432732491389345e-06, + "loss": 0.6207, + "step": 3645 + }, + { + "epoch": 2.159312999703879, + "grad_norm": 1.4864857735219268, + "learning_rate": 3.838234644487045e-06, + "loss": 0.6185, + "step": 3646 + }, + { + "epoch": 2.1599052413384663, + "grad_norm": 2.2631386629045287, + "learning_rate": 3.833198560343682e-06, + "loss": 0.5928, + "step": 3647 + }, + { + "epoch": 2.160497482973053, + "grad_norm": 1.2595848974397195, + "learning_rate": 3.828164998768879e-06, + "loss": 0.6202, + "step": 3648 + }, + { + "epoch": 2.16108972460764, + "grad_norm": 1.2855787943939143, + "learning_rate": 3.8231339618216556e-06, + "loss": 0.5912, + "step": 3649 + }, + { + "epoch": 2.161681966242227, + "grad_norm": 3.733946616282587, + "learning_rate": 3.8181054515599806e-06, + "loss": 0.642, + "step": 3650 + }, + { + "epoch": 2.1622742078768136, + "grad_norm": 1.9716595141173552, + "learning_rate": 3.8130794700408027e-06, + "loss": 0.576, + "step": 3651 + }, + { + "epoch": 2.1628664495114007, + "grad_norm": 10.436223336411803, + "learning_rate": 3.8080560193200288e-06, + "loss": 0.59, + "step": 3652 + }, + { + "epoch": 2.1634586911459874, + "grad_norm": 1.6433990265618101, + "learning_rate": 3.803035101452531e-06, + "loss": 0.6323, + "step": 3653 + }, + { + "epoch": 2.1640509327805746, + "grad_norm": 1.4190321545816333, + "learning_rate": 3.798016718492148e-06, + "loss": 0.6114, + "step": 3654 + }, + { + "epoch": 2.1646431744151613, + "grad_norm": 1.559115691980612, + "learning_rate": 3.7930008724916846e-06, + "loss": 0.583, + "step": 3655 + }, + { + "epoch": 2.1652354160497485, + "grad_norm": 2.1204592019182655, + "learning_rate": 3.7879875655029018e-06, + "loss": 0.6153, + "step": 3656 + }, + { + "epoch": 2.165827657684335, + "grad_norm": 1.9790725686824444, + "learning_rate": 3.782976799576519e-06, + "loss": 0.6118, + "step": 3657 + }, + { + "epoch": 2.1664198993189223, + "grad_norm": 1.7029886439459894, + "learning_rate": 3.7779685767622255e-06, + "loss": 0.6315, + "step": 3658 + }, + { + "epoch": 2.167012140953509, + "grad_norm": 1.3790840194704055, + "learning_rate": 3.7729628991086687e-06, + "loss": 0.6036, + "step": 3659 + }, + { + "epoch": 2.167604382588096, + "grad_norm": 1.7802518189464025, + "learning_rate": 3.7679597686634495e-06, + "loss": 0.5877, + "step": 3660 + }, + { + "epoch": 2.168196624222683, + "grad_norm": 1.746849203801897, + "learning_rate": 3.7629591874731264e-06, + "loss": 0.6504, + "step": 3661 + }, + { + "epoch": 2.1687888658572696, + "grad_norm": 1.6998904736750884, + "learning_rate": 3.757961157583221e-06, + "loss": 0.6304, + "step": 3662 + }, + { + "epoch": 2.1693811074918568, + "grad_norm": 1.3810219749838843, + "learning_rate": 3.7529656810382133e-06, + "loss": 0.6296, + "step": 3663 + }, + { + "epoch": 2.1699733491264435, + "grad_norm": 1.298383302796027, + "learning_rate": 3.7479727598815287e-06, + "loss": 0.6187, + "step": 3664 + }, + { + "epoch": 2.1705655907610306, + "grad_norm": 2.1800686642840987, + "learning_rate": 3.7429823961555513e-06, + "loss": 0.6048, + "step": 3665 + }, + { + "epoch": 2.1711578323956173, + "grad_norm": 1.138658124740238, + "learning_rate": 3.7379945919016225e-06, + "loss": 0.659, + "step": 3666 + }, + { + "epoch": 2.1717500740302045, + "grad_norm": 1.3434860594717237, + "learning_rate": 3.733009349160042e-06, + "loss": 0.6048, + "step": 3667 + }, + { + "epoch": 2.172342315664791, + "grad_norm": 1.764600534002413, + "learning_rate": 3.7280266699700406e-06, + "loss": 0.5861, + "step": 3668 + }, + { + "epoch": 2.1729345572993783, + "grad_norm": 1.1095526363681014, + "learning_rate": 3.7230465563698214e-06, + "loss": 0.6066, + "step": 3669 + }, + { + "epoch": 2.173526798933965, + "grad_norm": 3.064677575014838, + "learning_rate": 3.7180690103965313e-06, + "loss": 0.6762, + "step": 3670 + }, + { + "epoch": 2.1741190405685518, + "grad_norm": 1.0892290245854241, + "learning_rate": 3.713094034086273e-06, + "loss": 0.6142, + "step": 3671 + }, + { + "epoch": 2.174711282203139, + "grad_norm": 2.014740928840878, + "learning_rate": 3.7081216294740773e-06, + "loss": 0.6006, + "step": 3672 + }, + { + "epoch": 2.1753035238377256, + "grad_norm": 1.6840077237428435, + "learning_rate": 3.703151798593945e-06, + "loss": 0.6111, + "step": 3673 + }, + { + "epoch": 2.175895765472313, + "grad_norm": 3.147435725567642, + "learning_rate": 3.6981845434788188e-06, + "loss": 0.59, + "step": 3674 + }, + { + "epoch": 2.1764880071068995, + "grad_norm": 2.3668044579752547, + "learning_rate": 3.693219866160582e-06, + "loss": 0.6486, + "step": 3675 + }, + { + "epoch": 2.1770802487414866, + "grad_norm": 1.7822979150577114, + "learning_rate": 3.688257768670065e-06, + "loss": 0.617, + "step": 3676 + }, + { + "epoch": 2.1776724903760734, + "grad_norm": 1.5174818432951496, + "learning_rate": 3.6832982530370465e-06, + "loss": 0.62, + "step": 3677 + }, + { + "epoch": 2.1782647320106605, + "grad_norm": 1.4448335060389763, + "learning_rate": 3.678341321290252e-06, + "loss": 0.6469, + "step": 3678 + }, + { + "epoch": 2.178856973645247, + "grad_norm": 1.55193829165598, + "learning_rate": 3.6733869754573403e-06, + "loss": 0.602, + "step": 3679 + }, + { + "epoch": 2.1794492152798344, + "grad_norm": 1.1427395519868442, + "learning_rate": 3.668435217564915e-06, + "loss": 0.6552, + "step": 3680 + }, + { + "epoch": 2.180041456914421, + "grad_norm": 1.3343442976110556, + "learning_rate": 3.663486049638527e-06, + "loss": 0.6427, + "step": 3681 + }, + { + "epoch": 2.180633698549008, + "grad_norm": 1.6318631166682127, + "learning_rate": 3.658539473702667e-06, + "loss": 0.6476, + "step": 3682 + }, + { + "epoch": 2.181225940183595, + "grad_norm": 1.4447436327409222, + "learning_rate": 3.65359549178076e-06, + "loss": 0.595, + "step": 3683 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 1.1309723434982337, + "learning_rate": 3.6486541058951696e-06, + "loss": 0.6008, + "step": 3684 + }, + { + "epoch": 2.182410423452769, + "grad_norm": 1.7100247943717297, + "learning_rate": 3.6437153180672034e-06, + "loss": 0.6291, + "step": 3685 + }, + { + "epoch": 2.1830026650873555, + "grad_norm": 1.6181277098556224, + "learning_rate": 3.638779130317106e-06, + "loss": 0.6039, + "step": 3686 + }, + { + "epoch": 2.1835949067219427, + "grad_norm": 1.5754065682938263, + "learning_rate": 3.633845544664053e-06, + "loss": 0.6282, + "step": 3687 + }, + { + "epoch": 2.1841871483565294, + "grad_norm": 2.344647272098969, + "learning_rate": 3.628914563126156e-06, + "loss": 0.6411, + "step": 3688 + }, + { + "epoch": 2.1847793899911165, + "grad_norm": 6.915326366107702, + "learning_rate": 3.6239861877204684e-06, + "loss": 0.5722, + "step": 3689 + }, + { + "epoch": 2.1853716316257032, + "grad_norm": 1.3363840965456972, + "learning_rate": 3.6190604204629685e-06, + "loss": 0.6376, + "step": 3690 + }, + { + "epoch": 2.1859638732602904, + "grad_norm": 1.5541366844193774, + "learning_rate": 3.6141372633685767e-06, + "loss": 0.6474, + "step": 3691 + }, + { + "epoch": 2.186556114894877, + "grad_norm": 2.254316560958191, + "learning_rate": 3.6092167184511352e-06, + "loss": 0.6008, + "step": 3692 + }, + { + "epoch": 2.187148356529464, + "grad_norm": 2.255989122727738, + "learning_rate": 3.6042987877234304e-06, + "loss": 0.6062, + "step": 3693 + }, + { + "epoch": 2.187740598164051, + "grad_norm": 1.320086179706794, + "learning_rate": 3.5993834731971654e-06, + "loss": 0.5777, + "step": 3694 + }, + { + "epoch": 2.1883328397986377, + "grad_norm": 1.8280860055033141, + "learning_rate": 3.594470776882989e-06, + "loss": 0.5831, + "step": 3695 + }, + { + "epoch": 2.188925081433225, + "grad_norm": 1.9043201529453062, + "learning_rate": 3.5895607007904597e-06, + "loss": 0.5996, + "step": 3696 + }, + { + "epoch": 2.1895173230678115, + "grad_norm": 2.358121027410781, + "learning_rate": 3.584653246928085e-06, + "loss": 0.6427, + "step": 3697 + }, + { + "epoch": 2.1901095647023987, + "grad_norm": 1.3402210675480135, + "learning_rate": 3.5797484173032806e-06, + "loss": 0.6018, + "step": 3698 + }, + { + "epoch": 2.1907018063369854, + "grad_norm": 2.2835653872932706, + "learning_rate": 3.5748462139224048e-06, + "loss": 0.656, + "step": 3699 + }, + { + "epoch": 2.1912940479715726, + "grad_norm": 1.1925267592647002, + "learning_rate": 3.569946638790729e-06, + "loss": 0.5937, + "step": 3700 + }, + { + "epoch": 2.1918862896061593, + "grad_norm": 6.950862477600291, + "learning_rate": 3.5650496939124602e-06, + "loss": 0.5998, + "step": 3701 + }, + { + "epoch": 2.1924785312407464, + "grad_norm": 1.5302796583951903, + "learning_rate": 3.5601553812907174e-06, + "loss": 0.6095, + "step": 3702 + }, + { + "epoch": 2.193070772875333, + "grad_norm": 1.6634344074466962, + "learning_rate": 3.555263702927558e-06, + "loss": 0.6575, + "step": 3703 + }, + { + "epoch": 2.19366301450992, + "grad_norm": 1.3910377831755019, + "learning_rate": 3.5503746608239487e-06, + "loss": 0.6121, + "step": 3704 + }, + { + "epoch": 2.194255256144507, + "grad_norm": 1.6713226754844968, + "learning_rate": 3.54548825697978e-06, + "loss": 0.641, + "step": 3705 + }, + { + "epoch": 2.1948474977790937, + "grad_norm": 1.5747338103446598, + "learning_rate": 3.5406044933938688e-06, + "loss": 0.6064, + "step": 3706 + }, + { + "epoch": 2.195439739413681, + "grad_norm": 1.5768868641879534, + "learning_rate": 3.535723372063952e-06, + "loss": 0.6197, + "step": 3707 + }, + { + "epoch": 2.1960319810482676, + "grad_norm": 1.6152013346052505, + "learning_rate": 3.5308448949866805e-06, + "loss": 0.6304, + "step": 3708 + }, + { + "epoch": 2.1966242226828547, + "grad_norm": 1.3489870236047408, + "learning_rate": 3.5259690641576216e-06, + "loss": 0.6066, + "step": 3709 + }, + { + "epoch": 2.1972164643174414, + "grad_norm": 2.0855784992139825, + "learning_rate": 3.5210958815712672e-06, + "loss": 0.5858, + "step": 3710 + }, + { + "epoch": 2.1978087059520286, + "grad_norm": 2.1655140503921855, + "learning_rate": 3.5162253492210276e-06, + "loss": 0.6431, + "step": 3711 + }, + { + "epoch": 2.1984009475866153, + "grad_norm": 1.3963981694086718, + "learning_rate": 3.5113574690992203e-06, + "loss": 0.6041, + "step": 3712 + }, + { + "epoch": 2.198993189221202, + "grad_norm": 1.9394182753615543, + "learning_rate": 3.5064922431970793e-06, + "loss": 0.5795, + "step": 3713 + }, + { + "epoch": 2.199585430855789, + "grad_norm": 1.3857185510161663, + "learning_rate": 3.5016296735047584e-06, + "loss": 0.6244, + "step": 3714 + }, + { + "epoch": 2.200177672490376, + "grad_norm": 1.6061122144253255, + "learning_rate": 3.496769762011325e-06, + "loss": 0.6293, + "step": 3715 + }, + { + "epoch": 2.200769914124963, + "grad_norm": 5.40581536454228, + "learning_rate": 3.4919125107047537e-06, + "loss": 0.6259, + "step": 3716 + }, + { + "epoch": 2.2013621557595497, + "grad_norm": 1.2657815177175666, + "learning_rate": 3.487057921571929e-06, + "loss": 0.5988, + "step": 3717 + }, + { + "epoch": 2.201954397394137, + "grad_norm": 1.7259059897471152, + "learning_rate": 3.482205996598654e-06, + "loss": 0.6134, + "step": 3718 + }, + { + "epoch": 2.2025466390287236, + "grad_norm": 1.184129395946116, + "learning_rate": 3.477356737769645e-06, + "loss": 0.6274, + "step": 3719 + }, + { + "epoch": 2.2031388806633108, + "grad_norm": 1.7443680463119289, + "learning_rate": 3.472510147068515e-06, + "loss": 0.6375, + "step": 3720 + }, + { + "epoch": 2.2037311222978975, + "grad_norm": 1.4046681220312973, + "learning_rate": 3.4676662264777905e-06, + "loss": 0.6456, + "step": 3721 + }, + { + "epoch": 2.2043233639324846, + "grad_norm": 2.5357065102499994, + "learning_rate": 3.4628249779789105e-06, + "loss": 0.6201, + "step": 3722 + }, + { + "epoch": 2.2049156055670713, + "grad_norm": 2.357882593644408, + "learning_rate": 3.4579864035522236e-06, + "loss": 0.6352, + "step": 3723 + }, + { + "epoch": 2.205507847201658, + "grad_norm": 1.2137268888170112, + "learning_rate": 3.4531505051769665e-06, + "loss": 0.6398, + "step": 3724 + }, + { + "epoch": 2.206100088836245, + "grad_norm": 2.239315397553286, + "learning_rate": 3.4483172848312994e-06, + "loss": 0.6236, + "step": 3725 + }, + { + "epoch": 2.206692330470832, + "grad_norm": 1.557932862394711, + "learning_rate": 3.4434867444922857e-06, + "loss": 0.5774, + "step": 3726 + }, + { + "epoch": 2.207284572105419, + "grad_norm": 1.5777287070135115, + "learning_rate": 3.438658886135884e-06, + "loss": 0.5508, + "step": 3727 + }, + { + "epoch": 2.2078768137400058, + "grad_norm": 1.413920945713966, + "learning_rate": 3.433833711736957e-06, + "loss": 0.6097, + "step": 3728 + }, + { + "epoch": 2.208469055374593, + "grad_norm": 1.258492807065135, + "learning_rate": 3.429011223269274e-06, + "loss": 0.6256, + "step": 3729 + }, + { + "epoch": 2.2090612970091796, + "grad_norm": 3.75247152278292, + "learning_rate": 3.4241914227055096e-06, + "loss": 0.5718, + "step": 3730 + }, + { + "epoch": 2.209653538643767, + "grad_norm": 3.7102331792806478, + "learning_rate": 3.4193743120172297e-06, + "loss": 0.5975, + "step": 3731 + }, + { + "epoch": 2.2102457802783535, + "grad_norm": 1.7080087366642982, + "learning_rate": 3.414559893174898e-06, + "loss": 0.5949, + "step": 3732 + }, + { + "epoch": 2.2108380219129407, + "grad_norm": 2.0252022919779153, + "learning_rate": 3.4097481681478873e-06, + "loss": 0.6596, + "step": 3733 + }, + { + "epoch": 2.2114302635475274, + "grad_norm": 1.7291383197002848, + "learning_rate": 3.4049391389044674e-06, + "loss": 0.581, + "step": 3734 + }, + { + "epoch": 2.212022505182114, + "grad_norm": 2.2935670684623286, + "learning_rate": 3.4001328074117977e-06, + "loss": 0.6159, + "step": 3735 + }, + { + "epoch": 2.2126147468167012, + "grad_norm": 1.5768762657428204, + "learning_rate": 3.3953291756359354e-06, + "loss": 0.6298, + "step": 3736 + }, + { + "epoch": 2.213206988451288, + "grad_norm": 2.260996804956774, + "learning_rate": 3.3905282455418375e-06, + "loss": 0.6006, + "step": 3737 + }, + { + "epoch": 2.213799230085875, + "grad_norm": 1.4582667950449473, + "learning_rate": 3.3857300190933606e-06, + "loss": 0.6607, + "step": 3738 + }, + { + "epoch": 2.214391471720462, + "grad_norm": 1.7196961502523231, + "learning_rate": 3.3809344982532435e-06, + "loss": 0.6154, + "step": 3739 + }, + { + "epoch": 2.214983713355049, + "grad_norm": 1.4224301706484694, + "learning_rate": 3.376141684983121e-06, + "loss": 0.6257, + "step": 3740 + }, + { + "epoch": 2.2155759549896357, + "grad_norm": 1.5242446120056001, + "learning_rate": 3.3713515812435305e-06, + "loss": 0.6395, + "step": 3741 + }, + { + "epoch": 2.216168196624223, + "grad_norm": 3.082276038619601, + "learning_rate": 3.366564188993887e-06, + "loss": 0.607, + "step": 3742 + }, + { + "epoch": 2.2167604382588095, + "grad_norm": 1.1091952397891178, + "learning_rate": 3.36177951019251e-06, + "loss": 0.5673, + "step": 3743 + }, + { + "epoch": 2.2173526798933967, + "grad_norm": 1.0623328174773825, + "learning_rate": 3.3569975467965955e-06, + "loss": 0.6197, + "step": 3744 + }, + { + "epoch": 2.2179449215279834, + "grad_norm": 1.9139569358167408, + "learning_rate": 3.352218300762241e-06, + "loss": 0.6325, + "step": 3745 + }, + { + "epoch": 2.21853716316257, + "grad_norm": 1.754753069962319, + "learning_rate": 3.347441774044421e-06, + "loss": 0.6194, + "step": 3746 + }, + { + "epoch": 2.2191294047971573, + "grad_norm": 1.311850230899881, + "learning_rate": 3.3426679685970096e-06, + "loss": 0.6365, + "step": 3747 + }, + { + "epoch": 2.219721646431744, + "grad_norm": 2.3171585866726243, + "learning_rate": 3.337896886372757e-06, + "loss": 0.6015, + "step": 3748 + }, + { + "epoch": 2.220313888066331, + "grad_norm": 3.610142905808541, + "learning_rate": 3.3331285293233086e-06, + "loss": 0.5806, + "step": 3749 + }, + { + "epoch": 2.220906129700918, + "grad_norm": 1.4393742110166086, + "learning_rate": 3.3283628993991846e-06, + "loss": 0.5956, + "step": 3750 + }, + { + "epoch": 2.221498371335505, + "grad_norm": 1.5219021843139517, + "learning_rate": 3.3235999985498036e-06, + "loss": 0.6268, + "step": 3751 + }, + { + "epoch": 2.2220906129700917, + "grad_norm": 8.505541739782977, + "learning_rate": 3.3188398287234504e-06, + "loss": 0.602, + "step": 3752 + }, + { + "epoch": 2.222682854604679, + "grad_norm": 1.4777906472192541, + "learning_rate": 3.3140823918673117e-06, + "loss": 0.5723, + "step": 3753 + }, + { + "epoch": 2.2232750962392656, + "grad_norm": 3.427821946647888, + "learning_rate": 3.3093276899274373e-06, + "loss": 0.6404, + "step": 3754 + }, + { + "epoch": 2.2238673378738527, + "grad_norm": 1.5440256787529059, + "learning_rate": 3.3045757248487763e-06, + "loss": 0.5923, + "step": 3755 + }, + { + "epoch": 2.2244595795084394, + "grad_norm": 1.0121502394479982, + "learning_rate": 3.2998264985751425e-06, + "loss": 0.6091, + "step": 3756 + }, + { + "epoch": 2.225051821143026, + "grad_norm": 1.435124556629406, + "learning_rate": 3.2950800130492434e-06, + "loss": 0.5865, + "step": 3757 + }, + { + "epoch": 2.2256440627776133, + "grad_norm": 1.400676794978446, + "learning_rate": 3.2903362702126516e-06, + "loss": 0.5851, + "step": 3758 + }, + { + "epoch": 2.2262363044122, + "grad_norm": 2.776885503195463, + "learning_rate": 3.2855952720058303e-06, + "loss": 0.6104, + "step": 3759 + }, + { + "epoch": 2.226828546046787, + "grad_norm": 1.2754379174271564, + "learning_rate": 3.2808570203681135e-06, + "loss": 0.6243, + "step": 3760 + }, + { + "epoch": 2.227420787681374, + "grad_norm": 2.047090232279367, + "learning_rate": 3.2761215172377057e-06, + "loss": 0.6244, + "step": 3761 + }, + { + "epoch": 2.228013029315961, + "grad_norm": 1.9009466375062916, + "learning_rate": 3.271388764551702e-06, + "loss": 0.6162, + "step": 3762 + }, + { + "epoch": 2.2286052709505477, + "grad_norm": 1.7400174906099655, + "learning_rate": 3.266658764246062e-06, + "loss": 0.5828, + "step": 3763 + }, + { + "epoch": 2.229197512585135, + "grad_norm": 1.4745943692336556, + "learning_rate": 3.2619315182556234e-06, + "loss": 0.6394, + "step": 3764 + }, + { + "epoch": 2.2297897542197216, + "grad_norm": 2.7463760563137667, + "learning_rate": 3.25720702851409e-06, + "loss": 0.5372, + "step": 3765 + }, + { + "epoch": 2.2303819958543087, + "grad_norm": 1.291326363777691, + "learning_rate": 3.2524852969540477e-06, + "loss": 0.6244, + "step": 3766 + }, + { + "epoch": 2.2309742374888955, + "grad_norm": 3.0151646713135456, + "learning_rate": 3.2477663255069536e-06, + "loss": 0.5868, + "step": 3767 + }, + { + "epoch": 2.231566479123482, + "grad_norm": 1.3901935328359527, + "learning_rate": 3.243050116103128e-06, + "loss": 0.6149, + "step": 3768 + }, + { + "epoch": 2.2321587207580693, + "grad_norm": 2.8729426761941332, + "learning_rate": 3.2383366706717647e-06, + "loss": 0.6374, + "step": 3769 + }, + { + "epoch": 2.232750962392656, + "grad_norm": 1.3514288298661428, + "learning_rate": 3.2336259911409283e-06, + "loss": 0.6504, + "step": 3770 + }, + { + "epoch": 2.233343204027243, + "grad_norm": 1.2214092762638862, + "learning_rate": 3.228918079437556e-06, + "loss": 0.6331, + "step": 3771 + }, + { + "epoch": 2.23393544566183, + "grad_norm": 1.9604923888716306, + "learning_rate": 3.2242129374874478e-06, + "loss": 0.5907, + "step": 3772 + }, + { + "epoch": 2.234527687296417, + "grad_norm": 1.726601509131056, + "learning_rate": 3.219510567215264e-06, + "loss": 0.6266, + "step": 3773 + }, + { + "epoch": 2.2351199289310038, + "grad_norm": 1.518643995060012, + "learning_rate": 3.2148109705445442e-06, + "loss": 0.6445, + "step": 3774 + }, + { + "epoch": 2.235712170565591, + "grad_norm": 1.386321333561844, + "learning_rate": 3.2101141493976938e-06, + "loss": 0.6196, + "step": 3775 + }, + { + "epoch": 2.2363044122001776, + "grad_norm": 1.5786767296496107, + "learning_rate": 3.205420105695963e-06, + "loss": 0.6102, + "step": 3776 + }, + { + "epoch": 2.2368966538347648, + "grad_norm": 1.2013453929038438, + "learning_rate": 3.200728841359487e-06, + "loss": 0.6102, + "step": 3777 + }, + { + "epoch": 2.2374888954693515, + "grad_norm": 1.4238732131144223, + "learning_rate": 3.1960403583072596e-06, + "loss": 0.5799, + "step": 3778 + }, + { + "epoch": 2.238081137103938, + "grad_norm": 1.2261171846122096, + "learning_rate": 3.191354658457131e-06, + "loss": 0.6501, + "step": 3779 + }, + { + "epoch": 2.2386733787385253, + "grad_norm": 1.5582931846027446, + "learning_rate": 3.186671743725812e-06, + "loss": 0.6227, + "step": 3780 + }, + { + "epoch": 2.239265620373112, + "grad_norm": 1.4151694573036904, + "learning_rate": 3.181991616028882e-06, + "loss": 0.6248, + "step": 3781 + }, + { + "epoch": 2.239857862007699, + "grad_norm": 1.2588101337192434, + "learning_rate": 3.1773142772807796e-06, + "loss": 0.5951, + "step": 3782 + }, + { + "epoch": 2.240450103642286, + "grad_norm": 1.4993382133597462, + "learning_rate": 3.172639729394795e-06, + "loss": 0.6159, + "step": 3783 + }, + { + "epoch": 2.241042345276873, + "grad_norm": 1.8917475122461924, + "learning_rate": 3.1679679742830806e-06, + "loss": 0.5925, + "step": 3784 + }, + { + "epoch": 2.24163458691146, + "grad_norm": 2.418861532155358, + "learning_rate": 3.1632990138566467e-06, + "loss": 0.614, + "step": 3785 + }, + { + "epoch": 2.242226828546047, + "grad_norm": 1.705180026300204, + "learning_rate": 3.158632850025367e-06, + "loss": 0.5872, + "step": 3786 + }, + { + "epoch": 2.2428190701806336, + "grad_norm": 1.377966079629573, + "learning_rate": 3.1539694846979594e-06, + "loss": 0.6421, + "step": 3787 + }, + { + "epoch": 2.243411311815221, + "grad_norm": 1.58665211047525, + "learning_rate": 3.1493089197820015e-06, + "loss": 0.6665, + "step": 3788 + }, + { + "epoch": 2.2440035534498075, + "grad_norm": 1.338148955510755, + "learning_rate": 3.1446511571839267e-06, + "loss": 0.6043, + "step": 3789 + }, + { + "epoch": 2.244595795084394, + "grad_norm": 1.1705176333063276, + "learning_rate": 3.139996198809028e-06, + "loss": 0.6098, + "step": 3790 + }, + { + "epoch": 2.2451880367189814, + "grad_norm": 1.8169627762543634, + "learning_rate": 3.1353440465614403e-06, + "loss": 0.6236, + "step": 3791 + }, + { + "epoch": 2.245780278353568, + "grad_norm": 1.291671583977258, + "learning_rate": 3.1306947023441524e-06, + "loss": 0.616, + "step": 3792 + }, + { + "epoch": 2.2463725199881552, + "grad_norm": 3.0359361674296963, + "learning_rate": 3.1260481680590116e-06, + "loss": 0.6486, + "step": 3793 + }, + { + "epoch": 2.246964761622742, + "grad_norm": 1.4138943360346141, + "learning_rate": 3.121404445606714e-06, + "loss": 0.654, + "step": 3794 + }, + { + "epoch": 2.247557003257329, + "grad_norm": 1.5503123140519945, + "learning_rate": 3.1167635368867997e-06, + "loss": 0.6364, + "step": 3795 + }, + { + "epoch": 2.248149244891916, + "grad_norm": 1.4287633207856516, + "learning_rate": 3.112125443797659e-06, + "loss": 0.5875, + "step": 3796 + }, + { + "epoch": 2.248741486526503, + "grad_norm": 2.002912860104456, + "learning_rate": 3.107490168236539e-06, + "loss": 0.6282, + "step": 3797 + }, + { + "epoch": 2.2493337281610897, + "grad_norm": 1.4230050402558123, + "learning_rate": 3.1028577120995216e-06, + "loss": 0.6012, + "step": 3798 + }, + { + "epoch": 2.249925969795677, + "grad_norm": 1.877917813203428, + "learning_rate": 3.0982280772815478e-06, + "loss": 0.6339, + "step": 3799 + }, + { + "epoch": 2.2505182114302635, + "grad_norm": 1.6288711989281626, + "learning_rate": 3.0936012656763937e-06, + "loss": 0.5788, + "step": 3800 + }, + { + "epoch": 2.2511104530648502, + "grad_norm": 1.6985228890250441, + "learning_rate": 3.0889772791766892e-06, + "loss": 0.6384, + "step": 3801 + }, + { + "epoch": 2.2517026946994374, + "grad_norm": 1.2149735698196262, + "learning_rate": 3.0843561196739013e-06, + "loss": 0.6121, + "step": 3802 + }, + { + "epoch": 2.252294936334024, + "grad_norm": 1.8781077925643568, + "learning_rate": 3.079737789058348e-06, + "loss": 0.6069, + "step": 3803 + }, + { + "epoch": 2.2528871779686113, + "grad_norm": 2.685977214125362, + "learning_rate": 3.075122289219181e-06, + "loss": 0.5838, + "step": 3804 + }, + { + "epoch": 2.253479419603198, + "grad_norm": 1.951312531114018, + "learning_rate": 3.0705096220444066e-06, + "loss": 0.5911, + "step": 3805 + }, + { + "epoch": 2.254071661237785, + "grad_norm": 1.9964640855025602, + "learning_rate": 3.0658997894208573e-06, + "loss": 0.5825, + "step": 3806 + }, + { + "epoch": 2.254663902872372, + "grad_norm": 1.4301827317146103, + "learning_rate": 3.0612927932342205e-06, + "loss": 0.5751, + "step": 3807 + }, + { + "epoch": 2.255256144506959, + "grad_norm": 1.3725748768219552, + "learning_rate": 3.0566886353690106e-06, + "loss": 0.6418, + "step": 3808 + }, + { + "epoch": 2.2558483861415457, + "grad_norm": 1.7520734317776463, + "learning_rate": 3.052087317708593e-06, + "loss": 0.6551, + "step": 3809 + }, + { + "epoch": 2.256440627776133, + "grad_norm": 1.6914593574417534, + "learning_rate": 3.047488842135159e-06, + "loss": 0.6149, + "step": 3810 + }, + { + "epoch": 2.2570328694107196, + "grad_norm": 1.15940591732544, + "learning_rate": 3.0428932105297516e-06, + "loss": 0.5848, + "step": 3811 + }, + { + "epoch": 2.2576251110453063, + "grad_norm": 1.2240564347576872, + "learning_rate": 3.038300424772237e-06, + "loss": 0.6324, + "step": 3812 + }, + { + "epoch": 2.2582173526798934, + "grad_norm": 1.4337226917688144, + "learning_rate": 3.0337104867413215e-06, + "loss": 0.678, + "step": 3813 + }, + { + "epoch": 2.25880959431448, + "grad_norm": 1.691348892207366, + "learning_rate": 3.0291233983145494e-06, + "loss": 0.6343, + "step": 3814 + }, + { + "epoch": 2.2594018359490673, + "grad_norm": 1.5404862204108754, + "learning_rate": 3.0245391613683027e-06, + "loss": 0.5813, + "step": 3815 + }, + { + "epoch": 2.259994077583654, + "grad_norm": 3.706170880625637, + "learning_rate": 3.019957777777788e-06, + "loss": 0.5958, + "step": 3816 + }, + { + "epoch": 2.260586319218241, + "grad_norm": 1.4585279326793774, + "learning_rate": 3.015379249417045e-06, + "loss": 0.6097, + "step": 3817 + }, + { + "epoch": 2.261178560852828, + "grad_norm": 1.9982994091231006, + "learning_rate": 3.010803578158954e-06, + "loss": 0.6247, + "step": 3818 + }, + { + "epoch": 2.261770802487415, + "grad_norm": 17.925157107499732, + "learning_rate": 3.006230765875224e-06, + "loss": 0.6337, + "step": 3819 + }, + { + "epoch": 2.2623630441220017, + "grad_norm": 2.42901036591496, + "learning_rate": 3.00166081443639e-06, + "loss": 0.6131, + "step": 3820 + }, + { + "epoch": 2.262955285756589, + "grad_norm": 1.4874686031510487, + "learning_rate": 2.997093725711815e-06, + "loss": 0.6154, + "step": 3821 + }, + { + "epoch": 2.2635475273911756, + "grad_norm": 1.5630584463741388, + "learning_rate": 2.9925295015696978e-06, + "loss": 0.6295, + "step": 3822 + }, + { + "epoch": 2.2641397690257623, + "grad_norm": 2.901494892160737, + "learning_rate": 2.987968143877068e-06, + "loss": 0.6562, + "step": 3823 + }, + { + "epoch": 2.2647320106603495, + "grad_norm": 1.97840113224956, + "learning_rate": 2.9834096544997725e-06, + "loss": 0.633, + "step": 3824 + }, + { + "epoch": 2.265324252294936, + "grad_norm": 2.1415089601168034, + "learning_rate": 2.9788540353024863e-06, + "loss": 0.5627, + "step": 3825 + }, + { + "epoch": 2.2659164939295233, + "grad_norm": 1.8670402712788894, + "learning_rate": 2.9743012881487187e-06, + "loss": 0.579, + "step": 3826 + }, + { + "epoch": 2.26650873556411, + "grad_norm": 2.79632330229638, + "learning_rate": 2.9697514149008044e-06, + "loss": 0.6382, + "step": 3827 + }, + { + "epoch": 2.267100977198697, + "grad_norm": 2.1965219327895813, + "learning_rate": 2.965204417419886e-06, + "loss": 0.595, + "step": 3828 + }, + { + "epoch": 2.267693218833284, + "grad_norm": 1.456269981743204, + "learning_rate": 2.960660297565945e-06, + "loss": 0.6127, + "step": 3829 + }, + { + "epoch": 2.268285460467871, + "grad_norm": 1.5711432419219262, + "learning_rate": 2.956119057197785e-06, + "loss": 0.6269, + "step": 3830 + }, + { + "epoch": 2.2688777021024578, + "grad_norm": 2.2290008657723965, + "learning_rate": 2.9515806981730322e-06, + "loss": 0.6162, + "step": 3831 + }, + { + "epoch": 2.269469943737045, + "grad_norm": 1.8539455895230874, + "learning_rate": 2.9470452223481206e-06, + "loss": 0.6062, + "step": 3832 + }, + { + "epoch": 2.2700621853716316, + "grad_norm": 1.7914148986066236, + "learning_rate": 2.942512631578318e-06, + "loss": 0.6494, + "step": 3833 + }, + { + "epoch": 2.2706544270062183, + "grad_norm": 1.579902714082181, + "learning_rate": 2.9379829277177152e-06, + "loss": 0.6123, + "step": 3834 + }, + { + "epoch": 2.2712466686408055, + "grad_norm": 1.8966069617893526, + "learning_rate": 2.933456112619212e-06, + "loss": 0.6139, + "step": 3835 + }, + { + "epoch": 2.271838910275392, + "grad_norm": 3.372987103884868, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.6225, + "step": 3836 + }, + { + "epoch": 2.2724311519099794, + "grad_norm": 2.358810973987772, + "learning_rate": 2.9244111561141997e-06, + "loss": 0.6399, + "step": 3837 + }, + { + "epoch": 2.273023393544566, + "grad_norm": 1.660137040911655, + "learning_rate": 2.9198930184075944e-06, + "loss": 0.5812, + "step": 3838 + }, + { + "epoch": 2.273615635179153, + "grad_norm": 2.200855422546957, + "learning_rate": 2.915377776862878e-06, + "loss": 0.6089, + "step": 3839 + }, + { + "epoch": 2.27420787681374, + "grad_norm": 1.6572553582906615, + "learning_rate": 2.9108654333270346e-06, + "loss": 0.5949, + "step": 3840 + }, + { + "epoch": 2.274800118448327, + "grad_norm": 1.4498432049468442, + "learning_rate": 2.9063559896458704e-06, + "loss": 0.6329, + "step": 3841 + }, + { + "epoch": 2.275392360082914, + "grad_norm": 1.1737273223504179, + "learning_rate": 2.901849447664008e-06, + "loss": 0.6561, + "step": 3842 + }, + { + "epoch": 2.275984601717501, + "grad_norm": 2.66610971272177, + "learning_rate": 2.897345809224864e-06, + "loss": 0.5836, + "step": 3843 + }, + { + "epoch": 2.2765768433520877, + "grad_norm": 1.839143446701005, + "learning_rate": 2.892845076170685e-06, + "loss": 0.6346, + "step": 3844 + }, + { + "epoch": 2.2771690849866744, + "grad_norm": 1.60827795865389, + "learning_rate": 2.8883472503425236e-06, + "loss": 0.6318, + "step": 3845 + }, + { + "epoch": 2.2777613266212615, + "grad_norm": 1.5301557763688713, + "learning_rate": 2.8838523335802525e-06, + "loss": 0.6759, + "step": 3846 + }, + { + "epoch": 2.2783535682558482, + "grad_norm": 1.5562091375300113, + "learning_rate": 2.8793603277225302e-06, + "loss": 0.6203, + "step": 3847 + }, + { + "epoch": 2.2789458098904354, + "grad_norm": 1.588136398742716, + "learning_rate": 2.8748712346068464e-06, + "loss": 0.5912, + "step": 3848 + }, + { + "epoch": 2.279538051525022, + "grad_norm": 1.841240319682772, + "learning_rate": 2.8703850560694966e-06, + "loss": 0.6484, + "step": 3849 + }, + { + "epoch": 2.2801302931596092, + "grad_norm": 8.661704543502498, + "learning_rate": 2.865901793945576e-06, + "loss": 0.6019, + "step": 3850 + }, + { + "epoch": 2.280722534794196, + "grad_norm": 1.9390476098819909, + "learning_rate": 2.8614214500689886e-06, + "loss": 0.6344, + "step": 3851 + }, + { + "epoch": 2.281314776428783, + "grad_norm": 1.4608667959226727, + "learning_rate": 2.8569440262724502e-06, + "loss": 0.608, + "step": 3852 + }, + { + "epoch": 2.28190701806337, + "grad_norm": 2.253638236763826, + "learning_rate": 2.8524695243874814e-06, + "loss": 0.6115, + "step": 3853 + }, + { + "epoch": 2.282499259697957, + "grad_norm": 1.5747389470810782, + "learning_rate": 2.8479979462444017e-06, + "loss": 0.6291, + "step": 3854 + }, + { + "epoch": 2.2830915013325437, + "grad_norm": 1.216195294388743, + "learning_rate": 2.8435292936723356e-06, + "loss": 0.6106, + "step": 3855 + }, + { + "epoch": 2.2836837429671304, + "grad_norm": 2.7276242343011745, + "learning_rate": 2.8390635684992163e-06, + "loss": 0.5509, + "step": 3856 + }, + { + "epoch": 2.2842759846017175, + "grad_norm": 2.323606613377955, + "learning_rate": 2.83460077255178e-06, + "loss": 0.5926, + "step": 3857 + }, + { + "epoch": 2.2848682262363043, + "grad_norm": 2.38216532290336, + "learning_rate": 2.8301409076555574e-06, + "loss": 0.6453, + "step": 3858 + }, + { + "epoch": 2.2854604678708914, + "grad_norm": 1.3563075087072833, + "learning_rate": 2.8256839756348807e-06, + "loss": 0.6039, + "step": 3859 + }, + { + "epoch": 2.286052709505478, + "grad_norm": 1.2668051387431, + "learning_rate": 2.821229978312889e-06, + "loss": 0.5929, + "step": 3860 + }, + { + "epoch": 2.2866449511400653, + "grad_norm": 1.2956377312836582, + "learning_rate": 2.8167789175115223e-06, + "loss": 0.6503, + "step": 3861 + }, + { + "epoch": 2.287237192774652, + "grad_norm": 1.9302456323206396, + "learning_rate": 2.8123307950515087e-06, + "loss": 0.6626, + "step": 3862 + }, + { + "epoch": 2.287829434409239, + "grad_norm": 1.6044535773095645, + "learning_rate": 2.80788561275238e-06, + "loss": 0.5979, + "step": 3863 + }, + { + "epoch": 2.288421676043826, + "grad_norm": 2.2939252755635975, + "learning_rate": 2.8034433724324716e-06, + "loss": 0.6646, + "step": 3864 + }, + { + "epoch": 2.289013917678413, + "grad_norm": 2.471817823244044, + "learning_rate": 2.7990040759089022e-06, + "loss": 0.5829, + "step": 3865 + }, + { + "epoch": 2.2896061593129997, + "grad_norm": 3.3806573958288566, + "learning_rate": 2.7945677249976e-06, + "loss": 0.6553, + "step": 3866 + }, + { + "epoch": 2.2901984009475864, + "grad_norm": 2.737369171178317, + "learning_rate": 2.7901343215132758e-06, + "loss": 0.6349, + "step": 3867 + }, + { + "epoch": 2.2907906425821736, + "grad_norm": 2.298571285772045, + "learning_rate": 2.7857038672694492e-06, + "loss": 0.5691, + "step": 3868 + }, + { + "epoch": 2.2913828842167603, + "grad_norm": 1.5520258709926762, + "learning_rate": 2.7812763640784155e-06, + "loss": 0.6444, + "step": 3869 + }, + { + "epoch": 2.2919751258513474, + "grad_norm": 1.7364947241082105, + "learning_rate": 2.776851813751281e-06, + "loss": 0.6432, + "step": 3870 + }, + { + "epoch": 2.292567367485934, + "grad_norm": 1.5769815666091875, + "learning_rate": 2.77243021809793e-06, + "loss": 0.6317, + "step": 3871 + }, + { + "epoch": 2.2931596091205213, + "grad_norm": 2.137316840538586, + "learning_rate": 2.7680115789270478e-06, + "loss": 0.6133, + "step": 3872 + }, + { + "epoch": 2.293751850755108, + "grad_norm": 2.816072721070139, + "learning_rate": 2.763595898046101e-06, + "loss": 0.562, + "step": 3873 + }, + { + "epoch": 2.294344092389695, + "grad_norm": 1.3276083688656188, + "learning_rate": 2.7591831772613576e-06, + "loss": 0.5834, + "step": 3874 + }, + { + "epoch": 2.294936334024282, + "grad_norm": 1.2689499619456701, + "learning_rate": 2.754773418377863e-06, + "loss": 0.6007, + "step": 3875 + }, + { + "epoch": 2.295528575658869, + "grad_norm": 3.840896322229496, + "learning_rate": 2.750366623199462e-06, + "loss": 0.6271, + "step": 3876 + }, + { + "epoch": 2.2961208172934557, + "grad_norm": 1.1950201080078542, + "learning_rate": 2.745962793528775e-06, + "loss": 0.585, + "step": 3877 + }, + { + "epoch": 2.2967130589280425, + "grad_norm": 2.0462234251989484, + "learning_rate": 2.7415619311672236e-06, + "loss": 0.639, + "step": 3878 + }, + { + "epoch": 2.2973053005626296, + "grad_norm": 1.853413330196118, + "learning_rate": 2.7371640379150032e-06, + "loss": 0.6103, + "step": 3879 + }, + { + "epoch": 2.2978975421972163, + "grad_norm": 2.1374274208984647, + "learning_rate": 2.7327691155710978e-06, + "loss": 0.5992, + "step": 3880 + }, + { + "epoch": 2.2984897838318035, + "grad_norm": 1.9972945709906602, + "learning_rate": 2.7283771659332805e-06, + "loss": 0.62, + "step": 3881 + }, + { + "epoch": 2.29908202546639, + "grad_norm": 1.0982870260711513, + "learning_rate": 2.723988190798108e-06, + "loss": 0.6119, + "step": 3882 + }, + { + "epoch": 2.2996742671009773, + "grad_norm": 1.2876134638144012, + "learning_rate": 2.7196021919609163e-06, + "loss": 0.6366, + "step": 3883 + }, + { + "epoch": 2.300266508735564, + "grad_norm": 1.8016509667404628, + "learning_rate": 2.7152191712158207e-06, + "loss": 0.6041, + "step": 3884 + }, + { + "epoch": 2.300858750370151, + "grad_norm": 1.2850926199615083, + "learning_rate": 2.710839130355727e-06, + "loss": 0.6157, + "step": 3885 + }, + { + "epoch": 2.301450992004738, + "grad_norm": 1.4731799841992599, + "learning_rate": 2.706462071172322e-06, + "loss": 0.5742, + "step": 3886 + }, + { + "epoch": 2.302043233639325, + "grad_norm": 1.7403825798627295, + "learning_rate": 2.7020879954560642e-06, + "loss": 0.6451, + "step": 3887 + }, + { + "epoch": 2.3026354752739118, + "grad_norm": 3.5575301409939653, + "learning_rate": 2.697716904996196e-06, + "loss": 0.6125, + "step": 3888 + }, + { + "epoch": 2.3032277169084985, + "grad_norm": 1.283131812355814, + "learning_rate": 2.6933488015807406e-06, + "loss": 0.6057, + "step": 3889 + }, + { + "epoch": 2.3038199585430856, + "grad_norm": 3.1172645215689783, + "learning_rate": 2.6889836869965016e-06, + "loss": 0.6172, + "step": 3890 + }, + { + "epoch": 2.3044122001776723, + "grad_norm": 2.218448251870392, + "learning_rate": 2.6846215630290516e-06, + "loss": 0.6496, + "step": 3891 + }, + { + "epoch": 2.3050044418122595, + "grad_norm": 2.3254463534891796, + "learning_rate": 2.6802624314627436e-06, + "loss": 0.6223, + "step": 3892 + }, + { + "epoch": 2.305596683446846, + "grad_norm": 1.2275387722818925, + "learning_rate": 2.67590629408071e-06, + "loss": 0.5998, + "step": 3893 + }, + { + "epoch": 2.3061889250814334, + "grad_norm": 3.0555438618968584, + "learning_rate": 2.6715531526648585e-06, + "loss": 0.6273, + "step": 3894 + }, + { + "epoch": 2.30678116671602, + "grad_norm": 1.5752564674463383, + "learning_rate": 2.6672030089958668e-06, + "loss": 0.6596, + "step": 3895 + }, + { + "epoch": 2.3073734083506072, + "grad_norm": 1.7511275969987508, + "learning_rate": 2.6628558648531845e-06, + "loss": 0.5788, + "step": 3896 + }, + { + "epoch": 2.307965649985194, + "grad_norm": 0.9655204456900459, + "learning_rate": 2.6585117220150403e-06, + "loss": 0.639, + "step": 3897 + }, + { + "epoch": 2.308557891619781, + "grad_norm": 1.1990519232172137, + "learning_rate": 2.654170582258441e-06, + "loss": 0.5954, + "step": 3898 + }, + { + "epoch": 2.309150133254368, + "grad_norm": 3.208547550346025, + "learning_rate": 2.649832447359142e-06, + "loss": 0.6034, + "step": 3899 + }, + { + "epoch": 2.3097423748889545, + "grad_norm": 2.2300260612204577, + "learning_rate": 2.645497319091692e-06, + "loss": 0.5987, + "step": 3900 + }, + { + "epoch": 2.3103346165235417, + "grad_norm": 2.0946840308163543, + "learning_rate": 2.6411651992294065e-06, + "loss": 0.6319, + "step": 3901 + }, + { + "epoch": 2.3109268581581284, + "grad_norm": 2.079061501961239, + "learning_rate": 2.63683608954436e-06, + "loss": 0.636, + "step": 3902 + }, + { + "epoch": 2.3115190997927155, + "grad_norm": 1.2445711582719796, + "learning_rate": 2.6325099918074017e-06, + "loss": 0.6395, + "step": 3903 + }, + { + "epoch": 2.3121113414273022, + "grad_norm": 1.864032684312602, + "learning_rate": 2.6281869077881507e-06, + "loss": 0.5782, + "step": 3904 + }, + { + "epoch": 2.3127035830618894, + "grad_norm": 1.3316080049678458, + "learning_rate": 2.6238668392549947e-06, + "loss": 0.5901, + "step": 3905 + }, + { + "epoch": 2.313295824696476, + "grad_norm": 1.6276785587062519, + "learning_rate": 2.619549787975081e-06, + "loss": 0.6372, + "step": 3906 + }, + { + "epoch": 2.3138880663310633, + "grad_norm": 5.7918361147135045, + "learning_rate": 2.615235755714324e-06, + "loss": 0.6331, + "step": 3907 + }, + { + "epoch": 2.31448030796565, + "grad_norm": 2.2663927095141925, + "learning_rate": 2.6109247442374088e-06, + "loss": 0.6412, + "step": 3908 + }, + { + "epoch": 2.315072549600237, + "grad_norm": 7.124389289252508, + "learning_rate": 2.6066167553077826e-06, + "loss": 0.5689, + "step": 3909 + }, + { + "epoch": 2.315664791234824, + "grad_norm": 1.363067404396117, + "learning_rate": 2.602311790687655e-06, + "loss": 0.6523, + "step": 3910 + }, + { + "epoch": 2.3162570328694105, + "grad_norm": 1.5813779524915303, + "learning_rate": 2.5980098521379936e-06, + "loss": 0.6186, + "step": 3911 + }, + { + "epoch": 2.3168492745039977, + "grad_norm": 1.481882250550122, + "learning_rate": 2.593710941418537e-06, + "loss": 0.5903, + "step": 3912 + }, + { + "epoch": 2.3174415161385844, + "grad_norm": 1.9025600054836969, + "learning_rate": 2.5894150602877834e-06, + "loss": 0.6135, + "step": 3913 + }, + { + "epoch": 2.3180337577731716, + "grad_norm": 1.2895740270749483, + "learning_rate": 2.585122210502987e-06, + "loss": 0.5903, + "step": 3914 + }, + { + "epoch": 2.3186259994077583, + "grad_norm": 1.647388330853826, + "learning_rate": 2.5808323938201642e-06, + "loss": 0.5979, + "step": 3915 + }, + { + "epoch": 2.3192182410423454, + "grad_norm": 1.28519854345509, + "learning_rate": 2.5765456119940933e-06, + "loss": 0.6295, + "step": 3916 + }, + { + "epoch": 2.319810482676932, + "grad_norm": 1.2525366592486644, + "learning_rate": 2.5722618667783063e-06, + "loss": 0.5877, + "step": 3917 + }, + { + "epoch": 2.3204027243115193, + "grad_norm": 1.3317926678190848, + "learning_rate": 2.5679811599251003e-06, + "loss": 0.5926, + "step": 3918 + }, + { + "epoch": 2.320994965946106, + "grad_norm": 2.428793102384277, + "learning_rate": 2.5637034931855197e-06, + "loss": 0.5911, + "step": 3919 + }, + { + "epoch": 2.321587207580693, + "grad_norm": 1.4721974585666002, + "learning_rate": 2.559428868309377e-06, + "loss": 0.579, + "step": 3920 + }, + { + "epoch": 2.32217944921528, + "grad_norm": 3.8645920617049465, + "learning_rate": 2.5551572870452268e-06, + "loss": 0.6181, + "step": 3921 + }, + { + "epoch": 2.3227716908498666, + "grad_norm": 1.5226219139675943, + "learning_rate": 2.5508887511403936e-06, + "loss": 0.5791, + "step": 3922 + }, + { + "epoch": 2.3233639324844537, + "grad_norm": 2.2075587892085116, + "learning_rate": 2.5466232623409416e-06, + "loss": 0.634, + "step": 3923 + }, + { + "epoch": 2.3239561741190404, + "grad_norm": 1.3599172487421667, + "learning_rate": 2.542360822391702e-06, + "loss": 0.6161, + "step": 3924 + }, + { + "epoch": 2.3245484157536276, + "grad_norm": 2.3033687836929557, + "learning_rate": 2.538101433036246e-06, + "loss": 0.6239, + "step": 3925 + }, + { + "epoch": 2.3251406573882143, + "grad_norm": 1.9111808167290096, + "learning_rate": 2.5338450960169105e-06, + "loss": 0.5993, + "step": 3926 + }, + { + "epoch": 2.3257328990228014, + "grad_norm": 3.326963232812606, + "learning_rate": 2.52959181307477e-06, + "loss": 0.6112, + "step": 3927 + }, + { + "epoch": 2.326325140657388, + "grad_norm": 1.796700647763413, + "learning_rate": 2.525341585949662e-06, + "loss": 0.6028, + "step": 3928 + }, + { + "epoch": 2.3269173822919753, + "grad_norm": 2.312629837062101, + "learning_rate": 2.521094416380162e-06, + "loss": 0.6497, + "step": 3929 + }, + { + "epoch": 2.327509623926562, + "grad_norm": 1.2457487465053927, + "learning_rate": 2.5168503061036086e-06, + "loss": 0.6098, + "step": 3930 + }, + { + "epoch": 2.328101865561149, + "grad_norm": 3.791923080168204, + "learning_rate": 2.5126092568560754e-06, + "loss": 0.5886, + "step": 3931 + }, + { + "epoch": 2.328694107195736, + "grad_norm": 4.853398587684118, + "learning_rate": 2.5083712703723952e-06, + "loss": 0.6286, + "step": 3932 + }, + { + "epoch": 2.3292863488303226, + "grad_norm": 2.2593203003697915, + "learning_rate": 2.5041363483861357e-06, + "loss": 0.6453, + "step": 3933 + }, + { + "epoch": 2.3298785904649097, + "grad_norm": 1.6615315018284669, + "learning_rate": 2.499904492629627e-06, + "loss": 0.5813, + "step": 3934 + }, + { + "epoch": 2.3304708320994965, + "grad_norm": 1.6816787030820344, + "learning_rate": 2.4956757048339307e-06, + "loss": 0.5839, + "step": 3935 + }, + { + "epoch": 2.3310630737340836, + "grad_norm": 2.769364650023261, + "learning_rate": 2.4914499867288577e-06, + "loss": 0.6956, + "step": 3936 + }, + { + "epoch": 2.3316553153686703, + "grad_norm": 3.642860545017731, + "learning_rate": 2.487227340042966e-06, + "loss": 0.6321, + "step": 3937 + }, + { + "epoch": 2.3322475570032575, + "grad_norm": 1.3381965332512193, + "learning_rate": 2.483007766503558e-06, + "loss": 0.5996, + "step": 3938 + }, + { + "epoch": 2.332839798637844, + "grad_norm": 1.7008693234557766, + "learning_rate": 2.4787912678366755e-06, + "loss": 0.593, + "step": 3939 + }, + { + "epoch": 2.3334320402724313, + "grad_norm": 1.3507684647991987, + "learning_rate": 2.474577845767099e-06, + "loss": 0.6144, + "step": 3940 + }, + { + "epoch": 2.334024281907018, + "grad_norm": 2.0794417313783233, + "learning_rate": 2.4703675020183583e-06, + "loss": 0.5919, + "step": 3941 + }, + { + "epoch": 2.334616523541605, + "grad_norm": 2.4036165381923915, + "learning_rate": 2.4661602383127235e-06, + "loss": 0.6163, + "step": 3942 + }, + { + "epoch": 2.335208765176192, + "grad_norm": 2.006173079525012, + "learning_rate": 2.461956056371201e-06, + "loss": 0.6006, + "step": 3943 + }, + { + "epoch": 2.3358010068107786, + "grad_norm": 2.5021972165514677, + "learning_rate": 2.4577549579135318e-06, + "loss": 0.6524, + "step": 3944 + }, + { + "epoch": 2.3363932484453658, + "grad_norm": 2.2918385752916772, + "learning_rate": 2.453556944658206e-06, + "loss": 0.6192, + "step": 3945 + }, + { + "epoch": 2.3369854900799525, + "grad_norm": 2.0313012400244546, + "learning_rate": 2.449362018322451e-06, + "loss": 0.5882, + "step": 3946 + }, + { + "epoch": 2.3375777317145396, + "grad_norm": 2.1865753540293245, + "learning_rate": 2.445170180622223e-06, + "loss": 0.5776, + "step": 3947 + }, + { + "epoch": 2.3381699733491264, + "grad_norm": 1.6788725756694558, + "learning_rate": 2.440981433272216e-06, + "loss": 0.6286, + "step": 3948 + }, + { + "epoch": 2.3387622149837135, + "grad_norm": 1.3993996393149903, + "learning_rate": 2.4367957779858675e-06, + "loss": 0.6201, + "step": 3949 + }, + { + "epoch": 2.3393544566183, + "grad_norm": 2.5163778162713877, + "learning_rate": 2.43261321647535e-06, + "loss": 0.6333, + "step": 3950 + }, + { + "epoch": 2.3399466982528874, + "grad_norm": 2.1183056118895647, + "learning_rate": 2.4284337504515577e-06, + "loss": 0.6251, + "step": 3951 + }, + { + "epoch": 2.340538939887474, + "grad_norm": 2.9285242355007957, + "learning_rate": 2.42425738162413e-06, + "loss": 0.6026, + "step": 3952 + }, + { + "epoch": 2.3411311815220612, + "grad_norm": 2.1726586914478, + "learning_rate": 2.420084111701442e-06, + "loss": 0.6219, + "step": 3953 + }, + { + "epoch": 2.341723423156648, + "grad_norm": 1.7766065564148414, + "learning_rate": 2.4159139423905898e-06, + "loss": 0.6071, + "step": 3954 + }, + { + "epoch": 2.3423156647912347, + "grad_norm": 1.798438650040322, + "learning_rate": 2.411746875397407e-06, + "loss": 0.5952, + "step": 3955 + }, + { + "epoch": 2.342907906425822, + "grad_norm": 1.8115265686116828, + "learning_rate": 2.4075829124264606e-06, + "loss": 0.6037, + "step": 3956 + }, + { + "epoch": 2.3435001480604085, + "grad_norm": 1.2886622921652453, + "learning_rate": 2.4034220551810484e-06, + "loss": 0.6178, + "step": 3957 + }, + { + "epoch": 2.3440923896949957, + "grad_norm": 1.2555330420863857, + "learning_rate": 2.3992643053631904e-06, + "loss": 0.617, + "step": 3958 + }, + { + "epoch": 2.3446846313295824, + "grad_norm": 1.6387727645080599, + "learning_rate": 2.3951096646736403e-06, + "loss": 0.5982, + "step": 3959 + }, + { + "epoch": 2.3452768729641695, + "grad_norm": 1.2682217733893184, + "learning_rate": 2.3909581348118803e-06, + "loss": 0.6339, + "step": 3960 + }, + { + "epoch": 2.3458691145987562, + "grad_norm": 1.3586475278862904, + "learning_rate": 2.386809717476123e-06, + "loss": 0.6429, + "step": 3961 + }, + { + "epoch": 2.3464613562333434, + "grad_norm": 1.3204559918998942, + "learning_rate": 2.3826644143633017e-06, + "loss": 0.5899, + "step": 3962 + }, + { + "epoch": 2.34705359786793, + "grad_norm": 1.3928626985319164, + "learning_rate": 2.3785222271690754e-06, + "loss": 0.6113, + "step": 3963 + }, + { + "epoch": 2.3476458395025173, + "grad_norm": 9.312328145389811, + "learning_rate": 2.3743831575878352e-06, + "loss": 0.6527, + "step": 3964 + }, + { + "epoch": 2.348238081137104, + "grad_norm": 3.170276661797359, + "learning_rate": 2.370247207312695e-06, + "loss": 0.6355, + "step": 3965 + }, + { + "epoch": 2.3488303227716907, + "grad_norm": 2.153769559929862, + "learning_rate": 2.366114378035489e-06, + "loss": 0.5854, + "step": 3966 + }, + { + "epoch": 2.349422564406278, + "grad_norm": 2.010875700018389, + "learning_rate": 2.361984671446773e-06, + "loss": 0.6136, + "step": 3967 + }, + { + "epoch": 2.3500148060408645, + "grad_norm": 1.8883203647412998, + "learning_rate": 2.3578580892358337e-06, + "loss": 0.6464, + "step": 3968 + }, + { + "epoch": 2.3506070476754517, + "grad_norm": 1.3913502661830128, + "learning_rate": 2.3537346330906776e-06, + "loss": 0.6297, + "step": 3969 + }, + { + "epoch": 2.3511992893100384, + "grad_norm": 1.5332485123384902, + "learning_rate": 2.3496143046980256e-06, + "loss": 0.5928, + "step": 3970 + }, + { + "epoch": 2.3517915309446256, + "grad_norm": 1.3001291166457274, + "learning_rate": 2.345497105743323e-06, + "loss": 0.6043, + "step": 3971 + }, + { + "epoch": 2.3523837725792123, + "grad_norm": 1.4754086076781734, + "learning_rate": 2.3413830379107395e-06, + "loss": 0.6199, + "step": 3972 + }, + { + "epoch": 2.352976014213799, + "grad_norm": 2.409278659416967, + "learning_rate": 2.337272102883157e-06, + "loss": 0.6453, + "step": 3973 + }, + { + "epoch": 2.353568255848386, + "grad_norm": 2.3466875892630683, + "learning_rate": 2.3331643023421813e-06, + "loss": 0.6564, + "step": 3974 + }, + { + "epoch": 2.3541604974829733, + "grad_norm": 2.714570742335837, + "learning_rate": 2.329059637968132e-06, + "loss": 0.6404, + "step": 3975 + }, + { + "epoch": 2.35475273911756, + "grad_norm": 2.3929716670625356, + "learning_rate": 2.324958111440051e-06, + "loss": 0.6362, + "step": 3976 + }, + { + "epoch": 2.3553449807521467, + "grad_norm": 1.6228125008698313, + "learning_rate": 2.3208597244356867e-06, + "loss": 0.5842, + "step": 3977 + }, + { + "epoch": 2.355937222386734, + "grad_norm": 3.384644411190184, + "learning_rate": 2.316764478631518e-06, + "loss": 0.627, + "step": 3978 + }, + { + "epoch": 2.3565294640213206, + "grad_norm": 1.5260570773569566, + "learning_rate": 2.3126723757027245e-06, + "loss": 0.6295, + "step": 3979 + }, + { + "epoch": 2.3571217056559077, + "grad_norm": 1.8203121426278104, + "learning_rate": 2.30858341732321e-06, + "loss": 0.6369, + "step": 3980 + }, + { + "epoch": 2.3577139472904944, + "grad_norm": 1.1006202274680998, + "learning_rate": 2.3044976051655854e-06, + "loss": 0.6267, + "step": 3981 + }, + { + "epoch": 2.3583061889250816, + "grad_norm": 1.8643488805973478, + "learning_rate": 2.300414940901182e-06, + "loss": 0.6002, + "step": 3982 + }, + { + "epoch": 2.3588984305596683, + "grad_norm": 1.4796650275447034, + "learning_rate": 2.2963354262000335e-06, + "loss": 0.6101, + "step": 3983 + }, + { + "epoch": 2.359490672194255, + "grad_norm": 3.064100269638554, + "learning_rate": 2.292259062730897e-06, + "loss": 0.5567, + "step": 3984 + }, + { + "epoch": 2.360082913828842, + "grad_norm": 1.5811392863582345, + "learning_rate": 2.2881858521612275e-06, + "loss": 0.6186, + "step": 3985 + }, + { + "epoch": 2.3606751554634293, + "grad_norm": 2.1586453620244597, + "learning_rate": 2.2841157961572034e-06, + "loss": 0.6014, + "step": 3986 + }, + { + "epoch": 2.361267397098016, + "grad_norm": 2.641507002266892, + "learning_rate": 2.2800488963837043e-06, + "loss": 0.6133, + "step": 3987 + }, + { + "epoch": 2.3618596387326027, + "grad_norm": 1.1730513592343241, + "learning_rate": 2.2759851545043175e-06, + "loss": 0.6223, + "step": 3988 + }, + { + "epoch": 2.36245188036719, + "grad_norm": 1.4092715716805935, + "learning_rate": 2.2719245721813455e-06, + "loss": 0.6399, + "step": 3989 + }, + { + "epoch": 2.3630441220017766, + "grad_norm": 1.8498348981776163, + "learning_rate": 2.2678671510757953e-06, + "loss": 0.6188, + "step": 3990 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 2.143878596825647, + "learning_rate": 2.263812892847381e-06, + "loss": 0.6321, + "step": 3991 + }, + { + "epoch": 2.3642286052709505, + "grad_norm": 1.2933879029907467, + "learning_rate": 2.259761799154516e-06, + "loss": 0.6037, + "step": 3992 + }, + { + "epoch": 2.3648208469055376, + "grad_norm": 1.5882145664047522, + "learning_rate": 2.2557138716543316e-06, + "loss": 0.5955, + "step": 3993 + }, + { + "epoch": 2.3654130885401243, + "grad_norm": 1.686459234669479, + "learning_rate": 2.251669112002657e-06, + "loss": 0.6064, + "step": 3994 + }, + { + "epoch": 2.366005330174711, + "grad_norm": 1.7498942595889782, + "learning_rate": 2.2476275218540266e-06, + "loss": 0.5981, + "step": 3995 + }, + { + "epoch": 2.366597571809298, + "grad_norm": 2.8193121891643997, + "learning_rate": 2.243589102861673e-06, + "loss": 0.6234, + "step": 3996 + }, + { + "epoch": 2.3671898134438853, + "grad_norm": 1.4594782573094889, + "learning_rate": 2.239553856677541e-06, + "loss": 0.6007, + "step": 3997 + }, + { + "epoch": 2.367782055078472, + "grad_norm": 1.485059207419929, + "learning_rate": 2.235521784952275e-06, + "loss": 0.6011, + "step": 3998 + }, + { + "epoch": 2.3683742967130588, + "grad_norm": 1.3309379388804508, + "learning_rate": 2.231492889335217e-06, + "loss": 0.5943, + "step": 3999 + }, + { + "epoch": 2.368966538347646, + "grad_norm": 2.6267695132329454, + "learning_rate": 2.227467171474409e-06, + "loss": 0.6136, + "step": 4000 + }, + { + "epoch": 2.3695587799822326, + "grad_norm": 4.446529481550082, + "learning_rate": 2.223444633016597e-06, + "loss": 0.6042, + "step": 4001 + }, + { + "epoch": 2.37015102161682, + "grad_norm": 1.9069444415000925, + "learning_rate": 2.2194252756072343e-06, + "loss": 0.6064, + "step": 4002 + }, + { + "epoch": 2.3707432632514065, + "grad_norm": 1.8238334546399355, + "learning_rate": 2.2154091008904497e-06, + "loss": 0.6344, + "step": 4003 + }, + { + "epoch": 2.3713355048859937, + "grad_norm": 2.5613993978909693, + "learning_rate": 2.2113961105090933e-06, + "loss": 0.5878, + "step": 4004 + }, + { + "epoch": 2.3719277465205804, + "grad_norm": 1.2783779891008764, + "learning_rate": 2.207386306104701e-06, + "loss": 0.6141, + "step": 4005 + }, + { + "epoch": 2.372519988155167, + "grad_norm": 1.4416435791183768, + "learning_rate": 2.2033796893175152e-06, + "loss": 0.6002, + "step": 4006 + }, + { + "epoch": 2.3731122297897542, + "grad_norm": 1.6055773889827405, + "learning_rate": 2.1993762617864555e-06, + "loss": 0.6106, + "step": 4007 + }, + { + "epoch": 2.3737044714243414, + "grad_norm": 1.33887538582045, + "learning_rate": 2.195376025149156e-06, + "loss": 0.6547, + "step": 4008 + }, + { + "epoch": 2.374296713058928, + "grad_norm": 1.7979827304398968, + "learning_rate": 2.1913789810419393e-06, + "loss": 0.5853, + "step": 4009 + }, + { + "epoch": 2.374888954693515, + "grad_norm": 1.678514747907677, + "learning_rate": 2.1873851310998194e-06, + "loss": 0.6229, + "step": 4010 + }, + { + "epoch": 2.375481196328102, + "grad_norm": 12.535062449115955, + "learning_rate": 2.183394476956504e-06, + "loss": 0.636, + "step": 4011 + }, + { + "epoch": 2.3760734379626887, + "grad_norm": 3.6917293032690623, + "learning_rate": 2.179407020244395e-06, + "loss": 0.6194, + "step": 4012 + }, + { + "epoch": 2.376665679597276, + "grad_norm": 1.3674840173653329, + "learning_rate": 2.175422762594591e-06, + "loss": 0.6274, + "step": 4013 + }, + { + "epoch": 2.3772579212318625, + "grad_norm": 1.7064037619998782, + "learning_rate": 2.1714417056368752e-06, + "loss": 0.6281, + "step": 4014 + }, + { + "epoch": 2.3778501628664497, + "grad_norm": 4.498067534083709, + "learning_rate": 2.167463850999719e-06, + "loss": 0.6391, + "step": 4015 + }, + { + "epoch": 2.3784424045010364, + "grad_norm": 3.0010927370986615, + "learning_rate": 2.1634892003102935e-06, + "loss": 0.6198, + "step": 4016 + }, + { + "epoch": 2.379034646135623, + "grad_norm": 2.2759248099052205, + "learning_rate": 2.159517755194456e-06, + "loss": 0.6536, + "step": 4017 + }, + { + "epoch": 2.3796268877702103, + "grad_norm": 1.5564439174661013, + "learning_rate": 2.155549517276747e-06, + "loss": 0.6276, + "step": 4018 + }, + { + "epoch": 2.3802191294047974, + "grad_norm": 1.6225752200568844, + "learning_rate": 2.1515844881803993e-06, + "loss": 0.6166, + "step": 4019 + }, + { + "epoch": 2.380811371039384, + "grad_norm": 1.9066398001458083, + "learning_rate": 2.1476226695273326e-06, + "loss": 0.6092, + "step": 4020 + }, + { + "epoch": 2.381403612673971, + "grad_norm": 2.76829177620669, + "learning_rate": 2.143664062938158e-06, + "loss": 0.6401, + "step": 4021 + }, + { + "epoch": 2.381995854308558, + "grad_norm": 3.0015528560284026, + "learning_rate": 2.1397086700321635e-06, + "loss": 0.6389, + "step": 4022 + }, + { + "epoch": 2.3825880959431447, + "grad_norm": 1.4409882465741872, + "learning_rate": 2.1357564924273265e-06, + "loss": 0.6066, + "step": 4023 + }, + { + "epoch": 2.383180337577732, + "grad_norm": 1.6002150244136195, + "learning_rate": 2.1318075317403152e-06, + "loss": 0.6025, + "step": 4024 + }, + { + "epoch": 2.3837725792123186, + "grad_norm": 1.9985808815147001, + "learning_rate": 2.1278617895864706e-06, + "loss": 0.6046, + "step": 4025 + }, + { + "epoch": 2.3843648208469057, + "grad_norm": 1.694290847273592, + "learning_rate": 2.123919267579828e-06, + "loss": 0.6045, + "step": 4026 + }, + { + "epoch": 2.3849570624814924, + "grad_norm": 2.032855773037069, + "learning_rate": 2.1199799673330956e-06, + "loss": 0.6228, + "step": 4027 + }, + { + "epoch": 2.385549304116079, + "grad_norm": 1.4539445622297456, + "learning_rate": 2.1160438904576743e-06, + "loss": 0.6275, + "step": 4028 + }, + { + "epoch": 2.3861415457506663, + "grad_norm": 2.5084329606273528, + "learning_rate": 2.1121110385636357e-06, + "loss": 0.6437, + "step": 4029 + }, + { + "epoch": 2.386733787385253, + "grad_norm": 1.5794313508804994, + "learning_rate": 2.108181413259741e-06, + "loss": 0.5742, + "step": 4030 + }, + { + "epoch": 2.38732602901984, + "grad_norm": 1.3385924339428654, + "learning_rate": 2.104255016153426e-06, + "loss": 0.6265, + "step": 4031 + }, + { + "epoch": 2.387918270654427, + "grad_norm": 2.2669122640842505, + "learning_rate": 2.1003318488508107e-06, + "loss": 0.6635, + "step": 4032 + }, + { + "epoch": 2.388510512289014, + "grad_norm": 1.7123022659492335, + "learning_rate": 2.0964119129566864e-06, + "loss": 0.634, + "step": 4033 + }, + { + "epoch": 2.3891027539236007, + "grad_norm": 2.074583285155401, + "learning_rate": 2.092495210074532e-06, + "loss": 0.6422, + "step": 4034 + }, + { + "epoch": 2.389694995558188, + "grad_norm": 1.9435860623142942, + "learning_rate": 2.0885817418064947e-06, + "loss": 0.6352, + "step": 4035 + }, + { + "epoch": 2.3902872371927746, + "grad_norm": 1.7101502157361959, + "learning_rate": 2.0846715097534087e-06, + "loss": 0.6161, + "step": 4036 + }, + { + "epoch": 2.3908794788273617, + "grad_norm": 1.7335618347196917, + "learning_rate": 2.0807645155147726e-06, + "loss": 0.6222, + "step": 4037 + }, + { + "epoch": 2.3914717204619484, + "grad_norm": 1.5176551517398615, + "learning_rate": 2.0768607606887724e-06, + "loss": 0.6145, + "step": 4038 + }, + { + "epoch": 2.392063962096535, + "grad_norm": 1.091782921163788, + "learning_rate": 2.072960246872261e-06, + "loss": 0.5978, + "step": 4039 + }, + { + "epoch": 2.3926562037311223, + "grad_norm": 2.07357282739027, + "learning_rate": 2.069062975660765e-06, + "loss": 0.6421, + "step": 4040 + }, + { + "epoch": 2.393248445365709, + "grad_norm": 1.5424667085190742, + "learning_rate": 2.0651689486484894e-06, + "loss": 0.6532, + "step": 4041 + }, + { + "epoch": 2.393840687000296, + "grad_norm": 1.6880108409323265, + "learning_rate": 2.0612781674283142e-06, + "loss": 0.5966, + "step": 4042 + }, + { + "epoch": 2.394432928634883, + "grad_norm": 1.3807759801970425, + "learning_rate": 2.057390633591785e-06, + "loss": 0.5957, + "step": 4043 + }, + { + "epoch": 2.39502517026947, + "grad_norm": 1.3765906802348291, + "learning_rate": 2.0535063487291176e-06, + "loss": 0.636, + "step": 4044 + }, + { + "epoch": 2.3956174119040567, + "grad_norm": 3.0061313109171595, + "learning_rate": 2.049625314429207e-06, + "loss": 0.6586, + "step": 4045 + }, + { + "epoch": 2.396209653538644, + "grad_norm": 1.1890562111369984, + "learning_rate": 2.045747532279616e-06, + "loss": 0.5867, + "step": 4046 + }, + { + "epoch": 2.3968018951732306, + "grad_norm": 1.4910777330850993, + "learning_rate": 2.0418730038665747e-06, + "loss": 0.654, + "step": 4047 + }, + { + "epoch": 2.3973941368078178, + "grad_norm": 2.874685506178504, + "learning_rate": 2.038001730774978e-06, + "loss": 0.6087, + "step": 4048 + }, + { + "epoch": 2.3979863784424045, + "grad_norm": 1.3061726816651926, + "learning_rate": 2.034133714588399e-06, + "loss": 0.5734, + "step": 4049 + }, + { + "epoch": 2.398578620076991, + "grad_norm": 1.7785239459876332, + "learning_rate": 2.0302689568890753e-06, + "loss": 0.656, + "step": 4050 + }, + { + "epoch": 2.3991708617115783, + "grad_norm": 1.6332928430746765, + "learning_rate": 2.0264074592579087e-06, + "loss": 0.6034, + "step": 4051 + }, + { + "epoch": 2.399763103346165, + "grad_norm": 1.5176551764079678, + "learning_rate": 2.022549223274465e-06, + "loss": 0.608, + "step": 4052 + }, + { + "epoch": 2.400355344980752, + "grad_norm": 1.2607503040846897, + "learning_rate": 2.0186942505169827e-06, + "loss": 0.6439, + "step": 4053 + }, + { + "epoch": 2.400947586615339, + "grad_norm": 2.158511661493091, + "learning_rate": 2.0148425425623673e-06, + "loss": 0.6046, + "step": 4054 + }, + { + "epoch": 2.401539828249926, + "grad_norm": 2.465164565539923, + "learning_rate": 2.0109941009861743e-06, + "loss": 0.6046, + "step": 4055 + }, + { + "epoch": 2.4021320698845128, + "grad_norm": 1.3196892824682709, + "learning_rate": 2.0071489273626376e-06, + "loss": 0.6952, + "step": 4056 + }, + { + "epoch": 2.4027243115191, + "grad_norm": 1.6021832014855415, + "learning_rate": 2.0033070232646488e-06, + "loss": 0.6369, + "step": 4057 + }, + { + "epoch": 2.4033165531536866, + "grad_norm": 6.273666490794738, + "learning_rate": 1.999468390263769e-06, + "loss": 0.5949, + "step": 4058 + }, + { + "epoch": 2.403908794788274, + "grad_norm": 1.3248267529829056, + "learning_rate": 1.995633029930204e-06, + "loss": 0.6476, + "step": 4059 + }, + { + "epoch": 2.4045010364228605, + "grad_norm": 4.130727246499741, + "learning_rate": 1.9918009438328365e-06, + "loss": 0.6206, + "step": 4060 + }, + { + "epoch": 2.405093278057447, + "grad_norm": 1.8044158856252108, + "learning_rate": 1.9879721335392088e-06, + "loss": 0.6502, + "step": 4061 + }, + { + "epoch": 2.4056855196920344, + "grad_norm": 1.2394089890201714, + "learning_rate": 1.9841466006155162e-06, + "loss": 0.62, + "step": 4062 + }, + { + "epoch": 2.406277761326621, + "grad_norm": 1.3020178733946122, + "learning_rate": 1.9803243466266154e-06, + "loss": 0.604, + "step": 4063 + }, + { + "epoch": 2.4068700029612082, + "grad_norm": 2.3665692021469855, + "learning_rate": 1.976505373136025e-06, + "loss": 0.5555, + "step": 4064 + }, + { + "epoch": 2.407462244595795, + "grad_norm": 1.8076468168639461, + "learning_rate": 1.9726896817059214e-06, + "loss": 0.5771, + "step": 4065 + }, + { + "epoch": 2.408054486230382, + "grad_norm": 2.2441477724689425, + "learning_rate": 1.968877273897136e-06, + "loss": 0.6202, + "step": 4066 + }, + { + "epoch": 2.408646727864969, + "grad_norm": 1.2331684527553564, + "learning_rate": 1.965068151269156e-06, + "loss": 0.6105, + "step": 4067 + }, + { + "epoch": 2.409238969499556, + "grad_norm": 3.5419262263798377, + "learning_rate": 1.9612623153801267e-06, + "loss": 0.6276, + "step": 4068 + }, + { + "epoch": 2.4098312111341427, + "grad_norm": 1.2135255066194424, + "learning_rate": 1.9574597677868535e-06, + "loss": 0.5914, + "step": 4069 + }, + { + "epoch": 2.41042345276873, + "grad_norm": 1.1257223069045508, + "learning_rate": 1.953660510044789e-06, + "loss": 0.6071, + "step": 4070 + }, + { + "epoch": 2.4110156944033165, + "grad_norm": 1.7369991319720672, + "learning_rate": 1.949864543708042e-06, + "loss": 0.6332, + "step": 4071 + }, + { + "epoch": 2.4116079360379032, + "grad_norm": 1.42930483040224, + "learning_rate": 1.946071870329377e-06, + "loss": 0.6303, + "step": 4072 + }, + { + "epoch": 2.4122001776724904, + "grad_norm": 1.5717751665701418, + "learning_rate": 1.9422824914602135e-06, + "loss": 0.6262, + "step": 4073 + }, + { + "epoch": 2.412792419307077, + "grad_norm": 1.6832189402748274, + "learning_rate": 1.9384964086506185e-06, + "loss": 0.5809, + "step": 4074 + }, + { + "epoch": 2.4133846609416643, + "grad_norm": 1.7647206463793592, + "learning_rate": 1.9347136234493093e-06, + "loss": 0.6337, + "step": 4075 + }, + { + "epoch": 2.413976902576251, + "grad_norm": 1.9697614615701116, + "learning_rate": 1.930934137403665e-06, + "loss": 0.5884, + "step": 4076 + }, + { + "epoch": 2.414569144210838, + "grad_norm": 2.1805592052438523, + "learning_rate": 1.9271579520597005e-06, + "loss": 0.5893, + "step": 4077 + }, + { + "epoch": 2.415161385845425, + "grad_norm": 2.016248237439067, + "learning_rate": 1.923385068962095e-06, + "loss": 0.6267, + "step": 4078 + }, + { + "epoch": 2.415753627480012, + "grad_norm": 2.363174925154651, + "learning_rate": 1.919615489654163e-06, + "loss": 0.6469, + "step": 4079 + }, + { + "epoch": 2.4163458691145987, + "grad_norm": 1.3925989121360791, + "learning_rate": 1.9158492156778807e-06, + "loss": 0.6612, + "step": 4080 + }, + { + "epoch": 2.416938110749186, + "grad_norm": 1.273060654648486, + "learning_rate": 1.91208624857386e-06, + "loss": 0.6226, + "step": 4081 + }, + { + "epoch": 2.4175303523837726, + "grad_norm": 1.4601623756383526, + "learning_rate": 1.908326589881372e-06, + "loss": 0.6188, + "step": 4082 + }, + { + "epoch": 2.4181225940183593, + "grad_norm": 1.5467664483267456, + "learning_rate": 1.9045702411383227e-06, + "loss": 0.5895, + "step": 4083 + }, + { + "epoch": 2.4187148356529464, + "grad_norm": 2.82373984208031, + "learning_rate": 1.9008172038812744e-06, + "loss": 0.6258, + "step": 4084 + }, + { + "epoch": 2.419307077287533, + "grad_norm": 1.9417680433070386, + "learning_rate": 1.897067479645428e-06, + "loss": 0.6163, + "step": 4085 + }, + { + "epoch": 2.4198993189221203, + "grad_norm": 2.2916627310601374, + "learning_rate": 1.8933210699646342e-06, + "loss": 0.6046, + "step": 4086 + }, + { + "epoch": 2.420491560556707, + "grad_norm": 1.5024563307793963, + "learning_rate": 1.8895779763713806e-06, + "loss": 0.5865, + "step": 4087 + }, + { + "epoch": 2.421083802191294, + "grad_norm": 1.446643763862051, + "learning_rate": 1.885838200396808e-06, + "loss": 0.6188, + "step": 4088 + }, + { + "epoch": 2.421676043825881, + "grad_norm": 1.0593789622862213, + "learning_rate": 1.8821017435706912e-06, + "loss": 0.6205, + "step": 4089 + }, + { + "epoch": 2.422268285460468, + "grad_norm": 1.5753893128495062, + "learning_rate": 1.8783686074214546e-06, + "loss": 0.6676, + "step": 4090 + }, + { + "epoch": 2.4228605270950547, + "grad_norm": 1.5478395153422457, + "learning_rate": 1.874638793476159e-06, + "loss": 0.6609, + "step": 4091 + }, + { + "epoch": 2.423452768729642, + "grad_norm": 2.513139396256183, + "learning_rate": 1.8709123032605058e-06, + "loss": 0.6139, + "step": 4092 + }, + { + "epoch": 2.4240450103642286, + "grad_norm": 4.903771647987582, + "learning_rate": 1.8671891382988416e-06, + "loss": 0.6128, + "step": 4093 + }, + { + "epoch": 2.4246372519988153, + "grad_norm": 1.0479089231627525, + "learning_rate": 1.8634693001141513e-06, + "loss": 0.6405, + "step": 4094 + }, + { + "epoch": 2.4252294936334025, + "grad_norm": 4.228476276270857, + "learning_rate": 1.8597527902280577e-06, + "loss": 0.6362, + "step": 4095 + }, + { + "epoch": 2.425821735267989, + "grad_norm": 1.5814852714772147, + "learning_rate": 1.856039610160818e-06, + "loss": 0.6704, + "step": 4096 + }, + { + "epoch": 2.4264139769025763, + "grad_norm": 2.1012557519164776, + "learning_rate": 1.8523297614313351e-06, + "loss": 0.5896, + "step": 4097 + }, + { + "epoch": 2.427006218537163, + "grad_norm": 1.463430055540698, + "learning_rate": 1.8486232455571473e-06, + "loss": 0.5984, + "step": 4098 + }, + { + "epoch": 2.42759846017175, + "grad_norm": 1.6658819397730513, + "learning_rate": 1.8449200640544274e-06, + "loss": 0.5641, + "step": 4099 + }, + { + "epoch": 2.428190701806337, + "grad_norm": 2.26128039729551, + "learning_rate": 1.8412202184379801e-06, + "loss": 0.6307, + "step": 4100 + }, + { + "epoch": 2.428782943440924, + "grad_norm": 1.4559178783717623, + "learning_rate": 1.837523710221254e-06, + "loss": 0.6036, + "step": 4101 + }, + { + "epoch": 2.4293751850755108, + "grad_norm": 1.7791634905991207, + "learning_rate": 1.8338305409163314e-06, + "loss": 0.6759, + "step": 4102 + }, + { + "epoch": 2.429967426710098, + "grad_norm": 1.697277492617549, + "learning_rate": 1.8301407120339232e-06, + "loss": 0.6221, + "step": 4103 + }, + { + "epoch": 2.4305596683446846, + "grad_norm": 2.2136330956036696, + "learning_rate": 1.826454225083375e-06, + "loss": 0.6591, + "step": 4104 + }, + { + "epoch": 2.4311519099792713, + "grad_norm": 1.3508855440744538, + "learning_rate": 1.8227710815726686e-06, + "loss": 0.6392, + "step": 4105 + }, + { + "epoch": 2.4317441516138585, + "grad_norm": 1.8566181281306422, + "learning_rate": 1.8190912830084207e-06, + "loss": 0.5795, + "step": 4106 + }, + { + "epoch": 2.432336393248445, + "grad_norm": 3.144464503693746, + "learning_rate": 1.815414830895873e-06, + "loss": 0.593, + "step": 4107 + }, + { + "epoch": 2.4329286348830323, + "grad_norm": 2.7242904341679846, + "learning_rate": 1.811741726738898e-06, + "loss": 0.6229, + "step": 4108 + }, + { + "epoch": 2.433520876517619, + "grad_norm": 3.3833912508202517, + "learning_rate": 1.8080719720400052e-06, + "loss": 0.6076, + "step": 4109 + }, + { + "epoch": 2.434113118152206, + "grad_norm": 3.0225222759099992, + "learning_rate": 1.8044055683003358e-06, + "loss": 0.6193, + "step": 4110 + }, + { + "epoch": 2.434705359786793, + "grad_norm": 1.4370596639040112, + "learning_rate": 1.8007425170196435e-06, + "loss": 0.6325, + "step": 4111 + }, + { + "epoch": 2.43529760142138, + "grad_norm": 1.5211255655218439, + "learning_rate": 1.7970828196963286e-06, + "loss": 0.556, + "step": 4112 + }, + { + "epoch": 2.435889843055967, + "grad_norm": 1.3392790413616675, + "learning_rate": 1.7934264778274157e-06, + "loss": 0.5869, + "step": 4113 + }, + { + "epoch": 2.436482084690554, + "grad_norm": 2.2652578152092957, + "learning_rate": 1.7897734929085508e-06, + "loss": 0.6022, + "step": 4114 + }, + { + "epoch": 2.4370743263251406, + "grad_norm": 1.5991779329388756, + "learning_rate": 1.7861238664340075e-06, + "loss": 0.6318, + "step": 4115 + }, + { + "epoch": 2.4376665679597274, + "grad_norm": 1.6856573090275266, + "learning_rate": 1.7824775998966926e-06, + "loss": 0.6195, + "step": 4116 + }, + { + "epoch": 2.4382588095943145, + "grad_norm": 1.872131972586157, + "learning_rate": 1.7788346947881352e-06, + "loss": 0.6015, + "step": 4117 + }, + { + "epoch": 2.4388510512289012, + "grad_norm": 1.3474243564708543, + "learning_rate": 1.7751951525984857e-06, + "loss": 0.5991, + "step": 4118 + }, + { + "epoch": 2.4394432928634884, + "grad_norm": 1.44125755213156, + "learning_rate": 1.7715589748165196e-06, + "loss": 0.6336, + "step": 4119 + }, + { + "epoch": 2.440035534498075, + "grad_norm": 1.628282300330072, + "learning_rate": 1.7679261629296408e-06, + "loss": 0.628, + "step": 4120 + }, + { + "epoch": 2.4406277761326622, + "grad_norm": 1.5525192264857999, + "learning_rate": 1.7642967184238758e-06, + "loss": 0.5914, + "step": 4121 + }, + { + "epoch": 2.441220017767249, + "grad_norm": 1.3773547139505093, + "learning_rate": 1.7606706427838682e-06, + "loss": 0.6638, + "step": 4122 + }, + { + "epoch": 2.441812259401836, + "grad_norm": 2.496487937041218, + "learning_rate": 1.7570479374928862e-06, + "loss": 0.6213, + "step": 4123 + }, + { + "epoch": 2.442404501036423, + "grad_norm": 3.2817990638971257, + "learning_rate": 1.7534286040328208e-06, + "loss": 0.6255, + "step": 4124 + }, + { + "epoch": 2.44299674267101, + "grad_norm": 3.6057888078556712, + "learning_rate": 1.7498126438841857e-06, + "loss": 0.6445, + "step": 4125 + }, + { + "epoch": 2.4435889843055967, + "grad_norm": 1.5818140283182256, + "learning_rate": 1.7462000585261096e-06, + "loss": 0.6281, + "step": 4126 + }, + { + "epoch": 2.4441812259401834, + "grad_norm": 1.9261028578432795, + "learning_rate": 1.7425908494363408e-06, + "loss": 0.6367, + "step": 4127 + }, + { + "epoch": 2.4447734675747705, + "grad_norm": 2.984646605350875, + "learning_rate": 1.7389850180912537e-06, + "loss": 0.6526, + "step": 4128 + }, + { + "epoch": 2.4453657092093573, + "grad_norm": 1.361931780545937, + "learning_rate": 1.735382565965832e-06, + "loss": 0.5942, + "step": 4129 + }, + { + "epoch": 2.4459579508439444, + "grad_norm": 1.230243762561541, + "learning_rate": 1.7317834945336843e-06, + "loss": 0.6713, + "step": 4130 + }, + { + "epoch": 2.446550192478531, + "grad_norm": 2.9100521565655173, + "learning_rate": 1.7281878052670288e-06, + "loss": 0.641, + "step": 4131 + }, + { + "epoch": 2.4471424341131183, + "grad_norm": 1.3667964273082245, + "learning_rate": 1.724595499636711e-06, + "loss": 0.5686, + "step": 4132 + }, + { + "epoch": 2.447734675747705, + "grad_norm": 1.2327143660385451, + "learning_rate": 1.7210065791121789e-06, + "loss": 0.6167, + "step": 4133 + }, + { + "epoch": 2.448326917382292, + "grad_norm": 2.0388204744932894, + "learning_rate": 1.7174210451615091e-06, + "loss": 0.6042, + "step": 4134 + }, + { + "epoch": 2.448919159016879, + "grad_norm": 1.5150026413148958, + "learning_rate": 1.713838899251381e-06, + "loss": 0.627, + "step": 4135 + }, + { + "epoch": 2.449511400651466, + "grad_norm": 1.1466180408826923, + "learning_rate": 1.7102601428470988e-06, + "loss": 0.6109, + "step": 4136 + }, + { + "epoch": 2.4501036422860527, + "grad_norm": 2.0780281123685422, + "learning_rate": 1.7066847774125716e-06, + "loss": 0.6083, + "step": 4137 + }, + { + "epoch": 2.4506958839206394, + "grad_norm": 1.3954878819749057, + "learning_rate": 1.7031128044103272e-06, + "loss": 0.5951, + "step": 4138 + }, + { + "epoch": 2.4512881255552266, + "grad_norm": 1.2417256271708208, + "learning_rate": 1.6995442253015003e-06, + "loss": 0.5816, + "step": 4139 + }, + { + "epoch": 2.4518803671898133, + "grad_norm": 2.1291302266514998, + "learning_rate": 1.6959790415458454e-06, + "loss": 0.6318, + "step": 4140 + }, + { + "epoch": 2.4524726088244004, + "grad_norm": 1.294464705411972, + "learning_rate": 1.692417254601717e-06, + "loss": 0.6079, + "step": 4141 + }, + { + "epoch": 2.453064850458987, + "grad_norm": 1.723843313270676, + "learning_rate": 1.6888588659260929e-06, + "loss": 0.6003, + "step": 4142 + }, + { + "epoch": 2.4536570920935743, + "grad_norm": 2.01346115839394, + "learning_rate": 1.6853038769745466e-06, + "loss": 0.6387, + "step": 4143 + }, + { + "epoch": 2.454249333728161, + "grad_norm": 1.7697041782949663, + "learning_rate": 1.6817522892012762e-06, + "loss": 0.6412, + "step": 4144 + }, + { + "epoch": 2.454841575362748, + "grad_norm": 1.1993784853907747, + "learning_rate": 1.6782041040590769e-06, + "loss": 0.595, + "step": 4145 + }, + { + "epoch": 2.455433816997335, + "grad_norm": 1.5669145113783172, + "learning_rate": 1.6746593229993545e-06, + "loss": 0.5877, + "step": 4146 + }, + { + "epoch": 2.456026058631922, + "grad_norm": 1.1370169190841048, + "learning_rate": 1.6711179474721272e-06, + "loss": 0.6322, + "step": 4147 + }, + { + "epoch": 2.4566183002665087, + "grad_norm": 3.5535580287557127, + "learning_rate": 1.6675799789260128e-06, + "loss": 0.631, + "step": 4148 + }, + { + "epoch": 2.4572105419010954, + "grad_norm": 1.85149070130156, + "learning_rate": 1.6640454188082444e-06, + "loss": 0.6152, + "step": 4149 + }, + { + "epoch": 2.4578027835356826, + "grad_norm": 2.303568347956373, + "learning_rate": 1.6605142685646503e-06, + "loss": 0.6196, + "step": 4150 + }, + { + "epoch": 2.4583950251702693, + "grad_norm": 2.3689995504227372, + "learning_rate": 1.6569865296396748e-06, + "loss": 0.5813, + "step": 4151 + }, + { + "epoch": 2.4589872668048565, + "grad_norm": 1.894218094477947, + "learning_rate": 1.6534622034763558e-06, + "loss": 0.6069, + "step": 4152 + }, + { + "epoch": 2.459579508439443, + "grad_norm": 4.169690853566014, + "learning_rate": 1.6499412915163481e-06, + "loss": 0.6636, + "step": 4153 + }, + { + "epoch": 2.4601717500740303, + "grad_norm": 1.6611317565107804, + "learning_rate": 1.6464237951998952e-06, + "loss": 0.561, + "step": 4154 + }, + { + "epoch": 2.460763991708617, + "grad_norm": 2.215191682910294, + "learning_rate": 1.642909715965857e-06, + "loss": 0.63, + "step": 4155 + }, + { + "epoch": 2.461356233343204, + "grad_norm": 1.6918194298709026, + "learning_rate": 1.6393990552516848e-06, + "loss": 0.6134, + "step": 4156 + }, + { + "epoch": 2.461948474977791, + "grad_norm": 1.5262087510396043, + "learning_rate": 1.63589181449344e-06, + "loss": 0.6413, + "step": 4157 + }, + { + "epoch": 2.462540716612378, + "grad_norm": 1.2968558382847615, + "learning_rate": 1.6323879951257783e-06, + "loss": 0.6197, + "step": 4158 + }, + { + "epoch": 2.4631329582469648, + "grad_norm": 1.6679364569860284, + "learning_rate": 1.628887598581962e-06, + "loss": 0.5777, + "step": 4159 + }, + { + "epoch": 2.4637251998815515, + "grad_norm": 1.0369459266334156, + "learning_rate": 1.6253906262938457e-06, + "loss": 0.6209, + "step": 4160 + }, + { + "epoch": 2.4643174415161386, + "grad_norm": 1.5112263549928397, + "learning_rate": 1.6218970796918931e-06, + "loss": 0.6424, + "step": 4161 + }, + { + "epoch": 2.4649096831507253, + "grad_norm": 1.2179638572938345, + "learning_rate": 1.6184069602051578e-06, + "loss": 0.6032, + "step": 4162 + }, + { + "epoch": 2.4655019247853125, + "grad_norm": 1.2210989669346337, + "learning_rate": 1.614920269261293e-06, + "loss": 0.6041, + "step": 4163 + }, + { + "epoch": 2.466094166419899, + "grad_norm": 1.5638850592616937, + "learning_rate": 1.611437008286555e-06, + "loss": 0.6265, + "step": 4164 + }, + { + "epoch": 2.4666864080544864, + "grad_norm": 1.9348031966836825, + "learning_rate": 1.6079571787057946e-06, + "loss": 0.588, + "step": 4165 + }, + { + "epoch": 2.467278649689073, + "grad_norm": 1.4655585715236081, + "learning_rate": 1.6044807819424545e-06, + "loss": 0.6662, + "step": 4166 + }, + { + "epoch": 2.46787089132366, + "grad_norm": 1.35471687098539, + "learning_rate": 1.6010078194185752e-06, + "loss": 0.6332, + "step": 4167 + }, + { + "epoch": 2.468463132958247, + "grad_norm": 1.4021986342558357, + "learning_rate": 1.5975382925547966e-06, + "loss": 0.6344, + "step": 4168 + }, + { + "epoch": 2.469055374592834, + "grad_norm": 1.5104775229268375, + "learning_rate": 1.5940722027703516e-06, + "loss": 0.595, + "step": 4169 + }, + { + "epoch": 2.469647616227421, + "grad_norm": 1.5242057096864499, + "learning_rate": 1.5906095514830645e-06, + "loss": 0.6058, + "step": 4170 + }, + { + "epoch": 2.4702398578620075, + "grad_norm": 1.446713003214256, + "learning_rate": 1.5871503401093501e-06, + "loss": 0.6319, + "step": 4171 + }, + { + "epoch": 2.4708320994965947, + "grad_norm": 2.343073886207364, + "learning_rate": 1.5836945700642248e-06, + "loss": 0.6254, + "step": 4172 + }, + { + "epoch": 2.4714243411311814, + "grad_norm": 2.399540643140653, + "learning_rate": 1.580242242761295e-06, + "loss": 0.5846, + "step": 4173 + }, + { + "epoch": 2.4720165827657685, + "grad_norm": 1.928520767924082, + "learning_rate": 1.5767933596127528e-06, + "loss": 0.6639, + "step": 4174 + }, + { + "epoch": 2.4726088244003552, + "grad_norm": 1.1218273751756729, + "learning_rate": 1.5733479220293847e-06, + "loss": 0.5455, + "step": 4175 + }, + { + "epoch": 2.4732010660349424, + "grad_norm": 1.379515727004053, + "learning_rate": 1.56990593142057e-06, + "loss": 0.6029, + "step": 4176 + }, + { + "epoch": 2.473793307669529, + "grad_norm": 2.093979049163968, + "learning_rate": 1.5664673891942805e-06, + "loss": 0.5524, + "step": 4177 + }, + { + "epoch": 2.4743855493041162, + "grad_norm": 1.4867542033675507, + "learning_rate": 1.5630322967570655e-06, + "loss": 0.6112, + "step": 4178 + }, + { + "epoch": 2.474977790938703, + "grad_norm": 1.5435512842922596, + "learning_rate": 1.559600655514074e-06, + "loss": 0.609, + "step": 4179 + }, + { + "epoch": 2.47557003257329, + "grad_norm": 1.5223906435727905, + "learning_rate": 1.5561724668690436e-06, + "loss": 0.5927, + "step": 4180 + }, + { + "epoch": 2.476162274207877, + "grad_norm": 1.7939175275171766, + "learning_rate": 1.5527477322242934e-06, + "loss": 0.5754, + "step": 4181 + }, + { + "epoch": 2.4767545158424635, + "grad_norm": 2.322851003634435, + "learning_rate": 1.5493264529807305e-06, + "loss": 0.6463, + "step": 4182 + }, + { + "epoch": 2.4773467574770507, + "grad_norm": 1.4341109913608379, + "learning_rate": 1.5459086305378524e-06, + "loss": 0.6512, + "step": 4183 + }, + { + "epoch": 2.4779389991116374, + "grad_norm": 2.000472244942328, + "learning_rate": 1.5424942662937436e-06, + "loss": 0.6175, + "step": 4184 + }, + { + "epoch": 2.4785312407462246, + "grad_norm": 1.1364991284051775, + "learning_rate": 1.5390833616450684e-06, + "loss": 0.5778, + "step": 4185 + }, + { + "epoch": 2.4791234823808113, + "grad_norm": 2.5864290370042187, + "learning_rate": 1.5356759179870762e-06, + "loss": 0.6695, + "step": 4186 + }, + { + "epoch": 2.4797157240153984, + "grad_norm": 1.4569921907313288, + "learning_rate": 1.5322719367136064e-06, + "loss": 0.5809, + "step": 4187 + }, + { + "epoch": 2.480307965649985, + "grad_norm": 1.3478923100787537, + "learning_rate": 1.5288714192170796e-06, + "loss": 0.5834, + "step": 4188 + }, + { + "epoch": 2.4809002072845723, + "grad_norm": 3.810585927154581, + "learning_rate": 1.5254743668884963e-06, + "loss": 0.6608, + "step": 4189 + }, + { + "epoch": 2.481492448919159, + "grad_norm": 1.3909152535067673, + "learning_rate": 1.522080781117441e-06, + "loss": 0.6216, + "step": 4190 + }, + { + "epoch": 2.482084690553746, + "grad_norm": 1.7514258321845537, + "learning_rate": 1.5186906632920816e-06, + "loss": 0.6203, + "step": 4191 + }, + { + "epoch": 2.482676932188333, + "grad_norm": 1.731744905638932, + "learning_rate": 1.5153040147991716e-06, + "loss": 0.613, + "step": 4192 + }, + { + "epoch": 2.4832691738229196, + "grad_norm": 2.3774079021997765, + "learning_rate": 1.5119208370240369e-06, + "loss": 0.6039, + "step": 4193 + }, + { + "epoch": 2.4838614154575067, + "grad_norm": 1.9933956673478872, + "learning_rate": 1.5085411313505849e-06, + "loss": 0.6405, + "step": 4194 + }, + { + "epoch": 2.4844536570920934, + "grad_norm": 2.8685180286755187, + "learning_rate": 1.5051648991613077e-06, + "loss": 0.621, + "step": 4195 + }, + { + "epoch": 2.4850458987266806, + "grad_norm": 1.2386381396531745, + "learning_rate": 1.5017921418372772e-06, + "loss": 0.6757, + "step": 4196 + }, + { + "epoch": 2.4856381403612673, + "grad_norm": 1.2887006546671949, + "learning_rate": 1.4984228607581386e-06, + "loss": 0.599, + "step": 4197 + }, + { + "epoch": 2.4862303819958544, + "grad_norm": 1.3284199306461797, + "learning_rate": 1.4950570573021138e-06, + "loss": 0.5998, + "step": 4198 + }, + { + "epoch": 2.486822623630441, + "grad_norm": 1.505776114908257, + "learning_rate": 1.4916947328460108e-06, + "loss": 0.6197, + "step": 4199 + }, + { + "epoch": 2.4874148652650283, + "grad_norm": 1.4391666401175998, + "learning_rate": 1.4883358887652044e-06, + "loss": 0.5938, + "step": 4200 + }, + { + "epoch": 2.488007106899615, + "grad_norm": 1.7810197494911437, + "learning_rate": 1.484980526433657e-06, + "loss": 0.6247, + "step": 4201 + }, + { + "epoch": 2.488599348534202, + "grad_norm": 1.8202240648965637, + "learning_rate": 1.4816286472238939e-06, + "loss": 0.5803, + "step": 4202 + }, + { + "epoch": 2.489191590168789, + "grad_norm": 1.5697032510584463, + "learning_rate": 1.4782802525070282e-06, + "loss": 0.6114, + "step": 4203 + }, + { + "epoch": 2.4897838318033756, + "grad_norm": 1.3273698484858674, + "learning_rate": 1.474935343652736e-06, + "loss": 0.6103, + "step": 4204 + }, + { + "epoch": 2.4903760734379627, + "grad_norm": 1.825999416280271, + "learning_rate": 1.4715939220292775e-06, + "loss": 0.6109, + "step": 4205 + }, + { + "epoch": 2.4909683150725495, + "grad_norm": 6.697769541146492, + "learning_rate": 1.4682559890034787e-06, + "loss": 0.5785, + "step": 4206 + }, + { + "epoch": 2.4915605567071366, + "grad_norm": 4.951124117950075, + "learning_rate": 1.4649215459407462e-06, + "loss": 0.6181, + "step": 4207 + }, + { + "epoch": 2.4921527983417233, + "grad_norm": 2.2685218356339996, + "learning_rate": 1.461590594205049e-06, + "loss": 0.5906, + "step": 4208 + }, + { + "epoch": 2.4927450399763105, + "grad_norm": 1.187772811816334, + "learning_rate": 1.4582631351589405e-06, + "loss": 0.621, + "step": 4209 + }, + { + "epoch": 2.493337281610897, + "grad_norm": 1.568313648134276, + "learning_rate": 1.4549391701635308e-06, + "loss": 0.6407, + "step": 4210 + }, + { + "epoch": 2.4939295232454843, + "grad_norm": 2.056945291130729, + "learning_rate": 1.4516187005785153e-06, + "loss": 0.6526, + "step": 4211 + }, + { + "epoch": 2.494521764880071, + "grad_norm": 2.5897828120036372, + "learning_rate": 1.4483017277621482e-06, + "loss": 0.5857, + "step": 4212 + }, + { + "epoch": 2.495114006514658, + "grad_norm": 2.003851685407157, + "learning_rate": 1.4449882530712621e-06, + "loss": 0.5891, + "step": 4213 + }, + { + "epoch": 2.495706248149245, + "grad_norm": 2.031397305896863, + "learning_rate": 1.4416782778612514e-06, + "loss": 0.6374, + "step": 4214 + }, + { + "epoch": 2.4962984897838316, + "grad_norm": 2.460742739390165, + "learning_rate": 1.4383718034860806e-06, + "loss": 0.6282, + "step": 4215 + }, + { + "epoch": 2.4968907314184188, + "grad_norm": 1.8883652271349463, + "learning_rate": 1.4350688312982864e-06, + "loss": 0.607, + "step": 4216 + }, + { + "epoch": 2.4974829730530055, + "grad_norm": 1.6383575358198943, + "learning_rate": 1.4317693626489715e-06, + "loss": 0.6155, + "step": 4217 + }, + { + "epoch": 2.4980752146875926, + "grad_norm": 1.5789440462021105, + "learning_rate": 1.428473398887802e-06, + "loss": 0.6475, + "step": 4218 + }, + { + "epoch": 2.4986674563221793, + "grad_norm": 1.7872165908651498, + "learning_rate": 1.4251809413630103e-06, + "loss": 0.6318, + "step": 4219 + }, + { + "epoch": 2.4992596979567665, + "grad_norm": 1.2177122630184891, + "learning_rate": 1.421891991421399e-06, + "loss": 0.6186, + "step": 4220 + }, + { + "epoch": 2.499851939591353, + "grad_norm": 5.5110163664079765, + "learning_rate": 1.4186065504083356e-06, + "loss": 0.6125, + "step": 4221 + }, + { + "epoch": 2.50044418122594, + "grad_norm": 1.3204263434006236, + "learning_rate": 1.4153246196677483e-06, + "loss": 0.5692, + "step": 4222 + }, + { + "epoch": 2.501036422860527, + "grad_norm": 2.040538027329011, + "learning_rate": 1.4120462005421287e-06, + "loss": 0.6165, + "step": 4223 + }, + { + "epoch": 2.5016286644951142, + "grad_norm": 4.142369576433715, + "learning_rate": 1.4087712943725384e-06, + "loss": 0.6178, + "step": 4224 + }, + { + "epoch": 2.502220906129701, + "grad_norm": 1.1861793588300895, + "learning_rate": 1.405499902498597e-06, + "loss": 0.6187, + "step": 4225 + }, + { + "epoch": 2.5028131477642876, + "grad_norm": 1.2674089845867869, + "learning_rate": 1.40223202625849e-06, + "loss": 0.6052, + "step": 4226 + }, + { + "epoch": 2.503405389398875, + "grad_norm": 1.6386149471462386, + "learning_rate": 1.3989676669889562e-06, + "loss": 0.5914, + "step": 4227 + }, + { + "epoch": 2.5039976310334615, + "grad_norm": 1.5069585186135463, + "learning_rate": 1.395706826025306e-06, + "loss": 0.5885, + "step": 4228 + }, + { + "epoch": 2.5045898726680487, + "grad_norm": 1.5769533827442805, + "learning_rate": 1.392449504701412e-06, + "loss": 0.578, + "step": 4229 + }, + { + "epoch": 2.5051821143026354, + "grad_norm": 1.9577342067869337, + "learning_rate": 1.3891957043496917e-06, + "loss": 0.5767, + "step": 4230 + }, + { + "epoch": 2.5057743559372225, + "grad_norm": 1.820348565915433, + "learning_rate": 1.3859454263011373e-06, + "loss": 0.6608, + "step": 4231 + }, + { + "epoch": 2.5063665975718092, + "grad_norm": 1.4196495980962014, + "learning_rate": 1.3826986718852952e-06, + "loss": 0.6256, + "step": 4232 + }, + { + "epoch": 2.506958839206396, + "grad_norm": 2.614783295311572, + "learning_rate": 1.3794554424302752e-06, + "loss": 0.6216, + "step": 4233 + }, + { + "epoch": 2.507551080840983, + "grad_norm": 2.0862712740764726, + "learning_rate": 1.3762157392627317e-06, + "loss": 0.6103, + "step": 4234 + }, + { + "epoch": 2.5081433224755703, + "grad_norm": 1.4923395580122512, + "learning_rate": 1.372979563707889e-06, + "loss": 0.6247, + "step": 4235 + }, + { + "epoch": 2.508735564110157, + "grad_norm": 1.6903494282840945, + "learning_rate": 1.3697469170895282e-06, + "loss": 0.6138, + "step": 4236 + }, + { + "epoch": 2.5093278057447437, + "grad_norm": 1.7287805590209488, + "learning_rate": 1.3665178007299818e-06, + "loss": 0.5927, + "step": 4237 + }, + { + "epoch": 2.509920047379331, + "grad_norm": 2.0975391419269314, + "learning_rate": 1.363292215950135e-06, + "loss": 0.5833, + "step": 4238 + }, + { + "epoch": 2.5105122890139175, + "grad_norm": 1.7555964609395533, + "learning_rate": 1.3600701640694392e-06, + "loss": 0.6121, + "step": 4239 + }, + { + "epoch": 2.5111045306485047, + "grad_norm": 1.6961673222728129, + "learning_rate": 1.3568516464058946e-06, + "loss": 0.6195, + "step": 4240 + }, + { + "epoch": 2.5116967722830914, + "grad_norm": 2.079432102930987, + "learning_rate": 1.3536366642760534e-06, + "loss": 0.6057, + "step": 4241 + }, + { + "epoch": 2.5122890139176786, + "grad_norm": 1.1628789903318941, + "learning_rate": 1.350425218995024e-06, + "loss": 0.5979, + "step": 4242 + }, + { + "epoch": 2.5128812555522653, + "grad_norm": 1.454884994824475, + "learning_rate": 1.3472173118764686e-06, + "loss": 0.5959, + "step": 4243 + }, + { + "epoch": 2.513473497186852, + "grad_norm": 1.977230587423518, + "learning_rate": 1.3440129442326045e-06, + "loss": 0.604, + "step": 4244 + }, + { + "epoch": 2.514065738821439, + "grad_norm": 1.4424961332700483, + "learning_rate": 1.3408121173741962e-06, + "loss": 0.609, + "step": 4245 + }, + { + "epoch": 2.5146579804560263, + "grad_norm": 1.178276635818679, + "learning_rate": 1.3376148326105586e-06, + "loss": 0.6249, + "step": 4246 + }, + { + "epoch": 2.515250222090613, + "grad_norm": 1.5026051517578134, + "learning_rate": 1.3344210912495648e-06, + "loss": 0.642, + "step": 4247 + }, + { + "epoch": 2.5158424637251997, + "grad_norm": 1.8145268871556535, + "learning_rate": 1.3312308945976348e-06, + "loss": 0.6375, + "step": 4248 + }, + { + "epoch": 2.516434705359787, + "grad_norm": 1.7801182768357935, + "learning_rate": 1.3280442439597384e-06, + "loss": 0.6133, + "step": 4249 + }, + { + "epoch": 2.5170269469943736, + "grad_norm": 2.2296683859183224, + "learning_rate": 1.3248611406393918e-06, + "loss": 0.6702, + "step": 4250 + }, + { + "epoch": 2.5176191886289607, + "grad_norm": 1.8998316765515955, + "learning_rate": 1.3216815859386667e-06, + "loss": 0.5944, + "step": 4251 + }, + { + "epoch": 2.5182114302635474, + "grad_norm": 2.0543176892307833, + "learning_rate": 1.318505581158177e-06, + "loss": 0.6315, + "step": 4252 + }, + { + "epoch": 2.5188036718981346, + "grad_norm": 1.5059166609031631, + "learning_rate": 1.3153331275970904e-06, + "loss": 0.5729, + "step": 4253 + }, + { + "epoch": 2.5193959135327213, + "grad_norm": 1.6358415801788577, + "learning_rate": 1.3121642265531154e-06, + "loss": 0.6296, + "step": 4254 + }, + { + "epoch": 2.519988155167308, + "grad_norm": 1.36873712039331, + "learning_rate": 1.3089988793225139e-06, + "loss": 0.615, + "step": 4255 + }, + { + "epoch": 2.520580396801895, + "grad_norm": 1.2889128303477049, + "learning_rate": 1.305837087200087e-06, + "loss": 0.6231, + "step": 4256 + }, + { + "epoch": 2.5211726384364823, + "grad_norm": 4.424445990535661, + "learning_rate": 1.30267885147919e-06, + "loss": 0.6012, + "step": 4257 + }, + { + "epoch": 2.521764880071069, + "grad_norm": 1.5525798274801588, + "learning_rate": 1.299524173451715e-06, + "loss": 0.5881, + "step": 4258 + }, + { + "epoch": 2.5223571217056557, + "grad_norm": 2.316949095558541, + "learning_rate": 1.2963730544081065e-06, + "loss": 0.6059, + "step": 4259 + }, + { + "epoch": 2.522949363340243, + "grad_norm": 1.6494823394802665, + "learning_rate": 1.2932254956373457e-06, + "loss": 0.6213, + "step": 4260 + }, + { + "epoch": 2.5235416049748296, + "grad_norm": 1.5280751943361606, + "learning_rate": 1.290081498426965e-06, + "loss": 0.5645, + "step": 4261 + }, + { + "epoch": 2.5241338466094168, + "grad_norm": 1.9376208589060975, + "learning_rate": 1.286941064063031e-06, + "loss": 0.5926, + "step": 4262 + }, + { + "epoch": 2.5247260882440035, + "grad_norm": 1.429613590939648, + "learning_rate": 1.2838041938301638e-06, + "loss": 0.6164, + "step": 4263 + }, + { + "epoch": 2.5253183298785906, + "grad_norm": 2.5583596054226247, + "learning_rate": 1.2806708890115138e-06, + "loss": 0.5951, + "step": 4264 + }, + { + "epoch": 2.5259105715131773, + "grad_norm": 2.2738048703872513, + "learning_rate": 1.2775411508887837e-06, + "loss": 0.6259, + "step": 4265 + }, + { + "epoch": 2.526502813147764, + "grad_norm": 2.872265401198682, + "learning_rate": 1.2744149807422113e-06, + "loss": 0.5911, + "step": 4266 + }, + { + "epoch": 2.527095054782351, + "grad_norm": 1.6508438193817, + "learning_rate": 1.2712923798505727e-06, + "loss": 0.6179, + "step": 4267 + }, + { + "epoch": 2.5276872964169383, + "grad_norm": 2.2705730263320825, + "learning_rate": 1.2681733494911897e-06, + "loss": 0.6396, + "step": 4268 + }, + { + "epoch": 2.528279538051525, + "grad_norm": 1.7932950964539862, + "learning_rate": 1.2650578909399225e-06, + "loss": 0.5945, + "step": 4269 + }, + { + "epoch": 2.5288717796861118, + "grad_norm": 2.039463332472355, + "learning_rate": 1.2619460054711685e-06, + "loss": 0.598, + "step": 4270 + }, + { + "epoch": 2.529464021320699, + "grad_norm": 2.1940382419730446, + "learning_rate": 1.2588376943578594e-06, + "loss": 0.6173, + "step": 4271 + }, + { + "epoch": 2.5300562629552856, + "grad_norm": 1.334149394220382, + "learning_rate": 1.2557329588714739e-06, + "loss": 0.6212, + "step": 4272 + }, + { + "epoch": 2.530648504589873, + "grad_norm": 1.153175614903197, + "learning_rate": 1.2526318002820237e-06, + "loss": 0.5776, + "step": 4273 + }, + { + "epoch": 2.5312407462244595, + "grad_norm": 2.7701095890179848, + "learning_rate": 1.2495342198580562e-06, + "loss": 0.6507, + "step": 4274 + }, + { + "epoch": 2.5318329878590466, + "grad_norm": 1.9084746730843827, + "learning_rate": 1.2464402188666524e-06, + "loss": 0.5635, + "step": 4275 + }, + { + "epoch": 2.5324252294936334, + "grad_norm": 3.3261345903916824, + "learning_rate": 1.2433497985734356e-06, + "loss": 0.6515, + "step": 4276 + }, + { + "epoch": 2.53301747112822, + "grad_norm": 1.3772062277063388, + "learning_rate": 1.2402629602425643e-06, + "loss": 0.6683, + "step": 4277 + }, + { + "epoch": 2.533609712762807, + "grad_norm": 1.5163170294865773, + "learning_rate": 1.237179705136725e-06, + "loss": 0.6679, + "step": 4278 + }, + { + "epoch": 2.5342019543973944, + "grad_norm": 1.5235149346531982, + "learning_rate": 1.2341000345171417e-06, + "loss": 0.5914, + "step": 4279 + }, + { + "epoch": 2.534794196031981, + "grad_norm": 1.4537614359095834, + "learning_rate": 1.2310239496435749e-06, + "loss": 0.5615, + "step": 4280 + }, + { + "epoch": 2.535386437666568, + "grad_norm": 3.3433167384515285, + "learning_rate": 1.2279514517743208e-06, + "loss": 0.5961, + "step": 4281 + }, + { + "epoch": 2.535978679301155, + "grad_norm": 1.759299393387278, + "learning_rate": 1.2248825421661937e-06, + "loss": 0.5994, + "step": 4282 + }, + { + "epoch": 2.5365709209357417, + "grad_norm": 5.824067607433355, + "learning_rate": 1.2218172220745562e-06, + "loss": 0.6236, + "step": 4283 + }, + { + "epoch": 2.537163162570329, + "grad_norm": 4.849146537686694, + "learning_rate": 1.2187554927532963e-06, + "loss": 0.6548, + "step": 4284 + }, + { + "epoch": 2.5377554042049155, + "grad_norm": 1.8728352219215554, + "learning_rate": 1.2156973554548369e-06, + "loss": 0.6009, + "step": 4285 + }, + { + "epoch": 2.5383476458395027, + "grad_norm": 2.628941079406281, + "learning_rate": 1.2126428114301204e-06, + "loss": 0.5838, + "step": 4286 + }, + { + "epoch": 2.5389398874740894, + "grad_norm": 1.7988812066523692, + "learning_rate": 1.2095918619286306e-06, + "loss": 0.611, + "step": 4287 + }, + { + "epoch": 2.539532129108676, + "grad_norm": 5.152642550826023, + "learning_rate": 1.2065445081983795e-06, + "loss": 0.627, + "step": 4288 + }, + { + "epoch": 2.5401243707432632, + "grad_norm": 2.539557904372852, + "learning_rate": 1.2035007514859054e-06, + "loss": 0.5975, + "step": 4289 + }, + { + "epoch": 2.5407166123778504, + "grad_norm": 1.4457429738287353, + "learning_rate": 1.2004605930362724e-06, + "loss": 0.566, + "step": 4290 + }, + { + "epoch": 2.541308854012437, + "grad_norm": 3.4785094614740677, + "learning_rate": 1.19742403409308e-06, + "loss": 0.5893, + "step": 4291 + }, + { + "epoch": 2.541901095647024, + "grad_norm": 2.4252390321481703, + "learning_rate": 1.194391075898451e-06, + "loss": 0.5936, + "step": 4292 + }, + { + "epoch": 2.542493337281611, + "grad_norm": 1.483539871741351, + "learning_rate": 1.191361719693036e-06, + "loss": 0.6091, + "step": 4293 + }, + { + "epoch": 2.5430855789161977, + "grad_norm": 1.7394003556175763, + "learning_rate": 1.1883359667160087e-06, + "loss": 0.6362, + "step": 4294 + }, + { + "epoch": 2.543677820550785, + "grad_norm": 1.2312450282067198, + "learning_rate": 1.185313818205076e-06, + "loss": 0.6277, + "step": 4295 + }, + { + "epoch": 2.5442700621853716, + "grad_norm": 1.2235952845823084, + "learning_rate": 1.1822952753964667e-06, + "loss": 0.5776, + "step": 4296 + }, + { + "epoch": 2.5448623038199587, + "grad_norm": 1.3836979953577477, + "learning_rate": 1.179280339524933e-06, + "loss": 0.6458, + "step": 4297 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 1.550091891298104, + "learning_rate": 1.1762690118237518e-06, + "loss": 0.6219, + "step": 4298 + }, + { + "epoch": 2.546046787089132, + "grad_norm": 2.6309104401555286, + "learning_rate": 1.1732612935247267e-06, + "loss": 0.6267, + "step": 4299 + }, + { + "epoch": 2.5466390287237193, + "grad_norm": 1.7871342600964928, + "learning_rate": 1.1702571858581867e-06, + "loss": 0.6586, + "step": 4300 + }, + { + "epoch": 2.5472312703583064, + "grad_norm": 3.2118108366654257, + "learning_rate": 1.167256690052978e-06, + "loss": 0.6428, + "step": 4301 + }, + { + "epoch": 2.547823511992893, + "grad_norm": 1.8557141426320194, + "learning_rate": 1.1642598073364707e-06, + "loss": 0.5865, + "step": 4302 + }, + { + "epoch": 2.54841575362748, + "grad_norm": 1.7419326195322122, + "learning_rate": 1.1612665389345613e-06, + "loss": 0.586, + "step": 4303 + }, + { + "epoch": 2.549007995262067, + "grad_norm": 4.789685134855233, + "learning_rate": 1.158276886071662e-06, + "loss": 0.6206, + "step": 4304 + }, + { + "epoch": 2.5496002368966537, + "grad_norm": 1.681521269697521, + "learning_rate": 1.1552908499707115e-06, + "loss": 0.5966, + "step": 4305 + }, + { + "epoch": 2.550192478531241, + "grad_norm": 1.7353432303503473, + "learning_rate": 1.1523084318531641e-06, + "loss": 0.6347, + "step": 4306 + }, + { + "epoch": 2.5507847201658276, + "grad_norm": 1.4255690439020448, + "learning_rate": 1.1493296329390003e-06, + "loss": 0.6146, + "step": 4307 + }, + { + "epoch": 2.5513769618004147, + "grad_norm": 2.180356591475331, + "learning_rate": 1.1463544544467109e-06, + "loss": 0.6158, + "step": 4308 + }, + { + "epoch": 2.5519692034350014, + "grad_norm": 1.6710087648778493, + "learning_rate": 1.143382897593316e-06, + "loss": 0.6162, + "step": 4309 + }, + { + "epoch": 2.552561445069588, + "grad_norm": 1.5114876553797243, + "learning_rate": 1.1404149635943462e-06, + "loss": 0.5982, + "step": 4310 + }, + { + "epoch": 2.5531536867041753, + "grad_norm": 1.9274949790613265, + "learning_rate": 1.1374506536638574e-06, + "loss": 0.5799, + "step": 4311 + }, + { + "epoch": 2.5537459283387625, + "grad_norm": 1.1942475585564856, + "learning_rate": 1.134489969014414e-06, + "loss": 0.5993, + "step": 4312 + }, + { + "epoch": 2.554338169973349, + "grad_norm": 2.7949402099250795, + "learning_rate": 1.1315329108571072e-06, + "loss": 0.5959, + "step": 4313 + }, + { + "epoch": 2.554930411607936, + "grad_norm": 2.1002894683071345, + "learning_rate": 1.1285794804015349e-06, + "loss": 0.6032, + "step": 4314 + }, + { + "epoch": 2.555522653242523, + "grad_norm": 1.332872243171736, + "learning_rate": 1.125629678855822e-06, + "loss": 0.6241, + "step": 4315 + }, + { + "epoch": 2.5561148948771097, + "grad_norm": 2.3220966071152294, + "learning_rate": 1.1226835074265985e-06, + "loss": 0.6073, + "step": 4316 + }, + { + "epoch": 2.556707136511697, + "grad_norm": 1.166136350433321, + "learning_rate": 1.1197409673190186e-06, + "loss": 0.5955, + "step": 4317 + }, + { + "epoch": 2.5572993781462836, + "grad_norm": 1.4802954541867506, + "learning_rate": 1.1168020597367435e-06, + "loss": 0.6378, + "step": 4318 + }, + { + "epoch": 2.5578916197808708, + "grad_norm": 1.4086127534519697, + "learning_rate": 1.1138667858819497e-06, + "loss": 0.5804, + "step": 4319 + }, + { + "epoch": 2.5584838614154575, + "grad_norm": 1.6694041074811221, + "learning_rate": 1.1109351469553331e-06, + "loss": 0.6109, + "step": 4320 + }, + { + "epoch": 2.559076103050044, + "grad_norm": 1.81052273937072, + "learning_rate": 1.1080071441560992e-06, + "loss": 0.6108, + "step": 4321 + }, + { + "epoch": 2.5596683446846313, + "grad_norm": 2.0222094639871147, + "learning_rate": 1.105082778681964e-06, + "loss": 0.5742, + "step": 4322 + }, + { + "epoch": 2.5602605863192185, + "grad_norm": 2.3303481014699003, + "learning_rate": 1.1021620517291566e-06, + "loss": 0.6679, + "step": 4323 + }, + { + "epoch": 2.560852827953805, + "grad_norm": 1.2189662484050676, + "learning_rate": 1.0992449644924186e-06, + "loss": 0.5603, + "step": 4324 + }, + { + "epoch": 2.561445069588392, + "grad_norm": 1.6090711124893189, + "learning_rate": 1.0963315181650058e-06, + "loss": 0.5734, + "step": 4325 + }, + { + "epoch": 2.562037311222979, + "grad_norm": 1.4371331051564935, + "learning_rate": 1.0934217139386805e-06, + "loss": 0.6292, + "step": 4326 + }, + { + "epoch": 2.5626295528575658, + "grad_norm": 1.637674628017765, + "learning_rate": 1.0905155530037125e-06, + "loss": 0.6119, + "step": 4327 + }, + { + "epoch": 2.563221794492153, + "grad_norm": 2.25569504293535, + "learning_rate": 1.087613036548888e-06, + "loss": 0.5816, + "step": 4328 + }, + { + "epoch": 2.5638140361267396, + "grad_norm": 2.2331219302066136, + "learning_rate": 1.0847141657615023e-06, + "loss": 0.6207, + "step": 4329 + }, + { + "epoch": 2.564406277761327, + "grad_norm": 1.0824128878079005, + "learning_rate": 1.0818189418273527e-06, + "loss": 0.6287, + "step": 4330 + }, + { + "epoch": 2.5649985193959135, + "grad_norm": 1.7593659910084987, + "learning_rate": 1.0789273659307476e-06, + "loss": 0.6439, + "step": 4331 + }, + { + "epoch": 2.5655907610305, + "grad_norm": 1.761466082062497, + "learning_rate": 1.0760394392545058e-06, + "loss": 0.6039, + "step": 4332 + }, + { + "epoch": 2.5661830026650874, + "grad_norm": 1.7622676780184343, + "learning_rate": 1.0731551629799542e-06, + "loss": 0.5764, + "step": 4333 + }, + { + "epoch": 2.5667752442996745, + "grad_norm": 3.3213032917052576, + "learning_rate": 1.0702745382869207e-06, + "loss": 0.6205, + "step": 4334 + }, + { + "epoch": 2.5673674859342612, + "grad_norm": 1.14995546121375, + "learning_rate": 1.0673975663537428e-06, + "loss": 0.5977, + "step": 4335 + }, + { + "epoch": 2.567959727568848, + "grad_norm": 1.225755679706984, + "learning_rate": 1.064524248357265e-06, + "loss": 0.6445, + "step": 4336 + }, + { + "epoch": 2.568551969203435, + "grad_norm": 1.9102011927596232, + "learning_rate": 1.0616545854728388e-06, + "loss": 0.6223, + "step": 4337 + }, + { + "epoch": 2.569144210838022, + "grad_norm": 1.644287131836029, + "learning_rate": 1.0587885788743112e-06, + "loss": 0.6169, + "step": 4338 + }, + { + "epoch": 2.569736452472609, + "grad_norm": 2.229153860357448, + "learning_rate": 1.0559262297340433e-06, + "loss": 0.6087, + "step": 4339 + }, + { + "epoch": 2.5703286941071957, + "grad_norm": 1.8865674189894543, + "learning_rate": 1.0530675392228995e-06, + "loss": 0.6173, + "step": 4340 + }, + { + "epoch": 2.570920935741783, + "grad_norm": 6.002385683935582, + "learning_rate": 1.0502125085102422e-06, + "loss": 0.6208, + "step": 4341 + }, + { + "epoch": 2.5715131773763695, + "grad_norm": 2.24501027749272, + "learning_rate": 1.047361138763937e-06, + "loss": 0.6327, + "step": 4342 + }, + { + "epoch": 2.5721054190109562, + "grad_norm": 1.488334337447085, + "learning_rate": 1.0445134311503592e-06, + "loss": 0.6553, + "step": 4343 + }, + { + "epoch": 2.5726976606455434, + "grad_norm": 5.179170676032384, + "learning_rate": 1.0416693868343796e-06, + "loss": 0.5995, + "step": 4344 + }, + { + "epoch": 2.5732899022801305, + "grad_norm": 2.911488695193342, + "learning_rate": 1.0388290069793726e-06, + "loss": 0.6315, + "step": 4345 + }, + { + "epoch": 2.5738821439147173, + "grad_norm": 1.257777620049107, + "learning_rate": 1.035992292747211e-06, + "loss": 0.5844, + "step": 4346 + }, + { + "epoch": 2.574474385549304, + "grad_norm": 1.0731010623057406, + "learning_rate": 1.0331592452982718e-06, + "loss": 0.5987, + "step": 4347 + }, + { + "epoch": 2.575066627183891, + "grad_norm": 3.003901689507578, + "learning_rate": 1.030329865791434e-06, + "loss": 0.6051, + "step": 4348 + }, + { + "epoch": 2.575658868818478, + "grad_norm": 1.7372715719465501, + "learning_rate": 1.0275041553840691e-06, + "loss": 0.6226, + "step": 4349 + }, + { + "epoch": 2.576251110453065, + "grad_norm": 1.7371414977000834, + "learning_rate": 1.0246821152320507e-06, + "loss": 0.5935, + "step": 4350 + }, + { + "epoch": 2.5768433520876517, + "grad_norm": 1.408153611176849, + "learning_rate": 1.0218637464897541e-06, + "loss": 0.5643, + "step": 4351 + }, + { + "epoch": 2.577435593722239, + "grad_norm": 1.780798414186877, + "learning_rate": 1.0190490503100515e-06, + "loss": 0.6226, + "step": 4352 + }, + { + "epoch": 2.5780278353568256, + "grad_norm": 2.2724733020724424, + "learning_rate": 1.0162380278443107e-06, + "loss": 0.6212, + "step": 4353 + }, + { + "epoch": 2.5786200769914123, + "grad_norm": 2.3417587051881874, + "learning_rate": 1.0134306802423965e-06, + "loss": 0.5835, + "step": 4354 + }, + { + "epoch": 2.5792123186259994, + "grad_norm": 2.1212455863514625, + "learning_rate": 1.010627008652675e-06, + "loss": 0.6133, + "step": 4355 + }, + { + "epoch": 2.5798045602605866, + "grad_norm": 2.605778840709358, + "learning_rate": 1.0078270142220015e-06, + "loss": 0.6165, + "step": 4356 + }, + { + "epoch": 2.5803968018951733, + "grad_norm": 1.30684333675587, + "learning_rate": 1.0050306980957358e-06, + "loss": 0.6219, + "step": 4357 + }, + { + "epoch": 2.58098904352976, + "grad_norm": 1.3407378615869856, + "learning_rate": 1.0022380614177251e-06, + "loss": 0.6391, + "step": 4358 + }, + { + "epoch": 2.581581285164347, + "grad_norm": 1.673176460809278, + "learning_rate": 9.994491053303169e-07, + "loss": 0.6031, + "step": 4359 + }, + { + "epoch": 2.582173526798934, + "grad_norm": 1.3507011115002057, + "learning_rate": 9.966638309743481e-07, + "loss": 0.6097, + "step": 4360 + }, + { + "epoch": 2.582765768433521, + "grad_norm": 1.31675965356572, + "learning_rate": 9.938822394891568e-07, + "loss": 0.613, + "step": 4361 + }, + { + "epoch": 2.5833580100681077, + "grad_norm": 1.3590258887169568, + "learning_rate": 9.911043320125657e-07, + "loss": 0.6662, + "step": 4362 + }, + { + "epoch": 2.583950251702695, + "grad_norm": 1.8770135822453762, + "learning_rate": 9.883301096808995e-07, + "loss": 0.6422, + "step": 4363 + }, + { + "epoch": 2.5845424933372816, + "grad_norm": 2.121220810517652, + "learning_rate": 9.85559573628967e-07, + "loss": 0.6463, + "step": 4364 + }, + { + "epoch": 2.5851347349718683, + "grad_norm": 1.939410319473363, + "learning_rate": 9.827927249900782e-07, + "loss": 0.6276, + "step": 4365 + }, + { + "epoch": 2.5857269766064555, + "grad_norm": 1.5896698304385093, + "learning_rate": 9.800295648960245e-07, + "loss": 0.5878, + "step": 4366 + }, + { + "epoch": 2.5863192182410426, + "grad_norm": 2.349609733093395, + "learning_rate": 9.772700944770973e-07, + "loss": 0.639, + "step": 4367 + }, + { + "epoch": 2.5869114598756293, + "grad_norm": 1.502375187876737, + "learning_rate": 9.74514314862074e-07, + "loss": 0.623, + "step": 4368 + }, + { + "epoch": 2.587503701510216, + "grad_norm": 1.2592816904026312, + "learning_rate": 9.717622271782234e-07, + "loss": 0.6151, + "step": 4369 + }, + { + "epoch": 2.588095943144803, + "grad_norm": 1.5838796869432388, + "learning_rate": 9.690138325513043e-07, + "loss": 0.6271, + "step": 4370 + }, + { + "epoch": 2.58868818477939, + "grad_norm": 3.9815136759012213, + "learning_rate": 9.66269132105565e-07, + "loss": 0.6234, + "step": 4371 + }, + { + "epoch": 2.589280426413977, + "grad_norm": 1.332493171381054, + "learning_rate": 9.635281269637409e-07, + "loss": 0.6035, + "step": 4372 + }, + { + "epoch": 2.5898726680485638, + "grad_norm": 1.7683470516926967, + "learning_rate": 9.607908182470593e-07, + "loss": 0.6349, + "step": 4373 + }, + { + "epoch": 2.590464909683151, + "grad_norm": 2.0170932132048334, + "learning_rate": 9.580572070752335e-07, + "loss": 0.6105, + "step": 4374 + }, + { + "epoch": 2.5910571513177376, + "grad_norm": 2.740791397745886, + "learning_rate": 9.553272945664604e-07, + "loss": 0.5933, + "step": 4375 + }, + { + "epoch": 2.5916493929523243, + "grad_norm": 1.4563051854293856, + "learning_rate": 9.52601081837431e-07, + "loss": 0.6438, + "step": 4376 + }, + { + "epoch": 2.5922416345869115, + "grad_norm": 2.7013335054171974, + "learning_rate": 9.498785700033197e-07, + "loss": 0.6372, + "step": 4377 + }, + { + "epoch": 2.5928338762214986, + "grad_norm": 2.0736824122584423, + "learning_rate": 9.471597601777871e-07, + "loss": 0.6126, + "step": 4378 + }, + { + "epoch": 2.5934261178560853, + "grad_norm": 1.3687787675542844, + "learning_rate": 9.444446534729767e-07, + "loss": 0.6138, + "step": 4379 + }, + { + "epoch": 2.594018359490672, + "grad_norm": 1.4601120548746158, + "learning_rate": 9.417332509995203e-07, + "loss": 0.5814, + "step": 4380 + }, + { + "epoch": 2.594610601125259, + "grad_norm": 1.6919905995540607, + "learning_rate": 9.390255538665383e-07, + "loss": 0.6357, + "step": 4381 + }, + { + "epoch": 2.595202842759846, + "grad_norm": 1.5823616402702225, + "learning_rate": 9.36321563181628e-07, + "loss": 0.585, + "step": 4382 + }, + { + "epoch": 2.595795084394433, + "grad_norm": 1.960553817746592, + "learning_rate": 9.336212800508715e-07, + "loss": 0.6303, + "step": 4383 + }, + { + "epoch": 2.59638732602902, + "grad_norm": 17.328103997100726, + "learning_rate": 9.309247055788384e-07, + "loss": 0.6071, + "step": 4384 + }, + { + "epoch": 2.596979567663607, + "grad_norm": 2.096167704908563, + "learning_rate": 9.282318408685809e-07, + "loss": 0.6205, + "step": 4385 + }, + { + "epoch": 2.5975718092981936, + "grad_norm": 3.5082685509985216, + "learning_rate": 9.255426870216311e-07, + "loss": 0.6005, + "step": 4386 + }, + { + "epoch": 2.5981640509327804, + "grad_norm": 1.6098721047621571, + "learning_rate": 9.228572451380024e-07, + "loss": 0.6077, + "step": 4387 + }, + { + "epoch": 2.5987562925673675, + "grad_norm": 2.5927416189734593, + "learning_rate": 9.201755163161918e-07, + "loss": 0.597, + "step": 4388 + }, + { + "epoch": 2.5993485342019547, + "grad_norm": 1.9744478641135261, + "learning_rate": 9.174975016531828e-07, + "loss": 0.5956, + "step": 4389 + }, + { + "epoch": 2.5999407758365414, + "grad_norm": 1.587520907903784, + "learning_rate": 9.148232022444259e-07, + "loss": 0.5978, + "step": 4390 + }, + { + "epoch": 2.600533017471128, + "grad_norm": 1.7030721112666247, + "learning_rate": 9.121526191838626e-07, + "loss": 0.6412, + "step": 4391 + }, + { + "epoch": 2.6011252591057152, + "grad_norm": 1.6855774573137718, + "learning_rate": 9.094857535639157e-07, + "loss": 0.6673, + "step": 4392 + }, + { + "epoch": 2.601717500740302, + "grad_norm": 1.670526552319908, + "learning_rate": 9.068226064754781e-07, + "loss": 0.6057, + "step": 4393 + }, + { + "epoch": 2.602309742374889, + "grad_norm": 3.2516145639118346, + "learning_rate": 9.041631790079275e-07, + "loss": 0.607, + "step": 4394 + }, + { + "epoch": 2.602901984009476, + "grad_norm": 2.0502691399541777, + "learning_rate": 9.015074722491212e-07, + "loss": 0.6516, + "step": 4395 + }, + { + "epoch": 2.603494225644063, + "grad_norm": 1.9295772880960242, + "learning_rate": 8.988554872853927e-07, + "loss": 0.6131, + "step": 4396 + }, + { + "epoch": 2.6040864672786497, + "grad_norm": 2.8204499089521318, + "learning_rate": 8.962072252015519e-07, + "loss": 0.6764, + "step": 4397 + }, + { + "epoch": 2.6046787089132364, + "grad_norm": 1.449343492660855, + "learning_rate": 8.935626870808856e-07, + "loss": 0.5797, + "step": 4398 + }, + { + "epoch": 2.6052709505478235, + "grad_norm": 1.4023824555418185, + "learning_rate": 8.909218740051596e-07, + "loss": 0.6024, + "step": 4399 + }, + { + "epoch": 2.6058631921824107, + "grad_norm": 1.9603813068399214, + "learning_rate": 8.882847870546174e-07, + "loss": 0.5941, + "step": 4400 + }, + { + "epoch": 2.6064554338169974, + "grad_norm": 2.1424478675974252, + "learning_rate": 8.856514273079741e-07, + "loss": 0.6012, + "step": 4401 + }, + { + "epoch": 2.607047675451584, + "grad_norm": 5.123535144408223, + "learning_rate": 8.830217958424192e-07, + "loss": 0.5819, + "step": 4402 + }, + { + "epoch": 2.6076399170861713, + "grad_norm": 2.2597985178839117, + "learning_rate": 8.803958937336221e-07, + "loss": 0.6186, + "step": 4403 + }, + { + "epoch": 2.608232158720758, + "grad_norm": 2.218872286361537, + "learning_rate": 8.77773722055727e-07, + "loss": 0.6509, + "step": 4404 + }, + { + "epoch": 2.608824400355345, + "grad_norm": 1.4707962923229974, + "learning_rate": 8.751552818813469e-07, + "loss": 0.6754, + "step": 4405 + }, + { + "epoch": 2.609416641989932, + "grad_norm": 1.45480818288004, + "learning_rate": 8.725405742815695e-07, + "loss": 0.5915, + "step": 4406 + }, + { + "epoch": 2.610008883624519, + "grad_norm": 1.3986566797454838, + "learning_rate": 8.699296003259594e-07, + "loss": 0.5885, + "step": 4407 + }, + { + "epoch": 2.6106011252591057, + "grad_norm": 5.235464145421576, + "learning_rate": 8.673223610825532e-07, + "loss": 0.6181, + "step": 4408 + }, + { + "epoch": 2.6111933668936924, + "grad_norm": 1.2189308602666966, + "learning_rate": 8.647188576178567e-07, + "loss": 0.606, + "step": 4409 + }, + { + "epoch": 2.6117856085282796, + "grad_norm": 1.4608812222029703, + "learning_rate": 8.62119090996848e-07, + "loss": 0.6097, + "step": 4410 + }, + { + "epoch": 2.6123778501628667, + "grad_norm": 1.7812634754687648, + "learning_rate": 8.595230622829797e-07, + "loss": 0.6253, + "step": 4411 + }, + { + "epoch": 2.6129700917974534, + "grad_norm": 1.3064781743089184, + "learning_rate": 8.569307725381715e-07, + "loss": 0.6165, + "step": 4412 + }, + { + "epoch": 2.61356233343204, + "grad_norm": 1.335671673105394, + "learning_rate": 8.543422228228182e-07, + "loss": 0.6033, + "step": 4413 + }, + { + "epoch": 2.6141545750666273, + "grad_norm": 1.5391838884745948, + "learning_rate": 8.517574141957796e-07, + "loss": 0.6247, + "step": 4414 + }, + { + "epoch": 2.614746816701214, + "grad_norm": 1.620678307778965, + "learning_rate": 8.491763477143911e-07, + "loss": 0.5832, + "step": 4415 + }, + { + "epoch": 2.615339058335801, + "grad_norm": 1.9416245565474328, + "learning_rate": 8.46599024434449e-07, + "loss": 0.6067, + "step": 4416 + }, + { + "epoch": 2.615931299970388, + "grad_norm": 3.5183251486251983, + "learning_rate": 8.440254454102303e-07, + "loss": 0.5849, + "step": 4417 + }, + { + "epoch": 2.616523541604975, + "grad_norm": 1.735719411627244, + "learning_rate": 8.414556116944672e-07, + "loss": 0.637, + "step": 4418 + }, + { + "epoch": 2.6171157832395617, + "grad_norm": 1.2608882302510347, + "learning_rate": 8.388895243383699e-07, + "loss": 0.6155, + "step": 4419 + }, + { + "epoch": 2.6177080248741484, + "grad_norm": 2.0273357584791865, + "learning_rate": 8.363271843916099e-07, + "loss": 0.6128, + "step": 4420 + }, + { + "epoch": 2.6183002665087356, + "grad_norm": 1.125910415428338, + "learning_rate": 8.33768592902332e-07, + "loss": 0.6086, + "step": 4421 + }, + { + "epoch": 2.6188925081433228, + "grad_norm": 2.818588547251622, + "learning_rate": 8.312137509171392e-07, + "loss": 0.5505, + "step": 4422 + }, + { + "epoch": 2.6194847497779095, + "grad_norm": 2.4748929370800457, + "learning_rate": 8.286626594811098e-07, + "loss": 0.5675, + "step": 4423 + }, + { + "epoch": 2.620076991412496, + "grad_norm": 2.5020858763222456, + "learning_rate": 8.261153196377814e-07, + "loss": 0.5973, + "step": 4424 + }, + { + "epoch": 2.6206692330470833, + "grad_norm": 5.677134946889842, + "learning_rate": 8.235717324291604e-07, + "loss": 0.622, + "step": 4425 + }, + { + "epoch": 2.62126147468167, + "grad_norm": 3.5621389375037813, + "learning_rate": 8.210318988957166e-07, + "loss": 0.584, + "step": 4426 + }, + { + "epoch": 2.621853716316257, + "grad_norm": 1.8691129401457045, + "learning_rate": 8.184958200763826e-07, + "loss": 0.5916, + "step": 4427 + }, + { + "epoch": 2.622445957950844, + "grad_norm": 4.897419852333168, + "learning_rate": 8.159634970085595e-07, + "loss": 0.5899, + "step": 4428 + }, + { + "epoch": 2.623038199585431, + "grad_norm": 1.4035158403121284, + "learning_rate": 8.134349307281109e-07, + "loss": 0.5928, + "step": 4429 + }, + { + "epoch": 2.6236304412200178, + "grad_norm": 2.473039251325127, + "learning_rate": 8.109101222693616e-07, + "loss": 0.5929, + "step": 4430 + }, + { + "epoch": 2.6242226828546045, + "grad_norm": 2.0204745202248353, + "learning_rate": 8.083890726650978e-07, + "loss": 0.6055, + "step": 4431 + }, + { + "epoch": 2.6248149244891916, + "grad_norm": 1.4459314914665022, + "learning_rate": 8.058717829465723e-07, + "loss": 0.6025, + "step": 4432 + }, + { + "epoch": 2.6254071661237783, + "grad_norm": 2.1952381195541553, + "learning_rate": 8.033582541435003e-07, + "loss": 0.6302, + "step": 4433 + }, + { + "epoch": 2.6259994077583655, + "grad_norm": 1.7238053969572114, + "learning_rate": 8.008484872840538e-07, + "loss": 0.6178, + "step": 4434 + }, + { + "epoch": 2.626591649392952, + "grad_norm": 1.4421860814594145, + "learning_rate": 7.983424833948694e-07, + "loss": 0.6132, + "step": 4435 + }, + { + "epoch": 2.6271838910275394, + "grad_norm": 33.61594311760086, + "learning_rate": 7.958402435010415e-07, + "loss": 0.6302, + "step": 4436 + }, + { + "epoch": 2.627776132662126, + "grad_norm": 1.3890112121581686, + "learning_rate": 7.933417686261325e-07, + "loss": 0.6571, + "step": 4437 + }, + { + "epoch": 2.628368374296713, + "grad_norm": 1.5916247723609014, + "learning_rate": 7.908470597921547e-07, + "loss": 0.6012, + "step": 4438 + }, + { + "epoch": 2.6289606159313, + "grad_norm": 1.1892053559966738, + "learning_rate": 7.883561180195831e-07, + "loss": 0.6098, + "step": 4439 + }, + { + "epoch": 2.629552857565887, + "grad_norm": 3.0816929839131504, + "learning_rate": 7.858689443273548e-07, + "loss": 0.6013, + "step": 4440 + }, + { + "epoch": 2.630145099200474, + "grad_norm": 3.903155006740842, + "learning_rate": 7.833855397328682e-07, + "loss": 0.5941, + "step": 4441 + }, + { + "epoch": 2.6307373408350605, + "grad_norm": 2.618889450319004, + "learning_rate": 7.809059052519674e-07, + "loss": 0.644, + "step": 4442 + }, + { + "epoch": 2.6313295824696477, + "grad_norm": 1.993232060605033, + "learning_rate": 7.784300418989665e-07, + "loss": 0.591, + "step": 4443 + }, + { + "epoch": 2.6319218241042344, + "grad_norm": 1.2377163825617075, + "learning_rate": 7.759579506866311e-07, + "loss": 0.5866, + "step": 4444 + }, + { + "epoch": 2.6325140657388215, + "grad_norm": 1.843881731614733, + "learning_rate": 7.73489632626192e-07, + "loss": 0.642, + "step": 4445 + }, + { + "epoch": 2.6331063073734082, + "grad_norm": 2.4131522683099518, + "learning_rate": 7.710250887273196e-07, + "loss": 0.6004, + "step": 4446 + }, + { + "epoch": 2.6336985490079954, + "grad_norm": 1.1424501229679849, + "learning_rate": 7.685643199981574e-07, + "loss": 0.6154, + "step": 4447 + }, + { + "epoch": 2.634290790642582, + "grad_norm": 1.353394407111203, + "learning_rate": 7.66107327445299e-07, + "loss": 0.6123, + "step": 4448 + }, + { + "epoch": 2.6348830322771692, + "grad_norm": 1.4623696640133033, + "learning_rate": 7.636541120737906e-07, + "loss": 0.5979, + "step": 4449 + }, + { + "epoch": 2.635475273911756, + "grad_norm": 1.6100152183594005, + "learning_rate": 7.612046748871327e-07, + "loss": 0.6087, + "step": 4450 + }, + { + "epoch": 2.636067515546343, + "grad_norm": 1.5347034575841871, + "learning_rate": 7.58759016887286e-07, + "loss": 0.6232, + "step": 4451 + }, + { + "epoch": 2.63665975718093, + "grad_norm": 1.487392673062803, + "learning_rate": 7.563171390746627e-07, + "loss": 0.5846, + "step": 4452 + }, + { + "epoch": 2.6372519988155165, + "grad_norm": 1.9959820986132857, + "learning_rate": 7.53879042448128e-07, + "loss": 0.6068, + "step": 4453 + }, + { + "epoch": 2.6378442404501037, + "grad_norm": 1.3610216722187833, + "learning_rate": 7.514447280049964e-07, + "loss": 0.5942, + "step": 4454 + }, + { + "epoch": 2.6384364820846904, + "grad_norm": 2.2754665990675136, + "learning_rate": 7.490141967410436e-07, + "loss": 0.6712, + "step": 4455 + }, + { + "epoch": 2.6390287237192775, + "grad_norm": 1.3895620105466362, + "learning_rate": 7.465874496504944e-07, + "loss": 0.6119, + "step": 4456 + }, + { + "epoch": 2.6396209653538643, + "grad_norm": 1.1470130133503953, + "learning_rate": 7.441644877260212e-07, + "loss": 0.5964, + "step": 4457 + }, + { + "epoch": 2.6402132069884514, + "grad_norm": 2.521466762407184, + "learning_rate": 7.417453119587525e-07, + "loss": 0.5729, + "step": 4458 + }, + { + "epoch": 2.640805448623038, + "grad_norm": 2.4309719657662376, + "learning_rate": 7.393299233382678e-07, + "loss": 0.6652, + "step": 4459 + }, + { + "epoch": 2.6413976902576253, + "grad_norm": 3.395055166928025, + "learning_rate": 7.369183228526e-07, + "loss": 0.6361, + "step": 4460 + }, + { + "epoch": 2.641989931892212, + "grad_norm": 2.006109738193744, + "learning_rate": 7.345105114882245e-07, + "loss": 0.5852, + "step": 4461 + }, + { + "epoch": 2.642582173526799, + "grad_norm": 1.4964269607335803, + "learning_rate": 7.321064902300723e-07, + "loss": 0.6232, + "step": 4462 + }, + { + "epoch": 2.643174415161386, + "grad_norm": 3.2714212450603393, + "learning_rate": 7.297062600615268e-07, + "loss": 0.6469, + "step": 4463 + }, + { + "epoch": 2.6437666567959726, + "grad_norm": 1.8897563765286818, + "learning_rate": 7.273098219644137e-07, + "loss": 0.6354, + "step": 4464 + }, + { + "epoch": 2.6443588984305597, + "grad_norm": 1.528350989418592, + "learning_rate": 7.249171769190111e-07, + "loss": 0.6233, + "step": 4465 + }, + { + "epoch": 2.6449511400651464, + "grad_norm": 3.275762358290711, + "learning_rate": 7.225283259040472e-07, + "loss": 0.578, + "step": 4466 + }, + { + "epoch": 2.6455433816997336, + "grad_norm": 1.5673100456277456, + "learning_rate": 7.201432698966959e-07, + "loss": 0.6478, + "step": 4467 + }, + { + "epoch": 2.6461356233343203, + "grad_norm": 1.447196429106465, + "learning_rate": 7.17762009872579e-07, + "loss": 0.5887, + "step": 4468 + }, + { + "epoch": 2.6467278649689074, + "grad_norm": 1.4964135311485778, + "learning_rate": 7.15384546805764e-07, + "loss": 0.6, + "step": 4469 + }, + { + "epoch": 2.647320106603494, + "grad_norm": 1.9472910578091667, + "learning_rate": 7.130108816687687e-07, + "loss": 0.5853, + "step": 4470 + }, + { + "epoch": 2.647912348238081, + "grad_norm": 1.2610856530380568, + "learning_rate": 7.106410154325571e-07, + "loss": 0.6081, + "step": 4471 + }, + { + "epoch": 2.648504589872668, + "grad_norm": 1.3825561685529, + "learning_rate": 7.082749490665353e-07, + "loss": 0.6435, + "step": 4472 + }, + { + "epoch": 2.649096831507255, + "grad_norm": 1.5107349206863576, + "learning_rate": 7.059126835385577e-07, + "loss": 0.6626, + "step": 4473 + }, + { + "epoch": 2.649689073141842, + "grad_norm": 1.9273475424947197, + "learning_rate": 7.035542198149237e-07, + "loss": 0.5902, + "step": 4474 + }, + { + "epoch": 2.6502813147764286, + "grad_norm": 2.5130850136023075, + "learning_rate": 7.011995588603804e-07, + "loss": 0.6471, + "step": 4475 + }, + { + "epoch": 2.6508735564110157, + "grad_norm": 1.5585005044488192, + "learning_rate": 6.98848701638114e-07, + "loss": 0.6371, + "step": 4476 + }, + { + "epoch": 2.6514657980456025, + "grad_norm": 1.3238990292587902, + "learning_rate": 6.965016491097553e-07, + "loss": 0.6118, + "step": 4477 + }, + { + "epoch": 2.6520580396801896, + "grad_norm": 1.3046884838922062, + "learning_rate": 6.941584022353865e-07, + "loss": 0.6345, + "step": 4478 + }, + { + "epoch": 2.6526502813147763, + "grad_norm": 5.787734168290906, + "learning_rate": 6.918189619735205e-07, + "loss": 0.6323, + "step": 4479 + }, + { + "epoch": 2.6532425229493635, + "grad_norm": 1.9229458354471165, + "learning_rate": 6.894833292811265e-07, + "loss": 0.6234, + "step": 4480 + }, + { + "epoch": 2.65383476458395, + "grad_norm": 1.4760976120663631, + "learning_rate": 6.871515051136046e-07, + "loss": 0.6093, + "step": 4481 + }, + { + "epoch": 2.654427006218537, + "grad_norm": 1.6700900299285095, + "learning_rate": 6.848234904248041e-07, + "loss": 0.6118, + "step": 4482 + }, + { + "epoch": 2.655019247853124, + "grad_norm": 2.3039568980390683, + "learning_rate": 6.824992861670132e-07, + "loss": 0.654, + "step": 4483 + }, + { + "epoch": 2.655611489487711, + "grad_norm": 1.779803366967843, + "learning_rate": 6.801788932909648e-07, + "loss": 0.6177, + "step": 4484 + }, + { + "epoch": 2.656203731122298, + "grad_norm": 1.4560279559121052, + "learning_rate": 6.778623127458261e-07, + "loss": 0.6, + "step": 4485 + }, + { + "epoch": 2.6567959727568846, + "grad_norm": 1.6569231878529236, + "learning_rate": 6.755495454792116e-07, + "loss": 0.6316, + "step": 4486 + }, + { + "epoch": 2.6573882143914718, + "grad_norm": 1.4672246843225127, + "learning_rate": 6.73240592437171e-07, + "loss": 0.5814, + "step": 4487 + }, + { + "epoch": 2.6579804560260585, + "grad_norm": 1.929651108949212, + "learning_rate": 6.709354545641989e-07, + "loss": 0.6056, + "step": 4488 + }, + { + "epoch": 2.6585726976606456, + "grad_norm": 1.1762025778237375, + "learning_rate": 6.68634132803222e-07, + "loss": 0.5838, + "step": 4489 + }, + { + "epoch": 2.6591649392952323, + "grad_norm": 1.2260691704307056, + "learning_rate": 6.663366280956152e-07, + "loss": 0.5867, + "step": 4490 + }, + { + "epoch": 2.6597571809298195, + "grad_norm": 1.5821020306273104, + "learning_rate": 6.640429413811833e-07, + "loss": 0.6146, + "step": 4491 + }, + { + "epoch": 2.660349422564406, + "grad_norm": 3.7438552072655042, + "learning_rate": 6.617530735981758e-07, + "loss": 0.6057, + "step": 4492 + }, + { + "epoch": 2.660941664198993, + "grad_norm": 1.4861734342086816, + "learning_rate": 6.594670256832769e-07, + "loss": 0.5755, + "step": 4493 + }, + { + "epoch": 2.66153390583358, + "grad_norm": 1.3176811467599943, + "learning_rate": 6.571847985716063e-07, + "loss": 0.6114, + "step": 4494 + }, + { + "epoch": 2.6621261474681672, + "grad_norm": 2.5936137237565986, + "learning_rate": 6.549063931967247e-07, + "loss": 0.6174, + "step": 4495 + }, + { + "epoch": 2.662718389102754, + "grad_norm": 1.5641301693338778, + "learning_rate": 6.526318104906293e-07, + "loss": 0.6262, + "step": 4496 + }, + { + "epoch": 2.6633106307373406, + "grad_norm": 1.7900396459995547, + "learning_rate": 6.503610513837522e-07, + "loss": 0.557, + "step": 4497 + }, + { + "epoch": 2.663902872371928, + "grad_norm": 1.3448660046809024, + "learning_rate": 6.480941168049593e-07, + "loss": 0.5565, + "step": 4498 + }, + { + "epoch": 2.6644951140065145, + "grad_norm": 1.5352709594541674, + "learning_rate": 6.458310076815544e-07, + "loss": 0.6334, + "step": 4499 + }, + { + "epoch": 2.6650873556411017, + "grad_norm": 1.5829649420114509, + "learning_rate": 6.435717249392803e-07, + "loss": 0.6498, + "step": 4500 + }, + { + "epoch": 2.6656795972756884, + "grad_norm": 1.8146453565537977, + "learning_rate": 6.413162695023078e-07, + "loss": 0.6241, + "step": 4501 + }, + { + "epoch": 2.6662718389102755, + "grad_norm": 1.6561074771734265, + "learning_rate": 6.390646422932445e-07, + "loss": 0.6356, + "step": 4502 + }, + { + "epoch": 2.6668640805448622, + "grad_norm": 1.4096379062668936, + "learning_rate": 6.368168442331324e-07, + "loss": 0.6571, + "step": 4503 + }, + { + "epoch": 2.667456322179449, + "grad_norm": 1.568445220559726, + "learning_rate": 6.345728762414504e-07, + "loss": 0.5933, + "step": 4504 + }, + { + "epoch": 2.668048563814036, + "grad_norm": 1.2456099463597277, + "learning_rate": 6.323327392361056e-07, + "loss": 0.604, + "step": 4505 + }, + { + "epoch": 2.6686408054486233, + "grad_norm": 2.1193064430307804, + "learning_rate": 6.300964341334382e-07, + "loss": 0.6107, + "step": 4506 + }, + { + "epoch": 2.66923304708321, + "grad_norm": 1.981391634375128, + "learning_rate": 6.278639618482241e-07, + "loss": 0.6398, + "step": 4507 + }, + { + "epoch": 2.6698252887177967, + "grad_norm": 2.04267323424306, + "learning_rate": 6.256353232936718e-07, + "loss": 0.5972, + "step": 4508 + }, + { + "epoch": 2.670417530352384, + "grad_norm": 1.8887831971144367, + "learning_rate": 6.234105193814177e-07, + "loss": 0.6266, + "step": 4509 + }, + { + "epoch": 2.6710097719869705, + "grad_norm": 1.7381054901434658, + "learning_rate": 6.211895510215316e-07, + "loss": 0.6574, + "step": 4510 + }, + { + "epoch": 2.6716020136215577, + "grad_norm": 1.443595604409538, + "learning_rate": 6.189724191225155e-07, + "loss": 0.6292, + "step": 4511 + }, + { + "epoch": 2.6721942552561444, + "grad_norm": 1.6341383418243982, + "learning_rate": 6.167591245913029e-07, + "loss": 0.6265, + "step": 4512 + }, + { + "epoch": 2.6727864968907316, + "grad_norm": 1.468790273283455, + "learning_rate": 6.145496683332508e-07, + "loss": 0.6301, + "step": 4513 + }, + { + "epoch": 2.6733787385253183, + "grad_norm": 1.3435540901590721, + "learning_rate": 6.123440512521539e-07, + "loss": 0.6249, + "step": 4514 + }, + { + "epoch": 2.673970980159905, + "grad_norm": 2.159903078044722, + "learning_rate": 6.101422742502349e-07, + "loss": 0.6205, + "step": 4515 + }, + { + "epoch": 2.674563221794492, + "grad_norm": 1.5612885890524655, + "learning_rate": 6.079443382281424e-07, + "loss": 0.6187, + "step": 4516 + }, + { + "epoch": 2.6751554634290793, + "grad_norm": 1.9002762382063694, + "learning_rate": 6.05750244084956e-07, + "loss": 0.6108, + "step": 4517 + }, + { + "epoch": 2.675747705063666, + "grad_norm": 3.9695947033078127, + "learning_rate": 6.035599927181834e-07, + "loss": 0.6114, + "step": 4518 + }, + { + "epoch": 2.6763399466982527, + "grad_norm": 1.3099993291001881, + "learning_rate": 6.013735850237623e-07, + "loss": 0.603, + "step": 4519 + }, + { + "epoch": 2.67693218833284, + "grad_norm": 1.2559274936986125, + "learning_rate": 5.99191021896055e-07, + "loss": 0.591, + "step": 4520 + }, + { + "epoch": 2.6775244299674266, + "grad_norm": 1.494733461660258, + "learning_rate": 5.97012304227852e-07, + "loss": 0.5815, + "step": 4521 + }, + { + "epoch": 2.6781166716020137, + "grad_norm": 3.1926894336773475, + "learning_rate": 5.948374329103723e-07, + "loss": 0.5866, + "step": 4522 + }, + { + "epoch": 2.6787089132366004, + "grad_norm": 1.4852694724957944, + "learning_rate": 5.926664088332612e-07, + "loss": 0.5798, + "step": 4523 + }, + { + "epoch": 2.6793011548711876, + "grad_norm": 2.351294923764927, + "learning_rate": 5.904992328845893e-07, + "loss": 0.6286, + "step": 4524 + }, + { + "epoch": 2.6798933965057743, + "grad_norm": 1.243806142521426, + "learning_rate": 5.88335905950852e-07, + "loss": 0.6393, + "step": 4525 + }, + { + "epoch": 2.680485638140361, + "grad_norm": 1.4595958476913502, + "learning_rate": 5.861764289169713e-07, + "loss": 0.6362, + "step": 4526 + }, + { + "epoch": 2.681077879774948, + "grad_norm": 2.5372193151492066, + "learning_rate": 5.840208026662986e-07, + "loss": 0.6335, + "step": 4527 + }, + { + "epoch": 2.6816701214095353, + "grad_norm": 1.9022729218122443, + "learning_rate": 5.818690280806038e-07, + "loss": 0.6046, + "step": 4528 + }, + { + "epoch": 2.682262363044122, + "grad_norm": 1.300761914523257, + "learning_rate": 5.797211060400809e-07, + "loss": 0.6347, + "step": 4529 + }, + { + "epoch": 2.6828546046787087, + "grad_norm": 1.545954882819192, + "learning_rate": 5.775770374233558e-07, + "loss": 0.6014, + "step": 4530 + }, + { + "epoch": 2.683446846313296, + "grad_norm": 1.3134870233911644, + "learning_rate": 5.754368231074703e-07, + "loss": 0.624, + "step": 4531 + }, + { + "epoch": 2.6840390879478826, + "grad_norm": 1.257399217998619, + "learning_rate": 5.73300463967893e-07, + "loss": 0.6017, + "step": 4532 + }, + { + "epoch": 2.6846313295824697, + "grad_norm": 2.0429701939538267, + "learning_rate": 5.711679608785136e-07, + "loss": 0.6038, + "step": 4533 + }, + { + "epoch": 2.6852235712170565, + "grad_norm": 2.112515949275844, + "learning_rate": 5.690393147116491e-07, + "loss": 0.595, + "step": 4534 + }, + { + "epoch": 2.6858158128516436, + "grad_norm": 1.4181848302497644, + "learning_rate": 5.669145263380316e-07, + "loss": 0.618, + "step": 4535 + }, + { + "epoch": 2.6864080544862303, + "grad_norm": 1.817228663528393, + "learning_rate": 5.647935966268225e-07, + "loss": 0.6222, + "step": 4536 + }, + { + "epoch": 2.687000296120817, + "grad_norm": 2.6043085112867095, + "learning_rate": 5.626765264455992e-07, + "loss": 0.6255, + "step": 4537 + }, + { + "epoch": 2.687592537755404, + "grad_norm": 1.3403299480462842, + "learning_rate": 5.60563316660363e-07, + "loss": 0.6334, + "step": 4538 + }, + { + "epoch": 2.6881847793899913, + "grad_norm": 1.5039519498314093, + "learning_rate": 5.58453968135535e-07, + "loss": 0.5825, + "step": 4539 + }, + { + "epoch": 2.688777021024578, + "grad_norm": 1.2364951870402103, + "learning_rate": 5.563484817339581e-07, + "loss": 0.6074, + "step": 4540 + }, + { + "epoch": 2.6893692626591648, + "grad_norm": 2.509772691752077, + "learning_rate": 5.542468583168936e-07, + "loss": 0.6249, + "step": 4541 + }, + { + "epoch": 2.689961504293752, + "grad_norm": 1.1947349763864274, + "learning_rate": 5.521490987440259e-07, + "loss": 0.6369, + "step": 4542 + }, + { + "epoch": 2.6905537459283386, + "grad_norm": 1.541811330073855, + "learning_rate": 5.500552038734541e-07, + "loss": 0.5862, + "step": 4543 + }, + { + "epoch": 2.691145987562926, + "grad_norm": 1.287641189747786, + "learning_rate": 5.47965174561701e-07, + "loss": 0.5996, + "step": 4544 + }, + { + "epoch": 2.6917382291975125, + "grad_norm": 1.7408712569531013, + "learning_rate": 5.458790116637036e-07, + "loss": 0.639, + "step": 4545 + }, + { + "epoch": 2.6923304708320996, + "grad_norm": 1.2121113431420716, + "learning_rate": 5.437967160328228e-07, + "loss": 0.5543, + "step": 4546 + }, + { + "epoch": 2.6929227124666864, + "grad_norm": 3.4295681334615797, + "learning_rate": 5.417182885208317e-07, + "loss": 0.6128, + "step": 4547 + }, + { + "epoch": 2.693514954101273, + "grad_norm": 2.73332565881467, + "learning_rate": 5.396437299779278e-07, + "loss": 0.6197, + "step": 4548 + }, + { + "epoch": 2.69410719573586, + "grad_norm": 1.7659745103441442, + "learning_rate": 5.375730412527191e-07, + "loss": 0.6616, + "step": 4549 + }, + { + "epoch": 2.6946994373704474, + "grad_norm": 6.0658236906308955, + "learning_rate": 5.355062231922326e-07, + "loss": 0.6502, + "step": 4550 + }, + { + "epoch": 2.695291679005034, + "grad_norm": 1.6358319710857092, + "learning_rate": 5.334432766419162e-07, + "loss": 0.6126, + "step": 4551 + }, + { + "epoch": 2.695883920639621, + "grad_norm": 1.8693759376554604, + "learning_rate": 5.313842024456306e-07, + "loss": 0.639, + "step": 4552 + }, + { + "epoch": 2.696476162274208, + "grad_norm": 1.2740182256023065, + "learning_rate": 5.29329001445652e-07, + "loss": 0.648, + "step": 4553 + }, + { + "epoch": 2.6970684039087947, + "grad_norm": 2.009984038445754, + "learning_rate": 5.272776744826724e-07, + "loss": 0.6539, + "step": 4554 + }, + { + "epoch": 2.697660645543382, + "grad_norm": 1.7618233562419785, + "learning_rate": 5.252302223958006e-07, + "loss": 0.5866, + "step": 4555 + }, + { + "epoch": 2.6982528871779685, + "grad_norm": 2.567467853853205, + "learning_rate": 5.231866460225621e-07, + "loss": 0.603, + "step": 4556 + }, + { + "epoch": 2.6988451288125557, + "grad_norm": 1.9843404125639688, + "learning_rate": 5.211469461988916e-07, + "loss": 0.6404, + "step": 4557 + }, + { + "epoch": 2.6994373704471424, + "grad_norm": 1.87203615558653, + "learning_rate": 5.191111237591406e-07, + "loss": 0.6232, + "step": 4558 + }, + { + "epoch": 2.700029612081729, + "grad_norm": 1.8160459925973307, + "learning_rate": 5.170791795360752e-07, + "loss": 0.6328, + "step": 4559 + }, + { + "epoch": 2.7006218537163162, + "grad_norm": 1.776112360381722, + "learning_rate": 5.150511143608782e-07, + "loss": 0.5592, + "step": 4560 + }, + { + "epoch": 2.7012140953509034, + "grad_norm": 1.7788576070929014, + "learning_rate": 5.130269290631407e-07, + "loss": 0.6206, + "step": 4561 + }, + { + "epoch": 2.70180633698549, + "grad_norm": 1.3844584078213082, + "learning_rate": 5.110066244708645e-07, + "loss": 0.5974, + "step": 4562 + }, + { + "epoch": 2.702398578620077, + "grad_norm": 2.280195440016197, + "learning_rate": 5.08990201410472e-07, + "loss": 0.6314, + "step": 4563 + }, + { + "epoch": 2.702990820254664, + "grad_norm": 1.5039208783289522, + "learning_rate": 5.069776607067944e-07, + "loss": 0.5886, + "step": 4564 + }, + { + "epoch": 2.7035830618892507, + "grad_norm": 2.515470221955119, + "learning_rate": 5.04969003183069e-07, + "loss": 0.5987, + "step": 4565 + }, + { + "epoch": 2.704175303523838, + "grad_norm": 1.4253523242169355, + "learning_rate": 5.029642296609538e-07, + "loss": 0.63, + "step": 4566 + }, + { + "epoch": 2.7047675451584245, + "grad_norm": 1.0838622160262505, + "learning_rate": 5.009633409605131e-07, + "loss": 0.5878, + "step": 4567 + }, + { + "epoch": 2.7053597867930117, + "grad_norm": 2.5944428791188345, + "learning_rate": 4.98966337900224e-07, + "loss": 0.577, + "step": 4568 + }, + { + "epoch": 2.7059520284275984, + "grad_norm": 1.6365174307980475, + "learning_rate": 4.969732212969691e-07, + "loss": 0.6307, + "step": 4569 + }, + { + "epoch": 2.706544270062185, + "grad_norm": 1.2985396532324271, + "learning_rate": 4.949839919660481e-07, + "loss": 0.5999, + "step": 4570 + }, + { + "epoch": 2.7071365116967723, + "grad_norm": 1.6192926604384, + "learning_rate": 4.929986507211681e-07, + "loss": 0.6267, + "step": 4571 + }, + { + "epoch": 2.7077287533313594, + "grad_norm": 1.550515843078195, + "learning_rate": 4.91017198374445e-07, + "loss": 0.6368, + "step": 4572 + }, + { + "epoch": 2.708320994965946, + "grad_norm": 3.445551249960511, + "learning_rate": 4.890396357364025e-07, + "loss": 0.6394, + "step": 4573 + }, + { + "epoch": 2.708913236600533, + "grad_norm": 2.0889306837221677, + "learning_rate": 4.870659636159758e-07, + "loss": 0.6194, + "step": 4574 + }, + { + "epoch": 2.70950547823512, + "grad_norm": 1.512212913808209, + "learning_rate": 4.850961828205103e-07, + "loss": 0.64, + "step": 4575 + }, + { + "epoch": 2.7100977198697067, + "grad_norm": 1.4199048585388745, + "learning_rate": 4.831302941557537e-07, + "loss": 0.6473, + "step": 4576 + }, + { + "epoch": 2.710689961504294, + "grad_norm": 1.785317344909293, + "learning_rate": 4.811682984258659e-07, + "loss": 0.6042, + "step": 4577 + }, + { + "epoch": 2.7112822031388806, + "grad_norm": 1.3142205419516673, + "learning_rate": 4.79210196433414e-07, + "loss": 0.6127, + "step": 4578 + }, + { + "epoch": 2.7118744447734677, + "grad_norm": 2.677760144755771, + "learning_rate": 4.772559889793716e-07, + "loss": 0.571, + "step": 4579 + }, + { + "epoch": 2.7124666864080544, + "grad_norm": 1.6507834481358612, + "learning_rate": 4.7530567686312035e-07, + "loss": 0.65, + "step": 4580 + }, + { + "epoch": 2.713058928042641, + "grad_norm": 1.8447154534722792, + "learning_rate": 4.7335926088244556e-07, + "loss": 0.5999, + "step": 4581 + }, + { + "epoch": 2.7136511696772283, + "grad_norm": 1.488823816754773, + "learning_rate": 4.7141674183354247e-07, + "loss": 0.6284, + "step": 4582 + }, + { + "epoch": 2.7142434113118155, + "grad_norm": 1.7787433279339757, + "learning_rate": 4.6947812051100995e-07, + "loss": 0.5897, + "step": 4583 + }, + { + "epoch": 2.714835652946402, + "grad_norm": 1.2062389769136208, + "learning_rate": 4.6754339770785474e-07, + "loss": 0.5984, + "step": 4584 + }, + { + "epoch": 2.715427894580989, + "grad_norm": 1.48700055953898, + "learning_rate": 4.6561257421548377e-07, + "loss": 0.5989, + "step": 4585 + }, + { + "epoch": 2.716020136215576, + "grad_norm": 1.7381187253725767, + "learning_rate": 4.636856508237164e-07, + "loss": 0.5941, + "step": 4586 + }, + { + "epoch": 2.7166123778501627, + "grad_norm": 8.014229956967728, + "learning_rate": 4.617626283207688e-07, + "loss": 0.6453, + "step": 4587 + }, + { + "epoch": 2.71720461948475, + "grad_norm": 1.655651410898122, + "learning_rate": 4.5984350749326835e-07, + "loss": 0.6157, + "step": 4588 + }, + { + "epoch": 2.7177968611193366, + "grad_norm": 1.4064515132654336, + "learning_rate": 4.5792828912624154e-07, + "loss": 0.5953, + "step": 4589 + }, + { + "epoch": 2.7183891027539238, + "grad_norm": 2.5112416819185404, + "learning_rate": 4.5601697400312175e-07, + "loss": 0.609, + "step": 4590 + }, + { + "epoch": 2.7189813443885105, + "grad_norm": 1.484009351861582, + "learning_rate": 4.541095629057435e-07, + "loss": 0.6504, + "step": 4591 + }, + { + "epoch": 2.719573586023097, + "grad_norm": 1.4442625515083019, + "learning_rate": 4.5220605661434605e-07, + "loss": 0.5921, + "step": 4592 + }, + { + "epoch": 2.7201658276576843, + "grad_norm": 1.2531427351279538, + "learning_rate": 4.503064559075687e-07, + "loss": 0.622, + "step": 4593 + }, + { + "epoch": 2.7207580692922715, + "grad_norm": 1.772483151004884, + "learning_rate": 4.4841076156245665e-07, + "loss": 0.5807, + "step": 4594 + }, + { + "epoch": 2.721350310926858, + "grad_norm": 1.01520707328506, + "learning_rate": 4.465189743544551e-07, + "loss": 0.6132, + "step": 4595 + }, + { + "epoch": 2.721942552561445, + "grad_norm": 1.8850344959256942, + "learning_rate": 4.4463109505741177e-07, + "loss": 0.62, + "step": 4596 + }, + { + "epoch": 2.722534794196032, + "grad_norm": 1.742313192790708, + "learning_rate": 4.427471244435733e-07, + "loss": 0.6011, + "step": 4597 + }, + { + "epoch": 2.7231270358306188, + "grad_norm": 1.4656681253439428, + "learning_rate": 4.408670632835932e-07, + "loss": 0.5727, + "step": 4598 + }, + { + "epoch": 2.723719277465206, + "grad_norm": 1.7292315439029244, + "learning_rate": 4.389909123465186e-07, + "loss": 0.5966, + "step": 4599 + }, + { + "epoch": 2.7243115190997926, + "grad_norm": 1.6450950995513567, + "learning_rate": 4.3711867239980335e-07, + "loss": 0.6282, + "step": 4600 + }, + { + "epoch": 2.72490376073438, + "grad_norm": 1.5986011485085827, + "learning_rate": 4.3525034420929815e-07, + "loss": 0.5897, + "step": 4601 + }, + { + "epoch": 2.7254960023689665, + "grad_norm": 1.668306756549204, + "learning_rate": 4.3338592853925277e-07, + "loss": 0.6291, + "step": 4602 + }, + { + "epoch": 2.726088244003553, + "grad_norm": 1.3048391452095505, + "learning_rate": 4.315254261523194e-07, + "loss": 0.6368, + "step": 4603 + }, + { + "epoch": 2.7266804856381404, + "grad_norm": 1.4097561245163255, + "learning_rate": 4.296688378095493e-07, + "loss": 0.659, + "step": 4604 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 4.424354272711558, + "learning_rate": 4.278161642703904e-07, + "loss": 0.6217, + "step": 4605 + }, + { + "epoch": 2.7278649689073142, + "grad_norm": 1.5665435392205018, + "learning_rate": 4.2596740629268997e-07, + "loss": 0.6582, + "step": 4606 + }, + { + "epoch": 2.728457210541901, + "grad_norm": 1.2828718477664653, + "learning_rate": 4.2412256463269295e-07, + "loss": 0.6032, + "step": 4607 + }, + { + "epoch": 2.729049452176488, + "grad_norm": 1.6646289965329508, + "learning_rate": 4.222816400450458e-07, + "loss": 0.6442, + "step": 4608 + }, + { + "epoch": 2.729641693811075, + "grad_norm": 1.2239860350713117, + "learning_rate": 4.204446332827894e-07, + "loss": 0.5899, + "step": 4609 + }, + { + "epoch": 2.730233935445662, + "grad_norm": 1.1809420431718864, + "learning_rate": 4.186115450973616e-07, + "loss": 0.5778, + "step": 4610 + }, + { + "epoch": 2.7308261770802487, + "grad_norm": 3.218531272958351, + "learning_rate": 4.1678237623859917e-07, + "loss": 0.604, + "step": 4611 + }, + { + "epoch": 2.731418418714836, + "grad_norm": 4.9639832700100195, + "learning_rate": 4.1495712745473595e-07, + "loss": 0.5754, + "step": 4612 + }, + { + "epoch": 2.7320106603494225, + "grad_norm": 1.6291943122524715, + "learning_rate": 4.1313579949240123e-07, + "loss": 0.6341, + "step": 4613 + }, + { + "epoch": 2.7326029019840092, + "grad_norm": 1.2955725976386623, + "learning_rate": 4.1131839309661803e-07, + "loss": 0.6163, + "step": 4614 + }, + { + "epoch": 2.7331951436185964, + "grad_norm": 1.6474895593726941, + "learning_rate": 4.0950490901081055e-07, + "loss": 0.5947, + "step": 4615 + }, + { + "epoch": 2.7337873852531835, + "grad_norm": 1.1944008670704511, + "learning_rate": 4.076953479767964e-07, + "loss": 0.6119, + "step": 4616 + }, + { + "epoch": 2.7343796268877703, + "grad_norm": 1.8527741398776432, + "learning_rate": 4.0588971073478477e-07, + "loss": 0.597, + "step": 4617 + }, + { + "epoch": 2.734971868522357, + "grad_norm": 1.6717926231193392, + "learning_rate": 4.040879980233836e-07, + "loss": 0.5978, + "step": 4618 + }, + { + "epoch": 2.735564110156944, + "grad_norm": 1.351512519792787, + "learning_rate": 4.022902105795956e-07, + "loss": 0.5928, + "step": 4619 + }, + { + "epoch": 2.736156351791531, + "grad_norm": 2.021922048948864, + "learning_rate": 4.004963491388203e-07, + "loss": 0.5927, + "step": 4620 + }, + { + "epoch": 2.736748593426118, + "grad_norm": 2.259608972990153, + "learning_rate": 3.987064144348407e-07, + "loss": 0.5992, + "step": 4621 + }, + { + "epoch": 2.7373408350607047, + "grad_norm": 2.5222440416768124, + "learning_rate": 3.969204071998445e-07, + "loss": 0.6311, + "step": 4622 + }, + { + "epoch": 2.737933076695292, + "grad_norm": 2.0706914867727875, + "learning_rate": 3.951383281644106e-07, + "loss": 0.6518, + "step": 4623 + }, + { + "epoch": 2.7385253183298786, + "grad_norm": 1.0448933909161093, + "learning_rate": 3.93360178057508e-07, + "loss": 0.6289, + "step": 4624 + }, + { + "epoch": 2.7391175599644653, + "grad_norm": 1.7416780397549962, + "learning_rate": 3.9158595760649954e-07, + "loss": 0.6319, + "step": 4625 + }, + { + "epoch": 2.7397098015990524, + "grad_norm": 7.362023134660734, + "learning_rate": 3.8981566753714116e-07, + "loss": 0.6252, + "step": 4626 + }, + { + "epoch": 2.7403020432336396, + "grad_norm": 1.4619722411701648, + "learning_rate": 3.8804930857358256e-07, + "loss": 0.5815, + "step": 4627 + }, + { + "epoch": 2.7408942848682263, + "grad_norm": 2.5074140857959697, + "learning_rate": 3.8628688143836244e-07, + "loss": 0.656, + "step": 4628 + }, + { + "epoch": 2.741486526502813, + "grad_norm": 1.7710644494678003, + "learning_rate": 3.845283868524119e-07, + "loss": 0.6125, + "step": 4629 + }, + { + "epoch": 2.7420787681374, + "grad_norm": 1.302185304449621, + "learning_rate": 3.827738255350555e-07, + "loss": 0.6145, + "step": 4630 + }, + { + "epoch": 2.742671009771987, + "grad_norm": 1.488799773221385, + "learning_rate": 3.810231982040091e-07, + "loss": 0.6006, + "step": 4631 + }, + { + "epoch": 2.743263251406574, + "grad_norm": 1.1890826196658626, + "learning_rate": 3.792765055753755e-07, + "loss": 0.5658, + "step": 4632 + }, + { + "epoch": 2.7438554930411607, + "grad_norm": 1.6943845095513352, + "learning_rate": 3.775337483636488e-07, + "loss": 0.6312, + "step": 4633 + }, + { + "epoch": 2.744447734675748, + "grad_norm": 2.162351020574408, + "learning_rate": 3.757949272817174e-07, + "loss": 0.6157, + "step": 4634 + }, + { + "epoch": 2.7450399763103346, + "grad_norm": 1.8172507844780956, + "learning_rate": 3.7406004304085584e-07, + "loss": 0.5986, + "step": 4635 + }, + { + "epoch": 2.7456322179449213, + "grad_norm": 2.857384440493848, + "learning_rate": 3.723290963507309e-07, + "loss": 0.6173, + "step": 4636 + }, + { + "epoch": 2.7462244595795084, + "grad_norm": 2.336254189546982, + "learning_rate": 3.706020879193939e-07, + "loss": 0.6436, + "step": 4637 + }, + { + "epoch": 2.7468167012140956, + "grad_norm": 2.576256451190125, + "learning_rate": 3.688790184532909e-07, + "loss": 0.61, + "step": 4638 + }, + { + "epoch": 2.7474089428486823, + "grad_norm": 2.029285692911393, + "learning_rate": 3.671598886572525e-07, + "loss": 0.6187, + "step": 4639 + }, + { + "epoch": 2.748001184483269, + "grad_norm": 1.3899689164399947, + "learning_rate": 3.654446992345018e-07, + "loss": 0.6091, + "step": 4640 + }, + { + "epoch": 2.748593426117856, + "grad_norm": 1.3664074951874416, + "learning_rate": 3.6373345088664525e-07, + "loss": 0.5813, + "step": 4641 + }, + { + "epoch": 2.749185667752443, + "grad_norm": 2.891864884426199, + "learning_rate": 3.620261443136819e-07, + "loss": 0.5623, + "step": 4642 + }, + { + "epoch": 2.74977790938703, + "grad_norm": 2.0669641721163403, + "learning_rate": 3.6032278021399415e-07, + "loss": 0.5992, + "step": 4643 + }, + { + "epoch": 2.7503701510216167, + "grad_norm": 1.6857686046412492, + "learning_rate": 3.5862335928435465e-07, + "loss": 0.6651, + "step": 4644 + }, + { + "epoch": 2.750962392656204, + "grad_norm": 1.6062047172916332, + "learning_rate": 3.569278822199218e-07, + "loss": 0.5714, + "step": 4645 + }, + { + "epoch": 2.7515546342907906, + "grad_norm": 1.4375077948074273, + "learning_rate": 3.5523634971424194e-07, + "loss": 0.624, + "step": 4646 + }, + { + "epoch": 2.7521468759253773, + "grad_norm": 2.233044908101775, + "learning_rate": 3.5354876245924596e-07, + "loss": 0.5747, + "step": 4647 + }, + { + "epoch": 2.7527391175599645, + "grad_norm": 6.883080250715156, + "learning_rate": 3.5186512114525283e-07, + "loss": 0.5813, + "step": 4648 + }, + { + "epoch": 2.7533313591945516, + "grad_norm": 2.2999746530713394, + "learning_rate": 3.50185426460965e-07, + "loss": 0.6157, + "step": 4649 + }, + { + "epoch": 2.7539236008291383, + "grad_norm": 1.8968933961570913, + "learning_rate": 3.485096790934739e-07, + "loss": 0.6322, + "step": 4650 + }, + { + "epoch": 2.754515842463725, + "grad_norm": 1.4387652445210517, + "learning_rate": 3.4683787972825345e-07, + "loss": 0.6494, + "step": 4651 + }, + { + "epoch": 2.755108084098312, + "grad_norm": 1.7110296382242969, + "learning_rate": 3.451700290491633e-07, + "loss": 0.5789, + "step": 4652 + }, + { + "epoch": 2.755700325732899, + "grad_norm": 1.781421507136215, + "learning_rate": 3.4350612773844996e-07, + "loss": 0.5754, + "step": 4653 + }, + { + "epoch": 2.756292567367486, + "grad_norm": 1.6547622885370215, + "learning_rate": 3.4184617647674e-07, + "loss": 0.5839, + "step": 4654 + }, + { + "epoch": 2.756884809002073, + "grad_norm": 1.5141595029370973, + "learning_rate": 3.40190175943047e-07, + "loss": 0.6275, + "step": 4655 + }, + { + "epoch": 2.75747705063666, + "grad_norm": 1.4347172437913116, + "learning_rate": 3.3853812681477136e-07, + "loss": 0.6384, + "step": 4656 + }, + { + "epoch": 2.7580692922712466, + "grad_norm": 2.134280092349403, + "learning_rate": 3.368900297676925e-07, + "loss": 0.6151, + "step": 4657 + }, + { + "epoch": 2.7586615339058334, + "grad_norm": 1.6195000996671558, + "learning_rate": 3.3524588547597327e-07, + "loss": 0.5915, + "step": 4658 + }, + { + "epoch": 2.7592537755404205, + "grad_norm": 1.4226797326345686, + "learning_rate": 3.336056946121613e-07, + "loss": 0.6438, + "step": 4659 + }, + { + "epoch": 2.7598460171750077, + "grad_norm": 2.076976223041084, + "learning_rate": 3.3196945784718993e-07, + "loss": 0.6008, + "step": 4660 + }, + { + "epoch": 2.7604382588095944, + "grad_norm": 1.4596510371769844, + "learning_rate": 3.303371758503693e-07, + "loss": 0.6462, + "step": 4661 + }, + { + "epoch": 2.761030500444181, + "grad_norm": 1.596455551281381, + "learning_rate": 3.287088492893942e-07, + "loss": 0.5848, + "step": 4662 + }, + { + "epoch": 2.7616227420787682, + "grad_norm": 1.3584900013265102, + "learning_rate": 3.270844788303429e-07, + "loss": 0.6033, + "step": 4663 + }, + { + "epoch": 2.762214983713355, + "grad_norm": 1.4668378316726418, + "learning_rate": 3.25464065137675e-07, + "loss": 0.5542, + "step": 4664 + }, + { + "epoch": 2.762807225347942, + "grad_norm": 2.1567477871091936, + "learning_rate": 3.2384760887423036e-07, + "loss": 0.6195, + "step": 4665 + }, + { + "epoch": 2.763399466982529, + "grad_norm": 1.7423110133051698, + "learning_rate": 3.2223511070122893e-07, + "loss": 0.6229, + "step": 4666 + }, + { + "epoch": 2.763991708617116, + "grad_norm": 5.691959866524479, + "learning_rate": 3.2062657127827413e-07, + "loss": 0.6046, + "step": 4667 + }, + { + "epoch": 2.7645839502517027, + "grad_norm": 2.0605725048101124, + "learning_rate": 3.190219912633519e-07, + "loss": 0.5933, + "step": 4668 + }, + { + "epoch": 2.7651761918862894, + "grad_norm": 1.6855632316948044, + "learning_rate": 3.1742137131281937e-07, + "loss": 0.5716, + "step": 4669 + }, + { + "epoch": 2.7657684335208765, + "grad_norm": 1.4437261940020687, + "learning_rate": 3.158247120814251e-07, + "loss": 0.6427, + "step": 4670 + }, + { + "epoch": 2.7663606751554637, + "grad_norm": 1.9620940489950571, + "learning_rate": 3.142320142222899e-07, + "loss": 0.6286, + "step": 4671 + }, + { + "epoch": 2.7669529167900504, + "grad_norm": 1.5525721497968263, + "learning_rate": 3.1264327838692153e-07, + "loss": 0.6121, + "step": 4672 + }, + { + "epoch": 2.767545158424637, + "grad_norm": 1.864998629614895, + "learning_rate": 3.1105850522519574e-07, + "loss": 0.6336, + "step": 4673 + }, + { + "epoch": 2.7681374000592243, + "grad_norm": 2.834526941324686, + "learning_rate": 3.094776953853762e-07, + "loss": 0.5941, + "step": 4674 + }, + { + "epoch": 2.768729641693811, + "grad_norm": 2.1256789045491966, + "learning_rate": 3.079008495141056e-07, + "loss": 0.6248, + "step": 4675 + }, + { + "epoch": 2.769321883328398, + "grad_norm": 1.8764504253445144, + "learning_rate": 3.063279682564002e-07, + "loss": 0.6022, + "step": 4676 + }, + { + "epoch": 2.769914124962985, + "grad_norm": 2.287234147163577, + "learning_rate": 3.047590522556565e-07, + "loss": 0.6267, + "step": 4677 + }, + { + "epoch": 2.770506366597572, + "grad_norm": 2.975356179185165, + "learning_rate": 3.0319410215365e-07, + "loss": 0.6251, + "step": 4678 + }, + { + "epoch": 2.7710986082321587, + "grad_norm": 2.5144656587013507, + "learning_rate": 3.0163311859053524e-07, + "loss": 0.585, + "step": 4679 + }, + { + "epoch": 2.7716908498667454, + "grad_norm": 1.9765530263280324, + "learning_rate": 3.0007610220483927e-07, + "loss": 0.5958, + "step": 4680 + }, + { + "epoch": 2.7722830915013326, + "grad_norm": 1.8855944082384668, + "learning_rate": 2.9852305363347044e-07, + "loss": 0.6075, + "step": 4681 + }, + { + "epoch": 2.7728753331359197, + "grad_norm": 1.4481580120122306, + "learning_rate": 2.969739735117128e-07, + "loss": 0.631, + "step": 4682 + }, + { + "epoch": 2.7734675747705064, + "grad_norm": 1.4994595976642986, + "learning_rate": 2.954288624732293e-07, + "loss": 0.6311, + "step": 4683 + }, + { + "epoch": 2.774059816405093, + "grad_norm": 2.9375587558348415, + "learning_rate": 2.9388772115005457e-07, + "loss": 0.6206, + "step": 4684 + }, + { + "epoch": 2.7746520580396803, + "grad_norm": 1.3813361341030252, + "learning_rate": 2.9235055017260205e-07, + "loss": 0.6059, + "step": 4685 + }, + { + "epoch": 2.775244299674267, + "grad_norm": 1.2073925168982438, + "learning_rate": 2.9081735016966205e-07, + "loss": 0.6153, + "step": 4686 + }, + { + "epoch": 2.775836541308854, + "grad_norm": 1.6373735275001204, + "learning_rate": 2.892881217684007e-07, + "loss": 0.6376, + "step": 4687 + }, + { + "epoch": 2.776428782943441, + "grad_norm": 1.3737783710680564, + "learning_rate": 2.877628655943576e-07, + "loss": 0.5936, + "step": 4688 + }, + { + "epoch": 2.777021024578028, + "grad_norm": 1.4776319579183617, + "learning_rate": 2.8624158227144703e-07, + "loss": 0.6292, + "step": 4689 + }, + { + "epoch": 2.7776132662126147, + "grad_norm": 1.4187427316372916, + "learning_rate": 2.847242724219612e-07, + "loss": 0.631, + "step": 4690 + }, + { + "epoch": 2.7782055078472014, + "grad_norm": 1.5610769267747608, + "learning_rate": 2.8321093666656253e-07, + "loss": 0.6175, + "step": 4691 + }, + { + "epoch": 2.7787977494817886, + "grad_norm": 1.3150995164435961, + "learning_rate": 2.8170157562429466e-07, + "loss": 0.6882, + "step": 4692 + }, + { + "epoch": 2.7793899911163757, + "grad_norm": 1.3910028681163722, + "learning_rate": 2.801961899125671e-07, + "loss": 0.6123, + "step": 4693 + }, + { + "epoch": 2.7799822327509625, + "grad_norm": 1.8024828509396476, + "learning_rate": 2.7869478014716953e-07, + "loss": 0.6624, + "step": 4694 + }, + { + "epoch": 2.780574474385549, + "grad_norm": 1.2492833932312706, + "learning_rate": 2.7719734694226065e-07, + "loss": 0.6252, + "step": 4695 + }, + { + "epoch": 2.7811667160201363, + "grad_norm": 1.8896802955465037, + "learning_rate": 2.757038909103793e-07, + "loss": 0.5868, + "step": 4696 + }, + { + "epoch": 2.781758957654723, + "grad_norm": 2.317063717995178, + "learning_rate": 2.74214412662428e-07, + "loss": 0.6063, + "step": 4697 + }, + { + "epoch": 2.78235119928931, + "grad_norm": 1.2618887822595275, + "learning_rate": 2.7272891280769044e-07, + "loss": 0.6231, + "step": 4698 + }, + { + "epoch": 2.782943440923897, + "grad_norm": 2.519622445595849, + "learning_rate": 2.7124739195381724e-07, + "loss": 0.6046, + "step": 4699 + }, + { + "epoch": 2.783535682558484, + "grad_norm": 1.3397833913049007, + "learning_rate": 2.697698507068358e-07, + "loss": 0.6201, + "step": 4700 + }, + { + "epoch": 2.7841279241930708, + "grad_norm": 1.6268323399434212, + "learning_rate": 2.682962896711427e-07, + "loss": 0.5909, + "step": 4701 + }, + { + "epoch": 2.7847201658276575, + "grad_norm": 1.6990969743760185, + "learning_rate": 2.6682670944950804e-07, + "loss": 0.6215, + "step": 4702 + }, + { + "epoch": 2.7853124074622446, + "grad_norm": 1.8833596532734562, + "learning_rate": 2.653611106430698e-07, + "loss": 0.6729, + "step": 4703 + }, + { + "epoch": 2.7859046490968318, + "grad_norm": 1.6243542801393849, + "learning_rate": 2.638994938513451e-07, + "loss": 0.5851, + "step": 4704 + }, + { + "epoch": 2.7864968907314185, + "grad_norm": 1.2192739780041595, + "learning_rate": 2.624418596722134e-07, + "loss": 0.6167, + "step": 4705 + }, + { + "epoch": 2.787089132366005, + "grad_norm": 1.8467947724763827, + "learning_rate": 2.609882087019311e-07, + "loss": 0.5764, + "step": 4706 + }, + { + "epoch": 2.7876813740005923, + "grad_norm": 1.4594864341656142, + "learning_rate": 2.595385415351215e-07, + "loss": 0.612, + "step": 4707 + }, + { + "epoch": 2.788273615635179, + "grad_norm": 1.854617047861561, + "learning_rate": 2.580928587647824e-07, + "loss": 0.6462, + "step": 4708 + }, + { + "epoch": 2.788865857269766, + "grad_norm": 1.4487640953791494, + "learning_rate": 2.566511609822775e-07, + "loss": 0.6045, + "step": 4709 + }, + { + "epoch": 2.789458098904353, + "grad_norm": 5.36212138647543, + "learning_rate": 2.5521344877734165e-07, + "loss": 0.6486, + "step": 4710 + }, + { + "epoch": 2.79005034053894, + "grad_norm": 2.5695670114046285, + "learning_rate": 2.5377972273808115e-07, + "loss": 0.6292, + "step": 4711 + }, + { + "epoch": 2.790642582173527, + "grad_norm": 1.674042925609699, + "learning_rate": 2.523499834509724e-07, + "loss": 0.6021, + "step": 4712 + }, + { + "epoch": 2.7912348238081135, + "grad_norm": 1.7422315504699424, + "learning_rate": 2.5092423150085643e-07, + "loss": 0.6147, + "step": 4713 + }, + { + "epoch": 2.7918270654427007, + "grad_norm": 1.518935615818262, + "learning_rate": 2.495024674709468e-07, + "loss": 0.6175, + "step": 4714 + }, + { + "epoch": 2.792419307077288, + "grad_norm": 3.9189187326117443, + "learning_rate": 2.480846919428237e-07, + "loss": 0.6206, + "step": 4715 + }, + { + "epoch": 2.7930115487118745, + "grad_norm": 1.5020841924960988, + "learning_rate": 2.4667090549644e-07, + "loss": 0.6688, + "step": 4716 + }, + { + "epoch": 2.7936037903464612, + "grad_norm": 2.035594629956855, + "learning_rate": 2.45261108710112e-07, + "loss": 0.556, + "step": 4717 + }, + { + "epoch": 2.7941960319810484, + "grad_norm": 1.7405526485195462, + "learning_rate": 2.43855302160525e-07, + "loss": 0.6238, + "step": 4718 + }, + { + "epoch": 2.794788273615635, + "grad_norm": 4.375187374245842, + "learning_rate": 2.424534864227346e-07, + "loss": 0.6078, + "step": 4719 + }, + { + "epoch": 2.7953805152502222, + "grad_norm": 12.710028326671614, + "learning_rate": 2.4105566207016207e-07, + "loss": 0.6527, + "step": 4720 + }, + { + "epoch": 2.795972756884809, + "grad_norm": 1.3120633605538479, + "learning_rate": 2.396618296745956e-07, + "loss": 0.6098, + "step": 4721 + }, + { + "epoch": 2.796564998519396, + "grad_norm": 1.6308815559170025, + "learning_rate": 2.3827198980619025e-07, + "loss": 0.6458, + "step": 4722 + }, + { + "epoch": 2.797157240153983, + "grad_norm": 1.3307922640687166, + "learning_rate": 2.3688614303347012e-07, + "loss": 0.59, + "step": 4723 + }, + { + "epoch": 2.7977494817885695, + "grad_norm": 1.3953095149862986, + "learning_rate": 2.3550428992332508e-07, + "loss": 0.5971, + "step": 4724 + }, + { + "epoch": 2.7983417234231567, + "grad_norm": 1.241408618714826, + "learning_rate": 2.341264310410085e-07, + "loss": 0.6275, + "step": 4725 + }, + { + "epoch": 2.798933965057744, + "grad_norm": 2.7908340168986254, + "learning_rate": 2.327525669501418e-07, + "loss": 0.5977, + "step": 4726 + }, + { + "epoch": 2.7995262066923305, + "grad_norm": 1.46971406756966, + "learning_rate": 2.3138269821271654e-07, + "loss": 0.6049, + "step": 4727 + }, + { + "epoch": 2.8001184483269173, + "grad_norm": 1.23275893833151, + "learning_rate": 2.3001682538908333e-07, + "loss": 0.5917, + "step": 4728 + }, + { + "epoch": 2.8007106899615044, + "grad_norm": 2.1840457881629582, + "learning_rate": 2.286549490379597e-07, + "loss": 0.5872, + "step": 4729 + }, + { + "epoch": 2.801302931596091, + "grad_norm": 1.9787660250351953, + "learning_rate": 2.2729706971643117e-07, + "loss": 0.6328, + "step": 4730 + }, + { + "epoch": 2.8018951732306783, + "grad_norm": 1.6062910758904774, + "learning_rate": 2.2594318797994895e-07, + "loss": 0.5954, + "step": 4731 + }, + { + "epoch": 2.802487414865265, + "grad_norm": 1.7630976777586607, + "learning_rate": 2.245933043823234e-07, + "loss": 0.5963, + "step": 4732 + }, + { + "epoch": 2.803079656499852, + "grad_norm": 1.5256593573589792, + "learning_rate": 2.232474194757339e-07, + "loss": 0.6046, + "step": 4733 + }, + { + "epoch": 2.803671898134439, + "grad_norm": 1.811287510313242, + "learning_rate": 2.2190553381072234e-07, + "loss": 0.6255, + "step": 4734 + }, + { + "epoch": 2.8042641397690256, + "grad_norm": 1.6696578027302151, + "learning_rate": 2.2056764793619845e-07, + "loss": 0.6031, + "step": 4735 + }, + { + "epoch": 2.8048563814036127, + "grad_norm": 2.2390362799933348, + "learning_rate": 2.1923376239942895e-07, + "loss": 0.6192, + "step": 4736 + }, + { + "epoch": 2.8054486230382, + "grad_norm": 1.9676446261540672, + "learning_rate": 2.179038777460507e-07, + "loss": 0.5974, + "step": 4737 + }, + { + "epoch": 2.8060408646727866, + "grad_norm": 1.3850897301178713, + "learning_rate": 2.1657799452005856e-07, + "loss": 0.58, + "step": 4738 + }, + { + "epoch": 2.8066331063073733, + "grad_norm": 1.5591433252070857, + "learning_rate": 2.1525611326381756e-07, + "loss": 0.6151, + "step": 4739 + }, + { + "epoch": 2.8072253479419604, + "grad_norm": 1.190679482654428, + "learning_rate": 2.139382345180474e-07, + "loss": 0.6089, + "step": 4740 + }, + { + "epoch": 2.807817589576547, + "grad_norm": 1.4443287017918227, + "learning_rate": 2.1262435882183685e-07, + "loss": 0.649, + "step": 4741 + }, + { + "epoch": 2.8084098312111343, + "grad_norm": 2.138041941041724, + "learning_rate": 2.1131448671263378e-07, + "loss": 0.602, + "step": 4742 + }, + { + "epoch": 2.809002072845721, + "grad_norm": 2.021125849306582, + "learning_rate": 2.1000861872625066e-07, + "loss": 0.6106, + "step": 4743 + }, + { + "epoch": 2.809594314480308, + "grad_norm": 1.1718395287718038, + "learning_rate": 2.0870675539686024e-07, + "loss": 0.6226, + "step": 4744 + }, + { + "epoch": 2.810186556114895, + "grad_norm": 2.4236065030797453, + "learning_rate": 2.0740889725699654e-07, + "loss": 0.5738, + "step": 4745 + }, + { + "epoch": 2.8107787977494816, + "grad_norm": 2.134568247280834, + "learning_rate": 2.0611504483756038e-07, + "loss": 0.6524, + "step": 4746 + }, + { + "epoch": 2.8113710393840687, + "grad_norm": 1.2716048058881149, + "learning_rate": 2.0482519866780516e-07, + "loss": 0.5964, + "step": 4747 + }, + { + "epoch": 2.8119632810186554, + "grad_norm": 1.2026027684878298, + "learning_rate": 2.0353935927535428e-07, + "loss": 0.5937, + "step": 4748 + }, + { + "epoch": 2.8125555226532426, + "grad_norm": 1.7177800039620705, + "learning_rate": 2.0225752718618707e-07, + "loss": 0.6003, + "step": 4749 + }, + { + "epoch": 2.8131477642878293, + "grad_norm": 2.3836143414025974, + "learning_rate": 2.009797029246452e-07, + "loss": 0.568, + "step": 4750 + }, + { + "epoch": 2.8137400059224165, + "grad_norm": 1.5700728206598682, + "learning_rate": 1.997058870134294e-07, + "loss": 0.5908, + "step": 4751 + }, + { + "epoch": 2.814332247557003, + "grad_norm": 1.4091090434483953, + "learning_rate": 1.9843607997360403e-07, + "loss": 0.5847, + "step": 4752 + }, + { + "epoch": 2.8149244891915903, + "grad_norm": 3.983754518350977, + "learning_rate": 1.9717028232458912e-07, + "loss": 0.6242, + "step": 4753 + }, + { + "epoch": 2.815516730826177, + "grad_norm": 3.552480776764616, + "learning_rate": 1.959084945841705e-07, + "loss": 0.6299, + "step": 4754 + }, + { + "epoch": 2.816108972460764, + "grad_norm": 1.1920497610991834, + "learning_rate": 1.9465071726848638e-07, + "loss": 0.6314, + "step": 4755 + }, + { + "epoch": 2.816701214095351, + "grad_norm": 1.0342228137761196, + "learning_rate": 1.9339695089204192e-07, + "loss": 0.5991, + "step": 4756 + }, + { + "epoch": 2.8172934557299376, + "grad_norm": 1.5464771974705371, + "learning_rate": 1.921471959676957e-07, + "loss": 0.5717, + "step": 4757 + }, + { + "epoch": 2.8178856973645248, + "grad_norm": 9.400365344050027, + "learning_rate": 1.9090145300666885e-07, + "loss": 0.5665, + "step": 4758 + }, + { + "epoch": 2.8184779389991115, + "grad_norm": 1.451507907352805, + "learning_rate": 1.8965972251854038e-07, + "loss": 0.6476, + "step": 4759 + }, + { + "epoch": 2.8190701806336986, + "grad_norm": 1.467526846495732, + "learning_rate": 1.884220050112462e-07, + "loss": 0.5874, + "step": 4760 + }, + { + "epoch": 2.8196624222682853, + "grad_norm": 1.2322692339357366, + "learning_rate": 1.8718830099108464e-07, + "loss": 0.5639, + "step": 4761 + }, + { + "epoch": 2.8202546639028725, + "grad_norm": 1.9431671531415993, + "learning_rate": 1.8595861096270874e-07, + "loss": 0.6446, + "step": 4762 + }, + { + "epoch": 2.820846905537459, + "grad_norm": 1.6843027054345925, + "learning_rate": 1.8473293542913163e-07, + "loss": 0.6301, + "step": 4763 + }, + { + "epoch": 2.8214391471720464, + "grad_norm": 2.487244252300873, + "learning_rate": 1.8351127489172227e-07, + "loss": 0.646, + "step": 4764 + }, + { + "epoch": 2.822031388806633, + "grad_norm": 3.497230980934624, + "learning_rate": 1.8229362985021092e-07, + "loss": 0.5818, + "step": 4765 + }, + { + "epoch": 2.82262363044122, + "grad_norm": 2.308374655735379, + "learning_rate": 1.8108000080267918e-07, + "loss": 0.6312, + "step": 4766 + }, + { + "epoch": 2.823215872075807, + "grad_norm": 2.0176260134625874, + "learning_rate": 1.7987038824557323e-07, + "loss": 0.5929, + "step": 4767 + }, + { + "epoch": 2.8238081137103936, + "grad_norm": 1.7213851889188727, + "learning_rate": 1.7866479267369062e-07, + "loss": 0.6419, + "step": 4768 + }, + { + "epoch": 2.824400355344981, + "grad_norm": 1.510546311903195, + "learning_rate": 1.7746321458018802e-07, + "loss": 0.5843, + "step": 4769 + }, + { + "epoch": 2.8249925969795675, + "grad_norm": 1.9187963373200847, + "learning_rate": 1.7626565445657883e-07, + "loss": 0.5998, + "step": 4770 + }, + { + "epoch": 2.8255848386141547, + "grad_norm": 1.3445654001439231, + "learning_rate": 1.750721127927324e-07, + "loss": 0.6266, + "step": 4771 + }, + { + "epoch": 2.8261770802487414, + "grad_norm": 1.4215769707003076, + "learning_rate": 1.7388259007687368e-07, + "loss": 0.6382, + "step": 4772 + }, + { + "epoch": 2.8267693218833285, + "grad_norm": 2.129246036856138, + "learning_rate": 1.7269708679558572e-07, + "loss": 0.6008, + "step": 4773 + }, + { + "epoch": 2.8273615635179152, + "grad_norm": 1.4627320973002913, + "learning_rate": 1.715156034338039e-07, + "loss": 0.6084, + "step": 4774 + }, + { + "epoch": 2.8279538051525024, + "grad_norm": 2.8153006747845875, + "learning_rate": 1.7033814047482388e-07, + "loss": 0.6042, + "step": 4775 + }, + { + "epoch": 2.828546046787089, + "grad_norm": 1.507076721738337, + "learning_rate": 1.691646984002937e-07, + "loss": 0.6466, + "step": 4776 + }, + { + "epoch": 2.8291382884216763, + "grad_norm": 1.2884117944639297, + "learning_rate": 1.6799527769021495e-07, + "loss": 0.6195, + "step": 4777 + }, + { + "epoch": 2.829730530056263, + "grad_norm": 1.6946887003219235, + "learning_rate": 1.6682987882294722e-07, + "loss": 0.6082, + "step": 4778 + }, + { + "epoch": 2.8303227716908497, + "grad_norm": 1.5567973673661333, + "learning_rate": 1.6566850227520693e-07, + "loss": 0.5863, + "step": 4779 + }, + { + "epoch": 2.830915013325437, + "grad_norm": 1.7210984837021315, + "learning_rate": 1.6451114852206073e-07, + "loss": 0.633, + "step": 4780 + }, + { + "epoch": 2.8315072549600235, + "grad_norm": 1.5582674489351216, + "learning_rate": 1.6335781803692884e-07, + "loss": 0.5535, + "step": 4781 + }, + { + "epoch": 2.8320994965946107, + "grad_norm": 3.604937316955796, + "learning_rate": 1.6220851129159164e-07, + "loss": 0.6094, + "step": 4782 + }, + { + "epoch": 2.8326917382291974, + "grad_norm": 1.468440659774134, + "learning_rate": 1.6106322875617974e-07, + "loss": 0.6098, + "step": 4783 + }, + { + "epoch": 2.8332839798637846, + "grad_norm": 1.5367592010242979, + "learning_rate": 1.5992197089917727e-07, + "loss": 0.5844, + "step": 4784 + }, + { + "epoch": 2.8338762214983713, + "grad_norm": 1.7789542552715796, + "learning_rate": 1.587847381874219e-07, + "loss": 0.6287, + "step": 4785 + }, + { + "epoch": 2.834468463132958, + "grad_norm": 5.853973890713451, + "learning_rate": 1.576515310861071e-07, + "loss": 0.5924, + "step": 4786 + }, + { + "epoch": 2.835060704767545, + "grad_norm": 1.7798910211182182, + "learning_rate": 1.565223500587798e-07, + "loss": 0.6681, + "step": 4787 + }, + { + "epoch": 2.8356529464021323, + "grad_norm": 2.8553443898913935, + "learning_rate": 1.55397195567335e-07, + "loss": 0.6537, + "step": 4788 + }, + { + "epoch": 2.836245188036719, + "grad_norm": 1.1857098480480426, + "learning_rate": 1.5427606807202676e-07, + "loss": 0.6129, + "step": 4789 + }, + { + "epoch": 2.8368374296713057, + "grad_norm": 1.5704580950283231, + "learning_rate": 1.5315896803145824e-07, + "loss": 0.6026, + "step": 4790 + }, + { + "epoch": 2.837429671305893, + "grad_norm": 4.451172736833375, + "learning_rate": 1.5204589590258722e-07, + "loss": 0.6233, + "step": 4791 + }, + { + "epoch": 2.8380219129404796, + "grad_norm": 1.6353470217624213, + "learning_rate": 1.5093685214072173e-07, + "loss": 0.6129, + "step": 4792 + }, + { + "epoch": 2.8386141545750667, + "grad_norm": 1.660643984933811, + "learning_rate": 1.4983183719952222e-07, + "loss": 0.6326, + "step": 4793 + }, + { + "epoch": 2.8392063962096534, + "grad_norm": 1.456440719487532, + "learning_rate": 1.4873085153100485e-07, + "loss": 0.6038, + "step": 4794 + }, + { + "epoch": 2.8397986378442406, + "grad_norm": 1.2676177701134983, + "learning_rate": 1.4763389558553164e-07, + "loss": 0.5914, + "step": 4795 + }, + { + "epoch": 2.8403908794788273, + "grad_norm": 1.4386662121643279, + "learning_rate": 1.4654096981182031e-07, + "loss": 0.617, + "step": 4796 + }, + { + "epoch": 2.840983121113414, + "grad_norm": 1.3829547994752223, + "learning_rate": 1.4545207465693877e-07, + "loss": 0.6172, + "step": 4797 + }, + { + "epoch": 2.841575362748001, + "grad_norm": 2.9258003797550542, + "learning_rate": 1.4436721056630853e-07, + "loss": 0.6146, + "step": 4798 + }, + { + "epoch": 2.8421676043825883, + "grad_norm": 3.4493369680899795, + "learning_rate": 1.432863779836968e-07, + "loss": 0.6168, + "step": 4799 + }, + { + "epoch": 2.842759846017175, + "grad_norm": 1.7932986975776593, + "learning_rate": 1.4220957735122663e-07, + "loss": 0.624, + "step": 4800 + }, + { + "epoch": 2.8433520876517617, + "grad_norm": 2.6126448363470867, + "learning_rate": 1.41136809109369e-07, + "loss": 0.5994, + "step": 4801 + }, + { + "epoch": 2.843944329286349, + "grad_norm": 1.9157604337658336, + "learning_rate": 1.400680736969484e-07, + "loss": 0.6276, + "step": 4802 + }, + { + "epoch": 2.8445365709209356, + "grad_norm": 1.7072043505201628, + "learning_rate": 1.390033715511363e-07, + "loss": 0.6496, + "step": 4803 + }, + { + "epoch": 2.8451288125555227, + "grad_norm": 1.6570154960641135, + "learning_rate": 1.3794270310745538e-07, + "loss": 0.6308, + "step": 4804 + }, + { + "epoch": 2.8457210541901095, + "grad_norm": 1.344278584688019, + "learning_rate": 1.3688606879977863e-07, + "loss": 0.58, + "step": 4805 + }, + { + "epoch": 2.8463132958246966, + "grad_norm": 1.4565547659975684, + "learning_rate": 1.3583346906033024e-07, + "loss": 0.6234, + "step": 4806 + }, + { + "epoch": 2.8469055374592833, + "grad_norm": 1.5371160933434378, + "learning_rate": 1.347849043196814e-07, + "loss": 0.6329, + "step": 4807 + }, + { + "epoch": 2.84749777909387, + "grad_norm": 1.5015920663462485, + "learning_rate": 1.3374037500675452e-07, + "loss": 0.6034, + "step": 4808 + }, + { + "epoch": 2.848090020728457, + "grad_norm": 1.2589321162204534, + "learning_rate": 1.326998815488212e-07, + "loss": 0.5696, + "step": 4809 + }, + { + "epoch": 2.8486822623630443, + "grad_norm": 1.5267234806346195, + "learning_rate": 1.3166342437150204e-07, + "loss": 0.6538, + "step": 4810 + }, + { + "epoch": 2.849274503997631, + "grad_norm": 3.0702462243014934, + "learning_rate": 1.306310038987657e-07, + "loss": 0.63, + "step": 4811 + }, + { + "epoch": 2.8498667456322178, + "grad_norm": 1.6574653955539411, + "learning_rate": 1.2960262055292884e-07, + "loss": 0.6405, + "step": 4812 + }, + { + "epoch": 2.850458987266805, + "grad_norm": 1.55721639877075, + "learning_rate": 1.2857827475466045e-07, + "loss": 0.6251, + "step": 4813 + }, + { + "epoch": 2.8510512289013916, + "grad_norm": 1.5458939977128197, + "learning_rate": 1.275579669229743e-07, + "loss": 0.6109, + "step": 4814 + }, + { + "epoch": 2.8516434705359788, + "grad_norm": 2.5596201627651243, + "learning_rate": 1.2654169747523425e-07, + "loss": 0.632, + "step": 4815 + }, + { + "epoch": 2.8522357121705655, + "grad_norm": 1.4732233414875766, + "learning_rate": 1.2552946682715116e-07, + "loss": 0.6403, + "step": 4816 + }, + { + "epoch": 2.8528279538051526, + "grad_norm": 1.900721516109027, + "learning_rate": 1.2452127539278493e-07, + "loss": 0.5844, + "step": 4817 + }, + { + "epoch": 2.8534201954397393, + "grad_norm": 1.634171972189307, + "learning_rate": 1.2351712358454115e-07, + "loss": 0.6421, + "step": 4818 + }, + { + "epoch": 2.854012437074326, + "grad_norm": 1.4903972592743484, + "learning_rate": 1.2251701181317577e-07, + "loss": 0.5858, + "step": 4819 + }, + { + "epoch": 2.854604678708913, + "grad_norm": 5.834292688749914, + "learning_rate": 1.215209404877904e-07, + "loss": 0.6248, + "step": 4820 + }, + { + "epoch": 2.8551969203435004, + "grad_norm": 2.127865932703041, + "learning_rate": 1.2052891001583356e-07, + "loss": 0.5898, + "step": 4821 + }, + { + "epoch": 2.855789161978087, + "grad_norm": 1.2644572972552057, + "learning_rate": 1.1954092080310288e-07, + "loss": 0.6052, + "step": 4822 + }, + { + "epoch": 2.856381403612674, + "grad_norm": 3.002356796290768, + "learning_rate": 1.185569732537406e-07, + "loss": 0.6027, + "step": 4823 + }, + { + "epoch": 2.856973645247261, + "grad_norm": 2.3834117533925405, + "learning_rate": 1.1757706777023592e-07, + "loss": 0.6043, + "step": 4824 + }, + { + "epoch": 2.8575658868818476, + "grad_norm": 1.9852462249046476, + "learning_rate": 1.1660120475342707e-07, + "loss": 0.5817, + "step": 4825 + }, + { + "epoch": 2.858158128516435, + "grad_norm": 3.440130845092385, + "learning_rate": 1.1562938460249473e-07, + "loss": 0.647, + "step": 4826 + }, + { + "epoch": 2.8587503701510215, + "grad_norm": 1.5159101594522972, + "learning_rate": 1.1466160771496982e-07, + "loss": 0.6234, + "step": 4827 + }, + { + "epoch": 2.8593426117856087, + "grad_norm": 1.7623292425646084, + "learning_rate": 1.1369787448672675e-07, + "loss": 0.6387, + "step": 4828 + }, + { + "epoch": 2.8599348534201954, + "grad_norm": 1.1069616291653879, + "learning_rate": 1.1273818531198689e-07, + "loss": 0.651, + "step": 4829 + }, + { + "epoch": 2.860527095054782, + "grad_norm": 1.5908965577567722, + "learning_rate": 1.1178254058331616e-07, + "loss": 0.5953, + "step": 4830 + }, + { + "epoch": 2.8611193366893692, + "grad_norm": 1.1175423505897313, + "learning_rate": 1.1083094069162747e-07, + "loss": 0.5764, + "step": 4831 + }, + { + "epoch": 2.8617115783239564, + "grad_norm": 2.6373508177619223, + "learning_rate": 1.0988338602618053e-07, + "loss": 0.5694, + "step": 4832 + }, + { + "epoch": 2.862303819958543, + "grad_norm": 1.5234468544649382, + "learning_rate": 1.0893987697457531e-07, + "loss": 0.602, + "step": 4833 + }, + { + "epoch": 2.86289606159313, + "grad_norm": 2.2250099868509743, + "learning_rate": 1.0800041392276194e-07, + "loss": 0.6286, + "step": 4834 + }, + { + "epoch": 2.863488303227717, + "grad_norm": 2.59536596351284, + "learning_rate": 1.0706499725503306e-07, + "loss": 0.6136, + "step": 4835 + }, + { + "epoch": 2.8640805448623037, + "grad_norm": 1.4278623489444429, + "learning_rate": 1.061336273540281e-07, + "loss": 0.616, + "step": 4836 + }, + { + "epoch": 2.864672786496891, + "grad_norm": 4.753893258101949, + "learning_rate": 1.0520630460072789e-07, + "loss": 0.6092, + "step": 4837 + }, + { + "epoch": 2.8652650281314775, + "grad_norm": 5.378554417045982, + "learning_rate": 1.0428302937445899e-07, + "loss": 0.6194, + "step": 4838 + }, + { + "epoch": 2.8658572697660647, + "grad_norm": 1.8174164225079488, + "learning_rate": 1.0336380205289598e-07, + "loss": 0.6245, + "step": 4839 + }, + { + "epoch": 2.8664495114006514, + "grad_norm": 1.8785119619391066, + "learning_rate": 1.024486230120525e-07, + "loss": 0.6206, + "step": 4840 + }, + { + "epoch": 2.867041753035238, + "grad_norm": 1.4437126735611092, + "learning_rate": 1.0153749262628798e-07, + "loss": 0.604, + "step": 4841 + }, + { + "epoch": 2.8676339946698253, + "grad_norm": 2.6007194776549225, + "learning_rate": 1.0063041126830542e-07, + "loss": 0.6137, + "step": 4842 + }, + { + "epoch": 2.8682262363044124, + "grad_norm": 1.7004655793931742, + "learning_rate": 9.972737930915576e-08, + "loss": 0.6176, + "step": 4843 + }, + { + "epoch": 2.868818477938999, + "grad_norm": 1.7229767787502699, + "learning_rate": 9.882839711822468e-08, + "loss": 0.6027, + "step": 4844 + }, + { + "epoch": 2.869410719573586, + "grad_norm": 2.90523507649429, + "learning_rate": 9.793346506325019e-08, + "loss": 0.6238, + "step": 4845 + }, + { + "epoch": 2.870002961208173, + "grad_norm": 2.42243276703994, + "learning_rate": 9.704258351030838e-08, + "loss": 0.6451, + "step": 4846 + }, + { + "epoch": 2.8705952028427597, + "grad_norm": 1.7458977764762762, + "learning_rate": 9.615575282381995e-08, + "loss": 0.5998, + "step": 4847 + }, + { + "epoch": 2.871187444477347, + "grad_norm": 1.3809462087198652, + "learning_rate": 9.527297336654917e-08, + "loss": 0.6044, + "step": 4848 + }, + { + "epoch": 2.8717796861119336, + "grad_norm": 2.8002775238184037, + "learning_rate": 9.439424549960164e-08, + "loss": 0.5919, + "step": 4849 + }, + { + "epoch": 2.8723719277465207, + "grad_norm": 1.6723003456861165, + "learning_rate": 9.351956958242648e-08, + "loss": 0.6054, + "step": 4850 + }, + { + "epoch": 2.8729641693811074, + "grad_norm": 1.2925306658163453, + "learning_rate": 9.264894597281637e-08, + "loss": 0.5964, + "step": 4851 + }, + { + "epoch": 2.873556411015694, + "grad_norm": 1.5873219385850321, + "learning_rate": 9.178237502690423e-08, + "loss": 0.6179, + "step": 4852 + }, + { + "epoch": 2.8741486526502813, + "grad_norm": 1.839675287935247, + "learning_rate": 9.091985709916651e-08, + "loss": 0.6329, + "step": 4853 + }, + { + "epoch": 2.8747408942848685, + "grad_norm": 1.1428648528723437, + "learning_rate": 9.006139254242319e-08, + "loss": 0.5924, + "step": 4854 + }, + { + "epoch": 2.875333135919455, + "grad_norm": 1.5256355451305668, + "learning_rate": 8.920698170783226e-08, + "loss": 0.572, + "step": 4855 + }, + { + "epoch": 2.875925377554042, + "grad_norm": 1.4817600093717371, + "learning_rate": 8.835662494489638e-08, + "loss": 0.653, + "step": 4856 + }, + { + "epoch": 2.876517619188629, + "grad_norm": 1.9011431000162187, + "learning_rate": 8.751032260145841e-08, + "loss": 0.6457, + "step": 4857 + }, + { + "epoch": 2.8771098608232157, + "grad_norm": 1.4632309745360057, + "learning_rate": 8.6668075023707e-08, + "loss": 0.6112, + "step": 4858 + }, + { + "epoch": 2.877702102457803, + "grad_norm": 2.231165669864582, + "learning_rate": 8.582988255616542e-08, + "loss": 0.6457, + "step": 4859 + }, + { + "epoch": 2.8782943440923896, + "grad_norm": 1.6820428525262583, + "learning_rate": 8.499574554170276e-08, + "loss": 0.6196, + "step": 4860 + }, + { + "epoch": 2.8788865857269768, + "grad_norm": 2.545677098827465, + "learning_rate": 8.416566432152717e-08, + "loss": 0.6142, + "step": 4861 + }, + { + "epoch": 2.8794788273615635, + "grad_norm": 7.836633886663809, + "learning_rate": 8.333963923519039e-08, + "loss": 0.6608, + "step": 4862 + }, + { + "epoch": 2.88007106899615, + "grad_norm": 2.1612339210989977, + "learning_rate": 8.251767062058102e-08, + "loss": 0.5832, + "step": 4863 + }, + { + "epoch": 2.8806633106307373, + "grad_norm": 4.073221900832398, + "learning_rate": 8.169975881393122e-08, + "loss": 0.6748, + "step": 4864 + }, + { + "epoch": 2.8812555522653245, + "grad_norm": 1.332971531493207, + "learning_rate": 8.088590414981445e-08, + "loss": 0.6198, + "step": 4865 + }, + { + "epoch": 2.881847793899911, + "grad_norm": 1.322813916282647, + "learning_rate": 8.007610696114e-08, + "loss": 0.6119, + "step": 4866 + }, + { + "epoch": 2.882440035534498, + "grad_norm": 1.4855030858592297, + "learning_rate": 7.927036757916284e-08, + "loss": 0.556, + "step": 4867 + }, + { + "epoch": 2.883032277169085, + "grad_norm": 1.3953005756472212, + "learning_rate": 7.846868633347492e-08, + "loss": 0.5986, + "step": 4868 + }, + { + "epoch": 2.8836245188036718, + "grad_norm": 2.9292427514375365, + "learning_rate": 7.767106355200949e-08, + "loss": 0.637, + "step": 4869 + }, + { + "epoch": 2.884216760438259, + "grad_norm": 1.5163064833083102, + "learning_rate": 7.687749956103774e-08, + "loss": 0.6435, + "step": 4870 + }, + { + "epoch": 2.8848090020728456, + "grad_norm": 2.053869781792157, + "learning_rate": 7.608799468517336e-08, + "loss": 0.5933, + "step": 4871 + }, + { + "epoch": 2.885401243707433, + "grad_norm": 1.1888149397737269, + "learning_rate": 7.530254924736691e-08, + "loss": 0.6247, + "step": 4872 + }, + { + "epoch": 2.8859934853420195, + "grad_norm": 1.6691352976148741, + "learning_rate": 7.452116356891136e-08, + "loss": 0.5899, + "step": 4873 + }, + { + "epoch": 2.886585726976606, + "grad_norm": 1.9588081253912122, + "learning_rate": 7.374383796943663e-08, + "loss": 0.6164, + "step": 4874 + }, + { + "epoch": 2.8871779686111934, + "grad_norm": 1.983783309455853, + "learning_rate": 7.297057276691277e-08, + "loss": 0.5889, + "step": 4875 + }, + { + "epoch": 2.8877702102457805, + "grad_norm": 1.2950722515129418, + "learning_rate": 7.22013682776479e-08, + "loss": 0.5975, + "step": 4876 + }, + { + "epoch": 2.888362451880367, + "grad_norm": 1.469392578281443, + "learning_rate": 7.143622481629031e-08, + "loss": 0.5583, + "step": 4877 + }, + { + "epoch": 2.888954693514954, + "grad_norm": 1.7592018298109806, + "learning_rate": 7.067514269582743e-08, + "loss": 0.5739, + "step": 4878 + }, + { + "epoch": 2.889546935149541, + "grad_norm": 1.3423603999327254, + "learning_rate": 6.991812222758354e-08, + "loss": 0.5982, + "step": 4879 + }, + { + "epoch": 2.890139176784128, + "grad_norm": 1.6619904015701097, + "learning_rate": 6.916516372122429e-08, + "loss": 0.65, + "step": 4880 + }, + { + "epoch": 2.890731418418715, + "grad_norm": 1.287530941188681, + "learning_rate": 6.841626748474994e-08, + "loss": 0.6254, + "step": 4881 + }, + { + "epoch": 2.8913236600533017, + "grad_norm": 1.6059010609864288, + "learning_rate": 6.767143382450214e-08, + "loss": 0.609, + "step": 4882 + }, + { + "epoch": 2.891915901687889, + "grad_norm": 1.4171569214064548, + "learning_rate": 6.693066304516049e-08, + "loss": 0.6497, + "step": 4883 + }, + { + "epoch": 2.8925081433224755, + "grad_norm": 2.499973096967963, + "learning_rate": 6.619395544974039e-08, + "loss": 0.6378, + "step": 4884 + }, + { + "epoch": 2.8931003849570622, + "grad_norm": 2.2303635681374527, + "learning_rate": 6.546131133959743e-08, + "loss": 0.5796, + "step": 4885 + }, + { + "epoch": 2.8936926265916494, + "grad_norm": 2.4145279843725187, + "learning_rate": 6.473273101442412e-08, + "loss": 0.6072, + "step": 4886 + }, + { + "epoch": 2.8942848682262365, + "grad_norm": 1.9679250585944843, + "learning_rate": 6.400821477225206e-08, + "loss": 0.5963, + "step": 4887 + }, + { + "epoch": 2.8948771098608232, + "grad_norm": 2.3343239037159074, + "learning_rate": 6.32877629094475e-08, + "loss": 0.5846, + "step": 4888 + }, + { + "epoch": 2.89546935149541, + "grad_norm": 1.5461453160570697, + "learning_rate": 6.25713757207158e-08, + "loss": 0.5769, + "step": 4889 + }, + { + "epoch": 2.896061593129997, + "grad_norm": 1.0705482530363706, + "learning_rate": 6.185905349910038e-08, + "loss": 0.6512, + "step": 4890 + }, + { + "epoch": 2.896653834764584, + "grad_norm": 1.2649633299278897, + "learning_rate": 6.115079653598032e-08, + "loss": 0.6649, + "step": 4891 + }, + { + "epoch": 2.897246076399171, + "grad_norm": 1.8470616975035683, + "learning_rate": 6.044660512107392e-08, + "loss": 0.6089, + "step": 4892 + }, + { + "epoch": 2.8978383180337577, + "grad_norm": 2.1565586383593662, + "learning_rate": 5.974647954243295e-08, + "loss": 0.5952, + "step": 4893 + }, + { + "epoch": 2.898430559668345, + "grad_norm": 1.854936286527562, + "learning_rate": 5.905042008645057e-08, + "loss": 0.5861, + "step": 4894 + }, + { + "epoch": 2.8990228013029316, + "grad_norm": 1.935575054571336, + "learning_rate": 5.835842703785233e-08, + "loss": 0.6239, + "step": 4895 + }, + { + "epoch": 2.8996150429375183, + "grad_norm": 1.7416078769823777, + "learning_rate": 5.7670500679702925e-08, + "loss": 0.6335, + "step": 4896 + }, + { + "epoch": 2.9002072845721054, + "grad_norm": 1.1470576618099952, + "learning_rate": 5.698664129340281e-08, + "loss": 0.5932, + "step": 4897 + }, + { + "epoch": 2.9007995262066926, + "grad_norm": 1.2955327547356683, + "learning_rate": 5.630684915868934e-08, + "loss": 0.5998, + "step": 4898 + }, + { + "epoch": 2.9013917678412793, + "grad_norm": 3.332322229387898, + "learning_rate": 5.5631124553636726e-08, + "loss": 0.6397, + "step": 4899 + }, + { + "epoch": 2.901984009475866, + "grad_norm": 1.3915052794106848, + "learning_rate": 5.4959467754651665e-08, + "loss": 0.6373, + "step": 4900 + }, + { + "epoch": 2.902576251110453, + "grad_norm": 1.8217424896661099, + "learning_rate": 5.429187903647992e-08, + "loss": 0.6303, + "step": 4901 + }, + { + "epoch": 2.90316849274504, + "grad_norm": 1.5269216925197477, + "learning_rate": 5.3628358672205285e-08, + "loss": 0.6163, + "step": 4902 + }, + { + "epoch": 2.903760734379627, + "grad_norm": 2.2464586629579517, + "learning_rate": 5.2968906933243966e-08, + "loss": 0.6387, + "step": 4903 + }, + { + "epoch": 2.9043529760142137, + "grad_norm": 1.4028855974264578, + "learning_rate": 5.231352408934687e-08, + "loss": 0.6465, + "step": 4904 + }, + { + "epoch": 2.904945217648801, + "grad_norm": 2.456044814128531, + "learning_rate": 5.1662210408605084e-08, + "loss": 0.5744, + "step": 4905 + }, + { + "epoch": 2.9055374592833876, + "grad_norm": 1.8315857516887082, + "learning_rate": 5.101496615744106e-08, + "loss": 0.5918, + "step": 4906 + }, + { + "epoch": 2.9061297009179743, + "grad_norm": 1.3272661044838177, + "learning_rate": 5.0371791600614115e-08, + "loss": 0.6366, + "step": 4907 + }, + { + "epoch": 2.9067219425525614, + "grad_norm": 2.857393405458701, + "learning_rate": 4.973268700121936e-08, + "loss": 0.5693, + "step": 4908 + }, + { + "epoch": 2.9073141841871486, + "grad_norm": 2.217648509730232, + "learning_rate": 4.9097652620685444e-08, + "loss": 0.628, + "step": 4909 + }, + { + "epoch": 2.9079064258217353, + "grad_norm": 1.286864564902065, + "learning_rate": 4.846668871877902e-08, + "loss": 0.641, + "step": 4910 + }, + { + "epoch": 2.908498667456322, + "grad_norm": 1.297605169345718, + "learning_rate": 4.783979555359808e-08, + "loss": 0.6, + "step": 4911 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 1.2656130698182047, + "learning_rate": 4.721697338157749e-08, + "loss": 0.5856, + "step": 4912 + }, + { + "epoch": 2.909683150725496, + "grad_norm": 1.1801287041757442, + "learning_rate": 4.65982224574868e-08, + "loss": 0.5315, + "step": 4913 + }, + { + "epoch": 2.910275392360083, + "grad_norm": 1.7243061122338232, + "learning_rate": 4.59835430344302e-08, + "loss": 0.5509, + "step": 4914 + }, + { + "epoch": 2.9108676339946697, + "grad_norm": 3.045143418965923, + "learning_rate": 4.537293536384657e-08, + "loss": 0.5648, + "step": 4915 + }, + { + "epoch": 2.911459875629257, + "grad_norm": 1.4709546764859878, + "learning_rate": 4.476639969550722e-08, + "loss": 0.5994, + "step": 4916 + }, + { + "epoch": 2.9120521172638436, + "grad_norm": 2.2971830899861776, + "learning_rate": 4.416393627752147e-08, + "loss": 0.6151, + "step": 4917 + }, + { + "epoch": 2.9126443588984303, + "grad_norm": 1.832372143366767, + "learning_rate": 4.3565545356327734e-08, + "loss": 0.6323, + "step": 4918 + }, + { + "epoch": 2.9132366005330175, + "grad_norm": 2.17503203723051, + "learning_rate": 4.2971227176704656e-08, + "loss": 0.6108, + "step": 4919 + }, + { + "epoch": 2.9138288421676046, + "grad_norm": 2.3265306058858797, + "learning_rate": 4.2380981981759994e-08, + "loss": 0.613, + "step": 4920 + }, + { + "epoch": 2.9144210838021913, + "grad_norm": 1.367396635079341, + "learning_rate": 4.179481001293839e-08, + "loss": 0.5899, + "step": 4921 + }, + { + "epoch": 2.915013325436778, + "grad_norm": 2.8847118428481857, + "learning_rate": 4.1212711510015826e-08, + "loss": 0.6114, + "step": 4922 + }, + { + "epoch": 2.915605567071365, + "grad_norm": 1.3502886513621417, + "learning_rate": 4.0634686711104043e-08, + "loss": 0.6113, + "step": 4923 + }, + { + "epoch": 2.916197808705952, + "grad_norm": 1.2799009361253797, + "learning_rate": 4.006073585264725e-08, + "loss": 0.611, + "step": 4924 + }, + { + "epoch": 2.916790050340539, + "grad_norm": 42.704793158144476, + "learning_rate": 3.94908591694243e-08, + "loss": 0.6016, + "step": 4925 + }, + { + "epoch": 2.9173822919751258, + "grad_norm": 3.2359359053380894, + "learning_rate": 3.89250568945454e-08, + "loss": 0.6014, + "step": 4926 + }, + { + "epoch": 2.917974533609713, + "grad_norm": 1.5404185217083588, + "learning_rate": 3.836332925945874e-08, + "loss": 0.6352, + "step": 4927 + }, + { + "epoch": 2.9185667752442996, + "grad_norm": 1.193895601439141, + "learning_rate": 3.7805676493938294e-08, + "loss": 0.6499, + "step": 4928 + }, + { + "epoch": 2.9191590168788863, + "grad_norm": 1.7488780731222586, + "learning_rate": 3.7252098826098256e-08, + "loss": 0.6066, + "step": 4929 + }, + { + "epoch": 2.9197512585134735, + "grad_norm": 1.3649816816969873, + "learning_rate": 3.6702596482381946e-08, + "loss": 0.5781, + "step": 4930 + }, + { + "epoch": 2.9203435001480607, + "grad_norm": 1.4996743472643042, + "learning_rate": 3.615716968756733e-08, + "loss": 0.6191, + "step": 4931 + }, + { + "epoch": 2.9209357417826474, + "grad_norm": 2.174241162693474, + "learning_rate": 3.5615818664764825e-08, + "loss": 0.6638, + "step": 4932 + }, + { + "epoch": 2.921527983417234, + "grad_norm": 1.0378189780160854, + "learning_rate": 3.507854363541619e-08, + "loss": 0.6222, + "step": 4933 + }, + { + "epoch": 2.9221202250518212, + "grad_norm": 1.273845744800928, + "learning_rate": 3.454534481929783e-08, + "loss": 0.6489, + "step": 4934 + }, + { + "epoch": 2.922712466686408, + "grad_norm": 1.665710013846029, + "learning_rate": 3.4016222434518634e-08, + "loss": 0.5685, + "step": 4935 + }, + { + "epoch": 2.923304708320995, + "grad_norm": 1.59905029910758, + "learning_rate": 3.349117669751767e-08, + "loss": 0.581, + "step": 4936 + }, + { + "epoch": 2.923896949955582, + "grad_norm": 1.6375386411169495, + "learning_rate": 3.297020782307092e-08, + "loss": 0.648, + "step": 4937 + }, + { + "epoch": 2.924489191590169, + "grad_norm": 2.9794337991351627, + "learning_rate": 3.245331602428126e-08, + "loss": 0.6716, + "step": 4938 + }, + { + "epoch": 2.9250814332247557, + "grad_norm": 1.1832076972764827, + "learning_rate": 3.194050151258732e-08, + "loss": 0.6242, + "step": 4939 + }, + { + "epoch": 2.9256736748593424, + "grad_norm": 1.4696232417709514, + "learning_rate": 3.14317644977602e-08, + "loss": 0.6092, + "step": 4940 + }, + { + "epoch": 2.9262659164939295, + "grad_norm": 1.448707576808793, + "learning_rate": 3.09271051879001e-08, + "loss": 0.6279, + "step": 4941 + }, + { + "epoch": 2.9268581581285167, + "grad_norm": 1.6727775055688563, + "learning_rate": 3.0426523789442994e-08, + "loss": 0.6081, + "step": 4942 + }, + { + "epoch": 2.9274503997631034, + "grad_norm": 1.3971103869765082, + "learning_rate": 2.9930020507153986e-08, + "loss": 0.6083, + "step": 4943 + }, + { + "epoch": 2.92804264139769, + "grad_norm": 2.2833557926974057, + "learning_rate": 2.9437595544130615e-08, + "loss": 0.5705, + "step": 4944 + }, + { + "epoch": 2.9286348830322773, + "grad_norm": 3.0762266827362024, + "learning_rate": 2.8949249101801747e-08, + "loss": 0.575, + "step": 4945 + }, + { + "epoch": 2.929227124666864, + "grad_norm": 1.5877663565907878, + "learning_rate": 2.8464981379929814e-08, + "loss": 0.5788, + "step": 4946 + }, + { + "epoch": 2.929819366301451, + "grad_norm": 1.87679323242289, + "learning_rate": 2.7984792576606355e-08, + "loss": 0.6044, + "step": 4947 + }, + { + "epoch": 2.930411607936038, + "grad_norm": 1.579589442857244, + "learning_rate": 2.7508682888257587e-08, + "loss": 0.6532, + "step": 4948 + }, + { + "epoch": 2.931003849570625, + "grad_norm": 2.2731032093347125, + "learning_rate": 2.7036652509636607e-08, + "loss": 0.6169, + "step": 4949 + }, + { + "epoch": 2.9315960912052117, + "grad_norm": 1.6629613159148906, + "learning_rate": 2.6568701633832295e-08, + "loss": 0.6204, + "step": 4950 + }, + { + "epoch": 2.9321883328397984, + "grad_norm": 1.539079572245288, + "learning_rate": 2.610483045226264e-08, + "loss": 0.6334, + "step": 4951 + }, + { + "epoch": 2.9327805744743856, + "grad_norm": 3.602880436876509, + "learning_rate": 2.5645039154675867e-08, + "loss": 0.639, + "step": 4952 + }, + { + "epoch": 2.9333728161089727, + "grad_norm": 1.987049639752013, + "learning_rate": 2.518932792915263e-08, + "loss": 0.5984, + "step": 4953 + }, + { + "epoch": 2.9339650577435594, + "grad_norm": 1.4505610755352987, + "learning_rate": 2.4737696962106038e-08, + "loss": 0.5792, + "step": 4954 + }, + { + "epoch": 2.934557299378146, + "grad_norm": 1.5647319892731515, + "learning_rate": 2.4290146438277205e-08, + "loss": 0.6523, + "step": 4955 + }, + { + "epoch": 2.9351495410127333, + "grad_norm": 1.6632250660344898, + "learning_rate": 2.3846676540739687e-08, + "loss": 0.6055, + "step": 4956 + }, + { + "epoch": 2.93574178264732, + "grad_norm": 6.646406313636889, + "learning_rate": 2.3407287450897265e-08, + "loss": 0.553, + "step": 4957 + }, + { + "epoch": 2.936334024281907, + "grad_norm": 1.5348933781298375, + "learning_rate": 2.2971979348485053e-08, + "loss": 0.5773, + "step": 4958 + }, + { + "epoch": 2.936926265916494, + "grad_norm": 7.44752974245422, + "learning_rate": 2.25407524115695e-08, + "loss": 0.6163, + "step": 4959 + }, + { + "epoch": 2.937518507551081, + "grad_norm": 1.6171911575067186, + "learning_rate": 2.2113606816546172e-08, + "loss": 0.6417, + "step": 4960 + }, + { + "epoch": 2.9381107491856677, + "grad_norm": 3.9860985428266287, + "learning_rate": 2.169054273814086e-08, + "loss": 0.6266, + "step": 4961 + }, + { + "epoch": 2.9387029908202544, + "grad_norm": 2.1282877407533936, + "learning_rate": 2.127156034941069e-08, + "loss": 0.6488, + "step": 4962 + }, + { + "epoch": 2.9392952324548416, + "grad_norm": 1.54117884175081, + "learning_rate": 2.085665982174412e-08, + "loss": 0.5842, + "step": 4963 + }, + { + "epoch": 2.9398874740894287, + "grad_norm": 1.6319922889959433, + "learning_rate": 2.0445841324856497e-08, + "loss": 0.5998, + "step": 4964 + }, + { + "epoch": 2.9404797157240155, + "grad_norm": 1.9689793022752626, + "learning_rate": 2.0039105026798956e-08, + "loss": 0.5922, + "step": 4965 + }, + { + "epoch": 2.941071957358602, + "grad_norm": 1.9910821968525951, + "learning_rate": 1.9636451093947296e-08, + "loss": 0.6316, + "step": 4966 + }, + { + "epoch": 2.9416641989931893, + "grad_norm": 1.1742386957010071, + "learning_rate": 1.9237879691009764e-08, + "loss": 0.615, + "step": 4967 + }, + { + "epoch": 2.942256440627776, + "grad_norm": 1.9540025838105881, + "learning_rate": 1.8843390981024835e-08, + "loss": 0.6529, + "step": 4968 + }, + { + "epoch": 2.942848682262363, + "grad_norm": 1.426910256416881, + "learning_rate": 1.84529851253612e-08, + "loss": 0.6153, + "step": 4969 + }, + { + "epoch": 2.94344092389695, + "grad_norm": 1.3438358517937634, + "learning_rate": 1.8066662283715562e-08, + "loss": 0.628, + "step": 4970 + }, + { + "epoch": 2.944033165531537, + "grad_norm": 1.3977430066085448, + "learning_rate": 1.768442261411707e-08, + "loss": 0.5951, + "step": 4971 + }, + { + "epoch": 2.9446254071661238, + "grad_norm": 1.205901999673902, + "learning_rate": 1.7306266272921756e-08, + "loss": 0.63, + "step": 4972 + }, + { + "epoch": 2.9452176488007105, + "grad_norm": 1.4995147029787064, + "learning_rate": 1.6932193414817e-08, + "loss": 0.6355, + "step": 4973 + }, + { + "epoch": 2.9458098904352976, + "grad_norm": 1.2679556957348548, + "learning_rate": 1.6562204192821507e-08, + "loss": 0.6433, + "step": 4974 + }, + { + "epoch": 2.9464021320698848, + "grad_norm": 10.158096968146605, + "learning_rate": 1.619629875827977e-08, + "loss": 0.5938, + "step": 4975 + }, + { + "epoch": 2.9469943737044715, + "grad_norm": 2.776393491326393, + "learning_rate": 1.583447726086762e-08, + "loss": 0.6448, + "step": 4976 + }, + { + "epoch": 2.947586615339058, + "grad_norm": 2.679025395136911, + "learning_rate": 1.5476739848592216e-08, + "loss": 0.6748, + "step": 4977 + }, + { + "epoch": 2.9481788569736453, + "grad_norm": 1.4748116377281646, + "learning_rate": 1.5123086667786502e-08, + "loss": 0.598, + "step": 4978 + }, + { + "epoch": 2.948771098608232, + "grad_norm": 1.4962671971975132, + "learning_rate": 1.4773517863114761e-08, + "loss": 0.5961, + "step": 4979 + }, + { + "epoch": 2.949363340242819, + "grad_norm": 1.6298275886188416, + "learning_rate": 1.4428033577571498e-08, + "loss": 0.6241, + "step": 4980 + }, + { + "epoch": 2.949955581877406, + "grad_norm": 1.314024889434035, + "learning_rate": 1.4086633952478113e-08, + "loss": 0.6465, + "step": 4981 + }, + { + "epoch": 2.950547823511993, + "grad_norm": 1.0907588428254045, + "learning_rate": 1.3749319127486228e-08, + "loss": 0.5865, + "step": 4982 + }, + { + "epoch": 2.95114006514658, + "grad_norm": 1.3791495256977004, + "learning_rate": 1.341608924057658e-08, + "loss": 0.6294, + "step": 4983 + }, + { + "epoch": 2.9517323067811665, + "grad_norm": 1.42971017695065, + "learning_rate": 1.3086944428060132e-08, + "loss": 0.5849, + "step": 4984 + }, + { + "epoch": 2.9523245484157536, + "grad_norm": 1.9580880132072813, + "learning_rate": 1.2761884824573634e-08, + "loss": 0.6101, + "step": 4985 + }, + { + "epoch": 2.952916790050341, + "grad_norm": 1.678533797027778, + "learning_rate": 1.2440910563086273e-08, + "loss": 0.6036, + "step": 4986 + }, + { + "epoch": 2.9535090316849275, + "grad_norm": 1.8624668233784294, + "learning_rate": 1.2124021774894134e-08, + "loss": 0.625, + "step": 4987 + }, + { + "epoch": 2.954101273319514, + "grad_norm": 1.7622666153251665, + "learning_rate": 1.181121858962353e-08, + "loss": 0.5638, + "step": 4988 + }, + { + "epoch": 2.9546935149541014, + "grad_norm": 1.4944115685837704, + "learning_rate": 1.1502501135225442e-08, + "loss": 0.5821, + "step": 4989 + }, + { + "epoch": 2.955285756588688, + "grad_norm": 2.772196851809732, + "learning_rate": 1.1197869537986627e-08, + "loss": 0.6126, + "step": 4990 + }, + { + "epoch": 2.9558779982232752, + "grad_norm": 2.6820552266235147, + "learning_rate": 1.089732392251519e-08, + "loss": 0.5898, + "step": 4991 + }, + { + "epoch": 2.956470239857862, + "grad_norm": 1.26193680911539, + "learning_rate": 1.0600864411753897e-08, + "loss": 0.6345, + "step": 4992 + }, + { + "epoch": 2.957062481492449, + "grad_norm": 3.7109469716570436, + "learning_rate": 1.0308491126969077e-08, + "loss": 0.5613, + "step": 4993 + }, + { + "epoch": 2.957654723127036, + "grad_norm": 1.426244864019141, + "learning_rate": 1.0020204187759507e-08, + "loss": 0.625, + "step": 4994 + }, + { + "epoch": 2.9582469647616225, + "grad_norm": 1.3313942225304733, + "learning_rate": 9.736003712050857e-09, + "loss": 0.5922, + "step": 4995 + }, + { + "epoch": 2.9588392063962097, + "grad_norm": 3.724539629662786, + "learning_rate": 9.455889816095687e-09, + "loss": 0.585, + "step": 4996 + }, + { + "epoch": 2.959431448030797, + "grad_norm": 1.8116214951176344, + "learning_rate": 9.179862614476787e-09, + "loss": 0.646, + "step": 4997 + }, + { + "epoch": 2.9600236896653835, + "grad_norm": 1.4870564082539035, + "learning_rate": 8.907922220104947e-09, + "loss": 0.6339, + "step": 4998 + }, + { + "epoch": 2.9606159312999702, + "grad_norm": 2.5117803113313917, + "learning_rate": 8.640068744220077e-09, + "loss": 0.5936, + "step": 4999 + }, + { + "epoch": 2.9612081729345574, + "grad_norm": 2.643791847377505, + "learning_rate": 8.376302296387862e-09, + "loss": 0.6284, + "step": 5000 + }, + { + "epoch": 2.961800414569144, + "grad_norm": 1.5248955556340484, + "learning_rate": 8.116622984504219e-09, + "loss": 0.608, + "step": 5001 + }, + { + "epoch": 2.9623926562037313, + "grad_norm": 1.4206625140578077, + "learning_rate": 7.861030914791956e-09, + "loss": 0.6016, + "step": 5002 + }, + { + "epoch": 2.962984897838318, + "grad_norm": 2.7467705894682704, + "learning_rate": 7.609526191804107e-09, + "loss": 0.6142, + "step": 5003 + }, + { + "epoch": 2.963577139472905, + "grad_norm": 2.227943416338275, + "learning_rate": 7.362108918418376e-09, + "loss": 0.6039, + "step": 5004 + }, + { + "epoch": 2.964169381107492, + "grad_norm": 2.2376624114324635, + "learning_rate": 7.118779195843806e-09, + "loss": 0.6396, + "step": 5005 + }, + { + "epoch": 2.9647616227420786, + "grad_norm": 2.2974484985519372, + "learning_rate": 6.8795371236163315e-09, + "loss": 0.6084, + "step": 5006 + }, + { + "epoch": 2.9653538643766657, + "grad_norm": 2.445011281093722, + "learning_rate": 6.64438279959767e-09, + "loss": 0.6647, + "step": 5007 + }, + { + "epoch": 2.965946106011253, + "grad_norm": 2.5322699217659825, + "learning_rate": 6.413316319979768e-09, + "loss": 0.6158, + "step": 5008 + }, + { + "epoch": 2.9665383476458396, + "grad_norm": 2.264489636975484, + "learning_rate": 6.186337779282569e-09, + "loss": 0.6319, + "step": 5009 + }, + { + "epoch": 2.9671305892804263, + "grad_norm": 1.276152934848022, + "learning_rate": 5.9634472703518075e-09, + "loss": 0.5986, + "step": 5010 + }, + { + "epoch": 2.9677228309150134, + "grad_norm": 1.7722200702077937, + "learning_rate": 5.744644884363437e-09, + "loss": 0.6465, + "step": 5011 + }, + { + "epoch": 2.9683150725496, + "grad_norm": 2.8327566548651344, + "learning_rate": 5.529930710820308e-09, + "loss": 0.5873, + "step": 5012 + }, + { + "epoch": 2.9689073141841873, + "grad_norm": 15.271569350154476, + "learning_rate": 5.319304837549943e-09, + "loss": 0.5923, + "step": 5013 + }, + { + "epoch": 2.969499555818774, + "grad_norm": 3.7239511417804705, + "learning_rate": 5.112767350713421e-09, + "loss": 0.6025, + "step": 5014 + }, + { + "epoch": 2.970091797453361, + "grad_norm": 1.4648198398137937, + "learning_rate": 4.9103183347942725e-09, + "loss": 0.6499, + "step": 5015 + }, + { + "epoch": 2.970684039087948, + "grad_norm": 1.6998540857276536, + "learning_rate": 4.711957872606254e-09, + "loss": 0.6304, + "step": 5016 + }, + { + "epoch": 2.9712762807225346, + "grad_norm": 1.6467406630255363, + "learning_rate": 4.517686045288905e-09, + "loss": 0.6548, + "step": 5017 + }, + { + "epoch": 2.9718685223571217, + "grad_norm": 1.6033639446909023, + "learning_rate": 4.327502932311989e-09, + "loss": 0.5407, + "step": 5018 + }, + { + "epoch": 2.972460763991709, + "grad_norm": 2.013297951719648, + "learning_rate": 4.141408611469944e-09, + "loss": 0.6036, + "step": 5019 + }, + { + "epoch": 2.9730530056262956, + "grad_norm": 1.5184280052213182, + "learning_rate": 3.959403158885211e-09, + "loss": 0.655, + "step": 5020 + }, + { + "epoch": 2.9736452472608823, + "grad_norm": 1.2805590152228392, + "learning_rate": 3.781486649010458e-09, + "loss": 0.6361, + "step": 5021 + }, + { + "epoch": 2.9742374888954695, + "grad_norm": 1.4488055335719843, + "learning_rate": 3.607659154621912e-09, + "loss": 0.5857, + "step": 5022 + }, + { + "epoch": 2.974829730530056, + "grad_norm": 1.5067233321604163, + "learning_rate": 3.437920746824919e-09, + "loss": 0.6314, + "step": 5023 + }, + { + "epoch": 2.9754219721646433, + "grad_norm": 2.505217366867062, + "learning_rate": 3.2722714950517154e-09, + "loss": 0.593, + "step": 5024 + }, + { + "epoch": 2.97601421379923, + "grad_norm": 2.030933501052515, + "learning_rate": 3.110711467063654e-09, + "loss": 0.6655, + "step": 5025 + }, + { + "epoch": 2.976606455433817, + "grad_norm": 1.2552202263134016, + "learning_rate": 2.95324072894565e-09, + "loss": 0.6237, + "step": 5026 + }, + { + "epoch": 2.977198697068404, + "grad_norm": 1.3223714382480334, + "learning_rate": 2.7998593451139534e-09, + "loss": 0.5913, + "step": 5027 + }, + { + "epoch": 2.9777909387029906, + "grad_norm": 1.3538171647456099, + "learning_rate": 2.6505673783094875e-09, + "loss": 0.6199, + "step": 5028 + }, + { + "epoch": 2.9783831803375778, + "grad_norm": 1.3318176094069416, + "learning_rate": 2.5053648896011804e-09, + "loss": 0.5841, + "step": 5029 + }, + { + "epoch": 2.978975421972165, + "grad_norm": 1.589489686996947, + "learning_rate": 2.364251938384854e-09, + "loss": 0.6094, + "step": 5030 + }, + { + "epoch": 2.9795676636067516, + "grad_norm": 1.7400278108274327, + "learning_rate": 2.227228582384333e-09, + "loss": 0.6143, + "step": 5031 + }, + { + "epoch": 2.9801599052413383, + "grad_norm": 2.585378676689252, + "learning_rate": 2.0942948776481175e-09, + "loss": 0.5828, + "step": 5032 + }, + { + "epoch": 2.9807521468759255, + "grad_norm": 1.7280844207518962, + "learning_rate": 1.965450878556041e-09, + "loss": 0.6314, + "step": 5033 + }, + { + "epoch": 2.981344388510512, + "grad_norm": 5.265908624204134, + "learning_rate": 1.8406966378103909e-09, + "loss": 0.5925, + "step": 5034 + }, + { + "epoch": 2.9819366301450994, + "grad_norm": 1.4033183600587051, + "learning_rate": 1.720032206443678e-09, + "loss": 0.6348, + "step": 5035 + }, + { + "epoch": 2.982528871779686, + "grad_norm": 1.8992093800736356, + "learning_rate": 1.6034576338141982e-09, + "loss": 0.655, + "step": 5036 + }, + { + "epoch": 2.983121113414273, + "grad_norm": 1.6151253256582707, + "learning_rate": 1.4909729676071405e-09, + "loss": 0.5662, + "step": 5037 + }, + { + "epoch": 2.98371335504886, + "grad_norm": 1.2292587721661237, + "learning_rate": 1.3825782538368083e-09, + "loss": 0.6521, + "step": 5038 + }, + { + "epoch": 2.9843055966834466, + "grad_norm": 1.4529622161539926, + "learning_rate": 1.278273536839958e-09, + "loss": 0.6323, + "step": 5039 + }, + { + "epoch": 2.984897838318034, + "grad_norm": 1.498906553325035, + "learning_rate": 1.178058859285791e-09, + "loss": 0.5986, + "step": 5040 + }, + { + "epoch": 2.985490079952621, + "grad_norm": 1.2660500957601097, + "learning_rate": 1.0819342621648522e-09, + "loss": 0.5796, + "step": 5041 + }, + { + "epoch": 2.9860823215872077, + "grad_norm": 1.336021937389783, + "learning_rate": 9.898997848001302e-10, + "loss": 0.5771, + "step": 5042 + }, + { + "epoch": 2.9866745632217944, + "grad_norm": 2.936187746887494, + "learning_rate": 9.019554648381778e-10, + "loss": 0.5746, + "step": 5043 + }, + { + "epoch": 2.9872668048563815, + "grad_norm": 1.974248988367071, + "learning_rate": 8.181013382524416e-10, + "loss": 0.596, + "step": 5044 + }, + { + "epoch": 2.9878590464909682, + "grad_norm": 2.076337021390945, + "learning_rate": 7.383374393454823e-10, + "loss": 0.6021, + "step": 5045 + }, + { + "epoch": 2.9884512881255554, + "grad_norm": 1.2554188823022927, + "learning_rate": 6.626638007434239e-10, + "loss": 0.5964, + "step": 5046 + }, + { + "epoch": 2.989043529760142, + "grad_norm": 1.4522435596098013, + "learning_rate": 5.910804534015046e-10, + "loss": 0.6149, + "step": 5047 + }, + { + "epoch": 2.9896357713947292, + "grad_norm": 1.785638501758462, + "learning_rate": 5.23587426601857e-10, + "loss": 0.6229, + "step": 5048 + }, + { + "epoch": 2.990228013029316, + "grad_norm": 1.5511181558630578, + "learning_rate": 4.601847479523969e-10, + "loss": 0.5953, + "step": 5049 + }, + { + "epoch": 2.9908202546639027, + "grad_norm": 1.5809877291449523, + "learning_rate": 4.008724433890443e-10, + "loss": 0.6185, + "step": 5050 + }, + { + "epoch": 2.99141249629849, + "grad_norm": 1.25867971827954, + "learning_rate": 3.4565053717350303e-10, + "loss": 0.6286, + "step": 5051 + }, + { + "epoch": 2.9920047379330765, + "grad_norm": 1.6791689479616938, + "learning_rate": 2.94519051895481e-10, + "loss": 0.6264, + "step": 5052 + }, + { + "epoch": 2.9925969795676637, + "grad_norm": 2.2845001847932447, + "learning_rate": 2.474780084682493e-10, + "loss": 0.6321, + "step": 5053 + }, + { + "epoch": 2.9931892212022504, + "grad_norm": 2.632181608604146, + "learning_rate": 2.0452742613641386e-10, + "loss": 0.6344, + "step": 5054 + }, + { + "epoch": 2.9937814628368375, + "grad_norm": 1.8651242937205437, + "learning_rate": 1.6566732246925398e-10, + "loss": 0.6184, + "step": 5055 + }, + { + "epoch": 2.9943737044714243, + "grad_norm": 1.375082869113834, + "learning_rate": 1.3089771336072256e-10, + "loss": 0.6076, + "step": 5056 + }, + { + "epoch": 2.9949659461060114, + "grad_norm": 1.3906730573347121, + "learning_rate": 1.0021861303610714e-10, + "loss": 0.5459, + "step": 5057 + }, + { + "epoch": 2.995558187740598, + "grad_norm": 1.4138902354298646, + "learning_rate": 7.363003404314839e-11, + "loss": 0.5922, + "step": 5058 + }, + { + "epoch": 2.9961504293751853, + "grad_norm": 4.735285490670945, + "learning_rate": 5.113198725870128e-11, + "loss": 0.643, + "step": 5059 + }, + { + "epoch": 2.996742671009772, + "grad_norm": 4.0056402967283296, + "learning_rate": 3.272448188429422e-11, + "loss": 0.5758, + "step": 5060 + }, + { + "epoch": 2.9973349126443587, + "grad_norm": 1.6787822740596676, + "learning_rate": 1.8407525452790454e-11, + "loss": 0.5937, + "step": 5061 + }, + { + "epoch": 2.997927154278946, + "grad_norm": 1.5279759168116307, + "learning_rate": 8.181123817285752e-12, + "loss": 0.6155, + "step": 5062 + }, + { + "epoch": 2.9985193959135326, + "grad_norm": 1.977332811161579, + "learning_rate": 2.0452811633209224e-12, + "loss": 0.6343, + "step": 5063 + }, + { + "epoch": 2.9991116375481197, + "grad_norm": 1.904074374443596, + "learning_rate": 0.0, + "loss": 0.595, + "step": 5064 + }, + { + "epoch": 2.9991116375481197, + "step": 5064, + "total_flos": 3794121211510784.0, + "train_loss": 0.7367976606951505, + "train_runtime": 36668.0035, + "train_samples_per_second": 17.677, + "train_steps_per_second": 0.138 + } + ], + "logging_steps": 1.0, + "max_steps": 5064, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 6000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3794121211510784.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}