diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26217 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100.0, + "global_step": 3741, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008019246190858059, + "grad_norm": 5.836986541748047, + "learning_rate": 1.7699115044247788e-07, + "loss": 1.6116, + "step": 1 + }, + { + "epoch": 0.0016038492381716118, + "grad_norm": 5.846986293792725, + "learning_rate": 3.5398230088495575e-07, + "loss": 1.5618, + "step": 2 + }, + { + "epoch": 0.0024057738572574178, + "grad_norm": 10.19965648651123, + "learning_rate": 5.309734513274336e-07, + "loss": 1.6029, + "step": 3 + }, + { + "epoch": 0.0032076984763432237, + "grad_norm": 4.988247394561768, + "learning_rate": 7.079646017699115e-07, + "loss": 1.4261, + "step": 4 + }, + { + "epoch": 0.00400962309542903, + "grad_norm": 2.4512386322021484, + "learning_rate": 8.849557522123895e-07, + "loss": 1.253, + "step": 5 + }, + { + "epoch": 0.0048115477145148355, + "grad_norm": 2.286949396133423, + "learning_rate": 1.0619469026548673e-06, + "loss": 1.3126, + "step": 6 + }, + { + "epoch": 0.0056134723336006415, + "grad_norm": 1.9182817935943604, + "learning_rate": 1.2389380530973452e-06, + "loss": 1.2349, + "step": 7 + }, + { + "epoch": 0.006415396952686447, + "grad_norm": 1.8475348949432373, + "learning_rate": 1.415929203539823e-06, + "loss": 1.2502, + "step": 8 + }, + { + "epoch": 0.007217321571772253, + "grad_norm": 1.5276049375534058, + "learning_rate": 1.592920353982301e-06, + "loss": 1.2345, + "step": 9 + }, + { + "epoch": 0.00801924619085806, + "grad_norm": 1.4844948053359985, + "learning_rate": 1.769911504424779e-06, + "loss": 1.1793, + "step": 10 + }, + { + "epoch": 0.008821170809943865, + "grad_norm": 1.4817630052566528, + "learning_rate": 1.9469026548672567e-06, + "loss": 1.1696, + "step": 11 + }, + { + "epoch": 0.009623095429029671, + "grad_norm": 1.3712340593338013, + "learning_rate": 2.1238938053097345e-06, + "loss": 1.1284, + "step": 12 + }, + { + "epoch": 0.010425020048115477, + "grad_norm": 1.4231791496276855, + "learning_rate": 2.3008849557522127e-06, + "loss": 1.1471, + "step": 13 + }, + { + "epoch": 0.011226944667201283, + "grad_norm": 1.3379884958267212, + "learning_rate": 2.4778761061946905e-06, + "loss": 1.161, + "step": 14 + }, + { + "epoch": 0.012028869286287089, + "grad_norm": 1.452820897102356, + "learning_rate": 2.6548672566371687e-06, + "loss": 1.2179, + "step": 15 + }, + { + "epoch": 0.012830793905372895, + "grad_norm": 1.2341582775115967, + "learning_rate": 2.831858407079646e-06, + "loss": 1.145, + "step": 16 + }, + { + "epoch": 0.0136327185244587, + "grad_norm": 1.3975578546524048, + "learning_rate": 3.0088495575221242e-06, + "loss": 1.1374, + "step": 17 + }, + { + "epoch": 0.014434643143544507, + "grad_norm": 1.4167269468307495, + "learning_rate": 3.185840707964602e-06, + "loss": 1.1401, + "step": 18 + }, + { + "epoch": 0.015236567762630313, + "grad_norm": 1.321500301361084, + "learning_rate": 3.36283185840708e-06, + "loss": 1.162, + "step": 19 + }, + { + "epoch": 0.01603849238171612, + "grad_norm": 1.303618311882019, + "learning_rate": 3.539823008849558e-06, + "loss": 1.1027, + "step": 20 + }, + { + "epoch": 0.016840417000801924, + "grad_norm": 1.3029676675796509, + "learning_rate": 3.7168141592920357e-06, + "loss": 1.1181, + "step": 21 + }, + { + "epoch": 0.01764234161988773, + "grad_norm": 1.4097836017608643, + "learning_rate": 3.8938053097345135e-06, + "loss": 1.0703, + "step": 22 + }, + { + "epoch": 0.018444266238973536, + "grad_norm": 1.1947312355041504, + "learning_rate": 4.070796460176992e-06, + "loss": 1.0716, + "step": 23 + }, + { + "epoch": 0.019246190858059342, + "grad_norm": 1.2278361320495605, + "learning_rate": 4.247787610619469e-06, + "loss": 1.0814, + "step": 24 + }, + { + "epoch": 0.020048115477145148, + "grad_norm": 1.2241326570510864, + "learning_rate": 4.424778761061948e-06, + "loss": 1.0598, + "step": 25 + }, + { + "epoch": 0.020850040096230954, + "grad_norm": 1.271500587463379, + "learning_rate": 4.6017699115044254e-06, + "loss": 1.0802, + "step": 26 + }, + { + "epoch": 0.02165196471531676, + "grad_norm": 1.3000115156173706, + "learning_rate": 4.778761061946903e-06, + "loss": 1.0799, + "step": 27 + }, + { + "epoch": 0.022453889334402566, + "grad_norm": 1.1881065368652344, + "learning_rate": 4.955752212389381e-06, + "loss": 1.1282, + "step": 28 + }, + { + "epoch": 0.023255813953488372, + "grad_norm": 1.1343086957931519, + "learning_rate": 5.132743362831859e-06, + "loss": 1.0606, + "step": 29 + }, + { + "epoch": 0.024057738572574178, + "grad_norm": 1.2190392017364502, + "learning_rate": 5.309734513274337e-06, + "loss": 1.0884, + "step": 30 + }, + { + "epoch": 0.024859663191659984, + "grad_norm": 1.234042763710022, + "learning_rate": 5.486725663716814e-06, + "loss": 1.0716, + "step": 31 + }, + { + "epoch": 0.02566158781074579, + "grad_norm": 1.2421082258224487, + "learning_rate": 5.663716814159292e-06, + "loss": 1.0962, + "step": 32 + }, + { + "epoch": 0.026463512429831595, + "grad_norm": 1.2859286069869995, + "learning_rate": 5.840707964601771e-06, + "loss": 1.0681, + "step": 33 + }, + { + "epoch": 0.0272654370489174, + "grad_norm": 1.1014221906661987, + "learning_rate": 6.0176991150442484e-06, + "loss": 1.026, + "step": 34 + }, + { + "epoch": 0.028067361668003207, + "grad_norm": 1.2788565158843994, + "learning_rate": 6.194690265486726e-06, + "loss": 1.0742, + "step": 35 + }, + { + "epoch": 0.028869286287089013, + "grad_norm": 1.2129113674163818, + "learning_rate": 6.371681415929204e-06, + "loss": 1.0913, + "step": 36 + }, + { + "epoch": 0.02967121090617482, + "grad_norm": 1.1455950736999512, + "learning_rate": 6.548672566371682e-06, + "loss": 1.0863, + "step": 37 + }, + { + "epoch": 0.030473135525260625, + "grad_norm": 1.3164713382720947, + "learning_rate": 6.72566371681416e-06, + "loss": 1.0514, + "step": 38 + }, + { + "epoch": 0.03127506014434643, + "grad_norm": 1.119469404220581, + "learning_rate": 6.902654867256637e-06, + "loss": 1.0738, + "step": 39 + }, + { + "epoch": 0.03207698476343224, + "grad_norm": 1.3728922605514526, + "learning_rate": 7.079646017699116e-06, + "loss": 1.0777, + "step": 40 + }, + { + "epoch": 0.03287890938251804, + "grad_norm": 1.2131261825561523, + "learning_rate": 7.256637168141594e-06, + "loss": 1.1212, + "step": 41 + }, + { + "epoch": 0.03368083400160385, + "grad_norm": 1.2123545408248901, + "learning_rate": 7.4336283185840714e-06, + "loss": 1.0375, + "step": 42 + }, + { + "epoch": 0.034482758620689655, + "grad_norm": 1.1865596771240234, + "learning_rate": 7.610619469026549e-06, + "loss": 1.0203, + "step": 43 + }, + { + "epoch": 0.03528468323977546, + "grad_norm": 1.120460033416748, + "learning_rate": 7.787610619469027e-06, + "loss": 1.0839, + "step": 44 + }, + { + "epoch": 0.03608660785886127, + "grad_norm": 1.2285219430923462, + "learning_rate": 7.964601769911505e-06, + "loss": 1.0964, + "step": 45 + }, + { + "epoch": 0.03688853247794707, + "grad_norm": 1.398607850074768, + "learning_rate": 8.141592920353984e-06, + "loss": 1.0015, + "step": 46 + }, + { + "epoch": 0.03769045709703288, + "grad_norm": 1.0944195985794067, + "learning_rate": 8.31858407079646e-06, + "loss": 1.0503, + "step": 47 + }, + { + "epoch": 0.038492381716118684, + "grad_norm": 1.28694486618042, + "learning_rate": 8.495575221238938e-06, + "loss": 1.0536, + "step": 48 + }, + { + "epoch": 0.03929430633520449, + "grad_norm": 1.1280242204666138, + "learning_rate": 8.672566371681418e-06, + "loss": 1.0503, + "step": 49 + }, + { + "epoch": 0.040096230954290296, + "grad_norm": 1.1579207181930542, + "learning_rate": 8.849557522123895e-06, + "loss": 1.0481, + "step": 50 + }, + { + "epoch": 0.0408981555733761, + "grad_norm": 1.3872171640396118, + "learning_rate": 9.026548672566371e-06, + "loss": 1.0827, + "step": 51 + }, + { + "epoch": 0.04170008019246191, + "grad_norm": 1.05665123462677, + "learning_rate": 9.203539823008851e-06, + "loss": 1.0334, + "step": 52 + }, + { + "epoch": 0.042502004811547714, + "grad_norm": 1.2471959590911865, + "learning_rate": 9.380530973451329e-06, + "loss": 1.0903, + "step": 53 + }, + { + "epoch": 0.04330392943063352, + "grad_norm": 1.1287788152694702, + "learning_rate": 9.557522123893806e-06, + "loss": 1.0782, + "step": 54 + }, + { + "epoch": 0.044105854049719326, + "grad_norm": 1.2654865980148315, + "learning_rate": 9.734513274336284e-06, + "loss": 1.0912, + "step": 55 + }, + { + "epoch": 0.04490777866880513, + "grad_norm": 1.0894954204559326, + "learning_rate": 9.911504424778762e-06, + "loss": 1.0549, + "step": 56 + }, + { + "epoch": 0.04570970328789094, + "grad_norm": 1.3154832124710083, + "learning_rate": 1.008849557522124e-05, + "loss": 1.0503, + "step": 57 + }, + { + "epoch": 0.046511627906976744, + "grad_norm": 1.2271111011505127, + "learning_rate": 1.0265486725663717e-05, + "loss": 1.0748, + "step": 58 + }, + { + "epoch": 0.04731355252606255, + "grad_norm": 1.06815767288208, + "learning_rate": 1.0442477876106197e-05, + "loss": 1.0622, + "step": 59 + }, + { + "epoch": 0.048115477145148355, + "grad_norm": 1.261401891708374, + "learning_rate": 1.0619469026548675e-05, + "loss": 1.0666, + "step": 60 + }, + { + "epoch": 0.04891740176423416, + "grad_norm": 1.1487725973129272, + "learning_rate": 1.079646017699115e-05, + "loss": 1.0687, + "step": 61 + }, + { + "epoch": 0.04971932638331997, + "grad_norm": 1.0832433700561523, + "learning_rate": 1.0973451327433629e-05, + "loss": 1.0784, + "step": 62 + }, + { + "epoch": 0.05052125100240577, + "grad_norm": 1.0512681007385254, + "learning_rate": 1.1150442477876106e-05, + "loss": 1.0862, + "step": 63 + }, + { + "epoch": 0.05132317562149158, + "grad_norm": 1.0937095880508423, + "learning_rate": 1.1327433628318584e-05, + "loss": 1.098, + "step": 64 + }, + { + "epoch": 0.052125100240577385, + "grad_norm": 1.1490569114685059, + "learning_rate": 1.1504424778761064e-05, + "loss": 1.068, + "step": 65 + }, + { + "epoch": 0.05292702485966319, + "grad_norm": 1.1199543476104736, + "learning_rate": 1.1681415929203541e-05, + "loss": 1.0263, + "step": 66 + }, + { + "epoch": 0.053728949478749, + "grad_norm": 1.1037191152572632, + "learning_rate": 1.1858407079646019e-05, + "loss": 1.031, + "step": 67 + }, + { + "epoch": 0.0545308740978348, + "grad_norm": 1.168930172920227, + "learning_rate": 1.2035398230088497e-05, + "loss": 1.0939, + "step": 68 + }, + { + "epoch": 0.05533279871692061, + "grad_norm": 1.1001696586608887, + "learning_rate": 1.2212389380530973e-05, + "loss": 1.0531, + "step": 69 + }, + { + "epoch": 0.056134723336006415, + "grad_norm": 1.248002052307129, + "learning_rate": 1.2389380530973452e-05, + "loss": 1.0443, + "step": 70 + }, + { + "epoch": 0.05693664795509222, + "grad_norm": 1.1349272727966309, + "learning_rate": 1.256637168141593e-05, + "loss": 1.0606, + "step": 71 + }, + { + "epoch": 0.057738572574178026, + "grad_norm": 1.0885883569717407, + "learning_rate": 1.2743362831858408e-05, + "loss": 1.104, + "step": 72 + }, + { + "epoch": 0.05854049719326383, + "grad_norm": 1.1218723058700562, + "learning_rate": 1.2920353982300886e-05, + "loss": 1.0647, + "step": 73 + }, + { + "epoch": 0.05934242181234964, + "grad_norm": 0.999465823173523, + "learning_rate": 1.3097345132743363e-05, + "loss": 1.0358, + "step": 74 + }, + { + "epoch": 0.060144346431435444, + "grad_norm": 1.1456373929977417, + "learning_rate": 1.3274336283185843e-05, + "loss": 1.0156, + "step": 75 + }, + { + "epoch": 0.06094627105052125, + "grad_norm": 1.0239201784133911, + "learning_rate": 1.345132743362832e-05, + "loss": 1.0935, + "step": 76 + }, + { + "epoch": 0.061748195669607056, + "grad_norm": 1.1573493480682373, + "learning_rate": 1.3628318584070797e-05, + "loss": 1.0202, + "step": 77 + }, + { + "epoch": 0.06255012028869286, + "grad_norm": 1.12896728515625, + "learning_rate": 1.3805309734513275e-05, + "loss": 1.0681, + "step": 78 + }, + { + "epoch": 0.06335204490777867, + "grad_norm": 1.1681842803955078, + "learning_rate": 1.3982300884955752e-05, + "loss": 1.1181, + "step": 79 + }, + { + "epoch": 0.06415396952686447, + "grad_norm": 1.1300599575042725, + "learning_rate": 1.4159292035398232e-05, + "loss": 1.0787, + "step": 80 + }, + { + "epoch": 0.06495589414595028, + "grad_norm": 0.9626098275184631, + "learning_rate": 1.433628318584071e-05, + "loss": 1.0674, + "step": 81 + }, + { + "epoch": 0.06575781876503609, + "grad_norm": 1.2202208042144775, + "learning_rate": 1.4513274336283187e-05, + "loss": 1.076, + "step": 82 + }, + { + "epoch": 0.06655974338412189, + "grad_norm": 1.2100845575332642, + "learning_rate": 1.4690265486725665e-05, + "loss": 1.0548, + "step": 83 + }, + { + "epoch": 0.0673616680032077, + "grad_norm": 1.066315770149231, + "learning_rate": 1.4867256637168143e-05, + "loss": 1.0481, + "step": 84 + }, + { + "epoch": 0.0681635926222935, + "grad_norm": 1.0019817352294922, + "learning_rate": 1.5044247787610619e-05, + "loss": 1.019, + "step": 85 + }, + { + "epoch": 0.06896551724137931, + "grad_norm": 1.112805724143982, + "learning_rate": 1.5221238938053098e-05, + "loss": 1.0574, + "step": 86 + }, + { + "epoch": 0.06976744186046512, + "grad_norm": 1.1630192995071411, + "learning_rate": 1.5398230088495576e-05, + "loss": 1.0499, + "step": 87 + }, + { + "epoch": 0.07056936647955092, + "grad_norm": 1.2215139865875244, + "learning_rate": 1.5575221238938054e-05, + "loss": 1.1022, + "step": 88 + }, + { + "epoch": 0.07137129109863673, + "grad_norm": 1.0800572633743286, + "learning_rate": 1.5752212389380532e-05, + "loss": 1.112, + "step": 89 + }, + { + "epoch": 0.07217321571772253, + "grad_norm": 1.1478803157806396, + "learning_rate": 1.592920353982301e-05, + "loss": 1.0589, + "step": 90 + }, + { + "epoch": 0.07297514033680834, + "grad_norm": 1.1359870433807373, + "learning_rate": 1.6106194690265487e-05, + "loss": 1.026, + "step": 91 + }, + { + "epoch": 0.07377706495589414, + "grad_norm": 1.0617109537124634, + "learning_rate": 1.628318584070797e-05, + "loss": 1.112, + "step": 92 + }, + { + "epoch": 0.07457898957497995, + "grad_norm": 1.0477237701416016, + "learning_rate": 1.6460176991150443e-05, + "loss": 1.1037, + "step": 93 + }, + { + "epoch": 0.07538091419406576, + "grad_norm": 1.1191864013671875, + "learning_rate": 1.663716814159292e-05, + "loss": 1.0756, + "step": 94 + }, + { + "epoch": 0.07618283881315156, + "grad_norm": 0.9961270093917847, + "learning_rate": 1.68141592920354e-05, + "loss": 1.0885, + "step": 95 + }, + { + "epoch": 0.07698476343223737, + "grad_norm": 1.0917061567306519, + "learning_rate": 1.6991150442477876e-05, + "loss": 1.0989, + "step": 96 + }, + { + "epoch": 0.07778668805132317, + "grad_norm": 0.9411718845367432, + "learning_rate": 1.7168141592920354e-05, + "loss": 1.0161, + "step": 97 + }, + { + "epoch": 0.07858861267040898, + "grad_norm": 1.052199363708496, + "learning_rate": 1.7345132743362835e-05, + "loss": 1.0793, + "step": 98 + }, + { + "epoch": 0.07939053728949479, + "grad_norm": 1.0183230638504028, + "learning_rate": 1.7522123893805313e-05, + "loss": 1.0708, + "step": 99 + }, + { + "epoch": 0.08019246190858059, + "grad_norm": 0.964535653591156, + "learning_rate": 1.769911504424779e-05, + "loss": 1.0328, + "step": 100 + }, + { + "epoch": 0.0809943865276664, + "grad_norm": 0.971592903137207, + "learning_rate": 1.7876106194690265e-05, + "loss": 1.0246, + "step": 101 + }, + { + "epoch": 0.0817963111467522, + "grad_norm": 0.9727990627288818, + "learning_rate": 1.8053097345132743e-05, + "loss": 1.0664, + "step": 102 + }, + { + "epoch": 0.08259823576583801, + "grad_norm": 1.0569050312042236, + "learning_rate": 1.823008849557522e-05, + "loss": 1.1197, + "step": 103 + }, + { + "epoch": 0.08340016038492382, + "grad_norm": 0.9434515237808228, + "learning_rate": 1.8407079646017702e-05, + "loss": 0.9806, + "step": 104 + }, + { + "epoch": 0.08420208500400962, + "grad_norm": 0.9337801337242126, + "learning_rate": 1.858407079646018e-05, + "loss": 1.0387, + "step": 105 + }, + { + "epoch": 0.08500400962309543, + "grad_norm": 1.0005512237548828, + "learning_rate": 1.8761061946902657e-05, + "loss": 1.0928, + "step": 106 + }, + { + "epoch": 0.08580593424218123, + "grad_norm": 0.9581108093261719, + "learning_rate": 1.8938053097345135e-05, + "loss": 1.081, + "step": 107 + }, + { + "epoch": 0.08660785886126704, + "grad_norm": 0.947929322719574, + "learning_rate": 1.9115044247787613e-05, + "loss": 1.078, + "step": 108 + }, + { + "epoch": 0.08740978348035285, + "grad_norm": 0.989020824432373, + "learning_rate": 1.929203539823009e-05, + "loss": 1.0446, + "step": 109 + }, + { + "epoch": 0.08821170809943865, + "grad_norm": 0.9517629742622375, + "learning_rate": 1.946902654867257e-05, + "loss": 1.0393, + "step": 110 + }, + { + "epoch": 0.08901363271852446, + "grad_norm": 0.9815518260002136, + "learning_rate": 1.9646017699115046e-05, + "loss": 1.1276, + "step": 111 + }, + { + "epoch": 0.08981555733761026, + "grad_norm": 0.9295392632484436, + "learning_rate": 1.9823008849557524e-05, + "loss": 1.0595, + "step": 112 + }, + { + "epoch": 0.09061748195669607, + "grad_norm": 0.9898239970207214, + "learning_rate": 2e-05, + "loss": 1.0555, + "step": 113 + }, + { + "epoch": 0.09141940657578188, + "grad_norm": 0.9700495004653931, + "learning_rate": 1.999999625082972e-05, + "loss": 1.0749, + "step": 114 + }, + { + "epoch": 0.09222133119486768, + "grad_norm": 1.0816751718521118, + "learning_rate": 1.9999985003321688e-05, + "loss": 1.0688, + "step": 115 + }, + { + "epoch": 0.09302325581395349, + "grad_norm": 1.0361632108688354, + "learning_rate": 1.999996625748434e-05, + "loss": 1.0596, + "step": 116 + }, + { + "epoch": 0.09382518043303929, + "grad_norm": 0.9109675884246826, + "learning_rate": 1.999994001333173e-05, + "loss": 1.0544, + "step": 117 + }, + { + "epoch": 0.0946271050521251, + "grad_norm": 0.9128645658493042, + "learning_rate": 1.9999906270883536e-05, + "loss": 1.0718, + "step": 118 + }, + { + "epoch": 0.0954290296712109, + "grad_norm": 0.8604353070259094, + "learning_rate": 1.999986503016506e-05, + "loss": 1.0443, + "step": 119 + }, + { + "epoch": 0.09623095429029671, + "grad_norm": 0.9511041641235352, + "learning_rate": 1.999981629120723e-05, + "loss": 1.0433, + "step": 120 + }, + { + "epoch": 0.09703287890938252, + "grad_norm": 0.9402782320976257, + "learning_rate": 1.999976005404659e-05, + "loss": 1.0483, + "step": 121 + }, + { + "epoch": 0.09783480352846832, + "grad_norm": 1.1139960289001465, + "learning_rate": 1.9999696318725305e-05, + "loss": 1.0563, + "step": 122 + }, + { + "epoch": 0.09863672814755413, + "grad_norm": 0.9203987717628479, + "learning_rate": 1.999962508529117e-05, + "loss": 1.0902, + "step": 123 + }, + { + "epoch": 0.09943865276663993, + "grad_norm": 0.9191240072250366, + "learning_rate": 1.999954635379759e-05, + "loss": 1.0501, + "step": 124 + }, + { + "epoch": 0.10024057738572574, + "grad_norm": 0.920760452747345, + "learning_rate": 1.9999460124303614e-05, + "loss": 1.075, + "step": 125 + }, + { + "epoch": 0.10104250200481155, + "grad_norm": 0.8883569836616516, + "learning_rate": 1.999936639687389e-05, + "loss": 1.0414, + "step": 126 + }, + { + "epoch": 0.10184442662389735, + "grad_norm": 0.9747478365898132, + "learning_rate": 1.9999265171578705e-05, + "loss": 1.0719, + "step": 127 + }, + { + "epoch": 0.10264635124298316, + "grad_norm": 0.9460424780845642, + "learning_rate": 1.999915644849395e-05, + "loss": 1.0065, + "step": 128 + }, + { + "epoch": 0.10344827586206896, + "grad_norm": 0.8998356461524963, + "learning_rate": 1.999904022770116e-05, + "loss": 1.0219, + "step": 129 + }, + { + "epoch": 0.10425020048115477, + "grad_norm": 0.8906877636909485, + "learning_rate": 1.9998916509287477e-05, + "loss": 1.0518, + "step": 130 + }, + { + "epoch": 0.10505212510024058, + "grad_norm": 1.0315589904785156, + "learning_rate": 1.999878529334567e-05, + "loss": 1.0851, + "step": 131 + }, + { + "epoch": 0.10585404971932638, + "grad_norm": 0.8850352764129639, + "learning_rate": 1.9998646579974133e-05, + "loss": 1.0648, + "step": 132 + }, + { + "epoch": 0.10665597433841219, + "grad_norm": 0.9172976613044739, + "learning_rate": 1.9998500369276873e-05, + "loss": 1.1327, + "step": 133 + }, + { + "epoch": 0.107457898957498, + "grad_norm": 0.8636643886566162, + "learning_rate": 1.999834666136352e-05, + "loss": 1.0097, + "step": 134 + }, + { + "epoch": 0.1082598235765838, + "grad_norm": 0.8971249461174011, + "learning_rate": 1.9998185456349338e-05, + "loss": 1.1087, + "step": 135 + }, + { + "epoch": 0.1090617481956696, + "grad_norm": 0.9222013354301453, + "learning_rate": 1.9998016754355198e-05, + "loss": 1.0297, + "step": 136 + }, + { + "epoch": 0.10986367281475541, + "grad_norm": 0.9246983528137207, + "learning_rate": 1.9997840555507605e-05, + "loss": 1.073, + "step": 137 + }, + { + "epoch": 0.11066559743384122, + "grad_norm": 0.7840830087661743, + "learning_rate": 1.9997656859938673e-05, + "loss": 1.0303, + "step": 138 + }, + { + "epoch": 0.11146752205292702, + "grad_norm": 0.919251024723053, + "learning_rate": 1.9997465667786143e-05, + "loss": 1.0788, + "step": 139 + }, + { + "epoch": 0.11226944667201283, + "grad_norm": 0.9178858995437622, + "learning_rate": 1.999726697919338e-05, + "loss": 1.092, + "step": 140 + }, + { + "epoch": 0.11307137129109864, + "grad_norm": 0.8240802884101868, + "learning_rate": 1.9997060794309367e-05, + "loss": 1.0582, + "step": 141 + }, + { + "epoch": 0.11387329591018444, + "grad_norm": 0.8621988892555237, + "learning_rate": 1.999684711328871e-05, + "loss": 1.0451, + "step": 142 + }, + { + "epoch": 0.11467522052927025, + "grad_norm": 0.8760347962379456, + "learning_rate": 1.999662593629163e-05, + "loss": 1.0468, + "step": 143 + }, + { + "epoch": 0.11547714514835605, + "grad_norm": 0.8880921006202698, + "learning_rate": 1.9996397263483973e-05, + "loss": 1.0447, + "step": 144 + }, + { + "epoch": 0.11627906976744186, + "grad_norm": 0.8683561682701111, + "learning_rate": 1.9996161095037215e-05, + "loss": 1.0719, + "step": 145 + }, + { + "epoch": 0.11708099438652766, + "grad_norm": 0.8298245668411255, + "learning_rate": 1.999591743112843e-05, + "loss": 0.9996, + "step": 146 + }, + { + "epoch": 0.11788291900561347, + "grad_norm": 0.8646122813224792, + "learning_rate": 1.9995666271940334e-05, + "loss": 0.9995, + "step": 147 + }, + { + "epoch": 0.11868484362469928, + "grad_norm": 0.8935620188713074, + "learning_rate": 1.9995407617661254e-05, + "loss": 1.05, + "step": 148 + }, + { + "epoch": 0.11948676824378508, + "grad_norm": 0.8892669081687927, + "learning_rate": 1.9995141468485138e-05, + "loss": 1.0375, + "step": 149 + }, + { + "epoch": 0.12028869286287089, + "grad_norm": 0.8888627886772156, + "learning_rate": 1.9994867824611552e-05, + "loss": 1.0586, + "step": 150 + }, + { + "epoch": 0.1210906174819567, + "grad_norm": 0.8920398354530334, + "learning_rate": 1.9994586686245682e-05, + "loss": 1.0305, + "step": 151 + }, + { + "epoch": 0.1218925421010425, + "grad_norm": 0.8657960295677185, + "learning_rate": 1.9994298053598335e-05, + "loss": 1.069, + "step": 152 + }, + { + "epoch": 0.1226944667201283, + "grad_norm": 0.8122588396072388, + "learning_rate": 1.9994001926885936e-05, + "loss": 1.0708, + "step": 153 + }, + { + "epoch": 0.12349639133921411, + "grad_norm": 0.8348774313926697, + "learning_rate": 1.9993698306330542e-05, + "loss": 1.0157, + "step": 154 + }, + { + "epoch": 0.12429831595829992, + "grad_norm": 0.9911849498748779, + "learning_rate": 1.9993387192159807e-05, + "loss": 1.0413, + "step": 155 + }, + { + "epoch": 0.12510024057738572, + "grad_norm": 0.8572826385498047, + "learning_rate": 1.9993068584607018e-05, + "loss": 1.0975, + "step": 156 + }, + { + "epoch": 0.12590216519647154, + "grad_norm": 0.8724331259727478, + "learning_rate": 1.999274248391108e-05, + "loss": 1.01, + "step": 157 + }, + { + "epoch": 0.12670408981555734, + "grad_norm": 0.9225433468818665, + "learning_rate": 1.999240889031651e-05, + "loss": 1.0625, + "step": 158 + }, + { + "epoch": 0.12750601443464316, + "grad_norm": 0.8641144633293152, + "learning_rate": 1.999206780407345e-05, + "loss": 1.0393, + "step": 159 + }, + { + "epoch": 0.12830793905372895, + "grad_norm": 0.8191494941711426, + "learning_rate": 1.999171922543766e-05, + "loss": 1.002, + "step": 160 + }, + { + "epoch": 0.12910986367281477, + "grad_norm": 0.769394040107727, + "learning_rate": 1.9991363154670512e-05, + "loss": 0.9976, + "step": 161 + }, + { + "epoch": 0.12991178829190056, + "grad_norm": 0.9122303128242493, + "learning_rate": 1.9990999592039007e-05, + "loss": 1.0378, + "step": 162 + }, + { + "epoch": 0.13071371291098638, + "grad_norm": 1.0107554197311401, + "learning_rate": 1.9990628537815748e-05, + "loss": 0.9659, + "step": 163 + }, + { + "epoch": 0.13151563753007217, + "grad_norm": 0.8204858303070068, + "learning_rate": 1.999024999227897e-05, + "loss": 1.0627, + "step": 164 + }, + { + "epoch": 0.132317562149158, + "grad_norm": 0.7805432081222534, + "learning_rate": 1.9989863955712518e-05, + "loss": 1.0367, + "step": 165 + }, + { + "epoch": 0.13311948676824378, + "grad_norm": 1.51397705078125, + "learning_rate": 1.9989470428405853e-05, + "loss": 1.0363, + "step": 166 + }, + { + "epoch": 0.1339214113873296, + "grad_norm": 0.8241011500358582, + "learning_rate": 1.9989069410654055e-05, + "loss": 1.0326, + "step": 167 + }, + { + "epoch": 0.1347233360064154, + "grad_norm": 0.7944259643554688, + "learning_rate": 1.998866090275783e-05, + "loss": 1.0861, + "step": 168 + }, + { + "epoch": 0.13552526062550121, + "grad_norm": 1.0405583381652832, + "learning_rate": 1.9988244905023476e-05, + "loss": 1.0366, + "step": 169 + }, + { + "epoch": 0.136327185244587, + "grad_norm": 0.7776370048522949, + "learning_rate": 1.9987821417762937e-05, + "loss": 1.0174, + "step": 170 + }, + { + "epoch": 0.13712910986367283, + "grad_norm": 0.8183755874633789, + "learning_rate": 1.9987390441293747e-05, + "loss": 1.0243, + "step": 171 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.776903510093689, + "learning_rate": 1.9986951975939073e-05, + "loss": 1.058, + "step": 172 + }, + { + "epoch": 0.13873295910184444, + "grad_norm": 0.7709075212478638, + "learning_rate": 1.998650602202769e-05, + "loss": 1.0626, + "step": 173 + }, + { + "epoch": 0.13953488372093023, + "grad_norm": 0.8181372284889221, + "learning_rate": 1.998605257989399e-05, + "loss": 0.9818, + "step": 174 + }, + { + "epoch": 0.14033680834001605, + "grad_norm": 0.8568731546401978, + "learning_rate": 1.9985591649877974e-05, + "loss": 1.0647, + "step": 175 + }, + { + "epoch": 0.14113873295910184, + "grad_norm": 0.7709632515907288, + "learning_rate": 1.998512323232527e-05, + "loss": 1.0027, + "step": 176 + }, + { + "epoch": 0.14194065757818766, + "grad_norm": 0.8511577248573303, + "learning_rate": 1.998464732758711e-05, + "loss": 1.0465, + "step": 177 + }, + { + "epoch": 0.14274258219727345, + "grad_norm": 0.8334391117095947, + "learning_rate": 1.9984163936020348e-05, + "loss": 0.9812, + "step": 178 + }, + { + "epoch": 0.14354450681635927, + "grad_norm": 0.7931066155433655, + "learning_rate": 1.9983673057987438e-05, + "loss": 1.0139, + "step": 179 + }, + { + "epoch": 0.14434643143544507, + "grad_norm": 0.8163535594940186, + "learning_rate": 1.9983174693856465e-05, + "loss": 1.0131, + "step": 180 + }, + { + "epoch": 0.14514835605453089, + "grad_norm": 0.7544008493423462, + "learning_rate": 1.998266884400112e-05, + "loss": 1.0507, + "step": 181 + }, + { + "epoch": 0.14595028067361668, + "grad_norm": 0.8450154066085815, + "learning_rate": 1.99821555088007e-05, + "loss": 0.9851, + "step": 182 + }, + { + "epoch": 0.1467522052927025, + "grad_norm": 0.9036790132522583, + "learning_rate": 1.9981634688640126e-05, + "loss": 1.0312, + "step": 183 + }, + { + "epoch": 0.1475541299117883, + "grad_norm": 0.903751015663147, + "learning_rate": 1.998110638390993e-05, + "loss": 1.0143, + "step": 184 + }, + { + "epoch": 0.1483560545308741, + "grad_norm": 0.8345926403999329, + "learning_rate": 1.9980570595006243e-05, + "loss": 1.0127, + "step": 185 + }, + { + "epoch": 0.1491579791499599, + "grad_norm": 0.9414835572242737, + "learning_rate": 1.9980027322330825e-05, + "loss": 1.0239, + "step": 186 + }, + { + "epoch": 0.14995990376904572, + "grad_norm": 0.801048219203949, + "learning_rate": 1.9979476566291038e-05, + "loss": 1.0562, + "step": 187 + }, + { + "epoch": 0.1507618283881315, + "grad_norm": 0.8513798117637634, + "learning_rate": 1.9978918327299855e-05, + "loss": 1.0221, + "step": 188 + }, + { + "epoch": 0.15156375300721733, + "grad_norm": 0.8290636539459229, + "learning_rate": 1.9978352605775874e-05, + "loss": 1.0666, + "step": 189 + }, + { + "epoch": 0.15236567762630313, + "grad_norm": 0.7645179629325867, + "learning_rate": 1.9977779402143277e-05, + "loss": 0.9785, + "step": 190 + }, + { + "epoch": 0.15316760224538895, + "grad_norm": 0.7943917512893677, + "learning_rate": 1.997719871683188e-05, + "loss": 1.0015, + "step": 191 + }, + { + "epoch": 0.15396952686447474, + "grad_norm": 0.8007308840751648, + "learning_rate": 1.9976610550277104e-05, + "loss": 0.9936, + "step": 192 + }, + { + "epoch": 0.15477145148356056, + "grad_norm": 0.8672998547554016, + "learning_rate": 1.997601490291997e-05, + "loss": 1.0385, + "step": 193 + }, + { + "epoch": 0.15557337610264635, + "grad_norm": 0.9103288650512695, + "learning_rate": 1.9975411775207113e-05, + "loss": 1.0314, + "step": 194 + }, + { + "epoch": 0.15637530072173217, + "grad_norm": 0.8059409856796265, + "learning_rate": 1.997480116759078e-05, + "loss": 0.9887, + "step": 195 + }, + { + "epoch": 0.15717722534081796, + "grad_norm": 0.8400903940200806, + "learning_rate": 1.9974183080528835e-05, + "loss": 1.0218, + "step": 196 + }, + { + "epoch": 0.15797914995990378, + "grad_norm": 0.7969584465026855, + "learning_rate": 1.9973557514484726e-05, + "loss": 0.9769, + "step": 197 + }, + { + "epoch": 0.15878107457898957, + "grad_norm": 0.8063225150108337, + "learning_rate": 1.997292446992754e-05, + "loss": 1.0702, + "step": 198 + }, + { + "epoch": 0.1595829991980754, + "grad_norm": 0.7932840585708618, + "learning_rate": 1.9972283947331937e-05, + "loss": 1.0466, + "step": 199 + }, + { + "epoch": 0.16038492381716118, + "grad_norm": 0.8667788505554199, + "learning_rate": 1.9971635947178214e-05, + "loss": 1.0006, + "step": 200 + }, + { + "epoch": 0.161186848436247, + "grad_norm": 0.8451849222183228, + "learning_rate": 1.9970980469952264e-05, + "loss": 0.9824, + "step": 201 + }, + { + "epoch": 0.1619887730553328, + "grad_norm": 0.7452207207679749, + "learning_rate": 1.9970317516145582e-05, + "loss": 1.0637, + "step": 202 + }, + { + "epoch": 0.16279069767441862, + "grad_norm": 0.7727819085121155, + "learning_rate": 1.9969647086255274e-05, + "loss": 0.9842, + "step": 203 + }, + { + "epoch": 0.1635926222935044, + "grad_norm": 0.8708699941635132, + "learning_rate": 1.9968969180784055e-05, + "loss": 0.9804, + "step": 204 + }, + { + "epoch": 0.16439454691259023, + "grad_norm": 0.7909506559371948, + "learning_rate": 1.996828380024024e-05, + "loss": 1.0178, + "step": 205 + }, + { + "epoch": 0.16519647153167602, + "grad_norm": 0.8187145590782166, + "learning_rate": 1.9967590945137744e-05, + "loss": 1.0755, + "step": 206 + }, + { + "epoch": 0.16599839615076184, + "grad_norm": 0.8718328475952148, + "learning_rate": 1.99668906159961e-05, + "loss": 1.07, + "step": 207 + }, + { + "epoch": 0.16680032076984763, + "grad_norm": 0.7883650660514832, + "learning_rate": 1.996618281334044e-05, + "loss": 1.0534, + "step": 208 + }, + { + "epoch": 0.16760224538893345, + "grad_norm": 0.7847347259521484, + "learning_rate": 1.9965467537701496e-05, + "loss": 1.0075, + "step": 209 + }, + { + "epoch": 0.16840417000801924, + "grad_norm": 0.7670832872390747, + "learning_rate": 1.9964744789615605e-05, + "loss": 1.0417, + "step": 210 + }, + { + "epoch": 0.16920609462710506, + "grad_norm": 0.7731572389602661, + "learning_rate": 1.996401456962471e-05, + "loss": 1.0227, + "step": 211 + }, + { + "epoch": 0.17000801924619086, + "grad_norm": 0.7575690150260925, + "learning_rate": 1.996327687827635e-05, + "loss": 0.9834, + "step": 212 + }, + { + "epoch": 0.17080994386527668, + "grad_norm": 0.8696389198303223, + "learning_rate": 1.996253171612368e-05, + "loss": 1.029, + "step": 213 + }, + { + "epoch": 0.17161186848436247, + "grad_norm": 0.8823282718658447, + "learning_rate": 1.9961779083725438e-05, + "loss": 1.0698, + "step": 214 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 0.7903878688812256, + "learning_rate": 1.9961018981645985e-05, + "loss": 1.0445, + "step": 215 + }, + { + "epoch": 0.17321571772253408, + "grad_norm": 0.9023079872131348, + "learning_rate": 1.996025141045526e-05, + "loss": 0.9562, + "step": 216 + }, + { + "epoch": 0.1740176423416199, + "grad_norm": 0.7859614491462708, + "learning_rate": 1.995947637072882e-05, + "loss": 0.9911, + "step": 217 + }, + { + "epoch": 0.1748195669607057, + "grad_norm": 0.7968483567237854, + "learning_rate": 1.9958693863047816e-05, + "loss": 0.9593, + "step": 218 + }, + { + "epoch": 0.1756214915797915, + "grad_norm": 0.744019091129303, + "learning_rate": 1.9957903887998993e-05, + "loss": 1.038, + "step": 219 + }, + { + "epoch": 0.1764234161988773, + "grad_norm": 0.7552821636199951, + "learning_rate": 1.9957106446174712e-05, + "loss": 1.005, + "step": 220 + }, + { + "epoch": 0.17722534081796312, + "grad_norm": 0.7813109755516052, + "learning_rate": 1.9956301538172913e-05, + "loss": 1.0407, + "step": 221 + }, + { + "epoch": 0.17802726543704891, + "grad_norm": 0.734769880771637, + "learning_rate": 1.995548916459715e-05, + "loss": 1.0074, + "step": 222 + }, + { + "epoch": 0.17882919005613473, + "grad_norm": 0.737060010433197, + "learning_rate": 1.995466932605656e-05, + "loss": 0.9774, + "step": 223 + }, + { + "epoch": 0.17963111467522053, + "grad_norm": 0.7727075219154358, + "learning_rate": 1.9953842023165894e-05, + "loss": 0.9878, + "step": 224 + }, + { + "epoch": 0.18043303929430635, + "grad_norm": 0.7828347086906433, + "learning_rate": 1.995300725654549e-05, + "loss": 0.9883, + "step": 225 + }, + { + "epoch": 0.18123496391339214, + "grad_norm": 0.7647070288658142, + "learning_rate": 1.995216502682128e-05, + "loss": 1.0475, + "step": 226 + }, + { + "epoch": 0.18203688853247796, + "grad_norm": 0.8048544526100159, + "learning_rate": 1.99513153346248e-05, + "loss": 1.0019, + "step": 227 + }, + { + "epoch": 0.18283881315156375, + "grad_norm": 0.7421616911888123, + "learning_rate": 1.995045818059318e-05, + "loss": 0.9968, + "step": 228 + }, + { + "epoch": 0.18364073777064957, + "grad_norm": 0.694659948348999, + "learning_rate": 1.994959356536914e-05, + "loss": 0.9692, + "step": 229 + }, + { + "epoch": 0.18444266238973536, + "grad_norm": 0.7353253364562988, + "learning_rate": 1.9948721489601e-05, + "loss": 1.0244, + "step": 230 + }, + { + "epoch": 0.18524458700882118, + "grad_norm": 0.8362675309181213, + "learning_rate": 1.994784195394267e-05, + "loss": 1.0315, + "step": 231 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 0.6850599646568298, + "learning_rate": 1.9946954959053656e-05, + "loss": 0.9722, + "step": 232 + }, + { + "epoch": 0.1868484362469928, + "grad_norm": 0.777386486530304, + "learning_rate": 1.9946060505599058e-05, + "loss": 1.0284, + "step": 233 + }, + { + "epoch": 0.18765036086607859, + "grad_norm": 0.7967085242271423, + "learning_rate": 1.994515859424957e-05, + "loss": 0.9779, + "step": 234 + }, + { + "epoch": 0.1884522854851644, + "grad_norm": 0.7552576661109924, + "learning_rate": 1.9944249225681468e-05, + "loss": 1.033, + "step": 235 + }, + { + "epoch": 0.1892542101042502, + "grad_norm": 0.8289223313331604, + "learning_rate": 1.994333240057664e-05, + "loss": 0.9745, + "step": 236 + }, + { + "epoch": 0.19005613472333602, + "grad_norm": 0.754082977771759, + "learning_rate": 1.994240811962254e-05, + "loss": 1.023, + "step": 237 + }, + { + "epoch": 0.1908580593424218, + "grad_norm": 0.7701601386070251, + "learning_rate": 1.9941476383512236e-05, + "loss": 1.0095, + "step": 238 + }, + { + "epoch": 0.19165998396150763, + "grad_norm": 0.7313913702964783, + "learning_rate": 1.9940537192944366e-05, + "loss": 1.0282, + "step": 239 + }, + { + "epoch": 0.19246190858059342, + "grad_norm": 0.7340826392173767, + "learning_rate": 1.9939590548623173e-05, + "loss": 0.9947, + "step": 240 + }, + { + "epoch": 0.19326383319967924, + "grad_norm": 0.7086189389228821, + "learning_rate": 1.993863645125848e-05, + "loss": 1.0004, + "step": 241 + }, + { + "epoch": 0.19406575781876503, + "grad_norm": 0.7488327622413635, + "learning_rate": 1.993767490156571e-05, + "loss": 0.9864, + "step": 242 + }, + { + "epoch": 0.19486768243785085, + "grad_norm": 0.7784487009048462, + "learning_rate": 1.9936705900265853e-05, + "loss": 1.0219, + "step": 243 + }, + { + "epoch": 0.19566960705693665, + "grad_norm": 0.784127414226532, + "learning_rate": 1.9935729448085507e-05, + "loss": 1.0036, + "step": 244 + }, + { + "epoch": 0.19647153167602247, + "grad_norm": 0.7619081139564514, + "learning_rate": 1.9934745545756847e-05, + "loss": 0.9849, + "step": 245 + }, + { + "epoch": 0.19727345629510826, + "grad_norm": 0.7807731628417969, + "learning_rate": 1.9933754194017636e-05, + "loss": 1.0147, + "step": 246 + }, + { + "epoch": 0.19807538091419408, + "grad_norm": 0.7649911642074585, + "learning_rate": 1.9932755393611223e-05, + "loss": 0.9643, + "step": 247 + }, + { + "epoch": 0.19887730553327987, + "grad_norm": 0.7342953085899353, + "learning_rate": 1.993174914528655e-05, + "loss": 0.9754, + "step": 248 + }, + { + "epoch": 0.1996792301523657, + "grad_norm": 0.7341395020484924, + "learning_rate": 1.9930735449798125e-05, + "loss": 0.9807, + "step": 249 + }, + { + "epoch": 0.20048115477145148, + "grad_norm": 0.8070521354675293, + "learning_rate": 1.9929714307906053e-05, + "loss": 0.9697, + "step": 250 + }, + { + "epoch": 0.2012830793905373, + "grad_norm": 0.7484433650970459, + "learning_rate": 1.992868572037603e-05, + "loss": 0.9969, + "step": 251 + }, + { + "epoch": 0.2020850040096231, + "grad_norm": 0.7424092292785645, + "learning_rate": 1.992764968797932e-05, + "loss": 1.0005, + "step": 252 + }, + { + "epoch": 0.2028869286287089, + "grad_norm": 0.7775983810424805, + "learning_rate": 1.9926606211492773e-05, + "loss": 1.0033, + "step": 253 + }, + { + "epoch": 0.2036888532477947, + "grad_norm": 0.7295064926147461, + "learning_rate": 1.9925555291698826e-05, + "loss": 1.0001, + "step": 254 + }, + { + "epoch": 0.20449077786688052, + "grad_norm": 0.7483360171318054, + "learning_rate": 1.9924496929385496e-05, + "loss": 0.998, + "step": 255 + }, + { + "epoch": 0.20529270248596632, + "grad_norm": 0.7402914762496948, + "learning_rate": 1.9923431125346376e-05, + "loss": 0.9109, + "step": 256 + }, + { + "epoch": 0.20609462710505214, + "grad_norm": 0.784766674041748, + "learning_rate": 1.9922357880380644e-05, + "loss": 1.0262, + "step": 257 + }, + { + "epoch": 0.20689655172413793, + "grad_norm": 0.8008718490600586, + "learning_rate": 1.9921277195293057e-05, + "loss": 1.0357, + "step": 258 + }, + { + "epoch": 0.20769847634322375, + "grad_norm": 0.7571882009506226, + "learning_rate": 1.9920189070893947e-05, + "loss": 1.0125, + "step": 259 + }, + { + "epoch": 0.20850040096230954, + "grad_norm": 0.7246387600898743, + "learning_rate": 1.9919093507999226e-05, + "loss": 1.0514, + "step": 260 + }, + { + "epoch": 0.20930232558139536, + "grad_norm": 0.7044005990028381, + "learning_rate": 1.9917990507430385e-05, + "loss": 0.9613, + "step": 261 + }, + { + "epoch": 0.21010425020048115, + "grad_norm": 0.7408387064933777, + "learning_rate": 1.9916880070014494e-05, + "loss": 0.997, + "step": 262 + }, + { + "epoch": 0.21090617481956697, + "grad_norm": 0.7599897384643555, + "learning_rate": 1.9915762196584193e-05, + "loss": 0.9701, + "step": 263 + }, + { + "epoch": 0.21170809943865276, + "grad_norm": 0.7892153859138489, + "learning_rate": 1.9914636887977706e-05, + "loss": 1.0608, + "step": 264 + }, + { + "epoch": 0.21251002405773858, + "grad_norm": 0.9341477155685425, + "learning_rate": 1.9913504145038823e-05, + "loss": 0.9908, + "step": 265 + }, + { + "epoch": 0.21331194867682438, + "grad_norm": 0.8016869425773621, + "learning_rate": 1.991236396861692e-05, + "loss": 1.0324, + "step": 266 + }, + { + "epoch": 0.2141138732959102, + "grad_norm": 0.7510752081871033, + "learning_rate": 1.991121635956693e-05, + "loss": 0.981, + "step": 267 + }, + { + "epoch": 0.214915797914996, + "grad_norm": 0.791854739189148, + "learning_rate": 1.9910061318749375e-05, + "loss": 0.9654, + "step": 268 + }, + { + "epoch": 0.2157177225340818, + "grad_norm": 0.7618932127952576, + "learning_rate": 1.9908898847030348e-05, + "loss": 1.0033, + "step": 269 + }, + { + "epoch": 0.2165196471531676, + "grad_norm": 0.7162818312644958, + "learning_rate": 1.9907728945281504e-05, + "loss": 0.9684, + "step": 270 + }, + { + "epoch": 0.21732157177225342, + "grad_norm": 0.8062979578971863, + "learning_rate": 1.9906551614380077e-05, + "loss": 0.964, + "step": 271 + }, + { + "epoch": 0.2181234963913392, + "grad_norm": 0.7345139384269714, + "learning_rate": 1.990536685520887e-05, + "loss": 0.9878, + "step": 272 + }, + { + "epoch": 0.21892542101042503, + "grad_norm": 0.8036599159240723, + "learning_rate": 1.9904174668656252e-05, + "loss": 0.9344, + "step": 273 + }, + { + "epoch": 0.21972734562951082, + "grad_norm": 0.7124471664428711, + "learning_rate": 1.990297505561617e-05, + "loss": 0.9884, + "step": 274 + }, + { + "epoch": 0.22052927024859664, + "grad_norm": 0.7641183733940125, + "learning_rate": 1.9901768016988136e-05, + "loss": 1.0405, + "step": 275 + }, + { + "epoch": 0.22133119486768243, + "grad_norm": 0.7073462605476379, + "learning_rate": 1.9900553553677227e-05, + "loss": 0.9733, + "step": 276 + }, + { + "epoch": 0.22213311948676825, + "grad_norm": 0.7479486465454102, + "learning_rate": 1.9899331666594085e-05, + "loss": 1.0005, + "step": 277 + }, + { + "epoch": 0.22293504410585405, + "grad_norm": 0.7818754315376282, + "learning_rate": 1.9898102356654926e-05, + "loss": 1.0504, + "step": 278 + }, + { + "epoch": 0.22373696872493987, + "grad_norm": 0.7143021821975708, + "learning_rate": 1.989686562478153e-05, + "loss": 1.0098, + "step": 279 + }, + { + "epoch": 0.22453889334402566, + "grad_norm": 0.8301097750663757, + "learning_rate": 1.9895621471901236e-05, + "loss": 1.0214, + "step": 280 + }, + { + "epoch": 0.22534081796311148, + "grad_norm": 0.7973410487174988, + "learning_rate": 1.9894369898946955e-05, + "loss": 0.9639, + "step": 281 + }, + { + "epoch": 0.22614274258219727, + "grad_norm": 0.7219634652137756, + "learning_rate": 1.9893110906857158e-05, + "loss": 0.9978, + "step": 282 + }, + { + "epoch": 0.2269446672012831, + "grad_norm": 0.7005908489227295, + "learning_rate": 1.9891844496575883e-05, + "loss": 0.9704, + "step": 283 + }, + { + "epoch": 0.22774659182036888, + "grad_norm": 0.8081759214401245, + "learning_rate": 1.9890570669052724e-05, + "loss": 0.9644, + "step": 284 + }, + { + "epoch": 0.2285485164394547, + "grad_norm": 0.7537055015563965, + "learning_rate": 1.9889289425242845e-05, + "loss": 1.0097, + "step": 285 + }, + { + "epoch": 0.2293504410585405, + "grad_norm": 0.7490015029907227, + "learning_rate": 1.9888000766106962e-05, + "loss": 1.0267, + "step": 286 + }, + { + "epoch": 0.2301523656776263, + "grad_norm": 0.7152953743934631, + "learning_rate": 1.9886704692611355e-05, + "loss": 1.0233, + "step": 287 + }, + { + "epoch": 0.2309542902967121, + "grad_norm": 0.6974421143531799, + "learning_rate": 1.9885401205727864e-05, + "loss": 0.9456, + "step": 288 + }, + { + "epoch": 0.23175621491579793, + "grad_norm": 0.7040266990661621, + "learning_rate": 1.9884090306433892e-05, + "loss": 0.9826, + "step": 289 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 0.8025934100151062, + "learning_rate": 1.9882771995712393e-05, + "loss": 0.9948, + "step": 290 + }, + { + "epoch": 0.23336006415396954, + "grad_norm": 0.736258864402771, + "learning_rate": 1.988144627455188e-05, + "loss": 1.0012, + "step": 291 + }, + { + "epoch": 0.23416198877305533, + "grad_norm": 0.7188357710838318, + "learning_rate": 1.9880113143946428e-05, + "loss": 1.0366, + "step": 292 + }, + { + "epoch": 0.23496391339214115, + "grad_norm": 0.7842757701873779, + "learning_rate": 1.9878772604895657e-05, + "loss": 1.02, + "step": 293 + }, + { + "epoch": 0.23576583801122694, + "grad_norm": 0.694830060005188, + "learning_rate": 1.9877424658404757e-05, + "loss": 0.9438, + "step": 294 + }, + { + "epoch": 0.23656776263031276, + "grad_norm": 0.8712981343269348, + "learning_rate": 1.987606930548446e-05, + "loss": 1.0039, + "step": 295 + }, + { + "epoch": 0.23736968724939855, + "grad_norm": 0.7846871614456177, + "learning_rate": 1.9874706547151054e-05, + "loss": 0.9635, + "step": 296 + }, + { + "epoch": 0.23817161186848437, + "grad_norm": 0.7604832053184509, + "learning_rate": 1.9873336384426388e-05, + "loss": 0.9759, + "step": 297 + }, + { + "epoch": 0.23897353648757017, + "grad_norm": 0.7759199142456055, + "learning_rate": 1.987195881833785e-05, + "loss": 1.0331, + "step": 298 + }, + { + "epoch": 0.23977546110665598, + "grad_norm": 0.7688100934028625, + "learning_rate": 1.9870573849918387e-05, + "loss": 1.0101, + "step": 299 + }, + { + "epoch": 0.24057738572574178, + "grad_norm": 0.7552735805511475, + "learning_rate": 1.98691814802065e-05, + "loss": 1.0091, + "step": 300 + }, + { + "epoch": 0.2413793103448276, + "grad_norm": 0.7121011018753052, + "learning_rate": 1.9867781710246228e-05, + "loss": 0.9895, + "step": 301 + }, + { + "epoch": 0.2421812349639134, + "grad_norm": 0.721831202507019, + "learning_rate": 1.986637454108717e-05, + "loss": 0.9863, + "step": 302 + }, + { + "epoch": 0.2429831595829992, + "grad_norm": 0.7194176316261292, + "learning_rate": 1.9864959973784474e-05, + "loss": 0.9869, + "step": 303 + }, + { + "epoch": 0.243785084202085, + "grad_norm": 0.7214140892028809, + "learning_rate": 1.9863538009398824e-05, + "loss": 1.0065, + "step": 304 + }, + { + "epoch": 0.24458700882117082, + "grad_norm": 0.7053404450416565, + "learning_rate": 1.9862108648996457e-05, + "loss": 0.9836, + "step": 305 + }, + { + "epoch": 0.2453889334402566, + "grad_norm": 0.7068779468536377, + "learning_rate": 1.986067189364916e-05, + "loss": 0.9545, + "step": 306 + }, + { + "epoch": 0.24619085805934243, + "grad_norm": 0.7764230370521545, + "learning_rate": 1.9859227744434264e-05, + "loss": 0.9764, + "step": 307 + }, + { + "epoch": 0.24699278267842822, + "grad_norm": 0.6795384287834167, + "learning_rate": 1.9857776202434633e-05, + "loss": 1.026, + "step": 308 + }, + { + "epoch": 0.24779470729751404, + "grad_norm": 0.7431665062904358, + "learning_rate": 1.985631726873869e-05, + "loss": 1.008, + "step": 309 + }, + { + "epoch": 0.24859663191659984, + "grad_norm": 0.7341127991676331, + "learning_rate": 1.9854850944440386e-05, + "loss": 0.9737, + "step": 310 + }, + { + "epoch": 0.24939855653568566, + "grad_norm": 0.6869488954544067, + "learning_rate": 1.9853377230639227e-05, + "loss": 0.9644, + "step": 311 + }, + { + "epoch": 0.25020048115477145, + "grad_norm": 0.6559352874755859, + "learning_rate": 1.9851896128440252e-05, + "loss": 0.9908, + "step": 312 + }, + { + "epoch": 0.25100240577385724, + "grad_norm": 0.7657687067985535, + "learning_rate": 1.985040763895404e-05, + "loss": 1.008, + "step": 313 + }, + { + "epoch": 0.2518043303929431, + "grad_norm": 0.7205827832221985, + "learning_rate": 1.9848911763296712e-05, + "loss": 1.0024, + "step": 314 + }, + { + "epoch": 0.2526062550120289, + "grad_norm": 0.6896849274635315, + "learning_rate": 1.9847408502589928e-05, + "loss": 1.0026, + "step": 315 + }, + { + "epoch": 0.25340817963111467, + "grad_norm": 0.706331193447113, + "learning_rate": 1.9845897857960886e-05, + "loss": 1.0175, + "step": 316 + }, + { + "epoch": 0.25421010425020046, + "grad_norm": 0.7068176865577698, + "learning_rate": 1.9844379830542312e-05, + "loss": 0.9796, + "step": 317 + }, + { + "epoch": 0.2550120288692863, + "grad_norm": 0.705200731754303, + "learning_rate": 1.9842854421472478e-05, + "loss": 1.0286, + "step": 318 + }, + { + "epoch": 0.2558139534883721, + "grad_norm": 0.7059184312820435, + "learning_rate": 1.984132163189519e-05, + "loss": 1.0374, + "step": 319 + }, + { + "epoch": 0.2566158781074579, + "grad_norm": 0.7251558303833008, + "learning_rate": 1.9839781462959787e-05, + "loss": 0.9749, + "step": 320 + }, + { + "epoch": 0.2574178027265437, + "grad_norm": 0.7353153824806213, + "learning_rate": 1.9838233915821133e-05, + "loss": 0.9462, + "step": 321 + }, + { + "epoch": 0.25821972734562953, + "grad_norm": 0.714249849319458, + "learning_rate": 1.9836678991639638e-05, + "loss": 0.9907, + "step": 322 + }, + { + "epoch": 0.2590216519647153, + "grad_norm": 0.7212216258049011, + "learning_rate": 1.9835116691581232e-05, + "loss": 1.0044, + "step": 323 + }, + { + "epoch": 0.2598235765838011, + "grad_norm": 0.6976492404937744, + "learning_rate": 1.9833547016817386e-05, + "loss": 0.9828, + "step": 324 + }, + { + "epoch": 0.2606255012028869, + "grad_norm": 0.7598298788070679, + "learning_rate": 1.9831969968525096e-05, + "loss": 0.9409, + "step": 325 + }, + { + "epoch": 0.26142742582197276, + "grad_norm": 0.6750239729881287, + "learning_rate": 1.983038554788688e-05, + "loss": 0.9493, + "step": 326 + }, + { + "epoch": 0.26222935044105855, + "grad_norm": 0.7285559773445129, + "learning_rate": 1.9828793756090794e-05, + "loss": 1.0369, + "step": 327 + }, + { + "epoch": 0.26303127506014434, + "grad_norm": 0.7011874914169312, + "learning_rate": 1.9827194594330418e-05, + "loss": 0.9345, + "step": 328 + }, + { + "epoch": 0.26383319967923013, + "grad_norm": 0.6847050786018372, + "learning_rate": 1.982558806380486e-05, + "loss": 1.0196, + "step": 329 + }, + { + "epoch": 0.264635124298316, + "grad_norm": 0.728759765625, + "learning_rate": 1.9823974165718748e-05, + "loss": 0.9736, + "step": 330 + }, + { + "epoch": 0.2654370489174018, + "grad_norm": 0.6663811802864075, + "learning_rate": 1.982235290128224e-05, + "loss": 0.9749, + "step": 331 + }, + { + "epoch": 0.26623897353648757, + "grad_norm": 0.6871486902236938, + "learning_rate": 1.9820724271711012e-05, + "loss": 0.9692, + "step": 332 + }, + { + "epoch": 0.26704089815557336, + "grad_norm": 0.6922380328178406, + "learning_rate": 1.9819088278226273e-05, + "loss": 0.9636, + "step": 333 + }, + { + "epoch": 0.2678428227746592, + "grad_norm": 0.7264043092727661, + "learning_rate": 1.9817444922054738e-05, + "loss": 0.9986, + "step": 334 + }, + { + "epoch": 0.268644747393745, + "grad_norm": 0.701865553855896, + "learning_rate": 1.9815794204428655e-05, + "loss": 1.0301, + "step": 335 + }, + { + "epoch": 0.2694466720128308, + "grad_norm": 0.6781471371650696, + "learning_rate": 1.981413612658579e-05, + "loss": 0.9804, + "step": 336 + }, + { + "epoch": 0.2702485966319166, + "grad_norm": 0.6568560004234314, + "learning_rate": 1.9812470689769424e-05, + "loss": 1.0257, + "step": 337 + }, + { + "epoch": 0.27105052125100243, + "grad_norm": 0.6593422889709473, + "learning_rate": 1.9810797895228358e-05, + "loss": 0.9611, + "step": 338 + }, + { + "epoch": 0.2718524458700882, + "grad_norm": 0.6801155805587769, + "learning_rate": 1.9809117744216916e-05, + "loss": 0.975, + "step": 339 + }, + { + "epoch": 0.272654370489174, + "grad_norm": 0.7064613699913025, + "learning_rate": 1.9807430237994925e-05, + "loss": 1.013, + "step": 340 + }, + { + "epoch": 0.2734562951082598, + "grad_norm": 0.7264713048934937, + "learning_rate": 1.9805735377827738e-05, + "loss": 0.945, + "step": 341 + }, + { + "epoch": 0.27425821972734565, + "grad_norm": 0.6898838877677917, + "learning_rate": 1.9804033164986215e-05, + "loss": 1.0042, + "step": 342 + }, + { + "epoch": 0.27506014434643145, + "grad_norm": 0.69865483045578, + "learning_rate": 1.980232360074674e-05, + "loss": 1.0004, + "step": 343 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.6840596199035645, + "learning_rate": 1.98006066863912e-05, + "loss": 0.9489, + "step": 344 + }, + { + "epoch": 0.27666399358460303, + "grad_norm": 0.7413778305053711, + "learning_rate": 1.979888242320699e-05, + "loss": 0.9722, + "step": 345 + }, + { + "epoch": 0.2774659182036889, + "grad_norm": 0.692329466342926, + "learning_rate": 1.9797150812487028e-05, + "loss": 0.997, + "step": 346 + }, + { + "epoch": 0.27826784282277467, + "grad_norm": 0.6711648106575012, + "learning_rate": 1.9795411855529735e-05, + "loss": 1.0132, + "step": 347 + }, + { + "epoch": 0.27906976744186046, + "grad_norm": 0.7313749194145203, + "learning_rate": 1.9793665553639038e-05, + "loss": 0.9723, + "step": 348 + }, + { + "epoch": 0.27987169206094625, + "grad_norm": 0.7389296293258667, + "learning_rate": 1.979191190812437e-05, + "loss": 1.0407, + "step": 349 + }, + { + "epoch": 0.2806736166800321, + "grad_norm": 0.7041956186294556, + "learning_rate": 1.9790150920300683e-05, + "loss": 1.0093, + "step": 350 + }, + { + "epoch": 0.2814755412991179, + "grad_norm": 0.7673702836036682, + "learning_rate": 1.9788382591488412e-05, + "loss": 0.9872, + "step": 351 + }, + { + "epoch": 0.2822774659182037, + "grad_norm": 0.7097181081771851, + "learning_rate": 1.9786606923013525e-05, + "loss": 0.9479, + "step": 352 + }, + { + "epoch": 0.2830793905372895, + "grad_norm": 0.6917240619659424, + "learning_rate": 1.9784823916207472e-05, + "loss": 1.0284, + "step": 353 + }, + { + "epoch": 0.2838813151563753, + "grad_norm": 0.7378222942352295, + "learning_rate": 1.978303357240721e-05, + "loss": 0.9602, + "step": 354 + }, + { + "epoch": 0.2846832397754611, + "grad_norm": 0.6886555552482605, + "learning_rate": 1.9781235892955206e-05, + "loss": 0.9283, + "step": 355 + }, + { + "epoch": 0.2854851643945469, + "grad_norm": 0.7055453658103943, + "learning_rate": 1.9779430879199414e-05, + "loss": 1.0428, + "step": 356 + }, + { + "epoch": 0.2862870890136327, + "grad_norm": 0.6854853630065918, + "learning_rate": 1.9777618532493298e-05, + "loss": 0.964, + "step": 357 + }, + { + "epoch": 0.28708901363271855, + "grad_norm": 0.6832833886146545, + "learning_rate": 1.977579885419582e-05, + "loss": 0.9413, + "step": 358 + }, + { + "epoch": 0.28789093825180434, + "grad_norm": 0.7093089818954468, + "learning_rate": 1.9773971845671435e-05, + "loss": 0.9909, + "step": 359 + }, + { + "epoch": 0.28869286287089013, + "grad_norm": 0.7339062690734863, + "learning_rate": 1.977213750829009e-05, + "loss": 0.9815, + "step": 360 + }, + { + "epoch": 0.2894947874899759, + "grad_norm": 0.6688426733016968, + "learning_rate": 1.9770295843427242e-05, + "loss": 0.9744, + "step": 361 + }, + { + "epoch": 0.29029671210906177, + "grad_norm": 0.675520122051239, + "learning_rate": 1.9768446852463832e-05, + "loss": 1.0187, + "step": 362 + }, + { + "epoch": 0.29109863672814756, + "grad_norm": 0.6580377817153931, + "learning_rate": 1.9766590536786294e-05, + "loss": 0.9624, + "step": 363 + }, + { + "epoch": 0.29190056134723336, + "grad_norm": 0.7211337089538574, + "learning_rate": 1.976472689778656e-05, + "loss": 0.9689, + "step": 364 + }, + { + "epoch": 0.29270248596631915, + "grad_norm": 0.6154484748840332, + "learning_rate": 1.976285593686205e-05, + "loss": 0.9385, + "step": 365 + }, + { + "epoch": 0.293504410585405, + "grad_norm": 0.6870887279510498, + "learning_rate": 1.976097765541567e-05, + "loss": 0.9684, + "step": 366 + }, + { + "epoch": 0.2943063352044908, + "grad_norm": 0.6625701785087585, + "learning_rate": 1.9759092054855822e-05, + "loss": 0.9666, + "step": 367 + }, + { + "epoch": 0.2951082598235766, + "grad_norm": 0.7698168158531189, + "learning_rate": 1.975719913659639e-05, + "loss": 0.9957, + "step": 368 + }, + { + "epoch": 0.29591018444266237, + "grad_norm": 0.6859940886497498, + "learning_rate": 1.9755298902056758e-05, + "loss": 0.967, + "step": 369 + }, + { + "epoch": 0.2967121090617482, + "grad_norm": 0.6606349349021912, + "learning_rate": 1.975339135266178e-05, + "loss": 0.9968, + "step": 370 + }, + { + "epoch": 0.297514033680834, + "grad_norm": 0.7765418291091919, + "learning_rate": 1.9751476489841796e-05, + "loss": 0.9744, + "step": 371 + }, + { + "epoch": 0.2983159582999198, + "grad_norm": 0.7239444851875305, + "learning_rate": 1.974955431503265e-05, + "loss": 1.0206, + "step": 372 + }, + { + "epoch": 0.2991178829190056, + "grad_norm": 0.7017782330513, + "learning_rate": 1.974762482967564e-05, + "loss": 0.9825, + "step": 373 + }, + { + "epoch": 0.29991980753809144, + "grad_norm": 0.730739951133728, + "learning_rate": 1.9745688035217563e-05, + "loss": 1.012, + "step": 374 + }, + { + "epoch": 0.30072173215717724, + "grad_norm": 0.7359748482704163, + "learning_rate": 1.97437439331107e-05, + "loss": 0.9267, + "step": 375 + }, + { + "epoch": 0.301523656776263, + "grad_norm": 0.7002710103988647, + "learning_rate": 1.97417925248128e-05, + "loss": 0.967, + "step": 376 + }, + { + "epoch": 0.3023255813953488, + "grad_norm": 0.7859190702438354, + "learning_rate": 1.9739833811787097e-05, + "loss": 0.9648, + "step": 377 + }, + { + "epoch": 0.30312750601443467, + "grad_norm": 0.7745339870452881, + "learning_rate": 1.9737867795502298e-05, + "loss": 0.9899, + "step": 378 + }, + { + "epoch": 0.30392943063352046, + "grad_norm": 0.6724187731742859, + "learning_rate": 1.973589447743259e-05, + "loss": 0.9445, + "step": 379 + }, + { + "epoch": 0.30473135525260625, + "grad_norm": 0.7949398159980774, + "learning_rate": 1.9733913859057637e-05, + "loss": 0.9603, + "step": 380 + }, + { + "epoch": 0.30553327987169204, + "grad_norm": 0.7890524864196777, + "learning_rate": 1.9731925941862573e-05, + "loss": 0.9573, + "step": 381 + }, + { + "epoch": 0.3063352044907779, + "grad_norm": 0.6943206787109375, + "learning_rate": 1.9729930727338004e-05, + "loss": 1.0042, + "step": 382 + }, + { + "epoch": 0.3071371291098637, + "grad_norm": 0.6898202300071716, + "learning_rate": 1.972792821698001e-05, + "loss": 0.9527, + "step": 383 + }, + { + "epoch": 0.3079390537289495, + "grad_norm": 0.7329158782958984, + "learning_rate": 1.9725918412290142e-05, + "loss": 0.9755, + "step": 384 + }, + { + "epoch": 0.30874097834803527, + "grad_norm": 0.7760060429573059, + "learning_rate": 1.9723901314775423e-05, + "loss": 1.0134, + "step": 385 + }, + { + "epoch": 0.3095429029671211, + "grad_norm": 0.6890391111373901, + "learning_rate": 1.9721876925948336e-05, + "loss": 0.973, + "step": 386 + }, + { + "epoch": 0.3103448275862069, + "grad_norm": 0.6739248037338257, + "learning_rate": 1.971984524732684e-05, + "loss": 1.0187, + "step": 387 + }, + { + "epoch": 0.3111467522052927, + "grad_norm": 0.729393482208252, + "learning_rate": 1.971780628043436e-05, + "loss": 0.9873, + "step": 388 + }, + { + "epoch": 0.3119486768243785, + "grad_norm": 0.7092537879943848, + "learning_rate": 1.9715760026799776e-05, + "loss": 0.9992, + "step": 389 + }, + { + "epoch": 0.31275060144346434, + "grad_norm": 0.657490074634552, + "learning_rate": 1.971370648795744e-05, + "loss": 0.9793, + "step": 390 + }, + { + "epoch": 0.31355252606255013, + "grad_norm": 0.7291781902313232, + "learning_rate": 1.971164566544717e-05, + "loss": 0.9412, + "step": 391 + }, + { + "epoch": 0.3143544506816359, + "grad_norm": 0.8005679845809937, + "learning_rate": 1.970957756081424e-05, + "loss": 0.994, + "step": 392 + }, + { + "epoch": 0.3151563753007217, + "grad_norm": 0.6840459704399109, + "learning_rate": 1.9707502175609377e-05, + "loss": 1.0069, + "step": 393 + }, + { + "epoch": 0.31595829991980756, + "grad_norm": 0.6707590222358704, + "learning_rate": 1.9705419511388784e-05, + "loss": 0.949, + "step": 394 + }, + { + "epoch": 0.31676022453889335, + "grad_norm": 0.66581130027771, + "learning_rate": 1.9703329569714114e-05, + "loss": 1.0329, + "step": 395 + }, + { + "epoch": 0.31756214915797915, + "grad_norm": 0.7102177739143372, + "learning_rate": 1.9701232352152472e-05, + "loss": 0.9578, + "step": 396 + }, + { + "epoch": 0.31836407377706494, + "grad_norm": 0.7258641719818115, + "learning_rate": 1.9699127860276426e-05, + "loss": 0.989, + "step": 397 + }, + { + "epoch": 0.3191659983961508, + "grad_norm": 0.646165668964386, + "learning_rate": 1.969701609566399e-05, + "loss": 0.9845, + "step": 398 + }, + { + "epoch": 0.3199679230152366, + "grad_norm": 0.6825928688049316, + "learning_rate": 1.9694897059898648e-05, + "loss": 1.009, + "step": 399 + }, + { + "epoch": 0.32076984763432237, + "grad_norm": 0.7049286365509033, + "learning_rate": 1.9692770754569316e-05, + "loss": 0.966, + "step": 400 + }, + { + "epoch": 0.32157177225340816, + "grad_norm": 0.6968984603881836, + "learning_rate": 1.9690637181270372e-05, + "loss": 0.9642, + "step": 401 + }, + { + "epoch": 0.322373696872494, + "grad_norm": 0.7258249521255493, + "learning_rate": 1.9688496341601647e-05, + "loss": 0.9723, + "step": 402 + }, + { + "epoch": 0.3231756214915798, + "grad_norm": 0.6794790029525757, + "learning_rate": 1.9686348237168408e-05, + "loss": 0.9803, + "step": 403 + }, + { + "epoch": 0.3239775461106656, + "grad_norm": 0.666203498840332, + "learning_rate": 1.9684192869581376e-05, + "loss": 0.9987, + "step": 404 + }, + { + "epoch": 0.3247794707297514, + "grad_norm": 0.6866022348403931, + "learning_rate": 1.968203024045673e-05, + "loss": 0.9196, + "step": 405 + }, + { + "epoch": 0.32558139534883723, + "grad_norm": 0.6703433990478516, + "learning_rate": 1.9679860351416075e-05, + "loss": 0.9677, + "step": 406 + }, + { + "epoch": 0.326383319967923, + "grad_norm": 0.7272356152534485, + "learning_rate": 1.967768320408647e-05, + "loss": 0.9897, + "step": 407 + }, + { + "epoch": 0.3271852445870088, + "grad_norm": 0.7687215805053711, + "learning_rate": 1.967549880010041e-05, + "loss": 0.9674, + "step": 408 + }, + { + "epoch": 0.3279871692060946, + "grad_norm": 0.6770045757293701, + "learning_rate": 1.967330714109584e-05, + "loss": 1.0214, + "step": 409 + }, + { + "epoch": 0.32878909382518046, + "grad_norm": 0.7255867123603821, + "learning_rate": 1.9671108228716142e-05, + "loss": 1.0004, + "step": 410 + }, + { + "epoch": 0.32959101844426625, + "grad_norm": 0.6760930418968201, + "learning_rate": 1.9668902064610128e-05, + "loss": 0.9386, + "step": 411 + }, + { + "epoch": 0.33039294306335204, + "grad_norm": 0.708378255367279, + "learning_rate": 1.9666688650432063e-05, + "loss": 0.9547, + "step": 412 + }, + { + "epoch": 0.33119486768243783, + "grad_norm": 0.7338705658912659, + "learning_rate": 1.9664467987841632e-05, + "loss": 0.9848, + "step": 413 + }, + { + "epoch": 0.3319967923015237, + "grad_norm": 0.6751573085784912, + "learning_rate": 1.9662240078503975e-05, + "loss": 1.0165, + "step": 414 + }, + { + "epoch": 0.33279871692060947, + "grad_norm": 0.6512075066566467, + "learning_rate": 1.9660004924089644e-05, + "loss": 0.9326, + "step": 415 + }, + { + "epoch": 0.33360064153969526, + "grad_norm": 0.6776466369628906, + "learning_rate": 1.965776252627464e-05, + "loss": 0.9494, + "step": 416 + }, + { + "epoch": 0.33440256615878106, + "grad_norm": 0.7271110415458679, + "learning_rate": 1.9655512886740383e-05, + "loss": 0.9866, + "step": 417 + }, + { + "epoch": 0.3352044907778669, + "grad_norm": 0.6701963543891907, + "learning_rate": 1.9653256007173735e-05, + "loss": 0.9433, + "step": 418 + }, + { + "epoch": 0.3360064153969527, + "grad_norm": 0.7227078080177307, + "learning_rate": 1.965099188926698e-05, + "loss": 1.0014, + "step": 419 + }, + { + "epoch": 0.3368083400160385, + "grad_norm": 0.6832866668701172, + "learning_rate": 1.964872053471783e-05, + "loss": 0.9595, + "step": 420 + }, + { + "epoch": 0.3376102646351243, + "grad_norm": 0.6797085404396057, + "learning_rate": 1.9646441945229424e-05, + "loss": 0.9574, + "step": 421 + }, + { + "epoch": 0.3384121892542101, + "grad_norm": 0.6965078711509705, + "learning_rate": 1.9644156122510326e-05, + "loss": 0.98, + "step": 422 + }, + { + "epoch": 0.3392141138732959, + "grad_norm": 0.6841316223144531, + "learning_rate": 1.9641863068274523e-05, + "loss": 0.9619, + "step": 423 + }, + { + "epoch": 0.3400160384923817, + "grad_norm": 0.7349586486816406, + "learning_rate": 1.9639562784241426e-05, + "loss": 0.979, + "step": 424 + }, + { + "epoch": 0.3408179631114675, + "grad_norm": 0.6306387782096863, + "learning_rate": 1.9637255272135863e-05, + "loss": 0.9508, + "step": 425 + }, + { + "epoch": 0.34161988773055335, + "grad_norm": 0.6524477601051331, + "learning_rate": 1.9634940533688094e-05, + "loss": 0.9172, + "step": 426 + }, + { + "epoch": 0.34242181234963914, + "grad_norm": 0.6789130568504333, + "learning_rate": 1.9632618570633782e-05, + "loss": 0.986, + "step": 427 + }, + { + "epoch": 0.34322373696872494, + "grad_norm": 0.6674696803092957, + "learning_rate": 1.9630289384714014e-05, + "loss": 0.9511, + "step": 428 + }, + { + "epoch": 0.3440256615878107, + "grad_norm": 0.703323245048523, + "learning_rate": 1.9627952977675292e-05, + "loss": 0.9889, + "step": 429 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 0.670559823513031, + "learning_rate": 1.962560935126954e-05, + "loss": 0.9957, + "step": 430 + }, + { + "epoch": 0.34562951082598237, + "grad_norm": 0.6248535513877869, + "learning_rate": 1.962325850725408e-05, + "loss": 0.9794, + "step": 431 + }, + { + "epoch": 0.34643143544506816, + "grad_norm": 0.6565462946891785, + "learning_rate": 1.9620900447391663e-05, + "loss": 0.951, + "step": 432 + }, + { + "epoch": 0.34723336006415395, + "grad_norm": 0.6794214844703674, + "learning_rate": 1.9618535173450434e-05, + "loss": 1.0089, + "step": 433 + }, + { + "epoch": 0.3480352846832398, + "grad_norm": 0.6835659742355347, + "learning_rate": 1.9616162687203966e-05, + "loss": 1.0291, + "step": 434 + }, + { + "epoch": 0.3488372093023256, + "grad_norm": 0.6657028794288635, + "learning_rate": 1.9613782990431223e-05, + "loss": 0.9599, + "step": 435 + }, + { + "epoch": 0.3496391339214114, + "grad_norm": 0.6608729362487793, + "learning_rate": 1.9611396084916587e-05, + "loss": 0.9688, + "step": 436 + }, + { + "epoch": 0.3504410585404972, + "grad_norm": 0.6747941374778748, + "learning_rate": 1.9609001972449834e-05, + "loss": 0.9353, + "step": 437 + }, + { + "epoch": 0.351242983159583, + "grad_norm": 0.6459916234016418, + "learning_rate": 1.960660065482616e-05, + "loss": 0.9698, + "step": 438 + }, + { + "epoch": 0.3520449077786688, + "grad_norm": 0.6540493369102478, + "learning_rate": 1.9604192133846147e-05, + "loss": 0.9672, + "step": 439 + }, + { + "epoch": 0.3528468323977546, + "grad_norm": 0.6698585152626038, + "learning_rate": 1.960177641131579e-05, + "loss": 0.9312, + "step": 440 + }, + { + "epoch": 0.3536487570168404, + "grad_norm": 0.6463971734046936, + "learning_rate": 1.959935348904648e-05, + "loss": 0.939, + "step": 441 + }, + { + "epoch": 0.35445068163592625, + "grad_norm": 0.7033871412277222, + "learning_rate": 1.9596923368855006e-05, + "loss": 0.9928, + "step": 442 + }, + { + "epoch": 0.35525260625501204, + "grad_norm": 0.6210078597068787, + "learning_rate": 1.9594486052563556e-05, + "loss": 0.8954, + "step": 443 + }, + { + "epoch": 0.35605453087409783, + "grad_norm": 0.658398449420929, + "learning_rate": 1.959204154199971e-05, + "loss": 1.0045, + "step": 444 + }, + { + "epoch": 0.3568564554931836, + "grad_norm": 0.6751113533973694, + "learning_rate": 1.958958983899645e-05, + "loss": 0.9873, + "step": 445 + }, + { + "epoch": 0.35765838011226947, + "grad_norm": 0.6434077024459839, + "learning_rate": 1.958713094539214e-05, + "loss": 0.9433, + "step": 446 + }, + { + "epoch": 0.35846030473135526, + "grad_norm": 0.7159045338630676, + "learning_rate": 1.958466486303055e-05, + "loss": 0.9705, + "step": 447 + }, + { + "epoch": 0.35926222935044105, + "grad_norm": 0.6778410077095032, + "learning_rate": 1.9582191593760825e-05, + "loss": 0.9326, + "step": 448 + }, + { + "epoch": 0.36006415396952685, + "grad_norm": 0.6995593905448914, + "learning_rate": 1.957971113943751e-05, + "loss": 0.9582, + "step": 449 + }, + { + "epoch": 0.3608660785886127, + "grad_norm": 0.6641433835029602, + "learning_rate": 1.9577223501920532e-05, + "loss": 0.9635, + "step": 450 + }, + { + "epoch": 0.3616680032076985, + "grad_norm": 0.6719247698783875, + "learning_rate": 1.957472868307521e-05, + "loss": 1.0151, + "step": 451 + }, + { + "epoch": 0.3624699278267843, + "grad_norm": 0.6560412049293518, + "learning_rate": 1.9572226684772243e-05, + "loss": 0.9371, + "step": 452 + }, + { + "epoch": 0.36327185244587007, + "grad_norm": 0.6818994879722595, + "learning_rate": 1.956971750888771e-05, + "loss": 0.9462, + "step": 453 + }, + { + "epoch": 0.3640737770649559, + "grad_norm": 0.7130508422851562, + "learning_rate": 1.9567201157303086e-05, + "loss": 0.9549, + "step": 454 + }, + { + "epoch": 0.3648757016840417, + "grad_norm": 0.6851775050163269, + "learning_rate": 1.956467763190521e-05, + "loss": 0.9852, + "step": 455 + }, + { + "epoch": 0.3656776263031275, + "grad_norm": 0.6840097308158875, + "learning_rate": 1.9562146934586307e-05, + "loss": 0.9623, + "step": 456 + }, + { + "epoch": 0.3664795509222133, + "grad_norm": 0.6426949501037598, + "learning_rate": 1.955960906724398e-05, + "loss": 0.9375, + "step": 457 + }, + { + "epoch": 0.36728147554129914, + "grad_norm": 0.6146557927131653, + "learning_rate": 1.9557064031781216e-05, + "loss": 0.9336, + "step": 458 + }, + { + "epoch": 0.36808340016038493, + "grad_norm": 0.6573811769485474, + "learning_rate": 1.9554511830106356e-05, + "loss": 0.95, + "step": 459 + }, + { + "epoch": 0.3688853247794707, + "grad_norm": 0.6667237877845764, + "learning_rate": 1.955195246413314e-05, + "loss": 0.946, + "step": 460 + }, + { + "epoch": 0.3696872493985565, + "grad_norm": 0.6584280729293823, + "learning_rate": 1.9549385935780664e-05, + "loss": 0.9359, + "step": 461 + }, + { + "epoch": 0.37048917401764236, + "grad_norm": 0.6643354296684265, + "learning_rate": 1.9546812246973395e-05, + "loss": 0.9396, + "step": 462 + }, + { + "epoch": 0.37129109863672816, + "grad_norm": 0.6772244572639465, + "learning_rate": 1.9544231399641176e-05, + "loss": 0.9443, + "step": 463 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 0.6740161180496216, + "learning_rate": 1.954164339571921e-05, + "loss": 0.9429, + "step": 464 + }, + { + "epoch": 0.37289494787489974, + "grad_norm": 0.70747309923172, + "learning_rate": 1.9539048237148078e-05, + "loss": 0.923, + "step": 465 + }, + { + "epoch": 0.3736968724939856, + "grad_norm": 0.6899964809417725, + "learning_rate": 1.953644592587371e-05, + "loss": 0.9421, + "step": 466 + }, + { + "epoch": 0.3744987971130714, + "grad_norm": 0.6563026905059814, + "learning_rate": 1.953383646384741e-05, + "loss": 0.9893, + "step": 467 + }, + { + "epoch": 0.37530072173215717, + "grad_norm": 0.624575674533844, + "learning_rate": 1.953121985302585e-05, + "loss": 0.902, + "step": 468 + }, + { + "epoch": 0.37610264635124296, + "grad_norm": 0.6469770669937134, + "learning_rate": 1.952859609537104e-05, + "loss": 0.9884, + "step": 469 + }, + { + "epoch": 0.3769045709703288, + "grad_norm": 0.6481389999389648, + "learning_rate": 1.952596519285037e-05, + "loss": 0.9554, + "step": 470 + }, + { + "epoch": 0.3777064955894146, + "grad_norm": 0.65255206823349, + "learning_rate": 1.9523327147436585e-05, + "loss": 0.9758, + "step": 471 + }, + { + "epoch": 0.3785084202085004, + "grad_norm": 0.6691866517066956, + "learning_rate": 1.9520681961107772e-05, + "loss": 0.9768, + "step": 472 + }, + { + "epoch": 0.3793103448275862, + "grad_norm": 0.6792327165603638, + "learning_rate": 1.9518029635847387e-05, + "loss": 0.9436, + "step": 473 + }, + { + "epoch": 0.38011226944667204, + "grad_norm": 0.6820612549781799, + "learning_rate": 1.9515370173644235e-05, + "loss": 0.9722, + "step": 474 + }, + { + "epoch": 0.3809141940657578, + "grad_norm": 0.6797659397125244, + "learning_rate": 1.9512703576492466e-05, + "loss": 1.0122, + "step": 475 + }, + { + "epoch": 0.3817161186848436, + "grad_norm": 0.6471715569496155, + "learning_rate": 1.9510029846391588e-05, + "loss": 0.954, + "step": 476 + }, + { + "epoch": 0.3825180433039294, + "grad_norm": 0.7427453398704529, + "learning_rate": 1.9507348985346458e-05, + "loss": 0.9461, + "step": 477 + }, + { + "epoch": 0.38331996792301526, + "grad_norm": 0.7047792077064514, + "learning_rate": 1.9504660995367275e-05, + "loss": 0.9503, + "step": 478 + }, + { + "epoch": 0.38412189254210105, + "grad_norm": 0.6744017601013184, + "learning_rate": 1.950196587846958e-05, + "loss": 0.9848, + "step": 479 + }, + { + "epoch": 0.38492381716118684, + "grad_norm": 0.7120094895362854, + "learning_rate": 1.9499263636674273e-05, + "loss": 0.9156, + "step": 480 + }, + { + "epoch": 0.38572574178027264, + "grad_norm": 0.6583890914916992, + "learning_rate": 1.949655427200758e-05, + "loss": 0.9404, + "step": 481 + }, + { + "epoch": 0.3865276663993585, + "grad_norm": 0.7101068496704102, + "learning_rate": 1.9493837786501077e-05, + "loss": 0.9957, + "step": 482 + }, + { + "epoch": 0.3873295910184443, + "grad_norm": 0.7440847754478455, + "learning_rate": 1.949111418219168e-05, + "loss": 1.0279, + "step": 483 + }, + { + "epoch": 0.38813151563753007, + "grad_norm": 0.7091655135154724, + "learning_rate": 1.9488383461121634e-05, + "loss": 0.9855, + "step": 484 + }, + { + "epoch": 0.38893344025661586, + "grad_norm": 0.6298947334289551, + "learning_rate": 1.948564562533853e-05, + "loss": 0.9564, + "step": 485 + }, + { + "epoch": 0.3897353648757017, + "grad_norm": 0.6431513428688049, + "learning_rate": 1.9482900676895297e-05, + "loss": 0.9372, + "step": 486 + }, + { + "epoch": 0.3905372894947875, + "grad_norm": 0.7604116201400757, + "learning_rate": 1.948014861785018e-05, + "loss": 0.9654, + "step": 487 + }, + { + "epoch": 0.3913392141138733, + "grad_norm": 0.652585506439209, + "learning_rate": 1.9477389450266768e-05, + "loss": 0.9184, + "step": 488 + }, + { + "epoch": 0.3921411387329591, + "grad_norm": 0.6592057943344116, + "learning_rate": 1.9474623176213988e-05, + "loss": 0.9951, + "step": 489 + }, + { + "epoch": 0.39294306335204493, + "grad_norm": 0.7231782674789429, + "learning_rate": 1.9471849797766075e-05, + "loss": 0.9337, + "step": 490 + }, + { + "epoch": 0.3937449879711307, + "grad_norm": 0.6437721848487854, + "learning_rate": 1.9469069317002614e-05, + "loss": 0.9529, + "step": 491 + }, + { + "epoch": 0.3945469125902165, + "grad_norm": 0.6871363520622253, + "learning_rate": 1.9466281736008495e-05, + "loss": 1.0073, + "step": 492 + }, + { + "epoch": 0.3953488372093023, + "grad_norm": 0.6335092782974243, + "learning_rate": 1.9463487056873945e-05, + "loss": 0.89, + "step": 493 + }, + { + "epoch": 0.39615076182838815, + "grad_norm": 0.6468705534934998, + "learning_rate": 1.946068528169451e-05, + "loss": 0.9542, + "step": 494 + }, + { + "epoch": 0.39695268644747395, + "grad_norm": 0.6464216709136963, + "learning_rate": 1.9457876412571053e-05, + "loss": 0.926, + "step": 495 + }, + { + "epoch": 0.39775461106655974, + "grad_norm": 0.6910549998283386, + "learning_rate": 1.9455060451609765e-05, + "loss": 0.9718, + "step": 496 + }, + { + "epoch": 0.39855653568564553, + "grad_norm": 0.6526033878326416, + "learning_rate": 1.9452237400922142e-05, + "loss": 0.9153, + "step": 497 + }, + { + "epoch": 0.3993584603047314, + "grad_norm": 0.6653629541397095, + "learning_rate": 1.9449407262625015e-05, + "loss": 0.9803, + "step": 498 + }, + { + "epoch": 0.40016038492381717, + "grad_norm": 0.6513515710830688, + "learning_rate": 1.9446570038840505e-05, + "loss": 0.9739, + "step": 499 + }, + { + "epoch": 0.40096230954290296, + "grad_norm": 0.7147772908210754, + "learning_rate": 1.944372573169607e-05, + "loss": 1.0026, + "step": 500 + }, + { + "epoch": 0.40176423416198875, + "grad_norm": 0.6582165360450745, + "learning_rate": 1.9440874343324464e-05, + "loss": 1.0261, + "step": 501 + }, + { + "epoch": 0.4025661587810746, + "grad_norm": 0.6714770197868347, + "learning_rate": 1.943801587586375e-05, + "loss": 0.9979, + "step": 502 + }, + { + "epoch": 0.4033680834001604, + "grad_norm": 0.6295056939125061, + "learning_rate": 1.9435150331457314e-05, + "loss": 1.0059, + "step": 503 + }, + { + "epoch": 0.4041700080192462, + "grad_norm": 0.6907420754432678, + "learning_rate": 1.943227771225383e-05, + "loss": 0.9456, + "step": 504 + }, + { + "epoch": 0.404971932638332, + "grad_norm": 0.6090110540390015, + "learning_rate": 1.9429398020407292e-05, + "loss": 0.9187, + "step": 505 + }, + { + "epoch": 0.4057738572574178, + "grad_norm": 0.6557995080947876, + "learning_rate": 1.9426511258076988e-05, + "loss": 0.952, + "step": 506 + }, + { + "epoch": 0.4065757818765036, + "grad_norm": 0.6791728138923645, + "learning_rate": 1.942361742742751e-05, + "loss": 0.9657, + "step": 507 + }, + { + "epoch": 0.4073777064955894, + "grad_norm": 0.6913565993309021, + "learning_rate": 1.9420716530628752e-05, + "loss": 1.0223, + "step": 508 + }, + { + "epoch": 0.4081796311146752, + "grad_norm": 0.6940714716911316, + "learning_rate": 1.9417808569855907e-05, + "loss": 0.9489, + "step": 509 + }, + { + "epoch": 0.40898155573376105, + "grad_norm": 0.733680009841919, + "learning_rate": 1.9414893547289458e-05, + "loss": 0.9388, + "step": 510 + }, + { + "epoch": 0.40978348035284684, + "grad_norm": 0.6628260016441345, + "learning_rate": 1.9411971465115197e-05, + "loss": 0.9455, + "step": 511 + }, + { + "epoch": 0.41058540497193263, + "grad_norm": 0.6788282990455627, + "learning_rate": 1.940904232552419e-05, + "loss": 0.9224, + "step": 512 + }, + { + "epoch": 0.4113873295910184, + "grad_norm": 0.6449699997901917, + "learning_rate": 1.9406106130712813e-05, + "loss": 0.9927, + "step": 513 + }, + { + "epoch": 0.41218925421010427, + "grad_norm": 0.6500270962715149, + "learning_rate": 1.9403162882882722e-05, + "loss": 0.9647, + "step": 514 + }, + { + "epoch": 0.41299117882919006, + "grad_norm": 0.6693797707557678, + "learning_rate": 1.9400212584240867e-05, + "loss": 0.967, + "step": 515 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.714789628982544, + "learning_rate": 1.9397255236999478e-05, + "loss": 0.9768, + "step": 516 + }, + { + "epoch": 0.41459502806736165, + "grad_norm": 0.6399978399276733, + "learning_rate": 1.939429084337608e-05, + "loss": 0.9542, + "step": 517 + }, + { + "epoch": 0.4153969526864475, + "grad_norm": 0.644829273223877, + "learning_rate": 1.939131940559347e-05, + "loss": 0.9968, + "step": 518 + }, + { + "epoch": 0.4161988773055333, + "grad_norm": 0.7262901067733765, + "learning_rate": 1.938834092587974e-05, + "loss": 0.9511, + "step": 519 + }, + { + "epoch": 0.4170008019246191, + "grad_norm": 0.6648424863815308, + "learning_rate": 1.938535540646825e-05, + "loss": 0.9986, + "step": 520 + }, + { + "epoch": 0.41780272654370487, + "grad_norm": 0.7087076902389526, + "learning_rate": 1.938236284959765e-05, + "loss": 0.9664, + "step": 521 + }, + { + "epoch": 0.4186046511627907, + "grad_norm": 0.7221333384513855, + "learning_rate": 1.9379363257511855e-05, + "loss": 0.9482, + "step": 522 + }, + { + "epoch": 0.4194065757818765, + "grad_norm": 0.6906344294548035, + "learning_rate": 1.9376356632460063e-05, + "loss": 1.0003, + "step": 523 + }, + { + "epoch": 0.4202085004009623, + "grad_norm": 0.7014548778533936, + "learning_rate": 1.9373342976696742e-05, + "loss": 0.9728, + "step": 524 + }, + { + "epoch": 0.4210104250200481, + "grad_norm": 0.6935135722160339, + "learning_rate": 1.9370322292481642e-05, + "loss": 0.9788, + "step": 525 + }, + { + "epoch": 0.42181234963913394, + "grad_norm": 0.6556846499443054, + "learning_rate": 1.9367294582079768e-05, + "loss": 0.982, + "step": 526 + }, + { + "epoch": 0.42261427425821974, + "grad_norm": 0.6862344145774841, + "learning_rate": 1.93642598477614e-05, + "loss": 0.9361, + "step": 527 + }, + { + "epoch": 0.4234161988773055, + "grad_norm": 0.6807497143745422, + "learning_rate": 1.9361218091802088e-05, + "loss": 0.9717, + "step": 528 + }, + { + "epoch": 0.4242181234963913, + "grad_norm": 0.646615743637085, + "learning_rate": 1.935816931648264e-05, + "loss": 0.9416, + "step": 529 + }, + { + "epoch": 0.42502004811547717, + "grad_norm": 0.646940290927887, + "learning_rate": 1.9355113524089137e-05, + "loss": 0.952, + "step": 530 + }, + { + "epoch": 0.42582197273456296, + "grad_norm": 0.7170730233192444, + "learning_rate": 1.9352050716912915e-05, + "loss": 0.9744, + "step": 531 + }, + { + "epoch": 0.42662389735364875, + "grad_norm": 0.6803928017616272, + "learning_rate": 1.934898089725057e-05, + "loss": 0.9409, + "step": 532 + }, + { + "epoch": 0.42742582197273454, + "grad_norm": 0.6328278183937073, + "learning_rate": 1.9345904067403953e-05, + "loss": 0.9368, + "step": 533 + }, + { + "epoch": 0.4282277465918204, + "grad_norm": 0.6864063143730164, + "learning_rate": 1.9342820229680185e-05, + "loss": 0.9771, + "step": 534 + }, + { + "epoch": 0.4290296712109062, + "grad_norm": 0.6935616135597229, + "learning_rate": 1.9339729386391622e-05, + "loss": 0.9774, + "step": 535 + }, + { + "epoch": 0.429831595829992, + "grad_norm": 0.6815831065177917, + "learning_rate": 1.9336631539855895e-05, + "loss": 0.9468, + "step": 536 + }, + { + "epoch": 0.43063352044907777, + "grad_norm": 0.6866287589073181, + "learning_rate": 1.9333526692395863e-05, + "loss": 0.9433, + "step": 537 + }, + { + "epoch": 0.4314354450681636, + "grad_norm": 0.7279961109161377, + "learning_rate": 1.9330414846339656e-05, + "loss": 0.9595, + "step": 538 + }, + { + "epoch": 0.4322373696872494, + "grad_norm": 0.659054160118103, + "learning_rate": 1.9327296004020638e-05, + "loss": 0.9593, + "step": 539 + }, + { + "epoch": 0.4330392943063352, + "grad_norm": 0.6249253749847412, + "learning_rate": 1.9324170167777425e-05, + "loss": 0.9569, + "step": 540 + }, + { + "epoch": 0.433841218925421, + "grad_norm": 0.6949421167373657, + "learning_rate": 1.9321037339953873e-05, + "loss": 0.9529, + "step": 541 + }, + { + "epoch": 0.43464314354450684, + "grad_norm": 0.7360992431640625, + "learning_rate": 1.9317897522899082e-05, + "loss": 1.0171, + "step": 542 + }, + { + "epoch": 0.43544506816359263, + "grad_norm": 0.6973049640655518, + "learning_rate": 1.93147507189674e-05, + "loss": 0.9597, + "step": 543 + }, + { + "epoch": 0.4362469927826784, + "grad_norm": 0.6927620768547058, + "learning_rate": 1.93115969305184e-05, + "loss": 0.9106, + "step": 544 + }, + { + "epoch": 0.4370489174017642, + "grad_norm": 0.6799963712692261, + "learning_rate": 1.9308436159916905e-05, + "loss": 0.9958, + "step": 545 + }, + { + "epoch": 0.43785084202085006, + "grad_norm": 0.6450375914573669, + "learning_rate": 1.9305268409532968e-05, + "loss": 0.9605, + "step": 546 + }, + { + "epoch": 0.43865276663993585, + "grad_norm": 0.6617172360420227, + "learning_rate": 1.9302093681741874e-05, + "loss": 0.9424, + "step": 547 + }, + { + "epoch": 0.43945469125902165, + "grad_norm": 0.7010754346847534, + "learning_rate": 1.9298911978924142e-05, + "loss": 0.9857, + "step": 548 + }, + { + "epoch": 0.44025661587810744, + "grad_norm": 0.665642499923706, + "learning_rate": 1.9295723303465523e-05, + "loss": 0.9495, + "step": 549 + }, + { + "epoch": 0.4410585404971933, + "grad_norm": 0.6675366759300232, + "learning_rate": 1.9292527657756994e-05, + "loss": 0.9411, + "step": 550 + }, + { + "epoch": 0.4418604651162791, + "grad_norm": 0.6773011684417725, + "learning_rate": 1.928932504419476e-05, + "loss": 0.9939, + "step": 551 + }, + { + "epoch": 0.44266238973536487, + "grad_norm": 0.691259503364563, + "learning_rate": 1.9286115465180248e-05, + "loss": 0.9641, + "step": 552 + }, + { + "epoch": 0.44346431435445066, + "grad_norm": 0.6108399033546448, + "learning_rate": 1.928289892312011e-05, + "loss": 0.9077, + "step": 553 + }, + { + "epoch": 0.4442662389735365, + "grad_norm": 0.6582357287406921, + "learning_rate": 1.927967542042622e-05, + "loss": 0.9379, + "step": 554 + }, + { + "epoch": 0.4450681635926223, + "grad_norm": 0.7069655060768127, + "learning_rate": 1.9276444959515664e-05, + "loss": 0.9621, + "step": 555 + }, + { + "epoch": 0.4458700882117081, + "grad_norm": 0.6511080265045166, + "learning_rate": 1.9273207542810764e-05, + "loss": 0.9675, + "step": 556 + }, + { + "epoch": 0.4466720128307939, + "grad_norm": 0.6380482912063599, + "learning_rate": 1.9269963172739033e-05, + "loss": 0.9744, + "step": 557 + }, + { + "epoch": 0.44747393744987973, + "grad_norm": 0.6568742394447327, + "learning_rate": 1.9266711851733214e-05, + "loss": 0.9644, + "step": 558 + }, + { + "epoch": 0.4482758620689655, + "grad_norm": 0.6376577019691467, + "learning_rate": 1.9263453582231265e-05, + "loss": 0.9969, + "step": 559 + }, + { + "epoch": 0.4490777866880513, + "grad_norm": 0.6453221440315247, + "learning_rate": 1.9260188366676337e-05, + "loss": 0.9894, + "step": 560 + }, + { + "epoch": 0.4498797113071371, + "grad_norm": 0.6480368375778198, + "learning_rate": 1.9256916207516806e-05, + "loss": 0.9315, + "step": 561 + }, + { + "epoch": 0.45068163592622296, + "grad_norm": 0.6618868708610535, + "learning_rate": 1.9253637107206246e-05, + "loss": 0.9886, + "step": 562 + }, + { + "epoch": 0.45148356054530875, + "grad_norm": 0.646225094795227, + "learning_rate": 1.9250351068203442e-05, + "loss": 0.9983, + "step": 563 + }, + { + "epoch": 0.45228548516439454, + "grad_norm": 0.6107761859893799, + "learning_rate": 1.9247058092972372e-05, + "loss": 0.9496, + "step": 564 + }, + { + "epoch": 0.45308740978348033, + "grad_norm": 0.6536424160003662, + "learning_rate": 1.9243758183982226e-05, + "loss": 0.9751, + "step": 565 + }, + { + "epoch": 0.4538893344025662, + "grad_norm": 0.5984099507331848, + "learning_rate": 1.9240451343707382e-05, + "loss": 0.9534, + "step": 566 + }, + { + "epoch": 0.454691259021652, + "grad_norm": 0.622818112373352, + "learning_rate": 1.9237137574627433e-05, + "loss": 0.9064, + "step": 567 + }, + { + "epoch": 0.45549318364073776, + "grad_norm": 0.6724113821983337, + "learning_rate": 1.923381687922714e-05, + "loss": 0.9416, + "step": 568 + }, + { + "epoch": 0.45629510825982356, + "grad_norm": 0.6443886160850525, + "learning_rate": 1.9230489259996487e-05, + "loss": 0.9413, + "step": 569 + }, + { + "epoch": 0.4570970328789094, + "grad_norm": 0.6603150963783264, + "learning_rate": 1.922715471943063e-05, + "loss": 0.9813, + "step": 570 + }, + { + "epoch": 0.4578989574979952, + "grad_norm": 0.642634928226471, + "learning_rate": 1.9223813260029922e-05, + "loss": 0.9405, + "step": 571 + }, + { + "epoch": 0.458700882117081, + "grad_norm": 0.668830931186676, + "learning_rate": 1.92204648842999e-05, + "loss": 0.9891, + "step": 572 + }, + { + "epoch": 0.4595028067361668, + "grad_norm": 0.617743968963623, + "learning_rate": 1.9217109594751303e-05, + "loss": 0.971, + "step": 573 + }, + { + "epoch": 0.4603047313552526, + "grad_norm": 0.6333216428756714, + "learning_rate": 1.9213747393900025e-05, + "loss": 0.9542, + "step": 574 + }, + { + "epoch": 0.4611066559743384, + "grad_norm": 0.6373317241668701, + "learning_rate": 1.9210378284267166e-05, + "loss": 0.9329, + "step": 575 + }, + { + "epoch": 0.4619085805934242, + "grad_norm": 0.617574155330658, + "learning_rate": 1.9207002268378998e-05, + "loss": 0.9708, + "step": 576 + }, + { + "epoch": 0.46271050521251, + "grad_norm": 0.6191926002502441, + "learning_rate": 1.9203619348766974e-05, + "loss": 0.9154, + "step": 577 + }, + { + "epoch": 0.46351242983159585, + "grad_norm": 0.6222400069236755, + "learning_rate": 1.9200229527967724e-05, + "loss": 0.9354, + "step": 578 + }, + { + "epoch": 0.46431435445068164, + "grad_norm": 0.6831260919570923, + "learning_rate": 1.9196832808523048e-05, + "loss": 0.9424, + "step": 579 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 0.6363519430160522, + "learning_rate": 1.919342919297992e-05, + "loss": 0.9589, + "step": 580 + }, + { + "epoch": 0.4659182036888532, + "grad_norm": 0.6219954490661621, + "learning_rate": 1.9190018683890492e-05, + "loss": 0.9204, + "step": 581 + }, + { + "epoch": 0.4667201283079391, + "grad_norm": 0.6711027026176453, + "learning_rate": 1.9186601283812077e-05, + "loss": 0.9249, + "step": 582 + }, + { + "epoch": 0.46752205292702487, + "grad_norm": 0.656484067440033, + "learning_rate": 1.9183176995307156e-05, + "loss": 0.9821, + "step": 583 + }, + { + "epoch": 0.46832397754611066, + "grad_norm": 0.6418080925941467, + "learning_rate": 1.9179745820943382e-05, + "loss": 0.9759, + "step": 584 + }, + { + "epoch": 0.46912590216519645, + "grad_norm": 0.7414655089378357, + "learning_rate": 1.9176307763293563e-05, + "loss": 0.9328, + "step": 585 + }, + { + "epoch": 0.4699278267842823, + "grad_norm": 0.634429931640625, + "learning_rate": 1.9172862824935677e-05, + "loss": 0.918, + "step": 586 + }, + { + "epoch": 0.4707297514033681, + "grad_norm": 0.6168124675750732, + "learning_rate": 1.9169411008452847e-05, + "loss": 0.9247, + "step": 587 + }, + { + "epoch": 0.4715316760224539, + "grad_norm": 0.6918452978134155, + "learning_rate": 1.9165952316433367e-05, + "loss": 0.9379, + "step": 588 + }, + { + "epoch": 0.4723336006415397, + "grad_norm": 0.6637231111526489, + "learning_rate": 1.9162486751470687e-05, + "loss": 0.9685, + "step": 589 + }, + { + "epoch": 0.4731355252606255, + "grad_norm": 0.6197507381439209, + "learning_rate": 1.9159014316163395e-05, + "loss": 0.9876, + "step": 590 + }, + { + "epoch": 0.4739374498797113, + "grad_norm": 0.6182752251625061, + "learning_rate": 1.915553501311525e-05, + "loss": 0.9288, + "step": 591 + }, + { + "epoch": 0.4747393744987971, + "grad_norm": 0.6498412489891052, + "learning_rate": 1.9152048844935152e-05, + "loss": 0.9284, + "step": 592 + }, + { + "epoch": 0.4755412991178829, + "grad_norm": 0.6597406268119812, + "learning_rate": 1.914855581423714e-05, + "loss": 0.9159, + "step": 593 + }, + { + "epoch": 0.47634322373696875, + "grad_norm": 0.6668150424957275, + "learning_rate": 1.9145055923640417e-05, + "loss": 0.9473, + "step": 594 + }, + { + "epoch": 0.47714514835605454, + "grad_norm": 0.7026738524436951, + "learning_rate": 1.9141549175769315e-05, + "loss": 0.9343, + "step": 595 + }, + { + "epoch": 0.47794707297514033, + "grad_norm": 0.7704558372497559, + "learning_rate": 1.9138035573253316e-05, + "loss": 0.9569, + "step": 596 + }, + { + "epoch": 0.4787489975942261, + "grad_norm": 0.6594985723495483, + "learning_rate": 1.9134515118727035e-05, + "loss": 0.9666, + "step": 597 + }, + { + "epoch": 0.47955092221331197, + "grad_norm": 0.6233870387077332, + "learning_rate": 1.913098781483023e-05, + "loss": 0.9473, + "step": 598 + }, + { + "epoch": 0.48035284683239776, + "grad_norm": 0.6997066736221313, + "learning_rate": 1.9127453664207798e-05, + "loss": 0.8946, + "step": 599 + }, + { + "epoch": 0.48115477145148355, + "grad_norm": 0.6761658191680908, + "learning_rate": 1.912391266950976e-05, + "loss": 0.9911, + "step": 600 + }, + { + "epoch": 0.48195669607056935, + "grad_norm": 0.6300480365753174, + "learning_rate": 1.9120364833391277e-05, + "loss": 0.9955, + "step": 601 + }, + { + "epoch": 0.4827586206896552, + "grad_norm": 0.6605967283248901, + "learning_rate": 1.9116810158512635e-05, + "loss": 0.9853, + "step": 602 + }, + { + "epoch": 0.483560545308741, + "grad_norm": 0.6040114164352417, + "learning_rate": 1.9113248647539253e-05, + "loss": 0.9011, + "step": 603 + }, + { + "epoch": 0.4843624699278268, + "grad_norm": 0.6693778038024902, + "learning_rate": 1.9109680303141673e-05, + "loss": 0.9038, + "step": 604 + }, + { + "epoch": 0.48516439454691257, + "grad_norm": 0.6784869432449341, + "learning_rate": 1.910610512799556e-05, + "loss": 0.9332, + "step": 605 + }, + { + "epoch": 0.4859663191659984, + "grad_norm": 0.6835043430328369, + "learning_rate": 1.91025231247817e-05, + "loss": 0.985, + "step": 606 + }, + { + "epoch": 0.4867682437850842, + "grad_norm": 0.6370753645896912, + "learning_rate": 1.9098934296186006e-05, + "loss": 1.0014, + "step": 607 + }, + { + "epoch": 0.48757016840417, + "grad_norm": 0.7216833233833313, + "learning_rate": 1.9095338644899502e-05, + "loss": 0.948, + "step": 608 + }, + { + "epoch": 0.4883720930232558, + "grad_norm": 0.6614647507667542, + "learning_rate": 1.9091736173618326e-05, + "loss": 0.9399, + "step": 609 + }, + { + "epoch": 0.48917401764234164, + "grad_norm": 0.6034402251243591, + "learning_rate": 1.908812688504374e-05, + "loss": 0.9501, + "step": 610 + }, + { + "epoch": 0.48997594226142743, + "grad_norm": 0.628848135471344, + "learning_rate": 1.9084510781882108e-05, + "loss": 0.9393, + "step": 611 + }, + { + "epoch": 0.4907778668805132, + "grad_norm": 0.5977146625518799, + "learning_rate": 1.9080887866844902e-05, + "loss": 0.9689, + "step": 612 + }, + { + "epoch": 0.491579791499599, + "grad_norm": 0.6800901889801025, + "learning_rate": 1.907725814264872e-05, + "loss": 0.9777, + "step": 613 + }, + { + "epoch": 0.49238171611868486, + "grad_norm": 0.6149044036865234, + "learning_rate": 1.9073621612015244e-05, + "loss": 0.9549, + "step": 614 + }, + { + "epoch": 0.49318364073777066, + "grad_norm": 0.7120502591133118, + "learning_rate": 1.9069978277671266e-05, + "loss": 0.9653, + "step": 615 + }, + { + "epoch": 0.49398556535685645, + "grad_norm": 0.59898442029953, + "learning_rate": 1.906632814234869e-05, + "loss": 0.9387, + "step": 616 + }, + { + "epoch": 0.49478748997594224, + "grad_norm": 0.6274296045303345, + "learning_rate": 1.9062671208784508e-05, + "loss": 0.9482, + "step": 617 + }, + { + "epoch": 0.4955894145950281, + "grad_norm": 0.6537023186683655, + "learning_rate": 1.9059007479720807e-05, + "loss": 0.9233, + "step": 618 + }, + { + "epoch": 0.4963913392141139, + "grad_norm": 0.6578821539878845, + "learning_rate": 1.905533695790479e-05, + "loss": 0.9676, + "step": 619 + }, + { + "epoch": 0.4971932638331997, + "grad_norm": 0.6332679986953735, + "learning_rate": 1.9051659646088726e-05, + "loss": 0.9104, + "step": 620 + }, + { + "epoch": 0.49799518845228546, + "grad_norm": 0.66425621509552, + "learning_rate": 1.9047975547029998e-05, + "loss": 0.9788, + "step": 621 + }, + { + "epoch": 0.4987971130713713, + "grad_norm": 0.6680029630661011, + "learning_rate": 1.9044284663491065e-05, + "loss": 0.9555, + "step": 622 + }, + { + "epoch": 0.4995990376904571, + "grad_norm": 0.6043557524681091, + "learning_rate": 1.9040586998239472e-05, + "loss": 0.988, + "step": 623 + }, + { + "epoch": 0.5004009623095429, + "grad_norm": 0.6627247929573059, + "learning_rate": 1.903688255404786e-05, + "loss": 0.953, + "step": 624 + }, + { + "epoch": 0.5012028869286287, + "grad_norm": 0.6448099613189697, + "learning_rate": 1.9033171333693952e-05, + "loss": 0.9308, + "step": 625 + }, + { + "epoch": 0.5020048115477145, + "grad_norm": 0.5838706493377686, + "learning_rate": 1.902945333996054e-05, + "loss": 0.9421, + "step": 626 + }, + { + "epoch": 0.5028067361668003, + "grad_norm": 0.6396023631095886, + "learning_rate": 1.9025728575635503e-05, + "loss": 0.9472, + "step": 627 + }, + { + "epoch": 0.5036086607858862, + "grad_norm": 0.5953710675239563, + "learning_rate": 1.9021997043511798e-05, + "loss": 0.9113, + "step": 628 + }, + { + "epoch": 0.504410585404972, + "grad_norm": 0.7014410495758057, + "learning_rate": 1.9018258746387458e-05, + "loss": 0.9839, + "step": 629 + }, + { + "epoch": 0.5052125100240578, + "grad_norm": 0.6346995830535889, + "learning_rate": 1.901451368706558e-05, + "loss": 0.9552, + "step": 630 + }, + { + "epoch": 0.5060144346431436, + "grad_norm": 0.6501613855361938, + "learning_rate": 1.9010761868354336e-05, + "loss": 0.9407, + "step": 631 + }, + { + "epoch": 0.5068163592622293, + "grad_norm": 0.7061483860015869, + "learning_rate": 1.9007003293066973e-05, + "loss": 0.9881, + "step": 632 + }, + { + "epoch": 0.5076182838813151, + "grad_norm": 0.6285912394523621, + "learning_rate": 1.9003237964021796e-05, + "loss": 0.9514, + "step": 633 + }, + { + "epoch": 0.5084202085004009, + "grad_norm": 0.7684087753295898, + "learning_rate": 1.899946588404218e-05, + "loss": 0.9336, + "step": 634 + }, + { + "epoch": 0.5092221331194867, + "grad_norm": 0.7490344047546387, + "learning_rate": 1.8995687055956555e-05, + "loss": 0.8914, + "step": 635 + }, + { + "epoch": 0.5100240577385726, + "grad_norm": 0.8029311299324036, + "learning_rate": 1.8991901482598414e-05, + "loss": 0.9701, + "step": 636 + }, + { + "epoch": 0.5108259823576584, + "grad_norm": 0.6485514044761658, + "learning_rate": 1.8988109166806313e-05, + "loss": 0.9437, + "step": 637 + }, + { + "epoch": 0.5116279069767442, + "grad_norm": 0.6395050883293152, + "learning_rate": 1.8984310111423855e-05, + "loss": 0.9561, + "step": 638 + }, + { + "epoch": 0.51242983159583, + "grad_norm": 0.6431874632835388, + "learning_rate": 1.8980504319299705e-05, + "loss": 0.9247, + "step": 639 + }, + { + "epoch": 0.5132317562149158, + "grad_norm": 0.675888180732727, + "learning_rate": 1.8976691793287575e-05, + "loss": 0.9203, + "step": 640 + }, + { + "epoch": 0.5140336808340016, + "grad_norm": 0.6630160212516785, + "learning_rate": 1.8972872536246224e-05, + "loss": 0.9709, + "step": 641 + }, + { + "epoch": 0.5148356054530874, + "grad_norm": 0.6319396495819092, + "learning_rate": 1.8969046551039466e-05, + "loss": 0.987, + "step": 642 + }, + { + "epoch": 0.5156375300721732, + "grad_norm": 0.6689966320991516, + "learning_rate": 1.8965213840536152e-05, + "loss": 0.9802, + "step": 643 + }, + { + "epoch": 0.5164394546912591, + "grad_norm": 0.6527170538902283, + "learning_rate": 1.8961374407610177e-05, + "loss": 0.9682, + "step": 644 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 0.5882049202919006, + "learning_rate": 1.8957528255140482e-05, + "loss": 0.9256, + "step": 645 + }, + { + "epoch": 0.5180433039294307, + "grad_norm": 0.6243289709091187, + "learning_rate": 1.895367538601104e-05, + "loss": 0.9512, + "step": 646 + }, + { + "epoch": 0.5188452285485164, + "grad_norm": 0.6396244764328003, + "learning_rate": 1.894981580311087e-05, + "loss": 0.9402, + "step": 647 + }, + { + "epoch": 0.5196471531676022, + "grad_norm": 0.6135784387588501, + "learning_rate": 1.8945949509334008e-05, + "loss": 0.9745, + "step": 648 + }, + { + "epoch": 0.520449077786688, + "grad_norm": 0.6294798851013184, + "learning_rate": 1.894207650757954e-05, + "loss": 0.929, + "step": 649 + }, + { + "epoch": 0.5212510024057738, + "grad_norm": 0.6499471664428711, + "learning_rate": 1.8938196800751575e-05, + "loss": 0.9595, + "step": 650 + }, + { + "epoch": 0.5220529270248596, + "grad_norm": 0.6707350611686707, + "learning_rate": 1.8934310391759247e-05, + "loss": 0.9328, + "step": 651 + }, + { + "epoch": 0.5228548516439455, + "grad_norm": 0.6414554715156555, + "learning_rate": 1.8930417283516717e-05, + "loss": 0.8878, + "step": 652 + }, + { + "epoch": 0.5236567762630313, + "grad_norm": 0.6393246650695801, + "learning_rate": 1.892651747894317e-05, + "loss": 0.9511, + "step": 653 + }, + { + "epoch": 0.5244587008821171, + "grad_norm": 0.658134937286377, + "learning_rate": 1.892261098096282e-05, + "loss": 0.9845, + "step": 654 + }, + { + "epoch": 0.5252606255012029, + "grad_norm": 0.6066871881484985, + "learning_rate": 1.891869779250488e-05, + "loss": 0.9655, + "step": 655 + }, + { + "epoch": 0.5260625501202887, + "grad_norm": 0.6272629499435425, + "learning_rate": 1.8914777916503602e-05, + "loss": 0.9605, + "step": 656 + }, + { + "epoch": 0.5268644747393745, + "grad_norm": 0.6052728295326233, + "learning_rate": 1.8910851355898238e-05, + "loss": 0.8884, + "step": 657 + }, + { + "epoch": 0.5276663993584603, + "grad_norm": 0.6381072998046875, + "learning_rate": 1.8906918113633054e-05, + "loss": 0.9684, + "step": 658 + }, + { + "epoch": 0.5284683239775461, + "grad_norm": 0.6366999745368958, + "learning_rate": 1.8902978192657334e-05, + "loss": 0.8999, + "step": 659 + }, + { + "epoch": 0.529270248596632, + "grad_norm": 0.6535215377807617, + "learning_rate": 1.8899031595925362e-05, + "loss": 0.9436, + "step": 660 + }, + { + "epoch": 0.5300721732157178, + "grad_norm": 0.6399298310279846, + "learning_rate": 1.8895078326396436e-05, + "loss": 0.9122, + "step": 661 + }, + { + "epoch": 0.5308740978348035, + "grad_norm": 0.6174817681312561, + "learning_rate": 1.8891118387034845e-05, + "loss": 0.9312, + "step": 662 + }, + { + "epoch": 0.5316760224538893, + "grad_norm": 0.6312207579612732, + "learning_rate": 1.888715178080989e-05, + "loss": 0.9274, + "step": 663 + }, + { + "epoch": 0.5324779470729751, + "grad_norm": 0.6061504483222961, + "learning_rate": 1.8883178510695868e-05, + "loss": 0.9038, + "step": 664 + }, + { + "epoch": 0.5332798716920609, + "grad_norm": 0.62549889087677, + "learning_rate": 1.8879198579672068e-05, + "loss": 0.9193, + "step": 665 + }, + { + "epoch": 0.5340817963111467, + "grad_norm": 0.6522451043128967, + "learning_rate": 1.8875211990722785e-05, + "loss": 0.931, + "step": 666 + }, + { + "epoch": 0.5348837209302325, + "grad_norm": 0.6099725365638733, + "learning_rate": 1.8871218746837294e-05, + "loss": 0.9345, + "step": 667 + }, + { + "epoch": 0.5356856455493184, + "grad_norm": 0.6159772872924805, + "learning_rate": 1.8867218851009862e-05, + "loss": 0.9469, + "step": 668 + }, + { + "epoch": 0.5364875701684042, + "grad_norm": 0.6051928400993347, + "learning_rate": 1.8863212306239753e-05, + "loss": 0.8725, + "step": 669 + }, + { + "epoch": 0.53728949478749, + "grad_norm": 0.5804814100265503, + "learning_rate": 1.8859199115531213e-05, + "loss": 0.9943, + "step": 670 + }, + { + "epoch": 0.5380914194065758, + "grad_norm": 0.6454379558563232, + "learning_rate": 1.8855179281893464e-05, + "loss": 1.008, + "step": 671 + }, + { + "epoch": 0.5388933440256616, + "grad_norm": 0.5961251258850098, + "learning_rate": 1.8851152808340715e-05, + "loss": 0.9135, + "step": 672 + }, + { + "epoch": 0.5396952686447474, + "grad_norm": 0.644010066986084, + "learning_rate": 1.884711969789215e-05, + "loss": 0.9762, + "step": 673 + }, + { + "epoch": 0.5404971932638332, + "grad_norm": 0.6359036564826965, + "learning_rate": 1.884307995357194e-05, + "loss": 0.9054, + "step": 674 + }, + { + "epoch": 0.541299117882919, + "grad_norm": 0.5981766581535339, + "learning_rate": 1.883903357840922e-05, + "loss": 0.9705, + "step": 675 + }, + { + "epoch": 0.5421010425020049, + "grad_norm": 0.6233227849006653, + "learning_rate": 1.8834980575438094e-05, + "loss": 0.9594, + "step": 676 + }, + { + "epoch": 0.5429029671210907, + "grad_norm": 0.6139412522315979, + "learning_rate": 1.883092094769765e-05, + "loss": 0.9626, + "step": 677 + }, + { + "epoch": 0.5437048917401764, + "grad_norm": 0.6309959292411804, + "learning_rate": 1.882685469823193e-05, + "loss": 0.9812, + "step": 678 + }, + { + "epoch": 0.5445068163592622, + "grad_norm": 0.6182360649108887, + "learning_rate": 1.882278183008995e-05, + "loss": 0.9537, + "step": 679 + }, + { + "epoch": 0.545308740978348, + "grad_norm": 0.6408948302268982, + "learning_rate": 1.881870234632568e-05, + "loss": 0.9611, + "step": 680 + }, + { + "epoch": 0.5461106655974338, + "grad_norm": 0.6162562966346741, + "learning_rate": 1.8814616249998063e-05, + "loss": 0.9661, + "step": 681 + }, + { + "epoch": 0.5469125902165196, + "grad_norm": 0.6035715341567993, + "learning_rate": 1.8810523544170986e-05, + "loss": 0.9394, + "step": 682 + }, + { + "epoch": 0.5477145148356054, + "grad_norm": 0.644282877445221, + "learning_rate": 1.88064242319133e-05, + "loss": 0.9357, + "step": 683 + }, + { + "epoch": 0.5485164394546913, + "grad_norm": 0.6356081962585449, + "learning_rate": 1.8802318316298817e-05, + "loss": 0.9142, + "step": 684 + }, + { + "epoch": 0.5493183640737771, + "grad_norm": 0.6344892978668213, + "learning_rate": 1.8798205800406283e-05, + "loss": 0.928, + "step": 685 + }, + { + "epoch": 0.5501202886928629, + "grad_norm": 0.6985346674919128, + "learning_rate": 1.8794086687319405e-05, + "loss": 0.9173, + "step": 686 + }, + { + "epoch": 0.5509222133119487, + "grad_norm": 0.6084068417549133, + "learning_rate": 1.8789960980126836e-05, + "loss": 0.9559, + "step": 687 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.6394224166870117, + "learning_rate": 1.8785828681922176e-05, + "loss": 0.9761, + "step": 688 + }, + { + "epoch": 0.5525260625501203, + "grad_norm": 0.637833833694458, + "learning_rate": 1.8781689795803954e-05, + "loss": 0.9265, + "step": 689 + }, + { + "epoch": 0.5533279871692061, + "grad_norm": 0.631105899810791, + "learning_rate": 1.8777544324875653e-05, + "loss": 0.9381, + "step": 690 + }, + { + "epoch": 0.5541299117882919, + "grad_norm": 0.6532770991325378, + "learning_rate": 1.8773392272245687e-05, + "loss": 0.938, + "step": 691 + }, + { + "epoch": 0.5549318364073778, + "grad_norm": 0.653390109539032, + "learning_rate": 1.8769233641027406e-05, + "loss": 0.9557, + "step": 692 + }, + { + "epoch": 0.5557337610264635, + "grad_norm": 0.6561689376831055, + "learning_rate": 1.8765068434339095e-05, + "loss": 0.8861, + "step": 693 + }, + { + "epoch": 0.5565356856455493, + "grad_norm": 0.6327289342880249, + "learning_rate": 1.8760896655303968e-05, + "loss": 0.9646, + "step": 694 + }, + { + "epoch": 0.5573376102646351, + "grad_norm": 0.6778370141983032, + "learning_rate": 1.875671830705016e-05, + "loss": 0.9892, + "step": 695 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 0.6418762803077698, + "learning_rate": 1.875253339271075e-05, + "loss": 0.9706, + "step": 696 + }, + { + "epoch": 0.5589414595028067, + "grad_norm": 0.6594840884208679, + "learning_rate": 1.8748341915423723e-05, + "loss": 0.9193, + "step": 697 + }, + { + "epoch": 0.5597433841218925, + "grad_norm": 0.6546462178230286, + "learning_rate": 1.874414387833199e-05, + "loss": 0.9528, + "step": 698 + }, + { + "epoch": 0.5605453087409783, + "grad_norm": 0.666907548904419, + "learning_rate": 1.8739939284583385e-05, + "loss": 0.9301, + "step": 699 + }, + { + "epoch": 0.5613472333600642, + "grad_norm": 0.6767510771751404, + "learning_rate": 1.873572813733066e-05, + "loss": 0.9624, + "step": 700 + }, + { + "epoch": 0.56214915797915, + "grad_norm": 0.6574323773384094, + "learning_rate": 1.8731510439731465e-05, + "loss": 0.9672, + "step": 701 + }, + { + "epoch": 0.5629510825982358, + "grad_norm": 0.6924127340316772, + "learning_rate": 1.872728619494838e-05, + "loss": 0.9405, + "step": 702 + }, + { + "epoch": 0.5637530072173216, + "grad_norm": 0.6515429615974426, + "learning_rate": 1.8723055406148894e-05, + "loss": 0.9477, + "step": 703 + }, + { + "epoch": 0.5645549318364074, + "grad_norm": 0.7073892951011658, + "learning_rate": 1.8718818076505385e-05, + "loss": 0.9403, + "step": 704 + }, + { + "epoch": 0.5653568564554932, + "grad_norm": 0.706851065158844, + "learning_rate": 1.8714574209195153e-05, + "loss": 0.9704, + "step": 705 + }, + { + "epoch": 0.566158781074579, + "grad_norm": 0.624336838722229, + "learning_rate": 1.8710323807400393e-05, + "loss": 0.9558, + "step": 706 + }, + { + "epoch": 0.5669607056936647, + "grad_norm": 0.6605740785598755, + "learning_rate": 1.8706066874308205e-05, + "loss": 0.9467, + "step": 707 + }, + { + "epoch": 0.5677626303127506, + "grad_norm": 0.7018135190010071, + "learning_rate": 1.870180341311057e-05, + "loss": 0.9277, + "step": 708 + }, + { + "epoch": 0.5685645549318364, + "grad_norm": 0.6792058348655701, + "learning_rate": 1.8697533427004395e-05, + "loss": 0.9706, + "step": 709 + }, + { + "epoch": 0.5693664795509222, + "grad_norm": 0.6452786326408386, + "learning_rate": 1.8693256919191446e-05, + "loss": 0.9426, + "step": 710 + }, + { + "epoch": 0.570168404170008, + "grad_norm": 0.7065607309341431, + "learning_rate": 1.8688973892878405e-05, + "loss": 0.9299, + "step": 711 + }, + { + "epoch": 0.5709703287890938, + "grad_norm": 0.6309828758239746, + "learning_rate": 1.8684684351276822e-05, + "loss": 0.9521, + "step": 712 + }, + { + "epoch": 0.5717722534081796, + "grad_norm": 0.6651354432106018, + "learning_rate": 1.868038829760314e-05, + "loss": 0.9899, + "step": 713 + }, + { + "epoch": 0.5725741780272654, + "grad_norm": 0.6422202587127686, + "learning_rate": 1.8676085735078696e-05, + "loss": 0.9125, + "step": 714 + }, + { + "epoch": 0.5733761026463512, + "grad_norm": 0.6563206315040588, + "learning_rate": 1.8671776666929694e-05, + "loss": 0.9854, + "step": 715 + }, + { + "epoch": 0.5741780272654371, + "grad_norm": 0.5996401906013489, + "learning_rate": 1.8667461096387217e-05, + "loss": 0.9754, + "step": 716 + }, + { + "epoch": 0.5749799518845229, + "grad_norm": 0.5983526706695557, + "learning_rate": 1.866313902668723e-05, + "loss": 0.9524, + "step": 717 + }, + { + "epoch": 0.5757818765036087, + "grad_norm": 0.5966793298721313, + "learning_rate": 1.8658810461070566e-05, + "loss": 0.8934, + "step": 718 + }, + { + "epoch": 0.5765838011226945, + "grad_norm": 0.636679470539093, + "learning_rate": 1.865447540278293e-05, + "loss": 0.9368, + "step": 719 + }, + { + "epoch": 0.5773857257417803, + "grad_norm": 0.6102825403213501, + "learning_rate": 1.8650133855074905e-05, + "loss": 0.9498, + "step": 720 + }, + { + "epoch": 0.5781876503608661, + "grad_norm": 0.6652585864067078, + "learning_rate": 1.8645785821201918e-05, + "loss": 0.9235, + "step": 721 + }, + { + "epoch": 0.5789895749799518, + "grad_norm": 0.6623063087463379, + "learning_rate": 1.864143130442428e-05, + "loss": 0.965, + "step": 722 + }, + { + "epoch": 0.5797914995990376, + "grad_norm": 0.6263592839241028, + "learning_rate": 1.8637070308007156e-05, + "loss": 0.9354, + "step": 723 + }, + { + "epoch": 0.5805934242181235, + "grad_norm": 0.6858724355697632, + "learning_rate": 1.8632702835220572e-05, + "loss": 0.9467, + "step": 724 + }, + { + "epoch": 0.5813953488372093, + "grad_norm": 0.63621586561203, + "learning_rate": 1.8628328889339403e-05, + "loss": 0.8885, + "step": 725 + }, + { + "epoch": 0.5821972734562951, + "grad_norm": 0.629024088382721, + "learning_rate": 1.8623948473643383e-05, + "loss": 0.9344, + "step": 726 + }, + { + "epoch": 0.5829991980753809, + "grad_norm": 0.6625981330871582, + "learning_rate": 1.86195615914171e-05, + "loss": 0.9756, + "step": 727 + }, + { + "epoch": 0.5838011226944667, + "grad_norm": 0.6435332894325256, + "learning_rate": 1.8615168245949982e-05, + "loss": 0.9895, + "step": 728 + }, + { + "epoch": 0.5846030473135525, + "grad_norm": 0.6450731158256531, + "learning_rate": 1.8610768440536317e-05, + "loss": 0.9327, + "step": 729 + }, + { + "epoch": 0.5854049719326383, + "grad_norm": 0.6825403571128845, + "learning_rate": 1.8606362178475227e-05, + "loss": 0.961, + "step": 730 + }, + { + "epoch": 0.5862068965517241, + "grad_norm": 0.6117799878120422, + "learning_rate": 1.860194946307067e-05, + "loss": 0.9043, + "step": 731 + }, + { + "epoch": 0.58700882117081, + "grad_norm": 0.6143025159835815, + "learning_rate": 1.859753029763146e-05, + "loss": 0.993, + "step": 732 + }, + { + "epoch": 0.5878107457898958, + "grad_norm": 0.5972070693969727, + "learning_rate": 1.859310468547123e-05, + "loss": 0.9069, + "step": 733 + }, + { + "epoch": 0.5886126704089816, + "grad_norm": 0.6459053158760071, + "learning_rate": 1.8588672629908462e-05, + "loss": 0.9822, + "step": 734 + }, + { + "epoch": 0.5894145950280674, + "grad_norm": 0.674164891242981, + "learning_rate": 1.8584234134266456e-05, + "loss": 0.9833, + "step": 735 + }, + { + "epoch": 0.5902165196471532, + "grad_norm": 0.6549596190452576, + "learning_rate": 1.857978920187335e-05, + "loss": 0.9851, + "step": 736 + }, + { + "epoch": 0.591018444266239, + "grad_norm": 0.621340811252594, + "learning_rate": 1.85753378360621e-05, + "loss": 0.9541, + "step": 737 + }, + { + "epoch": 0.5918203688853247, + "grad_norm": 0.652487576007843, + "learning_rate": 1.8570880040170504e-05, + "loss": 0.9206, + "step": 738 + }, + { + "epoch": 0.5926222935044105, + "grad_norm": 0.6780267953872681, + "learning_rate": 1.8566415817541157e-05, + "loss": 0.9676, + "step": 739 + }, + { + "epoch": 0.5934242181234964, + "grad_norm": 0.6120235323905945, + "learning_rate": 1.8561945171521498e-05, + "loss": 0.9223, + "step": 740 + }, + { + "epoch": 0.5942261427425822, + "grad_norm": 0.6822912096977234, + "learning_rate": 1.8557468105463753e-05, + "loss": 0.9164, + "step": 741 + }, + { + "epoch": 0.595028067361668, + "grad_norm": 0.6549542546272278, + "learning_rate": 1.855298462272499e-05, + "loss": 0.9028, + "step": 742 + }, + { + "epoch": 0.5958299919807538, + "grad_norm": 0.6103249788284302, + "learning_rate": 1.8548494726667076e-05, + "loss": 0.9741, + "step": 743 + }, + { + "epoch": 0.5966319165998396, + "grad_norm": 0.6277962923049927, + "learning_rate": 1.8543998420656686e-05, + "loss": 0.9629, + "step": 744 + }, + { + "epoch": 0.5974338412189254, + "grad_norm": 0.6683188676834106, + "learning_rate": 1.8539495708065304e-05, + "loss": 1.0021, + "step": 745 + }, + { + "epoch": 0.5982357658380112, + "grad_norm": 0.621095597743988, + "learning_rate": 1.8534986592269218e-05, + "loss": 0.9854, + "step": 746 + }, + { + "epoch": 0.599037690457097, + "grad_norm": 0.6299651861190796, + "learning_rate": 1.853047107664951e-05, + "loss": 0.966, + "step": 747 + }, + { + "epoch": 0.5998396150761829, + "grad_norm": 0.7200894355773926, + "learning_rate": 1.852594916459208e-05, + "loss": 0.9201, + "step": 748 + }, + { + "epoch": 0.6006415396952687, + "grad_norm": 0.6269078850746155, + "learning_rate": 1.85214208594876e-05, + "loss": 1.0063, + "step": 749 + }, + { + "epoch": 0.6014434643143545, + "grad_norm": 0.5880782008171082, + "learning_rate": 1.8516886164731554e-05, + "loss": 0.9167, + "step": 750 + }, + { + "epoch": 0.6022453889334403, + "grad_norm": 0.6221625208854675, + "learning_rate": 1.851234508372421e-05, + "loss": 0.9314, + "step": 751 + }, + { + "epoch": 0.603047313552526, + "grad_norm": 0.6242570281028748, + "learning_rate": 1.850779761987062e-05, + "loss": 0.9383, + "step": 752 + }, + { + "epoch": 0.6038492381716118, + "grad_norm": 0.6036713719367981, + "learning_rate": 1.8503243776580637e-05, + "loss": 0.9046, + "step": 753 + }, + { + "epoch": 0.6046511627906976, + "grad_norm": 0.6600368022918701, + "learning_rate": 1.8498683557268878e-05, + "loss": 0.9427, + "step": 754 + }, + { + "epoch": 0.6054530874097834, + "grad_norm": 0.6118487119674683, + "learning_rate": 1.8494116965354756e-05, + "loss": 0.9301, + "step": 755 + }, + { + "epoch": 0.6062550120288693, + "grad_norm": 0.6600939035415649, + "learning_rate": 1.8489544004262456e-05, + "loss": 0.9867, + "step": 756 + }, + { + "epoch": 0.6070569366479551, + "grad_norm": 0.6410656571388245, + "learning_rate": 1.8484964677420937e-05, + "loss": 0.904, + "step": 757 + }, + { + "epoch": 0.6078588612670409, + "grad_norm": 0.6048609614372253, + "learning_rate": 1.848037898826394e-05, + "loss": 0.9244, + "step": 758 + }, + { + "epoch": 0.6086607858861267, + "grad_norm": 0.600308895111084, + "learning_rate": 1.8475786940229965e-05, + "loss": 0.9042, + "step": 759 + }, + { + "epoch": 0.6094627105052125, + "grad_norm": 0.6293653249740601, + "learning_rate": 1.847118853676229e-05, + "loss": 1.0067, + "step": 760 + }, + { + "epoch": 0.6102646351242983, + "grad_norm": 0.6423448324203491, + "learning_rate": 1.8466583781308954e-05, + "loss": 0.9437, + "step": 761 + }, + { + "epoch": 0.6110665597433841, + "grad_norm": 0.591410756111145, + "learning_rate": 1.846197267732276e-05, + "loss": 0.8932, + "step": 762 + }, + { + "epoch": 0.6118684843624699, + "grad_norm": 0.602726936340332, + "learning_rate": 1.845735522826127e-05, + "loss": 0.8843, + "step": 763 + }, + { + "epoch": 0.6126704089815558, + "grad_norm": 0.6235020756721497, + "learning_rate": 1.84527314375868e-05, + "loss": 0.9544, + "step": 764 + }, + { + "epoch": 0.6134723336006416, + "grad_norm": 0.6325739622116089, + "learning_rate": 1.8448101308766433e-05, + "loss": 0.8938, + "step": 765 + }, + { + "epoch": 0.6142742582197274, + "grad_norm": 0.6697767972946167, + "learning_rate": 1.8443464845271995e-05, + "loss": 0.9345, + "step": 766 + }, + { + "epoch": 0.6150761828388132, + "grad_norm": 0.6331246495246887, + "learning_rate": 1.843882205058006e-05, + "loss": 0.9425, + "step": 767 + }, + { + "epoch": 0.615878107457899, + "grad_norm": 0.7046418190002441, + "learning_rate": 1.8434172928171962e-05, + "loss": 0.9709, + "step": 768 + }, + { + "epoch": 0.6166800320769847, + "grad_norm": 0.7394378185272217, + "learning_rate": 1.8429517481533762e-05, + "loss": 0.9588, + "step": 769 + }, + { + "epoch": 0.6174819566960705, + "grad_norm": 0.6277191638946533, + "learning_rate": 1.8424855714156277e-05, + "loss": 0.9141, + "step": 770 + }, + { + "epoch": 0.6182838813151563, + "grad_norm": 0.6583722233772278, + "learning_rate": 1.842018762953506e-05, + "loss": 0.9488, + "step": 771 + }, + { + "epoch": 0.6190858059342422, + "grad_norm": 0.6868898272514343, + "learning_rate": 1.8415513231170398e-05, + "loss": 0.9369, + "step": 772 + }, + { + "epoch": 0.619887730553328, + "grad_norm": 0.6717788577079773, + "learning_rate": 1.8410832522567318e-05, + "loss": 0.9142, + "step": 773 + }, + { + "epoch": 0.6206896551724138, + "grad_norm": 0.5902653932571411, + "learning_rate": 1.8406145507235566e-05, + "loss": 0.8938, + "step": 774 + }, + { + "epoch": 0.6214915797914996, + "grad_norm": 0.644224226474762, + "learning_rate": 1.8401452188689635e-05, + "loss": 0.9601, + "step": 775 + }, + { + "epoch": 0.6222935044105854, + "grad_norm": 0.7199499607086182, + "learning_rate": 1.839675257044873e-05, + "loss": 0.9192, + "step": 776 + }, + { + "epoch": 0.6230954290296712, + "grad_norm": 0.7300478219985962, + "learning_rate": 1.8392046656036788e-05, + "loss": 0.9351, + "step": 777 + }, + { + "epoch": 0.623897353648757, + "grad_norm": 0.7216119170188904, + "learning_rate": 1.8387334448982454e-05, + "loss": 0.9561, + "step": 778 + }, + { + "epoch": 0.6246992782678428, + "grad_norm": 0.6239175200462341, + "learning_rate": 1.8382615952819116e-05, + "loss": 0.9391, + "step": 779 + }, + { + "epoch": 0.6255012028869287, + "grad_norm": 0.6322103142738342, + "learning_rate": 1.8377891171084858e-05, + "loss": 0.998, + "step": 780 + }, + { + "epoch": 0.6263031275060145, + "grad_norm": 0.681839644908905, + "learning_rate": 1.8373160107322476e-05, + "loss": 0.9308, + "step": 781 + }, + { + "epoch": 0.6271050521251003, + "grad_norm": 0.6046080589294434, + "learning_rate": 1.8368422765079486e-05, + "loss": 0.9486, + "step": 782 + }, + { + "epoch": 0.627906976744186, + "grad_norm": 0.675331711769104, + "learning_rate": 1.8363679147908115e-05, + "loss": 0.907, + "step": 783 + }, + { + "epoch": 0.6287089013632718, + "grad_norm": 0.6665434241294861, + "learning_rate": 1.835892925936528e-05, + "loss": 0.9345, + "step": 784 + }, + { + "epoch": 0.6295108259823576, + "grad_norm": 0.6315925717353821, + "learning_rate": 1.8354173103012614e-05, + "loss": 0.9132, + "step": 785 + }, + { + "epoch": 0.6303127506014434, + "grad_norm": 0.6950697302818298, + "learning_rate": 1.8349410682416442e-05, + "loss": 0.8736, + "step": 786 + }, + { + "epoch": 0.6311146752205292, + "grad_norm": 0.6428248286247253, + "learning_rate": 1.8344642001147793e-05, + "loss": 0.9271, + "step": 787 + }, + { + "epoch": 0.6319165998396151, + "grad_norm": 0.6300097107887268, + "learning_rate": 1.8339867062782384e-05, + "loss": 0.9271, + "step": 788 + }, + { + "epoch": 0.6327185244587009, + "grad_norm": 0.6257496476173401, + "learning_rate": 1.8335085870900627e-05, + "loss": 0.9489, + "step": 789 + }, + { + "epoch": 0.6335204490777867, + "grad_norm": 0.5959362983703613, + "learning_rate": 1.8330298429087624e-05, + "loss": 0.926, + "step": 790 + }, + { + "epoch": 0.6343223736968725, + "grad_norm": 0.6299023032188416, + "learning_rate": 1.8325504740933157e-05, + "loss": 0.948, + "step": 791 + }, + { + "epoch": 0.6351242983159583, + "grad_norm": 0.632050633430481, + "learning_rate": 1.8320704810031702e-05, + "loss": 0.9001, + "step": 792 + }, + { + "epoch": 0.6359262229350441, + "grad_norm": 0.635412335395813, + "learning_rate": 1.8315898639982404e-05, + "loss": 0.8965, + "step": 793 + }, + { + "epoch": 0.6367281475541299, + "grad_norm": 0.5949950218200684, + "learning_rate": 1.8311086234389104e-05, + "loss": 0.9294, + "step": 794 + }, + { + "epoch": 0.6375300721732157, + "grad_norm": 0.6535398364067078, + "learning_rate": 1.83062675968603e-05, + "loss": 0.9333, + "step": 795 + }, + { + "epoch": 0.6383319967923016, + "grad_norm": 0.6044979095458984, + "learning_rate": 1.8301442731009168e-05, + "loss": 0.91, + "step": 796 + }, + { + "epoch": 0.6391339214113874, + "grad_norm": 0.607458770275116, + "learning_rate": 1.8296611640453562e-05, + "loss": 0.9109, + "step": 797 + }, + { + "epoch": 0.6399358460304732, + "grad_norm": 0.6543724536895752, + "learning_rate": 1.8291774328816e-05, + "loss": 0.9502, + "step": 798 + }, + { + "epoch": 0.640737770649559, + "grad_norm": 0.5994077920913696, + "learning_rate": 1.8286930799723658e-05, + "loss": 0.8956, + "step": 799 + }, + { + "epoch": 0.6415396952686447, + "grad_norm": 0.5721734166145325, + "learning_rate": 1.828208105680838e-05, + "loss": 0.9113, + "step": 800 + }, + { + "epoch": 0.6423416198877305, + "grad_norm": 0.611034631729126, + "learning_rate": 1.827722510370667e-05, + "loss": 0.9111, + "step": 801 + }, + { + "epoch": 0.6431435445068163, + "grad_norm": 0.6357942819595337, + "learning_rate": 1.8272362944059684e-05, + "loss": 0.9313, + "step": 802 + }, + { + "epoch": 0.6439454691259021, + "grad_norm": 0.6018952131271362, + "learning_rate": 1.8267494581513236e-05, + "loss": 0.9279, + "step": 803 + }, + { + "epoch": 0.644747393744988, + "grad_norm": 0.5941787958145142, + "learning_rate": 1.8262620019717794e-05, + "loss": 0.9433, + "step": 804 + }, + { + "epoch": 0.6455493183640738, + "grad_norm": 0.601996123790741, + "learning_rate": 1.825773926232847e-05, + "loss": 0.9269, + "step": 805 + }, + { + "epoch": 0.6463512429831596, + "grad_norm": 0.6564491987228394, + "learning_rate": 1.8252852313005015e-05, + "loss": 0.9359, + "step": 806 + }, + { + "epoch": 0.6471531676022454, + "grad_norm": 0.6465602517127991, + "learning_rate": 1.8247959175411836e-05, + "loss": 0.9534, + "step": 807 + }, + { + "epoch": 0.6479550922213312, + "grad_norm": 0.6189476251602173, + "learning_rate": 1.824305985321797e-05, + "loss": 0.939, + "step": 808 + }, + { + "epoch": 0.648757016840417, + "grad_norm": 0.5999793410301208, + "learning_rate": 1.8238154350097103e-05, + "loss": 0.9447, + "step": 809 + }, + { + "epoch": 0.6495589414595028, + "grad_norm": 0.6540852785110474, + "learning_rate": 1.8233242669727544e-05, + "loss": 0.917, + "step": 810 + }, + { + "epoch": 0.6503608660785886, + "grad_norm": 0.6192000508308411, + "learning_rate": 1.8228324815792236e-05, + "loss": 0.921, + "step": 811 + }, + { + "epoch": 0.6511627906976745, + "grad_norm": 0.6083493232727051, + "learning_rate": 1.8223400791978756e-05, + "loss": 0.9884, + "step": 812 + }, + { + "epoch": 0.6519647153167603, + "grad_norm": 0.6045847535133362, + "learning_rate": 1.8218470601979302e-05, + "loss": 0.9191, + "step": 813 + }, + { + "epoch": 0.652766639935846, + "grad_norm": 0.5809303522109985, + "learning_rate": 1.8213534249490706e-05, + "loss": 0.9332, + "step": 814 + }, + { + "epoch": 0.6535685645549318, + "grad_norm": 0.5929029583930969, + "learning_rate": 1.8208591738214403e-05, + "loss": 0.9094, + "step": 815 + }, + { + "epoch": 0.6543704891740176, + "grad_norm": 0.6310725212097168, + "learning_rate": 1.8203643071856462e-05, + "loss": 0.9628, + "step": 816 + }, + { + "epoch": 0.6551724137931034, + "grad_norm": 0.664486825466156, + "learning_rate": 1.819868825412756e-05, + "loss": 0.9297, + "step": 817 + }, + { + "epoch": 0.6559743384121892, + "grad_norm": 0.6123178601264954, + "learning_rate": 1.8193727288742987e-05, + "loss": 0.9559, + "step": 818 + }, + { + "epoch": 0.656776263031275, + "grad_norm": 0.6100270748138428, + "learning_rate": 1.818876017942265e-05, + "loss": 0.921, + "step": 819 + }, + { + "epoch": 0.6575781876503609, + "grad_norm": 0.6273778080940247, + "learning_rate": 1.818378692989105e-05, + "loss": 0.9472, + "step": 820 + }, + { + "epoch": 0.6583801122694467, + "grad_norm": 0.6654192805290222, + "learning_rate": 1.8178807543877303e-05, + "loss": 0.9388, + "step": 821 + }, + { + "epoch": 0.6591820368885325, + "grad_norm": 0.6100279688835144, + "learning_rate": 1.817382202511512e-05, + "loss": 0.9477, + "step": 822 + }, + { + "epoch": 0.6599839615076183, + "grad_norm": 0.6125680208206177, + "learning_rate": 1.816883037734281e-05, + "loss": 0.9419, + "step": 823 + }, + { + "epoch": 0.6607858861267041, + "grad_norm": 0.6193715333938599, + "learning_rate": 1.8163832604303284e-05, + "loss": 1.0237, + "step": 824 + }, + { + "epoch": 0.6615878107457899, + "grad_norm": 0.6256586313247681, + "learning_rate": 1.815882870974404e-05, + "loss": 0.9228, + "step": 825 + }, + { + "epoch": 0.6623897353648757, + "grad_norm": 0.6021474599838257, + "learning_rate": 1.8153818697417176e-05, + "loss": 0.9198, + "step": 826 + }, + { + "epoch": 0.6631916599839615, + "grad_norm": 0.5720776319503784, + "learning_rate": 1.814880257107936e-05, + "loss": 0.8507, + "step": 827 + }, + { + "epoch": 0.6639935846030474, + "grad_norm": 0.5865132808685303, + "learning_rate": 1.8143780334491863e-05, + "loss": 0.9298, + "step": 828 + }, + { + "epoch": 0.6647955092221332, + "grad_norm": 0.585963785648346, + "learning_rate": 1.8138751991420524e-05, + "loss": 0.8927, + "step": 829 + }, + { + "epoch": 0.6655974338412189, + "grad_norm": 0.6248182058334351, + "learning_rate": 1.8133717545635764e-05, + "loss": 0.972, + "step": 830 + }, + { + "epoch": 0.6663993584603047, + "grad_norm": 0.6154810190200806, + "learning_rate": 1.812867700091258e-05, + "loss": 0.9437, + "step": 831 + }, + { + "epoch": 0.6672012830793905, + "grad_norm": 0.603408932685852, + "learning_rate": 1.8123630361030557e-05, + "loss": 0.8818, + "step": 832 + }, + { + "epoch": 0.6680032076984763, + "grad_norm": 0.5872328877449036, + "learning_rate": 1.8118577629773824e-05, + "loss": 0.9342, + "step": 833 + }, + { + "epoch": 0.6688051323175621, + "grad_norm": 0.5850470066070557, + "learning_rate": 1.81135188109311e-05, + "loss": 0.9535, + "step": 834 + }, + { + "epoch": 0.6696070569366479, + "grad_norm": 0.6239657402038574, + "learning_rate": 1.8108453908295655e-05, + "loss": 0.9408, + "step": 835 + }, + { + "epoch": 0.6704089815557338, + "grad_norm": 0.6208472847938538, + "learning_rate": 1.8103382925665324e-05, + "loss": 0.9907, + "step": 836 + }, + { + "epoch": 0.6712109061748196, + "grad_norm": 0.5864999890327454, + "learning_rate": 1.8098305866842506e-05, + "loss": 0.964, + "step": 837 + }, + { + "epoch": 0.6720128307939054, + "grad_norm": 0.6111268997192383, + "learning_rate": 1.809322273563415e-05, + "loss": 0.969, + "step": 838 + }, + { + "epoch": 0.6728147554129912, + "grad_norm": 0.6360272169113159, + "learning_rate": 1.8088133535851763e-05, + "loss": 0.9177, + "step": 839 + }, + { + "epoch": 0.673616680032077, + "grad_norm": 0.6175538897514343, + "learning_rate": 1.80830382713114e-05, + "loss": 0.9047, + "step": 840 + }, + { + "epoch": 0.6744186046511628, + "grad_norm": 0.6100848317146301, + "learning_rate": 1.8077936945833662e-05, + "loss": 0.9443, + "step": 841 + }, + { + "epoch": 0.6752205292702486, + "grad_norm": 0.6124653220176697, + "learning_rate": 1.80728295632437e-05, + "loss": 0.9368, + "step": 842 + }, + { + "epoch": 0.6760224538893344, + "grad_norm": 0.6022012829780579, + "learning_rate": 1.8067716127371197e-05, + "loss": 0.9087, + "step": 843 + }, + { + "epoch": 0.6768243785084203, + "grad_norm": 0.6640161275863647, + "learning_rate": 1.806259664205039e-05, + "loss": 0.9418, + "step": 844 + }, + { + "epoch": 0.677626303127506, + "grad_norm": 0.5954174995422363, + "learning_rate": 1.805747111112004e-05, + "loss": 0.9169, + "step": 845 + }, + { + "epoch": 0.6784282277465918, + "grad_norm": 0.6202585101127625, + "learning_rate": 1.805233953842344e-05, + "loss": 0.9537, + "step": 846 + }, + { + "epoch": 0.6792301523656776, + "grad_norm": 0.5560839176177979, + "learning_rate": 1.8047201927808423e-05, + "loss": 0.9279, + "step": 847 + }, + { + "epoch": 0.6800320769847634, + "grad_norm": 0.6648291945457458, + "learning_rate": 1.8042058283127345e-05, + "loss": 0.934, + "step": 848 + }, + { + "epoch": 0.6808340016038492, + "grad_norm": 0.7005195021629333, + "learning_rate": 1.8036908608237085e-05, + "loss": 0.9258, + "step": 849 + }, + { + "epoch": 0.681635926222935, + "grad_norm": 0.6536465883255005, + "learning_rate": 1.803175290699904e-05, + "loss": 0.9609, + "step": 850 + }, + { + "epoch": 0.6824378508420208, + "grad_norm": 0.6565441489219666, + "learning_rate": 1.8026591183279136e-05, + "loss": 0.9085, + "step": 851 + }, + { + "epoch": 0.6832397754611067, + "grad_norm": 0.6199874877929688, + "learning_rate": 1.8021423440947808e-05, + "loss": 0.9386, + "step": 852 + }, + { + "epoch": 0.6840417000801925, + "grad_norm": 0.6430292725563049, + "learning_rate": 1.801624968388e-05, + "loss": 0.9389, + "step": 853 + }, + { + "epoch": 0.6848436246992783, + "grad_norm": 0.601648211479187, + "learning_rate": 1.801106991595518e-05, + "loss": 0.9225, + "step": 854 + }, + { + "epoch": 0.6856455493183641, + "grad_norm": 0.591111421585083, + "learning_rate": 1.800588414105731e-05, + "loss": 0.9545, + "step": 855 + }, + { + "epoch": 0.6864474739374499, + "grad_norm": 0.6806792616844177, + "learning_rate": 1.8000692363074862e-05, + "loss": 0.942, + "step": 856 + }, + { + "epoch": 0.6872493985565357, + "grad_norm": 0.5764021277427673, + "learning_rate": 1.7995494585900802e-05, + "loss": 0.9303, + "step": 857 + }, + { + "epoch": 0.6880513231756215, + "grad_norm": 0.6204013228416443, + "learning_rate": 1.7990290813432613e-05, + "loss": 0.955, + "step": 858 + }, + { + "epoch": 0.6888532477947072, + "grad_norm": 0.618166446685791, + "learning_rate": 1.7985081049572244e-05, + "loss": 0.9287, + "step": 859 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.5855494141578674, + "learning_rate": 1.797986529822617e-05, + "loss": 0.9297, + "step": 860 + }, + { + "epoch": 0.6904570970328789, + "grad_norm": 0.6061149835586548, + "learning_rate": 1.7974643563305326e-05, + "loss": 0.9884, + "step": 861 + }, + { + "epoch": 0.6912590216519647, + "grad_norm": 0.5847954750061035, + "learning_rate": 1.7969415848725155e-05, + "loss": 0.9607, + "step": 862 + }, + { + "epoch": 0.6920609462710505, + "grad_norm": 0.652940034866333, + "learning_rate": 1.7964182158405567e-05, + "loss": 0.9519, + "step": 863 + }, + { + "epoch": 0.6928628708901363, + "grad_norm": 0.6230655908584595, + "learning_rate": 1.795894249627097e-05, + "loss": 0.9627, + "step": 864 + }, + { + "epoch": 0.6936647955092221, + "grad_norm": 0.5886598825454712, + "learning_rate": 1.795369686625024e-05, + "loss": 0.8989, + "step": 865 + }, + { + "epoch": 0.6944667201283079, + "grad_norm": 0.6408997178077698, + "learning_rate": 1.7948445272276727e-05, + "loss": 0.9438, + "step": 866 + }, + { + "epoch": 0.6952686447473937, + "grad_norm": 0.6148324012756348, + "learning_rate": 1.794318771828825e-05, + "loss": 0.9283, + "step": 867 + }, + { + "epoch": 0.6960705693664796, + "grad_norm": 0.6214705109596252, + "learning_rate": 1.793792420822711e-05, + "loss": 0.955, + "step": 868 + }, + { + "epoch": 0.6968724939855654, + "grad_norm": 0.6310122013092041, + "learning_rate": 1.7932654746040063e-05, + "loss": 0.9252, + "step": 869 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 0.6389340758323669, + "learning_rate": 1.7927379335678333e-05, + "loss": 0.9219, + "step": 870 + }, + { + "epoch": 0.698476343223737, + "grad_norm": 0.597550630569458, + "learning_rate": 1.7922097981097596e-05, + "loss": 0.9396, + "step": 871 + }, + { + "epoch": 0.6992782678428228, + "grad_norm": 0.5725171566009521, + "learning_rate": 1.7916810686257998e-05, + "loss": 0.9493, + "step": 872 + }, + { + "epoch": 0.7000801924619086, + "grad_norm": 0.5874449014663696, + "learning_rate": 1.791151745512413e-05, + "loss": 0.9032, + "step": 873 + }, + { + "epoch": 0.7008821170809943, + "grad_norm": 0.6453227400779724, + "learning_rate": 1.790621829166504e-05, + "loss": 0.8794, + "step": 874 + }, + { + "epoch": 0.7016840417000801, + "grad_norm": 0.5927412509918213, + "learning_rate": 1.7900913199854218e-05, + "loss": 0.946, + "step": 875 + }, + { + "epoch": 0.702485966319166, + "grad_norm": 0.6199756860733032, + "learning_rate": 1.7895602183669602e-05, + "loss": 0.9298, + "step": 876 + }, + { + "epoch": 0.7032878909382518, + "grad_norm": 0.6697978377342224, + "learning_rate": 1.7890285247093574e-05, + "loss": 0.9928, + "step": 877 + }, + { + "epoch": 0.7040898155573376, + "grad_norm": 0.6035749912261963, + "learning_rate": 1.7884962394112953e-05, + "loss": 0.9256, + "step": 878 + }, + { + "epoch": 0.7048917401764234, + "grad_norm": 0.6426158547401428, + "learning_rate": 1.7879633628719e-05, + "loss": 0.9228, + "step": 879 + }, + { + "epoch": 0.7056936647955092, + "grad_norm": 0.5659704804420471, + "learning_rate": 1.7874298954907405e-05, + "loss": 0.9229, + "step": 880 + }, + { + "epoch": 0.706495589414595, + "grad_norm": 0.598106324672699, + "learning_rate": 1.786895837667828e-05, + "loss": 0.9421, + "step": 881 + }, + { + "epoch": 0.7072975140336808, + "grad_norm": 0.5607869029045105, + "learning_rate": 1.7863611898036175e-05, + "loss": 0.9289, + "step": 882 + }, + { + "epoch": 0.7080994386527666, + "grad_norm": 0.6277954578399658, + "learning_rate": 1.7858259522990067e-05, + "loss": 0.9785, + "step": 883 + }, + { + "epoch": 0.7089013632718525, + "grad_norm": 0.7224546670913696, + "learning_rate": 1.7852901255553346e-05, + "loss": 0.9637, + "step": 884 + }, + { + "epoch": 0.7097032878909383, + "grad_norm": 0.5827512145042419, + "learning_rate": 1.7847537099743824e-05, + "loss": 0.8912, + "step": 885 + }, + { + "epoch": 0.7105052125100241, + "grad_norm": 0.6022170186042786, + "learning_rate": 1.7842167059583723e-05, + "loss": 0.9232, + "step": 886 + }, + { + "epoch": 0.7113071371291099, + "grad_norm": 0.6745474934577942, + "learning_rate": 1.783679113909969e-05, + "loss": 0.9659, + "step": 887 + }, + { + "epoch": 0.7121090617481957, + "grad_norm": 0.6338194608688354, + "learning_rate": 1.7831409342322766e-05, + "loss": 0.9329, + "step": 888 + }, + { + "epoch": 0.7129109863672815, + "grad_norm": 0.638043224811554, + "learning_rate": 1.7826021673288413e-05, + "loss": 0.9881, + "step": 889 + }, + { + "epoch": 0.7137129109863672, + "grad_norm": 0.5955981016159058, + "learning_rate": 1.7820628136036483e-05, + "loss": 0.908, + "step": 890 + }, + { + "epoch": 0.714514835605453, + "grad_norm": 0.6151586771011353, + "learning_rate": 1.7815228734611233e-05, + "loss": 0.9438, + "step": 891 + }, + { + "epoch": 0.7153167602245389, + "grad_norm": 0.6584967970848083, + "learning_rate": 1.7809823473061324e-05, + "loss": 0.9605, + "step": 892 + }, + { + "epoch": 0.7161186848436247, + "grad_norm": 0.6425672769546509, + "learning_rate": 1.7804412355439803e-05, + "loss": 0.9248, + "step": 893 + }, + { + "epoch": 0.7169206094627105, + "grad_norm": 0.6150190234184265, + "learning_rate": 1.7798995385804107e-05, + "loss": 0.874, + "step": 894 + }, + { + "epoch": 0.7177225340817963, + "grad_norm": 0.6137731075286865, + "learning_rate": 1.7793572568216063e-05, + "loss": 0.9333, + "step": 895 + }, + { + "epoch": 0.7185244587008821, + "grad_norm": 0.6165148019790649, + "learning_rate": 1.778814390674189e-05, + "loss": 0.9447, + "step": 896 + }, + { + "epoch": 0.7193263833199679, + "grad_norm": 0.6286599040031433, + "learning_rate": 1.7782709405452184e-05, + "loss": 0.8696, + "step": 897 + }, + { + "epoch": 0.7201283079390537, + "grad_norm": 0.6361931562423706, + "learning_rate": 1.777726906842191e-05, + "loss": 0.9514, + "step": 898 + }, + { + "epoch": 0.7209302325581395, + "grad_norm": 0.7451531887054443, + "learning_rate": 1.777182289973043e-05, + "loss": 0.9249, + "step": 899 + }, + { + "epoch": 0.7217321571772254, + "grad_norm": 0.6656950116157532, + "learning_rate": 1.776637090346146e-05, + "loss": 0.8892, + "step": 900 + }, + { + "epoch": 0.7225340817963112, + "grad_norm": 0.7215845584869385, + "learning_rate": 1.7760913083703088e-05, + "loss": 0.8965, + "step": 901 + }, + { + "epoch": 0.723336006415397, + "grad_norm": 0.6370206475257874, + "learning_rate": 1.7755449444547783e-05, + "loss": 0.954, + "step": 902 + }, + { + "epoch": 0.7241379310344828, + "grad_norm": 0.6504797339439392, + "learning_rate": 1.7749979990092364e-05, + "loss": 0.8996, + "step": 903 + }, + { + "epoch": 0.7249398556535686, + "grad_norm": 1.0392930507659912, + "learning_rate": 1.774450472443801e-05, + "loss": 0.9311, + "step": 904 + }, + { + "epoch": 0.7257417802726543, + "grad_norm": 0.6109988689422607, + "learning_rate": 1.7739023651690267e-05, + "loss": 0.9424, + "step": 905 + }, + { + "epoch": 0.7265437048917401, + "grad_norm": 0.677939772605896, + "learning_rate": 1.7733536775959027e-05, + "loss": 0.9875, + "step": 906 + }, + { + "epoch": 0.7273456295108259, + "grad_norm": 0.5968044996261597, + "learning_rate": 1.7728044101358538e-05, + "loss": 0.9088, + "step": 907 + }, + { + "epoch": 0.7281475541299118, + "grad_norm": 0.5922111868858337, + "learning_rate": 1.7722545632007394e-05, + "loss": 0.9306, + "step": 908 + }, + { + "epoch": 0.7289494787489976, + "grad_norm": 0.6236370801925659, + "learning_rate": 1.771704137202853e-05, + "loss": 0.9302, + "step": 909 + }, + { + "epoch": 0.7297514033680834, + "grad_norm": 0.5695316195487976, + "learning_rate": 1.771153132554924e-05, + "loss": 0.8739, + "step": 910 + }, + { + "epoch": 0.7305533279871692, + "grad_norm": 0.6526502966880798, + "learning_rate": 1.770601549670113e-05, + "loss": 0.8775, + "step": 911 + }, + { + "epoch": 0.731355252606255, + "grad_norm": 0.5602655410766602, + "learning_rate": 1.7700493889620163e-05, + "loss": 0.9219, + "step": 912 + }, + { + "epoch": 0.7321571772253408, + "grad_norm": 0.6553357839584351, + "learning_rate": 1.769496650844663e-05, + "loss": 0.9495, + "step": 913 + }, + { + "epoch": 0.7329591018444266, + "grad_norm": 0.6299150586128235, + "learning_rate": 1.768943335732515e-05, + "loss": 0.8947, + "step": 914 + }, + { + "epoch": 0.7337610264635124, + "grad_norm": 0.6235426664352417, + "learning_rate": 1.7683894440404663e-05, + "loss": 0.9501, + "step": 915 + }, + { + "epoch": 0.7345629510825983, + "grad_norm": 0.6619741916656494, + "learning_rate": 1.7678349761838438e-05, + "loss": 0.9679, + "step": 916 + }, + { + "epoch": 0.7353648757016841, + "grad_norm": 0.6198428869247437, + "learning_rate": 1.7672799325784066e-05, + "loss": 0.9105, + "step": 917 + }, + { + "epoch": 0.7361668003207699, + "grad_norm": 2.144275426864624, + "learning_rate": 1.7667243136403455e-05, + "loss": 0.8585, + "step": 918 + }, + { + "epoch": 0.7369687249398557, + "grad_norm": 0.6071990728378296, + "learning_rate": 1.7661681197862823e-05, + "loss": 0.9773, + "step": 919 + }, + { + "epoch": 0.7377706495589414, + "grad_norm": 0.5878254175186157, + "learning_rate": 1.76561135143327e-05, + "loss": 0.9344, + "step": 920 + }, + { + "epoch": 0.7385725741780272, + "grad_norm": 0.616235077381134, + "learning_rate": 1.7650540089987926e-05, + "loss": 0.8986, + "step": 921 + }, + { + "epoch": 0.739374498797113, + "grad_norm": 0.6328748464584351, + "learning_rate": 1.7644960929007642e-05, + "loss": 0.9162, + "step": 922 + }, + { + "epoch": 0.7401764234161988, + "grad_norm": 0.610571563243866, + "learning_rate": 1.7639376035575296e-05, + "loss": 0.9292, + "step": 923 + }, + { + "epoch": 0.7409783480352847, + "grad_norm": 0.6044664978981018, + "learning_rate": 1.7633785413878634e-05, + "loss": 0.9503, + "step": 924 + }, + { + "epoch": 0.7417802726543705, + "grad_norm": 0.5878413319587708, + "learning_rate": 1.762818906810969e-05, + "loss": 0.8839, + "step": 925 + }, + { + "epoch": 0.7425821972734563, + "grad_norm": 0.99688321352005, + "learning_rate": 1.7622587002464792e-05, + "loss": 0.9361, + "step": 926 + }, + { + "epoch": 0.7433841218925421, + "grad_norm": 6.020138263702393, + "learning_rate": 1.7616979221144565e-05, + "loss": 0.9249, + "step": 927 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 0.65313321352005, + "learning_rate": 1.7611365728353907e-05, + "loss": 0.8932, + "step": 928 + }, + { + "epoch": 0.7449879711307137, + "grad_norm": 0.6319687962532043, + "learning_rate": 1.7605746528302017e-05, + "loss": 0.9224, + "step": 929 + }, + { + "epoch": 0.7457898957497995, + "grad_norm": 0.6352254152297974, + "learning_rate": 1.760012162520236e-05, + "loss": 0.9589, + "step": 930 + }, + { + "epoch": 0.7465918203688853, + "grad_norm": 0.6238382458686829, + "learning_rate": 1.759449102327267e-05, + "loss": 0.9495, + "step": 931 + }, + { + "epoch": 0.7473937449879712, + "grad_norm": 0.6095400452613831, + "learning_rate": 1.7588854726734974e-05, + "loss": 0.9395, + "step": 932 + }, + { + "epoch": 0.748195669607057, + "grad_norm": 0.5706982016563416, + "learning_rate": 1.7583212739815555e-05, + "loss": 0.9041, + "step": 933 + }, + { + "epoch": 0.7489975942261428, + "grad_norm": 0.5789833664894104, + "learning_rate": 1.757756506674497e-05, + "loss": 0.8945, + "step": 934 + }, + { + "epoch": 0.7497995188452286, + "grad_norm": 0.5853317975997925, + "learning_rate": 1.7571911711758032e-05, + "loss": 0.9189, + "step": 935 + }, + { + "epoch": 0.7506014434643143, + "grad_norm": 0.6032062768936157, + "learning_rate": 1.7566252679093826e-05, + "loss": 0.9125, + "step": 936 + }, + { + "epoch": 0.7514033680834001, + "grad_norm": 0.6213047504425049, + "learning_rate": 1.7560587972995678e-05, + "loss": 0.9299, + "step": 937 + }, + { + "epoch": 0.7522052927024859, + "grad_norm": 0.68639075756073, + "learning_rate": 1.7554917597711188e-05, + "loss": 0.9627, + "step": 938 + }, + { + "epoch": 0.7530072173215717, + "grad_norm": 0.5955672264099121, + "learning_rate": 1.7549241557492187e-05, + "loss": 0.9761, + "step": 939 + }, + { + "epoch": 0.7538091419406576, + "grad_norm": 0.6289668679237366, + "learning_rate": 1.754355985659477e-05, + "loss": 0.9432, + "step": 940 + }, + { + "epoch": 0.7546110665597434, + "grad_norm": 0.621410071849823, + "learning_rate": 1.7537872499279265e-05, + "loss": 0.9221, + "step": 941 + }, + { + "epoch": 0.7554129911788292, + "grad_norm": 1.4276875257492065, + "learning_rate": 1.753217948981025e-05, + "loss": 0.9208, + "step": 942 + }, + { + "epoch": 0.756214915797915, + "grad_norm": 0.6688864231109619, + "learning_rate": 1.7526480832456538e-05, + "loss": 0.9107, + "step": 943 + }, + { + "epoch": 0.7570168404170008, + "grad_norm": 0.6159544587135315, + "learning_rate": 1.752077653149117e-05, + "loss": 0.9473, + "step": 944 + }, + { + "epoch": 0.7578187650360866, + "grad_norm": 0.6279981732368469, + "learning_rate": 1.751506659119143e-05, + "loss": 0.908, + "step": 945 + }, + { + "epoch": 0.7586206896551724, + "grad_norm": 0.6103554964065552, + "learning_rate": 1.750935101583883e-05, + "loss": 0.9023, + "step": 946 + }, + { + "epoch": 0.7594226142742582, + "grad_norm": 0.6202152371406555, + "learning_rate": 1.7503629809719095e-05, + "loss": 0.9256, + "step": 947 + }, + { + "epoch": 0.7602245388933441, + "grad_norm": 0.7202157378196716, + "learning_rate": 1.749790297712218e-05, + "loss": 0.93, + "step": 948 + }, + { + "epoch": 0.7610264635124299, + "grad_norm": 0.6290937066078186, + "learning_rate": 1.7492170522342267e-05, + "loss": 0.9029, + "step": 949 + }, + { + "epoch": 0.7618283881315157, + "grad_norm": 0.6004471778869629, + "learning_rate": 1.748643244967774e-05, + "loss": 0.9173, + "step": 950 + }, + { + "epoch": 0.7626303127506014, + "grad_norm": 0.6735373139381409, + "learning_rate": 1.7480688763431203e-05, + "loss": 0.9121, + "step": 951 + }, + { + "epoch": 0.7634322373696872, + "grad_norm": 0.6806999444961548, + "learning_rate": 1.7474939467909468e-05, + "loss": 0.9696, + "step": 952 + }, + { + "epoch": 0.764234161988773, + "grad_norm": 0.6270620822906494, + "learning_rate": 1.7469184567423548e-05, + "loss": 0.8985, + "step": 953 + }, + { + "epoch": 0.7650360866078588, + "grad_norm": 0.660423994064331, + "learning_rate": 1.7463424066288668e-05, + "loss": 0.9334, + "step": 954 + }, + { + "epoch": 0.7658380112269446, + "grad_norm": 0.6471710205078125, + "learning_rate": 1.745765796882425e-05, + "loss": 0.9471, + "step": 955 + }, + { + "epoch": 0.7666399358460305, + "grad_norm": 0.5963034629821777, + "learning_rate": 1.7451886279353905e-05, + "loss": 0.8939, + "step": 956 + }, + { + "epoch": 0.7674418604651163, + "grad_norm": 0.6383598446846008, + "learning_rate": 1.7446109002205444e-05, + "loss": 0.9114, + "step": 957 + }, + { + "epoch": 0.7682437850842021, + "grad_norm": 0.6523898839950562, + "learning_rate": 1.744032614171087e-05, + "loss": 0.9375, + "step": 958 + }, + { + "epoch": 0.7690457097032879, + "grad_norm": 0.6452939510345459, + "learning_rate": 1.743453770220636e-05, + "loss": 0.9317, + "step": 959 + }, + { + "epoch": 0.7698476343223737, + "grad_norm": 0.6215782165527344, + "learning_rate": 1.7428743688032292e-05, + "loss": 0.9467, + "step": 960 + }, + { + "epoch": 0.7706495589414595, + "grad_norm": 0.6118282675743103, + "learning_rate": 1.7422944103533212e-05, + "loss": 0.9916, + "step": 961 + }, + { + "epoch": 0.7714514835605453, + "grad_norm": 0.6718006730079651, + "learning_rate": 1.7417138953057847e-05, + "loss": 0.9415, + "step": 962 + }, + { + "epoch": 0.7722534081796311, + "grad_norm": 0.6651148200035095, + "learning_rate": 1.7411328240959095e-05, + "loss": 0.9109, + "step": 963 + }, + { + "epoch": 0.773055332798717, + "grad_norm": 0.6115936636924744, + "learning_rate": 1.7405511971594022e-05, + "loss": 0.9311, + "step": 964 + }, + { + "epoch": 0.7738572574178028, + "grad_norm": 0.6545907855033875, + "learning_rate": 1.739969014932387e-05, + "loss": 0.8825, + "step": 965 + }, + { + "epoch": 0.7746591820368885, + "grad_norm": 0.6140168905258179, + "learning_rate": 1.7393862778514042e-05, + "loss": 0.9522, + "step": 966 + }, + { + "epoch": 0.7754611066559743, + "grad_norm": 0.5883581638336182, + "learning_rate": 1.738802986353409e-05, + "loss": 0.8981, + "step": 967 + }, + { + "epoch": 0.7762630312750601, + "grad_norm": 0.6271137595176697, + "learning_rate": 1.7382191408757744e-05, + "loss": 0.9418, + "step": 968 + }, + { + "epoch": 0.7770649558941459, + "grad_norm": 0.6357349157333374, + "learning_rate": 1.7376347418562866e-05, + "loss": 0.894, + "step": 969 + }, + { + "epoch": 0.7778668805132317, + "grad_norm": 0.6148669123649597, + "learning_rate": 1.7370497897331486e-05, + "loss": 0.9197, + "step": 970 + }, + { + "epoch": 0.7786688051323175, + "grad_norm": 0.589157223701477, + "learning_rate": 1.7364642849449767e-05, + "loss": 0.952, + "step": 971 + }, + { + "epoch": 0.7794707297514034, + "grad_norm": 0.6091080904006958, + "learning_rate": 1.735878227930803e-05, + "loss": 0.9722, + "step": 972 + }, + { + "epoch": 0.7802726543704892, + "grad_norm": 0.6389529705047607, + "learning_rate": 1.735291619130073e-05, + "loss": 0.924, + "step": 973 + }, + { + "epoch": 0.781074578989575, + "grad_norm": 0.6408534049987793, + "learning_rate": 1.7347044589826455e-05, + "loss": 0.9491, + "step": 974 + }, + { + "epoch": 0.7818765036086608, + "grad_norm": 0.5990006327629089, + "learning_rate": 1.7341167479287934e-05, + "loss": 0.9298, + "step": 975 + }, + { + "epoch": 0.7826784282277466, + "grad_norm": 0.5884609222412109, + "learning_rate": 1.7335284864092024e-05, + "loss": 0.8903, + "step": 976 + }, + { + "epoch": 0.7834803528468324, + "grad_norm": 0.5926967859268188, + "learning_rate": 1.732939674864971e-05, + "loss": 0.891, + "step": 977 + }, + { + "epoch": 0.7842822774659182, + "grad_norm": 0.697799801826477, + "learning_rate": 1.7323503137376102e-05, + "loss": 0.968, + "step": 978 + }, + { + "epoch": 0.785084202085004, + "grad_norm": 0.6278639435768127, + "learning_rate": 1.7317604034690434e-05, + "loss": 0.9672, + "step": 979 + }, + { + "epoch": 0.7858861267040899, + "grad_norm": 0.6392386555671692, + "learning_rate": 1.7311699445016046e-05, + "loss": 0.8997, + "step": 980 + }, + { + "epoch": 0.7866880513231757, + "grad_norm": 0.5999894738197327, + "learning_rate": 1.730578937278041e-05, + "loss": 0.9535, + "step": 981 + }, + { + "epoch": 0.7874899759422614, + "grad_norm": 0.6001031398773193, + "learning_rate": 1.7299873822415093e-05, + "loss": 0.892, + "step": 982 + }, + { + "epoch": 0.7882919005613472, + "grad_norm": 0.6019670963287354, + "learning_rate": 1.7293952798355776e-05, + "loss": 0.8658, + "step": 983 + }, + { + "epoch": 0.789093825180433, + "grad_norm": 0.6335600018501282, + "learning_rate": 1.728802630504225e-05, + "loss": 0.9091, + "step": 984 + }, + { + "epoch": 0.7898957497995188, + "grad_norm": 0.5757085680961609, + "learning_rate": 1.7282094346918395e-05, + "loss": 0.9317, + "step": 985 + }, + { + "epoch": 0.7906976744186046, + "grad_norm": 0.6094053387641907, + "learning_rate": 1.72761569284322e-05, + "loss": 0.9256, + "step": 986 + }, + { + "epoch": 0.7914995990376904, + "grad_norm": 0.6195594668388367, + "learning_rate": 1.7270214054035736e-05, + "loss": 0.9395, + "step": 987 + }, + { + "epoch": 0.7923015236567763, + "grad_norm": 0.6129851937294006, + "learning_rate": 1.7264265728185186e-05, + "loss": 0.8758, + "step": 988 + }, + { + "epoch": 0.7931034482758621, + "grad_norm": 0.6170161962509155, + "learning_rate": 1.7258311955340794e-05, + "loss": 0.9307, + "step": 989 + }, + { + "epoch": 0.7939053728949479, + "grad_norm": 0.6150994300842285, + "learning_rate": 1.725235273996691e-05, + "loss": 0.9174, + "step": 990 + }, + { + "epoch": 0.7947072975140337, + "grad_norm": 0.6161699891090393, + "learning_rate": 1.7246388086531953e-05, + "loss": 0.9244, + "step": 991 + }, + { + "epoch": 0.7955092221331195, + "grad_norm": 0.6051169037818909, + "learning_rate": 1.7240417999508424e-05, + "loss": 0.9147, + "step": 992 + }, + { + "epoch": 0.7963111467522053, + "grad_norm": 0.6212258338928223, + "learning_rate": 1.7234442483372894e-05, + "loss": 0.9861, + "step": 993 + }, + { + "epoch": 0.7971130713712911, + "grad_norm": 0.6092191934585571, + "learning_rate": 1.722846154260602e-05, + "loss": 0.9064, + "step": 994 + }, + { + "epoch": 0.7979149959903769, + "grad_norm": 0.6202679872512817, + "learning_rate": 1.72224751816925e-05, + "loss": 0.9129, + "step": 995 + }, + { + "epoch": 0.7987169206094628, + "grad_norm": 0.6190642714500427, + "learning_rate": 1.721648340512112e-05, + "loss": 0.9291, + "step": 996 + }, + { + "epoch": 0.7995188452285485, + "grad_norm": 0.5609696507453918, + "learning_rate": 1.721048621738472e-05, + "loss": 0.8931, + "step": 997 + }, + { + "epoch": 0.8003207698476343, + "grad_norm": 0.6554841995239258, + "learning_rate": 1.720448362298019e-05, + "loss": 0.9463, + "step": 998 + }, + { + "epoch": 0.8011226944667201, + "grad_norm": 0.661469042301178, + "learning_rate": 1.719847562640848e-05, + "loss": 0.9057, + "step": 999 + }, + { + "epoch": 0.8019246190858059, + "grad_norm": 0.581844687461853, + "learning_rate": 1.7192462232174595e-05, + "loss": 0.9095, + "step": 1000 + }, + { + "epoch": 0.8027265437048917, + "grad_norm": 0.6142575144767761, + "learning_rate": 1.7186443444787578e-05, + "loss": 0.8885, + "step": 1001 + }, + { + "epoch": 0.8035284683239775, + "grad_norm": 0.5919050574302673, + "learning_rate": 1.718041926876053e-05, + "loss": 0.8893, + "step": 1002 + }, + { + "epoch": 0.8043303929430633, + "grad_norm": 0.6128547787666321, + "learning_rate": 1.7174389708610565e-05, + "loss": 0.923, + "step": 1003 + }, + { + "epoch": 0.8051323175621492, + "grad_norm": 0.5759849548339844, + "learning_rate": 1.716835476885887e-05, + "loss": 0.9256, + "step": 1004 + }, + { + "epoch": 0.805934242181235, + "grad_norm": 0.5811640620231628, + "learning_rate": 1.7162314454030644e-05, + "loss": 0.9334, + "step": 1005 + }, + { + "epoch": 0.8067361668003208, + "grad_norm": 0.6075664758682251, + "learning_rate": 1.7156268768655118e-05, + "loss": 0.8993, + "step": 1006 + }, + { + "epoch": 0.8075380914194066, + "grad_norm": 0.6393078565597534, + "learning_rate": 1.715021771726555e-05, + "loss": 0.9181, + "step": 1007 + }, + { + "epoch": 0.8083400160384924, + "grad_norm": 0.6739677786827087, + "learning_rate": 1.714416130439923e-05, + "loss": 0.9329, + "step": 1008 + }, + { + "epoch": 0.8091419406575782, + "grad_norm": 0.5906496047973633, + "learning_rate": 1.7138099534597464e-05, + "loss": 0.9393, + "step": 1009 + }, + { + "epoch": 0.809943865276664, + "grad_norm": 0.6302242875099182, + "learning_rate": 1.7132032412405565e-05, + "loss": 0.9145, + "step": 1010 + }, + { + "epoch": 0.8107457898957497, + "grad_norm": 0.6030935645103455, + "learning_rate": 1.7125959942372875e-05, + "loss": 0.8723, + "step": 1011 + }, + { + "epoch": 0.8115477145148356, + "grad_norm": 0.6145809292793274, + "learning_rate": 1.711988212905274e-05, + "loss": 0.8957, + "step": 1012 + }, + { + "epoch": 0.8123496391339214, + "grad_norm": 0.5869849324226379, + "learning_rate": 1.7113798977002506e-05, + "loss": 0.9221, + "step": 1013 + }, + { + "epoch": 0.8131515637530072, + "grad_norm": 0.7540897130966187, + "learning_rate": 1.710771049078353e-05, + "loss": 0.9337, + "step": 1014 + }, + { + "epoch": 0.813953488372093, + "grad_norm": 0.6184853911399841, + "learning_rate": 1.7101616674961165e-05, + "loss": 0.8933, + "step": 1015 + }, + { + "epoch": 0.8147554129911788, + "grad_norm": 0.592350959777832, + "learning_rate": 1.7095517534104762e-05, + "loss": 0.8933, + "step": 1016 + }, + { + "epoch": 0.8155573376102646, + "grad_norm": 0.5875340104103088, + "learning_rate": 1.7089413072787667e-05, + "loss": 0.9336, + "step": 1017 + }, + { + "epoch": 0.8163592622293504, + "grad_norm": 0.6324250102043152, + "learning_rate": 1.7083303295587212e-05, + "loss": 0.8972, + "step": 1018 + }, + { + "epoch": 0.8171611868484362, + "grad_norm": 0.6096128225326538, + "learning_rate": 1.7077188207084712e-05, + "loss": 0.9375, + "step": 1019 + }, + { + "epoch": 0.8179631114675221, + "grad_norm": 0.6442949771881104, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.9045, + "step": 1020 + }, + { + "epoch": 0.8187650360866079, + "grad_norm": 0.6120867133140564, + "learning_rate": 1.706494211451878e-05, + "loss": 0.9377, + "step": 1021 + }, + { + "epoch": 0.8195669607056937, + "grad_norm": 0.630803644657135, + "learning_rate": 1.7058811119637878e-05, + "loss": 0.9255, + "step": 1022 + }, + { + "epoch": 0.8203688853247795, + "grad_norm": 0.5878363251686096, + "learning_rate": 1.7052674831820008e-05, + "loss": 0.9195, + "step": 1023 + }, + { + "epoch": 0.8211708099438653, + "grad_norm": 0.6276422739028931, + "learning_rate": 1.704653325566636e-05, + "loss": 0.9699, + "step": 1024 + }, + { + "epoch": 0.8219727345629511, + "grad_norm": 0.5793137550354004, + "learning_rate": 1.7040386395782093e-05, + "loss": 0.8794, + "step": 1025 + }, + { + "epoch": 0.8227746591820368, + "grad_norm": 0.6176061630249023, + "learning_rate": 1.703423425677634e-05, + "loss": 0.8908, + "step": 1026 + }, + { + "epoch": 0.8235765838011226, + "grad_norm": 0.616875946521759, + "learning_rate": 1.7028076843262185e-05, + "loss": 0.9506, + "step": 1027 + }, + { + "epoch": 0.8243785084202085, + "grad_norm": 0.5971503257751465, + "learning_rate": 1.7021914159856664e-05, + "loss": 0.9218, + "step": 1028 + }, + { + "epoch": 0.8251804330392943, + "grad_norm": 0.6316090226173401, + "learning_rate": 1.701574621118076e-05, + "loss": 0.9296, + "step": 1029 + }, + { + "epoch": 0.8259823576583801, + "grad_norm": 0.6042530536651611, + "learning_rate": 1.700957300185942e-05, + "loss": 0.895, + "step": 1030 + }, + { + "epoch": 0.8267842822774659, + "grad_norm": 0.6263911128044128, + "learning_rate": 1.7003394536521525e-05, + "loss": 0.9031, + "step": 1031 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.5868535041809082, + "learning_rate": 1.6997210819799894e-05, + "loss": 0.8886, + "step": 1032 + }, + { + "epoch": 0.8283881315156375, + "grad_norm": 0.681711733341217, + "learning_rate": 1.6991021856331297e-05, + "loss": 0.9142, + "step": 1033 + }, + { + "epoch": 0.8291900561347233, + "grad_norm": 0.6533603072166443, + "learning_rate": 1.698482765075642e-05, + "loss": 0.886, + "step": 1034 + }, + { + "epoch": 0.8299919807538091, + "grad_norm": 0.6320748925209045, + "learning_rate": 1.6978628207719892e-05, + "loss": 0.8767, + "step": 1035 + }, + { + "epoch": 0.830793905372895, + "grad_norm": 0.6173900365829468, + "learning_rate": 1.6972423531870273e-05, + "loss": 0.9081, + "step": 1036 + }, + { + "epoch": 0.8315958299919808, + "grad_norm": 0.6053596138954163, + "learning_rate": 1.696621362786003e-05, + "loss": 0.9403, + "step": 1037 + }, + { + "epoch": 0.8323977546110666, + "grad_norm": 0.5761358141899109, + "learning_rate": 1.6959998500345572e-05, + "loss": 0.9318, + "step": 1038 + }, + { + "epoch": 0.8331996792301524, + "grad_norm": 0.6473420262336731, + "learning_rate": 1.6953778153987205e-05, + "loss": 0.9407, + "step": 1039 + }, + { + "epoch": 0.8340016038492382, + "grad_norm": 0.5871930122375488, + "learning_rate": 1.6947552593449154e-05, + "loss": 0.8952, + "step": 1040 + }, + { + "epoch": 0.834803528468324, + "grad_norm": 0.5863208770751953, + "learning_rate": 1.6941321823399567e-05, + "loss": 0.8676, + "step": 1041 + }, + { + "epoch": 0.8356054530874097, + "grad_norm": 0.6157333254814148, + "learning_rate": 1.6935085848510476e-05, + "loss": 0.884, + "step": 1042 + }, + { + "epoch": 0.8364073777064955, + "grad_norm": 0.6380476951599121, + "learning_rate": 1.6928844673457838e-05, + "loss": 0.9337, + "step": 1043 + }, + { + "epoch": 0.8372093023255814, + "grad_norm": 0.6176585555076599, + "learning_rate": 1.692259830292149e-05, + "loss": 0.9369, + "step": 1044 + }, + { + "epoch": 0.8380112269446672, + "grad_norm": 0.6162835359573364, + "learning_rate": 1.691634674158518e-05, + "loss": 0.9504, + "step": 1045 + }, + { + "epoch": 0.838813151563753, + "grad_norm": 0.6078632473945618, + "learning_rate": 1.6910089994136535e-05, + "loss": 0.9074, + "step": 1046 + }, + { + "epoch": 0.8396150761828388, + "grad_norm": 0.5939008593559265, + "learning_rate": 1.6903828065267083e-05, + "loss": 0.9469, + "step": 1047 + }, + { + "epoch": 0.8404170008019246, + "grad_norm": 0.6359356641769409, + "learning_rate": 1.6897560959672232e-05, + "loss": 0.8914, + "step": 1048 + }, + { + "epoch": 0.8412189254210104, + "grad_norm": 0.6040184497833252, + "learning_rate": 1.6891288682051264e-05, + "loss": 0.938, + "step": 1049 + }, + { + "epoch": 0.8420208500400962, + "grad_norm": 0.6027700901031494, + "learning_rate": 1.6885011237107353e-05, + "loss": 0.8751, + "step": 1050 + }, + { + "epoch": 0.842822774659182, + "grad_norm": 0.5934613943099976, + "learning_rate": 1.6878728629547536e-05, + "loss": 0.9169, + "step": 1051 + }, + { + "epoch": 0.8436246992782679, + "grad_norm": 0.6678500771522522, + "learning_rate": 1.6872440864082732e-05, + "loss": 0.9461, + "step": 1052 + }, + { + "epoch": 0.8444266238973537, + "grad_norm": 0.6098446249961853, + "learning_rate": 1.686614794542772e-05, + "loss": 0.9198, + "step": 1053 + }, + { + "epoch": 0.8452285485164395, + "grad_norm": 0.5894660949707031, + "learning_rate": 1.685984987830114e-05, + "loss": 0.9057, + "step": 1054 + }, + { + "epoch": 0.8460304731355253, + "grad_norm": 0.6063706874847412, + "learning_rate": 1.68535466674255e-05, + "loss": 0.9392, + "step": 1055 + }, + { + "epoch": 0.846832397754611, + "grad_norm": 0.6084437966346741, + "learning_rate": 1.6847238317527167e-05, + "loss": 0.9146, + "step": 1056 + }, + { + "epoch": 0.8476343223736968, + "grad_norm": 0.5813028812408447, + "learning_rate": 1.684092483333635e-05, + "loss": 0.9152, + "step": 1057 + }, + { + "epoch": 0.8484362469927826, + "grad_norm": 0.6176558136940002, + "learning_rate": 1.6834606219587114e-05, + "loss": 0.8822, + "step": 1058 + }, + { + "epoch": 0.8492381716118684, + "grad_norm": 0.5906162858009338, + "learning_rate": 1.682828248101738e-05, + "loss": 0.9067, + "step": 1059 + }, + { + "epoch": 0.8500400962309543, + "grad_norm": 0.5896495580673218, + "learning_rate": 1.682195362236889e-05, + "loss": 0.931, + "step": 1060 + }, + { + "epoch": 0.8508420208500401, + "grad_norm": 0.5951011776924133, + "learning_rate": 1.681561964838725e-05, + "loss": 0.9665, + "step": 1061 + }, + { + "epoch": 0.8516439454691259, + "grad_norm": 0.6564264297485352, + "learning_rate": 1.6809280563821878e-05, + "loss": 0.8821, + "step": 1062 + }, + { + "epoch": 0.8524458700882117, + "grad_norm": 0.5982756018638611, + "learning_rate": 1.6802936373426045e-05, + "loss": 0.8951, + "step": 1063 + }, + { + "epoch": 0.8532477947072975, + "grad_norm": 0.6046779155731201, + "learning_rate": 1.6796587081956833e-05, + "loss": 0.9748, + "step": 1064 + }, + { + "epoch": 0.8540497193263833, + "grad_norm": 0.5632441639900208, + "learning_rate": 1.6790232694175164e-05, + "loss": 0.8921, + "step": 1065 + }, + { + "epoch": 0.8548516439454691, + "grad_norm": 0.5854066610336304, + "learning_rate": 1.678387321484577e-05, + "loss": 0.9368, + "step": 1066 + }, + { + "epoch": 0.8556535685645549, + "grad_norm": 0.6365918517112732, + "learning_rate": 1.6777508648737203e-05, + "loss": 0.9264, + "step": 1067 + }, + { + "epoch": 0.8564554931836408, + "grad_norm": 0.5902692675590515, + "learning_rate": 1.677113900062184e-05, + "loss": 0.9111, + "step": 1068 + }, + { + "epoch": 0.8572574178027266, + "grad_norm": 0.6386597752571106, + "learning_rate": 1.6764764275275852e-05, + "loss": 0.9626, + "step": 1069 + }, + { + "epoch": 0.8580593424218124, + "grad_norm": 0.6048006415367126, + "learning_rate": 1.675838447747923e-05, + "loss": 0.9684, + "step": 1070 + }, + { + "epoch": 0.8588612670408982, + "grad_norm": 0.5801950693130493, + "learning_rate": 1.675199961201576e-05, + "loss": 0.9235, + "step": 1071 + }, + { + "epoch": 0.859663191659984, + "grad_norm": 0.599275529384613, + "learning_rate": 1.6745609683673034e-05, + "loss": 0.9174, + "step": 1072 + }, + { + "epoch": 0.8604651162790697, + "grad_norm": 0.6032297015190125, + "learning_rate": 1.6739214697242437e-05, + "loss": 0.9221, + "step": 1073 + }, + { + "epoch": 0.8612670408981555, + "grad_norm": 0.6280431151390076, + "learning_rate": 1.6732814657519146e-05, + "loss": 0.9157, + "step": 1074 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 0.7165436744689941, + "learning_rate": 1.6726409569302134e-05, + "loss": 0.9094, + "step": 1075 + }, + { + "epoch": 0.8628708901363272, + "grad_norm": 0.6161721348762512, + "learning_rate": 1.6719999437394146e-05, + "loss": 0.8972, + "step": 1076 + }, + { + "epoch": 0.863672814755413, + "grad_norm": 0.5874865055084229, + "learning_rate": 1.6713584266601728e-05, + "loss": 0.9607, + "step": 1077 + }, + { + "epoch": 0.8644747393744988, + "grad_norm": 0.5720422863960266, + "learning_rate": 1.6707164061735183e-05, + "loss": 0.8646, + "step": 1078 + }, + { + "epoch": 0.8652766639935846, + "grad_norm": 0.6087173223495483, + "learning_rate": 1.6700738827608606e-05, + "loss": 0.8971, + "step": 1079 + }, + { + "epoch": 0.8660785886126704, + "grad_norm": 0.5989866256713867, + "learning_rate": 1.6694308569039853e-05, + "loss": 0.9118, + "step": 1080 + }, + { + "epoch": 0.8668805132317562, + "grad_norm": 0.6564970016479492, + "learning_rate": 1.6687873290850554e-05, + "loss": 0.9619, + "step": 1081 + }, + { + "epoch": 0.867682437850842, + "grad_norm": 0.6022130250930786, + "learning_rate": 1.6681432997866097e-05, + "loss": 0.9252, + "step": 1082 + }, + { + "epoch": 0.8684843624699278, + "grad_norm": 0.5839381217956543, + "learning_rate": 1.667498769491563e-05, + "loss": 0.9207, + "step": 1083 + }, + { + "epoch": 0.8692862870890137, + "grad_norm": 0.6378865242004395, + "learning_rate": 1.666853738683207e-05, + "loss": 0.9339, + "step": 1084 + }, + { + "epoch": 0.8700882117080995, + "grad_norm": 0.667452335357666, + "learning_rate": 1.6662082078452068e-05, + "loss": 0.9323, + "step": 1085 + }, + { + "epoch": 0.8708901363271853, + "grad_norm": 0.5752742886543274, + "learning_rate": 1.665562177461604e-05, + "loss": 0.8857, + "step": 1086 + }, + { + "epoch": 0.871692060946271, + "grad_norm": 0.6381446719169617, + "learning_rate": 1.6649156480168137e-05, + "loss": 0.9146, + "step": 1087 + }, + { + "epoch": 0.8724939855653568, + "grad_norm": 0.6204044818878174, + "learning_rate": 1.6642686199956263e-05, + "loss": 0.9048, + "step": 1088 + }, + { + "epoch": 0.8732959101844426, + "grad_norm": 0.6212306618690491, + "learning_rate": 1.6636210938832053e-05, + "loss": 0.9792, + "step": 1089 + }, + { + "epoch": 0.8740978348035284, + "grad_norm": 0.5908262133598328, + "learning_rate": 1.662973070165088e-05, + "loss": 0.9181, + "step": 1090 + }, + { + "epoch": 0.8748997594226142, + "grad_norm": 0.6047478318214417, + "learning_rate": 1.6623245493271832e-05, + "loss": 0.953, + "step": 1091 + }, + { + "epoch": 0.8757016840417001, + "grad_norm": 0.5832977294921875, + "learning_rate": 1.6616755318557758e-05, + "loss": 0.9327, + "step": 1092 + }, + { + "epoch": 0.8765036086607859, + "grad_norm": 0.6228951811790466, + "learning_rate": 1.6610260182375202e-05, + "loss": 0.9074, + "step": 1093 + }, + { + "epoch": 0.8773055332798717, + "grad_norm": 0.5714309811592102, + "learning_rate": 1.660376008959444e-05, + "loss": 0.9113, + "step": 1094 + }, + { + "epoch": 0.8781074578989575, + "grad_norm": 0.57155442237854, + "learning_rate": 1.6597255045089466e-05, + "loss": 0.8875, + "step": 1095 + }, + { + "epoch": 0.8789093825180433, + "grad_norm": 0.6274538040161133, + "learning_rate": 1.6590745053737986e-05, + "loss": 0.9663, + "step": 1096 + }, + { + "epoch": 0.8797113071371291, + "grad_norm": 0.5771580934524536, + "learning_rate": 1.65842301204214e-05, + "loss": 0.9048, + "step": 1097 + }, + { + "epoch": 0.8805132317562149, + "grad_norm": 0.5920909643173218, + "learning_rate": 1.657771025002484e-05, + "loss": 0.9001, + "step": 1098 + }, + { + "epoch": 0.8813151563753007, + "grad_norm": 0.5597774386405945, + "learning_rate": 1.657118544743712e-05, + "loss": 0.921, + "step": 1099 + }, + { + "epoch": 0.8821170809943866, + "grad_norm": 0.5892897248268127, + "learning_rate": 1.6564655717550766e-05, + "loss": 0.9508, + "step": 1100 + }, + { + "epoch": 0.8829190056134724, + "grad_norm": 0.641369640827179, + "learning_rate": 1.6558121065261982e-05, + "loss": 0.9015, + "step": 1101 + }, + { + "epoch": 0.8837209302325582, + "grad_norm": 0.5892067551612854, + "learning_rate": 1.6551581495470683e-05, + "loss": 0.8589, + "step": 1102 + }, + { + "epoch": 0.884522854851644, + "grad_norm": 0.5978425145149231, + "learning_rate": 1.6545037013080455e-05, + "loss": 0.9548, + "step": 1103 + }, + { + "epoch": 0.8853247794707297, + "grad_norm": 0.6504059433937073, + "learning_rate": 1.6538487622998576e-05, + "loss": 0.9711, + "step": 1104 + }, + { + "epoch": 0.8861267040898155, + "grad_norm": 0.6230421662330627, + "learning_rate": 1.6531933330136e-05, + "loss": 0.9147, + "step": 1105 + }, + { + "epoch": 0.8869286287089013, + "grad_norm": 0.6024095416069031, + "learning_rate": 1.652537413940736e-05, + "loss": 0.963, + "step": 1106 + }, + { + "epoch": 0.8877305533279871, + "grad_norm": 0.643290102481842, + "learning_rate": 1.6518810055730962e-05, + "loss": 0.9197, + "step": 1107 + }, + { + "epoch": 0.888532477947073, + "grad_norm": 0.6359246373176575, + "learning_rate": 1.6512241084028775e-05, + "loss": 0.9211, + "step": 1108 + }, + { + "epoch": 0.8893344025661588, + "grad_norm": 0.5819621682167053, + "learning_rate": 1.6505667229226445e-05, + "loss": 0.8995, + "step": 1109 + }, + { + "epoch": 0.8901363271852446, + "grad_norm": 0.624454140663147, + "learning_rate": 1.6499088496253266e-05, + "loss": 0.901, + "step": 1110 + }, + { + "epoch": 0.8909382518043304, + "grad_norm": 0.608256459236145, + "learning_rate": 1.6492504890042196e-05, + "loss": 0.8551, + "step": 1111 + }, + { + "epoch": 0.8917401764234162, + "grad_norm": 0.6560264825820923, + "learning_rate": 1.6485916415529852e-05, + "loss": 0.9358, + "step": 1112 + }, + { + "epoch": 0.892542101042502, + "grad_norm": 0.5924005508422852, + "learning_rate": 1.6479323077656492e-05, + "loss": 0.9347, + "step": 1113 + }, + { + "epoch": 0.8933440256615878, + "grad_norm": 0.6272872686386108, + "learning_rate": 1.647272488136603e-05, + "loss": 0.9396, + "step": 1114 + }, + { + "epoch": 0.8941459502806736, + "grad_norm": 0.5873216986656189, + "learning_rate": 1.6466121831606013e-05, + "loss": 0.9505, + "step": 1115 + }, + { + "epoch": 0.8949478748997595, + "grad_norm": 0.5705021023750305, + "learning_rate": 1.6459513933327637e-05, + "loss": 0.9651, + "step": 1116 + }, + { + "epoch": 0.8957497995188453, + "grad_norm": 0.6147488951683044, + "learning_rate": 1.6452901191485725e-05, + "loss": 0.8757, + "step": 1117 + }, + { + "epoch": 0.896551724137931, + "grad_norm": 0.589171826839447, + "learning_rate": 1.6446283611038735e-05, + "loss": 0.9019, + "step": 1118 + }, + { + "epoch": 0.8973536487570168, + "grad_norm": 0.5974717736244202, + "learning_rate": 1.643966119694876e-05, + "loss": 0.9234, + "step": 1119 + }, + { + "epoch": 0.8981555733761026, + "grad_norm": 0.5791064500808716, + "learning_rate": 1.643303395418151e-05, + "loss": 0.9127, + "step": 1120 + }, + { + "epoch": 0.8989574979951884, + "grad_norm": 0.6018791198730469, + "learning_rate": 1.642640188770632e-05, + "loss": 0.8784, + "step": 1121 + }, + { + "epoch": 0.8997594226142742, + "grad_norm": 0.5726858973503113, + "learning_rate": 1.641976500249613e-05, + "loss": 0.9173, + "step": 1122 + }, + { + "epoch": 0.90056134723336, + "grad_norm": 0.6228300333023071, + "learning_rate": 1.641312330352751e-05, + "loss": 0.9295, + "step": 1123 + }, + { + "epoch": 0.9013632718524459, + "grad_norm": 0.5762906670570374, + "learning_rate": 1.6406476795780634e-05, + "loss": 0.9149, + "step": 1124 + }, + { + "epoch": 0.9021651964715317, + "grad_norm": 0.6083019375801086, + "learning_rate": 1.639982548423927e-05, + "loss": 0.962, + "step": 1125 + }, + { + "epoch": 0.9029671210906175, + "grad_norm": 0.6070680022239685, + "learning_rate": 1.6393169373890805e-05, + "loss": 0.9129, + "step": 1126 + }, + { + "epoch": 0.9037690457097033, + "grad_norm": 0.5900879502296448, + "learning_rate": 1.6386508469726215e-05, + "loss": 0.9209, + "step": 1127 + }, + { + "epoch": 0.9045709703287891, + "grad_norm": 0.5943062901496887, + "learning_rate": 1.637984277674008e-05, + "loss": 0.9232, + "step": 1128 + }, + { + "epoch": 0.9053728949478749, + "grad_norm": 0.6167227029800415, + "learning_rate": 1.6373172299930553e-05, + "loss": 0.9191, + "step": 1129 + }, + { + "epoch": 0.9061748195669607, + "grad_norm": 0.5882411003112793, + "learning_rate": 1.636649704429939e-05, + "loss": 0.8991, + "step": 1130 + }, + { + "epoch": 0.9069767441860465, + "grad_norm": 0.5918989777565002, + "learning_rate": 1.6359817014851925e-05, + "loss": 0.9584, + "step": 1131 + }, + { + "epoch": 0.9077786688051324, + "grad_norm": 0.5983660817146301, + "learning_rate": 1.635313221659707e-05, + "loss": 0.9231, + "step": 1132 + }, + { + "epoch": 0.9085805934242182, + "grad_norm": 0.5728667378425598, + "learning_rate": 1.6346442654547314e-05, + "loss": 0.9037, + "step": 1133 + }, + { + "epoch": 0.909382518043304, + "grad_norm": 0.6043873429298401, + "learning_rate": 1.633974833371872e-05, + "loss": 0.8928, + "step": 1134 + }, + { + "epoch": 0.9101844426623897, + "grad_norm": 0.604079008102417, + "learning_rate": 1.633304925913092e-05, + "loss": 0.9516, + "step": 1135 + }, + { + "epoch": 0.9109863672814755, + "grad_norm": 0.611089825630188, + "learning_rate": 1.6326345435807104e-05, + "loss": 0.942, + "step": 1136 + }, + { + "epoch": 0.9117882919005613, + "grad_norm": 0.61098313331604, + "learning_rate": 1.631963686877403e-05, + "loss": 0.9315, + "step": 1137 + }, + { + "epoch": 0.9125902165196471, + "grad_norm": 0.597474217414856, + "learning_rate": 1.6312923563062008e-05, + "loss": 0.8947, + "step": 1138 + }, + { + "epoch": 0.9133921411387329, + "grad_norm": 0.6015665531158447, + "learning_rate": 1.6306205523704903e-05, + "loss": 0.9241, + "step": 1139 + }, + { + "epoch": 0.9141940657578188, + "grad_norm": 0.559998095035553, + "learning_rate": 1.6299482755740132e-05, + "loss": 0.9079, + "step": 1140 + }, + { + "epoch": 0.9149959903769046, + "grad_norm": 0.5764912962913513, + "learning_rate": 1.6292755264208656e-05, + "loss": 0.9465, + "step": 1141 + }, + { + "epoch": 0.9157979149959904, + "grad_norm": 0.6615179181098938, + "learning_rate": 1.6286023054154973e-05, + "loss": 0.9198, + "step": 1142 + }, + { + "epoch": 0.9165998396150762, + "grad_norm": 0.6102979183197021, + "learning_rate": 1.6279286130627124e-05, + "loss": 0.9332, + "step": 1143 + }, + { + "epoch": 0.917401764234162, + "grad_norm": 0.5873243808746338, + "learning_rate": 1.627254449867669e-05, + "loss": 0.9494, + "step": 1144 + }, + { + "epoch": 0.9182036888532478, + "grad_norm": 0.5706033110618591, + "learning_rate": 1.626579816335877e-05, + "loss": 0.8697, + "step": 1145 + }, + { + "epoch": 0.9190056134723336, + "grad_norm": 0.6418749094009399, + "learning_rate": 1.6259047129731996e-05, + "loss": 0.9287, + "step": 1146 + }, + { + "epoch": 0.9198075380914194, + "grad_norm": 0.6301258206367493, + "learning_rate": 1.6252291402858525e-05, + "loss": 0.9095, + "step": 1147 + }, + { + "epoch": 0.9206094627105053, + "grad_norm": 0.6077032685279846, + "learning_rate": 1.6245530987804034e-05, + "loss": 0.9062, + "step": 1148 + }, + { + "epoch": 0.921411387329591, + "grad_norm": 0.6020398139953613, + "learning_rate": 1.6238765889637704e-05, + "loss": 0.9294, + "step": 1149 + }, + { + "epoch": 0.9222133119486768, + "grad_norm": 0.6611399054527283, + "learning_rate": 1.6231996113432242e-05, + "loss": 0.9235, + "step": 1150 + }, + { + "epoch": 0.9230152365677626, + "grad_norm": 0.6157788634300232, + "learning_rate": 1.6225221664263857e-05, + "loss": 0.9033, + "step": 1151 + }, + { + "epoch": 0.9238171611868484, + "grad_norm": 0.59830242395401, + "learning_rate": 1.6218442547212265e-05, + "loss": 0.8995, + "step": 1152 + }, + { + "epoch": 0.9246190858059342, + "grad_norm": 0.626473069190979, + "learning_rate": 1.6211658767360667e-05, + "loss": 0.9215, + "step": 1153 + }, + { + "epoch": 0.92542101042502, + "grad_norm": 0.5951080322265625, + "learning_rate": 1.620487032979578e-05, + "loss": 0.9305, + "step": 1154 + }, + { + "epoch": 0.9262229350441058, + "grad_norm": 0.6206769943237305, + "learning_rate": 1.619807723960781e-05, + "loss": 0.9093, + "step": 1155 + }, + { + "epoch": 0.9270248596631917, + "grad_norm": 0.6188283562660217, + "learning_rate": 1.619127950189044e-05, + "loss": 0.9339, + "step": 1156 + }, + { + "epoch": 0.9278267842822775, + "grad_norm": 0.5791252851486206, + "learning_rate": 1.6184477121740848e-05, + "loss": 0.8635, + "step": 1157 + }, + { + "epoch": 0.9286287089013633, + "grad_norm": 0.5923981666564941, + "learning_rate": 1.6177670104259694e-05, + "loss": 0.8821, + "step": 1158 + }, + { + "epoch": 0.9294306335204491, + "grad_norm": 0.5693655610084534, + "learning_rate": 1.61708584545511e-05, + "loss": 0.8967, + "step": 1159 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.6008737087249756, + "learning_rate": 1.616404217772269e-05, + "loss": 0.9091, + "step": 1160 + }, + { + "epoch": 0.9310344827586207, + "grad_norm": 0.5927824974060059, + "learning_rate": 1.6157221278885523e-05, + "loss": 0.9188, + "step": 1161 + }, + { + "epoch": 0.9318364073777065, + "grad_norm": 0.6398462653160095, + "learning_rate": 1.615039576315415e-05, + "loss": 0.901, + "step": 1162 + }, + { + "epoch": 0.9326383319967922, + "grad_norm": 0.6090993285179138, + "learning_rate": 1.6143565635646575e-05, + "loss": 0.9274, + "step": 1163 + }, + { + "epoch": 0.9334402566158782, + "grad_norm": 0.6457433700561523, + "learning_rate": 1.6136730901484267e-05, + "loss": 0.9281, + "step": 1164 + }, + { + "epoch": 0.9342421812349639, + "grad_norm": 0.627136766910553, + "learning_rate": 1.612989156579213e-05, + "loss": 0.9133, + "step": 1165 + }, + { + "epoch": 0.9350441058540497, + "grad_norm": 0.6137925982475281, + "learning_rate": 1.612304763369853e-05, + "loss": 0.8857, + "step": 1166 + }, + { + "epoch": 0.9358460304731355, + "grad_norm": 0.6183207035064697, + "learning_rate": 1.6116199110335295e-05, + "loss": 0.9099, + "step": 1167 + }, + { + "epoch": 0.9366479550922213, + "grad_norm": 0.6730118989944458, + "learning_rate": 1.610934600083767e-05, + "loss": 0.9584, + "step": 1168 + }, + { + "epoch": 0.9374498797113071, + "grad_norm": 0.6072790622711182, + "learning_rate": 1.610248831034435e-05, + "loss": 0.9138, + "step": 1169 + }, + { + "epoch": 0.9382518043303929, + "grad_norm": 0.6239385008811951, + "learning_rate": 1.609562604399747e-05, + "loss": 0.938, + "step": 1170 + }, + { + "epoch": 0.9390537289494787, + "grad_norm": 0.6454656720161438, + "learning_rate": 1.6088759206942586e-05, + "loss": 0.8756, + "step": 1171 + }, + { + "epoch": 0.9398556535685646, + "grad_norm": 0.6884939074516296, + "learning_rate": 1.6081887804328687e-05, + "loss": 0.9057, + "step": 1172 + }, + { + "epoch": 0.9406575781876504, + "grad_norm": 0.6258487105369568, + "learning_rate": 1.607501184130819e-05, + "loss": 0.9183, + "step": 1173 + }, + { + "epoch": 0.9414595028067362, + "grad_norm": 0.576998770236969, + "learning_rate": 1.606813132303692e-05, + "loss": 0.9145, + "step": 1174 + }, + { + "epoch": 0.942261427425822, + "grad_norm": 0.6313689351081848, + "learning_rate": 1.606124625467413e-05, + "loss": 0.9295, + "step": 1175 + }, + { + "epoch": 0.9430633520449078, + "grad_norm": 0.6496961116790771, + "learning_rate": 1.605435664138247e-05, + "loss": 0.8613, + "step": 1176 + }, + { + "epoch": 0.9438652766639936, + "grad_norm": 0.6497421264648438, + "learning_rate": 1.6047462488328017e-05, + "loss": 0.942, + "step": 1177 + }, + { + "epoch": 0.9446672012830793, + "grad_norm": 0.6210437417030334, + "learning_rate": 1.604056380068023e-05, + "loss": 0.9454, + "step": 1178 + }, + { + "epoch": 0.9454691259021651, + "grad_norm": 0.5845626592636108, + "learning_rate": 1.6033660583611988e-05, + "loss": 0.8651, + "step": 1179 + }, + { + "epoch": 0.946271050521251, + "grad_norm": 0.643485426902771, + "learning_rate": 1.6026752842299564e-05, + "loss": 0.963, + "step": 1180 + }, + { + "epoch": 0.9470729751403368, + "grad_norm": 0.6154069900512695, + "learning_rate": 1.6019840581922604e-05, + "loss": 0.9274, + "step": 1181 + }, + { + "epoch": 0.9478748997594226, + "grad_norm": 0.6406455039978027, + "learning_rate": 1.6012923807664164e-05, + "loss": 0.936, + "step": 1182 + }, + { + "epoch": 0.9486768243785084, + "grad_norm": 0.6149894595146179, + "learning_rate": 1.6006002524710674e-05, + "loss": 0.8924, + "step": 1183 + }, + { + "epoch": 0.9494787489975942, + "grad_norm": 0.5921556353569031, + "learning_rate": 1.599907673825195e-05, + "loss": 0.9235, + "step": 1184 + }, + { + "epoch": 0.95028067361668, + "grad_norm": 0.6019387245178223, + "learning_rate": 1.599214645348118e-05, + "loss": 0.8885, + "step": 1185 + }, + { + "epoch": 0.9510825982357658, + "grad_norm": 0.6550159454345703, + "learning_rate": 1.5985211675594933e-05, + "loss": 0.9241, + "step": 1186 + }, + { + "epoch": 0.9518845228548516, + "grad_norm": 0.639525294303894, + "learning_rate": 1.5978272409793136e-05, + "loss": 0.965, + "step": 1187 + }, + { + "epoch": 0.9526864474739375, + "grad_norm": 0.6293081045150757, + "learning_rate": 1.597132866127909e-05, + "loss": 0.8969, + "step": 1188 + }, + { + "epoch": 0.9534883720930233, + "grad_norm": 0.6015990376472473, + "learning_rate": 1.5964380435259448e-05, + "loss": 0.869, + "step": 1189 + }, + { + "epoch": 0.9542902967121091, + "grad_norm": 0.5990379452705383, + "learning_rate": 1.595742773694424e-05, + "loss": 0.9152, + "step": 1190 + }, + { + "epoch": 0.9550922213311949, + "grad_norm": 0.6142351031303406, + "learning_rate": 1.5950470571546818e-05, + "loss": 0.9237, + "step": 1191 + }, + { + "epoch": 0.9558941459502807, + "grad_norm": 0.6138330101966858, + "learning_rate": 1.5943508944283916e-05, + "loss": 0.8922, + "step": 1192 + }, + { + "epoch": 0.9566960705693665, + "grad_norm": 0.6090849041938782, + "learning_rate": 1.5936542860375594e-05, + "loss": 0.9292, + "step": 1193 + }, + { + "epoch": 0.9574979951884522, + "grad_norm": 0.6265794634819031, + "learning_rate": 1.592957232504526e-05, + "loss": 0.8901, + "step": 1194 + }, + { + "epoch": 0.958299919807538, + "grad_norm": 0.6023232936859131, + "learning_rate": 1.5922597343519654e-05, + "loss": 0.8742, + "step": 1195 + }, + { + "epoch": 0.9591018444266239, + "grad_norm": 0.5652976632118225, + "learning_rate": 1.591561792102886e-05, + "loss": 0.8904, + "step": 1196 + }, + { + "epoch": 0.9599037690457097, + "grad_norm": 0.6332113742828369, + "learning_rate": 1.5908634062806285e-05, + "loss": 0.9088, + "step": 1197 + }, + { + "epoch": 0.9607056936647955, + "grad_norm": 0.6024392247200012, + "learning_rate": 1.5901645774088662e-05, + "loss": 0.8891, + "step": 1198 + }, + { + "epoch": 0.9615076182838813, + "grad_norm": 0.6040472984313965, + "learning_rate": 1.5894653060116053e-05, + "loss": 0.9047, + "step": 1199 + }, + { + "epoch": 0.9623095429029671, + "grad_norm": 0.5790461301803589, + "learning_rate": 1.5887655926131832e-05, + "loss": 0.9191, + "step": 1200 + }, + { + "epoch": 0.9631114675220529, + "grad_norm": 0.6125912666320801, + "learning_rate": 1.588065437738268e-05, + "loss": 0.9197, + "step": 1201 + }, + { + "epoch": 0.9639133921411387, + "grad_norm": 0.5846207141876221, + "learning_rate": 1.587364841911861e-05, + "loss": 0.879, + "step": 1202 + }, + { + "epoch": 0.9647153167602245, + "grad_norm": 0.5989664196968079, + "learning_rate": 1.5866638056592916e-05, + "loss": 0.9328, + "step": 1203 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.5732477307319641, + "learning_rate": 1.5859623295062215e-05, + "loss": 0.8551, + "step": 1204 + }, + { + "epoch": 0.9663191659983962, + "grad_norm": 0.6074816584587097, + "learning_rate": 1.585260413978641e-05, + "loss": 0.9435, + "step": 1205 + }, + { + "epoch": 0.967121090617482, + "grad_norm": 0.6296406388282776, + "learning_rate": 1.5845580596028697e-05, + "loss": 0.9607, + "step": 1206 + }, + { + "epoch": 0.9679230152365678, + "grad_norm": 3.621976375579834, + "learning_rate": 1.583855266905558e-05, + "loss": 0.9418, + "step": 1207 + }, + { + "epoch": 0.9687249398556536, + "grad_norm": 0.6303392648696899, + "learning_rate": 1.5831520364136835e-05, + "loss": 0.9094, + "step": 1208 + }, + { + "epoch": 0.9695268644747393, + "grad_norm": 0.5891981720924377, + "learning_rate": 1.5824483686545517e-05, + "loss": 0.9088, + "step": 1209 + }, + { + "epoch": 0.9703287890938251, + "grad_norm": 0.7623486518859863, + "learning_rate": 1.581744264155797e-05, + "loss": 0.9175, + "step": 1210 + }, + { + "epoch": 0.9711307137129109, + "grad_norm": 0.5808781385421753, + "learning_rate": 1.5810397234453816e-05, + "loss": 0.8938, + "step": 1211 + }, + { + "epoch": 0.9719326383319968, + "grad_norm": 0.5807702541351318, + "learning_rate": 1.5803347470515933e-05, + "loss": 0.9222, + "step": 1212 + }, + { + "epoch": 0.9727345629510826, + "grad_norm": 0.5763218998908997, + "learning_rate": 1.5796293355030476e-05, + "loss": 0.909, + "step": 1213 + }, + { + "epoch": 0.9735364875701684, + "grad_norm": 0.5519773364067078, + "learning_rate": 1.578923489328686e-05, + "loss": 0.8921, + "step": 1214 + }, + { + "epoch": 0.9743384121892542, + "grad_norm": 0.579440176486969, + "learning_rate": 1.5782172090577762e-05, + "loss": 0.887, + "step": 1215 + }, + { + "epoch": 0.97514033680834, + "grad_norm": 0.5852498412132263, + "learning_rate": 1.5775104952199113e-05, + "loss": 0.8632, + "step": 1216 + }, + { + "epoch": 0.9759422614274258, + "grad_norm": 0.606121301651001, + "learning_rate": 1.5768033483450088e-05, + "loss": 0.9183, + "step": 1217 + }, + { + "epoch": 0.9767441860465116, + "grad_norm": 0.6091791987419128, + "learning_rate": 1.5760957689633127e-05, + "loss": 0.9547, + "step": 1218 + }, + { + "epoch": 0.9775461106655974, + "grad_norm": 0.6013908386230469, + "learning_rate": 1.575387757605389e-05, + "loss": 0.8725, + "step": 1219 + }, + { + "epoch": 0.9783480352846833, + "grad_norm": 0.5744641423225403, + "learning_rate": 1.5746793148021292e-05, + "loss": 0.9157, + "step": 1220 + }, + { + "epoch": 0.9791499599037691, + "grad_norm": 0.6412164568901062, + "learning_rate": 1.5739704410847475e-05, + "loss": 0.9291, + "step": 1221 + }, + { + "epoch": 0.9799518845228549, + "grad_norm": 0.5992948412895203, + "learning_rate": 1.5732611369847818e-05, + "loss": 0.941, + "step": 1222 + }, + { + "epoch": 0.9807538091419407, + "grad_norm": 0.613304078578949, + "learning_rate": 1.5725514030340926e-05, + "loss": 0.8843, + "step": 1223 + }, + { + "epoch": 0.9815557337610264, + "grad_norm": 0.5996186137199402, + "learning_rate": 1.5718412397648627e-05, + "loss": 0.9606, + "step": 1224 + }, + { + "epoch": 0.9823576583801122, + "grad_norm": 0.8090897798538208, + "learning_rate": 1.5711306477095962e-05, + "loss": 0.8808, + "step": 1225 + }, + { + "epoch": 0.983159582999198, + "grad_norm": 0.5831454396247864, + "learning_rate": 1.5704196274011198e-05, + "loss": 0.9475, + "step": 1226 + }, + { + "epoch": 0.9839615076182838, + "grad_norm": 0.6127690672874451, + "learning_rate": 1.56970817937258e-05, + "loss": 0.9014, + "step": 1227 + }, + { + "epoch": 0.9847634322373697, + "grad_norm": 0.6199975609779358, + "learning_rate": 1.5689963041574453e-05, + "loss": 0.9017, + "step": 1228 + }, + { + "epoch": 0.9855653568564555, + "grad_norm": 0.618943452835083, + "learning_rate": 1.568284002289504e-05, + "loss": 0.9638, + "step": 1229 + }, + { + "epoch": 0.9863672814755413, + "grad_norm": 0.5724090337753296, + "learning_rate": 1.567571274302864e-05, + "loss": 0.9033, + "step": 1230 + }, + { + "epoch": 0.9871692060946271, + "grad_norm": 0.6071799397468567, + "learning_rate": 1.5668581207319536e-05, + "loss": 0.8814, + "step": 1231 + }, + { + "epoch": 0.9879711307137129, + "grad_norm": 0.6167645454406738, + "learning_rate": 1.5661445421115188e-05, + "loss": 0.9195, + "step": 1232 + }, + { + "epoch": 0.9887730553327987, + "grad_norm": 0.5625812411308289, + "learning_rate": 1.5654305389766257e-05, + "loss": 0.8856, + "step": 1233 + }, + { + "epoch": 0.9895749799518845, + "grad_norm": 0.6261424422264099, + "learning_rate": 1.5647161118626583e-05, + "loss": 0.8532, + "step": 1234 + }, + { + "epoch": 0.9903769045709703, + "grad_norm": 0.5534703135490417, + "learning_rate": 1.5640012613053176e-05, + "loss": 0.9229, + "step": 1235 + }, + { + "epoch": 0.9911788291900562, + "grad_norm": 0.5943836569786072, + "learning_rate": 1.563285987840624e-05, + "loss": 0.9122, + "step": 1236 + }, + { + "epoch": 0.991980753809142, + "grad_norm": 0.5869540572166443, + "learning_rate": 1.562570292004913e-05, + "loss": 0.8596, + "step": 1237 + }, + { + "epoch": 0.9927826784282278, + "grad_norm": 0.5831838846206665, + "learning_rate": 1.561854174334838e-05, + "loss": 0.8861, + "step": 1238 + }, + { + "epoch": 0.9935846030473136, + "grad_norm": 0.6431090831756592, + "learning_rate": 1.5611376353673686e-05, + "loss": 0.9125, + "step": 1239 + }, + { + "epoch": 0.9943865276663993, + "grad_norm": 0.5620553493499756, + "learning_rate": 1.56042067563979e-05, + "loss": 0.9392, + "step": 1240 + }, + { + "epoch": 0.9951884522854851, + "grad_norm": 0.5939339399337769, + "learning_rate": 1.5597032956897028e-05, + "loss": 0.892, + "step": 1241 + }, + { + "epoch": 0.9959903769045709, + "grad_norm": 0.6007006764411926, + "learning_rate": 1.558985496055023e-05, + "loss": 0.9498, + "step": 1242 + }, + { + "epoch": 0.9967923015236567, + "grad_norm": 0.5932491421699524, + "learning_rate": 1.5582672772739815e-05, + "loss": 0.8872, + "step": 1243 + }, + { + "epoch": 0.9975942261427426, + "grad_norm": 0.6062937378883362, + "learning_rate": 1.5575486398851232e-05, + "loss": 0.9013, + "step": 1244 + }, + { + "epoch": 0.9983961507618284, + "grad_norm": 0.6182659268379211, + "learning_rate": 1.5568295844273064e-05, + "loss": 0.8867, + "step": 1245 + }, + { + "epoch": 0.9991980753809142, + "grad_norm": 0.6514543294906616, + "learning_rate": 1.5561101114397043e-05, + "loss": 0.9485, + "step": 1246 + }, + { + "epoch": 1.0, + "grad_norm": 0.610464870929718, + "learning_rate": 1.555390221461801e-05, + "loss": 0.9416, + "step": 1247 + }, + { + "epoch": 1.0008019246190858, + "grad_norm": 0.5633330941200256, + "learning_rate": 1.554669915033395e-05, + "loss": 0.7783, + "step": 1248 + }, + { + "epoch": 1.0016038492381716, + "grad_norm": 0.5994012951850891, + "learning_rate": 1.553949192694597e-05, + "loss": 0.7874, + "step": 1249 + }, + { + "epoch": 1.0024057738572574, + "grad_norm": 0.6167377829551697, + "learning_rate": 1.553228054985829e-05, + "loss": 0.772, + "step": 1250 + }, + { + "epoch": 1.0032076984763432, + "grad_norm": 0.6523487567901611, + "learning_rate": 1.5525065024478245e-05, + "loss": 0.7683, + "step": 1251 + }, + { + "epoch": 1.004009623095429, + "grad_norm": 0.6528876423835754, + "learning_rate": 1.5517845356216283e-05, + "loss": 0.779, + "step": 1252 + }, + { + "epoch": 1.0048115477145148, + "grad_norm": 0.6447154879570007, + "learning_rate": 1.551062155048595e-05, + "loss": 0.7917, + "step": 1253 + }, + { + "epoch": 1.0056134723336005, + "grad_norm": 0.6762365698814392, + "learning_rate": 1.550339361270391e-05, + "loss": 0.7961, + "step": 1254 + }, + { + "epoch": 1.0064153969526863, + "grad_norm": 0.6918103098869324, + "learning_rate": 1.5496161548289918e-05, + "loss": 0.764, + "step": 1255 + }, + { + "epoch": 1.0072173215717724, + "grad_norm": 0.6805532574653625, + "learning_rate": 1.5488925362666818e-05, + "loss": 0.7675, + "step": 1256 + }, + { + "epoch": 1.0080192461908581, + "grad_norm": 0.698422908782959, + "learning_rate": 1.5481685061260547e-05, + "loss": 0.7496, + "step": 1257 + }, + { + "epoch": 1.008821170809944, + "grad_norm": 0.6384891271591187, + "learning_rate": 1.5474440649500132e-05, + "loss": 0.8026, + "step": 1258 + }, + { + "epoch": 1.0096230954290297, + "grad_norm": 0.6521022319793701, + "learning_rate": 1.5467192132817678e-05, + "loss": 0.7986, + "step": 1259 + }, + { + "epoch": 1.0104250200481155, + "grad_norm": 0.6222298741340637, + "learning_rate": 1.5459939516648374e-05, + "loss": 0.7312, + "step": 1260 + }, + { + "epoch": 1.0112269446672013, + "grad_norm": 0.6429084539413452, + "learning_rate": 1.5452682806430473e-05, + "loss": 0.7311, + "step": 1261 + }, + { + "epoch": 1.012028869286287, + "grad_norm": 0.7085431814193726, + "learning_rate": 1.544542200760531e-05, + "loss": 0.8077, + "step": 1262 + }, + { + "epoch": 1.012830793905373, + "grad_norm": 0.6494969725608826, + "learning_rate": 1.543815712561727e-05, + "loss": 0.7795, + "step": 1263 + }, + { + "epoch": 1.0136327185244587, + "grad_norm": 0.650427520275116, + "learning_rate": 1.5430888165913814e-05, + "loss": 0.7784, + "step": 1264 + }, + { + "epoch": 1.0144346431435445, + "grad_norm": 0.674248456954956, + "learning_rate": 1.5423615133945457e-05, + "loss": 0.7681, + "step": 1265 + }, + { + "epoch": 1.0152365677626303, + "grad_norm": 0.6563466191291809, + "learning_rate": 1.5416338035165766e-05, + "loss": 0.7758, + "step": 1266 + }, + { + "epoch": 1.016038492381716, + "grad_norm": 0.6918492317199707, + "learning_rate": 1.5409056875031355e-05, + "loss": 0.7597, + "step": 1267 + }, + { + "epoch": 1.0168404170008019, + "grad_norm": 0.6418508291244507, + "learning_rate": 1.5401771659001885e-05, + "loss": 0.7596, + "step": 1268 + }, + { + "epoch": 1.0176423416198876, + "grad_norm": 0.6073652505874634, + "learning_rate": 1.5394482392540066e-05, + "loss": 0.7344, + "step": 1269 + }, + { + "epoch": 1.0184442662389734, + "grad_norm": 0.7088474631309509, + "learning_rate": 1.5387189081111628e-05, + "loss": 0.7876, + "step": 1270 + }, + { + "epoch": 1.0192461908580595, + "grad_norm": 0.6955075263977051, + "learning_rate": 1.5379891730185352e-05, + "loss": 0.7867, + "step": 1271 + }, + { + "epoch": 1.0200481154771452, + "grad_norm": 0.6917376518249512, + "learning_rate": 1.537259034523304e-05, + "loss": 0.8059, + "step": 1272 + }, + { + "epoch": 1.020850040096231, + "grad_norm": 0.6557261943817139, + "learning_rate": 1.5365284931729513e-05, + "loss": 0.7737, + "step": 1273 + }, + { + "epoch": 1.0216519647153168, + "grad_norm": 0.6700888872146606, + "learning_rate": 1.5357975495152628e-05, + "loss": 0.7509, + "step": 1274 + }, + { + "epoch": 1.0224538893344026, + "grad_norm": 0.6783604025840759, + "learning_rate": 1.5350662040983236e-05, + "loss": 0.8075, + "step": 1275 + }, + { + "epoch": 1.0232558139534884, + "grad_norm": 0.6425763368606567, + "learning_rate": 1.5343344574705234e-05, + "loss": 0.7346, + "step": 1276 + }, + { + "epoch": 1.0240577385725742, + "grad_norm": 0.6592726707458496, + "learning_rate": 1.5336023101805486e-05, + "loss": 0.785, + "step": 1277 + }, + { + "epoch": 1.02485966319166, + "grad_norm": 0.6982232332229614, + "learning_rate": 1.5328697627773898e-05, + "loss": 0.7834, + "step": 1278 + }, + { + "epoch": 1.0256615878107458, + "grad_norm": 0.6378937363624573, + "learning_rate": 1.5321368158103346e-05, + "loss": 0.7505, + "step": 1279 + }, + { + "epoch": 1.0264635124298316, + "grad_norm": 0.6628844141960144, + "learning_rate": 1.531403469828973e-05, + "loss": 0.7627, + "step": 1280 + }, + { + "epoch": 1.0272654370489174, + "grad_norm": 0.6596646904945374, + "learning_rate": 1.5306697253831914e-05, + "loss": 0.7615, + "step": 1281 + }, + { + "epoch": 1.0280673616680032, + "grad_norm": 0.6652581095695496, + "learning_rate": 1.5299355830231776e-05, + "loss": 0.7921, + "step": 1282 + }, + { + "epoch": 1.028869286287089, + "grad_norm": 0.6460443735122681, + "learning_rate": 1.5292010432994162e-05, + "loss": 0.7812, + "step": 1283 + }, + { + "epoch": 1.0296712109061747, + "grad_norm": 0.6455625295639038, + "learning_rate": 1.5284661067626897e-05, + "loss": 0.7718, + "step": 1284 + }, + { + "epoch": 1.0304731355252605, + "grad_norm": 0.653901994228363, + "learning_rate": 1.5277307739640787e-05, + "loss": 0.7546, + "step": 1285 + }, + { + "epoch": 1.0312750601443463, + "grad_norm": 0.6660287976264954, + "learning_rate": 1.526995045454961e-05, + "loss": 0.7652, + "step": 1286 + }, + { + "epoch": 1.0320769847634321, + "grad_norm": 0.6647891998291016, + "learning_rate": 1.5262589217870106e-05, + "loss": 0.7771, + "step": 1287 + }, + { + "epoch": 1.0328789093825181, + "grad_norm": 0.6715826988220215, + "learning_rate": 1.5255224035121986e-05, + "loss": 0.7632, + "step": 1288 + }, + { + "epoch": 1.033680834001604, + "grad_norm": 0.6524284482002258, + "learning_rate": 1.524785491182791e-05, + "loss": 0.8042, + "step": 1289 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 0.7093574404716492, + "learning_rate": 1.5240481853513495e-05, + "loss": 0.8175, + "step": 1290 + }, + { + "epoch": 1.0352846832397755, + "grad_norm": 0.6630702018737793, + "learning_rate": 1.523310486570732e-05, + "loss": 0.8186, + "step": 1291 + }, + { + "epoch": 1.0360866078588613, + "grad_norm": 0.669937014579773, + "learning_rate": 1.5225723953940896e-05, + "loss": 0.7712, + "step": 1292 + }, + { + "epoch": 1.036888532477947, + "grad_norm": 0.6852511167526245, + "learning_rate": 1.5218339123748682e-05, + "loss": 0.7704, + "step": 1293 + }, + { + "epoch": 1.037690457097033, + "grad_norm": 0.6196748614311218, + "learning_rate": 1.5210950380668074e-05, + "loss": 0.7617, + "step": 1294 + }, + { + "epoch": 1.0384923817161187, + "grad_norm": 0.6314553618431091, + "learning_rate": 1.5203557730239408e-05, + "loss": 0.7316, + "step": 1295 + }, + { + "epoch": 1.0392943063352045, + "grad_norm": 0.6329060196876526, + "learning_rate": 1.5196161178005941e-05, + "loss": 0.7706, + "step": 1296 + }, + { + "epoch": 1.0400962309542903, + "grad_norm": 0.642294704914093, + "learning_rate": 1.5188760729513865e-05, + "loss": 0.7561, + "step": 1297 + }, + { + "epoch": 1.040898155573376, + "grad_norm": 0.6721711158752441, + "learning_rate": 1.5181356390312279e-05, + "loss": 0.8194, + "step": 1298 + }, + { + "epoch": 1.0417000801924619, + "grad_norm": 0.6798752546310425, + "learning_rate": 1.5173948165953216e-05, + "loss": 0.7759, + "step": 1299 + }, + { + "epoch": 1.0425020048115476, + "grad_norm": 0.6321367025375366, + "learning_rate": 1.5166536061991615e-05, + "loss": 0.7913, + "step": 1300 + }, + { + "epoch": 1.0433039294306334, + "grad_norm": 0.6367747783660889, + "learning_rate": 1.5159120083985319e-05, + "loss": 0.751, + "step": 1301 + }, + { + "epoch": 1.0441058540497192, + "grad_norm": 0.6426526308059692, + "learning_rate": 1.5151700237495087e-05, + "loss": 0.7406, + "step": 1302 + }, + { + "epoch": 1.0449077786688052, + "grad_norm": 0.6288602352142334, + "learning_rate": 1.5144276528084566e-05, + "loss": 0.7382, + "step": 1303 + }, + { + "epoch": 1.045709703287891, + "grad_norm": 0.6340166330337524, + "learning_rate": 1.513684896132031e-05, + "loss": 0.7271, + "step": 1304 + }, + { + "epoch": 1.0465116279069768, + "grad_norm": 0.6427846550941467, + "learning_rate": 1.5129417542771761e-05, + "loss": 0.7534, + "step": 1305 + }, + { + "epoch": 1.0473135525260626, + "grad_norm": 0.6341578960418701, + "learning_rate": 1.512198227801125e-05, + "loss": 0.73, + "step": 1306 + }, + { + "epoch": 1.0481154771451484, + "grad_norm": 0.6635767817497253, + "learning_rate": 1.5114543172613995e-05, + "loss": 0.7734, + "step": 1307 + }, + { + "epoch": 1.0489174017642342, + "grad_norm": 0.6806950569152832, + "learning_rate": 1.5107100232158085e-05, + "loss": 0.7465, + "step": 1308 + }, + { + "epoch": 1.04971932638332, + "grad_norm": 0.639504075050354, + "learning_rate": 1.5099653462224492e-05, + "loss": 0.7822, + "step": 1309 + }, + { + "epoch": 1.0505212510024058, + "grad_norm": 0.6781004667282104, + "learning_rate": 1.5092202868397056e-05, + "loss": 0.7742, + "step": 1310 + }, + { + "epoch": 1.0513231756214916, + "grad_norm": 0.6971407532691956, + "learning_rate": 1.5084748456262487e-05, + "loss": 0.7638, + "step": 1311 + }, + { + "epoch": 1.0521251002405774, + "grad_norm": 0.6818044781684875, + "learning_rate": 1.5077290231410367e-05, + "loss": 0.8214, + "step": 1312 + }, + { + "epoch": 1.0529270248596632, + "grad_norm": 0.6158934831619263, + "learning_rate": 1.506982819943311e-05, + "loss": 0.7426, + "step": 1313 + }, + { + "epoch": 1.053728949478749, + "grad_norm": 0.6084417700767517, + "learning_rate": 1.5062362365926012e-05, + "loss": 0.7396, + "step": 1314 + }, + { + "epoch": 1.0545308740978347, + "grad_norm": 0.6691953539848328, + "learning_rate": 1.5054892736487206e-05, + "loss": 0.7497, + "step": 1315 + }, + { + "epoch": 1.0553327987169205, + "grad_norm": 0.6629313826560974, + "learning_rate": 1.504741931671768e-05, + "loss": 0.773, + "step": 1316 + }, + { + "epoch": 1.0561347233360063, + "grad_norm": 0.641639232635498, + "learning_rate": 1.503994211222125e-05, + "loss": 0.7542, + "step": 1317 + }, + { + "epoch": 1.0569366479550921, + "grad_norm": 0.6214974522590637, + "learning_rate": 1.5032461128604583e-05, + "loss": 0.7645, + "step": 1318 + }, + { + "epoch": 1.057738572574178, + "grad_norm": 0.6951003670692444, + "learning_rate": 1.5024976371477175e-05, + "loss": 0.7688, + "step": 1319 + }, + { + "epoch": 1.058540497193264, + "grad_norm": 0.641646683216095, + "learning_rate": 1.5017487846451353e-05, + "loss": 0.7435, + "step": 1320 + }, + { + "epoch": 1.0593424218123497, + "grad_norm": 0.6781443953514099, + "learning_rate": 1.5009995559142268e-05, + "loss": 0.7606, + "step": 1321 + }, + { + "epoch": 1.0601443464314355, + "grad_norm": 0.6722328066825867, + "learning_rate": 1.5002499515167891e-05, + "loss": 0.7608, + "step": 1322 + }, + { + "epoch": 1.0609462710505213, + "grad_norm": 0.6786977052688599, + "learning_rate": 1.4994999720149008e-05, + "loss": 0.7563, + "step": 1323 + }, + { + "epoch": 1.061748195669607, + "grad_norm": 0.6650587320327759, + "learning_rate": 1.4987496179709226e-05, + "loss": 0.7366, + "step": 1324 + }, + { + "epoch": 1.062550120288693, + "grad_norm": 0.6645624041557312, + "learning_rate": 1.4979988899474955e-05, + "loss": 0.7738, + "step": 1325 + }, + { + "epoch": 1.0633520449077787, + "grad_norm": 0.622387170791626, + "learning_rate": 1.4972477885075404e-05, + "loss": 0.7404, + "step": 1326 + }, + { + "epoch": 1.0641539695268645, + "grad_norm": 0.6579604148864746, + "learning_rate": 1.4964963142142597e-05, + "loss": 0.7977, + "step": 1327 + }, + { + "epoch": 1.0649558941459503, + "grad_norm": 0.6148852109909058, + "learning_rate": 1.4957444676311333e-05, + "loss": 0.7356, + "step": 1328 + }, + { + "epoch": 1.065757818765036, + "grad_norm": 0.7013448476791382, + "learning_rate": 1.494992249321922e-05, + "loss": 0.792, + "step": 1329 + }, + { + "epoch": 1.0665597433841218, + "grad_norm": 0.6262637376785278, + "learning_rate": 1.4942396598506643e-05, + "loss": 0.7947, + "step": 1330 + }, + { + "epoch": 1.0673616680032076, + "grad_norm": 0.6252999901771545, + "learning_rate": 1.4934866997816779e-05, + "loss": 0.756, + "step": 1331 + }, + { + "epoch": 1.0681635926222934, + "grad_norm": 0.6794742345809937, + "learning_rate": 1.4927333696795581e-05, + "loss": 0.7121, + "step": 1332 + }, + { + "epoch": 1.0689655172413792, + "grad_norm": 0.6507592797279358, + "learning_rate": 1.4919796701091767e-05, + "loss": 0.7567, + "step": 1333 + }, + { + "epoch": 1.069767441860465, + "grad_norm": 0.6619201898574829, + "learning_rate": 1.4912256016356837e-05, + "loss": 0.7232, + "step": 1334 + }, + { + "epoch": 1.070569366479551, + "grad_norm": 0.628643274307251, + "learning_rate": 1.4904711648245053e-05, + "loss": 0.7477, + "step": 1335 + }, + { + "epoch": 1.0713712910986368, + "grad_norm": 0.6922639608383179, + "learning_rate": 1.4897163602413438e-05, + "loss": 0.8047, + "step": 1336 + }, + { + "epoch": 1.0721732157177226, + "grad_norm": 0.6198031306266785, + "learning_rate": 1.4889611884521777e-05, + "loss": 0.7624, + "step": 1337 + }, + { + "epoch": 1.0729751403368084, + "grad_norm": 0.6809404492378235, + "learning_rate": 1.4882056500232604e-05, + "loss": 0.8046, + "step": 1338 + }, + { + "epoch": 1.0737770649558942, + "grad_norm": 0.6459743976593018, + "learning_rate": 1.4874497455211203e-05, + "loss": 0.7846, + "step": 1339 + }, + { + "epoch": 1.07457898957498, + "grad_norm": 0.6424413323402405, + "learning_rate": 1.48669347551256e-05, + "loss": 0.7692, + "step": 1340 + }, + { + "epoch": 1.0753809141940658, + "grad_norm": 0.7114162445068359, + "learning_rate": 1.4859368405646568e-05, + "loss": 0.774, + "step": 1341 + }, + { + "epoch": 1.0761828388131516, + "grad_norm": 0.658915638923645, + "learning_rate": 1.485179841244762e-05, + "loss": 0.7763, + "step": 1342 + }, + { + "epoch": 1.0769847634322374, + "grad_norm": 0.6556907296180725, + "learning_rate": 1.484422478120498e-05, + "loss": 0.7907, + "step": 1343 + }, + { + "epoch": 1.0777866880513232, + "grad_norm": 0.6900550723075867, + "learning_rate": 1.4836647517597627e-05, + "loss": 0.7479, + "step": 1344 + }, + { + "epoch": 1.078588612670409, + "grad_norm": 0.7201621532440186, + "learning_rate": 1.4829066627307246e-05, + "loss": 0.7893, + "step": 1345 + }, + { + "epoch": 1.0793905372894947, + "grad_norm": 0.6671075820922852, + "learning_rate": 1.4821482116018251e-05, + "loss": 0.7821, + "step": 1346 + }, + { + "epoch": 1.0801924619085805, + "grad_norm": 0.7838239669799805, + "learning_rate": 1.4813893989417762e-05, + "loss": 0.7846, + "step": 1347 + }, + { + "epoch": 1.0809943865276663, + "grad_norm": 0.6680654287338257, + "learning_rate": 1.4806302253195617e-05, + "loss": 0.7694, + "step": 1348 + }, + { + "epoch": 1.0817963111467521, + "grad_norm": 0.6512035131454468, + "learning_rate": 1.4798706913044357e-05, + "loss": 0.7297, + "step": 1349 + }, + { + "epoch": 1.082598235765838, + "grad_norm": 0.6682960391044617, + "learning_rate": 1.4791107974659229e-05, + "loss": 0.7998, + "step": 1350 + }, + { + "epoch": 1.0834001603849237, + "grad_norm": 0.7098135948181152, + "learning_rate": 1.4783505443738173e-05, + "loss": 0.7683, + "step": 1351 + }, + { + "epoch": 1.0842020850040097, + "grad_norm": 0.6800927519798279, + "learning_rate": 1.4775899325981828e-05, + "loss": 0.7553, + "step": 1352 + }, + { + "epoch": 1.0850040096230955, + "grad_norm": 0.6061440110206604, + "learning_rate": 1.476828962709352e-05, + "loss": 0.772, + "step": 1353 + }, + { + "epoch": 1.0858059342421813, + "grad_norm": 0.6747270226478577, + "learning_rate": 1.4760676352779258e-05, + "loss": 0.8075, + "step": 1354 + }, + { + "epoch": 1.086607858861267, + "grad_norm": 0.6570102572441101, + "learning_rate": 1.4753059508747738e-05, + "loss": 0.8008, + "step": 1355 + }, + { + "epoch": 1.0874097834803529, + "grad_norm": 0.6908283233642578, + "learning_rate": 1.4745439100710326e-05, + "loss": 0.7605, + "step": 1356 + }, + { + "epoch": 1.0882117080994387, + "grad_norm": 0.6615950465202332, + "learning_rate": 1.4737815134381066e-05, + "loss": 0.746, + "step": 1357 + }, + { + "epoch": 1.0890136327185245, + "grad_norm": 0.6627095937728882, + "learning_rate": 1.4730187615476663e-05, + "loss": 0.7629, + "step": 1358 + }, + { + "epoch": 1.0898155573376103, + "grad_norm": 0.7005937099456787, + "learning_rate": 1.4722556549716495e-05, + "loss": 0.7637, + "step": 1359 + }, + { + "epoch": 1.090617481956696, + "grad_norm": 0.7017346620559692, + "learning_rate": 1.4714921942822593e-05, + "loss": 0.7745, + "step": 1360 + }, + { + "epoch": 1.0914194065757818, + "grad_norm": 0.6601778268814087, + "learning_rate": 1.4707283800519647e-05, + "loss": 0.7665, + "step": 1361 + }, + { + "epoch": 1.0922213311948676, + "grad_norm": 0.6817474961280823, + "learning_rate": 1.4699642128534994e-05, + "loss": 0.8088, + "step": 1362 + }, + { + "epoch": 1.0930232558139534, + "grad_norm": 0.6907531023025513, + "learning_rate": 1.4691996932598621e-05, + "loss": 0.7555, + "step": 1363 + }, + { + "epoch": 1.0938251804330392, + "grad_norm": 0.7029712796211243, + "learning_rate": 1.4684348218443159e-05, + "loss": 0.7749, + "step": 1364 + }, + { + "epoch": 1.094627105052125, + "grad_norm": 0.7028645873069763, + "learning_rate": 1.4676695991803869e-05, + "loss": 0.7931, + "step": 1365 + }, + { + "epoch": 1.0954290296712108, + "grad_norm": 0.6735509634017944, + "learning_rate": 1.4669040258418652e-05, + "loss": 0.7675, + "step": 1366 + }, + { + "epoch": 1.0962309542902968, + "grad_norm": 0.6408675909042358, + "learning_rate": 1.4661381024028042e-05, + "loss": 0.7434, + "step": 1367 + }, + { + "epoch": 1.0970328789093826, + "grad_norm": 0.6668729186058044, + "learning_rate": 1.4653718294375192e-05, + "loss": 0.782, + "step": 1368 + }, + { + "epoch": 1.0978348035284684, + "grad_norm": 0.7412964701652527, + "learning_rate": 1.4646052075205874e-05, + "loss": 0.7711, + "step": 1369 + }, + { + "epoch": 1.0986367281475542, + "grad_norm": 0.6989220976829529, + "learning_rate": 1.4638382372268484e-05, + "loss": 0.7949, + "step": 1370 + }, + { + "epoch": 1.09943865276664, + "grad_norm": 0.6390843987464905, + "learning_rate": 1.4630709191314026e-05, + "loss": 0.7403, + "step": 1371 + }, + { + "epoch": 1.1002405773857258, + "grad_norm": 0.6512402892112732, + "learning_rate": 1.462303253809611e-05, + "loss": 0.7627, + "step": 1372 + }, + { + "epoch": 1.1010425020048116, + "grad_norm": 0.6433535218238831, + "learning_rate": 1.4615352418370958e-05, + "loss": 0.7596, + "step": 1373 + }, + { + "epoch": 1.1018444266238974, + "grad_norm": 0.6682513356208801, + "learning_rate": 1.460766883789738e-05, + "loss": 0.7647, + "step": 1374 + }, + { + "epoch": 1.1026463512429832, + "grad_norm": 0.6825112104415894, + "learning_rate": 1.4599981802436785e-05, + "loss": 0.7692, + "step": 1375 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 0.6553147435188293, + "learning_rate": 1.4592291317753178e-05, + "loss": 0.7661, + "step": 1376 + }, + { + "epoch": 1.1042502004811547, + "grad_norm": 0.688605010509491, + "learning_rate": 1.4584597389613144e-05, + "loss": 0.7896, + "step": 1377 + }, + { + "epoch": 1.1050521251002405, + "grad_norm": 0.6833084225654602, + "learning_rate": 1.4576900023785853e-05, + "loss": 0.776, + "step": 1378 + }, + { + "epoch": 1.1058540497193263, + "grad_norm": 0.6186316013336182, + "learning_rate": 1.4569199226043051e-05, + "loss": 0.7468, + "step": 1379 + }, + { + "epoch": 1.1066559743384121, + "grad_norm": 0.6914650201797485, + "learning_rate": 1.4561495002159066e-05, + "loss": 0.7954, + "step": 1380 + }, + { + "epoch": 1.107457898957498, + "grad_norm": 0.6579850912094116, + "learning_rate": 1.4553787357910774e-05, + "loss": 0.7775, + "step": 1381 + }, + { + "epoch": 1.1082598235765837, + "grad_norm": 0.6452553868293762, + "learning_rate": 1.4546076299077639e-05, + "loss": 0.7601, + "step": 1382 + }, + { + "epoch": 1.1090617481956695, + "grad_norm": 0.654435396194458, + "learning_rate": 1.4538361831441672e-05, + "loss": 0.7614, + "step": 1383 + }, + { + "epoch": 1.1098636728147555, + "grad_norm": 0.6667703986167908, + "learning_rate": 1.4530643960787445e-05, + "loss": 0.7705, + "step": 1384 + }, + { + "epoch": 1.1106655974338413, + "grad_norm": 0.6765471696853638, + "learning_rate": 1.452292269290208e-05, + "loss": 0.8051, + "step": 1385 + }, + { + "epoch": 1.111467522052927, + "grad_norm": 0.633200466632843, + "learning_rate": 1.4515198033575243e-05, + "loss": 0.7119, + "step": 1386 + }, + { + "epoch": 1.1122694466720129, + "grad_norm": 0.6916564702987671, + "learning_rate": 1.4507469988599153e-05, + "loss": 0.758, + "step": 1387 + }, + { + "epoch": 1.1130713712910987, + "grad_norm": 0.6819466352462769, + "learning_rate": 1.4499738563768557e-05, + "loss": 0.7795, + "step": 1388 + }, + { + "epoch": 1.1138732959101845, + "grad_norm": 0.6802613735198975, + "learning_rate": 1.4492003764880744e-05, + "loss": 0.8001, + "step": 1389 + }, + { + "epoch": 1.1146752205292703, + "grad_norm": 0.6491445302963257, + "learning_rate": 1.4484265597735525e-05, + "loss": 0.7703, + "step": 1390 + }, + { + "epoch": 1.115477145148356, + "grad_norm": 0.634710431098938, + "learning_rate": 1.4476524068135246e-05, + "loss": 0.7764, + "step": 1391 + }, + { + "epoch": 1.1162790697674418, + "grad_norm": 0.7030678391456604, + "learning_rate": 1.4468779181884762e-05, + "loss": 0.7844, + "step": 1392 + }, + { + "epoch": 1.1170809943865276, + "grad_norm": 0.6353664398193359, + "learning_rate": 1.4461030944791464e-05, + "loss": 0.7452, + "step": 1393 + }, + { + "epoch": 1.1178829190056134, + "grad_norm": 0.696847677230835, + "learning_rate": 1.4453279362665234e-05, + "loss": 0.7598, + "step": 1394 + }, + { + "epoch": 1.1186848436246992, + "grad_norm": 0.6439919471740723, + "learning_rate": 1.4445524441318477e-05, + "loss": 0.7681, + "step": 1395 + }, + { + "epoch": 1.119486768243785, + "grad_norm": 0.6072260737419128, + "learning_rate": 1.4437766186566094e-05, + "loss": 0.7165, + "step": 1396 + }, + { + "epoch": 1.1202886928628708, + "grad_norm": 0.6615963578224182, + "learning_rate": 1.4430004604225493e-05, + "loss": 0.757, + "step": 1397 + }, + { + "epoch": 1.1210906174819566, + "grad_norm": 0.6312723159790039, + "learning_rate": 1.4422239700116572e-05, + "loss": 0.7481, + "step": 1398 + }, + { + "epoch": 1.1218925421010426, + "grad_norm": 0.6664157509803772, + "learning_rate": 1.4414471480061716e-05, + "loss": 0.766, + "step": 1399 + }, + { + "epoch": 1.1226944667201284, + "grad_norm": 0.6936686038970947, + "learning_rate": 1.4406699949885803e-05, + "loss": 0.8061, + "step": 1400 + }, + { + "epoch": 1.1234963913392142, + "grad_norm": 0.6664496660232544, + "learning_rate": 1.4398925115416196e-05, + "loss": 0.7682, + "step": 1401 + }, + { + "epoch": 1.1242983159583, + "grad_norm": 0.6195146441459656, + "learning_rate": 1.4391146982482724e-05, + "loss": 0.7158, + "step": 1402 + }, + { + "epoch": 1.1251002405773858, + "grad_norm": 0.627631425857544, + "learning_rate": 1.4383365556917701e-05, + "loss": 0.7568, + "step": 1403 + }, + { + "epoch": 1.1259021651964716, + "grad_norm": 0.6510641574859619, + "learning_rate": 1.4375580844555898e-05, + "loss": 0.7522, + "step": 1404 + }, + { + "epoch": 1.1267040898155574, + "grad_norm": 0.6601302027702332, + "learning_rate": 1.4367792851234566e-05, + "loss": 0.7652, + "step": 1405 + }, + { + "epoch": 1.1275060144346432, + "grad_norm": 0.629599928855896, + "learning_rate": 1.4360001582793404e-05, + "loss": 0.7619, + "step": 1406 + }, + { + "epoch": 1.128307939053729, + "grad_norm": 0.7037693858146667, + "learning_rate": 1.4352207045074567e-05, + "loss": 0.7956, + "step": 1407 + }, + { + "epoch": 1.1291098636728147, + "grad_norm": 0.6922396421432495, + "learning_rate": 1.4344409243922667e-05, + "loss": 0.7827, + "step": 1408 + }, + { + "epoch": 1.1299117882919005, + "grad_norm": 0.6473610997200012, + "learning_rate": 1.4336608185184765e-05, + "loss": 0.7751, + "step": 1409 + }, + { + "epoch": 1.1307137129109863, + "grad_norm": 0.7747618556022644, + "learning_rate": 1.4328803874710358e-05, + "loss": 0.7786, + "step": 1410 + }, + { + "epoch": 1.1315156375300721, + "grad_norm": 0.6290801763534546, + "learning_rate": 1.4320996318351378e-05, + "loss": 0.7315, + "step": 1411 + }, + { + "epoch": 1.132317562149158, + "grad_norm": 0.6735879778862, + "learning_rate": 1.4313185521962205e-05, + "loss": 0.796, + "step": 1412 + }, + { + "epoch": 1.1331194867682437, + "grad_norm": 0.6589605212211609, + "learning_rate": 1.4305371491399638e-05, + "loss": 0.7771, + "step": 1413 + }, + { + "epoch": 1.1339214113873295, + "grad_norm": 0.6696829199790955, + "learning_rate": 1.4297554232522898e-05, + "loss": 0.7968, + "step": 1414 + }, + { + "epoch": 1.1347233360064153, + "grad_norm": 0.6309067010879517, + "learning_rate": 1.4289733751193643e-05, + "loss": 0.7734, + "step": 1415 + }, + { + "epoch": 1.1355252606255013, + "grad_norm": 0.6822018623352051, + "learning_rate": 1.4281910053275923e-05, + "loss": 0.7691, + "step": 1416 + }, + { + "epoch": 1.136327185244587, + "grad_norm": 0.6693670153617859, + "learning_rate": 1.427408314463622e-05, + "loss": 0.7564, + "step": 1417 + }, + { + "epoch": 1.1371291098636729, + "grad_norm": 0.6806270480155945, + "learning_rate": 1.4266253031143418e-05, + "loss": 0.7953, + "step": 1418 + }, + { + "epoch": 1.1379310344827587, + "grad_norm": 0.7900277376174927, + "learning_rate": 1.4258419718668801e-05, + "loss": 0.7782, + "step": 1419 + }, + { + "epoch": 1.1387329591018445, + "grad_norm": 0.6651455760002136, + "learning_rate": 1.4250583213086051e-05, + "loss": 0.7406, + "step": 1420 + }, + { + "epoch": 1.1395348837209303, + "grad_norm": 0.6853930950164795, + "learning_rate": 1.4242743520271249e-05, + "loss": 0.7845, + "step": 1421 + }, + { + "epoch": 1.140336808340016, + "grad_norm": 0.6740282773971558, + "learning_rate": 1.4234900646102864e-05, + "loss": 0.7476, + "step": 1422 + }, + { + "epoch": 1.1411387329591018, + "grad_norm": 0.6734980344772339, + "learning_rate": 1.4227054596461754e-05, + "loss": 0.7855, + "step": 1423 + }, + { + "epoch": 1.1419406575781876, + "grad_norm": 0.6694862842559814, + "learning_rate": 1.4219205377231147e-05, + "loss": 0.7757, + "step": 1424 + }, + { + "epoch": 1.1427425821972734, + "grad_norm": 0.68555748462677, + "learning_rate": 1.4211352994296655e-05, + "loss": 0.7891, + "step": 1425 + }, + { + "epoch": 1.1435445068163592, + "grad_norm": 0.6966123580932617, + "learning_rate": 1.4203497453546267e-05, + "loss": 0.766, + "step": 1426 + }, + { + "epoch": 1.144346431435445, + "grad_norm": 0.6659271121025085, + "learning_rate": 1.4195638760870334e-05, + "loss": 0.7537, + "step": 1427 + }, + { + "epoch": 1.1451483560545308, + "grad_norm": 0.6558569073677063, + "learning_rate": 1.418777692216157e-05, + "loss": 0.7547, + "step": 1428 + }, + { + "epoch": 1.1459502806736166, + "grad_norm": 0.6753950119018555, + "learning_rate": 1.417991194331505e-05, + "loss": 0.7408, + "step": 1429 + }, + { + "epoch": 1.1467522052927026, + "grad_norm": 0.720521092414856, + "learning_rate": 1.4172043830228202e-05, + "loss": 0.7769, + "step": 1430 + }, + { + "epoch": 1.1475541299117884, + "grad_norm": 0.685820996761322, + "learning_rate": 1.4164172588800809e-05, + "loss": 0.7925, + "step": 1431 + }, + { + "epoch": 1.1483560545308742, + "grad_norm": 0.6265881061553955, + "learning_rate": 1.415629822493499e-05, + "loss": 0.7714, + "step": 1432 + }, + { + "epoch": 1.14915797914996, + "grad_norm": 0.6419846415519714, + "learning_rate": 1.4148420744535214e-05, + "loss": 0.7476, + "step": 1433 + }, + { + "epoch": 1.1499599037690458, + "grad_norm": 0.6394557952880859, + "learning_rate": 1.4140540153508285e-05, + "loss": 0.7862, + "step": 1434 + }, + { + "epoch": 1.1507618283881316, + "grad_norm": 0.6818154454231262, + "learning_rate": 1.4132656457763338e-05, + "loss": 0.8058, + "step": 1435 + }, + { + "epoch": 1.1515637530072174, + "grad_norm": 0.6973996758460999, + "learning_rate": 1.4124769663211837e-05, + "loss": 0.75, + "step": 1436 + }, + { + "epoch": 1.1523656776263032, + "grad_norm": 0.6343415379524231, + "learning_rate": 1.4116879775767567e-05, + "loss": 0.7878, + "step": 1437 + }, + { + "epoch": 1.153167602245389, + "grad_norm": 0.6887206435203552, + "learning_rate": 1.4108986801346633e-05, + "loss": 0.7894, + "step": 1438 + }, + { + "epoch": 1.1539695268644747, + "grad_norm": 0.6938029527664185, + "learning_rate": 1.4101090745867464e-05, + "loss": 0.7608, + "step": 1439 + }, + { + "epoch": 1.1547714514835605, + "grad_norm": 0.6808055639266968, + "learning_rate": 1.4093191615250785e-05, + "loss": 0.7765, + "step": 1440 + }, + { + "epoch": 1.1555733761026463, + "grad_norm": 0.6570947766304016, + "learning_rate": 1.4085289415419632e-05, + "loss": 0.7583, + "step": 1441 + }, + { + "epoch": 1.1563753007217321, + "grad_norm": 0.6893501877784729, + "learning_rate": 1.4077384152299348e-05, + "loss": 0.7418, + "step": 1442 + }, + { + "epoch": 1.157177225340818, + "grad_norm": 0.689246654510498, + "learning_rate": 1.4069475831817564e-05, + "loss": 0.7751, + "step": 1443 + }, + { + "epoch": 1.1579791499599037, + "grad_norm": 0.6660308241844177, + "learning_rate": 1.4061564459904214e-05, + "loss": 0.7582, + "step": 1444 + }, + { + "epoch": 1.1587810745789895, + "grad_norm": 0.6339597702026367, + "learning_rate": 1.4053650042491507e-05, + "loss": 0.7172, + "step": 1445 + }, + { + "epoch": 1.1595829991980753, + "grad_norm": 0.6734780073165894, + "learning_rate": 1.4045732585513945e-05, + "loss": 0.7813, + "step": 1446 + }, + { + "epoch": 1.160384923817161, + "grad_norm": 0.6459300518035889, + "learning_rate": 1.403781209490831e-05, + "loss": 0.755, + "step": 1447 + }, + { + "epoch": 1.161186848436247, + "grad_norm": 0.6735323667526245, + "learning_rate": 1.4029888576613654e-05, + "loss": 0.7667, + "step": 1448 + }, + { + "epoch": 1.1619887730553329, + "grad_norm": 0.7017342448234558, + "learning_rate": 1.4021962036571301e-05, + "loss": 0.7973, + "step": 1449 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 0.6533816456794739, + "learning_rate": 1.4014032480724838e-05, + "loss": 0.7825, + "step": 1450 + }, + { + "epoch": 1.1635926222935045, + "grad_norm": 0.674637496471405, + "learning_rate": 1.400609991502012e-05, + "loss": 0.7378, + "step": 1451 + }, + { + "epoch": 1.1643945469125903, + "grad_norm": 0.6480211019515991, + "learning_rate": 1.3998164345405253e-05, + "loss": 0.7617, + "step": 1452 + }, + { + "epoch": 1.165196471531676, + "grad_norm": 0.6655580997467041, + "learning_rate": 1.3990225777830595e-05, + "loss": 0.788, + "step": 1453 + }, + { + "epoch": 1.1659983961507618, + "grad_norm": 0.6576639413833618, + "learning_rate": 1.3982284218248758e-05, + "loss": 0.7567, + "step": 1454 + }, + { + "epoch": 1.1668003207698476, + "grad_norm": 0.6448426246643066, + "learning_rate": 1.3974339672614594e-05, + "loss": 0.7859, + "step": 1455 + }, + { + "epoch": 1.1676022453889334, + "grad_norm": 0.6625378131866455, + "learning_rate": 1.396639214688519e-05, + "loss": 0.7589, + "step": 1456 + }, + { + "epoch": 1.1684041700080192, + "grad_norm": 0.679095983505249, + "learning_rate": 1.3958441647019877e-05, + "loss": 0.7464, + "step": 1457 + }, + { + "epoch": 1.169206094627105, + "grad_norm": 0.6716442704200745, + "learning_rate": 1.3950488178980203e-05, + "loss": 0.7687, + "step": 1458 + }, + { + "epoch": 1.1700080192461908, + "grad_norm": 0.6463879346847534, + "learning_rate": 1.394253174872996e-05, + "loss": 0.7424, + "step": 1459 + }, + { + "epoch": 1.1708099438652766, + "grad_norm": 0.6612167358398438, + "learning_rate": 1.393457236223514e-05, + "loss": 0.7628, + "step": 1460 + }, + { + "epoch": 1.1716118684843624, + "grad_norm": 0.7058016657829285, + "learning_rate": 1.3926610025463967e-05, + "loss": 0.7804, + "step": 1461 + }, + { + "epoch": 1.1724137931034484, + "grad_norm": 0.697573721408844, + "learning_rate": 1.3918644744386868e-05, + "loss": 0.7949, + "step": 1462 + }, + { + "epoch": 1.1732157177225342, + "grad_norm": 0.6801334023475647, + "learning_rate": 1.3910676524976489e-05, + "loss": 0.7611, + "step": 1463 + }, + { + "epoch": 1.17401764234162, + "grad_norm": 0.6564053893089294, + "learning_rate": 1.3902705373207669e-05, + "loss": 0.7559, + "step": 1464 + }, + { + "epoch": 1.1748195669607058, + "grad_norm": 0.6379392147064209, + "learning_rate": 1.3894731295057446e-05, + "loss": 0.7549, + "step": 1465 + }, + { + "epoch": 1.1756214915797916, + "grad_norm": 0.6612043380737305, + "learning_rate": 1.388675429650506e-05, + "loss": 0.7533, + "step": 1466 + }, + { + "epoch": 1.1764234161988774, + "grad_norm": 0.6841898560523987, + "learning_rate": 1.3878774383531935e-05, + "loss": 0.7963, + "step": 1467 + }, + { + "epoch": 1.1772253408179632, + "grad_norm": 0.6732792854309082, + "learning_rate": 1.3870791562121679e-05, + "loss": 0.7894, + "step": 1468 + }, + { + "epoch": 1.178027265437049, + "grad_norm": 0.6291318535804749, + "learning_rate": 1.3862805838260087e-05, + "loss": 0.7317, + "step": 1469 + }, + { + "epoch": 1.1788291900561347, + "grad_norm": 0.7076729536056519, + "learning_rate": 1.3854817217935126e-05, + "loss": 0.7777, + "step": 1470 + }, + { + "epoch": 1.1796311146752205, + "grad_norm": 0.6925358176231384, + "learning_rate": 1.384682570713693e-05, + "loss": 0.7755, + "step": 1471 + }, + { + "epoch": 1.1804330392943063, + "grad_norm": 0.6519325971603394, + "learning_rate": 1.3838831311857812e-05, + "loss": 0.7508, + "step": 1472 + }, + { + "epoch": 1.181234963913392, + "grad_norm": 0.6759724020957947, + "learning_rate": 1.383083403809224e-05, + "loss": 0.7513, + "step": 1473 + }, + { + "epoch": 1.182036888532478, + "grad_norm": 0.6461741328239441, + "learning_rate": 1.3822833891836846e-05, + "loss": 0.7574, + "step": 1474 + }, + { + "epoch": 1.1828388131515637, + "grad_norm": 0.6571494936943054, + "learning_rate": 1.3814830879090409e-05, + "loss": 0.7941, + "step": 1475 + }, + { + "epoch": 1.1836407377706495, + "grad_norm": 0.7067133784294128, + "learning_rate": 1.3806825005853855e-05, + "loss": 0.7657, + "step": 1476 + }, + { + "epoch": 1.1844426623897353, + "grad_norm": 0.6409063935279846, + "learning_rate": 1.3798816278130268e-05, + "loss": 0.7547, + "step": 1477 + }, + { + "epoch": 1.185244587008821, + "grad_norm": 0.6490313410758972, + "learning_rate": 1.3790804701924861e-05, + "loss": 0.7466, + "step": 1478 + }, + { + "epoch": 1.1860465116279069, + "grad_norm": 0.6448349952697754, + "learning_rate": 1.378279028324499e-05, + "loss": 0.7466, + "step": 1479 + }, + { + "epoch": 1.1868484362469929, + "grad_norm": 0.6649291515350342, + "learning_rate": 1.3774773028100135e-05, + "loss": 0.7569, + "step": 1480 + }, + { + "epoch": 1.1876503608660787, + "grad_norm": 0.6493022441864014, + "learning_rate": 1.3766752942501911e-05, + "loss": 0.7479, + "step": 1481 + }, + { + "epoch": 1.1884522854851645, + "grad_norm": 0.6873480081558228, + "learning_rate": 1.375873003246405e-05, + "loss": 0.7851, + "step": 1482 + }, + { + "epoch": 1.1892542101042503, + "grad_norm": 0.6878486275672913, + "learning_rate": 1.3750704304002398e-05, + "loss": 0.7799, + "step": 1483 + }, + { + "epoch": 1.190056134723336, + "grad_norm": 0.6725685596466064, + "learning_rate": 1.3742675763134926e-05, + "loss": 0.7607, + "step": 1484 + }, + { + "epoch": 1.1908580593424218, + "grad_norm": 0.715043842792511, + "learning_rate": 1.3734644415881708e-05, + "loss": 0.7748, + "step": 1485 + }, + { + "epoch": 1.1916599839615076, + "grad_norm": 0.7091044783592224, + "learning_rate": 1.3726610268264917e-05, + "loss": 0.7979, + "step": 1486 + }, + { + "epoch": 1.1924619085805934, + "grad_norm": 0.7154737710952759, + "learning_rate": 1.3718573326308834e-05, + "loss": 0.7526, + "step": 1487 + }, + { + "epoch": 1.1932638331996792, + "grad_norm": 0.6581177711486816, + "learning_rate": 1.3710533596039828e-05, + "loss": 0.7468, + "step": 1488 + }, + { + "epoch": 1.194065757818765, + "grad_norm": 0.7028568387031555, + "learning_rate": 1.3702491083486366e-05, + "loss": 0.7795, + "step": 1489 + }, + { + "epoch": 1.1948676824378508, + "grad_norm": 0.6829168200492859, + "learning_rate": 1.3694445794678996e-05, + "loss": 0.8018, + "step": 1490 + }, + { + "epoch": 1.1956696070569366, + "grad_norm": 0.6602156162261963, + "learning_rate": 1.3686397735650353e-05, + "loss": 0.7477, + "step": 1491 + }, + { + "epoch": 1.1964715316760224, + "grad_norm": 0.6569467782974243, + "learning_rate": 1.3678346912435141e-05, + "loss": 0.7608, + "step": 1492 + }, + { + "epoch": 1.1972734562951082, + "grad_norm": 0.6565800905227661, + "learning_rate": 1.3670293331070142e-05, + "loss": 0.7646, + "step": 1493 + }, + { + "epoch": 1.1980753809141942, + "grad_norm": 0.6818427443504333, + "learning_rate": 1.3662236997594209e-05, + "loss": 0.8143, + "step": 1494 + }, + { + "epoch": 1.19887730553328, + "grad_norm": 0.7073442935943604, + "learning_rate": 1.3654177918048253e-05, + "loss": 0.7981, + "step": 1495 + }, + { + "epoch": 1.1996792301523658, + "grad_norm": 0.6537404656410217, + "learning_rate": 1.3646116098475246e-05, + "loss": 0.7556, + "step": 1496 + }, + { + "epoch": 1.2004811547714516, + "grad_norm": 0.6920551061630249, + "learning_rate": 1.3638051544920217e-05, + "loss": 0.7843, + "step": 1497 + }, + { + "epoch": 1.2012830793905374, + "grad_norm": 0.9153415560722351, + "learning_rate": 1.3629984263430238e-05, + "loss": 0.7822, + "step": 1498 + }, + { + "epoch": 1.2020850040096231, + "grad_norm": 0.6674759387969971, + "learning_rate": 1.3621914260054437e-05, + "loss": 0.75, + "step": 1499 + }, + { + "epoch": 1.202886928628709, + "grad_norm": 0.6499200463294983, + "learning_rate": 1.3613841540843978e-05, + "loss": 0.7385, + "step": 1500 + }, + { + "epoch": 1.2036888532477947, + "grad_norm": 0.724064290523529, + "learning_rate": 1.3605766111852052e-05, + "loss": 0.7836, + "step": 1501 + }, + { + "epoch": 1.2044907778668805, + "grad_norm": 0.6528597474098206, + "learning_rate": 1.3597687979133898e-05, + "loss": 0.7304, + "step": 1502 + }, + { + "epoch": 1.2052927024859663, + "grad_norm": 0.6762754917144775, + "learning_rate": 1.3589607148746775e-05, + "loss": 0.7487, + "step": 1503 + }, + { + "epoch": 1.206094627105052, + "grad_norm": 0.693828284740448, + "learning_rate": 1.3581523626749966e-05, + "loss": 0.762, + "step": 1504 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 0.6931832432746887, + "learning_rate": 1.3573437419204765e-05, + "loss": 0.8055, + "step": 1505 + }, + { + "epoch": 1.2076984763432237, + "grad_norm": 0.691015362739563, + "learning_rate": 1.3565348532174487e-05, + "loss": 0.7511, + "step": 1506 + }, + { + "epoch": 1.2085004009623095, + "grad_norm": 0.6388340592384338, + "learning_rate": 1.355725697172446e-05, + "loss": 0.7281, + "step": 1507 + }, + { + "epoch": 1.2093023255813953, + "grad_norm": 0.6492217779159546, + "learning_rate": 1.354916274392201e-05, + "loss": 0.7788, + "step": 1508 + }, + { + "epoch": 1.210104250200481, + "grad_norm": 0.6737982034683228, + "learning_rate": 1.3541065854836464e-05, + "loss": 0.7426, + "step": 1509 + }, + { + "epoch": 1.2109061748195669, + "grad_norm": 0.6400448083877563, + "learning_rate": 1.3532966310539142e-05, + "loss": 0.7495, + "step": 1510 + }, + { + "epoch": 1.2117080994386527, + "grad_norm": 0.6762053370475769, + "learning_rate": 1.352486411710336e-05, + "loss": 0.7913, + "step": 1511 + }, + { + "epoch": 1.2125100240577387, + "grad_norm": 0.6663560271263123, + "learning_rate": 1.3516759280604423e-05, + "loss": 0.7498, + "step": 1512 + }, + { + "epoch": 1.2133119486768245, + "grad_norm": 0.6616519689559937, + "learning_rate": 1.3508651807119609e-05, + "loss": 0.7328, + "step": 1513 + }, + { + "epoch": 1.2141138732959103, + "grad_norm": 0.636685311794281, + "learning_rate": 1.3500541702728175e-05, + "loss": 0.7758, + "step": 1514 + }, + { + "epoch": 1.214915797914996, + "grad_norm": 0.6234886050224304, + "learning_rate": 1.3492428973511363e-05, + "loss": 0.7013, + "step": 1515 + }, + { + "epoch": 1.2157177225340818, + "grad_norm": 0.6537496447563171, + "learning_rate": 1.3484313625552362e-05, + "loss": 0.7369, + "step": 1516 + }, + { + "epoch": 1.2165196471531676, + "grad_norm": 0.6941004991531372, + "learning_rate": 1.3476195664936347e-05, + "loss": 0.7798, + "step": 1517 + }, + { + "epoch": 1.2173215717722534, + "grad_norm": 0.670886754989624, + "learning_rate": 1.3468075097750432e-05, + "loss": 0.7566, + "step": 1518 + }, + { + "epoch": 1.2181234963913392, + "grad_norm": 0.751348614692688, + "learning_rate": 1.3459951930083698e-05, + "loss": 0.7695, + "step": 1519 + }, + { + "epoch": 1.218925421010425, + "grad_norm": 0.6668398976325989, + "learning_rate": 1.345182616802718e-05, + "loss": 0.7587, + "step": 1520 + }, + { + "epoch": 1.2197273456295108, + "grad_norm": 0.6931249499320984, + "learning_rate": 1.3443697817673842e-05, + "loss": 0.7838, + "step": 1521 + }, + { + "epoch": 1.2205292702485966, + "grad_norm": 0.6422427892684937, + "learning_rate": 1.34355668851186e-05, + "loss": 0.7395, + "step": 1522 + }, + { + "epoch": 1.2213311948676824, + "grad_norm": 0.6797451376914978, + "learning_rate": 1.3427433376458306e-05, + "loss": 0.8113, + "step": 1523 + }, + { + "epoch": 1.2221331194867682, + "grad_norm": 0.6311590075492859, + "learning_rate": 1.341929729779174e-05, + "loss": 0.774, + "step": 1524 + }, + { + "epoch": 1.222935044105854, + "grad_norm": 0.6638842821121216, + "learning_rate": 1.3411158655219615e-05, + "loss": 0.7781, + "step": 1525 + }, + { + "epoch": 1.22373696872494, + "grad_norm": 0.6489601135253906, + "learning_rate": 1.3403017454844556e-05, + "loss": 0.7779, + "step": 1526 + }, + { + "epoch": 1.2245388933440258, + "grad_norm": 0.6765308976173401, + "learning_rate": 1.3394873702771114e-05, + "loss": 0.7736, + "step": 1527 + }, + { + "epoch": 1.2253408179631116, + "grad_norm": 0.6653009653091431, + "learning_rate": 1.3386727405105756e-05, + "loss": 0.7436, + "step": 1528 + }, + { + "epoch": 1.2261427425821974, + "grad_norm": 0.6361008286476135, + "learning_rate": 1.337857856795685e-05, + "loss": 0.7256, + "step": 1529 + }, + { + "epoch": 1.2269446672012831, + "grad_norm": 0.6522552967071533, + "learning_rate": 1.3370427197434673e-05, + "loss": 0.7511, + "step": 1530 + }, + { + "epoch": 1.227746591820369, + "grad_norm": 0.6462847590446472, + "learning_rate": 1.3362273299651395e-05, + "loss": 0.749, + "step": 1531 + }, + { + "epoch": 1.2285485164394547, + "grad_norm": 0.6825747489929199, + "learning_rate": 1.3354116880721093e-05, + "loss": 0.7598, + "step": 1532 + }, + { + "epoch": 1.2293504410585405, + "grad_norm": 0.6631978154182434, + "learning_rate": 1.334595794675973e-05, + "loss": 0.7954, + "step": 1533 + }, + { + "epoch": 1.2301523656776263, + "grad_norm": 0.6733466386795044, + "learning_rate": 1.333779650388514e-05, + "loss": 0.7623, + "step": 1534 + }, + { + "epoch": 1.230954290296712, + "grad_norm": 0.6870176792144775, + "learning_rate": 1.3329632558217065e-05, + "loss": 0.7626, + "step": 1535 + }, + { + "epoch": 1.231756214915798, + "grad_norm": 0.7062235474586487, + "learning_rate": 1.33214661158771e-05, + "loss": 0.7815, + "step": 1536 + }, + { + "epoch": 1.2325581395348837, + "grad_norm": 0.6793636679649353, + "learning_rate": 1.3313297182988722e-05, + "loss": 0.7597, + "step": 1537 + }, + { + "epoch": 1.2333600641539695, + "grad_norm": 0.697195291519165, + "learning_rate": 1.3305125765677283e-05, + "loss": 0.7883, + "step": 1538 + }, + { + "epoch": 1.2341619887730553, + "grad_norm": 0.7242034673690796, + "learning_rate": 1.3296951870069981e-05, + "loss": 0.7931, + "step": 1539 + }, + { + "epoch": 1.234963913392141, + "grad_norm": 0.6695935130119324, + "learning_rate": 1.328877550229589e-05, + "loss": 0.737, + "step": 1540 + }, + { + "epoch": 1.2357658380112269, + "grad_norm": 0.6315851807594299, + "learning_rate": 1.3280596668485919e-05, + "loss": 0.7701, + "step": 1541 + }, + { + "epoch": 1.2365677626303127, + "grad_norm": 0.6564438343048096, + "learning_rate": 1.3272415374772844e-05, + "loss": 0.7729, + "step": 1542 + }, + { + "epoch": 1.2373696872493984, + "grad_norm": 0.6894364953041077, + "learning_rate": 1.3264231627291273e-05, + "loss": 0.8072, + "step": 1543 + }, + { + "epoch": 1.2381716118684845, + "grad_norm": 0.7009897828102112, + "learning_rate": 1.325604543217766e-05, + "loss": 0.752, + "step": 1544 + }, + { + "epoch": 1.2389735364875702, + "grad_norm": 0.7217838764190674, + "learning_rate": 1.3247856795570295e-05, + "loss": 0.7707, + "step": 1545 + }, + { + "epoch": 1.239775461106656, + "grad_norm": 0.6767851710319519, + "learning_rate": 1.3239665723609294e-05, + "loss": 0.7444, + "step": 1546 + }, + { + "epoch": 1.2405773857257418, + "grad_norm": 0.68946373462677, + "learning_rate": 1.3231472222436605e-05, + "loss": 0.7341, + "step": 1547 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 0.7371004223823547, + "learning_rate": 1.3223276298195988e-05, + "loss": 0.7759, + "step": 1548 + }, + { + "epoch": 1.2421812349639134, + "grad_norm": 0.6482488512992859, + "learning_rate": 1.3215077957033032e-05, + "loss": 0.7692, + "step": 1549 + }, + { + "epoch": 1.2429831595829992, + "grad_norm": 0.6388265490531921, + "learning_rate": 1.3206877205095133e-05, + "loss": 0.7739, + "step": 1550 + }, + { + "epoch": 1.243785084202085, + "grad_norm": 0.7499321699142456, + "learning_rate": 1.3198674048531488e-05, + "loss": 0.8046, + "step": 1551 + }, + { + "epoch": 1.2445870088211708, + "grad_norm": 0.6427833437919617, + "learning_rate": 1.3190468493493107e-05, + "loss": 0.7477, + "step": 1552 + }, + { + "epoch": 1.2453889334402566, + "grad_norm": 0.6992602944374084, + "learning_rate": 1.3182260546132795e-05, + "loss": 0.7773, + "step": 1553 + }, + { + "epoch": 1.2461908580593424, + "grad_norm": 0.6634381413459778, + "learning_rate": 1.3174050212605147e-05, + "loss": 0.7649, + "step": 1554 + }, + { + "epoch": 1.2469927826784282, + "grad_norm": 0.6472319960594177, + "learning_rate": 1.316583749906656e-05, + "loss": 0.7866, + "step": 1555 + }, + { + "epoch": 1.247794707297514, + "grad_norm": 0.6691563725471497, + "learning_rate": 1.3157622411675195e-05, + "loss": 0.7582, + "step": 1556 + }, + { + "epoch": 1.2485966319165998, + "grad_norm": 0.7087457776069641, + "learning_rate": 1.3149404956591008e-05, + "loss": 0.7923, + "step": 1557 + }, + { + "epoch": 1.2493985565356858, + "grad_norm": 0.6310862898826599, + "learning_rate": 1.3141185139975728e-05, + "loss": 0.7327, + "step": 1558 + }, + { + "epoch": 1.2502004811547716, + "grad_norm": 0.6904287338256836, + "learning_rate": 1.3132962967992854e-05, + "loss": 0.8019, + "step": 1559 + }, + { + "epoch": 1.2510024057738574, + "grad_norm": 0.6679447889328003, + "learning_rate": 1.3124738446807652e-05, + "loss": 0.7593, + "step": 1560 + }, + { + "epoch": 1.2518043303929431, + "grad_norm": 0.6390910744667053, + "learning_rate": 1.3116511582587144e-05, + "loss": 0.7633, + "step": 1561 + }, + { + "epoch": 1.252606255012029, + "grad_norm": 0.7109375, + "learning_rate": 1.3108282381500113e-05, + "loss": 0.7773, + "step": 1562 + }, + { + "epoch": 1.2534081796311147, + "grad_norm": 0.7146701812744141, + "learning_rate": 1.3100050849717102e-05, + "loss": 0.7661, + "step": 1563 + }, + { + "epoch": 1.2542101042502005, + "grad_norm": 0.6369656920433044, + "learning_rate": 1.309181699341038e-05, + "loss": 0.7478, + "step": 1564 + }, + { + "epoch": 1.2550120288692863, + "grad_norm": 0.6547572016716003, + "learning_rate": 1.3083580818753985e-05, + "loss": 0.7648, + "step": 1565 + }, + { + "epoch": 1.255813953488372, + "grad_norm": 0.6653978228569031, + "learning_rate": 1.3075342331923675e-05, + "loss": 0.7727, + "step": 1566 + }, + { + "epoch": 1.256615878107458, + "grad_norm": 0.6517826914787292, + "learning_rate": 1.3067101539096952e-05, + "loss": 0.7463, + "step": 1567 + }, + { + "epoch": 1.2574178027265437, + "grad_norm": 0.673172116279602, + "learning_rate": 1.305885844645304e-05, + "loss": 0.7474, + "step": 1568 + }, + { + "epoch": 1.2582197273456295, + "grad_norm": 0.6843745112419128, + "learning_rate": 1.3050613060172893e-05, + "loss": 0.7819, + "step": 1569 + }, + { + "epoch": 1.2590216519647153, + "grad_norm": 0.6570084095001221, + "learning_rate": 1.304236538643918e-05, + "loss": 0.7617, + "step": 1570 + }, + { + "epoch": 1.259823576583801, + "grad_norm": 0.6739295125007629, + "learning_rate": 1.3034115431436286e-05, + "loss": 0.7473, + "step": 1571 + }, + { + "epoch": 1.2606255012028869, + "grad_norm": 0.6281715035438538, + "learning_rate": 1.3025863201350315e-05, + "loss": 0.7476, + "step": 1572 + }, + { + "epoch": 1.2614274258219726, + "grad_norm": 0.6386378407478333, + "learning_rate": 1.3017608702369065e-05, + "loss": 0.7579, + "step": 1573 + }, + { + "epoch": 1.2622293504410584, + "grad_norm": 0.6800333857536316, + "learning_rate": 1.300935194068204e-05, + "loss": 0.7787, + "step": 1574 + }, + { + "epoch": 1.2630312750601442, + "grad_norm": 0.6754662394523621, + "learning_rate": 1.3001092922480445e-05, + "loss": 0.7985, + "step": 1575 + }, + { + "epoch": 1.26383319967923, + "grad_norm": 0.666191041469574, + "learning_rate": 1.2992831653957173e-05, + "loss": 0.7681, + "step": 1576 + }, + { + "epoch": 1.264635124298316, + "grad_norm": 0.636799693107605, + "learning_rate": 1.2984568141306797e-05, + "loss": 0.7185, + "step": 1577 + }, + { + "epoch": 1.2654370489174018, + "grad_norm": 0.6819969415664673, + "learning_rate": 1.2976302390725586e-05, + "loss": 0.8076, + "step": 1578 + }, + { + "epoch": 1.2662389735364876, + "grad_norm": 0.658902645111084, + "learning_rate": 1.296803440841148e-05, + "loss": 0.77, + "step": 1579 + }, + { + "epoch": 1.2670408981555734, + "grad_norm": 0.6477823853492737, + "learning_rate": 1.29597642005641e-05, + "loss": 0.7159, + "step": 1580 + }, + { + "epoch": 1.2678428227746592, + "grad_norm": 0.6573794484138489, + "learning_rate": 1.2951491773384722e-05, + "loss": 0.7573, + "step": 1581 + }, + { + "epoch": 1.268644747393745, + "grad_norm": 0.6093468070030212, + "learning_rate": 1.2943217133076294e-05, + "loss": 0.7136, + "step": 1582 + }, + { + "epoch": 1.2694466720128308, + "grad_norm": 0.6893898844718933, + "learning_rate": 1.2934940285843425e-05, + "loss": 0.7904, + "step": 1583 + }, + { + "epoch": 1.2702485966319166, + "grad_norm": 0.6822935938835144, + "learning_rate": 1.2926661237892377e-05, + "loss": 0.7664, + "step": 1584 + }, + { + "epoch": 1.2710505212510024, + "grad_norm": 0.6702585220336914, + "learning_rate": 1.2918379995431062e-05, + "loss": 0.7504, + "step": 1585 + }, + { + "epoch": 1.2718524458700882, + "grad_norm": 0.7012314200401306, + "learning_rate": 1.2910096564669037e-05, + "loss": 0.7665, + "step": 1586 + }, + { + "epoch": 1.272654370489174, + "grad_norm": 0.6500260233879089, + "learning_rate": 1.2901810951817499e-05, + "loss": 0.7426, + "step": 1587 + }, + { + "epoch": 1.2734562951082598, + "grad_norm": 0.6530648469924927, + "learning_rate": 1.2893523163089285e-05, + "loss": 0.7817, + "step": 1588 + }, + { + "epoch": 1.2742582197273458, + "grad_norm": 0.6491802334785461, + "learning_rate": 1.2885233204698866e-05, + "loss": 0.75, + "step": 1589 + }, + { + "epoch": 1.2750601443464316, + "grad_norm": 0.6376941204071045, + "learning_rate": 1.2876941082862324e-05, + "loss": 0.7574, + "step": 1590 + }, + { + "epoch": 1.2758620689655173, + "grad_norm": 0.6957758665084839, + "learning_rate": 1.2868646803797384e-05, + "loss": 0.7617, + "step": 1591 + }, + { + "epoch": 1.2766639935846031, + "grad_norm": 0.6394297480583191, + "learning_rate": 1.2860350373723374e-05, + "loss": 0.7365, + "step": 1592 + }, + { + "epoch": 1.277465918203689, + "grad_norm": 0.6853082180023193, + "learning_rate": 1.2852051798861243e-05, + "loss": 0.7461, + "step": 1593 + }, + { + "epoch": 1.2782678428227747, + "grad_norm": 0.6423301100730896, + "learning_rate": 1.2843751085433539e-05, + "loss": 0.7031, + "step": 1594 + }, + { + "epoch": 1.2790697674418605, + "grad_norm": 0.6504702568054199, + "learning_rate": 1.2835448239664425e-05, + "loss": 0.7431, + "step": 1595 + }, + { + "epoch": 1.2798716920609463, + "grad_norm": 0.6869426369667053, + "learning_rate": 1.2827143267779658e-05, + "loss": 0.7593, + "step": 1596 + }, + { + "epoch": 1.280673616680032, + "grad_norm": 0.708145022392273, + "learning_rate": 1.2818836176006586e-05, + "loss": 0.7978, + "step": 1597 + }, + { + "epoch": 1.281475541299118, + "grad_norm": 0.6440792679786682, + "learning_rate": 1.2810526970574151e-05, + "loss": 0.7948, + "step": 1598 + }, + { + "epoch": 1.2822774659182037, + "grad_norm": 0.6792011260986328, + "learning_rate": 1.2802215657712876e-05, + "loss": 0.7789, + "step": 1599 + }, + { + "epoch": 1.2830793905372895, + "grad_norm": 0.6505358815193176, + "learning_rate": 1.2793902243654868e-05, + "loss": 0.7309, + "step": 1600 + }, + { + "epoch": 1.2838813151563753, + "grad_norm": 0.6658748388290405, + "learning_rate": 1.278558673463381e-05, + "loss": 0.7607, + "step": 1601 + }, + { + "epoch": 1.284683239775461, + "grad_norm": 0.6865249276161194, + "learning_rate": 1.2777269136884952e-05, + "loss": 0.758, + "step": 1602 + }, + { + "epoch": 1.2854851643945469, + "grad_norm": 0.6584889888763428, + "learning_rate": 1.2768949456645108e-05, + "loss": 0.7318, + "step": 1603 + }, + { + "epoch": 1.2862870890136326, + "grad_norm": 0.6798214912414551, + "learning_rate": 1.2760627700152664e-05, + "loss": 0.7606, + "step": 1604 + }, + { + "epoch": 1.2870890136327184, + "grad_norm": 0.6833958625793457, + "learning_rate": 1.275230387364755e-05, + "loss": 0.8089, + "step": 1605 + }, + { + "epoch": 1.2878909382518042, + "grad_norm": 0.6360597014427185, + "learning_rate": 1.2743977983371268e-05, + "loss": 0.7443, + "step": 1606 + }, + { + "epoch": 1.28869286287089, + "grad_norm": 0.7116873860359192, + "learning_rate": 1.2735650035566836e-05, + "loss": 0.7916, + "step": 1607 + }, + { + "epoch": 1.2894947874899758, + "grad_norm": 0.6559532880783081, + "learning_rate": 1.2727320036478843e-05, + "loss": 0.7767, + "step": 1608 + }, + { + "epoch": 1.2902967121090618, + "grad_norm": 0.6576822400093079, + "learning_rate": 1.2718987992353403e-05, + "loss": 0.7425, + "step": 1609 + }, + { + "epoch": 1.2910986367281476, + "grad_norm": 0.6480294466018677, + "learning_rate": 1.2710653909438172e-05, + "loss": 0.7506, + "step": 1610 + }, + { + "epoch": 1.2919005613472334, + "grad_norm": 0.6556183099746704, + "learning_rate": 1.2702317793982327e-05, + "loss": 0.7592, + "step": 1611 + }, + { + "epoch": 1.2927024859663192, + "grad_norm": 0.7214966416358948, + "learning_rate": 1.2693979652236564e-05, + "loss": 0.7769, + "step": 1612 + }, + { + "epoch": 1.293504410585405, + "grad_norm": 0.6807628273963928, + "learning_rate": 1.2685639490453113e-05, + "loss": 0.7976, + "step": 1613 + }, + { + "epoch": 1.2943063352044908, + "grad_norm": 0.6581088900566101, + "learning_rate": 1.2677297314885708e-05, + "loss": 0.7919, + "step": 1614 + }, + { + "epoch": 1.2951082598235766, + "grad_norm": 0.6764626502990723, + "learning_rate": 1.2668953131789599e-05, + "loss": 0.774, + "step": 1615 + }, + { + "epoch": 1.2959101844426624, + "grad_norm": 0.696811854839325, + "learning_rate": 1.2660606947421537e-05, + "loss": 0.7757, + "step": 1616 + }, + { + "epoch": 1.2967121090617482, + "grad_norm": 0.6721716523170471, + "learning_rate": 1.2652258768039775e-05, + "loss": 0.7861, + "step": 1617 + }, + { + "epoch": 1.297514033680834, + "grad_norm": 0.7006967663764954, + "learning_rate": 1.2643908599904063e-05, + "loss": 0.7621, + "step": 1618 + }, + { + "epoch": 1.2983159582999197, + "grad_norm": 0.630478024482727, + "learning_rate": 1.2635556449275641e-05, + "loss": 0.7318, + "step": 1619 + }, + { + "epoch": 1.2991178829190055, + "grad_norm": 0.6607829332351685, + "learning_rate": 1.2627202322417235e-05, + "loss": 0.8096, + "step": 1620 + }, + { + "epoch": 1.2999198075380916, + "grad_norm": 0.6839569807052612, + "learning_rate": 1.2618846225593057e-05, + "loss": 0.7783, + "step": 1621 + }, + { + "epoch": 1.3007217321571773, + "grad_norm": 0.6771467328071594, + "learning_rate": 1.2610488165068793e-05, + "loss": 0.7737, + "step": 1622 + }, + { + "epoch": 1.3015236567762631, + "grad_norm": 0.6908150315284729, + "learning_rate": 1.2602128147111597e-05, + "loss": 0.7958, + "step": 1623 + }, + { + "epoch": 1.302325581395349, + "grad_norm": 0.731979489326477, + "learning_rate": 1.2593766177990096e-05, + "loss": 0.7927, + "step": 1624 + }, + { + "epoch": 1.3031275060144347, + "grad_norm": 0.6440238952636719, + "learning_rate": 1.2585402263974383e-05, + "loss": 0.7429, + "step": 1625 + }, + { + "epoch": 1.3039294306335205, + "grad_norm": 0.6557809114456177, + "learning_rate": 1.2577036411336003e-05, + "loss": 0.758, + "step": 1626 + }, + { + "epoch": 1.3047313552526063, + "grad_norm": 0.670251727104187, + "learning_rate": 1.256866862634796e-05, + "loss": 0.7308, + "step": 1627 + }, + { + "epoch": 1.305533279871692, + "grad_norm": 0.6871209144592285, + "learning_rate": 1.2560298915284699e-05, + "loss": 0.7549, + "step": 1628 + }, + { + "epoch": 1.306335204490778, + "grad_norm": 0.6390516757965088, + "learning_rate": 1.2551927284422117e-05, + "loss": 0.7439, + "step": 1629 + }, + { + "epoch": 1.3071371291098637, + "grad_norm": 0.6146913170814514, + "learning_rate": 1.2543553740037546e-05, + "loss": 0.7618, + "step": 1630 + }, + { + "epoch": 1.3079390537289495, + "grad_norm": 0.6606637835502625, + "learning_rate": 1.2535178288409761e-05, + "loss": 0.7575, + "step": 1631 + }, + { + "epoch": 1.3087409783480353, + "grad_norm": 0.6702629327774048, + "learning_rate": 1.2526800935818956e-05, + "loss": 0.8049, + "step": 1632 + }, + { + "epoch": 1.309542902967121, + "grad_norm": 0.6716954112052917, + "learning_rate": 1.2518421688546757e-05, + "loss": 0.7453, + "step": 1633 + }, + { + "epoch": 1.3103448275862069, + "grad_norm": 0.6508925557136536, + "learning_rate": 1.2510040552876204e-05, + "loss": 0.7823, + "step": 1634 + }, + { + "epoch": 1.3111467522052926, + "grad_norm": 0.8851492404937744, + "learning_rate": 1.2501657535091765e-05, + "loss": 0.7895, + "step": 1635 + }, + { + "epoch": 1.3119486768243784, + "grad_norm": 0.648134171962738, + "learning_rate": 1.2493272641479311e-05, + "loss": 0.7168, + "step": 1636 + }, + { + "epoch": 1.3127506014434642, + "grad_norm": 0.6762018799781799, + "learning_rate": 1.2484885878326114e-05, + "loss": 0.7674, + "step": 1637 + }, + { + "epoch": 1.31355252606255, + "grad_norm": 0.6717413067817688, + "learning_rate": 1.247649725192086e-05, + "loss": 0.7295, + "step": 1638 + }, + { + "epoch": 1.3143544506816358, + "grad_norm": 0.7482190728187561, + "learning_rate": 1.246810676855363e-05, + "loss": 0.7754, + "step": 1639 + }, + { + "epoch": 1.3151563753007216, + "grad_norm": 0.6837365627288818, + "learning_rate": 1.2459714434515888e-05, + "loss": 0.7827, + "step": 1640 + }, + { + "epoch": 1.3159582999198076, + "grad_norm": 0.6977565288543701, + "learning_rate": 1.2451320256100497e-05, + "loss": 0.785, + "step": 1641 + }, + { + "epoch": 1.3167602245388934, + "grad_norm": 0.6693125367164612, + "learning_rate": 1.2442924239601692e-05, + "loss": 0.7275, + "step": 1642 + }, + { + "epoch": 1.3175621491579792, + "grad_norm": 0.7254882454872131, + "learning_rate": 1.2434526391315095e-05, + "loss": 0.8095, + "step": 1643 + }, + { + "epoch": 1.318364073777065, + "grad_norm": 0.6936510801315308, + "learning_rate": 1.2426126717537704e-05, + "loss": 0.7396, + "step": 1644 + }, + { + "epoch": 1.3191659983961508, + "grad_norm": 0.6720640063285828, + "learning_rate": 1.2417725224567872e-05, + "loss": 0.7378, + "step": 1645 + }, + { + "epoch": 1.3199679230152366, + "grad_norm": 0.655472457408905, + "learning_rate": 1.2409321918705329e-05, + "loss": 0.7372, + "step": 1646 + }, + { + "epoch": 1.3207698476343224, + "grad_norm": 0.7163864970207214, + "learning_rate": 1.2400916806251157e-05, + "loss": 0.7659, + "step": 1647 + }, + { + "epoch": 1.3215717722534082, + "grad_norm": 0.6562886238098145, + "learning_rate": 1.2392509893507799e-05, + "loss": 0.7724, + "step": 1648 + }, + { + "epoch": 1.322373696872494, + "grad_norm": 0.6730007529258728, + "learning_rate": 1.2384101186779042e-05, + "loss": 0.7505, + "step": 1649 + }, + { + "epoch": 1.3231756214915797, + "grad_norm": 0.7325595617294312, + "learning_rate": 1.2375690692370022e-05, + "loss": 0.784, + "step": 1650 + }, + { + "epoch": 1.3239775461106655, + "grad_norm": 0.6311853528022766, + "learning_rate": 1.2367278416587216e-05, + "loss": 0.7435, + "step": 1651 + }, + { + "epoch": 1.3247794707297513, + "grad_norm": 0.67484050989151, + "learning_rate": 1.235886436573843e-05, + "loss": 0.7545, + "step": 1652 + }, + { + "epoch": 1.3255813953488373, + "grad_norm": 0.7450650930404663, + "learning_rate": 1.235044854613281e-05, + "loss": 0.7657, + "step": 1653 + }, + { + "epoch": 1.3263833199679231, + "grad_norm": 0.6910774111747742, + "learning_rate": 1.2342030964080822e-05, + "loss": 0.7257, + "step": 1654 + }, + { + "epoch": 1.327185244587009, + "grad_norm": 0.6374717354774475, + "learning_rate": 1.2333611625894254e-05, + "loss": 0.7467, + "step": 1655 + }, + { + "epoch": 1.3279871692060947, + "grad_norm": 0.6719356775283813, + "learning_rate": 1.2325190537886221e-05, + "loss": 0.7808, + "step": 1656 + }, + { + "epoch": 1.3287890938251805, + "grad_norm": 0.7069136500358582, + "learning_rate": 1.231676770637113e-05, + "loss": 0.7416, + "step": 1657 + }, + { + "epoch": 1.3295910184442663, + "grad_norm": 0.6578108668327332, + "learning_rate": 1.2308343137664716e-05, + "loss": 0.7745, + "step": 1658 + }, + { + "epoch": 1.330392943063352, + "grad_norm": 0.684407114982605, + "learning_rate": 1.2299916838084001e-05, + "loss": 0.7411, + "step": 1659 + }, + { + "epoch": 1.3311948676824379, + "grad_norm": 0.6682063341140747, + "learning_rate": 1.2291488813947315e-05, + "loss": 0.7518, + "step": 1660 + }, + { + "epoch": 1.3319967923015237, + "grad_norm": 0.6518343091011047, + "learning_rate": 1.2283059071574278e-05, + "loss": 0.7582, + "step": 1661 + }, + { + "epoch": 1.3327987169206095, + "grad_norm": 0.6990170478820801, + "learning_rate": 1.2274627617285797e-05, + "loss": 0.7964, + "step": 1662 + }, + { + "epoch": 1.3336006415396953, + "grad_norm": 0.6665741205215454, + "learning_rate": 1.2266194457404061e-05, + "loss": 0.7903, + "step": 1663 + }, + { + "epoch": 1.334402566158781, + "grad_norm": 0.6582921743392944, + "learning_rate": 1.2257759598252543e-05, + "loss": 0.7422, + "step": 1664 + }, + { + "epoch": 1.3352044907778668, + "grad_norm": 0.6946974396705627, + "learning_rate": 1.224932304615599e-05, + "loss": 0.7956, + "step": 1665 + }, + { + "epoch": 1.3360064153969526, + "grad_norm": 0.6991271376609802, + "learning_rate": 1.2240884807440413e-05, + "loss": 0.772, + "step": 1666 + }, + { + "epoch": 1.3368083400160384, + "grad_norm": 0.6818379759788513, + "learning_rate": 1.223244488843309e-05, + "loss": 0.7572, + "step": 1667 + }, + { + "epoch": 1.3376102646351242, + "grad_norm": 0.6668140292167664, + "learning_rate": 1.2224003295462561e-05, + "loss": 0.736, + "step": 1668 + }, + { + "epoch": 1.33841218925421, + "grad_norm": 0.6249803304672241, + "learning_rate": 1.221556003485862e-05, + "loss": 0.7643, + "step": 1669 + }, + { + "epoch": 1.3392141138732958, + "grad_norm": 0.6772146821022034, + "learning_rate": 1.2207115112952313e-05, + "loss": 0.7568, + "step": 1670 + }, + { + "epoch": 1.3400160384923816, + "grad_norm": 0.6962553858757019, + "learning_rate": 1.2198668536075924e-05, + "loss": 0.8018, + "step": 1671 + }, + { + "epoch": 1.3408179631114674, + "grad_norm": 0.6322622299194336, + "learning_rate": 1.2190220310562992e-05, + "loss": 0.7296, + "step": 1672 + }, + { + "epoch": 1.3416198877305534, + "grad_norm": 0.6803750991821289, + "learning_rate": 1.218177044274828e-05, + "loss": 0.8024, + "step": 1673 + }, + { + "epoch": 1.3424218123496392, + "grad_norm": 0.6757524013519287, + "learning_rate": 1.217331893896779e-05, + "loss": 0.7697, + "step": 1674 + }, + { + "epoch": 1.343223736968725, + "grad_norm": 0.7142401337623596, + "learning_rate": 1.2164865805558738e-05, + "loss": 0.7728, + "step": 1675 + }, + { + "epoch": 1.3440256615878108, + "grad_norm": 0.631919801235199, + "learning_rate": 1.215641104885958e-05, + "loss": 0.7395, + "step": 1676 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 0.6723388433456421, + "learning_rate": 1.2147954675209982e-05, + "loss": 0.7703, + "step": 1677 + }, + { + "epoch": 1.3456295108259824, + "grad_norm": 0.654248833656311, + "learning_rate": 1.2139496690950813e-05, + "loss": 0.7829, + "step": 1678 + }, + { + "epoch": 1.3464314354450682, + "grad_norm": 0.6374508738517761, + "learning_rate": 1.2131037102424165e-05, + "loss": 0.7534, + "step": 1679 + }, + { + "epoch": 1.347233360064154, + "grad_norm": 0.6467137932777405, + "learning_rate": 1.2122575915973321e-05, + "loss": 0.7589, + "step": 1680 + }, + { + "epoch": 1.3480352846832397, + "grad_norm": 0.686490535736084, + "learning_rate": 1.2114113137942767e-05, + "loss": 0.7685, + "step": 1681 + }, + { + "epoch": 1.3488372093023255, + "grad_norm": 0.6674497127532959, + "learning_rate": 1.2105648774678188e-05, + "loss": 0.7302, + "step": 1682 + }, + { + "epoch": 1.3496391339214113, + "grad_norm": 0.6758518218994141, + "learning_rate": 1.2097182832526443e-05, + "loss": 0.7784, + "step": 1683 + }, + { + "epoch": 1.3504410585404971, + "grad_norm": 0.663431704044342, + "learning_rate": 1.2088715317835589e-05, + "loss": 0.745, + "step": 1684 + }, + { + "epoch": 1.3512429831595831, + "grad_norm": 0.6859851479530334, + "learning_rate": 1.2080246236954856e-05, + "loss": 0.8056, + "step": 1685 + }, + { + "epoch": 1.352044907778669, + "grad_norm": 0.6677384972572327, + "learning_rate": 1.2071775596234647e-05, + "loss": 0.7598, + "step": 1686 + }, + { + "epoch": 1.3528468323977547, + "grad_norm": 0.7199987769126892, + "learning_rate": 1.2063303402026545e-05, + "loss": 0.8167, + "step": 1687 + }, + { + "epoch": 1.3536487570168405, + "grad_norm": 0.6905441284179688, + "learning_rate": 1.2054829660683281e-05, + "loss": 0.7508, + "step": 1688 + }, + { + "epoch": 1.3544506816359263, + "grad_norm": 0.6765400171279907, + "learning_rate": 1.2046354378558753e-05, + "loss": 0.7208, + "step": 1689 + }, + { + "epoch": 1.355252606255012, + "grad_norm": 0.6659426093101501, + "learning_rate": 1.2037877562008025e-05, + "loss": 0.7675, + "step": 1690 + }, + { + "epoch": 1.3560545308740979, + "grad_norm": 0.6474359035491943, + "learning_rate": 1.2029399217387299e-05, + "loss": 0.753, + "step": 1691 + }, + { + "epoch": 1.3568564554931837, + "grad_norm": 0.7105376124382019, + "learning_rate": 1.2020919351053927e-05, + "loss": 0.7842, + "step": 1692 + }, + { + "epoch": 1.3576583801122695, + "grad_norm": 0.6748633980751038, + "learning_rate": 1.2012437969366397e-05, + "loss": 0.7136, + "step": 1693 + }, + { + "epoch": 1.3584603047313553, + "grad_norm": 0.6597528457641602, + "learning_rate": 1.2003955078684344e-05, + "loss": 0.7817, + "step": 1694 + }, + { + "epoch": 1.359262229350441, + "grad_norm": 0.6678489446640015, + "learning_rate": 1.1995470685368527e-05, + "loss": 0.7433, + "step": 1695 + }, + { + "epoch": 1.3600641539695268, + "grad_norm": 0.7042926549911499, + "learning_rate": 1.1986984795780829e-05, + "loss": 0.7635, + "step": 1696 + }, + { + "epoch": 1.3608660785886126, + "grad_norm": 0.6309202313423157, + "learning_rate": 1.1978497416284265e-05, + "loss": 0.703, + "step": 1697 + }, + { + "epoch": 1.3616680032076984, + "grad_norm": 0.7144943475723267, + "learning_rate": 1.1970008553242955e-05, + "loss": 0.7649, + "step": 1698 + }, + { + "epoch": 1.3624699278267842, + "grad_norm": 0.64893639087677, + "learning_rate": 1.196151821302214e-05, + "loss": 0.7039, + "step": 1699 + }, + { + "epoch": 1.36327185244587, + "grad_norm": 0.8356136083602905, + "learning_rate": 1.1953026401988172e-05, + "loss": 0.7479, + "step": 1700 + }, + { + "epoch": 1.3640737770649558, + "grad_norm": 0.6598641276359558, + "learning_rate": 1.1944533126508491e-05, + "loss": 0.7319, + "step": 1701 + }, + { + "epoch": 1.3648757016840416, + "grad_norm": 0.6679075360298157, + "learning_rate": 1.193603839295165e-05, + "loss": 0.7221, + "step": 1702 + }, + { + "epoch": 1.3656776263031274, + "grad_norm": 0.6554751396179199, + "learning_rate": 1.1927542207687287e-05, + "loss": 0.7633, + "step": 1703 + }, + { + "epoch": 1.3664795509222132, + "grad_norm": 0.6698046922683716, + "learning_rate": 1.1919044577086135e-05, + "loss": 0.7493, + "step": 1704 + }, + { + "epoch": 1.3672814755412992, + "grad_norm": 0.7508684396743774, + "learning_rate": 1.191054550752e-05, + "loss": 0.7613, + "step": 1705 + }, + { + "epoch": 1.368083400160385, + "grad_norm": 0.6737611293792725, + "learning_rate": 1.1902045005361775e-05, + "loss": 0.7632, + "step": 1706 + }, + { + "epoch": 1.3688853247794708, + "grad_norm": 0.6801590919494629, + "learning_rate": 1.1893543076985434e-05, + "loss": 0.7383, + "step": 1707 + }, + { + "epoch": 1.3696872493985566, + "grad_norm": 0.6327788233757019, + "learning_rate": 1.1885039728766006e-05, + "loss": 0.7311, + "step": 1708 + }, + { + "epoch": 1.3704891740176424, + "grad_norm": 0.6967527270317078, + "learning_rate": 1.187653496707959e-05, + "loss": 0.7488, + "step": 1709 + }, + { + "epoch": 1.3712910986367282, + "grad_norm": 0.6776365041732788, + "learning_rate": 1.1868028798303346e-05, + "loss": 0.7433, + "step": 1710 + }, + { + "epoch": 1.372093023255814, + "grad_norm": 0.6518088579177856, + "learning_rate": 1.1859521228815495e-05, + "loss": 0.7492, + "step": 1711 + }, + { + "epoch": 1.3728949478748997, + "grad_norm": 0.7493876814842224, + "learning_rate": 1.1851012264995296e-05, + "loss": 0.7494, + "step": 1712 + }, + { + "epoch": 1.3736968724939855, + "grad_norm": 0.6971762180328369, + "learning_rate": 1.1842501913223066e-05, + "loss": 0.7581, + "step": 1713 + }, + { + "epoch": 1.3744987971130713, + "grad_norm": 0.6640205979347229, + "learning_rate": 1.1833990179880148e-05, + "loss": 0.7684, + "step": 1714 + }, + { + "epoch": 1.3753007217321571, + "grad_norm": 0.6524538397789001, + "learning_rate": 1.1825477071348937e-05, + "loss": 0.7265, + "step": 1715 + }, + { + "epoch": 1.376102646351243, + "grad_norm": 0.7143647074699402, + "learning_rate": 1.1816962594012849e-05, + "loss": 0.806, + "step": 1716 + }, + { + "epoch": 1.376904570970329, + "grad_norm": 0.605280876159668, + "learning_rate": 1.1808446754256329e-05, + "loss": 0.7126, + "step": 1717 + }, + { + "epoch": 1.3777064955894147, + "grad_norm": 0.6456320881843567, + "learning_rate": 1.1799929558464843e-05, + "loss": 0.7651, + "step": 1718 + }, + { + "epoch": 1.3785084202085005, + "grad_norm": 0.7090603709220886, + "learning_rate": 1.1791411013024873e-05, + "loss": 0.804, + "step": 1719 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.6659294962882996, + "learning_rate": 1.178289112432392e-05, + "loss": 0.7683, + "step": 1720 + }, + { + "epoch": 1.380112269446672, + "grad_norm": 0.6606502532958984, + "learning_rate": 1.1774369898750484e-05, + "loss": 0.7351, + "step": 1721 + }, + { + "epoch": 1.3809141940657579, + "grad_norm": 0.6632405519485474, + "learning_rate": 1.176584734269407e-05, + "loss": 0.7632, + "step": 1722 + }, + { + "epoch": 1.3817161186848437, + "grad_norm": 0.6848816275596619, + "learning_rate": 1.1757323462545177e-05, + "loss": 0.7277, + "step": 1723 + }, + { + "epoch": 1.3825180433039295, + "grad_norm": 0.6847621202468872, + "learning_rate": 1.1748798264695305e-05, + "loss": 0.7468, + "step": 1724 + }, + { + "epoch": 1.3833199679230153, + "grad_norm": 0.7737583518028259, + "learning_rate": 1.1740271755536939e-05, + "loss": 0.7894, + "step": 1725 + }, + { + "epoch": 1.384121892542101, + "grad_norm": 0.6926838755607605, + "learning_rate": 1.173174394146354e-05, + "loss": 0.7865, + "step": 1726 + }, + { + "epoch": 1.3849238171611868, + "grad_norm": 0.7265371680259705, + "learning_rate": 1.172321482886956e-05, + "loss": 0.7498, + "step": 1727 + }, + { + "epoch": 1.3857257417802726, + "grad_norm": 0.6517270803451538, + "learning_rate": 1.1714684424150413e-05, + "loss": 0.7666, + "step": 1728 + }, + { + "epoch": 1.3865276663993584, + "grad_norm": 0.7151353359222412, + "learning_rate": 1.1706152733702489e-05, + "loss": 0.7528, + "step": 1729 + }, + { + "epoch": 1.3873295910184442, + "grad_norm": 0.6981160044670105, + "learning_rate": 1.1697619763923143e-05, + "loss": 0.7428, + "step": 1730 + }, + { + "epoch": 1.38813151563753, + "grad_norm": 0.7034778594970703, + "learning_rate": 1.168908552121068e-05, + "loss": 0.7919, + "step": 1731 + }, + { + "epoch": 1.3889334402566158, + "grad_norm": 0.6916285753250122, + "learning_rate": 1.1680550011964374e-05, + "loss": 0.7575, + "step": 1732 + }, + { + "epoch": 1.3897353648757016, + "grad_norm": 0.660354495048523, + "learning_rate": 1.167201324258443e-05, + "loss": 0.7571, + "step": 1733 + }, + { + "epoch": 1.3905372894947874, + "grad_norm": 0.7056530117988586, + "learning_rate": 1.166347521947202e-05, + "loss": 0.7792, + "step": 1734 + }, + { + "epoch": 1.3913392141138732, + "grad_norm": 0.6491231918334961, + "learning_rate": 1.1654935949029234e-05, + "loss": 0.725, + "step": 1735 + }, + { + "epoch": 1.392141138732959, + "grad_norm": 0.681361198425293, + "learning_rate": 1.1646395437659112e-05, + "loss": 0.791, + "step": 1736 + }, + { + "epoch": 1.392943063352045, + "grad_norm": 0.6471366286277771, + "learning_rate": 1.1637853691765625e-05, + "loss": 0.769, + "step": 1737 + }, + { + "epoch": 1.3937449879711308, + "grad_norm": 0.7128544449806213, + "learning_rate": 1.162931071775366e-05, + "loss": 0.7581, + "step": 1738 + }, + { + "epoch": 1.3945469125902166, + "grad_norm": 0.6705619692802429, + "learning_rate": 1.162076652202903e-05, + "loss": 0.7889, + "step": 1739 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 0.642647922039032, + "learning_rate": 1.1612221110998463e-05, + "loss": 0.7282, + "step": 1740 + }, + { + "epoch": 1.3961507618283882, + "grad_norm": 0.6899245381355286, + "learning_rate": 1.16036744910696e-05, + "loss": 0.7476, + "step": 1741 + }, + { + "epoch": 1.396952686447474, + "grad_norm": 0.6138514876365662, + "learning_rate": 1.1595126668650993e-05, + "loss": 0.7197, + "step": 1742 + }, + { + "epoch": 1.3977546110665597, + "grad_norm": 0.7220246195793152, + "learning_rate": 1.1586577650152084e-05, + "loss": 0.7255, + "step": 1743 + }, + { + "epoch": 1.3985565356856455, + "grad_norm": 0.6729558706283569, + "learning_rate": 1.1578027441983219e-05, + "loss": 0.7612, + "step": 1744 + }, + { + "epoch": 1.3993584603047313, + "grad_norm": 0.6875990629196167, + "learning_rate": 1.1569476050555637e-05, + "loss": 0.7713, + "step": 1745 + }, + { + "epoch": 1.4001603849238171, + "grad_norm": 0.6855756044387817, + "learning_rate": 1.156092348228146e-05, + "loss": 0.7852, + "step": 1746 + }, + { + "epoch": 1.400962309542903, + "grad_norm": 0.666982114315033, + "learning_rate": 1.1552369743573699e-05, + "loss": 0.7576, + "step": 1747 + }, + { + "epoch": 1.4017642341619887, + "grad_norm": 0.6429185271263123, + "learning_rate": 1.1543814840846237e-05, + "loss": 0.7467, + "step": 1748 + }, + { + "epoch": 1.4025661587810747, + "grad_norm": 0.6782676577568054, + "learning_rate": 1.153525878051383e-05, + "loss": 0.7397, + "step": 1749 + }, + { + "epoch": 1.4033680834001605, + "grad_norm": 0.6263085007667542, + "learning_rate": 1.1526701568992102e-05, + "loss": 0.7627, + "step": 1750 + }, + { + "epoch": 1.4041700080192463, + "grad_norm": 0.7163293361663818, + "learning_rate": 1.1518143212697547e-05, + "loss": 0.7476, + "step": 1751 + }, + { + "epoch": 1.404971932638332, + "grad_norm": 0.6758559942245483, + "learning_rate": 1.1509583718047508e-05, + "loss": 0.7012, + "step": 1752 + }, + { + "epoch": 1.4057738572574179, + "grad_norm": 0.6683815717697144, + "learning_rate": 1.1501023091460187e-05, + "loss": 0.7344, + "step": 1753 + }, + { + "epoch": 1.4065757818765037, + "grad_norm": 0.7088032364845276, + "learning_rate": 1.149246133935463e-05, + "loss": 0.7192, + "step": 1754 + }, + { + "epoch": 1.4073777064955895, + "grad_norm": 0.6811531782150269, + "learning_rate": 1.1483898468150736e-05, + "loss": 0.7312, + "step": 1755 + }, + { + "epoch": 1.4081796311146753, + "grad_norm": 0.6816032528877258, + "learning_rate": 1.1475334484269234e-05, + "loss": 0.7249, + "step": 1756 + }, + { + "epoch": 1.408981555733761, + "grad_norm": 0.6524032354354858, + "learning_rate": 1.146676939413169e-05, + "loss": 0.7065, + "step": 1757 + }, + { + "epoch": 1.4097834803528468, + "grad_norm": 0.6719584465026855, + "learning_rate": 1.1458203204160503e-05, + "loss": 0.7836, + "step": 1758 + }, + { + "epoch": 1.4105854049719326, + "grad_norm": 0.6611328721046448, + "learning_rate": 1.1449635920778894e-05, + "loss": 0.7608, + "step": 1759 + }, + { + "epoch": 1.4113873295910184, + "grad_norm": 0.6723156571388245, + "learning_rate": 1.14410675504109e-05, + "loss": 0.7607, + "step": 1760 + }, + { + "epoch": 1.4121892542101042, + "grad_norm": 0.7079909443855286, + "learning_rate": 1.143249809948138e-05, + "loss": 0.7835, + "step": 1761 + }, + { + "epoch": 1.41299117882919, + "grad_norm": 0.6389018893241882, + "learning_rate": 1.1423927574415998e-05, + "loss": 0.7477, + "step": 1762 + }, + { + "epoch": 1.4137931034482758, + "grad_norm": 0.7065446972846985, + "learning_rate": 1.1415355981641229e-05, + "loss": 0.7811, + "step": 1763 + }, + { + "epoch": 1.4145950280673616, + "grad_norm": 0.6564798951148987, + "learning_rate": 1.1406783327584345e-05, + "loss": 0.7824, + "step": 1764 + }, + { + "epoch": 1.4153969526864474, + "grad_norm": 0.6832185983657837, + "learning_rate": 1.139820961867341e-05, + "loss": 0.7373, + "step": 1765 + }, + { + "epoch": 1.4161988773055332, + "grad_norm": 0.6919088363647461, + "learning_rate": 1.1389634861337284e-05, + "loss": 0.7716, + "step": 1766 + }, + { + "epoch": 1.417000801924619, + "grad_norm": 0.6253665685653687, + "learning_rate": 1.1381059062005617e-05, + "loss": 0.757, + "step": 1767 + }, + { + "epoch": 1.4178027265437048, + "grad_norm": 0.7172982692718506, + "learning_rate": 1.137248222710883e-05, + "loss": 0.7826, + "step": 1768 + }, + { + "epoch": 1.4186046511627908, + "grad_norm": 0.6555379629135132, + "learning_rate": 1.1363904363078126e-05, + "loss": 0.7525, + "step": 1769 + }, + { + "epoch": 1.4194065757818766, + "grad_norm": 0.6733851432800293, + "learning_rate": 1.135532547634548e-05, + "loss": 0.7448, + "step": 1770 + }, + { + "epoch": 1.4202085004009624, + "grad_norm": 0.6609330177307129, + "learning_rate": 1.1346745573343636e-05, + "loss": 0.7659, + "step": 1771 + }, + { + "epoch": 1.4210104250200482, + "grad_norm": 0.6952093839645386, + "learning_rate": 1.13381646605061e-05, + "loss": 0.7557, + "step": 1772 + }, + { + "epoch": 1.421812349639134, + "grad_norm": 0.7016155123710632, + "learning_rate": 1.1329582744267125e-05, + "loss": 0.7528, + "step": 1773 + }, + { + "epoch": 1.4226142742582197, + "grad_norm": 0.6713129281997681, + "learning_rate": 1.1320999831061727e-05, + "loss": 0.755, + "step": 1774 + }, + { + "epoch": 1.4234161988773055, + "grad_norm": 0.6963139176368713, + "learning_rate": 1.1312415927325668e-05, + "loss": 0.7445, + "step": 1775 + }, + { + "epoch": 1.4242181234963913, + "grad_norm": 0.6338862180709839, + "learning_rate": 1.1303831039495452e-05, + "loss": 0.7368, + "step": 1776 + }, + { + "epoch": 1.4250200481154771, + "grad_norm": 0.6802634596824646, + "learning_rate": 1.1295245174008317e-05, + "loss": 0.7628, + "step": 1777 + }, + { + "epoch": 1.425821972734563, + "grad_norm": 0.6717724204063416, + "learning_rate": 1.1286658337302243e-05, + "loss": 0.7874, + "step": 1778 + }, + { + "epoch": 1.4266238973536487, + "grad_norm": 0.6719787120819092, + "learning_rate": 1.1278070535815927e-05, + "loss": 0.7606, + "step": 1779 + }, + { + "epoch": 1.4274258219727345, + "grad_norm": 0.6636870503425598, + "learning_rate": 1.1269481775988793e-05, + "loss": 0.7203, + "step": 1780 + }, + { + "epoch": 1.4282277465918205, + "grad_norm": 0.6615099310874939, + "learning_rate": 1.1260892064260995e-05, + "loss": 0.7462, + "step": 1781 + }, + { + "epoch": 1.4290296712109063, + "grad_norm": 0.703061044216156, + "learning_rate": 1.1252301407073386e-05, + "loss": 0.7762, + "step": 1782 + }, + { + "epoch": 1.429831595829992, + "grad_norm": 0.743383526802063, + "learning_rate": 1.124370981086753e-05, + "loss": 0.7658, + "step": 1783 + }, + { + "epoch": 1.4306335204490779, + "grad_norm": 0.6267064809799194, + "learning_rate": 1.1235117282085704e-05, + "loss": 0.7852, + "step": 1784 + }, + { + "epoch": 1.4314354450681637, + "grad_norm": 0.6929275393486023, + "learning_rate": 1.1226523827170876e-05, + "loss": 0.7605, + "step": 1785 + }, + { + "epoch": 1.4322373696872495, + "grad_norm": 0.6413291692733765, + "learning_rate": 1.121792945256671e-05, + "loss": 0.7657, + "step": 1786 + }, + { + "epoch": 1.4330392943063353, + "grad_norm": 0.6685841083526611, + "learning_rate": 1.1209334164717562e-05, + "loss": 0.7738, + "step": 1787 + }, + { + "epoch": 1.433841218925421, + "grad_norm": 0.697259247303009, + "learning_rate": 1.1200737970068476e-05, + "loss": 0.7591, + "step": 1788 + }, + { + "epoch": 1.4346431435445068, + "grad_norm": 0.6685236096382141, + "learning_rate": 1.1192140875065167e-05, + "loss": 0.7636, + "step": 1789 + }, + { + "epoch": 1.4354450681635926, + "grad_norm": 0.6757694482803345, + "learning_rate": 1.1183542886154027e-05, + "loss": 0.7549, + "step": 1790 + }, + { + "epoch": 1.4362469927826784, + "grad_norm": 0.6777142286300659, + "learning_rate": 1.1174944009782123e-05, + "loss": 0.7774, + "step": 1791 + }, + { + "epoch": 1.4370489174017642, + "grad_norm": 0.7203055620193481, + "learning_rate": 1.1166344252397187e-05, + "loss": 0.7841, + "step": 1792 + }, + { + "epoch": 1.43785084202085, + "grad_norm": 0.6814801096916199, + "learning_rate": 1.1157743620447611e-05, + "loss": 0.7389, + "step": 1793 + }, + { + "epoch": 1.4386527666399358, + "grad_norm": 0.6721707582473755, + "learning_rate": 1.1149142120382443e-05, + "loss": 0.7395, + "step": 1794 + }, + { + "epoch": 1.4394546912590216, + "grad_norm": 0.6581818461418152, + "learning_rate": 1.1140539758651372e-05, + "loss": 0.7273, + "step": 1795 + }, + { + "epoch": 1.4402566158781074, + "grad_norm": 0.6775161027908325, + "learning_rate": 1.1131936541704749e-05, + "loss": 0.7649, + "step": 1796 + }, + { + "epoch": 1.4410585404971932, + "grad_norm": 0.6994383931159973, + "learning_rate": 1.112333247599356e-05, + "loss": 0.766, + "step": 1797 + }, + { + "epoch": 1.441860465116279, + "grad_norm": 0.6743654012680054, + "learning_rate": 1.1114727567969423e-05, + "loss": 0.7642, + "step": 1798 + }, + { + "epoch": 1.4426623897353648, + "grad_norm": 0.6504762172698975, + "learning_rate": 1.1106121824084593e-05, + "loss": 0.758, + "step": 1799 + }, + { + "epoch": 1.4434643143544506, + "grad_norm": 0.6630826592445374, + "learning_rate": 1.1097515250791945e-05, + "loss": 0.7632, + "step": 1800 + }, + { + "epoch": 1.4442662389735366, + "grad_norm": 0.6329621076583862, + "learning_rate": 1.1088907854544985e-05, + "loss": 0.7214, + "step": 1801 + }, + { + "epoch": 1.4450681635926224, + "grad_norm": 0.6646215319633484, + "learning_rate": 1.1080299641797837e-05, + "loss": 0.732, + "step": 1802 + }, + { + "epoch": 1.4458700882117081, + "grad_norm": 0.6855160593986511, + "learning_rate": 1.1071690619005224e-05, + "loss": 0.7722, + "step": 1803 + }, + { + "epoch": 1.446672012830794, + "grad_norm": 0.7373548746109009, + "learning_rate": 1.1063080792622484e-05, + "loss": 0.7716, + "step": 1804 + }, + { + "epoch": 1.4474739374498797, + "grad_norm": 0.6910948753356934, + "learning_rate": 1.1054470169105564e-05, + "loss": 0.7635, + "step": 1805 + }, + { + "epoch": 1.4482758620689655, + "grad_norm": 0.6741712093353271, + "learning_rate": 1.1045858754911001e-05, + "loss": 0.7724, + "step": 1806 + }, + { + "epoch": 1.4490777866880513, + "grad_norm": 0.6909212470054626, + "learning_rate": 1.1037246556495922e-05, + "loss": 0.7664, + "step": 1807 + }, + { + "epoch": 1.449879711307137, + "grad_norm": 0.7179321050643921, + "learning_rate": 1.1028633580318056e-05, + "loss": 0.7787, + "step": 1808 + }, + { + "epoch": 1.450681635926223, + "grad_norm": 0.6822714805603027, + "learning_rate": 1.1020019832835694e-05, + "loss": 0.7634, + "step": 1809 + }, + { + "epoch": 1.4514835605453087, + "grad_norm": 0.680316686630249, + "learning_rate": 1.1011405320507726e-05, + "loss": 0.7614, + "step": 1810 + }, + { + "epoch": 1.4522854851643945, + "grad_norm": 0.6558269262313843, + "learning_rate": 1.1002790049793604e-05, + "loss": 0.6952, + "step": 1811 + }, + { + "epoch": 1.4530874097834803, + "grad_norm": 0.6913748979568481, + "learning_rate": 1.099417402715335e-05, + "loss": 0.7696, + "step": 1812 + }, + { + "epoch": 1.4538893344025663, + "grad_norm": 0.6790192723274231, + "learning_rate": 1.0985557259047557e-05, + "loss": 0.7428, + "step": 1813 + }, + { + "epoch": 1.454691259021652, + "grad_norm": 0.6557827591896057, + "learning_rate": 1.0976939751937361e-05, + "loss": 0.7443, + "step": 1814 + }, + { + "epoch": 1.4554931836407379, + "grad_norm": 0.6664519309997559, + "learning_rate": 1.0968321512284472e-05, + "loss": 0.7227, + "step": 1815 + }, + { + "epoch": 1.4562951082598237, + "grad_norm": 0.6873872876167297, + "learning_rate": 1.0959702546551135e-05, + "loss": 0.7558, + "step": 1816 + }, + { + "epoch": 1.4570970328789095, + "grad_norm": 0.6596401333808899, + "learning_rate": 1.0951082861200142e-05, + "loss": 0.7435, + "step": 1817 + }, + { + "epoch": 1.4578989574979953, + "grad_norm": 0.6788073182106018, + "learning_rate": 1.0942462462694834e-05, + "loss": 0.8009, + "step": 1818 + }, + { + "epoch": 1.458700882117081, + "grad_norm": 0.7057682871818542, + "learning_rate": 1.0933841357499074e-05, + "loss": 0.7332, + "step": 1819 + }, + { + "epoch": 1.4595028067361668, + "grad_norm": 0.6896910071372986, + "learning_rate": 1.0925219552077258e-05, + "loss": 0.7549, + "step": 1820 + }, + { + "epoch": 1.4603047313552526, + "grad_norm": 0.6752651333808899, + "learning_rate": 1.091659705289431e-05, + "loss": 0.7439, + "step": 1821 + }, + { + "epoch": 1.4611066559743384, + "grad_norm": 0.7121813297271729, + "learning_rate": 1.090797386641568e-05, + "loss": 0.7304, + "step": 1822 + }, + { + "epoch": 1.4619085805934242, + "grad_norm": 0.7077644467353821, + "learning_rate": 1.0899349999107325e-05, + "loss": 0.7529, + "step": 1823 + }, + { + "epoch": 1.46271050521251, + "grad_norm": 0.6643130779266357, + "learning_rate": 1.089072545743571e-05, + "loss": 0.7544, + "step": 1824 + }, + { + "epoch": 1.4635124298315958, + "grad_norm": 0.6911596655845642, + "learning_rate": 1.088210024786781e-05, + "loss": 0.76, + "step": 1825 + }, + { + "epoch": 1.4643143544506816, + "grad_norm": 0.6701585054397583, + "learning_rate": 1.0873474376871105e-05, + "loss": 0.7368, + "step": 1826 + }, + { + "epoch": 1.4651162790697674, + "grad_norm": 0.7087423801422119, + "learning_rate": 1.0864847850913568e-05, + "loss": 0.7703, + "step": 1827 + }, + { + "epoch": 1.4659182036888532, + "grad_norm": 0.7176198363304138, + "learning_rate": 1.0856220676463654e-05, + "loss": 0.812, + "step": 1828 + }, + { + "epoch": 1.466720128307939, + "grad_norm": 0.6819745302200317, + "learning_rate": 1.084759285999032e-05, + "loss": 0.792, + "step": 1829 + }, + { + "epoch": 1.4675220529270248, + "grad_norm": 0.7085966467857361, + "learning_rate": 1.0838964407962993e-05, + "loss": 0.7629, + "step": 1830 + }, + { + "epoch": 1.4683239775461105, + "grad_norm": 0.6867752075195312, + "learning_rate": 1.0830335326851577e-05, + "loss": 0.7573, + "step": 1831 + }, + { + "epoch": 1.4691259021651963, + "grad_norm": 0.665777325630188, + "learning_rate": 1.0821705623126461e-05, + "loss": 0.7561, + "step": 1832 + }, + { + "epoch": 1.4699278267842824, + "grad_norm": 0.7135064601898193, + "learning_rate": 1.0813075303258483e-05, + "loss": 0.7441, + "step": 1833 + }, + { + "epoch": 1.4707297514033681, + "grad_norm": 0.6736664772033691, + "learning_rate": 1.0804444373718952e-05, + "loss": 0.7223, + "step": 1834 + }, + { + "epoch": 1.471531676022454, + "grad_norm": 0.6878182888031006, + "learning_rate": 1.0795812840979632e-05, + "loss": 0.728, + "step": 1835 + }, + { + "epoch": 1.4723336006415397, + "grad_norm": 0.6910241842269897, + "learning_rate": 1.0787180711512744e-05, + "loss": 0.7475, + "step": 1836 + }, + { + "epoch": 1.4731355252606255, + "grad_norm": 0.739133894443512, + "learning_rate": 1.0778547991790946e-05, + "loss": 0.7601, + "step": 1837 + }, + { + "epoch": 1.4739374498797113, + "grad_norm": 0.7009455561637878, + "learning_rate": 1.076991468828735e-05, + "loss": 0.7574, + "step": 1838 + }, + { + "epoch": 1.474739374498797, + "grad_norm": 0.7256219983100891, + "learning_rate": 1.0761280807475504e-05, + "loss": 0.7775, + "step": 1839 + }, + { + "epoch": 1.475541299117883, + "grad_norm": 0.7107866406440735, + "learning_rate": 1.0752646355829382e-05, + "loss": 0.7355, + "step": 1840 + }, + { + "epoch": 1.4763432237369687, + "grad_norm": 0.7084487676620483, + "learning_rate": 1.0744011339823389e-05, + "loss": 0.7747, + "step": 1841 + }, + { + "epoch": 1.4771451483560545, + "grad_norm": 0.6767612099647522, + "learning_rate": 1.0735375765932352e-05, + "loss": 0.7539, + "step": 1842 + }, + { + "epoch": 1.4779470729751403, + "grad_norm": 0.7070626616477966, + "learning_rate": 1.0726739640631523e-05, + "loss": 0.788, + "step": 1843 + }, + { + "epoch": 1.478748997594226, + "grad_norm": 0.6804197430610657, + "learning_rate": 1.0718102970396564e-05, + "loss": 0.7404, + "step": 1844 + }, + { + "epoch": 1.479550922213312, + "grad_norm": 0.6973698139190674, + "learning_rate": 1.0709465761703542e-05, + "loss": 0.7441, + "step": 1845 + }, + { + "epoch": 1.4803528468323979, + "grad_norm": 0.6551907658576965, + "learning_rate": 1.0700828021028929e-05, + "loss": 0.7265, + "step": 1846 + }, + { + "epoch": 1.4811547714514837, + "grad_norm": 0.6486260890960693, + "learning_rate": 1.0692189754849595e-05, + "loss": 0.736, + "step": 1847 + }, + { + "epoch": 1.4819566960705695, + "grad_norm": 0.601466715335846, + "learning_rate": 1.0683550969642813e-05, + "loss": 0.6997, + "step": 1848 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 0.6700170636177063, + "learning_rate": 1.0674911671886236e-05, + "loss": 0.7438, + "step": 1849 + }, + { + "epoch": 1.483560545308741, + "grad_norm": 0.6908175349235535, + "learning_rate": 1.06662718680579e-05, + "loss": 0.7422, + "step": 1850 + }, + { + "epoch": 1.4843624699278268, + "grad_norm": 0.7098249197006226, + "learning_rate": 1.0657631564636226e-05, + "loss": 0.8039, + "step": 1851 + }, + { + "epoch": 1.4851643945469126, + "grad_norm": 0.7431460022926331, + "learning_rate": 1.0648990768100009e-05, + "loss": 0.7891, + "step": 1852 + }, + { + "epoch": 1.4859663191659984, + "grad_norm": 0.6714998483657837, + "learning_rate": 1.0640349484928413e-05, + "loss": 0.742, + "step": 1853 + }, + { + "epoch": 1.4867682437850842, + "grad_norm": 0.6916069984436035, + "learning_rate": 1.0631707721600965e-05, + "loss": 0.7708, + "step": 1854 + }, + { + "epoch": 1.48757016840417, + "grad_norm": 0.7198412418365479, + "learning_rate": 1.0623065484597555e-05, + "loss": 0.7498, + "step": 1855 + }, + { + "epoch": 1.4883720930232558, + "grad_norm": 0.6497362852096558, + "learning_rate": 1.0614422780398422e-05, + "loss": 0.7526, + "step": 1856 + }, + { + "epoch": 1.4891740176423416, + "grad_norm": 0.6391859650611877, + "learning_rate": 1.0605779615484167e-05, + "loss": 0.737, + "step": 1857 + }, + { + "epoch": 1.4899759422614274, + "grad_norm": 0.6709389090538025, + "learning_rate": 1.0597135996335723e-05, + "loss": 0.7439, + "step": 1858 + }, + { + "epoch": 1.4907778668805132, + "grad_norm": 0.7073272466659546, + "learning_rate": 1.0588491929434375e-05, + "loss": 0.7798, + "step": 1859 + }, + { + "epoch": 1.491579791499599, + "grad_norm": 0.6603201031684875, + "learning_rate": 1.0579847421261733e-05, + "loss": 0.7422, + "step": 1860 + }, + { + "epoch": 1.4923817161186848, + "grad_norm": 0.6485406160354614, + "learning_rate": 1.057120247829975e-05, + "loss": 0.7667, + "step": 1861 + }, + { + "epoch": 1.4931836407377705, + "grad_norm": 0.648036777973175, + "learning_rate": 1.0562557107030695e-05, + "loss": 0.7603, + "step": 1862 + }, + { + "epoch": 1.4939855653568563, + "grad_norm": 0.7099272012710571, + "learning_rate": 1.0553911313937162e-05, + "loss": 0.7725, + "step": 1863 + }, + { + "epoch": 1.4947874899759421, + "grad_norm": 0.7114027142524719, + "learning_rate": 1.0545265105502065e-05, + "loss": 0.7704, + "step": 1864 + }, + { + "epoch": 1.4955894145950281, + "grad_norm": 0.6591110825538635, + "learning_rate": 1.053661848820862e-05, + "loss": 0.733, + "step": 1865 + }, + { + "epoch": 1.496391339214114, + "grad_norm": 0.7078248262405396, + "learning_rate": 1.0527971468540356e-05, + "loss": 0.7702, + "step": 1866 + }, + { + "epoch": 1.4971932638331997, + "grad_norm": 0.6885595917701721, + "learning_rate": 1.0519324052981103e-05, + "loss": 0.7377, + "step": 1867 + }, + { + "epoch": 1.4979951884522855, + "grad_norm": 0.681225061416626, + "learning_rate": 1.0510676248014991e-05, + "loss": 0.7427, + "step": 1868 + }, + { + "epoch": 1.4987971130713713, + "grad_norm": 0.6760066747665405, + "learning_rate": 1.050202806012644e-05, + "loss": 0.7611, + "step": 1869 + }, + { + "epoch": 1.499599037690457, + "grad_norm": 0.6731633543968201, + "learning_rate": 1.0493379495800149e-05, + "loss": 0.7486, + "step": 1870 + }, + { + "epoch": 1.500400962309543, + "grad_norm": 0.6760236620903015, + "learning_rate": 1.0484730561521107e-05, + "loss": 0.7713, + "step": 1871 + }, + { + "epoch": 1.5012028869286287, + "grad_norm": 0.6664961576461792, + "learning_rate": 1.0476081263774585e-05, + "loss": 0.7235, + "step": 1872 + }, + { + "epoch": 1.5020048115477145, + "grad_norm": 0.6721197366714478, + "learning_rate": 1.0467431609046116e-05, + "loss": 0.7431, + "step": 1873 + }, + { + "epoch": 1.5028067361668003, + "grad_norm": 0.6789388060569763, + "learning_rate": 1.0458781603821508e-05, + "loss": 0.7797, + "step": 1874 + }, + { + "epoch": 1.5036086607858863, + "grad_norm": 0.7579322457313538, + "learning_rate": 1.045013125458683e-05, + "loss": 0.7856, + "step": 1875 + }, + { + "epoch": 1.504410585404972, + "grad_norm": 0.6685303449630737, + "learning_rate": 1.0441480567828408e-05, + "loss": 0.7576, + "step": 1876 + }, + { + "epoch": 1.5052125100240579, + "grad_norm": 0.6478756666183472, + "learning_rate": 1.0432829550032818e-05, + "loss": 0.7651, + "step": 1877 + }, + { + "epoch": 1.5060144346431437, + "grad_norm": 0.6975502967834473, + "learning_rate": 1.0424178207686894e-05, + "loss": 0.7516, + "step": 1878 + }, + { + "epoch": 1.5068163592622295, + "grad_norm": 0.6927207708358765, + "learning_rate": 1.0415526547277706e-05, + "loss": 0.767, + "step": 1879 + }, + { + "epoch": 1.5076182838813152, + "grad_norm": 0.7247032523155212, + "learning_rate": 1.0406874575292558e-05, + "loss": 0.7695, + "step": 1880 + }, + { + "epoch": 1.508420208500401, + "grad_norm": 0.6950458288192749, + "learning_rate": 1.0398222298218996e-05, + "loss": 0.7968, + "step": 1881 + }, + { + "epoch": 1.5092221331194868, + "grad_norm": 0.660862386226654, + "learning_rate": 1.0389569722544794e-05, + "loss": 0.7443, + "step": 1882 + }, + { + "epoch": 1.5100240577385726, + "grad_norm": 0.6698028445243835, + "learning_rate": 1.0380916854757948e-05, + "loss": 0.7537, + "step": 1883 + }, + { + "epoch": 1.5108259823576584, + "grad_norm": 0.6830818057060242, + "learning_rate": 1.0372263701346671e-05, + "loss": 0.7432, + "step": 1884 + }, + { + "epoch": 1.5116279069767442, + "grad_norm": 0.7310096025466919, + "learning_rate": 1.0363610268799393e-05, + "loss": 0.747, + "step": 1885 + }, + { + "epoch": 1.51242983159583, + "grad_norm": 0.7210774421691895, + "learning_rate": 1.035495656360475e-05, + "loss": 0.798, + "step": 1886 + }, + { + "epoch": 1.5132317562149158, + "grad_norm": 0.6990832686424255, + "learning_rate": 1.0346302592251591e-05, + "loss": 0.7494, + "step": 1887 + }, + { + "epoch": 1.5140336808340016, + "grad_norm": 0.6791195273399353, + "learning_rate": 1.033764836122895e-05, + "loss": 0.7489, + "step": 1888 + }, + { + "epoch": 1.5148356054530874, + "grad_norm": 0.6856716275215149, + "learning_rate": 1.0328993877026075e-05, + "loss": 0.7514, + "step": 1889 + }, + { + "epoch": 1.5156375300721732, + "grad_norm": 0.6292150020599365, + "learning_rate": 1.032033914613238e-05, + "loss": 0.7234, + "step": 1890 + }, + { + "epoch": 1.516439454691259, + "grad_norm": 0.6631137132644653, + "learning_rate": 1.0311684175037488e-05, + "loss": 0.7079, + "step": 1891 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 0.6873642206192017, + "learning_rate": 1.0303028970231185e-05, + "loss": 0.7566, + "step": 1892 + }, + { + "epoch": 1.5180433039294305, + "grad_norm": 0.6789519190788269, + "learning_rate": 1.0294373538203439e-05, + "loss": 0.7369, + "step": 1893 + }, + { + "epoch": 1.5188452285485163, + "grad_norm": 0.716335117816925, + "learning_rate": 1.028571788544439e-05, + "loss": 0.7133, + "step": 1894 + }, + { + "epoch": 1.5196471531676021, + "grad_norm": 0.7093126177787781, + "learning_rate": 1.0277062018444342e-05, + "loss": 0.7786, + "step": 1895 + }, + { + "epoch": 1.520449077786688, + "grad_norm": 0.6854731440544128, + "learning_rate": 1.0268405943693757e-05, + "loss": 0.78, + "step": 1896 + }, + { + "epoch": 1.5212510024057737, + "grad_norm": 0.6630930304527283, + "learning_rate": 1.0259749667683252e-05, + "loss": 0.7239, + "step": 1897 + }, + { + "epoch": 1.5220529270248595, + "grad_norm": 0.6397947669029236, + "learning_rate": 1.0251093196903601e-05, + "loss": 0.7385, + "step": 1898 + }, + { + "epoch": 1.5228548516439455, + "grad_norm": 0.6733710169792175, + "learning_rate": 1.0242436537845719e-05, + "loss": 0.7366, + "step": 1899 + }, + { + "epoch": 1.5236567762630313, + "grad_norm": 0.7009027600288391, + "learning_rate": 1.0233779697000667e-05, + "loss": 0.7702, + "step": 1900 + }, + { + "epoch": 1.524458700882117, + "grad_norm": 0.7578801512718201, + "learning_rate": 1.0225122680859633e-05, + "loss": 0.7649, + "step": 1901 + }, + { + "epoch": 1.525260625501203, + "grad_norm": 0.7465493083000183, + "learning_rate": 1.0216465495913947e-05, + "loss": 0.7646, + "step": 1902 + }, + { + "epoch": 1.5260625501202887, + "grad_norm": 0.6907299160957336, + "learning_rate": 1.020780814865506e-05, + "loss": 0.7389, + "step": 1903 + }, + { + "epoch": 1.5268644747393745, + "grad_norm": 0.682547926902771, + "learning_rate": 1.0199150645574548e-05, + "loss": 0.7454, + "step": 1904 + }, + { + "epoch": 1.5276663993584603, + "grad_norm": 0.6859135031700134, + "learning_rate": 1.0190492993164101e-05, + "loss": 0.7432, + "step": 1905 + }, + { + "epoch": 1.528468323977546, + "grad_norm": 0.6617407202720642, + "learning_rate": 1.0181835197915515e-05, + "loss": 0.7214, + "step": 1906 + }, + { + "epoch": 1.529270248596632, + "grad_norm": 0.6514879465103149, + "learning_rate": 1.0173177266320706e-05, + "loss": 0.7437, + "step": 1907 + }, + { + "epoch": 1.5300721732157179, + "grad_norm": 0.6830449104309082, + "learning_rate": 1.016451920487169e-05, + "loss": 0.7648, + "step": 1908 + }, + { + "epoch": 1.5308740978348037, + "grad_norm": 0.6907112002372742, + "learning_rate": 1.0155861020060566e-05, + "loss": 0.7236, + "step": 1909 + }, + { + "epoch": 1.5316760224538895, + "grad_norm": 0.6831691861152649, + "learning_rate": 1.0147202718379544e-05, + "loss": 0.7153, + "step": 1910 + }, + { + "epoch": 1.5324779470729752, + "grad_norm": 0.6687254905700684, + "learning_rate": 1.013854430632091e-05, + "loss": 0.7333, + "step": 1911 + }, + { + "epoch": 1.533279871692061, + "grad_norm": 0.6817905902862549, + "learning_rate": 1.0129885790377034e-05, + "loss": 0.7489, + "step": 1912 + }, + { + "epoch": 1.5340817963111468, + "grad_norm": 0.6689939498901367, + "learning_rate": 1.0121227177040373e-05, + "loss": 0.7337, + "step": 1913 + }, + { + "epoch": 1.5348837209302326, + "grad_norm": 0.6985632181167603, + "learning_rate": 1.0112568472803443e-05, + "loss": 0.7522, + "step": 1914 + }, + { + "epoch": 1.5356856455493184, + "grad_norm": 0.7158617973327637, + "learning_rate": 1.0103909684158841e-05, + "loss": 0.776, + "step": 1915 + }, + { + "epoch": 1.5364875701684042, + "grad_norm": 0.671501874923706, + "learning_rate": 1.0095250817599218e-05, + "loss": 0.7396, + "step": 1916 + }, + { + "epoch": 1.53728949478749, + "grad_norm": 0.6825124025344849, + "learning_rate": 1.008659187961729e-05, + "loss": 0.6984, + "step": 1917 + }, + { + "epoch": 1.5380914194065758, + "grad_norm": 0.6682149171829224, + "learning_rate": 1.0077932876705819e-05, + "loss": 0.7488, + "step": 1918 + }, + { + "epoch": 1.5388933440256616, + "grad_norm": 0.6929824948310852, + "learning_rate": 1.0069273815357621e-05, + "loss": 0.7576, + "step": 1919 + }, + { + "epoch": 1.5396952686447474, + "grad_norm": 0.6691644191741943, + "learning_rate": 1.006061470206556e-05, + "loss": 0.7617, + "step": 1920 + }, + { + "epoch": 1.5404971932638332, + "grad_norm": 0.6622249484062195, + "learning_rate": 1.0051955543322533e-05, + "loss": 0.7602, + "step": 1921 + }, + { + "epoch": 1.541299117882919, + "grad_norm": 0.6343883275985718, + "learning_rate": 1.0043296345621467e-05, + "loss": 0.7423, + "step": 1922 + }, + { + "epoch": 1.5421010425020047, + "grad_norm": 0.668969452381134, + "learning_rate": 1.0034637115455327e-05, + "loss": 0.7551, + "step": 1923 + }, + { + "epoch": 1.5429029671210905, + "grad_norm": 0.7144033908843994, + "learning_rate": 1.0025977859317097e-05, + "loss": 0.716, + "step": 1924 + }, + { + "epoch": 1.5437048917401763, + "grad_norm": 0.6621528267860413, + "learning_rate": 1.0017318583699786e-05, + "loss": 0.7375, + "step": 1925 + }, + { + "epoch": 1.5445068163592621, + "grad_norm": 0.7118502259254456, + "learning_rate": 1.0008659295096412e-05, + "loss": 0.8017, + "step": 1926 + }, + { + "epoch": 1.545308740978348, + "grad_norm": 0.6664971113204956, + "learning_rate": 1e-05, + "loss": 0.7292, + "step": 1927 + }, + { + "epoch": 1.5461106655974337, + "grad_norm": 0.7016831636428833, + "learning_rate": 9.991340704903593e-06, + "loss": 0.7586, + "step": 1928 + }, + { + "epoch": 1.5469125902165195, + "grad_norm": 0.7355296015739441, + "learning_rate": 9.982681416300217e-06, + "loss": 0.7695, + "step": 1929 + }, + { + "epoch": 1.5477145148356053, + "grad_norm": 0.7109652757644653, + "learning_rate": 9.974022140682906e-06, + "loss": 0.7447, + "step": 1930 + }, + { + "epoch": 1.5485164394546913, + "grad_norm": 0.8703607320785522, + "learning_rate": 9.965362884544674e-06, + "loss": 0.7483, + "step": 1931 + }, + { + "epoch": 1.549318364073777, + "grad_norm": 0.6655393242835999, + "learning_rate": 9.956703654378536e-06, + "loss": 0.7431, + "step": 1932 + }, + { + "epoch": 1.550120288692863, + "grad_norm": 0.6632983088493347, + "learning_rate": 9.948044456677472e-06, + "loss": 0.6951, + "step": 1933 + }, + { + "epoch": 1.5509222133119487, + "grad_norm": 0.7212697267532349, + "learning_rate": 9.939385297934441e-06, + "loss": 0.7628, + "step": 1934 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 0.6794565916061401, + "learning_rate": 9.930726184642382e-06, + "loss": 0.7403, + "step": 1935 + }, + { + "epoch": 1.5525260625501203, + "grad_norm": 0.7048789262771606, + "learning_rate": 9.922067123294183e-06, + "loss": 0.7689, + "step": 1936 + }, + { + "epoch": 1.553327987169206, + "grad_norm": 0.7070860862731934, + "learning_rate": 9.913408120382714e-06, + "loss": 0.7855, + "step": 1937 + }, + { + "epoch": 1.5541299117882919, + "grad_norm": 0.681030809879303, + "learning_rate": 9.904749182400786e-06, + "loss": 0.7341, + "step": 1938 + }, + { + "epoch": 1.5549318364073779, + "grad_norm": 0.6738923788070679, + "learning_rate": 9.896090315841162e-06, + "loss": 0.7725, + "step": 1939 + }, + { + "epoch": 1.5557337610264637, + "grad_norm": 0.6607416868209839, + "learning_rate": 9.88743152719656e-06, + "loss": 0.7083, + "step": 1940 + }, + { + "epoch": 1.5565356856455494, + "grad_norm": 0.6659730672836304, + "learning_rate": 9.878772822959628e-06, + "loss": 0.7544, + "step": 1941 + }, + { + "epoch": 1.5573376102646352, + "grad_norm": 0.669576108455658, + "learning_rate": 9.870114209622969e-06, + "loss": 0.741, + "step": 1942 + }, + { + "epoch": 1.558139534883721, + "grad_norm": 0.6290959715843201, + "learning_rate": 9.861455693679096e-06, + "loss": 0.7228, + "step": 1943 + }, + { + "epoch": 1.5589414595028068, + "grad_norm": 0.6587105393409729, + "learning_rate": 9.852797281620459e-06, + "loss": 0.716, + "step": 1944 + }, + { + "epoch": 1.5597433841218926, + "grad_norm": 0.7020542621612549, + "learning_rate": 9.844138979939437e-06, + "loss": 0.7684, + "step": 1945 + }, + { + "epoch": 1.5605453087409784, + "grad_norm": 0.6784189343452454, + "learning_rate": 9.835480795128314e-06, + "loss": 0.7699, + "step": 1946 + }, + { + "epoch": 1.5613472333600642, + "grad_norm": 0.6841933727264404, + "learning_rate": 9.826822733679296e-06, + "loss": 0.7578, + "step": 1947 + }, + { + "epoch": 1.56214915797915, + "grad_norm": 0.7108730673789978, + "learning_rate": 9.81816480208449e-06, + "loss": 0.7228, + "step": 1948 + }, + { + "epoch": 1.5629510825982358, + "grad_norm": 0.6719072461128235, + "learning_rate": 9.809507006835904e-06, + "loss": 0.7476, + "step": 1949 + }, + { + "epoch": 1.5637530072173216, + "grad_norm": 0.6494175791740417, + "learning_rate": 9.800849354425455e-06, + "loss": 0.7254, + "step": 1950 + }, + { + "epoch": 1.5645549318364074, + "grad_norm": 0.6985930800437927, + "learning_rate": 9.79219185134494e-06, + "loss": 0.7566, + "step": 1951 + }, + { + "epoch": 1.5653568564554932, + "grad_norm": 0.6902778744697571, + "learning_rate": 9.783534504086055e-06, + "loss": 0.7368, + "step": 1952 + }, + { + "epoch": 1.566158781074579, + "grad_norm": 0.6868149042129517, + "learning_rate": 9.774877319140372e-06, + "loss": 0.7191, + "step": 1953 + }, + { + "epoch": 1.5669607056936647, + "grad_norm": 0.6847664713859558, + "learning_rate": 9.766220302999336e-06, + "loss": 0.7588, + "step": 1954 + }, + { + "epoch": 1.5677626303127505, + "grad_norm": 0.6886143088340759, + "learning_rate": 9.757563462154283e-06, + "loss": 0.7608, + "step": 1955 + }, + { + "epoch": 1.5685645549318363, + "grad_norm": 0.7180765271186829, + "learning_rate": 9.7489068030964e-06, + "loss": 0.7273, + "step": 1956 + }, + { + "epoch": 1.5693664795509221, + "grad_norm": 0.6763858795166016, + "learning_rate": 9.74025033231675e-06, + "loss": 0.7549, + "step": 1957 + }, + { + "epoch": 1.570168404170008, + "grad_norm": 0.6915479898452759, + "learning_rate": 9.731594056306248e-06, + "loss": 0.7512, + "step": 1958 + }, + { + "epoch": 1.5709703287890937, + "grad_norm": 0.6777629852294922, + "learning_rate": 9.72293798155566e-06, + "loss": 0.7536, + "step": 1959 + }, + { + "epoch": 1.5717722534081795, + "grad_norm": 0.6455657482147217, + "learning_rate": 9.714282114555613e-06, + "loss": 0.7347, + "step": 1960 + }, + { + "epoch": 1.5725741780272653, + "grad_norm": 0.700589656829834, + "learning_rate": 9.70562646179656e-06, + "loss": 0.7709, + "step": 1961 + }, + { + "epoch": 1.573376102646351, + "grad_norm": 0.6736430525779724, + "learning_rate": 9.696971029768817e-06, + "loss": 0.7816, + "step": 1962 + }, + { + "epoch": 1.574178027265437, + "grad_norm": 0.6946107149124146, + "learning_rate": 9.688315824962516e-06, + "loss": 0.7248, + "step": 1963 + }, + { + "epoch": 1.5749799518845229, + "grad_norm": 0.6668774485588074, + "learning_rate": 9.679660853867621e-06, + "loss": 0.7486, + "step": 1964 + }, + { + "epoch": 1.5757818765036087, + "grad_norm": 0.7241086363792419, + "learning_rate": 9.67100612297393e-06, + "loss": 0.7377, + "step": 1965 + }, + { + "epoch": 1.5765838011226945, + "grad_norm": 0.6383355259895325, + "learning_rate": 9.662351638771049e-06, + "loss": 0.7387, + "step": 1966 + }, + { + "epoch": 1.5773857257417803, + "grad_norm": 0.6955791115760803, + "learning_rate": 9.653697407748412e-06, + "loss": 0.7487, + "step": 1967 + }, + { + "epoch": 1.578187650360866, + "grad_norm": 0.6960842609405518, + "learning_rate": 9.645043436395253e-06, + "loss": 0.7984, + "step": 1968 + }, + { + "epoch": 1.5789895749799518, + "grad_norm": 0.7226347923278809, + "learning_rate": 9.63638973120061e-06, + "loss": 0.7261, + "step": 1969 + }, + { + "epoch": 1.5797914995990376, + "grad_norm": 0.6601559519767761, + "learning_rate": 9.627736298653332e-06, + "loss": 0.732, + "step": 1970 + }, + { + "epoch": 1.5805934242181237, + "grad_norm": 0.6827449798583984, + "learning_rate": 9.619083145242053e-06, + "loss": 0.7392, + "step": 1971 + }, + { + "epoch": 1.5813953488372094, + "grad_norm": 0.6553324460983276, + "learning_rate": 9.610430277455209e-06, + "loss": 0.7435, + "step": 1972 + }, + { + "epoch": 1.5821972734562952, + "grad_norm": 0.6525527238845825, + "learning_rate": 9.601777701781009e-06, + "loss": 0.7591, + "step": 1973 + }, + { + "epoch": 1.582999198075381, + "grad_norm": 0.6529831290245056, + "learning_rate": 9.593125424707446e-06, + "loss": 0.7414, + "step": 1974 + }, + { + "epoch": 1.5838011226944668, + "grad_norm": 0.6787192821502686, + "learning_rate": 9.584473452722299e-06, + "loss": 0.7597, + "step": 1975 + }, + { + "epoch": 1.5846030473135526, + "grad_norm": 0.6774669289588928, + "learning_rate": 9.575821792313108e-06, + "loss": 0.7418, + "step": 1976 + }, + { + "epoch": 1.5854049719326384, + "grad_norm": 0.6893562078475952, + "learning_rate": 9.567170449967183e-06, + "loss": 0.6952, + "step": 1977 + }, + { + "epoch": 1.5862068965517242, + "grad_norm": 0.6523553133010864, + "learning_rate": 9.558519432171597e-06, + "loss": 0.7763, + "step": 1978 + }, + { + "epoch": 1.58700882117081, + "grad_norm": 0.668114960193634, + "learning_rate": 9.549868745413172e-06, + "loss": 0.7025, + "step": 1979 + }, + { + "epoch": 1.5878107457898958, + "grad_norm": 0.6470924019813538, + "learning_rate": 9.541218396178494e-06, + "loss": 0.7311, + "step": 1980 + }, + { + "epoch": 1.5886126704089816, + "grad_norm": 0.7354634404182434, + "learning_rate": 9.532568390953886e-06, + "loss": 0.8128, + "step": 1981 + }, + { + "epoch": 1.5894145950280674, + "grad_norm": 0.6740539073944092, + "learning_rate": 9.52391873622542e-06, + "loss": 0.7422, + "step": 1982 + }, + { + "epoch": 1.5902165196471532, + "grad_norm": 0.6699314117431641, + "learning_rate": 9.515269438478898e-06, + "loss": 0.7493, + "step": 1983 + }, + { + "epoch": 1.591018444266239, + "grad_norm": 0.6977644562721252, + "learning_rate": 9.506620504199854e-06, + "loss": 0.749, + "step": 1984 + }, + { + "epoch": 1.5918203688853247, + "grad_norm": 0.6674862504005432, + "learning_rate": 9.497971939873567e-06, + "loss": 0.7679, + "step": 1985 + }, + { + "epoch": 1.5926222935044105, + "grad_norm": 0.6880958676338196, + "learning_rate": 9.489323751985009e-06, + "loss": 0.7485, + "step": 1986 + }, + { + "epoch": 1.5934242181234963, + "grad_norm": 0.6663671731948853, + "learning_rate": 9.480675947018899e-06, + "loss": 0.7573, + "step": 1987 + }, + { + "epoch": 1.5942261427425821, + "grad_norm": 0.706123948097229, + "learning_rate": 9.472028531459649e-06, + "loss": 0.7605, + "step": 1988 + }, + { + "epoch": 1.595028067361668, + "grad_norm": 0.7029390931129456, + "learning_rate": 9.463381511791386e-06, + "loss": 0.7809, + "step": 1989 + }, + { + "epoch": 1.5958299919807537, + "grad_norm": 0.633929431438446, + "learning_rate": 9.454734894497942e-06, + "loss": 0.7103, + "step": 1990 + }, + { + "epoch": 1.5966319165998395, + "grad_norm": 0.6639130115509033, + "learning_rate": 9.446088686062838e-06, + "loss": 0.7599, + "step": 1991 + }, + { + "epoch": 1.5974338412189253, + "grad_norm": 0.6766201853752136, + "learning_rate": 9.437442892969308e-06, + "loss": 0.7605, + "step": 1992 + }, + { + "epoch": 1.598235765838011, + "grad_norm": 0.6676896810531616, + "learning_rate": 9.428797521700254e-06, + "loss": 0.7316, + "step": 1993 + }, + { + "epoch": 1.5990376904570969, + "grad_norm": 0.7439046502113342, + "learning_rate": 9.420152578738269e-06, + "loss": 0.7832, + "step": 1994 + }, + { + "epoch": 1.5998396150761829, + "grad_norm": 0.8324495553970337, + "learning_rate": 9.41150807056563e-06, + "loss": 0.7817, + "step": 1995 + }, + { + "epoch": 1.6006415396952687, + "grad_norm": 0.6500052213668823, + "learning_rate": 9.402864003664279e-06, + "loss": 0.7429, + "step": 1996 + }, + { + "epoch": 1.6014434643143545, + "grad_norm": 0.6784413456916809, + "learning_rate": 9.394220384515836e-06, + "loss": 0.7663, + "step": 1997 + }, + { + "epoch": 1.6022453889334403, + "grad_norm": 0.6711524724960327, + "learning_rate": 9.38557721960158e-06, + "loss": 0.7391, + "step": 1998 + }, + { + "epoch": 1.603047313552526, + "grad_norm": 0.6689935326576233, + "learning_rate": 9.37693451540245e-06, + "loss": 0.7391, + "step": 1999 + }, + { + "epoch": 1.6038492381716118, + "grad_norm": 0.7078515887260437, + "learning_rate": 9.368292278399038e-06, + "loss": 0.755, + "step": 2000 + }, + { + "epoch": 1.6046511627906976, + "grad_norm": 0.6978031396865845, + "learning_rate": 9.35965051507159e-06, + "loss": 0.7511, + "step": 2001 + }, + { + "epoch": 1.6054530874097834, + "grad_norm": 0.6822302937507629, + "learning_rate": 9.351009231899995e-06, + "loss": 0.7673, + "step": 2002 + }, + { + "epoch": 1.6062550120288694, + "grad_norm": 0.7071417570114136, + "learning_rate": 9.342368435363774e-06, + "loss": 0.7611, + "step": 2003 + }, + { + "epoch": 1.6070569366479552, + "grad_norm": 0.7437505722045898, + "learning_rate": 9.333728131942104e-06, + "loss": 0.7593, + "step": 2004 + }, + { + "epoch": 1.607858861267041, + "grad_norm": 0.6476147770881653, + "learning_rate": 9.325088328113769e-06, + "loss": 0.743, + "step": 2005 + }, + { + "epoch": 1.6086607858861268, + "grad_norm": 0.6686375737190247, + "learning_rate": 9.316449030357188e-06, + "loss": 0.7613, + "step": 2006 + }, + { + "epoch": 1.6094627105052126, + "grad_norm": 0.6458247303962708, + "learning_rate": 9.307810245150408e-06, + "loss": 0.7171, + "step": 2007 + }, + { + "epoch": 1.6102646351242984, + "grad_norm": 1.242529273033142, + "learning_rate": 9.299171978971073e-06, + "loss": 0.7636, + "step": 2008 + }, + { + "epoch": 1.6110665597433842, + "grad_norm": 0.7031643390655518, + "learning_rate": 9.290534238296462e-06, + "loss": 0.7621, + "step": 2009 + }, + { + "epoch": 1.61186848436247, + "grad_norm": 0.6745364665985107, + "learning_rate": 9.281897029603439e-06, + "loss": 0.6897, + "step": 2010 + }, + { + "epoch": 1.6126704089815558, + "grad_norm": 0.8014926314353943, + "learning_rate": 9.273260359368478e-06, + "loss": 0.7431, + "step": 2011 + }, + { + "epoch": 1.6134723336006416, + "grad_norm": 0.6873915791511536, + "learning_rate": 9.264624234067651e-06, + "loss": 0.7679, + "step": 2012 + }, + { + "epoch": 1.6142742582197274, + "grad_norm": 0.6493097543716431, + "learning_rate": 9.255988660176613e-06, + "loss": 0.7457, + "step": 2013 + }, + { + "epoch": 1.6150761828388132, + "grad_norm": 0.7045702934265137, + "learning_rate": 9.247353644170622e-06, + "loss": 0.7683, + "step": 2014 + }, + { + "epoch": 1.615878107457899, + "grad_norm": 0.6594098210334778, + "learning_rate": 9.238719192524501e-06, + "loss": 0.7129, + "step": 2015 + }, + { + "epoch": 1.6166800320769847, + "grad_norm": 0.7022315859794617, + "learning_rate": 9.23008531171265e-06, + "loss": 0.7376, + "step": 2016 + }, + { + "epoch": 1.6174819566960705, + "grad_norm": 0.6779872179031372, + "learning_rate": 9.221452008209057e-06, + "loss": 0.7507, + "step": 2017 + }, + { + "epoch": 1.6182838813151563, + "grad_norm": 0.7060291767120361, + "learning_rate": 9.21281928848726e-06, + "loss": 0.7652, + "step": 2018 + }, + { + "epoch": 1.6190858059342421, + "grad_norm": 0.6302214860916138, + "learning_rate": 9.204187159020372e-06, + "loss": 0.7142, + "step": 2019 + }, + { + "epoch": 1.619887730553328, + "grad_norm": 0.7084274291992188, + "learning_rate": 9.195555626281053e-06, + "loss": 0.7594, + "step": 2020 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 0.678536593914032, + "learning_rate": 9.186924696741519e-06, + "loss": 0.7467, + "step": 2021 + }, + { + "epoch": 1.6214915797914995, + "grad_norm": 0.670260488986969, + "learning_rate": 9.17829437687354e-06, + "loss": 0.7463, + "step": 2022 + }, + { + "epoch": 1.6222935044105853, + "grad_norm": 0.702285885810852, + "learning_rate": 9.169664673148421e-06, + "loss": 0.7582, + "step": 2023 + }, + { + "epoch": 1.623095429029671, + "grad_norm": 0.7096335291862488, + "learning_rate": 9.16103559203701e-06, + "loss": 0.706, + "step": 2024 + }, + { + "epoch": 1.6238973536487569, + "grad_norm": 0.6567702293395996, + "learning_rate": 9.152407140009684e-06, + "loss": 0.721, + "step": 2025 + }, + { + "epoch": 1.6246992782678427, + "grad_norm": 0.6644206047058105, + "learning_rate": 9.143779323536346e-06, + "loss": 0.7575, + "step": 2026 + }, + { + "epoch": 1.6255012028869287, + "grad_norm": 0.6624019145965576, + "learning_rate": 9.135152149086436e-06, + "loss": 0.7174, + "step": 2027 + }, + { + "epoch": 1.6263031275060145, + "grad_norm": 0.723945677280426, + "learning_rate": 9.126525623128896e-06, + "loss": 0.7682, + "step": 2028 + }, + { + "epoch": 1.6271050521251003, + "grad_norm": 0.684164822101593, + "learning_rate": 9.117899752132193e-06, + "loss": 0.7677, + "step": 2029 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 0.6822468042373657, + "learning_rate": 9.109274542564295e-06, + "loss": 0.7646, + "step": 2030 + }, + { + "epoch": 1.6287089013632718, + "grad_norm": 0.707531750202179, + "learning_rate": 9.100650000892679e-06, + "loss": 0.7443, + "step": 2031 + }, + { + "epoch": 1.6295108259823576, + "grad_norm": 0.6925025582313538, + "learning_rate": 9.092026133584322e-06, + "loss": 0.7717, + "step": 2032 + }, + { + "epoch": 1.6303127506014434, + "grad_norm": 0.6611237525939941, + "learning_rate": 9.083402947105688e-06, + "loss": 0.7586, + "step": 2033 + }, + { + "epoch": 1.6311146752205292, + "grad_norm": 0.6879470348358154, + "learning_rate": 9.074780447922746e-06, + "loss": 0.7076, + "step": 2034 + }, + { + "epoch": 1.6319165998396152, + "grad_norm": 0.6737411618232727, + "learning_rate": 9.066158642500933e-06, + "loss": 0.7587, + "step": 2035 + }, + { + "epoch": 1.632718524458701, + "grad_norm": 0.6747552752494812, + "learning_rate": 9.05753753730517e-06, + "loss": 0.7363, + "step": 2036 + }, + { + "epoch": 1.6335204490777868, + "grad_norm": 0.7112221717834473, + "learning_rate": 9.04891713879986e-06, + "loss": 0.7896, + "step": 2037 + }, + { + "epoch": 1.6343223736968726, + "grad_norm": 0.6935960650444031, + "learning_rate": 9.040297453448867e-06, + "loss": 0.7195, + "step": 2038 + }, + { + "epoch": 1.6351242983159584, + "grad_norm": 0.6707137227058411, + "learning_rate": 9.03167848771553e-06, + "loss": 0.7473, + "step": 2039 + }, + { + "epoch": 1.6359262229350442, + "grad_norm": 0.6829401254653931, + "learning_rate": 9.023060248062642e-06, + "loss": 0.7654, + "step": 2040 + }, + { + "epoch": 1.63672814755413, + "grad_norm": 0.6423894166946411, + "learning_rate": 9.014442740952446e-06, + "loss": 0.7386, + "step": 2041 + }, + { + "epoch": 1.6375300721732158, + "grad_norm": 0.678142786026001, + "learning_rate": 9.005825972846652e-06, + "loss": 0.7215, + "step": 2042 + }, + { + "epoch": 1.6383319967923016, + "grad_norm": 0.6874783039093018, + "learning_rate": 8.997209950206396e-06, + "loss": 0.7578, + "step": 2043 + }, + { + "epoch": 1.6391339214113874, + "grad_norm": 0.6972293257713318, + "learning_rate": 8.988594679492276e-06, + "loss": 0.7656, + "step": 2044 + }, + { + "epoch": 1.6399358460304732, + "grad_norm": 0.6379215121269226, + "learning_rate": 8.979980167164311e-06, + "loss": 0.7223, + "step": 2045 + }, + { + "epoch": 1.640737770649559, + "grad_norm": 0.6454169750213623, + "learning_rate": 8.971366419681948e-06, + "loss": 0.7007, + "step": 2046 + }, + { + "epoch": 1.6415396952686447, + "grad_norm": 0.6857996582984924, + "learning_rate": 8.96275344350408e-06, + "loss": 0.7664, + "step": 2047 + }, + { + "epoch": 1.6423416198877305, + "grad_norm": 0.7171909809112549, + "learning_rate": 8.954141245089002e-06, + "loss": 0.7698, + "step": 2048 + }, + { + "epoch": 1.6431435445068163, + "grad_norm": 0.6804106831550598, + "learning_rate": 8.945529830894439e-06, + "loss": 0.7481, + "step": 2049 + }, + { + "epoch": 1.6439454691259021, + "grad_norm": 0.6667369604110718, + "learning_rate": 8.93691920737752e-06, + "loss": 0.7417, + "step": 2050 + }, + { + "epoch": 1.644747393744988, + "grad_norm": 0.6525418758392334, + "learning_rate": 8.92830938099478e-06, + "loss": 0.6958, + "step": 2051 + }, + { + "epoch": 1.6455493183640737, + "grad_norm": 0.6953479647636414, + "learning_rate": 8.919700358202167e-06, + "loss": 0.7469, + "step": 2052 + }, + { + "epoch": 1.6463512429831595, + "grad_norm": 0.715549886226654, + "learning_rate": 8.911092145455015e-06, + "loss": 0.7828, + "step": 2053 + }, + { + "epoch": 1.6471531676022453, + "grad_norm": 0.9194969534873962, + "learning_rate": 8.902484749208058e-06, + "loss": 0.737, + "step": 2054 + }, + { + "epoch": 1.647955092221331, + "grad_norm": 0.6881150603294373, + "learning_rate": 8.893878175915414e-06, + "loss": 0.718, + "step": 2055 + }, + { + "epoch": 1.6487570168404169, + "grad_norm": 0.716201663017273, + "learning_rate": 8.885272432030579e-06, + "loss": 0.7638, + "step": 2056 + }, + { + "epoch": 1.6495589414595027, + "grad_norm": 0.6798361539840698, + "learning_rate": 8.876667524006442e-06, + "loss": 0.7288, + "step": 2057 + }, + { + "epoch": 1.6503608660785885, + "grad_norm": 0.6651695370674133, + "learning_rate": 8.868063458295251e-06, + "loss": 0.7251, + "step": 2058 + }, + { + "epoch": 1.6511627906976745, + "grad_norm": 0.6712408065795898, + "learning_rate": 8.85946024134863e-06, + "loss": 0.7271, + "step": 2059 + }, + { + "epoch": 1.6519647153167603, + "grad_norm": 0.6494314670562744, + "learning_rate": 8.850857879617562e-06, + "loss": 0.7224, + "step": 2060 + }, + { + "epoch": 1.652766639935846, + "grad_norm": 0.6849057674407959, + "learning_rate": 8.84225637955239e-06, + "loss": 0.7429, + "step": 2061 + }, + { + "epoch": 1.6535685645549318, + "grad_norm": 0.6645267009735107, + "learning_rate": 8.833655747602816e-06, + "loss": 0.6723, + "step": 2062 + }, + { + "epoch": 1.6543704891740176, + "grad_norm": 0.6581180691719055, + "learning_rate": 8.825055990217877e-06, + "loss": 0.7193, + "step": 2063 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 0.6753679513931274, + "learning_rate": 8.816457113845977e-06, + "loss": 0.7498, + "step": 2064 + }, + { + "epoch": 1.6559743384121892, + "grad_norm": 0.6934730410575867, + "learning_rate": 8.80785912493484e-06, + "loss": 0.755, + "step": 2065 + }, + { + "epoch": 1.656776263031275, + "grad_norm": 0.6492776870727539, + "learning_rate": 8.799262029931527e-06, + "loss": 0.7371, + "step": 2066 + }, + { + "epoch": 1.657578187650361, + "grad_norm": 0.6823244690895081, + "learning_rate": 8.79066583528244e-06, + "loss": 0.7319, + "step": 2067 + }, + { + "epoch": 1.6583801122694468, + "grad_norm": 0.6459192037582397, + "learning_rate": 8.78207054743329e-06, + "loss": 0.6997, + "step": 2068 + }, + { + "epoch": 1.6591820368885326, + "grad_norm": 0.6940407156944275, + "learning_rate": 8.773476172829127e-06, + "loss": 0.7578, + "step": 2069 + }, + { + "epoch": 1.6599839615076184, + "grad_norm": 0.6635679602622986, + "learning_rate": 8.7648827179143e-06, + "loss": 0.7466, + "step": 2070 + }, + { + "epoch": 1.6607858861267042, + "grad_norm": 0.7041206359863281, + "learning_rate": 8.756290189132473e-06, + "loss": 0.7564, + "step": 2071 + }, + { + "epoch": 1.66158781074579, + "grad_norm": 0.6900807619094849, + "learning_rate": 8.74769859292662e-06, + "loss": 0.723, + "step": 2072 + }, + { + "epoch": 1.6623897353648758, + "grad_norm": 0.7090346217155457, + "learning_rate": 8.739107935739004e-06, + "loss": 0.7512, + "step": 2073 + }, + { + "epoch": 1.6631916599839616, + "grad_norm": 0.6753419041633606, + "learning_rate": 8.730518224011209e-06, + "loss": 0.7404, + "step": 2074 + }, + { + "epoch": 1.6639935846030474, + "grad_norm": 0.6859150528907776, + "learning_rate": 8.721929464184079e-06, + "loss": 0.7529, + "step": 2075 + }, + { + "epoch": 1.6647955092221332, + "grad_norm": 0.713150143623352, + "learning_rate": 8.71334166269776e-06, + "loss": 0.7569, + "step": 2076 + }, + { + "epoch": 1.665597433841219, + "grad_norm": 0.6997129321098328, + "learning_rate": 8.704754825991684e-06, + "loss": 0.7733, + "step": 2077 + }, + { + "epoch": 1.6663993584603047, + "grad_norm": 0.6805722117424011, + "learning_rate": 8.69616896050455e-06, + "loss": 0.7407, + "step": 2078 + }, + { + "epoch": 1.6672012830793905, + "grad_norm": 0.6935459971427917, + "learning_rate": 8.687584072674335e-06, + "loss": 0.7519, + "step": 2079 + }, + { + "epoch": 1.6680032076984763, + "grad_norm": 0.6706266403198242, + "learning_rate": 8.679000168938278e-06, + "loss": 0.7196, + "step": 2080 + }, + { + "epoch": 1.6688051323175621, + "grad_norm": 0.6920490264892578, + "learning_rate": 8.670417255732876e-06, + "loss": 0.7471, + "step": 2081 + }, + { + "epoch": 1.669607056936648, + "grad_norm": 0.7178738117218018, + "learning_rate": 8.661835339493903e-06, + "loss": 0.734, + "step": 2082 + }, + { + "epoch": 1.6704089815557337, + "grad_norm": 0.6797714233398438, + "learning_rate": 8.653254426656364e-06, + "loss": 0.7109, + "step": 2083 + }, + { + "epoch": 1.6712109061748195, + "grad_norm": 0.7086063027381897, + "learning_rate": 8.644674523654522e-06, + "loss": 0.7198, + "step": 2084 + }, + { + "epoch": 1.6720128307939053, + "grad_norm": 0.660064160823822, + "learning_rate": 8.636095636921878e-06, + "loss": 0.7373, + "step": 2085 + }, + { + "epoch": 1.672814755412991, + "grad_norm": 0.6909743547439575, + "learning_rate": 8.627517772891172e-06, + "loss": 0.7312, + "step": 2086 + }, + { + "epoch": 1.6736166800320769, + "grad_norm": 0.6800159811973572, + "learning_rate": 8.618940937994387e-06, + "loss": 0.7629, + "step": 2087 + }, + { + "epoch": 1.6744186046511627, + "grad_norm": 0.7067490816116333, + "learning_rate": 8.610365138662716e-06, + "loss": 0.7388, + "step": 2088 + }, + { + "epoch": 1.6752205292702484, + "grad_norm": 0.7322930097579956, + "learning_rate": 8.601790381326593e-06, + "loss": 0.7714, + "step": 2089 + }, + { + "epoch": 1.6760224538893342, + "grad_norm": 0.6572024822235107, + "learning_rate": 8.59321667241566e-06, + "loss": 0.7291, + "step": 2090 + }, + { + "epoch": 1.6768243785084203, + "grad_norm": 0.6840153932571411, + "learning_rate": 8.584644018358773e-06, + "loss": 0.7107, + "step": 2091 + }, + { + "epoch": 1.677626303127506, + "grad_norm": 0.653813898563385, + "learning_rate": 8.576072425584004e-06, + "loss": 0.7084, + "step": 2092 + }, + { + "epoch": 1.6784282277465918, + "grad_norm": 0.6750298142433167, + "learning_rate": 8.56750190051862e-06, + "loss": 0.728, + "step": 2093 + }, + { + "epoch": 1.6792301523656776, + "grad_norm": 0.6693862080574036, + "learning_rate": 8.558932449589103e-06, + "loss": 0.7279, + "step": 2094 + }, + { + "epoch": 1.6800320769847634, + "grad_norm": 0.6705959439277649, + "learning_rate": 8.550364079221111e-06, + "loss": 0.7475, + "step": 2095 + }, + { + "epoch": 1.6808340016038492, + "grad_norm": 0.7326763868331909, + "learning_rate": 8.541796795839498e-06, + "loss": 0.7598, + "step": 2096 + }, + { + "epoch": 1.681635926222935, + "grad_norm": 0.6632883548736572, + "learning_rate": 8.533230605868314e-06, + "loss": 0.714, + "step": 2097 + }, + { + "epoch": 1.6824378508420208, + "grad_norm": 0.676483154296875, + "learning_rate": 8.524665515730766e-06, + "loss": 0.7276, + "step": 2098 + }, + { + "epoch": 1.6832397754611068, + "grad_norm": 0.6715077757835388, + "learning_rate": 8.516101531849266e-06, + "loss": 0.7608, + "step": 2099 + }, + { + "epoch": 1.6840417000801926, + "grad_norm": 0.6537802219390869, + "learning_rate": 8.507538660645372e-06, + "loss": 0.725, + "step": 2100 + }, + { + "epoch": 1.6848436246992784, + "grad_norm": 0.6930273771286011, + "learning_rate": 8.498976908539817e-06, + "loss": 0.7545, + "step": 2101 + }, + { + "epoch": 1.6856455493183642, + "grad_norm": 0.6789145469665527, + "learning_rate": 8.490416281952495e-06, + "loss": 0.7618, + "step": 2102 + }, + { + "epoch": 1.68644747393745, + "grad_norm": 0.6359390020370483, + "learning_rate": 8.481856787302454e-06, + "loss": 0.7155, + "step": 2103 + }, + { + "epoch": 1.6872493985565358, + "grad_norm": 0.658112108707428, + "learning_rate": 8.473298431007901e-06, + "loss": 0.7385, + "step": 2104 + }, + { + "epoch": 1.6880513231756216, + "grad_norm": 0.6744823455810547, + "learning_rate": 8.464741219486175e-06, + "loss": 0.7365, + "step": 2105 + }, + { + "epoch": 1.6888532477947074, + "grad_norm": 0.6585288643836975, + "learning_rate": 8.456185159153765e-06, + "loss": 0.7542, + "step": 2106 + }, + { + "epoch": 1.6896551724137931, + "grad_norm": 0.6513398885726929, + "learning_rate": 8.447630256426303e-06, + "loss": 0.7255, + "step": 2107 + }, + { + "epoch": 1.690457097032879, + "grad_norm": 0.6796519756317139, + "learning_rate": 8.439076517718541e-06, + "loss": 0.7541, + "step": 2108 + }, + { + "epoch": 1.6912590216519647, + "grad_norm": 0.6824721097946167, + "learning_rate": 8.430523949444367e-06, + "loss": 0.7137, + "step": 2109 + }, + { + "epoch": 1.6920609462710505, + "grad_norm": 0.6917021870613098, + "learning_rate": 8.421972558016786e-06, + "loss": 0.7435, + "step": 2110 + }, + { + "epoch": 1.6928628708901363, + "grad_norm": 0.6658660173416138, + "learning_rate": 8.413422349847918e-06, + "loss": 0.7403, + "step": 2111 + }, + { + "epoch": 1.693664795509222, + "grad_norm": 0.7041023373603821, + "learning_rate": 8.404873331349009e-06, + "loss": 0.7765, + "step": 2112 + }, + { + "epoch": 1.694466720128308, + "grad_norm": 0.7012107372283936, + "learning_rate": 8.396325508930398e-06, + "loss": 0.766, + "step": 2113 + }, + { + "epoch": 1.6952686447473937, + "grad_norm": 0.662287712097168, + "learning_rate": 8.387778889001539e-06, + "loss": 0.7359, + "step": 2114 + }, + { + "epoch": 1.6960705693664795, + "grad_norm": 0.692939043045044, + "learning_rate": 8.379233477970975e-06, + "loss": 0.7821, + "step": 2115 + }, + { + "epoch": 1.6968724939855653, + "grad_norm": 0.7044574022293091, + "learning_rate": 8.370689282246341e-06, + "loss": 0.7677, + "step": 2116 + }, + { + "epoch": 1.697674418604651, + "grad_norm": 0.6683332920074463, + "learning_rate": 8.36214630823438e-06, + "loss": 0.7545, + "step": 2117 + }, + { + "epoch": 1.6984763432237369, + "grad_norm": 0.7165277600288391, + "learning_rate": 8.353604562340886e-06, + "loss": 0.777, + "step": 2118 + }, + { + "epoch": 1.6992782678428227, + "grad_norm": 0.7054672241210938, + "learning_rate": 8.345064050970767e-06, + "loss": 0.7495, + "step": 2119 + }, + { + "epoch": 1.7000801924619084, + "grad_norm": 0.664445161819458, + "learning_rate": 8.336524780527986e-06, + "loss": 0.7648, + "step": 2120 + }, + { + "epoch": 1.7008821170809942, + "grad_norm": 0.6737078428268433, + "learning_rate": 8.327986757415571e-06, + "loss": 0.7248, + "step": 2121 + }, + { + "epoch": 1.70168404170008, + "grad_norm": 0.6758593320846558, + "learning_rate": 8.319449988035631e-06, + "loss": 0.7499, + "step": 2122 + }, + { + "epoch": 1.702485966319166, + "grad_norm": 0.668846845626831, + "learning_rate": 8.310914478789321e-06, + "loss": 0.7671, + "step": 2123 + }, + { + "epoch": 1.7032878909382518, + "grad_norm": 0.6598069071769714, + "learning_rate": 8.30238023607686e-06, + "loss": 0.7507, + "step": 2124 + }, + { + "epoch": 1.7040898155573376, + "grad_norm": 0.6659058928489685, + "learning_rate": 8.293847266297513e-06, + "loss": 0.7198, + "step": 2125 + }, + { + "epoch": 1.7048917401764234, + "grad_norm": 0.6601713299751282, + "learning_rate": 8.285315575849589e-06, + "loss": 0.7103, + "step": 2126 + }, + { + "epoch": 1.7056936647955092, + "grad_norm": 0.6853638887405396, + "learning_rate": 8.276785171130445e-06, + "loss": 0.7441, + "step": 2127 + }, + { + "epoch": 1.706495589414595, + "grad_norm": 0.71426922082901, + "learning_rate": 8.26825605853646e-06, + "loss": 0.744, + "step": 2128 + }, + { + "epoch": 1.7072975140336808, + "grad_norm": 0.6715183854103088, + "learning_rate": 8.259728244463065e-06, + "loss": 0.694, + "step": 2129 + }, + { + "epoch": 1.7080994386527666, + "grad_norm": 0.6622087359428406, + "learning_rate": 8.251201735304698e-06, + "loss": 0.7388, + "step": 2130 + }, + { + "epoch": 1.7089013632718526, + "grad_norm": 0.7121644616127014, + "learning_rate": 8.242676537454825e-06, + "loss": 0.775, + "step": 2131 + }, + { + "epoch": 1.7097032878909384, + "grad_norm": 0.6632829904556274, + "learning_rate": 8.234152657305936e-06, + "loss": 0.7363, + "step": 2132 + }, + { + "epoch": 1.7105052125100242, + "grad_norm": 0.6959726214408875, + "learning_rate": 8.22563010124952e-06, + "loss": 0.7378, + "step": 2133 + }, + { + "epoch": 1.71130713712911, + "grad_norm": 0.7143699526786804, + "learning_rate": 8.217108875676083e-06, + "loss": 0.7204, + "step": 2134 + }, + { + "epoch": 1.7121090617481958, + "grad_norm": 0.6711505651473999, + "learning_rate": 8.20858898697513e-06, + "loss": 0.7385, + "step": 2135 + }, + { + "epoch": 1.7129109863672816, + "grad_norm": 0.6337816119194031, + "learning_rate": 8.200070441535159e-06, + "loss": 0.7139, + "step": 2136 + }, + { + "epoch": 1.7137129109863674, + "grad_norm": 0.6956325173377991, + "learning_rate": 8.191553245743675e-06, + "loss": 0.7812, + "step": 2137 + }, + { + "epoch": 1.7145148356054531, + "grad_norm": 0.6819128394126892, + "learning_rate": 8.183037405987155e-06, + "loss": 0.7351, + "step": 2138 + }, + { + "epoch": 1.715316760224539, + "grad_norm": 0.6942816972732544, + "learning_rate": 8.174522928651068e-06, + "loss": 0.7443, + "step": 2139 + }, + { + "epoch": 1.7161186848436247, + "grad_norm": 0.653355598449707, + "learning_rate": 8.166009820119857e-06, + "loss": 0.7521, + "step": 2140 + }, + { + "epoch": 1.7169206094627105, + "grad_norm": 0.6483197808265686, + "learning_rate": 8.157498086776937e-06, + "loss": 0.7139, + "step": 2141 + }, + { + "epoch": 1.7177225340817963, + "grad_norm": 0.6634138822555542, + "learning_rate": 8.148987735004706e-06, + "loss": 0.7773, + "step": 2142 + }, + { + "epoch": 1.718524458700882, + "grad_norm": 0.6787661910057068, + "learning_rate": 8.140478771184507e-06, + "loss": 0.7243, + "step": 2143 + }, + { + "epoch": 1.719326383319968, + "grad_norm": 0.6924588680267334, + "learning_rate": 8.131971201696656e-06, + "loss": 0.7558, + "step": 2144 + }, + { + "epoch": 1.7201283079390537, + "grad_norm": 0.6862332820892334, + "learning_rate": 8.123465032920415e-06, + "loss": 0.7175, + "step": 2145 + }, + { + "epoch": 1.7209302325581395, + "grad_norm": 0.6828835010528564, + "learning_rate": 8.114960271233999e-06, + "loss": 0.7774, + "step": 2146 + }, + { + "epoch": 1.7217321571772253, + "grad_norm": 0.6388763189315796, + "learning_rate": 8.106456923014571e-06, + "loss": 0.7002, + "step": 2147 + }, + { + "epoch": 1.722534081796311, + "grad_norm": 0.6799570918083191, + "learning_rate": 8.097954994638225e-06, + "loss": 0.7579, + "step": 2148 + }, + { + "epoch": 1.7233360064153969, + "grad_norm": 0.6811538338661194, + "learning_rate": 8.089454492480004e-06, + "loss": 0.7354, + "step": 2149 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 0.7103093266487122, + "learning_rate": 8.080955422913872e-06, + "loss": 0.7416, + "step": 2150 + }, + { + "epoch": 1.7249398556535684, + "grad_norm": 0.6595764756202698, + "learning_rate": 8.072457792312715e-06, + "loss": 0.7321, + "step": 2151 + }, + { + "epoch": 1.7257417802726542, + "grad_norm": 0.6965066194534302, + "learning_rate": 8.063961607048353e-06, + "loss": 0.7535, + "step": 2152 + }, + { + "epoch": 1.72654370489174, + "grad_norm": 0.6480819582939148, + "learning_rate": 8.05546687349151e-06, + "loss": 0.7559, + "step": 2153 + }, + { + "epoch": 1.7273456295108258, + "grad_norm": 0.6977070569992065, + "learning_rate": 8.046973598011831e-06, + "loss": 0.7416, + "step": 2154 + }, + { + "epoch": 1.7281475541299118, + "grad_norm": 0.689796507358551, + "learning_rate": 8.038481786977858e-06, + "loss": 0.7546, + "step": 2155 + }, + { + "epoch": 1.7289494787489976, + "grad_norm": 0.6607633233070374, + "learning_rate": 8.029991446757047e-06, + "loss": 0.733, + "step": 2156 + }, + { + "epoch": 1.7297514033680834, + "grad_norm": 0.7048103213310242, + "learning_rate": 8.02150258371574e-06, + "loss": 0.7734, + "step": 2157 + }, + { + "epoch": 1.7305533279871692, + "grad_norm": 0.6884298324584961, + "learning_rate": 8.013015204219171e-06, + "loss": 0.7469, + "step": 2158 + }, + { + "epoch": 1.731355252606255, + "grad_norm": 0.7125540375709534, + "learning_rate": 8.004529314631476e-06, + "loss": 0.7367, + "step": 2159 + }, + { + "epoch": 1.7321571772253408, + "grad_norm": 0.6849104762077332, + "learning_rate": 7.996044921315656e-06, + "loss": 0.7246, + "step": 2160 + }, + { + "epoch": 1.7329591018444266, + "grad_norm": 0.6800344586372375, + "learning_rate": 7.987562030633604e-06, + "loss": 0.726, + "step": 2161 + }, + { + "epoch": 1.7337610264635124, + "grad_norm": 0.7025103569030762, + "learning_rate": 7.979080648946078e-06, + "loss": 0.7169, + "step": 2162 + }, + { + "epoch": 1.7345629510825984, + "grad_norm": 0.6762646436691284, + "learning_rate": 7.970600782612703e-06, + "loss": 0.7497, + "step": 2163 + }, + { + "epoch": 1.7353648757016842, + "grad_norm": 0.7249945998191833, + "learning_rate": 7.962122437991978e-06, + "loss": 0.7864, + "step": 2164 + }, + { + "epoch": 1.73616680032077, + "grad_norm": 0.6832449436187744, + "learning_rate": 7.953645621441245e-06, + "loss": 0.7619, + "step": 2165 + }, + { + "epoch": 1.7369687249398558, + "grad_norm": 0.6707166433334351, + "learning_rate": 7.945170339316724e-06, + "loss": 0.721, + "step": 2166 + }, + { + "epoch": 1.7377706495589416, + "grad_norm": 0.6761574149131775, + "learning_rate": 7.93669659797346e-06, + "loss": 0.7313, + "step": 2167 + }, + { + "epoch": 1.7385725741780274, + "grad_norm": 0.6892061829566956, + "learning_rate": 7.928224403765353e-06, + "loss": 0.736, + "step": 2168 + }, + { + "epoch": 1.7393744987971131, + "grad_norm": 0.6842492818832397, + "learning_rate": 7.919753763045148e-06, + "loss": 0.7297, + "step": 2169 + }, + { + "epoch": 1.740176423416199, + "grad_norm": 0.6731321811676025, + "learning_rate": 7.911284682164413e-06, + "loss": 0.7464, + "step": 2170 + }, + { + "epoch": 1.7409783480352847, + "grad_norm": 0.6783955693244934, + "learning_rate": 7.90281716747356e-06, + "loss": 0.737, + "step": 2171 + }, + { + "epoch": 1.7417802726543705, + "grad_norm": 0.6662715673446655, + "learning_rate": 7.894351225321817e-06, + "loss": 0.7445, + "step": 2172 + }, + { + "epoch": 1.7425821972734563, + "grad_norm": 0.6635359525680542, + "learning_rate": 7.885886862057233e-06, + "loss": 0.7155, + "step": 2173 + }, + { + "epoch": 1.743384121892542, + "grad_norm": 0.691408634185791, + "learning_rate": 7.877424084026682e-06, + "loss": 0.729, + "step": 2174 + }, + { + "epoch": 1.744186046511628, + "grad_norm": 0.6881137490272522, + "learning_rate": 7.868962897575837e-06, + "loss": 0.7426, + "step": 2175 + }, + { + "epoch": 1.7449879711307137, + "grad_norm": 0.6919487714767456, + "learning_rate": 7.86050330904919e-06, + "loss": 0.7381, + "step": 2176 + }, + { + "epoch": 1.7457898957497995, + "grad_norm": 0.6565024852752686, + "learning_rate": 7.852045324790023e-06, + "loss": 0.7485, + "step": 2177 + }, + { + "epoch": 1.7465918203688853, + "grad_norm": 0.6828457117080688, + "learning_rate": 7.843588951140421e-06, + "loss": 0.7429, + "step": 2178 + }, + { + "epoch": 1.747393744987971, + "grad_norm": 0.6890912652015686, + "learning_rate": 7.835134194441265e-06, + "loss": 0.7139, + "step": 2179 + }, + { + "epoch": 1.7481956696070569, + "grad_norm": 0.6851256489753723, + "learning_rate": 7.826681061032216e-06, + "loss": 0.7731, + "step": 2180 + }, + { + "epoch": 1.7489975942261426, + "grad_norm": 0.6674900054931641, + "learning_rate": 7.818229557251722e-06, + "loss": 0.7865, + "step": 2181 + }, + { + "epoch": 1.7497995188452284, + "grad_norm": 0.693809986114502, + "learning_rate": 7.809779689437011e-06, + "loss": 0.7245, + "step": 2182 + }, + { + "epoch": 1.7506014434643142, + "grad_norm": 0.7122488617897034, + "learning_rate": 7.801331463924076e-06, + "loss": 0.7556, + "step": 2183 + }, + { + "epoch": 1.7514033680834, + "grad_norm": 0.6818574666976929, + "learning_rate": 7.79288488704769e-06, + "loss": 0.7313, + "step": 2184 + }, + { + "epoch": 1.7522052927024858, + "grad_norm": 0.6880291104316711, + "learning_rate": 7.784439965141381e-06, + "loss": 0.7223, + "step": 2185 + }, + { + "epoch": 1.7530072173215716, + "grad_norm": 0.6625512838363647, + "learning_rate": 7.775996704537442e-06, + "loss": 0.7173, + "step": 2186 + }, + { + "epoch": 1.7538091419406576, + "grad_norm": 0.669406533241272, + "learning_rate": 7.767555111566914e-06, + "loss": 0.7263, + "step": 2187 + }, + { + "epoch": 1.7546110665597434, + "grad_norm": 0.6676865816116333, + "learning_rate": 7.759115192559589e-06, + "loss": 0.7081, + "step": 2188 + }, + { + "epoch": 1.7554129911788292, + "grad_norm": 0.6633387804031372, + "learning_rate": 7.750676953844011e-06, + "loss": 0.741, + "step": 2189 + }, + { + "epoch": 1.756214915797915, + "grad_norm": 0.7206395864486694, + "learning_rate": 7.742240401747457e-06, + "loss": 0.7791, + "step": 2190 + }, + { + "epoch": 1.7570168404170008, + "grad_norm": 0.7158024311065674, + "learning_rate": 7.73380554259594e-06, + "loss": 0.7411, + "step": 2191 + }, + { + "epoch": 1.7578187650360866, + "grad_norm": 0.684083104133606, + "learning_rate": 7.725372382714208e-06, + "loss": 0.7193, + "step": 2192 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 0.6862793564796448, + "learning_rate": 7.716940928425724e-06, + "loss": 0.7775, + "step": 2193 + }, + { + "epoch": 1.7594226142742582, + "grad_norm": 0.718550980091095, + "learning_rate": 7.708511186052689e-06, + "loss": 0.7628, + "step": 2194 + }, + { + "epoch": 1.7602245388933442, + "grad_norm": 0.7173058986663818, + "learning_rate": 7.700083161916e-06, + "loss": 0.7639, + "step": 2195 + }, + { + "epoch": 1.76102646351243, + "grad_norm": 0.6749827861785889, + "learning_rate": 7.691656862335288e-06, + "loss": 0.705, + "step": 2196 + }, + { + "epoch": 1.7618283881315158, + "grad_norm": 0.6680623292922974, + "learning_rate": 7.683232293628873e-06, + "loss": 0.7156, + "step": 2197 + }, + { + "epoch": 1.7626303127506016, + "grad_norm": 0.6810491681098938, + "learning_rate": 7.674809462113782e-06, + "loss": 0.7494, + "step": 2198 + }, + { + "epoch": 1.7634322373696873, + "grad_norm": 0.6893939971923828, + "learning_rate": 7.666388374105747e-06, + "loss": 0.7559, + "step": 2199 + }, + { + "epoch": 1.7642341619887731, + "grad_norm": 0.6892242431640625, + "learning_rate": 7.65796903591918e-06, + "loss": 0.7398, + "step": 2200 + }, + { + "epoch": 1.765036086607859, + "grad_norm": 0.6665722131729126, + "learning_rate": 7.649551453867192e-06, + "loss": 0.7275, + "step": 2201 + }, + { + "epoch": 1.7658380112269447, + "grad_norm": 0.7008151412010193, + "learning_rate": 7.641135634261572e-06, + "loss": 0.7379, + "step": 2202 + }, + { + "epoch": 1.7666399358460305, + "grad_norm": 0.6838683485984802, + "learning_rate": 7.632721583412787e-06, + "loss": 0.77, + "step": 2203 + }, + { + "epoch": 1.7674418604651163, + "grad_norm": 0.691834032535553, + "learning_rate": 7.62430930762998e-06, + "loss": 0.7355, + "step": 2204 + }, + { + "epoch": 1.768243785084202, + "grad_norm": 0.6844693422317505, + "learning_rate": 7.615898813220958e-06, + "loss": 0.7353, + "step": 2205 + }, + { + "epoch": 1.769045709703288, + "grad_norm": 0.7072806358337402, + "learning_rate": 7.607490106492205e-06, + "loss": 0.747, + "step": 2206 + }, + { + "epoch": 1.7698476343223737, + "grad_norm": 0.6707396507263184, + "learning_rate": 7.5990831937488476e-06, + "loss": 0.697, + "step": 2207 + }, + { + "epoch": 1.7706495589414595, + "grad_norm": 0.7029158473014832, + "learning_rate": 7.590678081294673e-06, + "loss": 0.7167, + "step": 2208 + }, + { + "epoch": 1.7714514835605453, + "grad_norm": 0.7110798954963684, + "learning_rate": 7.5822747754321315e-06, + "loss": 0.7507, + "step": 2209 + }, + { + "epoch": 1.772253408179631, + "grad_norm": 0.6975316405296326, + "learning_rate": 7.573873282462299e-06, + "loss": 0.7402, + "step": 2210 + }, + { + "epoch": 1.7730553327987169, + "grad_norm": 0.6738576889038086, + "learning_rate": 7.5654736086849056e-06, + "loss": 0.714, + "step": 2211 + }, + { + "epoch": 1.7738572574178026, + "grad_norm": 0.6818029284477234, + "learning_rate": 7.5570757603983115e-06, + "loss": 0.7079, + "step": 2212 + }, + { + "epoch": 1.7746591820368884, + "grad_norm": 0.6604394316673279, + "learning_rate": 7.548679743899505e-06, + "loss": 0.7548, + "step": 2213 + }, + { + "epoch": 1.7754611066559742, + "grad_norm": 0.6959803104400635, + "learning_rate": 7.540285565484114e-06, + "loss": 0.7403, + "step": 2214 + }, + { + "epoch": 1.77626303127506, + "grad_norm": 0.7064805626869202, + "learning_rate": 7.531893231446372e-06, + "loss": 0.7615, + "step": 2215 + }, + { + "epoch": 1.7770649558941458, + "grad_norm": 0.6517053842544556, + "learning_rate": 7.523502748079141e-06, + "loss": 0.7474, + "step": 2216 + }, + { + "epoch": 1.7778668805132316, + "grad_norm": 0.674662172794342, + "learning_rate": 7.51511412167389e-06, + "loss": 0.702, + "step": 2217 + }, + { + "epoch": 1.7786688051323174, + "grad_norm": 0.7000203132629395, + "learning_rate": 7.506727358520693e-06, + "loss": 0.7111, + "step": 2218 + }, + { + "epoch": 1.7794707297514034, + "grad_norm": 0.6659766435623169, + "learning_rate": 7.498342464908237e-06, + "loss": 0.7337, + "step": 2219 + }, + { + "epoch": 1.7802726543704892, + "grad_norm": 0.6771808862686157, + "learning_rate": 7.489959447123797e-06, + "loss": 0.7378, + "step": 2220 + }, + { + "epoch": 1.781074578989575, + "grad_norm": 0.7038045525550842, + "learning_rate": 7.4815783114532485e-06, + "loss": 0.7649, + "step": 2221 + }, + { + "epoch": 1.7818765036086608, + "grad_norm": 0.6613171100616455, + "learning_rate": 7.473199064181048e-06, + "loss": 0.7119, + "step": 2222 + }, + { + "epoch": 1.7826784282277466, + "grad_norm": 0.6629149913787842, + "learning_rate": 7.464821711590242e-06, + "loss": 0.7293, + "step": 2223 + }, + { + "epoch": 1.7834803528468324, + "grad_norm": 0.6879216432571411, + "learning_rate": 7.456446259962455e-06, + "loss": 0.7311, + "step": 2224 + }, + { + "epoch": 1.7842822774659182, + "grad_norm": 0.6925482153892517, + "learning_rate": 7.448072715577885e-06, + "loss": 0.7511, + "step": 2225 + }, + { + "epoch": 1.785084202085004, + "grad_norm": 0.6826873421669006, + "learning_rate": 7.439701084715305e-06, + "loss": 0.7436, + "step": 2226 + }, + { + "epoch": 1.78588612670409, + "grad_norm": 0.7013863325119019, + "learning_rate": 7.431331373652046e-06, + "loss": 0.7159, + "step": 2227 + }, + { + "epoch": 1.7866880513231758, + "grad_norm": 0.6822634935379028, + "learning_rate": 7.422963588663998e-06, + "loss": 0.7404, + "step": 2228 + }, + { + "epoch": 1.7874899759422616, + "grad_norm": 0.6994298100471497, + "learning_rate": 7.414597736025621e-06, + "loss": 0.755, + "step": 2229 + }, + { + "epoch": 1.7882919005613473, + "grad_norm": 0.7196714282035828, + "learning_rate": 7.406233822009904e-06, + "loss": 0.7806, + "step": 2230 + }, + { + "epoch": 1.7890938251804331, + "grad_norm": 0.6684456467628479, + "learning_rate": 7.397871852888405e-06, + "loss": 0.7119, + "step": 2231 + }, + { + "epoch": 1.789895749799519, + "grad_norm": 0.6782661080360413, + "learning_rate": 7.389511834931211e-06, + "loss": 0.7417, + "step": 2232 + }, + { + "epoch": 1.7906976744186047, + "grad_norm": 0.7280923128128052, + "learning_rate": 7.381153774406944e-06, + "loss": 0.7621, + "step": 2233 + }, + { + "epoch": 1.7914995990376905, + "grad_norm": 0.6602609157562256, + "learning_rate": 7.372797677582767e-06, + "loss": 0.7315, + "step": 2234 + }, + { + "epoch": 1.7923015236567763, + "grad_norm": 0.6975564956665039, + "learning_rate": 7.36444355072436e-06, + "loss": 0.7265, + "step": 2235 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 0.6379725933074951, + "learning_rate": 7.356091400095942e-06, + "loss": 0.7065, + "step": 2236 + }, + { + "epoch": 1.793905372894948, + "grad_norm": 0.7008256316184998, + "learning_rate": 7.3477412319602306e-06, + "loss": 0.7275, + "step": 2237 + }, + { + "epoch": 1.7947072975140337, + "grad_norm": 0.6815301775932312, + "learning_rate": 7.339393052578465e-06, + "loss": 0.732, + "step": 2238 + }, + { + "epoch": 1.7955092221331195, + "grad_norm": 0.6972305178642273, + "learning_rate": 7.3310468682104055e-06, + "loss": 0.7292, + "step": 2239 + }, + { + "epoch": 1.7963111467522053, + "grad_norm": 0.6848737597465515, + "learning_rate": 7.322702685114295e-06, + "loss": 0.6968, + "step": 2240 + }, + { + "epoch": 1.797113071371291, + "grad_norm": 0.7986035943031311, + "learning_rate": 7.3143605095468915e-06, + "loss": 0.7678, + "step": 2241 + }, + { + "epoch": 1.7979149959903769, + "grad_norm": 0.6967061758041382, + "learning_rate": 7.30602034776344e-06, + "loss": 0.6936, + "step": 2242 + }, + { + "epoch": 1.7987169206094626, + "grad_norm": 0.6833521723747253, + "learning_rate": 7.297682206017676e-06, + "loss": 0.7218, + "step": 2243 + }, + { + "epoch": 1.7995188452285484, + "grad_norm": 0.6830206513404846, + "learning_rate": 7.289346090561828e-06, + "loss": 0.7502, + "step": 2244 + }, + { + "epoch": 1.8003207698476342, + "grad_norm": 0.68632972240448, + "learning_rate": 7.281012007646595e-06, + "loss": 0.7189, + "step": 2245 + }, + { + "epoch": 1.80112269446672, + "grad_norm": 0.7078197002410889, + "learning_rate": 7.272679963521158e-06, + "loss": 0.7313, + "step": 2246 + }, + { + "epoch": 1.8019246190858058, + "grad_norm": 0.6793120503425598, + "learning_rate": 7.264349964433168e-06, + "loss": 0.7321, + "step": 2247 + }, + { + "epoch": 1.8027265437048916, + "grad_norm": 0.7071113586425781, + "learning_rate": 7.2560220166287355e-06, + "loss": 0.7568, + "step": 2248 + }, + { + "epoch": 1.8035284683239774, + "grad_norm": 0.6845733523368835, + "learning_rate": 7.24769612635245e-06, + "loss": 0.7318, + "step": 2249 + }, + { + "epoch": 1.8043303929430632, + "grad_norm": 0.6932980418205261, + "learning_rate": 7.239372299847338e-06, + "loss": 0.7303, + "step": 2250 + }, + { + "epoch": 1.8051323175621492, + "grad_norm": 0.6790763139724731, + "learning_rate": 7.231050543354894e-06, + "loss": 0.7239, + "step": 2251 + }, + { + "epoch": 1.805934242181235, + "grad_norm": 0.7017188668251038, + "learning_rate": 7.2227308631150535e-06, + "loss": 0.7255, + "step": 2252 + }, + { + "epoch": 1.8067361668003208, + "grad_norm": 0.6675518155097961, + "learning_rate": 7.214413265366194e-06, + "loss": 0.7225, + "step": 2253 + }, + { + "epoch": 1.8075380914194066, + "grad_norm": 0.6851517558097839, + "learning_rate": 7.206097756345135e-06, + "loss": 0.7182, + "step": 2254 + }, + { + "epoch": 1.8083400160384924, + "grad_norm": 0.6712617874145508, + "learning_rate": 7.197784342287125e-06, + "loss": 0.7717, + "step": 2255 + }, + { + "epoch": 1.8091419406575782, + "grad_norm": 0.6820451617240906, + "learning_rate": 7.189473029425852e-06, + "loss": 0.7246, + "step": 2256 + }, + { + "epoch": 1.809943865276664, + "grad_norm": 0.6897710561752319, + "learning_rate": 7.181163823993418e-06, + "loss": 0.7586, + "step": 2257 + }, + { + "epoch": 1.8107457898957497, + "grad_norm": 0.6737632751464844, + "learning_rate": 7.172856732220344e-06, + "loss": 0.71, + "step": 2258 + }, + { + "epoch": 1.8115477145148358, + "grad_norm": 0.6887868046760559, + "learning_rate": 7.164551760335579e-06, + "loss": 0.7375, + "step": 2259 + }, + { + "epoch": 1.8123496391339216, + "grad_norm": 0.6990635395050049, + "learning_rate": 7.156248914566461e-06, + "loss": 0.7322, + "step": 2260 + }, + { + "epoch": 1.8131515637530073, + "grad_norm": 0.6845853924751282, + "learning_rate": 7.147948201138761e-06, + "loss": 0.7243, + "step": 2261 + }, + { + "epoch": 1.8139534883720931, + "grad_norm": 0.7157221436500549, + "learning_rate": 7.139649626276629e-06, + "loss": 0.7328, + "step": 2262 + }, + { + "epoch": 1.814755412991179, + "grad_norm": 0.668306827545166, + "learning_rate": 7.131353196202617e-06, + "loss": 0.737, + "step": 2263 + }, + { + "epoch": 1.8155573376102647, + "grad_norm": 0.6798452138900757, + "learning_rate": 7.123058917137677e-06, + "loss": 0.7033, + "step": 2264 + }, + { + "epoch": 1.8163592622293505, + "grad_norm": 0.7059512138366699, + "learning_rate": 7.114766795301138e-06, + "loss": 0.6999, + "step": 2265 + }, + { + "epoch": 1.8171611868484363, + "grad_norm": 0.6953184604644775, + "learning_rate": 7.106476836910716e-06, + "loss": 0.7199, + "step": 2266 + }, + { + "epoch": 1.817963111467522, + "grad_norm": 0.7047235369682312, + "learning_rate": 7.098189048182504e-06, + "loss": 0.7685, + "step": 2267 + }, + { + "epoch": 1.818765036086608, + "grad_norm": 0.7124036550521851, + "learning_rate": 7.089903435330966e-06, + "loss": 0.7466, + "step": 2268 + }, + { + "epoch": 1.8195669607056937, + "grad_norm": 0.6875273585319519, + "learning_rate": 7.081620004568943e-06, + "loss": 0.7218, + "step": 2269 + }, + { + "epoch": 1.8203688853247795, + "grad_norm": 0.6810701489448547, + "learning_rate": 7.073338762107627e-06, + "loss": 0.7362, + "step": 2270 + }, + { + "epoch": 1.8211708099438653, + "grad_norm": 0.6458592414855957, + "learning_rate": 7.065059714156579e-06, + "loss": 0.7142, + "step": 2271 + }, + { + "epoch": 1.821972734562951, + "grad_norm": 0.6925168037414551, + "learning_rate": 7.0567828669237125e-06, + "loss": 0.7441, + "step": 2272 + }, + { + "epoch": 1.8227746591820368, + "grad_norm": 0.7175741195678711, + "learning_rate": 7.048508226615282e-06, + "loss": 0.72, + "step": 2273 + }, + { + "epoch": 1.8235765838011226, + "grad_norm": 0.6916970610618591, + "learning_rate": 7.040235799435904e-06, + "loss": 0.727, + "step": 2274 + }, + { + "epoch": 1.8243785084202084, + "grad_norm": 0.6771306395530701, + "learning_rate": 7.0319655915885185e-06, + "loss": 0.7692, + "step": 2275 + }, + { + "epoch": 1.8251804330392942, + "grad_norm": 0.7066898941993713, + "learning_rate": 7.023697609274418e-06, + "loss": 0.7244, + "step": 2276 + }, + { + "epoch": 1.82598235765838, + "grad_norm": 0.6827045679092407, + "learning_rate": 7.015431858693209e-06, + "loss": 0.7411, + "step": 2277 + }, + { + "epoch": 1.8267842822774658, + "grad_norm": 0.7030799388885498, + "learning_rate": 7.007168346042832e-06, + "loss": 0.7481, + "step": 2278 + }, + { + "epoch": 1.8275862068965516, + "grad_norm": 0.6946167945861816, + "learning_rate": 6.998907077519561e-06, + "loss": 0.7296, + "step": 2279 + }, + { + "epoch": 1.8283881315156374, + "grad_norm": 0.6715916991233826, + "learning_rate": 6.990648059317961e-06, + "loss": 0.7295, + "step": 2280 + }, + { + "epoch": 1.8291900561347232, + "grad_norm": 0.6531383395195007, + "learning_rate": 6.982391297630939e-06, + "loss": 0.7294, + "step": 2281 + }, + { + "epoch": 1.829991980753809, + "grad_norm": 0.6955968141555786, + "learning_rate": 6.97413679864969e-06, + "loss": 0.7247, + "step": 2282 + }, + { + "epoch": 1.830793905372895, + "grad_norm": 0.7067756652832031, + "learning_rate": 6.965884568563717e-06, + "loss": 0.6878, + "step": 2283 + }, + { + "epoch": 1.8315958299919808, + "grad_norm": 0.6867192983627319, + "learning_rate": 6.957634613560827e-06, + "loss": 0.7231, + "step": 2284 + }, + { + "epoch": 1.8323977546110666, + "grad_norm": 0.6870403289794922, + "learning_rate": 6.94938693982711e-06, + "loss": 0.7236, + "step": 2285 + }, + { + "epoch": 1.8331996792301524, + "grad_norm": 0.687545120716095, + "learning_rate": 6.941141553546963e-06, + "loss": 0.7548, + "step": 2286 + }, + { + "epoch": 1.8340016038492382, + "grad_norm": 0.6433346271514893, + "learning_rate": 6.932898460903052e-06, + "loss": 0.7243, + "step": 2287 + }, + { + "epoch": 1.834803528468324, + "grad_norm": 0.7730824947357178, + "learning_rate": 6.924657668076326e-06, + "loss": 0.7515, + "step": 2288 + }, + { + "epoch": 1.8356054530874097, + "grad_norm": 0.6878124475479126, + "learning_rate": 6.9164191812460194e-06, + "loss": 0.714, + "step": 2289 + }, + { + "epoch": 1.8364073777064955, + "grad_norm": 0.6526556015014648, + "learning_rate": 6.90818300658962e-06, + "loss": 0.701, + "step": 2290 + }, + { + "epoch": 1.8372093023255816, + "grad_norm": 0.6894628405570984, + "learning_rate": 6.899949150282903e-06, + "loss": 0.714, + "step": 2291 + }, + { + "epoch": 1.8380112269446673, + "grad_norm": 0.7038993239402771, + "learning_rate": 6.8917176184998915e-06, + "loss": 0.7532, + "step": 2292 + }, + { + "epoch": 1.8388131515637531, + "grad_norm": 0.6736302375793457, + "learning_rate": 6.883488417412858e-06, + "loss": 0.7281, + "step": 2293 + }, + { + "epoch": 1.839615076182839, + "grad_norm": 0.7072314023971558, + "learning_rate": 6.875261553192352e-06, + "loss": 0.7431, + "step": 2294 + }, + { + "epoch": 1.8404170008019247, + "grad_norm": 0.6677948832511902, + "learning_rate": 6.8670370320071466e-06, + "loss": 0.6828, + "step": 2295 + }, + { + "epoch": 1.8412189254210105, + "grad_norm": 0.6706652641296387, + "learning_rate": 6.858814860024275e-06, + "loss": 0.7471, + "step": 2296 + }, + { + "epoch": 1.8420208500400963, + "grad_norm": 0.6636921763420105, + "learning_rate": 6.850595043408997e-06, + "loss": 0.691, + "step": 2297 + }, + { + "epoch": 1.842822774659182, + "grad_norm": 0.6821714639663696, + "learning_rate": 6.842377588324809e-06, + "loss": 0.7275, + "step": 2298 + }, + { + "epoch": 1.8436246992782679, + "grad_norm": 0.6688547730445862, + "learning_rate": 6.834162500933445e-06, + "loss": 0.7008, + "step": 2299 + }, + { + "epoch": 1.8444266238973537, + "grad_norm": 0.675174355506897, + "learning_rate": 6.825949787394853e-06, + "loss": 0.7175, + "step": 2300 + }, + { + "epoch": 1.8452285485164395, + "grad_norm": 0.6541465520858765, + "learning_rate": 6.817739453867209e-06, + "loss": 0.7273, + "step": 2301 + }, + { + "epoch": 1.8460304731355253, + "grad_norm": 0.6905247569084167, + "learning_rate": 6.809531506506898e-06, + "loss": 0.7551, + "step": 2302 + }, + { + "epoch": 1.846832397754611, + "grad_norm": 0.6996776461601257, + "learning_rate": 6.801325951468514e-06, + "loss": 0.7546, + "step": 2303 + }, + { + "epoch": 1.8476343223736968, + "grad_norm": 0.6827793717384338, + "learning_rate": 6.7931227949048714e-06, + "loss": 0.7418, + "step": 2304 + }, + { + "epoch": 1.8484362469927826, + "grad_norm": 0.6554014682769775, + "learning_rate": 6.784922042966968e-06, + "loss": 0.7051, + "step": 2305 + }, + { + "epoch": 1.8492381716118684, + "grad_norm": 0.6803217530250549, + "learning_rate": 6.776723701804013e-06, + "loss": 0.7335, + "step": 2306 + }, + { + "epoch": 1.8500400962309542, + "grad_norm": 0.6773452758789062, + "learning_rate": 6.768527777563396e-06, + "loss": 0.7053, + "step": 2307 + }, + { + "epoch": 1.85084202085004, + "grad_norm": 0.6819374561309814, + "learning_rate": 6.760334276390707e-06, + "loss": 0.7471, + "step": 2308 + }, + { + "epoch": 1.8516439454691258, + "grad_norm": 0.6662135720252991, + "learning_rate": 6.752143204429709e-06, + "loss": 0.7263, + "step": 2309 + }, + { + "epoch": 1.8524458700882116, + "grad_norm": 0.7113257646560669, + "learning_rate": 6.7439545678223404e-06, + "loss": 0.7027, + "step": 2310 + }, + { + "epoch": 1.8532477947072974, + "grad_norm": 0.697996973991394, + "learning_rate": 6.735768372708731e-06, + "loss": 0.7514, + "step": 2311 + }, + { + "epoch": 1.8540497193263832, + "grad_norm": 0.6723498702049255, + "learning_rate": 6.727584625227159e-06, + "loss": 0.7454, + "step": 2312 + }, + { + "epoch": 1.854851643945469, + "grad_norm": 0.6865621209144592, + "learning_rate": 6.719403331514085e-06, + "loss": 0.7167, + "step": 2313 + }, + { + "epoch": 1.8556535685645548, + "grad_norm": 0.6741671562194824, + "learning_rate": 6.711224497704116e-06, + "loss": 0.7182, + "step": 2314 + }, + { + "epoch": 1.8564554931836408, + "grad_norm": 0.6949282884597778, + "learning_rate": 6.703048129930019e-06, + "loss": 0.7246, + "step": 2315 + }, + { + "epoch": 1.8572574178027266, + "grad_norm": 0.6716841459274292, + "learning_rate": 6.694874234322719e-06, + "loss": 0.7259, + "step": 2316 + }, + { + "epoch": 1.8580593424218124, + "grad_norm": 0.7084620594978333, + "learning_rate": 6.686702817011277e-06, + "loss": 0.7328, + "step": 2317 + }, + { + "epoch": 1.8588612670408982, + "grad_norm": 0.6928534507751465, + "learning_rate": 6.678533884122904e-06, + "loss": 0.73, + "step": 2318 + }, + { + "epoch": 1.859663191659984, + "grad_norm": 0.6859990358352661, + "learning_rate": 6.670367441782941e-06, + "loss": 0.6775, + "step": 2319 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 0.7146435379981995, + "learning_rate": 6.66220349611486e-06, + "loss": 0.7571, + "step": 2320 + }, + { + "epoch": 1.8612670408981555, + "grad_norm": 0.702168345451355, + "learning_rate": 6.654042053240275e-06, + "loss": 0.7387, + "step": 2321 + }, + { + "epoch": 1.8620689655172413, + "grad_norm": 0.688408613204956, + "learning_rate": 6.645883119278906e-06, + "loss": 0.7394, + "step": 2322 + }, + { + "epoch": 1.8628708901363273, + "grad_norm": 0.6995466351509094, + "learning_rate": 6.637726700348606e-06, + "loss": 0.7156, + "step": 2323 + }, + { + "epoch": 1.8636728147554131, + "grad_norm": 0.6815131306648254, + "learning_rate": 6.629572802565332e-06, + "loss": 0.715, + "step": 2324 + }, + { + "epoch": 1.864474739374499, + "grad_norm": 0.6656938195228577, + "learning_rate": 6.6214214320431534e-06, + "loss": 0.7109, + "step": 2325 + }, + { + "epoch": 1.8652766639935847, + "grad_norm": 0.6970621943473816, + "learning_rate": 6.613272594894248e-06, + "loss": 0.7439, + "step": 2326 + }, + { + "epoch": 1.8660785886126705, + "grad_norm": 0.6916574835777283, + "learning_rate": 6.605126297228886e-06, + "loss": 0.7338, + "step": 2327 + }, + { + "epoch": 1.8668805132317563, + "grad_norm": 0.66231769323349, + "learning_rate": 6.596982545155447e-06, + "loss": 0.7179, + "step": 2328 + }, + { + "epoch": 1.867682437850842, + "grad_norm": 0.7257800102233887, + "learning_rate": 6.5888413447803905e-06, + "loss": 0.7485, + "step": 2329 + }, + { + "epoch": 1.8684843624699279, + "grad_norm": 0.6953336000442505, + "learning_rate": 6.580702702208261e-06, + "loss": 0.7652, + "step": 2330 + }, + { + "epoch": 1.8692862870890137, + "grad_norm": 0.6823389530181885, + "learning_rate": 6.572566623541697e-06, + "loss": 0.7024, + "step": 2331 + }, + { + "epoch": 1.8700882117080995, + "grad_norm": 0.6590924859046936, + "learning_rate": 6.5644331148814e-06, + "loss": 0.7128, + "step": 2332 + }, + { + "epoch": 1.8708901363271853, + "grad_norm": 0.6826373934745789, + "learning_rate": 6.55630218232616e-06, + "loss": 0.7213, + "step": 2333 + }, + { + "epoch": 1.871692060946271, + "grad_norm": 0.6541956663131714, + "learning_rate": 6.548173831972824e-06, + "loss": 0.6984, + "step": 2334 + }, + { + "epoch": 1.8724939855653568, + "grad_norm": 0.6662783622741699, + "learning_rate": 6.540048069916301e-06, + "loss": 0.7364, + "step": 2335 + }, + { + "epoch": 1.8732959101844426, + "grad_norm": 0.6726429462432861, + "learning_rate": 6.5319249022495715e-06, + "loss": 0.7371, + "step": 2336 + }, + { + "epoch": 1.8740978348035284, + "grad_norm": 0.6762657165527344, + "learning_rate": 6.523804335063655e-06, + "loss": 0.7046, + "step": 2337 + }, + { + "epoch": 1.8748997594226142, + "grad_norm": 0.7136995196342468, + "learning_rate": 6.515686374447641e-06, + "loss": 0.7201, + "step": 2338 + }, + { + "epoch": 1.8757016840417, + "grad_norm": 0.7213540077209473, + "learning_rate": 6.507571026488644e-06, + "loss": 0.7674, + "step": 2339 + }, + { + "epoch": 1.8765036086607858, + "grad_norm": 0.7228121161460876, + "learning_rate": 6.499458297271826e-06, + "loss": 0.7653, + "step": 2340 + }, + { + "epoch": 1.8773055332798716, + "grad_norm": 0.6905105113983154, + "learning_rate": 6.491348192880395e-06, + "loss": 0.7133, + "step": 2341 + }, + { + "epoch": 1.8781074578989574, + "grad_norm": 0.6937234997749329, + "learning_rate": 6.48324071939558e-06, + "loss": 0.7293, + "step": 2342 + }, + { + "epoch": 1.8789093825180432, + "grad_norm": 0.7513543963432312, + "learning_rate": 6.4751358828966415e-06, + "loss": 0.7285, + "step": 2343 + }, + { + "epoch": 1.879711307137129, + "grad_norm": 0.6793731451034546, + "learning_rate": 6.467033689460863e-06, + "loss": 0.7007, + "step": 2344 + }, + { + "epoch": 1.8805132317562148, + "grad_norm": 0.6910126805305481, + "learning_rate": 6.458934145163539e-06, + "loss": 0.7152, + "step": 2345 + }, + { + "epoch": 1.8813151563753006, + "grad_norm": 0.7317004203796387, + "learning_rate": 6.450837256077993e-06, + "loss": 0.7716, + "step": 2346 + }, + { + "epoch": 1.8821170809943866, + "grad_norm": 0.6594632863998413, + "learning_rate": 6.44274302827554e-06, + "loss": 0.7189, + "step": 2347 + }, + { + "epoch": 1.8829190056134724, + "grad_norm": 0.654815673828125, + "learning_rate": 6.434651467825515e-06, + "loss": 0.714, + "step": 2348 + }, + { + "epoch": 1.8837209302325582, + "grad_norm": 0.7162003517150879, + "learning_rate": 6.426562580795242e-06, + "loss": 0.7311, + "step": 2349 + }, + { + "epoch": 1.884522854851644, + "grad_norm": 0.6873356103897095, + "learning_rate": 6.4184763732500376e-06, + "loss": 0.7173, + "step": 2350 + }, + { + "epoch": 1.8853247794707297, + "grad_norm": 0.6748940944671631, + "learning_rate": 6.410392851253229e-06, + "loss": 0.7156, + "step": 2351 + }, + { + "epoch": 1.8861267040898155, + "grad_norm": 0.6667020916938782, + "learning_rate": 6.402312020866102e-06, + "loss": 0.7354, + "step": 2352 + }, + { + "epoch": 1.8869286287089013, + "grad_norm": 0.666428804397583, + "learning_rate": 6.39423388814795e-06, + "loss": 0.7357, + "step": 2353 + }, + { + "epoch": 1.8877305533279871, + "grad_norm": 0.6650567054748535, + "learning_rate": 6.386158459156029e-06, + "loss": 0.718, + "step": 2354 + }, + { + "epoch": 1.8885324779470731, + "grad_norm": 0.7268814444541931, + "learning_rate": 6.378085739945566e-06, + "loss": 0.7532, + "step": 2355 + }, + { + "epoch": 1.889334402566159, + "grad_norm": 0.696033239364624, + "learning_rate": 6.3700157365697655e-06, + "loss": 0.7387, + "step": 2356 + }, + { + "epoch": 1.8901363271852447, + "grad_norm": 0.7350199818611145, + "learning_rate": 6.361948455079785e-06, + "loss": 0.7662, + "step": 2357 + }, + { + "epoch": 1.8909382518043305, + "grad_norm": 0.6738780736923218, + "learning_rate": 6.353883901524756e-06, + "loss": 0.7182, + "step": 2358 + }, + { + "epoch": 1.8917401764234163, + "grad_norm": 0.7525630593299866, + "learning_rate": 6.34582208195175e-06, + "loss": 0.7417, + "step": 2359 + }, + { + "epoch": 1.892542101042502, + "grad_norm": 0.6829856038093567, + "learning_rate": 6.337763002405792e-06, + "loss": 0.7616, + "step": 2360 + }, + { + "epoch": 1.8933440256615879, + "grad_norm": 0.6920203566551208, + "learning_rate": 6.329706668929861e-06, + "loss": 0.7149, + "step": 2361 + }, + { + "epoch": 1.8941459502806737, + "grad_norm": 0.7000799775123596, + "learning_rate": 6.321653087564861e-06, + "loss": 0.754, + "step": 2362 + }, + { + "epoch": 1.8949478748997595, + "grad_norm": 0.6926515102386475, + "learning_rate": 6.31360226434965e-06, + "loss": 0.7099, + "step": 2363 + }, + { + "epoch": 1.8957497995188453, + "grad_norm": 0.6759518384933472, + "learning_rate": 6.305554205321005e-06, + "loss": 0.7287, + "step": 2364 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 0.6605546474456787, + "learning_rate": 6.297508916513636e-06, + "loss": 0.7242, + "step": 2365 + }, + { + "epoch": 1.8973536487570168, + "grad_norm": 0.6701246500015259, + "learning_rate": 6.289466403960175e-06, + "loss": 0.7318, + "step": 2366 + }, + { + "epoch": 1.8981555733761026, + "grad_norm": 0.6828826069831848, + "learning_rate": 6.281426673691169e-06, + "loss": 0.7054, + "step": 2367 + }, + { + "epoch": 1.8989574979951884, + "grad_norm": 0.6410530209541321, + "learning_rate": 6.273389731735087e-06, + "loss": 0.7241, + "step": 2368 + }, + { + "epoch": 1.8997594226142742, + "grad_norm": 0.7067154049873352, + "learning_rate": 6.265355584118297e-06, + "loss": 0.7071, + "step": 2369 + }, + { + "epoch": 1.90056134723336, + "grad_norm": 0.698462724685669, + "learning_rate": 6.257324236865074e-06, + "loss": 0.723, + "step": 2370 + }, + { + "epoch": 1.9013632718524458, + "grad_norm": 0.747157096862793, + "learning_rate": 6.249295695997604e-06, + "loss": 0.755, + "step": 2371 + }, + { + "epoch": 1.9021651964715316, + "grad_norm": 0.7117529511451721, + "learning_rate": 6.241269967535955e-06, + "loss": 0.71, + "step": 2372 + }, + { + "epoch": 1.9029671210906174, + "grad_norm": 0.7347584962844849, + "learning_rate": 6.233247057498093e-06, + "loss": 0.7238, + "step": 2373 + }, + { + "epoch": 1.9037690457097032, + "grad_norm": 0.6999946236610413, + "learning_rate": 6.225226971899869e-06, + "loss": 0.7107, + "step": 2374 + }, + { + "epoch": 1.904570970328789, + "grad_norm": 0.7275912761688232, + "learning_rate": 6.217209716755013e-06, + "loss": 0.7432, + "step": 2375 + }, + { + "epoch": 1.9053728949478748, + "grad_norm": 0.6637576222419739, + "learning_rate": 6.2091952980751414e-06, + "loss": 0.6802, + "step": 2376 + }, + { + "epoch": 1.9061748195669606, + "grad_norm": 0.7044709920883179, + "learning_rate": 6.201183721869735e-06, + "loss": 0.7396, + "step": 2377 + }, + { + "epoch": 1.9069767441860463, + "grad_norm": 0.6789054870605469, + "learning_rate": 6.193174994146148e-06, + "loss": 0.7369, + "step": 2378 + }, + { + "epoch": 1.9077786688051324, + "grad_norm": 0.6822087168693542, + "learning_rate": 6.185169120909598e-06, + "loss": 0.7293, + "step": 2379 + }, + { + "epoch": 1.9085805934242182, + "grad_norm": 0.7218993306159973, + "learning_rate": 6.177166108163155e-06, + "loss": 0.7156, + "step": 2380 + }, + { + "epoch": 1.909382518043304, + "grad_norm": 0.6758652329444885, + "learning_rate": 6.169165961907762e-06, + "loss": 0.7279, + "step": 2381 + }, + { + "epoch": 1.9101844426623897, + "grad_norm": 0.7355567216873169, + "learning_rate": 6.1611686881421875e-06, + "loss": 0.7328, + "step": 2382 + }, + { + "epoch": 1.9109863672814755, + "grad_norm": 0.7142036557197571, + "learning_rate": 6.153174292863071e-06, + "loss": 0.7094, + "step": 2383 + }, + { + "epoch": 1.9117882919005613, + "grad_norm": 0.7164692282676697, + "learning_rate": 6.145182782064879e-06, + "loss": 0.7163, + "step": 2384 + }, + { + "epoch": 1.9125902165196471, + "grad_norm": 0.6600916981697083, + "learning_rate": 6.137194161739915e-06, + "loss": 0.6753, + "step": 2385 + }, + { + "epoch": 1.913392141138733, + "grad_norm": 0.7095491290092468, + "learning_rate": 6.129208437878324e-06, + "loss": 0.7447, + "step": 2386 + }, + { + "epoch": 1.914194065757819, + "grad_norm": 0.6852803230285645, + "learning_rate": 6.121225616468065e-06, + "loss": 0.6973, + "step": 2387 + }, + { + "epoch": 1.9149959903769047, + "grad_norm": 0.6977118253707886, + "learning_rate": 6.113245703494941e-06, + "loss": 0.7793, + "step": 2388 + }, + { + "epoch": 1.9157979149959905, + "grad_norm": 0.6602482199668884, + "learning_rate": 6.105268704942555e-06, + "loss": 0.7237, + "step": 2389 + }, + { + "epoch": 1.9165998396150763, + "grad_norm": 0.7034119963645935, + "learning_rate": 6.097294626792334e-06, + "loss": 0.7226, + "step": 2390 + }, + { + "epoch": 1.917401764234162, + "grad_norm": 0.6831420063972473, + "learning_rate": 6.0893234750235145e-06, + "loss": 0.73, + "step": 2391 + }, + { + "epoch": 1.9182036888532479, + "grad_norm": 0.7341967225074768, + "learning_rate": 6.0813552556131315e-06, + "loss": 0.7314, + "step": 2392 + }, + { + "epoch": 1.9190056134723337, + "grad_norm": 0.7385361194610596, + "learning_rate": 6.073389974536037e-06, + "loss": 0.7362, + "step": 2393 + }, + { + "epoch": 1.9198075380914195, + "grad_norm": 0.6924091577529907, + "learning_rate": 6.065427637764865e-06, + "loss": 0.742, + "step": 2394 + }, + { + "epoch": 1.9206094627105053, + "grad_norm": 0.6444892883300781, + "learning_rate": 6.0574682512700444e-06, + "loss": 0.6972, + "step": 2395 + }, + { + "epoch": 1.921411387329591, + "grad_norm": 0.7088480591773987, + "learning_rate": 6.0495118210197975e-06, + "loss": 0.7426, + "step": 2396 + }, + { + "epoch": 1.9222133119486768, + "grad_norm": 0.7047684788703918, + "learning_rate": 6.041558352980126e-06, + "loss": 0.7335, + "step": 2397 + }, + { + "epoch": 1.9230152365677626, + "grad_norm": 0.7246830463409424, + "learning_rate": 6.033607853114813e-06, + "loss": 0.802, + "step": 2398 + }, + { + "epoch": 1.9238171611868484, + "grad_norm": 0.6952186822891235, + "learning_rate": 6.025660327385412e-06, + "loss": 0.7143, + "step": 2399 + }, + { + "epoch": 1.9246190858059342, + "grad_norm": 0.6386004090309143, + "learning_rate": 6.017715781751243e-06, + "loss": 0.7112, + "step": 2400 + }, + { + "epoch": 1.92542101042502, + "grad_norm": 0.6913342475891113, + "learning_rate": 6.009774222169409e-06, + "loss": 0.7377, + "step": 2401 + }, + { + "epoch": 1.9262229350441058, + "grad_norm": 0.6988136768341064, + "learning_rate": 6.001835654594751e-06, + "loss": 0.7444, + "step": 2402 + }, + { + "epoch": 1.9270248596631916, + "grad_norm": 0.7216395735740662, + "learning_rate": 5.993900084979884e-06, + "loss": 0.7516, + "step": 2403 + }, + { + "epoch": 1.9278267842822774, + "grad_norm": 0.6848301887512207, + "learning_rate": 5.985967519275167e-06, + "loss": 0.6978, + "step": 2404 + }, + { + "epoch": 1.9286287089013632, + "grad_norm": 0.6854767203330994, + "learning_rate": 5.978037963428702e-06, + "loss": 0.7278, + "step": 2405 + }, + { + "epoch": 1.929430633520449, + "grad_norm": 0.6917245984077454, + "learning_rate": 5.970111423386349e-06, + "loss": 0.7319, + "step": 2406 + }, + { + "epoch": 1.9302325581395348, + "grad_norm": 0.7103894948959351, + "learning_rate": 5.962187905091692e-06, + "loss": 0.7181, + "step": 2407 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 0.6659692525863647, + "learning_rate": 5.954267414486057e-06, + "loss": 0.7213, + "step": 2408 + }, + { + "epoch": 1.9318364073777063, + "grad_norm": 0.7113282084465027, + "learning_rate": 5.946349957508499e-06, + "loss": 0.7317, + "step": 2409 + }, + { + "epoch": 1.9326383319967921, + "grad_norm": 0.6863912343978882, + "learning_rate": 5.93843554009579e-06, + "loss": 0.6955, + "step": 2410 + }, + { + "epoch": 1.9334402566158782, + "grad_norm": 0.6813226938247681, + "learning_rate": 5.930524168182441e-06, + "loss": 0.7199, + "step": 2411 + }, + { + "epoch": 1.934242181234964, + "grad_norm": 0.6929488778114319, + "learning_rate": 5.922615847700655e-06, + "loss": 0.7189, + "step": 2412 + }, + { + "epoch": 1.9350441058540497, + "grad_norm": 0.6780677437782288, + "learning_rate": 5.91471058458037e-06, + "loss": 0.7502, + "step": 2413 + }, + { + "epoch": 1.9358460304731355, + "grad_norm": 0.6948460340499878, + "learning_rate": 5.90680838474922e-06, + "loss": 0.6846, + "step": 2414 + }, + { + "epoch": 1.9366479550922213, + "grad_norm": 0.6975181698799133, + "learning_rate": 5.898909254132539e-06, + "loss": 0.6991, + "step": 2415 + }, + { + "epoch": 1.937449879711307, + "grad_norm": 0.7138963937759399, + "learning_rate": 5.891013198653368e-06, + "loss": 0.7178, + "step": 2416 + }, + { + "epoch": 1.938251804330393, + "grad_norm": 0.7381751537322998, + "learning_rate": 5.8831202242324345e-06, + "loss": 0.7485, + "step": 2417 + }, + { + "epoch": 1.9390537289494787, + "grad_norm": 0.6692777872085571, + "learning_rate": 5.875230336788167e-06, + "loss": 0.7372, + "step": 2418 + }, + { + "epoch": 1.9398556535685647, + "grad_norm": 0.699887216091156, + "learning_rate": 5.8673435422366656e-06, + "loss": 0.7446, + "step": 2419 + }, + { + "epoch": 1.9406575781876505, + "grad_norm": 0.6614972352981567, + "learning_rate": 5.859459846491718e-06, + "loss": 0.7047, + "step": 2420 + }, + { + "epoch": 1.9414595028067363, + "grad_norm": 0.6647498607635498, + "learning_rate": 5.85157925546479e-06, + "loss": 0.7272, + "step": 2421 + }, + { + "epoch": 1.942261427425822, + "grad_norm": 0.6981691122055054, + "learning_rate": 5.843701775065011e-06, + "loss": 0.7061, + "step": 2422 + }, + { + "epoch": 1.9430633520449079, + "grad_norm": 0.7187725305557251, + "learning_rate": 5.835827411199194e-06, + "loss": 0.7676, + "step": 2423 + }, + { + "epoch": 1.9438652766639937, + "grad_norm": 0.6973015069961548, + "learning_rate": 5.8279561697718025e-06, + "loss": 0.7354, + "step": 2424 + }, + { + "epoch": 1.9446672012830795, + "grad_norm": 0.69856858253479, + "learning_rate": 5.8200880566849535e-06, + "loss": 0.7591, + "step": 2425 + }, + { + "epoch": 1.9454691259021653, + "grad_norm": 0.674909770488739, + "learning_rate": 5.812223077838433e-06, + "loss": 0.7067, + "step": 2426 + }, + { + "epoch": 1.946271050521251, + "grad_norm": 0.6738868951797485, + "learning_rate": 5.804361239129668e-06, + "loss": 0.726, + "step": 2427 + }, + { + "epoch": 1.9470729751403368, + "grad_norm": 0.6930453777313232, + "learning_rate": 5.7965025464537336e-06, + "loss": 0.7368, + "step": 2428 + }, + { + "epoch": 1.9478748997594226, + "grad_norm": 0.6769317388534546, + "learning_rate": 5.788647005703349e-06, + "loss": 0.7118, + "step": 2429 + }, + { + "epoch": 1.9486768243785084, + "grad_norm": 0.6883741021156311, + "learning_rate": 5.780794622768859e-06, + "loss": 0.7179, + "step": 2430 + }, + { + "epoch": 1.9494787489975942, + "grad_norm": 0.6916329860687256, + "learning_rate": 5.77294540353825e-06, + "loss": 0.7072, + "step": 2431 + }, + { + "epoch": 1.95028067361668, + "grad_norm": 0.6634646654129028, + "learning_rate": 5.765099353897136e-06, + "loss": 0.7044, + "step": 2432 + }, + { + "epoch": 1.9510825982357658, + "grad_norm": 0.6738753318786621, + "learning_rate": 5.7572564797287525e-06, + "loss": 0.7317, + "step": 2433 + }, + { + "epoch": 1.9518845228548516, + "grad_norm": 0.6909292340278625, + "learning_rate": 5.749416786913954e-06, + "loss": 0.7059, + "step": 2434 + }, + { + "epoch": 1.9526864474739374, + "grad_norm": 0.6980206966400146, + "learning_rate": 5.741580281331204e-06, + "loss": 0.7367, + "step": 2435 + }, + { + "epoch": 1.9534883720930232, + "grad_norm": 0.7224229574203491, + "learning_rate": 5.733746968856585e-06, + "loss": 0.6925, + "step": 2436 + }, + { + "epoch": 1.954290296712109, + "grad_norm": 0.680873692035675, + "learning_rate": 5.7259168553637815e-06, + "loss": 0.7288, + "step": 2437 + }, + { + "epoch": 1.9550922213311948, + "grad_norm": 0.6676924824714661, + "learning_rate": 5.718089946724078e-06, + "loss": 0.6918, + "step": 2438 + }, + { + "epoch": 1.9558941459502805, + "grad_norm": 0.7096678614616394, + "learning_rate": 5.710266248806363e-06, + "loss": 0.7167, + "step": 2439 + }, + { + "epoch": 1.9566960705693663, + "grad_norm": 0.6810303926467896, + "learning_rate": 5.702445767477103e-06, + "loss": 0.6996, + "step": 2440 + }, + { + "epoch": 1.9574979951884521, + "grad_norm": 0.6997901201248169, + "learning_rate": 5.6946285086003636e-06, + "loss": 0.7609, + "step": 2441 + }, + { + "epoch": 1.958299919807538, + "grad_norm": 0.7114957571029663, + "learning_rate": 5.686814478037795e-06, + "loss": 0.7472, + "step": 2442 + }, + { + "epoch": 1.959101844426624, + "grad_norm": 0.7095164656639099, + "learning_rate": 5.679003681648625e-06, + "loss": 0.6977, + "step": 2443 + }, + { + "epoch": 1.9599037690457097, + "grad_norm": 0.7158694267272949, + "learning_rate": 5.671196125289647e-06, + "loss": 0.7513, + "step": 2444 + }, + { + "epoch": 1.9607056936647955, + "grad_norm": 0.6895703673362732, + "learning_rate": 5.663391814815238e-06, + "loss": 0.7194, + "step": 2445 + }, + { + "epoch": 1.9615076182838813, + "grad_norm": 0.6775211095809937, + "learning_rate": 5.655590756077334e-06, + "loss": 0.7336, + "step": 2446 + }, + { + "epoch": 1.962309542902967, + "grad_norm": 0.7095913290977478, + "learning_rate": 5.647792954925435e-06, + "loss": 0.7276, + "step": 2447 + }, + { + "epoch": 1.963111467522053, + "grad_norm": 0.6791907548904419, + "learning_rate": 5.639998417206602e-06, + "loss": 0.7091, + "step": 2448 + }, + { + "epoch": 1.9639133921411387, + "grad_norm": 0.7308151125907898, + "learning_rate": 5.632207148765438e-06, + "loss": 0.7246, + "step": 2449 + }, + { + "epoch": 1.9647153167602245, + "grad_norm": 0.7150318026542664, + "learning_rate": 5.6244191554441045e-06, + "loss": 0.7325, + "step": 2450 + }, + { + "epoch": 1.9655172413793105, + "grad_norm": 0.6908929347991943, + "learning_rate": 5.616634443082303e-06, + "loss": 0.7073, + "step": 2451 + }, + { + "epoch": 1.9663191659983963, + "grad_norm": 0.7129378318786621, + "learning_rate": 5.608853017517277e-06, + "loss": 0.6903, + "step": 2452 + }, + { + "epoch": 1.967121090617482, + "grad_norm": 0.6859320998191833, + "learning_rate": 5.601074884583809e-06, + "loss": 0.7296, + "step": 2453 + }, + { + "epoch": 1.9679230152365679, + "grad_norm": 0.6935213208198547, + "learning_rate": 5.593300050114199e-06, + "loss": 0.7437, + "step": 2454 + }, + { + "epoch": 1.9687249398556537, + "grad_norm": 0.6933846473693848, + "learning_rate": 5.585528519938288e-06, + "loss": 0.7118, + "step": 2455 + }, + { + "epoch": 1.9695268644747395, + "grad_norm": 0.6689132452011108, + "learning_rate": 5.5777602998834345e-06, + "loss": 0.7091, + "step": 2456 + }, + { + "epoch": 1.9703287890938253, + "grad_norm": 0.6558547616004944, + "learning_rate": 5.569995395774508e-06, + "loss": 0.6749, + "step": 2457 + }, + { + "epoch": 1.971130713712911, + "grad_norm": 0.680107057094574, + "learning_rate": 5.562233813433909e-06, + "loss": 0.7316, + "step": 2458 + }, + { + "epoch": 1.9719326383319968, + "grad_norm": 0.6814321875572205, + "learning_rate": 5.5544755586815265e-06, + "loss": 0.7284, + "step": 2459 + }, + { + "epoch": 1.9727345629510826, + "grad_norm": 0.6948514580726624, + "learning_rate": 5.546720637334769e-06, + "loss": 0.7091, + "step": 2460 + }, + { + "epoch": 1.9735364875701684, + "grad_norm": 0.6681773066520691, + "learning_rate": 5.538969055208543e-06, + "loss": 0.7373, + "step": 2461 + }, + { + "epoch": 1.9743384121892542, + "grad_norm": 0.7033309936523438, + "learning_rate": 5.5312208181152376e-06, + "loss": 0.7387, + "step": 2462 + }, + { + "epoch": 1.97514033680834, + "grad_norm": 0.6964126229286194, + "learning_rate": 5.523475931864759e-06, + "loss": 0.7446, + "step": 2463 + }, + { + "epoch": 1.9759422614274258, + "grad_norm": 0.6732887029647827, + "learning_rate": 5.515734402264478e-06, + "loss": 0.6558, + "step": 2464 + }, + { + "epoch": 1.9767441860465116, + "grad_norm": 0.6745656728744507, + "learning_rate": 5.5079962351192585e-06, + "loss": 0.7085, + "step": 2465 + }, + { + "epoch": 1.9775461106655974, + "grad_norm": 0.7190232276916504, + "learning_rate": 5.500261436231447e-06, + "loss": 0.7126, + "step": 2466 + }, + { + "epoch": 1.9783480352846832, + "grad_norm": 0.6871313452720642, + "learning_rate": 5.4925300114008465e-06, + "loss": 0.7176, + "step": 2467 + }, + { + "epoch": 1.979149959903769, + "grad_norm": 0.6837944388389587, + "learning_rate": 5.4848019664247575e-06, + "loss": 0.7013, + "step": 2468 + }, + { + "epoch": 1.9799518845228548, + "grad_norm": 0.706548810005188, + "learning_rate": 5.4770773070979225e-06, + "loss": 0.733, + "step": 2469 + }, + { + "epoch": 1.9807538091419405, + "grad_norm": 0.682320773601532, + "learning_rate": 5.469356039212557e-06, + "loss": 0.7227, + "step": 2470 + }, + { + "epoch": 1.9815557337610263, + "grad_norm": 0.7176364660263062, + "learning_rate": 5.461638168558332e-06, + "loss": 0.7324, + "step": 2471 + }, + { + "epoch": 1.9823576583801121, + "grad_norm": 0.6608320474624634, + "learning_rate": 5.453923700922366e-06, + "loss": 0.7153, + "step": 2472 + }, + { + "epoch": 1.983159582999198, + "grad_norm": 0.6956177353858948, + "learning_rate": 5.446212642089228e-06, + "loss": 0.7022, + "step": 2473 + }, + { + "epoch": 1.9839615076182837, + "grad_norm": 0.6896581053733826, + "learning_rate": 5.4385049978409385e-06, + "loss": 0.7185, + "step": 2474 + }, + { + "epoch": 1.9847634322373697, + "grad_norm": 0.7071901559829712, + "learning_rate": 5.430800773956948e-06, + "loss": 0.7393, + "step": 2475 + }, + { + "epoch": 1.9855653568564555, + "grad_norm": 0.7110061645507812, + "learning_rate": 5.42309997621415e-06, + "loss": 0.7563, + "step": 2476 + }, + { + "epoch": 1.9863672814755413, + "grad_norm": 0.7318345904350281, + "learning_rate": 5.415402610386859e-06, + "loss": 0.7583, + "step": 2477 + }, + { + "epoch": 1.987169206094627, + "grad_norm": 0.6681869029998779, + "learning_rate": 5.407708682246825e-06, + "loss": 0.6964, + "step": 2478 + }, + { + "epoch": 1.987971130713713, + "grad_norm": 0.6897268891334534, + "learning_rate": 5.400018197563217e-06, + "loss": 0.7301, + "step": 2479 + }, + { + "epoch": 1.9887730553327987, + "grad_norm": 0.6932487487792969, + "learning_rate": 5.392331162102622e-06, + "loss": 0.6904, + "step": 2480 + }, + { + "epoch": 1.9895749799518845, + "grad_norm": 0.7004687786102295, + "learning_rate": 5.384647581629045e-06, + "loss": 0.7364, + "step": 2481 + }, + { + "epoch": 1.9903769045709703, + "grad_norm": 0.6893764734268188, + "learning_rate": 5.37696746190389e-06, + "loss": 0.7311, + "step": 2482 + }, + { + "epoch": 1.9911788291900563, + "grad_norm": 0.7490194439888, + "learning_rate": 5.369290808685975e-06, + "loss": 0.7163, + "step": 2483 + }, + { + "epoch": 1.991980753809142, + "grad_norm": 0.7016685009002686, + "learning_rate": 5.3616176277315164e-06, + "loss": 0.7446, + "step": 2484 + }, + { + "epoch": 1.9927826784282279, + "grad_norm": 0.7102388739585876, + "learning_rate": 5.353947924794129e-06, + "loss": 0.7777, + "step": 2485 + }, + { + "epoch": 1.9935846030473137, + "grad_norm": 0.707472026348114, + "learning_rate": 5.346281705624812e-06, + "loss": 0.7304, + "step": 2486 + }, + { + "epoch": 1.9943865276663995, + "grad_norm": 0.6962066888809204, + "learning_rate": 5.33861897597196e-06, + "loss": 0.7545, + "step": 2487 + }, + { + "epoch": 1.9951884522854852, + "grad_norm": 0.684525191783905, + "learning_rate": 5.330959741581347e-06, + "loss": 0.712, + "step": 2488 + }, + { + "epoch": 1.995990376904571, + "grad_norm": 0.6817164421081543, + "learning_rate": 5.323304008196133e-06, + "loss": 0.7334, + "step": 2489 + }, + { + "epoch": 1.9967923015236568, + "grad_norm": 0.6918975114822388, + "learning_rate": 5.3156517815568455e-06, + "loss": 0.7334, + "step": 2490 + }, + { + "epoch": 1.9975942261427426, + "grad_norm": 0.6976943612098694, + "learning_rate": 5.30800306740138e-06, + "loss": 0.7255, + "step": 2491 + }, + { + "epoch": 1.9983961507618284, + "grad_norm": 0.7042475938796997, + "learning_rate": 5.300357871465007e-06, + "loss": 0.7338, + "step": 2492 + }, + { + "epoch": 1.9991980753809142, + "grad_norm": 0.6709238290786743, + "learning_rate": 5.292716199480354e-06, + "loss": 0.7195, + "step": 2493 + }, + { + "epoch": 2.0, + "grad_norm": 0.648729145526886, + "learning_rate": 5.285078057177406e-06, + "loss": 0.6162, + "step": 2494 + }, + { + "epoch": 2.000801924619086, + "grad_norm": 0.6973950862884521, + "learning_rate": 5.277443450283508e-06, + "loss": 0.5518, + "step": 2495 + }, + { + "epoch": 2.0016038492381716, + "grad_norm": 0.6707605123519897, + "learning_rate": 5.269812384523341e-06, + "loss": 0.5418, + "step": 2496 + }, + { + "epoch": 2.0024057738572574, + "grad_norm": 0.6641839146614075, + "learning_rate": 5.262184865618938e-06, + "loss": 0.5272, + "step": 2497 + }, + { + "epoch": 2.003207698476343, + "grad_norm": 0.703292191028595, + "learning_rate": 5.254560899289679e-06, + "loss": 0.5537, + "step": 2498 + }, + { + "epoch": 2.004009623095429, + "grad_norm": 0.7050741314888, + "learning_rate": 5.246940491252263e-06, + "loss": 0.5197, + "step": 2499 + }, + { + "epoch": 2.0048115477145148, + "grad_norm": 0.7010351419448853, + "learning_rate": 5.239323647220744e-06, + "loss": 0.5039, + "step": 2500 + }, + { + "epoch": 2.0056134723336005, + "grad_norm": 0.7894969582557678, + "learning_rate": 5.231710372906482e-06, + "loss": 0.5257, + "step": 2501 + }, + { + "epoch": 2.0064153969526863, + "grad_norm": 0.8365249633789062, + "learning_rate": 5.224100674018173e-06, + "loss": 0.5038, + "step": 2502 + }, + { + "epoch": 2.007217321571772, + "grad_norm": 0.9526363611221313, + "learning_rate": 5.216494556261831e-06, + "loss": 0.5347, + "step": 2503 + }, + { + "epoch": 2.008019246190858, + "grad_norm": 0.9146489500999451, + "learning_rate": 5.208892025340772e-06, + "loss": 0.506, + "step": 2504 + }, + { + "epoch": 2.0088211708099437, + "grad_norm": 0.8940325975418091, + "learning_rate": 5.201293086955646e-06, + "loss": 0.5, + "step": 2505 + }, + { + "epoch": 2.0096230954290295, + "grad_norm": 0.9119753837585449, + "learning_rate": 5.193697746804386e-06, + "loss": 0.5178, + "step": 2506 + }, + { + "epoch": 2.0104250200481153, + "grad_norm": 0.8597251176834106, + "learning_rate": 5.186106010582239e-06, + "loss": 0.5075, + "step": 2507 + }, + { + "epoch": 2.011226944667201, + "grad_norm": 0.7838432192802429, + "learning_rate": 5.178517883981753e-06, + "loss": 0.4928, + "step": 2508 + }, + { + "epoch": 2.012028869286287, + "grad_norm": 0.7873410582542419, + "learning_rate": 5.170933372692752e-06, + "loss": 0.5191, + "step": 2509 + }, + { + "epoch": 2.0128307939053727, + "grad_norm": 0.7258116006851196, + "learning_rate": 5.163352482402375e-06, + "loss": 0.5039, + "step": 2510 + }, + { + "epoch": 2.013632718524459, + "grad_norm": 0.8325080871582031, + "learning_rate": 5.15577521879502e-06, + "loss": 0.5224, + "step": 2511 + }, + { + "epoch": 2.0144346431435447, + "grad_norm": 0.7581323385238647, + "learning_rate": 5.148201587552384e-06, + "loss": 0.4951, + "step": 2512 + }, + { + "epoch": 2.0152365677626305, + "grad_norm": 0.7743967175483704, + "learning_rate": 5.140631594353434e-06, + "loss": 0.5489, + "step": 2513 + }, + { + "epoch": 2.0160384923817163, + "grad_norm": 0.7546889185905457, + "learning_rate": 5.133065244874404e-06, + "loss": 0.4962, + "step": 2514 + }, + { + "epoch": 2.016840417000802, + "grad_norm": 0.725445568561554, + "learning_rate": 5.1255025447888005e-06, + "loss": 0.4818, + "step": 2515 + }, + { + "epoch": 2.017642341619888, + "grad_norm": 0.7801692485809326, + "learning_rate": 5.117943499767402e-06, + "loss": 0.4848, + "step": 2516 + }, + { + "epoch": 2.0184442662389737, + "grad_norm": 0.7961569428443909, + "learning_rate": 5.110388115478222e-06, + "loss": 0.4813, + "step": 2517 + }, + { + "epoch": 2.0192461908580595, + "grad_norm": 0.840562641620636, + "learning_rate": 5.102836397586564e-06, + "loss": 0.4887, + "step": 2518 + }, + { + "epoch": 2.0200481154771452, + "grad_norm": 0.8071300983428955, + "learning_rate": 5.09528835175495e-06, + "loss": 0.4874, + "step": 2519 + }, + { + "epoch": 2.020850040096231, + "grad_norm": 0.8382665514945984, + "learning_rate": 5.087743983643165e-06, + "loss": 0.4947, + "step": 2520 + }, + { + "epoch": 2.021651964715317, + "grad_norm": 0.7823915481567383, + "learning_rate": 5.080203298908239e-06, + "loss": 0.4644, + "step": 2521 + }, + { + "epoch": 2.0224538893344026, + "grad_norm": 0.8397455215454102, + "learning_rate": 5.072666303204421e-06, + "loss": 0.4924, + "step": 2522 + }, + { + "epoch": 2.0232558139534884, + "grad_norm": 0.8278082609176636, + "learning_rate": 5.065133002183223e-06, + "loss": 0.5191, + "step": 2523 + }, + { + "epoch": 2.024057738572574, + "grad_norm": 0.8194684982299805, + "learning_rate": 5.057603401493358e-06, + "loss": 0.4957, + "step": 2524 + }, + { + "epoch": 2.02485966319166, + "grad_norm": 0.7626014947891235, + "learning_rate": 5.050077506780783e-06, + "loss": 0.4892, + "step": 2525 + }, + { + "epoch": 2.025661587810746, + "grad_norm": 0.7766503691673279, + "learning_rate": 5.042555323688673e-06, + "loss": 0.5108, + "step": 2526 + }, + { + "epoch": 2.0264635124298316, + "grad_norm": 0.7892016768455505, + "learning_rate": 5.035036857857405e-06, + "loss": 0.4472, + "step": 2527 + }, + { + "epoch": 2.0272654370489174, + "grad_norm": 0.7951651811599731, + "learning_rate": 5.027522114924597e-06, + "loss": 0.4936, + "step": 2528 + }, + { + "epoch": 2.028067361668003, + "grad_norm": 0.7646651864051819, + "learning_rate": 5.020011100525047e-06, + "loss": 0.5195, + "step": 2529 + }, + { + "epoch": 2.028869286287089, + "grad_norm": 0.7651566863059998, + "learning_rate": 5.0125038202907735e-06, + "loss": 0.4591, + "step": 2530 + }, + { + "epoch": 2.0296712109061747, + "grad_norm": 0.814940333366394, + "learning_rate": 5.0050002798509956e-06, + "loss": 0.5049, + "step": 2531 + }, + { + "epoch": 2.0304731355252605, + "grad_norm": 0.8493900895118713, + "learning_rate": 4.997500484832114e-06, + "loss": 0.5016, + "step": 2532 + }, + { + "epoch": 2.0312750601443463, + "grad_norm": 0.8357752561569214, + "learning_rate": 4.990004440857735e-06, + "loss": 0.4844, + "step": 2533 + }, + { + "epoch": 2.032076984763432, + "grad_norm": 0.8459290862083435, + "learning_rate": 4.9825121535486475e-06, + "loss": 0.5135, + "step": 2534 + }, + { + "epoch": 2.032878909382518, + "grad_norm": 0.7996906042098999, + "learning_rate": 4.975023628522825e-06, + "loss": 0.482, + "step": 2535 + }, + { + "epoch": 2.0336808340016037, + "grad_norm": 0.8636319041252136, + "learning_rate": 4.967538871395421e-06, + "loss": 0.4959, + "step": 2536 + }, + { + "epoch": 2.0344827586206895, + "grad_norm": 0.813450276851654, + "learning_rate": 4.960057887778754e-06, + "loss": 0.4843, + "step": 2537 + }, + { + "epoch": 2.0352846832397753, + "grad_norm": 0.807138979434967, + "learning_rate": 4.952580683282324e-06, + "loss": 0.497, + "step": 2538 + }, + { + "epoch": 2.036086607858861, + "grad_norm": 0.8144460916519165, + "learning_rate": 4.945107263512794e-06, + "loss": 0.4893, + "step": 2539 + }, + { + "epoch": 2.036888532477947, + "grad_norm": 0.8009450435638428, + "learning_rate": 4.937637634073988e-06, + "loss": 0.5016, + "step": 2540 + }, + { + "epoch": 2.0376904570970327, + "grad_norm": 0.7973288893699646, + "learning_rate": 4.930171800566893e-06, + "loss": 0.4966, + "step": 2541 + }, + { + "epoch": 2.038492381716119, + "grad_norm": 0.773374617099762, + "learning_rate": 4.922709768589638e-06, + "loss": 0.5013, + "step": 2542 + }, + { + "epoch": 2.0392943063352047, + "grad_norm": 0.8175215125083923, + "learning_rate": 4.915251543737512e-06, + "loss": 0.4926, + "step": 2543 + }, + { + "epoch": 2.0400962309542905, + "grad_norm": 0.8121562600135803, + "learning_rate": 4.907797131602945e-06, + "loss": 0.5069, + "step": 2544 + }, + { + "epoch": 2.0408981555733763, + "grad_norm": 0.7891466021537781, + "learning_rate": 4.900346537775513e-06, + "loss": 0.4978, + "step": 2545 + }, + { + "epoch": 2.041700080192462, + "grad_norm": 0.7856062650680542, + "learning_rate": 4.89289976784192e-06, + "loss": 0.4882, + "step": 2546 + }, + { + "epoch": 2.042502004811548, + "grad_norm": 0.7932535409927368, + "learning_rate": 4.885456827386008e-06, + "loss": 0.4836, + "step": 2547 + }, + { + "epoch": 2.0433039294306337, + "grad_norm": 0.7187968492507935, + "learning_rate": 4.87801772198875e-06, + "loss": 0.4609, + "step": 2548 + }, + { + "epoch": 2.0441058540497195, + "grad_norm": 0.8403437733650208, + "learning_rate": 4.870582457228239e-06, + "loss": 0.5197, + "step": 2549 + }, + { + "epoch": 2.0449077786688052, + "grad_norm": 0.8300922513008118, + "learning_rate": 4.863151038679694e-06, + "loss": 0.5118, + "step": 2550 + }, + { + "epoch": 2.045709703287891, + "grad_norm": 0.8011190891265869, + "learning_rate": 4.855723471915438e-06, + "loss": 0.4769, + "step": 2551 + }, + { + "epoch": 2.046511627906977, + "grad_norm": 0.8211809992790222, + "learning_rate": 4.848299762504918e-06, + "loss": 0.5017, + "step": 2552 + }, + { + "epoch": 2.0473135525260626, + "grad_norm": 0.7941953539848328, + "learning_rate": 4.840879916014683e-06, + "loss": 0.4776, + "step": 2553 + }, + { + "epoch": 2.0481154771451484, + "grad_norm": 0.8611568212509155, + "learning_rate": 4.833463938008387e-06, + "loss": 0.4887, + "step": 2554 + }, + { + "epoch": 2.048917401764234, + "grad_norm": 0.8546658754348755, + "learning_rate": 4.826051834046787e-06, + "loss": 0.508, + "step": 2555 + }, + { + "epoch": 2.04971932638332, + "grad_norm": 0.8082013726234436, + "learning_rate": 4.818643609687724e-06, + "loss": 0.5016, + "step": 2556 + }, + { + "epoch": 2.050521251002406, + "grad_norm": 0.868209183216095, + "learning_rate": 4.811239270486139e-06, + "loss": 0.4957, + "step": 2557 + }, + { + "epoch": 2.0513231756214916, + "grad_norm": 0.8040471076965332, + "learning_rate": 4.803838821994062e-06, + "loss": 0.4874, + "step": 2558 + }, + { + "epoch": 2.0521251002405774, + "grad_norm": 0.7826542854309082, + "learning_rate": 4.796442269760592e-06, + "loss": 0.4845, + "step": 2559 + }, + { + "epoch": 2.052927024859663, + "grad_norm": 0.8145564794540405, + "learning_rate": 4.789049619331928e-06, + "loss": 0.5049, + "step": 2560 + }, + { + "epoch": 2.053728949478749, + "grad_norm": 0.8400808572769165, + "learning_rate": 4.781660876251322e-06, + "loss": 0.4969, + "step": 2561 + }, + { + "epoch": 2.0545308740978347, + "grad_norm": 0.8168050050735474, + "learning_rate": 4.774276046059107e-06, + "loss": 0.458, + "step": 2562 + }, + { + "epoch": 2.0553327987169205, + "grad_norm": 0.8102244734764099, + "learning_rate": 4.766895134292685e-06, + "loss": 0.5026, + "step": 2563 + }, + { + "epoch": 2.0561347233360063, + "grad_norm": 0.8787121772766113, + "learning_rate": 4.759518146486504e-06, + "loss": 0.5098, + "step": 2564 + }, + { + "epoch": 2.056936647955092, + "grad_norm": 1.1912168264389038, + "learning_rate": 4.752145088172094e-06, + "loss": 0.5131, + "step": 2565 + }, + { + "epoch": 2.057738572574178, + "grad_norm": 0.7951311469078064, + "learning_rate": 4.744775964878017e-06, + "loss": 0.4842, + "step": 2566 + }, + { + "epoch": 2.0585404971932637, + "grad_norm": 0.8363946080207825, + "learning_rate": 4.737410782129894e-06, + "loss": 0.4933, + "step": 2567 + }, + { + "epoch": 2.0593424218123495, + "grad_norm": 0.8067214488983154, + "learning_rate": 4.730049545450394e-06, + "loss": 0.4983, + "step": 2568 + }, + { + "epoch": 2.0601443464314353, + "grad_norm": 0.8054936528205872, + "learning_rate": 4.722692260359211e-06, + "loss": 0.4895, + "step": 2569 + }, + { + "epoch": 2.060946271050521, + "grad_norm": 0.8232284188270569, + "learning_rate": 4.715338932373107e-06, + "loss": 0.4964, + "step": 2570 + }, + { + "epoch": 2.061748195669607, + "grad_norm": 0.8356310725212097, + "learning_rate": 4.707989567005845e-06, + "loss": 0.482, + "step": 2571 + }, + { + "epoch": 2.0625501202886927, + "grad_norm": 0.8142298460006714, + "learning_rate": 4.700644169768223e-06, + "loss": 0.4894, + "step": 2572 + }, + { + "epoch": 2.0633520449077785, + "grad_norm": 0.8215280771255493, + "learning_rate": 4.693302746168088e-06, + "loss": 0.4808, + "step": 2573 + }, + { + "epoch": 2.0641539695268643, + "grad_norm": 0.8594109416007996, + "learning_rate": 4.685965301710276e-06, + "loss": 0.4986, + "step": 2574 + }, + { + "epoch": 2.0649558941459505, + "grad_norm": 0.7795203328132629, + "learning_rate": 4.678631841896657e-06, + "loss": 0.4765, + "step": 2575 + }, + { + "epoch": 2.0657578187650363, + "grad_norm": 0.8334662318229675, + "learning_rate": 4.6713023722261106e-06, + "loss": 0.4933, + "step": 2576 + }, + { + "epoch": 2.066559743384122, + "grad_norm": 0.8101879954338074, + "learning_rate": 4.663976898194516e-06, + "loss": 0.5096, + "step": 2577 + }, + { + "epoch": 2.067361668003208, + "grad_norm": 0.8048333525657654, + "learning_rate": 4.656655425294774e-06, + "loss": 0.4825, + "step": 2578 + }, + { + "epoch": 2.0681635926222937, + "grad_norm": 0.8437180519104004, + "learning_rate": 4.649337959016764e-06, + "loss": 0.4801, + "step": 2579 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.8027264475822449, + "learning_rate": 4.6420245048473766e-06, + "loss": 0.4864, + "step": 2580 + }, + { + "epoch": 2.0697674418604652, + "grad_norm": 0.8142487406730652, + "learning_rate": 4.634715068270491e-06, + "loss": 0.494, + "step": 2581 + }, + { + "epoch": 2.070569366479551, + "grad_norm": 0.8042910695075989, + "learning_rate": 4.6274096547669625e-06, + "loss": 0.4805, + "step": 2582 + }, + { + "epoch": 2.071371291098637, + "grad_norm": 0.8223109245300293, + "learning_rate": 4.62010826981465e-06, + "loss": 0.5094, + "step": 2583 + }, + { + "epoch": 2.0721732157177226, + "grad_norm": 0.8160894513130188, + "learning_rate": 4.612810918888374e-06, + "loss": 0.4881, + "step": 2584 + }, + { + "epoch": 2.0729751403368084, + "grad_norm": 0.8002573847770691, + "learning_rate": 4.605517607459938e-06, + "loss": 0.488, + "step": 2585 + }, + { + "epoch": 2.073777064955894, + "grad_norm": 0.7782284021377563, + "learning_rate": 4.598228340998118e-06, + "loss": 0.4976, + "step": 2586 + }, + { + "epoch": 2.07457898957498, + "grad_norm": 0.7785077691078186, + "learning_rate": 4.590943124968651e-06, + "loss": 0.4935, + "step": 2587 + }, + { + "epoch": 2.075380914194066, + "grad_norm": 0.8213626742362976, + "learning_rate": 4.583661964834238e-06, + "loss": 0.4781, + "step": 2588 + }, + { + "epoch": 2.0761828388131516, + "grad_norm": 0.8277866244316101, + "learning_rate": 4.576384866054546e-06, + "loss": 0.4833, + "step": 2589 + }, + { + "epoch": 2.0769847634322374, + "grad_norm": 0.8673251867294312, + "learning_rate": 4.5691118340861885e-06, + "loss": 0.4927, + "step": 2590 + }, + { + "epoch": 2.077786688051323, + "grad_norm": 0.8178399205207825, + "learning_rate": 4.561842874382737e-06, + "loss": 0.504, + "step": 2591 + }, + { + "epoch": 2.078588612670409, + "grad_norm": 0.8196151852607727, + "learning_rate": 4.554577992394697e-06, + "loss": 0.4728, + "step": 2592 + }, + { + "epoch": 2.0793905372894947, + "grad_norm": 0.8163505792617798, + "learning_rate": 4.54731719356953e-06, + "loss": 0.5019, + "step": 2593 + }, + { + "epoch": 2.0801924619085805, + "grad_norm": 0.8096843957901001, + "learning_rate": 4.540060483351628e-06, + "loss": 0.4947, + "step": 2594 + }, + { + "epoch": 2.0809943865276663, + "grad_norm": 0.846545398235321, + "learning_rate": 4.532807867182322e-06, + "loss": 0.4999, + "step": 2595 + }, + { + "epoch": 2.081796311146752, + "grad_norm": 0.8637265563011169, + "learning_rate": 4.525559350499872e-06, + "loss": 0.503, + "step": 2596 + }, + { + "epoch": 2.082598235765838, + "grad_norm": 0.795164942741394, + "learning_rate": 4.5183149387394566e-06, + "loss": 0.488, + "step": 2597 + }, + { + "epoch": 2.0834001603849237, + "grad_norm": 0.8308284282684326, + "learning_rate": 4.511074637333185e-06, + "loss": 0.4855, + "step": 2598 + }, + { + "epoch": 2.0842020850040095, + "grad_norm": 0.8101129531860352, + "learning_rate": 4.503838451710082e-06, + "loss": 0.483, + "step": 2599 + }, + { + "epoch": 2.0850040096230953, + "grad_norm": 0.8016064167022705, + "learning_rate": 4.49660638729609e-06, + "loss": 0.4921, + "step": 2600 + }, + { + "epoch": 2.085805934242181, + "grad_norm": 0.8189466595649719, + "learning_rate": 4.489378449514051e-06, + "loss": 0.4983, + "step": 2601 + }, + { + "epoch": 2.086607858861267, + "grad_norm": 0.8600638508796692, + "learning_rate": 4.482154643783722e-06, + "loss": 0.506, + "step": 2602 + }, + { + "epoch": 2.0874097834803527, + "grad_norm": 0.7984684705734253, + "learning_rate": 4.4749349755217575e-06, + "loss": 0.4746, + "step": 2603 + }, + { + "epoch": 2.0882117080994385, + "grad_norm": 0.8474909067153931, + "learning_rate": 4.467719450141711e-06, + "loss": 0.4957, + "step": 2604 + }, + { + "epoch": 2.0890136327185242, + "grad_norm": 0.8484524488449097, + "learning_rate": 4.460508073054033e-06, + "loss": 0.4828, + "step": 2605 + }, + { + "epoch": 2.0898155573376105, + "grad_norm": 0.8325912952423096, + "learning_rate": 4.453300849666053e-06, + "loss": 0.4942, + "step": 2606 + }, + { + "epoch": 2.0906174819566963, + "grad_norm": 0.8752564787864685, + "learning_rate": 4.446097785381995e-06, + "loss": 0.4944, + "step": 2607 + }, + { + "epoch": 2.091419406575782, + "grad_norm": 0.8515805006027222, + "learning_rate": 4.438898885602962e-06, + "loss": 0.5203, + "step": 2608 + }, + { + "epoch": 2.092221331194868, + "grad_norm": 0.8291308283805847, + "learning_rate": 4.431704155726936e-06, + "loss": 0.5042, + "step": 2609 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 0.7905226349830627, + "learning_rate": 4.424513601148772e-06, + "loss": 0.4855, + "step": 2610 + }, + { + "epoch": 2.0938251804330394, + "grad_norm": 0.7867658734321594, + "learning_rate": 4.417327227260183e-06, + "loss": 0.5087, + "step": 2611 + }, + { + "epoch": 2.0946271050521252, + "grad_norm": 0.7623449563980103, + "learning_rate": 4.410145039449771e-06, + "loss": 0.493, + "step": 2612 + }, + { + "epoch": 2.095429029671211, + "grad_norm": 0.816936194896698, + "learning_rate": 4.402967043102974e-06, + "loss": 0.4888, + "step": 2613 + }, + { + "epoch": 2.096230954290297, + "grad_norm": 0.7807714343070984, + "learning_rate": 4.395793243602102e-06, + "loss": 0.4777, + "step": 2614 + }, + { + "epoch": 2.0970328789093826, + "grad_norm": 0.7865536212921143, + "learning_rate": 4.388623646326318e-06, + "loss": 0.4997, + "step": 2615 + }, + { + "epoch": 2.0978348035284684, + "grad_norm": 0.8231841325759888, + "learning_rate": 4.381458256651622e-06, + "loss": 0.504, + "step": 2616 + }, + { + "epoch": 2.098636728147554, + "grad_norm": 0.8537681698799133, + "learning_rate": 4.374297079950872e-06, + "loss": 0.4859, + "step": 2617 + }, + { + "epoch": 2.09943865276664, + "grad_norm": 0.89604651927948, + "learning_rate": 4.367140121593764e-06, + "loss": 0.5137, + "step": 2618 + }, + { + "epoch": 2.100240577385726, + "grad_norm": 0.838858962059021, + "learning_rate": 4.359987386946822e-06, + "loss": 0.4979, + "step": 2619 + }, + { + "epoch": 2.1010425020048116, + "grad_norm": 0.8409374952316284, + "learning_rate": 4.352838881373421e-06, + "loss": 0.4767, + "step": 2620 + }, + { + "epoch": 2.1018444266238974, + "grad_norm": 0.7959094643592834, + "learning_rate": 4.345694610233744e-06, + "loss": 0.4772, + "step": 2621 + }, + { + "epoch": 2.102646351242983, + "grad_norm": 0.8662393689155579, + "learning_rate": 4.338554578884813e-06, + "loss": 0.4998, + "step": 2622 + }, + { + "epoch": 2.103448275862069, + "grad_norm": 0.8256474733352661, + "learning_rate": 4.331418792680468e-06, + "loss": 0.4968, + "step": 2623 + }, + { + "epoch": 2.1042502004811547, + "grad_norm": 0.8457236289978027, + "learning_rate": 4.324287256971358e-06, + "loss": 0.5243, + "step": 2624 + }, + { + "epoch": 2.1050521251002405, + "grad_norm": 0.7527933716773987, + "learning_rate": 4.3171599771049625e-06, + "loss": 0.4426, + "step": 2625 + }, + { + "epoch": 2.1058540497193263, + "grad_norm": 0.8174936175346375, + "learning_rate": 4.3100369584255475e-06, + "loss": 0.5017, + "step": 2626 + }, + { + "epoch": 2.106655974338412, + "grad_norm": 0.8383927941322327, + "learning_rate": 4.302918206274202e-06, + "loss": 0.4952, + "step": 2627 + }, + { + "epoch": 2.107457898957498, + "grad_norm": 0.8061890602111816, + "learning_rate": 4.295803725988807e-06, + "loss": 0.4923, + "step": 2628 + }, + { + "epoch": 2.1082598235765837, + "grad_norm": 0.8143693804740906, + "learning_rate": 4.2886935229040375e-06, + "loss": 0.5019, + "step": 2629 + }, + { + "epoch": 2.1090617481956695, + "grad_norm": 0.8190400004386902, + "learning_rate": 4.281587602351376e-06, + "loss": 0.4858, + "step": 2630 + }, + { + "epoch": 2.1098636728147553, + "grad_norm": 0.7694993615150452, + "learning_rate": 4.274485969659074e-06, + "loss": 0.4635, + "step": 2631 + }, + { + "epoch": 2.110665597433841, + "grad_norm": 0.840126633644104, + "learning_rate": 4.267388630152182e-06, + "loss": 0.506, + "step": 2632 + }, + { + "epoch": 2.111467522052927, + "grad_norm": 0.8422167301177979, + "learning_rate": 4.26029558915253e-06, + "loss": 0.4815, + "step": 2633 + }, + { + "epoch": 2.1122694466720127, + "grad_norm": 0.83141028881073, + "learning_rate": 4.2532068519787124e-06, + "loss": 0.4826, + "step": 2634 + }, + { + "epoch": 2.1130713712910985, + "grad_norm": 0.8261462450027466, + "learning_rate": 4.246122423946114e-06, + "loss": 0.4945, + "step": 2635 + }, + { + "epoch": 2.1138732959101842, + "grad_norm": 0.8225822448730469, + "learning_rate": 4.239042310366875e-06, + "loss": 0.4753, + "step": 2636 + }, + { + "epoch": 2.11467522052927, + "grad_norm": 0.8107204437255859, + "learning_rate": 4.23196651654991e-06, + "loss": 0.4763, + "step": 2637 + }, + { + "epoch": 2.115477145148356, + "grad_norm": 0.8356348276138306, + "learning_rate": 4.224895047800892e-06, + "loss": 0.4931, + "step": 2638 + }, + { + "epoch": 2.116279069767442, + "grad_norm": 0.803632915019989, + "learning_rate": 4.217827909422241e-06, + "loss": 0.4685, + "step": 2639 + }, + { + "epoch": 2.117080994386528, + "grad_norm": 0.8820094466209412, + "learning_rate": 4.210765106713143e-06, + "loss": 0.4863, + "step": 2640 + }, + { + "epoch": 2.1178829190056137, + "grad_norm": 0.8546995520591736, + "learning_rate": 4.2037066449695275e-06, + "loss": 0.496, + "step": 2641 + }, + { + "epoch": 2.1186848436246994, + "grad_norm": 0.8304917216300964, + "learning_rate": 4.196652529484068e-06, + "loss": 0.5053, + "step": 2642 + }, + { + "epoch": 2.1194867682437852, + "grad_norm": 0.8051602244377136, + "learning_rate": 4.189602765546188e-06, + "loss": 0.5033, + "step": 2643 + }, + { + "epoch": 2.120288692862871, + "grad_norm": 0.8486653566360474, + "learning_rate": 4.18255735844203e-06, + "loss": 0.5049, + "step": 2644 + }, + { + "epoch": 2.121090617481957, + "grad_norm": 0.8334927558898926, + "learning_rate": 4.175516313454485e-06, + "loss": 0.5047, + "step": 2645 + }, + { + "epoch": 2.1218925421010426, + "grad_norm": 0.8217371106147766, + "learning_rate": 4.168479635863167e-06, + "loss": 0.5031, + "step": 2646 + }, + { + "epoch": 2.1226944667201284, + "grad_norm": 0.8687995076179504, + "learning_rate": 4.161447330944422e-06, + "loss": 0.5255, + "step": 2647 + }, + { + "epoch": 2.123496391339214, + "grad_norm": 0.838424026966095, + "learning_rate": 4.154419403971305e-06, + "loss": 0.4944, + "step": 2648 + }, + { + "epoch": 2.1242983159583, + "grad_norm": 0.7598464488983154, + "learning_rate": 4.1473958602135956e-06, + "loss": 0.4791, + "step": 2649 + }, + { + "epoch": 2.125100240577386, + "grad_norm": 0.9013649821281433, + "learning_rate": 4.140376704937789e-06, + "loss": 0.5074, + "step": 2650 + }, + { + "epoch": 2.1259021651964716, + "grad_norm": 0.8267843723297119, + "learning_rate": 4.133361943407085e-06, + "loss": 0.4883, + "step": 2651 + }, + { + "epoch": 2.1267040898155574, + "grad_norm": 0.8689286112785339, + "learning_rate": 4.126351580881395e-06, + "loss": 0.4759, + "step": 2652 + }, + { + "epoch": 2.127506014434643, + "grad_norm": 0.912152886390686, + "learning_rate": 4.11934562261732e-06, + "loss": 0.5206, + "step": 2653 + }, + { + "epoch": 2.128307939053729, + "grad_norm": 0.832675576210022, + "learning_rate": 4.112344073868171e-06, + "loss": 0.4897, + "step": 2654 + }, + { + "epoch": 2.1291098636728147, + "grad_norm": 0.8386573791503906, + "learning_rate": 4.105346939883946e-06, + "loss": 0.5046, + "step": 2655 + }, + { + "epoch": 2.1299117882919005, + "grad_norm": 0.8459692597389221, + "learning_rate": 4.098354225911336e-06, + "loss": 0.5104, + "step": 2656 + }, + { + "epoch": 2.1307137129109863, + "grad_norm": 0.8297333121299744, + "learning_rate": 4.091365937193719e-06, + "loss": 0.4764, + "step": 2657 + }, + { + "epoch": 2.131515637530072, + "grad_norm": 0.8380979895591736, + "learning_rate": 4.084382078971143e-06, + "loss": 0.4874, + "step": 2658 + }, + { + "epoch": 2.132317562149158, + "grad_norm": 0.8121061325073242, + "learning_rate": 4.0774026564803494e-06, + "loss": 0.5001, + "step": 2659 + }, + { + "epoch": 2.1331194867682437, + "grad_norm": 0.8189265727996826, + "learning_rate": 4.070427674954748e-06, + "loss": 0.4694, + "step": 2660 + }, + { + "epoch": 2.1339214113873295, + "grad_norm": 0.8043553829193115, + "learning_rate": 4.063457139624407e-06, + "loss": 0.4957, + "step": 2661 + }, + { + "epoch": 2.1347233360064153, + "grad_norm": 0.7954015731811523, + "learning_rate": 4.056491055716088e-06, + "loss": 0.4764, + "step": 2662 + }, + { + "epoch": 2.135525260625501, + "grad_norm": 0.7770733833312988, + "learning_rate": 4.049529428453184e-06, + "loss": 0.4811, + "step": 2663 + }, + { + "epoch": 2.136327185244587, + "grad_norm": 0.8199390172958374, + "learning_rate": 4.042572263055765e-06, + "loss": 0.4998, + "step": 2664 + }, + { + "epoch": 2.1371291098636727, + "grad_norm": 0.8421967029571533, + "learning_rate": 4.035619564740555e-06, + "loss": 0.4694, + "step": 2665 + }, + { + "epoch": 2.1379310344827585, + "grad_norm": 0.8318009376525879, + "learning_rate": 4.028671338720912e-06, + "loss": 0.4854, + "step": 2666 + }, + { + "epoch": 2.1387329591018442, + "grad_norm": 0.8843598365783691, + "learning_rate": 4.021727590206868e-06, + "loss": 0.4877, + "step": 2667 + }, + { + "epoch": 2.13953488372093, + "grad_norm": 0.8518946170806885, + "learning_rate": 4.01478832440507e-06, + "loss": 0.5328, + "step": 2668 + }, + { + "epoch": 2.140336808340016, + "grad_norm": 0.8255460858345032, + "learning_rate": 4.00785354651882e-06, + "loss": 0.5143, + "step": 2669 + }, + { + "epoch": 2.141138732959102, + "grad_norm": 0.8585113286972046, + "learning_rate": 4.000923261748055e-06, + "loss": 0.5018, + "step": 2670 + }, + { + "epoch": 2.141940657578188, + "grad_norm": 0.8432846665382385, + "learning_rate": 3.9939974752893275e-06, + "loss": 0.4755, + "step": 2671 + }, + { + "epoch": 2.1427425821972736, + "grad_norm": 0.9172224402427673, + "learning_rate": 3.9870761923358405e-06, + "loss": 0.4902, + "step": 2672 + }, + { + "epoch": 2.1435445068163594, + "grad_norm": 0.8651891946792603, + "learning_rate": 3.980159418077403e-06, + "loss": 0.5009, + "step": 2673 + }, + { + "epoch": 2.1443464314354452, + "grad_norm": 0.8147042393684387, + "learning_rate": 3.97324715770044e-06, + "loss": 0.5091, + "step": 2674 + }, + { + "epoch": 2.145148356054531, + "grad_norm": 0.8397491574287415, + "learning_rate": 3.966339416388013e-06, + "loss": 0.4879, + "step": 2675 + }, + { + "epoch": 2.145950280673617, + "grad_norm": 0.8129686713218689, + "learning_rate": 3.959436199319771e-06, + "loss": 0.5036, + "step": 2676 + }, + { + "epoch": 2.1467522052927026, + "grad_norm": 0.8532066345214844, + "learning_rate": 3.952537511671988e-06, + "loss": 0.4864, + "step": 2677 + }, + { + "epoch": 2.1475541299117884, + "grad_norm": 0.817719578742981, + "learning_rate": 3.9456433586175335e-06, + "loss": 0.4962, + "step": 2678 + }, + { + "epoch": 2.148356054530874, + "grad_norm": 0.8393282890319824, + "learning_rate": 3.938753745325872e-06, + "loss": 0.4987, + "step": 2679 + }, + { + "epoch": 2.14915797914996, + "grad_norm": 0.9043245315551758, + "learning_rate": 3.931868676963082e-06, + "loss": 0.5131, + "step": 2680 + }, + { + "epoch": 2.1499599037690458, + "grad_norm": 0.876274049282074, + "learning_rate": 3.924988158691812e-06, + "loss": 0.5018, + "step": 2681 + }, + { + "epoch": 2.1507618283881316, + "grad_norm": 0.8482949137687683, + "learning_rate": 3.918112195671313e-06, + "loss": 0.5111, + "step": 2682 + }, + { + "epoch": 2.1515637530072174, + "grad_norm": 0.8316020965576172, + "learning_rate": 3.9112407930574195e-06, + "loss": 0.5079, + "step": 2683 + }, + { + "epoch": 2.152365677626303, + "grad_norm": 0.8304176330566406, + "learning_rate": 3.904373956002532e-06, + "loss": 0.4938, + "step": 2684 + }, + { + "epoch": 2.153167602245389, + "grad_norm": 0.8400371670722961, + "learning_rate": 3.897511689655653e-06, + "loss": 0.4879, + "step": 2685 + }, + { + "epoch": 2.1539695268644747, + "grad_norm": 0.8171892762184143, + "learning_rate": 3.890653999162333e-06, + "loss": 0.4841, + "step": 2686 + }, + { + "epoch": 2.1547714514835605, + "grad_norm": 0.8671571612358093, + "learning_rate": 3.8838008896647075e-06, + "loss": 0.5202, + "step": 2687 + }, + { + "epoch": 2.1555733761026463, + "grad_norm": 0.8403294682502747, + "learning_rate": 3.876952366301472e-06, + "loss": 0.5099, + "step": 2688 + }, + { + "epoch": 2.156375300721732, + "grad_norm": 0.8804596066474915, + "learning_rate": 3.870108434207877e-06, + "loss": 0.508, + "step": 2689 + }, + { + "epoch": 2.157177225340818, + "grad_norm": 0.8431464433670044, + "learning_rate": 3.863269098515738e-06, + "loss": 0.5076, + "step": 2690 + }, + { + "epoch": 2.1579791499599037, + "grad_norm": 0.8322014212608337, + "learning_rate": 3.856434364353424e-06, + "loss": 0.4894, + "step": 2691 + }, + { + "epoch": 2.1587810745789895, + "grad_norm": 0.845043420791626, + "learning_rate": 3.84960423684585e-06, + "loss": 0.489, + "step": 2692 + }, + { + "epoch": 2.1595829991980753, + "grad_norm": 0.8513469696044922, + "learning_rate": 3.842778721114482e-06, + "loss": 0.4815, + "step": 2693 + }, + { + "epoch": 2.160384923817161, + "grad_norm": 0.8116580247879028, + "learning_rate": 3.835957822277317e-06, + "loss": 0.4791, + "step": 2694 + }, + { + "epoch": 2.161186848436247, + "grad_norm": 0.8411436676979065, + "learning_rate": 3.829141545448901e-06, + "loss": 0.5097, + "step": 2695 + }, + { + "epoch": 2.1619887730553327, + "grad_norm": 0.8362702131271362, + "learning_rate": 3.82232989574031e-06, + "loss": 0.4959, + "step": 2696 + }, + { + "epoch": 2.1627906976744184, + "grad_norm": 0.8088338375091553, + "learning_rate": 3.815522878259153e-06, + "loss": 0.5093, + "step": 2697 + }, + { + "epoch": 2.1635926222935042, + "grad_norm": 0.8484572768211365, + "learning_rate": 3.8087204981095625e-06, + "loss": 0.4834, + "step": 2698 + }, + { + "epoch": 2.16439454691259, + "grad_norm": 0.8214154839515686, + "learning_rate": 3.8019227603921927e-06, + "loss": 0.4883, + "step": 2699 + }, + { + "epoch": 2.165196471531676, + "grad_norm": 0.8145144581794739, + "learning_rate": 3.7951296702042194e-06, + "loss": 0.4891, + "step": 2700 + }, + { + "epoch": 2.165998396150762, + "grad_norm": 0.7915971875190735, + "learning_rate": 3.7883412326393352e-06, + "loss": 0.4821, + "step": 2701 + }, + { + "epoch": 2.1668003207698474, + "grad_norm": 0.809570848941803, + "learning_rate": 3.7815574527877395e-06, + "loss": 0.4941, + "step": 2702 + }, + { + "epoch": 2.1676022453889336, + "grad_norm": 0.8416109085083008, + "learning_rate": 3.7747783357361455e-06, + "loss": 0.4698, + "step": 2703 + }, + { + "epoch": 2.1684041700080194, + "grad_norm": 0.8841302394866943, + "learning_rate": 3.7680038865677603e-06, + "loss": 0.4938, + "step": 2704 + }, + { + "epoch": 2.1692060946271052, + "grad_norm": 0.8262732028961182, + "learning_rate": 3.7612341103622984e-06, + "loss": 0.4911, + "step": 2705 + }, + { + "epoch": 2.170008019246191, + "grad_norm": 0.815339207649231, + "learning_rate": 3.7544690121959704e-06, + "loss": 0.467, + "step": 2706 + }, + { + "epoch": 2.170809943865277, + "grad_norm": 0.8234750628471375, + "learning_rate": 3.7477085971414785e-06, + "loss": 0.4913, + "step": 2707 + }, + { + "epoch": 2.1716118684843626, + "grad_norm": 0.8718725442886353, + "learning_rate": 3.7409528702680078e-06, + "loss": 0.505, + "step": 2708 + }, + { + "epoch": 2.1724137931034484, + "grad_norm": 0.8053902983665466, + "learning_rate": 3.7342018366412336e-06, + "loss": 0.4958, + "step": 2709 + }, + { + "epoch": 2.173215717722534, + "grad_norm": 0.8399242758750916, + "learning_rate": 3.7274555013233118e-06, + "loss": 0.4882, + "step": 2710 + }, + { + "epoch": 2.17401764234162, + "grad_norm": 0.8590044975280762, + "learning_rate": 3.720713869372875e-06, + "loss": 0.4969, + "step": 2711 + }, + { + "epoch": 2.1748195669607058, + "grad_norm": 0.8541808724403381, + "learning_rate": 3.71397694584503e-06, + "loss": 0.4689, + "step": 2712 + }, + { + "epoch": 2.1756214915797916, + "grad_norm": 0.8489252328872681, + "learning_rate": 3.7072447357913477e-06, + "loss": 0.4941, + "step": 2713 + }, + { + "epoch": 2.1764234161988774, + "grad_norm": 0.8415629267692566, + "learning_rate": 3.700517244259868e-06, + "loss": 0.4818, + "step": 2714 + }, + { + "epoch": 2.177225340817963, + "grad_norm": 0.8425331711769104, + "learning_rate": 3.693794476295096e-06, + "loss": 0.5023, + "step": 2715 + }, + { + "epoch": 2.178027265437049, + "grad_norm": 0.8632931113243103, + "learning_rate": 3.687076436937992e-06, + "loss": 0.4943, + "step": 2716 + }, + { + "epoch": 2.1788291900561347, + "grad_norm": 0.8151559829711914, + "learning_rate": 3.6803631312259724e-06, + "loss": 0.4764, + "step": 2717 + }, + { + "epoch": 2.1796311146752205, + "grad_norm": 0.8196760416030884, + "learning_rate": 3.6736545641928965e-06, + "loss": 0.4779, + "step": 2718 + }, + { + "epoch": 2.1804330392943063, + "grad_norm": 0.7820659279823303, + "learning_rate": 3.6669507408690806e-06, + "loss": 0.4855, + "step": 2719 + }, + { + "epoch": 2.181234963913392, + "grad_norm": 0.8023489713668823, + "learning_rate": 3.6602516662812824e-06, + "loss": 0.4851, + "step": 2720 + }, + { + "epoch": 2.182036888532478, + "grad_norm": 0.9628251194953918, + "learning_rate": 3.653557345452685e-06, + "loss": 0.4893, + "step": 2721 + }, + { + "epoch": 2.1828388131515637, + "grad_norm": 0.8556442260742188, + "learning_rate": 3.6468677834029343e-06, + "loss": 0.4775, + "step": 2722 + }, + { + "epoch": 2.1836407377706495, + "grad_norm": 0.9213688373565674, + "learning_rate": 3.6401829851480786e-06, + "loss": 0.4782, + "step": 2723 + }, + { + "epoch": 2.1844426623897353, + "grad_norm": 0.8511019349098206, + "learning_rate": 3.6335029557006117e-06, + "loss": 0.487, + "step": 2724 + }, + { + "epoch": 2.185244587008821, + "grad_norm": 0.8275080323219299, + "learning_rate": 3.626827700069452e-06, + "loss": 0.4673, + "step": 2725 + }, + { + "epoch": 2.186046511627907, + "grad_norm": 0.8424668908119202, + "learning_rate": 3.6201572232599227e-06, + "loss": 0.5106, + "step": 2726 + }, + { + "epoch": 2.1868484362469927, + "grad_norm": 0.9268709421157837, + "learning_rate": 3.6134915302737862e-06, + "loss": 0.5259, + "step": 2727 + }, + { + "epoch": 2.1876503608660784, + "grad_norm": 0.8410111665725708, + "learning_rate": 3.606830626109198e-06, + "loss": 0.5069, + "step": 2728 + }, + { + "epoch": 2.1884522854851642, + "grad_norm": 0.8103172779083252, + "learning_rate": 3.600174515760733e-06, + "loss": 0.4787, + "step": 2729 + }, + { + "epoch": 2.18925421010425, + "grad_norm": 0.8217037916183472, + "learning_rate": 3.5935232042193734e-06, + "loss": 0.5043, + "step": 2730 + }, + { + "epoch": 2.190056134723336, + "grad_norm": 0.8149628639221191, + "learning_rate": 3.58687669647249e-06, + "loss": 0.5117, + "step": 2731 + }, + { + "epoch": 2.1908580593424216, + "grad_norm": 0.8104637265205383, + "learning_rate": 3.5802349975038718e-06, + "loss": 0.4684, + "step": 2732 + }, + { + "epoch": 2.1916599839615074, + "grad_norm": 0.8468414545059204, + "learning_rate": 3.573598112293687e-06, + "loss": 0.4875, + "step": 2733 + }, + { + "epoch": 2.1924619085805936, + "grad_norm": 0.8977518677711487, + "learning_rate": 3.5669660458184886e-06, + "loss": 0.5076, + "step": 2734 + }, + { + "epoch": 2.1932638331996794, + "grad_norm": 0.8071417808532715, + "learning_rate": 3.560338803051241e-06, + "loss": 0.4962, + "step": 2735 + }, + { + "epoch": 2.1940657578187652, + "grad_norm": 0.8739727139472961, + "learning_rate": 3.5537163889612656e-06, + "loss": 0.5087, + "step": 2736 + }, + { + "epoch": 2.194867682437851, + "grad_norm": 0.8069101572036743, + "learning_rate": 3.547098808514279e-06, + "loss": 0.4997, + "step": 2737 + }, + { + "epoch": 2.195669607056937, + "grad_norm": 0.8516671061515808, + "learning_rate": 3.5404860666723695e-06, + "loss": 0.4875, + "step": 2738 + }, + { + "epoch": 2.1964715316760226, + "grad_norm": 0.8388312458992004, + "learning_rate": 3.5338781683939882e-06, + "loss": 0.4838, + "step": 2739 + }, + { + "epoch": 2.1972734562951084, + "grad_norm": 0.8394985198974609, + "learning_rate": 3.527275118633974e-06, + "loss": 0.5076, + "step": 2740 + }, + { + "epoch": 2.198075380914194, + "grad_norm": 0.8449310064315796, + "learning_rate": 3.52067692234351e-06, + "loss": 0.4994, + "step": 2741 + }, + { + "epoch": 2.19887730553328, + "grad_norm": 0.806086003780365, + "learning_rate": 3.514083584470149e-06, + "loss": 0.4839, + "step": 2742 + }, + { + "epoch": 2.1996792301523658, + "grad_norm": 0.8380177617073059, + "learning_rate": 3.507495109957808e-06, + "loss": 0.4765, + "step": 2743 + }, + { + "epoch": 2.2004811547714516, + "grad_norm": 0.8069396615028381, + "learning_rate": 3.5009115037467355e-06, + "loss": 0.4788, + "step": 2744 + }, + { + "epoch": 2.2012830793905374, + "grad_norm": 0.8960397243499756, + "learning_rate": 3.4943327707735586e-06, + "loss": 0.505, + "step": 2745 + }, + { + "epoch": 2.202085004009623, + "grad_norm": 0.8247140645980835, + "learning_rate": 3.4877589159712266e-06, + "loss": 0.4837, + "step": 2746 + }, + { + "epoch": 2.202886928628709, + "grad_norm": 0.8580472469329834, + "learning_rate": 3.481189944269041e-06, + "loss": 0.4965, + "step": 2747 + }, + { + "epoch": 2.2036888532477947, + "grad_norm": 0.8549031615257263, + "learning_rate": 3.4746258605926443e-06, + "loss": 0.4888, + "step": 2748 + }, + { + "epoch": 2.2044907778668805, + "grad_norm": 0.8439877033233643, + "learning_rate": 3.468066669864004e-06, + "loss": 0.4763, + "step": 2749 + }, + { + "epoch": 2.2052927024859663, + "grad_norm": 0.843323290348053, + "learning_rate": 3.461512377001427e-06, + "loss": 0.5199, + "step": 2750 + }, + { + "epoch": 2.206094627105052, + "grad_norm": 0.8714971542358398, + "learning_rate": 3.4549629869195467e-06, + "loss": 0.489, + "step": 2751 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 0.8079264163970947, + "learning_rate": 3.448418504529318e-06, + "loss": 0.4635, + "step": 2752 + }, + { + "epoch": 2.2076984763432237, + "grad_norm": 0.8408187031745911, + "learning_rate": 3.44187893473802e-06, + "loss": 0.4858, + "step": 2753 + }, + { + "epoch": 2.2085004009623095, + "grad_norm": 0.8397002220153809, + "learning_rate": 3.435344282449239e-06, + "loss": 0.4884, + "step": 2754 + }, + { + "epoch": 2.2093023255813953, + "grad_norm": 0.9133360981941223, + "learning_rate": 3.4288145525628813e-06, + "loss": 0.4985, + "step": 2755 + }, + { + "epoch": 2.210104250200481, + "grad_norm": 0.886596143245697, + "learning_rate": 3.422289749975163e-06, + "loss": 0.5094, + "step": 2756 + }, + { + "epoch": 2.210906174819567, + "grad_norm": 0.8573530912399292, + "learning_rate": 3.415769879578601e-06, + "loss": 0.5027, + "step": 2757 + }, + { + "epoch": 2.2117080994386527, + "grad_norm": 0.8812514543533325, + "learning_rate": 3.4092549462620215e-06, + "loss": 0.4878, + "step": 2758 + }, + { + "epoch": 2.2125100240577384, + "grad_norm": 0.8391367197036743, + "learning_rate": 3.4027449549105353e-06, + "loss": 0.4836, + "step": 2759 + }, + { + "epoch": 2.2133119486768242, + "grad_norm": 0.7842381596565247, + "learning_rate": 3.3962399104055597e-06, + "loss": 0.4747, + "step": 2760 + }, + { + "epoch": 2.21411387329591, + "grad_norm": 0.8407445549964905, + "learning_rate": 3.3897398176247984e-06, + "loss": 0.4775, + "step": 2761 + }, + { + "epoch": 2.214915797914996, + "grad_norm": 0.8051870465278625, + "learning_rate": 3.383244681442246e-06, + "loss": 0.5205, + "step": 2762 + }, + { + "epoch": 2.2157177225340816, + "grad_norm": 0.788661539554596, + "learning_rate": 3.376754506728167e-06, + "loss": 0.4794, + "step": 2763 + }, + { + "epoch": 2.2165196471531674, + "grad_norm": 0.8352063298225403, + "learning_rate": 3.370269298349128e-06, + "loss": 0.4687, + "step": 2764 + }, + { + "epoch": 2.2173215717722536, + "grad_norm": 0.8610337376594543, + "learning_rate": 3.363789061167949e-06, + "loss": 0.4769, + "step": 2765 + }, + { + "epoch": 2.218123496391339, + "grad_norm": 0.8776218891143799, + "learning_rate": 3.3573138000437367e-06, + "loss": 0.469, + "step": 2766 + }, + { + "epoch": 2.2189254210104252, + "grad_norm": 0.8633357882499695, + "learning_rate": 3.3508435198318645e-06, + "loss": 0.4961, + "step": 2767 + }, + { + "epoch": 2.219727345629511, + "grad_norm": 0.8187628984451294, + "learning_rate": 3.34437822538396e-06, + "loss": 0.4973, + "step": 2768 + }, + { + "epoch": 2.220529270248597, + "grad_norm": 0.805854082107544, + "learning_rate": 3.337917921547934e-06, + "loss": 0.5006, + "step": 2769 + }, + { + "epoch": 2.2213311948676826, + "grad_norm": 0.8261088132858276, + "learning_rate": 3.3314626131679328e-06, + "loss": 0.4847, + "step": 2770 + }, + { + "epoch": 2.2221331194867684, + "grad_norm": 0.8578211069107056, + "learning_rate": 3.3250123050843696e-06, + "loss": 0.4876, + "step": 2771 + }, + { + "epoch": 2.222935044105854, + "grad_norm": 0.8757601976394653, + "learning_rate": 3.318567002133909e-06, + "loss": 0.4817, + "step": 2772 + }, + { + "epoch": 2.22373696872494, + "grad_norm": 0.8412430286407471, + "learning_rate": 3.312126709149447e-06, + "loss": 0.4905, + "step": 2773 + }, + { + "epoch": 2.2245388933440258, + "grad_norm": 0.833476722240448, + "learning_rate": 3.3056914309601483e-06, + "loss": 0.5099, + "step": 2774 + }, + { + "epoch": 2.2253408179631116, + "grad_norm": 0.8237016797065735, + "learning_rate": 3.299261172391399e-06, + "loss": 0.4878, + "step": 2775 + }, + { + "epoch": 2.2261427425821974, + "grad_norm": 0.8863388299942017, + "learning_rate": 3.2928359382648166e-06, + "loss": 0.5018, + "step": 2776 + }, + { + "epoch": 2.226944667201283, + "grad_norm": 0.9207762479782104, + "learning_rate": 3.286415733398276e-06, + "loss": 0.5004, + "step": 2777 + }, + { + "epoch": 2.227746591820369, + "grad_norm": 0.8882100582122803, + "learning_rate": 3.280000562605854e-06, + "loss": 0.4769, + "step": 2778 + }, + { + "epoch": 2.2285485164394547, + "grad_norm": 0.8794893622398376, + "learning_rate": 3.2735904306978684e-06, + "loss": 0.4896, + "step": 2779 + }, + { + "epoch": 2.2293504410585405, + "grad_norm": 0.8332772850990295, + "learning_rate": 3.2671853424808574e-06, + "loss": 0.466, + "step": 2780 + }, + { + "epoch": 2.2301523656776263, + "grad_norm": 0.827366292476654, + "learning_rate": 3.2607853027575643e-06, + "loss": 0.4785, + "step": 2781 + }, + { + "epoch": 2.230954290296712, + "grad_norm": 0.8560011386871338, + "learning_rate": 3.2543903163269697e-06, + "loss": 0.5128, + "step": 2782 + }, + { + "epoch": 2.231756214915798, + "grad_norm": 0.8379847407341003, + "learning_rate": 3.2480003879842424e-06, + "loss": 0.4706, + "step": 2783 + }, + { + "epoch": 2.2325581395348837, + "grad_norm": 0.8355280756950378, + "learning_rate": 3.2416155225207726e-06, + "loss": 0.4936, + "step": 2784 + }, + { + "epoch": 2.2333600641539695, + "grad_norm": 0.8986692428588867, + "learning_rate": 3.2352357247241517e-06, + "loss": 0.4908, + "step": 2785 + }, + { + "epoch": 2.2341619887730553, + "grad_norm": 0.8830366730690002, + "learning_rate": 3.2288609993781606e-06, + "loss": 0.4813, + "step": 2786 + }, + { + "epoch": 2.234963913392141, + "grad_norm": 0.8496928811073303, + "learning_rate": 3.2224913512627976e-06, + "loss": 0.4854, + "step": 2787 + }, + { + "epoch": 2.235765838011227, + "grad_norm": 0.8446789979934692, + "learning_rate": 3.2161267851542333e-06, + "loss": 0.5021, + "step": 2788 + }, + { + "epoch": 2.2365677626303127, + "grad_norm": 0.8232909440994263, + "learning_rate": 3.2097673058248378e-06, + "loss": 0.4641, + "step": 2789 + }, + { + "epoch": 2.2373696872493984, + "grad_norm": 0.8378138542175293, + "learning_rate": 3.2034129180431705e-06, + "loss": 0.4796, + "step": 2790 + }, + { + "epoch": 2.2381716118684842, + "grad_norm": 0.8232404589653015, + "learning_rate": 3.1970636265739595e-06, + "loss": 0.4931, + "step": 2791 + }, + { + "epoch": 2.23897353648757, + "grad_norm": 0.8911257982254028, + "learning_rate": 3.1907194361781234e-06, + "loss": 0.4851, + "step": 2792 + }, + { + "epoch": 2.239775461106656, + "grad_norm": 0.8609054684638977, + "learning_rate": 3.1843803516127537e-06, + "loss": 0.4906, + "step": 2793 + }, + { + "epoch": 2.2405773857257416, + "grad_norm": 0.869034469127655, + "learning_rate": 3.178046377631109e-06, + "loss": 0.4871, + "step": 2794 + }, + { + "epoch": 2.2413793103448274, + "grad_norm": 0.8282964825630188, + "learning_rate": 3.1717175189826246e-06, + "loss": 0.4746, + "step": 2795 + }, + { + "epoch": 2.242181234963913, + "grad_norm": 0.7694876194000244, + "learning_rate": 3.1653937804128863e-06, + "loss": 0.4599, + "step": 2796 + }, + { + "epoch": 2.242983159582999, + "grad_norm": 0.8961299657821655, + "learning_rate": 3.159075166663653e-06, + "loss": 0.465, + "step": 2797 + }, + { + "epoch": 2.2437850842020852, + "grad_norm": 0.8370431661605835, + "learning_rate": 3.1527616824728356e-06, + "loss": 0.4863, + "step": 2798 + }, + { + "epoch": 2.244587008821171, + "grad_norm": 0.8903129696846008, + "learning_rate": 3.1464533325744997e-06, + "loss": 0.4786, + "step": 2799 + }, + { + "epoch": 2.245388933440257, + "grad_norm": 0.8716090321540833, + "learning_rate": 3.140150121698864e-06, + "loss": 0.5046, + "step": 2800 + }, + { + "epoch": 2.2461908580593426, + "grad_norm": 0.8388259410858154, + "learning_rate": 3.1338520545722852e-06, + "loss": 0.4695, + "step": 2801 + }, + { + "epoch": 2.2469927826784284, + "grad_norm": 0.8695220351219177, + "learning_rate": 3.1275591359172698e-06, + "loss": 0.497, + "step": 2802 + }, + { + "epoch": 2.247794707297514, + "grad_norm": 0.8859379291534424, + "learning_rate": 3.1212713704524644e-06, + "loss": 0.5044, + "step": 2803 + }, + { + "epoch": 2.2485966319166, + "grad_norm": 0.8411895632743835, + "learning_rate": 3.114988762892649e-06, + "loss": 0.4905, + "step": 2804 + }, + { + "epoch": 2.2493985565356858, + "grad_norm": 0.874116837978363, + "learning_rate": 3.1087113179487394e-06, + "loss": 0.5039, + "step": 2805 + }, + { + "epoch": 2.2502004811547716, + "grad_norm": 0.8561678528785706, + "learning_rate": 3.102439040327773e-06, + "loss": 0.4955, + "step": 2806 + }, + { + "epoch": 2.2510024057738574, + "grad_norm": 0.8340683579444885, + "learning_rate": 3.096171934732918e-06, + "loss": 0.4832, + "step": 2807 + }, + { + "epoch": 2.251804330392943, + "grad_norm": 0.8148999810218811, + "learning_rate": 3.0899100058634646e-06, + "loss": 0.4948, + "step": 2808 + }, + { + "epoch": 2.252606255012029, + "grad_norm": 0.827923595905304, + "learning_rate": 3.0836532584148237e-06, + "loss": 0.4748, + "step": 2809 + }, + { + "epoch": 2.2534081796311147, + "grad_norm": 0.852433443069458, + "learning_rate": 3.0774016970785116e-06, + "loss": 0.5062, + "step": 2810 + }, + { + "epoch": 2.2542101042502005, + "grad_norm": 0.8013387322425842, + "learning_rate": 3.0711553265421645e-06, + "loss": 0.4702, + "step": 2811 + }, + { + "epoch": 2.2550120288692863, + "grad_norm": 0.8474105596542358, + "learning_rate": 3.0649141514895243e-06, + "loss": 0.4855, + "step": 2812 + }, + { + "epoch": 2.255813953488372, + "grad_norm": 0.844879150390625, + "learning_rate": 3.058678176600436e-06, + "loss": 0.503, + "step": 2813 + }, + { + "epoch": 2.256615878107458, + "grad_norm": 0.8638342022895813, + "learning_rate": 3.0524474065508492e-06, + "loss": 0.4956, + "step": 2814 + }, + { + "epoch": 2.2574178027265437, + "grad_norm": 0.8933137655258179, + "learning_rate": 3.0462218460128e-06, + "loss": 0.5089, + "step": 2815 + }, + { + "epoch": 2.2582197273456295, + "grad_norm": 0.8648194074630737, + "learning_rate": 3.0400014996544314e-06, + "loss": 0.4774, + "step": 2816 + }, + { + "epoch": 2.2590216519647153, + "grad_norm": 0.8064270615577698, + "learning_rate": 3.0337863721399694e-06, + "loss": 0.4732, + "step": 2817 + }, + { + "epoch": 2.259823576583801, + "grad_norm": 0.8423921465873718, + "learning_rate": 3.0275764681297292e-06, + "loss": 0.4763, + "step": 2818 + }, + { + "epoch": 2.260625501202887, + "grad_norm": 0.8474961519241333, + "learning_rate": 3.02137179228011e-06, + "loss": 0.4917, + "step": 2819 + }, + { + "epoch": 2.2614274258219726, + "grad_norm": 0.8607854843139648, + "learning_rate": 3.0151723492435837e-06, + "loss": 0.4909, + "step": 2820 + }, + { + "epoch": 2.2622293504410584, + "grad_norm": 0.8570009469985962, + "learning_rate": 3.008978143668707e-06, + "loss": 0.48, + "step": 2821 + }, + { + "epoch": 2.2630312750601442, + "grad_norm": 0.8390832543373108, + "learning_rate": 3.00278918020011e-06, + "loss": 0.4902, + "step": 2822 + }, + { + "epoch": 2.26383319967923, + "grad_norm": 0.8215196132659912, + "learning_rate": 2.9966054634784756e-06, + "loss": 0.4789, + "step": 2823 + }, + { + "epoch": 2.264635124298316, + "grad_norm": 0.836609423160553, + "learning_rate": 2.990426998140582e-06, + "loss": 0.4532, + "step": 2824 + }, + { + "epoch": 2.2654370489174016, + "grad_norm": 0.8278366923332214, + "learning_rate": 2.9842537888192414e-06, + "loss": 0.4777, + "step": 2825 + }, + { + "epoch": 2.2662389735364874, + "grad_norm": 0.8481134176254272, + "learning_rate": 2.97808584014334e-06, + "loss": 0.476, + "step": 2826 + }, + { + "epoch": 2.267040898155573, + "grad_norm": 0.8720846176147461, + "learning_rate": 2.9719231567378182e-06, + "loss": 0.4861, + "step": 2827 + }, + { + "epoch": 2.267842822774659, + "grad_norm": 0.8726981282234192, + "learning_rate": 2.9657657432236573e-06, + "loss": 0.4858, + "step": 2828 + }, + { + "epoch": 2.268644747393745, + "grad_norm": 0.9136954545974731, + "learning_rate": 2.959613604217908e-06, + "loss": 0.5009, + "step": 2829 + }, + { + "epoch": 2.2694466720128306, + "grad_norm": 0.8403001427650452, + "learning_rate": 2.953466744333644e-06, + "loss": 0.4844, + "step": 2830 + }, + { + "epoch": 2.270248596631917, + "grad_norm": 0.8584656715393066, + "learning_rate": 2.947325168179994e-06, + "loss": 0.4881, + "step": 2831 + }, + { + "epoch": 2.2710505212510026, + "grad_norm": 0.8448304533958435, + "learning_rate": 2.9411888803621237e-06, + "loss": 0.4938, + "step": 2832 + }, + { + "epoch": 2.2718524458700884, + "grad_norm": 0.8816596269607544, + "learning_rate": 2.9350578854812194e-06, + "loss": 0.5019, + "step": 2833 + }, + { + "epoch": 2.272654370489174, + "grad_norm": 0.8737924098968506, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.4926, + "step": 2834 + }, + { + "epoch": 2.27345629510826, + "grad_norm": 0.8343014121055603, + "learning_rate": 2.922811792915291e-06, + "loss": 0.4696, + "step": 2835 + }, + { + "epoch": 2.2742582197273458, + "grad_norm": 0.8349881768226624, + "learning_rate": 2.916696704412789e-06, + "loss": 0.4689, + "step": 2836 + }, + { + "epoch": 2.2750601443464316, + "grad_norm": 0.8880947232246399, + "learning_rate": 2.9105869272123366e-06, + "loss": 0.5054, + "step": 2837 + }, + { + "epoch": 2.2758620689655173, + "grad_norm": 0.8601458072662354, + "learning_rate": 2.9044824658952407e-06, + "loss": 0.4892, + "step": 2838 + }, + { + "epoch": 2.276663993584603, + "grad_norm": 0.8508468270301819, + "learning_rate": 2.898383325038838e-06, + "loss": 0.48, + "step": 2839 + }, + { + "epoch": 2.277465918203689, + "grad_norm": 0.8209986090660095, + "learning_rate": 2.8922895092164773e-06, + "loss": 0.4873, + "step": 2840 + }, + { + "epoch": 2.2782678428227747, + "grad_norm": 0.820483922958374, + "learning_rate": 2.886201022997497e-06, + "loss": 0.4926, + "step": 2841 + }, + { + "epoch": 2.2790697674418605, + "grad_norm": 0.8504437208175659, + "learning_rate": 2.8801178709472645e-06, + "loss": 0.4753, + "step": 2842 + }, + { + "epoch": 2.2798716920609463, + "grad_norm": 0.8321656584739685, + "learning_rate": 2.8740400576271265e-06, + "loss": 0.465, + "step": 2843 + }, + { + "epoch": 2.280673616680032, + "grad_norm": 0.8143665194511414, + "learning_rate": 2.8679675875944356e-06, + "loss": 0.4641, + "step": 2844 + }, + { + "epoch": 2.281475541299118, + "grad_norm": 0.8252444267272949, + "learning_rate": 2.8619004654025418e-06, + "loss": 0.4784, + "step": 2845 + }, + { + "epoch": 2.2822774659182037, + "grad_norm": 0.8725248575210571, + "learning_rate": 2.85583869560077e-06, + "loss": 0.4836, + "step": 2846 + }, + { + "epoch": 2.2830793905372895, + "grad_norm": 0.8364203572273254, + "learning_rate": 2.8497822827344522e-06, + "loss": 0.4934, + "step": 2847 + }, + { + "epoch": 2.2838813151563753, + "grad_norm": 0.9842785000801086, + "learning_rate": 2.8437312313448863e-06, + "loss": 0.5294, + "step": 2848 + }, + { + "epoch": 2.284683239775461, + "grad_norm": 0.8217899799346924, + "learning_rate": 2.837685545969359e-06, + "loss": 0.4845, + "step": 2849 + }, + { + "epoch": 2.285485164394547, + "grad_norm": 0.847512423992157, + "learning_rate": 2.8316452311411326e-06, + "loss": 0.4888, + "step": 2850 + }, + { + "epoch": 2.2862870890136326, + "grad_norm": 0.8483214974403381, + "learning_rate": 2.8256102913894355e-06, + "loss": 0.4856, + "step": 2851 + }, + { + "epoch": 2.2870890136327184, + "grad_norm": 0.8673418760299683, + "learning_rate": 2.8195807312394763e-06, + "loss": 0.4837, + "step": 2852 + }, + { + "epoch": 2.2878909382518042, + "grad_norm": 0.8354519009590149, + "learning_rate": 2.8135565552124224e-06, + "loss": 0.4729, + "step": 2853 + }, + { + "epoch": 2.28869286287089, + "grad_norm": 0.8773465752601624, + "learning_rate": 2.8075377678254058e-06, + "loss": 0.492, + "step": 2854 + }, + { + "epoch": 2.289494787489976, + "grad_norm": 0.8436979055404663, + "learning_rate": 2.801524373591522e-06, + "loss": 0.4971, + "step": 2855 + }, + { + "epoch": 2.2902967121090616, + "grad_norm": 0.8587144613265991, + "learning_rate": 2.7955163770198136e-06, + "loss": 0.4994, + "step": 2856 + }, + { + "epoch": 2.2910986367281474, + "grad_norm": 0.8141018152236938, + "learning_rate": 2.789513782615283e-06, + "loss": 0.4767, + "step": 2857 + }, + { + "epoch": 2.291900561347233, + "grad_norm": 0.8428380489349365, + "learning_rate": 2.78351659487888e-06, + "loss": 0.5074, + "step": 2858 + }, + { + "epoch": 2.292702485966319, + "grad_norm": 0.8327274918556213, + "learning_rate": 2.777524818307501e-06, + "loss": 0.4801, + "step": 2859 + }, + { + "epoch": 2.293504410585405, + "grad_norm": 0.8546361327171326, + "learning_rate": 2.7715384573939865e-06, + "loss": 0.4726, + "step": 2860 + }, + { + "epoch": 2.2943063352044906, + "grad_norm": 0.8553916811943054, + "learning_rate": 2.7655575166271067e-06, + "loss": 0.4861, + "step": 2861 + }, + { + "epoch": 2.295108259823577, + "grad_norm": 0.8089895844459534, + "learning_rate": 2.7595820004915795e-06, + "loss": 0.4627, + "step": 2862 + }, + { + "epoch": 2.295910184442662, + "grad_norm": 0.8807501196861267, + "learning_rate": 2.7536119134680493e-06, + "loss": 0.4779, + "step": 2863 + }, + { + "epoch": 2.2967121090617484, + "grad_norm": 0.8504637479782104, + "learning_rate": 2.747647260033095e-06, + "loss": 0.5165, + "step": 2864 + }, + { + "epoch": 2.297514033680834, + "grad_norm": 0.8142641186714172, + "learning_rate": 2.7416880446592087e-06, + "loss": 0.4904, + "step": 2865 + }, + { + "epoch": 2.29831595829992, + "grad_norm": 0.8312305808067322, + "learning_rate": 2.7357342718148184e-06, + "loss": 0.5014, + "step": 2866 + }, + { + "epoch": 2.2991178829190058, + "grad_norm": 0.8162922859191895, + "learning_rate": 2.729785945964264e-06, + "loss": 0.4728, + "step": 2867 + }, + { + "epoch": 2.2999198075380916, + "grad_norm": 0.8514026999473572, + "learning_rate": 2.723843071567803e-06, + "loss": 0.4964, + "step": 2868 + }, + { + "epoch": 2.3007217321571773, + "grad_norm": 0.8272191882133484, + "learning_rate": 2.717905653081608e-06, + "loss": 0.4783, + "step": 2869 + }, + { + "epoch": 2.301523656776263, + "grad_norm": 0.8552436232566833, + "learning_rate": 2.7119736949577534e-06, + "loss": 0.4773, + "step": 2870 + }, + { + "epoch": 2.302325581395349, + "grad_norm": 0.8255532383918762, + "learning_rate": 2.706047201644224e-06, + "loss": 0.4754, + "step": 2871 + }, + { + "epoch": 2.3031275060144347, + "grad_norm": 0.8571800589561462, + "learning_rate": 2.7001261775849086e-06, + "loss": 0.5073, + "step": 2872 + }, + { + "epoch": 2.3039294306335205, + "grad_norm": 0.8622461557388306, + "learning_rate": 2.69421062721959e-06, + "loss": 0.5018, + "step": 2873 + }, + { + "epoch": 2.3047313552526063, + "grad_norm": 0.8514299392700195, + "learning_rate": 2.688300554983955e-06, + "loss": 0.4727, + "step": 2874 + }, + { + "epoch": 2.305533279871692, + "grad_norm": 0.8603047728538513, + "learning_rate": 2.682395965309569e-06, + "loss": 0.4859, + "step": 2875 + }, + { + "epoch": 2.306335204490778, + "grad_norm": 0.8229160308837891, + "learning_rate": 2.6764968626238986e-06, + "loss": 0.4869, + "step": 2876 + }, + { + "epoch": 2.3071371291098637, + "grad_norm": 0.8354660868644714, + "learning_rate": 2.6706032513502913e-06, + "loss": 0.4736, + "step": 2877 + }, + { + "epoch": 2.3079390537289495, + "grad_norm": 0.8746702075004578, + "learning_rate": 2.664715135907977e-06, + "loss": 0.482, + "step": 2878 + }, + { + "epoch": 2.3087409783480353, + "grad_norm": 0.8408623933792114, + "learning_rate": 2.65883252071207e-06, + "loss": 0.4917, + "step": 2879 + }, + { + "epoch": 2.309542902967121, + "grad_norm": 0.8481791615486145, + "learning_rate": 2.652955410173548e-06, + "loss": 0.4974, + "step": 2880 + }, + { + "epoch": 2.310344827586207, + "grad_norm": 0.8575053215026855, + "learning_rate": 2.6470838086992724e-06, + "loss": 0.4732, + "step": 2881 + }, + { + "epoch": 2.3111467522052926, + "grad_norm": 0.857205867767334, + "learning_rate": 2.641217720691972e-06, + "loss": 0.4728, + "step": 2882 + }, + { + "epoch": 2.3119486768243784, + "grad_norm": 0.8567532300949097, + "learning_rate": 2.6353571505502317e-06, + "loss": 0.4909, + "step": 2883 + }, + { + "epoch": 2.3127506014434642, + "grad_norm": 0.8322728276252747, + "learning_rate": 2.6295021026685176e-06, + "loss": 0.4863, + "step": 2884 + }, + { + "epoch": 2.31355252606255, + "grad_norm": 0.8363150954246521, + "learning_rate": 2.623652581437135e-06, + "loss": 0.4861, + "step": 2885 + }, + { + "epoch": 2.314354450681636, + "grad_norm": 0.8756260871887207, + "learning_rate": 2.617808591242258e-06, + "loss": 0.5115, + "step": 2886 + }, + { + "epoch": 2.3151563753007216, + "grad_norm": 0.8478529453277588, + "learning_rate": 2.6119701364659124e-06, + "loss": 0.4944, + "step": 2887 + }, + { + "epoch": 2.3159582999198074, + "grad_norm": 0.8674932718276978, + "learning_rate": 2.6061372214859595e-06, + "loss": 0.4886, + "step": 2888 + }, + { + "epoch": 2.316760224538893, + "grad_norm": 0.8758816719055176, + "learning_rate": 2.6003098506761316e-06, + "loss": 0.5083, + "step": 2889 + }, + { + "epoch": 2.317562149157979, + "grad_norm": 0.8525510430335999, + "learning_rate": 2.5944880284059804e-06, + "loss": 0.4809, + "step": 2890 + }, + { + "epoch": 2.3183640737770648, + "grad_norm": 0.840334415435791, + "learning_rate": 2.588671759040909e-06, + "loss": 0.4929, + "step": 2891 + }, + { + "epoch": 2.3191659983961506, + "grad_norm": 0.8207436203956604, + "learning_rate": 2.582861046942158e-06, + "loss": 0.4818, + "step": 2892 + }, + { + "epoch": 2.319967923015237, + "grad_norm": 0.8692548871040344, + "learning_rate": 2.577055896466788e-06, + "loss": 0.4912, + "step": 2893 + }, + { + "epoch": 2.320769847634322, + "grad_norm": 0.8304778337478638, + "learning_rate": 2.571256311967709e-06, + "loss": 0.4636, + "step": 2894 + }, + { + "epoch": 2.3215717722534084, + "grad_norm": 0.8468001484870911, + "learning_rate": 2.565462297793644e-06, + "loss": 0.4784, + "step": 2895 + }, + { + "epoch": 2.322373696872494, + "grad_norm": 0.8657370209693909, + "learning_rate": 2.5596738582891335e-06, + "loss": 0.4685, + "step": 2896 + }, + { + "epoch": 2.32317562149158, + "grad_norm": 0.8483834862709045, + "learning_rate": 2.5538909977945593e-06, + "loss": 0.4642, + "step": 2897 + }, + { + "epoch": 2.3239775461106658, + "grad_norm": 0.8330668210983276, + "learning_rate": 2.5481137206460994e-06, + "loss": 0.4746, + "step": 2898 + }, + { + "epoch": 2.3247794707297516, + "grad_norm": 0.7988243103027344, + "learning_rate": 2.542342031175754e-06, + "loss": 0.4604, + "step": 2899 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 0.8841968774795532, + "learning_rate": 2.536575933711336e-06, + "loss": 0.4834, + "step": 2900 + }, + { + "epoch": 2.326383319967923, + "grad_norm": 0.9015833735466003, + "learning_rate": 2.5308154325764543e-06, + "loss": 0.4787, + "step": 2901 + }, + { + "epoch": 2.327185244587009, + "grad_norm": 0.8655214905738831, + "learning_rate": 2.5250605320905387e-06, + "loss": 0.5111, + "step": 2902 + }, + { + "epoch": 2.3279871692060947, + "grad_norm": 0.8751394152641296, + "learning_rate": 2.519311236568801e-06, + "loss": 0.4828, + "step": 2903 + }, + { + "epoch": 2.3287890938251805, + "grad_norm": 0.8501570820808411, + "learning_rate": 2.5135675503222623e-06, + "loss": 0.4704, + "step": 2904 + }, + { + "epoch": 2.3295910184442663, + "grad_norm": 0.8615387678146362, + "learning_rate": 2.5078294776577372e-06, + "loss": 0.4816, + "step": 2905 + }, + { + "epoch": 2.330392943063352, + "grad_norm": 0.8755018711090088, + "learning_rate": 2.5020970228778198e-06, + "loss": 0.4831, + "step": 2906 + }, + { + "epoch": 2.331194867682438, + "grad_norm": 0.877673327922821, + "learning_rate": 2.49637019028091e-06, + "loss": 0.4664, + "step": 2907 + }, + { + "epoch": 2.3319967923015237, + "grad_norm": 0.8591618537902832, + "learning_rate": 2.4906489841611736e-06, + "loss": 0.4914, + "step": 2908 + }, + { + "epoch": 2.3327987169206095, + "grad_norm": 0.820887565612793, + "learning_rate": 2.48493340880857e-06, + "loss": 0.4756, + "step": 2909 + }, + { + "epoch": 2.3336006415396953, + "grad_norm": 0.8460831642150879, + "learning_rate": 2.4792234685088312e-06, + "loss": 0.4756, + "step": 2910 + }, + { + "epoch": 2.334402566158781, + "grad_norm": 0.8340601921081543, + "learning_rate": 2.473519167543467e-06, + "loss": 0.4647, + "step": 2911 + }, + { + "epoch": 2.335204490777867, + "grad_norm": 0.8154265284538269, + "learning_rate": 2.4678205101897523e-06, + "loss": 0.4587, + "step": 2912 + }, + { + "epoch": 2.3360064153969526, + "grad_norm": 0.8957749605178833, + "learning_rate": 2.462127500720737e-06, + "loss": 0.4981, + "step": 2913 + }, + { + "epoch": 2.3368083400160384, + "grad_norm": 0.8637145757675171, + "learning_rate": 2.456440143405232e-06, + "loss": 0.4774, + "step": 2914 + }, + { + "epoch": 2.3376102646351242, + "grad_norm": 0.8834245800971985, + "learning_rate": 2.4507584425078133e-06, + "loss": 0.4996, + "step": 2915 + }, + { + "epoch": 2.33841218925421, + "grad_norm": 0.8881711363792419, + "learning_rate": 2.4450824022888166e-06, + "loss": 0.509, + "step": 2916 + }, + { + "epoch": 2.339214113873296, + "grad_norm": 0.9144017100334167, + "learning_rate": 2.4394120270043233e-06, + "loss": 0.4873, + "step": 2917 + }, + { + "epoch": 2.3400160384923816, + "grad_norm": 0.8431651592254639, + "learning_rate": 2.433747320906177e-06, + "loss": 0.464, + "step": 2918 + }, + { + "epoch": 2.3408179631114674, + "grad_norm": 0.8841955661773682, + "learning_rate": 2.4280882882419676e-06, + "loss": 0.4791, + "step": 2919 + }, + { + "epoch": 2.341619887730553, + "grad_norm": 0.9004083871841431, + "learning_rate": 2.4224349332550313e-06, + "loss": 0.5035, + "step": 2920 + }, + { + "epoch": 2.342421812349639, + "grad_norm": 0.8716943264007568, + "learning_rate": 2.4167872601844476e-06, + "loss": 0.4744, + "step": 2921 + }, + { + "epoch": 2.3432237369687248, + "grad_norm": 0.8678621053695679, + "learning_rate": 2.411145273265029e-06, + "loss": 0.4983, + "step": 2922 + }, + { + "epoch": 2.3440256615878106, + "grad_norm": 0.8493649959564209, + "learning_rate": 2.405508976727332e-06, + "loss": 0.4798, + "step": 2923 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 0.8921751379966736, + "learning_rate": 2.3998783747976473e-06, + "loss": 0.5009, + "step": 2924 + }, + { + "epoch": 2.345629510825982, + "grad_norm": 0.8245465755462646, + "learning_rate": 2.3942534716979827e-06, + "loss": 0.473, + "step": 2925 + }, + { + "epoch": 2.3464314354450684, + "grad_norm": 0.7934049367904663, + "learning_rate": 2.3886342716460932e-06, + "loss": 0.452, + "step": 2926 + }, + { + "epoch": 2.3472333600641537, + "grad_norm": 0.8302115201950073, + "learning_rate": 2.3830207788554394e-06, + "loss": 0.4759, + "step": 2927 + }, + { + "epoch": 2.34803528468324, + "grad_norm": 0.8664802312850952, + "learning_rate": 2.3774129975352112e-06, + "loss": 0.4752, + "step": 2928 + }, + { + "epoch": 2.3488372093023258, + "grad_norm": 0.8159329295158386, + "learning_rate": 2.371810931890316e-06, + "loss": 0.4827, + "step": 2929 + }, + { + "epoch": 2.3496391339214115, + "grad_norm": 0.8080800771713257, + "learning_rate": 2.366214586121366e-06, + "loss": 0.4715, + "step": 2930 + }, + { + "epoch": 2.3504410585404973, + "grad_norm": 0.8522915840148926, + "learning_rate": 2.360623964424703e-06, + "loss": 0.494, + "step": 2931 + }, + { + "epoch": 2.351242983159583, + "grad_norm": 0.8710372447967529, + "learning_rate": 2.3550390709923575e-06, + "loss": 0.4748, + "step": 2932 + }, + { + "epoch": 2.352044907778669, + "grad_norm": 0.8452877998352051, + "learning_rate": 2.349459910012075e-06, + "loss": 0.4708, + "step": 2933 + }, + { + "epoch": 2.3528468323977547, + "grad_norm": 0.8797398805618286, + "learning_rate": 2.343886485667303e-06, + "loss": 0.4782, + "step": 2934 + }, + { + "epoch": 2.3536487570168405, + "grad_norm": 0.8327314853668213, + "learning_rate": 2.3383188021371773e-06, + "loss": 0.478, + "step": 2935 + }, + { + "epoch": 2.3544506816359263, + "grad_norm": 0.8496788740158081, + "learning_rate": 2.332756863596547e-06, + "loss": 0.4457, + "step": 2936 + }, + { + "epoch": 2.355252606255012, + "grad_norm": 0.842319130897522, + "learning_rate": 2.327200674215937e-06, + "loss": 0.4896, + "step": 2937 + }, + { + "epoch": 2.356054530874098, + "grad_norm": 0.8751649260520935, + "learning_rate": 2.3216502381615633e-06, + "loss": 0.5128, + "step": 2938 + }, + { + "epoch": 2.3568564554931837, + "grad_norm": 0.8716158866882324, + "learning_rate": 2.316105559595342e-06, + "loss": 0.4961, + "step": 2939 + }, + { + "epoch": 2.3576583801122695, + "grad_norm": 0.9204161763191223, + "learning_rate": 2.310566642674854e-06, + "loss": 0.4853, + "step": 2940 + }, + { + "epoch": 2.3584603047313553, + "grad_norm": 0.9012963175773621, + "learning_rate": 2.3050334915533713e-06, + "loss": 0.4946, + "step": 2941 + }, + { + "epoch": 2.359262229350441, + "grad_norm": 0.8790763020515442, + "learning_rate": 2.2995061103798397e-06, + "loss": 0.4866, + "step": 2942 + }, + { + "epoch": 2.360064153969527, + "grad_norm": 0.8495937585830688, + "learning_rate": 2.2939845032988707e-06, + "loss": 0.4877, + "step": 2943 + }, + { + "epoch": 2.3608660785886126, + "grad_norm": 0.8435322046279907, + "learning_rate": 2.288468674450766e-06, + "loss": 0.4856, + "step": 2944 + }, + { + "epoch": 2.3616680032076984, + "grad_norm": 0.8671093583106995, + "learning_rate": 2.28295862797147e-06, + "loss": 0.4737, + "step": 2945 + }, + { + "epoch": 2.362469927826784, + "grad_norm": 0.8573446273803711, + "learning_rate": 2.27745436799261e-06, + "loss": 0.5076, + "step": 2946 + }, + { + "epoch": 2.36327185244587, + "grad_norm": 0.8513672947883606, + "learning_rate": 2.271955898641467e-06, + "loss": 0.4653, + "step": 2947 + }, + { + "epoch": 2.364073777064956, + "grad_norm": 0.8536444306373596, + "learning_rate": 2.2664632240409746e-06, + "loss": 0.4721, + "step": 2948 + }, + { + "epoch": 2.3648757016840416, + "grad_norm": 0.8997591733932495, + "learning_rate": 2.260976348309737e-06, + "loss": 0.5051, + "step": 2949 + }, + { + "epoch": 2.3656776263031274, + "grad_norm": 0.854433536529541, + "learning_rate": 2.255495275561993e-06, + "loss": 0.485, + "step": 2950 + }, + { + "epoch": 2.366479550922213, + "grad_norm": 0.9025599956512451, + "learning_rate": 2.2500200099076395e-06, + "loss": 0.4982, + "step": 2951 + }, + { + "epoch": 2.367281475541299, + "grad_norm": 0.7927220463752747, + "learning_rate": 2.2445505554522207e-06, + "loss": 0.4662, + "step": 2952 + }, + { + "epoch": 2.3680834001603848, + "grad_norm": 0.8282037973403931, + "learning_rate": 2.239086916296914e-06, + "loss": 0.4682, + "step": 2953 + }, + { + "epoch": 2.3688853247794706, + "grad_norm": 0.8752132654190063, + "learning_rate": 2.2336290965385454e-06, + "loss": 0.4766, + "step": 2954 + }, + { + "epoch": 2.3696872493985564, + "grad_norm": 0.8500027656555176, + "learning_rate": 2.228177100269573e-06, + "loss": 0.4887, + "step": 2955 + }, + { + "epoch": 2.370489174017642, + "grad_norm": 0.8315219879150391, + "learning_rate": 2.22273093157809e-06, + "loss": 0.495, + "step": 2956 + }, + { + "epoch": 2.3712910986367284, + "grad_norm": 0.8728612065315247, + "learning_rate": 2.217290594547822e-06, + "loss": 0.4768, + "step": 2957 + }, + { + "epoch": 2.3720930232558137, + "grad_norm": 0.8619360327720642, + "learning_rate": 2.2118560932581123e-06, + "loss": 0.4768, + "step": 2958 + }, + { + "epoch": 2.3728949478749, + "grad_norm": 0.8790128231048584, + "learning_rate": 2.2064274317839394e-06, + "loss": 0.4638, + "step": 2959 + }, + { + "epoch": 2.3736968724939858, + "grad_norm": 0.8765968680381775, + "learning_rate": 2.2010046141958973e-06, + "loss": 0.4762, + "step": 2960 + }, + { + "epoch": 2.3744987971130715, + "grad_norm": 0.790704607963562, + "learning_rate": 2.1955876445602008e-06, + "loss": 0.4694, + "step": 2961 + }, + { + "epoch": 2.3753007217321573, + "grad_norm": 0.8622111082077026, + "learning_rate": 2.190176526938679e-06, + "loss": 0.4642, + "step": 2962 + }, + { + "epoch": 2.376102646351243, + "grad_norm": 0.8704236149787903, + "learning_rate": 2.1847712653887687e-06, + "loss": 0.4809, + "step": 2963 + }, + { + "epoch": 2.376904570970329, + "grad_norm": 0.8916087746620178, + "learning_rate": 2.17937186396352e-06, + "loss": 0.5033, + "step": 2964 + }, + { + "epoch": 2.3777064955894147, + "grad_norm": 0.8955850601196289, + "learning_rate": 2.1739783267115888e-06, + "loss": 0.5026, + "step": 2965 + }, + { + "epoch": 2.3785084202085005, + "grad_norm": 0.860222339630127, + "learning_rate": 2.1685906576772365e-06, + "loss": 0.4896, + "step": 2966 + }, + { + "epoch": 2.3793103448275863, + "grad_norm": 0.8153157234191895, + "learning_rate": 2.1632088609003133e-06, + "loss": 0.4615, + "step": 2967 + }, + { + "epoch": 2.380112269446672, + "grad_norm": 0.8858957290649414, + "learning_rate": 2.157832940416279e-06, + "loss": 0.4947, + "step": 2968 + }, + { + "epoch": 2.380914194065758, + "grad_norm": 0.8330368995666504, + "learning_rate": 2.1524629002561803e-06, + "loss": 0.4623, + "step": 2969 + }, + { + "epoch": 2.3817161186848437, + "grad_norm": 0.9089009165763855, + "learning_rate": 2.1470987444466564e-06, + "loss": 0.4907, + "step": 2970 + }, + { + "epoch": 2.3825180433039295, + "grad_norm": 0.805916965007782, + "learning_rate": 2.141740477009937e-06, + "loss": 0.4827, + "step": 2971 + }, + { + "epoch": 2.3833199679230153, + "grad_norm": 0.85927814245224, + "learning_rate": 2.1363881019638277e-06, + "loss": 0.4669, + "step": 2972 + }, + { + "epoch": 2.384121892542101, + "grad_norm": 0.9052659273147583, + "learning_rate": 2.1310416233217246e-06, + "loss": 0.5256, + "step": 2973 + }, + { + "epoch": 2.384923817161187, + "grad_norm": 0.8463844656944275, + "learning_rate": 2.1257010450926e-06, + "loss": 0.4947, + "step": 2974 + }, + { + "epoch": 2.3857257417802726, + "grad_norm": 0.8669213652610779, + "learning_rate": 2.1203663712809995e-06, + "loss": 0.4768, + "step": 2975 + }, + { + "epoch": 2.3865276663993584, + "grad_norm": 0.8719237446784973, + "learning_rate": 2.115037605887048e-06, + "loss": 0.5004, + "step": 2976 + }, + { + "epoch": 2.387329591018444, + "grad_norm": 0.8576990365982056, + "learning_rate": 2.1097147529064286e-06, + "loss": 0.4877, + "step": 2977 + }, + { + "epoch": 2.38813151563753, + "grad_norm": 0.8337488770484924, + "learning_rate": 2.104397816330401e-06, + "loss": 0.4715, + "step": 2978 + }, + { + "epoch": 2.388933440256616, + "grad_norm": 0.8252597451210022, + "learning_rate": 2.0990868001457853e-06, + "loss": 0.4819, + "step": 2979 + }, + { + "epoch": 2.3897353648757016, + "grad_norm": 0.8549776077270508, + "learning_rate": 2.093781708334962e-06, + "loss": 0.4702, + "step": 2980 + }, + { + "epoch": 2.3905372894947874, + "grad_norm": 0.8355644345283508, + "learning_rate": 2.088482544875873e-06, + "loss": 0.4758, + "step": 2981 + }, + { + "epoch": 2.391339214113873, + "grad_norm": 0.857735276222229, + "learning_rate": 2.0831893137420046e-06, + "loss": 0.4998, + "step": 2982 + }, + { + "epoch": 2.392141138732959, + "grad_norm": 0.9026484489440918, + "learning_rate": 2.077902018902407e-06, + "loss": 0.475, + "step": 2983 + }, + { + "epoch": 2.3929430633520448, + "grad_norm": 0.8243964314460754, + "learning_rate": 2.072620664321674e-06, + "loss": 0.4798, + "step": 2984 + }, + { + "epoch": 2.3937449879711306, + "grad_norm": 0.8589446544647217, + "learning_rate": 2.067345253959938e-06, + "loss": 0.4483, + "step": 2985 + }, + { + "epoch": 2.3945469125902163, + "grad_norm": 0.8188499212265015, + "learning_rate": 2.0620757917728927e-06, + "loss": 0.4368, + "step": 2986 + }, + { + "epoch": 2.395348837209302, + "grad_norm": 0.8684476613998413, + "learning_rate": 2.0568122817117507e-06, + "loss": 0.4714, + "step": 2987 + }, + { + "epoch": 2.3961507618283884, + "grad_norm": 0.8384298086166382, + "learning_rate": 2.051554727723276e-06, + "loss": 0.4778, + "step": 2988 + }, + { + "epoch": 2.3969526864474737, + "grad_norm": 0.872604250907898, + "learning_rate": 2.046303133749764e-06, + "loss": 0.488, + "step": 2989 + }, + { + "epoch": 2.39775461106656, + "grad_norm": 0.9264422655105591, + "learning_rate": 2.041057503729028e-06, + "loss": 0.4955, + "step": 2990 + }, + { + "epoch": 2.3985565356856453, + "grad_norm": 0.8388825058937073, + "learning_rate": 2.035817841594434e-06, + "loss": 0.4809, + "step": 2991 + }, + { + "epoch": 2.3993584603047315, + "grad_norm": 0.9164281487464905, + "learning_rate": 2.0305841512748494e-06, + "loss": 0.4903, + "step": 2992 + }, + { + "epoch": 2.4001603849238173, + "grad_norm": 0.9148788452148438, + "learning_rate": 2.0253564366946764e-06, + "loss": 0.5062, + "step": 2993 + }, + { + "epoch": 2.400962309542903, + "grad_norm": 0.8578090071678162, + "learning_rate": 2.020134701773836e-06, + "loss": 0.4907, + "step": 2994 + }, + { + "epoch": 2.401764234161989, + "grad_norm": 0.8483836650848389, + "learning_rate": 2.0149189504277553e-06, + "loss": 0.4526, + "step": 2995 + }, + { + "epoch": 2.4025661587810747, + "grad_norm": 0.8625277280807495, + "learning_rate": 2.0097091865673923e-06, + "loss": 0.468, + "step": 2996 + }, + { + "epoch": 2.4033680834001605, + "grad_norm": 0.8608683347702026, + "learning_rate": 2.0045054140992002e-06, + "loss": 0.4907, + "step": 2997 + }, + { + "epoch": 2.4041700080192463, + "grad_norm": 0.83229660987854, + "learning_rate": 1.9993076369251406e-06, + "loss": 0.4727, + "step": 2998 + }, + { + "epoch": 2.404971932638332, + "grad_norm": 0.8340911865234375, + "learning_rate": 1.9941158589426924e-06, + "loss": 0.472, + "step": 2999 + }, + { + "epoch": 2.405773857257418, + "grad_norm": 0.9387159943580627, + "learning_rate": 1.9889300840448224e-06, + "loss": 0.4952, + "step": 3000 + }, + { + "epoch": 2.4065757818765037, + "grad_norm": 0.8406565189361572, + "learning_rate": 1.98375031612e-06, + "loss": 0.4659, + "step": 3001 + }, + { + "epoch": 2.4073777064955895, + "grad_norm": 0.9237212538719177, + "learning_rate": 1.9785765590521978e-06, + "loss": 0.4815, + "step": 3002 + }, + { + "epoch": 2.4081796311146753, + "grad_norm": 0.8607485294342041, + "learning_rate": 1.9734088167208664e-06, + "loss": 0.4842, + "step": 3003 + }, + { + "epoch": 2.408981555733761, + "grad_norm": 0.8381433486938477, + "learning_rate": 1.968247093000963e-06, + "loss": 0.4651, + "step": 3004 + }, + { + "epoch": 2.409783480352847, + "grad_norm": 0.8409081697463989, + "learning_rate": 1.96309139176292e-06, + "loss": 0.484, + "step": 3005 + }, + { + "epoch": 2.4105854049719326, + "grad_norm": 0.8303772807121277, + "learning_rate": 1.9579417168726566e-06, + "loss": 0.4851, + "step": 3006 + }, + { + "epoch": 2.4113873295910184, + "grad_norm": 0.8264473676681519, + "learning_rate": 1.9527980721915798e-06, + "loss": 0.4631, + "step": 3007 + }, + { + "epoch": 2.412189254210104, + "grad_norm": 0.8889646530151367, + "learning_rate": 1.9476604615765605e-06, + "loss": 0.4722, + "step": 3008 + }, + { + "epoch": 2.41299117882919, + "grad_norm": 0.8740735650062561, + "learning_rate": 1.942528888879964e-06, + "loss": 0.4926, + "step": 3009 + }, + { + "epoch": 2.413793103448276, + "grad_norm": 0.835271418094635, + "learning_rate": 1.937403357949611e-06, + "loss": 0.4663, + "step": 3010 + }, + { + "epoch": 2.4145950280673616, + "grad_norm": 0.8700141310691833, + "learning_rate": 1.932283872628803e-06, + "loss": 0.4773, + "step": 3011 + }, + { + "epoch": 2.4153969526864474, + "grad_norm": 0.8676986694335938, + "learning_rate": 1.927170436756305e-06, + "loss": 0.4952, + "step": 3012 + }, + { + "epoch": 2.416198877305533, + "grad_norm": 0.902349591255188, + "learning_rate": 1.922063054166341e-06, + "loss": 0.4806, + "step": 3013 + }, + { + "epoch": 2.417000801924619, + "grad_norm": 0.8744420409202576, + "learning_rate": 1.916961728688603e-06, + "loss": 0.4784, + "step": 3014 + }, + { + "epoch": 2.4178027265437048, + "grad_norm": 0.8729916214942932, + "learning_rate": 1.9118664641482386e-06, + "loss": 0.4974, + "step": 3015 + }, + { + "epoch": 2.4186046511627906, + "grad_norm": 0.9082834720611572, + "learning_rate": 1.9067772643658511e-06, + "loss": 0.487, + "step": 3016 + }, + { + "epoch": 2.4194065757818763, + "grad_norm": 0.8420137166976929, + "learning_rate": 1.901694133157499e-06, + "loss": 0.4468, + "step": 3017 + }, + { + "epoch": 2.420208500400962, + "grad_norm": 0.8739819526672363, + "learning_rate": 1.896617074334679e-06, + "loss": 0.4624, + "step": 3018 + }, + { + "epoch": 2.421010425020048, + "grad_norm": 0.8350537419319153, + "learning_rate": 1.8915460917043494e-06, + "loss": 0.4767, + "step": 3019 + }, + { + "epoch": 2.4218123496391337, + "grad_norm": 0.8347825407981873, + "learning_rate": 1.8864811890689016e-06, + "loss": 0.458, + "step": 3020 + }, + { + "epoch": 2.42261427425822, + "grad_norm": 0.8250964879989624, + "learning_rate": 1.8814223702261757e-06, + "loss": 0.4694, + "step": 3021 + }, + { + "epoch": 2.4234161988773053, + "grad_norm": 0.8303848505020142, + "learning_rate": 1.8763696389694463e-06, + "loss": 0.4831, + "step": 3022 + }, + { + "epoch": 2.4242181234963915, + "grad_norm": 0.8795514106750488, + "learning_rate": 1.8713229990874194e-06, + "loss": 0.4736, + "step": 3023 + }, + { + "epoch": 2.4250200481154773, + "grad_norm": 0.8881685733795166, + "learning_rate": 1.86628245436424e-06, + "loss": 0.4751, + "step": 3024 + }, + { + "epoch": 2.425821972734563, + "grad_norm": 0.9160009622573853, + "learning_rate": 1.8612480085794804e-06, + "loss": 0.5201, + "step": 3025 + }, + { + "epoch": 2.426623897353649, + "grad_norm": 0.8275811076164246, + "learning_rate": 1.8562196655081422e-06, + "loss": 0.4675, + "step": 3026 + }, + { + "epoch": 2.4274258219727347, + "grad_norm": 0.895634651184082, + "learning_rate": 1.8511974289206413e-06, + "loss": 0.4896, + "step": 3027 + }, + { + "epoch": 2.4282277465918205, + "grad_norm": 0.8538333177566528, + "learning_rate": 1.8461813025828268e-06, + "loss": 0.4749, + "step": 3028 + }, + { + "epoch": 2.4290296712109063, + "grad_norm": 0.8479071855545044, + "learning_rate": 1.8411712902559597e-06, + "loss": 0.4798, + "step": 3029 + }, + { + "epoch": 2.429831595829992, + "grad_norm": 0.875977635383606, + "learning_rate": 1.8361673956967175e-06, + "loss": 0.4951, + "step": 3030 + }, + { + "epoch": 2.430633520449078, + "grad_norm": 0.9012177586555481, + "learning_rate": 1.831169622657194e-06, + "loss": 0.5221, + "step": 3031 + }, + { + "epoch": 2.4314354450681637, + "grad_norm": 0.8884789943695068, + "learning_rate": 1.826177974884885e-06, + "loss": 0.4712, + "step": 3032 + }, + { + "epoch": 2.4322373696872495, + "grad_norm": 0.8643089532852173, + "learning_rate": 1.8211924561227001e-06, + "loss": 0.4645, + "step": 3033 + }, + { + "epoch": 2.4330392943063353, + "grad_norm": 0.8726524710655212, + "learning_rate": 1.816213070108951e-06, + "loss": 0.5044, + "step": 3034 + }, + { + "epoch": 2.433841218925421, + "grad_norm": 0.8304829597473145, + "learning_rate": 1.8112398205773507e-06, + "loss": 0.4747, + "step": 3035 + }, + { + "epoch": 2.434643143544507, + "grad_norm": 0.8465933799743652, + "learning_rate": 1.8062727112570133e-06, + "loss": 0.4924, + "step": 3036 + }, + { + "epoch": 2.4354450681635926, + "grad_norm": 0.8618703484535217, + "learning_rate": 1.8013117458724416e-06, + "loss": 0.4974, + "step": 3037 + }, + { + "epoch": 2.4362469927826784, + "grad_norm": 0.872769832611084, + "learning_rate": 1.79635692814354e-06, + "loss": 0.4641, + "step": 3038 + }, + { + "epoch": 2.437048917401764, + "grad_norm": 0.8966688513755798, + "learning_rate": 1.7914082617856022e-06, + "loss": 0.4678, + "step": 3039 + }, + { + "epoch": 2.43785084202085, + "grad_norm": 0.8599264025688171, + "learning_rate": 1.7864657505092964e-06, + "loss": 0.4602, + "step": 3040 + }, + { + "epoch": 2.438652766639936, + "grad_norm": 0.9014189839363098, + "learning_rate": 1.7815293980206993e-06, + "loss": 0.4846, + "step": 3041 + }, + { + "epoch": 2.4394546912590216, + "grad_norm": 0.8558383584022522, + "learning_rate": 1.776599208021247e-06, + "loss": 0.4674, + "step": 3042 + }, + { + "epoch": 2.4402566158781074, + "grad_norm": 0.8910719156265259, + "learning_rate": 1.7716751842077663e-06, + "loss": 0.4783, + "step": 3043 + }, + { + "epoch": 2.441058540497193, + "grad_norm": 0.865467369556427, + "learning_rate": 1.7667573302724606e-06, + "loss": 0.4687, + "step": 3044 + }, + { + "epoch": 2.441860465116279, + "grad_norm": 0.9217147827148438, + "learning_rate": 1.7618456499028968e-06, + "loss": 0.487, + "step": 3045 + }, + { + "epoch": 2.4426623897353648, + "grad_norm": 0.8558123707771301, + "learning_rate": 1.7569401467820302e-06, + "loss": 0.4507, + "step": 3046 + }, + { + "epoch": 2.4434643143544506, + "grad_norm": 0.8037233352661133, + "learning_rate": 1.752040824588167e-06, + "loss": 0.4415, + "step": 3047 + }, + { + "epoch": 2.4442662389735363, + "grad_norm": 0.9270918369293213, + "learning_rate": 1.7471476869949877e-06, + "loss": 0.5039, + "step": 3048 + }, + { + "epoch": 2.445068163592622, + "grad_norm": 0.8629381060600281, + "learning_rate": 1.7422607376715362e-06, + "loss": 0.4791, + "step": 3049 + }, + { + "epoch": 2.445870088211708, + "grad_norm": 0.8409698605537415, + "learning_rate": 1.7373799802822067e-06, + "loss": 0.4942, + "step": 3050 + }, + { + "epoch": 2.4466720128307937, + "grad_norm": 0.8830838203430176, + "learning_rate": 1.7325054184867652e-06, + "loss": 0.4756, + "step": 3051 + }, + { + "epoch": 2.44747393744988, + "grad_norm": 0.9066561460494995, + "learning_rate": 1.7276370559403188e-06, + "loss": 0.478, + "step": 3052 + }, + { + "epoch": 2.4482758620689653, + "grad_norm": 0.9000374674797058, + "learning_rate": 1.7227748962933343e-06, + "loss": 0.4911, + "step": 3053 + }, + { + "epoch": 2.4490777866880515, + "grad_norm": 0.9526277184486389, + "learning_rate": 1.7179189431916254e-06, + "loss": 0.5002, + "step": 3054 + }, + { + "epoch": 2.449879711307137, + "grad_norm": 0.85796719789505, + "learning_rate": 1.713069200276346e-06, + "loss": 0.4973, + "step": 3055 + }, + { + "epoch": 2.450681635926223, + "grad_norm": 0.8333845734596252, + "learning_rate": 1.708225671184003e-06, + "loss": 0.4688, + "step": 3056 + }, + { + "epoch": 2.451483560545309, + "grad_norm": 0.8949407935142517, + "learning_rate": 1.7033883595464407e-06, + "loss": 0.4913, + "step": 3057 + }, + { + "epoch": 2.4522854851643947, + "grad_norm": 0.8218083381652832, + "learning_rate": 1.6985572689908326e-06, + "loss": 0.4581, + "step": 3058 + }, + { + "epoch": 2.4530874097834805, + "grad_norm": 0.8660341501235962, + "learning_rate": 1.693732403139705e-06, + "loss": 0.4905, + "step": 3059 + }, + { + "epoch": 2.4538893344025663, + "grad_norm": 0.8607407808303833, + "learning_rate": 1.688913765610899e-06, + "loss": 0.4618, + "step": 3060 + }, + { + "epoch": 2.454691259021652, + "grad_norm": 0.8669100403785706, + "learning_rate": 1.684101360017596e-06, + "loss": 0.4626, + "step": 3061 + }, + { + "epoch": 2.455493183640738, + "grad_norm": 0.8576903939247131, + "learning_rate": 1.6792951899683018e-06, + "loss": 0.4873, + "step": 3062 + }, + { + "epoch": 2.4562951082598237, + "grad_norm": 0.8291400074958801, + "learning_rate": 1.6744952590668452e-06, + "loss": 0.4976, + "step": 3063 + }, + { + "epoch": 2.4570970328789095, + "grad_norm": 0.8194727897644043, + "learning_rate": 1.669701570912381e-06, + "loss": 0.4716, + "step": 3064 + }, + { + "epoch": 2.4578989574979953, + "grad_norm": 0.8513212203979492, + "learning_rate": 1.6649141290993765e-06, + "loss": 0.4765, + "step": 3065 + }, + { + "epoch": 2.458700882117081, + "grad_norm": 0.8940701484680176, + "learning_rate": 1.6601329372176177e-06, + "loss": 0.4738, + "step": 3066 + }, + { + "epoch": 2.459502806736167, + "grad_norm": 0.8830768465995789, + "learning_rate": 1.6553579988522083e-06, + "loss": 0.4837, + "step": 3067 + }, + { + "epoch": 2.4603047313552526, + "grad_norm": 0.8472455143928528, + "learning_rate": 1.6505893175835585e-06, + "loss": 0.4635, + "step": 3068 + }, + { + "epoch": 2.4611066559743384, + "grad_norm": 0.8620253801345825, + "learning_rate": 1.6458268969873892e-06, + "loss": 0.4813, + "step": 3069 + }, + { + "epoch": 2.461908580593424, + "grad_norm": 0.8335537910461426, + "learning_rate": 1.6410707406347227e-06, + "loss": 0.4595, + "step": 3070 + }, + { + "epoch": 2.46271050521251, + "grad_norm": 0.8217028975486755, + "learning_rate": 1.6363208520918882e-06, + "loss": 0.4551, + "step": 3071 + }, + { + "epoch": 2.463512429831596, + "grad_norm": 0.8587481379508972, + "learning_rate": 1.6315772349205139e-06, + "loss": 0.487, + "step": 3072 + }, + { + "epoch": 2.4643143544506816, + "grad_norm": 0.8531529903411865, + "learning_rate": 1.6268398926775286e-06, + "loss": 0.477, + "step": 3073 + }, + { + "epoch": 2.4651162790697674, + "grad_norm": 0.8320519328117371, + "learning_rate": 1.6221088289151477e-06, + "loss": 0.4625, + "step": 3074 + }, + { + "epoch": 2.465918203688853, + "grad_norm": 0.8659622669219971, + "learning_rate": 1.6173840471808856e-06, + "loss": 0.4866, + "step": 3075 + }, + { + "epoch": 2.466720128307939, + "grad_norm": 0.8781241178512573, + "learning_rate": 1.612665551017546e-06, + "loss": 0.4842, + "step": 3076 + }, + { + "epoch": 2.4675220529270248, + "grad_norm": 0.8649770021438599, + "learning_rate": 1.6079533439632166e-06, + "loss": 0.4856, + "step": 3077 + }, + { + "epoch": 2.4683239775461105, + "grad_norm": 0.8794416785240173, + "learning_rate": 1.6032474295512733e-06, + "loss": 0.4727, + "step": 3078 + }, + { + "epoch": 2.4691259021651963, + "grad_norm": 0.8567477464675903, + "learning_rate": 1.598547811310368e-06, + "loss": 0.4757, + "step": 3079 + }, + { + "epoch": 2.469927826784282, + "grad_norm": 0.8346998691558838, + "learning_rate": 1.5938544927644351e-06, + "loss": 0.4751, + "step": 3080 + }, + { + "epoch": 2.470729751403368, + "grad_norm": 0.8880207538604736, + "learning_rate": 1.5891674774326848e-06, + "loss": 0.4901, + "step": 3081 + }, + { + "epoch": 2.4715316760224537, + "grad_norm": 0.8977006077766418, + "learning_rate": 1.5844867688296017e-06, + "loss": 0.494, + "step": 3082 + }, + { + "epoch": 2.4723336006415395, + "grad_norm": 0.8824754357337952, + "learning_rate": 1.5798123704649416e-06, + "loss": 0.4929, + "step": 3083 + }, + { + "epoch": 2.4731355252606253, + "grad_norm": 0.9107778072357178, + "learning_rate": 1.5751442858437238e-06, + "loss": 0.4771, + "step": 3084 + }, + { + "epoch": 2.4739374498797115, + "grad_norm": 0.8373488783836365, + "learning_rate": 1.5704825184662397e-06, + "loss": 0.4824, + "step": 3085 + }, + { + "epoch": 2.474739374498797, + "grad_norm": 0.8496052026748657, + "learning_rate": 1.5658270718280433e-06, + "loss": 0.4644, + "step": 3086 + }, + { + "epoch": 2.475541299117883, + "grad_norm": 0.884526252746582, + "learning_rate": 1.5611779494199398e-06, + "loss": 0.5145, + "step": 3087 + }, + { + "epoch": 2.476343223736969, + "grad_norm": 0.8632438778877258, + "learning_rate": 1.5565351547280084e-06, + "loss": 0.4807, + "step": 3088 + }, + { + "epoch": 2.4771451483560547, + "grad_norm": 0.8612059354782104, + "learning_rate": 1.5518986912335686e-06, + "loss": 0.4579, + "step": 3089 + }, + { + "epoch": 2.4779470729751405, + "grad_norm": 0.8448330163955688, + "learning_rate": 1.5472685624132012e-06, + "loss": 0.4687, + "step": 3090 + }, + { + "epoch": 2.4787489975942263, + "grad_norm": 0.8578251600265503, + "learning_rate": 1.5426447717387349e-06, + "loss": 0.4987, + "step": 3091 + }, + { + "epoch": 2.479550922213312, + "grad_norm": 0.8487175703048706, + "learning_rate": 1.5380273226772403e-06, + "loss": 0.4817, + "step": 3092 + }, + { + "epoch": 2.480352846832398, + "grad_norm": 0.8690900802612305, + "learning_rate": 1.5334162186910474e-06, + "loss": 0.4834, + "step": 3093 + }, + { + "epoch": 2.4811547714514837, + "grad_norm": 0.8409072160720825, + "learning_rate": 1.5288114632377105e-06, + "loss": 0.4603, + "step": 3094 + }, + { + "epoch": 2.4819566960705695, + "grad_norm": 0.8334317207336426, + "learning_rate": 1.5242130597700355e-06, + "loss": 0.48, + "step": 3095 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 0.8757611513137817, + "learning_rate": 1.5196210117360643e-06, + "loss": 0.4855, + "step": 3096 + }, + { + "epoch": 2.483560545308741, + "grad_norm": 0.8928870558738708, + "learning_rate": 1.5150353225790626e-06, + "loss": 0.4787, + "step": 3097 + }, + { + "epoch": 2.484362469927827, + "grad_norm": 0.8667248487472534, + "learning_rate": 1.5104559957375475e-06, + "loss": 0.4617, + "step": 3098 + }, + { + "epoch": 2.4851643945469126, + "grad_norm": 0.8710841536521912, + "learning_rate": 1.505883034645248e-06, + "loss": 0.4625, + "step": 3099 + }, + { + "epoch": 2.4859663191659984, + "grad_norm": 0.8655977249145508, + "learning_rate": 1.5013164427311223e-06, + "loss": 0.4653, + "step": 3100 + }, + { + "epoch": 2.486768243785084, + "grad_norm": 0.8573647737503052, + "learning_rate": 1.4967562234193655e-06, + "loss": 0.4835, + "step": 3101 + }, + { + "epoch": 2.48757016840417, + "grad_norm": 0.8854028582572937, + "learning_rate": 1.4922023801293795e-06, + "loss": 0.4899, + "step": 3102 + }, + { + "epoch": 2.488372093023256, + "grad_norm": 0.843406617641449, + "learning_rate": 1.4876549162757915e-06, + "loss": 0.468, + "step": 3103 + }, + { + "epoch": 2.4891740176423416, + "grad_norm": 0.8388236165046692, + "learning_rate": 1.4831138352684482e-06, + "loss": 0.467, + "step": 3104 + }, + { + "epoch": 2.4899759422614274, + "grad_norm": 0.864824116230011, + "learning_rate": 1.4785791405123995e-06, + "loss": 0.4789, + "step": 3105 + }, + { + "epoch": 2.490777866880513, + "grad_norm": 0.8695855140686035, + "learning_rate": 1.474050835407923e-06, + "loss": 0.4778, + "step": 3106 + }, + { + "epoch": 2.491579791499599, + "grad_norm": 0.8766945004463196, + "learning_rate": 1.4695289233504894e-06, + "loss": 0.4668, + "step": 3107 + }, + { + "epoch": 2.4923817161186848, + "grad_norm": 0.8195257782936096, + "learning_rate": 1.4650134077307853e-06, + "loss": 0.4725, + "step": 3108 + }, + { + "epoch": 2.4931836407377705, + "grad_norm": 0.8423482775688171, + "learning_rate": 1.4605042919347e-06, + "loss": 0.462, + "step": 3109 + }, + { + "epoch": 2.4939855653568563, + "grad_norm": 0.8940702080726624, + "learning_rate": 1.4560015793433145e-06, + "loss": 0.5092, + "step": 3110 + }, + { + "epoch": 2.494787489975942, + "grad_norm": 0.8922168612480164, + "learning_rate": 1.451505273332926e-06, + "loss": 0.4787, + "step": 3111 + }, + { + "epoch": 2.495589414595028, + "grad_norm": 0.8811621069908142, + "learning_rate": 1.4470153772750118e-06, + "loss": 0.487, + "step": 3112 + }, + { + "epoch": 2.4963913392141137, + "grad_norm": 0.8688673973083496, + "learning_rate": 1.4425318945362488e-06, + "loss": 0.465, + "step": 3113 + }, + { + "epoch": 2.4971932638331995, + "grad_norm": 0.8423486351966858, + "learning_rate": 1.438054828478509e-06, + "loss": 0.4719, + "step": 3114 + }, + { + "epoch": 2.4979951884522853, + "grad_norm": 0.8947927355766296, + "learning_rate": 1.4335841824588436e-06, + "loss": 0.4796, + "step": 3115 + }, + { + "epoch": 2.4987971130713715, + "grad_norm": 0.8390734791755676, + "learning_rate": 1.429119959829499e-06, + "loss": 0.468, + "step": 3116 + }, + { + "epoch": 2.499599037690457, + "grad_norm": 0.8673422336578369, + "learning_rate": 1.4246621639378998e-06, + "loss": 0.4661, + "step": 3117 + }, + { + "epoch": 2.500400962309543, + "grad_norm": 0.833712637424469, + "learning_rate": 1.4202107981266532e-06, + "loss": 0.4809, + "step": 3118 + }, + { + "epoch": 2.5012028869286285, + "grad_norm": 0.8182956576347351, + "learning_rate": 1.4157658657335494e-06, + "loss": 0.4687, + "step": 3119 + }, + { + "epoch": 2.5020048115477147, + "grad_norm": 0.8784580826759338, + "learning_rate": 1.411327370091542e-06, + "loss": 0.48, + "step": 3120 + }, + { + "epoch": 2.5028067361668, + "grad_norm": 0.8753635883331299, + "learning_rate": 1.406895314528771e-06, + "loss": 0.5129, + "step": 3121 + }, + { + "epoch": 2.5036086607858863, + "grad_norm": 0.8735189437866211, + "learning_rate": 1.4024697023685429e-06, + "loss": 0.4794, + "step": 3122 + }, + { + "epoch": 2.504410585404972, + "grad_norm": 0.9127287864685059, + "learning_rate": 1.3980505369293306e-06, + "loss": 0.4402, + "step": 3123 + }, + { + "epoch": 2.505212510024058, + "grad_norm": 0.86842280626297, + "learning_rate": 1.3936378215247771e-06, + "loss": 0.4943, + "step": 3124 + }, + { + "epoch": 2.5060144346431437, + "grad_norm": 0.8117753863334656, + "learning_rate": 1.389231559463684e-06, + "loss": 0.4747, + "step": 3125 + }, + { + "epoch": 2.5068163592622295, + "grad_norm": 0.8560154438018799, + "learning_rate": 1.3848317540500178e-06, + "loss": 0.5145, + "step": 3126 + }, + { + "epoch": 2.5076182838813152, + "grad_norm": 0.885636568069458, + "learning_rate": 1.3804384085829026e-06, + "loss": 0.4802, + "step": 3127 + }, + { + "epoch": 2.508420208500401, + "grad_norm": 0.8547667264938354, + "learning_rate": 1.376051526356621e-06, + "loss": 0.4833, + "step": 3128 + }, + { + "epoch": 2.509222133119487, + "grad_norm": 0.8286969065666199, + "learning_rate": 1.3716711106606007e-06, + "loss": 0.4416, + "step": 3129 + }, + { + "epoch": 2.5100240577385726, + "grad_norm": 0.9037246704101562, + "learning_rate": 1.367297164779431e-06, + "loss": 0.4571, + "step": 3130 + }, + { + "epoch": 2.5108259823576584, + "grad_norm": 0.8304953575134277, + "learning_rate": 1.3629296919928447e-06, + "loss": 0.4926, + "step": 3131 + }, + { + "epoch": 2.511627906976744, + "grad_norm": 0.876008927822113, + "learning_rate": 1.3585686955757205e-06, + "loss": 0.4773, + "step": 3132 + }, + { + "epoch": 2.51242983159583, + "grad_norm": 0.9258241653442383, + "learning_rate": 1.3542141787980855e-06, + "loss": 0.4937, + "step": 3133 + }, + { + "epoch": 2.513231756214916, + "grad_norm": 0.846127986907959, + "learning_rate": 1.3498661449251006e-06, + "loss": 0.469, + "step": 3134 + }, + { + "epoch": 2.5140336808340016, + "grad_norm": 0.8556084036827087, + "learning_rate": 1.3455245972170694e-06, + "loss": 0.4554, + "step": 3135 + }, + { + "epoch": 2.5148356054530874, + "grad_norm": 0.8977357745170593, + "learning_rate": 1.341189538929436e-06, + "loss": 0.4738, + "step": 3136 + }, + { + "epoch": 2.515637530072173, + "grad_norm": 0.8365195989608765, + "learning_rate": 1.3368609733127714e-06, + "loss": 0.4724, + "step": 3137 + }, + { + "epoch": 2.516439454691259, + "grad_norm": 0.8728964328765869, + "learning_rate": 1.3325389036127855e-06, + "loss": 0.4714, + "step": 3138 + }, + { + "epoch": 2.5172413793103448, + "grad_norm": 0.8206337094306946, + "learning_rate": 1.3282233330703087e-06, + "loss": 0.4576, + "step": 3139 + }, + { + "epoch": 2.5180433039294305, + "grad_norm": 0.8753436207771301, + "learning_rate": 1.3239142649213044e-06, + "loss": 0.4825, + "step": 3140 + }, + { + "epoch": 2.5188452285485163, + "grad_norm": 0.8451640009880066, + "learning_rate": 1.3196117023968613e-06, + "loss": 0.4762, + "step": 3141 + }, + { + "epoch": 2.519647153167602, + "grad_norm": 0.8681123852729797, + "learning_rate": 1.315315648723181e-06, + "loss": 0.4776, + "step": 3142 + }, + { + "epoch": 2.520449077786688, + "grad_norm": 0.9339450001716614, + "learning_rate": 1.311026107121599e-06, + "loss": 0.4872, + "step": 3143 + }, + { + "epoch": 2.5212510024057737, + "grad_norm": 0.8395841121673584, + "learning_rate": 1.3067430808085534e-06, + "loss": 0.4718, + "step": 3144 + }, + { + "epoch": 2.5220529270248595, + "grad_norm": 0.8910388946533203, + "learning_rate": 1.3024665729956054e-06, + "loss": 0.4844, + "step": 3145 + }, + { + "epoch": 2.5228548516439453, + "grad_norm": 0.8879891037940979, + "learning_rate": 1.2981965868894287e-06, + "loss": 0.4937, + "step": 3146 + }, + { + "epoch": 2.5236567762630315, + "grad_norm": 0.8522821068763733, + "learning_rate": 1.2939331256917974e-06, + "loss": 0.4719, + "step": 3147 + }, + { + "epoch": 2.524458700882117, + "grad_norm": 0.8639122843742371, + "learning_rate": 1.2896761925996082e-06, + "loss": 0.4651, + "step": 3148 + }, + { + "epoch": 2.525260625501203, + "grad_norm": 0.832323431968689, + "learning_rate": 1.2854257908048483e-06, + "loss": 0.4804, + "step": 3149 + }, + { + "epoch": 2.5260625501202885, + "grad_norm": 0.8917650580406189, + "learning_rate": 1.2811819234946165e-06, + "loss": 0.4942, + "step": 3150 + }, + { + "epoch": 2.5268644747393747, + "grad_norm": 0.8569352030754089, + "learning_rate": 1.2769445938511104e-06, + "loss": 0.4817, + "step": 3151 + }, + { + "epoch": 2.52766639935846, + "grad_norm": 0.8589757084846497, + "learning_rate": 1.2727138050516175e-06, + "loss": 0.4906, + "step": 3152 + }, + { + "epoch": 2.5284683239775463, + "grad_norm": 0.868578314781189, + "learning_rate": 1.2684895602685377e-06, + "loss": 0.5072, + "step": 3153 + }, + { + "epoch": 2.529270248596632, + "grad_norm": 0.8775882124900818, + "learning_rate": 1.264271862669344e-06, + "loss": 0.4789, + "step": 3154 + }, + { + "epoch": 2.530072173215718, + "grad_norm": 0.9492253661155701, + "learning_rate": 1.2600607154166146e-06, + "loss": 0.4941, + "step": 3155 + }, + { + "epoch": 2.5308740978348037, + "grad_norm": 0.8520070910453796, + "learning_rate": 1.255856121668012e-06, + "loss": 0.4633, + "step": 3156 + }, + { + "epoch": 2.5316760224538895, + "grad_norm": 0.8257150053977966, + "learning_rate": 1.2516580845762804e-06, + "loss": 0.4652, + "step": 3157 + }, + { + "epoch": 2.5324779470729752, + "grad_norm": 0.885618269443512, + "learning_rate": 1.2474666072892527e-06, + "loss": 0.4865, + "step": 3158 + }, + { + "epoch": 2.533279871692061, + "grad_norm": 0.8766368627548218, + "learning_rate": 1.2432816929498425e-06, + "loss": 0.4958, + "step": 3159 + }, + { + "epoch": 2.534081796311147, + "grad_norm": 0.8656640648841858, + "learning_rate": 1.2391033446960355e-06, + "loss": 0.4913, + "step": 3160 + }, + { + "epoch": 2.5348837209302326, + "grad_norm": 0.8739482760429382, + "learning_rate": 1.2349315656609085e-06, + "loss": 0.4721, + "step": 3161 + }, + { + "epoch": 2.5356856455493184, + "grad_norm": 0.9599235653877258, + "learning_rate": 1.230766358972596e-06, + "loss": 0.4921, + "step": 3162 + }, + { + "epoch": 2.536487570168404, + "grad_norm": 0.8633760213851929, + "learning_rate": 1.2266077277543155e-06, + "loss": 0.4668, + "step": 3163 + }, + { + "epoch": 2.53728949478749, + "grad_norm": 0.8226913809776306, + "learning_rate": 1.22245567512435e-06, + "loss": 0.4562, + "step": 3164 + }, + { + "epoch": 2.538091419406576, + "grad_norm": 0.8109039664268494, + "learning_rate": 1.218310204196046e-06, + "loss": 0.4515, + "step": 3165 + }, + { + "epoch": 2.5388933440256616, + "grad_norm": 0.8582691550254822, + "learning_rate": 1.214171318077827e-06, + "loss": 0.4797, + "step": 3166 + }, + { + "epoch": 2.5396952686447474, + "grad_norm": 0.8951324224472046, + "learning_rate": 1.2100390198731627e-06, + "loss": 0.4859, + "step": 3167 + }, + { + "epoch": 2.540497193263833, + "grad_norm": 0.8818310499191284, + "learning_rate": 1.2059133126805956e-06, + "loss": 0.4909, + "step": 3168 + }, + { + "epoch": 2.541299117882919, + "grad_norm": 0.8491972088813782, + "learning_rate": 1.201794199593721e-06, + "loss": 0.4824, + "step": 3169 + }, + { + "epoch": 2.5421010425020047, + "grad_norm": 0.9278321266174316, + "learning_rate": 1.197681683701185e-06, + "loss": 0.4734, + "step": 3170 + }, + { + "epoch": 2.5429029671210905, + "grad_norm": 0.8294722437858582, + "learning_rate": 1.193575768086701e-06, + "loss": 0.4577, + "step": 3171 + }, + { + "epoch": 2.5437048917401763, + "grad_norm": 0.7998701333999634, + "learning_rate": 1.1894764558290172e-06, + "loss": 0.4741, + "step": 3172 + }, + { + "epoch": 2.544506816359262, + "grad_norm": 0.8978516459465027, + "learning_rate": 1.1853837500019406e-06, + "loss": 0.4739, + "step": 3173 + }, + { + "epoch": 2.545308740978348, + "grad_norm": 0.8539999723434448, + "learning_rate": 1.1812976536743226e-06, + "loss": 0.4827, + "step": 3174 + }, + { + "epoch": 2.5461106655974337, + "grad_norm": 0.8564225435256958, + "learning_rate": 1.1772181699100538e-06, + "loss": 0.4658, + "step": 3175 + }, + { + "epoch": 2.5469125902165195, + "grad_norm": 0.8567453026771545, + "learning_rate": 1.1731453017680716e-06, + "loss": 0.4624, + "step": 3176 + }, + { + "epoch": 2.5477145148356053, + "grad_norm": 0.8754240870475769, + "learning_rate": 1.169079052302352e-06, + "loss": 0.4923, + "step": 3177 + }, + { + "epoch": 2.5485164394546915, + "grad_norm": 0.8477223515510559, + "learning_rate": 1.1650194245619062e-06, + "loss": 0.5005, + "step": 3178 + }, + { + "epoch": 2.549318364073777, + "grad_norm": 0.8424686789512634, + "learning_rate": 1.1609664215907846e-06, + "loss": 0.4712, + "step": 3179 + }, + { + "epoch": 2.550120288692863, + "grad_norm": 0.8633278012275696, + "learning_rate": 1.1569200464280616e-06, + "loss": 0.502, + "step": 3180 + }, + { + "epoch": 2.5509222133119485, + "grad_norm": 0.8947163224220276, + "learning_rate": 1.1528803021078505e-06, + "loss": 0.4818, + "step": 3181 + }, + { + "epoch": 2.5517241379310347, + "grad_norm": 0.8522531986236572, + "learning_rate": 1.148847191659288e-06, + "loss": 0.4868, + "step": 3182 + }, + { + "epoch": 2.55252606255012, + "grad_norm": 0.8658236265182495, + "learning_rate": 1.1448207181065385e-06, + "loss": 0.4717, + "step": 3183 + }, + { + "epoch": 2.5533279871692063, + "grad_norm": 0.919199526309967, + "learning_rate": 1.1408008844687901e-06, + "loss": 0.5037, + "step": 3184 + }, + { + "epoch": 2.5541299117882916, + "grad_norm": 0.8818656802177429, + "learning_rate": 1.1367876937602474e-06, + "loss": 0.4904, + "step": 3185 + }, + { + "epoch": 2.554931836407378, + "grad_norm": 0.8673224449157715, + "learning_rate": 1.1327811489901398e-06, + "loss": 0.4395, + "step": 3186 + }, + { + "epoch": 2.5557337610264637, + "grad_norm": 0.8441712260246277, + "learning_rate": 1.1287812531627108e-06, + "loss": 0.4864, + "step": 3187 + }, + { + "epoch": 2.5565356856455494, + "grad_norm": 0.8633151650428772, + "learning_rate": 1.1247880092772202e-06, + "loss": 0.4841, + "step": 3188 + }, + { + "epoch": 2.5573376102646352, + "grad_norm": 0.8488594889640808, + "learning_rate": 1.120801420327935e-06, + "loss": 0.4965, + "step": 3189 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 0.8840992450714111, + "learning_rate": 1.1168214893041363e-06, + "loss": 0.4798, + "step": 3190 + }, + { + "epoch": 2.558941459502807, + "grad_norm": 0.909084677696228, + "learning_rate": 1.1128482191901124e-06, + "loss": 0.4761, + "step": 3191 + }, + { + "epoch": 2.5597433841218926, + "grad_norm": 0.8316271901130676, + "learning_rate": 1.1088816129651569e-06, + "loss": 0.4709, + "step": 3192 + }, + { + "epoch": 2.5605453087409784, + "grad_norm": 0.8329962491989136, + "learning_rate": 1.1049216736035673e-06, + "loss": 0.4643, + "step": 3193 + }, + { + "epoch": 2.561347233360064, + "grad_norm": 0.8709492683410645, + "learning_rate": 1.1009684040746394e-06, + "loss": 0.4716, + "step": 3194 + }, + { + "epoch": 2.56214915797915, + "grad_norm": 0.8427078723907471, + "learning_rate": 1.0970218073426674e-06, + "loss": 0.4831, + "step": 3195 + }, + { + "epoch": 2.562951082598236, + "grad_norm": 0.8805751204490662, + "learning_rate": 1.093081886366948e-06, + "loss": 0.4631, + "step": 3196 + }, + { + "epoch": 2.5637530072173216, + "grad_norm": 0.8472068905830383, + "learning_rate": 1.0891486441017652e-06, + "loss": 0.4822, + "step": 3197 + }, + { + "epoch": 2.5645549318364074, + "grad_norm": 0.8687289357185364, + "learning_rate": 1.085222083496401e-06, + "loss": 0.4685, + "step": 3198 + }, + { + "epoch": 2.565356856455493, + "grad_norm": 0.857964813709259, + "learning_rate": 1.0813022074951208e-06, + "loss": 0.4577, + "step": 3199 + }, + { + "epoch": 2.566158781074579, + "grad_norm": 0.8346413373947144, + "learning_rate": 1.0773890190371828e-06, + "loss": 0.4715, + "step": 3200 + }, + { + "epoch": 2.5669607056936647, + "grad_norm": 0.8784050345420837, + "learning_rate": 1.07348252105683e-06, + "loss": 0.5006, + "step": 3201 + }, + { + "epoch": 2.5677626303127505, + "grad_norm": 0.8425981402397156, + "learning_rate": 1.0695827164832828e-06, + "loss": 0.486, + "step": 3202 + }, + { + "epoch": 2.5685645549318363, + "grad_norm": 0.886447012424469, + "learning_rate": 1.0656896082407554e-06, + "loss": 0.4815, + "step": 3203 + }, + { + "epoch": 2.569366479550922, + "grad_norm": 0.8257114291191101, + "learning_rate": 1.0618031992484267e-06, + "loss": 0.4633, + "step": 3204 + }, + { + "epoch": 2.570168404170008, + "grad_norm": 0.8750101923942566, + "learning_rate": 1.0579234924204608e-06, + "loss": 0.4793, + "step": 3205 + }, + { + "epoch": 2.5709703287890937, + "grad_norm": 0.9001625180244446, + "learning_rate": 1.0540504906659955e-06, + "loss": 0.4668, + "step": 3206 + }, + { + "epoch": 2.5717722534081795, + "grad_norm": 0.8314618468284607, + "learning_rate": 1.0501841968891324e-06, + "loss": 0.4718, + "step": 3207 + }, + { + "epoch": 2.5725741780272653, + "grad_norm": 0.8423780202865601, + "learning_rate": 1.0463246139889604e-06, + "loss": 0.4886, + "step": 3208 + }, + { + "epoch": 2.573376102646351, + "grad_norm": 0.8424227237701416, + "learning_rate": 1.04247174485952e-06, + "loss": 0.4784, + "step": 3209 + }, + { + "epoch": 2.574178027265437, + "grad_norm": 0.8639745116233826, + "learning_rate": 1.0386255923898236e-06, + "loss": 0.4853, + "step": 3210 + }, + { + "epoch": 2.574979951884523, + "grad_norm": 0.906941294670105, + "learning_rate": 1.0347861594638519e-06, + "loss": 0.4797, + "step": 3211 + }, + { + "epoch": 2.5757818765036085, + "grad_norm": 0.8813538551330566, + "learning_rate": 1.0309534489605344e-06, + "loss": 0.4974, + "step": 3212 + }, + { + "epoch": 2.5765838011226947, + "grad_norm": 0.8710605502128601, + "learning_rate": 1.0271274637537764e-06, + "loss": 0.476, + "step": 3213 + }, + { + "epoch": 2.57738572574178, + "grad_norm": 0.9004622101783752, + "learning_rate": 1.0233082067124266e-06, + "loss": 0.4821, + "step": 3214 + }, + { + "epoch": 2.5781876503608663, + "grad_norm": 0.9231355786323547, + "learning_rate": 1.0194956807002965e-06, + "loss": 0.4919, + "step": 3215 + }, + { + "epoch": 2.5789895749799516, + "grad_norm": 0.8449090719223022, + "learning_rate": 1.015689888576149e-06, + "loss": 0.4442, + "step": 3216 + }, + { + "epoch": 2.579791499599038, + "grad_norm": 0.9300955533981323, + "learning_rate": 1.0118908331936915e-06, + "loss": 0.4931, + "step": 3217 + }, + { + "epoch": 2.5805934242181237, + "grad_norm": 0.8253883719444275, + "learning_rate": 1.0080985174015901e-06, + "loss": 0.4689, + "step": 3218 + }, + { + "epoch": 2.5813953488372094, + "grad_norm": 0.9097961783409119, + "learning_rate": 1.0043129440434496e-06, + "loss": 0.4745, + "step": 3219 + }, + { + "epoch": 2.5821972734562952, + "grad_norm": 0.8564082384109497, + "learning_rate": 1.000534115957823e-06, + "loss": 0.4759, + "step": 3220 + }, + { + "epoch": 2.582999198075381, + "grad_norm": 0.8668890595436096, + "learning_rate": 9.96762035978206e-07, + "loss": 0.4745, + "step": 3221 + }, + { + "epoch": 2.583801122694467, + "grad_norm": 0.8841666579246521, + "learning_rate": 9.929967069330282e-07, + "loss": 0.4765, + "step": 3222 + }, + { + "epoch": 2.5846030473135526, + "grad_norm": 0.8739838600158691, + "learning_rate": 9.892381316456656e-07, + "loss": 0.4634, + "step": 3223 + }, + { + "epoch": 2.5854049719326384, + "grad_norm": 0.8660984039306641, + "learning_rate": 9.854863129344229e-07, + "loss": 0.4719, + "step": 3224 + }, + { + "epoch": 2.586206896551724, + "grad_norm": 0.8962447643280029, + "learning_rate": 9.817412536125449e-07, + "loss": 0.479, + "step": 3225 + }, + { + "epoch": 2.58700882117081, + "grad_norm": 0.8654753565788269, + "learning_rate": 9.780029564882032e-07, + "loss": 0.4506, + "step": 3226 + }, + { + "epoch": 2.587810745789896, + "grad_norm": 0.8893976211547852, + "learning_rate": 9.74271424364498e-07, + "loss": 0.459, + "step": 3227 + }, + { + "epoch": 2.5886126704089816, + "grad_norm": 0.9372872710227966, + "learning_rate": 9.70546660039462e-07, + "loss": 0.4895, + "step": 3228 + }, + { + "epoch": 2.5894145950280674, + "grad_norm": 0.8435534238815308, + "learning_rate": 9.66828666306049e-07, + "loss": 0.4948, + "step": 3229 + }, + { + "epoch": 2.590216519647153, + "grad_norm": 0.8981894850730896, + "learning_rate": 9.631174459521398e-07, + "loss": 0.4559, + "step": 3230 + }, + { + "epoch": 2.591018444266239, + "grad_norm": 0.8509783744812012, + "learning_rate": 9.594130017605296e-07, + "loss": 0.4704, + "step": 3231 + }, + { + "epoch": 2.5918203688853247, + "grad_norm": 0.8438981175422668, + "learning_rate": 9.5571533650894e-07, + "loss": 0.4753, + "step": 3232 + }, + { + "epoch": 2.5926222935044105, + "grad_norm": 0.8406031131744385, + "learning_rate": 9.520244529700041e-07, + "loss": 0.4596, + "step": 3233 + }, + { + "epoch": 2.5934242181234963, + "grad_norm": 0.8662395477294922, + "learning_rate": 9.483403539112735e-07, + "loss": 0.4481, + "step": 3234 + }, + { + "epoch": 2.594226142742582, + "grad_norm": 0.843547523021698, + "learning_rate": 9.44663042095213e-07, + "loss": 0.4611, + "step": 3235 + }, + { + "epoch": 2.595028067361668, + "grad_norm": 0.8858677744865417, + "learning_rate": 9.409925202791925e-07, + "loss": 0.4842, + "step": 3236 + }, + { + "epoch": 2.5958299919807537, + "grad_norm": 0.8714892864227295, + "learning_rate": 9.37328791215496e-07, + "loss": 0.462, + "step": 3237 + }, + { + "epoch": 2.5966319165998395, + "grad_norm": 0.8578234910964966, + "learning_rate": 9.336718576513127e-07, + "loss": 0.4721, + "step": 3238 + }, + { + "epoch": 2.5974338412189253, + "grad_norm": 0.8844730257987976, + "learning_rate": 9.300217223287345e-07, + "loss": 0.4791, + "step": 3239 + }, + { + "epoch": 2.598235765838011, + "grad_norm": 0.8658874034881592, + "learning_rate": 9.263783879847599e-07, + "loss": 0.4751, + "step": 3240 + }, + { + "epoch": 2.599037690457097, + "grad_norm": 0.8326420187950134, + "learning_rate": 9.227418573512825e-07, + "loss": 0.4736, + "step": 3241 + }, + { + "epoch": 2.599839615076183, + "grad_norm": 0.8331664800643921, + "learning_rate": 9.191121331550967e-07, + "loss": 0.4664, + "step": 3242 + }, + { + "epoch": 2.6006415396952685, + "grad_norm": 0.915407657623291, + "learning_rate": 9.154892181178954e-07, + "loss": 0.4962, + "step": 3243 + }, + { + "epoch": 2.6014434643143547, + "grad_norm": 0.8679154515266418, + "learning_rate": 9.11873114956261e-07, + "loss": 0.4601, + "step": 3244 + }, + { + "epoch": 2.60224538893344, + "grad_norm": 0.8816805481910706, + "learning_rate": 9.082638263816756e-07, + "loss": 0.4632, + "step": 3245 + }, + { + "epoch": 2.6030473135525263, + "grad_norm": 0.8480551838874817, + "learning_rate": 9.046613551005012e-07, + "loss": 0.4638, + "step": 3246 + }, + { + "epoch": 2.6038492381716116, + "grad_norm": 0.8449149131774902, + "learning_rate": 9.010657038139947e-07, + "loss": 0.4933, + "step": 3247 + }, + { + "epoch": 2.604651162790698, + "grad_norm": 0.8377960920333862, + "learning_rate": 8.974768752183016e-07, + "loss": 0.4751, + "step": 3248 + }, + { + "epoch": 2.605453087409783, + "grad_norm": 0.8451635837554932, + "learning_rate": 8.938948720044416e-07, + "loss": 0.4731, + "step": 3249 + }, + { + "epoch": 2.6062550120288694, + "grad_norm": 0.8990350365638733, + "learning_rate": 8.903196968583295e-07, + "loss": 0.4893, + "step": 3250 + }, + { + "epoch": 2.6070569366479552, + "grad_norm": 0.8640419244766235, + "learning_rate": 8.867513524607485e-07, + "loss": 0.4816, + "step": 3251 + }, + { + "epoch": 2.607858861267041, + "grad_norm": 0.8659276366233826, + "learning_rate": 8.831898414873663e-07, + "loss": 0.4691, + "step": 3252 + }, + { + "epoch": 2.608660785886127, + "grad_norm": 0.8834055066108704, + "learning_rate": 8.796351666087266e-07, + "loss": 0.4584, + "step": 3253 + }, + { + "epoch": 2.6094627105052126, + "grad_norm": 0.8305429220199585, + "learning_rate": 8.760873304902406e-07, + "loss": 0.448, + "step": 3254 + }, + { + "epoch": 2.6102646351242984, + "grad_norm": 0.8624115586280823, + "learning_rate": 8.725463357922037e-07, + "loss": 0.4716, + "step": 3255 + }, + { + "epoch": 2.611066559743384, + "grad_norm": 0.838100016117096, + "learning_rate": 8.690121851697697e-07, + "loss": 0.4564, + "step": 3256 + }, + { + "epoch": 2.61186848436247, + "grad_norm": 0.8911872506141663, + "learning_rate": 8.654848812729655e-07, + "loss": 0.4695, + "step": 3257 + }, + { + "epoch": 2.612670408981556, + "grad_norm": 0.9087029099464417, + "learning_rate": 8.619644267466876e-07, + "loss": 0.487, + "step": 3258 + }, + { + "epoch": 2.6134723336006416, + "grad_norm": 0.8725029826164246, + "learning_rate": 8.584508242306844e-07, + "loss": 0.4856, + "step": 3259 + }, + { + "epoch": 2.6142742582197274, + "grad_norm": 0.9196523427963257, + "learning_rate": 8.549440763595851e-07, + "loss": 0.4754, + "step": 3260 + }, + { + "epoch": 2.615076182838813, + "grad_norm": 0.7980285286903381, + "learning_rate": 8.514441857628619e-07, + "loss": 0.4519, + "step": 3261 + }, + { + "epoch": 2.615878107457899, + "grad_norm": 0.8673176169395447, + "learning_rate": 8.479511550648512e-07, + "loss": 0.4633, + "step": 3262 + }, + { + "epoch": 2.6166800320769847, + "grad_norm": 0.8644046783447266, + "learning_rate": 8.44464986884751e-07, + "loss": 0.4848, + "step": 3263 + }, + { + "epoch": 2.6174819566960705, + "grad_norm": 0.8942599296569824, + "learning_rate": 8.40985683836606e-07, + "loss": 0.4959, + "step": 3264 + }, + { + "epoch": 2.6182838813151563, + "grad_norm": 0.8743693232536316, + "learning_rate": 8.375132485293158e-07, + "loss": 0.4744, + "step": 3265 + }, + { + "epoch": 2.619085805934242, + "grad_norm": 0.852287232875824, + "learning_rate": 8.340476835666345e-07, + "loss": 0.4812, + "step": 3266 + }, + { + "epoch": 2.619887730553328, + "grad_norm": 0.8935552835464478, + "learning_rate": 8.305889915471532e-07, + "loss": 0.4776, + "step": 3267 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 0.8734551072120667, + "learning_rate": 8.271371750643265e-07, + "loss": 0.4794, + "step": 3268 + }, + { + "epoch": 2.6214915797914995, + "grad_norm": 0.8658239245414734, + "learning_rate": 8.236922367064359e-07, + "loss": 0.4691, + "step": 3269 + }, + { + "epoch": 2.6222935044105853, + "grad_norm": 0.8544544577598572, + "learning_rate": 8.202541790566176e-07, + "loss": 0.4987, + "step": 3270 + }, + { + "epoch": 2.623095429029671, + "grad_norm": 0.8634313344955444, + "learning_rate": 8.16823004692845e-07, + "loss": 0.4704, + "step": 3271 + }, + { + "epoch": 2.623897353648757, + "grad_norm": 0.9538077116012573, + "learning_rate": 8.133987161879231e-07, + "loss": 0.4827, + "step": 3272 + }, + { + "epoch": 2.6246992782678427, + "grad_norm": 0.8885698914527893, + "learning_rate": 8.099813161095094e-07, + "loss": 0.4685, + "step": 3273 + }, + { + "epoch": 2.6255012028869285, + "grad_norm": 0.8560696840286255, + "learning_rate": 8.065708070200806e-07, + "loss": 0.4699, + "step": 3274 + }, + { + "epoch": 2.6263031275060147, + "grad_norm": 0.8704147934913635, + "learning_rate": 8.031671914769545e-07, + "loss": 0.4734, + "step": 3275 + }, + { + "epoch": 2.6271050521251, + "grad_norm": 0.8830083608627319, + "learning_rate": 7.997704720322785e-07, + "loss": 0.4737, + "step": 3276 + }, + { + "epoch": 2.6279069767441863, + "grad_norm": 0.8642050623893738, + "learning_rate": 7.963806512330275e-07, + "loss": 0.4699, + "step": 3277 + }, + { + "epoch": 2.6287089013632716, + "grad_norm": 0.8747822642326355, + "learning_rate": 7.929977316210036e-07, + "loss": 0.4592, + "step": 3278 + }, + { + "epoch": 2.629510825982358, + "grad_norm": 0.8815390467643738, + "learning_rate": 7.896217157328357e-07, + "loss": 0.4706, + "step": 3279 + }, + { + "epoch": 2.630312750601443, + "grad_norm": 0.8473573327064514, + "learning_rate": 7.862526060999775e-07, + "loss": 0.4753, + "step": 3280 + }, + { + "epoch": 2.6311146752205294, + "grad_norm": 0.8307991623878479, + "learning_rate": 7.828904052487019e-07, + "loss": 0.467, + "step": 3281 + }, + { + "epoch": 2.6319165998396152, + "grad_norm": 0.8285624384880066, + "learning_rate": 7.795351157000986e-07, + "loss": 0.4571, + "step": 3282 + }, + { + "epoch": 2.632718524458701, + "grad_norm": 0.8307252526283264, + "learning_rate": 7.761867399700796e-07, + "loss": 0.485, + "step": 3283 + }, + { + "epoch": 2.633520449077787, + "grad_norm": 0.8082962036132812, + "learning_rate": 7.72845280569372e-07, + "loss": 0.4594, + "step": 3284 + }, + { + "epoch": 2.6343223736968726, + "grad_norm": 0.8508808016777039, + "learning_rate": 7.69510740003514e-07, + "loss": 0.4566, + "step": 3285 + }, + { + "epoch": 2.6351242983159584, + "grad_norm": 0.8629611134529114, + "learning_rate": 7.66183120772862e-07, + "loss": 0.4669, + "step": 3286 + }, + { + "epoch": 2.635926222935044, + "grad_norm": 0.8811891078948975, + "learning_rate": 7.628624253725725e-07, + "loss": 0.4717, + "step": 3287 + }, + { + "epoch": 2.63672814755413, + "grad_norm": 0.8686394691467285, + "learning_rate": 7.59548656292618e-07, + "loss": 0.4819, + "step": 3288 + }, + { + "epoch": 2.637530072173216, + "grad_norm": 0.9036654829978943, + "learning_rate": 7.562418160177765e-07, + "loss": 0.4884, + "step": 3289 + }, + { + "epoch": 2.6383319967923016, + "grad_norm": 0.8560638427734375, + "learning_rate": 7.529419070276312e-07, + "loss": 0.4787, + "step": 3290 + }, + { + "epoch": 2.6391339214113874, + "grad_norm": 0.8455655574798584, + "learning_rate": 7.496489317965616e-07, + "loss": 0.4466, + "step": 3291 + }, + { + "epoch": 2.639935846030473, + "grad_norm": 0.9102020859718323, + "learning_rate": 7.463628927937549e-07, + "loss": 0.5097, + "step": 3292 + }, + { + "epoch": 2.640737770649559, + "grad_norm": 0.8347408175468445, + "learning_rate": 7.430837924831958e-07, + "loss": 0.4628, + "step": 3293 + }, + { + "epoch": 2.6415396952686447, + "grad_norm": 0.8465882539749146, + "learning_rate": 7.398116333236638e-07, + "loss": 0.4683, + "step": 3294 + }, + { + "epoch": 2.6423416198877305, + "grad_norm": 0.8932040929794312, + "learning_rate": 7.365464177687387e-07, + "loss": 0.4939, + "step": 3295 + }, + { + "epoch": 2.6431435445068163, + "grad_norm": 0.8394157290458679, + "learning_rate": 7.332881482667853e-07, + "loss": 0.4709, + "step": 3296 + }, + { + "epoch": 2.643945469125902, + "grad_norm": 0.8661454319953918, + "learning_rate": 7.300368272609692e-07, + "loss": 0.4766, + "step": 3297 + }, + { + "epoch": 2.644747393744988, + "grad_norm": 0.85788893699646, + "learning_rate": 7.267924571892382e-07, + "loss": 0.4845, + "step": 3298 + }, + { + "epoch": 2.6455493183640737, + "grad_norm": 0.8850892186164856, + "learning_rate": 7.23555040484335e-07, + "loss": 0.4652, + "step": 3299 + }, + { + "epoch": 2.6463512429831595, + "grad_norm": 0.9400017857551575, + "learning_rate": 7.203245795737834e-07, + "loss": 0.4986, + "step": 3300 + }, + { + "epoch": 2.6471531676022453, + "grad_norm": 0.8622959852218628, + "learning_rate": 7.171010768798925e-07, + "loss": 0.4876, + "step": 3301 + }, + { + "epoch": 2.647955092221331, + "grad_norm": 0.8049087524414062, + "learning_rate": 7.138845348197532e-07, + "loss": 0.4665, + "step": 3302 + }, + { + "epoch": 2.648757016840417, + "grad_norm": 0.8678800463676453, + "learning_rate": 7.106749558052428e-07, + "loss": 0.4727, + "step": 3303 + }, + { + "epoch": 2.6495589414595027, + "grad_norm": 0.8367797136306763, + "learning_rate": 7.074723422430052e-07, + "loss": 0.4743, + "step": 3304 + }, + { + "epoch": 2.6503608660785885, + "grad_norm": 0.8551909327507019, + "learning_rate": 7.042766965344782e-07, + "loss": 0.4498, + "step": 3305 + }, + { + "epoch": 2.6511627906976747, + "grad_norm": 0.8910350203514099, + "learning_rate": 7.010880210758597e-07, + "loss": 0.4905, + "step": 3306 + }, + { + "epoch": 2.65196471531676, + "grad_norm": 1.0603433847427368, + "learning_rate": 6.979063182581291e-07, + "loss": 0.4695, + "step": 3307 + }, + { + "epoch": 2.6527666399358463, + "grad_norm": 0.8967297673225403, + "learning_rate": 6.94731590467036e-07, + "loss": 0.4916, + "step": 3308 + }, + { + "epoch": 2.6535685645549316, + "grad_norm": 0.9202722311019897, + "learning_rate": 6.915638400830959e-07, + "loss": 0.4965, + "step": 3309 + }, + { + "epoch": 2.654370489174018, + "grad_norm": 0.8630130290985107, + "learning_rate": 6.884030694816024e-07, + "loss": 0.4634, + "step": 3310 + }, + { + "epoch": 2.655172413793103, + "grad_norm": 0.8898342847824097, + "learning_rate": 6.852492810326028e-07, + "loss": 0.463, + "step": 3311 + }, + { + "epoch": 2.6559743384121894, + "grad_norm": 0.8422430753707886, + "learning_rate": 6.821024771009188e-07, + "loss": 0.4686, + "step": 3312 + }, + { + "epoch": 2.656776263031275, + "grad_norm": 0.8274664282798767, + "learning_rate": 6.789626600461307e-07, + "loss": 0.4703, + "step": 3313 + }, + { + "epoch": 2.657578187650361, + "grad_norm": 0.8955613970756531, + "learning_rate": 6.758298322225765e-07, + "loss": 0.4771, + "step": 3314 + }, + { + "epoch": 2.658380112269447, + "grad_norm": 0.8348634243011475, + "learning_rate": 6.727039959793635e-07, + "loss": 0.4513, + "step": 3315 + }, + { + "epoch": 2.6591820368885326, + "grad_norm": 0.8191150426864624, + "learning_rate": 6.69585153660347e-07, + "loss": 0.454, + "step": 3316 + }, + { + "epoch": 2.6599839615076184, + "grad_norm": 0.8912159204483032, + "learning_rate": 6.664733076041374e-07, + "loss": 0.4835, + "step": 3317 + }, + { + "epoch": 2.660785886126704, + "grad_norm": 0.8224286437034607, + "learning_rate": 6.633684601441092e-07, + "loss": 0.4868, + "step": 3318 + }, + { + "epoch": 2.66158781074579, + "grad_norm": 0.8863853812217712, + "learning_rate": 6.602706136083792e-07, + "loss": 0.4837, + "step": 3319 + }, + { + "epoch": 2.6623897353648758, + "grad_norm": 0.9140603542327881, + "learning_rate": 6.57179770319819e-07, + "loss": 0.4754, + "step": 3320 + }, + { + "epoch": 2.6631916599839616, + "grad_norm": 0.8462338447570801, + "learning_rate": 6.540959325960494e-07, + "loss": 0.4739, + "step": 3321 + }, + { + "epoch": 2.6639935846030474, + "grad_norm": 0.90171217918396, + "learning_rate": 6.510191027494339e-07, + "loss": 0.4652, + "step": 3322 + }, + { + "epoch": 2.664795509222133, + "grad_norm": 0.878610372543335, + "learning_rate": 6.479492830870881e-07, + "loss": 0.4673, + "step": 3323 + }, + { + "epoch": 2.665597433841219, + "grad_norm": 0.8632857799530029, + "learning_rate": 6.448864759108642e-07, + "loss": 0.4864, + "step": 3324 + }, + { + "epoch": 2.6663993584603047, + "grad_norm": 0.8719748258590698, + "learning_rate": 6.418306835173605e-07, + "loss": 0.5048, + "step": 3325 + }, + { + "epoch": 2.6672012830793905, + "grad_norm": 0.8836221098899841, + "learning_rate": 6.387819081979163e-07, + "loss": 0.4702, + "step": 3326 + }, + { + "epoch": 2.6680032076984763, + "grad_norm": 0.8506346940994263, + "learning_rate": 6.35740152238602e-07, + "loss": 0.4492, + "step": 3327 + }, + { + "epoch": 2.668805132317562, + "grad_norm": 0.9193611145019531, + "learning_rate": 6.327054179202352e-07, + "loss": 0.4562, + "step": 3328 + }, + { + "epoch": 2.669607056936648, + "grad_norm": 0.8532218933105469, + "learning_rate": 6.296777075183602e-07, + "loss": 0.4693, + "step": 3329 + }, + { + "epoch": 2.6704089815557337, + "grad_norm": 0.8159738779067993, + "learning_rate": 6.266570233032576e-07, + "loss": 0.4745, + "step": 3330 + }, + { + "epoch": 2.6712109061748195, + "grad_norm": 0.8848310112953186, + "learning_rate": 6.236433675399412e-07, + "loss": 0.4801, + "step": 3331 + }, + { + "epoch": 2.6720128307939053, + "grad_norm": 0.80049067735672, + "learning_rate": 6.206367424881487e-07, + "loss": 0.4469, + "step": 3332 + }, + { + "epoch": 2.672814755412991, + "grad_norm": 0.8821406960487366, + "learning_rate": 6.176371504023537e-07, + "loss": 0.4731, + "step": 3333 + }, + { + "epoch": 2.673616680032077, + "grad_norm": 0.8810633420944214, + "learning_rate": 6.146445935317502e-07, + "loss": 0.467, + "step": 3334 + }, + { + "epoch": 2.6744186046511627, + "grad_norm": 0.8649298548698425, + "learning_rate": 6.116590741202611e-07, + "loss": 0.4911, + "step": 3335 + }, + { + "epoch": 2.6752205292702484, + "grad_norm": 0.8359307646751404, + "learning_rate": 6.08680594406531e-07, + "loss": 0.4564, + "step": 3336 + }, + { + "epoch": 2.6760224538893342, + "grad_norm": 0.876586377620697, + "learning_rate": 6.057091566239226e-07, + "loss": 0.4856, + "step": 3337 + }, + { + "epoch": 2.67682437850842, + "grad_norm": 0.9136984944343567, + "learning_rate": 6.027447630005234e-07, + "loss": 0.5089, + "step": 3338 + }, + { + "epoch": 2.6776263031275063, + "grad_norm": 0.899994432926178, + "learning_rate": 5.997874157591344e-07, + "loss": 0.4879, + "step": 3339 + }, + { + "epoch": 2.6784282277465916, + "grad_norm": 0.8839572072029114, + "learning_rate": 5.968371171172782e-07, + "loss": 0.4718, + "step": 3340 + }, + { + "epoch": 2.679230152365678, + "grad_norm": 0.8831132650375366, + "learning_rate": 5.938938692871887e-07, + "loss": 0.4668, + "step": 3341 + }, + { + "epoch": 2.680032076984763, + "grad_norm": 0.9043929576873779, + "learning_rate": 5.909576744758117e-07, + "loss": 0.4733, + "step": 3342 + }, + { + "epoch": 2.6808340016038494, + "grad_norm": 0.8958361744880676, + "learning_rate": 5.880285348848069e-07, + "loss": 0.478, + "step": 3343 + }, + { + "epoch": 2.681635926222935, + "grad_norm": 0.8278128504753113, + "learning_rate": 5.851064527105421e-07, + "loss": 0.4736, + "step": 3344 + }, + { + "epoch": 2.682437850842021, + "grad_norm": 0.8567398190498352, + "learning_rate": 5.821914301440956e-07, + "loss": 0.4548, + "step": 3345 + }, + { + "epoch": 2.683239775461107, + "grad_norm": 0.9090203046798706, + "learning_rate": 5.792834693712502e-07, + "loss": 0.4774, + "step": 3346 + }, + { + "epoch": 2.6840417000801926, + "grad_norm": 0.8930779099464417, + "learning_rate": 5.763825725724925e-07, + "loss": 0.4796, + "step": 3347 + }, + { + "epoch": 2.6848436246992784, + "grad_norm": 0.8618937730789185, + "learning_rate": 5.734887419230151e-07, + "loss": 0.4736, + "step": 3348 + }, + { + "epoch": 2.685645549318364, + "grad_norm": 0.8632767796516418, + "learning_rate": 5.70601979592711e-07, + "loss": 0.4666, + "step": 3349 + }, + { + "epoch": 2.68644747393745, + "grad_norm": 0.8496332764625549, + "learning_rate": 5.67722287746173e-07, + "loss": 0.4702, + "step": 3350 + }, + { + "epoch": 2.6872493985565358, + "grad_norm": 0.9124326705932617, + "learning_rate": 5.648496685426908e-07, + "loss": 0.5128, + "step": 3351 + }, + { + "epoch": 2.6880513231756216, + "grad_norm": 0.8609637022018433, + "learning_rate": 5.619841241362522e-07, + "loss": 0.4635, + "step": 3352 + }, + { + "epoch": 2.6888532477947074, + "grad_norm": 0.8587467074394226, + "learning_rate": 5.591256566755399e-07, + "loss": 0.4702, + "step": 3353 + }, + { + "epoch": 2.689655172413793, + "grad_norm": 0.8724468350410461, + "learning_rate": 5.562742683039313e-07, + "loss": 0.5009, + "step": 3354 + }, + { + "epoch": 2.690457097032879, + "grad_norm": 0.8646350502967834, + "learning_rate": 5.534299611594962e-07, + "loss": 0.462, + "step": 3355 + }, + { + "epoch": 2.6912590216519647, + "grad_norm": 0.8256325721740723, + "learning_rate": 5.505927373749887e-07, + "loss": 0.4507, + "step": 3356 + }, + { + "epoch": 2.6920609462710505, + "grad_norm": 0.8692865371704102, + "learning_rate": 5.477625990778579e-07, + "loss": 0.4719, + "step": 3357 + }, + { + "epoch": 2.6928628708901363, + "grad_norm": 0.8937420845031738, + "learning_rate": 5.449395483902376e-07, + "loss": 0.4955, + "step": 3358 + }, + { + "epoch": 2.693664795509222, + "grad_norm": 0.8685393333435059, + "learning_rate": 5.421235874289488e-07, + "loss": 0.4784, + "step": 3359 + }, + { + "epoch": 2.694466720128308, + "grad_norm": 0.8677978515625, + "learning_rate": 5.393147183054936e-07, + "loss": 0.4807, + "step": 3360 + }, + { + "epoch": 2.6952686447473937, + "grad_norm": 0.8556413650512695, + "learning_rate": 5.365129431260574e-07, + "loss": 0.4603, + "step": 3361 + }, + { + "epoch": 2.6960705693664795, + "grad_norm": 0.9038963913917542, + "learning_rate": 5.337182639915073e-07, + "loss": 0.4935, + "step": 3362 + }, + { + "epoch": 2.6968724939855653, + "grad_norm": 0.8712881803512573, + "learning_rate": 5.309306829973892e-07, + "loss": 0.4893, + "step": 3363 + }, + { + "epoch": 2.697674418604651, + "grad_norm": 0.8390231728553772, + "learning_rate": 5.281502022339236e-07, + "loss": 0.4682, + "step": 3364 + }, + { + "epoch": 2.698476343223737, + "grad_norm": 0.8879370093345642, + "learning_rate": 5.253768237860146e-07, + "loss": 0.4695, + "step": 3365 + }, + { + "epoch": 2.6992782678428227, + "grad_norm": 0.9008282423019409, + "learning_rate": 5.226105497332323e-07, + "loss": 0.4811, + "step": 3366 + }, + { + "epoch": 2.7000801924619084, + "grad_norm": 0.9389668107032776, + "learning_rate": 5.19851382149823e-07, + "loss": 0.4956, + "step": 3367 + }, + { + "epoch": 2.7008821170809942, + "grad_norm": 0.8313438296318054, + "learning_rate": 5.170993231047072e-07, + "loss": 0.4591, + "step": 3368 + }, + { + "epoch": 2.70168404170008, + "grad_norm": 0.8891331553459167, + "learning_rate": 5.143543746614688e-07, + "loss": 0.4968, + "step": 3369 + }, + { + "epoch": 2.7024859663191663, + "grad_norm": 0.861348569393158, + "learning_rate": 5.116165388783678e-07, + "loss": 0.4716, + "step": 3370 + }, + { + "epoch": 2.7032878909382516, + "grad_norm": 0.8426232933998108, + "learning_rate": 5.088858178083223e-07, + "loss": 0.487, + "step": 3371 + }, + { + "epoch": 2.704089815557338, + "grad_norm": 0.8665948510169983, + "learning_rate": 5.06162213498923e-07, + "loss": 0.4549, + "step": 3372 + }, + { + "epoch": 2.704891740176423, + "grad_norm": 0.8754244446754456, + "learning_rate": 5.034457279924221e-07, + "loss": 0.4989, + "step": 3373 + }, + { + "epoch": 2.7056936647955094, + "grad_norm": 0.8739911317825317, + "learning_rate": 5.007363633257278e-07, + "loss": 0.4558, + "step": 3374 + }, + { + "epoch": 2.706495589414595, + "grad_norm": 0.8367862701416016, + "learning_rate": 4.980341215304196e-07, + "loss": 0.4608, + "step": 3375 + }, + { + "epoch": 2.707297514033681, + "grad_norm": 0.8726009130477905, + "learning_rate": 4.953390046327278e-07, + "loss": 0.4706, + "step": 3376 + }, + { + "epoch": 2.7080994386527664, + "grad_norm": 0.8462924361228943, + "learning_rate": 4.926510146535434e-07, + "loss": 0.4618, + "step": 3377 + }, + { + "epoch": 2.7089013632718526, + "grad_norm": 0.9273150563240051, + "learning_rate": 4.899701536084134e-07, + "loss": 0.4875, + "step": 3378 + }, + { + "epoch": 2.7097032878909384, + "grad_norm": 0.8953900933265686, + "learning_rate": 4.872964235075361e-07, + "loss": 0.4932, + "step": 3379 + }, + { + "epoch": 2.710505212510024, + "grad_norm": 0.8775522708892822, + "learning_rate": 4.846298263557681e-07, + "loss": 0.4835, + "step": 3380 + }, + { + "epoch": 2.71130713712911, + "grad_norm": 0.8793307542800903, + "learning_rate": 4.819703641526141e-07, + "loss": 0.4748, + "step": 3381 + }, + { + "epoch": 2.7121090617481958, + "grad_norm": 0.8890559673309326, + "learning_rate": 4.793180388922292e-07, + "loss": 0.4906, + "step": 3382 + }, + { + "epoch": 2.7129109863672816, + "grad_norm": 0.8522745370864868, + "learning_rate": 4.766728525634179e-07, + "loss": 0.466, + "step": 3383 + }, + { + "epoch": 2.7137129109863674, + "grad_norm": 0.8627252578735352, + "learning_rate": 4.7403480714963037e-07, + "loss": 0.4565, + "step": 3384 + }, + { + "epoch": 2.714514835605453, + "grad_norm": 0.8617047071456909, + "learning_rate": 4.71403904628962e-07, + "loss": 0.4707, + "step": 3385 + }, + { + "epoch": 2.715316760224539, + "grad_norm": 0.8587998747825623, + "learning_rate": 4.6878014697415374e-07, + "loss": 0.4742, + "step": 3386 + }, + { + "epoch": 2.7161186848436247, + "grad_norm": 0.8326482176780701, + "learning_rate": 4.661635361525885e-07, + "loss": 0.4557, + "step": 3387 + }, + { + "epoch": 2.7169206094627105, + "grad_norm": 0.8801354765892029, + "learning_rate": 4.635540741262923e-07, + "loss": 0.4722, + "step": 3388 + }, + { + "epoch": 2.7177225340817963, + "grad_norm": 0.8579322695732117, + "learning_rate": 4.6095176285192556e-07, + "loss": 0.4689, + "step": 3389 + }, + { + "epoch": 2.718524458700882, + "grad_norm": 0.9005702137947083, + "learning_rate": 4.583566042807908e-07, + "loss": 0.4769, + "step": 3390 + }, + { + "epoch": 2.719326383319968, + "grad_norm": 0.8632087707519531, + "learning_rate": 4.557686003588269e-07, + "loss": 0.4771, + "step": 3391 + }, + { + "epoch": 2.7201283079390537, + "grad_norm": 0.8352670669555664, + "learning_rate": 4.531877530266071e-07, + "loss": 0.4509, + "step": 3392 + }, + { + "epoch": 2.7209302325581395, + "grad_norm": 0.8583428263664246, + "learning_rate": 4.506140642193391e-07, + "loss": 0.4455, + "step": 3393 + }, + { + "epoch": 2.7217321571772253, + "grad_norm": 0.9356517195701599, + "learning_rate": 4.4804753586686013e-07, + "loss": 0.4665, + "step": 3394 + }, + { + "epoch": 2.722534081796311, + "grad_norm": 0.8645097017288208, + "learning_rate": 4.454881698936431e-07, + "loss": 0.4485, + "step": 3395 + }, + { + "epoch": 2.723336006415397, + "grad_norm": 0.902538537979126, + "learning_rate": 4.4293596821878613e-07, + "loss": 0.4833, + "step": 3396 + }, + { + "epoch": 2.7241379310344827, + "grad_norm": 0.8527434468269348, + "learning_rate": 4.403909327560207e-07, + "loss": 0.4663, + "step": 3397 + }, + { + "epoch": 2.7249398556535684, + "grad_norm": 0.8391342759132385, + "learning_rate": 4.378530654136948e-07, + "loss": 0.4596, + "step": 3398 + }, + { + "epoch": 2.7257417802726542, + "grad_norm": 0.8753255009651184, + "learning_rate": 4.3532236809479265e-07, + "loss": 0.461, + "step": 3399 + }, + { + "epoch": 2.72654370489174, + "grad_norm": 0.8954243659973145, + "learning_rate": 4.327988426969154e-07, + "loss": 0.4777, + "step": 3400 + }, + { + "epoch": 2.727345629510826, + "grad_norm": 0.8503953218460083, + "learning_rate": 4.3028249111228824e-07, + "loss": 0.5002, + "step": 3401 + }, + { + "epoch": 2.7281475541299116, + "grad_norm": 0.8919215798377991, + "learning_rate": 4.277733152277597e-07, + "loss": 0.4702, + "step": 3402 + }, + { + "epoch": 2.728949478748998, + "grad_norm": 0.8524766564369202, + "learning_rate": 4.2527131692479127e-07, + "loss": 0.457, + "step": 3403 + }, + { + "epoch": 2.729751403368083, + "grad_norm": 0.8316718935966492, + "learning_rate": 4.227764980794691e-07, + "loss": 0.4827, + "step": 3404 + }, + { + "epoch": 2.7305533279871694, + "grad_norm": 0.9017997980117798, + "learning_rate": 4.202888605624944e-07, + "loss": 0.4885, + "step": 3405 + }, + { + "epoch": 2.731355252606255, + "grad_norm": 0.8500910401344299, + "learning_rate": 4.178084062391774e-07, + "loss": 0.4698, + "step": 3406 + }, + { + "epoch": 2.732157177225341, + "grad_norm": 0.8581136465072632, + "learning_rate": 4.153351369694536e-07, + "loss": 0.4843, + "step": 3407 + }, + { + "epoch": 2.7329591018444264, + "grad_norm": 0.8434959053993225, + "learning_rate": 4.128690546078606e-07, + "loss": 0.4624, + "step": 3408 + }, + { + "epoch": 2.7337610264635126, + "grad_norm": 0.9332457184791565, + "learning_rate": 4.104101610035527e-07, + "loss": 0.4558, + "step": 3409 + }, + { + "epoch": 2.7345629510825984, + "grad_norm": 0.8714927434921265, + "learning_rate": 4.0795845800029156e-07, + "loss": 0.4803, + "step": 3410 + }, + { + "epoch": 2.735364875701684, + "grad_norm": 0.8897708654403687, + "learning_rate": 4.055139474364456e-07, + "loss": 0.4536, + "step": 3411 + }, + { + "epoch": 2.73616680032077, + "grad_norm": 0.8492273688316345, + "learning_rate": 4.030766311449952e-07, + "loss": 0.4663, + "step": 3412 + }, + { + "epoch": 2.7369687249398558, + "grad_norm": 0.8852546811103821, + "learning_rate": 4.006465109535218e-07, + "loss": 0.4609, + "step": 3413 + }, + { + "epoch": 2.7377706495589416, + "grad_norm": 0.8187031149864197, + "learning_rate": 3.9822358868421116e-07, + "loss": 0.4631, + "step": 3414 + }, + { + "epoch": 2.7385725741780274, + "grad_norm": 0.8688458204269409, + "learning_rate": 3.958078661538567e-07, + "loss": 0.4694, + "step": 3415 + }, + { + "epoch": 2.739374498797113, + "grad_norm": 0.97969651222229, + "learning_rate": 3.933993451738427e-07, + "loss": 0.4959, + "step": 3416 + }, + { + "epoch": 2.740176423416199, + "grad_norm": 0.8926424384117126, + "learning_rate": 3.909980275501679e-07, + "loss": 0.4829, + "step": 3417 + }, + { + "epoch": 2.7409783480352847, + "grad_norm": 0.8417994379997253, + "learning_rate": 3.8860391508341754e-07, + "loss": 0.4633, + "step": 3418 + }, + { + "epoch": 2.7417802726543705, + "grad_norm": 0.9168954491615295, + "learning_rate": 3.8621700956877784e-07, + "loss": 0.4935, + "step": 3419 + }, + { + "epoch": 2.7425821972734563, + "grad_norm": 0.8628125786781311, + "learning_rate": 3.8383731279603597e-07, + "loss": 0.4714, + "step": 3420 + }, + { + "epoch": 2.743384121892542, + "grad_norm": 0.8950872421264648, + "learning_rate": 3.8146482654956574e-07, + "loss": 0.4796, + "step": 3421 + }, + { + "epoch": 2.744186046511628, + "grad_norm": 0.8650218844413757, + "learning_rate": 3.7909955260833966e-07, + "loss": 0.4808, + "step": 3422 + }, + { + "epoch": 2.7449879711307137, + "grad_norm": 0.8490626215934753, + "learning_rate": 3.767414927459223e-07, + "loss": 0.4782, + "step": 3423 + }, + { + "epoch": 2.7457898957497995, + "grad_norm": 0.8351301550865173, + "learning_rate": 3.743906487304627e-07, + "loss": 0.4578, + "step": 3424 + }, + { + "epoch": 2.7465918203688853, + "grad_norm": 0.8559272289276123, + "learning_rate": 3.720470223247097e-07, + "loss": 0.4564, + "step": 3425 + }, + { + "epoch": 2.747393744987971, + "grad_norm": 0.8744351267814636, + "learning_rate": 3.697106152859886e-07, + "loss": 0.4932, + "step": 3426 + }, + { + "epoch": 2.748195669607057, + "grad_norm": 0.8866355419158936, + "learning_rate": 3.6738142936622035e-07, + "loss": 0.4939, + "step": 3427 + }, + { + "epoch": 2.7489975942261426, + "grad_norm": 0.8977624177932739, + "learning_rate": 3.650594663119089e-07, + "loss": 0.4924, + "step": 3428 + }, + { + "epoch": 2.7497995188452284, + "grad_norm": 0.8765467405319214, + "learning_rate": 3.6274472786413605e-07, + "loss": 0.4753, + "step": 3429 + }, + { + "epoch": 2.7506014434643142, + "grad_norm": 0.8574792146682739, + "learning_rate": 3.604372157585767e-07, + "loss": 0.4566, + "step": 3430 + }, + { + "epoch": 2.7514033680834, + "grad_norm": 0.9319592714309692, + "learning_rate": 3.5813693172548016e-07, + "loss": 0.4866, + "step": 3431 + }, + { + "epoch": 2.752205292702486, + "grad_norm": 0.8465300798416138, + "learning_rate": 3.5584387748967665e-07, + "loss": 0.4619, + "step": 3432 + }, + { + "epoch": 2.7530072173215716, + "grad_norm": 0.8767644166946411, + "learning_rate": 3.535580547705797e-07, + "loss": 0.4739, + "step": 3433 + }, + { + "epoch": 2.753809141940658, + "grad_norm": 0.8457480072975159, + "learning_rate": 3.512794652821716e-07, + "loss": 0.478, + "step": 3434 + }, + { + "epoch": 2.754611066559743, + "grad_norm": 0.871969997882843, + "learning_rate": 3.490081107330223e-07, + "loss": 0.4537, + "step": 3435 + }, + { + "epoch": 2.7554129911788294, + "grad_norm": 0.8666412234306335, + "learning_rate": 3.4674399282626616e-07, + "loss": 0.4704, + "step": 3436 + }, + { + "epoch": 2.7562149157979148, + "grad_norm": 0.8694742918014526, + "learning_rate": 3.4448711325961834e-07, + "loss": 0.4902, + "step": 3437 + }, + { + "epoch": 2.757016840417001, + "grad_norm": 0.8497282266616821, + "learning_rate": 3.422374737253642e-07, + "loss": 0.4604, + "step": 3438 + }, + { + "epoch": 2.7578187650360864, + "grad_norm": 0.9280922412872314, + "learning_rate": 3.399950759103576e-07, + "loss": 0.4865, + "step": 3439 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.8891953229904175, + "learning_rate": 3.37759921496027e-07, + "loss": 0.477, + "step": 3440 + }, + { + "epoch": 2.759422614274258, + "grad_norm": 0.9152265787124634, + "learning_rate": 3.355320121583672e-07, + "loss": 0.4823, + "step": 3441 + }, + { + "epoch": 2.760224538893344, + "grad_norm": 0.8602423071861267, + "learning_rate": 3.3331134956793965e-07, + "loss": 0.474, + "step": 3442 + }, + { + "epoch": 2.76102646351243, + "grad_norm": 0.8670658469200134, + "learning_rate": 3.3109793538987356e-07, + "loss": 0.4873, + "step": 3443 + }, + { + "epoch": 2.7618283881315158, + "grad_norm": 0.9038890600204468, + "learning_rate": 3.288917712838613e-07, + "loss": 0.4697, + "step": 3444 + }, + { + "epoch": 2.7626303127506016, + "grad_norm": 0.9339286684989929, + "learning_rate": 3.266928589041607e-07, + "loss": 0.4837, + "step": 3445 + }, + { + "epoch": 2.7634322373696873, + "grad_norm": 0.8574177622795105, + "learning_rate": 3.2450119989959064e-07, + "loss": 0.4526, + "step": 3446 + }, + { + "epoch": 2.764234161988773, + "grad_norm": 0.8637518882751465, + "learning_rate": 3.2231679591353203e-07, + "loss": 0.4726, + "step": 3447 + }, + { + "epoch": 2.765036086607859, + "grad_norm": 0.8453518152236938, + "learning_rate": 3.201396485839259e-07, + "loss": 0.4474, + "step": 3448 + }, + { + "epoch": 2.7658380112269447, + "grad_norm": 0.8873100280761719, + "learning_rate": 3.179697595432707e-07, + "loss": 0.4729, + "step": 3449 + }, + { + "epoch": 2.7666399358460305, + "grad_norm": 0.8434620499610901, + "learning_rate": 3.158071304186228e-07, + "loss": 0.4532, + "step": 3450 + }, + { + "epoch": 2.7674418604651163, + "grad_norm": 0.8255607485771179, + "learning_rate": 3.136517628315949e-07, + "loss": 0.4721, + "step": 3451 + }, + { + "epoch": 2.768243785084202, + "grad_norm": 0.8967651724815369, + "learning_rate": 3.1150365839835773e-07, + "loss": 0.4832, + "step": 3452 + }, + { + "epoch": 2.769045709703288, + "grad_norm": 0.9030566811561584, + "learning_rate": 3.093628187296294e-07, + "loss": 0.5076, + "step": 3453 + }, + { + "epoch": 2.7698476343223737, + "grad_norm": 0.8555736541748047, + "learning_rate": 3.0722924543068687e-07, + "loss": 0.4568, + "step": 3454 + }, + { + "epoch": 2.7706495589414595, + "grad_norm": 0.8584993481636047, + "learning_rate": 3.0510294010135387e-07, + "loss": 0.4755, + "step": 3455 + }, + { + "epoch": 2.7714514835605453, + "grad_norm": 0.9319979548454285, + "learning_rate": 3.0298390433600945e-07, + "loss": 0.494, + "step": 3456 + }, + { + "epoch": 2.772253408179631, + "grad_norm": 0.8971447348594666, + "learning_rate": 3.008721397235781e-07, + "loss": 0.4819, + "step": 3457 + }, + { + "epoch": 2.773055332798717, + "grad_norm": 0.8856346011161804, + "learning_rate": 2.9876764784753096e-07, + "loss": 0.4959, + "step": 3458 + }, + { + "epoch": 2.7738572574178026, + "grad_norm": 0.8331868052482605, + "learning_rate": 2.966704302858892e-07, + "loss": 0.4604, + "step": 3459 + }, + { + "epoch": 2.7746591820368884, + "grad_norm": 0.8967451453208923, + "learning_rate": 2.945804886112169e-07, + "loss": 0.4757, + "step": 3460 + }, + { + "epoch": 2.7754611066559742, + "grad_norm": 0.8893353343009949, + "learning_rate": 2.924978243906251e-07, + "loss": 0.4762, + "step": 3461 + }, + { + "epoch": 2.77626303127506, + "grad_norm": 0.9258598685264587, + "learning_rate": 2.9042243918576574e-07, + "loss": 0.4907, + "step": 3462 + }, + { + "epoch": 2.777064955894146, + "grad_norm": 0.8462851047515869, + "learning_rate": 2.883543345528328e-07, + "loss": 0.4659, + "step": 3463 + }, + { + "epoch": 2.7778668805132316, + "grad_norm": 0.8520975112915039, + "learning_rate": 2.862935120425614e-07, + "loss": 0.4619, + "step": 3464 + }, + { + "epoch": 2.7786688051323174, + "grad_norm": 0.9390130639076233, + "learning_rate": 2.8423997320022765e-07, + "loss": 0.4773, + "step": 3465 + }, + { + "epoch": 2.779470729751403, + "grad_norm": 0.9307414293289185, + "learning_rate": 2.821937195656421e-07, + "loss": 0.4857, + "step": 3466 + }, + { + "epoch": 2.7802726543704894, + "grad_norm": 0.9021451473236084, + "learning_rate": 2.801547526731596e-07, + "loss": 0.4645, + "step": 3467 + }, + { + "epoch": 2.7810745789895748, + "grad_norm": 0.8632877469062805, + "learning_rate": 2.781230740516649e-07, + "loss": 0.4581, + "step": 3468 + }, + { + "epoch": 2.781876503608661, + "grad_norm": 0.8747298717498779, + "learning_rate": 2.760986852245784e-07, + "loss": 0.4629, + "step": 3469 + }, + { + "epoch": 2.7826784282277464, + "grad_norm": 0.8483293652534485, + "learning_rate": 2.7408158770985905e-07, + "loss": 0.4507, + "step": 3470 + }, + { + "epoch": 2.7834803528468326, + "grad_norm": 0.853502094745636, + "learning_rate": 2.720717830199904e-07, + "loss": 0.4687, + "step": 3471 + }, + { + "epoch": 2.784282277465918, + "grad_norm": 0.9311491250991821, + "learning_rate": 2.70069272661998e-07, + "loss": 0.5025, + "step": 3472 + }, + { + "epoch": 2.785084202085004, + "grad_norm": 0.930825412273407, + "learning_rate": 2.680740581374286e-07, + "loss": 0.4954, + "step": 3473 + }, + { + "epoch": 2.78588612670409, + "grad_norm": 0.866186797618866, + "learning_rate": 2.6608614094236317e-07, + "loss": 0.4692, + "step": 3474 + }, + { + "epoch": 2.7866880513231758, + "grad_norm": 0.8751170635223389, + "learning_rate": 2.641055225674105e-07, + "loss": 0.4558, + "step": 3475 + }, + { + "epoch": 2.7874899759422616, + "grad_norm": 0.8723040819168091, + "learning_rate": 2.6213220449770373e-07, + "loss": 0.4459, + "step": 3476 + }, + { + "epoch": 2.7882919005613473, + "grad_norm": 0.8558136820793152, + "learning_rate": 2.6016618821290583e-07, + "loss": 0.4557, + "step": 3477 + }, + { + "epoch": 2.789093825180433, + "grad_norm": 0.8481677770614624, + "learning_rate": 2.5820747518720326e-07, + "loss": 0.4699, + "step": 3478 + }, + { + "epoch": 2.789895749799519, + "grad_norm": 0.8892841339111328, + "learning_rate": 2.5625606688930107e-07, + "loss": 0.4521, + "step": 3479 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 0.9031122326850891, + "learning_rate": 2.5431196478243767e-07, + "loss": 0.4977, + "step": 3480 + }, + { + "epoch": 2.7914995990376905, + "grad_norm": 0.8958612680435181, + "learning_rate": 2.5237517032436374e-07, + "loss": 0.4658, + "step": 3481 + }, + { + "epoch": 2.7923015236567763, + "grad_norm": 0.8278775811195374, + "learning_rate": 2.5044568496735534e-07, + "loss": 0.4532, + "step": 3482 + }, + { + "epoch": 2.793103448275862, + "grad_norm": 0.8773555159568787, + "learning_rate": 2.485235101582051e-07, + "loss": 0.4817, + "step": 3483 + }, + { + "epoch": 2.793905372894948, + "grad_norm": 0.8488254547119141, + "learning_rate": 2.466086473382234e-07, + "loss": 0.4544, + "step": 3484 + }, + { + "epoch": 2.7947072975140337, + "grad_norm": 0.8559311628341675, + "learning_rate": 2.4470109794324405e-07, + "loss": 0.4533, + "step": 3485 + }, + { + "epoch": 2.7955092221331195, + "grad_norm": 0.9088083505630493, + "learning_rate": 2.4280086340360944e-07, + "loss": 0.4757, + "step": 3486 + }, + { + "epoch": 2.7963111467522053, + "grad_norm": 0.8725008964538574, + "learning_rate": 2.409079451441809e-07, + "loss": 0.4424, + "step": 3487 + }, + { + "epoch": 2.797113071371291, + "grad_norm": 0.8594610095024109, + "learning_rate": 2.3902234458433315e-07, + "loss": 0.4581, + "step": 3488 + }, + { + "epoch": 2.797914995990377, + "grad_norm": 0.8405497670173645, + "learning_rate": 2.371440631379529e-07, + "loss": 0.4769, + "step": 3489 + }, + { + "epoch": 2.7987169206094626, + "grad_norm": 0.9169846177101135, + "learning_rate": 2.3527310221344136e-07, + "loss": 0.4886, + "step": 3490 + }, + { + "epoch": 2.7995188452285484, + "grad_norm": 0.9240108132362366, + "learning_rate": 2.334094632137063e-07, + "loss": 0.4912, + "step": 3491 + }, + { + "epoch": 2.8003207698476342, + "grad_norm": 0.853226900100708, + "learning_rate": 2.3155314753616874e-07, + "loss": 0.4897, + "step": 3492 + }, + { + "epoch": 2.80112269446672, + "grad_norm": 0.863446831703186, + "learning_rate": 2.297041565727598e-07, + "loss": 0.4654, + "step": 3493 + }, + { + "epoch": 2.801924619085806, + "grad_norm": 0.8915846347808838, + "learning_rate": 2.2786249170991148e-07, + "loss": 0.4854, + "step": 3494 + }, + { + "epoch": 2.8027265437048916, + "grad_norm": 0.8765985369682312, + "learning_rate": 2.260281543285703e-07, + "loss": 0.4911, + "step": 3495 + }, + { + "epoch": 2.8035284683239774, + "grad_norm": 0.8515004515647888, + "learning_rate": 2.2420114580418262e-07, + "loss": 0.4665, + "step": 3496 + }, + { + "epoch": 2.804330392943063, + "grad_norm": 0.8506413698196411, + "learning_rate": 2.2238146750670264e-07, + "loss": 0.4608, + "step": 3497 + }, + { + "epoch": 2.8051323175621494, + "grad_norm": 0.8681690096855164, + "learning_rate": 2.205691208005889e-07, + "loss": 0.4648, + "step": 3498 + }, + { + "epoch": 2.8059342421812348, + "grad_norm": 0.855263888835907, + "learning_rate": 2.1876410704479767e-07, + "loss": 0.4622, + "step": 3499 + }, + { + "epoch": 2.806736166800321, + "grad_norm": 0.8927332758903503, + "learning_rate": 2.1696642759279074e-07, + "loss": 0.4671, + "step": 3500 + }, + { + "epoch": 2.8075380914194064, + "grad_norm": 0.8977309465408325, + "learning_rate": 2.1517608379252985e-07, + "loss": 0.4874, + "step": 3501 + }, + { + "epoch": 2.8083400160384926, + "grad_norm": 0.8782375454902649, + "learning_rate": 2.133930769864756e-07, + "loss": 0.4645, + "step": 3502 + }, + { + "epoch": 2.809141940657578, + "grad_norm": 0.8151038885116577, + "learning_rate": 2.1161740851158742e-07, + "loss": 0.4574, + "step": 3503 + }, + { + "epoch": 2.809943865276664, + "grad_norm": 0.884947657585144, + "learning_rate": 2.0984907969932134e-07, + "loss": 0.4472, + "step": 3504 + }, + { + "epoch": 2.8107457898957495, + "grad_norm": 0.8524783253669739, + "learning_rate": 2.0808809187563118e-07, + "loss": 0.4669, + "step": 3505 + }, + { + "epoch": 2.8115477145148358, + "grad_norm": 0.8649755120277405, + "learning_rate": 2.063344463609651e-07, + "loss": 0.4632, + "step": 3506 + }, + { + "epoch": 2.8123496391339216, + "grad_norm": 0.849073052406311, + "learning_rate": 2.0458814447026687e-07, + "loss": 0.4809, + "step": 3507 + }, + { + "epoch": 2.8131515637530073, + "grad_norm": 0.872164785861969, + "learning_rate": 2.0284918751297235e-07, + "loss": 0.4942, + "step": 3508 + }, + { + "epoch": 2.813953488372093, + "grad_norm": 0.8174782395362854, + "learning_rate": 2.011175767930118e-07, + "loss": 0.44, + "step": 3509 + }, + { + "epoch": 2.814755412991179, + "grad_norm": 0.8372223973274231, + "learning_rate": 1.9939331360880442e-07, + "loss": 0.4673, + "step": 3510 + }, + { + "epoch": 2.8155573376102647, + "grad_norm": 0.8555891513824463, + "learning_rate": 1.9767639925326155e-07, + "loss": 0.4708, + "step": 3511 + }, + { + "epoch": 2.8163592622293505, + "grad_norm": 0.8843154311180115, + "learning_rate": 1.9596683501378666e-07, + "loss": 0.4717, + "step": 3512 + }, + { + "epoch": 2.8171611868484363, + "grad_norm": 0.8556921482086182, + "learning_rate": 1.942646221722655e-07, + "loss": 0.4618, + "step": 3513 + }, + { + "epoch": 2.817963111467522, + "grad_norm": 0.8737980127334595, + "learning_rate": 1.9256976200507814e-07, + "loss": 0.4838, + "step": 3514 + }, + { + "epoch": 2.818765036086608, + "grad_norm": 0.8971306085586548, + "learning_rate": 1.9088225578308582e-07, + "loss": 0.4755, + "step": 3515 + }, + { + "epoch": 2.8195669607056937, + "grad_norm": 0.8777005672454834, + "learning_rate": 1.892021047716408e-07, + "loss": 0.4624, + "step": 3516 + }, + { + "epoch": 2.8203688853247795, + "grad_norm": 0.844862163066864, + "learning_rate": 1.8752931023057753e-07, + "loss": 0.45, + "step": 3517 + }, + { + "epoch": 2.8211708099438653, + "grad_norm": 0.8649774789810181, + "learning_rate": 1.858638734142104e-07, + "loss": 0.4738, + "step": 3518 + }, + { + "epoch": 2.821972734562951, + "grad_norm": 0.8356218934059143, + "learning_rate": 1.842057955713461e-07, + "loss": 0.4632, + "step": 3519 + }, + { + "epoch": 2.822774659182037, + "grad_norm": 0.8642269968986511, + "learning_rate": 1.8255507794526338e-07, + "loss": 0.4708, + "step": 3520 + }, + { + "epoch": 2.8235765838011226, + "grad_norm": 0.8712006211280823, + "learning_rate": 1.8091172177372994e-07, + "loss": 0.4649, + "step": 3521 + }, + { + "epoch": 2.8243785084202084, + "grad_norm": 0.8947505354881287, + "learning_rate": 1.7927572828898788e-07, + "loss": 0.4599, + "step": 3522 + }, + { + "epoch": 2.8251804330392942, + "grad_norm": 0.8280917406082153, + "learning_rate": 1.776470987177614e-07, + "loss": 0.4514, + "step": 3523 + }, + { + "epoch": 2.82598235765838, + "grad_norm": 0.8675678968429565, + "learning_rate": 1.7602583428125263e-07, + "loss": 0.4769, + "step": 3524 + }, + { + "epoch": 2.826784282277466, + "grad_norm": 0.8008211255073547, + "learning_rate": 1.744119361951413e-07, + "loss": 0.4635, + "step": 3525 + }, + { + "epoch": 2.8275862068965516, + "grad_norm": 0.8509759306907654, + "learning_rate": 1.728054056695816e-07, + "loss": 0.4531, + "step": 3526 + }, + { + "epoch": 2.8283881315156374, + "grad_norm": 0.8491147756576538, + "learning_rate": 1.712062439092077e-07, + "loss": 0.4622, + "step": 3527 + }, + { + "epoch": 2.829190056134723, + "grad_norm": 0.8817112445831299, + "learning_rate": 1.6961445211312265e-07, + "loss": 0.4772, + "step": 3528 + }, + { + "epoch": 2.829991980753809, + "grad_norm": 0.8488009572029114, + "learning_rate": 1.6803003147490727e-07, + "loss": 0.4619, + "step": 3529 + }, + { + "epoch": 2.8307939053728948, + "grad_norm": 0.8665116429328918, + "learning_rate": 1.6645298318261449e-07, + "loss": 0.4584, + "step": 3530 + }, + { + "epoch": 2.831595829991981, + "grad_norm": 0.8897413015365601, + "learning_rate": 1.648833084187673e-07, + "loss": 0.4742, + "step": 3531 + }, + { + "epoch": 2.8323977546110664, + "grad_norm": 0.8791276216506958, + "learning_rate": 1.6332100836036425e-07, + "loss": 0.4718, + "step": 3532 + }, + { + "epoch": 2.8331996792301526, + "grad_norm": 0.8625409603118896, + "learning_rate": 1.617660841788682e-07, + "loss": 0.4715, + "step": 3533 + }, + { + "epoch": 2.834001603849238, + "grad_norm": 0.887173593044281, + "learning_rate": 1.602185370402154e-07, + "loss": 0.5021, + "step": 3534 + }, + { + "epoch": 2.834803528468324, + "grad_norm": 0.9261402487754822, + "learning_rate": 1.5867836810481095e-07, + "loss": 0.5014, + "step": 3535 + }, + { + "epoch": 2.8356054530874095, + "grad_norm": 0.8724820017814636, + "learning_rate": 1.5714557852752222e-07, + "loss": 0.4643, + "step": 3536 + }, + { + "epoch": 2.8364073777064958, + "grad_norm": 0.8310959339141846, + "learning_rate": 1.5562016945769088e-07, + "loss": 0.4687, + "step": 3537 + }, + { + "epoch": 2.8372093023255816, + "grad_norm": 0.8877079486846924, + "learning_rate": 1.5410214203911754e-07, + "loss": 0.4681, + "step": 3538 + }, + { + "epoch": 2.8380112269446673, + "grad_norm": 0.8687597513198853, + "learning_rate": 1.5259149741007284e-07, + "loss": 0.4639, + "step": 3539 + }, + { + "epoch": 2.838813151563753, + "grad_norm": 0.8550201058387756, + "learning_rate": 1.5108823670328954e-07, + "loss": 0.4792, + "step": 3540 + }, + { + "epoch": 2.839615076182839, + "grad_norm": 0.9106943011283875, + "learning_rate": 1.4959236104596265e-07, + "loss": 0.4758, + "step": 3541 + }, + { + "epoch": 2.8404170008019247, + "grad_norm": 0.8858274221420288, + "learning_rate": 1.4810387155975158e-07, + "loss": 0.4608, + "step": 3542 + }, + { + "epoch": 2.8412189254210105, + "grad_norm": 0.8574510216712952, + "learning_rate": 1.466227693607747e-07, + "loss": 0.4688, + "step": 3543 + }, + { + "epoch": 2.8420208500400963, + "grad_norm": 0.8784916400909424, + "learning_rate": 1.4514905555961578e-07, + "loss": 0.4763, + "step": 3544 + }, + { + "epoch": 2.842822774659182, + "grad_norm": 0.863102912902832, + "learning_rate": 1.4368273126131428e-07, + "loss": 0.4472, + "step": 3545 + }, + { + "epoch": 2.843624699278268, + "grad_norm": 0.8521451354026794, + "learning_rate": 1.4222379756536841e-07, + "loss": 0.4649, + "step": 3546 + }, + { + "epoch": 2.8444266238973537, + "grad_norm": 0.8406957983970642, + "learning_rate": 1.4077225556573872e-07, + "loss": 0.4707, + "step": 3547 + }, + { + "epoch": 2.8452285485164395, + "grad_norm": 0.8615586757659912, + "learning_rate": 1.3932810635083893e-07, + "loss": 0.4524, + "step": 3548 + }, + { + "epoch": 2.8460304731355253, + "grad_norm": 0.8704116344451904, + "learning_rate": 1.378913510035429e-07, + "loss": 0.4781, + "step": 3549 + }, + { + "epoch": 2.846832397754611, + "grad_norm": 0.8566069602966309, + "learning_rate": 1.3646199060117881e-07, + "loss": 0.4628, + "step": 3550 + }, + { + "epoch": 2.847634322373697, + "grad_norm": 0.8848322629928589, + "learning_rate": 1.3504002621552937e-07, + "loss": 0.4675, + "step": 3551 + }, + { + "epoch": 2.8484362469927826, + "grad_norm": 0.8653082251548767, + "learning_rate": 1.3362545891283052e-07, + "loss": 0.4589, + "step": 3552 + }, + { + "epoch": 2.8492381716118684, + "grad_norm": 0.8225123286247253, + "learning_rate": 1.3221828975377382e-07, + "loss": 0.4574, + "step": 3553 + }, + { + "epoch": 2.8500400962309542, + "grad_norm": 0.838966965675354, + "learning_rate": 1.3081851979350412e-07, + "loss": 0.4877, + "step": 3554 + }, + { + "epoch": 2.85084202085004, + "grad_norm": 0.8537912368774414, + "learning_rate": 1.294261500816152e-07, + "loss": 0.4686, + "step": 3555 + }, + { + "epoch": 2.851643945469126, + "grad_norm": 0.9012706279754639, + "learning_rate": 1.2804118166215297e-07, + "loss": 0.4968, + "step": 3556 + }, + { + "epoch": 2.8524458700882116, + "grad_norm": 0.8772068619728088, + "learning_rate": 1.266636155736145e-07, + "loss": 0.474, + "step": 3557 + }, + { + "epoch": 2.8532477947072974, + "grad_norm": 0.8455215692520142, + "learning_rate": 1.252934528489458e-07, + "loss": 0.4907, + "step": 3558 + }, + { + "epoch": 2.854049719326383, + "grad_norm": 0.9360870718955994, + "learning_rate": 1.2393069451554163e-07, + "loss": 0.5056, + "step": 3559 + }, + { + "epoch": 2.854851643945469, + "grad_norm": 0.8860448598861694, + "learning_rate": 1.2257534159524353e-07, + "loss": 0.4777, + "step": 3560 + }, + { + "epoch": 2.8556535685645548, + "grad_norm": 0.8781315088272095, + "learning_rate": 1.21227395104343e-07, + "loss": 0.4508, + "step": 3561 + }, + { + "epoch": 2.856455493183641, + "grad_norm": 0.8404563069343567, + "learning_rate": 1.1988685605357486e-07, + "loss": 0.4514, + "step": 3562 + }, + { + "epoch": 2.8572574178027264, + "grad_norm": 0.8559548258781433, + "learning_rate": 1.1855372544812172e-07, + "loss": 0.4616, + "step": 3563 + }, + { + "epoch": 2.8580593424218126, + "grad_norm": 0.9076322913169861, + "learning_rate": 1.172280042876106e-07, + "loss": 0.4811, + "step": 3564 + }, + { + "epoch": 2.858861267040898, + "grad_norm": 0.9246238470077515, + "learning_rate": 1.1590969356611081e-07, + "loss": 0.4823, + "step": 3565 + }, + { + "epoch": 2.859663191659984, + "grad_norm": 0.9094520211219788, + "learning_rate": 1.1459879427213827e-07, + "loss": 0.483, + "step": 3566 + }, + { + "epoch": 2.8604651162790695, + "grad_norm": 0.8826652765274048, + "learning_rate": 1.1329530738865003e-07, + "loss": 0.5114, + "step": 3567 + }, + { + "epoch": 2.8612670408981558, + "grad_norm": 0.8294800519943237, + "learning_rate": 1.1199923389304201e-07, + "loss": 0.4441, + "step": 3568 + }, + { + "epoch": 2.862068965517241, + "grad_norm": 0.8477868437767029, + "learning_rate": 1.1071057475715797e-07, + "loss": 0.4362, + "step": 3569 + }, + { + "epoch": 2.8628708901363273, + "grad_norm": 0.8606266379356384, + "learning_rate": 1.0942933094727715e-07, + "loss": 0.4741, + "step": 3570 + }, + { + "epoch": 2.863672814755413, + "grad_norm": 0.877900242805481, + "learning_rate": 1.0815550342411885e-07, + "loss": 0.4655, + "step": 3571 + }, + { + "epoch": 2.864474739374499, + "grad_norm": 0.8586121201515198, + "learning_rate": 1.0688909314284346e-07, + "loss": 0.4838, + "step": 3572 + }, + { + "epoch": 2.8652766639935847, + "grad_norm": 0.8372088074684143, + "learning_rate": 1.0563010105304694e-07, + "loss": 0.4737, + "step": 3573 + }, + { + "epoch": 2.8660785886126705, + "grad_norm": 0.8549312353134155, + "learning_rate": 1.0437852809876636e-07, + "loss": 0.4736, + "step": 3574 + }, + { + "epoch": 2.8668805132317563, + "grad_norm": 0.8639695644378662, + "learning_rate": 1.0313437521847325e-07, + "loss": 0.4589, + "step": 3575 + }, + { + "epoch": 2.867682437850842, + "grad_norm": 0.8854097723960876, + "learning_rate": 1.0189764334507579e-07, + "loss": 0.4866, + "step": 3576 + }, + { + "epoch": 2.868484362469928, + "grad_norm": 0.8394895792007446, + "learning_rate": 1.0066833340591664e-07, + "loss": 0.4556, + "step": 3577 + }, + { + "epoch": 2.8692862870890137, + "grad_norm": 0.8737897276878357, + "learning_rate": 9.944644632277512e-08, + "loss": 0.4882, + "step": 3578 + }, + { + "epoch": 2.8700882117080995, + "grad_norm": 0.9116494655609131, + "learning_rate": 9.823198301186387e-08, + "loss": 0.463, + "step": 3579 + }, + { + "epoch": 2.8708901363271853, + "grad_norm": 0.8393471837043762, + "learning_rate": 9.702494438383003e-08, + "loss": 0.4651, + "step": 3580 + }, + { + "epoch": 2.871692060946271, + "grad_norm": 0.8796486258506775, + "learning_rate": 9.582533134374849e-08, + "loss": 0.4699, + "step": 3581 + }, + { + "epoch": 2.872493985565357, + "grad_norm": 0.8335583209991455, + "learning_rate": 9.463314479113416e-08, + "loss": 0.4686, + "step": 3582 + }, + { + "epoch": 2.8732959101844426, + "grad_norm": 0.8907720446586609, + "learning_rate": 9.344838561992642e-08, + "loss": 0.4928, + "step": 3583 + }, + { + "epoch": 2.8740978348035284, + "grad_norm": 0.8920674920082092, + "learning_rate": 9.227105471849795e-08, + "loss": 0.4846, + "step": 3584 + }, + { + "epoch": 2.874899759422614, + "grad_norm": 0.8768170475959778, + "learning_rate": 9.110115296965482e-08, + "loss": 0.4715, + "step": 3585 + }, + { + "epoch": 2.8757016840417, + "grad_norm": 0.8467321395874023, + "learning_rate": 8.993868125062533e-08, + "loss": 0.4579, + "step": 3586 + }, + { + "epoch": 2.876503608660786, + "grad_norm": 0.9148172736167908, + "learning_rate": 8.87836404330722e-08, + "loss": 0.4825, + "step": 3587 + }, + { + "epoch": 2.8773055332798716, + "grad_norm": 0.8339017629623413, + "learning_rate": 8.763603138308485e-08, + "loss": 0.4647, + "step": 3588 + }, + { + "epoch": 2.8781074578989574, + "grad_norm": 0.8492835760116577, + "learning_rate": 8.64958549611783e-08, + "loss": 0.4491, + "step": 3589 + }, + { + "epoch": 2.878909382518043, + "grad_norm": 0.8415125608444214, + "learning_rate": 8.536311202229641e-08, + "loss": 0.484, + "step": 3590 + }, + { + "epoch": 2.879711307137129, + "grad_norm": 0.8640526533126831, + "learning_rate": 8.423780341580756e-08, + "loss": 0.4564, + "step": 3591 + }, + { + "epoch": 2.8805132317562148, + "grad_norm": 0.8569619655609131, + "learning_rate": 8.311992998550789e-08, + "loss": 0.4632, + "step": 3592 + }, + { + "epoch": 2.8813151563753006, + "grad_norm": 0.8628082871437073, + "learning_rate": 8.200949256961687e-08, + "loss": 0.4823, + "step": 3593 + }, + { + "epoch": 2.8821170809943863, + "grad_norm": 0.8405731916427612, + "learning_rate": 8.090649200077627e-08, + "loss": 0.4679, + "step": 3594 + }, + { + "epoch": 2.8829190056134726, + "grad_norm": 0.831079363822937, + "learning_rate": 7.98109291060567e-08, + "loss": 0.4371, + "step": 3595 + }, + { + "epoch": 2.883720930232558, + "grad_norm": 0.8602555990219116, + "learning_rate": 7.872280470694549e-08, + "loss": 0.4744, + "step": 3596 + }, + { + "epoch": 2.884522854851644, + "grad_norm": 0.8295644521713257, + "learning_rate": 7.764211961935664e-08, + "loss": 0.4517, + "step": 3597 + }, + { + "epoch": 2.8853247794707295, + "grad_norm": 0.8858298659324646, + "learning_rate": 7.656887465362528e-08, + "loss": 0.4617, + "step": 3598 + }, + { + "epoch": 2.8861267040898158, + "grad_norm": 0.8486381769180298, + "learning_rate": 7.550307061450546e-08, + "loss": 0.474, + "step": 3599 + }, + { + "epoch": 2.886928628708901, + "grad_norm": 0.8799077272415161, + "learning_rate": 7.444470830117456e-08, + "loss": 0.4681, + "step": 3600 + }, + { + "epoch": 2.8877305533279873, + "grad_norm": 0.8846973776817322, + "learning_rate": 7.339378850722889e-08, + "loss": 0.4839, + "step": 3601 + }, + { + "epoch": 2.888532477947073, + "grad_norm": 0.8882151246070862, + "learning_rate": 7.235031202068255e-08, + "loss": 0.4527, + "step": 3602 + }, + { + "epoch": 2.889334402566159, + "grad_norm": 0.8758078813552856, + "learning_rate": 7.131427962397076e-08, + "loss": 0.5004, + "step": 3603 + }, + { + "epoch": 2.8901363271852447, + "grad_norm": 0.9011175632476807, + "learning_rate": 7.028569209394653e-08, + "loss": 0.4439, + "step": 3604 + }, + { + "epoch": 2.8909382518043305, + "grad_norm": 0.8999505043029785, + "learning_rate": 6.92645502018785e-08, + "loss": 0.5111, + "step": 3605 + }, + { + "epoch": 2.8917401764234163, + "grad_norm": 0.9070544242858887, + "learning_rate": 6.825085471345416e-08, + "loss": 0.4621, + "step": 3606 + }, + { + "epoch": 2.892542101042502, + "grad_norm": 0.8917433619499207, + "learning_rate": 6.724460638877661e-08, + "loss": 0.4679, + "step": 3607 + }, + { + "epoch": 2.893344025661588, + "grad_norm": 0.8720741868019104, + "learning_rate": 6.624580598236563e-08, + "loss": 0.4559, + "step": 3608 + }, + { + "epoch": 2.8941459502806737, + "grad_norm": 0.8378724455833435, + "learning_rate": 6.525445424315546e-08, + "loss": 0.4542, + "step": 3609 + }, + { + "epoch": 2.8949478748997595, + "grad_norm": 0.9167259335517883, + "learning_rate": 6.427055191449483e-08, + "loss": 0.4639, + "step": 3610 + }, + { + "epoch": 2.8957497995188453, + "grad_norm": 0.9500483274459839, + "learning_rate": 6.329409973414913e-08, + "loss": 0.4829, + "step": 3611 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 0.8519693613052368, + "learning_rate": 6.23250984342938e-08, + "loss": 0.4823, + "step": 3612 + }, + { + "epoch": 2.897353648757017, + "grad_norm": 0.8825336694717407, + "learning_rate": 6.136354874151874e-08, + "loss": 0.4694, + "step": 3613 + }, + { + "epoch": 2.8981555733761026, + "grad_norm": 0.8310946226119995, + "learning_rate": 6.04094513768283e-08, + "loss": 0.468, + "step": 3614 + }, + { + "epoch": 2.8989574979951884, + "grad_norm": 0.7865362763404846, + "learning_rate": 5.9462807055635787e-08, + "loss": 0.4555, + "step": 3615 + }, + { + "epoch": 2.899759422614274, + "grad_norm": 0.8779739141464233, + "learning_rate": 5.852361648776672e-08, + "loss": 0.4533, + "step": 3616 + }, + { + "epoch": 2.90056134723336, + "grad_norm": 0.9257890582084656, + "learning_rate": 5.7591880377459995e-08, + "loss": 0.4784, + "step": 3617 + }, + { + "epoch": 2.901363271852446, + "grad_norm": 0.8655847907066345, + "learning_rate": 5.666759942336231e-08, + "loss": 0.4619, + "step": 3618 + }, + { + "epoch": 2.9021651964715316, + "grad_norm": 0.9047146439552307, + "learning_rate": 5.5750774318531486e-08, + "loss": 0.4821, + "step": 3619 + }, + { + "epoch": 2.9029671210906174, + "grad_norm": 0.8486894965171814, + "learning_rate": 5.4841405750433175e-08, + "loss": 0.4506, + "step": 3620 + }, + { + "epoch": 2.903769045709703, + "grad_norm": 0.9041795134544373, + "learning_rate": 5.393949440094415e-08, + "loss": 0.4725, + "step": 3621 + }, + { + "epoch": 2.904570970328789, + "grad_norm": 0.8865926265716553, + "learning_rate": 5.304504094634677e-08, + "loss": 0.5029, + "step": 3622 + }, + { + "epoch": 2.9053728949478748, + "grad_norm": 0.8497272729873657, + "learning_rate": 5.2158046057333434e-08, + "loss": 0.4617, + "step": 3623 + }, + { + "epoch": 2.9061748195669606, + "grad_norm": 0.8616043925285339, + "learning_rate": 5.1278510399004334e-08, + "loss": 0.4697, + "step": 3624 + }, + { + "epoch": 2.9069767441860463, + "grad_norm": 0.8522729873657227, + "learning_rate": 5.040643463086303e-08, + "loss": 0.4575, + "step": 3625 + }, + { + "epoch": 2.9077786688051326, + "grad_norm": 0.8534807562828064, + "learning_rate": 4.954181940682201e-08, + "loss": 0.4719, + "step": 3626 + }, + { + "epoch": 2.908580593424218, + "grad_norm": 0.8463496565818787, + "learning_rate": 4.8684665375201553e-08, + "loss": 0.4726, + "step": 3627 + }, + { + "epoch": 2.909382518043304, + "grad_norm": 0.8361274003982544, + "learning_rate": 4.7834973178721986e-08, + "loss": 0.4646, + "step": 3628 + }, + { + "epoch": 2.9101844426623895, + "grad_norm": 0.8507230877876282, + "learning_rate": 4.6992743454513654e-08, + "loss": 0.4596, + "step": 3629 + }, + { + "epoch": 2.9109863672814758, + "grad_norm": 0.8449583649635315, + "learning_rate": 4.615797683410694e-08, + "loss": 0.4887, + "step": 3630 + }, + { + "epoch": 2.911788291900561, + "grad_norm": 0.8460783362388611, + "learning_rate": 4.533067394344115e-08, + "loss": 0.4702, + "step": 3631 + }, + { + "epoch": 2.9125902165196473, + "grad_norm": 0.89262855052948, + "learning_rate": 4.4510835402853394e-08, + "loss": 0.478, + "step": 3632 + }, + { + "epoch": 2.9133921411387327, + "grad_norm": 0.8806936144828796, + "learning_rate": 4.369846182708748e-08, + "loss": 0.4603, + "step": 3633 + }, + { + "epoch": 2.914194065757819, + "grad_norm": 0.9069940447807312, + "learning_rate": 4.289355382529059e-08, + "loss": 0.4593, + "step": 3634 + }, + { + "epoch": 2.9149959903769047, + "grad_norm": 0.9094383716583252, + "learning_rate": 4.2096112001006604e-08, + "loss": 0.4879, + "step": 3635 + }, + { + "epoch": 2.9157979149959905, + "grad_norm": 0.8786039352416992, + "learning_rate": 4.1306136952187214e-08, + "loss": 0.4808, + "step": 3636 + }, + { + "epoch": 2.9165998396150763, + "grad_norm": 0.9403489828109741, + "learning_rate": 4.052362927118303e-08, + "loss": 0.4875, + "step": 3637 + }, + { + "epoch": 2.917401764234162, + "grad_norm": 0.7999061346054077, + "learning_rate": 3.974858954474248e-08, + "loss": 0.45, + "step": 3638 + }, + { + "epoch": 2.918203688853248, + "grad_norm": 0.8469072580337524, + "learning_rate": 3.898101835401846e-08, + "loss": 0.4521, + "step": 3639 + }, + { + "epoch": 2.9190056134723337, + "grad_norm": 0.8923320770263672, + "learning_rate": 3.82209162745617e-08, + "loss": 0.48, + "step": 3640 + }, + { + "epoch": 2.9198075380914195, + "grad_norm": 0.8438341021537781, + "learning_rate": 3.746828387632184e-08, + "loss": 0.4469, + "step": 3641 + }, + { + "epoch": 2.9206094627105053, + "grad_norm": 0.8810524940490723, + "learning_rate": 3.672312172365078e-08, + "loss": 0.4609, + "step": 3642 + }, + { + "epoch": 2.921411387329591, + "grad_norm": 0.902431309223175, + "learning_rate": 3.598543037529378e-08, + "loss": 0.4814, + "step": 3643 + }, + { + "epoch": 2.922213311948677, + "grad_norm": 0.8674122095108032, + "learning_rate": 3.525521038439728e-08, + "loss": 0.465, + "step": 3644 + }, + { + "epoch": 2.9230152365677626, + "grad_norm": 0.8674834370613098, + "learning_rate": 3.4532462298506596e-08, + "loss": 0.4554, + "step": 3645 + }, + { + "epoch": 2.9238171611868484, + "grad_norm": 0.8510974049568176, + "learning_rate": 3.3817186659560466e-08, + "loss": 0.4579, + "step": 3646 + }, + { + "epoch": 2.924619085805934, + "grad_norm": 0.8944957852363586, + "learning_rate": 3.3109384003899844e-08, + "loss": 0.4764, + "step": 3647 + }, + { + "epoch": 2.92542101042502, + "grad_norm": 0.8609873056411743, + "learning_rate": 3.2409054862256875e-08, + "loss": 0.4763, + "step": 3648 + }, + { + "epoch": 2.926222935044106, + "grad_norm": 0.8213399648666382, + "learning_rate": 3.17161997597637e-08, + "loss": 0.4574, + "step": 3649 + }, + { + "epoch": 2.9270248596631916, + "grad_norm": 0.8279301524162292, + "learning_rate": 3.103081921594586e-08, + "loss": 0.467, + "step": 3650 + }, + { + "epoch": 2.9278267842822774, + "grad_norm": 0.8031629323959351, + "learning_rate": 3.03529137447256e-08, + "loss": 0.4496, + "step": 3651 + }, + { + "epoch": 2.928628708901363, + "grad_norm": 0.8145686984062195, + "learning_rate": 2.968248385441852e-08, + "loss": 0.4587, + "step": 3652 + }, + { + "epoch": 2.929430633520449, + "grad_norm": 0.8566755652427673, + "learning_rate": 2.9019530047736944e-08, + "loss": 0.4765, + "step": 3653 + }, + { + "epoch": 2.9302325581395348, + "grad_norm": 0.8937522768974304, + "learning_rate": 2.836405282178656e-08, + "loss": 0.4975, + "step": 3654 + }, + { + "epoch": 2.9310344827586206, + "grad_norm": 0.8584054708480835, + "learning_rate": 2.7716052668064208e-08, + "loss": 0.4558, + "step": 3655 + }, + { + "epoch": 2.9318364073777063, + "grad_norm": 0.8949685096740723, + "learning_rate": 2.707553007246455e-08, + "loss": 0.491, + "step": 3656 + }, + { + "epoch": 2.932638331996792, + "grad_norm": 0.8435682654380798, + "learning_rate": 2.6442485515273397e-08, + "loss": 0.4682, + "step": 3657 + }, + { + "epoch": 2.933440256615878, + "grad_norm": 0.8290677070617676, + "learning_rate": 2.581691947116771e-08, + "loss": 0.4771, + "step": 3658 + }, + { + "epoch": 2.934242181234964, + "grad_norm": 0.8999835252761841, + "learning_rate": 2.5198832409218944e-08, + "loss": 0.4779, + "step": 3659 + }, + { + "epoch": 2.9350441058540495, + "grad_norm": 0.8274721503257751, + "learning_rate": 2.458822479288969e-08, + "loss": 0.4586, + "step": 3660 + }, + { + "epoch": 2.9358460304731357, + "grad_norm": 0.8735244274139404, + "learning_rate": 2.3985097080033715e-08, + "loss": 0.4738, + "step": 3661 + }, + { + "epoch": 2.936647955092221, + "grad_norm": 0.8940444588661194, + "learning_rate": 2.3389449722898137e-08, + "loss": 0.4773, + "step": 3662 + }, + { + "epoch": 2.9374498797113073, + "grad_norm": 0.9149758219718933, + "learning_rate": 2.2801283168119028e-08, + "loss": 0.4798, + "step": 3663 + }, + { + "epoch": 2.9382518043303927, + "grad_norm": 0.8782206773757935, + "learning_rate": 2.222059785672359e-08, + "loss": 0.4672, + "step": 3664 + }, + { + "epoch": 2.939053728949479, + "grad_norm": 0.8763145804405212, + "learning_rate": 2.1647394224129092e-08, + "loss": 0.483, + "step": 3665 + }, + { + "epoch": 2.9398556535685647, + "grad_norm": 0.8692938089370728, + "learning_rate": 2.108167270014394e-08, + "loss": 0.4818, + "step": 3666 + }, + { + "epoch": 2.9406575781876505, + "grad_norm": 0.8743387460708618, + "learning_rate": 2.052343370896437e-08, + "loss": 0.4554, + "step": 3667 + }, + { + "epoch": 2.9414595028067363, + "grad_norm": 0.8664737343788147, + "learning_rate": 1.9972677669177766e-08, + "loss": 0.4854, + "step": 3668 + }, + { + "epoch": 2.942261427425822, + "grad_norm": 0.8775107860565186, + "learning_rate": 1.942940499376045e-08, + "loss": 0.4664, + "step": 3669 + }, + { + "epoch": 2.943063352044908, + "grad_norm": 0.8614839315414429, + "learning_rate": 1.889361609007434e-08, + "loss": 0.4388, + "step": 3670 + }, + { + "epoch": 2.9438652766639937, + "grad_norm": 0.9102433323860168, + "learning_rate": 1.836531135987474e-08, + "loss": 0.4858, + "step": 3671 + }, + { + "epoch": 2.9446672012830795, + "grad_norm": 0.8594423532485962, + "learning_rate": 1.7844491199301428e-08, + "loss": 0.4674, + "step": 3672 + }, + { + "epoch": 2.9454691259021653, + "grad_norm": 0.9068805575370789, + "learning_rate": 1.733115599888202e-08, + "loss": 0.4674, + "step": 3673 + }, + { + "epoch": 2.946271050521251, + "grad_norm": 0.9371825456619263, + "learning_rate": 1.682530614353528e-08, + "loss": 0.5031, + "step": 3674 + }, + { + "epoch": 2.947072975140337, + "grad_norm": 0.9196876287460327, + "learning_rate": 1.6326942012562242e-08, + "loss": 0.4942, + "step": 3675 + }, + { + "epoch": 2.9478748997594226, + "grad_norm": 0.7983100414276123, + "learning_rate": 1.5836063979656202e-08, + "loss": 0.4588, + "step": 3676 + }, + { + "epoch": 2.9486768243785084, + "grad_norm": 0.8527657389640808, + "learning_rate": 1.535267241289051e-08, + "loss": 0.4615, + "step": 3677 + }, + { + "epoch": 2.949478748997594, + "grad_norm": 0.8166870474815369, + "learning_rate": 1.4876767674730786e-08, + "loss": 0.4497, + "step": 3678 + }, + { + "epoch": 2.95028067361668, + "grad_norm": 0.9895662665367126, + "learning_rate": 1.4408350122027126e-08, + "loss": 0.4559, + "step": 3679 + }, + { + "epoch": 2.951082598235766, + "grad_norm": 0.8738000988960266, + "learning_rate": 1.3947420106013021e-08, + "loss": 0.4756, + "step": 3680 + }, + { + "epoch": 2.9518845228548516, + "grad_norm": 0.8730235695838928, + "learning_rate": 1.3493977972312e-08, + "loss": 0.4423, + "step": 3681 + }, + { + "epoch": 2.9526864474739374, + "grad_norm": 0.9113731980323792, + "learning_rate": 1.3048024060928754e-08, + "loss": 0.4899, + "step": 3682 + }, + { + "epoch": 2.953488372093023, + "grad_norm": 0.8570337891578674, + "learning_rate": 1.2609558706253578e-08, + "loss": 0.4661, + "step": 3683 + }, + { + "epoch": 2.954290296712109, + "grad_norm": 0.8690351843833923, + "learning_rate": 1.2178582237065695e-08, + "loss": 0.4826, + "step": 3684 + }, + { + "epoch": 2.9550922213311948, + "grad_norm": 0.8450669050216675, + "learning_rate": 1.1755094976523273e-08, + "loss": 0.4757, + "step": 3685 + }, + { + "epoch": 2.9558941459502805, + "grad_norm": 0.8436766266822815, + "learning_rate": 1.1339097242173414e-08, + "loss": 0.449, + "step": 3686 + }, + { + "epoch": 2.9566960705693663, + "grad_norm": 0.9269471168518066, + "learning_rate": 1.0930589345944376e-08, + "loss": 0.4733, + "step": 3687 + }, + { + "epoch": 2.957497995188452, + "grad_norm": 0.8610197305679321, + "learning_rate": 1.0529571594150023e-08, + "loss": 0.4396, + "step": 3688 + }, + { + "epoch": 2.958299919807538, + "grad_norm": 0.879664957523346, + "learning_rate": 1.013604428748538e-08, + "loss": 0.4536, + "step": 3689 + }, + { + "epoch": 2.959101844426624, + "grad_norm": 0.9061784148216248, + "learning_rate": 9.750007721032184e-09, + "loss": 0.489, + "step": 3690 + }, + { + "epoch": 2.9599037690457095, + "grad_norm": 0.8848870992660522, + "learning_rate": 9.371462184254443e-09, + "loss": 0.4883, + "step": 3691 + }, + { + "epoch": 2.9607056936647957, + "grad_norm": 0.8711914420127869, + "learning_rate": 9.000407960996216e-09, + "loss": 0.4915, + "step": 3692 + }, + { + "epoch": 2.961507618283881, + "grad_norm": 0.8504834175109863, + "learning_rate": 8.636845329488274e-09, + "loss": 0.4564, + "step": 3693 + }, + { + "epoch": 2.9623095429029673, + "grad_norm": 0.8460233807563782, + "learning_rate": 8.280774562342552e-09, + "loss": 0.4749, + "step": 3694 + }, + { + "epoch": 2.9631114675220527, + "grad_norm": 0.9107170104980469, + "learning_rate": 7.932195926552144e-09, + "loss": 0.4831, + "step": 3695 + }, + { + "epoch": 2.963913392141139, + "grad_norm": 0.9115839600563049, + "learning_rate": 7.591109683492415e-09, + "loss": 0.5077, + "step": 3696 + }, + { + "epoch": 2.9647153167602243, + "grad_norm": 0.849886953830719, + "learning_rate": 7.257516088923222e-09, + "loss": 0.4558, + "step": 3697 + }, + { + "epoch": 2.9655172413793105, + "grad_norm": 0.8572622537612915, + "learning_rate": 6.9314153929833646e-09, + "loss": 0.4509, + "step": 3698 + }, + { + "epoch": 2.9663191659983963, + "grad_norm": 0.8848903179168701, + "learning_rate": 6.612807840195024e-09, + "loss": 0.4814, + "step": 3699 + }, + { + "epoch": 2.967121090617482, + "grad_norm": 0.811222493648529, + "learning_rate": 6.301693669459319e-09, + "loss": 0.4582, + "step": 3700 + }, + { + "epoch": 2.967923015236568, + "grad_norm": 0.8343945741653442, + "learning_rate": 5.998073114062975e-09, + "loss": 0.4534, + "step": 3701 + }, + { + "epoch": 2.9687249398556537, + "grad_norm": 0.8955221176147461, + "learning_rate": 5.701946401668324e-09, + "loss": 0.4938, + "step": 3702 + }, + { + "epoch": 2.9695268644747395, + "grad_norm": 0.9045392274856567, + "learning_rate": 5.413313754322192e-09, + "loss": 0.4815, + "step": 3703 + }, + { + "epoch": 2.9703287890938253, + "grad_norm": 0.8312162756919861, + "learning_rate": 5.132175388452565e-09, + "loss": 0.4714, + "step": 3704 + }, + { + "epoch": 2.971130713712911, + "grad_norm": 0.8867566585540771, + "learning_rate": 4.858531514864151e-09, + "loss": 0.4735, + "step": 3705 + }, + { + "epoch": 2.971932638331997, + "grad_norm": 0.9006369709968567, + "learning_rate": 4.592382338746148e-09, + "loss": 0.4656, + "step": 3706 + }, + { + "epoch": 2.9727345629510826, + "grad_norm": 0.8595499396324158, + "learning_rate": 4.3337280596655876e-09, + "loss": 0.4582, + "step": 3707 + }, + { + "epoch": 2.9735364875701684, + "grad_norm": 0.927030622959137, + "learning_rate": 4.082568871570658e-09, + "loss": 0.4734, + "step": 3708 + }, + { + "epoch": 2.974338412189254, + "grad_norm": 0.8537117838859558, + "learning_rate": 3.838904962788492e-09, + "loss": 0.4759, + "step": 3709 + }, + { + "epoch": 2.97514033680834, + "grad_norm": 0.8765459656715393, + "learning_rate": 3.602736516027383e-09, + "loss": 0.4711, + "step": 3710 + }, + { + "epoch": 2.975942261427426, + "grad_norm": 0.8582831025123596, + "learning_rate": 3.374063708373454e-09, + "loss": 0.4654, + "step": 3711 + }, + { + "epoch": 2.9767441860465116, + "grad_norm": 0.9093928337097168, + "learning_rate": 3.15288671129399e-09, + "loss": 0.4866, + "step": 3712 + }, + { + "epoch": 2.9775461106655974, + "grad_norm": 0.9072393178939819, + "learning_rate": 2.9392056906352162e-09, + "loss": 0.4787, + "step": 3713 + }, + { + "epoch": 2.978348035284683, + "grad_norm": 0.8780279159545898, + "learning_rate": 2.7330208066222996e-09, + "loss": 0.4809, + "step": 3714 + }, + { + "epoch": 2.979149959903769, + "grad_norm": 0.8489347100257874, + "learning_rate": 2.5343322138593472e-09, + "loss": 0.4603, + "step": 3715 + }, + { + "epoch": 2.9799518845228548, + "grad_norm": 0.9047530889511108, + "learning_rate": 2.3431400613305176e-09, + "loss": 0.5009, + "step": 3716 + }, + { + "epoch": 2.9807538091419405, + "grad_norm": 0.8335652947425842, + "learning_rate": 2.1594444923978e-09, + "loss": 0.454, + "step": 3717 + }, + { + "epoch": 2.9815557337610263, + "grad_norm": 0.8720241189002991, + "learning_rate": 1.983245644802123e-09, + "loss": 0.471, + "step": 3718 + }, + { + "epoch": 2.982357658380112, + "grad_norm": 0.8841453790664673, + "learning_rate": 1.8145436506633585e-09, + "loss": 0.4726, + "step": 3719 + }, + { + "epoch": 2.983159582999198, + "grad_norm": 0.8820040822029114, + "learning_rate": 1.6533386364814274e-09, + "loss": 0.4732, + "step": 3720 + }, + { + "epoch": 2.9839615076182837, + "grad_norm": 0.8319164514541626, + "learning_rate": 1.4996307231307517e-09, + "loss": 0.4638, + "step": 3721 + }, + { + "epoch": 2.9847634322373695, + "grad_norm": 0.8456513285636902, + "learning_rate": 1.3534200258691343e-09, + "loss": 0.4903, + "step": 3722 + }, + { + "epoch": 2.9855653568564557, + "grad_norm": 0.8635490536689758, + "learning_rate": 1.2147066543288787e-09, + "loss": 0.4643, + "step": 3723 + }, + { + "epoch": 2.986367281475541, + "grad_norm": 0.8873840570449829, + "learning_rate": 1.0834907125223392e-09, + "loss": 0.4919, + "step": 3724 + }, + { + "epoch": 2.9871692060946273, + "grad_norm": 0.8440971970558167, + "learning_rate": 9.59772298840811e-10, + "loss": 0.4576, + "step": 3725 + }, + { + "epoch": 2.9879711307137127, + "grad_norm": 0.8713566064834595, + "learning_rate": 8.435515060500888e-10, + "loss": 0.4615, + "step": 3726 + }, + { + "epoch": 2.988773055332799, + "grad_norm": 0.877994954586029, + "learning_rate": 7.348284212993495e-10, + "loss": 0.4763, + "step": 3727 + }, + { + "epoch": 2.9895749799518843, + "grad_norm": 0.8891599178314209, + "learning_rate": 6.336031261111597e-10, + "loss": 0.4877, + "step": 3728 + }, + { + "epoch": 2.9903769045709705, + "grad_norm": 0.8762668967247009, + "learning_rate": 5.398756963881368e-10, + "loss": 0.4856, + "step": 3729 + }, + { + "epoch": 2.9911788291900563, + "grad_norm": 0.8617690205574036, + "learning_rate": 4.5364620240961885e-10, + "loss": 0.4847, + "step": 3730 + }, + { + "epoch": 2.991980753809142, + "grad_norm": 0.8446850180625916, + "learning_rate": 3.749147088349947e-10, + "loss": 0.4549, + "step": 3731 + }, + { + "epoch": 2.992782678428228, + "grad_norm": 0.9086197018623352, + "learning_rate": 3.0368127469815324e-10, + "loss": 0.4741, + "step": 3732 + }, + { + "epoch": 2.9935846030473137, + "grad_norm": 0.9534429907798767, + "learning_rate": 2.399459534130344e-10, + "loss": 0.4584, + "step": 3733 + }, + { + "epoch": 2.9943865276663995, + "grad_norm": 0.8938235640525818, + "learning_rate": 1.8370879277140874e-10, + "loss": 0.4728, + "step": 3734 + }, + { + "epoch": 2.9951884522854852, + "grad_norm": 0.8239015936851501, + "learning_rate": 1.3496983493954673e-10, + "loss": 0.4659, + "step": 3735 + }, + { + "epoch": 2.995990376904571, + "grad_norm": 0.8396766781806946, + "learning_rate": 9.372911646599037e-11, + "loss": 0.4637, + "step": 3736 + }, + { + "epoch": 2.996792301523657, + "grad_norm": 0.8573855757713318, + "learning_rate": 5.998666827378153e-11, + "loss": 0.4648, + "step": 3737 + }, + { + "epoch": 2.9975942261427426, + "grad_norm": 0.8772425055503845, + "learning_rate": 3.3742515662682496e-11, + "loss": 0.4768, + "step": 3738 + }, + { + "epoch": 2.9983961507618284, + "grad_norm": 0.9464996457099915, + "learning_rate": 1.4996678313616842e-11, + "loss": 0.4722, + "step": 3739 + }, + { + "epoch": 2.999198075380914, + "grad_norm": 0.9083192944526672, + "learning_rate": 3.749170280897829e-12, + "loss": 0.4938, + "step": 3740 + }, + { + "epoch": 3.0, + "grad_norm": 0.7624624967575073, + "learning_rate": 0.0, + "loss": 0.3898, + "step": 3741 + }, + { + "epoch": 3.0, + "step": 3741, + "total_flos": 7.432135506978144e+18, + "train_loss": 0.7324594869090414, + "train_runtime": 79144.443, + "train_samples_per_second": 18.139, + "train_steps_per_second": 0.047 + } + ], + "logging_steps": 1.0, + "max_steps": 3741, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 32860.0, + "total_flos": 7.432135506978144e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}