diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,64757 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.998197873490719, + "eval_steps": 500, + "global_step": 9245, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005406379527842854, + "grad_norm": 5.852370738983154, + "learning_rate": 1.0810810810810811e-08, + "loss": 0.8755, + "step": 1 + }, + { + "epoch": 0.001081275905568571, + "grad_norm": 5.950657367706299, + "learning_rate": 2.1621621621621623e-08, + "loss": 0.8924, + "step": 2 + }, + { + "epoch": 0.0016219138583528565, + "grad_norm": 6.0590081214904785, + "learning_rate": 3.2432432432432436e-08, + "loss": 0.8789, + "step": 3 + }, + { + "epoch": 0.002162551811137142, + "grad_norm": 5.988236427307129, + "learning_rate": 4.3243243243243246e-08, + "loss": 0.8708, + "step": 4 + }, + { + "epoch": 0.0027031897639214274, + "grad_norm": 5.799636363983154, + "learning_rate": 5.405405405405406e-08, + "loss": 0.8629, + "step": 5 + }, + { + "epoch": 0.003243827716705713, + "grad_norm": 6.322631359100342, + "learning_rate": 6.486486486486487e-08, + "loss": 0.9277, + "step": 6 + }, + { + "epoch": 0.003784465669489998, + "grad_norm": 6.104108810424805, + "learning_rate": 7.567567567567568e-08, + "loss": 0.9011, + "step": 7 + }, + { + "epoch": 0.004325103622274284, + "grad_norm": 6.1530280113220215, + "learning_rate": 8.648648648648649e-08, + "loss": 0.89, + "step": 8 + }, + { + "epoch": 0.004865741575058569, + "grad_norm": 5.7841596603393555, + "learning_rate": 9.72972972972973e-08, + "loss": 0.8426, + "step": 9 + }, + { + "epoch": 0.005406379527842855, + "grad_norm": 5.69771671295166, + "learning_rate": 1.0810810810810812e-07, + "loss": 0.846, + "step": 10 + }, + { + "epoch": 0.00594701748062714, + "grad_norm": 5.842521667480469, + "learning_rate": 1.1891891891891893e-07, + "loss": 0.8756, + "step": 11 + }, + { + "epoch": 0.006487655433411426, + "grad_norm": 5.661792278289795, + "learning_rate": 1.2972972972972974e-07, + "loss": 0.8472, + "step": 12 + }, + { + "epoch": 0.0070282933861957105, + "grad_norm": 5.664099216461182, + "learning_rate": 1.4054054054054055e-07, + "loss": 0.8736, + "step": 13 + }, + { + "epoch": 0.007568931338979996, + "grad_norm": 5.9114603996276855, + "learning_rate": 1.5135135135135135e-07, + "loss": 0.8842, + "step": 14 + }, + { + "epoch": 0.008109569291764282, + "grad_norm": 5.857370853424072, + "learning_rate": 1.6216216216216218e-07, + "loss": 0.8719, + "step": 15 + }, + { + "epoch": 0.008650207244548567, + "grad_norm": 5.932351112365723, + "learning_rate": 1.7297297297297298e-07, + "loss": 0.9108, + "step": 16 + }, + { + "epoch": 0.009190845197332853, + "grad_norm": 5.903899669647217, + "learning_rate": 1.8378378378378379e-07, + "loss": 0.8717, + "step": 17 + }, + { + "epoch": 0.009731483150117138, + "grad_norm": 5.735729694366455, + "learning_rate": 1.945945945945946e-07, + "loss": 0.8519, + "step": 18 + }, + { + "epoch": 0.010272121102901424, + "grad_norm": 5.9615159034729, + "learning_rate": 2.0540540540540542e-07, + "loss": 0.8696, + "step": 19 + }, + { + "epoch": 0.01081275905568571, + "grad_norm": 5.628999710083008, + "learning_rate": 2.1621621621621625e-07, + "loss": 0.8514, + "step": 20 + }, + { + "epoch": 0.011353397008469995, + "grad_norm": 5.775051593780518, + "learning_rate": 2.2702702702702705e-07, + "loss": 0.8545, + "step": 21 + }, + { + "epoch": 0.01189403496125428, + "grad_norm": 5.561074733734131, + "learning_rate": 2.3783783783783785e-07, + "loss": 0.8694, + "step": 22 + }, + { + "epoch": 0.012434672914038566, + "grad_norm": 5.445178985595703, + "learning_rate": 2.486486486486487e-07, + "loss": 0.8473, + "step": 23 + }, + { + "epoch": 0.012975310866822852, + "grad_norm": 5.540316581726074, + "learning_rate": 2.594594594594595e-07, + "loss": 0.8695, + "step": 24 + }, + { + "epoch": 0.013515948819607137, + "grad_norm": 5.309917449951172, + "learning_rate": 2.702702702702703e-07, + "loss": 0.8353, + "step": 25 + }, + { + "epoch": 0.014056586772391421, + "grad_norm": 5.3465189933776855, + "learning_rate": 2.810810810810811e-07, + "loss": 0.8436, + "step": 26 + }, + { + "epoch": 0.014597224725175707, + "grad_norm": 5.423165798187256, + "learning_rate": 2.918918918918919e-07, + "loss": 0.8549, + "step": 27 + }, + { + "epoch": 0.015137862677959992, + "grad_norm": 5.301184177398682, + "learning_rate": 3.027027027027027e-07, + "loss": 0.8415, + "step": 28 + }, + { + "epoch": 0.015678500630744278, + "grad_norm": 4.40065860748291, + "learning_rate": 3.135135135135135e-07, + "loss": 0.8191, + "step": 29 + }, + { + "epoch": 0.016219138583528563, + "grad_norm": 4.320012092590332, + "learning_rate": 3.2432432432432436e-07, + "loss": 0.8082, + "step": 30 + }, + { + "epoch": 0.01675977653631285, + "grad_norm": 4.359866619110107, + "learning_rate": 3.3513513513513516e-07, + "loss": 0.84, + "step": 31 + }, + { + "epoch": 0.017300414489097134, + "grad_norm": 4.388699531555176, + "learning_rate": 3.4594594594594597e-07, + "loss": 0.8323, + "step": 32 + }, + { + "epoch": 0.01784105244188142, + "grad_norm": 4.4560747146606445, + "learning_rate": 3.567567567567568e-07, + "loss": 0.8409, + "step": 33 + }, + { + "epoch": 0.018381690394665706, + "grad_norm": 4.259395122528076, + "learning_rate": 3.6756756756756757e-07, + "loss": 0.7983, + "step": 34 + }, + { + "epoch": 0.01892232834744999, + "grad_norm": 4.209655284881592, + "learning_rate": 3.7837837837837843e-07, + "loss": 0.8256, + "step": 35 + }, + { + "epoch": 0.019462966300234277, + "grad_norm": 3.996368885040283, + "learning_rate": 3.891891891891892e-07, + "loss": 0.8241, + "step": 36 + }, + { + "epoch": 0.020003604253018562, + "grad_norm": 4.126471042633057, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.8436, + "step": 37 + }, + { + "epoch": 0.020544242205802848, + "grad_norm": 3.9851861000061035, + "learning_rate": 4.1081081081081084e-07, + "loss": 0.8082, + "step": 38 + }, + { + "epoch": 0.021084880158587133, + "grad_norm": 3.124469518661499, + "learning_rate": 4.2162162162162164e-07, + "loss": 0.8038, + "step": 39 + }, + { + "epoch": 0.02162551811137142, + "grad_norm": 2.3070788383483887, + "learning_rate": 4.324324324324325e-07, + "loss": 0.7691, + "step": 40 + }, + { + "epoch": 0.022166156064155704, + "grad_norm": 2.2585034370422363, + "learning_rate": 4.4324324324324325e-07, + "loss": 0.7469, + "step": 41 + }, + { + "epoch": 0.02270679401693999, + "grad_norm": 2.3153300285339355, + "learning_rate": 4.540540540540541e-07, + "loss": 0.7967, + "step": 42 + }, + { + "epoch": 0.023247431969724276, + "grad_norm": 2.2460641860961914, + "learning_rate": 4.6486486486486485e-07, + "loss": 0.7945, + "step": 43 + }, + { + "epoch": 0.02378806992250856, + "grad_norm": 2.041891574859619, + "learning_rate": 4.756756756756757e-07, + "loss": 0.7405, + "step": 44 + }, + { + "epoch": 0.024328707875292847, + "grad_norm": 2.197927474975586, + "learning_rate": 4.864864864864865e-07, + "loss": 0.7911, + "step": 45 + }, + { + "epoch": 0.024869345828077132, + "grad_norm": 2.032799243927002, + "learning_rate": 4.972972972972974e-07, + "loss": 0.7952, + "step": 46 + }, + { + "epoch": 0.025409983780861418, + "grad_norm": 1.8903582096099854, + "learning_rate": 5.081081081081081e-07, + "loss": 0.757, + "step": 47 + }, + { + "epoch": 0.025950621733645703, + "grad_norm": 1.9750922918319702, + "learning_rate": 5.18918918918919e-07, + "loss": 0.754, + "step": 48 + }, + { + "epoch": 0.02649125968642999, + "grad_norm": 1.8000760078430176, + "learning_rate": 5.297297297297297e-07, + "loss": 0.7691, + "step": 49 + }, + { + "epoch": 0.027031897639214274, + "grad_norm": 1.7124290466308594, + "learning_rate": 5.405405405405406e-07, + "loss": 0.7252, + "step": 50 + }, + { + "epoch": 0.02757253559199856, + "grad_norm": 1.6576565504074097, + "learning_rate": 5.513513513513514e-07, + "loss": 0.7495, + "step": 51 + }, + { + "epoch": 0.028113173544782842, + "grad_norm": 1.3588616847991943, + "learning_rate": 5.621621621621622e-07, + "loss": 0.7624, + "step": 52 + }, + { + "epoch": 0.028653811497567128, + "grad_norm": 1.3072082996368408, + "learning_rate": 5.72972972972973e-07, + "loss": 0.752, + "step": 53 + }, + { + "epoch": 0.029194449450351413, + "grad_norm": 1.344785213470459, + "learning_rate": 5.837837837837838e-07, + "loss": 0.7367, + "step": 54 + }, + { + "epoch": 0.0297350874031357, + "grad_norm": 1.5406584739685059, + "learning_rate": 5.945945945945947e-07, + "loss": 0.7839, + "step": 55 + }, + { + "epoch": 0.030275725355919984, + "grad_norm": 1.5574135780334473, + "learning_rate": 6.054054054054054e-07, + "loss": 0.7108, + "step": 56 + }, + { + "epoch": 0.03081636330870427, + "grad_norm": 1.608785629272461, + "learning_rate": 6.162162162162163e-07, + "loss": 0.7083, + "step": 57 + }, + { + "epoch": 0.031357001261488555, + "grad_norm": 1.6619904041290283, + "learning_rate": 6.27027027027027e-07, + "loss": 0.7478, + "step": 58 + }, + { + "epoch": 0.03189763921427284, + "grad_norm": 1.5372380018234253, + "learning_rate": 6.378378378378379e-07, + "loss": 0.719, + "step": 59 + }, + { + "epoch": 0.03243827716705713, + "grad_norm": 1.4971483945846558, + "learning_rate": 6.486486486486487e-07, + "loss": 0.7139, + "step": 60 + }, + { + "epoch": 0.03297891511984141, + "grad_norm": 1.4338963031768799, + "learning_rate": 6.594594594594596e-07, + "loss": 0.7308, + "step": 61 + }, + { + "epoch": 0.0335195530726257, + "grad_norm": 1.2289550304412842, + "learning_rate": 6.702702702702703e-07, + "loss": 0.739, + "step": 62 + }, + { + "epoch": 0.03406019102540998, + "grad_norm": 1.2169913053512573, + "learning_rate": 6.810810810810811e-07, + "loss": 0.7369, + "step": 63 + }, + { + "epoch": 0.03460082897819427, + "grad_norm": 0.9731535911560059, + "learning_rate": 6.918918918918919e-07, + "loss": 0.6949, + "step": 64 + }, + { + "epoch": 0.035141466930978554, + "grad_norm": 0.9446195960044861, + "learning_rate": 7.027027027027028e-07, + "loss": 0.7081, + "step": 65 + }, + { + "epoch": 0.03568210488376284, + "grad_norm": 0.9269970059394836, + "learning_rate": 7.135135135135136e-07, + "loss": 0.7229, + "step": 66 + }, + { + "epoch": 0.036222742836547125, + "grad_norm": 0.8842006325721741, + "learning_rate": 7.243243243243243e-07, + "loss": 0.696, + "step": 67 + }, + { + "epoch": 0.03676338078933141, + "grad_norm": 0.8420222997665405, + "learning_rate": 7.351351351351351e-07, + "loss": 0.7491, + "step": 68 + }, + { + "epoch": 0.0373040187421157, + "grad_norm": 0.8452447056770325, + "learning_rate": 7.45945945945946e-07, + "loss": 0.7079, + "step": 69 + }, + { + "epoch": 0.03784465669489998, + "grad_norm": 0.8561710119247437, + "learning_rate": 7.567567567567569e-07, + "loss": 0.6656, + "step": 70 + }, + { + "epoch": 0.03838529464768427, + "grad_norm": 0.8087544441223145, + "learning_rate": 7.675675675675676e-07, + "loss": 0.6841, + "step": 71 + }, + { + "epoch": 0.03892593260046855, + "grad_norm": 0.8077256679534912, + "learning_rate": 7.783783783783784e-07, + "loss": 0.665, + "step": 72 + }, + { + "epoch": 0.03946657055325284, + "grad_norm": 0.7797601819038391, + "learning_rate": 7.891891891891892e-07, + "loss": 0.7235, + "step": 73 + }, + { + "epoch": 0.040007208506037124, + "grad_norm": 0.6821624040603638, + "learning_rate": 8.000000000000001e-07, + "loss": 0.7056, + "step": 74 + }, + { + "epoch": 0.04054784645882141, + "grad_norm": 0.7008046507835388, + "learning_rate": 8.108108108108109e-07, + "loss": 0.6892, + "step": 75 + }, + { + "epoch": 0.041088484411605695, + "grad_norm": 0.6429518461227417, + "learning_rate": 8.216216216216217e-07, + "loss": 0.6758, + "step": 76 + }, + { + "epoch": 0.04162912236438998, + "grad_norm": 0.6321120858192444, + "learning_rate": 8.324324324324324e-07, + "loss": 0.6655, + "step": 77 + }, + { + "epoch": 0.04216976031717427, + "grad_norm": 0.6515984535217285, + "learning_rate": 8.432432432432433e-07, + "loss": 0.6735, + "step": 78 + }, + { + "epoch": 0.04271039826995855, + "grad_norm": 0.5830267667770386, + "learning_rate": 8.540540540540541e-07, + "loss": 0.6482, + "step": 79 + }, + { + "epoch": 0.04325103622274284, + "grad_norm": 0.5883980393409729, + "learning_rate": 8.64864864864865e-07, + "loss": 0.6438, + "step": 80 + }, + { + "epoch": 0.04379167417552712, + "grad_norm": 0.5688890814781189, + "learning_rate": 8.756756756756756e-07, + "loss": 0.6729, + "step": 81 + }, + { + "epoch": 0.04433231212831141, + "grad_norm": 0.6022357940673828, + "learning_rate": 8.864864864864865e-07, + "loss": 0.6681, + "step": 82 + }, + { + "epoch": 0.044872950081095694, + "grad_norm": 0.557185709476471, + "learning_rate": 8.972972972972974e-07, + "loss": 0.6756, + "step": 83 + }, + { + "epoch": 0.04541358803387998, + "grad_norm": 0.5133994221687317, + "learning_rate": 9.081081081081082e-07, + "loss": 0.6096, + "step": 84 + }, + { + "epoch": 0.045954225986664266, + "grad_norm": 0.5198540687561035, + "learning_rate": 9.189189189189191e-07, + "loss": 0.631, + "step": 85 + }, + { + "epoch": 0.04649486393944855, + "grad_norm": 0.49539971351623535, + "learning_rate": 9.297297297297297e-07, + "loss": 0.6403, + "step": 86 + }, + { + "epoch": 0.04703550189223284, + "grad_norm": 0.527891218662262, + "learning_rate": 9.405405405405406e-07, + "loss": 0.664, + "step": 87 + }, + { + "epoch": 0.04757613984501712, + "grad_norm": 0.5219648480415344, + "learning_rate": 9.513513513513514e-07, + "loss": 0.63, + "step": 88 + }, + { + "epoch": 0.04811677779780141, + "grad_norm": 0.5005459189414978, + "learning_rate": 9.621621621621622e-07, + "loss": 0.64, + "step": 89 + }, + { + "epoch": 0.04865741575058569, + "grad_norm": 0.5416708588600159, + "learning_rate": 9.72972972972973e-07, + "loss": 0.6376, + "step": 90 + }, + { + "epoch": 0.04919805370336998, + "grad_norm": 0.460190087556839, + "learning_rate": 9.837837837837839e-07, + "loss": 0.6326, + "step": 91 + }, + { + "epoch": 0.049738691656154264, + "grad_norm": 0.518156886100769, + "learning_rate": 9.945945945945947e-07, + "loss": 0.6809, + "step": 92 + }, + { + "epoch": 0.05027932960893855, + "grad_norm": 0.453222393989563, + "learning_rate": 1.0054054054054054e-06, + "loss": 0.6444, + "step": 93 + }, + { + "epoch": 0.050819967561722836, + "grad_norm": 0.4345966875553131, + "learning_rate": 1.0162162162162162e-06, + "loss": 0.6285, + "step": 94 + }, + { + "epoch": 0.05136060551450712, + "grad_norm": 0.46965357661247253, + "learning_rate": 1.027027027027027e-06, + "loss": 0.6763, + "step": 95 + }, + { + "epoch": 0.05190124346729141, + "grad_norm": 0.43054792284965515, + "learning_rate": 1.037837837837838e-06, + "loss": 0.6359, + "step": 96 + }, + { + "epoch": 0.05244188142007569, + "grad_norm": 0.4354804754257202, + "learning_rate": 1.0486486486486488e-06, + "loss": 0.623, + "step": 97 + }, + { + "epoch": 0.05298251937285998, + "grad_norm": 0.4513593912124634, + "learning_rate": 1.0594594594594595e-06, + "loss": 0.6113, + "step": 98 + }, + { + "epoch": 0.05352315732564426, + "grad_norm": 0.42707252502441406, + "learning_rate": 1.0702702702702703e-06, + "loss": 0.6193, + "step": 99 + }, + { + "epoch": 0.05406379527842855, + "grad_norm": 0.41745811700820923, + "learning_rate": 1.0810810810810812e-06, + "loss": 0.6259, + "step": 100 + }, + { + "epoch": 0.054604433231212834, + "grad_norm": 0.4500015676021576, + "learning_rate": 1.091891891891892e-06, + "loss": 0.6484, + "step": 101 + }, + { + "epoch": 0.05514507118399712, + "grad_norm": 0.4127950966358185, + "learning_rate": 1.1027027027027029e-06, + "loss": 0.6298, + "step": 102 + }, + { + "epoch": 0.0556857091367814, + "grad_norm": 0.48789313435554504, + "learning_rate": 1.1135135135135135e-06, + "loss": 0.6584, + "step": 103 + }, + { + "epoch": 0.056226347089565684, + "grad_norm": 0.4403907358646393, + "learning_rate": 1.1243243243243244e-06, + "loss": 0.6092, + "step": 104 + }, + { + "epoch": 0.05676698504234997, + "grad_norm": 0.4237998127937317, + "learning_rate": 1.1351351351351352e-06, + "loss": 0.6304, + "step": 105 + }, + { + "epoch": 0.057307622995134255, + "grad_norm": 0.411171019077301, + "learning_rate": 1.145945945945946e-06, + "loss": 0.5931, + "step": 106 + }, + { + "epoch": 0.05784826094791854, + "grad_norm": 0.4114699065685272, + "learning_rate": 1.1567567567567567e-06, + "loss": 0.6625, + "step": 107 + }, + { + "epoch": 0.058388898900702826, + "grad_norm": 0.4349936544895172, + "learning_rate": 1.1675675675675676e-06, + "loss": 0.6267, + "step": 108 + }, + { + "epoch": 0.05892953685348711, + "grad_norm": 0.390249103307724, + "learning_rate": 1.1783783783783784e-06, + "loss": 0.6373, + "step": 109 + }, + { + "epoch": 0.0594701748062714, + "grad_norm": 0.41917869448661804, + "learning_rate": 1.1891891891891893e-06, + "loss": 0.6195, + "step": 110 + }, + { + "epoch": 0.06001081275905568, + "grad_norm": 0.40572988986968994, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.5918, + "step": 111 + }, + { + "epoch": 0.06055145071183997, + "grad_norm": 0.41473883390426636, + "learning_rate": 1.2108108108108108e-06, + "loss": 0.616, + "step": 112 + }, + { + "epoch": 0.061092088664624254, + "grad_norm": 0.4362037181854248, + "learning_rate": 1.2216216216216217e-06, + "loss": 0.6047, + "step": 113 + }, + { + "epoch": 0.06163272661740854, + "grad_norm": 0.4196556508541107, + "learning_rate": 1.2324324324324325e-06, + "loss": 0.6072, + "step": 114 + }, + { + "epoch": 0.062173364570192825, + "grad_norm": 0.3973116874694824, + "learning_rate": 1.2432432432432434e-06, + "loss": 0.5933, + "step": 115 + }, + { + "epoch": 0.06271400252297711, + "grad_norm": 0.42316290736198425, + "learning_rate": 1.254054054054054e-06, + "loss": 0.619, + "step": 116 + }, + { + "epoch": 0.0632546404757614, + "grad_norm": 0.4168538451194763, + "learning_rate": 1.264864864864865e-06, + "loss": 0.5945, + "step": 117 + }, + { + "epoch": 0.06379527842854568, + "grad_norm": 0.4332961440086365, + "learning_rate": 1.2756756756756757e-06, + "loss": 0.6496, + "step": 118 + }, + { + "epoch": 0.06433591638132997, + "grad_norm": 0.40682727098464966, + "learning_rate": 1.2864864864864866e-06, + "loss": 0.6178, + "step": 119 + }, + { + "epoch": 0.06487655433411425, + "grad_norm": 0.4624468982219696, + "learning_rate": 1.2972972972972974e-06, + "loss": 0.632, + "step": 120 + }, + { + "epoch": 0.06541719228689855, + "grad_norm": 0.4604220986366272, + "learning_rate": 1.308108108108108e-06, + "loss": 0.5954, + "step": 121 + }, + { + "epoch": 0.06595783023968282, + "grad_norm": 0.3809317648410797, + "learning_rate": 1.3189189189189192e-06, + "loss": 0.5924, + "step": 122 + }, + { + "epoch": 0.06649846819246712, + "grad_norm": 0.4062090516090393, + "learning_rate": 1.3297297297297298e-06, + "loss": 0.6023, + "step": 123 + }, + { + "epoch": 0.0670391061452514, + "grad_norm": 0.37150242924690247, + "learning_rate": 1.3405405405405407e-06, + "loss": 0.6204, + "step": 124 + }, + { + "epoch": 0.06757974409803569, + "grad_norm": 0.40651530027389526, + "learning_rate": 1.3513513513513515e-06, + "loss": 0.6133, + "step": 125 + }, + { + "epoch": 0.06812038205081997, + "grad_norm": 0.41236740350723267, + "learning_rate": 1.3621621621621622e-06, + "loss": 0.5861, + "step": 126 + }, + { + "epoch": 0.06866102000360426, + "grad_norm": 0.35920029878616333, + "learning_rate": 1.3729729729729732e-06, + "loss": 0.5781, + "step": 127 + }, + { + "epoch": 0.06920165795638854, + "grad_norm": 0.44092392921447754, + "learning_rate": 1.3837837837837839e-06, + "loss": 0.6472, + "step": 128 + }, + { + "epoch": 0.06974229590917283, + "grad_norm": 0.4139423966407776, + "learning_rate": 1.3945945945945947e-06, + "loss": 0.6242, + "step": 129 + }, + { + "epoch": 0.07028293386195711, + "grad_norm": 0.3792780041694641, + "learning_rate": 1.4054054054054056e-06, + "loss": 0.6001, + "step": 130 + }, + { + "epoch": 0.0708235718147414, + "grad_norm": 0.41122201085090637, + "learning_rate": 1.4162162162162162e-06, + "loss": 0.583, + "step": 131 + }, + { + "epoch": 0.07136420976752568, + "grad_norm": 0.37738776206970215, + "learning_rate": 1.4270270270270273e-06, + "loss": 0.6071, + "step": 132 + }, + { + "epoch": 0.07190484772030997, + "grad_norm": 0.3700563609600067, + "learning_rate": 1.437837837837838e-06, + "loss": 0.6138, + "step": 133 + }, + { + "epoch": 0.07244548567309425, + "grad_norm": 0.4004262387752533, + "learning_rate": 1.4486486486486486e-06, + "loss": 0.6297, + "step": 134 + }, + { + "epoch": 0.07298612362587854, + "grad_norm": 0.35735851526260376, + "learning_rate": 1.4594594594594596e-06, + "loss": 0.6531, + "step": 135 + }, + { + "epoch": 0.07352676157866282, + "grad_norm": 0.40201085805892944, + "learning_rate": 1.4702702702702703e-06, + "loss": 0.5891, + "step": 136 + }, + { + "epoch": 0.07406739953144711, + "grad_norm": 0.42626097798347473, + "learning_rate": 1.4810810810810814e-06, + "loss": 0.6261, + "step": 137 + }, + { + "epoch": 0.0746080374842314, + "grad_norm": 0.39403700828552246, + "learning_rate": 1.491891891891892e-06, + "loss": 0.5996, + "step": 138 + }, + { + "epoch": 0.07514867543701567, + "grad_norm": 0.3774298429489136, + "learning_rate": 1.5027027027027026e-06, + "loss": 0.6039, + "step": 139 + }, + { + "epoch": 0.07568931338979996, + "grad_norm": 0.40128400921821594, + "learning_rate": 1.5135135135135137e-06, + "loss": 0.6185, + "step": 140 + }, + { + "epoch": 0.07622995134258424, + "grad_norm": 0.3873974680900574, + "learning_rate": 1.5243243243243244e-06, + "loss": 0.5823, + "step": 141 + }, + { + "epoch": 0.07677058929536854, + "grad_norm": 0.3734970688819885, + "learning_rate": 1.5351351351351352e-06, + "loss": 0.6091, + "step": 142 + }, + { + "epoch": 0.07731122724815281, + "grad_norm": 0.36722680926322937, + "learning_rate": 1.545945945945946e-06, + "loss": 0.5835, + "step": 143 + }, + { + "epoch": 0.0778518652009371, + "grad_norm": 0.4275578260421753, + "learning_rate": 1.5567567567567567e-06, + "loss": 0.582, + "step": 144 + }, + { + "epoch": 0.07839250315372139, + "grad_norm": 0.4208986163139343, + "learning_rate": 1.5675675675675678e-06, + "loss": 0.5632, + "step": 145 + }, + { + "epoch": 0.07893314110650568, + "grad_norm": 0.3910946547985077, + "learning_rate": 1.5783783783783784e-06, + "loss": 0.6165, + "step": 146 + }, + { + "epoch": 0.07947377905928996, + "grad_norm": 0.4066162407398224, + "learning_rate": 1.5891891891891893e-06, + "loss": 0.6138, + "step": 147 + }, + { + "epoch": 0.08001441701207425, + "grad_norm": 0.386677622795105, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.594, + "step": 148 + }, + { + "epoch": 0.08055505496485853, + "grad_norm": 0.41342198848724365, + "learning_rate": 1.6108108108108108e-06, + "loss": 0.5992, + "step": 149 + }, + { + "epoch": 0.08109569291764282, + "grad_norm": 0.3809179365634918, + "learning_rate": 1.6216216216216219e-06, + "loss": 0.5768, + "step": 150 + }, + { + "epoch": 0.0816363308704271, + "grad_norm": 0.39420464634895325, + "learning_rate": 1.6324324324324325e-06, + "loss": 0.5875, + "step": 151 + }, + { + "epoch": 0.08217696882321139, + "grad_norm": 0.37135252356529236, + "learning_rate": 1.6432432432432434e-06, + "loss": 0.6015, + "step": 152 + }, + { + "epoch": 0.08271760677599567, + "grad_norm": 0.39454683661460876, + "learning_rate": 1.6540540540540542e-06, + "loss": 0.5874, + "step": 153 + }, + { + "epoch": 0.08325824472877996, + "grad_norm": 0.3885583281517029, + "learning_rate": 1.6648648648648649e-06, + "loss": 0.5867, + "step": 154 + }, + { + "epoch": 0.08379888268156424, + "grad_norm": 0.37252458930015564, + "learning_rate": 1.675675675675676e-06, + "loss": 0.5975, + "step": 155 + }, + { + "epoch": 0.08433952063434853, + "grad_norm": 0.39173391461372375, + "learning_rate": 1.6864864864864866e-06, + "loss": 0.585, + "step": 156 + }, + { + "epoch": 0.08488015858713281, + "grad_norm": 0.4323269724845886, + "learning_rate": 1.6972972972972972e-06, + "loss": 0.5694, + "step": 157 + }, + { + "epoch": 0.0854207965399171, + "grad_norm": 0.3776825964450836, + "learning_rate": 1.7081081081081083e-06, + "loss": 0.5765, + "step": 158 + }, + { + "epoch": 0.08596143449270138, + "grad_norm": 0.3599972724914551, + "learning_rate": 1.718918918918919e-06, + "loss": 0.5845, + "step": 159 + }, + { + "epoch": 0.08650207244548568, + "grad_norm": 0.37649205327033997, + "learning_rate": 1.72972972972973e-06, + "loss": 0.5836, + "step": 160 + }, + { + "epoch": 0.08704271039826995, + "grad_norm": 0.3656623065471649, + "learning_rate": 1.7405405405405406e-06, + "loss": 0.5357, + "step": 161 + }, + { + "epoch": 0.08758334835105425, + "grad_norm": 0.37666720151901245, + "learning_rate": 1.7513513513513513e-06, + "loss": 0.5441, + "step": 162 + }, + { + "epoch": 0.08812398630383853, + "grad_norm": 0.3532339036464691, + "learning_rate": 1.7621621621621623e-06, + "loss": 0.5623, + "step": 163 + }, + { + "epoch": 0.08866462425662282, + "grad_norm": 0.3519897758960724, + "learning_rate": 1.772972972972973e-06, + "loss": 0.5903, + "step": 164 + }, + { + "epoch": 0.0892052622094071, + "grad_norm": 0.40431109070777893, + "learning_rate": 1.783783783783784e-06, + "loss": 0.6142, + "step": 165 + }, + { + "epoch": 0.08974590016219139, + "grad_norm": 0.4137735366821289, + "learning_rate": 1.7945945945945947e-06, + "loss": 0.5966, + "step": 166 + }, + { + "epoch": 0.09028653811497567, + "grad_norm": 0.3849121332168579, + "learning_rate": 1.8054054054054053e-06, + "loss": 0.5876, + "step": 167 + }, + { + "epoch": 0.09082717606775996, + "grad_norm": 0.37211450934410095, + "learning_rate": 1.8162162162162164e-06, + "loss": 0.5441, + "step": 168 + }, + { + "epoch": 0.09136781402054424, + "grad_norm": 0.35666146874427795, + "learning_rate": 1.827027027027027e-06, + "loss": 0.5828, + "step": 169 + }, + { + "epoch": 0.09190845197332853, + "grad_norm": 0.3767271637916565, + "learning_rate": 1.8378378378378381e-06, + "loss": 0.5968, + "step": 170 + }, + { + "epoch": 0.09244908992611281, + "grad_norm": 0.3989579379558563, + "learning_rate": 1.8486486486486488e-06, + "loss": 0.5844, + "step": 171 + }, + { + "epoch": 0.0929897278788971, + "grad_norm": 0.364155113697052, + "learning_rate": 1.8594594594594594e-06, + "loss": 0.5798, + "step": 172 + }, + { + "epoch": 0.09353036583168138, + "grad_norm": 0.38305285573005676, + "learning_rate": 1.8702702702702705e-06, + "loss": 0.5977, + "step": 173 + }, + { + "epoch": 0.09407100378446567, + "grad_norm": 0.44144946336746216, + "learning_rate": 1.8810810810810811e-06, + "loss": 0.5732, + "step": 174 + }, + { + "epoch": 0.09461164173724995, + "grad_norm": 0.35379329323768616, + "learning_rate": 1.8918918918918922e-06, + "loss": 0.5859, + "step": 175 + }, + { + "epoch": 0.09515227969003424, + "grad_norm": 0.3723418414592743, + "learning_rate": 1.9027027027027028e-06, + "loss": 0.5889, + "step": 176 + }, + { + "epoch": 0.09569291764281852, + "grad_norm": 0.38720229268074036, + "learning_rate": 1.9135135135135135e-06, + "loss": 0.6096, + "step": 177 + }, + { + "epoch": 0.09623355559560282, + "grad_norm": 0.384669691324234, + "learning_rate": 1.9243243243243243e-06, + "loss": 0.5813, + "step": 178 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 0.3830939531326294, + "learning_rate": 1.935135135135135e-06, + "loss": 0.535, + "step": 179 + }, + { + "epoch": 0.09731483150117139, + "grad_norm": 0.3666214048862457, + "learning_rate": 1.945945945945946e-06, + "loss": 0.5891, + "step": 180 + }, + { + "epoch": 0.09785546945395567, + "grad_norm": 0.37354567646980286, + "learning_rate": 1.956756756756757e-06, + "loss": 0.6001, + "step": 181 + }, + { + "epoch": 0.09839610740673996, + "grad_norm": 0.3545113205909729, + "learning_rate": 1.9675675675675678e-06, + "loss": 0.5886, + "step": 182 + }, + { + "epoch": 0.09893674535952424, + "grad_norm": 0.3804175555706024, + "learning_rate": 1.9783783783783786e-06, + "loss": 0.5686, + "step": 183 + }, + { + "epoch": 0.09947738331230853, + "grad_norm": 0.37139418721199036, + "learning_rate": 1.9891891891891895e-06, + "loss": 0.5788, + "step": 184 + }, + { + "epoch": 0.10001802126509281, + "grad_norm": 0.34333956241607666, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5442, + "step": 185 + }, + { + "epoch": 0.1005586592178771, + "grad_norm": 0.37467560172080994, + "learning_rate": 2.0108108108108108e-06, + "loss": 0.5533, + "step": 186 + }, + { + "epoch": 0.10109929717066138, + "grad_norm": 0.40123438835144043, + "learning_rate": 2.0216216216216216e-06, + "loss": 0.554, + "step": 187 + }, + { + "epoch": 0.10163993512344567, + "grad_norm": 0.41539064049720764, + "learning_rate": 2.0324324324324325e-06, + "loss": 0.5893, + "step": 188 + }, + { + "epoch": 0.10218057307622995, + "grad_norm": 0.3829135596752167, + "learning_rate": 2.0432432432432433e-06, + "loss": 0.5845, + "step": 189 + }, + { + "epoch": 0.10272121102901424, + "grad_norm": 0.3924802541732788, + "learning_rate": 2.054054054054054e-06, + "loss": 0.5521, + "step": 190 + }, + { + "epoch": 0.10326184898179852, + "grad_norm": 0.37531155347824097, + "learning_rate": 2.064864864864865e-06, + "loss": 0.571, + "step": 191 + }, + { + "epoch": 0.10380248693458281, + "grad_norm": 0.3559173345565796, + "learning_rate": 2.075675675675676e-06, + "loss": 0.5636, + "step": 192 + }, + { + "epoch": 0.10434312488736709, + "grad_norm": 0.3618552088737488, + "learning_rate": 2.0864864864864868e-06, + "loss": 0.5587, + "step": 193 + }, + { + "epoch": 0.10488376284015138, + "grad_norm": 0.36590248346328735, + "learning_rate": 2.0972972972972976e-06, + "loss": 0.557, + "step": 194 + }, + { + "epoch": 0.10542440079293566, + "grad_norm": 0.3591752052307129, + "learning_rate": 2.1081081081081085e-06, + "loss": 0.5506, + "step": 195 + }, + { + "epoch": 0.10596503874571996, + "grad_norm": 0.41891753673553467, + "learning_rate": 2.118918918918919e-06, + "loss": 0.5663, + "step": 196 + }, + { + "epoch": 0.10650567669850423, + "grad_norm": 0.37557801604270935, + "learning_rate": 2.1297297297297298e-06, + "loss": 0.5797, + "step": 197 + }, + { + "epoch": 0.10704631465128853, + "grad_norm": 0.3890296220779419, + "learning_rate": 2.1405405405405406e-06, + "loss": 0.5387, + "step": 198 + }, + { + "epoch": 0.1075869526040728, + "grad_norm": 0.43507513403892517, + "learning_rate": 2.1513513513513515e-06, + "loss": 0.5988, + "step": 199 + }, + { + "epoch": 0.1081275905568571, + "grad_norm": 0.3923652470111847, + "learning_rate": 2.1621621621621623e-06, + "loss": 0.6047, + "step": 200 + }, + { + "epoch": 0.10866822850964138, + "grad_norm": 0.38558241724967957, + "learning_rate": 2.172972972972973e-06, + "loss": 0.5726, + "step": 201 + }, + { + "epoch": 0.10920886646242567, + "grad_norm": 0.3587067425251007, + "learning_rate": 2.183783783783784e-06, + "loss": 0.5832, + "step": 202 + }, + { + "epoch": 0.10974950441520995, + "grad_norm": 0.3850076198577881, + "learning_rate": 2.194594594594595e-06, + "loss": 0.556, + "step": 203 + }, + { + "epoch": 0.11029014236799424, + "grad_norm": 0.38720348477363586, + "learning_rate": 2.2054054054054058e-06, + "loss": 0.5684, + "step": 204 + }, + { + "epoch": 0.11083078032077852, + "grad_norm": 0.40378227829933167, + "learning_rate": 2.2162162162162166e-06, + "loss": 0.5897, + "step": 205 + }, + { + "epoch": 0.1113714182735628, + "grad_norm": 0.4102720320224762, + "learning_rate": 2.227027027027027e-06, + "loss": 0.5628, + "step": 206 + }, + { + "epoch": 0.11191205622634709, + "grad_norm": 0.36637264490127563, + "learning_rate": 2.237837837837838e-06, + "loss": 0.5643, + "step": 207 + }, + { + "epoch": 0.11245269417913137, + "grad_norm": 0.4347660541534424, + "learning_rate": 2.2486486486486488e-06, + "loss": 0.5591, + "step": 208 + }, + { + "epoch": 0.11299333213191566, + "grad_norm": 0.3727447986602783, + "learning_rate": 2.2594594594594596e-06, + "loss": 0.5625, + "step": 209 + }, + { + "epoch": 0.11353397008469994, + "grad_norm": 0.3756256103515625, + "learning_rate": 2.2702702702702705e-06, + "loss": 0.5595, + "step": 210 + }, + { + "epoch": 0.11407460803748423, + "grad_norm": 0.40890535712242126, + "learning_rate": 2.2810810810810813e-06, + "loss": 0.5918, + "step": 211 + }, + { + "epoch": 0.11461524599026851, + "grad_norm": 0.4243277907371521, + "learning_rate": 2.291891891891892e-06, + "loss": 0.5755, + "step": 212 + }, + { + "epoch": 0.1151558839430528, + "grad_norm": 0.41777583956718445, + "learning_rate": 2.302702702702703e-06, + "loss": 0.5478, + "step": 213 + }, + { + "epoch": 0.11569652189583708, + "grad_norm": 0.4105633497238159, + "learning_rate": 2.3135135135135135e-06, + "loss": 0.5595, + "step": 214 + }, + { + "epoch": 0.11623715984862137, + "grad_norm": 0.40353402495384216, + "learning_rate": 2.3243243243243247e-06, + "loss": 0.5876, + "step": 215 + }, + { + "epoch": 0.11677779780140565, + "grad_norm": 0.3937397301197052, + "learning_rate": 2.335135135135135e-06, + "loss": 0.5392, + "step": 216 + }, + { + "epoch": 0.11731843575418995, + "grad_norm": 0.4327187240123749, + "learning_rate": 2.345945945945946e-06, + "loss": 0.5869, + "step": 217 + }, + { + "epoch": 0.11785907370697422, + "grad_norm": 0.42935019731521606, + "learning_rate": 2.356756756756757e-06, + "loss": 0.5673, + "step": 218 + }, + { + "epoch": 0.11839971165975852, + "grad_norm": 0.3601163923740387, + "learning_rate": 2.3675675675675677e-06, + "loss": 0.5625, + "step": 219 + }, + { + "epoch": 0.1189403496125428, + "grad_norm": 0.4096478223800659, + "learning_rate": 2.3783783783783786e-06, + "loss": 0.5521, + "step": 220 + }, + { + "epoch": 0.11948098756532709, + "grad_norm": 0.36564552783966064, + "learning_rate": 2.3891891891891895e-06, + "loss": 0.554, + "step": 221 + }, + { + "epoch": 0.12002162551811137, + "grad_norm": 0.39213013648986816, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.5721, + "step": 222 + }, + { + "epoch": 0.12056226347089566, + "grad_norm": 0.37426066398620605, + "learning_rate": 2.410810810810811e-06, + "loss": 0.5603, + "step": 223 + }, + { + "epoch": 0.12110290142367994, + "grad_norm": 0.42405861616134644, + "learning_rate": 2.4216216216216216e-06, + "loss": 0.5629, + "step": 224 + }, + { + "epoch": 0.12164353937646423, + "grad_norm": 0.39380067586898804, + "learning_rate": 2.432432432432433e-06, + "loss": 0.5963, + "step": 225 + }, + { + "epoch": 0.12218417732924851, + "grad_norm": 0.3430948257446289, + "learning_rate": 2.4432432432432433e-06, + "loss": 0.5709, + "step": 226 + }, + { + "epoch": 0.1227248152820328, + "grad_norm": 0.3845606744289398, + "learning_rate": 2.454054054054054e-06, + "loss": 0.5535, + "step": 227 + }, + { + "epoch": 0.12326545323481708, + "grad_norm": 0.3496875762939453, + "learning_rate": 2.464864864864865e-06, + "loss": 0.5422, + "step": 228 + }, + { + "epoch": 0.12380609118760137, + "grad_norm": 0.3898939788341522, + "learning_rate": 2.475675675675676e-06, + "loss": 0.574, + "step": 229 + }, + { + "epoch": 0.12434672914038565, + "grad_norm": 0.4322075843811035, + "learning_rate": 2.4864864864864867e-06, + "loss": 0.5608, + "step": 230 + }, + { + "epoch": 0.12488736709316994, + "grad_norm": 0.34907200932502747, + "learning_rate": 2.4972972972972976e-06, + "loss": 0.5549, + "step": 231 + }, + { + "epoch": 0.12542800504595422, + "grad_norm": 0.3408522307872772, + "learning_rate": 2.508108108108108e-06, + "loss": 0.5739, + "step": 232 + }, + { + "epoch": 0.1259686429987385, + "grad_norm": 0.3903674781322479, + "learning_rate": 2.518918918918919e-06, + "loss": 0.5778, + "step": 233 + }, + { + "epoch": 0.1265092809515228, + "grad_norm": 0.3834710717201233, + "learning_rate": 2.52972972972973e-06, + "loss": 0.5541, + "step": 234 + }, + { + "epoch": 0.12704991890430709, + "grad_norm": 0.39471179246902466, + "learning_rate": 2.540540540540541e-06, + "loss": 0.5499, + "step": 235 + }, + { + "epoch": 0.12759055685709136, + "grad_norm": 0.3803953230381012, + "learning_rate": 2.5513513513513515e-06, + "loss": 0.5729, + "step": 236 + }, + { + "epoch": 0.12813119480987564, + "grad_norm": 0.37784209847450256, + "learning_rate": 2.5621621621621623e-06, + "loss": 0.5713, + "step": 237 + }, + { + "epoch": 0.12867183276265995, + "grad_norm": 0.43192943930625916, + "learning_rate": 2.572972972972973e-06, + "loss": 0.5796, + "step": 238 + }, + { + "epoch": 0.12921247071544423, + "grad_norm": 0.35144469141960144, + "learning_rate": 2.5837837837837844e-06, + "loss": 0.5362, + "step": 239 + }, + { + "epoch": 0.1297531086682285, + "grad_norm": 0.40244975686073303, + "learning_rate": 2.594594594594595e-06, + "loss": 0.5254, + "step": 240 + }, + { + "epoch": 0.13029374662101278, + "grad_norm": 0.3917606770992279, + "learning_rate": 2.6054054054054057e-06, + "loss": 0.5728, + "step": 241 + }, + { + "epoch": 0.1308343845737971, + "grad_norm": 0.423109233379364, + "learning_rate": 2.616216216216216e-06, + "loss": 0.5823, + "step": 242 + }, + { + "epoch": 0.13137502252658137, + "grad_norm": 0.3996261954307556, + "learning_rate": 2.627027027027027e-06, + "loss": 0.5372, + "step": 243 + }, + { + "epoch": 0.13191566047936565, + "grad_norm": 0.41690677404403687, + "learning_rate": 2.6378378378378383e-06, + "loss": 0.5515, + "step": 244 + }, + { + "epoch": 0.13245629843214993, + "grad_norm": 0.3668651878833771, + "learning_rate": 2.648648648648649e-06, + "loss": 0.5446, + "step": 245 + }, + { + "epoch": 0.13299693638493423, + "grad_norm": 0.33522850275039673, + "learning_rate": 2.6594594594594596e-06, + "loss": 0.5253, + "step": 246 + }, + { + "epoch": 0.1335375743377185, + "grad_norm": 0.38769128918647766, + "learning_rate": 2.6702702702702704e-06, + "loss": 0.5796, + "step": 247 + }, + { + "epoch": 0.1340782122905028, + "grad_norm": 0.4321063160896301, + "learning_rate": 2.6810810810810813e-06, + "loss": 0.545, + "step": 248 + }, + { + "epoch": 0.13461885024328707, + "grad_norm": 0.35301586985588074, + "learning_rate": 2.6918918918918926e-06, + "loss": 0.5416, + "step": 249 + }, + { + "epoch": 0.13515948819607138, + "grad_norm": 0.39626818895339966, + "learning_rate": 2.702702702702703e-06, + "loss": 0.5494, + "step": 250 + }, + { + "epoch": 0.13570012614885565, + "grad_norm": 0.3496905565261841, + "learning_rate": 2.713513513513514e-06, + "loss": 0.5474, + "step": 251 + }, + { + "epoch": 0.13624076410163993, + "grad_norm": 0.4058341383934021, + "learning_rate": 2.7243243243243243e-06, + "loss": 0.5699, + "step": 252 + }, + { + "epoch": 0.1367814020544242, + "grad_norm": 0.39988845586776733, + "learning_rate": 2.735135135135135e-06, + "loss": 0.5506, + "step": 253 + }, + { + "epoch": 0.13732204000720852, + "grad_norm": 0.3820221722126007, + "learning_rate": 2.7459459459459464e-06, + "loss": 0.5321, + "step": 254 + }, + { + "epoch": 0.1378626779599928, + "grad_norm": 0.38915273547172546, + "learning_rate": 2.7567567567567573e-06, + "loss": 0.5377, + "step": 255 + }, + { + "epoch": 0.13840331591277708, + "grad_norm": 0.35650572180747986, + "learning_rate": 2.7675675675675677e-06, + "loss": 0.5215, + "step": 256 + }, + { + "epoch": 0.13894395386556135, + "grad_norm": 0.4189383387565613, + "learning_rate": 2.7783783783783786e-06, + "loss": 0.5357, + "step": 257 + }, + { + "epoch": 0.13948459181834566, + "grad_norm": 0.367410808801651, + "learning_rate": 2.7891891891891894e-06, + "loss": 0.5716, + "step": 258 + }, + { + "epoch": 0.14002522977112994, + "grad_norm": 0.4189624488353729, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.5677, + "step": 259 + }, + { + "epoch": 0.14056586772391422, + "grad_norm": 0.4024697542190552, + "learning_rate": 2.810810810810811e-06, + "loss": 0.5188, + "step": 260 + }, + { + "epoch": 0.1411065056766985, + "grad_norm": 0.4311826229095459, + "learning_rate": 2.821621621621622e-06, + "loss": 0.5732, + "step": 261 + }, + { + "epoch": 0.1416471436294828, + "grad_norm": 0.373671293258667, + "learning_rate": 2.8324324324324324e-06, + "loss": 0.5558, + "step": 262 + }, + { + "epoch": 0.14218778158226708, + "grad_norm": 0.39763349294662476, + "learning_rate": 2.8432432432432433e-06, + "loss": 0.5529, + "step": 263 + }, + { + "epoch": 0.14272841953505136, + "grad_norm": 0.3955563008785248, + "learning_rate": 2.8540540540540546e-06, + "loss": 0.5517, + "step": 264 + }, + { + "epoch": 0.14326905748783564, + "grad_norm": 0.3701838552951813, + "learning_rate": 2.8648648648648654e-06, + "loss": 0.5422, + "step": 265 + }, + { + "epoch": 0.14380969544061994, + "grad_norm": 0.38581880927085876, + "learning_rate": 2.875675675675676e-06, + "loss": 0.5332, + "step": 266 + }, + { + "epoch": 0.14435033339340422, + "grad_norm": 0.3548599183559418, + "learning_rate": 2.8864864864864867e-06, + "loss": 0.5484, + "step": 267 + }, + { + "epoch": 0.1448909713461885, + "grad_norm": 0.37688708305358887, + "learning_rate": 2.897297297297297e-06, + "loss": 0.5266, + "step": 268 + }, + { + "epoch": 0.14543160929897278, + "grad_norm": 0.36959773302078247, + "learning_rate": 2.9081081081081084e-06, + "loss": 0.5356, + "step": 269 + }, + { + "epoch": 0.1459722472517571, + "grad_norm": 0.3831232786178589, + "learning_rate": 2.9189189189189193e-06, + "loss": 0.5296, + "step": 270 + }, + { + "epoch": 0.14651288520454137, + "grad_norm": 0.38201797008514404, + "learning_rate": 2.92972972972973e-06, + "loss": 0.5175, + "step": 271 + }, + { + "epoch": 0.14705352315732564, + "grad_norm": 0.36365392804145813, + "learning_rate": 2.9405405405405406e-06, + "loss": 0.5827, + "step": 272 + }, + { + "epoch": 0.14759416111010992, + "grad_norm": 0.3701610565185547, + "learning_rate": 2.9513513513513514e-06, + "loss": 0.554, + "step": 273 + }, + { + "epoch": 0.14813479906289423, + "grad_norm": 0.4099448025226593, + "learning_rate": 2.9621621621621627e-06, + "loss": 0.5455, + "step": 274 + }, + { + "epoch": 0.1486754370156785, + "grad_norm": 0.3831491470336914, + "learning_rate": 2.9729729729729736e-06, + "loss": 0.5252, + "step": 275 + }, + { + "epoch": 0.1492160749684628, + "grad_norm": 0.3430909812450409, + "learning_rate": 2.983783783783784e-06, + "loss": 0.5578, + "step": 276 + }, + { + "epoch": 0.14975671292124706, + "grad_norm": 0.39842531085014343, + "learning_rate": 2.994594594594595e-06, + "loss": 0.5418, + "step": 277 + }, + { + "epoch": 0.15029735087403134, + "grad_norm": 0.37241166830062866, + "learning_rate": 3.0054054054054053e-06, + "loss": 0.5336, + "step": 278 + }, + { + "epoch": 0.15083798882681565, + "grad_norm": 0.4128376543521881, + "learning_rate": 3.016216216216216e-06, + "loss": 0.5677, + "step": 279 + }, + { + "epoch": 0.15137862677959993, + "grad_norm": 0.42050155997276306, + "learning_rate": 3.0270270270270274e-06, + "loss": 0.5476, + "step": 280 + }, + { + "epoch": 0.1519192647323842, + "grad_norm": 0.40356847643852234, + "learning_rate": 3.0378378378378383e-06, + "loss": 0.5339, + "step": 281 + }, + { + "epoch": 0.15245990268516849, + "grad_norm": 0.38171207904815674, + "learning_rate": 3.0486486486486487e-06, + "loss": 0.5655, + "step": 282 + }, + { + "epoch": 0.1530005406379528, + "grad_norm": 0.34982213377952576, + "learning_rate": 3.0594594594594596e-06, + "loss": 0.557, + "step": 283 + }, + { + "epoch": 0.15354117859073707, + "grad_norm": 0.42924508452415466, + "learning_rate": 3.0702702702702704e-06, + "loss": 0.5474, + "step": 284 + }, + { + "epoch": 0.15408181654352135, + "grad_norm": 0.3678603172302246, + "learning_rate": 3.0810810810810817e-06, + "loss": 0.5421, + "step": 285 + }, + { + "epoch": 0.15462245449630563, + "grad_norm": 0.3565238118171692, + "learning_rate": 3.091891891891892e-06, + "loss": 0.5412, + "step": 286 + }, + { + "epoch": 0.15516309244908993, + "grad_norm": 0.36498183012008667, + "learning_rate": 3.102702702702703e-06, + "loss": 0.5455, + "step": 287 + }, + { + "epoch": 0.1557037304018742, + "grad_norm": 0.38334113359451294, + "learning_rate": 3.1135135135135134e-06, + "loss": 0.5392, + "step": 288 + }, + { + "epoch": 0.1562443683546585, + "grad_norm": 0.36851766705513, + "learning_rate": 3.1243243243243243e-06, + "loss": 0.5403, + "step": 289 + }, + { + "epoch": 0.15678500630744277, + "grad_norm": 0.384204626083374, + "learning_rate": 3.1351351351351356e-06, + "loss": 0.5414, + "step": 290 + }, + { + "epoch": 0.15732564426022708, + "grad_norm": 0.35691317915916443, + "learning_rate": 3.1459459459459464e-06, + "loss": 0.5319, + "step": 291 + }, + { + "epoch": 0.15786628221301136, + "grad_norm": 0.41789743304252625, + "learning_rate": 3.156756756756757e-06, + "loss": 0.5399, + "step": 292 + }, + { + "epoch": 0.15840692016579563, + "grad_norm": 0.370802104473114, + "learning_rate": 3.1675675675675677e-06, + "loss": 0.5379, + "step": 293 + }, + { + "epoch": 0.1589475581185799, + "grad_norm": 0.42494484782218933, + "learning_rate": 3.1783783783783786e-06, + "loss": 0.5087, + "step": 294 + }, + { + "epoch": 0.15948819607136422, + "grad_norm": 0.38942399621009827, + "learning_rate": 3.1891891891891894e-06, + "loss": 0.549, + "step": 295 + }, + { + "epoch": 0.1600288340241485, + "grad_norm": 0.4098314344882965, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.5276, + "step": 296 + }, + { + "epoch": 0.16056947197693278, + "grad_norm": 0.4091181457042694, + "learning_rate": 3.210810810810811e-06, + "loss": 0.5031, + "step": 297 + }, + { + "epoch": 0.16111010992971705, + "grad_norm": 0.4209524691104889, + "learning_rate": 3.2216216216216216e-06, + "loss": 0.5755, + "step": 298 + }, + { + "epoch": 0.16165074788250136, + "grad_norm": 0.46801844239234924, + "learning_rate": 3.2324324324324324e-06, + "loss": 0.5484, + "step": 299 + }, + { + "epoch": 0.16219138583528564, + "grad_norm": 0.3911043703556061, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.5408, + "step": 300 + }, + { + "epoch": 0.16273202378806992, + "grad_norm": 0.3921964466571808, + "learning_rate": 3.2540540540540546e-06, + "loss": 0.5569, + "step": 301 + }, + { + "epoch": 0.1632726617408542, + "grad_norm": 0.4882521331310272, + "learning_rate": 3.264864864864865e-06, + "loss": 0.5486, + "step": 302 + }, + { + "epoch": 0.1638132996936385, + "grad_norm": 0.511541485786438, + "learning_rate": 3.275675675675676e-06, + "loss": 0.556, + "step": 303 + }, + { + "epoch": 0.16435393764642278, + "grad_norm": 0.3747992217540741, + "learning_rate": 3.2864864864864867e-06, + "loss": 0.5433, + "step": 304 + }, + { + "epoch": 0.16489457559920706, + "grad_norm": 0.4771915078163147, + "learning_rate": 3.2972972972972976e-06, + "loss": 0.5317, + "step": 305 + }, + { + "epoch": 0.16543521355199134, + "grad_norm": 0.4885832965373993, + "learning_rate": 3.3081081081081084e-06, + "loss": 0.5573, + "step": 306 + }, + { + "epoch": 0.16597585150477565, + "grad_norm": 0.4261198341846466, + "learning_rate": 3.3189189189189193e-06, + "loss": 0.5403, + "step": 307 + }, + { + "epoch": 0.16651648945755992, + "grad_norm": 0.45114848017692566, + "learning_rate": 3.3297297297297297e-06, + "loss": 0.5356, + "step": 308 + }, + { + "epoch": 0.1670571274103442, + "grad_norm": 0.4223145842552185, + "learning_rate": 3.3405405405405406e-06, + "loss": 0.5298, + "step": 309 + }, + { + "epoch": 0.16759776536312848, + "grad_norm": 0.4394230842590332, + "learning_rate": 3.351351351351352e-06, + "loss": 0.5417, + "step": 310 + }, + { + "epoch": 0.1681384033159128, + "grad_norm": 0.46592485904693604, + "learning_rate": 3.3621621621621627e-06, + "loss": 0.5412, + "step": 311 + }, + { + "epoch": 0.16867904126869707, + "grad_norm": 0.42755240201950073, + "learning_rate": 3.372972972972973e-06, + "loss": 0.5346, + "step": 312 + }, + { + "epoch": 0.16921967922148134, + "grad_norm": 0.37076422572135925, + "learning_rate": 3.383783783783784e-06, + "loss": 0.5483, + "step": 313 + }, + { + "epoch": 0.16976031717426562, + "grad_norm": 0.4419684112071991, + "learning_rate": 3.3945945945945944e-06, + "loss": 0.5465, + "step": 314 + }, + { + "epoch": 0.17030095512704993, + "grad_norm": 0.4431931972503662, + "learning_rate": 3.4054054054054057e-06, + "loss": 0.5445, + "step": 315 + }, + { + "epoch": 0.1708415930798342, + "grad_norm": 0.43325522541999817, + "learning_rate": 3.4162162162162166e-06, + "loss": 0.5653, + "step": 316 + }, + { + "epoch": 0.1713822310326185, + "grad_norm": 0.37014782428741455, + "learning_rate": 3.4270270270270274e-06, + "loss": 0.5208, + "step": 317 + }, + { + "epoch": 0.17192286898540277, + "grad_norm": 0.43956634402275085, + "learning_rate": 3.437837837837838e-06, + "loss": 0.5343, + "step": 318 + }, + { + "epoch": 0.17246350693818707, + "grad_norm": 0.38273492455482483, + "learning_rate": 3.4486486486486487e-06, + "loss": 0.5368, + "step": 319 + }, + { + "epoch": 0.17300414489097135, + "grad_norm": 0.3921017348766327, + "learning_rate": 3.45945945945946e-06, + "loss": 0.5535, + "step": 320 + }, + { + "epoch": 0.17354478284375563, + "grad_norm": 0.3745984137058258, + "learning_rate": 3.470270270270271e-06, + "loss": 0.5361, + "step": 321 + }, + { + "epoch": 0.1740854207965399, + "grad_norm": 0.40335318446159363, + "learning_rate": 3.4810810810810813e-06, + "loss": 0.5894, + "step": 322 + }, + { + "epoch": 0.17462605874932421, + "grad_norm": 0.35682252049446106, + "learning_rate": 3.491891891891892e-06, + "loss": 0.5241, + "step": 323 + }, + { + "epoch": 0.1751666967021085, + "grad_norm": 0.37713858485221863, + "learning_rate": 3.5027027027027026e-06, + "loss": 0.5321, + "step": 324 + }, + { + "epoch": 0.17570733465489277, + "grad_norm": 0.449979692697525, + "learning_rate": 3.513513513513514e-06, + "loss": 0.531, + "step": 325 + }, + { + "epoch": 0.17624797260767705, + "grad_norm": 0.40404993295669556, + "learning_rate": 3.5243243243243247e-06, + "loss": 0.5409, + "step": 326 + }, + { + "epoch": 0.17678861056046136, + "grad_norm": 0.36095544695854187, + "learning_rate": 3.5351351351351355e-06, + "loss": 0.5302, + "step": 327 + }, + { + "epoch": 0.17732924851324564, + "grad_norm": 0.4284367561340332, + "learning_rate": 3.545945945945946e-06, + "loss": 0.5538, + "step": 328 + }, + { + "epoch": 0.17786988646602991, + "grad_norm": 0.40965935587882996, + "learning_rate": 3.556756756756757e-06, + "loss": 0.518, + "step": 329 + }, + { + "epoch": 0.1784105244188142, + "grad_norm": 0.39643898606300354, + "learning_rate": 3.567567567567568e-06, + "loss": 0.5696, + "step": 330 + }, + { + "epoch": 0.1789511623715985, + "grad_norm": 0.4323926270008087, + "learning_rate": 3.5783783783783785e-06, + "loss": 0.523, + "step": 331 + }, + { + "epoch": 0.17949180032438278, + "grad_norm": 0.45723623037338257, + "learning_rate": 3.5891891891891894e-06, + "loss": 0.5406, + "step": 332 + }, + { + "epoch": 0.18003243827716706, + "grad_norm": 0.4142730236053467, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.5098, + "step": 333 + }, + { + "epoch": 0.18057307622995133, + "grad_norm": 0.4000495672225952, + "learning_rate": 3.6108108108108107e-06, + "loss": 0.5312, + "step": 334 + }, + { + "epoch": 0.18111371418273564, + "grad_norm": 0.391166090965271, + "learning_rate": 3.621621621621622e-06, + "loss": 0.5144, + "step": 335 + }, + { + "epoch": 0.18165435213551992, + "grad_norm": 0.4517064392566681, + "learning_rate": 3.632432432432433e-06, + "loss": 0.5252, + "step": 336 + }, + { + "epoch": 0.1821949900883042, + "grad_norm": 0.3642129898071289, + "learning_rate": 3.6432432432432437e-06, + "loss": 0.5576, + "step": 337 + }, + { + "epoch": 0.18273562804108848, + "grad_norm": 0.3912559449672699, + "learning_rate": 3.654054054054054e-06, + "loss": 0.4994, + "step": 338 + }, + { + "epoch": 0.18327626599387278, + "grad_norm": 0.39499011635780334, + "learning_rate": 3.664864864864865e-06, + "loss": 0.5234, + "step": 339 + }, + { + "epoch": 0.18381690394665706, + "grad_norm": 0.44011837244033813, + "learning_rate": 3.6756756756756763e-06, + "loss": 0.536, + "step": 340 + }, + { + "epoch": 0.18435754189944134, + "grad_norm": 0.4400962293148041, + "learning_rate": 3.6864864864864867e-06, + "loss": 0.5515, + "step": 341 + }, + { + "epoch": 0.18489817985222562, + "grad_norm": 0.37328705191612244, + "learning_rate": 3.6972972972972975e-06, + "loss": 0.5448, + "step": 342 + }, + { + "epoch": 0.1854388178050099, + "grad_norm": 0.4239843785762787, + "learning_rate": 3.7081081081081084e-06, + "loss": 0.5415, + "step": 343 + }, + { + "epoch": 0.1859794557577942, + "grad_norm": 0.37935659289360046, + "learning_rate": 3.718918918918919e-06, + "loss": 0.5329, + "step": 344 + }, + { + "epoch": 0.18652009371057848, + "grad_norm": 0.4207356870174408, + "learning_rate": 3.72972972972973e-06, + "loss": 0.5708, + "step": 345 + }, + { + "epoch": 0.18706073166336276, + "grad_norm": 0.39368534088134766, + "learning_rate": 3.740540540540541e-06, + "loss": 0.5069, + "step": 346 + }, + { + "epoch": 0.18760136961614704, + "grad_norm": 0.4316987693309784, + "learning_rate": 3.751351351351352e-06, + "loss": 0.5682, + "step": 347 + }, + { + "epoch": 0.18814200756893135, + "grad_norm": 0.4061681032180786, + "learning_rate": 3.7621621621621623e-06, + "loss": 0.5243, + "step": 348 + }, + { + "epoch": 0.18868264552171563, + "grad_norm": 0.41535401344299316, + "learning_rate": 3.772972972972973e-06, + "loss": 0.5242, + "step": 349 + }, + { + "epoch": 0.1892232834744999, + "grad_norm": 0.4037801921367645, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.5053, + "step": 350 + }, + { + "epoch": 0.18976392142728418, + "grad_norm": 0.38925549387931824, + "learning_rate": 3.794594594594595e-06, + "loss": 0.5159, + "step": 351 + }, + { + "epoch": 0.1903045593800685, + "grad_norm": 0.42589956521987915, + "learning_rate": 3.8054054054054057e-06, + "loss": 0.5292, + "step": 352 + }, + { + "epoch": 0.19084519733285277, + "grad_norm": 0.4325747787952423, + "learning_rate": 3.8162162162162165e-06, + "loss": 0.5386, + "step": 353 + }, + { + "epoch": 0.19138583528563705, + "grad_norm": 0.4532448649406433, + "learning_rate": 3.827027027027027e-06, + "loss": 0.4949, + "step": 354 + }, + { + "epoch": 0.19192647323842132, + "grad_norm": 0.4209156036376953, + "learning_rate": 3.837837837837838e-06, + "loss": 0.568, + "step": 355 + }, + { + "epoch": 0.19246711119120563, + "grad_norm": 0.4481404423713684, + "learning_rate": 3.848648648648649e-06, + "loss": 0.5192, + "step": 356 + }, + { + "epoch": 0.1930077491439899, + "grad_norm": 0.46470949053764343, + "learning_rate": 3.85945945945946e-06, + "loss": 0.5272, + "step": 357 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 0.39657002687454224, + "learning_rate": 3.87027027027027e-06, + "loss": 0.5328, + "step": 358 + }, + { + "epoch": 0.19408902504955847, + "grad_norm": 0.4064314365386963, + "learning_rate": 3.881081081081081e-06, + "loss": 0.5537, + "step": 359 + }, + { + "epoch": 0.19462966300234277, + "grad_norm": 0.4129345417022705, + "learning_rate": 3.891891891891892e-06, + "loss": 0.5393, + "step": 360 + }, + { + "epoch": 0.19517030095512705, + "grad_norm": 0.4236624538898468, + "learning_rate": 3.902702702702703e-06, + "loss": 0.5184, + "step": 361 + }, + { + "epoch": 0.19571093890791133, + "grad_norm": 0.4176543354988098, + "learning_rate": 3.913513513513514e-06, + "loss": 0.5385, + "step": 362 + }, + { + "epoch": 0.1962515768606956, + "grad_norm": 0.380862832069397, + "learning_rate": 3.924324324324324e-06, + "loss": 0.5184, + "step": 363 + }, + { + "epoch": 0.19679221481347992, + "grad_norm": 0.4342859983444214, + "learning_rate": 3.9351351351351355e-06, + "loss": 0.5366, + "step": 364 + }, + { + "epoch": 0.1973328527662642, + "grad_norm": 0.42140740156173706, + "learning_rate": 3.945945945945947e-06, + "loss": 0.5391, + "step": 365 + }, + { + "epoch": 0.19787349071904847, + "grad_norm": 0.4249398410320282, + "learning_rate": 3.956756756756757e-06, + "loss": 0.5344, + "step": 366 + }, + { + "epoch": 0.19841412867183275, + "grad_norm": 0.41181671619415283, + "learning_rate": 3.967567567567568e-06, + "loss": 0.5461, + "step": 367 + }, + { + "epoch": 0.19895476662461706, + "grad_norm": 0.4575495719909668, + "learning_rate": 3.978378378378379e-06, + "loss": 0.5401, + "step": 368 + }, + { + "epoch": 0.19949540457740134, + "grad_norm": 0.40454474091529846, + "learning_rate": 3.989189189189189e-06, + "loss": 0.5209, + "step": 369 + }, + { + "epoch": 0.20003604253018561, + "grad_norm": 0.3996911346912384, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5215, + "step": 370 + }, + { + "epoch": 0.2005766804829699, + "grad_norm": 0.3987369239330292, + "learning_rate": 4.010810810810811e-06, + "loss": 0.5282, + "step": 371 + }, + { + "epoch": 0.2011173184357542, + "grad_norm": 0.426537424325943, + "learning_rate": 4.0216216216216215e-06, + "loss": 0.5152, + "step": 372 + }, + { + "epoch": 0.20165795638853848, + "grad_norm": 0.4209159314632416, + "learning_rate": 4.032432432432433e-06, + "loss": 0.4976, + "step": 373 + }, + { + "epoch": 0.20219859434132276, + "grad_norm": 0.39458101987838745, + "learning_rate": 4.043243243243243e-06, + "loss": 0.5333, + "step": 374 + }, + { + "epoch": 0.20273923229410704, + "grad_norm": 0.4080427587032318, + "learning_rate": 4.0540540540540545e-06, + "loss": 0.5364, + "step": 375 + }, + { + "epoch": 0.20327987024689134, + "grad_norm": 0.41344642639160156, + "learning_rate": 4.064864864864865e-06, + "loss": 0.5157, + "step": 376 + }, + { + "epoch": 0.20382050819967562, + "grad_norm": 0.4057735502719879, + "learning_rate": 4.075675675675676e-06, + "loss": 0.546, + "step": 377 + }, + { + "epoch": 0.2043611461524599, + "grad_norm": 0.41593965888023376, + "learning_rate": 4.086486486486487e-06, + "loss": 0.5241, + "step": 378 + }, + { + "epoch": 0.20490178410524418, + "grad_norm": 0.4008232653141022, + "learning_rate": 4.097297297297297e-06, + "loss": 0.5214, + "step": 379 + }, + { + "epoch": 0.20544242205802848, + "grad_norm": 0.4257887303829193, + "learning_rate": 4.108108108108108e-06, + "loss": 0.558, + "step": 380 + }, + { + "epoch": 0.20598306001081276, + "grad_norm": 0.3643846809864044, + "learning_rate": 4.11891891891892e-06, + "loss": 0.5604, + "step": 381 + }, + { + "epoch": 0.20652369796359704, + "grad_norm": 0.4190754294395447, + "learning_rate": 4.12972972972973e-06, + "loss": 0.4863, + "step": 382 + }, + { + "epoch": 0.20706433591638132, + "grad_norm": 0.3812675476074219, + "learning_rate": 4.1405405405405405e-06, + "loss": 0.528, + "step": 383 + }, + { + "epoch": 0.20760497386916563, + "grad_norm": 0.36235401034355164, + "learning_rate": 4.151351351351352e-06, + "loss": 0.5291, + "step": 384 + }, + { + "epoch": 0.2081456118219499, + "grad_norm": 0.4425322115421295, + "learning_rate": 4.162162162162163e-06, + "loss": 0.525, + "step": 385 + }, + { + "epoch": 0.20868624977473418, + "grad_norm": 0.4001314342021942, + "learning_rate": 4.1729729729729735e-06, + "loss": 0.5457, + "step": 386 + }, + { + "epoch": 0.20922688772751846, + "grad_norm": 0.36931946873664856, + "learning_rate": 4.183783783783784e-06, + "loss": 0.5271, + "step": 387 + }, + { + "epoch": 0.20976752568030277, + "grad_norm": 0.4713948965072632, + "learning_rate": 4.194594594594595e-06, + "loss": 0.5195, + "step": 388 + }, + { + "epoch": 0.21030816363308705, + "grad_norm": 0.38855504989624023, + "learning_rate": 4.205405405405406e-06, + "loss": 0.565, + "step": 389 + }, + { + "epoch": 0.21084880158587133, + "grad_norm": 0.4155072867870331, + "learning_rate": 4.216216216216217e-06, + "loss": 0.5278, + "step": 390 + }, + { + "epoch": 0.2113894395386556, + "grad_norm": 0.42699480056762695, + "learning_rate": 4.227027027027027e-06, + "loss": 0.5426, + "step": 391 + }, + { + "epoch": 0.2119300774914399, + "grad_norm": 0.4101499915122986, + "learning_rate": 4.237837837837838e-06, + "loss": 0.5184, + "step": 392 + }, + { + "epoch": 0.2124707154442242, + "grad_norm": 0.41484978795051575, + "learning_rate": 4.248648648648649e-06, + "loss": 0.5373, + "step": 393 + }, + { + "epoch": 0.21301135339700847, + "grad_norm": 0.36816850304603577, + "learning_rate": 4.2594594594594595e-06, + "loss": 0.5512, + "step": 394 + }, + { + "epoch": 0.21355199134979275, + "grad_norm": 0.4117318391799927, + "learning_rate": 4.270270270270271e-06, + "loss": 0.5204, + "step": 395 + }, + { + "epoch": 0.21409262930257705, + "grad_norm": 0.38216206431388855, + "learning_rate": 4.281081081081081e-06, + "loss": 0.5353, + "step": 396 + }, + { + "epoch": 0.21463326725536133, + "grad_norm": 0.3792795240879059, + "learning_rate": 4.2918918918918925e-06, + "loss": 0.5727, + "step": 397 + }, + { + "epoch": 0.2151739052081456, + "grad_norm": 0.41061505675315857, + "learning_rate": 4.302702702702703e-06, + "loss": 0.5343, + "step": 398 + }, + { + "epoch": 0.2157145431609299, + "grad_norm": 0.39943134784698486, + "learning_rate": 4.313513513513513e-06, + "loss": 0.4986, + "step": 399 + }, + { + "epoch": 0.2162551811137142, + "grad_norm": 0.41517776250839233, + "learning_rate": 4.324324324324325e-06, + "loss": 0.5402, + "step": 400 + }, + { + "epoch": 0.21679581906649847, + "grad_norm": 0.45214927196502686, + "learning_rate": 4.335135135135136e-06, + "loss": 0.5219, + "step": 401 + }, + { + "epoch": 0.21733645701928275, + "grad_norm": 0.36262819170951843, + "learning_rate": 4.345945945945946e-06, + "loss": 0.5203, + "step": 402 + }, + { + "epoch": 0.21787709497206703, + "grad_norm": 0.3521535396575928, + "learning_rate": 4.356756756756757e-06, + "loss": 0.4823, + "step": 403 + }, + { + "epoch": 0.21841773292485134, + "grad_norm": 0.4195312559604645, + "learning_rate": 4.367567567567568e-06, + "loss": 0.5261, + "step": 404 + }, + { + "epoch": 0.21895837087763562, + "grad_norm": 0.42293110489845276, + "learning_rate": 4.378378378378379e-06, + "loss": 0.5414, + "step": 405 + }, + { + "epoch": 0.2194990088304199, + "grad_norm": 0.38924261927604675, + "learning_rate": 4.38918918918919e-06, + "loss": 0.5251, + "step": 406 + }, + { + "epoch": 0.22003964678320417, + "grad_norm": 0.42991113662719727, + "learning_rate": 4.4e-06, + "loss": 0.5396, + "step": 407 + }, + { + "epoch": 0.22058028473598848, + "grad_norm": 0.37058499455451965, + "learning_rate": 4.4108108108108115e-06, + "loss": 0.5398, + "step": 408 + }, + { + "epoch": 0.22112092268877276, + "grad_norm": 0.3862038254737854, + "learning_rate": 4.421621621621622e-06, + "loss": 0.5369, + "step": 409 + }, + { + "epoch": 0.22166156064155704, + "grad_norm": 0.39511534571647644, + "learning_rate": 4.432432432432433e-06, + "loss": 0.5137, + "step": 410 + }, + { + "epoch": 0.22220219859434132, + "grad_norm": 0.4125954210758209, + "learning_rate": 4.443243243243244e-06, + "loss": 0.4952, + "step": 411 + }, + { + "epoch": 0.2227428365471256, + "grad_norm": 0.3931781053543091, + "learning_rate": 4.454054054054054e-06, + "loss": 0.4975, + "step": 412 + }, + { + "epoch": 0.2232834744999099, + "grad_norm": 0.41780611872673035, + "learning_rate": 4.464864864864865e-06, + "loss": 0.5341, + "step": 413 + }, + { + "epoch": 0.22382411245269418, + "grad_norm": 0.4228370487689972, + "learning_rate": 4.475675675675676e-06, + "loss": 0.5232, + "step": 414 + }, + { + "epoch": 0.22436475040547846, + "grad_norm": 0.4475858509540558, + "learning_rate": 4.486486486486487e-06, + "loss": 0.5384, + "step": 415 + }, + { + "epoch": 0.22490538835826274, + "grad_norm": 0.41531985998153687, + "learning_rate": 4.4972972972972975e-06, + "loss": 0.5101, + "step": 416 + }, + { + "epoch": 0.22544602631104704, + "grad_norm": 0.35512199997901917, + "learning_rate": 4.508108108108109e-06, + "loss": 0.5023, + "step": 417 + }, + { + "epoch": 0.22598666426383132, + "grad_norm": 0.40374282002449036, + "learning_rate": 4.518918918918919e-06, + "loss": 0.5207, + "step": 418 + }, + { + "epoch": 0.2265273022166156, + "grad_norm": 0.4103836119174957, + "learning_rate": 4.52972972972973e-06, + "loss": 0.5279, + "step": 419 + }, + { + "epoch": 0.22706794016939988, + "grad_norm": 0.4135481119155884, + "learning_rate": 4.540540540540541e-06, + "loss": 0.5147, + "step": 420 + }, + { + "epoch": 0.22760857812218419, + "grad_norm": 0.39813584089279175, + "learning_rate": 4.551351351351352e-06, + "loss": 0.5228, + "step": 421 + }, + { + "epoch": 0.22814921607496846, + "grad_norm": 0.38910800218582153, + "learning_rate": 4.562162162162163e-06, + "loss": 0.5271, + "step": 422 + }, + { + "epoch": 0.22868985402775274, + "grad_norm": 0.45979151129722595, + "learning_rate": 4.572972972972973e-06, + "loss": 0.5279, + "step": 423 + }, + { + "epoch": 0.22923049198053702, + "grad_norm": 0.3684897720813751, + "learning_rate": 4.583783783783784e-06, + "loss": 0.5373, + "step": 424 + }, + { + "epoch": 0.22977112993332133, + "grad_norm": 0.4329938292503357, + "learning_rate": 4.594594594594596e-06, + "loss": 0.5363, + "step": 425 + }, + { + "epoch": 0.2303117678861056, + "grad_norm": 0.37203264236450195, + "learning_rate": 4.605405405405406e-06, + "loss": 0.5156, + "step": 426 + }, + { + "epoch": 0.23085240583888988, + "grad_norm": 0.39848098158836365, + "learning_rate": 4.6162162162162165e-06, + "loss": 0.5154, + "step": 427 + }, + { + "epoch": 0.23139304379167416, + "grad_norm": 0.41350409388542175, + "learning_rate": 4.627027027027027e-06, + "loss": 0.5437, + "step": 428 + }, + { + "epoch": 0.23193368174445847, + "grad_norm": 0.4166233539581299, + "learning_rate": 4.637837837837838e-06, + "loss": 0.4931, + "step": 429 + }, + { + "epoch": 0.23247431969724275, + "grad_norm": 0.42275962233543396, + "learning_rate": 4.6486486486486495e-06, + "loss": 0.518, + "step": 430 + }, + { + "epoch": 0.23301495765002703, + "grad_norm": 0.4211007356643677, + "learning_rate": 4.65945945945946e-06, + "loss": 0.5162, + "step": 431 + }, + { + "epoch": 0.2335555956028113, + "grad_norm": 0.5026114583015442, + "learning_rate": 4.67027027027027e-06, + "loss": 0.5065, + "step": 432 + }, + { + "epoch": 0.2340962335555956, + "grad_norm": 0.3831497132778168, + "learning_rate": 4.681081081081082e-06, + "loss": 0.534, + "step": 433 + }, + { + "epoch": 0.2346368715083799, + "grad_norm": 0.39763692021369934, + "learning_rate": 4.691891891891892e-06, + "loss": 0.5248, + "step": 434 + }, + { + "epoch": 0.23517750946116417, + "grad_norm": 0.4102565050125122, + "learning_rate": 4.702702702702703e-06, + "loss": 0.5093, + "step": 435 + }, + { + "epoch": 0.23571814741394845, + "grad_norm": 0.4028921127319336, + "learning_rate": 4.713513513513514e-06, + "loss": 0.508, + "step": 436 + }, + { + "epoch": 0.23625878536673275, + "grad_norm": 0.42078977823257446, + "learning_rate": 4.724324324324325e-06, + "loss": 0.5275, + "step": 437 + }, + { + "epoch": 0.23679942331951703, + "grad_norm": 0.38161078095436096, + "learning_rate": 4.7351351351351355e-06, + "loss": 0.5021, + "step": 438 + }, + { + "epoch": 0.2373400612723013, + "grad_norm": 0.3946899473667145, + "learning_rate": 4.745945945945946e-06, + "loss": 0.5612, + "step": 439 + }, + { + "epoch": 0.2378806992250856, + "grad_norm": 0.36809343099594116, + "learning_rate": 4.756756756756757e-06, + "loss": 0.5318, + "step": 440 + }, + { + "epoch": 0.2384213371778699, + "grad_norm": 0.45496001839637756, + "learning_rate": 4.7675675675675685e-06, + "loss": 0.543, + "step": 441 + }, + { + "epoch": 0.23896197513065418, + "grad_norm": 0.409921258687973, + "learning_rate": 4.778378378378379e-06, + "loss": 0.5507, + "step": 442 + }, + { + "epoch": 0.23950261308343845, + "grad_norm": 0.36140987277030945, + "learning_rate": 4.789189189189189e-06, + "loss": 0.4944, + "step": 443 + }, + { + "epoch": 0.24004325103622273, + "grad_norm": 0.4025464951992035, + "learning_rate": 4.800000000000001e-06, + "loss": 0.5327, + "step": 444 + }, + { + "epoch": 0.24058388898900704, + "grad_norm": 0.43016675114631653, + "learning_rate": 4.810810810810811e-06, + "loss": 0.5118, + "step": 445 + }, + { + "epoch": 0.24112452694179132, + "grad_norm": 0.4288389980792999, + "learning_rate": 4.821621621621622e-06, + "loss": 0.5224, + "step": 446 + }, + { + "epoch": 0.2416651648945756, + "grad_norm": 0.3723931610584259, + "learning_rate": 4.832432432432433e-06, + "loss": 0.5475, + "step": 447 + }, + { + "epoch": 0.24220580284735987, + "grad_norm": 0.4410860538482666, + "learning_rate": 4.843243243243243e-06, + "loss": 0.5215, + "step": 448 + }, + { + "epoch": 0.24274644080014418, + "grad_norm": 0.3706596791744232, + "learning_rate": 4.8540540540540545e-06, + "loss": 0.5163, + "step": 449 + }, + { + "epoch": 0.24328707875292846, + "grad_norm": 0.39684516191482544, + "learning_rate": 4.864864864864866e-06, + "loss": 0.5322, + "step": 450 + }, + { + "epoch": 0.24382771670571274, + "grad_norm": 0.36360347270965576, + "learning_rate": 4.875675675675676e-06, + "loss": 0.5238, + "step": 451 + }, + { + "epoch": 0.24436835465849702, + "grad_norm": 0.39696750044822693, + "learning_rate": 4.886486486486487e-06, + "loss": 0.5106, + "step": 452 + }, + { + "epoch": 0.24490899261128132, + "grad_norm": 0.3955710828304291, + "learning_rate": 4.897297297297298e-06, + "loss": 0.5243, + "step": 453 + }, + { + "epoch": 0.2454496305640656, + "grad_norm": 0.44684091210365295, + "learning_rate": 4.908108108108108e-06, + "loss": 0.5061, + "step": 454 + }, + { + "epoch": 0.24599026851684988, + "grad_norm": 0.3783811032772064, + "learning_rate": 4.91891891891892e-06, + "loss": 0.5005, + "step": 455 + }, + { + "epoch": 0.24653090646963416, + "grad_norm": 0.37094858288764954, + "learning_rate": 4.92972972972973e-06, + "loss": 0.5342, + "step": 456 + }, + { + "epoch": 0.24707154442241847, + "grad_norm": 0.4043397307395935, + "learning_rate": 4.940540540540541e-06, + "loss": 0.5345, + "step": 457 + }, + { + "epoch": 0.24761218237520274, + "grad_norm": 0.3585631549358368, + "learning_rate": 4.951351351351352e-06, + "loss": 0.5163, + "step": 458 + }, + { + "epoch": 0.24815282032798702, + "grad_norm": 0.39285627007484436, + "learning_rate": 4.962162162162162e-06, + "loss": 0.5038, + "step": 459 + }, + { + "epoch": 0.2486934582807713, + "grad_norm": 0.4428061842918396, + "learning_rate": 4.9729729729729735e-06, + "loss": 0.523, + "step": 460 + }, + { + "epoch": 0.2492340962335556, + "grad_norm": 0.42921119928359985, + "learning_rate": 4.983783783783785e-06, + "loss": 0.5051, + "step": 461 + }, + { + "epoch": 0.2497747341863399, + "grad_norm": 0.40692782402038574, + "learning_rate": 4.994594594594595e-06, + "loss": 0.5137, + "step": 462 + }, + { + "epoch": 0.2503153721391242, + "grad_norm": 0.44367969036102295, + "learning_rate": 5.005405405405406e-06, + "loss": 0.5026, + "step": 463 + }, + { + "epoch": 0.25085601009190844, + "grad_norm": 0.42934948205947876, + "learning_rate": 5.016216216216216e-06, + "loss": 0.5215, + "step": 464 + }, + { + "epoch": 0.25139664804469275, + "grad_norm": 0.4149029552936554, + "learning_rate": 5.027027027027027e-06, + "loss": 0.5419, + "step": 465 + }, + { + "epoch": 0.251937285997477, + "grad_norm": 0.38296329975128174, + "learning_rate": 5.037837837837838e-06, + "loss": 0.4804, + "step": 466 + }, + { + "epoch": 0.2524779239502613, + "grad_norm": 0.3958582580089569, + "learning_rate": 5.048648648648648e-06, + "loss": 0.4778, + "step": 467 + }, + { + "epoch": 0.2530185619030456, + "grad_norm": 0.4397348165512085, + "learning_rate": 5.05945945945946e-06, + "loss": 0.5233, + "step": 468 + }, + { + "epoch": 0.25355919985582986, + "grad_norm": 0.3755987286567688, + "learning_rate": 5.070270270270271e-06, + "loss": 0.5157, + "step": 469 + }, + { + "epoch": 0.25409983780861417, + "grad_norm": 0.3970812261104584, + "learning_rate": 5.081081081081082e-06, + "loss": 0.5501, + "step": 470 + }, + { + "epoch": 0.2546404757613985, + "grad_norm": 0.4246158003807068, + "learning_rate": 5.0918918918918925e-06, + "loss": 0.5231, + "step": 471 + }, + { + "epoch": 0.25518111371418273, + "grad_norm": 0.4050613343715668, + "learning_rate": 5.102702702702703e-06, + "loss": 0.5035, + "step": 472 + }, + { + "epoch": 0.25572175166696703, + "grad_norm": 0.40757644176483154, + "learning_rate": 5.113513513513514e-06, + "loss": 0.5176, + "step": 473 + }, + { + "epoch": 0.2562623896197513, + "grad_norm": 0.4407704174518585, + "learning_rate": 5.124324324324325e-06, + "loss": 0.4858, + "step": 474 + }, + { + "epoch": 0.2568030275725356, + "grad_norm": 0.418594092130661, + "learning_rate": 5.135135135135135e-06, + "loss": 0.5224, + "step": 475 + }, + { + "epoch": 0.2573436655253199, + "grad_norm": 0.3590792119503021, + "learning_rate": 5.145945945945946e-06, + "loss": 0.4955, + "step": 476 + }, + { + "epoch": 0.25788430347810415, + "grad_norm": 0.42272108793258667, + "learning_rate": 5.156756756756757e-06, + "loss": 0.5289, + "step": 477 + }, + { + "epoch": 0.25842494143088846, + "grad_norm": 0.41260215640068054, + "learning_rate": 5.167567567567569e-06, + "loss": 0.5104, + "step": 478 + }, + { + "epoch": 0.25896557938367276, + "grad_norm": 0.4283524751663208, + "learning_rate": 5.178378378378379e-06, + "loss": 0.5138, + "step": 479 + }, + { + "epoch": 0.259506217336457, + "grad_norm": 0.44842174649238586, + "learning_rate": 5.18918918918919e-06, + "loss": 0.5199, + "step": 480 + }, + { + "epoch": 0.2600468552892413, + "grad_norm": 0.41024303436279297, + "learning_rate": 5.2e-06, + "loss": 0.5197, + "step": 481 + }, + { + "epoch": 0.26058749324202557, + "grad_norm": 0.41243913769721985, + "learning_rate": 5.2108108108108115e-06, + "loss": 0.5571, + "step": 482 + }, + { + "epoch": 0.2611281311948099, + "grad_norm": 0.48198091983795166, + "learning_rate": 5.221621621621622e-06, + "loss": 0.5142, + "step": 483 + }, + { + "epoch": 0.2616687691475942, + "grad_norm": 0.43659496307373047, + "learning_rate": 5.232432432432432e-06, + "loss": 0.5217, + "step": 484 + }, + { + "epoch": 0.26220940710037843, + "grad_norm": 0.4106965959072113, + "learning_rate": 5.243243243243244e-06, + "loss": 0.5165, + "step": 485 + }, + { + "epoch": 0.26275004505316274, + "grad_norm": 0.3779434561729431, + "learning_rate": 5.254054054054054e-06, + "loss": 0.542, + "step": 486 + }, + { + "epoch": 0.263290683005947, + "grad_norm": 0.4298580288887024, + "learning_rate": 5.2648648648648645e-06, + "loss": 0.5366, + "step": 487 + }, + { + "epoch": 0.2638313209587313, + "grad_norm": 0.41386786103248596, + "learning_rate": 5.275675675675677e-06, + "loss": 0.5227, + "step": 488 + }, + { + "epoch": 0.2643719589115156, + "grad_norm": 0.36224329471588135, + "learning_rate": 5.286486486486487e-06, + "loss": 0.4809, + "step": 489 + }, + { + "epoch": 0.26491259686429985, + "grad_norm": 0.41032809019088745, + "learning_rate": 5.297297297297298e-06, + "loss": 0.4871, + "step": 490 + }, + { + "epoch": 0.26545323481708416, + "grad_norm": 0.4151748716831207, + "learning_rate": 5.308108108108109e-06, + "loss": 0.5274, + "step": 491 + }, + { + "epoch": 0.26599387276986847, + "grad_norm": 0.45982950925827026, + "learning_rate": 5.318918918918919e-06, + "loss": 0.5358, + "step": 492 + }, + { + "epoch": 0.2665345107226527, + "grad_norm": 0.43396931886672974, + "learning_rate": 5.3297297297297305e-06, + "loss": 0.5012, + "step": 493 + }, + { + "epoch": 0.267075148675437, + "grad_norm": 0.4250200688838959, + "learning_rate": 5.340540540540541e-06, + "loss": 0.4813, + "step": 494 + }, + { + "epoch": 0.2676157866282213, + "grad_norm": 0.43627679347991943, + "learning_rate": 5.351351351351351e-06, + "loss": 0.5231, + "step": 495 + }, + { + "epoch": 0.2681564245810056, + "grad_norm": 0.4634500741958618, + "learning_rate": 5.362162162162163e-06, + "loss": 0.539, + "step": 496 + }, + { + "epoch": 0.2686970625337899, + "grad_norm": 0.39024657011032104, + "learning_rate": 5.372972972972973e-06, + "loss": 0.5073, + "step": 497 + }, + { + "epoch": 0.26923770048657414, + "grad_norm": 0.5674286484718323, + "learning_rate": 5.383783783783785e-06, + "loss": 0.5221, + "step": 498 + }, + { + "epoch": 0.26977833843935844, + "grad_norm": 0.37081941962242126, + "learning_rate": 5.394594594594596e-06, + "loss": 0.5141, + "step": 499 + }, + { + "epoch": 0.27031897639214275, + "grad_norm": 0.4416964650154114, + "learning_rate": 5.405405405405406e-06, + "loss": 0.5246, + "step": 500 + }, + { + "epoch": 0.270859614344927, + "grad_norm": 0.4249023199081421, + "learning_rate": 5.4162162162162165e-06, + "loss": 0.5269, + "step": 501 + }, + { + "epoch": 0.2714002522977113, + "grad_norm": 0.40850886702537537, + "learning_rate": 5.427027027027028e-06, + "loss": 0.5444, + "step": 502 + }, + { + "epoch": 0.27194089025049556, + "grad_norm": 0.45538514852523804, + "learning_rate": 5.437837837837838e-06, + "loss": 0.5308, + "step": 503 + }, + { + "epoch": 0.27248152820327987, + "grad_norm": 0.4299362301826477, + "learning_rate": 5.448648648648649e-06, + "loss": 0.4999, + "step": 504 + }, + { + "epoch": 0.27302216615606417, + "grad_norm": 0.36193957924842834, + "learning_rate": 5.45945945945946e-06, + "loss": 0.4833, + "step": 505 + }, + { + "epoch": 0.2735628041088484, + "grad_norm": 0.4897722005844116, + "learning_rate": 5.47027027027027e-06, + "loss": 0.5118, + "step": 506 + }, + { + "epoch": 0.27410344206163273, + "grad_norm": 0.4764401614665985, + "learning_rate": 5.481081081081081e-06, + "loss": 0.5213, + "step": 507 + }, + { + "epoch": 0.27464408001441704, + "grad_norm": 0.42601868510246277, + "learning_rate": 5.491891891891893e-06, + "loss": 0.4803, + "step": 508 + }, + { + "epoch": 0.2751847179672013, + "grad_norm": 0.46341875195503235, + "learning_rate": 5.502702702702703e-06, + "loss": 0.5192, + "step": 509 + }, + { + "epoch": 0.2757253559199856, + "grad_norm": 0.4102022349834442, + "learning_rate": 5.513513513513515e-06, + "loss": 0.5356, + "step": 510 + }, + { + "epoch": 0.27626599387276984, + "grad_norm": 0.3962627649307251, + "learning_rate": 5.524324324324325e-06, + "loss": 0.5035, + "step": 511 + }, + { + "epoch": 0.27680663182555415, + "grad_norm": 0.4694930613040924, + "learning_rate": 5.5351351351351355e-06, + "loss": 0.4978, + "step": 512 + }, + { + "epoch": 0.27734726977833846, + "grad_norm": 0.37531691789627075, + "learning_rate": 5.545945945945947e-06, + "loss": 0.4892, + "step": 513 + }, + { + "epoch": 0.2778879077311227, + "grad_norm": 0.4179112911224365, + "learning_rate": 5.556756756756757e-06, + "loss": 0.5121, + "step": 514 + }, + { + "epoch": 0.278428545683907, + "grad_norm": 0.49263471364974976, + "learning_rate": 5.567567567567568e-06, + "loss": 0.55, + "step": 515 + }, + { + "epoch": 0.2789691836366913, + "grad_norm": 0.38340288400650024, + "learning_rate": 5.578378378378379e-06, + "loss": 0.5081, + "step": 516 + }, + { + "epoch": 0.27950982158947557, + "grad_norm": 0.41528379917144775, + "learning_rate": 5.589189189189189e-06, + "loss": 0.5343, + "step": 517 + }, + { + "epoch": 0.2800504595422599, + "grad_norm": 0.45989990234375, + "learning_rate": 5.600000000000001e-06, + "loss": 0.4923, + "step": 518 + }, + { + "epoch": 0.28059109749504413, + "grad_norm": 0.4144502580165863, + "learning_rate": 5.610810810810812e-06, + "loss": 0.498, + "step": 519 + }, + { + "epoch": 0.28113173544782843, + "grad_norm": 0.4108593463897705, + "learning_rate": 5.621621621621622e-06, + "loss": 0.5354, + "step": 520 + }, + { + "epoch": 0.28167237340061274, + "grad_norm": 0.4681094288825989, + "learning_rate": 5.632432432432433e-06, + "loss": 0.5247, + "step": 521 + }, + { + "epoch": 0.282213011353397, + "grad_norm": 0.42972666025161743, + "learning_rate": 5.643243243243244e-06, + "loss": 0.5047, + "step": 522 + }, + { + "epoch": 0.2827536493061813, + "grad_norm": 0.39277395606040955, + "learning_rate": 5.6540540540540545e-06, + "loss": 0.5076, + "step": 523 + }, + { + "epoch": 0.2832942872589656, + "grad_norm": 0.3716997802257538, + "learning_rate": 5.664864864864865e-06, + "loss": 0.5112, + "step": 524 + }, + { + "epoch": 0.28383492521174986, + "grad_norm": 0.45764875411987305, + "learning_rate": 5.675675675675676e-06, + "loss": 0.5132, + "step": 525 + }, + { + "epoch": 0.28437556316453416, + "grad_norm": 0.3713022470474243, + "learning_rate": 5.686486486486487e-06, + "loss": 0.5225, + "step": 526 + }, + { + "epoch": 0.2849162011173184, + "grad_norm": 0.38339367508888245, + "learning_rate": 5.697297297297297e-06, + "loss": 0.5333, + "step": 527 + }, + { + "epoch": 0.2854568390701027, + "grad_norm": 0.44646668434143066, + "learning_rate": 5.708108108108109e-06, + "loss": 0.5145, + "step": 528 + }, + { + "epoch": 0.285997477022887, + "grad_norm": 0.38393041491508484, + "learning_rate": 5.71891891891892e-06, + "loss": 0.5363, + "step": 529 + }, + { + "epoch": 0.2865381149756713, + "grad_norm": 0.43032675981521606, + "learning_rate": 5.729729729729731e-06, + "loss": 0.5306, + "step": 530 + }, + { + "epoch": 0.2870787529284556, + "grad_norm": 0.39705681800842285, + "learning_rate": 5.740540540540541e-06, + "loss": 0.5062, + "step": 531 + }, + { + "epoch": 0.2876193908812399, + "grad_norm": 0.3787713050842285, + "learning_rate": 5.751351351351352e-06, + "loss": 0.5135, + "step": 532 + }, + { + "epoch": 0.28816002883402414, + "grad_norm": 0.4790848195552826, + "learning_rate": 5.762162162162163e-06, + "loss": 0.5115, + "step": 533 + }, + { + "epoch": 0.28870066678680845, + "grad_norm": 0.41118842363357544, + "learning_rate": 5.7729729729729734e-06, + "loss": 0.5205, + "step": 534 + }, + { + "epoch": 0.2892413047395927, + "grad_norm": 0.43289870023727417, + "learning_rate": 5.783783783783784e-06, + "loss": 0.4883, + "step": 535 + }, + { + "epoch": 0.289781942692377, + "grad_norm": 0.4232019782066345, + "learning_rate": 5.794594594594594e-06, + "loss": 0.49, + "step": 536 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 0.4845992922782898, + "learning_rate": 5.805405405405406e-06, + "loss": 0.5161, + "step": 537 + }, + { + "epoch": 0.29086321859794556, + "grad_norm": 0.4551534354686737, + "learning_rate": 5.816216216216217e-06, + "loss": 0.4807, + "step": 538 + }, + { + "epoch": 0.29140385655072987, + "grad_norm": 0.4931628704071045, + "learning_rate": 5.827027027027028e-06, + "loss": 0.5086, + "step": 539 + }, + { + "epoch": 0.2919444945035142, + "grad_norm": 0.4340893626213074, + "learning_rate": 5.837837837837839e-06, + "loss": 0.5285, + "step": 540 + }, + { + "epoch": 0.2924851324562984, + "grad_norm": 0.43022266030311584, + "learning_rate": 5.848648648648649e-06, + "loss": 0.5156, + "step": 541 + }, + { + "epoch": 0.29302577040908273, + "grad_norm": 0.425538569688797, + "learning_rate": 5.85945945945946e-06, + "loss": 0.4961, + "step": 542 + }, + { + "epoch": 0.293566408361867, + "grad_norm": 0.3988337218761444, + "learning_rate": 5.870270270270271e-06, + "loss": 0.511, + "step": 543 + }, + { + "epoch": 0.2941070463146513, + "grad_norm": 0.4838657081127167, + "learning_rate": 5.881081081081081e-06, + "loss": 0.5001, + "step": 544 + }, + { + "epoch": 0.2946476842674356, + "grad_norm": 0.3641073703765869, + "learning_rate": 5.8918918918918924e-06, + "loss": 0.5078, + "step": 545 + }, + { + "epoch": 0.29518832222021985, + "grad_norm": 0.4349755644798279, + "learning_rate": 5.902702702702703e-06, + "loss": 0.508, + "step": 546 + }, + { + "epoch": 0.29572896017300415, + "grad_norm": 0.4596833884716034, + "learning_rate": 5.913513513513513e-06, + "loss": 0.5187, + "step": 547 + }, + { + "epoch": 0.29626959812578846, + "grad_norm": 0.38194069266319275, + "learning_rate": 5.9243243243243254e-06, + "loss": 0.5192, + "step": 548 + }, + { + "epoch": 0.2968102360785727, + "grad_norm": 0.46379154920578003, + "learning_rate": 5.935135135135136e-06, + "loss": 0.5088, + "step": 549 + }, + { + "epoch": 0.297350874031357, + "grad_norm": 0.39888012409210205, + "learning_rate": 5.945945945945947e-06, + "loss": 0.502, + "step": 550 + }, + { + "epoch": 0.29789151198414127, + "grad_norm": 0.4444519877433777, + "learning_rate": 5.9567567567567576e-06, + "loss": 0.5161, + "step": 551 + }, + { + "epoch": 0.2984321499369256, + "grad_norm": 0.39441052079200745, + "learning_rate": 5.967567567567568e-06, + "loss": 0.5412, + "step": 552 + }, + { + "epoch": 0.2989727878897099, + "grad_norm": 0.38844984769821167, + "learning_rate": 5.978378378378379e-06, + "loss": 0.5206, + "step": 553 + }, + { + "epoch": 0.29951342584249413, + "grad_norm": 0.4147244989871979, + "learning_rate": 5.98918918918919e-06, + "loss": 0.4934, + "step": 554 + }, + { + "epoch": 0.30005406379527844, + "grad_norm": 0.4128243327140808, + "learning_rate": 6e-06, + "loss": 0.5326, + "step": 555 + }, + { + "epoch": 0.3005947017480627, + "grad_norm": 0.42865583300590515, + "learning_rate": 6.010810810810811e-06, + "loss": 0.5191, + "step": 556 + }, + { + "epoch": 0.301135339700847, + "grad_norm": 0.4863196015357971, + "learning_rate": 6.021621621621622e-06, + "loss": 0.5329, + "step": 557 + }, + { + "epoch": 0.3016759776536313, + "grad_norm": 0.4681631326675415, + "learning_rate": 6.032432432432432e-06, + "loss": 0.5294, + "step": 558 + }, + { + "epoch": 0.30221661560641555, + "grad_norm": 0.43204593658447266, + "learning_rate": 6.043243243243244e-06, + "loss": 0.4957, + "step": 559 + }, + { + "epoch": 0.30275725355919986, + "grad_norm": 0.4105343520641327, + "learning_rate": 6.054054054054055e-06, + "loss": 0.5097, + "step": 560 + }, + { + "epoch": 0.30329789151198416, + "grad_norm": 0.47166261076927185, + "learning_rate": 6.064864864864865e-06, + "loss": 0.475, + "step": 561 + }, + { + "epoch": 0.3038385294647684, + "grad_norm": 0.3812154531478882, + "learning_rate": 6.0756756756756766e-06, + "loss": 0.5167, + "step": 562 + }, + { + "epoch": 0.3043791674175527, + "grad_norm": 0.47877979278564453, + "learning_rate": 6.086486486486487e-06, + "loss": 0.5109, + "step": 563 + }, + { + "epoch": 0.30491980537033697, + "grad_norm": 0.4922278821468353, + "learning_rate": 6.0972972972972974e-06, + "loss": 0.5142, + "step": 564 + }, + { + "epoch": 0.3054604433231213, + "grad_norm": 0.4220641851425171, + "learning_rate": 6.108108108108109e-06, + "loss": 0.5106, + "step": 565 + }, + { + "epoch": 0.3060010812759056, + "grad_norm": 0.45131972432136536, + "learning_rate": 6.118918918918919e-06, + "loss": 0.503, + "step": 566 + }, + { + "epoch": 0.30654171922868984, + "grad_norm": 0.4449264407157898, + "learning_rate": 6.12972972972973e-06, + "loss": 0.524, + "step": 567 + }, + { + "epoch": 0.30708235718147414, + "grad_norm": 0.4497717618942261, + "learning_rate": 6.140540540540541e-06, + "loss": 0.4707, + "step": 568 + }, + { + "epoch": 0.30762299513425845, + "grad_norm": 0.41556915640830994, + "learning_rate": 6.151351351351352e-06, + "loss": 0.5225, + "step": 569 + }, + { + "epoch": 0.3081636330870427, + "grad_norm": 0.4636322855949402, + "learning_rate": 6.162162162162163e-06, + "loss": 0.5551, + "step": 570 + }, + { + "epoch": 0.308704271039827, + "grad_norm": 0.43292292952537537, + "learning_rate": 6.172972972972974e-06, + "loss": 0.5122, + "step": 571 + }, + { + "epoch": 0.30924490899261126, + "grad_norm": 0.41534942388534546, + "learning_rate": 6.183783783783784e-06, + "loss": 0.5409, + "step": 572 + }, + { + "epoch": 0.30978554694539556, + "grad_norm": 0.511157214641571, + "learning_rate": 6.194594594594595e-06, + "loss": 0.5344, + "step": 573 + }, + { + "epoch": 0.31032618489817987, + "grad_norm": 0.3864719271659851, + "learning_rate": 6.205405405405406e-06, + "loss": 0.5174, + "step": 574 + }, + { + "epoch": 0.3108668228509641, + "grad_norm": 0.497886061668396, + "learning_rate": 6.2162162162162164e-06, + "loss": 0.5055, + "step": 575 + }, + { + "epoch": 0.3114074608037484, + "grad_norm": 0.4637546241283417, + "learning_rate": 6.227027027027027e-06, + "loss": 0.5136, + "step": 576 + }, + { + "epoch": 0.31194809875653273, + "grad_norm": 0.3968018591403961, + "learning_rate": 6.237837837837838e-06, + "loss": 0.4929, + "step": 577 + }, + { + "epoch": 0.312488736709317, + "grad_norm": 0.425836443901062, + "learning_rate": 6.2486486486486486e-06, + "loss": 0.5039, + "step": 578 + }, + { + "epoch": 0.3130293746621013, + "grad_norm": 0.4729524850845337, + "learning_rate": 6.259459459459461e-06, + "loss": 0.5123, + "step": 579 + }, + { + "epoch": 0.31357001261488554, + "grad_norm": 0.39072057604789734, + "learning_rate": 6.270270270270271e-06, + "loss": 0.5353, + "step": 580 + }, + { + "epoch": 0.31411065056766985, + "grad_norm": 0.42268452048301697, + "learning_rate": 6.2810810810810816e-06, + "loss": 0.4989, + "step": 581 + }, + { + "epoch": 0.31465128852045415, + "grad_norm": 0.4206162989139557, + "learning_rate": 6.291891891891893e-06, + "loss": 0.4921, + "step": 582 + }, + { + "epoch": 0.3151919264732384, + "grad_norm": 0.432198166847229, + "learning_rate": 6.302702702702703e-06, + "loss": 0.5086, + "step": 583 + }, + { + "epoch": 0.3157325644260227, + "grad_norm": 0.4336259663105011, + "learning_rate": 6.313513513513514e-06, + "loss": 0.5164, + "step": 584 + }, + { + "epoch": 0.316273202378807, + "grad_norm": 0.40399715304374695, + "learning_rate": 6.324324324324325e-06, + "loss": 0.5118, + "step": 585 + }, + { + "epoch": 0.31681384033159127, + "grad_norm": 0.4406914710998535, + "learning_rate": 6.335135135135135e-06, + "loss": 0.5025, + "step": 586 + }, + { + "epoch": 0.3173544782843756, + "grad_norm": 0.39285317063331604, + "learning_rate": 6.345945945945946e-06, + "loss": 0.4876, + "step": 587 + }, + { + "epoch": 0.3178951162371598, + "grad_norm": 0.3913812041282654, + "learning_rate": 6.356756756756757e-06, + "loss": 0.51, + "step": 588 + }, + { + "epoch": 0.31843575418994413, + "grad_norm": 0.4313332736492157, + "learning_rate": 6.367567567567568e-06, + "loss": 0.5002, + "step": 589 + }, + { + "epoch": 0.31897639214272844, + "grad_norm": 0.43284839391708374, + "learning_rate": 6.378378378378379e-06, + "loss": 0.5064, + "step": 590 + }, + { + "epoch": 0.3195170300955127, + "grad_norm": 0.46096357703208923, + "learning_rate": 6.38918918918919e-06, + "loss": 0.5427, + "step": 591 + }, + { + "epoch": 0.320057668048297, + "grad_norm": 0.381386399269104, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.525, + "step": 592 + }, + { + "epoch": 0.3205983060010813, + "grad_norm": 0.3691636025905609, + "learning_rate": 6.410810810810811e-06, + "loss": 0.5181, + "step": 593 + }, + { + "epoch": 0.32113894395386555, + "grad_norm": 0.43879830837249756, + "learning_rate": 6.421621621621622e-06, + "loss": 0.5185, + "step": 594 + }, + { + "epoch": 0.32167958190664986, + "grad_norm": 0.39650213718414307, + "learning_rate": 6.432432432432433e-06, + "loss": 0.5095, + "step": 595 + }, + { + "epoch": 0.3222202198594341, + "grad_norm": 0.4146016538143158, + "learning_rate": 6.443243243243243e-06, + "loss": 0.4853, + "step": 596 + }, + { + "epoch": 0.3227608578122184, + "grad_norm": 0.45325127243995667, + "learning_rate": 6.454054054054054e-06, + "loss": 0.5022, + "step": 597 + }, + { + "epoch": 0.3233014957650027, + "grad_norm": 0.4641883373260498, + "learning_rate": 6.464864864864865e-06, + "loss": 0.4808, + "step": 598 + }, + { + "epoch": 0.323842133717787, + "grad_norm": 0.47337257862091064, + "learning_rate": 6.475675675675677e-06, + "loss": 0.4783, + "step": 599 + }, + { + "epoch": 0.3243827716705713, + "grad_norm": 0.43825823068618774, + "learning_rate": 6.486486486486487e-06, + "loss": 0.4901, + "step": 600 + }, + { + "epoch": 0.3249234096233556, + "grad_norm": 0.45925870537757874, + "learning_rate": 6.497297297297298e-06, + "loss": 0.5031, + "step": 601 + }, + { + "epoch": 0.32546404757613984, + "grad_norm": 0.3915764391422272, + "learning_rate": 6.508108108108109e-06, + "loss": 0.5013, + "step": 602 + }, + { + "epoch": 0.32600468552892414, + "grad_norm": 0.4049145579338074, + "learning_rate": 6.5189189189189196e-06, + "loss": 0.5256, + "step": 603 + }, + { + "epoch": 0.3265453234817084, + "grad_norm": 0.4432182312011719, + "learning_rate": 6.52972972972973e-06, + "loss": 0.5022, + "step": 604 + }, + { + "epoch": 0.3270859614344927, + "grad_norm": 0.406897634267807, + "learning_rate": 6.540540540540541e-06, + "loss": 0.4814, + "step": 605 + }, + { + "epoch": 0.327626599387277, + "grad_norm": 0.42340806126594543, + "learning_rate": 6.551351351351352e-06, + "loss": 0.4999, + "step": 606 + }, + { + "epoch": 0.32816723734006126, + "grad_norm": 0.3778286874294281, + "learning_rate": 6.562162162162162e-06, + "loss": 0.4939, + "step": 607 + }, + { + "epoch": 0.32870787529284556, + "grad_norm": 0.4262266159057617, + "learning_rate": 6.572972972972973e-06, + "loss": 0.5157, + "step": 608 + }, + { + "epoch": 0.32924851324562987, + "grad_norm": 0.36879613995552063, + "learning_rate": 6.583783783783785e-06, + "loss": 0.4924, + "step": 609 + }, + { + "epoch": 0.3297891511984141, + "grad_norm": 0.4040409028530121, + "learning_rate": 6.594594594594595e-06, + "loss": 0.5092, + "step": 610 + }, + { + "epoch": 0.3303297891511984, + "grad_norm": 0.46349969506263733, + "learning_rate": 6.605405405405406e-06, + "loss": 0.5096, + "step": 611 + }, + { + "epoch": 0.3308704271039827, + "grad_norm": 0.38087141513824463, + "learning_rate": 6.616216216216217e-06, + "loss": 0.4965, + "step": 612 + }, + { + "epoch": 0.331411065056767, + "grad_norm": 0.46425509452819824, + "learning_rate": 6.627027027027027e-06, + "loss": 0.5177, + "step": 613 + }, + { + "epoch": 0.3319517030095513, + "grad_norm": 0.3941066563129425, + "learning_rate": 6.6378378378378385e-06, + "loss": 0.5098, + "step": 614 + }, + { + "epoch": 0.33249234096233554, + "grad_norm": 0.448647141456604, + "learning_rate": 6.648648648648649e-06, + "loss": 0.5162, + "step": 615 + }, + { + "epoch": 0.33303297891511985, + "grad_norm": 0.4808944761753082, + "learning_rate": 6.659459459459459e-06, + "loss": 0.5126, + "step": 616 + }, + { + "epoch": 0.3335736168679041, + "grad_norm": 0.3767862021923065, + "learning_rate": 6.670270270270271e-06, + "loss": 0.4942, + "step": 617 + }, + { + "epoch": 0.3341142548206884, + "grad_norm": 0.45056983828544617, + "learning_rate": 6.681081081081081e-06, + "loss": 0.5181, + "step": 618 + }, + { + "epoch": 0.3346548927734727, + "grad_norm": 0.4133990705013275, + "learning_rate": 6.691891891891893e-06, + "loss": 0.5102, + "step": 619 + }, + { + "epoch": 0.33519553072625696, + "grad_norm": 0.4201817512512207, + "learning_rate": 6.702702702702704e-06, + "loss": 0.5051, + "step": 620 + }, + { + "epoch": 0.33573616867904127, + "grad_norm": 0.42155951261520386, + "learning_rate": 6.713513513513514e-06, + "loss": 0.5313, + "step": 621 + }, + { + "epoch": 0.3362768066318256, + "grad_norm": 0.4288482367992401, + "learning_rate": 6.724324324324325e-06, + "loss": 0.5151, + "step": 622 + }, + { + "epoch": 0.3368174445846098, + "grad_norm": 0.4295049011707306, + "learning_rate": 6.735135135135136e-06, + "loss": 0.5024, + "step": 623 + }, + { + "epoch": 0.33735808253739413, + "grad_norm": 0.49055394530296326, + "learning_rate": 6.745945945945946e-06, + "loss": 0.4825, + "step": 624 + }, + { + "epoch": 0.3378987204901784, + "grad_norm": 0.448925644159317, + "learning_rate": 6.7567567567567575e-06, + "loss": 0.5031, + "step": 625 + }, + { + "epoch": 0.3384393584429627, + "grad_norm": 0.4884868562221527, + "learning_rate": 6.767567567567568e-06, + "loss": 0.5085, + "step": 626 + }, + { + "epoch": 0.338979996395747, + "grad_norm": 0.42473411560058594, + "learning_rate": 6.778378378378378e-06, + "loss": 0.5232, + "step": 627 + }, + { + "epoch": 0.33952063434853125, + "grad_norm": 0.41970014572143555, + "learning_rate": 6.789189189189189e-06, + "loss": 0.486, + "step": 628 + }, + { + "epoch": 0.34006127230131555, + "grad_norm": 0.5064523220062256, + "learning_rate": 6.800000000000001e-06, + "loss": 0.5403, + "step": 629 + }, + { + "epoch": 0.34060191025409986, + "grad_norm": 0.4276356101036072, + "learning_rate": 6.810810810810811e-06, + "loss": 0.5248, + "step": 630 + }, + { + "epoch": 0.3411425482068841, + "grad_norm": 0.42455577850341797, + "learning_rate": 6.821621621621623e-06, + "loss": 0.493, + "step": 631 + }, + { + "epoch": 0.3416831861596684, + "grad_norm": 0.41362032294273376, + "learning_rate": 6.832432432432433e-06, + "loss": 0.5029, + "step": 632 + }, + { + "epoch": 0.34222382411245267, + "grad_norm": 0.4547756016254425, + "learning_rate": 6.8432432432432435e-06, + "loss": 0.5121, + "step": 633 + }, + { + "epoch": 0.342764462065237, + "grad_norm": 0.4629051387310028, + "learning_rate": 6.854054054054055e-06, + "loss": 0.5363, + "step": 634 + }, + { + "epoch": 0.3433051000180213, + "grad_norm": 0.40793025493621826, + "learning_rate": 6.864864864864865e-06, + "loss": 0.5012, + "step": 635 + }, + { + "epoch": 0.34384573797080553, + "grad_norm": 0.4819142520427704, + "learning_rate": 6.875675675675676e-06, + "loss": 0.5064, + "step": 636 + }, + { + "epoch": 0.34438637592358984, + "grad_norm": 0.38700756430625916, + "learning_rate": 6.886486486486487e-06, + "loss": 0.4911, + "step": 637 + }, + { + "epoch": 0.34492701387637414, + "grad_norm": 0.39876338839530945, + "learning_rate": 6.897297297297297e-06, + "loss": 0.4797, + "step": 638 + }, + { + "epoch": 0.3454676518291584, + "grad_norm": 0.4039168953895569, + "learning_rate": 6.9081081081081095e-06, + "loss": 0.5161, + "step": 639 + }, + { + "epoch": 0.3460082897819427, + "grad_norm": 0.38275012373924255, + "learning_rate": 6.91891891891892e-06, + "loss": 0.4998, + "step": 640 + }, + { + "epoch": 0.34654892773472695, + "grad_norm": 0.3935031294822693, + "learning_rate": 6.92972972972973e-06, + "loss": 0.4834, + "step": 641 + }, + { + "epoch": 0.34708956568751126, + "grad_norm": 0.39518025517463684, + "learning_rate": 6.940540540540542e-06, + "loss": 0.5069, + "step": 642 + }, + { + "epoch": 0.34763020364029557, + "grad_norm": 0.43953222036361694, + "learning_rate": 6.951351351351352e-06, + "loss": 0.5199, + "step": 643 + }, + { + "epoch": 0.3481708415930798, + "grad_norm": 0.37210172414779663, + "learning_rate": 6.9621621621621625e-06, + "loss": 0.4859, + "step": 644 + }, + { + "epoch": 0.3487114795458641, + "grad_norm": 0.394963800907135, + "learning_rate": 6.972972972972973e-06, + "loss": 0.5049, + "step": 645 + }, + { + "epoch": 0.34925211749864843, + "grad_norm": 0.3912118971347809, + "learning_rate": 6.983783783783784e-06, + "loss": 0.5092, + "step": 646 + }, + { + "epoch": 0.3497927554514327, + "grad_norm": 0.35926076769828796, + "learning_rate": 6.994594594594595e-06, + "loss": 0.4674, + "step": 647 + }, + { + "epoch": 0.350333393404217, + "grad_norm": 0.41905924677848816, + "learning_rate": 7.005405405405405e-06, + "loss": 0.5523, + "step": 648 + }, + { + "epoch": 0.35087403135700124, + "grad_norm": 0.4475938677787781, + "learning_rate": 7.016216216216217e-06, + "loss": 0.4922, + "step": 649 + }, + { + "epoch": 0.35141466930978554, + "grad_norm": 0.41779378056526184, + "learning_rate": 7.027027027027028e-06, + "loss": 0.48, + "step": 650 + }, + { + "epoch": 0.35195530726256985, + "grad_norm": 0.514970600605011, + "learning_rate": 7.037837837837839e-06, + "loss": 0.4898, + "step": 651 + }, + { + "epoch": 0.3524959452153541, + "grad_norm": 0.4767131209373474, + "learning_rate": 7.048648648648649e-06, + "loss": 0.4883, + "step": 652 + }, + { + "epoch": 0.3530365831681384, + "grad_norm": 0.41229724884033203, + "learning_rate": 7.05945945945946e-06, + "loss": 0.5173, + "step": 653 + }, + { + "epoch": 0.3535772211209227, + "grad_norm": 0.5163008570671082, + "learning_rate": 7.070270270270271e-06, + "loss": 0.5112, + "step": 654 + }, + { + "epoch": 0.35411785907370696, + "grad_norm": 0.4336077570915222, + "learning_rate": 7.0810810810810815e-06, + "loss": 0.4909, + "step": 655 + }, + { + "epoch": 0.35465849702649127, + "grad_norm": 0.427859902381897, + "learning_rate": 7.091891891891892e-06, + "loss": 0.505, + "step": 656 + }, + { + "epoch": 0.3551991349792755, + "grad_norm": 0.5053046941757202, + "learning_rate": 7.102702702702703e-06, + "loss": 0.5302, + "step": 657 + }, + { + "epoch": 0.35573977293205983, + "grad_norm": 0.45524269342422485, + "learning_rate": 7.113513513513514e-06, + "loss": 0.5067, + "step": 658 + }, + { + "epoch": 0.35628041088484413, + "grad_norm": 0.4407271444797516, + "learning_rate": 7.124324324324326e-06, + "loss": 0.4893, + "step": 659 + }, + { + "epoch": 0.3568210488376284, + "grad_norm": 0.6443756222724915, + "learning_rate": 7.135135135135136e-06, + "loss": 0.5403, + "step": 660 + }, + { + "epoch": 0.3573616867904127, + "grad_norm": 0.4299008548259735, + "learning_rate": 7.145945945945947e-06, + "loss": 0.5098, + "step": 661 + }, + { + "epoch": 0.357902324743197, + "grad_norm": 0.5455183386802673, + "learning_rate": 7.156756756756757e-06, + "loss": 0.5076, + "step": 662 + }, + { + "epoch": 0.35844296269598125, + "grad_norm": 0.44949883222579956, + "learning_rate": 7.167567567567568e-06, + "loss": 0.4943, + "step": 663 + }, + { + "epoch": 0.35898360064876556, + "grad_norm": 0.5578931570053101, + "learning_rate": 7.178378378378379e-06, + "loss": 0.5043, + "step": 664 + }, + { + "epoch": 0.3595242386015498, + "grad_norm": 0.5532004833221436, + "learning_rate": 7.189189189189189e-06, + "loss": 0.5407, + "step": 665 + }, + { + "epoch": 0.3600648765543341, + "grad_norm": 0.5385209321975708, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.5239, + "step": 666 + }, + { + "epoch": 0.3606055145071184, + "grad_norm": 0.4979776442050934, + "learning_rate": 7.210810810810811e-06, + "loss": 0.4752, + "step": 667 + }, + { + "epoch": 0.36114615245990267, + "grad_norm": 0.4731312692165375, + "learning_rate": 7.221621621621621e-06, + "loss": 0.5116, + "step": 668 + }, + { + "epoch": 0.361686790412687, + "grad_norm": 0.40968212485313416, + "learning_rate": 7.2324324324324335e-06, + "loss": 0.5132, + "step": 669 + }, + { + "epoch": 0.3622274283654713, + "grad_norm": 0.42712947726249695, + "learning_rate": 7.243243243243244e-06, + "loss": 0.4824, + "step": 670 + }, + { + "epoch": 0.36276806631825553, + "grad_norm": 0.4217482805252075, + "learning_rate": 7.254054054054055e-06, + "loss": 0.5044, + "step": 671 + }, + { + "epoch": 0.36330870427103984, + "grad_norm": 0.38178786635398865, + "learning_rate": 7.264864864864866e-06, + "loss": 0.4892, + "step": 672 + }, + { + "epoch": 0.3638493422238241, + "grad_norm": 0.3964153230190277, + "learning_rate": 7.275675675675676e-06, + "loss": 0.5067, + "step": 673 + }, + { + "epoch": 0.3643899801766084, + "grad_norm": 0.40324804186820984, + "learning_rate": 7.286486486486487e-06, + "loss": 0.5193, + "step": 674 + }, + { + "epoch": 0.3649306181293927, + "grad_norm": 0.42960992455482483, + "learning_rate": 7.297297297297298e-06, + "loss": 0.5044, + "step": 675 + }, + { + "epoch": 0.36547125608217695, + "grad_norm": 0.3774147629737854, + "learning_rate": 7.308108108108108e-06, + "loss": 0.4853, + "step": 676 + }, + { + "epoch": 0.36601189403496126, + "grad_norm": 0.4166586399078369, + "learning_rate": 7.3189189189189195e-06, + "loss": 0.4982, + "step": 677 + }, + { + "epoch": 0.36655253198774557, + "grad_norm": 0.4181104004383087, + "learning_rate": 7.32972972972973e-06, + "loss": 0.5031, + "step": 678 + }, + { + "epoch": 0.3670931699405298, + "grad_norm": 0.4069172143936157, + "learning_rate": 7.340540540540542e-06, + "loss": 0.4984, + "step": 679 + }, + { + "epoch": 0.3676338078933141, + "grad_norm": 0.3852164149284363, + "learning_rate": 7.3513513513513525e-06, + "loss": 0.4852, + "step": 680 + }, + { + "epoch": 0.3681744458460984, + "grad_norm": 0.41952404379844666, + "learning_rate": 7.362162162162163e-06, + "loss": 0.5073, + "step": 681 + }, + { + "epoch": 0.3687150837988827, + "grad_norm": 0.41212549805641174, + "learning_rate": 7.372972972972973e-06, + "loss": 0.5102, + "step": 682 + }, + { + "epoch": 0.369255721751667, + "grad_norm": 0.4410938322544098, + "learning_rate": 7.383783783783785e-06, + "loss": 0.5301, + "step": 683 + }, + { + "epoch": 0.36979635970445124, + "grad_norm": 0.41383302211761475, + "learning_rate": 7.394594594594595e-06, + "loss": 0.4985, + "step": 684 + }, + { + "epoch": 0.37033699765723554, + "grad_norm": 0.4224023222923279, + "learning_rate": 7.4054054054054055e-06, + "loss": 0.508, + "step": 685 + }, + { + "epoch": 0.3708776356100198, + "grad_norm": 0.4155189096927643, + "learning_rate": 7.416216216216217e-06, + "loss": 0.5059, + "step": 686 + }, + { + "epoch": 0.3714182735628041, + "grad_norm": 0.4235122501850128, + "learning_rate": 7.427027027027027e-06, + "loss": 0.5151, + "step": 687 + }, + { + "epoch": 0.3719589115155884, + "grad_norm": 0.45957595109939575, + "learning_rate": 7.437837837837838e-06, + "loss": 0.5124, + "step": 688 + }, + { + "epoch": 0.37249954946837266, + "grad_norm": 0.5240591764450073, + "learning_rate": 7.44864864864865e-06, + "loss": 0.509, + "step": 689 + }, + { + "epoch": 0.37304018742115697, + "grad_norm": 0.49150049686431885, + "learning_rate": 7.45945945945946e-06, + "loss": 0.5061, + "step": 690 + }, + { + "epoch": 0.37358082537394127, + "grad_norm": 0.4253467619419098, + "learning_rate": 7.4702702702702715e-06, + "loss": 0.5015, + "step": 691 + }, + { + "epoch": 0.3741214633267255, + "grad_norm": 0.41869038343429565, + "learning_rate": 7.481081081081082e-06, + "loss": 0.4868, + "step": 692 + }, + { + "epoch": 0.37466210127950983, + "grad_norm": 0.42171812057495117, + "learning_rate": 7.491891891891892e-06, + "loss": 0.4947, + "step": 693 + }, + { + "epoch": 0.3752027392322941, + "grad_norm": 0.4336179494857788, + "learning_rate": 7.502702702702704e-06, + "loss": 0.4918, + "step": 694 + }, + { + "epoch": 0.3757433771850784, + "grad_norm": 0.42052584886550903, + "learning_rate": 7.513513513513514e-06, + "loss": 0.4798, + "step": 695 + }, + { + "epoch": 0.3762840151378627, + "grad_norm": 0.45332416892051697, + "learning_rate": 7.5243243243243245e-06, + "loss": 0.4838, + "step": 696 + }, + { + "epoch": 0.37682465309064694, + "grad_norm": 0.4129831790924072, + "learning_rate": 7.535135135135136e-06, + "loss": 0.4943, + "step": 697 + }, + { + "epoch": 0.37736529104343125, + "grad_norm": 0.4154702126979828, + "learning_rate": 7.545945945945946e-06, + "loss": 0.5256, + "step": 698 + }, + { + "epoch": 0.37790592899621556, + "grad_norm": 0.4334423840045929, + "learning_rate": 7.5567567567567575e-06, + "loss": 0.4954, + "step": 699 + }, + { + "epoch": 0.3784465669489998, + "grad_norm": 0.43977442383766174, + "learning_rate": 7.567567567567569e-06, + "loss": 0.5084, + "step": 700 + }, + { + "epoch": 0.3789872049017841, + "grad_norm": 0.4473543167114258, + "learning_rate": 7.578378378378379e-06, + "loss": 0.5121, + "step": 701 + }, + { + "epoch": 0.37952784285456836, + "grad_norm": 0.3824658691883087, + "learning_rate": 7.58918918918919e-06, + "loss": 0.4946, + "step": 702 + }, + { + "epoch": 0.38006848080735267, + "grad_norm": 0.43371015787124634, + "learning_rate": 7.600000000000001e-06, + "loss": 0.4827, + "step": 703 + }, + { + "epoch": 0.380609118760137, + "grad_norm": 0.4218309819698334, + "learning_rate": 7.610810810810811e-06, + "loss": 0.4795, + "step": 704 + }, + { + "epoch": 0.38114975671292123, + "grad_norm": 0.42625129222869873, + "learning_rate": 7.621621621621622e-06, + "loss": 0.4908, + "step": 705 + }, + { + "epoch": 0.38169039466570553, + "grad_norm": 0.42555558681488037, + "learning_rate": 7.632432432432433e-06, + "loss": 0.5112, + "step": 706 + }, + { + "epoch": 0.38223103261848984, + "grad_norm": 0.4020969867706299, + "learning_rate": 7.643243243243244e-06, + "loss": 0.5084, + "step": 707 + }, + { + "epoch": 0.3827716705712741, + "grad_norm": 0.38164466619491577, + "learning_rate": 7.654054054054054e-06, + "loss": 0.477, + "step": 708 + }, + { + "epoch": 0.3833123085240584, + "grad_norm": 0.3920608162879944, + "learning_rate": 7.664864864864866e-06, + "loss": 0.4878, + "step": 709 + }, + { + "epoch": 0.38385294647684265, + "grad_norm": 0.4406064748764038, + "learning_rate": 7.675675675675676e-06, + "loss": 0.5438, + "step": 710 + }, + { + "epoch": 0.38439358442962696, + "grad_norm": 0.45603087544441223, + "learning_rate": 7.686486486486487e-06, + "loss": 0.5086, + "step": 711 + }, + { + "epoch": 0.38493422238241126, + "grad_norm": 0.391500324010849, + "learning_rate": 7.697297297297297e-06, + "loss": 0.4899, + "step": 712 + }, + { + "epoch": 0.3854748603351955, + "grad_norm": 0.3915819823741913, + "learning_rate": 7.70810810810811e-06, + "loss": 0.5003, + "step": 713 + }, + { + "epoch": 0.3860154982879798, + "grad_norm": 0.4155017137527466, + "learning_rate": 7.71891891891892e-06, + "loss": 0.5054, + "step": 714 + }, + { + "epoch": 0.3865561362407641, + "grad_norm": 0.42171376943588257, + "learning_rate": 7.72972972972973e-06, + "loss": 0.5209, + "step": 715 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 0.4514196217060089, + "learning_rate": 7.74054054054054e-06, + "loss": 0.4754, + "step": 716 + }, + { + "epoch": 0.3876374121463327, + "grad_norm": 0.4144493043422699, + "learning_rate": 7.751351351351351e-06, + "loss": 0.4744, + "step": 717 + }, + { + "epoch": 0.38817805009911693, + "grad_norm": 0.43851664662361145, + "learning_rate": 7.762162162162162e-06, + "loss": 0.4882, + "step": 718 + }, + { + "epoch": 0.38871868805190124, + "grad_norm": 0.42534932494163513, + "learning_rate": 7.772972972972974e-06, + "loss": 0.5156, + "step": 719 + }, + { + "epoch": 0.38925932600468555, + "grad_norm": 0.4332650303840637, + "learning_rate": 7.783783783783784e-06, + "loss": 0.5038, + "step": 720 + }, + { + "epoch": 0.3897999639574698, + "grad_norm": 0.42367222905158997, + "learning_rate": 7.794594594594596e-06, + "loss": 0.4778, + "step": 721 + }, + { + "epoch": 0.3903406019102541, + "grad_norm": 0.38908496499061584, + "learning_rate": 7.805405405405407e-06, + "loss": 0.5106, + "step": 722 + }, + { + "epoch": 0.3908812398630384, + "grad_norm": 0.41853058338165283, + "learning_rate": 7.816216216216217e-06, + "loss": 0.5044, + "step": 723 + }, + { + "epoch": 0.39142187781582266, + "grad_norm": 0.3883611261844635, + "learning_rate": 7.827027027027028e-06, + "loss": 0.5226, + "step": 724 + }, + { + "epoch": 0.39196251576860697, + "grad_norm": 0.41836339235305786, + "learning_rate": 7.837837837837838e-06, + "loss": 0.4814, + "step": 725 + }, + { + "epoch": 0.3925031537213912, + "grad_norm": 0.42219579219818115, + "learning_rate": 7.848648648648648e-06, + "loss": 0.4845, + "step": 726 + }, + { + "epoch": 0.3930437916741755, + "grad_norm": 0.42650148272514343, + "learning_rate": 7.859459459459459e-06, + "loss": 0.4963, + "step": 727 + }, + { + "epoch": 0.39358442962695983, + "grad_norm": 0.4264717102050781, + "learning_rate": 7.870270270270271e-06, + "loss": 0.4947, + "step": 728 + }, + { + "epoch": 0.3941250675797441, + "grad_norm": 0.4335402250289917, + "learning_rate": 7.881081081081081e-06, + "loss": 0.5013, + "step": 729 + }, + { + "epoch": 0.3946657055325284, + "grad_norm": 0.3991732597351074, + "learning_rate": 7.891891891891894e-06, + "loss": 0.4918, + "step": 730 + }, + { + "epoch": 0.3952063434853127, + "grad_norm": 0.4234263300895691, + "learning_rate": 7.902702702702704e-06, + "loss": 0.4999, + "step": 731 + }, + { + "epoch": 0.39574698143809695, + "grad_norm": 0.453852117061615, + "learning_rate": 7.913513513513514e-06, + "loss": 0.4612, + "step": 732 + }, + { + "epoch": 0.39628761939088125, + "grad_norm": 0.38619935512542725, + "learning_rate": 7.924324324324325e-06, + "loss": 0.5215, + "step": 733 + }, + { + "epoch": 0.3968282573436655, + "grad_norm": 0.48475557565689087, + "learning_rate": 7.935135135135135e-06, + "loss": 0.4862, + "step": 734 + }, + { + "epoch": 0.3973688952964498, + "grad_norm": 0.46672797203063965, + "learning_rate": 7.945945945945946e-06, + "loss": 0.5342, + "step": 735 + }, + { + "epoch": 0.3979095332492341, + "grad_norm": 0.3979105055332184, + "learning_rate": 7.956756756756758e-06, + "loss": 0.5131, + "step": 736 + }, + { + "epoch": 0.39845017120201837, + "grad_norm": 0.4489961266517639, + "learning_rate": 7.967567567567568e-06, + "loss": 0.5055, + "step": 737 + }, + { + "epoch": 0.3989908091548027, + "grad_norm": 0.47179901599884033, + "learning_rate": 7.978378378378379e-06, + "loss": 0.4689, + "step": 738 + }, + { + "epoch": 0.399531447107587, + "grad_norm": 0.4245077073574066, + "learning_rate": 7.989189189189191e-06, + "loss": 0.4865, + "step": 739 + }, + { + "epoch": 0.40007208506037123, + "grad_norm": 0.44382888078689575, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5002, + "step": 740 + }, + { + "epoch": 0.40061272301315554, + "grad_norm": 0.36594852805137634, + "learning_rate": 8.010810810810812e-06, + "loss": 0.5004, + "step": 741 + }, + { + "epoch": 0.4011533609659398, + "grad_norm": 0.41024965047836304, + "learning_rate": 8.021621621621622e-06, + "loss": 0.4795, + "step": 742 + }, + { + "epoch": 0.4016939989187241, + "grad_norm": 0.40164679288864136, + "learning_rate": 8.032432432432433e-06, + "loss": 0.5139, + "step": 743 + }, + { + "epoch": 0.4022346368715084, + "grad_norm": 0.36995360255241394, + "learning_rate": 8.043243243243243e-06, + "loss": 0.4804, + "step": 744 + }, + { + "epoch": 0.40277527482429265, + "grad_norm": 0.47509828209877014, + "learning_rate": 8.054054054054055e-06, + "loss": 0.5083, + "step": 745 + }, + { + "epoch": 0.40331591277707696, + "grad_norm": 0.4017750918865204, + "learning_rate": 8.064864864864866e-06, + "loss": 0.4956, + "step": 746 + }, + { + "epoch": 0.40385655072986126, + "grad_norm": 0.4321381449699402, + "learning_rate": 8.075675675675676e-06, + "loss": 0.4857, + "step": 747 + }, + { + "epoch": 0.4043971886826455, + "grad_norm": 0.44780081510543823, + "learning_rate": 8.086486486486486e-06, + "loss": 0.4923, + "step": 748 + }, + { + "epoch": 0.4049378266354298, + "grad_norm": 0.4115518033504486, + "learning_rate": 8.097297297297297e-06, + "loss": 0.5112, + "step": 749 + }, + { + "epoch": 0.40547846458821407, + "grad_norm": 0.42116203904151917, + "learning_rate": 8.108108108108109e-06, + "loss": 0.4866, + "step": 750 + }, + { + "epoch": 0.4060191025409984, + "grad_norm": 0.4541042149066925, + "learning_rate": 8.11891891891892e-06, + "loss": 0.4817, + "step": 751 + }, + { + "epoch": 0.4065597404937827, + "grad_norm": 0.4548441469669342, + "learning_rate": 8.12972972972973e-06, + "loss": 0.506, + "step": 752 + }, + { + "epoch": 0.40710037844656694, + "grad_norm": 0.4485017657279968, + "learning_rate": 8.140540540540542e-06, + "loss": 0.524, + "step": 753 + }, + { + "epoch": 0.40764101639935124, + "grad_norm": 0.45467260479927063, + "learning_rate": 8.151351351351352e-06, + "loss": 0.4907, + "step": 754 + }, + { + "epoch": 0.4081816543521355, + "grad_norm": 0.39999452233314514, + "learning_rate": 8.162162162162163e-06, + "loss": 0.4985, + "step": 755 + }, + { + "epoch": 0.4087222923049198, + "grad_norm": 0.48386865854263306, + "learning_rate": 8.172972972972973e-06, + "loss": 0.494, + "step": 756 + }, + { + "epoch": 0.4092629302577041, + "grad_norm": 0.39953333139419556, + "learning_rate": 8.183783783783784e-06, + "loss": 0.4965, + "step": 757 + }, + { + "epoch": 0.40980356821048836, + "grad_norm": 0.46499499678611755, + "learning_rate": 8.194594594594594e-06, + "loss": 0.484, + "step": 758 + }, + { + "epoch": 0.41034420616327266, + "grad_norm": 0.4665820598602295, + "learning_rate": 8.205405405405406e-06, + "loss": 0.4871, + "step": 759 + }, + { + "epoch": 0.41088484411605697, + "grad_norm": 0.4831865429878235, + "learning_rate": 8.216216216216217e-06, + "loss": 0.4941, + "step": 760 + }, + { + "epoch": 0.4114254820688412, + "grad_norm": 0.4756391644477844, + "learning_rate": 8.227027027027029e-06, + "loss": 0.5108, + "step": 761 + }, + { + "epoch": 0.4119661200216255, + "grad_norm": 0.5320138931274414, + "learning_rate": 8.23783783783784e-06, + "loss": 0.4794, + "step": 762 + }, + { + "epoch": 0.4125067579744098, + "grad_norm": 0.4134671688079834, + "learning_rate": 8.24864864864865e-06, + "loss": 0.4939, + "step": 763 + }, + { + "epoch": 0.4130473959271941, + "grad_norm": 0.48983827233314514, + "learning_rate": 8.25945945945946e-06, + "loss": 0.5266, + "step": 764 + }, + { + "epoch": 0.4135880338799784, + "grad_norm": 0.44971051812171936, + "learning_rate": 8.27027027027027e-06, + "loss": 0.5201, + "step": 765 + }, + { + "epoch": 0.41412867183276264, + "grad_norm": 0.43499618768692017, + "learning_rate": 8.281081081081081e-06, + "loss": 0.4692, + "step": 766 + }, + { + "epoch": 0.41466930978554695, + "grad_norm": 0.3952126204967499, + "learning_rate": 8.291891891891891e-06, + "loss": 0.4571, + "step": 767 + }, + { + "epoch": 0.41520994773833125, + "grad_norm": 0.43571987748146057, + "learning_rate": 8.302702702702704e-06, + "loss": 0.5128, + "step": 768 + }, + { + "epoch": 0.4157505856911155, + "grad_norm": 0.41786375641822815, + "learning_rate": 8.313513513513514e-06, + "loss": 0.5126, + "step": 769 + }, + { + "epoch": 0.4162912236438998, + "grad_norm": 0.40955227613449097, + "learning_rate": 8.324324324324326e-06, + "loss": 0.5046, + "step": 770 + }, + { + "epoch": 0.41683186159668406, + "grad_norm": 0.4118677079677582, + "learning_rate": 8.335135135135137e-06, + "loss": 0.5037, + "step": 771 + }, + { + "epoch": 0.41737249954946837, + "grad_norm": 0.43250730633735657, + "learning_rate": 8.345945945945947e-06, + "loss": 0.4943, + "step": 772 + }, + { + "epoch": 0.4179131375022527, + "grad_norm": 0.4406627416610718, + "learning_rate": 8.356756756756757e-06, + "loss": 0.4897, + "step": 773 + }, + { + "epoch": 0.4184537754550369, + "grad_norm": 0.42739009857177734, + "learning_rate": 8.367567567567568e-06, + "loss": 0.4963, + "step": 774 + }, + { + "epoch": 0.41899441340782123, + "grad_norm": 0.44201985001564026, + "learning_rate": 8.378378378378378e-06, + "loss": 0.5287, + "step": 775 + }, + { + "epoch": 0.41953505136060554, + "grad_norm": 0.4748345613479614, + "learning_rate": 8.38918918918919e-06, + "loss": 0.4876, + "step": 776 + }, + { + "epoch": 0.4200756893133898, + "grad_norm": 0.3963128924369812, + "learning_rate": 8.400000000000001e-06, + "loss": 0.4826, + "step": 777 + }, + { + "epoch": 0.4206163272661741, + "grad_norm": 0.4639700651168823, + "learning_rate": 8.410810810810811e-06, + "loss": 0.4927, + "step": 778 + }, + { + "epoch": 0.42115696521895835, + "grad_norm": 0.4406186044216156, + "learning_rate": 8.421621621621622e-06, + "loss": 0.506, + "step": 779 + }, + { + "epoch": 0.42169760317174265, + "grad_norm": 0.42991262674331665, + "learning_rate": 8.432432432432434e-06, + "loss": 0.4856, + "step": 780 + }, + { + "epoch": 0.42223824112452696, + "grad_norm": 0.4585091769695282, + "learning_rate": 8.443243243243244e-06, + "loss": 0.5051, + "step": 781 + }, + { + "epoch": 0.4227788790773112, + "grad_norm": 0.4431307911872864, + "learning_rate": 8.454054054054055e-06, + "loss": 0.4688, + "step": 782 + }, + { + "epoch": 0.4233195170300955, + "grad_norm": 0.3975408375263214, + "learning_rate": 8.464864864864865e-06, + "loss": 0.4898, + "step": 783 + }, + { + "epoch": 0.4238601549828798, + "grad_norm": 0.5118733644485474, + "learning_rate": 8.475675675675676e-06, + "loss": 0.5056, + "step": 784 + }, + { + "epoch": 0.4244007929356641, + "grad_norm": 0.3885602056980133, + "learning_rate": 8.486486486486488e-06, + "loss": 0.4638, + "step": 785 + }, + { + "epoch": 0.4249414308884484, + "grad_norm": 0.5289828181266785, + "learning_rate": 8.497297297297298e-06, + "loss": 0.5084, + "step": 786 + }, + { + "epoch": 0.42548206884123263, + "grad_norm": 0.4151102602481842, + "learning_rate": 8.508108108108109e-06, + "loss": 0.4546, + "step": 787 + }, + { + "epoch": 0.42602270679401694, + "grad_norm": 0.47288182377815247, + "learning_rate": 8.518918918918919e-06, + "loss": 0.4731, + "step": 788 + }, + { + "epoch": 0.42656334474680124, + "grad_norm": 0.5048025250434875, + "learning_rate": 8.52972972972973e-06, + "loss": 0.4979, + "step": 789 + }, + { + "epoch": 0.4271039826995855, + "grad_norm": 0.4965549111366272, + "learning_rate": 8.540540540540542e-06, + "loss": 0.4864, + "step": 790 + }, + { + "epoch": 0.4276446206523698, + "grad_norm": 0.46014106273651123, + "learning_rate": 8.551351351351352e-06, + "loss": 0.5041, + "step": 791 + }, + { + "epoch": 0.4281852586051541, + "grad_norm": 0.4505784809589386, + "learning_rate": 8.562162162162162e-06, + "loss": 0.4712, + "step": 792 + }, + { + "epoch": 0.42872589655793836, + "grad_norm": 0.42782407999038696, + "learning_rate": 8.572972972972975e-06, + "loss": 0.4536, + "step": 793 + }, + { + "epoch": 0.42926653451072266, + "grad_norm": 0.47117555141448975, + "learning_rate": 8.583783783783785e-06, + "loss": 0.4968, + "step": 794 + }, + { + "epoch": 0.4298071724635069, + "grad_norm": 0.47286736965179443, + "learning_rate": 8.594594594594595e-06, + "loss": 0.5189, + "step": 795 + }, + { + "epoch": 0.4303478104162912, + "grad_norm": 0.4804339110851288, + "learning_rate": 8.605405405405406e-06, + "loss": 0.4976, + "step": 796 + }, + { + "epoch": 0.4308884483690755, + "grad_norm": 0.4585397243499756, + "learning_rate": 8.616216216216216e-06, + "loss": 0.4704, + "step": 797 + }, + { + "epoch": 0.4314290863218598, + "grad_norm": 0.46230971813201904, + "learning_rate": 8.627027027027027e-06, + "loss": 0.464, + "step": 798 + }, + { + "epoch": 0.4319697242746441, + "grad_norm": 0.47910165786743164, + "learning_rate": 8.637837837837837e-06, + "loss": 0.49, + "step": 799 + }, + { + "epoch": 0.4325103622274284, + "grad_norm": 0.4503049850463867, + "learning_rate": 8.64864864864865e-06, + "loss": 0.468, + "step": 800 + }, + { + "epoch": 0.43305100018021264, + "grad_norm": 0.4591892659664154, + "learning_rate": 8.65945945945946e-06, + "loss": 0.4667, + "step": 801 + }, + { + "epoch": 0.43359163813299695, + "grad_norm": 0.4422140121459961, + "learning_rate": 8.670270270270272e-06, + "loss": 0.4797, + "step": 802 + }, + { + "epoch": 0.4341322760857812, + "grad_norm": 0.39958667755126953, + "learning_rate": 8.681081081081082e-06, + "loss": 0.4966, + "step": 803 + }, + { + "epoch": 0.4346729140385655, + "grad_norm": 0.41113945841789246, + "learning_rate": 8.691891891891893e-06, + "loss": 0.4689, + "step": 804 + }, + { + "epoch": 0.4352135519913498, + "grad_norm": 0.412852942943573, + "learning_rate": 8.702702702702703e-06, + "loss": 0.4716, + "step": 805 + }, + { + "epoch": 0.43575418994413406, + "grad_norm": 0.39494407176971436, + "learning_rate": 8.713513513513514e-06, + "loss": 0.4642, + "step": 806 + }, + { + "epoch": 0.43629482789691837, + "grad_norm": 0.48442819714546204, + "learning_rate": 8.724324324324324e-06, + "loss": 0.5024, + "step": 807 + }, + { + "epoch": 0.4368354658497027, + "grad_norm": 0.435715913772583, + "learning_rate": 8.735135135135136e-06, + "loss": 0.4853, + "step": 808 + }, + { + "epoch": 0.4373761038024869, + "grad_norm": 0.48299872875213623, + "learning_rate": 8.745945945945947e-06, + "loss": 0.4968, + "step": 809 + }, + { + "epoch": 0.43791674175527123, + "grad_norm": 0.44625693559646606, + "learning_rate": 8.756756756756759e-06, + "loss": 0.5115, + "step": 810 + }, + { + "epoch": 0.4384573797080555, + "grad_norm": 0.4905652403831482, + "learning_rate": 8.767567567567569e-06, + "loss": 0.5058, + "step": 811 + }, + { + "epoch": 0.4389980176608398, + "grad_norm": 0.5242207050323486, + "learning_rate": 8.77837837837838e-06, + "loss": 0.4825, + "step": 812 + }, + { + "epoch": 0.4395386556136241, + "grad_norm": 0.3821278214454651, + "learning_rate": 8.78918918918919e-06, + "loss": 0.4632, + "step": 813 + }, + { + "epoch": 0.44007929356640835, + "grad_norm": 0.5015333890914917, + "learning_rate": 8.8e-06, + "loss": 0.4676, + "step": 814 + }, + { + "epoch": 0.44061993151919265, + "grad_norm": 0.40313366055488586, + "learning_rate": 8.810810810810811e-06, + "loss": 0.4801, + "step": 815 + }, + { + "epoch": 0.44116056947197696, + "grad_norm": 0.44474324584007263, + "learning_rate": 8.821621621621623e-06, + "loss": 0.4948, + "step": 816 + }, + { + "epoch": 0.4417012074247612, + "grad_norm": 0.38576412200927734, + "learning_rate": 8.832432432432433e-06, + "loss": 0.4921, + "step": 817 + }, + { + "epoch": 0.4422418453775455, + "grad_norm": 0.4141075611114502, + "learning_rate": 8.843243243243244e-06, + "loss": 0.4777, + "step": 818 + }, + { + "epoch": 0.44278248333032977, + "grad_norm": 0.39242422580718994, + "learning_rate": 8.854054054054054e-06, + "loss": 0.5041, + "step": 819 + }, + { + "epoch": 0.4433231212831141, + "grad_norm": 0.3840166926383972, + "learning_rate": 8.864864864864866e-06, + "loss": 0.4682, + "step": 820 + }, + { + "epoch": 0.4438637592358984, + "grad_norm": 0.40388667583465576, + "learning_rate": 8.875675675675677e-06, + "loss": 0.5043, + "step": 821 + }, + { + "epoch": 0.44440439718868263, + "grad_norm": 0.39842742681503296, + "learning_rate": 8.886486486486487e-06, + "loss": 0.4772, + "step": 822 + }, + { + "epoch": 0.44494503514146694, + "grad_norm": 0.45574501156806946, + "learning_rate": 8.897297297297298e-06, + "loss": 0.5023, + "step": 823 + }, + { + "epoch": 0.4454856730942512, + "grad_norm": 0.43097352981567383, + "learning_rate": 8.908108108108108e-06, + "loss": 0.4865, + "step": 824 + }, + { + "epoch": 0.4460263110470355, + "grad_norm": 0.45888960361480713, + "learning_rate": 8.91891891891892e-06, + "loss": 0.4877, + "step": 825 + }, + { + "epoch": 0.4465669489998198, + "grad_norm": 0.4810618758201599, + "learning_rate": 8.92972972972973e-06, + "loss": 0.5086, + "step": 826 + }, + { + "epoch": 0.44710758695260405, + "grad_norm": 0.4267573952674866, + "learning_rate": 8.940540540540541e-06, + "loss": 0.4821, + "step": 827 + }, + { + "epoch": 0.44764822490538836, + "grad_norm": 0.41196686029434204, + "learning_rate": 8.951351351351352e-06, + "loss": 0.4911, + "step": 828 + }, + { + "epoch": 0.44818886285817267, + "grad_norm": 0.5312232971191406, + "learning_rate": 8.962162162162162e-06, + "loss": 0.4836, + "step": 829 + }, + { + "epoch": 0.4487295008109569, + "grad_norm": 0.42353129386901855, + "learning_rate": 8.972972972972974e-06, + "loss": 0.5265, + "step": 830 + }, + { + "epoch": 0.4492701387637412, + "grad_norm": 0.45074447989463806, + "learning_rate": 8.983783783783785e-06, + "loss": 0.4786, + "step": 831 + }, + { + "epoch": 0.4498107767165255, + "grad_norm": 0.43651434779167175, + "learning_rate": 8.994594594594595e-06, + "loss": 0.4862, + "step": 832 + }, + { + "epoch": 0.4503514146693098, + "grad_norm": 0.4827129542827606, + "learning_rate": 9.005405405405407e-06, + "loss": 0.4819, + "step": 833 + }, + { + "epoch": 0.4508920526220941, + "grad_norm": 0.5431792736053467, + "learning_rate": 9.016216216216218e-06, + "loss": 0.4788, + "step": 834 + }, + { + "epoch": 0.45143269057487834, + "grad_norm": 0.4722309708595276, + "learning_rate": 9.027027027027028e-06, + "loss": 0.4944, + "step": 835 + }, + { + "epoch": 0.45197332852766264, + "grad_norm": 0.5574517846107483, + "learning_rate": 9.037837837837838e-06, + "loss": 0.488, + "step": 836 + }, + { + "epoch": 0.45251396648044695, + "grad_norm": 0.5468060970306396, + "learning_rate": 9.048648648648649e-06, + "loss": 0.4972, + "step": 837 + }, + { + "epoch": 0.4530546044332312, + "grad_norm": 0.4097936451435089, + "learning_rate": 9.05945945945946e-06, + "loss": 0.4536, + "step": 838 + }, + { + "epoch": 0.4535952423860155, + "grad_norm": 0.44951799511909485, + "learning_rate": 9.07027027027027e-06, + "loss": 0.5394, + "step": 839 + }, + { + "epoch": 0.45413588033879976, + "grad_norm": 0.5354393124580383, + "learning_rate": 9.081081081081082e-06, + "loss": 0.4717, + "step": 840 + }, + { + "epoch": 0.45467651829158406, + "grad_norm": 0.38110360503196716, + "learning_rate": 9.091891891891892e-06, + "loss": 0.4923, + "step": 841 + }, + { + "epoch": 0.45521715624436837, + "grad_norm": 0.5034909248352051, + "learning_rate": 9.102702702702704e-06, + "loss": 0.5018, + "step": 842 + }, + { + "epoch": 0.4557577941971526, + "grad_norm": 0.42320170998573303, + "learning_rate": 9.113513513513515e-06, + "loss": 0.4791, + "step": 843 + }, + { + "epoch": 0.45629843214993693, + "grad_norm": 0.4968869388103485, + "learning_rate": 9.124324324324325e-06, + "loss": 0.5325, + "step": 844 + }, + { + "epoch": 0.45683907010272123, + "grad_norm": 0.4150351583957672, + "learning_rate": 9.135135135135136e-06, + "loss": 0.5068, + "step": 845 + }, + { + "epoch": 0.4573797080555055, + "grad_norm": 0.541016161441803, + "learning_rate": 9.145945945945946e-06, + "loss": 0.4865, + "step": 846 + }, + { + "epoch": 0.4579203460082898, + "grad_norm": 0.418040931224823, + "learning_rate": 9.156756756756757e-06, + "loss": 0.4926, + "step": 847 + }, + { + "epoch": 0.45846098396107404, + "grad_norm": 0.5246473550796509, + "learning_rate": 9.167567567567569e-06, + "loss": 0.4897, + "step": 848 + }, + { + "epoch": 0.45900162191385835, + "grad_norm": 0.4080889821052551, + "learning_rate": 9.178378378378379e-06, + "loss": 0.5093, + "step": 849 + }, + { + "epoch": 0.45954225986664266, + "grad_norm": 0.5630201697349548, + "learning_rate": 9.189189189189191e-06, + "loss": 0.4847, + "step": 850 + }, + { + "epoch": 0.4600828978194269, + "grad_norm": 0.4602736234664917, + "learning_rate": 9.200000000000002e-06, + "loss": 0.4777, + "step": 851 + }, + { + "epoch": 0.4606235357722112, + "grad_norm": 0.5108170509338379, + "learning_rate": 9.210810810810812e-06, + "loss": 0.4929, + "step": 852 + }, + { + "epoch": 0.4611641737249955, + "grad_norm": 0.4432177245616913, + "learning_rate": 9.221621621621623e-06, + "loss": 0.475, + "step": 853 + }, + { + "epoch": 0.46170481167777977, + "grad_norm": 0.46254798769950867, + "learning_rate": 9.232432432432433e-06, + "loss": 0.5245, + "step": 854 + }, + { + "epoch": 0.4622454496305641, + "grad_norm": 0.45064395666122437, + "learning_rate": 9.243243243243243e-06, + "loss": 0.4997, + "step": 855 + }, + { + "epoch": 0.4627860875833483, + "grad_norm": 0.5564315915107727, + "learning_rate": 9.254054054054054e-06, + "loss": 0.5174, + "step": 856 + }, + { + "epoch": 0.46332672553613263, + "grad_norm": 0.40368762612342834, + "learning_rate": 9.264864864864866e-06, + "loss": 0.4819, + "step": 857 + }, + { + "epoch": 0.46386736348891694, + "grad_norm": 0.528178334236145, + "learning_rate": 9.275675675675676e-06, + "loss": 0.5142, + "step": 858 + }, + { + "epoch": 0.4644080014417012, + "grad_norm": 0.39754781126976013, + "learning_rate": 9.286486486486487e-06, + "loss": 0.4875, + "step": 859 + }, + { + "epoch": 0.4649486393944855, + "grad_norm": 0.49752360582351685, + "learning_rate": 9.297297297297299e-06, + "loss": 0.529, + "step": 860 + }, + { + "epoch": 0.4654892773472698, + "grad_norm": 0.4451174736022949, + "learning_rate": 9.30810810810811e-06, + "loss": 0.508, + "step": 861 + }, + { + "epoch": 0.46602991530005405, + "grad_norm": 0.42956191301345825, + "learning_rate": 9.31891891891892e-06, + "loss": 0.4789, + "step": 862 + }, + { + "epoch": 0.46657055325283836, + "grad_norm": 0.5587397813796997, + "learning_rate": 9.32972972972973e-06, + "loss": 0.4988, + "step": 863 + }, + { + "epoch": 0.4671111912056226, + "grad_norm": 0.47554630041122437, + "learning_rate": 9.34054054054054e-06, + "loss": 0.4953, + "step": 864 + }, + { + "epoch": 0.4676518291584069, + "grad_norm": 0.47560325264930725, + "learning_rate": 9.351351351351353e-06, + "loss": 0.4597, + "step": 865 + }, + { + "epoch": 0.4681924671111912, + "grad_norm": 0.5811336636543274, + "learning_rate": 9.362162162162163e-06, + "loss": 0.5106, + "step": 866 + }, + { + "epoch": 0.4687331050639755, + "grad_norm": 0.4019627571105957, + "learning_rate": 9.372972972972974e-06, + "loss": 0.509, + "step": 867 + }, + { + "epoch": 0.4692737430167598, + "grad_norm": 0.46386411786079407, + "learning_rate": 9.383783783783784e-06, + "loss": 0.4927, + "step": 868 + }, + { + "epoch": 0.4698143809695441, + "grad_norm": 0.4141891300678253, + "learning_rate": 9.394594594594595e-06, + "loss": 0.4789, + "step": 869 + }, + { + "epoch": 0.47035501892232834, + "grad_norm": 0.45206698775291443, + "learning_rate": 9.405405405405407e-06, + "loss": 0.5101, + "step": 870 + }, + { + "epoch": 0.47089565687511264, + "grad_norm": 0.44673988223075867, + "learning_rate": 9.416216216216217e-06, + "loss": 0.4695, + "step": 871 + }, + { + "epoch": 0.4714362948278969, + "grad_norm": 0.3691551089286804, + "learning_rate": 9.427027027027028e-06, + "loss": 0.4872, + "step": 872 + }, + { + "epoch": 0.4719769327806812, + "grad_norm": 0.46097657084465027, + "learning_rate": 9.437837837837838e-06, + "loss": 0.499, + "step": 873 + }, + { + "epoch": 0.4725175707334655, + "grad_norm": 0.4701019823551178, + "learning_rate": 9.44864864864865e-06, + "loss": 0.4717, + "step": 874 + }, + { + "epoch": 0.47305820868624976, + "grad_norm": 0.40333500504493713, + "learning_rate": 9.45945945945946e-06, + "loss": 0.4815, + "step": 875 + }, + { + "epoch": 0.47359884663903407, + "grad_norm": 0.5153550505638123, + "learning_rate": 9.470270270270271e-06, + "loss": 0.4859, + "step": 876 + }, + { + "epoch": 0.47413948459181837, + "grad_norm": 0.43943801522254944, + "learning_rate": 9.481081081081081e-06, + "loss": 0.495, + "step": 877 + }, + { + "epoch": 0.4746801225446026, + "grad_norm": 0.4598594307899475, + "learning_rate": 9.491891891891892e-06, + "loss": 0.4883, + "step": 878 + }, + { + "epoch": 0.47522076049738693, + "grad_norm": 0.49989616870880127, + "learning_rate": 9.502702702702702e-06, + "loss": 0.4905, + "step": 879 + }, + { + "epoch": 0.4757613984501712, + "grad_norm": 0.4743301272392273, + "learning_rate": 9.513513513513514e-06, + "loss": 0.5096, + "step": 880 + }, + { + "epoch": 0.4763020364029555, + "grad_norm": 0.47162696719169617, + "learning_rate": 9.524324324324325e-06, + "loss": 0.5105, + "step": 881 + }, + { + "epoch": 0.4768426743557398, + "grad_norm": 0.4532954692840576, + "learning_rate": 9.535135135135137e-06, + "loss": 0.493, + "step": 882 + }, + { + "epoch": 0.47738331230852404, + "grad_norm": 0.4708048403263092, + "learning_rate": 9.545945945945947e-06, + "loss": 0.4864, + "step": 883 + }, + { + "epoch": 0.47792395026130835, + "grad_norm": 0.49821099638938904, + "learning_rate": 9.556756756756758e-06, + "loss": 0.4927, + "step": 884 + }, + { + "epoch": 0.47846458821409266, + "grad_norm": 0.4563126564025879, + "learning_rate": 9.567567567567568e-06, + "loss": 0.4955, + "step": 885 + }, + { + "epoch": 0.4790052261668769, + "grad_norm": 0.600861132144928, + "learning_rate": 9.578378378378379e-06, + "loss": 0.4744, + "step": 886 + }, + { + "epoch": 0.4795458641196612, + "grad_norm": 0.600531816482544, + "learning_rate": 9.589189189189189e-06, + "loss": 0.5105, + "step": 887 + }, + { + "epoch": 0.48008650207244546, + "grad_norm": 0.44443172216415405, + "learning_rate": 9.600000000000001e-06, + "loss": 0.5067, + "step": 888 + }, + { + "epoch": 0.48062714002522977, + "grad_norm": 0.5058892369270325, + "learning_rate": 9.610810810810812e-06, + "loss": 0.4788, + "step": 889 + }, + { + "epoch": 0.4811677779780141, + "grad_norm": 0.4944954812526703, + "learning_rate": 9.621621621621622e-06, + "loss": 0.5049, + "step": 890 + }, + { + "epoch": 0.48170841593079833, + "grad_norm": 0.48458966612815857, + "learning_rate": 9.632432432432434e-06, + "loss": 0.497, + "step": 891 + }, + { + "epoch": 0.48224905388358263, + "grad_norm": 0.4858192503452301, + "learning_rate": 9.643243243243245e-06, + "loss": 0.5056, + "step": 892 + }, + { + "epoch": 0.4827896918363669, + "grad_norm": 0.5097047686576843, + "learning_rate": 9.654054054054055e-06, + "loss": 0.5185, + "step": 893 + }, + { + "epoch": 0.4833303297891512, + "grad_norm": 0.441301167011261, + "learning_rate": 9.664864864864866e-06, + "loss": 0.4605, + "step": 894 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.47697439789772034, + "learning_rate": 9.675675675675676e-06, + "loss": 0.4996, + "step": 895 + }, + { + "epoch": 0.48441160569471975, + "grad_norm": 0.4510503113269806, + "learning_rate": 9.686486486486486e-06, + "loss": 0.4805, + "step": 896 + }, + { + "epoch": 0.48495224364750406, + "grad_norm": 0.4589500427246094, + "learning_rate": 9.697297297297299e-06, + "loss": 0.492, + "step": 897 + }, + { + "epoch": 0.48549288160028836, + "grad_norm": 0.43563947081565857, + "learning_rate": 9.708108108108109e-06, + "loss": 0.5112, + "step": 898 + }, + { + "epoch": 0.4860335195530726, + "grad_norm": 0.4395964741706848, + "learning_rate": 9.71891891891892e-06, + "loss": 0.5, + "step": 899 + }, + { + "epoch": 0.4865741575058569, + "grad_norm": 0.44560515880584717, + "learning_rate": 9.729729729729732e-06, + "loss": 0.4992, + "step": 900 + }, + { + "epoch": 0.48711479545864117, + "grad_norm": 0.4070109724998474, + "learning_rate": 9.740540540540542e-06, + "loss": 0.4875, + "step": 901 + }, + { + "epoch": 0.4876554334114255, + "grad_norm": 0.47943007946014404, + "learning_rate": 9.751351351351352e-06, + "loss": 0.4986, + "step": 902 + }, + { + "epoch": 0.4881960713642098, + "grad_norm": 0.3845299184322357, + "learning_rate": 9.762162162162163e-06, + "loss": 0.4804, + "step": 903 + }, + { + "epoch": 0.48873670931699403, + "grad_norm": 0.43732935190200806, + "learning_rate": 9.772972972972973e-06, + "loss": 0.5169, + "step": 904 + }, + { + "epoch": 0.48927734726977834, + "grad_norm": 0.4056658446788788, + "learning_rate": 9.783783783783785e-06, + "loss": 0.4862, + "step": 905 + }, + { + "epoch": 0.48981798522256265, + "grad_norm": 0.4409744441509247, + "learning_rate": 9.794594594594596e-06, + "loss": 0.491, + "step": 906 + }, + { + "epoch": 0.4903586231753469, + "grad_norm": 0.41933363676071167, + "learning_rate": 9.805405405405406e-06, + "loss": 0.4925, + "step": 907 + }, + { + "epoch": 0.4908992611281312, + "grad_norm": 0.5303579568862915, + "learning_rate": 9.816216216216217e-06, + "loss": 0.5048, + "step": 908 + }, + { + "epoch": 0.49143989908091545, + "grad_norm": 0.40795671939849854, + "learning_rate": 9.827027027027027e-06, + "loss": 0.4779, + "step": 909 + }, + { + "epoch": 0.49198053703369976, + "grad_norm": 0.4110850989818573, + "learning_rate": 9.83783783783784e-06, + "loss": 0.4776, + "step": 910 + }, + { + "epoch": 0.49252117498648407, + "grad_norm": 0.42300498485565186, + "learning_rate": 9.84864864864865e-06, + "loss": 0.4823, + "step": 911 + }, + { + "epoch": 0.4930618129392683, + "grad_norm": 0.4710748791694641, + "learning_rate": 9.85945945945946e-06, + "loss": 0.4955, + "step": 912 + }, + { + "epoch": 0.4936024508920526, + "grad_norm": 0.42153269052505493, + "learning_rate": 9.87027027027027e-06, + "loss": 0.5232, + "step": 913 + }, + { + "epoch": 0.49414308884483693, + "grad_norm": 0.3920591175556183, + "learning_rate": 9.881081081081083e-06, + "loss": 0.4944, + "step": 914 + }, + { + "epoch": 0.4946837267976212, + "grad_norm": 0.39492344856262207, + "learning_rate": 9.891891891891893e-06, + "loss": 0.5115, + "step": 915 + }, + { + "epoch": 0.4952243647504055, + "grad_norm": 0.4347766041755676, + "learning_rate": 9.902702702702704e-06, + "loss": 0.4869, + "step": 916 + }, + { + "epoch": 0.49576500270318974, + "grad_norm": 0.41567090153694153, + "learning_rate": 9.913513513513514e-06, + "loss": 0.4873, + "step": 917 + }, + { + "epoch": 0.49630564065597405, + "grad_norm": 0.3792874813079834, + "learning_rate": 9.924324324324324e-06, + "loss": 0.494, + "step": 918 + }, + { + "epoch": 0.49684627860875835, + "grad_norm": 0.47275909781455994, + "learning_rate": 9.935135135135135e-06, + "loss": 0.504, + "step": 919 + }, + { + "epoch": 0.4973869165615426, + "grad_norm": 0.4395892918109894, + "learning_rate": 9.945945945945947e-06, + "loss": 0.4731, + "step": 920 + }, + { + "epoch": 0.4979275545143269, + "grad_norm": 0.5087149143218994, + "learning_rate": 9.956756756756757e-06, + "loss": 0.4962, + "step": 921 + }, + { + "epoch": 0.4984681924671112, + "grad_norm": 0.4961149990558624, + "learning_rate": 9.96756756756757e-06, + "loss": 0.505, + "step": 922 + }, + { + "epoch": 0.49900883041989547, + "grad_norm": 0.4813699424266815, + "learning_rate": 9.97837837837838e-06, + "loss": 0.4839, + "step": 923 + }, + { + "epoch": 0.4995494683726798, + "grad_norm": 0.5078656077384949, + "learning_rate": 9.98918918918919e-06, + "loss": 0.4867, + "step": 924 + }, + { + "epoch": 0.500090106325464, + "grad_norm": 0.4412589371204376, + "learning_rate": 1e-05, + "loss": 0.5309, + "step": 925 + }, + { + "epoch": 0.5006307442782484, + "grad_norm": 0.46785590052604675, + "learning_rate": 9.999999643554535e-06, + "loss": 0.4825, + "step": 926 + }, + { + "epoch": 0.5011713822310326, + "grad_norm": 0.5026047229766846, + "learning_rate": 9.999998574218182e-06, + "loss": 0.4846, + "step": 927 + }, + { + "epoch": 0.5017120201838169, + "grad_norm": 0.42060837149620056, + "learning_rate": 9.9999967919911e-06, + "loss": 0.4552, + "step": 928 + }, + { + "epoch": 0.5022526581366011, + "grad_norm": 0.48285382986068726, + "learning_rate": 9.999994296873541e-06, + "loss": 0.4936, + "step": 929 + }, + { + "epoch": 0.5027932960893855, + "grad_norm": 0.3900724947452545, + "learning_rate": 9.999991088865861e-06, + "loss": 0.4824, + "step": 930 + }, + { + "epoch": 0.5033339340421698, + "grad_norm": 0.45784997940063477, + "learning_rate": 9.999987167968517e-06, + "loss": 0.4894, + "step": 931 + }, + { + "epoch": 0.503874571994954, + "grad_norm": 0.4959925711154938, + "learning_rate": 9.999982534182068e-06, + "loss": 0.4813, + "step": 932 + }, + { + "epoch": 0.5044152099477384, + "grad_norm": 0.39042192697525024, + "learning_rate": 9.999977187507175e-06, + "loss": 0.5168, + "step": 933 + }, + { + "epoch": 0.5049558479005226, + "grad_norm": 0.4453545808792114, + "learning_rate": 9.9999711279446e-06, + "loss": 0.471, + "step": 934 + }, + { + "epoch": 0.5054964858533069, + "grad_norm": 0.4667404890060425, + "learning_rate": 9.999964355495207e-06, + "loss": 0.4893, + "step": 935 + }, + { + "epoch": 0.5060371238060912, + "grad_norm": 0.39634689688682556, + "learning_rate": 9.999956870159961e-06, + "loss": 0.4771, + "step": 936 + }, + { + "epoch": 0.5065777617588755, + "grad_norm": 0.5295143723487854, + "learning_rate": 9.999948671939931e-06, + "loss": 0.5007, + "step": 937 + }, + { + "epoch": 0.5071183997116597, + "grad_norm": 0.4279506206512451, + "learning_rate": 9.999939760836287e-06, + "loss": 0.4681, + "step": 938 + }, + { + "epoch": 0.5076590376644441, + "grad_norm": 0.47725000977516174, + "learning_rate": 9.999930136850293e-06, + "loss": 0.5091, + "step": 939 + }, + { + "epoch": 0.5081996756172283, + "grad_norm": 0.4600881338119507, + "learning_rate": 9.999919799983327e-06, + "loss": 0.499, + "step": 940 + }, + { + "epoch": 0.5087403135700126, + "grad_norm": 0.43939170241355896, + "learning_rate": 9.999908750236862e-06, + "loss": 0.5016, + "step": 941 + }, + { + "epoch": 0.509280951522797, + "grad_norm": 0.4821349084377289, + "learning_rate": 9.999896987612473e-06, + "loss": 0.4762, + "step": 942 + }, + { + "epoch": 0.5098215894755812, + "grad_norm": 0.4333769381046295, + "learning_rate": 9.999884512111837e-06, + "loss": 0.4742, + "step": 943 + }, + { + "epoch": 0.5103622274283655, + "grad_norm": 0.4694845974445343, + "learning_rate": 9.999871323736732e-06, + "loss": 0.4831, + "step": 944 + }, + { + "epoch": 0.5109028653811497, + "grad_norm": 0.4363771378993988, + "learning_rate": 9.99985742248904e-06, + "loss": 0.4514, + "step": 945 + }, + { + "epoch": 0.5114435033339341, + "grad_norm": 0.45749279856681824, + "learning_rate": 9.99984280837074e-06, + "loss": 0.4961, + "step": 946 + }, + { + "epoch": 0.5119841412867183, + "grad_norm": 0.4644739329814911, + "learning_rate": 9.999827481383919e-06, + "loss": 0.5025, + "step": 947 + }, + { + "epoch": 0.5125247792395026, + "grad_norm": 0.45313525199890137, + "learning_rate": 9.999811441530761e-06, + "loss": 0.5196, + "step": 948 + }, + { + "epoch": 0.5130654171922869, + "grad_norm": 0.38895201683044434, + "learning_rate": 9.999794688813551e-06, + "loss": 0.4669, + "step": 949 + }, + { + "epoch": 0.5136060551450712, + "grad_norm": 0.5269231796264648, + "learning_rate": 9.999777223234682e-06, + "loss": 0.5097, + "step": 950 + }, + { + "epoch": 0.5141466930978554, + "grad_norm": 0.39397677779197693, + "learning_rate": 9.99975904479664e-06, + "loss": 0.5056, + "step": 951 + }, + { + "epoch": 0.5146873310506398, + "grad_norm": 0.4682712256908417, + "learning_rate": 9.99974015350202e-06, + "loss": 0.4958, + "step": 952 + }, + { + "epoch": 0.515227969003424, + "grad_norm": 0.37193721532821655, + "learning_rate": 9.999720549353513e-06, + "loss": 0.4654, + "step": 953 + }, + { + "epoch": 0.5157686069562083, + "grad_norm": 0.4029079079627991, + "learning_rate": 9.999700232353916e-06, + "loss": 0.4991, + "step": 954 + }, + { + "epoch": 0.5163092449089927, + "grad_norm": 0.4342360198497772, + "learning_rate": 9.999679202506126e-06, + "loss": 0.5069, + "step": 955 + }, + { + "epoch": 0.5168498828617769, + "grad_norm": 0.4561976492404938, + "learning_rate": 9.999657459813137e-06, + "loss": 0.4831, + "step": 956 + }, + { + "epoch": 0.5173905208145612, + "grad_norm": 0.4105386734008789, + "learning_rate": 9.999635004278054e-06, + "loss": 0.4802, + "step": 957 + }, + { + "epoch": 0.5179311587673455, + "grad_norm": 0.4714571535587311, + "learning_rate": 9.999611835904078e-06, + "loss": 0.4846, + "step": 958 + }, + { + "epoch": 0.5184717967201298, + "grad_norm": 0.41593897342681885, + "learning_rate": 9.99958795469451e-06, + "loss": 0.4963, + "step": 959 + }, + { + "epoch": 0.519012434672914, + "grad_norm": 0.5482795834541321, + "learning_rate": 9.999563360652757e-06, + "loss": 0.4869, + "step": 960 + }, + { + "epoch": 0.5195530726256983, + "grad_norm": 0.4962485134601593, + "learning_rate": 9.999538053782323e-06, + "loss": 0.5279, + "step": 961 + }, + { + "epoch": 0.5200937105784826, + "grad_norm": 0.5878384113311768, + "learning_rate": 9.99951203408682e-06, + "loss": 0.518, + "step": 962 + }, + { + "epoch": 0.5206343485312669, + "grad_norm": 0.4186796545982361, + "learning_rate": 9.999485301569955e-06, + "loss": 0.4825, + "step": 963 + }, + { + "epoch": 0.5211749864840511, + "grad_norm": 0.5061416029930115, + "learning_rate": 9.999457856235542e-06, + "loss": 0.4795, + "step": 964 + }, + { + "epoch": 0.5217156244368355, + "grad_norm": 0.5024672746658325, + "learning_rate": 9.999429698087491e-06, + "loss": 0.4988, + "step": 965 + }, + { + "epoch": 0.5222562623896198, + "grad_norm": 0.5579217076301575, + "learning_rate": 9.999400827129817e-06, + "loss": 0.4675, + "step": 966 + }, + { + "epoch": 0.522796900342404, + "grad_norm": 0.5686841607093811, + "learning_rate": 9.99937124336664e-06, + "loss": 0.5074, + "step": 967 + }, + { + "epoch": 0.5233375382951884, + "grad_norm": 0.4582173824310303, + "learning_rate": 9.999340946802173e-06, + "loss": 0.4826, + "step": 968 + }, + { + "epoch": 0.5238781762479726, + "grad_norm": 0.63434898853302, + "learning_rate": 9.99930993744074e-06, + "loss": 0.4696, + "step": 969 + }, + { + "epoch": 0.5244188142007569, + "grad_norm": 0.46918338537216187, + "learning_rate": 9.99927821528676e-06, + "loss": 0.4864, + "step": 970 + }, + { + "epoch": 0.5249594521535412, + "grad_norm": 0.5041794180870056, + "learning_rate": 9.999245780344758e-06, + "loss": 0.4982, + "step": 971 + }, + { + "epoch": 0.5255000901063255, + "grad_norm": 0.5856842994689941, + "learning_rate": 9.999212632619356e-06, + "loss": 0.4844, + "step": 972 + }, + { + "epoch": 0.5260407280591097, + "grad_norm": 0.46322301030158997, + "learning_rate": 9.999178772115279e-06, + "loss": 0.5029, + "step": 973 + }, + { + "epoch": 0.526581366011894, + "grad_norm": 0.6072745323181152, + "learning_rate": 9.999144198837358e-06, + "loss": 0.4939, + "step": 974 + }, + { + "epoch": 0.5271220039646783, + "grad_norm": 0.49522969126701355, + "learning_rate": 9.999108912790521e-06, + "loss": 0.4633, + "step": 975 + }, + { + "epoch": 0.5276626419174626, + "grad_norm": 0.48789697885513306, + "learning_rate": 9.9990729139798e-06, + "loss": 0.5224, + "step": 976 + }, + { + "epoch": 0.5282032798702468, + "grad_norm": 0.5480666160583496, + "learning_rate": 9.999036202410324e-06, + "loss": 0.5199, + "step": 977 + }, + { + "epoch": 0.5287439178230312, + "grad_norm": 0.43815743923187256, + "learning_rate": 9.998998778087333e-06, + "loss": 0.4733, + "step": 978 + }, + { + "epoch": 0.5292845557758155, + "grad_norm": 0.41781720519065857, + "learning_rate": 9.99896064101616e-06, + "loss": 0.4953, + "step": 979 + }, + { + "epoch": 0.5298251937285997, + "grad_norm": 0.5263339877128601, + "learning_rate": 9.99892179120224e-06, + "loss": 0.5135, + "step": 980 + }, + { + "epoch": 0.5303658316813841, + "grad_norm": 0.43066519498825073, + "learning_rate": 9.998882228651117e-06, + "loss": 0.4939, + "step": 981 + }, + { + "epoch": 0.5309064696341683, + "grad_norm": 0.4255053400993347, + "learning_rate": 9.99884195336843e-06, + "loss": 0.4641, + "step": 982 + }, + { + "epoch": 0.5314471075869526, + "grad_norm": 0.4449794292449951, + "learning_rate": 9.998800965359918e-06, + "loss": 0.4738, + "step": 983 + }, + { + "epoch": 0.5319877455397369, + "grad_norm": 0.378813773393631, + "learning_rate": 9.99875926463143e-06, + "loss": 0.4687, + "step": 984 + }, + { + "epoch": 0.5325283834925212, + "grad_norm": 0.4286113679409027, + "learning_rate": 9.99871685118891e-06, + "loss": 0.4721, + "step": 985 + }, + { + "epoch": 0.5330690214453054, + "grad_norm": 0.37922412157058716, + "learning_rate": 9.998673725038401e-06, + "loss": 0.4848, + "step": 986 + }, + { + "epoch": 0.5336096593980898, + "grad_norm": 0.3998044729232788, + "learning_rate": 9.998629886186058e-06, + "loss": 0.4849, + "step": 987 + }, + { + "epoch": 0.534150297350874, + "grad_norm": 0.40176922082901, + "learning_rate": 9.998585334638128e-06, + "loss": 0.5059, + "step": 988 + }, + { + "epoch": 0.5346909353036583, + "grad_norm": 0.37144216895103455, + "learning_rate": 9.998540070400966e-06, + "loss": 0.5057, + "step": 989 + }, + { + "epoch": 0.5352315732564425, + "grad_norm": 0.3971169888973236, + "learning_rate": 9.998494093481022e-06, + "loss": 0.4729, + "step": 990 + }, + { + "epoch": 0.5357722112092269, + "grad_norm": 0.39043352007865906, + "learning_rate": 9.998447403884853e-06, + "loss": 0.504, + "step": 991 + }, + { + "epoch": 0.5363128491620112, + "grad_norm": 0.41439974308013916, + "learning_rate": 9.998400001619116e-06, + "loss": 0.4721, + "step": 992 + }, + { + "epoch": 0.5368534871147954, + "grad_norm": 0.4104996621608734, + "learning_rate": 9.998351886690569e-06, + "loss": 0.4801, + "step": 993 + }, + { + "epoch": 0.5373941250675798, + "grad_norm": 0.5444105863571167, + "learning_rate": 9.998303059106073e-06, + "loss": 0.4981, + "step": 994 + }, + { + "epoch": 0.537934763020364, + "grad_norm": 0.3640936613082886, + "learning_rate": 9.998253518872592e-06, + "loss": 0.4988, + "step": 995 + }, + { + "epoch": 0.5384754009731483, + "grad_norm": 0.456035852432251, + "learning_rate": 9.998203265997184e-06, + "loss": 0.4709, + "step": 996 + }, + { + "epoch": 0.5390160389259326, + "grad_norm": 0.403956800699234, + "learning_rate": 9.998152300487016e-06, + "loss": 0.4912, + "step": 997 + }, + { + "epoch": 0.5395566768787169, + "grad_norm": 0.40325304865837097, + "learning_rate": 9.998100622349355e-06, + "loss": 0.5039, + "step": 998 + }, + { + "epoch": 0.5400973148315011, + "grad_norm": 0.3739657402038574, + "learning_rate": 9.998048231591572e-06, + "loss": 0.4892, + "step": 999 + }, + { + "epoch": 0.5406379527842855, + "grad_norm": 0.4549963176250458, + "learning_rate": 9.997995128221131e-06, + "loss": 0.5058, + "step": 1000 + }, + { + "epoch": 0.5411785907370698, + "grad_norm": 0.44868186116218567, + "learning_rate": 9.99794131224561e-06, + "loss": 0.4867, + "step": 1001 + }, + { + "epoch": 0.541719228689854, + "grad_norm": 0.43002334237098694, + "learning_rate": 9.997886783672677e-06, + "loss": 0.4481, + "step": 1002 + }, + { + "epoch": 0.5422598666426384, + "grad_norm": 0.4119184613227844, + "learning_rate": 9.997831542510107e-06, + "loss": 0.4738, + "step": 1003 + }, + { + "epoch": 0.5428005045954226, + "grad_norm": 0.38203638792037964, + "learning_rate": 9.997775588765779e-06, + "loss": 0.4523, + "step": 1004 + }, + { + "epoch": 0.5433411425482069, + "grad_norm": 0.4788086414337158, + "learning_rate": 9.997718922447669e-06, + "loss": 0.4946, + "step": 1005 + }, + { + "epoch": 0.5438817805009911, + "grad_norm": 0.38479748368263245, + "learning_rate": 9.997661543563855e-06, + "loss": 0.4975, + "step": 1006 + }, + { + "epoch": 0.5444224184537755, + "grad_norm": 0.4428192973136902, + "learning_rate": 9.99760345212252e-06, + "loss": 0.4737, + "step": 1007 + }, + { + "epoch": 0.5449630564065597, + "grad_norm": 0.36287838220596313, + "learning_rate": 9.997544648131946e-06, + "loss": 0.4715, + "step": 1008 + }, + { + "epoch": 0.545503694359344, + "grad_norm": 0.45783060789108276, + "learning_rate": 9.997485131600517e-06, + "loss": 0.4992, + "step": 1009 + }, + { + "epoch": 0.5460443323121283, + "grad_norm": 0.4367043077945709, + "learning_rate": 9.99742490253672e-06, + "loss": 0.4823, + "step": 1010 + }, + { + "epoch": 0.5465849702649126, + "grad_norm": 0.4037528932094574, + "learning_rate": 9.99736396094914e-06, + "loss": 0.4802, + "step": 1011 + }, + { + "epoch": 0.5471256082176968, + "grad_norm": 0.4337640702724457, + "learning_rate": 9.997302306846468e-06, + "loss": 0.4825, + "step": 1012 + }, + { + "epoch": 0.5476662461704812, + "grad_norm": 0.3727249205112457, + "learning_rate": 9.997239940237495e-06, + "loss": 0.5, + "step": 1013 + }, + { + "epoch": 0.5482068841232655, + "grad_norm": 0.46426814794540405, + "learning_rate": 9.99717686113111e-06, + "loss": 0.5231, + "step": 1014 + }, + { + "epoch": 0.5487475220760497, + "grad_norm": 0.41532108187675476, + "learning_rate": 9.99711306953631e-06, + "loss": 0.4877, + "step": 1015 + }, + { + "epoch": 0.5492881600288341, + "grad_norm": 0.404090940952301, + "learning_rate": 9.997048565462188e-06, + "loss": 0.4797, + "step": 1016 + }, + { + "epoch": 0.5498287979816183, + "grad_norm": 0.37468329071998596, + "learning_rate": 9.996983348917941e-06, + "loss": 0.4654, + "step": 1017 + }, + { + "epoch": 0.5503694359344026, + "grad_norm": 0.42747071385383606, + "learning_rate": 9.996917419912869e-06, + "loss": 0.4719, + "step": 1018 + }, + { + "epoch": 0.5509100738871869, + "grad_norm": 0.38618093729019165, + "learning_rate": 9.996850778456371e-06, + "loss": 0.486, + "step": 1019 + }, + { + "epoch": 0.5514507118399712, + "grad_norm": 0.4205274283885956, + "learning_rate": 9.99678342455795e-06, + "loss": 0.4851, + "step": 1020 + }, + { + "epoch": 0.5519913497927554, + "grad_norm": 0.39830052852630615, + "learning_rate": 9.996715358227208e-06, + "loss": 0.4658, + "step": 1021 + }, + { + "epoch": 0.5525319877455397, + "grad_norm": 0.4612605571746826, + "learning_rate": 9.996646579473848e-06, + "loss": 0.5131, + "step": 1022 + }, + { + "epoch": 0.553072625698324, + "grad_norm": 0.3865067958831787, + "learning_rate": 9.99657708830768e-06, + "loss": 0.4773, + "step": 1023 + }, + { + "epoch": 0.5536132636511083, + "grad_norm": 0.48155269026756287, + "learning_rate": 9.99650688473861e-06, + "loss": 0.4887, + "step": 1024 + }, + { + "epoch": 0.5541539016038926, + "grad_norm": 0.4000060558319092, + "learning_rate": 9.996435968776646e-06, + "loss": 0.4736, + "step": 1025 + }, + { + "epoch": 0.5546945395566769, + "grad_norm": 0.3519155979156494, + "learning_rate": 9.9963643404319e-06, + "loss": 0.495, + "step": 1026 + }, + { + "epoch": 0.5552351775094612, + "grad_norm": 0.4599016010761261, + "learning_rate": 9.99629199971459e-06, + "loss": 0.5075, + "step": 1027 + }, + { + "epoch": 0.5557758154622454, + "grad_norm": 0.4261663258075714, + "learning_rate": 9.996218946635021e-06, + "loss": 0.4885, + "step": 1028 + }, + { + "epoch": 0.5563164534150298, + "grad_norm": 0.433912992477417, + "learning_rate": 9.996145181203616e-06, + "loss": 0.5027, + "step": 1029 + }, + { + "epoch": 0.556857091367814, + "grad_norm": 0.4443386495113373, + "learning_rate": 9.996070703430888e-06, + "loss": 0.4919, + "step": 1030 + }, + { + "epoch": 0.5573977293205983, + "grad_norm": 0.36704230308532715, + "learning_rate": 9.995995513327459e-06, + "loss": 0.4832, + "step": 1031 + }, + { + "epoch": 0.5579383672733826, + "grad_norm": 0.41120779514312744, + "learning_rate": 9.995919610904045e-06, + "loss": 0.4813, + "step": 1032 + }, + { + "epoch": 0.5584790052261669, + "grad_norm": 0.42510706186294556, + "learning_rate": 9.995842996171475e-06, + "loss": 0.4859, + "step": 1033 + }, + { + "epoch": 0.5590196431789511, + "grad_norm": 0.38486090302467346, + "learning_rate": 9.995765669140668e-06, + "loss": 0.4739, + "step": 1034 + }, + { + "epoch": 0.5595602811317354, + "grad_norm": 0.42321518063545227, + "learning_rate": 9.995687629822647e-06, + "loss": 0.5091, + "step": 1035 + }, + { + "epoch": 0.5601009190845198, + "grad_norm": 0.4389384388923645, + "learning_rate": 9.995608878228544e-06, + "loss": 0.4745, + "step": 1036 + }, + { + "epoch": 0.560641557037304, + "grad_norm": 0.41088220477104187, + "learning_rate": 9.995529414369582e-06, + "loss": 0.4879, + "step": 1037 + }, + { + "epoch": 0.5611821949900883, + "grad_norm": 0.4821391999721527, + "learning_rate": 9.995449238257097e-06, + "loss": 0.4993, + "step": 1038 + }, + { + "epoch": 0.5617228329428726, + "grad_norm": 0.39829981327056885, + "learning_rate": 9.995368349902514e-06, + "loss": 0.4885, + "step": 1039 + }, + { + "epoch": 0.5622634708956569, + "grad_norm": 0.48630595207214355, + "learning_rate": 9.99528674931737e-06, + "loss": 0.4893, + "step": 1040 + }, + { + "epoch": 0.5628041088484411, + "grad_norm": 0.3742629587650299, + "learning_rate": 9.9952044365133e-06, + "loss": 0.4858, + "step": 1041 + }, + { + "epoch": 0.5633447468012255, + "grad_norm": 0.4473123550415039, + "learning_rate": 9.995121411502037e-06, + "loss": 0.4932, + "step": 1042 + }, + { + "epoch": 0.5638853847540097, + "grad_norm": 0.44922560453414917, + "learning_rate": 9.995037674295419e-06, + "loss": 0.5062, + "step": 1043 + }, + { + "epoch": 0.564426022706794, + "grad_norm": 0.43669557571411133, + "learning_rate": 9.994953224905387e-06, + "loss": 0.4833, + "step": 1044 + }, + { + "epoch": 0.5649666606595783, + "grad_norm": 0.4496200382709503, + "learning_rate": 9.99486806334398e-06, + "loss": 0.4857, + "step": 1045 + }, + { + "epoch": 0.5655072986123626, + "grad_norm": 0.47932863235473633, + "learning_rate": 9.994782189623342e-06, + "loss": 0.4964, + "step": 1046 + }, + { + "epoch": 0.5660479365651468, + "grad_norm": 0.41345056891441345, + "learning_rate": 9.994695603755714e-06, + "loss": 0.4876, + "step": 1047 + }, + { + "epoch": 0.5665885745179312, + "grad_norm": 0.38490182161331177, + "learning_rate": 9.994608305753443e-06, + "loss": 0.5018, + "step": 1048 + }, + { + "epoch": 0.5671292124707155, + "grad_norm": 0.4293023943901062, + "learning_rate": 9.994520295628976e-06, + "loss": 0.4534, + "step": 1049 + }, + { + "epoch": 0.5676698504234997, + "grad_norm": 0.4372541010379791, + "learning_rate": 9.994431573394861e-06, + "loss": 0.5271, + "step": 1050 + }, + { + "epoch": 0.568210488376284, + "grad_norm": 0.4250541031360626, + "learning_rate": 9.994342139063748e-06, + "loss": 0.4934, + "step": 1051 + }, + { + "epoch": 0.5687511263290683, + "grad_norm": 0.4278985261917114, + "learning_rate": 9.994251992648386e-06, + "loss": 0.5077, + "step": 1052 + }, + { + "epoch": 0.5692917642818526, + "grad_norm": 0.4010027348995209, + "learning_rate": 9.994161134161635e-06, + "loss": 0.4575, + "step": 1053 + }, + { + "epoch": 0.5698324022346368, + "grad_norm": 0.40400251746177673, + "learning_rate": 9.99406956361644e-06, + "loss": 0.4913, + "step": 1054 + }, + { + "epoch": 0.5703730401874212, + "grad_norm": 0.42402681708335876, + "learning_rate": 9.993977281025862e-06, + "loss": 0.4637, + "step": 1055 + }, + { + "epoch": 0.5709136781402054, + "grad_norm": 0.5451776385307312, + "learning_rate": 9.99388428640306e-06, + "loss": 0.5021, + "step": 1056 + }, + { + "epoch": 0.5714543160929897, + "grad_norm": 0.40865638852119446, + "learning_rate": 9.99379057976129e-06, + "loss": 0.4933, + "step": 1057 + }, + { + "epoch": 0.571994954045774, + "grad_norm": 0.457672655582428, + "learning_rate": 9.993696161113913e-06, + "loss": 0.4834, + "step": 1058 + }, + { + "epoch": 0.5725355919985583, + "grad_norm": 0.4288524389266968, + "learning_rate": 9.993601030474392e-06, + "loss": 0.5025, + "step": 1059 + }, + { + "epoch": 0.5730762299513426, + "grad_norm": 0.4974946081638336, + "learning_rate": 9.993505187856289e-06, + "loss": 0.494, + "step": 1060 + }, + { + "epoch": 0.5736168679041269, + "grad_norm": 0.4515710771083832, + "learning_rate": 9.99340863327327e-06, + "loss": 0.4805, + "step": 1061 + }, + { + "epoch": 0.5741575058569112, + "grad_norm": 0.44452914595603943, + "learning_rate": 9.993311366739103e-06, + "loss": 0.4488, + "step": 1062 + }, + { + "epoch": 0.5746981438096954, + "grad_norm": 0.447654664516449, + "learning_rate": 9.993213388267653e-06, + "loss": 0.454, + "step": 1063 + }, + { + "epoch": 0.5752387817624798, + "grad_norm": 0.5111740827560425, + "learning_rate": 9.993114697872894e-06, + "loss": 0.4944, + "step": 1064 + }, + { + "epoch": 0.575779419715264, + "grad_norm": 0.41599783301353455, + "learning_rate": 9.993015295568893e-06, + "loss": 0.4739, + "step": 1065 + }, + { + "epoch": 0.5763200576680483, + "grad_norm": 0.4951651990413666, + "learning_rate": 9.992915181369823e-06, + "loss": 0.4785, + "step": 1066 + }, + { + "epoch": 0.5768606956208325, + "grad_norm": 0.45784544944763184, + "learning_rate": 9.99281435528996e-06, + "loss": 0.4757, + "step": 1067 + }, + { + "epoch": 0.5774013335736169, + "grad_norm": 0.45814573764801025, + "learning_rate": 9.99271281734368e-06, + "loss": 0.4734, + "step": 1068 + }, + { + "epoch": 0.5779419715264011, + "grad_norm": 0.422540545463562, + "learning_rate": 9.992610567545458e-06, + "loss": 0.4826, + "step": 1069 + }, + { + "epoch": 0.5784826094791854, + "grad_norm": 0.3874336779117584, + "learning_rate": 9.992507605909873e-06, + "loss": 0.4722, + "step": 1070 + }, + { + "epoch": 0.5790232474319698, + "grad_norm": 0.42774495482444763, + "learning_rate": 9.992403932451605e-06, + "loss": 0.4768, + "step": 1071 + }, + { + "epoch": 0.579563885384754, + "grad_norm": 0.4025377035140991, + "learning_rate": 9.992299547185439e-06, + "loss": 0.4956, + "step": 1072 + }, + { + "epoch": 0.5801045233375383, + "grad_norm": 0.3693923354148865, + "learning_rate": 9.992194450126252e-06, + "loss": 0.4696, + "step": 1073 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.3981197476387024, + "learning_rate": 9.992088641289033e-06, + "loss": 0.4505, + "step": 1074 + }, + { + "epoch": 0.5811857992431069, + "grad_norm": 0.4446949362754822, + "learning_rate": 9.991982120688865e-06, + "loss": 0.4714, + "step": 1075 + }, + { + "epoch": 0.5817264371958911, + "grad_norm": 0.4017349183559418, + "learning_rate": 9.99187488834094e-06, + "loss": 0.4514, + "step": 1076 + }, + { + "epoch": 0.5822670751486755, + "grad_norm": 0.4286927282810211, + "learning_rate": 9.991766944260544e-06, + "loss": 0.479, + "step": 1077 + }, + { + "epoch": 0.5828077131014597, + "grad_norm": 0.41668716073036194, + "learning_rate": 9.991658288463067e-06, + "loss": 0.4992, + "step": 1078 + }, + { + "epoch": 0.583348351054244, + "grad_norm": 0.4160311222076416, + "learning_rate": 9.991548920964001e-06, + "loss": 0.5021, + "step": 1079 + }, + { + "epoch": 0.5838889890070283, + "grad_norm": 0.46067795157432556, + "learning_rate": 9.99143884177894e-06, + "loss": 0.5154, + "step": 1080 + }, + { + "epoch": 0.5844296269598126, + "grad_norm": 0.38376685976982117, + "learning_rate": 9.99132805092358e-06, + "loss": 0.4836, + "step": 1081 + }, + { + "epoch": 0.5849702649125968, + "grad_norm": 0.37903881072998047, + "learning_rate": 9.991216548413715e-06, + "loss": 0.5008, + "step": 1082 + }, + { + "epoch": 0.5855109028653811, + "grad_norm": 0.41807129979133606, + "learning_rate": 9.991104334265246e-06, + "loss": 0.4646, + "step": 1083 + }, + { + "epoch": 0.5860515408181655, + "grad_norm": 0.46291136741638184, + "learning_rate": 9.99099140849417e-06, + "loss": 0.4829, + "step": 1084 + }, + { + "epoch": 0.5865921787709497, + "grad_norm": 0.4093681573867798, + "learning_rate": 9.990877771116588e-06, + "loss": 0.465, + "step": 1085 + }, + { + "epoch": 0.587132816723734, + "grad_norm": 0.4017224609851837, + "learning_rate": 9.990763422148703e-06, + "loss": 0.4684, + "step": 1086 + }, + { + "epoch": 0.5876734546765183, + "grad_norm": 0.49730250239372253, + "learning_rate": 9.990648361606815e-06, + "loss": 0.4807, + "step": 1087 + }, + { + "epoch": 0.5882140926293026, + "grad_norm": 0.4142555892467499, + "learning_rate": 9.990532589507336e-06, + "loss": 0.4974, + "step": 1088 + }, + { + "epoch": 0.5887547305820868, + "grad_norm": 0.4138866364955902, + "learning_rate": 9.990416105866768e-06, + "loss": 0.4708, + "step": 1089 + }, + { + "epoch": 0.5892953685348712, + "grad_norm": 0.4031008780002594, + "learning_rate": 9.99029891070172e-06, + "loss": 0.459, + "step": 1090 + }, + { + "epoch": 0.5898360064876554, + "grad_norm": 0.4528948664665222, + "learning_rate": 9.9901810040289e-06, + "loss": 0.481, + "step": 1091 + }, + { + "epoch": 0.5903766444404397, + "grad_norm": 0.4991556406021118, + "learning_rate": 9.990062385865121e-06, + "loss": 0.515, + "step": 1092 + }, + { + "epoch": 0.590917282393224, + "grad_norm": 0.39666545391082764, + "learning_rate": 9.989943056227294e-06, + "loss": 0.4718, + "step": 1093 + }, + { + "epoch": 0.5914579203460083, + "grad_norm": 0.4354439973831177, + "learning_rate": 9.989823015132433e-06, + "loss": 0.4642, + "step": 1094 + }, + { + "epoch": 0.5919985582987926, + "grad_norm": 0.4511450529098511, + "learning_rate": 9.989702262597656e-06, + "loss": 0.5067, + "step": 1095 + }, + { + "epoch": 0.5925391962515769, + "grad_norm": 0.42287683486938477, + "learning_rate": 9.989580798640175e-06, + "loss": 0.4525, + "step": 1096 + }, + { + "epoch": 0.5930798342043612, + "grad_norm": 0.38862234354019165, + "learning_rate": 9.98945862327731e-06, + "loss": 0.4897, + "step": 1097 + }, + { + "epoch": 0.5936204721571454, + "grad_norm": 0.46156349778175354, + "learning_rate": 9.989335736526483e-06, + "loss": 0.4876, + "step": 1098 + }, + { + "epoch": 0.5941611101099297, + "grad_norm": 0.37772494554519653, + "learning_rate": 9.989212138405213e-06, + "loss": 0.4678, + "step": 1099 + }, + { + "epoch": 0.594701748062714, + "grad_norm": 0.4161140024662018, + "learning_rate": 9.989087828931121e-06, + "loss": 0.4855, + "step": 1100 + }, + { + "epoch": 0.5952423860154983, + "grad_norm": 0.4513489007949829, + "learning_rate": 9.988962808121932e-06, + "loss": 0.5092, + "step": 1101 + }, + { + "epoch": 0.5957830239682825, + "grad_norm": 0.4317286014556885, + "learning_rate": 9.988837075995472e-06, + "loss": 0.4915, + "step": 1102 + }, + { + "epoch": 0.5963236619210669, + "grad_norm": 0.4530947804450989, + "learning_rate": 9.988710632569667e-06, + "loss": 0.4924, + "step": 1103 + }, + { + "epoch": 0.5968642998738511, + "grad_norm": 0.42525094747543335, + "learning_rate": 9.988583477862544e-06, + "loss": 0.5151, + "step": 1104 + }, + { + "epoch": 0.5974049378266354, + "grad_norm": 0.41204145550727844, + "learning_rate": 9.988455611892237e-06, + "loss": 0.4995, + "step": 1105 + }, + { + "epoch": 0.5979455757794198, + "grad_norm": 0.45464006066322327, + "learning_rate": 9.98832703467697e-06, + "loss": 0.4783, + "step": 1106 + }, + { + "epoch": 0.598486213732204, + "grad_norm": 0.3779187500476837, + "learning_rate": 9.98819774623508e-06, + "loss": 0.4559, + "step": 1107 + }, + { + "epoch": 0.5990268516849883, + "grad_norm": 0.4304542541503906, + "learning_rate": 9.988067746584999e-06, + "loss": 0.4827, + "step": 1108 + }, + { + "epoch": 0.5995674896377726, + "grad_norm": 0.4540359079837799, + "learning_rate": 9.987937035745264e-06, + "loss": 0.471, + "step": 1109 + }, + { + "epoch": 0.6001081275905569, + "grad_norm": 0.4767281115055084, + "learning_rate": 9.987805613734508e-06, + "loss": 0.4547, + "step": 1110 + }, + { + "epoch": 0.6006487655433411, + "grad_norm": 0.40696877241134644, + "learning_rate": 9.987673480571472e-06, + "loss": 0.4869, + "step": 1111 + }, + { + "epoch": 0.6011894034961254, + "grad_norm": 0.3924008011817932, + "learning_rate": 9.987540636274995e-06, + "loss": 0.4839, + "step": 1112 + }, + { + "epoch": 0.6017300414489097, + "grad_norm": 0.40512779355049133, + "learning_rate": 9.987407080864017e-06, + "loss": 0.4998, + "step": 1113 + }, + { + "epoch": 0.602270679401694, + "grad_norm": 0.4261493384838104, + "learning_rate": 9.987272814357579e-06, + "loss": 0.5025, + "step": 1114 + }, + { + "epoch": 0.6028113173544782, + "grad_norm": 0.4489496648311615, + "learning_rate": 9.987137836774827e-06, + "loss": 0.4764, + "step": 1115 + }, + { + "epoch": 0.6033519553072626, + "grad_norm": 0.3959837257862091, + "learning_rate": 9.987002148135004e-06, + "loss": 0.488, + "step": 1116 + }, + { + "epoch": 0.6038925932600469, + "grad_norm": 0.4422343969345093, + "learning_rate": 9.986865748457457e-06, + "loss": 0.4584, + "step": 1117 + }, + { + "epoch": 0.6044332312128311, + "grad_norm": 0.4124646484851837, + "learning_rate": 9.986728637761632e-06, + "loss": 0.4802, + "step": 1118 + }, + { + "epoch": 0.6049738691656155, + "grad_norm": 0.4822395443916321, + "learning_rate": 9.98659081606708e-06, + "loss": 0.456, + "step": 1119 + }, + { + "epoch": 0.6055145071183997, + "grad_norm": 0.3804273307323456, + "learning_rate": 9.986452283393452e-06, + "loss": 0.4689, + "step": 1120 + }, + { + "epoch": 0.606055145071184, + "grad_norm": 0.4936402142047882, + "learning_rate": 9.986313039760497e-06, + "loss": 0.4651, + "step": 1121 + }, + { + "epoch": 0.6065957830239683, + "grad_norm": 0.40174707770347595, + "learning_rate": 9.98617308518807e-06, + "loss": 0.5118, + "step": 1122 + }, + { + "epoch": 0.6071364209767526, + "grad_norm": 0.4455929398536682, + "learning_rate": 9.986032419696126e-06, + "loss": 0.4786, + "step": 1123 + }, + { + "epoch": 0.6076770589295368, + "grad_norm": 0.4902845323085785, + "learning_rate": 9.985891043304718e-06, + "loss": 0.4959, + "step": 1124 + }, + { + "epoch": 0.6082176968823212, + "grad_norm": 0.4242980480194092, + "learning_rate": 9.985748956034007e-06, + "loss": 0.4847, + "step": 1125 + }, + { + "epoch": 0.6087583348351054, + "grad_norm": 0.4486798942089081, + "learning_rate": 9.985606157904249e-06, + "loss": 0.5114, + "step": 1126 + }, + { + "epoch": 0.6092989727878897, + "grad_norm": 0.46665695309638977, + "learning_rate": 9.985462648935802e-06, + "loss": 0.4665, + "step": 1127 + }, + { + "epoch": 0.6098396107406739, + "grad_norm": 0.3827839195728302, + "learning_rate": 9.985318429149133e-06, + "loss": 0.4718, + "step": 1128 + }, + { + "epoch": 0.6103802486934583, + "grad_norm": 0.5129957795143127, + "learning_rate": 9.985173498564799e-06, + "loss": 0.5015, + "step": 1129 + }, + { + "epoch": 0.6109208866462426, + "grad_norm": 0.44158732891082764, + "learning_rate": 9.985027857203469e-06, + "loss": 0.5041, + "step": 1130 + }, + { + "epoch": 0.6114615245990268, + "grad_norm": 0.4281488060951233, + "learning_rate": 9.984881505085904e-06, + "loss": 0.4996, + "step": 1131 + }, + { + "epoch": 0.6120021625518112, + "grad_norm": 0.4399912655353546, + "learning_rate": 9.984734442232972e-06, + "loss": 0.5111, + "step": 1132 + }, + { + "epoch": 0.6125428005045954, + "grad_norm": 0.4394729733467102, + "learning_rate": 9.984586668665641e-06, + "loss": 0.4912, + "step": 1133 + }, + { + "epoch": 0.6130834384573797, + "grad_norm": 0.4478180408477783, + "learning_rate": 9.984438184404981e-06, + "loss": 0.4768, + "step": 1134 + }, + { + "epoch": 0.613624076410164, + "grad_norm": 0.41149696707725525, + "learning_rate": 9.984288989472162e-06, + "loss": 0.5033, + "step": 1135 + }, + { + "epoch": 0.6141647143629483, + "grad_norm": 0.4182077646255493, + "learning_rate": 9.984139083888454e-06, + "loss": 0.4516, + "step": 1136 + }, + { + "epoch": 0.6147053523157325, + "grad_norm": 0.4762760400772095, + "learning_rate": 9.983988467675234e-06, + "loss": 0.5038, + "step": 1137 + }, + { + "epoch": 0.6152459902685169, + "grad_norm": 0.456305593252182, + "learning_rate": 9.983837140853977e-06, + "loss": 0.4756, + "step": 1138 + }, + { + "epoch": 0.6157866282213011, + "grad_norm": 0.4571511745452881, + "learning_rate": 9.983685103446253e-06, + "loss": 0.4954, + "step": 1139 + }, + { + "epoch": 0.6163272661740854, + "grad_norm": 0.45587003231048584, + "learning_rate": 9.983532355473744e-06, + "loss": 0.4619, + "step": 1140 + }, + { + "epoch": 0.6168679041268698, + "grad_norm": 0.426291286945343, + "learning_rate": 9.983378896958228e-06, + "loss": 0.499, + "step": 1141 + }, + { + "epoch": 0.617408542079654, + "grad_norm": 0.4334041178226471, + "learning_rate": 9.983224727921584e-06, + "loss": 0.4913, + "step": 1142 + }, + { + "epoch": 0.6179491800324383, + "grad_norm": 0.38864368200302124, + "learning_rate": 9.983069848385794e-06, + "loss": 0.4726, + "step": 1143 + }, + { + "epoch": 0.6184898179852225, + "grad_norm": 0.43272289633750916, + "learning_rate": 9.982914258372939e-06, + "loss": 0.4992, + "step": 1144 + }, + { + "epoch": 0.6190304559380069, + "grad_norm": 0.47826868295669556, + "learning_rate": 9.982757957905204e-06, + "loss": 0.4701, + "step": 1145 + }, + { + "epoch": 0.6195710938907911, + "grad_norm": 0.40173253417015076, + "learning_rate": 9.982600947004875e-06, + "loss": 0.5083, + "step": 1146 + }, + { + "epoch": 0.6201117318435754, + "grad_norm": 0.42841464281082153, + "learning_rate": 9.982443225694335e-06, + "loss": 0.4963, + "step": 1147 + }, + { + "epoch": 0.6206523697963597, + "grad_norm": 0.48060914874076843, + "learning_rate": 9.982284793996075e-06, + "loss": 0.4913, + "step": 1148 + }, + { + "epoch": 0.621193007749144, + "grad_norm": 0.37836718559265137, + "learning_rate": 9.982125651932681e-06, + "loss": 0.4872, + "step": 1149 + }, + { + "epoch": 0.6217336457019282, + "grad_norm": 0.39764174818992615, + "learning_rate": 9.981965799526846e-06, + "loss": 0.4615, + "step": 1150 + }, + { + "epoch": 0.6222742836547126, + "grad_norm": 0.4169751703739166, + "learning_rate": 9.981805236801359e-06, + "loss": 0.4836, + "step": 1151 + }, + { + "epoch": 0.6228149216074969, + "grad_norm": 0.38069164752960205, + "learning_rate": 9.981643963779116e-06, + "loss": 0.489, + "step": 1152 + }, + { + "epoch": 0.6233555595602811, + "grad_norm": 0.41259583830833435, + "learning_rate": 9.981481980483107e-06, + "loss": 0.5223, + "step": 1153 + }, + { + "epoch": 0.6238961975130655, + "grad_norm": 0.44381216168403625, + "learning_rate": 9.98131928693643e-06, + "loss": 0.4799, + "step": 1154 + }, + { + "epoch": 0.6244368354658497, + "grad_norm": 0.37238988280296326, + "learning_rate": 9.981155883162281e-06, + "loss": 0.4953, + "step": 1155 + }, + { + "epoch": 0.624977473418634, + "grad_norm": 0.4142807424068451, + "learning_rate": 9.980991769183957e-06, + "loss": 0.454, + "step": 1156 + }, + { + "epoch": 0.6255181113714183, + "grad_norm": 0.38495945930480957, + "learning_rate": 9.980826945024858e-06, + "loss": 0.478, + "step": 1157 + }, + { + "epoch": 0.6260587493242026, + "grad_norm": 0.4185667335987091, + "learning_rate": 9.980661410708484e-06, + "loss": 0.4915, + "step": 1158 + }, + { + "epoch": 0.6265993872769868, + "grad_norm": 0.3721354305744171, + "learning_rate": 9.980495166258437e-06, + "loss": 0.4879, + "step": 1159 + }, + { + "epoch": 0.6271400252297711, + "grad_norm": 0.3885502219200134, + "learning_rate": 9.980328211698418e-06, + "loss": 0.4821, + "step": 1160 + }, + { + "epoch": 0.6276806631825554, + "grad_norm": 0.38209933042526245, + "learning_rate": 9.980160547052233e-06, + "loss": 0.4753, + "step": 1161 + }, + { + "epoch": 0.6282213011353397, + "grad_norm": 0.3807179033756256, + "learning_rate": 9.979992172343789e-06, + "loss": 0.4965, + "step": 1162 + }, + { + "epoch": 0.628761939088124, + "grad_norm": 0.4110669493675232, + "learning_rate": 9.979823087597088e-06, + "loss": 0.5032, + "step": 1163 + }, + { + "epoch": 0.6293025770409083, + "grad_norm": 0.3786408007144928, + "learning_rate": 9.97965329283624e-06, + "loss": 0.4762, + "step": 1164 + }, + { + "epoch": 0.6298432149936926, + "grad_norm": 0.3834894597530365, + "learning_rate": 9.979482788085455e-06, + "loss": 0.4709, + "step": 1165 + }, + { + "epoch": 0.6303838529464768, + "grad_norm": 0.39199164509773254, + "learning_rate": 9.979311573369044e-06, + "loss": 0.4881, + "step": 1166 + }, + { + "epoch": 0.6309244908992612, + "grad_norm": 0.40243616700172424, + "learning_rate": 9.979139648711415e-06, + "loss": 0.4584, + "step": 1167 + }, + { + "epoch": 0.6314651288520454, + "grad_norm": 0.42804813385009766, + "learning_rate": 9.978967014137082e-06, + "loss": 0.4939, + "step": 1168 + }, + { + "epoch": 0.6320057668048297, + "grad_norm": 0.3774058222770691, + "learning_rate": 9.978793669670661e-06, + "loss": 0.4811, + "step": 1169 + }, + { + "epoch": 0.632546404757614, + "grad_norm": 0.36529508233070374, + "learning_rate": 9.978619615336866e-06, + "loss": 0.4537, + "step": 1170 + }, + { + "epoch": 0.6330870427103983, + "grad_norm": 0.3665902018547058, + "learning_rate": 9.978444851160511e-06, + "loss": 0.4753, + "step": 1171 + }, + { + "epoch": 0.6336276806631825, + "grad_norm": 0.3722003400325775, + "learning_rate": 9.978269377166517e-06, + "loss": 0.4888, + "step": 1172 + }, + { + "epoch": 0.6341683186159668, + "grad_norm": 0.3868032693862915, + "learning_rate": 9.978093193379901e-06, + "loss": 0.4977, + "step": 1173 + }, + { + "epoch": 0.6347089565687511, + "grad_norm": 0.3496233820915222, + "learning_rate": 9.977916299825786e-06, + "loss": 0.4864, + "step": 1174 + }, + { + "epoch": 0.6352495945215354, + "grad_norm": 0.3908425569534302, + "learning_rate": 9.977738696529387e-06, + "loss": 0.5026, + "step": 1175 + }, + { + "epoch": 0.6357902324743196, + "grad_norm": 0.39688825607299805, + "learning_rate": 9.97756038351603e-06, + "loss": 0.4768, + "step": 1176 + }, + { + "epoch": 0.636330870427104, + "grad_norm": 0.5034068822860718, + "learning_rate": 9.97738136081114e-06, + "loss": 0.5099, + "step": 1177 + }, + { + "epoch": 0.6368715083798883, + "grad_norm": 0.3846215307712555, + "learning_rate": 9.97720162844024e-06, + "loss": 0.4817, + "step": 1178 + }, + { + "epoch": 0.6374121463326725, + "grad_norm": 0.39605143666267395, + "learning_rate": 9.977021186428955e-06, + "loss": 0.4495, + "step": 1179 + }, + { + "epoch": 0.6379527842854569, + "grad_norm": 0.4012170732021332, + "learning_rate": 9.976840034803014e-06, + "loss": 0.4809, + "step": 1180 + }, + { + "epoch": 0.6384934222382411, + "grad_norm": 0.3802087604999542, + "learning_rate": 9.976658173588244e-06, + "loss": 0.4671, + "step": 1181 + }, + { + "epoch": 0.6390340601910254, + "grad_norm": 0.36879563331604004, + "learning_rate": 9.976475602810575e-06, + "loss": 0.4917, + "step": 1182 + }, + { + "epoch": 0.6395746981438097, + "grad_norm": 0.40868595242500305, + "learning_rate": 9.976292322496037e-06, + "loss": 0.4921, + "step": 1183 + }, + { + "epoch": 0.640115336096594, + "grad_norm": 0.39740028977394104, + "learning_rate": 9.976108332670763e-06, + "loss": 0.4939, + "step": 1184 + }, + { + "epoch": 0.6406559740493782, + "grad_norm": 0.386565625667572, + "learning_rate": 9.975923633360985e-06, + "loss": 0.4817, + "step": 1185 + }, + { + "epoch": 0.6411966120021626, + "grad_norm": 0.43585434556007385, + "learning_rate": 9.975738224593036e-06, + "loss": 0.4609, + "step": 1186 + }, + { + "epoch": 0.6417372499549469, + "grad_norm": 0.43665891885757446, + "learning_rate": 9.975552106393356e-06, + "loss": 0.4849, + "step": 1187 + }, + { + "epoch": 0.6422778879077311, + "grad_norm": 0.35513120889663696, + "learning_rate": 9.975365278788474e-06, + "loss": 0.4719, + "step": 1188 + }, + { + "epoch": 0.6428185258605154, + "grad_norm": 0.3776432275772095, + "learning_rate": 9.975177741805034e-06, + "loss": 0.48, + "step": 1189 + }, + { + "epoch": 0.6433591638132997, + "grad_norm": 0.4380965530872345, + "learning_rate": 9.974989495469771e-06, + "loss": 0.4909, + "step": 1190 + }, + { + "epoch": 0.643899801766084, + "grad_norm": 0.4393802583217621, + "learning_rate": 9.974800539809526e-06, + "loss": 0.5087, + "step": 1191 + }, + { + "epoch": 0.6444404397188682, + "grad_norm": 0.3763464689254761, + "learning_rate": 9.97461087485124e-06, + "loss": 0.4719, + "step": 1192 + }, + { + "epoch": 0.6449810776716526, + "grad_norm": 0.3885161578655243, + "learning_rate": 9.974420500621956e-06, + "loss": 0.457, + "step": 1193 + }, + { + "epoch": 0.6455217156244368, + "grad_norm": 0.4224901795387268, + "learning_rate": 9.974229417148815e-06, + "loss": 0.4776, + "step": 1194 + }, + { + "epoch": 0.6460623535772211, + "grad_norm": 0.45251429080963135, + "learning_rate": 9.974037624459063e-06, + "loss": 0.4674, + "step": 1195 + }, + { + "epoch": 0.6466029915300054, + "grad_norm": 0.34927940368652344, + "learning_rate": 9.973845122580046e-06, + "loss": 0.4562, + "step": 1196 + }, + { + "epoch": 0.6471436294827897, + "grad_norm": 0.4282715916633606, + "learning_rate": 9.973651911539209e-06, + "loss": 0.4931, + "step": 1197 + }, + { + "epoch": 0.647684267435574, + "grad_norm": 0.42950549721717834, + "learning_rate": 9.973457991364098e-06, + "loss": 0.4701, + "step": 1198 + }, + { + "epoch": 0.6482249053883583, + "grad_norm": 0.35387134552001953, + "learning_rate": 9.973263362082365e-06, + "loss": 0.4715, + "step": 1199 + }, + { + "epoch": 0.6487655433411426, + "grad_norm": 0.42833399772644043, + "learning_rate": 9.973068023721761e-06, + "loss": 0.4997, + "step": 1200 + }, + { + "epoch": 0.6493061812939268, + "grad_norm": 0.4325339198112488, + "learning_rate": 9.972871976310134e-06, + "loss": 0.4868, + "step": 1201 + }, + { + "epoch": 0.6498468192467112, + "grad_norm": 0.4114973843097687, + "learning_rate": 9.972675219875437e-06, + "loss": 0.505, + "step": 1202 + }, + { + "epoch": 0.6503874571994954, + "grad_norm": 0.40939682722091675, + "learning_rate": 9.972477754445723e-06, + "loss": 0.5068, + "step": 1203 + }, + { + "epoch": 0.6509280951522797, + "grad_norm": 0.4267498552799225, + "learning_rate": 9.972279580049145e-06, + "loss": 0.4905, + "step": 1204 + }, + { + "epoch": 0.6514687331050639, + "grad_norm": 0.44173136353492737, + "learning_rate": 9.972080696713962e-06, + "loss": 0.4844, + "step": 1205 + }, + { + "epoch": 0.6520093710578483, + "grad_norm": 0.3955075442790985, + "learning_rate": 9.971881104468527e-06, + "loss": 0.482, + "step": 1206 + }, + { + "epoch": 0.6525500090106325, + "grad_norm": 0.46882182359695435, + "learning_rate": 9.971680803341299e-06, + "loss": 0.465, + "step": 1207 + }, + { + "epoch": 0.6530906469634168, + "grad_norm": 0.4290851950645447, + "learning_rate": 9.971479793360837e-06, + "loss": 0.4676, + "step": 1208 + }, + { + "epoch": 0.6536312849162011, + "grad_norm": 0.3703151345252991, + "learning_rate": 9.9712780745558e-06, + "loss": 0.4683, + "step": 1209 + }, + { + "epoch": 0.6541719228689854, + "grad_norm": 0.43469929695129395, + "learning_rate": 9.971075646954946e-06, + "loss": 0.4984, + "step": 1210 + }, + { + "epoch": 0.6547125608217697, + "grad_norm": 0.4252399802207947, + "learning_rate": 9.970872510587142e-06, + "loss": 0.4777, + "step": 1211 + }, + { + "epoch": 0.655253198774554, + "grad_norm": 0.418330579996109, + "learning_rate": 9.970668665481347e-06, + "loss": 0.4934, + "step": 1212 + }, + { + "epoch": 0.6557938367273383, + "grad_norm": 0.4227907657623291, + "learning_rate": 9.970464111666627e-06, + "loss": 0.4675, + "step": 1213 + }, + { + "epoch": 0.6563344746801225, + "grad_norm": 0.42185381054878235, + "learning_rate": 9.970258849172146e-06, + "loss": 0.4926, + "step": 1214 + }, + { + "epoch": 0.6568751126329069, + "grad_norm": 0.4280652403831482, + "learning_rate": 9.970052878027169e-06, + "loss": 0.4808, + "step": 1215 + }, + { + "epoch": 0.6574157505856911, + "grad_norm": 0.36444810032844543, + "learning_rate": 9.969846198261063e-06, + "loss": 0.4805, + "step": 1216 + }, + { + "epoch": 0.6579563885384754, + "grad_norm": 0.40645283460617065, + "learning_rate": 9.9696388099033e-06, + "loss": 0.4702, + "step": 1217 + }, + { + "epoch": 0.6584970264912597, + "grad_norm": 0.3740103840827942, + "learning_rate": 9.969430712983443e-06, + "loss": 0.5026, + "step": 1218 + }, + { + "epoch": 0.659037664444044, + "grad_norm": 0.3753998875617981, + "learning_rate": 9.969221907531168e-06, + "loss": 0.455, + "step": 1219 + }, + { + "epoch": 0.6595783023968282, + "grad_norm": 0.3687969744205475, + "learning_rate": 9.969012393576241e-06, + "loss": 0.4831, + "step": 1220 + }, + { + "epoch": 0.6601189403496125, + "grad_norm": 0.42063668370246887, + "learning_rate": 9.968802171148537e-06, + "loss": 0.5077, + "step": 1221 + }, + { + "epoch": 0.6606595783023969, + "grad_norm": 0.41339847445487976, + "learning_rate": 9.968591240278028e-06, + "loss": 0.4857, + "step": 1222 + }, + { + "epoch": 0.6612002162551811, + "grad_norm": 0.3697742819786072, + "learning_rate": 9.96837960099479e-06, + "loss": 0.4841, + "step": 1223 + }, + { + "epoch": 0.6617408542079654, + "grad_norm": 0.40105369687080383, + "learning_rate": 9.968167253328995e-06, + "loss": 0.4776, + "step": 1224 + }, + { + "epoch": 0.6622814921607497, + "grad_norm": 0.3760954737663269, + "learning_rate": 9.967954197310922e-06, + "loss": 0.4687, + "step": 1225 + }, + { + "epoch": 0.662822130113534, + "grad_norm": 0.42928802967071533, + "learning_rate": 9.967740432970948e-06, + "loss": 0.4781, + "step": 1226 + }, + { + "epoch": 0.6633627680663182, + "grad_norm": 0.43400323390960693, + "learning_rate": 9.967525960339548e-06, + "loss": 0.4772, + "step": 1227 + }, + { + "epoch": 0.6639034060191026, + "grad_norm": 0.36896318197250366, + "learning_rate": 9.967310779447303e-06, + "loss": 0.4509, + "step": 1228 + }, + { + "epoch": 0.6644440439718868, + "grad_norm": 0.380281537771225, + "learning_rate": 9.967094890324894e-06, + "loss": 0.4915, + "step": 1229 + }, + { + "epoch": 0.6649846819246711, + "grad_norm": 0.46708944439888, + "learning_rate": 9.966878293003102e-06, + "loss": 0.491, + "step": 1230 + }, + { + "epoch": 0.6655253198774554, + "grad_norm": 0.397403746843338, + "learning_rate": 9.966660987512809e-06, + "loss": 0.4518, + "step": 1231 + }, + { + "epoch": 0.6660659578302397, + "grad_norm": 0.45788633823394775, + "learning_rate": 9.966442973884996e-06, + "loss": 0.4717, + "step": 1232 + }, + { + "epoch": 0.666606595783024, + "grad_norm": 0.3888041079044342, + "learning_rate": 9.96622425215075e-06, + "loss": 0.4547, + "step": 1233 + }, + { + "epoch": 0.6671472337358082, + "grad_norm": 0.46574699878692627, + "learning_rate": 9.966004822341254e-06, + "loss": 0.4882, + "step": 1234 + }, + { + "epoch": 0.6676878716885926, + "grad_norm": 0.40038323402404785, + "learning_rate": 9.965784684487794e-06, + "loss": 0.4799, + "step": 1235 + }, + { + "epoch": 0.6682285096413768, + "grad_norm": 0.3611610531806946, + "learning_rate": 9.965563838621758e-06, + "loss": 0.4567, + "step": 1236 + }, + { + "epoch": 0.6687691475941611, + "grad_norm": 0.4081619083881378, + "learning_rate": 9.965342284774633e-06, + "loss": 0.4668, + "step": 1237 + }, + { + "epoch": 0.6693097855469454, + "grad_norm": 0.44255727529525757, + "learning_rate": 9.965120022978007e-06, + "loss": 0.4829, + "step": 1238 + }, + { + "epoch": 0.6698504234997297, + "grad_norm": 0.38775256276130676, + "learning_rate": 9.96489705326357e-06, + "loss": 0.4827, + "step": 1239 + }, + { + "epoch": 0.6703910614525139, + "grad_norm": 0.45577195286750793, + "learning_rate": 9.964673375663114e-06, + "loss": 0.4854, + "step": 1240 + }, + { + "epoch": 0.6709316994052983, + "grad_norm": 0.4752182066440582, + "learning_rate": 9.96444899020853e-06, + "loss": 0.4689, + "step": 1241 + }, + { + "epoch": 0.6714723373580825, + "grad_norm": 0.40431809425354004, + "learning_rate": 9.964223896931809e-06, + "loss": 0.482, + "step": 1242 + }, + { + "epoch": 0.6720129753108668, + "grad_norm": 0.4463716149330139, + "learning_rate": 9.963998095865047e-06, + "loss": 0.4692, + "step": 1243 + }, + { + "epoch": 0.6725536132636512, + "grad_norm": 0.4271736443042755, + "learning_rate": 9.963771587040435e-06, + "loss": 0.4754, + "step": 1244 + }, + { + "epoch": 0.6730942512164354, + "grad_norm": 0.36027437448501587, + "learning_rate": 9.96354437049027e-06, + "loss": 0.4458, + "step": 1245 + }, + { + "epoch": 0.6736348891692197, + "grad_norm": 0.43132495880126953, + "learning_rate": 9.963316446246949e-06, + "loss": 0.4931, + "step": 1246 + }, + { + "epoch": 0.674175527122004, + "grad_norm": 0.39921796321868896, + "learning_rate": 9.963087814342968e-06, + "loss": 0.4604, + "step": 1247 + }, + { + "epoch": 0.6747161650747883, + "grad_norm": 0.4080028235912323, + "learning_rate": 9.962858474810926e-06, + "loss": 0.4781, + "step": 1248 + }, + { + "epoch": 0.6752568030275725, + "grad_norm": 0.48804527521133423, + "learning_rate": 9.96262842768352e-06, + "loss": 0.4637, + "step": 1249 + }, + { + "epoch": 0.6757974409803568, + "grad_norm": 0.37016865611076355, + "learning_rate": 9.962397672993552e-06, + "loss": 0.4768, + "step": 1250 + }, + { + "epoch": 0.6763380789331411, + "grad_norm": 0.42169275879859924, + "learning_rate": 9.962166210773918e-06, + "loss": 0.489, + "step": 1251 + }, + { + "epoch": 0.6768787168859254, + "grad_norm": 0.4805634915828705, + "learning_rate": 9.961934041057627e-06, + "loss": 0.5032, + "step": 1252 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.37036967277526855, + "learning_rate": 9.961701163877773e-06, + "loss": 0.4523, + "step": 1253 + }, + { + "epoch": 0.677959992791494, + "grad_norm": 0.37964698672294617, + "learning_rate": 9.961467579267565e-06, + "loss": 0.4996, + "step": 1254 + }, + { + "epoch": 0.6785006307442782, + "grad_norm": 0.41072869300842285, + "learning_rate": 9.961233287260305e-06, + "loss": 0.461, + "step": 1255 + }, + { + "epoch": 0.6790412686970625, + "grad_norm": 0.3730875253677368, + "learning_rate": 9.960998287889397e-06, + "loss": 0.4772, + "step": 1256 + }, + { + "epoch": 0.6795819066498469, + "grad_norm": 0.4048408567905426, + "learning_rate": 9.96076258118835e-06, + "loss": 0.4691, + "step": 1257 + }, + { + "epoch": 0.6801225446026311, + "grad_norm": 0.4101283848285675, + "learning_rate": 9.960526167190767e-06, + "loss": 0.4616, + "step": 1258 + }, + { + "epoch": 0.6806631825554154, + "grad_norm": 0.3689689636230469, + "learning_rate": 9.960289045930358e-06, + "loss": 0.4529, + "step": 1259 + }, + { + "epoch": 0.6812038205081997, + "grad_norm": 0.4019097089767456, + "learning_rate": 9.96005121744093e-06, + "loss": 0.4804, + "step": 1260 + }, + { + "epoch": 0.681744458460984, + "grad_norm": 0.4565180242061615, + "learning_rate": 9.959812681756394e-06, + "loss": 0.4852, + "step": 1261 + }, + { + "epoch": 0.6822850964137682, + "grad_norm": 0.4157910645008087, + "learning_rate": 9.959573438910757e-06, + "loss": 0.48, + "step": 1262 + }, + { + "epoch": 0.6828257343665526, + "grad_norm": 0.4228622317314148, + "learning_rate": 9.95933348893813e-06, + "loss": 0.4938, + "step": 1263 + }, + { + "epoch": 0.6833663723193368, + "grad_norm": 0.41255107522010803, + "learning_rate": 9.959092831872729e-06, + "loss": 0.4863, + "step": 1264 + }, + { + "epoch": 0.6839070102721211, + "grad_norm": 0.3899734318256378, + "learning_rate": 9.958851467748863e-06, + "loss": 0.457, + "step": 1265 + }, + { + "epoch": 0.6844476482249053, + "grad_norm": 0.4378751814365387, + "learning_rate": 9.958609396600944e-06, + "loss": 0.467, + "step": 1266 + }, + { + "epoch": 0.6849882861776897, + "grad_norm": 0.4468434154987335, + "learning_rate": 9.958366618463488e-06, + "loss": 0.4727, + "step": 1267 + }, + { + "epoch": 0.685528924130474, + "grad_norm": 0.45046213269233704, + "learning_rate": 9.958123133371111e-06, + "loss": 0.4862, + "step": 1268 + }, + { + "epoch": 0.6860695620832582, + "grad_norm": 0.41851988434791565, + "learning_rate": 9.957878941358526e-06, + "loss": 0.4733, + "step": 1269 + }, + { + "epoch": 0.6866102000360426, + "grad_norm": 0.43167203664779663, + "learning_rate": 9.957634042460551e-06, + "loss": 0.4898, + "step": 1270 + }, + { + "epoch": 0.6871508379888268, + "grad_norm": 0.4302287995815277, + "learning_rate": 9.957388436712103e-06, + "loss": 0.5006, + "step": 1271 + }, + { + "epoch": 0.6876914759416111, + "grad_norm": 0.43457236886024475, + "learning_rate": 9.957142124148201e-06, + "loss": 0.4946, + "step": 1272 + }, + { + "epoch": 0.6882321138943954, + "grad_norm": 0.3867625594139099, + "learning_rate": 9.95689510480396e-06, + "loss": 0.4711, + "step": 1273 + }, + { + "epoch": 0.6887727518471797, + "grad_norm": 0.38470458984375, + "learning_rate": 9.956647378714606e-06, + "loss": 0.4362, + "step": 1274 + }, + { + "epoch": 0.6893133897999639, + "grad_norm": 0.489411324262619, + "learning_rate": 9.956398945915455e-06, + "loss": 0.4919, + "step": 1275 + }, + { + "epoch": 0.6898540277527483, + "grad_norm": 0.3836153447628021, + "learning_rate": 9.956149806441927e-06, + "loss": 0.4708, + "step": 1276 + }, + { + "epoch": 0.6903946657055325, + "grad_norm": 0.46178779006004333, + "learning_rate": 9.955899960329546e-06, + "loss": 0.4884, + "step": 1277 + }, + { + "epoch": 0.6909353036583168, + "grad_norm": 0.4728642702102661, + "learning_rate": 9.955649407613936e-06, + "loss": 0.4491, + "step": 1278 + }, + { + "epoch": 0.6914759416111012, + "grad_norm": 0.4373341202735901, + "learning_rate": 9.955398148330816e-06, + "loss": 0.4728, + "step": 1279 + }, + { + "epoch": 0.6920165795638854, + "grad_norm": 0.44000309705734253, + "learning_rate": 9.955146182516015e-06, + "loss": 0.4747, + "step": 1280 + }, + { + "epoch": 0.6925572175166697, + "grad_norm": 0.4856782853603363, + "learning_rate": 9.954893510205455e-06, + "loss": 0.4607, + "step": 1281 + }, + { + "epoch": 0.6930978554694539, + "grad_norm": 0.4034997522830963, + "learning_rate": 9.954640131435162e-06, + "loss": 0.4984, + "step": 1282 + }, + { + "epoch": 0.6936384934222383, + "grad_norm": 0.4275628924369812, + "learning_rate": 9.954386046241262e-06, + "loss": 0.4602, + "step": 1283 + }, + { + "epoch": 0.6941791313750225, + "grad_norm": 0.44787707924842834, + "learning_rate": 9.954131254659983e-06, + "loss": 0.4518, + "step": 1284 + }, + { + "epoch": 0.6947197693278068, + "grad_norm": 0.4457874894142151, + "learning_rate": 9.95387575672765e-06, + "loss": 0.4858, + "step": 1285 + }, + { + "epoch": 0.6952604072805911, + "grad_norm": 0.4127665162086487, + "learning_rate": 9.953619552480697e-06, + "loss": 0.483, + "step": 1286 + }, + { + "epoch": 0.6958010452333754, + "grad_norm": 0.4638175964355469, + "learning_rate": 9.953362641955649e-06, + "loss": 0.4869, + "step": 1287 + }, + { + "epoch": 0.6963416831861596, + "grad_norm": 0.4228275418281555, + "learning_rate": 9.953105025189134e-06, + "loss": 0.486, + "step": 1288 + }, + { + "epoch": 0.696882321138944, + "grad_norm": 0.41768065094947815, + "learning_rate": 9.952846702217886e-06, + "loss": 0.4561, + "step": 1289 + }, + { + "epoch": 0.6974229590917282, + "grad_norm": 0.4437529146671295, + "learning_rate": 9.952587673078738e-06, + "loss": 0.4815, + "step": 1290 + }, + { + "epoch": 0.6979635970445125, + "grad_norm": 0.42724665999412537, + "learning_rate": 9.952327937808616e-06, + "loss": 0.4735, + "step": 1291 + }, + { + "epoch": 0.6985042349972969, + "grad_norm": 0.43202629685401917, + "learning_rate": 9.952067496444557e-06, + "loss": 0.5053, + "step": 1292 + }, + { + "epoch": 0.6990448729500811, + "grad_norm": 0.3635312020778656, + "learning_rate": 9.951806349023693e-06, + "loss": 0.4426, + "step": 1293 + }, + { + "epoch": 0.6995855109028654, + "grad_norm": 0.3907013535499573, + "learning_rate": 9.951544495583258e-06, + "loss": 0.471, + "step": 1294 + }, + { + "epoch": 0.7001261488556497, + "grad_norm": 0.4162873327732086, + "learning_rate": 9.951281936160587e-06, + "loss": 0.4902, + "step": 1295 + }, + { + "epoch": 0.700666786808434, + "grad_norm": 0.37522828578948975, + "learning_rate": 9.951018670793114e-06, + "loss": 0.4931, + "step": 1296 + }, + { + "epoch": 0.7012074247612182, + "grad_norm": 0.39658185839653015, + "learning_rate": 9.950754699518374e-06, + "loss": 0.4784, + "step": 1297 + }, + { + "epoch": 0.7017480627140025, + "grad_norm": 0.40695467591285706, + "learning_rate": 9.95049002237401e-06, + "loss": 0.4808, + "step": 1298 + }, + { + "epoch": 0.7022887006667868, + "grad_norm": 0.43931230902671814, + "learning_rate": 9.950224639397749e-06, + "loss": 0.4709, + "step": 1299 + }, + { + "epoch": 0.7028293386195711, + "grad_norm": 0.44696277379989624, + "learning_rate": 9.949958550627436e-06, + "loss": 0.5105, + "step": 1300 + }, + { + "epoch": 0.7033699765723553, + "grad_norm": 0.4636048972606659, + "learning_rate": 9.94969175610101e-06, + "loss": 0.477, + "step": 1301 + }, + { + "epoch": 0.7039106145251397, + "grad_norm": 0.4922928810119629, + "learning_rate": 9.949424255856506e-06, + "loss": 0.4618, + "step": 1302 + }, + { + "epoch": 0.704451252477924, + "grad_norm": 0.39951130747795105, + "learning_rate": 9.949156049932065e-06, + "loss": 0.4568, + "step": 1303 + }, + { + "epoch": 0.7049918904307082, + "grad_norm": 0.4968952536582947, + "learning_rate": 9.948887138365929e-06, + "loss": 0.4889, + "step": 1304 + }, + { + "epoch": 0.7055325283834926, + "grad_norm": 0.4298841953277588, + "learning_rate": 9.948617521196438e-06, + "loss": 0.4794, + "step": 1305 + }, + { + "epoch": 0.7060731663362768, + "grad_norm": 0.3872496783733368, + "learning_rate": 9.948347198462031e-06, + "loss": 0.4762, + "step": 1306 + }, + { + "epoch": 0.7066138042890611, + "grad_norm": 0.518255889415741, + "learning_rate": 9.948076170201254e-06, + "loss": 0.5167, + "step": 1307 + }, + { + "epoch": 0.7071544422418454, + "grad_norm": 0.37470534443855286, + "learning_rate": 9.947804436452748e-06, + "loss": 0.4737, + "step": 1308 + }, + { + "epoch": 0.7076950801946297, + "grad_norm": 0.39828360080718994, + "learning_rate": 9.947531997255256e-06, + "loss": 0.4744, + "step": 1309 + }, + { + "epoch": 0.7082357181474139, + "grad_norm": 0.41359883546829224, + "learning_rate": 9.947258852647623e-06, + "loss": 0.5009, + "step": 1310 + }, + { + "epoch": 0.7087763561001982, + "grad_norm": 0.41263335943222046, + "learning_rate": 9.946985002668791e-06, + "loss": 0.4463, + "step": 1311 + }, + { + "epoch": 0.7093169940529825, + "grad_norm": 0.42026588320732117, + "learning_rate": 9.94671044735781e-06, + "loss": 0.4446, + "step": 1312 + }, + { + "epoch": 0.7098576320057668, + "grad_norm": 0.451593816280365, + "learning_rate": 9.94643518675382e-06, + "loss": 0.4787, + "step": 1313 + }, + { + "epoch": 0.710398269958551, + "grad_norm": 0.39087212085723877, + "learning_rate": 9.94615922089607e-06, + "loss": 0.4456, + "step": 1314 + }, + { + "epoch": 0.7109389079113354, + "grad_norm": 0.38521644473075867, + "learning_rate": 9.945882549823906e-06, + "loss": 0.4822, + "step": 1315 + }, + { + "epoch": 0.7114795458641197, + "grad_norm": 0.35561037063598633, + "learning_rate": 9.945605173576775e-06, + "loss": 0.4748, + "step": 1316 + }, + { + "epoch": 0.7120201838169039, + "grad_norm": 0.46212202310562134, + "learning_rate": 9.945327092194225e-06, + "loss": 0.4913, + "step": 1317 + }, + { + "epoch": 0.7125608217696883, + "grad_norm": 0.3639814853668213, + "learning_rate": 9.945048305715907e-06, + "loss": 0.5147, + "step": 1318 + }, + { + "epoch": 0.7131014597224725, + "grad_norm": 0.4061166048049927, + "learning_rate": 9.944768814181566e-06, + "loss": 0.4557, + "step": 1319 + }, + { + "epoch": 0.7136420976752568, + "grad_norm": 0.41976243257522583, + "learning_rate": 9.944488617631053e-06, + "loss": 0.4714, + "step": 1320 + }, + { + "epoch": 0.7141827356280411, + "grad_norm": 0.4113599359989166, + "learning_rate": 9.944207716104318e-06, + "loss": 0.5008, + "step": 1321 + }, + { + "epoch": 0.7147233735808254, + "grad_norm": 0.3758300840854645, + "learning_rate": 9.943926109641411e-06, + "loss": 0.4904, + "step": 1322 + }, + { + "epoch": 0.7152640115336096, + "grad_norm": 0.3485066294670105, + "learning_rate": 9.943643798282483e-06, + "loss": 0.4385, + "step": 1323 + }, + { + "epoch": 0.715804649486394, + "grad_norm": 0.3554293215274811, + "learning_rate": 9.943360782067786e-06, + "loss": 0.447, + "step": 1324 + }, + { + "epoch": 0.7163452874391782, + "grad_norm": 0.40508759021759033, + "learning_rate": 9.943077061037672e-06, + "loss": 0.4715, + "step": 1325 + }, + { + "epoch": 0.7168859253919625, + "grad_norm": 0.37202188372612, + "learning_rate": 9.942792635232591e-06, + "loss": 0.461, + "step": 1326 + }, + { + "epoch": 0.7174265633447467, + "grad_norm": 0.422195166349411, + "learning_rate": 9.9425075046931e-06, + "loss": 0.4944, + "step": 1327 + }, + { + "epoch": 0.7179672012975311, + "grad_norm": 0.38294172286987305, + "learning_rate": 9.94222166945985e-06, + "loss": 0.4745, + "step": 1328 + }, + { + "epoch": 0.7185078392503154, + "grad_norm": 0.4370891749858856, + "learning_rate": 9.941935129573596e-06, + "loss": 0.4918, + "step": 1329 + }, + { + "epoch": 0.7190484772030996, + "grad_norm": 0.41351190209388733, + "learning_rate": 9.94164788507519e-06, + "loss": 0.4768, + "step": 1330 + }, + { + "epoch": 0.719589115155884, + "grad_norm": 0.40625694394111633, + "learning_rate": 9.941359936005588e-06, + "loss": 0.4781, + "step": 1331 + }, + { + "epoch": 0.7201297531086682, + "grad_norm": 0.42381858825683594, + "learning_rate": 9.941071282405848e-06, + "loss": 0.4455, + "step": 1332 + }, + { + "epoch": 0.7206703910614525, + "grad_norm": 0.39242449402809143, + "learning_rate": 9.94078192431712e-06, + "loss": 0.4503, + "step": 1333 + }, + { + "epoch": 0.7212110290142368, + "grad_norm": 0.36869487166404724, + "learning_rate": 9.940491861780666e-06, + "loss": 0.4376, + "step": 1334 + }, + { + "epoch": 0.7217516669670211, + "grad_norm": 0.44459354877471924, + "learning_rate": 9.940201094837838e-06, + "loss": 0.4842, + "step": 1335 + }, + { + "epoch": 0.7222923049198053, + "grad_norm": 0.4525735676288605, + "learning_rate": 9.939909623530098e-06, + "loss": 0.4697, + "step": 1336 + }, + { + "epoch": 0.7228329428725897, + "grad_norm": 0.3791995048522949, + "learning_rate": 9.939617447898998e-06, + "loss": 0.4845, + "step": 1337 + }, + { + "epoch": 0.723373580825374, + "grad_norm": 0.4672689437866211, + "learning_rate": 9.9393245679862e-06, + "loss": 0.4522, + "step": 1338 + }, + { + "epoch": 0.7239142187781582, + "grad_norm": 0.5381559133529663, + "learning_rate": 9.939030983833459e-06, + "loss": 0.4785, + "step": 1339 + }, + { + "epoch": 0.7244548567309426, + "grad_norm": 0.40444672107696533, + "learning_rate": 9.938736695482636e-06, + "loss": 0.4875, + "step": 1340 + }, + { + "epoch": 0.7249954946837268, + "grad_norm": 0.44594690203666687, + "learning_rate": 9.938441702975689e-06, + "loss": 0.4863, + "step": 1341 + }, + { + "epoch": 0.7255361326365111, + "grad_norm": 0.39273694157600403, + "learning_rate": 9.938146006354678e-06, + "loss": 0.4682, + "step": 1342 + }, + { + "epoch": 0.7260767705892953, + "grad_norm": 0.4593668580055237, + "learning_rate": 9.937849605661765e-06, + "loss": 0.502, + "step": 1343 + }, + { + "epoch": 0.7266174085420797, + "grad_norm": 0.38885119557380676, + "learning_rate": 9.937552500939205e-06, + "loss": 0.4496, + "step": 1344 + }, + { + "epoch": 0.7271580464948639, + "grad_norm": 0.43517187237739563, + "learning_rate": 9.937254692229363e-06, + "loss": 0.4791, + "step": 1345 + }, + { + "epoch": 0.7276986844476482, + "grad_norm": 0.4335128664970398, + "learning_rate": 9.9369561795747e-06, + "loss": 0.4662, + "step": 1346 + }, + { + "epoch": 0.7282393224004325, + "grad_norm": 0.387592613697052, + "learning_rate": 9.936656963017774e-06, + "loss": 0.4814, + "step": 1347 + }, + { + "epoch": 0.7287799603532168, + "grad_norm": 0.43556758761405945, + "learning_rate": 9.936357042601252e-06, + "loss": 0.4942, + "step": 1348 + }, + { + "epoch": 0.729320598306001, + "grad_norm": 0.3670481741428375, + "learning_rate": 9.93605641836789e-06, + "loss": 0.4919, + "step": 1349 + }, + { + "epoch": 0.7298612362587854, + "grad_norm": 0.3413862884044647, + "learning_rate": 9.935755090360554e-06, + "loss": 0.4627, + "step": 1350 + }, + { + "epoch": 0.7304018742115697, + "grad_norm": 0.3749856650829315, + "learning_rate": 9.935453058622208e-06, + "loss": 0.4553, + "step": 1351 + }, + { + "epoch": 0.7309425121643539, + "grad_norm": 0.36184626817703247, + "learning_rate": 9.935150323195912e-06, + "loss": 0.4648, + "step": 1352 + }, + { + "epoch": 0.7314831501171383, + "grad_norm": 0.33902183175086975, + "learning_rate": 9.934846884124831e-06, + "loss": 0.4704, + "step": 1353 + }, + { + "epoch": 0.7320237880699225, + "grad_norm": 0.37856411933898926, + "learning_rate": 9.93454274145223e-06, + "loss": 0.4719, + "step": 1354 + }, + { + "epoch": 0.7325644260227068, + "grad_norm": 0.42122843861579895, + "learning_rate": 9.93423789522147e-06, + "loss": 0.4759, + "step": 1355 + }, + { + "epoch": 0.7331050639754911, + "grad_norm": 0.3885326683521271, + "learning_rate": 9.933932345476019e-06, + "loss": 0.4794, + "step": 1356 + }, + { + "epoch": 0.7336457019282754, + "grad_norm": 0.4524173438549042, + "learning_rate": 9.933626092259439e-06, + "loss": 0.4581, + "step": 1357 + }, + { + "epoch": 0.7341863398810596, + "grad_norm": 0.4196130037307739, + "learning_rate": 9.933319135615396e-06, + "loss": 0.4661, + "step": 1358 + }, + { + "epoch": 0.7347269778338439, + "grad_norm": 0.4285523295402527, + "learning_rate": 9.933011475587654e-06, + "loss": 0.4712, + "step": 1359 + }, + { + "epoch": 0.7352676157866282, + "grad_norm": 0.39787888526916504, + "learning_rate": 9.932703112220084e-06, + "loss": 0.4795, + "step": 1360 + }, + { + "epoch": 0.7358082537394125, + "grad_norm": 0.40517759323120117, + "learning_rate": 9.932394045556644e-06, + "loss": 0.5015, + "step": 1361 + }, + { + "epoch": 0.7363488916921967, + "grad_norm": 0.401108980178833, + "learning_rate": 9.932084275641405e-06, + "loss": 0.4867, + "step": 1362 + }, + { + "epoch": 0.7368895296449811, + "grad_norm": 0.4435082972049713, + "learning_rate": 9.931773802518532e-06, + "loss": 0.4919, + "step": 1363 + }, + { + "epoch": 0.7374301675977654, + "grad_norm": 0.35231056809425354, + "learning_rate": 9.931462626232294e-06, + "loss": 0.4706, + "step": 1364 + }, + { + "epoch": 0.7379708055505496, + "grad_norm": 0.3894501328468323, + "learning_rate": 9.931150746827055e-06, + "loss": 0.4825, + "step": 1365 + }, + { + "epoch": 0.738511443503334, + "grad_norm": 0.42613503336906433, + "learning_rate": 9.930838164347282e-06, + "loss": 0.4625, + "step": 1366 + }, + { + "epoch": 0.7390520814561182, + "grad_norm": 0.35955050587654114, + "learning_rate": 9.930524878837544e-06, + "loss": 0.4622, + "step": 1367 + }, + { + "epoch": 0.7395927194089025, + "grad_norm": 0.4098225235939026, + "learning_rate": 9.93021089034251e-06, + "loss": 0.4604, + "step": 1368 + }, + { + "epoch": 0.7401333573616868, + "grad_norm": 0.40953758358955383, + "learning_rate": 9.929896198906945e-06, + "loss": 0.4694, + "step": 1369 + }, + { + "epoch": 0.7406739953144711, + "grad_norm": 0.4058552384376526, + "learning_rate": 9.929580804575718e-06, + "loss": 0.4743, + "step": 1370 + }, + { + "epoch": 0.7412146332672553, + "grad_norm": 0.38848456740379333, + "learning_rate": 9.929264707393799e-06, + "loss": 0.4914, + "step": 1371 + }, + { + "epoch": 0.7417552712200396, + "grad_norm": 0.4356609880924225, + "learning_rate": 9.928947907406255e-06, + "loss": 0.4753, + "step": 1372 + }, + { + "epoch": 0.742295909172824, + "grad_norm": 0.40148693323135376, + "learning_rate": 9.928630404658255e-06, + "loss": 0.4745, + "step": 1373 + }, + { + "epoch": 0.7428365471256082, + "grad_norm": 0.34132710099220276, + "learning_rate": 9.928312199195068e-06, + "loss": 0.4791, + "step": 1374 + }, + { + "epoch": 0.7433771850783925, + "grad_norm": 0.42795494198799133, + "learning_rate": 9.927993291062064e-06, + "loss": 0.4778, + "step": 1375 + }, + { + "epoch": 0.7439178230311768, + "grad_norm": 0.3633674681186676, + "learning_rate": 9.927673680304711e-06, + "loss": 0.4842, + "step": 1376 + }, + { + "epoch": 0.7444584609839611, + "grad_norm": 0.39830613136291504, + "learning_rate": 9.927353366968578e-06, + "loss": 0.4737, + "step": 1377 + }, + { + "epoch": 0.7449990989367453, + "grad_norm": 0.38402634859085083, + "learning_rate": 9.927032351099337e-06, + "loss": 0.5031, + "step": 1378 + }, + { + "epoch": 0.7455397368895297, + "grad_norm": 0.3705873489379883, + "learning_rate": 9.926710632742758e-06, + "loss": 0.4559, + "step": 1379 + }, + { + "epoch": 0.7460803748423139, + "grad_norm": 0.41539448499679565, + "learning_rate": 9.926388211944707e-06, + "loss": 0.4836, + "step": 1380 + }, + { + "epoch": 0.7466210127950982, + "grad_norm": 0.4214273989200592, + "learning_rate": 9.926065088751157e-06, + "loss": 0.479, + "step": 1381 + }, + { + "epoch": 0.7471616507478825, + "grad_norm": 0.3808612823486328, + "learning_rate": 9.92574126320818e-06, + "loss": 0.4864, + "step": 1382 + }, + { + "epoch": 0.7477022887006668, + "grad_norm": 0.37214893102645874, + "learning_rate": 9.925416735361943e-06, + "loss": 0.4502, + "step": 1383 + }, + { + "epoch": 0.748242926653451, + "grad_norm": 0.39607030153274536, + "learning_rate": 9.925091505258719e-06, + "loss": 0.4415, + "step": 1384 + }, + { + "epoch": 0.7487835646062354, + "grad_norm": 0.3850662410259247, + "learning_rate": 9.924765572944879e-06, + "loss": 0.5051, + "step": 1385 + }, + { + "epoch": 0.7493242025590197, + "grad_norm": 0.35740000009536743, + "learning_rate": 9.924438938466891e-06, + "loss": 0.4403, + "step": 1386 + }, + { + "epoch": 0.7498648405118039, + "grad_norm": 0.39912787079811096, + "learning_rate": 9.92411160187133e-06, + "loss": 0.472, + "step": 1387 + }, + { + "epoch": 0.7504054784645882, + "grad_norm": 0.39975598454475403, + "learning_rate": 9.923783563204863e-06, + "loss": 0.4767, + "step": 1388 + }, + { + "epoch": 0.7509461164173725, + "grad_norm": 0.4057595729827881, + "learning_rate": 9.923454822514262e-06, + "loss": 0.4497, + "step": 1389 + }, + { + "epoch": 0.7514867543701568, + "grad_norm": 0.423401802778244, + "learning_rate": 9.9231253798464e-06, + "loss": 0.4804, + "step": 1390 + }, + { + "epoch": 0.752027392322941, + "grad_norm": 0.4875843822956085, + "learning_rate": 9.922795235248248e-06, + "loss": 0.4802, + "step": 1391 + }, + { + "epoch": 0.7525680302757254, + "grad_norm": 0.43842393159866333, + "learning_rate": 9.922464388766878e-06, + "loss": 0.4887, + "step": 1392 + }, + { + "epoch": 0.7531086682285096, + "grad_norm": 0.4424212574958801, + "learning_rate": 9.922132840449459e-06, + "loss": 0.497, + "step": 1393 + }, + { + "epoch": 0.7536493061812939, + "grad_norm": 0.5426601767539978, + "learning_rate": 9.921800590343264e-06, + "loss": 0.5274, + "step": 1394 + }, + { + "epoch": 0.7541899441340782, + "grad_norm": 0.5073828101158142, + "learning_rate": 9.921467638495666e-06, + "loss": 0.4597, + "step": 1395 + }, + { + "epoch": 0.7547305820868625, + "grad_norm": 0.4607311189174652, + "learning_rate": 9.921133984954134e-06, + "loss": 0.4833, + "step": 1396 + }, + { + "epoch": 0.7552712200396468, + "grad_norm": 0.43839946389198303, + "learning_rate": 9.92079962976624e-06, + "loss": 0.4606, + "step": 1397 + }, + { + "epoch": 0.7558118579924311, + "grad_norm": 0.5186724662780762, + "learning_rate": 9.92046457297966e-06, + "loss": 0.5192, + "step": 1398 + }, + { + "epoch": 0.7563524959452154, + "grad_norm": 0.43365365266799927, + "learning_rate": 9.92012881464216e-06, + "loss": 0.5014, + "step": 1399 + }, + { + "epoch": 0.7568931338979996, + "grad_norm": 0.44823896884918213, + "learning_rate": 9.919792354801614e-06, + "loss": 0.4609, + "step": 1400 + }, + { + "epoch": 0.757433771850784, + "grad_norm": 0.44195356965065, + "learning_rate": 9.919455193505996e-06, + "loss": 0.4758, + "step": 1401 + }, + { + "epoch": 0.7579744098035682, + "grad_norm": 0.4172375500202179, + "learning_rate": 9.919117330803374e-06, + "loss": 0.4824, + "step": 1402 + }, + { + "epoch": 0.7585150477563525, + "grad_norm": 0.4180469214916229, + "learning_rate": 9.918778766741924e-06, + "loss": 0.4607, + "step": 1403 + }, + { + "epoch": 0.7590556857091367, + "grad_norm": 0.48548153042793274, + "learning_rate": 9.918439501369914e-06, + "loss": 0.4904, + "step": 1404 + }, + { + "epoch": 0.7595963236619211, + "grad_norm": 0.3765825927257538, + "learning_rate": 9.91809953473572e-06, + "loss": 0.5043, + "step": 1405 + }, + { + "epoch": 0.7601369616147053, + "grad_norm": 0.5639870762825012, + "learning_rate": 9.917758866887808e-06, + "loss": 0.4973, + "step": 1406 + }, + { + "epoch": 0.7606775995674896, + "grad_norm": 0.3645475208759308, + "learning_rate": 9.917417497874756e-06, + "loss": 0.4755, + "step": 1407 + }, + { + "epoch": 0.761218237520274, + "grad_norm": 0.419842928647995, + "learning_rate": 9.917075427745232e-06, + "loss": 0.4727, + "step": 1408 + }, + { + "epoch": 0.7617588754730582, + "grad_norm": 0.3567120432853699, + "learning_rate": 9.916732656548008e-06, + "loss": 0.4612, + "step": 1409 + }, + { + "epoch": 0.7622995134258425, + "grad_norm": 0.4124714136123657, + "learning_rate": 9.916389184331957e-06, + "loss": 0.465, + "step": 1410 + }, + { + "epoch": 0.7628401513786268, + "grad_norm": 0.3979954719543457, + "learning_rate": 9.916045011146052e-06, + "loss": 0.4812, + "step": 1411 + }, + { + "epoch": 0.7633807893314111, + "grad_norm": 0.38222837448120117, + "learning_rate": 9.915700137039359e-06, + "loss": 0.4742, + "step": 1412 + }, + { + "epoch": 0.7639214272841953, + "grad_norm": 0.38148653507232666, + "learning_rate": 9.915354562061056e-06, + "loss": 0.4833, + "step": 1413 + }, + { + "epoch": 0.7644620652369797, + "grad_norm": 0.4023478329181671, + "learning_rate": 9.91500828626041e-06, + "loss": 0.4591, + "step": 1414 + }, + { + "epoch": 0.7650027031897639, + "grad_norm": 0.3840067386627197, + "learning_rate": 9.914661309686796e-06, + "loss": 0.4685, + "step": 1415 + }, + { + "epoch": 0.7655433411425482, + "grad_norm": 0.3622734844684601, + "learning_rate": 9.914313632389682e-06, + "loss": 0.4836, + "step": 1416 + }, + { + "epoch": 0.7660839790953325, + "grad_norm": 0.3705524504184723, + "learning_rate": 9.91396525441864e-06, + "loss": 0.4529, + "step": 1417 + }, + { + "epoch": 0.7666246170481168, + "grad_norm": 0.35576826333999634, + "learning_rate": 9.913616175823343e-06, + "loss": 0.4771, + "step": 1418 + }, + { + "epoch": 0.767165255000901, + "grad_norm": 0.42125403881073, + "learning_rate": 9.91326639665356e-06, + "loss": 0.4655, + "step": 1419 + }, + { + "epoch": 0.7677058929536853, + "grad_norm": 0.3521760404109955, + "learning_rate": 9.912915916959162e-06, + "loss": 0.4439, + "step": 1420 + }, + { + "epoch": 0.7682465309064697, + "grad_norm": 0.4294596314430237, + "learning_rate": 9.91256473679012e-06, + "loss": 0.5037, + "step": 1421 + }, + { + "epoch": 0.7687871688592539, + "grad_norm": 0.3854302763938904, + "learning_rate": 9.912212856196506e-06, + "loss": 0.4947, + "step": 1422 + }, + { + "epoch": 0.7693278068120382, + "grad_norm": 0.4472251534461975, + "learning_rate": 9.911860275228489e-06, + "loss": 0.4737, + "step": 1423 + }, + { + "epoch": 0.7698684447648225, + "grad_norm": 0.4280487298965454, + "learning_rate": 9.91150699393634e-06, + "loss": 0.4991, + "step": 1424 + }, + { + "epoch": 0.7704090827176068, + "grad_norm": 0.4406580328941345, + "learning_rate": 9.911153012370427e-06, + "loss": 0.4579, + "step": 1425 + }, + { + "epoch": 0.770949720670391, + "grad_norm": 0.42495012283325195, + "learning_rate": 9.910798330581224e-06, + "loss": 0.466, + "step": 1426 + }, + { + "epoch": 0.7714903586231754, + "grad_norm": 0.40381088852882385, + "learning_rate": 9.910442948619298e-06, + "loss": 0.4617, + "step": 1427 + }, + { + "epoch": 0.7720309965759596, + "grad_norm": 0.46315282583236694, + "learning_rate": 9.91008686653532e-06, + "loss": 0.4639, + "step": 1428 + }, + { + "epoch": 0.7725716345287439, + "grad_norm": 0.3891032040119171, + "learning_rate": 9.90973008438006e-06, + "loss": 0.4897, + "step": 1429 + }, + { + "epoch": 0.7731122724815283, + "grad_norm": 0.3929683268070221, + "learning_rate": 9.909372602204385e-06, + "loss": 0.4713, + "step": 1430 + }, + { + "epoch": 0.7736529104343125, + "grad_norm": 0.40358665585517883, + "learning_rate": 9.909014420059266e-06, + "loss": 0.4846, + "step": 1431 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.43662330508232117, + "learning_rate": 9.908655537995772e-06, + "loss": 0.4898, + "step": 1432 + }, + { + "epoch": 0.7747341863398811, + "grad_norm": 0.3911401927471161, + "learning_rate": 9.90829595606507e-06, + "loss": 0.4864, + "step": 1433 + }, + { + "epoch": 0.7752748242926654, + "grad_norm": 0.3795257806777954, + "learning_rate": 9.907935674318431e-06, + "loss": 0.509, + "step": 1434 + }, + { + "epoch": 0.7758154622454496, + "grad_norm": 0.41082119941711426, + "learning_rate": 9.907574692807223e-06, + "loss": 0.5005, + "step": 1435 + }, + { + "epoch": 0.7763561001982339, + "grad_norm": 0.3686942458152771, + "learning_rate": 9.907213011582912e-06, + "loss": 0.474, + "step": 1436 + }, + { + "epoch": 0.7768967381510182, + "grad_norm": 0.3893868327140808, + "learning_rate": 9.906850630697068e-06, + "loss": 0.4844, + "step": 1437 + }, + { + "epoch": 0.7774373761038025, + "grad_norm": 0.4082487225532532, + "learning_rate": 9.906487550201357e-06, + "loss": 0.4775, + "step": 1438 + }, + { + "epoch": 0.7779780140565867, + "grad_norm": 0.3809872567653656, + "learning_rate": 9.906123770147548e-06, + "loss": 0.479, + "step": 1439 + }, + { + "epoch": 0.7785186520093711, + "grad_norm": 0.38394907116889954, + "learning_rate": 9.905759290587506e-06, + "loss": 0.4658, + "step": 1440 + }, + { + "epoch": 0.7790592899621553, + "grad_norm": 0.35896220803260803, + "learning_rate": 9.905394111573201e-06, + "loss": 0.4584, + "step": 1441 + }, + { + "epoch": 0.7795999279149396, + "grad_norm": 0.41028234362602234, + "learning_rate": 9.905028233156695e-06, + "loss": 0.4857, + "step": 1442 + }, + { + "epoch": 0.780140565867724, + "grad_norm": 0.3853236138820648, + "learning_rate": 9.90466165539016e-06, + "loss": 0.5003, + "step": 1443 + }, + { + "epoch": 0.7806812038205082, + "grad_norm": 0.40787339210510254, + "learning_rate": 9.904294378325856e-06, + "loss": 0.4851, + "step": 1444 + }, + { + "epoch": 0.7812218417732925, + "grad_norm": 0.38476669788360596, + "learning_rate": 9.903926402016153e-06, + "loss": 0.4824, + "step": 1445 + }, + { + "epoch": 0.7817624797260768, + "grad_norm": 0.40921327471733093, + "learning_rate": 9.903557726513515e-06, + "loss": 0.4718, + "step": 1446 + }, + { + "epoch": 0.7823031176788611, + "grad_norm": 0.37435829639434814, + "learning_rate": 9.903188351870508e-06, + "loss": 0.4491, + "step": 1447 + }, + { + "epoch": 0.7828437556316453, + "grad_norm": 0.41100597381591797, + "learning_rate": 9.902818278139794e-06, + "loss": 0.4665, + "step": 1448 + }, + { + "epoch": 0.7833843935844296, + "grad_norm": 0.36656615138053894, + "learning_rate": 9.90244750537414e-06, + "loss": 0.4828, + "step": 1449 + }, + { + "epoch": 0.7839250315372139, + "grad_norm": 0.37309545278549194, + "learning_rate": 9.902076033626409e-06, + "loss": 0.4722, + "step": 1450 + }, + { + "epoch": 0.7844656694899982, + "grad_norm": 0.38068512082099915, + "learning_rate": 9.901703862949566e-06, + "loss": 0.4807, + "step": 1451 + }, + { + "epoch": 0.7850063074427824, + "grad_norm": 0.35660654306411743, + "learning_rate": 9.901330993396671e-06, + "loss": 0.4819, + "step": 1452 + }, + { + "epoch": 0.7855469453955668, + "grad_norm": 0.3687169551849365, + "learning_rate": 9.900957425020894e-06, + "loss": 0.4913, + "step": 1453 + }, + { + "epoch": 0.786087583348351, + "grad_norm": 0.3449983596801758, + "learning_rate": 9.90058315787549e-06, + "loss": 0.4555, + "step": 1454 + }, + { + "epoch": 0.7866282213011353, + "grad_norm": 0.4075060188770294, + "learning_rate": 9.900208192013825e-06, + "loss": 0.4736, + "step": 1455 + }, + { + "epoch": 0.7871688592539197, + "grad_norm": 0.3767589330673218, + "learning_rate": 9.899832527489362e-06, + "loss": 0.4451, + "step": 1456 + }, + { + "epoch": 0.7877094972067039, + "grad_norm": 0.4201042354106903, + "learning_rate": 9.899456164355661e-06, + "loss": 0.4325, + "step": 1457 + }, + { + "epoch": 0.7882501351594882, + "grad_norm": 0.3488657772541046, + "learning_rate": 9.899079102666382e-06, + "loss": 0.4328, + "step": 1458 + }, + { + "epoch": 0.7887907731122725, + "grad_norm": 0.43522709608078003, + "learning_rate": 9.898701342475287e-06, + "loss": 0.4739, + "step": 1459 + }, + { + "epoch": 0.7893314110650568, + "grad_norm": 0.4166751205921173, + "learning_rate": 9.898322883836239e-06, + "loss": 0.4805, + "step": 1460 + }, + { + "epoch": 0.789872049017841, + "grad_norm": 0.40781643986701965, + "learning_rate": 9.897943726803195e-06, + "loss": 0.4888, + "step": 1461 + }, + { + "epoch": 0.7904126869706254, + "grad_norm": 0.41623154282569885, + "learning_rate": 9.897563871430212e-06, + "loss": 0.4716, + "step": 1462 + }, + { + "epoch": 0.7909533249234096, + "grad_norm": 0.3918064832687378, + "learning_rate": 9.897183317771455e-06, + "loss": 0.4712, + "step": 1463 + }, + { + "epoch": 0.7914939628761939, + "grad_norm": 0.3671140968799591, + "learning_rate": 9.896802065881178e-06, + "loss": 0.461, + "step": 1464 + }, + { + "epoch": 0.7920346008289781, + "grad_norm": 0.3947750926017761, + "learning_rate": 9.896420115813741e-06, + "loss": 0.4964, + "step": 1465 + }, + { + "epoch": 0.7925752387817625, + "grad_norm": 0.4626021683216095, + "learning_rate": 9.896037467623603e-06, + "loss": 0.52, + "step": 1466 + }, + { + "epoch": 0.7931158767345468, + "grad_norm": 0.3657527267932892, + "learning_rate": 9.895654121365318e-06, + "loss": 0.492, + "step": 1467 + }, + { + "epoch": 0.793656514687331, + "grad_norm": 0.4211752414703369, + "learning_rate": 9.895270077093547e-06, + "loss": 0.4833, + "step": 1468 + }, + { + "epoch": 0.7941971526401154, + "grad_norm": 0.4107241630554199, + "learning_rate": 9.894885334863044e-06, + "loss": 0.4552, + "step": 1469 + }, + { + "epoch": 0.7947377905928996, + "grad_norm": 0.35872262716293335, + "learning_rate": 9.894499894728665e-06, + "loss": 0.4925, + "step": 1470 + }, + { + "epoch": 0.7952784285456839, + "grad_norm": 0.36545562744140625, + "learning_rate": 9.894113756745362e-06, + "loss": 0.4484, + "step": 1471 + }, + { + "epoch": 0.7958190664984682, + "grad_norm": 0.41451945900917053, + "learning_rate": 9.893726920968196e-06, + "loss": 0.4796, + "step": 1472 + }, + { + "epoch": 0.7963597044512525, + "grad_norm": 0.4425036907196045, + "learning_rate": 9.893339387452319e-06, + "loss": 0.504, + "step": 1473 + }, + { + "epoch": 0.7969003424040367, + "grad_norm": 0.397030770778656, + "learning_rate": 9.892951156252982e-06, + "loss": 0.4888, + "step": 1474 + }, + { + "epoch": 0.7974409803568211, + "grad_norm": 0.37259095907211304, + "learning_rate": 9.892562227425541e-06, + "loss": 0.4654, + "step": 1475 + }, + { + "epoch": 0.7979816183096053, + "grad_norm": 0.4276980757713318, + "learning_rate": 9.89217260102545e-06, + "loss": 0.4685, + "step": 1476 + }, + { + "epoch": 0.7985222562623896, + "grad_norm": 0.35599982738494873, + "learning_rate": 9.89178227710826e-06, + "loss": 0.4595, + "step": 1477 + }, + { + "epoch": 0.799062894215174, + "grad_norm": 0.3910425901412964, + "learning_rate": 9.891391255729621e-06, + "loss": 0.4874, + "step": 1478 + }, + { + "epoch": 0.7996035321679582, + "grad_norm": 0.42644253373146057, + "learning_rate": 9.890999536945284e-06, + "loss": 0.4894, + "step": 1479 + }, + { + "epoch": 0.8001441701207425, + "grad_norm": 0.4408269226551056, + "learning_rate": 9.890607120811104e-06, + "loss": 0.4637, + "step": 1480 + }, + { + "epoch": 0.8006848080735267, + "grad_norm": 0.39104950428009033, + "learning_rate": 9.890214007383026e-06, + "loss": 0.4785, + "step": 1481 + }, + { + "epoch": 0.8012254460263111, + "grad_norm": 0.3987670838832855, + "learning_rate": 9.889820196717103e-06, + "loss": 0.473, + "step": 1482 + }, + { + "epoch": 0.8017660839790953, + "grad_norm": 0.4160068929195404, + "learning_rate": 9.88942568886948e-06, + "loss": 0.4616, + "step": 1483 + }, + { + "epoch": 0.8023067219318796, + "grad_norm": 0.4000867009162903, + "learning_rate": 9.88903048389641e-06, + "loss": 0.5004, + "step": 1484 + }, + { + "epoch": 0.8028473598846639, + "grad_norm": 0.36520716547966003, + "learning_rate": 9.888634581854235e-06, + "loss": 0.4564, + "step": 1485 + }, + { + "epoch": 0.8033879978374482, + "grad_norm": 0.3652910590171814, + "learning_rate": 9.888237982799407e-06, + "loss": 0.4861, + "step": 1486 + }, + { + "epoch": 0.8039286357902324, + "grad_norm": 0.3691157400608063, + "learning_rate": 9.88784068678847e-06, + "loss": 0.4612, + "step": 1487 + }, + { + "epoch": 0.8044692737430168, + "grad_norm": 0.40664616227149963, + "learning_rate": 9.887442693878068e-06, + "loss": 0.4854, + "step": 1488 + }, + { + "epoch": 0.805009911695801, + "grad_norm": 0.3441387414932251, + "learning_rate": 9.887044004124951e-06, + "loss": 0.4863, + "step": 1489 + }, + { + "epoch": 0.8055505496485853, + "grad_norm": 0.3915308117866516, + "learning_rate": 9.88664461758596e-06, + "loss": 0.4784, + "step": 1490 + }, + { + "epoch": 0.8060911876013697, + "grad_norm": 0.38703295588493347, + "learning_rate": 9.886244534318038e-06, + "loss": 0.4666, + "step": 1491 + }, + { + "epoch": 0.8066318255541539, + "grad_norm": 0.3933779299259186, + "learning_rate": 9.885843754378233e-06, + "loss": 0.4811, + "step": 1492 + }, + { + "epoch": 0.8071724635069382, + "grad_norm": 0.3390825390815735, + "learning_rate": 9.88544227782368e-06, + "loss": 0.4719, + "step": 1493 + }, + { + "epoch": 0.8077131014597225, + "grad_norm": 0.38970884680747986, + "learning_rate": 9.885040104711628e-06, + "loss": 0.4446, + "step": 1494 + }, + { + "epoch": 0.8082537394125068, + "grad_norm": 0.3798392713069916, + "learning_rate": 9.884637235099414e-06, + "loss": 0.4508, + "step": 1495 + }, + { + "epoch": 0.808794377365291, + "grad_norm": 0.3552364706993103, + "learning_rate": 9.884233669044479e-06, + "loss": 0.4485, + "step": 1496 + }, + { + "epoch": 0.8093350153180753, + "grad_norm": 0.3914445638656616, + "learning_rate": 9.883829406604363e-06, + "loss": 0.4576, + "step": 1497 + }, + { + "epoch": 0.8098756532708596, + "grad_norm": 0.35896924138069153, + "learning_rate": 9.883424447836705e-06, + "loss": 0.4664, + "step": 1498 + }, + { + "epoch": 0.8104162912236439, + "grad_norm": 0.3930225372314453, + "learning_rate": 9.883018792799243e-06, + "loss": 0.4897, + "step": 1499 + }, + { + "epoch": 0.8109569291764281, + "grad_norm": 0.3645854890346527, + "learning_rate": 9.882612441549817e-06, + "loss": 0.458, + "step": 1500 + }, + { + "epoch": 0.8114975671292125, + "grad_norm": 0.4014187455177307, + "learning_rate": 9.882205394146362e-06, + "loss": 0.4635, + "step": 1501 + }, + { + "epoch": 0.8120382050819968, + "grad_norm": 0.3631240427494049, + "learning_rate": 9.881797650646911e-06, + "loss": 0.473, + "step": 1502 + }, + { + "epoch": 0.812578843034781, + "grad_norm": 0.37682297825813293, + "learning_rate": 9.881389211109604e-06, + "loss": 0.4612, + "step": 1503 + }, + { + "epoch": 0.8131194809875654, + "grad_norm": 0.4124244153499603, + "learning_rate": 9.880980075592674e-06, + "loss": 0.4795, + "step": 1504 + }, + { + "epoch": 0.8136601189403496, + "grad_norm": 0.3777317702770233, + "learning_rate": 9.880570244154455e-06, + "loss": 0.4741, + "step": 1505 + }, + { + "epoch": 0.8142007568931339, + "grad_norm": 0.3755948841571808, + "learning_rate": 9.880159716853379e-06, + "loss": 0.504, + "step": 1506 + }, + { + "epoch": 0.8147413948459182, + "grad_norm": 0.3681623637676239, + "learning_rate": 9.879748493747978e-06, + "loss": 0.4705, + "step": 1507 + }, + { + "epoch": 0.8152820327987025, + "grad_norm": 0.3670575022697449, + "learning_rate": 9.879336574896885e-06, + "loss": 0.4829, + "step": 1508 + }, + { + "epoch": 0.8158226707514867, + "grad_norm": 0.40238499641418457, + "learning_rate": 9.878923960358831e-06, + "loss": 0.4778, + "step": 1509 + }, + { + "epoch": 0.816363308704271, + "grad_norm": 0.4063103199005127, + "learning_rate": 9.878510650192644e-06, + "loss": 0.4859, + "step": 1510 + }, + { + "epoch": 0.8169039466570553, + "grad_norm": 0.3617895841598511, + "learning_rate": 9.878096644457254e-06, + "loss": 0.4689, + "step": 1511 + }, + { + "epoch": 0.8174445846098396, + "grad_norm": 0.3812806010246277, + "learning_rate": 9.877681943211688e-06, + "loss": 0.4841, + "step": 1512 + }, + { + "epoch": 0.8179852225626238, + "grad_norm": 0.45879244804382324, + "learning_rate": 9.877266546515075e-06, + "loss": 0.4704, + "step": 1513 + }, + { + "epoch": 0.8185258605154082, + "grad_norm": 0.3604988753795624, + "learning_rate": 9.87685045442664e-06, + "loss": 0.4815, + "step": 1514 + }, + { + "epoch": 0.8190664984681925, + "grad_norm": 0.4087460935115814, + "learning_rate": 9.876433667005711e-06, + "loss": 0.4805, + "step": 1515 + }, + { + "epoch": 0.8196071364209767, + "grad_norm": 0.3688940107822418, + "learning_rate": 9.87601618431171e-06, + "loss": 0.4603, + "step": 1516 + }, + { + "epoch": 0.8201477743737611, + "grad_norm": 0.3499005138874054, + "learning_rate": 9.875598006404164e-06, + "loss": 0.49, + "step": 1517 + }, + { + "epoch": 0.8206884123265453, + "grad_norm": 0.38962242007255554, + "learning_rate": 9.875179133342692e-06, + "loss": 0.4882, + "step": 1518 + }, + { + "epoch": 0.8212290502793296, + "grad_norm": 0.38138699531555176, + "learning_rate": 9.87475956518702e-06, + "loss": 0.4711, + "step": 1519 + }, + { + "epoch": 0.8217696882321139, + "grad_norm": 0.37326300144195557, + "learning_rate": 9.874339301996968e-06, + "loss": 0.4894, + "step": 1520 + }, + { + "epoch": 0.8223103261848982, + "grad_norm": 0.3874557912349701, + "learning_rate": 9.873918343832454e-06, + "loss": 0.4858, + "step": 1521 + }, + { + "epoch": 0.8228509641376824, + "grad_norm": 0.371981143951416, + "learning_rate": 9.873496690753502e-06, + "loss": 0.4413, + "step": 1522 + }, + { + "epoch": 0.8233916020904668, + "grad_norm": 0.370301753282547, + "learning_rate": 9.873074342820225e-06, + "loss": 0.4494, + "step": 1523 + }, + { + "epoch": 0.823932240043251, + "grad_norm": 0.3950522243976593, + "learning_rate": 9.872651300092845e-06, + "loss": 0.4624, + "step": 1524 + }, + { + "epoch": 0.8244728779960353, + "grad_norm": 0.4005773365497589, + "learning_rate": 9.87222756263168e-06, + "loss": 0.4758, + "step": 1525 + }, + { + "epoch": 0.8250135159488196, + "grad_norm": 0.326057106256485, + "learning_rate": 9.871803130497139e-06, + "loss": 0.4696, + "step": 1526 + }, + { + "epoch": 0.8255541539016039, + "grad_norm": 0.39111393690109253, + "learning_rate": 9.871378003749744e-06, + "loss": 0.4595, + "step": 1527 + }, + { + "epoch": 0.8260947918543882, + "grad_norm": 0.34531497955322266, + "learning_rate": 9.870952182450104e-06, + "loss": 0.4675, + "step": 1528 + }, + { + "epoch": 0.8266354298071724, + "grad_norm": 0.37008053064346313, + "learning_rate": 9.870525666658933e-06, + "loss": 0.4645, + "step": 1529 + }, + { + "epoch": 0.8271760677599568, + "grad_norm": 0.3724628984928131, + "learning_rate": 9.870098456437045e-06, + "loss": 0.4595, + "step": 1530 + }, + { + "epoch": 0.827716705712741, + "grad_norm": 0.3725714385509491, + "learning_rate": 9.869670551845347e-06, + "loss": 0.4588, + "step": 1531 + }, + { + "epoch": 0.8282573436655253, + "grad_norm": 0.4514995515346527, + "learning_rate": 9.869241952944852e-06, + "loss": 0.4459, + "step": 1532 + }, + { + "epoch": 0.8287979816183096, + "grad_norm": 0.42052534222602844, + "learning_rate": 9.868812659796669e-06, + "loss": 0.4677, + "step": 1533 + }, + { + "epoch": 0.8293386195710939, + "grad_norm": 0.40265753865242004, + "learning_rate": 9.868382672462002e-06, + "loss": 0.4826, + "step": 1534 + }, + { + "epoch": 0.8298792575238781, + "grad_norm": 0.3773770034313202, + "learning_rate": 9.867951991002162e-06, + "loss": 0.4521, + "step": 1535 + }, + { + "epoch": 0.8304198954766625, + "grad_norm": 0.40366092324256897, + "learning_rate": 9.867520615478554e-06, + "loss": 0.4692, + "step": 1536 + }, + { + "epoch": 0.8309605334294468, + "grad_norm": 0.3797312080860138, + "learning_rate": 9.867088545952682e-06, + "loss": 0.4673, + "step": 1537 + }, + { + "epoch": 0.831501171382231, + "grad_norm": 0.41201868653297424, + "learning_rate": 9.866655782486147e-06, + "loss": 0.4709, + "step": 1538 + }, + { + "epoch": 0.8320418093350154, + "grad_norm": 0.35594871640205383, + "learning_rate": 9.866222325140657e-06, + "loss": 0.4818, + "step": 1539 + }, + { + "epoch": 0.8325824472877996, + "grad_norm": 0.41564980149269104, + "learning_rate": 9.865788173978011e-06, + "loss": 0.4866, + "step": 1540 + }, + { + "epoch": 0.8331230852405839, + "grad_norm": 0.37932664155960083, + "learning_rate": 9.865353329060108e-06, + "loss": 0.4673, + "step": 1541 + }, + { + "epoch": 0.8336637231933681, + "grad_norm": 0.37759143114089966, + "learning_rate": 9.86491779044895e-06, + "loss": 0.4855, + "step": 1542 + }, + { + "epoch": 0.8342043611461525, + "grad_norm": 0.3636254668235779, + "learning_rate": 9.864481558206633e-06, + "loss": 0.4706, + "step": 1543 + }, + { + "epoch": 0.8347449990989367, + "grad_norm": 0.35987749695777893, + "learning_rate": 9.864044632395357e-06, + "loss": 0.4674, + "step": 1544 + }, + { + "epoch": 0.835285637051721, + "grad_norm": 0.39580675959587097, + "learning_rate": 9.863607013077414e-06, + "loss": 0.4847, + "step": 1545 + }, + { + "epoch": 0.8358262750045053, + "grad_norm": 0.38225269317626953, + "learning_rate": 9.863168700315204e-06, + "loss": 0.4625, + "step": 1546 + }, + { + "epoch": 0.8363669129572896, + "grad_norm": 0.3854118883609772, + "learning_rate": 9.862729694171216e-06, + "loss": 0.456, + "step": 1547 + }, + { + "epoch": 0.8369075509100738, + "grad_norm": 0.4238852262496948, + "learning_rate": 9.862289994708044e-06, + "loss": 0.4848, + "step": 1548 + }, + { + "epoch": 0.8374481888628582, + "grad_norm": 0.40586042404174805, + "learning_rate": 9.861849601988384e-06, + "loss": 0.4621, + "step": 1549 + }, + { + "epoch": 0.8379888268156425, + "grad_norm": 0.43681904673576355, + "learning_rate": 9.86140851607502e-06, + "loss": 0.494, + "step": 1550 + }, + { + "epoch": 0.8385294647684267, + "grad_norm": 0.38671132922172546, + "learning_rate": 9.860966737030846e-06, + "loss": 0.4984, + "step": 1551 + }, + { + "epoch": 0.8390701027212111, + "grad_norm": 0.40254631638526917, + "learning_rate": 9.860524264918847e-06, + "loss": 0.4929, + "step": 1552 + }, + { + "epoch": 0.8396107406739953, + "grad_norm": 0.38910743594169617, + "learning_rate": 9.860081099802111e-06, + "loss": 0.4692, + "step": 1553 + }, + { + "epoch": 0.8401513786267796, + "grad_norm": 0.35398274660110474, + "learning_rate": 9.859637241743824e-06, + "loss": 0.4821, + "step": 1554 + }, + { + "epoch": 0.8406920165795639, + "grad_norm": 0.4559416174888611, + "learning_rate": 9.85919269080727e-06, + "loss": 0.4785, + "step": 1555 + }, + { + "epoch": 0.8412326545323482, + "grad_norm": 0.38288402557373047, + "learning_rate": 9.858747447055832e-06, + "loss": 0.4379, + "step": 1556 + }, + { + "epoch": 0.8417732924851324, + "grad_norm": 0.3954237699508667, + "learning_rate": 9.858301510552993e-06, + "loss": 0.4988, + "step": 1557 + }, + { + "epoch": 0.8423139304379167, + "grad_norm": 0.37373679876327515, + "learning_rate": 9.857854881362334e-06, + "loss": 0.4434, + "step": 1558 + }, + { + "epoch": 0.842854568390701, + "grad_norm": 0.3693135678768158, + "learning_rate": 9.857407559547531e-06, + "loss": 0.4698, + "step": 1559 + }, + { + "epoch": 0.8433952063434853, + "grad_norm": 0.3623071014881134, + "learning_rate": 9.856959545172369e-06, + "loss": 0.4677, + "step": 1560 + }, + { + "epoch": 0.8439358442962696, + "grad_norm": 0.35580453276634216, + "learning_rate": 9.856510838300719e-06, + "loss": 0.4777, + "step": 1561 + }, + { + "epoch": 0.8444764822490539, + "grad_norm": 0.35490602254867554, + "learning_rate": 9.85606143899656e-06, + "loss": 0.4522, + "step": 1562 + }, + { + "epoch": 0.8450171202018382, + "grad_norm": 0.35406145453453064, + "learning_rate": 9.855611347323965e-06, + "loss": 0.4664, + "step": 1563 + }, + { + "epoch": 0.8455577581546224, + "grad_norm": 0.3772537410259247, + "learning_rate": 9.855160563347108e-06, + "loss": 0.4514, + "step": 1564 + }, + { + "epoch": 0.8460983961074068, + "grad_norm": 0.40558236837387085, + "learning_rate": 9.854709087130261e-06, + "loss": 0.4579, + "step": 1565 + }, + { + "epoch": 0.846639034060191, + "grad_norm": 0.4021514654159546, + "learning_rate": 9.854256918737794e-06, + "loss": 0.4742, + "step": 1566 + }, + { + "epoch": 0.8471796720129753, + "grad_norm": 0.3992575705051422, + "learning_rate": 9.853804058234177e-06, + "loss": 0.4879, + "step": 1567 + }, + { + "epoch": 0.8477203099657596, + "grad_norm": 0.37950408458709717, + "learning_rate": 9.853350505683978e-06, + "loss": 0.4649, + "step": 1568 + }, + { + "epoch": 0.8482609479185439, + "grad_norm": 0.37125805020332336, + "learning_rate": 9.852896261151865e-06, + "loss": 0.4818, + "step": 1569 + }, + { + "epoch": 0.8488015858713281, + "grad_norm": 0.366233229637146, + "learning_rate": 9.852441324702599e-06, + "loss": 0.4941, + "step": 1570 + }, + { + "epoch": 0.8493422238241125, + "grad_norm": 0.34432452917099, + "learning_rate": 9.85198569640105e-06, + "loss": 0.4836, + "step": 1571 + }, + { + "epoch": 0.8498828617768968, + "grad_norm": 0.38173070549964905, + "learning_rate": 9.851529376312176e-06, + "loss": 0.4613, + "step": 1572 + }, + { + "epoch": 0.850423499729681, + "grad_norm": 0.35302814841270447, + "learning_rate": 9.85107236450104e-06, + "loss": 0.4597, + "step": 1573 + }, + { + "epoch": 0.8509641376824653, + "grad_norm": 0.3876490592956543, + "learning_rate": 9.850614661032803e-06, + "loss": 0.4761, + "step": 1574 + }, + { + "epoch": 0.8515047756352496, + "grad_norm": 0.3852789103984833, + "learning_rate": 9.850156265972722e-06, + "loss": 0.4698, + "step": 1575 + }, + { + "epoch": 0.8520454135880339, + "grad_norm": 0.4346942603588104, + "learning_rate": 9.849697179386152e-06, + "loss": 0.4583, + "step": 1576 + }, + { + "epoch": 0.8525860515408181, + "grad_norm": 0.36047878861427307, + "learning_rate": 9.849237401338554e-06, + "loss": 0.4565, + "step": 1577 + }, + { + "epoch": 0.8531266894936025, + "grad_norm": 0.38957422971725464, + "learning_rate": 9.848776931895478e-06, + "loss": 0.4873, + "step": 1578 + }, + { + "epoch": 0.8536673274463867, + "grad_norm": 0.35288092494010925, + "learning_rate": 9.84831577112258e-06, + "loss": 0.4777, + "step": 1579 + }, + { + "epoch": 0.854207965399171, + "grad_norm": 0.4233868420124054, + "learning_rate": 9.847853919085608e-06, + "loss": 0.481, + "step": 1580 + }, + { + "epoch": 0.8547486033519553, + "grad_norm": 0.38051849603652954, + "learning_rate": 9.847391375850415e-06, + "loss": 0.4453, + "step": 1581 + }, + { + "epoch": 0.8552892413047396, + "grad_norm": 0.35708117485046387, + "learning_rate": 9.84692814148295e-06, + "loss": 0.4701, + "step": 1582 + }, + { + "epoch": 0.8558298792575239, + "grad_norm": 0.38620758056640625, + "learning_rate": 9.846464216049256e-06, + "loss": 0.4564, + "step": 1583 + }, + { + "epoch": 0.8563705172103082, + "grad_norm": 0.39909282326698303, + "learning_rate": 9.845999599615481e-06, + "loss": 0.5078, + "step": 1584 + }, + { + "epoch": 0.8569111551630925, + "grad_norm": 0.341009259223938, + "learning_rate": 9.845534292247872e-06, + "loss": 0.4972, + "step": 1585 + }, + { + "epoch": 0.8574517931158767, + "grad_norm": 0.4746313989162445, + "learning_rate": 9.845068294012767e-06, + "loss": 0.4795, + "step": 1586 + }, + { + "epoch": 0.857992431068661, + "grad_norm": 0.47726917266845703, + "learning_rate": 9.844601604976611e-06, + "loss": 0.4888, + "step": 1587 + }, + { + "epoch": 0.8585330690214453, + "grad_norm": 0.33320072293281555, + "learning_rate": 9.844134225205941e-06, + "loss": 0.4423, + "step": 1588 + }, + { + "epoch": 0.8590737069742296, + "grad_norm": 0.44583311676979065, + "learning_rate": 9.843666154767396e-06, + "loss": 0.4602, + "step": 1589 + }, + { + "epoch": 0.8596143449270138, + "grad_norm": 0.4997808337211609, + "learning_rate": 9.843197393727713e-06, + "loss": 0.4743, + "step": 1590 + }, + { + "epoch": 0.8601549828797982, + "grad_norm": 0.32006052136421204, + "learning_rate": 9.842727942153728e-06, + "loss": 0.4549, + "step": 1591 + }, + { + "epoch": 0.8606956208325824, + "grad_norm": 0.4787975549697876, + "learning_rate": 9.842257800112372e-06, + "loss": 0.4767, + "step": 1592 + }, + { + "epoch": 0.8612362587853667, + "grad_norm": 0.3997071087360382, + "learning_rate": 9.84178696767068e-06, + "loss": 0.473, + "step": 1593 + }, + { + "epoch": 0.861776896738151, + "grad_norm": 0.3409503996372223, + "learning_rate": 9.841315444895778e-06, + "loss": 0.4823, + "step": 1594 + }, + { + "epoch": 0.8623175346909353, + "grad_norm": 0.35580211877822876, + "learning_rate": 9.8408432318549e-06, + "loss": 0.4511, + "step": 1595 + }, + { + "epoch": 0.8628581726437196, + "grad_norm": 0.40508773922920227, + "learning_rate": 9.84037032861537e-06, + "loss": 0.4593, + "step": 1596 + }, + { + "epoch": 0.8633988105965039, + "grad_norm": 0.3585672378540039, + "learning_rate": 9.839896735244615e-06, + "loss": 0.4548, + "step": 1597 + }, + { + "epoch": 0.8639394485492882, + "grad_norm": 0.37689289450645447, + "learning_rate": 9.839422451810159e-06, + "loss": 0.4784, + "step": 1598 + }, + { + "epoch": 0.8644800865020724, + "grad_norm": 0.42086952924728394, + "learning_rate": 9.838947478379623e-06, + "loss": 0.4825, + "step": 1599 + }, + { + "epoch": 0.8650207244548568, + "grad_norm": 0.3835284113883972, + "learning_rate": 9.838471815020731e-06, + "loss": 0.4775, + "step": 1600 + }, + { + "epoch": 0.865561362407641, + "grad_norm": 0.38955605030059814, + "learning_rate": 9.8379954618013e-06, + "loss": 0.4632, + "step": 1601 + }, + { + "epoch": 0.8661020003604253, + "grad_norm": 0.43403568863868713, + "learning_rate": 9.837518418789247e-06, + "loss": 0.4784, + "step": 1602 + }, + { + "epoch": 0.8666426383132095, + "grad_norm": 0.40000101923942566, + "learning_rate": 9.837040686052591e-06, + "loss": 0.4623, + "step": 1603 + }, + { + "epoch": 0.8671832762659939, + "grad_norm": 0.4432052671909332, + "learning_rate": 9.836562263659441e-06, + "loss": 0.4765, + "step": 1604 + }, + { + "epoch": 0.8677239142187781, + "grad_norm": 0.3897626996040344, + "learning_rate": 9.836083151678014e-06, + "loss": 0.5006, + "step": 1605 + }, + { + "epoch": 0.8682645521715624, + "grad_norm": 0.4313247501850128, + "learning_rate": 9.835603350176618e-06, + "loss": 0.4695, + "step": 1606 + }, + { + "epoch": 0.8688051901243468, + "grad_norm": 0.3997137248516083, + "learning_rate": 9.835122859223668e-06, + "loss": 0.465, + "step": 1607 + }, + { + "epoch": 0.869345828077131, + "grad_norm": 0.4124264419078827, + "learning_rate": 9.834641678887664e-06, + "loss": 0.4776, + "step": 1608 + }, + { + "epoch": 0.8698864660299153, + "grad_norm": 0.3896905779838562, + "learning_rate": 9.834159809237217e-06, + "loss": 0.4554, + "step": 1609 + }, + { + "epoch": 0.8704271039826996, + "grad_norm": 0.46274518966674805, + "learning_rate": 9.833677250341027e-06, + "loss": 0.4832, + "step": 1610 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.3725559413433075, + "learning_rate": 9.833194002267901e-06, + "loss": 0.4704, + "step": 1611 + }, + { + "epoch": 0.8715083798882681, + "grad_norm": 0.3861299753189087, + "learning_rate": 9.832710065086736e-06, + "loss": 0.487, + "step": 1612 + }, + { + "epoch": 0.8720490178410525, + "grad_norm": 0.42216816544532776, + "learning_rate": 9.832225438866532e-06, + "loss": 0.5083, + "step": 1613 + }, + { + "epoch": 0.8725896557938367, + "grad_norm": 0.3609369695186615, + "learning_rate": 9.831740123676387e-06, + "loss": 0.4579, + "step": 1614 + }, + { + "epoch": 0.873130293746621, + "grad_norm": 0.410856693983078, + "learning_rate": 9.831254119585497e-06, + "loss": 0.4679, + "step": 1615 + }, + { + "epoch": 0.8736709316994054, + "grad_norm": 0.418223112821579, + "learning_rate": 9.83076742666315e-06, + "loss": 0.4571, + "step": 1616 + }, + { + "epoch": 0.8742115696521896, + "grad_norm": 0.37163031101226807, + "learning_rate": 9.830280044978745e-06, + "loss": 0.4606, + "step": 1617 + }, + { + "epoch": 0.8747522076049739, + "grad_norm": 0.3737943768501282, + "learning_rate": 9.82979197460177e-06, + "loss": 0.463, + "step": 1618 + }, + { + "epoch": 0.8752928455577581, + "grad_norm": 0.37015417218208313, + "learning_rate": 9.82930321560181e-06, + "loss": 0.4651, + "step": 1619 + }, + { + "epoch": 0.8758334835105425, + "grad_norm": 0.3578423857688904, + "learning_rate": 9.828813768048555e-06, + "loss": 0.4543, + "step": 1620 + }, + { + "epoch": 0.8763741214633267, + "grad_norm": 0.35040757060050964, + "learning_rate": 9.828323632011789e-06, + "loss": 0.4631, + "step": 1621 + }, + { + "epoch": 0.876914759416111, + "grad_norm": 0.44472888112068176, + "learning_rate": 9.827832807561392e-06, + "loss": 0.4635, + "step": 1622 + }, + { + "epoch": 0.8774553973688953, + "grad_norm": 0.3634223937988281, + "learning_rate": 9.827341294767347e-06, + "loss": 0.4762, + "step": 1623 + }, + { + "epoch": 0.8779960353216796, + "grad_norm": 0.43389296531677246, + "learning_rate": 9.826849093699733e-06, + "loss": 0.4297, + "step": 1624 + }, + { + "epoch": 0.8785366732744638, + "grad_norm": 0.43347981572151184, + "learning_rate": 9.826356204428726e-06, + "loss": 0.4498, + "step": 1625 + }, + { + "epoch": 0.8790773112272482, + "grad_norm": 0.40552183985710144, + "learning_rate": 9.825862627024606e-06, + "loss": 0.4644, + "step": 1626 + }, + { + "epoch": 0.8796179491800324, + "grad_norm": 0.41467517614364624, + "learning_rate": 9.825368361557738e-06, + "loss": 0.4461, + "step": 1627 + }, + { + "epoch": 0.8801585871328167, + "grad_norm": 0.44060826301574707, + "learning_rate": 9.824873408098598e-06, + "loss": 0.4671, + "step": 1628 + }, + { + "epoch": 0.8806992250856011, + "grad_norm": 0.3913240134716034, + "learning_rate": 9.824377766717758e-06, + "loss": 0.4754, + "step": 1629 + }, + { + "epoch": 0.8812398630383853, + "grad_norm": 0.4083966314792633, + "learning_rate": 9.823881437485882e-06, + "loss": 0.4769, + "step": 1630 + }, + { + "epoch": 0.8817805009911696, + "grad_norm": 0.4295070171356201, + "learning_rate": 9.823384420473738e-06, + "loss": 0.4634, + "step": 1631 + }, + { + "epoch": 0.8823211389439539, + "grad_norm": 0.41920551657676697, + "learning_rate": 9.822886715752187e-06, + "loss": 0.4609, + "step": 1632 + }, + { + "epoch": 0.8828617768967382, + "grad_norm": 0.39997240900993347, + "learning_rate": 9.822388323392193e-06, + "loss": 0.4757, + "step": 1633 + }, + { + "epoch": 0.8834024148495224, + "grad_norm": 0.38902947306632996, + "learning_rate": 9.821889243464816e-06, + "loss": 0.4798, + "step": 1634 + }, + { + "epoch": 0.8839430528023067, + "grad_norm": 0.41250500082969666, + "learning_rate": 9.821389476041212e-06, + "loss": 0.4552, + "step": 1635 + }, + { + "epoch": 0.884483690755091, + "grad_norm": 0.45538103580474854, + "learning_rate": 9.82088902119264e-06, + "loss": 0.4787, + "step": 1636 + }, + { + "epoch": 0.8850243287078753, + "grad_norm": 0.4461895823478699, + "learning_rate": 9.820387878990451e-06, + "loss": 0.4538, + "step": 1637 + }, + { + "epoch": 0.8855649666606595, + "grad_norm": 0.47144433856010437, + "learning_rate": 9.819886049506098e-06, + "loss": 0.4438, + "step": 1638 + }, + { + "epoch": 0.8861056046134439, + "grad_norm": 0.43563005328178406, + "learning_rate": 9.819383532811134e-06, + "loss": 0.4701, + "step": 1639 + }, + { + "epoch": 0.8866462425662281, + "grad_norm": 0.43610212206840515, + "learning_rate": 9.8188803289772e-06, + "loss": 0.4917, + "step": 1640 + }, + { + "epoch": 0.8871868805190124, + "grad_norm": 0.39954859018325806, + "learning_rate": 9.818376438076047e-06, + "loss": 0.4816, + "step": 1641 + }, + { + "epoch": 0.8877275184717968, + "grad_norm": 0.3857881426811218, + "learning_rate": 9.817871860179519e-06, + "loss": 0.4716, + "step": 1642 + }, + { + "epoch": 0.888268156424581, + "grad_norm": 0.4059131443500519, + "learning_rate": 9.817366595359556e-06, + "loss": 0.4572, + "step": 1643 + }, + { + "epoch": 0.8888087943773653, + "grad_norm": 0.4096532464027405, + "learning_rate": 9.816860643688197e-06, + "loss": 0.4901, + "step": 1644 + }, + { + "epoch": 0.8893494323301496, + "grad_norm": 0.45331189036369324, + "learning_rate": 9.816354005237583e-06, + "loss": 0.4495, + "step": 1645 + }, + { + "epoch": 0.8898900702829339, + "grad_norm": 0.429908812046051, + "learning_rate": 9.815846680079946e-06, + "loss": 0.4694, + "step": 1646 + }, + { + "epoch": 0.8904307082357181, + "grad_norm": 0.37314942479133606, + "learning_rate": 9.815338668287621e-06, + "loss": 0.4854, + "step": 1647 + }, + { + "epoch": 0.8909713461885024, + "grad_norm": 0.4905018210411072, + "learning_rate": 9.81482996993304e-06, + "loss": 0.4551, + "step": 1648 + }, + { + "epoch": 0.8915119841412867, + "grad_norm": 0.3743281960487366, + "learning_rate": 9.814320585088732e-06, + "loss": 0.4557, + "step": 1649 + }, + { + "epoch": 0.892052622094071, + "grad_norm": 0.42336776852607727, + "learning_rate": 9.813810513827324e-06, + "loss": 0.4594, + "step": 1650 + }, + { + "epoch": 0.8925932600468552, + "grad_norm": 0.4253554046154022, + "learning_rate": 9.813299756221539e-06, + "loss": 0.4816, + "step": 1651 + }, + { + "epoch": 0.8931338979996396, + "grad_norm": 0.45747578144073486, + "learning_rate": 9.812788312344203e-06, + "loss": 0.4935, + "step": 1652 + }, + { + "epoch": 0.8936745359524239, + "grad_norm": 0.4536707401275635, + "learning_rate": 9.812276182268236e-06, + "loss": 0.4865, + "step": 1653 + }, + { + "epoch": 0.8942151739052081, + "grad_norm": 0.43989959359169006, + "learning_rate": 9.811763366066657e-06, + "loss": 0.4677, + "step": 1654 + }, + { + "epoch": 0.8947558118579925, + "grad_norm": 0.45789846777915955, + "learning_rate": 9.811249863812581e-06, + "loss": 0.4643, + "step": 1655 + }, + { + "epoch": 0.8952964498107767, + "grad_norm": 0.3918055295944214, + "learning_rate": 9.810735675579221e-06, + "loss": 0.4734, + "step": 1656 + }, + { + "epoch": 0.895837087763561, + "grad_norm": 0.4731237590312958, + "learning_rate": 9.810220801439894e-06, + "loss": 0.4887, + "step": 1657 + }, + { + "epoch": 0.8963777257163453, + "grad_norm": 0.39390793442726135, + "learning_rate": 9.809705241468004e-06, + "loss": 0.4299, + "step": 1658 + }, + { + "epoch": 0.8969183636691296, + "grad_norm": 0.4457356035709381, + "learning_rate": 9.809188995737062e-06, + "loss": 0.467, + "step": 1659 + }, + { + "epoch": 0.8974590016219138, + "grad_norm": 0.43071913719177246, + "learning_rate": 9.808672064320672e-06, + "loss": 0.4533, + "step": 1660 + }, + { + "epoch": 0.8979996395746982, + "grad_norm": 0.505720317363739, + "learning_rate": 9.808154447292539e-06, + "loss": 0.4819, + "step": 1661 + }, + { + "epoch": 0.8985402775274824, + "grad_norm": 0.3969588875770569, + "learning_rate": 9.807636144726463e-06, + "loss": 0.4719, + "step": 1662 + }, + { + "epoch": 0.8990809154802667, + "grad_norm": 0.4451639950275421, + "learning_rate": 9.80711715669634e-06, + "loss": 0.4385, + "step": 1663 + }, + { + "epoch": 0.899621553433051, + "grad_norm": 0.40204963088035583, + "learning_rate": 9.80659748327617e-06, + "loss": 0.4589, + "step": 1664 + }, + { + "epoch": 0.9001621913858353, + "grad_norm": 0.37509143352508545, + "learning_rate": 9.806077124540045e-06, + "loss": 0.449, + "step": 1665 + }, + { + "epoch": 0.9007028293386196, + "grad_norm": 0.4153278172016144, + "learning_rate": 9.80555608056216e-06, + "loss": 0.473, + "step": 1666 + }, + { + "epoch": 0.9012434672914038, + "grad_norm": 0.35227838158607483, + "learning_rate": 9.805034351416799e-06, + "loss": 0.4741, + "step": 1667 + }, + { + "epoch": 0.9017841052441882, + "grad_norm": 0.39335453510284424, + "learning_rate": 9.804511937178353e-06, + "loss": 0.4675, + "step": 1668 + }, + { + "epoch": 0.9023247431969724, + "grad_norm": 0.38329482078552246, + "learning_rate": 9.803988837921307e-06, + "loss": 0.4883, + "step": 1669 + }, + { + "epoch": 0.9028653811497567, + "grad_norm": 0.3808650076389313, + "learning_rate": 9.803465053720242e-06, + "loss": 0.4607, + "step": 1670 + }, + { + "epoch": 0.903406019102541, + "grad_norm": 0.37646231055259705, + "learning_rate": 9.80294058464984e-06, + "loss": 0.4577, + "step": 1671 + }, + { + "epoch": 0.9039466570553253, + "grad_norm": 0.36820298433303833, + "learning_rate": 9.802415430784877e-06, + "loss": 0.4797, + "step": 1672 + }, + { + "epoch": 0.9044872950081095, + "grad_norm": 0.37384963035583496, + "learning_rate": 9.801889592200229e-06, + "loss": 0.4568, + "step": 1673 + }, + { + "epoch": 0.9050279329608939, + "grad_norm": 0.3755015730857849, + "learning_rate": 9.80136306897087e-06, + "loss": 0.4753, + "step": 1674 + }, + { + "epoch": 0.9055685709136782, + "grad_norm": 0.4412163197994232, + "learning_rate": 9.800835861171869e-06, + "loss": 0.4805, + "step": 1675 + }, + { + "epoch": 0.9061092088664624, + "grad_norm": 0.4151507318019867, + "learning_rate": 9.800307968878395e-06, + "loss": 0.4629, + "step": 1676 + }, + { + "epoch": 0.9066498468192468, + "grad_norm": 0.4136570990085602, + "learning_rate": 9.799779392165716e-06, + "loss": 0.4265, + "step": 1677 + }, + { + "epoch": 0.907190484772031, + "grad_norm": 0.40677595138549805, + "learning_rate": 9.799250131109192e-06, + "loss": 0.4717, + "step": 1678 + }, + { + "epoch": 0.9077311227248153, + "grad_norm": 0.4381828308105469, + "learning_rate": 9.798720185784288e-06, + "loss": 0.4893, + "step": 1679 + }, + { + "epoch": 0.9082717606775995, + "grad_norm": 0.4856142997741699, + "learning_rate": 9.798189556266559e-06, + "loss": 0.4538, + "step": 1680 + }, + { + "epoch": 0.9088123986303839, + "grad_norm": 0.40494972467422485, + "learning_rate": 9.797658242631664e-06, + "loss": 0.4772, + "step": 1681 + }, + { + "epoch": 0.9093530365831681, + "grad_norm": 0.4658674895763397, + "learning_rate": 9.797126244955355e-06, + "loss": 0.4778, + "step": 1682 + }, + { + "epoch": 0.9098936745359524, + "grad_norm": 0.3895665407180786, + "learning_rate": 9.796593563313483e-06, + "loss": 0.4633, + "step": 1683 + }, + { + "epoch": 0.9104343124887367, + "grad_norm": 0.46936503052711487, + "learning_rate": 9.796060197781998e-06, + "loss": 0.4864, + "step": 1684 + }, + { + "epoch": 0.910974950441521, + "grad_norm": 0.3557020425796509, + "learning_rate": 9.795526148436945e-06, + "loss": 0.4757, + "step": 1685 + }, + { + "epoch": 0.9115155883943052, + "grad_norm": 0.3669896423816681, + "learning_rate": 9.794991415354468e-06, + "loss": 0.4801, + "step": 1686 + }, + { + "epoch": 0.9120562263470896, + "grad_norm": 0.35865387320518494, + "learning_rate": 9.794455998610812e-06, + "loss": 0.4782, + "step": 1687 + }, + { + "epoch": 0.9125968642998739, + "grad_norm": 0.3865331709384918, + "learning_rate": 9.79391989828231e-06, + "loss": 0.4783, + "step": 1688 + }, + { + "epoch": 0.9131375022526581, + "grad_norm": 0.37714827060699463, + "learning_rate": 9.793383114445403e-06, + "loss": 0.4599, + "step": 1689 + }, + { + "epoch": 0.9136781402054425, + "grad_norm": 0.37188130617141724, + "learning_rate": 9.792845647176621e-06, + "loss": 0.4952, + "step": 1690 + }, + { + "epoch": 0.9142187781582267, + "grad_norm": 0.3693977892398834, + "learning_rate": 9.792307496552596e-06, + "loss": 0.4661, + "step": 1691 + }, + { + "epoch": 0.914759416111011, + "grad_norm": 0.3558450937271118, + "learning_rate": 9.791768662650059e-06, + "loss": 0.47, + "step": 1692 + }, + { + "epoch": 0.9153000540637953, + "grad_norm": 0.3815310299396515, + "learning_rate": 9.791229145545832e-06, + "loss": 0.4877, + "step": 1693 + }, + { + "epoch": 0.9158406920165796, + "grad_norm": 0.3444439768791199, + "learning_rate": 9.790688945316841e-06, + "loss": 0.4563, + "step": 1694 + }, + { + "epoch": 0.9163813299693638, + "grad_norm": 0.43752437829971313, + "learning_rate": 9.790148062040108e-06, + "loss": 0.4387, + "step": 1695 + }, + { + "epoch": 0.9169219679221481, + "grad_norm": 0.40530988574028015, + "learning_rate": 9.789606495792748e-06, + "loss": 0.5028, + "step": 1696 + }, + { + "epoch": 0.9174626058749324, + "grad_norm": 0.40507200360298157, + "learning_rate": 9.789064246651978e-06, + "loss": 0.4567, + "step": 1697 + }, + { + "epoch": 0.9180032438277167, + "grad_norm": 0.40165984630584717, + "learning_rate": 9.78852131469511e-06, + "loss": 0.4753, + "step": 1698 + }, + { + "epoch": 0.918543881780501, + "grad_norm": 0.3914419114589691, + "learning_rate": 9.787977699999556e-06, + "loss": 0.4578, + "step": 1699 + }, + { + "epoch": 0.9190845197332853, + "grad_norm": 0.3427618443965912, + "learning_rate": 9.787433402642823e-06, + "loss": 0.4669, + "step": 1700 + }, + { + "epoch": 0.9196251576860696, + "grad_norm": 0.3847512900829315, + "learning_rate": 9.786888422702516e-06, + "loss": 0.4994, + "step": 1701 + }, + { + "epoch": 0.9201657956388538, + "grad_norm": 0.4131098985671997, + "learning_rate": 9.786342760256336e-06, + "loss": 0.4595, + "step": 1702 + }, + { + "epoch": 0.9207064335916382, + "grad_norm": 0.3641406297683716, + "learning_rate": 9.785796415382084e-06, + "loss": 0.4453, + "step": 1703 + }, + { + "epoch": 0.9212470715444224, + "grad_norm": 0.4470204710960388, + "learning_rate": 9.785249388157656e-06, + "loss": 0.4708, + "step": 1704 + }, + { + "epoch": 0.9217877094972067, + "grad_norm": 0.3770560026168823, + "learning_rate": 9.784701678661045e-06, + "loss": 0.4624, + "step": 1705 + }, + { + "epoch": 0.922328347449991, + "grad_norm": 0.4491824209690094, + "learning_rate": 9.784153286970346e-06, + "loss": 0.4868, + "step": 1706 + }, + { + "epoch": 0.9228689854027753, + "grad_norm": 0.3728610873222351, + "learning_rate": 9.783604213163744e-06, + "loss": 0.4444, + "step": 1707 + }, + { + "epoch": 0.9234096233555595, + "grad_norm": 0.3844902813434601, + "learning_rate": 9.783054457319528e-06, + "loss": 0.4467, + "step": 1708 + }, + { + "epoch": 0.9239502613083439, + "grad_norm": 0.3831673264503479, + "learning_rate": 9.782504019516079e-06, + "loss": 0.4898, + "step": 1709 + }, + { + "epoch": 0.9244908992611282, + "grad_norm": 0.35555362701416016, + "learning_rate": 9.781952899831876e-06, + "loss": 0.481, + "step": 1710 + }, + { + "epoch": 0.9250315372139124, + "grad_norm": 0.39292532205581665, + "learning_rate": 9.781401098345503e-06, + "loss": 0.4751, + "step": 1711 + }, + { + "epoch": 0.9255721751666967, + "grad_norm": 0.3829486072063446, + "learning_rate": 9.780848615135627e-06, + "loss": 0.48, + "step": 1712 + }, + { + "epoch": 0.926112813119481, + "grad_norm": 0.4107460379600525, + "learning_rate": 9.780295450281026e-06, + "loss": 0.4799, + "step": 1713 + }, + { + "epoch": 0.9266534510722653, + "grad_norm": 0.379866361618042, + "learning_rate": 9.779741603860567e-06, + "loss": 0.4582, + "step": 1714 + }, + { + "epoch": 0.9271940890250495, + "grad_norm": 0.37752336263656616, + "learning_rate": 9.779187075953215e-06, + "loss": 0.4599, + "step": 1715 + }, + { + "epoch": 0.9277347269778339, + "grad_norm": 0.4197329580783844, + "learning_rate": 9.778631866638036e-06, + "loss": 0.4722, + "step": 1716 + }, + { + "epoch": 0.9282753649306181, + "grad_norm": 0.38856256008148193, + "learning_rate": 9.778075975994188e-06, + "loss": 0.4679, + "step": 1717 + }, + { + "epoch": 0.9288160028834024, + "grad_norm": 0.44080206751823425, + "learning_rate": 9.777519404100933e-06, + "loss": 0.4653, + "step": 1718 + }, + { + "epoch": 0.9293566408361867, + "grad_norm": 0.39855122566223145, + "learning_rate": 9.77696215103762e-06, + "loss": 0.4706, + "step": 1719 + }, + { + "epoch": 0.929897278788971, + "grad_norm": 0.41832059621810913, + "learning_rate": 9.776404216883709e-06, + "loss": 0.5104, + "step": 1720 + }, + { + "epoch": 0.9304379167417552, + "grad_norm": 0.45665568113327026, + "learning_rate": 9.775845601718742e-06, + "loss": 0.4891, + "step": 1721 + }, + { + "epoch": 0.9309785546945396, + "grad_norm": 0.44155561923980713, + "learning_rate": 9.775286305622368e-06, + "loss": 0.4684, + "step": 1722 + }, + { + "epoch": 0.9315191926473239, + "grad_norm": 0.4479288160800934, + "learning_rate": 9.774726328674333e-06, + "loss": 0.4584, + "step": 1723 + }, + { + "epoch": 0.9320598306001081, + "grad_norm": 0.40802037715911865, + "learning_rate": 9.774165670954474e-06, + "loss": 0.4648, + "step": 1724 + }, + { + "epoch": 0.9326004685528924, + "grad_norm": 0.39233508706092834, + "learning_rate": 9.77360433254273e-06, + "loss": 0.4705, + "step": 1725 + }, + { + "epoch": 0.9331411065056767, + "grad_norm": 0.49127158522605896, + "learning_rate": 9.773042313519135e-06, + "loss": 0.4733, + "step": 1726 + }, + { + "epoch": 0.933681744458461, + "grad_norm": 0.36547306180000305, + "learning_rate": 9.77247961396382e-06, + "loss": 0.4773, + "step": 1727 + }, + { + "epoch": 0.9342223824112452, + "grad_norm": 0.45778974890708923, + "learning_rate": 9.771916233957015e-06, + "loss": 0.4535, + "step": 1728 + }, + { + "epoch": 0.9347630203640296, + "grad_norm": 0.4647793471813202, + "learning_rate": 9.771352173579048e-06, + "loss": 0.4809, + "step": 1729 + }, + { + "epoch": 0.9353036583168138, + "grad_norm": 0.3391762375831604, + "learning_rate": 9.770787432910336e-06, + "loss": 0.4735, + "step": 1730 + }, + { + "epoch": 0.9358442962695981, + "grad_norm": 0.428021103143692, + "learning_rate": 9.770222012031404e-06, + "loss": 0.4604, + "step": 1731 + }, + { + "epoch": 0.9363849342223824, + "grad_norm": 0.41578346490859985, + "learning_rate": 9.769655911022864e-06, + "loss": 0.4928, + "step": 1732 + }, + { + "epoch": 0.9369255721751667, + "grad_norm": 0.39558741450309753, + "learning_rate": 9.769089129965435e-06, + "loss": 0.4755, + "step": 1733 + }, + { + "epoch": 0.937466210127951, + "grad_norm": 0.3967934846878052, + "learning_rate": 9.768521668939924e-06, + "loss": 0.4611, + "step": 1734 + }, + { + "epoch": 0.9380068480807353, + "grad_norm": 0.34875285625457764, + "learning_rate": 9.767953528027238e-06, + "loss": 0.4419, + "step": 1735 + }, + { + "epoch": 0.9385474860335196, + "grad_norm": 0.36679312586784363, + "learning_rate": 9.767384707308383e-06, + "loss": 0.4631, + "step": 1736 + }, + { + "epoch": 0.9390881239863038, + "grad_norm": 0.3865496516227722, + "learning_rate": 9.76681520686446e-06, + "loss": 0.4864, + "step": 1737 + }, + { + "epoch": 0.9396287619390882, + "grad_norm": 0.362447053194046, + "learning_rate": 9.766245026776668e-06, + "loss": 0.4684, + "step": 1738 + }, + { + "epoch": 0.9401693998918724, + "grad_norm": 0.358012855052948, + "learning_rate": 9.765674167126303e-06, + "loss": 0.4952, + "step": 1739 + }, + { + "epoch": 0.9407100378446567, + "grad_norm": 0.349895715713501, + "learning_rate": 9.765102627994757e-06, + "loss": 0.4928, + "step": 1740 + }, + { + "epoch": 0.9412506757974409, + "grad_norm": 0.38196805119514465, + "learning_rate": 9.764530409463516e-06, + "loss": 0.4672, + "step": 1741 + }, + { + "epoch": 0.9417913137502253, + "grad_norm": 0.3335559070110321, + "learning_rate": 9.763957511614166e-06, + "loss": 0.4631, + "step": 1742 + }, + { + "epoch": 0.9423319517030095, + "grad_norm": 0.35727524757385254, + "learning_rate": 9.763383934528393e-06, + "loss": 0.4684, + "step": 1743 + }, + { + "epoch": 0.9428725896557938, + "grad_norm": 0.3617507517337799, + "learning_rate": 9.762809678287977e-06, + "loss": 0.4639, + "step": 1744 + }, + { + "epoch": 0.9434132276085782, + "grad_norm": 0.35392940044403076, + "learning_rate": 9.762234742974793e-06, + "loss": 0.4491, + "step": 1745 + }, + { + "epoch": 0.9439538655613624, + "grad_norm": 0.3508428633213043, + "learning_rate": 9.761659128670811e-06, + "loss": 0.4596, + "step": 1746 + }, + { + "epoch": 0.9444945035141467, + "grad_norm": 0.35308611392974854, + "learning_rate": 9.761082835458104e-06, + "loss": 0.4254, + "step": 1747 + }, + { + "epoch": 0.945035141466931, + "grad_norm": 0.3707454800605774, + "learning_rate": 9.760505863418841e-06, + "loss": 0.4768, + "step": 1748 + }, + { + "epoch": 0.9455757794197153, + "grad_norm": 0.32324856519699097, + "learning_rate": 9.759928212635281e-06, + "loss": 0.4895, + "step": 1749 + }, + { + "epoch": 0.9461164173724995, + "grad_norm": 0.3526584506034851, + "learning_rate": 9.759349883189788e-06, + "loss": 0.47, + "step": 1750 + }, + { + "epoch": 0.9466570553252839, + "grad_norm": 0.3857341408729553, + "learning_rate": 9.758770875164817e-06, + "loss": 0.4596, + "step": 1751 + }, + { + "epoch": 0.9471976932780681, + "grad_norm": 0.3677152395248413, + "learning_rate": 9.758191188642924e-06, + "loss": 0.4959, + "step": 1752 + }, + { + "epoch": 0.9477383312308524, + "grad_norm": 0.3096745014190674, + "learning_rate": 9.75761082370676e-06, + "loss": 0.4465, + "step": 1753 + }, + { + "epoch": 0.9482789691836367, + "grad_norm": 0.3877503275871277, + "learning_rate": 9.757029780439069e-06, + "loss": 0.4668, + "step": 1754 + }, + { + "epoch": 0.948819607136421, + "grad_norm": 0.3441794216632843, + "learning_rate": 9.756448058922697e-06, + "loss": 0.4602, + "step": 1755 + }, + { + "epoch": 0.9493602450892052, + "grad_norm": 0.3353310227394104, + "learning_rate": 9.755865659240585e-06, + "loss": 0.4535, + "step": 1756 + }, + { + "epoch": 0.9499008830419895, + "grad_norm": 0.361272931098938, + "learning_rate": 9.755282581475769e-06, + "loss": 0.4874, + "step": 1757 + }, + { + "epoch": 0.9504415209947739, + "grad_norm": 0.37511539459228516, + "learning_rate": 9.754698825711384e-06, + "loss": 0.4829, + "step": 1758 + }, + { + "epoch": 0.9509821589475581, + "grad_norm": 0.36202606558799744, + "learning_rate": 9.754114392030663e-06, + "loss": 0.4436, + "step": 1759 + }, + { + "epoch": 0.9515227969003424, + "grad_norm": 0.36360007524490356, + "learning_rate": 9.753529280516931e-06, + "loss": 0.4749, + "step": 1760 + }, + { + "epoch": 0.9520634348531267, + "grad_norm": 0.3650102913379669, + "learning_rate": 9.752943491253614e-06, + "loss": 0.4808, + "step": 1761 + }, + { + "epoch": 0.952604072805911, + "grad_norm": 0.4095380902290344, + "learning_rate": 9.75235702432423e-06, + "loss": 0.489, + "step": 1762 + }, + { + "epoch": 0.9531447107586952, + "grad_norm": 0.3650003969669342, + "learning_rate": 9.7517698798124e-06, + "loss": 0.4751, + "step": 1763 + }, + { + "epoch": 0.9536853487114796, + "grad_norm": 4.295322895050049, + "learning_rate": 9.751182057801835e-06, + "loss": 0.4924, + "step": 1764 + }, + { + "epoch": 0.9542259866642638, + "grad_norm": 0.4038078486919403, + "learning_rate": 9.750593558376347e-06, + "loss": 0.4769, + "step": 1765 + }, + { + "epoch": 0.9547666246170481, + "grad_norm": 0.352078378200531, + "learning_rate": 9.750004381619841e-06, + "loss": 0.4449, + "step": 1766 + }, + { + "epoch": 0.9553072625698324, + "grad_norm": 0.3834569454193115, + "learning_rate": 9.749414527616325e-06, + "loss": 0.4789, + "step": 1767 + }, + { + "epoch": 0.9558479005226167, + "grad_norm": 0.40125179290771484, + "learning_rate": 9.748823996449895e-06, + "loss": 0.4755, + "step": 1768 + }, + { + "epoch": 0.956388538475401, + "grad_norm": 0.3302454352378845, + "learning_rate": 9.74823278820475e-06, + "loss": 0.4495, + "step": 1769 + }, + { + "epoch": 0.9569291764281853, + "grad_norm": 0.4053800404071808, + "learning_rate": 9.747640902965185e-06, + "loss": 0.4675, + "step": 1770 + }, + { + "epoch": 0.9574698143809696, + "grad_norm": 0.3767763674259186, + "learning_rate": 9.747048340815586e-06, + "loss": 0.4757, + "step": 1771 + }, + { + "epoch": 0.9580104523337538, + "grad_norm": 0.3748930096626282, + "learning_rate": 9.746455101840442e-06, + "loss": 0.458, + "step": 1772 + }, + { + "epoch": 0.9585510902865381, + "grad_norm": 0.34963858127593994, + "learning_rate": 9.745861186124336e-06, + "loss": 0.4685, + "step": 1773 + }, + { + "epoch": 0.9590917282393224, + "grad_norm": 0.374912291765213, + "learning_rate": 9.745266593751946e-06, + "loss": 0.4811, + "step": 1774 + }, + { + "epoch": 0.9596323661921067, + "grad_norm": 0.38071802258491516, + "learning_rate": 9.74467132480805e-06, + "loss": 0.4724, + "step": 1775 + }, + { + "epoch": 0.9601730041448909, + "grad_norm": 0.34633877873420715, + "learning_rate": 9.744075379377518e-06, + "loss": 0.4496, + "step": 1776 + }, + { + "epoch": 0.9607136420976753, + "grad_norm": 0.39294639229774475, + "learning_rate": 9.743478757545321e-06, + "loss": 0.4599, + "step": 1777 + }, + { + "epoch": 0.9612542800504595, + "grad_norm": 0.42019978165626526, + "learning_rate": 9.742881459396522e-06, + "loss": 0.4521, + "step": 1778 + }, + { + "epoch": 0.9617949180032438, + "grad_norm": 0.4065021574497223, + "learning_rate": 9.742283485016285e-06, + "loss": 0.4968, + "step": 1779 + }, + { + "epoch": 0.9623355559560282, + "grad_norm": 0.3769640624523163, + "learning_rate": 9.741684834489866e-06, + "loss": 0.4609, + "step": 1780 + }, + { + "epoch": 0.9628761939088124, + "grad_norm": 0.34647080302238464, + "learning_rate": 9.741085507902622e-06, + "loss": 0.4587, + "step": 1781 + }, + { + "epoch": 0.9634168318615967, + "grad_norm": 0.36830389499664307, + "learning_rate": 9.740485505340002e-06, + "loss": 0.4353, + "step": 1782 + }, + { + "epoch": 0.963957469814381, + "grad_norm": 0.40501725673675537, + "learning_rate": 9.739884826887554e-06, + "loss": 0.434, + "step": 1783 + }, + { + "epoch": 0.9644981077671653, + "grad_norm": 0.4197520315647125, + "learning_rate": 9.739283472630919e-06, + "loss": 0.4318, + "step": 1784 + }, + { + "epoch": 0.9650387457199495, + "grad_norm": 0.3837212324142456, + "learning_rate": 9.738681442655842e-06, + "loss": 0.4664, + "step": 1785 + }, + { + "epoch": 0.9655793836727338, + "grad_norm": 0.3517027795314789, + "learning_rate": 9.738078737048156e-06, + "loss": 0.459, + "step": 1786 + }, + { + "epoch": 0.9661200216255181, + "grad_norm": 0.43149450421333313, + "learning_rate": 9.737475355893793e-06, + "loss": 0.4861, + "step": 1787 + }, + { + "epoch": 0.9666606595783024, + "grad_norm": 0.36714836955070496, + "learning_rate": 9.736871299278786e-06, + "loss": 0.4603, + "step": 1788 + }, + { + "epoch": 0.9672012975310866, + "grad_norm": 0.3596876263618469, + "learning_rate": 9.736266567289255e-06, + "loss": 0.4898, + "step": 1789 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.45363107323646545, + "learning_rate": 9.735661160011424e-06, + "loss": 0.4787, + "step": 1790 + }, + { + "epoch": 0.9682825734366552, + "grad_norm": 0.36468809843063354, + "learning_rate": 9.735055077531612e-06, + "loss": 0.4616, + "step": 1791 + }, + { + "epoch": 0.9688232113894395, + "grad_norm": 0.3992007076740265, + "learning_rate": 9.734448319936234e-06, + "loss": 0.4556, + "step": 1792 + }, + { + "epoch": 0.9693638493422239, + "grad_norm": 0.37230291962623596, + "learning_rate": 9.733840887311796e-06, + "loss": 0.4585, + "step": 1793 + }, + { + "epoch": 0.9699044872950081, + "grad_norm": 0.41598498821258545, + "learning_rate": 9.733232779744909e-06, + "loss": 0.4779, + "step": 1794 + }, + { + "epoch": 0.9704451252477924, + "grad_norm": 0.3878816068172455, + "learning_rate": 9.732623997322274e-06, + "loss": 0.4846, + "step": 1795 + }, + { + "epoch": 0.9709857632005767, + "grad_norm": 0.39695218205451965, + "learning_rate": 9.73201454013069e-06, + "loss": 0.4606, + "step": 1796 + }, + { + "epoch": 0.971526401153361, + "grad_norm": 0.42593061923980713, + "learning_rate": 9.731404408257052e-06, + "loss": 0.4784, + "step": 1797 + }, + { + "epoch": 0.9720670391061452, + "grad_norm": 0.4588161110877991, + "learning_rate": 9.730793601788353e-06, + "loss": 0.4844, + "step": 1798 + }, + { + "epoch": 0.9726076770589296, + "grad_norm": 0.3776327073574066, + "learning_rate": 9.730182120811679e-06, + "loss": 0.4468, + "step": 1799 + }, + { + "epoch": 0.9731483150117138, + "grad_norm": 0.3473685085773468, + "learning_rate": 9.729569965414214e-06, + "loss": 0.4567, + "step": 1800 + }, + { + "epoch": 0.9736889529644981, + "grad_norm": 0.39597994089126587, + "learning_rate": 9.728957135683238e-06, + "loss": 0.478, + "step": 1801 + }, + { + "epoch": 0.9742295909172823, + "grad_norm": 0.3842535614967346, + "learning_rate": 9.72834363170613e-06, + "loss": 0.4458, + "step": 1802 + }, + { + "epoch": 0.9747702288700667, + "grad_norm": 0.42436766624450684, + "learning_rate": 9.72772945357036e-06, + "loss": 0.4971, + "step": 1803 + }, + { + "epoch": 0.975310866822851, + "grad_norm": 0.4073885381221771, + "learning_rate": 9.727114601363496e-06, + "loss": 0.4751, + "step": 1804 + }, + { + "epoch": 0.9758515047756352, + "grad_norm": 0.35591429471969604, + "learning_rate": 9.726499075173201e-06, + "loss": 0.4507, + "step": 1805 + }, + { + "epoch": 0.9763921427284196, + "grad_norm": 0.34828415513038635, + "learning_rate": 9.72588287508724e-06, + "loss": 0.4693, + "step": 1806 + }, + { + "epoch": 0.9769327806812038, + "grad_norm": 0.3742208778858185, + "learning_rate": 9.725266001193466e-06, + "loss": 0.4504, + "step": 1807 + }, + { + "epoch": 0.9774734186339881, + "grad_norm": 0.3611016571521759, + "learning_rate": 9.724648453579834e-06, + "loss": 0.4732, + "step": 1808 + }, + { + "epoch": 0.9780140565867724, + "grad_norm": 0.32930028438568115, + "learning_rate": 9.72403023233439e-06, + "loss": 0.4482, + "step": 1809 + }, + { + "epoch": 0.9785546945395567, + "grad_norm": 0.3373595178127289, + "learning_rate": 9.723411337545283e-06, + "loss": 0.4635, + "step": 1810 + }, + { + "epoch": 0.9790953324923409, + "grad_norm": 0.3561059534549713, + "learning_rate": 9.72279176930075e-06, + "loss": 0.4581, + "step": 1811 + }, + { + "epoch": 0.9796359704451253, + "grad_norm": 0.3475349545478821, + "learning_rate": 9.722171527689131e-06, + "loss": 0.4419, + "step": 1812 + }, + { + "epoch": 0.9801766083979095, + "grad_norm": 0.3433786928653717, + "learning_rate": 9.721550612798856e-06, + "loss": 0.4448, + "step": 1813 + }, + { + "epoch": 0.9807172463506938, + "grad_norm": 0.3646323084831238, + "learning_rate": 9.720929024718456e-06, + "loss": 0.4638, + "step": 1814 + }, + { + "epoch": 0.9812578843034782, + "grad_norm": 0.35177263617515564, + "learning_rate": 9.720306763536553e-06, + "loss": 0.4595, + "step": 1815 + }, + { + "epoch": 0.9817985222562624, + "grad_norm": 0.37401440739631653, + "learning_rate": 9.719683829341873e-06, + "loss": 0.452, + "step": 1816 + }, + { + "epoch": 0.9823391602090467, + "grad_norm": 0.3282455503940582, + "learning_rate": 9.71906022222323e-06, + "loss": 0.4522, + "step": 1817 + }, + { + "epoch": 0.9828797981618309, + "grad_norm": 0.34565940499305725, + "learning_rate": 9.718435942269534e-06, + "loss": 0.45, + "step": 1818 + }, + { + "epoch": 0.9834204361146153, + "grad_norm": 0.3489386737346649, + "learning_rate": 9.717810989569798e-06, + "loss": 0.4791, + "step": 1819 + }, + { + "epoch": 0.9839610740673995, + "grad_norm": 0.3730597198009491, + "learning_rate": 9.717185364213127e-06, + "loss": 0.4963, + "step": 1820 + }, + { + "epoch": 0.9845017120201838, + "grad_norm": 0.33707693219184875, + "learning_rate": 9.716559066288716e-06, + "loss": 0.486, + "step": 1821 + }, + { + "epoch": 0.9850423499729681, + "grad_norm": 0.34130632877349854, + "learning_rate": 9.715932095885867e-06, + "loss": 0.4575, + "step": 1822 + }, + { + "epoch": 0.9855829879257524, + "grad_norm": 0.35243359208106995, + "learning_rate": 9.71530445309397e-06, + "loss": 0.4883, + "step": 1823 + }, + { + "epoch": 0.9861236258785366, + "grad_norm": 0.3288569152355194, + "learning_rate": 9.714676138002514e-06, + "loss": 0.4736, + "step": 1824 + }, + { + "epoch": 0.986664263831321, + "grad_norm": 0.36741355061531067, + "learning_rate": 9.714047150701082e-06, + "loss": 0.4668, + "step": 1825 + }, + { + "epoch": 0.9872049017841052, + "grad_norm": 0.3808061480522156, + "learning_rate": 9.713417491279354e-06, + "loss": 0.444, + "step": 1826 + }, + { + "epoch": 0.9877455397368895, + "grad_norm": 0.3768243193626404, + "learning_rate": 9.712787159827107e-06, + "loss": 0.4654, + "step": 1827 + }, + { + "epoch": 0.9882861776896739, + "grad_norm": 0.3869498074054718, + "learning_rate": 9.71215615643421e-06, + "loss": 0.4807, + "step": 1828 + }, + { + "epoch": 0.9888268156424581, + "grad_norm": 0.4101428687572479, + "learning_rate": 9.711524481190633e-06, + "loss": 0.4621, + "step": 1829 + }, + { + "epoch": 0.9893674535952424, + "grad_norm": 0.36245039105415344, + "learning_rate": 9.710892134186438e-06, + "loss": 0.4475, + "step": 1830 + }, + { + "epoch": 0.9899080915480267, + "grad_norm": 0.40394195914268494, + "learning_rate": 9.710259115511784e-06, + "loss": 0.4703, + "step": 1831 + }, + { + "epoch": 0.990448729500811, + "grad_norm": 0.3817879259586334, + "learning_rate": 9.709625425256926e-06, + "loss": 0.456, + "step": 1832 + }, + { + "epoch": 0.9909893674535952, + "grad_norm": 0.40098169445991516, + "learning_rate": 9.708991063512213e-06, + "loss": 0.4658, + "step": 1833 + }, + { + "epoch": 0.9915300054063795, + "grad_norm": 0.3755854368209839, + "learning_rate": 9.708356030368091e-06, + "loss": 0.4603, + "step": 1834 + }, + { + "epoch": 0.9920706433591638, + "grad_norm": 0.4370364844799042, + "learning_rate": 9.707720325915105e-06, + "loss": 0.4402, + "step": 1835 + }, + { + "epoch": 0.9926112813119481, + "grad_norm": 0.3481072187423706, + "learning_rate": 9.707083950243889e-06, + "loss": 0.4588, + "step": 1836 + }, + { + "epoch": 0.9931519192647323, + "grad_norm": 0.3732248544692993, + "learning_rate": 9.706446903445179e-06, + "loss": 0.4536, + "step": 1837 + }, + { + "epoch": 0.9936925572175167, + "grad_norm": 0.3808124363422394, + "learning_rate": 9.705809185609802e-06, + "loss": 0.4734, + "step": 1838 + }, + { + "epoch": 0.994233195170301, + "grad_norm": 0.35636451840400696, + "learning_rate": 9.705170796828684e-06, + "loss": 0.4513, + "step": 1839 + }, + { + "epoch": 0.9947738331230852, + "grad_norm": 0.37091416120529175, + "learning_rate": 9.704531737192847e-06, + "loss": 0.4496, + "step": 1840 + }, + { + "epoch": 0.9953144710758696, + "grad_norm": 0.4094441831111908, + "learning_rate": 9.703892006793401e-06, + "loss": 0.5067, + "step": 1841 + }, + { + "epoch": 0.9958551090286538, + "grad_norm": 0.3345203101634979, + "learning_rate": 9.703251605721565e-06, + "loss": 0.4448, + "step": 1842 + }, + { + "epoch": 0.9963957469814381, + "grad_norm": 0.44573473930358887, + "learning_rate": 9.702610534068639e-06, + "loss": 0.4547, + "step": 1843 + }, + { + "epoch": 0.9969363849342224, + "grad_norm": 0.3749711215496063, + "learning_rate": 9.701968791926031e-06, + "loss": 0.5019, + "step": 1844 + }, + { + "epoch": 0.9974770228870067, + "grad_norm": 0.3989967703819275, + "learning_rate": 9.701326379385238e-06, + "loss": 0.4534, + "step": 1845 + }, + { + "epoch": 0.9980176608397909, + "grad_norm": 0.44340914487838745, + "learning_rate": 9.700683296537855e-06, + "loss": 0.4914, + "step": 1846 + }, + { + "epoch": 0.9985582987925753, + "grad_norm": 0.3767038583755493, + "learning_rate": 9.700039543475569e-06, + "loss": 0.4627, + "step": 1847 + }, + { + "epoch": 0.9990989367453595, + "grad_norm": 0.43750816583633423, + "learning_rate": 9.699395120290166e-06, + "loss": 0.4709, + "step": 1848 + }, + { + "epoch": 0.9996395746981438, + "grad_norm": 0.3936137557029724, + "learning_rate": 9.698750027073529e-06, + "loss": 0.4575, + "step": 1849 + }, + { + "epoch": 1.000180212650928, + "grad_norm": 0.521910548210144, + "learning_rate": 9.698104263917632e-06, + "loss": 0.6287, + "step": 1850 + }, + { + "epoch": 1.0007208506037124, + "grad_norm": 0.353114515542984, + "learning_rate": 9.697457830914546e-06, + "loss": 0.4359, + "step": 1851 + }, + { + "epoch": 1.0012614885564968, + "grad_norm": 0.3728412985801697, + "learning_rate": 9.696810728156441e-06, + "loss": 0.4542, + "step": 1852 + }, + { + "epoch": 1.001802126509281, + "grad_norm": 0.3588818907737732, + "learning_rate": 9.696162955735577e-06, + "loss": 0.4268, + "step": 1853 + }, + { + "epoch": 1.0023427644620653, + "grad_norm": 0.37867024540901184, + "learning_rate": 9.695514513744314e-06, + "loss": 0.4717, + "step": 1854 + }, + { + "epoch": 1.0028834024148494, + "grad_norm": 0.386565625667572, + "learning_rate": 9.694865402275105e-06, + "loss": 0.4342, + "step": 1855 + }, + { + "epoch": 1.0034240403676338, + "grad_norm": 0.3667563796043396, + "learning_rate": 9.6942156214205e-06, + "loss": 0.4577, + "step": 1856 + }, + { + "epoch": 1.0039646783204181, + "grad_norm": 0.3618047535419464, + "learning_rate": 9.693565171273143e-06, + "loss": 0.4196, + "step": 1857 + }, + { + "epoch": 1.0045053162732023, + "grad_norm": 0.3565126955509186, + "learning_rate": 9.692914051925773e-06, + "loss": 0.465, + "step": 1858 + }, + { + "epoch": 1.0050459542259866, + "grad_norm": 0.3447374105453491, + "learning_rate": 9.692262263471226e-06, + "loss": 0.4333, + "step": 1859 + }, + { + "epoch": 1.005586592178771, + "grad_norm": 0.35998278856277466, + "learning_rate": 9.691609806002433e-06, + "loss": 0.4527, + "step": 1860 + }, + { + "epoch": 1.0061272301315551, + "grad_norm": 0.3468131721019745, + "learning_rate": 9.690956679612422e-06, + "loss": 0.4629, + "step": 1861 + }, + { + "epoch": 1.0066678680843395, + "grad_norm": 0.36353087425231934, + "learning_rate": 9.690302884394312e-06, + "loss": 0.4308, + "step": 1862 + }, + { + "epoch": 1.0072085060371239, + "grad_norm": 0.35647186636924744, + "learning_rate": 9.68964842044132e-06, + "loss": 0.4528, + "step": 1863 + }, + { + "epoch": 1.007749143989908, + "grad_norm": 0.3625536561012268, + "learning_rate": 9.68899328784676e-06, + "loss": 0.4397, + "step": 1864 + }, + { + "epoch": 1.0082897819426924, + "grad_norm": 0.3450867831707001, + "learning_rate": 9.688337486704038e-06, + "loss": 0.454, + "step": 1865 + }, + { + "epoch": 1.0088304198954767, + "grad_norm": 0.33882075548171997, + "learning_rate": 9.687681017106659e-06, + "loss": 0.429, + "step": 1866 + }, + { + "epoch": 1.0093710578482609, + "grad_norm": 0.35694748163223267, + "learning_rate": 9.687023879148217e-06, + "loss": 0.4633, + "step": 1867 + }, + { + "epoch": 1.0099116958010452, + "grad_norm": 0.3318149447441101, + "learning_rate": 9.686366072922411e-06, + "loss": 0.4278, + "step": 1868 + }, + { + "epoch": 1.0104523337538296, + "grad_norm": 0.38252225518226624, + "learning_rate": 9.685707598523027e-06, + "loss": 0.4348, + "step": 1869 + }, + { + "epoch": 1.0109929717066137, + "grad_norm": 0.3737165629863739, + "learning_rate": 9.68504845604395e-06, + "loss": 0.4475, + "step": 1870 + }, + { + "epoch": 1.011533609659398, + "grad_norm": 0.35093238949775696, + "learning_rate": 9.684388645579156e-06, + "loss": 0.4219, + "step": 1871 + }, + { + "epoch": 1.0120742476121825, + "grad_norm": 0.42347413301467896, + "learning_rate": 9.683728167222723e-06, + "loss": 0.4769, + "step": 1872 + }, + { + "epoch": 1.0126148855649666, + "grad_norm": 0.36415350437164307, + "learning_rate": 9.683067021068822e-06, + "loss": 0.4377, + "step": 1873 + }, + { + "epoch": 1.013155523517751, + "grad_norm": 0.43035557866096497, + "learning_rate": 9.682405207211714e-06, + "loss": 0.4721, + "step": 1874 + }, + { + "epoch": 1.0136961614705353, + "grad_norm": 0.35217908024787903, + "learning_rate": 9.681742725745762e-06, + "loss": 0.4266, + "step": 1875 + }, + { + "epoch": 1.0142367994233195, + "grad_norm": 0.36210188269615173, + "learning_rate": 9.681079576765422e-06, + "loss": 0.4207, + "step": 1876 + }, + { + "epoch": 1.0147774373761038, + "grad_norm": 0.35266774892807007, + "learning_rate": 9.680415760365242e-06, + "loss": 0.4016, + "step": 1877 + }, + { + "epoch": 1.0153180753288882, + "grad_norm": 0.3950832486152649, + "learning_rate": 9.679751276639869e-06, + "loss": 0.4759, + "step": 1878 + }, + { + "epoch": 1.0158587132816723, + "grad_norm": 0.39965641498565674, + "learning_rate": 9.679086125684043e-06, + "loss": 0.5019, + "step": 1879 + }, + { + "epoch": 1.0163993512344567, + "grad_norm": 0.36587411165237427, + "learning_rate": 9.678420307592602e-06, + "loss": 0.4369, + "step": 1880 + }, + { + "epoch": 1.016939989187241, + "grad_norm": 0.3512573540210724, + "learning_rate": 9.677753822460478e-06, + "loss": 0.4442, + "step": 1881 + }, + { + "epoch": 1.0174806271400252, + "grad_norm": 0.36345434188842773, + "learning_rate": 9.677086670382692e-06, + "loss": 0.4394, + "step": 1882 + }, + { + "epoch": 1.0180212650928095, + "grad_norm": 0.3650181293487549, + "learning_rate": 9.67641885145437e-06, + "loss": 0.4639, + "step": 1883 + }, + { + "epoch": 1.018561903045594, + "grad_norm": 0.40704137086868286, + "learning_rate": 9.675750365770727e-06, + "loss": 0.4686, + "step": 1884 + }, + { + "epoch": 1.019102540998378, + "grad_norm": 0.34051263332366943, + "learning_rate": 9.675081213427076e-06, + "loss": 0.412, + "step": 1885 + }, + { + "epoch": 1.0196431789511624, + "grad_norm": 0.3666491210460663, + "learning_rate": 9.67441139451882e-06, + "loss": 0.4777, + "step": 1886 + }, + { + "epoch": 1.0201838169039465, + "grad_norm": 0.35916945338249207, + "learning_rate": 9.673740909141463e-06, + "loss": 0.445, + "step": 1887 + }, + { + "epoch": 1.020724454856731, + "grad_norm": 0.3610200881958008, + "learning_rate": 9.6730697573906e-06, + "loss": 0.4602, + "step": 1888 + }, + { + "epoch": 1.0212650928095153, + "grad_norm": 0.36404091119766235, + "learning_rate": 9.672397939361926e-06, + "loss": 0.454, + "step": 1889 + }, + { + "epoch": 1.0218057307622994, + "grad_norm": 0.39531588554382324, + "learning_rate": 9.671725455151226e-06, + "loss": 0.4119, + "step": 1890 + }, + { + "epoch": 1.0223463687150838, + "grad_norm": 0.39778226613998413, + "learning_rate": 9.671052304854378e-06, + "loss": 0.4638, + "step": 1891 + }, + { + "epoch": 1.0228870066678681, + "grad_norm": 0.40537434816360474, + "learning_rate": 9.670378488567365e-06, + "loss": 0.4265, + "step": 1892 + }, + { + "epoch": 1.0234276446206523, + "grad_norm": 0.3681070804595947, + "learning_rate": 9.669704006386252e-06, + "loss": 0.4466, + "step": 1893 + }, + { + "epoch": 1.0239682825734366, + "grad_norm": 0.37263795733451843, + "learning_rate": 9.66902885840721e-06, + "loss": 0.4359, + "step": 1894 + }, + { + "epoch": 1.024508920526221, + "grad_norm": 0.3602556884288788, + "learning_rate": 9.668353044726498e-06, + "loss": 0.4045, + "step": 1895 + }, + { + "epoch": 1.0250495584790051, + "grad_norm": 0.3647526502609253, + "learning_rate": 9.667676565440474e-06, + "loss": 0.4777, + "step": 1896 + }, + { + "epoch": 1.0255901964317895, + "grad_norm": 0.3482561707496643, + "learning_rate": 9.666999420645589e-06, + "loss": 0.4224, + "step": 1897 + }, + { + "epoch": 1.0261308343845739, + "grad_norm": 0.363437682390213, + "learning_rate": 9.666321610438386e-06, + "loss": 0.4548, + "step": 1898 + }, + { + "epoch": 1.026671472337358, + "grad_norm": 0.3769209682941437, + "learning_rate": 9.66564313491551e-06, + "loss": 0.4224, + "step": 1899 + }, + { + "epoch": 1.0272121102901424, + "grad_norm": 0.38231518864631653, + "learning_rate": 9.664963994173695e-06, + "loss": 0.4719, + "step": 1900 + }, + { + "epoch": 1.0277527482429267, + "grad_norm": 0.3820781409740448, + "learning_rate": 9.66428418830977e-06, + "loss": 0.4256, + "step": 1901 + }, + { + "epoch": 1.0282933861957109, + "grad_norm": 0.40323346853256226, + "learning_rate": 9.663603717420667e-06, + "loss": 0.4791, + "step": 1902 + }, + { + "epoch": 1.0288340241484952, + "grad_norm": 0.34871387481689453, + "learning_rate": 9.662922581603398e-06, + "loss": 0.4104, + "step": 1903 + }, + { + "epoch": 1.0293746621012796, + "grad_norm": 0.47941431403160095, + "learning_rate": 9.662240780955082e-06, + "loss": 0.4853, + "step": 1904 + }, + { + "epoch": 1.0299153000540637, + "grad_norm": 0.3837866485118866, + "learning_rate": 9.66155831557293e-06, + "loss": 0.4254, + "step": 1905 + }, + { + "epoch": 1.030455938006848, + "grad_norm": 0.43927279114723206, + "learning_rate": 9.660875185554244e-06, + "loss": 0.4251, + "step": 1906 + }, + { + "epoch": 1.0309965759596325, + "grad_norm": 0.37006092071533203, + "learning_rate": 9.660191390996426e-06, + "loss": 0.4738, + "step": 1907 + }, + { + "epoch": 1.0315372139124166, + "grad_norm": 0.48216795921325684, + "learning_rate": 9.659506931996967e-06, + "loss": 0.452, + "step": 1908 + }, + { + "epoch": 1.032077851865201, + "grad_norm": 0.32206961512565613, + "learning_rate": 9.65882180865346e-06, + "loss": 0.4309, + "step": 1909 + }, + { + "epoch": 1.0326184898179853, + "grad_norm": 0.4409984350204468, + "learning_rate": 9.658136021063585e-06, + "loss": 0.4706, + "step": 1910 + }, + { + "epoch": 1.0331591277707695, + "grad_norm": 0.4066472053527832, + "learning_rate": 9.65744956932512e-06, + "loss": 0.4532, + "step": 1911 + }, + { + "epoch": 1.0336997657235538, + "grad_norm": 0.36346372961997986, + "learning_rate": 9.656762453535945e-06, + "loss": 0.4866, + "step": 1912 + }, + { + "epoch": 1.0342404036763382, + "grad_norm": 0.3586696982383728, + "learning_rate": 9.656074673794018e-06, + "loss": 0.4415, + "step": 1913 + }, + { + "epoch": 1.0347810416291223, + "grad_norm": 0.3591172397136688, + "learning_rate": 9.655386230197408e-06, + "loss": 0.4435, + "step": 1914 + }, + { + "epoch": 1.0353216795819067, + "grad_norm": 0.4151376485824585, + "learning_rate": 9.65469712284427e-06, + "loss": 0.4366, + "step": 1915 + }, + { + "epoch": 1.0358623175346908, + "grad_norm": 0.4572712481021881, + "learning_rate": 9.654007351832856e-06, + "loss": 0.494, + "step": 1916 + }, + { + "epoch": 1.0364029554874752, + "grad_norm": 0.40585753321647644, + "learning_rate": 9.653316917261511e-06, + "loss": 0.4527, + "step": 1917 + }, + { + "epoch": 1.0369435934402595, + "grad_norm": 0.3337477147579193, + "learning_rate": 9.652625819228679e-06, + "loss": 0.4078, + "step": 1918 + }, + { + "epoch": 1.0374842313930437, + "grad_norm": 0.5288644433021545, + "learning_rate": 9.65193405783289e-06, + "loss": 0.518, + "step": 1919 + }, + { + "epoch": 1.038024869345828, + "grad_norm": 0.38844338059425354, + "learning_rate": 9.651241633172782e-06, + "loss": 0.4227, + "step": 1920 + }, + { + "epoch": 1.0385655072986124, + "grad_norm": 0.45273417234420776, + "learning_rate": 9.650548545347072e-06, + "loss": 0.4283, + "step": 1921 + }, + { + "epoch": 1.0391061452513966, + "grad_norm": 0.3740539848804474, + "learning_rate": 9.649854794454583e-06, + "loss": 0.4156, + "step": 1922 + }, + { + "epoch": 1.039646783204181, + "grad_norm": 0.3912931978702545, + "learning_rate": 9.649160380594227e-06, + "loss": 0.4163, + "step": 1923 + }, + { + "epoch": 1.0401874211569653, + "grad_norm": 0.3708667457103729, + "learning_rate": 9.648465303865016e-06, + "loss": 0.4039, + "step": 1924 + }, + { + "epoch": 1.0407280591097494, + "grad_norm": 0.4059867560863495, + "learning_rate": 9.647769564366048e-06, + "loss": 0.4632, + "step": 1925 + }, + { + "epoch": 1.0412686970625338, + "grad_norm": 0.37832167744636536, + "learning_rate": 9.647073162196524e-06, + "loss": 0.4548, + "step": 1926 + }, + { + "epoch": 1.0418093350153181, + "grad_norm": 0.40500980615615845, + "learning_rate": 9.646376097455732e-06, + "loss": 0.4753, + "step": 1927 + }, + { + "epoch": 1.0423499729681023, + "grad_norm": 0.38375428318977356, + "learning_rate": 9.645678370243063e-06, + "loss": 0.4413, + "step": 1928 + }, + { + "epoch": 1.0428906109208866, + "grad_norm": 0.3782906234264374, + "learning_rate": 9.644979980657993e-06, + "loss": 0.4726, + "step": 1929 + }, + { + "epoch": 1.043431248873671, + "grad_norm": 0.4120505452156067, + "learning_rate": 9.644280928800101e-06, + "loss": 0.4907, + "step": 1930 + }, + { + "epoch": 1.0439718868264551, + "grad_norm": 0.35727861523628235, + "learning_rate": 9.643581214769053e-06, + "loss": 0.3948, + "step": 1931 + }, + { + "epoch": 1.0445125247792395, + "grad_norm": 0.42740848660469055, + "learning_rate": 9.642880838664617e-06, + "loss": 0.4586, + "step": 1932 + }, + { + "epoch": 1.0450531627320239, + "grad_norm": 0.3294697105884552, + "learning_rate": 9.642179800586648e-06, + "loss": 0.4003, + "step": 1933 + }, + { + "epoch": 1.045593800684808, + "grad_norm": 0.40418773889541626, + "learning_rate": 9.6414781006351e-06, + "loss": 0.4738, + "step": 1934 + }, + { + "epoch": 1.0461344386375924, + "grad_norm": 0.35638585686683655, + "learning_rate": 9.640775738910019e-06, + "loss": 0.4416, + "step": 1935 + }, + { + "epoch": 1.0466750765903767, + "grad_norm": 0.37873512506484985, + "learning_rate": 9.640072715511547e-06, + "loss": 0.4497, + "step": 1936 + }, + { + "epoch": 1.0472157145431609, + "grad_norm": 0.40080496668815613, + "learning_rate": 9.639369030539922e-06, + "loss": 0.4547, + "step": 1937 + }, + { + "epoch": 1.0477563524959452, + "grad_norm": 0.3935638964176178, + "learning_rate": 9.638664684095472e-06, + "loss": 0.4574, + "step": 1938 + }, + { + "epoch": 1.0482969904487296, + "grad_norm": 0.3729800283908844, + "learning_rate": 9.637959676278621e-06, + "loss": 0.4383, + "step": 1939 + }, + { + "epoch": 1.0488376284015137, + "grad_norm": 0.3998558819293976, + "learning_rate": 9.63725400718989e-06, + "loss": 0.4737, + "step": 1940 + }, + { + "epoch": 1.049378266354298, + "grad_norm": 0.3633527159690857, + "learning_rate": 9.636547676929889e-06, + "loss": 0.3989, + "step": 1941 + }, + { + "epoch": 1.0499189043070825, + "grad_norm": 0.37377429008483887, + "learning_rate": 9.635840685599328e-06, + "loss": 0.4225, + "step": 1942 + }, + { + "epoch": 1.0504595422598666, + "grad_norm": 0.3647961914539337, + "learning_rate": 9.635133033299005e-06, + "loss": 0.451, + "step": 1943 + }, + { + "epoch": 1.051000180212651, + "grad_norm": 0.4128117263317108, + "learning_rate": 9.63442472012982e-06, + "loss": 0.4678, + "step": 1944 + }, + { + "epoch": 1.0515408181654353, + "grad_norm": 0.35981616377830505, + "learning_rate": 9.633715746192762e-06, + "loss": 0.439, + "step": 1945 + }, + { + "epoch": 1.0520814561182195, + "grad_norm": 0.3871247470378876, + "learning_rate": 9.633006111588912e-06, + "loss": 0.4444, + "step": 1946 + }, + { + "epoch": 1.0526220940710038, + "grad_norm": 0.35950541496276855, + "learning_rate": 9.632295816419453e-06, + "loss": 0.4392, + "step": 1947 + }, + { + "epoch": 1.0531627320237882, + "grad_norm": 0.4200381338596344, + "learning_rate": 9.631584860785654e-06, + "loss": 0.4601, + "step": 1948 + }, + { + "epoch": 1.0537033699765723, + "grad_norm": 0.3932918608188629, + "learning_rate": 9.630873244788884e-06, + "loss": 0.4588, + "step": 1949 + }, + { + "epoch": 1.0542440079293567, + "grad_norm": 0.3635420799255371, + "learning_rate": 9.630160968530601e-06, + "loss": 0.4535, + "step": 1950 + }, + { + "epoch": 1.0547846458821408, + "grad_norm": 0.35260698199272156, + "learning_rate": 9.629448032112365e-06, + "loss": 0.4062, + "step": 1951 + }, + { + "epoch": 1.0553252838349252, + "grad_norm": 0.41558098793029785, + "learning_rate": 9.62873443563582e-06, + "loss": 0.4876, + "step": 1952 + }, + { + "epoch": 1.0558659217877095, + "grad_norm": 0.31702765822410583, + "learning_rate": 9.628020179202713e-06, + "loss": 0.4188, + "step": 1953 + }, + { + "epoch": 1.0564065597404937, + "grad_norm": 0.407604843378067, + "learning_rate": 9.62730526291488e-06, + "loss": 0.4382, + "step": 1954 + }, + { + "epoch": 1.056947197693278, + "grad_norm": 0.358194500207901, + "learning_rate": 9.626589686874252e-06, + "loss": 0.4139, + "step": 1955 + }, + { + "epoch": 1.0574878356460624, + "grad_norm": 0.4273225963115692, + "learning_rate": 9.625873451182855e-06, + "loss": 0.4718, + "step": 1956 + }, + { + "epoch": 1.0580284735988466, + "grad_norm": 0.3338474929332733, + "learning_rate": 9.62515655594281e-06, + "loss": 0.3965, + "step": 1957 + }, + { + "epoch": 1.058569111551631, + "grad_norm": 0.45933595299720764, + "learning_rate": 9.624439001256327e-06, + "loss": 0.4446, + "step": 1958 + }, + { + "epoch": 1.0591097495044153, + "grad_norm": 0.35661551356315613, + "learning_rate": 9.623720787225716e-06, + "loss": 0.4321, + "step": 1959 + }, + { + "epoch": 1.0596503874571994, + "grad_norm": 0.422316312789917, + "learning_rate": 9.62300191395338e-06, + "loss": 0.428, + "step": 1960 + }, + { + "epoch": 1.0601910254099838, + "grad_norm": 0.39070868492126465, + "learning_rate": 9.622282381541812e-06, + "loss": 0.4235, + "step": 1961 + }, + { + "epoch": 1.0607316633627681, + "grad_norm": 0.46358242630958557, + "learning_rate": 9.621562190093603e-06, + "loss": 0.4778, + "step": 1962 + }, + { + "epoch": 1.0612723013155523, + "grad_norm": 0.3455776274204254, + "learning_rate": 9.620841339711437e-06, + "loss": 0.4426, + "step": 1963 + }, + { + "epoch": 1.0618129392683366, + "grad_norm": 0.3895058333873749, + "learning_rate": 9.62011983049809e-06, + "loss": 0.4379, + "step": 1964 + }, + { + "epoch": 1.062353577221121, + "grad_norm": 0.42753368616104126, + "learning_rate": 9.619397662556434e-06, + "loss": 0.4499, + "step": 1965 + }, + { + "epoch": 1.0628942151739051, + "grad_norm": 0.3436278998851776, + "learning_rate": 9.618674835989437e-06, + "loss": 0.4534, + "step": 1966 + }, + { + "epoch": 1.0634348531266895, + "grad_norm": 0.4359297454357147, + "learning_rate": 9.617951350900154e-06, + "loss": 0.4324, + "step": 1967 + }, + { + "epoch": 1.0639754910794739, + "grad_norm": 0.43395212292671204, + "learning_rate": 9.61722720739174e-06, + "loss": 0.5134, + "step": 1968 + }, + { + "epoch": 1.064516129032258, + "grad_norm": 0.35746920108795166, + "learning_rate": 9.616502405567445e-06, + "loss": 0.4204, + "step": 1969 + }, + { + "epoch": 1.0650567669850424, + "grad_norm": 0.4134131968021393, + "learning_rate": 9.615776945530603e-06, + "loss": 0.4691, + "step": 1970 + }, + { + "epoch": 1.0655974049378267, + "grad_norm": 0.37661606073379517, + "learning_rate": 9.615050827384656e-06, + "loss": 0.4183, + "step": 1971 + }, + { + "epoch": 1.0661380428906109, + "grad_norm": 0.3690008521080017, + "learning_rate": 9.614324051233131e-06, + "loss": 0.4714, + "step": 1972 + }, + { + "epoch": 1.0666786808433952, + "grad_norm": 0.4408411681652069, + "learning_rate": 9.613596617179645e-06, + "loss": 0.4368, + "step": 1973 + }, + { + "epoch": 1.0672193187961796, + "grad_norm": 0.3526779115200043, + "learning_rate": 9.612868525327921e-06, + "loss": 0.4193, + "step": 1974 + }, + { + "epoch": 1.0677599567489637, + "grad_norm": 0.40113815665245056, + "learning_rate": 9.612139775781766e-06, + "loss": 0.4518, + "step": 1975 + }, + { + "epoch": 1.068300594701748, + "grad_norm": 0.4572330117225647, + "learning_rate": 9.611410368645085e-06, + "loss": 0.4592, + "step": 1976 + }, + { + "epoch": 1.0688412326545325, + "grad_norm": 0.3182274401187897, + "learning_rate": 9.610680304021873e-06, + "loss": 0.4426, + "step": 1977 + }, + { + "epoch": 1.0693818706073166, + "grad_norm": 0.37616831064224243, + "learning_rate": 9.609949582016223e-06, + "loss": 0.415, + "step": 1978 + }, + { + "epoch": 1.069922508560101, + "grad_norm": 0.40873757004737854, + "learning_rate": 9.609218202732322e-06, + "loss": 0.4509, + "step": 1979 + }, + { + "epoch": 1.070463146512885, + "grad_norm": 0.3298252522945404, + "learning_rate": 9.608486166274444e-06, + "loss": 0.4476, + "step": 1980 + }, + { + "epoch": 1.0710037844656695, + "grad_norm": 0.4793800115585327, + "learning_rate": 9.607753472746967e-06, + "loss": 0.4329, + "step": 1981 + }, + { + "epoch": 1.0715444224184538, + "grad_norm": 0.3937591016292572, + "learning_rate": 9.607020122254352e-06, + "loss": 0.423, + "step": 1982 + }, + { + "epoch": 1.072085060371238, + "grad_norm": 0.3705812692642212, + "learning_rate": 9.60628611490116e-06, + "loss": 0.4036, + "step": 1983 + }, + { + "epoch": 1.0726256983240223, + "grad_norm": 0.4920952618122101, + "learning_rate": 9.605551450792048e-06, + "loss": 0.4605, + "step": 1984 + }, + { + "epoch": 1.0731663362768067, + "grad_norm": 0.35678839683532715, + "learning_rate": 9.60481613003176e-06, + "loss": 0.4243, + "step": 1985 + }, + { + "epoch": 1.0737069742295908, + "grad_norm": 0.41355428099632263, + "learning_rate": 9.604080152725137e-06, + "loss": 0.4419, + "step": 1986 + }, + { + "epoch": 1.0742476121823752, + "grad_norm": 0.40846797823905945, + "learning_rate": 9.603343518977113e-06, + "loss": 0.4522, + "step": 1987 + }, + { + "epoch": 1.0747882501351596, + "grad_norm": 0.4189375042915344, + "learning_rate": 9.602606228892717e-06, + "loss": 0.5046, + "step": 1988 + }, + { + "epoch": 1.0753288880879437, + "grad_norm": 0.37765973806381226, + "learning_rate": 9.601868282577069e-06, + "loss": 0.4458, + "step": 1989 + }, + { + "epoch": 1.075869526040728, + "grad_norm": 0.37527140974998474, + "learning_rate": 9.601129680135386e-06, + "loss": 0.4313, + "step": 1990 + }, + { + "epoch": 1.0764101639935124, + "grad_norm": 0.34690606594085693, + "learning_rate": 9.600390421672976e-06, + "loss": 0.4276, + "step": 1991 + }, + { + "epoch": 1.0769508019462966, + "grad_norm": 0.42187508940696716, + "learning_rate": 9.59965050729524e-06, + "loss": 0.4851, + "step": 1992 + }, + { + "epoch": 1.077491439899081, + "grad_norm": 0.3796841502189636, + "learning_rate": 9.598909937107674e-06, + "loss": 0.38, + "step": 1993 + }, + { + "epoch": 1.0780320778518653, + "grad_norm": 0.386795312166214, + "learning_rate": 9.598168711215868e-06, + "loss": 0.4132, + "step": 1994 + }, + { + "epoch": 1.0785727158046494, + "grad_norm": 0.3887866139411926, + "learning_rate": 9.597426829725504e-06, + "loss": 0.4518, + "step": 1995 + }, + { + "epoch": 1.0791133537574338, + "grad_norm": 0.43749120831489563, + "learning_rate": 9.59668429274236e-06, + "loss": 0.4726, + "step": 1996 + }, + { + "epoch": 1.0796539917102181, + "grad_norm": 0.40146055817604065, + "learning_rate": 9.595941100372301e-06, + "loss": 0.454, + "step": 1997 + }, + { + "epoch": 1.0801946296630023, + "grad_norm": 0.35357674956321716, + "learning_rate": 9.595197252721293e-06, + "loss": 0.4288, + "step": 1998 + }, + { + "epoch": 1.0807352676157866, + "grad_norm": 0.34494102001190186, + "learning_rate": 9.594452749895395e-06, + "loss": 0.4223, + "step": 1999 + }, + { + "epoch": 1.081275905568571, + "grad_norm": 0.42233067750930786, + "learning_rate": 9.593707592000751e-06, + "loss": 0.4356, + "step": 2000 + }, + { + "epoch": 1.0818165435213551, + "grad_norm": 0.4014831483364105, + "learning_rate": 9.59296177914361e-06, + "loss": 0.45, + "step": 2001 + }, + { + "epoch": 1.0823571814741395, + "grad_norm": 0.36923670768737793, + "learning_rate": 9.592215311430305e-06, + "loss": 0.4123, + "step": 2002 + }, + { + "epoch": 1.0828978194269239, + "grad_norm": 0.4248311519622803, + "learning_rate": 9.591468188967267e-06, + "loss": 0.4579, + "step": 2003 + }, + { + "epoch": 1.083438457379708, + "grad_norm": 0.39018672704696655, + "learning_rate": 9.590720411861022e-06, + "loss": 0.4405, + "step": 2004 + }, + { + "epoch": 1.0839790953324924, + "grad_norm": 0.3719469904899597, + "learning_rate": 9.58997198021818e-06, + "loss": 0.4133, + "step": 2005 + }, + { + "epoch": 1.0845197332852767, + "grad_norm": 0.49077853560447693, + "learning_rate": 9.589222894145458e-06, + "loss": 0.4758, + "step": 2006 + }, + { + "epoch": 1.0850603712380609, + "grad_norm": 0.3614523708820343, + "learning_rate": 9.588473153749656e-06, + "loss": 0.4529, + "step": 2007 + }, + { + "epoch": 1.0856010091908452, + "grad_norm": 0.36509764194488525, + "learning_rate": 9.58772275913767e-06, + "loss": 0.3923, + "step": 2008 + }, + { + "epoch": 1.0861416471436294, + "grad_norm": 0.48173028230667114, + "learning_rate": 9.586971710416493e-06, + "loss": 0.458, + "step": 2009 + }, + { + "epoch": 1.0866822850964137, + "grad_norm": 0.4093618094921112, + "learning_rate": 9.586220007693205e-06, + "loss": 0.4655, + "step": 2010 + }, + { + "epoch": 1.087222923049198, + "grad_norm": 0.4406038224697113, + "learning_rate": 9.585467651074983e-06, + "loss": 0.4169, + "step": 2011 + }, + { + "epoch": 1.0877635610019822, + "grad_norm": 0.5025647282600403, + "learning_rate": 9.584714640669099e-06, + "loss": 0.4674, + "step": 2012 + }, + { + "epoch": 1.0883041989547666, + "grad_norm": 0.4379287660121918, + "learning_rate": 9.583960976582914e-06, + "loss": 0.4344, + "step": 2013 + }, + { + "epoch": 1.088844836907551, + "grad_norm": 0.49229374527931213, + "learning_rate": 9.583206658923882e-06, + "loss": 0.4062, + "step": 2014 + }, + { + "epoch": 1.089385474860335, + "grad_norm": 0.44954460859298706, + "learning_rate": 9.582451687799557e-06, + "loss": 0.4786, + "step": 2015 + }, + { + "epoch": 1.0899261128131195, + "grad_norm": 0.4374828040599823, + "learning_rate": 9.581696063317577e-06, + "loss": 0.4025, + "step": 2016 + }, + { + "epoch": 1.0904667507659038, + "grad_norm": 0.5120025873184204, + "learning_rate": 9.58093978558568e-06, + "loss": 0.4771, + "step": 2017 + }, + { + "epoch": 1.091007388718688, + "grad_norm": 0.4352531433105469, + "learning_rate": 9.580182854711695e-06, + "loss": 0.4597, + "step": 2018 + }, + { + "epoch": 1.0915480266714723, + "grad_norm": 0.4131707549095154, + "learning_rate": 9.579425270803542e-06, + "loss": 0.4323, + "step": 2019 + }, + { + "epoch": 1.0920886646242567, + "grad_norm": 0.39147377014160156, + "learning_rate": 9.578667033969238e-06, + "loss": 0.4271, + "step": 2020 + }, + { + "epoch": 1.0926293025770408, + "grad_norm": 0.40212157368659973, + "learning_rate": 9.577908144316888e-06, + "loss": 0.4159, + "step": 2021 + }, + { + "epoch": 1.0931699405298252, + "grad_norm": 0.3936586380004883, + "learning_rate": 9.577148601954697e-06, + "loss": 0.4443, + "step": 2022 + }, + { + "epoch": 1.0937105784826096, + "grad_norm": 0.4311821758747101, + "learning_rate": 9.576388406990957e-06, + "loss": 0.4309, + "step": 2023 + }, + { + "epoch": 1.0942512164353937, + "grad_norm": 0.38196295499801636, + "learning_rate": 9.575627559534055e-06, + "loss": 0.4731, + "step": 2024 + }, + { + "epoch": 1.094791854388178, + "grad_norm": 0.41277268528938293, + "learning_rate": 9.574866059692471e-06, + "loss": 0.4391, + "step": 2025 + }, + { + "epoch": 1.0953324923409624, + "grad_norm": 0.4242735207080841, + "learning_rate": 9.57410390757478e-06, + "loss": 0.4473, + "step": 2026 + }, + { + "epoch": 1.0958731302937466, + "grad_norm": 0.4308188259601593, + "learning_rate": 9.573341103289646e-06, + "loss": 0.5019, + "step": 2027 + }, + { + "epoch": 1.096413768246531, + "grad_norm": 0.35142946243286133, + "learning_rate": 9.572577646945831e-06, + "loss": 0.4185, + "step": 2028 + }, + { + "epoch": 1.0969544061993153, + "grad_norm": 0.3706936240196228, + "learning_rate": 9.571813538652184e-06, + "loss": 0.4477, + "step": 2029 + }, + { + "epoch": 1.0974950441520994, + "grad_norm": 0.4424436092376709, + "learning_rate": 9.571048778517655e-06, + "loss": 0.465, + "step": 2030 + }, + { + "epoch": 1.0980356821048838, + "grad_norm": 0.3273000121116638, + "learning_rate": 9.570283366651277e-06, + "loss": 0.4133, + "step": 2031 + }, + { + "epoch": 1.0985763200576681, + "grad_norm": 0.41131043434143066, + "learning_rate": 9.56951730316218e-06, + "loss": 0.4554, + "step": 2032 + }, + { + "epoch": 1.0991169580104523, + "grad_norm": 0.3751986622810364, + "learning_rate": 9.568750588159596e-06, + "loss": 0.4196, + "step": 2033 + }, + { + "epoch": 1.0996575959632366, + "grad_norm": 0.4607866704463959, + "learning_rate": 9.567983221752835e-06, + "loss": 0.4272, + "step": 2034 + }, + { + "epoch": 1.100198233916021, + "grad_norm": 0.3737804889678955, + "learning_rate": 9.567215204051307e-06, + "loss": 0.4948, + "step": 2035 + }, + { + "epoch": 1.1007388718688051, + "grad_norm": 0.4024915099143982, + "learning_rate": 9.566446535164518e-06, + "loss": 0.4065, + "step": 2036 + }, + { + "epoch": 1.1012795098215895, + "grad_norm": 0.42979228496551514, + "learning_rate": 9.565677215202062e-06, + "loss": 0.4041, + "step": 2037 + }, + { + "epoch": 1.1018201477743736, + "grad_norm": 0.3591923117637634, + "learning_rate": 9.564907244273624e-06, + "loss": 0.4536, + "step": 2038 + }, + { + "epoch": 1.102360785727158, + "grad_norm": 0.4698256254196167, + "learning_rate": 9.564136622488991e-06, + "loss": 0.4662, + "step": 2039 + }, + { + "epoch": 1.1029014236799424, + "grad_norm": 0.36560025811195374, + "learning_rate": 9.563365349958032e-06, + "loss": 0.4361, + "step": 2040 + }, + { + "epoch": 1.1034420616327267, + "grad_norm": 0.4091884195804596, + "learning_rate": 9.562593426790715e-06, + "loss": 0.4253, + "step": 2041 + }, + { + "epoch": 1.1039826995855109, + "grad_norm": 0.3399011492729187, + "learning_rate": 9.5618208530971e-06, + "loss": 0.4345, + "step": 2042 + }, + { + "epoch": 1.1045233375382952, + "grad_norm": 0.4654679596424103, + "learning_rate": 9.561047628987338e-06, + "loss": 0.489, + "step": 2043 + }, + { + "epoch": 1.1050639754910794, + "grad_norm": 0.36112046241760254, + "learning_rate": 9.560273754571678e-06, + "loss": 0.481, + "step": 2044 + }, + { + "epoch": 1.1056046134438637, + "grad_norm": 0.36224114894866943, + "learning_rate": 9.55949922996045e-06, + "loss": 0.4154, + "step": 2045 + }, + { + "epoch": 1.106145251396648, + "grad_norm": 0.38548043370246887, + "learning_rate": 9.558724055264093e-06, + "loss": 0.4901, + "step": 2046 + }, + { + "epoch": 1.1066858893494322, + "grad_norm": 0.3203273117542267, + "learning_rate": 9.557948230593122e-06, + "loss": 0.4238, + "step": 2047 + }, + { + "epoch": 1.1072265273022166, + "grad_norm": 0.40722090005874634, + "learning_rate": 9.55717175605816e-06, + "loss": 0.4541, + "step": 2048 + }, + { + "epoch": 1.107767165255001, + "grad_norm": 0.4180580973625183, + "learning_rate": 9.556394631769907e-06, + "loss": 0.4814, + "step": 2049 + }, + { + "epoch": 1.108307803207785, + "grad_norm": 0.41281113028526306, + "learning_rate": 9.555616857839171e-06, + "loss": 0.4413, + "step": 2050 + }, + { + "epoch": 1.1088484411605695, + "grad_norm": 0.3930680453777313, + "learning_rate": 9.554838434376845e-06, + "loss": 0.4443, + "step": 2051 + }, + { + "epoch": 1.1093890791133538, + "grad_norm": 0.4233580231666565, + "learning_rate": 9.554059361493913e-06, + "loss": 0.4212, + "step": 2052 + }, + { + "epoch": 1.109929717066138, + "grad_norm": 0.3436315953731537, + "learning_rate": 9.553279639301452e-06, + "loss": 0.4226, + "step": 2053 + }, + { + "epoch": 1.1104703550189223, + "grad_norm": 0.381905734539032, + "learning_rate": 9.552499267910637e-06, + "loss": 0.4105, + "step": 2054 + }, + { + "epoch": 1.1110109929717067, + "grad_norm": 0.4006160497665405, + "learning_rate": 9.551718247432732e-06, + "loss": 0.4805, + "step": 2055 + }, + { + "epoch": 1.1115516309244908, + "grad_norm": 0.3256142735481262, + "learning_rate": 9.55093657797909e-06, + "loss": 0.4155, + "step": 2056 + }, + { + "epoch": 1.1120922688772752, + "grad_norm": 0.36843231320381165, + "learning_rate": 9.550154259661162e-06, + "loss": 0.4562, + "step": 2057 + }, + { + "epoch": 1.1126329068300596, + "grad_norm": 0.34505757689476013, + "learning_rate": 9.54937129259049e-06, + "loss": 0.4096, + "step": 2058 + }, + { + "epoch": 1.1131735447828437, + "grad_norm": 0.3614414632320404, + "learning_rate": 9.548587676878709e-06, + "loss": 0.452, + "step": 2059 + }, + { + "epoch": 1.113714182735628, + "grad_norm": 0.34557512402534485, + "learning_rate": 9.547803412637542e-06, + "loss": 0.4648, + "step": 2060 + }, + { + "epoch": 1.1142548206884124, + "grad_norm": 0.35479235649108887, + "learning_rate": 9.547018499978811e-06, + "loss": 0.4242, + "step": 2061 + }, + { + "epoch": 1.1147954586411966, + "grad_norm": 0.37343519926071167, + "learning_rate": 9.546232939014428e-06, + "loss": 0.4435, + "step": 2062 + }, + { + "epoch": 1.115336096593981, + "grad_norm": 0.3476622998714447, + "learning_rate": 9.545446729856394e-06, + "loss": 0.4179, + "step": 2063 + }, + { + "epoch": 1.1158767345467653, + "grad_norm": 0.3708413243293762, + "learning_rate": 9.544659872616806e-06, + "loss": 0.4859, + "step": 2064 + }, + { + "epoch": 1.1164173724995494, + "grad_norm": 0.31397050619125366, + "learning_rate": 9.543872367407854e-06, + "loss": 0.4047, + "step": 2065 + }, + { + "epoch": 1.1169580104523338, + "grad_norm": 0.3721030354499817, + "learning_rate": 9.54308421434182e-06, + "loss": 0.4644, + "step": 2066 + }, + { + "epoch": 1.117498648405118, + "grad_norm": 0.2971539795398712, + "learning_rate": 9.542295413531073e-06, + "loss": 0.3858, + "step": 2067 + }, + { + "epoch": 1.1180392863579023, + "grad_norm": 0.418592244386673, + "learning_rate": 9.541505965088083e-06, + "loss": 0.4695, + "step": 2068 + }, + { + "epoch": 1.1185799243106866, + "grad_norm": 0.36569511890411377, + "learning_rate": 9.540715869125407e-06, + "loss": 0.4404, + "step": 2069 + }, + { + "epoch": 1.119120562263471, + "grad_norm": 0.3991091251373291, + "learning_rate": 9.539925125755695e-06, + "loss": 0.4525, + "step": 2070 + }, + { + "epoch": 1.1196612002162551, + "grad_norm": 0.33858662843704224, + "learning_rate": 9.53913373509169e-06, + "loss": 0.386, + "step": 2071 + }, + { + "epoch": 1.1202018381690395, + "grad_norm": 0.4000351130962372, + "learning_rate": 9.538341697246228e-06, + "loss": 0.4707, + "step": 2072 + }, + { + "epoch": 1.1207424761218236, + "grad_norm": 0.34329837560653687, + "learning_rate": 9.537549012332234e-06, + "loss": 0.4293, + "step": 2073 + }, + { + "epoch": 1.121283114074608, + "grad_norm": 0.392547607421875, + "learning_rate": 9.536755680462729e-06, + "loss": 0.4555, + "step": 2074 + }, + { + "epoch": 1.1218237520273924, + "grad_norm": 0.38397929072380066, + "learning_rate": 9.535961701750825e-06, + "loss": 0.439, + "step": 2075 + }, + { + "epoch": 1.1223643899801765, + "grad_norm": 0.3610386848449707, + "learning_rate": 9.535167076309726e-06, + "loss": 0.4423, + "step": 2076 + }, + { + "epoch": 1.1229050279329609, + "grad_norm": 0.3509500324726105, + "learning_rate": 9.534371804252727e-06, + "loss": 0.4486, + "step": 2077 + }, + { + "epoch": 1.1234456658857452, + "grad_norm": 0.342852383852005, + "learning_rate": 9.53357588569322e-06, + "loss": 0.3888, + "step": 2078 + }, + { + "epoch": 1.1239863038385294, + "grad_norm": 0.44941166043281555, + "learning_rate": 9.53277932074468e-06, + "loss": 0.4635, + "step": 2079 + }, + { + "epoch": 1.1245269417913137, + "grad_norm": 0.3370332717895508, + "learning_rate": 9.531982109520686e-06, + "loss": 0.4314, + "step": 2080 + }, + { + "epoch": 1.125067579744098, + "grad_norm": 0.36963319778442383, + "learning_rate": 9.531184252134897e-06, + "loss": 0.449, + "step": 2081 + }, + { + "epoch": 1.1256082176968822, + "grad_norm": 0.3691161274909973, + "learning_rate": 9.530385748701074e-06, + "loss": 0.4106, + "step": 2082 + }, + { + "epoch": 1.1261488556496666, + "grad_norm": 0.3494841158390045, + "learning_rate": 9.529586599333066e-06, + "loss": 0.4389, + "step": 2083 + }, + { + "epoch": 1.126689493602451, + "grad_norm": 0.3670741319656372, + "learning_rate": 9.528786804144812e-06, + "loss": 0.4453, + "step": 2084 + }, + { + "epoch": 1.127230131555235, + "grad_norm": 0.3459216058254242, + "learning_rate": 9.527986363250348e-06, + "loss": 0.4509, + "step": 2085 + }, + { + "epoch": 1.1277707695080195, + "grad_norm": 0.38125917315483093, + "learning_rate": 9.527185276763797e-06, + "loss": 0.4244, + "step": 2086 + }, + { + "epoch": 1.1283114074608038, + "grad_norm": 0.3307487666606903, + "learning_rate": 9.526383544799378e-06, + "loss": 0.441, + "step": 2087 + }, + { + "epoch": 1.128852045413588, + "grad_norm": 0.38201895356178284, + "learning_rate": 9.525581167471399e-06, + "loss": 0.4615, + "step": 2088 + }, + { + "epoch": 1.1293926833663723, + "grad_norm": 0.3586467206478119, + "learning_rate": 9.524778144894265e-06, + "loss": 0.4586, + "step": 2089 + }, + { + "epoch": 1.1299333213191567, + "grad_norm": 0.35074567794799805, + "learning_rate": 9.523974477182465e-06, + "loss": 0.4258, + "step": 2090 + }, + { + "epoch": 1.1304739592719408, + "grad_norm": 0.41212746500968933, + "learning_rate": 9.523170164450586e-06, + "loss": 0.4729, + "step": 2091 + }, + { + "epoch": 1.1310145972247252, + "grad_norm": 0.34268930554389954, + "learning_rate": 9.522365206813307e-06, + "loss": 0.4335, + "step": 2092 + }, + { + "epoch": 1.1315552351775096, + "grad_norm": 0.40464693307876587, + "learning_rate": 9.521559604385396e-06, + "loss": 0.4127, + "step": 2093 + }, + { + "epoch": 1.1320958731302937, + "grad_norm": 0.40215039253234863, + "learning_rate": 9.520753357281716e-06, + "loss": 0.4875, + "step": 2094 + }, + { + "epoch": 1.132636511083078, + "grad_norm": 0.3894921541213989, + "learning_rate": 9.519946465617217e-06, + "loss": 0.4091, + "step": 2095 + }, + { + "epoch": 1.1331771490358622, + "grad_norm": 0.35323572158813477, + "learning_rate": 9.519138929506949e-06, + "loss": 0.4147, + "step": 2096 + }, + { + "epoch": 1.1337177869886466, + "grad_norm": 0.3723592758178711, + "learning_rate": 9.518330749066042e-06, + "loss": 0.4824, + "step": 2097 + }, + { + "epoch": 1.134258424941431, + "grad_norm": 0.3743079602718353, + "learning_rate": 9.517521924409731e-06, + "loss": 0.4413, + "step": 2098 + }, + { + "epoch": 1.1347990628942153, + "grad_norm": 0.39084959030151367, + "learning_rate": 9.516712455653337e-06, + "loss": 0.466, + "step": 2099 + }, + { + "epoch": 1.1353397008469994, + "grad_norm": 0.3701314926147461, + "learning_rate": 9.515902342912268e-06, + "loss": 0.4594, + "step": 2100 + }, + { + "epoch": 1.1358803387997838, + "grad_norm": 0.3685029149055481, + "learning_rate": 9.51509158630203e-06, + "loss": 0.4459, + "step": 2101 + }, + { + "epoch": 1.136420976752568, + "grad_norm": 0.38407519459724426, + "learning_rate": 9.514280185938223e-06, + "loss": 0.466, + "step": 2102 + }, + { + "epoch": 1.1369616147053523, + "grad_norm": 0.3733687102794647, + "learning_rate": 9.51346814193653e-06, + "loss": 0.4422, + "step": 2103 + }, + { + "epoch": 1.1375022526581366, + "grad_norm": 0.4305700957775116, + "learning_rate": 9.512655454412734e-06, + "loss": 0.4432, + "step": 2104 + }, + { + "epoch": 1.138042890610921, + "grad_norm": 0.3547512888908386, + "learning_rate": 9.511842123482703e-06, + "loss": 0.4748, + "step": 2105 + }, + { + "epoch": 1.1385835285637051, + "grad_norm": 0.3857305347919464, + "learning_rate": 9.511028149262405e-06, + "loss": 0.4531, + "step": 2106 + }, + { + "epoch": 1.1391241665164895, + "grad_norm": 0.3905250132083893, + "learning_rate": 9.510213531867891e-06, + "loss": 0.4121, + "step": 2107 + }, + { + "epoch": 1.1396648044692737, + "grad_norm": 0.4033034145832062, + "learning_rate": 9.509398271415308e-06, + "loss": 0.4839, + "step": 2108 + }, + { + "epoch": 1.140205442422058, + "grad_norm": 0.3537909984588623, + "learning_rate": 9.508582368020897e-06, + "loss": 0.4396, + "step": 2109 + }, + { + "epoch": 1.1407460803748424, + "grad_norm": 0.352022260427475, + "learning_rate": 9.507765821800988e-06, + "loss": 0.4231, + "step": 2110 + }, + { + "epoch": 1.1412867183276265, + "grad_norm": 0.4184352457523346, + "learning_rate": 9.506948632872e-06, + "loss": 0.4879, + "step": 2111 + }, + { + "epoch": 1.1418273562804109, + "grad_norm": 0.38211655616760254, + "learning_rate": 9.506130801350447e-06, + "loss": 0.4507, + "step": 2112 + }, + { + "epoch": 1.1423679942331952, + "grad_norm": 0.4082963168621063, + "learning_rate": 9.505312327352935e-06, + "loss": 0.431, + "step": 2113 + }, + { + "epoch": 1.1429086321859794, + "grad_norm": 0.32051509618759155, + "learning_rate": 9.504493210996159e-06, + "loss": 0.3991, + "step": 2114 + }, + { + "epoch": 1.1434492701387637, + "grad_norm": 0.3416892886161804, + "learning_rate": 9.503673452396909e-06, + "loss": 0.4656, + "step": 2115 + }, + { + "epoch": 1.143989908091548, + "grad_norm": 0.35561808943748474, + "learning_rate": 9.502853051672066e-06, + "loss": 0.4353, + "step": 2116 + }, + { + "epoch": 1.1445305460443322, + "grad_norm": 0.4583134949207306, + "learning_rate": 9.502032008938595e-06, + "loss": 0.4664, + "step": 2117 + }, + { + "epoch": 1.1450711839971166, + "grad_norm": 0.3298965394496918, + "learning_rate": 9.501210324313566e-06, + "loss": 0.4249, + "step": 2118 + }, + { + "epoch": 1.145611821949901, + "grad_norm": 0.42214271426200867, + "learning_rate": 9.500387997914127e-06, + "loss": 0.4299, + "step": 2119 + }, + { + "epoch": 1.146152459902685, + "grad_norm": 0.3864177465438843, + "learning_rate": 9.499565029857529e-06, + "loss": 0.4365, + "step": 2120 + }, + { + "epoch": 1.1466930978554695, + "grad_norm": 0.3692407011985779, + "learning_rate": 9.498741420261109e-06, + "loss": 0.4334, + "step": 2121 + }, + { + "epoch": 1.1472337358082538, + "grad_norm": 0.45088258385658264, + "learning_rate": 9.497917169242293e-06, + "loss": 0.4399, + "step": 2122 + }, + { + "epoch": 1.147774373761038, + "grad_norm": 0.42142733931541443, + "learning_rate": 9.4970922769186e-06, + "loss": 0.4337, + "step": 2123 + }, + { + "epoch": 1.1483150117138223, + "grad_norm": 0.37217411398887634, + "learning_rate": 9.496266743407646e-06, + "loss": 0.4719, + "step": 2124 + }, + { + "epoch": 1.1488556496666065, + "grad_norm": 0.3911516070365906, + "learning_rate": 9.49544056882713e-06, + "loss": 0.4479, + "step": 2125 + }, + { + "epoch": 1.1493962876193908, + "grad_norm": 0.44513747096061707, + "learning_rate": 9.49461375329485e-06, + "loss": 0.4667, + "step": 2126 + }, + { + "epoch": 1.1499369255721752, + "grad_norm": 0.33743032813072205, + "learning_rate": 9.493786296928691e-06, + "loss": 0.4108, + "step": 2127 + }, + { + "epoch": 1.1504775635249596, + "grad_norm": 0.36376333236694336, + "learning_rate": 9.492958199846628e-06, + "loss": 0.418, + "step": 2128 + }, + { + "epoch": 1.1510182014777437, + "grad_norm": 0.4417121410369873, + "learning_rate": 9.492129462166732e-06, + "loss": 0.4843, + "step": 2129 + }, + { + "epoch": 1.151558839430528, + "grad_norm": 0.3500215709209442, + "learning_rate": 9.491300084007162e-06, + "loss": 0.4525, + "step": 2130 + }, + { + "epoch": 1.1520994773833122, + "grad_norm": 0.40554940700531006, + "learning_rate": 9.490470065486168e-06, + "loss": 0.4391, + "step": 2131 + }, + { + "epoch": 1.1526401153360966, + "grad_norm": 0.4034478962421417, + "learning_rate": 9.489639406722095e-06, + "loss": 0.4953, + "step": 2132 + }, + { + "epoch": 1.153180753288881, + "grad_norm": 0.35386618971824646, + "learning_rate": 9.488808107833376e-06, + "loss": 0.4658, + "step": 2133 + }, + { + "epoch": 1.1537213912416653, + "grad_norm": 0.35145196318626404, + "learning_rate": 9.487976168938535e-06, + "loss": 0.4166, + "step": 2134 + }, + { + "epoch": 1.1542620291944494, + "grad_norm": 0.3624265789985657, + "learning_rate": 9.48714359015619e-06, + "loss": 0.4281, + "step": 2135 + }, + { + "epoch": 1.1548026671472338, + "grad_norm": 0.3582821190357208, + "learning_rate": 9.486310371605046e-06, + "loss": 0.4896, + "step": 2136 + }, + { + "epoch": 1.155343305100018, + "grad_norm": 0.36306634545326233, + "learning_rate": 9.485476513403905e-06, + "loss": 0.4282, + "step": 2137 + }, + { + "epoch": 1.1558839430528023, + "grad_norm": 0.39350152015686035, + "learning_rate": 9.484642015671655e-06, + "loss": 0.4531, + "step": 2138 + }, + { + "epoch": 1.1564245810055866, + "grad_norm": 0.34382736682891846, + "learning_rate": 9.483806878527277e-06, + "loss": 0.4247, + "step": 2139 + }, + { + "epoch": 1.1569652189583708, + "grad_norm": 0.3771136701107025, + "learning_rate": 9.482971102089845e-06, + "loss": 0.4273, + "step": 2140 + }, + { + "epoch": 1.1575058569111552, + "grad_norm": 0.430793434381485, + "learning_rate": 9.48213468647852e-06, + "loss": 0.4638, + "step": 2141 + }, + { + "epoch": 1.1580464948639395, + "grad_norm": 0.37223073840141296, + "learning_rate": 9.481297631812558e-06, + "loss": 0.4602, + "step": 2142 + }, + { + "epoch": 1.1585871328167237, + "grad_norm": 0.41819655895233154, + "learning_rate": 9.480459938211305e-06, + "loss": 0.4695, + "step": 2143 + }, + { + "epoch": 1.159127770769508, + "grad_norm": 0.35040050745010376, + "learning_rate": 9.479621605794199e-06, + "loss": 0.4223, + "step": 2144 + }, + { + "epoch": 1.1596684087222924, + "grad_norm": 0.3786729574203491, + "learning_rate": 9.478782634680765e-06, + "loss": 0.4101, + "step": 2145 + }, + { + "epoch": 1.1602090466750765, + "grad_norm": 0.445541650056839, + "learning_rate": 9.477943024990623e-06, + "loss": 0.4645, + "step": 2146 + }, + { + "epoch": 1.1607496846278609, + "grad_norm": 0.3482339382171631, + "learning_rate": 9.477102776843486e-06, + "loss": 0.4434, + "step": 2147 + }, + { + "epoch": 1.1612903225806452, + "grad_norm": 0.4120996594429016, + "learning_rate": 9.476261890359151e-06, + "loss": 0.4368, + "step": 2148 + }, + { + "epoch": 1.1618309605334294, + "grad_norm": 0.40635013580322266, + "learning_rate": 9.475420365657512e-06, + "loss": 0.4764, + "step": 2149 + }, + { + "epoch": 1.1623715984862137, + "grad_norm": 0.37310487031936646, + "learning_rate": 9.47457820285855e-06, + "loss": 0.3937, + "step": 2150 + }, + { + "epoch": 1.162912236438998, + "grad_norm": 0.35417458415031433, + "learning_rate": 9.473735402082342e-06, + "loss": 0.3948, + "step": 2151 + }, + { + "epoch": 1.1634528743917822, + "grad_norm": 0.3947520852088928, + "learning_rate": 9.472891963449053e-06, + "loss": 0.4356, + "step": 2152 + }, + { + "epoch": 1.1639935123445666, + "grad_norm": 0.4242643117904663, + "learning_rate": 9.472047887078937e-06, + "loss": 0.4677, + "step": 2153 + }, + { + "epoch": 1.164534150297351, + "grad_norm": 0.34379175305366516, + "learning_rate": 9.471203173092341e-06, + "loss": 0.4273, + "step": 2154 + }, + { + "epoch": 1.165074788250135, + "grad_norm": 0.38708722591400146, + "learning_rate": 9.470357821609703e-06, + "loss": 0.4671, + "step": 2155 + }, + { + "epoch": 1.1656154262029195, + "grad_norm": 0.3811572194099426, + "learning_rate": 9.469511832751555e-06, + "loss": 0.4166, + "step": 2156 + }, + { + "epoch": 1.1661560641557038, + "grad_norm": 0.3911553919315338, + "learning_rate": 9.46866520663851e-06, + "loss": 0.4457, + "step": 2157 + }, + { + "epoch": 1.166696702108488, + "grad_norm": 0.43305057287216187, + "learning_rate": 9.467817943391284e-06, + "loss": 0.5152, + "step": 2158 + }, + { + "epoch": 1.1672373400612723, + "grad_norm": 0.39823833107948303, + "learning_rate": 9.466970043130676e-06, + "loss": 0.3998, + "step": 2159 + }, + { + "epoch": 1.1677779780140565, + "grad_norm": 0.3863241970539093, + "learning_rate": 9.466121505977577e-06, + "loss": 0.4543, + "step": 2160 + }, + { + "epoch": 1.1683186159668408, + "grad_norm": 0.45706862211227417, + "learning_rate": 9.465272332052972e-06, + "loss": 0.3999, + "step": 2161 + }, + { + "epoch": 1.1688592539196252, + "grad_norm": 0.3852021396160126, + "learning_rate": 9.464422521477935e-06, + "loss": 0.4424, + "step": 2162 + }, + { + "epoch": 1.1693998918724096, + "grad_norm": 0.45317286252975464, + "learning_rate": 9.463572074373628e-06, + "loss": 0.4563, + "step": 2163 + }, + { + "epoch": 1.1699405298251937, + "grad_norm": 0.4411197900772095, + "learning_rate": 9.46272099086131e-06, + "loss": 0.4417, + "step": 2164 + }, + { + "epoch": 1.170481167777978, + "grad_norm": 0.344078004360199, + "learning_rate": 9.461869271062322e-06, + "loss": 0.4278, + "step": 2165 + }, + { + "epoch": 1.1710218057307622, + "grad_norm": 0.40181687474250793, + "learning_rate": 9.461016915098104e-06, + "loss": 0.4331, + "step": 2166 + }, + { + "epoch": 1.1715624436835466, + "grad_norm": 0.42046919465065, + "learning_rate": 9.460163923090184e-06, + "loss": 0.4202, + "step": 2167 + }, + { + "epoch": 1.172103081636331, + "grad_norm": 0.36066851019859314, + "learning_rate": 9.459310295160176e-06, + "loss": 0.4571, + "step": 2168 + }, + { + "epoch": 1.172643719589115, + "grad_norm": 0.38671255111694336, + "learning_rate": 9.458456031429792e-06, + "loss": 0.4384, + "step": 2169 + }, + { + "epoch": 1.1731843575418994, + "grad_norm": 0.36749428510665894, + "learning_rate": 9.457601132020832e-06, + "loss": 0.446, + "step": 2170 + }, + { + "epoch": 1.1737249954946838, + "grad_norm": 0.3736770749092102, + "learning_rate": 9.456745597055185e-06, + "loss": 0.44, + "step": 2171 + }, + { + "epoch": 1.174265633447468, + "grad_norm": 0.39715027809143066, + "learning_rate": 9.45588942665483e-06, + "loss": 0.4351, + "step": 2172 + }, + { + "epoch": 1.1748062714002523, + "grad_norm": 0.34449928998947144, + "learning_rate": 9.45503262094184e-06, + "loss": 0.451, + "step": 2173 + }, + { + "epoch": 1.1753469093530367, + "grad_norm": 0.3449814021587372, + "learning_rate": 9.454175180038376e-06, + "loss": 0.4125, + "step": 2174 + }, + { + "epoch": 1.1758875473058208, + "grad_norm": 0.33945605158805847, + "learning_rate": 9.453317104066693e-06, + "loss": 0.3961, + "step": 2175 + }, + { + "epoch": 1.1764281852586052, + "grad_norm": 0.39236050844192505, + "learning_rate": 9.45245839314913e-06, + "loss": 0.4637, + "step": 2176 + }, + { + "epoch": 1.1769688232113895, + "grad_norm": 0.3773477077484131, + "learning_rate": 9.45159904740812e-06, + "loss": 0.4501, + "step": 2177 + }, + { + "epoch": 1.1775094611641737, + "grad_norm": 0.34886983036994934, + "learning_rate": 9.450739066966192e-06, + "loss": 0.458, + "step": 2178 + }, + { + "epoch": 1.178050099116958, + "grad_norm": 0.40819504857063293, + "learning_rate": 9.449878451945958e-06, + "loss": 0.4038, + "step": 2179 + }, + { + "epoch": 1.1785907370697424, + "grad_norm": 0.3563839793205261, + "learning_rate": 9.44901720247012e-06, + "loss": 0.4686, + "step": 2180 + }, + { + "epoch": 1.1791313750225265, + "grad_norm": 0.3313881754875183, + "learning_rate": 9.448155318661476e-06, + "loss": 0.4203, + "step": 2181 + }, + { + "epoch": 1.1796720129753109, + "grad_norm": 0.3622712790966034, + "learning_rate": 9.447292800642913e-06, + "loss": 0.4537, + "step": 2182 + }, + { + "epoch": 1.1802126509280952, + "grad_norm": 0.29673299193382263, + "learning_rate": 9.446429648537406e-06, + "loss": 0.4283, + "step": 2183 + }, + { + "epoch": 1.1807532888808794, + "grad_norm": 0.3906388580799103, + "learning_rate": 9.445565862468021e-06, + "loss": 0.4324, + "step": 2184 + }, + { + "epoch": 1.1812939268336637, + "grad_norm": 0.35276076197624207, + "learning_rate": 9.444701442557917e-06, + "loss": 0.4522, + "step": 2185 + }, + { + "epoch": 1.181834564786448, + "grad_norm": 0.3876296877861023, + "learning_rate": 9.443836388930339e-06, + "loss": 0.4133, + "step": 2186 + }, + { + "epoch": 1.1823752027392322, + "grad_norm": 0.35179904103279114, + "learning_rate": 9.442970701708625e-06, + "loss": 0.437, + "step": 2187 + }, + { + "epoch": 1.1829158406920166, + "grad_norm": 0.40207886695861816, + "learning_rate": 9.442104381016206e-06, + "loss": 0.4446, + "step": 2188 + }, + { + "epoch": 1.1834564786448007, + "grad_norm": 0.35293862223625183, + "learning_rate": 9.441237426976596e-06, + "loss": 0.4451, + "step": 2189 + }, + { + "epoch": 1.183997116597585, + "grad_norm": 0.38812461495399475, + "learning_rate": 9.440369839713407e-06, + "loss": 0.4249, + "step": 2190 + }, + { + "epoch": 1.1845377545503695, + "grad_norm": 0.3562657833099365, + "learning_rate": 9.439501619350338e-06, + "loss": 0.4276, + "step": 2191 + }, + { + "epoch": 1.1850783925031538, + "grad_norm": 0.40622127056121826, + "learning_rate": 9.438632766011177e-06, + "loss": 0.4296, + "step": 2192 + }, + { + "epoch": 1.185619030455938, + "grad_norm": 0.37603306770324707, + "learning_rate": 9.437763279819803e-06, + "loss": 0.4983, + "step": 2193 + }, + { + "epoch": 1.1861596684087223, + "grad_norm": 0.32640597224235535, + "learning_rate": 9.436893160900188e-06, + "loss": 0.4332, + "step": 2194 + }, + { + "epoch": 1.1867003063615065, + "grad_norm": 0.4022713601589203, + "learning_rate": 9.436022409376391e-06, + "loss": 0.4563, + "step": 2195 + }, + { + "epoch": 1.1872409443142908, + "grad_norm": 0.3431012034416199, + "learning_rate": 9.43515102537256e-06, + "loss": 0.3772, + "step": 2196 + }, + { + "epoch": 1.1877815822670752, + "grad_norm": 0.3568943440914154, + "learning_rate": 9.434279009012938e-06, + "loss": 0.4523, + "step": 2197 + }, + { + "epoch": 1.1883222202198593, + "grad_norm": 0.4028702974319458, + "learning_rate": 9.433406360421857e-06, + "loss": 0.4223, + "step": 2198 + }, + { + "epoch": 1.1888628581726437, + "grad_norm": 0.3365423381328583, + "learning_rate": 9.432533079723734e-06, + "loss": 0.4347, + "step": 2199 + }, + { + "epoch": 1.189403496125428, + "grad_norm": 0.395528107881546, + "learning_rate": 9.431659167043079e-06, + "loss": 0.4713, + "step": 2200 + }, + { + "epoch": 1.1899441340782122, + "grad_norm": 0.37539371848106384, + "learning_rate": 9.430784622504497e-06, + "loss": 0.4018, + "step": 2201 + }, + { + "epoch": 1.1904847720309966, + "grad_norm": 0.34760650992393494, + "learning_rate": 9.429909446232676e-06, + "loss": 0.4432, + "step": 2202 + }, + { + "epoch": 1.191025409983781, + "grad_norm": 0.3822764754295349, + "learning_rate": 9.4290336383524e-06, + "loss": 0.472, + "step": 2203 + }, + { + "epoch": 1.191566047936565, + "grad_norm": 0.3600846827030182, + "learning_rate": 9.428157198988537e-06, + "loss": 0.4183, + "step": 2204 + }, + { + "epoch": 1.1921066858893494, + "grad_norm": 0.37059861421585083, + "learning_rate": 9.427280128266049e-06, + "loss": 0.461, + "step": 2205 + }, + { + "epoch": 1.1926473238421338, + "grad_norm": 0.3323056697845459, + "learning_rate": 9.426402426309989e-06, + "loss": 0.4355, + "step": 2206 + }, + { + "epoch": 1.193187961794918, + "grad_norm": 0.4206660985946655, + "learning_rate": 9.425524093245495e-06, + "loss": 0.4678, + "step": 2207 + }, + { + "epoch": 1.1937285997477023, + "grad_norm": 0.33384454250335693, + "learning_rate": 9.424645129197801e-06, + "loss": 0.4143, + "step": 2208 + }, + { + "epoch": 1.1942692377004867, + "grad_norm": 0.3933919072151184, + "learning_rate": 9.423765534292226e-06, + "loss": 0.462, + "step": 2209 + }, + { + "epoch": 1.1948098756532708, + "grad_norm": 0.3982803225517273, + "learning_rate": 9.422885308654183e-06, + "loss": 0.435, + "step": 2210 + }, + { + "epoch": 1.1953505136060552, + "grad_norm": 0.34469297528266907, + "learning_rate": 9.42200445240917e-06, + "loss": 0.4267, + "step": 2211 + }, + { + "epoch": 1.1958911515588395, + "grad_norm": 0.37136533856391907, + "learning_rate": 9.421122965682782e-06, + "loss": 0.4386, + "step": 2212 + }, + { + "epoch": 1.1964317895116237, + "grad_norm": 0.40113547444343567, + "learning_rate": 9.420240848600696e-06, + "loss": 0.4275, + "step": 2213 + }, + { + "epoch": 1.196972427464408, + "grad_norm": 0.3625503480434418, + "learning_rate": 9.419358101288684e-06, + "loss": 0.438, + "step": 2214 + }, + { + "epoch": 1.1975130654171924, + "grad_norm": 0.4281715154647827, + "learning_rate": 9.418474723872609e-06, + "loss": 0.4306, + "step": 2215 + }, + { + "epoch": 1.1980537033699765, + "grad_norm": 0.36719268560409546, + "learning_rate": 9.417590716478416e-06, + "loss": 0.4596, + "step": 2216 + }, + { + "epoch": 1.1985943413227609, + "grad_norm": 0.3527308702468872, + "learning_rate": 9.41670607923215e-06, + "loss": 0.48, + "step": 2217 + }, + { + "epoch": 1.199134979275545, + "grad_norm": 0.3650606572628021, + "learning_rate": 9.41582081225994e-06, + "loss": 0.393, + "step": 2218 + }, + { + "epoch": 1.1996756172283294, + "grad_norm": 0.38191086053848267, + "learning_rate": 9.414934915688003e-06, + "loss": 0.4144, + "step": 2219 + }, + { + "epoch": 1.2002162551811137, + "grad_norm": 0.35002824664115906, + "learning_rate": 9.414048389642652e-06, + "loss": 0.4261, + "step": 2220 + }, + { + "epoch": 1.200756893133898, + "grad_norm": 0.3725591003894806, + "learning_rate": 9.413161234250284e-06, + "loss": 0.4481, + "step": 2221 + }, + { + "epoch": 1.2012975310866822, + "grad_norm": 0.39697352051734924, + "learning_rate": 9.412273449637388e-06, + "loss": 0.4509, + "step": 2222 + }, + { + "epoch": 1.2018381690394666, + "grad_norm": 0.323587566614151, + "learning_rate": 9.411385035930545e-06, + "loss": 0.4621, + "step": 2223 + }, + { + "epoch": 1.2023788069922507, + "grad_norm": 0.3549892008304596, + "learning_rate": 9.410495993256422e-06, + "loss": 0.4807, + "step": 2224 + }, + { + "epoch": 1.202919444945035, + "grad_norm": 0.3448998034000397, + "learning_rate": 9.409606321741776e-06, + "loss": 0.4487, + "step": 2225 + }, + { + "epoch": 1.2034600828978195, + "grad_norm": 0.33572226762771606, + "learning_rate": 9.408716021513455e-06, + "loss": 0.4429, + "step": 2226 + }, + { + "epoch": 1.2040007208506038, + "grad_norm": 0.3907988965511322, + "learning_rate": 9.4078250926984e-06, + "loss": 0.4633, + "step": 2227 + }, + { + "epoch": 1.204541358803388, + "grad_norm": 0.35494303703308105, + "learning_rate": 9.406933535423632e-06, + "loss": 0.4233, + "step": 2228 + }, + { + "epoch": 1.2050819967561723, + "grad_norm": 0.4350510239601135, + "learning_rate": 9.406041349816272e-06, + "loss": 0.4327, + "step": 2229 + }, + { + "epoch": 1.2056226347089565, + "grad_norm": 0.3504043221473694, + "learning_rate": 9.405148536003527e-06, + "loss": 0.4136, + "step": 2230 + }, + { + "epoch": 1.2061632726617408, + "grad_norm": 0.383758008480072, + "learning_rate": 9.404255094112688e-06, + "loss": 0.4472, + "step": 2231 + }, + { + "epoch": 1.2067039106145252, + "grad_norm": 0.4049503207206726, + "learning_rate": 9.403361024271145e-06, + "loss": 0.4226, + "step": 2232 + }, + { + "epoch": 1.2072445485673093, + "grad_norm": 0.3764093220233917, + "learning_rate": 9.402466326606369e-06, + "loss": 0.4626, + "step": 2233 + }, + { + "epoch": 1.2077851865200937, + "grad_norm": 0.3602176308631897, + "learning_rate": 9.401571001245928e-06, + "loss": 0.4518, + "step": 2234 + }, + { + "epoch": 1.208325824472878, + "grad_norm": 0.3844587206840515, + "learning_rate": 9.400675048317473e-06, + "loss": 0.4539, + "step": 2235 + }, + { + "epoch": 1.2088664624256622, + "grad_norm": 0.385905921459198, + "learning_rate": 9.39977846794875e-06, + "loss": 0.4654, + "step": 2236 + }, + { + "epoch": 1.2094071003784466, + "grad_norm": 0.3673568367958069, + "learning_rate": 9.398881260267589e-06, + "loss": 0.4718, + "step": 2237 + }, + { + "epoch": 1.209947738331231, + "grad_norm": 0.3255668878555298, + "learning_rate": 9.397983425401915e-06, + "loss": 0.3788, + "step": 2238 + }, + { + "epoch": 1.210488376284015, + "grad_norm": 0.3674350082874298, + "learning_rate": 9.397084963479738e-06, + "loss": 0.4678, + "step": 2239 + }, + { + "epoch": 1.2110290142367994, + "grad_norm": 0.34120893478393555, + "learning_rate": 9.396185874629158e-06, + "loss": 0.4141, + "step": 2240 + }, + { + "epoch": 1.2115696521895838, + "grad_norm": 0.39488181471824646, + "learning_rate": 9.395286158978367e-06, + "loss": 0.4583, + "step": 2241 + }, + { + "epoch": 1.212110290142368, + "grad_norm": 0.31320297718048096, + "learning_rate": 9.394385816655647e-06, + "loss": 0.4305, + "step": 2242 + }, + { + "epoch": 1.2126509280951523, + "grad_norm": 0.39865848422050476, + "learning_rate": 9.393484847789363e-06, + "loss": 0.463, + "step": 2243 + }, + { + "epoch": 1.2131915660479367, + "grad_norm": 0.38404038548469543, + "learning_rate": 9.392583252507974e-06, + "loss": 0.4668, + "step": 2244 + }, + { + "epoch": 1.2137322040007208, + "grad_norm": 0.3422393500804901, + "learning_rate": 9.391681030940031e-06, + "loss": 0.4188, + "step": 2245 + }, + { + "epoch": 1.2142728419535052, + "grad_norm": 0.3838838040828705, + "learning_rate": 9.390778183214168e-06, + "loss": 0.4607, + "step": 2246 + }, + { + "epoch": 1.2148134799062895, + "grad_norm": 0.35177937150001526, + "learning_rate": 9.389874709459113e-06, + "loss": 0.4314, + "step": 2247 + }, + { + "epoch": 1.2153541178590737, + "grad_norm": 0.3254885971546173, + "learning_rate": 9.388970609803683e-06, + "loss": 0.4369, + "step": 2248 + }, + { + "epoch": 1.215894755811858, + "grad_norm": 0.35308679938316345, + "learning_rate": 9.388065884376778e-06, + "loss": 0.4392, + "step": 2249 + }, + { + "epoch": 1.2164353937646424, + "grad_norm": 0.3629399538040161, + "learning_rate": 9.387160533307398e-06, + "loss": 0.4443, + "step": 2250 + }, + { + "epoch": 1.2169760317174265, + "grad_norm": 0.29690372943878174, + "learning_rate": 9.386254556724622e-06, + "loss": 0.4125, + "step": 2251 + }, + { + "epoch": 1.2175166696702109, + "grad_norm": 0.3469884395599365, + "learning_rate": 9.385347954757625e-06, + "loss": 0.4298, + "step": 2252 + }, + { + "epoch": 1.218057307622995, + "grad_norm": 0.350612610578537, + "learning_rate": 9.384440727535666e-06, + "loss": 0.4291, + "step": 2253 + }, + { + "epoch": 1.2185979455757794, + "grad_norm": 0.36138424277305603, + "learning_rate": 9.383532875188099e-06, + "loss": 0.4649, + "step": 2254 + }, + { + "epoch": 1.2191385835285637, + "grad_norm": 0.3611285090446472, + "learning_rate": 9.382624397844363e-06, + "loss": 0.4186, + "step": 2255 + }, + { + "epoch": 1.219679221481348, + "grad_norm": 0.4227101504802704, + "learning_rate": 9.381715295633987e-06, + "loss": 0.4508, + "step": 2256 + }, + { + "epoch": 1.2202198594341322, + "grad_norm": 0.34860163927078247, + "learning_rate": 9.380805568686586e-06, + "loss": 0.4743, + "step": 2257 + }, + { + "epoch": 1.2207604973869166, + "grad_norm": 0.34503501653671265, + "learning_rate": 9.379895217131873e-06, + "loss": 0.3927, + "step": 2258 + }, + { + "epoch": 1.2213011353397007, + "grad_norm": 0.4622492492198944, + "learning_rate": 9.378984241099638e-06, + "loss": 0.4687, + "step": 2259 + }, + { + "epoch": 1.2218417732924851, + "grad_norm": 0.3430006504058838, + "learning_rate": 9.378072640719773e-06, + "loss": 0.4237, + "step": 2260 + }, + { + "epoch": 1.2223824112452695, + "grad_norm": 0.42140287160873413, + "learning_rate": 9.377160416122247e-06, + "loss": 0.4448, + "step": 2261 + }, + { + "epoch": 1.2229230491980536, + "grad_norm": 0.3985527455806732, + "learning_rate": 9.376247567437124e-06, + "loss": 0.4172, + "step": 2262 + }, + { + "epoch": 1.223463687150838, + "grad_norm": 0.38346680998802185, + "learning_rate": 9.375334094794558e-06, + "loss": 0.4948, + "step": 2263 + }, + { + "epoch": 1.2240043251036223, + "grad_norm": 0.39689841866493225, + "learning_rate": 9.374419998324792e-06, + "loss": 0.4087, + "step": 2264 + }, + { + "epoch": 1.2245449630564065, + "grad_norm": 0.4045921266078949, + "learning_rate": 9.373505278158152e-06, + "loss": 0.4642, + "step": 2265 + }, + { + "epoch": 1.2250856010091908, + "grad_norm": 0.3763102889060974, + "learning_rate": 9.37258993442506e-06, + "loss": 0.4534, + "step": 2266 + }, + { + "epoch": 1.2256262389619752, + "grad_norm": 0.4438874423503876, + "learning_rate": 9.371673967256023e-06, + "loss": 0.4202, + "step": 2267 + }, + { + "epoch": 1.2261668769147593, + "grad_norm": 0.3829502463340759, + "learning_rate": 9.370757376781638e-06, + "loss": 0.4208, + "step": 2268 + }, + { + "epoch": 1.2267075148675437, + "grad_norm": 0.41881024837493896, + "learning_rate": 9.36984016313259e-06, + "loss": 0.4552, + "step": 2269 + }, + { + "epoch": 1.227248152820328, + "grad_norm": 0.4431045651435852, + "learning_rate": 9.368922326439655e-06, + "loss": 0.4628, + "step": 2270 + }, + { + "epoch": 1.2277887907731122, + "grad_norm": 0.370781272649765, + "learning_rate": 9.368003866833697e-06, + "loss": 0.4015, + "step": 2271 + }, + { + "epoch": 1.2283294287258966, + "grad_norm": 0.331416517496109, + "learning_rate": 9.367084784445668e-06, + "loss": 0.4552, + "step": 2272 + }, + { + "epoch": 1.228870066678681, + "grad_norm": 0.39645448327064514, + "learning_rate": 9.366165079406606e-06, + "loss": 0.4651, + "step": 2273 + }, + { + "epoch": 1.229410704631465, + "grad_norm": 0.31915992498397827, + "learning_rate": 9.365244751847644e-06, + "loss": 0.4231, + "step": 2274 + }, + { + "epoch": 1.2299513425842494, + "grad_norm": 0.3478055000305176, + "learning_rate": 9.364323801900002e-06, + "loss": 0.482, + "step": 2275 + }, + { + "epoch": 1.2304919805370338, + "grad_norm": 0.3081183135509491, + "learning_rate": 9.363402229694982e-06, + "loss": 0.4057, + "step": 2276 + }, + { + "epoch": 1.231032618489818, + "grad_norm": 0.39849919080734253, + "learning_rate": 9.362480035363987e-06, + "loss": 0.483, + "step": 2277 + }, + { + "epoch": 1.2315732564426023, + "grad_norm": 0.35171177983283997, + "learning_rate": 9.361557219038494e-06, + "loss": 0.4569, + "step": 2278 + }, + { + "epoch": 1.2321138943953867, + "grad_norm": 0.36089226603507996, + "learning_rate": 9.360633780850086e-06, + "loss": 0.4671, + "step": 2279 + }, + { + "epoch": 1.2326545323481708, + "grad_norm": 0.3670012056827545, + "learning_rate": 9.359709720930417e-06, + "loss": 0.4142, + "step": 2280 + }, + { + "epoch": 1.2331951703009552, + "grad_norm": 0.3502783477306366, + "learning_rate": 9.35878503941124e-06, + "loss": 0.4344, + "step": 2281 + }, + { + "epoch": 1.2337358082537393, + "grad_norm": 0.389818400144577, + "learning_rate": 9.357859736424395e-06, + "loss": 0.4934, + "step": 2282 + }, + { + "epoch": 1.2342764462065237, + "grad_norm": 0.40615352988243103, + "learning_rate": 9.356933812101812e-06, + "loss": 0.3921, + "step": 2283 + }, + { + "epoch": 1.234817084159308, + "grad_norm": 0.4099143147468567, + "learning_rate": 9.356007266575504e-06, + "loss": 0.4831, + "step": 2284 + }, + { + "epoch": 1.2353577221120924, + "grad_norm": 0.4174445867538452, + "learning_rate": 9.355080099977579e-06, + "loss": 0.4462, + "step": 2285 + }, + { + "epoch": 1.2358983600648765, + "grad_norm": 0.4430469274520874, + "learning_rate": 9.354152312440228e-06, + "loss": 0.4472, + "step": 2286 + }, + { + "epoch": 1.2364389980176609, + "grad_norm": 0.39883628487586975, + "learning_rate": 9.353223904095736e-06, + "loss": 0.4447, + "step": 2287 + }, + { + "epoch": 1.236979635970445, + "grad_norm": 0.3867606818675995, + "learning_rate": 9.352294875076472e-06, + "loss": 0.427, + "step": 2288 + }, + { + "epoch": 1.2375202739232294, + "grad_norm": 0.45105767250061035, + "learning_rate": 9.351365225514898e-06, + "loss": 0.4467, + "step": 2289 + }, + { + "epoch": 1.2380609118760137, + "grad_norm": 0.39790135622024536, + "learning_rate": 9.350434955543557e-06, + "loss": 0.4362, + "step": 2290 + }, + { + "epoch": 1.2386015498287979, + "grad_norm": 0.389241099357605, + "learning_rate": 9.34950406529509e-06, + "loss": 0.4269, + "step": 2291 + }, + { + "epoch": 1.2391421877815822, + "grad_norm": 0.46503782272338867, + "learning_rate": 9.34857255490222e-06, + "loss": 0.4359, + "step": 2292 + }, + { + "epoch": 1.2396828257343666, + "grad_norm": 0.3403927981853485, + "learning_rate": 9.347640424497757e-06, + "loss": 0.4373, + "step": 2293 + }, + { + "epoch": 1.2402234636871508, + "grad_norm": 0.39097025990486145, + "learning_rate": 9.346707674214606e-06, + "loss": 0.4222, + "step": 2294 + }, + { + "epoch": 1.2407641016399351, + "grad_norm": 0.42722076177597046, + "learning_rate": 9.345774304185756e-06, + "loss": 0.4445, + "step": 2295 + }, + { + "epoch": 1.2413047395927195, + "grad_norm": 0.40439140796661377, + "learning_rate": 9.344840314544286e-06, + "loss": 0.4665, + "step": 2296 + }, + { + "epoch": 1.2418453775455036, + "grad_norm": 0.37643495202064514, + "learning_rate": 9.34390570542336e-06, + "loss": 0.3981, + "step": 2297 + }, + { + "epoch": 1.242386015498288, + "grad_norm": 0.341826468706131, + "learning_rate": 9.342970476956234e-06, + "loss": 0.3975, + "step": 2298 + }, + { + "epoch": 1.2429266534510723, + "grad_norm": 0.4274151027202606, + "learning_rate": 9.342034629276255e-06, + "loss": 0.4438, + "step": 2299 + }, + { + "epoch": 1.2434672914038565, + "grad_norm": 0.3313400149345398, + "learning_rate": 9.341098162516848e-06, + "loss": 0.4179, + "step": 2300 + }, + { + "epoch": 1.2440079293566408, + "grad_norm": 0.39104312658309937, + "learning_rate": 9.340161076811539e-06, + "loss": 0.4657, + "step": 2301 + }, + { + "epoch": 1.2445485673094252, + "grad_norm": 0.3932245671749115, + "learning_rate": 9.33922337229393e-06, + "loss": 0.4612, + "step": 2302 + }, + { + "epoch": 1.2450892052622093, + "grad_norm": 0.37315264344215393, + "learning_rate": 9.338285049097722e-06, + "loss": 0.4403, + "step": 2303 + }, + { + "epoch": 1.2456298432149937, + "grad_norm": 0.3951815366744995, + "learning_rate": 9.337346107356695e-06, + "loss": 0.4602, + "step": 2304 + }, + { + "epoch": 1.246170481167778, + "grad_norm": 0.37928786873817444, + "learning_rate": 9.336406547204726e-06, + "loss": 0.4451, + "step": 2305 + }, + { + "epoch": 1.2467111191205622, + "grad_norm": 0.3693735599517822, + "learning_rate": 9.335466368775774e-06, + "loss": 0.4122, + "step": 2306 + }, + { + "epoch": 1.2472517570733466, + "grad_norm": 0.4383481740951538, + "learning_rate": 9.334525572203887e-06, + "loss": 0.4892, + "step": 2307 + }, + { + "epoch": 1.247792395026131, + "grad_norm": 0.33080145716667175, + "learning_rate": 9.333584157623204e-06, + "loss": 0.4127, + "step": 2308 + }, + { + "epoch": 1.248333032978915, + "grad_norm": 0.36075839400291443, + "learning_rate": 9.332642125167948e-06, + "loss": 0.4394, + "step": 2309 + }, + { + "epoch": 1.2488736709316994, + "grad_norm": 0.35682231187820435, + "learning_rate": 9.331699474972434e-06, + "loss": 0.4496, + "step": 2310 + }, + { + "epoch": 1.2494143088844836, + "grad_norm": 0.3295380473136902, + "learning_rate": 9.330756207171064e-06, + "loss": 0.4384, + "step": 2311 + }, + { + "epoch": 1.249954946837268, + "grad_norm": 0.36135971546173096, + "learning_rate": 9.329812321898323e-06, + "loss": 0.3627, + "step": 2312 + }, + { + "epoch": 1.2504955847900523, + "grad_norm": 0.43534931540489197, + "learning_rate": 9.328867819288794e-06, + "loss": 0.465, + "step": 2313 + }, + { + "epoch": 1.2510362227428367, + "grad_norm": 0.3011641204357147, + "learning_rate": 9.327922699477139e-06, + "loss": 0.4207, + "step": 2314 + }, + { + "epoch": 1.2515768606956208, + "grad_norm": 0.43106555938720703, + "learning_rate": 9.326976962598113e-06, + "loss": 0.4948, + "step": 2315 + }, + { + "epoch": 1.2521174986484052, + "grad_norm": 0.3331192135810852, + "learning_rate": 9.326030608786558e-06, + "loss": 0.367, + "step": 2316 + }, + { + "epoch": 1.2526581366011893, + "grad_norm": 0.34783822298049927, + "learning_rate": 9.325083638177401e-06, + "loss": 0.4393, + "step": 2317 + }, + { + "epoch": 1.2531987745539737, + "grad_norm": 0.3120574355125427, + "learning_rate": 9.32413605090566e-06, + "loss": 0.4223, + "step": 2318 + }, + { + "epoch": 1.253739412506758, + "grad_norm": 0.3396937847137451, + "learning_rate": 9.323187847106441e-06, + "loss": 0.4543, + "step": 2319 + }, + { + "epoch": 1.2542800504595424, + "grad_norm": 0.3857153654098511, + "learning_rate": 9.322239026914938e-06, + "loss": 0.4591, + "step": 2320 + }, + { + "epoch": 1.2548206884123265, + "grad_norm": 0.3564053177833557, + "learning_rate": 9.321289590466434e-06, + "loss": 0.4577, + "step": 2321 + }, + { + "epoch": 1.2553613263651109, + "grad_norm": 0.34253233671188354, + "learning_rate": 9.32033953789629e-06, + "loss": 0.4183, + "step": 2322 + }, + { + "epoch": 1.255901964317895, + "grad_norm": 0.383951336145401, + "learning_rate": 9.319388869339971e-06, + "loss": 0.4299, + "step": 2323 + }, + { + "epoch": 1.2564426022706794, + "grad_norm": 0.33586639165878296, + "learning_rate": 9.318437584933018e-06, + "loss": 0.4426, + "step": 2324 + }, + { + "epoch": 1.2569832402234637, + "grad_norm": 0.3765944242477417, + "learning_rate": 9.317485684811065e-06, + "loss": 0.498, + "step": 2325 + }, + { + "epoch": 1.2575238781762481, + "grad_norm": 0.35276341438293457, + "learning_rate": 9.31653316910983e-06, + "loss": 0.4591, + "step": 2326 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 0.3316299319267273, + "learning_rate": 9.315580037965123e-06, + "loss": 0.4082, + "step": 2327 + }, + { + "epoch": 1.2586051540818166, + "grad_norm": 0.3217739760875702, + "learning_rate": 9.314626291512838e-06, + "loss": 0.4168, + "step": 2328 + }, + { + "epoch": 1.2591457920346008, + "grad_norm": 0.31653720140457153, + "learning_rate": 9.31367192988896e-06, + "loss": 0.4382, + "step": 2329 + }, + { + "epoch": 1.2596864299873851, + "grad_norm": 0.3542955815792084, + "learning_rate": 9.31271695322956e-06, + "loss": 0.3904, + "step": 2330 + }, + { + "epoch": 1.2602270679401695, + "grad_norm": 0.3201338052749634, + "learning_rate": 9.311761361670794e-06, + "loss": 0.4461, + "step": 2331 + }, + { + "epoch": 1.2607677058929536, + "grad_norm": 0.3405888080596924, + "learning_rate": 9.310805155348912e-06, + "loss": 0.4609, + "step": 2332 + }, + { + "epoch": 1.261308343845738, + "grad_norm": 0.3953564465045929, + "learning_rate": 9.309848334400247e-06, + "loss": 0.4452, + "step": 2333 + }, + { + "epoch": 1.2618489817985221, + "grad_norm": 0.3923310935497284, + "learning_rate": 9.30889089896122e-06, + "loss": 0.4389, + "step": 2334 + }, + { + "epoch": 1.2623896197513065, + "grad_norm": 0.2811641991138458, + "learning_rate": 9.307932849168341e-06, + "loss": 0.4077, + "step": 2335 + }, + { + "epoch": 1.2629302577040908, + "grad_norm": 0.4195081293582916, + "learning_rate": 9.306974185158209e-06, + "loss": 0.4563, + "step": 2336 + }, + { + "epoch": 1.2634708956568752, + "grad_norm": 0.37984922528266907, + "learning_rate": 9.306014907067507e-06, + "loss": 0.4557, + "step": 2337 + }, + { + "epoch": 1.2640115336096593, + "grad_norm": 0.3626854419708252, + "learning_rate": 9.305055015033004e-06, + "loss": 0.4338, + "step": 2338 + }, + { + "epoch": 1.2645521715624437, + "grad_norm": 0.3649787902832031, + "learning_rate": 9.304094509191564e-06, + "loss": 0.4677, + "step": 2339 + }, + { + "epoch": 1.2650928095152278, + "grad_norm": 0.35816383361816406, + "learning_rate": 9.303133389680134e-06, + "loss": 0.4417, + "step": 2340 + }, + { + "epoch": 1.2656334474680122, + "grad_norm": 0.34932854771614075, + "learning_rate": 9.302171656635746e-06, + "loss": 0.4226, + "step": 2341 + }, + { + "epoch": 1.2661740854207966, + "grad_norm": 0.3992402255535126, + "learning_rate": 9.301209310195523e-06, + "loss": 0.4574, + "step": 2342 + }, + { + "epoch": 1.266714723373581, + "grad_norm": 0.35466626286506653, + "learning_rate": 9.300246350496676e-06, + "loss": 0.4379, + "step": 2343 + }, + { + "epoch": 1.267255361326365, + "grad_norm": 0.3635440468788147, + "learning_rate": 9.2992827776765e-06, + "loss": 0.425, + "step": 2344 + }, + { + "epoch": 1.2677959992791494, + "grad_norm": 0.3766593933105469, + "learning_rate": 9.298318591872381e-06, + "loss": 0.4503, + "step": 2345 + }, + { + "epoch": 1.2683366372319336, + "grad_norm": 0.40644508600234985, + "learning_rate": 9.297353793221793e-06, + "loss": 0.434, + "step": 2346 + }, + { + "epoch": 1.268877275184718, + "grad_norm": 0.33753782510757446, + "learning_rate": 9.29638838186229e-06, + "loss": 0.4649, + "step": 2347 + }, + { + "epoch": 1.2694179131375023, + "grad_norm": 0.35078006982803345, + "learning_rate": 9.295422357931523e-06, + "loss": 0.4276, + "step": 2348 + }, + { + "epoch": 1.2699585510902867, + "grad_norm": 0.41513198614120483, + "learning_rate": 9.294455721567224e-06, + "loss": 0.4546, + "step": 2349 + }, + { + "epoch": 1.2704991890430708, + "grad_norm": 0.337698757648468, + "learning_rate": 9.293488472907213e-06, + "loss": 0.4209, + "step": 2350 + }, + { + "epoch": 1.2710398269958552, + "grad_norm": 0.37815606594085693, + "learning_rate": 9.292520612089402e-06, + "loss": 0.4791, + "step": 2351 + }, + { + "epoch": 1.2715804649486393, + "grad_norm": 0.33892422914505005, + "learning_rate": 9.291552139251784e-06, + "loss": 0.4178, + "step": 2352 + }, + { + "epoch": 1.2721211029014237, + "grad_norm": 0.35459035634994507, + "learning_rate": 9.290583054532443e-06, + "loss": 0.446, + "step": 2353 + }, + { + "epoch": 1.272661740854208, + "grad_norm": 0.3055948317050934, + "learning_rate": 9.289613358069549e-06, + "loss": 0.423, + "step": 2354 + }, + { + "epoch": 1.2732023788069924, + "grad_norm": 0.305288165807724, + "learning_rate": 9.288643050001362e-06, + "loss": 0.4593, + "step": 2355 + }, + { + "epoch": 1.2737430167597765, + "grad_norm": 0.36002859473228455, + "learning_rate": 9.287672130466223e-06, + "loss": 0.4326, + "step": 2356 + }, + { + "epoch": 1.2742836547125609, + "grad_norm": 0.3907860517501831, + "learning_rate": 9.286700599602565e-06, + "loss": 0.4918, + "step": 2357 + }, + { + "epoch": 1.274824292665345, + "grad_norm": 0.3049376308917999, + "learning_rate": 9.285728457548909e-06, + "loss": 0.3754, + "step": 2358 + }, + { + "epoch": 1.2753649306181294, + "grad_norm": 0.39047765731811523, + "learning_rate": 9.28475570444386e-06, + "loss": 0.4517, + "step": 2359 + }, + { + "epoch": 1.2759055685709138, + "grad_norm": 0.38072285056114197, + "learning_rate": 9.283782340426112e-06, + "loss": 0.4627, + "step": 2360 + }, + { + "epoch": 1.276446206523698, + "grad_norm": 0.41873809695243835, + "learning_rate": 9.282808365634444e-06, + "loss": 0.4289, + "step": 2361 + }, + { + "epoch": 1.2769868444764823, + "grad_norm": 0.3254391849040985, + "learning_rate": 9.281833780207725e-06, + "loss": 0.4131, + "step": 2362 + }, + { + "epoch": 1.2775274824292666, + "grad_norm": 0.519358217716217, + "learning_rate": 9.280858584284909e-06, + "loss": 0.4975, + "step": 2363 + }, + { + "epoch": 1.2780681203820508, + "grad_norm": 0.3705024719238281, + "learning_rate": 9.279882778005035e-06, + "loss": 0.4235, + "step": 2364 + }, + { + "epoch": 1.2786087583348351, + "grad_norm": 0.45141515135765076, + "learning_rate": 9.278906361507238e-06, + "loss": 0.4301, + "step": 2365 + }, + { + "epoch": 1.2791493962876195, + "grad_norm": 0.4273265600204468, + "learning_rate": 9.27792933493073e-06, + "loss": 0.4363, + "step": 2366 + }, + { + "epoch": 1.2796900342404036, + "grad_norm": 0.414331316947937, + "learning_rate": 9.276951698414812e-06, + "loss": 0.4153, + "step": 2367 + }, + { + "epoch": 1.280230672193188, + "grad_norm": 0.4092567265033722, + "learning_rate": 9.275973452098877e-06, + "loss": 0.4542, + "step": 2368 + }, + { + "epoch": 1.2807713101459721, + "grad_norm": 0.4097854495048523, + "learning_rate": 9.2749945961224e-06, + "loss": 0.4499, + "step": 2369 + }, + { + "epoch": 1.2813119480987565, + "grad_norm": 0.40357089042663574, + "learning_rate": 9.274015130624943e-06, + "loss": 0.4272, + "step": 2370 + }, + { + "epoch": 1.2818525860515408, + "grad_norm": 0.3889763057231903, + "learning_rate": 9.273035055746159e-06, + "loss": 0.4346, + "step": 2371 + }, + { + "epoch": 1.2823932240043252, + "grad_norm": 0.3984520733356476, + "learning_rate": 9.272054371625783e-06, + "loss": 0.4514, + "step": 2372 + }, + { + "epoch": 1.2829338619571093, + "grad_norm": 0.35287633538246155, + "learning_rate": 9.271073078403643e-06, + "loss": 0.4235, + "step": 2373 + }, + { + "epoch": 1.2834744999098937, + "grad_norm": 0.4218217432498932, + "learning_rate": 9.270091176219645e-06, + "loss": 0.474, + "step": 2374 + }, + { + "epoch": 1.2840151378626778, + "grad_norm": 0.3276956081390381, + "learning_rate": 9.26910866521379e-06, + "loss": 0.4135, + "step": 2375 + }, + { + "epoch": 1.2845557758154622, + "grad_norm": 0.3250516653060913, + "learning_rate": 9.268125545526163e-06, + "loss": 0.4082, + "step": 2376 + }, + { + "epoch": 1.2850964137682466, + "grad_norm": 0.44134101271629333, + "learning_rate": 9.267141817296933e-06, + "loss": 0.4576, + "step": 2377 + }, + { + "epoch": 1.285637051721031, + "grad_norm": 0.3511035442352295, + "learning_rate": 9.26615748066636e-06, + "loss": 0.4313, + "step": 2378 + }, + { + "epoch": 1.286177689673815, + "grad_norm": 0.40824609994888306, + "learning_rate": 9.265172535774788e-06, + "loss": 0.4029, + "step": 2379 + }, + { + "epoch": 1.2867183276265994, + "grad_norm": 0.4215698540210724, + "learning_rate": 9.264186982762649e-06, + "loss": 0.4939, + "step": 2380 + }, + { + "epoch": 1.2872589655793836, + "grad_norm": 0.3959537446498871, + "learning_rate": 9.263200821770462e-06, + "loss": 0.4302, + "step": 2381 + }, + { + "epoch": 1.287799603532168, + "grad_norm": 0.3674355447292328, + "learning_rate": 9.262214052938832e-06, + "loss": 0.4367, + "step": 2382 + }, + { + "epoch": 1.2883402414849523, + "grad_norm": 0.4184582829475403, + "learning_rate": 9.26122667640845e-06, + "loss": 0.4503, + "step": 2383 + }, + { + "epoch": 1.2888808794377367, + "grad_norm": 0.3745483458042145, + "learning_rate": 9.260238692320093e-06, + "loss": 0.3753, + "step": 2384 + }, + { + "epoch": 1.2894215173905208, + "grad_norm": 0.35480156540870667, + "learning_rate": 9.25925010081463e-06, + "loss": 0.5004, + "step": 2385 + }, + { + "epoch": 1.2899621553433052, + "grad_norm": 0.3390335738658905, + "learning_rate": 9.258260902033007e-06, + "loss": 0.4054, + "step": 2386 + }, + { + "epoch": 1.2905027932960893, + "grad_norm": 0.3922213315963745, + "learning_rate": 9.257271096116268e-06, + "loss": 0.4065, + "step": 2387 + }, + { + "epoch": 1.2910434312488737, + "grad_norm": 0.3549047112464905, + "learning_rate": 9.256280683205534e-06, + "loss": 0.4525, + "step": 2388 + }, + { + "epoch": 1.291584069201658, + "grad_norm": 0.36089131236076355, + "learning_rate": 9.255289663442018e-06, + "loss": 0.4325, + "step": 2389 + }, + { + "epoch": 1.2921247071544422, + "grad_norm": 0.33818209171295166, + "learning_rate": 9.254298036967015e-06, + "loss": 0.4321, + "step": 2390 + }, + { + "epoch": 1.2926653451072265, + "grad_norm": 0.367330402135849, + "learning_rate": 9.253305803921915e-06, + "loss": 0.4429, + "step": 2391 + }, + { + "epoch": 1.2932059830600109, + "grad_norm": 0.32385024428367615, + "learning_rate": 9.252312964448182e-06, + "loss": 0.4145, + "step": 2392 + }, + { + "epoch": 1.293746621012795, + "grad_norm": 0.3290650546550751, + "learning_rate": 9.251319518687379e-06, + "loss": 0.4268, + "step": 2393 + }, + { + "epoch": 1.2942872589655794, + "grad_norm": 0.3983169496059418, + "learning_rate": 9.250325466781145e-06, + "loss": 0.4725, + "step": 2394 + }, + { + "epoch": 1.2948278969183638, + "grad_norm": 0.3585212528705597, + "learning_rate": 9.249330808871213e-06, + "loss": 0.429, + "step": 2395 + }, + { + "epoch": 1.295368534871148, + "grad_norm": 0.3783895969390869, + "learning_rate": 9.248335545099398e-06, + "loss": 0.4486, + "step": 2396 + }, + { + "epoch": 1.2959091728239323, + "grad_norm": 0.34847337007522583, + "learning_rate": 9.247339675607606e-06, + "loss": 0.4414, + "step": 2397 + }, + { + "epoch": 1.2964498107767164, + "grad_norm": 0.3564877212047577, + "learning_rate": 9.246343200537823e-06, + "loss": 0.458, + "step": 2398 + }, + { + "epoch": 1.2969904487295008, + "grad_norm": 0.34090301394462585, + "learning_rate": 9.245346120032124e-06, + "loss": 0.4475, + "step": 2399 + }, + { + "epoch": 1.2975310866822851, + "grad_norm": 0.41178473830223083, + "learning_rate": 9.244348434232676e-06, + "loss": 0.4478, + "step": 2400 + }, + { + "epoch": 1.2980717246350695, + "grad_norm": 0.3171329200267792, + "learning_rate": 9.24335014328172e-06, + "loss": 0.4077, + "step": 2401 + }, + { + "epoch": 1.2986123625878536, + "grad_norm": 0.36641278862953186, + "learning_rate": 9.242351247321595e-06, + "loss": 0.4764, + "step": 2402 + }, + { + "epoch": 1.299153000540638, + "grad_norm": 0.34632667899131775, + "learning_rate": 9.241351746494723e-06, + "loss": 0.4268, + "step": 2403 + }, + { + "epoch": 1.2996936384934221, + "grad_norm": 0.3175496757030487, + "learning_rate": 9.240351640943607e-06, + "loss": 0.4124, + "step": 2404 + }, + { + "epoch": 1.3002342764462065, + "grad_norm": 0.40579769015312195, + "learning_rate": 9.239350930810843e-06, + "loss": 0.4939, + "step": 2405 + }, + { + "epoch": 1.3007749143989908, + "grad_norm": 0.30617377161979675, + "learning_rate": 9.23834961623911e-06, + "loss": 0.4298, + "step": 2406 + }, + { + "epoch": 1.3013155523517752, + "grad_norm": 0.3338660001754761, + "learning_rate": 9.237347697371173e-06, + "loss": 0.4309, + "step": 2407 + }, + { + "epoch": 1.3018561903045593, + "grad_norm": 0.3111022412776947, + "learning_rate": 9.236345174349884e-06, + "loss": 0.4361, + "step": 2408 + }, + { + "epoch": 1.3023968282573437, + "grad_norm": 0.35824936628341675, + "learning_rate": 9.23534204731818e-06, + "loss": 0.4738, + "step": 2409 + }, + { + "epoch": 1.3029374662101278, + "grad_norm": 0.31303870677948, + "learning_rate": 9.23433831641909e-06, + "loss": 0.4224, + "step": 2410 + }, + { + "epoch": 1.3034781041629122, + "grad_norm": 0.30870863795280457, + "learning_rate": 9.233333981795715e-06, + "loss": 0.3822, + "step": 2411 + }, + { + "epoch": 1.3040187421156966, + "grad_norm": 0.41559818387031555, + "learning_rate": 9.23232904359126e-06, + "loss": 0.4988, + "step": 2412 + }, + { + "epoch": 1.304559380068481, + "grad_norm": 0.3499990701675415, + "learning_rate": 9.231323501949003e-06, + "loss": 0.4313, + "step": 2413 + }, + { + "epoch": 1.305100018021265, + "grad_norm": 0.3727319538593292, + "learning_rate": 9.230317357012312e-06, + "loss": 0.459, + "step": 2414 + }, + { + "epoch": 1.3056406559740494, + "grad_norm": 0.34095412492752075, + "learning_rate": 9.229310608924643e-06, + "loss": 0.3827, + "step": 2415 + }, + { + "epoch": 1.3061812939268336, + "grad_norm": 0.4113728106021881, + "learning_rate": 9.228303257829535e-06, + "loss": 0.423, + "step": 2416 + }, + { + "epoch": 1.306721931879618, + "grad_norm": 0.38452550768852234, + "learning_rate": 9.227295303870615e-06, + "loss": 0.4199, + "step": 2417 + }, + { + "epoch": 1.3072625698324023, + "grad_norm": 0.3986125886440277, + "learning_rate": 9.226286747191597e-06, + "loss": 0.4625, + "step": 2418 + }, + { + "epoch": 1.3078032077851864, + "grad_norm": 0.43863949179649353, + "learning_rate": 9.225277587936275e-06, + "loss": 0.4138, + "step": 2419 + }, + { + "epoch": 1.3083438457379708, + "grad_norm": 0.3248741626739502, + "learning_rate": 9.224267826248536e-06, + "loss": 0.423, + "step": 2420 + }, + { + "epoch": 1.3088844836907552, + "grad_norm": 0.41215142607688904, + "learning_rate": 9.22325746227235e-06, + "loss": 0.4702, + "step": 2421 + }, + { + "epoch": 1.3094251216435393, + "grad_norm": 0.37813472747802734, + "learning_rate": 9.222246496151772e-06, + "loss": 0.447, + "step": 2422 + }, + { + "epoch": 1.3099657595963237, + "grad_norm": 0.3352581262588501, + "learning_rate": 9.221234928030944e-06, + "loss": 0.4113, + "step": 2423 + }, + { + "epoch": 1.310506397549108, + "grad_norm": 0.3993983864784241, + "learning_rate": 9.220222758054093e-06, + "loss": 0.4622, + "step": 2424 + }, + { + "epoch": 1.3110470355018922, + "grad_norm": 0.34393075108528137, + "learning_rate": 9.219209986365533e-06, + "loss": 0.4406, + "step": 2425 + }, + { + "epoch": 1.3115876734546765, + "grad_norm": 0.41056370735168457, + "learning_rate": 9.218196613109664e-06, + "loss": 0.4707, + "step": 2426 + }, + { + "epoch": 1.3121283114074607, + "grad_norm": 0.3283563554286957, + "learning_rate": 9.21718263843097e-06, + "loss": 0.4335, + "step": 2427 + }, + { + "epoch": 1.312668949360245, + "grad_norm": 0.36037319898605347, + "learning_rate": 9.21616806247402e-06, + "loss": 0.4211, + "step": 2428 + }, + { + "epoch": 1.3132095873130294, + "grad_norm": 0.3254454433917999, + "learning_rate": 9.215152885383473e-06, + "loss": 0.4663, + "step": 2429 + }, + { + "epoch": 1.3137502252658138, + "grad_norm": 0.3504960238933563, + "learning_rate": 9.21413710730407e-06, + "loss": 0.4154, + "step": 2430 + }, + { + "epoch": 1.314290863218598, + "grad_norm": 0.3567231595516205, + "learning_rate": 9.21312072838064e-06, + "loss": 0.4223, + "step": 2431 + }, + { + "epoch": 1.3148315011713823, + "grad_norm": 0.3479120433330536, + "learning_rate": 9.212103748758095e-06, + "loss": 0.453, + "step": 2432 + }, + { + "epoch": 1.3153721391241664, + "grad_norm": 0.31994810700416565, + "learning_rate": 9.211086168581433e-06, + "loss": 0.438, + "step": 2433 + }, + { + "epoch": 1.3159127770769508, + "grad_norm": 0.3787417411804199, + "learning_rate": 9.210067987995742e-06, + "loss": 0.4615, + "step": 2434 + }, + { + "epoch": 1.3164534150297351, + "grad_norm": 0.30681583285331726, + "learning_rate": 9.20904920714619e-06, + "loss": 0.3856, + "step": 2435 + }, + { + "epoch": 1.3169940529825195, + "grad_norm": 0.35557037591934204, + "learning_rate": 9.208029826178034e-06, + "loss": 0.45, + "step": 2436 + }, + { + "epoch": 1.3175346909353036, + "grad_norm": 0.382725328207016, + "learning_rate": 9.207009845236614e-06, + "loss": 0.4463, + "step": 2437 + }, + { + "epoch": 1.318075328888088, + "grad_norm": 0.2951371967792511, + "learning_rate": 9.205989264467359e-06, + "loss": 0.3897, + "step": 2438 + }, + { + "epoch": 1.3186159668408721, + "grad_norm": 0.33292078971862793, + "learning_rate": 9.20496808401578e-06, + "loss": 0.4581, + "step": 2439 + }, + { + "epoch": 1.3191566047936565, + "grad_norm": 0.3527860939502716, + "learning_rate": 9.203946304027476e-06, + "loss": 0.4135, + "step": 2440 + }, + { + "epoch": 1.3196972427464408, + "grad_norm": 0.3484804928302765, + "learning_rate": 9.20292392464813e-06, + "loss": 0.4608, + "step": 2441 + }, + { + "epoch": 1.3202378806992252, + "grad_norm": 0.32849547266960144, + "learning_rate": 9.201900946023512e-06, + "loss": 0.4735, + "step": 2442 + }, + { + "epoch": 1.3207785186520093, + "grad_norm": 0.345099538564682, + "learning_rate": 9.200877368299474e-06, + "loss": 0.4386, + "step": 2443 + }, + { + "epoch": 1.3213191566047937, + "grad_norm": 0.34676334261894226, + "learning_rate": 9.19985319162196e-06, + "loss": 0.4486, + "step": 2444 + }, + { + "epoch": 1.3218597945575778, + "grad_norm": 0.31078365445137024, + "learning_rate": 9.198828416136991e-06, + "loss": 0.3761, + "step": 2445 + }, + { + "epoch": 1.3224004325103622, + "grad_norm": 0.39399051666259766, + "learning_rate": 9.19780304199068e-06, + "loss": 0.4963, + "step": 2446 + }, + { + "epoch": 1.3229410704631466, + "grad_norm": 0.3411425054073334, + "learning_rate": 9.196777069329222e-06, + "loss": 0.4087, + "step": 2447 + }, + { + "epoch": 1.323481708415931, + "grad_norm": 0.34350520372390747, + "learning_rate": 9.195750498298898e-06, + "loss": 0.4536, + "step": 2448 + }, + { + "epoch": 1.324022346368715, + "grad_norm": 0.3334350883960724, + "learning_rate": 9.194723329046076e-06, + "loss": 0.425, + "step": 2449 + }, + { + "epoch": 1.3245629843214994, + "grad_norm": 0.34668123722076416, + "learning_rate": 9.193695561717207e-06, + "loss": 0.4197, + "step": 2450 + }, + { + "epoch": 1.3251036222742836, + "grad_norm": 0.3792758584022522, + "learning_rate": 9.192667196458829e-06, + "loss": 0.4212, + "step": 2451 + }, + { + "epoch": 1.325644260227068, + "grad_norm": 0.3879375457763672, + "learning_rate": 9.191638233417563e-06, + "loss": 0.4541, + "step": 2452 + }, + { + "epoch": 1.3261848981798523, + "grad_norm": 0.38033583760261536, + "learning_rate": 9.190608672740118e-06, + "loss": 0.4258, + "step": 2453 + }, + { + "epoch": 1.3267255361326364, + "grad_norm": 0.31433993577957153, + "learning_rate": 9.189578514573287e-06, + "loss": 0.3939, + "step": 2454 + }, + { + "epoch": 1.3272661740854208, + "grad_norm": 0.40909436345100403, + "learning_rate": 9.188547759063948e-06, + "loss": 0.4741, + "step": 2455 + }, + { + "epoch": 1.3278068120382052, + "grad_norm": 0.40054723620414734, + "learning_rate": 9.187516406359062e-06, + "loss": 0.4371, + "step": 2456 + }, + { + "epoch": 1.3283474499909893, + "grad_norm": 0.38349390029907227, + "learning_rate": 9.186484456605682e-06, + "loss": 0.4281, + "step": 2457 + }, + { + "epoch": 1.3288880879437737, + "grad_norm": 0.4046049416065216, + "learning_rate": 9.185451909950937e-06, + "loss": 0.4476, + "step": 2458 + }, + { + "epoch": 1.329428725896558, + "grad_norm": 0.33745673298835754, + "learning_rate": 9.184418766542046e-06, + "loss": 0.4072, + "step": 2459 + }, + { + "epoch": 1.3299693638493422, + "grad_norm": 0.3759952187538147, + "learning_rate": 9.183385026526317e-06, + "loss": 0.453, + "step": 2460 + }, + { + "epoch": 1.3305100018021265, + "grad_norm": 0.37083351612091064, + "learning_rate": 9.182350690051134e-06, + "loss": 0.455, + "step": 2461 + }, + { + "epoch": 1.3310506397549107, + "grad_norm": 0.35298165678977966, + "learning_rate": 9.181315757263973e-06, + "loss": 0.4801, + "step": 2462 + }, + { + "epoch": 1.331591277707695, + "grad_norm": 0.30566030740737915, + "learning_rate": 9.180280228312394e-06, + "loss": 0.3776, + "step": 2463 + }, + { + "epoch": 1.3321319156604794, + "grad_norm": 0.33040979504585266, + "learning_rate": 9.179244103344039e-06, + "loss": 0.4015, + "step": 2464 + }, + { + "epoch": 1.3326725536132638, + "grad_norm": 0.3727269172668457, + "learning_rate": 9.178207382506634e-06, + "loss": 0.5066, + "step": 2465 + }, + { + "epoch": 1.333213191566048, + "grad_norm": 0.36058521270751953, + "learning_rate": 9.177170065948e-06, + "loss": 0.4391, + "step": 2466 + }, + { + "epoch": 1.3337538295188323, + "grad_norm": 0.3290637731552124, + "learning_rate": 9.17613215381603e-06, + "loss": 0.466, + "step": 2467 + }, + { + "epoch": 1.3342944674716164, + "grad_norm": 0.3557436466217041, + "learning_rate": 9.175093646258709e-06, + "loss": 0.4387, + "step": 2468 + }, + { + "epoch": 1.3348351054244008, + "grad_norm": 0.39177340269088745, + "learning_rate": 9.174054543424106e-06, + "loss": 0.4607, + "step": 2469 + }, + { + "epoch": 1.3353757433771851, + "grad_norm": 0.32272791862487793, + "learning_rate": 9.173014845460375e-06, + "loss": 0.4258, + "step": 2470 + }, + { + "epoch": 1.3359163813299695, + "grad_norm": 0.42237669229507446, + "learning_rate": 9.171974552515753e-06, + "loss": 0.4199, + "step": 2471 + }, + { + "epoch": 1.3364570192827536, + "grad_norm": 0.3549451529979706, + "learning_rate": 9.170933664738563e-06, + "loss": 0.4044, + "step": 2472 + }, + { + "epoch": 1.336997657235538, + "grad_norm": 0.3337773382663727, + "learning_rate": 9.169892182277214e-06, + "loss": 0.4506, + "step": 2473 + }, + { + "epoch": 1.3375382951883221, + "grad_norm": 0.3925420939922333, + "learning_rate": 9.168850105280198e-06, + "loss": 0.4141, + "step": 2474 + }, + { + "epoch": 1.3380789331411065, + "grad_norm": 0.37819182872772217, + "learning_rate": 9.167807433896091e-06, + "loss": 0.5166, + "step": 2475 + }, + { + "epoch": 1.3386195710938908, + "grad_norm": 0.38570624589920044, + "learning_rate": 9.166764168273559e-06, + "loss": 0.4778, + "step": 2476 + }, + { + "epoch": 1.3391602090466752, + "grad_norm": 0.31898075342178345, + "learning_rate": 9.165720308561347e-06, + "loss": 0.3757, + "step": 2477 + }, + { + "epoch": 1.3397008469994593, + "grad_norm": 0.3864937424659729, + "learning_rate": 9.164675854908284e-06, + "loss": 0.494, + "step": 2478 + }, + { + "epoch": 1.3402414849522437, + "grad_norm": 0.3443322479724884, + "learning_rate": 9.163630807463292e-06, + "loss": 0.4216, + "step": 2479 + }, + { + "epoch": 1.3407821229050279, + "grad_norm": 0.43592381477355957, + "learning_rate": 9.162585166375367e-06, + "loss": 0.4449, + "step": 2480 + }, + { + "epoch": 1.3413227608578122, + "grad_norm": 0.36814263463020325, + "learning_rate": 9.161538931793595e-06, + "loss": 0.448, + "step": 2481 + }, + { + "epoch": 1.3418633988105966, + "grad_norm": 0.325903058052063, + "learning_rate": 9.160492103867149e-06, + "loss": 0.4119, + "step": 2482 + }, + { + "epoch": 1.3424040367633807, + "grad_norm": 0.4102088212966919, + "learning_rate": 9.159444682745282e-06, + "loss": 0.4866, + "step": 2483 + }, + { + "epoch": 1.342944674716165, + "grad_norm": 0.3582179844379425, + "learning_rate": 9.158396668577333e-06, + "loss": 0.4608, + "step": 2484 + }, + { + "epoch": 1.3434853126689494, + "grad_norm": 0.3896026313304901, + "learning_rate": 9.157348061512728e-06, + "loss": 0.4008, + "step": 2485 + }, + { + "epoch": 1.3440259506217336, + "grad_norm": 0.3490865230560303, + "learning_rate": 9.156298861700971e-06, + "loss": 0.4442, + "step": 2486 + }, + { + "epoch": 1.344566588574518, + "grad_norm": 0.39922162890434265, + "learning_rate": 9.155249069291661e-06, + "loss": 0.4744, + "step": 2487 + }, + { + "epoch": 1.3451072265273023, + "grad_norm": 0.30951571464538574, + "learning_rate": 9.154198684434472e-06, + "loss": 0.3839, + "step": 2488 + }, + { + "epoch": 1.3456478644800864, + "grad_norm": 0.3632635176181793, + "learning_rate": 9.153147707279168e-06, + "loss": 0.4455, + "step": 2489 + }, + { + "epoch": 1.3461885024328708, + "grad_norm": 0.3484949469566345, + "learning_rate": 9.152096137975593e-06, + "loss": 0.4681, + "step": 2490 + }, + { + "epoch": 1.346729140385655, + "grad_norm": 0.33641254901885986, + "learning_rate": 9.151043976673676e-06, + "loss": 0.4173, + "step": 2491 + }, + { + "epoch": 1.3472697783384393, + "grad_norm": 0.34935262799263, + "learning_rate": 9.149991223523439e-06, + "loss": 0.4061, + "step": 2492 + }, + { + "epoch": 1.3478104162912237, + "grad_norm": 0.37855634093284607, + "learning_rate": 9.148937878674975e-06, + "loss": 0.4633, + "step": 2493 + }, + { + "epoch": 1.348351054244008, + "grad_norm": 0.3477320969104767, + "learning_rate": 9.147883942278474e-06, + "loss": 0.4195, + "step": 2494 + }, + { + "epoch": 1.3488916921967922, + "grad_norm": 0.39148709177970886, + "learning_rate": 9.146829414484198e-06, + "loss": 0.4698, + "step": 2495 + }, + { + "epoch": 1.3494323301495765, + "grad_norm": 0.40924400091171265, + "learning_rate": 9.145774295442504e-06, + "loss": 0.4393, + "step": 2496 + }, + { + "epoch": 1.3499729681023607, + "grad_norm": 0.49655890464782715, + "learning_rate": 9.144718585303829e-06, + "loss": 0.4513, + "step": 2497 + }, + { + "epoch": 1.350513606055145, + "grad_norm": 0.36215028166770935, + "learning_rate": 9.143662284218691e-06, + "loss": 0.451, + "step": 2498 + }, + { + "epoch": 1.3510542440079294, + "grad_norm": 0.37285175919532776, + "learning_rate": 9.142605392337697e-06, + "loss": 0.4216, + "step": 2499 + }, + { + "epoch": 1.3515948819607138, + "grad_norm": 0.4430399239063263, + "learning_rate": 9.14154790981154e-06, + "loss": 0.4176, + "step": 2500 + }, + { + "epoch": 1.352135519913498, + "grad_norm": 0.3936831057071686, + "learning_rate": 9.140489836790989e-06, + "loss": 0.4633, + "step": 2501 + }, + { + "epoch": 1.3526761578662823, + "grad_norm": 0.37488722801208496, + "learning_rate": 9.139431173426905e-06, + "loss": 0.4296, + "step": 2502 + }, + { + "epoch": 1.3532167958190664, + "grad_norm": 0.38206738233566284, + "learning_rate": 9.13837191987023e-06, + "loss": 0.4323, + "step": 2503 + }, + { + "epoch": 1.3537574337718508, + "grad_norm": 0.3599168062210083, + "learning_rate": 9.137312076271989e-06, + "loss": 0.4255, + "step": 2504 + }, + { + "epoch": 1.3542980717246351, + "grad_norm": 0.4199734032154083, + "learning_rate": 9.136251642783294e-06, + "loss": 0.4803, + "step": 2505 + }, + { + "epoch": 1.3548387096774195, + "grad_norm": 0.4077081084251404, + "learning_rate": 9.135190619555339e-06, + "loss": 0.4305, + "step": 2506 + }, + { + "epoch": 1.3553793476302036, + "grad_norm": 0.4488022029399872, + "learning_rate": 9.134129006739403e-06, + "loss": 0.4646, + "step": 2507 + }, + { + "epoch": 1.355919985582988, + "grad_norm": 0.3567434847354889, + "learning_rate": 9.13306680448685e-06, + "loss": 0.3833, + "step": 2508 + }, + { + "epoch": 1.3564606235357721, + "grad_norm": 0.46651482582092285, + "learning_rate": 9.132004012949124e-06, + "loss": 0.4489, + "step": 2509 + }, + { + "epoch": 1.3570012614885565, + "grad_norm": 0.3879334330558777, + "learning_rate": 9.130940632277757e-06, + "loss": 0.4286, + "step": 2510 + }, + { + "epoch": 1.3575418994413408, + "grad_norm": 0.37734895944595337, + "learning_rate": 9.129876662624366e-06, + "loss": 0.4454, + "step": 2511 + }, + { + "epoch": 1.358082537394125, + "grad_norm": 0.3508733808994293, + "learning_rate": 9.12881210414065e-06, + "loss": 0.4183, + "step": 2512 + }, + { + "epoch": 1.3586231753469094, + "grad_norm": 0.36155420541763306, + "learning_rate": 9.127746956978388e-06, + "loss": 0.4631, + "step": 2513 + }, + { + "epoch": 1.3591638132996937, + "grad_norm": 0.3453098237514496, + "learning_rate": 9.126681221289448e-06, + "loss": 0.4324, + "step": 2514 + }, + { + "epoch": 1.3597044512524779, + "grad_norm": 0.38498273491859436, + "learning_rate": 9.125614897225785e-06, + "loss": 0.4327, + "step": 2515 + }, + { + "epoch": 1.3602450892052622, + "grad_norm": 0.39644426107406616, + "learning_rate": 9.124547984939427e-06, + "loss": 0.4537, + "step": 2516 + }, + { + "epoch": 1.3607857271580466, + "grad_norm": 0.34871289134025574, + "learning_rate": 9.123480484582498e-06, + "loss": 0.43, + "step": 2517 + }, + { + "epoch": 1.3613263651108307, + "grad_norm": 0.3775876760482788, + "learning_rate": 9.122412396307196e-06, + "loss": 0.4601, + "step": 2518 + }, + { + "epoch": 1.361867003063615, + "grad_norm": 0.3927980959415436, + "learning_rate": 9.12134372026581e-06, + "loss": 0.4253, + "step": 2519 + }, + { + "epoch": 1.3624076410163992, + "grad_norm": 0.3621505796909332, + "learning_rate": 9.120274456610708e-06, + "loss": 0.4899, + "step": 2520 + }, + { + "epoch": 1.3629482789691836, + "grad_norm": 0.30624163150787354, + "learning_rate": 9.119204605494345e-06, + "loss": 0.4045, + "step": 2521 + }, + { + "epoch": 1.363488916921968, + "grad_norm": 0.2986149489879608, + "learning_rate": 9.118134167069258e-06, + "loss": 0.418, + "step": 2522 + }, + { + "epoch": 1.3640295548747523, + "grad_norm": 0.3559814989566803, + "learning_rate": 9.117063141488067e-06, + "loss": 0.4618, + "step": 2523 + }, + { + "epoch": 1.3645701928275364, + "grad_norm": 0.3439945578575134, + "learning_rate": 9.11599152890348e-06, + "loss": 0.4326, + "step": 2524 + }, + { + "epoch": 1.3651108307803208, + "grad_norm": 0.31808650493621826, + "learning_rate": 9.114919329468283e-06, + "loss": 0.4275, + "step": 2525 + }, + { + "epoch": 1.365651468733105, + "grad_norm": 0.3407554030418396, + "learning_rate": 9.113846543335349e-06, + "loss": 0.4439, + "step": 2526 + }, + { + "epoch": 1.3661921066858893, + "grad_norm": 0.3182757794857025, + "learning_rate": 9.112773170657631e-06, + "loss": 0.4177, + "step": 2527 + }, + { + "epoch": 1.3667327446386737, + "grad_norm": 0.34512054920196533, + "learning_rate": 9.111699211588175e-06, + "loss": 0.4515, + "step": 2528 + }, + { + "epoch": 1.367273382591458, + "grad_norm": 0.3278456926345825, + "learning_rate": 9.110624666280099e-06, + "loss": 0.439, + "step": 2529 + }, + { + "epoch": 1.3678140205442422, + "grad_norm": 0.30506500601768494, + "learning_rate": 9.10954953488661e-06, + "loss": 0.4169, + "step": 2530 + }, + { + "epoch": 1.3683546584970265, + "grad_norm": 0.3284112811088562, + "learning_rate": 9.108473817561e-06, + "loss": 0.4181, + "step": 2531 + }, + { + "epoch": 1.3688952964498107, + "grad_norm": 0.37319162487983704, + "learning_rate": 9.107397514456643e-06, + "loss": 0.4607, + "step": 2532 + }, + { + "epoch": 1.369435934402595, + "grad_norm": 0.31024980545043945, + "learning_rate": 9.106320625726995e-06, + "loss": 0.4059, + "step": 2533 + }, + { + "epoch": 1.3699765723553794, + "grad_norm": 0.3992244601249695, + "learning_rate": 9.105243151525598e-06, + "loss": 0.4932, + "step": 2534 + }, + { + "epoch": 1.3705172103081638, + "grad_norm": 0.3026205897331238, + "learning_rate": 9.104165092006075e-06, + "loss": 0.4204, + "step": 2535 + }, + { + "epoch": 1.371057848260948, + "grad_norm": 0.3684896230697632, + "learning_rate": 9.103086447322136e-06, + "loss": 0.4454, + "step": 2536 + }, + { + "epoch": 1.3715984862137323, + "grad_norm": 0.38126397132873535, + "learning_rate": 9.102007217627568e-06, + "loss": 0.4732, + "step": 2537 + }, + { + "epoch": 1.3721391241665164, + "grad_norm": 0.3135012686252594, + "learning_rate": 9.10092740307625e-06, + "loss": 0.4374, + "step": 2538 + }, + { + "epoch": 1.3726797621193008, + "grad_norm": 0.3915354311466217, + "learning_rate": 9.099847003822139e-06, + "loss": 0.4375, + "step": 2539 + }, + { + "epoch": 1.3732204000720851, + "grad_norm": 0.3551939129829407, + "learning_rate": 9.098766020019273e-06, + "loss": 0.4527, + "step": 2540 + }, + { + "epoch": 1.3737610380248693, + "grad_norm": 0.3653704524040222, + "learning_rate": 9.097684451821783e-06, + "loss": 0.3931, + "step": 2541 + }, + { + "epoch": 1.3743016759776536, + "grad_norm": 0.41091975569725037, + "learning_rate": 9.096602299383872e-06, + "loss": 0.4849, + "step": 2542 + }, + { + "epoch": 1.374842313930438, + "grad_norm": 0.35535484552383423, + "learning_rate": 9.09551956285983e-06, + "loss": 0.4221, + "step": 2543 + }, + { + "epoch": 1.3753829518832221, + "grad_norm": 0.4004269540309906, + "learning_rate": 9.094436242404039e-06, + "loss": 0.4366, + "step": 2544 + }, + { + "epoch": 1.3759235898360065, + "grad_norm": 0.3528960049152374, + "learning_rate": 9.09335233817095e-06, + "loss": 0.3928, + "step": 2545 + }, + { + "epoch": 1.3764642277887909, + "grad_norm": 0.510593056678772, + "learning_rate": 9.092267850315106e-06, + "loss": 0.4862, + "step": 2546 + }, + { + "epoch": 1.377004865741575, + "grad_norm": 0.3259771168231964, + "learning_rate": 9.091182778991132e-06, + "loss": 0.4215, + "step": 2547 + }, + { + "epoch": 1.3775455036943594, + "grad_norm": 0.451362669467926, + "learning_rate": 9.090097124353737e-06, + "loss": 0.4199, + "step": 2548 + }, + { + "epoch": 1.3780861416471435, + "grad_norm": 0.3616625964641571, + "learning_rate": 9.089010886557706e-06, + "loss": 0.4328, + "step": 2549 + }, + { + "epoch": 1.3786267795999279, + "grad_norm": 0.37003451585769653, + "learning_rate": 9.08792406575792e-06, + "loss": 0.4332, + "step": 2550 + }, + { + "epoch": 1.3791674175527122, + "grad_norm": 0.3810839354991913, + "learning_rate": 9.08683666210933e-06, + "loss": 0.4253, + "step": 2551 + }, + { + "epoch": 1.3797080555054966, + "grad_norm": 0.4266107380390167, + "learning_rate": 9.085748675766981e-06, + "loss": 0.4508, + "step": 2552 + }, + { + "epoch": 1.3802486934582807, + "grad_norm": 0.39522749185562134, + "learning_rate": 9.084660106885992e-06, + "loss": 0.4043, + "step": 2553 + }, + { + "epoch": 1.380789331411065, + "grad_norm": 0.4028782546520233, + "learning_rate": 9.083570955621572e-06, + "loss": 0.4656, + "step": 2554 + }, + { + "epoch": 1.3813299693638492, + "grad_norm": 0.34329450130462646, + "learning_rate": 9.082481222129008e-06, + "loss": 0.4385, + "step": 2555 + }, + { + "epoch": 1.3818706073166336, + "grad_norm": 0.4008656144142151, + "learning_rate": 9.081390906563675e-06, + "loss": 0.4881, + "step": 2556 + }, + { + "epoch": 1.382411245269418, + "grad_norm": 0.3827539384365082, + "learning_rate": 9.080300009081025e-06, + "loss": 0.4496, + "step": 2557 + }, + { + "epoch": 1.3829518832222023, + "grad_norm": 0.35076501965522766, + "learning_rate": 9.079208529836598e-06, + "loss": 0.4231, + "step": 2558 + }, + { + "epoch": 1.3834925211749864, + "grad_norm": 0.32431963086128235, + "learning_rate": 9.078116468986016e-06, + "loss": 0.4315, + "step": 2559 + }, + { + "epoch": 1.3840331591277708, + "grad_norm": 0.354658305644989, + "learning_rate": 9.07702382668498e-06, + "loss": 0.4423, + "step": 2560 + }, + { + "epoch": 1.384573797080555, + "grad_norm": 0.3761045038700104, + "learning_rate": 9.07593060308928e-06, + "loss": 0.4651, + "step": 2561 + }, + { + "epoch": 1.3851144350333393, + "grad_norm": 0.3039899468421936, + "learning_rate": 9.074836798354785e-06, + "loss": 0.3852, + "step": 2562 + }, + { + "epoch": 1.3856550729861237, + "grad_norm": 0.42946872115135193, + "learning_rate": 9.073742412637448e-06, + "loss": 0.4575, + "step": 2563 + }, + { + "epoch": 1.386195710938908, + "grad_norm": 0.3606507182121277, + "learning_rate": 9.072647446093304e-06, + "loss": 0.479, + "step": 2564 + }, + { + "epoch": 1.3867363488916922, + "grad_norm": 0.3425506353378296, + "learning_rate": 9.071551898878471e-06, + "loss": 0.4277, + "step": 2565 + }, + { + "epoch": 1.3872769868444765, + "grad_norm": 0.35136839747428894, + "learning_rate": 9.070455771149149e-06, + "loss": 0.4338, + "step": 2566 + }, + { + "epoch": 1.3878176247972607, + "grad_norm": 0.3452429473400116, + "learning_rate": 9.069359063061624e-06, + "loss": 0.4693, + "step": 2567 + }, + { + "epoch": 1.388358262750045, + "grad_norm": 0.338131308555603, + "learning_rate": 9.068261774772262e-06, + "loss": 0.4126, + "step": 2568 + }, + { + "epoch": 1.3888989007028294, + "grad_norm": 0.3617740571498871, + "learning_rate": 9.067163906437513e-06, + "loss": 0.4393, + "step": 2569 + }, + { + "epoch": 1.3894395386556138, + "grad_norm": 0.3760451674461365, + "learning_rate": 9.066065458213908e-06, + "loss": 0.4787, + "step": 2570 + }, + { + "epoch": 1.389980176608398, + "grad_norm": 0.35575684905052185, + "learning_rate": 9.064966430258064e-06, + "loss": 0.4009, + "step": 2571 + }, + { + "epoch": 1.3905208145611823, + "grad_norm": 0.4492211639881134, + "learning_rate": 9.063866822726675e-06, + "loss": 0.4452, + "step": 2572 + }, + { + "epoch": 1.3910614525139664, + "grad_norm": 0.31847888231277466, + "learning_rate": 9.062766635776523e-06, + "loss": 0.4393, + "step": 2573 + }, + { + "epoch": 1.3916020904667508, + "grad_norm": 0.38668292760849, + "learning_rate": 9.061665869564468e-06, + "loss": 0.4882, + "step": 2574 + }, + { + "epoch": 1.3921427284195351, + "grad_norm": 0.31565096974372864, + "learning_rate": 9.06056452424746e-06, + "loss": 0.3894, + "step": 2575 + }, + { + "epoch": 1.3926833663723193, + "grad_norm": 0.37927380204200745, + "learning_rate": 9.059462599982525e-06, + "loss": 0.4499, + "step": 2576 + }, + { + "epoch": 1.3932240043251036, + "grad_norm": 0.3331967890262604, + "learning_rate": 9.058360096926771e-06, + "loss": 0.4102, + "step": 2577 + }, + { + "epoch": 1.393764642277888, + "grad_norm": 0.3800790011882782, + "learning_rate": 9.057257015237394e-06, + "loss": 0.4605, + "step": 2578 + }, + { + "epoch": 1.3943052802306721, + "grad_norm": 0.3449414074420929, + "learning_rate": 9.056153355071668e-06, + "loss": 0.4421, + "step": 2579 + }, + { + "epoch": 1.3948459181834565, + "grad_norm": 0.3734491765499115, + "learning_rate": 9.055049116586951e-06, + "loss": 0.4663, + "step": 2580 + }, + { + "epoch": 1.3953865561362409, + "grad_norm": 0.4147307276725769, + "learning_rate": 9.05394429994068e-06, + "loss": 0.4177, + "step": 2581 + }, + { + "epoch": 1.395927194089025, + "grad_norm": 0.3757738173007965, + "learning_rate": 9.052838905290386e-06, + "loss": 0.4443, + "step": 2582 + }, + { + "epoch": 1.3964678320418094, + "grad_norm": 0.39692026376724243, + "learning_rate": 9.051732932793667e-06, + "loss": 0.4309, + "step": 2583 + }, + { + "epoch": 1.3970084699945935, + "grad_norm": 0.41125962138175964, + "learning_rate": 9.050626382608212e-06, + "loss": 0.4539, + "step": 2584 + }, + { + "epoch": 1.3975491079473779, + "grad_norm": 0.34175458550453186, + "learning_rate": 9.049519254891793e-06, + "loss": 0.4387, + "step": 2585 + }, + { + "epoch": 1.3980897459001622, + "grad_norm": 0.38721558451652527, + "learning_rate": 9.048411549802259e-06, + "loss": 0.4768, + "step": 2586 + }, + { + "epoch": 1.3986303838529466, + "grad_norm": 0.3092331886291504, + "learning_rate": 9.047303267497547e-06, + "loss": 0.397, + "step": 2587 + }, + { + "epoch": 1.3991710218057307, + "grad_norm": 0.3423914611339569, + "learning_rate": 9.046194408135673e-06, + "loss": 0.4816, + "step": 2588 + }, + { + "epoch": 1.399711659758515, + "grad_norm": 0.3521196246147156, + "learning_rate": 9.045084971874738e-06, + "loss": 0.4182, + "step": 2589 + }, + { + "epoch": 1.4002522977112992, + "grad_norm": 0.3921590745449066, + "learning_rate": 9.04397495887292e-06, + "loss": 0.4861, + "step": 2590 + }, + { + "epoch": 1.4007929356640836, + "grad_norm": 0.32359740138053894, + "learning_rate": 9.042864369288487e-06, + "loss": 0.438, + "step": 2591 + }, + { + "epoch": 1.401333573616868, + "grad_norm": 0.3559328317642212, + "learning_rate": 9.041753203279781e-06, + "loss": 0.4198, + "step": 2592 + }, + { + "epoch": 1.4018742115696523, + "grad_norm": 0.38676345348358154, + "learning_rate": 9.040641461005232e-06, + "loss": 0.4698, + "step": 2593 + }, + { + "epoch": 1.4024148495224364, + "grad_norm": 0.3294946849346161, + "learning_rate": 9.039529142623348e-06, + "loss": 0.4192, + "step": 2594 + }, + { + "epoch": 1.4029554874752208, + "grad_norm": 0.35132449865341187, + "learning_rate": 9.038416248292725e-06, + "loss": 0.425, + "step": 2595 + }, + { + "epoch": 1.403496125428005, + "grad_norm": 0.3458995521068573, + "learning_rate": 9.037302778172034e-06, + "loss": 0.4588, + "step": 2596 + }, + { + "epoch": 1.4040367633807893, + "grad_norm": 0.3966250419616699, + "learning_rate": 9.036188732420035e-06, + "loss": 0.4995, + "step": 2597 + }, + { + "epoch": 1.4045774013335737, + "grad_norm": 0.32241538166999817, + "learning_rate": 9.035074111195563e-06, + "loss": 0.3888, + "step": 2598 + }, + { + "epoch": 1.405118039286358, + "grad_norm": 0.3781411051750183, + "learning_rate": 9.03395891465754e-06, + "loss": 0.4392, + "step": 2599 + }, + { + "epoch": 1.4056586772391422, + "grad_norm": 0.3311634957790375, + "learning_rate": 9.03284314296497e-06, + "loss": 0.4485, + "step": 2600 + }, + { + "epoch": 1.4061993151919265, + "grad_norm": 0.3563523590564728, + "learning_rate": 9.031726796276935e-06, + "loss": 0.4227, + "step": 2601 + }, + { + "epoch": 1.4067399531447107, + "grad_norm": 0.3249850869178772, + "learning_rate": 9.030609874752604e-06, + "loss": 0.4468, + "step": 2602 + }, + { + "epoch": 1.407280591097495, + "grad_norm": 0.30806684494018555, + "learning_rate": 9.029492378551228e-06, + "loss": 0.4329, + "step": 2603 + }, + { + "epoch": 1.4078212290502794, + "grad_norm": 0.38034096360206604, + "learning_rate": 9.028374307832131e-06, + "loss": 0.4332, + "step": 2604 + }, + { + "epoch": 1.4083618670030635, + "grad_norm": 0.33460333943367004, + "learning_rate": 9.02725566275473e-06, + "loss": 0.4521, + "step": 2605 + }, + { + "epoch": 1.408902504955848, + "grad_norm": 0.35097768902778625, + "learning_rate": 9.02613644347852e-06, + "loss": 0.4261, + "step": 2606 + }, + { + "epoch": 1.4094431429086323, + "grad_norm": 0.3345774710178375, + "learning_rate": 9.025016650163074e-06, + "loss": 0.4488, + "step": 2607 + }, + { + "epoch": 1.4099837808614164, + "grad_norm": 0.3553728759288788, + "learning_rate": 9.023896282968052e-06, + "loss": 0.4551, + "step": 2608 + }, + { + "epoch": 1.4105244188142008, + "grad_norm": 0.34653764963150024, + "learning_rate": 9.022775342053194e-06, + "loss": 0.4303, + "step": 2609 + }, + { + "epoch": 1.4110650567669851, + "grad_norm": 0.3573172688484192, + "learning_rate": 9.021653827578322e-06, + "loss": 0.4306, + "step": 2610 + }, + { + "epoch": 1.4116056947197693, + "grad_norm": 0.43176814913749695, + "learning_rate": 9.020531739703338e-06, + "loss": 0.4513, + "step": 2611 + }, + { + "epoch": 1.4121463326725536, + "grad_norm": 0.3422747850418091, + "learning_rate": 9.01940907858823e-06, + "loss": 0.4489, + "step": 2612 + }, + { + "epoch": 1.4126869706253378, + "grad_norm": 0.37848156690597534, + "learning_rate": 9.018285844393061e-06, + "loss": 0.3838, + "step": 2613 + }, + { + "epoch": 1.4132276085781221, + "grad_norm": 0.44722023606300354, + "learning_rate": 9.017162037277983e-06, + "loss": 0.4741, + "step": 2614 + }, + { + "epoch": 1.4137682465309065, + "grad_norm": 0.38165798783302307, + "learning_rate": 9.016037657403225e-06, + "loss": 0.4159, + "step": 2615 + }, + { + "epoch": 1.4143088844836909, + "grad_norm": 0.37461984157562256, + "learning_rate": 9.0149127049291e-06, + "loss": 0.4323, + "step": 2616 + }, + { + "epoch": 1.414849522436475, + "grad_norm": 0.3761579692363739, + "learning_rate": 9.013787180016e-06, + "loss": 0.4729, + "step": 2617 + }, + { + "epoch": 1.4153901603892594, + "grad_norm": 0.3706890046596527, + "learning_rate": 9.012661082824404e-06, + "loss": 0.4611, + "step": 2618 + }, + { + "epoch": 1.4159307983420435, + "grad_norm": 0.33320996165275574, + "learning_rate": 9.011534413514862e-06, + "loss": 0.4077, + "step": 2619 + }, + { + "epoch": 1.4164714362948279, + "grad_norm": 0.37726783752441406, + "learning_rate": 9.01040717224802e-06, + "loss": 0.408, + "step": 2620 + }, + { + "epoch": 1.4170120742476122, + "grad_norm": 0.4023379683494568, + "learning_rate": 9.009279359184594e-06, + "loss": 0.4518, + "step": 2621 + }, + { + "epoch": 1.4175527122003966, + "grad_norm": 0.3516872525215149, + "learning_rate": 9.008150974485386e-06, + "loss": 0.4082, + "step": 2622 + }, + { + "epoch": 1.4180933501531807, + "grad_norm": 0.4246536195278168, + "learning_rate": 9.007022018311277e-06, + "loss": 0.4818, + "step": 2623 + }, + { + "epoch": 1.418633988105965, + "grad_norm": 0.3128785490989685, + "learning_rate": 9.005892490823237e-06, + "loss": 0.4043, + "step": 2624 + }, + { + "epoch": 1.4191746260587492, + "grad_norm": 0.3647494316101074, + "learning_rate": 9.004762392182307e-06, + "loss": 0.4162, + "step": 2625 + }, + { + "epoch": 1.4197152640115336, + "grad_norm": 0.3745572865009308, + "learning_rate": 9.003631722549617e-06, + "loss": 0.4648, + "step": 2626 + }, + { + "epoch": 1.420255901964318, + "grad_norm": 0.3839242458343506, + "learning_rate": 9.002500482086377e-06, + "loss": 0.42, + "step": 2627 + }, + { + "epoch": 1.4207965399171023, + "grad_norm": 0.354693740606308, + "learning_rate": 9.001368670953872e-06, + "loss": 0.4153, + "step": 2628 + }, + { + "epoch": 1.4213371778698864, + "grad_norm": 0.3716014325618744, + "learning_rate": 9.000236289313479e-06, + "loss": 0.4531, + "step": 2629 + }, + { + "epoch": 1.4218778158226708, + "grad_norm": 0.3571845293045044, + "learning_rate": 8.999103337326646e-06, + "loss": 0.4221, + "step": 2630 + }, + { + "epoch": 1.422418453775455, + "grad_norm": 0.3787524998188019, + "learning_rate": 8.997969815154913e-06, + "loss": 0.4424, + "step": 2631 + }, + { + "epoch": 1.4229590917282393, + "grad_norm": 0.37358084321022034, + "learning_rate": 8.99683572295989e-06, + "loss": 0.436, + "step": 2632 + }, + { + "epoch": 1.4234997296810237, + "grad_norm": 0.37591075897216797, + "learning_rate": 8.995701060903279e-06, + "loss": 0.4121, + "step": 2633 + }, + { + "epoch": 1.4240403676338078, + "grad_norm": 0.3382313847541809, + "learning_rate": 8.994565829146855e-06, + "loss": 0.4449, + "step": 2634 + }, + { + "epoch": 1.4245810055865922, + "grad_norm": 0.3377707600593567, + "learning_rate": 8.993430027852476e-06, + "loss": 0.4205, + "step": 2635 + }, + { + "epoch": 1.4251216435393765, + "grad_norm": 0.34145236015319824, + "learning_rate": 8.992293657182085e-06, + "loss": 0.4755, + "step": 2636 + }, + { + "epoch": 1.4256622814921607, + "grad_norm": 0.36096832156181335, + "learning_rate": 8.991156717297702e-06, + "loss": 0.4393, + "step": 2637 + }, + { + "epoch": 1.426202919444945, + "grad_norm": 0.33001676201820374, + "learning_rate": 8.990019208361432e-06, + "loss": 0.4076, + "step": 2638 + }, + { + "epoch": 1.4267435573977294, + "grad_norm": 0.357586145401001, + "learning_rate": 8.988881130535459e-06, + "loss": 0.4401, + "step": 2639 + }, + { + "epoch": 1.4272841953505135, + "grad_norm": 0.37538856267929077, + "learning_rate": 8.987742483982044e-06, + "loss": 0.4397, + "step": 2640 + }, + { + "epoch": 1.427824833303298, + "grad_norm": 0.4129542410373688, + "learning_rate": 8.986603268863536e-06, + "loss": 0.48, + "step": 2641 + }, + { + "epoch": 1.428365471256082, + "grad_norm": 0.36001160740852356, + "learning_rate": 8.985463485342363e-06, + "loss": 0.4259, + "step": 2642 + }, + { + "epoch": 1.4289061092088664, + "grad_norm": 0.3682215213775635, + "learning_rate": 8.984323133581032e-06, + "loss": 0.4293, + "step": 2643 + }, + { + "epoch": 1.4294467471616508, + "grad_norm": 0.4558807909488678, + "learning_rate": 8.983182213742135e-06, + "loss": 0.4569, + "step": 2644 + }, + { + "epoch": 1.4299873851144351, + "grad_norm": 0.34552398324012756, + "learning_rate": 8.982040725988337e-06, + "loss": 0.4642, + "step": 2645 + }, + { + "epoch": 1.4305280230672193, + "grad_norm": 0.42189887166023254, + "learning_rate": 8.980898670482392e-06, + "loss": 0.441, + "step": 2646 + }, + { + "epoch": 1.4310686610200036, + "grad_norm": 0.3556559085845947, + "learning_rate": 8.979756047387134e-06, + "loss": 0.3828, + "step": 2647 + }, + { + "epoch": 1.4316092989727878, + "grad_norm": 0.36019167304039, + "learning_rate": 8.978612856865474e-06, + "loss": 0.4522, + "step": 2648 + }, + { + "epoch": 1.4321499369255721, + "grad_norm": 0.34309664368629456, + "learning_rate": 8.977469099080405e-06, + "loss": 0.4102, + "step": 2649 + }, + { + "epoch": 1.4326905748783565, + "grad_norm": 0.3519548773765564, + "learning_rate": 8.976324774195005e-06, + "loss": 0.4334, + "step": 2650 + }, + { + "epoch": 1.4332312128311409, + "grad_norm": 0.3870348632335663, + "learning_rate": 8.975179882372428e-06, + "loss": 0.4601, + "step": 2651 + }, + { + "epoch": 1.433771850783925, + "grad_norm": 0.32236963510513306, + "learning_rate": 8.974034423775912e-06, + "loss": 0.4295, + "step": 2652 + }, + { + "epoch": 1.4343124887367094, + "grad_norm": 0.3496725559234619, + "learning_rate": 8.972888398568772e-06, + "loss": 0.405, + "step": 2653 + }, + { + "epoch": 1.4348531266894935, + "grad_norm": 0.28755730390548706, + "learning_rate": 8.971741806914409e-06, + "loss": 0.3739, + "step": 2654 + }, + { + "epoch": 1.4353937646422779, + "grad_norm": 0.34134241938591003, + "learning_rate": 8.970594648976299e-06, + "loss": 0.4223, + "step": 2655 + }, + { + "epoch": 1.4359344025950622, + "grad_norm": 0.38335177302360535, + "learning_rate": 8.969446924918001e-06, + "loss": 0.4435, + "step": 2656 + }, + { + "epoch": 1.4364750405478466, + "grad_norm": 0.3327115476131439, + "learning_rate": 8.96829863490316e-06, + "loss": 0.4378, + "step": 2657 + }, + { + "epoch": 1.4370156785006307, + "grad_norm": 0.333930641412735, + "learning_rate": 8.967149779095494e-06, + "loss": 0.4356, + "step": 2658 + }, + { + "epoch": 1.437556316453415, + "grad_norm": 0.34585675597190857, + "learning_rate": 8.966000357658807e-06, + "loss": 0.4388, + "step": 2659 + }, + { + "epoch": 1.4380969544061992, + "grad_norm": 0.3636191487312317, + "learning_rate": 8.964850370756978e-06, + "loss": 0.406, + "step": 2660 + }, + { + "epoch": 1.4386375923589836, + "grad_norm": 0.3512509763240814, + "learning_rate": 8.963699818553972e-06, + "loss": 0.4522, + "step": 2661 + }, + { + "epoch": 1.439178230311768, + "grad_norm": 0.35486963391304016, + "learning_rate": 8.962548701213834e-06, + "loss": 0.4345, + "step": 2662 + }, + { + "epoch": 1.4397188682645523, + "grad_norm": 0.3926379680633545, + "learning_rate": 8.961397018900685e-06, + "loss": 0.4312, + "step": 2663 + }, + { + "epoch": 1.4402595062173364, + "grad_norm": 0.3730280101299286, + "learning_rate": 8.960244771778732e-06, + "loss": 0.4754, + "step": 2664 + }, + { + "epoch": 1.4408001441701208, + "grad_norm": 0.34494009613990784, + "learning_rate": 8.95909196001226e-06, + "loss": 0.4289, + "step": 2665 + }, + { + "epoch": 1.441340782122905, + "grad_norm": 0.373043030500412, + "learning_rate": 8.957938583765636e-06, + "loss": 0.4349, + "step": 2666 + }, + { + "epoch": 1.4418814200756893, + "grad_norm": 0.35607317090034485, + "learning_rate": 8.956784643203303e-06, + "loss": 0.454, + "step": 2667 + }, + { + "epoch": 1.4424220580284737, + "grad_norm": 0.33781003952026367, + "learning_rate": 8.955630138489788e-06, + "loss": 0.4377, + "step": 2668 + }, + { + "epoch": 1.4429626959812578, + "grad_norm": 0.3684767186641693, + "learning_rate": 8.954475069789703e-06, + "loss": 0.4445, + "step": 2669 + }, + { + "epoch": 1.4435033339340422, + "grad_norm": 0.34141626954078674, + "learning_rate": 8.953319437267731e-06, + "loss": 0.3888, + "step": 2670 + }, + { + "epoch": 1.4440439718868265, + "grad_norm": 0.3129776418209076, + "learning_rate": 8.952163241088642e-06, + "loss": 0.4355, + "step": 2671 + }, + { + "epoch": 1.4445846098396107, + "grad_norm": 0.36519676446914673, + "learning_rate": 8.951006481417284e-06, + "loss": 0.4312, + "step": 2672 + }, + { + "epoch": 1.445125247792395, + "grad_norm": 0.40581244230270386, + "learning_rate": 8.949849158418586e-06, + "loss": 0.4849, + "step": 2673 + }, + { + "epoch": 1.4456658857451794, + "grad_norm": 0.3111916184425354, + "learning_rate": 8.948691272257555e-06, + "loss": 0.3988, + "step": 2674 + }, + { + "epoch": 1.4462065236979635, + "grad_norm": 0.4079495370388031, + "learning_rate": 8.947532823099284e-06, + "loss": 0.4706, + "step": 2675 + }, + { + "epoch": 1.446747161650748, + "grad_norm": 0.3530767261981964, + "learning_rate": 8.946373811108939e-06, + "loss": 0.4219, + "step": 2676 + }, + { + "epoch": 1.447287799603532, + "grad_norm": 0.3372650742530823, + "learning_rate": 8.94521423645177e-06, + "loss": 0.3823, + "step": 2677 + }, + { + "epoch": 1.4478284375563164, + "grad_norm": 0.383865088224411, + "learning_rate": 8.944054099293109e-06, + "loss": 0.4292, + "step": 2678 + }, + { + "epoch": 1.4483690755091008, + "grad_norm": 0.3813159167766571, + "learning_rate": 8.942893399798367e-06, + "loss": 0.4788, + "step": 2679 + }, + { + "epoch": 1.4489097134618851, + "grad_norm": 0.3461150825023651, + "learning_rate": 8.941732138133032e-06, + "loss": 0.3961, + "step": 2680 + }, + { + "epoch": 1.4494503514146693, + "grad_norm": 0.42276474833488464, + "learning_rate": 8.940570314462676e-06, + "loss": 0.4753, + "step": 2681 + }, + { + "epoch": 1.4499909893674536, + "grad_norm": 0.3687373101711273, + "learning_rate": 8.93940792895295e-06, + "loss": 0.4402, + "step": 2682 + }, + { + "epoch": 1.4505316273202378, + "grad_norm": 0.3514117896556854, + "learning_rate": 8.938244981769581e-06, + "loss": 0.4164, + "step": 2683 + }, + { + "epoch": 1.4510722652730221, + "grad_norm": 0.3533627688884735, + "learning_rate": 8.937081473078387e-06, + "loss": 0.4901, + "step": 2684 + }, + { + "epoch": 1.4516129032258065, + "grad_norm": 0.4426608383655548, + "learning_rate": 8.935917403045251e-06, + "loss": 0.466, + "step": 2685 + }, + { + "epoch": 1.4521535411785909, + "grad_norm": 0.34184348583221436, + "learning_rate": 8.93475277183615e-06, + "loss": 0.4276, + "step": 2686 + }, + { + "epoch": 1.452694179131375, + "grad_norm": 0.4027365446090698, + "learning_rate": 8.933587579617134e-06, + "loss": 0.4524, + "step": 2687 + }, + { + "epoch": 1.4532348170841594, + "grad_norm": 0.34122905135154724, + "learning_rate": 8.932421826554332e-06, + "loss": 0.4221, + "step": 2688 + }, + { + "epoch": 1.4537754550369435, + "grad_norm": 0.36274346709251404, + "learning_rate": 8.931255512813954e-06, + "loss": 0.4649, + "step": 2689 + }, + { + "epoch": 1.4543160929897279, + "grad_norm": 0.3560095727443695, + "learning_rate": 8.930088638562296e-06, + "loss": 0.435, + "step": 2690 + }, + { + "epoch": 1.4548567309425122, + "grad_norm": 0.31499168276786804, + "learning_rate": 8.928921203965724e-06, + "loss": 0.4129, + "step": 2691 + }, + { + "epoch": 1.4553973688952966, + "grad_norm": 0.3829094469547272, + "learning_rate": 8.927753209190691e-06, + "loss": 0.4623, + "step": 2692 + }, + { + "epoch": 1.4559380068480807, + "grad_norm": 0.32538989186286926, + "learning_rate": 8.926584654403725e-06, + "loss": 0.4188, + "step": 2693 + }, + { + "epoch": 1.456478644800865, + "grad_norm": 0.3670380711555481, + "learning_rate": 8.925415539771441e-06, + "loss": 0.4482, + "step": 2694 + }, + { + "epoch": 1.4570192827536492, + "grad_norm": 0.3919623792171478, + "learning_rate": 8.924245865460523e-06, + "loss": 0.4852, + "step": 2695 + }, + { + "epoch": 1.4575599207064336, + "grad_norm": 0.33094435930252075, + "learning_rate": 8.923075631637748e-06, + "loss": 0.3942, + "step": 2696 + }, + { + "epoch": 1.458100558659218, + "grad_norm": 0.3392255902290344, + "learning_rate": 8.921904838469962e-06, + "loss": 0.4479, + "step": 2697 + }, + { + "epoch": 1.458641196612002, + "grad_norm": 0.3547809422016144, + "learning_rate": 8.920733486124093e-06, + "loss": 0.4303, + "step": 2698 + }, + { + "epoch": 1.4591818345647865, + "grad_norm": 0.38857102394104004, + "learning_rate": 8.919561574767154e-06, + "loss": 0.4945, + "step": 2699 + }, + { + "epoch": 1.4597224725175708, + "grad_norm": 0.2873074412345886, + "learning_rate": 8.918389104566232e-06, + "loss": 0.3641, + "step": 2700 + }, + { + "epoch": 1.460263110470355, + "grad_norm": 0.40835830569267273, + "learning_rate": 8.917216075688496e-06, + "loss": 0.4893, + "step": 2701 + }, + { + "epoch": 1.4608037484231393, + "grad_norm": 0.29825371503829956, + "learning_rate": 8.916042488301195e-06, + "loss": 0.427, + "step": 2702 + }, + { + "epoch": 1.4613443863759237, + "grad_norm": 0.38785210251808167, + "learning_rate": 8.914868342571655e-06, + "loss": 0.4327, + "step": 2703 + }, + { + "epoch": 1.4618850243287078, + "grad_norm": 0.32820212841033936, + "learning_rate": 8.913693638667284e-06, + "loss": 0.4663, + "step": 2704 + }, + { + "epoch": 1.4624256622814922, + "grad_norm": 0.32327958941459656, + "learning_rate": 8.912518376755572e-06, + "loss": 0.4259, + "step": 2705 + }, + { + "epoch": 1.4629663002342763, + "grad_norm": 0.3118366003036499, + "learning_rate": 8.911342557004084e-06, + "loss": 0.411, + "step": 2706 + }, + { + "epoch": 1.4635069381870607, + "grad_norm": 0.32654812932014465, + "learning_rate": 8.910166179580463e-06, + "loss": 0.4306, + "step": 2707 + }, + { + "epoch": 1.464047576139845, + "grad_norm": 0.3658846318721771, + "learning_rate": 8.90898924465244e-06, + "loss": 0.462, + "step": 2708 + }, + { + "epoch": 1.4645882140926294, + "grad_norm": 0.3317575752735138, + "learning_rate": 8.907811752387818e-06, + "loss": 0.4846, + "step": 2709 + }, + { + "epoch": 1.4651288520454135, + "grad_norm": 0.38094812631607056, + "learning_rate": 8.906633702954482e-06, + "loss": 0.4601, + "step": 2710 + }, + { + "epoch": 1.465669489998198, + "grad_norm": 0.35576844215393066, + "learning_rate": 8.905455096520394e-06, + "loss": 0.4251, + "step": 2711 + }, + { + "epoch": 1.466210127950982, + "grad_norm": 0.31583791971206665, + "learning_rate": 8.9042759332536e-06, + "loss": 0.4207, + "step": 2712 + }, + { + "epoch": 1.4667507659037664, + "grad_norm": 0.4097664952278137, + "learning_rate": 8.903096213322222e-06, + "loss": 0.4022, + "step": 2713 + }, + { + "epoch": 1.4672914038565508, + "grad_norm": 0.34725257754325867, + "learning_rate": 8.901915936894462e-06, + "loss": 0.4665, + "step": 2714 + }, + { + "epoch": 1.4678320418093351, + "grad_norm": 0.389870285987854, + "learning_rate": 8.900735104138605e-06, + "loss": 0.4675, + "step": 2715 + }, + { + "epoch": 1.4683726797621193, + "grad_norm": 0.3505401909351349, + "learning_rate": 8.899553715223008e-06, + "loss": 0.4101, + "step": 2716 + }, + { + "epoch": 1.4689133177149036, + "grad_norm": 0.44681039452552795, + "learning_rate": 8.898371770316113e-06, + "loss": 0.4471, + "step": 2717 + }, + { + "epoch": 1.4694539556676878, + "grad_norm": 0.4092337191104889, + "learning_rate": 8.89718926958644e-06, + "loss": 0.4758, + "step": 2718 + }, + { + "epoch": 1.4699945936204721, + "grad_norm": 0.3586428761482239, + "learning_rate": 8.896006213202584e-06, + "loss": 0.4015, + "step": 2719 + }, + { + "epoch": 1.4705352315732565, + "grad_norm": 0.40717989206314087, + "learning_rate": 8.894822601333228e-06, + "loss": 0.4515, + "step": 2720 + }, + { + "epoch": 1.4710758695260409, + "grad_norm": 0.3298538327217102, + "learning_rate": 8.893638434147126e-06, + "loss": 0.4324, + "step": 2721 + }, + { + "epoch": 1.471616507478825, + "grad_norm": 0.3751235902309418, + "learning_rate": 8.892453711813119e-06, + "loss": 0.4405, + "step": 2722 + }, + { + "epoch": 1.4721571454316094, + "grad_norm": 0.4075682759284973, + "learning_rate": 8.891268434500116e-06, + "loss": 0.4411, + "step": 2723 + }, + { + "epoch": 1.4726977833843935, + "grad_norm": 0.35277068614959717, + "learning_rate": 8.890082602377115e-06, + "loss": 0.4457, + "step": 2724 + }, + { + "epoch": 1.4732384213371779, + "grad_norm": 0.36235764622688293, + "learning_rate": 8.888896215613192e-06, + "loss": 0.4229, + "step": 2725 + }, + { + "epoch": 1.4737790592899622, + "grad_norm": 0.36335426568984985, + "learning_rate": 8.887709274377496e-06, + "loss": 0.4636, + "step": 2726 + }, + { + "epoch": 1.4743196972427464, + "grad_norm": 0.340276300907135, + "learning_rate": 8.88652177883926e-06, + "loss": 0.4184, + "step": 2727 + }, + { + "epoch": 1.4748603351955307, + "grad_norm": 0.3393535017967224, + "learning_rate": 8.885333729167797e-06, + "loss": 0.4512, + "step": 2728 + }, + { + "epoch": 1.475400973148315, + "grad_norm": 0.38643527030944824, + "learning_rate": 8.884145125532494e-06, + "loss": 0.3916, + "step": 2729 + }, + { + "epoch": 1.4759416111010992, + "grad_norm": 0.3693040609359741, + "learning_rate": 8.882955968102822e-06, + "loss": 0.4452, + "step": 2730 + }, + { + "epoch": 1.4764822490538836, + "grad_norm": 0.35152468085289, + "learning_rate": 8.881766257048328e-06, + "loss": 0.45, + "step": 2731 + }, + { + "epoch": 1.477022887006668, + "grad_norm": 0.35957586765289307, + "learning_rate": 8.88057599253864e-06, + "loss": 0.4246, + "step": 2732 + }, + { + "epoch": 1.477563524959452, + "grad_norm": 0.3535710871219635, + "learning_rate": 8.879385174743462e-06, + "loss": 0.4292, + "step": 2733 + }, + { + "epoch": 1.4781041629122365, + "grad_norm": 0.34068745374679565, + "learning_rate": 8.87819380383258e-06, + "loss": 0.4507, + "step": 2734 + }, + { + "epoch": 1.4786448008650206, + "grad_norm": 0.37489134073257446, + "learning_rate": 8.877001879975857e-06, + "loss": 0.4613, + "step": 2735 + }, + { + "epoch": 1.479185438817805, + "grad_norm": 0.29304268956184387, + "learning_rate": 8.875809403343236e-06, + "loss": 0.3901, + "step": 2736 + }, + { + "epoch": 1.4797260767705893, + "grad_norm": 0.3183952569961548, + "learning_rate": 8.874616374104736e-06, + "loss": 0.4281, + "step": 2737 + }, + { + "epoch": 1.4802667147233737, + "grad_norm": 0.33998361229896545, + "learning_rate": 8.87342279243046e-06, + "loss": 0.4294, + "step": 2738 + }, + { + "epoch": 1.4808073526761578, + "grad_norm": 0.34192410111427307, + "learning_rate": 8.872228658490585e-06, + "loss": 0.4864, + "step": 2739 + }, + { + "epoch": 1.4813479906289422, + "grad_norm": 0.3485790491104126, + "learning_rate": 8.87103397245537e-06, + "loss": 0.4256, + "step": 2740 + }, + { + "epoch": 1.4818886285817263, + "grad_norm": 0.33720722794532776, + "learning_rate": 8.869838734495147e-06, + "loss": 0.4147, + "step": 2741 + }, + { + "epoch": 1.4824292665345107, + "grad_norm": 0.35417404770851135, + "learning_rate": 8.868642944780334e-06, + "loss": 0.4734, + "step": 2742 + }, + { + "epoch": 1.482969904487295, + "grad_norm": 0.38683000206947327, + "learning_rate": 8.867446603481427e-06, + "loss": 0.4333, + "step": 2743 + }, + { + "epoch": 1.4835105424400794, + "grad_norm": 0.37127575278282166, + "learning_rate": 8.866249710768992e-06, + "loss": 0.4378, + "step": 2744 + }, + { + "epoch": 1.4840511803928635, + "grad_norm": 0.3404901921749115, + "learning_rate": 8.865052266813686e-06, + "loss": 0.4425, + "step": 2745 + }, + { + "epoch": 1.484591818345648, + "grad_norm": 0.3562512695789337, + "learning_rate": 8.863854271786234e-06, + "loss": 0.4009, + "step": 2746 + }, + { + "epoch": 1.485132456298432, + "grad_norm": 0.3635586202144623, + "learning_rate": 8.862655725857445e-06, + "loss": 0.4643, + "step": 2747 + }, + { + "epoch": 1.4856730942512164, + "grad_norm": 0.3443708121776581, + "learning_rate": 8.861456629198209e-06, + "loss": 0.4301, + "step": 2748 + }, + { + "epoch": 1.4862137322040008, + "grad_norm": 0.34321269392967224, + "learning_rate": 8.860256981979485e-06, + "loss": 0.418, + "step": 2749 + }, + { + "epoch": 1.4867543701567851, + "grad_norm": 0.38231542706489563, + "learning_rate": 8.85905678437232e-06, + "loss": 0.4622, + "step": 2750 + }, + { + "epoch": 1.4872950081095693, + "grad_norm": 0.3586867153644562, + "learning_rate": 8.857856036547837e-06, + "loss": 0.4336, + "step": 2751 + }, + { + "epoch": 1.4878356460623536, + "grad_norm": 0.41343262791633606, + "learning_rate": 8.856654738677234e-06, + "loss": 0.4435, + "step": 2752 + }, + { + "epoch": 1.4883762840151378, + "grad_norm": 0.37144145369529724, + "learning_rate": 8.85545289093179e-06, + "loss": 0.4096, + "step": 2753 + }, + { + "epoch": 1.4889169219679221, + "grad_norm": 0.4296935796737671, + "learning_rate": 8.854250493482865e-06, + "loss": 0.4716, + "step": 2754 + }, + { + "epoch": 1.4894575599207065, + "grad_norm": 0.3688115179538727, + "learning_rate": 8.853047546501893e-06, + "loss": 0.4137, + "step": 2755 + }, + { + "epoch": 1.4899981978734906, + "grad_norm": 0.3428502082824707, + "learning_rate": 8.851844050160387e-06, + "loss": 0.4605, + "step": 2756 + }, + { + "epoch": 1.490538835826275, + "grad_norm": 0.3452523350715637, + "learning_rate": 8.85064000462994e-06, + "loss": 0.4294, + "step": 2757 + }, + { + "epoch": 1.4910794737790594, + "grad_norm": 0.3571106791496277, + "learning_rate": 8.849435410082224e-06, + "loss": 0.4651, + "step": 2758 + }, + { + "epoch": 1.4916201117318435, + "grad_norm": 0.40978145599365234, + "learning_rate": 8.848230266688984e-06, + "loss": 0.5045, + "step": 2759 + }, + { + "epoch": 1.4921607496846279, + "grad_norm": 0.33386868238449097, + "learning_rate": 8.847024574622051e-06, + "loss": 0.3977, + "step": 2760 + }, + { + "epoch": 1.4927013876374122, + "grad_norm": 0.32085907459259033, + "learning_rate": 8.845818334053332e-06, + "loss": 0.4055, + "step": 2761 + }, + { + "epoch": 1.4932420255901964, + "grad_norm": 0.35079190135002136, + "learning_rate": 8.844611545154804e-06, + "loss": 0.3969, + "step": 2762 + }, + { + "epoch": 1.4937826635429807, + "grad_norm": 0.3813706934452057, + "learning_rate": 8.843404208098536e-06, + "loss": 0.4684, + "step": 2763 + }, + { + "epoch": 1.4943233014957649, + "grad_norm": 0.3383070230484009, + "learning_rate": 8.842196323056662e-06, + "loss": 0.4306, + "step": 2764 + }, + { + "epoch": 1.4948639394485492, + "grad_norm": 0.37092912197113037, + "learning_rate": 8.840987890201404e-06, + "loss": 0.4309, + "step": 2765 + }, + { + "epoch": 1.4954045774013336, + "grad_norm": 0.3601703941822052, + "learning_rate": 8.839778909705055e-06, + "loss": 0.484, + "step": 2766 + }, + { + "epoch": 1.495945215354118, + "grad_norm": 0.36565494537353516, + "learning_rate": 8.838569381739993e-06, + "loss": 0.4247, + "step": 2767 + }, + { + "epoch": 1.496485853306902, + "grad_norm": 0.37783923745155334, + "learning_rate": 8.837359306478667e-06, + "loss": 0.4507, + "step": 2768 + }, + { + "epoch": 1.4970264912596865, + "grad_norm": 0.3881077170372009, + "learning_rate": 8.83614868409361e-06, + "loss": 0.4566, + "step": 2769 + }, + { + "epoch": 1.4975671292124706, + "grad_norm": 0.33153507113456726, + "learning_rate": 8.834937514757428e-06, + "loss": 0.4205, + "step": 2770 + }, + { + "epoch": 1.498107767165255, + "grad_norm": 0.3849857449531555, + "learning_rate": 8.833725798642809e-06, + "loss": 0.4349, + "step": 2771 + }, + { + "epoch": 1.4986484051180393, + "grad_norm": 0.37943586707115173, + "learning_rate": 8.832513535922516e-06, + "loss": 0.4195, + "step": 2772 + }, + { + "epoch": 1.4991890430708237, + "grad_norm": 0.3738144338130951, + "learning_rate": 8.831300726769391e-06, + "loss": 0.4469, + "step": 2773 + }, + { + "epoch": 1.4997296810236078, + "grad_norm": 0.3525858521461487, + "learning_rate": 8.830087371356356e-06, + "loss": 0.3896, + "step": 2774 + }, + { + "epoch": 1.5002703189763922, + "grad_norm": 0.33611002564430237, + "learning_rate": 8.828873469856408e-06, + "loss": 0.4355, + "step": 2775 + }, + { + "epoch": 1.5008109569291763, + "grad_norm": 0.36726000905036926, + "learning_rate": 8.827659022442622e-06, + "loss": 0.466, + "step": 2776 + }, + { + "epoch": 1.5013515948819607, + "grad_norm": 0.40099841356277466, + "learning_rate": 8.826444029288154e-06, + "loss": 0.4297, + "step": 2777 + }, + { + "epoch": 1.501892232834745, + "grad_norm": 0.32601767778396606, + "learning_rate": 8.825228490566233e-06, + "loss": 0.4192, + "step": 2778 + }, + { + "epoch": 1.5024328707875294, + "grad_norm": 0.3288547098636627, + "learning_rate": 8.824012406450171e-06, + "loss": 0.4107, + "step": 2779 + }, + { + "epoch": 1.5029735087403135, + "grad_norm": 0.38603484630584717, + "learning_rate": 8.822795777113352e-06, + "loss": 0.4437, + "step": 2780 + }, + { + "epoch": 1.503514146693098, + "grad_norm": 0.32590451836586, + "learning_rate": 8.821578602729242e-06, + "loss": 0.4262, + "step": 2781 + }, + { + "epoch": 1.504054784645882, + "grad_norm": 0.3509955108165741, + "learning_rate": 8.820360883471383e-06, + "loss": 0.4291, + "step": 2782 + }, + { + "epoch": 1.5045954225986664, + "grad_norm": 0.3051590919494629, + "learning_rate": 8.819142619513399e-06, + "loss": 0.4262, + "step": 2783 + }, + { + "epoch": 1.5051360605514508, + "grad_norm": 0.3273349404335022, + "learning_rate": 8.817923811028984e-06, + "loss": 0.4559, + "step": 2784 + }, + { + "epoch": 1.5056766985042351, + "grad_norm": 0.3385399878025055, + "learning_rate": 8.816704458191913e-06, + "loss": 0.4628, + "step": 2785 + }, + { + "epoch": 1.5062173364570193, + "grad_norm": 0.341138631105423, + "learning_rate": 8.815484561176041e-06, + "loss": 0.4523, + "step": 2786 + }, + { + "epoch": 1.5067579744098034, + "grad_norm": 0.2973570227622986, + "learning_rate": 8.814264120155297e-06, + "loss": 0.4419, + "step": 2787 + }, + { + "epoch": 1.5072986123625878, + "grad_norm": 0.3023761212825775, + "learning_rate": 8.813043135303692e-06, + "loss": 0.4082, + "step": 2788 + }, + { + "epoch": 1.5078392503153721, + "grad_norm": 0.35281112790107727, + "learning_rate": 8.81182160679531e-06, + "loss": 0.4583, + "step": 2789 + }, + { + "epoch": 1.5083798882681565, + "grad_norm": 0.3194134533405304, + "learning_rate": 8.810599534804315e-06, + "loss": 0.4434, + "step": 2790 + }, + { + "epoch": 1.5089205262209409, + "grad_norm": 0.353096067905426, + "learning_rate": 8.809376919504946e-06, + "loss": 0.4103, + "step": 2791 + }, + { + "epoch": 1.509461164173725, + "grad_norm": 0.3596133887767792, + "learning_rate": 8.808153761071525e-06, + "loss": 0.468, + "step": 2792 + }, + { + "epoch": 1.5100018021265091, + "grad_norm": 0.3381783664226532, + "learning_rate": 8.806930059678442e-06, + "loss": 0.454, + "step": 2793 + }, + { + "epoch": 1.5105424400792935, + "grad_norm": 0.37229710817337036, + "learning_rate": 8.805705815500177e-06, + "loss": 0.4548, + "step": 2794 + }, + { + "epoch": 1.5110830780320779, + "grad_norm": 0.35773658752441406, + "learning_rate": 8.804481028711274e-06, + "loss": 0.4905, + "step": 2795 + }, + { + "epoch": 1.5116237159848622, + "grad_norm": 0.3648654818534851, + "learning_rate": 8.803255699486367e-06, + "loss": 0.4349, + "step": 2796 + }, + { + "epoch": 1.5121643539376466, + "grad_norm": 0.3511752188205719, + "learning_rate": 8.802029828000157e-06, + "loss": 0.4158, + "step": 2797 + }, + { + "epoch": 1.5127049918904307, + "grad_norm": 0.3634643852710724, + "learning_rate": 8.800803414427426e-06, + "loss": 0.4287, + "step": 2798 + }, + { + "epoch": 1.5132456298432149, + "grad_norm": 0.364849716424942, + "learning_rate": 8.799576458943036e-06, + "loss": 0.4453, + "step": 2799 + }, + { + "epoch": 1.5137862677959992, + "grad_norm": 0.3459816873073578, + "learning_rate": 8.798348961721925e-06, + "loss": 0.4362, + "step": 2800 + }, + { + "epoch": 1.5143269057487836, + "grad_norm": 0.37838080525398254, + "learning_rate": 8.797120922939104e-06, + "loss": 0.4421, + "step": 2801 + }, + { + "epoch": 1.514867543701568, + "grad_norm": 0.36956077814102173, + "learning_rate": 8.795892342769668e-06, + "loss": 0.4374, + "step": 2802 + }, + { + "epoch": 1.515408181654352, + "grad_norm": 0.3102215528488159, + "learning_rate": 8.794663221388782e-06, + "loss": 0.4251, + "step": 2803 + }, + { + "epoch": 1.5159488196071365, + "grad_norm": 0.3910534381866455, + "learning_rate": 8.793433558971695e-06, + "loss": 0.4706, + "step": 2804 + }, + { + "epoch": 1.5164894575599206, + "grad_norm": 0.3742954730987549, + "learning_rate": 8.792203355693731e-06, + "loss": 0.4495, + "step": 2805 + }, + { + "epoch": 1.517030095512705, + "grad_norm": 0.32030007243156433, + "learning_rate": 8.790972611730286e-06, + "loss": 0.4461, + "step": 2806 + }, + { + "epoch": 1.5175707334654893, + "grad_norm": 0.4075528383255005, + "learning_rate": 8.789741327256841e-06, + "loss": 0.4306, + "step": 2807 + }, + { + "epoch": 1.5181113714182737, + "grad_norm": 0.38579171895980835, + "learning_rate": 8.788509502448948e-06, + "loss": 0.4444, + "step": 2808 + }, + { + "epoch": 1.5186520093710578, + "grad_norm": 0.3903457820415497, + "learning_rate": 8.78727713748224e-06, + "loss": 0.4682, + "step": 2809 + }, + { + "epoch": 1.5191926473238422, + "grad_norm": 0.44318467378616333, + "learning_rate": 8.786044232532423e-06, + "loss": 0.4599, + "step": 2810 + }, + { + "epoch": 1.5197332852766263, + "grad_norm": 0.3772420287132263, + "learning_rate": 8.784810787775285e-06, + "loss": 0.4243, + "step": 2811 + }, + { + "epoch": 1.5202739232294107, + "grad_norm": 0.37346014380455017, + "learning_rate": 8.783576803386687e-06, + "loss": 0.4687, + "step": 2812 + }, + { + "epoch": 1.520814561182195, + "grad_norm": 0.3519136607646942, + "learning_rate": 8.782342279542569e-06, + "loss": 0.4046, + "step": 2813 + }, + { + "epoch": 1.5213551991349794, + "grad_norm": 0.40191009640693665, + "learning_rate": 8.781107216418945e-06, + "loss": 0.4479, + "step": 2814 + }, + { + "epoch": 1.5218958370877635, + "grad_norm": 0.36206626892089844, + "learning_rate": 8.77987161419191e-06, + "loss": 0.4261, + "step": 2815 + }, + { + "epoch": 1.5224364750405477, + "grad_norm": 0.36779820919036865, + "learning_rate": 8.778635473037635e-06, + "loss": 0.4459, + "step": 2816 + }, + { + "epoch": 1.522977112993332, + "grad_norm": 0.3682043254375458, + "learning_rate": 8.777398793132364e-06, + "loss": 0.4385, + "step": 2817 + }, + { + "epoch": 1.5235177509461164, + "grad_norm": 0.3300313949584961, + "learning_rate": 8.776161574652423e-06, + "loss": 0.4134, + "step": 2818 + }, + { + "epoch": 1.5240583888989008, + "grad_norm": 0.36796510219573975, + "learning_rate": 8.774923817774211e-06, + "loss": 0.4872, + "step": 2819 + }, + { + "epoch": 1.5245990268516851, + "grad_norm": 0.37004774808883667, + "learning_rate": 8.773685522674205e-06, + "loss": 0.4239, + "step": 2820 + }, + { + "epoch": 1.5251396648044693, + "grad_norm": 0.3955453932285309, + "learning_rate": 8.77244668952896e-06, + "loss": 0.4465, + "step": 2821 + }, + { + "epoch": 1.5256803027572534, + "grad_norm": 0.364418089389801, + "learning_rate": 8.771207318515104e-06, + "loss": 0.4375, + "step": 2822 + }, + { + "epoch": 1.5262209407100378, + "grad_norm": 0.46262267231941223, + "learning_rate": 8.769967409809348e-06, + "loss": 0.4684, + "step": 2823 + }, + { + "epoch": 1.5267615786628221, + "grad_norm": 0.36952295899391174, + "learning_rate": 8.768726963588475e-06, + "loss": 0.4226, + "step": 2824 + }, + { + "epoch": 1.5273022166156065, + "grad_norm": 0.38018321990966797, + "learning_rate": 8.767485980029342e-06, + "loss": 0.4038, + "step": 2825 + }, + { + "epoch": 1.5278428545683909, + "grad_norm": 0.37219712138175964, + "learning_rate": 8.76624445930889e-06, + "loss": 0.4761, + "step": 2826 + }, + { + "epoch": 1.528383492521175, + "grad_norm": 0.38064977526664734, + "learning_rate": 8.765002401604133e-06, + "loss": 0.4146, + "step": 2827 + }, + { + "epoch": 1.5289241304739591, + "grad_norm": 0.40907156467437744, + "learning_rate": 8.763759807092157e-06, + "loss": 0.4225, + "step": 2828 + }, + { + "epoch": 1.5294647684267435, + "grad_norm": 0.31351208686828613, + "learning_rate": 8.762516675950134e-06, + "loss": 0.4277, + "step": 2829 + }, + { + "epoch": 1.5300054063795279, + "grad_norm": 0.3757653832435608, + "learning_rate": 8.761273008355306e-06, + "loss": 0.4729, + "step": 2830 + }, + { + "epoch": 1.5305460443323122, + "grad_norm": 0.38561713695526123, + "learning_rate": 8.76002880448499e-06, + "loss": 0.4109, + "step": 2831 + }, + { + "epoch": 1.5310866822850964, + "grad_norm": 0.32167428731918335, + "learning_rate": 8.758784064516585e-06, + "loss": 0.4125, + "step": 2832 + }, + { + "epoch": 1.5316273202378807, + "grad_norm": 0.3748205006122589, + "learning_rate": 8.757538788627563e-06, + "loss": 0.4633, + "step": 2833 + }, + { + "epoch": 1.5321679581906649, + "grad_norm": 0.3118850588798523, + "learning_rate": 8.756292976995475e-06, + "loss": 0.3965, + "step": 2834 + }, + { + "epoch": 1.5327085961434492, + "grad_norm": 0.3302631378173828, + "learning_rate": 8.755046629797944e-06, + "loss": 0.4082, + "step": 2835 + }, + { + "epoch": 1.5332492340962336, + "grad_norm": 0.34935998916625977, + "learning_rate": 8.753799747212672e-06, + "loss": 0.4552, + "step": 2836 + }, + { + "epoch": 1.533789872049018, + "grad_norm": 0.3248444199562073, + "learning_rate": 8.752552329417439e-06, + "loss": 0.4346, + "step": 2837 + }, + { + "epoch": 1.534330510001802, + "grad_norm": 0.33430078625679016, + "learning_rate": 8.7513043765901e-06, + "loss": 0.4498, + "step": 2838 + }, + { + "epoch": 1.5348711479545865, + "grad_norm": 0.31254705786705017, + "learning_rate": 8.750055888908582e-06, + "loss": 0.3941, + "step": 2839 + }, + { + "epoch": 1.5354117859073706, + "grad_norm": 0.3183569312095642, + "learning_rate": 8.748806866550895e-06, + "loss": 0.4209, + "step": 2840 + }, + { + "epoch": 1.535952423860155, + "grad_norm": 0.3188185393810272, + "learning_rate": 8.747557309695123e-06, + "loss": 0.4231, + "step": 2841 + }, + { + "epoch": 1.5364930618129393, + "grad_norm": 0.37554019689559937, + "learning_rate": 8.746307218519424e-06, + "loss": 0.4536, + "step": 2842 + }, + { + "epoch": 1.5370336997657237, + "grad_norm": 0.3110560178756714, + "learning_rate": 8.745056593202033e-06, + "loss": 0.4261, + "step": 2843 + }, + { + "epoch": 1.5375743377185078, + "grad_norm": 0.3565128743648529, + "learning_rate": 8.743805433921265e-06, + "loss": 0.4492, + "step": 2844 + }, + { + "epoch": 1.538114975671292, + "grad_norm": 0.32924672961235046, + "learning_rate": 8.742553740855507e-06, + "loss": 0.437, + "step": 2845 + }, + { + "epoch": 1.5386556136240763, + "grad_norm": 0.310278058052063, + "learning_rate": 8.74130151418322e-06, + "loss": 0.3881, + "step": 2846 + }, + { + "epoch": 1.5391962515768607, + "grad_norm": 0.33028700947761536, + "learning_rate": 8.740048754082949e-06, + "loss": 0.4407, + "step": 2847 + }, + { + "epoch": 1.539736889529645, + "grad_norm": 0.32446345686912537, + "learning_rate": 8.738795460733305e-06, + "loss": 0.4159, + "step": 2848 + }, + { + "epoch": 1.5402775274824294, + "grad_norm": 0.35544806718826294, + "learning_rate": 8.737541634312985e-06, + "loss": 0.475, + "step": 2849 + }, + { + "epoch": 1.5408181654352135, + "grad_norm": 0.3118836283683777, + "learning_rate": 8.736287275000755e-06, + "loss": 0.3991, + "step": 2850 + }, + { + "epoch": 1.5413588033879977, + "grad_norm": 0.33479005098342896, + "learning_rate": 8.735032382975459e-06, + "loss": 0.4306, + "step": 2851 + }, + { + "epoch": 1.541899441340782, + "grad_norm": 0.3764827251434326, + "learning_rate": 8.733776958416018e-06, + "loss": 0.4292, + "step": 2852 + }, + { + "epoch": 1.5424400792935664, + "grad_norm": 0.35498976707458496, + "learning_rate": 8.732521001501428e-06, + "loss": 0.4456, + "step": 2853 + }, + { + "epoch": 1.5429807172463508, + "grad_norm": 0.3415829539299011, + "learning_rate": 8.731264512410762e-06, + "loss": 0.4211, + "step": 2854 + }, + { + "epoch": 1.5435213551991351, + "grad_norm": 0.36124956607818604, + "learning_rate": 8.730007491323167e-06, + "loss": 0.4473, + "step": 2855 + }, + { + "epoch": 1.5440619931519193, + "grad_norm": 0.3353195786476135, + "learning_rate": 8.728749938417867e-06, + "loss": 0.4127, + "step": 2856 + }, + { + "epoch": 1.5446026311047034, + "grad_norm": 0.34685778617858887, + "learning_rate": 8.727491853874159e-06, + "loss": 0.4386, + "step": 2857 + }, + { + "epoch": 1.5451432690574878, + "grad_norm": 0.3679361343383789, + "learning_rate": 8.726233237871424e-06, + "loss": 0.4551, + "step": 2858 + }, + { + "epoch": 1.5456839070102721, + "grad_norm": 0.3218771815299988, + "learning_rate": 8.724974090589107e-06, + "loss": 0.4331, + "step": 2859 + }, + { + "epoch": 1.5462245449630565, + "grad_norm": 0.3502300977706909, + "learning_rate": 8.723714412206741e-06, + "loss": 0.4591, + "step": 2860 + }, + { + "epoch": 1.5467651829158406, + "grad_norm": 0.3705553710460663, + "learning_rate": 8.722454202903923e-06, + "loss": 0.4244, + "step": 2861 + }, + { + "epoch": 1.547305820868625, + "grad_norm": 0.33638572692871094, + "learning_rate": 8.721193462860335e-06, + "loss": 0.4398, + "step": 2862 + }, + { + "epoch": 1.5478464588214091, + "grad_norm": 0.3592517375946045, + "learning_rate": 8.719932192255732e-06, + "loss": 0.4252, + "step": 2863 + }, + { + "epoch": 1.5483870967741935, + "grad_norm": 0.3698059320449829, + "learning_rate": 8.718670391269939e-06, + "loss": 0.4756, + "step": 2864 + }, + { + "epoch": 1.5489277347269779, + "grad_norm": 0.3670055568218231, + "learning_rate": 8.717408060082865e-06, + "loss": 0.4137, + "step": 2865 + }, + { + "epoch": 1.5494683726797622, + "grad_norm": 0.36520129442214966, + "learning_rate": 8.71614519887449e-06, + "loss": 0.4402, + "step": 2866 + }, + { + "epoch": 1.5500090106325464, + "grad_norm": 0.3617687523365021, + "learning_rate": 8.71488180782487e-06, + "loss": 0.4098, + "step": 2867 + }, + { + "epoch": 1.5505496485853307, + "grad_norm": 0.3548413813114166, + "learning_rate": 8.713617887114137e-06, + "loss": 0.4374, + "step": 2868 + }, + { + "epoch": 1.5510902865381149, + "grad_norm": 0.3364056348800659, + "learning_rate": 8.712353436922501e-06, + "loss": 0.4491, + "step": 2869 + }, + { + "epoch": 1.5516309244908992, + "grad_norm": 0.3399386405944824, + "learning_rate": 8.711088457430239e-06, + "loss": 0.4152, + "step": 2870 + }, + { + "epoch": 1.5521715624436836, + "grad_norm": 0.3334507346153259, + "learning_rate": 8.709822948817716e-06, + "loss": 0.4189, + "step": 2871 + }, + { + "epoch": 1.552712200396468, + "grad_norm": 0.32251378893852234, + "learning_rate": 8.708556911265363e-06, + "loss": 0.45, + "step": 2872 + }, + { + "epoch": 1.553252838349252, + "grad_norm": 0.32779017090797424, + "learning_rate": 8.70729034495369e-06, + "loss": 0.4086, + "step": 2873 + }, + { + "epoch": 1.5537934763020365, + "grad_norm": 0.3283478021621704, + "learning_rate": 8.70602325006328e-06, + "loss": 0.4603, + "step": 2874 + }, + { + "epoch": 1.5543341142548206, + "grad_norm": 0.29777684807777405, + "learning_rate": 8.704755626774796e-06, + "loss": 0.4148, + "step": 2875 + }, + { + "epoch": 1.554874752207605, + "grad_norm": 0.3250730335712433, + "learning_rate": 8.703487475268972e-06, + "loss": 0.4264, + "step": 2876 + }, + { + "epoch": 1.5554153901603893, + "grad_norm": 0.3259025514125824, + "learning_rate": 8.702218795726619e-06, + "loss": 0.4506, + "step": 2877 + }, + { + "epoch": 1.5559560281131737, + "grad_norm": 0.30676570534706116, + "learning_rate": 8.70094958832862e-06, + "loss": 0.4403, + "step": 2878 + }, + { + "epoch": 1.5564966660659578, + "grad_norm": 0.30886489152908325, + "learning_rate": 8.69967985325594e-06, + "loss": 0.4271, + "step": 2879 + }, + { + "epoch": 1.557037304018742, + "grad_norm": 0.30409660935401917, + "learning_rate": 8.698409590689616e-06, + "loss": 0.4394, + "step": 2880 + }, + { + "epoch": 1.5575779419715263, + "grad_norm": 0.30880311131477356, + "learning_rate": 8.697138800810756e-06, + "loss": 0.4429, + "step": 2881 + }, + { + "epoch": 1.5581185799243107, + "grad_norm": 0.3233693540096283, + "learning_rate": 8.695867483800551e-06, + "loss": 0.4363, + "step": 2882 + }, + { + "epoch": 1.558659217877095, + "grad_norm": 0.3493873178958893, + "learning_rate": 8.69459563984026e-06, + "loss": 0.4717, + "step": 2883 + }, + { + "epoch": 1.5591998558298794, + "grad_norm": 0.3600580096244812, + "learning_rate": 8.693323269111222e-06, + "loss": 0.4428, + "step": 2884 + }, + { + "epoch": 1.5597404937826636, + "grad_norm": 0.35579100251197815, + "learning_rate": 8.692050371794849e-06, + "loss": 0.4299, + "step": 2885 + }, + { + "epoch": 1.5602811317354477, + "grad_norm": 0.2761000990867615, + "learning_rate": 8.690776948072629e-06, + "loss": 0.3494, + "step": 2886 + }, + { + "epoch": 1.560821769688232, + "grad_norm": 0.39794984459877014, + "learning_rate": 8.689502998126121e-06, + "loss": 0.4744, + "step": 2887 + }, + { + "epoch": 1.5613624076410164, + "grad_norm": 0.3184354603290558, + "learning_rate": 8.688228522136966e-06, + "loss": 0.4155, + "step": 2888 + }, + { + "epoch": 1.5619030455938008, + "grad_norm": 0.33132901787757874, + "learning_rate": 8.686953520286876e-06, + "loss": 0.4332, + "step": 2889 + }, + { + "epoch": 1.562443683546585, + "grad_norm": 0.4157227575778961, + "learning_rate": 8.685677992757637e-06, + "loss": 0.4832, + "step": 2890 + }, + { + "epoch": 1.5629843214993693, + "grad_norm": 0.3433517813682556, + "learning_rate": 8.684401939731114e-06, + "loss": 0.4207, + "step": 2891 + }, + { + "epoch": 1.5635249594521534, + "grad_norm": 0.3725986182689667, + "learning_rate": 8.683125361389241e-06, + "loss": 0.4232, + "step": 2892 + }, + { + "epoch": 1.5640655974049378, + "grad_norm": 0.34945711493492126, + "learning_rate": 8.681848257914031e-06, + "loss": 0.4275, + "step": 2893 + }, + { + "epoch": 1.5646062353577221, + "grad_norm": 0.35210081934928894, + "learning_rate": 8.680570629487575e-06, + "loss": 0.4357, + "step": 2894 + }, + { + "epoch": 1.5651468733105065, + "grad_norm": 0.39097723364830017, + "learning_rate": 8.67929247629203e-06, + "loss": 0.468, + "step": 2895 + }, + { + "epoch": 1.5656875112632906, + "grad_norm": 0.4142407774925232, + "learning_rate": 8.678013798509636e-06, + "loss": 0.4771, + "step": 2896 + }, + { + "epoch": 1.566228149216075, + "grad_norm": 0.33647021651268005, + "learning_rate": 8.676734596322702e-06, + "loss": 0.4007, + "step": 2897 + }, + { + "epoch": 1.5667687871688591, + "grad_norm": 0.3727369010448456, + "learning_rate": 8.675454869913616e-06, + "loss": 0.4262, + "step": 2898 + }, + { + "epoch": 1.5673094251216435, + "grad_norm": 0.42712923884391785, + "learning_rate": 8.674174619464838e-06, + "loss": 0.4331, + "step": 2899 + }, + { + "epoch": 1.5678500630744279, + "grad_norm": 0.34135958552360535, + "learning_rate": 8.672893845158908e-06, + "loss": 0.4467, + "step": 2900 + }, + { + "epoch": 1.5683907010272122, + "grad_norm": 0.3845239579677582, + "learning_rate": 8.671612547178428e-06, + "loss": 0.4315, + "step": 2901 + }, + { + "epoch": 1.5689313389799964, + "grad_norm": 0.3987267315387726, + "learning_rate": 8.67033072570609e-06, + "loss": 0.4823, + "step": 2902 + }, + { + "epoch": 1.5694719769327807, + "grad_norm": 0.2945195436477661, + "learning_rate": 8.669048380924654e-06, + "loss": 0.4257, + "step": 2903 + }, + { + "epoch": 1.5700126148855649, + "grad_norm": 0.38530126214027405, + "learning_rate": 8.667765513016949e-06, + "loss": 0.4269, + "step": 2904 + }, + { + "epoch": 1.5705532528383492, + "grad_norm": 0.3750806450843811, + "learning_rate": 8.66648212216589e-06, + "loss": 0.481, + "step": 2905 + }, + { + "epoch": 1.5710938907911336, + "grad_norm": 0.3303103446960449, + "learning_rate": 8.665198208554456e-06, + "loss": 0.4464, + "step": 2906 + }, + { + "epoch": 1.571634528743918, + "grad_norm": 0.3678765296936035, + "learning_rate": 8.663913772365706e-06, + "loss": 0.4626, + "step": 2907 + }, + { + "epoch": 1.572175166696702, + "grad_norm": 0.3794427216053009, + "learning_rate": 8.662628813782775e-06, + "loss": 0.4398, + "step": 2908 + }, + { + "epoch": 1.5727158046494862, + "grad_norm": 0.3422366678714752, + "learning_rate": 8.661343332988869e-06, + "loss": 0.4414, + "step": 2909 + }, + { + "epoch": 1.5732564426022706, + "grad_norm": 0.3901372253894806, + "learning_rate": 8.660057330167267e-06, + "loss": 0.4565, + "step": 2910 + }, + { + "epoch": 1.573797080555055, + "grad_norm": 0.32709166407585144, + "learning_rate": 8.658770805501328e-06, + "loss": 0.393, + "step": 2911 + }, + { + "epoch": 1.5743377185078393, + "grad_norm": 0.3676150143146515, + "learning_rate": 8.657483759174482e-06, + "loss": 0.4566, + "step": 2912 + }, + { + "epoch": 1.5748783564606237, + "grad_norm": 0.3521651327610016, + "learning_rate": 8.656196191370233e-06, + "loss": 0.4469, + "step": 2913 + }, + { + "epoch": 1.5754189944134078, + "grad_norm": 0.33962196111679077, + "learning_rate": 8.65490810227216e-06, + "loss": 0.3909, + "step": 2914 + }, + { + "epoch": 1.575959632366192, + "grad_norm": 0.40406396985054016, + "learning_rate": 8.653619492063916e-06, + "loss": 0.4976, + "step": 2915 + }, + { + "epoch": 1.5765002703189763, + "grad_norm": 0.32255756855010986, + "learning_rate": 8.652330360929228e-06, + "loss": 0.4204, + "step": 2916 + }, + { + "epoch": 1.5770409082717607, + "grad_norm": 0.3527771532535553, + "learning_rate": 8.6510407090519e-06, + "loss": 0.3998, + "step": 2917 + }, + { + "epoch": 1.577581546224545, + "grad_norm": 0.4008268117904663, + "learning_rate": 8.64975053661581e-06, + "loss": 0.5013, + "step": 2918 + }, + { + "epoch": 1.5781221841773294, + "grad_norm": 0.30029141902923584, + "learning_rate": 8.648459843804904e-06, + "loss": 0.3843, + "step": 2919 + }, + { + "epoch": 1.5786628221301136, + "grad_norm": 0.3258407413959503, + "learning_rate": 8.647168630803208e-06, + "loss": 0.4322, + "step": 2920 + }, + { + "epoch": 1.5792034600828977, + "grad_norm": 0.37515318393707275, + "learning_rate": 8.645876897794823e-06, + "loss": 0.4661, + "step": 2921 + }, + { + "epoch": 1.579744098035682, + "grad_norm": 0.3038284182548523, + "learning_rate": 8.644584644963918e-06, + "loss": 0.4474, + "step": 2922 + }, + { + "epoch": 1.5802847359884664, + "grad_norm": 0.3194510042667389, + "learning_rate": 8.643291872494745e-06, + "loss": 0.4175, + "step": 2923 + }, + { + "epoch": 1.5808253739412508, + "grad_norm": 0.32912907004356384, + "learning_rate": 8.64199858057162e-06, + "loss": 0.4195, + "step": 2924 + }, + { + "epoch": 1.581366011894035, + "grad_norm": 0.39627596735954285, + "learning_rate": 8.640704769378943e-06, + "loss": 0.4388, + "step": 2925 + }, + { + "epoch": 1.5819066498468193, + "grad_norm": 0.31934958696365356, + "learning_rate": 8.63941043910118e-06, + "loss": 0.4587, + "step": 2926 + }, + { + "epoch": 1.5824472877996034, + "grad_norm": 0.35360094904899597, + "learning_rate": 8.638115589922875e-06, + "loss": 0.4516, + "step": 2927 + }, + { + "epoch": 1.5829879257523878, + "grad_norm": 0.336273729801178, + "learning_rate": 8.636820222028645e-06, + "loss": 0.431, + "step": 2928 + }, + { + "epoch": 1.5835285637051721, + "grad_norm": 0.31268832087516785, + "learning_rate": 8.635524335603183e-06, + "loss": 0.4043, + "step": 2929 + }, + { + "epoch": 1.5840692016579565, + "grad_norm": 0.3598516583442688, + "learning_rate": 8.634227930831252e-06, + "loss": 0.476, + "step": 2930 + }, + { + "epoch": 1.5846098396107406, + "grad_norm": 0.28740474581718445, + "learning_rate": 8.632931007897693e-06, + "loss": 0.4236, + "step": 2931 + }, + { + "epoch": 1.585150477563525, + "grad_norm": 0.34720805287361145, + "learning_rate": 8.631633566987416e-06, + "loss": 0.4373, + "step": 2932 + }, + { + "epoch": 1.5856911155163091, + "grad_norm": 0.31540805101394653, + "learning_rate": 8.630335608285412e-06, + "loss": 0.4298, + "step": 2933 + }, + { + "epoch": 1.5862317534690935, + "grad_norm": 0.284653902053833, + "learning_rate": 8.629037131976737e-06, + "loss": 0.4067, + "step": 2934 + }, + { + "epoch": 1.5867723914218779, + "grad_norm": 0.3056807219982147, + "learning_rate": 8.627738138246529e-06, + "loss": 0.4201, + "step": 2935 + }, + { + "epoch": 1.5873130293746622, + "grad_norm": 0.33723872900009155, + "learning_rate": 8.626438627279993e-06, + "loss": 0.463, + "step": 2936 + }, + { + "epoch": 1.5878536673274464, + "grad_norm": 0.3082202672958374, + "learning_rate": 8.625138599262416e-06, + "loss": 0.4211, + "step": 2937 + }, + { + "epoch": 1.5883943052802305, + "grad_norm": 0.31902506947517395, + "learning_rate": 8.623838054379145e-06, + "loss": 0.4535, + "step": 2938 + }, + { + "epoch": 1.5889349432330149, + "grad_norm": 0.35180023312568665, + "learning_rate": 8.62253699281562e-06, + "loss": 0.4971, + "step": 2939 + }, + { + "epoch": 1.5894755811857992, + "grad_norm": 0.33312559127807617, + "learning_rate": 8.621235414757337e-06, + "loss": 0.4408, + "step": 2940 + }, + { + "epoch": 1.5900162191385836, + "grad_norm": 0.2971533536911011, + "learning_rate": 8.619933320389872e-06, + "loss": 0.3947, + "step": 2941 + }, + { + "epoch": 1.590556857091368, + "grad_norm": 0.3088550865650177, + "learning_rate": 8.618630709898878e-06, + "loss": 0.4251, + "step": 2942 + }, + { + "epoch": 1.591097495044152, + "grad_norm": 0.30490434169769287, + "learning_rate": 8.61732758347008e-06, + "loss": 0.4071, + "step": 2943 + }, + { + "epoch": 1.5916381329969362, + "grad_norm": 0.3358185291290283, + "learning_rate": 8.616023941289274e-06, + "loss": 0.4123, + "step": 2944 + }, + { + "epoch": 1.5921787709497206, + "grad_norm": 0.35692816972732544, + "learning_rate": 8.61471978354233e-06, + "loss": 0.4366, + "step": 2945 + }, + { + "epoch": 1.592719408902505, + "grad_norm": 0.3419777750968933, + "learning_rate": 8.613415110415194e-06, + "loss": 0.4426, + "step": 2946 + }, + { + "epoch": 1.5932600468552893, + "grad_norm": 0.37838178873062134, + "learning_rate": 8.612109922093881e-06, + "loss": 0.4715, + "step": 2947 + }, + { + "epoch": 1.5938006848080737, + "grad_norm": 0.3336822986602783, + "learning_rate": 8.610804218764487e-06, + "loss": 0.3702, + "step": 2948 + }, + { + "epoch": 1.5943413227608578, + "grad_norm": 0.33858394622802734, + "learning_rate": 8.609498000613173e-06, + "loss": 0.457, + "step": 2949 + }, + { + "epoch": 1.594881960713642, + "grad_norm": 0.3557717502117157, + "learning_rate": 8.608191267826179e-06, + "loss": 0.4291, + "step": 2950 + }, + { + "epoch": 1.5954225986664263, + "grad_norm": 0.36976203322410583, + "learning_rate": 8.606884020589816e-06, + "loss": 0.456, + "step": 2951 + }, + { + "epoch": 1.5959632366192107, + "grad_norm": 0.33798712491989136, + "learning_rate": 8.605576259090467e-06, + "loss": 0.4667, + "step": 2952 + }, + { + "epoch": 1.596503874571995, + "grad_norm": 0.37304526567459106, + "learning_rate": 8.604267983514595e-06, + "loss": 0.403, + "step": 2953 + }, + { + "epoch": 1.5970445125247792, + "grad_norm": 0.37876787781715393, + "learning_rate": 8.602959194048728e-06, + "loss": 0.4562, + "step": 2954 + }, + { + "epoch": 1.5975851504775636, + "grad_norm": 0.2938414514064789, + "learning_rate": 8.60164989087947e-06, + "loss": 0.4321, + "step": 2955 + }, + { + "epoch": 1.5981257884303477, + "grad_norm": 0.3825919032096863, + "learning_rate": 8.600340074193504e-06, + "loss": 0.4688, + "step": 2956 + }, + { + "epoch": 1.598666426383132, + "grad_norm": 0.35982459783554077, + "learning_rate": 8.599029744177577e-06, + "loss": 0.4318, + "step": 2957 + }, + { + "epoch": 1.5992070643359164, + "grad_norm": 0.3182411193847656, + "learning_rate": 8.597718901018512e-06, + "loss": 0.4157, + "step": 2958 + }, + { + "epoch": 1.5997477022887008, + "grad_norm": 0.3472853899002075, + "learning_rate": 8.596407544903212e-06, + "loss": 0.4471, + "step": 2959 + }, + { + "epoch": 1.600288340241485, + "grad_norm": 0.3751436471939087, + "learning_rate": 8.595095676018645e-06, + "loss": 0.4626, + "step": 2960 + }, + { + "epoch": 1.6008289781942693, + "grad_norm": 0.2762688994407654, + "learning_rate": 8.593783294551853e-06, + "loss": 0.3576, + "step": 2961 + }, + { + "epoch": 1.6013696161470534, + "grad_norm": 0.3292232155799866, + "learning_rate": 8.592470400689956e-06, + "loss": 0.4851, + "step": 2962 + }, + { + "epoch": 1.6019102540998378, + "grad_norm": 0.3487094044685364, + "learning_rate": 8.591156994620142e-06, + "loss": 0.4279, + "step": 2963 + }, + { + "epoch": 1.6024508920526221, + "grad_norm": 0.34718480706214905, + "learning_rate": 8.589843076529675e-06, + "loss": 0.4374, + "step": 2964 + }, + { + "epoch": 1.6029915300054065, + "grad_norm": 0.3252781629562378, + "learning_rate": 8.588528646605893e-06, + "loss": 0.4086, + "step": 2965 + }, + { + "epoch": 1.6035321679581906, + "grad_norm": 0.35437238216400146, + "learning_rate": 8.587213705036202e-06, + "loss": 0.4082, + "step": 2966 + }, + { + "epoch": 1.6040728059109748, + "grad_norm": 0.31416869163513184, + "learning_rate": 8.585898252008082e-06, + "loss": 0.4597, + "step": 2967 + }, + { + "epoch": 1.6046134438637591, + "grad_norm": 0.37345847487449646, + "learning_rate": 8.584582287709094e-06, + "loss": 0.4391, + "step": 2968 + }, + { + "epoch": 1.6051540818165435, + "grad_norm": 0.39789295196533203, + "learning_rate": 8.583265812326862e-06, + "loss": 0.4491, + "step": 2969 + }, + { + "epoch": 1.6056947197693279, + "grad_norm": 0.33515751361846924, + "learning_rate": 8.581948826049086e-06, + "loss": 0.3954, + "step": 2970 + }, + { + "epoch": 1.6062353577221122, + "grad_norm": 0.39586490392684937, + "learning_rate": 8.580631329063544e-06, + "loss": 0.4189, + "step": 2971 + }, + { + "epoch": 1.6067759956748964, + "grad_norm": 0.3636821508407593, + "learning_rate": 8.579313321558076e-06, + "loss": 0.4267, + "step": 2972 + }, + { + "epoch": 1.6073166336276805, + "grad_norm": 0.35685521364212036, + "learning_rate": 8.577994803720605e-06, + "loss": 0.4306, + "step": 2973 + }, + { + "epoch": 1.6078572715804649, + "grad_norm": 0.3229580223560333, + "learning_rate": 8.576675775739125e-06, + "loss": 0.43, + "step": 2974 + }, + { + "epoch": 1.6083979095332492, + "grad_norm": 0.34237968921661377, + "learning_rate": 8.575356237801695e-06, + "loss": 0.4498, + "step": 2975 + }, + { + "epoch": 1.6089385474860336, + "grad_norm": 0.32243970036506653, + "learning_rate": 8.574036190096455e-06, + "loss": 0.4135, + "step": 2976 + }, + { + "epoch": 1.609479185438818, + "grad_norm": 0.3718234598636627, + "learning_rate": 8.572715632811616e-06, + "loss": 0.4753, + "step": 2977 + }, + { + "epoch": 1.610019823391602, + "grad_norm": 0.3125777244567871, + "learning_rate": 8.57139456613546e-06, + "loss": 0.4278, + "step": 2978 + }, + { + "epoch": 1.6105604613443862, + "grad_norm": 0.3171471357345581, + "learning_rate": 8.570072990256342e-06, + "loss": 0.4292, + "step": 2979 + }, + { + "epoch": 1.6111010992971706, + "grad_norm": 0.29979678988456726, + "learning_rate": 8.56875090536269e-06, + "loss": 0.4332, + "step": 2980 + }, + { + "epoch": 1.611641737249955, + "grad_norm": 0.3105889558792114, + "learning_rate": 8.567428311643005e-06, + "loss": 0.4319, + "step": 2981 + }, + { + "epoch": 1.6121823752027393, + "grad_norm": 0.33198606967926025, + "learning_rate": 8.566105209285857e-06, + "loss": 0.4812, + "step": 2982 + }, + { + "epoch": 1.6127230131555235, + "grad_norm": 0.30810120701789856, + "learning_rate": 8.564781598479897e-06, + "loss": 0.405, + "step": 2983 + }, + { + "epoch": 1.6132636511083078, + "grad_norm": 0.3284092843532562, + "learning_rate": 8.563457479413839e-06, + "loss": 0.4186, + "step": 2984 + }, + { + "epoch": 1.613804289061092, + "grad_norm": 0.33219486474990845, + "learning_rate": 8.562132852276474e-06, + "loss": 0.4559, + "step": 2985 + }, + { + "epoch": 1.6143449270138763, + "grad_norm": 0.36312806606292725, + "learning_rate": 8.560807717256666e-06, + "loss": 0.4533, + "step": 2986 + }, + { + "epoch": 1.6148855649666607, + "grad_norm": 0.32629111409187317, + "learning_rate": 8.55948207454335e-06, + "loss": 0.4116, + "step": 2987 + }, + { + "epoch": 1.615426202919445, + "grad_norm": 0.3450940251350403, + "learning_rate": 8.558155924325533e-06, + "loss": 0.4427, + "step": 2988 + }, + { + "epoch": 1.6159668408722292, + "grad_norm": 0.3320872187614441, + "learning_rate": 8.556829266792297e-06, + "loss": 0.4304, + "step": 2989 + }, + { + "epoch": 1.6165074788250136, + "grad_norm": 0.31070026755332947, + "learning_rate": 8.555502102132792e-06, + "loss": 0.4164, + "step": 2990 + }, + { + "epoch": 1.6170481167777977, + "grad_norm": 0.3339788615703583, + "learning_rate": 8.554174430536243e-06, + "loss": 0.4475, + "step": 2991 + }, + { + "epoch": 1.617588754730582, + "grad_norm": 0.36194276809692383, + "learning_rate": 8.552846252191949e-06, + "loss": 0.4633, + "step": 2992 + }, + { + "epoch": 1.6181293926833664, + "grad_norm": 0.35281410813331604, + "learning_rate": 8.551517567289279e-06, + "loss": 0.4368, + "step": 2993 + }, + { + "epoch": 1.6186700306361508, + "grad_norm": 0.34481269121170044, + "learning_rate": 8.55018837601767e-06, + "loss": 0.4275, + "step": 2994 + }, + { + "epoch": 1.619210668588935, + "grad_norm": 0.36388522386550903, + "learning_rate": 8.548858678566643e-06, + "loss": 0.4433, + "step": 2995 + }, + { + "epoch": 1.6197513065417193, + "grad_norm": 0.3803480863571167, + "learning_rate": 8.547528475125778e-06, + "loss": 0.3977, + "step": 2996 + }, + { + "epoch": 1.6202919444945034, + "grad_norm": 0.4043755829334259, + "learning_rate": 8.546197765884736e-06, + "loss": 0.4438, + "step": 2997 + }, + { + "epoch": 1.6208325824472878, + "grad_norm": 0.3669350743293762, + "learning_rate": 8.544866551033246e-06, + "loss": 0.4257, + "step": 2998 + }, + { + "epoch": 1.6213732204000721, + "grad_norm": 0.3811838924884796, + "learning_rate": 8.54353483076111e-06, + "loss": 0.432, + "step": 2999 + }, + { + "epoch": 1.6219138583528565, + "grad_norm": 0.4157685935497284, + "learning_rate": 8.542202605258204e-06, + "loss": 0.4744, + "step": 3000 + }, + { + "epoch": 1.6224544963056406, + "grad_norm": 0.3191993832588196, + "learning_rate": 8.54086987471447e-06, + "loss": 0.4048, + "step": 3001 + }, + { + "epoch": 1.6229951342584248, + "grad_norm": 0.3500971794128418, + "learning_rate": 8.539536639319932e-06, + "loss": 0.4533, + "step": 3002 + }, + { + "epoch": 1.6235357722112091, + "grad_norm": 0.3561091721057892, + "learning_rate": 8.538202899264678e-06, + "loss": 0.4441, + "step": 3003 + }, + { + "epoch": 1.6240764101639935, + "grad_norm": 0.32293397188186646, + "learning_rate": 8.536868654738867e-06, + "loss": 0.4242, + "step": 3004 + }, + { + "epoch": 1.6246170481167779, + "grad_norm": 0.34320175647735596, + "learning_rate": 8.535533905932739e-06, + "loss": 0.4134, + "step": 3005 + }, + { + "epoch": 1.6251576860695622, + "grad_norm": 0.3411339521408081, + "learning_rate": 8.534198653036595e-06, + "loss": 0.4609, + "step": 3006 + }, + { + "epoch": 1.6256983240223464, + "grad_norm": 0.31736695766448975, + "learning_rate": 8.532862896240815e-06, + "loss": 0.4261, + "step": 3007 + }, + { + "epoch": 1.6262389619751305, + "grad_norm": 0.3072502613067627, + "learning_rate": 8.53152663573585e-06, + "loss": 0.4099, + "step": 3008 + }, + { + "epoch": 1.6267795999279149, + "grad_norm": 0.3346844017505646, + "learning_rate": 8.53018987171222e-06, + "loss": 0.4835, + "step": 3009 + }, + { + "epoch": 1.6273202378806992, + "grad_norm": 0.31881600618362427, + "learning_rate": 8.528852604360518e-06, + "loss": 0.4327, + "step": 3010 + }, + { + "epoch": 1.6278608758334836, + "grad_norm": 0.3123781681060791, + "learning_rate": 8.527514833871411e-06, + "loss": 0.4201, + "step": 3011 + }, + { + "epoch": 1.628401513786268, + "grad_norm": 0.33178600668907166, + "learning_rate": 8.526176560435634e-06, + "loss": 0.4304, + "step": 3012 + }, + { + "epoch": 1.628942151739052, + "grad_norm": 0.33425503969192505, + "learning_rate": 8.524837784243995e-06, + "loss": 0.4199, + "step": 3013 + }, + { + "epoch": 1.6294827896918362, + "grad_norm": 0.3384115695953369, + "learning_rate": 8.523498505487377e-06, + "loss": 0.4123, + "step": 3014 + }, + { + "epoch": 1.6300234276446206, + "grad_norm": 0.37247204780578613, + "learning_rate": 8.52215872435673e-06, + "loss": 0.446, + "step": 3015 + }, + { + "epoch": 1.630564065597405, + "grad_norm": 0.33755308389663696, + "learning_rate": 8.52081844104308e-06, + "loss": 0.4209, + "step": 3016 + }, + { + "epoch": 1.6311047035501893, + "grad_norm": 0.324717253446579, + "learning_rate": 8.51947765573752e-06, + "loss": 0.4097, + "step": 3017 + }, + { + "epoch": 1.6316453415029735, + "grad_norm": 0.3523276746273041, + "learning_rate": 8.518136368631216e-06, + "loss": 0.4053, + "step": 3018 + }, + { + "epoch": 1.6321859794557578, + "grad_norm": 0.3019939064979553, + "learning_rate": 8.516794579915407e-06, + "loss": 0.4097, + "step": 3019 + }, + { + "epoch": 1.632726617408542, + "grad_norm": 0.30737465620040894, + "learning_rate": 8.515452289781403e-06, + "loss": 0.4222, + "step": 3020 + }, + { + "epoch": 1.6332672553613263, + "grad_norm": 0.3415035605430603, + "learning_rate": 8.514109498420586e-06, + "loss": 0.4827, + "step": 3021 + }, + { + "epoch": 1.6338078933141107, + "grad_norm": 0.30833402276039124, + "learning_rate": 8.51276620602441e-06, + "loss": 0.4109, + "step": 3022 + }, + { + "epoch": 1.634348531266895, + "grad_norm": 0.34067338705062866, + "learning_rate": 8.511422412784396e-06, + "loss": 0.4255, + "step": 3023 + }, + { + "epoch": 1.6348891692196792, + "grad_norm": 0.31600379943847656, + "learning_rate": 8.51007811889214e-06, + "loss": 0.4368, + "step": 3024 + }, + { + "epoch": 1.6354298071724636, + "grad_norm": 0.33969056606292725, + "learning_rate": 8.50873332453931e-06, + "loss": 0.4142, + "step": 3025 + }, + { + "epoch": 1.6359704451252477, + "grad_norm": 0.33624815940856934, + "learning_rate": 8.507388029917646e-06, + "loss": 0.4426, + "step": 3026 + }, + { + "epoch": 1.636511083078032, + "grad_norm": 0.3214746117591858, + "learning_rate": 8.506042235218955e-06, + "loss": 0.4558, + "step": 3027 + }, + { + "epoch": 1.6370517210308164, + "grad_norm": 0.3398401439189911, + "learning_rate": 8.504695940635117e-06, + "loss": 0.4614, + "step": 3028 + }, + { + "epoch": 1.6375923589836008, + "grad_norm": 0.3270202875137329, + "learning_rate": 8.50334914635809e-06, + "loss": 0.4314, + "step": 3029 + }, + { + "epoch": 1.638132996936385, + "grad_norm": 0.3382250666618347, + "learning_rate": 8.50200185257989e-06, + "loss": 0.3988, + "step": 3030 + }, + { + "epoch": 1.638673634889169, + "grad_norm": 0.37231630086898804, + "learning_rate": 8.500654059492618e-06, + "loss": 0.4684, + "step": 3031 + }, + { + "epoch": 1.6392142728419534, + "grad_norm": 0.3468717336654663, + "learning_rate": 8.499305767288438e-06, + "loss": 0.4703, + "step": 3032 + }, + { + "epoch": 1.6397549107947378, + "grad_norm": 0.33282721042633057, + "learning_rate": 8.497956976159585e-06, + "loss": 0.4088, + "step": 3033 + }, + { + "epoch": 1.6402955487475221, + "grad_norm": 0.42642226815223694, + "learning_rate": 8.496607686298368e-06, + "loss": 0.4518, + "step": 3034 + }, + { + "epoch": 1.6408361867003065, + "grad_norm": 0.3754686713218689, + "learning_rate": 8.495257897897166e-06, + "loss": 0.4452, + "step": 3035 + }, + { + "epoch": 1.6413768246530906, + "grad_norm": 0.31679567694664, + "learning_rate": 8.493907611148433e-06, + "loss": 0.4025, + "step": 3036 + }, + { + "epoch": 1.6419174626058748, + "grad_norm": 0.4182370901107788, + "learning_rate": 8.492556826244687e-06, + "loss": 0.4908, + "step": 3037 + }, + { + "epoch": 1.6424581005586592, + "grad_norm": 0.3175565302371979, + "learning_rate": 8.491205543378518e-06, + "loss": 0.4201, + "step": 3038 + }, + { + "epoch": 1.6429987385114435, + "grad_norm": 0.36803102493286133, + "learning_rate": 8.489853762742596e-06, + "loss": 0.4387, + "step": 3039 + }, + { + "epoch": 1.6435393764642279, + "grad_norm": 0.3387838900089264, + "learning_rate": 8.48850148452965e-06, + "loss": 0.4604, + "step": 3040 + }, + { + "epoch": 1.6440800144170122, + "grad_norm": 0.3143702447414398, + "learning_rate": 8.487148708932489e-06, + "loss": 0.4285, + "step": 3041 + }, + { + "epoch": 1.6446206523697964, + "grad_norm": 0.36707639694213867, + "learning_rate": 8.485795436143987e-06, + "loss": 0.4527, + "step": 3042 + }, + { + "epoch": 1.6451612903225805, + "grad_norm": 0.34523433446884155, + "learning_rate": 8.484441666357093e-06, + "loss": 0.444, + "step": 3043 + }, + { + "epoch": 1.6457019282753649, + "grad_norm": 0.371557354927063, + "learning_rate": 8.48308739976482e-06, + "loss": 0.4698, + "step": 3044 + }, + { + "epoch": 1.6462425662281492, + "grad_norm": 0.33267924189567566, + "learning_rate": 8.481732636560266e-06, + "loss": 0.41, + "step": 3045 + }, + { + "epoch": 1.6467832041809336, + "grad_norm": 0.36271023750305176, + "learning_rate": 8.480377376936582e-06, + "loss": 0.434, + "step": 3046 + }, + { + "epoch": 1.6473238421337177, + "grad_norm": 0.31835541129112244, + "learning_rate": 8.479021621087002e-06, + "loss": 0.402, + "step": 3047 + }, + { + "epoch": 1.647864480086502, + "grad_norm": 0.3511029779911041, + "learning_rate": 8.477665369204829e-06, + "loss": 0.434, + "step": 3048 + }, + { + "epoch": 1.6484051180392862, + "grad_norm": 0.3473978638648987, + "learning_rate": 8.476308621483433e-06, + "loss": 0.4391, + "step": 3049 + }, + { + "epoch": 1.6489457559920706, + "grad_norm": 0.3333207070827484, + "learning_rate": 8.474951378116253e-06, + "loss": 0.4286, + "step": 3050 + }, + { + "epoch": 1.649486393944855, + "grad_norm": 0.3969205617904663, + "learning_rate": 8.473593639296811e-06, + "loss": 0.4527, + "step": 3051 + }, + { + "epoch": 1.6500270318976393, + "grad_norm": 0.30005496740341187, + "learning_rate": 8.472235405218682e-06, + "loss": 0.414, + "step": 3052 + }, + { + "epoch": 1.6505676698504235, + "grad_norm": 0.41392895579338074, + "learning_rate": 8.470876676075528e-06, + "loss": 0.4504, + "step": 3053 + }, + { + "epoch": 1.6511083078032078, + "grad_norm": 0.3443209230899811, + "learning_rate": 8.46951745206107e-06, + "loss": 0.4172, + "step": 3054 + }, + { + "epoch": 1.651648945755992, + "grad_norm": 0.3992152512073517, + "learning_rate": 8.468157733369102e-06, + "loss": 0.4553, + "step": 3055 + }, + { + "epoch": 1.6521895837087763, + "grad_norm": 0.37463444471359253, + "learning_rate": 8.466797520193492e-06, + "loss": 0.4543, + "step": 3056 + }, + { + "epoch": 1.6527302216615607, + "grad_norm": 0.33600836992263794, + "learning_rate": 8.465436812728181e-06, + "loss": 0.4298, + "step": 3057 + }, + { + "epoch": 1.653270859614345, + "grad_norm": 0.4305607080459595, + "learning_rate": 8.46407561116717e-06, + "loss": 0.4148, + "step": 3058 + }, + { + "epoch": 1.6538114975671292, + "grad_norm": 0.35718268156051636, + "learning_rate": 8.46271391570454e-06, + "loss": 0.41, + "step": 3059 + }, + { + "epoch": 1.6543521355199133, + "grad_norm": 0.4128228724002838, + "learning_rate": 8.461351726534438e-06, + "loss": 0.4067, + "step": 3060 + }, + { + "epoch": 1.6548927734726977, + "grad_norm": 0.3360671401023865, + "learning_rate": 8.459989043851082e-06, + "loss": 0.4042, + "step": 3061 + }, + { + "epoch": 1.655433411425482, + "grad_norm": 0.33517733216285706, + "learning_rate": 8.45862586784876e-06, + "loss": 0.4024, + "step": 3062 + }, + { + "epoch": 1.6559740493782664, + "grad_norm": 0.4027295708656311, + "learning_rate": 8.457262198721836e-06, + "loss": 0.447, + "step": 3063 + }, + { + "epoch": 1.6565146873310508, + "grad_norm": 0.3583768308162689, + "learning_rate": 8.455898036664734e-06, + "loss": 0.4516, + "step": 3064 + }, + { + "epoch": 1.657055325283835, + "grad_norm": 0.39112964272499084, + "learning_rate": 8.454533381871957e-06, + "loss": 0.4538, + "step": 3065 + }, + { + "epoch": 1.657595963236619, + "grad_norm": 0.4554952383041382, + "learning_rate": 8.453168234538075e-06, + "loss": 0.4292, + "step": 3066 + }, + { + "epoch": 1.6581366011894034, + "grad_norm": 0.42617353796958923, + "learning_rate": 8.451802594857725e-06, + "loss": 0.4653, + "step": 3067 + }, + { + "epoch": 1.6586772391421878, + "grad_norm": 0.3546878695487976, + "learning_rate": 8.45043646302562e-06, + "loss": 0.3308, + "step": 3068 + }, + { + "epoch": 1.6592178770949721, + "grad_norm": 0.4163733124732971, + "learning_rate": 8.44906983923654e-06, + "loss": 0.4218, + "step": 3069 + }, + { + "epoch": 1.6597585150477565, + "grad_norm": 0.42740800976753235, + "learning_rate": 8.447702723685335e-06, + "loss": 0.4925, + "step": 3070 + }, + { + "epoch": 1.6602991530005407, + "grad_norm": 0.3994062840938568, + "learning_rate": 8.446335116566927e-06, + "loss": 0.4142, + "step": 3071 + }, + { + "epoch": 1.6608397909533248, + "grad_norm": 0.3746875524520874, + "learning_rate": 8.44496701807631e-06, + "loss": 0.4115, + "step": 3072 + }, + { + "epoch": 1.6613804289061092, + "grad_norm": 0.41194552183151245, + "learning_rate": 8.443598428408537e-06, + "loss": 0.4565, + "step": 3073 + }, + { + "epoch": 1.6619210668588935, + "grad_norm": 0.3965880870819092, + "learning_rate": 8.442229347758748e-06, + "loss": 0.4606, + "step": 3074 + }, + { + "epoch": 1.6624617048116779, + "grad_norm": 0.386476069688797, + "learning_rate": 8.440859776322137e-06, + "loss": 0.3842, + "step": 3075 + }, + { + "epoch": 1.663002342764462, + "grad_norm": 0.348588228225708, + "learning_rate": 8.439489714293978e-06, + "loss": 0.4534, + "step": 3076 + }, + { + "epoch": 1.6635429807172464, + "grad_norm": 0.38109901547431946, + "learning_rate": 8.43811916186961e-06, + "loss": 0.4271, + "step": 3077 + }, + { + "epoch": 1.6640836186700305, + "grad_norm": 0.42218753695487976, + "learning_rate": 8.43674811924445e-06, + "loss": 0.4368, + "step": 3078 + }, + { + "epoch": 1.6646242566228149, + "grad_norm": 0.34757694602012634, + "learning_rate": 8.435376586613972e-06, + "loss": 0.4115, + "step": 3079 + }, + { + "epoch": 1.6651648945755992, + "grad_norm": 0.4021320343017578, + "learning_rate": 8.43400456417373e-06, + "loss": 0.4767, + "step": 3080 + }, + { + "epoch": 1.6657055325283836, + "grad_norm": 0.36043521761894226, + "learning_rate": 8.432632052119342e-06, + "loss": 0.4206, + "step": 3081 + }, + { + "epoch": 1.6662461704811677, + "grad_norm": 0.37315505743026733, + "learning_rate": 8.431259050646502e-06, + "loss": 0.4035, + "step": 3082 + }, + { + "epoch": 1.666786808433952, + "grad_norm": 0.3432157635688782, + "learning_rate": 8.429885559950965e-06, + "loss": 0.4299, + "step": 3083 + }, + { + "epoch": 1.6673274463867362, + "grad_norm": 0.3866594731807709, + "learning_rate": 8.428511580228564e-06, + "loss": 0.4573, + "step": 3084 + }, + { + "epoch": 1.6678680843395206, + "grad_norm": 0.37218984961509705, + "learning_rate": 8.4271371116752e-06, + "loss": 0.4442, + "step": 3085 + }, + { + "epoch": 1.668408722292305, + "grad_norm": 0.405370831489563, + "learning_rate": 8.42576215448684e-06, + "loss": 0.4491, + "step": 3086 + }, + { + "epoch": 1.6689493602450893, + "grad_norm": 0.3809480667114258, + "learning_rate": 8.424386708859522e-06, + "loss": 0.4166, + "step": 3087 + }, + { + "epoch": 1.6694899981978735, + "grad_norm": 0.36650386452674866, + "learning_rate": 8.423010774989357e-06, + "loss": 0.4182, + "step": 3088 + }, + { + "epoch": 1.6700306361506576, + "grad_norm": 0.4135519564151764, + "learning_rate": 8.421634353072522e-06, + "loss": 0.4545, + "step": 3089 + }, + { + "epoch": 1.670571274103442, + "grad_norm": 0.33149343729019165, + "learning_rate": 8.420257443305264e-06, + "loss": 0.4127, + "step": 3090 + }, + { + "epoch": 1.6711119120562263, + "grad_norm": 0.40432754158973694, + "learning_rate": 8.418880045883902e-06, + "loss": 0.4292, + "step": 3091 + }, + { + "epoch": 1.6716525500090107, + "grad_norm": 0.3729030191898346, + "learning_rate": 8.41750216100482e-06, + "loss": 0.4231, + "step": 3092 + }, + { + "epoch": 1.672193187961795, + "grad_norm": 0.3287682831287384, + "learning_rate": 8.416123788864478e-06, + "loss": 0.4342, + "step": 3093 + }, + { + "epoch": 1.6727338259145792, + "grad_norm": 0.43830937147140503, + "learning_rate": 8.4147449296594e-06, + "loss": 0.406, + "step": 3094 + }, + { + "epoch": 1.6732744638673633, + "grad_norm": 0.3575669825077057, + "learning_rate": 8.41336558358618e-06, + "loss": 0.4064, + "step": 3095 + }, + { + "epoch": 1.6738151018201477, + "grad_norm": 0.4404871165752411, + "learning_rate": 8.411985750841484e-06, + "loss": 0.4708, + "step": 3096 + }, + { + "epoch": 1.674355739772932, + "grad_norm": 0.33472740650177, + "learning_rate": 8.410605431622048e-06, + "loss": 0.411, + "step": 3097 + }, + { + "epoch": 1.6748963777257164, + "grad_norm": 0.3318942189216614, + "learning_rate": 8.409224626124672e-06, + "loss": 0.4288, + "step": 3098 + }, + { + "epoch": 1.6754370156785008, + "grad_norm": 0.38175883889198303, + "learning_rate": 8.40784333454623e-06, + "loss": 0.416, + "step": 3099 + }, + { + "epoch": 1.675977653631285, + "grad_norm": 0.3427804708480835, + "learning_rate": 8.406461557083666e-06, + "loss": 0.448, + "step": 3100 + }, + { + "epoch": 1.676518291584069, + "grad_norm": 0.36495766043663025, + "learning_rate": 8.405079293933986e-06, + "loss": 0.4756, + "step": 3101 + }, + { + "epoch": 1.6770589295368534, + "grad_norm": 0.3412352204322815, + "learning_rate": 8.403696545294276e-06, + "loss": 0.4158, + "step": 3102 + }, + { + "epoch": 1.6775995674896378, + "grad_norm": 0.31895023584365845, + "learning_rate": 8.402313311361684e-06, + "loss": 0.4354, + "step": 3103 + }, + { + "epoch": 1.6781402054424222, + "grad_norm": 0.41563037037849426, + "learning_rate": 8.400929592333429e-06, + "loss": 0.4755, + "step": 3104 + }, + { + "epoch": 1.6786808433952063, + "grad_norm": 0.3583192825317383, + "learning_rate": 8.399545388406798e-06, + "loss": 0.446, + "step": 3105 + }, + { + "epoch": 1.6792214813479907, + "grad_norm": 0.32093483209609985, + "learning_rate": 8.39816069977915e-06, + "loss": 0.4305, + "step": 3106 + }, + { + "epoch": 1.6797621193007748, + "grad_norm": 0.4041621685028076, + "learning_rate": 8.396775526647911e-06, + "loss": 0.4506, + "step": 3107 + }, + { + "epoch": 1.6803027572535592, + "grad_norm": 0.3379467725753784, + "learning_rate": 8.395389869210576e-06, + "loss": 0.3782, + "step": 3108 + }, + { + "epoch": 1.6808433952063435, + "grad_norm": 0.37511587142944336, + "learning_rate": 8.39400372766471e-06, + "loss": 0.4389, + "step": 3109 + }, + { + "epoch": 1.6813840331591279, + "grad_norm": 0.29660964012145996, + "learning_rate": 8.392617102207945e-06, + "loss": 0.3928, + "step": 3110 + }, + { + "epoch": 1.681924671111912, + "grad_norm": 0.38168632984161377, + "learning_rate": 8.391229993037986e-06, + "loss": 0.4428, + "step": 3111 + }, + { + "epoch": 1.6824653090646964, + "grad_norm": 0.36777037382125854, + "learning_rate": 8.389842400352603e-06, + "loss": 0.4462, + "step": 3112 + }, + { + "epoch": 1.6830059470174805, + "grad_norm": 0.3301421105861664, + "learning_rate": 8.388454324349636e-06, + "loss": 0.4568, + "step": 3113 + }, + { + "epoch": 1.6835465849702649, + "grad_norm": 0.3039977252483368, + "learning_rate": 8.387065765226995e-06, + "loss": 0.3929, + "step": 3114 + }, + { + "epoch": 1.6840872229230492, + "grad_norm": 0.37604808807373047, + "learning_rate": 8.38567672318266e-06, + "loss": 0.4395, + "step": 3115 + }, + { + "epoch": 1.6846278608758336, + "grad_norm": 0.31758004426956177, + "learning_rate": 8.384287198414676e-06, + "loss": 0.4309, + "step": 3116 + }, + { + "epoch": 1.6851684988286177, + "grad_norm": 0.3382377624511719, + "learning_rate": 8.382897191121157e-06, + "loss": 0.4281, + "step": 3117 + }, + { + "epoch": 1.685709136781402, + "grad_norm": 0.37232866883277893, + "learning_rate": 8.381506701500292e-06, + "loss": 0.4368, + "step": 3118 + }, + { + "epoch": 1.6862497747341862, + "grad_norm": 0.36527279019355774, + "learning_rate": 8.380115729750333e-06, + "loss": 0.4015, + "step": 3119 + }, + { + "epoch": 1.6867904126869706, + "grad_norm": 0.3765943646430969, + "learning_rate": 8.3787242760696e-06, + "loss": 0.4732, + "step": 3120 + }, + { + "epoch": 1.687331050639755, + "grad_norm": 0.3340546786785126, + "learning_rate": 8.377332340656488e-06, + "loss": 0.4017, + "step": 3121 + }, + { + "epoch": 1.6878716885925393, + "grad_norm": 0.412190318107605, + "learning_rate": 8.375939923709453e-06, + "loss": 0.474, + "step": 3122 + }, + { + "epoch": 1.6884123265453235, + "grad_norm": 0.3258398175239563, + "learning_rate": 8.374547025427024e-06, + "loss": 0.4413, + "step": 3123 + }, + { + "epoch": 1.6889529644981076, + "grad_norm": 0.31814485788345337, + "learning_rate": 8.373153646007802e-06, + "loss": 0.4178, + "step": 3124 + }, + { + "epoch": 1.689493602450892, + "grad_norm": 0.39647987484931946, + "learning_rate": 8.371759785650444e-06, + "loss": 0.456, + "step": 3125 + }, + { + "epoch": 1.6900342404036763, + "grad_norm": 0.3643586039543152, + "learning_rate": 8.370365444553692e-06, + "loss": 0.4342, + "step": 3126 + }, + { + "epoch": 1.6905748783564607, + "grad_norm": 0.33919721841812134, + "learning_rate": 8.368970622916346e-06, + "loss": 0.4159, + "step": 3127 + }, + { + "epoch": 1.691115516309245, + "grad_norm": 0.38754889369010925, + "learning_rate": 8.367575320937276e-06, + "loss": 0.4435, + "step": 3128 + }, + { + "epoch": 1.6916561542620292, + "grad_norm": 0.3378376364707947, + "learning_rate": 8.366179538815424e-06, + "loss": 0.4812, + "step": 3129 + }, + { + "epoch": 1.6921967922148133, + "grad_norm": 0.33486446738243103, + "learning_rate": 8.364783276749794e-06, + "loss": 0.4108, + "step": 3130 + }, + { + "epoch": 1.6927374301675977, + "grad_norm": 0.3412155210971832, + "learning_rate": 8.363386534939467e-06, + "loss": 0.4107, + "step": 3131 + }, + { + "epoch": 1.693278068120382, + "grad_norm": 0.36446821689605713, + "learning_rate": 8.361989313583586e-06, + "loss": 0.4241, + "step": 3132 + }, + { + "epoch": 1.6938187060731664, + "grad_norm": 0.3438349962234497, + "learning_rate": 8.360591612881363e-06, + "loss": 0.4735, + "step": 3133 + }, + { + "epoch": 1.6943593440259508, + "grad_norm": 0.3088810443878174, + "learning_rate": 8.359193433032083e-06, + "loss": 0.4163, + "step": 3134 + }, + { + "epoch": 1.694899981978735, + "grad_norm": 0.37794172763824463, + "learning_rate": 8.357794774235094e-06, + "loss": 0.4534, + "step": 3135 + }, + { + "epoch": 1.695440619931519, + "grad_norm": 0.3165605068206787, + "learning_rate": 8.356395636689811e-06, + "loss": 0.3943, + "step": 3136 + }, + { + "epoch": 1.6959812578843034, + "grad_norm": 0.33255523443222046, + "learning_rate": 8.354996020595728e-06, + "loss": 0.4647, + "step": 3137 + }, + { + "epoch": 1.6965218958370878, + "grad_norm": 0.30814820528030396, + "learning_rate": 8.353595926152391e-06, + "loss": 0.4561, + "step": 3138 + }, + { + "epoch": 1.6970625337898722, + "grad_norm": 0.34673863649368286, + "learning_rate": 8.35219535355943e-06, + "loss": 0.419, + "step": 3139 + }, + { + "epoch": 1.6976031717426563, + "grad_norm": 0.32622379064559937, + "learning_rate": 8.350794303016533e-06, + "loss": 0.4528, + "step": 3140 + }, + { + "epoch": 1.6981438096954407, + "grad_norm": 0.2980291545391083, + "learning_rate": 8.349392774723459e-06, + "loss": 0.4075, + "step": 3141 + }, + { + "epoch": 1.6986844476482248, + "grad_norm": 0.3580862879753113, + "learning_rate": 8.347990768880036e-06, + "loss": 0.4604, + "step": 3142 + }, + { + "epoch": 1.6992250856010092, + "grad_norm": 0.3246307969093323, + "learning_rate": 8.34658828568616e-06, + "loss": 0.3947, + "step": 3143 + }, + { + "epoch": 1.6997657235537935, + "grad_norm": 0.30707135796546936, + "learning_rate": 8.345185325341794e-06, + "loss": 0.405, + "step": 3144 + }, + { + "epoch": 1.7003063615065779, + "grad_norm": 0.311701238155365, + "learning_rate": 8.343781888046971e-06, + "loss": 0.4258, + "step": 3145 + }, + { + "epoch": 1.700846999459362, + "grad_norm": 0.2869536280632019, + "learning_rate": 8.342377974001787e-06, + "loss": 0.3938, + "step": 3146 + }, + { + "epoch": 1.7013876374121464, + "grad_norm": 0.3633367717266083, + "learning_rate": 8.340973583406412e-06, + "loss": 0.4839, + "step": 3147 + }, + { + "epoch": 1.7019282753649305, + "grad_norm": 0.3297747075557709, + "learning_rate": 8.339568716461082e-06, + "loss": 0.4462, + "step": 3148 + }, + { + "epoch": 1.7024689133177149, + "grad_norm": 0.28560447692871094, + "learning_rate": 8.338163373366099e-06, + "loss": 0.4178, + "step": 3149 + }, + { + "epoch": 1.7030095512704992, + "grad_norm": 0.31055206060409546, + "learning_rate": 8.336757554321832e-06, + "loss": 0.4093, + "step": 3150 + }, + { + "epoch": 1.7035501892232836, + "grad_norm": 0.3611248731613159, + "learning_rate": 8.335351259528726e-06, + "loss": 0.4674, + "step": 3151 + }, + { + "epoch": 1.7040908271760677, + "grad_norm": 0.31785741448402405, + "learning_rate": 8.333944489187284e-06, + "loss": 0.4416, + "step": 3152 + }, + { + "epoch": 1.7046314651288519, + "grad_norm": 0.3182879388332367, + "learning_rate": 8.332537243498082e-06, + "loss": 0.4145, + "step": 3153 + }, + { + "epoch": 1.7051721030816362, + "grad_norm": 0.3585261106491089, + "learning_rate": 8.331129522661761e-06, + "loss": 0.4515, + "step": 3154 + }, + { + "epoch": 1.7057127410344206, + "grad_norm": 0.3109356760978699, + "learning_rate": 8.329721326879032e-06, + "loss": 0.4235, + "step": 3155 + }, + { + "epoch": 1.706253378987205, + "grad_norm": 0.38349929451942444, + "learning_rate": 8.328312656350675e-06, + "loss": 0.4808, + "step": 3156 + }, + { + "epoch": 1.7067940169399893, + "grad_norm": 0.32526955008506775, + "learning_rate": 8.326903511277535e-06, + "loss": 0.4377, + "step": 3157 + }, + { + "epoch": 1.7073346548927735, + "grad_norm": 0.34731414914131165, + "learning_rate": 8.32549389186052e-06, + "loss": 0.4497, + "step": 3158 + }, + { + "epoch": 1.7078752928455576, + "grad_norm": 0.37489038705825806, + "learning_rate": 8.32408379830062e-06, + "loss": 0.4258, + "step": 3159 + }, + { + "epoch": 1.708415930798342, + "grad_norm": 0.3703058958053589, + "learning_rate": 8.322673230798877e-06, + "loss": 0.4201, + "step": 3160 + }, + { + "epoch": 1.7089565687511263, + "grad_norm": 0.32067111134529114, + "learning_rate": 8.32126218955641e-06, + "loss": 0.4305, + "step": 3161 + }, + { + "epoch": 1.7094972067039107, + "grad_norm": 0.3572859466075897, + "learning_rate": 8.319850674774401e-06, + "loss": 0.4379, + "step": 3162 + }, + { + "epoch": 1.710037844656695, + "grad_norm": 0.3251093924045563, + "learning_rate": 8.318438686654101e-06, + "loss": 0.4293, + "step": 3163 + }, + { + "epoch": 1.7105784826094792, + "grad_norm": 0.2939565181732178, + "learning_rate": 8.317026225396832e-06, + "loss": 0.42, + "step": 3164 + }, + { + "epoch": 1.7111191205622633, + "grad_norm": 0.3347533643245697, + "learning_rate": 8.315613291203977e-06, + "loss": 0.4308, + "step": 3165 + }, + { + "epoch": 1.7116597585150477, + "grad_norm": 0.30677613615989685, + "learning_rate": 8.31419988427699e-06, + "loss": 0.3876, + "step": 3166 + }, + { + "epoch": 1.712200396467832, + "grad_norm": 0.3365405201911926, + "learning_rate": 8.312786004817394e-06, + "loss": 0.4636, + "step": 3167 + }, + { + "epoch": 1.7127410344206164, + "grad_norm": 0.3438670337200165, + "learning_rate": 8.311371653026775e-06, + "loss": 0.4645, + "step": 3168 + }, + { + "epoch": 1.7132816723734006, + "grad_norm": 0.31338000297546387, + "learning_rate": 8.309956829106789e-06, + "loss": 0.4463, + "step": 3169 + }, + { + "epoch": 1.713822310326185, + "grad_norm": 0.32951968908309937, + "learning_rate": 8.30854153325916e-06, + "loss": 0.4077, + "step": 3170 + }, + { + "epoch": 1.714362948278969, + "grad_norm": 0.3423302173614502, + "learning_rate": 8.307125765685677e-06, + "loss": 0.4197, + "step": 3171 + }, + { + "epoch": 1.7149035862317534, + "grad_norm": 0.30592572689056396, + "learning_rate": 8.3057095265882e-06, + "loss": 0.4028, + "step": 3172 + }, + { + "epoch": 1.7154442241845378, + "grad_norm": 0.3736177682876587, + "learning_rate": 8.304292816168653e-06, + "loss": 0.4519, + "step": 3173 + }, + { + "epoch": 1.7159848621373222, + "grad_norm": 0.34032773971557617, + "learning_rate": 8.302875634629027e-06, + "loss": 0.4213, + "step": 3174 + }, + { + "epoch": 1.7165255000901063, + "grad_norm": 0.33573025465011597, + "learning_rate": 8.30145798217138e-06, + "loss": 0.4315, + "step": 3175 + }, + { + "epoch": 1.7170661380428907, + "grad_norm": 0.3742954134941101, + "learning_rate": 8.30003985899784e-06, + "loss": 0.4146, + "step": 3176 + }, + { + "epoch": 1.7176067759956748, + "grad_norm": 0.32530078291893005, + "learning_rate": 8.298621265310602e-06, + "loss": 0.4337, + "step": 3177 + }, + { + "epoch": 1.7181474139484592, + "grad_norm": 0.33018314838409424, + "learning_rate": 8.297202201311923e-06, + "loss": 0.4435, + "step": 3178 + }, + { + "epoch": 1.7186880519012435, + "grad_norm": 0.3356422781944275, + "learning_rate": 8.295782667204133e-06, + "loss": 0.4602, + "step": 3179 + }, + { + "epoch": 1.7192286898540279, + "grad_norm": 0.30430036783218384, + "learning_rate": 8.294362663189626e-06, + "loss": 0.383, + "step": 3180 + }, + { + "epoch": 1.719769327806812, + "grad_norm": 0.3536764979362488, + "learning_rate": 8.292942189470863e-06, + "loss": 0.4313, + "step": 3181 + }, + { + "epoch": 1.7203099657595962, + "grad_norm": 0.30938300490379333, + "learning_rate": 8.291521246250373e-06, + "loss": 0.3956, + "step": 3182 + }, + { + "epoch": 1.7208506037123805, + "grad_norm": 0.36522185802459717, + "learning_rate": 8.290099833730753e-06, + "loss": 0.461, + "step": 3183 + }, + { + "epoch": 1.7213912416651649, + "grad_norm": 0.3340790569782257, + "learning_rate": 8.288677952114663e-06, + "loss": 0.4611, + "step": 3184 + }, + { + "epoch": 1.7219318796179492, + "grad_norm": 0.31987613439559937, + "learning_rate": 8.287255601604834e-06, + "loss": 0.3968, + "step": 3185 + }, + { + "epoch": 1.7224725175707336, + "grad_norm": 0.35184627771377563, + "learning_rate": 8.285832782404061e-06, + "loss": 0.4627, + "step": 3186 + }, + { + "epoch": 1.7230131555235177, + "grad_norm": 0.29496318101882935, + "learning_rate": 8.284409494715208e-06, + "loss": 0.4, + "step": 3187 + }, + { + "epoch": 1.7235537934763019, + "grad_norm": 0.32895147800445557, + "learning_rate": 8.282985738741202e-06, + "loss": 0.4251, + "step": 3188 + }, + { + "epoch": 1.7240944314290862, + "grad_norm": 0.3419833481311798, + "learning_rate": 8.281561514685046e-06, + "loss": 0.4547, + "step": 3189 + }, + { + "epoch": 1.7246350693818706, + "grad_norm": 0.2946586310863495, + "learning_rate": 8.280136822749796e-06, + "loss": 0.3943, + "step": 3190 + }, + { + "epoch": 1.725175707334655, + "grad_norm": 0.3011445105075836, + "learning_rate": 8.278711663138585e-06, + "loss": 0.4056, + "step": 3191 + }, + { + "epoch": 1.7257163452874393, + "grad_norm": 0.36560511589050293, + "learning_rate": 8.277286036054611e-06, + "loss": 0.5145, + "step": 3192 + }, + { + "epoch": 1.7262569832402235, + "grad_norm": 0.2690065801143646, + "learning_rate": 8.275859941701137e-06, + "loss": 0.3663, + "step": 3193 + }, + { + "epoch": 1.7267976211930076, + "grad_norm": 0.347079336643219, + "learning_rate": 8.27443338028149e-06, + "loss": 0.444, + "step": 3194 + }, + { + "epoch": 1.727338259145792, + "grad_norm": 0.34917938709259033, + "learning_rate": 8.27300635199907e-06, + "loss": 0.4253, + "step": 3195 + }, + { + "epoch": 1.7278788970985763, + "grad_norm": 0.32194095849990845, + "learning_rate": 8.271578857057337e-06, + "loss": 0.4584, + "step": 3196 + }, + { + "epoch": 1.7284195350513607, + "grad_norm": 0.3533184826374054, + "learning_rate": 8.270150895659824e-06, + "loss": 0.4777, + "step": 3197 + }, + { + "epoch": 1.7289601730041448, + "grad_norm": 0.3372891843318939, + "learning_rate": 8.268722468010123e-06, + "loss": 0.4002, + "step": 3198 + }, + { + "epoch": 1.7295008109569292, + "grad_norm": 0.30643367767333984, + "learning_rate": 8.267293574311901e-06, + "loss": 0.4468, + "step": 3199 + }, + { + "epoch": 1.7300414489097133, + "grad_norm": 0.3147575855255127, + "learning_rate": 8.265864214768883e-06, + "loss": 0.3975, + "step": 3200 + }, + { + "epoch": 1.7305820868624977, + "grad_norm": 0.3653241693973541, + "learning_rate": 8.26443438958487e-06, + "loss": 0.4848, + "step": 3201 + }, + { + "epoch": 1.731122724815282, + "grad_norm": 0.3251360058784485, + "learning_rate": 8.263004098963719e-06, + "loss": 0.424, + "step": 3202 + }, + { + "epoch": 1.7316633627680664, + "grad_norm": 0.31904032826423645, + "learning_rate": 8.261573343109359e-06, + "loss": 0.4258, + "step": 3203 + }, + { + "epoch": 1.7322040007208506, + "grad_norm": 0.32821953296661377, + "learning_rate": 8.260142122225788e-06, + "loss": 0.4668, + "step": 3204 + }, + { + "epoch": 1.732744638673635, + "grad_norm": 0.34518858790397644, + "learning_rate": 8.25871043651706e-06, + "loss": 0.4592, + "step": 3205 + }, + { + "epoch": 1.733285276626419, + "grad_norm": 0.332118421792984, + "learning_rate": 8.25727828618731e-06, + "loss": 0.4063, + "step": 3206 + }, + { + "epoch": 1.7338259145792034, + "grad_norm": 0.34514275193214417, + "learning_rate": 8.255845671440726e-06, + "loss": 0.5073, + "step": 3207 + }, + { + "epoch": 1.7343665525319878, + "grad_norm": 0.3004824221134186, + "learning_rate": 8.25441259248157e-06, + "loss": 0.4085, + "step": 3208 + }, + { + "epoch": 1.7349071904847722, + "grad_norm": 0.29278916120529175, + "learning_rate": 8.252979049514168e-06, + "loss": 0.3678, + "step": 3209 + }, + { + "epoch": 1.7354478284375563, + "grad_norm": 0.36105865240097046, + "learning_rate": 8.25154504274291e-06, + "loss": 0.4853, + "step": 3210 + }, + { + "epoch": 1.7359884663903407, + "grad_norm": 0.31009751558303833, + "learning_rate": 8.250110572372255e-06, + "loss": 0.428, + "step": 3211 + }, + { + "epoch": 1.7365291043431248, + "grad_norm": 0.5618311762809753, + "learning_rate": 8.248675638606729e-06, + "loss": 0.4689, + "step": 3212 + }, + { + "epoch": 1.7370697422959092, + "grad_norm": 0.35816720128059387, + "learning_rate": 8.247240241650918e-06, + "loss": 0.4614, + "step": 3213 + }, + { + "epoch": 1.7376103802486935, + "grad_norm": 0.3334486484527588, + "learning_rate": 8.245804381709483e-06, + "loss": 0.422, + "step": 3214 + }, + { + "epoch": 1.7381510182014779, + "grad_norm": 0.34021323919296265, + "learning_rate": 8.244368058987145e-06, + "loss": 0.414, + "step": 3215 + }, + { + "epoch": 1.738691656154262, + "grad_norm": 0.3346158266067505, + "learning_rate": 8.24293127368869e-06, + "loss": 0.4688, + "step": 3216 + }, + { + "epoch": 1.7392322941070462, + "grad_norm": 0.350480318069458, + "learning_rate": 8.241494026018974e-06, + "loss": 0.4408, + "step": 3217 + }, + { + "epoch": 1.7397729320598305, + "grad_norm": 0.32346951961517334, + "learning_rate": 8.240056316182917e-06, + "loss": 0.4241, + "step": 3218 + }, + { + "epoch": 1.7403135700126149, + "grad_norm": 0.33489251136779785, + "learning_rate": 8.238618144385506e-06, + "loss": 0.4288, + "step": 3219 + }, + { + "epoch": 1.7408542079653992, + "grad_norm": 0.3779529929161072, + "learning_rate": 8.237179510831792e-06, + "loss": 0.4622, + "step": 3220 + }, + { + "epoch": 1.7413948459181836, + "grad_norm": 0.31131255626678467, + "learning_rate": 8.23574041572689e-06, + "loss": 0.4228, + "step": 3221 + }, + { + "epoch": 1.7419354838709677, + "grad_norm": 0.3840460777282715, + "learning_rate": 8.234300859275989e-06, + "loss": 0.4145, + "step": 3222 + }, + { + "epoch": 1.7424761218237519, + "grad_norm": 0.3488578200340271, + "learning_rate": 8.232860841684333e-06, + "loss": 0.4694, + "step": 3223 + }, + { + "epoch": 1.7430167597765363, + "grad_norm": 0.3081468343734741, + "learning_rate": 8.231420363157243e-06, + "loss": 0.3967, + "step": 3224 + }, + { + "epoch": 1.7435573977293206, + "grad_norm": 0.3889080584049225, + "learning_rate": 8.229979423900095e-06, + "loss": 0.448, + "step": 3225 + }, + { + "epoch": 1.744098035682105, + "grad_norm": 0.3414023816585541, + "learning_rate": 8.228538024118338e-06, + "loss": 0.4822, + "step": 3226 + }, + { + "epoch": 1.7446386736348891, + "grad_norm": 0.30643653869628906, + "learning_rate": 8.227096164017482e-06, + "loss": 0.4196, + "step": 3227 + }, + { + "epoch": 1.7451793115876735, + "grad_norm": 0.36826562881469727, + "learning_rate": 8.225653843803107e-06, + "loss": 0.4308, + "step": 3228 + }, + { + "epoch": 1.7457199495404576, + "grad_norm": 0.37800219655036926, + "learning_rate": 8.224211063680854e-06, + "loss": 0.4503, + "step": 3229 + }, + { + "epoch": 1.746260587493242, + "grad_norm": 0.3215877115726471, + "learning_rate": 8.222767823856435e-06, + "loss": 0.3991, + "step": 3230 + }, + { + "epoch": 1.7468012254460263, + "grad_norm": 0.40726178884506226, + "learning_rate": 8.221324124535622e-06, + "loss": 0.4337, + "step": 3231 + }, + { + "epoch": 1.7473418633988107, + "grad_norm": 0.3118278682231903, + "learning_rate": 8.219879965924255e-06, + "loss": 0.4753, + "step": 3232 + }, + { + "epoch": 1.7478825013515948, + "grad_norm": 0.31255823373794556, + "learning_rate": 8.218435348228241e-06, + "loss": 0.3601, + "step": 3233 + }, + { + "epoch": 1.7484231393043792, + "grad_norm": 0.3631909489631653, + "learning_rate": 8.216990271653553e-06, + "loss": 0.4126, + "step": 3234 + }, + { + "epoch": 1.7489637772571633, + "grad_norm": 0.34900087118148804, + "learning_rate": 8.215544736406223e-06, + "loss": 0.4599, + "step": 3235 + }, + { + "epoch": 1.7495044152099477, + "grad_norm": 0.2817728519439697, + "learning_rate": 8.214098742692353e-06, + "loss": 0.3794, + "step": 3236 + }, + { + "epoch": 1.750045053162732, + "grad_norm": 0.4058758616447449, + "learning_rate": 8.212652290718113e-06, + "loss": 0.4523, + "step": 3237 + }, + { + "epoch": 1.7505856911155164, + "grad_norm": 0.3100168704986572, + "learning_rate": 8.211205380689735e-06, + "loss": 0.4086, + "step": 3238 + }, + { + "epoch": 1.7511263290683006, + "grad_norm": 0.3628126084804535, + "learning_rate": 8.209758012813515e-06, + "loss": 0.4776, + "step": 3239 + }, + { + "epoch": 1.751666967021085, + "grad_norm": 0.29844430088996887, + "learning_rate": 8.20831018729582e-06, + "loss": 0.406, + "step": 3240 + }, + { + "epoch": 1.752207604973869, + "grad_norm": 0.3558284044265747, + "learning_rate": 8.206861904343074e-06, + "loss": 0.4451, + "step": 3241 + }, + { + "epoch": 1.7527482429266534, + "grad_norm": 0.35172179341316223, + "learning_rate": 8.20541316416177e-06, + "loss": 0.4426, + "step": 3242 + }, + { + "epoch": 1.7532888808794378, + "grad_norm": 0.36479464173316956, + "learning_rate": 8.20396396695847e-06, + "loss": 0.4168, + "step": 3243 + }, + { + "epoch": 1.7538295188322222, + "grad_norm": 0.3374549150466919, + "learning_rate": 8.202514312939798e-06, + "loss": 0.4347, + "step": 3244 + }, + { + "epoch": 1.7543701567850063, + "grad_norm": 0.357465535402298, + "learning_rate": 8.20106420231244e-06, + "loss": 0.4579, + "step": 3245 + }, + { + "epoch": 1.7549107947377904, + "grad_norm": 0.41111886501312256, + "learning_rate": 8.199613635283154e-06, + "loss": 0.45, + "step": 3246 + }, + { + "epoch": 1.7554514326905748, + "grad_norm": 0.33777353167533875, + "learning_rate": 8.198162612058755e-06, + "loss": 0.3852, + "step": 3247 + }, + { + "epoch": 1.7559920706433592, + "grad_norm": 0.34722068905830383, + "learning_rate": 8.19671113284613e-06, + "loss": 0.4177, + "step": 3248 + }, + { + "epoch": 1.7565327085961435, + "grad_norm": 0.3915463984012604, + "learning_rate": 8.19525919785223e-06, + "loss": 0.4462, + "step": 3249 + }, + { + "epoch": 1.7570733465489279, + "grad_norm": 0.3315128982067108, + "learning_rate": 8.193806807284064e-06, + "loss": 0.4572, + "step": 3250 + }, + { + "epoch": 1.757613984501712, + "grad_norm": 0.3063032925128937, + "learning_rate": 8.192353961348717e-06, + "loss": 0.4245, + "step": 3251 + }, + { + "epoch": 1.7581546224544962, + "grad_norm": 0.3548220992088318, + "learning_rate": 8.190900660253327e-06, + "loss": 0.4619, + "step": 3252 + }, + { + "epoch": 1.7586952604072805, + "grad_norm": 0.3348757028579712, + "learning_rate": 8.189446904205107e-06, + "loss": 0.4173, + "step": 3253 + }, + { + "epoch": 1.7592358983600649, + "grad_norm": 0.3433801233768463, + "learning_rate": 8.187992693411333e-06, + "loss": 0.4607, + "step": 3254 + }, + { + "epoch": 1.7597765363128492, + "grad_norm": 0.30504295229911804, + "learning_rate": 8.186538028079338e-06, + "loss": 0.396, + "step": 3255 + }, + { + "epoch": 1.7603171742656336, + "grad_norm": 0.349337100982666, + "learning_rate": 8.18508290841653e-06, + "loss": 0.4742, + "step": 3256 + }, + { + "epoch": 1.7608578122184178, + "grad_norm": 0.31097519397735596, + "learning_rate": 8.183627334630376e-06, + "loss": 0.3798, + "step": 3257 + }, + { + "epoch": 1.761398450171202, + "grad_norm": 0.36984965205192566, + "learning_rate": 8.182171306928407e-06, + "loss": 0.4295, + "step": 3258 + }, + { + "epoch": 1.7619390881239863, + "grad_norm": 0.32913482189178467, + "learning_rate": 8.180714825518223e-06, + "loss": 0.4201, + "step": 3259 + }, + { + "epoch": 1.7624797260767706, + "grad_norm": 0.3395659923553467, + "learning_rate": 8.179257890607489e-06, + "loss": 0.3953, + "step": 3260 + }, + { + "epoch": 1.763020364029555, + "grad_norm": 0.34365546703338623, + "learning_rate": 8.177800502403928e-06, + "loss": 0.4944, + "step": 3261 + }, + { + "epoch": 1.7635610019823391, + "grad_norm": 0.3863326609134674, + "learning_rate": 8.176342661115332e-06, + "loss": 0.4325, + "step": 3262 + }, + { + "epoch": 1.7641016399351235, + "grad_norm": 0.3482382297515869, + "learning_rate": 8.174884366949558e-06, + "loss": 0.4258, + "step": 3263 + }, + { + "epoch": 1.7646422778879076, + "grad_norm": 0.3736988306045532, + "learning_rate": 8.173425620114529e-06, + "loss": 0.4245, + "step": 3264 + }, + { + "epoch": 1.765182915840692, + "grad_norm": 0.40473508834838867, + "learning_rate": 8.171966420818227e-06, + "loss": 0.4628, + "step": 3265 + }, + { + "epoch": 1.7657235537934763, + "grad_norm": 0.3368479907512665, + "learning_rate": 8.170506769268706e-06, + "loss": 0.4379, + "step": 3266 + }, + { + "epoch": 1.7662641917462607, + "grad_norm": 0.4241042137145996, + "learning_rate": 8.16904666567408e-06, + "loss": 0.4502, + "step": 3267 + }, + { + "epoch": 1.7668048296990448, + "grad_norm": 0.354518860578537, + "learning_rate": 8.167586110242522e-06, + "loss": 0.438, + "step": 3268 + }, + { + "epoch": 1.7673454676518292, + "grad_norm": 0.3048177659511566, + "learning_rate": 8.16612510318228e-06, + "loss": 0.4042, + "step": 3269 + }, + { + "epoch": 1.7678861056046133, + "grad_norm": 0.32324472069740295, + "learning_rate": 8.164663644701662e-06, + "loss": 0.4426, + "step": 3270 + }, + { + "epoch": 1.7684267435573977, + "grad_norm": 0.3259657919406891, + "learning_rate": 8.163201735009041e-06, + "loss": 0.4184, + "step": 3271 + }, + { + "epoch": 1.768967381510182, + "grad_norm": 0.3852538466453552, + "learning_rate": 8.161739374312852e-06, + "loss": 0.4176, + "step": 3272 + }, + { + "epoch": 1.7695080194629664, + "grad_norm": 0.30960458517074585, + "learning_rate": 8.160276562821594e-06, + "loss": 0.4033, + "step": 3273 + }, + { + "epoch": 1.7700486574157506, + "grad_norm": 0.3406146466732025, + "learning_rate": 8.158813300743835e-06, + "loss": 0.3973, + "step": 3274 + }, + { + "epoch": 1.7705892953685347, + "grad_norm": 0.3403013348579407, + "learning_rate": 8.157349588288202e-06, + "loss": 0.4278, + "step": 3275 + }, + { + "epoch": 1.771129933321319, + "grad_norm": 0.3388356864452362, + "learning_rate": 8.155885425663389e-06, + "loss": 0.4677, + "step": 3276 + }, + { + "epoch": 1.7716705712741034, + "grad_norm": 0.3182254433631897, + "learning_rate": 8.154420813078155e-06, + "loss": 0.4176, + "step": 3277 + }, + { + "epoch": 1.7722112092268878, + "grad_norm": 0.34564125537872314, + "learning_rate": 8.15295575074132e-06, + "loss": 0.401, + "step": 3278 + }, + { + "epoch": 1.7727518471796722, + "grad_norm": 0.3367253839969635, + "learning_rate": 8.15149023886177e-06, + "loss": 0.4193, + "step": 3279 + }, + { + "epoch": 1.7732924851324563, + "grad_norm": 0.3374026119709015, + "learning_rate": 8.150024277648458e-06, + "loss": 0.4362, + "step": 3280 + }, + { + "epoch": 1.7738331230852404, + "grad_norm": 0.3175216615200043, + "learning_rate": 8.148557867310393e-06, + "loss": 0.426, + "step": 3281 + }, + { + "epoch": 1.7743737610380248, + "grad_norm": 0.3488418161869049, + "learning_rate": 8.147091008056658e-06, + "loss": 0.4802, + "step": 3282 + }, + { + "epoch": 1.7749143989908092, + "grad_norm": 0.34109702706336975, + "learning_rate": 8.145623700096394e-06, + "loss": 0.4823, + "step": 3283 + }, + { + "epoch": 1.7754550369435935, + "grad_norm": 0.3144458532333374, + "learning_rate": 8.144155943638804e-06, + "loss": 0.3939, + "step": 3284 + }, + { + "epoch": 1.7759956748963779, + "grad_norm": 0.33580702543258667, + "learning_rate": 8.142687738893161e-06, + "loss": 0.434, + "step": 3285 + }, + { + "epoch": 1.776536312849162, + "grad_norm": 0.3042031228542328, + "learning_rate": 8.1412190860688e-06, + "loss": 0.4064, + "step": 3286 + }, + { + "epoch": 1.7770769508019462, + "grad_norm": 0.30901986360549927, + "learning_rate": 8.139749985375113e-06, + "loss": 0.4476, + "step": 3287 + }, + { + "epoch": 1.7776175887547305, + "grad_norm": 0.32003363966941833, + "learning_rate": 8.138280437021569e-06, + "loss": 0.3802, + "step": 3288 + }, + { + "epoch": 1.7781582267075149, + "grad_norm": 0.3311749994754791, + "learning_rate": 8.13681044121769e-06, + "loss": 0.4242, + "step": 3289 + }, + { + "epoch": 1.7786988646602993, + "grad_norm": 0.3157782256603241, + "learning_rate": 8.135339998173064e-06, + "loss": 0.4406, + "step": 3290 + }, + { + "epoch": 1.7792395026130834, + "grad_norm": 0.34921789169311523, + "learning_rate": 8.133869108097349e-06, + "loss": 0.4257, + "step": 3291 + }, + { + "epoch": 1.7797801405658678, + "grad_norm": 0.3359015882015228, + "learning_rate": 8.132397771200256e-06, + "loss": 0.4576, + "step": 3292 + }, + { + "epoch": 1.780320778518652, + "grad_norm": 0.31737324595451355, + "learning_rate": 8.13092598769157e-06, + "loss": 0.4271, + "step": 3293 + }, + { + "epoch": 1.7808614164714363, + "grad_norm": 0.35905441641807556, + "learning_rate": 8.129453757781132e-06, + "loss": 0.4522, + "step": 3294 + }, + { + "epoch": 1.7814020544242206, + "grad_norm": 0.30049991607666016, + "learning_rate": 8.12798108167885e-06, + "loss": 0.3941, + "step": 3295 + }, + { + "epoch": 1.781942692377005, + "grad_norm": 0.34811657667160034, + "learning_rate": 8.1265079595947e-06, + "loss": 0.4553, + "step": 3296 + }, + { + "epoch": 1.7824833303297891, + "grad_norm": 0.31443458795547485, + "learning_rate": 8.125034391738712e-06, + "loss": 0.4523, + "step": 3297 + }, + { + "epoch": 1.7830239682825735, + "grad_norm": 0.29668858647346497, + "learning_rate": 8.123560378320988e-06, + "loss": 0.3684, + "step": 3298 + }, + { + "epoch": 1.7835646062353576, + "grad_norm": 0.3364712595939636, + "learning_rate": 8.122085919551685e-06, + "loss": 0.4364, + "step": 3299 + }, + { + "epoch": 1.784105244188142, + "grad_norm": 0.3368052542209625, + "learning_rate": 8.120611015641036e-06, + "loss": 0.4778, + "step": 3300 + }, + { + "epoch": 1.7846458821409263, + "grad_norm": 0.3242607116699219, + "learning_rate": 8.119135666799324e-06, + "loss": 0.4486, + "step": 3301 + }, + { + "epoch": 1.7851865200937107, + "grad_norm": 0.30162885785102844, + "learning_rate": 8.117659873236906e-06, + "loss": 0.4081, + "step": 3302 + }, + { + "epoch": 1.7857271580464948, + "grad_norm": 0.32313260436058044, + "learning_rate": 8.116183635164193e-06, + "loss": 0.4478, + "step": 3303 + }, + { + "epoch": 1.786267795999279, + "grad_norm": 0.34067660570144653, + "learning_rate": 8.11470695279167e-06, + "loss": 0.4393, + "step": 3304 + }, + { + "epoch": 1.7868084339520633, + "grad_norm": 0.30169734358787537, + "learning_rate": 8.113229826329876e-06, + "loss": 0.4139, + "step": 3305 + }, + { + "epoch": 1.7873490719048477, + "grad_norm": 0.3240569829940796, + "learning_rate": 8.11175225598942e-06, + "loss": 0.4215, + "step": 3306 + }, + { + "epoch": 1.787889709857632, + "grad_norm": 0.35240086913108826, + "learning_rate": 8.110274241980967e-06, + "loss": 0.4272, + "step": 3307 + }, + { + "epoch": 1.7884303478104164, + "grad_norm": 0.2917160093784332, + "learning_rate": 8.108795784515252e-06, + "loss": 0.4377, + "step": 3308 + }, + { + "epoch": 1.7889709857632006, + "grad_norm": 0.3504859507083893, + "learning_rate": 8.107316883803071e-06, + "loss": 0.4809, + "step": 3309 + }, + { + "epoch": 1.7895116237159847, + "grad_norm": 0.3004903197288513, + "learning_rate": 8.105837540055284e-06, + "loss": 0.3998, + "step": 3310 + }, + { + "epoch": 1.790052261668769, + "grad_norm": 0.3329935371875763, + "learning_rate": 8.10435775348281e-06, + "loss": 0.4854, + "step": 3311 + }, + { + "epoch": 1.7905928996215534, + "grad_norm": 0.28653180599212646, + "learning_rate": 8.102877524296637e-06, + "loss": 0.3824, + "step": 3312 + }, + { + "epoch": 1.7911335375743378, + "grad_norm": 0.34429025650024414, + "learning_rate": 8.101396852707811e-06, + "loss": 0.4145, + "step": 3313 + }, + { + "epoch": 1.7916741755271222, + "grad_norm": 0.33222684264183044, + "learning_rate": 8.099915738927446e-06, + "loss": 0.458, + "step": 3314 + }, + { + "epoch": 1.7922148134799063, + "grad_norm": 0.3459024727344513, + "learning_rate": 8.098434183166716e-06, + "loss": 0.4562, + "step": 3315 + }, + { + "epoch": 1.7927554514326904, + "grad_norm": 0.3265814781188965, + "learning_rate": 8.096952185636856e-06, + "loss": 0.4364, + "step": 3316 + }, + { + "epoch": 1.7932960893854748, + "grad_norm": 0.36054661870002747, + "learning_rate": 8.095469746549172e-06, + "loss": 0.4636, + "step": 3317 + }, + { + "epoch": 1.7938367273382592, + "grad_norm": 0.3394835889339447, + "learning_rate": 8.09398686611502e-06, + "loss": 0.4258, + "step": 3318 + }, + { + "epoch": 1.7943773652910435, + "grad_norm": 0.3196641802787781, + "learning_rate": 8.092503544545834e-06, + "loss": 0.3846, + "step": 3319 + }, + { + "epoch": 1.7949180032438277, + "grad_norm": 0.34875744581222534, + "learning_rate": 8.091019782053097e-06, + "loss": 0.4571, + "step": 3320 + }, + { + "epoch": 1.795458641196612, + "grad_norm": 0.34042981266975403, + "learning_rate": 8.089535578848364e-06, + "loss": 0.4589, + "step": 3321 + }, + { + "epoch": 1.7959992791493962, + "grad_norm": 0.32495275139808655, + "learning_rate": 8.088050935143252e-06, + "loss": 0.4319, + "step": 3322 + }, + { + "epoch": 1.7965399171021805, + "grad_norm": 0.33212369680404663, + "learning_rate": 8.086565851149435e-06, + "loss": 0.419, + "step": 3323 + }, + { + "epoch": 1.797080555054965, + "grad_norm": 0.30814874172210693, + "learning_rate": 8.085080327078656e-06, + "loss": 0.4376, + "step": 3324 + }, + { + "epoch": 1.7976211930077493, + "grad_norm": 0.35443365573883057, + "learning_rate": 8.083594363142717e-06, + "loss": 0.4306, + "step": 3325 + }, + { + "epoch": 1.7981618309605334, + "grad_norm": 0.33060944080352783, + "learning_rate": 8.082107959553484e-06, + "loss": 0.4332, + "step": 3326 + }, + { + "epoch": 1.7987024689133178, + "grad_norm": 0.3036220967769623, + "learning_rate": 8.080621116522886e-06, + "loss": 0.4198, + "step": 3327 + }, + { + "epoch": 1.799243106866102, + "grad_norm": 0.3334490954875946, + "learning_rate": 8.079133834262916e-06, + "loss": 0.4342, + "step": 3328 + }, + { + "epoch": 1.7997837448188863, + "grad_norm": 0.3297540843486786, + "learning_rate": 8.077646112985626e-06, + "loss": 0.4735, + "step": 3329 + }, + { + "epoch": 1.8003243827716706, + "grad_norm": 0.32654431462287903, + "learning_rate": 8.076157952903134e-06, + "loss": 0.3758, + "step": 3330 + }, + { + "epoch": 1.800865020724455, + "grad_norm": 0.3422190845012665, + "learning_rate": 8.07466935422762e-06, + "loss": 0.4157, + "step": 3331 + }, + { + "epoch": 1.8014056586772391, + "grad_norm": 0.34716933965682983, + "learning_rate": 8.073180317171322e-06, + "loss": 0.5095, + "step": 3332 + }, + { + "epoch": 1.8019462966300235, + "grad_norm": 0.31180334091186523, + "learning_rate": 8.071690841946547e-06, + "loss": 0.3699, + "step": 3333 + }, + { + "epoch": 1.8024869345828076, + "grad_norm": 0.3031768500804901, + "learning_rate": 8.070200928765661e-06, + "loss": 0.4465, + "step": 3334 + }, + { + "epoch": 1.803027572535592, + "grad_norm": 0.31092366576194763, + "learning_rate": 8.068710577841093e-06, + "loss": 0.4141, + "step": 3335 + }, + { + "epoch": 1.8035682104883763, + "grad_norm": 0.3233906030654907, + "learning_rate": 8.067219789385335e-06, + "loss": 0.4207, + "step": 3336 + }, + { + "epoch": 1.8041088484411607, + "grad_norm": 0.3430812954902649, + "learning_rate": 8.06572856361094e-06, + "loss": 0.4896, + "step": 3337 + }, + { + "epoch": 1.8046494863939448, + "grad_norm": 0.2886134684085846, + "learning_rate": 8.064236900730526e-06, + "loss": 0.4108, + "step": 3338 + }, + { + "epoch": 1.805190124346729, + "grad_norm": 0.3320710062980652, + "learning_rate": 8.06274480095677e-06, + "loss": 0.3786, + "step": 3339 + }, + { + "epoch": 1.8057307622995133, + "grad_norm": 0.34547311067581177, + "learning_rate": 8.061252264502415e-06, + "loss": 0.4575, + "step": 3340 + }, + { + "epoch": 1.8062714002522977, + "grad_norm": 0.32114607095718384, + "learning_rate": 8.05975929158026e-06, + "loss": 0.4365, + "step": 3341 + }, + { + "epoch": 1.806812038205082, + "grad_norm": 0.3137049973011017, + "learning_rate": 8.058265882403174e-06, + "loss": 0.4302, + "step": 3342 + }, + { + "epoch": 1.8073526761578664, + "grad_norm": 0.33785489201545715, + "learning_rate": 8.056772037184083e-06, + "loss": 0.4535, + "step": 3343 + }, + { + "epoch": 1.8078933141106506, + "grad_norm": 0.3604099452495575, + "learning_rate": 8.055277756135978e-06, + "loss": 0.4406, + "step": 3344 + }, + { + "epoch": 1.8084339520634347, + "grad_norm": 0.35606586933135986, + "learning_rate": 8.053783039471909e-06, + "loss": 0.455, + "step": 3345 + }, + { + "epoch": 1.808974590016219, + "grad_norm": 0.3593486547470093, + "learning_rate": 8.052287887404992e-06, + "loss": 0.4388, + "step": 3346 + }, + { + "epoch": 1.8095152279690034, + "grad_norm": 0.3589855134487152, + "learning_rate": 8.050792300148402e-06, + "loss": 0.439, + "step": 3347 + }, + { + "epoch": 1.8100558659217878, + "grad_norm": 0.3543134033679962, + "learning_rate": 8.049296277915378e-06, + "loss": 0.4468, + "step": 3348 + }, + { + "epoch": 1.8105965038745722, + "grad_norm": 0.40692999958992004, + "learning_rate": 8.047799820919218e-06, + "loss": 0.4285, + "step": 3349 + }, + { + "epoch": 1.8111371418273563, + "grad_norm": 0.35559090971946716, + "learning_rate": 8.046302929373286e-06, + "loss": 0.4082, + "step": 3350 + }, + { + "epoch": 1.8116777797801404, + "grad_norm": 0.40692654252052307, + "learning_rate": 8.044805603491005e-06, + "loss": 0.4688, + "step": 3351 + }, + { + "epoch": 1.8122184177329248, + "grad_norm": 0.36968672275543213, + "learning_rate": 8.043307843485863e-06, + "loss": 0.4103, + "step": 3352 + }, + { + "epoch": 1.8127590556857092, + "grad_norm": 0.4615669548511505, + "learning_rate": 8.041809649571406e-06, + "loss": 0.4533, + "step": 3353 + }, + { + "epoch": 1.8132996936384935, + "grad_norm": 0.3097493052482605, + "learning_rate": 8.040311021961245e-06, + "loss": 0.4162, + "step": 3354 + }, + { + "epoch": 1.8138403315912777, + "grad_norm": 0.374896764755249, + "learning_rate": 8.038811960869051e-06, + "loss": 0.3935, + "step": 3355 + }, + { + "epoch": 1.814380969544062, + "grad_norm": 0.3678896129131317, + "learning_rate": 8.037312466508555e-06, + "loss": 0.4263, + "step": 3356 + }, + { + "epoch": 1.8149216074968462, + "grad_norm": 0.3088608682155609, + "learning_rate": 8.035812539093557e-06, + "loss": 0.4408, + "step": 3357 + }, + { + "epoch": 1.8154622454496305, + "grad_norm": 0.3997863233089447, + "learning_rate": 8.034312178837911e-06, + "loss": 0.4149, + "step": 3358 + }, + { + "epoch": 1.816002883402415, + "grad_norm": 0.3411961793899536, + "learning_rate": 8.032811385955535e-06, + "loss": 0.4484, + "step": 3359 + }, + { + "epoch": 1.8165435213551993, + "grad_norm": 0.3319053649902344, + "learning_rate": 8.031310160660411e-06, + "loss": 0.4202, + "step": 3360 + }, + { + "epoch": 1.8170841593079834, + "grad_norm": 0.36736491322517395, + "learning_rate": 8.02980850316658e-06, + "loss": 0.4341, + "step": 3361 + }, + { + "epoch": 1.8176247972607678, + "grad_norm": 0.34897086024284363, + "learning_rate": 8.028306413688147e-06, + "loss": 0.4524, + "step": 3362 + }, + { + "epoch": 1.818165435213552, + "grad_norm": 0.36966174840927124, + "learning_rate": 8.026803892439276e-06, + "loss": 0.4693, + "step": 3363 + }, + { + "epoch": 1.8187060731663363, + "grad_norm": 0.32758980989456177, + "learning_rate": 8.025300939634193e-06, + "loss": 0.3934, + "step": 3364 + }, + { + "epoch": 1.8192467111191206, + "grad_norm": 0.31811997294425964, + "learning_rate": 8.023797555487188e-06, + "loss": 0.4073, + "step": 3365 + }, + { + "epoch": 1.819787349071905, + "grad_norm": 0.3481244742870331, + "learning_rate": 8.02229374021261e-06, + "loss": 0.4686, + "step": 3366 + }, + { + "epoch": 1.8203279870246891, + "grad_norm": 0.2885033190250397, + "learning_rate": 8.02078949402487e-06, + "loss": 0.3879, + "step": 3367 + }, + { + "epoch": 1.8208686249774733, + "grad_norm": 0.33795928955078125, + "learning_rate": 8.019284817138442e-06, + "loss": 0.4468, + "step": 3368 + }, + { + "epoch": 1.8214092629302576, + "grad_norm": 0.35017502307891846, + "learning_rate": 8.017779709767857e-06, + "loss": 0.403, + "step": 3369 + }, + { + "epoch": 1.821949900883042, + "grad_norm": 0.32768213748931885, + "learning_rate": 8.016274172127715e-06, + "loss": 0.4891, + "step": 3370 + }, + { + "epoch": 1.8224905388358263, + "grad_norm": 0.37308111786842346, + "learning_rate": 8.01476820443267e-06, + "loss": 0.4717, + "step": 3371 + }, + { + "epoch": 1.8230311767886107, + "grad_norm": 0.3486446440219879, + "learning_rate": 8.01326180689744e-06, + "loss": 0.3905, + "step": 3372 + }, + { + "epoch": 1.8235718147413948, + "grad_norm": 0.3119492828845978, + "learning_rate": 8.011754979736804e-06, + "loss": 0.4412, + "step": 3373 + }, + { + "epoch": 1.824112452694179, + "grad_norm": 0.41840484738349915, + "learning_rate": 8.010247723165604e-06, + "loss": 0.4477, + "step": 3374 + }, + { + "epoch": 1.8246530906469633, + "grad_norm": 0.34604212641716003, + "learning_rate": 8.008740037398742e-06, + "loss": 0.4532, + "step": 3375 + }, + { + "epoch": 1.8251937285997477, + "grad_norm": 0.3138395845890045, + "learning_rate": 8.00723192265118e-06, + "loss": 0.421, + "step": 3376 + }, + { + "epoch": 1.825734366552532, + "grad_norm": 0.3667658865451813, + "learning_rate": 8.005723379137944e-06, + "loss": 0.4477, + "step": 3377 + }, + { + "epoch": 1.8262750045053164, + "grad_norm": 0.3710743486881256, + "learning_rate": 8.004214407074118e-06, + "loss": 0.4334, + "step": 3378 + }, + { + "epoch": 1.8268156424581006, + "grad_norm": 0.35465577244758606, + "learning_rate": 8.002705006674849e-06, + "loss": 0.4332, + "step": 3379 + }, + { + "epoch": 1.8273562804108847, + "grad_norm": 0.3599276542663574, + "learning_rate": 8.001195178155344e-06, + "loss": 0.4571, + "step": 3380 + }, + { + "epoch": 1.827896918363669, + "grad_norm": 0.35257986187934875, + "learning_rate": 7.999684921730872e-06, + "loss": 0.4335, + "step": 3381 + }, + { + "epoch": 1.8284375563164534, + "grad_norm": 0.3256742060184479, + "learning_rate": 7.998174237616763e-06, + "loss": 0.4154, + "step": 3382 + }, + { + "epoch": 1.8289781942692378, + "grad_norm": 0.3676372468471527, + "learning_rate": 7.996663126028406e-06, + "loss": 0.4634, + "step": 3383 + }, + { + "epoch": 1.829518832222022, + "grad_norm": 0.2912156581878662, + "learning_rate": 7.995151587181256e-06, + "loss": 0.3677, + "step": 3384 + }, + { + "epoch": 1.8300594701748063, + "grad_norm": 0.31365203857421875, + "learning_rate": 7.99363962129082e-06, + "loss": 0.4162, + "step": 3385 + }, + { + "epoch": 1.8306001081275904, + "grad_norm": 0.3299397826194763, + "learning_rate": 7.992127228572677e-06, + "loss": 0.4345, + "step": 3386 + }, + { + "epoch": 1.8311407460803748, + "grad_norm": 0.33139047026634216, + "learning_rate": 7.990614409242458e-06, + "loss": 0.4299, + "step": 3387 + }, + { + "epoch": 1.8316813840331592, + "grad_norm": 0.3328225314617157, + "learning_rate": 7.98910116351586e-06, + "loss": 0.4504, + "step": 3388 + }, + { + "epoch": 1.8322220219859435, + "grad_norm": 0.3534029424190521, + "learning_rate": 7.987587491608636e-06, + "loss": 0.4632, + "step": 3389 + }, + { + "epoch": 1.8327626599387277, + "grad_norm": 0.3083217144012451, + "learning_rate": 7.986073393736607e-06, + "loss": 0.4132, + "step": 3390 + }, + { + "epoch": 1.833303297891512, + "grad_norm": 0.3547525703907013, + "learning_rate": 7.984558870115645e-06, + "loss": 0.4629, + "step": 3391 + }, + { + "epoch": 1.8338439358442962, + "grad_norm": 0.3502817749977112, + "learning_rate": 7.983043920961692e-06, + "loss": 0.4646, + "step": 3392 + }, + { + "epoch": 1.8343845737970805, + "grad_norm": 0.33607080578804016, + "learning_rate": 7.981528546490744e-06, + "loss": 0.4354, + "step": 3393 + }, + { + "epoch": 1.834925211749865, + "grad_norm": 0.33097922801971436, + "learning_rate": 7.980012746918863e-06, + "loss": 0.4483, + "step": 3394 + }, + { + "epoch": 1.8354658497026493, + "grad_norm": 0.3429686725139618, + "learning_rate": 7.978496522462167e-06, + "loss": 0.4176, + "step": 3395 + }, + { + "epoch": 1.8360064876554334, + "grad_norm": 0.34447550773620605, + "learning_rate": 7.976979873336838e-06, + "loss": 0.4116, + "step": 3396 + }, + { + "epoch": 1.8365471256082175, + "grad_norm": 0.30354902148246765, + "learning_rate": 7.975462799759115e-06, + "loss": 0.4629, + "step": 3397 + }, + { + "epoch": 1.837087763561002, + "grad_norm": 0.318607360124588, + "learning_rate": 7.973945301945302e-06, + "loss": 0.4459, + "step": 3398 + }, + { + "epoch": 1.8376284015137863, + "grad_norm": 0.35816827416419983, + "learning_rate": 7.97242738011176e-06, + "loss": 0.4536, + "step": 3399 + }, + { + "epoch": 1.8381690394665706, + "grad_norm": 0.31781914830207825, + "learning_rate": 7.97090903447491e-06, + "loss": 0.4347, + "step": 3400 + }, + { + "epoch": 1.838709677419355, + "grad_norm": 0.34621742367744446, + "learning_rate": 7.969390265251238e-06, + "loss": 0.4336, + "step": 3401 + }, + { + "epoch": 1.8392503153721391, + "grad_norm": 0.36284780502319336, + "learning_rate": 7.967871072657285e-06, + "loss": 0.4342, + "step": 3402 + }, + { + "epoch": 1.8397909533249233, + "grad_norm": 0.3561781346797943, + "learning_rate": 7.966351456909656e-06, + "loss": 0.4454, + "step": 3403 + }, + { + "epoch": 1.8403315912777076, + "grad_norm": 0.3318120241165161, + "learning_rate": 7.964831418225015e-06, + "loss": 0.4255, + "step": 3404 + }, + { + "epoch": 1.840872229230492, + "grad_norm": 0.376350998878479, + "learning_rate": 7.963310956820085e-06, + "loss": 0.434, + "step": 3405 + }, + { + "epoch": 1.8414128671832763, + "grad_norm": 0.3097081184387207, + "learning_rate": 7.96179007291165e-06, + "loss": 0.4006, + "step": 3406 + }, + { + "epoch": 1.8419535051360607, + "grad_norm": 0.4447126090526581, + "learning_rate": 7.960268766716561e-06, + "loss": 0.4955, + "step": 3407 + }, + { + "epoch": 1.8424941430888448, + "grad_norm": 0.3248273432254791, + "learning_rate": 7.958747038451715e-06, + "loss": 0.4237, + "step": 3408 + }, + { + "epoch": 1.843034781041629, + "grad_norm": 0.3516506552696228, + "learning_rate": 7.957224888334084e-06, + "loss": 0.4339, + "step": 3409 + }, + { + "epoch": 1.8435754189944134, + "grad_norm": 0.34608927369117737, + "learning_rate": 7.955702316580686e-06, + "loss": 0.3852, + "step": 3410 + }, + { + "epoch": 1.8441160569471977, + "grad_norm": 0.37326952815055847, + "learning_rate": 7.954179323408613e-06, + "loss": 0.4479, + "step": 3411 + }, + { + "epoch": 1.844656694899982, + "grad_norm": 0.36792048811912537, + "learning_rate": 7.952655909035008e-06, + "loss": 0.4638, + "step": 3412 + }, + { + "epoch": 1.8451973328527662, + "grad_norm": 0.32509738206863403, + "learning_rate": 7.951132073677077e-06, + "loss": 0.4225, + "step": 3413 + }, + { + "epoch": 1.8457379708055506, + "grad_norm": 0.3608253002166748, + "learning_rate": 7.949607817552086e-06, + "loss": 0.4148, + "step": 3414 + }, + { + "epoch": 1.8462786087583347, + "grad_norm": 0.38289326429367065, + "learning_rate": 7.94808314087736e-06, + "loss": 0.4439, + "step": 3415 + }, + { + "epoch": 1.846819246711119, + "grad_norm": 0.35737040638923645, + "learning_rate": 7.946558043870286e-06, + "loss": 0.4638, + "step": 3416 + }, + { + "epoch": 1.8473598846639034, + "grad_norm": 0.32094112038612366, + "learning_rate": 7.945032526748308e-06, + "loss": 0.4111, + "step": 3417 + }, + { + "epoch": 1.8479005226166878, + "grad_norm": 0.3625240921974182, + "learning_rate": 7.943506589728931e-06, + "loss": 0.4163, + "step": 3418 + }, + { + "epoch": 1.848441160569472, + "grad_norm": 0.346235990524292, + "learning_rate": 7.941980233029723e-06, + "loss": 0.4296, + "step": 3419 + }, + { + "epoch": 1.8489817985222563, + "grad_norm": 0.31741318106651306, + "learning_rate": 7.940453456868304e-06, + "loss": 0.4189, + "step": 3420 + }, + { + "epoch": 1.8495224364750404, + "grad_norm": 0.39256808161735535, + "learning_rate": 7.938926261462366e-06, + "loss": 0.4774, + "step": 3421 + }, + { + "epoch": 1.8500630744278248, + "grad_norm": 0.39664962887763977, + "learning_rate": 7.93739864702965e-06, + "loss": 0.4316, + "step": 3422 + }, + { + "epoch": 1.8506037123806092, + "grad_norm": 0.328249990940094, + "learning_rate": 7.93587061378796e-06, + "loss": 0.4067, + "step": 3423 + }, + { + "epoch": 1.8511443503333935, + "grad_norm": 0.4215303063392639, + "learning_rate": 7.93434216195516e-06, + "loss": 0.4366, + "step": 3424 + }, + { + "epoch": 1.8516849882861777, + "grad_norm": 0.3874909281730652, + "learning_rate": 7.932813291749177e-06, + "loss": 0.4325, + "step": 3425 + }, + { + "epoch": 1.8522256262389618, + "grad_norm": 0.30493733286857605, + "learning_rate": 7.93128400338799e-06, + "loss": 0.388, + "step": 3426 + }, + { + "epoch": 1.8527662641917462, + "grad_norm": 0.4207254648208618, + "learning_rate": 7.929754297089646e-06, + "loss": 0.4686, + "step": 3427 + }, + { + "epoch": 1.8533069021445305, + "grad_norm": 0.3271644115447998, + "learning_rate": 7.928224173072247e-06, + "loss": 0.4329, + "step": 3428 + }, + { + "epoch": 1.853847540097315, + "grad_norm": 0.3401440978050232, + "learning_rate": 7.926693631553955e-06, + "loss": 0.4076, + "step": 3429 + }, + { + "epoch": 1.8543881780500993, + "grad_norm": 0.44994547963142395, + "learning_rate": 7.925162672752989e-06, + "loss": 0.4323, + "step": 3430 + }, + { + "epoch": 1.8549288160028834, + "grad_norm": 0.3586891293525696, + "learning_rate": 7.923631296887634e-06, + "loss": 0.4322, + "step": 3431 + }, + { + "epoch": 1.8554694539556675, + "grad_norm": 0.3234873414039612, + "learning_rate": 7.92209950417623e-06, + "loss": 0.4233, + "step": 3432 + }, + { + "epoch": 1.856010091908452, + "grad_norm": 0.3968302309513092, + "learning_rate": 7.920567294837176e-06, + "loss": 0.4589, + "step": 3433 + }, + { + "epoch": 1.8565507298612363, + "grad_norm": 0.3297528028488159, + "learning_rate": 7.919034669088933e-06, + "loss": 0.4487, + "step": 3434 + }, + { + "epoch": 1.8570913678140206, + "grad_norm": 0.3599737286567688, + "learning_rate": 7.917501627150019e-06, + "loss": 0.4642, + "step": 3435 + }, + { + "epoch": 1.857632005766805, + "grad_norm": 0.3447016477584839, + "learning_rate": 7.915968169239012e-06, + "loss": 0.4301, + "step": 3436 + }, + { + "epoch": 1.8581726437195891, + "grad_norm": 0.3434467315673828, + "learning_rate": 7.914434295574552e-06, + "loss": 0.4359, + "step": 3437 + }, + { + "epoch": 1.8587132816723733, + "grad_norm": 0.3599534034729004, + "learning_rate": 7.912900006375334e-06, + "loss": 0.4322, + "step": 3438 + }, + { + "epoch": 1.8592539196251576, + "grad_norm": 0.3040129244327545, + "learning_rate": 7.911365301860114e-06, + "loss": 0.3793, + "step": 3439 + }, + { + "epoch": 1.859794557577942, + "grad_norm": 0.352145254611969, + "learning_rate": 7.90983018224771e-06, + "loss": 0.4731, + "step": 3440 + }, + { + "epoch": 1.8603351955307263, + "grad_norm": 0.36871862411499023, + "learning_rate": 7.908294647756992e-06, + "loss": 0.4668, + "step": 3441 + }, + { + "epoch": 1.8608758334835105, + "grad_norm": 0.3542006015777588, + "learning_rate": 7.906758698606895e-06, + "loss": 0.4712, + "step": 3442 + }, + { + "epoch": 1.8614164714362949, + "grad_norm": 0.3433796465396881, + "learning_rate": 7.905222335016417e-06, + "loss": 0.3947, + "step": 3443 + }, + { + "epoch": 1.861957109389079, + "grad_norm": 0.3266144394874573, + "learning_rate": 7.903685557204601e-06, + "loss": 0.461, + "step": 3444 + }, + { + "epoch": 1.8624977473418634, + "grad_norm": 0.36006054282188416, + "learning_rate": 7.902148365390567e-06, + "loss": 0.414, + "step": 3445 + }, + { + "epoch": 1.8630383852946477, + "grad_norm": 0.3393409550189972, + "learning_rate": 7.90061075979348e-06, + "loss": 0.4392, + "step": 3446 + }, + { + "epoch": 1.863579023247432, + "grad_norm": 0.30579596757888794, + "learning_rate": 7.89907274063257e-06, + "loss": 0.4311, + "step": 3447 + }, + { + "epoch": 1.8641196612002162, + "grad_norm": 0.37538549304008484, + "learning_rate": 7.897534308127123e-06, + "loss": 0.4638, + "step": 3448 + }, + { + "epoch": 1.8646602991530006, + "grad_norm": 0.3323426842689514, + "learning_rate": 7.895995462496491e-06, + "loss": 0.4363, + "step": 3449 + }, + { + "epoch": 1.8652009371057847, + "grad_norm": 0.30412882566452026, + "learning_rate": 7.894456203960075e-06, + "loss": 0.4247, + "step": 3450 + }, + { + "epoch": 1.865741575058569, + "grad_norm": 0.3254438042640686, + "learning_rate": 7.892916532737343e-06, + "loss": 0.4238, + "step": 3451 + }, + { + "epoch": 1.8662822130113534, + "grad_norm": 0.3321169316768646, + "learning_rate": 7.891376449047813e-06, + "loss": 0.4779, + "step": 3452 + }, + { + "epoch": 1.8668228509641378, + "grad_norm": 0.28796014189720154, + "learning_rate": 7.889835953111075e-06, + "loss": 0.3889, + "step": 3453 + }, + { + "epoch": 1.867363488916922, + "grad_norm": 0.33776959776878357, + "learning_rate": 7.888295045146766e-06, + "loss": 0.4302, + "step": 3454 + }, + { + "epoch": 1.8679041268697063, + "grad_norm": 0.30615368485450745, + "learning_rate": 7.886753725374586e-06, + "loss": 0.4116, + "step": 3455 + }, + { + "epoch": 1.8684447648224904, + "grad_norm": 0.3729323744773865, + "learning_rate": 7.885211994014294e-06, + "loss": 0.4937, + "step": 3456 + }, + { + "epoch": 1.8689854027752748, + "grad_norm": 0.2820781469345093, + "learning_rate": 7.883669851285707e-06, + "loss": 0.3901, + "step": 3457 + }, + { + "epoch": 1.8695260407280592, + "grad_norm": 0.3319065570831299, + "learning_rate": 7.8821272974087e-06, + "loss": 0.4388, + "step": 3458 + }, + { + "epoch": 1.8700666786808435, + "grad_norm": 0.32416149973869324, + "learning_rate": 7.88058433260321e-06, + "loss": 0.4215, + "step": 3459 + }, + { + "epoch": 1.8706073166336277, + "grad_norm": 0.37668269872665405, + "learning_rate": 7.879040957089229e-06, + "loss": 0.4589, + "step": 3460 + }, + { + "epoch": 1.8711479545864118, + "grad_norm": 0.3454728126525879, + "learning_rate": 7.877497171086805e-06, + "loss": 0.4593, + "step": 3461 + }, + { + "epoch": 1.8716885925391962, + "grad_norm": 0.2921670377254486, + "learning_rate": 7.875952974816054e-06, + "loss": 0.4146, + "step": 3462 + }, + { + "epoch": 1.8722292304919805, + "grad_norm": 0.33144211769104004, + "learning_rate": 7.874408368497142e-06, + "loss": 0.4077, + "step": 3463 + }, + { + "epoch": 1.872769868444765, + "grad_norm": 0.3379572033882141, + "learning_rate": 7.872863352350298e-06, + "loss": 0.4074, + "step": 3464 + }, + { + "epoch": 1.8733105063975493, + "grad_norm": 0.34386518597602844, + "learning_rate": 7.871317926595804e-06, + "loss": 0.4467, + "step": 3465 + }, + { + "epoch": 1.8738511443503334, + "grad_norm": 0.35571232438087463, + "learning_rate": 7.869772091454007e-06, + "loss": 0.481, + "step": 3466 + }, + { + "epoch": 1.8743917823031175, + "grad_norm": 0.30590566992759705, + "learning_rate": 7.868225847145308e-06, + "loss": 0.3902, + "step": 3467 + }, + { + "epoch": 1.874932420255902, + "grad_norm": 0.3952900767326355, + "learning_rate": 7.86667919389017e-06, + "loss": 0.4655, + "step": 3468 + }, + { + "epoch": 1.8754730582086863, + "grad_norm": 0.31838780641555786, + "learning_rate": 7.865132131909106e-06, + "loss": 0.4241, + "step": 3469 + }, + { + "epoch": 1.8760136961614706, + "grad_norm": 0.3257013261318207, + "learning_rate": 7.8635846614227e-06, + "loss": 0.4763, + "step": 3470 + }, + { + "epoch": 1.876554334114255, + "grad_norm": 0.35523343086242676, + "learning_rate": 7.862036782651586e-06, + "loss": 0.4286, + "step": 3471 + }, + { + "epoch": 1.8770949720670391, + "grad_norm": 0.3634128272533417, + "learning_rate": 7.860488495816456e-06, + "loss": 0.4624, + "step": 3472 + }, + { + "epoch": 1.8776356100198233, + "grad_norm": 0.3118461072444916, + "learning_rate": 7.858939801138061e-06, + "loss": 0.3846, + "step": 3473 + }, + { + "epoch": 1.8781762479726076, + "grad_norm": 0.39412805438041687, + "learning_rate": 7.857390698837214e-06, + "loss": 0.4756, + "step": 3474 + }, + { + "epoch": 1.878716885925392, + "grad_norm": 0.33649295568466187, + "learning_rate": 7.855841189134784e-06, + "loss": 0.418, + "step": 3475 + }, + { + "epoch": 1.8792575238781764, + "grad_norm": 0.3323207497596741, + "learning_rate": 7.854291272251692e-06, + "loss": 0.3778, + "step": 3476 + }, + { + "epoch": 1.8797981618309605, + "grad_norm": 0.3266110122203827, + "learning_rate": 7.852740948408928e-06, + "loss": 0.4403, + "step": 3477 + }, + { + "epoch": 1.8803387997837449, + "grad_norm": 0.35497087240219116, + "learning_rate": 7.85119021782753e-06, + "loss": 0.4386, + "step": 3478 + }, + { + "epoch": 1.880879437736529, + "grad_norm": 0.3806227445602417, + "learning_rate": 7.849639080728601e-06, + "loss": 0.4426, + "step": 3479 + }, + { + "epoch": 1.8814200756893134, + "grad_norm": 0.33290979266166687, + "learning_rate": 7.848087537333298e-06, + "loss": 0.46, + "step": 3480 + }, + { + "epoch": 1.8819607136420977, + "grad_norm": 0.42978647351264954, + "learning_rate": 7.846535587862838e-06, + "loss": 0.409, + "step": 3481 + }, + { + "epoch": 1.882501351594882, + "grad_norm": 0.33629342913627625, + "learning_rate": 7.844983232538497e-06, + "loss": 0.4345, + "step": 3482 + }, + { + "epoch": 1.8830419895476662, + "grad_norm": 0.36188969016075134, + "learning_rate": 7.843430471581603e-06, + "loss": 0.4136, + "step": 3483 + }, + { + "epoch": 1.8835826275004506, + "grad_norm": 0.34933236241340637, + "learning_rate": 7.841877305213548e-06, + "loss": 0.4289, + "step": 3484 + }, + { + "epoch": 1.8841232654532347, + "grad_norm": 0.3108839988708496, + "learning_rate": 7.84032373365578e-06, + "loss": 0.4197, + "step": 3485 + }, + { + "epoch": 1.884663903406019, + "grad_norm": 0.3623836934566498, + "learning_rate": 7.838769757129804e-06, + "loss": 0.4456, + "step": 3486 + }, + { + "epoch": 1.8852045413588034, + "grad_norm": 0.34876587986946106, + "learning_rate": 7.837215375857182e-06, + "loss": 0.4237, + "step": 3487 + }, + { + "epoch": 1.8857451793115878, + "grad_norm": 0.3331240713596344, + "learning_rate": 7.835660590059537e-06, + "loss": 0.4507, + "step": 3488 + }, + { + "epoch": 1.886285817264372, + "grad_norm": 0.3328709006309509, + "learning_rate": 7.834105399958545e-06, + "loss": 0.4107, + "step": 3489 + }, + { + "epoch": 1.886826455217156, + "grad_norm": 0.29972803592681885, + "learning_rate": 7.832549805775945e-06, + "loss": 0.3918, + "step": 3490 + }, + { + "epoch": 1.8873670931699404, + "grad_norm": 0.3482237458229065, + "learning_rate": 7.83099380773353e-06, + "loss": 0.4382, + "step": 3491 + }, + { + "epoch": 1.8879077311227248, + "grad_norm": 0.3243218660354614, + "learning_rate": 7.829437406053149e-06, + "loss": 0.4319, + "step": 3492 + }, + { + "epoch": 1.8884483690755092, + "grad_norm": 0.3584705591201782, + "learning_rate": 7.827880600956714e-06, + "loss": 0.4782, + "step": 3493 + }, + { + "epoch": 1.8889890070282935, + "grad_norm": 0.30217036604881287, + "learning_rate": 7.82632339266619e-06, + "loss": 0.4364, + "step": 3494 + }, + { + "epoch": 1.8895296449810777, + "grad_norm": 0.3277549147605896, + "learning_rate": 7.8247657814036e-06, + "loss": 0.4096, + "step": 3495 + }, + { + "epoch": 1.8900702829338618, + "grad_norm": 0.32769298553466797, + "learning_rate": 7.823207767391027e-06, + "loss": 0.4361, + "step": 3496 + }, + { + "epoch": 1.8906109208866462, + "grad_norm": 0.2843538820743561, + "learning_rate": 7.82164935085061e-06, + "loss": 0.3908, + "step": 3497 + }, + { + "epoch": 1.8911515588394305, + "grad_norm": 0.35002461075782776, + "learning_rate": 7.820090532004546e-06, + "loss": 0.4466, + "step": 3498 + }, + { + "epoch": 1.891692196792215, + "grad_norm": 0.3506307899951935, + "learning_rate": 7.818531311075084e-06, + "loss": 0.4412, + "step": 3499 + }, + { + "epoch": 1.8922328347449993, + "grad_norm": 0.3184232711791992, + "learning_rate": 7.81697168828454e-06, + "loss": 0.4426, + "step": 3500 + }, + { + "epoch": 1.8927734726977834, + "grad_norm": 0.30824220180511475, + "learning_rate": 7.815411663855279e-06, + "loss": 0.389, + "step": 3501 + }, + { + "epoch": 1.8933141106505675, + "grad_norm": 0.3077496588230133, + "learning_rate": 7.813851238009728e-06, + "loss": 0.435, + "step": 3502 + }, + { + "epoch": 1.893854748603352, + "grad_norm": 0.350620299577713, + "learning_rate": 7.81229041097037e-06, + "loss": 0.4233, + "step": 3503 + }, + { + "epoch": 1.8943953865561363, + "grad_norm": 0.34502118825912476, + "learning_rate": 7.810729182959744e-06, + "loss": 0.4362, + "step": 3504 + }, + { + "epoch": 1.8949360245089206, + "grad_norm": 0.3388058841228485, + "learning_rate": 7.809167554200446e-06, + "loss": 0.4121, + "step": 3505 + }, + { + "epoch": 1.8954766624617048, + "grad_norm": 0.3333636224269867, + "learning_rate": 7.807605524915133e-06, + "loss": 0.4545, + "step": 3506 + }, + { + "epoch": 1.8960173004144891, + "grad_norm": 0.34134441614151, + "learning_rate": 7.806043095326515e-06, + "loss": 0.4315, + "step": 3507 + }, + { + "epoch": 1.8965579383672733, + "grad_norm": 0.3529305160045624, + "learning_rate": 7.804480265657359e-06, + "loss": 0.4034, + "step": 3508 + }, + { + "epoch": 1.8970985763200576, + "grad_norm": 0.3405922055244446, + "learning_rate": 7.802917036130491e-06, + "loss": 0.4338, + "step": 3509 + }, + { + "epoch": 1.897639214272842, + "grad_norm": 0.34475529193878174, + "learning_rate": 7.801353406968795e-06, + "loss": 0.4081, + "step": 3510 + }, + { + "epoch": 1.8981798522256264, + "grad_norm": 0.3404204845428467, + "learning_rate": 7.79978937839521e-06, + "loss": 0.4392, + "step": 3511 + }, + { + "epoch": 1.8987204901784105, + "grad_norm": 0.3219848871231079, + "learning_rate": 7.79822495063273e-06, + "loss": 0.4109, + "step": 3512 + }, + { + "epoch": 1.8992611281311949, + "grad_norm": 0.30592772364616394, + "learning_rate": 7.796660123904412e-06, + "loss": 0.415, + "step": 3513 + }, + { + "epoch": 1.899801766083979, + "grad_norm": 0.2983476221561432, + "learning_rate": 7.795094898433364e-06, + "loss": 0.4261, + "step": 3514 + }, + { + "epoch": 1.9003424040367634, + "grad_norm": 0.3173729479312897, + "learning_rate": 7.793529274442753e-06, + "loss": 0.4306, + "step": 3515 + }, + { + "epoch": 1.9008830419895477, + "grad_norm": 0.27732619643211365, + "learning_rate": 7.791963252155803e-06, + "loss": 0.4165, + "step": 3516 + }, + { + "epoch": 1.901423679942332, + "grad_norm": 0.31612640619277954, + "learning_rate": 7.790396831795792e-06, + "loss": 0.4575, + "step": 3517 + }, + { + "epoch": 1.9019643178951162, + "grad_norm": 0.30507561564445496, + "learning_rate": 7.788830013586064e-06, + "loss": 0.4295, + "step": 3518 + }, + { + "epoch": 1.9025049558479004, + "grad_norm": 0.34407278895378113, + "learning_rate": 7.787262797750006e-06, + "loss": 0.4468, + "step": 3519 + }, + { + "epoch": 1.9030455938006847, + "grad_norm": 0.2905230224132538, + "learning_rate": 7.785695184511074e-06, + "loss": 0.4223, + "step": 3520 + }, + { + "epoch": 1.903586231753469, + "grad_norm": 0.3119387924671173, + "learning_rate": 7.784127174092773e-06, + "loss": 0.4197, + "step": 3521 + }, + { + "epoch": 1.9041268697062534, + "grad_norm": 0.30937427282333374, + "learning_rate": 7.782558766718668e-06, + "loss": 0.4294, + "step": 3522 + }, + { + "epoch": 1.9046675076590378, + "grad_norm": 0.3073981702327728, + "learning_rate": 7.780989962612377e-06, + "loss": 0.408, + "step": 3523 + }, + { + "epoch": 1.905208145611822, + "grad_norm": 0.3197193741798401, + "learning_rate": 7.779420761997582e-06, + "loss": 0.454, + "step": 3524 + }, + { + "epoch": 1.905748783564606, + "grad_norm": 0.3320379853248596, + "learning_rate": 7.777851165098012e-06, + "loss": 0.4208, + "step": 3525 + }, + { + "epoch": 1.9062894215173904, + "grad_norm": 0.3256038725376129, + "learning_rate": 7.77628117213746e-06, + "loss": 0.4337, + "step": 3526 + }, + { + "epoch": 1.9068300594701748, + "grad_norm": 0.3060012459754944, + "learning_rate": 7.774710783339772e-06, + "loss": 0.4515, + "step": 3527 + }, + { + "epoch": 1.9073706974229592, + "grad_norm": 0.33129894733428955, + "learning_rate": 7.773139998928852e-06, + "loss": 0.421, + "step": 3528 + }, + { + "epoch": 1.9079113353757435, + "grad_norm": 0.3311441242694855, + "learning_rate": 7.771568819128659e-06, + "loss": 0.4309, + "step": 3529 + }, + { + "epoch": 1.9084519733285277, + "grad_norm": 0.3432542383670807, + "learning_rate": 7.769997244163209e-06, + "loss": 0.4302, + "step": 3530 + }, + { + "epoch": 1.9089926112813118, + "grad_norm": 0.33644434809684753, + "learning_rate": 7.768425274256575e-06, + "loss": 0.4412, + "step": 3531 + }, + { + "epoch": 1.9095332492340962, + "grad_norm": 0.30734914541244507, + "learning_rate": 7.766852909632882e-06, + "loss": 0.4113, + "step": 3532 + }, + { + "epoch": 1.9100738871868805, + "grad_norm": 0.34845325350761414, + "learning_rate": 7.765280150516322e-06, + "loss": 0.3858, + "step": 3533 + }, + { + "epoch": 1.910614525139665, + "grad_norm": 0.35980260372161865, + "learning_rate": 7.763706997131129e-06, + "loss": 0.4582, + "step": 3534 + }, + { + "epoch": 1.911155163092449, + "grad_norm": 0.32207977771759033, + "learning_rate": 7.762133449701603e-06, + "loss": 0.4366, + "step": 3535 + }, + { + "epoch": 1.9116958010452334, + "grad_norm": 0.32457974553108215, + "learning_rate": 7.760559508452099e-06, + "loss": 0.4374, + "step": 3536 + }, + { + "epoch": 1.9122364389980175, + "grad_norm": 0.33621159195899963, + "learning_rate": 7.758985173607026e-06, + "loss": 0.4739, + "step": 3537 + }, + { + "epoch": 1.912777076950802, + "grad_norm": 0.3402005136013031, + "learning_rate": 7.757410445390847e-06, + "loss": 0.4149, + "step": 3538 + }, + { + "epoch": 1.9133177149035863, + "grad_norm": 0.35434216260910034, + "learning_rate": 7.755835324028089e-06, + "loss": 0.424, + "step": 3539 + }, + { + "epoch": 1.9138583528563706, + "grad_norm": 0.33663997054100037, + "learning_rate": 7.754259809743325e-06, + "loss": 0.4818, + "step": 3540 + }, + { + "epoch": 1.9143989908091548, + "grad_norm": 0.30200737714767456, + "learning_rate": 7.752683902761193e-06, + "loss": 0.3807, + "step": 3541 + }, + { + "epoch": 1.9149396287619391, + "grad_norm": 0.35024553537368774, + "learning_rate": 7.75110760330638e-06, + "loss": 0.4713, + "step": 3542 + }, + { + "epoch": 1.9154802667147233, + "grad_norm": 0.3063545227050781, + "learning_rate": 7.749530911603634e-06, + "loss": 0.4605, + "step": 3543 + }, + { + "epoch": 1.9160209046675076, + "grad_norm": 0.3189338147640228, + "learning_rate": 7.747953827877754e-06, + "loss": 0.4019, + "step": 3544 + }, + { + "epoch": 1.916561542620292, + "grad_norm": 0.313576877117157, + "learning_rate": 7.746376352353599e-06, + "loss": 0.4276, + "step": 3545 + }, + { + "epoch": 1.9171021805730764, + "grad_norm": 0.33931392431259155, + "learning_rate": 7.744798485256085e-06, + "loss": 0.4364, + "step": 3546 + }, + { + "epoch": 1.9176428185258605, + "grad_norm": 0.3177691400051117, + "learning_rate": 7.74322022681018e-06, + "loss": 0.4454, + "step": 3547 + }, + { + "epoch": 1.9181834564786449, + "grad_norm": 0.30451542139053345, + "learning_rate": 7.741641577240908e-06, + "loss": 0.4067, + "step": 3548 + }, + { + "epoch": 1.918724094431429, + "grad_norm": 0.33761847019195557, + "learning_rate": 7.740062536773352e-06, + "loss": 0.4385, + "step": 3549 + }, + { + "epoch": 1.9192647323842134, + "grad_norm": 0.3786282241344452, + "learning_rate": 7.738483105632644e-06, + "loss": 0.4336, + "step": 3550 + }, + { + "epoch": 1.9198053703369977, + "grad_norm": 0.3104192912578583, + "learning_rate": 7.736903284043985e-06, + "loss": 0.4376, + "step": 3551 + }, + { + "epoch": 1.920346008289782, + "grad_norm": 0.39415085315704346, + "learning_rate": 7.735323072232615e-06, + "loss": 0.4964, + "step": 3552 + }, + { + "epoch": 1.9208866462425662, + "grad_norm": 0.32172977924346924, + "learning_rate": 7.73374247042384e-06, + "loss": 0.3928, + "step": 3553 + }, + { + "epoch": 1.9214272841953504, + "grad_norm": 0.32149016857147217, + "learning_rate": 7.732161478843021e-06, + "loss": 0.4396, + "step": 3554 + }, + { + "epoch": 1.9219679221481347, + "grad_norm": 0.32530462741851807, + "learning_rate": 7.730580097715575e-06, + "loss": 0.4396, + "step": 3555 + }, + { + "epoch": 1.922508560100919, + "grad_norm": 0.3648417294025421, + "learning_rate": 7.728998327266966e-06, + "loss": 0.4428, + "step": 3556 + }, + { + "epoch": 1.9230491980537034, + "grad_norm": 0.3257485032081604, + "learning_rate": 7.727416167722724e-06, + "loss": 0.4678, + "step": 3557 + }, + { + "epoch": 1.9235898360064878, + "grad_norm": 0.3238253891468048, + "learning_rate": 7.72583361930843e-06, + "loss": 0.4, + "step": 3558 + }, + { + "epoch": 1.924130473959272, + "grad_norm": 0.37113291025161743, + "learning_rate": 7.724250682249723e-06, + "loss": 0.4275, + "step": 3559 + }, + { + "epoch": 1.924671111912056, + "grad_norm": 0.32570475339889526, + "learning_rate": 7.722667356772291e-06, + "loss": 0.4404, + "step": 3560 + }, + { + "epoch": 1.9252117498648404, + "grad_norm": 0.28276491165161133, + "learning_rate": 7.721083643101883e-06, + "loss": 0.3831, + "step": 3561 + }, + { + "epoch": 1.9257523878176248, + "grad_norm": 0.35973110795021057, + "learning_rate": 7.719499541464304e-06, + "loss": 0.4794, + "step": 3562 + }, + { + "epoch": 1.9262930257704092, + "grad_norm": 0.3179782032966614, + "learning_rate": 7.717915052085412e-06, + "loss": 0.4124, + "step": 3563 + }, + { + "epoch": 1.9268336637231933, + "grad_norm": 0.3386150598526001, + "learning_rate": 7.716330175191118e-06, + "loss": 0.4497, + "step": 3564 + }, + { + "epoch": 1.9273743016759777, + "grad_norm": 0.29650476574897766, + "learning_rate": 7.714744911007395e-06, + "loss": 0.4274, + "step": 3565 + }, + { + "epoch": 1.9279149396287618, + "grad_norm": 0.3160037696361542, + "learning_rate": 7.713159259760262e-06, + "loss": 0.4642, + "step": 3566 + }, + { + "epoch": 1.9284555775815462, + "grad_norm": 0.35160017013549805, + "learning_rate": 7.711573221675804e-06, + "loss": 0.4352, + "step": 3567 + }, + { + "epoch": 1.9289962155343305, + "grad_norm": 0.33104562759399414, + "learning_rate": 7.709986796980148e-06, + "loss": 0.4122, + "step": 3568 + }, + { + "epoch": 1.929536853487115, + "grad_norm": 0.3283877670764923, + "learning_rate": 7.708399985899492e-06, + "loss": 0.431, + "step": 3569 + }, + { + "epoch": 1.930077491439899, + "grad_norm": 0.34296366572380066, + "learning_rate": 7.706812788660075e-06, + "loss": 0.4243, + "step": 3570 + }, + { + "epoch": 1.9306181293926834, + "grad_norm": 0.3678690195083618, + "learning_rate": 7.705225205488201e-06, + "loss": 0.4164, + "step": 3571 + }, + { + "epoch": 1.9311587673454675, + "grad_norm": 0.331254243850708, + "learning_rate": 7.703637236610217e-06, + "loss": 0.4632, + "step": 3572 + }, + { + "epoch": 1.931699405298252, + "grad_norm": 0.3256266415119171, + "learning_rate": 7.702048882252541e-06, + "loss": 0.4003, + "step": 3573 + }, + { + "epoch": 1.9322400432510363, + "grad_norm": 0.3572953939437866, + "learning_rate": 7.700460142641635e-06, + "loss": 0.4536, + "step": 3574 + }, + { + "epoch": 1.9327806812038206, + "grad_norm": 0.36617356538772583, + "learning_rate": 7.698871018004016e-06, + "loss": 0.4225, + "step": 3575 + }, + { + "epoch": 1.9333213191566048, + "grad_norm": 0.3246054947376251, + "learning_rate": 7.697281508566264e-06, + "loss": 0.4174, + "step": 3576 + }, + { + "epoch": 1.9338619571093891, + "grad_norm": 0.3796486258506775, + "learning_rate": 7.695691614555002e-06, + "loss": 0.4441, + "step": 3577 + }, + { + "epoch": 1.9344025950621733, + "grad_norm": 0.4053588807582855, + "learning_rate": 7.694101336196917e-06, + "loss": 0.4574, + "step": 3578 + }, + { + "epoch": 1.9349432330149576, + "grad_norm": 0.31780388951301575, + "learning_rate": 7.69251067371875e-06, + "loss": 0.3953, + "step": 3579 + }, + { + "epoch": 1.935483870967742, + "grad_norm": 0.38607701659202576, + "learning_rate": 7.690919627347292e-06, + "loss": 0.4131, + "step": 3580 + }, + { + "epoch": 1.9360245089205264, + "grad_norm": 0.39074504375457764, + "learning_rate": 7.689328197309394e-06, + "loss": 0.4427, + "step": 3581 + }, + { + "epoch": 1.9365651468733105, + "grad_norm": 0.3454664647579193, + "learning_rate": 7.687736383831956e-06, + "loss": 0.4408, + "step": 3582 + }, + { + "epoch": 1.9371057848260946, + "grad_norm": 0.3508855998516083, + "learning_rate": 7.686144187141938e-06, + "loss": 0.4216, + "step": 3583 + }, + { + "epoch": 1.937646422778879, + "grad_norm": 0.38397687673568726, + "learning_rate": 7.684551607466351e-06, + "loss": 0.4718, + "step": 3584 + }, + { + "epoch": 1.9381870607316634, + "grad_norm": 0.330385684967041, + "learning_rate": 7.682958645032265e-06, + "loss": 0.3939, + "step": 3585 + }, + { + "epoch": 1.9387276986844477, + "grad_norm": 0.3208334445953369, + "learning_rate": 7.681365300066798e-06, + "loss": 0.4651, + "step": 3586 + }, + { + "epoch": 1.939268336637232, + "grad_norm": 0.36974313855171204, + "learning_rate": 7.67977157279713e-06, + "loss": 0.4421, + "step": 3587 + }, + { + "epoch": 1.9398089745900162, + "grad_norm": 0.32609590888023376, + "learning_rate": 7.67817746345049e-06, + "loss": 0.4336, + "step": 3588 + }, + { + "epoch": 1.9403496125428004, + "grad_norm": 0.3425266742706299, + "learning_rate": 7.676582972254162e-06, + "loss": 0.4271, + "step": 3589 + }, + { + "epoch": 1.9408902504955847, + "grad_norm": 0.3551659882068634, + "learning_rate": 7.674988099435487e-06, + "loss": 0.4233, + "step": 3590 + }, + { + "epoch": 1.941430888448369, + "grad_norm": 0.36338621377944946, + "learning_rate": 7.673392845221859e-06, + "loss": 0.4265, + "step": 3591 + }, + { + "epoch": 1.9419715264011534, + "grad_norm": 0.3407050669193268, + "learning_rate": 7.671797209840725e-06, + "loss": 0.4562, + "step": 3592 + }, + { + "epoch": 1.9425121643539378, + "grad_norm": 0.3623160123825073, + "learning_rate": 7.67020119351959e-06, + "loss": 0.431, + "step": 3593 + }, + { + "epoch": 1.943052802306722, + "grad_norm": 0.3682960271835327, + "learning_rate": 7.668604796486013e-06, + "loss": 0.4688, + "step": 3594 + }, + { + "epoch": 1.943593440259506, + "grad_norm": 0.3230957090854645, + "learning_rate": 7.667008018967598e-06, + "loss": 0.405, + "step": 3595 + }, + { + "epoch": 1.9441340782122905, + "grad_norm": 0.34176433086395264, + "learning_rate": 7.665410861192018e-06, + "loss": 0.4731, + "step": 3596 + }, + { + "epoch": 1.9446747161650748, + "grad_norm": 0.35060107707977295, + "learning_rate": 7.663813323386988e-06, + "loss": 0.4242, + "step": 3597 + }, + { + "epoch": 1.9452153541178592, + "grad_norm": 0.37970679998397827, + "learning_rate": 7.662215405780287e-06, + "loss": 0.4504, + "step": 3598 + }, + { + "epoch": 1.9457559920706433, + "grad_norm": 0.3594030737876892, + "learning_rate": 7.66061710859974e-06, + "loss": 0.4249, + "step": 3599 + }, + { + "epoch": 1.9462966300234277, + "grad_norm": 0.3039158582687378, + "learning_rate": 7.65901843207323e-06, + "loss": 0.4038, + "step": 3600 + }, + { + "epoch": 1.9468372679762118, + "grad_norm": 0.3613775968551636, + "learning_rate": 7.65741937642869e-06, + "loss": 0.4651, + "step": 3601 + }, + { + "epoch": 1.9473779059289962, + "grad_norm": 0.340748131275177, + "learning_rate": 7.655819941894116e-06, + "loss": 0.4911, + "step": 3602 + }, + { + "epoch": 1.9479185438817805, + "grad_norm": 0.2853771150112152, + "learning_rate": 7.654220128697547e-06, + "loss": 0.3756, + "step": 3603 + }, + { + "epoch": 1.948459181834565, + "grad_norm": 0.3236043453216553, + "learning_rate": 7.652619937067087e-06, + "loss": 0.4097, + "step": 3604 + }, + { + "epoch": 1.948999819787349, + "grad_norm": 0.36772769689559937, + "learning_rate": 7.651019367230886e-06, + "loss": 0.4804, + "step": 3605 + }, + { + "epoch": 1.9495404577401334, + "grad_norm": 0.31228798627853394, + "learning_rate": 7.64941841941715e-06, + "loss": 0.4372, + "step": 3606 + }, + { + "epoch": 1.9500810956929175, + "grad_norm": 0.27597206830978394, + "learning_rate": 7.64781709385414e-06, + "loss": 0.3691, + "step": 3607 + }, + { + "epoch": 1.950621733645702, + "grad_norm": 0.3294198513031006, + "learning_rate": 7.646215390770167e-06, + "loss": 0.4796, + "step": 3608 + }, + { + "epoch": 1.9511623715984863, + "grad_norm": 0.3063039779663086, + "learning_rate": 7.644613310393604e-06, + "loss": 0.3831, + "step": 3609 + }, + { + "epoch": 1.9517030095512706, + "grad_norm": 0.31528839468955994, + "learning_rate": 7.643010852952871e-06, + "loss": 0.4339, + "step": 3610 + }, + { + "epoch": 1.9522436475040548, + "grad_norm": 0.3063347637653351, + "learning_rate": 7.641408018676439e-06, + "loss": 0.4614, + "step": 3611 + }, + { + "epoch": 1.952784285456839, + "grad_norm": 0.3073441684246063, + "learning_rate": 7.639804807792843e-06, + "loss": 0.4229, + "step": 3612 + }, + { + "epoch": 1.9533249234096233, + "grad_norm": 0.2667381167411804, + "learning_rate": 7.638201220530664e-06, + "loss": 0.4005, + "step": 3613 + }, + { + "epoch": 1.9538655613624076, + "grad_norm": 0.3190470039844513, + "learning_rate": 7.63659725711854e-06, + "loss": 0.4365, + "step": 3614 + }, + { + "epoch": 1.954406199315192, + "grad_norm": 0.29926618933677673, + "learning_rate": 7.634992917785156e-06, + "loss": 0.4455, + "step": 3615 + }, + { + "epoch": 1.9549468372679764, + "grad_norm": 0.29153937101364136, + "learning_rate": 7.633388202759262e-06, + "loss": 0.4145, + "step": 3616 + }, + { + "epoch": 1.9554874752207605, + "grad_norm": 0.35738757252693176, + "learning_rate": 7.63178311226965e-06, + "loss": 0.4494, + "step": 3617 + }, + { + "epoch": 1.9560281131735446, + "grad_norm": 0.32931193709373474, + "learning_rate": 7.630177646545176e-06, + "loss": 0.4564, + "step": 3618 + }, + { + "epoch": 1.956568751126329, + "grad_norm": 0.3191809058189392, + "learning_rate": 7.628571805814742e-06, + "loss": 0.4342, + "step": 3619 + }, + { + "epoch": 1.9571093890791134, + "grad_norm": 0.3088025748729706, + "learning_rate": 7.626965590307305e-06, + "loss": 0.4067, + "step": 3620 + }, + { + "epoch": 1.9576500270318977, + "grad_norm": 0.2964020371437073, + "learning_rate": 7.625359000251875e-06, + "loss": 0.3987, + "step": 3621 + }, + { + "epoch": 1.958190664984682, + "grad_norm": 0.39133739471435547, + "learning_rate": 7.623752035877523e-06, + "loss": 0.4738, + "step": 3622 + }, + { + "epoch": 1.9587313029374662, + "grad_norm": 0.3323116898536682, + "learning_rate": 7.622144697413361e-06, + "loss": 0.4001, + "step": 3623 + }, + { + "epoch": 1.9592719408902504, + "grad_norm": 0.3124770522117615, + "learning_rate": 7.620536985088562e-06, + "loss": 0.4063, + "step": 3624 + }, + { + "epoch": 1.9598125788430347, + "grad_norm": 0.359270304441452, + "learning_rate": 7.6189288991323505e-06, + "loss": 0.4482, + "step": 3625 + }, + { + "epoch": 1.960353216795819, + "grad_norm": 0.3633440136909485, + "learning_rate": 7.617320439774005e-06, + "loss": 0.4319, + "step": 3626 + }, + { + "epoch": 1.9608938547486034, + "grad_norm": 0.3513455390930176, + "learning_rate": 7.615711607242857e-06, + "loss": 0.4525, + "step": 3627 + }, + { + "epoch": 1.9614344927013876, + "grad_norm": 0.2922728359699249, + "learning_rate": 7.614102401768293e-06, + "loss": 0.4126, + "step": 3628 + }, + { + "epoch": 1.961975130654172, + "grad_norm": 0.3656744956970215, + "learning_rate": 7.612492823579744e-06, + "loss": 0.4565, + "step": 3629 + }, + { + "epoch": 1.962515768606956, + "grad_norm": 0.3397183120250702, + "learning_rate": 7.610882872906709e-06, + "loss": 0.393, + "step": 3630 + }, + { + "epoch": 1.9630564065597405, + "grad_norm": 0.41956302523612976, + "learning_rate": 7.609272549978725e-06, + "loss": 0.498, + "step": 3631 + }, + { + "epoch": 1.9635970445125248, + "grad_norm": 0.3319515883922577, + "learning_rate": 7.607661855025393e-06, + "loss": 0.4167, + "step": 3632 + }, + { + "epoch": 1.9641376824653092, + "grad_norm": 0.3784463703632355, + "learning_rate": 7.606050788276361e-06, + "loss": 0.4434, + "step": 3633 + }, + { + "epoch": 1.9646783204180933, + "grad_norm": 0.3500726521015167, + "learning_rate": 7.604439349961335e-06, + "loss": 0.4038, + "step": 3634 + }, + { + "epoch": 1.9652189583708777, + "grad_norm": 0.31805264949798584, + "learning_rate": 7.602827540310065e-06, + "loss": 0.3747, + "step": 3635 + }, + { + "epoch": 1.9657595963236618, + "grad_norm": 0.35021552443504333, + "learning_rate": 7.601215359552365e-06, + "loss": 0.4301, + "step": 3636 + }, + { + "epoch": 1.9663002342764462, + "grad_norm": 0.3575853407382965, + "learning_rate": 7.599602807918096e-06, + "loss": 0.4343, + "step": 3637 + }, + { + "epoch": 1.9668408722292305, + "grad_norm": 0.3714921474456787, + "learning_rate": 7.597989885637172e-06, + "loss": 0.3832, + "step": 3638 + }, + { + "epoch": 1.967381510182015, + "grad_norm": 0.3801950514316559, + "learning_rate": 7.596376592939559e-06, + "loss": 0.4923, + "step": 3639 + }, + { + "epoch": 1.967922148134799, + "grad_norm": 0.3086744248867035, + "learning_rate": 7.594762930055281e-06, + "loss": 0.4363, + "step": 3640 + }, + { + "epoch": 1.9684627860875832, + "grad_norm": 0.37697261571884155, + "learning_rate": 7.593148897214409e-06, + "loss": 0.409, + "step": 3641 + }, + { + "epoch": 1.9690034240403675, + "grad_norm": 0.35460126399993896, + "learning_rate": 7.591534494647066e-06, + "loss": 0.4284, + "step": 3642 + }, + { + "epoch": 1.969544061993152, + "grad_norm": 0.3188256621360779, + "learning_rate": 7.5899197225834364e-06, + "loss": 0.4454, + "step": 3643 + }, + { + "epoch": 1.9700846999459363, + "grad_norm": 0.3555510342121124, + "learning_rate": 7.5883045812537485e-06, + "loss": 0.4528, + "step": 3644 + }, + { + "epoch": 1.9706253378987206, + "grad_norm": 0.38000836968421936, + "learning_rate": 7.586689070888284e-06, + "loss": 0.4331, + "step": 3645 + }, + { + "epoch": 1.9711659758515048, + "grad_norm": 0.2912016808986664, + "learning_rate": 7.585073191717385e-06, + "loss": 0.3979, + "step": 3646 + }, + { + "epoch": 1.971706613804289, + "grad_norm": 0.3479340672492981, + "learning_rate": 7.583456943971435e-06, + "loss": 0.477, + "step": 3647 + }, + { + "epoch": 1.9722472517570733, + "grad_norm": 0.31951528787612915, + "learning_rate": 7.581840327880878e-06, + "loss": 0.4319, + "step": 3648 + }, + { + "epoch": 1.9727878897098576, + "grad_norm": 0.33479586243629456, + "learning_rate": 7.580223343676209e-06, + "loss": 0.4288, + "step": 3649 + }, + { + "epoch": 1.973328527662642, + "grad_norm": 0.33754345774650574, + "learning_rate": 7.578605991587974e-06, + "loss": 0.4253, + "step": 3650 + }, + { + "epoch": 1.9738691656154264, + "grad_norm": 0.3368953466415405, + "learning_rate": 7.576988271846771e-06, + "loss": 0.4239, + "step": 3651 + }, + { + "epoch": 1.9744098035682105, + "grad_norm": 0.4046306908130646, + "learning_rate": 7.575370184683255e-06, + "loss": 0.4576, + "step": 3652 + }, + { + "epoch": 1.9749504415209946, + "grad_norm": 0.3182411789894104, + "learning_rate": 7.573751730328125e-06, + "loss": 0.4007, + "step": 3653 + }, + { + "epoch": 1.975491079473779, + "grad_norm": 0.34206026792526245, + "learning_rate": 7.572132909012139e-06, + "loss": 0.4597, + "step": 3654 + }, + { + "epoch": 1.9760317174265634, + "grad_norm": 0.31874173879623413, + "learning_rate": 7.570513720966108e-06, + "loss": 0.4068, + "step": 3655 + }, + { + "epoch": 1.9765723553793477, + "grad_norm": 0.37682193517684937, + "learning_rate": 7.568894166420892e-06, + "loss": 0.461, + "step": 3656 + }, + { + "epoch": 1.9771129933321319, + "grad_norm": 0.3360116481781006, + "learning_rate": 7.567274245607403e-06, + "loss": 0.3982, + "step": 3657 + }, + { + "epoch": 1.9776536312849162, + "grad_norm": 0.31616827845573425, + "learning_rate": 7.5656539587566066e-06, + "loss": 0.4308, + "step": 3658 + }, + { + "epoch": 1.9781942692377004, + "grad_norm": 0.3326205909252167, + "learning_rate": 7.5640333060995215e-06, + "loss": 0.4388, + "step": 3659 + }, + { + "epoch": 1.9787349071904847, + "grad_norm": 0.30274784564971924, + "learning_rate": 7.562412287867214e-06, + "loss": 0.4266, + "step": 3660 + }, + { + "epoch": 1.979275545143269, + "grad_norm": 0.3377111256122589, + "learning_rate": 7.5607909042908115e-06, + "loss": 0.4476, + "step": 3661 + }, + { + "epoch": 1.9798161830960535, + "grad_norm": 0.31110507249832153, + "learning_rate": 7.559169155601483e-06, + "loss": 0.4095, + "step": 3662 + }, + { + "epoch": 1.9803568210488376, + "grad_norm": 0.310968816280365, + "learning_rate": 7.557547042030458e-06, + "loss": 0.4085, + "step": 3663 + }, + { + "epoch": 1.980897459001622, + "grad_norm": 0.29778537154197693, + "learning_rate": 7.555924563809011e-06, + "loss": 0.4395, + "step": 3664 + }, + { + "epoch": 1.981438096954406, + "grad_norm": 0.3150721788406372, + "learning_rate": 7.5543017211684745e-06, + "loss": 0.485, + "step": 3665 + }, + { + "epoch": 1.9819787349071905, + "grad_norm": 0.3148729205131531, + "learning_rate": 7.552678514340229e-06, + "loss": 0.4126, + "step": 3666 + }, + { + "epoch": 1.9825193728599748, + "grad_norm": 0.37179580330848694, + "learning_rate": 7.551054943555711e-06, + "loss": 0.4871, + "step": 3667 + }, + { + "epoch": 1.9830600108127592, + "grad_norm": 0.32496505975723267, + "learning_rate": 7.549431009046404e-06, + "loss": 0.4354, + "step": 3668 + }, + { + "epoch": 1.9836006487655433, + "grad_norm": 0.31917044520378113, + "learning_rate": 7.547806711043846e-06, + "loss": 0.4393, + "step": 3669 + }, + { + "epoch": 1.9841412867183277, + "grad_norm": 0.3260762393474579, + "learning_rate": 7.5461820497796255e-06, + "loss": 0.429, + "step": 3670 + }, + { + "epoch": 1.9846819246711118, + "grad_norm": 0.33916527032852173, + "learning_rate": 7.544557025485386e-06, + "loss": 0.4321, + "step": 3671 + }, + { + "epoch": 1.9852225626238962, + "grad_norm": 0.34038153290748596, + "learning_rate": 7.542931638392818e-06, + "loss": 0.4241, + "step": 3672 + }, + { + "epoch": 1.9857632005766805, + "grad_norm": 0.3504200577735901, + "learning_rate": 7.54130588873367e-06, + "loss": 0.4447, + "step": 3673 + }, + { + "epoch": 1.986303838529465, + "grad_norm": 0.3235137164592743, + "learning_rate": 7.5396797767397345e-06, + "loss": 0.4268, + "step": 3674 + }, + { + "epoch": 1.986844476482249, + "grad_norm": 0.34521469473838806, + "learning_rate": 7.5380533026428625e-06, + "loss": 0.4137, + "step": 3675 + }, + { + "epoch": 1.9873851144350332, + "grad_norm": 0.38376936316490173, + "learning_rate": 7.536426466674951e-06, + "loss": 0.4826, + "step": 3676 + }, + { + "epoch": 1.9879257523878175, + "grad_norm": 0.30524715781211853, + "learning_rate": 7.534799269067952e-06, + "loss": 0.3893, + "step": 3677 + }, + { + "epoch": 1.988466390340602, + "grad_norm": 0.3902176022529602, + "learning_rate": 7.533171710053871e-06, + "loss": 0.4489, + "step": 3678 + }, + { + "epoch": 1.9890070282933863, + "grad_norm": 0.4094945788383484, + "learning_rate": 7.531543789864759e-06, + "loss": 0.4433, + "step": 3679 + }, + { + "epoch": 1.9895476662461706, + "grad_norm": 0.3604152798652649, + "learning_rate": 7.529915508732725e-06, + "loss": 0.4362, + "step": 3680 + }, + { + "epoch": 1.9900883041989548, + "grad_norm": 0.4172627925872803, + "learning_rate": 7.528286866889924e-06, + "loss": 0.449, + "step": 3681 + }, + { + "epoch": 1.990628942151739, + "grad_norm": 0.3344157636165619, + "learning_rate": 7.526657864568565e-06, + "loss": 0.4196, + "step": 3682 + }, + { + "epoch": 1.9911695801045233, + "grad_norm": 0.3269338011741638, + "learning_rate": 7.52502850200091e-06, + "loss": 0.4154, + "step": 3683 + }, + { + "epoch": 1.9917102180573076, + "grad_norm": 0.36076146364212036, + "learning_rate": 7.5233987794192675e-06, + "loss": 0.4295, + "step": 3684 + }, + { + "epoch": 1.992250856010092, + "grad_norm": 0.3513505160808563, + "learning_rate": 7.521768697056004e-06, + "loss": 0.4206, + "step": 3685 + }, + { + "epoch": 1.9927914939628764, + "grad_norm": 0.3319392502307892, + "learning_rate": 7.520138255143532e-06, + "loss": 0.4619, + "step": 3686 + }, + { + "epoch": 1.9933321319156605, + "grad_norm": 0.3468388617038727, + "learning_rate": 7.518507453914317e-06, + "loss": 0.4198, + "step": 3687 + }, + { + "epoch": 1.9938727698684446, + "grad_norm": 0.3449645936489105, + "learning_rate": 7.5168762936008744e-06, + "loss": 0.4282, + "step": 3688 + }, + { + "epoch": 1.994413407821229, + "grad_norm": 0.33245494961738586, + "learning_rate": 7.515244774435773e-06, + "loss": 0.4228, + "step": 3689 + }, + { + "epoch": 1.9949540457740134, + "grad_norm": 0.36770904064178467, + "learning_rate": 7.513612896651632e-06, + "loss": 0.4486, + "step": 3690 + }, + { + "epoch": 1.9954946837267977, + "grad_norm": 0.31897082924842834, + "learning_rate": 7.511980660481123e-06, + "loss": 0.4252, + "step": 3691 + }, + { + "epoch": 1.9960353216795819, + "grad_norm": 0.3040165603160858, + "learning_rate": 7.510348066156965e-06, + "loss": 0.4024, + "step": 3692 + }, + { + "epoch": 1.9965759596323662, + "grad_norm": 0.4437829256057739, + "learning_rate": 7.508715113911933e-06, + "loss": 0.4148, + "step": 3693 + }, + { + "epoch": 1.9971165975851504, + "grad_norm": 0.3649711608886719, + "learning_rate": 7.5070818039788455e-06, + "loss": 0.4666, + "step": 3694 + }, + { + "epoch": 1.9976572355379347, + "grad_norm": 0.34586653113365173, + "learning_rate": 7.505448136590583e-06, + "loss": 0.4076, + "step": 3695 + }, + { + "epoch": 1.998197873490719, + "grad_norm": 0.3936247229576111, + "learning_rate": 7.5038141119800655e-06, + "loss": 0.4342, + "step": 3696 + }, + { + "epoch": 1.9987385114435035, + "grad_norm": 0.36371323466300964, + "learning_rate": 7.502179730380274e-06, + "loss": 0.4855, + "step": 3697 + }, + { + "epoch": 1.9992791493962876, + "grad_norm": 0.3190724551677704, + "learning_rate": 7.500544992024231e-06, + "loss": 0.3891, + "step": 3698 + }, + { + "epoch": 1.999819787349072, + "grad_norm": 0.4348657429218292, + "learning_rate": 7.498909897145017e-06, + "loss": 0.5114, + "step": 3699 + }, + { + "epoch": 2.000360425301856, + "grad_norm": 0.4390939176082611, + "learning_rate": 7.497274445975762e-06, + "loss": 0.4686, + "step": 3700 + }, + { + "epoch": 2.0009010632546405, + "grad_norm": 0.38029152154922485, + "learning_rate": 7.495638638749645e-06, + "loss": 0.4204, + "step": 3701 + }, + { + "epoch": 2.001441701207425, + "grad_norm": 0.33925145864486694, + "learning_rate": 7.494002475699893e-06, + "loss": 0.4045, + "step": 3702 + }, + { + "epoch": 2.001982339160209, + "grad_norm": 0.35955920815467834, + "learning_rate": 7.492365957059793e-06, + "loss": 0.3878, + "step": 3703 + }, + { + "epoch": 2.0025229771129935, + "grad_norm": 0.35933130979537964, + "learning_rate": 7.490729083062671e-06, + "loss": 0.4152, + "step": 3704 + }, + { + "epoch": 2.0030636150657775, + "grad_norm": 0.3439458906650543, + "learning_rate": 7.489091853941914e-06, + "loss": 0.4004, + "step": 3705 + }, + { + "epoch": 2.003604253018562, + "grad_norm": 0.40505823493003845, + "learning_rate": 7.487454269930953e-06, + "loss": 0.4297, + "step": 3706 + }, + { + "epoch": 2.004144890971346, + "grad_norm": 0.317466139793396, + "learning_rate": 7.485816331263273e-06, + "loss": 0.3772, + "step": 3707 + }, + { + "epoch": 2.0046855289241305, + "grad_norm": 0.37041476368904114, + "learning_rate": 7.484178038172407e-06, + "loss": 0.4075, + "step": 3708 + }, + { + "epoch": 2.005226166876915, + "grad_norm": 0.3804113566875458, + "learning_rate": 7.482539390891941e-06, + "loss": 0.4189, + "step": 3709 + }, + { + "epoch": 2.005766804829699, + "grad_norm": 0.33350056409835815, + "learning_rate": 7.480900389655508e-06, + "loss": 0.3992, + "step": 3710 + }, + { + "epoch": 2.006307442782483, + "grad_norm": 0.37814444303512573, + "learning_rate": 7.479261034696797e-06, + "loss": 0.3577, + "step": 3711 + }, + { + "epoch": 2.0068480807352675, + "grad_norm": 0.35549354553222656, + "learning_rate": 7.4776213262495425e-06, + "loss": 0.4154, + "step": 3712 + }, + { + "epoch": 2.007388718688052, + "grad_norm": 0.30540111660957336, + "learning_rate": 7.475981264547531e-06, + "loss": 0.3675, + "step": 3713 + }, + { + "epoch": 2.0079293566408363, + "grad_norm": 0.33743345737457275, + "learning_rate": 7.474340849824601e-06, + "loss": 0.4002, + "step": 3714 + }, + { + "epoch": 2.0084699945936206, + "grad_norm": 0.3345016539096832, + "learning_rate": 7.4727000823146386e-06, + "loss": 0.4265, + "step": 3715 + }, + { + "epoch": 2.0090106325464046, + "grad_norm": 0.26939669251441956, + "learning_rate": 7.471058962251582e-06, + "loss": 0.3433, + "step": 3716 + }, + { + "epoch": 2.009551270499189, + "grad_norm": 0.3526904284954071, + "learning_rate": 7.4694174898694186e-06, + "loss": 0.4127, + "step": 3717 + }, + { + "epoch": 2.0100919084519733, + "grad_norm": 0.32508018612861633, + "learning_rate": 7.467775665402186e-06, + "loss": 0.3917, + "step": 3718 + }, + { + "epoch": 2.0106325464047576, + "grad_norm": 0.3042299151420593, + "learning_rate": 7.466133489083975e-06, + "loss": 0.3881, + "step": 3719 + }, + { + "epoch": 2.011173184357542, + "grad_norm": 0.288141667842865, + "learning_rate": 7.464490961148921e-06, + "loss": 0.3708, + "step": 3720 + }, + { + "epoch": 2.0117138223103264, + "grad_norm": 0.31178662180900574, + "learning_rate": 7.462848081831214e-06, + "loss": 0.399, + "step": 3721 + }, + { + "epoch": 2.0122544602631103, + "grad_norm": 0.33863207697868347, + "learning_rate": 7.461204851365095e-06, + "loss": 0.4177, + "step": 3722 + }, + { + "epoch": 2.0127950982158946, + "grad_norm": 0.3190074861049652, + "learning_rate": 7.459561269984848e-06, + "loss": 0.408, + "step": 3723 + }, + { + "epoch": 2.013335736168679, + "grad_norm": 0.38593724370002747, + "learning_rate": 7.457917337924817e-06, + "loss": 0.4353, + "step": 3724 + }, + { + "epoch": 2.0138763741214634, + "grad_norm": 0.3085099458694458, + "learning_rate": 7.4562730554193875e-06, + "loss": 0.424, + "step": 3725 + }, + { + "epoch": 2.0144170120742477, + "grad_norm": 0.3138388693332672, + "learning_rate": 7.454628422703e-06, + "loss": 0.3525, + "step": 3726 + }, + { + "epoch": 2.014957650027032, + "grad_norm": 0.37040871381759644, + "learning_rate": 7.452983440010141e-06, + "loss": 0.4242, + "step": 3727 + }, + { + "epoch": 2.015498287979816, + "grad_norm": 0.2976129651069641, + "learning_rate": 7.451338107575351e-06, + "loss": 0.3775, + "step": 3728 + }, + { + "epoch": 2.0160389259326004, + "grad_norm": 0.40502285957336426, + "learning_rate": 7.449692425633219e-06, + "loss": 0.4125, + "step": 3729 + }, + { + "epoch": 2.0165795638853847, + "grad_norm": 0.31863898038864136, + "learning_rate": 7.448046394418383e-06, + "loss": 0.3827, + "step": 3730 + }, + { + "epoch": 2.017120201838169, + "grad_norm": 0.3395459055900574, + "learning_rate": 7.446400014165529e-06, + "loss": 0.4056, + "step": 3731 + }, + { + "epoch": 2.0176608397909535, + "grad_norm": 0.32964348793029785, + "learning_rate": 7.444753285109399e-06, + "loss": 0.3992, + "step": 3732 + }, + { + "epoch": 2.018201477743738, + "grad_norm": 0.33514976501464844, + "learning_rate": 7.443106207484776e-06, + "loss": 0.4061, + "step": 3733 + }, + { + "epoch": 2.0187421156965217, + "grad_norm": 0.33783411979675293, + "learning_rate": 7.4414587815265e-06, + "loss": 0.3938, + "step": 3734 + }, + { + "epoch": 2.019282753649306, + "grad_norm": 0.3240983784198761, + "learning_rate": 7.439811007469457e-06, + "loss": 0.4301, + "step": 3735 + }, + { + "epoch": 2.0198233916020905, + "grad_norm": 0.3201411962509155, + "learning_rate": 7.438162885548585e-06, + "loss": 0.3957, + "step": 3736 + }, + { + "epoch": 2.020364029554875, + "grad_norm": 0.3318484425544739, + "learning_rate": 7.43651441599887e-06, + "loss": 0.4134, + "step": 3737 + }, + { + "epoch": 2.020904667507659, + "grad_norm": 0.3605779707431793, + "learning_rate": 7.434865599055348e-06, + "loss": 0.4148, + "step": 3738 + }, + { + "epoch": 2.0214453054604435, + "grad_norm": 0.3019539415836334, + "learning_rate": 7.433216434953101e-06, + "loss": 0.3711, + "step": 3739 + }, + { + "epoch": 2.0219859434132275, + "grad_norm": 0.3316117525100708, + "learning_rate": 7.431566923927267e-06, + "loss": 0.4522, + "step": 3740 + }, + { + "epoch": 2.022526581366012, + "grad_norm": 0.34275633096694946, + "learning_rate": 7.42991706621303e-06, + "loss": 0.3895, + "step": 3741 + }, + { + "epoch": 2.023067219318796, + "grad_norm": 0.3309865891933441, + "learning_rate": 7.428266862045625e-06, + "loss": 0.412, + "step": 3742 + }, + { + "epoch": 2.0236078572715805, + "grad_norm": 0.29785841703414917, + "learning_rate": 7.426616311660332e-06, + "loss": 0.4133, + "step": 3743 + }, + { + "epoch": 2.024148495224365, + "grad_norm": 0.29913681745529175, + "learning_rate": 7.424965415292487e-06, + "loss": 0.3711, + "step": 3744 + }, + { + "epoch": 2.024689133177149, + "grad_norm": 0.32399049401283264, + "learning_rate": 7.423314173177467e-06, + "loss": 0.4411, + "step": 3745 + }, + { + "epoch": 2.025229771129933, + "grad_norm": 0.29665037989616394, + "learning_rate": 7.421662585550707e-06, + "loss": 0.3682, + "step": 3746 + }, + { + "epoch": 2.0257704090827175, + "grad_norm": 0.350752592086792, + "learning_rate": 7.4200106526476865e-06, + "loss": 0.4197, + "step": 3747 + }, + { + "epoch": 2.026311047035502, + "grad_norm": 0.30756109952926636, + "learning_rate": 7.418358374703936e-06, + "loss": 0.3995, + "step": 3748 + }, + { + "epoch": 2.0268516849882863, + "grad_norm": 0.3451012670993805, + "learning_rate": 7.416705751955031e-06, + "loss": 0.3939, + "step": 3749 + }, + { + "epoch": 2.0273923229410706, + "grad_norm": 0.35276153683662415, + "learning_rate": 7.415052784636603e-06, + "loss": 0.4161, + "step": 3750 + }, + { + "epoch": 2.0279329608938546, + "grad_norm": 0.33562684059143066, + "learning_rate": 7.4133994729843275e-06, + "loss": 0.3954, + "step": 3751 + }, + { + "epoch": 2.028473598846639, + "grad_norm": 0.34942105412483215, + "learning_rate": 7.41174581723393e-06, + "loss": 0.4171, + "step": 3752 + }, + { + "epoch": 2.0290142367994233, + "grad_norm": 0.3535307049751282, + "learning_rate": 7.4100918176211876e-06, + "loss": 0.4104, + "step": 3753 + }, + { + "epoch": 2.0295548747522076, + "grad_norm": 0.32952383160591125, + "learning_rate": 7.408437474381924e-06, + "loss": 0.3924, + "step": 3754 + }, + { + "epoch": 2.030095512704992, + "grad_norm": 0.3535063862800598, + "learning_rate": 7.406782787752011e-06, + "loss": 0.384, + "step": 3755 + }, + { + "epoch": 2.0306361506577764, + "grad_norm": 0.29314327239990234, + "learning_rate": 7.4051277579673746e-06, + "loss": 0.3447, + "step": 3756 + }, + { + "epoch": 2.0311767886105603, + "grad_norm": 0.3491775095462799, + "learning_rate": 7.403472385263979e-06, + "loss": 0.4355, + "step": 3757 + }, + { + "epoch": 2.0317174265633446, + "grad_norm": 0.31308335065841675, + "learning_rate": 7.401816669877852e-06, + "loss": 0.4135, + "step": 3758 + }, + { + "epoch": 2.032258064516129, + "grad_norm": 0.3441132605075836, + "learning_rate": 7.400160612045057e-06, + "loss": 0.4233, + "step": 3759 + }, + { + "epoch": 2.0327987024689134, + "grad_norm": 0.3010082244873047, + "learning_rate": 7.398504212001714e-06, + "loss": 0.374, + "step": 3760 + }, + { + "epoch": 2.0333393404216977, + "grad_norm": 0.36063721776008606, + "learning_rate": 7.39684746998399e-06, + "loss": 0.4582, + "step": 3761 + }, + { + "epoch": 2.033879978374482, + "grad_norm": 0.3262293338775635, + "learning_rate": 7.395190386228098e-06, + "loss": 0.4108, + "step": 3762 + }, + { + "epoch": 2.034420616327266, + "grad_norm": 0.3062325716018677, + "learning_rate": 7.393532960970305e-06, + "loss": 0.3861, + "step": 3763 + }, + { + "epoch": 2.0349612542800504, + "grad_norm": 0.374900221824646, + "learning_rate": 7.391875194446922e-06, + "loss": 0.412, + "step": 3764 + }, + { + "epoch": 2.0355018922328347, + "grad_norm": 0.3617575764656067, + "learning_rate": 7.390217086894309e-06, + "loss": 0.3655, + "step": 3765 + }, + { + "epoch": 2.036042530185619, + "grad_norm": 0.345198392868042, + "learning_rate": 7.38855863854888e-06, + "loss": 0.4332, + "step": 3766 + }, + { + "epoch": 2.0365831681384035, + "grad_norm": 0.3414783775806427, + "learning_rate": 7.386899849647089e-06, + "loss": 0.4083, + "step": 3767 + }, + { + "epoch": 2.037123806091188, + "grad_norm": 0.3830156624317169, + "learning_rate": 7.385240720425446e-06, + "loss": 0.4146, + "step": 3768 + }, + { + "epoch": 2.0376644440439717, + "grad_norm": 0.3343322277069092, + "learning_rate": 7.3835812511205055e-06, + "loss": 0.4011, + "step": 3769 + }, + { + "epoch": 2.038205081996756, + "grad_norm": 0.35519006848335266, + "learning_rate": 7.3819214419688725e-06, + "loss": 0.4243, + "step": 3770 + }, + { + "epoch": 2.0387457199495405, + "grad_norm": 0.3095337748527527, + "learning_rate": 7.380261293207198e-06, + "loss": 0.3586, + "step": 3771 + }, + { + "epoch": 2.039286357902325, + "grad_norm": 0.37620463967323303, + "learning_rate": 7.378600805072186e-06, + "loss": 0.4157, + "step": 3772 + }, + { + "epoch": 2.039826995855109, + "grad_norm": 0.31508979201316833, + "learning_rate": 7.376939977800581e-06, + "loss": 0.3668, + "step": 3773 + }, + { + "epoch": 2.040367633807893, + "grad_norm": 0.3138459324836731, + "learning_rate": 7.375278811629185e-06, + "loss": 0.4474, + "step": 3774 + }, + { + "epoch": 2.0409082717606775, + "grad_norm": 0.302555114030838, + "learning_rate": 7.373617306794844e-06, + "loss": 0.4138, + "step": 3775 + }, + { + "epoch": 2.041448909713462, + "grad_norm": 0.2993220090866089, + "learning_rate": 7.3719554635344505e-06, + "loss": 0.3921, + "step": 3776 + }, + { + "epoch": 2.041989547666246, + "grad_norm": 0.3078453242778778, + "learning_rate": 7.370293282084946e-06, + "loss": 0.419, + "step": 3777 + }, + { + "epoch": 2.0425301856190305, + "grad_norm": 0.2940816581249237, + "learning_rate": 7.368630762683324e-06, + "loss": 0.3867, + "step": 3778 + }, + { + "epoch": 2.043070823571815, + "grad_norm": 0.3160632848739624, + "learning_rate": 7.366967905566622e-06, + "loss": 0.4225, + "step": 3779 + }, + { + "epoch": 2.043611461524599, + "grad_norm": 0.33914080262184143, + "learning_rate": 7.365304710971928e-06, + "loss": 0.4386, + "step": 3780 + }, + { + "epoch": 2.044152099477383, + "grad_norm": 0.29692062735557556, + "learning_rate": 7.363641179136377e-06, + "loss": 0.3939, + "step": 3781 + }, + { + "epoch": 2.0446927374301676, + "grad_norm": 0.33448755741119385, + "learning_rate": 7.361977310297153e-06, + "loss": 0.4324, + "step": 3782 + }, + { + "epoch": 2.045233375382952, + "grad_norm": 0.2931983470916748, + "learning_rate": 7.360313104691485e-06, + "loss": 0.3849, + "step": 3783 + }, + { + "epoch": 2.0457740133357363, + "grad_norm": 0.3646359145641327, + "learning_rate": 7.358648562556656e-06, + "loss": 0.4415, + "step": 3784 + }, + { + "epoch": 2.0463146512885206, + "grad_norm": 0.3032764494419098, + "learning_rate": 7.3569836841299905e-06, + "loss": 0.4167, + "step": 3785 + }, + { + "epoch": 2.0468552892413046, + "grad_norm": 0.2867969274520874, + "learning_rate": 7.3553184696488625e-06, + "loss": 0.3492, + "step": 3786 + }, + { + "epoch": 2.047395927194089, + "grad_norm": 0.3223315477371216, + "learning_rate": 7.3536529193507e-06, + "loss": 0.4154, + "step": 3787 + }, + { + "epoch": 2.0479365651468733, + "grad_norm": 0.3315862715244293, + "learning_rate": 7.351987033472971e-06, + "loss": 0.398, + "step": 3788 + }, + { + "epoch": 2.0484772030996576, + "grad_norm": 0.3055437207221985, + "learning_rate": 7.350320812253196e-06, + "loss": 0.3715, + "step": 3789 + }, + { + "epoch": 2.049017841052442, + "grad_norm": 0.29256078600883484, + "learning_rate": 7.348654255928941e-06, + "loss": 0.381, + "step": 3790 + }, + { + "epoch": 2.0495584790052264, + "grad_norm": 0.29239529371261597, + "learning_rate": 7.346987364737819e-06, + "loss": 0.3993, + "step": 3791 + }, + { + "epoch": 2.0500991169580103, + "grad_norm": 0.3444713056087494, + "learning_rate": 7.345320138917496e-06, + "loss": 0.4172, + "step": 3792 + }, + { + "epoch": 2.0506397549107946, + "grad_norm": 0.32070884108543396, + "learning_rate": 7.343652578705678e-06, + "loss": 0.4356, + "step": 3793 + }, + { + "epoch": 2.051180392863579, + "grad_norm": 0.2924729883670807, + "learning_rate": 7.341984684340125e-06, + "loss": 0.3917, + "step": 3794 + }, + { + "epoch": 2.0517210308163634, + "grad_norm": 0.34132906794548035, + "learning_rate": 7.340316456058644e-06, + "loss": 0.4196, + "step": 3795 + }, + { + "epoch": 2.0522616687691477, + "grad_norm": 0.2765282988548279, + "learning_rate": 7.338647894099085e-06, + "loss": 0.3524, + "step": 3796 + }, + { + "epoch": 2.052802306721932, + "grad_norm": 0.29886043071746826, + "learning_rate": 7.336978998699348e-06, + "loss": 0.3891, + "step": 3797 + }, + { + "epoch": 2.053342944674716, + "grad_norm": 0.3242138624191284, + "learning_rate": 7.335309770097383e-06, + "loss": 0.447, + "step": 3798 + }, + { + "epoch": 2.0538835826275004, + "grad_norm": 0.3068273067474365, + "learning_rate": 7.333640208531187e-06, + "loss": 0.4029, + "step": 3799 + }, + { + "epoch": 2.0544242205802847, + "grad_norm": 0.320049911737442, + "learning_rate": 7.331970314238799e-06, + "loss": 0.3743, + "step": 3800 + }, + { + "epoch": 2.054964858533069, + "grad_norm": 0.33802640438079834, + "learning_rate": 7.330300087458313e-06, + "loss": 0.4385, + "step": 3801 + }, + { + "epoch": 2.0555054964858535, + "grad_norm": 0.32195529341697693, + "learning_rate": 7.328629528427865e-06, + "loss": 0.4199, + "step": 3802 + }, + { + "epoch": 2.0560461344386374, + "grad_norm": 0.30489280819892883, + "learning_rate": 7.3269586373856415e-06, + "loss": 0.4167, + "step": 3803 + }, + { + "epoch": 2.0565867723914217, + "grad_norm": 0.3016655147075653, + "learning_rate": 7.325287414569874e-06, + "loss": 0.4095, + "step": 3804 + }, + { + "epoch": 2.057127410344206, + "grad_norm": 0.31946998834609985, + "learning_rate": 7.323615860218844e-06, + "loss": 0.439, + "step": 3805 + }, + { + "epoch": 2.0576680482969905, + "grad_norm": 0.29036518931388855, + "learning_rate": 7.321943974570876e-06, + "loss": 0.3623, + "step": 3806 + }, + { + "epoch": 2.058208686249775, + "grad_norm": 0.3299275040626526, + "learning_rate": 7.320271757864348e-06, + "loss": 0.4034, + "step": 3807 + }, + { + "epoch": 2.058749324202559, + "grad_norm": 0.31324502825737, + "learning_rate": 7.318599210337678e-06, + "loss": 0.4124, + "step": 3808 + }, + { + "epoch": 2.059289962155343, + "grad_norm": 0.3337649703025818, + "learning_rate": 7.316926332229337e-06, + "loss": 0.4473, + "step": 3809 + }, + { + "epoch": 2.0598306001081275, + "grad_norm": 0.27556607127189636, + "learning_rate": 7.31525312377784e-06, + "loss": 0.3664, + "step": 3810 + }, + { + "epoch": 2.060371238060912, + "grad_norm": 0.3056291937828064, + "learning_rate": 7.313579585221752e-06, + "loss": 0.4191, + "step": 3811 + }, + { + "epoch": 2.060911876013696, + "grad_norm": 0.34928005933761597, + "learning_rate": 7.31190571679968e-06, + "loss": 0.4415, + "step": 3812 + }, + { + "epoch": 2.0614525139664805, + "grad_norm": 0.28535032272338867, + "learning_rate": 7.310231518750284e-06, + "loss": 0.3789, + "step": 3813 + }, + { + "epoch": 2.061993151919265, + "grad_norm": 0.32170960307121277, + "learning_rate": 7.308556991312263e-06, + "loss": 0.392, + "step": 3814 + }, + { + "epoch": 2.062533789872049, + "grad_norm": 0.351576030254364, + "learning_rate": 7.306882134724376e-06, + "loss": 0.4495, + "step": 3815 + }, + { + "epoch": 2.063074427824833, + "grad_norm": 0.31351256370544434, + "learning_rate": 7.3052069492254154e-06, + "loss": 0.4306, + "step": 3816 + }, + { + "epoch": 2.0636150657776176, + "grad_norm": 0.29618924856185913, + "learning_rate": 7.303531435054229e-06, + "loss": 0.3763, + "step": 3817 + }, + { + "epoch": 2.064155703730402, + "grad_norm": 0.33462122082710266, + "learning_rate": 7.301855592449704e-06, + "loss": 0.4162, + "step": 3818 + }, + { + "epoch": 2.0646963416831863, + "grad_norm": 0.3589819669723511, + "learning_rate": 7.3001794216507845e-06, + "loss": 0.4185, + "step": 3819 + }, + { + "epoch": 2.0652369796359706, + "grad_norm": 0.3030599057674408, + "learning_rate": 7.298502922896453e-06, + "loss": 0.4091, + "step": 3820 + }, + { + "epoch": 2.0657776175887546, + "grad_norm": 0.3502945005893707, + "learning_rate": 7.296826096425743e-06, + "loss": 0.4215, + "step": 3821 + }, + { + "epoch": 2.066318255541539, + "grad_norm": 0.3764542043209076, + "learning_rate": 7.295148942477732e-06, + "loss": 0.3956, + "step": 3822 + }, + { + "epoch": 2.0668588934943233, + "grad_norm": 0.2936761975288391, + "learning_rate": 7.293471461291546e-06, + "loss": 0.3914, + "step": 3823 + }, + { + "epoch": 2.0673995314471076, + "grad_norm": 0.36366504430770874, + "learning_rate": 7.291793653106357e-06, + "loss": 0.4001, + "step": 3824 + }, + { + "epoch": 2.067940169399892, + "grad_norm": 0.40998461842536926, + "learning_rate": 7.290115518161385e-06, + "loss": 0.4144, + "step": 3825 + }, + { + "epoch": 2.0684808073526764, + "grad_norm": 0.2950108051300049, + "learning_rate": 7.288437056695894e-06, + "loss": 0.4165, + "step": 3826 + }, + { + "epoch": 2.0690214453054603, + "grad_norm": 0.34533965587615967, + "learning_rate": 7.286758268949198e-06, + "loss": 0.3996, + "step": 3827 + }, + { + "epoch": 2.0695620832582446, + "grad_norm": 0.34263163805007935, + "learning_rate": 7.285079155160652e-06, + "loss": 0.4258, + "step": 3828 + }, + { + "epoch": 2.070102721211029, + "grad_norm": 0.333176851272583, + "learning_rate": 7.283399715569666e-06, + "loss": 0.3754, + "step": 3829 + }, + { + "epoch": 2.0706433591638134, + "grad_norm": 0.3491484224796295, + "learning_rate": 7.281719950415686e-06, + "loss": 0.4263, + "step": 3830 + }, + { + "epoch": 2.0711839971165977, + "grad_norm": 0.31822386384010315, + "learning_rate": 7.280039859938213e-06, + "loss": 0.3965, + "step": 3831 + }, + { + "epoch": 2.0717246350693816, + "grad_norm": 0.3203738033771515, + "learning_rate": 7.27835944437679e-06, + "loss": 0.408, + "step": 3832 + }, + { + "epoch": 2.072265273022166, + "grad_norm": 0.3189562261104584, + "learning_rate": 7.276678703971011e-06, + "loss": 0.4019, + "step": 3833 + }, + { + "epoch": 2.0728059109749504, + "grad_norm": 0.3436689078807831, + "learning_rate": 7.274997638960508e-06, + "loss": 0.381, + "step": 3834 + }, + { + "epoch": 2.0733465489277347, + "grad_norm": 0.33741891384124756, + "learning_rate": 7.273316249584969e-06, + "loss": 0.4172, + "step": 3835 + }, + { + "epoch": 2.073887186880519, + "grad_norm": 0.32150623202323914, + "learning_rate": 7.271634536084118e-06, + "loss": 0.4375, + "step": 3836 + }, + { + "epoch": 2.0744278248333035, + "grad_norm": 0.3251088261604309, + "learning_rate": 7.269952498697734e-06, + "loss": 0.3904, + "step": 3837 + }, + { + "epoch": 2.0749684627860874, + "grad_norm": 0.3126635253429413, + "learning_rate": 7.268270137665639e-06, + "loss": 0.373, + "step": 3838 + }, + { + "epoch": 2.0755091007388717, + "grad_norm": 0.3661896586418152, + "learning_rate": 7.266587453227703e-06, + "loss": 0.4026, + "step": 3839 + }, + { + "epoch": 2.076049738691656, + "grad_norm": 0.3148078918457031, + "learning_rate": 7.2649044456238334e-06, + "loss": 0.3691, + "step": 3840 + }, + { + "epoch": 2.0765903766444405, + "grad_norm": 0.35688409209251404, + "learning_rate": 7.263221115093997e-06, + "loss": 0.4114, + "step": 3841 + }, + { + "epoch": 2.077131014597225, + "grad_norm": 0.371732234954834, + "learning_rate": 7.261537461878196e-06, + "loss": 0.4202, + "step": 3842 + }, + { + "epoch": 2.077671652550009, + "grad_norm": 0.3141860365867615, + "learning_rate": 7.259853486216485e-06, + "loss": 0.3495, + "step": 3843 + }, + { + "epoch": 2.078212290502793, + "grad_norm": 0.3581951856613159, + "learning_rate": 7.2581691883489605e-06, + "loss": 0.4252, + "step": 3844 + }, + { + "epoch": 2.0787529284555775, + "grad_norm": 0.2928514778614044, + "learning_rate": 7.256484568515769e-06, + "loss": 0.3449, + "step": 3845 + }, + { + "epoch": 2.079293566408362, + "grad_norm": 0.419791579246521, + "learning_rate": 7.254799626957098e-06, + "loss": 0.4256, + "step": 3846 + }, + { + "epoch": 2.079834204361146, + "grad_norm": 0.33046984672546387, + "learning_rate": 7.253114363913185e-06, + "loss": 0.4195, + "step": 3847 + }, + { + "epoch": 2.0803748423139306, + "grad_norm": 0.3584929406642914, + "learning_rate": 7.251428779624309e-06, + "loss": 0.4098, + "step": 3848 + }, + { + "epoch": 2.080915480266715, + "grad_norm": 0.31421512365341187, + "learning_rate": 7.249742874330802e-06, + "loss": 0.3536, + "step": 3849 + }, + { + "epoch": 2.081456118219499, + "grad_norm": 0.29763343930244446, + "learning_rate": 7.248056648273034e-06, + "loss": 0.3575, + "step": 3850 + }, + { + "epoch": 2.081996756172283, + "grad_norm": 0.39056849479675293, + "learning_rate": 7.246370101691424e-06, + "loss": 0.4319, + "step": 3851 + }, + { + "epoch": 2.0825373941250676, + "grad_norm": 0.34038469195365906, + "learning_rate": 7.244683234826441e-06, + "loss": 0.4107, + "step": 3852 + }, + { + "epoch": 2.083078032077852, + "grad_norm": 0.3491818308830261, + "learning_rate": 7.242996047918589e-06, + "loss": 0.4068, + "step": 3853 + }, + { + "epoch": 2.0836186700306363, + "grad_norm": 0.37478092312812805, + "learning_rate": 7.241308541208429e-06, + "loss": 0.3975, + "step": 3854 + }, + { + "epoch": 2.0841593079834206, + "grad_norm": 0.33679676055908203, + "learning_rate": 7.239620714936561e-06, + "loss": 0.4176, + "step": 3855 + }, + { + "epoch": 2.0846999459362046, + "grad_norm": 0.3508088290691376, + "learning_rate": 7.237932569343632e-06, + "loss": 0.4104, + "step": 3856 + }, + { + "epoch": 2.085240583888989, + "grad_norm": 0.3401055932044983, + "learning_rate": 7.2362441046703344e-06, + "loss": 0.4029, + "step": 3857 + }, + { + "epoch": 2.0857812218417733, + "grad_norm": 0.38918283581733704, + "learning_rate": 7.2345553211574086e-06, + "loss": 0.373, + "step": 3858 + }, + { + "epoch": 2.0863218597945576, + "grad_norm": 0.321361780166626, + "learning_rate": 7.232866219045634e-06, + "loss": 0.4362, + "step": 3859 + }, + { + "epoch": 2.086862497747342, + "grad_norm": 0.33438366651535034, + "learning_rate": 7.231176798575843e-06, + "loss": 0.4012, + "step": 3860 + }, + { + "epoch": 2.087403135700126, + "grad_norm": 0.3418934941291809, + "learning_rate": 7.22948705998891e-06, + "loss": 0.403, + "step": 3861 + }, + { + "epoch": 2.0879437736529103, + "grad_norm": 0.30169782042503357, + "learning_rate": 7.227797003525755e-06, + "loss": 0.389, + "step": 3862 + }, + { + "epoch": 2.0884844116056946, + "grad_norm": 0.3699062466621399, + "learning_rate": 7.226106629427342e-06, + "loss": 0.4012, + "step": 3863 + }, + { + "epoch": 2.089025049558479, + "grad_norm": 0.34036868810653687, + "learning_rate": 7.2244159379346826e-06, + "loss": 0.4144, + "step": 3864 + }, + { + "epoch": 2.0895656875112634, + "grad_norm": 0.30740082263946533, + "learning_rate": 7.22272492928883e-06, + "loss": 0.4116, + "step": 3865 + }, + { + "epoch": 2.0901063254640477, + "grad_norm": 0.4237309396266937, + "learning_rate": 7.221033603730888e-06, + "loss": 0.4236, + "step": 3866 + }, + { + "epoch": 2.0906469634168316, + "grad_norm": 0.30990955233573914, + "learning_rate": 7.219341961502002e-06, + "loss": 0.4114, + "step": 3867 + }, + { + "epoch": 2.091187601369616, + "grad_norm": 0.3375382423400879, + "learning_rate": 7.217650002843364e-06, + "loss": 0.4131, + "step": 3868 + }, + { + "epoch": 2.0917282393224004, + "grad_norm": 0.32740265130996704, + "learning_rate": 7.215957727996208e-06, + "loss": 0.3892, + "step": 3869 + }, + { + "epoch": 2.0922688772751847, + "grad_norm": 0.30971941351890564, + "learning_rate": 7.214265137201817e-06, + "loss": 0.4316, + "step": 3870 + }, + { + "epoch": 2.092809515227969, + "grad_norm": 0.3367486596107483, + "learning_rate": 7.212572230701517e-06, + "loss": 0.3895, + "step": 3871 + }, + { + "epoch": 2.0933501531807535, + "grad_norm": 0.3166569769382477, + "learning_rate": 7.210879008736681e-06, + "loss": 0.398, + "step": 3872 + }, + { + "epoch": 2.0938907911335374, + "grad_norm": 0.2920517325401306, + "learning_rate": 7.209185471548724e-06, + "loss": 0.3801, + "step": 3873 + }, + { + "epoch": 2.0944314290863217, + "grad_norm": 0.36852407455444336, + "learning_rate": 7.207491619379109e-06, + "loss": 0.4359, + "step": 3874 + }, + { + "epoch": 2.094972067039106, + "grad_norm": 0.3067874610424042, + "learning_rate": 7.205797452469341e-06, + "loss": 0.3864, + "step": 3875 + }, + { + "epoch": 2.0955127049918905, + "grad_norm": 0.2726293206214905, + "learning_rate": 7.204102971060971e-06, + "loss": 0.3885, + "step": 3876 + }, + { + "epoch": 2.096053342944675, + "grad_norm": 0.3440646529197693, + "learning_rate": 7.2024081753955944e-06, + "loss": 0.4552, + "step": 3877 + }, + { + "epoch": 2.096593980897459, + "grad_norm": 0.32205453515052795, + "learning_rate": 7.200713065714856e-06, + "loss": 0.3593, + "step": 3878 + }, + { + "epoch": 2.097134618850243, + "grad_norm": 0.31225115060806274, + "learning_rate": 7.1990176422604375e-06, + "loss": 0.3778, + "step": 3879 + }, + { + "epoch": 2.0976752568030275, + "grad_norm": 0.3493831157684326, + "learning_rate": 7.197321905274071e-06, + "loss": 0.4246, + "step": 3880 + }, + { + "epoch": 2.098215894755812, + "grad_norm": 0.3429039418697357, + "learning_rate": 7.195625854997531e-06, + "loss": 0.4213, + "step": 3881 + }, + { + "epoch": 2.098756532708596, + "grad_norm": 0.3157693147659302, + "learning_rate": 7.1939294916726375e-06, + "loss": 0.3945, + "step": 3882 + }, + { + "epoch": 2.0992971706613806, + "grad_norm": 0.32176530361175537, + "learning_rate": 7.1922328155412545e-06, + "loss": 0.3816, + "step": 3883 + }, + { + "epoch": 2.099837808614165, + "grad_norm": 0.3419540822505951, + "learning_rate": 7.190535826845293e-06, + "loss": 0.4148, + "step": 3884 + }, + { + "epoch": 2.100378446566949, + "grad_norm": 0.3286668360233307, + "learning_rate": 7.188838525826702e-06, + "loss": 0.4135, + "step": 3885 + }, + { + "epoch": 2.100919084519733, + "grad_norm": 0.34139060974121094, + "learning_rate": 7.187140912727486e-06, + "loss": 0.4178, + "step": 3886 + }, + { + "epoch": 2.1014597224725176, + "grad_norm": 0.3621421456336975, + "learning_rate": 7.185442987789683e-06, + "loss": 0.4192, + "step": 3887 + }, + { + "epoch": 2.102000360425302, + "grad_norm": 0.3126319944858551, + "learning_rate": 7.18374475125538e-06, + "loss": 0.3858, + "step": 3888 + }, + { + "epoch": 2.1025409983780863, + "grad_norm": 0.3388730585575104, + "learning_rate": 7.18204620336671e-06, + "loss": 0.4164, + "step": 3889 + }, + { + "epoch": 2.1030816363308706, + "grad_norm": 0.32302772998809814, + "learning_rate": 7.18034734436585e-06, + "loss": 0.4124, + "step": 3890 + }, + { + "epoch": 2.1036222742836546, + "grad_norm": 0.32162100076675415, + "learning_rate": 7.1786481744950186e-06, + "loss": 0.4305, + "step": 3891 + }, + { + "epoch": 2.104162912236439, + "grad_norm": 0.2945658266544342, + "learning_rate": 7.17694869399648e-06, + "loss": 0.3711, + "step": 3892 + }, + { + "epoch": 2.1047035501892233, + "grad_norm": 0.3670465350151062, + "learning_rate": 7.175248903112544e-06, + "loss": 0.4306, + "step": 3893 + }, + { + "epoch": 2.1052441881420076, + "grad_norm": 0.29766687750816345, + "learning_rate": 7.173548802085564e-06, + "loss": 0.3714, + "step": 3894 + }, + { + "epoch": 2.105784826094792, + "grad_norm": 0.320868581533432, + "learning_rate": 7.171848391157935e-06, + "loss": 0.4301, + "step": 3895 + }, + { + "epoch": 2.1063254640475764, + "grad_norm": 0.28945621848106384, + "learning_rate": 7.170147670572102e-06, + "loss": 0.3641, + "step": 3896 + }, + { + "epoch": 2.1068661020003603, + "grad_norm": 0.3298172652721405, + "learning_rate": 7.1684466405705475e-06, + "loss": 0.4205, + "step": 3897 + }, + { + "epoch": 2.1074067399531446, + "grad_norm": 0.318289577960968, + "learning_rate": 7.166745301395804e-06, + "loss": 0.4215, + "step": 3898 + }, + { + "epoch": 2.107947377905929, + "grad_norm": 0.3189803957939148, + "learning_rate": 7.165043653290443e-06, + "loss": 0.3796, + "step": 3899 + }, + { + "epoch": 2.1084880158587134, + "grad_norm": 0.29908958077430725, + "learning_rate": 7.163341696497084e-06, + "loss": 0.3637, + "step": 3900 + }, + { + "epoch": 2.1090286538114977, + "grad_norm": 0.3295332193374634, + "learning_rate": 7.161639431258387e-06, + "loss": 0.4193, + "step": 3901 + }, + { + "epoch": 2.1095692917642817, + "grad_norm": 0.31142479181289673, + "learning_rate": 7.15993685781706e-06, + "loss": 0.3817, + "step": 3902 + }, + { + "epoch": 2.110109929717066, + "grad_norm": 0.33382973074913025, + "learning_rate": 7.158233976415852e-06, + "loss": 0.4067, + "step": 3903 + }, + { + "epoch": 2.1106505676698504, + "grad_norm": 0.3381470739841461, + "learning_rate": 7.1565307872975576e-06, + "loss": 0.4123, + "step": 3904 + }, + { + "epoch": 2.1111912056226347, + "grad_norm": 0.30384355783462524, + "learning_rate": 7.154827290705012e-06, + "loss": 0.3842, + "step": 3905 + }, + { + "epoch": 2.111731843575419, + "grad_norm": 0.30277004837989807, + "learning_rate": 7.1531234868811e-06, + "loss": 0.3578, + "step": 3906 + }, + { + "epoch": 2.1122724815282035, + "grad_norm": 0.390278160572052, + "learning_rate": 7.151419376068743e-06, + "loss": 0.4165, + "step": 3907 + }, + { + "epoch": 2.1128131194809874, + "grad_norm": 0.3517399728298187, + "learning_rate": 7.149714958510914e-06, + "loss": 0.371, + "step": 3908 + }, + { + "epoch": 2.1133537574337717, + "grad_norm": 0.3465571701526642, + "learning_rate": 7.148010234450623e-06, + "loss": 0.3995, + "step": 3909 + }, + { + "epoch": 2.113894395386556, + "grad_norm": 0.4046260714530945, + "learning_rate": 7.146305204130928e-06, + "loss": 0.4302, + "step": 3910 + }, + { + "epoch": 2.1144350333393405, + "grad_norm": 0.3325851559638977, + "learning_rate": 7.144599867794927e-06, + "loss": 0.3922, + "step": 3911 + }, + { + "epoch": 2.114975671292125, + "grad_norm": 0.3606100082397461, + "learning_rate": 7.142894225685767e-06, + "loss": 0.4152, + "step": 3912 + }, + { + "epoch": 2.115516309244909, + "grad_norm": 0.3804704546928406, + "learning_rate": 7.141188278046632e-06, + "loss": 0.3951, + "step": 3913 + }, + { + "epoch": 2.116056947197693, + "grad_norm": 0.39560452103614807, + "learning_rate": 7.139482025120757e-06, + "loss": 0.431, + "step": 3914 + }, + { + "epoch": 2.1165975851504775, + "grad_norm": 0.300865113735199, + "learning_rate": 7.137775467151411e-06, + "loss": 0.3932, + "step": 3915 + }, + { + "epoch": 2.117138223103262, + "grad_norm": 0.3441906273365021, + "learning_rate": 7.136068604381916e-06, + "loss": 0.3983, + "step": 3916 + }, + { + "epoch": 2.117678861056046, + "grad_norm": 0.40447837114334106, + "learning_rate": 7.134361437055633e-06, + "loss": 0.3983, + "step": 3917 + }, + { + "epoch": 2.1182194990088306, + "grad_norm": 0.3152672052383423, + "learning_rate": 7.132653965415965e-06, + "loss": 0.3887, + "step": 3918 + }, + { + "epoch": 2.118760136961615, + "grad_norm": 0.3489867150783539, + "learning_rate": 7.130946189706364e-06, + "loss": 0.434, + "step": 3919 + }, + { + "epoch": 2.119300774914399, + "grad_norm": 0.33735814690589905, + "learning_rate": 7.129238110170315e-06, + "loss": 0.3816, + "step": 3920 + }, + { + "epoch": 2.119841412867183, + "grad_norm": 0.3443851172924042, + "learning_rate": 7.1275297270513614e-06, + "loss": 0.4159, + "step": 3921 + }, + { + "epoch": 2.1203820508199676, + "grad_norm": 0.32266682386398315, + "learning_rate": 7.125821040593073e-06, + "loss": 0.3909, + "step": 3922 + }, + { + "epoch": 2.120922688772752, + "grad_norm": 0.31138888001441956, + "learning_rate": 7.124112051039076e-06, + "loss": 0.371, + "step": 3923 + }, + { + "epoch": 2.1214633267255363, + "grad_norm": 0.3722877502441406, + "learning_rate": 7.122402758633033e-06, + "loss": 0.4169, + "step": 3924 + }, + { + "epoch": 2.1220039646783206, + "grad_norm": 0.3135385811328888, + "learning_rate": 7.120693163618656e-06, + "loss": 0.4054, + "step": 3925 + }, + { + "epoch": 2.1225446026311046, + "grad_norm": 0.3250928521156311, + "learning_rate": 7.118983266239691e-06, + "loss": 0.3783, + "step": 3926 + }, + { + "epoch": 2.123085240583889, + "grad_norm": 0.3110595941543579, + "learning_rate": 7.117273066739934e-06, + "loss": 0.3776, + "step": 3927 + }, + { + "epoch": 2.1236258785366733, + "grad_norm": 0.3043820858001709, + "learning_rate": 7.115562565363221e-06, + "loss": 0.4084, + "step": 3928 + }, + { + "epoch": 2.1241665164894576, + "grad_norm": 0.31215688586235046, + "learning_rate": 7.1138517623534346e-06, + "loss": 0.3875, + "step": 3929 + }, + { + "epoch": 2.124707154442242, + "grad_norm": 0.33761128783226013, + "learning_rate": 7.112140657954495e-06, + "loss": 0.4115, + "step": 3930 + }, + { + "epoch": 2.125247792395026, + "grad_norm": 0.31252968311309814, + "learning_rate": 7.110429252410371e-06, + "loss": 0.4143, + "step": 3931 + }, + { + "epoch": 2.1257884303478103, + "grad_norm": 0.31016919016838074, + "learning_rate": 7.108717545965072e-06, + "loss": 0.4142, + "step": 3932 + }, + { + "epoch": 2.1263290683005946, + "grad_norm": 0.3337792158126831, + "learning_rate": 7.107005538862647e-06, + "loss": 0.4247, + "step": 3933 + }, + { + "epoch": 2.126869706253379, + "grad_norm": 0.2973635792732239, + "learning_rate": 7.105293231347192e-06, + "loss": 0.3987, + "step": 3934 + }, + { + "epoch": 2.1274103442061634, + "grad_norm": 0.30230849981307983, + "learning_rate": 7.103580623662845e-06, + "loss": 0.3848, + "step": 3935 + }, + { + "epoch": 2.1279509821589477, + "grad_norm": 0.31636372208595276, + "learning_rate": 7.101867716053787e-06, + "loss": 0.4301, + "step": 3936 + }, + { + "epoch": 2.1284916201117317, + "grad_norm": 0.30398645997047424, + "learning_rate": 7.100154508764243e-06, + "loss": 0.3712, + "step": 3937 + }, + { + "epoch": 2.129032258064516, + "grad_norm": 0.334203839302063, + "learning_rate": 7.098441002038476e-06, + "loss": 0.4045, + "step": 3938 + }, + { + "epoch": 2.1295728960173004, + "grad_norm": 0.368012934923172, + "learning_rate": 7.096727196120796e-06, + "loss": 0.3982, + "step": 3939 + }, + { + "epoch": 2.1301135339700847, + "grad_norm": 0.35910269618034363, + "learning_rate": 7.0950130912555515e-06, + "loss": 0.4148, + "step": 3940 + }, + { + "epoch": 2.130654171922869, + "grad_norm": 0.3062628507614136, + "learning_rate": 7.093298687687141e-06, + "loss": 0.3762, + "step": 3941 + }, + { + "epoch": 2.1311948098756535, + "grad_norm": 0.3462604880332947, + "learning_rate": 7.091583985659999e-06, + "loss": 0.3984, + "step": 3942 + }, + { + "epoch": 2.1317354478284374, + "grad_norm": 0.3089619278907776, + "learning_rate": 7.089868985418605e-06, + "loss": 0.379, + "step": 3943 + }, + { + "epoch": 2.1322760857812217, + "grad_norm": 0.34678173065185547, + "learning_rate": 7.088153687207479e-06, + "loss": 0.4223, + "step": 3944 + }, + { + "epoch": 2.132816723734006, + "grad_norm": 0.27627551555633545, + "learning_rate": 7.086438091271186e-06, + "loss": 0.3217, + "step": 3945 + }, + { + "epoch": 2.1333573616867905, + "grad_norm": 0.3420870304107666, + "learning_rate": 7.084722197854334e-06, + "loss": 0.3856, + "step": 3946 + }, + { + "epoch": 2.133897999639575, + "grad_norm": 0.3347543478012085, + "learning_rate": 7.08300600720157e-06, + "loss": 0.4363, + "step": 3947 + }, + { + "epoch": 2.134438637592359, + "grad_norm": 0.30943727493286133, + "learning_rate": 7.0812895195575875e-06, + "loss": 0.3733, + "step": 3948 + }, + { + "epoch": 2.134979275545143, + "grad_norm": 0.32437270879745483, + "learning_rate": 7.079572735167119e-06, + "loss": 0.4325, + "step": 3949 + }, + { + "epoch": 2.1355199134979275, + "grad_norm": 0.30971601605415344, + "learning_rate": 7.077855654274939e-06, + "loss": 0.3662, + "step": 3950 + }, + { + "epoch": 2.136060551450712, + "grad_norm": 0.3495504856109619, + "learning_rate": 7.076138277125868e-06, + "loss": 0.4172, + "step": 3951 + }, + { + "epoch": 2.136601189403496, + "grad_norm": 0.28133028745651245, + "learning_rate": 7.0744206039647645e-06, + "loss": 0.3627, + "step": 3952 + }, + { + "epoch": 2.1371418273562806, + "grad_norm": 0.32612937688827515, + "learning_rate": 7.072702635036535e-06, + "loss": 0.4179, + "step": 3953 + }, + { + "epoch": 2.137682465309065, + "grad_norm": 0.304006427526474, + "learning_rate": 7.070984370586119e-06, + "loss": 0.3927, + "step": 3954 + }, + { + "epoch": 2.138223103261849, + "grad_norm": 0.30550140142440796, + "learning_rate": 7.069265810858509e-06, + "loss": 0.3633, + "step": 3955 + }, + { + "epoch": 2.138763741214633, + "grad_norm": 0.3439962565898895, + "learning_rate": 7.0675469560987295e-06, + "loss": 0.4209, + "step": 3956 + }, + { + "epoch": 2.1393043791674176, + "grad_norm": 0.3123915195465088, + "learning_rate": 7.065827806551855e-06, + "loss": 0.3985, + "step": 3957 + }, + { + "epoch": 2.139845017120202, + "grad_norm": 0.3467862904071808, + "learning_rate": 7.064108362462996e-06, + "loss": 0.409, + "step": 3958 + }, + { + "epoch": 2.1403856550729863, + "grad_norm": 0.3011173605918884, + "learning_rate": 7.062388624077311e-06, + "loss": 0.3816, + "step": 3959 + }, + { + "epoch": 2.14092629302577, + "grad_norm": 0.3171491026878357, + "learning_rate": 7.0606685916399945e-06, + "loss": 0.4161, + "step": 3960 + }, + { + "epoch": 2.1414669309785546, + "grad_norm": 0.3087047338485718, + "learning_rate": 7.0589482653962856e-06, + "loss": 0.3937, + "step": 3961 + }, + { + "epoch": 2.142007568931339, + "grad_norm": 0.30052605271339417, + "learning_rate": 7.057227645591467e-06, + "loss": 0.3914, + "step": 3962 + }, + { + "epoch": 2.1425482068841233, + "grad_norm": 0.28875285387039185, + "learning_rate": 7.0555067324708604e-06, + "loss": 0.3875, + "step": 3963 + }, + { + "epoch": 2.1430888448369076, + "grad_norm": 0.34616565704345703, + "learning_rate": 7.05378552627983e-06, + "loss": 0.4161, + "step": 3964 + }, + { + "epoch": 2.143629482789692, + "grad_norm": 0.31041425466537476, + "learning_rate": 7.052064027263785e-06, + "loss": 0.3853, + "step": 3965 + }, + { + "epoch": 2.144170120742476, + "grad_norm": 0.37378445267677307, + "learning_rate": 7.05034223566817e-06, + "loss": 0.4231, + "step": 3966 + }, + { + "epoch": 2.1447107586952603, + "grad_norm": 0.30695459246635437, + "learning_rate": 7.048620151738478e-06, + "loss": 0.3955, + "step": 3967 + }, + { + "epoch": 2.1452513966480447, + "grad_norm": 0.36416110396385193, + "learning_rate": 7.0468977757202375e-06, + "loss": 0.4027, + "step": 3968 + }, + { + "epoch": 2.145792034600829, + "grad_norm": 0.305324524641037, + "learning_rate": 7.045175107859024e-06, + "loss": 0.3895, + "step": 3969 + }, + { + "epoch": 2.1463326725536134, + "grad_norm": 0.32880768179893494, + "learning_rate": 7.043452148400452e-06, + "loss": 0.3969, + "step": 3970 + }, + { + "epoch": 2.1468733105063977, + "grad_norm": 0.3543906807899475, + "learning_rate": 7.041728897590178e-06, + "loss": 0.4065, + "step": 3971 + }, + { + "epoch": 2.1474139484591817, + "grad_norm": 0.3406868577003479, + "learning_rate": 7.040005355673899e-06, + "loss": 0.4306, + "step": 3972 + }, + { + "epoch": 2.147954586411966, + "grad_norm": 0.316324919462204, + "learning_rate": 7.038281522897356e-06, + "loss": 0.365, + "step": 3973 + }, + { + "epoch": 2.1484952243647504, + "grad_norm": 0.29952096939086914, + "learning_rate": 7.036557399506327e-06, + "loss": 0.4209, + "step": 3974 + }, + { + "epoch": 2.1490358623175347, + "grad_norm": 0.3157758116722107, + "learning_rate": 7.034832985746638e-06, + "loss": 0.3852, + "step": 3975 + }, + { + "epoch": 2.149576500270319, + "grad_norm": 0.34011179208755493, + "learning_rate": 7.033108281864152e-06, + "loss": 0.4282, + "step": 3976 + }, + { + "epoch": 2.1501171382231035, + "grad_norm": 0.29371097683906555, + "learning_rate": 7.0313832881047725e-06, + "loss": 0.3558, + "step": 3977 + }, + { + "epoch": 2.1506577761758874, + "grad_norm": 0.35102444887161255, + "learning_rate": 7.029658004714447e-06, + "loss": 0.4375, + "step": 3978 + }, + { + "epoch": 2.1511984141286717, + "grad_norm": 0.30691254138946533, + "learning_rate": 7.027932431939163e-06, + "loss": 0.3707, + "step": 3979 + }, + { + "epoch": 2.151739052081456, + "grad_norm": 0.294600248336792, + "learning_rate": 7.026206570024949e-06, + "loss": 0.3815, + "step": 3980 + }, + { + "epoch": 2.1522796900342405, + "grad_norm": 0.30190160870552063, + "learning_rate": 7.024480419217878e-06, + "loss": 0.3601, + "step": 3981 + }, + { + "epoch": 2.152820327987025, + "grad_norm": 0.31586983799934387, + "learning_rate": 7.022753979764058e-06, + "loss": 0.4305, + "step": 3982 + }, + { + "epoch": 2.153360965939809, + "grad_norm": 0.31739407777786255, + "learning_rate": 7.021027251909643e-06, + "loss": 0.3953, + "step": 3983 + }, + { + "epoch": 2.153901603892593, + "grad_norm": 0.3141368627548218, + "learning_rate": 7.019300235900829e-06, + "loss": 0.4127, + "step": 3984 + }, + { + "epoch": 2.1544422418453775, + "grad_norm": 0.27301716804504395, + "learning_rate": 7.017572931983846e-06, + "loss": 0.3737, + "step": 3985 + }, + { + "epoch": 2.154982879798162, + "grad_norm": 0.3058796226978302, + "learning_rate": 7.015845340404973e-06, + "loss": 0.4071, + "step": 3986 + }, + { + "epoch": 2.155523517750946, + "grad_norm": 0.3368726968765259, + "learning_rate": 7.014117461410526e-06, + "loss": 0.3808, + "step": 3987 + }, + { + "epoch": 2.1560641557037306, + "grad_norm": 0.3034706115722656, + "learning_rate": 7.012389295246865e-06, + "loss": 0.363, + "step": 3988 + }, + { + "epoch": 2.1566047936565145, + "grad_norm": 0.33887624740600586, + "learning_rate": 7.010660842160386e-06, + "loss": 0.4696, + "step": 3989 + }, + { + "epoch": 2.157145431609299, + "grad_norm": 0.29883840680122375, + "learning_rate": 7.00893210239753e-06, + "loss": 0.4038, + "step": 3990 + }, + { + "epoch": 2.157686069562083, + "grad_norm": 0.30450916290283203, + "learning_rate": 7.007203076204776e-06, + "loss": 0.3788, + "step": 3991 + }, + { + "epoch": 2.1582267075148676, + "grad_norm": 0.3299378454685211, + "learning_rate": 7.005473763828647e-06, + "loss": 0.3999, + "step": 3992 + }, + { + "epoch": 2.158767345467652, + "grad_norm": 0.3248005211353302, + "learning_rate": 7.0037441655157045e-06, + "loss": 0.4304, + "step": 3993 + }, + { + "epoch": 2.1593079834204363, + "grad_norm": 0.33100029826164246, + "learning_rate": 7.0020142815125545e-06, + "loss": 0.4253, + "step": 3994 + }, + { + "epoch": 2.15984862137322, + "grad_norm": 0.31740209460258484, + "learning_rate": 7.000284112065836e-06, + "loss": 0.4396, + "step": 3995 + }, + { + "epoch": 2.1603892593260046, + "grad_norm": 0.2936188876628876, + "learning_rate": 6.998553657422236e-06, + "loss": 0.3766, + "step": 3996 + }, + { + "epoch": 2.160929897278789, + "grad_norm": 0.2977309226989746, + "learning_rate": 6.9968229178284775e-06, + "loss": 0.3841, + "step": 3997 + }, + { + "epoch": 2.1614705352315733, + "grad_norm": 0.289065420627594, + "learning_rate": 6.9950918935313305e-06, + "loss": 0.3942, + "step": 3998 + }, + { + "epoch": 2.1620111731843576, + "grad_norm": 0.2957701086997986, + "learning_rate": 6.993360584777597e-06, + "loss": 0.3966, + "step": 3999 + }, + { + "epoch": 2.162551811137142, + "grad_norm": 0.29311293363571167, + "learning_rate": 6.9916289918141265e-06, + "loss": 0.3617, + "step": 4000 + }, + { + "epoch": 2.163092449089926, + "grad_norm": 0.341141015291214, + "learning_rate": 6.989897114887805e-06, + "loss": 0.4497, + "step": 4001 + }, + { + "epoch": 2.1636330870427103, + "grad_norm": 0.2881960868835449, + "learning_rate": 6.98816495424556e-06, + "loss": 0.4157, + "step": 4002 + }, + { + "epoch": 2.1641737249954947, + "grad_norm": 0.2937532961368561, + "learning_rate": 6.986432510134361e-06, + "loss": 0.397, + "step": 4003 + }, + { + "epoch": 2.164714362948279, + "grad_norm": 0.29233914613723755, + "learning_rate": 6.9846997828012174e-06, + "loss": 0.3754, + "step": 4004 + }, + { + "epoch": 2.1652550009010634, + "grad_norm": 0.32360079884529114, + "learning_rate": 6.982966772493176e-06, + "loss": 0.4484, + "step": 4005 + }, + { + "epoch": 2.1657956388538477, + "grad_norm": 0.320137083530426, + "learning_rate": 6.9812334794573285e-06, + "loss": 0.4113, + "step": 4006 + }, + { + "epoch": 2.1663362768066317, + "grad_norm": 0.2784048914909363, + "learning_rate": 6.979499903940803e-06, + "loss": 0.3888, + "step": 4007 + }, + { + "epoch": 2.166876914759416, + "grad_norm": 0.3299471437931061, + "learning_rate": 6.977766046190771e-06, + "loss": 0.4527, + "step": 4008 + }, + { + "epoch": 2.1674175527122004, + "grad_norm": 0.3056192398071289, + "learning_rate": 6.976031906454441e-06, + "loss": 0.387, + "step": 4009 + }, + { + "epoch": 2.1679581906649847, + "grad_norm": 0.31629693508148193, + "learning_rate": 6.974297484979066e-06, + "loss": 0.4065, + "step": 4010 + }, + { + "epoch": 2.168498828617769, + "grad_norm": 0.307719886302948, + "learning_rate": 6.972562782011934e-06, + "loss": 0.4331, + "step": 4011 + }, + { + "epoch": 2.1690394665705535, + "grad_norm": 0.31467607617378235, + "learning_rate": 6.970827797800378e-06, + "loss": 0.403, + "step": 4012 + }, + { + "epoch": 2.1695801045233374, + "grad_norm": 0.2923955023288727, + "learning_rate": 6.969092532591767e-06, + "loss": 0.3769, + "step": 4013 + }, + { + "epoch": 2.1701207424761217, + "grad_norm": 0.31039196252822876, + "learning_rate": 6.967356986633512e-06, + "loss": 0.3884, + "step": 4014 + }, + { + "epoch": 2.170661380428906, + "grad_norm": 0.32309892773628235, + "learning_rate": 6.965621160173066e-06, + "loss": 0.4091, + "step": 4015 + }, + { + "epoch": 2.1712020183816905, + "grad_norm": 0.3064098358154297, + "learning_rate": 6.96388505345792e-06, + "loss": 0.3954, + "step": 4016 + }, + { + "epoch": 2.171742656334475, + "grad_norm": 0.32693055272102356, + "learning_rate": 6.962148666735602e-06, + "loss": 0.4027, + "step": 4017 + }, + { + "epoch": 2.1722832942872587, + "grad_norm": 0.3119713366031647, + "learning_rate": 6.960412000253687e-06, + "loss": 0.4001, + "step": 4018 + }, + { + "epoch": 2.172823932240043, + "grad_norm": 0.31579023599624634, + "learning_rate": 6.95867505425978e-06, + "loss": 0.4049, + "step": 4019 + }, + { + "epoch": 2.1733645701928275, + "grad_norm": 0.32477495074272156, + "learning_rate": 6.9569378290015375e-06, + "loss": 0.3839, + "step": 4020 + }, + { + "epoch": 2.173905208145612, + "grad_norm": 0.29349246621131897, + "learning_rate": 6.9552003247266465e-06, + "loss": 0.3928, + "step": 4021 + }, + { + "epoch": 2.174445846098396, + "grad_norm": 0.30101755261421204, + "learning_rate": 6.95346254168284e-06, + "loss": 0.3904, + "step": 4022 + }, + { + "epoch": 2.1749864840511806, + "grad_norm": 0.33197450637817383, + "learning_rate": 6.951724480117884e-06, + "loss": 0.4102, + "step": 4023 + }, + { + "epoch": 2.1755271220039645, + "grad_norm": 0.342006117105484, + "learning_rate": 6.949986140279592e-06, + "loss": 0.4215, + "step": 4024 + }, + { + "epoch": 2.176067759956749, + "grad_norm": 0.3066186308860779, + "learning_rate": 6.948247522415811e-06, + "loss": 0.3803, + "step": 4025 + }, + { + "epoch": 2.176608397909533, + "grad_norm": 0.36611872911453247, + "learning_rate": 6.94650862677443e-06, + "loss": 0.4349, + "step": 4026 + }, + { + "epoch": 2.1771490358623176, + "grad_norm": 0.3239299952983856, + "learning_rate": 6.944769453603378e-06, + "loss": 0.41, + "step": 4027 + }, + { + "epoch": 2.177689673815102, + "grad_norm": 0.35315537452697754, + "learning_rate": 6.9430300031506244e-06, + "loss": 0.415, + "step": 4028 + }, + { + "epoch": 2.1782303117678863, + "grad_norm": 0.38641583919525146, + "learning_rate": 6.941290275664175e-06, + "loss": 0.3841, + "step": 4029 + }, + { + "epoch": 2.17877094972067, + "grad_norm": 0.3433842957019806, + "learning_rate": 6.939550271392079e-06, + "loss": 0.4018, + "step": 4030 + }, + { + "epoch": 2.1793115876734546, + "grad_norm": 0.35578569769859314, + "learning_rate": 6.937809990582421e-06, + "loss": 0.4095, + "step": 4031 + }, + { + "epoch": 2.179852225626239, + "grad_norm": 0.35455605387687683, + "learning_rate": 6.936069433483329e-06, + "loss": 0.3989, + "step": 4032 + }, + { + "epoch": 2.1803928635790233, + "grad_norm": 0.32006603479385376, + "learning_rate": 6.934328600342966e-06, + "loss": 0.3701, + "step": 4033 + }, + { + "epoch": 2.1809335015318077, + "grad_norm": 0.3517322242259979, + "learning_rate": 6.93258749140954e-06, + "loss": 0.4208, + "step": 4034 + }, + { + "epoch": 2.181474139484592, + "grad_norm": 0.3352504372596741, + "learning_rate": 6.930846106931292e-06, + "loss": 0.4202, + "step": 4035 + }, + { + "epoch": 2.182014777437376, + "grad_norm": 0.3257448971271515, + "learning_rate": 6.929104447156508e-06, + "loss": 0.3979, + "step": 4036 + }, + { + "epoch": 2.1825554153901603, + "grad_norm": 0.3509262800216675, + "learning_rate": 6.9273625123335085e-06, + "loss": 0.4567, + "step": 4037 + }, + { + "epoch": 2.1830960533429447, + "grad_norm": 0.28874605894088745, + "learning_rate": 6.9256203027106585e-06, + "loss": 0.3563, + "step": 4038 + }, + { + "epoch": 2.183636691295729, + "grad_norm": 0.35471436381340027, + "learning_rate": 6.923877818536355e-06, + "loss": 0.4083, + "step": 4039 + }, + { + "epoch": 2.1841773292485134, + "grad_norm": 0.3229536712169647, + "learning_rate": 6.922135060059043e-06, + "loss": 0.4115, + "step": 4040 + }, + { + "epoch": 2.1847179672012977, + "grad_norm": 0.3280383050441742, + "learning_rate": 6.9203920275271965e-06, + "loss": 0.4076, + "step": 4041 + }, + { + "epoch": 2.1852586051540817, + "grad_norm": 0.3501759171485901, + "learning_rate": 6.9186487211893374e-06, + "loss": 0.4334, + "step": 4042 + }, + { + "epoch": 2.185799243106866, + "grad_norm": 0.33414164185523987, + "learning_rate": 6.916905141294023e-06, + "loss": 0.3978, + "step": 4043 + }, + { + "epoch": 2.1863398810596504, + "grad_norm": 0.2950269281864166, + "learning_rate": 6.915161288089849e-06, + "loss": 0.4086, + "step": 4044 + }, + { + "epoch": 2.1868805190124347, + "grad_norm": 0.33400097489356995, + "learning_rate": 6.913417161825449e-06, + "loss": 0.4427, + "step": 4045 + }, + { + "epoch": 2.187421156965219, + "grad_norm": 0.32226991653442383, + "learning_rate": 6.911672762749502e-06, + "loss": 0.3979, + "step": 4046 + }, + { + "epoch": 2.187961794918003, + "grad_norm": 0.3366377055644989, + "learning_rate": 6.9099280911107166e-06, + "loss": 0.4158, + "step": 4047 + }, + { + "epoch": 2.1885024328707874, + "grad_norm": 0.3064139783382416, + "learning_rate": 6.908183147157847e-06, + "loss": 0.3948, + "step": 4048 + }, + { + "epoch": 2.1890430708235717, + "grad_norm": 0.3099367320537567, + "learning_rate": 6.906437931139686e-06, + "loss": 0.3498, + "step": 4049 + }, + { + "epoch": 2.189583708776356, + "grad_norm": 0.40618830919265747, + "learning_rate": 6.904692443305059e-06, + "loss": 0.3899, + "step": 4050 + }, + { + "epoch": 2.1901243467291405, + "grad_norm": 0.3219248354434967, + "learning_rate": 6.902946683902839e-06, + "loss": 0.4097, + "step": 4051 + }, + { + "epoch": 2.190664984681925, + "grad_norm": 0.349616676568985, + "learning_rate": 6.90120065318193e-06, + "loss": 0.4085, + "step": 4052 + }, + { + "epoch": 2.1912056226347087, + "grad_norm": 0.3385084569454193, + "learning_rate": 6.899454351391279e-06, + "loss": 0.4001, + "step": 4053 + }, + { + "epoch": 2.191746260587493, + "grad_norm": 0.3068590462207794, + "learning_rate": 6.897707778779871e-06, + "loss": 0.3806, + "step": 4054 + }, + { + "epoch": 2.1922868985402775, + "grad_norm": 0.34392955899238586, + "learning_rate": 6.895960935596728e-06, + "loss": 0.4064, + "step": 4055 + }, + { + "epoch": 2.192827536493062, + "grad_norm": 0.35632532835006714, + "learning_rate": 6.8942138220909116e-06, + "loss": 0.4246, + "step": 4056 + }, + { + "epoch": 2.193368174445846, + "grad_norm": 0.293804794549942, + "learning_rate": 6.892466438511525e-06, + "loss": 0.4068, + "step": 4057 + }, + { + "epoch": 2.1939088123986306, + "grad_norm": 0.3229387700557709, + "learning_rate": 6.8907187851077026e-06, + "loss": 0.385, + "step": 4058 + }, + { + "epoch": 2.1944494503514145, + "grad_norm": 0.32888251543045044, + "learning_rate": 6.888970862128627e-06, + "loss": 0.3808, + "step": 4059 + }, + { + "epoch": 2.194990088304199, + "grad_norm": 0.3496990501880646, + "learning_rate": 6.8872226698235065e-06, + "loss": 0.4257, + "step": 4060 + }, + { + "epoch": 2.195530726256983, + "grad_norm": 0.29963988065719604, + "learning_rate": 6.885474208441602e-06, + "loss": 0.3935, + "step": 4061 + }, + { + "epoch": 2.1960713642097676, + "grad_norm": 0.31232914328575134, + "learning_rate": 6.883725478232204e-06, + "loss": 0.3689, + "step": 4062 + }, + { + "epoch": 2.196612002162552, + "grad_norm": 0.3286808133125305, + "learning_rate": 6.8819764794446434e-06, + "loss": 0.4064, + "step": 4063 + }, + { + "epoch": 2.1971526401153363, + "grad_norm": 0.31781333684921265, + "learning_rate": 6.880227212328285e-06, + "loss": 0.4102, + "step": 4064 + }, + { + "epoch": 2.19769327806812, + "grad_norm": 0.30479738116264343, + "learning_rate": 6.8784776771325426e-06, + "loss": 0.4077, + "step": 4065 + }, + { + "epoch": 2.1982339160209046, + "grad_norm": 0.3371626138687134, + "learning_rate": 6.876727874106858e-06, + "loss": 0.3896, + "step": 4066 + }, + { + "epoch": 2.198774553973689, + "grad_norm": 0.2924918532371521, + "learning_rate": 6.874977803500716e-06, + "loss": 0.3964, + "step": 4067 + }, + { + "epoch": 2.1993151919264733, + "grad_norm": 0.30662932991981506, + "learning_rate": 6.873227465563639e-06, + "loss": 0.4427, + "step": 4068 + }, + { + "epoch": 2.1998558298792577, + "grad_norm": 0.2745401859283447, + "learning_rate": 6.8714768605451865e-06, + "loss": 0.3562, + "step": 4069 + }, + { + "epoch": 2.200396467832042, + "grad_norm": 0.3846108019351959, + "learning_rate": 6.869725988694955e-06, + "loss": 0.437, + "step": 4070 + }, + { + "epoch": 2.200937105784826, + "grad_norm": 0.29725199937820435, + "learning_rate": 6.867974850262582e-06, + "loss": 0.3998, + "step": 4071 + }, + { + "epoch": 2.2014777437376103, + "grad_norm": 0.3217564821243286, + "learning_rate": 6.866223445497743e-06, + "loss": 0.411, + "step": 4072 + }, + { + "epoch": 2.2020183816903947, + "grad_norm": 0.32411977648735046, + "learning_rate": 6.864471774650147e-06, + "loss": 0.4095, + "step": 4073 + }, + { + "epoch": 2.202559019643179, + "grad_norm": 0.33379268646240234, + "learning_rate": 6.862719837969548e-06, + "loss": 0.4391, + "step": 4074 + }, + { + "epoch": 2.2030996575959634, + "grad_norm": 0.3779868483543396, + "learning_rate": 6.860967635705732e-06, + "loss": 0.4423, + "step": 4075 + }, + { + "epoch": 2.2036402955487473, + "grad_norm": 0.3187415897846222, + "learning_rate": 6.859215168108523e-06, + "loss": 0.4474, + "step": 4076 + }, + { + "epoch": 2.2041809335015317, + "grad_norm": 0.2879991829395294, + "learning_rate": 6.8574624354277866e-06, + "loss": 0.368, + "step": 4077 + }, + { + "epoch": 2.204721571454316, + "grad_norm": 0.3639747202396393, + "learning_rate": 6.855709437913424e-06, + "loss": 0.3946, + "step": 4078 + }, + { + "epoch": 2.2052622094071004, + "grad_norm": 0.305832177400589, + "learning_rate": 6.853956175815375e-06, + "loss": 0.3756, + "step": 4079 + }, + { + "epoch": 2.2058028473598847, + "grad_norm": 0.32758861780166626, + "learning_rate": 6.8522026493836144e-06, + "loss": 0.4268, + "step": 4080 + }, + { + "epoch": 2.206343485312669, + "grad_norm": 0.29522502422332764, + "learning_rate": 6.850448858868161e-06, + "loss": 0.4, + "step": 4081 + }, + { + "epoch": 2.2068841232654535, + "grad_norm": 0.29559990763664246, + "learning_rate": 6.848694804519063e-06, + "loss": 0.4127, + "step": 4082 + }, + { + "epoch": 2.2074247612182374, + "grad_norm": 0.3084014058113098, + "learning_rate": 6.846940486586411e-06, + "loss": 0.3849, + "step": 4083 + }, + { + "epoch": 2.2079653991710217, + "grad_norm": 0.33527031540870667, + "learning_rate": 6.845185905320333e-06, + "loss": 0.3979, + "step": 4084 + }, + { + "epoch": 2.208506037123806, + "grad_norm": 0.30264729261398315, + "learning_rate": 6.843431060970995e-06, + "loss": 0.3825, + "step": 4085 + }, + { + "epoch": 2.2090466750765905, + "grad_norm": 0.323540061712265, + "learning_rate": 6.841675953788598e-06, + "loss": 0.4055, + "step": 4086 + }, + { + "epoch": 2.209587313029375, + "grad_norm": 0.32292747497558594, + "learning_rate": 6.839920584023384e-06, + "loss": 0.4317, + "step": 4087 + }, + { + "epoch": 2.2101279509821588, + "grad_norm": 0.3411714732646942, + "learning_rate": 6.838164951925628e-06, + "loss": 0.3958, + "step": 4088 + }, + { + "epoch": 2.210668588934943, + "grad_norm": 0.31609079241752625, + "learning_rate": 6.836409057745645e-06, + "loss": 0.419, + "step": 4089 + }, + { + "epoch": 2.2112092268877275, + "grad_norm": 0.30467334389686584, + "learning_rate": 6.834652901733789e-06, + "loss": 0.3877, + "step": 4090 + }, + { + "epoch": 2.211749864840512, + "grad_norm": 0.33683645725250244, + "learning_rate": 6.83289648414045e-06, + "loss": 0.4336, + "step": 4091 + }, + { + "epoch": 2.212290502793296, + "grad_norm": 0.29473188519477844, + "learning_rate": 6.831139805216053e-06, + "loss": 0.3971, + "step": 4092 + }, + { + "epoch": 2.2128311407460806, + "grad_norm": 0.3108297884464264, + "learning_rate": 6.829382865211063e-06, + "loss": 0.4082, + "step": 4093 + }, + { + "epoch": 2.2133717786988645, + "grad_norm": 0.30056747794151306, + "learning_rate": 6.827625664375979e-06, + "loss": 0.3676, + "step": 4094 + }, + { + "epoch": 2.213912416651649, + "grad_norm": 0.31948915123939514, + "learning_rate": 6.825868202961343e-06, + "loss": 0.4109, + "step": 4095 + }, + { + "epoch": 2.214453054604433, + "grad_norm": 0.32373663783073425, + "learning_rate": 6.824110481217728e-06, + "loss": 0.3751, + "step": 4096 + }, + { + "epoch": 2.2149936925572176, + "grad_norm": 0.3314642608165741, + "learning_rate": 6.822352499395751e-06, + "loss": 0.3544, + "step": 4097 + }, + { + "epoch": 2.215534330510002, + "grad_norm": 0.3104372024536133, + "learning_rate": 6.820594257746055e-06, + "loss": 0.4083, + "step": 4098 + }, + { + "epoch": 2.2160749684627863, + "grad_norm": 0.3360132873058319, + "learning_rate": 6.818835756519331e-06, + "loss": 0.4017, + "step": 4099 + }, + { + "epoch": 2.21661560641557, + "grad_norm": 0.3249875009059906, + "learning_rate": 6.8170769959663045e-06, + "loss": 0.434, + "step": 4100 + }, + { + "epoch": 2.2171562443683546, + "grad_norm": 0.28293728828430176, + "learning_rate": 6.815317976337734e-06, + "loss": 0.3623, + "step": 4101 + }, + { + "epoch": 2.217696882321139, + "grad_norm": 0.3353882133960724, + "learning_rate": 6.8135586978844175e-06, + "loss": 0.3761, + "step": 4102 + }, + { + "epoch": 2.2182375202739233, + "grad_norm": 0.3167102038860321, + "learning_rate": 6.811799160857191e-06, + "loss": 0.3856, + "step": 4103 + }, + { + "epoch": 2.2187781582267077, + "grad_norm": 0.31739798188209534, + "learning_rate": 6.810039365506923e-06, + "loss": 0.3971, + "step": 4104 + }, + { + "epoch": 2.2193187961794916, + "grad_norm": 0.2923511266708374, + "learning_rate": 6.808279312084525e-06, + "loss": 0.3651, + "step": 4105 + }, + { + "epoch": 2.219859434132276, + "grad_norm": 0.32602718472480774, + "learning_rate": 6.806519000840941e-06, + "loss": 0.4572, + "step": 4106 + }, + { + "epoch": 2.2204000720850603, + "grad_norm": 0.3319365680217743, + "learning_rate": 6.8047584320271555e-06, + "loss": 0.4192, + "step": 4107 + }, + { + "epoch": 2.2209407100378447, + "grad_norm": 0.33049488067626953, + "learning_rate": 6.802997605894183e-06, + "loss": 0.3878, + "step": 4108 + }, + { + "epoch": 2.221481347990629, + "grad_norm": 0.3242778778076172, + "learning_rate": 6.8012365226930825e-06, + "loss": 0.4217, + "step": 4109 + }, + { + "epoch": 2.2220219859434134, + "grad_norm": 0.33477145433425903, + "learning_rate": 6.799475182674942e-06, + "loss": 0.3831, + "step": 4110 + }, + { + "epoch": 2.2225626238961977, + "grad_norm": 0.3455541133880615, + "learning_rate": 6.797713586090893e-06, + "loss": 0.4395, + "step": 4111 + }, + { + "epoch": 2.2231032618489817, + "grad_norm": 0.3297395706176758, + "learning_rate": 6.795951733192101e-06, + "loss": 0.4273, + "step": 4112 + }, + { + "epoch": 2.223643899801766, + "grad_norm": 0.3724072277545929, + "learning_rate": 6.794189624229768e-06, + "loss": 0.4298, + "step": 4113 + }, + { + "epoch": 2.2241845377545504, + "grad_norm": 0.3061438500881195, + "learning_rate": 6.792427259455131e-06, + "loss": 0.3993, + "step": 4114 + }, + { + "epoch": 2.2247251757073347, + "grad_norm": 0.3195669651031494, + "learning_rate": 6.790664639119464e-06, + "loss": 0.3715, + "step": 4115 + }, + { + "epoch": 2.225265813660119, + "grad_norm": 0.4220083951950073, + "learning_rate": 6.788901763474082e-06, + "loss": 0.4554, + "step": 4116 + }, + { + "epoch": 2.225806451612903, + "grad_norm": 0.29164981842041016, + "learning_rate": 6.787138632770327e-06, + "loss": 0.3788, + "step": 4117 + }, + { + "epoch": 2.2263470895656874, + "grad_norm": 0.35630443692207336, + "learning_rate": 6.785375247259588e-06, + "loss": 0.4045, + "step": 4118 + }, + { + "epoch": 2.2268877275184717, + "grad_norm": 0.41154932975769043, + "learning_rate": 6.783611607193282e-06, + "loss": 0.4645, + "step": 4119 + }, + { + "epoch": 2.227428365471256, + "grad_norm": 0.3228520154953003, + "learning_rate": 6.781847712822869e-06, + "loss": 0.4128, + "step": 4120 + }, + { + "epoch": 2.2279690034240405, + "grad_norm": 0.32682761549949646, + "learning_rate": 6.7800835643998374e-06, + "loss": 0.3776, + "step": 4121 + }, + { + "epoch": 2.228509641376825, + "grad_norm": 0.3961796164512634, + "learning_rate": 6.778319162175722e-06, + "loss": 0.4248, + "step": 4122 + }, + { + "epoch": 2.2290502793296088, + "grad_norm": 0.32837003469467163, + "learning_rate": 6.776554506402081e-06, + "loss": 0.418, + "step": 4123 + }, + { + "epoch": 2.229590917282393, + "grad_norm": 0.3298190236091614, + "learning_rate": 6.774789597330523e-06, + "loss": 0.4255, + "step": 4124 + }, + { + "epoch": 2.2301315552351775, + "grad_norm": 0.3101571202278137, + "learning_rate": 6.773024435212678e-06, + "loss": 0.4013, + "step": 4125 + }, + { + "epoch": 2.230672193187962, + "grad_norm": 0.31095200777053833, + "learning_rate": 6.771259020300227e-06, + "loss": 0.3868, + "step": 4126 + }, + { + "epoch": 2.231212831140746, + "grad_norm": 0.3008178770542145, + "learning_rate": 6.769493352844876e-06, + "loss": 0.3842, + "step": 4127 + }, + { + "epoch": 2.2317534690935306, + "grad_norm": 0.3014555275440216, + "learning_rate": 6.76772743309837e-06, + "loss": 0.4122, + "step": 4128 + }, + { + "epoch": 2.2322941070463145, + "grad_norm": 0.29356569051742554, + "learning_rate": 6.765961261312492e-06, + "loss": 0.3627, + "step": 4129 + }, + { + "epoch": 2.232834744999099, + "grad_norm": 0.32311195135116577, + "learning_rate": 6.76419483773906e-06, + "loss": 0.4066, + "step": 4130 + }, + { + "epoch": 2.233375382951883, + "grad_norm": 0.2994847893714905, + "learning_rate": 6.762428162629925e-06, + "loss": 0.3879, + "step": 4131 + }, + { + "epoch": 2.2339160209046676, + "grad_norm": 0.32352471351623535, + "learning_rate": 6.76066123623698e-06, + "loss": 0.3943, + "step": 4132 + }, + { + "epoch": 2.234456658857452, + "grad_norm": 0.30245065689086914, + "learning_rate": 6.758894058812146e-06, + "loss": 0.3753, + "step": 4133 + }, + { + "epoch": 2.234997296810236, + "grad_norm": 0.3160868287086487, + "learning_rate": 6.757126630607389e-06, + "loss": 0.4269, + "step": 4134 + }, + { + "epoch": 2.23553793476302, + "grad_norm": 0.3280041813850403, + "learning_rate": 6.755358951874701e-06, + "loss": 0.4646, + "step": 4135 + }, + { + "epoch": 2.2360785727158046, + "grad_norm": 0.2949141561985016, + "learning_rate": 6.753591022866117e-06, + "loss": 0.4271, + "step": 4136 + }, + { + "epoch": 2.236619210668589, + "grad_norm": 0.284419983625412, + "learning_rate": 6.751822843833704e-06, + "loss": 0.3972, + "step": 4137 + }, + { + "epoch": 2.2371598486213733, + "grad_norm": 0.2773050367832184, + "learning_rate": 6.750054415029567e-06, + "loss": 0.3976, + "step": 4138 + }, + { + "epoch": 2.2377004865741577, + "grad_norm": 0.33673977851867676, + "learning_rate": 6.748285736705844e-06, + "loss": 0.4122, + "step": 4139 + }, + { + "epoch": 2.238241124526942, + "grad_norm": 0.3167836368083954, + "learning_rate": 6.7465168091147094e-06, + "loss": 0.3868, + "step": 4140 + }, + { + "epoch": 2.238781762479726, + "grad_norm": 0.3082415461540222, + "learning_rate": 6.7447476325083764e-06, + "loss": 0.4109, + "step": 4141 + }, + { + "epoch": 2.2393224004325103, + "grad_norm": 0.34712666273117065, + "learning_rate": 6.7429782071390895e-06, + "loss": 0.3989, + "step": 4142 + }, + { + "epoch": 2.2398630383852947, + "grad_norm": 0.3211289346218109, + "learning_rate": 6.741208533259128e-06, + "loss": 0.4312, + "step": 4143 + }, + { + "epoch": 2.240403676338079, + "grad_norm": 0.30233892798423767, + "learning_rate": 6.739438611120813e-06, + "loss": 0.4164, + "step": 4144 + }, + { + "epoch": 2.2409443142908634, + "grad_norm": 0.3043302595615387, + "learning_rate": 6.737668440976494e-06, + "loss": 0.3808, + "step": 4145 + }, + { + "epoch": 2.2414849522436473, + "grad_norm": 0.3419129550457001, + "learning_rate": 6.735898023078558e-06, + "loss": 0.4315, + "step": 4146 + }, + { + "epoch": 2.2420255901964317, + "grad_norm": 0.3226799964904785, + "learning_rate": 6.734127357679431e-06, + "loss": 0.384, + "step": 4147 + }, + { + "epoch": 2.242566228149216, + "grad_norm": 0.3249484896659851, + "learning_rate": 6.732356445031569e-06, + "loss": 0.4241, + "step": 4148 + }, + { + "epoch": 2.2431068661020004, + "grad_norm": 0.36469215154647827, + "learning_rate": 6.730585285387465e-06, + "loss": 0.4364, + "step": 4149 + }, + { + "epoch": 2.2436475040547847, + "grad_norm": 0.32166287302970886, + "learning_rate": 6.728813878999652e-06, + "loss": 0.3981, + "step": 4150 + }, + { + "epoch": 2.244188142007569, + "grad_norm": 0.3083460330963135, + "learning_rate": 6.727042226120686e-06, + "loss": 0.3986, + "step": 4151 + }, + { + "epoch": 2.244728779960353, + "grad_norm": 0.31005361676216125, + "learning_rate": 6.725270327003174e-06, + "loss": 0.3685, + "step": 4152 + }, + { + "epoch": 2.2452694179131374, + "grad_norm": 0.347377747297287, + "learning_rate": 6.723498181899746e-06, + "loss": 0.4205, + "step": 4153 + }, + { + "epoch": 2.2458100558659218, + "grad_norm": 0.3285289704799652, + "learning_rate": 6.721725791063071e-06, + "loss": 0.4045, + "step": 4154 + }, + { + "epoch": 2.246350693818706, + "grad_norm": 0.32021549344062805, + "learning_rate": 6.719953154745857e-06, + "loss": 0.3981, + "step": 4155 + }, + { + "epoch": 2.2468913317714905, + "grad_norm": 0.3265751600265503, + "learning_rate": 6.7181802732008385e-06, + "loss": 0.3944, + "step": 4156 + }, + { + "epoch": 2.247431969724275, + "grad_norm": 0.36013728380203247, + "learning_rate": 6.716407146680793e-06, + "loss": 0.4218, + "step": 4157 + }, + { + "epoch": 2.2479726076770588, + "grad_norm": 0.29418373107910156, + "learning_rate": 6.714633775438528e-06, + "loss": 0.3816, + "step": 4158 + }, + { + "epoch": 2.248513245629843, + "grad_norm": 0.34002870321273804, + "learning_rate": 6.712860159726887e-06, + "loss": 0.4253, + "step": 4159 + }, + { + "epoch": 2.2490538835826275, + "grad_norm": 0.34566155076026917, + "learning_rate": 6.7110862997987525e-06, + "loss": 0.4148, + "step": 4160 + }, + { + "epoch": 2.249594521535412, + "grad_norm": 0.2925388216972351, + "learning_rate": 6.709312195907034e-06, + "loss": 0.3885, + "step": 4161 + }, + { + "epoch": 2.250135159488196, + "grad_norm": 0.34234461188316345, + "learning_rate": 6.707537848304682e-06, + "loss": 0.4115, + "step": 4162 + }, + { + "epoch": 2.25067579744098, + "grad_norm": 0.3295589089393616, + "learning_rate": 6.705763257244679e-06, + "loss": 0.4087, + "step": 4163 + }, + { + "epoch": 2.2512164353937645, + "grad_norm": 0.32688218355178833, + "learning_rate": 6.703988422980045e-06, + "loss": 0.414, + "step": 4164 + }, + { + "epoch": 2.251757073346549, + "grad_norm": 0.30534660816192627, + "learning_rate": 6.70221334576383e-06, + "loss": 0.3775, + "step": 4165 + }, + { + "epoch": 2.252297711299333, + "grad_norm": 0.34004101157188416, + "learning_rate": 6.7004380258491256e-06, + "loss": 0.4108, + "step": 4166 + }, + { + "epoch": 2.2528383492521176, + "grad_norm": 0.3068768382072449, + "learning_rate": 6.698662463489047e-06, + "loss": 0.3879, + "step": 4167 + }, + { + "epoch": 2.253378987204902, + "grad_norm": 0.3664450943470001, + "learning_rate": 6.696886658936754e-06, + "loss": 0.4355, + "step": 4168 + }, + { + "epoch": 2.2539196251576863, + "grad_norm": 0.35220062732696533, + "learning_rate": 6.695110612445439e-06, + "loss": 0.4041, + "step": 4169 + }, + { + "epoch": 2.25446026311047, + "grad_norm": 0.35381558537483215, + "learning_rate": 6.693334324268328e-06, + "loss": 0.4549, + "step": 4170 + }, + { + "epoch": 2.2550009010632546, + "grad_norm": 0.30724719166755676, + "learning_rate": 6.691557794658676e-06, + "loss": 0.3792, + "step": 4171 + }, + { + "epoch": 2.255541539016039, + "grad_norm": 0.35158011317253113, + "learning_rate": 6.689781023869784e-06, + "loss": 0.4359, + "step": 4172 + }, + { + "epoch": 2.2560821769688233, + "grad_norm": 0.33039185404777527, + "learning_rate": 6.688004012154975e-06, + "loss": 0.3976, + "step": 4173 + }, + { + "epoch": 2.2566228149216077, + "grad_norm": 0.3083387315273285, + "learning_rate": 6.686226759767616e-06, + "loss": 0.3943, + "step": 4174 + }, + { + "epoch": 2.2571634528743916, + "grad_norm": 0.3018893599510193, + "learning_rate": 6.684449266961101e-06, + "loss": 0.3741, + "step": 4175 + }, + { + "epoch": 2.257704090827176, + "grad_norm": 0.3342258334159851, + "learning_rate": 6.682671533988864e-06, + "loss": 0.4128, + "step": 4176 + }, + { + "epoch": 2.2582447287799603, + "grad_norm": 0.30471324920654297, + "learning_rate": 6.680893561104373e-06, + "loss": 0.4202, + "step": 4177 + }, + { + "epoch": 2.2587853667327447, + "grad_norm": 0.32386910915374756, + "learning_rate": 6.679115348561122e-06, + "loss": 0.4405, + "step": 4178 + }, + { + "epoch": 2.259326004685529, + "grad_norm": 0.2943861186504364, + "learning_rate": 6.677336896612652e-06, + "loss": 0.3788, + "step": 4179 + }, + { + "epoch": 2.2598666426383134, + "grad_norm": 0.35374173521995544, + "learning_rate": 6.675558205512527e-06, + "loss": 0.408, + "step": 4180 + }, + { + "epoch": 2.2604072805910973, + "grad_norm": 0.30802565813064575, + "learning_rate": 6.673779275514351e-06, + "loss": 0.4028, + "step": 4181 + }, + { + "epoch": 2.2609479185438817, + "grad_norm": 0.2978096008300781, + "learning_rate": 6.672000106871761e-06, + "loss": 0.3682, + "step": 4182 + }, + { + "epoch": 2.261488556496666, + "grad_norm": 0.31863123178482056, + "learning_rate": 6.670220699838429e-06, + "loss": 0.4376, + "step": 4183 + }, + { + "epoch": 2.2620291944494504, + "grad_norm": 0.3239419758319855, + "learning_rate": 6.668441054668055e-06, + "loss": 0.4333, + "step": 4184 + }, + { + "epoch": 2.2625698324022347, + "grad_norm": 0.31215062737464905, + "learning_rate": 6.666661171614382e-06, + "loss": 0.404, + "step": 4185 + }, + { + "epoch": 2.263110470355019, + "grad_norm": 0.2932754456996918, + "learning_rate": 6.66488105093118e-06, + "loss": 0.3572, + "step": 4186 + }, + { + "epoch": 2.263651108307803, + "grad_norm": 0.3539320230484009, + "learning_rate": 6.663100692872259e-06, + "loss": 0.4356, + "step": 4187 + }, + { + "epoch": 2.2641917462605874, + "grad_norm": 0.2999046742916107, + "learning_rate": 6.661320097691454e-06, + "loss": 0.3999, + "step": 4188 + }, + { + "epoch": 2.2647323842133718, + "grad_norm": 0.32621529698371887, + "learning_rate": 6.659539265642643e-06, + "loss": 0.407, + "step": 4189 + }, + { + "epoch": 2.265273022166156, + "grad_norm": 0.2953600287437439, + "learning_rate": 6.657758196979732e-06, + "loss": 0.3681, + "step": 4190 + }, + { + "epoch": 2.2658136601189405, + "grad_norm": 0.30013948678970337, + "learning_rate": 6.655976891956662e-06, + "loss": 0.3866, + "step": 4191 + }, + { + "epoch": 2.2663542980717244, + "grad_norm": 0.34683653712272644, + "learning_rate": 6.654195350827411e-06, + "loss": 0.4192, + "step": 4192 + }, + { + "epoch": 2.2668949360245088, + "grad_norm": 0.31711041927337646, + "learning_rate": 6.652413573845985e-06, + "loss": 0.3867, + "step": 4193 + }, + { + "epoch": 2.267435573977293, + "grad_norm": 0.29812562465667725, + "learning_rate": 6.650631561266427e-06, + "loss": 0.3507, + "step": 4194 + }, + { + "epoch": 2.2679762119300775, + "grad_norm": 0.349576473236084, + "learning_rate": 6.648849313342816e-06, + "loss": 0.4139, + "step": 4195 + }, + { + "epoch": 2.268516849882862, + "grad_norm": 0.31229591369628906, + "learning_rate": 6.647066830329258e-06, + "loss": 0.3888, + "step": 4196 + }, + { + "epoch": 2.269057487835646, + "grad_norm": 0.3425281047821045, + "learning_rate": 6.645284112479897e-06, + "loss": 0.42, + "step": 4197 + }, + { + "epoch": 2.2695981257884306, + "grad_norm": 0.3798248767852783, + "learning_rate": 6.643501160048911e-06, + "loss": 0.429, + "step": 4198 + }, + { + "epoch": 2.2701387637412145, + "grad_norm": 0.30689990520477295, + "learning_rate": 6.6417179732905104e-06, + "loss": 0.3916, + "step": 4199 + }, + { + "epoch": 2.270679401693999, + "grad_norm": 0.3451610207557678, + "learning_rate": 6.6399345524589366e-06, + "loss": 0.4665, + "step": 4200 + }, + { + "epoch": 2.271220039646783, + "grad_norm": 0.29163801670074463, + "learning_rate": 6.638150897808469e-06, + "loss": 0.3605, + "step": 4201 + }, + { + "epoch": 2.2717606775995676, + "grad_norm": 0.3330785632133484, + "learning_rate": 6.636367009593415e-06, + "loss": 0.4162, + "step": 4202 + }, + { + "epoch": 2.272301315552352, + "grad_norm": 0.3180115222930908, + "learning_rate": 6.63458288806812e-06, + "loss": 0.3684, + "step": 4203 + }, + { + "epoch": 2.272841953505136, + "grad_norm": 0.33413010835647583, + "learning_rate": 6.632798533486961e-06, + "loss": 0.4144, + "step": 4204 + }, + { + "epoch": 2.27338259145792, + "grad_norm": 0.3052087724208832, + "learning_rate": 6.631013946104348e-06, + "loss": 0.4008, + "step": 4205 + }, + { + "epoch": 2.2739232294107046, + "grad_norm": 0.3361079692840576, + "learning_rate": 6.6292291261747225e-06, + "loss": 0.3795, + "step": 4206 + }, + { + "epoch": 2.274463867363489, + "grad_norm": 0.29779067635536194, + "learning_rate": 6.6274440739525635e-06, + "loss": 0.3918, + "step": 4207 + }, + { + "epoch": 2.2750045053162733, + "grad_norm": 0.3415556252002716, + "learning_rate": 6.6256587896923785e-06, + "loss": 0.4196, + "step": 4208 + }, + { + "epoch": 2.2755451432690577, + "grad_norm": 0.31661680340766907, + "learning_rate": 6.62387327364871e-06, + "loss": 0.3929, + "step": 4209 + }, + { + "epoch": 2.276085781221842, + "grad_norm": 0.2811526358127594, + "learning_rate": 6.622087526076135e-06, + "loss": 0.3871, + "step": 4210 + }, + { + "epoch": 2.276626419174626, + "grad_norm": 0.33795827627182007, + "learning_rate": 6.620301547229262e-06, + "loss": 0.4119, + "step": 4211 + }, + { + "epoch": 2.2771670571274103, + "grad_norm": 0.326340913772583, + "learning_rate": 6.618515337362732e-06, + "loss": 0.4251, + "step": 4212 + }, + { + "epoch": 2.2777076950801947, + "grad_norm": 0.3061928153038025, + "learning_rate": 6.61672889673122e-06, + "loss": 0.4028, + "step": 4213 + }, + { + "epoch": 2.278248333032979, + "grad_norm": 0.31582531332969666, + "learning_rate": 6.614942225589432e-06, + "loss": 0.3773, + "step": 4214 + }, + { + "epoch": 2.2787889709857634, + "grad_norm": 0.30591699481010437, + "learning_rate": 6.613155324192111e-06, + "loss": 0.416, + "step": 4215 + }, + { + "epoch": 2.2793296089385473, + "grad_norm": 0.3101104497909546, + "learning_rate": 6.611368192794028e-06, + "loss": 0.3711, + "step": 4216 + }, + { + "epoch": 2.2798702468913317, + "grad_norm": 0.3451632857322693, + "learning_rate": 6.609580831649991e-06, + "loss": 0.3934, + "step": 4217 + }, + { + "epoch": 2.280410884844116, + "grad_norm": 0.32937222719192505, + "learning_rate": 6.607793241014835e-06, + "loss": 0.4237, + "step": 4218 + }, + { + "epoch": 2.2809515227969004, + "grad_norm": 0.3161192536354065, + "learning_rate": 6.606005421143436e-06, + "loss": 0.3976, + "step": 4219 + }, + { + "epoch": 2.2814921607496847, + "grad_norm": 0.32822996377944946, + "learning_rate": 6.604217372290693e-06, + "loss": 0.4398, + "step": 4220 + }, + { + "epoch": 2.2820327987024687, + "grad_norm": 0.30190369486808777, + "learning_rate": 6.602429094711549e-06, + "loss": 0.4031, + "step": 4221 + }, + { + "epoch": 2.282573436655253, + "grad_norm": 0.3189171552658081, + "learning_rate": 6.600640588660968e-06, + "loss": 0.4208, + "step": 4222 + }, + { + "epoch": 2.2831140746080374, + "grad_norm": 0.315018892288208, + "learning_rate": 6.598851854393956e-06, + "loss": 0.4253, + "step": 4223 + }, + { + "epoch": 2.2836547125608218, + "grad_norm": 0.3788672089576721, + "learning_rate": 6.5970628921655445e-06, + "loss": 0.4235, + "step": 4224 + }, + { + "epoch": 2.284195350513606, + "grad_norm": 0.3006134033203125, + "learning_rate": 6.5952737022308e-06, + "loss": 0.3976, + "step": 4225 + }, + { + "epoch": 2.2847359884663905, + "grad_norm": 0.3113172948360443, + "learning_rate": 6.5934842848448245e-06, + "loss": 0.4296, + "step": 4226 + }, + { + "epoch": 2.285276626419175, + "grad_norm": 0.2868853509426117, + "learning_rate": 6.591694640262749e-06, + "loss": 0.3771, + "step": 4227 + }, + { + "epoch": 2.2858172643719588, + "grad_norm": 0.3638264536857605, + "learning_rate": 6.589904768739737e-06, + "loss": 0.4157, + "step": 4228 + }, + { + "epoch": 2.286357902324743, + "grad_norm": 0.33750247955322266, + "learning_rate": 6.588114670530989e-06, + "loss": 0.3962, + "step": 4229 + }, + { + "epoch": 2.2868985402775275, + "grad_norm": 0.29702359437942505, + "learning_rate": 6.586324345891727e-06, + "loss": 0.3742, + "step": 4230 + }, + { + "epoch": 2.287439178230312, + "grad_norm": 0.31465309858322144, + "learning_rate": 6.584533795077217e-06, + "loss": 0.3886, + "step": 4231 + }, + { + "epoch": 2.287979816183096, + "grad_norm": 0.3077332079410553, + "learning_rate": 6.582743018342751e-06, + "loss": 0.398, + "step": 4232 + }, + { + "epoch": 2.28852045413588, + "grad_norm": 0.3539144992828369, + "learning_rate": 6.580952015943656e-06, + "loss": 0.4202, + "step": 4233 + }, + { + "epoch": 2.2890610920886645, + "grad_norm": 0.2928106486797333, + "learning_rate": 6.579160788135288e-06, + "loss": 0.3878, + "step": 4234 + }, + { + "epoch": 2.289601730041449, + "grad_norm": 0.318897545337677, + "learning_rate": 6.57736933517304e-06, + "loss": 0.4048, + "step": 4235 + }, + { + "epoch": 2.290142367994233, + "grad_norm": 0.3368491530418396, + "learning_rate": 6.57557765731233e-06, + "loss": 0.3887, + "step": 4236 + }, + { + "epoch": 2.2906830059470176, + "grad_norm": 0.3481080234050751, + "learning_rate": 6.573785754808615e-06, + "loss": 0.3766, + "step": 4237 + }, + { + "epoch": 2.291223643899802, + "grad_norm": 0.31124183535575867, + "learning_rate": 6.5719936279173805e-06, + "loss": 0.4233, + "step": 4238 + }, + { + "epoch": 2.2917642818525863, + "grad_norm": 0.3217078447341919, + "learning_rate": 6.570201276894146e-06, + "loss": 0.3749, + "step": 4239 + }, + { + "epoch": 2.29230491980537, + "grad_norm": 0.36222043633461, + "learning_rate": 6.568408701994459e-06, + "loss": 0.3869, + "step": 4240 + }, + { + "epoch": 2.2928455577581546, + "grad_norm": 0.32569992542266846, + "learning_rate": 6.566615903473902e-06, + "loss": 0.4175, + "step": 4241 + }, + { + "epoch": 2.293386195710939, + "grad_norm": 0.3401198983192444, + "learning_rate": 6.564822881588092e-06, + "loss": 0.3587, + "step": 4242 + }, + { + "epoch": 2.2939268336637233, + "grad_norm": 0.3378213047981262, + "learning_rate": 6.563029636592671e-06, + "loss": 0.3994, + "step": 4243 + }, + { + "epoch": 2.2944674716165077, + "grad_norm": 0.3366250693798065, + "learning_rate": 6.56123616874332e-06, + "loss": 0.3975, + "step": 4244 + }, + { + "epoch": 2.2950081095692916, + "grad_norm": 0.31765657663345337, + "learning_rate": 6.559442478295745e-06, + "loss": 0.4338, + "step": 4245 + }, + { + "epoch": 2.295548747522076, + "grad_norm": 0.31671106815338135, + "learning_rate": 6.557648565505691e-06, + "loss": 0.4179, + "step": 4246 + }, + { + "epoch": 2.2960893854748603, + "grad_norm": 0.331931471824646, + "learning_rate": 6.555854430628927e-06, + "loss": 0.3769, + "step": 4247 + }, + { + "epoch": 2.2966300234276447, + "grad_norm": 0.3320377767086029, + "learning_rate": 6.55406007392126e-06, + "loss": 0.4169, + "step": 4248 + }, + { + "epoch": 2.297170661380429, + "grad_norm": 0.2971826493740082, + "learning_rate": 6.5522654956385254e-06, + "loss": 0.3569, + "step": 4249 + }, + { + "epoch": 2.297711299333213, + "grad_norm": 0.3325771391391754, + "learning_rate": 6.550470696036591e-06, + "loss": 0.44, + "step": 4250 + }, + { + "epoch": 2.2982519372859973, + "grad_norm": 0.3099461495876312, + "learning_rate": 6.548675675371356e-06, + "loss": 0.4025, + "step": 4251 + }, + { + "epoch": 2.2987925752387817, + "grad_norm": 0.3246251344680786, + "learning_rate": 6.5468804338987515e-06, + "loss": 0.3984, + "step": 4252 + }, + { + "epoch": 2.299333213191566, + "grad_norm": 0.33402061462402344, + "learning_rate": 6.545084971874738e-06, + "loss": 0.4016, + "step": 4253 + }, + { + "epoch": 2.2998738511443504, + "grad_norm": 0.28140220046043396, + "learning_rate": 6.5432892895553115e-06, + "loss": 0.3769, + "step": 4254 + }, + { + "epoch": 2.3004144890971348, + "grad_norm": 0.3311731517314911, + "learning_rate": 6.541493387196496e-06, + "loss": 0.426, + "step": 4255 + }, + { + "epoch": 2.300955127049919, + "grad_norm": 0.3210577666759491, + "learning_rate": 6.539697265054348e-06, + "loss": 0.3881, + "step": 4256 + }, + { + "epoch": 2.301495765002703, + "grad_norm": 0.28738880157470703, + "learning_rate": 6.537900923384956e-06, + "loss": 0.3882, + "step": 4257 + }, + { + "epoch": 2.3020364029554874, + "grad_norm": 0.3025340735912323, + "learning_rate": 6.536104362444439e-06, + "loss": 0.3794, + "step": 4258 + }, + { + "epoch": 2.3025770409082718, + "grad_norm": 0.32594531774520874, + "learning_rate": 6.534307582488946e-06, + "loss": 0.4204, + "step": 4259 + }, + { + "epoch": 2.303117678861056, + "grad_norm": 0.31777167320251465, + "learning_rate": 6.5325105837746604e-06, + "loss": 0.4193, + "step": 4260 + }, + { + "epoch": 2.3036583168138405, + "grad_norm": 0.3318474292755127, + "learning_rate": 6.5307133665577945e-06, + "loss": 0.4251, + "step": 4261 + }, + { + "epoch": 2.3041989547666244, + "grad_norm": 0.29136520624160767, + "learning_rate": 6.528915931094594e-06, + "loss": 0.3787, + "step": 4262 + }, + { + "epoch": 2.3047395927194088, + "grad_norm": 0.29807668924331665, + "learning_rate": 6.527118277641329e-06, + "loss": 0.4066, + "step": 4263 + }, + { + "epoch": 2.305280230672193, + "grad_norm": 0.301704466342926, + "learning_rate": 6.525320406454312e-06, + "loss": 0.4013, + "step": 4264 + }, + { + "epoch": 2.3058208686249775, + "grad_norm": 0.31239601969718933, + "learning_rate": 6.523522317789874e-06, + "loss": 0.4284, + "step": 4265 + }, + { + "epoch": 2.306361506577762, + "grad_norm": 0.2982839345932007, + "learning_rate": 6.521724011904387e-06, + "loss": 0.3791, + "step": 4266 + }, + { + "epoch": 2.306902144530546, + "grad_norm": 0.47317105531692505, + "learning_rate": 6.5199254890542496e-06, + "loss": 0.4585, + "step": 4267 + }, + { + "epoch": 2.3074427824833306, + "grad_norm": 0.3282627761363983, + "learning_rate": 6.518126749495894e-06, + "loss": 0.443, + "step": 4268 + }, + { + "epoch": 2.3079834204361145, + "grad_norm": 0.32441818714141846, + "learning_rate": 6.516327793485776e-06, + "loss": 0.4201, + "step": 4269 + }, + { + "epoch": 2.308524058388899, + "grad_norm": 0.2971220910549164, + "learning_rate": 6.514528621280391e-06, + "loss": 0.3927, + "step": 4270 + }, + { + "epoch": 2.309064696341683, + "grad_norm": 0.361585795879364, + "learning_rate": 6.512729233136262e-06, + "loss": 0.393, + "step": 4271 + }, + { + "epoch": 2.3096053342944676, + "grad_norm": 0.30100318789482117, + "learning_rate": 6.510929629309941e-06, + "loss": 0.4155, + "step": 4272 + }, + { + "epoch": 2.310145972247252, + "grad_norm": 0.29078614711761475, + "learning_rate": 6.509129810058014e-06, + "loss": 0.3956, + "step": 4273 + }, + { + "epoch": 2.310686610200036, + "grad_norm": 0.27215951681137085, + "learning_rate": 6.507329775637095e-06, + "loss": 0.343, + "step": 4274 + }, + { + "epoch": 2.31122724815282, + "grad_norm": 0.3316851258277893, + "learning_rate": 6.5055295263038286e-06, + "loss": 0.4285, + "step": 4275 + }, + { + "epoch": 2.3117678861056046, + "grad_norm": 0.3086435794830322, + "learning_rate": 6.503729062314893e-06, + "loss": 0.4265, + "step": 4276 + }, + { + "epoch": 2.312308524058389, + "grad_norm": 0.2726685106754303, + "learning_rate": 6.501928383926992e-06, + "loss": 0.3544, + "step": 4277 + }, + { + "epoch": 2.3128491620111733, + "grad_norm": 0.3564140498638153, + "learning_rate": 6.500127491396867e-06, + "loss": 0.448, + "step": 4278 + }, + { + "epoch": 2.3133897999639577, + "grad_norm": 0.3078831434249878, + "learning_rate": 6.4983263849812835e-06, + "loss": 0.3846, + "step": 4279 + }, + { + "epoch": 2.3139304379167416, + "grad_norm": 0.31032395362854004, + "learning_rate": 6.496525064937042e-06, + "loss": 0.4148, + "step": 4280 + }, + { + "epoch": 2.314471075869526, + "grad_norm": 0.318835586309433, + "learning_rate": 6.494723531520968e-06, + "loss": 0.4032, + "step": 4281 + }, + { + "epoch": 2.3150117138223103, + "grad_norm": 0.31365159153938293, + "learning_rate": 6.492921784989924e-06, + "loss": 0.3961, + "step": 4282 + }, + { + "epoch": 2.3155523517750947, + "grad_norm": 0.3446863889694214, + "learning_rate": 6.4911198256007994e-06, + "loss": 0.4357, + "step": 4283 + }, + { + "epoch": 2.316092989727879, + "grad_norm": 0.3149493336677551, + "learning_rate": 6.489317653610513e-06, + "loss": 0.4261, + "step": 4284 + }, + { + "epoch": 2.3166336276806634, + "grad_norm": 0.29682353138923645, + "learning_rate": 6.487515269276015e-06, + "loss": 0.3862, + "step": 4285 + }, + { + "epoch": 2.3171742656334473, + "grad_norm": 0.34198012948036194, + "learning_rate": 6.485712672854289e-06, + "loss": 0.4254, + "step": 4286 + }, + { + "epoch": 2.3177149035862317, + "grad_norm": 0.30188503861427307, + "learning_rate": 6.483909864602342e-06, + "loss": 0.4027, + "step": 4287 + }, + { + "epoch": 2.318255541539016, + "grad_norm": 0.29825958609580994, + "learning_rate": 6.482106844777219e-06, + "loss": 0.3976, + "step": 4288 + }, + { + "epoch": 2.3187961794918004, + "grad_norm": 0.2877810299396515, + "learning_rate": 6.480303613635986e-06, + "loss": 0.3549, + "step": 4289 + }, + { + "epoch": 2.3193368174445848, + "grad_norm": 0.3249087631702423, + "learning_rate": 6.478500171435751e-06, + "loss": 0.4514, + "step": 4290 + }, + { + "epoch": 2.3198774553973687, + "grad_norm": 0.31923142075538635, + "learning_rate": 6.476696518433641e-06, + "loss": 0.404, + "step": 4291 + }, + { + "epoch": 2.320418093350153, + "grad_norm": 0.32674330472946167, + "learning_rate": 6.474892654886819e-06, + "loss": 0.4099, + "step": 4292 + }, + { + "epoch": 2.3209587313029374, + "grad_norm": 0.3069779872894287, + "learning_rate": 6.473088581052476e-06, + "loss": 0.3957, + "step": 4293 + }, + { + "epoch": 2.3214993692557218, + "grad_norm": 0.30247175693511963, + "learning_rate": 6.471284297187834e-06, + "loss": 0.3873, + "step": 4294 + }, + { + "epoch": 2.322040007208506, + "grad_norm": 0.3422676920890808, + "learning_rate": 6.469479803550144e-06, + "loss": 0.3926, + "step": 4295 + }, + { + "epoch": 2.3225806451612905, + "grad_norm": 0.31129398941993713, + "learning_rate": 6.46767510039669e-06, + "loss": 0.4031, + "step": 4296 + }, + { + "epoch": 2.323121283114075, + "grad_norm": 0.3049183189868927, + "learning_rate": 6.46587018798478e-06, + "loss": 0.3974, + "step": 4297 + }, + { + "epoch": 2.3236619210668588, + "grad_norm": 0.31287881731987, + "learning_rate": 6.464065066571756e-06, + "loss": 0.3665, + "step": 4298 + }, + { + "epoch": 2.324202559019643, + "grad_norm": 0.3058696985244751, + "learning_rate": 6.46225973641499e-06, + "loss": 0.4005, + "step": 4299 + }, + { + "epoch": 2.3247431969724275, + "grad_norm": 0.2986745238304138, + "learning_rate": 6.460454197771881e-06, + "loss": 0.4264, + "step": 4300 + }, + { + "epoch": 2.325283834925212, + "grad_norm": 0.3037396967411041, + "learning_rate": 6.45864845089986e-06, + "loss": 0.3759, + "step": 4301 + }, + { + "epoch": 2.325824472877996, + "grad_norm": 0.322265088558197, + "learning_rate": 6.45684249605639e-06, + "loss": 0.3629, + "step": 4302 + }, + { + "epoch": 2.32636511083078, + "grad_norm": 0.3143858015537262, + "learning_rate": 6.455036333498956e-06, + "loss": 0.3923, + "step": 4303 + }, + { + "epoch": 2.3269057487835645, + "grad_norm": 0.34491589665412903, + "learning_rate": 6.453229963485081e-06, + "loss": 0.4167, + "step": 4304 + }, + { + "epoch": 2.327446386736349, + "grad_norm": 0.32867443561553955, + "learning_rate": 6.451423386272312e-06, + "loss": 0.4052, + "step": 4305 + }, + { + "epoch": 2.327987024689133, + "grad_norm": 0.306112676858902, + "learning_rate": 6.449616602118228e-06, + "loss": 0.3867, + "step": 4306 + }, + { + "epoch": 2.3285276626419176, + "grad_norm": 0.3475956618785858, + "learning_rate": 6.447809611280439e-06, + "loss": 0.4371, + "step": 4307 + }, + { + "epoch": 2.329068300594702, + "grad_norm": 0.3290245532989502, + "learning_rate": 6.446002414016579e-06, + "loss": 0.3924, + "step": 4308 + }, + { + "epoch": 2.329608938547486, + "grad_norm": 0.34356066584587097, + "learning_rate": 6.444195010584318e-06, + "loss": 0.423, + "step": 4309 + }, + { + "epoch": 2.33014957650027, + "grad_norm": 0.290546715259552, + "learning_rate": 6.442387401241349e-06, + "loss": 0.3812, + "step": 4310 + }, + { + "epoch": 2.3306902144530546, + "grad_norm": 0.30830860137939453, + "learning_rate": 6.4405795862454e-06, + "loss": 0.4228, + "step": 4311 + }, + { + "epoch": 2.331230852405839, + "grad_norm": 0.3429715037345886, + "learning_rate": 6.438771565854226e-06, + "loss": 0.4478, + "step": 4312 + }, + { + "epoch": 2.3317714903586233, + "grad_norm": 0.2854730486869812, + "learning_rate": 6.436963340325611e-06, + "loss": 0.39, + "step": 4313 + }, + { + "epoch": 2.3323121283114077, + "grad_norm": 0.3231770396232605, + "learning_rate": 6.4351549099173685e-06, + "loss": 0.4313, + "step": 4314 + }, + { + "epoch": 2.3328527662641916, + "grad_norm": 0.31456172466278076, + "learning_rate": 6.433346274887341e-06, + "loss": 0.394, + "step": 4315 + }, + { + "epoch": 2.333393404216976, + "grad_norm": 0.3068475127220154, + "learning_rate": 6.4315374354934e-06, + "loss": 0.3842, + "step": 4316 + }, + { + "epoch": 2.3339340421697603, + "grad_norm": 0.3269546329975128, + "learning_rate": 6.429728391993446e-06, + "loss": 0.4229, + "step": 4317 + }, + { + "epoch": 2.3344746801225447, + "grad_norm": 0.3204357326030731, + "learning_rate": 6.427919144645411e-06, + "loss": 0.3981, + "step": 4318 + }, + { + "epoch": 2.335015318075329, + "grad_norm": 0.2805817127227783, + "learning_rate": 6.426109693707254e-06, + "loss": 0.3681, + "step": 4319 + }, + { + "epoch": 2.335555956028113, + "grad_norm": 0.3602171540260315, + "learning_rate": 6.4243000394369626e-06, + "loss": 0.4511, + "step": 4320 + }, + { + "epoch": 2.3360965939808973, + "grad_norm": 0.30108556151390076, + "learning_rate": 6.4224901820925545e-06, + "loss": 0.4065, + "step": 4321 + }, + { + "epoch": 2.3366372319336817, + "grad_norm": 0.30365124344825745, + "learning_rate": 6.420680121932074e-06, + "loss": 0.3956, + "step": 4322 + }, + { + "epoch": 2.337177869886466, + "grad_norm": 0.3073485195636749, + "learning_rate": 6.418869859213598e-06, + "loss": 0.4045, + "step": 4323 + }, + { + "epoch": 2.3377185078392504, + "grad_norm": 0.31126657128334045, + "learning_rate": 6.417059394195228e-06, + "loss": 0.3786, + "step": 4324 + }, + { + "epoch": 2.3382591457920348, + "grad_norm": 0.34042781591415405, + "learning_rate": 6.415248727135103e-06, + "loss": 0.4239, + "step": 4325 + }, + { + "epoch": 2.338799783744819, + "grad_norm": 0.3334735631942749, + "learning_rate": 6.413437858291378e-06, + "loss": 0.404, + "step": 4326 + }, + { + "epoch": 2.339340421697603, + "grad_norm": 0.3195256292819977, + "learning_rate": 6.411626787922247e-06, + "loss": 0.379, + "step": 4327 + }, + { + "epoch": 2.3398810596503874, + "grad_norm": 0.3124770224094391, + "learning_rate": 6.409815516285927e-06, + "loss": 0.3759, + "step": 4328 + }, + { + "epoch": 2.3404216976031718, + "grad_norm": 0.3149203062057495, + "learning_rate": 6.408004043640667e-06, + "loss": 0.4137, + "step": 4329 + }, + { + "epoch": 2.340962335555956, + "grad_norm": 0.29894107580184937, + "learning_rate": 6.406192370244742e-06, + "loss": 0.4012, + "step": 4330 + }, + { + "epoch": 2.3415029735087405, + "grad_norm": 0.3471241295337677, + "learning_rate": 6.4043804963564616e-06, + "loss": 0.418, + "step": 4331 + }, + { + "epoch": 2.3420436114615244, + "grad_norm": 0.34101614356040955, + "learning_rate": 6.402568422234154e-06, + "loss": 0.4392, + "step": 4332 + }, + { + "epoch": 2.3425842494143088, + "grad_norm": 0.29668599367141724, + "learning_rate": 6.400756148136185e-06, + "loss": 0.3709, + "step": 4333 + }, + { + "epoch": 2.343124887367093, + "grad_norm": 0.28744280338287354, + "learning_rate": 6.398943674320942e-06, + "loss": 0.3881, + "step": 4334 + }, + { + "epoch": 2.3436655253198775, + "grad_norm": 0.29911768436431885, + "learning_rate": 6.397131001046849e-06, + "loss": 0.4208, + "step": 4335 + }, + { + "epoch": 2.344206163272662, + "grad_norm": 0.3078124225139618, + "learning_rate": 6.39531812857235e-06, + "loss": 0.3949, + "step": 4336 + }, + { + "epoch": 2.344746801225446, + "grad_norm": 0.3062511682510376, + "learning_rate": 6.393505057155922e-06, + "loss": 0.4346, + "step": 4337 + }, + { + "epoch": 2.34528743917823, + "grad_norm": 0.28572142124176025, + "learning_rate": 6.3916917870560695e-06, + "loss": 0.3935, + "step": 4338 + }, + { + "epoch": 2.3458280771310145, + "grad_norm": 0.3043213188648224, + "learning_rate": 6.389878318531325e-06, + "loss": 0.3959, + "step": 4339 + }, + { + "epoch": 2.346368715083799, + "grad_norm": 0.3265484571456909, + "learning_rate": 6.38806465184025e-06, + "loss": 0.4122, + "step": 4340 + }, + { + "epoch": 2.346909353036583, + "grad_norm": 0.3467870354652405, + "learning_rate": 6.3862507872414345e-06, + "loss": 0.4254, + "step": 4341 + }, + { + "epoch": 2.3474499909893676, + "grad_norm": 0.3298346996307373, + "learning_rate": 6.384436724993494e-06, + "loss": 0.4301, + "step": 4342 + }, + { + "epoch": 2.347990628942152, + "grad_norm": 0.3124622106552124, + "learning_rate": 6.382622465355077e-06, + "loss": 0.3696, + "step": 4343 + }, + { + "epoch": 2.348531266894936, + "grad_norm": 0.4205555021762848, + "learning_rate": 6.3808080085848544e-06, + "loss": 0.4354, + "step": 4344 + }, + { + "epoch": 2.34907190484772, + "grad_norm": 0.3335787057876587, + "learning_rate": 6.378993354941529e-06, + "loss": 0.4019, + "step": 4345 + }, + { + "epoch": 2.3496125428005046, + "grad_norm": 0.346219539642334, + "learning_rate": 6.377178504683832e-06, + "loss": 0.4124, + "step": 4346 + }, + { + "epoch": 2.350153180753289, + "grad_norm": 0.34190288186073303, + "learning_rate": 6.3753634580705225e-06, + "loss": 0.4029, + "step": 4347 + }, + { + "epoch": 2.3506938187060733, + "grad_norm": 0.34621718525886536, + "learning_rate": 6.373548215360382e-06, + "loss": 0.3996, + "step": 4348 + }, + { + "epoch": 2.351234456658857, + "grad_norm": 0.3342035412788391, + "learning_rate": 6.37173277681223e-06, + "loss": 0.4408, + "step": 4349 + }, + { + "epoch": 2.3517750946116416, + "grad_norm": 0.30481693148612976, + "learning_rate": 6.3699171426849036e-06, + "loss": 0.4001, + "step": 4350 + }, + { + "epoch": 2.352315732564426, + "grad_norm": 0.3469946086406708, + "learning_rate": 6.368101313237276e-06, + "loss": 0.4217, + "step": 4351 + }, + { + "epoch": 2.3528563705172103, + "grad_norm": 0.31573811173439026, + "learning_rate": 6.366285288728242e-06, + "loss": 0.3654, + "step": 4352 + }, + { + "epoch": 2.3533970084699947, + "grad_norm": 0.32064908742904663, + "learning_rate": 6.364469069416731e-06, + "loss": 0.4058, + "step": 4353 + }, + { + "epoch": 2.353937646422779, + "grad_norm": 0.310427188873291, + "learning_rate": 6.362652655561693e-06, + "loss": 0.3917, + "step": 4354 + }, + { + "epoch": 2.3544782843755634, + "grad_norm": 0.3326384127140045, + "learning_rate": 6.3608360474221106e-06, + "loss": 0.462, + "step": 4355 + }, + { + "epoch": 2.3550189223283473, + "grad_norm": 0.29486921429634094, + "learning_rate": 6.359019245256992e-06, + "loss": 0.3774, + "step": 4356 + }, + { + "epoch": 2.3555595602811317, + "grad_norm": 0.36760416626930237, + "learning_rate": 6.3572022493253715e-06, + "loss": 0.4397, + "step": 4357 + }, + { + "epoch": 2.356100198233916, + "grad_norm": 0.3134630024433136, + "learning_rate": 6.355385059886316e-06, + "loss": 0.4056, + "step": 4358 + }, + { + "epoch": 2.3566408361867004, + "grad_norm": 0.31693926453590393, + "learning_rate": 6.353567677198917e-06, + "loss": 0.3867, + "step": 4359 + }, + { + "epoch": 2.3571814741394848, + "grad_norm": 0.3252623975276947, + "learning_rate": 6.3517501015222924e-06, + "loss": 0.3738, + "step": 4360 + }, + { + "epoch": 2.3577221120922687, + "grad_norm": 0.294716477394104, + "learning_rate": 6.349932333115591e-06, + "loss": 0.4094, + "step": 4361 + }, + { + "epoch": 2.358262750045053, + "grad_norm": 0.28273141384124756, + "learning_rate": 6.348114372237983e-06, + "loss": 0.3618, + "step": 4362 + }, + { + "epoch": 2.3588033879978374, + "grad_norm": 0.31758493185043335, + "learning_rate": 6.346296219148671e-06, + "loss": 0.407, + "step": 4363 + }, + { + "epoch": 2.3593440259506218, + "grad_norm": 0.29638582468032837, + "learning_rate": 6.344477874106887e-06, + "loss": 0.3816, + "step": 4364 + }, + { + "epoch": 2.359884663903406, + "grad_norm": 0.3291521370410919, + "learning_rate": 6.342659337371884e-06, + "loss": 0.4152, + "step": 4365 + }, + { + "epoch": 2.3604253018561905, + "grad_norm": 0.3485001027584076, + "learning_rate": 6.340840609202949e-06, + "loss": 0.4198, + "step": 4366 + }, + { + "epoch": 2.3609659398089744, + "grad_norm": 0.29301851987838745, + "learning_rate": 6.33902168985939e-06, + "loss": 0.4179, + "step": 4367 + }, + { + "epoch": 2.3615065777617588, + "grad_norm": 0.3409198820590973, + "learning_rate": 6.337202579600546e-06, + "loss": 0.4069, + "step": 4368 + }, + { + "epoch": 2.362047215714543, + "grad_norm": 0.28736093640327454, + "learning_rate": 6.3353832786857825e-06, + "loss": 0.3684, + "step": 4369 + }, + { + "epoch": 2.3625878536673275, + "grad_norm": 0.3112078309059143, + "learning_rate": 6.333563787374493e-06, + "loss": 0.4161, + "step": 4370 + }, + { + "epoch": 2.363128491620112, + "grad_norm": 0.3426651060581207, + "learning_rate": 6.331744105926095e-06, + "loss": 0.4033, + "step": 4371 + }, + { + "epoch": 2.363669129572896, + "grad_norm": 0.31945520639419556, + "learning_rate": 6.32992423460004e-06, + "loss": 0.3956, + "step": 4372 + }, + { + "epoch": 2.36420976752568, + "grad_norm": 0.29475951194763184, + "learning_rate": 6.328104173655797e-06, + "loss": 0.3612, + "step": 4373 + }, + { + "epoch": 2.3647504054784645, + "grad_norm": 0.3905600309371948, + "learning_rate": 6.326283923352868e-06, + "loss": 0.433, + "step": 4374 + }, + { + "epoch": 2.365291043431249, + "grad_norm": 0.29405343532562256, + "learning_rate": 6.3244634839507834e-06, + "loss": 0.3981, + "step": 4375 + }, + { + "epoch": 2.365831681384033, + "grad_norm": 0.31816449761390686, + "learning_rate": 6.3226428557090966e-06, + "loss": 0.4358, + "step": 4376 + }, + { + "epoch": 2.3663723193368176, + "grad_norm": 0.31002146005630493, + "learning_rate": 6.320822038887388e-06, + "loss": 0.3585, + "step": 4377 + }, + { + "epoch": 2.3669129572896015, + "grad_norm": 0.31022676825523376, + "learning_rate": 6.319001033745271e-06, + "loss": 0.3939, + "step": 4378 + }, + { + "epoch": 2.367453595242386, + "grad_norm": 0.33718812465667725, + "learning_rate": 6.3171798405423755e-06, + "loss": 0.4338, + "step": 4379 + }, + { + "epoch": 2.36799423319517, + "grad_norm": 0.3083488345146179, + "learning_rate": 6.315358459538367e-06, + "loss": 0.3672, + "step": 4380 + }, + { + "epoch": 2.3685348711479546, + "grad_norm": 0.2915855944156647, + "learning_rate": 6.313536890992935e-06, + "loss": 0.372, + "step": 4381 + }, + { + "epoch": 2.369075509100739, + "grad_norm": 0.28754106163978577, + "learning_rate": 6.3117151351657944e-06, + "loss": 0.3773, + "step": 4382 + }, + { + "epoch": 2.3696161470535233, + "grad_norm": 0.358699768781662, + "learning_rate": 6.309893192316687e-06, + "loss": 0.4501, + "step": 4383 + }, + { + "epoch": 2.3701567850063077, + "grad_norm": 0.3254886865615845, + "learning_rate": 6.308071062705385e-06, + "loss": 0.417, + "step": 4384 + }, + { + "epoch": 2.3706974229590916, + "grad_norm": 0.3160276412963867, + "learning_rate": 6.3062487465916825e-06, + "loss": 0.4035, + "step": 4385 + }, + { + "epoch": 2.371238060911876, + "grad_norm": 0.2895281910896301, + "learning_rate": 6.304426244235401e-06, + "loss": 0.356, + "step": 4386 + }, + { + "epoch": 2.3717786988646603, + "grad_norm": 0.32785364985466003, + "learning_rate": 6.30260355589639e-06, + "loss": 0.441, + "step": 4387 + }, + { + "epoch": 2.3723193368174447, + "grad_norm": 0.31930238008499146, + "learning_rate": 6.300780681834529e-06, + "loss": 0.409, + "step": 4388 + }, + { + "epoch": 2.372859974770229, + "grad_norm": 0.31189289689064026, + "learning_rate": 6.298957622309713e-06, + "loss": 0.3836, + "step": 4389 + }, + { + "epoch": 2.373400612723013, + "grad_norm": 0.3160669505596161, + "learning_rate": 6.297134377581877e-06, + "loss": 0.3918, + "step": 4390 + }, + { + "epoch": 2.3739412506757973, + "grad_norm": 0.27443766593933105, + "learning_rate": 6.295310947910972e-06, + "loss": 0.3779, + "step": 4391 + }, + { + "epoch": 2.3744818886285817, + "grad_norm": 0.3024101257324219, + "learning_rate": 6.2934873335569806e-06, + "loss": 0.3866, + "step": 4392 + }, + { + "epoch": 2.375022526581366, + "grad_norm": 0.3389159142971039, + "learning_rate": 6.29166353477991e-06, + "loss": 0.4335, + "step": 4393 + }, + { + "epoch": 2.3755631645341504, + "grad_norm": 0.31065016984939575, + "learning_rate": 6.289839551839796e-06, + "loss": 0.3878, + "step": 4394 + }, + { + "epoch": 2.3761038024869348, + "grad_norm": 0.3136639893054962, + "learning_rate": 6.2880153849966966e-06, + "loss": 0.427, + "step": 4395 + }, + { + "epoch": 2.3766444404397187, + "grad_norm": 0.30515700578689575, + "learning_rate": 6.2861910345107e-06, + "loss": 0.4246, + "step": 4396 + }, + { + "epoch": 2.377185078392503, + "grad_norm": 0.33984237909317017, + "learning_rate": 6.284366500641914e-06, + "loss": 0.4069, + "step": 4397 + }, + { + "epoch": 2.3777257163452874, + "grad_norm": 0.3370453417301178, + "learning_rate": 6.282541783650486e-06, + "loss": 0.4111, + "step": 4398 + }, + { + "epoch": 2.3782663542980718, + "grad_norm": 0.3021021783351898, + "learning_rate": 6.280716883796573e-06, + "loss": 0.351, + "step": 4399 + }, + { + "epoch": 2.378806992250856, + "grad_norm": 0.34611067175865173, + "learning_rate": 6.2788918013403695e-06, + "loss": 0.4108, + "step": 4400 + }, + { + "epoch": 2.3793476302036405, + "grad_norm": 0.2943975329399109, + "learning_rate": 6.277066536542091e-06, + "loss": 0.4119, + "step": 4401 + }, + { + "epoch": 2.3798882681564244, + "grad_norm": 0.35113584995269775, + "learning_rate": 6.275241089661982e-06, + "loss": 0.4489, + "step": 4402 + }, + { + "epoch": 2.3804289061092088, + "grad_norm": 0.3373362421989441, + "learning_rate": 6.273415460960309e-06, + "loss": 0.3663, + "step": 4403 + }, + { + "epoch": 2.380969544061993, + "grad_norm": 0.32804322242736816, + "learning_rate": 6.271589650697371e-06, + "loss": 0.4132, + "step": 4404 + }, + { + "epoch": 2.3815101820147775, + "grad_norm": 0.31782838702201843, + "learning_rate": 6.269763659133486e-06, + "loss": 0.4333, + "step": 4405 + }, + { + "epoch": 2.382050819967562, + "grad_norm": 0.3200927674770355, + "learning_rate": 6.267937486528999e-06, + "loss": 0.3873, + "step": 4406 + }, + { + "epoch": 2.3825914579203458, + "grad_norm": 0.3715694844722748, + "learning_rate": 6.266111133144285e-06, + "loss": 0.4296, + "step": 4407 + }, + { + "epoch": 2.38313209587313, + "grad_norm": 0.283189058303833, + "learning_rate": 6.264284599239741e-06, + "loss": 0.3674, + "step": 4408 + }, + { + "epoch": 2.3836727338259145, + "grad_norm": 0.34319746494293213, + "learning_rate": 6.26245788507579e-06, + "loss": 0.4131, + "step": 4409 + }, + { + "epoch": 2.384213371778699, + "grad_norm": 0.32422542572021484, + "learning_rate": 6.2606309909128845e-06, + "loss": 0.3834, + "step": 4410 + }, + { + "epoch": 2.384754009731483, + "grad_norm": 0.29863274097442627, + "learning_rate": 6.258803917011497e-06, + "loss": 0.3962, + "step": 4411 + }, + { + "epoch": 2.3852946476842676, + "grad_norm": 0.3698817193508148, + "learning_rate": 6.256976663632131e-06, + "loss": 0.4527, + "step": 4412 + }, + { + "epoch": 2.385835285637052, + "grad_norm": 0.278394877910614, + "learning_rate": 6.2551492310353094e-06, + "loss": 0.3525, + "step": 4413 + }, + { + "epoch": 2.386375923589836, + "grad_norm": 0.2943514287471771, + "learning_rate": 6.253321619481586e-06, + "loss": 0.3689, + "step": 4414 + }, + { + "epoch": 2.38691656154262, + "grad_norm": 0.3604004681110382, + "learning_rate": 6.251493829231539e-06, + "loss": 0.4397, + "step": 4415 + }, + { + "epoch": 2.3874571994954046, + "grad_norm": 0.296670526266098, + "learning_rate": 6.249665860545773e-06, + "loss": 0.3612, + "step": 4416 + }, + { + "epoch": 2.387997837448189, + "grad_norm": 0.3186405897140503, + "learning_rate": 6.247837713684911e-06, + "loss": 0.4183, + "step": 4417 + }, + { + "epoch": 2.3885384754009733, + "grad_norm": 0.3029390275478363, + "learning_rate": 6.246009388909613e-06, + "loss": 0.3691, + "step": 4418 + }, + { + "epoch": 2.3890791133537572, + "grad_norm": 0.33948981761932373, + "learning_rate": 6.244180886480555e-06, + "loss": 0.4404, + "step": 4419 + }, + { + "epoch": 2.3896197513065416, + "grad_norm": 0.31117311120033264, + "learning_rate": 6.24235220665844e-06, + "loss": 0.3941, + "step": 4420 + }, + { + "epoch": 2.390160389259326, + "grad_norm": 0.30091291666030884, + "learning_rate": 6.240523349704002e-06, + "loss": 0.3869, + "step": 4421 + }, + { + "epoch": 2.3907010272121103, + "grad_norm": 0.3182219862937927, + "learning_rate": 6.238694315877994e-06, + "loss": 0.4322, + "step": 4422 + }, + { + "epoch": 2.3912416651648947, + "grad_norm": 0.2877960801124573, + "learning_rate": 6.236865105441194e-06, + "loss": 0.3592, + "step": 4423 + }, + { + "epoch": 2.391782303117679, + "grad_norm": 0.3085620105266571, + "learning_rate": 6.235035718654413e-06, + "loss": 0.4263, + "step": 4424 + }, + { + "epoch": 2.3923229410704634, + "grad_norm": 0.3202997148036957, + "learning_rate": 6.233206155778476e-06, + "loss": 0.4291, + "step": 4425 + }, + { + "epoch": 2.3928635790232473, + "grad_norm": 0.3051954209804535, + "learning_rate": 6.231376417074243e-06, + "loss": 0.387, + "step": 4426 + }, + { + "epoch": 2.3934042169760317, + "grad_norm": 0.3193745017051697, + "learning_rate": 6.229546502802591e-06, + "loss": 0.4108, + "step": 4427 + }, + { + "epoch": 2.393944854928816, + "grad_norm": 0.31878572702407837, + "learning_rate": 6.2277164132244305e-06, + "loss": 0.4014, + "step": 4428 + }, + { + "epoch": 2.3944854928816004, + "grad_norm": 0.2973358631134033, + "learning_rate": 6.225886148600688e-06, + "loss": 0.389, + "step": 4429 + }, + { + "epoch": 2.3950261308343848, + "grad_norm": 0.2974739670753479, + "learning_rate": 6.224055709192323e-06, + "loss": 0.454, + "step": 4430 + }, + { + "epoch": 2.3955667687871687, + "grad_norm": 0.31804347038269043, + "learning_rate": 6.222225095260311e-06, + "loss": 0.4335, + "step": 4431 + }, + { + "epoch": 2.396107406739953, + "grad_norm": 0.2907731533050537, + "learning_rate": 6.220394307065665e-06, + "loss": 0.3707, + "step": 4432 + }, + { + "epoch": 2.3966480446927374, + "grad_norm": 0.2965898811817169, + "learning_rate": 6.218563344869408e-06, + "loss": 0.3832, + "step": 4433 + }, + { + "epoch": 2.3971886826455218, + "grad_norm": 0.3112434148788452, + "learning_rate": 6.216732208932601e-06, + "loss": 0.457, + "step": 4434 + }, + { + "epoch": 2.397729320598306, + "grad_norm": 0.28128254413604736, + "learning_rate": 6.21490089951632e-06, + "loss": 0.3674, + "step": 4435 + }, + { + "epoch": 2.39826995855109, + "grad_norm": 0.33449041843414307, + "learning_rate": 6.213069416881672e-06, + "loss": 0.4326, + "step": 4436 + }, + { + "epoch": 2.3988105965038744, + "grad_norm": 0.27020829916000366, + "learning_rate": 6.211237761289787e-06, + "loss": 0.3616, + "step": 4437 + }, + { + "epoch": 2.3993512344566588, + "grad_norm": 0.3244174122810364, + "learning_rate": 6.2094059330018165e-06, + "loss": 0.384, + "step": 4438 + }, + { + "epoch": 2.399891872409443, + "grad_norm": 0.3321855068206787, + "learning_rate": 6.207573932278943e-06, + "loss": 0.4299, + "step": 4439 + }, + { + "epoch": 2.4004325103622275, + "grad_norm": 0.30754154920578003, + "learning_rate": 6.205741759382365e-06, + "loss": 0.388, + "step": 4440 + }, + { + "epoch": 2.400973148315012, + "grad_norm": 0.321622759103775, + "learning_rate": 6.203909414573316e-06, + "loss": 0.3919, + "step": 4441 + }, + { + "epoch": 2.401513786267796, + "grad_norm": 0.3470320701599121, + "learning_rate": 6.202076898113043e-06, + "loss": 0.4141, + "step": 4442 + }, + { + "epoch": 2.40205442422058, + "grad_norm": 0.30662572383880615, + "learning_rate": 6.200244210262827e-06, + "loss": 0.3852, + "step": 4443 + }, + { + "epoch": 2.4025950621733645, + "grad_norm": 0.34137198328971863, + "learning_rate": 6.198411351283966e-06, + "loss": 0.4033, + "step": 4444 + }, + { + "epoch": 2.403135700126149, + "grad_norm": 0.34139108657836914, + "learning_rate": 6.1965783214377895e-06, + "loss": 0.4092, + "step": 4445 + }, + { + "epoch": 2.403676338078933, + "grad_norm": 0.35481759905815125, + "learning_rate": 6.194745120985644e-06, + "loss": 0.4397, + "step": 4446 + }, + { + "epoch": 2.4042169760317176, + "grad_norm": 0.3117373585700989, + "learning_rate": 6.192911750188907e-06, + "loss": 0.3776, + "step": 4447 + }, + { + "epoch": 2.4047576139845015, + "grad_norm": 0.35322365164756775, + "learning_rate": 6.191078209308974e-06, + "loss": 0.4319, + "step": 4448 + }, + { + "epoch": 2.405298251937286, + "grad_norm": 0.31928256154060364, + "learning_rate": 6.1892444986072695e-06, + "loss": 0.3733, + "step": 4449 + }, + { + "epoch": 2.40583888989007, + "grad_norm": 0.3453699052333832, + "learning_rate": 6.187410618345241e-06, + "loss": 0.4663, + "step": 4450 + }, + { + "epoch": 2.4063795278428546, + "grad_norm": 0.28178903460502625, + "learning_rate": 6.1855765687843595e-06, + "loss": 0.3854, + "step": 4451 + }, + { + "epoch": 2.406920165795639, + "grad_norm": 0.37080490589141846, + "learning_rate": 6.1837423501861205e-06, + "loss": 0.4157, + "step": 4452 + }, + { + "epoch": 2.4074608037484233, + "grad_norm": 0.3195529282093048, + "learning_rate": 6.181907962812044e-06, + "loss": 0.385, + "step": 4453 + }, + { + "epoch": 2.4080014417012077, + "grad_norm": 0.3060106337070465, + "learning_rate": 6.180073406923672e-06, + "loss": 0.391, + "step": 4454 + }, + { + "epoch": 2.4085420796539916, + "grad_norm": 0.37391820549964905, + "learning_rate": 6.178238682782574e-06, + "loss": 0.4299, + "step": 4455 + }, + { + "epoch": 2.409082717606776, + "grad_norm": 0.32717540860176086, + "learning_rate": 6.1764037906503395e-06, + "loss": 0.4189, + "step": 4456 + }, + { + "epoch": 2.4096233555595603, + "grad_norm": 0.28247183561325073, + "learning_rate": 6.174568730788587e-06, + "loss": 0.3802, + "step": 4457 + }, + { + "epoch": 2.4101639935123447, + "grad_norm": 0.32408735156059265, + "learning_rate": 6.172733503458954e-06, + "loss": 0.4352, + "step": 4458 + }, + { + "epoch": 2.410704631465129, + "grad_norm": 0.3047981858253479, + "learning_rate": 6.170898108923105e-06, + "loss": 0.3846, + "step": 4459 + }, + { + "epoch": 2.411245269417913, + "grad_norm": 0.2936648726463318, + "learning_rate": 6.169062547442724e-06, + "loss": 0.4196, + "step": 4460 + }, + { + "epoch": 2.4117859073706973, + "grad_norm": 0.3075091242790222, + "learning_rate": 6.1672268192795285e-06, + "loss": 0.4279, + "step": 4461 + }, + { + "epoch": 2.4123265453234817, + "grad_norm": 0.3028499186038971, + "learning_rate": 6.165390924695247e-06, + "loss": 0.4138, + "step": 4462 + }, + { + "epoch": 2.412867183276266, + "grad_norm": 0.30673113465309143, + "learning_rate": 6.1635548639516415e-06, + "loss": 0.396, + "step": 4463 + }, + { + "epoch": 2.4134078212290504, + "grad_norm": 0.31158581376075745, + "learning_rate": 6.161718637310492e-06, + "loss": 0.4055, + "step": 4464 + }, + { + "epoch": 2.4139484591818343, + "grad_norm": 0.3450443148612976, + "learning_rate": 6.159882245033606e-06, + "loss": 0.3807, + "step": 4465 + }, + { + "epoch": 2.4144890971346187, + "grad_norm": 0.3178133964538574, + "learning_rate": 6.158045687382812e-06, + "loss": 0.4181, + "step": 4466 + }, + { + "epoch": 2.415029735087403, + "grad_norm": 0.30661630630493164, + "learning_rate": 6.156208964619965e-06, + "loss": 0.3869, + "step": 4467 + }, + { + "epoch": 2.4155703730401874, + "grad_norm": 0.31599074602127075, + "learning_rate": 6.154372077006939e-06, + "loss": 0.4319, + "step": 4468 + }, + { + "epoch": 2.4161110109929718, + "grad_norm": 0.29334557056427, + "learning_rate": 6.152535024805637e-06, + "loss": 0.4171, + "step": 4469 + }, + { + "epoch": 2.416651648945756, + "grad_norm": 0.2875959277153015, + "learning_rate": 6.150697808277979e-06, + "loss": 0.3593, + "step": 4470 + }, + { + "epoch": 2.4171922868985405, + "grad_norm": 0.3488504886627197, + "learning_rate": 6.148860427685914e-06, + "loss": 0.4441, + "step": 4471 + }, + { + "epoch": 2.4177329248513244, + "grad_norm": 0.30295583605766296, + "learning_rate": 6.147022883291412e-06, + "loss": 0.3676, + "step": 4472 + }, + { + "epoch": 2.4182735628041088, + "grad_norm": 0.3115604817867279, + "learning_rate": 6.145185175356468e-06, + "loss": 0.3938, + "step": 4473 + }, + { + "epoch": 2.418814200756893, + "grad_norm": 0.2997246980667114, + "learning_rate": 6.143347304143098e-06, + "loss": 0.3981, + "step": 4474 + }, + { + "epoch": 2.4193548387096775, + "grad_norm": 0.33742642402648926, + "learning_rate": 6.141509269913343e-06, + "loss": 0.4301, + "step": 4475 + }, + { + "epoch": 2.419895476662462, + "grad_norm": 0.31137093901634216, + "learning_rate": 6.139671072929264e-06, + "loss": 0.3888, + "step": 4476 + }, + { + "epoch": 2.4204361146152458, + "grad_norm": 0.29388248920440674, + "learning_rate": 6.13783271345295e-06, + "loss": 0.3868, + "step": 4477 + }, + { + "epoch": 2.42097675256803, + "grad_norm": 0.32453441619873047, + "learning_rate": 6.135994191746511e-06, + "loss": 0.4564, + "step": 4478 + }, + { + "epoch": 2.4215173905208145, + "grad_norm": 0.3015061318874359, + "learning_rate": 6.134155508072081e-06, + "loss": 0.3957, + "step": 4479 + }, + { + "epoch": 2.422058028473599, + "grad_norm": 0.2795189619064331, + "learning_rate": 6.132316662691815e-06, + "loss": 0.3913, + "step": 4480 + }, + { + "epoch": 2.422598666426383, + "grad_norm": 0.31015628576278687, + "learning_rate": 6.130477655867893e-06, + "loss": 0.4248, + "step": 4481 + }, + { + "epoch": 2.4231393043791676, + "grad_norm": 0.308458149433136, + "learning_rate": 6.128638487862514e-06, + "loss": 0.4203, + "step": 4482 + }, + { + "epoch": 2.423679942331952, + "grad_norm": 0.2929111421108246, + "learning_rate": 6.126799158937906e-06, + "loss": 0.4078, + "step": 4483 + }, + { + "epoch": 2.424220580284736, + "grad_norm": 0.3093794286251068, + "learning_rate": 6.124959669356319e-06, + "loss": 0.3899, + "step": 4484 + }, + { + "epoch": 2.42476121823752, + "grad_norm": 0.28093525767326355, + "learning_rate": 6.123120019380021e-06, + "loss": 0.397, + "step": 4485 + }, + { + "epoch": 2.4253018561903046, + "grad_norm": 0.2979103922843933, + "learning_rate": 6.121280209271306e-06, + "loss": 0.4463, + "step": 4486 + }, + { + "epoch": 2.425842494143089, + "grad_norm": 0.2875688076019287, + "learning_rate": 6.119440239292493e-06, + "loss": 0.3858, + "step": 4487 + }, + { + "epoch": 2.4263831320958733, + "grad_norm": 0.34129956364631653, + "learning_rate": 6.117600109705919e-06, + "loss": 0.4681, + "step": 4488 + }, + { + "epoch": 2.4269237700486572, + "grad_norm": 0.28495529294013977, + "learning_rate": 6.1157598207739496e-06, + "loss": 0.3874, + "step": 4489 + }, + { + "epoch": 2.4274644080014416, + "grad_norm": 0.3119986355304718, + "learning_rate": 6.1139193727589665e-06, + "loss": 0.386, + "step": 4490 + }, + { + "epoch": 2.428005045954226, + "grad_norm": 0.3121979534626007, + "learning_rate": 6.1120787659233805e-06, + "loss": 0.4054, + "step": 4491 + }, + { + "epoch": 2.4285456839070103, + "grad_norm": 0.29938867688179016, + "learning_rate": 6.110238000529619e-06, + "loss": 0.4111, + "step": 4492 + }, + { + "epoch": 2.4290863218597947, + "grad_norm": 0.3388979136943817, + "learning_rate": 6.108397076840137e-06, + "loss": 0.3868, + "step": 4493 + }, + { + "epoch": 2.429626959812579, + "grad_norm": 0.35237500071525574, + "learning_rate": 6.106555995117408e-06, + "loss": 0.4234, + "step": 4494 + }, + { + "epoch": 2.430167597765363, + "grad_norm": 0.3059629499912262, + "learning_rate": 6.1047147556239325e-06, + "loss": 0.3874, + "step": 4495 + }, + { + "epoch": 2.4307082357181473, + "grad_norm": 0.3108119070529938, + "learning_rate": 6.10287335862223e-06, + "loss": 0.4132, + "step": 4496 + }, + { + "epoch": 2.4312488736709317, + "grad_norm": 0.31125408411026, + "learning_rate": 6.101031804374845e-06, + "loss": 0.3836, + "step": 4497 + }, + { + "epoch": 2.431789511623716, + "grad_norm": 0.3597297668457031, + "learning_rate": 6.099190093144341e-06, + "loss": 0.4046, + "step": 4498 + }, + { + "epoch": 2.4323301495765004, + "grad_norm": 0.3167576789855957, + "learning_rate": 6.097348225193305e-06, + "loss": 0.4121, + "step": 4499 + }, + { + "epoch": 2.4328707875292848, + "grad_norm": 0.335818886756897, + "learning_rate": 6.095506200784349e-06, + "loss": 0.4143, + "step": 4500 + }, + { + "epoch": 2.4334114254820687, + "grad_norm": 0.3068419098854065, + "learning_rate": 6.093664020180106e-06, + "loss": 0.3896, + "step": 4501 + }, + { + "epoch": 2.433952063434853, + "grad_norm": 0.3358902037143707, + "learning_rate": 6.091821683643231e-06, + "loss": 0.423, + "step": 4502 + }, + { + "epoch": 2.4344927013876374, + "grad_norm": 0.3129465579986572, + "learning_rate": 6.089979191436398e-06, + "loss": 0.3915, + "step": 4503 + }, + { + "epoch": 2.4350333393404218, + "grad_norm": 0.3382420241832733, + "learning_rate": 6.088136543822309e-06, + "loss": 0.4193, + "step": 4504 + }, + { + "epoch": 2.435573977293206, + "grad_norm": 0.33931517601013184, + "learning_rate": 6.086293741063685e-06, + "loss": 0.4202, + "step": 4505 + }, + { + "epoch": 2.43611461524599, + "grad_norm": 0.31078824400901794, + "learning_rate": 6.084450783423268e-06, + "loss": 0.4075, + "step": 4506 + }, + { + "epoch": 2.4366552531987744, + "grad_norm": 0.28337806463241577, + "learning_rate": 6.082607671163823e-06, + "loss": 0.3763, + "step": 4507 + }, + { + "epoch": 2.4371958911515588, + "grad_norm": 0.3330620527267456, + "learning_rate": 6.0807644045481425e-06, + "loss": 0.3932, + "step": 4508 + }, + { + "epoch": 2.437736529104343, + "grad_norm": 0.2874024212360382, + "learning_rate": 6.078920983839032e-06, + "loss": 0.3703, + "step": 4509 + }, + { + "epoch": 2.4382771670571275, + "grad_norm": 0.3029569387435913, + "learning_rate": 6.077077409299323e-06, + "loss": 0.4456, + "step": 4510 + }, + { + "epoch": 2.438817805009912, + "grad_norm": 0.31840410828590393, + "learning_rate": 6.07523368119187e-06, + "loss": 0.4065, + "step": 4511 + }, + { + "epoch": 2.439358442962696, + "grad_norm": 0.29436418414115906, + "learning_rate": 6.073389799779547e-06, + "loss": 0.3617, + "step": 4512 + }, + { + "epoch": 2.43989908091548, + "grad_norm": 0.32618609070777893, + "learning_rate": 6.071545765325254e-06, + "loss": 0.4419, + "step": 4513 + }, + { + "epoch": 2.4404397188682645, + "grad_norm": 0.3147270977497101, + "learning_rate": 6.069701578091909e-06, + "loss": 0.4055, + "step": 4514 + }, + { + "epoch": 2.440980356821049, + "grad_norm": 0.35851117968559265, + "learning_rate": 6.067857238342451e-06, + "loss": 0.4027, + "step": 4515 + }, + { + "epoch": 2.441520994773833, + "grad_norm": 0.33268657326698303, + "learning_rate": 6.066012746339847e-06, + "loss": 0.3888, + "step": 4516 + }, + { + "epoch": 2.4420616327266176, + "grad_norm": 0.3391202688217163, + "learning_rate": 6.064168102347074e-06, + "loss": 0.453, + "step": 4517 + }, + { + "epoch": 2.4426022706794015, + "grad_norm": 0.3072931170463562, + "learning_rate": 6.062323306627146e-06, + "loss": 0.3971, + "step": 4518 + }, + { + "epoch": 2.443142908632186, + "grad_norm": 0.3158687651157379, + "learning_rate": 6.060478359443085e-06, + "loss": 0.4018, + "step": 4519 + }, + { + "epoch": 2.4436835465849702, + "grad_norm": 0.2785457968711853, + "learning_rate": 6.058633261057945e-06, + "loss": 0.4063, + "step": 4520 + }, + { + "epoch": 2.4442241845377546, + "grad_norm": 0.2931409776210785, + "learning_rate": 6.056788011734791e-06, + "loss": 0.3744, + "step": 4521 + }, + { + "epoch": 2.444764822490539, + "grad_norm": 0.33004578948020935, + "learning_rate": 6.0549426117367195e-06, + "loss": 0.4643, + "step": 4522 + }, + { + "epoch": 2.4453054604433233, + "grad_norm": 0.2701852023601532, + "learning_rate": 6.053097061326843e-06, + "loss": 0.3744, + "step": 4523 + }, + { + "epoch": 2.4458460983961072, + "grad_norm": 0.3357827961444855, + "learning_rate": 6.0512513607682976e-06, + "loss": 0.4313, + "step": 4524 + }, + { + "epoch": 2.4463867363488916, + "grad_norm": 0.31330615282058716, + "learning_rate": 6.049405510324237e-06, + "loss": 0.4108, + "step": 4525 + }, + { + "epoch": 2.446927374301676, + "grad_norm": 0.30435165762901306, + "learning_rate": 6.0475595102578455e-06, + "loss": 0.42, + "step": 4526 + }, + { + "epoch": 2.4474680122544603, + "grad_norm": 0.3326108753681183, + "learning_rate": 6.045713360832315e-06, + "loss": 0.403, + "step": 4527 + }, + { + "epoch": 2.4480086502072447, + "grad_norm": 0.30275386571884155, + "learning_rate": 6.04386706231087e-06, + "loss": 0.4118, + "step": 4528 + }, + { + "epoch": 2.448549288160029, + "grad_norm": 0.2818882465362549, + "learning_rate": 6.042020614956753e-06, + "loss": 0.3621, + "step": 4529 + }, + { + "epoch": 2.449089926112813, + "grad_norm": 0.3257218301296234, + "learning_rate": 6.040174019033226e-06, + "loss": 0.4251, + "step": 4530 + }, + { + "epoch": 2.4496305640655973, + "grad_norm": 0.293617844581604, + "learning_rate": 6.0383272748035724e-06, + "loss": 0.389, + "step": 4531 + }, + { + "epoch": 2.4501712020183817, + "grad_norm": 0.30719658732414246, + "learning_rate": 6.036480382531099e-06, + "loss": 0.4113, + "step": 4532 + }, + { + "epoch": 2.450711839971166, + "grad_norm": 0.3174991309642792, + "learning_rate": 6.0346333424791325e-06, + "loss": 0.3894, + "step": 4533 + }, + { + "epoch": 2.4512524779239504, + "grad_norm": 0.33805057406425476, + "learning_rate": 6.032786154911019e-06, + "loss": 0.392, + "step": 4534 + }, + { + "epoch": 2.4517931158767343, + "grad_norm": 0.295246422290802, + "learning_rate": 6.030938820090128e-06, + "loss": 0.3868, + "step": 4535 + }, + { + "epoch": 2.4523337538295187, + "grad_norm": 0.347849041223526, + "learning_rate": 6.02909133827985e-06, + "loss": 0.4459, + "step": 4536 + }, + { + "epoch": 2.452874391782303, + "grad_norm": 0.35072389245033264, + "learning_rate": 6.027243709743595e-06, + "loss": 0.4103, + "step": 4537 + }, + { + "epoch": 2.4534150297350874, + "grad_norm": 0.3043549954891205, + "learning_rate": 6.025395934744793e-06, + "loss": 0.4218, + "step": 4538 + }, + { + "epoch": 2.4539556676878718, + "grad_norm": 0.32718175649642944, + "learning_rate": 6.023548013546899e-06, + "loss": 0.4026, + "step": 4539 + }, + { + "epoch": 2.454496305640656, + "grad_norm": 0.3174917995929718, + "learning_rate": 6.021699946413384e-06, + "loss": 0.3996, + "step": 4540 + }, + { + "epoch": 2.4550369435934405, + "grad_norm": 0.2845597565174103, + "learning_rate": 6.019851733607744e-06, + "loss": 0.4009, + "step": 4541 + }, + { + "epoch": 2.4555775815462244, + "grad_norm": 0.3087173104286194, + "learning_rate": 6.018003375393493e-06, + "loss": 0.4119, + "step": 4542 + }, + { + "epoch": 2.4561182194990088, + "grad_norm": 0.3086446523666382, + "learning_rate": 6.016154872034167e-06, + "loss": 0.4218, + "step": 4543 + }, + { + "epoch": 2.456658857451793, + "grad_norm": 0.3066726326942444, + "learning_rate": 6.014306223793321e-06, + "loss": 0.3948, + "step": 4544 + }, + { + "epoch": 2.4571994954045775, + "grad_norm": 0.31927090883255005, + "learning_rate": 6.012457430934532e-06, + "loss": 0.4437, + "step": 4545 + }, + { + "epoch": 2.457740133357362, + "grad_norm": 0.27344468235969543, + "learning_rate": 6.010608493721399e-06, + "loss": 0.3903, + "step": 4546 + }, + { + "epoch": 2.4582807713101458, + "grad_norm": 0.30820268392562866, + "learning_rate": 6.008759412417539e-06, + "loss": 0.3931, + "step": 4547 + }, + { + "epoch": 2.45882140926293, + "grad_norm": 0.3438683748245239, + "learning_rate": 6.006910187286592e-06, + "loss": 0.4159, + "step": 4548 + }, + { + "epoch": 2.4593620472157145, + "grad_norm": 0.2932104766368866, + "learning_rate": 6.005060818592214e-06, + "loss": 0.3802, + "step": 4549 + }, + { + "epoch": 2.459902685168499, + "grad_norm": 0.3381466567516327, + "learning_rate": 6.003211306598089e-06, + "loss": 0.4422, + "step": 4550 + }, + { + "epoch": 2.460443323121283, + "grad_norm": 0.33015963435173035, + "learning_rate": 6.001361651567913e-06, + "loss": 0.4047, + "step": 4551 + }, + { + "epoch": 2.4609839610740676, + "grad_norm": 0.30845144391059875, + "learning_rate": 5.99951185376541e-06, + "loss": 0.4006, + "step": 4552 + }, + { + "epoch": 2.4615245990268515, + "grad_norm": 0.3066163957118988, + "learning_rate": 5.9976619134543175e-06, + "loss": 0.393, + "step": 4553 + }, + { + "epoch": 2.462065236979636, + "grad_norm": 0.2943369746208191, + "learning_rate": 5.995811830898399e-06, + "loss": 0.401, + "step": 4554 + }, + { + "epoch": 2.4626058749324202, + "grad_norm": 0.35713574290275574, + "learning_rate": 5.993961606361436e-06, + "loss": 0.3973, + "step": 4555 + }, + { + "epoch": 2.4631465128852046, + "grad_norm": 0.35209155082702637, + "learning_rate": 5.9921112401072275e-06, + "loss": 0.4029, + "step": 4556 + }, + { + "epoch": 2.463687150837989, + "grad_norm": 0.31026819348335266, + "learning_rate": 5.990260732399598e-06, + "loss": 0.4012, + "step": 4557 + }, + { + "epoch": 2.4642277887907733, + "grad_norm": 0.3545580804347992, + "learning_rate": 5.988410083502389e-06, + "loss": 0.3933, + "step": 4558 + }, + { + "epoch": 2.4647684267435572, + "grad_norm": 0.33118224143981934, + "learning_rate": 5.986559293679464e-06, + "loss": 0.3995, + "step": 4559 + }, + { + "epoch": 2.4653090646963416, + "grad_norm": 0.33403223752975464, + "learning_rate": 5.984708363194702e-06, + "loss": 0.4219, + "step": 4560 + }, + { + "epoch": 2.465849702649126, + "grad_norm": 0.30222567915916443, + "learning_rate": 5.982857292312007e-06, + "loss": 0.392, + "step": 4561 + }, + { + "epoch": 2.4663903406019103, + "grad_norm": 0.31281334161758423, + "learning_rate": 5.981006081295301e-06, + "loss": 0.3606, + "step": 4562 + }, + { + "epoch": 2.4669309785546947, + "grad_norm": 0.32246240973472595, + "learning_rate": 5.979154730408526e-06, + "loss": 0.442, + "step": 4563 + }, + { + "epoch": 2.4674716165074786, + "grad_norm": 0.3207295536994934, + "learning_rate": 5.977303239915646e-06, + "loss": 0.3983, + "step": 4564 + }, + { + "epoch": 2.468012254460263, + "grad_norm": 0.3525555431842804, + "learning_rate": 5.975451610080643e-06, + "loss": 0.4066, + "step": 4565 + }, + { + "epoch": 2.4685528924130473, + "grad_norm": 0.29853981733322144, + "learning_rate": 5.973599841167516e-06, + "loss": 0.411, + "step": 4566 + }, + { + "epoch": 2.4690935303658317, + "grad_norm": 0.3197988271713257, + "learning_rate": 5.97174793344029e-06, + "loss": 0.4275, + "step": 4567 + }, + { + "epoch": 2.469634168318616, + "grad_norm": 0.29205453395843506, + "learning_rate": 5.969895887163005e-06, + "loss": 0.3614, + "step": 4568 + }, + { + "epoch": 2.4701748062714004, + "grad_norm": 0.28719770908355713, + "learning_rate": 5.968043702599723e-06, + "loss": 0.3925, + "step": 4569 + }, + { + "epoch": 2.4707154442241848, + "grad_norm": 0.3598988652229309, + "learning_rate": 5.966191380014524e-06, + "loss": 0.4228, + "step": 4570 + }, + { + "epoch": 2.4712560821769687, + "grad_norm": 0.28883805871009827, + "learning_rate": 5.9643389196715125e-06, + "loss": 0.3829, + "step": 4571 + }, + { + "epoch": 2.471796720129753, + "grad_norm": 0.28611552715301514, + "learning_rate": 5.962486321834805e-06, + "loss": 0.3518, + "step": 4572 + }, + { + "epoch": 2.4723373580825374, + "grad_norm": 0.3418418765068054, + "learning_rate": 5.9606335867685424e-06, + "loss": 0.4148, + "step": 4573 + }, + { + "epoch": 2.4728779960353218, + "grad_norm": 0.31138092279434204, + "learning_rate": 5.958780714736886e-06, + "loss": 0.3593, + "step": 4574 + }, + { + "epoch": 2.473418633988106, + "grad_norm": 0.3250178098678589, + "learning_rate": 5.956927706004012e-06, + "loss": 0.392, + "step": 4575 + }, + { + "epoch": 2.47395927194089, + "grad_norm": 0.3134720027446747, + "learning_rate": 5.955074560834121e-06, + "loss": 0.4526, + "step": 4576 + }, + { + "epoch": 2.4744999098936744, + "grad_norm": 0.28503695130348206, + "learning_rate": 5.953221279491432e-06, + "loss": 0.3592, + "step": 4577 + }, + { + "epoch": 2.4750405478464588, + "grad_norm": 0.3523981273174286, + "learning_rate": 5.95136786224018e-06, + "loss": 0.4211, + "step": 4578 + }, + { + "epoch": 2.475581185799243, + "grad_norm": 0.31300053000450134, + "learning_rate": 5.949514309344624e-06, + "loss": 0.3936, + "step": 4579 + }, + { + "epoch": 2.4761218237520275, + "grad_norm": 0.30527982115745544, + "learning_rate": 5.947660621069038e-06, + "loss": 0.3958, + "step": 4580 + }, + { + "epoch": 2.476662461704812, + "grad_norm": 0.32108503580093384, + "learning_rate": 5.94580679767772e-06, + "loss": 0.4074, + "step": 4581 + }, + { + "epoch": 2.4772030996575958, + "grad_norm": 0.289093941450119, + "learning_rate": 5.9439528394349835e-06, + "loss": 0.3972, + "step": 4582 + }, + { + "epoch": 2.47774373761038, + "grad_norm": 0.29553407430648804, + "learning_rate": 5.942098746605164e-06, + "loss": 0.3958, + "step": 4583 + }, + { + "epoch": 2.4782843755631645, + "grad_norm": 0.32005220651626587, + "learning_rate": 5.940244519452612e-06, + "loss": 0.4394, + "step": 4584 + }, + { + "epoch": 2.478825013515949, + "grad_norm": 0.27880796790122986, + "learning_rate": 5.938390158241701e-06, + "loss": 0.3645, + "step": 4585 + }, + { + "epoch": 2.4793656514687332, + "grad_norm": 0.3102542459964752, + "learning_rate": 5.936535663236822e-06, + "loss": 0.3932, + "step": 4586 + }, + { + "epoch": 2.4799062894215176, + "grad_norm": 0.31755635142326355, + "learning_rate": 5.934681034702387e-06, + "loss": 0.4033, + "step": 4587 + }, + { + "epoch": 2.4804469273743015, + "grad_norm": 0.341374009847641, + "learning_rate": 5.932826272902825e-06, + "loss": 0.3751, + "step": 4588 + }, + { + "epoch": 2.480987565327086, + "grad_norm": 0.3454676866531372, + "learning_rate": 5.930971378102585e-06, + "loss": 0.4437, + "step": 4589 + }, + { + "epoch": 2.4815282032798702, + "grad_norm": 0.3300340175628662, + "learning_rate": 5.929116350566132e-06, + "loss": 0.4481, + "step": 4590 + }, + { + "epoch": 2.4820688412326546, + "grad_norm": 0.3258576989173889, + "learning_rate": 5.927261190557955e-06, + "loss": 0.3914, + "step": 4591 + }, + { + "epoch": 2.482609479185439, + "grad_norm": 0.35531044006347656, + "learning_rate": 5.925405898342559e-06, + "loss": 0.4038, + "step": 4592 + }, + { + "epoch": 2.483150117138223, + "grad_norm": 0.3070998787879944, + "learning_rate": 5.9235504741844686e-06, + "loss": 0.4024, + "step": 4593 + }, + { + "epoch": 2.4836907550910072, + "grad_norm": 0.3188576400279999, + "learning_rate": 5.9216949183482245e-06, + "loss": 0.3864, + "step": 4594 + }, + { + "epoch": 2.4842313930437916, + "grad_norm": 0.32584765553474426, + "learning_rate": 5.919839231098392e-06, + "loss": 0.4016, + "step": 4595 + }, + { + "epoch": 2.484772030996576, + "grad_norm": 0.3314954340457916, + "learning_rate": 5.917983412699549e-06, + "loss": 0.4243, + "step": 4596 + }, + { + "epoch": 2.4853126689493603, + "grad_norm": 0.2937794327735901, + "learning_rate": 5.916127463416293e-06, + "loss": 0.3844, + "step": 4597 + }, + { + "epoch": 2.4858533069021447, + "grad_norm": 0.29798558354377747, + "learning_rate": 5.914271383513247e-06, + "loss": 0.4113, + "step": 4598 + }, + { + "epoch": 2.486393944854929, + "grad_norm": 0.3137997090816498, + "learning_rate": 5.912415173255045e-06, + "loss": 0.4324, + "step": 4599 + }, + { + "epoch": 2.486934582807713, + "grad_norm": 0.3202819526195526, + "learning_rate": 5.910558832906341e-06, + "loss": 0.4093, + "step": 4600 + }, + { + "epoch": 2.4874752207604973, + "grad_norm": 0.29597213864326477, + "learning_rate": 5.90870236273181e-06, + "loss": 0.3855, + "step": 4601 + }, + { + "epoch": 2.4880158587132817, + "grad_norm": 0.3094213604927063, + "learning_rate": 5.906845762996143e-06, + "loss": 0.384, + "step": 4602 + }, + { + "epoch": 2.488556496666066, + "grad_norm": 0.3382684290409088, + "learning_rate": 5.904989033964051e-06, + "loss": 0.4134, + "step": 4603 + }, + { + "epoch": 2.4890971346188504, + "grad_norm": 0.31785866618156433, + "learning_rate": 5.903132175900264e-06, + "loss": 0.4363, + "step": 4604 + }, + { + "epoch": 2.4896377725716343, + "grad_norm": 0.30754876136779785, + "learning_rate": 5.90127518906953e-06, + "loss": 0.4128, + "step": 4605 + }, + { + "epoch": 2.4901784105244187, + "grad_norm": 0.28241950273513794, + "learning_rate": 5.8994180737366125e-06, + "loss": 0.3662, + "step": 4606 + }, + { + "epoch": 2.490719048477203, + "grad_norm": 0.27633342146873474, + "learning_rate": 5.897560830166297e-06, + "loss": 0.3934, + "step": 4607 + }, + { + "epoch": 2.4912596864299874, + "grad_norm": 0.2971936762332916, + "learning_rate": 5.8957034586233855e-06, + "loss": 0.3798, + "step": 4608 + }, + { + "epoch": 2.4918003243827718, + "grad_norm": 0.3198434114456177, + "learning_rate": 5.8938459593726985e-06, + "loss": 0.3967, + "step": 4609 + }, + { + "epoch": 2.492340962335556, + "grad_norm": 0.2966821789741516, + "learning_rate": 5.891988332679075e-06, + "loss": 0.3967, + "step": 4610 + }, + { + "epoch": 2.49288160028834, + "grad_norm": 0.3066607415676117, + "learning_rate": 5.8901305788073735e-06, + "loss": 0.3915, + "step": 4611 + }, + { + "epoch": 2.4934222382411244, + "grad_norm": 0.36355921626091003, + "learning_rate": 5.888272698022468e-06, + "loss": 0.4414, + "step": 4612 + }, + { + "epoch": 2.4939628761939088, + "grad_norm": 0.3148326873779297, + "learning_rate": 5.886414690589252e-06, + "loss": 0.3541, + "step": 4613 + }, + { + "epoch": 2.494503514146693, + "grad_norm": 0.33225294947624207, + "learning_rate": 5.884556556772634e-06, + "loss": 0.405, + "step": 4614 + }, + { + "epoch": 2.4950441520994775, + "grad_norm": 0.3356441855430603, + "learning_rate": 5.882698296837549e-06, + "loss": 0.3887, + "step": 4615 + }, + { + "epoch": 2.495584790052262, + "grad_norm": 0.3474816381931305, + "learning_rate": 5.880839911048939e-06, + "loss": 0.3972, + "step": 4616 + }, + { + "epoch": 2.4961254280050458, + "grad_norm": 0.3433469235897064, + "learning_rate": 5.878981399671774e-06, + "loss": 0.3986, + "step": 4617 + }, + { + "epoch": 2.49666606595783, + "grad_norm": 0.3132109045982361, + "learning_rate": 5.877122762971033e-06, + "loss": 0.4208, + "step": 4618 + }, + { + "epoch": 2.4972067039106145, + "grad_norm": 0.3536732792854309, + "learning_rate": 5.875264001211719e-06, + "loss": 0.4096, + "step": 4619 + }, + { + "epoch": 2.497747341863399, + "grad_norm": 0.35122716426849365, + "learning_rate": 5.87340511465885e-06, + "loss": 0.4056, + "step": 4620 + }, + { + "epoch": 2.4982879798161832, + "grad_norm": 0.3390818238258362, + "learning_rate": 5.871546103577464e-06, + "loss": 0.3954, + "step": 4621 + }, + { + "epoch": 2.498828617768967, + "grad_norm": 0.3005041778087616, + "learning_rate": 5.869686968232615e-06, + "loss": 0.3719, + "step": 4622 + }, + { + "epoch": 2.4993692557217515, + "grad_norm": 0.33602702617645264, + "learning_rate": 5.867827708889375e-06, + "loss": 0.4081, + "step": 4623 + }, + { + "epoch": 2.499909893674536, + "grad_norm": 0.31842952966690063, + "learning_rate": 5.8659683258128344e-06, + "loss": 0.4403, + "step": 4624 + }, + { + "epoch": 2.5004505316273202, + "grad_norm": 0.2911980152130127, + "learning_rate": 5.864108819268098e-06, + "loss": 0.399, + "step": 4625 + }, + { + "epoch": 2.5009911695801046, + "grad_norm": 0.2828127145767212, + "learning_rate": 5.862249189520293e-06, + "loss": 0.3872, + "step": 4626 + }, + { + "epoch": 2.501531807532889, + "grad_norm": 0.2884853780269623, + "learning_rate": 5.860389436834565e-06, + "loss": 0.3947, + "step": 4627 + }, + { + "epoch": 2.5020724454856733, + "grad_norm": 0.28914675116539, + "learning_rate": 5.858529561476069e-06, + "loss": 0.413, + "step": 4628 + }, + { + "epoch": 2.5026130834384572, + "grad_norm": 0.3035869002342224, + "learning_rate": 5.856669563709985e-06, + "loss": 0.3678, + "step": 4629 + }, + { + "epoch": 2.5031537213912416, + "grad_norm": 0.31376922130584717, + "learning_rate": 5.8548094438015065e-06, + "loss": 0.421, + "step": 4630 + }, + { + "epoch": 2.503694359344026, + "grad_norm": 0.2984936833381653, + "learning_rate": 5.852949202015849e-06, + "loss": 0.4254, + "step": 4631 + }, + { + "epoch": 2.5042349972968103, + "grad_norm": 0.2927659749984741, + "learning_rate": 5.851088838618239e-06, + "loss": 0.4027, + "step": 4632 + }, + { + "epoch": 2.5047756352495947, + "grad_norm": 0.3192109763622284, + "learning_rate": 5.849228353873927e-06, + "loss": 0.405, + "step": 4633 + }, + { + "epoch": 2.5053162732023786, + "grad_norm": 0.29724055528640747, + "learning_rate": 5.847367748048177e-06, + "loss": 0.3795, + "step": 4634 + }, + { + "epoch": 2.505856911155163, + "grad_norm": 0.30973199009895325, + "learning_rate": 5.8455070214062685e-06, + "loss": 0.4421, + "step": 4635 + }, + { + "epoch": 2.5063975491079473, + "grad_norm": 0.29204344749450684, + "learning_rate": 5.843646174213502e-06, + "loss": 0.3985, + "step": 4636 + }, + { + "epoch": 2.5069381870607317, + "grad_norm": 0.29323068261146545, + "learning_rate": 5.841785206735192e-06, + "loss": 0.4091, + "step": 4637 + }, + { + "epoch": 2.507478825013516, + "grad_norm": 0.32439741492271423, + "learning_rate": 5.839924119236676e-06, + "loss": 0.4158, + "step": 4638 + }, + { + "epoch": 2.5080194629663, + "grad_norm": 0.3009544909000397, + "learning_rate": 5.838062911983301e-06, + "loss": 0.3853, + "step": 4639 + }, + { + "epoch": 2.5085601009190848, + "grad_norm": 0.3224140405654907, + "learning_rate": 5.8362015852404365e-06, + "loss": 0.4289, + "step": 4640 + }, + { + "epoch": 2.5091007388718687, + "grad_norm": 0.2925460636615753, + "learning_rate": 5.834340139273465e-06, + "loss": 0.3596, + "step": 4641 + }, + { + "epoch": 2.509641376824653, + "grad_norm": 0.3109346926212311, + "learning_rate": 5.832478574347789e-06, + "loss": 0.3939, + "step": 4642 + }, + { + "epoch": 2.5101820147774374, + "grad_norm": 0.34562817215919495, + "learning_rate": 5.830616890728828e-06, + "loss": 0.4246, + "step": 4643 + }, + { + "epoch": 2.5107226527302218, + "grad_norm": 0.3096538186073303, + "learning_rate": 5.828755088682016e-06, + "loss": 0.4123, + "step": 4644 + }, + { + "epoch": 2.511263290683006, + "grad_norm": 0.31189072132110596, + "learning_rate": 5.826893168472807e-06, + "loss": 0.3759, + "step": 4645 + }, + { + "epoch": 2.51180392863579, + "grad_norm": 0.36504462361335754, + "learning_rate": 5.82503113036667e-06, + "loss": 0.3922, + "step": 4646 + }, + { + "epoch": 2.5123445665885744, + "grad_norm": 0.3030308783054352, + "learning_rate": 5.823168974629088e-06, + "loss": 0.371, + "step": 4647 + }, + { + "epoch": 2.5128852045413588, + "grad_norm": 0.3304159641265869, + "learning_rate": 5.821306701525566e-06, + "loss": 0.4055, + "step": 4648 + }, + { + "epoch": 2.513425842494143, + "grad_norm": 0.31587693095207214, + "learning_rate": 5.819444311321624e-06, + "loss": 0.3859, + "step": 4649 + }, + { + "epoch": 2.5139664804469275, + "grad_norm": 0.3149249851703644, + "learning_rate": 5.8175818042828e-06, + "loss": 0.4065, + "step": 4650 + }, + { + "epoch": 2.5145071183997114, + "grad_norm": 0.2843349277973175, + "learning_rate": 5.815719180674644e-06, + "loss": 0.4158, + "step": 4651 + }, + { + "epoch": 2.5150477563524962, + "grad_norm": 0.3453637659549713, + "learning_rate": 5.813856440762726e-06, + "loss": 0.4431, + "step": 4652 + }, + { + "epoch": 2.51558839430528, + "grad_norm": 0.28882738947868347, + "learning_rate": 5.811993584812631e-06, + "loss": 0.3657, + "step": 4653 + }, + { + "epoch": 2.5161290322580645, + "grad_norm": 0.3001982271671295, + "learning_rate": 5.810130613089964e-06, + "loss": 0.3916, + "step": 4654 + }, + { + "epoch": 2.516669670210849, + "grad_norm": 0.3245398998260498, + "learning_rate": 5.808267525860343e-06, + "loss": 0.4098, + "step": 4655 + }, + { + "epoch": 2.5172103081636332, + "grad_norm": 0.2959372401237488, + "learning_rate": 5.806404323389403e-06, + "loss": 0.3907, + "step": 4656 + }, + { + "epoch": 2.5177509461164176, + "grad_norm": 0.33096209168434143, + "learning_rate": 5.8045410059427964e-06, + "loss": 0.4194, + "step": 4657 + }, + { + "epoch": 2.5182915840692015, + "grad_norm": 0.33586692810058594, + "learning_rate": 5.802677573786194e-06, + "loss": 0.4213, + "step": 4658 + }, + { + "epoch": 2.518832222021986, + "grad_norm": 0.30121517181396484, + "learning_rate": 5.800814027185276e-06, + "loss": 0.3844, + "step": 4659 + }, + { + "epoch": 2.5193728599747702, + "grad_norm": 0.32516399025917053, + "learning_rate": 5.798950366405748e-06, + "loss": 0.3805, + "step": 4660 + }, + { + "epoch": 2.5199134979275546, + "grad_norm": 0.3295672535896301, + "learning_rate": 5.797086591713324e-06, + "loss": 0.4169, + "step": 4661 + }, + { + "epoch": 2.520454135880339, + "grad_norm": 0.2826218903064728, + "learning_rate": 5.79522270337374e-06, + "loss": 0.3613, + "step": 4662 + }, + { + "epoch": 2.520994773833123, + "grad_norm": 0.34624770283699036, + "learning_rate": 5.793358701652743e-06, + "loss": 0.4241, + "step": 4663 + }, + { + "epoch": 2.5215354117859072, + "grad_norm": 0.30984780192375183, + "learning_rate": 5.7914945868161035e-06, + "loss": 0.4, + "step": 4664 + }, + { + "epoch": 2.5220760497386916, + "grad_norm": 0.3277011811733246, + "learning_rate": 5.789630359129599e-06, + "loss": 0.4549, + "step": 4665 + }, + { + "epoch": 2.522616687691476, + "grad_norm": 0.2756793797016144, + "learning_rate": 5.787766018859029e-06, + "loss": 0.3835, + "step": 4666 + }, + { + "epoch": 2.5231573256442603, + "grad_norm": 0.2876024842262268, + "learning_rate": 5.785901566270209e-06, + "loss": 0.4025, + "step": 4667 + }, + { + "epoch": 2.5236979635970442, + "grad_norm": 0.2890456020832062, + "learning_rate": 5.784037001628969e-06, + "loss": 0.3946, + "step": 4668 + }, + { + "epoch": 2.524238601549829, + "grad_norm": 0.2782418131828308, + "learning_rate": 5.782172325201155e-06, + "loss": 0.4002, + "step": 4669 + }, + { + "epoch": 2.524779239502613, + "grad_norm": 0.3088947534561157, + "learning_rate": 5.780307537252629e-06, + "loss": 0.4331, + "step": 4670 + }, + { + "epoch": 2.5253198774553973, + "grad_norm": 0.29838061332702637, + "learning_rate": 5.778442638049269e-06, + "loss": 0.3774, + "step": 4671 + }, + { + "epoch": 2.5258605154081817, + "grad_norm": 0.28203192353248596, + "learning_rate": 5.776577627856969e-06, + "loss": 0.3978, + "step": 4672 + }, + { + "epoch": 2.526401153360966, + "grad_norm": 0.29183122515678406, + "learning_rate": 5.7747125069416374e-06, + "loss": 0.4065, + "step": 4673 + }, + { + "epoch": 2.5269417913137504, + "grad_norm": 0.3165287375450134, + "learning_rate": 5.772847275569204e-06, + "loss": 0.4, + "step": 4674 + }, + { + "epoch": 2.5274824292665343, + "grad_norm": 0.30928778648376465, + "learning_rate": 5.770981934005606e-06, + "loss": 0.4046, + "step": 4675 + }, + { + "epoch": 2.5280230672193187, + "grad_norm": 0.2827933132648468, + "learning_rate": 5.769116482516801e-06, + "loss": 0.3991, + "step": 4676 + }, + { + "epoch": 2.528563705172103, + "grad_norm": 0.3062750995159149, + "learning_rate": 5.767250921368763e-06, + "loss": 0.437, + "step": 4677 + }, + { + "epoch": 2.5291043431248874, + "grad_norm": 0.32716378569602966, + "learning_rate": 5.7653852508274796e-06, + "loss": 0.4036, + "step": 4678 + }, + { + "epoch": 2.5296449810776718, + "grad_norm": 0.29937511682510376, + "learning_rate": 5.763519471158956e-06, + "loss": 0.3997, + "step": 4679 + }, + { + "epoch": 2.5301856190304557, + "grad_norm": 0.2924257814884186, + "learning_rate": 5.76165358262921e-06, + "loss": 0.3738, + "step": 4680 + }, + { + "epoch": 2.5307262569832405, + "grad_norm": 0.30196434259414673, + "learning_rate": 5.7597875855042765e-06, + "loss": 0.4284, + "step": 4681 + }, + { + "epoch": 2.5312668949360244, + "grad_norm": 0.29077422618865967, + "learning_rate": 5.757921480050206e-06, + "loss": 0.3903, + "step": 4682 + }, + { + "epoch": 2.5318075328888088, + "grad_norm": 0.3043310344219208, + "learning_rate": 5.756055266533066e-06, + "loss": 0.3825, + "step": 4683 + }, + { + "epoch": 2.532348170841593, + "grad_norm": 0.3497193157672882, + "learning_rate": 5.754188945218937e-06, + "loss": 0.4463, + "step": 4684 + }, + { + "epoch": 2.5328888087943775, + "grad_norm": 0.28372862935066223, + "learning_rate": 5.752322516373916e-06, + "loss": 0.3929, + "step": 4685 + }, + { + "epoch": 2.533429446747162, + "grad_norm": 0.34064146876335144, + "learning_rate": 5.7504559802641144e-06, + "loss": 0.4059, + "step": 4686 + }, + { + "epoch": 2.533970084699946, + "grad_norm": 0.30861422419548035, + "learning_rate": 5.748589337155659e-06, + "loss": 0.4047, + "step": 4687 + }, + { + "epoch": 2.53451072265273, + "grad_norm": 0.29709404706954956, + "learning_rate": 5.746722587314693e-06, + "loss": 0.3686, + "step": 4688 + }, + { + "epoch": 2.5350513606055145, + "grad_norm": 0.30919149518013, + "learning_rate": 5.744855731007376e-06, + "loss": 0.3693, + "step": 4689 + }, + { + "epoch": 2.535591998558299, + "grad_norm": 0.29292532801628113, + "learning_rate": 5.742988768499879e-06, + "loss": 0.3915, + "step": 4690 + }, + { + "epoch": 2.5361326365110832, + "grad_norm": 0.31115463376045227, + "learning_rate": 5.74112170005839e-06, + "loss": 0.4598, + "step": 4691 + }, + { + "epoch": 2.536673274463867, + "grad_norm": 0.3015570342540741, + "learning_rate": 5.739254525949113e-06, + "loss": 0.4062, + "step": 4692 + }, + { + "epoch": 2.5372139124166515, + "grad_norm": 0.3175840973854065, + "learning_rate": 5.737387246438266e-06, + "loss": 0.3894, + "step": 4693 + }, + { + "epoch": 2.537754550369436, + "grad_norm": 0.3196827471256256, + "learning_rate": 5.735519861792081e-06, + "loss": 0.3792, + "step": 4694 + }, + { + "epoch": 2.5382951883222202, + "grad_norm": 0.3393493890762329, + "learning_rate": 5.733652372276809e-06, + "loss": 0.4803, + "step": 4695 + }, + { + "epoch": 2.5388358262750046, + "grad_norm": 0.2754509150981903, + "learning_rate": 5.731784778158712e-06, + "loss": 0.3826, + "step": 4696 + }, + { + "epoch": 2.539376464227789, + "grad_norm": 0.31321465969085693, + "learning_rate": 5.729917079704068e-06, + "loss": 0.4181, + "step": 4697 + }, + { + "epoch": 2.5399171021805733, + "grad_norm": 0.30417436361312866, + "learning_rate": 5.72804927717917e-06, + "loss": 0.3997, + "step": 4698 + }, + { + "epoch": 2.5404577401333572, + "grad_norm": 0.3323020339012146, + "learning_rate": 5.726181370850327e-06, + "loss": 0.3878, + "step": 4699 + }, + { + "epoch": 2.5409983780861416, + "grad_norm": 0.3090798854827881, + "learning_rate": 5.724313360983859e-06, + "loss": 0.4242, + "step": 4700 + }, + { + "epoch": 2.541539016038926, + "grad_norm": 0.3007925748825073, + "learning_rate": 5.722445247846107e-06, + "loss": 0.395, + "step": 4701 + }, + { + "epoch": 2.5420796539917103, + "grad_norm": 0.31759968400001526, + "learning_rate": 5.72057703170342e-06, + "loss": 0.4297, + "step": 4702 + }, + { + "epoch": 2.5426202919444947, + "grad_norm": 0.3306910991668701, + "learning_rate": 5.7187087128221685e-06, + "loss": 0.4202, + "step": 4703 + }, + { + "epoch": 2.5431609298972786, + "grad_norm": 0.30417653918266296, + "learning_rate": 5.71684029146873e-06, + "loss": 0.3866, + "step": 4704 + }, + { + "epoch": 2.543701567850063, + "grad_norm": 0.29840660095214844, + "learning_rate": 5.7149717679095026e-06, + "loss": 0.3956, + "step": 4705 + }, + { + "epoch": 2.5442422058028473, + "grad_norm": 0.3214104175567627, + "learning_rate": 5.713103142410896e-06, + "loss": 0.3872, + "step": 4706 + }, + { + "epoch": 2.5447828437556317, + "grad_norm": 0.31197574734687805, + "learning_rate": 5.71123441523934e-06, + "loss": 0.3789, + "step": 4707 + }, + { + "epoch": 2.545323481708416, + "grad_norm": 0.32361727952957153, + "learning_rate": 5.709365586661266e-06, + "loss": 0.4224, + "step": 4708 + }, + { + "epoch": 2.5458641196612, + "grad_norm": 0.267334908246994, + "learning_rate": 5.707496656943137e-06, + "loss": 0.3408, + "step": 4709 + }, + { + "epoch": 2.5464047576139848, + "grad_norm": 0.34889429807662964, + "learning_rate": 5.705627626351415e-06, + "loss": 0.4383, + "step": 4710 + }, + { + "epoch": 2.5469453955667687, + "grad_norm": 0.28287893533706665, + "learning_rate": 5.703758495152585e-06, + "loss": 0.4015, + "step": 4711 + }, + { + "epoch": 2.547486033519553, + "grad_norm": 0.29935476183891296, + "learning_rate": 5.701889263613145e-06, + "loss": 0.4103, + "step": 4712 + }, + { + "epoch": 2.5480266714723374, + "grad_norm": 0.32376861572265625, + "learning_rate": 5.700019931999607e-06, + "loss": 0.3986, + "step": 4713 + }, + { + "epoch": 2.5485673094251218, + "grad_norm": 0.2909325659275055, + "learning_rate": 5.698150500578497e-06, + "loss": 0.4137, + "step": 4714 + }, + { + "epoch": 2.549107947377906, + "grad_norm": 0.28764599561691284, + "learning_rate": 5.6962809696163536e-06, + "loss": 0.4149, + "step": 4715 + }, + { + "epoch": 2.54964858533069, + "grad_norm": 0.2751624286174774, + "learning_rate": 5.6944113393797314e-06, + "loss": 0.404, + "step": 4716 + }, + { + "epoch": 2.5501892232834744, + "grad_norm": 0.28708869218826294, + "learning_rate": 5.6925416101352e-06, + "loss": 0.3879, + "step": 4717 + }, + { + "epoch": 2.5507298612362588, + "grad_norm": 0.28639307618141174, + "learning_rate": 5.690671782149342e-06, + "loss": 0.377, + "step": 4718 + }, + { + "epoch": 2.551270499189043, + "grad_norm": 0.3289453983306885, + "learning_rate": 5.688801855688752e-06, + "loss": 0.3934, + "step": 4719 + }, + { + "epoch": 2.5518111371418275, + "grad_norm": 0.2868677079677582, + "learning_rate": 5.686931831020044e-06, + "loss": 0.4083, + "step": 4720 + }, + { + "epoch": 2.5523517750946114, + "grad_norm": 0.3312520682811737, + "learning_rate": 5.6850617084098416e-06, + "loss": 0.4313, + "step": 4721 + }, + { + "epoch": 2.552892413047396, + "grad_norm": 0.2790953516960144, + "learning_rate": 5.683191488124782e-06, + "loss": 0.3846, + "step": 4722 + }, + { + "epoch": 2.55343305100018, + "grad_norm": 0.3344435393810272, + "learning_rate": 5.681321170431517e-06, + "loss": 0.4598, + "step": 4723 + }, + { + "epoch": 2.5539736889529645, + "grad_norm": 0.3081229627132416, + "learning_rate": 5.679450755596716e-06, + "loss": 0.4078, + "step": 4724 + }, + { + "epoch": 2.554514326905749, + "grad_norm": 0.31888335943222046, + "learning_rate": 5.6775802438870596e-06, + "loss": 0.412, + "step": 4725 + }, + { + "epoch": 2.5550549648585332, + "grad_norm": 0.28536099195480347, + "learning_rate": 5.67570963556924e-06, + "loss": 0.3876, + "step": 4726 + }, + { + "epoch": 2.5555956028113176, + "grad_norm": 0.3071342408657074, + "learning_rate": 5.673838930909965e-06, + "loss": 0.3933, + "step": 4727 + }, + { + "epoch": 2.5561362407641015, + "grad_norm": 0.30499646067619324, + "learning_rate": 5.671968130175957e-06, + "loss": 0.4141, + "step": 4728 + }, + { + "epoch": 2.556676878716886, + "grad_norm": 0.2743418216705322, + "learning_rate": 5.670097233633951e-06, + "loss": 0.4209, + "step": 4729 + }, + { + "epoch": 2.5572175166696702, + "grad_norm": 0.29773473739624023, + "learning_rate": 5.668226241550698e-06, + "loss": 0.3932, + "step": 4730 + }, + { + "epoch": 2.5577581546224546, + "grad_norm": 0.3202398121356964, + "learning_rate": 5.666355154192958e-06, + "loss": 0.3952, + "step": 4731 + }, + { + "epoch": 2.558298792575239, + "grad_norm": 0.2851807475090027, + "learning_rate": 5.664483971827508e-06, + "loss": 0.3783, + "step": 4732 + }, + { + "epoch": 2.558839430528023, + "grad_norm": 0.31066733598709106, + "learning_rate": 5.662612694721139e-06, + "loss": 0.4034, + "step": 4733 + }, + { + "epoch": 2.5593800684808072, + "grad_norm": 0.29603439569473267, + "learning_rate": 5.660741323140651e-06, + "loss": 0.3991, + "step": 4734 + }, + { + "epoch": 2.5599207064335916, + "grad_norm": 0.2804199159145355, + "learning_rate": 5.658869857352866e-06, + "loss": 0.3779, + "step": 4735 + }, + { + "epoch": 2.560461344386376, + "grad_norm": 0.27277815341949463, + "learning_rate": 5.65699829762461e-06, + "loss": 0.3828, + "step": 4736 + }, + { + "epoch": 2.5610019823391603, + "grad_norm": 0.28206098079681396, + "learning_rate": 5.655126644222728e-06, + "loss": 0.413, + "step": 4737 + }, + { + "epoch": 2.5615426202919442, + "grad_norm": 0.28778842091560364, + "learning_rate": 5.653254897414076e-06, + "loss": 0.3704, + "step": 4738 + }, + { + "epoch": 2.562083258244729, + "grad_norm": 0.29356199502944946, + "learning_rate": 5.651383057465527e-06, + "loss": 0.4005, + "step": 4739 + }, + { + "epoch": 2.562623896197513, + "grad_norm": 0.32198256254196167, + "learning_rate": 5.649511124643962e-06, + "loss": 0.4573, + "step": 4740 + }, + { + "epoch": 2.5631645341502973, + "grad_norm": 0.2882884442806244, + "learning_rate": 5.647639099216278e-06, + "loss": 0.407, + "step": 4741 + }, + { + "epoch": 2.5637051721030817, + "grad_norm": 0.2837863862514496, + "learning_rate": 5.6457669814493855e-06, + "loss": 0.3928, + "step": 4742 + }, + { + "epoch": 2.564245810055866, + "grad_norm": 0.29012438654899597, + "learning_rate": 5.6438947716102085e-06, + "loss": 0.4155, + "step": 4743 + }, + { + "epoch": 2.5647864480086504, + "grad_norm": 0.2923259437084198, + "learning_rate": 5.642022469965682e-06, + "loss": 0.3967, + "step": 4744 + }, + { + "epoch": 2.5653270859614343, + "grad_norm": 0.33380910754203796, + "learning_rate": 5.640150076782755e-06, + "loss": 0.4377, + "step": 4745 + }, + { + "epoch": 2.5658677239142187, + "grad_norm": 0.2974168360233307, + "learning_rate": 5.638277592328392e-06, + "loss": 0.3964, + "step": 4746 + }, + { + "epoch": 2.566408361867003, + "grad_norm": 0.34730857610702515, + "learning_rate": 5.636405016869567e-06, + "loss": 0.4124, + "step": 4747 + }, + { + "epoch": 2.5669489998197874, + "grad_norm": 0.3285943865776062, + "learning_rate": 5.634532350673267e-06, + "loss": 0.3749, + "step": 4748 + }, + { + "epoch": 2.5674896377725718, + "grad_norm": 0.2939063608646393, + "learning_rate": 5.632659594006498e-06, + "loss": 0.3693, + "step": 4749 + }, + { + "epoch": 2.5680302757253557, + "grad_norm": 0.31589987874031067, + "learning_rate": 5.630786747136269e-06, + "loss": 0.4131, + "step": 4750 + }, + { + "epoch": 2.56857091367814, + "grad_norm": 0.3158331513404846, + "learning_rate": 5.628913810329608e-06, + "loss": 0.3751, + "step": 4751 + }, + { + "epoch": 2.5691115516309244, + "grad_norm": 0.30993759632110596, + "learning_rate": 5.6270407838535575e-06, + "loss": 0.4034, + "step": 4752 + }, + { + "epoch": 2.569652189583709, + "grad_norm": 0.30420321226119995, + "learning_rate": 5.625167667975171e-06, + "loss": 0.4026, + "step": 4753 + }, + { + "epoch": 2.570192827536493, + "grad_norm": 0.3265765309333801, + "learning_rate": 5.623294462961509e-06, + "loss": 0.4372, + "step": 4754 + }, + { + "epoch": 2.5707334654892775, + "grad_norm": 0.3445880711078644, + "learning_rate": 5.621421169079655e-06, + "loss": 0.4083, + "step": 4755 + }, + { + "epoch": 2.571274103442062, + "grad_norm": 0.3127903938293457, + "learning_rate": 5.619547786596695e-06, + "loss": 0.4445, + "step": 4756 + }, + { + "epoch": 2.571814741394846, + "grad_norm": 0.2989613711833954, + "learning_rate": 5.617674315779737e-06, + "loss": 0.3752, + "step": 4757 + }, + { + "epoch": 2.57235537934763, + "grad_norm": 0.3358243703842163, + "learning_rate": 5.615800756895893e-06, + "loss": 0.3885, + "step": 4758 + }, + { + "epoch": 2.5728960173004145, + "grad_norm": 0.2883393168449402, + "learning_rate": 5.613927110212295e-06, + "loss": 0.4034, + "step": 4759 + }, + { + "epoch": 2.573436655253199, + "grad_norm": 0.2877352237701416, + "learning_rate": 5.612053375996082e-06, + "loss": 0.4145, + "step": 4760 + }, + { + "epoch": 2.5739772932059832, + "grad_norm": 0.30452650785446167, + "learning_rate": 5.610179554514408e-06, + "loss": 0.3687, + "step": 4761 + }, + { + "epoch": 2.574517931158767, + "grad_norm": 0.3329680263996124, + "learning_rate": 5.608305646034441e-06, + "loss": 0.4156, + "step": 4762 + }, + { + "epoch": 2.5750585691115515, + "grad_norm": 0.27887311577796936, + "learning_rate": 5.6064316508233555e-06, + "loss": 0.3646, + "step": 4763 + }, + { + "epoch": 2.575599207064336, + "grad_norm": 0.291676789522171, + "learning_rate": 5.604557569148347e-06, + "loss": 0.4007, + "step": 4764 + }, + { + "epoch": 2.5761398450171202, + "grad_norm": 0.3124271333217621, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.409, + "step": 4765 + }, + { + "epoch": 2.5766804829699046, + "grad_norm": 0.3063427805900574, + "learning_rate": 5.600809147475378e-06, + "loss": 0.3944, + "step": 4766 + }, + { + "epoch": 2.5772211209226885, + "grad_norm": 0.290528267621994, + "learning_rate": 5.598934808011861e-06, + "loss": 0.4085, + "step": 4767 + }, + { + "epoch": 2.5777617588754733, + "grad_norm": 0.2874440848827362, + "learning_rate": 5.5970603831533055e-06, + "loss": 0.401, + "step": 4768 + }, + { + "epoch": 2.5783023968282572, + "grad_norm": 0.3331311345100403, + "learning_rate": 5.595185873166961e-06, + "loss": 0.3851, + "step": 4769 + }, + { + "epoch": 2.5788430347810416, + "grad_norm": 0.31015604734420776, + "learning_rate": 5.593311278320097e-06, + "loss": 0.4194, + "step": 4770 + }, + { + "epoch": 2.579383672733826, + "grad_norm": 0.306537002325058, + "learning_rate": 5.5914365988799854e-06, + "loss": 0.4077, + "step": 4771 + }, + { + "epoch": 2.5799243106866103, + "grad_norm": 0.3082190752029419, + "learning_rate": 5.589561835113917e-06, + "loss": 0.3948, + "step": 4772 + }, + { + "epoch": 2.5804649486393947, + "grad_norm": 0.2981390655040741, + "learning_rate": 5.587686987289189e-06, + "loss": 0.3974, + "step": 4773 + }, + { + "epoch": 2.5810055865921786, + "grad_norm": 0.28734248876571655, + "learning_rate": 5.585812055673117e-06, + "loss": 0.3953, + "step": 4774 + }, + { + "epoch": 2.581546224544963, + "grad_norm": 0.3071170747280121, + "learning_rate": 5.583937040533023e-06, + "loss": 0.4089, + "step": 4775 + }, + { + "epoch": 2.5820868624977473, + "grad_norm": 0.31013354659080505, + "learning_rate": 5.582061942136247e-06, + "loss": 0.4042, + "step": 4776 + }, + { + "epoch": 2.5826275004505317, + "grad_norm": 0.3202647566795349, + "learning_rate": 5.580186760750132e-06, + "loss": 0.3825, + "step": 4777 + }, + { + "epoch": 2.583168138403316, + "grad_norm": 0.2962801158428192, + "learning_rate": 5.578311496642042e-06, + "loss": 0.3861, + "step": 4778 + }, + { + "epoch": 2.5837087763561, + "grad_norm": 0.3064135015010834, + "learning_rate": 5.576436150079347e-06, + "loss": 0.4166, + "step": 4779 + }, + { + "epoch": 2.5842494143088843, + "grad_norm": 0.3127712607383728, + "learning_rate": 5.574560721329431e-06, + "loss": 0.3752, + "step": 4780 + }, + { + "epoch": 2.5847900522616687, + "grad_norm": 0.28814229369163513, + "learning_rate": 5.572685210659688e-06, + "loss": 0.389, + "step": 4781 + }, + { + "epoch": 2.585330690214453, + "grad_norm": 0.28176191449165344, + "learning_rate": 5.570809618337528e-06, + "loss": 0.4222, + "step": 4782 + }, + { + "epoch": 2.5858713281672374, + "grad_norm": 0.33255767822265625, + "learning_rate": 5.568933944630367e-06, + "loss": 0.4089, + "step": 4783 + }, + { + "epoch": 2.5864119661200218, + "grad_norm": 0.29645800590515137, + "learning_rate": 5.567058189805636e-06, + "loss": 0.3528, + "step": 4784 + }, + { + "epoch": 2.586952604072806, + "grad_norm": 0.3250539302825928, + "learning_rate": 5.565182354130776e-06, + "loss": 0.4067, + "step": 4785 + }, + { + "epoch": 2.58749324202559, + "grad_norm": 0.31018659472465515, + "learning_rate": 5.563306437873239e-06, + "loss": 0.4024, + "step": 4786 + }, + { + "epoch": 2.5880338799783744, + "grad_norm": 0.30082499980926514, + "learning_rate": 5.561430441300493e-06, + "loss": 0.3976, + "step": 4787 + }, + { + "epoch": 2.588574517931159, + "grad_norm": 0.3291463255882263, + "learning_rate": 5.5595543646800134e-06, + "loss": 0.4123, + "step": 4788 + }, + { + "epoch": 2.589115155883943, + "grad_norm": 0.2999454140663147, + "learning_rate": 5.557678208279286e-06, + "loss": 0.4015, + "step": 4789 + }, + { + "epoch": 2.5896557938367275, + "grad_norm": 0.3320070803165436, + "learning_rate": 5.555801972365812e-06, + "loss": 0.3789, + "step": 4790 + }, + { + "epoch": 2.5901964317895114, + "grad_norm": 0.3625023663043976, + "learning_rate": 5.553925657207098e-06, + "loss": 0.426, + "step": 4791 + }, + { + "epoch": 2.590737069742296, + "grad_norm": 0.29068654775619507, + "learning_rate": 5.5520492630706705e-06, + "loss": 0.3966, + "step": 4792 + }, + { + "epoch": 2.59127770769508, + "grad_norm": 0.3232583701610565, + "learning_rate": 5.5501727902240584e-06, + "loss": 0.4406, + "step": 4793 + }, + { + "epoch": 2.5918183456478645, + "grad_norm": 0.3110728859901428, + "learning_rate": 5.5482962389348084e-06, + "loss": 0.3989, + "step": 4794 + }, + { + "epoch": 2.592358983600649, + "grad_norm": 0.3199319839477539, + "learning_rate": 5.5464196094704745e-06, + "loss": 0.3721, + "step": 4795 + }, + { + "epoch": 2.592899621553433, + "grad_norm": 0.29083722829818726, + "learning_rate": 5.544542902098624e-06, + "loss": 0.3863, + "step": 4796 + }, + { + "epoch": 2.5934402595062176, + "grad_norm": 0.3540262281894684, + "learning_rate": 5.542666117086832e-06, + "loss": 0.4659, + "step": 4797 + }, + { + "epoch": 2.5939808974590015, + "grad_norm": 0.2991962134838104, + "learning_rate": 5.540789254702691e-06, + "loss": 0.4013, + "step": 4798 + }, + { + "epoch": 2.594521535411786, + "grad_norm": 0.28734034299850464, + "learning_rate": 5.5389123152137965e-06, + "loss": 0.4237, + "step": 4799 + }, + { + "epoch": 2.5950621733645702, + "grad_norm": 0.28482699394226074, + "learning_rate": 5.537035298887764e-06, + "loss": 0.3689, + "step": 4800 + }, + { + "epoch": 2.5956028113173546, + "grad_norm": 0.2997708320617676, + "learning_rate": 5.53515820599221e-06, + "loss": 0.4029, + "step": 4801 + }, + { + "epoch": 2.596143449270139, + "grad_norm": 0.30606913566589355, + "learning_rate": 5.53328103679477e-06, + "loss": 0.4063, + "step": 4802 + }, + { + "epoch": 2.596684087222923, + "grad_norm": 0.30242788791656494, + "learning_rate": 5.5314037915630855e-06, + "loss": 0.3998, + "step": 4803 + }, + { + "epoch": 2.5972247251757072, + "grad_norm": 0.3239155411720276, + "learning_rate": 5.529526470564814e-06, + "loss": 0.3953, + "step": 4804 + }, + { + "epoch": 2.5977653631284916, + "grad_norm": 0.2938888967037201, + "learning_rate": 5.527649074067618e-06, + "loss": 0.4338, + "step": 4805 + }, + { + "epoch": 2.598306001081276, + "grad_norm": 0.2955934703350067, + "learning_rate": 5.525771602339174e-06, + "loss": 0.3807, + "step": 4806 + }, + { + "epoch": 2.5988466390340603, + "grad_norm": 0.3063485026359558, + "learning_rate": 5.523894055647167e-06, + "loss": 0.397, + "step": 4807 + }, + { + "epoch": 2.5993872769868442, + "grad_norm": 0.28256726264953613, + "learning_rate": 5.522016434259295e-06, + "loss": 0.3811, + "step": 4808 + }, + { + "epoch": 2.5999279149396286, + "grad_norm": 0.3530399203300476, + "learning_rate": 5.520138738443267e-06, + "loss": 0.3994, + "step": 4809 + }, + { + "epoch": 2.600468552892413, + "grad_norm": 0.31889668107032776, + "learning_rate": 5.5182609684668024e-06, + "loss": 0.4323, + "step": 4810 + }, + { + "epoch": 2.6010091908451973, + "grad_norm": 0.3093354105949402, + "learning_rate": 5.516383124597626e-06, + "loss": 0.3996, + "step": 4811 + }, + { + "epoch": 2.6015498287979817, + "grad_norm": 0.33319196105003357, + "learning_rate": 5.514505207103482e-06, + "loss": 0.3765, + "step": 4812 + }, + { + "epoch": 2.602090466750766, + "grad_norm": 0.337973028421402, + "learning_rate": 5.512627216252117e-06, + "loss": 0.4089, + "step": 4813 + }, + { + "epoch": 2.6026311047035504, + "grad_norm": 0.3009018898010254, + "learning_rate": 5.510749152311293e-06, + "loss": 0.4139, + "step": 4814 + }, + { + "epoch": 2.6031717426563343, + "grad_norm": 0.3260372281074524, + "learning_rate": 5.508871015548781e-06, + "loss": 0.4199, + "step": 4815 + }, + { + "epoch": 2.6037123806091187, + "grad_norm": 0.3295881450176239, + "learning_rate": 5.506992806232363e-06, + "loss": 0.379, + "step": 4816 + }, + { + "epoch": 2.604253018561903, + "grad_norm": 0.3007758855819702, + "learning_rate": 5.50511452462983e-06, + "loss": 0.3894, + "step": 4817 + }, + { + "epoch": 2.6047936565146874, + "grad_norm": 0.2632093131542206, + "learning_rate": 5.503236171008983e-06, + "loss": 0.3655, + "step": 4818 + }, + { + "epoch": 2.605334294467472, + "grad_norm": 0.3454212248325348, + "learning_rate": 5.501357745637635e-06, + "loss": 0.4068, + "step": 4819 + }, + { + "epoch": 2.6058749324202557, + "grad_norm": 0.3434267044067383, + "learning_rate": 5.49947924878361e-06, + "loss": 0.3986, + "step": 4820 + }, + { + "epoch": 2.60641557037304, + "grad_norm": 0.27694422006607056, + "learning_rate": 5.497600680714738e-06, + "loss": 0.3915, + "step": 4821 + }, + { + "epoch": 2.6069562083258244, + "grad_norm": 0.3544457256793976, + "learning_rate": 5.495722041698864e-06, + "loss": 0.436, + "step": 4822 + }, + { + "epoch": 2.607496846278609, + "grad_norm": 0.30748435854911804, + "learning_rate": 5.4938433320038395e-06, + "loss": 0.3588, + "step": 4823 + }, + { + "epoch": 2.608037484231393, + "grad_norm": 0.301878422498703, + "learning_rate": 5.49196455189753e-06, + "loss": 0.4161, + "step": 4824 + }, + { + "epoch": 2.608578122184177, + "grad_norm": 0.297625333070755, + "learning_rate": 5.490085701647805e-06, + "loss": 0.4219, + "step": 4825 + }, + { + "epoch": 2.609118760136962, + "grad_norm": 0.31237471103668213, + "learning_rate": 5.488206781522547e-06, + "loss": 0.4199, + "step": 4826 + }, + { + "epoch": 2.609659398089746, + "grad_norm": 0.3142875134944916, + "learning_rate": 5.486327791789654e-06, + "loss": 0.4171, + "step": 4827 + }, + { + "epoch": 2.61020003604253, + "grad_norm": 0.30160701274871826, + "learning_rate": 5.484448732717026e-06, + "loss": 0.3549, + "step": 4828 + }, + { + "epoch": 2.6107406739953145, + "grad_norm": 0.30564194917678833, + "learning_rate": 5.482569604572577e-06, + "loss": 0.4039, + "step": 4829 + }, + { + "epoch": 2.611281311948099, + "grad_norm": 0.357769638299942, + "learning_rate": 5.480690407624227e-06, + "loss": 0.4275, + "step": 4830 + }, + { + "epoch": 2.6118219499008832, + "grad_norm": 0.31541019678115845, + "learning_rate": 5.47881114213991e-06, + "loss": 0.4173, + "step": 4831 + }, + { + "epoch": 2.612362587853667, + "grad_norm": 0.28554439544677734, + "learning_rate": 5.476931808387569e-06, + "loss": 0.4039, + "step": 4832 + }, + { + "epoch": 2.6129032258064515, + "grad_norm": 0.31113046407699585, + "learning_rate": 5.475052406635158e-06, + "loss": 0.4206, + "step": 4833 + }, + { + "epoch": 2.613443863759236, + "grad_norm": 0.2959286868572235, + "learning_rate": 5.473172937150633e-06, + "loss": 0.3876, + "step": 4834 + }, + { + "epoch": 2.6139845017120202, + "grad_norm": 0.31250184774398804, + "learning_rate": 5.47129340020197e-06, + "loss": 0.3951, + "step": 4835 + }, + { + "epoch": 2.6145251396648046, + "grad_norm": 0.33424627780914307, + "learning_rate": 5.469413796057147e-06, + "loss": 0.4076, + "step": 4836 + }, + { + "epoch": 2.6150657776175885, + "grad_norm": 0.29603421688079834, + "learning_rate": 5.467534124984158e-06, + "loss": 0.3878, + "step": 4837 + }, + { + "epoch": 2.615606415570373, + "grad_norm": 0.3031003177165985, + "learning_rate": 5.4656543872509994e-06, + "loss": 0.4352, + "step": 4838 + }, + { + "epoch": 2.6161470535231572, + "grad_norm": 0.2941995859146118, + "learning_rate": 5.4637745831256835e-06, + "loss": 0.3927, + "step": 4839 + }, + { + "epoch": 2.6166876914759416, + "grad_norm": 0.2877275049686432, + "learning_rate": 5.461894712876228e-06, + "loss": 0.3931, + "step": 4840 + }, + { + "epoch": 2.617228329428726, + "grad_norm": 0.2958478033542633, + "learning_rate": 5.460014776770663e-06, + "loss": 0.4354, + "step": 4841 + }, + { + "epoch": 2.6177689673815103, + "grad_norm": 0.2891026735305786, + "learning_rate": 5.458134775077024e-06, + "loss": 0.3842, + "step": 4842 + }, + { + "epoch": 2.6183096053342947, + "grad_norm": 0.28329116106033325, + "learning_rate": 5.45625470806336e-06, + "loss": 0.3872, + "step": 4843 + }, + { + "epoch": 2.6188502432870786, + "grad_norm": 0.2999545931816101, + "learning_rate": 5.4543745759977265e-06, + "loss": 0.4215, + "step": 4844 + }, + { + "epoch": 2.619390881239863, + "grad_norm": 0.30322304368019104, + "learning_rate": 5.45249437914819e-06, + "loss": 0.4065, + "step": 4845 + }, + { + "epoch": 2.6199315191926473, + "grad_norm": 0.2818833887577057, + "learning_rate": 5.4506141177828255e-06, + "loss": 0.3785, + "step": 4846 + }, + { + "epoch": 2.6204721571454317, + "grad_norm": 0.2854217290878296, + "learning_rate": 5.448733792169717e-06, + "loss": 0.4159, + "step": 4847 + }, + { + "epoch": 2.621012795098216, + "grad_norm": 0.2914997339248657, + "learning_rate": 5.446853402576958e-06, + "loss": 0.4188, + "step": 4848 + }, + { + "epoch": 2.621553433051, + "grad_norm": 0.2967655658721924, + "learning_rate": 5.44497294927265e-06, + "loss": 0.3725, + "step": 4849 + }, + { + "epoch": 2.6220940710037843, + "grad_norm": 0.3074378967285156, + "learning_rate": 5.443092432524906e-06, + "loss": 0.4387, + "step": 4850 + }, + { + "epoch": 2.6226347089565687, + "grad_norm": 0.3074101209640503, + "learning_rate": 5.441211852601849e-06, + "loss": 0.421, + "step": 4851 + }, + { + "epoch": 2.623175346909353, + "grad_norm": 0.30987322330474854, + "learning_rate": 5.439331209771604e-06, + "loss": 0.3981, + "step": 4852 + }, + { + "epoch": 2.6237159848621374, + "grad_norm": 0.29704779386520386, + "learning_rate": 5.437450504302312e-06, + "loss": 0.3864, + "step": 4853 + }, + { + "epoch": 2.6242566228149213, + "grad_norm": 0.29757240414619446, + "learning_rate": 5.435569736462119e-06, + "loss": 0.4094, + "step": 4854 + }, + { + "epoch": 2.624797260767706, + "grad_norm": 0.2916492819786072, + "learning_rate": 5.433688906519183e-06, + "loss": 0.3857, + "step": 4855 + }, + { + "epoch": 2.62533789872049, + "grad_norm": 0.30935555696487427, + "learning_rate": 5.4318080147416695e-06, + "loss": 0.3868, + "step": 4856 + }, + { + "epoch": 2.6258785366732744, + "grad_norm": 0.2876951992511749, + "learning_rate": 5.429927061397754e-06, + "loss": 0.3804, + "step": 4857 + }, + { + "epoch": 2.626419174626059, + "grad_norm": 0.29807013273239136, + "learning_rate": 5.428046046755615e-06, + "loss": 0.3899, + "step": 4858 + }, + { + "epoch": 2.626959812578843, + "grad_norm": 0.30588850378990173, + "learning_rate": 5.426164971083447e-06, + "loss": 0.4116, + "step": 4859 + }, + { + "epoch": 2.6275004505316275, + "grad_norm": 0.2891402840614319, + "learning_rate": 5.424283834649451e-06, + "loss": 0.3746, + "step": 4860 + }, + { + "epoch": 2.6280410884844114, + "grad_norm": 0.3078900873661041, + "learning_rate": 5.4224026377218365e-06, + "loss": 0.4146, + "step": 4861 + }, + { + "epoch": 2.628581726437196, + "grad_norm": 0.2716504633426666, + "learning_rate": 5.4205213805688174e-06, + "loss": 0.3487, + "step": 4862 + }, + { + "epoch": 2.62912236438998, + "grad_norm": 0.29706239700317383, + "learning_rate": 5.4186400634586246e-06, + "loss": 0.4373, + "step": 4863 + }, + { + "epoch": 2.6296630023427645, + "grad_norm": 0.28729817271232605, + "learning_rate": 5.416758686659488e-06, + "loss": 0.3455, + "step": 4864 + }, + { + "epoch": 2.630203640295549, + "grad_norm": 0.3301662802696228, + "learning_rate": 5.414877250439654e-06, + "loss": 0.4292, + "step": 4865 + }, + { + "epoch": 2.630744278248333, + "grad_norm": 0.29704442620277405, + "learning_rate": 5.412995755067375e-06, + "loss": 0.402, + "step": 4866 + }, + { + "epoch": 2.631284916201117, + "grad_norm": 0.30376604199409485, + "learning_rate": 5.41111420081091e-06, + "loss": 0.374, + "step": 4867 + }, + { + "epoch": 2.6318255541539015, + "grad_norm": 0.2995263636112213, + "learning_rate": 5.4092325879385264e-06, + "loss": 0.3908, + "step": 4868 + }, + { + "epoch": 2.632366192106686, + "grad_norm": 0.2971474528312683, + "learning_rate": 5.4073509167185045e-06, + "loss": 0.4179, + "step": 4869 + }, + { + "epoch": 2.6329068300594702, + "grad_norm": 0.2971806228160858, + "learning_rate": 5.405469187419126e-06, + "loss": 0.3567, + "step": 4870 + }, + { + "epoch": 2.6334474680122546, + "grad_norm": 0.2885863482952118, + "learning_rate": 5.403587400308685e-06, + "loss": 0.3911, + "step": 4871 + }, + { + "epoch": 2.633988105965039, + "grad_norm": 0.30982890725135803, + "learning_rate": 5.401705555655485e-06, + "loss": 0.4027, + "step": 4872 + }, + { + "epoch": 2.634528743917823, + "grad_norm": 0.29124149680137634, + "learning_rate": 5.399823653727837e-06, + "loss": 0.4034, + "step": 4873 + }, + { + "epoch": 2.6350693818706072, + "grad_norm": 0.3088896572589874, + "learning_rate": 5.3979416947940556e-06, + "loss": 0.4279, + "step": 4874 + }, + { + "epoch": 2.6356100198233916, + "grad_norm": 0.26539069414138794, + "learning_rate": 5.39605967912247e-06, + "loss": 0.3746, + "step": 4875 + }, + { + "epoch": 2.636150657776176, + "grad_norm": 0.31384748220443726, + "learning_rate": 5.3941776069814124e-06, + "loss": 0.4475, + "step": 4876 + }, + { + "epoch": 2.6366912957289603, + "grad_norm": 0.287557989358902, + "learning_rate": 5.392295478639226e-06, + "loss": 0.404, + "step": 4877 + }, + { + "epoch": 2.6372319336817442, + "grad_norm": 0.2968330681324005, + "learning_rate": 5.390413294364261e-06, + "loss": 0.4049, + "step": 4878 + }, + { + "epoch": 2.6377725716345286, + "grad_norm": 0.29474076628685, + "learning_rate": 5.388531054424878e-06, + "loss": 0.3785, + "step": 4879 + }, + { + "epoch": 2.638313209587313, + "grad_norm": 0.3032850921154022, + "learning_rate": 5.386648759089441e-06, + "loss": 0.4237, + "step": 4880 + }, + { + "epoch": 2.6388538475400973, + "grad_norm": 0.2936110496520996, + "learning_rate": 5.3847664086263264e-06, + "loss": 0.4111, + "step": 4881 + }, + { + "epoch": 2.6393944854928817, + "grad_norm": 0.3138837516307831, + "learning_rate": 5.382884003303913e-06, + "loss": 0.4136, + "step": 4882 + }, + { + "epoch": 2.6399351234456656, + "grad_norm": 0.28655844926834106, + "learning_rate": 5.381001543390592e-06, + "loss": 0.4019, + "step": 4883 + }, + { + "epoch": 2.6404757613984504, + "grad_norm": 0.3090074360370636, + "learning_rate": 5.379119029154763e-06, + "loss": 0.3926, + "step": 4884 + }, + { + "epoch": 2.6410163993512343, + "grad_norm": 0.3107156455516815, + "learning_rate": 5.3772364608648304e-06, + "loss": 0.4038, + "step": 4885 + }, + { + "epoch": 2.6415570373040187, + "grad_norm": 0.28442808985710144, + "learning_rate": 5.375353838789207e-06, + "loss": 0.3933, + "step": 4886 + }, + { + "epoch": 2.642097675256803, + "grad_norm": 0.2985711991786957, + "learning_rate": 5.373471163196314e-06, + "loss": 0.3881, + "step": 4887 + }, + { + "epoch": 2.6426383132095874, + "grad_norm": 0.29716822504997253, + "learning_rate": 5.371588434354579e-06, + "loss": 0.3739, + "step": 4888 + }, + { + "epoch": 2.643178951162372, + "grad_norm": 0.2983829975128174, + "learning_rate": 5.36970565253244e-06, + "loss": 0.4014, + "step": 4889 + }, + { + "epoch": 2.6437195891151557, + "grad_norm": 0.28274622559547424, + "learning_rate": 5.367822817998338e-06, + "loss": 0.3979, + "step": 4890 + }, + { + "epoch": 2.64426022706794, + "grad_norm": 0.2913801372051239, + "learning_rate": 5.365939931020725e-06, + "loss": 0.4069, + "step": 4891 + }, + { + "epoch": 2.6448008650207244, + "grad_norm": 0.37011346220970154, + "learning_rate": 5.364056991868063e-06, + "loss": 0.4528, + "step": 4892 + }, + { + "epoch": 2.645341502973509, + "grad_norm": 0.31129953265190125, + "learning_rate": 5.362174000808813e-06, + "loss": 0.3882, + "step": 4893 + }, + { + "epoch": 2.645882140926293, + "grad_norm": 0.279175341129303, + "learning_rate": 5.360290958111451e-06, + "loss": 0.3741, + "step": 4894 + }, + { + "epoch": 2.646422778879077, + "grad_norm": 0.290469765663147, + "learning_rate": 5.358407864044456e-06, + "loss": 0.3578, + "step": 4895 + }, + { + "epoch": 2.646963416831862, + "grad_norm": 0.3076058030128479, + "learning_rate": 5.35652471887632e-06, + "loss": 0.393, + "step": 4896 + }, + { + "epoch": 2.647504054784646, + "grad_norm": 0.3355569541454315, + "learning_rate": 5.354641522875535e-06, + "loss": 0.4244, + "step": 4897 + }, + { + "epoch": 2.64804469273743, + "grad_norm": 0.2836383879184723, + "learning_rate": 5.352758276310606e-06, + "loss": 0.4281, + "step": 4898 + }, + { + "epoch": 2.6485853306902145, + "grad_norm": 0.3006925582885742, + "learning_rate": 5.3508749794500395e-06, + "loss": 0.4293, + "step": 4899 + }, + { + "epoch": 2.649125968642999, + "grad_norm": 0.33312705159187317, + "learning_rate": 5.348991632562355e-06, + "loss": 0.3727, + "step": 4900 + }, + { + "epoch": 2.6496666065957832, + "grad_norm": 0.30963844060897827, + "learning_rate": 5.347108235916077e-06, + "loss": 0.3869, + "step": 4901 + }, + { + "epoch": 2.650207244548567, + "grad_norm": 0.32791048288345337, + "learning_rate": 5.345224789779735e-06, + "loss": 0.4117, + "step": 4902 + }, + { + "epoch": 2.6507478825013515, + "grad_norm": 0.2883923053741455, + "learning_rate": 5.343341294421868e-06, + "loss": 0.4054, + "step": 4903 + }, + { + "epoch": 2.651288520454136, + "grad_norm": 0.2899568974971771, + "learning_rate": 5.3414577501110255e-06, + "loss": 0.4046, + "step": 4904 + }, + { + "epoch": 2.6518291584069202, + "grad_norm": 0.3008023798465729, + "learning_rate": 5.339574157115752e-06, + "loss": 0.3975, + "step": 4905 + }, + { + "epoch": 2.6523697963597046, + "grad_norm": 0.31351393461227417, + "learning_rate": 5.337690515704612e-06, + "loss": 0.4381, + "step": 4906 + }, + { + "epoch": 2.6529104343124885, + "grad_norm": 0.2951430678367615, + "learning_rate": 5.335806826146171e-06, + "loss": 0.3764, + "step": 4907 + }, + { + "epoch": 2.653451072265273, + "grad_norm": 0.2831011414527893, + "learning_rate": 5.333923088709002e-06, + "loss": 0.4136, + "step": 4908 + }, + { + "epoch": 2.6539917102180572, + "grad_norm": 0.3104168176651001, + "learning_rate": 5.332039303661683e-06, + "loss": 0.4166, + "step": 4909 + }, + { + "epoch": 2.6545323481708416, + "grad_norm": 0.26301953196525574, + "learning_rate": 5.330155471272804e-06, + "loss": 0.3623, + "step": 4910 + }, + { + "epoch": 2.655072986123626, + "grad_norm": 0.3025093972682953, + "learning_rate": 5.328271591810956e-06, + "loss": 0.4161, + "step": 4911 + }, + { + "epoch": 2.6556136240764103, + "grad_norm": 0.2944580912590027, + "learning_rate": 5.326387665544739e-06, + "loss": 0.3918, + "step": 4912 + }, + { + "epoch": 2.6561542620291947, + "grad_norm": 0.2983967661857605, + "learning_rate": 5.32450369274276e-06, + "loss": 0.3895, + "step": 4913 + }, + { + "epoch": 2.6566948999819786, + "grad_norm": 0.29173171520233154, + "learning_rate": 5.3226196736736345e-06, + "loss": 0.3957, + "step": 4914 + }, + { + "epoch": 2.657235537934763, + "grad_norm": 0.3415560722351074, + "learning_rate": 5.320735608605979e-06, + "loss": 0.3912, + "step": 4915 + }, + { + "epoch": 2.6577761758875473, + "grad_norm": 0.287569522857666, + "learning_rate": 5.318851497808424e-06, + "loss": 0.3931, + "step": 4916 + }, + { + "epoch": 2.6583168138403317, + "grad_norm": 0.2986423671245575, + "learning_rate": 5.316967341549598e-06, + "loss": 0.4022, + "step": 4917 + }, + { + "epoch": 2.658857451793116, + "grad_norm": 0.2977146804332733, + "learning_rate": 5.315083140098145e-06, + "loss": 0.3984, + "step": 4918 + }, + { + "epoch": 2.6593980897459, + "grad_norm": 0.26954126358032227, + "learning_rate": 5.313198893722708e-06, + "loss": 0.4017, + "step": 4919 + }, + { + "epoch": 2.6599387276986843, + "grad_norm": 0.29893094301223755, + "learning_rate": 5.311314602691943e-06, + "loss": 0.4225, + "step": 4920 + }, + { + "epoch": 2.6604793656514687, + "grad_norm": 0.28754350543022156, + "learning_rate": 5.309430267274503e-06, + "loss": 0.4092, + "step": 4921 + }, + { + "epoch": 2.661020003604253, + "grad_norm": 0.29370546340942383, + "learning_rate": 5.307545887739059e-06, + "loss": 0.4352, + "step": 4922 + }, + { + "epoch": 2.6615606415570374, + "grad_norm": 0.28276485204696655, + "learning_rate": 5.305661464354278e-06, + "loss": 0.3883, + "step": 4923 + }, + { + "epoch": 2.6621012795098213, + "grad_norm": 0.31154152750968933, + "learning_rate": 5.303776997388842e-06, + "loss": 0.3647, + "step": 4924 + }, + { + "epoch": 2.662641917462606, + "grad_norm": 0.31019219756126404, + "learning_rate": 5.301892487111431e-06, + "loss": 0.4203, + "step": 4925 + }, + { + "epoch": 2.66318255541539, + "grad_norm": 0.28385746479034424, + "learning_rate": 5.300007933790737e-06, + "loss": 0.4078, + "step": 4926 + }, + { + "epoch": 2.6637231933681744, + "grad_norm": 0.298709511756897, + "learning_rate": 5.298123337695455e-06, + "loss": 0.4308, + "step": 4927 + }, + { + "epoch": 2.664263831320959, + "grad_norm": 0.2809273898601532, + "learning_rate": 5.296238699094288e-06, + "loss": 0.3719, + "step": 4928 + }, + { + "epoch": 2.664804469273743, + "grad_norm": 0.33656007051467896, + "learning_rate": 5.294354018255945e-06, + "loss": 0.4532, + "step": 4929 + }, + { + "epoch": 2.6653451072265275, + "grad_norm": 0.31055185198783875, + "learning_rate": 5.292469295449141e-06, + "loss": 0.3959, + "step": 4930 + }, + { + "epoch": 2.6658857451793114, + "grad_norm": 0.303205281496048, + "learning_rate": 5.290584530942593e-06, + "loss": 0.4183, + "step": 4931 + }, + { + "epoch": 2.666426383132096, + "grad_norm": 0.3017641603946686, + "learning_rate": 5.288699725005031e-06, + "loss": 0.3883, + "step": 4932 + }, + { + "epoch": 2.66696702108488, + "grad_norm": 0.34416377544403076, + "learning_rate": 5.286814877905186e-06, + "loss": 0.4114, + "step": 4933 + }, + { + "epoch": 2.6675076590376645, + "grad_norm": 0.29349640011787415, + "learning_rate": 5.284929989911793e-06, + "loss": 0.3976, + "step": 4934 + }, + { + "epoch": 2.668048296990449, + "grad_norm": 0.2900095582008362, + "learning_rate": 5.2830450612936e-06, + "loss": 0.3908, + "step": 4935 + }, + { + "epoch": 2.668588934943233, + "grad_norm": 0.31933918595314026, + "learning_rate": 5.281160092319358e-06, + "loss": 0.406, + "step": 4936 + }, + { + "epoch": 2.669129572896017, + "grad_norm": 0.32243990898132324, + "learning_rate": 5.2792750832578164e-06, + "loss": 0.4107, + "step": 4937 + }, + { + "epoch": 2.6696702108488015, + "grad_norm": 0.2958493232727051, + "learning_rate": 5.277390034377742e-06, + "loss": 0.4199, + "step": 4938 + }, + { + "epoch": 2.670210848801586, + "grad_norm": 0.31553322076797485, + "learning_rate": 5.275504945947898e-06, + "loss": 0.3843, + "step": 4939 + }, + { + "epoch": 2.6707514867543702, + "grad_norm": 0.3030598759651184, + "learning_rate": 5.273619818237058e-06, + "loss": 0.3627, + "step": 4940 + }, + { + "epoch": 2.6712921247071546, + "grad_norm": 0.2761203944683075, + "learning_rate": 5.271734651514001e-06, + "loss": 0.3991, + "step": 4941 + }, + { + "epoch": 2.671832762659939, + "grad_norm": 0.26242905855178833, + "learning_rate": 5.26984944604751e-06, + "loss": 0.3785, + "step": 4942 + }, + { + "epoch": 2.672373400612723, + "grad_norm": 0.3502192497253418, + "learning_rate": 5.267964202106375e-06, + "loss": 0.4364, + "step": 4943 + }, + { + "epoch": 2.6729140385655072, + "grad_norm": 0.3181464672088623, + "learning_rate": 5.26607891995939e-06, + "loss": 0.4247, + "step": 4944 + }, + { + "epoch": 2.6734546765182916, + "grad_norm": 0.2903212010860443, + "learning_rate": 5.264193599875353e-06, + "loss": 0.384, + "step": 4945 + }, + { + "epoch": 2.673995314471076, + "grad_norm": 0.30827388167381287, + "learning_rate": 5.2623082421230735e-06, + "loss": 0.4271, + "step": 4946 + }, + { + "epoch": 2.6745359524238603, + "grad_norm": 0.3044445514678955, + "learning_rate": 5.260422846971359e-06, + "loss": 0.4173, + "step": 4947 + }, + { + "epoch": 2.6750765903766442, + "grad_norm": 0.3259648382663727, + "learning_rate": 5.258537414689029e-06, + "loss": 0.4217, + "step": 4948 + }, + { + "epoch": 2.6756172283294286, + "grad_norm": 0.2715407907962799, + "learning_rate": 5.256651945544902e-06, + "loss": 0.3924, + "step": 4949 + }, + { + "epoch": 2.676157866282213, + "grad_norm": 0.3154553174972534, + "learning_rate": 5.254766439807807e-06, + "loss": 0.3936, + "step": 4950 + }, + { + "epoch": 2.6766985042349973, + "grad_norm": 0.3301718533039093, + "learning_rate": 5.252880897746573e-06, + "loss": 0.4239, + "step": 4951 + }, + { + "epoch": 2.6772391421877817, + "grad_norm": 0.3474384546279907, + "learning_rate": 5.25099531963004e-06, + "loss": 0.417, + "step": 4952 + }, + { + "epoch": 2.6777797801405656, + "grad_norm": 0.29553794860839844, + "learning_rate": 5.249109705727049e-06, + "loss": 0.3806, + "step": 4953 + }, + { + "epoch": 2.6783204180933504, + "grad_norm": 0.2954624593257904, + "learning_rate": 5.24722405630645e-06, + "loss": 0.4002, + "step": 4954 + }, + { + "epoch": 2.6788610560461343, + "grad_norm": 0.3270542621612549, + "learning_rate": 5.245338371637091e-06, + "loss": 0.397, + "step": 4955 + }, + { + "epoch": 2.6794016939989187, + "grad_norm": 0.3222622275352478, + "learning_rate": 5.243452651987833e-06, + "loss": 0.3858, + "step": 4956 + }, + { + "epoch": 2.679942331951703, + "grad_norm": 0.28066954016685486, + "learning_rate": 5.241566897627536e-06, + "loss": 0.3499, + "step": 4957 + }, + { + "epoch": 2.6804829699044874, + "grad_norm": 0.33865922689437866, + "learning_rate": 5.239681108825069e-06, + "loss": 0.4373, + "step": 4958 + }, + { + "epoch": 2.681023607857272, + "grad_norm": 0.318342387676239, + "learning_rate": 5.237795285849305e-06, + "loss": 0.3795, + "step": 4959 + }, + { + "epoch": 2.6815642458100557, + "grad_norm": 0.31130820512771606, + "learning_rate": 5.235909428969119e-06, + "loss": 0.4446, + "step": 4960 + }, + { + "epoch": 2.68210488376284, + "grad_norm": 0.29390275478363037, + "learning_rate": 5.234023538453396e-06, + "loss": 0.39, + "step": 4961 + }, + { + "epoch": 2.6826455217156244, + "grad_norm": 0.2983420491218567, + "learning_rate": 5.23213761457102e-06, + "loss": 0.3953, + "step": 4962 + }, + { + "epoch": 2.683186159668409, + "grad_norm": 0.3359537124633789, + "learning_rate": 5.230251657590884e-06, + "loss": 0.4291, + "step": 4963 + }, + { + "epoch": 2.683726797621193, + "grad_norm": 0.30198025703430176, + "learning_rate": 5.228365667781885e-06, + "loss": 0.376, + "step": 4964 + }, + { + "epoch": 2.684267435573977, + "grad_norm": 0.3557318449020386, + "learning_rate": 5.226479645412923e-06, + "loss": 0.451, + "step": 4965 + }, + { + "epoch": 2.6848080735267614, + "grad_norm": 0.328320175409317, + "learning_rate": 5.224593590752902e-06, + "loss": 0.3907, + "step": 4966 + }, + { + "epoch": 2.685348711479546, + "grad_norm": 0.31203341484069824, + "learning_rate": 5.222707504070737e-06, + "loss": 0.3777, + "step": 4967 + }, + { + "epoch": 2.68588934943233, + "grad_norm": 0.3094787001609802, + "learning_rate": 5.220821385635337e-06, + "loss": 0.3683, + "step": 4968 + }, + { + "epoch": 2.6864299873851145, + "grad_norm": 0.30369725823402405, + "learning_rate": 5.218935235715625e-06, + "loss": 0.4159, + "step": 4969 + }, + { + "epoch": 2.686970625337899, + "grad_norm": 0.3582439124584198, + "learning_rate": 5.2170490545805255e-06, + "loss": 0.4446, + "step": 4970 + }, + { + "epoch": 2.6875112632906832, + "grad_norm": 0.2925317585468292, + "learning_rate": 5.215162842498964e-06, + "loss": 0.3696, + "step": 4971 + }, + { + "epoch": 2.688051901243467, + "grad_norm": 0.3087409734725952, + "learning_rate": 5.213276599739875e-06, + "loss": 0.423, + "step": 4972 + }, + { + "epoch": 2.6885925391962515, + "grad_norm": 0.351330429315567, + "learning_rate": 5.211390326572196e-06, + "loss": 0.4392, + "step": 4973 + }, + { + "epoch": 2.689133177149036, + "grad_norm": 0.3118128478527069, + "learning_rate": 5.209504023264865e-06, + "loss": 0.4046, + "step": 4974 + }, + { + "epoch": 2.6896738151018202, + "grad_norm": 0.3023654520511627, + "learning_rate": 5.207617690086831e-06, + "loss": 0.4049, + "step": 4975 + }, + { + "epoch": 2.6902144530546046, + "grad_norm": 0.27988362312316895, + "learning_rate": 5.205731327307044e-06, + "loss": 0.4134, + "step": 4976 + }, + { + "epoch": 2.6907550910073885, + "grad_norm": 0.2854805290699005, + "learning_rate": 5.2038449351944585e-06, + "loss": 0.4033, + "step": 4977 + }, + { + "epoch": 2.691295728960173, + "grad_norm": 0.2994115352630615, + "learning_rate": 5.2019585140180295e-06, + "loss": 0.4255, + "step": 4978 + }, + { + "epoch": 2.6918363669129572, + "grad_norm": 0.2791630029678345, + "learning_rate": 5.200072064046724e-06, + "loss": 0.3844, + "step": 4979 + }, + { + "epoch": 2.6923770048657416, + "grad_norm": 0.28842294216156006, + "learning_rate": 5.1981855855495035e-06, + "loss": 0.3967, + "step": 4980 + }, + { + "epoch": 2.692917642818526, + "grad_norm": 0.31106212735176086, + "learning_rate": 5.1962990787953436e-06, + "loss": 0.3962, + "step": 4981 + }, + { + "epoch": 2.69345828077131, + "grad_norm": 0.30779221653938293, + "learning_rate": 5.194412544053217e-06, + "loss": 0.4067, + "step": 4982 + }, + { + "epoch": 2.6939989187240947, + "grad_norm": 0.31400802731513977, + "learning_rate": 5.192525981592101e-06, + "loss": 0.4035, + "step": 4983 + }, + { + "epoch": 2.6945395566768786, + "grad_norm": 0.3023657202720642, + "learning_rate": 5.190639391680981e-06, + "loss": 0.4192, + "step": 4984 + }, + { + "epoch": 2.695080194629663, + "grad_norm": 0.27142763137817383, + "learning_rate": 5.188752774588841e-06, + "loss": 0.3672, + "step": 4985 + }, + { + "epoch": 2.6956208325824473, + "grad_norm": 0.33573058247566223, + "learning_rate": 5.186866130584674e-06, + "loss": 0.421, + "step": 4986 + }, + { + "epoch": 2.6961614705352317, + "grad_norm": 0.3077828288078308, + "learning_rate": 5.184979459937471e-06, + "loss": 0.4268, + "step": 4987 + }, + { + "epoch": 2.696702108488016, + "grad_norm": 0.28344956040382385, + "learning_rate": 5.183092762916234e-06, + "loss": 0.4138, + "step": 4988 + }, + { + "epoch": 2.6972427464408, + "grad_norm": 0.2885357737541199, + "learning_rate": 5.1812060397899624e-06, + "loss": 0.4027, + "step": 4989 + }, + { + "epoch": 2.6977833843935843, + "grad_norm": 0.2987276315689087, + "learning_rate": 5.179319290827661e-06, + "loss": 0.3597, + "step": 4990 + }, + { + "epoch": 2.6983240223463687, + "grad_norm": 0.30471739172935486, + "learning_rate": 5.177432516298341e-06, + "loss": 0.4168, + "step": 4991 + }, + { + "epoch": 2.698864660299153, + "grad_norm": 0.3163413405418396, + "learning_rate": 5.175545716471014e-06, + "loss": 0.4396, + "step": 4992 + }, + { + "epoch": 2.6994052982519374, + "grad_norm": 0.28372013568878174, + "learning_rate": 5.173658891614699e-06, + "loss": 0.3795, + "step": 4993 + }, + { + "epoch": 2.6999459362047213, + "grad_norm": 0.2838355004787445, + "learning_rate": 5.171772041998412e-06, + "loss": 0.3829, + "step": 4994 + }, + { + "epoch": 2.7004865741575057, + "grad_norm": 0.3122652471065521, + "learning_rate": 5.16988516789118e-06, + "loss": 0.3888, + "step": 4995 + }, + { + "epoch": 2.70102721211029, + "grad_norm": 0.32194578647613525, + "learning_rate": 5.167998269562028e-06, + "loss": 0.4292, + "step": 4996 + }, + { + "epoch": 2.7015678500630744, + "grad_norm": 0.27087098360061646, + "learning_rate": 5.166111347279987e-06, + "loss": 0.3972, + "step": 4997 + }, + { + "epoch": 2.702108488015859, + "grad_norm": 0.2952033579349518, + "learning_rate": 5.164224401314092e-06, + "loss": 0.3987, + "step": 4998 + }, + { + "epoch": 2.702649125968643, + "grad_norm": 0.3722441494464874, + "learning_rate": 5.16233743193338e-06, + "loss": 0.4366, + "step": 4999 + }, + { + "epoch": 2.7031897639214275, + "grad_norm": 0.27760955691337585, + "learning_rate": 5.16045043940689e-06, + "loss": 0.4061, + "step": 5000 + }, + { + "epoch": 2.7037304018742114, + "grad_norm": 0.29309314489364624, + "learning_rate": 5.158563424003669e-06, + "loss": 0.4173, + "step": 5001 + }, + { + "epoch": 2.704271039826996, + "grad_norm": 0.30780085921287537, + "learning_rate": 5.15667638599276e-06, + "loss": 0.4378, + "step": 5002 + }, + { + "epoch": 2.70481167777978, + "grad_norm": 0.2956397533416748, + "learning_rate": 5.154789325643218e-06, + "loss": 0.4055, + "step": 5003 + }, + { + "epoch": 2.7053523157325645, + "grad_norm": 0.3192119598388672, + "learning_rate": 5.152902243224093e-06, + "loss": 0.4091, + "step": 5004 + }, + { + "epoch": 2.705892953685349, + "grad_norm": 0.28897231817245483, + "learning_rate": 5.151015139004445e-06, + "loss": 0.3974, + "step": 5005 + }, + { + "epoch": 2.706433591638133, + "grad_norm": 0.34239473938941956, + "learning_rate": 5.149128013253332e-06, + "loss": 0.4338, + "step": 5006 + }, + { + "epoch": 2.706974229590917, + "grad_norm": 0.29271113872528076, + "learning_rate": 5.147240866239817e-06, + "loss": 0.3649, + "step": 5007 + }, + { + "epoch": 2.7075148675437015, + "grad_norm": 0.3110845685005188, + "learning_rate": 5.145353698232966e-06, + "loss": 0.4095, + "step": 5008 + }, + { + "epoch": 2.708055505496486, + "grad_norm": 0.3172319829463959, + "learning_rate": 5.143466509501849e-06, + "loss": 0.3673, + "step": 5009 + }, + { + "epoch": 2.7085961434492702, + "grad_norm": 0.29760801792144775, + "learning_rate": 5.141579300315536e-06, + "loss": 0.385, + "step": 5010 + }, + { + "epoch": 2.709136781402054, + "grad_norm": 0.29533249139785767, + "learning_rate": 5.139692070943104e-06, + "loss": 0.368, + "step": 5011 + }, + { + "epoch": 2.709677419354839, + "grad_norm": 0.3004533648490906, + "learning_rate": 5.137804821653629e-06, + "loss": 0.4129, + "step": 5012 + }, + { + "epoch": 2.710218057307623, + "grad_norm": 0.2970389425754547, + "learning_rate": 5.135917552716194e-06, + "loss": 0.3935, + "step": 5013 + }, + { + "epoch": 2.7107586952604072, + "grad_norm": 0.34341931343078613, + "learning_rate": 5.1340302643998775e-06, + "loss": 0.4217, + "step": 5014 + }, + { + "epoch": 2.7112993332131916, + "grad_norm": 0.28611016273498535, + "learning_rate": 5.132142956973773e-06, + "loss": 0.3685, + "step": 5015 + }, + { + "epoch": 2.711839971165976, + "grad_norm": 0.3188721835613251, + "learning_rate": 5.130255630706962e-06, + "loss": 0.3965, + "step": 5016 + }, + { + "epoch": 2.7123806091187603, + "grad_norm": 0.32119816541671753, + "learning_rate": 5.128368285868542e-06, + "loss": 0.4312, + "step": 5017 + }, + { + "epoch": 2.7129212470715443, + "grad_norm": 0.3183414340019226, + "learning_rate": 5.126480922727602e-06, + "loss": 0.3982, + "step": 5018 + }, + { + "epoch": 2.7134618850243286, + "grad_norm": 0.2703632414340973, + "learning_rate": 5.124593541553243e-06, + "loss": 0.395, + "step": 5019 + }, + { + "epoch": 2.714002522977113, + "grad_norm": 0.30107101798057556, + "learning_rate": 5.122706142614562e-06, + "loss": 0.3866, + "step": 5020 + }, + { + "epoch": 2.7145431609298973, + "grad_norm": 0.35960131883621216, + "learning_rate": 5.120818726180662e-06, + "loss": 0.4124, + "step": 5021 + }, + { + "epoch": 2.7150837988826817, + "grad_norm": 0.310278058052063, + "learning_rate": 5.118931292520647e-06, + "loss": 0.3942, + "step": 5022 + }, + { + "epoch": 2.7156244368354656, + "grad_norm": 0.3234735429286957, + "learning_rate": 5.117043841903624e-06, + "loss": 0.4256, + "step": 5023 + }, + { + "epoch": 2.71616507478825, + "grad_norm": 0.30350586771965027, + "learning_rate": 5.115156374598703e-06, + "loss": 0.3811, + "step": 5024 + }, + { + "epoch": 2.7167057127410343, + "grad_norm": 0.3205268979072571, + "learning_rate": 5.113268890874994e-06, + "loss": 0.4017, + "step": 5025 + }, + { + "epoch": 2.7172463506938187, + "grad_norm": 0.2997487783432007, + "learning_rate": 5.111381391001612e-06, + "loss": 0.4003, + "step": 5026 + }, + { + "epoch": 2.717786988646603, + "grad_norm": 0.28808310627937317, + "learning_rate": 5.109493875247672e-06, + "loss": 0.3904, + "step": 5027 + }, + { + "epoch": 2.7183276265993874, + "grad_norm": 0.30357658863067627, + "learning_rate": 5.1076063438822965e-06, + "loss": 0.3867, + "step": 5028 + }, + { + "epoch": 2.718868264552172, + "grad_norm": 0.32023051381111145, + "learning_rate": 5.105718797174601e-06, + "loss": 0.4064, + "step": 5029 + }, + { + "epoch": 2.7194089025049557, + "grad_norm": 0.2869778871536255, + "learning_rate": 5.103831235393714e-06, + "loss": 0.3559, + "step": 5030 + }, + { + "epoch": 2.71994954045774, + "grad_norm": 0.38330546021461487, + "learning_rate": 5.1019436588087555e-06, + "loss": 0.4117, + "step": 5031 + }, + { + "epoch": 2.7204901784105244, + "grad_norm": 0.30030086636543274, + "learning_rate": 5.100056067688854e-06, + "loss": 0.4111, + "step": 5032 + }, + { + "epoch": 2.721030816363309, + "grad_norm": 0.29235854744911194, + "learning_rate": 5.098168462303141e-06, + "loss": 0.3544, + "step": 5033 + }, + { + "epoch": 2.721571454316093, + "grad_norm": 0.34840068221092224, + "learning_rate": 5.096280842920748e-06, + "loss": 0.4158, + "step": 5034 + }, + { + "epoch": 2.722112092268877, + "grad_norm": 0.34116119146347046, + "learning_rate": 5.094393209810806e-06, + "loss": 0.4145, + "step": 5035 + }, + { + "epoch": 2.7226527302216614, + "grad_norm": 0.30736395716667175, + "learning_rate": 5.092505563242451e-06, + "loss": 0.4202, + "step": 5036 + }, + { + "epoch": 2.723193368174446, + "grad_norm": 0.32079383730888367, + "learning_rate": 5.09061790348482e-06, + "loss": 0.4001, + "step": 5037 + }, + { + "epoch": 2.72373400612723, + "grad_norm": 0.32415834069252014, + "learning_rate": 5.088730230807054e-06, + "loss": 0.4203, + "step": 5038 + }, + { + "epoch": 2.7242746440800145, + "grad_norm": 0.3238414525985718, + "learning_rate": 5.086842545478291e-06, + "loss": 0.4165, + "step": 5039 + }, + { + "epoch": 2.7248152820327984, + "grad_norm": 0.2785307765007019, + "learning_rate": 5.084954847767677e-06, + "loss": 0.3935, + "step": 5040 + }, + { + "epoch": 2.7253559199855832, + "grad_norm": 0.34599483013153076, + "learning_rate": 5.083067137944354e-06, + "loss": 0.4417, + "step": 5041 + }, + { + "epoch": 2.725896557938367, + "grad_norm": 0.3113754689693451, + "learning_rate": 5.081179416277469e-06, + "loss": 0.4182, + "step": 5042 + }, + { + "epoch": 2.7264371958911515, + "grad_norm": 0.2806205451488495, + "learning_rate": 5.079291683036169e-06, + "loss": 0.3837, + "step": 5043 + }, + { + "epoch": 2.726977833843936, + "grad_norm": 0.3233984410762787, + "learning_rate": 5.077403938489607e-06, + "loss": 0.4077, + "step": 5044 + }, + { + "epoch": 2.7275184717967202, + "grad_norm": 0.32577744126319885, + "learning_rate": 5.07551618290693e-06, + "loss": 0.3978, + "step": 5045 + }, + { + "epoch": 2.7280591097495046, + "grad_norm": 0.35160160064697266, + "learning_rate": 5.073628416557293e-06, + "loss": 0.4106, + "step": 5046 + }, + { + "epoch": 2.7285997477022885, + "grad_norm": 0.34330466389656067, + "learning_rate": 5.07174063970985e-06, + "loss": 0.4159, + "step": 5047 + }, + { + "epoch": 2.729140385655073, + "grad_norm": 0.289131224155426, + "learning_rate": 5.069852852633757e-06, + "loss": 0.3944, + "step": 5048 + }, + { + "epoch": 2.7296810236078572, + "grad_norm": 0.28988659381866455, + "learning_rate": 5.06796505559817e-06, + "loss": 0.3709, + "step": 5049 + }, + { + "epoch": 2.7302216615606416, + "grad_norm": 0.3208027780056, + "learning_rate": 5.06607724887225e-06, + "loss": 0.4192, + "step": 5050 + }, + { + "epoch": 2.730762299513426, + "grad_norm": 0.3088265657424927, + "learning_rate": 5.064189432725154e-06, + "loss": 0.4046, + "step": 5051 + }, + { + "epoch": 2.73130293746621, + "grad_norm": 0.2850970923900604, + "learning_rate": 5.062301607426047e-06, + "loss": 0.3722, + "step": 5052 + }, + { + "epoch": 2.7318435754189943, + "grad_norm": 0.32113608717918396, + "learning_rate": 5.0604137732440875e-06, + "loss": 0.424, + "step": 5053 + }, + { + "epoch": 2.7323842133717786, + "grad_norm": 0.29261910915374756, + "learning_rate": 5.058525930448443e-06, + "loss": 0.3948, + "step": 5054 + }, + { + "epoch": 2.732924851324563, + "grad_norm": 0.28865715861320496, + "learning_rate": 5.056638079308277e-06, + "loss": 0.3758, + "step": 5055 + }, + { + "epoch": 2.7334654892773473, + "grad_norm": 0.3075926601886749, + "learning_rate": 5.054750220092757e-06, + "loss": 0.3927, + "step": 5056 + }, + { + "epoch": 2.7340061272301317, + "grad_norm": 0.30959898233413696, + "learning_rate": 5.05286235307105e-06, + "loss": 0.4275, + "step": 5057 + }, + { + "epoch": 2.734546765182916, + "grad_norm": 0.2836802005767822, + "learning_rate": 5.050974478512324e-06, + "loss": 0.3876, + "step": 5058 + }, + { + "epoch": 2.7350874031357, + "grad_norm": 0.2989635169506073, + "learning_rate": 5.049086596685749e-06, + "loss": 0.4404, + "step": 5059 + }, + { + "epoch": 2.7356280410884843, + "grad_norm": 0.31573420763015747, + "learning_rate": 5.047198707860496e-06, + "loss": 0.3916, + "step": 5060 + }, + { + "epoch": 2.7361686790412687, + "grad_norm": 0.33187058568000793, + "learning_rate": 5.045310812305737e-06, + "loss": 0.4139, + "step": 5061 + }, + { + "epoch": 2.736709316994053, + "grad_norm": 0.3388705551624298, + "learning_rate": 5.043422910290645e-06, + "loss": 0.3856, + "step": 5062 + }, + { + "epoch": 2.7372499549468374, + "grad_norm": 0.30032768845558167, + "learning_rate": 5.041535002084394e-06, + "loss": 0.3926, + "step": 5063 + }, + { + "epoch": 2.7377905928996213, + "grad_norm": 0.31506726145744324, + "learning_rate": 5.0396470879561564e-06, + "loss": 0.4287, + "step": 5064 + }, + { + "epoch": 2.7383312308524057, + "grad_norm": 0.2879992425441742, + "learning_rate": 5.037759168175109e-06, + "loss": 0.3782, + "step": 5065 + }, + { + "epoch": 2.73887186880519, + "grad_norm": 0.2926962673664093, + "learning_rate": 5.035871243010427e-06, + "loss": 0.429, + "step": 5066 + }, + { + "epoch": 2.7394125067579744, + "grad_norm": 0.3311244249343872, + "learning_rate": 5.0339833127312885e-06, + "loss": 0.4295, + "step": 5067 + }, + { + "epoch": 2.739953144710759, + "grad_norm": 0.34304389357566833, + "learning_rate": 5.032095377606873e-06, + "loss": 0.4186, + "step": 5068 + }, + { + "epoch": 2.7404937826635427, + "grad_norm": 0.27116817235946655, + "learning_rate": 5.030207437906354e-06, + "loss": 0.3807, + "step": 5069 + }, + { + "epoch": 2.7410344206163275, + "grad_norm": 0.3297407031059265, + "learning_rate": 5.028319493898916e-06, + "loss": 0.4224, + "step": 5070 + }, + { + "epoch": 2.7415750585691114, + "grad_norm": 0.3084923326969147, + "learning_rate": 5.026431545853734e-06, + "loss": 0.3962, + "step": 5071 + }, + { + "epoch": 2.742115696521896, + "grad_norm": 0.29288437962532043, + "learning_rate": 5.024543594039991e-06, + "loss": 0.4289, + "step": 5072 + }, + { + "epoch": 2.74265633447468, + "grad_norm": 0.30632689595222473, + "learning_rate": 5.022655638726866e-06, + "loss": 0.3883, + "step": 5073 + }, + { + "epoch": 2.7431969724274645, + "grad_norm": 0.3066025972366333, + "learning_rate": 5.020767680183543e-06, + "loss": 0.4027, + "step": 5074 + }, + { + "epoch": 2.743737610380249, + "grad_norm": 0.32585495710372925, + "learning_rate": 5.018879718679199e-06, + "loss": 0.4198, + "step": 5075 + }, + { + "epoch": 2.744278248333033, + "grad_norm": 0.2831990718841553, + "learning_rate": 5.0169917544830205e-06, + "loss": 0.4104, + "step": 5076 + }, + { + "epoch": 2.744818886285817, + "grad_norm": 0.29489606618881226, + "learning_rate": 5.015103787864187e-06, + "loss": 0.3778, + "step": 5077 + }, + { + "epoch": 2.7453595242386015, + "grad_norm": 0.3362656831741333, + "learning_rate": 5.013215819091886e-06, + "loss": 0.438, + "step": 5078 + }, + { + "epoch": 2.745900162191386, + "grad_norm": 0.27341189980506897, + "learning_rate": 5.0113278484352945e-06, + "loss": 0.3881, + "step": 5079 + }, + { + "epoch": 2.7464408001441702, + "grad_norm": 0.30783629417419434, + "learning_rate": 5.009439876163601e-06, + "loss": 0.3815, + "step": 5080 + }, + { + "epoch": 2.746981438096954, + "grad_norm": 0.3110550343990326, + "learning_rate": 5.007551902545986e-06, + "loss": 0.4104, + "step": 5081 + }, + { + "epoch": 2.7475220760497385, + "grad_norm": 0.32011663913726807, + "learning_rate": 5.0056639278516335e-06, + "loss": 0.4023, + "step": 5082 + }, + { + "epoch": 2.748062714002523, + "grad_norm": 0.29512232542037964, + "learning_rate": 5.00377595234973e-06, + "loss": 0.4089, + "step": 5083 + }, + { + "epoch": 2.7486033519553073, + "grad_norm": 0.2946411371231079, + "learning_rate": 5.0018879763094575e-06, + "loss": 0.3855, + "step": 5084 + }, + { + "epoch": 2.7491439899080916, + "grad_norm": 0.31604093313217163, + "learning_rate": 5e-06, + "loss": 0.3985, + "step": 5085 + }, + { + "epoch": 2.749684627860876, + "grad_norm": 0.29789918661117554, + "learning_rate": 4.998112023690543e-06, + "loss": 0.4138, + "step": 5086 + }, + { + "epoch": 2.7502252658136603, + "grad_norm": 0.30474555492401123, + "learning_rate": 4.996224047650271e-06, + "loss": 0.4167, + "step": 5087 + }, + { + "epoch": 2.7507659037664443, + "grad_norm": 0.31081917881965637, + "learning_rate": 4.994336072148367e-06, + "loss": 0.4325, + "step": 5088 + }, + { + "epoch": 2.7513065417192286, + "grad_norm": 0.2845231592655182, + "learning_rate": 4.992448097454016e-06, + "loss": 0.3983, + "step": 5089 + }, + { + "epoch": 2.751847179672013, + "grad_norm": 0.29780399799346924, + "learning_rate": 4.9905601238364006e-06, + "loss": 0.4036, + "step": 5090 + }, + { + "epoch": 2.7523878176247973, + "grad_norm": 0.29843631386756897, + "learning_rate": 4.9886721515647055e-06, + "loss": 0.3982, + "step": 5091 + }, + { + "epoch": 2.7529284555775817, + "grad_norm": 0.29134443402290344, + "learning_rate": 4.986784180908117e-06, + "loss": 0.3975, + "step": 5092 + }, + { + "epoch": 2.7534690935303656, + "grad_norm": 0.2783971130847931, + "learning_rate": 4.984896212135814e-06, + "loss": 0.3691, + "step": 5093 + }, + { + "epoch": 2.75400973148315, + "grad_norm": 0.3527921438217163, + "learning_rate": 4.983008245516981e-06, + "loss": 0.4279, + "step": 5094 + }, + { + "epoch": 2.7545503694359343, + "grad_norm": 0.28707146644592285, + "learning_rate": 4.981120281320801e-06, + "loss": 0.3835, + "step": 5095 + }, + { + "epoch": 2.7550910073887187, + "grad_norm": 0.30036625266075134, + "learning_rate": 4.979232319816461e-06, + "loss": 0.4288, + "step": 5096 + }, + { + "epoch": 2.755631645341503, + "grad_norm": 0.3112487494945526, + "learning_rate": 4.977344361273135e-06, + "loss": 0.399, + "step": 5097 + }, + { + "epoch": 2.756172283294287, + "grad_norm": 0.31154346466064453, + "learning_rate": 4.97545640596001e-06, + "loss": 0.3928, + "step": 5098 + }, + { + "epoch": 2.756712921247072, + "grad_norm": 0.2694031596183777, + "learning_rate": 4.973568454146267e-06, + "loss": 0.3935, + "step": 5099 + }, + { + "epoch": 2.7572535591998557, + "grad_norm": 0.3083007037639618, + "learning_rate": 4.971680506101086e-06, + "loss": 0.4266, + "step": 5100 + }, + { + "epoch": 2.75779419715264, + "grad_norm": 0.307720810174942, + "learning_rate": 4.9697925620936464e-06, + "loss": 0.4154, + "step": 5101 + }, + { + "epoch": 2.7583348351054244, + "grad_norm": 0.3052947521209717, + "learning_rate": 4.967904622393128e-06, + "loss": 0.3713, + "step": 5102 + }, + { + "epoch": 2.758875473058209, + "grad_norm": 0.29163241386413574, + "learning_rate": 4.966016687268711e-06, + "loss": 0.3715, + "step": 5103 + }, + { + "epoch": 2.759416111010993, + "grad_norm": 0.30811169743537903, + "learning_rate": 4.964128756989575e-06, + "loss": 0.45, + "step": 5104 + }, + { + "epoch": 2.759956748963777, + "grad_norm": 0.31915563344955444, + "learning_rate": 4.9622408318248925e-06, + "loss": 0.3916, + "step": 5105 + }, + { + "epoch": 2.7604973869165614, + "grad_norm": 0.34357088804244995, + "learning_rate": 4.960352912043845e-06, + "loss": 0.4074, + "step": 5106 + }, + { + "epoch": 2.761038024869346, + "grad_norm": 0.28964367508888245, + "learning_rate": 4.958464997915607e-06, + "loss": 0.4198, + "step": 5107 + }, + { + "epoch": 2.76157866282213, + "grad_norm": 0.28185534477233887, + "learning_rate": 4.9565770897093565e-06, + "loss": 0.3914, + "step": 5108 + }, + { + "epoch": 2.7621193007749145, + "grad_norm": 0.3085354268550873, + "learning_rate": 4.954689187694265e-06, + "loss": 0.3799, + "step": 5109 + }, + { + "epoch": 2.7626599387276984, + "grad_norm": 0.3020288050174713, + "learning_rate": 4.952801292139505e-06, + "loss": 0.4126, + "step": 5110 + }, + { + "epoch": 2.7632005766804832, + "grad_norm": 0.2997831404209137, + "learning_rate": 4.9509134033142525e-06, + "loss": 0.4141, + "step": 5111 + }, + { + "epoch": 2.763741214633267, + "grad_norm": 0.2812916934490204, + "learning_rate": 4.9490255214876785e-06, + "loss": 0.4075, + "step": 5112 + }, + { + "epoch": 2.7642818525860515, + "grad_norm": 0.3080768883228302, + "learning_rate": 4.947137646928952e-06, + "loss": 0.4211, + "step": 5113 + }, + { + "epoch": 2.764822490538836, + "grad_norm": 0.3185937702655792, + "learning_rate": 4.945249779907244e-06, + "loss": 0.443, + "step": 5114 + }, + { + "epoch": 2.7653631284916202, + "grad_norm": 0.2650339603424072, + "learning_rate": 4.9433619206917234e-06, + "loss": 0.354, + "step": 5115 + }, + { + "epoch": 2.7659037664444046, + "grad_norm": 0.30960190296173096, + "learning_rate": 4.941474069551559e-06, + "loss": 0.4216, + "step": 5116 + }, + { + "epoch": 2.7664444043971885, + "grad_norm": 0.3096233904361725, + "learning_rate": 4.939586226755913e-06, + "loss": 0.4039, + "step": 5117 + }, + { + "epoch": 2.766985042349973, + "grad_norm": 0.28972771763801575, + "learning_rate": 4.937698392573955e-06, + "loss": 0.39, + "step": 5118 + }, + { + "epoch": 2.7675256803027573, + "grad_norm": 0.3311610519886017, + "learning_rate": 4.935810567274846e-06, + "loss": 0.4151, + "step": 5119 + }, + { + "epoch": 2.7680663182555416, + "grad_norm": 0.29001107811927795, + "learning_rate": 4.933922751127753e-06, + "loss": 0.395, + "step": 5120 + }, + { + "epoch": 2.768606956208326, + "grad_norm": 0.3485519289970398, + "learning_rate": 4.932034944401832e-06, + "loss": 0.4181, + "step": 5121 + }, + { + "epoch": 2.76914759416111, + "grad_norm": 0.2745487093925476, + "learning_rate": 4.930147147366245e-06, + "loss": 0.3793, + "step": 5122 + }, + { + "epoch": 2.7696882321138943, + "grad_norm": 0.35208848118782043, + "learning_rate": 4.928259360290151e-06, + "loss": 0.4083, + "step": 5123 + }, + { + "epoch": 2.7702288700666786, + "grad_norm": 0.3201996088027954, + "learning_rate": 4.926371583442709e-06, + "loss": 0.3866, + "step": 5124 + }, + { + "epoch": 2.770769508019463, + "grad_norm": 0.31646934151649475, + "learning_rate": 4.924483817093071e-06, + "loss": 0.4419, + "step": 5125 + }, + { + "epoch": 2.7713101459722473, + "grad_norm": 0.2841111719608307, + "learning_rate": 4.922596061510394e-06, + "loss": 0.3893, + "step": 5126 + }, + { + "epoch": 2.7718507839250317, + "grad_norm": 0.33771705627441406, + "learning_rate": 4.920708316963831e-06, + "loss": 0.419, + "step": 5127 + }, + { + "epoch": 2.772391421877816, + "grad_norm": 0.30035245418548584, + "learning_rate": 4.918820583722533e-06, + "loss": 0.3993, + "step": 5128 + }, + { + "epoch": 2.7729320598306, + "grad_norm": 0.29486024379730225, + "learning_rate": 4.916932862055648e-06, + "loss": 0.4014, + "step": 5129 + }, + { + "epoch": 2.7734726977833843, + "grad_norm": 0.31270942091941833, + "learning_rate": 4.915045152232324e-06, + "loss": 0.3945, + "step": 5130 + }, + { + "epoch": 2.7740133357361687, + "grad_norm": 0.30171117186546326, + "learning_rate": 4.9131574545217095e-06, + "loss": 0.3831, + "step": 5131 + }, + { + "epoch": 2.774553973688953, + "grad_norm": 0.31151530146598816, + "learning_rate": 4.911269769192949e-06, + "loss": 0.4271, + "step": 5132 + }, + { + "epoch": 2.7750946116417374, + "grad_norm": 0.33287256956100464, + "learning_rate": 4.909382096515182e-06, + "loss": 0.4108, + "step": 5133 + }, + { + "epoch": 2.7756352495945213, + "grad_norm": 0.280910849571228, + "learning_rate": 4.90749443675755e-06, + "loss": 0.3765, + "step": 5134 + }, + { + "epoch": 2.7761758875473057, + "grad_norm": 0.2836659252643585, + "learning_rate": 4.9056067901891945e-06, + "loss": 0.3699, + "step": 5135 + }, + { + "epoch": 2.77671652550009, + "grad_norm": 0.32560449838638306, + "learning_rate": 4.903719157079254e-06, + "loss": 0.4183, + "step": 5136 + }, + { + "epoch": 2.7772571634528744, + "grad_norm": 0.3005795180797577, + "learning_rate": 4.90183153769686e-06, + "loss": 0.3789, + "step": 5137 + }, + { + "epoch": 2.777797801405659, + "grad_norm": 0.30739063024520874, + "learning_rate": 4.8999439323111465e-06, + "loss": 0.4145, + "step": 5138 + }, + { + "epoch": 2.7783384393584427, + "grad_norm": 0.26225677132606506, + "learning_rate": 4.898056341191246e-06, + "loss": 0.3773, + "step": 5139 + }, + { + "epoch": 2.7788790773112275, + "grad_norm": 0.32766658067703247, + "learning_rate": 4.896168764606289e-06, + "loss": 0.4375, + "step": 5140 + }, + { + "epoch": 2.7794197152640114, + "grad_norm": 0.29584774374961853, + "learning_rate": 4.8942812028254e-06, + "loss": 0.3759, + "step": 5141 + }, + { + "epoch": 2.779960353216796, + "grad_norm": 0.2836137115955353, + "learning_rate": 4.892393656117705e-06, + "loss": 0.3962, + "step": 5142 + }, + { + "epoch": 2.78050099116958, + "grad_norm": 0.2864339053630829, + "learning_rate": 4.890506124752328e-06, + "loss": 0.391, + "step": 5143 + }, + { + "epoch": 2.7810416291223645, + "grad_norm": 0.3437691628932953, + "learning_rate": 4.88861860899839e-06, + "loss": 0.39, + "step": 5144 + }, + { + "epoch": 2.781582267075149, + "grad_norm": 0.30174973607063293, + "learning_rate": 4.886731109125007e-06, + "loss": 0.3956, + "step": 5145 + }, + { + "epoch": 2.782122905027933, + "grad_norm": 0.30526629090309143, + "learning_rate": 4.884843625401298e-06, + "loss": 0.4309, + "step": 5146 + }, + { + "epoch": 2.782663542980717, + "grad_norm": 0.2743600308895111, + "learning_rate": 4.882956158096376e-06, + "loss": 0.3689, + "step": 5147 + }, + { + "epoch": 2.7832041809335015, + "grad_norm": 0.35046494007110596, + "learning_rate": 4.881068707479355e-06, + "loss": 0.4318, + "step": 5148 + }, + { + "epoch": 2.783744818886286, + "grad_norm": 0.2684626579284668, + "learning_rate": 4.87918127381934e-06, + "loss": 0.372, + "step": 5149 + }, + { + "epoch": 2.7842854568390702, + "grad_norm": 0.28792130947113037, + "learning_rate": 4.87729385738544e-06, + "loss": 0.3874, + "step": 5150 + }, + { + "epoch": 2.784826094791854, + "grad_norm": 0.3143690824508667, + "learning_rate": 4.8754064584467585e-06, + "loss": 0.4189, + "step": 5151 + }, + { + "epoch": 2.7853667327446385, + "grad_norm": 0.2900733947753906, + "learning_rate": 4.873519077272398e-06, + "loss": 0.3864, + "step": 5152 + }, + { + "epoch": 2.785907370697423, + "grad_norm": 0.3068050742149353, + "learning_rate": 4.871631714131461e-06, + "loss": 0.425, + "step": 5153 + }, + { + "epoch": 2.7864480086502073, + "grad_norm": 0.29391971230506897, + "learning_rate": 4.869744369293039e-06, + "loss": 0.3719, + "step": 5154 + }, + { + "epoch": 2.7869886466029916, + "grad_norm": 0.30774518847465515, + "learning_rate": 4.867857043026229e-06, + "loss": 0.4271, + "step": 5155 + }, + { + "epoch": 2.787529284555776, + "grad_norm": 0.32500559091567993, + "learning_rate": 4.8659697356001225e-06, + "loss": 0.4138, + "step": 5156 + }, + { + "epoch": 2.7880699225085603, + "grad_norm": 0.2907654941082001, + "learning_rate": 4.864082447283809e-06, + "loss": 0.3994, + "step": 5157 + }, + { + "epoch": 2.7886105604613443, + "grad_norm": 0.2912384867668152, + "learning_rate": 4.862195178346372e-06, + "loss": 0.3792, + "step": 5158 + }, + { + "epoch": 2.7891511984141286, + "grad_norm": 0.27517130970954895, + "learning_rate": 4.860307929056897e-06, + "loss": 0.39, + "step": 5159 + }, + { + "epoch": 2.789691836366913, + "grad_norm": 0.2983931601047516, + "learning_rate": 4.858420699684464e-06, + "loss": 0.3849, + "step": 5160 + }, + { + "epoch": 2.7902324743196973, + "grad_norm": 0.3149608075618744, + "learning_rate": 4.856533490498155e-06, + "loss": 0.4299, + "step": 5161 + }, + { + "epoch": 2.7907731122724817, + "grad_norm": 0.2737068235874176, + "learning_rate": 4.854646301767037e-06, + "loss": 0.3624, + "step": 5162 + }, + { + "epoch": 2.7913137502252656, + "grad_norm": 0.32693517208099365, + "learning_rate": 4.852759133760184e-06, + "loss": 0.3982, + "step": 5163 + }, + { + "epoch": 2.79185438817805, + "grad_norm": 0.2964058220386505, + "learning_rate": 4.850871986746668e-06, + "loss": 0.3978, + "step": 5164 + }, + { + "epoch": 2.7923950261308343, + "grad_norm": 0.2971150577068329, + "learning_rate": 4.848984860995557e-06, + "loss": 0.3665, + "step": 5165 + }, + { + "epoch": 2.7929356640836187, + "grad_norm": 0.31982165575027466, + "learning_rate": 4.847097756775908e-06, + "loss": 0.388, + "step": 5166 + }, + { + "epoch": 2.793476302036403, + "grad_norm": 0.2987110912799835, + "learning_rate": 4.845210674356784e-06, + "loss": 0.4078, + "step": 5167 + }, + { + "epoch": 2.794016939989187, + "grad_norm": 0.3030100166797638, + "learning_rate": 4.843323614007241e-06, + "loss": 0.4205, + "step": 5168 + }, + { + "epoch": 2.794557577941972, + "grad_norm": 0.301750123500824, + "learning_rate": 4.841436575996334e-06, + "loss": 0.4118, + "step": 5169 + }, + { + "epoch": 2.7950982158947557, + "grad_norm": 0.29823026061058044, + "learning_rate": 4.839549560593111e-06, + "loss": 0.4, + "step": 5170 + }, + { + "epoch": 2.79563885384754, + "grad_norm": 0.2757793962955475, + "learning_rate": 4.837662568066622e-06, + "loss": 0.39, + "step": 5171 + }, + { + "epoch": 2.7961794918003244, + "grad_norm": 0.2817678153514862, + "learning_rate": 4.835775598685909e-06, + "loss": 0.3878, + "step": 5172 + }, + { + "epoch": 2.796720129753109, + "grad_norm": 0.30766770243644714, + "learning_rate": 4.833888652720015e-06, + "loss": 0.4055, + "step": 5173 + }, + { + "epoch": 2.797260767705893, + "grad_norm": 0.3036001920700073, + "learning_rate": 4.832001730437973e-06, + "loss": 0.428, + "step": 5174 + }, + { + "epoch": 2.797801405658677, + "grad_norm": 0.2731790840625763, + "learning_rate": 4.830114832108822e-06, + "loss": 0.4067, + "step": 5175 + }, + { + "epoch": 2.7983420436114614, + "grad_norm": 0.34506237506866455, + "learning_rate": 4.828227958001589e-06, + "loss": 0.4352, + "step": 5176 + }, + { + "epoch": 2.798882681564246, + "grad_norm": 0.2925313413143158, + "learning_rate": 4.826341108385304e-06, + "loss": 0.3855, + "step": 5177 + }, + { + "epoch": 2.79942331951703, + "grad_norm": 0.2907009720802307, + "learning_rate": 4.824454283528987e-06, + "loss": 0.4019, + "step": 5178 + }, + { + "epoch": 2.7999639574698145, + "grad_norm": 0.3044201731681824, + "learning_rate": 4.82256748370166e-06, + "loss": 0.4271, + "step": 5179 + }, + { + "epoch": 2.8005045954225984, + "grad_norm": 0.3054400384426117, + "learning_rate": 4.82068070917234e-06, + "loss": 0.4082, + "step": 5180 + }, + { + "epoch": 2.801045233375383, + "grad_norm": 0.277808278799057, + "learning_rate": 4.81879396021004e-06, + "loss": 0.3911, + "step": 5181 + }, + { + "epoch": 2.801585871328167, + "grad_norm": 0.29107189178466797, + "learning_rate": 4.816907237083768e-06, + "loss": 0.3831, + "step": 5182 + }, + { + "epoch": 2.8021265092809515, + "grad_norm": 0.29215705394744873, + "learning_rate": 4.81502054006253e-06, + "loss": 0.3933, + "step": 5183 + }, + { + "epoch": 2.802667147233736, + "grad_norm": 0.32308441400527954, + "learning_rate": 4.813133869415327e-06, + "loss": 0.3916, + "step": 5184 + }, + { + "epoch": 2.8032077851865203, + "grad_norm": 0.3158899247646332, + "learning_rate": 4.81124722541116e-06, + "loss": 0.4313, + "step": 5185 + }, + { + "epoch": 2.8037484231393046, + "grad_norm": 0.3029237687587738, + "learning_rate": 4.80936060831902e-06, + "loss": 0.423, + "step": 5186 + }, + { + "epoch": 2.8042890610920885, + "grad_norm": 0.2986266613006592, + "learning_rate": 4.807474018407899e-06, + "loss": 0.374, + "step": 5187 + }, + { + "epoch": 2.804829699044873, + "grad_norm": 0.3070341944694519, + "learning_rate": 4.805587455946784e-06, + "loss": 0.4146, + "step": 5188 + }, + { + "epoch": 2.8053703369976573, + "grad_norm": 0.2837662696838379, + "learning_rate": 4.803700921204659e-06, + "loss": 0.3944, + "step": 5189 + }, + { + "epoch": 2.8059109749504416, + "grad_norm": 0.3108964264392853, + "learning_rate": 4.801814414450498e-06, + "loss": 0.3902, + "step": 5190 + }, + { + "epoch": 2.806451612903226, + "grad_norm": 0.2998983860015869, + "learning_rate": 4.799927935953278e-06, + "loss": 0.4153, + "step": 5191 + }, + { + "epoch": 2.80699225085601, + "grad_norm": 0.31420305371284485, + "learning_rate": 4.7980414859819705e-06, + "loss": 0.4208, + "step": 5192 + }, + { + "epoch": 2.8075328888087943, + "grad_norm": 0.26373064517974854, + "learning_rate": 4.796155064805544e-06, + "loss": 0.3538, + "step": 5193 + }, + { + "epoch": 2.8080735267615786, + "grad_norm": 0.29200348258018494, + "learning_rate": 4.7942686726929575e-06, + "loss": 0.4091, + "step": 5194 + }, + { + "epoch": 2.808614164714363, + "grad_norm": 0.3131158947944641, + "learning_rate": 4.7923823099131694e-06, + "loss": 0.4298, + "step": 5195 + }, + { + "epoch": 2.8091548026671473, + "grad_norm": 0.29583874344825745, + "learning_rate": 4.790495976735136e-06, + "loss": 0.4342, + "step": 5196 + }, + { + "epoch": 2.8096954406199313, + "grad_norm": 0.25764796137809753, + "learning_rate": 4.788609673427807e-06, + "loss": 0.3699, + "step": 5197 + }, + { + "epoch": 2.810236078572716, + "grad_norm": 0.31256401538848877, + "learning_rate": 4.786723400260127e-06, + "loss": 0.4407, + "step": 5198 + }, + { + "epoch": 2.8107767165255, + "grad_norm": 0.30436599254608154, + "learning_rate": 4.784837157501037e-06, + "loss": 0.4169, + "step": 5199 + }, + { + "epoch": 2.8113173544782843, + "grad_norm": 0.2923033833503723, + "learning_rate": 4.782950945419475e-06, + "loss": 0.3958, + "step": 5200 + }, + { + "epoch": 2.8118579924310687, + "grad_norm": 0.28773999214172363, + "learning_rate": 4.781064764284376e-06, + "loss": 0.4236, + "step": 5201 + }, + { + "epoch": 2.812398630383853, + "grad_norm": 0.282601535320282, + "learning_rate": 4.779178614364664e-06, + "loss": 0.3776, + "step": 5202 + }, + { + "epoch": 2.8129392683366374, + "grad_norm": 0.3376825749874115, + "learning_rate": 4.777292495929264e-06, + "loss": 0.462, + "step": 5203 + }, + { + "epoch": 2.8134799062894214, + "grad_norm": 0.27360770106315613, + "learning_rate": 4.775406409247097e-06, + "loss": 0.381, + "step": 5204 + }, + { + "epoch": 2.8140205442422057, + "grad_norm": 0.29848596453666687, + "learning_rate": 4.7735203545870794e-06, + "loss": 0.4162, + "step": 5205 + }, + { + "epoch": 2.81456118219499, + "grad_norm": 0.31024810671806335, + "learning_rate": 4.771634332218117e-06, + "loss": 0.3737, + "step": 5206 + }, + { + "epoch": 2.8151018201477744, + "grad_norm": 0.3096352517604828, + "learning_rate": 4.7697483424091166e-06, + "loss": 0.4111, + "step": 5207 + }, + { + "epoch": 2.815642458100559, + "grad_norm": 0.29883527755737305, + "learning_rate": 4.767862385428981e-06, + "loss": 0.4147, + "step": 5208 + }, + { + "epoch": 2.8161830960533427, + "grad_norm": 0.31646376848220825, + "learning_rate": 4.765976461546606e-06, + "loss": 0.4131, + "step": 5209 + }, + { + "epoch": 2.816723734006127, + "grad_norm": 0.307599276304245, + "learning_rate": 4.764090571030882e-06, + "loss": 0.3846, + "step": 5210 + }, + { + "epoch": 2.8172643719589114, + "grad_norm": 0.2796662747859955, + "learning_rate": 4.762204714150696e-06, + "loss": 0.39, + "step": 5211 + }, + { + "epoch": 2.817805009911696, + "grad_norm": 0.2873065173625946, + "learning_rate": 4.760318891174932e-06, + "loss": 0.407, + "step": 5212 + }, + { + "epoch": 2.81834564786448, + "grad_norm": 0.29299312829971313, + "learning_rate": 4.758433102372466e-06, + "loss": 0.4046, + "step": 5213 + }, + { + "epoch": 2.8188862858172645, + "grad_norm": 0.331016480922699, + "learning_rate": 4.75654734801217e-06, + "loss": 0.4342, + "step": 5214 + }, + { + "epoch": 2.819426923770049, + "grad_norm": 0.27721527218818665, + "learning_rate": 4.75466162836291e-06, + "loss": 0.3645, + "step": 5215 + }, + { + "epoch": 2.819967561722833, + "grad_norm": 0.29178667068481445, + "learning_rate": 4.7527759436935516e-06, + "loss": 0.3988, + "step": 5216 + }, + { + "epoch": 2.820508199675617, + "grad_norm": 0.34997060894966125, + "learning_rate": 4.750890294272951e-06, + "loss": 0.4273, + "step": 5217 + }, + { + "epoch": 2.8210488376284015, + "grad_norm": 0.2890905439853668, + "learning_rate": 4.749004680369963e-06, + "loss": 0.3943, + "step": 5218 + }, + { + "epoch": 2.821589475581186, + "grad_norm": 0.3408333659172058, + "learning_rate": 4.747119102253429e-06, + "loss": 0.4074, + "step": 5219 + }, + { + "epoch": 2.8221301135339703, + "grad_norm": 0.33997246623039246, + "learning_rate": 4.745233560192195e-06, + "loss": 0.4061, + "step": 5220 + }, + { + "epoch": 2.822670751486754, + "grad_norm": 0.3487636148929596, + "learning_rate": 4.743348054455099e-06, + "loss": 0.4412, + "step": 5221 + }, + { + "epoch": 2.8232113894395385, + "grad_norm": 0.2814512550830841, + "learning_rate": 4.741462585310973e-06, + "loss": 0.3424, + "step": 5222 + }, + { + "epoch": 2.823752027392323, + "grad_norm": 0.33952921628952026, + "learning_rate": 4.739577153028642e-06, + "loss": 0.4312, + "step": 5223 + }, + { + "epoch": 2.8242926653451073, + "grad_norm": 0.2770572602748871, + "learning_rate": 4.737691757876928e-06, + "loss": 0.3893, + "step": 5224 + }, + { + "epoch": 2.8248333032978916, + "grad_norm": 0.30399399995803833, + "learning_rate": 4.735806400124648e-06, + "loss": 0.3823, + "step": 5225 + }, + { + "epoch": 2.8253739412506755, + "grad_norm": 0.3472537696361542, + "learning_rate": 4.733921080040613e-06, + "loss": 0.4421, + "step": 5226 + }, + { + "epoch": 2.8259145792034603, + "grad_norm": 0.3420334756374359, + "learning_rate": 4.7320357978936264e-06, + "loss": 0.4464, + "step": 5227 + }, + { + "epoch": 2.8264552171562443, + "grad_norm": 0.2712843418121338, + "learning_rate": 4.730150553952491e-06, + "loss": 0.3556, + "step": 5228 + }, + { + "epoch": 2.8269958551090286, + "grad_norm": 0.3301163613796234, + "learning_rate": 4.728265348486e-06, + "loss": 0.4097, + "step": 5229 + }, + { + "epoch": 2.827536493061813, + "grad_norm": 0.2986605167388916, + "learning_rate": 4.726380181762943e-06, + "loss": 0.4089, + "step": 5230 + }, + { + "epoch": 2.8280771310145973, + "grad_norm": 0.28485170006752014, + "learning_rate": 4.724495054052104e-06, + "loss": 0.3546, + "step": 5231 + }, + { + "epoch": 2.8286177689673817, + "grad_norm": 0.3346530497074127, + "learning_rate": 4.72260996562226e-06, + "loss": 0.3961, + "step": 5232 + }, + { + "epoch": 2.8291584069201656, + "grad_norm": 0.39170563220977783, + "learning_rate": 4.720724916742184e-06, + "loss": 0.4424, + "step": 5233 + }, + { + "epoch": 2.82969904487295, + "grad_norm": 0.26930177211761475, + "learning_rate": 4.718839907680646e-06, + "loss": 0.3744, + "step": 5234 + }, + { + "epoch": 2.8302396828257343, + "grad_norm": 0.3284896910190582, + "learning_rate": 4.716954938706401e-06, + "loss": 0.4225, + "step": 5235 + }, + { + "epoch": 2.8307803207785187, + "grad_norm": 0.3415636122226715, + "learning_rate": 4.715070010088208e-06, + "loss": 0.4, + "step": 5236 + }, + { + "epoch": 2.831320958731303, + "grad_norm": 0.32044726610183716, + "learning_rate": 4.713185122094816e-06, + "loss": 0.3905, + "step": 5237 + }, + { + "epoch": 2.831861596684087, + "grad_norm": 0.3255140781402588, + "learning_rate": 4.711300274994971e-06, + "loss": 0.4328, + "step": 5238 + }, + { + "epoch": 2.8324022346368714, + "grad_norm": 0.31972357630729675, + "learning_rate": 4.709415469057408e-06, + "loss": 0.364, + "step": 5239 + }, + { + "epoch": 2.8329428725896557, + "grad_norm": 0.33498212695121765, + "learning_rate": 4.707530704550861e-06, + "loss": 0.4336, + "step": 5240 + }, + { + "epoch": 2.83348351054244, + "grad_norm": 0.34578371047973633, + "learning_rate": 4.705645981744055e-06, + "loss": 0.4, + "step": 5241 + }, + { + "epoch": 2.8340241484952244, + "grad_norm": 0.327239990234375, + "learning_rate": 4.703761300905712e-06, + "loss": 0.3677, + "step": 5242 + }, + { + "epoch": 2.834564786448009, + "grad_norm": 0.3138897716999054, + "learning_rate": 4.701876662304546e-06, + "loss": 0.4205, + "step": 5243 + }, + { + "epoch": 2.835105424400793, + "grad_norm": 0.31926262378692627, + "learning_rate": 4.699992066209264e-06, + "loss": 0.4351, + "step": 5244 + }, + { + "epoch": 2.835646062353577, + "grad_norm": 0.35049229860305786, + "learning_rate": 4.69810751288857e-06, + "loss": 0.3678, + "step": 5245 + }, + { + "epoch": 2.8361867003063614, + "grad_norm": 0.3599819839000702, + "learning_rate": 4.696223002611161e-06, + "loss": 0.4248, + "step": 5246 + }, + { + "epoch": 2.836727338259146, + "grad_norm": 0.28022849559783936, + "learning_rate": 4.6943385356457235e-06, + "loss": 0.3645, + "step": 5247 + }, + { + "epoch": 2.83726797621193, + "grad_norm": 0.33059850335121155, + "learning_rate": 4.692454112260943e-06, + "loss": 0.4147, + "step": 5248 + }, + { + "epoch": 2.8378086141647145, + "grad_norm": 0.30869242548942566, + "learning_rate": 4.690569732725497e-06, + "loss": 0.3879, + "step": 5249 + }, + { + "epoch": 2.8383492521174984, + "grad_norm": 0.3129158318042755, + "learning_rate": 4.688685397308061e-06, + "loss": 0.3778, + "step": 5250 + }, + { + "epoch": 2.838889890070283, + "grad_norm": 0.3148750960826874, + "learning_rate": 4.686801106277293e-06, + "loss": 0.4033, + "step": 5251 + }, + { + "epoch": 2.839430528023067, + "grad_norm": 0.35224729776382446, + "learning_rate": 4.684916859901856e-06, + "loss": 0.4547, + "step": 5252 + }, + { + "epoch": 2.8399711659758515, + "grad_norm": 0.3005257546901703, + "learning_rate": 4.6830326584504026e-06, + "loss": 0.4178, + "step": 5253 + }, + { + "epoch": 2.840511803928636, + "grad_norm": 0.29805541038513184, + "learning_rate": 4.6811485021915784e-06, + "loss": 0.3445, + "step": 5254 + }, + { + "epoch": 2.84105244188142, + "grad_norm": 0.3516843020915985, + "learning_rate": 4.679264391394022e-06, + "loss": 0.3812, + "step": 5255 + }, + { + "epoch": 2.8415930798342046, + "grad_norm": 0.36728399991989136, + "learning_rate": 4.677380326326367e-06, + "loss": 0.4471, + "step": 5256 + }, + { + "epoch": 2.8421337177869885, + "grad_norm": 0.3440658152103424, + "learning_rate": 4.67549630725724e-06, + "loss": 0.3866, + "step": 5257 + }, + { + "epoch": 2.842674355739773, + "grad_norm": 0.3516998887062073, + "learning_rate": 4.673612334455264e-06, + "loss": 0.4219, + "step": 5258 + }, + { + "epoch": 2.8432149936925573, + "grad_norm": 0.4027232825756073, + "learning_rate": 4.671728408189046e-06, + "loss": 0.391, + "step": 5259 + }, + { + "epoch": 2.8437556316453416, + "grad_norm": 0.33593621850013733, + "learning_rate": 4.669844528727197e-06, + "loss": 0.4254, + "step": 5260 + }, + { + "epoch": 2.844296269598126, + "grad_norm": 0.2753894031047821, + "learning_rate": 4.6679606963383166e-06, + "loss": 0.3736, + "step": 5261 + }, + { + "epoch": 2.84483690755091, + "grad_norm": 0.3739554286003113, + "learning_rate": 4.666076911291001e-06, + "loss": 0.4039, + "step": 5262 + }, + { + "epoch": 2.8453775455036943, + "grad_norm": 0.35964787006378174, + "learning_rate": 4.66419317385383e-06, + "loss": 0.4048, + "step": 5263 + }, + { + "epoch": 2.8459181834564786, + "grad_norm": 0.2805907130241394, + "learning_rate": 4.662309484295389e-06, + "loss": 0.3859, + "step": 5264 + }, + { + "epoch": 2.846458821409263, + "grad_norm": 0.28541240096092224, + "learning_rate": 4.660425842884249e-06, + "loss": 0.4036, + "step": 5265 + }, + { + "epoch": 2.8469994593620473, + "grad_norm": 0.37052708864212036, + "learning_rate": 4.658542249888978e-06, + "loss": 0.415, + "step": 5266 + }, + { + "epoch": 2.8475400973148313, + "grad_norm": 0.31775808334350586, + "learning_rate": 4.6566587055781324e-06, + "loss": 0.3948, + "step": 5267 + }, + { + "epoch": 2.8480807352676156, + "grad_norm": 0.32983338832855225, + "learning_rate": 4.654775210220266e-06, + "loss": 0.4242, + "step": 5268 + }, + { + "epoch": 2.8486213732204, + "grad_norm": 0.28167930245399475, + "learning_rate": 4.652891764083924e-06, + "loss": 0.3789, + "step": 5269 + }, + { + "epoch": 2.8491620111731844, + "grad_norm": 0.2902034521102905, + "learning_rate": 4.651008367437646e-06, + "loss": 0.4115, + "step": 5270 + }, + { + "epoch": 2.8497026491259687, + "grad_norm": 0.3520466983318329, + "learning_rate": 4.649125020549962e-06, + "loss": 0.3903, + "step": 5271 + }, + { + "epoch": 2.850243287078753, + "grad_norm": 0.3077685534954071, + "learning_rate": 4.647241723689396e-06, + "loss": 0.4187, + "step": 5272 + }, + { + "epoch": 2.8507839250315374, + "grad_norm": 0.3050772249698639, + "learning_rate": 4.645358477124465e-06, + "loss": 0.3745, + "step": 5273 + }, + { + "epoch": 2.8513245629843214, + "grad_norm": 0.3538419008255005, + "learning_rate": 4.643475281123683e-06, + "loss": 0.4066, + "step": 5274 + }, + { + "epoch": 2.8518652009371057, + "grad_norm": 0.31801462173461914, + "learning_rate": 4.641592135955545e-06, + "loss": 0.4074, + "step": 5275 + }, + { + "epoch": 2.85240583888989, + "grad_norm": 0.2762812077999115, + "learning_rate": 4.639709041888552e-06, + "loss": 0.4242, + "step": 5276 + }, + { + "epoch": 2.8529464768426744, + "grad_norm": 0.30232739448547363, + "learning_rate": 4.637825999191189e-06, + "loss": 0.3955, + "step": 5277 + }, + { + "epoch": 2.853487114795459, + "grad_norm": 0.34319016337394714, + "learning_rate": 4.63594300813194e-06, + "loss": 0.4274, + "step": 5278 + }, + { + "epoch": 2.8540277527482427, + "grad_norm": 0.3106715679168701, + "learning_rate": 4.634060068979276e-06, + "loss": 0.3634, + "step": 5279 + }, + { + "epoch": 2.854568390701027, + "grad_norm": 0.30517441034317017, + "learning_rate": 4.6321771820016635e-06, + "loss": 0.4139, + "step": 5280 + }, + { + "epoch": 2.8551090286538114, + "grad_norm": 0.2777802050113678, + "learning_rate": 4.6302943474675625e-06, + "loss": 0.3912, + "step": 5281 + }, + { + "epoch": 2.855649666606596, + "grad_norm": 0.33198273181915283, + "learning_rate": 4.628411565645422e-06, + "loss": 0.4017, + "step": 5282 + }, + { + "epoch": 2.85619030455938, + "grad_norm": 0.29595592617988586, + "learning_rate": 4.626528836803688e-06, + "loss": 0.4181, + "step": 5283 + }, + { + "epoch": 2.856730942512164, + "grad_norm": 0.3072974383831024, + "learning_rate": 4.624646161210795e-06, + "loss": 0.3844, + "step": 5284 + }, + { + "epoch": 2.857271580464949, + "grad_norm": 0.32644233107566833, + "learning_rate": 4.62276353913517e-06, + "loss": 0.4185, + "step": 5285 + }, + { + "epoch": 2.857812218417733, + "grad_norm": 0.3057146370410919, + "learning_rate": 4.6208809708452375e-06, + "loss": 0.3726, + "step": 5286 + }, + { + "epoch": 2.858352856370517, + "grad_norm": 0.29417580366134644, + "learning_rate": 4.61899845660941e-06, + "loss": 0.3738, + "step": 5287 + }, + { + "epoch": 2.8588934943233015, + "grad_norm": 0.315688818693161, + "learning_rate": 4.6171159966960885e-06, + "loss": 0.392, + "step": 5288 + }, + { + "epoch": 2.859434132276086, + "grad_norm": 0.3033626675605774, + "learning_rate": 4.615233591373676e-06, + "loss": 0.377, + "step": 5289 + }, + { + "epoch": 2.8599747702288703, + "grad_norm": 0.33641305565834045, + "learning_rate": 4.6133512409105595e-06, + "loss": 0.4405, + "step": 5290 + }, + { + "epoch": 2.860515408181654, + "grad_norm": 0.28855594992637634, + "learning_rate": 4.6114689455751245e-06, + "loss": 0.3829, + "step": 5291 + }, + { + "epoch": 2.8610560461344385, + "grad_norm": 0.30093568563461304, + "learning_rate": 4.60958670563574e-06, + "loss": 0.3877, + "step": 5292 + }, + { + "epoch": 2.861596684087223, + "grad_norm": 0.3042225241661072, + "learning_rate": 4.6077045213607765e-06, + "loss": 0.3645, + "step": 5293 + }, + { + "epoch": 2.8621373220400073, + "grad_norm": 0.31260597705841064, + "learning_rate": 4.60582239301859e-06, + "loss": 0.4032, + "step": 5294 + }, + { + "epoch": 2.8626779599927916, + "grad_norm": 0.3076440095901489, + "learning_rate": 4.603940320877533e-06, + "loss": 0.3937, + "step": 5295 + }, + { + "epoch": 2.8632185979455755, + "grad_norm": 0.311769038438797, + "learning_rate": 4.602058305205946e-06, + "loss": 0.419, + "step": 5296 + }, + { + "epoch": 2.86375923589836, + "grad_norm": 0.33874577283859253, + "learning_rate": 4.600176346272165e-06, + "loss": 0.4025, + "step": 5297 + }, + { + "epoch": 2.8642998738511443, + "grad_norm": 0.31491518020629883, + "learning_rate": 4.598294444344515e-06, + "loss": 0.423, + "step": 5298 + }, + { + "epoch": 2.8648405118039286, + "grad_norm": 0.33583423495292664, + "learning_rate": 4.596412599691316e-06, + "loss": 0.41, + "step": 5299 + }, + { + "epoch": 2.865381149756713, + "grad_norm": 0.2975970506668091, + "learning_rate": 4.594530812580876e-06, + "loss": 0.3777, + "step": 5300 + }, + { + "epoch": 2.8659217877094973, + "grad_norm": 0.32264795899391174, + "learning_rate": 4.592649083281497e-06, + "loss": 0.3981, + "step": 5301 + }, + { + "epoch": 2.8664624256622817, + "grad_norm": 0.30629464983940125, + "learning_rate": 4.5907674120614735e-06, + "loss": 0.3899, + "step": 5302 + }, + { + "epoch": 2.8670030636150656, + "grad_norm": 0.285491406917572, + "learning_rate": 4.5888857991890925e-06, + "loss": 0.4198, + "step": 5303 + }, + { + "epoch": 2.86754370156785, + "grad_norm": 0.3029111325740814, + "learning_rate": 4.5870042449326265e-06, + "loss": 0.3949, + "step": 5304 + }, + { + "epoch": 2.8680843395206344, + "grad_norm": 0.2939251661300659, + "learning_rate": 4.585122749560347e-06, + "loss": 0.3983, + "step": 5305 + }, + { + "epoch": 2.8686249774734187, + "grad_norm": 0.30623742938041687, + "learning_rate": 4.583241313340512e-06, + "loss": 0.4184, + "step": 5306 + }, + { + "epoch": 2.869165615426203, + "grad_norm": 0.29048800468444824, + "learning_rate": 4.581359936541379e-06, + "loss": 0.3976, + "step": 5307 + }, + { + "epoch": 2.869706253378987, + "grad_norm": 0.3232608437538147, + "learning_rate": 4.579478619431184e-06, + "loss": 0.4461, + "step": 5308 + }, + { + "epoch": 2.8702468913317714, + "grad_norm": 0.2938759922981262, + "learning_rate": 4.577597362278165e-06, + "loss": 0.3785, + "step": 5309 + }, + { + "epoch": 2.8707875292845557, + "grad_norm": 0.31486600637435913, + "learning_rate": 4.575716165350549e-06, + "loss": 0.4096, + "step": 5310 + }, + { + "epoch": 2.87132816723734, + "grad_norm": 0.31167715787887573, + "learning_rate": 4.573835028916554e-06, + "loss": 0.3929, + "step": 5311 + }, + { + "epoch": 2.8718688051901244, + "grad_norm": 0.3271113932132721, + "learning_rate": 4.5719539532443865e-06, + "loss": 0.4228, + "step": 5312 + }, + { + "epoch": 2.8724094431429084, + "grad_norm": 0.27412769198417664, + "learning_rate": 4.570072938602248e-06, + "loss": 0.3602, + "step": 5313 + }, + { + "epoch": 2.872950081095693, + "grad_norm": 0.3018361032009125, + "learning_rate": 4.5681919852583304e-06, + "loss": 0.4308, + "step": 5314 + }, + { + "epoch": 2.873490719048477, + "grad_norm": 0.3016452193260193, + "learning_rate": 4.566311093480818e-06, + "loss": 0.4124, + "step": 5315 + }, + { + "epoch": 2.8740313570012614, + "grad_norm": 0.2964125871658325, + "learning_rate": 4.564430263537884e-06, + "loss": 0.4095, + "step": 5316 + }, + { + "epoch": 2.874571994954046, + "grad_norm": 0.2763165533542633, + "learning_rate": 4.56254949569769e-06, + "loss": 0.3888, + "step": 5317 + }, + { + "epoch": 2.87511263290683, + "grad_norm": 0.2857678234577179, + "learning_rate": 4.560668790228397e-06, + "loss": 0.4014, + "step": 5318 + }, + { + "epoch": 2.8756532708596145, + "grad_norm": 0.3083879053592682, + "learning_rate": 4.5587881473981535e-06, + "loss": 0.4118, + "step": 5319 + }, + { + "epoch": 2.8761939088123984, + "grad_norm": 0.31368550658226013, + "learning_rate": 4.556907567475094e-06, + "loss": 0.4348, + "step": 5320 + }, + { + "epoch": 2.876734546765183, + "grad_norm": 0.2863789200782776, + "learning_rate": 4.555027050727351e-06, + "loss": 0.3506, + "step": 5321 + }, + { + "epoch": 2.877275184717967, + "grad_norm": 0.27195513248443604, + "learning_rate": 4.553146597423044e-06, + "loss": 0.3719, + "step": 5322 + }, + { + "epoch": 2.8778158226707515, + "grad_norm": 0.3159834146499634, + "learning_rate": 4.551266207830285e-06, + "loss": 0.4071, + "step": 5323 + }, + { + "epoch": 2.878356460623536, + "grad_norm": 0.28979089856147766, + "learning_rate": 4.549385882217177e-06, + "loss": 0.4029, + "step": 5324 + }, + { + "epoch": 2.87889709857632, + "grad_norm": 0.3067147433757782, + "learning_rate": 4.547505620851812e-06, + "loss": 0.4145, + "step": 5325 + }, + { + "epoch": 2.8794377365291046, + "grad_norm": 0.29320406913757324, + "learning_rate": 4.545625424002274e-06, + "loss": 0.3894, + "step": 5326 + }, + { + "epoch": 2.8799783744818885, + "grad_norm": 0.2783004343509674, + "learning_rate": 4.543745291936642e-06, + "loss": 0.3781, + "step": 5327 + }, + { + "epoch": 2.880519012434673, + "grad_norm": 0.30723005533218384, + "learning_rate": 4.541865224922977e-06, + "loss": 0.4331, + "step": 5328 + }, + { + "epoch": 2.8810596503874573, + "grad_norm": 0.27082914113998413, + "learning_rate": 4.5399852232293384e-06, + "loss": 0.3764, + "step": 5329 + }, + { + "epoch": 2.8816002883402416, + "grad_norm": 0.2909558415412903, + "learning_rate": 4.538105287123772e-06, + "loss": 0.3847, + "step": 5330 + }, + { + "epoch": 2.882140926293026, + "grad_norm": 0.26408976316452026, + "learning_rate": 4.536225416874319e-06, + "loss": 0.374, + "step": 5331 + }, + { + "epoch": 2.88268156424581, + "grad_norm": 0.3040916919708252, + "learning_rate": 4.534345612749002e-06, + "loss": 0.392, + "step": 5332 + }, + { + "epoch": 2.8832222021985943, + "grad_norm": 0.2888509929180145, + "learning_rate": 4.532465875015845e-06, + "loss": 0.4131, + "step": 5333 + }, + { + "epoch": 2.8837628401513786, + "grad_norm": 0.2653530538082123, + "learning_rate": 4.530586203942854e-06, + "loss": 0.3732, + "step": 5334 + }, + { + "epoch": 2.884303478104163, + "grad_norm": 0.3019903898239136, + "learning_rate": 4.528706599798033e-06, + "loss": 0.4276, + "step": 5335 + }, + { + "epoch": 2.8848441160569473, + "grad_norm": 0.29084232449531555, + "learning_rate": 4.526827062849369e-06, + "loss": 0.3941, + "step": 5336 + }, + { + "epoch": 2.8853847540097313, + "grad_norm": 0.3075839579105377, + "learning_rate": 4.524947593364845e-06, + "loss": 0.3991, + "step": 5337 + }, + { + "epoch": 2.8859253919625156, + "grad_norm": 0.2972691059112549, + "learning_rate": 4.5230681916124305e-06, + "loss": 0.4216, + "step": 5338 + }, + { + "epoch": 2.8864660299153, + "grad_norm": 0.28446921706199646, + "learning_rate": 4.521188857860091e-06, + "loss": 0.4271, + "step": 5339 + }, + { + "epoch": 2.8870066678680844, + "grad_norm": 0.28713396191596985, + "learning_rate": 4.5193095923757745e-06, + "loss": 0.3991, + "step": 5340 + }, + { + "epoch": 2.8875473058208687, + "grad_norm": 0.29077884554862976, + "learning_rate": 4.517430395427424e-06, + "loss": 0.4136, + "step": 5341 + }, + { + "epoch": 2.888087943773653, + "grad_norm": 0.27056440711021423, + "learning_rate": 4.515551267282974e-06, + "loss": 0.3811, + "step": 5342 + }, + { + "epoch": 2.8886285817264374, + "grad_norm": 0.29237234592437744, + "learning_rate": 4.5136722082103476e-06, + "loss": 0.4162, + "step": 5343 + }, + { + "epoch": 2.8891692196792214, + "grad_norm": 0.317898154258728, + "learning_rate": 4.511793218477454e-06, + "loss": 0.4328, + "step": 5344 + }, + { + "epoch": 2.8897098576320057, + "grad_norm": 0.2820221781730652, + "learning_rate": 4.509914298352197e-06, + "loss": 0.4096, + "step": 5345 + }, + { + "epoch": 2.89025049558479, + "grad_norm": 0.3054172992706299, + "learning_rate": 4.508035448102472e-06, + "loss": 0.3753, + "step": 5346 + }, + { + "epoch": 2.8907911335375744, + "grad_norm": 0.2961260974407196, + "learning_rate": 4.5061566679961605e-06, + "loss": 0.4046, + "step": 5347 + }, + { + "epoch": 2.891331771490359, + "grad_norm": 0.3012271523475647, + "learning_rate": 4.504277958301138e-06, + "loss": 0.4176, + "step": 5348 + }, + { + "epoch": 2.8918724094431427, + "grad_norm": 0.3134300410747528, + "learning_rate": 4.502399319285263e-06, + "loss": 0.3887, + "step": 5349 + }, + { + "epoch": 2.892413047395927, + "grad_norm": 0.28506678342819214, + "learning_rate": 4.5005207512163914e-06, + "loss": 0.3928, + "step": 5350 + }, + { + "epoch": 2.8929536853487114, + "grad_norm": 0.29594171047210693, + "learning_rate": 4.4986422543623655e-06, + "loss": 0.4026, + "step": 5351 + }, + { + "epoch": 2.893494323301496, + "grad_norm": 0.2881558835506439, + "learning_rate": 4.496763828991019e-06, + "loss": 0.3788, + "step": 5352 + }, + { + "epoch": 2.89403496125428, + "grad_norm": 0.2928634583950043, + "learning_rate": 4.494885475370172e-06, + "loss": 0.3898, + "step": 5353 + }, + { + "epoch": 2.894575599207064, + "grad_norm": 0.30739927291870117, + "learning_rate": 4.493007193767638e-06, + "loss": 0.3915, + "step": 5354 + }, + { + "epoch": 2.895116237159849, + "grad_norm": 0.3106321692466736, + "learning_rate": 4.491128984451219e-06, + "loss": 0.4095, + "step": 5355 + }, + { + "epoch": 2.895656875112633, + "grad_norm": 0.30263790488243103, + "learning_rate": 4.489250847688708e-06, + "loss": 0.4198, + "step": 5356 + }, + { + "epoch": 2.896197513065417, + "grad_norm": 0.29731953144073486, + "learning_rate": 4.487372783747884e-06, + "loss": 0.4124, + "step": 5357 + }, + { + "epoch": 2.8967381510182015, + "grad_norm": 0.27237144112586975, + "learning_rate": 4.485494792896519e-06, + "loss": 0.4029, + "step": 5358 + }, + { + "epoch": 2.897278788970986, + "grad_norm": 0.2843637764453888, + "learning_rate": 4.483616875402374e-06, + "loss": 0.3669, + "step": 5359 + }, + { + "epoch": 2.8978194269237703, + "grad_norm": 0.31389912962913513, + "learning_rate": 4.481739031533201e-06, + "loss": 0.3815, + "step": 5360 + }, + { + "epoch": 2.898360064876554, + "grad_norm": 0.3242151439189911, + "learning_rate": 4.4798612615567345e-06, + "loss": 0.4115, + "step": 5361 + }, + { + "epoch": 2.8989007028293385, + "grad_norm": 0.3388815224170685, + "learning_rate": 4.477983565740706e-06, + "loss": 0.4132, + "step": 5362 + }, + { + "epoch": 2.899441340782123, + "grad_norm": 0.2897244393825531, + "learning_rate": 4.476105944352834e-06, + "loss": 0.3665, + "step": 5363 + }, + { + "epoch": 2.8999819787349073, + "grad_norm": 0.32335197925567627, + "learning_rate": 4.474228397660829e-06, + "loss": 0.4141, + "step": 5364 + }, + { + "epoch": 2.9005226166876916, + "grad_norm": 0.35047903656959534, + "learning_rate": 4.472350925932384e-06, + "loss": 0.4355, + "step": 5365 + }, + { + "epoch": 2.9010632546404755, + "grad_norm": 0.2771613597869873, + "learning_rate": 4.470473529435187e-06, + "loss": 0.3756, + "step": 5366 + }, + { + "epoch": 2.90160389259326, + "grad_norm": 0.29492348432540894, + "learning_rate": 4.468596208436914e-06, + "loss": 0.3888, + "step": 5367 + }, + { + "epoch": 2.9021445305460443, + "grad_norm": 0.2763018012046814, + "learning_rate": 4.466718963205231e-06, + "loss": 0.3867, + "step": 5368 + }, + { + "epoch": 2.9026851684988286, + "grad_norm": 0.3185831308364868, + "learning_rate": 4.464841794007791e-06, + "loss": 0.3711, + "step": 5369 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 0.29999685287475586, + "learning_rate": 4.462964701112237e-06, + "loss": 0.3877, + "step": 5370 + }, + { + "epoch": 2.9037664444043974, + "grad_norm": 0.32476794719696045, + "learning_rate": 4.4610876847862034e-06, + "loss": 0.4353, + "step": 5371 + }, + { + "epoch": 2.9043070823571817, + "grad_norm": 0.28298261761665344, + "learning_rate": 4.459210745297312e-06, + "loss": 0.3844, + "step": 5372 + }, + { + "epoch": 2.9048477203099656, + "grad_norm": 0.31960076093673706, + "learning_rate": 4.45733388291317e-06, + "loss": 0.4018, + "step": 5373 + }, + { + "epoch": 2.90538835826275, + "grad_norm": 0.3158363997936249, + "learning_rate": 4.455457097901377e-06, + "loss": 0.4069, + "step": 5374 + }, + { + "epoch": 2.9059289962155344, + "grad_norm": 0.3054044544696808, + "learning_rate": 4.453580390529526e-06, + "loss": 0.4169, + "step": 5375 + }, + { + "epoch": 2.9064696341683187, + "grad_norm": 0.28569933772087097, + "learning_rate": 4.451703761065193e-06, + "loss": 0.3805, + "step": 5376 + }, + { + "epoch": 2.907010272121103, + "grad_norm": 0.280813068151474, + "learning_rate": 4.449827209775943e-06, + "loss": 0.4047, + "step": 5377 + }, + { + "epoch": 2.907550910073887, + "grad_norm": 0.2897249758243561, + "learning_rate": 4.447950736929331e-06, + "loss": 0.3892, + "step": 5378 + }, + { + "epoch": 2.9080915480266714, + "grad_norm": 0.3146097958087921, + "learning_rate": 4.4460743427929024e-06, + "loss": 0.4322, + "step": 5379 + }, + { + "epoch": 2.9086321859794557, + "grad_norm": 0.264935165643692, + "learning_rate": 4.444198027634191e-06, + "loss": 0.3782, + "step": 5380 + }, + { + "epoch": 2.90917282393224, + "grad_norm": 0.27351197600364685, + "learning_rate": 4.4423217917207155e-06, + "loss": 0.3905, + "step": 5381 + }, + { + "epoch": 2.9097134618850244, + "grad_norm": 0.30898234248161316, + "learning_rate": 4.440445635319987e-06, + "loss": 0.4077, + "step": 5382 + }, + { + "epoch": 2.9102540998378084, + "grad_norm": 0.31730180978775024, + "learning_rate": 4.438569558699507e-06, + "loss": 0.4121, + "step": 5383 + }, + { + "epoch": 2.910794737790593, + "grad_norm": 0.2789078950881958, + "learning_rate": 4.436693562126762e-06, + "loss": 0.3892, + "step": 5384 + }, + { + "epoch": 2.911335375743377, + "grad_norm": 0.2933134138584137, + "learning_rate": 4.434817645869226e-06, + "loss": 0.3922, + "step": 5385 + }, + { + "epoch": 2.9118760136961614, + "grad_norm": 0.3190993368625641, + "learning_rate": 4.4329418101943655e-06, + "loss": 0.427, + "step": 5386 + }, + { + "epoch": 2.912416651648946, + "grad_norm": 0.2673807442188263, + "learning_rate": 4.431066055369633e-06, + "loss": 0.3726, + "step": 5387 + }, + { + "epoch": 2.91295728960173, + "grad_norm": 0.29508569836616516, + "learning_rate": 4.429190381662473e-06, + "loss": 0.4127, + "step": 5388 + }, + { + "epoch": 2.9134979275545145, + "grad_norm": 0.28597328066825867, + "learning_rate": 4.4273147893403126e-06, + "loss": 0.3588, + "step": 5389 + }, + { + "epoch": 2.9140385655072985, + "grad_norm": 0.31585755944252014, + "learning_rate": 4.42543927867057e-06, + "loss": 0.4496, + "step": 5390 + }, + { + "epoch": 2.914579203460083, + "grad_norm": 0.29464495182037354, + "learning_rate": 4.4235638499206544e-06, + "loss": 0.3817, + "step": 5391 + }, + { + "epoch": 2.915119841412867, + "grad_norm": 0.33141279220581055, + "learning_rate": 4.42168850335796e-06, + "loss": 0.428, + "step": 5392 + }, + { + "epoch": 2.9156604793656515, + "grad_norm": 0.29135453701019287, + "learning_rate": 4.4198132392498695e-06, + "loss": 0.3828, + "step": 5393 + }, + { + "epoch": 2.916201117318436, + "grad_norm": 0.2641162872314453, + "learning_rate": 4.417938057863755e-06, + "loss": 0.3823, + "step": 5394 + }, + { + "epoch": 2.91674175527122, + "grad_norm": 0.2904815375804901, + "learning_rate": 4.416062959466978e-06, + "loss": 0.4163, + "step": 5395 + }, + { + "epoch": 2.917282393224004, + "grad_norm": 0.3001211881637573, + "learning_rate": 4.414187944326885e-06, + "loss": 0.3864, + "step": 5396 + }, + { + "epoch": 2.9178230311767885, + "grad_norm": 0.28068217635154724, + "learning_rate": 4.4123130127108125e-06, + "loss": 0.4227, + "step": 5397 + }, + { + "epoch": 2.918363669129573, + "grad_norm": 0.28638994693756104, + "learning_rate": 4.410438164886085e-06, + "loss": 0.4238, + "step": 5398 + }, + { + "epoch": 2.9189043070823573, + "grad_norm": 0.2821596562862396, + "learning_rate": 4.408563401120015e-06, + "loss": 0.4073, + "step": 5399 + }, + { + "epoch": 2.9194449450351416, + "grad_norm": 0.27199241518974304, + "learning_rate": 4.4066887216799055e-06, + "loss": 0.3903, + "step": 5400 + }, + { + "epoch": 2.919985582987926, + "grad_norm": 0.26883572340011597, + "learning_rate": 4.4048141268330395e-06, + "loss": 0.4008, + "step": 5401 + }, + { + "epoch": 2.92052622094071, + "grad_norm": 0.3116496503353119, + "learning_rate": 4.402939616846696e-06, + "loss": 0.3721, + "step": 5402 + }, + { + "epoch": 2.9210668588934943, + "grad_norm": 0.2937193214893341, + "learning_rate": 4.401065191988139e-06, + "loss": 0.3932, + "step": 5403 + }, + { + "epoch": 2.9216074968462786, + "grad_norm": 0.3178521990776062, + "learning_rate": 4.399190852524624e-06, + "loss": 0.4182, + "step": 5404 + }, + { + "epoch": 2.922148134799063, + "grad_norm": 0.27353766560554504, + "learning_rate": 4.397316598723385e-06, + "loss": 0.3572, + "step": 5405 + }, + { + "epoch": 2.9226887727518474, + "grad_norm": 0.3278379738330841, + "learning_rate": 4.395442430851654e-06, + "loss": 0.4477, + "step": 5406 + }, + { + "epoch": 2.9232294107046313, + "grad_norm": 0.30886223912239075, + "learning_rate": 4.3935683491766445e-06, + "loss": 0.3951, + "step": 5407 + }, + { + "epoch": 2.9237700486574156, + "grad_norm": 0.31581950187683105, + "learning_rate": 4.391694353965562e-06, + "loss": 0.424, + "step": 5408 + }, + { + "epoch": 2.9243106866102, + "grad_norm": 0.247950479388237, + "learning_rate": 4.389820445485593e-06, + "loss": 0.3396, + "step": 5409 + }, + { + "epoch": 2.9248513245629844, + "grad_norm": 0.30187222361564636, + "learning_rate": 4.38794662400392e-06, + "loss": 0.4398, + "step": 5410 + }, + { + "epoch": 2.9253919625157687, + "grad_norm": 0.2787235975265503, + "learning_rate": 4.386072889787706e-06, + "loss": 0.3721, + "step": 5411 + }, + { + "epoch": 2.9259326004685526, + "grad_norm": 0.29677945375442505, + "learning_rate": 4.384199243104107e-06, + "loss": 0.4313, + "step": 5412 + }, + { + "epoch": 2.9264732384213374, + "grad_norm": 0.2835438847541809, + "learning_rate": 4.382325684220266e-06, + "loss": 0.4192, + "step": 5413 + }, + { + "epoch": 2.9270138763741214, + "grad_norm": 0.31162241101264954, + "learning_rate": 4.380452213403306e-06, + "loss": 0.4169, + "step": 5414 + }, + { + "epoch": 2.9275545143269057, + "grad_norm": 0.3062303066253662, + "learning_rate": 4.3785788309203466e-06, + "loss": 0.4379, + "step": 5415 + }, + { + "epoch": 2.92809515227969, + "grad_norm": 0.3071049153804779, + "learning_rate": 4.376705537038491e-06, + "loss": 0.3876, + "step": 5416 + }, + { + "epoch": 2.9286357902324744, + "grad_norm": 0.3191245198249817, + "learning_rate": 4.3748323320248325e-06, + "loss": 0.4023, + "step": 5417 + }, + { + "epoch": 2.929176428185259, + "grad_norm": 0.2853299379348755, + "learning_rate": 4.372959216146443e-06, + "loss": 0.366, + "step": 5418 + }, + { + "epoch": 2.9297170661380427, + "grad_norm": 0.3157528340816498, + "learning_rate": 4.371086189670393e-06, + "loss": 0.3921, + "step": 5419 + }, + { + "epoch": 2.930257704090827, + "grad_norm": 0.30653080344200134, + "learning_rate": 4.369213252863733e-06, + "loss": 0.3965, + "step": 5420 + }, + { + "epoch": 2.9307983420436114, + "grad_norm": 0.2971093952655792, + "learning_rate": 4.367340405993505e-06, + "loss": 0.4146, + "step": 5421 + }, + { + "epoch": 2.931338979996396, + "grad_norm": 0.3022221624851227, + "learning_rate": 4.3654676493267335e-06, + "loss": 0.3983, + "step": 5422 + }, + { + "epoch": 2.93187961794918, + "grad_norm": 0.30804184079170227, + "learning_rate": 4.363594983130435e-06, + "loss": 0.4365, + "step": 5423 + }, + { + "epoch": 2.932420255901964, + "grad_norm": 0.29444482922554016, + "learning_rate": 4.361722407671609e-06, + "loss": 0.3646, + "step": 5424 + }, + { + "epoch": 2.9329608938547485, + "grad_norm": 0.325641930103302, + "learning_rate": 4.359849923217246e-06, + "loss": 0.4196, + "step": 5425 + }, + { + "epoch": 2.933501531807533, + "grad_norm": 0.2978176474571228, + "learning_rate": 4.357977530034319e-06, + "loss": 0.3721, + "step": 5426 + }, + { + "epoch": 2.934042169760317, + "grad_norm": 0.3437287211418152, + "learning_rate": 4.356105228389792e-06, + "loss": 0.4467, + "step": 5427 + }, + { + "epoch": 2.9345828077131015, + "grad_norm": 0.3110363483428955, + "learning_rate": 4.3542330185506145e-06, + "loss": 0.3747, + "step": 5428 + }, + { + "epoch": 2.935123445665886, + "grad_norm": 0.310379296541214, + "learning_rate": 4.352360900783724e-06, + "loss": 0.403, + "step": 5429 + }, + { + "epoch": 2.9356640836186703, + "grad_norm": 0.34377607703208923, + "learning_rate": 4.350488875356041e-06, + "loss": 0.4229, + "step": 5430 + }, + { + "epoch": 2.936204721571454, + "grad_norm": 0.3300040364265442, + "learning_rate": 4.348616942534475e-06, + "loss": 0.3755, + "step": 5431 + }, + { + "epoch": 2.9367453595242385, + "grad_norm": 0.28814318776130676, + "learning_rate": 4.346745102585923e-06, + "loss": 0.3635, + "step": 5432 + }, + { + "epoch": 2.937285997477023, + "grad_norm": 0.2987310588359833, + "learning_rate": 4.344873355777274e-06, + "loss": 0.4341, + "step": 5433 + }, + { + "epoch": 2.9378266354298073, + "grad_norm": 0.2876870334148407, + "learning_rate": 4.3430017023753925e-06, + "loss": 0.4048, + "step": 5434 + }, + { + "epoch": 2.9383672733825916, + "grad_norm": 0.31516033411026, + "learning_rate": 4.341130142647136e-06, + "loss": 0.4341, + "step": 5435 + }, + { + "epoch": 2.9389079113353755, + "grad_norm": 0.2953851819038391, + "learning_rate": 4.339258676859349e-06, + "loss": 0.3732, + "step": 5436 + }, + { + "epoch": 2.93944854928816, + "grad_norm": 0.28782448172569275, + "learning_rate": 4.337387305278864e-06, + "loss": 0.4046, + "step": 5437 + }, + { + "epoch": 2.9399891872409443, + "grad_norm": 0.2743951082229614, + "learning_rate": 4.3355160281724935e-06, + "loss": 0.3728, + "step": 5438 + }, + { + "epoch": 2.9405298251937286, + "grad_norm": 0.28903907537460327, + "learning_rate": 4.333644845807044e-06, + "loss": 0.3713, + "step": 5439 + }, + { + "epoch": 2.941070463146513, + "grad_norm": 0.2945643365383148, + "learning_rate": 4.331773758449303e-06, + "loss": 0.3941, + "step": 5440 + }, + { + "epoch": 2.941611101099297, + "grad_norm": 0.3084539473056793, + "learning_rate": 4.329902766366051e-06, + "loss": 0.4243, + "step": 5441 + }, + { + "epoch": 2.9421517390520817, + "grad_norm": 0.2908715009689331, + "learning_rate": 4.328031869824044e-06, + "loss": 0.4025, + "step": 5442 + }, + { + "epoch": 2.9426923770048656, + "grad_norm": 0.29896068572998047, + "learning_rate": 4.3261610690900365e-06, + "loss": 0.4012, + "step": 5443 + }, + { + "epoch": 2.94323301495765, + "grad_norm": 0.29760220646858215, + "learning_rate": 4.324290364430761e-06, + "loss": 0.3691, + "step": 5444 + }, + { + "epoch": 2.9437736529104344, + "grad_norm": 0.2887779474258423, + "learning_rate": 4.322419756112943e-06, + "loss": 0.4186, + "step": 5445 + }, + { + "epoch": 2.9443142908632187, + "grad_norm": 0.3118552565574646, + "learning_rate": 4.320549244403285e-06, + "loss": 0.4232, + "step": 5446 + }, + { + "epoch": 2.944854928816003, + "grad_norm": 0.3034192621707916, + "learning_rate": 4.318678829568484e-06, + "loss": 0.3947, + "step": 5447 + }, + { + "epoch": 2.945395566768787, + "grad_norm": 0.294358491897583, + "learning_rate": 4.3168085118752205e-06, + "loss": 0.4275, + "step": 5448 + }, + { + "epoch": 2.9459362047215714, + "grad_norm": 0.31848475337028503, + "learning_rate": 4.314938291590161e-06, + "loss": 0.3882, + "step": 5449 + }, + { + "epoch": 2.9464768426743557, + "grad_norm": 0.279231458902359, + "learning_rate": 4.313068168979957e-06, + "loss": 0.4152, + "step": 5450 + }, + { + "epoch": 2.94701748062714, + "grad_norm": 0.32018351554870605, + "learning_rate": 4.3111981443112486e-06, + "loss": 0.4382, + "step": 5451 + }, + { + "epoch": 2.9475581185799244, + "grad_norm": 0.30242928862571716, + "learning_rate": 4.309328217850659e-06, + "loss": 0.3996, + "step": 5452 + }, + { + "epoch": 2.9480987565327084, + "grad_norm": 0.272429883480072, + "learning_rate": 4.3074583898648016e-06, + "loss": 0.3735, + "step": 5453 + }, + { + "epoch": 2.9486393944854927, + "grad_norm": 0.2945402264595032, + "learning_rate": 4.305588660620269e-06, + "loss": 0.4183, + "step": 5454 + }, + { + "epoch": 2.949180032438277, + "grad_norm": 0.31038814783096313, + "learning_rate": 4.303719030383648e-06, + "loss": 0.4091, + "step": 5455 + }, + { + "epoch": 2.9497206703910615, + "grad_norm": 0.301114946603775, + "learning_rate": 4.301849499421504e-06, + "loss": 0.4221, + "step": 5456 + }, + { + "epoch": 2.950261308343846, + "grad_norm": 0.28582969307899475, + "learning_rate": 4.299980068000395e-06, + "loss": 0.415, + "step": 5457 + }, + { + "epoch": 2.95080194629663, + "grad_norm": 0.2952626049518585, + "learning_rate": 4.2981107363868564e-06, + "loss": 0.4163, + "step": 5458 + }, + { + "epoch": 2.9513425842494145, + "grad_norm": 0.27767154574394226, + "learning_rate": 4.296241504847417e-06, + "loss": 0.3702, + "step": 5459 + }, + { + "epoch": 2.9518832222021985, + "grad_norm": 0.28652843832969666, + "learning_rate": 4.294372373648587e-06, + "loss": 0.3967, + "step": 5460 + }, + { + "epoch": 2.952423860154983, + "grad_norm": 0.3140363395214081, + "learning_rate": 4.292503343056866e-06, + "loss": 0.407, + "step": 5461 + }, + { + "epoch": 2.952964498107767, + "grad_norm": 0.30203577876091003, + "learning_rate": 4.290634413338735e-06, + "loss": 0.4303, + "step": 5462 + }, + { + "epoch": 2.9535051360605515, + "grad_norm": 0.2728308439254761, + "learning_rate": 4.288765584760663e-06, + "loss": 0.382, + "step": 5463 + }, + { + "epoch": 2.954045774013336, + "grad_norm": 0.28560322523117065, + "learning_rate": 4.286896857589103e-06, + "loss": 0.4026, + "step": 5464 + }, + { + "epoch": 2.95458641196612, + "grad_norm": 0.2892841398715973, + "learning_rate": 4.285028232090499e-06, + "loss": 0.4071, + "step": 5465 + }, + { + "epoch": 2.955127049918904, + "grad_norm": 0.27572664618492126, + "learning_rate": 4.283159708531272e-06, + "loss": 0.3624, + "step": 5466 + }, + { + "epoch": 2.9556676878716885, + "grad_norm": 0.3437039852142334, + "learning_rate": 4.281291287177833e-06, + "loss": 0.4019, + "step": 5467 + }, + { + "epoch": 2.956208325824473, + "grad_norm": 0.3084211051464081, + "learning_rate": 4.27942296829658e-06, + "loss": 0.4061, + "step": 5468 + }, + { + "epoch": 2.9567489637772573, + "grad_norm": 0.29442712664604187, + "learning_rate": 4.277554752153895e-06, + "loss": 0.3714, + "step": 5469 + }, + { + "epoch": 2.957289601730041, + "grad_norm": 0.2936548590660095, + "learning_rate": 4.275686639016142e-06, + "loss": 0.3708, + "step": 5470 + }, + { + "epoch": 2.957830239682826, + "grad_norm": 0.29006555676460266, + "learning_rate": 4.273818629149674e-06, + "loss": 0.3731, + "step": 5471 + }, + { + "epoch": 2.95837087763561, + "grad_norm": 0.2901497781276703, + "learning_rate": 4.2719507228208305e-06, + "loss": 0.4264, + "step": 5472 + }, + { + "epoch": 2.9589115155883943, + "grad_norm": 0.3107115626335144, + "learning_rate": 4.270082920295934e-06, + "loss": 0.3747, + "step": 5473 + }, + { + "epoch": 2.9594521535411786, + "grad_norm": 0.2886500060558319, + "learning_rate": 4.26821522184129e-06, + "loss": 0.4096, + "step": 5474 + }, + { + "epoch": 2.959992791493963, + "grad_norm": 0.2953193485736847, + "learning_rate": 4.266347627723192e-06, + "loss": 0.4331, + "step": 5475 + }, + { + "epoch": 2.9605334294467474, + "grad_norm": 0.3297775387763977, + "learning_rate": 4.26448013820792e-06, + "loss": 0.3978, + "step": 5476 + }, + { + "epoch": 2.9610740673995313, + "grad_norm": 0.3226925730705261, + "learning_rate": 4.262612753561736e-06, + "loss": 0.411, + "step": 5477 + }, + { + "epoch": 2.9616147053523156, + "grad_norm": 0.29509931802749634, + "learning_rate": 4.260745474050889e-06, + "loss": 0.4123, + "step": 5478 + }, + { + "epoch": 2.9621553433051, + "grad_norm": 0.2667893171310425, + "learning_rate": 4.258878299941612e-06, + "loss": 0.3607, + "step": 5479 + }, + { + "epoch": 2.9626959812578844, + "grad_norm": 0.3503236174583435, + "learning_rate": 4.257011231500122e-06, + "loss": 0.4425, + "step": 5480 + }, + { + "epoch": 2.9632366192106687, + "grad_norm": 0.27424386143684387, + "learning_rate": 4.2551442689926246e-06, + "loss": 0.3407, + "step": 5481 + }, + { + "epoch": 2.9637772571634526, + "grad_norm": 0.36829856038093567, + "learning_rate": 4.2532774126853075e-06, + "loss": 0.4627, + "step": 5482 + }, + { + "epoch": 2.964317895116237, + "grad_norm": 0.29745393991470337, + "learning_rate": 4.2514106628443415e-06, + "loss": 0.3851, + "step": 5483 + }, + { + "epoch": 2.9648585330690214, + "grad_norm": 0.3074614107608795, + "learning_rate": 4.249544019735886e-06, + "loss": 0.4216, + "step": 5484 + }, + { + "epoch": 2.9653991710218057, + "grad_norm": 0.290180504322052, + "learning_rate": 4.247677483626085e-06, + "loss": 0.4299, + "step": 5485 + }, + { + "epoch": 2.96593980897459, + "grad_norm": 0.3519513010978699, + "learning_rate": 4.245811054781065e-06, + "loss": 0.3814, + "step": 5486 + }, + { + "epoch": 2.9664804469273744, + "grad_norm": 0.34274551272392273, + "learning_rate": 4.243944733466935e-06, + "loss": 0.433, + "step": 5487 + }, + { + "epoch": 2.967021084880159, + "grad_norm": 0.3035910725593567, + "learning_rate": 4.242078519949795e-06, + "loss": 0.3677, + "step": 5488 + }, + { + "epoch": 2.9675617228329427, + "grad_norm": 0.3082733750343323, + "learning_rate": 4.240212414495724e-06, + "loss": 0.4269, + "step": 5489 + }, + { + "epoch": 2.968102360785727, + "grad_norm": 0.31705421209335327, + "learning_rate": 4.238346417370793e-06, + "loss": 0.4254, + "step": 5490 + }, + { + "epoch": 2.9686429987385115, + "grad_norm": 0.3229638934135437, + "learning_rate": 4.236480528841046e-06, + "loss": 0.3812, + "step": 5491 + }, + { + "epoch": 2.969183636691296, + "grad_norm": 0.3046262860298157, + "learning_rate": 4.234614749172521e-06, + "loss": 0.3773, + "step": 5492 + }, + { + "epoch": 2.96972427464408, + "grad_norm": 0.34981080889701843, + "learning_rate": 4.232749078631237e-06, + "loss": 0.4321, + "step": 5493 + }, + { + "epoch": 2.970264912596864, + "grad_norm": 0.27616679668426514, + "learning_rate": 4.2308835174832e-06, + "loss": 0.3885, + "step": 5494 + }, + { + "epoch": 2.9708055505496485, + "grad_norm": 0.3084140419960022, + "learning_rate": 4.229018065994396e-06, + "loss": 0.423, + "step": 5495 + }, + { + "epoch": 2.971346188502433, + "grad_norm": 0.3051964342594147, + "learning_rate": 4.2271527244307975e-06, + "loss": 0.4013, + "step": 5496 + }, + { + "epoch": 2.971886826455217, + "grad_norm": 0.31605881452560425, + "learning_rate": 4.225287493058362e-06, + "loss": 0.3895, + "step": 5497 + }, + { + "epoch": 2.9724274644080015, + "grad_norm": 0.3115125894546509, + "learning_rate": 4.223422372143034e-06, + "loss": 0.4296, + "step": 5498 + }, + { + "epoch": 2.9729681023607855, + "grad_norm": 0.2768317759037018, + "learning_rate": 4.221557361950734e-06, + "loss": 0.3807, + "step": 5499 + }, + { + "epoch": 2.9735087403135703, + "grad_norm": 0.3032225966453552, + "learning_rate": 4.2196924627473715e-06, + "loss": 0.3855, + "step": 5500 + }, + { + "epoch": 2.974049378266354, + "grad_norm": 0.29233628511428833, + "learning_rate": 4.217827674798845e-06, + "loss": 0.3735, + "step": 5501 + }, + { + "epoch": 2.9745900162191385, + "grad_norm": 0.313637375831604, + "learning_rate": 4.215962998371032e-06, + "loss": 0.4408, + "step": 5502 + }, + { + "epoch": 2.975130654171923, + "grad_norm": 0.27353334426879883, + "learning_rate": 4.214098433729792e-06, + "loss": 0.3802, + "step": 5503 + }, + { + "epoch": 2.9756712921247073, + "grad_norm": 0.31625521183013916, + "learning_rate": 4.212233981140972e-06, + "loss": 0.4286, + "step": 5504 + }, + { + "epoch": 2.9762119300774916, + "grad_norm": 0.28535178303718567, + "learning_rate": 4.210369640870403e-06, + "loss": 0.3642, + "step": 5505 + }, + { + "epoch": 2.9767525680302755, + "grad_norm": 0.26238372921943665, + "learning_rate": 4.208505413183899e-06, + "loss": 0.3838, + "step": 5506 + }, + { + "epoch": 2.97729320598306, + "grad_norm": 0.3076592683792114, + "learning_rate": 4.206641298347258e-06, + "loss": 0.3996, + "step": 5507 + }, + { + "epoch": 2.9778338439358443, + "grad_norm": 0.3326891362667084, + "learning_rate": 4.204777296626262e-06, + "loss": 0.4509, + "step": 5508 + }, + { + "epoch": 2.9783744818886286, + "grad_norm": 0.32030439376831055, + "learning_rate": 4.202913408286677e-06, + "loss": 0.3774, + "step": 5509 + }, + { + "epoch": 2.978915119841413, + "grad_norm": 0.31616464257240295, + "learning_rate": 4.201049633594254e-06, + "loss": 0.4099, + "step": 5510 + }, + { + "epoch": 2.979455757794197, + "grad_norm": 0.28750118613243103, + "learning_rate": 4.1991859728147245e-06, + "loss": 0.3809, + "step": 5511 + }, + { + "epoch": 2.9799963957469813, + "grad_norm": 0.3051993250846863, + "learning_rate": 4.1973224262138075e-06, + "loss": 0.4296, + "step": 5512 + }, + { + "epoch": 2.9805370336997656, + "grad_norm": 0.29199936985969543, + "learning_rate": 4.1954589940572035e-06, + "loss": 0.3975, + "step": 5513 + }, + { + "epoch": 2.98107767165255, + "grad_norm": 0.27910029888153076, + "learning_rate": 4.193595676610599e-06, + "loss": 0.4004, + "step": 5514 + }, + { + "epoch": 2.9816183096053344, + "grad_norm": 0.290517121553421, + "learning_rate": 4.1917324741396595e-06, + "loss": 0.3889, + "step": 5515 + }, + { + "epoch": 2.9821589475581187, + "grad_norm": 0.2989426851272583, + "learning_rate": 4.189869386910038e-06, + "loss": 0.3773, + "step": 5516 + }, + { + "epoch": 2.982699585510903, + "grad_norm": 0.31459930539131165, + "learning_rate": 4.18800641518737e-06, + "loss": 0.4488, + "step": 5517 + }, + { + "epoch": 2.983240223463687, + "grad_norm": 0.312425434589386, + "learning_rate": 4.1861435592372766e-06, + "loss": 0.422, + "step": 5518 + }, + { + "epoch": 2.9837808614164714, + "grad_norm": 0.29273882508277893, + "learning_rate": 4.184280819325358e-06, + "loss": 0.4171, + "step": 5519 + }, + { + "epoch": 2.9843214993692557, + "grad_norm": 0.30007869005203247, + "learning_rate": 4.1824181957172014e-06, + "loss": 0.3704, + "step": 5520 + }, + { + "epoch": 2.98486213732204, + "grad_norm": 0.3161482810974121, + "learning_rate": 4.180555688678376e-06, + "loss": 0.3924, + "step": 5521 + }, + { + "epoch": 2.9854027752748244, + "grad_norm": 0.3036750257015228, + "learning_rate": 4.1786932984744345e-06, + "loss": 0.3936, + "step": 5522 + }, + { + "epoch": 2.9859434132276084, + "grad_norm": 0.3443000316619873, + "learning_rate": 4.176831025370914e-06, + "loss": 0.395, + "step": 5523 + }, + { + "epoch": 2.9864840511803927, + "grad_norm": 0.34082910418510437, + "learning_rate": 4.174968869633333e-06, + "loss": 0.4453, + "step": 5524 + }, + { + "epoch": 2.987024689133177, + "grad_norm": 0.29087314009666443, + "learning_rate": 4.173106831527194e-06, + "loss": 0.3861, + "step": 5525 + }, + { + "epoch": 2.9875653270859615, + "grad_norm": 0.32361748814582825, + "learning_rate": 4.171244911317986e-06, + "loss": 0.4255, + "step": 5526 + }, + { + "epoch": 2.988105965038746, + "grad_norm": 0.28390243649482727, + "learning_rate": 4.169383109271174e-06, + "loss": 0.3854, + "step": 5527 + }, + { + "epoch": 2.9886466029915297, + "grad_norm": 0.35260480642318726, + "learning_rate": 4.167521425652212e-06, + "loss": 0.4224, + "step": 5528 + }, + { + "epoch": 2.9891872409443145, + "grad_norm": 0.32985785603523254, + "learning_rate": 4.165659860726535e-06, + "loss": 0.4329, + "step": 5529 + }, + { + "epoch": 2.9897278788970985, + "grad_norm": 0.29288801550865173, + "learning_rate": 4.163798414759566e-06, + "loss": 0.4038, + "step": 5530 + }, + { + "epoch": 2.990268516849883, + "grad_norm": 0.29671889543533325, + "learning_rate": 4.161937088016701e-06, + "loss": 0.4096, + "step": 5531 + }, + { + "epoch": 2.990809154802667, + "grad_norm": 0.2902495265007019, + "learning_rate": 4.160075880763325e-06, + "loss": 0.3764, + "step": 5532 + }, + { + "epoch": 2.9913497927554515, + "grad_norm": 0.28445449471473694, + "learning_rate": 4.158214793264808e-06, + "loss": 0.3758, + "step": 5533 + }, + { + "epoch": 2.991890430708236, + "grad_norm": 0.3205009698867798, + "learning_rate": 4.1563538257865e-06, + "loss": 0.4267, + "step": 5534 + }, + { + "epoch": 2.99243106866102, + "grad_norm": 0.2863154113292694, + "learning_rate": 4.154492978593733e-06, + "loss": 0.396, + "step": 5535 + }, + { + "epoch": 2.992971706613804, + "grad_norm": 0.28448721766471863, + "learning_rate": 4.1526322519518245e-06, + "loss": 0.4027, + "step": 5536 + }, + { + "epoch": 2.9935123445665885, + "grad_norm": 0.2850733995437622, + "learning_rate": 4.150771646126073e-06, + "loss": 0.3698, + "step": 5537 + }, + { + "epoch": 2.994052982519373, + "grad_norm": 0.34360191226005554, + "learning_rate": 4.148911161381763e-06, + "loss": 0.4403, + "step": 5538 + }, + { + "epoch": 2.9945936204721573, + "grad_norm": 0.2677406668663025, + "learning_rate": 4.147050797984152e-06, + "loss": 0.3659, + "step": 5539 + }, + { + "epoch": 2.995134258424941, + "grad_norm": 0.3009760081768036, + "learning_rate": 4.145190556198494e-06, + "loss": 0.4231, + "step": 5540 + }, + { + "epoch": 2.995674896377726, + "grad_norm": 0.30065473914146423, + "learning_rate": 4.143330436290016e-06, + "loss": 0.3978, + "step": 5541 + }, + { + "epoch": 2.99621553433051, + "grad_norm": 0.2900971472263336, + "learning_rate": 4.141470438523932e-06, + "loss": 0.3595, + "step": 5542 + }, + { + "epoch": 2.9967561722832943, + "grad_norm": 0.31957828998565674, + "learning_rate": 4.139610563165438e-06, + "loss": 0.4176, + "step": 5543 + }, + { + "epoch": 2.9972968102360786, + "grad_norm": 0.3145180344581604, + "learning_rate": 4.1377508104797075e-06, + "loss": 0.4469, + "step": 5544 + }, + { + "epoch": 2.997837448188863, + "grad_norm": 0.3263266980648041, + "learning_rate": 4.135891180731903e-06, + "loss": 0.4027, + "step": 5545 + }, + { + "epoch": 2.9983780861416474, + "grad_norm": 0.28440696001052856, + "learning_rate": 4.134031674187167e-06, + "loss": 0.399, + "step": 5546 + }, + { + "epoch": 2.9989187240944313, + "grad_norm": 0.30456775426864624, + "learning_rate": 4.132172291110626e-06, + "loss": 0.4125, + "step": 5547 + }, + { + "epoch": 2.9994593620472156, + "grad_norm": 0.3088439702987671, + "learning_rate": 4.130313031767386e-06, + "loss": 0.3781, + "step": 5548 + }, + { + "epoch": 3.0, + "grad_norm": 0.4863080382347107, + "learning_rate": 4.1284538964225364e-06, + "loss": 0.5696, + "step": 5549 + }, + { + "epoch": 3.0005406379527844, + "grad_norm": 0.33911392092704773, + "learning_rate": 4.1265948853411506e-06, + "loss": 0.3871, + "step": 5550 + }, + { + "epoch": 3.0010812759055687, + "grad_norm": 0.3231358528137207, + "learning_rate": 4.124735998788283e-06, + "loss": 0.3744, + "step": 5551 + }, + { + "epoch": 3.0016219138583526, + "grad_norm": 0.3206525146961212, + "learning_rate": 4.122877237028969e-06, + "loss": 0.3783, + "step": 5552 + }, + { + "epoch": 3.002162551811137, + "grad_norm": 0.3616611957550049, + "learning_rate": 4.1210186003282275e-06, + "loss": 0.3955, + "step": 5553 + }, + { + "epoch": 3.0027031897639214, + "grad_norm": 0.31090232729911804, + "learning_rate": 4.119160088951061e-06, + "loss": 0.3653, + "step": 5554 + }, + { + "epoch": 3.0032438277167057, + "grad_norm": 0.30760252475738525, + "learning_rate": 4.1173017031624544e-06, + "loss": 0.3766, + "step": 5555 + }, + { + "epoch": 3.00378446566949, + "grad_norm": 0.3399474322795868, + "learning_rate": 4.115443443227367e-06, + "loss": 0.3756, + "step": 5556 + }, + { + "epoch": 3.0043251036222745, + "grad_norm": 0.3576647937297821, + "learning_rate": 4.11358530941075e-06, + "loss": 0.3586, + "step": 5557 + }, + { + "epoch": 3.0048657415750584, + "grad_norm": 0.3041995167732239, + "learning_rate": 4.1117273019775326e-06, + "loss": 0.3681, + "step": 5558 + }, + { + "epoch": 3.0054063795278427, + "grad_norm": 0.31850144267082214, + "learning_rate": 4.109869421192628e-06, + "loss": 0.3922, + "step": 5559 + }, + { + "epoch": 3.005947017480627, + "grad_norm": 0.3466080129146576, + "learning_rate": 4.108011667320926e-06, + "loss": 0.4141, + "step": 5560 + }, + { + "epoch": 3.0064876554334115, + "grad_norm": 0.3188363313674927, + "learning_rate": 4.106154040627302e-06, + "loss": 0.3696, + "step": 5561 + }, + { + "epoch": 3.007028293386196, + "grad_norm": 0.32207629084587097, + "learning_rate": 4.104296541376616e-06, + "loss": 0.3754, + "step": 5562 + }, + { + "epoch": 3.00756893133898, + "grad_norm": 0.291013240814209, + "learning_rate": 4.102439169833705e-06, + "loss": 0.3674, + "step": 5563 + }, + { + "epoch": 3.008109569291764, + "grad_norm": 0.3080962896347046, + "learning_rate": 4.100581926263389e-06, + "loss": 0.3742, + "step": 5564 + }, + { + "epoch": 3.0086502072445485, + "grad_norm": 0.316063791513443, + "learning_rate": 4.098724810930472e-06, + "loss": 0.3762, + "step": 5565 + }, + { + "epoch": 3.009190845197333, + "grad_norm": 0.3149975836277008, + "learning_rate": 4.096867824099736e-06, + "loss": 0.3629, + "step": 5566 + }, + { + "epoch": 3.009731483150117, + "grad_norm": 0.2868281602859497, + "learning_rate": 4.09501096603595e-06, + "loss": 0.3745, + "step": 5567 + }, + { + "epoch": 3.0102721211029015, + "grad_norm": 0.3042278587818146, + "learning_rate": 4.093154237003858e-06, + "loss": 0.341, + "step": 5568 + }, + { + "epoch": 3.010812759055686, + "grad_norm": 0.32166436314582825, + "learning_rate": 4.091297637268191e-06, + "loss": 0.3601, + "step": 5569 + }, + { + "epoch": 3.01135339700847, + "grad_norm": 0.264839231967926, + "learning_rate": 4.08944116709366e-06, + "loss": 0.3842, + "step": 5570 + }, + { + "epoch": 3.011894034961254, + "grad_norm": 0.2903689444065094, + "learning_rate": 4.087584826744957e-06, + "loss": 0.3755, + "step": 5571 + }, + { + "epoch": 3.0124346729140385, + "grad_norm": 0.2931392192840576, + "learning_rate": 4.085728616486754e-06, + "loss": 0.3777, + "step": 5572 + }, + { + "epoch": 3.012975310866823, + "grad_norm": 0.2999896705150604, + "learning_rate": 4.083872536583708e-06, + "loss": 0.3731, + "step": 5573 + }, + { + "epoch": 3.0135159488196073, + "grad_norm": 0.30667611956596375, + "learning_rate": 4.082016587300453e-06, + "loss": 0.3729, + "step": 5574 + }, + { + "epoch": 3.0140565867723916, + "grad_norm": 0.3090130090713501, + "learning_rate": 4.08016076890161e-06, + "loss": 0.4096, + "step": 5575 + }, + { + "epoch": 3.0145972247251756, + "grad_norm": 0.30549952387809753, + "learning_rate": 4.078305081651776e-06, + "loss": 0.3932, + "step": 5576 + }, + { + "epoch": 3.01513786267796, + "grad_norm": 0.29738929867744446, + "learning_rate": 4.076449525815533e-06, + "loss": 0.3804, + "step": 5577 + }, + { + "epoch": 3.0156785006307443, + "grad_norm": 0.3099241256713867, + "learning_rate": 4.074594101657441e-06, + "loss": 0.3697, + "step": 5578 + }, + { + "epoch": 3.0162191385835286, + "grad_norm": 0.2834163308143616, + "learning_rate": 4.072738809442046e-06, + "loss": 0.3692, + "step": 5579 + }, + { + "epoch": 3.016759776536313, + "grad_norm": 0.3082650899887085, + "learning_rate": 4.0708836494338695e-06, + "loss": 0.3759, + "step": 5580 + }, + { + "epoch": 3.017300414489097, + "grad_norm": 0.3224112391471863, + "learning_rate": 4.069028621897417e-06, + "loss": 0.3847, + "step": 5581 + }, + { + "epoch": 3.0178410524418813, + "grad_norm": 0.3178497850894928, + "learning_rate": 4.067173727097176e-06, + "loss": 0.3752, + "step": 5582 + }, + { + "epoch": 3.0183816903946656, + "grad_norm": 0.34015393257141113, + "learning_rate": 4.065318965297615e-06, + "loss": 0.3724, + "step": 5583 + }, + { + "epoch": 3.01892232834745, + "grad_norm": 0.28791576623916626, + "learning_rate": 4.06346433676318e-06, + "loss": 0.3669, + "step": 5584 + }, + { + "epoch": 3.0194629663002344, + "grad_norm": 0.3029351532459259, + "learning_rate": 4.061609841758302e-06, + "loss": 0.3673, + "step": 5585 + }, + { + "epoch": 3.0200036042530187, + "grad_norm": 0.2851192057132721, + "learning_rate": 4.059755480547389e-06, + "loss": 0.3712, + "step": 5586 + }, + { + "epoch": 3.0205442422058026, + "grad_norm": 0.296670526266098, + "learning_rate": 4.057901253394839e-06, + "loss": 0.3641, + "step": 5587 + }, + { + "epoch": 3.021084880158587, + "grad_norm": 0.3029664158821106, + "learning_rate": 4.056047160565017e-06, + "loss": 0.3707, + "step": 5588 + }, + { + "epoch": 3.0216255181113714, + "grad_norm": 0.305296927690506, + "learning_rate": 4.0541932023222806e-06, + "loss": 0.3651, + "step": 5589 + }, + { + "epoch": 3.0221661560641557, + "grad_norm": 0.2943899631500244, + "learning_rate": 4.0523393789309625e-06, + "loss": 0.3876, + "step": 5590 + }, + { + "epoch": 3.02270679401694, + "grad_norm": 0.3069864511489868, + "learning_rate": 4.050485690655378e-06, + "loss": 0.393, + "step": 5591 + }, + { + "epoch": 3.0232474319697245, + "grad_norm": 0.3245830237865448, + "learning_rate": 4.048632137759821e-06, + "loss": 0.3731, + "step": 5592 + }, + { + "epoch": 3.0237880699225084, + "grad_norm": 0.3169190287590027, + "learning_rate": 4.0467787205085694e-06, + "loss": 0.3733, + "step": 5593 + }, + { + "epoch": 3.0243287078752927, + "grad_norm": 0.28116780519485474, + "learning_rate": 4.044925439165879e-06, + "loss": 0.3721, + "step": 5594 + }, + { + "epoch": 3.024869345828077, + "grad_norm": 0.27948346734046936, + "learning_rate": 4.04307229399599e-06, + "loss": 0.3731, + "step": 5595 + }, + { + "epoch": 3.0254099837808615, + "grad_norm": 0.32127025723457336, + "learning_rate": 4.041219285263116e-06, + "loss": 0.3831, + "step": 5596 + }, + { + "epoch": 3.025950621733646, + "grad_norm": 0.30989739298820496, + "learning_rate": 4.039366413231458e-06, + "loss": 0.3855, + "step": 5597 + }, + { + "epoch": 3.02649125968643, + "grad_norm": 0.2933104336261749, + "learning_rate": 4.037513678165196e-06, + "loss": 0.3745, + "step": 5598 + }, + { + "epoch": 3.027031897639214, + "grad_norm": 0.31462398171424866, + "learning_rate": 4.03566108032849e-06, + "loss": 0.3724, + "step": 5599 + }, + { + "epoch": 3.0275725355919985, + "grad_norm": 0.2990002930164337, + "learning_rate": 4.0338086199854765e-06, + "loss": 0.3613, + "step": 5600 + }, + { + "epoch": 3.028113173544783, + "grad_norm": 0.30223512649536133, + "learning_rate": 4.031956297400279e-06, + "loss": 0.406, + "step": 5601 + }, + { + "epoch": 3.028653811497567, + "grad_norm": 0.3021981716156006, + "learning_rate": 4.030104112836997e-06, + "loss": 0.3753, + "step": 5602 + }, + { + "epoch": 3.0291944494503515, + "grad_norm": 0.2970791757106781, + "learning_rate": 4.028252066559712e-06, + "loss": 0.3922, + "step": 5603 + }, + { + "epoch": 3.029735087403136, + "grad_norm": 0.2738272547721863, + "learning_rate": 4.026400158832486e-06, + "loss": 0.3916, + "step": 5604 + }, + { + "epoch": 3.03027572535592, + "grad_norm": 0.30851078033447266, + "learning_rate": 4.02454838991936e-06, + "loss": 0.3699, + "step": 5605 + }, + { + "epoch": 3.030816363308704, + "grad_norm": 0.2815330922603607, + "learning_rate": 4.022696760084355e-06, + "loss": 0.3881, + "step": 5606 + }, + { + "epoch": 3.0313570012614885, + "grad_norm": 0.2881687581539154, + "learning_rate": 4.020845269591474e-06, + "loss": 0.3756, + "step": 5607 + }, + { + "epoch": 3.031897639214273, + "grad_norm": 0.29471635818481445, + "learning_rate": 4.018993918704701e-06, + "loss": 0.3869, + "step": 5608 + }, + { + "epoch": 3.0324382771670573, + "grad_norm": 0.29964473843574524, + "learning_rate": 4.017142707687995e-06, + "loss": 0.3849, + "step": 5609 + }, + { + "epoch": 3.032978915119841, + "grad_norm": 0.3093760907649994, + "learning_rate": 4.0152916368053e-06, + "loss": 0.3662, + "step": 5610 + }, + { + "epoch": 3.0335195530726256, + "grad_norm": 0.2925645112991333, + "learning_rate": 4.013440706320537e-06, + "loss": 0.3823, + "step": 5611 + }, + { + "epoch": 3.03406019102541, + "grad_norm": 0.32020100951194763, + "learning_rate": 4.0115899164976125e-06, + "loss": 0.3713, + "step": 5612 + }, + { + "epoch": 3.0346008289781943, + "grad_norm": 0.2816184163093567, + "learning_rate": 4.009739267600403e-06, + "loss": 0.3553, + "step": 5613 + }, + { + "epoch": 3.0351414669309786, + "grad_norm": 0.30468878149986267, + "learning_rate": 4.007888759892773e-06, + "loss": 0.3694, + "step": 5614 + }, + { + "epoch": 3.035682104883763, + "grad_norm": 0.2784128487110138, + "learning_rate": 4.006038393638565e-06, + "loss": 0.3943, + "step": 5615 + }, + { + "epoch": 3.036222742836547, + "grad_norm": 0.2900444269180298, + "learning_rate": 4.004188169101603e-06, + "loss": 0.3794, + "step": 5616 + }, + { + "epoch": 3.0367633807893313, + "grad_norm": 0.2990294396877289, + "learning_rate": 4.002338086545684e-06, + "loss": 0.368, + "step": 5617 + }, + { + "epoch": 3.0373040187421156, + "grad_norm": 0.29112526774406433, + "learning_rate": 4.000488146234592e-06, + "loss": 0.4074, + "step": 5618 + }, + { + "epoch": 3.0378446566949, + "grad_norm": 0.2832828164100647, + "learning_rate": 3.9986383484320875e-06, + "loss": 0.3825, + "step": 5619 + }, + { + "epoch": 3.0383852946476844, + "grad_norm": 0.29137280583381653, + "learning_rate": 3.996788693401914e-06, + "loss": 0.3766, + "step": 5620 + }, + { + "epoch": 3.0389259326004687, + "grad_norm": 0.2979818880558014, + "learning_rate": 3.994939181407787e-06, + "loss": 0.378, + "step": 5621 + }, + { + "epoch": 3.0394665705532526, + "grad_norm": 0.3082057237625122, + "learning_rate": 3.99308981271341e-06, + "loss": 0.3754, + "step": 5622 + }, + { + "epoch": 3.040007208506037, + "grad_norm": 0.2824176251888275, + "learning_rate": 3.991240587582461e-06, + "loss": 0.362, + "step": 5623 + }, + { + "epoch": 3.0405478464588214, + "grad_norm": 0.2940672039985657, + "learning_rate": 3.989391506278603e-06, + "loss": 0.394, + "step": 5624 + }, + { + "epoch": 3.0410884844116057, + "grad_norm": 0.2896587550640106, + "learning_rate": 3.987542569065469e-06, + "loss": 0.3864, + "step": 5625 + }, + { + "epoch": 3.04162912236439, + "grad_norm": 0.2874264419078827, + "learning_rate": 3.98569377620668e-06, + "loss": 0.3654, + "step": 5626 + }, + { + "epoch": 3.0421697603171745, + "grad_norm": 0.28716790676116943, + "learning_rate": 3.983845127965834e-06, + "loss": 0.3939, + "step": 5627 + }, + { + "epoch": 3.0427103982699584, + "grad_norm": 0.2853327691555023, + "learning_rate": 3.981996624606509e-06, + "loss": 0.3952, + "step": 5628 + }, + { + "epoch": 3.0432510362227427, + "grad_norm": 0.28403279185295105, + "learning_rate": 3.980148266392257e-06, + "loss": 0.3773, + "step": 5629 + }, + { + "epoch": 3.043791674175527, + "grad_norm": 0.27723419666290283, + "learning_rate": 3.978300053586617e-06, + "loss": 0.407, + "step": 5630 + }, + { + "epoch": 3.0443323121283115, + "grad_norm": 0.3116014897823334, + "learning_rate": 3.9764519864531026e-06, + "loss": 0.3884, + "step": 5631 + }, + { + "epoch": 3.044872950081096, + "grad_norm": 0.27871468663215637, + "learning_rate": 3.974604065255208e-06, + "loss": 0.3865, + "step": 5632 + }, + { + "epoch": 3.04541358803388, + "grad_norm": 0.3040054440498352, + "learning_rate": 3.972756290256407e-06, + "loss": 0.3831, + "step": 5633 + }, + { + "epoch": 3.045954225986664, + "grad_norm": 0.30204376578330994, + "learning_rate": 3.970908661720151e-06, + "loss": 0.3485, + "step": 5634 + }, + { + "epoch": 3.0464948639394485, + "grad_norm": 0.33566614985466003, + "learning_rate": 3.969061179909872e-06, + "loss": 0.4, + "step": 5635 + }, + { + "epoch": 3.047035501892233, + "grad_norm": 0.28941500186920166, + "learning_rate": 3.967213845088983e-06, + "loss": 0.3739, + "step": 5636 + }, + { + "epoch": 3.047576139845017, + "grad_norm": 0.2895706593990326, + "learning_rate": 3.965366657520869e-06, + "loss": 0.3738, + "step": 5637 + }, + { + "epoch": 3.0481167777978015, + "grad_norm": 0.34246695041656494, + "learning_rate": 3.963519617468902e-06, + "loss": 0.3986, + "step": 5638 + }, + { + "epoch": 3.048657415750586, + "grad_norm": 0.30092760920524597, + "learning_rate": 3.961672725196428e-06, + "loss": 0.3581, + "step": 5639 + }, + { + "epoch": 3.04919805370337, + "grad_norm": 0.2948783040046692, + "learning_rate": 3.959825980966777e-06, + "loss": 0.3714, + "step": 5640 + }, + { + "epoch": 3.049738691656154, + "grad_norm": 0.2991983890533447, + "learning_rate": 3.957979385043249e-06, + "loss": 0.3702, + "step": 5641 + }, + { + "epoch": 3.0502793296089385, + "grad_norm": 0.32025763392448425, + "learning_rate": 3.956132937689131e-06, + "loss": 0.3529, + "step": 5642 + }, + { + "epoch": 3.050819967561723, + "grad_norm": 0.30374646186828613, + "learning_rate": 3.954286639167686e-06, + "loss": 0.3803, + "step": 5643 + }, + { + "epoch": 3.0513606055145073, + "grad_norm": 0.3011425733566284, + "learning_rate": 3.952440489742158e-06, + "loss": 0.378, + "step": 5644 + }, + { + "epoch": 3.051901243467291, + "grad_norm": 0.28585678339004517, + "learning_rate": 3.9505944896757635e-06, + "loss": 0.3779, + "step": 5645 + }, + { + "epoch": 3.0524418814200756, + "grad_norm": 0.2828434407711029, + "learning_rate": 3.948748639231704e-06, + "loss": 0.3628, + "step": 5646 + }, + { + "epoch": 3.05298251937286, + "grad_norm": 0.3154783248901367, + "learning_rate": 3.946902938673158e-06, + "loss": 0.375, + "step": 5647 + }, + { + "epoch": 3.0535231573256443, + "grad_norm": 0.3142322897911072, + "learning_rate": 3.945057388263282e-06, + "loss": 0.3565, + "step": 5648 + }, + { + "epoch": 3.0540637952784286, + "grad_norm": 0.30780693888664246, + "learning_rate": 3.943211988265211e-06, + "loss": 0.3578, + "step": 5649 + }, + { + "epoch": 3.054604433231213, + "grad_norm": 0.30567556619644165, + "learning_rate": 3.941366738942058e-06, + "loss": 0.3922, + "step": 5650 + }, + { + "epoch": 3.055145071183997, + "grad_norm": 0.3061469793319702, + "learning_rate": 3.939521640556915e-06, + "loss": 0.3899, + "step": 5651 + }, + { + "epoch": 3.0556857091367813, + "grad_norm": 0.3018254041671753, + "learning_rate": 3.937676693372857e-06, + "loss": 0.3567, + "step": 5652 + }, + { + "epoch": 3.0562263470895656, + "grad_norm": 0.3027184307575226, + "learning_rate": 3.935831897652927e-06, + "loss": 0.3733, + "step": 5653 + }, + { + "epoch": 3.05676698504235, + "grad_norm": 0.29756540060043335, + "learning_rate": 3.933987253660156e-06, + "loss": 0.4123, + "step": 5654 + }, + { + "epoch": 3.0573076229951344, + "grad_norm": 0.28348982334136963, + "learning_rate": 3.932142761657549e-06, + "loss": 0.3723, + "step": 5655 + }, + { + "epoch": 3.0578482609479187, + "grad_norm": 0.2800956070423126, + "learning_rate": 3.930298421908093e-06, + "loss": 0.3664, + "step": 5656 + }, + { + "epoch": 3.0583888989007026, + "grad_norm": 0.30221107602119446, + "learning_rate": 3.928454234674748e-06, + "loss": 0.3762, + "step": 5657 + }, + { + "epoch": 3.058929536853487, + "grad_norm": 0.29003018140792847, + "learning_rate": 3.926610200220453e-06, + "loss": 0.3943, + "step": 5658 + }, + { + "epoch": 3.0594701748062714, + "grad_norm": 0.27501335740089417, + "learning_rate": 3.924766318808132e-06, + "loss": 0.367, + "step": 5659 + }, + { + "epoch": 3.0600108127590557, + "grad_norm": 0.2978987693786621, + "learning_rate": 3.922922590700679e-06, + "loss": 0.3734, + "step": 5660 + }, + { + "epoch": 3.06055145071184, + "grad_norm": 0.295075386762619, + "learning_rate": 3.92107901616097e-06, + "loss": 0.3926, + "step": 5661 + }, + { + "epoch": 3.0610920886646245, + "grad_norm": 0.3045724034309387, + "learning_rate": 3.919235595451858e-06, + "loss": 0.3673, + "step": 5662 + }, + { + "epoch": 3.0616327266174084, + "grad_norm": 0.28967294096946716, + "learning_rate": 3.917392328836177e-06, + "loss": 0.3806, + "step": 5663 + }, + { + "epoch": 3.0621733645701927, + "grad_norm": 0.3140982687473297, + "learning_rate": 3.9155492165767336e-06, + "loss": 0.3908, + "step": 5664 + }, + { + "epoch": 3.062714002522977, + "grad_norm": 0.2868986427783966, + "learning_rate": 3.913706258936317e-06, + "loss": 0.3851, + "step": 5665 + }, + { + "epoch": 3.0632546404757615, + "grad_norm": 0.28521233797073364, + "learning_rate": 3.911863456177692e-06, + "loss": 0.3717, + "step": 5666 + }, + { + "epoch": 3.063795278428546, + "grad_norm": 0.3208141326904297, + "learning_rate": 3.910020808563603e-06, + "loss": 0.3994, + "step": 5667 + }, + { + "epoch": 3.06433591638133, + "grad_norm": 0.3301263749599457, + "learning_rate": 3.908178316356772e-06, + "loss": 0.3587, + "step": 5668 + }, + { + "epoch": 3.064876554334114, + "grad_norm": 0.2989030182361603, + "learning_rate": 3.906335979819896e-06, + "loss": 0.3682, + "step": 5669 + }, + { + "epoch": 3.0654171922868985, + "grad_norm": 0.28814154863357544, + "learning_rate": 3.904493799215652e-06, + "loss": 0.3759, + "step": 5670 + }, + { + "epoch": 3.065957830239683, + "grad_norm": 0.3023150861263275, + "learning_rate": 3.902651774806696e-06, + "loss": 0.3759, + "step": 5671 + }, + { + "epoch": 3.066498468192467, + "grad_norm": 0.29809585213661194, + "learning_rate": 3.90080990685566e-06, + "loss": 0.3781, + "step": 5672 + }, + { + "epoch": 3.0670391061452515, + "grad_norm": 0.28152433037757874, + "learning_rate": 3.898968195625157e-06, + "loss": 0.396, + "step": 5673 + }, + { + "epoch": 3.067579744098036, + "grad_norm": 0.282197505235672, + "learning_rate": 3.897126641377771e-06, + "loss": 0.3671, + "step": 5674 + }, + { + "epoch": 3.06812038205082, + "grad_norm": 0.301152765750885, + "learning_rate": 3.895285244376068e-06, + "loss": 0.3644, + "step": 5675 + }, + { + "epoch": 3.068661020003604, + "grad_norm": 0.30550146102905273, + "learning_rate": 3.893444004882593e-06, + "loss": 0.3612, + "step": 5676 + }, + { + "epoch": 3.0692016579563886, + "grad_norm": 0.280722051858902, + "learning_rate": 3.8916029231598655e-06, + "loss": 0.3675, + "step": 5677 + }, + { + "epoch": 3.069742295909173, + "grad_norm": 0.3012804388999939, + "learning_rate": 3.889761999470383e-06, + "loss": 0.3529, + "step": 5678 + }, + { + "epoch": 3.0702829338619573, + "grad_norm": 0.2924235463142395, + "learning_rate": 3.887921234076621e-06, + "loss": 0.3994, + "step": 5679 + }, + { + "epoch": 3.070823571814741, + "grad_norm": 0.26820337772369385, + "learning_rate": 3.886080627241034e-06, + "loss": 0.3565, + "step": 5680 + }, + { + "epoch": 3.0713642097675256, + "grad_norm": 0.3064356744289398, + "learning_rate": 3.884240179226053e-06, + "loss": 0.3973, + "step": 5681 + }, + { + "epoch": 3.07190484772031, + "grad_norm": 0.3161008656024933, + "learning_rate": 3.882399890294083e-06, + "loss": 0.3618, + "step": 5682 + }, + { + "epoch": 3.0724454856730943, + "grad_norm": 0.28452223539352417, + "learning_rate": 3.880559760707508e-06, + "loss": 0.3775, + "step": 5683 + }, + { + "epoch": 3.0729861236258786, + "grad_norm": 0.29817309975624084, + "learning_rate": 3.878719790728695e-06, + "loss": 0.3659, + "step": 5684 + }, + { + "epoch": 3.073526761578663, + "grad_norm": 0.2943325638771057, + "learning_rate": 3.876879980619982e-06, + "loss": 0.3801, + "step": 5685 + }, + { + "epoch": 3.074067399531447, + "grad_norm": 0.2672979533672333, + "learning_rate": 3.875040330643684e-06, + "loss": 0.3571, + "step": 5686 + }, + { + "epoch": 3.0746080374842313, + "grad_norm": 0.2672359347343445, + "learning_rate": 3.873200841062095e-06, + "loss": 0.3749, + "step": 5687 + }, + { + "epoch": 3.0751486754370156, + "grad_norm": 0.3029986321926117, + "learning_rate": 3.871361512137487e-06, + "loss": 0.3842, + "step": 5688 + }, + { + "epoch": 3.0756893133898, + "grad_norm": 0.30264905095100403, + "learning_rate": 3.86952234413211e-06, + "loss": 0.3935, + "step": 5689 + }, + { + "epoch": 3.0762299513425844, + "grad_norm": 0.2739836275577545, + "learning_rate": 3.8676833373081864e-06, + "loss": 0.3643, + "step": 5690 + }, + { + "epoch": 3.0767705892953687, + "grad_norm": 0.285391241312027, + "learning_rate": 3.8658444919279195e-06, + "loss": 0.3678, + "step": 5691 + }, + { + "epoch": 3.0773112272481526, + "grad_norm": 0.2950049638748169, + "learning_rate": 3.864005808253488e-06, + "loss": 0.3632, + "step": 5692 + }, + { + "epoch": 3.077851865200937, + "grad_norm": 0.26679345965385437, + "learning_rate": 3.8621672865470505e-06, + "loss": 0.3662, + "step": 5693 + }, + { + "epoch": 3.0783925031537214, + "grad_norm": 0.2964516878128052, + "learning_rate": 3.860328927070737e-06, + "loss": 0.3809, + "step": 5694 + }, + { + "epoch": 3.0789331411065057, + "grad_norm": 0.2846154272556305, + "learning_rate": 3.8584907300866595e-06, + "loss": 0.3832, + "step": 5695 + }, + { + "epoch": 3.07947377905929, + "grad_norm": 0.28660333156585693, + "learning_rate": 3.8566526958569025e-06, + "loss": 0.3679, + "step": 5696 + }, + { + "epoch": 3.0800144170120745, + "grad_norm": 0.2944757640361786, + "learning_rate": 3.8548148246435345e-06, + "loss": 0.371, + "step": 5697 + }, + { + "epoch": 3.0805550549648584, + "grad_norm": 0.29508352279663086, + "learning_rate": 3.8529771167085894e-06, + "loss": 0.3701, + "step": 5698 + }, + { + "epoch": 3.0810956929176427, + "grad_norm": 0.2802375257015228, + "learning_rate": 3.851139572314088e-06, + "loss": 0.4006, + "step": 5699 + }, + { + "epoch": 3.081636330870427, + "grad_norm": 0.294456422328949, + "learning_rate": 3.8493021917220225e-06, + "loss": 0.3991, + "step": 5700 + }, + { + "epoch": 3.0821769688232115, + "grad_norm": 0.31199342012405396, + "learning_rate": 3.847464975194366e-06, + "loss": 0.389, + "step": 5701 + }, + { + "epoch": 3.082717606775996, + "grad_norm": 0.28415027260780334, + "learning_rate": 3.845627922993062e-06, + "loss": 0.3664, + "step": 5702 + }, + { + "epoch": 3.08325824472878, + "grad_norm": 0.3120895326137543, + "learning_rate": 3.843791035380036e-06, + "loss": 0.3991, + "step": 5703 + }, + { + "epoch": 3.083798882681564, + "grad_norm": 0.29247161746025085, + "learning_rate": 3.841954312617188e-06, + "loss": 0.3734, + "step": 5704 + }, + { + "epoch": 3.0843395206343485, + "grad_norm": 0.28878575563430786, + "learning_rate": 3.840117754966396e-06, + "loss": 0.3824, + "step": 5705 + }, + { + "epoch": 3.084880158587133, + "grad_norm": 0.2994827926158905, + "learning_rate": 3.8382813626895095e-06, + "loss": 0.3827, + "step": 5706 + }, + { + "epoch": 3.085420796539917, + "grad_norm": 0.28692129254341125, + "learning_rate": 3.83644513604836e-06, + "loss": 0.3502, + "step": 5707 + }, + { + "epoch": 3.0859614344927015, + "grad_norm": 0.2684290409088135, + "learning_rate": 3.834609075304754e-06, + "loss": 0.3819, + "step": 5708 + }, + { + "epoch": 3.0865020724454855, + "grad_norm": 0.3084404766559601, + "learning_rate": 3.832773180720475e-06, + "loss": 0.3682, + "step": 5709 + }, + { + "epoch": 3.08704271039827, + "grad_norm": 0.28687548637390137, + "learning_rate": 3.8309374525572765e-06, + "loss": 0.4114, + "step": 5710 + }, + { + "epoch": 3.087583348351054, + "grad_norm": 0.2810053825378418, + "learning_rate": 3.829101891076896e-06, + "loss": 0.3751, + "step": 5711 + }, + { + "epoch": 3.0881239863038386, + "grad_norm": 0.2930857241153717, + "learning_rate": 3.827266496541047e-06, + "loss": 0.3969, + "step": 5712 + }, + { + "epoch": 3.088664624256623, + "grad_norm": 0.3972669243812561, + "learning_rate": 3.825431269211416e-06, + "loss": 0.4011, + "step": 5713 + }, + { + "epoch": 3.0892052622094073, + "grad_norm": 0.28189221024513245, + "learning_rate": 3.823596209349662e-06, + "loss": 0.3759, + "step": 5714 + }, + { + "epoch": 3.089745900162191, + "grad_norm": 0.2784714996814728, + "learning_rate": 3.821761317217428e-06, + "loss": 0.3675, + "step": 5715 + }, + { + "epoch": 3.0902865381149756, + "grad_norm": 0.3123396337032318, + "learning_rate": 3.819926593076329e-06, + "loss": 0.3839, + "step": 5716 + }, + { + "epoch": 3.09082717606776, + "grad_norm": 0.2724792957305908, + "learning_rate": 3.818092037187959e-06, + "loss": 0.3636, + "step": 5717 + }, + { + "epoch": 3.0913678140205443, + "grad_norm": 0.27904802560806274, + "learning_rate": 3.816257649813881e-06, + "loss": 0.3567, + "step": 5718 + }, + { + "epoch": 3.0919084519733286, + "grad_norm": 0.29857680201530457, + "learning_rate": 3.8144234312156413e-06, + "loss": 0.3951, + "step": 5719 + }, + { + "epoch": 3.092449089926113, + "grad_norm": 0.29130810499191284, + "learning_rate": 3.81258938165476e-06, + "loss": 0.3747, + "step": 5720 + }, + { + "epoch": 3.092989727878897, + "grad_norm": 0.2917025089263916, + "learning_rate": 3.8107555013927334e-06, + "loss": 0.3658, + "step": 5721 + }, + { + "epoch": 3.0935303658316813, + "grad_norm": 0.2853851020336151, + "learning_rate": 3.8089217906910274e-06, + "loss": 0.3988, + "step": 5722 + }, + { + "epoch": 3.0940710037844656, + "grad_norm": 0.2891107499599457, + "learning_rate": 3.8070882498110946e-06, + "loss": 0.3775, + "step": 5723 + }, + { + "epoch": 3.09461164173725, + "grad_norm": 0.2973926067352295, + "learning_rate": 3.805254879014356e-06, + "loss": 0.3951, + "step": 5724 + }, + { + "epoch": 3.0951522796900344, + "grad_norm": 0.2941666841506958, + "learning_rate": 3.803421678562213e-06, + "loss": 0.3773, + "step": 5725 + }, + { + "epoch": 3.0956929176428187, + "grad_norm": 0.2840958833694458, + "learning_rate": 3.8015886487160347e-06, + "loss": 0.363, + "step": 5726 + }, + { + "epoch": 3.0962335555956026, + "grad_norm": 0.30144110321998596, + "learning_rate": 3.799755789737175e-06, + "loss": 0.3756, + "step": 5727 + }, + { + "epoch": 3.096774193548387, + "grad_norm": 0.28447234630584717, + "learning_rate": 3.7979231018869578e-06, + "loss": 0.3637, + "step": 5728 + }, + { + "epoch": 3.0973148315011714, + "grad_norm": 0.29409652948379517, + "learning_rate": 3.7960905854266865e-06, + "loss": 0.4036, + "step": 5729 + }, + { + "epoch": 3.0978554694539557, + "grad_norm": 0.2620141804218292, + "learning_rate": 3.794258240617636e-06, + "loss": 0.3884, + "step": 5730 + }, + { + "epoch": 3.09839610740674, + "grad_norm": 0.2847660183906555, + "learning_rate": 3.792426067721059e-06, + "loss": 0.3819, + "step": 5731 + }, + { + "epoch": 3.0989367453595245, + "grad_norm": 0.31244954466819763, + "learning_rate": 3.790594066998184e-06, + "loss": 0.3752, + "step": 5732 + }, + { + "epoch": 3.0994773833123084, + "grad_norm": 0.28319838643074036, + "learning_rate": 3.788762238710215e-06, + "loss": 0.3779, + "step": 5733 + }, + { + "epoch": 3.1000180212650927, + "grad_norm": 0.29199621081352234, + "learning_rate": 3.786930583118329e-06, + "loss": 0.3779, + "step": 5734 + }, + { + "epoch": 3.100558659217877, + "grad_norm": 0.3003768026828766, + "learning_rate": 3.7850991004836813e-06, + "loss": 0.376, + "step": 5735 + }, + { + "epoch": 3.1010992971706615, + "grad_norm": 0.3079928457736969, + "learning_rate": 3.7832677910674005e-06, + "loss": 0.3879, + "step": 5736 + }, + { + "epoch": 3.101639935123446, + "grad_norm": 0.29220160841941833, + "learning_rate": 3.781436655130592e-06, + "loss": 0.3818, + "step": 5737 + }, + { + "epoch": 3.1021805730762297, + "grad_norm": 0.32593148946762085, + "learning_rate": 3.7796056929343384e-06, + "loss": 0.3809, + "step": 5738 + }, + { + "epoch": 3.102721211029014, + "grad_norm": 0.2868834137916565, + "learning_rate": 3.77777490473969e-06, + "loss": 0.3781, + "step": 5739 + }, + { + "epoch": 3.1032618489817985, + "grad_norm": 0.2728521227836609, + "learning_rate": 3.7759442908076786e-06, + "loss": 0.3779, + "step": 5740 + }, + { + "epoch": 3.103802486934583, + "grad_norm": 0.2837616503238678, + "learning_rate": 3.774113851399312e-06, + "loss": 0.3562, + "step": 5741 + }, + { + "epoch": 3.104343124887367, + "grad_norm": 0.2798760235309601, + "learning_rate": 3.772283586775572e-06, + "loss": 0.3721, + "step": 5742 + }, + { + "epoch": 3.1048837628401516, + "grad_norm": 0.27324506640434265, + "learning_rate": 3.77045349719741e-06, + "loss": 0.3916, + "step": 5743 + }, + { + "epoch": 3.1054244007929355, + "grad_norm": 0.2995244264602661, + "learning_rate": 3.7686235829257587e-06, + "loss": 0.383, + "step": 5744 + }, + { + "epoch": 3.10596503874572, + "grad_norm": 0.29339131712913513, + "learning_rate": 3.7667938442215247e-06, + "loss": 0.3611, + "step": 5745 + }, + { + "epoch": 3.106505676698504, + "grad_norm": 0.2979373335838318, + "learning_rate": 3.7649642813455893e-06, + "loss": 0.3562, + "step": 5746 + }, + { + "epoch": 3.1070463146512886, + "grad_norm": 0.3166741728782654, + "learning_rate": 3.7631348945588064e-06, + "loss": 0.3613, + "step": 5747 + }, + { + "epoch": 3.107586952604073, + "grad_norm": 0.29804494976997375, + "learning_rate": 3.761305684122008e-06, + "loss": 0.3868, + "step": 5748 + }, + { + "epoch": 3.1081275905568573, + "grad_norm": 0.2938231825828552, + "learning_rate": 3.759476650295999e-06, + "loss": 0.3866, + "step": 5749 + }, + { + "epoch": 3.108668228509641, + "grad_norm": 0.3072431981563568, + "learning_rate": 3.7576477933415612e-06, + "loss": 0.373, + "step": 5750 + }, + { + "epoch": 3.1092088664624256, + "grad_norm": 0.3000989556312561, + "learning_rate": 3.755819113519447e-06, + "loss": 0.4068, + "step": 5751 + }, + { + "epoch": 3.10974950441521, + "grad_norm": 0.30347174406051636, + "learning_rate": 3.7539906110903885e-06, + "loss": 0.3819, + "step": 5752 + }, + { + "epoch": 3.1102901423679943, + "grad_norm": 0.2933652698993683, + "learning_rate": 3.7521622863150887e-06, + "loss": 0.3845, + "step": 5753 + }, + { + "epoch": 3.1108307803207786, + "grad_norm": 0.3027891218662262, + "learning_rate": 3.7503341394542305e-06, + "loss": 0.3929, + "step": 5754 + }, + { + "epoch": 3.111371418273563, + "grad_norm": 0.31118011474609375, + "learning_rate": 3.748506170768462e-06, + "loss": 0.3432, + "step": 5755 + }, + { + "epoch": 3.111912056226347, + "grad_norm": 0.29796212911605835, + "learning_rate": 3.7466783805184146e-06, + "loss": 0.3837, + "step": 5756 + }, + { + "epoch": 3.1124526941791313, + "grad_norm": 0.26798829436302185, + "learning_rate": 3.744850768964692e-06, + "loss": 0.3675, + "step": 5757 + }, + { + "epoch": 3.1129933321319156, + "grad_norm": 0.27877551317214966, + "learning_rate": 3.743023336367872e-06, + "loss": 0.379, + "step": 5758 + }, + { + "epoch": 3.1135339700847, + "grad_norm": 0.3038302958011627, + "learning_rate": 3.7411960829885042e-06, + "loss": 0.3816, + "step": 5759 + }, + { + "epoch": 3.1140746080374844, + "grad_norm": 0.3026898205280304, + "learning_rate": 3.739369009087117e-06, + "loss": 0.3847, + "step": 5760 + }, + { + "epoch": 3.1146152459902687, + "grad_norm": 0.29449212551116943, + "learning_rate": 3.7375421149242102e-06, + "loss": 0.3868, + "step": 5761 + }, + { + "epoch": 3.1151558839430527, + "grad_norm": 0.2972925007343292, + "learning_rate": 3.7357154007602612e-06, + "loss": 0.3549, + "step": 5762 + }, + { + "epoch": 3.115696521895837, + "grad_norm": 0.3053343892097473, + "learning_rate": 3.733888866855717e-06, + "loss": 0.3998, + "step": 5763 + }, + { + "epoch": 3.1162371598486214, + "grad_norm": 0.28995615243911743, + "learning_rate": 3.732062513471002e-06, + "loss": 0.4114, + "step": 5764 + }, + { + "epoch": 3.1167777978014057, + "grad_norm": 0.2779656946659088, + "learning_rate": 3.7302363408665155e-06, + "loss": 0.3538, + "step": 5765 + }, + { + "epoch": 3.11731843575419, + "grad_norm": 0.3097536861896515, + "learning_rate": 3.7284103493026312e-06, + "loss": 0.3837, + "step": 5766 + }, + { + "epoch": 3.117859073706974, + "grad_norm": 0.2779688239097595, + "learning_rate": 3.7265845390396915e-06, + "loss": 0.3603, + "step": 5767 + }, + { + "epoch": 3.1183997116597584, + "grad_norm": 0.2756264805793762, + "learning_rate": 3.72475891033802e-06, + "loss": 0.3803, + "step": 5768 + }, + { + "epoch": 3.1189403496125427, + "grad_norm": 0.27909591794013977, + "learning_rate": 3.7229334634579093e-06, + "loss": 0.378, + "step": 5769 + }, + { + "epoch": 3.119480987565327, + "grad_norm": 0.27548834681510925, + "learning_rate": 3.721108198659633e-06, + "loss": 0.3862, + "step": 5770 + }, + { + "epoch": 3.1200216255181115, + "grad_norm": 0.28045108914375305, + "learning_rate": 3.7192831162034292e-06, + "loss": 0.3906, + "step": 5771 + }, + { + "epoch": 3.120562263470896, + "grad_norm": 0.28375497460365295, + "learning_rate": 3.7174582163495167e-06, + "loss": 0.3755, + "step": 5772 + }, + { + "epoch": 3.1211029014236797, + "grad_norm": 0.26914161443710327, + "learning_rate": 3.7156334993580854e-06, + "loss": 0.3632, + "step": 5773 + }, + { + "epoch": 3.121643539376464, + "grad_norm": 0.2826704978942871, + "learning_rate": 3.7138089654893027e-06, + "loss": 0.3828, + "step": 5774 + }, + { + "epoch": 3.1221841773292485, + "grad_norm": 0.2840440273284912, + "learning_rate": 3.7119846150033047e-06, + "loss": 0.3661, + "step": 5775 + }, + { + "epoch": 3.122724815282033, + "grad_norm": 0.30267229676246643, + "learning_rate": 3.710160448160205e-06, + "loss": 0.3846, + "step": 5776 + }, + { + "epoch": 3.123265453234817, + "grad_norm": 0.3090881109237671, + "learning_rate": 3.7083364652200902e-06, + "loss": 0.3697, + "step": 5777 + }, + { + "epoch": 3.1238060911876016, + "grad_norm": 0.2867450416088104, + "learning_rate": 3.706512666443022e-06, + "loss": 0.3787, + "step": 5778 + }, + { + "epoch": 3.1243467291403855, + "grad_norm": 0.3012264370918274, + "learning_rate": 3.7046890520890295e-06, + "loss": 0.3548, + "step": 5779 + }, + { + "epoch": 3.12488736709317, + "grad_norm": 0.28534871339797974, + "learning_rate": 3.702865622418125e-06, + "loss": 0.3618, + "step": 5780 + }, + { + "epoch": 3.125428005045954, + "grad_norm": 0.2811122238636017, + "learning_rate": 3.701042377690287e-06, + "loss": 0.3762, + "step": 5781 + }, + { + "epoch": 3.1259686429987386, + "grad_norm": 0.2708125114440918, + "learning_rate": 3.6992193181654747e-06, + "loss": 0.3701, + "step": 5782 + }, + { + "epoch": 3.126509280951523, + "grad_norm": 0.2966213524341583, + "learning_rate": 3.697396444103611e-06, + "loss": 0.3905, + "step": 5783 + }, + { + "epoch": 3.1270499189043073, + "grad_norm": 0.3003375828266144, + "learning_rate": 3.695573755764601e-06, + "loss": 0.3591, + "step": 5784 + }, + { + "epoch": 3.127590556857091, + "grad_norm": 0.295940637588501, + "learning_rate": 3.693751253408319e-06, + "loss": 0.3811, + "step": 5785 + }, + { + "epoch": 3.1281311948098756, + "grad_norm": 0.2861578166484833, + "learning_rate": 3.6919289372946167e-06, + "loss": 0.3657, + "step": 5786 + }, + { + "epoch": 3.12867183276266, + "grad_norm": 0.3032360374927521, + "learning_rate": 3.6901068076833136e-06, + "loss": 0.3778, + "step": 5787 + }, + { + "epoch": 3.1292124707154443, + "grad_norm": 0.2989215850830078, + "learning_rate": 3.688284864834207e-06, + "loss": 0.361, + "step": 5788 + }, + { + "epoch": 3.1297531086682286, + "grad_norm": 0.31468385457992554, + "learning_rate": 3.6864631090070656e-06, + "loss": 0.3696, + "step": 5789 + }, + { + "epoch": 3.130293746621013, + "grad_norm": 0.3105284571647644, + "learning_rate": 3.6846415404616344e-06, + "loss": 0.3659, + "step": 5790 + }, + { + "epoch": 3.130834384573797, + "grad_norm": 0.2899421751499176, + "learning_rate": 3.6828201594576253e-06, + "loss": 0.3778, + "step": 5791 + }, + { + "epoch": 3.1313750225265813, + "grad_norm": 0.3059687912464142, + "learning_rate": 3.6809989662547306e-06, + "loss": 0.388, + "step": 5792 + }, + { + "epoch": 3.1319156604793656, + "grad_norm": 0.29407617449760437, + "learning_rate": 3.679177961112611e-06, + "loss": 0.3737, + "step": 5793 + }, + { + "epoch": 3.13245629843215, + "grad_norm": 0.3061511218547821, + "learning_rate": 3.6773571442909055e-06, + "loss": 0.4101, + "step": 5794 + }, + { + "epoch": 3.1329969363849344, + "grad_norm": 0.29926207661628723, + "learning_rate": 3.6755365160492187e-06, + "loss": 0.388, + "step": 5795 + }, + { + "epoch": 3.1335375743377183, + "grad_norm": 0.31455034017562866, + "learning_rate": 3.673716076647133e-06, + "loss": 0.3842, + "step": 5796 + }, + { + "epoch": 3.1340782122905027, + "grad_norm": 0.2747032642364502, + "learning_rate": 3.6718958263442052e-06, + "loss": 0.3595, + "step": 5797 + }, + { + "epoch": 3.134618850243287, + "grad_norm": 0.2993357479572296, + "learning_rate": 3.670075765399963e-06, + "loss": 0.3736, + "step": 5798 + }, + { + "epoch": 3.1351594881960714, + "grad_norm": 0.29780399799346924, + "learning_rate": 3.6682558940739053e-06, + "loss": 0.3625, + "step": 5799 + }, + { + "epoch": 3.1357001261488557, + "grad_norm": 0.29055240750312805, + "learning_rate": 3.6664362126255087e-06, + "loss": 0.3645, + "step": 5800 + }, + { + "epoch": 3.13624076410164, + "grad_norm": 0.27076342701911926, + "learning_rate": 3.6646167213142187e-06, + "loss": 0.3953, + "step": 5801 + }, + { + "epoch": 3.136781402054424, + "grad_norm": 0.2839217185974121, + "learning_rate": 3.6627974203994555e-06, + "loss": 0.3804, + "step": 5802 + }, + { + "epoch": 3.1373220400072084, + "grad_norm": 0.30638810992240906, + "learning_rate": 3.660978310140612e-06, + "loss": 0.373, + "step": 5803 + }, + { + "epoch": 3.1378626779599927, + "grad_norm": 0.2920389175415039, + "learning_rate": 3.659159390797053e-06, + "loss": 0.3669, + "step": 5804 + }, + { + "epoch": 3.138403315912777, + "grad_norm": 0.29665204882621765, + "learning_rate": 3.657340662628116e-06, + "loss": 0.3778, + "step": 5805 + }, + { + "epoch": 3.1389439538655615, + "grad_norm": 0.31666800379753113, + "learning_rate": 3.6555221258931137e-06, + "loss": 0.3883, + "step": 5806 + }, + { + "epoch": 3.139484591818346, + "grad_norm": 0.2841288149356842, + "learning_rate": 3.653703780851331e-06, + "loss": 0.3593, + "step": 5807 + }, + { + "epoch": 3.1400252297711297, + "grad_norm": 0.2947778105735779, + "learning_rate": 3.651885627762019e-06, + "loss": 0.3727, + "step": 5808 + }, + { + "epoch": 3.140565867723914, + "grad_norm": 0.32678353786468506, + "learning_rate": 3.650067666884411e-06, + "loss": 0.3583, + "step": 5809 + }, + { + "epoch": 3.1411065056766985, + "grad_norm": 0.2785697281360626, + "learning_rate": 3.648249898477707e-06, + "loss": 0.3766, + "step": 5810 + }, + { + "epoch": 3.141647143629483, + "grad_norm": 0.2932497560977936, + "learning_rate": 3.6464323228010845e-06, + "loss": 0.369, + "step": 5811 + }, + { + "epoch": 3.142187781582267, + "grad_norm": 0.323253333568573, + "learning_rate": 3.6446149401136847e-06, + "loss": 0.3988, + "step": 5812 + }, + { + "epoch": 3.1427284195350516, + "grad_norm": 0.27933064103126526, + "learning_rate": 3.6427977506746293e-06, + "loss": 0.3695, + "step": 5813 + }, + { + "epoch": 3.1432690574878355, + "grad_norm": 0.293927937746048, + "learning_rate": 3.64098075474301e-06, + "loss": 0.3955, + "step": 5814 + }, + { + "epoch": 3.14380969544062, + "grad_norm": 0.30490684509277344, + "learning_rate": 3.6391639525778915e-06, + "loss": 0.3935, + "step": 5815 + }, + { + "epoch": 3.144350333393404, + "grad_norm": 0.2889328598976135, + "learning_rate": 3.6373473444383083e-06, + "loss": 0.3622, + "step": 5816 + }, + { + "epoch": 3.1448909713461886, + "grad_norm": 0.2749510705471039, + "learning_rate": 3.6355309305832698e-06, + "loss": 0.3764, + "step": 5817 + }, + { + "epoch": 3.145431609298973, + "grad_norm": 0.2956562936306, + "learning_rate": 3.6337147112717575e-06, + "loss": 0.3717, + "step": 5818 + }, + { + "epoch": 3.1459722472517573, + "grad_norm": 0.29461973905563354, + "learning_rate": 3.631898686762726e-06, + "loss": 0.3658, + "step": 5819 + }, + { + "epoch": 3.146512885204541, + "grad_norm": 0.2857483923435211, + "learning_rate": 3.6300828573150977e-06, + "loss": 0.3576, + "step": 5820 + }, + { + "epoch": 3.1470535231573256, + "grad_norm": 0.29898321628570557, + "learning_rate": 3.6282672231877714e-06, + "loss": 0.3526, + "step": 5821 + }, + { + "epoch": 3.14759416111011, + "grad_norm": 0.3071466088294983, + "learning_rate": 3.6264517846396174e-06, + "loss": 0.3775, + "step": 5822 + }, + { + "epoch": 3.1481347990628943, + "grad_norm": 0.30894705653190613, + "learning_rate": 3.6246365419294805e-06, + "loss": 0.3945, + "step": 5823 + }, + { + "epoch": 3.1486754370156786, + "grad_norm": 0.31548750400543213, + "learning_rate": 3.622821495316169e-06, + "loss": 0.3726, + "step": 5824 + }, + { + "epoch": 3.1492160749684626, + "grad_norm": 0.29202592372894287, + "learning_rate": 3.621006645058472e-06, + "loss": 0.3875, + "step": 5825 + }, + { + "epoch": 3.149756712921247, + "grad_norm": 0.2818850874900818, + "learning_rate": 3.619191991415146e-06, + "loss": 0.3945, + "step": 5826 + }, + { + "epoch": 3.1502973508740313, + "grad_norm": 0.2867623269557953, + "learning_rate": 3.6173775346449253e-06, + "loss": 0.3725, + "step": 5827 + }, + { + "epoch": 3.1508379888268156, + "grad_norm": 0.3019481301307678, + "learning_rate": 3.6155632750065074e-06, + "loss": 0.3734, + "step": 5828 + }, + { + "epoch": 3.1513786267796, + "grad_norm": 0.282046377658844, + "learning_rate": 3.6137492127585667e-06, + "loss": 0.3599, + "step": 5829 + }, + { + "epoch": 3.1519192647323844, + "grad_norm": 0.29858216643333435, + "learning_rate": 3.6119353481597504e-06, + "loss": 0.3739, + "step": 5830 + }, + { + "epoch": 3.1524599026851683, + "grad_norm": 0.30497896671295166, + "learning_rate": 3.610121681468676e-06, + "loss": 0.3975, + "step": 5831 + }, + { + "epoch": 3.1530005406379527, + "grad_norm": 0.3183683753013611, + "learning_rate": 3.608308212943932e-06, + "loss": 0.3812, + "step": 5832 + }, + { + "epoch": 3.153541178590737, + "grad_norm": 0.29859915375709534, + "learning_rate": 3.6064949428440787e-06, + "loss": 0.3697, + "step": 5833 + }, + { + "epoch": 3.1540818165435214, + "grad_norm": 0.3005802035331726, + "learning_rate": 3.6046818714276512e-06, + "loss": 0.3741, + "step": 5834 + }, + { + "epoch": 3.1546224544963057, + "grad_norm": 0.27503278851509094, + "learning_rate": 3.6028689989531533e-06, + "loss": 0.3855, + "step": 5835 + }, + { + "epoch": 3.15516309244909, + "grad_norm": 0.2953731417655945, + "learning_rate": 3.6010563256790587e-06, + "loss": 0.3773, + "step": 5836 + }, + { + "epoch": 3.155703730401874, + "grad_norm": 0.28386104106903076, + "learning_rate": 3.599243851863816e-06, + "loss": 0.3872, + "step": 5837 + }, + { + "epoch": 3.1562443683546584, + "grad_norm": 0.275812029838562, + "learning_rate": 3.5974315777658463e-06, + "loss": 0.3592, + "step": 5838 + }, + { + "epoch": 3.1567850063074427, + "grad_norm": 0.2900319993495941, + "learning_rate": 3.595619503643541e-06, + "loss": 0.3715, + "step": 5839 + }, + { + "epoch": 3.157325644260227, + "grad_norm": 0.2936703562736511, + "learning_rate": 3.593807629755258e-06, + "loss": 0.3966, + "step": 5840 + }, + { + "epoch": 3.1578662822130115, + "grad_norm": 0.27260488271713257, + "learning_rate": 3.591995956359335e-06, + "loss": 0.3958, + "step": 5841 + }, + { + "epoch": 3.158406920165796, + "grad_norm": 0.2719656825065613, + "learning_rate": 3.5901844837140743e-06, + "loss": 0.363, + "step": 5842 + }, + { + "epoch": 3.1589475581185797, + "grad_norm": 0.29166799783706665, + "learning_rate": 3.588373212077756e-06, + "loss": 0.3669, + "step": 5843 + }, + { + "epoch": 3.159488196071364, + "grad_norm": 0.29750218987464905, + "learning_rate": 3.586562141708624e-06, + "loss": 0.396, + "step": 5844 + }, + { + "epoch": 3.1600288340241485, + "grad_norm": 0.2957509458065033, + "learning_rate": 3.584751272864899e-06, + "loss": 0.3881, + "step": 5845 + }, + { + "epoch": 3.160569471976933, + "grad_norm": 0.28142228722572327, + "learning_rate": 3.582940605804771e-06, + "loss": 0.3775, + "step": 5846 + }, + { + "epoch": 3.161110109929717, + "grad_norm": 0.3182094991207123, + "learning_rate": 3.581130140786404e-06, + "loss": 0.3683, + "step": 5847 + }, + { + "epoch": 3.1616507478825016, + "grad_norm": 0.32549846172332764, + "learning_rate": 3.579319878067927e-06, + "loss": 0.3822, + "step": 5848 + }, + { + "epoch": 3.1621913858352855, + "grad_norm": 0.28233757615089417, + "learning_rate": 3.5775098179074476e-06, + "loss": 0.3824, + "step": 5849 + }, + { + "epoch": 3.16273202378807, + "grad_norm": 0.2875683903694153, + "learning_rate": 3.575699960563038e-06, + "loss": 0.3833, + "step": 5850 + }, + { + "epoch": 3.163272661740854, + "grad_norm": 0.28352779150009155, + "learning_rate": 3.5738903062927477e-06, + "loss": 0.3551, + "step": 5851 + }, + { + "epoch": 3.1638132996936386, + "grad_norm": 0.27940163016319275, + "learning_rate": 3.5720808553545894e-06, + "loss": 0.3766, + "step": 5852 + }, + { + "epoch": 3.164353937646423, + "grad_norm": 0.2728763818740845, + "learning_rate": 3.5702716080065546e-06, + "loss": 0.3512, + "step": 5853 + }, + { + "epoch": 3.164894575599207, + "grad_norm": 0.2832372188568115, + "learning_rate": 3.568462564506602e-06, + "loss": 0.3751, + "step": 5854 + }, + { + "epoch": 3.165435213551991, + "grad_norm": 0.2728310525417328, + "learning_rate": 3.566653725112661e-06, + "loss": 0.3546, + "step": 5855 + }, + { + "epoch": 3.1659758515047756, + "grad_norm": 0.26566481590270996, + "learning_rate": 3.564845090082633e-06, + "loss": 0.3927, + "step": 5856 + }, + { + "epoch": 3.16651648945756, + "grad_norm": 0.31351128220558167, + "learning_rate": 3.56303665967439e-06, + "loss": 0.392, + "step": 5857 + }, + { + "epoch": 3.1670571274103443, + "grad_norm": 0.3172876536846161, + "learning_rate": 3.5612284341457743e-06, + "loss": 0.3841, + "step": 5858 + }, + { + "epoch": 3.1675977653631286, + "grad_norm": 0.28172582387924194, + "learning_rate": 3.5594204137546005e-06, + "loss": 0.3869, + "step": 5859 + }, + { + "epoch": 3.168138403315913, + "grad_norm": 0.2776242792606354, + "learning_rate": 3.557612598758652e-06, + "loss": 0.3898, + "step": 5860 + }, + { + "epoch": 3.168679041268697, + "grad_norm": 0.2968648672103882, + "learning_rate": 3.5558049894156836e-06, + "loss": 0.3938, + "step": 5861 + }, + { + "epoch": 3.1692196792214813, + "grad_norm": 0.3051604926586151, + "learning_rate": 3.5539975859834216e-06, + "loss": 0.3765, + "step": 5862 + }, + { + "epoch": 3.1697603171742657, + "grad_norm": 0.296024888753891, + "learning_rate": 3.5521903887195637e-06, + "loss": 0.3815, + "step": 5863 + }, + { + "epoch": 3.17030095512705, + "grad_norm": 0.2797988951206207, + "learning_rate": 3.5503833978817733e-06, + "loss": 0.3605, + "step": 5864 + }, + { + "epoch": 3.1708415930798344, + "grad_norm": 0.2781578004360199, + "learning_rate": 3.5485766137276894e-06, + "loss": 0.3932, + "step": 5865 + }, + { + "epoch": 3.1713822310326183, + "grad_norm": 0.27553510665893555, + "learning_rate": 3.546770036514919e-06, + "loss": 0.3753, + "step": 5866 + }, + { + "epoch": 3.1719228689854027, + "grad_norm": 0.2807295024394989, + "learning_rate": 3.5449636665010433e-06, + "loss": 0.3821, + "step": 5867 + }, + { + "epoch": 3.172463506938187, + "grad_norm": 0.29495182633399963, + "learning_rate": 3.543157503943613e-06, + "loss": 0.3627, + "step": 5868 + }, + { + "epoch": 3.1730041448909714, + "grad_norm": 0.2861941158771515, + "learning_rate": 3.541351549100141e-06, + "loss": 0.3734, + "step": 5869 + }, + { + "epoch": 3.1735447828437557, + "grad_norm": 0.32167086005210876, + "learning_rate": 3.5395458022281205e-06, + "loss": 0.3863, + "step": 5870 + }, + { + "epoch": 3.17408542079654, + "grad_norm": 0.2828007936477661, + "learning_rate": 3.5377402635850123e-06, + "loss": 0.3767, + "step": 5871 + }, + { + "epoch": 3.174626058749324, + "grad_norm": 0.31658950448036194, + "learning_rate": 3.5359349334282466e-06, + "loss": 0.358, + "step": 5872 + }, + { + "epoch": 3.1751666967021084, + "grad_norm": 0.2916194498538971, + "learning_rate": 3.5341298120152224e-06, + "loss": 0.3965, + "step": 5873 + }, + { + "epoch": 3.1757073346548927, + "grad_norm": 0.30083373188972473, + "learning_rate": 3.532324899603312e-06, + "loss": 0.3728, + "step": 5874 + }, + { + "epoch": 3.176247972607677, + "grad_norm": 0.31124067306518555, + "learning_rate": 3.5305201964498557e-06, + "loss": 0.3932, + "step": 5875 + }, + { + "epoch": 3.1767886105604615, + "grad_norm": 0.2781218886375427, + "learning_rate": 3.5287157028121676e-06, + "loss": 0.3709, + "step": 5876 + }, + { + "epoch": 3.177329248513246, + "grad_norm": 0.3084629774093628, + "learning_rate": 3.5269114189475255e-06, + "loss": 0.4255, + "step": 5877 + }, + { + "epoch": 3.1778698864660297, + "grad_norm": 0.31070947647094727, + "learning_rate": 3.5251073451131824e-06, + "loss": 0.3753, + "step": 5878 + }, + { + "epoch": 3.178410524418814, + "grad_norm": 0.3050396144390106, + "learning_rate": 3.52330348156636e-06, + "loss": 0.3829, + "step": 5879 + }, + { + "epoch": 3.1789511623715985, + "grad_norm": 0.29054388403892517, + "learning_rate": 3.5214998285642517e-06, + "loss": 0.3592, + "step": 5880 + }, + { + "epoch": 3.179491800324383, + "grad_norm": 0.2981637418270111, + "learning_rate": 3.5196963863640147e-06, + "loss": 0.3663, + "step": 5881 + }, + { + "epoch": 3.180032438277167, + "grad_norm": 0.2945329248905182, + "learning_rate": 3.5178931552227837e-06, + "loss": 0.3823, + "step": 5882 + }, + { + "epoch": 3.180573076229951, + "grad_norm": 0.31379660964012146, + "learning_rate": 3.516090135397659e-06, + "loss": 0.3866, + "step": 5883 + }, + { + "epoch": 3.1811137141827355, + "grad_norm": 0.28403589129447937, + "learning_rate": 3.5142873271457132e-06, + "loss": 0.3963, + "step": 5884 + }, + { + "epoch": 3.18165435213552, + "grad_norm": 0.2573620676994324, + "learning_rate": 3.5124847307239863e-06, + "loss": 0.3902, + "step": 5885 + }, + { + "epoch": 3.182194990088304, + "grad_norm": 0.26139920949935913, + "learning_rate": 3.5106823463894884e-06, + "loss": 0.3825, + "step": 5886 + }, + { + "epoch": 3.1827356280410886, + "grad_norm": 0.290723979473114, + "learning_rate": 3.508880174399202e-06, + "loss": 0.3845, + "step": 5887 + }, + { + "epoch": 3.183276265993873, + "grad_norm": 0.31544339656829834, + "learning_rate": 3.507078215010077e-06, + "loss": 0.3835, + "step": 5888 + }, + { + "epoch": 3.1838169039466573, + "grad_norm": 0.27778783440589905, + "learning_rate": 3.505276468479033e-06, + "loss": 0.383, + "step": 5889 + }, + { + "epoch": 3.184357541899441, + "grad_norm": 0.2908482551574707, + "learning_rate": 3.5034749350629593e-06, + "loss": 0.3796, + "step": 5890 + }, + { + "epoch": 3.1848981798522256, + "grad_norm": 0.3057006895542145, + "learning_rate": 3.501673615018717e-06, + "loss": 0.3382, + "step": 5891 + }, + { + "epoch": 3.18543881780501, + "grad_norm": 0.29850953817367554, + "learning_rate": 3.4998725086031353e-06, + "loss": 0.3731, + "step": 5892 + }, + { + "epoch": 3.1859794557577943, + "grad_norm": 0.3245144486427307, + "learning_rate": 3.49807161607301e-06, + "loss": 0.3903, + "step": 5893 + }, + { + "epoch": 3.1865200937105786, + "grad_norm": 0.2751791477203369, + "learning_rate": 3.496270937685109e-06, + "loss": 0.3835, + "step": 5894 + }, + { + "epoch": 3.1870607316633626, + "grad_norm": 0.3082123398780823, + "learning_rate": 3.4944704736961722e-06, + "loss": 0.3621, + "step": 5895 + }, + { + "epoch": 3.187601369616147, + "grad_norm": 0.3176043927669525, + "learning_rate": 3.4926702243629075e-06, + "loss": 0.3404, + "step": 5896 + }, + { + "epoch": 3.1881420075689313, + "grad_norm": 0.26966592669487, + "learning_rate": 3.490870189941987e-06, + "loss": 0.3726, + "step": 5897 + }, + { + "epoch": 3.1886826455217157, + "grad_norm": 0.29819998145103455, + "learning_rate": 3.4890703706900596e-06, + "loss": 0.4085, + "step": 5898 + }, + { + "epoch": 3.1892232834745, + "grad_norm": 0.2900310158729553, + "learning_rate": 3.4872707668637387e-06, + "loss": 0.3579, + "step": 5899 + }, + { + "epoch": 3.1897639214272844, + "grad_norm": 0.3027432858943939, + "learning_rate": 3.4854713787196105e-06, + "loss": 0.3992, + "step": 5900 + }, + { + "epoch": 3.1903045593800683, + "grad_norm": 0.2828446328639984, + "learning_rate": 3.483672206514226e-06, + "loss": 0.3548, + "step": 5901 + }, + { + "epoch": 3.1908451973328527, + "grad_norm": 0.2956213057041168, + "learning_rate": 3.4818732505041085e-06, + "loss": 0.3749, + "step": 5902 + }, + { + "epoch": 3.191385835285637, + "grad_norm": 0.30007094144821167, + "learning_rate": 3.48007451094575e-06, + "loss": 0.3817, + "step": 5903 + }, + { + "epoch": 3.1919264732384214, + "grad_norm": 0.28123414516448975, + "learning_rate": 3.478275988095615e-06, + "loss": 0.3866, + "step": 5904 + }, + { + "epoch": 3.1924671111912057, + "grad_norm": 0.31397712230682373, + "learning_rate": 3.4764776822101275e-06, + "loss": 0.3592, + "step": 5905 + }, + { + "epoch": 3.19300774914399, + "grad_norm": 0.28460144996643066, + "learning_rate": 3.47467959354569e-06, + "loss": 0.3806, + "step": 5906 + }, + { + "epoch": 3.193548387096774, + "grad_norm": 0.3189926743507385, + "learning_rate": 3.472881722358671e-06, + "loss": 0.3946, + "step": 5907 + }, + { + "epoch": 3.1940890250495584, + "grad_norm": 0.2741137444972992, + "learning_rate": 3.471084068905409e-06, + "loss": 0.3796, + "step": 5908 + }, + { + "epoch": 3.1946296630023427, + "grad_norm": 0.2794220745563507, + "learning_rate": 3.4692866334422063e-06, + "loss": 0.3621, + "step": 5909 + }, + { + "epoch": 3.195170300955127, + "grad_norm": 0.2943795323371887, + "learning_rate": 3.4674894162253404e-06, + "loss": 0.3867, + "step": 5910 + }, + { + "epoch": 3.1957109389079115, + "grad_norm": 0.27503079175949097, + "learning_rate": 3.4656924175110544e-06, + "loss": 0.3585, + "step": 5911 + }, + { + "epoch": 3.1962515768606954, + "grad_norm": 0.3133552074432373, + "learning_rate": 3.463895637555563e-06, + "loss": 0.3775, + "step": 5912 + }, + { + "epoch": 3.1967922148134797, + "grad_norm": 0.2888537049293518, + "learning_rate": 3.4620990766150453e-06, + "loss": 0.3638, + "step": 5913 + }, + { + "epoch": 3.197332852766264, + "grad_norm": 0.2793637812137604, + "learning_rate": 3.460302734945653e-06, + "loss": 0.3757, + "step": 5914 + }, + { + "epoch": 3.1978734907190485, + "grad_norm": 0.2881743907928467, + "learning_rate": 3.458506612803505e-06, + "loss": 0.3795, + "step": 5915 + }, + { + "epoch": 3.198414128671833, + "grad_norm": 0.2902590334415436, + "learning_rate": 3.4567107104446906e-06, + "loss": 0.375, + "step": 5916 + }, + { + "epoch": 3.198954766624617, + "grad_norm": 0.3024289608001709, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.3933, + "step": 5917 + }, + { + "epoch": 3.1994954045774016, + "grad_norm": 0.29701340198516846, + "learning_rate": 3.4531195661012506e-06, + "loss": 0.3992, + "step": 5918 + }, + { + "epoch": 3.2000360425301855, + "grad_norm": 0.28090935945510864, + "learning_rate": 3.451324324628645e-06, + "loss": 0.3847, + "step": 5919 + }, + { + "epoch": 3.20057668048297, + "grad_norm": 0.29121866822242737, + "learning_rate": 3.4495293039634113e-06, + "loss": 0.3854, + "step": 5920 + }, + { + "epoch": 3.201117318435754, + "grad_norm": 0.2985329031944275, + "learning_rate": 3.4477345043614762e-06, + "loss": 0.3944, + "step": 5921 + }, + { + "epoch": 3.2016579563885386, + "grad_norm": 0.2983471751213074, + "learning_rate": 3.445939926078741e-06, + "loss": 0.3774, + "step": 5922 + }, + { + "epoch": 3.202198594341323, + "grad_norm": 0.30166491866111755, + "learning_rate": 3.444145569371073e-06, + "loss": 0.4013, + "step": 5923 + }, + { + "epoch": 3.202739232294107, + "grad_norm": 0.2873641550540924, + "learning_rate": 3.442351434494311e-06, + "loss": 0.3625, + "step": 5924 + }, + { + "epoch": 3.203279870246891, + "grad_norm": 0.29263344407081604, + "learning_rate": 3.440557521704256e-06, + "loss": 0.3699, + "step": 5925 + }, + { + "epoch": 3.2038205081996756, + "grad_norm": 0.30022427439689636, + "learning_rate": 3.4387638312566817e-06, + "loss": 0.3887, + "step": 5926 + }, + { + "epoch": 3.20436114615246, + "grad_norm": 0.2768106162548065, + "learning_rate": 3.4369703634073293e-06, + "loss": 0.373, + "step": 5927 + }, + { + "epoch": 3.2049017841052443, + "grad_norm": 0.30571630597114563, + "learning_rate": 3.4351771184119104e-06, + "loss": 0.3851, + "step": 5928 + }, + { + "epoch": 3.2054424220580287, + "grad_norm": 0.29890185594558716, + "learning_rate": 3.433384096526099e-06, + "loss": 0.3643, + "step": 5929 + }, + { + "epoch": 3.2059830600108126, + "grad_norm": 0.346992164850235, + "learning_rate": 3.4315912980055433e-06, + "loss": 0.387, + "step": 5930 + }, + { + "epoch": 3.206523697963597, + "grad_norm": 0.25692644715309143, + "learning_rate": 3.429798723105856e-06, + "loss": 0.3648, + "step": 5931 + }, + { + "epoch": 3.2070643359163813, + "grad_norm": 0.27923527359962463, + "learning_rate": 3.4280063720826203e-06, + "loss": 0.3656, + "step": 5932 + }, + { + "epoch": 3.2076049738691657, + "grad_norm": 0.36623239517211914, + "learning_rate": 3.4262142451913865e-06, + "loss": 0.3856, + "step": 5933 + }, + { + "epoch": 3.20814561182195, + "grad_norm": 0.31749051809310913, + "learning_rate": 3.424422342687671e-06, + "loss": 0.364, + "step": 5934 + }, + { + "epoch": 3.2086862497747344, + "grad_norm": 0.28651195764541626, + "learning_rate": 3.4226306648269616e-06, + "loss": 0.3564, + "step": 5935 + }, + { + "epoch": 3.2092268877275183, + "grad_norm": 0.29628702998161316, + "learning_rate": 3.420839211864712e-06, + "loss": 0.3576, + "step": 5936 + }, + { + "epoch": 3.2097675256803027, + "grad_norm": 0.289095401763916, + "learning_rate": 3.419047984056346e-06, + "loss": 0.3795, + "step": 5937 + }, + { + "epoch": 3.210308163633087, + "grad_norm": 0.2880508601665497, + "learning_rate": 3.417256981657251e-06, + "loss": 0.3869, + "step": 5938 + }, + { + "epoch": 3.2108488015858714, + "grad_norm": 0.30657216906547546, + "learning_rate": 3.4154662049227848e-06, + "loss": 0.3809, + "step": 5939 + }, + { + "epoch": 3.2113894395386557, + "grad_norm": 0.2976471483707428, + "learning_rate": 3.413675654108275e-06, + "loss": 0.373, + "step": 5940 + }, + { + "epoch": 3.21193007749144, + "grad_norm": 0.27903828024864197, + "learning_rate": 3.4118853294690148e-06, + "loss": 0.3802, + "step": 5941 + }, + { + "epoch": 3.212470715444224, + "grad_norm": 0.2901027500629425, + "learning_rate": 3.410095231260263e-06, + "loss": 0.3923, + "step": 5942 + }, + { + "epoch": 3.2130113533970084, + "grad_norm": 0.2960743308067322, + "learning_rate": 3.4083053597372517e-06, + "loss": 0.3554, + "step": 5943 + }, + { + "epoch": 3.2135519913497927, + "grad_norm": 0.2988324463367462, + "learning_rate": 3.406515715155176e-06, + "loss": 0.3729, + "step": 5944 + }, + { + "epoch": 3.214092629302577, + "grad_norm": 0.2973558306694031, + "learning_rate": 3.4047262977692014e-06, + "loss": 0.3838, + "step": 5945 + }, + { + "epoch": 3.2146332672553615, + "grad_norm": 0.2790868282318115, + "learning_rate": 3.4029371078344576e-06, + "loss": 0.3736, + "step": 5946 + }, + { + "epoch": 3.215173905208146, + "grad_norm": 0.30476298928260803, + "learning_rate": 3.4011481456060457e-06, + "loss": 0.3829, + "step": 5947 + }, + { + "epoch": 3.2157145431609298, + "grad_norm": 0.29460281133651733, + "learning_rate": 3.3993594113390316e-06, + "loss": 0.4076, + "step": 5948 + }, + { + "epoch": 3.216255181113714, + "grad_norm": 0.2972364127635956, + "learning_rate": 3.397570905288453e-06, + "loss": 0.377, + "step": 5949 + }, + { + "epoch": 3.2167958190664985, + "grad_norm": 0.2778155207633972, + "learning_rate": 3.3957826277093074e-06, + "loss": 0.3802, + "step": 5950 + }, + { + "epoch": 3.217336457019283, + "grad_norm": 0.26630061864852905, + "learning_rate": 3.3939945788565664e-06, + "loss": 0.3561, + "step": 5951 + }, + { + "epoch": 3.217877094972067, + "grad_norm": 0.28079941868782043, + "learning_rate": 3.392206758985165e-06, + "loss": 0.3567, + "step": 5952 + }, + { + "epoch": 3.218417732924851, + "grad_norm": 0.2812572121620178, + "learning_rate": 3.390419168350012e-06, + "loss": 0.3904, + "step": 5953 + }, + { + "epoch": 3.2189583708776355, + "grad_norm": 0.3071571886539459, + "learning_rate": 3.3886318072059733e-06, + "loss": 0.3763, + "step": 5954 + }, + { + "epoch": 3.21949900883042, + "grad_norm": 0.28525233268737793, + "learning_rate": 3.3868446758078897e-06, + "loss": 0.3895, + "step": 5955 + }, + { + "epoch": 3.220039646783204, + "grad_norm": 0.2900713384151459, + "learning_rate": 3.3850577744105682e-06, + "loss": 0.38, + "step": 5956 + }, + { + "epoch": 3.2205802847359886, + "grad_norm": 0.2950497269630432, + "learning_rate": 3.383271103268782e-06, + "loss": 0.3707, + "step": 5957 + }, + { + "epoch": 3.221120922688773, + "grad_norm": 0.27890312671661377, + "learning_rate": 3.3814846626372693e-06, + "loss": 0.3807, + "step": 5958 + }, + { + "epoch": 3.221661560641557, + "grad_norm": 0.3092116415500641, + "learning_rate": 3.379698452770739e-06, + "loss": 0.3523, + "step": 5959 + }, + { + "epoch": 3.222202198594341, + "grad_norm": 0.2828221619129181, + "learning_rate": 3.3779124739238657e-06, + "loss": 0.3718, + "step": 5960 + }, + { + "epoch": 3.2227428365471256, + "grad_norm": 0.2936552166938782, + "learning_rate": 3.376126726351292e-06, + "loss": 0.3725, + "step": 5961 + }, + { + "epoch": 3.22328347449991, + "grad_norm": 0.307039350271225, + "learning_rate": 3.3743412103076235e-06, + "loss": 0.3881, + "step": 5962 + }, + { + "epoch": 3.2238241124526943, + "grad_norm": 0.29379069805145264, + "learning_rate": 3.3725559260474378e-06, + "loss": 0.3699, + "step": 5963 + }, + { + "epoch": 3.2243647504054787, + "grad_norm": 0.30865761637687683, + "learning_rate": 3.3707708738252774e-06, + "loss": 0.3749, + "step": 5964 + }, + { + "epoch": 3.2249053883582626, + "grad_norm": 0.29781603813171387, + "learning_rate": 3.3689860538956547e-06, + "loss": 0.3577, + "step": 5965 + }, + { + "epoch": 3.225446026311047, + "grad_norm": 0.2828763723373413, + "learning_rate": 3.3672014665130404e-06, + "loss": 0.3856, + "step": 5966 + }, + { + "epoch": 3.2259866642638313, + "grad_norm": 0.29489558935165405, + "learning_rate": 3.3654171119318814e-06, + "loss": 0.3837, + "step": 5967 + }, + { + "epoch": 3.2265273022166157, + "grad_norm": 0.2914468050003052, + "learning_rate": 3.3636329904065863e-06, + "loss": 0.3527, + "step": 5968 + }, + { + "epoch": 3.2270679401694, + "grad_norm": 0.27329519391059875, + "learning_rate": 3.3618491021915334e-06, + "loss": 0.3659, + "step": 5969 + }, + { + "epoch": 3.2276085781221844, + "grad_norm": 0.29091718792915344, + "learning_rate": 3.3600654475410643e-06, + "loss": 0.3832, + "step": 5970 + }, + { + "epoch": 3.2281492160749683, + "grad_norm": 0.33960139751434326, + "learning_rate": 3.358282026709491e-06, + "loss": 0.3927, + "step": 5971 + }, + { + "epoch": 3.2286898540277527, + "grad_norm": 0.32778263092041016, + "learning_rate": 3.356498839951089e-06, + "loss": 0.3613, + "step": 5972 + }, + { + "epoch": 3.229230491980537, + "grad_norm": 0.285845011472702, + "learning_rate": 3.354715887520104e-06, + "loss": 0.3569, + "step": 5973 + }, + { + "epoch": 3.2297711299333214, + "grad_norm": 0.2771826386451721, + "learning_rate": 3.3529331696707434e-06, + "loss": 0.4028, + "step": 5974 + }, + { + "epoch": 3.2303117678861057, + "grad_norm": 0.32097023725509644, + "learning_rate": 3.351150686657185e-06, + "loss": 0.3813, + "step": 5975 + }, + { + "epoch": 3.23085240583889, + "grad_norm": 0.3273971676826477, + "learning_rate": 3.349368438733572e-06, + "loss": 0.3857, + "step": 5976 + }, + { + "epoch": 3.231393043791674, + "grad_norm": 0.28685319423675537, + "learning_rate": 3.347586426154017e-06, + "loss": 0.4034, + "step": 5977 + }, + { + "epoch": 3.2319336817444584, + "grad_norm": 0.2998667061328888, + "learning_rate": 3.3458046491725915e-06, + "loss": 0.4105, + "step": 5978 + }, + { + "epoch": 3.2324743196972427, + "grad_norm": 0.2772698402404785, + "learning_rate": 3.344023108043339e-06, + "loss": 0.377, + "step": 5979 + }, + { + "epoch": 3.233014957650027, + "grad_norm": 0.2977933883666992, + "learning_rate": 3.3422418030202696e-06, + "loss": 0.3806, + "step": 5980 + }, + { + "epoch": 3.2335555956028115, + "grad_norm": 0.311739057302475, + "learning_rate": 3.340460734357359e-06, + "loss": 0.3769, + "step": 5981 + }, + { + "epoch": 3.2340962335555954, + "grad_norm": 0.28269141912460327, + "learning_rate": 3.338679902308547e-06, + "loss": 0.3553, + "step": 5982 + }, + { + "epoch": 3.2346368715083798, + "grad_norm": 0.302408367395401, + "learning_rate": 3.3368993071277426e-06, + "loss": 0.4017, + "step": 5983 + }, + { + "epoch": 3.235177509461164, + "grad_norm": 0.3035907745361328, + "learning_rate": 3.33511894906882e-06, + "loss": 0.3826, + "step": 5984 + }, + { + "epoch": 3.2357181474139485, + "grad_norm": 0.2984257638454437, + "learning_rate": 3.3333388283856195e-06, + "loss": 0.3719, + "step": 5985 + }, + { + "epoch": 3.236258785366733, + "grad_norm": 0.28472962975502014, + "learning_rate": 3.331558945331946e-06, + "loss": 0.3894, + "step": 5986 + }, + { + "epoch": 3.236799423319517, + "grad_norm": 0.303316205739975, + "learning_rate": 3.329779300161573e-06, + "loss": 0.3767, + "step": 5987 + }, + { + "epoch": 3.237340061272301, + "grad_norm": 0.2893165051937103, + "learning_rate": 3.3279998931282388e-06, + "loss": 0.3614, + "step": 5988 + }, + { + "epoch": 3.2378806992250855, + "grad_norm": 0.32114818692207336, + "learning_rate": 3.326220724485651e-06, + "loss": 0.3804, + "step": 5989 + }, + { + "epoch": 3.23842133717787, + "grad_norm": 0.30687180161476135, + "learning_rate": 3.324441794487475e-06, + "loss": 0.3905, + "step": 5990 + }, + { + "epoch": 3.238961975130654, + "grad_norm": 0.303936630487442, + "learning_rate": 3.322663103387349e-06, + "loss": 0.3863, + "step": 5991 + }, + { + "epoch": 3.2395026130834386, + "grad_norm": 0.28427422046661377, + "learning_rate": 3.3208846514388776e-06, + "loss": 0.3948, + "step": 5992 + }, + { + "epoch": 3.240043251036223, + "grad_norm": 0.2725846767425537, + "learning_rate": 3.3191064388956306e-06, + "loss": 0.3514, + "step": 5993 + }, + { + "epoch": 3.240583888989007, + "grad_norm": 0.2954155504703522, + "learning_rate": 3.317328466011137e-06, + "loss": 0.3685, + "step": 5994 + }, + { + "epoch": 3.241124526941791, + "grad_norm": 0.3024853467941284, + "learning_rate": 3.3155507330389004e-06, + "loss": 0.3712, + "step": 5995 + }, + { + "epoch": 3.2416651648945756, + "grad_norm": 0.27961960434913635, + "learning_rate": 3.3137732402323863e-06, + "loss": 0.377, + "step": 5996 + }, + { + "epoch": 3.24220580284736, + "grad_norm": 0.2657700777053833, + "learning_rate": 3.3119959878450257e-06, + "loss": 0.3724, + "step": 5997 + }, + { + "epoch": 3.2427464408001443, + "grad_norm": 0.2811744511127472, + "learning_rate": 3.3102189761302185e-06, + "loss": 0.3878, + "step": 5998 + }, + { + "epoch": 3.2432870787529287, + "grad_norm": 0.3270666301250458, + "learning_rate": 3.3084422053413247e-06, + "loss": 0.3962, + "step": 5999 + }, + { + "epoch": 3.2438277167057126, + "grad_norm": 0.26997464895248413, + "learning_rate": 3.306665675731674e-06, + "loss": 0.3983, + "step": 6000 + }, + { + "epoch": 3.244368354658497, + "grad_norm": 0.283130943775177, + "learning_rate": 3.3048893875545606e-06, + "loss": 0.3924, + "step": 6001 + }, + { + "epoch": 3.2449089926112813, + "grad_norm": 0.29643136262893677, + "learning_rate": 3.3031133410632465e-06, + "loss": 0.3707, + "step": 6002 + }, + { + "epoch": 3.2454496305640657, + "grad_norm": 0.27885952591896057, + "learning_rate": 3.3013375365109547e-06, + "loss": 0.3666, + "step": 6003 + }, + { + "epoch": 3.24599026851685, + "grad_norm": 0.2765432298183441, + "learning_rate": 3.2995619741508765e-06, + "loss": 0.3773, + "step": 6004 + }, + { + "epoch": 3.2465309064696344, + "grad_norm": 0.2908070385456085, + "learning_rate": 3.297786654236169e-06, + "loss": 0.356, + "step": 6005 + }, + { + "epoch": 3.2470715444224183, + "grad_norm": 0.27661949396133423, + "learning_rate": 3.2960115770199563e-06, + "loss": 0.3748, + "step": 6006 + }, + { + "epoch": 3.2476121823752027, + "grad_norm": 0.31730109453201294, + "learning_rate": 3.294236742755322e-06, + "loss": 0.3699, + "step": 6007 + }, + { + "epoch": 3.248152820327987, + "grad_norm": 0.3065454363822937, + "learning_rate": 3.2924621516953195e-06, + "loss": 0.3852, + "step": 6008 + }, + { + "epoch": 3.2486934582807714, + "grad_norm": 0.29832738637924194, + "learning_rate": 3.2906878040929664e-06, + "loss": 0.3576, + "step": 6009 + }, + { + "epoch": 3.2492340962335557, + "grad_norm": 0.2941710948944092, + "learning_rate": 3.28891370020125e-06, + "loss": 0.356, + "step": 6010 + }, + { + "epoch": 3.2497747341863397, + "grad_norm": 0.29035940766334534, + "learning_rate": 3.2871398402731134e-06, + "loss": 0.3717, + "step": 6011 + }, + { + "epoch": 3.250315372139124, + "grad_norm": 0.3080737292766571, + "learning_rate": 3.285366224561474e-06, + "loss": 0.4003, + "step": 6012 + }, + { + "epoch": 3.2508560100919084, + "grad_norm": 0.28215858340263367, + "learning_rate": 3.2835928533192086e-06, + "loss": 0.3624, + "step": 6013 + }, + { + "epoch": 3.2513966480446927, + "grad_norm": 0.28796327114105225, + "learning_rate": 3.2818197267991636e-06, + "loss": 0.3644, + "step": 6014 + }, + { + "epoch": 3.251937285997477, + "grad_norm": 0.2933882772922516, + "learning_rate": 3.280046845254145e-06, + "loss": 0.3685, + "step": 6015 + }, + { + "epoch": 3.2524779239502615, + "grad_norm": 0.3135727643966675, + "learning_rate": 3.278274208936929e-06, + "loss": 0.3678, + "step": 6016 + }, + { + "epoch": 3.253018561903046, + "grad_norm": 0.31397953629493713, + "learning_rate": 3.276501818100255e-06, + "loss": 0.3591, + "step": 6017 + }, + { + "epoch": 3.2535591998558298, + "grad_norm": 0.30858469009399414, + "learning_rate": 3.274729672996829e-06, + "loss": 0.37, + "step": 6018 + }, + { + "epoch": 3.254099837808614, + "grad_norm": 0.308296263217926, + "learning_rate": 3.272957773879315e-06, + "loss": 0.3973, + "step": 6019 + }, + { + "epoch": 3.2546404757613985, + "grad_norm": 0.30975034832954407, + "learning_rate": 3.2711861210003503e-06, + "loss": 0.3685, + "step": 6020 + }, + { + "epoch": 3.255181113714183, + "grad_norm": 0.31152427196502686, + "learning_rate": 3.269414714612534e-06, + "loss": 0.3967, + "step": 6021 + }, + { + "epoch": 3.255721751666967, + "grad_norm": 0.27662646770477295, + "learning_rate": 3.267643554968433e-06, + "loss": 0.3754, + "step": 6022 + }, + { + "epoch": 3.256262389619751, + "grad_norm": 0.28363901376724243, + "learning_rate": 3.265872642320571e-06, + "loss": 0.3753, + "step": 6023 + }, + { + "epoch": 3.2568030275725355, + "grad_norm": 0.29461127519607544, + "learning_rate": 3.2641019769214433e-06, + "loss": 0.3595, + "step": 6024 + }, + { + "epoch": 3.25734366552532, + "grad_norm": 0.30085429549217224, + "learning_rate": 3.2623315590235076e-06, + "loss": 0.3663, + "step": 6025 + }, + { + "epoch": 3.257884303478104, + "grad_norm": 0.28633686900138855, + "learning_rate": 3.260561388879189e-06, + "loss": 0.3913, + "step": 6026 + }, + { + "epoch": 3.2584249414308886, + "grad_norm": 0.30552682280540466, + "learning_rate": 3.258791466740873e-06, + "loss": 0.3871, + "step": 6027 + }, + { + "epoch": 3.258965579383673, + "grad_norm": 0.2848190367221832, + "learning_rate": 3.2570217928609126e-06, + "loss": 0.4061, + "step": 6028 + }, + { + "epoch": 3.259506217336457, + "grad_norm": 0.28970232605934143, + "learning_rate": 3.255252367491625e-06, + "loss": 0.3773, + "step": 6029 + }, + { + "epoch": 3.260046855289241, + "grad_norm": 0.3319888114929199, + "learning_rate": 3.2534831908852914e-06, + "loss": 0.382, + "step": 6030 + }, + { + "epoch": 3.2605874932420256, + "grad_norm": 0.2861935496330261, + "learning_rate": 3.251714263294158e-06, + "loss": 0.371, + "step": 6031 + }, + { + "epoch": 3.26112813119481, + "grad_norm": 0.29303890466690063, + "learning_rate": 3.2499455849704344e-06, + "loss": 0.3969, + "step": 6032 + }, + { + "epoch": 3.2616687691475943, + "grad_norm": 0.2763252258300781, + "learning_rate": 3.2481771561662965e-06, + "loss": 0.3829, + "step": 6033 + }, + { + "epoch": 3.2622094071003787, + "grad_norm": 0.3113039433956146, + "learning_rate": 3.2464089771338856e-06, + "loss": 0.3892, + "step": 6034 + }, + { + "epoch": 3.2627500450531626, + "grad_norm": 0.281794935464859, + "learning_rate": 3.244641048125301e-06, + "loss": 0.3798, + "step": 6035 + }, + { + "epoch": 3.263290683005947, + "grad_norm": 0.28851622343063354, + "learning_rate": 3.242873369392613e-06, + "loss": 0.3862, + "step": 6036 + }, + { + "epoch": 3.2638313209587313, + "grad_norm": 0.27319225668907166, + "learning_rate": 3.241105941187854e-06, + "loss": 0.3702, + "step": 6037 + }, + { + "epoch": 3.2643719589115157, + "grad_norm": 0.29888665676116943, + "learning_rate": 3.2393387637630223e-06, + "loss": 0.3803, + "step": 6038 + }, + { + "epoch": 3.2649125968643, + "grad_norm": 0.2770290672779083, + "learning_rate": 3.237571837370076e-06, + "loss": 0.363, + "step": 6039 + }, + { + "epoch": 3.265453234817084, + "grad_norm": 0.28202933073043823, + "learning_rate": 3.235805162260942e-06, + "loss": 0.3668, + "step": 6040 + }, + { + "epoch": 3.2659938727698683, + "grad_norm": 0.29129648208618164, + "learning_rate": 3.2340387386875095e-06, + "loss": 0.384, + "step": 6041 + }, + { + "epoch": 3.2665345107226527, + "grad_norm": 0.27913179993629456, + "learning_rate": 3.232272566901632e-06, + "loss": 0.373, + "step": 6042 + }, + { + "epoch": 3.267075148675437, + "grad_norm": 0.3138071298599243, + "learning_rate": 3.230506647155126e-06, + "loss": 0.39, + "step": 6043 + }, + { + "epoch": 3.2676157866282214, + "grad_norm": 0.2757951319217682, + "learning_rate": 3.228740979699774e-06, + "loss": 0.3786, + "step": 6044 + }, + { + "epoch": 3.2681564245810057, + "grad_norm": 0.31086909770965576, + "learning_rate": 3.226975564787322e-06, + "loss": 0.3838, + "step": 6045 + }, + { + "epoch": 3.26869706253379, + "grad_norm": 0.29600006341934204, + "learning_rate": 3.2252104026694807e-06, + "loss": 0.3657, + "step": 6046 + }, + { + "epoch": 3.269237700486574, + "grad_norm": 0.28875118494033813, + "learning_rate": 3.223445493597921e-06, + "loss": 0.3905, + "step": 6047 + }, + { + "epoch": 3.2697783384393584, + "grad_norm": 0.28248870372772217, + "learning_rate": 3.2216808378242802e-06, + "loss": 0.3522, + "step": 6048 + }, + { + "epoch": 3.2703189763921428, + "grad_norm": 0.2825542390346527, + "learning_rate": 3.219916435600162e-06, + "loss": 0.3729, + "step": 6049 + }, + { + "epoch": 3.270859614344927, + "grad_norm": 0.2938670814037323, + "learning_rate": 3.218152287177133e-06, + "loss": 0.3855, + "step": 6050 + }, + { + "epoch": 3.2714002522977115, + "grad_norm": 0.28813305497169495, + "learning_rate": 3.216388392806719e-06, + "loss": 0.3824, + "step": 6051 + }, + { + "epoch": 3.2719408902504954, + "grad_norm": 0.26874107122421265, + "learning_rate": 3.214624752740413e-06, + "loss": 0.3724, + "step": 6052 + }, + { + "epoch": 3.2724815282032798, + "grad_norm": 0.2823548913002014, + "learning_rate": 3.2128613672296737e-06, + "loss": 0.378, + "step": 6053 + }, + { + "epoch": 3.273022166156064, + "grad_norm": 0.2911010682582855, + "learning_rate": 3.2110982365259206e-06, + "loss": 0.3757, + "step": 6054 + }, + { + "epoch": 3.2735628041088485, + "grad_norm": 0.28966856002807617, + "learning_rate": 3.2093353608805368e-06, + "loss": 0.3711, + "step": 6055 + }, + { + "epoch": 3.274103442061633, + "grad_norm": 0.28305694460868835, + "learning_rate": 3.2075727405448707e-06, + "loss": 0.3608, + "step": 6056 + }, + { + "epoch": 3.274644080014417, + "grad_norm": 0.2980901300907135, + "learning_rate": 3.205810375770233e-06, + "loss": 0.3692, + "step": 6057 + }, + { + "epoch": 3.275184717967201, + "grad_norm": 0.3049103319644928, + "learning_rate": 3.204048266807901e-06, + "loss": 0.3812, + "step": 6058 + }, + { + "epoch": 3.2757253559199855, + "grad_norm": 0.279361367225647, + "learning_rate": 3.202286413909108e-06, + "loss": 0.4078, + "step": 6059 + }, + { + "epoch": 3.27626599387277, + "grad_norm": 0.2576105296611786, + "learning_rate": 3.2005248173250593e-06, + "loss": 0.3704, + "step": 6060 + }, + { + "epoch": 3.276806631825554, + "grad_norm": 0.29323723912239075, + "learning_rate": 3.19876347730692e-06, + "loss": 0.3791, + "step": 6061 + }, + { + "epoch": 3.2773472697783386, + "grad_norm": 0.28759148716926575, + "learning_rate": 3.197002394105818e-06, + "loss": 0.3611, + "step": 6062 + }, + { + "epoch": 3.277887907731123, + "grad_norm": 0.2939707040786743, + "learning_rate": 3.195241567972848e-06, + "loss": 0.392, + "step": 6063 + }, + { + "epoch": 3.278428545683907, + "grad_norm": 0.2800416052341461, + "learning_rate": 3.19348099915906e-06, + "loss": 0.3885, + "step": 6064 + }, + { + "epoch": 3.278969183636691, + "grad_norm": 0.2766898274421692, + "learning_rate": 3.1917206879154762e-06, + "loss": 0.3834, + "step": 6065 + }, + { + "epoch": 3.2795098215894756, + "grad_norm": 0.28272634744644165, + "learning_rate": 3.189960634493078e-06, + "loss": 0.3836, + "step": 6066 + }, + { + "epoch": 3.28005045954226, + "grad_norm": 0.2898666560649872, + "learning_rate": 3.1882008391428123e-06, + "loss": 0.3881, + "step": 6067 + }, + { + "epoch": 3.2805910974950443, + "grad_norm": 0.2773001790046692, + "learning_rate": 3.1864413021155842e-06, + "loss": 0.3578, + "step": 6068 + }, + { + "epoch": 3.281131735447828, + "grad_norm": 0.28953588008880615, + "learning_rate": 3.184682023662268e-06, + "loss": 0.3784, + "step": 6069 + }, + { + "epoch": 3.2816723734006126, + "grad_norm": 0.311117023229599, + "learning_rate": 3.1829230040336967e-06, + "loss": 0.3625, + "step": 6070 + }, + { + "epoch": 3.282213011353397, + "grad_norm": 0.272461473941803, + "learning_rate": 3.18116424348067e-06, + "loss": 0.3865, + "step": 6071 + }, + { + "epoch": 3.2827536493061813, + "grad_norm": 0.2678007185459137, + "learning_rate": 3.179405742253947e-06, + "loss": 0.3823, + "step": 6072 + }, + { + "epoch": 3.2832942872589657, + "grad_norm": 0.2916713356971741, + "learning_rate": 3.177647500604252e-06, + "loss": 0.3852, + "step": 6073 + }, + { + "epoch": 3.28383492521175, + "grad_norm": 0.2645060420036316, + "learning_rate": 3.1758895187822725e-06, + "loss": 0.3974, + "step": 6074 + }, + { + "epoch": 3.2843755631645344, + "grad_norm": 0.2962478697299957, + "learning_rate": 3.1741317970386597e-06, + "loss": 0.3865, + "step": 6075 + }, + { + "epoch": 3.2849162011173183, + "grad_norm": 0.272658109664917, + "learning_rate": 3.1723743356240232e-06, + "loss": 0.3695, + "step": 6076 + }, + { + "epoch": 3.2854568390701027, + "grad_norm": 0.28344181180000305, + "learning_rate": 3.170617134788939e-06, + "loss": 0.3651, + "step": 6077 + }, + { + "epoch": 3.285997477022887, + "grad_norm": 0.29079508781433105, + "learning_rate": 3.1688601947839477e-06, + "loss": 0.396, + "step": 6078 + }, + { + "epoch": 3.2865381149756714, + "grad_norm": 0.27090904116630554, + "learning_rate": 3.167103515859552e-06, + "loss": 0.3734, + "step": 6079 + }, + { + "epoch": 3.2870787529284557, + "grad_norm": 0.2851702570915222, + "learning_rate": 3.1653470982662114e-06, + "loss": 0.3583, + "step": 6080 + }, + { + "epoch": 3.2876193908812397, + "grad_norm": 0.2843466103076935, + "learning_rate": 3.1635909422543556e-06, + "loss": 0.3719, + "step": 6081 + }, + { + "epoch": 3.288160028834024, + "grad_norm": 0.3082217574119568, + "learning_rate": 3.1618350480743733e-06, + "loss": 0.3906, + "step": 6082 + }, + { + "epoch": 3.2887006667868084, + "grad_norm": 0.286344051361084, + "learning_rate": 3.1600794159766184e-06, + "loss": 0.3894, + "step": 6083 + }, + { + "epoch": 3.2892413047395928, + "grad_norm": 0.2779954671859741, + "learning_rate": 3.158324046211403e-06, + "loss": 0.3602, + "step": 6084 + }, + { + "epoch": 3.289781942692377, + "grad_norm": 0.31051650643348694, + "learning_rate": 3.1565689390290067e-06, + "loss": 0.3653, + "step": 6085 + }, + { + "epoch": 3.2903225806451615, + "grad_norm": 0.2746830880641937, + "learning_rate": 3.154814094679668e-06, + "loss": 0.3704, + "step": 6086 + }, + { + "epoch": 3.2908632185979454, + "grad_norm": 0.2721113860607147, + "learning_rate": 3.153059513413591e-06, + "loss": 0.3835, + "step": 6087 + }, + { + "epoch": 3.2914038565507298, + "grad_norm": 0.2715761363506317, + "learning_rate": 3.151305195480939e-06, + "loss": 0.3557, + "step": 6088 + }, + { + "epoch": 3.291944494503514, + "grad_norm": 0.2994410991668701, + "learning_rate": 3.1495511411318402e-06, + "loss": 0.3719, + "step": 6089 + }, + { + "epoch": 3.2924851324562985, + "grad_norm": 0.28619715571403503, + "learning_rate": 3.147797350616385e-06, + "loss": 0.3666, + "step": 6090 + }, + { + "epoch": 3.293025770409083, + "grad_norm": 0.2863656282424927, + "learning_rate": 3.146043824184627e-06, + "loss": 0.3739, + "step": 6091 + }, + { + "epoch": 3.293566408361867, + "grad_norm": 0.2747470736503601, + "learning_rate": 3.1442905620865773e-06, + "loss": 0.3659, + "step": 6092 + }, + { + "epoch": 3.294107046314651, + "grad_norm": 0.2745727002620697, + "learning_rate": 3.1425375645722147e-06, + "loss": 0.3479, + "step": 6093 + }, + { + "epoch": 3.2946476842674355, + "grad_norm": 0.29741722345352173, + "learning_rate": 3.140784831891478e-06, + "loss": 0.3618, + "step": 6094 + }, + { + "epoch": 3.29518832222022, + "grad_norm": 0.3148142695426941, + "learning_rate": 3.139032364294271e-06, + "loss": 0.3738, + "step": 6095 + }, + { + "epoch": 3.295728960173004, + "grad_norm": 0.2951778173446655, + "learning_rate": 3.1372801620304532e-06, + "loss": 0.3362, + "step": 6096 + }, + { + "epoch": 3.2962695981257886, + "grad_norm": 0.2876696288585663, + "learning_rate": 3.135528225349853e-06, + "loss": 0.3621, + "step": 6097 + }, + { + "epoch": 3.2968102360785725, + "grad_norm": 0.29712897539138794, + "learning_rate": 3.133776554502258e-06, + "loss": 0.3946, + "step": 6098 + }, + { + "epoch": 3.297350874031357, + "grad_norm": 0.29079294204711914, + "learning_rate": 3.1320251497374187e-06, + "loss": 0.371, + "step": 6099 + }, + { + "epoch": 3.297891511984141, + "grad_norm": 0.2989571988582611, + "learning_rate": 3.130274011305047e-06, + "loss": 0.3702, + "step": 6100 + }, + { + "epoch": 3.2984321499369256, + "grad_norm": 0.2969949245452881, + "learning_rate": 3.1285231394548156e-06, + "loss": 0.3692, + "step": 6101 + }, + { + "epoch": 3.29897278788971, + "grad_norm": 0.28161007165908813, + "learning_rate": 3.126772534436362e-06, + "loss": 0.3687, + "step": 6102 + }, + { + "epoch": 3.2995134258424943, + "grad_norm": 0.29006463289260864, + "learning_rate": 3.1250221964992855e-06, + "loss": 0.3796, + "step": 6103 + }, + { + "epoch": 3.3000540637952787, + "grad_norm": 0.30350908637046814, + "learning_rate": 3.123272125893143e-06, + "loss": 0.3919, + "step": 6104 + }, + { + "epoch": 3.3005947017480626, + "grad_norm": 0.28305956721305847, + "learning_rate": 3.1215223228674587e-06, + "loss": 0.3848, + "step": 6105 + }, + { + "epoch": 3.301135339700847, + "grad_norm": 0.2784910202026367, + "learning_rate": 3.1197727876717143e-06, + "loss": 0.3696, + "step": 6106 + }, + { + "epoch": 3.3016759776536313, + "grad_norm": 0.2670733332633972, + "learning_rate": 3.11802352055536e-06, + "loss": 0.3553, + "step": 6107 + }, + { + "epoch": 3.3022166156064157, + "grad_norm": 0.2794773280620575, + "learning_rate": 3.1162745217677976e-06, + "loss": 0.3848, + "step": 6108 + }, + { + "epoch": 3.3027572535592, + "grad_norm": 0.2845619022846222, + "learning_rate": 3.114525791558398e-06, + "loss": 0.3938, + "step": 6109 + }, + { + "epoch": 3.303297891511984, + "grad_norm": 0.271085649728775, + "learning_rate": 3.1127773301764935e-06, + "loss": 0.3858, + "step": 6110 + }, + { + "epoch": 3.3038385294647683, + "grad_norm": 0.284370481967926, + "learning_rate": 3.1110291378713763e-06, + "loss": 0.3717, + "step": 6111 + }, + { + "epoch": 3.3043791674175527, + "grad_norm": 0.255216121673584, + "learning_rate": 3.109281214892298e-06, + "loss": 0.4005, + "step": 6112 + }, + { + "epoch": 3.304919805370337, + "grad_norm": 0.2828678488731384, + "learning_rate": 3.1075335614884767e-06, + "loss": 0.3606, + "step": 6113 + }, + { + "epoch": 3.3054604433231214, + "grad_norm": 0.27755460143089294, + "learning_rate": 3.105786177909088e-06, + "loss": 0.366, + "step": 6114 + }, + { + "epoch": 3.3060010812759058, + "grad_norm": 0.2801077365875244, + "learning_rate": 3.1040390644032746e-06, + "loss": 0.3757, + "step": 6115 + }, + { + "epoch": 3.3065417192286897, + "grad_norm": 0.2761895954608917, + "learning_rate": 3.1022922212201307e-06, + "loss": 0.3744, + "step": 6116 + }, + { + "epoch": 3.307082357181474, + "grad_norm": 0.27318400144577026, + "learning_rate": 3.1005456486087217e-06, + "loss": 0.3691, + "step": 6117 + }, + { + "epoch": 3.3076229951342584, + "grad_norm": 0.30086633563041687, + "learning_rate": 3.0987993468180706e-06, + "loss": 0.3989, + "step": 6118 + }, + { + "epoch": 3.3081636330870428, + "grad_norm": 0.34112340211868286, + "learning_rate": 3.097053316097163e-06, + "loss": 0.4067, + "step": 6119 + }, + { + "epoch": 3.308704271039827, + "grad_norm": 0.34706053137779236, + "learning_rate": 3.095307556694942e-06, + "loss": 0.3729, + "step": 6120 + }, + { + "epoch": 3.3092449089926115, + "grad_norm": 0.27965566515922546, + "learning_rate": 3.0935620688603156e-06, + "loss": 0.3688, + "step": 6121 + }, + { + "epoch": 3.3097855469453954, + "grad_norm": 0.2663114666938782, + "learning_rate": 3.091816852842153e-06, + "loss": 0.3584, + "step": 6122 + }, + { + "epoch": 3.3103261848981798, + "grad_norm": 0.2900125980377197, + "learning_rate": 3.090071908889285e-06, + "loss": 0.395, + "step": 6123 + }, + { + "epoch": 3.310866822850964, + "grad_norm": 0.2807668447494507, + "learning_rate": 3.0883272372505004e-06, + "loss": 0.3697, + "step": 6124 + }, + { + "epoch": 3.3114074608037485, + "grad_norm": 0.28236493468284607, + "learning_rate": 3.0865828381745515e-06, + "loss": 0.3662, + "step": 6125 + }, + { + "epoch": 3.311948098756533, + "grad_norm": 0.29553312063217163, + "learning_rate": 3.084838711910153e-06, + "loss": 0.378, + "step": 6126 + }, + { + "epoch": 3.3124887367093168, + "grad_norm": 0.28542473912239075, + "learning_rate": 3.083094858705978e-06, + "loss": 0.3646, + "step": 6127 + }, + { + "epoch": 3.313029374662101, + "grad_norm": 0.28612953424453735, + "learning_rate": 3.081351278810664e-06, + "loss": 0.4114, + "step": 6128 + }, + { + "epoch": 3.3135700126148855, + "grad_norm": 0.289182186126709, + "learning_rate": 3.0796079724728047e-06, + "loss": 0.3918, + "step": 6129 + }, + { + "epoch": 3.31411065056767, + "grad_norm": 0.2852880656719208, + "learning_rate": 3.077864939940959e-06, + "loss": 0.4152, + "step": 6130 + }, + { + "epoch": 3.314651288520454, + "grad_norm": 0.2754029333591461, + "learning_rate": 3.076122181463644e-06, + "loss": 0.3622, + "step": 6131 + }, + { + "epoch": 3.3151919264732386, + "grad_norm": 0.29560017585754395, + "learning_rate": 3.0743796972893436e-06, + "loss": 0.4024, + "step": 6132 + }, + { + "epoch": 3.315732564426023, + "grad_norm": 0.28034523129463196, + "learning_rate": 3.0726374876664923e-06, + "loss": 0.3622, + "step": 6133 + }, + { + "epoch": 3.316273202378807, + "grad_norm": 0.2680700123310089, + "learning_rate": 3.0708955528434933e-06, + "loss": 0.3837, + "step": 6134 + }, + { + "epoch": 3.316813840331591, + "grad_norm": 0.2749667167663574, + "learning_rate": 3.0691538930687076e-06, + "loss": 0.3667, + "step": 6135 + }, + { + "epoch": 3.3173544782843756, + "grad_norm": 0.2794339954853058, + "learning_rate": 3.0674125085904617e-06, + "loss": 0.3869, + "step": 6136 + }, + { + "epoch": 3.31789511623716, + "grad_norm": 0.3205060064792633, + "learning_rate": 3.065671399657035e-06, + "loss": 0.3634, + "step": 6137 + }, + { + "epoch": 3.3184357541899443, + "grad_norm": 0.2961784899234772, + "learning_rate": 3.0639305665166724e-06, + "loss": 0.3605, + "step": 6138 + }, + { + "epoch": 3.318976392142728, + "grad_norm": 0.31420665979385376, + "learning_rate": 3.0621900094175794e-06, + "loss": 0.3919, + "step": 6139 + }, + { + "epoch": 3.3195170300955126, + "grad_norm": 0.27574458718299866, + "learning_rate": 3.0604497286079227e-06, + "loss": 0.3796, + "step": 6140 + }, + { + "epoch": 3.320057668048297, + "grad_norm": 0.26589855551719666, + "learning_rate": 3.0587097243358254e-06, + "loss": 0.3874, + "step": 6141 + }, + { + "epoch": 3.3205983060010813, + "grad_norm": 0.2985301911830902, + "learning_rate": 3.0569699968493764e-06, + "loss": 0.3748, + "step": 6142 + }, + { + "epoch": 3.3211389439538657, + "grad_norm": 0.27616506814956665, + "learning_rate": 3.0552305463966224e-06, + "loss": 0.3767, + "step": 6143 + }, + { + "epoch": 3.32167958190665, + "grad_norm": 0.2935973107814789, + "learning_rate": 3.053491373225573e-06, + "loss": 0.3864, + "step": 6144 + }, + { + "epoch": 3.322220219859434, + "grad_norm": 0.2736741006374359, + "learning_rate": 3.051752477584191e-06, + "loss": 0.3912, + "step": 6145 + }, + { + "epoch": 3.3227608578122183, + "grad_norm": 0.27544134855270386, + "learning_rate": 3.05001385972041e-06, + "loss": 0.3766, + "step": 6146 + }, + { + "epoch": 3.3233014957650027, + "grad_norm": 0.25983256101608276, + "learning_rate": 3.048275519882116e-06, + "loss": 0.3574, + "step": 6147 + }, + { + "epoch": 3.323842133717787, + "grad_norm": 0.2896862030029297, + "learning_rate": 3.0465374583171627e-06, + "loss": 0.394, + "step": 6148 + }, + { + "epoch": 3.3243827716705714, + "grad_norm": 0.2907646596431732, + "learning_rate": 3.0447996752733543e-06, + "loss": 0.3759, + "step": 6149 + }, + { + "epoch": 3.3249234096233558, + "grad_norm": 0.28527727723121643, + "learning_rate": 3.043062170998464e-06, + "loss": 0.3691, + "step": 6150 + }, + { + "epoch": 3.3254640475761397, + "grad_norm": 0.28328537940979004, + "learning_rate": 3.0413249457402206e-06, + "loss": 0.38, + "step": 6151 + }, + { + "epoch": 3.326004685528924, + "grad_norm": 0.2843831777572632, + "learning_rate": 3.0395879997463164e-06, + "loss": 0.3842, + "step": 6152 + }, + { + "epoch": 3.3265453234817084, + "grad_norm": 0.2758019268512726, + "learning_rate": 3.037851333264399e-06, + "loss": 0.3848, + "step": 6153 + }, + { + "epoch": 3.3270859614344928, + "grad_norm": 0.27419313788414, + "learning_rate": 3.0361149465420814e-06, + "loss": 0.3772, + "step": 6154 + }, + { + "epoch": 3.327626599387277, + "grad_norm": 0.2987149655818939, + "learning_rate": 3.0343788398269342e-06, + "loss": 0.3724, + "step": 6155 + }, + { + "epoch": 3.328167237340061, + "grad_norm": 0.2917717397212982, + "learning_rate": 3.0326430133664888e-06, + "loss": 0.3539, + "step": 6156 + }, + { + "epoch": 3.3287078752928454, + "grad_norm": 0.28162145614624023, + "learning_rate": 3.030907467408235e-06, + "loss": 0.3897, + "step": 6157 + }, + { + "epoch": 3.3292485132456298, + "grad_norm": 0.27886053919792175, + "learning_rate": 3.029172202199624e-06, + "loss": 0.3509, + "step": 6158 + }, + { + "epoch": 3.329789151198414, + "grad_norm": 0.3052806854248047, + "learning_rate": 3.0274372179880667e-06, + "loss": 0.3906, + "step": 6159 + }, + { + "epoch": 3.3303297891511985, + "grad_norm": 0.2795872390270233, + "learning_rate": 3.025702515020937e-06, + "loss": 0.3521, + "step": 6160 + }, + { + "epoch": 3.330870427103983, + "grad_norm": 0.2896803021430969, + "learning_rate": 3.0239680935455607e-06, + "loss": 0.3932, + "step": 6161 + }, + { + "epoch": 3.331411065056767, + "grad_norm": 0.28419041633605957, + "learning_rate": 3.0222339538092306e-06, + "loss": 0.3749, + "step": 6162 + }, + { + "epoch": 3.331951703009551, + "grad_norm": 0.2858562767505646, + "learning_rate": 3.020500096059198e-06, + "loss": 0.3701, + "step": 6163 + }, + { + "epoch": 3.3324923409623355, + "grad_norm": 0.2957281768321991, + "learning_rate": 3.018766520542673e-06, + "loss": 0.3824, + "step": 6164 + }, + { + "epoch": 3.33303297891512, + "grad_norm": 0.30355215072631836, + "learning_rate": 3.0170332275068247e-06, + "loss": 0.4036, + "step": 6165 + }, + { + "epoch": 3.333573616867904, + "grad_norm": 0.3126343786716461, + "learning_rate": 3.015300217198784e-06, + "loss": 0.4117, + "step": 6166 + }, + { + "epoch": 3.3341142548206886, + "grad_norm": 0.2678101360797882, + "learning_rate": 3.0135674898656392e-06, + "loss": 0.3734, + "step": 6167 + }, + { + "epoch": 3.3346548927734725, + "grad_norm": 0.28154507279396057, + "learning_rate": 3.011835045754441e-06, + "loss": 0.3974, + "step": 6168 + }, + { + "epoch": 3.335195530726257, + "grad_norm": 0.26780441403388977, + "learning_rate": 3.0101028851121963e-06, + "loss": 0.3576, + "step": 6169 + }, + { + "epoch": 3.335736168679041, + "grad_norm": 0.28477615118026733, + "learning_rate": 3.0083710081858748e-06, + "loss": 0.3941, + "step": 6170 + }, + { + "epoch": 3.3362768066318256, + "grad_norm": 0.2977590560913086, + "learning_rate": 3.0066394152224034e-06, + "loss": 0.3896, + "step": 6171 + }, + { + "epoch": 3.33681744458461, + "grad_norm": 0.2784702181816101, + "learning_rate": 3.004908106468672e-06, + "loss": 0.396, + "step": 6172 + }, + { + "epoch": 3.3373580825373943, + "grad_norm": 0.2852799892425537, + "learning_rate": 3.0031770821715233e-06, + "loss": 0.3602, + "step": 6173 + }, + { + "epoch": 3.337898720490178, + "grad_norm": 0.2881910502910614, + "learning_rate": 3.001446342577765e-06, + "loss": 0.3507, + "step": 6174 + }, + { + "epoch": 3.3384393584429626, + "grad_norm": 0.305225670337677, + "learning_rate": 2.9997158879341647e-06, + "loss": 0.3928, + "step": 6175 + }, + { + "epoch": 3.338979996395747, + "grad_norm": 0.3159213066101074, + "learning_rate": 2.9979857184874484e-06, + "loss": 0.3838, + "step": 6176 + }, + { + "epoch": 3.3395206343485313, + "grad_norm": 0.28928208351135254, + "learning_rate": 2.9962558344842963e-06, + "loss": 0.3992, + "step": 6177 + }, + { + "epoch": 3.3400612723013157, + "grad_norm": 0.2776452302932739, + "learning_rate": 2.9945262361713545e-06, + "loss": 0.3805, + "step": 6178 + }, + { + "epoch": 3.3406019102541, + "grad_norm": 0.2972255051136017, + "learning_rate": 2.9927969237952254e-06, + "loss": 0.3702, + "step": 6179 + }, + { + "epoch": 3.341142548206884, + "grad_norm": 0.2896145284175873, + "learning_rate": 2.9910678976024733e-06, + "loss": 0.3675, + "step": 6180 + }, + { + "epoch": 3.3416831861596683, + "grad_norm": 0.3609926402568817, + "learning_rate": 2.989339157839616e-06, + "loss": 0.4042, + "step": 6181 + }, + { + "epoch": 3.3422238241124527, + "grad_norm": 0.28718194365501404, + "learning_rate": 2.9876107047531367e-06, + "loss": 0.367, + "step": 6182 + }, + { + "epoch": 3.342764462065237, + "grad_norm": 0.2966802716255188, + "learning_rate": 2.985882538589474e-06, + "loss": 0.3647, + "step": 6183 + }, + { + "epoch": 3.3433051000180214, + "grad_norm": 0.29517579078674316, + "learning_rate": 2.984154659595028e-06, + "loss": 0.369, + "step": 6184 + }, + { + "epoch": 3.3438457379708053, + "grad_norm": 0.27919119596481323, + "learning_rate": 2.982427068016155e-06, + "loss": 0.3851, + "step": 6185 + }, + { + "epoch": 3.3443863759235897, + "grad_norm": 0.2870360314846039, + "learning_rate": 2.9806997640991733e-06, + "loss": 0.3736, + "step": 6186 + }, + { + "epoch": 3.344927013876374, + "grad_norm": 0.2866764962673187, + "learning_rate": 2.9789727480903564e-06, + "loss": 0.358, + "step": 6187 + }, + { + "epoch": 3.3454676518291584, + "grad_norm": 0.3390553891658783, + "learning_rate": 2.9772460202359437e-06, + "loss": 0.3928, + "step": 6188 + }, + { + "epoch": 3.3460082897819428, + "grad_norm": 0.3193557560443878, + "learning_rate": 2.9755195807821236e-06, + "loss": 0.3761, + "step": 6189 + }, + { + "epoch": 3.346548927734727, + "grad_norm": 0.2649048864841461, + "learning_rate": 2.9737934299750514e-06, + "loss": 0.3747, + "step": 6190 + }, + { + "epoch": 3.3470895656875115, + "grad_norm": 0.2913542687892914, + "learning_rate": 2.972067568060838e-06, + "loss": 0.3988, + "step": 6191 + }, + { + "epoch": 3.3476302036402954, + "grad_norm": 0.27233168482780457, + "learning_rate": 2.970341995285553e-06, + "loss": 0.3664, + "step": 6192 + }, + { + "epoch": 3.3481708415930798, + "grad_norm": 0.321533739566803, + "learning_rate": 2.968616711895229e-06, + "loss": 0.3597, + "step": 6193 + }, + { + "epoch": 3.348711479545864, + "grad_norm": 0.27270835638046265, + "learning_rate": 2.96689171813585e-06, + "loss": 0.353, + "step": 6194 + }, + { + "epoch": 3.3492521174986485, + "grad_norm": 0.29473403096199036, + "learning_rate": 2.965167014253363e-06, + "loss": 0.371, + "step": 6195 + }, + { + "epoch": 3.349792755451433, + "grad_norm": 0.27694326639175415, + "learning_rate": 2.9634426004936735e-06, + "loss": 0.3718, + "step": 6196 + }, + { + "epoch": 3.3503333934042168, + "grad_norm": 0.2827945649623871, + "learning_rate": 2.9617184771026464e-06, + "loss": 0.4004, + "step": 6197 + }, + { + "epoch": 3.350874031357001, + "grad_norm": 0.27870264649391174, + "learning_rate": 2.959994644326103e-06, + "loss": 0.3717, + "step": 6198 + }, + { + "epoch": 3.3514146693097855, + "grad_norm": 0.2858085036277771, + "learning_rate": 2.958271102409823e-06, + "loss": 0.3873, + "step": 6199 + }, + { + "epoch": 3.35195530726257, + "grad_norm": 0.2923094630241394, + "learning_rate": 2.956547851599548e-06, + "loss": 0.3943, + "step": 6200 + }, + { + "epoch": 3.352495945215354, + "grad_norm": 0.30211901664733887, + "learning_rate": 2.954824892140978e-06, + "loss": 0.3929, + "step": 6201 + }, + { + "epoch": 3.3530365831681386, + "grad_norm": 0.2655908763408661, + "learning_rate": 2.9531022242797646e-06, + "loss": 0.3643, + "step": 6202 + }, + { + "epoch": 3.353577221120923, + "grad_norm": 0.2907673418521881, + "learning_rate": 2.951379848261523e-06, + "loss": 0.3825, + "step": 6203 + }, + { + "epoch": 3.354117859073707, + "grad_norm": 0.28303998708724976, + "learning_rate": 2.9496577643318302e-06, + "loss": 0.3639, + "step": 6204 + }, + { + "epoch": 3.354658497026491, + "grad_norm": 0.30183348059654236, + "learning_rate": 2.947935972736217e-06, + "loss": 0.3621, + "step": 6205 + }, + { + "epoch": 3.3551991349792756, + "grad_norm": 0.26821714639663696, + "learning_rate": 2.946214473720171e-06, + "loss": 0.3786, + "step": 6206 + }, + { + "epoch": 3.35573977293206, + "grad_norm": 0.29525643587112427, + "learning_rate": 2.944493267529141e-06, + "loss": 0.3998, + "step": 6207 + }, + { + "epoch": 3.3562804108848443, + "grad_norm": 0.27004489302635193, + "learning_rate": 2.942772354408534e-06, + "loss": 0.3555, + "step": 6208 + }, + { + "epoch": 3.356821048837628, + "grad_norm": 0.2798943519592285, + "learning_rate": 2.941051734603716e-06, + "loss": 0.3649, + "step": 6209 + }, + { + "epoch": 3.3573616867904126, + "grad_norm": 0.28168627619743347, + "learning_rate": 2.9393314083600076e-06, + "loss": 0.3624, + "step": 6210 + }, + { + "epoch": 3.357902324743197, + "grad_norm": 0.30059489607810974, + "learning_rate": 2.9376113759226903e-06, + "loss": 0.395, + "step": 6211 + }, + { + "epoch": 3.3584429626959813, + "grad_norm": 0.28024792671203613, + "learning_rate": 2.935891637537004e-06, + "loss": 0.3746, + "step": 6212 + }, + { + "epoch": 3.3589836006487657, + "grad_norm": 0.27950024604797363, + "learning_rate": 2.934172193448147e-06, + "loss": 0.3576, + "step": 6213 + }, + { + "epoch": 3.3595242386015496, + "grad_norm": 0.28534096479415894, + "learning_rate": 2.932453043901271e-06, + "loss": 0.3775, + "step": 6214 + }, + { + "epoch": 3.360064876554334, + "grad_norm": 0.294626921415329, + "learning_rate": 2.930734189141492e-06, + "loss": 0.3291, + "step": 6215 + }, + { + "epoch": 3.3606055145071183, + "grad_norm": 0.30831092596054077, + "learning_rate": 2.9290156294138807e-06, + "loss": 0.398, + "step": 6216 + }, + { + "epoch": 3.3611461524599027, + "grad_norm": 0.31744620203971863, + "learning_rate": 2.927297364963468e-06, + "loss": 0.3538, + "step": 6217 + }, + { + "epoch": 3.361686790412687, + "grad_norm": 0.3032221794128418, + "learning_rate": 2.9255793960352364e-06, + "loss": 0.3664, + "step": 6218 + }, + { + "epoch": 3.3622274283654714, + "grad_norm": 0.29594576358795166, + "learning_rate": 2.923861722874134e-06, + "loss": 0.3809, + "step": 6219 + }, + { + "epoch": 3.3627680663182558, + "grad_norm": 0.28834328055381775, + "learning_rate": 2.922144345725062e-06, + "loss": 0.3772, + "step": 6220 + }, + { + "epoch": 3.3633087042710397, + "grad_norm": 0.3238198757171631, + "learning_rate": 2.9204272648328835e-06, + "loss": 0.383, + "step": 6221 + }, + { + "epoch": 3.363849342223824, + "grad_norm": 0.27415868639945984, + "learning_rate": 2.9187104804424138e-06, + "loss": 0.3743, + "step": 6222 + }, + { + "epoch": 3.3643899801766084, + "grad_norm": 0.2742730379104614, + "learning_rate": 2.9169939927984293e-06, + "loss": 0.391, + "step": 6223 + }, + { + "epoch": 3.3649306181293928, + "grad_norm": 0.2784969210624695, + "learning_rate": 2.915277802145667e-06, + "loss": 0.3785, + "step": 6224 + }, + { + "epoch": 3.365471256082177, + "grad_norm": 0.30221015214920044, + "learning_rate": 2.9135619087288153e-06, + "loss": 0.393, + "step": 6225 + }, + { + "epoch": 3.366011894034961, + "grad_norm": 0.2925558090209961, + "learning_rate": 2.9118463127925235e-06, + "loss": 0.3772, + "step": 6226 + }, + { + "epoch": 3.3665525319877454, + "grad_norm": 0.2875816822052002, + "learning_rate": 2.9101310145813966e-06, + "loss": 0.3817, + "step": 6227 + }, + { + "epoch": 3.3670931699405298, + "grad_norm": 0.29908493161201477, + "learning_rate": 2.908416014340003e-06, + "loss": 0.3747, + "step": 6228 + }, + { + "epoch": 3.367633807893314, + "grad_norm": 0.28526777029037476, + "learning_rate": 2.906701312312861e-06, + "loss": 0.3823, + "step": 6229 + }, + { + "epoch": 3.3681744458460985, + "grad_norm": 0.33051398396492004, + "learning_rate": 2.9049869087444493e-06, + "loss": 0.4201, + "step": 6230 + }, + { + "epoch": 3.368715083798883, + "grad_norm": 0.36779916286468506, + "learning_rate": 2.903272803879207e-06, + "loss": 0.3933, + "step": 6231 + }, + { + "epoch": 3.369255721751667, + "grad_norm": 0.28889724612236023, + "learning_rate": 2.9015589979615244e-06, + "loss": 0.3779, + "step": 6232 + }, + { + "epoch": 3.369796359704451, + "grad_norm": 0.2862876057624817, + "learning_rate": 2.8998454912357578e-06, + "loss": 0.4034, + "step": 6233 + }, + { + "epoch": 3.3703369976572355, + "grad_norm": 0.29388126730918884, + "learning_rate": 2.8981322839462135e-06, + "loss": 0.3605, + "step": 6234 + }, + { + "epoch": 3.37087763561002, + "grad_norm": 0.27290070056915283, + "learning_rate": 2.8964193763371546e-06, + "loss": 0.4089, + "step": 6235 + }, + { + "epoch": 3.371418273562804, + "grad_norm": 0.30332091450691223, + "learning_rate": 2.894706768652809e-06, + "loss": 0.3899, + "step": 6236 + }, + { + "epoch": 3.3719589115155886, + "grad_norm": 0.30067744851112366, + "learning_rate": 2.8929944611373555e-06, + "loss": 0.3771, + "step": 6237 + }, + { + "epoch": 3.3724995494683725, + "grad_norm": 0.3083347976207733, + "learning_rate": 2.8912824540349315e-06, + "loss": 0.3745, + "step": 6238 + }, + { + "epoch": 3.373040187421157, + "grad_norm": 0.28111353516578674, + "learning_rate": 2.8895707475896295e-06, + "loss": 0.3767, + "step": 6239 + }, + { + "epoch": 3.373580825373941, + "grad_norm": 0.28267189860343933, + "learning_rate": 2.887859342045506e-06, + "loss": 0.372, + "step": 6240 + }, + { + "epoch": 3.3741214633267256, + "grad_norm": 0.31190162897109985, + "learning_rate": 2.8861482376465684e-06, + "loss": 0.3789, + "step": 6241 + }, + { + "epoch": 3.37466210127951, + "grad_norm": 0.2946929335594177, + "learning_rate": 2.88443743463678e-06, + "loss": 0.3609, + "step": 6242 + }, + { + "epoch": 3.375202739232294, + "grad_norm": 0.27581751346588135, + "learning_rate": 2.882726933260068e-06, + "loss": 0.356, + "step": 6243 + }, + { + "epoch": 3.375743377185078, + "grad_norm": 0.2839164435863495, + "learning_rate": 2.88101673376031e-06, + "loss": 0.3769, + "step": 6244 + }, + { + "epoch": 3.3762840151378626, + "grad_norm": 0.2926939129829407, + "learning_rate": 2.879306836381345e-06, + "loss": 0.3661, + "step": 6245 + }, + { + "epoch": 3.376824653090647, + "grad_norm": 0.2834808826446533, + "learning_rate": 2.877597241366967e-06, + "loss": 0.379, + "step": 6246 + }, + { + "epoch": 3.3773652910434313, + "grad_norm": 0.29527443647384644, + "learning_rate": 2.8758879489609243e-06, + "loss": 0.3931, + "step": 6247 + }, + { + "epoch": 3.3779059289962157, + "grad_norm": 0.2936485707759857, + "learning_rate": 2.874178959406928e-06, + "loss": 0.3575, + "step": 6248 + }, + { + "epoch": 3.378446566949, + "grad_norm": 0.2918454706668854, + "learning_rate": 2.872470272948642e-06, + "loss": 0.3708, + "step": 6249 + }, + { + "epoch": 3.378987204901784, + "grad_norm": 0.3029497563838959, + "learning_rate": 2.8707618898296864e-06, + "loss": 0.3921, + "step": 6250 + }, + { + "epoch": 3.3795278428545683, + "grad_norm": 0.3112659156322479, + "learning_rate": 2.869053810293638e-06, + "loss": 0.3724, + "step": 6251 + }, + { + "epoch": 3.3800684808073527, + "grad_norm": 0.2860032916069031, + "learning_rate": 2.8673460345840343e-06, + "loss": 0.3844, + "step": 6252 + }, + { + "epoch": 3.380609118760137, + "grad_norm": 0.28944653272628784, + "learning_rate": 2.8656385629443694e-06, + "loss": 0.3831, + "step": 6253 + }, + { + "epoch": 3.3811497567129214, + "grad_norm": 0.29049044847488403, + "learning_rate": 2.863931395618085e-06, + "loss": 0.3866, + "step": 6254 + }, + { + "epoch": 3.3816903946657053, + "grad_norm": 0.29390716552734375, + "learning_rate": 2.862224532848591e-06, + "loss": 0.3831, + "step": 6255 + }, + { + "epoch": 3.3822310326184897, + "grad_norm": 0.3249385356903076, + "learning_rate": 2.860517974879245e-06, + "loss": 0.3621, + "step": 6256 + }, + { + "epoch": 3.382771670571274, + "grad_norm": 0.2906329929828644, + "learning_rate": 2.858811721953369e-06, + "loss": 0.3788, + "step": 6257 + }, + { + "epoch": 3.3833123085240584, + "grad_norm": 0.28749001026153564, + "learning_rate": 2.8571057743142362e-06, + "loss": 0.3688, + "step": 6258 + }, + { + "epoch": 3.3838529464768428, + "grad_norm": 0.285239040851593, + "learning_rate": 2.855400132205074e-06, + "loss": 0.3551, + "step": 6259 + }, + { + "epoch": 3.384393584429627, + "grad_norm": 0.28307268023490906, + "learning_rate": 2.853694795869074e-06, + "loss": 0.3697, + "step": 6260 + }, + { + "epoch": 3.3849342223824115, + "grad_norm": 0.26978787779808044, + "learning_rate": 2.851989765549378e-06, + "loss": 0.3703, + "step": 6261 + }, + { + "epoch": 3.3854748603351954, + "grad_norm": 0.2827216386795044, + "learning_rate": 2.850285041489087e-06, + "loss": 0.367, + "step": 6262 + }, + { + "epoch": 3.3860154982879798, + "grad_norm": 0.2850663959980011, + "learning_rate": 2.8485806239312583e-06, + "loss": 0.3843, + "step": 6263 + }, + { + "epoch": 3.386556136240764, + "grad_norm": 0.31980374455451965, + "learning_rate": 2.8468765131189014e-06, + "loss": 0.3701, + "step": 6264 + }, + { + "epoch": 3.3870967741935485, + "grad_norm": 0.2919912338256836, + "learning_rate": 2.845172709294989e-06, + "loss": 0.3895, + "step": 6265 + }, + { + "epoch": 3.387637412146333, + "grad_norm": 0.27873820066452026, + "learning_rate": 2.843469212702445e-06, + "loss": 0.3873, + "step": 6266 + }, + { + "epoch": 3.3881780500991168, + "grad_norm": 0.2797130048274994, + "learning_rate": 2.8417660235841505e-06, + "loss": 0.3817, + "step": 6267 + }, + { + "epoch": 3.388718688051901, + "grad_norm": 0.27694785594940186, + "learning_rate": 2.840063142182941e-06, + "loss": 0.3765, + "step": 6268 + }, + { + "epoch": 3.3892593260046855, + "grad_norm": 0.27863171696662903, + "learning_rate": 2.838360568741613e-06, + "loss": 0.3735, + "step": 6269 + }, + { + "epoch": 3.38979996395747, + "grad_norm": 0.27632227540016174, + "learning_rate": 2.8366583035029194e-06, + "loss": 0.3637, + "step": 6270 + }, + { + "epoch": 3.390340601910254, + "grad_norm": 0.26599693298339844, + "learning_rate": 2.834956346709559e-06, + "loss": 0.3661, + "step": 6271 + }, + { + "epoch": 3.3908812398630386, + "grad_norm": 0.2922784090042114, + "learning_rate": 2.8332546986041986e-06, + "loss": 0.3835, + "step": 6272 + }, + { + "epoch": 3.3914218778158225, + "grad_norm": 0.2666107714176178, + "learning_rate": 2.831553359429453e-06, + "loss": 0.3753, + "step": 6273 + }, + { + "epoch": 3.391962515768607, + "grad_norm": 0.28382694721221924, + "learning_rate": 2.829852329427899e-06, + "loss": 0.3893, + "step": 6274 + }, + { + "epoch": 3.392503153721391, + "grad_norm": 0.2854803800582886, + "learning_rate": 2.8281516088420665e-06, + "loss": 0.3689, + "step": 6275 + }, + { + "epoch": 3.3930437916741756, + "grad_norm": 0.2664931118488312, + "learning_rate": 2.826451197914437e-06, + "loss": 0.3808, + "step": 6276 + }, + { + "epoch": 3.39358442962696, + "grad_norm": 0.2853836715221405, + "learning_rate": 2.824751096887457e-06, + "loss": 0.3745, + "step": 6277 + }, + { + "epoch": 3.3941250675797443, + "grad_norm": 0.2897249758243561, + "learning_rate": 2.8230513060035214e-06, + "loss": 0.3848, + "step": 6278 + }, + { + "epoch": 3.394665705532528, + "grad_norm": 0.28952860832214355, + "learning_rate": 2.821351825504984e-06, + "loss": 0.3816, + "step": 6279 + }, + { + "epoch": 3.3952063434853126, + "grad_norm": 0.3057219982147217, + "learning_rate": 2.819652655634151e-06, + "loss": 0.3585, + "step": 6280 + }, + { + "epoch": 3.395746981438097, + "grad_norm": 0.29877203702926636, + "learning_rate": 2.817953796633289e-06, + "loss": 0.3764, + "step": 6281 + }, + { + "epoch": 3.3962876193908813, + "grad_norm": 0.279131680727005, + "learning_rate": 2.816255248744622e-06, + "loss": 0.3687, + "step": 6282 + }, + { + "epoch": 3.3968282573436657, + "grad_norm": 0.27392497658729553, + "learning_rate": 2.8145570122103187e-06, + "loss": 0.3576, + "step": 6283 + }, + { + "epoch": 3.3973688952964496, + "grad_norm": 0.28131553530693054, + "learning_rate": 2.812859087272516e-06, + "loss": 0.3697, + "step": 6284 + }, + { + "epoch": 3.397909533249234, + "grad_norm": 0.2834169268608093, + "learning_rate": 2.8111614741732975e-06, + "loss": 0.3599, + "step": 6285 + }, + { + "epoch": 3.3984501712020183, + "grad_norm": 0.3012988269329071, + "learning_rate": 2.8094641731547088e-06, + "loss": 0.376, + "step": 6286 + }, + { + "epoch": 3.3989908091548027, + "grad_norm": 0.27377527952194214, + "learning_rate": 2.807767184458747e-06, + "loss": 0.364, + "step": 6287 + }, + { + "epoch": 3.399531447107587, + "grad_norm": 0.2820917069911957, + "learning_rate": 2.8060705083273633e-06, + "loss": 0.3602, + "step": 6288 + }, + { + "epoch": 3.4000720850603714, + "grad_norm": 0.2810240387916565, + "learning_rate": 2.8043741450024707e-06, + "loss": 0.3518, + "step": 6289 + }, + { + "epoch": 3.4006127230131558, + "grad_norm": 0.2800905704498291, + "learning_rate": 2.802678094725931e-06, + "loss": 0.3767, + "step": 6290 + }, + { + "epoch": 3.4011533609659397, + "grad_norm": 0.2750107944011688, + "learning_rate": 2.8009823577395633e-06, + "loss": 0.3773, + "step": 6291 + }, + { + "epoch": 3.401693998918724, + "grad_norm": 0.2940891981124878, + "learning_rate": 2.799286934285146e-06, + "loss": 0.3639, + "step": 6292 + }, + { + "epoch": 3.4022346368715084, + "grad_norm": 0.2891536355018616, + "learning_rate": 2.7975918246044047e-06, + "loss": 0.3637, + "step": 6293 + }, + { + "epoch": 3.4027752748242928, + "grad_norm": 0.289329469203949, + "learning_rate": 2.7958970289390317e-06, + "loss": 0.3712, + "step": 6294 + }, + { + "epoch": 3.403315912777077, + "grad_norm": 0.2884690463542938, + "learning_rate": 2.794202547530661e-06, + "loss": 0.3755, + "step": 6295 + }, + { + "epoch": 3.403856550729861, + "grad_norm": 0.2806275188922882, + "learning_rate": 2.7925083806208932e-06, + "loss": 0.4142, + "step": 6296 + }, + { + "epoch": 3.4043971886826454, + "grad_norm": 0.2923715114593506, + "learning_rate": 2.7908145284512765e-06, + "loss": 0.3557, + "step": 6297 + }, + { + "epoch": 3.4049378266354298, + "grad_norm": 0.303189754486084, + "learning_rate": 2.78912099126332e-06, + "loss": 0.3538, + "step": 6298 + }, + { + "epoch": 3.405478464588214, + "grad_norm": 0.3140781819820404, + "learning_rate": 2.7874277692984847e-06, + "loss": 0.3787, + "step": 6299 + }, + { + "epoch": 3.4060191025409985, + "grad_norm": 0.3113609552383423, + "learning_rate": 2.785734862798184e-06, + "loss": 0.3737, + "step": 6300 + }, + { + "epoch": 3.406559740493783, + "grad_norm": 0.29418936371803284, + "learning_rate": 2.7840422720037943e-06, + "loss": 0.3814, + "step": 6301 + }, + { + "epoch": 3.4071003784465668, + "grad_norm": 0.29379862546920776, + "learning_rate": 2.7823499971566393e-06, + "loss": 0.3775, + "step": 6302 + }, + { + "epoch": 3.407641016399351, + "grad_norm": 0.30091267824172974, + "learning_rate": 2.7806580384979986e-06, + "loss": 0.3897, + "step": 6303 + }, + { + "epoch": 3.4081816543521355, + "grad_norm": 0.2900888919830322, + "learning_rate": 2.7789663962691134e-06, + "loss": 0.3649, + "step": 6304 + }, + { + "epoch": 3.40872229230492, + "grad_norm": 0.2886979579925537, + "learning_rate": 2.77727507071117e-06, + "loss": 0.3823, + "step": 6305 + }, + { + "epoch": 3.409262930257704, + "grad_norm": 0.27417394518852234, + "learning_rate": 2.7755840620653212e-06, + "loss": 0.3676, + "step": 6306 + }, + { + "epoch": 3.4098035682104886, + "grad_norm": 0.321492999792099, + "learning_rate": 2.77389337057266e-06, + "loss": 0.396, + "step": 6307 + }, + { + "epoch": 3.4103442061632725, + "grad_norm": 0.29740339517593384, + "learning_rate": 2.7722029964742455e-06, + "loss": 0.3658, + "step": 6308 + }, + { + "epoch": 3.410884844116057, + "grad_norm": 0.2698226869106293, + "learning_rate": 2.77051294001109e-06, + "loss": 0.3788, + "step": 6309 + }, + { + "epoch": 3.411425482068841, + "grad_norm": 0.2841397523880005, + "learning_rate": 2.768823201424158e-06, + "loss": 0.3695, + "step": 6310 + }, + { + "epoch": 3.4119661200216256, + "grad_norm": 0.30393242835998535, + "learning_rate": 2.7671337809543684e-06, + "loss": 0.3695, + "step": 6311 + }, + { + "epoch": 3.41250675797441, + "grad_norm": 0.2840659022331238, + "learning_rate": 2.7654446788425935e-06, + "loss": 0.3843, + "step": 6312 + }, + { + "epoch": 3.413047395927194, + "grad_norm": 0.29241564869880676, + "learning_rate": 2.7637558953296672e-06, + "loss": 0.3643, + "step": 6313 + }, + { + "epoch": 3.4135880338799782, + "grad_norm": 0.32568466663360596, + "learning_rate": 2.7620674306563705e-06, + "loss": 0.3919, + "step": 6314 + }, + { + "epoch": 3.4141286718327626, + "grad_norm": 0.3067713677883148, + "learning_rate": 2.7603792850634402e-06, + "loss": 0.3785, + "step": 6315 + }, + { + "epoch": 3.414669309785547, + "grad_norm": 0.29410862922668457, + "learning_rate": 2.7586914587915727e-06, + "loss": 0.3765, + "step": 6316 + }, + { + "epoch": 3.4152099477383313, + "grad_norm": 0.27362555265426636, + "learning_rate": 2.757003952081411e-06, + "loss": 0.3661, + "step": 6317 + }, + { + "epoch": 3.4157505856911157, + "grad_norm": 0.280722051858902, + "learning_rate": 2.7553167651735624e-06, + "loss": 0.3637, + "step": 6318 + }, + { + "epoch": 3.4162912236439, + "grad_norm": 0.29213541746139526, + "learning_rate": 2.7536298983085762e-06, + "loss": 0.3646, + "step": 6319 + }, + { + "epoch": 3.416831861596684, + "grad_norm": 0.2971615195274353, + "learning_rate": 2.7519433517269665e-06, + "loss": 0.3644, + "step": 6320 + }, + { + "epoch": 3.4173724995494683, + "grad_norm": 0.28067779541015625, + "learning_rate": 2.7502571256691996e-06, + "loss": 0.3767, + "step": 6321 + }, + { + "epoch": 3.4179131375022527, + "grad_norm": 0.2956728935241699, + "learning_rate": 2.748571220375691e-06, + "loss": 0.3907, + "step": 6322 + }, + { + "epoch": 3.418453775455037, + "grad_norm": 0.2931300699710846, + "learning_rate": 2.746885636086819e-06, + "loss": 0.3807, + "step": 6323 + }, + { + "epoch": 3.4189944134078214, + "grad_norm": 0.27655330300331116, + "learning_rate": 2.745200373042904e-06, + "loss": 0.4191, + "step": 6324 + }, + { + "epoch": 3.4195350513606053, + "grad_norm": 0.2742091119289398, + "learning_rate": 2.7435154314842337e-06, + "loss": 0.3764, + "step": 6325 + }, + { + "epoch": 3.4200756893133897, + "grad_norm": 0.29484522342681885, + "learning_rate": 2.7418308116510395e-06, + "loss": 0.3795, + "step": 6326 + }, + { + "epoch": 3.420616327266174, + "grad_norm": 0.28292086720466614, + "learning_rate": 2.7401465137835164e-06, + "loss": 0.3737, + "step": 6327 + }, + { + "epoch": 3.4211569652189584, + "grad_norm": 0.2676083743572235, + "learning_rate": 2.7384625381218063e-06, + "loss": 0.3308, + "step": 6328 + }, + { + "epoch": 3.4216976031717428, + "grad_norm": 0.30103984475135803, + "learning_rate": 2.736778884906004e-06, + "loss": 0.3645, + "step": 6329 + }, + { + "epoch": 3.422238241124527, + "grad_norm": 0.2882280647754669, + "learning_rate": 2.7350955543761682e-06, + "loss": 0.3736, + "step": 6330 + }, + { + "epoch": 3.422778879077311, + "grad_norm": 0.28636249899864197, + "learning_rate": 2.7334125467723004e-06, + "loss": 0.3717, + "step": 6331 + }, + { + "epoch": 3.4233195170300954, + "grad_norm": 0.2803739309310913, + "learning_rate": 2.731729862334361e-06, + "loss": 0.3849, + "step": 6332 + }, + { + "epoch": 3.4238601549828798, + "grad_norm": 0.2867929935455322, + "learning_rate": 2.7300475013022666e-06, + "loss": 0.3846, + "step": 6333 + }, + { + "epoch": 3.424400792935664, + "grad_norm": 0.28699979186058044, + "learning_rate": 2.7283654639158817e-06, + "loss": 0.3647, + "step": 6334 + }, + { + "epoch": 3.4249414308884485, + "grad_norm": 0.34597089886665344, + "learning_rate": 2.7266837504150345e-06, + "loss": 0.3619, + "step": 6335 + }, + { + "epoch": 3.425482068841233, + "grad_norm": 0.2675110995769501, + "learning_rate": 2.7250023610394926e-06, + "loss": 0.3752, + "step": 6336 + }, + { + "epoch": 3.4260227067940168, + "grad_norm": 0.28611132502555847, + "learning_rate": 2.723321296028989e-06, + "loss": 0.3931, + "step": 6337 + }, + { + "epoch": 3.426563344746801, + "grad_norm": 0.2917852997779846, + "learning_rate": 2.7216405556232093e-06, + "loss": 0.3918, + "step": 6338 + }, + { + "epoch": 3.4271039826995855, + "grad_norm": 0.2738979458808899, + "learning_rate": 2.719960140061788e-06, + "loss": 0.3762, + "step": 6339 + }, + { + "epoch": 3.42764462065237, + "grad_norm": 0.2562112808227539, + "learning_rate": 2.7182800495843166e-06, + "loss": 0.389, + "step": 6340 + }, + { + "epoch": 3.428185258605154, + "grad_norm": 0.28876200318336487, + "learning_rate": 2.7166002844303365e-06, + "loss": 0.3626, + "step": 6341 + }, + { + "epoch": 3.428725896557938, + "grad_norm": 0.28462162613868713, + "learning_rate": 2.7149208448393494e-06, + "loss": 0.3956, + "step": 6342 + }, + { + "epoch": 3.4292665345107225, + "grad_norm": 0.27688026428222656, + "learning_rate": 2.713241731050805e-06, + "loss": 0.3967, + "step": 6343 + }, + { + "epoch": 3.429807172463507, + "grad_norm": 0.28863951563835144, + "learning_rate": 2.711562943304107e-06, + "loss": 0.3624, + "step": 6344 + }, + { + "epoch": 3.430347810416291, + "grad_norm": 0.30096060037612915, + "learning_rate": 2.7098844818386164e-06, + "loss": 0.388, + "step": 6345 + }, + { + "epoch": 3.4308884483690756, + "grad_norm": 0.2913476228713989, + "learning_rate": 2.7082063468936427e-06, + "loss": 0.3303, + "step": 6346 + }, + { + "epoch": 3.43142908632186, + "grad_norm": 0.2739192843437195, + "learning_rate": 2.706528538708455e-06, + "loss": 0.3583, + "step": 6347 + }, + { + "epoch": 3.4319697242746443, + "grad_norm": 0.2901630699634552, + "learning_rate": 2.70485105752227e-06, + "loss": 0.3728, + "step": 6348 + }, + { + "epoch": 3.4325103622274282, + "grad_norm": 0.2617281973361969, + "learning_rate": 2.7031739035742575e-06, + "loss": 0.3658, + "step": 6349 + }, + { + "epoch": 3.4330510001802126, + "grad_norm": 0.26667746901512146, + "learning_rate": 2.7014970771035474e-06, + "loss": 0.3693, + "step": 6350 + }, + { + "epoch": 3.433591638132997, + "grad_norm": 0.2691449820995331, + "learning_rate": 2.6998205783492167e-06, + "loss": 0.3736, + "step": 6351 + }, + { + "epoch": 3.4341322760857813, + "grad_norm": 0.29237818717956543, + "learning_rate": 2.6981444075502973e-06, + "loss": 0.3716, + "step": 6352 + }, + { + "epoch": 3.4346729140385657, + "grad_norm": 0.28428447246551514, + "learning_rate": 2.6964685649457727e-06, + "loss": 0.3765, + "step": 6353 + }, + { + "epoch": 3.4352135519913496, + "grad_norm": 0.254072368144989, + "learning_rate": 2.694793050774586e-06, + "loss": 0.352, + "step": 6354 + }, + { + "epoch": 3.435754189944134, + "grad_norm": 0.29546475410461426, + "learning_rate": 2.6931178652756262e-06, + "loss": 0.3975, + "step": 6355 + }, + { + "epoch": 3.4362948278969183, + "grad_norm": 0.30482378602027893, + "learning_rate": 2.6914430086877365e-06, + "loss": 0.3714, + "step": 6356 + }, + { + "epoch": 3.4368354658497027, + "grad_norm": 0.30705294013023376, + "learning_rate": 2.6897684812497193e-06, + "loss": 0.3917, + "step": 6357 + }, + { + "epoch": 3.437376103802487, + "grad_norm": 0.2723318636417389, + "learning_rate": 2.688094283200321e-06, + "loss": 0.3823, + "step": 6358 + }, + { + "epoch": 3.4379167417552714, + "grad_norm": 0.2744968831539154, + "learning_rate": 2.68642041477825e-06, + "loss": 0.3712, + "step": 6359 + }, + { + "epoch": 3.4384573797080553, + "grad_norm": 0.27377021312713623, + "learning_rate": 2.6847468762221616e-06, + "loss": 0.3744, + "step": 6360 + }, + { + "epoch": 3.4389980176608397, + "grad_norm": 0.31088465452194214, + "learning_rate": 2.6830736677706637e-06, + "loss": 0.3433, + "step": 6361 + }, + { + "epoch": 3.439538655613624, + "grad_norm": 0.28350019454956055, + "learning_rate": 2.6814007896623235e-06, + "loss": 0.3728, + "step": 6362 + }, + { + "epoch": 3.4400792935664084, + "grad_norm": 0.293083131313324, + "learning_rate": 2.6797282421356546e-06, + "loss": 0.3854, + "step": 6363 + }, + { + "epoch": 3.4406199315191928, + "grad_norm": 0.26938867568969727, + "learning_rate": 2.6780560254291267e-06, + "loss": 0.3763, + "step": 6364 + }, + { + "epoch": 3.441160569471977, + "grad_norm": 0.523354709148407, + "learning_rate": 2.6763841397811576e-06, + "loss": 0.3775, + "step": 6365 + }, + { + "epoch": 3.441701207424761, + "grad_norm": 0.28603750467300415, + "learning_rate": 2.674712585430126e-06, + "loss": 0.3551, + "step": 6366 + }, + { + "epoch": 3.4422418453775454, + "grad_norm": 0.28473588824272156, + "learning_rate": 2.673041362614361e-06, + "loss": 0.3991, + "step": 6367 + }, + { + "epoch": 3.4427824833303298, + "grad_norm": 0.2877833843231201, + "learning_rate": 2.6713704715721357e-06, + "loss": 0.3777, + "step": 6368 + }, + { + "epoch": 3.443323121283114, + "grad_norm": 0.26864901185035706, + "learning_rate": 2.6696999125416887e-06, + "loss": 0.3776, + "step": 6369 + }, + { + "epoch": 3.4438637592358985, + "grad_norm": 0.2873307764530182, + "learning_rate": 2.668029685761201e-06, + "loss": 0.3812, + "step": 6370 + }, + { + "epoch": 3.4444043971886824, + "grad_norm": 0.29649487137794495, + "learning_rate": 2.666359791468815e-06, + "loss": 0.3738, + "step": 6371 + }, + { + "epoch": 3.4449450351414668, + "grad_norm": 0.2977284789085388, + "learning_rate": 2.6646902299026183e-06, + "loss": 0.3693, + "step": 6372 + }, + { + "epoch": 3.445485673094251, + "grad_norm": 0.28812456130981445, + "learning_rate": 2.663021001300653e-06, + "loss": 0.3965, + "step": 6373 + }, + { + "epoch": 3.4460263110470355, + "grad_norm": 0.3119604289531708, + "learning_rate": 2.6613521059009172e-06, + "loss": 0.3864, + "step": 6374 + }, + { + "epoch": 3.44656694899982, + "grad_norm": 0.3055315911769867, + "learning_rate": 2.6596835439413584e-06, + "loss": 0.3916, + "step": 6375 + }, + { + "epoch": 3.447107586952604, + "grad_norm": 0.27192702889442444, + "learning_rate": 2.6580153156598742e-06, + "loss": 0.3581, + "step": 6376 + }, + { + "epoch": 3.4476482249053886, + "grad_norm": 0.32477402687072754, + "learning_rate": 2.656347421294323e-06, + "loss": 0.3677, + "step": 6377 + }, + { + "epoch": 3.4481888628581725, + "grad_norm": 0.30228251218795776, + "learning_rate": 2.6546798610825043e-06, + "loss": 0.3747, + "step": 6378 + }, + { + "epoch": 3.448729500810957, + "grad_norm": 0.281707763671875, + "learning_rate": 2.6530126352621834e-06, + "loss": 0.3537, + "step": 6379 + }, + { + "epoch": 3.4492701387637412, + "grad_norm": 0.2878420054912567, + "learning_rate": 2.6513457440710612e-06, + "loss": 0.3984, + "step": 6380 + }, + { + "epoch": 3.4498107767165256, + "grad_norm": 0.29713553190231323, + "learning_rate": 2.6496791877468063e-06, + "loss": 0.3733, + "step": 6381 + }, + { + "epoch": 3.45035141466931, + "grad_norm": 0.304828941822052, + "learning_rate": 2.6480129665270295e-06, + "loss": 0.3698, + "step": 6382 + }, + { + "epoch": 3.450892052622094, + "grad_norm": 0.29700446128845215, + "learning_rate": 2.6463470806493012e-06, + "loss": 0.3744, + "step": 6383 + }, + { + "epoch": 3.4514326905748782, + "grad_norm": 0.3028167486190796, + "learning_rate": 2.644681530351139e-06, + "loss": 0.3363, + "step": 6384 + }, + { + "epoch": 3.4519733285276626, + "grad_norm": 0.28015580773353577, + "learning_rate": 2.6430163158700116e-06, + "loss": 0.3655, + "step": 6385 + }, + { + "epoch": 3.452513966480447, + "grad_norm": 0.2895985543727875, + "learning_rate": 2.641351437443347e-06, + "loss": 0.3962, + "step": 6386 + }, + { + "epoch": 3.4530546044332313, + "grad_norm": 0.28600218892097473, + "learning_rate": 2.639686895308515e-06, + "loss": 0.3939, + "step": 6387 + }, + { + "epoch": 3.4535952423860157, + "grad_norm": 0.292057067155838, + "learning_rate": 2.638022689702849e-06, + "loss": 0.3628, + "step": 6388 + }, + { + "epoch": 3.4541358803387996, + "grad_norm": 0.2844478487968445, + "learning_rate": 2.6363588208636246e-06, + "loss": 0.3835, + "step": 6389 + }, + { + "epoch": 3.454676518291584, + "grad_norm": 0.2956898808479309, + "learning_rate": 2.634695289028072e-06, + "loss": 0.3734, + "step": 6390 + }, + { + "epoch": 3.4552171562443683, + "grad_norm": 0.29271790385246277, + "learning_rate": 2.6330320944333787e-06, + "loss": 0.3908, + "step": 6391 + }, + { + "epoch": 3.4557577941971527, + "grad_norm": 0.2900015711784363, + "learning_rate": 2.6313692373166777e-06, + "loss": 0.3847, + "step": 6392 + }, + { + "epoch": 3.456298432149937, + "grad_norm": 0.2838262617588043, + "learning_rate": 2.6297067179150566e-06, + "loss": 0.3811, + "step": 6393 + }, + { + "epoch": 3.4568390701027214, + "grad_norm": 0.27128735184669495, + "learning_rate": 2.6280445364655516e-06, + "loss": 0.3587, + "step": 6394 + }, + { + "epoch": 3.4573797080555053, + "grad_norm": 0.26563310623168945, + "learning_rate": 2.6263826932051562e-06, + "loss": 0.4066, + "step": 6395 + }, + { + "epoch": 3.4579203460082897, + "grad_norm": 0.2762858271598816, + "learning_rate": 2.624721188370817e-06, + "loss": 0.3763, + "step": 6396 + }, + { + "epoch": 3.458460983961074, + "grad_norm": 0.2815711796283722, + "learning_rate": 2.6230600221994195e-06, + "loss": 0.3641, + "step": 6397 + }, + { + "epoch": 3.4590016219138584, + "grad_norm": 0.28200116753578186, + "learning_rate": 2.621399194927817e-06, + "loss": 0.372, + "step": 6398 + }, + { + "epoch": 3.4595422598666428, + "grad_norm": 0.27632999420166016, + "learning_rate": 2.619738706792802e-06, + "loss": 0.3757, + "step": 6399 + }, + { + "epoch": 3.4600828978194267, + "grad_norm": 0.2843749523162842, + "learning_rate": 2.6180785580311284e-06, + "loss": 0.3813, + "step": 6400 + }, + { + "epoch": 3.460623535772211, + "grad_norm": 0.2992981970310211, + "learning_rate": 2.6164187488794958e-06, + "loss": 0.3514, + "step": 6401 + }, + { + "epoch": 3.4611641737249954, + "grad_norm": 0.27975744009017944, + "learning_rate": 2.614759279574555e-06, + "loss": 0.3778, + "step": 6402 + }, + { + "epoch": 3.4617048116777798, + "grad_norm": 0.2758598327636719, + "learning_rate": 2.613100150352912e-06, + "loss": 0.3692, + "step": 6403 + }, + { + "epoch": 3.462245449630564, + "grad_norm": 0.3005410134792328, + "learning_rate": 2.6114413614511227e-06, + "loss": 0.3693, + "step": 6404 + }, + { + "epoch": 3.4627860875833485, + "grad_norm": 0.2945283055305481, + "learning_rate": 2.609782913105691e-06, + "loss": 0.371, + "step": 6405 + }, + { + "epoch": 3.463326725536133, + "grad_norm": 0.29336419701576233, + "learning_rate": 2.6081248055530796e-06, + "loss": 0.3838, + "step": 6406 + }, + { + "epoch": 3.4638673634889168, + "grad_norm": 0.2629316449165344, + "learning_rate": 2.606467039029695e-06, + "loss": 0.3449, + "step": 6407 + }, + { + "epoch": 3.464408001441701, + "grad_norm": 0.2649190127849579, + "learning_rate": 2.604809613771904e-06, + "loss": 0.3821, + "step": 6408 + }, + { + "epoch": 3.4649486393944855, + "grad_norm": 0.2621277868747711, + "learning_rate": 2.603152530016012e-06, + "loss": 0.3835, + "step": 6409 + }, + { + "epoch": 3.46548927734727, + "grad_norm": 0.2772118151187897, + "learning_rate": 2.601495787998288e-06, + "loss": 0.3505, + "step": 6410 + }, + { + "epoch": 3.466029915300054, + "grad_norm": 0.28124377131462097, + "learning_rate": 2.5998393879549444e-06, + "loss": 0.3998, + "step": 6411 + }, + { + "epoch": 3.466570553252838, + "grad_norm": 0.30214518308639526, + "learning_rate": 2.5981833301221505e-06, + "loss": 0.37, + "step": 6412 + }, + { + "epoch": 3.4671111912056225, + "grad_norm": 0.28282877802848816, + "learning_rate": 2.5965276147360226e-06, + "loss": 0.3745, + "step": 6413 + }, + { + "epoch": 3.467651829158407, + "grad_norm": 0.301487535238266, + "learning_rate": 2.594872242032628e-06, + "loss": 0.3783, + "step": 6414 + }, + { + "epoch": 3.4681924671111912, + "grad_norm": 0.281084805727005, + "learning_rate": 2.59321721224799e-06, + "loss": 0.3889, + "step": 6415 + }, + { + "epoch": 3.4687331050639756, + "grad_norm": 0.29001906514167786, + "learning_rate": 2.591562525618078e-06, + "loss": 0.404, + "step": 6416 + }, + { + "epoch": 3.46927374301676, + "grad_norm": 0.2739716172218323, + "learning_rate": 2.589908182378813e-06, + "loss": 0.3885, + "step": 6417 + }, + { + "epoch": 3.4698143809695443, + "grad_norm": 0.26952242851257324, + "learning_rate": 2.5882541827660713e-06, + "loss": 0.3499, + "step": 6418 + }, + { + "epoch": 3.4703550189223282, + "grad_norm": 0.30054211616516113, + "learning_rate": 2.586600527015673e-06, + "loss": 0.3917, + "step": 6419 + }, + { + "epoch": 3.4708956568751126, + "grad_norm": 0.28374937176704407, + "learning_rate": 2.5849472153634003e-06, + "loss": 0.3627, + "step": 6420 + }, + { + "epoch": 3.471436294827897, + "grad_norm": 0.2651611864566803, + "learning_rate": 2.583294248044971e-06, + "loss": 0.3922, + "step": 6421 + }, + { + "epoch": 3.4719769327806813, + "grad_norm": 0.3104220926761627, + "learning_rate": 2.5816416252960673e-06, + "loss": 0.3828, + "step": 6422 + }, + { + "epoch": 3.4725175707334657, + "grad_norm": 0.26732105016708374, + "learning_rate": 2.579989347352314e-06, + "loss": 0.3582, + "step": 6423 + }, + { + "epoch": 3.4730582086862496, + "grad_norm": 0.285390704870224, + "learning_rate": 2.5783374144492946e-06, + "loss": 0.3707, + "step": 6424 + }, + { + "epoch": 3.473598846639034, + "grad_norm": 0.2892547845840454, + "learning_rate": 2.576685826822535e-06, + "loss": 0.3966, + "step": 6425 + }, + { + "epoch": 3.4741394845918183, + "grad_norm": 0.28802600502967834, + "learning_rate": 2.575034584707515e-06, + "loss": 0.4054, + "step": 6426 + }, + { + "epoch": 3.4746801225446027, + "grad_norm": 0.2835301458835602, + "learning_rate": 2.573383688339669e-06, + "loss": 0.3619, + "step": 6427 + }, + { + "epoch": 3.475220760497387, + "grad_norm": 0.28567731380462646, + "learning_rate": 2.5717331379543775e-06, + "loss": 0.3949, + "step": 6428 + }, + { + "epoch": 3.475761398450171, + "grad_norm": 0.3027955889701843, + "learning_rate": 2.57008293378697e-06, + "loss": 0.3935, + "step": 6429 + }, + { + "epoch": 3.4763020364029553, + "grad_norm": 0.2885318994522095, + "learning_rate": 2.568433076072734e-06, + "loss": 0.3822, + "step": 6430 + }, + { + "epoch": 3.4768426743557397, + "grad_norm": 0.2797601521015167, + "learning_rate": 2.566783565046899e-06, + "loss": 0.3862, + "step": 6431 + }, + { + "epoch": 3.477383312308524, + "grad_norm": 0.27476048469543457, + "learning_rate": 2.565134400944656e-06, + "loss": 0.36, + "step": 6432 + }, + { + "epoch": 3.4779239502613084, + "grad_norm": 0.2893754541873932, + "learning_rate": 2.563485584001132e-06, + "loss": 0.3567, + "step": 6433 + }, + { + "epoch": 3.4784645882140928, + "grad_norm": 0.27068889141082764, + "learning_rate": 2.5618371144514147e-06, + "loss": 0.4028, + "step": 6434 + }, + { + "epoch": 3.479005226166877, + "grad_norm": 0.30441489815711975, + "learning_rate": 2.5601889925305433e-06, + "loss": 0.3997, + "step": 6435 + }, + { + "epoch": 3.479545864119661, + "grad_norm": 0.291677862405777, + "learning_rate": 2.558541218473502e-06, + "loss": 0.3899, + "step": 6436 + }, + { + "epoch": 3.4800865020724454, + "grad_norm": 0.3089935779571533, + "learning_rate": 2.5568937925152272e-06, + "loss": 0.3796, + "step": 6437 + }, + { + "epoch": 3.4806271400252298, + "grad_norm": 0.2960951030254364, + "learning_rate": 2.5552467148906034e-06, + "loss": 0.3662, + "step": 6438 + }, + { + "epoch": 3.481167777978014, + "grad_norm": 0.2769000232219696, + "learning_rate": 2.553599985834472e-06, + "loss": 0.3844, + "step": 6439 + }, + { + "epoch": 3.4817084159307985, + "grad_norm": 0.2807958722114563, + "learning_rate": 2.5519536055816194e-06, + "loss": 0.3914, + "step": 6440 + }, + { + "epoch": 3.4822490538835824, + "grad_norm": 0.2794989049434662, + "learning_rate": 2.5503075743667815e-06, + "loss": 0.3942, + "step": 6441 + }, + { + "epoch": 3.4827896918363668, + "grad_norm": 0.2745060920715332, + "learning_rate": 2.54866189242465e-06, + "loss": 0.3946, + "step": 6442 + }, + { + "epoch": 3.483330329789151, + "grad_norm": 0.293303906917572, + "learning_rate": 2.5470165599898588e-06, + "loss": 0.3793, + "step": 6443 + }, + { + "epoch": 3.4838709677419355, + "grad_norm": 0.26405566930770874, + "learning_rate": 2.545371577297002e-06, + "loss": 0.3808, + "step": 6444 + }, + { + "epoch": 3.48441160569472, + "grad_norm": 0.28149479627609253, + "learning_rate": 2.5437269445806146e-06, + "loss": 0.372, + "step": 6445 + }, + { + "epoch": 3.484952243647504, + "grad_norm": 0.2879987359046936, + "learning_rate": 2.5420826620751837e-06, + "loss": 0.3932, + "step": 6446 + }, + { + "epoch": 3.4854928816002886, + "grad_norm": 0.2887576222419739, + "learning_rate": 2.540438730015152e-06, + "loss": 0.3646, + "step": 6447 + }, + { + "epoch": 3.4860335195530725, + "grad_norm": 0.2707485258579254, + "learning_rate": 2.538795148634907e-06, + "loss": 0.3803, + "step": 6448 + }, + { + "epoch": 3.486574157505857, + "grad_norm": 0.26814743876457214, + "learning_rate": 2.5371519181687877e-06, + "loss": 0.385, + "step": 6449 + }, + { + "epoch": 3.4871147954586412, + "grad_norm": 0.2806251645088196, + "learning_rate": 2.5355090388510806e-06, + "loss": 0.3794, + "step": 6450 + }, + { + "epoch": 3.4876554334114256, + "grad_norm": 0.275668740272522, + "learning_rate": 2.5338665109160274e-06, + "loss": 0.3707, + "step": 6451 + }, + { + "epoch": 3.48819607136421, + "grad_norm": 0.42301616072654724, + "learning_rate": 2.5322243345978147e-06, + "loss": 0.365, + "step": 6452 + }, + { + "epoch": 3.488736709316994, + "grad_norm": 0.28061380982398987, + "learning_rate": 2.5305825101305835e-06, + "loss": 0.3672, + "step": 6453 + }, + { + "epoch": 3.4892773472697782, + "grad_norm": 0.2842809855937958, + "learning_rate": 2.5289410377484202e-06, + "loss": 0.3786, + "step": 6454 + }, + { + "epoch": 3.4898179852225626, + "grad_norm": 0.2861570715904236, + "learning_rate": 2.527299917685362e-06, + "loss": 0.4012, + "step": 6455 + }, + { + "epoch": 3.490358623175347, + "grad_norm": 0.30325013399124146, + "learning_rate": 2.5256591501754003e-06, + "loss": 0.3936, + "step": 6456 + }, + { + "epoch": 3.4908992611281313, + "grad_norm": 0.25236570835113525, + "learning_rate": 2.5240187354524704e-06, + "loss": 0.3708, + "step": 6457 + }, + { + "epoch": 3.4914398990809152, + "grad_norm": 0.2806876003742218, + "learning_rate": 2.5223786737504587e-06, + "loss": 0.3862, + "step": 6458 + }, + { + "epoch": 3.4919805370336996, + "grad_norm": 0.2911386787891388, + "learning_rate": 2.5207389653032044e-06, + "loss": 0.3806, + "step": 6459 + }, + { + "epoch": 3.492521174986484, + "grad_norm": 0.28653475642204285, + "learning_rate": 2.519099610344492e-06, + "loss": 0.3868, + "step": 6460 + }, + { + "epoch": 3.4930618129392683, + "grad_norm": 0.2671637535095215, + "learning_rate": 2.517460609108063e-06, + "loss": 0.3599, + "step": 6461 + }, + { + "epoch": 3.4936024508920527, + "grad_norm": 0.2724429965019226, + "learning_rate": 2.515821961827595e-06, + "loss": 0.3865, + "step": 6462 + }, + { + "epoch": 3.494143088844837, + "grad_norm": 0.27978870272636414, + "learning_rate": 2.5141836687367273e-06, + "loss": 0.385, + "step": 6463 + }, + { + "epoch": 3.4946837267976214, + "grad_norm": 0.3110909163951874, + "learning_rate": 2.5125457300690477e-06, + "loss": 0.3856, + "step": 6464 + }, + { + "epoch": 3.4952243647504053, + "grad_norm": 0.27133485674858093, + "learning_rate": 2.5109081460580875e-06, + "loss": 0.3717, + "step": 6465 + }, + { + "epoch": 3.4957650027031897, + "grad_norm": 0.2729180157184601, + "learning_rate": 2.5092709169373307e-06, + "loss": 0.3655, + "step": 6466 + }, + { + "epoch": 3.496305640655974, + "grad_norm": 0.31530043482780457, + "learning_rate": 2.5076340429402086e-06, + "loss": 0.3953, + "step": 6467 + }, + { + "epoch": 3.4968462786087584, + "grad_norm": 0.2778479754924774, + "learning_rate": 2.5059975243001077e-06, + "loss": 0.3731, + "step": 6468 + }, + { + "epoch": 3.4973869165615428, + "grad_norm": 0.28354212641716003, + "learning_rate": 2.504361361250358e-06, + "loss": 0.3647, + "step": 6469 + }, + { + "epoch": 3.4979275545143267, + "grad_norm": 0.29331958293914795, + "learning_rate": 2.502725554024239e-06, + "loss": 0.3747, + "step": 6470 + }, + { + "epoch": 3.498468192467111, + "grad_norm": 0.29107895493507385, + "learning_rate": 2.501090102854984e-06, + "loss": 0.3673, + "step": 6471 + }, + { + "epoch": 3.4990088304198954, + "grad_norm": 0.3013686239719391, + "learning_rate": 2.4994550079757696e-06, + "loss": 0.3842, + "step": 6472 + }, + { + "epoch": 3.4995494683726798, + "grad_norm": 0.2720975875854492, + "learning_rate": 2.497820269619728e-06, + "loss": 0.3525, + "step": 6473 + }, + { + "epoch": 3.500090106325464, + "grad_norm": 0.290024071931839, + "learning_rate": 2.4961858880199357e-06, + "loss": 0.3963, + "step": 6474 + }, + { + "epoch": 3.5006307442782485, + "grad_norm": 0.30103790760040283, + "learning_rate": 2.494551863409418e-06, + "loss": 0.3658, + "step": 6475 + }, + { + "epoch": 3.501171382231033, + "grad_norm": 0.3023599088191986, + "learning_rate": 2.4929181960211553e-06, + "loss": 0.372, + "step": 6476 + }, + { + "epoch": 3.5017120201838168, + "grad_norm": 0.27599939703941345, + "learning_rate": 2.49128488608807e-06, + "loss": 0.3597, + "step": 6477 + }, + { + "epoch": 3.502252658136601, + "grad_norm": 0.2933019995689392, + "learning_rate": 2.4896519338430376e-06, + "loss": 0.3846, + "step": 6478 + }, + { + "epoch": 3.5027932960893855, + "grad_norm": 0.2922401428222656, + "learning_rate": 2.4880193395188785e-06, + "loss": 0.3885, + "step": 6479 + }, + { + "epoch": 3.50333393404217, + "grad_norm": 0.30111080408096313, + "learning_rate": 2.4863871033483693e-06, + "loss": 0.3723, + "step": 6480 + }, + { + "epoch": 3.5038745719949542, + "grad_norm": 0.2843570113182068, + "learning_rate": 2.48475522556423e-06, + "loss": 0.3926, + "step": 6481 + }, + { + "epoch": 3.504415209947738, + "grad_norm": 0.27177754044532776, + "learning_rate": 2.4831237063991277e-06, + "loss": 0.3886, + "step": 6482 + }, + { + "epoch": 3.5049558479005225, + "grad_norm": 0.28537437319755554, + "learning_rate": 2.481492546085686e-06, + "loss": 0.3618, + "step": 6483 + }, + { + "epoch": 3.505496485853307, + "grad_norm": 0.29277586936950684, + "learning_rate": 2.4798617448564688e-06, + "loss": 0.3695, + "step": 6484 + }, + { + "epoch": 3.5060371238060912, + "grad_norm": 0.27317342162132263, + "learning_rate": 2.478231302943997e-06, + "loss": 0.3676, + "step": 6485 + }, + { + "epoch": 3.5065777617588756, + "grad_norm": 0.283464640378952, + "learning_rate": 2.4766012205807333e-06, + "loss": 0.3673, + "step": 6486 + }, + { + "epoch": 3.5071183997116595, + "grad_norm": 0.2851029336452484, + "learning_rate": 2.474971497999091e-06, + "loss": 0.39, + "step": 6487 + }, + { + "epoch": 3.5076590376644443, + "grad_norm": 0.2650279700756073, + "learning_rate": 2.4733421354314355e-06, + "loss": 0.387, + "step": 6488 + }, + { + "epoch": 3.5081996756172282, + "grad_norm": 0.28088340163230896, + "learning_rate": 2.471713133110078e-06, + "loss": 0.3983, + "step": 6489 + }, + { + "epoch": 3.5087403135700126, + "grad_norm": 0.2757152020931244, + "learning_rate": 2.470084491267278e-06, + "loss": 0.3508, + "step": 6490 + }, + { + "epoch": 3.509280951522797, + "grad_norm": 0.2685333788394928, + "learning_rate": 2.4684562101352414e-06, + "loss": 0.3643, + "step": 6491 + }, + { + "epoch": 3.5098215894755813, + "grad_norm": 0.301870197057724, + "learning_rate": 2.466828289946129e-06, + "loss": 0.3753, + "step": 6492 + }, + { + "epoch": 3.5103622274283657, + "grad_norm": 0.27955588698387146, + "learning_rate": 2.4652007309320497e-06, + "loss": 0.3644, + "step": 6493 + }, + { + "epoch": 3.5109028653811496, + "grad_norm": 0.26468172669410706, + "learning_rate": 2.4635735333250506e-06, + "loss": 0.3469, + "step": 6494 + }, + { + "epoch": 3.511443503333934, + "grad_norm": 0.3008752465248108, + "learning_rate": 2.46194669735714e-06, + "loss": 0.3648, + "step": 6495 + }, + { + "epoch": 3.5119841412867183, + "grad_norm": 0.2742179036140442, + "learning_rate": 2.460320223260266e-06, + "loss": 0.3562, + "step": 6496 + }, + { + "epoch": 3.5125247792395027, + "grad_norm": 0.26549032330513, + "learning_rate": 2.4586941112663315e-06, + "loss": 0.4081, + "step": 6497 + }, + { + "epoch": 3.513065417192287, + "grad_norm": 0.2619190216064453, + "learning_rate": 2.457068361607183e-06, + "loss": 0.3814, + "step": 6498 + }, + { + "epoch": 3.513606055145071, + "grad_norm": 0.301523357629776, + "learning_rate": 2.4554429745146145e-06, + "loss": 0.3753, + "step": 6499 + }, + { + "epoch": 3.5141466930978553, + "grad_norm": 0.2761991620063782, + "learning_rate": 2.4538179502203753e-06, + "loss": 0.3595, + "step": 6500 + }, + { + "epoch": 3.5146873310506397, + "grad_norm": 0.28614601492881775, + "learning_rate": 2.452193288956157e-06, + "loss": 0.3744, + "step": 6501 + }, + { + "epoch": 3.515227969003424, + "grad_norm": 0.27287283539772034, + "learning_rate": 2.4505689909535967e-06, + "loss": 0.3642, + "step": 6502 + }, + { + "epoch": 3.5157686069562084, + "grad_norm": 0.28784000873565674, + "learning_rate": 2.4489450564442903e-06, + "loss": 0.3818, + "step": 6503 + }, + { + "epoch": 3.5163092449089928, + "grad_norm": 0.26401007175445557, + "learning_rate": 2.44732148565977e-06, + "loss": 0.3725, + "step": 6504 + }, + { + "epoch": 3.516849882861777, + "grad_norm": 0.2846088111400604, + "learning_rate": 2.445698278831528e-06, + "loss": 0.3716, + "step": 6505 + }, + { + "epoch": 3.517390520814561, + "grad_norm": 0.2900800406932831, + "learning_rate": 2.44407543619099e-06, + "loss": 0.3732, + "step": 6506 + }, + { + "epoch": 3.5179311587673454, + "grad_norm": 0.28280070424079895, + "learning_rate": 2.442452957969545e-06, + "loss": 0.4047, + "step": 6507 + }, + { + "epoch": 3.5184717967201298, + "grad_norm": 0.2799927890300751, + "learning_rate": 2.4408308443985172e-06, + "loss": 0.3684, + "step": 6508 + }, + { + "epoch": 3.519012434672914, + "grad_norm": 0.30428990721702576, + "learning_rate": 2.43920909570919e-06, + "loss": 0.3816, + "step": 6509 + }, + { + "epoch": 3.5195530726256985, + "grad_norm": 0.27521735429763794, + "learning_rate": 2.437587712132787e-06, + "loss": 0.3545, + "step": 6510 + }, + { + "epoch": 3.5200937105784824, + "grad_norm": 0.2745533585548401, + "learning_rate": 2.4359666939004793e-06, + "loss": 0.3666, + "step": 6511 + }, + { + "epoch": 3.5206343485312668, + "grad_norm": 0.2788096070289612, + "learning_rate": 2.4343460412433947e-06, + "loss": 0.378, + "step": 6512 + }, + { + "epoch": 3.521174986484051, + "grad_norm": 0.28982996940612793, + "learning_rate": 2.4327257543925986e-06, + "loss": 0.3944, + "step": 6513 + }, + { + "epoch": 3.5217156244368355, + "grad_norm": 0.2812902629375458, + "learning_rate": 2.431105833579108e-06, + "loss": 0.3789, + "step": 6514 + }, + { + "epoch": 3.52225626238962, + "grad_norm": 0.29696497321128845, + "learning_rate": 2.429486279033892e-06, + "loss": 0.3819, + "step": 6515 + }, + { + "epoch": 3.522796900342404, + "grad_norm": 0.25321662425994873, + "learning_rate": 2.4278670909878597e-06, + "loss": 0.3673, + "step": 6516 + }, + { + "epoch": 3.5233375382951886, + "grad_norm": 0.2907755970954895, + "learning_rate": 2.4262482696718765e-06, + "loss": 0.3633, + "step": 6517 + }, + { + "epoch": 3.5238781762479725, + "grad_norm": 0.30284127593040466, + "learning_rate": 2.424629815316748e-06, + "loss": 0.3858, + "step": 6518 + }, + { + "epoch": 3.524418814200757, + "grad_norm": 0.308379590511322, + "learning_rate": 2.4230117281532305e-06, + "loss": 0.3742, + "step": 6519 + }, + { + "epoch": 3.5249594521535412, + "grad_norm": 0.274372935295105, + "learning_rate": 2.4213940084120274e-06, + "loss": 0.3865, + "step": 6520 + }, + { + "epoch": 3.5255000901063256, + "grad_norm": 0.301439493894577, + "learning_rate": 2.4197766563237908e-06, + "loss": 0.3326, + "step": 6521 + }, + { + "epoch": 3.52604072805911, + "grad_norm": 0.28579795360565186, + "learning_rate": 2.418159672119124e-06, + "loss": 0.3484, + "step": 6522 + }, + { + "epoch": 3.526581366011894, + "grad_norm": 0.28113120794296265, + "learning_rate": 2.416543056028567e-06, + "loss": 0.3645, + "step": 6523 + }, + { + "epoch": 3.5271220039646782, + "grad_norm": 0.28649863600730896, + "learning_rate": 2.414926808282618e-06, + "loss": 0.3811, + "step": 6524 + }, + { + "epoch": 3.5276626419174626, + "grad_norm": 0.2908145785331726, + "learning_rate": 2.4133109291117156e-06, + "loss": 0.3853, + "step": 6525 + }, + { + "epoch": 3.528203279870247, + "grad_norm": 0.27370089292526245, + "learning_rate": 2.411695418746253e-06, + "loss": 0.3836, + "step": 6526 + }, + { + "epoch": 3.5287439178230313, + "grad_norm": 0.3009878695011139, + "learning_rate": 2.4100802774165657e-06, + "loss": 0.3527, + "step": 6527 + }, + { + "epoch": 3.5292845557758152, + "grad_norm": 0.28535473346710205, + "learning_rate": 2.4084655053529337e-06, + "loss": 0.3824, + "step": 6528 + }, + { + "epoch": 3.5298251937285996, + "grad_norm": 0.2865583300590515, + "learning_rate": 2.4068511027855935e-06, + "loss": 0.4155, + "step": 6529 + }, + { + "epoch": 3.530365831681384, + "grad_norm": 0.28438109159469604, + "learning_rate": 2.405237069944721e-06, + "loss": 0.3742, + "step": 6530 + }, + { + "epoch": 3.5309064696341683, + "grad_norm": 0.297911137342453, + "learning_rate": 2.403623407060441e-06, + "loss": 0.405, + "step": 6531 + }, + { + "epoch": 3.5314471075869527, + "grad_norm": 0.2770686447620392, + "learning_rate": 2.40201011436283e-06, + "loss": 0.3728, + "step": 6532 + }, + { + "epoch": 3.531987745539737, + "grad_norm": 0.28494513034820557, + "learning_rate": 2.400397192081904e-06, + "loss": 0.393, + "step": 6533 + }, + { + "epoch": 3.5325283834925214, + "grad_norm": 0.30899500846862793, + "learning_rate": 2.3987846404476374e-06, + "loss": 0.3759, + "step": 6534 + }, + { + "epoch": 3.5330690214453053, + "grad_norm": 0.2798628509044647, + "learning_rate": 2.397172459689936e-06, + "loss": 0.3861, + "step": 6535 + }, + { + "epoch": 3.5336096593980897, + "grad_norm": 0.2783905863761902, + "learning_rate": 2.3955606500386685e-06, + "loss": 0.3896, + "step": 6536 + }, + { + "epoch": 3.534150297350874, + "grad_norm": 0.28773874044418335, + "learning_rate": 2.3939492117236397e-06, + "loss": 0.3626, + "step": 6537 + }, + { + "epoch": 3.5346909353036584, + "grad_norm": 0.2815054953098297, + "learning_rate": 2.3923381449746086e-06, + "loss": 0.3915, + "step": 6538 + }, + { + "epoch": 3.5352315732564428, + "grad_norm": 0.28169700503349304, + "learning_rate": 2.3907274500212767e-06, + "loss": 0.358, + "step": 6539 + }, + { + "epoch": 3.5357722112092267, + "grad_norm": 0.2728629410266876, + "learning_rate": 2.3891171270932923e-06, + "loss": 0.3783, + "step": 6540 + }, + { + "epoch": 3.536312849162011, + "grad_norm": 0.30166539549827576, + "learning_rate": 2.387507176420256e-06, + "loss": 0.3772, + "step": 6541 + }, + { + "epoch": 3.5368534871147954, + "grad_norm": 0.28927499055862427, + "learning_rate": 2.38589759823171e-06, + "loss": 0.3843, + "step": 6542 + }, + { + "epoch": 3.5373941250675798, + "grad_norm": 0.2623683512210846, + "learning_rate": 2.3842883927571424e-06, + "loss": 0.3501, + "step": 6543 + }, + { + "epoch": 3.537934763020364, + "grad_norm": 0.3098645508289337, + "learning_rate": 2.3826795602259956e-06, + "loss": 0.381, + "step": 6544 + }, + { + "epoch": 3.538475400973148, + "grad_norm": 0.28410932421684265, + "learning_rate": 2.3810711008676495e-06, + "loss": 0.3844, + "step": 6545 + }, + { + "epoch": 3.539016038925933, + "grad_norm": 0.3092157542705536, + "learning_rate": 2.379463014911441e-06, + "loss": 0.3956, + "step": 6546 + }, + { + "epoch": 3.539556676878717, + "grad_norm": 0.2804025113582611, + "learning_rate": 2.3778553025866415e-06, + "loss": 0.3666, + "step": 6547 + }, + { + "epoch": 3.540097314831501, + "grad_norm": 0.29580822587013245, + "learning_rate": 2.3762479641224794e-06, + "loss": 0.3759, + "step": 6548 + }, + { + "epoch": 3.5406379527842855, + "grad_norm": 0.30240368843078613, + "learning_rate": 2.3746409997481248e-06, + "loss": 0.3954, + "step": 6549 + }, + { + "epoch": 3.54117859073707, + "grad_norm": 0.2760869860649109, + "learning_rate": 2.3730344096926974e-06, + "loss": 0.3554, + "step": 6550 + }, + { + "epoch": 3.5417192286898542, + "grad_norm": 0.2570158541202545, + "learning_rate": 2.3714281941852608e-06, + "loss": 0.3456, + "step": 6551 + }, + { + "epoch": 3.542259866642638, + "grad_norm": 0.29335662722587585, + "learning_rate": 2.3698223534548248e-06, + "loss": 0.3965, + "step": 6552 + }, + { + "epoch": 3.5428005045954225, + "grad_norm": 0.3044717609882355, + "learning_rate": 2.3682168877303508e-06, + "loss": 0.3732, + "step": 6553 + }, + { + "epoch": 3.543341142548207, + "grad_norm": 0.29319342970848083, + "learning_rate": 2.366611797240741e-06, + "loss": 0.4018, + "step": 6554 + }, + { + "epoch": 3.5438817805009912, + "grad_norm": 0.3057367205619812, + "learning_rate": 2.3650070822148447e-06, + "loss": 0.3595, + "step": 6555 + }, + { + "epoch": 3.5444224184537756, + "grad_norm": 0.27213531732559204, + "learning_rate": 2.3634027428814632e-06, + "loss": 0.3722, + "step": 6556 + }, + { + "epoch": 3.5449630564065595, + "grad_norm": 0.2879851162433624, + "learning_rate": 2.3617987794693358e-06, + "loss": 0.3957, + "step": 6557 + }, + { + "epoch": 3.545503694359344, + "grad_norm": 0.3101233243942261, + "learning_rate": 2.360195192207159e-06, + "loss": 0.3913, + "step": 6558 + }, + { + "epoch": 3.5460443323121282, + "grad_norm": 0.27164915204048157, + "learning_rate": 2.358591981323562e-06, + "loss": 0.3782, + "step": 6559 + }, + { + "epoch": 3.5465849702649126, + "grad_norm": 0.2786481976509094, + "learning_rate": 2.3569891470471308e-06, + "loss": 0.361, + "step": 6560 + }, + { + "epoch": 3.547125608217697, + "grad_norm": 0.2545458674430847, + "learning_rate": 2.355386689606397e-06, + "loss": 0.3839, + "step": 6561 + }, + { + "epoch": 3.5476662461704813, + "grad_norm": 0.29571396112442017, + "learning_rate": 2.3537846092298337e-06, + "loss": 0.3896, + "step": 6562 + }, + { + "epoch": 3.5482068841232657, + "grad_norm": 0.31128859519958496, + "learning_rate": 2.352182906145863e-06, + "loss": 0.3525, + "step": 6563 + }, + { + "epoch": 3.5487475220760496, + "grad_norm": 0.31172728538513184, + "learning_rate": 2.3505815805828515e-06, + "loss": 0.4075, + "step": 6564 + }, + { + "epoch": 3.549288160028834, + "grad_norm": 0.2759602665901184, + "learning_rate": 2.3489806327691156e-06, + "loss": 0.378, + "step": 6565 + }, + { + "epoch": 3.5498287979816183, + "grad_norm": 0.2782008647918701, + "learning_rate": 2.3473800629329145e-06, + "loss": 0.3883, + "step": 6566 + }, + { + "epoch": 3.5503694359344027, + "grad_norm": 0.3004755675792694, + "learning_rate": 2.345779871302453e-06, + "loss": 0.3815, + "step": 6567 + }, + { + "epoch": 3.550910073887187, + "grad_norm": 0.25604912638664246, + "learning_rate": 2.344180058105887e-06, + "loss": 0.3995, + "step": 6568 + }, + { + "epoch": 3.551450711839971, + "grad_norm": 0.28551968932151794, + "learning_rate": 2.342580623571311e-06, + "loss": 0.3897, + "step": 6569 + }, + { + "epoch": 3.5519913497927553, + "grad_norm": 0.2856443226337433, + "learning_rate": 2.3409815679267733e-06, + "loss": 0.3918, + "step": 6570 + }, + { + "epoch": 3.5525319877455397, + "grad_norm": 0.2901134788990021, + "learning_rate": 2.3393828914002623e-06, + "loss": 0.3636, + "step": 6571 + }, + { + "epoch": 3.553072625698324, + "grad_norm": 0.28899016976356506, + "learning_rate": 2.3377845942197133e-06, + "loss": 0.3849, + "step": 6572 + }, + { + "epoch": 3.5536132636511084, + "grad_norm": 0.2661123275756836, + "learning_rate": 2.3361866766130114e-06, + "loss": 0.3739, + "step": 6573 + }, + { + "epoch": 3.5541539016038923, + "grad_norm": 0.30045759677886963, + "learning_rate": 2.3345891388079837e-06, + "loss": 0.3683, + "step": 6574 + }, + { + "epoch": 3.554694539556677, + "grad_norm": 0.2721780240535736, + "learning_rate": 2.3329919810324036e-06, + "loss": 0.3842, + "step": 6575 + }, + { + "epoch": 3.555235177509461, + "grad_norm": 0.27868637442588806, + "learning_rate": 2.3313952035139896e-06, + "loss": 0.3904, + "step": 6576 + }, + { + "epoch": 3.5557758154622454, + "grad_norm": 0.2812824249267578, + "learning_rate": 2.3297988064804106e-06, + "loss": 0.362, + "step": 6577 + }, + { + "epoch": 3.5563164534150298, + "grad_norm": 0.2818116545677185, + "learning_rate": 2.3282027901592762e-06, + "loss": 0.3835, + "step": 6578 + }, + { + "epoch": 3.556857091367814, + "grad_norm": 0.28291478753089905, + "learning_rate": 2.3266071547781427e-06, + "loss": 0.385, + "step": 6579 + }, + { + "epoch": 3.5573977293205985, + "grad_norm": 0.27867481112480164, + "learning_rate": 2.325011900564515e-06, + "loss": 0.3833, + "step": 6580 + }, + { + "epoch": 3.5579383672733824, + "grad_norm": 0.28014400601387024, + "learning_rate": 2.323417027745839e-06, + "loss": 0.384, + "step": 6581 + }, + { + "epoch": 3.558479005226167, + "grad_norm": 0.27470117807388306, + "learning_rate": 2.3218225365495117e-06, + "loss": 0.3857, + "step": 6582 + }, + { + "epoch": 3.559019643178951, + "grad_norm": 0.28751593828201294, + "learning_rate": 2.3202284272028717e-06, + "loss": 0.3762, + "step": 6583 + }, + { + "epoch": 3.5595602811317355, + "grad_norm": 0.28930631279945374, + "learning_rate": 2.3186346999332015e-06, + "loss": 0.4086, + "step": 6584 + }, + { + "epoch": 3.56010091908452, + "grad_norm": 0.29160645604133606, + "learning_rate": 2.3170413549677367e-06, + "loss": 0.3872, + "step": 6585 + }, + { + "epoch": 3.560641557037304, + "grad_norm": 0.2774616479873657, + "learning_rate": 2.3154483925336486e-06, + "loss": 0.3702, + "step": 6586 + }, + { + "epoch": 3.561182194990088, + "grad_norm": 0.2756505310535431, + "learning_rate": 2.3138558128580653e-06, + "loss": 0.4019, + "step": 6587 + }, + { + "epoch": 3.5617228329428725, + "grad_norm": 0.2749829888343811, + "learning_rate": 2.3122636161680454e-06, + "loss": 0.3687, + "step": 6588 + }, + { + "epoch": 3.562263470895657, + "grad_norm": 0.26820358633995056, + "learning_rate": 2.3106718026906073e-06, + "loss": 0.3785, + "step": 6589 + }, + { + "epoch": 3.5628041088484412, + "grad_norm": 0.31059184670448303, + "learning_rate": 2.3090803726527083e-06, + "loss": 0.364, + "step": 6590 + }, + { + "epoch": 3.5633447468012256, + "grad_norm": 0.3090270757675171, + "learning_rate": 2.3074893262812513e-06, + "loss": 0.3679, + "step": 6591 + }, + { + "epoch": 3.56388538475401, + "grad_norm": 0.2815909683704376, + "learning_rate": 2.305898663803084e-06, + "loss": 0.4074, + "step": 6592 + }, + { + "epoch": 3.564426022706794, + "grad_norm": 0.3041037321090698, + "learning_rate": 2.304308385444999e-06, + "loss": 0.3909, + "step": 6593 + }, + { + "epoch": 3.5649666606595782, + "grad_norm": 0.2612096071243286, + "learning_rate": 2.3027184914337387e-06, + "loss": 0.3678, + "step": 6594 + }, + { + "epoch": 3.5655072986123626, + "grad_norm": 0.27386558055877686, + "learning_rate": 2.301128981995985e-06, + "loss": 0.4038, + "step": 6595 + }, + { + "epoch": 3.566047936565147, + "grad_norm": 0.2815191447734833, + "learning_rate": 2.299539857358366e-06, + "loss": 0.3886, + "step": 6596 + }, + { + "epoch": 3.5665885745179313, + "grad_norm": 0.2967953681945801, + "learning_rate": 2.2979511177474594e-06, + "loss": 0.3516, + "step": 6597 + }, + { + "epoch": 3.5671292124707152, + "grad_norm": 0.2771591544151306, + "learning_rate": 2.2963627633897824e-06, + "loss": 0.3541, + "step": 6598 + }, + { + "epoch": 3.5676698504234996, + "grad_norm": 0.29038968682289124, + "learning_rate": 2.2947747945118013e-06, + "loss": 0.3862, + "step": 6599 + }, + { + "epoch": 3.568210488376284, + "grad_norm": 0.7554365992546082, + "learning_rate": 2.293187211339926e-06, + "loss": 0.3744, + "step": 6600 + }, + { + "epoch": 3.5687511263290683, + "grad_norm": 0.27392783761024475, + "learning_rate": 2.2916000141005077e-06, + "loss": 0.373, + "step": 6601 + }, + { + "epoch": 3.5692917642818527, + "grad_norm": 0.2828829884529114, + "learning_rate": 2.2900132030198513e-06, + "loss": 0.3884, + "step": 6602 + }, + { + "epoch": 3.5698324022346366, + "grad_norm": 0.29404449462890625, + "learning_rate": 2.288426778324199e-06, + "loss": 0.3958, + "step": 6603 + }, + { + "epoch": 3.5703730401874214, + "grad_norm": 0.3024761378765106, + "learning_rate": 2.28684074023974e-06, + "loss": 0.3958, + "step": 6604 + }, + { + "epoch": 3.5709136781402053, + "grad_norm": 0.2863123118877411, + "learning_rate": 2.2852550889926067e-06, + "loss": 0.3877, + "step": 6605 + }, + { + "epoch": 3.5714543160929897, + "grad_norm": 0.2825278639793396, + "learning_rate": 2.2836698248088814e-06, + "loss": 0.3873, + "step": 6606 + }, + { + "epoch": 3.571994954045774, + "grad_norm": 0.28320351243019104, + "learning_rate": 2.282084947914591e-06, + "loss": 0.3653, + "step": 6607 + }, + { + "epoch": 3.5725355919985584, + "grad_norm": 0.3045535385608673, + "learning_rate": 2.2805004585356964e-06, + "loss": 0.3852, + "step": 6608 + }, + { + "epoch": 3.5730762299513428, + "grad_norm": 0.30215948820114136, + "learning_rate": 2.2789163568981183e-06, + "loss": 0.3829, + "step": 6609 + }, + { + "epoch": 3.5736168679041267, + "grad_norm": 0.2773192822933197, + "learning_rate": 2.2773326432277097e-06, + "loss": 0.3983, + "step": 6610 + }, + { + "epoch": 3.574157505856911, + "grad_norm": 0.28059136867523193, + "learning_rate": 2.2757493177502795e-06, + "loss": 0.3567, + "step": 6611 + }, + { + "epoch": 3.5746981438096954, + "grad_norm": 0.2972952723503113, + "learning_rate": 2.274166380691571e-06, + "loss": 0.3764, + "step": 6612 + }, + { + "epoch": 3.57523878176248, + "grad_norm": 0.3074398636817932, + "learning_rate": 2.2725838322772765e-06, + "loss": 0.37, + "step": 6613 + }, + { + "epoch": 3.575779419715264, + "grad_norm": 0.2970588505268097, + "learning_rate": 2.271001672733036e-06, + "loss": 0.3785, + "step": 6614 + }, + { + "epoch": 3.576320057668048, + "grad_norm": 0.2987179756164551, + "learning_rate": 2.2694199022844284e-06, + "loss": 0.4056, + "step": 6615 + }, + { + "epoch": 3.5768606956208324, + "grad_norm": 0.28840872645378113, + "learning_rate": 2.26783852115698e-06, + "loss": 0.385, + "step": 6616 + }, + { + "epoch": 3.577401333573617, + "grad_norm": 0.3053003251552582, + "learning_rate": 2.266257529576161e-06, + "loss": 0.3917, + "step": 6617 + }, + { + "epoch": 3.577941971526401, + "grad_norm": 0.2751826047897339, + "learning_rate": 2.264676927767386e-06, + "loss": 0.3795, + "step": 6618 + }, + { + "epoch": 3.5784826094791855, + "grad_norm": 0.28055238723754883, + "learning_rate": 2.263096715956019e-06, + "loss": 0.3611, + "step": 6619 + }, + { + "epoch": 3.57902324743197, + "grad_norm": 0.3108740448951721, + "learning_rate": 2.261516894367356e-06, + "loss": 0.3618, + "step": 6620 + }, + { + "epoch": 3.5795638853847542, + "grad_norm": 0.29139307141304016, + "learning_rate": 2.2599374632266514e-06, + "loss": 0.3754, + "step": 6621 + }, + { + "epoch": 3.580104523337538, + "grad_norm": 0.2887786626815796, + "learning_rate": 2.2583584227590927e-06, + "loss": 0.3881, + "step": 6622 + }, + { + "epoch": 3.5806451612903225, + "grad_norm": 0.2744469940662384, + "learning_rate": 2.2567797731898217e-06, + "loss": 0.3749, + "step": 6623 + }, + { + "epoch": 3.581185799243107, + "grad_norm": 0.27826839685440063, + "learning_rate": 2.2552015147439166e-06, + "loss": 0.364, + "step": 6624 + }, + { + "epoch": 3.5817264371958912, + "grad_norm": 0.2944410443305969, + "learning_rate": 2.2536236476464007e-06, + "loss": 0.3793, + "step": 6625 + }, + { + "epoch": 3.5822670751486756, + "grad_norm": 0.2951698899269104, + "learning_rate": 2.252046172122248e-06, + "loss": 0.3636, + "step": 6626 + }, + { + "epoch": 3.5828077131014595, + "grad_norm": 0.28781840205192566, + "learning_rate": 2.250469088396369e-06, + "loss": 0.3786, + "step": 6627 + }, + { + "epoch": 3.583348351054244, + "grad_norm": 0.29098764061927795, + "learning_rate": 2.248892396693621e-06, + "loss": 0.368, + "step": 6628 + }, + { + "epoch": 3.5838889890070282, + "grad_norm": 0.2595488429069519, + "learning_rate": 2.247316097238809e-06, + "loss": 0.3861, + "step": 6629 + }, + { + "epoch": 3.5844296269598126, + "grad_norm": 0.2806659936904907, + "learning_rate": 2.2457401902566745e-06, + "loss": 0.4064, + "step": 6630 + }, + { + "epoch": 3.584970264912597, + "grad_norm": 0.2566904127597809, + "learning_rate": 2.244164675971914e-06, + "loss": 0.3639, + "step": 6631 + }, + { + "epoch": 3.585510902865381, + "grad_norm": 0.2670801281929016, + "learning_rate": 2.2425895546091534e-06, + "loss": 0.3639, + "step": 6632 + }, + { + "epoch": 3.5860515408181657, + "grad_norm": 0.287205308675766, + "learning_rate": 2.2410148263929767e-06, + "loss": 0.3775, + "step": 6633 + }, + { + "epoch": 3.5865921787709496, + "grad_norm": 0.32024917006492615, + "learning_rate": 2.2394404915479017e-06, + "loss": 0.3577, + "step": 6634 + }, + { + "epoch": 3.587132816723734, + "grad_norm": 0.2705610394477844, + "learning_rate": 2.2378665502983976e-06, + "loss": 0.3933, + "step": 6635 + }, + { + "epoch": 3.5876734546765183, + "grad_norm": 0.2769106328487396, + "learning_rate": 2.2362930028688736e-06, + "loss": 0.3678, + "step": 6636 + }, + { + "epoch": 3.5882140926293027, + "grad_norm": 0.27824127674102783, + "learning_rate": 2.23471984948368e-06, + "loss": 0.3824, + "step": 6637 + }, + { + "epoch": 3.588754730582087, + "grad_norm": 0.28756049275398254, + "learning_rate": 2.2331470903671183e-06, + "loss": 0.3901, + "step": 6638 + }, + { + "epoch": 3.589295368534871, + "grad_norm": 0.2813909351825714, + "learning_rate": 2.2315747257434277e-06, + "loss": 0.3771, + "step": 6639 + }, + { + "epoch": 3.5898360064876553, + "grad_norm": 0.26315838098526, + "learning_rate": 2.2300027558367917e-06, + "loss": 0.4028, + "step": 6640 + }, + { + "epoch": 3.5903766444404397, + "grad_norm": 0.29794737696647644, + "learning_rate": 2.228431180871342e-06, + "loss": 0.3755, + "step": 6641 + }, + { + "epoch": 3.590917282393224, + "grad_norm": 0.27428027987480164, + "learning_rate": 2.2268600010711477e-06, + "loss": 0.3614, + "step": 6642 + }, + { + "epoch": 3.5914579203460084, + "grad_norm": 0.30103349685668945, + "learning_rate": 2.2252892166602304e-06, + "loss": 0.3662, + "step": 6643 + }, + { + "epoch": 3.5919985582987923, + "grad_norm": 0.32324516773223877, + "learning_rate": 2.2237188278625415e-06, + "loss": 0.3648, + "step": 6644 + }, + { + "epoch": 3.592539196251577, + "grad_norm": 0.3174627125263214, + "learning_rate": 2.2221488349019903e-06, + "loss": 0.3693, + "step": 6645 + }, + { + "epoch": 3.593079834204361, + "grad_norm": 0.26844707131385803, + "learning_rate": 2.22057923800242e-06, + "loss": 0.3683, + "step": 6646 + }, + { + "epoch": 3.5936204721571454, + "grad_norm": 0.30172064900398254, + "learning_rate": 2.2190100373876228e-06, + "loss": 0.3975, + "step": 6647 + }, + { + "epoch": 3.59416111010993, + "grad_norm": 0.2951805293560028, + "learning_rate": 2.2174412332813353e-06, + "loss": 0.39, + "step": 6648 + }, + { + "epoch": 3.594701748062714, + "grad_norm": 0.28690823912620544, + "learning_rate": 2.215872825907228e-06, + "loss": 0.3807, + "step": 6649 + }, + { + "epoch": 3.5952423860154985, + "grad_norm": 0.2679624855518341, + "learning_rate": 2.2143048154889272e-06, + "loss": 0.3619, + "step": 6650 + }, + { + "epoch": 3.5957830239682824, + "grad_norm": 0.3021341562271118, + "learning_rate": 2.212737202249994e-06, + "loss": 0.3732, + "step": 6651 + }, + { + "epoch": 3.596323661921067, + "grad_norm": 0.2939276695251465, + "learning_rate": 2.211169986413938e-06, + "loss": 0.3675, + "step": 6652 + }, + { + "epoch": 3.596864299873851, + "grad_norm": 0.28871920704841614, + "learning_rate": 2.209603168204209e-06, + "loss": 0.3456, + "step": 6653 + }, + { + "epoch": 3.5974049378266355, + "grad_norm": 0.2954472601413727, + "learning_rate": 2.208036747844199e-06, + "loss": 0.3724, + "step": 6654 + }, + { + "epoch": 3.59794557577942, + "grad_norm": 0.29399073123931885, + "learning_rate": 2.2064707255572494e-06, + "loss": 0.3684, + "step": 6655 + }, + { + "epoch": 3.598486213732204, + "grad_norm": 0.3921755254268646, + "learning_rate": 2.2049051015666384e-06, + "loss": 0.3695, + "step": 6656 + }, + { + "epoch": 3.599026851684988, + "grad_norm": 0.28325673937797546, + "learning_rate": 2.203339876095588e-06, + "loss": 0.3729, + "step": 6657 + }, + { + "epoch": 3.5995674896377725, + "grad_norm": 0.2699066996574402, + "learning_rate": 2.2017750493672704e-06, + "loss": 0.351, + "step": 6658 + }, + { + "epoch": 3.600108127590557, + "grad_norm": 0.2776723802089691, + "learning_rate": 2.2002106216047904e-06, + "loss": 0.3922, + "step": 6659 + }, + { + "epoch": 3.6006487655433412, + "grad_norm": 0.2835797369480133, + "learning_rate": 2.1986465930312067e-06, + "loss": 0.3712, + "step": 6660 + }, + { + "epoch": 3.601189403496125, + "grad_norm": 0.2748779356479645, + "learning_rate": 2.1970829638695096e-06, + "loss": 0.3867, + "step": 6661 + }, + { + "epoch": 3.60173004144891, + "grad_norm": 0.27030235528945923, + "learning_rate": 2.1955197343426432e-06, + "loss": 0.3745, + "step": 6662 + }, + { + "epoch": 3.602270679401694, + "grad_norm": 0.27696895599365234, + "learning_rate": 2.1939569046734865e-06, + "loss": 0.3874, + "step": 6663 + }, + { + "epoch": 3.6028113173544782, + "grad_norm": 0.270863801240921, + "learning_rate": 2.192394475084868e-06, + "loss": 0.3653, + "step": 6664 + }, + { + "epoch": 3.6033519553072626, + "grad_norm": 0.26304101943969727, + "learning_rate": 2.1908324457995556e-06, + "loss": 0.3816, + "step": 6665 + }, + { + "epoch": 3.603892593260047, + "grad_norm": 0.27188053727149963, + "learning_rate": 2.1892708170402572e-06, + "loss": 0.3514, + "step": 6666 + }, + { + "epoch": 3.6044332312128313, + "grad_norm": 0.2782359719276428, + "learning_rate": 2.187709589029631e-06, + "loss": 0.3509, + "step": 6667 + }, + { + "epoch": 3.6049738691656152, + "grad_norm": 0.2933637499809265, + "learning_rate": 2.1861487619902733e-06, + "loss": 0.3836, + "step": 6668 + }, + { + "epoch": 3.6055145071183996, + "grad_norm": 0.2583782374858856, + "learning_rate": 2.1845883361447218e-06, + "loss": 0.3881, + "step": 6669 + }, + { + "epoch": 3.606055145071184, + "grad_norm": 0.3089904189109802, + "learning_rate": 2.1830283117154616e-06, + "loss": 0.366, + "step": 6670 + }, + { + "epoch": 3.6065957830239683, + "grad_norm": 0.28782257437705994, + "learning_rate": 2.181468688924916e-06, + "loss": 0.3873, + "step": 6671 + }, + { + "epoch": 3.6071364209767527, + "grad_norm": 0.27572405338287354, + "learning_rate": 2.1799094679954575e-06, + "loss": 0.3646, + "step": 6672 + }, + { + "epoch": 3.6076770589295366, + "grad_norm": 0.2745559811592102, + "learning_rate": 2.1783506491493906e-06, + "loss": 0.362, + "step": 6673 + }, + { + "epoch": 3.6082176968823214, + "grad_norm": 0.27891382575035095, + "learning_rate": 2.1767922326089725e-06, + "loss": 0.3773, + "step": 6674 + }, + { + "epoch": 3.6087583348351053, + "grad_norm": 0.2770380973815918, + "learning_rate": 2.1752342185964003e-06, + "loss": 0.3846, + "step": 6675 + }, + { + "epoch": 3.6092989727878897, + "grad_norm": 0.30474746227264404, + "learning_rate": 2.173676607333812e-06, + "loss": 0.3796, + "step": 6676 + }, + { + "epoch": 3.609839610740674, + "grad_norm": 0.27506735920906067, + "learning_rate": 2.172119399043288e-06, + "loss": 0.383, + "step": 6677 + }, + { + "epoch": 3.6103802486934584, + "grad_norm": 0.2605507969856262, + "learning_rate": 2.1705625939468517e-06, + "loss": 0.3585, + "step": 6678 + }, + { + "epoch": 3.6109208866462428, + "grad_norm": 0.27277642488479614, + "learning_rate": 2.1690061922664722e-06, + "loss": 0.3659, + "step": 6679 + }, + { + "epoch": 3.6114615245990267, + "grad_norm": 0.26635366678237915, + "learning_rate": 2.1674501942240567e-06, + "loss": 0.3623, + "step": 6680 + }, + { + "epoch": 3.612002162551811, + "grad_norm": 0.27611610293388367, + "learning_rate": 2.1658946000414553e-06, + "loss": 0.3771, + "step": 6681 + }, + { + "epoch": 3.6125428005045954, + "grad_norm": 0.2707509994506836, + "learning_rate": 2.1643394099404652e-06, + "loss": 0.3705, + "step": 6682 + }, + { + "epoch": 3.61308343845738, + "grad_norm": 0.26408860087394714, + "learning_rate": 2.1627846241428186e-06, + "loss": 0.3807, + "step": 6683 + }, + { + "epoch": 3.613624076410164, + "grad_norm": 0.2970049977302551, + "learning_rate": 2.1612302428701993e-06, + "loss": 0.3864, + "step": 6684 + }, + { + "epoch": 3.614164714362948, + "grad_norm": 0.2735101282596588, + "learning_rate": 2.159676266344222e-06, + "loss": 0.3632, + "step": 6685 + }, + { + "epoch": 3.6147053523157324, + "grad_norm": 0.2762027978897095, + "learning_rate": 2.1581226947864524e-06, + "loss": 0.3636, + "step": 6686 + }, + { + "epoch": 3.615245990268517, + "grad_norm": 0.2806672155857086, + "learning_rate": 2.156569528418398e-06, + "loss": 0.3651, + "step": 6687 + }, + { + "epoch": 3.615786628221301, + "grad_norm": 0.28479084372520447, + "learning_rate": 2.155016767461505e-06, + "loss": 0.3599, + "step": 6688 + }, + { + "epoch": 3.6163272661740855, + "grad_norm": 0.25960230827331543, + "learning_rate": 2.1534644121371633e-06, + "loss": 0.3572, + "step": 6689 + }, + { + "epoch": 3.61686790412687, + "grad_norm": 0.3097879886627197, + "learning_rate": 2.151912462666703e-06, + "loss": 0.3768, + "step": 6690 + }, + { + "epoch": 3.6174085420796542, + "grad_norm": 0.26245662569999695, + "learning_rate": 2.1503609192714008e-06, + "loss": 0.3711, + "step": 6691 + }, + { + "epoch": 3.617949180032438, + "grad_norm": 0.27237895131111145, + "learning_rate": 2.148809782172472e-06, + "loss": 0.3926, + "step": 6692 + }, + { + "epoch": 3.6184898179852225, + "grad_norm": 0.2845414876937866, + "learning_rate": 2.147259051591074e-06, + "loss": 0.3593, + "step": 6693 + }, + { + "epoch": 3.619030455938007, + "grad_norm": 0.28626132011413574, + "learning_rate": 2.145708727748309e-06, + "loss": 0.3747, + "step": 6694 + }, + { + "epoch": 3.6195710938907912, + "grad_norm": 0.29464709758758545, + "learning_rate": 2.144158810865217e-06, + "loss": 0.3709, + "step": 6695 + }, + { + "epoch": 3.6201117318435756, + "grad_norm": 0.28694719076156616, + "learning_rate": 2.142609301162786e-06, + "loss": 0.3935, + "step": 6696 + }, + { + "epoch": 3.6206523697963595, + "grad_norm": 0.30178123712539673, + "learning_rate": 2.1410601988619394e-06, + "loss": 0.3697, + "step": 6697 + }, + { + "epoch": 3.621193007749144, + "grad_norm": 0.27384212613105774, + "learning_rate": 2.1395115041835447e-06, + "loss": 0.3738, + "step": 6698 + }, + { + "epoch": 3.6217336457019282, + "grad_norm": 0.2784256637096405, + "learning_rate": 2.137963217348415e-06, + "loss": 0.3738, + "step": 6699 + }, + { + "epoch": 3.6222742836547126, + "grad_norm": 0.2793726921081543, + "learning_rate": 2.1364153385773007e-06, + "loss": 0.3835, + "step": 6700 + }, + { + "epoch": 3.622814921607497, + "grad_norm": 0.2873479425907135, + "learning_rate": 2.134867868090895e-06, + "loss": 0.3775, + "step": 6701 + }, + { + "epoch": 3.623355559560281, + "grad_norm": 0.266814649105072, + "learning_rate": 2.1333208061098325e-06, + "loss": 0.3757, + "step": 6702 + }, + { + "epoch": 3.6238961975130657, + "grad_norm": 0.27134865522384644, + "learning_rate": 2.1317741528546913e-06, + "loss": 0.3746, + "step": 6703 + }, + { + "epoch": 3.6244368354658496, + "grad_norm": 0.2763728201389313, + "learning_rate": 2.1302279085459953e-06, + "loss": 0.3979, + "step": 6704 + }, + { + "epoch": 3.624977473418634, + "grad_norm": 0.2741091847419739, + "learning_rate": 2.128682073404197e-06, + "loss": 0.3766, + "step": 6705 + }, + { + "epoch": 3.6255181113714183, + "grad_norm": 0.2954896092414856, + "learning_rate": 2.1271366476497048e-06, + "loss": 0.3702, + "step": 6706 + }, + { + "epoch": 3.6260587493242027, + "grad_norm": 0.28821152448654175, + "learning_rate": 2.125591631502858e-06, + "loss": 0.3909, + "step": 6707 + }, + { + "epoch": 3.626599387276987, + "grad_norm": 0.29107242822647095, + "learning_rate": 2.124047025183947e-06, + "loss": 0.3732, + "step": 6708 + }, + { + "epoch": 3.627140025229771, + "grad_norm": 0.2932164669036865, + "learning_rate": 2.122502828913196e-06, + "loss": 0.4049, + "step": 6709 + }, + { + "epoch": 3.6276806631825553, + "grad_norm": 0.27281996607780457, + "learning_rate": 2.1209590429107734e-06, + "loss": 0.3443, + "step": 6710 + }, + { + "epoch": 3.6282213011353397, + "grad_norm": 0.2741299271583557, + "learning_rate": 2.119415667396792e-06, + "loss": 0.3499, + "step": 6711 + }, + { + "epoch": 3.628761939088124, + "grad_norm": 0.27361026406288147, + "learning_rate": 2.1178727025913005e-06, + "loss": 0.3838, + "step": 6712 + }, + { + "epoch": 3.6293025770409084, + "grad_norm": 0.26464131474494934, + "learning_rate": 2.1163301487142945e-06, + "loss": 0.3843, + "step": 6713 + }, + { + "epoch": 3.6298432149936923, + "grad_norm": 0.27855372428894043, + "learning_rate": 2.114788005985708e-06, + "loss": 0.3646, + "step": 6714 + }, + { + "epoch": 3.6303838529464767, + "grad_norm": 0.2654821574687958, + "learning_rate": 2.1132462746254147e-06, + "loss": 0.3646, + "step": 6715 + }, + { + "epoch": 3.630924490899261, + "grad_norm": 0.2883964776992798, + "learning_rate": 2.111704954853235e-06, + "loss": 0.3755, + "step": 6716 + }, + { + "epoch": 3.6314651288520454, + "grad_norm": 0.28079986572265625, + "learning_rate": 2.1101640468889255e-06, + "loss": 0.3661, + "step": 6717 + }, + { + "epoch": 3.63200576680483, + "grad_norm": 0.26388463377952576, + "learning_rate": 2.1086235509521875e-06, + "loss": 0.3709, + "step": 6718 + }, + { + "epoch": 3.632546404757614, + "grad_norm": 0.2793663442134857, + "learning_rate": 2.107083467262659e-06, + "loss": 0.3662, + "step": 6719 + }, + { + "epoch": 3.6330870427103985, + "grad_norm": 0.27099892497062683, + "learning_rate": 2.1055437960399266e-06, + "loss": 0.3662, + "step": 6720 + }, + { + "epoch": 3.6336276806631824, + "grad_norm": 0.26378142833709717, + "learning_rate": 2.104004537503512e-06, + "loss": 0.3746, + "step": 6721 + }, + { + "epoch": 3.634168318615967, + "grad_norm": 0.27467218041419983, + "learning_rate": 2.102465691872877e-06, + "loss": 0.3714, + "step": 6722 + }, + { + "epoch": 3.634708956568751, + "grad_norm": 0.2755574882030487, + "learning_rate": 2.1009272593674323e-06, + "loss": 0.3444, + "step": 6723 + }, + { + "epoch": 3.6352495945215355, + "grad_norm": 0.2726738750934601, + "learning_rate": 2.0993892402065207e-06, + "loss": 0.3741, + "step": 6724 + }, + { + "epoch": 3.63579023247432, + "grad_norm": 0.25456562638282776, + "learning_rate": 2.0978516346094342e-06, + "loss": 0.3673, + "step": 6725 + }, + { + "epoch": 3.636330870427104, + "grad_norm": 0.27345624566078186, + "learning_rate": 2.0963144427953998e-06, + "loss": 0.355, + "step": 6726 + }, + { + "epoch": 3.636871508379888, + "grad_norm": 0.2940003275871277, + "learning_rate": 2.0947776649835854e-06, + "loss": 0.3673, + "step": 6727 + }, + { + "epoch": 3.6374121463326725, + "grad_norm": 0.26671913266181946, + "learning_rate": 2.093241301393106e-06, + "loss": 0.3814, + "step": 6728 + }, + { + "epoch": 3.637952784285457, + "grad_norm": 0.28704431653022766, + "learning_rate": 2.0917053522430114e-06, + "loss": 0.3849, + "step": 6729 + }, + { + "epoch": 3.6384934222382412, + "grad_norm": 0.28842297196388245, + "learning_rate": 2.0901698177522944e-06, + "loss": 0.3746, + "step": 6730 + }, + { + "epoch": 3.639034060191025, + "grad_norm": 0.28242138028144836, + "learning_rate": 2.0886346981398876e-06, + "loss": 0.3792, + "step": 6731 + }, + { + "epoch": 3.63957469814381, + "grad_norm": 0.31240415573120117, + "learning_rate": 2.0870999936246662e-06, + "loss": 0.3831, + "step": 6732 + }, + { + "epoch": 3.640115336096594, + "grad_norm": 0.279478520154953, + "learning_rate": 2.0855657044254503e-06, + "loss": 0.3753, + "step": 6733 + }, + { + "epoch": 3.6406559740493782, + "grad_norm": 0.2873874306678772, + "learning_rate": 2.0840318307609887e-06, + "loss": 0.3654, + "step": 6734 + }, + { + "epoch": 3.6411966120021626, + "grad_norm": 0.28576597571372986, + "learning_rate": 2.082498372849983e-06, + "loss": 0.4005, + "step": 6735 + }, + { + "epoch": 3.641737249954947, + "grad_norm": 0.2662777304649353, + "learning_rate": 2.0809653309110685e-06, + "loss": 0.3665, + "step": 6736 + }, + { + "epoch": 3.6422778879077313, + "grad_norm": 0.27650347352027893, + "learning_rate": 2.0794327051628255e-06, + "loss": 0.363, + "step": 6737 + }, + { + "epoch": 3.6428185258605152, + "grad_norm": 0.29481005668640137, + "learning_rate": 2.0779004958237724e-06, + "loss": 0.3853, + "step": 6738 + }, + { + "epoch": 3.6433591638132996, + "grad_norm": 0.26555564999580383, + "learning_rate": 2.0763687031123668e-06, + "loss": 0.3607, + "step": 6739 + }, + { + "epoch": 3.643899801766084, + "grad_norm": 0.2699344754219055, + "learning_rate": 2.074837327247012e-06, + "loss": 0.356, + "step": 6740 + }, + { + "epoch": 3.6444404397188683, + "grad_norm": 0.29361844062805176, + "learning_rate": 2.073306368446048e-06, + "loss": 0.3841, + "step": 6741 + }, + { + "epoch": 3.6449810776716527, + "grad_norm": 0.2983313202857971, + "learning_rate": 2.071775826927754e-06, + "loss": 0.3809, + "step": 6742 + }, + { + "epoch": 3.6455217156244366, + "grad_norm": 0.2917824685573578, + "learning_rate": 2.0702457029103547e-06, + "loss": 0.376, + "step": 6743 + }, + { + "epoch": 3.646062353577221, + "grad_norm": 0.27565157413482666, + "learning_rate": 2.068715996612009e-06, + "loss": 0.3784, + "step": 6744 + }, + { + "epoch": 3.6466029915300053, + "grad_norm": 0.2832021117210388, + "learning_rate": 2.067186708250826e-06, + "loss": 0.3865, + "step": 6745 + }, + { + "epoch": 3.6471436294827897, + "grad_norm": 0.2892693281173706, + "learning_rate": 2.0656578380448404e-06, + "loss": 0.375, + "step": 6746 + }, + { + "epoch": 3.647684267435574, + "grad_norm": 0.2889207899570465, + "learning_rate": 2.064129386212042e-06, + "loss": 0.3685, + "step": 6747 + }, + { + "epoch": 3.6482249053883584, + "grad_norm": 0.2712843418121338, + "learning_rate": 2.062601352970351e-06, + "loss": 0.3845, + "step": 6748 + }, + { + "epoch": 3.648765543341143, + "grad_norm": 0.30522647500038147, + "learning_rate": 2.061073738537635e-06, + "loss": 0.3758, + "step": 6749 + }, + { + "epoch": 3.6493061812939267, + "grad_norm": 0.2705490291118622, + "learning_rate": 2.059546543131696e-06, + "loss": 0.3713, + "step": 6750 + }, + { + "epoch": 3.649846819246711, + "grad_norm": 0.2821035385131836, + "learning_rate": 2.058019766970279e-06, + "loss": 0.3815, + "step": 6751 + }, + { + "epoch": 3.6503874571994954, + "grad_norm": 0.26890015602111816, + "learning_rate": 2.0564934102710706e-06, + "loss": 0.3786, + "step": 6752 + }, + { + "epoch": 3.65092809515228, + "grad_norm": 0.2866307199001312, + "learning_rate": 2.054967473251695e-06, + "loss": 0.3686, + "step": 6753 + }, + { + "epoch": 3.651468733105064, + "grad_norm": 0.3068501651287079, + "learning_rate": 2.0534419561297153e-06, + "loss": 0.3674, + "step": 6754 + }, + { + "epoch": 3.652009371057848, + "grad_norm": 0.27632516622543335, + "learning_rate": 2.051916859122641e-06, + "loss": 0.3744, + "step": 6755 + }, + { + "epoch": 3.6525500090106324, + "grad_norm": 0.2828952670097351, + "learning_rate": 2.050392182447914e-06, + "loss": 0.357, + "step": 6756 + }, + { + "epoch": 3.653090646963417, + "grad_norm": 0.2775524854660034, + "learning_rate": 2.0488679263229257e-06, + "loss": 0.353, + "step": 6757 + }, + { + "epoch": 3.653631284916201, + "grad_norm": 0.290147066116333, + "learning_rate": 2.0473440909649932e-06, + "loss": 0.3719, + "step": 6758 + }, + { + "epoch": 3.6541719228689855, + "grad_norm": 0.2628694772720337, + "learning_rate": 2.045820676591389e-06, + "loss": 0.358, + "step": 6759 + }, + { + "epoch": 3.6547125608217694, + "grad_norm": 0.2677537798881531, + "learning_rate": 2.0442976834193146e-06, + "loss": 0.3977, + "step": 6760 + }, + { + "epoch": 3.6552531987745542, + "grad_norm": 0.2836852967739105, + "learning_rate": 2.042775111665919e-06, + "loss": 0.3677, + "step": 6761 + }, + { + "epoch": 3.655793836727338, + "grad_norm": 0.32650119066238403, + "learning_rate": 2.0412529615482867e-06, + "loss": 0.3577, + "step": 6762 + }, + { + "epoch": 3.6563344746801225, + "grad_norm": 0.27181607484817505, + "learning_rate": 2.0397312332834408e-06, + "loss": 0.3641, + "step": 6763 + }, + { + "epoch": 3.656875112632907, + "grad_norm": 0.26769402623176575, + "learning_rate": 2.0382099270883493e-06, + "loss": 0.3965, + "step": 6764 + }, + { + "epoch": 3.6574157505856912, + "grad_norm": 0.30101466178894043, + "learning_rate": 2.036689043179917e-06, + "loss": 0.3786, + "step": 6765 + }, + { + "epoch": 3.6579563885384756, + "grad_norm": 0.27948784828186035, + "learning_rate": 2.0351685817749867e-06, + "loss": 0.3748, + "step": 6766 + }, + { + "epoch": 3.6584970264912595, + "grad_norm": 0.2739773988723755, + "learning_rate": 2.0336485430903453e-06, + "loss": 0.3867, + "step": 6767 + }, + { + "epoch": 3.659037664444044, + "grad_norm": 0.2507238984107971, + "learning_rate": 2.0321289273427155e-06, + "loss": 0.4019, + "step": 6768 + }, + { + "epoch": 3.6595783023968282, + "grad_norm": 0.26378685235977173, + "learning_rate": 2.0306097347487645e-06, + "loss": 0.3499, + "step": 6769 + }, + { + "epoch": 3.6601189403496126, + "grad_norm": 0.2907329201698303, + "learning_rate": 2.0290909655250913e-06, + "loss": 0.3707, + "step": 6770 + }, + { + "epoch": 3.660659578302397, + "grad_norm": 0.2970120310783386, + "learning_rate": 2.0275726198882404e-06, + "loss": 0.3615, + "step": 6771 + }, + { + "epoch": 3.661200216255181, + "grad_norm": 0.2724264860153198, + "learning_rate": 2.026054698054699e-06, + "loss": 0.3852, + "step": 6772 + }, + { + "epoch": 3.6617408542079652, + "grad_norm": 0.28857192397117615, + "learning_rate": 2.0245372002408857e-06, + "loss": 0.3767, + "step": 6773 + }, + { + "epoch": 3.6622814921607496, + "grad_norm": 0.2930293083190918, + "learning_rate": 2.0230201266631644e-06, + "loss": 0.374, + "step": 6774 + }, + { + "epoch": 3.662822130113534, + "grad_norm": 0.28278347849845886, + "learning_rate": 2.0215034775378336e-06, + "loss": 0.3733, + "step": 6775 + }, + { + "epoch": 3.6633627680663183, + "grad_norm": 0.30604609847068787, + "learning_rate": 2.019987253081138e-06, + "loss": 0.3776, + "step": 6776 + }, + { + "epoch": 3.6639034060191027, + "grad_norm": 0.2731034457683563, + "learning_rate": 2.018471453509256e-06, + "loss": 0.3715, + "step": 6777 + }, + { + "epoch": 3.664444043971887, + "grad_norm": 0.27089616656303406, + "learning_rate": 2.016956079038309e-06, + "loss": 0.3679, + "step": 6778 + }, + { + "epoch": 3.664984681924671, + "grad_norm": 0.26166582107543945, + "learning_rate": 2.0154411298843564e-06, + "loss": 0.3676, + "step": 6779 + }, + { + "epoch": 3.6655253198774553, + "grad_norm": 0.2884708046913147, + "learning_rate": 2.013926606263394e-06, + "loss": 0.3587, + "step": 6780 + }, + { + "epoch": 3.6660659578302397, + "grad_norm": 0.30864015221595764, + "learning_rate": 2.0124125083913636e-06, + "loss": 0.3716, + "step": 6781 + }, + { + "epoch": 3.666606595783024, + "grad_norm": 0.24878321588039398, + "learning_rate": 2.0108988364841413e-06, + "loss": 0.3695, + "step": 6782 + }, + { + "epoch": 3.6671472337358084, + "grad_norm": 0.25419852137565613, + "learning_rate": 2.0093855907575416e-06, + "loss": 0.3615, + "step": 6783 + }, + { + "epoch": 3.6676878716885923, + "grad_norm": 0.2902009189128876, + "learning_rate": 2.0078727714273238e-06, + "loss": 0.364, + "step": 6784 + }, + { + "epoch": 3.6682285096413767, + "grad_norm": 0.28060466051101685, + "learning_rate": 2.0063603787091788e-06, + "loss": 0.3932, + "step": 6785 + }, + { + "epoch": 3.668769147594161, + "grad_norm": 0.28917181491851807, + "learning_rate": 2.0048484128187473e-06, + "loss": 0.3891, + "step": 6786 + }, + { + "epoch": 3.6693097855469454, + "grad_norm": 0.2723115384578705, + "learning_rate": 2.0033368739715953e-06, + "loss": 0.3769, + "step": 6787 + }, + { + "epoch": 3.66985042349973, + "grad_norm": 0.2720555365085602, + "learning_rate": 2.0018257623832393e-06, + "loss": 0.3799, + "step": 6788 + }, + { + "epoch": 3.6703910614525137, + "grad_norm": 0.2846823036670685, + "learning_rate": 2.000315078269129e-06, + "loss": 0.3742, + "step": 6789 + }, + { + "epoch": 3.6709316994052985, + "grad_norm": 0.273112416267395, + "learning_rate": 1.9988048218446577e-06, + "loss": 0.4078, + "step": 6790 + }, + { + "epoch": 3.6714723373580824, + "grad_norm": 0.30613189935684204, + "learning_rate": 1.9972949933251534e-06, + "loss": 0.402, + "step": 6791 + }, + { + "epoch": 3.672012975310867, + "grad_norm": 0.26700952649116516, + "learning_rate": 1.995785592925883e-06, + "loss": 0.3789, + "step": 6792 + }, + { + "epoch": 3.672553613263651, + "grad_norm": 0.28727391362190247, + "learning_rate": 1.994276620862057e-06, + "loss": 0.3957, + "step": 6793 + }, + { + "epoch": 3.6730942512164355, + "grad_norm": 0.2805801331996918, + "learning_rate": 1.9927680773488216e-06, + "loss": 0.3882, + "step": 6794 + }, + { + "epoch": 3.67363488916922, + "grad_norm": 0.2893593907356262, + "learning_rate": 1.9912599626012593e-06, + "loss": 0.3681, + "step": 6795 + }, + { + "epoch": 3.674175527122004, + "grad_norm": 0.2825118899345398, + "learning_rate": 1.9897522768343974e-06, + "loss": 0.3632, + "step": 6796 + }, + { + "epoch": 3.674716165074788, + "grad_norm": 0.2712789475917816, + "learning_rate": 1.988245020263197e-06, + "loss": 0.3782, + "step": 6797 + }, + { + "epoch": 3.6752568030275725, + "grad_norm": 0.3012026250362396, + "learning_rate": 1.9867381931025637e-06, + "loss": 0.3926, + "step": 6798 + }, + { + "epoch": 3.675797440980357, + "grad_norm": 0.2728487253189087, + "learning_rate": 1.9852317955673324e-06, + "loss": 0.3742, + "step": 6799 + }, + { + "epoch": 3.6763380789331412, + "grad_norm": 0.2810962498188019, + "learning_rate": 1.9837258278722855e-06, + "loss": 0.3523, + "step": 6800 + }, + { + "epoch": 3.676878716885925, + "grad_norm": 0.28678956627845764, + "learning_rate": 1.982220290232143e-06, + "loss": 0.3617, + "step": 6801 + }, + { + "epoch": 3.6774193548387095, + "grad_norm": 0.27895835041999817, + "learning_rate": 1.98071518286156e-06, + "loss": 0.3712, + "step": 6802 + }, + { + "epoch": 3.677959992791494, + "grad_norm": 0.281220406293869, + "learning_rate": 1.9792105059751314e-06, + "loss": 0.3925, + "step": 6803 + }, + { + "epoch": 3.6785006307442782, + "grad_norm": 0.29164451360702515, + "learning_rate": 1.977706259787391e-06, + "loss": 0.3867, + "step": 6804 + }, + { + "epoch": 3.6790412686970626, + "grad_norm": 0.2751726806163788, + "learning_rate": 1.976202444512813e-06, + "loss": 0.3948, + "step": 6805 + }, + { + "epoch": 3.679581906649847, + "grad_norm": 0.27914339303970337, + "learning_rate": 1.974699060365809e-06, + "loss": 0.3877, + "step": 6806 + }, + { + "epoch": 3.6801225446026313, + "grad_norm": 0.2867169976234436, + "learning_rate": 1.973196107560725e-06, + "loss": 0.3653, + "step": 6807 + }, + { + "epoch": 3.6806631825554152, + "grad_norm": 0.2664088010787964, + "learning_rate": 1.9716935863118546e-06, + "loss": 0.3901, + "step": 6808 + }, + { + "epoch": 3.6812038205081996, + "grad_norm": 0.25899437069892883, + "learning_rate": 1.9701914968334197e-06, + "loss": 0.3688, + "step": 6809 + }, + { + "epoch": 3.681744458460984, + "grad_norm": 0.2655908167362213, + "learning_rate": 1.96868983933959e-06, + "loss": 0.378, + "step": 6810 + }, + { + "epoch": 3.6822850964137683, + "grad_norm": 0.30866947770118713, + "learning_rate": 1.9671886140444667e-06, + "loss": 0.3471, + "step": 6811 + }, + { + "epoch": 3.6828257343665527, + "grad_norm": 0.2657456696033478, + "learning_rate": 1.96568782116209e-06, + "loss": 0.3515, + "step": 6812 + }, + { + "epoch": 3.6833663723193366, + "grad_norm": 0.27322810888290405, + "learning_rate": 1.9641874609064443e-06, + "loss": 0.3968, + "step": 6813 + }, + { + "epoch": 3.683907010272121, + "grad_norm": 0.27966296672821045, + "learning_rate": 1.962687533491446e-06, + "loss": 0.3661, + "step": 6814 + }, + { + "epoch": 3.6844476482249053, + "grad_norm": 0.2770301401615143, + "learning_rate": 1.9611880391309524e-06, + "loss": 0.3653, + "step": 6815 + }, + { + "epoch": 3.6849882861776897, + "grad_norm": 0.27900561690330505, + "learning_rate": 1.959688978038756e-06, + "loss": 0.3933, + "step": 6816 + }, + { + "epoch": 3.685528924130474, + "grad_norm": 0.29549020528793335, + "learning_rate": 1.958190350428595e-06, + "loss": 0.4131, + "step": 6817 + }, + { + "epoch": 3.686069562083258, + "grad_norm": 0.31528905034065247, + "learning_rate": 1.956692156514139e-06, + "loss": 0.3985, + "step": 6818 + }, + { + "epoch": 3.686610200036043, + "grad_norm": 0.28341177105903625, + "learning_rate": 1.9551943965089947e-06, + "loss": 0.3719, + "step": 6819 + }, + { + "epoch": 3.6871508379888267, + "grad_norm": 0.2914537489414215, + "learning_rate": 1.9536970706267156e-06, + "loss": 0.3659, + "step": 6820 + }, + { + "epoch": 3.687691475941611, + "grad_norm": 0.2779093384742737, + "learning_rate": 1.952200179080783e-06, + "loss": 0.3936, + "step": 6821 + }, + { + "epoch": 3.6882321138943954, + "grad_norm": 0.3025447130203247, + "learning_rate": 1.9507037220846236e-06, + "loss": 0.3983, + "step": 6822 + }, + { + "epoch": 3.68877275184718, + "grad_norm": 0.2952757775783539, + "learning_rate": 1.9492076998515997e-06, + "loss": 0.38, + "step": 6823 + }, + { + "epoch": 3.689313389799964, + "grad_norm": 0.27697935700416565, + "learning_rate": 1.9477121125950084e-06, + "loss": 0.3702, + "step": 6824 + }, + { + "epoch": 3.689854027752748, + "grad_norm": 0.2632052004337311, + "learning_rate": 1.946216960528092e-06, + "loss": 0.3746, + "step": 6825 + }, + { + "epoch": 3.6903946657055324, + "grad_norm": 0.277616947889328, + "learning_rate": 1.944722243864024e-06, + "loss": 0.3638, + "step": 6826 + }, + { + "epoch": 3.690935303658317, + "grad_norm": 0.30968624353408813, + "learning_rate": 1.9432279628159188e-06, + "loss": 0.3812, + "step": 6827 + }, + { + "epoch": 3.691475941611101, + "grad_norm": 0.27993831038475037, + "learning_rate": 1.9417341175968274e-06, + "loss": 0.3606, + "step": 6828 + }, + { + "epoch": 3.6920165795638855, + "grad_norm": 0.27457091212272644, + "learning_rate": 1.94024070841974e-06, + "loss": 0.35, + "step": 6829 + }, + { + "epoch": 3.6925572175166694, + "grad_norm": 0.2656506299972534, + "learning_rate": 1.9387477354975885e-06, + "loss": 0.3716, + "step": 6830 + }, + { + "epoch": 3.693097855469454, + "grad_norm": 0.2725432813167572, + "learning_rate": 1.9372551990432307e-06, + "loss": 0.39, + "step": 6831 + }, + { + "epoch": 3.693638493422238, + "grad_norm": 0.27034181356430054, + "learning_rate": 1.9357630992694753e-06, + "loss": 0.3845, + "step": 6832 + }, + { + "epoch": 3.6941791313750225, + "grad_norm": 0.28433382511138916, + "learning_rate": 1.9342714363890596e-06, + "loss": 0.3649, + "step": 6833 + }, + { + "epoch": 3.694719769327807, + "grad_norm": 0.28269556164741516, + "learning_rate": 1.932780210614666e-06, + "loss": 0.3721, + "step": 6834 + }, + { + "epoch": 3.6952604072805912, + "grad_norm": 0.26568377017974854, + "learning_rate": 1.9312894221589085e-06, + "loss": 0.4029, + "step": 6835 + }, + { + "epoch": 3.6958010452333756, + "grad_norm": 0.2716286778450012, + "learning_rate": 1.9297990712343396e-06, + "loss": 0.378, + "step": 6836 + }, + { + "epoch": 3.6963416831861595, + "grad_norm": 0.26363158226013184, + "learning_rate": 1.9283091580534548e-06, + "loss": 0.3699, + "step": 6837 + }, + { + "epoch": 3.696882321138944, + "grad_norm": 0.27918702363967896, + "learning_rate": 1.9268196828286802e-06, + "loss": 0.3727, + "step": 6838 + }, + { + "epoch": 3.6974229590917282, + "grad_norm": 0.2825523912906647, + "learning_rate": 1.9253306457723815e-06, + "loss": 0.3949, + "step": 6839 + }, + { + "epoch": 3.6979635970445126, + "grad_norm": 0.2709399461746216, + "learning_rate": 1.9238420470968665e-06, + "loss": 0.3762, + "step": 6840 + }, + { + "epoch": 3.698504234997297, + "grad_norm": 0.26949992775917053, + "learning_rate": 1.922353887014373e-06, + "loss": 0.364, + "step": 6841 + }, + { + "epoch": 3.699044872950081, + "grad_norm": 0.28536126017570496, + "learning_rate": 1.9208661657370843e-06, + "loss": 0.366, + "step": 6842 + }, + { + "epoch": 3.6995855109028652, + "grad_norm": 0.27294227480888367, + "learning_rate": 1.919378883477114e-06, + "loss": 0.3949, + "step": 6843 + }, + { + "epoch": 3.7001261488556496, + "grad_norm": 0.2738019526004791, + "learning_rate": 1.9178920404465183e-06, + "loss": 0.3863, + "step": 6844 + }, + { + "epoch": 3.700666786808434, + "grad_norm": 0.2529360353946686, + "learning_rate": 1.9164056368572847e-06, + "loss": 0.361, + "step": 6845 + }, + { + "epoch": 3.7012074247612183, + "grad_norm": 0.2742631137371063, + "learning_rate": 1.9149196729213464e-06, + "loss": 0.3908, + "step": 6846 + }, + { + "epoch": 3.7017480627140023, + "grad_norm": 0.2806893587112427, + "learning_rate": 1.9134341488505676e-06, + "loss": 0.3768, + "step": 6847 + }, + { + "epoch": 3.702288700666787, + "grad_norm": 0.28769466280937195, + "learning_rate": 1.9119490648567496e-06, + "loss": 0.3811, + "step": 6848 + }, + { + "epoch": 3.702829338619571, + "grad_norm": 0.2752440869808197, + "learning_rate": 1.9104644211516373e-06, + "loss": 0.4065, + "step": 6849 + }, + { + "epoch": 3.7033699765723553, + "grad_norm": 0.2697683572769165, + "learning_rate": 1.9089802179469036e-06, + "loss": 0.3683, + "step": 6850 + }, + { + "epoch": 3.7039106145251397, + "grad_norm": 0.2717171609401703, + "learning_rate": 1.907496455454168e-06, + "loss": 0.3905, + "step": 6851 + }, + { + "epoch": 3.704451252477924, + "grad_norm": 0.28985825181007385, + "learning_rate": 1.9060131338849808e-06, + "loss": 0.3945, + "step": 6852 + }, + { + "epoch": 3.7049918904307084, + "grad_norm": 0.2849089503288269, + "learning_rate": 1.9045302534508298e-06, + "loss": 0.3923, + "step": 6853 + }, + { + "epoch": 3.7055325283834923, + "grad_norm": 0.2646790146827698, + "learning_rate": 1.9030478143631442e-06, + "loss": 0.4143, + "step": 6854 + }, + { + "epoch": 3.7060731663362767, + "grad_norm": 0.2752073407173157, + "learning_rate": 1.9015658168332863e-06, + "loss": 0.3596, + "step": 6855 + }, + { + "epoch": 3.706613804289061, + "grad_norm": 0.28556573390960693, + "learning_rate": 1.9000842610725562e-06, + "loss": 0.3937, + "step": 6856 + }, + { + "epoch": 3.7071544422418454, + "grad_norm": 0.27374550700187683, + "learning_rate": 1.8986031472921902e-06, + "loss": 0.3855, + "step": 6857 + }, + { + "epoch": 3.70769508019463, + "grad_norm": 0.2739173173904419, + "learning_rate": 1.897122475703364e-06, + "loss": 0.3713, + "step": 6858 + }, + { + "epoch": 3.7082357181474137, + "grad_norm": 0.28062182664871216, + "learning_rate": 1.8956422465171924e-06, + "loss": 0.3718, + "step": 6859 + }, + { + "epoch": 3.708776356100198, + "grad_norm": 0.27734068036079407, + "learning_rate": 1.8941624599447178e-06, + "loss": 0.3702, + "step": 6860 + }, + { + "epoch": 3.7093169940529824, + "grad_norm": 0.31852149963378906, + "learning_rate": 1.8926831161969306e-06, + "loss": 0.3747, + "step": 6861 + }, + { + "epoch": 3.709857632005767, + "grad_norm": 0.2986151874065399, + "learning_rate": 1.8912042154847482e-06, + "loss": 0.3909, + "step": 6862 + }, + { + "epoch": 3.710398269958551, + "grad_norm": 0.2765163481235504, + "learning_rate": 1.8897257580190342e-06, + "loss": 0.3761, + "step": 6863 + }, + { + "epoch": 3.7109389079113355, + "grad_norm": 0.279621958732605, + "learning_rate": 1.8882477440105824e-06, + "loss": 0.3787, + "step": 6864 + }, + { + "epoch": 3.71147954586412, + "grad_norm": 0.27998921275138855, + "learning_rate": 1.8867701736701238e-06, + "loss": 0.3677, + "step": 6865 + }, + { + "epoch": 3.712020183816904, + "grad_norm": 0.2997812330722809, + "learning_rate": 1.8852930472083304e-06, + "loss": 0.3716, + "step": 6866 + }, + { + "epoch": 3.712560821769688, + "grad_norm": 0.25540295243263245, + "learning_rate": 1.8838163648358071e-06, + "loss": 0.3752, + "step": 6867 + }, + { + "epoch": 3.7131014597224725, + "grad_norm": 0.2944501042366028, + "learning_rate": 1.8823401267630952e-06, + "loss": 0.3693, + "step": 6868 + }, + { + "epoch": 3.713642097675257, + "grad_norm": 0.26505914330482483, + "learning_rate": 1.8808643332006765e-06, + "loss": 0.3775, + "step": 6869 + }, + { + "epoch": 3.7141827356280412, + "grad_norm": 0.26692885160446167, + "learning_rate": 1.8793889843589647e-06, + "loss": 0.3777, + "step": 6870 + }, + { + "epoch": 3.714723373580825, + "grad_norm": 0.26996320486068726, + "learning_rate": 1.8779140804483164e-06, + "loss": 0.3844, + "step": 6871 + }, + { + "epoch": 3.7152640115336095, + "grad_norm": 0.29633286595344543, + "learning_rate": 1.8764396216790148e-06, + "loss": 0.3702, + "step": 6872 + }, + { + "epoch": 3.715804649486394, + "grad_norm": 0.29292699694633484, + "learning_rate": 1.87496560826129e-06, + "loss": 0.3902, + "step": 6873 + }, + { + "epoch": 3.7163452874391782, + "grad_norm": 0.2757956385612488, + "learning_rate": 1.8734920404053013e-06, + "loss": 0.3812, + "step": 6874 + }, + { + "epoch": 3.7168859253919626, + "grad_norm": 0.26927411556243896, + "learning_rate": 1.87201891832115e-06, + "loss": 0.3752, + "step": 6875 + }, + { + "epoch": 3.7174265633447465, + "grad_norm": 0.2939903140068054, + "learning_rate": 1.8705462422188703e-06, + "loss": 0.3712, + "step": 6876 + }, + { + "epoch": 3.7179672012975313, + "grad_norm": 0.3061712980270386, + "learning_rate": 1.8690740123084316e-06, + "loss": 0.3908, + "step": 6877 + }, + { + "epoch": 3.7185078392503153, + "grad_norm": 0.283538281917572, + "learning_rate": 1.8676022287997454e-06, + "loss": 0.3942, + "step": 6878 + }, + { + "epoch": 3.7190484772030996, + "grad_norm": 0.25709348917007446, + "learning_rate": 1.8661308919026533e-06, + "loss": 0.3735, + "step": 6879 + }, + { + "epoch": 3.719589115155884, + "grad_norm": 0.287073016166687, + "learning_rate": 1.8646600018269356e-06, + "loss": 0.3779, + "step": 6880 + }, + { + "epoch": 3.7201297531086683, + "grad_norm": 0.27224376797676086, + "learning_rate": 1.8631895587823112e-06, + "loss": 0.381, + "step": 6881 + }, + { + "epoch": 3.7206703910614527, + "grad_norm": 0.31198450922966003, + "learning_rate": 1.8617195629784308e-06, + "loss": 0.4011, + "step": 6882 + }, + { + "epoch": 3.7212110290142366, + "grad_norm": 0.27526232600212097, + "learning_rate": 1.8602500146248885e-06, + "loss": 0.3786, + "step": 6883 + }, + { + "epoch": 3.721751666967021, + "grad_norm": 0.2905593812465668, + "learning_rate": 1.858780913931203e-06, + "loss": 0.3625, + "step": 6884 + }, + { + "epoch": 3.7222923049198053, + "grad_norm": 0.2826231122016907, + "learning_rate": 1.8573122611068406e-06, + "loss": 0.3787, + "step": 6885 + }, + { + "epoch": 3.7228329428725897, + "grad_norm": 0.29400330781936646, + "learning_rate": 1.855844056361197e-06, + "loss": 0.3737, + "step": 6886 + }, + { + "epoch": 3.723373580825374, + "grad_norm": 0.25836947560310364, + "learning_rate": 1.854376299903608e-06, + "loss": 0.369, + "step": 6887 + }, + { + "epoch": 3.723914218778158, + "grad_norm": 0.2573744058609009, + "learning_rate": 1.8529089919433435e-06, + "loss": 0.3723, + "step": 6888 + }, + { + "epoch": 3.724454856730943, + "grad_norm": 0.28248631954193115, + "learning_rate": 1.8514421326896071e-06, + "loss": 0.3772, + "step": 6889 + }, + { + "epoch": 3.7249954946837267, + "grad_norm": 0.2774384319782257, + "learning_rate": 1.8499757223515442e-06, + "loss": 0.3886, + "step": 6890 + }, + { + "epoch": 3.725536132636511, + "grad_norm": 0.27647051215171814, + "learning_rate": 1.8485097611382312e-06, + "loss": 0.3926, + "step": 6891 + }, + { + "epoch": 3.7260767705892954, + "grad_norm": 0.2677972614765167, + "learning_rate": 1.847044249258681e-06, + "loss": 0.3624, + "step": 6892 + }, + { + "epoch": 3.72661740854208, + "grad_norm": 0.28188949823379517, + "learning_rate": 1.845579186921847e-06, + "loss": 0.3951, + "step": 6893 + }, + { + "epoch": 3.727158046494864, + "grad_norm": 0.27737295627593994, + "learning_rate": 1.8441145743366113e-06, + "loss": 0.404, + "step": 6894 + }, + { + "epoch": 3.727698684447648, + "grad_norm": 0.29896578192710876, + "learning_rate": 1.8426504117118011e-06, + "loss": 0.3942, + "step": 6895 + }, + { + "epoch": 3.7282393224004324, + "grad_norm": 0.2849474251270294, + "learning_rate": 1.8411866992561667e-06, + "loss": 0.3693, + "step": 6896 + }, + { + "epoch": 3.728779960353217, + "grad_norm": 0.28557971119880676, + "learning_rate": 1.8397234371784062e-06, + "loss": 0.3778, + "step": 6897 + }, + { + "epoch": 3.729320598306001, + "grad_norm": 0.2757243812084198, + "learning_rate": 1.8382606256871494e-06, + "loss": 0.3872, + "step": 6898 + }, + { + "epoch": 3.7298612362587855, + "grad_norm": 0.27281227707862854, + "learning_rate": 1.83679826499096e-06, + "loss": 0.3908, + "step": 6899 + }, + { + "epoch": 3.7304018742115694, + "grad_norm": 0.26603302359580994, + "learning_rate": 1.8353363552983382e-06, + "loss": 0.3461, + "step": 6900 + }, + { + "epoch": 3.730942512164354, + "grad_norm": 0.267747700214386, + "learning_rate": 1.83387489681772e-06, + "loss": 0.3675, + "step": 6901 + }, + { + "epoch": 3.731483150117138, + "grad_norm": 0.2858692705631256, + "learning_rate": 1.83241388975748e-06, + "loss": 0.3683, + "step": 6902 + }, + { + "epoch": 3.7320237880699225, + "grad_norm": 0.27688363194465637, + "learning_rate": 1.8309533343259246e-06, + "loss": 0.373, + "step": 6903 + }, + { + "epoch": 3.732564426022707, + "grad_norm": 0.26876163482666016, + "learning_rate": 1.8294932307312946e-06, + "loss": 0.3468, + "step": 6904 + }, + { + "epoch": 3.7331050639754912, + "grad_norm": 0.28595924377441406, + "learning_rate": 1.8280335791817733e-06, + "loss": 0.3799, + "step": 6905 + }, + { + "epoch": 3.7336457019282756, + "grad_norm": 0.2635115087032318, + "learning_rate": 1.8265743798854719e-06, + "loss": 0.3904, + "step": 6906 + }, + { + "epoch": 3.7341863398810595, + "grad_norm": 0.2736213803291321, + "learning_rate": 1.8251156330504427e-06, + "loss": 0.3771, + "step": 6907 + }, + { + "epoch": 3.734726977833844, + "grad_norm": 0.29771164059638977, + "learning_rate": 1.82365733888467e-06, + "loss": 0.3865, + "step": 6908 + }, + { + "epoch": 3.7352676157866282, + "grad_norm": 0.30484992265701294, + "learning_rate": 1.8221994975960739e-06, + "loss": 0.4009, + "step": 6909 + }, + { + "epoch": 3.7358082537394126, + "grad_norm": 0.29641515016555786, + "learning_rate": 1.8207421093925127e-06, + "loss": 0.3815, + "step": 6910 + }, + { + "epoch": 3.736348891692197, + "grad_norm": 0.275765597820282, + "learning_rate": 1.8192851744817757e-06, + "loss": 0.3752, + "step": 6911 + }, + { + "epoch": 3.736889529644981, + "grad_norm": 0.29648923873901367, + "learning_rate": 1.817828693071595e-06, + "loss": 0.3919, + "step": 6912 + }, + { + "epoch": 3.7374301675977653, + "grad_norm": 0.27310508489608765, + "learning_rate": 1.8163726653696263e-06, + "loss": 0.367, + "step": 6913 + }, + { + "epoch": 3.7379708055505496, + "grad_norm": 0.28955864906311035, + "learning_rate": 1.8149170915834723e-06, + "loss": 0.3819, + "step": 6914 + }, + { + "epoch": 3.738511443503334, + "grad_norm": 0.26339635252952576, + "learning_rate": 1.8134619719206624e-06, + "loss": 0.3803, + "step": 6915 + }, + { + "epoch": 3.7390520814561183, + "grad_norm": 0.2843658924102783, + "learning_rate": 1.8120073065886695e-06, + "loss": 0.3706, + "step": 6916 + }, + { + "epoch": 3.7395927194089023, + "grad_norm": 0.27032530307769775, + "learning_rate": 1.8105530957948941e-06, + "loss": 0.3872, + "step": 6917 + }, + { + "epoch": 3.740133357361687, + "grad_norm": 0.26732128858566284, + "learning_rate": 1.809099339746674e-06, + "loss": 0.3417, + "step": 6918 + }, + { + "epoch": 3.740673995314471, + "grad_norm": 0.2693036198616028, + "learning_rate": 1.8076460386512855e-06, + "loss": 0.389, + "step": 6919 + }, + { + "epoch": 3.7412146332672553, + "grad_norm": 0.27490392327308655, + "learning_rate": 1.8061931927159377e-06, + "loss": 0.3756, + "step": 6920 + }, + { + "epoch": 3.7417552712200397, + "grad_norm": 0.2863559424877167, + "learning_rate": 1.8047408021477713e-06, + "loss": 0.3897, + "step": 6921 + }, + { + "epoch": 3.742295909172824, + "grad_norm": 0.2561712861061096, + "learning_rate": 1.80328886715387e-06, + "loss": 0.3495, + "step": 6922 + }, + { + "epoch": 3.7428365471256084, + "grad_norm": 0.27250537276268005, + "learning_rate": 1.8018373879412442e-06, + "loss": 0.3602, + "step": 6923 + }, + { + "epoch": 3.7433771850783923, + "grad_norm": 0.26523685455322266, + "learning_rate": 1.800386364716849e-06, + "loss": 0.3701, + "step": 6924 + }, + { + "epoch": 3.7439178230311767, + "grad_norm": 0.28787243366241455, + "learning_rate": 1.7989357976875603e-06, + "loss": 0.384, + "step": 6925 + }, + { + "epoch": 3.744458460983961, + "grad_norm": 0.2766694128513336, + "learning_rate": 1.7974856870602025e-06, + "loss": 0.3885, + "step": 6926 + }, + { + "epoch": 3.7449990989367454, + "grad_norm": 0.2729831635951996, + "learning_rate": 1.79603603304153e-06, + "loss": 0.3803, + "step": 6927 + }, + { + "epoch": 3.74553973688953, + "grad_norm": 0.27197712659835815, + "learning_rate": 1.7945868358382311e-06, + "loss": 0.3531, + "step": 6928 + }, + { + "epoch": 3.7460803748423137, + "grad_norm": 0.28468289971351624, + "learning_rate": 1.7931380956569294e-06, + "loss": 0.3855, + "step": 6929 + }, + { + "epoch": 3.746621012795098, + "grad_norm": 0.2688024342060089, + "learning_rate": 1.7916898127041815e-06, + "loss": 0.3631, + "step": 6930 + }, + { + "epoch": 3.7471616507478824, + "grad_norm": 0.27505674958229065, + "learning_rate": 1.790241987186485e-06, + "loss": 0.3652, + "step": 6931 + }, + { + "epoch": 3.747702288700667, + "grad_norm": 0.25617873668670654, + "learning_rate": 1.7887946193102663e-06, + "loss": 0.3393, + "step": 6932 + }, + { + "epoch": 3.748242926653451, + "grad_norm": 0.2912514805793762, + "learning_rate": 1.787347709281887e-06, + "loss": 0.3834, + "step": 6933 + }, + { + "epoch": 3.7487835646062355, + "grad_norm": 0.2743835151195526, + "learning_rate": 1.7859012573076478e-06, + "loss": 0.3702, + "step": 6934 + }, + { + "epoch": 3.74932420255902, + "grad_norm": 0.2596570551395416, + "learning_rate": 1.7844552635937784e-06, + "loss": 0.3882, + "step": 6935 + }, + { + "epoch": 3.749864840511804, + "grad_norm": 0.28975582122802734, + "learning_rate": 1.7830097283464486e-06, + "loss": 0.3903, + "step": 6936 + }, + { + "epoch": 3.750405478464588, + "grad_norm": 0.2721347510814667, + "learning_rate": 1.7815646517717595e-06, + "loss": 0.3865, + "step": 6937 + }, + { + "epoch": 3.7509461164173725, + "grad_norm": 0.2891277074813843, + "learning_rate": 1.7801200340757452e-06, + "loss": 0.3686, + "step": 6938 + }, + { + "epoch": 3.751486754370157, + "grad_norm": 0.2961825132369995, + "learning_rate": 1.7786758754643795e-06, + "loss": 0.3852, + "step": 6939 + }, + { + "epoch": 3.7520273923229412, + "grad_norm": 0.25924432277679443, + "learning_rate": 1.7772321761435674e-06, + "loss": 0.3807, + "step": 6940 + }, + { + "epoch": 3.752568030275725, + "grad_norm": 0.2748173475265503, + "learning_rate": 1.7757889363191484e-06, + "loss": 0.3799, + "step": 6941 + }, + { + "epoch": 3.7531086682285095, + "grad_norm": 0.2915605902671814, + "learning_rate": 1.774346156196895e-06, + "loss": 0.3723, + "step": 6942 + }, + { + "epoch": 3.753649306181294, + "grad_norm": 0.2920202314853668, + "learning_rate": 1.7729038359825201e-06, + "loss": 0.3893, + "step": 6943 + }, + { + "epoch": 3.7541899441340782, + "grad_norm": 0.2847879230976105, + "learning_rate": 1.7714619758816649e-06, + "loss": 0.3809, + "step": 6944 + }, + { + "epoch": 3.7547305820868626, + "grad_norm": 0.28939902782440186, + "learning_rate": 1.7700205760999061e-06, + "loss": 0.4034, + "step": 6945 + }, + { + "epoch": 3.7552712200396465, + "grad_norm": 0.26719948649406433, + "learning_rate": 1.7685796368427587e-06, + "loss": 0.383, + "step": 6946 + }, + { + "epoch": 3.7558118579924313, + "grad_norm": 0.2696559429168701, + "learning_rate": 1.767139158315666e-06, + "loss": 0.3681, + "step": 6947 + }, + { + "epoch": 3.7563524959452153, + "grad_norm": 0.278033584356308, + "learning_rate": 1.7656991407240126e-06, + "loss": 0.4143, + "step": 6948 + }, + { + "epoch": 3.7568931338979996, + "grad_norm": 0.2916466295719147, + "learning_rate": 1.7642595842731113e-06, + "loss": 0.3837, + "step": 6949 + }, + { + "epoch": 3.757433771850784, + "grad_norm": 0.2705329358577728, + "learning_rate": 1.76282048916821e-06, + "loss": 0.3577, + "step": 6950 + }, + { + "epoch": 3.7579744098035683, + "grad_norm": 0.27100685238838196, + "learning_rate": 1.7613818556144956e-06, + "loss": 0.3997, + "step": 6951 + }, + { + "epoch": 3.7585150477563527, + "grad_norm": 0.27594810724258423, + "learning_rate": 1.7599436838170847e-06, + "loss": 0.3903, + "step": 6952 + }, + { + "epoch": 3.7590556857091366, + "grad_norm": 0.28989678621292114, + "learning_rate": 1.7585059739810284e-06, + "loss": 0.3876, + "step": 6953 + }, + { + "epoch": 3.759596323661921, + "grad_norm": 0.27664127945899963, + "learning_rate": 1.7570687263113112e-06, + "loss": 0.3886, + "step": 6954 + }, + { + "epoch": 3.7601369616147053, + "grad_norm": 0.2947249710559845, + "learning_rate": 1.7556319410128557e-06, + "loss": 0.3829, + "step": 6955 + }, + { + "epoch": 3.7606775995674897, + "grad_norm": 0.27407416701316833, + "learning_rate": 1.754195618290519e-06, + "loss": 0.3877, + "step": 6956 + }, + { + "epoch": 3.761218237520274, + "grad_norm": 0.31988996267318726, + "learning_rate": 1.7527597583490825e-06, + "loss": 0.3944, + "step": 6957 + }, + { + "epoch": 3.761758875473058, + "grad_norm": 0.25325557589530945, + "learning_rate": 1.7513243613932734e-06, + "loss": 0.3541, + "step": 6958 + }, + { + "epoch": 3.7622995134258423, + "grad_norm": 0.2626549303531647, + "learning_rate": 1.749889427627745e-06, + "loss": 0.3802, + "step": 6959 + }, + { + "epoch": 3.7628401513786267, + "grad_norm": 0.2787989377975464, + "learning_rate": 1.7484549572570913e-06, + "loss": 0.3694, + "step": 6960 + }, + { + "epoch": 3.763380789331411, + "grad_norm": 0.2660752236843109, + "learning_rate": 1.7470209504858343e-06, + "loss": 0.3541, + "step": 6961 + }, + { + "epoch": 3.7639214272841954, + "grad_norm": 0.26545456051826477, + "learning_rate": 1.7455874075184297e-06, + "loss": 0.3999, + "step": 6962 + }, + { + "epoch": 3.76446206523698, + "grad_norm": 0.27897003293037415, + "learning_rate": 1.7441543285592743e-06, + "loss": 0.3667, + "step": 6963 + }, + { + "epoch": 3.765002703189764, + "grad_norm": 0.27769017219543457, + "learning_rate": 1.7427217138126916e-06, + "loss": 0.3749, + "step": 6964 + }, + { + "epoch": 3.765543341142548, + "grad_norm": 0.2526131570339203, + "learning_rate": 1.7412895634829391e-06, + "loss": 0.3932, + "step": 6965 + }, + { + "epoch": 3.7660839790953324, + "grad_norm": 0.26294299960136414, + "learning_rate": 1.7398578777742142e-06, + "loss": 0.3746, + "step": 6966 + }, + { + "epoch": 3.766624617048117, + "grad_norm": 0.26726728677749634, + "learning_rate": 1.7384266568906404e-06, + "loss": 0.3586, + "step": 6967 + }, + { + "epoch": 3.767165255000901, + "grad_norm": 0.29011353850364685, + "learning_rate": 1.7369959010362836e-06, + "loss": 0.3668, + "step": 6968 + }, + { + "epoch": 3.7677058929536855, + "grad_norm": 0.27279070019721985, + "learning_rate": 1.7355656104151314e-06, + "loss": 0.3779, + "step": 6969 + }, + { + "epoch": 3.7682465309064694, + "grad_norm": 0.2736707031726837, + "learning_rate": 1.7341357852311175e-06, + "loss": 0.4005, + "step": 6970 + }, + { + "epoch": 3.768787168859254, + "grad_norm": 0.2550220787525177, + "learning_rate": 1.7327064256881e-06, + "loss": 0.3768, + "step": 6971 + }, + { + "epoch": 3.769327806812038, + "grad_norm": 0.2679320275783539, + "learning_rate": 1.7312775319898768e-06, + "loss": 0.3571, + "step": 6972 + }, + { + "epoch": 3.7698684447648225, + "grad_norm": 0.27891236543655396, + "learning_rate": 1.7298491043401794e-06, + "loss": 0.3807, + "step": 6973 + }, + { + "epoch": 3.770409082717607, + "grad_norm": 0.28013134002685547, + "learning_rate": 1.7284211429426645e-06, + "loss": 0.3754, + "step": 6974 + }, + { + "epoch": 3.770949720670391, + "grad_norm": 0.26919159293174744, + "learning_rate": 1.726993648000933e-06, + "loss": 0.3819, + "step": 6975 + }, + { + "epoch": 3.7714903586231756, + "grad_norm": 0.2816323935985565, + "learning_rate": 1.7255666197185111e-06, + "loss": 0.3647, + "step": 6976 + }, + { + "epoch": 3.7720309965759595, + "grad_norm": 0.27056410908699036, + "learning_rate": 1.7241400582988654e-06, + "loss": 0.389, + "step": 6977 + }, + { + "epoch": 3.772571634528744, + "grad_norm": 0.26639676094055176, + "learning_rate": 1.7227139639453904e-06, + "loss": 0.3745, + "step": 6978 + }, + { + "epoch": 3.7731122724815283, + "grad_norm": 0.27519285678863525, + "learning_rate": 1.7212883368614153e-06, + "loss": 0.3664, + "step": 6979 + }, + { + "epoch": 3.7736529104343126, + "grad_norm": 0.2709251046180725, + "learning_rate": 1.7198631772502057e-06, + "loss": 0.3697, + "step": 6980 + }, + { + "epoch": 3.774193548387097, + "grad_norm": 0.2782084047794342, + "learning_rate": 1.7184384853149566e-06, + "loss": 0.3985, + "step": 6981 + }, + { + "epoch": 3.774734186339881, + "grad_norm": 0.27686184644699097, + "learning_rate": 1.7170142612587986e-06, + "loss": 0.3673, + "step": 6982 + }, + { + "epoch": 3.7752748242926653, + "grad_norm": 0.2701174318790436, + "learning_rate": 1.7155905052847938e-06, + "loss": 0.3876, + "step": 6983 + }, + { + "epoch": 3.7758154622454496, + "grad_norm": 0.27983951568603516, + "learning_rate": 1.714167217595939e-06, + "loss": 0.3854, + "step": 6984 + }, + { + "epoch": 3.776356100198234, + "grad_norm": 0.26138070225715637, + "learning_rate": 1.7127443983951687e-06, + "loss": 0.3786, + "step": 6985 + }, + { + "epoch": 3.7768967381510183, + "grad_norm": 0.27046772837638855, + "learning_rate": 1.7113220478853375e-06, + "loss": 0.369, + "step": 6986 + }, + { + "epoch": 3.7774373761038023, + "grad_norm": 0.2825208902359009, + "learning_rate": 1.7099001662692488e-06, + "loss": 0.3846, + "step": 6987 + }, + { + "epoch": 3.7779780140565866, + "grad_norm": 0.29027751088142395, + "learning_rate": 1.7084787537496266e-06, + "loss": 0.3749, + "step": 6988 + }, + { + "epoch": 3.778518652009371, + "grad_norm": 0.2870740294456482, + "learning_rate": 1.707057810529138e-06, + "loss": 0.4073, + "step": 6989 + }, + { + "epoch": 3.7790592899621553, + "grad_norm": 0.276230126619339, + "learning_rate": 1.7056373368103756e-06, + "loss": 0.3773, + "step": 6990 + }, + { + "epoch": 3.7795999279149397, + "grad_norm": 0.24382857978343964, + "learning_rate": 1.7042173327958678e-06, + "loss": 0.3642, + "step": 6991 + }, + { + "epoch": 3.780140565867724, + "grad_norm": 0.26477882266044617, + "learning_rate": 1.7027977986880784e-06, + "loss": 0.3497, + "step": 6992 + }, + { + "epoch": 3.7806812038205084, + "grad_norm": 0.27764496207237244, + "learning_rate": 1.7013787346894006e-06, + "loss": 0.3573, + "step": 6993 + }, + { + "epoch": 3.7812218417732923, + "grad_norm": 0.2853281795978546, + "learning_rate": 1.6999601410021605e-06, + "loss": 0.3737, + "step": 6994 + }, + { + "epoch": 3.7817624797260767, + "grad_norm": 0.2836633026599884, + "learning_rate": 1.6985420178286216e-06, + "loss": 0.3883, + "step": 6995 + }, + { + "epoch": 3.782303117678861, + "grad_norm": 0.2709035277366638, + "learning_rate": 1.697124365370974e-06, + "loss": 0.3629, + "step": 6996 + }, + { + "epoch": 3.7828437556316454, + "grad_norm": 0.2787929177284241, + "learning_rate": 1.69570718383135e-06, + "loss": 0.3654, + "step": 6997 + }, + { + "epoch": 3.78338439358443, + "grad_norm": 0.2762378752231598, + "learning_rate": 1.6942904734118004e-06, + "loss": 0.3936, + "step": 6998 + }, + { + "epoch": 3.7839250315372137, + "grad_norm": 0.24872547388076782, + "learning_rate": 1.692874234314324e-06, + "loss": 0.3461, + "step": 6999 + }, + { + "epoch": 3.784465669489998, + "grad_norm": 0.2528185546398163, + "learning_rate": 1.6914584667408408e-06, + "loss": 0.374, + "step": 7000 + }, + { + "epoch": 3.7850063074427824, + "grad_norm": 0.2733992040157318, + "learning_rate": 1.6900431708932124e-06, + "loss": 0.3708, + "step": 7001 + }, + { + "epoch": 3.785546945395567, + "grad_norm": 0.30628329515457153, + "learning_rate": 1.6886283469732279e-06, + "loss": 0.3975, + "step": 7002 + }, + { + "epoch": 3.786087583348351, + "grad_norm": 0.26259320974349976, + "learning_rate": 1.6872139951826078e-06, + "loss": 0.3767, + "step": 7003 + }, + { + "epoch": 3.786628221301135, + "grad_norm": 0.2805614769458771, + "learning_rate": 1.685800115723011e-06, + "loss": 0.3974, + "step": 7004 + }, + { + "epoch": 3.78716885925392, + "grad_norm": 0.2630981504917145, + "learning_rate": 1.6843867087960252e-06, + "loss": 0.3703, + "step": 7005 + }, + { + "epoch": 3.787709497206704, + "grad_norm": 0.27293825149536133, + "learning_rate": 1.6829737746031687e-06, + "loss": 0.3829, + "step": 7006 + }, + { + "epoch": 3.788250135159488, + "grad_norm": 0.2765561044216156, + "learning_rate": 1.6815613133458998e-06, + "loss": 0.3638, + "step": 7007 + }, + { + "epoch": 3.7887907731122725, + "grad_norm": 0.2968769371509552, + "learning_rate": 1.6801493252255995e-06, + "loss": 0.3745, + "step": 7008 + }, + { + "epoch": 3.789331411065057, + "grad_norm": 0.2884106934070587, + "learning_rate": 1.6787378104435931e-06, + "loss": 0.3857, + "step": 7009 + }, + { + "epoch": 3.7898720490178412, + "grad_norm": 0.2724694013595581, + "learning_rate": 1.6773267692011242e-06, + "loss": 0.3548, + "step": 7010 + }, + { + "epoch": 3.790412686970625, + "grad_norm": 0.26447156071662903, + "learning_rate": 1.6759162016993808e-06, + "loss": 0.3886, + "step": 7011 + }, + { + "epoch": 3.7909533249234095, + "grad_norm": 0.2667076885700226, + "learning_rate": 1.6745061081394792e-06, + "loss": 0.3839, + "step": 7012 + }, + { + "epoch": 3.791493962876194, + "grad_norm": 0.27531537413597107, + "learning_rate": 1.6730964887224677e-06, + "loss": 0.3818, + "step": 7013 + }, + { + "epoch": 3.7920346008289783, + "grad_norm": 0.2675216495990753, + "learning_rate": 1.6716873436493263e-06, + "loss": 0.3761, + "step": 7014 + }, + { + "epoch": 3.7925752387817626, + "grad_norm": 0.30674728751182556, + "learning_rate": 1.6702786731209681e-06, + "loss": 0.3599, + "step": 7015 + }, + { + "epoch": 3.7931158767345465, + "grad_norm": 0.2617681324481964, + "learning_rate": 1.6688704773382403e-06, + "loss": 0.3785, + "step": 7016 + }, + { + "epoch": 3.793656514687331, + "grad_norm": 0.2874067425727844, + "learning_rate": 1.6674627565019203e-06, + "loss": 0.3601, + "step": 7017 + }, + { + "epoch": 3.7941971526401153, + "grad_norm": 0.2876586318016052, + "learning_rate": 1.6660555108127169e-06, + "loss": 0.4128, + "step": 7018 + }, + { + "epoch": 3.7947377905928996, + "grad_norm": 0.28677424788475037, + "learning_rate": 1.6646487404712753e-06, + "loss": 0.3905, + "step": 7019 + }, + { + "epoch": 3.795278428545684, + "grad_norm": 0.26501747965812683, + "learning_rate": 1.6632424456781675e-06, + "loss": 0.3728, + "step": 7020 + }, + { + "epoch": 3.7958190664984683, + "grad_norm": 0.2984306514263153, + "learning_rate": 1.6618366266339048e-06, + "loss": 0.3871, + "step": 7021 + }, + { + "epoch": 3.7963597044512527, + "grad_norm": 0.2869909405708313, + "learning_rate": 1.6604312835389202e-06, + "loss": 0.3847, + "step": 7022 + }, + { + "epoch": 3.7969003424040366, + "grad_norm": 0.29960310459136963, + "learning_rate": 1.6590264165935882e-06, + "loss": 0.3771, + "step": 7023 + }, + { + "epoch": 3.797440980356821, + "grad_norm": 0.25678813457489014, + "learning_rate": 1.657622025998214e-06, + "loss": 0.3744, + "step": 7024 + }, + { + "epoch": 3.7979816183096053, + "grad_norm": 0.26842546463012695, + "learning_rate": 1.6562181119530314e-06, + "loss": 0.3958, + "step": 7025 + }, + { + "epoch": 3.7985222562623897, + "grad_norm": 0.28335508704185486, + "learning_rate": 1.6548146746582072e-06, + "loss": 0.3817, + "step": 7026 + }, + { + "epoch": 3.799062894215174, + "grad_norm": 0.268964558839798, + "learning_rate": 1.6534117143138402e-06, + "loss": 0.3837, + "step": 7027 + }, + { + "epoch": 3.799603532167958, + "grad_norm": 0.2672850489616394, + "learning_rate": 1.6520092311199648e-06, + "loss": 0.3691, + "step": 7028 + }, + { + "epoch": 3.8001441701207423, + "grad_norm": 0.26871156692504883, + "learning_rate": 1.6506072252765427e-06, + "loss": 0.4003, + "step": 7029 + }, + { + "epoch": 3.8006848080735267, + "grad_norm": 0.28091421723365784, + "learning_rate": 1.649205696983468e-06, + "loss": 0.3917, + "step": 7030 + }, + { + "epoch": 3.801225446026311, + "grad_norm": 0.28011780977249146, + "learning_rate": 1.6478046464405717e-06, + "loss": 0.3744, + "step": 7031 + }, + { + "epoch": 3.8017660839790954, + "grad_norm": 0.26686403155326843, + "learning_rate": 1.646404073847609e-06, + "loss": 0.3762, + "step": 7032 + }, + { + "epoch": 3.8023067219318794, + "grad_norm": 0.28495582938194275, + "learning_rate": 1.6450039794042743e-06, + "loss": 0.3852, + "step": 7033 + }, + { + "epoch": 3.802847359884664, + "grad_norm": 0.25214681029319763, + "learning_rate": 1.6436043633101901e-06, + "loss": 0.3713, + "step": 7034 + }, + { + "epoch": 3.803387997837448, + "grad_norm": 0.30977126955986023, + "learning_rate": 1.642205225764908e-06, + "loss": 0.3836, + "step": 7035 + }, + { + "epoch": 3.8039286357902324, + "grad_norm": 0.27312731742858887, + "learning_rate": 1.6408065669679184e-06, + "loss": 0.3908, + "step": 7036 + }, + { + "epoch": 3.804469273743017, + "grad_norm": 0.25544628500938416, + "learning_rate": 1.6394083871186362e-06, + "loss": 0.3818, + "step": 7037 + }, + { + "epoch": 3.805009911695801, + "grad_norm": 0.27852949500083923, + "learning_rate": 1.6380106864164163e-06, + "loss": 0.3981, + "step": 7038 + }, + { + "epoch": 3.8055505496485855, + "grad_norm": 0.26494163274765015, + "learning_rate": 1.6366134650605342e-06, + "loss": 0.3477, + "step": 7039 + }, + { + "epoch": 3.8060911876013694, + "grad_norm": 0.2715262472629547, + "learning_rate": 1.635216723250206e-06, + "loss": 0.3625, + "step": 7040 + }, + { + "epoch": 3.806631825554154, + "grad_norm": 0.26712551712989807, + "learning_rate": 1.6338204611845775e-06, + "loss": 0.3679, + "step": 7041 + }, + { + "epoch": 3.807172463506938, + "grad_norm": 0.2706104815006256, + "learning_rate": 1.6324246790627252e-06, + "loss": 0.3953, + "step": 7042 + }, + { + "epoch": 3.8077131014597225, + "grad_norm": 0.25851455330848694, + "learning_rate": 1.631029377083656e-06, + "loss": 0.3784, + "step": 7043 + }, + { + "epoch": 3.808253739412507, + "grad_norm": 0.29127827286720276, + "learning_rate": 1.6296345554463084e-06, + "loss": 0.3729, + "step": 7044 + }, + { + "epoch": 3.808794377365291, + "grad_norm": 0.2785845100879669, + "learning_rate": 1.6282402143495568e-06, + "loss": 0.3614, + "step": 7045 + }, + { + "epoch": 3.809335015318075, + "grad_norm": 0.28293830156326294, + "learning_rate": 1.6268463539922018e-06, + "loss": 0.3811, + "step": 7046 + }, + { + "epoch": 3.8098756532708595, + "grad_norm": 0.2674233019351959, + "learning_rate": 1.6254529745729759e-06, + "loss": 0.3838, + "step": 7047 + }, + { + "epoch": 3.810416291223644, + "grad_norm": 0.26191818714141846, + "learning_rate": 1.6240600762905485e-06, + "loss": 0.3617, + "step": 7048 + }, + { + "epoch": 3.8109569291764283, + "grad_norm": 0.2639159858226776, + "learning_rate": 1.6226676593435126e-06, + "loss": 0.3626, + "step": 7049 + }, + { + "epoch": 3.8114975671292126, + "grad_norm": 0.2967204451560974, + "learning_rate": 1.6212757239304e-06, + "loss": 0.3813, + "step": 7050 + }, + { + "epoch": 3.812038205081997, + "grad_norm": 0.257874995470047, + "learning_rate": 1.6198842702496687e-06, + "loss": 0.3817, + "step": 7051 + }, + { + "epoch": 3.812578843034781, + "grad_norm": 0.26568150520324707, + "learning_rate": 1.6184932984997082e-06, + "loss": 0.3399, + "step": 7052 + }, + { + "epoch": 3.8131194809875653, + "grad_norm": 0.2827441990375519, + "learning_rate": 1.6171028088788432e-06, + "loss": 0.3653, + "step": 7053 + }, + { + "epoch": 3.8136601189403496, + "grad_norm": 0.27088630199432373, + "learning_rate": 1.6157128015853269e-06, + "loss": 0.3645, + "step": 7054 + }, + { + "epoch": 3.814200756893134, + "grad_norm": 0.2815869152545929, + "learning_rate": 1.6143232768173428e-06, + "loss": 0.3761, + "step": 7055 + }, + { + "epoch": 3.8147413948459183, + "grad_norm": 0.2995468080043793, + "learning_rate": 1.6129342347730054e-06, + "loss": 0.3815, + "step": 7056 + }, + { + "epoch": 3.8152820327987023, + "grad_norm": 0.28593334555625916, + "learning_rate": 1.6115456756503656e-06, + "loss": 0.3745, + "step": 7057 + }, + { + "epoch": 3.8158226707514866, + "grad_norm": 0.26630958914756775, + "learning_rate": 1.6101575996473994e-06, + "loss": 0.3604, + "step": 7058 + }, + { + "epoch": 3.816363308704271, + "grad_norm": 0.27000048756599426, + "learning_rate": 1.6087700069620155e-06, + "loss": 0.3569, + "step": 7059 + }, + { + "epoch": 3.8169039466570553, + "grad_norm": 0.25779932737350464, + "learning_rate": 1.6073828977920564e-06, + "loss": 0.3878, + "step": 7060 + }, + { + "epoch": 3.8174445846098397, + "grad_norm": 0.2818188965320587, + "learning_rate": 1.6059962723352912e-06, + "loss": 0.3803, + "step": 7061 + }, + { + "epoch": 3.8179852225626236, + "grad_norm": 0.30023640394210815, + "learning_rate": 1.6046101307894251e-06, + "loss": 0.4007, + "step": 7062 + }, + { + "epoch": 3.8185258605154084, + "grad_norm": 0.2842724919319153, + "learning_rate": 1.6032244733520901e-06, + "loss": 0.3988, + "step": 7063 + }, + { + "epoch": 3.8190664984681923, + "grad_norm": 0.2938917577266693, + "learning_rate": 1.60183930022085e-06, + "loss": 0.3605, + "step": 7064 + }, + { + "epoch": 3.8196071364209767, + "grad_norm": 0.28421473503112793, + "learning_rate": 1.6004546115932023e-06, + "loss": 0.3936, + "step": 7065 + }, + { + "epoch": 3.820147774373761, + "grad_norm": 0.2783248722553253, + "learning_rate": 1.5990704076665726e-06, + "loss": 0.3705, + "step": 7066 + }, + { + "epoch": 3.8206884123265454, + "grad_norm": 0.2888815999031067, + "learning_rate": 1.597686688638318e-06, + "loss": 0.3707, + "step": 7067 + }, + { + "epoch": 3.82122905027933, + "grad_norm": 0.27927088737487793, + "learning_rate": 1.5963034547057249e-06, + "loss": 0.4013, + "step": 7068 + }, + { + "epoch": 3.8217696882321137, + "grad_norm": 0.2906007170677185, + "learning_rate": 1.5949207060660138e-06, + "loss": 0.38, + "step": 7069 + }, + { + "epoch": 3.822310326184898, + "grad_norm": 0.2735014855861664, + "learning_rate": 1.5935384429163376e-06, + "loss": 0.3678, + "step": 7070 + }, + { + "epoch": 3.8228509641376824, + "grad_norm": 0.27961209416389465, + "learning_rate": 1.5921566654537706e-06, + "loss": 0.4182, + "step": 7071 + }, + { + "epoch": 3.823391602090467, + "grad_norm": 0.26644429564476013, + "learning_rate": 1.5907753738753296e-06, + "loss": 0.3791, + "step": 7072 + }, + { + "epoch": 3.823932240043251, + "grad_norm": 0.2603045403957367, + "learning_rate": 1.5893945683779526e-06, + "loss": 0.3727, + "step": 7073 + }, + { + "epoch": 3.824472877996035, + "grad_norm": 0.2659344971179962, + "learning_rate": 1.588014249158516e-06, + "loss": 0.3748, + "step": 7074 + }, + { + "epoch": 3.8250135159488194, + "grad_norm": 0.2584983706474304, + "learning_rate": 1.5866344164138214e-06, + "loss": 0.3872, + "step": 7075 + }, + { + "epoch": 3.825554153901604, + "grad_norm": 0.2933692932128906, + "learning_rate": 1.585255070340601e-06, + "loss": 0.3687, + "step": 7076 + }, + { + "epoch": 3.826094791854388, + "grad_norm": 0.27587223052978516, + "learning_rate": 1.5838762111355234e-06, + "loss": 0.398, + "step": 7077 + }, + { + "epoch": 3.8266354298071725, + "grad_norm": 0.2780817747116089, + "learning_rate": 1.5824978389951812e-06, + "loss": 0.3907, + "step": 7078 + }, + { + "epoch": 3.827176067759957, + "grad_norm": 0.28126052021980286, + "learning_rate": 1.5811199541160994e-06, + "loss": 0.3804, + "step": 7079 + }, + { + "epoch": 3.8277167057127413, + "grad_norm": 0.2787818908691406, + "learning_rate": 1.5797425566947378e-06, + "loss": 0.3908, + "step": 7080 + }, + { + "epoch": 3.828257343665525, + "grad_norm": 0.2890182137489319, + "learning_rate": 1.578365646927479e-06, + "loss": 0.3868, + "step": 7081 + }, + { + "epoch": 3.8287979816183095, + "grad_norm": 0.2658842206001282, + "learning_rate": 1.5769892250106456e-06, + "loss": 0.3922, + "step": 7082 + }, + { + "epoch": 3.829338619571094, + "grad_norm": 0.27497076988220215, + "learning_rate": 1.5756132911404792e-06, + "loss": 0.3867, + "step": 7083 + }, + { + "epoch": 3.8298792575238783, + "grad_norm": 0.26899388432502747, + "learning_rate": 1.574237845513163e-06, + "loss": 0.3635, + "step": 7084 + }, + { + "epoch": 3.8304198954766626, + "grad_norm": 0.27387529611587524, + "learning_rate": 1.572862888324801e-06, + "loss": 0.3923, + "step": 7085 + }, + { + "epoch": 3.8309605334294465, + "grad_norm": 0.2729697823524475, + "learning_rate": 1.5714884197714369e-06, + "loss": 0.3887, + "step": 7086 + }, + { + "epoch": 3.831501171382231, + "grad_norm": 0.2589680552482605, + "learning_rate": 1.570114440049037e-06, + "loss": 0.3986, + "step": 7087 + }, + { + "epoch": 3.8320418093350153, + "grad_norm": 0.2655174136161804, + "learning_rate": 1.5687409493535004e-06, + "loss": 0.3948, + "step": 7088 + }, + { + "epoch": 3.8325824472877996, + "grad_norm": 0.2721310555934906, + "learning_rate": 1.5673679478806592e-06, + "loss": 0.3716, + "step": 7089 + }, + { + "epoch": 3.833123085240584, + "grad_norm": 0.273525208234787, + "learning_rate": 1.5659954358262724e-06, + "loss": 0.369, + "step": 7090 + }, + { + "epoch": 3.833663723193368, + "grad_norm": 0.25570881366729736, + "learning_rate": 1.5646234133860288e-06, + "loss": 0.3807, + "step": 7091 + }, + { + "epoch": 3.8342043611461527, + "grad_norm": 0.27765756845474243, + "learning_rate": 1.5632518807555513e-06, + "loss": 0.3923, + "step": 7092 + }, + { + "epoch": 3.8347449990989366, + "grad_norm": 0.26038357615470886, + "learning_rate": 1.561880838130388e-06, + "loss": 0.3872, + "step": 7093 + }, + { + "epoch": 3.835285637051721, + "grad_norm": 0.278969407081604, + "learning_rate": 1.5605102857060245e-06, + "loss": 0.3861, + "step": 7094 + }, + { + "epoch": 3.8358262750045053, + "grad_norm": 0.2753232717514038, + "learning_rate": 1.5591402236778647e-06, + "loss": 0.3883, + "step": 7095 + }, + { + "epoch": 3.8363669129572897, + "grad_norm": 0.31216442584991455, + "learning_rate": 1.557770652241255e-06, + "loss": 0.3798, + "step": 7096 + }, + { + "epoch": 3.836907550910074, + "grad_norm": 0.2636694610118866, + "learning_rate": 1.5564015715914627e-06, + "loss": 0.3922, + "step": 7097 + }, + { + "epoch": 3.837448188862858, + "grad_norm": 0.2814835011959076, + "learning_rate": 1.5550329819236926e-06, + "loss": 0.3713, + "step": 7098 + }, + { + "epoch": 3.8379888268156424, + "grad_norm": 0.29137900471687317, + "learning_rate": 1.5536648834330736e-06, + "loss": 0.3797, + "step": 7099 + }, + { + "epoch": 3.8385294647684267, + "grad_norm": 0.2699742615222931, + "learning_rate": 1.5522972763146653e-06, + "loss": 0.3661, + "step": 7100 + }, + { + "epoch": 3.839070102721211, + "grad_norm": 0.2807996869087219, + "learning_rate": 1.550930160763462e-06, + "loss": 0.3548, + "step": 7101 + }, + { + "epoch": 3.8396107406739954, + "grad_norm": 0.30388930439949036, + "learning_rate": 1.5495635369743812e-06, + "loss": 0.385, + "step": 7102 + }, + { + "epoch": 3.8401513786267794, + "grad_norm": 0.2817404866218567, + "learning_rate": 1.548197405142277e-06, + "loss": 0.3692, + "step": 7103 + }, + { + "epoch": 3.840692016579564, + "grad_norm": 0.26451969146728516, + "learning_rate": 1.546831765461928e-06, + "loss": 0.3753, + "step": 7104 + }, + { + "epoch": 3.841232654532348, + "grad_norm": 0.27831169962882996, + "learning_rate": 1.5454666181280437e-06, + "loss": 0.3756, + "step": 7105 + }, + { + "epoch": 3.8417732924851324, + "grad_norm": 0.2777852416038513, + "learning_rate": 1.5441019633352666e-06, + "loss": 0.3823, + "step": 7106 + }, + { + "epoch": 3.842313930437917, + "grad_norm": 0.2742907702922821, + "learning_rate": 1.5427378012781657e-06, + "loss": 0.3715, + "step": 7107 + }, + { + "epoch": 3.842854568390701, + "grad_norm": 0.2902372479438782, + "learning_rate": 1.5413741321512394e-06, + "loss": 0.412, + "step": 7108 + }, + { + "epoch": 3.8433952063434855, + "grad_norm": 0.27315741777420044, + "learning_rate": 1.5400109561489196e-06, + "loss": 0.3911, + "step": 7109 + }, + { + "epoch": 3.8439358442962694, + "grad_norm": 0.251729816198349, + "learning_rate": 1.5386482734655633e-06, + "loss": 0.3626, + "step": 7110 + }, + { + "epoch": 3.844476482249054, + "grad_norm": 0.27802354097366333, + "learning_rate": 1.5372860842954629e-06, + "loss": 0.4074, + "step": 7111 + }, + { + "epoch": 3.845017120201838, + "grad_norm": 0.28890305757522583, + "learning_rate": 1.5359243888328317e-06, + "loss": 0.3614, + "step": 7112 + }, + { + "epoch": 3.8455577581546225, + "grad_norm": 0.25736257433891296, + "learning_rate": 1.5345631872718214e-06, + "loss": 0.377, + "step": 7113 + }, + { + "epoch": 3.846098396107407, + "grad_norm": 0.24798278510570526, + "learning_rate": 1.5332024798065077e-06, + "loss": 0.3553, + "step": 7114 + }, + { + "epoch": 3.846639034060191, + "grad_norm": 0.25928983092308044, + "learning_rate": 1.5318422666308997e-06, + "loss": 0.3716, + "step": 7115 + }, + { + "epoch": 3.847179672012975, + "grad_norm": 0.2826147973537445, + "learning_rate": 1.5304825479389334e-06, + "loss": 0.3605, + "step": 7116 + }, + { + "epoch": 3.8477203099657595, + "grad_norm": 0.2706470787525177, + "learning_rate": 1.5291233239244728e-06, + "loss": 0.3676, + "step": 7117 + }, + { + "epoch": 3.848260947918544, + "grad_norm": 0.25623151659965515, + "learning_rate": 1.527764594781318e-06, + "loss": 0.3903, + "step": 7118 + }, + { + "epoch": 3.8488015858713283, + "grad_norm": 0.30254796147346497, + "learning_rate": 1.526406360703191e-06, + "loss": 0.3538, + "step": 7119 + }, + { + "epoch": 3.8493422238241126, + "grad_norm": 0.2859158515930176, + "learning_rate": 1.5250486218837458e-06, + "loss": 0.3651, + "step": 7120 + }, + { + "epoch": 3.849882861776897, + "grad_norm": 0.2832469940185547, + "learning_rate": 1.5236913785165692e-06, + "loss": 0.4027, + "step": 7121 + }, + { + "epoch": 3.850423499729681, + "grad_norm": 0.25426462292671204, + "learning_rate": 1.5223346307951713e-06, + "loss": 0.3634, + "step": 7122 + }, + { + "epoch": 3.8509641376824653, + "grad_norm": 0.27135756611824036, + "learning_rate": 1.5209783789129995e-06, + "loss": 0.3682, + "step": 7123 + }, + { + "epoch": 3.8515047756352496, + "grad_norm": 0.2907152473926544, + "learning_rate": 1.5196226230634193e-06, + "loss": 0.3905, + "step": 7124 + }, + { + "epoch": 3.852045413588034, + "grad_norm": 0.2654511630535126, + "learning_rate": 1.5182673634397365e-06, + "loss": 0.3814, + "step": 7125 + }, + { + "epoch": 3.8525860515408183, + "grad_norm": 0.26027148962020874, + "learning_rate": 1.5169126002351791e-06, + "loss": 0.3711, + "step": 7126 + }, + { + "epoch": 3.8531266894936023, + "grad_norm": 0.27235719561576843, + "learning_rate": 1.5155583336429097e-06, + "loss": 0.3989, + "step": 7127 + }, + { + "epoch": 3.8536673274463866, + "grad_norm": 0.27417096495628357, + "learning_rate": 1.5142045638560149e-06, + "loss": 0.3799, + "step": 7128 + }, + { + "epoch": 3.854207965399171, + "grad_norm": 0.28646403551101685, + "learning_rate": 1.5128512910675119e-06, + "loss": 0.3887, + "step": 7129 + }, + { + "epoch": 3.8547486033519553, + "grad_norm": 0.2741946876049042, + "learning_rate": 1.5114985154703505e-06, + "loss": 0.3782, + "step": 7130 + }, + { + "epoch": 3.8552892413047397, + "grad_norm": 0.28979212045669556, + "learning_rate": 1.510146237257406e-06, + "loss": 0.3976, + "step": 7131 + }, + { + "epoch": 3.8558298792575236, + "grad_norm": 0.26713863015174866, + "learning_rate": 1.508794456621482e-06, + "loss": 0.3836, + "step": 7132 + }, + { + "epoch": 3.8563705172103084, + "grad_norm": 0.2656858265399933, + "learning_rate": 1.5074431737553158e-06, + "loss": 0.3975, + "step": 7133 + }, + { + "epoch": 3.8569111551630924, + "grad_norm": 0.2840319573879242, + "learning_rate": 1.5060923888515677e-06, + "loss": 0.3587, + "step": 7134 + }, + { + "epoch": 3.8574517931158767, + "grad_norm": 0.2595888376235962, + "learning_rate": 1.5047421021028353e-06, + "loss": 0.3604, + "step": 7135 + }, + { + "epoch": 3.857992431068661, + "grad_norm": 0.2776036858558655, + "learning_rate": 1.5033923137016336e-06, + "loss": 0.3752, + "step": 7136 + }, + { + "epoch": 3.8585330690214454, + "grad_norm": 0.27982330322265625, + "learning_rate": 1.502043023840416e-06, + "loss": 0.3837, + "step": 7137 + }, + { + "epoch": 3.85907370697423, + "grad_norm": 0.2833450138568878, + "learning_rate": 1.5006942327115637e-06, + "loss": 0.3706, + "step": 7138 + }, + { + "epoch": 3.8596143449270137, + "grad_norm": 0.2692069411277771, + "learning_rate": 1.4993459405073825e-06, + "loss": 0.365, + "step": 7139 + }, + { + "epoch": 3.860154982879798, + "grad_norm": 0.26060062646865845, + "learning_rate": 1.4979981474201106e-06, + "loss": 0.3778, + "step": 7140 + }, + { + "epoch": 3.8606956208325824, + "grad_norm": 0.2728719711303711, + "learning_rate": 1.4966508536419111e-06, + "loss": 0.3705, + "step": 7141 + }, + { + "epoch": 3.861236258785367, + "grad_norm": 0.2779045104980469, + "learning_rate": 1.4953040593648833e-06, + "loss": 0.3998, + "step": 7142 + }, + { + "epoch": 3.861776896738151, + "grad_norm": 0.2656700313091278, + "learning_rate": 1.4939577647810477e-06, + "loss": 0.3637, + "step": 7143 + }, + { + "epoch": 3.862317534690935, + "grad_norm": 0.28458893299102783, + "learning_rate": 1.4926119700823554e-06, + "loss": 0.3571, + "step": 7144 + }, + { + "epoch": 3.8628581726437194, + "grad_norm": 0.2771691679954529, + "learning_rate": 1.4912666754606914e-06, + "loss": 0.3818, + "step": 7145 + }, + { + "epoch": 3.863398810596504, + "grad_norm": 0.2585548460483551, + "learning_rate": 1.489921881107861e-06, + "loss": 0.3797, + "step": 7146 + }, + { + "epoch": 3.863939448549288, + "grad_norm": 0.2747470736503601, + "learning_rate": 1.488577587215606e-06, + "loss": 0.395, + "step": 7147 + }, + { + "epoch": 3.8644800865020725, + "grad_norm": 0.27049556374549866, + "learning_rate": 1.4872337939755926e-06, + "loss": 0.3557, + "step": 7148 + }, + { + "epoch": 3.865020724454857, + "grad_norm": 0.27144232392311096, + "learning_rate": 1.485890501579414e-06, + "loss": 0.408, + "step": 7149 + }, + { + "epoch": 3.8655613624076413, + "grad_norm": 0.26395317912101746, + "learning_rate": 1.4845477102185974e-06, + "loss": 0.3795, + "step": 7150 + }, + { + "epoch": 3.866102000360425, + "grad_norm": 0.2912643551826477, + "learning_rate": 1.4832054200845947e-06, + "loss": 0.386, + "step": 7151 + }, + { + "epoch": 3.8666426383132095, + "grad_norm": 0.2866555452346802, + "learning_rate": 1.4818636313687868e-06, + "loss": 0.3854, + "step": 7152 + }, + { + "epoch": 3.867183276265994, + "grad_norm": 0.2535575330257416, + "learning_rate": 1.4805223442624818e-06, + "loss": 0.3741, + "step": 7153 + }, + { + "epoch": 3.8677239142187783, + "grad_norm": 0.2779158353805542, + "learning_rate": 1.4791815589569215e-06, + "loss": 0.3796, + "step": 7154 + }, + { + "epoch": 3.8682645521715626, + "grad_norm": 0.29194313287734985, + "learning_rate": 1.4778412756432709e-06, + "loss": 0.3818, + "step": 7155 + }, + { + "epoch": 3.8688051901243465, + "grad_norm": 0.29119977355003357, + "learning_rate": 1.4765014945126232e-06, + "loss": 0.3792, + "step": 7156 + }, + { + "epoch": 3.869345828077131, + "grad_norm": 0.27721136808395386, + "learning_rate": 1.4751622157560065e-06, + "loss": 0.4054, + "step": 7157 + }, + { + "epoch": 3.8698864660299153, + "grad_norm": 0.2895916998386383, + "learning_rate": 1.4738234395643674e-06, + "loss": 0.392, + "step": 7158 + }, + { + "epoch": 3.8704271039826996, + "grad_norm": 0.2725016474723816, + "learning_rate": 1.472485166128591e-06, + "loss": 0.3713, + "step": 7159 + }, + { + "epoch": 3.870967741935484, + "grad_norm": 0.2660664916038513, + "learning_rate": 1.471147395639484e-06, + "loss": 0.3855, + "step": 7160 + }, + { + "epoch": 3.871508379888268, + "grad_norm": 0.27325960993766785, + "learning_rate": 1.4698101282877813e-06, + "loss": 0.3721, + "step": 7161 + }, + { + "epoch": 3.8720490178410527, + "grad_norm": 0.2715884745121002, + "learning_rate": 1.4684733642641514e-06, + "loss": 0.3776, + "step": 7162 + }, + { + "epoch": 3.8725896557938366, + "grad_norm": 0.2671349048614502, + "learning_rate": 1.4671371037591864e-06, + "loss": 0.3739, + "step": 7163 + }, + { + "epoch": 3.873130293746621, + "grad_norm": 0.287344753742218, + "learning_rate": 1.4658013469634075e-06, + "loss": 0.3593, + "step": 7164 + }, + { + "epoch": 3.8736709316994054, + "grad_norm": 0.2602589428424835, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.3675, + "step": 7165 + }, + { + "epoch": 3.8742115696521897, + "grad_norm": 0.2793913781642914, + "learning_rate": 1.4631313452611323e-06, + "loss": 0.3722, + "step": 7166 + }, + { + "epoch": 3.874752207604974, + "grad_norm": 0.2809554636478424, + "learning_rate": 1.4617971007353237e-06, + "loss": 0.3817, + "step": 7167 + }, + { + "epoch": 3.875292845557758, + "grad_norm": 0.282959908246994, + "learning_rate": 1.4604633606800689e-06, + "loss": 0.378, + "step": 7168 + }, + { + "epoch": 3.8758334835105424, + "grad_norm": 0.2529660165309906, + "learning_rate": 1.4591301252855306e-06, + "loss": 0.3772, + "step": 7169 + }, + { + "epoch": 3.8763741214633267, + "grad_norm": 0.2665364444255829, + "learning_rate": 1.457797394741798e-06, + "loss": 0.3603, + "step": 7170 + }, + { + "epoch": 3.876914759416111, + "grad_norm": 0.28510573506355286, + "learning_rate": 1.4564651692388916e-06, + "loss": 0.3705, + "step": 7171 + }, + { + "epoch": 3.8774553973688954, + "grad_norm": 0.26954007148742676, + "learning_rate": 1.4551334489667562e-06, + "loss": 0.3949, + "step": 7172 + }, + { + "epoch": 3.8779960353216794, + "grad_norm": 0.28254538774490356, + "learning_rate": 1.4538022341152653e-06, + "loss": 0.3858, + "step": 7173 + }, + { + "epoch": 3.8785366732744637, + "grad_norm": 0.27030500769615173, + "learning_rate": 1.4524715248742232e-06, + "loss": 0.3921, + "step": 7174 + }, + { + "epoch": 3.879077311227248, + "grad_norm": 0.2663464844226837, + "learning_rate": 1.451141321433358e-06, + "loss": 0.3831, + "step": 7175 + }, + { + "epoch": 3.8796179491800324, + "grad_norm": 0.28364238142967224, + "learning_rate": 1.4498116239823301e-06, + "loss": 0.3794, + "step": 7176 + }, + { + "epoch": 3.880158587132817, + "grad_norm": 0.2636932134628296, + "learning_rate": 1.448482432710724e-06, + "loss": 0.3896, + "step": 7177 + }, + { + "epoch": 3.880699225085601, + "grad_norm": 0.2695843577384949, + "learning_rate": 1.4471537478080516e-06, + "loss": 0.3908, + "step": 7178 + }, + { + "epoch": 3.8812398630383855, + "grad_norm": 0.30649521946907043, + "learning_rate": 1.445825569463758e-06, + "loss": 0.3775, + "step": 7179 + }, + { + "epoch": 3.8817805009911694, + "grad_norm": 0.28447046875953674, + "learning_rate": 1.4444978978672103e-06, + "loss": 0.3681, + "step": 7180 + }, + { + "epoch": 3.882321138943954, + "grad_norm": 0.271747350692749, + "learning_rate": 1.443170733207706e-06, + "loss": 0.3762, + "step": 7181 + }, + { + "epoch": 3.882861776896738, + "grad_norm": 0.26996535062789917, + "learning_rate": 1.4418440756744678e-06, + "loss": 0.3616, + "step": 7182 + }, + { + "epoch": 3.8834024148495225, + "grad_norm": 0.25783035159111023, + "learning_rate": 1.4405179254566515e-06, + "loss": 0.3727, + "step": 7183 + }, + { + "epoch": 3.883943052802307, + "grad_norm": 0.2718077600002289, + "learning_rate": 1.4391922827433359e-06, + "loss": 0.3654, + "step": 7184 + }, + { + "epoch": 3.884483690755091, + "grad_norm": 0.2731360197067261, + "learning_rate": 1.4378671477235268e-06, + "loss": 0.3733, + "step": 7185 + }, + { + "epoch": 3.885024328707875, + "grad_norm": 0.2789834439754486, + "learning_rate": 1.4365425205861627e-06, + "loss": 0.3751, + "step": 7186 + }, + { + "epoch": 3.8855649666606595, + "grad_norm": 0.2919517755508423, + "learning_rate": 1.4352184015201036e-06, + "loss": 0.364, + "step": 7187 + }, + { + "epoch": 3.886105604613444, + "grad_norm": 0.2683340609073639, + "learning_rate": 1.4338947907141431e-06, + "loss": 0.3696, + "step": 7188 + }, + { + "epoch": 3.8866462425662283, + "grad_norm": 0.267026424407959, + "learning_rate": 1.4325716883569973e-06, + "loss": 0.3792, + "step": 7189 + }, + { + "epoch": 3.887186880519012, + "grad_norm": 0.26800042390823364, + "learning_rate": 1.431249094637311e-06, + "loss": 0.3877, + "step": 7190 + }, + { + "epoch": 3.887727518471797, + "grad_norm": 0.29322460293769836, + "learning_rate": 1.429927009743659e-06, + "loss": 0.4035, + "step": 7191 + }, + { + "epoch": 3.888268156424581, + "grad_norm": 0.2647303640842438, + "learning_rate": 1.4286054338645416e-06, + "loss": 0.3989, + "step": 7192 + }, + { + "epoch": 3.8888087943773653, + "grad_norm": 0.28105291724205017, + "learning_rate": 1.4272843671883857e-06, + "loss": 0.3661, + "step": 7193 + }, + { + "epoch": 3.8893494323301496, + "grad_norm": 0.2671509385108948, + "learning_rate": 1.4259638099035456e-06, + "loss": 0.3713, + "step": 7194 + }, + { + "epoch": 3.889890070282934, + "grad_norm": 0.286563515663147, + "learning_rate": 1.4246437621983057e-06, + "loss": 0.3491, + "step": 7195 + }, + { + "epoch": 3.8904307082357183, + "grad_norm": 0.3068971335887909, + "learning_rate": 1.423324224260878e-06, + "loss": 0.3804, + "step": 7196 + }, + { + "epoch": 3.8909713461885023, + "grad_norm": 0.258400559425354, + "learning_rate": 1.4220051962793952e-06, + "loss": 0.3483, + "step": 7197 + }, + { + "epoch": 3.8915119841412866, + "grad_norm": 0.25736966729164124, + "learning_rate": 1.4206866784419248e-06, + "loss": 0.3508, + "step": 7198 + }, + { + "epoch": 3.892052622094071, + "grad_norm": 0.27688178420066833, + "learning_rate": 1.4193686709364574e-06, + "loss": 0.4045, + "step": 7199 + }, + { + "epoch": 3.8925932600468554, + "grad_norm": 0.2596736252307892, + "learning_rate": 1.418051173950914e-06, + "loss": 0.384, + "step": 7200 + }, + { + "epoch": 3.8931338979996397, + "grad_norm": 0.29413244128227234, + "learning_rate": 1.41673418767314e-06, + "loss": 0.3846, + "step": 7201 + }, + { + "epoch": 3.8936745359524236, + "grad_norm": 0.29847484827041626, + "learning_rate": 1.4154177122909068e-06, + "loss": 0.3643, + "step": 7202 + }, + { + "epoch": 3.894215173905208, + "grad_norm": 0.2602514326572418, + "learning_rate": 1.4141017479919184e-06, + "loss": 0.3579, + "step": 7203 + }, + { + "epoch": 3.8947558118579924, + "grad_norm": 0.27241677045822144, + "learning_rate": 1.412786294963801e-06, + "loss": 0.3588, + "step": 7204 + }, + { + "epoch": 3.8952964498107767, + "grad_norm": 0.26186132431030273, + "learning_rate": 1.4114713533941082e-06, + "loss": 0.3478, + "step": 7205 + }, + { + "epoch": 3.895837087763561, + "grad_norm": 0.29094818234443665, + "learning_rate": 1.4101569234703256e-06, + "loss": 0.4012, + "step": 7206 + }, + { + "epoch": 3.8963777257163454, + "grad_norm": 0.2835518717765808, + "learning_rate": 1.408843005379858e-06, + "loss": 0.3631, + "step": 7207 + }, + { + "epoch": 3.89691836366913, + "grad_norm": 0.28836387395858765, + "learning_rate": 1.4075295993100462e-06, + "loss": 0.3722, + "step": 7208 + }, + { + "epoch": 3.8974590016219137, + "grad_norm": 0.27291521430015564, + "learning_rate": 1.4062167054481479e-06, + "loss": 0.3616, + "step": 7209 + }, + { + "epoch": 3.897999639574698, + "grad_norm": 0.2893563210964203, + "learning_rate": 1.4049043239813575e-06, + "loss": 0.3706, + "step": 7210 + }, + { + "epoch": 3.8985402775274824, + "grad_norm": 0.26893335580825806, + "learning_rate": 1.4035924550967888e-06, + "loss": 0.3485, + "step": 7211 + }, + { + "epoch": 3.899080915480267, + "grad_norm": 0.28393498063087463, + "learning_rate": 1.4022810989814884e-06, + "loss": 0.3735, + "step": 7212 + }, + { + "epoch": 3.899621553433051, + "grad_norm": 0.27619096636772156, + "learning_rate": 1.4009702558224258e-06, + "loss": 0.3656, + "step": 7213 + }, + { + "epoch": 3.900162191385835, + "grad_norm": 0.256229043006897, + "learning_rate": 1.3996599258064968e-06, + "loss": 0.3792, + "step": 7214 + }, + { + "epoch": 3.9007028293386194, + "grad_norm": 0.2728364169597626, + "learning_rate": 1.3983501091205298e-06, + "loss": 0.3768, + "step": 7215 + }, + { + "epoch": 3.901243467291404, + "grad_norm": 0.27944228053092957, + "learning_rate": 1.3970408059512741e-06, + "loss": 0.3658, + "step": 7216 + }, + { + "epoch": 3.901784105244188, + "grad_norm": 0.2501264214515686, + "learning_rate": 1.395732016485406e-06, + "loss": 0.3527, + "step": 7217 + }, + { + "epoch": 3.9023247431969725, + "grad_norm": 0.26919788122177124, + "learning_rate": 1.3944237409095335e-06, + "loss": 0.3738, + "step": 7218 + }, + { + "epoch": 3.9028653811497565, + "grad_norm": 0.2669914662837982, + "learning_rate": 1.3931159794101855e-06, + "loss": 0.3725, + "step": 7219 + }, + { + "epoch": 3.9034060191025413, + "grad_norm": 0.27003178000450134, + "learning_rate": 1.3918087321738244e-06, + "loss": 0.3713, + "step": 7220 + }, + { + "epoch": 3.903946657055325, + "grad_norm": 0.2867528200149536, + "learning_rate": 1.3905019993868285e-06, + "loss": 0.3562, + "step": 7221 + }, + { + "epoch": 3.9044872950081095, + "grad_norm": 0.2855518162250519, + "learning_rate": 1.3891957812355156e-06, + "loss": 0.3729, + "step": 7222 + }, + { + "epoch": 3.905027932960894, + "grad_norm": 0.2666948437690735, + "learning_rate": 1.3878900779061194e-06, + "loss": 0.3712, + "step": 7223 + }, + { + "epoch": 3.9055685709136783, + "grad_norm": 0.27920854091644287, + "learning_rate": 1.3865848895848078e-06, + "loss": 0.3908, + "step": 7224 + }, + { + "epoch": 3.9061092088664626, + "grad_norm": 0.28267914056777954, + "learning_rate": 1.3852802164576717e-06, + "loss": 0.3775, + "step": 7225 + }, + { + "epoch": 3.9066498468192465, + "grad_norm": 0.2642427086830139, + "learning_rate": 1.3839760587107271e-06, + "loss": 0.3928, + "step": 7226 + }, + { + "epoch": 3.907190484772031, + "grad_norm": 0.2898047864437103, + "learning_rate": 1.3826724165299205e-06, + "loss": 0.3631, + "step": 7227 + }, + { + "epoch": 3.9077311227248153, + "grad_norm": 0.2983902394771576, + "learning_rate": 1.3813692901011228e-06, + "loss": 0.4023, + "step": 7228 + }, + { + "epoch": 3.9082717606775996, + "grad_norm": 0.2528410255908966, + "learning_rate": 1.3800666796101291e-06, + "loss": 0.379, + "step": 7229 + }, + { + "epoch": 3.908812398630384, + "grad_norm": 0.2616298794746399, + "learning_rate": 1.3787645852426663e-06, + "loss": 0.3869, + "step": 7230 + }, + { + "epoch": 3.909353036583168, + "grad_norm": 0.26782476902008057, + "learning_rate": 1.3774630071843814e-06, + "loss": 0.401, + "step": 7231 + }, + { + "epoch": 3.9098936745359523, + "grad_norm": 0.26978132128715515, + "learning_rate": 1.3761619456208548e-06, + "loss": 0.364, + "step": 7232 + }, + { + "epoch": 3.9104343124887366, + "grad_norm": 0.27226102352142334, + "learning_rate": 1.374861400737587e-06, + "loss": 0.3954, + "step": 7233 + }, + { + "epoch": 3.910974950441521, + "grad_norm": 0.27029043436050415, + "learning_rate": 1.373561372720007e-06, + "loss": 0.3548, + "step": 7234 + }, + { + "epoch": 3.9115155883943054, + "grad_norm": 0.2666410207748413, + "learning_rate": 1.3722618617534727e-06, + "loss": 0.3621, + "step": 7235 + }, + { + "epoch": 3.9120562263470897, + "grad_norm": 0.2646237015724182, + "learning_rate": 1.3709628680232628e-06, + "loss": 0.3803, + "step": 7236 + }, + { + "epoch": 3.912596864299874, + "grad_norm": 0.29291343688964844, + "learning_rate": 1.3696643917145908e-06, + "loss": 0.3845, + "step": 7237 + }, + { + "epoch": 3.913137502252658, + "grad_norm": 0.2636338174343109, + "learning_rate": 1.3683664330125846e-06, + "loss": 0.3754, + "step": 7238 + }, + { + "epoch": 3.9136781402054424, + "grad_norm": 0.25800561904907227, + "learning_rate": 1.3670689921023088e-06, + "loss": 0.37, + "step": 7239 + }, + { + "epoch": 3.9142187781582267, + "grad_norm": 0.2600545883178711, + "learning_rate": 1.3657720691687481e-06, + "loss": 0.386, + "step": 7240 + }, + { + "epoch": 3.914759416111011, + "grad_norm": 0.2758975327014923, + "learning_rate": 1.3644756643968183e-06, + "loss": 0.3843, + "step": 7241 + }, + { + "epoch": 3.9153000540637954, + "grad_norm": 0.2715599536895752, + "learning_rate": 1.3631797779713557e-06, + "loss": 0.3434, + "step": 7242 + }, + { + "epoch": 3.9158406920165794, + "grad_norm": 0.261996328830719, + "learning_rate": 1.3618844100771256e-06, + "loss": 0.3837, + "step": 7243 + }, + { + "epoch": 3.9163813299693637, + "grad_norm": 0.28563106060028076, + "learning_rate": 1.3605895608988212e-06, + "loss": 0.3657, + "step": 7244 + }, + { + "epoch": 3.916921967922148, + "grad_norm": 0.29061299562454224, + "learning_rate": 1.3592952306210589e-06, + "loss": 0.362, + "step": 7245 + }, + { + "epoch": 3.9174626058749324, + "grad_norm": 0.2673431634902954, + "learning_rate": 1.3580014194283796e-06, + "loss": 0.3625, + "step": 7246 + }, + { + "epoch": 3.918003243827717, + "grad_norm": 0.2761494815349579, + "learning_rate": 1.3567081275052562e-06, + "loss": 0.3673, + "step": 7247 + }, + { + "epoch": 3.9185438817805007, + "grad_norm": 0.29554659128189087, + "learning_rate": 1.355415355036081e-06, + "loss": 0.3734, + "step": 7248 + }, + { + "epoch": 3.9190845197332855, + "grad_norm": 0.2864658534526825, + "learning_rate": 1.3541231022051794e-06, + "loss": 0.3463, + "step": 7249 + }, + { + "epoch": 3.9196251576860694, + "grad_norm": 0.276905357837677, + "learning_rate": 1.3528313691967926e-06, + "loss": 0.38, + "step": 7250 + }, + { + "epoch": 3.920165795638854, + "grad_norm": 0.2607681155204773, + "learning_rate": 1.3515401561950974e-06, + "loss": 0.3925, + "step": 7251 + }, + { + "epoch": 3.920706433591638, + "grad_norm": 0.2601732015609741, + "learning_rate": 1.3502494633841906e-06, + "loss": 0.3958, + "step": 7252 + }, + { + "epoch": 3.9212470715444225, + "grad_norm": 0.25898584723472595, + "learning_rate": 1.3489592909480993e-06, + "loss": 0.3639, + "step": 7253 + }, + { + "epoch": 3.921787709497207, + "grad_norm": 0.2667955458164215, + "learning_rate": 1.3476696390707727e-06, + "loss": 0.4018, + "step": 7254 + }, + { + "epoch": 3.922328347449991, + "grad_norm": 0.2603578567504883, + "learning_rate": 1.3463805079360854e-06, + "loss": 0.3357, + "step": 7255 + }, + { + "epoch": 3.922868985402775, + "grad_norm": 0.28907307982444763, + "learning_rate": 1.345091897727842e-06, + "loss": 0.398, + "step": 7256 + }, + { + "epoch": 3.9234096233555595, + "grad_norm": 0.28409260511398315, + "learning_rate": 1.343803808629769e-06, + "loss": 0.3638, + "step": 7257 + }, + { + "epoch": 3.923950261308344, + "grad_norm": 0.26542195677757263, + "learning_rate": 1.3425162408255188e-06, + "loss": 0.3674, + "step": 7258 + }, + { + "epoch": 3.9244908992611283, + "grad_norm": 0.26268041133880615, + "learning_rate": 1.3412291944986726e-06, + "loss": 0.3769, + "step": 7259 + }, + { + "epoch": 3.925031537213912, + "grad_norm": 0.2736770808696747, + "learning_rate": 1.3399426698327329e-06, + "loss": 0.3651, + "step": 7260 + }, + { + "epoch": 3.9255721751666965, + "grad_norm": 0.27217063307762146, + "learning_rate": 1.3386566670111339e-06, + "loss": 0.36, + "step": 7261 + }, + { + "epoch": 3.926112813119481, + "grad_norm": 0.2648680508136749, + "learning_rate": 1.3373711862172262e-06, + "loss": 0.3919, + "step": 7262 + }, + { + "epoch": 3.9266534510722653, + "grad_norm": 0.2955801486968994, + "learning_rate": 1.336086227634294e-06, + "loss": 0.3488, + "step": 7263 + }, + { + "epoch": 3.9271940890250496, + "grad_norm": 0.2573787569999695, + "learning_rate": 1.3348017914455458e-06, + "loss": 0.3637, + "step": 7264 + }, + { + "epoch": 3.927734726977834, + "grad_norm": 0.25642362236976624, + "learning_rate": 1.3335178778341123e-06, + "loss": 0.3641, + "step": 7265 + }, + { + "epoch": 3.9282753649306184, + "grad_norm": 0.2628449499607086, + "learning_rate": 1.3322344869830528e-06, + "loss": 0.3773, + "step": 7266 + }, + { + "epoch": 3.9288160028834023, + "grad_norm": 0.277749627828598, + "learning_rate": 1.330951619075348e-06, + "loss": 0.3815, + "step": 7267 + }, + { + "epoch": 3.9293566408361866, + "grad_norm": 0.27905476093292236, + "learning_rate": 1.3296692742939104e-06, + "loss": 0.3664, + "step": 7268 + }, + { + "epoch": 3.929897278788971, + "grad_norm": 0.25372523069381714, + "learning_rate": 1.3283874528215735e-06, + "loss": 0.3882, + "step": 7269 + }, + { + "epoch": 3.9304379167417554, + "grad_norm": 0.2904640734195709, + "learning_rate": 1.3271061548410947e-06, + "loss": 0.3734, + "step": 7270 + }, + { + "epoch": 3.9309785546945397, + "grad_norm": 0.2711356282234192, + "learning_rate": 1.3258253805351622e-06, + "loss": 0.3669, + "step": 7271 + }, + { + "epoch": 3.9315191926473236, + "grad_norm": 0.2662886679172516, + "learning_rate": 1.3245451300863842e-06, + "loss": 0.371, + "step": 7272 + }, + { + "epoch": 3.932059830600108, + "grad_norm": 0.2688289284706116, + "learning_rate": 1.323265403677299e-06, + "loss": 0.3582, + "step": 7273 + }, + { + "epoch": 3.9326004685528924, + "grad_norm": 0.26735720038414, + "learning_rate": 1.3219862014903663e-06, + "loss": 0.373, + "step": 7274 + }, + { + "epoch": 3.9331411065056767, + "grad_norm": 0.2805106043815613, + "learning_rate": 1.3207075237079702e-06, + "loss": 0.3777, + "step": 7275 + }, + { + "epoch": 3.933681744458461, + "grad_norm": 0.2979760766029358, + "learning_rate": 1.3194293705124262e-06, + "loss": 0.3886, + "step": 7276 + }, + { + "epoch": 3.934222382411245, + "grad_norm": 0.28045132756233215, + "learning_rate": 1.3181517420859696e-06, + "loss": 0.3878, + "step": 7277 + }, + { + "epoch": 3.93476302036403, + "grad_norm": 0.2650907635688782, + "learning_rate": 1.3168746386107617e-06, + "loss": 0.3503, + "step": 7278 + }, + { + "epoch": 3.9353036583168137, + "grad_norm": 0.27181705832481384, + "learning_rate": 1.3155980602688884e-06, + "loss": 0.3728, + "step": 7279 + }, + { + "epoch": 3.935844296269598, + "grad_norm": 0.2796936333179474, + "learning_rate": 1.3143220072423647e-06, + "loss": 0.3669, + "step": 7280 + }, + { + "epoch": 3.9363849342223824, + "grad_norm": 0.2801571190357208, + "learning_rate": 1.313046479713127e-06, + "loss": 0.3595, + "step": 7281 + }, + { + "epoch": 3.936925572175167, + "grad_norm": 0.25436931848526, + "learning_rate": 1.3117714778630358e-06, + "loss": 0.3857, + "step": 7282 + }, + { + "epoch": 3.937466210127951, + "grad_norm": 0.2766454815864563, + "learning_rate": 1.3104970018738812e-06, + "loss": 0.3805, + "step": 7283 + }, + { + "epoch": 3.938006848080735, + "grad_norm": 0.27439621090888977, + "learning_rate": 1.3092230519273736e-06, + "loss": 0.367, + "step": 7284 + }, + { + "epoch": 3.9385474860335195, + "grad_norm": 0.2701241970062256, + "learning_rate": 1.307949628205153e-06, + "loss": 0.3837, + "step": 7285 + }, + { + "epoch": 3.939088123986304, + "grad_norm": 0.26668402552604675, + "learning_rate": 1.3066767308887796e-06, + "loss": 0.3855, + "step": 7286 + }, + { + "epoch": 3.939628761939088, + "grad_norm": 0.27629777789115906, + "learning_rate": 1.3054043601597404e-06, + "loss": 0.3773, + "step": 7287 + }, + { + "epoch": 3.9401693998918725, + "grad_norm": 0.28795719146728516, + "learning_rate": 1.3041325161994506e-06, + "loss": 0.3897, + "step": 7288 + }, + { + "epoch": 3.9407100378446565, + "grad_norm": 0.2713862657546997, + "learning_rate": 1.3028611991892454e-06, + "loss": 0.3592, + "step": 7289 + }, + { + "epoch": 3.941250675797441, + "grad_norm": 0.25722599029541016, + "learning_rate": 1.301590409310387e-06, + "loss": 0.3506, + "step": 7290 + }, + { + "epoch": 3.941791313750225, + "grad_norm": 0.2745075821876526, + "learning_rate": 1.3003201467440607e-06, + "loss": 0.3731, + "step": 7291 + }, + { + "epoch": 3.9423319517030095, + "grad_norm": 0.27006328105926514, + "learning_rate": 1.2990504116713803e-06, + "loss": 0.3795, + "step": 7292 + }, + { + "epoch": 3.942872589655794, + "grad_norm": 0.2963608503341675, + "learning_rate": 1.297781204273385e-06, + "loss": 0.3752, + "step": 7293 + }, + { + "epoch": 3.9434132276085783, + "grad_norm": 0.2789985239505768, + "learning_rate": 1.2965125247310296e-06, + "loss": 0.3512, + "step": 7294 + }, + { + "epoch": 3.9439538655613626, + "grad_norm": 0.2877563536167145, + "learning_rate": 1.2952443732252058e-06, + "loss": 0.3747, + "step": 7295 + }, + { + "epoch": 3.9444945035141465, + "grad_norm": 0.3294397294521332, + "learning_rate": 1.29397674993672e-06, + "loss": 0.3789, + "step": 7296 + }, + { + "epoch": 3.945035141466931, + "grad_norm": 0.2651817500591278, + "learning_rate": 1.2927096550463114e-06, + "loss": 0.3737, + "step": 7297 + }, + { + "epoch": 3.9455757794197153, + "grad_norm": 0.26083993911743164, + "learning_rate": 1.2914430887346385e-06, + "loss": 0.3838, + "step": 7298 + }, + { + "epoch": 3.9461164173724996, + "grad_norm": 0.27897948026657104, + "learning_rate": 1.2901770511822843e-06, + "loss": 0.3788, + "step": 7299 + }, + { + "epoch": 3.946657055325284, + "grad_norm": 0.29251763224601746, + "learning_rate": 1.2889115425697612e-06, + "loss": 0.3824, + "step": 7300 + }, + { + "epoch": 3.947197693278068, + "grad_norm": 0.2713179290294647, + "learning_rate": 1.287646563077501e-06, + "loss": 0.372, + "step": 7301 + }, + { + "epoch": 3.9477383312308523, + "grad_norm": 0.2765255272388458, + "learning_rate": 1.2863821128858633e-06, + "loss": 0.4016, + "step": 7302 + }, + { + "epoch": 3.9482789691836366, + "grad_norm": 0.26739639043807983, + "learning_rate": 1.2851181921751316e-06, + "loss": 0.3687, + "step": 7303 + }, + { + "epoch": 3.948819607136421, + "grad_norm": 0.29228445887565613, + "learning_rate": 1.283854801125511e-06, + "loss": 0.3475, + "step": 7304 + }, + { + "epoch": 3.9493602450892054, + "grad_norm": 0.26782652735710144, + "learning_rate": 1.282591939917136e-06, + "loss": 0.4014, + "step": 7305 + }, + { + "epoch": 3.9499008830419893, + "grad_norm": 0.25561168789863586, + "learning_rate": 1.2813296087300625e-06, + "loss": 0.3914, + "step": 7306 + }, + { + "epoch": 3.950441520994774, + "grad_norm": 0.29365113377571106, + "learning_rate": 1.2800678077442707e-06, + "loss": 0.3918, + "step": 7307 + }, + { + "epoch": 3.950982158947558, + "grad_norm": 0.2888663113117218, + "learning_rate": 1.2788065371396652e-06, + "loss": 0.374, + "step": 7308 + }, + { + "epoch": 3.9515227969003424, + "grad_norm": 0.2578252851963043, + "learning_rate": 1.2775457970960765e-06, + "loss": 0.3657, + "step": 7309 + }, + { + "epoch": 3.9520634348531267, + "grad_norm": 0.2820899188518524, + "learning_rate": 1.2762855877932617e-06, + "loss": 0.3671, + "step": 7310 + }, + { + "epoch": 3.952604072805911, + "grad_norm": 0.26194140315055847, + "learning_rate": 1.275025909410893e-06, + "loss": 0.3794, + "step": 7311 + }, + { + "epoch": 3.9531447107586954, + "grad_norm": 0.27738311886787415, + "learning_rate": 1.2737667621285782e-06, + "loss": 0.4047, + "step": 7312 + }, + { + "epoch": 3.9536853487114794, + "grad_norm": 0.268296480178833, + "learning_rate": 1.272508146125841e-06, + "loss": 0.3875, + "step": 7313 + }, + { + "epoch": 3.9542259866642637, + "grad_norm": 0.27902549505233765, + "learning_rate": 1.2712500615821348e-06, + "loss": 0.3637, + "step": 7314 + }, + { + "epoch": 3.954766624617048, + "grad_norm": 0.2829354703426361, + "learning_rate": 1.269992508676835e-06, + "loss": 0.345, + "step": 7315 + }, + { + "epoch": 3.9553072625698324, + "grad_norm": 0.26511573791503906, + "learning_rate": 1.2687354875892382e-06, + "loss": 0.3737, + "step": 7316 + }, + { + "epoch": 3.955847900522617, + "grad_norm": 0.2595377266407013, + "learning_rate": 1.2674789984985725e-06, + "loss": 0.3678, + "step": 7317 + }, + { + "epoch": 3.9563885384754007, + "grad_norm": 0.27837511897087097, + "learning_rate": 1.2662230415839831e-06, + "loss": 0.3793, + "step": 7318 + }, + { + "epoch": 3.9569291764281855, + "grad_norm": 0.2738182246685028, + "learning_rate": 1.2649676170245433e-06, + "loss": 0.3769, + "step": 7319 + }, + { + "epoch": 3.9574698143809695, + "grad_norm": 0.2933426797389984, + "learning_rate": 1.2637127249992465e-06, + "loss": 0.4089, + "step": 7320 + }, + { + "epoch": 3.958010452333754, + "grad_norm": 0.2945869266986847, + "learning_rate": 1.2624583656870153e-06, + "loss": 0.3705, + "step": 7321 + }, + { + "epoch": 3.958551090286538, + "grad_norm": 0.2781412899494171, + "learning_rate": 1.2612045392666965e-06, + "loss": 0.3952, + "step": 7322 + }, + { + "epoch": 3.9590917282393225, + "grad_norm": 0.28177520632743835, + "learning_rate": 1.2599512459170532e-06, + "loss": 0.3698, + "step": 7323 + }, + { + "epoch": 3.959632366192107, + "grad_norm": 0.257065087556839, + "learning_rate": 1.2586984858167812e-06, + "loss": 0.3961, + "step": 7324 + }, + { + "epoch": 3.960173004144891, + "grad_norm": 0.27955007553100586, + "learning_rate": 1.257446259144494e-06, + "loss": 0.3787, + "step": 7325 + }, + { + "epoch": 3.960713642097675, + "grad_norm": 0.27634406089782715, + "learning_rate": 1.2561945660787357e-06, + "loss": 0.3677, + "step": 7326 + }, + { + "epoch": 3.9612542800504595, + "grad_norm": 0.2737906575202942, + "learning_rate": 1.2549434067979677e-06, + "loss": 0.4032, + "step": 7327 + }, + { + "epoch": 3.961794918003244, + "grad_norm": 0.2561657428741455, + "learning_rate": 1.2536927814805772e-06, + "loss": 0.3641, + "step": 7328 + }, + { + "epoch": 3.9623355559560283, + "grad_norm": 0.2667194604873657, + "learning_rate": 1.2524426903048786e-06, + "loss": 0.3719, + "step": 7329 + }, + { + "epoch": 3.962876193908812, + "grad_norm": 0.27609458565711975, + "learning_rate": 1.2511931334491068e-06, + "loss": 0.36, + "step": 7330 + }, + { + "epoch": 3.9634168318615965, + "grad_norm": 0.27734148502349854, + "learning_rate": 1.2499441110914195e-06, + "loss": 0.3823, + "step": 7331 + }, + { + "epoch": 3.963957469814381, + "grad_norm": 0.28398409485816956, + "learning_rate": 1.2486956234099029e-06, + "loss": 0.367, + "step": 7332 + }, + { + "epoch": 3.9644981077671653, + "grad_norm": 0.27384549379348755, + "learning_rate": 1.2474476705825611e-06, + "loss": 0.3743, + "step": 7333 + }, + { + "epoch": 3.9650387457199496, + "grad_norm": 0.29215607047080994, + "learning_rate": 1.2462002527873301e-06, + "loss": 0.3868, + "step": 7334 + }, + { + "epoch": 3.9655793836727335, + "grad_norm": 0.25008103251457214, + "learning_rate": 1.2449533702020578e-06, + "loss": 0.3705, + "step": 7335 + }, + { + "epoch": 3.9661200216255184, + "grad_norm": 0.2669306695461273, + "learning_rate": 1.2437070230045272e-06, + "loss": 0.3718, + "step": 7336 + }, + { + "epoch": 3.9666606595783023, + "grad_norm": 0.30136021971702576, + "learning_rate": 1.2424612113724372e-06, + "loss": 0.3762, + "step": 7337 + }, + { + "epoch": 3.9672012975310866, + "grad_norm": 0.28893977403640747, + "learning_rate": 1.2412159354834159e-06, + "loss": 0.3498, + "step": 7338 + }, + { + "epoch": 3.967741935483871, + "grad_norm": 0.2777029573917389, + "learning_rate": 1.2399711955150117e-06, + "loss": 0.4018, + "step": 7339 + }, + { + "epoch": 3.9682825734366554, + "grad_norm": 0.2783864140510559, + "learning_rate": 1.238726991644696e-06, + "loss": 0.3786, + "step": 7340 + }, + { + "epoch": 3.9688232113894397, + "grad_norm": 0.2676764726638794, + "learning_rate": 1.2374833240498668e-06, + "loss": 0.3614, + "step": 7341 + }, + { + "epoch": 3.9693638493422236, + "grad_norm": 0.2902882695198059, + "learning_rate": 1.2362401929078438e-06, + "loss": 0.3802, + "step": 7342 + }, + { + "epoch": 3.969904487295008, + "grad_norm": 0.2899143099784851, + "learning_rate": 1.234997598395869e-06, + "loss": 0.3983, + "step": 7343 + }, + { + "epoch": 3.9704451252477924, + "grad_norm": 0.26067546010017395, + "learning_rate": 1.2337555406911111e-06, + "loss": 0.3695, + "step": 7344 + }, + { + "epoch": 3.9709857632005767, + "grad_norm": 0.2756107747554779, + "learning_rate": 1.232514019970658e-06, + "loss": 0.3934, + "step": 7345 + }, + { + "epoch": 3.971526401153361, + "grad_norm": 0.27496638894081116, + "learning_rate": 1.2312730364115282e-06, + "loss": 0.4033, + "step": 7346 + }, + { + "epoch": 3.972067039106145, + "grad_norm": 0.2726435959339142, + "learning_rate": 1.2300325901906529e-06, + "loss": 0.3791, + "step": 7347 + }, + { + "epoch": 3.97260767705893, + "grad_norm": 0.26076987385749817, + "learning_rate": 1.2287926814848955e-06, + "loss": 0.376, + "step": 7348 + }, + { + "epoch": 3.9731483150117137, + "grad_norm": 0.2784844934940338, + "learning_rate": 1.2275533104710413e-06, + "loss": 0.3805, + "step": 7349 + }, + { + "epoch": 3.973688952964498, + "grad_norm": 0.2745589017868042, + "learning_rate": 1.2263144773257967e-06, + "loss": 0.3862, + "step": 7350 + }, + { + "epoch": 3.9742295909172825, + "grad_norm": 0.2718479335308075, + "learning_rate": 1.2250761822257912e-06, + "loss": 0.36, + "step": 7351 + }, + { + "epoch": 3.974770228870067, + "grad_norm": 0.2668786644935608, + "learning_rate": 1.2238384253475783e-06, + "loss": 0.3851, + "step": 7352 + }, + { + "epoch": 3.975310866822851, + "grad_norm": 0.2801748216152191, + "learning_rate": 1.222601206867637e-06, + "loss": 0.3639, + "step": 7353 + }, + { + "epoch": 3.975851504775635, + "grad_norm": 0.2815357446670532, + "learning_rate": 1.221364526962367e-06, + "loss": 0.3723, + "step": 7354 + }, + { + "epoch": 3.9763921427284195, + "grad_norm": 0.2539142668247223, + "learning_rate": 1.2201283858080903e-06, + "loss": 0.3728, + "step": 7355 + }, + { + "epoch": 3.976932780681204, + "grad_norm": 0.27295762300491333, + "learning_rate": 1.218892783581056e-06, + "loss": 0.3844, + "step": 7356 + }, + { + "epoch": 3.977473418633988, + "grad_norm": 0.2792266011238098, + "learning_rate": 1.2176577204574318e-06, + "loss": 0.3751, + "step": 7357 + }, + { + "epoch": 3.9780140565867725, + "grad_norm": 0.2757304608821869, + "learning_rate": 1.2164231966133156e-06, + "loss": 0.3813, + "step": 7358 + }, + { + "epoch": 3.9785546945395565, + "grad_norm": 0.2661166489124298, + "learning_rate": 1.215189212224716e-06, + "loss": 0.3848, + "step": 7359 + }, + { + "epoch": 3.979095332492341, + "grad_norm": 0.26561239361763, + "learning_rate": 1.2139557674675773e-06, + "loss": 0.3671, + "step": 7360 + }, + { + "epoch": 3.979635970445125, + "grad_norm": 0.2694830596446991, + "learning_rate": 1.2127228625177611e-06, + "loss": 0.3756, + "step": 7361 + }, + { + "epoch": 3.9801766083979095, + "grad_norm": 0.27653172612190247, + "learning_rate": 1.2114904975510516e-06, + "loss": 0.3554, + "step": 7362 + }, + { + "epoch": 3.980717246350694, + "grad_norm": 0.2755624055862427, + "learning_rate": 1.210258672743161e-06, + "loss": 0.3605, + "step": 7363 + }, + { + "epoch": 3.9812578843034783, + "grad_norm": 0.27337151765823364, + "learning_rate": 1.209027388269714e-06, + "loss": 0.3826, + "step": 7364 + }, + { + "epoch": 3.9817985222562626, + "grad_norm": 0.2733648419380188, + "learning_rate": 1.2077966443062706e-06, + "loss": 0.3659, + "step": 7365 + }, + { + "epoch": 3.9823391602090465, + "grad_norm": 0.26981082558631897, + "learning_rate": 1.2065664410283046e-06, + "loss": 0.3739, + "step": 7366 + }, + { + "epoch": 3.982879798161831, + "grad_norm": 0.2549477815628052, + "learning_rate": 1.2053367786112185e-06, + "loss": 0.3737, + "step": 7367 + }, + { + "epoch": 3.9834204361146153, + "grad_norm": 0.2778071463108063, + "learning_rate": 1.2041076572303345e-06, + "loss": 0.3902, + "step": 7368 + }, + { + "epoch": 3.9839610740673996, + "grad_norm": 0.30516862869262695, + "learning_rate": 1.2028790770608968e-06, + "loss": 0.385, + "step": 7369 + }, + { + "epoch": 3.984501712020184, + "grad_norm": 0.2869141101837158, + "learning_rate": 1.2016510382780772e-06, + "loss": 0.3667, + "step": 7370 + }, + { + "epoch": 3.985042349972968, + "grad_norm": 0.2690754234790802, + "learning_rate": 1.2004235410569659e-06, + "loss": 0.3881, + "step": 7371 + }, + { + "epoch": 3.9855829879257523, + "grad_norm": 0.2717999517917633, + "learning_rate": 1.199196585572575e-06, + "loss": 0.3745, + "step": 7372 + }, + { + "epoch": 3.9861236258785366, + "grad_norm": 0.2818255126476288, + "learning_rate": 1.1979701719998454e-06, + "loss": 0.3766, + "step": 7373 + }, + { + "epoch": 3.986664263831321, + "grad_norm": 0.28154563903808594, + "learning_rate": 1.1967443005136343e-06, + "loss": 0.3848, + "step": 7374 + }, + { + "epoch": 3.9872049017841054, + "grad_norm": 0.2657877504825592, + "learning_rate": 1.1955189712887272e-06, + "loss": 0.3894, + "step": 7375 + }, + { + "epoch": 3.9877455397368893, + "grad_norm": 0.29316869378089905, + "learning_rate": 1.1942941844998246e-06, + "loss": 0.3719, + "step": 7376 + }, + { + "epoch": 3.988286177689674, + "grad_norm": 0.26492777466773987, + "learning_rate": 1.1930699403215573e-06, + "loss": 0.3673, + "step": 7377 + }, + { + "epoch": 3.988826815642458, + "grad_norm": 0.2546434700489044, + "learning_rate": 1.1918462389284762e-06, + "loss": 0.3582, + "step": 7378 + }, + { + "epoch": 3.9893674535952424, + "grad_norm": 0.2561786472797394, + "learning_rate": 1.1906230804950547e-06, + "loss": 0.3791, + "step": 7379 + }, + { + "epoch": 3.9899080915480267, + "grad_norm": 0.2614768147468567, + "learning_rate": 1.189400465195687e-06, + "loss": 0.365, + "step": 7380 + }, + { + "epoch": 3.990448729500811, + "grad_norm": 0.2605297267436981, + "learning_rate": 1.1881783932046904e-06, + "loss": 0.3858, + "step": 7381 + }, + { + "epoch": 3.9909893674535954, + "grad_norm": 0.259212851524353, + "learning_rate": 1.1869568646963086e-06, + "loss": 0.3705, + "step": 7382 + }, + { + "epoch": 3.9915300054063794, + "grad_norm": 0.2636473476886749, + "learning_rate": 1.1857358798447038e-06, + "loss": 0.3624, + "step": 7383 + }, + { + "epoch": 3.9920706433591637, + "grad_norm": 0.2585376799106598, + "learning_rate": 1.18451543882396e-06, + "loss": 0.378, + "step": 7384 + }, + { + "epoch": 3.992611281311948, + "grad_norm": 0.2568807899951935, + "learning_rate": 1.183295541808089e-06, + "loss": 0.387, + "step": 7385 + }, + { + "epoch": 3.9931519192647325, + "grad_norm": 0.2846553325653076, + "learning_rate": 1.1820761889710175e-06, + "loss": 0.3925, + "step": 7386 + }, + { + "epoch": 3.993692557217517, + "grad_norm": 0.30891963839530945, + "learning_rate": 1.180857380486602e-06, + "loss": 0.3711, + "step": 7387 + }, + { + "epoch": 3.9942331951703007, + "grad_norm": 0.25803038477897644, + "learning_rate": 1.1796391165286169e-06, + "loss": 0.3926, + "step": 7388 + }, + { + "epoch": 3.994773833123085, + "grad_norm": 0.27402520179748535, + "learning_rate": 1.1784213972707581e-06, + "loss": 0.3925, + "step": 7389 + }, + { + "epoch": 3.9953144710758695, + "grad_norm": 0.2832760810852051, + "learning_rate": 1.1772042228866493e-06, + "loss": 0.3906, + "step": 7390 + }, + { + "epoch": 3.995855109028654, + "grad_norm": 0.28747400641441345, + "learning_rate": 1.1759875935498311e-06, + "loss": 0.3872, + "step": 7391 + }, + { + "epoch": 3.996395746981438, + "grad_norm": 0.2541361451148987, + "learning_rate": 1.174771509433768e-06, + "loss": 0.3697, + "step": 7392 + }, + { + "epoch": 3.9969363849342225, + "grad_norm": 0.25962769985198975, + "learning_rate": 1.1735559707118465e-06, + "loss": 0.3778, + "step": 7393 + }, + { + "epoch": 3.997477022887007, + "grad_norm": 0.26915955543518066, + "learning_rate": 1.1723409775573785e-06, + "loss": 0.3741, + "step": 7394 + }, + { + "epoch": 3.998017660839791, + "grad_norm": 0.2751399874687195, + "learning_rate": 1.1711265301435937e-06, + "loss": 0.3754, + "step": 7395 + }, + { + "epoch": 3.998558298792575, + "grad_norm": 0.28093770146369934, + "learning_rate": 1.1699126286436445e-06, + "loss": 0.3397, + "step": 7396 + }, + { + "epoch": 3.9990989367453595, + "grad_norm": 0.2902621626853943, + "learning_rate": 1.1686992732306102e-06, + "loss": 0.3721, + "step": 7397 + }, + { + "epoch": 3.999639574698144, + "grad_norm": 0.27061375975608826, + "learning_rate": 1.1674864640774852e-06, + "loss": 0.3708, + "step": 7398 + }, + { + "epoch": 4.000180212650928, + "grad_norm": 0.3631877303123474, + "learning_rate": 1.1662742013571926e-06, + "loss": 0.4688, + "step": 7399 + }, + { + "epoch": 4.000720850603712, + "grad_norm": 0.30023327469825745, + "learning_rate": 1.165062485242574e-06, + "loss": 0.3554, + "step": 7400 + }, + { + "epoch": 4.001261488556497, + "grad_norm": 0.29913586378097534, + "learning_rate": 1.1638513159063914e-06, + "loss": 0.4124, + "step": 7401 + }, + { + "epoch": 4.001802126509281, + "grad_norm": 0.26184019446372986, + "learning_rate": 1.1626406935213335e-06, + "loss": 0.3153, + "step": 7402 + }, + { + "epoch": 4.002342764462065, + "grad_norm": 0.2806876301765442, + "learning_rate": 1.1614306182600087e-06, + "loss": 0.3657, + "step": 7403 + }, + { + "epoch": 4.00288340241485, + "grad_norm": 0.27725282311439514, + "learning_rate": 1.1602210902949462e-06, + "loss": 0.3403, + "step": 7404 + }, + { + "epoch": 4.0034240403676336, + "grad_norm": 0.2558286786079407, + "learning_rate": 1.159012109798598e-06, + "loss": 0.3563, + "step": 7405 + }, + { + "epoch": 4.003964678320418, + "grad_norm": 0.29177579283714294, + "learning_rate": 1.1578036769433382e-06, + "loss": 0.353, + "step": 7406 + }, + { + "epoch": 4.004505316273202, + "grad_norm": 0.27496418356895447, + "learning_rate": 1.156595791901467e-06, + "loss": 0.3708, + "step": 7407 + }, + { + "epoch": 4.005045954225987, + "grad_norm": 0.2852287292480469, + "learning_rate": 1.155388454845196e-06, + "loss": 0.3692, + "step": 7408 + }, + { + "epoch": 4.005586592178771, + "grad_norm": 0.2811695337295532, + "learning_rate": 1.1541816659466703e-06, + "loss": 0.3662, + "step": 7409 + }, + { + "epoch": 4.006127230131555, + "grad_norm": 0.26302269101142883, + "learning_rate": 1.1529754253779486e-06, + "loss": 0.3651, + "step": 7410 + }, + { + "epoch": 4.00666786808434, + "grad_norm": 0.2623359262943268, + "learning_rate": 1.1517697333110162e-06, + "loss": 0.3767, + "step": 7411 + }, + { + "epoch": 4.007208506037124, + "grad_norm": 0.26116177439689636, + "learning_rate": 1.1505645899177786e-06, + "loss": 0.3774, + "step": 7412 + }, + { + "epoch": 4.0077491439899084, + "grad_norm": 0.2673705816268921, + "learning_rate": 1.1493599953700606e-06, + "loss": 0.325, + "step": 7413 + }, + { + "epoch": 4.008289781942692, + "grad_norm": 0.3017926812171936, + "learning_rate": 1.1481559498396145e-06, + "loss": 0.42, + "step": 7414 + }, + { + "epoch": 4.008830419895476, + "grad_norm": 0.2560037672519684, + "learning_rate": 1.1469524534981091e-06, + "loss": 0.3439, + "step": 7415 + }, + { + "epoch": 4.009371057848261, + "grad_norm": 0.2601366341114044, + "learning_rate": 1.1457495065171353e-06, + "loss": 0.3643, + "step": 7416 + }, + { + "epoch": 4.009911695801045, + "grad_norm": 0.27450212836265564, + "learning_rate": 1.1445471090682104e-06, + "loss": 0.3472, + "step": 7417 + }, + { + "epoch": 4.01045233375383, + "grad_norm": 0.27664631605148315, + "learning_rate": 1.1433452613227664e-06, + "loss": 0.3931, + "step": 7418 + }, + { + "epoch": 4.010992971706614, + "grad_norm": 0.2645692527294159, + "learning_rate": 1.1421439634521652e-06, + "loss": 0.3292, + "step": 7419 + }, + { + "epoch": 4.011533609659398, + "grad_norm": 0.25841808319091797, + "learning_rate": 1.1409432156276805e-06, + "loss": 0.3447, + "step": 7420 + }, + { + "epoch": 4.0120742476121825, + "grad_norm": 0.26279181241989136, + "learning_rate": 1.139743018020517e-06, + "loss": 0.3644, + "step": 7421 + }, + { + "epoch": 4.012614885564966, + "grad_norm": 0.2843421399593353, + "learning_rate": 1.1385433708017929e-06, + "loss": 0.3647, + "step": 7422 + }, + { + "epoch": 4.013155523517751, + "grad_norm": 0.3022117614746094, + "learning_rate": 1.1373442741425556e-06, + "loss": 0.3708, + "step": 7423 + }, + { + "epoch": 4.013696161470535, + "grad_norm": 0.25190797448158264, + "learning_rate": 1.1361457282137677e-06, + "loss": 0.335, + "step": 7424 + }, + { + "epoch": 4.01423679942332, + "grad_norm": 0.2671329081058502, + "learning_rate": 1.134947733186315e-06, + "loss": 0.3778, + "step": 7425 + }, + { + "epoch": 4.014777437376104, + "grad_norm": 0.2704976201057434, + "learning_rate": 1.1337502892310088e-06, + "loss": 0.3754, + "step": 7426 + }, + { + "epoch": 4.015318075328888, + "grad_norm": 0.2797398865222931, + "learning_rate": 1.1325533965185742e-06, + "loss": 0.4209, + "step": 7427 + }, + { + "epoch": 4.0158587132816725, + "grad_norm": 0.23628182709217072, + "learning_rate": 1.1313570552196656e-06, + "loss": 0.3442, + "step": 7428 + }, + { + "epoch": 4.0163993512344565, + "grad_norm": 0.271152138710022, + "learning_rate": 1.1301612655048545e-06, + "loss": 0.3381, + "step": 7429 + }, + { + "epoch": 4.016939989187241, + "grad_norm": 0.28068703413009644, + "learning_rate": 1.1289660275446318e-06, + "loss": 0.3767, + "step": 7430 + }, + { + "epoch": 4.017480627140025, + "grad_norm": 0.2739368677139282, + "learning_rate": 1.1277713415094155e-06, + "loss": 0.3539, + "step": 7431 + }, + { + "epoch": 4.018021265092809, + "grad_norm": 0.28415143489837646, + "learning_rate": 1.1265772075695409e-06, + "loss": 0.3613, + "step": 7432 + }, + { + "epoch": 4.018561903045594, + "grad_norm": 0.2874448299407959, + "learning_rate": 1.125383625895265e-06, + "loss": 0.3758, + "step": 7433 + }, + { + "epoch": 4.019102540998378, + "grad_norm": 0.25903433561325073, + "learning_rate": 1.1241905966567652e-06, + "loss": 0.3356, + "step": 7434 + }, + { + "epoch": 4.019643178951163, + "grad_norm": 0.2529353201389313, + "learning_rate": 1.1229981200241424e-06, + "loss": 0.3417, + "step": 7435 + }, + { + "epoch": 4.0201838169039465, + "grad_norm": 0.26373714208602905, + "learning_rate": 1.1218061961674214e-06, + "loss": 0.3887, + "step": 7436 + }, + { + "epoch": 4.020724454856731, + "grad_norm": 0.2680737376213074, + "learning_rate": 1.1206148252565385e-06, + "loss": 0.3858, + "step": 7437 + }, + { + "epoch": 4.021265092809515, + "grad_norm": 0.2669246196746826, + "learning_rate": 1.1194240074613617e-06, + "loss": 0.3806, + "step": 7438 + }, + { + "epoch": 4.021805730762299, + "grad_norm": 0.258058100938797, + "learning_rate": 1.1182337429516722e-06, + "loss": 0.356, + "step": 7439 + }, + { + "epoch": 4.022346368715084, + "grad_norm": 0.2669731378555298, + "learning_rate": 1.1170440318971788e-06, + "loss": 0.3808, + "step": 7440 + }, + { + "epoch": 4.022887006667868, + "grad_norm": 0.27213218808174133, + "learning_rate": 1.1158548744675073e-06, + "loss": 0.3606, + "step": 7441 + }, + { + "epoch": 4.023427644620653, + "grad_norm": 0.26563283801078796, + "learning_rate": 1.1146662708322043e-06, + "loss": 0.3278, + "step": 7442 + }, + { + "epoch": 4.023968282573437, + "grad_norm": 0.28348419070243835, + "learning_rate": 1.113478221160741e-06, + "loss": 0.3874, + "step": 7443 + }, + { + "epoch": 4.024508920526221, + "grad_norm": 0.25104525685310364, + "learning_rate": 1.1122907256225064e-06, + "loss": 0.3303, + "step": 7444 + }, + { + "epoch": 4.025049558479005, + "grad_norm": 0.26987916231155396, + "learning_rate": 1.1111037843868095e-06, + "loss": 0.3581, + "step": 7445 + }, + { + "epoch": 4.025590196431789, + "grad_norm": 0.2615717351436615, + "learning_rate": 1.1099173976228854e-06, + "loss": 0.345, + "step": 7446 + }, + { + "epoch": 4.026130834384574, + "grad_norm": 0.2686591148376465, + "learning_rate": 1.1087315654998842e-06, + "loss": 0.3736, + "step": 7447 + }, + { + "epoch": 4.026671472337358, + "grad_norm": 0.2937152683734894, + "learning_rate": 1.1075462881868842e-06, + "loss": 0.3502, + "step": 7448 + }, + { + "epoch": 4.027212110290143, + "grad_norm": 0.3030618131160736, + "learning_rate": 1.1063615658528742e-06, + "loss": 0.381, + "step": 7449 + }, + { + "epoch": 4.027752748242927, + "grad_norm": 0.2527458965778351, + "learning_rate": 1.1051773986667735e-06, + "loss": 0.3131, + "step": 7450 + }, + { + "epoch": 4.028293386195711, + "grad_norm": 0.29645195603370667, + "learning_rate": 1.1039937867974166e-06, + "loss": 0.4008, + "step": 7451 + }, + { + "epoch": 4.0288340241484955, + "grad_norm": 0.299660325050354, + "learning_rate": 1.1028107304135626e-06, + "loss": 0.3426, + "step": 7452 + }, + { + "epoch": 4.029374662101279, + "grad_norm": 0.26119139790534973, + "learning_rate": 1.1016282296838887e-06, + "loss": 0.3553, + "step": 7453 + }, + { + "epoch": 4.029915300054064, + "grad_norm": 0.3071519136428833, + "learning_rate": 1.1004462847769925e-06, + "loss": 0.4091, + "step": 7454 + }, + { + "epoch": 4.030455938006848, + "grad_norm": 0.262119859457016, + "learning_rate": 1.0992648958613961e-06, + "loss": 0.3263, + "step": 7455 + }, + { + "epoch": 4.030996575959632, + "grad_norm": 0.2911357283592224, + "learning_rate": 1.0980840631055378e-06, + "loss": 0.3556, + "step": 7456 + }, + { + "epoch": 4.031537213912417, + "grad_norm": 0.2899060845375061, + "learning_rate": 1.0969037866777782e-06, + "loss": 0.3751, + "step": 7457 + }, + { + "epoch": 4.032077851865201, + "grad_norm": 0.25036853551864624, + "learning_rate": 1.0957240667464014e-06, + "loss": 0.3484, + "step": 7458 + }, + { + "epoch": 4.0326184898179855, + "grad_norm": 0.2819577157497406, + "learning_rate": 1.0945449034796068e-06, + "loss": 0.381, + "step": 7459 + }, + { + "epoch": 4.0331591277707695, + "grad_norm": 0.2584560811519623, + "learning_rate": 1.0933662970455217e-06, + "loss": 0.3426, + "step": 7460 + }, + { + "epoch": 4.033699765723553, + "grad_norm": 0.28365328907966614, + "learning_rate": 1.0921882476121837e-06, + "loss": 0.3842, + "step": 7461 + }, + { + "epoch": 4.034240403676338, + "grad_norm": 0.28638550639152527, + "learning_rate": 1.091010755347562e-06, + "loss": 0.3934, + "step": 7462 + }, + { + "epoch": 4.034781041629122, + "grad_norm": 0.28289759159088135, + "learning_rate": 1.0898338204195375e-06, + "loss": 0.3702, + "step": 7463 + }, + { + "epoch": 4.035321679581907, + "grad_norm": 0.2768246829509735, + "learning_rate": 1.0886574429959185e-06, + "loss": 0.3442, + "step": 7464 + }, + { + "epoch": 4.035862317534691, + "grad_norm": 0.27342909574508667, + "learning_rate": 1.0874816232444297e-06, + "loss": 0.417, + "step": 7465 + }, + { + "epoch": 4.036402955487476, + "grad_norm": 0.27252939343452454, + "learning_rate": 1.0863063613327162e-06, + "loss": 0.3747, + "step": 7466 + }, + { + "epoch": 4.0369435934402595, + "grad_norm": 0.2616770267486572, + "learning_rate": 1.0851316574283466e-06, + "loss": 0.347, + "step": 7467 + }, + { + "epoch": 4.0374842313930435, + "grad_norm": 0.26894411444664, + "learning_rate": 1.0839575116988077e-06, + "loss": 0.3305, + "step": 7468 + }, + { + "epoch": 4.038024869345828, + "grad_norm": 0.2860458195209503, + "learning_rate": 1.0827839243115046e-06, + "loss": 0.364, + "step": 7469 + }, + { + "epoch": 4.038565507298612, + "grad_norm": 0.2837781012058258, + "learning_rate": 1.081610895433769e-06, + "loss": 0.3886, + "step": 7470 + }, + { + "epoch": 4.039106145251397, + "grad_norm": 0.27345579862594604, + "learning_rate": 1.080438425232846e-06, + "loss": 0.3965, + "step": 7471 + }, + { + "epoch": 4.039646783204181, + "grad_norm": 0.25032445788383484, + "learning_rate": 1.0792665138759085e-06, + "loss": 0.3554, + "step": 7472 + }, + { + "epoch": 4.040187421156965, + "grad_norm": 0.25677117705345154, + "learning_rate": 1.07809516153004e-06, + "loss": 0.3647, + "step": 7473 + }, + { + "epoch": 4.04072805910975, + "grad_norm": 0.29125097393989563, + "learning_rate": 1.0769243683622522e-06, + "loss": 0.3889, + "step": 7474 + }, + { + "epoch": 4.0412686970625336, + "grad_norm": 0.2656898498535156, + "learning_rate": 1.0757541345394768e-06, + "loss": 0.3758, + "step": 7475 + }, + { + "epoch": 4.041809335015318, + "grad_norm": 0.2595039904117584, + "learning_rate": 1.0745844602285615e-06, + "loss": 0.3711, + "step": 7476 + }, + { + "epoch": 4.042349972968102, + "grad_norm": 0.277091920375824, + "learning_rate": 1.0734153455962765e-06, + "loss": 0.3832, + "step": 7477 + }, + { + "epoch": 4.042890610920887, + "grad_norm": 0.25206366181373596, + "learning_rate": 1.072246790809311e-06, + "loss": 0.3557, + "step": 7478 + }, + { + "epoch": 4.043431248873671, + "grad_norm": 0.2778056263923645, + "learning_rate": 1.0710787960342777e-06, + "loss": 0.3675, + "step": 7479 + }, + { + "epoch": 4.043971886826455, + "grad_norm": 0.2501332461833954, + "learning_rate": 1.0699113614377065e-06, + "loss": 0.3222, + "step": 7480 + }, + { + "epoch": 4.04451252477924, + "grad_norm": 0.2865346372127533, + "learning_rate": 1.0687444871860459e-06, + "loss": 0.3606, + "step": 7481 + }, + { + "epoch": 4.045053162732024, + "grad_norm": 0.25500455498695374, + "learning_rate": 1.0675781734456703e-06, + "loss": 0.3517, + "step": 7482 + }, + { + "epoch": 4.0455938006848084, + "grad_norm": 0.26119256019592285, + "learning_rate": 1.0664124203828667e-06, + "loss": 0.3413, + "step": 7483 + }, + { + "epoch": 4.046134438637592, + "grad_norm": 0.271000474691391, + "learning_rate": 1.0652472281638505e-06, + "loss": 0.3477, + "step": 7484 + }, + { + "epoch": 4.046675076590376, + "grad_norm": 0.2632492780685425, + "learning_rate": 1.0640825969547498e-06, + "loss": 0.3189, + "step": 7485 + }, + { + "epoch": 4.047215714543161, + "grad_norm": 0.29628267884254456, + "learning_rate": 1.0629185269216147e-06, + "loss": 0.3924, + "step": 7486 + }, + { + "epoch": 4.047756352495945, + "grad_norm": 0.25676894187927246, + "learning_rate": 1.0617550182304193e-06, + "loss": 0.3379, + "step": 7487 + }, + { + "epoch": 4.04829699044873, + "grad_norm": 0.2630210518836975, + "learning_rate": 1.0605920710470529e-06, + "loss": 0.3425, + "step": 7488 + }, + { + "epoch": 4.048837628401514, + "grad_norm": 0.2740321755409241, + "learning_rate": 1.0594296855373265e-06, + "loss": 0.3959, + "step": 7489 + }, + { + "epoch": 4.049378266354298, + "grad_norm": 0.2768412232398987, + "learning_rate": 1.058267861866969e-06, + "loss": 0.376, + "step": 7490 + }, + { + "epoch": 4.0499189043070825, + "grad_norm": 0.2641516923904419, + "learning_rate": 1.0571066002016345e-06, + "loss": 0.3771, + "step": 7491 + }, + { + "epoch": 4.050459542259866, + "grad_norm": 0.2585950493812561, + "learning_rate": 1.0559459007068907e-06, + "loss": 0.3135, + "step": 7492 + }, + { + "epoch": 4.051000180212651, + "grad_norm": 0.27136123180389404, + "learning_rate": 1.0547857635482306e-06, + "loss": 0.3591, + "step": 7493 + }, + { + "epoch": 4.051540818165435, + "grad_norm": 0.2911006212234497, + "learning_rate": 1.0536261888910637e-06, + "loss": 0.3604, + "step": 7494 + }, + { + "epoch": 4.05208145611822, + "grad_norm": 0.2715640366077423, + "learning_rate": 1.0524671769007177e-06, + "loss": 0.3647, + "step": 7495 + }, + { + "epoch": 4.052622094071004, + "grad_norm": 0.2566773295402527, + "learning_rate": 1.051308727742446e-06, + "loss": 0.334, + "step": 7496 + }, + { + "epoch": 4.053162732023788, + "grad_norm": 0.2693837583065033, + "learning_rate": 1.050150841581416e-06, + "loss": 0.3632, + "step": 7497 + }, + { + "epoch": 4.0537033699765725, + "grad_norm": 0.267478346824646, + "learning_rate": 1.0489935185827166e-06, + "loss": 0.3364, + "step": 7498 + }, + { + "epoch": 4.0542440079293565, + "grad_norm": 0.263049453496933, + "learning_rate": 1.0478367589113586e-06, + "loss": 0.3546, + "step": 7499 + }, + { + "epoch": 4.054784645882141, + "grad_norm": 0.28416845202445984, + "learning_rate": 1.0466805627322685e-06, + "loss": 0.3441, + "step": 7500 + }, + { + "epoch": 4.055325283834925, + "grad_norm": 0.27740010619163513, + "learning_rate": 1.0455249302102994e-06, + "loss": 0.319, + "step": 7501 + }, + { + "epoch": 4.055865921787709, + "grad_norm": 0.2731799781322479, + "learning_rate": 1.0443698615102121e-06, + "loss": 0.3694, + "step": 7502 + }, + { + "epoch": 4.056406559740494, + "grad_norm": 0.2924058735370636, + "learning_rate": 1.0432153567966985e-06, + "loss": 0.3872, + "step": 7503 + }, + { + "epoch": 4.056947197693278, + "grad_norm": 0.2668353319168091, + "learning_rate": 1.0420614162343661e-06, + "loss": 0.3557, + "step": 7504 + }, + { + "epoch": 4.057487835646063, + "grad_norm": 0.27553117275238037, + "learning_rate": 1.0409080399877413e-06, + "loss": 0.4105, + "step": 7505 + }, + { + "epoch": 4.0580284735988466, + "grad_norm": 0.25538763403892517, + "learning_rate": 1.0397552282212698e-06, + "loss": 0.3556, + "step": 7506 + }, + { + "epoch": 4.058569111551631, + "grad_norm": 0.2744927406311035, + "learning_rate": 1.0386029810993159e-06, + "loss": 0.331, + "step": 7507 + }, + { + "epoch": 4.059109749504415, + "grad_norm": 0.2575755715370178, + "learning_rate": 1.0374512987861679e-06, + "loss": 0.3515, + "step": 7508 + }, + { + "epoch": 4.059650387457199, + "grad_norm": 0.27319324016571045, + "learning_rate": 1.0363001814460294e-06, + "loss": 0.3937, + "step": 7509 + }, + { + "epoch": 4.060191025409984, + "grad_norm": 0.24155323207378387, + "learning_rate": 1.035149629243023e-06, + "loss": 0.3334, + "step": 7510 + }, + { + "epoch": 4.060731663362768, + "grad_norm": 0.32312268018722534, + "learning_rate": 1.0339996423411946e-06, + "loss": 0.4033, + "step": 7511 + }, + { + "epoch": 4.061272301315553, + "grad_norm": 0.265438973903656, + "learning_rate": 1.0328502209045056e-06, + "loss": 0.3488, + "step": 7512 + }, + { + "epoch": 4.061812939268337, + "grad_norm": 0.2738054692745209, + "learning_rate": 1.0317013650968404e-06, + "loss": 0.3697, + "step": 7513 + }, + { + "epoch": 4.062353577221121, + "grad_norm": 0.2512156367301941, + "learning_rate": 1.0305530750819992e-06, + "loss": 0.3097, + "step": 7514 + }, + { + "epoch": 4.062894215173905, + "grad_norm": 0.2932851016521454, + "learning_rate": 1.0294053510237028e-06, + "loss": 0.3783, + "step": 7515 + }, + { + "epoch": 4.063434853126689, + "grad_norm": 0.2610452473163605, + "learning_rate": 1.0282581930855933e-06, + "loss": 0.3499, + "step": 7516 + }, + { + "epoch": 4.063975491079474, + "grad_norm": 0.2715596556663513, + "learning_rate": 1.0271116014312293e-06, + "loss": 0.3466, + "step": 7517 + }, + { + "epoch": 4.064516129032258, + "grad_norm": 0.26254555583000183, + "learning_rate": 1.02596557622409e-06, + "loss": 0.3654, + "step": 7518 + }, + { + "epoch": 4.065056766985042, + "grad_norm": 0.28570741415023804, + "learning_rate": 1.0248201176275717e-06, + "loss": 0.3774, + "step": 7519 + }, + { + "epoch": 4.065597404937827, + "grad_norm": 0.263133704662323, + "learning_rate": 1.0236752258049954e-06, + "loss": 0.3464, + "step": 7520 + }, + { + "epoch": 4.066138042890611, + "grad_norm": 0.2521113157272339, + "learning_rate": 1.0225309009195962e-06, + "loss": 0.3394, + "step": 7521 + }, + { + "epoch": 4.0666786808433955, + "grad_norm": 0.2839896082878113, + "learning_rate": 1.0213871431345274e-06, + "loss": 0.3802, + "step": 7522 + }, + { + "epoch": 4.067219318796179, + "grad_norm": 0.2782779932022095, + "learning_rate": 1.0202439526128677e-06, + "loss": 0.3373, + "step": 7523 + }, + { + "epoch": 4.067759956748964, + "grad_norm": 0.30508559942245483, + "learning_rate": 1.0191013295176082e-06, + "loss": 0.3945, + "step": 7524 + }, + { + "epoch": 4.068300594701748, + "grad_norm": 0.2754221260547638, + "learning_rate": 1.017959274011665e-06, + "loss": 0.3205, + "step": 7525 + }, + { + "epoch": 4.068841232654532, + "grad_norm": 0.26116254925727844, + "learning_rate": 1.0168177862578683e-06, + "loss": 0.3332, + "step": 7526 + }, + { + "epoch": 4.069381870607317, + "grad_norm": 0.27342498302459717, + "learning_rate": 1.0156768664189681e-06, + "loss": 0.3979, + "step": 7527 + }, + { + "epoch": 4.069922508560101, + "grad_norm": 0.27583229541778564, + "learning_rate": 1.0145365146576375e-06, + "loss": 0.3334, + "step": 7528 + }, + { + "epoch": 4.0704631465128855, + "grad_norm": 0.28629687428474426, + "learning_rate": 1.013396731136465e-06, + "loss": 0.3668, + "step": 7529 + }, + { + "epoch": 4.0710037844656695, + "grad_norm": 0.2998204529285431, + "learning_rate": 1.0122575160179582e-06, + "loss": 0.3859, + "step": 7530 + }, + { + "epoch": 4.071544422418453, + "grad_norm": 0.2818513810634613, + "learning_rate": 1.0111188694645435e-06, + "loss": 0.3498, + "step": 7531 + }, + { + "epoch": 4.072085060371238, + "grad_norm": 0.26502078771591187, + "learning_rate": 1.0099807916385673e-06, + "loss": 0.3501, + "step": 7532 + }, + { + "epoch": 4.072625698324022, + "grad_norm": 0.2796849012374878, + "learning_rate": 1.0088432827022986e-06, + "loss": 0.3642, + "step": 7533 + }, + { + "epoch": 4.073166336276807, + "grad_norm": 0.284131795167923, + "learning_rate": 1.0077063428179156e-06, + "loss": 0.375, + "step": 7534 + }, + { + "epoch": 4.073706974229591, + "grad_norm": 0.28391698002815247, + "learning_rate": 1.0065699721475253e-06, + "loss": 0.3576, + "step": 7535 + }, + { + "epoch": 4.074247612182376, + "grad_norm": 0.2852577865123749, + "learning_rate": 1.0054341708531462e-06, + "loss": 0.3574, + "step": 7536 + }, + { + "epoch": 4.0747882501351596, + "grad_norm": 0.2701577842235565, + "learning_rate": 1.0042989390967218e-06, + "loss": 0.3599, + "step": 7537 + }, + { + "epoch": 4.0753288880879435, + "grad_norm": 0.2770707607269287, + "learning_rate": 1.00316427704011e-06, + "loss": 0.3553, + "step": 7538 + }, + { + "epoch": 4.075869526040728, + "grad_norm": 0.28347349166870117, + "learning_rate": 1.0020301848450874e-06, + "loss": 0.3752, + "step": 7539 + }, + { + "epoch": 4.076410163993512, + "grad_norm": 0.28405776619911194, + "learning_rate": 1.0008966626733541e-06, + "loss": 0.3656, + "step": 7540 + }, + { + "epoch": 4.076950801946297, + "grad_norm": 0.27447542548179626, + "learning_rate": 9.997637106865232e-07, + "loss": 0.3806, + "step": 7541 + }, + { + "epoch": 4.077491439899081, + "grad_norm": 0.2700732946395874, + "learning_rate": 9.986313290461287e-07, + "loss": 0.3215, + "step": 7542 + }, + { + "epoch": 4.078032077851865, + "grad_norm": 0.279715359210968, + "learning_rate": 9.974995179136254e-07, + "loss": 0.3964, + "step": 7543 + }, + { + "epoch": 4.07857271580465, + "grad_norm": 0.2824874520301819, + "learning_rate": 9.963682774503824e-07, + "loss": 0.3763, + "step": 7544 + }, + { + "epoch": 4.079113353757434, + "grad_norm": 0.2741328775882721, + "learning_rate": 9.95237607817694e-07, + "loss": 0.3209, + "step": 7545 + }, + { + "epoch": 4.079653991710218, + "grad_norm": 0.2551615238189697, + "learning_rate": 9.941075091767643e-07, + "loss": 0.3558, + "step": 7546 + }, + { + "epoch": 4.080194629663002, + "grad_norm": 0.281745046377182, + "learning_rate": 9.929779816887237e-07, + "loss": 0.3688, + "step": 7547 + }, + { + "epoch": 4.080735267615786, + "grad_norm": 0.2685527503490448, + "learning_rate": 9.918490255146158e-07, + "loss": 0.3682, + "step": 7548 + }, + { + "epoch": 4.081275905568571, + "grad_norm": 0.2721521556377411, + "learning_rate": 9.90720640815408e-07, + "loss": 0.3678, + "step": 7549 + }, + { + "epoch": 4.081816543521355, + "grad_norm": 0.27016156911849976, + "learning_rate": 9.895928277519822e-07, + "loss": 0.3401, + "step": 7550 + }, + { + "epoch": 4.08235718147414, + "grad_norm": 0.27403244376182556, + "learning_rate": 9.884655864851384e-07, + "loss": 0.3382, + "step": 7551 + }, + { + "epoch": 4.082897819426924, + "grad_norm": 0.2650195062160492, + "learning_rate": 9.873389171755987e-07, + "loss": 0.3412, + "step": 7552 + }, + { + "epoch": 4.0834384573797085, + "grad_norm": 0.28170421719551086, + "learning_rate": 9.86212819984001e-07, + "loss": 0.3962, + "step": 7553 + }, + { + "epoch": 4.083979095332492, + "grad_norm": 0.28326892852783203, + "learning_rate": 9.850872950709012e-07, + "loss": 0.3798, + "step": 7554 + }, + { + "epoch": 4.084519733285276, + "grad_norm": 0.2657798230648041, + "learning_rate": 9.83962342596776e-07, + "loss": 0.3543, + "step": 7555 + }, + { + "epoch": 4.085060371238061, + "grad_norm": 0.2940162718296051, + "learning_rate": 9.828379627220174e-07, + "loss": 0.3665, + "step": 7556 + }, + { + "epoch": 4.085601009190845, + "grad_norm": 0.25168517231941223, + "learning_rate": 9.817141556069398e-07, + "loss": 0.3127, + "step": 7557 + }, + { + "epoch": 4.08614164714363, + "grad_norm": 0.26851770281791687, + "learning_rate": 9.805909214117721e-07, + "loss": 0.3598, + "step": 7558 + }, + { + "epoch": 4.086682285096414, + "grad_norm": 0.27158963680267334, + "learning_rate": 9.794682602966637e-07, + "loss": 0.3913, + "step": 7559 + }, + { + "epoch": 4.087222923049198, + "grad_norm": 0.26728105545043945, + "learning_rate": 9.783461724216793e-07, + "loss": 0.3428, + "step": 7560 + }, + { + "epoch": 4.0877635610019825, + "grad_norm": 0.29941266775131226, + "learning_rate": 9.77224657946806e-07, + "loss": 0.3893, + "step": 7561 + }, + { + "epoch": 4.088304198954766, + "grad_norm": 0.27292317152023315, + "learning_rate": 9.761037170319498e-07, + "loss": 0.3579, + "step": 7562 + }, + { + "epoch": 4.088844836907551, + "grad_norm": 0.28188255429267883, + "learning_rate": 9.74983349836927e-07, + "loss": 0.3684, + "step": 7563 + }, + { + "epoch": 4.089385474860335, + "grad_norm": 0.26834139227867126, + "learning_rate": 9.73863556521482e-07, + "loss": 0.3837, + "step": 7564 + }, + { + "epoch": 4.08992611281312, + "grad_norm": 0.27812516689300537, + "learning_rate": 9.7274433724527e-07, + "loss": 0.342, + "step": 7565 + }, + { + "epoch": 4.090466750765904, + "grad_norm": 0.2742692828178406, + "learning_rate": 9.716256921678696e-07, + "loss": 0.3735, + "step": 7566 + }, + { + "epoch": 4.091007388718688, + "grad_norm": 0.27141088247299194, + "learning_rate": 9.705076214487747e-07, + "loss": 0.3504, + "step": 7567 + }, + { + "epoch": 4.0915480266714725, + "grad_norm": 0.2742808163166046, + "learning_rate": 9.693901252473953e-07, + "loss": 0.3575, + "step": 7568 + }, + { + "epoch": 4.0920886646242565, + "grad_norm": 0.27946266531944275, + "learning_rate": 9.682732037230652e-07, + "loss": 0.3518, + "step": 7569 + }, + { + "epoch": 4.092629302577041, + "grad_norm": 0.3077234923839569, + "learning_rate": 9.671568570350321e-07, + "loss": 0.412, + "step": 7570 + }, + { + "epoch": 4.093169940529825, + "grad_norm": 0.25089436769485474, + "learning_rate": 9.660410853424607e-07, + "loss": 0.2997, + "step": 7571 + }, + { + "epoch": 4.093710578482609, + "grad_norm": 0.28145653009414673, + "learning_rate": 9.649258888044384e-07, + "loss": 0.406, + "step": 7572 + }, + { + "epoch": 4.094251216435394, + "grad_norm": 0.2497064620256424, + "learning_rate": 9.63811267579966e-07, + "loss": 0.3202, + "step": 7573 + }, + { + "epoch": 4.094791854388178, + "grad_norm": 0.2823668122291565, + "learning_rate": 9.626972218279674e-07, + "loss": 0.3966, + "step": 7574 + }, + { + "epoch": 4.095332492340963, + "grad_norm": 0.2755703032016754, + "learning_rate": 9.615837517072758e-07, + "loss": 0.3865, + "step": 7575 + }, + { + "epoch": 4.0958731302937466, + "grad_norm": 0.2881629765033722, + "learning_rate": 9.604708573766525e-07, + "loss": 0.3609, + "step": 7576 + }, + { + "epoch": 4.0964137682465305, + "grad_norm": 0.27658626437187195, + "learning_rate": 9.59358538994769e-07, + "loss": 0.3351, + "step": 7577 + }, + { + "epoch": 4.096954406199315, + "grad_norm": 0.2800852954387665, + "learning_rate": 9.582467967202202e-07, + "loss": 0.3609, + "step": 7578 + }, + { + "epoch": 4.097495044152099, + "grad_norm": 0.28268930315971375, + "learning_rate": 9.571356307115149e-07, + "loss": 0.3765, + "step": 7579 + }, + { + "epoch": 4.098035682104884, + "grad_norm": 0.27372539043426514, + "learning_rate": 9.560250411270794e-07, + "loss": 0.3326, + "step": 7580 + }, + { + "epoch": 4.098576320057668, + "grad_norm": 0.29689791798591614, + "learning_rate": 9.549150281252633e-07, + "loss": 0.3613, + "step": 7581 + }, + { + "epoch": 4.099116958010453, + "grad_norm": 0.30527156591415405, + "learning_rate": 9.53805591864328e-07, + "loss": 0.3653, + "step": 7582 + }, + { + "epoch": 4.099657595963237, + "grad_norm": 0.27635329961776733, + "learning_rate": 9.526967325024539e-07, + "loss": 0.3771, + "step": 7583 + }, + { + "epoch": 4.100198233916021, + "grad_norm": 0.2942197918891907, + "learning_rate": 9.51588450197743e-07, + "loss": 0.3516, + "step": 7584 + }, + { + "epoch": 4.100738871868805, + "grad_norm": 0.2760907709598541, + "learning_rate": 9.504807451082088e-07, + "loss": 0.3433, + "step": 7585 + }, + { + "epoch": 4.101279509821589, + "grad_norm": 0.27573254704475403, + "learning_rate": 9.493736173917906e-07, + "loss": 0.3617, + "step": 7586 + }, + { + "epoch": 4.101820147774374, + "grad_norm": 0.292228639125824, + "learning_rate": 9.482670672063354e-07, + "loss": 0.3543, + "step": 7587 + }, + { + "epoch": 4.102360785727158, + "grad_norm": 0.2821485996246338, + "learning_rate": 9.471610947096166e-07, + "loss": 0.3438, + "step": 7588 + }, + { + "epoch": 4.102901423679942, + "grad_norm": 0.2708372473716736, + "learning_rate": 9.460557000593196e-07, + "loss": 0.3772, + "step": 7589 + }, + { + "epoch": 4.103442061632727, + "grad_norm": 0.2668057978153229, + "learning_rate": 9.449508834130517e-07, + "loss": 0.3831, + "step": 7590 + }, + { + "epoch": 4.103982699585511, + "grad_norm": 0.28821754455566406, + "learning_rate": 9.438466449283345e-07, + "loss": 0.3845, + "step": 7591 + }, + { + "epoch": 4.1045233375382955, + "grad_norm": 0.24543441832065582, + "learning_rate": 9.427429847626068e-07, + "loss": 0.3344, + "step": 7592 + }, + { + "epoch": 4.105063975491079, + "grad_norm": 0.283571720123291, + "learning_rate": 9.416399030732298e-07, + "loss": 0.3689, + "step": 7593 + }, + { + "epoch": 4.105604613443864, + "grad_norm": 0.28722450137138367, + "learning_rate": 9.405374000174772e-07, + "loss": 0.3789, + "step": 7594 + }, + { + "epoch": 4.106145251396648, + "grad_norm": 0.271992564201355, + "learning_rate": 9.394354757525404e-07, + "loss": 0.3537, + "step": 7595 + }, + { + "epoch": 4.106685889349432, + "grad_norm": 0.2753427028656006, + "learning_rate": 9.383341304355326e-07, + "loss": 0.3799, + "step": 7596 + }, + { + "epoch": 4.107226527302217, + "grad_norm": 0.26946526765823364, + "learning_rate": 9.372333642234787e-07, + "loss": 0.363, + "step": 7597 + }, + { + "epoch": 4.107767165255001, + "grad_norm": 0.27203747630119324, + "learning_rate": 9.361331772733284e-07, + "loss": 0.3729, + "step": 7598 + }, + { + "epoch": 4.1083078032077855, + "grad_norm": 0.24915803968906403, + "learning_rate": 9.350335697419382e-07, + "loss": 0.368, + "step": 7599 + }, + { + "epoch": 4.1088484411605695, + "grad_norm": 0.30128878355026245, + "learning_rate": 9.339345417860918e-07, + "loss": 0.4005, + "step": 7600 + }, + { + "epoch": 4.109389079113353, + "grad_norm": 0.2533441483974457, + "learning_rate": 9.328360935624875e-07, + "loss": 0.3328, + "step": 7601 + }, + { + "epoch": 4.109929717066138, + "grad_norm": 0.2531205713748932, + "learning_rate": 9.317382252277391e-07, + "loss": 0.3785, + "step": 7602 + }, + { + "epoch": 4.110470355018922, + "grad_norm": 0.27615755796432495, + "learning_rate": 9.306409369383779e-07, + "loss": 0.3252, + "step": 7603 + }, + { + "epoch": 4.111010992971707, + "grad_norm": 0.27425599098205566, + "learning_rate": 9.295442288508522e-07, + "loss": 0.3868, + "step": 7604 + }, + { + "epoch": 4.111551630924491, + "grad_norm": 0.2784785032272339, + "learning_rate": 9.284481011215318e-07, + "loss": 0.3574, + "step": 7605 + }, + { + "epoch": 4.112092268877275, + "grad_norm": 0.2659832239151001, + "learning_rate": 9.273525539066985e-07, + "loss": 0.38, + "step": 7606 + }, + { + "epoch": 4.1126329068300596, + "grad_norm": 0.25805604457855225, + "learning_rate": 9.262575873625529e-07, + "loss": 0.3279, + "step": 7607 + }, + { + "epoch": 4.1131735447828435, + "grad_norm": 0.2785314917564392, + "learning_rate": 9.251632016452156e-07, + "loss": 0.4154, + "step": 7608 + }, + { + "epoch": 4.113714182735628, + "grad_norm": 0.2850290834903717, + "learning_rate": 9.240693969107195e-07, + "loss": 0.355, + "step": 7609 + }, + { + "epoch": 4.114254820688412, + "grad_norm": 0.2698410153388977, + "learning_rate": 9.229761733150205e-07, + "loss": 0.35, + "step": 7610 + }, + { + "epoch": 4.114795458641197, + "grad_norm": 0.3033217489719391, + "learning_rate": 9.218835310139862e-07, + "loss": 0.407, + "step": 7611 + }, + { + "epoch": 4.115336096593981, + "grad_norm": 0.2788592576980591, + "learning_rate": 9.207914701634024e-07, + "loss": 0.3572, + "step": 7612 + }, + { + "epoch": 4.115876734546765, + "grad_norm": 0.2555527985095978, + "learning_rate": 9.196999909189764e-07, + "loss": 0.3346, + "step": 7613 + }, + { + "epoch": 4.11641737249955, + "grad_norm": 0.2944771647453308, + "learning_rate": 9.186090934363274e-07, + "loss": 0.3987, + "step": 7614 + }, + { + "epoch": 4.116958010452334, + "grad_norm": 0.28323036432266235, + "learning_rate": 9.175187778709937e-07, + "loss": 0.3669, + "step": 7615 + }, + { + "epoch": 4.117498648405118, + "grad_norm": 0.26645544171333313, + "learning_rate": 9.164290443784296e-07, + "loss": 0.353, + "step": 7616 + }, + { + "epoch": 4.118039286357902, + "grad_norm": 0.27114954590797424, + "learning_rate": 9.153398931140095e-07, + "loss": 0.3511, + "step": 7617 + }, + { + "epoch": 4.118579924310686, + "grad_norm": 0.2679999768733978, + "learning_rate": 9.142513242330214e-07, + "loss": 0.3677, + "step": 7618 + }, + { + "epoch": 4.119120562263471, + "grad_norm": 0.2583546042442322, + "learning_rate": 9.131633378906707e-07, + "loss": 0.3517, + "step": 7619 + }, + { + "epoch": 4.119661200216255, + "grad_norm": 0.272354394197464, + "learning_rate": 9.120759342420821e-07, + "loss": 0.4086, + "step": 7620 + }, + { + "epoch": 4.12020183816904, + "grad_norm": 0.246135875582695, + "learning_rate": 9.109891134422944e-07, + "loss": 0.3132, + "step": 7621 + }, + { + "epoch": 4.120742476121824, + "grad_norm": 0.2872975468635559, + "learning_rate": 9.099028756462658e-07, + "loss": 0.3891, + "step": 7622 + }, + { + "epoch": 4.1212831140746085, + "grad_norm": 0.27051842212677, + "learning_rate": 9.088172210088692e-07, + "loss": 0.3507, + "step": 7623 + }, + { + "epoch": 4.121823752027392, + "grad_norm": 0.26585105061531067, + "learning_rate": 9.077321496848945e-07, + "loss": 0.3237, + "step": 7624 + }, + { + "epoch": 4.122364389980176, + "grad_norm": 0.2786099314689636, + "learning_rate": 9.066476618290515e-07, + "loss": 0.3655, + "step": 7625 + }, + { + "epoch": 4.122905027932961, + "grad_norm": 0.2532130181789398, + "learning_rate": 9.055637575959614e-07, + "loss": 0.3422, + "step": 7626 + }, + { + "epoch": 4.123445665885745, + "grad_norm": 0.2793992757797241, + "learning_rate": 9.044804371401699e-07, + "loss": 0.3775, + "step": 7627 + }, + { + "epoch": 4.12398630383853, + "grad_norm": 0.26155126094818115, + "learning_rate": 9.033977006161299e-07, + "loss": 0.3372, + "step": 7628 + }, + { + "epoch": 4.124526941791314, + "grad_norm": 0.28930824995040894, + "learning_rate": 9.023155481782175e-07, + "loss": 0.3846, + "step": 7629 + }, + { + "epoch": 4.125067579744098, + "grad_norm": 0.27423110604286194, + "learning_rate": 9.012339799807263e-07, + "loss": 0.3555, + "step": 7630 + }, + { + "epoch": 4.1256082176968825, + "grad_norm": 0.28901660442352295, + "learning_rate": 9.001529961778627e-07, + "loss": 0.3563, + "step": 7631 + }, + { + "epoch": 4.126148855649666, + "grad_norm": 0.2584855258464813, + "learning_rate": 8.990725969237513e-07, + "loss": 0.336, + "step": 7632 + }, + { + "epoch": 4.126689493602451, + "grad_norm": 0.28122976422309875, + "learning_rate": 8.979927823724321e-07, + "loss": 0.3897, + "step": 7633 + }, + { + "epoch": 4.127230131555235, + "grad_norm": 0.25875017046928406, + "learning_rate": 8.96913552677866e-07, + "loss": 0.3741, + "step": 7634 + }, + { + "epoch": 4.12777076950802, + "grad_norm": 0.24487367272377014, + "learning_rate": 8.95834907993926e-07, + "loss": 0.3215, + "step": 7635 + }, + { + "epoch": 4.128311407460804, + "grad_norm": 0.27708467841148376, + "learning_rate": 8.947568484744029e-07, + "loss": 0.3661, + "step": 7636 + }, + { + "epoch": 4.128852045413588, + "grad_norm": 0.2652656137943268, + "learning_rate": 8.936793742730054e-07, + "loss": 0.3541, + "step": 7637 + }, + { + "epoch": 4.1293926833663726, + "grad_norm": 0.2697584331035614, + "learning_rate": 8.926024855433569e-07, + "loss": 0.3531, + "step": 7638 + }, + { + "epoch": 4.1299333213191565, + "grad_norm": 0.2621810734272003, + "learning_rate": 8.915261824389998e-07, + "loss": 0.3563, + "step": 7639 + }, + { + "epoch": 4.130473959271941, + "grad_norm": 0.2676509916782379, + "learning_rate": 8.904504651133905e-07, + "loss": 0.4032, + "step": 7640 + }, + { + "epoch": 4.131014597224725, + "grad_norm": 0.25960397720336914, + "learning_rate": 8.893753337199018e-07, + "loss": 0.3208, + "step": 7641 + }, + { + "epoch": 4.131555235177509, + "grad_norm": 0.2807498574256897, + "learning_rate": 8.883007884118261e-07, + "loss": 0.3703, + "step": 7642 + }, + { + "epoch": 4.132095873130294, + "grad_norm": 0.28550735116004944, + "learning_rate": 8.872268293423691e-07, + "loss": 0.3879, + "step": 7643 + }, + { + "epoch": 4.132636511083078, + "grad_norm": 0.2682303488254547, + "learning_rate": 8.861534566646534e-07, + "loss": 0.3524, + "step": 7644 + }, + { + "epoch": 4.133177149035863, + "grad_norm": 0.28757935762405396, + "learning_rate": 8.850806705317183e-07, + "loss": 0.3955, + "step": 7645 + }, + { + "epoch": 4.133717786988647, + "grad_norm": 0.28239282965660095, + "learning_rate": 8.840084710965202e-07, + "loss": 0.3573, + "step": 7646 + }, + { + "epoch": 4.1342584249414305, + "grad_norm": 0.2718372344970703, + "learning_rate": 8.829368585119335e-07, + "loss": 0.3518, + "step": 7647 + }, + { + "epoch": 4.134799062894215, + "grad_norm": 0.2815796434879303, + "learning_rate": 8.818658329307428e-07, + "loss": 0.3709, + "step": 7648 + }, + { + "epoch": 4.135339700846999, + "grad_norm": 0.2764212489128113, + "learning_rate": 8.807953945056563e-07, + "loss": 0.3715, + "step": 7649 + }, + { + "epoch": 4.135880338799784, + "grad_norm": 0.2787606120109558, + "learning_rate": 8.797255433892926e-07, + "loss": 0.3421, + "step": 7650 + }, + { + "epoch": 4.136420976752568, + "grad_norm": 0.2819206118583679, + "learning_rate": 8.786562797341913e-07, + "loss": 0.383, + "step": 7651 + }, + { + "epoch": 4.136961614705353, + "grad_norm": 0.2716692090034485, + "learning_rate": 8.775876036928055e-07, + "loss": 0.35, + "step": 7652 + }, + { + "epoch": 4.137502252658137, + "grad_norm": 0.27148786187171936, + "learning_rate": 8.765195154175032e-07, + "loss": 0.3499, + "step": 7653 + }, + { + "epoch": 4.138042890610921, + "grad_norm": 0.27956753969192505, + "learning_rate": 8.754520150605739e-07, + "loss": 0.3918, + "step": 7654 + }, + { + "epoch": 4.138583528563705, + "grad_norm": 0.2726864516735077, + "learning_rate": 8.743851027742172e-07, + "loss": 0.3444, + "step": 7655 + }, + { + "epoch": 4.139124166516489, + "grad_norm": 0.2886405289173126, + "learning_rate": 8.73318778710553e-07, + "loss": 0.3778, + "step": 7656 + }, + { + "epoch": 4.139664804469274, + "grad_norm": 0.2725919783115387, + "learning_rate": 8.722530430216137e-07, + "loss": 0.3448, + "step": 7657 + }, + { + "epoch": 4.140205442422058, + "grad_norm": 0.2917782962322235, + "learning_rate": 8.711878958593512e-07, + "loss": 0.4016, + "step": 7658 + }, + { + "epoch": 4.140746080374842, + "grad_norm": 0.252810001373291, + "learning_rate": 8.701233373756352e-07, + "loss": 0.3422, + "step": 7659 + }, + { + "epoch": 4.141286718327627, + "grad_norm": 0.2579265236854553, + "learning_rate": 8.690593677222431e-07, + "loss": 0.3366, + "step": 7660 + }, + { + "epoch": 4.141827356280411, + "grad_norm": 0.2721611559391022, + "learning_rate": 8.67995987050878e-07, + "loss": 0.3984, + "step": 7661 + }, + { + "epoch": 4.1423679942331955, + "grad_norm": 0.276642382144928, + "learning_rate": 8.669331955131521e-07, + "loss": 0.3421, + "step": 7662 + }, + { + "epoch": 4.142908632185979, + "grad_norm": 0.2617895007133484, + "learning_rate": 8.658709932605985e-07, + "loss": 0.3139, + "step": 7663 + }, + { + "epoch": 4.143449270138763, + "grad_norm": 0.2905745804309845, + "learning_rate": 8.648093804446633e-07, + "loss": 0.3975, + "step": 7664 + }, + { + "epoch": 4.143989908091548, + "grad_norm": 0.26618894934654236, + "learning_rate": 8.637483572167077e-07, + "loss": 0.3516, + "step": 7665 + }, + { + "epoch": 4.144530546044332, + "grad_norm": 0.24945282936096191, + "learning_rate": 8.626879237280128e-07, + "loss": 0.352, + "step": 7666 + }, + { + "epoch": 4.145071183997117, + "grad_norm": 0.2615450918674469, + "learning_rate": 8.616280801297727e-07, + "loss": 0.3153, + "step": 7667 + }, + { + "epoch": 4.145611821949901, + "grad_norm": 0.2893877923488617, + "learning_rate": 8.60568826573096e-07, + "loss": 0.4022, + "step": 7668 + }, + { + "epoch": 4.1461524599026855, + "grad_norm": 0.25844526290893555, + "learning_rate": 8.59510163209012e-07, + "loss": 0.371, + "step": 7669 + }, + { + "epoch": 4.1466930978554695, + "grad_norm": 0.25491437315940857, + "learning_rate": 8.584520901884608e-07, + "loss": 0.3228, + "step": 7670 + }, + { + "epoch": 4.147233735808253, + "grad_norm": 0.2766035795211792, + "learning_rate": 8.573946076623035e-07, + "loss": 0.3799, + "step": 7671 + }, + { + "epoch": 4.147774373761038, + "grad_norm": 0.2754434049129486, + "learning_rate": 8.563377157813102e-07, + "loss": 0.3537, + "step": 7672 + }, + { + "epoch": 4.148315011713822, + "grad_norm": 0.26777127385139465, + "learning_rate": 8.55281414696173e-07, + "loss": 0.3135, + "step": 7673 + }, + { + "epoch": 4.148855649666607, + "grad_norm": 0.2970055341720581, + "learning_rate": 8.542257045574959e-07, + "loss": 0.3784, + "step": 7674 + }, + { + "epoch": 4.149396287619391, + "grad_norm": 0.25933367013931274, + "learning_rate": 8.531705855158024e-07, + "loss": 0.3665, + "step": 7675 + }, + { + "epoch": 4.149936925572175, + "grad_norm": 0.2712830603122711, + "learning_rate": 8.521160577215282e-07, + "loss": 0.3862, + "step": 7676 + }, + { + "epoch": 4.15047756352496, + "grad_norm": 0.26192939281463623, + "learning_rate": 8.510621213250248e-07, + "loss": 0.3636, + "step": 7677 + }, + { + "epoch": 4.1510182014777435, + "grad_norm": 0.27634140849113464, + "learning_rate": 8.500087764765624e-07, + "loss": 0.3897, + "step": 7678 + }, + { + "epoch": 4.151558839430528, + "grad_norm": 0.2597748339176178, + "learning_rate": 8.489560233263244e-07, + "loss": 0.3558, + "step": 7679 + }, + { + "epoch": 4.152099477383312, + "grad_norm": 0.26447004079818726, + "learning_rate": 8.479038620244089e-07, + "loss": 0.3543, + "step": 7680 + }, + { + "epoch": 4.152640115336097, + "grad_norm": 0.261905312538147, + "learning_rate": 8.468522927208345e-07, + "loss": 0.3448, + "step": 7681 + }, + { + "epoch": 4.153180753288881, + "grad_norm": 0.27034062147140503, + "learning_rate": 8.458013155655281e-07, + "loss": 0.3575, + "step": 7682 + }, + { + "epoch": 4.153721391241665, + "grad_norm": 0.27242255210876465, + "learning_rate": 8.447509307083406e-07, + "loss": 0.37, + "step": 7683 + }, + { + "epoch": 4.15426202919445, + "grad_norm": 0.25309810042381287, + "learning_rate": 8.43701138299029e-07, + "loss": 0.327, + "step": 7684 + }, + { + "epoch": 4.154802667147234, + "grad_norm": 0.2773438096046448, + "learning_rate": 8.426519384872733e-07, + "loss": 0.4048, + "step": 7685 + }, + { + "epoch": 4.155343305100018, + "grad_norm": 0.2672390937805176, + "learning_rate": 8.416033314226679e-07, + "loss": 0.3757, + "step": 7686 + }, + { + "epoch": 4.155883943052802, + "grad_norm": 0.2610105574131012, + "learning_rate": 8.405553172547188e-07, + "loss": 0.341, + "step": 7687 + }, + { + "epoch": 4.156424581005586, + "grad_norm": 0.25678926706314087, + "learning_rate": 8.395078961328529e-07, + "loss": 0.3319, + "step": 7688 + }, + { + "epoch": 4.156965218958371, + "grad_norm": 0.2516123354434967, + "learning_rate": 8.384610682064054e-07, + "loss": 0.3575, + "step": 7689 + }, + { + "epoch": 4.157505856911155, + "grad_norm": 0.26414674520492554, + "learning_rate": 8.374148336246352e-07, + "loss": 0.3365, + "step": 7690 + }, + { + "epoch": 4.15804649486394, + "grad_norm": 0.28988468647003174, + "learning_rate": 8.36369192536709e-07, + "loss": 0.3601, + "step": 7691 + }, + { + "epoch": 4.158587132816724, + "grad_norm": 0.2789464294910431, + "learning_rate": 8.353241450917154e-07, + "loss": 0.4034, + "step": 7692 + }, + { + "epoch": 4.1591277707695085, + "grad_norm": 0.2817886173725128, + "learning_rate": 8.342796914386548e-07, + "loss": 0.371, + "step": 7693 + }, + { + "epoch": 4.159668408722292, + "grad_norm": 0.2487310916185379, + "learning_rate": 8.332358317264411e-07, + "loss": 0.3197, + "step": 7694 + }, + { + "epoch": 4.160209046675076, + "grad_norm": 0.2966874837875366, + "learning_rate": 8.321925661039088e-07, + "loss": 0.3647, + "step": 7695 + }, + { + "epoch": 4.160749684627861, + "grad_norm": 0.2805984616279602, + "learning_rate": 8.311498947198037e-07, + "loss": 0.3694, + "step": 7696 + }, + { + "epoch": 4.161290322580645, + "grad_norm": 0.2664489150047302, + "learning_rate": 8.301078177227873e-07, + "loss": 0.3814, + "step": 7697 + }, + { + "epoch": 4.16183096053343, + "grad_norm": 0.2719545066356659, + "learning_rate": 8.290663352614386e-07, + "loss": 0.3699, + "step": 7698 + }, + { + "epoch": 4.162371598486214, + "grad_norm": 0.2809935510158539, + "learning_rate": 8.28025447484248e-07, + "loss": 0.3504, + "step": 7699 + }, + { + "epoch": 4.162912236438998, + "grad_norm": 0.284708708524704, + "learning_rate": 8.269851545396279e-07, + "loss": 0.4177, + "step": 7700 + }, + { + "epoch": 4.1634528743917825, + "grad_norm": 0.27322468161582947, + "learning_rate": 8.259454565758951e-07, + "loss": 0.3687, + "step": 7701 + }, + { + "epoch": 4.163993512344566, + "grad_norm": 0.27562010288238525, + "learning_rate": 8.249063537412926e-07, + "loss": 0.3645, + "step": 7702 + }, + { + "epoch": 4.164534150297351, + "grad_norm": 0.27685800194740295, + "learning_rate": 8.238678461839711e-07, + "loss": 0.3691, + "step": 7703 + }, + { + "epoch": 4.165074788250135, + "grad_norm": 0.28065788745880127, + "learning_rate": 8.228299340520018e-07, + "loss": 0.3663, + "step": 7704 + }, + { + "epoch": 4.165615426202919, + "grad_norm": 0.25994250178337097, + "learning_rate": 8.217926174933665e-07, + "loss": 0.3384, + "step": 7705 + }, + { + "epoch": 4.166156064155704, + "grad_norm": 0.29021185636520386, + "learning_rate": 8.207558966559631e-07, + "loss": 0.3729, + "step": 7706 + }, + { + "epoch": 4.166696702108488, + "grad_norm": 0.27554744482040405, + "learning_rate": 8.197197716876076e-07, + "loss": 0.3611, + "step": 7707 + }, + { + "epoch": 4.1672373400612726, + "grad_norm": 0.2929653525352478, + "learning_rate": 8.186842427360275e-07, + "loss": 0.3552, + "step": 7708 + }, + { + "epoch": 4.1677779780140565, + "grad_norm": 0.2645328640937805, + "learning_rate": 8.176493099488664e-07, + "loss": 0.3384, + "step": 7709 + }, + { + "epoch": 4.168318615966841, + "grad_norm": 0.2650566101074219, + "learning_rate": 8.166149734736845e-07, + "loss": 0.3365, + "step": 7710 + }, + { + "epoch": 4.168859253919625, + "grad_norm": 0.27955883741378784, + "learning_rate": 8.155812334579532e-07, + "loss": 0.3455, + "step": 7711 + }, + { + "epoch": 4.169399891872409, + "grad_norm": 0.2708788812160492, + "learning_rate": 8.145480900490654e-07, + "loss": 0.3581, + "step": 7712 + }, + { + "epoch": 4.169940529825194, + "grad_norm": 0.27975592017173767, + "learning_rate": 8.135155433943199e-07, + "loss": 0.3926, + "step": 7713 + }, + { + "epoch": 4.170481167777978, + "grad_norm": 0.26336678862571716, + "learning_rate": 8.124835936409376e-07, + "loss": 0.3385, + "step": 7714 + }, + { + "epoch": 4.171021805730763, + "grad_norm": 0.24862268567085266, + "learning_rate": 8.114522409360531e-07, + "loss": 0.3613, + "step": 7715 + }, + { + "epoch": 4.171562443683547, + "grad_norm": 0.2469266802072525, + "learning_rate": 8.104214854267134e-07, + "loss": 0.37, + "step": 7716 + }, + { + "epoch": 4.1721030816363305, + "grad_norm": 0.2724321484565735, + "learning_rate": 8.09391327259883e-07, + "loss": 0.4061, + "step": 7717 + }, + { + "epoch": 4.172643719589115, + "grad_norm": 0.2582729458808899, + "learning_rate": 8.083617665824373e-07, + "loss": 0.3288, + "step": 7718 + }, + { + "epoch": 4.173184357541899, + "grad_norm": 0.26399657130241394, + "learning_rate": 8.073328035411726e-07, + "loss": 0.3592, + "step": 7719 + }, + { + "epoch": 4.173724995494684, + "grad_norm": 0.267774760723114, + "learning_rate": 8.063044382827945e-07, + "loss": 0.38, + "step": 7720 + }, + { + "epoch": 4.174265633447468, + "grad_norm": 0.29686951637268066, + "learning_rate": 8.05276670953925e-07, + "loss": 0.3831, + "step": 7721 + }, + { + "epoch": 4.174806271400252, + "grad_norm": 0.27032238245010376, + "learning_rate": 8.042495017011037e-07, + "loss": 0.3454, + "step": 7722 + }, + { + "epoch": 4.175346909353037, + "grad_norm": 0.27771714329719543, + "learning_rate": 8.032229306707795e-07, + "loss": 0.3752, + "step": 7723 + }, + { + "epoch": 4.175887547305821, + "grad_norm": 0.3001413643360138, + "learning_rate": 8.021969580093231e-07, + "loss": 0.3948, + "step": 7724 + }, + { + "epoch": 4.176428185258605, + "grad_norm": 0.26011037826538086, + "learning_rate": 8.011715838630107e-07, + "loss": 0.3121, + "step": 7725 + }, + { + "epoch": 4.176968823211389, + "grad_norm": 0.2860434055328369, + "learning_rate": 8.001468083780418e-07, + "loss": 0.3841, + "step": 7726 + }, + { + "epoch": 4.177509461164174, + "grad_norm": 0.28014615178108215, + "learning_rate": 7.991226317005263e-07, + "loss": 0.3785, + "step": 7727 + }, + { + "epoch": 4.178050099116958, + "grad_norm": 0.27071434259414673, + "learning_rate": 7.980990539764898e-07, + "loss": 0.3471, + "step": 7728 + }, + { + "epoch": 4.178590737069742, + "grad_norm": 0.2784453332424164, + "learning_rate": 7.970760753518713e-07, + "loss": 0.3494, + "step": 7729 + }, + { + "epoch": 4.179131375022527, + "grad_norm": 0.2652834355831146, + "learning_rate": 7.960536959725252e-07, + "loss": 0.3575, + "step": 7730 + }, + { + "epoch": 4.179672012975311, + "grad_norm": 0.2955927848815918, + "learning_rate": 7.950319159842212e-07, + "loss": 0.3861, + "step": 7731 + }, + { + "epoch": 4.1802126509280955, + "grad_norm": 0.2936646640300751, + "learning_rate": 7.940107355326431e-07, + "loss": 0.3866, + "step": 7732 + }, + { + "epoch": 4.180753288880879, + "grad_norm": 0.2582504153251648, + "learning_rate": 7.929901547633867e-07, + "loss": 0.3455, + "step": 7733 + }, + { + "epoch": 4.181293926833663, + "grad_norm": 0.29754340648651123, + "learning_rate": 7.919701738219677e-07, + "loss": 0.3326, + "step": 7734 + }, + { + "epoch": 4.181834564786448, + "grad_norm": 0.27368760108947754, + "learning_rate": 7.909507928538107e-07, + "loss": 0.3812, + "step": 7735 + }, + { + "epoch": 4.182375202739232, + "grad_norm": 0.2942587733268738, + "learning_rate": 7.899320120042592e-07, + "loss": 0.3581, + "step": 7736 + }, + { + "epoch": 4.182915840692017, + "grad_norm": 0.27439722418785095, + "learning_rate": 7.88913831418568e-07, + "loss": 0.3367, + "step": 7737 + }, + { + "epoch": 4.183456478644801, + "grad_norm": 0.2757376730442047, + "learning_rate": 7.878962512419064e-07, + "loss": 0.3921, + "step": 7738 + }, + { + "epoch": 4.1839971165975856, + "grad_norm": 0.25311020016670227, + "learning_rate": 7.868792716193613e-07, + "loss": 0.3511, + "step": 7739 + }, + { + "epoch": 4.1845377545503695, + "grad_norm": 0.2917262017726898, + "learning_rate": 7.858628926959311e-07, + "loss": 0.408, + "step": 7740 + }, + { + "epoch": 4.185078392503153, + "grad_norm": 0.26574715971946716, + "learning_rate": 7.848471146165287e-07, + "loss": 0.3321, + "step": 7741 + }, + { + "epoch": 4.185619030455938, + "grad_norm": 0.2669033110141754, + "learning_rate": 7.838319375259806e-07, + "loss": 0.3482, + "step": 7742 + }, + { + "epoch": 4.186159668408722, + "grad_norm": 0.29009807109832764, + "learning_rate": 7.828173615690309e-07, + "loss": 0.3863, + "step": 7743 + }, + { + "epoch": 4.186700306361507, + "grad_norm": 0.278519868850708, + "learning_rate": 7.81803386890338e-07, + "loss": 0.3781, + "step": 7744 + }, + { + "epoch": 4.187240944314291, + "grad_norm": 0.2703883945941925, + "learning_rate": 7.807900136344676e-07, + "loss": 0.3731, + "step": 7745 + }, + { + "epoch": 4.187781582267075, + "grad_norm": 0.2684577405452728, + "learning_rate": 7.797772419459082e-07, + "loss": 0.3341, + "step": 7746 + }, + { + "epoch": 4.18832222021986, + "grad_norm": 0.2858303189277649, + "learning_rate": 7.78765071969057e-07, + "loss": 0.3782, + "step": 7747 + }, + { + "epoch": 4.1888628581726435, + "grad_norm": 0.24063695967197418, + "learning_rate": 7.777535038482293e-07, + "loss": 0.3238, + "step": 7748 + }, + { + "epoch": 4.189403496125428, + "grad_norm": 0.2742394506931305, + "learning_rate": 7.767425377276516e-07, + "loss": 0.3743, + "step": 7749 + }, + { + "epoch": 4.189944134078212, + "grad_norm": 0.26466435194015503, + "learning_rate": 7.757321737514645e-07, + "loss": 0.3632, + "step": 7750 + }, + { + "epoch": 4.190484772030997, + "grad_norm": 0.2548874616622925, + "learning_rate": 7.747224120637265e-07, + "loss": 0.3337, + "step": 7751 + }, + { + "epoch": 4.191025409983781, + "grad_norm": 0.2508438229560852, + "learning_rate": 7.73713252808404e-07, + "loss": 0.3469, + "step": 7752 + }, + { + "epoch": 4.191566047936565, + "grad_norm": 0.274068146944046, + "learning_rate": 7.727046961293849e-07, + "loss": 0.4007, + "step": 7753 + }, + { + "epoch": 4.19210668588935, + "grad_norm": 0.2605467140674591, + "learning_rate": 7.716967421704658e-07, + "loss": 0.3373, + "step": 7754 + }, + { + "epoch": 4.192647323842134, + "grad_norm": 0.269639253616333, + "learning_rate": 7.706893910753571e-07, + "loss": 0.3881, + "step": 7755 + }, + { + "epoch": 4.193187961794918, + "grad_norm": 0.26445725560188293, + "learning_rate": 7.696826429876885e-07, + "loss": 0.3579, + "step": 7756 + }, + { + "epoch": 4.193728599747702, + "grad_norm": 0.2784506380558014, + "learning_rate": 7.686764980509986e-07, + "loss": 0.3451, + "step": 7757 + }, + { + "epoch": 4.194269237700486, + "grad_norm": 0.29687944054603577, + "learning_rate": 7.676709564087414e-07, + "loss": 0.3371, + "step": 7758 + }, + { + "epoch": 4.194809875653271, + "grad_norm": 0.25223809480667114, + "learning_rate": 7.666660182042845e-07, + "loss": 0.3639, + "step": 7759 + }, + { + "epoch": 4.195350513606055, + "grad_norm": 0.25930145382881165, + "learning_rate": 7.656616835809122e-07, + "loss": 0.383, + "step": 7760 + }, + { + "epoch": 4.19589115155884, + "grad_norm": 0.2787495255470276, + "learning_rate": 7.646579526818198e-07, + "loss": 0.3904, + "step": 7761 + }, + { + "epoch": 4.196431789511624, + "grad_norm": 0.25165069103240967, + "learning_rate": 7.636548256501164e-07, + "loss": 0.3399, + "step": 7762 + }, + { + "epoch": 4.196972427464408, + "grad_norm": 0.2803820073604584, + "learning_rate": 7.626523026288279e-07, + "loss": 0.369, + "step": 7763 + }, + { + "epoch": 4.197513065417192, + "grad_norm": 0.2803477942943573, + "learning_rate": 7.616503837608907e-07, + "loss": 0.3955, + "step": 7764 + }, + { + "epoch": 4.198053703369976, + "grad_norm": 0.25558480620384216, + "learning_rate": 7.606490691891577e-07, + "loss": 0.3223, + "step": 7765 + }, + { + "epoch": 4.198594341322761, + "grad_norm": 0.26520413160324097, + "learning_rate": 7.596483590563942e-07, + "loss": 0.3362, + "step": 7766 + }, + { + "epoch": 4.199134979275545, + "grad_norm": 0.2912404537200928, + "learning_rate": 7.586482535052781e-07, + "loss": 0.4001, + "step": 7767 + }, + { + "epoch": 4.19967561722833, + "grad_norm": 0.2695380747318268, + "learning_rate": 7.576487526784054e-07, + "loss": 0.3304, + "step": 7768 + }, + { + "epoch": 4.200216255181114, + "grad_norm": 0.280534029006958, + "learning_rate": 7.566498567182812e-07, + "loss": 0.381, + "step": 7769 + }, + { + "epoch": 4.200756893133898, + "grad_norm": 0.24744150042533875, + "learning_rate": 7.556515657673274e-07, + "loss": 0.3198, + "step": 7770 + }, + { + "epoch": 4.2012975310866825, + "grad_norm": 0.2521040439605713, + "learning_rate": 7.54653879967876e-07, + "loss": 0.3515, + "step": 7771 + }, + { + "epoch": 4.201838169039466, + "grad_norm": 0.2577064037322998, + "learning_rate": 7.536567994621774e-07, + "loss": 0.3666, + "step": 7772 + }, + { + "epoch": 4.202378806992251, + "grad_norm": 0.26439905166625977, + "learning_rate": 7.526603243923958e-07, + "loss": 0.3363, + "step": 7773 + }, + { + "epoch": 4.202919444945035, + "grad_norm": 0.2696458399295807, + "learning_rate": 7.516644549006019e-07, + "loss": 0.3617, + "step": 7774 + }, + { + "epoch": 4.203460082897819, + "grad_norm": 0.3011111915111542, + "learning_rate": 7.506691911287883e-07, + "loss": 0.3981, + "step": 7775 + }, + { + "epoch": 4.204000720850604, + "grad_norm": 0.26315832138061523, + "learning_rate": 7.496745332188555e-07, + "loss": 0.3306, + "step": 7776 + }, + { + "epoch": 4.204541358803388, + "grad_norm": 0.27317333221435547, + "learning_rate": 7.486804813126224e-07, + "loss": 0.3573, + "step": 7777 + }, + { + "epoch": 4.205081996756173, + "grad_norm": 0.26644495129585266, + "learning_rate": 7.47687035551819e-07, + "loss": 0.322, + "step": 7778 + }, + { + "epoch": 4.2056226347089565, + "grad_norm": 0.2756754159927368, + "learning_rate": 7.466941960780866e-07, + "loss": 0.3571, + "step": 7779 + }, + { + "epoch": 4.206163272661741, + "grad_norm": 0.2685154676437378, + "learning_rate": 7.457019630329848e-07, + "loss": 0.3729, + "step": 7780 + }, + { + "epoch": 4.206703910614525, + "grad_norm": 0.25748974084854126, + "learning_rate": 7.447103365579839e-07, + "loss": 0.3518, + "step": 7781 + }, + { + "epoch": 4.207244548567309, + "grad_norm": 0.28093305230140686, + "learning_rate": 7.437193167944668e-07, + "loss": 0.367, + "step": 7782 + }, + { + "epoch": 4.207785186520094, + "grad_norm": 0.26227885484695435, + "learning_rate": 7.427289038837332e-07, + "loss": 0.3396, + "step": 7783 + }, + { + "epoch": 4.208325824472878, + "grad_norm": 0.28405529260635376, + "learning_rate": 7.417390979669925e-07, + "loss": 0.3752, + "step": 7784 + }, + { + "epoch": 4.208866462425663, + "grad_norm": 0.27451229095458984, + "learning_rate": 7.407498991853729e-07, + "loss": 0.3739, + "step": 7785 + }, + { + "epoch": 4.209407100378447, + "grad_norm": 0.26741325855255127, + "learning_rate": 7.397613076799082e-07, + "loss": 0.3477, + "step": 7786 + }, + { + "epoch": 4.2099477383312305, + "grad_norm": 0.26035529375076294, + "learning_rate": 7.387733235915528e-07, + "loss": 0.3541, + "step": 7787 + }, + { + "epoch": 4.210488376284015, + "grad_norm": 0.30314940214157104, + "learning_rate": 7.377859470611692e-07, + "loss": 0.405, + "step": 7788 + }, + { + "epoch": 4.211029014236799, + "grad_norm": 0.26151275634765625, + "learning_rate": 7.367991782295392e-07, + "loss": 0.3248, + "step": 7789 + }, + { + "epoch": 4.211569652189584, + "grad_norm": 0.2777373492717743, + "learning_rate": 7.358130172373523e-07, + "loss": 0.3692, + "step": 7790 + }, + { + "epoch": 4.212110290142368, + "grad_norm": 0.2807091176509857, + "learning_rate": 7.348274642252129e-07, + "loss": 0.3795, + "step": 7791 + }, + { + "epoch": 4.212650928095153, + "grad_norm": 0.2608586251735687, + "learning_rate": 7.338425193336418e-07, + "loss": 0.334, + "step": 7792 + }, + { + "epoch": 4.213191566047937, + "grad_norm": 0.27377480268478394, + "learning_rate": 7.328581827030689e-07, + "loss": 0.3557, + "step": 7793 + }, + { + "epoch": 4.213732204000721, + "grad_norm": 0.2679743766784668, + "learning_rate": 7.318744544738387e-07, + "loss": 0.3732, + "step": 7794 + }, + { + "epoch": 4.214272841953505, + "grad_norm": 0.2861184775829315, + "learning_rate": 7.308913347862112e-07, + "loss": 0.3551, + "step": 7795 + }, + { + "epoch": 4.214813479906289, + "grad_norm": 0.2709786891937256, + "learning_rate": 7.299088237803559e-07, + "loss": 0.3327, + "step": 7796 + }, + { + "epoch": 4.215354117859074, + "grad_norm": 0.2617168426513672, + "learning_rate": 7.289269215963602e-07, + "loss": 0.3601, + "step": 7797 + }, + { + "epoch": 4.215894755811858, + "grad_norm": 0.24455086886882782, + "learning_rate": 7.279456283742175e-07, + "loss": 0.356, + "step": 7798 + }, + { + "epoch": 4.216435393764642, + "grad_norm": 0.261747807264328, + "learning_rate": 7.269649442538435e-07, + "loss": 0.3554, + "step": 7799 + }, + { + "epoch": 4.216976031717427, + "grad_norm": 0.2773091197013855, + "learning_rate": 7.259848693750582e-07, + "loss": 0.3502, + "step": 7800 + }, + { + "epoch": 4.217516669670211, + "grad_norm": 0.29389703273773193, + "learning_rate": 7.250054038776022e-07, + "loss": 0.4092, + "step": 7801 + }, + { + "epoch": 4.2180573076229955, + "grad_norm": 0.26419511437416077, + "learning_rate": 7.240265479011249e-07, + "loss": 0.3365, + "step": 7802 + }, + { + "epoch": 4.218597945575779, + "grad_norm": 0.2726242244243622, + "learning_rate": 7.230483015851886e-07, + "loss": 0.375, + "step": 7803 + }, + { + "epoch": 4.219138583528563, + "grad_norm": 0.2631964087486267, + "learning_rate": 7.220706650692716e-07, + "loss": 0.3767, + "step": 7804 + }, + { + "epoch": 4.219679221481348, + "grad_norm": 0.2645455002784729, + "learning_rate": 7.210936384927631e-07, + "loss": 0.3748, + "step": 7805 + }, + { + "epoch": 4.220219859434132, + "grad_norm": 0.2717190086841583, + "learning_rate": 7.201172219949643e-07, + "loss": 0.363, + "step": 7806 + }, + { + "epoch": 4.220760497386917, + "grad_norm": 0.2631307542324066, + "learning_rate": 7.191414157150933e-07, + "loss": 0.3387, + "step": 7807 + }, + { + "epoch": 4.221301135339701, + "grad_norm": 0.27695712447166443, + "learning_rate": 7.181662197922762e-07, + "loss": 0.3628, + "step": 7808 + }, + { + "epoch": 4.2218417732924856, + "grad_norm": 0.2771584689617157, + "learning_rate": 7.171916343655583e-07, + "loss": 0.3686, + "step": 7809 + }, + { + "epoch": 4.2223824112452695, + "grad_norm": 0.29694145917892456, + "learning_rate": 7.162176595738895e-07, + "loss": 0.4222, + "step": 7810 + }, + { + "epoch": 4.222923049198053, + "grad_norm": 0.26398229598999023, + "learning_rate": 7.1524429555614e-07, + "loss": 0.3479, + "step": 7811 + }, + { + "epoch": 4.223463687150838, + "grad_norm": 0.288509726524353, + "learning_rate": 7.142715424510915e-07, + "loss": 0.397, + "step": 7812 + }, + { + "epoch": 4.224004325103622, + "grad_norm": 0.2654188275337219, + "learning_rate": 7.132994003974359e-07, + "loss": 0.32, + "step": 7813 + }, + { + "epoch": 4.224544963056407, + "grad_norm": 0.28310319781303406, + "learning_rate": 7.123278695337793e-07, + "loss": 0.3652, + "step": 7814 + }, + { + "epoch": 4.225085601009191, + "grad_norm": 0.2651018500328064, + "learning_rate": 7.113569499986401e-07, + "loss": 0.3697, + "step": 7815 + }, + { + "epoch": 4.225626238961975, + "grad_norm": 0.27345407009124756, + "learning_rate": 7.103866419304517e-07, + "loss": 0.3935, + "step": 7816 + }, + { + "epoch": 4.22616687691476, + "grad_norm": 0.24579817056655884, + "learning_rate": 7.094169454675575e-07, + "loss": 0.3125, + "step": 7817 + }, + { + "epoch": 4.2267075148675435, + "grad_norm": 0.284580796957016, + "learning_rate": 7.084478607482176e-07, + "loss": 0.3887, + "step": 7818 + }, + { + "epoch": 4.227248152820328, + "grad_norm": 0.2559194564819336, + "learning_rate": 7.074793879106001e-07, + "loss": 0.3079, + "step": 7819 + }, + { + "epoch": 4.227788790773112, + "grad_norm": 0.28715476393699646, + "learning_rate": 7.065115270927875e-07, + "loss": 0.3845, + "step": 7820 + }, + { + "epoch": 4.228329428725896, + "grad_norm": 0.26812219619750977, + "learning_rate": 7.055442784327782e-07, + "loss": 0.3495, + "step": 7821 + }, + { + "epoch": 4.228870066678681, + "grad_norm": 0.2592289447784424, + "learning_rate": 7.045776420684791e-07, + "loss": 0.3553, + "step": 7822 + }, + { + "epoch": 4.229410704631465, + "grad_norm": 0.24718797206878662, + "learning_rate": 7.036116181377106e-07, + "loss": 0.341, + "step": 7823 + }, + { + "epoch": 4.22995134258425, + "grad_norm": 0.26759886741638184, + "learning_rate": 7.026462067782086e-07, + "loss": 0.3739, + "step": 7824 + }, + { + "epoch": 4.230491980537034, + "grad_norm": 0.25544220209121704, + "learning_rate": 7.01681408127618e-07, + "loss": 0.3363, + "step": 7825 + }, + { + "epoch": 4.231032618489818, + "grad_norm": 0.2759501039981842, + "learning_rate": 7.00717222323501e-07, + "loss": 0.4003, + "step": 7826 + }, + { + "epoch": 4.231573256442602, + "grad_norm": 0.26685962080955505, + "learning_rate": 6.997536495033252e-07, + "loss": 0.3538, + "step": 7827 + }, + { + "epoch": 4.232113894395386, + "grad_norm": 0.28557097911834717, + "learning_rate": 6.987906898044783e-07, + "loss": 0.3911, + "step": 7828 + }, + { + "epoch": 4.232654532348171, + "grad_norm": 0.25702571868896484, + "learning_rate": 6.978283433642552e-07, + "loss": 0.3415, + "step": 7829 + }, + { + "epoch": 4.233195170300955, + "grad_norm": 0.2851904630661011, + "learning_rate": 6.968666103198679e-07, + "loss": 0.3892, + "step": 7830 + }, + { + "epoch": 4.23373580825374, + "grad_norm": 0.27647873759269714, + "learning_rate": 6.959054908084367e-07, + "loss": 0.3471, + "step": 7831 + }, + { + "epoch": 4.234276446206524, + "grad_norm": 0.2786784768104553, + "learning_rate": 6.949449849669965e-07, + "loss": 0.3326, + "step": 7832 + }, + { + "epoch": 4.234817084159308, + "grad_norm": 0.2770426571369171, + "learning_rate": 6.939850929324954e-07, + "loss": 0.3848, + "step": 7833 + }, + { + "epoch": 4.235357722112092, + "grad_norm": 0.2511410117149353, + "learning_rate": 6.930258148417924e-07, + "loss": 0.3682, + "step": 7834 + }, + { + "epoch": 4.235898360064876, + "grad_norm": 0.26557570695877075, + "learning_rate": 6.920671508316584e-07, + "loss": 0.379, + "step": 7835 + }, + { + "epoch": 4.236438998017661, + "grad_norm": 0.26921749114990234, + "learning_rate": 6.911091010387805e-07, + "loss": 0.347, + "step": 7836 + }, + { + "epoch": 4.236979635970445, + "grad_norm": 0.2844080924987793, + "learning_rate": 6.901516655997536e-07, + "loss": 0.3849, + "step": 7837 + }, + { + "epoch": 4.23752027392323, + "grad_norm": 0.27364280819892883, + "learning_rate": 6.891948446510899e-07, + "loss": 0.3396, + "step": 7838 + }, + { + "epoch": 4.238060911876014, + "grad_norm": 0.29257693886756897, + "learning_rate": 6.882386383292072e-07, + "loss": 0.4091, + "step": 7839 + }, + { + "epoch": 4.238601549828798, + "grad_norm": 0.26295551657676697, + "learning_rate": 6.872830467704417e-07, + "loss": 0.3373, + "step": 7840 + }, + { + "epoch": 4.2391421877815825, + "grad_norm": 0.2602781653404236, + "learning_rate": 6.863280701110409e-07, + "loss": 0.359, + "step": 7841 + }, + { + "epoch": 4.239682825734366, + "grad_norm": 0.2525506019592285, + "learning_rate": 6.853737084871631e-07, + "loss": 0.3564, + "step": 7842 + }, + { + "epoch": 4.240223463687151, + "grad_norm": 0.2838027775287628, + "learning_rate": 6.844199620348784e-07, + "loss": 0.386, + "step": 7843 + }, + { + "epoch": 4.240764101639935, + "grad_norm": 0.2779183089733124, + "learning_rate": 6.834668308901704e-07, + "loss": 0.3735, + "step": 7844 + }, + { + "epoch": 4.241304739592719, + "grad_norm": 0.26235607266426086, + "learning_rate": 6.825143151889358e-07, + "loss": 0.3564, + "step": 7845 + }, + { + "epoch": 4.241845377545504, + "grad_norm": 0.25980979204177856, + "learning_rate": 6.815624150669825e-07, + "loss": 0.3438, + "step": 7846 + }, + { + "epoch": 4.242386015498288, + "grad_norm": 0.2772977948188782, + "learning_rate": 6.806111306600289e-07, + "loss": 0.379, + "step": 7847 + }, + { + "epoch": 4.242926653451073, + "grad_norm": 0.2637321650981903, + "learning_rate": 6.7966046210371e-07, + "loss": 0.3652, + "step": 7848 + }, + { + "epoch": 4.2434672914038565, + "grad_norm": 0.28105807304382324, + "learning_rate": 6.787104095335678e-07, + "loss": 0.3663, + "step": 7849 + }, + { + "epoch": 4.244007929356641, + "grad_norm": 0.28423410654067993, + "learning_rate": 6.777609730850615e-07, + "loss": 0.3369, + "step": 7850 + }, + { + "epoch": 4.244548567309425, + "grad_norm": 0.2776162326335907, + "learning_rate": 6.768121528935595e-07, + "loss": 0.4145, + "step": 7851 + }, + { + "epoch": 4.245089205262209, + "grad_norm": 0.2545382082462311, + "learning_rate": 6.758639490943408e-07, + "loss": 0.3385, + "step": 7852 + }, + { + "epoch": 4.245629843214994, + "grad_norm": 0.273418664932251, + "learning_rate": 6.749163618226006e-07, + "loss": 0.37, + "step": 7853 + }, + { + "epoch": 4.246170481167778, + "grad_norm": 0.2679308354854584, + "learning_rate": 6.739693912134448e-07, + "loss": 0.3502, + "step": 7854 + }, + { + "epoch": 4.246711119120563, + "grad_norm": 0.29377102851867676, + "learning_rate": 6.730230374018886e-07, + "loss": 0.3969, + "step": 7855 + }, + { + "epoch": 4.247251757073347, + "grad_norm": 0.2562815546989441, + "learning_rate": 6.72077300522862e-07, + "loss": 0.3387, + "step": 7856 + }, + { + "epoch": 4.2477923950261305, + "grad_norm": 0.2699720859527588, + "learning_rate": 6.711321807112076e-07, + "loss": 0.3437, + "step": 7857 + }, + { + "epoch": 4.248333032978915, + "grad_norm": 0.27340906858444214, + "learning_rate": 6.701876781016786e-07, + "loss": 0.3707, + "step": 7858 + }, + { + "epoch": 4.248873670931699, + "grad_norm": 0.2673344314098358, + "learning_rate": 6.692437928289385e-07, + "loss": 0.3436, + "step": 7859 + }, + { + "epoch": 4.249414308884484, + "grad_norm": 0.2760732173919678, + "learning_rate": 6.683005250275676e-07, + "loss": 0.3901, + "step": 7860 + }, + { + "epoch": 4.249954946837268, + "grad_norm": 0.262892484664917, + "learning_rate": 6.673578748320525e-07, + "loss": 0.3256, + "step": 7861 + }, + { + "epoch": 4.250495584790052, + "grad_norm": 0.26716530323028564, + "learning_rate": 6.664158423767975e-07, + "loss": 0.341, + "step": 7862 + }, + { + "epoch": 4.251036222742837, + "grad_norm": 0.2554490864276886, + "learning_rate": 6.654744277961139e-07, + "loss": 0.3437, + "step": 7863 + }, + { + "epoch": 4.251576860695621, + "grad_norm": 0.2965814471244812, + "learning_rate": 6.645336312242267e-07, + "loss": 0.4271, + "step": 7864 + }, + { + "epoch": 4.252117498648405, + "grad_norm": 0.2557962238788605, + "learning_rate": 6.635934527952747e-07, + "loss": 0.3366, + "step": 7865 + }, + { + "epoch": 4.252658136601189, + "grad_norm": 0.27538374066352844, + "learning_rate": 6.626538926433057e-07, + "loss": 0.3835, + "step": 7866 + }, + { + "epoch": 4.253198774553974, + "grad_norm": 0.27699223160743713, + "learning_rate": 6.617149509022807e-07, + "loss": 0.3508, + "step": 7867 + }, + { + "epoch": 4.253739412506758, + "grad_norm": 0.28138530254364014, + "learning_rate": 6.607766277060712e-07, + "loss": 0.3594, + "step": 7868 + }, + { + "epoch": 4.254280050459542, + "grad_norm": 0.2613767385482788, + "learning_rate": 6.598389231884628e-07, + "loss": 0.3442, + "step": 7869 + }, + { + "epoch": 4.254820688412327, + "grad_norm": 0.2648288607597351, + "learning_rate": 6.589018374831529e-07, + "loss": 0.3515, + "step": 7870 + }, + { + "epoch": 4.255361326365111, + "grad_norm": 0.2746146321296692, + "learning_rate": 6.579653707237465e-07, + "loss": 0.3661, + "step": 7871 + }, + { + "epoch": 4.2559019643178955, + "grad_norm": 0.2693077623844147, + "learning_rate": 6.570295230437663e-07, + "loss": 0.3794, + "step": 7872 + }, + { + "epoch": 4.256442602270679, + "grad_norm": 0.265910267829895, + "learning_rate": 6.560942945766408e-07, + "loss": 0.3581, + "step": 7873 + }, + { + "epoch": 4.256983240223463, + "grad_norm": 0.27036747336387634, + "learning_rate": 6.551596854557158e-07, + "loss": 0.3578, + "step": 7874 + }, + { + "epoch": 4.257523878176248, + "grad_norm": 0.2775130569934845, + "learning_rate": 6.542256958142456e-07, + "loss": 0.3833, + "step": 7875 + }, + { + "epoch": 4.258064516129032, + "grad_norm": 0.2686364948749542, + "learning_rate": 6.532923257853952e-07, + "loss": 0.3435, + "step": 7876 + }, + { + "epoch": 4.258605154081817, + "grad_norm": 0.27295759320259094, + "learning_rate": 6.523595755022444e-07, + "loss": 0.3478, + "step": 7877 + }, + { + "epoch": 4.259145792034601, + "grad_norm": 0.28856146335601807, + "learning_rate": 6.514274450977831e-07, + "loss": 0.3973, + "step": 7878 + }, + { + "epoch": 4.259686429987385, + "grad_norm": 0.26978960633277893, + "learning_rate": 6.504959347049111e-07, + "loss": 0.3632, + "step": 7879 + }, + { + "epoch": 4.2602270679401695, + "grad_norm": 0.28080758452415466, + "learning_rate": 6.495650444564433e-07, + "loss": 0.3511, + "step": 7880 + }, + { + "epoch": 4.260767705892953, + "grad_norm": 0.2930925786495209, + "learning_rate": 6.486347744851029e-07, + "loss": 0.3753, + "step": 7881 + }, + { + "epoch": 4.261308343845738, + "grad_norm": 0.26841792464256287, + "learning_rate": 6.477051249235278e-07, + "loss": 0.3594, + "step": 7882 + }, + { + "epoch": 4.261848981798522, + "grad_norm": 0.2728954553604126, + "learning_rate": 6.467760959042646e-07, + "loss": 0.3729, + "step": 7883 + }, + { + "epoch": 4.262389619751307, + "grad_norm": 0.2672972083091736, + "learning_rate": 6.458476875597731e-07, + "loss": 0.357, + "step": 7884 + }, + { + "epoch": 4.262930257704091, + "grad_norm": 0.2825981378555298, + "learning_rate": 6.449199000224221e-07, + "loss": 0.3432, + "step": 7885 + }, + { + "epoch": 4.263470895656875, + "grad_norm": 0.2652915120124817, + "learning_rate": 6.439927334244972e-07, + "loss": 0.3693, + "step": 7886 + }, + { + "epoch": 4.26401153360966, + "grad_norm": 0.25142279267311096, + "learning_rate": 6.430661878981898e-07, + "loss": 0.3296, + "step": 7887 + }, + { + "epoch": 4.2645521715624435, + "grad_norm": 0.27410030364990234, + "learning_rate": 6.421402635756053e-07, + "loss": 0.3734, + "step": 7888 + }, + { + "epoch": 4.265092809515228, + "grad_norm": 0.2869071662425995, + "learning_rate": 6.412149605887613e-07, + "loss": 0.3995, + "step": 7889 + }, + { + "epoch": 4.265633447468012, + "grad_norm": 0.25402575731277466, + "learning_rate": 6.402902790695842e-07, + "loss": 0.3128, + "step": 7890 + }, + { + "epoch": 4.266174085420796, + "grad_norm": 0.27300745248794556, + "learning_rate": 6.393662191499156e-07, + "loss": 0.3782, + "step": 7891 + }, + { + "epoch": 4.266714723373581, + "grad_norm": 0.26714909076690674, + "learning_rate": 6.384427809615052e-07, + "loss": 0.3728, + "step": 7892 + }, + { + "epoch": 4.267255361326365, + "grad_norm": 0.27628183364868164, + "learning_rate": 6.375199646360142e-07, + "loss": 0.375, + "step": 7893 + }, + { + "epoch": 4.26779599927915, + "grad_norm": 0.26573359966278076, + "learning_rate": 6.36597770305018e-07, + "loss": 0.3544, + "step": 7894 + }, + { + "epoch": 4.268336637231934, + "grad_norm": 0.2471429854631424, + "learning_rate": 6.356761980999998e-07, + "loss": 0.3516, + "step": 7895 + }, + { + "epoch": 4.268877275184718, + "grad_norm": 0.2759144604206085, + "learning_rate": 6.347552481523567e-07, + "loss": 0.3688, + "step": 7896 + }, + { + "epoch": 4.269417913137502, + "grad_norm": 0.2551169991493225, + "learning_rate": 6.338349205933947e-07, + "loss": 0.3372, + "step": 7897 + }, + { + "epoch": 4.269958551090286, + "grad_norm": 0.2697770893573761, + "learning_rate": 6.329152155543333e-07, + "loss": 0.3766, + "step": 7898 + }, + { + "epoch": 4.270499189043071, + "grad_norm": 0.25746408104896545, + "learning_rate": 6.319961331663043e-07, + "loss": 0.3511, + "step": 7899 + }, + { + "epoch": 4.271039826995855, + "grad_norm": 0.2764779031276703, + "learning_rate": 6.310776735603452e-07, + "loss": 0.3884, + "step": 7900 + }, + { + "epoch": 4.27158046494864, + "grad_norm": 0.25703078508377075, + "learning_rate": 6.301598368674106e-07, + "loss": 0.3577, + "step": 7901 + }, + { + "epoch": 4.272121102901424, + "grad_norm": 0.2684788107872009, + "learning_rate": 6.29242623218363e-07, + "loss": 0.3632, + "step": 7902 + }, + { + "epoch": 4.272661740854208, + "grad_norm": 0.2789716422557831, + "learning_rate": 6.283260327439777e-07, + "loss": 0.3806, + "step": 7903 + }, + { + "epoch": 4.273202378806992, + "grad_norm": 0.2688591182231903, + "learning_rate": 6.27410065574941e-07, + "loss": 0.3551, + "step": 7904 + }, + { + "epoch": 4.273743016759776, + "grad_norm": 0.29315346479415894, + "learning_rate": 6.264947218418482e-07, + "loss": 0.3705, + "step": 7905 + }, + { + "epoch": 4.274283654712561, + "grad_norm": 0.2573510408401489, + "learning_rate": 6.255800016752089e-07, + "loss": 0.3632, + "step": 7906 + }, + { + "epoch": 4.274824292665345, + "grad_norm": 0.2660392224788666, + "learning_rate": 6.246659052054416e-07, + "loss": 0.3503, + "step": 7907 + }, + { + "epoch": 4.27536493061813, + "grad_norm": 0.26365071535110474, + "learning_rate": 6.237524325628757e-07, + "loss": 0.3484, + "step": 7908 + }, + { + "epoch": 4.275905568570914, + "grad_norm": 0.2605839669704437, + "learning_rate": 6.228395838777545e-07, + "loss": 0.3662, + "step": 7909 + }, + { + "epoch": 4.276446206523698, + "grad_norm": 0.2649060785770416, + "learning_rate": 6.219273592802278e-07, + "loss": 0.3571, + "step": 7910 + }, + { + "epoch": 4.2769868444764825, + "grad_norm": 0.26063892245292664, + "learning_rate": 6.210157589003624e-07, + "loss": 0.3796, + "step": 7911 + }, + { + "epoch": 4.277527482429266, + "grad_norm": 0.24683474004268646, + "learning_rate": 6.201047828681289e-07, + "loss": 0.3394, + "step": 7912 + }, + { + "epoch": 4.278068120382051, + "grad_norm": 0.26715049147605896, + "learning_rate": 6.191944313134146e-07, + "loss": 0.3765, + "step": 7913 + }, + { + "epoch": 4.278608758334835, + "grad_norm": 0.27711889147758484, + "learning_rate": 6.182847043660145e-07, + "loss": 0.3673, + "step": 7914 + }, + { + "epoch": 4.279149396287619, + "grad_norm": 0.2727544605731964, + "learning_rate": 6.173756021556377e-07, + "loss": 0.3727, + "step": 7915 + }, + { + "epoch": 4.279690034240404, + "grad_norm": 0.26233989000320435, + "learning_rate": 6.164671248119014e-07, + "loss": 0.3434, + "step": 7916 + }, + { + "epoch": 4.280230672193188, + "grad_norm": 0.2919680178165436, + "learning_rate": 6.155592724643339e-07, + "loss": 0.3797, + "step": 7917 + }, + { + "epoch": 4.280771310145973, + "grad_norm": 0.26716017723083496, + "learning_rate": 6.146520452423765e-07, + "loss": 0.3695, + "step": 7918 + }, + { + "epoch": 4.2813119480987565, + "grad_norm": 0.2504497766494751, + "learning_rate": 6.137454432753798e-07, + "loss": 0.3341, + "step": 7919 + }, + { + "epoch": 4.28185258605154, + "grad_norm": 0.2768787741661072, + "learning_rate": 6.128394666926035e-07, + "loss": 0.3859, + "step": 7920 + }, + { + "epoch": 4.282393224004325, + "grad_norm": 0.2738492488861084, + "learning_rate": 6.119341156232228e-07, + "loss": 0.4069, + "step": 7921 + }, + { + "epoch": 4.282933861957109, + "grad_norm": 0.2559797167778015, + "learning_rate": 6.110293901963188e-07, + "loss": 0.3368, + "step": 7922 + }, + { + "epoch": 4.283474499909894, + "grad_norm": 0.26948344707489014, + "learning_rate": 6.101252905408883e-07, + "loss": 0.3714, + "step": 7923 + }, + { + "epoch": 4.284015137862678, + "grad_norm": 0.2513277530670166, + "learning_rate": 6.092218167858327e-07, + "loss": 0.3311, + "step": 7924 + }, + { + "epoch": 4.284555775815463, + "grad_norm": 0.27394694089889526, + "learning_rate": 6.083189690599712e-07, + "loss": 0.3546, + "step": 7925 + }, + { + "epoch": 4.285096413768247, + "grad_norm": 0.27439451217651367, + "learning_rate": 6.074167474920267e-07, + "loss": 0.369, + "step": 7926 + }, + { + "epoch": 4.2856370517210305, + "grad_norm": 0.27608034014701843, + "learning_rate": 6.065151522106394e-07, + "loss": 0.411, + "step": 7927 + }, + { + "epoch": 4.286177689673815, + "grad_norm": 0.24546043574810028, + "learning_rate": 6.056141833443552e-07, + "loss": 0.3314, + "step": 7928 + }, + { + "epoch": 4.286718327626599, + "grad_norm": 0.28318652510643005, + "learning_rate": 6.04713841021633e-07, + "loss": 0.371, + "step": 7929 + }, + { + "epoch": 4.287258965579384, + "grad_norm": 0.2628489136695862, + "learning_rate": 6.038141253708429e-07, + "loss": 0.3466, + "step": 7930 + }, + { + "epoch": 4.287799603532168, + "grad_norm": 0.2661830186843872, + "learning_rate": 6.02915036520264e-07, + "loss": 0.3781, + "step": 7931 + }, + { + "epoch": 4.288340241484952, + "grad_norm": 0.25960102677345276, + "learning_rate": 6.020165745980855e-07, + "loss": 0.3274, + "step": 7932 + }, + { + "epoch": 4.288880879437737, + "grad_norm": 0.262113094329834, + "learning_rate": 6.011187397324114e-07, + "loss": 0.3639, + "step": 7933 + }, + { + "epoch": 4.289421517390521, + "grad_norm": 0.28189143538475037, + "learning_rate": 6.0022153205125e-07, + "loss": 0.3905, + "step": 7934 + }, + { + "epoch": 4.289962155343305, + "grad_norm": 0.273446649312973, + "learning_rate": 5.993249516825278e-07, + "loss": 0.3563, + "step": 7935 + }, + { + "epoch": 4.290502793296089, + "grad_norm": 0.2756039798259735, + "learning_rate": 5.984289987540726e-07, + "loss": 0.3835, + "step": 7936 + }, + { + "epoch": 4.291043431248873, + "grad_norm": 0.2741425931453705, + "learning_rate": 5.975336733936305e-07, + "loss": 0.3448, + "step": 7937 + }, + { + "epoch": 4.291584069201658, + "grad_norm": 0.28097325563430786, + "learning_rate": 5.96638975728856e-07, + "loss": 0.3627, + "step": 7938 + }, + { + "epoch": 4.292124707154442, + "grad_norm": 0.26080629229545593, + "learning_rate": 5.957449058873127e-07, + "loss": 0.3403, + "step": 7939 + }, + { + "epoch": 4.292665345107227, + "grad_norm": 0.29398655891418457, + "learning_rate": 5.948514639964748e-07, + "loss": 0.3888, + "step": 7940 + }, + { + "epoch": 4.293205983060011, + "grad_norm": 0.2571951746940613, + "learning_rate": 5.939586501837275e-07, + "loss": 0.3038, + "step": 7941 + }, + { + "epoch": 4.2937466210127955, + "grad_norm": 0.2912275195121765, + "learning_rate": 5.930664645763684e-07, + "loss": 0.3928, + "step": 7942 + }, + { + "epoch": 4.294287258965579, + "grad_norm": 0.2838453948497772, + "learning_rate": 5.92174907301602e-07, + "loss": 0.4175, + "step": 7943 + }, + { + "epoch": 4.294827896918363, + "grad_norm": 0.26570606231689453, + "learning_rate": 5.912839784865448e-07, + "loss": 0.3613, + "step": 7944 + }, + { + "epoch": 4.295368534871148, + "grad_norm": 0.25844183564186096, + "learning_rate": 5.903936782582253e-07, + "loss": 0.3463, + "step": 7945 + }, + { + "epoch": 4.295909172823932, + "grad_norm": 0.2600902020931244, + "learning_rate": 5.895040067435793e-07, + "loss": 0.342, + "step": 7946 + }, + { + "epoch": 4.296449810776717, + "grad_norm": 0.2728821635246277, + "learning_rate": 5.886149640694561e-07, + "loss": 0.3715, + "step": 7947 + }, + { + "epoch": 4.296990448729501, + "grad_norm": 0.2795557975769043, + "learning_rate": 5.877265503626129e-07, + "loss": 0.3523, + "step": 7948 + }, + { + "epoch": 4.297531086682286, + "grad_norm": 0.2914360463619232, + "learning_rate": 5.868387657497171e-07, + "loss": 0.3669, + "step": 7949 + }, + { + "epoch": 4.2980717246350695, + "grad_norm": 0.2712486684322357, + "learning_rate": 5.859516103573492e-07, + "loss": 0.3495, + "step": 7950 + }, + { + "epoch": 4.298612362587853, + "grad_norm": 0.26411083340644836, + "learning_rate": 5.850650843119971e-07, + "loss": 0.3447, + "step": 7951 + }, + { + "epoch": 4.299153000540638, + "grad_norm": 0.2834503948688507, + "learning_rate": 5.841791877400627e-07, + "loss": 0.3628, + "step": 7952 + }, + { + "epoch": 4.299693638493422, + "grad_norm": 0.2847490906715393, + "learning_rate": 5.832939207678507e-07, + "loss": 0.343, + "step": 7953 + }, + { + "epoch": 4.300234276446207, + "grad_norm": 0.2924007773399353, + "learning_rate": 5.82409283521585e-07, + "loss": 0.3697, + "step": 7954 + }, + { + "epoch": 4.300774914398991, + "grad_norm": 0.28452688455581665, + "learning_rate": 5.815252761273927e-07, + "loss": 0.4201, + "step": 7955 + }, + { + "epoch": 4.301315552351775, + "grad_norm": 0.2323017120361328, + "learning_rate": 5.806418987113161e-07, + "loss": 0.2984, + "step": 7956 + }, + { + "epoch": 4.30185619030456, + "grad_norm": 0.2853470742702484, + "learning_rate": 5.797591513993051e-07, + "loss": 0.3815, + "step": 7957 + }, + { + "epoch": 4.3023968282573435, + "grad_norm": 0.2698294222354889, + "learning_rate": 5.78877034317219e-07, + "loss": 0.3643, + "step": 7958 + }, + { + "epoch": 4.302937466210128, + "grad_norm": 0.25463762879371643, + "learning_rate": 5.7799554759083e-07, + "loss": 0.3744, + "step": 7959 + }, + { + "epoch": 4.303478104162912, + "grad_norm": 0.2504331171512604, + "learning_rate": 5.771146913458187e-07, + "loss": 0.3385, + "step": 7960 + }, + { + "epoch": 4.304018742115696, + "grad_norm": 0.2713220715522766, + "learning_rate": 5.76234465707774e-07, + "loss": 0.3943, + "step": 7961 + }, + { + "epoch": 4.304559380068481, + "grad_norm": 0.2801530957221985, + "learning_rate": 5.753548708022e-07, + "loss": 0.3833, + "step": 7962 + }, + { + "epoch": 4.305100018021265, + "grad_norm": 0.25731515884399414, + "learning_rate": 5.744759067545047e-07, + "loss": 0.329, + "step": 7963 + }, + { + "epoch": 4.30564065597405, + "grad_norm": 0.2828752100467682, + "learning_rate": 5.735975736900123e-07, + "loss": 0.3945, + "step": 7964 + }, + { + "epoch": 4.306181293926834, + "grad_norm": 0.24314965307712555, + "learning_rate": 5.727198717339511e-07, + "loss": 0.3121, + "step": 7965 + }, + { + "epoch": 4.306721931879618, + "grad_norm": 0.26475682854652405, + "learning_rate": 5.718428010114629e-07, + "loss": 0.3674, + "step": 7966 + }, + { + "epoch": 4.307262569832402, + "grad_norm": 0.2800459563732147, + "learning_rate": 5.709663616476002e-07, + "loss": 0.3533, + "step": 7967 + }, + { + "epoch": 4.307803207785186, + "grad_norm": 0.2647830843925476, + "learning_rate": 5.700905537673234e-07, + "loss": 0.3776, + "step": 7968 + }, + { + "epoch": 4.308343845737971, + "grad_norm": 0.2571731209754944, + "learning_rate": 5.69215377495504e-07, + "loss": 0.3346, + "step": 7969 + }, + { + "epoch": 4.308884483690755, + "grad_norm": 0.31464195251464844, + "learning_rate": 5.683408329569212e-07, + "loss": 0.41, + "step": 7970 + }, + { + "epoch": 4.30942512164354, + "grad_norm": 0.2723197638988495, + "learning_rate": 5.674669202762684e-07, + "loss": 0.343, + "step": 7971 + }, + { + "epoch": 4.309965759596324, + "grad_norm": 0.2688208818435669, + "learning_rate": 5.665936395781452e-07, + "loss": 0.3383, + "step": 7972 + }, + { + "epoch": 4.310506397549108, + "grad_norm": 0.26611441373825073, + "learning_rate": 5.657209909870621e-07, + "loss": 0.3676, + "step": 7973 + }, + { + "epoch": 4.311047035501892, + "grad_norm": 0.2646256387233734, + "learning_rate": 5.648489746274405e-07, + "loss": 0.3511, + "step": 7974 + }, + { + "epoch": 4.311587673454676, + "grad_norm": 0.2690008580684662, + "learning_rate": 5.6397759062361e-07, + "loss": 0.3677, + "step": 7975 + }, + { + "epoch": 4.312128311407461, + "grad_norm": 0.25940191745758057, + "learning_rate": 5.631068390998129e-07, + "loss": 0.3556, + "step": 7976 + }, + { + "epoch": 4.312668949360245, + "grad_norm": 0.2780297100543976, + "learning_rate": 5.622367201801976e-07, + "loss": 0.3874, + "step": 7977 + }, + { + "epoch": 4.313209587313029, + "grad_norm": 0.2555844187736511, + "learning_rate": 5.613672339888238e-07, + "loss": 0.3506, + "step": 7978 + }, + { + "epoch": 4.313750225265814, + "grad_norm": 0.2518936097621918, + "learning_rate": 5.604983806496633e-07, + "loss": 0.3615, + "step": 7979 + }, + { + "epoch": 4.314290863218598, + "grad_norm": 0.2545207142829895, + "learning_rate": 5.596301602865938e-07, + "loss": 0.3641, + "step": 7980 + }, + { + "epoch": 4.3148315011713825, + "grad_norm": 0.26856258511543274, + "learning_rate": 5.587625730234059e-07, + "loss": 0.3988, + "step": 7981 + }, + { + "epoch": 4.315372139124166, + "grad_norm": 0.26470619440078735, + "learning_rate": 5.578956189837964e-07, + "loss": 0.3908, + "step": 7982 + }, + { + "epoch": 4.315912777076951, + "grad_norm": 0.25664377212524414, + "learning_rate": 5.57029298291376e-07, + "loss": 0.3184, + "step": 7983 + }, + { + "epoch": 4.316453415029735, + "grad_norm": 0.2958470284938812, + "learning_rate": 5.561636110696634e-07, + "loss": 0.3678, + "step": 7984 + }, + { + "epoch": 4.316994052982519, + "grad_norm": 0.2789238393306732, + "learning_rate": 5.55298557442085e-07, + "loss": 0.3753, + "step": 7985 + }, + { + "epoch": 4.317534690935304, + "grad_norm": 0.2565482258796692, + "learning_rate": 5.544341375319801e-07, + "loss": 0.3489, + "step": 7986 + }, + { + "epoch": 4.318075328888088, + "grad_norm": 0.30428025126457214, + "learning_rate": 5.535703514625946e-07, + "loss": 0.3777, + "step": 7987 + }, + { + "epoch": 4.318615966840873, + "grad_norm": 0.28930214047431946, + "learning_rate": 5.527071993570876e-07, + "loss": 0.3816, + "step": 7988 + }, + { + "epoch": 4.3191566047936565, + "grad_norm": 0.2668907344341278, + "learning_rate": 5.518446813385248e-07, + "loss": 0.3654, + "step": 7989 + }, + { + "epoch": 4.31969724274644, + "grad_norm": 0.2620943486690521, + "learning_rate": 5.509827975298809e-07, + "loss": 0.3649, + "step": 7990 + }, + { + "epoch": 4.320237880699225, + "grad_norm": 0.2591368556022644, + "learning_rate": 5.501215480540445e-07, + "loss": 0.3546, + "step": 7991 + }, + { + "epoch": 4.320778518652009, + "grad_norm": 0.2721455693244934, + "learning_rate": 5.492609330338095e-07, + "loss": 0.4084, + "step": 7992 + }, + { + "epoch": 4.321319156604794, + "grad_norm": 0.24927014112472534, + "learning_rate": 5.48400952591881e-07, + "loss": 0.3344, + "step": 7993 + }, + { + "epoch": 4.321859794557578, + "grad_norm": 0.265815794467926, + "learning_rate": 5.475416068508721e-07, + "loss": 0.3558, + "step": 7994 + }, + { + "epoch": 4.322400432510362, + "grad_norm": 0.25444939732551575, + "learning_rate": 5.466828959333087e-07, + "loss": 0.329, + "step": 7995 + }, + { + "epoch": 4.322941070463147, + "grad_norm": 0.29459238052368164, + "learning_rate": 5.45824819961625e-07, + "loss": 0.397, + "step": 7996 + }, + { + "epoch": 4.3234817084159305, + "grad_norm": 0.2681017220020294, + "learning_rate": 5.449673790581611e-07, + "loss": 0.3831, + "step": 7997 + }, + { + "epoch": 4.324022346368715, + "grad_norm": 0.2532649338245392, + "learning_rate": 5.441105733451713e-07, + "loss": 0.3515, + "step": 7998 + }, + { + "epoch": 4.324562984321499, + "grad_norm": 0.2653908431529999, + "learning_rate": 5.432544029448162e-07, + "loss": 0.3999, + "step": 7999 + }, + { + "epoch": 4.325103622274284, + "grad_norm": 0.24886465072631836, + "learning_rate": 5.423988679791686e-07, + "loss": 0.3504, + "step": 8000 + }, + { + "epoch": 4.325644260227068, + "grad_norm": 0.24687694013118744, + "learning_rate": 5.415439685702085e-07, + "loss": 0.3281, + "step": 8001 + }, + { + "epoch": 4.326184898179852, + "grad_norm": 0.2668905556201935, + "learning_rate": 5.406897048398247e-07, + "loss": 0.3535, + "step": 8002 + }, + { + "epoch": 4.326725536132637, + "grad_norm": 0.2891635000705719, + "learning_rate": 5.398360769098182e-07, + "loss": 0.3809, + "step": 8003 + }, + { + "epoch": 4.327266174085421, + "grad_norm": 0.2738891541957855, + "learning_rate": 5.389830849018973e-07, + "loss": 0.3764, + "step": 8004 + }, + { + "epoch": 4.327806812038205, + "grad_norm": 0.27342963218688965, + "learning_rate": 5.381307289376786e-07, + "loss": 0.3664, + "step": 8005 + }, + { + "epoch": 4.328347449990989, + "grad_norm": 0.27071672677993774, + "learning_rate": 5.37279009138692e-07, + "loss": 0.3812, + "step": 8006 + }, + { + "epoch": 4.328888087943774, + "grad_norm": 0.256654292345047, + "learning_rate": 5.364279256263716e-07, + "loss": 0.3596, + "step": 8007 + }, + { + "epoch": 4.329428725896558, + "grad_norm": 0.2899309992790222, + "learning_rate": 5.355774785220669e-07, + "loss": 0.385, + "step": 8008 + }, + { + "epoch": 4.329969363849342, + "grad_norm": 0.2631729245185852, + "learning_rate": 5.347276679470281e-07, + "loss": 0.349, + "step": 8009 + }, + { + "epoch": 4.330510001802127, + "grad_norm": 0.26797574758529663, + "learning_rate": 5.338784940224239e-07, + "loss": 0.3836, + "step": 8010 + }, + { + "epoch": 4.331050639754911, + "grad_norm": 0.28088507056236267, + "learning_rate": 5.330299568693253e-07, + "loss": 0.3902, + "step": 8011 + }, + { + "epoch": 4.3315912777076955, + "grad_norm": 0.25558122992515564, + "learning_rate": 5.321820566087166e-07, + "loss": 0.3424, + "step": 8012 + }, + { + "epoch": 4.332131915660479, + "grad_norm": 0.26682865619659424, + "learning_rate": 5.313347933614915e-07, + "loss": 0.3561, + "step": 8013 + }, + { + "epoch": 4.332672553613263, + "grad_norm": 0.2649161219596863, + "learning_rate": 5.304881672484475e-07, + "loss": 0.3951, + "step": 8014 + }, + { + "epoch": 4.333213191566048, + "grad_norm": 0.2671195864677429, + "learning_rate": 5.296421783902972e-07, + "loss": 0.3678, + "step": 8015 + }, + { + "epoch": 4.333753829518832, + "grad_norm": 0.26547911763191223, + "learning_rate": 5.287968269076593e-07, + "loss": 0.3428, + "step": 8016 + }, + { + "epoch": 4.334294467471617, + "grad_norm": 0.25527921319007874, + "learning_rate": 5.27952112921064e-07, + "loss": 0.3552, + "step": 8017 + }, + { + "epoch": 4.334835105424401, + "grad_norm": 0.27284345030784607, + "learning_rate": 5.271080365509479e-07, + "loss": 0.3538, + "step": 8018 + }, + { + "epoch": 4.335375743377185, + "grad_norm": 0.2744116485118866, + "learning_rate": 5.262645979176572e-07, + "loss": 0.3986, + "step": 8019 + }, + { + "epoch": 4.3359163813299695, + "grad_norm": 0.260938823223114, + "learning_rate": 5.254217971414499e-07, + "loss": 0.3283, + "step": 8020 + }, + { + "epoch": 4.336457019282753, + "grad_norm": 0.2803042232990265, + "learning_rate": 5.245796343424897e-07, + "loss": 0.371, + "step": 8021 + }, + { + "epoch": 4.336997657235538, + "grad_norm": 0.2794663906097412, + "learning_rate": 5.237381096408512e-07, + "loss": 0.3735, + "step": 8022 + }, + { + "epoch": 4.337538295188322, + "grad_norm": 0.2702719569206238, + "learning_rate": 5.228972231565155e-07, + "loss": 0.3818, + "step": 8023 + }, + { + "epoch": 4.338078933141107, + "grad_norm": 0.2615105211734772, + "learning_rate": 5.220569750093763e-07, + "loss": 0.3623, + "step": 8024 + }, + { + "epoch": 4.338619571093891, + "grad_norm": 0.2775976061820984, + "learning_rate": 5.212173653192365e-07, + "loss": 0.3621, + "step": 8025 + }, + { + "epoch": 4.339160209046675, + "grad_norm": 0.2725214660167694, + "learning_rate": 5.203783942058021e-07, + "loss": 0.3462, + "step": 8026 + }, + { + "epoch": 4.33970084699946, + "grad_norm": 0.2889401316642761, + "learning_rate": 5.195400617886959e-07, + "loss": 0.3699, + "step": 8027 + }, + { + "epoch": 4.3402414849522435, + "grad_norm": 0.28472912311553955, + "learning_rate": 5.187023681874426e-07, + "loss": 0.3433, + "step": 8028 + }, + { + "epoch": 4.340782122905028, + "grad_norm": 0.2755737900733948, + "learning_rate": 5.178653135214811e-07, + "loss": 0.3711, + "step": 8029 + }, + { + "epoch": 4.341322760857812, + "grad_norm": 0.260037899017334, + "learning_rate": 5.170288979101573e-07, + "loss": 0.372, + "step": 8030 + }, + { + "epoch": 4.341863398810596, + "grad_norm": 0.25487974286079407, + "learning_rate": 5.16193121472724e-07, + "loss": 0.3784, + "step": 8031 + }, + { + "epoch": 4.342404036763381, + "grad_norm": 0.253598690032959, + "learning_rate": 5.153579843283463e-07, + "loss": 0.3414, + "step": 8032 + }, + { + "epoch": 4.342944674716165, + "grad_norm": 0.27968794107437134, + "learning_rate": 5.145234865960963e-07, + "loss": 0.3731, + "step": 8033 + }, + { + "epoch": 4.34348531266895, + "grad_norm": 0.2652742266654968, + "learning_rate": 5.136896283949544e-07, + "loss": 0.3334, + "step": 8034 + }, + { + "epoch": 4.344025950621734, + "grad_norm": 0.2636103630065918, + "learning_rate": 5.128564098438116e-07, + "loss": 0.3793, + "step": 8035 + }, + { + "epoch": 4.3445665885745175, + "grad_norm": 0.24500229954719543, + "learning_rate": 5.12023831061465e-07, + "loss": 0.3394, + "step": 8036 + }, + { + "epoch": 4.345107226527302, + "grad_norm": 0.28855642676353455, + "learning_rate": 5.111918921666254e-07, + "loss": 0.4037, + "step": 8037 + }, + { + "epoch": 4.345647864480086, + "grad_norm": 0.27344831824302673, + "learning_rate": 5.103605932779055e-07, + "loss": 0.3424, + "step": 8038 + }, + { + "epoch": 4.346188502432871, + "grad_norm": 0.2778145670890808, + "learning_rate": 5.095299345138327e-07, + "loss": 0.3836, + "step": 8039 + }, + { + "epoch": 4.346729140385655, + "grad_norm": 0.2843970060348511, + "learning_rate": 5.086999159928391e-07, + "loss": 0.3607, + "step": 8040 + }, + { + "epoch": 4.34726977833844, + "grad_norm": 0.25553035736083984, + "learning_rate": 5.078705378332693e-07, + "loss": 0.3075, + "step": 8041 + }, + { + "epoch": 4.347810416291224, + "grad_norm": 0.2807932496070862, + "learning_rate": 5.070418001533733e-07, + "loss": 0.3858, + "step": 8042 + }, + { + "epoch": 4.348351054244008, + "grad_norm": 0.2711091637611389, + "learning_rate": 5.062137030713105e-07, + "loss": 0.3428, + "step": 8043 + }, + { + "epoch": 4.348891692196792, + "grad_norm": 0.2752896547317505, + "learning_rate": 5.053862467051507e-07, + "loss": 0.3883, + "step": 8044 + }, + { + "epoch": 4.349432330149576, + "grad_norm": 0.25309717655181885, + "learning_rate": 5.045594311728708e-07, + "loss": 0.346, + "step": 8045 + }, + { + "epoch": 4.349972968102361, + "grad_norm": 0.2608318328857422, + "learning_rate": 5.037332565923558e-07, + "loss": 0.3685, + "step": 8046 + }, + { + "epoch": 4.350513606055145, + "grad_norm": 0.2767925262451172, + "learning_rate": 5.029077230814011e-07, + "loss": 0.3693, + "step": 8047 + }, + { + "epoch": 4.351054244007929, + "grad_norm": 0.2608930170536041, + "learning_rate": 5.020828307577091e-07, + "loss": 0.3341, + "step": 8048 + }, + { + "epoch": 4.351594881960714, + "grad_norm": 0.2874591648578644, + "learning_rate": 5.012585797388936e-07, + "loss": 0.3364, + "step": 8049 + }, + { + "epoch": 4.352135519913498, + "grad_norm": 0.2939295470714569, + "learning_rate": 5.00434970142471e-07, + "loss": 0.3745, + "step": 8050 + }, + { + "epoch": 4.3526761578662825, + "grad_norm": 0.2753792107105255, + "learning_rate": 4.996120020858725e-07, + "loss": 0.3452, + "step": 8051 + }, + { + "epoch": 4.353216795819066, + "grad_norm": 0.25311657786369324, + "learning_rate": 4.987896756864357e-07, + "loss": 0.3241, + "step": 8052 + }, + { + "epoch": 4.35375743377185, + "grad_norm": 0.2743209898471832, + "learning_rate": 4.97967991061406e-07, + "loss": 0.358, + "step": 8053 + }, + { + "epoch": 4.354298071724635, + "grad_norm": 0.2603088617324829, + "learning_rate": 4.971469483279373e-07, + "loss": 0.3443, + "step": 8054 + }, + { + "epoch": 4.354838709677419, + "grad_norm": 0.26635169982910156, + "learning_rate": 4.963265476030916e-07, + "loss": 0.355, + "step": 8055 + }, + { + "epoch": 4.355379347630204, + "grad_norm": 0.265351265668869, + "learning_rate": 4.955067890038417e-07, + "loss": 0.3395, + "step": 8056 + }, + { + "epoch": 4.355919985582988, + "grad_norm": 0.2658833861351013, + "learning_rate": 4.946876726470667e-07, + "loss": 0.3568, + "step": 8057 + }, + { + "epoch": 4.356460623535773, + "grad_norm": 0.2775501608848572, + "learning_rate": 4.938691986495542e-07, + "loss": 0.3966, + "step": 8058 + }, + { + "epoch": 4.3570012614885565, + "grad_norm": 0.2574467957019806, + "learning_rate": 4.930513671280018e-07, + "loss": 0.3336, + "step": 8059 + }, + { + "epoch": 4.35754189944134, + "grad_norm": 0.2852720320224762, + "learning_rate": 4.922341781990131e-07, + "loss": 0.3811, + "step": 8060 + }, + { + "epoch": 4.358082537394125, + "grad_norm": 0.2721101641654968, + "learning_rate": 4.914176319791037e-07, + "loss": 0.4074, + "step": 8061 + }, + { + "epoch": 4.358623175346909, + "grad_norm": 0.2626602351665497, + "learning_rate": 4.906017285846921e-07, + "loss": 0.3849, + "step": 8062 + }, + { + "epoch": 4.359163813299694, + "grad_norm": 0.2770666182041168, + "learning_rate": 4.897864681321101e-07, + "loss": 0.3448, + "step": 8063 + }, + { + "epoch": 4.359704451252478, + "grad_norm": 0.26337531208992004, + "learning_rate": 4.889718507375968e-07, + "loss": 0.3607, + "step": 8064 + }, + { + "epoch": 4.360245089205263, + "grad_norm": 0.2588321268558502, + "learning_rate": 4.881578765172979e-07, + "loss": 0.3656, + "step": 8065 + }, + { + "epoch": 4.360785727158047, + "grad_norm": 0.27049943804740906, + "learning_rate": 4.873445455872689e-07, + "loss": 0.3831, + "step": 8066 + }, + { + "epoch": 4.3613263651108305, + "grad_norm": 0.2564168870449066, + "learning_rate": 4.865318580634714e-07, + "loss": 0.3848, + "step": 8067 + }, + { + "epoch": 4.361867003063615, + "grad_norm": 0.2557253837585449, + "learning_rate": 4.857198140617786e-07, + "loss": 0.3286, + "step": 8068 + }, + { + "epoch": 4.362407641016399, + "grad_norm": 0.2757374048233032, + "learning_rate": 4.849084136979703e-07, + "loss": 0.3498, + "step": 8069 + }, + { + "epoch": 4.362948278969184, + "grad_norm": 0.2650987505912781, + "learning_rate": 4.840976570877332e-07, + "loss": 0.3872, + "step": 8070 + }, + { + "epoch": 4.363488916921968, + "grad_norm": 0.2564825415611267, + "learning_rate": 4.83287544346665e-07, + "loss": 0.3634, + "step": 8071 + }, + { + "epoch": 4.364029554874752, + "grad_norm": 0.2613122761249542, + "learning_rate": 4.824780755902686e-07, + "loss": 0.3629, + "step": 8072 + }, + { + "epoch": 4.364570192827537, + "grad_norm": 0.27393218874931335, + "learning_rate": 4.816692509339583e-07, + "loss": 0.4092, + "step": 8073 + }, + { + "epoch": 4.365110830780321, + "grad_norm": 0.25998541712760925, + "learning_rate": 4.808610704930539e-07, + "loss": 0.339, + "step": 8074 + }, + { + "epoch": 4.365651468733105, + "grad_norm": 0.2728840112686157, + "learning_rate": 4.800535343827834e-07, + "loss": 0.3727, + "step": 8075 + }, + { + "epoch": 4.366192106685889, + "grad_norm": 0.25305524468421936, + "learning_rate": 4.792466427182857e-07, + "loss": 0.3337, + "step": 8076 + }, + { + "epoch": 4.366732744638673, + "grad_norm": 0.27281883358955383, + "learning_rate": 4.784403956146039e-07, + "loss": 0.3623, + "step": 8077 + }, + { + "epoch": 4.367273382591458, + "grad_norm": 0.26225027441978455, + "learning_rate": 4.776347931866948e-07, + "loss": 0.3333, + "step": 8078 + }, + { + "epoch": 4.367814020544242, + "grad_norm": 0.25581321120262146, + "learning_rate": 4.7682983554941495e-07, + "loss": 0.3641, + "step": 8079 + }, + { + "epoch": 4.368354658497027, + "grad_norm": 0.2649233043193817, + "learning_rate": 4.7602552281753647e-07, + "loss": 0.3971, + "step": 8080 + }, + { + "epoch": 4.368895296449811, + "grad_norm": 0.2630901336669922, + "learning_rate": 4.752218551057369e-07, + "loss": 0.3278, + "step": 8081 + }, + { + "epoch": 4.3694359344025955, + "grad_norm": 0.2889975607395172, + "learning_rate": 4.7441883252860143e-07, + "loss": 0.3677, + "step": 8082 + }, + { + "epoch": 4.369976572355379, + "grad_norm": 0.2752499282360077, + "learning_rate": 4.736164552006239e-07, + "loss": 0.3918, + "step": 8083 + }, + { + "epoch": 4.370517210308163, + "grad_norm": 0.24661561846733093, + "learning_rate": 4.72814723236204e-07, + "loss": 0.3262, + "step": 8084 + }, + { + "epoch": 4.371057848260948, + "grad_norm": 0.2726038694381714, + "learning_rate": 4.720136367496536e-07, + "loss": 0.3611, + "step": 8085 + }, + { + "epoch": 4.371598486213732, + "grad_norm": 0.25609326362609863, + "learning_rate": 4.7121319585518907e-07, + "loss": 0.3769, + "step": 8086 + }, + { + "epoch": 4.372139124166517, + "grad_norm": 0.25862959027290344, + "learning_rate": 4.704134006669347e-07, + "loss": 0.3496, + "step": 8087 + }, + { + "epoch": 4.372679762119301, + "grad_norm": 0.25048795342445374, + "learning_rate": 4.6961425129892655e-07, + "loss": 0.3117, + "step": 8088 + }, + { + "epoch": 4.373220400072085, + "grad_norm": 0.2924381196498871, + "learning_rate": 4.688157478651029e-07, + "loss": 0.3875, + "step": 8089 + }, + { + "epoch": 4.3737610380248695, + "grad_norm": 0.26332539319992065, + "learning_rate": 4.6801789047931535e-07, + "loss": 0.368, + "step": 8090 + }, + { + "epoch": 4.374301675977653, + "grad_norm": 0.25561949610710144, + "learning_rate": 4.6722067925532024e-07, + "loss": 0.3496, + "step": 8091 + }, + { + "epoch": 4.374842313930438, + "grad_norm": 0.2899666130542755, + "learning_rate": 4.6642411430678105e-07, + "loss": 0.3957, + "step": 8092 + }, + { + "epoch": 4.375382951883222, + "grad_norm": 0.2681226134300232, + "learning_rate": 4.6562819574727304e-07, + "loss": 0.3366, + "step": 8093 + }, + { + "epoch": 4.375923589836006, + "grad_norm": 0.2914165258407593, + "learning_rate": 4.6483292369027487e-07, + "loss": 0.3676, + "step": 8094 + }, + { + "epoch": 4.376464227788791, + "grad_norm": 0.26949185132980347, + "learning_rate": 4.6403829824917643e-07, + "loss": 0.3305, + "step": 8095 + }, + { + "epoch": 4.377004865741575, + "grad_norm": 0.2690637409687042, + "learning_rate": 4.632443195372716e-07, + "loss": 0.3927, + "step": 8096 + }, + { + "epoch": 4.37754550369436, + "grad_norm": 0.2764582931995392, + "learning_rate": 4.624509876677674e-07, + "loss": 0.3597, + "step": 8097 + }, + { + "epoch": 4.3780861416471435, + "grad_norm": 0.2755921483039856, + "learning_rate": 4.616583027537741e-07, + "loss": 0.3582, + "step": 8098 + }, + { + "epoch": 4.378626779599928, + "grad_norm": 0.27690377831459045, + "learning_rate": 4.6086626490831067e-07, + "loss": 0.3525, + "step": 8099 + }, + { + "epoch": 4.379167417552712, + "grad_norm": 0.26605695486068726, + "learning_rate": 4.6007487424430565e-07, + "loss": 0.3481, + "step": 8100 + }, + { + "epoch": 4.379708055505496, + "grad_norm": 0.2655012905597687, + "learning_rate": 4.5928413087459325e-07, + "loss": 0.3713, + "step": 8101 + }, + { + "epoch": 4.380248693458281, + "grad_norm": 0.2613704204559326, + "learning_rate": 4.584940349119177e-07, + "loss": 0.349, + "step": 8102 + }, + { + "epoch": 4.380789331411065, + "grad_norm": 0.26111432909965515, + "learning_rate": 4.577045864689278e-07, + "loss": 0.3666, + "step": 8103 + }, + { + "epoch": 4.38132996936385, + "grad_norm": 0.30547577142715454, + "learning_rate": 4.569157856581818e-07, + "loss": 0.413, + "step": 8104 + }, + { + "epoch": 4.381870607316634, + "grad_norm": 0.2722497582435608, + "learning_rate": 4.5612763259214653e-07, + "loss": 0.3337, + "step": 8105 + }, + { + "epoch": 4.3824112452694175, + "grad_norm": 0.24953995645046234, + "learning_rate": 4.553401273831948e-07, + "loss": 0.36, + "step": 8106 + }, + { + "epoch": 4.382951883222202, + "grad_norm": 0.2641351521015167, + "learning_rate": 4.545532701436084e-07, + "loss": 0.34, + "step": 8107 + }, + { + "epoch": 4.383492521174986, + "grad_norm": 0.2744714319705963, + "learning_rate": 4.5376706098557376e-07, + "loss": 0.3453, + "step": 8108 + }, + { + "epoch": 4.384033159127771, + "grad_norm": 0.26822155714035034, + "learning_rate": 4.52981500021189e-07, + "loss": 0.3623, + "step": 8109 + }, + { + "epoch": 4.384573797080555, + "grad_norm": 0.27691131830215454, + "learning_rate": 4.52196587362459e-07, + "loss": 0.3635, + "step": 8110 + }, + { + "epoch": 4.38511443503334, + "grad_norm": 0.25168412923812866, + "learning_rate": 4.5141232312129247e-07, + "loss": 0.3559, + "step": 8111 + }, + { + "epoch": 4.385655072986124, + "grad_norm": 0.2736569046974182, + "learning_rate": 4.5062870740951113e-07, + "loss": 0.3815, + "step": 8112 + }, + { + "epoch": 4.386195710938908, + "grad_norm": 0.2856607139110565, + "learning_rate": 4.4984574033883846e-07, + "loss": 0.355, + "step": 8113 + }, + { + "epoch": 4.386736348891692, + "grad_norm": 0.2555479109287262, + "learning_rate": 4.490634220209117e-07, + "loss": 0.342, + "step": 8114 + }, + { + "epoch": 4.387276986844476, + "grad_norm": 0.26296502351760864, + "learning_rate": 4.4828175256727056e-07, + "loss": 0.3566, + "step": 8115 + }, + { + "epoch": 4.387817624797261, + "grad_norm": 0.25987017154693604, + "learning_rate": 4.4750073208936373e-07, + "loss": 0.3644, + "step": 8116 + }, + { + "epoch": 4.388358262750045, + "grad_norm": 0.28312379121780396, + "learning_rate": 4.4672036069854876e-07, + "loss": 0.3795, + "step": 8117 + }, + { + "epoch": 4.388898900702829, + "grad_norm": 0.2769124507904053, + "learning_rate": 4.459406385060894e-07, + "loss": 0.3512, + "step": 8118 + }, + { + "epoch": 4.389439538655614, + "grad_norm": 0.2513204514980316, + "learning_rate": 4.451615656231556e-07, + "loss": 0.3399, + "step": 8119 + }, + { + "epoch": 4.389980176608398, + "grad_norm": 0.28434517979621887, + "learning_rate": 4.4438314216082856e-07, + "loss": 0.3707, + "step": 8120 + }, + { + "epoch": 4.3905208145611825, + "grad_norm": 0.2877276539802551, + "learning_rate": 4.436053682300923e-07, + "loss": 0.3706, + "step": 8121 + }, + { + "epoch": 4.391061452513966, + "grad_norm": 0.2597291171550751, + "learning_rate": 4.4282824394184297e-07, + "loss": 0.341, + "step": 8122 + }, + { + "epoch": 4.391602090466751, + "grad_norm": 0.2784497141838074, + "learning_rate": 4.4205176940687823e-07, + "loss": 0.3641, + "step": 8123 + }, + { + "epoch": 4.392142728419535, + "grad_norm": 0.28118282556533813, + "learning_rate": 4.412759447359094e-07, + "loss": 0.3819, + "step": 8124 + }, + { + "epoch": 4.392683366372319, + "grad_norm": 0.2607361078262329, + "learning_rate": 4.405007700395497e-07, + "loss": 0.3425, + "step": 8125 + }, + { + "epoch": 4.393224004325104, + "grad_norm": 0.2748037576675415, + "learning_rate": 4.397262454283241e-07, + "loss": 0.4068, + "step": 8126 + }, + { + "epoch": 4.393764642277888, + "grad_norm": 0.2521860599517822, + "learning_rate": 4.3895237101266195e-07, + "loss": 0.3411, + "step": 8127 + }, + { + "epoch": 4.394305280230673, + "grad_norm": 0.2599768340587616, + "learning_rate": 4.3817914690290064e-07, + "loss": 0.3614, + "step": 8128 + }, + { + "epoch": 4.3948459181834565, + "grad_norm": 0.2581174969673157, + "learning_rate": 4.374065732092858e-07, + "loss": 0.3733, + "step": 8129 + }, + { + "epoch": 4.39538655613624, + "grad_norm": 0.24220918118953705, + "learning_rate": 4.3663465004196995e-07, + "loss": 0.3331, + "step": 8130 + }, + { + "epoch": 4.395927194089025, + "grad_norm": 0.26094678044319153, + "learning_rate": 4.358633775110105e-07, + "loss": 0.3831, + "step": 8131 + }, + { + "epoch": 4.396467832041809, + "grad_norm": 0.25512203574180603, + "learning_rate": 4.3509275572637623e-07, + "loss": 0.3708, + "step": 8132 + }, + { + "epoch": 4.397008469994594, + "grad_norm": 0.27119317650794983, + "learning_rate": 4.343227847979392e-07, + "loss": 0.3969, + "step": 8133 + }, + { + "epoch": 4.397549107947378, + "grad_norm": 0.26803910732269287, + "learning_rate": 4.335534648354833e-07, + "loss": 0.3357, + "step": 8134 + }, + { + "epoch": 4.398089745900162, + "grad_norm": 0.2746613621711731, + "learning_rate": 4.3278479594869307e-07, + "loss": 0.3577, + "step": 8135 + }, + { + "epoch": 4.398630383852947, + "grad_norm": 0.2638416588306427, + "learning_rate": 4.320167782471668e-07, + "loss": 0.3924, + "step": 8136 + }, + { + "epoch": 4.3991710218057305, + "grad_norm": 0.27367305755615234, + "learning_rate": 4.312494118404048e-07, + "loss": 0.3774, + "step": 8137 + }, + { + "epoch": 4.399711659758515, + "grad_norm": 0.2653174102306366, + "learning_rate": 4.3048269683781894e-07, + "loss": 0.3545, + "step": 8138 + }, + { + "epoch": 4.400252297711299, + "grad_norm": 0.25145334005355835, + "learning_rate": 4.297166333487257e-07, + "loss": 0.3486, + "step": 8139 + }, + { + "epoch": 4.400792935664084, + "grad_norm": 0.25891542434692383, + "learning_rate": 4.289512214823466e-07, + "loss": 0.3622, + "step": 8140 + }, + { + "epoch": 4.401333573616868, + "grad_norm": 0.23026682436466217, + "learning_rate": 4.281864613478159e-07, + "loss": 0.3058, + "step": 8141 + }, + { + "epoch": 4.401874211569652, + "grad_norm": 0.2723310589790344, + "learning_rate": 4.2742235305416936e-07, + "loss": 0.3836, + "step": 8142 + }, + { + "epoch": 4.402414849522437, + "grad_norm": 0.2739313542842865, + "learning_rate": 4.2665889671035407e-07, + "loss": 0.3952, + "step": 8143 + }, + { + "epoch": 4.402955487475221, + "grad_norm": 0.2878687381744385, + "learning_rate": 4.258960924252215e-07, + "loss": 0.3775, + "step": 8144 + }, + { + "epoch": 4.403496125428005, + "grad_norm": 0.2662462592124939, + "learning_rate": 4.251339403075294e-07, + "loss": 0.3382, + "step": 8145 + }, + { + "epoch": 4.404036763380789, + "grad_norm": 0.2829868793487549, + "learning_rate": 4.243724404659466e-07, + "loss": 0.371, + "step": 8146 + }, + { + "epoch": 4.404577401333573, + "grad_norm": 0.28387123346328735, + "learning_rate": 4.2361159300904454e-07, + "loss": 0.3885, + "step": 8147 + }, + { + "epoch": 4.405118039286358, + "grad_norm": 0.28340548276901245, + "learning_rate": 4.228513980453036e-07, + "loss": 0.3648, + "step": 8148 + }, + { + "epoch": 4.405658677239142, + "grad_norm": 0.2605474889278412, + "learning_rate": 4.2209185568311216e-07, + "loss": 0.3234, + "step": 8149 + }, + { + "epoch": 4.406199315191927, + "grad_norm": 0.2815400958061218, + "learning_rate": 4.21332966030763e-07, + "loss": 0.3625, + "step": 8150 + }, + { + "epoch": 4.406739953144711, + "grad_norm": 0.265581876039505, + "learning_rate": 4.2057472919645957e-07, + "loss": 0.3703, + "step": 8151 + }, + { + "epoch": 4.407280591097495, + "grad_norm": 0.26388537883758545, + "learning_rate": 4.1981714528830596e-07, + "loss": 0.3737, + "step": 8152 + }, + { + "epoch": 4.407821229050279, + "grad_norm": 0.2655022144317627, + "learning_rate": 4.1906021441432074e-07, + "loss": 0.3557, + "step": 8153 + }, + { + "epoch": 4.408361867003063, + "grad_norm": 0.26379454135894775, + "learning_rate": 4.1830393668242376e-07, + "loss": 0.3495, + "step": 8154 + }, + { + "epoch": 4.408902504955848, + "grad_norm": 0.2671830952167511, + "learning_rate": 4.175483122004448e-07, + "loss": 0.3895, + "step": 8155 + }, + { + "epoch": 4.409443142908632, + "grad_norm": 0.267318457365036, + "learning_rate": 4.167933410761188e-07, + "loss": 0.3803, + "step": 8156 + }, + { + "epoch": 4.409983780861417, + "grad_norm": 0.3021106421947479, + "learning_rate": 4.1603902341708804e-07, + "loss": 0.3878, + "step": 8157 + }, + { + "epoch": 4.410524418814201, + "grad_norm": 0.2709572911262512, + "learning_rate": 4.1528535933090253e-07, + "loss": 0.3403, + "step": 8158 + }, + { + "epoch": 4.411065056766985, + "grad_norm": 0.26195964217185974, + "learning_rate": 4.1453234892501804e-07, + "loss": 0.3541, + "step": 8159 + }, + { + "epoch": 4.4116056947197695, + "grad_norm": 0.27845486998558044, + "learning_rate": 4.1377999230679646e-07, + "loss": 0.3818, + "step": 8160 + }, + { + "epoch": 4.412146332672553, + "grad_norm": 0.2635180652141571, + "learning_rate": 4.130282895835086e-07, + "loss": 0.299, + "step": 8161 + }, + { + "epoch": 4.412686970625338, + "grad_norm": 0.2581978738307953, + "learning_rate": 4.1227724086233045e-07, + "loss": 0.3304, + "step": 8162 + }, + { + "epoch": 4.413227608578122, + "grad_norm": 0.2778031826019287, + "learning_rate": 4.1152684625034633e-07, + "loss": 0.3798, + "step": 8163 + }, + { + "epoch": 4.413768246530907, + "grad_norm": 0.25137409567832947, + "learning_rate": 4.1077710585454344e-07, + "loss": 0.3231, + "step": 8164 + }, + { + "epoch": 4.414308884483691, + "grad_norm": 0.2556723654270172, + "learning_rate": 4.100280197818207e-07, + "loss": 0.388, + "step": 8165 + }, + { + "epoch": 4.414849522436475, + "grad_norm": 0.2570623755455017, + "learning_rate": 4.092795881389805e-07, + "loss": 0.3387, + "step": 8166 + }, + { + "epoch": 4.41539016038926, + "grad_norm": 0.27952271699905396, + "learning_rate": 4.0853181103273356e-07, + "loss": 0.3874, + "step": 8167 + }, + { + "epoch": 4.4159307983420435, + "grad_norm": 0.26863715052604675, + "learning_rate": 4.0778468856969623e-07, + "loss": 0.3947, + "step": 8168 + }, + { + "epoch": 4.416471436294828, + "grad_norm": 0.2620304226875305, + "learning_rate": 4.0703822085639057e-07, + "loss": 0.3569, + "step": 8169 + }, + { + "epoch": 4.417012074247612, + "grad_norm": 0.2710581421852112, + "learning_rate": 4.062924079992492e-07, + "loss": 0.3922, + "step": 8170 + }, + { + "epoch": 4.417552712200396, + "grad_norm": 0.25852853059768677, + "learning_rate": 4.0554725010460704e-07, + "loss": 0.3544, + "step": 8171 + }, + { + "epoch": 4.418093350153181, + "grad_norm": 0.2708817720413208, + "learning_rate": 4.0480274727870696e-07, + "loss": 0.3567, + "step": 8172 + }, + { + "epoch": 4.418633988105965, + "grad_norm": 0.27214178442955017, + "learning_rate": 4.040588996277006e-07, + "loss": 0.3465, + "step": 8173 + }, + { + "epoch": 4.41917462605875, + "grad_norm": 0.28624093532562256, + "learning_rate": 4.0331570725764215e-07, + "loss": 0.3512, + "step": 8174 + }, + { + "epoch": 4.419715264011534, + "grad_norm": 0.2977980375289917, + "learning_rate": 4.025731702744978e-07, + "loss": 0.388, + "step": 8175 + }, + { + "epoch": 4.4202559019643175, + "grad_norm": 0.26138395071029663, + "learning_rate": 4.0183128878413356e-07, + "loss": 0.3167, + "step": 8176 + }, + { + "epoch": 4.420796539917102, + "grad_norm": 0.277291476726532, + "learning_rate": 4.0109006289232646e-07, + "loss": 0.3832, + "step": 8177 + }, + { + "epoch": 4.421337177869886, + "grad_norm": 0.24840909242630005, + "learning_rate": 4.003494927047613e-07, + "loss": 0.3348, + "step": 8178 + }, + { + "epoch": 4.421877815822671, + "grad_norm": 0.2633357644081116, + "learning_rate": 3.9960957832702594e-07, + "loss": 0.3663, + "step": 8179 + }, + { + "epoch": 4.422418453775455, + "grad_norm": 0.28666359186172485, + "learning_rate": 3.9887031986461546e-07, + "loss": 0.3985, + "step": 8180 + }, + { + "epoch": 4.42295909172824, + "grad_norm": 0.2765139639377594, + "learning_rate": 3.9813171742293156e-07, + "loss": 0.3887, + "step": 8181 + }, + { + "epoch": 4.423499729681024, + "grad_norm": 0.2654871642589569, + "learning_rate": 3.9739377110728504e-07, + "loss": 0.3288, + "step": 8182 + }, + { + "epoch": 4.424040367633808, + "grad_norm": 0.26870468258857727, + "learning_rate": 3.96656481022889e-07, + "loss": 0.3754, + "step": 8183 + }, + { + "epoch": 4.424581005586592, + "grad_norm": 0.2566099762916565, + "learning_rate": 3.959198472748649e-07, + "loss": 0.3629, + "step": 8184 + }, + { + "epoch": 4.425121643539376, + "grad_norm": 0.28181445598602295, + "learning_rate": 3.9518386996824196e-07, + "loss": 0.3706, + "step": 8185 + }, + { + "epoch": 4.425662281492161, + "grad_norm": 0.26720869541168213, + "learning_rate": 3.9444854920795307e-07, + "loss": 0.3453, + "step": 8186 + }, + { + "epoch": 4.426202919444945, + "grad_norm": 0.26364749670028687, + "learning_rate": 3.9371388509884033e-07, + "loss": 0.3735, + "step": 8187 + }, + { + "epoch": 4.426743557397729, + "grad_norm": 0.2517613470554352, + "learning_rate": 3.9297987774565003e-07, + "loss": 0.3587, + "step": 8188 + }, + { + "epoch": 4.427284195350514, + "grad_norm": 0.2620115578174591, + "learning_rate": 3.9224652725303514e-07, + "loss": 0.3773, + "step": 8189 + }, + { + "epoch": 4.427824833303298, + "grad_norm": 0.2648788094520569, + "learning_rate": 3.9151383372555696e-07, + "loss": 0.3507, + "step": 8190 + }, + { + "epoch": 4.4283654712560825, + "grad_norm": 0.2630835175514221, + "learning_rate": 3.9078179726768027e-07, + "loss": 0.3661, + "step": 8191 + }, + { + "epoch": 4.428906109208866, + "grad_norm": 0.26218724250793457, + "learning_rate": 3.9005041798377827e-07, + "loss": 0.3777, + "step": 8192 + }, + { + "epoch": 4.42944674716165, + "grad_norm": 0.265948086977005, + "learning_rate": 3.8931969597812813e-07, + "loss": 0.3542, + "step": 8193 + }, + { + "epoch": 4.429987385114435, + "grad_norm": 0.2752498984336853, + "learning_rate": 3.88589631354917e-07, + "loss": 0.3637, + "step": 8194 + }, + { + "epoch": 4.430528023067219, + "grad_norm": 0.2578594982624054, + "learning_rate": 3.8786022421823497e-07, + "loss": 0.3829, + "step": 8195 + }, + { + "epoch": 4.431068661020004, + "grad_norm": 0.25675004720687866, + "learning_rate": 3.8713147467207946e-07, + "loss": 0.3783, + "step": 8196 + }, + { + "epoch": 4.431609298972788, + "grad_norm": 0.28650611639022827, + "learning_rate": 3.8640338282035507e-07, + "loss": 0.3796, + "step": 8197 + }, + { + "epoch": 4.432149936925573, + "grad_norm": 0.2919948399066925, + "learning_rate": 3.85675948766871e-07, + "loss": 0.3345, + "step": 8198 + }, + { + "epoch": 4.4326905748783565, + "grad_norm": 0.3023063838481903, + "learning_rate": 3.8494917261534427e-07, + "loss": 0.3488, + "step": 8199 + }, + { + "epoch": 4.43323121283114, + "grad_norm": 0.2518389821052551, + "learning_rate": 3.84223054469397e-07, + "loss": 0.3614, + "step": 8200 + }, + { + "epoch": 4.433771850783925, + "grad_norm": 0.2588181793689728, + "learning_rate": 3.83497594432557e-07, + "loss": 0.3692, + "step": 8201 + }, + { + "epoch": 4.434312488736709, + "grad_norm": 0.25840917229652405, + "learning_rate": 3.827727926082603e-07, + "loss": 0.3659, + "step": 8202 + }, + { + "epoch": 4.434853126689494, + "grad_norm": 0.25818681716918945, + "learning_rate": 3.8204864909984764e-07, + "loss": 0.3509, + "step": 8203 + }, + { + "epoch": 4.435393764642278, + "grad_norm": 0.2708577513694763, + "learning_rate": 3.813251640105653e-07, + "loss": 0.3754, + "step": 8204 + }, + { + "epoch": 4.435934402595062, + "grad_norm": 0.2632346451282501, + "learning_rate": 3.8060233744356634e-07, + "loss": 0.3706, + "step": 8205 + }, + { + "epoch": 4.436475040547847, + "grad_norm": 0.27098697423934937, + "learning_rate": 3.7988016950191055e-07, + "loss": 0.3473, + "step": 8206 + }, + { + "epoch": 4.4370156785006305, + "grad_norm": 0.2811775803565979, + "learning_rate": 3.791586602885644e-07, + "loss": 0.3416, + "step": 8207 + }, + { + "epoch": 4.437556316453415, + "grad_norm": 0.2631094753742218, + "learning_rate": 3.7843780990639787e-07, + "loss": 0.3689, + "step": 8208 + }, + { + "epoch": 4.438096954406199, + "grad_norm": 0.2584248483181, + "learning_rate": 3.777176184581893e-07, + "loss": 0.3532, + "step": 8209 + }, + { + "epoch": 4.438637592358983, + "grad_norm": 0.2771499454975128, + "learning_rate": 3.76998086046621e-07, + "loss": 0.3666, + "step": 8210 + }, + { + "epoch": 4.439178230311768, + "grad_norm": 0.281101793050766, + "learning_rate": 3.762792127742848e-07, + "loss": 0.418, + "step": 8211 + }, + { + "epoch": 4.439718868264552, + "grad_norm": 0.2672911286354065, + "learning_rate": 3.755609987436748e-07, + "loss": 0.3356, + "step": 8212 + }, + { + "epoch": 4.440259506217337, + "grad_norm": 0.2672131359577179, + "learning_rate": 3.7484344405719186e-07, + "loss": 0.3506, + "step": 8213 + }, + { + "epoch": 4.440800144170121, + "grad_norm": 0.2638314664363861, + "learning_rate": 3.741265488171458e-07, + "loss": 0.3783, + "step": 8214 + }, + { + "epoch": 4.441340782122905, + "grad_norm": 0.26125532388687134, + "learning_rate": 3.7341031312574827e-07, + "loss": 0.365, + "step": 8215 + }, + { + "epoch": 4.441881420075689, + "grad_norm": 0.25414150953292847, + "learning_rate": 3.7269473708512084e-07, + "loss": 0.3544, + "step": 8216 + }, + { + "epoch": 4.442422058028473, + "grad_norm": 0.2596067190170288, + "learning_rate": 3.7197982079728745e-07, + "loss": 0.3733, + "step": 8217 + }, + { + "epoch": 4.442962695981258, + "grad_norm": 0.2612897455692291, + "learning_rate": 3.7126556436417993e-07, + "loss": 0.3468, + "step": 8218 + }, + { + "epoch": 4.443503333934042, + "grad_norm": 0.26086223125457764, + "learning_rate": 3.7055196788763625e-07, + "loss": 0.3606, + "step": 8219 + }, + { + "epoch": 4.444043971886827, + "grad_norm": 0.26433905959129333, + "learning_rate": 3.6983903146939894e-07, + "loss": 0.3635, + "step": 8220 + }, + { + "epoch": 4.444584609839611, + "grad_norm": 0.29702532291412354, + "learning_rate": 3.691267552111183e-07, + "loss": 0.3817, + "step": 8221 + }, + { + "epoch": 4.4451252477923955, + "grad_norm": 0.2539384961128235, + "learning_rate": 3.6841513921434704e-07, + "loss": 0.3267, + "step": 8222 + }, + { + "epoch": 4.445665885745179, + "grad_norm": 0.2569662630558014, + "learning_rate": 3.6770418358054894e-07, + "loss": 0.3415, + "step": 8223 + }, + { + "epoch": 4.446206523697963, + "grad_norm": 0.26443180441856384, + "learning_rate": 3.6699388841108907e-07, + "loss": 0.3618, + "step": 8224 + }, + { + "epoch": 4.446747161650748, + "grad_norm": 0.26646578311920166, + "learning_rate": 3.6628425380723975e-07, + "loss": 0.3675, + "step": 8225 + }, + { + "epoch": 4.447287799603532, + "grad_norm": 0.2674654424190521, + "learning_rate": 3.6557527987018114e-07, + "loss": 0.3586, + "step": 8226 + }, + { + "epoch": 4.447828437556317, + "grad_norm": 0.271610289812088, + "learning_rate": 3.648669667009952e-07, + "loss": 0.3737, + "step": 8227 + }, + { + "epoch": 4.448369075509101, + "grad_norm": 0.2587081789970398, + "learning_rate": 3.6415931440067443e-07, + "loss": 0.3455, + "step": 8228 + }, + { + "epoch": 4.448909713461885, + "grad_norm": 0.2835042476654053, + "learning_rate": 3.6345232307011257e-07, + "loss": 0.4038, + "step": 8229 + }, + { + "epoch": 4.4494503514146695, + "grad_norm": 0.26958781480789185, + "learning_rate": 3.627459928101118e-07, + "loss": 0.3582, + "step": 8230 + }, + { + "epoch": 4.449990989367453, + "grad_norm": 0.2784334123134613, + "learning_rate": 3.620403237213799e-07, + "loss": 0.3644, + "step": 8231 + }, + { + "epoch": 4.450531627320238, + "grad_norm": 0.26102492213249207, + "learning_rate": 3.6133531590452963e-07, + "loss": 0.3449, + "step": 8232 + }, + { + "epoch": 4.451072265273022, + "grad_norm": 0.27513423562049866, + "learning_rate": 3.606309694600796e-07, + "loss": 0.3646, + "step": 8233 + }, + { + "epoch": 4.451612903225806, + "grad_norm": 0.27059316635131836, + "learning_rate": 3.5992728448845326e-07, + "loss": 0.3599, + "step": 8234 + }, + { + "epoch": 4.452153541178591, + "grad_norm": 0.27595481276512146, + "learning_rate": 3.5922426108998154e-07, + "loss": 0.362, + "step": 8235 + }, + { + "epoch": 4.452694179131375, + "grad_norm": 0.28539323806762695, + "learning_rate": 3.5852189936490255e-07, + "loss": 0.367, + "step": 8236 + }, + { + "epoch": 4.45323481708416, + "grad_norm": 0.2783656418323517, + "learning_rate": 3.5782019941335345e-07, + "loss": 0.3888, + "step": 8237 + }, + { + "epoch": 4.4537754550369435, + "grad_norm": 0.2655050754547119, + "learning_rate": 3.571191613353847e-07, + "loss": 0.3494, + "step": 8238 + }, + { + "epoch": 4.454316092989728, + "grad_norm": 0.26864656805992126, + "learning_rate": 3.5641878523094697e-07, + "loss": 0.3511, + "step": 8239 + }, + { + "epoch": 4.454856730942512, + "grad_norm": 0.2563180923461914, + "learning_rate": 3.5571907119990033e-07, + "loss": 0.3507, + "step": 8240 + }, + { + "epoch": 4.455397368895296, + "grad_norm": 0.26620420813560486, + "learning_rate": 3.550200193420078e-07, + "loss": 0.3501, + "step": 8241 + }, + { + "epoch": 4.455938006848081, + "grad_norm": 0.2963588535785675, + "learning_rate": 3.543216297569385e-07, + "loss": 0.3868, + "step": 8242 + }, + { + "epoch": 4.456478644800865, + "grad_norm": 0.2520950138568878, + "learning_rate": 3.5362390254426836e-07, + "loss": 0.367, + "step": 8243 + }, + { + "epoch": 4.45701928275365, + "grad_norm": 0.24793481826782227, + "learning_rate": 3.5292683780347834e-07, + "loss": 0.3411, + "step": 8244 + }, + { + "epoch": 4.457559920706434, + "grad_norm": 0.280124306678772, + "learning_rate": 3.522304356339529e-07, + "loss": 0.3872, + "step": 8245 + }, + { + "epoch": 4.4581005586592175, + "grad_norm": 0.2688871920108795, + "learning_rate": 3.5153469613498583e-07, + "loss": 0.3764, + "step": 8246 + }, + { + "epoch": 4.458641196612002, + "grad_norm": 0.26355719566345215, + "learning_rate": 3.508396194057728e-07, + "loss": 0.3501, + "step": 8247 + }, + { + "epoch": 4.459181834564786, + "grad_norm": 0.27029556035995483, + "learning_rate": 3.501452055454191e-07, + "loss": 0.3733, + "step": 8248 + }, + { + "epoch": 4.459722472517571, + "grad_norm": 0.26230961084365845, + "learning_rate": 3.4945145465292987e-07, + "loss": 0.3632, + "step": 8249 + }, + { + "epoch": 4.460263110470355, + "grad_norm": 0.2691025137901306, + "learning_rate": 3.4875836682722096e-07, + "loss": 0.3772, + "step": 8250 + }, + { + "epoch": 4.460803748423139, + "grad_norm": 0.26911598443984985, + "learning_rate": 3.4806594216710956e-07, + "loss": 0.3758, + "step": 8251 + }, + { + "epoch": 4.461344386375924, + "grad_norm": 0.2633644938468933, + "learning_rate": 3.473741807713232e-07, + "loss": 0.3691, + "step": 8252 + }, + { + "epoch": 4.461885024328708, + "grad_norm": 0.25110191106796265, + "learning_rate": 3.4668308273848985e-07, + "loss": 0.3279, + "step": 8253 + }, + { + "epoch": 4.462425662281492, + "grad_norm": 0.2690924108028412, + "learning_rate": 3.4599264816714497e-07, + "loss": 0.3688, + "step": 8254 + }, + { + "epoch": 4.462966300234276, + "grad_norm": 0.2681609094142914, + "learning_rate": 3.45302877155731e-07, + "loss": 0.3894, + "step": 8255 + }, + { + "epoch": 4.463506938187061, + "grad_norm": 0.25030145049095154, + "learning_rate": 3.4461376980259307e-07, + "loss": 0.3546, + "step": 8256 + }, + { + "epoch": 4.464047576139845, + "grad_norm": 0.261446475982666, + "learning_rate": 3.439253262059822e-07, + "loss": 0.389, + "step": 8257 + }, + { + "epoch": 4.464588214092629, + "grad_norm": 0.2674567401409149, + "learning_rate": 3.4323754646405747e-07, + "loss": 0.3604, + "step": 8258 + }, + { + "epoch": 4.465128852045414, + "grad_norm": 0.2563493251800537, + "learning_rate": 3.4255043067487893e-07, + "loss": 0.338, + "step": 8259 + }, + { + "epoch": 4.465669489998198, + "grad_norm": 0.27313074469566345, + "learning_rate": 3.418639789364175e-07, + "loss": 0.3599, + "step": 8260 + }, + { + "epoch": 4.4662101279509825, + "grad_norm": 0.26698416471481323, + "learning_rate": 3.411781913465423e-07, + "loss": 0.3919, + "step": 8261 + }, + { + "epoch": 4.466750765903766, + "grad_norm": 0.2576042115688324, + "learning_rate": 3.404930680030344e-07, + "loss": 0.369, + "step": 8262 + }, + { + "epoch": 4.46729140385655, + "grad_norm": 0.2721457779407501, + "learning_rate": 3.398086090035757e-07, + "loss": 0.3669, + "step": 8263 + }, + { + "epoch": 4.467832041809335, + "grad_norm": 0.265586256980896, + "learning_rate": 3.3912481444575763e-07, + "loss": 0.3697, + "step": 8264 + }, + { + "epoch": 4.468372679762119, + "grad_norm": 0.2615722417831421, + "learning_rate": 3.3844168442707213e-07, + "loss": 0.3671, + "step": 8265 + }, + { + "epoch": 4.468913317714904, + "grad_norm": 0.26328277587890625, + "learning_rate": 3.377592190449186e-07, + "loss": 0.3512, + "step": 8266 + }, + { + "epoch": 4.469453955667688, + "grad_norm": 0.31026870012283325, + "learning_rate": 3.370774183966036e-07, + "loss": 0.3721, + "step": 8267 + }, + { + "epoch": 4.469994593620472, + "grad_norm": 0.26836591958999634, + "learning_rate": 3.363962825793354e-07, + "loss": 0.3935, + "step": 8268 + }, + { + "epoch": 4.4705352315732565, + "grad_norm": 0.25650352239608765, + "learning_rate": 3.357158116902287e-07, + "loss": 0.3358, + "step": 8269 + }, + { + "epoch": 4.47107586952604, + "grad_norm": 0.2633151113986969, + "learning_rate": 3.350360058263058e-07, + "loss": 0.3537, + "step": 8270 + }, + { + "epoch": 4.471616507478825, + "grad_norm": 0.2551864683628082, + "learning_rate": 3.3435686508449026e-07, + "loss": 0.3431, + "step": 8271 + }, + { + "epoch": 4.472157145431609, + "grad_norm": 0.2526355981826782, + "learning_rate": 3.336783895616147e-07, + "loss": 0.3347, + "step": 8272 + }, + { + "epoch": 4.472697783384394, + "grad_norm": 0.2782466411590576, + "learning_rate": 3.330005793544133e-07, + "loss": 0.3964, + "step": 8273 + }, + { + "epoch": 4.473238421337178, + "grad_norm": 0.27180591225624084, + "learning_rate": 3.3232343455952664e-07, + "loss": 0.3659, + "step": 8274 + }, + { + "epoch": 4.473779059289962, + "grad_norm": 0.2725878059864044, + "learning_rate": 3.3164695527350244e-07, + "loss": 0.3364, + "step": 8275 + }, + { + "epoch": 4.474319697242747, + "grad_norm": 0.2825901508331299, + "learning_rate": 3.309711415927908e-07, + "loss": 0.3589, + "step": 8276 + }, + { + "epoch": 4.4748603351955305, + "grad_norm": 0.26633691787719727, + "learning_rate": 3.3029599361374955e-07, + "loss": 0.3908, + "step": 8277 + }, + { + "epoch": 4.475400973148315, + "grad_norm": 0.34270578622817993, + "learning_rate": 3.296215114326368e-07, + "loss": 0.3681, + "step": 8278 + }, + { + "epoch": 4.475941611101099, + "grad_norm": 0.2730425298213959, + "learning_rate": 3.289476951456222e-07, + "loss": 0.3264, + "step": 8279 + }, + { + "epoch": 4.476482249053884, + "grad_norm": 0.2668716311454773, + "learning_rate": 3.2827454484877564e-07, + "loss": 0.3531, + "step": 8280 + }, + { + "epoch": 4.477022887006668, + "grad_norm": 0.2741301655769348, + "learning_rate": 3.276020606380742e-07, + "loss": 0.3679, + "step": 8281 + }, + { + "epoch": 4.477563524959452, + "grad_norm": 0.2575318217277527, + "learning_rate": 3.269302426094001e-07, + "loss": 0.3535, + "step": 8282 + }, + { + "epoch": 4.478104162912237, + "grad_norm": 0.2573346197605133, + "learning_rate": 3.262590908585378e-07, + "loss": 0.3599, + "step": 8283 + }, + { + "epoch": 4.478644800865021, + "grad_norm": 0.27236178517341614, + "learning_rate": 3.255886054811813e-07, + "loss": 0.395, + "step": 8284 + }, + { + "epoch": 4.479185438817805, + "grad_norm": 0.2652297019958496, + "learning_rate": 3.2491878657292643e-07, + "loss": 0.3193, + "step": 8285 + }, + { + "epoch": 4.479726076770589, + "grad_norm": 0.2728855609893799, + "learning_rate": 3.2424963422927335e-07, + "loss": 0.3745, + "step": 8286 + }, + { + "epoch": 4.480266714723373, + "grad_norm": 0.26942577958106995, + "learning_rate": 3.2358114854563086e-07, + "loss": 0.3725, + "step": 8287 + }, + { + "epoch": 4.480807352676158, + "grad_norm": 0.2845991849899292, + "learning_rate": 3.2291332961730817e-07, + "loss": 0.3882, + "step": 8288 + }, + { + "epoch": 4.481347990628942, + "grad_norm": 0.27013665437698364, + "learning_rate": 3.222461775395247e-07, + "loss": 0.3673, + "step": 8289 + }, + { + "epoch": 4.481888628581727, + "grad_norm": 0.2531822919845581, + "learning_rate": 3.215796924073983e-07, + "loss": 0.3679, + "step": 8290 + }, + { + "epoch": 4.482429266534511, + "grad_norm": 0.2829582989215851, + "learning_rate": 3.209138743159573e-07, + "loss": 0.3566, + "step": 8291 + }, + { + "epoch": 4.482969904487295, + "grad_norm": 0.29151779413223267, + "learning_rate": 3.2024872336013204e-07, + "loss": 0.3817, + "step": 8292 + }, + { + "epoch": 4.483510542440079, + "grad_norm": 0.2873472273349762, + "learning_rate": 3.195842396347598e-07, + "loss": 0.3305, + "step": 8293 + }, + { + "epoch": 4.484051180392863, + "grad_norm": 0.24851979315280914, + "learning_rate": 3.1892042323457995e-07, + "loss": 0.3635, + "step": 8294 + }, + { + "epoch": 4.484591818345648, + "grad_norm": 0.2865833640098572, + "learning_rate": 3.1825727425423837e-07, + "loss": 0.3742, + "step": 8295 + }, + { + "epoch": 4.485132456298432, + "grad_norm": 0.2495800107717514, + "learning_rate": 3.1759479278828665e-07, + "loss": 0.3423, + "step": 8296 + }, + { + "epoch": 4.485673094251217, + "grad_norm": 0.29070883989334106, + "learning_rate": 3.169329789311798e-07, + "loss": 0.4068, + "step": 8297 + }, + { + "epoch": 4.486213732204001, + "grad_norm": 0.24659143388271332, + "learning_rate": 3.1627183277727734e-07, + "loss": 0.3373, + "step": 8298 + }, + { + "epoch": 4.486754370156785, + "grad_norm": 0.28675511479377747, + "learning_rate": 3.1561135442084556e-07, + "loss": 0.3762, + "step": 8299 + }, + { + "epoch": 4.4872950081095695, + "grad_norm": 0.271820604801178, + "learning_rate": 3.149515439560524e-07, + "loss": 0.3444, + "step": 8300 + }, + { + "epoch": 4.487835646062353, + "grad_norm": 0.25091075897216797, + "learning_rate": 3.142924014769755e-07, + "loss": 0.3268, + "step": 8301 + }, + { + "epoch": 4.488376284015138, + "grad_norm": 0.2675650715827942, + "learning_rate": 3.136339270775901e-07, + "loss": 0.3969, + "step": 8302 + }, + { + "epoch": 4.488916921967922, + "grad_norm": 0.2635056674480438, + "learning_rate": 3.1297612085178284e-07, + "loss": 0.3673, + "step": 8303 + }, + { + "epoch": 4.489457559920706, + "grad_norm": 0.25954359769821167, + "learning_rate": 3.123189828933432e-07, + "loss": 0.3568, + "step": 8304 + }, + { + "epoch": 4.489998197873491, + "grad_norm": 0.26797178387641907, + "learning_rate": 3.116625132959633e-07, + "loss": 0.384, + "step": 8305 + }, + { + "epoch": 4.490538835826275, + "grad_norm": 0.2877870202064514, + "learning_rate": 3.110067121532417e-07, + "loss": 0.3672, + "step": 8306 + }, + { + "epoch": 4.49107947377906, + "grad_norm": 0.2584758400917053, + "learning_rate": 3.103515795586809e-07, + "loss": 0.361, + "step": 8307 + }, + { + "epoch": 4.4916201117318435, + "grad_norm": 0.27120763063430786, + "learning_rate": 3.0969711560568996e-07, + "loss": 0.3548, + "step": 8308 + }, + { + "epoch": 4.492160749684627, + "grad_norm": 0.281546026468277, + "learning_rate": 3.0904332038757977e-07, + "loss": 0.3869, + "step": 8309 + }, + { + "epoch": 4.492701387637412, + "grad_norm": 0.2806074619293213, + "learning_rate": 3.083901939975675e-07, + "loss": 0.361, + "step": 8310 + }, + { + "epoch": 4.493242025590196, + "grad_norm": 0.26624199748039246, + "learning_rate": 3.0773773652877537e-07, + "loss": 0.3682, + "step": 8311 + }, + { + "epoch": 4.493782663542981, + "grad_norm": 0.24980990588665009, + "learning_rate": 3.070859480742283e-07, + "loss": 0.3591, + "step": 8312 + }, + { + "epoch": 4.494323301495765, + "grad_norm": 0.28086620569229126, + "learning_rate": 3.064348287268587e-07, + "loss": 0.3963, + "step": 8313 + }, + { + "epoch": 4.49486393944855, + "grad_norm": 0.2712896466255188, + "learning_rate": 3.0578437857950117e-07, + "loss": 0.3577, + "step": 8314 + }, + { + "epoch": 4.495404577401334, + "grad_norm": 0.24350707232952118, + "learning_rate": 3.051345977248954e-07, + "loss": 0.3017, + "step": 8315 + }, + { + "epoch": 4.4959452153541175, + "grad_norm": 0.26775676012039185, + "learning_rate": 3.044854862556867e-07, + "loss": 0.3622, + "step": 8316 + }, + { + "epoch": 4.496485853306902, + "grad_norm": 0.29231777787208557, + "learning_rate": 3.0383704426442396e-07, + "loss": 0.3987, + "step": 8317 + }, + { + "epoch": 4.497026491259686, + "grad_norm": 0.26771900057792664, + "learning_rate": 3.0318927184356086e-07, + "loss": 0.3696, + "step": 8318 + }, + { + "epoch": 4.497567129212471, + "grad_norm": 0.2685577869415283, + "learning_rate": 3.025421690854552e-07, + "loss": 0.3372, + "step": 8319 + }, + { + "epoch": 4.498107767165255, + "grad_norm": 0.2681006193161011, + "learning_rate": 3.018957360823699e-07, + "loss": 0.3474, + "step": 8320 + }, + { + "epoch": 4.498648405118039, + "grad_norm": 0.26560941338539124, + "learning_rate": 3.0124997292647286e-07, + "loss": 0.3773, + "step": 8321 + }, + { + "epoch": 4.499189043070824, + "grad_norm": 0.271924763917923, + "learning_rate": 3.006048797098349e-07, + "loss": 0.376, + "step": 8322 + }, + { + "epoch": 4.499729681023608, + "grad_norm": 0.2682948410511017, + "learning_rate": 2.9996045652443294e-07, + "loss": 0.3559, + "step": 8323 + }, + { + "epoch": 4.500270318976392, + "grad_norm": 0.2738342583179474, + "learning_rate": 2.993167034621464e-07, + "loss": 0.3634, + "step": 8324 + }, + { + "epoch": 4.500810956929176, + "grad_norm": 0.24666568636894226, + "learning_rate": 2.986736206147628e-07, + "loss": 0.3307, + "step": 8325 + }, + { + "epoch": 4.50135159488196, + "grad_norm": 0.26715728640556335, + "learning_rate": 2.9803120807397003e-07, + "loss": 0.3704, + "step": 8326 + }, + { + "epoch": 4.501892232834745, + "grad_norm": 0.25745439529418945, + "learning_rate": 2.9738946593136144e-07, + "loss": 0.3563, + "step": 8327 + }, + { + "epoch": 4.502432870787529, + "grad_norm": 0.27403581142425537, + "learning_rate": 2.9674839427843715e-07, + "loss": 0.3414, + "step": 8328 + }, + { + "epoch": 4.502973508740314, + "grad_norm": 0.28739896416664124, + "learning_rate": 2.9610799320659964e-07, + "loss": 0.3801, + "step": 8329 + }, + { + "epoch": 4.503514146693098, + "grad_norm": 0.27232155203819275, + "learning_rate": 2.9546826280715536e-07, + "loss": 0.3586, + "step": 8330 + }, + { + "epoch": 4.5040547846458825, + "grad_norm": 0.28253716230392456, + "learning_rate": 2.948292031713157e-07, + "loss": 0.3742, + "step": 8331 + }, + { + "epoch": 4.504595422598666, + "grad_norm": 0.2904265522956848, + "learning_rate": 2.9419081439019727e-07, + "loss": 0.3555, + "step": 8332 + }, + { + "epoch": 4.50513606055145, + "grad_norm": 0.2432306855916977, + "learning_rate": 2.9355309655482224e-07, + "loss": 0.3587, + "step": 8333 + }, + { + "epoch": 4.505676698504235, + "grad_norm": 0.27025797963142395, + "learning_rate": 2.9291604975611123e-07, + "loss": 0.3758, + "step": 8334 + }, + { + "epoch": 4.506217336457019, + "grad_norm": 0.2538128197193146, + "learning_rate": 2.9227967408489653e-07, + "loss": 0.3542, + "step": 8335 + }, + { + "epoch": 4.506757974409804, + "grad_norm": 0.2916772663593292, + "learning_rate": 2.9164396963190954e-07, + "loss": 0.4006, + "step": 8336 + }, + { + "epoch": 4.507298612362588, + "grad_norm": 0.26848316192626953, + "learning_rate": 2.910089364877888e-07, + "loss": 0.3406, + "step": 8337 + }, + { + "epoch": 4.507839250315373, + "grad_norm": 0.2920156419277191, + "learning_rate": 2.903745747430764e-07, + "loss": 0.4109, + "step": 8338 + }, + { + "epoch": 4.5083798882681565, + "grad_norm": 0.26490744948387146, + "learning_rate": 2.897408844882171e-07, + "loss": 0.3825, + "step": 8339 + }, + { + "epoch": 4.50892052622094, + "grad_norm": 0.2634319067001343, + "learning_rate": 2.891078658135632e-07, + "loss": 0.3604, + "step": 8340 + }, + { + "epoch": 4.509461164173725, + "grad_norm": 0.25874748826026917, + "learning_rate": 2.88475518809368e-07, + "loss": 0.3843, + "step": 8341 + }, + { + "epoch": 4.510001802126509, + "grad_norm": 0.2628515958786011, + "learning_rate": 2.87843843565791e-07, + "loss": 0.3653, + "step": 8342 + }, + { + "epoch": 4.510542440079294, + "grad_norm": 0.25855329632759094, + "learning_rate": 2.8721284017289517e-07, + "loss": 0.3514, + "step": 8343 + }, + { + "epoch": 4.511083078032078, + "grad_norm": 0.27656733989715576, + "learning_rate": 2.8658250872064696e-07, + "loss": 0.3682, + "step": 8344 + }, + { + "epoch": 4.511623715984862, + "grad_norm": 0.27417272329330444, + "learning_rate": 2.859528492989194e-07, + "loss": 0.3484, + "step": 8345 + }, + { + "epoch": 4.512164353937647, + "grad_norm": 0.2899097502231598, + "learning_rate": 2.853238619974874e-07, + "loss": 0.3567, + "step": 8346 + }, + { + "epoch": 4.5127049918904305, + "grad_norm": 0.2767457365989685, + "learning_rate": 2.8469554690603143e-07, + "loss": 0.3438, + "step": 8347 + }, + { + "epoch": 4.513245629843215, + "grad_norm": 0.2730007767677307, + "learning_rate": 2.8406790411413366e-07, + "loss": 0.3362, + "step": 8348 + }, + { + "epoch": 4.513786267795999, + "grad_norm": 0.25781869888305664, + "learning_rate": 2.834409337112842e-07, + "loss": 0.3599, + "step": 8349 + }, + { + "epoch": 4.514326905748783, + "grad_norm": 0.27375370264053345, + "learning_rate": 2.828146357868755e-07, + "loss": 0.3511, + "step": 8350 + }, + { + "epoch": 4.514867543701568, + "grad_norm": 0.27994829416275024, + "learning_rate": 2.821890104302022e-07, + "loss": 0.3468, + "step": 8351 + }, + { + "epoch": 4.515408181654352, + "grad_norm": 0.2658286988735199, + "learning_rate": 2.815640577304668e-07, + "loss": 0.3529, + "step": 8352 + }, + { + "epoch": 4.515948819607137, + "grad_norm": 0.2623860239982605, + "learning_rate": 2.8093977777677195e-07, + "loss": 0.3604, + "step": 8353 + }, + { + "epoch": 4.516489457559921, + "grad_norm": 0.2674957513809204, + "learning_rate": 2.803161706581281e-07, + "loss": 0.3431, + "step": 8354 + }, + { + "epoch": 4.517030095512705, + "grad_norm": 0.27279454469680786, + "learning_rate": 2.796932364634475e-07, + "loss": 0.3777, + "step": 8355 + }, + { + "epoch": 4.517570733465489, + "grad_norm": 0.24342961609363556, + "learning_rate": 2.790709752815457e-07, + "loss": 0.3348, + "step": 8356 + }, + { + "epoch": 4.518111371418273, + "grad_norm": 0.2706700563430786, + "learning_rate": 2.7844938720114566e-07, + "loss": 0.3639, + "step": 8357 + }, + { + "epoch": 4.518652009371058, + "grad_norm": 0.27748116850852966, + "learning_rate": 2.7782847231087095e-07, + "loss": 0.3534, + "step": 8358 + }, + { + "epoch": 4.519192647323842, + "grad_norm": 0.2530544698238373, + "learning_rate": 2.772082306992513e-07, + "loss": 0.357, + "step": 8359 + }, + { + "epoch": 4.519733285276627, + "grad_norm": 0.2769469618797302, + "learning_rate": 2.765886624547182e-07, + "loss": 0.4058, + "step": 8360 + }, + { + "epoch": 4.520273923229411, + "grad_norm": 0.277963250875473, + "learning_rate": 2.7596976766560977e-07, + "loss": 0.359, + "step": 8361 + }, + { + "epoch": 4.520814561182195, + "grad_norm": 0.2735162675380707, + "learning_rate": 2.753515464201678e-07, + "loss": 0.3487, + "step": 8362 + }, + { + "epoch": 4.521355199134979, + "grad_norm": 0.26233893632888794, + "learning_rate": 2.747339988065345e-07, + "loss": 0.3617, + "step": 8363 + }, + { + "epoch": 4.521895837087763, + "grad_norm": 0.2705513536930084, + "learning_rate": 2.741171249127611e-07, + "loss": 0.3567, + "step": 8364 + }, + { + "epoch": 4.522436475040548, + "grad_norm": 0.24441170692443848, + "learning_rate": 2.7350092482679836e-07, + "loss": 0.3372, + "step": 8365 + }, + { + "epoch": 4.522977112993332, + "grad_norm": 0.2987588942050934, + "learning_rate": 2.7288539863650544e-07, + "loss": 0.3831, + "step": 8366 + }, + { + "epoch": 4.523517750946116, + "grad_norm": 0.2871190309524536, + "learning_rate": 2.72270546429641e-07, + "loss": 0.3495, + "step": 8367 + }, + { + "epoch": 4.524058388898901, + "grad_norm": 0.2715509533882141, + "learning_rate": 2.7165636829387e-07, + "loss": 0.3692, + "step": 8368 + }, + { + "epoch": 4.524599026851685, + "grad_norm": 0.26629409193992615, + "learning_rate": 2.710428643167612e-07, + "loss": 0.3899, + "step": 8369 + }, + { + "epoch": 4.5251396648044695, + "grad_norm": 0.2310323566198349, + "learning_rate": 2.7043003458578685e-07, + "loss": 0.309, + "step": 8370 + }, + { + "epoch": 4.525680302757253, + "grad_norm": 0.2734542489051819, + "learning_rate": 2.6981787918832216e-07, + "loss": 0.3796, + "step": 8371 + }, + { + "epoch": 4.526220940710038, + "grad_norm": 0.2826656103134155, + "learning_rate": 2.6920639821164883e-07, + "loss": 0.3614, + "step": 8372 + }, + { + "epoch": 4.526761578662822, + "grad_norm": 0.2604270577430725, + "learning_rate": 2.685955917429489e-07, + "loss": 0.3586, + "step": 8373 + }, + { + "epoch": 4.527302216615606, + "grad_norm": 0.255300909280777, + "learning_rate": 2.6798545986931214e-07, + "loss": 0.3541, + "step": 8374 + }, + { + "epoch": 4.527842854568391, + "grad_norm": 0.273477703332901, + "learning_rate": 2.673760026777272e-07, + "loss": 0.3721, + "step": 8375 + }, + { + "epoch": 4.528383492521175, + "grad_norm": 0.2807256281375885, + "learning_rate": 2.667672202550925e-07, + "loss": 0.4015, + "step": 8376 + }, + { + "epoch": 4.52892413047396, + "grad_norm": 0.2528318464756012, + "learning_rate": 2.661591126882046e-07, + "loss": 0.3233, + "step": 8377 + }, + { + "epoch": 4.5294647684267435, + "grad_norm": 0.2718980312347412, + "learning_rate": 2.6555168006376755e-07, + "loss": 0.3847, + "step": 8378 + }, + { + "epoch": 4.530005406379528, + "grad_norm": 0.26820477843284607, + "learning_rate": 2.6494492246838863e-07, + "loss": 0.3554, + "step": 8379 + }, + { + "epoch": 4.530546044332312, + "grad_norm": 0.25228914618492126, + "learning_rate": 2.6433883998857657e-07, + "loss": 0.3264, + "step": 8380 + }, + { + "epoch": 4.531086682285096, + "grad_norm": 0.308725506067276, + "learning_rate": 2.6373343271074657e-07, + "loss": 0.4118, + "step": 8381 + }, + { + "epoch": 4.531627320237881, + "grad_norm": 0.2523513436317444, + "learning_rate": 2.631287007212169e-07, + "loss": 0.3539, + "step": 8382 + }, + { + "epoch": 4.532167958190665, + "grad_norm": 0.2601075768470764, + "learning_rate": 2.6252464410620793e-07, + "loss": 0.3166, + "step": 8383 + }, + { + "epoch": 4.532708596143449, + "grad_norm": 0.2990708351135254, + "learning_rate": 2.6192126295184584e-07, + "loss": 0.3905, + "step": 8384 + }, + { + "epoch": 4.533249234096234, + "grad_norm": 0.2582680881023407, + "learning_rate": 2.61318557344159e-07, + "loss": 0.3348, + "step": 8385 + }, + { + "epoch": 4.5337898720490175, + "grad_norm": 0.25455453991889954, + "learning_rate": 2.6071652736908203e-07, + "loss": 0.3461, + "step": 8386 + }, + { + "epoch": 4.534330510001802, + "grad_norm": 0.272830069065094, + "learning_rate": 2.601151731124485e-07, + "loss": 0.3809, + "step": 8387 + }, + { + "epoch": 4.534871147954586, + "grad_norm": 0.282148540019989, + "learning_rate": 2.5951449465999925e-07, + "loss": 0.3666, + "step": 8388 + }, + { + "epoch": 4.535411785907371, + "grad_norm": 0.26278114318847656, + "learning_rate": 2.5891449209737906e-07, + "loss": 0.3652, + "step": 8389 + }, + { + "epoch": 4.535952423860155, + "grad_norm": 0.2743205726146698, + "learning_rate": 2.5831516551013405e-07, + "loss": 0.355, + "step": 8390 + }, + { + "epoch": 4.536493061812939, + "grad_norm": 0.2829950749874115, + "learning_rate": 2.577165149837163e-07, + "loss": 0.41, + "step": 8391 + }, + { + "epoch": 4.537033699765724, + "grad_norm": 0.2564539313316345, + "learning_rate": 2.5711854060347817e-07, + "loss": 0.3283, + "step": 8392 + }, + { + "epoch": 4.537574337718508, + "grad_norm": 0.2629248797893524, + "learning_rate": 2.5652124245468033e-07, + "loss": 0.3654, + "step": 8393 + }, + { + "epoch": 4.538114975671292, + "grad_norm": 0.2670019865036011, + "learning_rate": 2.5592462062248304e-07, + "loss": 0.3524, + "step": 8394 + }, + { + "epoch": 4.538655613624076, + "grad_norm": 0.27533864974975586, + "learning_rate": 2.553286751919509e-07, + "loss": 0.3658, + "step": 8395 + }, + { + "epoch": 4.539196251576861, + "grad_norm": 0.25868937373161316, + "learning_rate": 2.547334062480544e-07, + "loss": 0.3483, + "step": 8396 + }, + { + "epoch": 4.539736889529645, + "grad_norm": 0.27601099014282227, + "learning_rate": 2.541388138756645e-07, + "loss": 0.3938, + "step": 8397 + }, + { + "epoch": 4.540277527482429, + "grad_norm": 0.26688718795776367, + "learning_rate": 2.535448981595595e-07, + "loss": 0.3755, + "step": 8398 + }, + { + "epoch": 4.540818165435214, + "grad_norm": 0.27354729175567627, + "learning_rate": 2.52951659184415e-07, + "loss": 0.3688, + "step": 8399 + }, + { + "epoch": 4.541358803387998, + "grad_norm": 0.27443838119506836, + "learning_rate": 2.523590970348166e-07, + "loss": 0.3485, + "step": 8400 + }, + { + "epoch": 4.5418994413407825, + "grad_norm": 0.26458561420440674, + "learning_rate": 2.517672117952502e-07, + "loss": 0.3582, + "step": 8401 + }, + { + "epoch": 4.542440079293566, + "grad_norm": 0.2567475140094757, + "learning_rate": 2.511760035501054e-07, + "loss": 0.3792, + "step": 8402 + }, + { + "epoch": 4.54298071724635, + "grad_norm": 0.259311705827713, + "learning_rate": 2.5058547238367703e-07, + "loss": 0.3247, + "step": 8403 + }, + { + "epoch": 4.543521355199135, + "grad_norm": 0.2569766342639923, + "learning_rate": 2.4999561838015996e-07, + "loss": 0.3825, + "step": 8404 + }, + { + "epoch": 4.544061993151919, + "grad_norm": 0.2629188895225525, + "learning_rate": 2.4940644162365523e-07, + "loss": 0.3596, + "step": 8405 + }, + { + "epoch": 4.544602631104704, + "grad_norm": 0.2798813581466675, + "learning_rate": 2.4881794219816624e-07, + "loss": 0.36, + "step": 8406 + }, + { + "epoch": 4.545143269057488, + "grad_norm": 0.2585632801055908, + "learning_rate": 2.482301201876014e-07, + "loss": 0.3513, + "step": 8407 + }, + { + "epoch": 4.545683907010272, + "grad_norm": 0.26448342204093933, + "learning_rate": 2.4764297567577035e-07, + "loss": 0.3891, + "step": 8408 + }, + { + "epoch": 4.5462245449630565, + "grad_norm": 0.2770906686782837, + "learning_rate": 2.4705650874638667e-07, + "loss": 0.3563, + "step": 8409 + }, + { + "epoch": 4.54676518291584, + "grad_norm": 0.2554166913032532, + "learning_rate": 2.46470719483069e-07, + "loss": 0.3375, + "step": 8410 + }, + { + "epoch": 4.547305820868625, + "grad_norm": 0.23727665841579437, + "learning_rate": 2.458856079693378e-07, + "loss": 0.3559, + "step": 8411 + }, + { + "epoch": 4.547846458821409, + "grad_norm": 0.25934919714927673, + "learning_rate": 2.4530117428861576e-07, + "loss": 0.3787, + "step": 8412 + }, + { + "epoch": 4.548387096774194, + "grad_norm": 0.2681872844696045, + "learning_rate": 2.447174185242324e-07, + "loss": 0.3744, + "step": 8413 + }, + { + "epoch": 4.548927734726978, + "grad_norm": 0.26082801818847656, + "learning_rate": 2.4413434075941657e-07, + "loss": 0.336, + "step": 8414 + }, + { + "epoch": 4.549468372679762, + "grad_norm": 0.2786238193511963, + "learning_rate": 2.435519410773052e-07, + "loss": 0.3871, + "step": 8415 + }, + { + "epoch": 4.550009010632547, + "grad_norm": 0.281931608915329, + "learning_rate": 2.429702195609329e-07, + "loss": 0.404, + "step": 8416 + }, + { + "epoch": 4.5505496485853305, + "grad_norm": 0.2451360821723938, + "learning_rate": 2.4238917629324124e-07, + "loss": 0.329, + "step": 8417 + }, + { + "epoch": 4.551090286538115, + "grad_norm": 0.27641189098358154, + "learning_rate": 2.4180881135707547e-07, + "loss": 0.4183, + "step": 8418 + }, + { + "epoch": 4.551630924490899, + "grad_norm": 0.24302369356155396, + "learning_rate": 2.4122912483518226e-07, + "loss": 0.3248, + "step": 8419 + }, + { + "epoch": 4.552171562443684, + "grad_norm": 0.2982977628707886, + "learning_rate": 2.4065011681021266e-07, + "loss": 0.3694, + "step": 8420 + }, + { + "epoch": 4.552712200396468, + "grad_norm": 0.25425615906715393, + "learning_rate": 2.40071787364719e-07, + "loss": 0.3448, + "step": 8421 + }, + { + "epoch": 4.553252838349252, + "grad_norm": 0.2629137933254242, + "learning_rate": 2.3949413658116037e-07, + "loss": 0.3513, + "step": 8422 + }, + { + "epoch": 4.553793476302037, + "grad_norm": 0.2689550817012787, + "learning_rate": 2.389171645418964e-07, + "loss": 0.381, + "step": 8423 + }, + { + "epoch": 4.554334114254821, + "grad_norm": 0.282644659280777, + "learning_rate": 2.3834087132919016e-07, + "loss": 0.4088, + "step": 8424 + }, + { + "epoch": 4.5548747522076045, + "grad_norm": 0.2677907645702362, + "learning_rate": 2.3776525702520925e-07, + "loss": 0.3203, + "step": 8425 + }, + { + "epoch": 4.555415390160389, + "grad_norm": 0.2817721962928772, + "learning_rate": 2.3719032171202362e-07, + "loss": 0.3726, + "step": 8426 + }, + { + "epoch": 4.555956028113173, + "grad_norm": 0.2728714346885681, + "learning_rate": 2.3661606547160653e-07, + "loss": 0.3582, + "step": 8427 + }, + { + "epoch": 4.556496666065958, + "grad_norm": 0.2762387692928314, + "learning_rate": 2.3604248838583421e-07, + "loss": 0.3635, + "step": 8428 + }, + { + "epoch": 4.557037304018742, + "grad_norm": 0.26854419708251953, + "learning_rate": 2.3546959053648565e-07, + "loss": 0.3672, + "step": 8429 + }, + { + "epoch": 4.557577941971527, + "grad_norm": 0.25551918148994446, + "learning_rate": 2.3489737200524498e-07, + "loss": 0.3367, + "step": 8430 + }, + { + "epoch": 4.558118579924311, + "grad_norm": 0.27728936076164246, + "learning_rate": 2.3432583287369747e-07, + "loss": 0.3621, + "step": 8431 + }, + { + "epoch": 4.558659217877095, + "grad_norm": 0.26189514994621277, + "learning_rate": 2.3375497322333186e-07, + "loss": 0.329, + "step": 8432 + }, + { + "epoch": 4.559199855829879, + "grad_norm": 0.27320173382759094, + "learning_rate": 2.3318479313554022e-07, + "loss": 0.3576, + "step": 8433 + }, + { + "epoch": 4.559740493782663, + "grad_norm": 0.26311782002449036, + "learning_rate": 2.326152926916181e-07, + "loss": 0.3691, + "step": 8434 + }, + { + "epoch": 4.560281131735448, + "grad_norm": 0.25582045316696167, + "learning_rate": 2.3204647197276387e-07, + "loss": 0.3302, + "step": 8435 + }, + { + "epoch": 4.560821769688232, + "grad_norm": 0.26089537143707275, + "learning_rate": 2.3147833106007823e-07, + "loss": 0.346, + "step": 8436 + }, + { + "epoch": 4.561362407641017, + "grad_norm": 0.27061784267425537, + "learning_rate": 2.309108700345669e-07, + "loss": 0.4184, + "step": 8437 + }, + { + "epoch": 4.561903045593801, + "grad_norm": 0.27126196026802063, + "learning_rate": 2.303440889771358e-07, + "loss": 0.3583, + "step": 8438 + }, + { + "epoch": 4.562443683546585, + "grad_norm": 0.2695308029651642, + "learning_rate": 2.2977798796859796e-07, + "loss": 0.3792, + "step": 8439 + }, + { + "epoch": 4.5629843214993695, + "grad_norm": 0.2534771263599396, + "learning_rate": 2.29212567089665e-07, + "loss": 0.3195, + "step": 8440 + }, + { + "epoch": 4.563524959452153, + "grad_norm": 0.2534823715686798, + "learning_rate": 2.2864782642095407e-07, + "loss": 0.3346, + "step": 8441 + }, + { + "epoch": 4.564065597404937, + "grad_norm": 0.28253212571144104, + "learning_rate": 2.2808376604298522e-07, + "loss": 0.3789, + "step": 8442 + }, + { + "epoch": 4.564606235357722, + "grad_norm": 0.2741342782974243, + "learning_rate": 2.2752038603618132e-07, + "loss": 0.3627, + "step": 8443 + }, + { + "epoch": 4.565146873310506, + "grad_norm": 0.2669365406036377, + "learning_rate": 2.2695768648086758e-07, + "loss": 0.3652, + "step": 8444 + }, + { + "epoch": 4.565687511263291, + "grad_norm": 0.27536723017692566, + "learning_rate": 2.2639566745727203e-07, + "loss": 0.3726, + "step": 8445 + }, + { + "epoch": 4.566228149216075, + "grad_norm": 0.2741495668888092, + "learning_rate": 2.2583432904552726e-07, + "loss": 0.3858, + "step": 8446 + }, + { + "epoch": 4.56676878716886, + "grad_norm": 0.2663235366344452, + "learning_rate": 2.2527367132566925e-07, + "loss": 0.3532, + "step": 8447 + }, + { + "epoch": 4.5673094251216435, + "grad_norm": 0.2848953604698181, + "learning_rate": 2.2471369437763234e-07, + "loss": 0.3713, + "step": 8448 + }, + { + "epoch": 4.567850063074427, + "grad_norm": 0.2695309519767761, + "learning_rate": 2.241543982812594e-07, + "loss": 0.3672, + "step": 8449 + }, + { + "epoch": 4.568390701027212, + "grad_norm": 0.26727294921875, + "learning_rate": 2.2359578311629272e-07, + "loss": 0.3472, + "step": 8450 + }, + { + "epoch": 4.568931338979996, + "grad_norm": 0.26325517892837524, + "learning_rate": 2.2303784896238022e-07, + "loss": 0.3649, + "step": 8451 + }, + { + "epoch": 4.569471976932781, + "grad_norm": 0.26693716645240784, + "learning_rate": 2.2248059589906944e-07, + "loss": 0.4039, + "step": 8452 + }, + { + "epoch": 4.570012614885565, + "grad_norm": 0.2606445848941803, + "learning_rate": 2.2192402400581237e-07, + "loss": 0.3561, + "step": 8453 + }, + { + "epoch": 4.57055325283835, + "grad_norm": 0.24945513904094696, + "learning_rate": 2.2136813336196606e-07, + "loss": 0.3808, + "step": 8454 + }, + { + "epoch": 4.571093890791134, + "grad_norm": 0.25371333956718445, + "learning_rate": 2.2081292404678655e-07, + "loss": 0.3405, + "step": 8455 + }, + { + "epoch": 4.5716345287439175, + "grad_norm": 0.2556355893611908, + "learning_rate": 2.2025839613943445e-07, + "loss": 0.3594, + "step": 8456 + }, + { + "epoch": 4.572175166696702, + "grad_norm": 0.27583563327789307, + "learning_rate": 2.1970454971897483e-07, + "loss": 0.396, + "step": 8457 + }, + { + "epoch": 4.572715804649486, + "grad_norm": 0.24128134548664093, + "learning_rate": 2.1915138486437281e-07, + "loss": 0.3408, + "step": 8458 + }, + { + "epoch": 4.573256442602271, + "grad_norm": 0.278267502784729, + "learning_rate": 2.1859890165449926e-07, + "loss": 0.3846, + "step": 8459 + }, + { + "epoch": 4.573797080555055, + "grad_norm": 0.2626984715461731, + "learning_rate": 2.1804710016812337e-07, + "loss": 0.3731, + "step": 8460 + }, + { + "epoch": 4.574337718507839, + "grad_norm": 0.25800734758377075, + "learning_rate": 2.1749598048392272e-07, + "loss": 0.336, + "step": 8461 + }, + { + "epoch": 4.574878356460624, + "grad_norm": 0.2768734097480774, + "learning_rate": 2.1694554268047342e-07, + "loss": 0.3683, + "step": 8462 + }, + { + "epoch": 4.575418994413408, + "grad_norm": 0.26480376720428467, + "learning_rate": 2.1639578683625707e-07, + "loss": 0.3771, + "step": 8463 + }, + { + "epoch": 4.575959632366192, + "grad_norm": 0.3137277066707611, + "learning_rate": 2.1584671302965598e-07, + "loss": 0.339, + "step": 8464 + }, + { + "epoch": 4.576500270318976, + "grad_norm": 0.2833629548549652, + "learning_rate": 2.152983213389559e-07, + "loss": 0.3893, + "step": 8465 + }, + { + "epoch": 4.57704090827176, + "grad_norm": 0.27549314498901367, + "learning_rate": 2.1475061184234648e-07, + "loss": 0.3544, + "step": 8466 + }, + { + "epoch": 4.577581546224545, + "grad_norm": 0.27217674255371094, + "learning_rate": 2.1420358461791745e-07, + "loss": 0.3616, + "step": 8467 + }, + { + "epoch": 4.578122184177329, + "grad_norm": 0.28430917859077454, + "learning_rate": 2.1365723974366537e-07, + "loss": 0.3506, + "step": 8468 + }, + { + "epoch": 4.578662822130114, + "grad_norm": 0.2661433219909668, + "learning_rate": 2.1311157729748566e-07, + "loss": 0.3662, + "step": 8469 + }, + { + "epoch": 4.579203460082898, + "grad_norm": 0.260281503200531, + "learning_rate": 2.1256659735717777e-07, + "loss": 0.347, + "step": 8470 + }, + { + "epoch": 4.5797440980356825, + "grad_norm": 0.2790489196777344, + "learning_rate": 2.1202230000044455e-07, + "loss": 0.3983, + "step": 8471 + }, + { + "epoch": 4.580284735988466, + "grad_norm": 0.24340921640396118, + "learning_rate": 2.1147868530489113e-07, + "loss": 0.3427, + "step": 8472 + }, + { + "epoch": 4.58082537394125, + "grad_norm": 0.2590758502483368, + "learning_rate": 2.1093575334802385e-07, + "loss": 0.3462, + "step": 8473 + }, + { + "epoch": 4.581366011894035, + "grad_norm": 0.2715151309967041, + "learning_rate": 2.1039350420725358e-07, + "loss": 0.3955, + "step": 8474 + }, + { + "epoch": 4.581906649846819, + "grad_norm": 0.2760257422924042, + "learning_rate": 2.0985193795989345e-07, + "loss": 0.3629, + "step": 8475 + }, + { + "epoch": 4.582447287799604, + "grad_norm": 0.26709234714508057, + "learning_rate": 2.0931105468316005e-07, + "loss": 0.3857, + "step": 8476 + }, + { + "epoch": 4.582987925752388, + "grad_norm": 0.24396254122257233, + "learning_rate": 2.0877085445416889e-07, + "loss": 0.3469, + "step": 8477 + }, + { + "epoch": 4.583528563705173, + "grad_norm": 0.262498140335083, + "learning_rate": 2.082313373499434e-07, + "loss": 0.3802, + "step": 8478 + }, + { + "epoch": 4.5840692016579565, + "grad_norm": 0.28324538469314575, + "learning_rate": 2.0769250344740476e-07, + "loss": 0.3766, + "step": 8479 + }, + { + "epoch": 4.58460983961074, + "grad_norm": 0.29584234952926636, + "learning_rate": 2.071543528233805e-07, + "loss": 0.3435, + "step": 8480 + }, + { + "epoch": 4.585150477563525, + "grad_norm": 0.2602497935295105, + "learning_rate": 2.0661688555459915e-07, + "loss": 0.3223, + "step": 8481 + }, + { + "epoch": 4.585691115516309, + "grad_norm": 0.28299152851104736, + "learning_rate": 2.0608010171768998e-07, + "loss": 0.3996, + "step": 8482 + }, + { + "epoch": 4.586231753469093, + "grad_norm": 0.2951735854148865, + "learning_rate": 2.0554400138918907e-07, + "loss": 0.3568, + "step": 8483 + }, + { + "epoch": 4.586772391421878, + "grad_norm": 0.26150617003440857, + "learning_rate": 2.0500858464553186e-07, + "loss": 0.3524, + "step": 8484 + }, + { + "epoch": 4.587313029374662, + "grad_norm": 2.3454174995422363, + "learning_rate": 2.0447385156305565e-07, + "loss": 0.3718, + "step": 8485 + }, + { + "epoch": 4.587853667327447, + "grad_norm": 0.27141591906547546, + "learning_rate": 2.0393980221800337e-07, + "loss": 0.3694, + "step": 8486 + }, + { + "epoch": 4.5883943052802305, + "grad_norm": 0.2717527449131012, + "learning_rate": 2.0340643668651794e-07, + "loss": 0.3807, + "step": 8487 + }, + { + "epoch": 4.588934943233015, + "grad_norm": 0.2657257616519928, + "learning_rate": 2.0287375504464746e-07, + "loss": 0.3714, + "step": 8488 + }, + { + "epoch": 4.589475581185799, + "grad_norm": 0.2644079923629761, + "learning_rate": 2.0234175736833727e-07, + "loss": 0.3854, + "step": 8489 + }, + { + "epoch": 4.590016219138583, + "grad_norm": 0.2563624978065491, + "learning_rate": 2.0181044373344172e-07, + "loss": 0.3454, + "step": 8490 + }, + { + "epoch": 4.590556857091368, + "grad_norm": 0.26360106468200684, + "learning_rate": 2.0127981421571295e-07, + "loss": 0.3908, + "step": 8491 + }, + { + "epoch": 4.591097495044152, + "grad_norm": 0.243517205119133, + "learning_rate": 2.0074986889080826e-07, + "loss": 0.3355, + "step": 8492 + }, + { + "epoch": 4.591638132996937, + "grad_norm": 0.2634882628917694, + "learning_rate": 2.0022060783428553e-07, + "loss": 0.3639, + "step": 8493 + }, + { + "epoch": 4.592178770949721, + "grad_norm": 0.2720161974430084, + "learning_rate": 1.9969203112160497e-07, + "loss": 0.3923, + "step": 8494 + }, + { + "epoch": 4.592719408902505, + "grad_norm": 0.2561497688293457, + "learning_rate": 1.9916413882813235e-07, + "loss": 0.3674, + "step": 8495 + }, + { + "epoch": 4.593260046855289, + "grad_norm": 0.2585059106349945, + "learning_rate": 1.9863693102913195e-07, + "loss": 0.3531, + "step": 8496 + }, + { + "epoch": 4.593800684808073, + "grad_norm": 0.2782512903213501, + "learning_rate": 1.9811040779977196e-07, + "loss": 0.3882, + "step": 8497 + }, + { + "epoch": 4.594341322760858, + "grad_norm": 0.25891584157943726, + "learning_rate": 1.9758456921512403e-07, + "loss": 0.3651, + "step": 8498 + }, + { + "epoch": 4.594881960713642, + "grad_norm": 0.2681910991668701, + "learning_rate": 1.9705941535016092e-07, + "loss": 0.3663, + "step": 8499 + }, + { + "epoch": 4.595422598666426, + "grad_norm": 0.25091880559921265, + "learning_rate": 1.9653494627975888e-07, + "loss": 0.3547, + "step": 8500 + }, + { + "epoch": 4.595963236619211, + "grad_norm": 0.24863027036190033, + "learning_rate": 1.9601116207869365e-07, + "loss": 0.3442, + "step": 8501 + }, + { + "epoch": 4.596503874571995, + "grad_norm": 0.2696053385734558, + "learning_rate": 1.9548806282164768e-07, + "loss": 0.3943, + "step": 8502 + }, + { + "epoch": 4.597044512524779, + "grad_norm": 0.25909149646759033, + "learning_rate": 1.9496564858320187e-07, + "loss": 0.3766, + "step": 8503 + }, + { + "epoch": 4.597585150477563, + "grad_norm": 0.25561127066612244, + "learning_rate": 1.9444391943784225e-07, + "loss": 0.3593, + "step": 8504 + }, + { + "epoch": 4.598125788430348, + "grad_norm": 0.2775280177593231, + "learning_rate": 1.9392287545995536e-07, + "loss": 0.3556, + "step": 8505 + }, + { + "epoch": 4.598666426383132, + "grad_norm": 0.27647945284843445, + "learning_rate": 1.934025167238307e-07, + "loss": 0.3747, + "step": 8506 + }, + { + "epoch": 4.599207064335916, + "grad_norm": 0.26714831590652466, + "learning_rate": 1.9288284330366113e-07, + "loss": 0.3921, + "step": 8507 + }, + { + "epoch": 4.599747702288701, + "grad_norm": 0.2643846273422241, + "learning_rate": 1.9236385527353906e-07, + "loss": 0.356, + "step": 8508 + }, + { + "epoch": 4.600288340241485, + "grad_norm": 0.25512391328811646, + "learning_rate": 1.9184555270746198e-07, + "loss": 0.3306, + "step": 8509 + }, + { + "epoch": 4.6008289781942695, + "grad_norm": 0.26640191674232483, + "learning_rate": 1.913279356793285e-07, + "loss": 0.3658, + "step": 8510 + }, + { + "epoch": 4.601369616147053, + "grad_norm": 0.2815695106983185, + "learning_rate": 1.9081100426293852e-07, + "loss": 0.3834, + "step": 8511 + }, + { + "epoch": 4.601910254099838, + "grad_norm": 0.27226167917251587, + "learning_rate": 1.9029475853199754e-07, + "loss": 0.3463, + "step": 8512 + }, + { + "epoch": 4.602450892052622, + "grad_norm": 0.2682137191295624, + "learning_rate": 1.8977919856010773e-07, + "loss": 0.3779, + "step": 8513 + }, + { + "epoch": 4.602991530005406, + "grad_norm": 0.26758265495300293, + "learning_rate": 1.8926432442077868e-07, + "loss": 0.3668, + "step": 8514 + }, + { + "epoch": 4.603532167958191, + "grad_norm": 0.2831733226776123, + "learning_rate": 1.8875013618742e-07, + "loss": 0.4153, + "step": 8515 + }, + { + "epoch": 4.604072805910975, + "grad_norm": 0.26426440477371216, + "learning_rate": 1.8823663393334358e-07, + "loss": 0.3115, + "step": 8516 + }, + { + "epoch": 4.60461344386376, + "grad_norm": 0.27677780389785767, + "learning_rate": 1.8772381773176417e-07, + "loss": 0.3663, + "step": 8517 + }, + { + "epoch": 4.6051540818165435, + "grad_norm": 0.2731216549873352, + "learning_rate": 1.8721168765579668e-07, + "loss": 0.3564, + "step": 8518 + }, + { + "epoch": 4.605694719769327, + "grad_norm": 0.26677405834198, + "learning_rate": 1.8670024377846098e-07, + "loss": 0.3406, + "step": 8519 + }, + { + "epoch": 4.606235357722112, + "grad_norm": 0.2840011417865753, + "learning_rate": 1.8618948617267764e-07, + "loss": 0.3752, + "step": 8520 + }, + { + "epoch": 4.606775995674896, + "grad_norm": 0.2730851471424103, + "learning_rate": 1.8567941491126896e-07, + "loss": 0.3844, + "step": 8521 + }, + { + "epoch": 4.607316633627681, + "grad_norm": 0.25464585423469543, + "learning_rate": 1.8517003006696067e-07, + "loss": 0.3598, + "step": 8522 + }, + { + "epoch": 4.607857271580465, + "grad_norm": 0.2869928181171417, + "learning_rate": 1.846613317123791e-07, + "loss": 0.4345, + "step": 8523 + }, + { + "epoch": 4.608397909533249, + "grad_norm": 0.26284706592559814, + "learning_rate": 1.8415331992005514e-07, + "loss": 0.3499, + "step": 8524 + }, + { + "epoch": 4.608938547486034, + "grad_norm": 0.27476251125335693, + "learning_rate": 1.8364599476241862e-07, + "loss": 0.3402, + "step": 8525 + }, + { + "epoch": 4.6094791854388175, + "grad_norm": 0.27824050188064575, + "learning_rate": 1.8313935631180334e-07, + "loss": 0.3668, + "step": 8526 + }, + { + "epoch": 4.610019823391602, + "grad_norm": 0.2557879090309143, + "learning_rate": 1.8263340464044542e-07, + "loss": 0.3522, + "step": 8527 + }, + { + "epoch": 4.610560461344386, + "grad_norm": 0.28685206174850464, + "learning_rate": 1.8212813982048217e-07, + "loss": 0.3824, + "step": 8528 + }, + { + "epoch": 4.611101099297171, + "grad_norm": 0.2578665018081665, + "learning_rate": 1.8162356192395368e-07, + "loss": 0.3158, + "step": 8529 + }, + { + "epoch": 4.611641737249955, + "grad_norm": 0.25863519310951233, + "learning_rate": 1.8111967102280082e-07, + "loss": 0.3714, + "step": 8530 + }, + { + "epoch": 4.612182375202739, + "grad_norm": 0.2879449725151062, + "learning_rate": 1.8061646718886882e-07, + "loss": 0.3867, + "step": 8531 + }, + { + "epoch": 4.612723013155524, + "grad_norm": 0.26990994811058044, + "learning_rate": 1.8011395049390202e-07, + "loss": 0.3338, + "step": 8532 + }, + { + "epoch": 4.613263651108308, + "grad_norm": 0.26068446040153503, + "learning_rate": 1.796121210095497e-07, + "loss": 0.3387, + "step": 8533 + }, + { + "epoch": 4.613804289061092, + "grad_norm": 0.2538585066795349, + "learning_rate": 1.7911097880736083e-07, + "loss": 0.3633, + "step": 8534 + }, + { + "epoch": 4.614344927013876, + "grad_norm": 0.26799604296684265, + "learning_rate": 1.7861052395878764e-07, + "loss": 0.3882, + "step": 8535 + }, + { + "epoch": 4.614885564966661, + "grad_norm": 0.24625979363918304, + "learning_rate": 1.781107565351853e-07, + "loss": 0.3487, + "step": 8536 + }, + { + "epoch": 4.615426202919445, + "grad_norm": 0.25957322120666504, + "learning_rate": 1.7761167660780787e-07, + "loss": 0.3542, + "step": 8537 + }, + { + "epoch": 4.615966840872229, + "grad_norm": 0.26084718108177185, + "learning_rate": 1.771132842478135e-07, + "loss": 0.3694, + "step": 8538 + }, + { + "epoch": 4.616507478825014, + "grad_norm": 0.25296398997306824, + "learning_rate": 1.7661557952626361e-07, + "loss": 0.3508, + "step": 8539 + }, + { + "epoch": 4.617048116777798, + "grad_norm": 0.26038071513175964, + "learning_rate": 1.7611856251411818e-07, + "loss": 0.3655, + "step": 8540 + }, + { + "epoch": 4.617588754730582, + "grad_norm": 0.281929075717926, + "learning_rate": 1.7562223328224327e-07, + "loss": 0.3744, + "step": 8541 + }, + { + "epoch": 4.618129392683366, + "grad_norm": 0.2631310224533081, + "learning_rate": 1.751265919014017e-07, + "loss": 0.3511, + "step": 8542 + }, + { + "epoch": 4.61867003063615, + "grad_norm": 0.2645679712295532, + "learning_rate": 1.7463163844226304e-07, + "loss": 0.3575, + "step": 8543 + }, + { + "epoch": 4.619210668588935, + "grad_norm": 0.2860918343067169, + "learning_rate": 1.7413737297539647e-07, + "loss": 0.3759, + "step": 8544 + }, + { + "epoch": 4.619751306541719, + "grad_norm": 0.2724156975746155, + "learning_rate": 1.7364379557127387e-07, + "loss": 0.3489, + "step": 8545 + }, + { + "epoch": 4.620291944494504, + "grad_norm": 0.28222528100013733, + "learning_rate": 1.7315090630026788e-07, + "loss": 0.3622, + "step": 8546 + }, + { + "epoch": 4.620832582447288, + "grad_norm": 0.25786492228507996, + "learning_rate": 1.7265870523265393e-07, + "loss": 0.3657, + "step": 8547 + }, + { + "epoch": 4.621373220400072, + "grad_norm": 0.24484671652317047, + "learning_rate": 1.7216719243860924e-07, + "loss": 0.3474, + "step": 8548 + }, + { + "epoch": 4.6219138583528565, + "grad_norm": 0.255718469619751, + "learning_rate": 1.716763679882133e-07, + "loss": 0.3242, + "step": 8549 + }, + { + "epoch": 4.62245449630564, + "grad_norm": 0.2624998390674591, + "learning_rate": 1.711862319514457e-07, + "loss": 0.3797, + "step": 8550 + }, + { + "epoch": 4.622995134258425, + "grad_norm": 0.2688087821006775, + "learning_rate": 1.7069678439819047e-07, + "loss": 0.3833, + "step": 8551 + }, + { + "epoch": 4.623535772211209, + "grad_norm": 0.25261446833610535, + "learning_rate": 1.702080253982308e-07, + "loss": 0.3768, + "step": 8552 + }, + { + "epoch": 4.624076410163994, + "grad_norm": 0.25700151920318604, + "learning_rate": 1.697199550212547e-07, + "loss": 0.3668, + "step": 8553 + }, + { + "epoch": 4.624617048116778, + "grad_norm": 0.2585085332393646, + "learning_rate": 1.6923257333684995e-07, + "loss": 0.3557, + "step": 8554 + }, + { + "epoch": 4.625157686069562, + "grad_norm": 0.2769460380077362, + "learning_rate": 1.6874588041450535e-07, + "loss": 0.3694, + "step": 8555 + }, + { + "epoch": 4.625698324022347, + "grad_norm": 0.27887338399887085, + "learning_rate": 1.6825987632361373e-07, + "loss": 0.3733, + "step": 8556 + }, + { + "epoch": 4.6262389619751305, + "grad_norm": 0.2611374855041504, + "learning_rate": 1.6777456113346857e-07, + "loss": 0.3301, + "step": 8557 + }, + { + "epoch": 4.626779599927915, + "grad_norm": 0.266277939081192, + "learning_rate": 1.672899349132656e-07, + "loss": 0.3639, + "step": 8558 + }, + { + "epoch": 4.627320237880699, + "grad_norm": 0.257068932056427, + "learning_rate": 1.6680599773210017e-07, + "loss": 0.3363, + "step": 8559 + }, + { + "epoch": 4.627860875833483, + "grad_norm": 0.2819574177265167, + "learning_rate": 1.6632274965897365e-07, + "loss": 0.4051, + "step": 8560 + }, + { + "epoch": 4.628401513786268, + "grad_norm": 0.25879088044166565, + "learning_rate": 1.6584019076278492e-07, + "loss": 0.3298, + "step": 8561 + }, + { + "epoch": 4.628942151739052, + "grad_norm": 0.2662845849990845, + "learning_rate": 1.6535832111233662e-07, + "loss": 0.3971, + "step": 8562 + }, + { + "epoch": 4.629482789691837, + "grad_norm": 0.26270678639411926, + "learning_rate": 1.6487714077633387e-07, + "loss": 0.3357, + "step": 8563 + }, + { + "epoch": 4.630023427644621, + "grad_norm": 0.2785986661911011, + "learning_rate": 1.643966498233812e-07, + "loss": 0.3668, + "step": 8564 + }, + { + "epoch": 4.6305640655974045, + "grad_norm": 0.2605617940425873, + "learning_rate": 1.639168483219872e-07, + "loss": 0.3591, + "step": 8565 + }, + { + "epoch": 4.631104703550189, + "grad_norm": 0.2554558515548706, + "learning_rate": 1.6343773634056038e-07, + "loss": 0.3616, + "step": 8566 + }, + { + "epoch": 4.631645341502973, + "grad_norm": 0.2634442150592804, + "learning_rate": 1.6295931394741116e-07, + "loss": 0.3675, + "step": 8567 + }, + { + "epoch": 4.632185979455758, + "grad_norm": 0.2761819362640381, + "learning_rate": 1.6248158121075387e-07, + "loss": 0.3621, + "step": 8568 + }, + { + "epoch": 4.632726617408542, + "grad_norm": 0.2668101191520691, + "learning_rate": 1.6200453819870122e-07, + "loss": 0.346, + "step": 8569 + }, + { + "epoch": 4.633267255361327, + "grad_norm": 0.28411874175071716, + "learning_rate": 1.6152818497926993e-07, + "loss": 0.3801, + "step": 8570 + }, + { + "epoch": 4.633807893314111, + "grad_norm": 0.27118346095085144, + "learning_rate": 1.6105252162037677e-07, + "loss": 0.3863, + "step": 8571 + }, + { + "epoch": 4.634348531266895, + "grad_norm": 0.26166531443595886, + "learning_rate": 1.6057754818984195e-07, + "loss": 0.3405, + "step": 8572 + }, + { + "epoch": 4.634889169219679, + "grad_norm": 0.283855676651001, + "learning_rate": 1.6010326475538628e-07, + "loss": 0.342, + "step": 8573 + }, + { + "epoch": 4.635429807172463, + "grad_norm": 0.27583634853363037, + "learning_rate": 1.5962967138463126e-07, + "loss": 0.3663, + "step": 8574 + }, + { + "epoch": 4.635970445125248, + "grad_norm": 0.2751389145851135, + "learning_rate": 1.5915676814510173e-07, + "loss": 0.3658, + "step": 8575 + }, + { + "epoch": 4.636511083078032, + "grad_norm": 0.26085153222084045, + "learning_rate": 1.5868455510422266e-07, + "loss": 0.3592, + "step": 8576 + }, + { + "epoch": 4.637051721030816, + "grad_norm": 0.25824809074401855, + "learning_rate": 1.5821303232932239e-07, + "loss": 0.3468, + "step": 8577 + }, + { + "epoch": 4.637592358983601, + "grad_norm": 0.2741011083126068, + "learning_rate": 1.577421998876294e-07, + "loss": 0.3728, + "step": 8578 + }, + { + "epoch": 4.638132996936385, + "grad_norm": 0.2605046033859253, + "learning_rate": 1.5727205784627388e-07, + "loss": 0.3647, + "step": 8579 + }, + { + "epoch": 4.6386736348891695, + "grad_norm": 0.2632634937763214, + "learning_rate": 1.5680260627228772e-07, + "loss": 0.3515, + "step": 8580 + }, + { + "epoch": 4.639214272841953, + "grad_norm": 0.2835623621940613, + "learning_rate": 1.563338452326052e-07, + "loss": 0.3799, + "step": 8581 + }, + { + "epoch": 4.639754910794737, + "grad_norm": 0.2817102074623108, + "learning_rate": 1.5586577479406006e-07, + "loss": 0.342, + "step": 8582 + }, + { + "epoch": 4.640295548747522, + "grad_norm": 0.2846224904060364, + "learning_rate": 1.5539839502339005e-07, + "loss": 0.3808, + "step": 8583 + }, + { + "epoch": 4.640836186700306, + "grad_norm": 0.2742244601249695, + "learning_rate": 1.5493170598723296e-07, + "loss": 0.3507, + "step": 8584 + }, + { + "epoch": 4.641376824653091, + "grad_norm": 0.2718614935874939, + "learning_rate": 1.5446570775212944e-07, + "loss": 0.3235, + "step": 8585 + }, + { + "epoch": 4.641917462605875, + "grad_norm": 0.24936747550964355, + "learning_rate": 1.5400040038451913e-07, + "loss": 0.3444, + "step": 8586 + }, + { + "epoch": 4.64245810055866, + "grad_norm": 0.2608190178871155, + "learning_rate": 1.5353578395074563e-07, + "loss": 0.3764, + "step": 8587 + }, + { + "epoch": 4.6429987385114435, + "grad_norm": 0.2773157060146332, + "learning_rate": 1.530718585170521e-07, + "loss": 0.3802, + "step": 8588 + }, + { + "epoch": 4.643539376464227, + "grad_norm": 0.254817932844162, + "learning_rate": 1.5260862414958554e-07, + "loss": 0.365, + "step": 8589 + }, + { + "epoch": 4.644080014417012, + "grad_norm": 0.28640419244766235, + "learning_rate": 1.5214608091439265e-07, + "loss": 0.3834, + "step": 8590 + }, + { + "epoch": 4.644620652369796, + "grad_norm": 0.2514747083187103, + "learning_rate": 1.5168422887742174e-07, + "loss": 0.3186, + "step": 8591 + }, + { + "epoch": 4.645161290322581, + "grad_norm": 0.2734508812427521, + "learning_rate": 1.5122306810452292e-07, + "loss": 0.397, + "step": 8592 + }, + { + "epoch": 4.645701928275365, + "grad_norm": 0.2521887719631195, + "learning_rate": 1.5076259866144748e-07, + "loss": 0.3206, + "step": 8593 + }, + { + "epoch": 4.64624256622815, + "grad_norm": 0.26810285449028015, + "learning_rate": 1.5030282061384848e-07, + "loss": 0.3685, + "step": 8594 + }, + { + "epoch": 4.646783204180934, + "grad_norm": 0.27116814255714417, + "learning_rate": 1.4984373402728014e-07, + "loss": 0.3657, + "step": 8595 + }, + { + "epoch": 4.6473238421337175, + "grad_norm": 0.2740531265735626, + "learning_rate": 1.4938533896719843e-07, + "loss": 0.3719, + "step": 8596 + }, + { + "epoch": 4.647864480086502, + "grad_norm": 0.27691882848739624, + "learning_rate": 1.489276354989605e-07, + "loss": 0.3637, + "step": 8597 + }, + { + "epoch": 4.648405118039286, + "grad_norm": 0.2679653763771057, + "learning_rate": 1.4847062368782473e-07, + "loss": 0.3186, + "step": 8598 + }, + { + "epoch": 4.64894575599207, + "grad_norm": 0.27665218710899353, + "learning_rate": 1.480143035989512e-07, + "loss": 0.3707, + "step": 8599 + }, + { + "epoch": 4.649486393944855, + "grad_norm": 0.27336403727531433, + "learning_rate": 1.4755867529740064e-07, + "loss": 0.3846, + "step": 8600 + }, + { + "epoch": 4.650027031897639, + "grad_norm": 0.2560247480869293, + "learning_rate": 1.4710373884813612e-07, + "loss": 0.3475, + "step": 8601 + }, + { + "epoch": 4.650567669850424, + "grad_norm": 0.25172582268714905, + "learning_rate": 1.4664949431602238e-07, + "loss": 0.3465, + "step": 8602 + }, + { + "epoch": 4.651108307803208, + "grad_norm": 0.25150173902511597, + "learning_rate": 1.4619594176582318e-07, + "loss": 0.3624, + "step": 8603 + }, + { + "epoch": 4.651648945755992, + "grad_norm": 0.2822224497795105, + "learning_rate": 1.4574308126220682e-07, + "loss": 0.3827, + "step": 8604 + }, + { + "epoch": 4.652189583708776, + "grad_norm": 0.26971569657325745, + "learning_rate": 1.4529091286973994e-07, + "loss": 0.3686, + "step": 8605 + }, + { + "epoch": 4.65273022166156, + "grad_norm": 0.29332101345062256, + "learning_rate": 1.448394366528927e-07, + "loss": 0.3802, + "step": 8606 + }, + { + "epoch": 4.653270859614345, + "grad_norm": 0.2725779116153717, + "learning_rate": 1.443886526760363e-07, + "loss": 0.3729, + "step": 8607 + }, + { + "epoch": 4.653811497567129, + "grad_norm": 0.25920751690864563, + "learning_rate": 1.4393856100344107e-07, + "loss": 0.3562, + "step": 8608 + }, + { + "epoch": 4.654352135519914, + "grad_norm": 0.24775615334510803, + "learning_rate": 1.4348916169928173e-07, + "loss": 0.3508, + "step": 8609 + }, + { + "epoch": 4.654892773472698, + "grad_norm": 0.25473129749298096, + "learning_rate": 1.4304045482763263e-07, + "loss": 0.372, + "step": 8610 + }, + { + "epoch": 4.6554334114254825, + "grad_norm": 0.247678741812706, + "learning_rate": 1.425924404524681e-07, + "loss": 0.3693, + "step": 8611 + }, + { + "epoch": 4.655974049378266, + "grad_norm": 0.24631451070308685, + "learning_rate": 1.4214511863766767e-07, + "loss": 0.3654, + "step": 8612 + }, + { + "epoch": 4.65651468733105, + "grad_norm": 0.2660142481327057, + "learning_rate": 1.4169848944700748e-07, + "loss": 0.3671, + "step": 8613 + }, + { + "epoch": 4.657055325283835, + "grad_norm": 0.2603265643119812, + "learning_rate": 1.4125255294416885e-07, + "loss": 0.3441, + "step": 8614 + }, + { + "epoch": 4.657595963236619, + "grad_norm": 0.2734012305736542, + "learning_rate": 1.408073091927309e-07, + "loss": 0.4051, + "step": 8615 + }, + { + "epoch": 4.658136601189404, + "grad_norm": 0.2639838457107544, + "learning_rate": 1.403627582561773e-07, + "loss": 0.3557, + "step": 8616 + }, + { + "epoch": 4.658677239142188, + "grad_norm": 0.2553168535232544, + "learning_rate": 1.3991890019788956e-07, + "loss": 0.3512, + "step": 8617 + }, + { + "epoch": 4.659217877094972, + "grad_norm": 0.25102850794792175, + "learning_rate": 1.3947573508115374e-07, + "loss": 0.3293, + "step": 8618 + }, + { + "epoch": 4.6597585150477565, + "grad_norm": 0.3015282154083252, + "learning_rate": 1.3903326296915543e-07, + "loss": 0.3693, + "step": 8619 + }, + { + "epoch": 4.66029915300054, + "grad_norm": 0.27171650528907776, + "learning_rate": 1.3859148392498023e-07, + "loss": 0.4019, + "step": 8620 + }, + { + "epoch": 4.660839790953325, + "grad_norm": 0.2531915009021759, + "learning_rate": 1.3815039801161723e-07, + "loss": 0.3722, + "step": 8621 + }, + { + "epoch": 4.661380428906109, + "grad_norm": 0.24812015891075134, + "learning_rate": 1.3771000529195555e-07, + "loss": 0.3424, + "step": 8622 + }, + { + "epoch": 4.661921066858893, + "grad_norm": 0.2861292362213135, + "learning_rate": 1.3727030582878498e-07, + "loss": 0.3951, + "step": 8623 + }, + { + "epoch": 4.662461704811678, + "grad_norm": 0.2552623450756073, + "learning_rate": 1.368312996847976e-07, + "loss": 0.3466, + "step": 8624 + }, + { + "epoch": 4.663002342764462, + "grad_norm": 0.2605513036251068, + "learning_rate": 1.3639298692258606e-07, + "loss": 0.3674, + "step": 8625 + }, + { + "epoch": 4.663542980717247, + "grad_norm": 0.27852219343185425, + "learning_rate": 1.359553676046449e-07, + "loss": 0.3538, + "step": 8626 + }, + { + "epoch": 4.6640836186700305, + "grad_norm": 0.2792563736438751, + "learning_rate": 1.3551844179336748e-07, + "loss": 0.3595, + "step": 8627 + }, + { + "epoch": 4.664624256622815, + "grad_norm": 0.26457229256629944, + "learning_rate": 1.3508220955105122e-07, + "loss": 0.3704, + "step": 8628 + }, + { + "epoch": 4.665164894575599, + "grad_norm": 0.2538105845451355, + "learning_rate": 1.3464667093989248e-07, + "loss": 0.3153, + "step": 8629 + }, + { + "epoch": 4.665705532528383, + "grad_norm": 0.27984851598739624, + "learning_rate": 1.342118260219899e-07, + "loss": 0.3732, + "step": 8630 + }, + { + "epoch": 4.666246170481168, + "grad_norm": 0.2557315230369568, + "learning_rate": 1.3377767485934333e-07, + "loss": 0.3524, + "step": 8631 + }, + { + "epoch": 4.666786808433952, + "grad_norm": 0.2679039239883423, + "learning_rate": 1.3334421751385275e-07, + "loss": 0.3676, + "step": 8632 + }, + { + "epoch": 4.667327446386737, + "grad_norm": 0.2723951041698456, + "learning_rate": 1.3291145404731976e-07, + "loss": 0.3827, + "step": 8633 + }, + { + "epoch": 4.667868084339521, + "grad_norm": 0.24252431094646454, + "learning_rate": 1.3247938452144727e-07, + "loss": 0.3614, + "step": 8634 + }, + { + "epoch": 4.668408722292305, + "grad_norm": 0.25721076130867004, + "learning_rate": 1.320480089978382e-07, + "loss": 0.3989, + "step": 8635 + }, + { + "epoch": 4.668949360245089, + "grad_norm": 0.2676815986633301, + "learning_rate": 1.3161732753799838e-07, + "loss": 0.3976, + "step": 8636 + }, + { + "epoch": 4.669489998197873, + "grad_norm": 0.24728180468082428, + "learning_rate": 1.3118734020333257e-07, + "loss": 0.3154, + "step": 8637 + }, + { + "epoch": 4.670030636150658, + "grad_norm": 0.27126219868659973, + "learning_rate": 1.3075804705514894e-07, + "loss": 0.3669, + "step": 8638 + }, + { + "epoch": 4.670571274103442, + "grad_norm": 0.281053751707077, + "learning_rate": 1.303294481546541e-07, + "loss": 0.3981, + "step": 8639 + }, + { + "epoch": 4.671111912056226, + "grad_norm": 0.27033770084381104, + "learning_rate": 1.2990154356295636e-07, + "loss": 0.3418, + "step": 8640 + }, + { + "epoch": 4.671652550009011, + "grad_norm": 0.2688961923122406, + "learning_rate": 1.294743333410675e-07, + "loss": 0.3703, + "step": 8641 + }, + { + "epoch": 4.672193187961795, + "grad_norm": 0.25255006551742554, + "learning_rate": 1.2904781754989715e-07, + "loss": 0.3725, + "step": 8642 + }, + { + "epoch": 4.672733825914579, + "grad_norm": 0.2689112722873688, + "learning_rate": 1.2862199625025772e-07, + "loss": 0.3504, + "step": 8643 + }, + { + "epoch": 4.673274463867363, + "grad_norm": 0.2672416567802429, + "learning_rate": 1.2819686950286125e-07, + "loss": 0.4042, + "step": 8644 + }, + { + "epoch": 4.673815101820148, + "grad_norm": 0.26514679193496704, + "learning_rate": 1.2777243736832202e-07, + "loss": 0.3839, + "step": 8645 + }, + { + "epoch": 4.674355739772932, + "grad_norm": 0.2719597816467285, + "learning_rate": 1.2734869990715493e-07, + "loss": 0.3735, + "step": 8646 + }, + { + "epoch": 4.674896377725716, + "grad_norm": 0.27342677116394043, + "learning_rate": 1.26925657179775e-07, + "loss": 0.3675, + "step": 8647 + }, + { + "epoch": 4.675437015678501, + "grad_norm": 0.24384188652038574, + "learning_rate": 1.2650330924650013e-07, + "loss": 0.3447, + "step": 8648 + }, + { + "epoch": 4.675977653631285, + "grad_norm": 0.26947176456451416, + "learning_rate": 1.2608165616754653e-07, + "loss": 0.3502, + "step": 8649 + }, + { + "epoch": 4.6765182915840695, + "grad_norm": 0.2672954797744751, + "learning_rate": 1.2566069800303393e-07, + "loss": 0.3774, + "step": 8650 + }, + { + "epoch": 4.677058929536853, + "grad_norm": 0.2530985176563263, + "learning_rate": 1.25240434812981e-07, + "loss": 0.3526, + "step": 8651 + }, + { + "epoch": 4.677599567489638, + "grad_norm": 0.26235222816467285, + "learning_rate": 1.2482086665730862e-07, + "loss": 0.3638, + "step": 8652 + }, + { + "epoch": 4.678140205442422, + "grad_norm": 0.2701789140701294, + "learning_rate": 1.2440199359583792e-07, + "loss": 0.3511, + "step": 8653 + }, + { + "epoch": 4.678680843395206, + "grad_norm": 0.27301889657974243, + "learning_rate": 1.2398381568829055e-07, + "loss": 0.3391, + "step": 8654 + }, + { + "epoch": 4.679221481347991, + "grad_norm": 0.2889951169490814, + "learning_rate": 1.2356633299429044e-07, + "loss": 0.3925, + "step": 8655 + }, + { + "epoch": 4.679762119300775, + "grad_norm": 0.25548380613327026, + "learning_rate": 1.2314954557336055e-07, + "loss": 0.3488, + "step": 8656 + }, + { + "epoch": 4.680302757253559, + "grad_norm": 0.252651184797287, + "learning_rate": 1.2273345348492614e-07, + "loss": 0.3554, + "step": 8657 + }, + { + "epoch": 4.6808433952063435, + "grad_norm": 0.24984490871429443, + "learning_rate": 1.2231805678831365e-07, + "loss": 0.3489, + "step": 8658 + }, + { + "epoch": 4.681384033159127, + "grad_norm": 0.26597264409065247, + "learning_rate": 1.219033555427479e-07, + "loss": 0.3746, + "step": 8659 + }, + { + "epoch": 4.681924671111912, + "grad_norm": 0.28862953186035156, + "learning_rate": 1.2148934980735772e-07, + "loss": 0.3719, + "step": 8660 + }, + { + "epoch": 4.682465309064696, + "grad_norm": 0.27415987849235535, + "learning_rate": 1.2107603964117033e-07, + "loss": 0.3694, + "step": 8661 + }, + { + "epoch": 4.683005947017481, + "grad_norm": 0.24074959754943848, + "learning_rate": 1.2066342510311523e-07, + "loss": 0.3506, + "step": 8662 + }, + { + "epoch": 4.683546584970265, + "grad_norm": 0.2678668797016144, + "learning_rate": 1.2025150625202265e-07, + "loss": 0.4074, + "step": 8663 + }, + { + "epoch": 4.684087222923049, + "grad_norm": 0.2623341381549835, + "learning_rate": 1.198402831466222e-07, + "loss": 0.3623, + "step": 8664 + }, + { + "epoch": 4.684627860875834, + "grad_norm": 0.2642357349395752, + "learning_rate": 1.1942975584554594e-07, + "loss": 0.3547, + "step": 8665 + }, + { + "epoch": 4.6851684988286175, + "grad_norm": 0.25888511538505554, + "learning_rate": 1.1901992440732591e-07, + "loss": 0.3565, + "step": 8666 + }, + { + "epoch": 4.685709136781402, + "grad_norm": 0.2689421474933624, + "learning_rate": 1.1861078889039646e-07, + "loss": 0.3529, + "step": 8667 + }, + { + "epoch": 4.686249774734186, + "grad_norm": 0.27439960837364197, + "learning_rate": 1.1820234935308927e-07, + "loss": 0.3886, + "step": 8668 + }, + { + "epoch": 4.686790412686971, + "grad_norm": 0.3315673768520355, + "learning_rate": 1.1779460585363945e-07, + "loss": 0.3214, + "step": 8669 + }, + { + "epoch": 4.687331050639755, + "grad_norm": 0.2640538513660431, + "learning_rate": 1.1738755845018323e-07, + "loss": 0.3665, + "step": 8670 + }, + { + "epoch": 4.687871688592539, + "grad_norm": 0.26083624362945557, + "learning_rate": 1.1698120720075645e-07, + "loss": 0.3813, + "step": 8671 + }, + { + "epoch": 4.688412326545324, + "grad_norm": 0.2745174765586853, + "learning_rate": 1.1657555216329553e-07, + "loss": 0.405, + "step": 8672 + }, + { + "epoch": 4.688952964498108, + "grad_norm": 0.2702055871486664, + "learning_rate": 1.1617059339563807e-07, + "loss": 0.3339, + "step": 8673 + }, + { + "epoch": 4.689493602450892, + "grad_norm": 0.27788063883781433, + "learning_rate": 1.1576633095552237e-07, + "loss": 0.3624, + "step": 8674 + }, + { + "epoch": 4.690034240403676, + "grad_norm": 0.2618255019187927, + "learning_rate": 1.1536276490058784e-07, + "loss": 0.3667, + "step": 8675 + }, + { + "epoch": 4.69057487835646, + "grad_norm": 0.2847457826137543, + "learning_rate": 1.1495989528837347e-07, + "loss": 0.3795, + "step": 8676 + }, + { + "epoch": 4.691115516309245, + "grad_norm": 0.2758665680885315, + "learning_rate": 1.1455772217632e-07, + "loss": 0.3722, + "step": 8677 + }, + { + "epoch": 4.691656154262029, + "grad_norm": 0.2383020520210266, + "learning_rate": 1.1415624562176875e-07, + "loss": 0.3336, + "step": 8678 + }, + { + "epoch": 4.692196792214814, + "grad_norm": 0.26600027084350586, + "learning_rate": 1.1375546568196172e-07, + "loss": 0.3632, + "step": 8679 + }, + { + "epoch": 4.692737430167598, + "grad_norm": 0.2663227915763855, + "learning_rate": 1.1335538241404099e-07, + "loss": 0.3406, + "step": 8680 + }, + { + "epoch": 4.693278068120382, + "grad_norm": 0.2766534090042114, + "learning_rate": 1.1295599587504924e-07, + "loss": 0.3687, + "step": 8681 + }, + { + "epoch": 4.693818706073166, + "grad_norm": 0.2662915587425232, + "learning_rate": 1.125573061219315e-07, + "loss": 0.3535, + "step": 8682 + }, + { + "epoch": 4.69435934402595, + "grad_norm": 0.2799853980541229, + "learning_rate": 1.1215931321153172e-07, + "loss": 0.3573, + "step": 8683 + }, + { + "epoch": 4.694899981978735, + "grad_norm": 0.28400909900665283, + "learning_rate": 1.1176201720059454e-07, + "loss": 0.3882, + "step": 8684 + }, + { + "epoch": 4.695440619931519, + "grad_norm": 0.26994532346725464, + "learning_rate": 1.1136541814576574e-07, + "loss": 0.338, + "step": 8685 + }, + { + "epoch": 4.695981257884304, + "grad_norm": 0.2557099759578705, + "learning_rate": 1.1096951610359174e-07, + "loss": 0.3383, + "step": 8686 + }, + { + "epoch": 4.696521895837088, + "grad_norm": 0.26642677187919617, + "learning_rate": 1.1057431113052075e-07, + "loss": 0.399, + "step": 8687 + }, + { + "epoch": 4.697062533789872, + "grad_norm": 0.272213339805603, + "learning_rate": 1.1017980328289823e-07, + "loss": 0.3846, + "step": 8688 + }, + { + "epoch": 4.6976031717426565, + "grad_norm": 0.26038268208503723, + "learning_rate": 1.0978599261697476e-07, + "loss": 0.362, + "step": 8689 + }, + { + "epoch": 4.69814380969544, + "grad_norm": 0.2504449486732483, + "learning_rate": 1.0939287918889652e-07, + "loss": 0.3786, + "step": 8690 + }, + { + "epoch": 4.698684447648225, + "grad_norm": 0.26650890707969666, + "learning_rate": 1.0900046305471535e-07, + "loss": 0.3734, + "step": 8691 + }, + { + "epoch": 4.699225085601009, + "grad_norm": 0.2502971589565277, + "learning_rate": 1.0860874427038037e-07, + "loss": 0.3511, + "step": 8692 + }, + { + "epoch": 4.699765723553794, + "grad_norm": 0.26762133836746216, + "learning_rate": 1.0821772289174138e-07, + "loss": 0.3579, + "step": 8693 + }, + { + "epoch": 4.700306361506578, + "grad_norm": 0.29343181848526, + "learning_rate": 1.0782739897455041e-07, + "loss": 0.4183, + "step": 8694 + }, + { + "epoch": 4.700846999459362, + "grad_norm": 0.24956832826137543, + "learning_rate": 1.0743777257445853e-07, + "loss": 0.33, + "step": 8695 + }, + { + "epoch": 4.701387637412147, + "grad_norm": 0.2719976007938385, + "learning_rate": 1.0704884374701908e-07, + "loss": 0.3587, + "step": 8696 + }, + { + "epoch": 4.7019282753649305, + "grad_norm": 0.2573912739753723, + "learning_rate": 1.0666061254768268e-07, + "loss": 0.3655, + "step": 8697 + }, + { + "epoch": 4.702468913317714, + "grad_norm": 0.27466002106666565, + "learning_rate": 1.0627307903180451e-07, + "loss": 0.3967, + "step": 8698 + }, + { + "epoch": 4.703009551270499, + "grad_norm": 0.2574291527271271, + "learning_rate": 1.058862432546387e-07, + "loss": 0.3396, + "step": 8699 + }, + { + "epoch": 4.703550189223283, + "grad_norm": 0.2641294300556183, + "learning_rate": 1.055001052713378e-07, + "loss": 0.3767, + "step": 8700 + }, + { + "epoch": 4.704090827176068, + "grad_norm": 0.2860811948776245, + "learning_rate": 1.0511466513695778e-07, + "loss": 0.3512, + "step": 8701 + }, + { + "epoch": 4.704631465128852, + "grad_norm": 0.27938511967658997, + "learning_rate": 1.0472992290645356e-07, + "loss": 0.3663, + "step": 8702 + }, + { + "epoch": 4.705172103081637, + "grad_norm": 0.27721625566482544, + "learning_rate": 1.0434587863468182e-07, + "loss": 0.3847, + "step": 8703 + }, + { + "epoch": 4.705712741034421, + "grad_norm": 0.28188326954841614, + "learning_rate": 1.039625323763982e-07, + "loss": 0.3615, + "step": 8704 + }, + { + "epoch": 4.7062533789872045, + "grad_norm": 0.262968510389328, + "learning_rate": 1.0357988418625897e-07, + "loss": 0.3405, + "step": 8705 + }, + { + "epoch": 4.706794016939989, + "grad_norm": 0.2621868848800659, + "learning_rate": 1.0319793411882273e-07, + "loss": 0.3761, + "step": 8706 + }, + { + "epoch": 4.707334654892773, + "grad_norm": 0.2894149124622345, + "learning_rate": 1.0281668222854645e-07, + "loss": 0.4006, + "step": 8707 + }, + { + "epoch": 4.707875292845558, + "grad_norm": 0.25812411308288574, + "learning_rate": 1.0243612856978835e-07, + "loss": 0.3484, + "step": 8708 + }, + { + "epoch": 4.708415930798342, + "grad_norm": 0.25666138529777527, + "learning_rate": 1.0205627319680723e-07, + "loss": 0.3476, + "step": 8709 + }, + { + "epoch": 4.708956568751127, + "grad_norm": 0.27466803789138794, + "learning_rate": 1.0167711616376196e-07, + "loss": 0.3759, + "step": 8710 + }, + { + "epoch": 4.709497206703911, + "grad_norm": 0.2503379285335541, + "learning_rate": 1.0129865752471325e-07, + "loss": 0.3087, + "step": 8711 + }, + { + "epoch": 4.710037844656695, + "grad_norm": 0.254130482673645, + "learning_rate": 1.0092089733361898e-07, + "loss": 0.3427, + "step": 8712 + }, + { + "epoch": 4.710578482609479, + "grad_norm": 0.24710287153720856, + "learning_rate": 1.0054383564434056e-07, + "loss": 0.3637, + "step": 8713 + }, + { + "epoch": 4.711119120562263, + "grad_norm": 0.2547876834869385, + "learning_rate": 1.0016747251063885e-07, + "loss": 0.399, + "step": 8714 + }, + { + "epoch": 4.711659758515047, + "grad_norm": 0.2763873338699341, + "learning_rate": 9.979180798617538e-08, + "loss": 0.3925, + "step": 8715 + }, + { + "epoch": 4.712200396467832, + "grad_norm": 0.26490187644958496, + "learning_rate": 9.941684212451119e-08, + "loss": 0.3227, + "step": 8716 + }, + { + "epoch": 4.712741034420616, + "grad_norm": 0.2720109522342682, + "learning_rate": 9.904257497910796e-08, + "loss": 0.3736, + "step": 8717 + }, + { + "epoch": 4.713281672373401, + "grad_norm": 0.29464131593704224, + "learning_rate": 9.866900660332912e-08, + "loss": 0.3878, + "step": 8718 + }, + { + "epoch": 4.713822310326185, + "grad_norm": 0.2727126181125641, + "learning_rate": 9.829613705043594e-08, + "loss": 0.3353, + "step": 8719 + }, + { + "epoch": 4.7143629482789695, + "grad_norm": 0.27135735750198364, + "learning_rate": 9.792396637359203e-08, + "loss": 0.3429, + "step": 8720 + }, + { + "epoch": 4.714903586231753, + "grad_norm": 0.27487772703170776, + "learning_rate": 9.755249462586158e-08, + "loss": 0.3821, + "step": 8721 + }, + { + "epoch": 4.715444224184537, + "grad_norm": 0.2727549076080322, + "learning_rate": 9.718172186020724e-08, + "loss": 0.3586, + "step": 8722 + }, + { + "epoch": 4.715984862137322, + "grad_norm": 0.2874700129032135, + "learning_rate": 9.68116481294945e-08, + "loss": 0.3546, + "step": 8723 + }, + { + "epoch": 4.716525500090106, + "grad_norm": 0.26061776280403137, + "learning_rate": 9.644227348648616e-08, + "loss": 0.3738, + "step": 8724 + }, + { + "epoch": 4.717066138042891, + "grad_norm": 0.256819486618042, + "learning_rate": 9.607359798384785e-08, + "loss": 0.3529, + "step": 8725 + }, + { + "epoch": 4.717606775995675, + "grad_norm": 0.2749132513999939, + "learning_rate": 9.570562167414477e-08, + "loss": 0.3625, + "step": 8726 + }, + { + "epoch": 4.71814741394846, + "grad_norm": 0.24957223236560822, + "learning_rate": 9.533834460984159e-08, + "loss": 0.3048, + "step": 8727 + }, + { + "epoch": 4.7186880519012435, + "grad_norm": 0.2842405438423157, + "learning_rate": 9.497176684330534e-08, + "loss": 0.3962, + "step": 8728 + }, + { + "epoch": 4.719228689854027, + "grad_norm": 0.26971182227134705, + "learning_rate": 9.46058884268003e-08, + "loss": 0.3884, + "step": 8729 + }, + { + "epoch": 4.719769327806812, + "grad_norm": 0.2548538148403168, + "learning_rate": 9.424070941249419e-08, + "loss": 0.3491, + "step": 8730 + }, + { + "epoch": 4.720309965759596, + "grad_norm": 0.2831175923347473, + "learning_rate": 9.387622985245259e-08, + "loss": 0.3825, + "step": 8731 + }, + { + "epoch": 4.720850603712381, + "grad_norm": 0.263994961977005, + "learning_rate": 9.351244979864338e-08, + "loss": 0.3717, + "step": 8732 + }, + { + "epoch": 4.721391241665165, + "grad_norm": 0.26066383719444275, + "learning_rate": 9.314936930293283e-08, + "loss": 0.3565, + "step": 8733 + }, + { + "epoch": 4.721931879617949, + "grad_norm": 0.2533550262451172, + "learning_rate": 9.278698841708844e-08, + "loss": 0.3714, + "step": 8734 + }, + { + "epoch": 4.722472517570734, + "grad_norm": 0.24598509073257446, + "learning_rate": 9.242530719277776e-08, + "loss": 0.3527, + "step": 8735 + }, + { + "epoch": 4.7230131555235175, + "grad_norm": 0.2712627649307251, + "learning_rate": 9.206432568156953e-08, + "loss": 0.409, + "step": 8736 + }, + { + "epoch": 4.723553793476302, + "grad_norm": 0.26807889342308044, + "learning_rate": 9.170404393492982e-08, + "loss": 0.3909, + "step": 8737 + }, + { + "epoch": 4.724094431429086, + "grad_norm": 0.2640073001384735, + "learning_rate": 9.134446200422919e-08, + "loss": 0.3569, + "step": 8738 + }, + { + "epoch": 4.72463506938187, + "grad_norm": 0.26649776101112366, + "learning_rate": 9.098557994073443e-08, + "loss": 0.3611, + "step": 8739 + }, + { + "epoch": 4.725175707334655, + "grad_norm": 0.293518990278244, + "learning_rate": 9.062739779561624e-08, + "loss": 0.3994, + "step": 8740 + }, + { + "epoch": 4.725716345287439, + "grad_norm": 0.261219322681427, + "learning_rate": 9.026991561994158e-08, + "loss": 0.3336, + "step": 8741 + }, + { + "epoch": 4.726256983240224, + "grad_norm": 0.27696412801742554, + "learning_rate": 8.991313346468078e-08, + "loss": 0.3654, + "step": 8742 + }, + { + "epoch": 4.726797621193008, + "grad_norm": 0.275884211063385, + "learning_rate": 8.955705138070258e-08, + "loss": 0.3417, + "step": 8743 + }, + { + "epoch": 4.727338259145792, + "grad_norm": 0.2763936221599579, + "learning_rate": 8.920166941877695e-08, + "loss": 0.3618, + "step": 8744 + }, + { + "epoch": 4.727878897098576, + "grad_norm": 0.2690500319004059, + "learning_rate": 8.884698762957334e-08, + "loss": 0.3577, + "step": 8745 + }, + { + "epoch": 4.72841953505136, + "grad_norm": 0.2605028450489044, + "learning_rate": 8.849300606366185e-08, + "loss": 0.3246, + "step": 8746 + }, + { + "epoch": 4.728960173004145, + "grad_norm": 0.27978426218032837, + "learning_rate": 8.813972477151211e-08, + "loss": 0.3892, + "step": 8747 + }, + { + "epoch": 4.729500810956929, + "grad_norm": 0.24701228737831116, + "learning_rate": 8.778714380349551e-08, + "loss": 0.3488, + "step": 8748 + }, + { + "epoch": 4.730041448909714, + "grad_norm": 0.26005181670188904, + "learning_rate": 8.743526320988016e-08, + "loss": 0.3773, + "step": 8749 + }, + { + "epoch": 4.730582086862498, + "grad_norm": 0.2552785277366638, + "learning_rate": 8.708408304083927e-08, + "loss": 0.3593, + "step": 8750 + }, + { + "epoch": 4.7311227248152825, + "grad_norm": 0.25979378819465637, + "learning_rate": 8.67336033464411e-08, + "loss": 0.3496, + "step": 8751 + }, + { + "epoch": 4.731663362768066, + "grad_norm": 0.24903003871440887, + "learning_rate": 8.638382417665847e-08, + "loss": 0.3521, + "step": 8752 + }, + { + "epoch": 4.73220400072085, + "grad_norm": 0.26014840602874756, + "learning_rate": 8.603474558136038e-08, + "loss": 0.3489, + "step": 8753 + }, + { + "epoch": 4.732744638673635, + "grad_norm": 0.2891535460948944, + "learning_rate": 8.568636761031868e-08, + "loss": 0.4002, + "step": 8754 + }, + { + "epoch": 4.733285276626419, + "grad_norm": 0.2797335386276245, + "learning_rate": 8.53386903132053e-08, + "loss": 0.3369, + "step": 8755 + }, + { + "epoch": 4.733825914579203, + "grad_norm": 0.2770991921424866, + "learning_rate": 8.499171373959004e-08, + "loss": 0.3516, + "step": 8756 + }, + { + "epoch": 4.734366552531988, + "grad_norm": 0.27074334025382996, + "learning_rate": 8.464543793894498e-08, + "loss": 0.3832, + "step": 8757 + }, + { + "epoch": 4.734907190484772, + "grad_norm": 0.2525061368942261, + "learning_rate": 8.429986296064118e-08, + "loss": 0.357, + "step": 8758 + }, + { + "epoch": 4.7354478284375565, + "grad_norm": 0.2681528925895691, + "learning_rate": 8.395498885394981e-08, + "loss": 0.3473, + "step": 8759 + }, + { + "epoch": 4.73598846639034, + "grad_norm": 0.25846293568611145, + "learning_rate": 8.361081566804318e-08, + "loss": 0.3748, + "step": 8760 + }, + { + "epoch": 4.736529104343125, + "grad_norm": 0.25526463985443115, + "learning_rate": 8.326734345199261e-08, + "loss": 0.3901, + "step": 8761 + }, + { + "epoch": 4.737069742295909, + "grad_norm": 0.2344433218240738, + "learning_rate": 8.292457225476946e-08, + "loss": 0.3507, + "step": 8762 + }, + { + "epoch": 4.737610380248693, + "grad_norm": 0.2653151750564575, + "learning_rate": 8.258250212524522e-08, + "loss": 0.3851, + "step": 8763 + }, + { + "epoch": 4.738151018201478, + "grad_norm": 0.26171717047691345, + "learning_rate": 8.224113311219251e-08, + "loss": 0.3576, + "step": 8764 + }, + { + "epoch": 4.738691656154262, + "grad_norm": 0.2592165470123291, + "learning_rate": 8.190046526428241e-08, + "loss": 0.3615, + "step": 8765 + }, + { + "epoch": 4.739232294107047, + "grad_norm": 0.2712307572364807, + "learning_rate": 8.156049863008664e-08, + "loss": 0.352, + "step": 8766 + }, + { + "epoch": 4.7397729320598305, + "grad_norm": 0.2761649489402771, + "learning_rate": 8.122123325807751e-08, + "loss": 0.3976, + "step": 8767 + }, + { + "epoch": 4.740313570012615, + "grad_norm": 0.2663705050945282, + "learning_rate": 8.088266919662635e-08, + "loss": 0.3314, + "step": 8768 + }, + { + "epoch": 4.740854207965399, + "grad_norm": 0.2638547420501709, + "learning_rate": 8.054480649400565e-08, + "loss": 0.3517, + "step": 8769 + }, + { + "epoch": 4.741394845918183, + "grad_norm": 0.262474000453949, + "learning_rate": 8.020764519838686e-08, + "loss": 0.3455, + "step": 8770 + }, + { + "epoch": 4.741935483870968, + "grad_norm": 0.2786615788936615, + "learning_rate": 7.987118535784155e-08, + "loss": 0.3489, + "step": 8771 + }, + { + "epoch": 4.742476121823752, + "grad_norm": 0.2552831172943115, + "learning_rate": 7.953542702034245e-08, + "loss": 0.3454, + "step": 8772 + }, + { + "epoch": 4.743016759776537, + "grad_norm": 0.25150927901268005, + "learning_rate": 7.920037023376014e-08, + "loss": 0.3582, + "step": 8773 + }, + { + "epoch": 4.743557397729321, + "grad_norm": 0.28234729170799255, + "learning_rate": 7.886601504586755e-08, + "loss": 0.3829, + "step": 8774 + }, + { + "epoch": 4.7440980356821045, + "grad_norm": 0.26238328218460083, + "learning_rate": 7.853236150433541e-08, + "loss": 0.372, + "step": 8775 + }, + { + "epoch": 4.744638673634889, + "grad_norm": 0.2578582763671875, + "learning_rate": 7.819940965673678e-08, + "loss": 0.3385, + "step": 8776 + }, + { + "epoch": 4.745179311587673, + "grad_norm": 0.27182072401046753, + "learning_rate": 7.786715955054202e-08, + "loss": 0.3575, + "step": 8777 + }, + { + "epoch": 4.745719949540458, + "grad_norm": 0.2770176827907562, + "learning_rate": 7.753561123312326e-08, + "loss": 0.3527, + "step": 8778 + }, + { + "epoch": 4.746260587493242, + "grad_norm": 0.2684987783432007, + "learning_rate": 7.720476475175209e-08, + "loss": 0.3528, + "step": 8779 + }, + { + "epoch": 4.746801225446026, + "grad_norm": 0.26776623725891113, + "learning_rate": 7.687462015360026e-08, + "loss": 0.3877, + "step": 8780 + }, + { + "epoch": 4.747341863398811, + "grad_norm": 0.24656769633293152, + "learning_rate": 7.654517748573842e-08, + "loss": 0.3399, + "step": 8781 + }, + { + "epoch": 4.747882501351595, + "grad_norm": 0.2421443909406662, + "learning_rate": 7.621643679513846e-08, + "loss": 0.3287, + "step": 8782 + }, + { + "epoch": 4.748423139304379, + "grad_norm": 0.26763293147087097, + "learning_rate": 7.588839812867177e-08, + "loss": 0.3768, + "step": 8783 + }, + { + "epoch": 4.748963777257163, + "grad_norm": 0.2524373233318329, + "learning_rate": 7.556106153310927e-08, + "loss": 0.375, + "step": 8784 + }, + { + "epoch": 4.749504415209948, + "grad_norm": 0.2663939297199249, + "learning_rate": 7.523442705512196e-08, + "loss": 0.3695, + "step": 8785 + }, + { + "epoch": 4.750045053162732, + "grad_norm": 0.2635733187198639, + "learning_rate": 7.490849474128093e-08, + "loss": 0.37, + "step": 8786 + }, + { + "epoch": 4.750585691115516, + "grad_norm": 0.257835328578949, + "learning_rate": 7.458326463805677e-08, + "loss": 0.3749, + "step": 8787 + }, + { + "epoch": 4.751126329068301, + "grad_norm": 0.262051522731781, + "learning_rate": 7.425873679182072e-08, + "loss": 0.3664, + "step": 8788 + }, + { + "epoch": 4.751666967021085, + "grad_norm": 0.26971325278282166, + "learning_rate": 7.3934911248843e-08, + "loss": 0.3289, + "step": 8789 + }, + { + "epoch": 4.7522076049738695, + "grad_norm": 0.27077168226242065, + "learning_rate": 7.36117880552939e-08, + "loss": 0.3871, + "step": 8790 + }, + { + "epoch": 4.752748242926653, + "grad_norm": 0.27058687806129456, + "learning_rate": 7.328936725724378e-08, + "loss": 0.3692, + "step": 8791 + }, + { + "epoch": 4.753288880879437, + "grad_norm": 0.278969407081604, + "learning_rate": 7.29676489006631e-08, + "loss": 0.3556, + "step": 8792 + }, + { + "epoch": 4.753829518832222, + "grad_norm": 0.2769360840320587, + "learning_rate": 7.264663303142239e-08, + "loss": 0.3483, + "step": 8793 + }, + { + "epoch": 4.754370156785006, + "grad_norm": 0.2593977749347687, + "learning_rate": 7.232631969529058e-08, + "loss": 0.3288, + "step": 8794 + }, + { + "epoch": 4.754910794737791, + "grad_norm": 0.2765919864177704, + "learning_rate": 7.200670893793727e-08, + "loss": 0.3402, + "step": 8795 + }, + { + "epoch": 4.755451432690575, + "grad_norm": 0.25877347588539124, + "learning_rate": 7.168780080493265e-08, + "loss": 0.3466, + "step": 8796 + }, + { + "epoch": 4.755992070643359, + "grad_norm": 0.27611830830574036, + "learning_rate": 7.136959534174592e-08, + "loss": 0.3944, + "step": 8797 + }, + { + "epoch": 4.7565327085961435, + "grad_norm": 0.2555706202983856, + "learning_rate": 7.105209259374579e-08, + "loss": 0.3752, + "step": 8798 + }, + { + "epoch": 4.757073346548927, + "grad_norm": 0.25722599029541016, + "learning_rate": 7.07352926062016e-08, + "loss": 0.3599, + "step": 8799 + }, + { + "epoch": 4.757613984501712, + "grad_norm": 0.263687402009964, + "learning_rate": 7.041919542428221e-08, + "loss": 0.3653, + "step": 8800 + }, + { + "epoch": 4.758154622454496, + "grad_norm": 0.28717342019081116, + "learning_rate": 7.010380109305603e-08, + "loss": 0.4053, + "step": 8801 + }, + { + "epoch": 4.758695260407281, + "grad_norm": 0.2686096727848053, + "learning_rate": 6.978910965749097e-08, + "loss": 0.3319, + "step": 8802 + }, + { + "epoch": 4.759235898360065, + "grad_norm": 0.2717582583427429, + "learning_rate": 6.947512116245669e-08, + "loss": 0.3801, + "step": 8803 + }, + { + "epoch": 4.759776536312849, + "grad_norm": 0.27994662523269653, + "learning_rate": 6.916183565271905e-08, + "loss": 0.3736, + "step": 8804 + }, + { + "epoch": 4.760317174265634, + "grad_norm": 0.26790112257003784, + "learning_rate": 6.884925317294678e-08, + "loss": 0.3872, + "step": 8805 + }, + { + "epoch": 4.7608578122184175, + "grad_norm": 0.2642842233181, + "learning_rate": 6.853737376770752e-08, + "loss": 0.3568, + "step": 8806 + }, + { + "epoch": 4.761398450171202, + "grad_norm": 0.278881698846817, + "learning_rate": 6.822619748146797e-08, + "loss": 0.3994, + "step": 8807 + }, + { + "epoch": 4.761939088123986, + "grad_norm": 0.28038862347602844, + "learning_rate": 6.791572435859595e-08, + "loss": 0.3639, + "step": 8808 + }, + { + "epoch": 4.762479726076771, + "grad_norm": 0.25316354632377625, + "learning_rate": 6.760595444335716e-08, + "loss": 0.3563, + "step": 8809 + }, + { + "epoch": 4.763020364029555, + "grad_norm": 0.25088751316070557, + "learning_rate": 6.72968877799185e-08, + "loss": 0.3758, + "step": 8810 + }, + { + "epoch": 4.763561001982339, + "grad_norm": 0.25079676508903503, + "learning_rate": 6.698852441234527e-08, + "loss": 0.3484, + "step": 8811 + }, + { + "epoch": 4.764101639935124, + "grad_norm": 0.25764012336730957, + "learning_rate": 6.668086438460453e-08, + "loss": 0.3488, + "step": 8812 + }, + { + "epoch": 4.764642277887908, + "grad_norm": 0.2521570920944214, + "learning_rate": 6.63739077405623e-08, + "loss": 0.3526, + "step": 8813 + }, + { + "epoch": 4.7651829158406915, + "grad_norm": 0.2827494442462921, + "learning_rate": 6.60676545239819e-08, + "loss": 0.3994, + "step": 8814 + }, + { + "epoch": 4.765723553793476, + "grad_norm": 0.26093700528144836, + "learning_rate": 6.576210477853007e-08, + "loss": 0.3585, + "step": 8815 + }, + { + "epoch": 4.76626419174626, + "grad_norm": 0.25771933794021606, + "learning_rate": 6.545725854777086e-08, + "loss": 0.3815, + "step": 8816 + }, + { + "epoch": 4.766804829699045, + "grad_norm": 0.2567058801651001, + "learning_rate": 6.515311587516893e-08, + "loss": 0.3344, + "step": 8817 + }, + { + "epoch": 4.767345467651829, + "grad_norm": 0.25780367851257324, + "learning_rate": 6.484967680408849e-08, + "loss": 0.3375, + "step": 8818 + }, + { + "epoch": 4.767886105604614, + "grad_norm": 0.2630016505718231, + "learning_rate": 6.454694137779272e-08, + "loss": 0.3289, + "step": 8819 + }, + { + "epoch": 4.768426743557398, + "grad_norm": 0.2782217264175415, + "learning_rate": 6.424490963944597e-08, + "loss": 0.3748, + "step": 8820 + }, + { + "epoch": 4.768967381510182, + "grad_norm": 0.28443044424057007, + "learning_rate": 6.394358163211046e-08, + "loss": 0.3899, + "step": 8821 + }, + { + "epoch": 4.769508019462966, + "grad_norm": 0.2556963264942169, + "learning_rate": 6.36429573987496e-08, + "loss": 0.3531, + "step": 8822 + }, + { + "epoch": 4.77004865741575, + "grad_norm": 0.2648627460002899, + "learning_rate": 6.334303698222577e-08, + "loss": 0.3637, + "step": 8823 + }, + { + "epoch": 4.770589295368535, + "grad_norm": 0.248936265707016, + "learning_rate": 6.304382042530088e-08, + "loss": 0.3878, + "step": 8824 + }, + { + "epoch": 4.771129933321319, + "grad_norm": 0.25302061438560486, + "learning_rate": 6.274530777063747e-08, + "loss": 0.3514, + "step": 8825 + }, + { + "epoch": 4.771670571274104, + "grad_norm": 0.2634265124797821, + "learning_rate": 6.244749906079539e-08, + "loss": 0.3356, + "step": 8826 + }, + { + "epoch": 4.772211209226888, + "grad_norm": 0.26157331466674805, + "learning_rate": 6.215039433823677e-08, + "loss": 0.3807, + "step": 8827 + }, + { + "epoch": 4.772751847179672, + "grad_norm": 0.25523486733436584, + "learning_rate": 6.185399364532163e-08, + "loss": 0.373, + "step": 8828 + }, + { + "epoch": 4.7732924851324565, + "grad_norm": 0.2643509805202484, + "learning_rate": 6.15582970243117e-08, + "loss": 0.3446, + "step": 8829 + }, + { + "epoch": 4.77383312308524, + "grad_norm": 0.2760935127735138, + "learning_rate": 6.126330451736495e-08, + "loss": 0.3738, + "step": 8830 + }, + { + "epoch": 4.774373761038025, + "grad_norm": 0.26611626148223877, + "learning_rate": 6.096901616654216e-08, + "loss": 0.3294, + "step": 8831 + }, + { + "epoch": 4.774914398990809, + "grad_norm": 0.25371184945106506, + "learning_rate": 6.067543201380199e-08, + "loss": 0.3523, + "step": 8832 + }, + { + "epoch": 4.775455036943593, + "grad_norm": 0.2639772891998291, + "learning_rate": 6.03825521010032e-08, + "loss": 0.3566, + "step": 8833 + }, + { + "epoch": 4.775995674896378, + "grad_norm": 0.2542242407798767, + "learning_rate": 6.009037646990346e-08, + "loss": 0.379, + "step": 8834 + }, + { + "epoch": 4.776536312849162, + "grad_norm": 0.2683775722980499, + "learning_rate": 5.97989051621617e-08, + "loss": 0.3855, + "step": 8835 + }, + { + "epoch": 4.777076950801947, + "grad_norm": 0.2643851041793823, + "learning_rate": 5.950813821933465e-08, + "loss": 0.3858, + "step": 8836 + }, + { + "epoch": 4.7776175887547305, + "grad_norm": 0.2688920497894287, + "learning_rate": 5.9218075682880293e-08, + "loss": 0.3392, + "step": 8837 + }, + { + "epoch": 4.7781582267075144, + "grad_norm": 0.2585931122303009, + "learning_rate": 5.892871759415386e-08, + "loss": 0.3595, + "step": 8838 + }, + { + "epoch": 4.778698864660299, + "grad_norm": 0.2401088923215866, + "learning_rate": 5.864006399441236e-08, + "loss": 0.3158, + "step": 8839 + }, + { + "epoch": 4.779239502613083, + "grad_norm": 0.25406181812286377, + "learning_rate": 5.835211492481063e-08, + "loss": 0.3688, + "step": 8840 + }, + { + "epoch": 4.779780140565868, + "grad_norm": 0.26199793815612793, + "learning_rate": 5.8064870426405295e-08, + "loss": 0.3774, + "step": 8841 + }, + { + "epoch": 4.780320778518652, + "grad_norm": 0.28002819418907166, + "learning_rate": 5.777833054015025e-08, + "loss": 0.3555, + "step": 8842 + }, + { + "epoch": 4.780861416471437, + "grad_norm": 0.25967445969581604, + "learning_rate": 5.749249530690004e-08, + "loss": 0.3708, + "step": 8843 + }, + { + "epoch": 4.781402054424221, + "grad_norm": 0.24986760318279266, + "learning_rate": 5.7207364767408734e-08, + "loss": 0.3409, + "step": 8844 + }, + { + "epoch": 4.7819426923770045, + "grad_norm": 0.26675549149513245, + "learning_rate": 5.6922938962329364e-08, + "loss": 0.3778, + "step": 8845 + }, + { + "epoch": 4.782483330329789, + "grad_norm": 0.26861754059791565, + "learning_rate": 5.66392179322145e-08, + "loss": 0.3701, + "step": 8846 + }, + { + "epoch": 4.783023968282573, + "grad_norm": 0.26682743430137634, + "learning_rate": 5.635620171751732e-08, + "loss": 0.3815, + "step": 8847 + }, + { + "epoch": 4.783564606235358, + "grad_norm": 0.268576443195343, + "learning_rate": 5.6073890358589454e-08, + "loss": 0.3668, + "step": 8848 + }, + { + "epoch": 4.784105244188142, + "grad_norm": 0.2416997253894806, + "learning_rate": 5.579228389568314e-08, + "loss": 0.3382, + "step": 8849 + }, + { + "epoch": 4.784645882140927, + "grad_norm": 0.26931247115135193, + "learning_rate": 5.551138236894793e-08, + "loss": 0.378, + "step": 8850 + }, + { + "epoch": 4.785186520093711, + "grad_norm": 0.2577950954437256, + "learning_rate": 5.5231185818434563e-08, + "loss": 0.3394, + "step": 8851 + }, + { + "epoch": 4.785727158046495, + "grad_norm": 0.27060166001319885, + "learning_rate": 5.495169428409386e-08, + "loss": 0.342, + "step": 8852 + }, + { + "epoch": 4.786267795999279, + "grad_norm": 0.2755787968635559, + "learning_rate": 5.46729078057745e-08, + "loss": 0.3485, + "step": 8853 + }, + { + "epoch": 4.786808433952063, + "grad_norm": 0.27702271938323975, + "learning_rate": 5.43948264232258e-08, + "loss": 0.3765, + "step": 8854 + }, + { + "epoch": 4.787349071904847, + "grad_norm": 0.2815020978450775, + "learning_rate": 5.411745017609493e-08, + "loss": 0.3504, + "step": 8855 + }, + { + "epoch": 4.787889709857632, + "grad_norm": 0.2912232577800751, + "learning_rate": 5.384077910393137e-08, + "loss": 0.3901, + "step": 8856 + }, + { + "epoch": 4.788430347810416, + "grad_norm": 0.26429423689842224, + "learning_rate": 5.3564813246181345e-08, + "loss": 0.3414, + "step": 8857 + }, + { + "epoch": 4.788970985763201, + "grad_norm": 0.27630752325057983, + "learning_rate": 5.328955264219171e-08, + "loss": 0.3731, + "step": 8858 + }, + { + "epoch": 4.789511623715985, + "grad_norm": 0.24857951700687408, + "learning_rate": 5.301499733120885e-08, + "loss": 0.3579, + "step": 8859 + }, + { + "epoch": 4.7900522616687695, + "grad_norm": 0.24628034234046936, + "learning_rate": 5.274114735237812e-08, + "loss": 0.3303, + "step": 8860 + }, + { + "epoch": 4.790592899621553, + "grad_norm": 0.256889671087265, + "learning_rate": 5.246800274474439e-08, + "loss": 0.3466, + "step": 8861 + }, + { + "epoch": 4.791133537574337, + "grad_norm": 0.25723087787628174, + "learning_rate": 5.219556354725264e-08, + "loss": 0.3496, + "step": 8862 + }, + { + "epoch": 4.791674175527122, + "grad_norm": 0.2660854160785675, + "learning_rate": 5.192382979874677e-08, + "loss": 0.3271, + "step": 8863 + }, + { + "epoch": 4.792214813479906, + "grad_norm": 0.27716538310050964, + "learning_rate": 5.16528015379697e-08, + "loss": 0.3943, + "step": 8864 + }, + { + "epoch": 4.792755451432691, + "grad_norm": 0.26225516200065613, + "learning_rate": 5.138247880356384e-08, + "loss": 0.3606, + "step": 8865 + }, + { + "epoch": 4.793296089385475, + "grad_norm": 0.2535618543624878, + "learning_rate": 5.1112861634072256e-08, + "loss": 0.3415, + "step": 8866 + }, + { + "epoch": 4.79383672733826, + "grad_norm": 0.2489616423845291, + "learning_rate": 5.08439500679353e-08, + "loss": 0.3303, + "step": 8867 + }, + { + "epoch": 4.7943773652910435, + "grad_norm": 0.26061302423477173, + "learning_rate": 5.0575744143495084e-08, + "loss": 0.3604, + "step": 8868 + }, + { + "epoch": 4.794918003243827, + "grad_norm": 0.25412821769714355, + "learning_rate": 5.0308243898991025e-08, + "loss": 0.3668, + "step": 8869 + }, + { + "epoch": 4.795458641196612, + "grad_norm": 0.31018903851509094, + "learning_rate": 5.004144937256372e-08, + "loss": 0.3978, + "step": 8870 + }, + { + "epoch": 4.795999279149396, + "grad_norm": 0.26542267203330994, + "learning_rate": 4.977536060225163e-08, + "loss": 0.3442, + "step": 8871 + }, + { + "epoch": 4.79653991710218, + "grad_norm": 0.2577354907989502, + "learning_rate": 4.9509977625992745e-08, + "loss": 0.3666, + "step": 8872 + }, + { + "epoch": 4.797080555054965, + "grad_norm": 0.2683751881122589, + "learning_rate": 4.9245300481626234e-08, + "loss": 0.3355, + "step": 8873 + }, + { + "epoch": 4.797621193007749, + "grad_norm": 0.24945707619190216, + "learning_rate": 4.898132920688803e-08, + "loss": 0.366, + "step": 8874 + }, + { + "epoch": 4.798161830960534, + "grad_norm": 0.26416832208633423, + "learning_rate": 4.8718063839414683e-08, + "loss": 0.3606, + "step": 8875 + }, + { + "epoch": 4.7987024689133175, + "grad_norm": 0.28955134749412537, + "learning_rate": 4.84555044167434e-08, + "loss": 0.3629, + "step": 8876 + }, + { + "epoch": 4.799243106866102, + "grad_norm": 0.25828447937965393, + "learning_rate": 4.8193650976308124e-08, + "loss": 0.3518, + "step": 8877 + }, + { + "epoch": 4.799783744818886, + "grad_norm": 0.27702564001083374, + "learning_rate": 4.7932503555443986e-08, + "loss": 0.3753, + "step": 8878 + }, + { + "epoch": 4.80032438277167, + "grad_norm": 0.26961031556129456, + "learning_rate": 4.7672062191385094e-08, + "loss": 0.3611, + "step": 8879 + }, + { + "epoch": 4.800865020724455, + "grad_norm": 0.2614027261734009, + "learning_rate": 4.741232692126396e-08, + "loss": 0.3303, + "step": 8880 + }, + { + "epoch": 4.801405658677239, + "grad_norm": 0.24823996424674988, + "learning_rate": 4.715329778211375e-08, + "loss": 0.3457, + "step": 8881 + }, + { + "epoch": 4.801946296630024, + "grad_norm": 0.2648354470729828, + "learning_rate": 4.6894974810866575e-08, + "loss": 0.3897, + "step": 8882 + }, + { + "epoch": 4.802486934582808, + "grad_norm": 0.24569006264209747, + "learning_rate": 4.6637358044352985e-08, + "loss": 0.3837, + "step": 8883 + }, + { + "epoch": 4.803027572535592, + "grad_norm": 0.26638856530189514, + "learning_rate": 4.638044751930415e-08, + "loss": 0.3491, + "step": 8884 + }, + { + "epoch": 4.803568210488376, + "grad_norm": 0.27749237418174744, + "learning_rate": 4.612424327234966e-08, + "loss": 0.3965, + "step": 8885 + }, + { + "epoch": 4.80410884844116, + "grad_norm": 0.27472639083862305, + "learning_rate": 4.5868745340018064e-08, + "loss": 0.3524, + "step": 8886 + }, + { + "epoch": 4.804649486393945, + "grad_norm": 0.26227518916130066, + "learning_rate": 4.561395375873856e-08, + "loss": 0.3653, + "step": 8887 + }, + { + "epoch": 4.805190124346729, + "grad_norm": 0.26707348227500916, + "learning_rate": 4.5359868564839317e-08, + "loss": 0.3761, + "step": 8888 + }, + { + "epoch": 4.805730762299514, + "grad_norm": 0.274200975894928, + "learning_rate": 4.510648979454579e-08, + "loss": 0.3796, + "step": 8889 + }, + { + "epoch": 4.806271400252298, + "grad_norm": 0.26825666427612305, + "learning_rate": 4.485381748398576e-08, + "loss": 0.3631, + "step": 8890 + }, + { + "epoch": 4.806812038205082, + "grad_norm": 0.25708481669425964, + "learning_rate": 4.4601851669183736e-08, + "loss": 0.3635, + "step": 8891 + }, + { + "epoch": 4.807352676157866, + "grad_norm": 0.26601260900497437, + "learning_rate": 4.435059238606543e-08, + "loss": 0.38, + "step": 8892 + }, + { + "epoch": 4.80789331411065, + "grad_norm": 0.25704678893089294, + "learning_rate": 4.41000396704544e-08, + "loss": 0.3608, + "step": 8893 + }, + { + "epoch": 4.808433952063435, + "grad_norm": 0.25019845366477966, + "learning_rate": 4.3850193558073736e-08, + "loss": 0.3548, + "step": 8894 + }, + { + "epoch": 4.808974590016219, + "grad_norm": 0.2615974545478821, + "learning_rate": 4.360105408454718e-08, + "loss": 0.3566, + "step": 8895 + }, + { + "epoch": 4.809515227969003, + "grad_norm": 0.2789876461029053, + "learning_rate": 4.335262128539519e-08, + "loss": 0.3867, + "step": 8896 + }, + { + "epoch": 4.810055865921788, + "grad_norm": 0.27569377422332764, + "learning_rate": 4.310489519603944e-08, + "loss": 0.358, + "step": 8897 + }, + { + "epoch": 4.810596503874572, + "grad_norm": 0.26313361525535583, + "learning_rate": 4.285787585180057e-08, + "loss": 0.3527, + "step": 8898 + }, + { + "epoch": 4.8111371418273565, + "grad_norm": 0.24816150963306427, + "learning_rate": 4.261156328789762e-08, + "loss": 0.3361, + "step": 8899 + }, + { + "epoch": 4.81167777978014, + "grad_norm": 0.26686063408851624, + "learning_rate": 4.236595753944972e-08, + "loss": 0.3979, + "step": 8900 + }, + { + "epoch": 4.812218417732925, + "grad_norm": 0.2610894739627838, + "learning_rate": 4.2121058641474974e-08, + "loss": 0.366, + "step": 8901 + }, + { + "epoch": 4.812759055685709, + "grad_norm": 0.2641092836856842, + "learning_rate": 4.187686662889045e-08, + "loss": 0.3702, + "step": 8902 + }, + { + "epoch": 4.813299693638493, + "grad_norm": 0.2756359279155731, + "learning_rate": 4.163338153651275e-08, + "loss": 0.3579, + "step": 8903 + }, + { + "epoch": 4.813840331591278, + "grad_norm": 0.2738838493824005, + "learning_rate": 4.139060339905743e-08, + "loss": 0.347, + "step": 8904 + }, + { + "epoch": 4.814380969544062, + "grad_norm": 0.2661038041114807, + "learning_rate": 4.114853225113902e-08, + "loss": 0.3798, + "step": 8905 + }, + { + "epoch": 4.814921607496847, + "grad_norm": 0.24749571084976196, + "learning_rate": 4.090716812727214e-08, + "loss": 0.3497, + "step": 8906 + }, + { + "epoch": 4.8154622454496305, + "grad_norm": 0.2771914005279541, + "learning_rate": 4.0666511061869804e-08, + "loss": 0.3636, + "step": 8907 + }, + { + "epoch": 4.816002883402415, + "grad_norm": 0.2644779086112976, + "learning_rate": 4.042656108924459e-08, + "loss": 0.3731, + "step": 8908 + }, + { + "epoch": 4.816543521355199, + "grad_norm": 0.24924729764461517, + "learning_rate": 4.018731824360744e-08, + "loss": 0.3561, + "step": 8909 + }, + { + "epoch": 4.817084159307983, + "grad_norm": 0.26898232102394104, + "learning_rate": 3.994878255907053e-08, + "loss": 0.3302, + "step": 8910 + }, + { + "epoch": 4.817624797260768, + "grad_norm": 0.28812891244888306, + "learning_rate": 3.971095406964276e-08, + "loss": 0.4111, + "step": 8911 + }, + { + "epoch": 4.818165435213552, + "grad_norm": 0.26695603132247925, + "learning_rate": 3.947383280923367e-08, + "loss": 0.3736, + "step": 8912 + }, + { + "epoch": 4.818706073166336, + "grad_norm": 0.24491271376609802, + "learning_rate": 3.923741881165122e-08, + "loss": 0.3297, + "step": 8913 + }, + { + "epoch": 4.819246711119121, + "grad_norm": 0.26488637924194336, + "learning_rate": 3.900171211060344e-08, + "loss": 0.3876, + "step": 8914 + }, + { + "epoch": 4.8197873490719045, + "grad_norm": 0.26854172348976135, + "learning_rate": 3.8766712739696786e-08, + "loss": 0.3586, + "step": 8915 + }, + { + "epoch": 4.820327987024689, + "grad_norm": 0.26286134123802185, + "learning_rate": 3.853242073243668e-08, + "loss": 0.3338, + "step": 8916 + }, + { + "epoch": 4.820868624977473, + "grad_norm": 0.26648515462875366, + "learning_rate": 3.8298836122228064e-08, + "loss": 0.4151, + "step": 8917 + }, + { + "epoch": 4.821409262930258, + "grad_norm": 0.2651606500148773, + "learning_rate": 3.8065958942375966e-08, + "loss": 0.3663, + "step": 8918 + }, + { + "epoch": 4.821949900883042, + "grad_norm": 0.2581683099269867, + "learning_rate": 3.783378922608216e-08, + "loss": 0.3803, + "step": 8919 + }, + { + "epoch": 4.822490538835826, + "grad_norm": 0.2398492991924286, + "learning_rate": 3.7602327006450166e-08, + "loss": 0.3348, + "step": 8920 + }, + { + "epoch": 4.823031176788611, + "grad_norm": 0.25325441360473633, + "learning_rate": 3.7371572316480806e-08, + "loss": 0.3443, + "step": 8921 + }, + { + "epoch": 4.823571814741395, + "grad_norm": 0.2561415433883667, + "learning_rate": 3.714152518907499e-08, + "loss": 0.3763, + "step": 8922 + }, + { + "epoch": 4.824112452694179, + "grad_norm": 0.24386979639530182, + "learning_rate": 3.691218565703203e-08, + "loss": 0.334, + "step": 8923 + }, + { + "epoch": 4.824653090646963, + "grad_norm": 0.25620338320732117, + "learning_rate": 3.6683553753051326e-08, + "loss": 0.3649, + "step": 8924 + }, + { + "epoch": 4.825193728599748, + "grad_norm": 0.27384117245674133, + "learning_rate": 3.645562950973014e-08, + "loss": 0.408, + "step": 8925 + }, + { + "epoch": 4.825734366552532, + "grad_norm": 0.2522866427898407, + "learning_rate": 3.6228412959565805e-08, + "loss": 0.3711, + "step": 8926 + }, + { + "epoch": 4.826275004505316, + "grad_norm": 0.2863280475139618, + "learning_rate": 3.600190413495463e-08, + "loss": 0.3813, + "step": 8927 + }, + { + "epoch": 4.826815642458101, + "grad_norm": 0.26387637853622437, + "learning_rate": 3.57761030681919e-08, + "loss": 0.3987, + "step": 8928 + }, + { + "epoch": 4.827356280410885, + "grad_norm": 0.2458793967962265, + "learning_rate": 3.55510097914713e-08, + "loss": 0.3436, + "step": 8929 + }, + { + "epoch": 4.827896918363669, + "grad_norm": 0.27182090282440186, + "learning_rate": 3.5326624336886604e-08, + "loss": 0.3549, + "step": 8930 + }, + { + "epoch": 4.828437556316453, + "grad_norm": 0.2573881447315216, + "learning_rate": 3.510294673643056e-08, + "loss": 0.3556, + "step": 8931 + }, + { + "epoch": 4.828978194269237, + "grad_norm": 0.26516303420066833, + "learning_rate": 3.4879977021994304e-08, + "loss": 0.3717, + "step": 8932 + }, + { + "epoch": 4.829518832222022, + "grad_norm": 0.23965980112552643, + "learning_rate": 3.465771522536854e-08, + "loss": 0.3464, + "step": 8933 + }, + { + "epoch": 4.830059470174806, + "grad_norm": 0.272983580827713, + "learning_rate": 3.4436161378242907e-08, + "loss": 0.3539, + "step": 8934 + }, + { + "epoch": 4.830600108127591, + "grad_norm": 0.28406354784965515, + "learning_rate": 3.4215315512206584e-08, + "loss": 0.371, + "step": 8935 + }, + { + "epoch": 4.831140746080375, + "grad_norm": 0.271476149559021, + "learning_rate": 3.399517765874716e-08, + "loss": 0.3745, + "step": 8936 + }, + { + "epoch": 4.831681384033159, + "grad_norm": 0.24661186337471008, + "learning_rate": 3.377574784925064e-08, + "loss": 0.3329, + "step": 8937 + }, + { + "epoch": 4.8322220219859435, + "grad_norm": 0.2657792270183563, + "learning_rate": 3.355702611500422e-08, + "loss": 0.348, + "step": 8938 + }, + { + "epoch": 4.8327626599387274, + "grad_norm": 0.2854871153831482, + "learning_rate": 3.33390124871924e-08, + "loss": 0.3917, + "step": 8939 + }, + { + "epoch": 4.833303297891512, + "grad_norm": 0.2525235116481781, + "learning_rate": 3.312170699689865e-08, + "loss": 0.3605, + "step": 8940 + }, + { + "epoch": 4.833843935844296, + "grad_norm": 0.27078115940093994, + "learning_rate": 3.2905109675106515e-08, + "loss": 0.3555, + "step": 8941 + }, + { + "epoch": 4.834384573797081, + "grad_norm": 0.26227429509162903, + "learning_rate": 3.268922055269741e-08, + "loss": 0.3676, + "step": 8942 + }, + { + "epoch": 4.834925211749865, + "grad_norm": 0.2645623981952667, + "learning_rate": 3.247403966045393e-08, + "loss": 0.3534, + "step": 8943 + }, + { + "epoch": 4.835465849702649, + "grad_norm": 0.25363367795944214, + "learning_rate": 3.225956702905486e-08, + "loss": 0.372, + "step": 8944 + }, + { + "epoch": 4.836006487655434, + "grad_norm": 0.26763975620269775, + "learning_rate": 3.204580268907909e-08, + "loss": 0.375, + "step": 8945 + }, + { + "epoch": 4.8365471256082175, + "grad_norm": 0.27136990427970886, + "learning_rate": 3.183274667100611e-08, + "loss": 0.376, + "step": 8946 + }, + { + "epoch": 4.837087763561002, + "grad_norm": 0.2796955406665802, + "learning_rate": 3.1620399005211634e-08, + "loss": 0.3741, + "step": 8947 + }, + { + "epoch": 4.837628401513786, + "grad_norm": 0.24199721217155457, + "learning_rate": 3.140875972197255e-08, + "loss": 0.3395, + "step": 8948 + }, + { + "epoch": 4.83816903946657, + "grad_norm": 0.25753477215766907, + "learning_rate": 3.1197828851464164e-08, + "loss": 0.3678, + "step": 8949 + }, + { + "epoch": 4.838709677419355, + "grad_norm": 0.270973265171051, + "learning_rate": 3.0987606423759644e-08, + "loss": 0.3769, + "step": 8950 + }, + { + "epoch": 4.839250315372139, + "grad_norm": 0.2731953561306, + "learning_rate": 3.0778092468833897e-08, + "loss": 0.3535, + "step": 8951 + }, + { + "epoch": 4.839790953324924, + "grad_norm": 0.3085453510284424, + "learning_rate": 3.056928701655692e-08, + "loss": 0.3554, + "step": 8952 + }, + { + "epoch": 4.840331591277708, + "grad_norm": 0.2651394307613373, + "learning_rate": 3.0361190096701573e-08, + "loss": 0.3614, + "step": 8953 + }, + { + "epoch": 4.8408722292304915, + "grad_norm": 0.2528552711009979, + "learning_rate": 3.015380173893689e-08, + "loss": 0.3334, + "step": 8954 + }, + { + "epoch": 4.841412867183276, + "grad_norm": 0.2840980589389801, + "learning_rate": 2.9947121972832e-08, + "loss": 0.3762, + "step": 8955 + }, + { + "epoch": 4.84195350513606, + "grad_norm": 0.23905372619628906, + "learning_rate": 2.974115082785556e-08, + "loss": 0.3235, + "step": 8956 + }, + { + "epoch": 4.842494143088845, + "grad_norm": 0.25684115290641785, + "learning_rate": 2.9535888333374064e-08, + "loss": 0.3945, + "step": 8957 + }, + { + "epoch": 4.843034781041629, + "grad_norm": 0.27147263288497925, + "learning_rate": 2.9331334518653554e-08, + "loss": 0.3767, + "step": 8958 + }, + { + "epoch": 4.843575418994414, + "grad_norm": 0.26529526710510254, + "learning_rate": 2.9127489412859033e-08, + "loss": 0.4167, + "step": 8959 + }, + { + "epoch": 4.844116056947198, + "grad_norm": 0.2645881175994873, + "learning_rate": 2.8924353045054475e-08, + "loss": 0.3754, + "step": 8960 + }, + { + "epoch": 4.844656694899982, + "grad_norm": 0.27821385860443115, + "learning_rate": 2.872192544420227e-08, + "loss": 0.3982, + "step": 8961 + }, + { + "epoch": 4.845197332852766, + "grad_norm": 0.26622623205184937, + "learning_rate": 2.8520206639164328e-08, + "loss": 0.3406, + "step": 8962 + }, + { + "epoch": 4.84573797080555, + "grad_norm": 0.25798436999320984, + "learning_rate": 2.8319196658702087e-08, + "loss": 0.3635, + "step": 8963 + }, + { + "epoch": 4.846278608758335, + "grad_norm": 0.26354706287384033, + "learning_rate": 2.8118895531473733e-08, + "loss": 0.3447, + "step": 8964 + }, + { + "epoch": 4.846819246711119, + "grad_norm": 0.2673446238040924, + "learning_rate": 2.7919303286039202e-08, + "loss": 0.3957, + "step": 8965 + }, + { + "epoch": 4.847359884663904, + "grad_norm": 0.2549054026603699, + "learning_rate": 2.772041995085517e-08, + "loss": 0.3468, + "step": 8966 + }, + { + "epoch": 4.847900522616688, + "grad_norm": 0.2655335068702698, + "learning_rate": 2.7522245554278404e-08, + "loss": 0.3316, + "step": 8967 + }, + { + "epoch": 4.848441160569472, + "grad_norm": 0.2460017055273056, + "learning_rate": 2.7324780124564633e-08, + "loss": 0.3642, + "step": 8968 + }, + { + "epoch": 4.8489817985222565, + "grad_norm": 0.28284773230552673, + "learning_rate": 2.7128023689866888e-08, + "loss": 0.4121, + "step": 8969 + }, + { + "epoch": 4.84952243647504, + "grad_norm": 0.2812318205833435, + "learning_rate": 2.693197627823996e-08, + "loss": 0.3492, + "step": 8970 + }, + { + "epoch": 4.850063074427824, + "grad_norm": 0.2731960713863373, + "learning_rate": 2.673663791763481e-08, + "loss": 0.356, + "step": 8971 + }, + { + "epoch": 4.850603712380609, + "grad_norm": 0.25460806488990784, + "learning_rate": 2.6542008635902504e-08, + "loss": 0.344, + "step": 8972 + }, + { + "epoch": 4.851144350333393, + "grad_norm": 0.27592790126800537, + "learning_rate": 2.6348088460793064e-08, + "loss": 0.3392, + "step": 8973 + }, + { + "epoch": 4.851684988286178, + "grad_norm": 0.2760111689567566, + "learning_rate": 2.6154877419955483e-08, + "loss": 0.3602, + "step": 8974 + }, + { + "epoch": 4.852225626238962, + "grad_norm": 0.26083844900131226, + "learning_rate": 2.5962375540937724e-08, + "loss": 0.3553, + "step": 8975 + }, + { + "epoch": 4.852766264191747, + "grad_norm": 0.26439404487609863, + "learning_rate": 2.577058285118561e-08, + "loss": 0.3644, + "step": 8976 + }, + { + "epoch": 4.8533069021445305, + "grad_norm": 0.2698628604412079, + "learning_rate": 2.557949937804505e-08, + "loss": 0.3721, + "step": 8977 + }, + { + "epoch": 4.8538475400973145, + "grad_norm": 0.26598867774009705, + "learning_rate": 2.5389125148760353e-08, + "loss": 0.3501, + "step": 8978 + }, + { + "epoch": 4.854388178050099, + "grad_norm": 0.26747986674308777, + "learning_rate": 2.5199460190474255e-08, + "loss": 0.4075, + "step": 8979 + }, + { + "epoch": 4.854928816002883, + "grad_norm": 0.2578566372394562, + "learning_rate": 2.5010504530229574e-08, + "loss": 0.3406, + "step": 8980 + }, + { + "epoch": 4.855469453955668, + "grad_norm": 0.2632170617580414, + "learning_rate": 2.4822258194966975e-08, + "loss": 0.3354, + "step": 8981 + }, + { + "epoch": 4.856010091908452, + "grad_norm": 0.2677387595176697, + "learning_rate": 2.4634721211526102e-08, + "loss": 0.3469, + "step": 8982 + }, + { + "epoch": 4.856550729861237, + "grad_norm": 0.27075430750846863, + "learning_rate": 2.4447893606645567e-08, + "loss": 0.3443, + "step": 8983 + }, + { + "epoch": 4.857091367814021, + "grad_norm": 0.26497140526771545, + "learning_rate": 2.4261775406963505e-08, + "loss": 0.3584, + "step": 8984 + }, + { + "epoch": 4.8576320057668045, + "grad_norm": 0.2607765197753906, + "learning_rate": 2.4076366639015914e-08, + "loss": 0.369, + "step": 8985 + }, + { + "epoch": 4.858172643719589, + "grad_norm": 0.28138893842697144, + "learning_rate": 2.3891667329237756e-08, + "loss": 0.3623, + "step": 8986 + }, + { + "epoch": 4.858713281672373, + "grad_norm": 0.2865180969238281, + "learning_rate": 2.3707677503963523e-08, + "loss": 0.3966, + "step": 8987 + }, + { + "epoch": 4.859253919625158, + "grad_norm": 0.2480352520942688, + "learning_rate": 2.3524397189426117e-08, + "loss": 0.3419, + "step": 8988 + }, + { + "epoch": 4.859794557577942, + "grad_norm": 0.2529453635215759, + "learning_rate": 2.3341826411756863e-08, + "loss": 0.3579, + "step": 8989 + }, + { + "epoch": 4.860335195530726, + "grad_norm": 0.27298611402511597, + "learning_rate": 2.3159965196987156e-08, + "loss": 0.3687, + "step": 8990 + }, + { + "epoch": 4.860875833483511, + "grad_norm": 0.24878400564193726, + "learning_rate": 2.29788135710457e-08, + "loss": 0.3376, + "step": 8991 + }, + { + "epoch": 4.861416471436295, + "grad_norm": 0.28801393508911133, + "learning_rate": 2.2798371559761835e-08, + "loss": 0.3756, + "step": 8992 + }, + { + "epoch": 4.861957109389079, + "grad_norm": 0.2819991409778595, + "learning_rate": 2.261863918886109e-08, + "loss": 0.3451, + "step": 8993 + }, + { + "epoch": 4.862497747341863, + "grad_norm": 0.2526842951774597, + "learning_rate": 2.2439616483970748e-08, + "loss": 0.3584, + "step": 8994 + }, + { + "epoch": 4.863038385294647, + "grad_norm": 0.2582084536552429, + "learning_rate": 2.2261303470614282e-08, + "loss": 0.3716, + "step": 8995 + }, + { + "epoch": 4.863579023247432, + "grad_norm": 0.2674556076526642, + "learning_rate": 2.2083700174216348e-08, + "loss": 0.38, + "step": 8996 + }, + { + "epoch": 4.864119661200216, + "grad_norm": 0.2626592218875885, + "learning_rate": 2.1906806620099473e-08, + "loss": 0.3382, + "step": 8997 + }, + { + "epoch": 4.864660299153001, + "grad_norm": 0.27217191457748413, + "learning_rate": 2.1730622833483484e-08, + "loss": 0.3627, + "step": 8998 + }, + { + "epoch": 4.865200937105785, + "grad_norm": 0.2672560214996338, + "learning_rate": 2.1555148839489392e-08, + "loss": 0.3331, + "step": 8999 + }, + { + "epoch": 4.8657415750585695, + "grad_norm": 0.2712641656398773, + "learning_rate": 2.1380384663135523e-08, + "loss": 0.401, + "step": 9000 + }, + { + "epoch": 4.866282213011353, + "grad_norm": 0.2781943380832672, + "learning_rate": 2.1206330329339718e-08, + "loss": 0.3508, + "step": 9001 + }, + { + "epoch": 4.866822850964137, + "grad_norm": 0.2640174329280853, + "learning_rate": 2.1032985862918242e-08, + "loss": 0.355, + "step": 9002 + }, + { + "epoch": 4.867363488916922, + "grad_norm": 0.2671608626842499, + "learning_rate": 2.086035128858632e-08, + "loss": 0.3841, + "step": 9003 + }, + { + "epoch": 4.867904126869706, + "grad_norm": 0.25497519969940186, + "learning_rate": 2.0688426630958158e-08, + "loss": 0.3467, + "step": 9004 + }, + { + "epoch": 4.868444764822491, + "grad_norm": 0.2529909312725067, + "learning_rate": 2.0517211914545254e-08, + "loss": 0.3519, + "step": 9005 + }, + { + "epoch": 4.868985402775275, + "grad_norm": 0.2696881592273712, + "learning_rate": 2.0346707163760304e-08, + "loss": 0.3768, + "step": 9006 + }, + { + "epoch": 4.869526040728059, + "grad_norm": 0.2658160626888275, + "learning_rate": 2.0176912402912752e-08, + "loss": 0.3659, + "step": 9007 + }, + { + "epoch": 4.8700666786808435, + "grad_norm": 0.24771720170974731, + "learning_rate": 2.0007827656212674e-08, + "loss": 0.3515, + "step": 9008 + }, + { + "epoch": 4.8706073166336274, + "grad_norm": 0.27035635709762573, + "learning_rate": 1.98394529477669e-08, + "loss": 0.3962, + "step": 9009 + }, + { + "epoch": 4.871147954586412, + "grad_norm": 0.2588082551956177, + "learning_rate": 1.967178830158234e-08, + "loss": 0.3257, + "step": 9010 + }, + { + "epoch": 4.871688592539196, + "grad_norm": 0.2636779248714447, + "learning_rate": 1.950483374156431e-08, + "loss": 0.3616, + "step": 9011 + }, + { + "epoch": 4.87222923049198, + "grad_norm": 0.2517573833465576, + "learning_rate": 1.9338589291516553e-08, + "loss": 0.3473, + "step": 9012 + }, + { + "epoch": 4.872769868444765, + "grad_norm": 0.2549690306186676, + "learning_rate": 1.9173054975142326e-08, + "loss": 0.3874, + "step": 9013 + }, + { + "epoch": 4.873310506397549, + "grad_norm": 0.2793142795562744, + "learning_rate": 1.900823081604386e-08, + "loss": 0.3795, + "step": 9014 + }, + { + "epoch": 4.873851144350334, + "grad_norm": 0.2743738889694214, + "learning_rate": 1.8844116837719582e-08, + "loss": 0.3434, + "step": 9015 + }, + { + "epoch": 4.8743917823031175, + "grad_norm": 0.2686752676963806, + "learning_rate": 1.8680713063570777e-08, + "loss": 0.3588, + "step": 9016 + }, + { + "epoch": 4.874932420255902, + "grad_norm": 0.2675032317638397, + "learning_rate": 1.8518019516893803e-08, + "loss": 0.3351, + "step": 9017 + }, + { + "epoch": 4.875473058208686, + "grad_norm": 0.28255701065063477, + "learning_rate": 1.835603622088511e-08, + "loss": 0.3802, + "step": 9018 + }, + { + "epoch": 4.87601369616147, + "grad_norm": 0.27471938729286194, + "learning_rate": 1.819476319864122e-08, + "loss": 0.4098, + "step": 9019 + }, + { + "epoch": 4.876554334114255, + "grad_norm": 0.24794790148735046, + "learning_rate": 1.803420047315485e-08, + "loss": 0.3012, + "step": 9020 + }, + { + "epoch": 4.877094972067039, + "grad_norm": 0.26921573281288147, + "learning_rate": 1.7874348067319912e-08, + "loss": 0.374, + "step": 9021 + }, + { + "epoch": 4.877635610019824, + "grad_norm": 0.2633867859840393, + "learning_rate": 1.771520600392651e-08, + "loss": 0.3559, + "step": 9022 + }, + { + "epoch": 4.878176247972608, + "grad_norm": 0.26123112440109253, + "learning_rate": 1.7556774305665935e-08, + "loss": 0.3666, + "step": 9023 + }, + { + "epoch": 4.878716885925392, + "grad_norm": 0.25380924344062805, + "learning_rate": 1.7399052995126787e-08, + "loss": 0.3476, + "step": 9024 + }, + { + "epoch": 4.879257523878176, + "grad_norm": 0.26871591806411743, + "learning_rate": 1.724204209479663e-08, + "loss": 0.3613, + "step": 9025 + }, + { + "epoch": 4.87979816183096, + "grad_norm": 0.2530689537525177, + "learning_rate": 1.7085741627062003e-08, + "loss": 0.3532, + "step": 9026 + }, + { + "epoch": 4.880338799783745, + "grad_norm": 0.2639843225479126, + "learning_rate": 1.6930151614207302e-08, + "loss": 0.3783, + "step": 9027 + }, + { + "epoch": 4.880879437736529, + "grad_norm": 0.2442837506532669, + "learning_rate": 1.6775272078417004e-08, + "loss": 0.3196, + "step": 9028 + }, + { + "epoch": 4.881420075689313, + "grad_norm": 0.2900562882423401, + "learning_rate": 1.662110304177289e-08, + "loss": 0.3927, + "step": 9029 + }, + { + "epoch": 4.881960713642098, + "grad_norm": 0.2681589126586914, + "learning_rate": 1.646764452625682e-08, + "loss": 0.3594, + "step": 9030 + }, + { + "epoch": 4.882501351594882, + "grad_norm": 0.27078258991241455, + "learning_rate": 1.6314896553748515e-08, + "loss": 0.3651, + "step": 9031 + }, + { + "epoch": 4.883041989547666, + "grad_norm": 0.2650523781776428, + "learning_rate": 1.6162859146025557e-08, + "loss": 0.3488, + "step": 9032 + }, + { + "epoch": 4.88358262750045, + "grad_norm": 0.26287534832954407, + "learning_rate": 1.601153232476671e-08, + "loss": 0.3725, + "step": 9033 + }, + { + "epoch": 4.884123265453235, + "grad_norm": 0.23999665677547455, + "learning_rate": 1.5860916111546386e-08, + "loss": 0.3072, + "step": 9034 + }, + { + "epoch": 4.884663903406019, + "grad_norm": 0.2521497905254364, + "learning_rate": 1.5711010527839633e-08, + "loss": 0.3726, + "step": 9035 + }, + { + "epoch": 4.885204541358803, + "grad_norm": 0.2601372003555298, + "learning_rate": 1.5561815595020457e-08, + "loss": 0.3824, + "step": 9036 + }, + { + "epoch": 4.885745179311588, + "grad_norm": 0.24594874680042267, + "learning_rate": 1.541333133436018e-08, + "loss": 0.3534, + "step": 9037 + }, + { + "epoch": 4.886285817264372, + "grad_norm": 0.2619156241416931, + "learning_rate": 1.526555776702965e-08, + "loss": 0.3611, + "step": 9038 + }, + { + "epoch": 4.8868264552171565, + "grad_norm": 0.25986140966415405, + "learning_rate": 1.511849491409756e-08, + "loss": 0.3362, + "step": 9039 + }, + { + "epoch": 4.8873670931699404, + "grad_norm": 0.2690037488937378, + "learning_rate": 1.4972142796532696e-08, + "loss": 0.3891, + "step": 9040 + }, + { + "epoch": 4.887907731122725, + "grad_norm": 0.2490912824869156, + "learning_rate": 1.482650143520059e-08, + "loss": 0.3477, + "step": 9041 + }, + { + "epoch": 4.888448369075509, + "grad_norm": 0.270921528339386, + "learning_rate": 1.4681570850867966e-08, + "loss": 0.3392, + "step": 9042 + }, + { + "epoch": 4.888989007028293, + "grad_norm": 0.2785066068172455, + "learning_rate": 1.4537351064197736e-08, + "loss": 0.3663, + "step": 9043 + }, + { + "epoch": 4.889529644981078, + "grad_norm": 0.24923615157604218, + "learning_rate": 1.4393842095752896e-08, + "loss": 0.3329, + "step": 9044 + }, + { + "epoch": 4.890070282933862, + "grad_norm": 0.2749128043651581, + "learning_rate": 1.4251043965994304e-08, + "loss": 0.409, + "step": 9045 + }, + { + "epoch": 4.890610920886647, + "grad_norm": 0.25903937220573425, + "learning_rate": 1.410895669528234e-08, + "loss": 0.3411, + "step": 9046 + }, + { + "epoch": 4.8911515588394305, + "grad_norm": 0.2720274031162262, + "learning_rate": 1.3967580303875239e-08, + "loss": 0.3729, + "step": 9047 + }, + { + "epoch": 4.8916921967922145, + "grad_norm": 0.2366250902414322, + "learning_rate": 1.3826914811930214e-08, + "loss": 0.3264, + "step": 9048 + }, + { + "epoch": 4.892232834744999, + "grad_norm": 0.25673407316207886, + "learning_rate": 1.3686960239503444e-08, + "loss": 0.3811, + "step": 9049 + }, + { + "epoch": 4.892773472697783, + "grad_norm": 0.25436335802078247, + "learning_rate": 1.3547716606548967e-08, + "loss": 0.3911, + "step": 9050 + }, + { + "epoch": 4.893314110650568, + "grad_norm": 0.25915470719337463, + "learning_rate": 1.3409183932919788e-08, + "loss": 0.395, + "step": 9051 + }, + { + "epoch": 4.893854748603352, + "grad_norm": 0.26253634691238403, + "learning_rate": 1.3271362238368447e-08, + "loss": 0.366, + "step": 9052 + }, + { + "epoch": 4.894395386556136, + "grad_norm": 0.26160550117492676, + "learning_rate": 1.3134251542544774e-08, + "loss": 0.3661, + "step": 9053 + }, + { + "epoch": 4.894936024508921, + "grad_norm": 0.2520962655544281, + "learning_rate": 1.2997851864997024e-08, + "loss": 0.3599, + "step": 9054 + }, + { + "epoch": 4.8954766624617045, + "grad_norm": 0.2571839392185211, + "learning_rate": 1.2862163225174084e-08, + "loss": 0.376, + "step": 9055 + }, + { + "epoch": 4.896017300414489, + "grad_norm": 0.24826057255268097, + "learning_rate": 1.272718564242159e-08, + "loss": 0.3455, + "step": 9056 + }, + { + "epoch": 4.896557938367273, + "grad_norm": 0.25594672560691833, + "learning_rate": 1.259291913598415e-08, + "loss": 0.3614, + "step": 9057 + }, + { + "epoch": 4.897098576320058, + "grad_norm": 0.2774297297000885, + "learning_rate": 1.2459363725005891e-08, + "loss": 0.4316, + "step": 9058 + }, + { + "epoch": 4.897639214272842, + "grad_norm": 0.25605112314224243, + "learning_rate": 1.2326519428528805e-08, + "loss": 0.3426, + "step": 9059 + }, + { + "epoch": 4.898179852225626, + "grad_norm": 0.27040329575538635, + "learning_rate": 1.2194386265492742e-08, + "loss": 0.3753, + "step": 9060 + }, + { + "epoch": 4.898720490178411, + "grad_norm": 0.26323947310447693, + "learning_rate": 1.2062964254738186e-08, + "loss": 0.3588, + "step": 9061 + }, + { + "epoch": 4.899261128131195, + "grad_norm": 0.27045533061027527, + "learning_rate": 1.193225341500237e-08, + "loss": 0.3274, + "step": 9062 + }, + { + "epoch": 4.899801766083979, + "grad_norm": 0.29912617802619934, + "learning_rate": 1.18022537649215e-08, + "loss": 0.3502, + "step": 9063 + }, + { + "epoch": 4.900342404036763, + "grad_norm": 0.27602618932724, + "learning_rate": 1.1672965323031304e-08, + "loss": 0.3213, + "step": 9064 + }, + { + "epoch": 4.900883041989548, + "grad_norm": 0.2750244438648224, + "learning_rate": 1.1544388107765924e-08, + "loss": 0.4147, + "step": 9065 + }, + { + "epoch": 4.901423679942332, + "grad_norm": 0.2645430266857147, + "learning_rate": 1.1416522137456254e-08, + "loss": 0.3692, + "step": 9066 + }, + { + "epoch": 4.901964317895116, + "grad_norm": 0.2459685057401657, + "learning_rate": 1.1289367430334375e-08, + "loss": 0.3079, + "step": 9067 + }, + { + "epoch": 4.902504955847901, + "grad_norm": 0.27539220452308655, + "learning_rate": 1.116292400452912e-08, + "loss": 0.4131, + "step": 9068 + }, + { + "epoch": 4.903045593800685, + "grad_norm": 0.24851562082767487, + "learning_rate": 1.1037191878068843e-08, + "loss": 0.3484, + "step": 9069 + }, + { + "epoch": 4.903586231753469, + "grad_norm": 0.24963010847568512, + "learning_rate": 1.0912171068880318e-08, + "loss": 0.3385, + "step": 9070 + }, + { + "epoch": 4.9041268697062534, + "grad_norm": 0.26785072684288025, + "learning_rate": 1.0787861594788728e-08, + "loss": 0.3773, + "step": 9071 + }, + { + "epoch": 4.904667507659037, + "grad_norm": 0.26428255438804626, + "learning_rate": 1.0664263473517677e-08, + "loss": 0.3968, + "step": 9072 + }, + { + "epoch": 4.905208145611822, + "grad_norm": 0.2475302517414093, + "learning_rate": 1.0541376722689734e-08, + "loss": 0.3291, + "step": 9073 + }, + { + "epoch": 4.905748783564606, + "grad_norm": 0.2793020009994507, + "learning_rate": 1.0419201359825881e-08, + "loss": 0.3545, + "step": 9074 + }, + { + "epoch": 4.906289421517391, + "grad_norm": 0.2719111144542694, + "learning_rate": 1.029773740234552e-08, + "loss": 0.4021, + "step": 9075 + }, + { + "epoch": 4.906830059470175, + "grad_norm": 0.2759902775287628, + "learning_rate": 1.0176984867567018e-08, + "loss": 0.3442, + "step": 9076 + }, + { + "epoch": 4.907370697422959, + "grad_norm": 0.2784024775028229, + "learning_rate": 1.0056943772706607e-08, + "loss": 0.3617, + "step": 9077 + }, + { + "epoch": 4.9079113353757435, + "grad_norm": 0.2713329792022705, + "learning_rate": 9.937614134880036e-09, + "loss": 0.374, + "step": 9078 + }, + { + "epoch": 4.9084519733285275, + "grad_norm": 0.2651738226413727, + "learning_rate": 9.81899597110092e-09, + "loss": 0.3532, + "step": 9079 + }, + { + "epoch": 4.908992611281312, + "grad_norm": 0.2688627541065216, + "learning_rate": 9.701089298281285e-09, + "loss": 0.3752, + "step": 9080 + }, + { + "epoch": 4.909533249234096, + "grad_norm": 0.27973148226737976, + "learning_rate": 9.583894133232685e-09, + "loss": 0.3755, + "step": 9081 + }, + { + "epoch": 4.910073887186881, + "grad_norm": 0.26119181513786316, + "learning_rate": 9.46741049266453e-09, + "loss": 0.3515, + "step": 9082 + }, + { + "epoch": 4.910614525139665, + "grad_norm": 0.2695755958557129, + "learning_rate": 9.351638393184092e-09, + "loss": 0.3567, + "step": 9083 + }, + { + "epoch": 4.911155163092449, + "grad_norm": 0.2547578513622284, + "learning_rate": 9.236577851298168e-09, + "loss": 0.3326, + "step": 9084 + }, + { + "epoch": 4.911695801045234, + "grad_norm": 0.27387887239456177, + "learning_rate": 9.12222888341252e-09, + "loss": 0.3887, + "step": 9085 + }, + { + "epoch": 4.9122364389980175, + "grad_norm": 0.2561882734298706, + "learning_rate": 9.008591505830777e-09, + "loss": 0.3682, + "step": 9086 + }, + { + "epoch": 4.9127770769508015, + "grad_norm": 0.25791123509407043, + "learning_rate": 8.895665734754422e-09, + "loss": 0.3444, + "step": 9087 + }, + { + "epoch": 4.913317714903586, + "grad_norm": 0.27500414848327637, + "learning_rate": 8.783451586284464e-09, + "loss": 0.3766, + "step": 9088 + }, + { + "epoch": 4.91385835285637, + "grad_norm": 0.2534409761428833, + "learning_rate": 8.671949076420883e-09, + "loss": 0.3507, + "step": 9089 + }, + { + "epoch": 4.914398990809155, + "grad_norm": 0.2635025084018707, + "learning_rate": 8.561158221060406e-09, + "loss": 0.3895, + "step": 9090 + }, + { + "epoch": 4.914939628761939, + "grad_norm": 0.25874948501586914, + "learning_rate": 8.451079035999843e-09, + "loss": 0.3622, + "step": 9091 + }, + { + "epoch": 4.915480266714724, + "grad_norm": 0.26105883717536926, + "learning_rate": 8.341711536934415e-09, + "loss": 0.3452, + "step": 9092 + }, + { + "epoch": 4.916020904667508, + "grad_norm": 0.252156138420105, + "learning_rate": 8.233055739457762e-09, + "loss": 0.3682, + "step": 9093 + }, + { + "epoch": 4.9165615426202915, + "grad_norm": 0.266645222902298, + "learning_rate": 8.125111659060826e-09, + "loss": 0.3792, + "step": 9094 + }, + { + "epoch": 4.917102180573076, + "grad_norm": 0.2581704258918762, + "learning_rate": 8.017879311134624e-09, + "loss": 0.3233, + "step": 9095 + }, + { + "epoch": 4.91764281852586, + "grad_norm": 0.27391037344932556, + "learning_rate": 7.911358710968042e-09, + "loss": 0.3656, + "step": 9096 + }, + { + "epoch": 4.918183456478645, + "grad_norm": 0.2647905647754669, + "learning_rate": 7.805549873749485e-09, + "loss": 0.3522, + "step": 9097 + }, + { + "epoch": 4.918724094431429, + "grad_norm": 0.2863467335700989, + "learning_rate": 7.700452814563552e-09, + "loss": 0.387, + "step": 9098 + }, + { + "epoch": 4.919264732384214, + "grad_norm": 0.2514527440071106, + "learning_rate": 7.596067548395481e-09, + "loss": 0.363, + "step": 9099 + }, + { + "epoch": 4.919805370336998, + "grad_norm": 0.2609706223011017, + "learning_rate": 7.492394090128364e-09, + "loss": 0.3352, + "step": 9100 + }, + { + "epoch": 4.920346008289782, + "grad_norm": 0.2651589810848236, + "learning_rate": 7.38943245454371e-09, + "loss": 0.373, + "step": 9101 + }, + { + "epoch": 4.920886646242566, + "grad_norm": 0.2527099847793579, + "learning_rate": 7.2871826563214454e-09, + "loss": 0.3637, + "step": 9102 + }, + { + "epoch": 4.92142728419535, + "grad_norm": 0.23803900182247162, + "learning_rate": 7.185644710040463e-09, + "loss": 0.331, + "step": 9103 + }, + { + "epoch": 4.921967922148135, + "grad_norm": 0.26620054244995117, + "learning_rate": 7.0848186301775145e-09, + "loss": 0.3664, + "step": 9104 + }, + { + "epoch": 4.922508560100919, + "grad_norm": 0.2763153612613678, + "learning_rate": 6.98470443110888e-09, + "loss": 0.3781, + "step": 9105 + }, + { + "epoch": 4.923049198053703, + "grad_norm": 0.2591570019721985, + "learning_rate": 6.88530212710814e-09, + "loss": 0.3638, + "step": 9106 + }, + { + "epoch": 4.923589836006488, + "grad_norm": 0.2564665973186493, + "learning_rate": 6.7866117323472925e-09, + "loss": 0.3666, + "step": 9107 + }, + { + "epoch": 4.924130473959272, + "grad_norm": 0.2689834535121918, + "learning_rate": 6.688633260898414e-09, + "loss": 0.3554, + "step": 9108 + }, + { + "epoch": 4.9246711119120565, + "grad_norm": 0.24325302243232727, + "learning_rate": 6.591366726730885e-09, + "loss": 0.3345, + "step": 9109 + }, + { + "epoch": 4.9252117498648404, + "grad_norm": 0.2511265277862549, + "learning_rate": 6.4948121437125035e-09, + "loss": 0.3754, + "step": 9110 + }, + { + "epoch": 4.925752387817624, + "grad_norm": 0.25291627645492554, + "learning_rate": 6.398969525610032e-09, + "loss": 0.3385, + "step": 9111 + }, + { + "epoch": 4.926293025770409, + "grad_norm": 0.2580259442329407, + "learning_rate": 6.303838886088653e-09, + "loss": 0.3644, + "step": 9112 + }, + { + "epoch": 4.926833663723193, + "grad_norm": 0.23568320274353027, + "learning_rate": 6.20942023871196e-09, + "loss": 0.3328, + "step": 9113 + }, + { + "epoch": 4.927374301675978, + "grad_norm": 0.2598705589771271, + "learning_rate": 6.115713596941408e-09, + "loss": 0.3851, + "step": 9114 + }, + { + "epoch": 4.927914939628762, + "grad_norm": 0.2612532675266266, + "learning_rate": 6.022718974137976e-09, + "loss": 0.3697, + "step": 9115 + }, + { + "epoch": 4.928455577581547, + "grad_norm": 0.26220640540122986, + "learning_rate": 5.930436383561056e-09, + "loss": 0.3278, + "step": 9116 + }, + { + "epoch": 4.9289962155343305, + "grad_norm": 0.2938258945941925, + "learning_rate": 5.838865838366792e-09, + "loss": 0.4168, + "step": 9117 + }, + { + "epoch": 4.9295368534871145, + "grad_norm": 0.25957944989204407, + "learning_rate": 5.748007351613072e-09, + "loss": 0.3328, + "step": 9118 + }, + { + "epoch": 4.930077491439899, + "grad_norm": 0.2673860192298889, + "learning_rate": 5.657860936252868e-09, + "loss": 0.3636, + "step": 9119 + }, + { + "epoch": 4.930618129392683, + "grad_norm": 0.26958364248275757, + "learning_rate": 5.568426605139232e-09, + "loss": 0.4007, + "step": 9120 + }, + { + "epoch": 4.931158767345468, + "grad_norm": 0.24389295279979706, + "learning_rate": 5.479704371024186e-09, + "loss": 0.3277, + "step": 9121 + }, + { + "epoch": 4.931699405298252, + "grad_norm": 0.2678050100803375, + "learning_rate": 5.391694246557056e-09, + "loss": 0.365, + "step": 9122 + }, + { + "epoch": 4.932240043251037, + "grad_norm": 0.27633118629455566, + "learning_rate": 5.304396244286691e-09, + "loss": 0.3767, + "step": 9123 + }, + { + "epoch": 4.932780681203821, + "grad_norm": 0.25347909331321716, + "learning_rate": 5.217810376659249e-09, + "loss": 0.3511, + "step": 9124 + }, + { + "epoch": 4.9333213191566045, + "grad_norm": 0.24641470611095428, + "learning_rate": 5.131936656020409e-09, + "loss": 0.3514, + "step": 9125 + }, + { + "epoch": 4.933861957109389, + "grad_norm": 0.27080366015434265, + "learning_rate": 5.046775094613709e-09, + "loss": 0.3732, + "step": 9126 + }, + { + "epoch": 4.934402595062173, + "grad_norm": 0.2650505304336548, + "learning_rate": 4.962325704581661e-09, + "loss": 0.3553, + "step": 9127 + }, + { + "epoch": 4.934943233014957, + "grad_norm": 0.2590729594230652, + "learning_rate": 4.878588497964077e-09, + "loss": 0.3604, + "step": 9128 + }, + { + "epoch": 4.935483870967742, + "grad_norm": 0.2556917667388916, + "learning_rate": 4.795563486700849e-09, + "loss": 0.3628, + "step": 9129 + }, + { + "epoch": 4.936024508920526, + "grad_norm": 0.2660444676876068, + "learning_rate": 4.713250682629733e-09, + "loss": 0.3569, + "step": 9130 + }, + { + "epoch": 4.936565146873311, + "grad_norm": 0.27511194348335266, + "learning_rate": 4.631650097485784e-09, + "loss": 0.359, + "step": 9131 + }, + { + "epoch": 4.937105784826095, + "grad_norm": 0.2793119251728058, + "learning_rate": 4.550761742904142e-09, + "loss": 0.3719, + "step": 9132 + }, + { + "epoch": 4.937646422778879, + "grad_norm": 0.25426700711250305, + "learning_rate": 4.470585630417801e-09, + "loss": 0.3715, + "step": 9133 + }, + { + "epoch": 4.938187060731663, + "grad_norm": 0.2769455015659332, + "learning_rate": 4.391121771457618e-09, + "loss": 0.3939, + "step": 9134 + }, + { + "epoch": 4.938727698684447, + "grad_norm": 0.23823679983615875, + "learning_rate": 4.312370177353975e-09, + "loss": 0.3269, + "step": 9135 + }, + { + "epoch": 4.939268336637232, + "grad_norm": 0.25965994596481323, + "learning_rate": 4.234330859334557e-09, + "loss": 0.398, + "step": 9136 + }, + { + "epoch": 4.939808974590016, + "grad_norm": 0.2441171556711197, + "learning_rate": 4.157003828526573e-09, + "loss": 0.344, + "step": 9137 + }, + { + "epoch": 4.940349612542801, + "grad_norm": 0.26520293951034546, + "learning_rate": 4.08038909595454e-09, + "loss": 0.3712, + "step": 9138 + }, + { + "epoch": 4.940890250495585, + "grad_norm": 0.2556518316268921, + "learning_rate": 4.004486672542496e-09, + "loss": 0.3578, + "step": 9139 + }, + { + "epoch": 4.9414308884483695, + "grad_norm": 0.268156498670578, + "learning_rate": 3.929296569112895e-09, + "loss": 0.3801, + "step": 9140 + }, + { + "epoch": 4.9419715264011534, + "grad_norm": 0.24451026320457458, + "learning_rate": 3.854818796385495e-09, + "loss": 0.3474, + "step": 9141 + }, + { + "epoch": 4.942512164353937, + "grad_norm": 0.27529263496398926, + "learning_rate": 3.781053364979026e-09, + "loss": 0.3721, + "step": 9142 + }, + { + "epoch": 4.943052802306722, + "grad_norm": 0.27681949734687805, + "learning_rate": 3.708000285411739e-09, + "loss": 0.3746, + "step": 9143 + }, + { + "epoch": 4.943593440259506, + "grad_norm": 0.259630024433136, + "learning_rate": 3.6356595680986375e-09, + "loss": 0.3512, + "step": 9144 + }, + { + "epoch": 4.94413407821229, + "grad_norm": 0.2611832916736603, + "learning_rate": 3.5640312233548024e-09, + "loss": 0.3503, + "step": 9145 + }, + { + "epoch": 4.944674716165075, + "grad_norm": 0.26331421732902527, + "learning_rate": 3.493115261391511e-09, + "loss": 0.3478, + "step": 9146 + }, + { + "epoch": 4.945215354117859, + "grad_norm": 0.25908151268959045, + "learning_rate": 3.4229116923212293e-09, + "loss": 0.3655, + "step": 9147 + }, + { + "epoch": 4.9457559920706435, + "grad_norm": 0.23877955973148346, + "learning_rate": 3.3534205261526174e-09, + "loss": 0.3454, + "step": 9148 + }, + { + "epoch": 4.9462966300234275, + "grad_norm": 0.269363671541214, + "learning_rate": 3.284641772793862e-09, + "loss": 0.4178, + "step": 9149 + }, + { + "epoch": 4.946837267976212, + "grad_norm": 0.27045199275016785, + "learning_rate": 3.2165754420510063e-09, + "loss": 0.3526, + "step": 9150 + }, + { + "epoch": 4.947377905928996, + "grad_norm": 0.2779121696949005, + "learning_rate": 3.149221543629066e-09, + "loss": 0.3688, + "step": 9151 + }, + { + "epoch": 4.94791854388178, + "grad_norm": 0.26373735070228577, + "learning_rate": 3.0825800871314705e-09, + "loss": 0.341, + "step": 9152 + }, + { + "epoch": 4.948459181834565, + "grad_norm": 0.2782399356365204, + "learning_rate": 3.0166510820595074e-09, + "loss": 0.3608, + "step": 9153 + }, + { + "epoch": 4.948999819787349, + "grad_norm": 0.27947893738746643, + "learning_rate": 2.9514345378134357e-09, + "loss": 0.3699, + "step": 9154 + }, + { + "epoch": 4.949540457740134, + "grad_norm": 0.25066402554512024, + "learning_rate": 2.886930463691928e-09, + "loss": 0.3203, + "step": 9155 + }, + { + "epoch": 4.9500810956929175, + "grad_norm": 0.2683260440826416, + "learning_rate": 2.823138868890962e-09, + "loss": 0.3778, + "step": 9156 + }, + { + "epoch": 4.950621733645702, + "grad_norm": 0.2609219253063202, + "learning_rate": 2.760059762506595e-09, + "loss": 0.3574, + "step": 9157 + }, + { + "epoch": 4.951162371598486, + "grad_norm": 0.26026174426078796, + "learning_rate": 2.6976931535321884e-09, + "loss": 0.3393, + "step": 9158 + }, + { + "epoch": 4.95170300955127, + "grad_norm": 0.2527477741241455, + "learning_rate": 2.636039050860073e-09, + "loss": 0.3538, + "step": 9159 + }, + { + "epoch": 4.952243647504055, + "grad_norm": 0.2678411900997162, + "learning_rate": 2.5750974632809955e-09, + "loss": 0.3948, + "step": 9160 + }, + { + "epoch": 4.952784285456839, + "grad_norm": 0.2649003267288208, + "learning_rate": 2.514868399483561e-09, + "loss": 0.33, + "step": 9161 + }, + { + "epoch": 4.953324923409624, + "grad_norm": 0.2374754697084427, + "learning_rate": 2.4553518680547893e-09, + "loss": 0.3412, + "step": 9162 + }, + { + "epoch": 4.953865561362408, + "grad_norm": 0.2582480013370514, + "learning_rate": 2.3965478774812256e-09, + "loss": 0.3679, + "step": 9163 + }, + { + "epoch": 4.9544061993151916, + "grad_norm": 0.26913562417030334, + "learning_rate": 2.3384564361461635e-09, + "loss": 0.356, + "step": 9164 + }, + { + "epoch": 4.954946837267976, + "grad_norm": 0.24180622398853302, + "learning_rate": 2.2810775523329775e-09, + "loss": 0.3745, + "step": 9165 + }, + { + "epoch": 4.95548747522076, + "grad_norm": 0.2550312280654907, + "learning_rate": 2.2244112342223456e-09, + "loss": 0.3821, + "step": 9166 + }, + { + "epoch": 4.956028113173545, + "grad_norm": 0.2468712031841278, + "learning_rate": 2.168457489893916e-09, + "loss": 0.3558, + "step": 9167 + }, + { + "epoch": 4.956568751126329, + "grad_norm": 0.26661866903305054, + "learning_rate": 2.113216327324641e-09, + "loss": 0.3618, + "step": 9168 + }, + { + "epoch": 4.957109389079113, + "grad_norm": 0.2754621207714081, + "learning_rate": 2.058687754391553e-09, + "loss": 0.3653, + "step": 9169 + }, + { + "epoch": 4.957650027031898, + "grad_norm": 0.2769436836242676, + "learning_rate": 2.0048717788684335e-09, + "loss": 0.3803, + "step": 9170 + }, + { + "epoch": 4.958190664984682, + "grad_norm": 0.26223111152648926, + "learning_rate": 1.9517684084291442e-09, + "loss": 0.315, + "step": 9171 + }, + { + "epoch": 4.9587313029374664, + "grad_norm": 0.2742185890674591, + "learning_rate": 1.899377650644851e-09, + "loss": 0.404, + "step": 9172 + }, + { + "epoch": 4.95927194089025, + "grad_norm": 0.24863441288471222, + "learning_rate": 1.847699512985135e-09, + "loss": 0.3541, + "step": 9173 + }, + { + "epoch": 4.959812578843035, + "grad_norm": 0.2914765775203705, + "learning_rate": 1.7967340028179902e-09, + "loss": 0.3721, + "step": 9174 + }, + { + "epoch": 4.960353216795819, + "grad_norm": 0.2604866027832031, + "learning_rate": 1.746481127409827e-09, + "loss": 0.4057, + "step": 9175 + }, + { + "epoch": 4.960893854748603, + "grad_norm": 0.26574966311454773, + "learning_rate": 1.6969408939265796e-09, + "loss": 0.3536, + "step": 9176 + }, + { + "epoch": 4.961434492701388, + "grad_norm": 0.283233106136322, + "learning_rate": 1.648113309430932e-09, + "loss": 0.3746, + "step": 9177 + }, + { + "epoch": 4.961975130654172, + "grad_norm": 0.24985407292842865, + "learning_rate": 1.5999983808845376e-09, + "loss": 0.3275, + "step": 9178 + }, + { + "epoch": 4.9625157686069565, + "grad_norm": 0.26592645049095154, + "learning_rate": 1.5525961151474645e-09, + "loss": 0.3592, + "step": 9179 + }, + { + "epoch": 4.9630564065597405, + "grad_norm": 0.2678586542606354, + "learning_rate": 1.5059065189787502e-09, + "loss": 0.3529, + "step": 9180 + }, + { + "epoch": 4.963597044512525, + "grad_norm": 0.2732866108417511, + "learning_rate": 1.4599295990352924e-09, + "loss": 0.3571, + "step": 9181 + }, + { + "epoch": 4.964137682465309, + "grad_norm": 0.2503153681755066, + "learning_rate": 1.4146653618718475e-09, + "loss": 0.3396, + "step": 9182 + }, + { + "epoch": 4.964678320418093, + "grad_norm": 0.2600671052932739, + "learning_rate": 1.3701138139421422e-09, + "loss": 0.368, + "step": 9183 + }, + { + "epoch": 4.965218958370878, + "grad_norm": 0.2618968188762665, + "learning_rate": 1.3262749615988723e-09, + "loss": 0.3564, + "step": 9184 + }, + { + "epoch": 4.965759596323662, + "grad_norm": 0.26783448457717896, + "learning_rate": 1.2831488110920386e-09, + "loss": 0.3705, + "step": 9185 + }, + { + "epoch": 4.966300234276446, + "grad_norm": 0.27529674768447876, + "learning_rate": 1.2407353685706115e-09, + "loss": 0.3619, + "step": 9186 + }, + { + "epoch": 4.9668408722292305, + "grad_norm": 0.2847377359867096, + "learning_rate": 1.1990346400819752e-09, + "loss": 0.3629, + "step": 9187 + }, + { + "epoch": 4.9673815101820145, + "grad_norm": 0.2857806980609894, + "learning_rate": 1.1580466315713746e-09, + "loss": 0.3815, + "step": 9188 + }, + { + "epoch": 4.967922148134799, + "grad_norm": 0.2735769748687744, + "learning_rate": 1.1177713488830233e-09, + "loss": 0.3889, + "step": 9189 + }, + { + "epoch": 4.968462786087583, + "grad_norm": 0.2641699016094208, + "learning_rate": 1.07820879775955e-09, + "loss": 0.3761, + "step": 9190 + }, + { + "epoch": 4.969003424040368, + "grad_norm": 0.2488427609205246, + "learning_rate": 1.0393589838414431e-09, + "loss": 0.359, + "step": 9191 + }, + { + "epoch": 4.969544061993152, + "grad_norm": 0.26051753759384155, + "learning_rate": 1.0012219126676048e-09, + "loss": 0.3674, + "step": 9192 + }, + { + "epoch": 4.970084699945936, + "grad_norm": 0.2759767472743988, + "learning_rate": 9.637975896759077e-10, + "loss": 0.3853, + "step": 9193 + }, + { + "epoch": 4.970625337898721, + "grad_norm": 0.2598288655281067, + "learning_rate": 9.270860202020837e-10, + "loss": 0.3597, + "step": 9194 + }, + { + "epoch": 4.9711659758515045, + "grad_norm": 0.26341670751571655, + "learning_rate": 8.910872094802792e-10, + "loss": 0.3741, + "step": 9195 + }, + { + "epoch": 4.971706613804289, + "grad_norm": 0.2670850455760956, + "learning_rate": 8.558011626430551e-10, + "loss": 0.3818, + "step": 9196 + }, + { + "epoch": 4.972247251757073, + "grad_norm": 0.2679150700569153, + "learning_rate": 8.212278847224975e-10, + "loss": 0.39, + "step": 9197 + }, + { + "epoch": 4.972787889709858, + "grad_norm": 0.2692587375640869, + "learning_rate": 7.873673806463311e-10, + "loss": 0.3403, + "step": 9198 + }, + { + "epoch": 4.973328527662642, + "grad_norm": 0.2623067796230316, + "learning_rate": 7.542196552440262e-10, + "loss": 0.3545, + "step": 9199 + }, + { + "epoch": 4.973869165615426, + "grad_norm": 0.26583847403526306, + "learning_rate": 7.217847132401367e-10, + "loss": 0.363, + "step": 9200 + }, + { + "epoch": 4.974409803568211, + "grad_norm": 0.2767746150493622, + "learning_rate": 6.900625592604071e-10, + "loss": 0.3512, + "step": 9201 + }, + { + "epoch": 4.974950441520995, + "grad_norm": 0.28464779257774353, + "learning_rate": 6.590531978267756e-10, + "loss": 0.3609, + "step": 9202 + }, + { + "epoch": 4.975491079473779, + "grad_norm": 0.2627117931842804, + "learning_rate": 6.287566333612605e-10, + "loss": 0.3351, + "step": 9203 + }, + { + "epoch": 4.976031717426563, + "grad_norm": 0.26655125617980957, + "learning_rate": 5.991728701831845e-10, + "loss": 0.3772, + "step": 9204 + }, + { + "epoch": 4.976572355379347, + "grad_norm": 0.25739941000938416, + "learning_rate": 5.703019125102849e-10, + "loss": 0.3814, + "step": 9205 + }, + { + "epoch": 4.977112993332132, + "grad_norm": 0.2623976767063141, + "learning_rate": 5.421437644598237e-10, + "loss": 0.3908, + "step": 9206 + }, + { + "epoch": 4.977653631284916, + "grad_norm": 0.242452472448349, + "learning_rate": 5.146984300452574e-10, + "loss": 0.3146, + "step": 9207 + }, + { + "epoch": 4.978194269237701, + "grad_norm": 0.25939616560935974, + "learning_rate": 4.879659131806769e-10, + "loss": 0.3615, + "step": 9208 + }, + { + "epoch": 4.978734907190485, + "grad_norm": 0.24832409620285034, + "learning_rate": 4.619462176769229e-10, + "loss": 0.3905, + "step": 9209 + }, + { + "epoch": 4.979275545143269, + "grad_norm": 0.25697964429855347, + "learning_rate": 4.3663934724436086e-10, + "loss": 0.4133, + "step": 9210 + }, + { + "epoch": 4.9798161830960535, + "grad_norm": 0.25979509949684143, + "learning_rate": 4.120453054912155e-10, + "loss": 0.3705, + "step": 9211 + }, + { + "epoch": 4.980356821048837, + "grad_norm": 0.25299936532974243, + "learning_rate": 3.8816409592357106e-10, + "loss": 0.34, + "step": 9212 + }, + { + "epoch": 4.980897459001622, + "grad_norm": 0.2749847173690796, + "learning_rate": 3.649957219464817e-10, + "loss": 0.3785, + "step": 9213 + }, + { + "epoch": 4.981438096954406, + "grad_norm": 0.2649393379688263, + "learning_rate": 3.4254018686341596e-10, + "loss": 0.3681, + "step": 9214 + }, + { + "epoch": 4.981978734907191, + "grad_norm": 0.26384660601615906, + "learning_rate": 3.20797493876257e-10, + "loss": 0.3484, + "step": 9215 + }, + { + "epoch": 4.982519372859975, + "grad_norm": 0.26114729046821594, + "learning_rate": 2.9976764608474764e-10, + "loss": 0.3658, + "step": 9216 + }, + { + "epoch": 4.983060010812759, + "grad_norm": 0.27179184556007385, + "learning_rate": 2.79450646487045e-10, + "loss": 0.3604, + "step": 9217 + }, + { + "epoch": 4.9836006487655435, + "grad_norm": 0.26943889260292053, + "learning_rate": 2.598464979808313e-10, + "loss": 0.3804, + "step": 9218 + }, + { + "epoch": 4.9841412867183275, + "grad_norm": 0.2641843259334564, + "learning_rate": 2.4095520335998266e-10, + "loss": 0.3371, + "step": 9219 + }, + { + "epoch": 4.984681924671112, + "grad_norm": 0.28133559226989746, + "learning_rate": 2.227767653190105e-10, + "loss": 0.3596, + "step": 9220 + }, + { + "epoch": 4.985222562623896, + "grad_norm": 0.2666134536266327, + "learning_rate": 2.0531118644917524e-10, + "loss": 0.3377, + "step": 9221 + }, + { + "epoch": 4.98576320057668, + "grad_norm": 0.2675313353538513, + "learning_rate": 1.885584692407072e-10, + "loss": 0.3531, + "step": 9222 + }, + { + "epoch": 4.986303838529465, + "grad_norm": 0.24799907207489014, + "learning_rate": 1.725186160822512e-10, + "loss": 0.3539, + "step": 9223 + }, + { + "epoch": 4.986844476482249, + "grad_norm": 0.24728845059871674, + "learning_rate": 1.571916292608666e-10, + "loss": 0.3509, + "step": 9224 + }, + { + "epoch": 4.987385114435034, + "grad_norm": 0.25327473878860474, + "learning_rate": 1.4257751096202755e-10, + "loss": 0.3473, + "step": 9225 + }, + { + "epoch": 4.9879257523878175, + "grad_norm": 0.26934900879859924, + "learning_rate": 1.2867626326962258e-10, + "loss": 0.3581, + "step": 9226 + }, + { + "epoch": 4.9884663903406015, + "grad_norm": 0.27040383219718933, + "learning_rate": 1.1548788816428957e-10, + "loss": 0.403, + "step": 9227 + }, + { + "epoch": 4.989007028293386, + "grad_norm": 0.2764724791049957, + "learning_rate": 1.0301238752785659e-10, + "loss": 0.3967, + "step": 9228 + }, + { + "epoch": 4.98954766624617, + "grad_norm": 0.2611216902732849, + "learning_rate": 9.124976313834577e-11, + "loss": 0.3546, + "step": 9229 + }, + { + "epoch": 4.990088304198955, + "grad_norm": 0.26182636618614197, + "learning_rate": 8.020001667330412e-11, + "loss": 0.3885, + "step": 9230 + }, + { + "epoch": 4.990628942151739, + "grad_norm": 0.2629258632659912, + "learning_rate": 6.986314970758301e-11, + "loss": 0.3823, + "step": 9231 + }, + { + "epoch": 4.991169580104524, + "grad_norm": 0.26330283284187317, + "learning_rate": 6.02391637155586e-11, + "loss": 0.3729, + "step": 9232 + }, + { + "epoch": 4.991710218057308, + "grad_norm": 0.2656315267086029, + "learning_rate": 5.1328060069466554e-11, + "loss": 0.3384, + "step": 9233 + }, + { + "epoch": 4.9922508560100916, + "grad_norm": 0.2702144384384155, + "learning_rate": 4.3129840038846904e-11, + "loss": 0.337, + "step": 9234 + }, + { + "epoch": 4.992791493962876, + "grad_norm": 0.30379825830459595, + "learning_rate": 3.564450479387471e-11, + "loss": 0.3407, + "step": 9235 + }, + { + "epoch": 4.99333213191566, + "grad_norm": 0.25860539078712463, + "learning_rate": 2.8872055400919198e-11, + "loss": 0.3428, + "step": 9236 + }, + { + "epoch": 4.993872769868445, + "grad_norm": 0.2679060101509094, + "learning_rate": 2.2812492825874388e-11, + "loss": 0.3864, + "step": 9237 + }, + { + "epoch": 4.994413407821229, + "grad_norm": 0.27441951632499695, + "learning_rate": 1.74658179324938e-11, + "loss": 0.3757, + "step": 9238 + }, + { + "epoch": 4.994954045774014, + "grad_norm": 0.27093935012817383, + "learning_rate": 1.283203148350065e-11, + "loss": 0.3606, + "step": 9239 + }, + { + "epoch": 4.995494683726798, + "grad_norm": 0.2916185259819031, + "learning_rate": 8.911134139477639e-12, + "loss": 0.3834, + "step": 9240 + }, + { + "epoch": 4.996035321679582, + "grad_norm": 0.26088154315948486, + "learning_rate": 5.703126458866948e-12, + "loss": 0.3402, + "step": 9241 + }, + { + "epoch": 4.9965759596323664, + "grad_norm": 0.25968217849731445, + "learning_rate": 3.2080089001906845e-12, + "loss": 0.3528, + "step": 9242 + }, + { + "epoch": 4.99711659758515, + "grad_norm": 0.27741682529449463, + "learning_rate": 1.4257818181651062e-12, + "loss": 0.4118, + "step": 9243 + }, + { + "epoch": 4.997657235537934, + "grad_norm": 0.24423567950725555, + "learning_rate": 3.5644546703128557e-13, + "loss": 0.3267, + "step": 9244 + }, + { + "epoch": 4.998197873490719, + "grad_norm": 0.2597697377204895, + "learning_rate": 0.0, + "loss": 0.3684, + "step": 9245 + }, + { + "epoch": 4.998197873490719, + "step": 9245, + "total_flos": 1.4140474867580928e+16, + "train_loss": 0.4185003241859558, + "train_runtime": 192830.8421, + "train_samples_per_second": 4.604, + "train_steps_per_second": 0.048 + } + ], + "logging_steps": 1.0, + "max_steps": 9245, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.4140474867580928e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}