diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12957 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999729070712544, + "eval_steps": 500, + "global_step": 1845, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000541858574911948, + "grad_norm": 0.43530920147895813, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.8962, + "step": 1 + }, + { + "epoch": 0.001083717149823896, + "grad_norm": 0.6412757635116577, + "learning_rate": 4.000000000000001e-06, + "loss": 1.9589, + "step": 2 + }, + { + "epoch": 0.001625575724735844, + "grad_norm": 0.47238072752952576, + "learning_rate": 6e-06, + "loss": 1.9824, + "step": 3 + }, + { + "epoch": 0.002167434299647792, + "grad_norm": 0.4227930009365082, + "learning_rate": 8.000000000000001e-06, + "loss": 1.9207, + "step": 4 + }, + { + "epoch": 0.00270929287455974, + "grad_norm": 0.46128469705581665, + "learning_rate": 1e-05, + "loss": 1.9365, + "step": 5 + }, + { + "epoch": 0.003251151449471688, + "grad_norm": 0.5934392213821411, + "learning_rate": 1.2e-05, + "loss": 1.8873, + "step": 6 + }, + { + "epoch": 0.003793010024383636, + "grad_norm": 0.39757072925567627, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.8462, + "step": 7 + }, + { + "epoch": 0.004334868599295584, + "grad_norm": 0.3870404064655304, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.9607, + "step": 8 + }, + { + "epoch": 0.004876727174207532, + "grad_norm": 0.335467666387558, + "learning_rate": 1.8e-05, + "loss": 2.0285, + "step": 9 + }, + { + "epoch": 0.00541858574911948, + "grad_norm": 0.3384644091129303, + "learning_rate": 2e-05, + "loss": 1.9302, + "step": 10 + }, + { + "epoch": 0.005960444324031428, + "grad_norm": 0.29000476002693176, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.9069, + "step": 11 + }, + { + "epoch": 0.006502302898943376, + "grad_norm": 0.22133825719356537, + "learning_rate": 2.4e-05, + "loss": 1.8441, + "step": 12 + }, + { + "epoch": 0.007044161473855324, + "grad_norm": 0.2738000154495239, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.8498, + "step": 13 + }, + { + "epoch": 0.007586020048767272, + "grad_norm": 0.2651294469833374, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.8377, + "step": 14 + }, + { + "epoch": 0.00812787862367922, + "grad_norm": 0.3199697732925415, + "learning_rate": 3e-05, + "loss": 1.9074, + "step": 15 + }, + { + "epoch": 0.008669737198591168, + "grad_norm": 0.28017792105674744, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.8188, + "step": 16 + }, + { + "epoch": 0.009211595773503115, + "grad_norm": 0.2615055739879608, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.9161, + "step": 17 + }, + { + "epoch": 0.009753454348415064, + "grad_norm": 0.19840335845947266, + "learning_rate": 3.6e-05, + "loss": 1.8613, + "step": 18 + }, + { + "epoch": 0.010295312923327011, + "grad_norm": 0.14807601273059845, + "learning_rate": 3.8e-05, + "loss": 1.7986, + "step": 19 + }, + { + "epoch": 0.01083717149823896, + "grad_norm": 0.15882571041584015, + "learning_rate": 4e-05, + "loss": 1.8822, + "step": 20 + }, + { + "epoch": 0.011379030073150907, + "grad_norm": 0.1381528228521347, + "learning_rate": 4.2e-05, + "loss": 1.8117, + "step": 21 + }, + { + "epoch": 0.011920888648062856, + "grad_norm": 0.17201915383338928, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.8538, + "step": 22 + }, + { + "epoch": 0.012462747222974803, + "grad_norm": 0.17138561606407166, + "learning_rate": 4.600000000000001e-05, + "loss": 1.7731, + "step": 23 + }, + { + "epoch": 0.013004605797886752, + "grad_norm": 0.16845813393592834, + "learning_rate": 4.8e-05, + "loss": 1.8515, + "step": 24 + }, + { + "epoch": 0.0135464643727987, + "grad_norm": 0.13196821510791779, + "learning_rate": 5e-05, + "loss": 1.7643, + "step": 25 + }, + { + "epoch": 0.014088322947710648, + "grad_norm": 0.15632478892803192, + "learning_rate": 5.2000000000000004e-05, + "loss": 1.9114, + "step": 26 + }, + { + "epoch": 0.014630181522622595, + "grad_norm": 0.1211211085319519, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.8366, + "step": 27 + }, + { + "epoch": 0.015172040097534544, + "grad_norm": 0.11397989839315414, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.7707, + "step": 28 + }, + { + "epoch": 0.01571389867244649, + "grad_norm": 0.11910925805568695, + "learning_rate": 5.8e-05, + "loss": 1.8173, + "step": 29 + }, + { + "epoch": 0.01625575724735844, + "grad_norm": 0.12323752790689468, + "learning_rate": 6e-05, + "loss": 1.7767, + "step": 30 + }, + { + "epoch": 0.01679761582227039, + "grad_norm": 0.13322019577026367, + "learning_rate": 6.2e-05, + "loss": 1.8332, + "step": 31 + }, + { + "epoch": 0.017339474397182336, + "grad_norm": 0.12622202932834625, + "learning_rate": 6.400000000000001e-05, + "loss": 1.8144, + "step": 32 + }, + { + "epoch": 0.017881332972094283, + "grad_norm": 0.09827224165201187, + "learning_rate": 6.6e-05, + "loss": 1.8452, + "step": 33 + }, + { + "epoch": 0.01842319154700623, + "grad_norm": 0.10827549546957016, + "learning_rate": 6.800000000000001e-05, + "loss": 1.8212, + "step": 34 + }, + { + "epoch": 0.01896505012191818, + "grad_norm": 0.10839946568012238, + "learning_rate": 7e-05, + "loss": 1.7667, + "step": 35 + }, + { + "epoch": 0.019506908696830128, + "grad_norm": 0.09671394526958466, + "learning_rate": 7.2e-05, + "loss": 1.7415, + "step": 36 + }, + { + "epoch": 0.020048767271742075, + "grad_norm": 0.10029514878988266, + "learning_rate": 7.4e-05, + "loss": 1.748, + "step": 37 + }, + { + "epoch": 0.020590625846654022, + "grad_norm": 0.09756534546613693, + "learning_rate": 7.6e-05, + "loss": 1.8568, + "step": 38 + }, + { + "epoch": 0.02113248442156597, + "grad_norm": 0.10066157579421997, + "learning_rate": 7.800000000000001e-05, + "loss": 1.8028, + "step": 39 + }, + { + "epoch": 0.02167434299647792, + "grad_norm": 0.08360892534255981, + "learning_rate": 8e-05, + "loss": 1.7305, + "step": 40 + }, + { + "epoch": 0.022216201571389867, + "grad_norm": 0.09223756939172745, + "learning_rate": 8.2e-05, + "loss": 1.755, + "step": 41 + }, + { + "epoch": 0.022758060146301815, + "grad_norm": 0.09345275908708572, + "learning_rate": 8.4e-05, + "loss": 1.7815, + "step": 42 + }, + { + "epoch": 0.02329991872121376, + "grad_norm": 0.09920019656419754, + "learning_rate": 8.6e-05, + "loss": 1.8139, + "step": 43 + }, + { + "epoch": 0.023841777296125712, + "grad_norm": 0.09716581553220749, + "learning_rate": 8.800000000000001e-05, + "loss": 1.7593, + "step": 44 + }, + { + "epoch": 0.02438363587103766, + "grad_norm": 0.09761088341474533, + "learning_rate": 9e-05, + "loss": 1.8432, + "step": 45 + }, + { + "epoch": 0.024925494445949607, + "grad_norm": 0.10412333160638809, + "learning_rate": 9.200000000000001e-05, + "loss": 1.788, + "step": 46 + }, + { + "epoch": 0.025467353020861554, + "grad_norm": 0.09423588216304779, + "learning_rate": 9.4e-05, + "loss": 1.7802, + "step": 47 + }, + { + "epoch": 0.026009211595773504, + "grad_norm": 0.09519781917333603, + "learning_rate": 9.6e-05, + "loss": 1.7829, + "step": 48 + }, + { + "epoch": 0.02655107017068545, + "grad_norm": 0.09376217424869537, + "learning_rate": 9.8e-05, + "loss": 1.7308, + "step": 49 + }, + { + "epoch": 0.0270929287455974, + "grad_norm": 0.09659101814031601, + "learning_rate": 0.0001, + "loss": 1.7314, + "step": 50 + }, + { + "epoch": 0.027634787320509346, + "grad_norm": 0.10163891315460205, + "learning_rate": 0.0001, + "loss": 1.832, + "step": 51 + }, + { + "epoch": 0.028176645895421296, + "grad_norm": 0.10365260392427444, + "learning_rate": 0.0001, + "loss": 1.7844, + "step": 52 + }, + { + "epoch": 0.028718504470333243, + "grad_norm": 0.11085116118192673, + "learning_rate": 0.0001, + "loss": 1.7985, + "step": 53 + }, + { + "epoch": 0.02926036304524519, + "grad_norm": 0.0932428166270256, + "learning_rate": 0.0001, + "loss": 1.7481, + "step": 54 + }, + { + "epoch": 0.029802221620157138, + "grad_norm": 0.0988590344786644, + "learning_rate": 0.0001, + "loss": 1.7931, + "step": 55 + }, + { + "epoch": 0.03034408019506909, + "grad_norm": 0.10976176708936691, + "learning_rate": 0.0001, + "loss": 1.8151, + "step": 56 + }, + { + "epoch": 0.030885938769981035, + "grad_norm": 0.09677836298942566, + "learning_rate": 0.0001, + "loss": 1.7983, + "step": 57 + }, + { + "epoch": 0.03142779734489298, + "grad_norm": 0.11221853643655777, + "learning_rate": 0.0001, + "loss": 1.7988, + "step": 58 + }, + { + "epoch": 0.03196965591980493, + "grad_norm": 0.09712743014097214, + "learning_rate": 0.0001, + "loss": 1.7875, + "step": 59 + }, + { + "epoch": 0.03251151449471688, + "grad_norm": 0.11139461398124695, + "learning_rate": 0.0001, + "loss": 1.6759, + "step": 60 + }, + { + "epoch": 0.03305337306962883, + "grad_norm": 0.10642059892416, + "learning_rate": 0.0001, + "loss": 1.7627, + "step": 61 + }, + { + "epoch": 0.03359523164454078, + "grad_norm": 0.1016763374209404, + "learning_rate": 0.0001, + "loss": 1.7372, + "step": 62 + }, + { + "epoch": 0.03413709021945272, + "grad_norm": 0.10269554704427719, + "learning_rate": 0.0001, + "loss": 1.7287, + "step": 63 + }, + { + "epoch": 0.03467894879436467, + "grad_norm": 0.1037546694278717, + "learning_rate": 0.0001, + "loss": 1.7356, + "step": 64 + }, + { + "epoch": 0.035220807369276616, + "grad_norm": 0.10047534108161926, + "learning_rate": 0.0001, + "loss": 1.7004, + "step": 65 + }, + { + "epoch": 0.03576266594418857, + "grad_norm": 0.09919369220733643, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 66 + }, + { + "epoch": 0.03630452451910052, + "grad_norm": 0.0938708484172821, + "learning_rate": 0.0001, + "loss": 1.6959, + "step": 67 + }, + { + "epoch": 0.03684638309401246, + "grad_norm": 0.10731608420610428, + "learning_rate": 0.0001, + "loss": 1.7493, + "step": 68 + }, + { + "epoch": 0.03738824166892441, + "grad_norm": 0.09051501005887985, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 69 + }, + { + "epoch": 0.03793010024383636, + "grad_norm": 0.10202842205762863, + "learning_rate": 0.0001, + "loss": 1.7613, + "step": 70 + }, + { + "epoch": 0.038471958818748306, + "grad_norm": 0.09244200587272644, + "learning_rate": 0.0001, + "loss": 1.7076, + "step": 71 + }, + { + "epoch": 0.039013817393660256, + "grad_norm": 0.09826190024614334, + "learning_rate": 0.0001, + "loss": 1.7634, + "step": 72 + }, + { + "epoch": 0.0395556759685722, + "grad_norm": 0.10156849771738052, + "learning_rate": 0.0001, + "loss": 1.7772, + "step": 73 + }, + { + "epoch": 0.04009753454348415, + "grad_norm": 0.10479609668254852, + "learning_rate": 0.0001, + "loss": 1.7278, + "step": 74 + }, + { + "epoch": 0.0406393931183961, + "grad_norm": 0.09295053780078888, + "learning_rate": 0.0001, + "loss": 1.7415, + "step": 75 + }, + { + "epoch": 0.041181251693308045, + "grad_norm": 0.10030414164066315, + "learning_rate": 0.0001, + "loss": 1.7147, + "step": 76 + }, + { + "epoch": 0.041723110268219996, + "grad_norm": 0.09261474758386612, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 77 + }, + { + "epoch": 0.04226496884313194, + "grad_norm": 0.09939026832580566, + "learning_rate": 0.0001, + "loss": 1.7681, + "step": 78 + }, + { + "epoch": 0.04280682741804389, + "grad_norm": 0.09494777768850327, + "learning_rate": 0.0001, + "loss": 1.7222, + "step": 79 + }, + { + "epoch": 0.04334868599295584, + "grad_norm": 0.0940719023346901, + "learning_rate": 0.0001, + "loss": 1.7207, + "step": 80 + }, + { + "epoch": 0.043890544567867784, + "grad_norm": 0.11140227317810059, + "learning_rate": 0.0001, + "loss": 1.7504, + "step": 81 + }, + { + "epoch": 0.044432403142779735, + "grad_norm": 0.08920968323945999, + "learning_rate": 0.0001, + "loss": 1.7073, + "step": 82 + }, + { + "epoch": 0.044974261717691685, + "grad_norm": 0.10845290124416351, + "learning_rate": 0.0001, + "loss": 1.7558, + "step": 83 + }, + { + "epoch": 0.04551612029260363, + "grad_norm": 0.08898979425430298, + "learning_rate": 0.0001, + "loss": 1.7082, + "step": 84 + }, + { + "epoch": 0.04605797886751558, + "grad_norm": 0.1022668331861496, + "learning_rate": 0.0001, + "loss": 1.7659, + "step": 85 + }, + { + "epoch": 0.04659983744242752, + "grad_norm": 0.09408645331859589, + "learning_rate": 0.0001, + "loss": 1.6786, + "step": 86 + }, + { + "epoch": 0.047141696017339474, + "grad_norm": 0.0999981164932251, + "learning_rate": 0.0001, + "loss": 1.7425, + "step": 87 + }, + { + "epoch": 0.047683554592251425, + "grad_norm": 0.1010698676109314, + "learning_rate": 0.0001, + "loss": 1.7288, + "step": 88 + }, + { + "epoch": 0.04822541316716337, + "grad_norm": 0.0986516997218132, + "learning_rate": 0.0001, + "loss": 1.7166, + "step": 89 + }, + { + "epoch": 0.04876727174207532, + "grad_norm": 0.11564943194389343, + "learning_rate": 0.0001, + "loss": 1.7131, + "step": 90 + }, + { + "epoch": 0.04930913031698727, + "grad_norm": 0.10187874734401703, + "learning_rate": 0.0001, + "loss": 1.7864, + "step": 91 + }, + { + "epoch": 0.04985098889189921, + "grad_norm": 0.11014661937952042, + "learning_rate": 0.0001, + "loss": 1.7228, + "step": 92 + }, + { + "epoch": 0.050392847466811164, + "grad_norm": 0.09390980005264282, + "learning_rate": 0.0001, + "loss": 1.6759, + "step": 93 + }, + { + "epoch": 0.05093470604172311, + "grad_norm": 0.10052553564310074, + "learning_rate": 0.0001, + "loss": 1.6606, + "step": 94 + }, + { + "epoch": 0.05147656461663506, + "grad_norm": 0.10119052976369858, + "learning_rate": 0.0001, + "loss": 1.7306, + "step": 95 + }, + { + "epoch": 0.05201842319154701, + "grad_norm": 0.10390573740005493, + "learning_rate": 0.0001, + "loss": 1.7791, + "step": 96 + }, + { + "epoch": 0.05256028176645895, + "grad_norm": 0.09522074460983276, + "learning_rate": 0.0001, + "loss": 1.7072, + "step": 97 + }, + { + "epoch": 0.0531021403413709, + "grad_norm": 0.11148355901241302, + "learning_rate": 0.0001, + "loss": 1.7544, + "step": 98 + }, + { + "epoch": 0.05364399891628285, + "grad_norm": 0.09624631702899933, + "learning_rate": 0.0001, + "loss": 1.7121, + "step": 99 + }, + { + "epoch": 0.0541858574911948, + "grad_norm": 0.09778071939945221, + "learning_rate": 0.0001, + "loss": 1.7695, + "step": 100 + }, + { + "epoch": 0.05472771606610675, + "grad_norm": 0.11497867852449417, + "learning_rate": 0.0001, + "loss": 1.7457, + "step": 101 + }, + { + "epoch": 0.05526957464101869, + "grad_norm": 0.09833558648824692, + "learning_rate": 0.0001, + "loss": 1.7046, + "step": 102 + }, + { + "epoch": 0.05581143321593064, + "grad_norm": 0.10873506963253021, + "learning_rate": 0.0001, + "loss": 1.7898, + "step": 103 + }, + { + "epoch": 0.05635329179084259, + "grad_norm": 0.10418610274791718, + "learning_rate": 0.0001, + "loss": 1.6935, + "step": 104 + }, + { + "epoch": 0.056895150365754536, + "grad_norm": 0.10068942606449127, + "learning_rate": 0.0001, + "loss": 1.6845, + "step": 105 + }, + { + "epoch": 0.05743700894066649, + "grad_norm": 0.11650339514017105, + "learning_rate": 0.0001, + "loss": 1.7883, + "step": 106 + }, + { + "epoch": 0.05797886751557844, + "grad_norm": 0.09679100662469864, + "learning_rate": 0.0001, + "loss": 1.7633, + "step": 107 + }, + { + "epoch": 0.05852072609049038, + "grad_norm": 0.10021305084228516, + "learning_rate": 0.0001, + "loss": 1.6978, + "step": 108 + }, + { + "epoch": 0.05906258466540233, + "grad_norm": 0.10400475561618805, + "learning_rate": 0.0001, + "loss": 1.8073, + "step": 109 + }, + { + "epoch": 0.059604443240314275, + "grad_norm": 0.11318189650774002, + "learning_rate": 0.0001, + "loss": 1.7629, + "step": 110 + }, + { + "epoch": 0.060146301815226226, + "grad_norm": 0.09900004416704178, + "learning_rate": 0.0001, + "loss": 1.6763, + "step": 111 + }, + { + "epoch": 0.06068816039013818, + "grad_norm": 0.09888526797294617, + "learning_rate": 0.0001, + "loss": 1.7403, + "step": 112 + }, + { + "epoch": 0.06123001896505012, + "grad_norm": 0.09070870280265808, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 113 + }, + { + "epoch": 0.06177187753996207, + "grad_norm": 0.10428040474653244, + "learning_rate": 0.0001, + "loss": 1.7713, + "step": 114 + }, + { + "epoch": 0.062313736114874015, + "grad_norm": 0.09860703349113464, + "learning_rate": 0.0001, + "loss": 1.6457, + "step": 115 + }, + { + "epoch": 0.06285559468978597, + "grad_norm": 0.09450509399175644, + "learning_rate": 0.0001, + "loss": 1.7076, + "step": 116 + }, + { + "epoch": 0.06339745326469791, + "grad_norm": 0.09789243340492249, + "learning_rate": 0.0001, + "loss": 1.705, + "step": 117 + }, + { + "epoch": 0.06393931183960987, + "grad_norm": 0.1094205304980278, + "learning_rate": 0.0001, + "loss": 1.7197, + "step": 118 + }, + { + "epoch": 0.06448117041452181, + "grad_norm": 0.09620770812034607, + "learning_rate": 0.0001, + "loss": 1.7808, + "step": 119 + }, + { + "epoch": 0.06502302898943375, + "grad_norm": 0.10695544630289078, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 120 + }, + { + "epoch": 0.06556488756434571, + "grad_norm": 0.09900528192520142, + "learning_rate": 0.0001, + "loss": 1.7043, + "step": 121 + }, + { + "epoch": 0.06610674613925765, + "grad_norm": 0.10099681466817856, + "learning_rate": 0.0001, + "loss": 1.7315, + "step": 122 + }, + { + "epoch": 0.0666486047141696, + "grad_norm": 0.09247930347919464, + "learning_rate": 0.0001, + "loss": 1.6921, + "step": 123 + }, + { + "epoch": 0.06719046328908156, + "grad_norm": 0.1142578274011612, + "learning_rate": 0.0001, + "loss": 1.7487, + "step": 124 + }, + { + "epoch": 0.0677323218639935, + "grad_norm": 0.12242250144481659, + "learning_rate": 0.0001, + "loss": 1.7126, + "step": 125 + }, + { + "epoch": 0.06827418043890544, + "grad_norm": 0.10806268453598022, + "learning_rate": 0.0001, + "loss": 1.732, + "step": 126 + }, + { + "epoch": 0.06881603901381739, + "grad_norm": 0.09226547926664352, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 127 + }, + { + "epoch": 0.06935789758872934, + "grad_norm": 0.09697569161653519, + "learning_rate": 0.0001, + "loss": 1.6693, + "step": 128 + }, + { + "epoch": 0.06989975616364129, + "grad_norm": 0.10045196861028671, + "learning_rate": 0.0001, + "loss": 1.7323, + "step": 129 + }, + { + "epoch": 0.07044161473855323, + "grad_norm": 0.1046331375837326, + "learning_rate": 0.0001, + "loss": 1.7106, + "step": 130 + }, + { + "epoch": 0.07098347331346519, + "grad_norm": 0.10865035653114319, + "learning_rate": 0.0001, + "loss": 1.7459, + "step": 131 + }, + { + "epoch": 0.07152533188837713, + "grad_norm": 0.1041274145245552, + "learning_rate": 0.0001, + "loss": 1.776, + "step": 132 + }, + { + "epoch": 0.07206719046328908, + "grad_norm": 0.10414282977581024, + "learning_rate": 0.0001, + "loss": 1.7071, + "step": 133 + }, + { + "epoch": 0.07260904903820103, + "grad_norm": 0.12221338599920273, + "learning_rate": 0.0001, + "loss": 1.6995, + "step": 134 + }, + { + "epoch": 0.07315090761311298, + "grad_norm": 0.10498306155204773, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 135 + }, + { + "epoch": 0.07369276618802492, + "grad_norm": 0.13094539940357208, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 136 + }, + { + "epoch": 0.07423462476293688, + "grad_norm": 0.120379239320755, + "learning_rate": 0.0001, + "loss": 1.6664, + "step": 137 + }, + { + "epoch": 0.07477648333784882, + "grad_norm": 0.12071491032838821, + "learning_rate": 0.0001, + "loss": 1.7722, + "step": 138 + }, + { + "epoch": 0.07531834191276077, + "grad_norm": 0.1514299064874649, + "learning_rate": 0.0001, + "loss": 1.6833, + "step": 139 + }, + { + "epoch": 0.07586020048767272, + "grad_norm": 0.10760881751775742, + "learning_rate": 0.0001, + "loss": 1.6733, + "step": 140 + }, + { + "epoch": 0.07640205906258467, + "grad_norm": 0.10171575099229813, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 141 + }, + { + "epoch": 0.07694391763749661, + "grad_norm": 0.11112318933010101, + "learning_rate": 0.0001, + "loss": 1.6997, + "step": 142 + }, + { + "epoch": 0.07748577621240856, + "grad_norm": 0.10578908771276474, + "learning_rate": 0.0001, + "loss": 1.7001, + "step": 143 + }, + { + "epoch": 0.07802763478732051, + "grad_norm": 0.10570411384105682, + "learning_rate": 0.0001, + "loss": 1.7551, + "step": 144 + }, + { + "epoch": 0.07856949336223246, + "grad_norm": 0.09980179369449615, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 145 + }, + { + "epoch": 0.0791113519371444, + "grad_norm": 0.10519526153802872, + "learning_rate": 0.0001, + "loss": 1.6707, + "step": 146 + }, + { + "epoch": 0.07965321051205636, + "grad_norm": 0.11583776772022247, + "learning_rate": 0.0001, + "loss": 1.7332, + "step": 147 + }, + { + "epoch": 0.0801950690869683, + "grad_norm": 0.10443723201751709, + "learning_rate": 0.0001, + "loss": 1.7332, + "step": 148 + }, + { + "epoch": 0.08073692766188025, + "grad_norm": 0.11888331174850464, + "learning_rate": 0.0001, + "loss": 1.7212, + "step": 149 + }, + { + "epoch": 0.0812787862367922, + "grad_norm": 0.0985705554485321, + "learning_rate": 0.0001, + "loss": 1.7169, + "step": 150 + }, + { + "epoch": 0.08182064481170415, + "grad_norm": 0.11832987517118454, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 151 + }, + { + "epoch": 0.08236250338661609, + "grad_norm": 0.10486771911382675, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 152 + }, + { + "epoch": 0.08290436196152805, + "grad_norm": 0.12332990765571594, + "learning_rate": 0.0001, + "loss": 1.723, + "step": 153 + }, + { + "epoch": 0.08344622053643999, + "grad_norm": 0.1274755448102951, + "learning_rate": 0.0001, + "loss": 1.7229, + "step": 154 + }, + { + "epoch": 0.08398807911135193, + "grad_norm": 0.10862415283918381, + "learning_rate": 0.0001, + "loss": 1.7341, + "step": 155 + }, + { + "epoch": 0.08452993768626388, + "grad_norm": 0.12293940037488937, + "learning_rate": 0.0001, + "loss": 1.7031, + "step": 156 + }, + { + "epoch": 0.08507179626117584, + "grad_norm": 0.11512468010187149, + "learning_rate": 0.0001, + "loss": 1.7219, + "step": 157 + }, + { + "epoch": 0.08561365483608778, + "grad_norm": 0.10460253059864044, + "learning_rate": 0.0001, + "loss": 1.6905, + "step": 158 + }, + { + "epoch": 0.08615551341099972, + "grad_norm": 0.12816384434700012, + "learning_rate": 0.0001, + "loss": 1.6647, + "step": 159 + }, + { + "epoch": 0.08669737198591168, + "grad_norm": 0.09755271673202515, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 160 + }, + { + "epoch": 0.08723923056082362, + "grad_norm": 0.1255766898393631, + "learning_rate": 0.0001, + "loss": 1.7978, + "step": 161 + }, + { + "epoch": 0.08778108913573557, + "grad_norm": 0.10234647989273071, + "learning_rate": 0.0001, + "loss": 1.6583, + "step": 162 + }, + { + "epoch": 0.08832294771064753, + "grad_norm": 0.1072743609547615, + "learning_rate": 0.0001, + "loss": 1.7496, + "step": 163 + }, + { + "epoch": 0.08886480628555947, + "grad_norm": 0.09563694894313812, + "learning_rate": 0.0001, + "loss": 1.7028, + "step": 164 + }, + { + "epoch": 0.08940666486047141, + "grad_norm": 0.10931570082902908, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 165 + }, + { + "epoch": 0.08994852343538337, + "grad_norm": 0.0974336788058281, + "learning_rate": 0.0001, + "loss": 1.7089, + "step": 166 + }, + { + "epoch": 0.09049038201029531, + "grad_norm": 0.11343063414096832, + "learning_rate": 0.0001, + "loss": 1.7259, + "step": 167 + }, + { + "epoch": 0.09103224058520726, + "grad_norm": 0.09566744416952133, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 168 + }, + { + "epoch": 0.09157409916011922, + "grad_norm": 0.10925479978322983, + "learning_rate": 0.0001, + "loss": 1.7122, + "step": 169 + }, + { + "epoch": 0.09211595773503116, + "grad_norm": 0.1133953258395195, + "learning_rate": 0.0001, + "loss": 1.731, + "step": 170 + }, + { + "epoch": 0.0926578163099431, + "grad_norm": 0.10159795731306076, + "learning_rate": 0.0001, + "loss": 1.7248, + "step": 171 + }, + { + "epoch": 0.09319967488485505, + "grad_norm": 0.09628516435623169, + "learning_rate": 0.0001, + "loss": 1.6852, + "step": 172 + }, + { + "epoch": 0.093741533459767, + "grad_norm": 0.09967010468244553, + "learning_rate": 0.0001, + "loss": 1.7118, + "step": 173 + }, + { + "epoch": 0.09428339203467895, + "grad_norm": 0.09833619743585587, + "learning_rate": 0.0001, + "loss": 1.673, + "step": 174 + }, + { + "epoch": 0.09482525060959089, + "grad_norm": 0.10003640502691269, + "learning_rate": 0.0001, + "loss": 1.6846, + "step": 175 + }, + { + "epoch": 0.09536710918450285, + "grad_norm": 0.09707822650671005, + "learning_rate": 0.0001, + "loss": 1.6911, + "step": 176 + }, + { + "epoch": 0.09590896775941479, + "grad_norm": 0.09149082750082016, + "learning_rate": 0.0001, + "loss": 1.6732, + "step": 177 + }, + { + "epoch": 0.09645082633432674, + "grad_norm": 0.11072653532028198, + "learning_rate": 0.0001, + "loss": 1.7419, + "step": 178 + }, + { + "epoch": 0.0969926849092387, + "grad_norm": 0.09115342795848846, + "learning_rate": 0.0001, + "loss": 1.5458, + "step": 179 + }, + { + "epoch": 0.09753454348415064, + "grad_norm": 0.0993184745311737, + "learning_rate": 0.0001, + "loss": 1.5778, + "step": 180 + }, + { + "epoch": 0.09807640205906258, + "grad_norm": 0.09827783703804016, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 181 + }, + { + "epoch": 0.09861826063397454, + "grad_norm": 0.1036309152841568, + "learning_rate": 0.0001, + "loss": 1.6966, + "step": 182 + }, + { + "epoch": 0.09916011920888648, + "grad_norm": 0.10436933487653732, + "learning_rate": 0.0001, + "loss": 1.6565, + "step": 183 + }, + { + "epoch": 0.09970197778379843, + "grad_norm": 0.10254193842411041, + "learning_rate": 0.0001, + "loss": 1.776, + "step": 184 + }, + { + "epoch": 0.10024383635871037, + "grad_norm": 0.10822900384664536, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 185 + }, + { + "epoch": 0.10078569493362233, + "grad_norm": 0.1024482473731041, + "learning_rate": 0.0001, + "loss": 1.7474, + "step": 186 + }, + { + "epoch": 0.10132755350853427, + "grad_norm": 0.10238222032785416, + "learning_rate": 0.0001, + "loss": 1.7035, + "step": 187 + }, + { + "epoch": 0.10186941208344621, + "grad_norm": 0.1048850268125534, + "learning_rate": 0.0001, + "loss": 1.6729, + "step": 188 + }, + { + "epoch": 0.10241127065835817, + "grad_norm": 0.10192476212978363, + "learning_rate": 0.0001, + "loss": 1.6719, + "step": 189 + }, + { + "epoch": 0.10295312923327012, + "grad_norm": 0.10868800431489944, + "learning_rate": 0.0001, + "loss": 1.7563, + "step": 190 + }, + { + "epoch": 0.10349498780818206, + "grad_norm": 0.10962677747011185, + "learning_rate": 0.0001, + "loss": 1.6806, + "step": 191 + }, + { + "epoch": 0.10403684638309402, + "grad_norm": 0.09616486728191376, + "learning_rate": 0.0001, + "loss": 1.7144, + "step": 192 + }, + { + "epoch": 0.10457870495800596, + "grad_norm": 0.1281924992799759, + "learning_rate": 0.0001, + "loss": 1.6827, + "step": 193 + }, + { + "epoch": 0.1051205635329179, + "grad_norm": 0.11889411509037018, + "learning_rate": 0.0001, + "loss": 1.7362, + "step": 194 + }, + { + "epoch": 0.10566242210782986, + "grad_norm": 0.1125204935669899, + "learning_rate": 0.0001, + "loss": 1.7325, + "step": 195 + }, + { + "epoch": 0.1062042806827418, + "grad_norm": 0.12667258083820343, + "learning_rate": 0.0001, + "loss": 1.7177, + "step": 196 + }, + { + "epoch": 0.10674613925765375, + "grad_norm": 0.09924928098917007, + "learning_rate": 0.0001, + "loss": 1.6907, + "step": 197 + }, + { + "epoch": 0.1072879978325657, + "grad_norm": 0.11108226329088211, + "learning_rate": 0.0001, + "loss": 1.6793, + "step": 198 + }, + { + "epoch": 0.10782985640747765, + "grad_norm": 0.09894610941410065, + "learning_rate": 0.0001, + "loss": 1.7, + "step": 199 + }, + { + "epoch": 0.1083717149823896, + "grad_norm": 0.12750208377838135, + "learning_rate": 0.0001, + "loss": 1.649, + "step": 200 + }, + { + "epoch": 0.10891357355730154, + "grad_norm": 0.10455714166164398, + "learning_rate": 0.0001, + "loss": 1.7068, + "step": 201 + }, + { + "epoch": 0.1094554321322135, + "grad_norm": 0.1203424334526062, + "learning_rate": 0.0001, + "loss": 1.7194, + "step": 202 + }, + { + "epoch": 0.10999729070712544, + "grad_norm": 0.11161962151527405, + "learning_rate": 0.0001, + "loss": 1.7124, + "step": 203 + }, + { + "epoch": 0.11053914928203738, + "grad_norm": 0.11139895021915436, + "learning_rate": 0.0001, + "loss": 1.642, + "step": 204 + }, + { + "epoch": 0.11108100785694934, + "grad_norm": 0.14790917932987213, + "learning_rate": 0.0001, + "loss": 1.699, + "step": 205 + }, + { + "epoch": 0.11162286643186128, + "grad_norm": 0.1384461224079132, + "learning_rate": 0.0001, + "loss": 1.719, + "step": 206 + }, + { + "epoch": 0.11216472500677323, + "grad_norm": 0.11863648891448975, + "learning_rate": 0.0001, + "loss": 1.7007, + "step": 207 + }, + { + "epoch": 0.11270658358168519, + "grad_norm": 0.1461704522371292, + "learning_rate": 0.0001, + "loss": 1.6736, + "step": 208 + }, + { + "epoch": 0.11324844215659713, + "grad_norm": 0.11320877820253372, + "learning_rate": 0.0001, + "loss": 1.7092, + "step": 209 + }, + { + "epoch": 0.11379030073150907, + "grad_norm": 0.1436343491077423, + "learning_rate": 0.0001, + "loss": 1.6807, + "step": 210 + }, + { + "epoch": 0.11433215930642103, + "grad_norm": 0.12915484607219696, + "learning_rate": 0.0001, + "loss": 1.7741, + "step": 211 + }, + { + "epoch": 0.11487401788133297, + "grad_norm": 0.16453833878040314, + "learning_rate": 0.0001, + "loss": 1.7416, + "step": 212 + }, + { + "epoch": 0.11541587645624492, + "grad_norm": 0.10235466808080673, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 213 + }, + { + "epoch": 0.11595773503115687, + "grad_norm": 0.13166211545467377, + "learning_rate": 0.0001, + "loss": 1.7094, + "step": 214 + }, + { + "epoch": 0.11649959360606882, + "grad_norm": 0.1139000654220581, + "learning_rate": 0.0001, + "loss": 1.6567, + "step": 215 + }, + { + "epoch": 0.11704145218098076, + "grad_norm": 0.1456412523984909, + "learning_rate": 0.0001, + "loss": 1.6858, + "step": 216 + }, + { + "epoch": 0.1175833107558927, + "grad_norm": 0.10928891599178314, + "learning_rate": 0.0001, + "loss": 1.7381, + "step": 217 + }, + { + "epoch": 0.11812516933080466, + "grad_norm": 0.11825280636548996, + "learning_rate": 0.0001, + "loss": 1.5774, + "step": 218 + }, + { + "epoch": 0.11866702790571661, + "grad_norm": 0.12118617445230484, + "learning_rate": 0.0001, + "loss": 1.7545, + "step": 219 + }, + { + "epoch": 0.11920888648062855, + "grad_norm": 0.13101591169834137, + "learning_rate": 0.0001, + "loss": 1.7525, + "step": 220 + }, + { + "epoch": 0.11975074505554051, + "grad_norm": 0.09695487469434738, + "learning_rate": 0.0001, + "loss": 1.6776, + "step": 221 + }, + { + "epoch": 0.12029260363045245, + "grad_norm": 0.11916449666023254, + "learning_rate": 0.0001, + "loss": 1.7396, + "step": 222 + }, + { + "epoch": 0.1208344622053644, + "grad_norm": 0.10929539054632187, + "learning_rate": 0.0001, + "loss": 1.77, + "step": 223 + }, + { + "epoch": 0.12137632078027635, + "grad_norm": 0.10067623108625412, + "learning_rate": 0.0001, + "loss": 1.6743, + "step": 224 + }, + { + "epoch": 0.1219181793551883, + "grad_norm": 0.12053167074918747, + "learning_rate": 0.0001, + "loss": 1.6709, + "step": 225 + }, + { + "epoch": 0.12246003793010024, + "grad_norm": 0.12418187409639359, + "learning_rate": 0.0001, + "loss": 1.7093, + "step": 226 + }, + { + "epoch": 0.1230018965050122, + "grad_norm": 0.1266579031944275, + "learning_rate": 0.0001, + "loss": 1.6896, + "step": 227 + }, + { + "epoch": 0.12354375507992414, + "grad_norm": 0.11025568097829819, + "learning_rate": 0.0001, + "loss": 1.6628, + "step": 228 + }, + { + "epoch": 0.12408561365483609, + "grad_norm": 0.11573028564453125, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 229 + }, + { + "epoch": 0.12462747222974803, + "grad_norm": 0.11125994473695755, + "learning_rate": 0.0001, + "loss": 1.6752, + "step": 230 + }, + { + "epoch": 0.12516933080465997, + "grad_norm": 0.1128525361418724, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 231 + }, + { + "epoch": 0.12571118937957193, + "grad_norm": 0.09718891233205795, + "learning_rate": 0.0001, + "loss": 1.6745, + "step": 232 + }, + { + "epoch": 0.1262530479544839, + "grad_norm": 0.1529717892408371, + "learning_rate": 0.0001, + "loss": 1.7352, + "step": 233 + }, + { + "epoch": 0.12679490652939582, + "grad_norm": 0.0977558046579361, + "learning_rate": 0.0001, + "loss": 1.7009, + "step": 234 + }, + { + "epoch": 0.12733676510430778, + "grad_norm": 0.14934347569942474, + "learning_rate": 0.0001, + "loss": 1.667, + "step": 235 + }, + { + "epoch": 0.12787862367921973, + "grad_norm": 0.09760584682226181, + "learning_rate": 0.0001, + "loss": 1.6608, + "step": 236 + }, + { + "epoch": 0.12842048225413166, + "grad_norm": 0.11174532771110535, + "learning_rate": 0.0001, + "loss": 1.739, + "step": 237 + }, + { + "epoch": 0.12896234082904362, + "grad_norm": 0.11080338060855865, + "learning_rate": 0.0001, + "loss": 1.7127, + "step": 238 + }, + { + "epoch": 0.12950419940395558, + "grad_norm": 0.10834498703479767, + "learning_rate": 0.0001, + "loss": 1.6654, + "step": 239 + }, + { + "epoch": 0.1300460579788675, + "grad_norm": 0.11970813572406769, + "learning_rate": 0.0001, + "loss": 1.7051, + "step": 240 + }, + { + "epoch": 0.13058791655377947, + "grad_norm": 0.11999791115522385, + "learning_rate": 0.0001, + "loss": 1.6949, + "step": 241 + }, + { + "epoch": 0.13112977512869142, + "grad_norm": 0.10226437449455261, + "learning_rate": 0.0001, + "loss": 1.6928, + "step": 242 + }, + { + "epoch": 0.13167163370360335, + "grad_norm": 0.11835068464279175, + "learning_rate": 0.0001, + "loss": 1.6934, + "step": 243 + }, + { + "epoch": 0.1322134922785153, + "grad_norm": 0.10222174972295761, + "learning_rate": 0.0001, + "loss": 1.7147, + "step": 244 + }, + { + "epoch": 0.13275535085342727, + "grad_norm": 0.09475545585155487, + "learning_rate": 0.0001, + "loss": 1.6798, + "step": 245 + }, + { + "epoch": 0.1332972094283392, + "grad_norm": 0.1266164779663086, + "learning_rate": 0.0001, + "loss": 1.7424, + "step": 246 + }, + { + "epoch": 0.13383906800325115, + "grad_norm": 0.10481266677379608, + "learning_rate": 0.0001, + "loss": 1.6616, + "step": 247 + }, + { + "epoch": 0.1343809265781631, + "grad_norm": 0.10396500676870346, + "learning_rate": 0.0001, + "loss": 1.7314, + "step": 248 + }, + { + "epoch": 0.13492278515307504, + "grad_norm": 0.14904353022575378, + "learning_rate": 0.0001, + "loss": 1.6938, + "step": 249 + }, + { + "epoch": 0.135464643727987, + "grad_norm": 0.0941733717918396, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 250 + }, + { + "epoch": 0.13600650230289893, + "grad_norm": 0.15529394149780273, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 251 + }, + { + "epoch": 0.1365483608778109, + "grad_norm": 0.12021831423044205, + "learning_rate": 0.0001, + "loss": 1.721, + "step": 252 + }, + { + "epoch": 0.13709021945272284, + "grad_norm": 0.15852448344230652, + "learning_rate": 0.0001, + "loss": 1.7294, + "step": 253 + }, + { + "epoch": 0.13763207802763477, + "grad_norm": 0.11266610771417618, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 254 + }, + { + "epoch": 0.13817393660254673, + "grad_norm": 0.13919475674629211, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 255 + }, + { + "epoch": 0.1387157951774587, + "grad_norm": 0.11103470623493195, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 256 + }, + { + "epoch": 0.13925765375237062, + "grad_norm": 0.15118271112442017, + "learning_rate": 0.0001, + "loss": 1.7035, + "step": 257 + }, + { + "epoch": 0.13979951232728258, + "grad_norm": 0.11419171839952469, + "learning_rate": 0.0001, + "loss": 1.6692, + "step": 258 + }, + { + "epoch": 0.14034137090219453, + "grad_norm": 0.11274544894695282, + "learning_rate": 0.0001, + "loss": 1.5942, + "step": 259 + }, + { + "epoch": 0.14088322947710646, + "grad_norm": 0.14849093556404114, + "learning_rate": 0.0001, + "loss": 1.7188, + "step": 260 + }, + { + "epoch": 0.14142508805201842, + "grad_norm": 0.10871066898107529, + "learning_rate": 0.0001, + "loss": 1.7819, + "step": 261 + }, + { + "epoch": 0.14196694662693038, + "grad_norm": 0.13676348328590393, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 262 + }, + { + "epoch": 0.1425088052018423, + "grad_norm": 0.12139032036066055, + "learning_rate": 0.0001, + "loss": 1.6857, + "step": 263 + }, + { + "epoch": 0.14305066377675427, + "grad_norm": 0.11282926797866821, + "learning_rate": 0.0001, + "loss": 1.6851, + "step": 264 + }, + { + "epoch": 0.14359252235166622, + "grad_norm": 0.12527044117450714, + "learning_rate": 0.0001, + "loss": 1.7553, + "step": 265 + }, + { + "epoch": 0.14413438092657815, + "grad_norm": 0.14149242639541626, + "learning_rate": 0.0001, + "loss": 1.6815, + "step": 266 + }, + { + "epoch": 0.1446762395014901, + "grad_norm": 0.10425418615341187, + "learning_rate": 0.0001, + "loss": 1.7013, + "step": 267 + }, + { + "epoch": 0.14521809807640207, + "grad_norm": 0.13150091469287872, + "learning_rate": 0.0001, + "loss": 1.7274, + "step": 268 + }, + { + "epoch": 0.145759956651314, + "grad_norm": 0.09711414575576782, + "learning_rate": 0.0001, + "loss": 1.7004, + "step": 269 + }, + { + "epoch": 0.14630181522622596, + "grad_norm": 0.1150781586766243, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 270 + }, + { + "epoch": 0.14684367380113791, + "grad_norm": 0.11720975488424301, + "learning_rate": 0.0001, + "loss": 1.6947, + "step": 271 + }, + { + "epoch": 0.14738553237604984, + "grad_norm": 0.10064122825860977, + "learning_rate": 0.0001, + "loss": 1.7101, + "step": 272 + }, + { + "epoch": 0.1479273909509618, + "grad_norm": 0.11132777482271194, + "learning_rate": 0.0001, + "loss": 1.6706, + "step": 273 + }, + { + "epoch": 0.14846924952587376, + "grad_norm": 0.10732540488243103, + "learning_rate": 0.0001, + "loss": 1.655, + "step": 274 + }, + { + "epoch": 0.1490111081007857, + "grad_norm": 0.11525800824165344, + "learning_rate": 0.0001, + "loss": 1.6958, + "step": 275 + }, + { + "epoch": 0.14955296667569765, + "grad_norm": 0.1057300716638565, + "learning_rate": 0.0001, + "loss": 1.6936, + "step": 276 + }, + { + "epoch": 0.1500948252506096, + "grad_norm": 0.10139822959899902, + "learning_rate": 0.0001, + "loss": 1.7037, + "step": 277 + }, + { + "epoch": 0.15063668382552153, + "grad_norm": 0.10546170920133591, + "learning_rate": 0.0001, + "loss": 1.6824, + "step": 278 + }, + { + "epoch": 0.1511785424004335, + "grad_norm": 0.0943392738699913, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 279 + }, + { + "epoch": 0.15172040097534545, + "grad_norm": 0.10627435147762299, + "learning_rate": 0.0001, + "loss": 1.7194, + "step": 280 + }, + { + "epoch": 0.15226225955025738, + "grad_norm": 0.09746234118938446, + "learning_rate": 0.0001, + "loss": 1.6744, + "step": 281 + }, + { + "epoch": 0.15280411812516934, + "grad_norm": 0.09403081238269806, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 282 + }, + { + "epoch": 0.15334597670008127, + "grad_norm": 0.09880942851305008, + "learning_rate": 0.0001, + "loss": 1.6597, + "step": 283 + }, + { + "epoch": 0.15388783527499322, + "grad_norm": 0.10196152329444885, + "learning_rate": 0.0001, + "loss": 1.6484, + "step": 284 + }, + { + "epoch": 0.15442969384990518, + "grad_norm": 0.09791263937950134, + "learning_rate": 0.0001, + "loss": 1.7203, + "step": 285 + }, + { + "epoch": 0.1549715524248171, + "grad_norm": 0.09796151518821716, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 286 + }, + { + "epoch": 0.15551341099972907, + "grad_norm": 0.10709752887487411, + "learning_rate": 0.0001, + "loss": 1.7019, + "step": 287 + }, + { + "epoch": 0.15605526957464103, + "grad_norm": 0.09797371178865433, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 288 + }, + { + "epoch": 0.15659712814955296, + "grad_norm": 0.10401292890310287, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 289 + }, + { + "epoch": 0.1571389867244649, + "grad_norm": 0.09455517679452896, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 290 + }, + { + "epoch": 0.15768084529937687, + "grad_norm": 0.09385957568883896, + "learning_rate": 0.0001, + "loss": 1.6095, + "step": 291 + }, + { + "epoch": 0.1582227038742888, + "grad_norm": 0.10476952791213989, + "learning_rate": 0.0001, + "loss": 1.6574, + "step": 292 + }, + { + "epoch": 0.15876456244920076, + "grad_norm": 0.1275656670331955, + "learning_rate": 0.0001, + "loss": 1.6689, + "step": 293 + }, + { + "epoch": 0.15930642102411272, + "grad_norm": 0.0954967811703682, + "learning_rate": 0.0001, + "loss": 1.708, + "step": 294 + }, + { + "epoch": 0.15984827959902465, + "grad_norm": 0.12397397309541702, + "learning_rate": 0.0001, + "loss": 1.7002, + "step": 295 + }, + { + "epoch": 0.1603901381739366, + "grad_norm": 0.09896747022867203, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 296 + }, + { + "epoch": 0.16093199674884856, + "grad_norm": 0.13791395723819733, + "learning_rate": 0.0001, + "loss": 1.7306, + "step": 297 + }, + { + "epoch": 0.1614738553237605, + "grad_norm": 0.09542802721261978, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 298 + }, + { + "epoch": 0.16201571389867245, + "grad_norm": 0.11375903338193893, + "learning_rate": 0.0001, + "loss": 1.7341, + "step": 299 + }, + { + "epoch": 0.1625575724735844, + "grad_norm": 0.10889938473701477, + "learning_rate": 0.0001, + "loss": 1.731, + "step": 300 + }, + { + "epoch": 0.16309943104849633, + "grad_norm": 0.11272832006216049, + "learning_rate": 0.0001, + "loss": 1.6999, + "step": 301 + }, + { + "epoch": 0.1636412896234083, + "grad_norm": 0.11351019144058228, + "learning_rate": 0.0001, + "loss": 1.6991, + "step": 302 + }, + { + "epoch": 0.16418314819832025, + "grad_norm": 0.11626233905553818, + "learning_rate": 0.0001, + "loss": 1.7394, + "step": 303 + }, + { + "epoch": 0.16472500677323218, + "grad_norm": 0.10919834673404694, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 304 + }, + { + "epoch": 0.16526686534814414, + "grad_norm": 0.09492017328739166, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 305 + }, + { + "epoch": 0.1658087239230561, + "grad_norm": 0.11643269658088684, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 306 + }, + { + "epoch": 0.16635058249796802, + "grad_norm": 0.09301548451185226, + "learning_rate": 0.0001, + "loss": 1.689, + "step": 307 + }, + { + "epoch": 0.16689244107287998, + "grad_norm": 0.099439337849617, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 308 + }, + { + "epoch": 0.16743429964779194, + "grad_norm": 0.10214002430438995, + "learning_rate": 0.0001, + "loss": 1.6855, + "step": 309 + }, + { + "epoch": 0.16797615822270387, + "grad_norm": 0.0960782915353775, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 310 + }, + { + "epoch": 0.16851801679761583, + "grad_norm": 0.1130334809422493, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 311 + }, + { + "epoch": 0.16905987537252776, + "grad_norm": 0.09629669040441513, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 312 + }, + { + "epoch": 0.16960173394743971, + "grad_norm": 0.1021374985575676, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 313 + }, + { + "epoch": 0.17014359252235167, + "grad_norm": 0.09490853548049927, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 314 + }, + { + "epoch": 0.1706854510972636, + "grad_norm": 0.09929566830396652, + "learning_rate": 0.0001, + "loss": 1.6869, + "step": 315 + }, + { + "epoch": 0.17122730967217556, + "grad_norm": 0.09823586791753769, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 316 + }, + { + "epoch": 0.17176916824708752, + "grad_norm": 0.09158893674612045, + "learning_rate": 0.0001, + "loss": 1.6318, + "step": 317 + }, + { + "epoch": 0.17231102682199945, + "grad_norm": 0.09908587485551834, + "learning_rate": 0.0001, + "loss": 1.6252, + "step": 318 + }, + { + "epoch": 0.1728528853969114, + "grad_norm": 0.10510025173425674, + "learning_rate": 0.0001, + "loss": 1.6959, + "step": 319 + }, + { + "epoch": 0.17339474397182336, + "grad_norm": 0.10323290526866913, + "learning_rate": 0.0001, + "loss": 1.7018, + "step": 320 + }, + { + "epoch": 0.1739366025467353, + "grad_norm": 0.10068578273057938, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 321 + }, + { + "epoch": 0.17447846112164725, + "grad_norm": 0.10480581969022751, + "learning_rate": 0.0001, + "loss": 1.7247, + "step": 322 + }, + { + "epoch": 0.1750203196965592, + "grad_norm": 0.10604649782180786, + "learning_rate": 0.0001, + "loss": 1.7256, + "step": 323 + }, + { + "epoch": 0.17556217827147114, + "grad_norm": 0.10042749345302582, + "learning_rate": 0.0001, + "loss": 1.6702, + "step": 324 + }, + { + "epoch": 0.1761040368463831, + "grad_norm": 0.11323428899049759, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 325 + }, + { + "epoch": 0.17664589542129505, + "grad_norm": 0.10794851928949356, + "learning_rate": 0.0001, + "loss": 1.6927, + "step": 326 + }, + { + "epoch": 0.17718775399620698, + "grad_norm": 0.12508118152618408, + "learning_rate": 0.0001, + "loss": 1.6474, + "step": 327 + }, + { + "epoch": 0.17772961257111894, + "grad_norm": 0.1013966053724289, + "learning_rate": 0.0001, + "loss": 1.7129, + "step": 328 + }, + { + "epoch": 0.1782714711460309, + "grad_norm": 0.13972260057926178, + "learning_rate": 0.0001, + "loss": 1.7003, + "step": 329 + }, + { + "epoch": 0.17881332972094283, + "grad_norm": 0.1019311174750328, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 330 + }, + { + "epoch": 0.17935518829585478, + "grad_norm": 0.14320091903209686, + "learning_rate": 0.0001, + "loss": 1.6931, + "step": 331 + }, + { + "epoch": 0.17989704687076674, + "grad_norm": 0.10839429497718811, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 332 + }, + { + "epoch": 0.18043890544567867, + "grad_norm": 0.11457662284374237, + "learning_rate": 0.0001, + "loss": 1.6539, + "step": 333 + }, + { + "epoch": 0.18098076402059063, + "grad_norm": 0.11482690274715424, + "learning_rate": 0.0001, + "loss": 1.6596, + "step": 334 + }, + { + "epoch": 0.1815226225955026, + "grad_norm": 0.12048427760601044, + "learning_rate": 0.0001, + "loss": 1.7077, + "step": 335 + }, + { + "epoch": 0.18206448117041452, + "grad_norm": 0.11739542335271835, + "learning_rate": 0.0001, + "loss": 1.5914, + "step": 336 + }, + { + "epoch": 0.18260633974532647, + "grad_norm": 0.11013977229595184, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 337 + }, + { + "epoch": 0.18314819832023843, + "grad_norm": 0.12094748020172119, + "learning_rate": 0.0001, + "loss": 1.7103, + "step": 338 + }, + { + "epoch": 0.18369005689515036, + "grad_norm": 0.10130317509174347, + "learning_rate": 0.0001, + "loss": 1.6641, + "step": 339 + }, + { + "epoch": 0.18423191547006232, + "grad_norm": 0.10509441047906876, + "learning_rate": 0.0001, + "loss": 1.6784, + "step": 340 + }, + { + "epoch": 0.18477377404497425, + "grad_norm": 0.10123977065086365, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 341 + }, + { + "epoch": 0.1853156326198862, + "grad_norm": 0.12331061065196991, + "learning_rate": 0.0001, + "loss": 1.6667, + "step": 342 + }, + { + "epoch": 0.18585749119479816, + "grad_norm": 0.09549771249294281, + "learning_rate": 0.0001, + "loss": 1.6359, + "step": 343 + }, + { + "epoch": 0.1863993497697101, + "grad_norm": 0.10492638498544693, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 344 + }, + { + "epoch": 0.18694120834462205, + "grad_norm": 0.11436952650547028, + "learning_rate": 0.0001, + "loss": 1.7484, + "step": 345 + }, + { + "epoch": 0.187483066919534, + "grad_norm": 0.11754444241523743, + "learning_rate": 0.0001, + "loss": 1.7375, + "step": 346 + }, + { + "epoch": 0.18802492549444594, + "grad_norm": 0.11410810798406601, + "learning_rate": 0.0001, + "loss": 1.7044, + "step": 347 + }, + { + "epoch": 0.1885667840693579, + "grad_norm": 0.10755981504917145, + "learning_rate": 0.0001, + "loss": 1.7453, + "step": 348 + }, + { + "epoch": 0.18910864264426985, + "grad_norm": 0.1117858737707138, + "learning_rate": 0.0001, + "loss": 1.7144, + "step": 349 + }, + { + "epoch": 0.18965050121918178, + "grad_norm": 0.10845154523849487, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 350 + }, + { + "epoch": 0.19019235979409374, + "grad_norm": 0.10038649290800095, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 351 + }, + { + "epoch": 0.1907342183690057, + "grad_norm": 0.10859506577253342, + "learning_rate": 0.0001, + "loss": 1.5843, + "step": 352 + }, + { + "epoch": 0.19127607694391763, + "grad_norm": 0.10127723217010498, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 353 + }, + { + "epoch": 0.19181793551882959, + "grad_norm": 0.11287626624107361, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 354 + }, + { + "epoch": 0.19235979409374154, + "grad_norm": 0.11790165305137634, + "learning_rate": 0.0001, + "loss": 1.7098, + "step": 355 + }, + { + "epoch": 0.19290165266865347, + "grad_norm": 0.10026410967111588, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 356 + }, + { + "epoch": 0.19344351124356543, + "grad_norm": 0.11183635890483856, + "learning_rate": 0.0001, + "loss": 1.6775, + "step": 357 + }, + { + "epoch": 0.1939853698184774, + "grad_norm": 0.10310844331979752, + "learning_rate": 0.0001, + "loss": 1.6916, + "step": 358 + }, + { + "epoch": 0.19452722839338932, + "grad_norm": 0.09812598675489426, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 359 + }, + { + "epoch": 0.19506908696830128, + "grad_norm": 0.11021661758422852, + "learning_rate": 0.0001, + "loss": 1.6005, + "step": 360 + }, + { + "epoch": 0.19561094554321323, + "grad_norm": 0.09344365447759628, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 361 + }, + { + "epoch": 0.19615280411812516, + "grad_norm": 0.10808692872524261, + "learning_rate": 0.0001, + "loss": 1.6349, + "step": 362 + }, + { + "epoch": 0.19669466269303712, + "grad_norm": 0.11180002242326736, + "learning_rate": 0.0001, + "loss": 1.6901, + "step": 363 + }, + { + "epoch": 0.19723652126794908, + "grad_norm": 0.10783065110445023, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 364 + }, + { + "epoch": 0.197778379842861, + "grad_norm": 0.1020706295967102, + "learning_rate": 0.0001, + "loss": 1.6763, + "step": 365 + }, + { + "epoch": 0.19832023841777296, + "grad_norm": 0.09901352971792221, + "learning_rate": 0.0001, + "loss": 1.6807, + "step": 366 + }, + { + "epoch": 0.19886209699268492, + "grad_norm": 0.10985004901885986, + "learning_rate": 0.0001, + "loss": 1.6885, + "step": 367 + }, + { + "epoch": 0.19940395556759685, + "grad_norm": 0.10345008969306946, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 368 + }, + { + "epoch": 0.1999458141425088, + "grad_norm": 0.09653160721063614, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 369 + }, + { + "epoch": 0.20048767271742074, + "grad_norm": 0.1158447191119194, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 370 + }, + { + "epoch": 0.2010295312923327, + "grad_norm": 0.1000077947974205, + "learning_rate": 0.0001, + "loss": 1.6777, + "step": 371 + }, + { + "epoch": 0.20157138986724465, + "grad_norm": 0.11747664958238602, + "learning_rate": 0.0001, + "loss": 1.7121, + "step": 372 + }, + { + "epoch": 0.20211324844215658, + "grad_norm": 0.0929274931550026, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 373 + }, + { + "epoch": 0.20265510701706854, + "grad_norm": 0.12577304244041443, + "learning_rate": 0.0001, + "loss": 1.7982, + "step": 374 + }, + { + "epoch": 0.2031969655919805, + "grad_norm": 0.09752815961837769, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 375 + }, + { + "epoch": 0.20373882416689243, + "grad_norm": 0.11870884150266647, + "learning_rate": 0.0001, + "loss": 1.6629, + "step": 376 + }, + { + "epoch": 0.2042806827418044, + "grad_norm": 0.10201161354780197, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 377 + }, + { + "epoch": 0.20482254131671634, + "grad_norm": 0.10613151639699936, + "learning_rate": 0.0001, + "loss": 1.6118, + "step": 378 + }, + { + "epoch": 0.20536439989162827, + "grad_norm": 0.10123321413993835, + "learning_rate": 0.0001, + "loss": 1.6801, + "step": 379 + }, + { + "epoch": 0.20590625846654023, + "grad_norm": 0.09528015553951263, + "learning_rate": 0.0001, + "loss": 1.7344, + "step": 380 + }, + { + "epoch": 0.2064481170414522, + "grad_norm": 0.10694505274295807, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 381 + }, + { + "epoch": 0.20698997561636412, + "grad_norm": 0.10317772626876831, + "learning_rate": 0.0001, + "loss": 1.6807, + "step": 382 + }, + { + "epoch": 0.20753183419127608, + "grad_norm": 0.11525990068912506, + "learning_rate": 0.0001, + "loss": 1.6948, + "step": 383 + }, + { + "epoch": 0.20807369276618803, + "grad_norm": 0.09970982372760773, + "learning_rate": 0.0001, + "loss": 1.6761, + "step": 384 + }, + { + "epoch": 0.20861555134109996, + "grad_norm": 0.11480505764484406, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 385 + }, + { + "epoch": 0.20915740991601192, + "grad_norm": 0.10115650296211243, + "learning_rate": 0.0001, + "loss": 1.6408, + "step": 386 + }, + { + "epoch": 0.20969926849092388, + "grad_norm": 0.13291612267494202, + "learning_rate": 0.0001, + "loss": 1.7566, + "step": 387 + }, + { + "epoch": 0.2102411270658358, + "grad_norm": 0.09564415365457535, + "learning_rate": 0.0001, + "loss": 1.6884, + "step": 388 + }, + { + "epoch": 0.21078298564074777, + "grad_norm": 0.11687403172254562, + "learning_rate": 0.0001, + "loss": 1.7079, + "step": 389 + }, + { + "epoch": 0.21132484421565972, + "grad_norm": 0.09814255684614182, + "learning_rate": 0.0001, + "loss": 1.7098, + "step": 390 + }, + { + "epoch": 0.21186670279057165, + "grad_norm": 0.10159056633710861, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 391 + }, + { + "epoch": 0.2124085613654836, + "grad_norm": 0.10189589112997055, + "learning_rate": 0.0001, + "loss": 1.7352, + "step": 392 + }, + { + "epoch": 0.21295041994039557, + "grad_norm": 0.10089243203401566, + "learning_rate": 0.0001, + "loss": 1.6998, + "step": 393 + }, + { + "epoch": 0.2134922785153075, + "grad_norm": 0.11454922705888748, + "learning_rate": 0.0001, + "loss": 1.6728, + "step": 394 + }, + { + "epoch": 0.21403413709021946, + "grad_norm": 0.101554736495018, + "learning_rate": 0.0001, + "loss": 1.716, + "step": 395 + }, + { + "epoch": 0.2145759956651314, + "grad_norm": 0.1074264720082283, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 396 + }, + { + "epoch": 0.21511785424004334, + "grad_norm": 0.10516046732664108, + "learning_rate": 0.0001, + "loss": 1.6887, + "step": 397 + }, + { + "epoch": 0.2156597128149553, + "grad_norm": 0.09886284172534943, + "learning_rate": 0.0001, + "loss": 1.6727, + "step": 398 + }, + { + "epoch": 0.21620157138986723, + "grad_norm": 0.09954807907342911, + "learning_rate": 0.0001, + "loss": 1.6712, + "step": 399 + }, + { + "epoch": 0.2167434299647792, + "grad_norm": 0.09766504168510437, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 400 + }, + { + "epoch": 0.21728528853969115, + "grad_norm": 0.09490859508514404, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 401 + }, + { + "epoch": 0.21782714711460308, + "grad_norm": 0.100528784096241, + "learning_rate": 0.0001, + "loss": 1.7074, + "step": 402 + }, + { + "epoch": 0.21836900568951503, + "grad_norm": 0.10056279599666595, + "learning_rate": 0.0001, + "loss": 1.6746, + "step": 403 + }, + { + "epoch": 0.218910864264427, + "grad_norm": 0.10368312150239944, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 404 + }, + { + "epoch": 0.21945272283933892, + "grad_norm": 0.10767726600170135, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 405 + }, + { + "epoch": 0.21999458141425088, + "grad_norm": 0.09969790279865265, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 406 + }, + { + "epoch": 0.22053643998916284, + "grad_norm": 0.12781274318695068, + "learning_rate": 0.0001, + "loss": 1.7639, + "step": 407 + }, + { + "epoch": 0.22107829856407477, + "grad_norm": 0.11066941916942596, + "learning_rate": 0.0001, + "loss": 1.602, + "step": 408 + }, + { + "epoch": 0.22162015713898672, + "grad_norm": 0.10894569009542465, + "learning_rate": 0.0001, + "loss": 1.7199, + "step": 409 + }, + { + "epoch": 0.22216201571389868, + "grad_norm": 0.116873599588871, + "learning_rate": 0.0001, + "loss": 1.7119, + "step": 410 + }, + { + "epoch": 0.2227038742888106, + "grad_norm": 0.10727427154779434, + "learning_rate": 0.0001, + "loss": 1.7022, + "step": 411 + }, + { + "epoch": 0.22324573286372257, + "grad_norm": 0.10264337062835693, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 412 + }, + { + "epoch": 0.22378759143863453, + "grad_norm": 0.11667589843273163, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 413 + }, + { + "epoch": 0.22432945001354646, + "grad_norm": 0.09983885288238525, + "learning_rate": 0.0001, + "loss": 1.7386, + "step": 414 + }, + { + "epoch": 0.2248713085884584, + "grad_norm": 0.10147198289632797, + "learning_rate": 0.0001, + "loss": 1.6137, + "step": 415 + }, + { + "epoch": 0.22541316716337037, + "grad_norm": 0.10878276824951172, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 416 + }, + { + "epoch": 0.2259550257382823, + "grad_norm": 0.10325547307729721, + "learning_rate": 0.0001, + "loss": 1.7173, + "step": 417 + }, + { + "epoch": 0.22649688431319426, + "grad_norm": 0.09619980305433273, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 418 + }, + { + "epoch": 0.22703874288810622, + "grad_norm": 0.1018766313791275, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 419 + }, + { + "epoch": 0.22758060146301815, + "grad_norm": 0.10426010936498642, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 420 + }, + { + "epoch": 0.2281224600379301, + "grad_norm": 0.09923192858695984, + "learning_rate": 0.0001, + "loss": 1.7055, + "step": 421 + }, + { + "epoch": 0.22866431861284206, + "grad_norm": 0.10499102622270584, + "learning_rate": 0.0001, + "loss": 1.7197, + "step": 422 + }, + { + "epoch": 0.229206177187754, + "grad_norm": 0.10078815370798111, + "learning_rate": 0.0001, + "loss": 1.6863, + "step": 423 + }, + { + "epoch": 0.22974803576266595, + "grad_norm": 0.09503324329853058, + "learning_rate": 0.0001, + "loss": 1.6076, + "step": 424 + }, + { + "epoch": 0.2302898943375779, + "grad_norm": 0.09994073212146759, + "learning_rate": 0.0001, + "loss": 1.6448, + "step": 425 + }, + { + "epoch": 0.23083175291248983, + "grad_norm": 0.1010836511850357, + "learning_rate": 0.0001, + "loss": 1.6899, + "step": 426 + }, + { + "epoch": 0.2313736114874018, + "grad_norm": 0.10668766498565674, + "learning_rate": 0.0001, + "loss": 1.7398, + "step": 427 + }, + { + "epoch": 0.23191547006231375, + "grad_norm": 0.09935601055622101, + "learning_rate": 0.0001, + "loss": 1.6781, + "step": 428 + }, + { + "epoch": 0.23245732863722568, + "grad_norm": 0.10159440338611603, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 429 + }, + { + "epoch": 0.23299918721213764, + "grad_norm": 0.09990345686674118, + "learning_rate": 0.0001, + "loss": 1.6952, + "step": 430 + }, + { + "epoch": 0.23354104578704957, + "grad_norm": 0.11645045876502991, + "learning_rate": 0.0001, + "loss": 1.6568, + "step": 431 + }, + { + "epoch": 0.23408290436196152, + "grad_norm": 0.1003262847661972, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 432 + }, + { + "epoch": 0.23462476293687348, + "grad_norm": 0.11681778728961945, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 433 + }, + { + "epoch": 0.2351666215117854, + "grad_norm": 0.09814731776714325, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 434 + }, + { + "epoch": 0.23570848008669737, + "grad_norm": 0.11347492784261703, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 435 + }, + { + "epoch": 0.23625033866160933, + "grad_norm": 0.09687422215938568, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 436 + }, + { + "epoch": 0.23679219723652126, + "grad_norm": 0.13785980641841888, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 437 + }, + { + "epoch": 0.23733405581143321, + "grad_norm": 0.0989057645201683, + "learning_rate": 0.0001, + "loss": 1.6681, + "step": 438 + }, + { + "epoch": 0.23787591438634517, + "grad_norm": 0.13210071623325348, + "learning_rate": 0.0001, + "loss": 1.671, + "step": 439 + }, + { + "epoch": 0.2384177729612571, + "grad_norm": 0.11280949413776398, + "learning_rate": 0.0001, + "loss": 1.6713, + "step": 440 + }, + { + "epoch": 0.23895963153616906, + "grad_norm": 0.12719376385211945, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 441 + }, + { + "epoch": 0.23950149011108102, + "grad_norm": 0.1163676455616951, + "learning_rate": 0.0001, + "loss": 1.6918, + "step": 442 + }, + { + "epoch": 0.24004334868599295, + "grad_norm": 0.11804498732089996, + "learning_rate": 0.0001, + "loss": 1.563, + "step": 443 + }, + { + "epoch": 0.2405852072609049, + "grad_norm": 0.12228959053754807, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 444 + }, + { + "epoch": 0.24112706583581686, + "grad_norm": 0.11664146929979324, + "learning_rate": 0.0001, + "loss": 1.6865, + "step": 445 + }, + { + "epoch": 0.2416689244107288, + "grad_norm": 0.13231196999549866, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 446 + }, + { + "epoch": 0.24221078298564075, + "grad_norm": 0.12525129318237305, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 447 + }, + { + "epoch": 0.2427526415605527, + "grad_norm": 0.11747252941131592, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 448 + }, + { + "epoch": 0.24329450013546464, + "grad_norm": 0.11415065079927444, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 449 + }, + { + "epoch": 0.2438363587103766, + "grad_norm": 0.12443743646144867, + "learning_rate": 0.0001, + "loss": 1.5737, + "step": 450 + }, + { + "epoch": 0.24437821728528855, + "grad_norm": 0.10312895476818085, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 451 + }, + { + "epoch": 0.24492007586020048, + "grad_norm": 0.1094118282198906, + "learning_rate": 0.0001, + "loss": 1.6354, + "step": 452 + }, + { + "epoch": 0.24546193443511244, + "grad_norm": 0.10630492120981216, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 453 + }, + { + "epoch": 0.2460037930100244, + "grad_norm": 0.11164608597755432, + "learning_rate": 0.0001, + "loss": 1.6228, + "step": 454 + }, + { + "epoch": 0.24654565158493633, + "grad_norm": 0.10897690057754517, + "learning_rate": 0.0001, + "loss": 1.5994, + "step": 455 + }, + { + "epoch": 0.24708751015984828, + "grad_norm": 0.12230285257101059, + "learning_rate": 0.0001, + "loss": 1.7062, + "step": 456 + }, + { + "epoch": 0.24762936873476024, + "grad_norm": 0.10371015965938568, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 457 + }, + { + "epoch": 0.24817122730967217, + "grad_norm": 0.11259844154119492, + "learning_rate": 0.0001, + "loss": 1.6497, + "step": 458 + }, + { + "epoch": 0.24871308588458413, + "grad_norm": 0.10375142097473145, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 459 + }, + { + "epoch": 0.24925494445949606, + "grad_norm": 0.09826785326004028, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 460 + }, + { + "epoch": 0.24979680303440802, + "grad_norm": 0.11986129730939865, + "learning_rate": 0.0001, + "loss": 1.7278, + "step": 461 + }, + { + "epoch": 0.25033866160931995, + "grad_norm": 0.09854941070079803, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 462 + }, + { + "epoch": 0.2508805201842319, + "grad_norm": 0.12960360944271088, + "learning_rate": 0.0001, + "loss": 1.6793, + "step": 463 + }, + { + "epoch": 0.25142237875914386, + "grad_norm": 0.10426454246044159, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 464 + }, + { + "epoch": 0.2519642373340558, + "grad_norm": 0.10605210810899734, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 465 + }, + { + "epoch": 0.2525060959089678, + "grad_norm": 0.10613849759101868, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 466 + }, + { + "epoch": 0.25304795448387973, + "grad_norm": 0.10717356204986572, + "learning_rate": 0.0001, + "loss": 1.6556, + "step": 467 + }, + { + "epoch": 0.25358981305879164, + "grad_norm": 0.10753747075796127, + "learning_rate": 0.0001, + "loss": 1.6175, + "step": 468 + }, + { + "epoch": 0.2541316716337036, + "grad_norm": 0.10362911969423294, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 469 + }, + { + "epoch": 0.25467353020861555, + "grad_norm": 0.11893575638532639, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 470 + }, + { + "epoch": 0.2552153887835275, + "grad_norm": 0.10144247114658356, + "learning_rate": 0.0001, + "loss": 1.6823, + "step": 471 + }, + { + "epoch": 0.25575724735843947, + "grad_norm": 0.11344347894191742, + "learning_rate": 0.0001, + "loss": 1.6524, + "step": 472 + }, + { + "epoch": 0.25629910593335137, + "grad_norm": 0.10365281254053116, + "learning_rate": 0.0001, + "loss": 1.6026, + "step": 473 + }, + { + "epoch": 0.2568409645082633, + "grad_norm": 0.13139420747756958, + "learning_rate": 0.0001, + "loss": 1.7037, + "step": 474 + }, + { + "epoch": 0.2573828230831753, + "grad_norm": 0.10608001798391342, + "learning_rate": 0.0001, + "loss": 1.6697, + "step": 475 + }, + { + "epoch": 0.25792468165808724, + "grad_norm": 0.12491496652364731, + "learning_rate": 0.0001, + "loss": 1.6571, + "step": 476 + }, + { + "epoch": 0.2584665402329992, + "grad_norm": 0.10368447750806808, + "learning_rate": 0.0001, + "loss": 1.774, + "step": 477 + }, + { + "epoch": 0.25900839880791116, + "grad_norm": 0.13536019623279572, + "learning_rate": 0.0001, + "loss": 1.6922, + "step": 478 + }, + { + "epoch": 0.25955025738282306, + "grad_norm": 0.10196977853775024, + "learning_rate": 0.0001, + "loss": 1.588, + "step": 479 + }, + { + "epoch": 0.260092115957735, + "grad_norm": 0.09920040518045425, + "learning_rate": 0.0001, + "loss": 1.5884, + "step": 480 + }, + { + "epoch": 0.260633974532647, + "grad_norm": 0.12618587911128998, + "learning_rate": 0.0001, + "loss": 1.7136, + "step": 481 + }, + { + "epoch": 0.26117583310755893, + "grad_norm": 0.10382471978664398, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 482 + }, + { + "epoch": 0.2617176916824709, + "grad_norm": 0.11146610975265503, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 483 + }, + { + "epoch": 0.26225955025738285, + "grad_norm": 0.1008019968867302, + "learning_rate": 0.0001, + "loss": 1.7193, + "step": 484 + }, + { + "epoch": 0.26280140883229475, + "grad_norm": 0.11461275070905685, + "learning_rate": 0.0001, + "loss": 1.7452, + "step": 485 + }, + { + "epoch": 0.2633432674072067, + "grad_norm": 0.10348404943943024, + "learning_rate": 0.0001, + "loss": 1.7221, + "step": 486 + }, + { + "epoch": 0.26388512598211866, + "grad_norm": 0.1001395434141159, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 487 + }, + { + "epoch": 0.2644269845570306, + "grad_norm": 0.10220689326524734, + "learning_rate": 0.0001, + "loss": 1.6551, + "step": 488 + }, + { + "epoch": 0.2649688431319426, + "grad_norm": 0.10228809714317322, + "learning_rate": 0.0001, + "loss": 1.6783, + "step": 489 + }, + { + "epoch": 0.26551070170685453, + "grad_norm": 0.1036238893866539, + "learning_rate": 0.0001, + "loss": 1.7073, + "step": 490 + }, + { + "epoch": 0.26605256028176644, + "grad_norm": 0.10383958369493484, + "learning_rate": 0.0001, + "loss": 1.6872, + "step": 491 + }, + { + "epoch": 0.2665944188566784, + "grad_norm": 0.10285360366106033, + "learning_rate": 0.0001, + "loss": 1.6453, + "step": 492 + }, + { + "epoch": 0.26713627743159035, + "grad_norm": 0.10352769494056702, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 493 + }, + { + "epoch": 0.2676781360065023, + "grad_norm": 0.10272315889596939, + "learning_rate": 0.0001, + "loss": 1.6625, + "step": 494 + }, + { + "epoch": 0.26821999458141427, + "grad_norm": 0.11963304132223129, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 495 + }, + { + "epoch": 0.2687618531563262, + "grad_norm": 0.10235189646482468, + "learning_rate": 0.0001, + "loss": 1.6439, + "step": 496 + }, + { + "epoch": 0.2693037117312381, + "grad_norm": 0.13295263051986694, + "learning_rate": 0.0001, + "loss": 1.6389, + "step": 497 + }, + { + "epoch": 0.2698455703061501, + "grad_norm": 0.10163283348083496, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 498 + }, + { + "epoch": 0.27038742888106204, + "grad_norm": 0.12278732657432556, + "learning_rate": 0.0001, + "loss": 1.6807, + "step": 499 + }, + { + "epoch": 0.270929287455974, + "grad_norm": 0.10807237774133682, + "learning_rate": 0.0001, + "loss": 1.6579, + "step": 500 + }, + { + "epoch": 0.27147114603088596, + "grad_norm": 0.1201099380850792, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 501 + }, + { + "epoch": 0.27201300460579786, + "grad_norm": 0.1056004986166954, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 502 + }, + { + "epoch": 0.2725548631807098, + "grad_norm": 0.10857392847537994, + "learning_rate": 0.0001, + "loss": 1.6694, + "step": 503 + }, + { + "epoch": 0.2730967217556218, + "grad_norm": 0.11863652616739273, + "learning_rate": 0.0001, + "loss": 1.681, + "step": 504 + }, + { + "epoch": 0.27363858033053373, + "grad_norm": 0.10198169946670532, + "learning_rate": 0.0001, + "loss": 1.6831, + "step": 505 + }, + { + "epoch": 0.2741804389054457, + "grad_norm": 0.1210508868098259, + "learning_rate": 0.0001, + "loss": 1.6068, + "step": 506 + }, + { + "epoch": 0.27472229748035765, + "grad_norm": 0.10578924417495728, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 507 + }, + { + "epoch": 0.27526415605526955, + "grad_norm": 0.11378353834152222, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 508 + }, + { + "epoch": 0.2758060146301815, + "grad_norm": 0.13088662922382355, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 509 + }, + { + "epoch": 0.27634787320509346, + "grad_norm": 0.10998623073101044, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 510 + }, + { + "epoch": 0.2768897317800054, + "grad_norm": 0.1617870032787323, + "learning_rate": 0.0001, + "loss": 1.682, + "step": 511 + }, + { + "epoch": 0.2774315903549174, + "grad_norm": 0.13237886130809784, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 512 + }, + { + "epoch": 0.27797344892982934, + "grad_norm": 0.13461074233055115, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 513 + }, + { + "epoch": 0.27851530750474124, + "grad_norm": 0.12076643109321594, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 514 + }, + { + "epoch": 0.2790571660796532, + "grad_norm": 0.14691396057605743, + "learning_rate": 0.0001, + "loss": 1.6874, + "step": 515 + }, + { + "epoch": 0.27959902465456515, + "grad_norm": 0.10137977451086044, + "learning_rate": 0.0001, + "loss": 1.5602, + "step": 516 + }, + { + "epoch": 0.2801408832294771, + "grad_norm": 0.13679617643356323, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 517 + }, + { + "epoch": 0.28068274180438907, + "grad_norm": 0.10055786371231079, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 518 + }, + { + "epoch": 0.281224600379301, + "grad_norm": 0.11336284130811691, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 519 + }, + { + "epoch": 0.28176645895421293, + "grad_norm": 0.10535544157028198, + "learning_rate": 0.0001, + "loss": 1.7045, + "step": 520 + }, + { + "epoch": 0.2823083175291249, + "grad_norm": 0.10427578538656235, + "learning_rate": 0.0001, + "loss": 1.6528, + "step": 521 + }, + { + "epoch": 0.28285017610403684, + "grad_norm": 0.12511974573135376, + "learning_rate": 0.0001, + "loss": 1.7355, + "step": 522 + }, + { + "epoch": 0.2833920346789488, + "grad_norm": 0.105614572763443, + "learning_rate": 0.0001, + "loss": 1.6429, + "step": 523 + }, + { + "epoch": 0.28393389325386076, + "grad_norm": 0.12445893883705139, + "learning_rate": 0.0001, + "loss": 1.6925, + "step": 524 + }, + { + "epoch": 0.2844757518287727, + "grad_norm": 0.12350346893072128, + "learning_rate": 0.0001, + "loss": 1.7239, + "step": 525 + }, + { + "epoch": 0.2850176104036846, + "grad_norm": 0.11333523690700531, + "learning_rate": 0.0001, + "loss": 1.6035, + "step": 526 + }, + { + "epoch": 0.2855594689785966, + "grad_norm": 0.1175926998257637, + "learning_rate": 0.0001, + "loss": 1.6616, + "step": 527 + }, + { + "epoch": 0.28610132755350853, + "grad_norm": 0.10162296891212463, + "learning_rate": 0.0001, + "loss": 1.6814, + "step": 528 + }, + { + "epoch": 0.2866431861284205, + "grad_norm": 0.13615354895591736, + "learning_rate": 0.0001, + "loss": 1.6623, + "step": 529 + }, + { + "epoch": 0.28718504470333245, + "grad_norm": 0.10404025763273239, + "learning_rate": 0.0001, + "loss": 1.6647, + "step": 530 + }, + { + "epoch": 0.2877269032782444, + "grad_norm": 0.12436753511428833, + "learning_rate": 0.0001, + "loss": 1.7225, + "step": 531 + }, + { + "epoch": 0.2882687618531563, + "grad_norm": 0.11511912941932678, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 532 + }, + { + "epoch": 0.28881062042806827, + "grad_norm": 0.11416777223348618, + "learning_rate": 0.0001, + "loss": 1.6608, + "step": 533 + }, + { + "epoch": 0.2893524790029802, + "grad_norm": 0.1348169595003128, + "learning_rate": 0.0001, + "loss": 1.6777, + "step": 534 + }, + { + "epoch": 0.2898943375778922, + "grad_norm": 0.09927986562252045, + "learning_rate": 0.0001, + "loss": 1.5467, + "step": 535 + }, + { + "epoch": 0.29043619615280414, + "grad_norm": 0.13574469089508057, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 536 + }, + { + "epoch": 0.29097805472771604, + "grad_norm": 0.10188048332929611, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 537 + }, + { + "epoch": 0.291519913302628, + "grad_norm": 0.1282220184803009, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 538 + }, + { + "epoch": 0.29206177187753996, + "grad_norm": 0.1127096638083458, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 539 + }, + { + "epoch": 0.2926036304524519, + "grad_norm": 0.1312182992696762, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 540 + }, + { + "epoch": 0.29314548902736387, + "grad_norm": 0.10491517931222916, + "learning_rate": 0.0001, + "loss": 1.5932, + "step": 541 + }, + { + "epoch": 0.29368734760227583, + "grad_norm": 0.1108413115143776, + "learning_rate": 0.0001, + "loss": 1.6679, + "step": 542 + }, + { + "epoch": 0.29422920617718773, + "grad_norm": 0.1049225851893425, + "learning_rate": 0.0001, + "loss": 1.6786, + "step": 543 + }, + { + "epoch": 0.2947710647520997, + "grad_norm": 0.09961540251970291, + "learning_rate": 0.0001, + "loss": 1.6962, + "step": 544 + }, + { + "epoch": 0.29531292332701164, + "grad_norm": 0.11128535121679306, + "learning_rate": 0.0001, + "loss": 1.6793, + "step": 545 + }, + { + "epoch": 0.2958547819019236, + "grad_norm": 0.1057731881737709, + "learning_rate": 0.0001, + "loss": 1.6901, + "step": 546 + }, + { + "epoch": 0.29639664047683556, + "grad_norm": 0.09957876056432724, + "learning_rate": 0.0001, + "loss": 1.6725, + "step": 547 + }, + { + "epoch": 0.2969384990517475, + "grad_norm": 0.10864951461553574, + "learning_rate": 0.0001, + "loss": 1.7211, + "step": 548 + }, + { + "epoch": 0.2974803576266594, + "grad_norm": 0.10822419822216034, + "learning_rate": 0.0001, + "loss": 1.759, + "step": 549 + }, + { + "epoch": 0.2980222162015714, + "grad_norm": 0.11316762119531631, + "learning_rate": 0.0001, + "loss": 1.701, + "step": 550 + }, + { + "epoch": 0.29856407477648333, + "grad_norm": 0.09931345283985138, + "learning_rate": 0.0001, + "loss": 1.6934, + "step": 551 + }, + { + "epoch": 0.2991059333513953, + "grad_norm": 0.09663783013820648, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 552 + }, + { + "epoch": 0.29964779192630725, + "grad_norm": 0.11659286171197891, + "learning_rate": 0.0001, + "loss": 1.6768, + "step": 553 + }, + { + "epoch": 0.3001896505012192, + "grad_norm": 0.09947337210178375, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 554 + }, + { + "epoch": 0.3007315090761311, + "grad_norm": 0.09798821806907654, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 555 + }, + { + "epoch": 0.30127336765104307, + "grad_norm": 0.10035169124603271, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 556 + }, + { + "epoch": 0.301815226225955, + "grad_norm": 0.1058754101395607, + "learning_rate": 0.0001, + "loss": 1.741, + "step": 557 + }, + { + "epoch": 0.302357084800867, + "grad_norm": 0.09941428154706955, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 558 + }, + { + "epoch": 0.30289894337577894, + "grad_norm": 0.09876841306686401, + "learning_rate": 0.0001, + "loss": 1.6332, + "step": 559 + }, + { + "epoch": 0.3034408019506909, + "grad_norm": 0.10757549852132797, + "learning_rate": 0.0001, + "loss": 1.6899, + "step": 560 + }, + { + "epoch": 0.3039826605256028, + "grad_norm": 0.1080511137843132, + "learning_rate": 0.0001, + "loss": 1.6783, + "step": 561 + }, + { + "epoch": 0.30452451910051476, + "grad_norm": 0.10125991702079773, + "learning_rate": 0.0001, + "loss": 1.6049, + "step": 562 + }, + { + "epoch": 0.3050663776754267, + "grad_norm": 0.10050351917743683, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 563 + }, + { + "epoch": 0.30560823625033867, + "grad_norm": 0.11019833385944366, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 564 + }, + { + "epoch": 0.30615009482525063, + "grad_norm": 0.09782030433416367, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 565 + }, + { + "epoch": 0.30669195340016253, + "grad_norm": 0.11155736446380615, + "learning_rate": 0.0001, + "loss": 1.6805, + "step": 566 + }, + { + "epoch": 0.3072338119750745, + "grad_norm": 0.10135099291801453, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 567 + }, + { + "epoch": 0.30777567054998645, + "grad_norm": 0.11518092453479767, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 568 + }, + { + "epoch": 0.3083175291248984, + "grad_norm": 0.09859606623649597, + "learning_rate": 0.0001, + "loss": 1.5585, + "step": 569 + }, + { + "epoch": 0.30885938769981036, + "grad_norm": 0.10348886996507645, + "learning_rate": 0.0001, + "loss": 1.6449, + "step": 570 + }, + { + "epoch": 0.3094012462747223, + "grad_norm": 0.11707218736410141, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 571 + }, + { + "epoch": 0.3099431048496342, + "grad_norm": 0.10389493405818939, + "learning_rate": 0.0001, + "loss": 1.6667, + "step": 572 + }, + { + "epoch": 0.3104849634245462, + "grad_norm": 0.11538241803646088, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 573 + }, + { + "epoch": 0.31102682199945814, + "grad_norm": 0.1050158217549324, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 574 + }, + { + "epoch": 0.3115686805743701, + "grad_norm": 0.11141128093004227, + "learning_rate": 0.0001, + "loss": 1.7506, + "step": 575 + }, + { + "epoch": 0.31211053914928205, + "grad_norm": 0.09710174798965454, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 576 + }, + { + "epoch": 0.312652397724194, + "grad_norm": 0.10418973118066788, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 577 + }, + { + "epoch": 0.3131942562991059, + "grad_norm": 0.10552489757537842, + "learning_rate": 0.0001, + "loss": 1.7145, + "step": 578 + }, + { + "epoch": 0.31373611487401787, + "grad_norm": 0.11075478047132492, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 579 + }, + { + "epoch": 0.3142779734489298, + "grad_norm": 0.09793159365653992, + "learning_rate": 0.0001, + "loss": 1.6635, + "step": 580 + }, + { + "epoch": 0.3148198320238418, + "grad_norm": 0.1039569228887558, + "learning_rate": 0.0001, + "loss": 1.6711, + "step": 581 + }, + { + "epoch": 0.31536169059875374, + "grad_norm": 0.1017010286450386, + "learning_rate": 0.0001, + "loss": 1.5596, + "step": 582 + }, + { + "epoch": 0.3159035491736657, + "grad_norm": 0.09644217044115067, + "learning_rate": 0.0001, + "loss": 1.5719, + "step": 583 + }, + { + "epoch": 0.3164454077485776, + "grad_norm": 0.0962899774312973, + "learning_rate": 0.0001, + "loss": 1.6344, + "step": 584 + }, + { + "epoch": 0.31698726632348956, + "grad_norm": 0.10612763464450836, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 585 + }, + { + "epoch": 0.3175291248984015, + "grad_norm": 0.10435491800308228, + "learning_rate": 0.0001, + "loss": 1.654, + "step": 586 + }, + { + "epoch": 0.3180709834733135, + "grad_norm": 0.1048903837800026, + "learning_rate": 0.0001, + "loss": 1.6114, + "step": 587 + }, + { + "epoch": 0.31861284204822543, + "grad_norm": 0.09729161113500595, + "learning_rate": 0.0001, + "loss": 1.5857, + "step": 588 + }, + { + "epoch": 0.3191547006231374, + "grad_norm": 0.1105666309595108, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 589 + }, + { + "epoch": 0.3196965591980493, + "grad_norm": 0.10651809722185135, + "learning_rate": 0.0001, + "loss": 1.6946, + "step": 590 + }, + { + "epoch": 0.32023841777296125, + "grad_norm": 0.09992777556180954, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 591 + }, + { + "epoch": 0.3207802763478732, + "grad_norm": 0.09944906085729599, + "learning_rate": 0.0001, + "loss": 1.5162, + "step": 592 + }, + { + "epoch": 0.32132213492278516, + "grad_norm": 0.10559218376874924, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 593 + }, + { + "epoch": 0.3218639934976971, + "grad_norm": 0.11096035689115524, + "learning_rate": 0.0001, + "loss": 1.6527, + "step": 594 + }, + { + "epoch": 0.322405852072609, + "grad_norm": 0.10335852205753326, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 595 + }, + { + "epoch": 0.322947710647521, + "grad_norm": 0.119107186794281, + "learning_rate": 0.0001, + "loss": 1.7471, + "step": 596 + }, + { + "epoch": 0.32348956922243294, + "grad_norm": 0.11370395123958588, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 597 + }, + { + "epoch": 0.3240314277973449, + "grad_norm": 0.11013813316822052, + "learning_rate": 0.0001, + "loss": 1.6508, + "step": 598 + }, + { + "epoch": 0.32457328637225685, + "grad_norm": 0.11540937423706055, + "learning_rate": 0.0001, + "loss": 1.6585, + "step": 599 + }, + { + "epoch": 0.3251151449471688, + "grad_norm": 0.10792079567909241, + "learning_rate": 0.0001, + "loss": 1.6451, + "step": 600 + }, + { + "epoch": 0.3256570035220807, + "grad_norm": 0.12482734024524689, + "learning_rate": 0.0001, + "loss": 1.6973, + "step": 601 + }, + { + "epoch": 0.32619886209699267, + "grad_norm": 0.10253299027681351, + "learning_rate": 0.0001, + "loss": 1.6808, + "step": 602 + }, + { + "epoch": 0.3267407206719046, + "grad_norm": 0.10867742449045181, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 603 + }, + { + "epoch": 0.3272825792468166, + "grad_norm": 0.10623064637184143, + "learning_rate": 0.0001, + "loss": 1.6831, + "step": 604 + }, + { + "epoch": 0.32782443782172854, + "grad_norm": 0.10389222949743271, + "learning_rate": 0.0001, + "loss": 1.6585, + "step": 605 + }, + { + "epoch": 0.3283662963966405, + "grad_norm": 0.10566440224647522, + "learning_rate": 0.0001, + "loss": 1.6995, + "step": 606 + }, + { + "epoch": 0.3289081549715524, + "grad_norm": 0.10380306094884872, + "learning_rate": 0.0001, + "loss": 1.6832, + "step": 607 + }, + { + "epoch": 0.32945001354646436, + "grad_norm": 0.10225769877433777, + "learning_rate": 0.0001, + "loss": 1.6459, + "step": 608 + }, + { + "epoch": 0.3299918721213763, + "grad_norm": 0.10640621185302734, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 609 + }, + { + "epoch": 0.3305337306962883, + "grad_norm": 0.10927685350179672, + "learning_rate": 0.0001, + "loss": 1.7528, + "step": 610 + }, + { + "epoch": 0.33107558927120023, + "grad_norm": 0.1068321168422699, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 611 + }, + { + "epoch": 0.3316174478461122, + "grad_norm": 0.1025937870144844, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 612 + }, + { + "epoch": 0.3321593064210241, + "grad_norm": 0.10666365176439285, + "learning_rate": 0.0001, + "loss": 1.6081, + "step": 613 + }, + { + "epoch": 0.33270116499593605, + "grad_norm": 0.10716580599546432, + "learning_rate": 0.0001, + "loss": 1.6932, + "step": 614 + }, + { + "epoch": 0.333243023570848, + "grad_norm": 0.10392767935991287, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 615 + }, + { + "epoch": 0.33378488214575996, + "grad_norm": 0.10266393423080444, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 616 + }, + { + "epoch": 0.3343267407206719, + "grad_norm": 0.09854996204376221, + "learning_rate": 0.0001, + "loss": 1.6061, + "step": 617 + }, + { + "epoch": 0.3348685992955839, + "grad_norm": 0.10747377574443817, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 618 + }, + { + "epoch": 0.3354104578704958, + "grad_norm": 0.09462318569421768, + "learning_rate": 0.0001, + "loss": 1.5792, + "step": 619 + }, + { + "epoch": 0.33595231644540774, + "grad_norm": 0.10060413926839828, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 620 + }, + { + "epoch": 0.3364941750203197, + "grad_norm": 0.10601072758436203, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 621 + }, + { + "epoch": 0.33703603359523165, + "grad_norm": 0.1055201068520546, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 622 + }, + { + "epoch": 0.3375778921701436, + "grad_norm": 0.10256335884332657, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 623 + }, + { + "epoch": 0.3381197507450555, + "grad_norm": 0.10397038608789444, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 624 + }, + { + "epoch": 0.33866160931996747, + "grad_norm": 0.10368067026138306, + "learning_rate": 0.0001, + "loss": 1.6077, + "step": 625 + }, + { + "epoch": 0.33920346789487943, + "grad_norm": 0.11170101165771484, + "learning_rate": 0.0001, + "loss": 1.6358, + "step": 626 + }, + { + "epoch": 0.3397453264697914, + "grad_norm": 0.10459307581186295, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 627 + }, + { + "epoch": 0.34028718504470334, + "grad_norm": 0.1107502356171608, + "learning_rate": 0.0001, + "loss": 1.6541, + "step": 628 + }, + { + "epoch": 0.3408290436196153, + "grad_norm": 0.10105384141206741, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 629 + }, + { + "epoch": 0.3413709021945272, + "grad_norm": 0.12728677690029144, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 630 + }, + { + "epoch": 0.34191276076943916, + "grad_norm": 0.11482071131467819, + "learning_rate": 0.0001, + "loss": 1.6842, + "step": 631 + }, + { + "epoch": 0.3424546193443511, + "grad_norm": 0.12361173331737518, + "learning_rate": 0.0001, + "loss": 1.6981, + "step": 632 + }, + { + "epoch": 0.3429964779192631, + "grad_norm": 0.09973514825105667, + "learning_rate": 0.0001, + "loss": 1.5814, + "step": 633 + }, + { + "epoch": 0.34353833649417503, + "grad_norm": 0.10917801409959793, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 634 + }, + { + "epoch": 0.344080195069087, + "grad_norm": 0.10635889321565628, + "learning_rate": 0.0001, + "loss": 1.7117, + "step": 635 + }, + { + "epoch": 0.3446220536439989, + "grad_norm": 0.10422027111053467, + "learning_rate": 0.0001, + "loss": 1.642, + "step": 636 + }, + { + "epoch": 0.34516391221891085, + "grad_norm": 0.10354586690664291, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 637 + }, + { + "epoch": 0.3457057707938228, + "grad_norm": 0.1029241606593132, + "learning_rate": 0.0001, + "loss": 1.6797, + "step": 638 + }, + { + "epoch": 0.34624762936873477, + "grad_norm": 0.11714927852153778, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 639 + }, + { + "epoch": 0.3467894879436467, + "grad_norm": 0.10488483309745789, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 640 + }, + { + "epoch": 0.3473313465185587, + "grad_norm": 0.11477353423833847, + "learning_rate": 0.0001, + "loss": 1.6654, + "step": 641 + }, + { + "epoch": 0.3478732050934706, + "grad_norm": 0.10494516044855118, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 642 + }, + { + "epoch": 0.34841506366838254, + "grad_norm": 0.11135809123516083, + "learning_rate": 0.0001, + "loss": 1.6487, + "step": 643 + }, + { + "epoch": 0.3489569222432945, + "grad_norm": 0.10758684575557709, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 644 + }, + { + "epoch": 0.34949878081820646, + "grad_norm": 0.1039474606513977, + "learning_rate": 0.0001, + "loss": 1.6933, + "step": 645 + }, + { + "epoch": 0.3500406393931184, + "grad_norm": 0.10481411963701248, + "learning_rate": 0.0001, + "loss": 1.649, + "step": 646 + }, + { + "epoch": 0.35058249796803037, + "grad_norm": 0.10531273484230042, + "learning_rate": 0.0001, + "loss": 1.586, + "step": 647 + }, + { + "epoch": 0.3511243565429423, + "grad_norm": 0.11362781375646591, + "learning_rate": 0.0001, + "loss": 1.6792, + "step": 648 + }, + { + "epoch": 0.35166621511785423, + "grad_norm": 0.10828311741352081, + "learning_rate": 0.0001, + "loss": 1.7087, + "step": 649 + }, + { + "epoch": 0.3522080736927662, + "grad_norm": 0.11995867639780045, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 650 + }, + { + "epoch": 0.35274993226767815, + "grad_norm": 0.10260593146085739, + "learning_rate": 0.0001, + "loss": 1.617, + "step": 651 + }, + { + "epoch": 0.3532917908425901, + "grad_norm": 0.11493167281150818, + "learning_rate": 0.0001, + "loss": 1.6041, + "step": 652 + }, + { + "epoch": 0.353833649417502, + "grad_norm": 0.11218332499265671, + "learning_rate": 0.0001, + "loss": 1.6972, + "step": 653 + }, + { + "epoch": 0.35437550799241396, + "grad_norm": 0.12246052175760269, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 654 + }, + { + "epoch": 0.3549173665673259, + "grad_norm": 0.1194012314081192, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 655 + }, + { + "epoch": 0.3554592251422379, + "grad_norm": 0.172341987490654, + "learning_rate": 0.0001, + "loss": 1.7179, + "step": 656 + }, + { + "epoch": 0.35600108371714984, + "grad_norm": 0.09841404110193253, + "learning_rate": 0.0001, + "loss": 1.5871, + "step": 657 + }, + { + "epoch": 0.3565429422920618, + "grad_norm": 0.1325990855693817, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 658 + }, + { + "epoch": 0.3570848008669737, + "grad_norm": 0.11859223991632462, + "learning_rate": 0.0001, + "loss": 1.6563, + "step": 659 + }, + { + "epoch": 0.35762665944188565, + "grad_norm": 0.1360592097043991, + "learning_rate": 0.0001, + "loss": 1.6154, + "step": 660 + }, + { + "epoch": 0.3581685180167976, + "grad_norm": 0.10509878396987915, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 661 + }, + { + "epoch": 0.35871037659170957, + "grad_norm": 0.11457318812608719, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 662 + }, + { + "epoch": 0.3592522351666215, + "grad_norm": 0.10837317258119583, + "learning_rate": 0.0001, + "loss": 1.714, + "step": 663 + }, + { + "epoch": 0.3597940937415335, + "grad_norm": 0.1057453602552414, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 664 + }, + { + "epoch": 0.3603359523164454, + "grad_norm": 0.10376082360744476, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 665 + }, + { + "epoch": 0.36087781089135734, + "grad_norm": 0.11164995282888412, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 666 + }, + { + "epoch": 0.3614196694662693, + "grad_norm": 0.10537731647491455, + "learning_rate": 0.0001, + "loss": 1.73, + "step": 667 + }, + { + "epoch": 0.36196152804118126, + "grad_norm": 0.1094699501991272, + "learning_rate": 0.0001, + "loss": 1.6837, + "step": 668 + }, + { + "epoch": 0.3625033866160932, + "grad_norm": 0.09954968839883804, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 669 + }, + { + "epoch": 0.3630452451910052, + "grad_norm": 0.10807305574417114, + "learning_rate": 0.0001, + "loss": 1.5642, + "step": 670 + }, + { + "epoch": 0.3635871037659171, + "grad_norm": 0.11197733134031296, + "learning_rate": 0.0001, + "loss": 1.6585, + "step": 671 + }, + { + "epoch": 0.36412896234082903, + "grad_norm": 0.11459947377443314, + "learning_rate": 0.0001, + "loss": 1.6864, + "step": 672 + }, + { + "epoch": 0.364670820915741, + "grad_norm": 0.11452600359916687, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 673 + }, + { + "epoch": 0.36521267949065295, + "grad_norm": 0.10195346176624298, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 674 + }, + { + "epoch": 0.3657545380655649, + "grad_norm": 0.11506009101867676, + "learning_rate": 0.0001, + "loss": 1.693, + "step": 675 + }, + { + "epoch": 0.36629639664047686, + "grad_norm": 0.10550053417682648, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 676 + }, + { + "epoch": 0.36683825521538876, + "grad_norm": 0.11007264256477356, + "learning_rate": 0.0001, + "loss": 1.6777, + "step": 677 + }, + { + "epoch": 0.3673801137903007, + "grad_norm": 0.10047126561403275, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 678 + }, + { + "epoch": 0.3679219723652127, + "grad_norm": 0.11524063348770142, + "learning_rate": 0.0001, + "loss": 1.6924, + "step": 679 + }, + { + "epoch": 0.36846383094012464, + "grad_norm": 0.10718086361885071, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 680 + }, + { + "epoch": 0.3690056895150366, + "grad_norm": 0.10243549197912216, + "learning_rate": 0.0001, + "loss": 1.6088, + "step": 681 + }, + { + "epoch": 0.3695475480899485, + "grad_norm": 0.1065065935254097, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 682 + }, + { + "epoch": 0.37008940666486045, + "grad_norm": 0.10459452867507935, + "learning_rate": 0.0001, + "loss": 1.6894, + "step": 683 + }, + { + "epoch": 0.3706312652397724, + "grad_norm": 0.10934063792228699, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 684 + }, + { + "epoch": 0.37117312381468437, + "grad_norm": 0.10245130956172943, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 685 + }, + { + "epoch": 0.3717149823895963, + "grad_norm": 0.10711225867271423, + "learning_rate": 0.0001, + "loss": 1.6609, + "step": 686 + }, + { + "epoch": 0.3722568409645083, + "grad_norm": 0.10337112098932266, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 687 + }, + { + "epoch": 0.3727986995394202, + "grad_norm": 0.11192546039819717, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 688 + }, + { + "epoch": 0.37334055811433214, + "grad_norm": 0.10183734446763992, + "learning_rate": 0.0001, + "loss": 1.6134, + "step": 689 + }, + { + "epoch": 0.3738824166892441, + "grad_norm": 0.11607497930526733, + "learning_rate": 0.0001, + "loss": 1.6007, + "step": 690 + }, + { + "epoch": 0.37442427526415606, + "grad_norm": 0.10997983813285828, + "learning_rate": 0.0001, + "loss": 1.6589, + "step": 691 + }, + { + "epoch": 0.374966133839068, + "grad_norm": 0.1069745421409607, + "learning_rate": 0.0001, + "loss": 1.5989, + "step": 692 + }, + { + "epoch": 0.37550799241398, + "grad_norm": 0.1032508909702301, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 693 + }, + { + "epoch": 0.3760498509888919, + "grad_norm": 0.10387293994426727, + "learning_rate": 0.0001, + "loss": 1.6175, + "step": 694 + }, + { + "epoch": 0.37659170956380383, + "grad_norm": 0.10035529732704163, + "learning_rate": 0.0001, + "loss": 1.6991, + "step": 695 + }, + { + "epoch": 0.3771335681387158, + "grad_norm": 0.1046760082244873, + "learning_rate": 0.0001, + "loss": 1.6928, + "step": 696 + }, + { + "epoch": 0.37767542671362775, + "grad_norm": 0.10290009528398514, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 697 + }, + { + "epoch": 0.3782172852885397, + "grad_norm": 0.0992322564125061, + "learning_rate": 0.0001, + "loss": 1.667, + "step": 698 + }, + { + "epoch": 0.37875914386345166, + "grad_norm": 0.09865851700305939, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 699 + }, + { + "epoch": 0.37930100243836357, + "grad_norm": 0.09838226437568665, + "learning_rate": 0.0001, + "loss": 1.5628, + "step": 700 + }, + { + "epoch": 0.3798428610132755, + "grad_norm": 0.10080188512802124, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 701 + }, + { + "epoch": 0.3803847195881875, + "grad_norm": 0.10885747522115707, + "learning_rate": 0.0001, + "loss": 1.6453, + "step": 702 + }, + { + "epoch": 0.38092657816309944, + "grad_norm": 0.1085219532251358, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 703 + }, + { + "epoch": 0.3814684367380114, + "grad_norm": 0.10353940725326538, + "learning_rate": 0.0001, + "loss": 1.6174, + "step": 704 + }, + { + "epoch": 0.38201029531292335, + "grad_norm": 0.10917379707098007, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 705 + }, + { + "epoch": 0.38255215388783526, + "grad_norm": 0.09883071482181549, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 706 + }, + { + "epoch": 0.3830940124627472, + "grad_norm": 0.11153585463762283, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 707 + }, + { + "epoch": 0.38363587103765917, + "grad_norm": 0.11329423636198044, + "learning_rate": 0.0001, + "loss": 1.6635, + "step": 708 + }, + { + "epoch": 0.38417772961257113, + "grad_norm": 0.11220398545265198, + "learning_rate": 0.0001, + "loss": 1.7422, + "step": 709 + }, + { + "epoch": 0.3847195881874831, + "grad_norm": 0.09743146598339081, + "learning_rate": 0.0001, + "loss": 1.5827, + "step": 710 + }, + { + "epoch": 0.385261446762395, + "grad_norm": 0.11069986969232559, + "learning_rate": 0.0001, + "loss": 1.6659, + "step": 711 + }, + { + "epoch": 0.38580330533730695, + "grad_norm": 0.11028224229812622, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 712 + }, + { + "epoch": 0.3863451639122189, + "grad_norm": 0.107399120926857, + "learning_rate": 0.0001, + "loss": 1.7154, + "step": 713 + }, + { + "epoch": 0.38688702248713086, + "grad_norm": 0.10606145858764648, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 714 + }, + { + "epoch": 0.3874288810620428, + "grad_norm": 0.09998262673616409, + "learning_rate": 0.0001, + "loss": 1.6103, + "step": 715 + }, + { + "epoch": 0.3879707396369548, + "grad_norm": 0.10442587733268738, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 716 + }, + { + "epoch": 0.3885125982118667, + "grad_norm": 0.10855881869792938, + "learning_rate": 0.0001, + "loss": 1.5742, + "step": 717 + }, + { + "epoch": 0.38905445678677864, + "grad_norm": 0.10240405052900314, + "learning_rate": 0.0001, + "loss": 1.6525, + "step": 718 + }, + { + "epoch": 0.3895963153616906, + "grad_norm": 0.10467260330915451, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 719 + }, + { + "epoch": 0.39013817393660255, + "grad_norm": 0.10330167412757874, + "learning_rate": 0.0001, + "loss": 1.6536, + "step": 720 + }, + { + "epoch": 0.3906800325115145, + "grad_norm": 0.10810812562704086, + "learning_rate": 0.0001, + "loss": 1.659, + "step": 721 + }, + { + "epoch": 0.39122189108642647, + "grad_norm": 0.12047295272350311, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 722 + }, + { + "epoch": 0.39176374966133837, + "grad_norm": 0.09939727187156677, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 723 + }, + { + "epoch": 0.3923056082362503, + "grad_norm": 0.11785425990819931, + "learning_rate": 0.0001, + "loss": 1.6571, + "step": 724 + }, + { + "epoch": 0.3928474668111623, + "grad_norm": 0.09766566753387451, + "learning_rate": 0.0001, + "loss": 1.5397, + "step": 725 + }, + { + "epoch": 0.39338932538607424, + "grad_norm": 0.10398256778717041, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 726 + }, + { + "epoch": 0.3939311839609862, + "grad_norm": 0.11227762699127197, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 727 + }, + { + "epoch": 0.39447304253589816, + "grad_norm": 0.10695900768041611, + "learning_rate": 0.0001, + "loss": 1.581, + "step": 728 + }, + { + "epoch": 0.39501490111081006, + "grad_norm": 0.11811427026987076, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 729 + }, + { + "epoch": 0.395556759685722, + "grad_norm": 0.10070157796144485, + "learning_rate": 0.0001, + "loss": 1.6585, + "step": 730 + }, + { + "epoch": 0.396098618260634, + "grad_norm": 0.13773372769355774, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 731 + }, + { + "epoch": 0.39664047683554593, + "grad_norm": 0.10539493709802628, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 732 + }, + { + "epoch": 0.3971823354104579, + "grad_norm": 0.13001807034015656, + "learning_rate": 0.0001, + "loss": 1.6302, + "step": 733 + }, + { + "epoch": 0.39772419398536984, + "grad_norm": 0.10862381756305695, + "learning_rate": 0.0001, + "loss": 1.7, + "step": 734 + }, + { + "epoch": 0.39826605256028175, + "grad_norm": 0.14452537894248962, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 735 + }, + { + "epoch": 0.3988079111351937, + "grad_norm": 0.10327973961830139, + "learning_rate": 0.0001, + "loss": 1.6522, + "step": 736 + }, + { + "epoch": 0.39934976971010566, + "grad_norm": 0.13244539499282837, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 737 + }, + { + "epoch": 0.3998916282850176, + "grad_norm": 0.11024655401706696, + "learning_rate": 0.0001, + "loss": 1.6563, + "step": 738 + }, + { + "epoch": 0.4004334868599296, + "grad_norm": 0.13484923541545868, + "learning_rate": 0.0001, + "loss": 1.74, + "step": 739 + }, + { + "epoch": 0.4009753454348415, + "grad_norm": 0.12410994619131088, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 740 + }, + { + "epoch": 0.40151720400975344, + "grad_norm": 0.12810753285884857, + "learning_rate": 0.0001, + "loss": 1.6511, + "step": 741 + }, + { + "epoch": 0.4020590625846654, + "grad_norm": 0.12648116052150726, + "learning_rate": 0.0001, + "loss": 1.6104, + "step": 742 + }, + { + "epoch": 0.40260092115957735, + "grad_norm": 0.11238245666027069, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 743 + }, + { + "epoch": 0.4031427797344893, + "grad_norm": 0.11832413822412491, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 744 + }, + { + "epoch": 0.40368463830940127, + "grad_norm": 0.10859525948762894, + "learning_rate": 0.0001, + "loss": 1.6764, + "step": 745 + }, + { + "epoch": 0.40422649688431317, + "grad_norm": 0.11857281625270844, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 746 + }, + { + "epoch": 0.4047683554592251, + "grad_norm": 0.10530444979667664, + "learning_rate": 0.0001, + "loss": 1.6318, + "step": 747 + }, + { + "epoch": 0.4053102140341371, + "grad_norm": 0.11615604907274246, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 748 + }, + { + "epoch": 0.40585207260904904, + "grad_norm": 0.10375487059354782, + "learning_rate": 0.0001, + "loss": 1.5567, + "step": 749 + }, + { + "epoch": 0.406393931183961, + "grad_norm": 0.11461639404296875, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 750 + }, + { + "epoch": 0.40693578975887296, + "grad_norm": 0.10505295544862747, + "learning_rate": 0.0001, + "loss": 1.6822, + "step": 751 + }, + { + "epoch": 0.40747764833378486, + "grad_norm": 0.10666048526763916, + "learning_rate": 0.0001, + "loss": 1.5553, + "step": 752 + }, + { + "epoch": 0.4080195069086968, + "grad_norm": 0.10326603800058365, + "learning_rate": 0.0001, + "loss": 1.5992, + "step": 753 + }, + { + "epoch": 0.4085613654836088, + "grad_norm": 0.10881656408309937, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 754 + }, + { + "epoch": 0.40910322405852073, + "grad_norm": 0.11879541724920273, + "learning_rate": 0.0001, + "loss": 1.6566, + "step": 755 + }, + { + "epoch": 0.4096450826334327, + "grad_norm": 0.10726792365312576, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 756 + }, + { + "epoch": 0.41018694120834465, + "grad_norm": 0.12982334196567535, + "learning_rate": 0.0001, + "loss": 1.614, + "step": 757 + }, + { + "epoch": 0.41072879978325655, + "grad_norm": 0.10279403626918793, + "learning_rate": 0.0001, + "loss": 1.6958, + "step": 758 + }, + { + "epoch": 0.4112706583581685, + "grad_norm": 0.13861803710460663, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 759 + }, + { + "epoch": 0.41181251693308046, + "grad_norm": 0.1008225530385971, + "learning_rate": 0.0001, + "loss": 1.6515, + "step": 760 + }, + { + "epoch": 0.4123543755079924, + "grad_norm": 0.11922475695610046, + "learning_rate": 0.0001, + "loss": 1.6635, + "step": 761 + }, + { + "epoch": 0.4128962340829044, + "grad_norm": 0.10964781790971756, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 762 + }, + { + "epoch": 0.41343809265781634, + "grad_norm": 0.11899418383836746, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 763 + }, + { + "epoch": 0.41397995123272824, + "grad_norm": 0.10138355940580368, + "learning_rate": 0.0001, + "loss": 1.5951, + "step": 764 + }, + { + "epoch": 0.4145218098076402, + "grad_norm": 0.10140690952539444, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 765 + }, + { + "epoch": 0.41506366838255215, + "grad_norm": 0.21386539936065674, + "learning_rate": 0.0001, + "loss": 1.6831, + "step": 766 + }, + { + "epoch": 0.4156055269574641, + "grad_norm": 0.10448230057954788, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 767 + }, + { + "epoch": 0.41614738553237607, + "grad_norm": 0.12657177448272705, + "learning_rate": 0.0001, + "loss": 1.643, + "step": 768 + }, + { + "epoch": 0.41668924410728797, + "grad_norm": 0.10476984828710556, + "learning_rate": 0.0001, + "loss": 1.6288, + "step": 769 + }, + { + "epoch": 0.41723110268219993, + "grad_norm": 0.11148498952388763, + "learning_rate": 0.0001, + "loss": 1.7032, + "step": 770 + }, + { + "epoch": 0.4177729612571119, + "grad_norm": 0.11308016628026962, + "learning_rate": 0.0001, + "loss": 1.7415, + "step": 771 + }, + { + "epoch": 0.41831481983202384, + "grad_norm": 0.11244909465312958, + "learning_rate": 0.0001, + "loss": 1.6454, + "step": 772 + }, + { + "epoch": 0.4188566784069358, + "grad_norm": 0.11405385285615921, + "learning_rate": 0.0001, + "loss": 1.6767, + "step": 773 + }, + { + "epoch": 0.41939853698184776, + "grad_norm": 0.10433017462491989, + "learning_rate": 0.0001, + "loss": 1.641, + "step": 774 + }, + { + "epoch": 0.41994039555675966, + "grad_norm": 0.11652455478906631, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 775 + }, + { + "epoch": 0.4204822541316716, + "grad_norm": 0.1095375195145607, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 776 + }, + { + "epoch": 0.4210241127065836, + "grad_norm": 0.12977243959903717, + "learning_rate": 0.0001, + "loss": 1.6794, + "step": 777 + }, + { + "epoch": 0.42156597128149553, + "grad_norm": 0.10516102612018585, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 778 + }, + { + "epoch": 0.4221078298564075, + "grad_norm": 0.12051164358854294, + "learning_rate": 0.0001, + "loss": 1.7294, + "step": 779 + }, + { + "epoch": 0.42264968843131945, + "grad_norm": 0.1023736223578453, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 780 + }, + { + "epoch": 0.42319154700623135, + "grad_norm": 0.11457734555006027, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 781 + }, + { + "epoch": 0.4237334055811433, + "grad_norm": 0.11762605607509613, + "learning_rate": 0.0001, + "loss": 1.7058, + "step": 782 + }, + { + "epoch": 0.42427526415605527, + "grad_norm": 0.10679729282855988, + "learning_rate": 0.0001, + "loss": 1.5849, + "step": 783 + }, + { + "epoch": 0.4248171227309672, + "grad_norm": 0.11970070749521255, + "learning_rate": 0.0001, + "loss": 1.6607, + "step": 784 + }, + { + "epoch": 0.4253589813058792, + "grad_norm": 0.10416082292795181, + "learning_rate": 0.0001, + "loss": 1.6155, + "step": 785 + }, + { + "epoch": 0.42590083988079114, + "grad_norm": 0.12200033664703369, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 786 + }, + { + "epoch": 0.42644269845570304, + "grad_norm": 0.10045525431632996, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 787 + }, + { + "epoch": 0.426984557030615, + "grad_norm": 0.12497124820947647, + "learning_rate": 0.0001, + "loss": 1.6139, + "step": 788 + }, + { + "epoch": 0.42752641560552695, + "grad_norm": 0.10556544363498688, + "learning_rate": 0.0001, + "loss": 1.5744, + "step": 789 + }, + { + "epoch": 0.4280682741804389, + "grad_norm": 0.12096355110406876, + "learning_rate": 0.0001, + "loss": 1.6924, + "step": 790 + }, + { + "epoch": 0.42861013275535087, + "grad_norm": 0.10694337636232376, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 791 + }, + { + "epoch": 0.4291519913302628, + "grad_norm": 0.1038854718208313, + "learning_rate": 0.0001, + "loss": 1.6072, + "step": 792 + }, + { + "epoch": 0.42969384990517473, + "grad_norm": 0.11172711104154587, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 793 + }, + { + "epoch": 0.4302357084800867, + "grad_norm": 0.0996694564819336, + "learning_rate": 0.0001, + "loss": 1.5751, + "step": 794 + }, + { + "epoch": 0.43077756705499864, + "grad_norm": 0.11230441927909851, + "learning_rate": 0.0001, + "loss": 1.5449, + "step": 795 + }, + { + "epoch": 0.4313194256299106, + "grad_norm": 0.10604969412088394, + "learning_rate": 0.0001, + "loss": 1.6912, + "step": 796 + }, + { + "epoch": 0.43186128420482256, + "grad_norm": 0.10176172107458115, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 797 + }, + { + "epoch": 0.43240314277973446, + "grad_norm": 0.10164622962474823, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 798 + }, + { + "epoch": 0.4329450013546464, + "grad_norm": 0.10655150562524796, + "learning_rate": 0.0001, + "loss": 1.6499, + "step": 799 + }, + { + "epoch": 0.4334868599295584, + "grad_norm": 0.10253231972455978, + "learning_rate": 0.0001, + "loss": 1.7011, + "step": 800 + }, + { + "epoch": 0.43402871850447033, + "grad_norm": 0.10146262496709824, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 801 + }, + { + "epoch": 0.4345705770793823, + "grad_norm": 0.11094497889280319, + "learning_rate": 0.0001, + "loss": 1.6622, + "step": 802 + }, + { + "epoch": 0.43511243565429425, + "grad_norm": 0.1083596721291542, + "learning_rate": 0.0001, + "loss": 1.6403, + "step": 803 + }, + { + "epoch": 0.43565429422920615, + "grad_norm": 0.11484643816947937, + "learning_rate": 0.0001, + "loss": 1.7213, + "step": 804 + }, + { + "epoch": 0.4361961528041181, + "grad_norm": 0.10802962630987167, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 805 + }, + { + "epoch": 0.43673801137903007, + "grad_norm": 0.10771377384662628, + "learning_rate": 0.0001, + "loss": 1.7367, + "step": 806 + }, + { + "epoch": 0.437279869953942, + "grad_norm": 0.10246739536523819, + "learning_rate": 0.0001, + "loss": 1.6671, + "step": 807 + }, + { + "epoch": 0.437821728528854, + "grad_norm": 0.09854665398597717, + "learning_rate": 0.0001, + "loss": 1.5917, + "step": 808 + }, + { + "epoch": 0.43836358710376594, + "grad_norm": 0.1048893928527832, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 809 + }, + { + "epoch": 0.43890544567867784, + "grad_norm": 0.09986301511526108, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 810 + }, + { + "epoch": 0.4394473042535898, + "grad_norm": 0.10390371829271317, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 811 + }, + { + "epoch": 0.43998916282850176, + "grad_norm": 0.104039765894413, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 812 + }, + { + "epoch": 0.4405310214034137, + "grad_norm": 0.10459230840206146, + "learning_rate": 0.0001, + "loss": 1.6717, + "step": 813 + }, + { + "epoch": 0.44107287997832567, + "grad_norm": 0.10667424649000168, + "learning_rate": 0.0001, + "loss": 1.6662, + "step": 814 + }, + { + "epoch": 0.44161473855323763, + "grad_norm": 0.10749981552362442, + "learning_rate": 0.0001, + "loss": 1.6664, + "step": 815 + }, + { + "epoch": 0.44215659712814953, + "grad_norm": 0.10850109905004501, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 816 + }, + { + "epoch": 0.4426984557030615, + "grad_norm": 0.10601247102022171, + "learning_rate": 0.0001, + "loss": 1.6744, + "step": 817 + }, + { + "epoch": 0.44324031427797345, + "grad_norm": 0.10685880482196808, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 818 + }, + { + "epoch": 0.4437821728528854, + "grad_norm": 0.10400949418544769, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 819 + }, + { + "epoch": 0.44432403142779736, + "grad_norm": 0.117488332092762, + "learning_rate": 0.0001, + "loss": 1.7304, + "step": 820 + }, + { + "epoch": 0.4448658900027093, + "grad_norm": 0.1075533851981163, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 821 + }, + { + "epoch": 0.4454077485776212, + "grad_norm": 0.1162843331694603, + "learning_rate": 0.0001, + "loss": 1.679, + "step": 822 + }, + { + "epoch": 0.4459496071525332, + "grad_norm": 0.10688474774360657, + "learning_rate": 0.0001, + "loss": 1.5606, + "step": 823 + }, + { + "epoch": 0.44649146572744514, + "grad_norm": 0.1137152761220932, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 824 + }, + { + "epoch": 0.4470333243023571, + "grad_norm": 0.10783565044403076, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 825 + }, + { + "epoch": 0.44757518287726905, + "grad_norm": 0.11750900000333786, + "learning_rate": 0.0001, + "loss": 1.6704, + "step": 826 + }, + { + "epoch": 0.44811704145218095, + "grad_norm": 0.1140749379992485, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 827 + }, + { + "epoch": 0.4486589000270929, + "grad_norm": 0.10287487506866455, + "learning_rate": 0.0001, + "loss": 1.5714, + "step": 828 + }, + { + "epoch": 0.44920075860200487, + "grad_norm": 0.123508021235466, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 829 + }, + { + "epoch": 0.4497426171769168, + "grad_norm": 0.10874208062887192, + "learning_rate": 0.0001, + "loss": 1.5999, + "step": 830 + }, + { + "epoch": 0.4502844757518288, + "grad_norm": 0.12272533774375916, + "learning_rate": 0.0001, + "loss": 1.6757, + "step": 831 + }, + { + "epoch": 0.45082633432674074, + "grad_norm": 0.1147417426109314, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 832 + }, + { + "epoch": 0.45136819290165264, + "grad_norm": 0.10647159069776535, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 833 + }, + { + "epoch": 0.4519100514765646, + "grad_norm": 0.12309867888689041, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 834 + }, + { + "epoch": 0.45245191005147656, + "grad_norm": 0.11148115247488022, + "learning_rate": 0.0001, + "loss": 1.6481, + "step": 835 + }, + { + "epoch": 0.4529937686263885, + "grad_norm": 0.10988766700029373, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 836 + }, + { + "epoch": 0.4535356272013005, + "grad_norm": 0.1154412180185318, + "learning_rate": 0.0001, + "loss": 1.6701, + "step": 837 + }, + { + "epoch": 0.45407748577621243, + "grad_norm": 0.1137893870472908, + "learning_rate": 0.0001, + "loss": 1.6679, + "step": 838 + }, + { + "epoch": 0.45461934435112433, + "grad_norm": 0.10951977968215942, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 839 + }, + { + "epoch": 0.4551612029260363, + "grad_norm": 0.104254350066185, + "learning_rate": 0.0001, + "loss": 1.592, + "step": 840 + }, + { + "epoch": 0.45570306150094825, + "grad_norm": 0.10429386794567108, + "learning_rate": 0.0001, + "loss": 1.6041, + "step": 841 + }, + { + "epoch": 0.4562449200758602, + "grad_norm": 0.1142311543226242, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 842 + }, + { + "epoch": 0.45678677865077216, + "grad_norm": 0.10919320583343506, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 843 + }, + { + "epoch": 0.4573286372256841, + "grad_norm": 0.11463817954063416, + "learning_rate": 0.0001, + "loss": 1.6659, + "step": 844 + }, + { + "epoch": 0.457870495800596, + "grad_norm": 0.11288406699895859, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 845 + }, + { + "epoch": 0.458412354375508, + "grad_norm": 0.11115078628063202, + "learning_rate": 0.0001, + "loss": 1.5919, + "step": 846 + }, + { + "epoch": 0.45895421295041994, + "grad_norm": 0.12078949809074402, + "learning_rate": 0.0001, + "loss": 1.6096, + "step": 847 + }, + { + "epoch": 0.4594960715253319, + "grad_norm": 0.10990909487009048, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 848 + }, + { + "epoch": 0.46003793010024385, + "grad_norm": 0.1061215028166771, + "learning_rate": 0.0001, + "loss": 1.5879, + "step": 849 + }, + { + "epoch": 0.4605797886751558, + "grad_norm": 0.11332777887582779, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 850 + }, + { + "epoch": 0.4611216472500677, + "grad_norm": 0.1126977950334549, + "learning_rate": 0.0001, + "loss": 1.5694, + "step": 851 + }, + { + "epoch": 0.46166350582497967, + "grad_norm": 0.10927823930978775, + "learning_rate": 0.0001, + "loss": 1.6078, + "step": 852 + }, + { + "epoch": 0.4622053643998916, + "grad_norm": 0.11738968640565872, + "learning_rate": 0.0001, + "loss": 1.5666, + "step": 853 + }, + { + "epoch": 0.4627472229748036, + "grad_norm": 0.10907457768917084, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 854 + }, + { + "epoch": 0.46328908154971554, + "grad_norm": 0.12172666192054749, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 855 + }, + { + "epoch": 0.4638309401246275, + "grad_norm": 0.10580083727836609, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 856 + }, + { + "epoch": 0.4643727986995394, + "grad_norm": 0.13606320321559906, + "learning_rate": 0.0001, + "loss": 1.6021, + "step": 857 + }, + { + "epoch": 0.46491465727445136, + "grad_norm": 0.11144983768463135, + "learning_rate": 0.0001, + "loss": 1.6698, + "step": 858 + }, + { + "epoch": 0.4654565158493633, + "grad_norm": 0.11384718120098114, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 859 + }, + { + "epoch": 0.4659983744242753, + "grad_norm": 0.10568007081747055, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 860 + }, + { + "epoch": 0.46654023299918723, + "grad_norm": 0.12521082162857056, + "learning_rate": 0.0001, + "loss": 1.6502, + "step": 861 + }, + { + "epoch": 0.46708209157409913, + "grad_norm": 0.1050972193479538, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 862 + }, + { + "epoch": 0.4676239501490111, + "grad_norm": 0.11627109348773956, + "learning_rate": 0.0001, + "loss": 1.6502, + "step": 863 + }, + { + "epoch": 0.46816580872392305, + "grad_norm": 0.10397864878177643, + "learning_rate": 0.0001, + "loss": 1.5968, + "step": 864 + }, + { + "epoch": 0.468707667298835, + "grad_norm": 0.12034126371145248, + "learning_rate": 0.0001, + "loss": 1.6137, + "step": 865 + }, + { + "epoch": 0.46924952587374696, + "grad_norm": 0.10460563004016876, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 866 + }, + { + "epoch": 0.4697913844486589, + "grad_norm": 0.10565747320652008, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 867 + }, + { + "epoch": 0.4703332430235708, + "grad_norm": 0.10394787043333054, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 868 + }, + { + "epoch": 0.4708751015984828, + "grad_norm": 0.10103076696395874, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 869 + }, + { + "epoch": 0.47141696017339474, + "grad_norm": 0.10692822188138962, + "learning_rate": 0.0001, + "loss": 1.6152, + "step": 870 + }, + { + "epoch": 0.4719588187483067, + "grad_norm": 0.10232596099376678, + "learning_rate": 0.0001, + "loss": 1.5858, + "step": 871 + }, + { + "epoch": 0.47250067732321865, + "grad_norm": 0.11097247153520584, + "learning_rate": 0.0001, + "loss": 1.6053, + "step": 872 + }, + { + "epoch": 0.4730425358981306, + "grad_norm": 0.10421837866306305, + "learning_rate": 0.0001, + "loss": 1.6764, + "step": 873 + }, + { + "epoch": 0.4735843944730425, + "grad_norm": 0.10425719618797302, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 874 + }, + { + "epoch": 0.47412625304795447, + "grad_norm": 0.10385365039110184, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 875 + }, + { + "epoch": 0.47466811162286643, + "grad_norm": 0.11015225201845169, + "learning_rate": 0.0001, + "loss": 1.6498, + "step": 876 + }, + { + "epoch": 0.4752099701977784, + "grad_norm": 0.10669953376054764, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 877 + }, + { + "epoch": 0.47575182877269034, + "grad_norm": 0.10671980679035187, + "learning_rate": 0.0001, + "loss": 1.6483, + "step": 878 + }, + { + "epoch": 0.4762936873476023, + "grad_norm": 0.10373824834823608, + "learning_rate": 0.0001, + "loss": 1.5758, + "step": 879 + }, + { + "epoch": 0.4768355459225142, + "grad_norm": 0.10339619219303131, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 880 + }, + { + "epoch": 0.47737740449742616, + "grad_norm": 0.12287923693656921, + "learning_rate": 0.0001, + "loss": 1.6005, + "step": 881 + }, + { + "epoch": 0.4779192630723381, + "grad_norm": 0.11370906978845596, + "learning_rate": 0.0001, + "loss": 1.5958, + "step": 882 + }, + { + "epoch": 0.4784611216472501, + "grad_norm": 0.10995329171419144, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 883 + }, + { + "epoch": 0.47900298022216203, + "grad_norm": 0.11137809604406357, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 884 + }, + { + "epoch": 0.479544838797074, + "grad_norm": 0.11190925538539886, + "learning_rate": 0.0001, + "loss": 1.6146, + "step": 885 + }, + { + "epoch": 0.4800866973719859, + "grad_norm": 0.10670820623636246, + "learning_rate": 0.0001, + "loss": 1.6122, + "step": 886 + }, + { + "epoch": 0.48062855594689785, + "grad_norm": 0.12012673169374466, + "learning_rate": 0.0001, + "loss": 1.6197, + "step": 887 + }, + { + "epoch": 0.4811704145218098, + "grad_norm": 0.11470359563827515, + "learning_rate": 0.0001, + "loss": 1.6321, + "step": 888 + }, + { + "epoch": 0.48171227309672177, + "grad_norm": 0.1273769736289978, + "learning_rate": 0.0001, + "loss": 1.6864, + "step": 889 + }, + { + "epoch": 0.4822541316716337, + "grad_norm": 0.11515619605779648, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 890 + }, + { + "epoch": 0.4827959902465456, + "grad_norm": 0.12130913883447647, + "learning_rate": 0.0001, + "loss": 1.6166, + "step": 891 + }, + { + "epoch": 0.4833378488214576, + "grad_norm": 0.11784867197275162, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 892 + }, + { + "epoch": 0.48387970739636954, + "grad_norm": 0.10914217680692673, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 893 + }, + { + "epoch": 0.4844215659712815, + "grad_norm": 0.11672643572092056, + "learning_rate": 0.0001, + "loss": 1.7317, + "step": 894 + }, + { + "epoch": 0.48496342454619346, + "grad_norm": 0.11123480647802353, + "learning_rate": 0.0001, + "loss": 1.564, + "step": 895 + }, + { + "epoch": 0.4855052831211054, + "grad_norm": 0.10239136964082718, + "learning_rate": 0.0001, + "loss": 1.5774, + "step": 896 + }, + { + "epoch": 0.4860471416960173, + "grad_norm": 0.12416405975818634, + "learning_rate": 0.0001, + "loss": 1.7, + "step": 897 + }, + { + "epoch": 0.4865890002709293, + "grad_norm": 0.11070729047060013, + "learning_rate": 0.0001, + "loss": 1.6748, + "step": 898 + }, + { + "epoch": 0.48713085884584123, + "grad_norm": 0.13239996135234833, + "learning_rate": 0.0001, + "loss": 1.5977, + "step": 899 + }, + { + "epoch": 0.4876727174207532, + "grad_norm": 0.10215065628290176, + "learning_rate": 0.0001, + "loss": 1.5905, + "step": 900 + }, + { + "epoch": 0.48821457599566515, + "grad_norm": 0.12313413619995117, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 901 + }, + { + "epoch": 0.4887564345705771, + "grad_norm": 0.11618530005216599, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 902 + }, + { + "epoch": 0.489298293145489, + "grad_norm": 0.11647929251194, + "learning_rate": 0.0001, + "loss": 1.6813, + "step": 903 + }, + { + "epoch": 0.48984015172040096, + "grad_norm": 0.12761937081813812, + "learning_rate": 0.0001, + "loss": 1.6652, + "step": 904 + }, + { + "epoch": 0.4903820102953129, + "grad_norm": 0.13200801610946655, + "learning_rate": 0.0001, + "loss": 1.7227, + "step": 905 + }, + { + "epoch": 0.4909238688702249, + "grad_norm": 0.11139468103647232, + "learning_rate": 0.0001, + "loss": 1.6142, + "step": 906 + }, + { + "epoch": 0.49146572744513684, + "grad_norm": 0.11942821741104126, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 907 + }, + { + "epoch": 0.4920075860200488, + "grad_norm": 0.11830473691225052, + "learning_rate": 0.0001, + "loss": 1.6957, + "step": 908 + }, + { + "epoch": 0.4925494445949607, + "grad_norm": 0.12805293500423431, + "learning_rate": 0.0001, + "loss": 1.7355, + "step": 909 + }, + { + "epoch": 0.49309130316987265, + "grad_norm": 0.1151934266090393, + "learning_rate": 0.0001, + "loss": 1.5538, + "step": 910 + }, + { + "epoch": 0.4936331617447846, + "grad_norm": 0.11872893571853638, + "learning_rate": 0.0001, + "loss": 1.6597, + "step": 911 + }, + { + "epoch": 0.49417502031969657, + "grad_norm": 0.11597079783678055, + "learning_rate": 0.0001, + "loss": 1.683, + "step": 912 + }, + { + "epoch": 0.4947168788946085, + "grad_norm": 0.10476577281951904, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 913 + }, + { + "epoch": 0.4952587374695205, + "grad_norm": 0.12531496584415436, + "learning_rate": 0.0001, + "loss": 1.6894, + "step": 914 + }, + { + "epoch": 0.4958005960444324, + "grad_norm": 0.10528986901044846, + "learning_rate": 0.0001, + "loss": 1.6081, + "step": 915 + }, + { + "epoch": 0.49634245461934434, + "grad_norm": 0.12118838727474213, + "learning_rate": 0.0001, + "loss": 1.6453, + "step": 916 + }, + { + "epoch": 0.4968843131942563, + "grad_norm": 0.11182791739702225, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 917 + }, + { + "epoch": 0.49742617176916826, + "grad_norm": 0.11458185315132141, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 918 + }, + { + "epoch": 0.4979680303440802, + "grad_norm": 0.10385309159755707, + "learning_rate": 0.0001, + "loss": 1.5791, + "step": 919 + }, + { + "epoch": 0.4985098889189921, + "grad_norm": 0.112492136657238, + "learning_rate": 0.0001, + "loss": 1.5542, + "step": 920 + }, + { + "epoch": 0.4990517474939041, + "grad_norm": 0.10807826370000839, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 921 + }, + { + "epoch": 0.49959360606881603, + "grad_norm": 0.10996340960264206, + "learning_rate": 0.0001, + "loss": 1.5915, + "step": 922 + }, + { + "epoch": 0.5001354646437279, + "grad_norm": 0.12523028254508972, + "learning_rate": 0.0001, + "loss": 1.6721, + "step": 923 + }, + { + "epoch": 0.5006773232186399, + "grad_norm": 0.10851467400789261, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 924 + }, + { + "epoch": 0.5012191817935518, + "grad_norm": 0.12981802225112915, + "learning_rate": 0.0001, + "loss": 1.6243, + "step": 925 + }, + { + "epoch": 0.5017610403684638, + "grad_norm": 0.1098252385854721, + "learning_rate": 0.0001, + "loss": 1.6871, + "step": 926 + }, + { + "epoch": 0.5023028989433758, + "grad_norm": 0.1102382093667984, + "learning_rate": 0.0001, + "loss": 1.6117, + "step": 927 + }, + { + "epoch": 0.5028447575182877, + "grad_norm": 0.11334949731826782, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 928 + }, + { + "epoch": 0.5033866160931997, + "grad_norm": 0.12077392637729645, + "learning_rate": 0.0001, + "loss": 1.534, + "step": 929 + }, + { + "epoch": 0.5039284746681116, + "grad_norm": 0.12375643104314804, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 930 + }, + { + "epoch": 0.5044703332430236, + "grad_norm": 0.11033709347248077, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 931 + }, + { + "epoch": 0.5050121918179356, + "grad_norm": 0.14901308715343475, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 932 + }, + { + "epoch": 0.5055540503928475, + "grad_norm": 0.10846343636512756, + "learning_rate": 0.0001, + "loss": 1.6386, + "step": 933 + }, + { + "epoch": 0.5060959089677595, + "grad_norm": 0.13782711327075958, + "learning_rate": 0.0001, + "loss": 1.6564, + "step": 934 + }, + { + "epoch": 0.5066377675426713, + "grad_norm": 0.11779604107141495, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 935 + }, + { + "epoch": 0.5071796261175833, + "grad_norm": 0.11643415689468384, + "learning_rate": 0.0001, + "loss": 1.6694, + "step": 936 + }, + { + "epoch": 0.5077214846924952, + "grad_norm": 0.13498874008655548, + "learning_rate": 0.0001, + "loss": 1.5645, + "step": 937 + }, + { + "epoch": 0.5082633432674072, + "grad_norm": 0.11232909560203552, + "learning_rate": 0.0001, + "loss": 1.6849, + "step": 938 + }, + { + "epoch": 0.5088052018423191, + "grad_norm": 0.1520954966545105, + "learning_rate": 0.0001, + "loss": 1.7269, + "step": 939 + }, + { + "epoch": 0.5093470604172311, + "grad_norm": 0.10932721942663193, + "learning_rate": 0.0001, + "loss": 1.6174, + "step": 940 + }, + { + "epoch": 0.5098889189921431, + "grad_norm": 0.11859098821878433, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 941 + }, + { + "epoch": 0.510430777567055, + "grad_norm": 0.12220406532287598, + "learning_rate": 0.0001, + "loss": 1.6904, + "step": 942 + }, + { + "epoch": 0.510972636141967, + "grad_norm": 0.11712736636400223, + "learning_rate": 0.0001, + "loss": 1.6838, + "step": 943 + }, + { + "epoch": 0.5115144947168789, + "grad_norm": 0.12665888667106628, + "learning_rate": 0.0001, + "loss": 1.663, + "step": 944 + }, + { + "epoch": 0.5120563532917909, + "grad_norm": 0.109413743019104, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 945 + }, + { + "epoch": 0.5125982118667027, + "grad_norm": 0.11188017576932907, + "learning_rate": 0.0001, + "loss": 1.6092, + "step": 946 + }, + { + "epoch": 0.5131400704416147, + "grad_norm": 0.10553425550460815, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 947 + }, + { + "epoch": 0.5136819290165267, + "grad_norm": 0.11408665776252747, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 948 + }, + { + "epoch": 0.5142237875914386, + "grad_norm": 0.11511809378862381, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 949 + }, + { + "epoch": 0.5147656461663506, + "grad_norm": 0.12249071151018143, + "learning_rate": 0.0001, + "loss": 1.651, + "step": 950 + }, + { + "epoch": 0.5153075047412625, + "grad_norm": 0.11768020689487457, + "learning_rate": 0.0001, + "loss": 1.6488, + "step": 951 + }, + { + "epoch": 0.5158493633161745, + "grad_norm": 0.12663574516773224, + "learning_rate": 0.0001, + "loss": 1.5593, + "step": 952 + }, + { + "epoch": 0.5163912218910864, + "grad_norm": 0.11186866462230682, + "learning_rate": 0.0001, + "loss": 1.5988, + "step": 953 + }, + { + "epoch": 0.5169330804659984, + "grad_norm": 0.10449830442667007, + "learning_rate": 0.0001, + "loss": 1.6612, + "step": 954 + }, + { + "epoch": 0.5174749390409104, + "grad_norm": 0.11975737661123276, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 955 + }, + { + "epoch": 0.5180167976158223, + "grad_norm": 0.11905168741941452, + "learning_rate": 0.0001, + "loss": 1.6822, + "step": 956 + }, + { + "epoch": 0.5185586561907343, + "grad_norm": 0.10545483231544495, + "learning_rate": 0.0001, + "loss": 1.5699, + "step": 957 + }, + { + "epoch": 0.5191005147656461, + "grad_norm": 0.10853522270917892, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 958 + }, + { + "epoch": 0.5196423733405581, + "grad_norm": 0.12581409513950348, + "learning_rate": 0.0001, + "loss": 1.6801, + "step": 959 + }, + { + "epoch": 0.52018423191547, + "grad_norm": 0.10773199796676636, + "learning_rate": 0.0001, + "loss": 1.558, + "step": 960 + }, + { + "epoch": 0.520726090490382, + "grad_norm": 0.11740659922361374, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 961 + }, + { + "epoch": 0.521267949065294, + "grad_norm": 0.113717220723629, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 962 + }, + { + "epoch": 0.5218098076402059, + "grad_norm": 0.11157488822937012, + "learning_rate": 0.0001, + "loss": 1.6061, + "step": 963 + }, + { + "epoch": 0.5223516662151179, + "grad_norm": 0.11375845968723297, + "learning_rate": 0.0001, + "loss": 1.603, + "step": 964 + }, + { + "epoch": 0.5228935247900298, + "grad_norm": 0.10816894471645355, + "learning_rate": 0.0001, + "loss": 1.6872, + "step": 965 + }, + { + "epoch": 0.5234353833649418, + "grad_norm": 0.10593447834253311, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 966 + }, + { + "epoch": 0.5239772419398537, + "grad_norm": 0.1219218447804451, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 967 + }, + { + "epoch": 0.5245191005147657, + "grad_norm": 0.11132703721523285, + "learning_rate": 0.0001, + "loss": 1.6937, + "step": 968 + }, + { + "epoch": 0.5250609590896776, + "grad_norm": 0.12354977428913116, + "learning_rate": 0.0001, + "loss": 1.5715, + "step": 969 + }, + { + "epoch": 0.5256028176645895, + "grad_norm": 0.1068456694483757, + "learning_rate": 0.0001, + "loss": 1.5655, + "step": 970 + }, + { + "epoch": 0.5261446762395015, + "grad_norm": 0.12704645097255707, + "learning_rate": 0.0001, + "loss": 1.6729, + "step": 971 + }, + { + "epoch": 0.5266865348144134, + "grad_norm": 0.11148631572723389, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 972 + }, + { + "epoch": 0.5272283933893254, + "grad_norm": 0.13174760341644287, + "learning_rate": 0.0001, + "loss": 1.7158, + "step": 973 + }, + { + "epoch": 0.5277702519642373, + "grad_norm": 0.10785822570323944, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 974 + }, + { + "epoch": 0.5283121105391493, + "grad_norm": 0.13404607772827148, + "learning_rate": 0.0001, + "loss": 1.6306, + "step": 975 + }, + { + "epoch": 0.5288539691140612, + "grad_norm": 0.10916657000780106, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 976 + }, + { + "epoch": 0.5293958276889732, + "grad_norm": 0.11268014460802078, + "learning_rate": 0.0001, + "loss": 1.5499, + "step": 977 + }, + { + "epoch": 0.5299376862638852, + "grad_norm": 0.13467144966125488, + "learning_rate": 0.0001, + "loss": 1.6437, + "step": 978 + }, + { + "epoch": 0.5304795448387971, + "grad_norm": 0.12466847151517868, + "learning_rate": 0.0001, + "loss": 1.6565, + "step": 979 + }, + { + "epoch": 0.5310214034137091, + "grad_norm": 0.1344923973083496, + "learning_rate": 0.0001, + "loss": 1.61, + "step": 980 + }, + { + "epoch": 0.5315632619886209, + "grad_norm": 0.10806672275066376, + "learning_rate": 0.0001, + "loss": 1.5325, + "step": 981 + }, + { + "epoch": 0.5321051205635329, + "grad_norm": 0.1430986523628235, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 982 + }, + { + "epoch": 0.5326469791384448, + "grad_norm": 0.10808341205120087, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 983 + }, + { + "epoch": 0.5331888377133568, + "grad_norm": 0.11897061765193939, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 984 + }, + { + "epoch": 0.5337306962882687, + "grad_norm": 0.11438190191984177, + "learning_rate": 0.0001, + "loss": 1.525, + "step": 985 + }, + { + "epoch": 0.5342725548631807, + "grad_norm": 0.12540669739246368, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 986 + }, + { + "epoch": 0.5348144134380927, + "grad_norm": 0.12103486806154251, + "learning_rate": 0.0001, + "loss": 1.5627, + "step": 987 + }, + { + "epoch": 0.5353562720130046, + "grad_norm": 0.10832776874303818, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 988 + }, + { + "epoch": 0.5358981305879166, + "grad_norm": 0.13159573078155518, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 989 + }, + { + "epoch": 0.5364399891628285, + "grad_norm": 0.101260244846344, + "learning_rate": 0.0001, + "loss": 1.549, + "step": 990 + }, + { + "epoch": 0.5369818477377405, + "grad_norm": 0.12760812044143677, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 991 + }, + { + "epoch": 0.5375237063126524, + "grad_norm": 0.1126088872551918, + "learning_rate": 0.0001, + "loss": 1.5693, + "step": 992 + }, + { + "epoch": 0.5380655648875643, + "grad_norm": 0.13307423889636993, + "learning_rate": 0.0001, + "loss": 1.6957, + "step": 993 + }, + { + "epoch": 0.5386074234624763, + "grad_norm": 0.11326015740633011, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 994 + }, + { + "epoch": 0.5391492820373882, + "grad_norm": 0.11083459854125977, + "learning_rate": 0.0001, + "loss": 1.5401, + "step": 995 + }, + { + "epoch": 0.5396911406123002, + "grad_norm": 0.12092125415802002, + "learning_rate": 0.0001, + "loss": 1.6074, + "step": 996 + }, + { + "epoch": 0.5402329991872121, + "grad_norm": 0.11150885373353958, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 997 + }, + { + "epoch": 0.5407748577621241, + "grad_norm": 0.12295088917016983, + "learning_rate": 0.0001, + "loss": 1.64, + "step": 998 + }, + { + "epoch": 0.541316716337036, + "grad_norm": 0.1229565292596817, + "learning_rate": 0.0001, + "loss": 1.6828, + "step": 999 + }, + { + "epoch": 0.541858574911948, + "grad_norm": 0.13467469811439514, + "learning_rate": 0.0001, + "loss": 1.5319, + "step": 1000 + }, + { + "epoch": 0.54240043348686, + "grad_norm": 0.11792799085378647, + "learning_rate": 0.0001, + "loss": 1.6261, + "step": 1001 + }, + { + "epoch": 0.5429422920617719, + "grad_norm": 0.11050891876220703, + "learning_rate": 0.0001, + "loss": 1.512, + "step": 1002 + }, + { + "epoch": 0.5434841506366839, + "grad_norm": 0.11616943776607513, + "learning_rate": 0.0001, + "loss": 1.6496, + "step": 1003 + }, + { + "epoch": 0.5440260092115957, + "grad_norm": 0.10420012474060059, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 1004 + }, + { + "epoch": 0.5445678677865077, + "grad_norm": 0.1318942904472351, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 1005 + }, + { + "epoch": 0.5451097263614196, + "grad_norm": 0.1039443239569664, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 1006 + }, + { + "epoch": 0.5456515849363316, + "grad_norm": 0.11699001491069794, + "learning_rate": 0.0001, + "loss": 1.6031, + "step": 1007 + }, + { + "epoch": 0.5461934435112435, + "grad_norm": 0.11182098835706711, + "learning_rate": 0.0001, + "loss": 1.5891, + "step": 1008 + }, + { + "epoch": 0.5467353020861555, + "grad_norm": 0.13418853282928467, + "learning_rate": 0.0001, + "loss": 1.6667, + "step": 1009 + }, + { + "epoch": 0.5472771606610675, + "grad_norm": 0.11194122582674026, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 1010 + }, + { + "epoch": 0.5478190192359794, + "grad_norm": 0.11652825772762299, + "learning_rate": 0.0001, + "loss": 1.6053, + "step": 1011 + }, + { + "epoch": 0.5483608778108914, + "grad_norm": 0.11751655489206314, + "learning_rate": 0.0001, + "loss": 1.6098, + "step": 1012 + }, + { + "epoch": 0.5489027363858033, + "grad_norm": 0.1132623553276062, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 1013 + }, + { + "epoch": 0.5494445949607153, + "grad_norm": 0.1154278889298439, + "learning_rate": 0.0001, + "loss": 1.612, + "step": 1014 + }, + { + "epoch": 0.5499864535356273, + "grad_norm": 0.11338574439287186, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 1015 + }, + { + "epoch": 0.5505283121105391, + "grad_norm": 0.13384795188903809, + "learning_rate": 0.0001, + "loss": 1.6097, + "step": 1016 + }, + { + "epoch": 0.551070170685451, + "grad_norm": 0.1129109263420105, + "learning_rate": 0.0001, + "loss": 1.6028, + "step": 1017 + }, + { + "epoch": 0.551612029260363, + "grad_norm": 0.10652612894773483, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 1018 + }, + { + "epoch": 0.552153887835275, + "grad_norm": 0.12745411694049835, + "learning_rate": 0.0001, + "loss": 1.6464, + "step": 1019 + }, + { + "epoch": 0.5526957464101869, + "grad_norm": 0.11083333939313889, + "learning_rate": 0.0001, + "loss": 1.5846, + "step": 1020 + }, + { + "epoch": 0.5532376049850989, + "grad_norm": 0.11409562081098557, + "learning_rate": 0.0001, + "loss": 1.5939, + "step": 1021 + }, + { + "epoch": 0.5537794635600108, + "grad_norm": 0.1340702772140503, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 1022 + }, + { + "epoch": 0.5543213221349228, + "grad_norm": 0.11746427416801453, + "learning_rate": 0.0001, + "loss": 1.6509, + "step": 1023 + }, + { + "epoch": 0.5548631807098348, + "grad_norm": 0.11569317430257797, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 1024 + }, + { + "epoch": 0.5554050392847467, + "grad_norm": 0.10323930531740189, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 1025 + }, + { + "epoch": 0.5559468978596587, + "grad_norm": 0.1139453575015068, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 1026 + }, + { + "epoch": 0.5564887564345706, + "grad_norm": 0.11864421516656876, + "learning_rate": 0.0001, + "loss": 1.6012, + "step": 1027 + }, + { + "epoch": 0.5570306150094825, + "grad_norm": 0.11720296740531921, + "learning_rate": 0.0001, + "loss": 1.688, + "step": 1028 + }, + { + "epoch": 0.5575724735843944, + "grad_norm": 0.11221726983785629, + "learning_rate": 0.0001, + "loss": 1.664, + "step": 1029 + }, + { + "epoch": 0.5581143321593064, + "grad_norm": 0.1101880744099617, + "learning_rate": 0.0001, + "loss": 1.5422, + "step": 1030 + }, + { + "epoch": 0.5586561907342183, + "grad_norm": 0.1299498975276947, + "learning_rate": 0.0001, + "loss": 1.598, + "step": 1031 + }, + { + "epoch": 0.5591980493091303, + "grad_norm": 0.10887090861797333, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 1032 + }, + { + "epoch": 0.5597399078840423, + "grad_norm": 0.11657392233610153, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 1033 + }, + { + "epoch": 0.5602817664589542, + "grad_norm": 0.11606595665216446, + "learning_rate": 0.0001, + "loss": 1.6024, + "step": 1034 + }, + { + "epoch": 0.5608236250338662, + "grad_norm": 0.10020414739847183, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 1035 + }, + { + "epoch": 0.5613654836087781, + "grad_norm": 0.11371590197086334, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 1036 + }, + { + "epoch": 0.5619073421836901, + "grad_norm": 0.10955478996038437, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 1037 + }, + { + "epoch": 0.562449200758602, + "grad_norm": 0.10717011988162994, + "learning_rate": 0.0001, + "loss": 1.5618, + "step": 1038 + }, + { + "epoch": 0.5629910593335139, + "grad_norm": 0.1138799712061882, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 1039 + }, + { + "epoch": 0.5635329179084259, + "grad_norm": 0.10846222937107086, + "learning_rate": 0.0001, + "loss": 1.7128, + "step": 1040 + }, + { + "epoch": 0.5640747764833378, + "grad_norm": 0.11552631109952927, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 1041 + }, + { + "epoch": 0.5646166350582498, + "grad_norm": 0.10366268455982208, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 1042 + }, + { + "epoch": 0.5651584936331617, + "grad_norm": 0.10912511497735977, + "learning_rate": 0.0001, + "loss": 1.577, + "step": 1043 + }, + { + "epoch": 0.5657003522080737, + "grad_norm": 0.11409895122051239, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 1044 + }, + { + "epoch": 0.5662422107829856, + "grad_norm": 0.1048092395067215, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 1045 + }, + { + "epoch": 0.5667840693578976, + "grad_norm": 0.11702732741832733, + "learning_rate": 0.0001, + "loss": 1.6636, + "step": 1046 + }, + { + "epoch": 0.5673259279328096, + "grad_norm": 0.11176422983407974, + "learning_rate": 0.0001, + "loss": 1.6681, + "step": 1047 + }, + { + "epoch": 0.5678677865077215, + "grad_norm": 0.11163422465324402, + "learning_rate": 0.0001, + "loss": 1.5839, + "step": 1048 + }, + { + "epoch": 0.5684096450826335, + "grad_norm": 0.10756085067987442, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 1049 + }, + { + "epoch": 0.5689515036575454, + "grad_norm": 0.10789232701063156, + "learning_rate": 0.0001, + "loss": 1.6125, + "step": 1050 + }, + { + "epoch": 0.5694933622324573, + "grad_norm": 0.10716675966978073, + "learning_rate": 0.0001, + "loss": 1.5847, + "step": 1051 + }, + { + "epoch": 0.5700352208073692, + "grad_norm": 0.11019588261842728, + "learning_rate": 0.0001, + "loss": 1.6325, + "step": 1052 + }, + { + "epoch": 0.5705770793822812, + "grad_norm": 0.10888416320085526, + "learning_rate": 0.0001, + "loss": 1.612, + "step": 1053 + }, + { + "epoch": 0.5711189379571932, + "grad_norm": 0.10670910775661469, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 1054 + }, + { + "epoch": 0.5716607965321051, + "grad_norm": 0.105767622590065, + "learning_rate": 0.0001, + "loss": 1.6084, + "step": 1055 + }, + { + "epoch": 0.5722026551070171, + "grad_norm": 0.10374841094017029, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 1056 + }, + { + "epoch": 0.572744513681929, + "grad_norm": 0.1148417666554451, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 1057 + }, + { + "epoch": 0.573286372256841, + "grad_norm": 0.10581746697425842, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 1058 + }, + { + "epoch": 0.5738282308317529, + "grad_norm": 0.11768446117639542, + "learning_rate": 0.0001, + "loss": 1.587, + "step": 1059 + }, + { + "epoch": 0.5743700894066649, + "grad_norm": 0.10721245408058167, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 1060 + }, + { + "epoch": 0.5749119479815769, + "grad_norm": 0.11277879774570465, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 1061 + }, + { + "epoch": 0.5754538065564888, + "grad_norm": 0.11104092746973038, + "learning_rate": 0.0001, + "loss": 1.6965, + "step": 1062 + }, + { + "epoch": 0.5759956651314007, + "grad_norm": 0.11404809355735779, + "learning_rate": 0.0001, + "loss": 1.538, + "step": 1063 + }, + { + "epoch": 0.5765375237063126, + "grad_norm": 0.10597414523363113, + "learning_rate": 0.0001, + "loss": 1.6545, + "step": 1064 + }, + { + "epoch": 0.5770793822812246, + "grad_norm": 0.11097580939531326, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 1065 + }, + { + "epoch": 0.5776212408561365, + "grad_norm": 0.1147984117269516, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 1066 + }, + { + "epoch": 0.5781630994310485, + "grad_norm": 0.132141575217247, + "learning_rate": 0.0001, + "loss": 1.689, + "step": 1067 + }, + { + "epoch": 0.5787049580059604, + "grad_norm": 0.1056254580616951, + "learning_rate": 0.0001, + "loss": 1.6023, + "step": 1068 + }, + { + "epoch": 0.5792468165808724, + "grad_norm": 0.11850099265575409, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 1069 + }, + { + "epoch": 0.5797886751557844, + "grad_norm": 0.1169082298874855, + "learning_rate": 0.0001, + "loss": 1.5843, + "step": 1070 + }, + { + "epoch": 0.5803305337306963, + "grad_norm": 0.11147496849298477, + "learning_rate": 0.0001, + "loss": 1.5971, + "step": 1071 + }, + { + "epoch": 0.5808723923056083, + "grad_norm": 0.11038046330213547, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 1072 + }, + { + "epoch": 0.5814142508805202, + "grad_norm": 0.10539822280406952, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 1073 + }, + { + "epoch": 0.5819561094554321, + "grad_norm": 0.12041206657886505, + "learning_rate": 0.0001, + "loss": 1.6747, + "step": 1074 + }, + { + "epoch": 0.582497968030344, + "grad_norm": 0.11082960665225983, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 1075 + }, + { + "epoch": 0.583039826605256, + "grad_norm": 0.12492961436510086, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 1076 + }, + { + "epoch": 0.583581685180168, + "grad_norm": 0.11056699603796005, + "learning_rate": 0.0001, + "loss": 1.5725, + "step": 1077 + }, + { + "epoch": 0.5841235437550799, + "grad_norm": 0.11277845501899719, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 1078 + }, + { + "epoch": 0.5846654023299919, + "grad_norm": 0.1074712872505188, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 1079 + }, + { + "epoch": 0.5852072609049038, + "grad_norm": 0.10993622243404388, + "learning_rate": 0.0001, + "loss": 1.5977, + "step": 1080 + }, + { + "epoch": 0.5857491194798158, + "grad_norm": 0.11170591413974762, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 1081 + }, + { + "epoch": 0.5862909780547277, + "grad_norm": 0.1051788404583931, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 1082 + }, + { + "epoch": 0.5868328366296397, + "grad_norm": 0.11397905647754669, + "learning_rate": 0.0001, + "loss": 1.6512, + "step": 1083 + }, + { + "epoch": 0.5873746952045517, + "grad_norm": 0.11181167513132095, + "learning_rate": 0.0001, + "loss": 1.6593, + "step": 1084 + }, + { + "epoch": 0.5879165537794636, + "grad_norm": 0.1070471853017807, + "learning_rate": 0.0001, + "loss": 1.6699, + "step": 1085 + }, + { + "epoch": 0.5884584123543755, + "grad_norm": 0.11638469249010086, + "learning_rate": 0.0001, + "loss": 1.5877, + "step": 1086 + }, + { + "epoch": 0.5890002709292874, + "grad_norm": 0.10291191935539246, + "learning_rate": 0.0001, + "loss": 1.6169, + "step": 1087 + }, + { + "epoch": 0.5895421295041994, + "grad_norm": 0.10486762970685959, + "learning_rate": 0.0001, + "loss": 1.5439, + "step": 1088 + }, + { + "epoch": 0.5900839880791113, + "grad_norm": 0.12095001339912415, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 1089 + }, + { + "epoch": 0.5906258466540233, + "grad_norm": 0.1037125512957573, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 1090 + }, + { + "epoch": 0.5911677052289352, + "grad_norm": 0.11337342858314514, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 1091 + }, + { + "epoch": 0.5917095638038472, + "grad_norm": 0.11184096336364746, + "learning_rate": 0.0001, + "loss": 1.6075, + "step": 1092 + }, + { + "epoch": 0.5922514223787592, + "grad_norm": 0.1075851172208786, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 1093 + }, + { + "epoch": 0.5927932809536711, + "grad_norm": 0.11272459477186203, + "learning_rate": 0.0001, + "loss": 1.5801, + "step": 1094 + }, + { + "epoch": 0.5933351395285831, + "grad_norm": 0.1100969910621643, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 1095 + }, + { + "epoch": 0.593876998103495, + "grad_norm": 0.10581720620393753, + "learning_rate": 0.0001, + "loss": 1.5741, + "step": 1096 + }, + { + "epoch": 0.5944188566784069, + "grad_norm": 0.11307034641504288, + "learning_rate": 0.0001, + "loss": 1.5815, + "step": 1097 + }, + { + "epoch": 0.5949607152533188, + "grad_norm": 0.1128588542342186, + "learning_rate": 0.0001, + "loss": 1.6952, + "step": 1098 + }, + { + "epoch": 0.5955025738282308, + "grad_norm": 0.11140985786914825, + "learning_rate": 0.0001, + "loss": 1.6055, + "step": 1099 + }, + { + "epoch": 0.5960444324031428, + "grad_norm": 0.11232408136129379, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 1100 + }, + { + "epoch": 0.5965862909780547, + "grad_norm": 0.12389969825744629, + "learning_rate": 0.0001, + "loss": 1.6926, + "step": 1101 + }, + { + "epoch": 0.5971281495529667, + "grad_norm": 0.11053042858839035, + "learning_rate": 0.0001, + "loss": 1.5399, + "step": 1102 + }, + { + "epoch": 0.5976700081278786, + "grad_norm": 0.11876285076141357, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 1103 + }, + { + "epoch": 0.5982118667027906, + "grad_norm": 0.11597663164138794, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 1104 + }, + { + "epoch": 0.5987537252777025, + "grad_norm": 0.11426155269145966, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 1105 + }, + { + "epoch": 0.5992955838526145, + "grad_norm": 0.13617175817489624, + "learning_rate": 0.0001, + "loss": 1.6879, + "step": 1106 + }, + { + "epoch": 0.5998374424275265, + "grad_norm": 0.12879601120948792, + "learning_rate": 0.0001, + "loss": 1.6476, + "step": 1107 + }, + { + "epoch": 0.6003793010024384, + "grad_norm": 0.11163545399904251, + "learning_rate": 0.0001, + "loss": 1.6301, + "step": 1108 + }, + { + "epoch": 0.6009211595773503, + "grad_norm": 0.10743013024330139, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 1109 + }, + { + "epoch": 0.6014630181522622, + "grad_norm": 0.1111200824379921, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 1110 + }, + { + "epoch": 0.6020048767271742, + "grad_norm": 0.11115365475416183, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 1111 + }, + { + "epoch": 0.6025467353020861, + "grad_norm": 0.10575967282056808, + "learning_rate": 0.0001, + "loss": 1.5795, + "step": 1112 + }, + { + "epoch": 0.6030885938769981, + "grad_norm": 0.11594882607460022, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 1113 + }, + { + "epoch": 0.60363045245191, + "grad_norm": 0.1093570813536644, + "learning_rate": 0.0001, + "loss": 1.5788, + "step": 1114 + }, + { + "epoch": 0.604172311026822, + "grad_norm": 0.10965988039970398, + "learning_rate": 0.0001, + "loss": 1.6068, + "step": 1115 + }, + { + "epoch": 0.604714169601734, + "grad_norm": 0.12208772450685501, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 1116 + }, + { + "epoch": 0.6052560281766459, + "grad_norm": 0.10870245844125748, + "learning_rate": 0.0001, + "loss": 1.602, + "step": 1117 + }, + { + "epoch": 0.6057978867515579, + "grad_norm": 0.12144634127616882, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 1118 + }, + { + "epoch": 0.6063397453264698, + "grad_norm": 0.11317051947116852, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 1119 + }, + { + "epoch": 0.6068816039013818, + "grad_norm": 0.12039986252784729, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 1120 + }, + { + "epoch": 0.6074234624762936, + "grad_norm": 0.1101534515619278, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 1121 + }, + { + "epoch": 0.6079653210512056, + "grad_norm": 0.11682349443435669, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 1122 + }, + { + "epoch": 0.6085071796261176, + "grad_norm": 0.10640917718410492, + "learning_rate": 0.0001, + "loss": 1.5877, + "step": 1123 + }, + { + "epoch": 0.6090490382010295, + "grad_norm": 0.10861624032258987, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 1124 + }, + { + "epoch": 0.6095908967759415, + "grad_norm": 0.10645128786563873, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 1125 + }, + { + "epoch": 0.6101327553508534, + "grad_norm": 0.10756590217351913, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 1126 + }, + { + "epoch": 0.6106746139257654, + "grad_norm": 0.10730548202991486, + "learning_rate": 0.0001, + "loss": 1.5956, + "step": 1127 + }, + { + "epoch": 0.6112164725006773, + "grad_norm": 0.10790326446294785, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 1128 + }, + { + "epoch": 0.6117583310755893, + "grad_norm": 0.10814284533262253, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 1129 + }, + { + "epoch": 0.6123001896505013, + "grad_norm": 0.11095184832811356, + "learning_rate": 0.0001, + "loss": 1.6556, + "step": 1130 + }, + { + "epoch": 0.6128420482254132, + "grad_norm": 0.11202125251293182, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 1131 + }, + { + "epoch": 0.6133839068003251, + "grad_norm": 0.1061285063624382, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 1132 + }, + { + "epoch": 0.613925765375237, + "grad_norm": 0.11121700704097748, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 1133 + }, + { + "epoch": 0.614467623950149, + "grad_norm": 0.11035364866256714, + "learning_rate": 0.0001, + "loss": 1.6682, + "step": 1134 + }, + { + "epoch": 0.6150094825250609, + "grad_norm": 0.11122874170541763, + "learning_rate": 0.0001, + "loss": 1.6271, + "step": 1135 + }, + { + "epoch": 0.6155513410999729, + "grad_norm": 0.10672580450773239, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 1136 + }, + { + "epoch": 0.6160931996748849, + "grad_norm": 0.11283120512962341, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 1137 + }, + { + "epoch": 0.6166350582497968, + "grad_norm": 0.10613071173429489, + "learning_rate": 0.0001, + "loss": 1.5301, + "step": 1138 + }, + { + "epoch": 0.6171769168247088, + "grad_norm": 0.11445633322000504, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 1139 + }, + { + "epoch": 0.6177187753996207, + "grad_norm": 0.10982243716716766, + "learning_rate": 0.0001, + "loss": 1.599, + "step": 1140 + }, + { + "epoch": 0.6182606339745327, + "grad_norm": 0.11763114482164383, + "learning_rate": 0.0001, + "loss": 1.7074, + "step": 1141 + }, + { + "epoch": 0.6188024925494446, + "grad_norm": 0.11409911513328552, + "learning_rate": 0.0001, + "loss": 1.609, + "step": 1142 + }, + { + "epoch": 0.6193443511243566, + "grad_norm": 0.12272528558969498, + "learning_rate": 0.0001, + "loss": 1.5739, + "step": 1143 + }, + { + "epoch": 0.6198862096992684, + "grad_norm": 0.10473363101482391, + "learning_rate": 0.0001, + "loss": 1.6001, + "step": 1144 + }, + { + "epoch": 0.6204280682741804, + "grad_norm": 0.10619332641363144, + "learning_rate": 0.0001, + "loss": 1.5967, + "step": 1145 + }, + { + "epoch": 0.6209699268490924, + "grad_norm": 0.11708606034517288, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 1146 + }, + { + "epoch": 0.6215117854240043, + "grad_norm": 0.11772305518388748, + "learning_rate": 0.0001, + "loss": 1.6009, + "step": 1147 + }, + { + "epoch": 0.6220536439989163, + "grad_norm": 0.109929159283638, + "learning_rate": 0.0001, + "loss": 1.6116, + "step": 1148 + }, + { + "epoch": 0.6225955025738282, + "grad_norm": 0.11920684576034546, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 1149 + }, + { + "epoch": 0.6231373611487402, + "grad_norm": 0.1033051609992981, + "learning_rate": 0.0001, + "loss": 1.5432, + "step": 1150 + }, + { + "epoch": 0.6236792197236521, + "grad_norm": 0.10906940698623657, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 1151 + }, + { + "epoch": 0.6242210782985641, + "grad_norm": 0.10953330248594284, + "learning_rate": 0.0001, + "loss": 1.5654, + "step": 1152 + }, + { + "epoch": 0.6247629368734761, + "grad_norm": 0.11275649815797806, + "learning_rate": 0.0001, + "loss": 1.6157, + "step": 1153 + }, + { + "epoch": 0.625304795448388, + "grad_norm": 0.1148492619395256, + "learning_rate": 0.0001, + "loss": 1.6836, + "step": 1154 + }, + { + "epoch": 0.6258466540232999, + "grad_norm": 0.10278470069169998, + "learning_rate": 0.0001, + "loss": 1.5747, + "step": 1155 + }, + { + "epoch": 0.6263885125982118, + "grad_norm": 0.12111508101224899, + "learning_rate": 0.0001, + "loss": 1.5517, + "step": 1156 + }, + { + "epoch": 0.6269303711731238, + "grad_norm": 0.10696611553430557, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 1157 + }, + { + "epoch": 0.6274722297480357, + "grad_norm": 0.1212284117937088, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 1158 + }, + { + "epoch": 0.6280140883229477, + "grad_norm": 0.10886389762163162, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 1159 + }, + { + "epoch": 0.6285559468978597, + "grad_norm": 0.11113671213388443, + "learning_rate": 0.0001, + "loss": 1.5473, + "step": 1160 + }, + { + "epoch": 0.6290978054727716, + "grad_norm": 0.10953107476234436, + "learning_rate": 0.0001, + "loss": 1.603, + "step": 1161 + }, + { + "epoch": 0.6296396640476836, + "grad_norm": 0.1181061714887619, + "learning_rate": 0.0001, + "loss": 1.6897, + "step": 1162 + }, + { + "epoch": 0.6301815226225955, + "grad_norm": 0.12265321612358093, + "learning_rate": 0.0001, + "loss": 1.6021, + "step": 1163 + }, + { + "epoch": 0.6307233811975075, + "grad_norm": 0.11034665256738663, + "learning_rate": 0.0001, + "loss": 1.681, + "step": 1164 + }, + { + "epoch": 0.6312652397724194, + "grad_norm": 0.10766472667455673, + "learning_rate": 0.0001, + "loss": 1.5982, + "step": 1165 + }, + { + "epoch": 0.6318070983473314, + "grad_norm": 0.12280579656362534, + "learning_rate": 0.0001, + "loss": 1.6164, + "step": 1166 + }, + { + "epoch": 0.6323489569222432, + "grad_norm": 0.10680815577507019, + "learning_rate": 0.0001, + "loss": 1.6087, + "step": 1167 + }, + { + "epoch": 0.6328908154971552, + "grad_norm": 0.11566875129938126, + "learning_rate": 0.0001, + "loss": 1.536, + "step": 1168 + }, + { + "epoch": 0.6334326740720672, + "grad_norm": 0.12584605813026428, + "learning_rate": 0.0001, + "loss": 1.6341, + "step": 1169 + }, + { + "epoch": 0.6339745326469791, + "grad_norm": 0.12214954942464828, + "learning_rate": 0.0001, + "loss": 1.6041, + "step": 1170 + }, + { + "epoch": 0.6345163912218911, + "grad_norm": 0.12063132226467133, + "learning_rate": 0.0001, + "loss": 1.6083, + "step": 1171 + }, + { + "epoch": 0.635058249796803, + "grad_norm": 0.11362355202436447, + "learning_rate": 0.0001, + "loss": 1.5818, + "step": 1172 + }, + { + "epoch": 0.635600108371715, + "grad_norm": 0.11989773064851761, + "learning_rate": 0.0001, + "loss": 1.6392, + "step": 1173 + }, + { + "epoch": 0.636141966946627, + "grad_norm": 0.1103493869304657, + "learning_rate": 0.0001, + "loss": 1.5539, + "step": 1174 + }, + { + "epoch": 0.6366838255215389, + "grad_norm": 0.11394510418176651, + "learning_rate": 0.0001, + "loss": 1.7143, + "step": 1175 + }, + { + "epoch": 0.6372256840964509, + "grad_norm": 0.11421728134155273, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 1176 + }, + { + "epoch": 0.6377675426713628, + "grad_norm": 0.11668463051319122, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 1177 + }, + { + "epoch": 0.6383094012462748, + "grad_norm": 0.10839111357927322, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 1178 + }, + { + "epoch": 0.6388512598211866, + "grad_norm": 0.11521162837743759, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 1179 + }, + { + "epoch": 0.6393931183960986, + "grad_norm": 0.11129699647426605, + "learning_rate": 0.0001, + "loss": 1.5996, + "step": 1180 + }, + { + "epoch": 0.6399349769710105, + "grad_norm": 0.11090409010648727, + "learning_rate": 0.0001, + "loss": 1.6005, + "step": 1181 + }, + { + "epoch": 0.6404768355459225, + "grad_norm": 0.11323232203722, + "learning_rate": 0.0001, + "loss": 1.642, + "step": 1182 + }, + { + "epoch": 0.6410186941208345, + "grad_norm": 0.10628697276115417, + "learning_rate": 0.0001, + "loss": 1.6099, + "step": 1183 + }, + { + "epoch": 0.6415605526957464, + "grad_norm": 0.11381271481513977, + "learning_rate": 0.0001, + "loss": 1.6332, + "step": 1184 + }, + { + "epoch": 0.6421024112706584, + "grad_norm": 0.114079050719738, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 1185 + }, + { + "epoch": 0.6426442698455703, + "grad_norm": 0.10727323591709137, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 1186 + }, + { + "epoch": 0.6431861284204823, + "grad_norm": 0.11506172269582748, + "learning_rate": 0.0001, + "loss": 1.5872, + "step": 1187 + }, + { + "epoch": 0.6437279869953942, + "grad_norm": 0.11387591063976288, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 1188 + }, + { + "epoch": 0.6442698455703062, + "grad_norm": 0.11320862174034119, + "learning_rate": 0.0001, + "loss": 1.6238, + "step": 1189 + }, + { + "epoch": 0.644811704145218, + "grad_norm": 0.12005679309368134, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 1190 + }, + { + "epoch": 0.64535356272013, + "grad_norm": 0.11804357171058655, + "learning_rate": 0.0001, + "loss": 1.5987, + "step": 1191 + }, + { + "epoch": 0.645895421295042, + "grad_norm": 0.13115650415420532, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 1192 + }, + { + "epoch": 0.6464372798699539, + "grad_norm": 0.12141333520412445, + "learning_rate": 0.0001, + "loss": 1.5315, + "step": 1193 + }, + { + "epoch": 0.6469791384448659, + "grad_norm": 0.11301975697278976, + "learning_rate": 0.0001, + "loss": 1.5603, + "step": 1194 + }, + { + "epoch": 0.6475209970197778, + "grad_norm": 0.10810407996177673, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 1195 + }, + { + "epoch": 0.6480628555946898, + "grad_norm": 0.1332971453666687, + "learning_rate": 0.0001, + "loss": 1.5945, + "step": 1196 + }, + { + "epoch": 0.6486047141696017, + "grad_norm": 0.11277639120817184, + "learning_rate": 0.0001, + "loss": 1.5758, + "step": 1197 + }, + { + "epoch": 0.6491465727445137, + "grad_norm": 0.12171521037817001, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 1198 + }, + { + "epoch": 0.6496884313194257, + "grad_norm": 0.10795322060585022, + "learning_rate": 0.0001, + "loss": 1.6728, + "step": 1199 + }, + { + "epoch": 0.6502302898943376, + "grad_norm": 0.10944177210330963, + "learning_rate": 0.0001, + "loss": 1.6788, + "step": 1200 + }, + { + "epoch": 0.6507721484692496, + "grad_norm": 0.11161891371011734, + "learning_rate": 0.0001, + "loss": 1.595, + "step": 1201 + }, + { + "epoch": 0.6513140070441614, + "grad_norm": 0.11086007207632065, + "learning_rate": 0.0001, + "loss": 1.6029, + "step": 1202 + }, + { + "epoch": 0.6518558656190734, + "grad_norm": 0.10901938378810883, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 1203 + }, + { + "epoch": 0.6523977241939853, + "grad_norm": 0.11241175979375839, + "learning_rate": 0.0001, + "loss": 1.548, + "step": 1204 + }, + { + "epoch": 0.6529395827688973, + "grad_norm": 0.11044015735387802, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 1205 + }, + { + "epoch": 0.6534814413438093, + "grad_norm": 0.10838108509778976, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 1206 + }, + { + "epoch": 0.6540232999187212, + "grad_norm": 0.11160946637392044, + "learning_rate": 0.0001, + "loss": 1.5725, + "step": 1207 + }, + { + "epoch": 0.6545651584936332, + "grad_norm": 0.11063013225793839, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 1208 + }, + { + "epoch": 0.6551070170685451, + "grad_norm": 0.11617591977119446, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 1209 + }, + { + "epoch": 0.6556488756434571, + "grad_norm": 0.11063467711210251, + "learning_rate": 0.0001, + "loss": 1.6289, + "step": 1210 + }, + { + "epoch": 0.656190734218369, + "grad_norm": 0.11112099140882492, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 1211 + }, + { + "epoch": 0.656732592793281, + "grad_norm": 0.11596956104040146, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 1212 + }, + { + "epoch": 0.6572744513681928, + "grad_norm": 0.11139282584190369, + "learning_rate": 0.0001, + "loss": 1.6783, + "step": 1213 + }, + { + "epoch": 0.6578163099431048, + "grad_norm": 0.11790062487125397, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 1214 + }, + { + "epoch": 0.6583581685180168, + "grad_norm": 0.11024824529886246, + "learning_rate": 0.0001, + "loss": 1.5921, + "step": 1215 + }, + { + "epoch": 0.6589000270929287, + "grad_norm": 0.10710059851408005, + "learning_rate": 0.0001, + "loss": 1.6154, + "step": 1216 + }, + { + "epoch": 0.6594418856678407, + "grad_norm": 0.1107814610004425, + "learning_rate": 0.0001, + "loss": 1.569, + "step": 1217 + }, + { + "epoch": 0.6599837442427526, + "grad_norm": 0.10909678041934967, + "learning_rate": 0.0001, + "loss": 1.6034, + "step": 1218 + }, + { + "epoch": 0.6605256028176646, + "grad_norm": 0.12020471692085266, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 1219 + }, + { + "epoch": 0.6610674613925765, + "grad_norm": 0.10776503384113312, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 1220 + }, + { + "epoch": 0.6616093199674885, + "grad_norm": 0.10916958749294281, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 1221 + }, + { + "epoch": 0.6621511785424005, + "grad_norm": 0.13428278267383575, + "learning_rate": 0.0001, + "loss": 1.5782, + "step": 1222 + }, + { + "epoch": 0.6626930371173124, + "grad_norm": 0.11061003804206848, + "learning_rate": 0.0001, + "loss": 1.5575, + "step": 1223 + }, + { + "epoch": 0.6632348956922244, + "grad_norm": 0.13390862941741943, + "learning_rate": 0.0001, + "loss": 1.6252, + "step": 1224 + }, + { + "epoch": 0.6637767542671362, + "grad_norm": 0.10983669757843018, + "learning_rate": 0.0001, + "loss": 1.6359, + "step": 1225 + }, + { + "epoch": 0.6643186128420482, + "grad_norm": 0.11912653595209122, + "learning_rate": 0.0001, + "loss": 1.5967, + "step": 1226 + }, + { + "epoch": 0.6648604714169601, + "grad_norm": 0.11113286763429642, + "learning_rate": 0.0001, + "loss": 1.5528, + "step": 1227 + }, + { + "epoch": 0.6654023299918721, + "grad_norm": 0.11705653369426727, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 1228 + }, + { + "epoch": 0.6659441885667841, + "grad_norm": 0.11865860223770142, + "learning_rate": 0.0001, + "loss": 1.546, + "step": 1229 + }, + { + "epoch": 0.666486047141696, + "grad_norm": 0.11989482492208481, + "learning_rate": 0.0001, + "loss": 1.6036, + "step": 1230 + }, + { + "epoch": 0.667027905716608, + "grad_norm": 0.11474844813346863, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 1231 + }, + { + "epoch": 0.6675697642915199, + "grad_norm": 0.10648138076066971, + "learning_rate": 0.0001, + "loss": 1.6084, + "step": 1232 + }, + { + "epoch": 0.6681116228664319, + "grad_norm": 0.11240139603614807, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 1233 + }, + { + "epoch": 0.6686534814413438, + "grad_norm": 0.1064700037240982, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 1234 + }, + { + "epoch": 0.6691953400162558, + "grad_norm": 0.1053568571805954, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 1235 + }, + { + "epoch": 0.6697371985911678, + "grad_norm": 0.11085887253284454, + "learning_rate": 0.0001, + "loss": 1.606, + "step": 1236 + }, + { + "epoch": 0.6702790571660796, + "grad_norm": 0.10840357840061188, + "learning_rate": 0.0001, + "loss": 1.6147, + "step": 1237 + }, + { + "epoch": 0.6708209157409916, + "grad_norm": 0.11242581158876419, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 1238 + }, + { + "epoch": 0.6713627743159035, + "grad_norm": 0.10839978605508804, + "learning_rate": 0.0001, + "loss": 1.5522, + "step": 1239 + }, + { + "epoch": 0.6719046328908155, + "grad_norm": 0.11424274742603302, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 1240 + }, + { + "epoch": 0.6724464914657274, + "grad_norm": 0.10760857164859772, + "learning_rate": 0.0001, + "loss": 1.6201, + "step": 1241 + }, + { + "epoch": 0.6729883500406394, + "grad_norm": 0.11048325896263123, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 1242 + }, + { + "epoch": 0.6735302086155514, + "grad_norm": 0.11890499293804169, + "learning_rate": 0.0001, + "loss": 1.6474, + "step": 1243 + }, + { + "epoch": 0.6740720671904633, + "grad_norm": 0.12253390997648239, + "learning_rate": 0.0001, + "loss": 1.6528, + "step": 1244 + }, + { + "epoch": 0.6746139257653753, + "grad_norm": 0.10284169018268585, + "learning_rate": 0.0001, + "loss": 1.5805, + "step": 1245 + }, + { + "epoch": 0.6751557843402872, + "grad_norm": 0.1186986118555069, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 1246 + }, + { + "epoch": 0.6756976429151992, + "grad_norm": 0.11720992624759674, + "learning_rate": 0.0001, + "loss": 1.6543, + "step": 1247 + }, + { + "epoch": 0.676239501490111, + "grad_norm": 0.1224815845489502, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 1248 + }, + { + "epoch": 0.676781360065023, + "grad_norm": 0.13205626606941223, + "learning_rate": 0.0001, + "loss": 1.6532, + "step": 1249 + }, + { + "epoch": 0.6773232186399349, + "grad_norm": 0.11484634131193161, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 1250 + }, + { + "epoch": 0.6778650772148469, + "grad_norm": 0.10785496979951859, + "learning_rate": 0.0001, + "loss": 1.6136, + "step": 1251 + }, + { + "epoch": 0.6784069357897589, + "grad_norm": 0.10727088153362274, + "learning_rate": 0.0001, + "loss": 1.6261, + "step": 1252 + }, + { + "epoch": 0.6789487943646708, + "grad_norm": 0.10857659578323364, + "learning_rate": 0.0001, + "loss": 1.5756, + "step": 1253 + }, + { + "epoch": 0.6794906529395828, + "grad_norm": 0.10896406322717667, + "learning_rate": 0.0001, + "loss": 1.5913, + "step": 1254 + }, + { + "epoch": 0.6800325115144947, + "grad_norm": 0.11162500828504562, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 1255 + }, + { + "epoch": 0.6805743700894067, + "grad_norm": 0.1097574532032013, + "learning_rate": 0.0001, + "loss": 1.6019, + "step": 1256 + }, + { + "epoch": 0.6811162286643186, + "grad_norm": 0.10825852304697037, + "learning_rate": 0.0001, + "loss": 1.6086, + "step": 1257 + }, + { + "epoch": 0.6816580872392306, + "grad_norm": 0.11132719367742538, + "learning_rate": 0.0001, + "loss": 1.6945, + "step": 1258 + }, + { + "epoch": 0.6821999458141426, + "grad_norm": 0.10512889176607132, + "learning_rate": 0.0001, + "loss": 1.5418, + "step": 1259 + }, + { + "epoch": 0.6827418043890544, + "grad_norm": 0.11441686749458313, + "learning_rate": 0.0001, + "loss": 1.5839, + "step": 1260 + }, + { + "epoch": 0.6832836629639664, + "grad_norm": 0.10713682323694229, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 1261 + }, + { + "epoch": 0.6838255215388783, + "grad_norm": 0.10768163204193115, + "learning_rate": 0.0001, + "loss": 1.574, + "step": 1262 + }, + { + "epoch": 0.6843673801137903, + "grad_norm": 0.12209385633468628, + "learning_rate": 0.0001, + "loss": 1.6456, + "step": 1263 + }, + { + "epoch": 0.6849092386887022, + "grad_norm": 0.10845624655485153, + "learning_rate": 0.0001, + "loss": 1.5962, + "step": 1264 + }, + { + "epoch": 0.6854510972636142, + "grad_norm": 0.12702186405658722, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 1265 + }, + { + "epoch": 0.6859929558385262, + "grad_norm": 0.10910164564847946, + "learning_rate": 0.0001, + "loss": 1.6058, + "step": 1266 + }, + { + "epoch": 0.6865348144134381, + "grad_norm": 0.12984834611415863, + "learning_rate": 0.0001, + "loss": 1.607, + "step": 1267 + }, + { + "epoch": 0.6870766729883501, + "grad_norm": 0.10938838124275208, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 1268 + }, + { + "epoch": 0.687618531563262, + "grad_norm": 0.12864528596401215, + "learning_rate": 0.0001, + "loss": 1.6437, + "step": 1269 + }, + { + "epoch": 0.688160390138174, + "grad_norm": 0.11228509992361069, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 1270 + }, + { + "epoch": 0.6887022487130858, + "grad_norm": 0.14109857380390167, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 1271 + }, + { + "epoch": 0.6892441072879978, + "grad_norm": 0.11296511441469193, + "learning_rate": 0.0001, + "loss": 1.5076, + "step": 1272 + }, + { + "epoch": 0.6897859658629097, + "grad_norm": 0.13155023753643036, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 1273 + }, + { + "epoch": 0.6903278244378217, + "grad_norm": 0.12987349927425385, + "learning_rate": 0.0001, + "loss": 1.581, + "step": 1274 + }, + { + "epoch": 0.6908696830127337, + "grad_norm": 0.1197320967912674, + "learning_rate": 0.0001, + "loss": 1.6488, + "step": 1275 + }, + { + "epoch": 0.6914115415876456, + "grad_norm": 0.13939501345157623, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 1276 + }, + { + "epoch": 0.6919534001625576, + "grad_norm": 0.11400146782398224, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 1277 + }, + { + "epoch": 0.6924952587374695, + "grad_norm": 0.1326090395450592, + "learning_rate": 0.0001, + "loss": 1.6103, + "step": 1278 + }, + { + "epoch": 0.6930371173123815, + "grad_norm": 0.11246149986982346, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 1279 + }, + { + "epoch": 0.6935789758872934, + "grad_norm": 0.129455104470253, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 1280 + }, + { + "epoch": 0.6941208344622054, + "grad_norm": 0.10411140322685242, + "learning_rate": 0.0001, + "loss": 1.5131, + "step": 1281 + }, + { + "epoch": 0.6946626930371174, + "grad_norm": 0.11562889814376831, + "learning_rate": 0.0001, + "loss": 1.5684, + "step": 1282 + }, + { + "epoch": 0.6952045516120292, + "grad_norm": 0.12287727743387222, + "learning_rate": 0.0001, + "loss": 1.5769, + "step": 1283 + }, + { + "epoch": 0.6957464101869412, + "grad_norm": 0.11982248723506927, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 1284 + }, + { + "epoch": 0.6962882687618531, + "grad_norm": 0.11934027075767517, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 1285 + }, + { + "epoch": 0.6968301273367651, + "grad_norm": 0.10960278660058975, + "learning_rate": 0.0001, + "loss": 1.5929, + "step": 1286 + }, + { + "epoch": 0.697371985911677, + "grad_norm": 0.1165471076965332, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 1287 + }, + { + "epoch": 0.697913844486589, + "grad_norm": 0.11723136901855469, + "learning_rate": 0.0001, + "loss": 1.6482, + "step": 1288 + }, + { + "epoch": 0.698455703061501, + "grad_norm": 0.12743420898914337, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 1289 + }, + { + "epoch": 0.6989975616364129, + "grad_norm": 0.1108272448182106, + "learning_rate": 0.0001, + "loss": 1.645, + "step": 1290 + }, + { + "epoch": 0.6995394202113249, + "grad_norm": 0.1132800430059433, + "learning_rate": 0.0001, + "loss": 1.5948, + "step": 1291 + }, + { + "epoch": 0.7000812787862368, + "grad_norm": 0.10868791490793228, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 1292 + }, + { + "epoch": 0.7006231373611488, + "grad_norm": 0.10428892821073532, + "learning_rate": 0.0001, + "loss": 1.5417, + "step": 1293 + }, + { + "epoch": 0.7011649959360607, + "grad_norm": 0.10921305418014526, + "learning_rate": 0.0001, + "loss": 1.6507, + "step": 1294 + }, + { + "epoch": 0.7017068545109726, + "grad_norm": 0.1108875572681427, + "learning_rate": 0.0001, + "loss": 1.5401, + "step": 1295 + }, + { + "epoch": 0.7022487130858845, + "grad_norm": 0.11594570428133011, + "learning_rate": 0.0001, + "loss": 1.6711, + "step": 1296 + }, + { + "epoch": 0.7027905716607965, + "grad_norm": 0.10920985043048859, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 1297 + }, + { + "epoch": 0.7033324302357085, + "grad_norm": 0.11153151839971542, + "learning_rate": 0.0001, + "loss": 1.5176, + "step": 1298 + }, + { + "epoch": 0.7038742888106204, + "grad_norm": 0.12206792831420898, + "learning_rate": 0.0001, + "loss": 1.5998, + "step": 1299 + }, + { + "epoch": 0.7044161473855324, + "grad_norm": 0.11783460527658463, + "learning_rate": 0.0001, + "loss": 1.6188, + "step": 1300 + }, + { + "epoch": 0.7049580059604443, + "grad_norm": 0.11225185543298721, + "learning_rate": 0.0001, + "loss": 1.607, + "step": 1301 + }, + { + "epoch": 0.7054998645353563, + "grad_norm": 0.10486938804388046, + "learning_rate": 0.0001, + "loss": 1.5367, + "step": 1302 + }, + { + "epoch": 0.7060417231102682, + "grad_norm": 0.10571251064538956, + "learning_rate": 0.0001, + "loss": 1.593, + "step": 1303 + }, + { + "epoch": 0.7065835816851802, + "grad_norm": 0.10847926884889603, + "learning_rate": 0.0001, + "loss": 1.5934, + "step": 1304 + }, + { + "epoch": 0.7071254402600922, + "grad_norm": 0.10501236468553543, + "learning_rate": 0.0001, + "loss": 1.5665, + "step": 1305 + }, + { + "epoch": 0.707667298835004, + "grad_norm": 0.10688818991184235, + "learning_rate": 0.0001, + "loss": 1.6137, + "step": 1306 + }, + { + "epoch": 0.708209157409916, + "grad_norm": 0.11243797838687897, + "learning_rate": 0.0001, + "loss": 1.5612, + "step": 1307 + }, + { + "epoch": 0.7087510159848279, + "grad_norm": 0.11504892259836197, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 1308 + }, + { + "epoch": 0.7092928745597399, + "grad_norm": 0.10843952745199203, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 1309 + }, + { + "epoch": 0.7098347331346518, + "grad_norm": 0.12078722566366196, + "learning_rate": 0.0001, + "loss": 1.5868, + "step": 1310 + }, + { + "epoch": 0.7103765917095638, + "grad_norm": 0.1083657294511795, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 1311 + }, + { + "epoch": 0.7109184502844758, + "grad_norm": 0.124849334359169, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 1312 + }, + { + "epoch": 0.7114603088593877, + "grad_norm": 0.11586251109838486, + "learning_rate": 0.0001, + "loss": 1.5828, + "step": 1313 + }, + { + "epoch": 0.7120021674342997, + "grad_norm": 0.12780465185642242, + "learning_rate": 0.0001, + "loss": 1.5908, + "step": 1314 + }, + { + "epoch": 0.7125440260092116, + "grad_norm": 0.1156454011797905, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 1315 + }, + { + "epoch": 0.7130858845841236, + "grad_norm": 0.10399503260850906, + "learning_rate": 0.0001, + "loss": 1.5726, + "step": 1316 + }, + { + "epoch": 0.7136277431590355, + "grad_norm": 0.13720951974391937, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 1317 + }, + { + "epoch": 0.7141696017339474, + "grad_norm": 0.1113177239894867, + "learning_rate": 0.0001, + "loss": 1.6302, + "step": 1318 + }, + { + "epoch": 0.7147114603088593, + "grad_norm": 0.11841485649347305, + "learning_rate": 0.0001, + "loss": 1.6748, + "step": 1319 + }, + { + "epoch": 0.7152533188837713, + "grad_norm": 0.10778098553419113, + "learning_rate": 0.0001, + "loss": 1.6085, + "step": 1320 + }, + { + "epoch": 0.7157951774586833, + "grad_norm": 0.12273528426885605, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 1321 + }, + { + "epoch": 0.7163370360335952, + "grad_norm": 0.11129626631736755, + "learning_rate": 0.0001, + "loss": 1.645, + "step": 1322 + }, + { + "epoch": 0.7168788946085072, + "grad_norm": 0.11495073139667511, + "learning_rate": 0.0001, + "loss": 1.594, + "step": 1323 + }, + { + "epoch": 0.7174207531834191, + "grad_norm": 0.13620105385780334, + "learning_rate": 0.0001, + "loss": 1.5392, + "step": 1324 + }, + { + "epoch": 0.7179626117583311, + "grad_norm": 0.11839475482702255, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 1325 + }, + { + "epoch": 0.718504470333243, + "grad_norm": 0.1258990615606308, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 1326 + }, + { + "epoch": 0.719046328908155, + "grad_norm": 0.11513817310333252, + "learning_rate": 0.0001, + "loss": 1.7049, + "step": 1327 + }, + { + "epoch": 0.719588187483067, + "grad_norm": 0.11587271839380264, + "learning_rate": 0.0001, + "loss": 1.6574, + "step": 1328 + }, + { + "epoch": 0.7201300460579788, + "grad_norm": 0.10889148712158203, + "learning_rate": 0.0001, + "loss": 1.531, + "step": 1329 + }, + { + "epoch": 0.7206719046328908, + "grad_norm": 0.12195302546024323, + "learning_rate": 0.0001, + "loss": 1.6679, + "step": 1330 + }, + { + "epoch": 0.7212137632078027, + "grad_norm": 0.1033078134059906, + "learning_rate": 0.0001, + "loss": 1.5089, + "step": 1331 + }, + { + "epoch": 0.7217556217827147, + "grad_norm": 0.11243455857038498, + "learning_rate": 0.0001, + "loss": 1.5574, + "step": 1332 + }, + { + "epoch": 0.7222974803576266, + "grad_norm": 0.11422085016965866, + "learning_rate": 0.0001, + "loss": 1.6454, + "step": 1333 + }, + { + "epoch": 0.7228393389325386, + "grad_norm": 0.1344115287065506, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 1334 + }, + { + "epoch": 0.7233811975074506, + "grad_norm": 0.10726635903120041, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 1335 + }, + { + "epoch": 0.7239230560823625, + "grad_norm": 0.11307653784751892, + "learning_rate": 0.0001, + "loss": 1.6068, + "step": 1336 + }, + { + "epoch": 0.7244649146572745, + "grad_norm": 0.10623101890087128, + "learning_rate": 0.0001, + "loss": 1.5428, + "step": 1337 + }, + { + "epoch": 0.7250067732321864, + "grad_norm": 0.11460467427968979, + "learning_rate": 0.0001, + "loss": 1.6064, + "step": 1338 + }, + { + "epoch": 0.7255486318070984, + "grad_norm": 0.10942284762859344, + "learning_rate": 0.0001, + "loss": 1.5735, + "step": 1339 + }, + { + "epoch": 0.7260904903820103, + "grad_norm": 0.11083227396011353, + "learning_rate": 0.0001, + "loss": 1.679, + "step": 1340 + }, + { + "epoch": 0.7266323489569222, + "grad_norm": 0.11292646080255508, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 1341 + }, + { + "epoch": 0.7271742075318341, + "grad_norm": 0.1074613481760025, + "learning_rate": 0.0001, + "loss": 1.5761, + "step": 1342 + }, + { + "epoch": 0.7277160661067461, + "grad_norm": 0.11060116440057755, + "learning_rate": 0.0001, + "loss": 1.5856, + "step": 1343 + }, + { + "epoch": 0.7282579246816581, + "grad_norm": 0.1097818911075592, + "learning_rate": 0.0001, + "loss": 1.5837, + "step": 1344 + }, + { + "epoch": 0.72879978325657, + "grad_norm": 0.10460485517978668, + "learning_rate": 0.0001, + "loss": 1.5402, + "step": 1345 + }, + { + "epoch": 0.729341641831482, + "grad_norm": 0.10759514570236206, + "learning_rate": 0.0001, + "loss": 1.6523, + "step": 1346 + }, + { + "epoch": 0.7298835004063939, + "grad_norm": 0.11438702791929245, + "learning_rate": 0.0001, + "loss": 1.6547, + "step": 1347 + }, + { + "epoch": 0.7304253589813059, + "grad_norm": 0.11222861707210541, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 1348 + }, + { + "epoch": 0.7309672175562179, + "grad_norm": 0.11586595326662064, + "learning_rate": 0.0001, + "loss": 1.6007, + "step": 1349 + }, + { + "epoch": 0.7315090761311298, + "grad_norm": 0.115098737180233, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 1350 + }, + { + "epoch": 0.7320509347060418, + "grad_norm": 0.1137763112783432, + "learning_rate": 0.0001, + "loss": 1.5883, + "step": 1351 + }, + { + "epoch": 0.7325927932809537, + "grad_norm": 0.12027715146541595, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 1352 + }, + { + "epoch": 0.7331346518558656, + "grad_norm": 0.11682108789682388, + "learning_rate": 0.0001, + "loss": 1.6115, + "step": 1353 + }, + { + "epoch": 0.7336765104307775, + "grad_norm": 0.11551779508590698, + "learning_rate": 0.0001, + "loss": 1.5927, + "step": 1354 + }, + { + "epoch": 0.7342183690056895, + "grad_norm": 0.11855588108301163, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 1355 + }, + { + "epoch": 0.7347602275806014, + "grad_norm": 0.10961764305830002, + "learning_rate": 0.0001, + "loss": 1.5728, + "step": 1356 + }, + { + "epoch": 0.7353020861555134, + "grad_norm": 0.11965698003768921, + "learning_rate": 0.0001, + "loss": 1.5622, + "step": 1357 + }, + { + "epoch": 0.7358439447304254, + "grad_norm": 0.11887028068304062, + "learning_rate": 0.0001, + "loss": 1.5606, + "step": 1358 + }, + { + "epoch": 0.7363858033053373, + "grad_norm": 0.12262041866779327, + "learning_rate": 0.0001, + "loss": 1.5519, + "step": 1359 + }, + { + "epoch": 0.7369276618802493, + "grad_norm": 0.12121155858039856, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 1360 + }, + { + "epoch": 0.7374695204551612, + "grad_norm": 0.1178460344672203, + "learning_rate": 0.0001, + "loss": 1.5731, + "step": 1361 + }, + { + "epoch": 0.7380113790300732, + "grad_norm": 0.1300288289785385, + "learning_rate": 0.0001, + "loss": 1.5982, + "step": 1362 + }, + { + "epoch": 0.7385532376049851, + "grad_norm": 0.10765120387077332, + "learning_rate": 0.0001, + "loss": 1.5586, + "step": 1363 + }, + { + "epoch": 0.739095096179897, + "grad_norm": 0.12704487144947052, + "learning_rate": 0.0001, + "loss": 1.6121, + "step": 1364 + }, + { + "epoch": 0.739636954754809, + "grad_norm": 0.11278389394283295, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 1365 + }, + { + "epoch": 0.7401788133297209, + "grad_norm": 0.11722677946090698, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 1366 + }, + { + "epoch": 0.7407206719046329, + "grad_norm": 0.11791490763425827, + "learning_rate": 0.0001, + "loss": 1.5942, + "step": 1367 + }, + { + "epoch": 0.7412625304795448, + "grad_norm": 0.10806185752153397, + "learning_rate": 0.0001, + "loss": 1.5503, + "step": 1368 + }, + { + "epoch": 0.7418043890544568, + "grad_norm": 0.11138779670000076, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 1369 + }, + { + "epoch": 0.7423462476293687, + "grad_norm": 0.12193464487791061, + "learning_rate": 0.0001, + "loss": 1.6423, + "step": 1370 + }, + { + "epoch": 0.7428881062042807, + "grad_norm": 0.12534891068935394, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 1371 + }, + { + "epoch": 0.7434299647791927, + "grad_norm": 0.12328426539897919, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 1372 + }, + { + "epoch": 0.7439718233541046, + "grad_norm": 0.11469186097383499, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 1373 + }, + { + "epoch": 0.7445136819290166, + "grad_norm": 0.13587550818920135, + "learning_rate": 0.0001, + "loss": 1.5469, + "step": 1374 + }, + { + "epoch": 0.7450555405039285, + "grad_norm": 0.125528484582901, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 1375 + }, + { + "epoch": 0.7455973990788404, + "grad_norm": 0.12690255045890808, + "learning_rate": 0.0001, + "loss": 1.6139, + "step": 1376 + }, + { + "epoch": 0.7461392576537523, + "grad_norm": 0.12415812164545059, + "learning_rate": 0.0001, + "loss": 1.6077, + "step": 1377 + }, + { + "epoch": 0.7466811162286643, + "grad_norm": 0.12645591795444489, + "learning_rate": 0.0001, + "loss": 1.5801, + "step": 1378 + }, + { + "epoch": 0.7472229748035762, + "grad_norm": 0.13845498859882355, + "learning_rate": 0.0001, + "loss": 1.6025, + "step": 1379 + }, + { + "epoch": 0.7477648333784882, + "grad_norm": 0.11183203011751175, + "learning_rate": 0.0001, + "loss": 1.6569, + "step": 1380 + }, + { + "epoch": 0.7483066919534002, + "grad_norm": 0.12703092396259308, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 1381 + }, + { + "epoch": 0.7488485505283121, + "grad_norm": 0.11635295301675797, + "learning_rate": 0.0001, + "loss": 1.6565, + "step": 1382 + }, + { + "epoch": 0.7493904091032241, + "grad_norm": 0.12178429961204529, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 1383 + }, + { + "epoch": 0.749932267678136, + "grad_norm": 0.11872105300426483, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 1384 + }, + { + "epoch": 0.750474126253048, + "grad_norm": 0.11461754888296127, + "learning_rate": 0.0001, + "loss": 1.6153, + "step": 1385 + }, + { + "epoch": 0.75101598482796, + "grad_norm": 0.11746447533369064, + "learning_rate": 0.0001, + "loss": 1.5677, + "step": 1386 + }, + { + "epoch": 0.7515578434028719, + "grad_norm": 0.11214593797922134, + "learning_rate": 0.0001, + "loss": 1.5812, + "step": 1387 + }, + { + "epoch": 0.7520997019777838, + "grad_norm": 0.12819403409957886, + "learning_rate": 0.0001, + "loss": 1.5942, + "step": 1388 + }, + { + "epoch": 0.7526415605526957, + "grad_norm": 0.11914908140897751, + "learning_rate": 0.0001, + "loss": 1.5917, + "step": 1389 + }, + { + "epoch": 0.7531834191276077, + "grad_norm": 0.1351640224456787, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 1390 + }, + { + "epoch": 0.7537252777025196, + "grad_norm": 0.10753028094768524, + "learning_rate": 0.0001, + "loss": 1.5323, + "step": 1391 + }, + { + "epoch": 0.7542671362774316, + "grad_norm": 0.12622836232185364, + "learning_rate": 0.0001, + "loss": 1.5256, + "step": 1392 + }, + { + "epoch": 0.7548089948523435, + "grad_norm": 0.12361244857311249, + "learning_rate": 0.0001, + "loss": 1.5888, + "step": 1393 + }, + { + "epoch": 0.7553508534272555, + "grad_norm": 0.12058836221694946, + "learning_rate": 0.0001, + "loss": 1.5289, + "step": 1394 + }, + { + "epoch": 0.7558927120021675, + "grad_norm": 0.12233013659715652, + "learning_rate": 0.0001, + "loss": 1.6512, + "step": 1395 + }, + { + "epoch": 0.7564345705770794, + "grad_norm": 0.1099410429596901, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 1396 + }, + { + "epoch": 0.7569764291519914, + "grad_norm": 0.1292077898979187, + "learning_rate": 0.0001, + "loss": 1.6571, + "step": 1397 + }, + { + "epoch": 0.7575182877269033, + "grad_norm": 0.11043606698513031, + "learning_rate": 0.0001, + "loss": 1.607, + "step": 1398 + }, + { + "epoch": 0.7580601463018152, + "grad_norm": 0.12485775351524353, + "learning_rate": 0.0001, + "loss": 1.581, + "step": 1399 + }, + { + "epoch": 0.7586020048767271, + "grad_norm": 0.12309932708740234, + "learning_rate": 0.0001, + "loss": 1.5904, + "step": 1400 + }, + { + "epoch": 0.7591438634516391, + "grad_norm": 0.1205844134092331, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 1401 + }, + { + "epoch": 0.759685722026551, + "grad_norm": 0.1165509968996048, + "learning_rate": 0.0001, + "loss": 1.5597, + "step": 1402 + }, + { + "epoch": 0.760227580601463, + "grad_norm": 0.10734628140926361, + "learning_rate": 0.0001, + "loss": 1.567, + "step": 1403 + }, + { + "epoch": 0.760769439176375, + "grad_norm": 0.11562097817659378, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 1404 + }, + { + "epoch": 0.7613112977512869, + "grad_norm": 0.11254249513149261, + "learning_rate": 0.0001, + "loss": 1.5583, + "step": 1405 + }, + { + "epoch": 0.7618531563261989, + "grad_norm": 0.12835754454135895, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 1406 + }, + { + "epoch": 0.7623950149011108, + "grad_norm": 0.11308079212903976, + "learning_rate": 0.0001, + "loss": 1.6227, + "step": 1407 + }, + { + "epoch": 0.7629368734760228, + "grad_norm": 0.10926039516925812, + "learning_rate": 0.0001, + "loss": 1.5487, + "step": 1408 + }, + { + "epoch": 0.7634787320509347, + "grad_norm": 0.11723046004772186, + "learning_rate": 0.0001, + "loss": 1.6109, + "step": 1409 + }, + { + "epoch": 0.7640205906258467, + "grad_norm": 0.11550256609916687, + "learning_rate": 0.0001, + "loss": 1.5957, + "step": 1410 + }, + { + "epoch": 0.7645624492007586, + "grad_norm": 0.11019278317689896, + "learning_rate": 0.0001, + "loss": 1.5274, + "step": 1411 + }, + { + "epoch": 0.7651043077756705, + "grad_norm": 0.10726244002580643, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 1412 + }, + { + "epoch": 0.7656461663505825, + "grad_norm": 0.11210119724273682, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 1413 + }, + { + "epoch": 0.7661880249254944, + "grad_norm": 0.12100458890199661, + "learning_rate": 0.0001, + "loss": 1.6004, + "step": 1414 + }, + { + "epoch": 0.7667298835004064, + "grad_norm": 0.1142323911190033, + "learning_rate": 0.0001, + "loss": 1.6707, + "step": 1415 + }, + { + "epoch": 0.7672717420753183, + "grad_norm": 0.11224039644002914, + "learning_rate": 0.0001, + "loss": 1.5877, + "step": 1416 + }, + { + "epoch": 0.7678136006502303, + "grad_norm": 0.10693727433681488, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 1417 + }, + { + "epoch": 0.7683554592251423, + "grad_norm": 0.10863177478313446, + "learning_rate": 0.0001, + "loss": 1.6159, + "step": 1418 + }, + { + "epoch": 0.7688973178000542, + "grad_norm": 0.11036339402198792, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 1419 + }, + { + "epoch": 0.7694391763749662, + "grad_norm": 0.11554139107465744, + "learning_rate": 0.0001, + "loss": 1.7011, + "step": 1420 + }, + { + "epoch": 0.7699810349498781, + "grad_norm": 0.10645373910665512, + "learning_rate": 0.0001, + "loss": 1.5964, + "step": 1421 + }, + { + "epoch": 0.77052289352479, + "grad_norm": 0.1134442389011383, + "learning_rate": 0.0001, + "loss": 1.5695, + "step": 1422 + }, + { + "epoch": 0.7710647520997019, + "grad_norm": 0.11070393770933151, + "learning_rate": 0.0001, + "loss": 1.5552, + "step": 1423 + }, + { + "epoch": 0.7716066106746139, + "grad_norm": 0.10615087300539017, + "learning_rate": 0.0001, + "loss": 1.5362, + "step": 1424 + }, + { + "epoch": 0.7721484692495258, + "grad_norm": 0.11273098737001419, + "learning_rate": 0.0001, + "loss": 1.5949, + "step": 1425 + }, + { + "epoch": 0.7726903278244378, + "grad_norm": 0.11117110401391983, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 1426 + }, + { + "epoch": 0.7732321863993498, + "grad_norm": 0.11692970991134644, + "learning_rate": 0.0001, + "loss": 1.608, + "step": 1427 + }, + { + "epoch": 0.7737740449742617, + "grad_norm": 0.11009499430656433, + "learning_rate": 0.0001, + "loss": 1.5598, + "step": 1428 + }, + { + "epoch": 0.7743159035491737, + "grad_norm": 0.11022921651601791, + "learning_rate": 0.0001, + "loss": 1.5573, + "step": 1429 + }, + { + "epoch": 0.7748577621240856, + "grad_norm": 0.11034803092479706, + "learning_rate": 0.0001, + "loss": 1.6008, + "step": 1430 + }, + { + "epoch": 0.7753996206989976, + "grad_norm": 0.10987105965614319, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 1431 + }, + { + "epoch": 0.7759414792739096, + "grad_norm": 0.10811598598957062, + "learning_rate": 0.0001, + "loss": 1.5902, + "step": 1432 + }, + { + "epoch": 0.7764833378488215, + "grad_norm": 0.11513319611549377, + "learning_rate": 0.0001, + "loss": 1.6208, + "step": 1433 + }, + { + "epoch": 0.7770251964237334, + "grad_norm": 0.10890763998031616, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 1434 + }, + { + "epoch": 0.7775670549986453, + "grad_norm": 0.11387001723051071, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 1435 + }, + { + "epoch": 0.7781089135735573, + "grad_norm": 0.11095337569713593, + "learning_rate": 0.0001, + "loss": 1.5434, + "step": 1436 + }, + { + "epoch": 0.7786507721484692, + "grad_norm": 0.11978809535503387, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 1437 + }, + { + "epoch": 0.7791926307233812, + "grad_norm": 0.10958437621593475, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 1438 + }, + { + "epoch": 0.7797344892982931, + "grad_norm": 0.11390339583158493, + "learning_rate": 0.0001, + "loss": 1.599, + "step": 1439 + }, + { + "epoch": 0.7802763478732051, + "grad_norm": 0.11856385320425034, + "learning_rate": 0.0001, + "loss": 1.5302, + "step": 1440 + }, + { + "epoch": 0.7808182064481171, + "grad_norm": 0.11179933696985245, + "learning_rate": 0.0001, + "loss": 1.589, + "step": 1441 + }, + { + "epoch": 0.781360065023029, + "grad_norm": 0.11821099370718002, + "learning_rate": 0.0001, + "loss": 1.6782, + "step": 1442 + }, + { + "epoch": 0.781901923597941, + "grad_norm": 0.11423931270837784, + "learning_rate": 0.0001, + "loss": 1.6673, + "step": 1443 + }, + { + "epoch": 0.7824437821728529, + "grad_norm": 0.11144139617681503, + "learning_rate": 0.0001, + "loss": 1.5897, + "step": 1444 + }, + { + "epoch": 0.7829856407477649, + "grad_norm": 0.11281657218933105, + "learning_rate": 0.0001, + "loss": 1.6016, + "step": 1445 + }, + { + "epoch": 0.7835274993226767, + "grad_norm": 0.12321026623249054, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 1446 + }, + { + "epoch": 0.7840693578975887, + "grad_norm": 0.11253192275762558, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 1447 + }, + { + "epoch": 0.7846112164725006, + "grad_norm": 0.12315592169761658, + "learning_rate": 0.0001, + "loss": 1.6874, + "step": 1448 + }, + { + "epoch": 0.7851530750474126, + "grad_norm": 0.11318863183259964, + "learning_rate": 0.0001, + "loss": 1.4853, + "step": 1449 + }, + { + "epoch": 0.7856949336223246, + "grad_norm": 0.11038617789745331, + "learning_rate": 0.0001, + "loss": 1.5986, + "step": 1450 + }, + { + "epoch": 0.7862367921972365, + "grad_norm": 0.12725314497947693, + "learning_rate": 0.0001, + "loss": 1.5371, + "step": 1451 + }, + { + "epoch": 0.7867786507721485, + "grad_norm": 0.11323723942041397, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 1452 + }, + { + "epoch": 0.7873205093470604, + "grad_norm": 0.13219019770622253, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 1453 + }, + { + "epoch": 0.7878623679219724, + "grad_norm": 0.11126276105642319, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 1454 + }, + { + "epoch": 0.7884042264968844, + "grad_norm": 0.12477418780326843, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 1455 + }, + { + "epoch": 0.7889460850717963, + "grad_norm": 0.11030017584562302, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 1456 + }, + { + "epoch": 0.7894879436467082, + "grad_norm": 0.11762640625238419, + "learning_rate": 0.0001, + "loss": 1.595, + "step": 1457 + }, + { + "epoch": 0.7900298022216201, + "grad_norm": 0.11468858271837234, + "learning_rate": 0.0001, + "loss": 1.6587, + "step": 1458 + }, + { + "epoch": 0.7905716607965321, + "grad_norm": 0.11451311409473419, + "learning_rate": 0.0001, + "loss": 1.5491, + "step": 1459 + }, + { + "epoch": 0.791113519371444, + "grad_norm": 0.12499954551458359, + "learning_rate": 0.0001, + "loss": 1.5696, + "step": 1460 + }, + { + "epoch": 0.791655377946356, + "grad_norm": 0.11596235632896423, + "learning_rate": 0.0001, + "loss": 1.5977, + "step": 1461 + }, + { + "epoch": 0.792197236521268, + "grad_norm": 0.12140510976314545, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 1462 + }, + { + "epoch": 0.7927390950961799, + "grad_norm": 0.11294886469841003, + "learning_rate": 0.0001, + "loss": 1.5746, + "step": 1463 + }, + { + "epoch": 0.7932809536710919, + "grad_norm": 0.12936176359653473, + "learning_rate": 0.0001, + "loss": 1.5847, + "step": 1464 + }, + { + "epoch": 0.7938228122460038, + "grad_norm": 0.10991284996271133, + "learning_rate": 0.0001, + "loss": 1.5691, + "step": 1465 + }, + { + "epoch": 0.7943646708209158, + "grad_norm": 0.12364400923252106, + "learning_rate": 0.0001, + "loss": 1.5882, + "step": 1466 + }, + { + "epoch": 0.7949065293958277, + "grad_norm": 0.12108241766691208, + "learning_rate": 0.0001, + "loss": 1.6564, + "step": 1467 + }, + { + "epoch": 0.7954483879707397, + "grad_norm": 0.12121594697237015, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 1468 + }, + { + "epoch": 0.7959902465456515, + "grad_norm": 0.11251336336135864, + "learning_rate": 0.0001, + "loss": 1.5009, + "step": 1469 + }, + { + "epoch": 0.7965321051205635, + "grad_norm": 0.10613922774791718, + "learning_rate": 0.0001, + "loss": 1.4703, + "step": 1470 + }, + { + "epoch": 0.7970739636954755, + "grad_norm": 0.11324452608823776, + "learning_rate": 0.0001, + "loss": 1.588, + "step": 1471 + }, + { + "epoch": 0.7976158222703874, + "grad_norm": 0.12203339487314224, + "learning_rate": 0.0001, + "loss": 1.5984, + "step": 1472 + }, + { + "epoch": 0.7981576808452994, + "grad_norm": 0.11607290059328079, + "learning_rate": 0.0001, + "loss": 1.5953, + "step": 1473 + }, + { + "epoch": 0.7986995394202113, + "grad_norm": 0.11903396248817444, + "learning_rate": 0.0001, + "loss": 1.6032, + "step": 1474 + }, + { + "epoch": 0.7992413979951233, + "grad_norm": 0.11452293395996094, + "learning_rate": 0.0001, + "loss": 1.608, + "step": 1475 + }, + { + "epoch": 0.7997832565700352, + "grad_norm": 0.1214212104678154, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 1476 + }, + { + "epoch": 0.8003251151449472, + "grad_norm": 0.11239127069711685, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 1477 + }, + { + "epoch": 0.8008669737198592, + "grad_norm": 0.12349693477153778, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 1478 + }, + { + "epoch": 0.8014088322947711, + "grad_norm": 0.11215246468782425, + "learning_rate": 0.0001, + "loss": 1.5435, + "step": 1479 + }, + { + "epoch": 0.801950690869683, + "grad_norm": 0.12210462242364883, + "learning_rate": 0.0001, + "loss": 1.6145, + "step": 1480 + }, + { + "epoch": 0.8024925494445949, + "grad_norm": 0.1136668398976326, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 1481 + }, + { + "epoch": 0.8030344080195069, + "grad_norm": 0.1275959461927414, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 1482 + }, + { + "epoch": 0.8035762665944188, + "grad_norm": 0.11355528235435486, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 1483 + }, + { + "epoch": 0.8041181251693308, + "grad_norm": 0.11473184078931808, + "learning_rate": 0.0001, + "loss": 1.5067, + "step": 1484 + }, + { + "epoch": 0.8046599837442427, + "grad_norm": 0.12829527258872986, + "learning_rate": 0.0001, + "loss": 1.6146, + "step": 1485 + }, + { + "epoch": 0.8052018423191547, + "grad_norm": 0.1119534894824028, + "learning_rate": 0.0001, + "loss": 1.5945, + "step": 1486 + }, + { + "epoch": 0.8057437008940667, + "grad_norm": 0.12512047588825226, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 1487 + }, + { + "epoch": 0.8062855594689786, + "grad_norm": 0.11971007287502289, + "learning_rate": 0.0001, + "loss": 1.6031, + "step": 1488 + }, + { + "epoch": 0.8068274180438906, + "grad_norm": 0.11718935519456863, + "learning_rate": 0.0001, + "loss": 1.4837, + "step": 1489 + }, + { + "epoch": 0.8073692766188025, + "grad_norm": 0.121201291680336, + "learning_rate": 0.0001, + "loss": 1.6057, + "step": 1490 + }, + { + "epoch": 0.8079111351937145, + "grad_norm": 0.12238092720508575, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 1491 + }, + { + "epoch": 0.8084529937686263, + "grad_norm": 0.1160135269165039, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 1492 + }, + { + "epoch": 0.8089948523435383, + "grad_norm": 0.12266741693019867, + "learning_rate": 0.0001, + "loss": 1.578, + "step": 1493 + }, + { + "epoch": 0.8095367109184503, + "grad_norm": 0.12016115337610245, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 1494 + }, + { + "epoch": 0.8100785694933622, + "grad_norm": 0.12188202142715454, + "learning_rate": 0.0001, + "loss": 1.615, + "step": 1495 + }, + { + "epoch": 0.8106204280682742, + "grad_norm": 0.12086134403944016, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 1496 + }, + { + "epoch": 0.8111622866431861, + "grad_norm": 0.11443319916725159, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 1497 + }, + { + "epoch": 0.8117041452180981, + "grad_norm": 0.11497814953327179, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 1498 + }, + { + "epoch": 0.81224600379301, + "grad_norm": 0.11970335990190506, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 1499 + }, + { + "epoch": 0.812787862367922, + "grad_norm": 0.11911267787218094, + "learning_rate": 0.0001, + "loss": 1.6128, + "step": 1500 + }, + { + "epoch": 0.813329720942834, + "grad_norm": 0.11438138782978058, + "learning_rate": 0.0001, + "loss": 1.6165, + "step": 1501 + }, + { + "epoch": 0.8138715795177459, + "grad_norm": 0.1144600659608841, + "learning_rate": 0.0001, + "loss": 1.5604, + "step": 1502 + }, + { + "epoch": 0.8144134380926579, + "grad_norm": 0.11835461854934692, + "learning_rate": 0.0001, + "loss": 1.5793, + "step": 1503 + }, + { + "epoch": 0.8149552966675697, + "grad_norm": 0.11068934202194214, + "learning_rate": 0.0001, + "loss": 1.5806, + "step": 1504 + }, + { + "epoch": 0.8154971552424817, + "grad_norm": 0.11289520561695099, + "learning_rate": 0.0001, + "loss": 1.6141, + "step": 1505 + }, + { + "epoch": 0.8160390138173936, + "grad_norm": 0.11913792043924332, + "learning_rate": 0.0001, + "loss": 1.5351, + "step": 1506 + }, + { + "epoch": 0.8165808723923056, + "grad_norm": 0.11238417029380798, + "learning_rate": 0.0001, + "loss": 1.5889, + "step": 1507 + }, + { + "epoch": 0.8171227309672175, + "grad_norm": 0.14196346700191498, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 1508 + }, + { + "epoch": 0.8176645895421295, + "grad_norm": 0.1079757958650589, + "learning_rate": 0.0001, + "loss": 1.5546, + "step": 1509 + }, + { + "epoch": 0.8182064481170415, + "grad_norm": 0.12480804324150085, + "learning_rate": 0.0001, + "loss": 1.6725, + "step": 1510 + }, + { + "epoch": 0.8187483066919534, + "grad_norm": 0.12916377186775208, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 1511 + }, + { + "epoch": 0.8192901652668654, + "grad_norm": 0.11713727563619614, + "learning_rate": 0.0001, + "loss": 1.5121, + "step": 1512 + }, + { + "epoch": 0.8198320238417773, + "grad_norm": 0.13380920886993408, + "learning_rate": 0.0001, + "loss": 1.5758, + "step": 1513 + }, + { + "epoch": 0.8203738824166893, + "grad_norm": 0.11877034604549408, + "learning_rate": 0.0001, + "loss": 1.5613, + "step": 1514 + }, + { + "epoch": 0.8209157409916011, + "grad_norm": 0.12588542699813843, + "learning_rate": 0.0001, + "loss": 1.6197, + "step": 1515 + }, + { + "epoch": 0.8214575995665131, + "grad_norm": 0.1106722354888916, + "learning_rate": 0.0001, + "loss": 1.5663, + "step": 1516 + }, + { + "epoch": 0.821999458141425, + "grad_norm": 0.1203051209449768, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 1517 + }, + { + "epoch": 0.822541316716337, + "grad_norm": 0.10928182303905487, + "learning_rate": 0.0001, + "loss": 1.5544, + "step": 1518 + }, + { + "epoch": 0.823083175291249, + "grad_norm": 0.12915323674678802, + "learning_rate": 0.0001, + "loss": 1.625, + "step": 1519 + }, + { + "epoch": 0.8236250338661609, + "grad_norm": 0.11285202205181122, + "learning_rate": 0.0001, + "loss": 1.5965, + "step": 1520 + }, + { + "epoch": 0.8241668924410729, + "grad_norm": 0.1192052811384201, + "learning_rate": 0.0001, + "loss": 1.5219, + "step": 1521 + }, + { + "epoch": 0.8247087510159848, + "grad_norm": 0.10996171832084656, + "learning_rate": 0.0001, + "loss": 1.5862, + "step": 1522 + }, + { + "epoch": 0.8252506095908968, + "grad_norm": 0.13144908845424652, + "learning_rate": 0.0001, + "loss": 1.6359, + "step": 1523 + }, + { + "epoch": 0.8257924681658088, + "grad_norm": 0.13091161847114563, + "learning_rate": 0.0001, + "loss": 1.5625, + "step": 1524 + }, + { + "epoch": 0.8263343267407207, + "grad_norm": 0.1159205511212349, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 1525 + }, + { + "epoch": 0.8268761853156327, + "grad_norm": 0.14918328821659088, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 1526 + }, + { + "epoch": 0.8274180438905445, + "grad_norm": 0.11102715879678726, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 1527 + }, + { + "epoch": 0.8279599024654565, + "grad_norm": 0.13266460597515106, + "learning_rate": 0.0001, + "loss": 1.5865, + "step": 1528 + }, + { + "epoch": 0.8285017610403684, + "grad_norm": 0.12381194531917572, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 1529 + }, + { + "epoch": 0.8290436196152804, + "grad_norm": 0.1211717426776886, + "learning_rate": 0.0001, + "loss": 1.5247, + "step": 1530 + }, + { + "epoch": 0.8295854781901923, + "grad_norm": 0.13359831273555756, + "learning_rate": 0.0001, + "loss": 1.5855, + "step": 1531 + }, + { + "epoch": 0.8301273367651043, + "grad_norm": 0.11108848452568054, + "learning_rate": 0.0001, + "loss": 1.5986, + "step": 1532 + }, + { + "epoch": 0.8306691953400163, + "grad_norm": 0.149922713637352, + "learning_rate": 0.0001, + "loss": 1.6151, + "step": 1533 + }, + { + "epoch": 0.8312110539149282, + "grad_norm": 0.11118970066308975, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 1534 + }, + { + "epoch": 0.8317529124898402, + "grad_norm": 0.13041344285011292, + "learning_rate": 0.0001, + "loss": 1.5987, + "step": 1535 + }, + { + "epoch": 0.8322947710647521, + "grad_norm": 0.11675264686346054, + "learning_rate": 0.0001, + "loss": 1.4878, + "step": 1536 + }, + { + "epoch": 0.8328366296396641, + "grad_norm": 0.13788774609565735, + "learning_rate": 0.0001, + "loss": 1.6123, + "step": 1537 + }, + { + "epoch": 0.8333784882145759, + "grad_norm": 0.12169042974710464, + "learning_rate": 0.0001, + "loss": 1.6448, + "step": 1538 + }, + { + "epoch": 0.8339203467894879, + "grad_norm": 0.11629584431648254, + "learning_rate": 0.0001, + "loss": 1.5966, + "step": 1539 + }, + { + "epoch": 0.8344622053643999, + "grad_norm": 0.12246973067522049, + "learning_rate": 0.0001, + "loss": 1.6119, + "step": 1540 + }, + { + "epoch": 0.8350040639393118, + "grad_norm": 0.11287442594766617, + "learning_rate": 0.0001, + "loss": 1.5607, + "step": 1541 + }, + { + "epoch": 0.8355459225142238, + "grad_norm": 0.11604714393615723, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 1542 + }, + { + "epoch": 0.8360877810891357, + "grad_norm": 0.11228446662425995, + "learning_rate": 0.0001, + "loss": 1.5365, + "step": 1543 + }, + { + "epoch": 0.8366296396640477, + "grad_norm": 0.12683261930942535, + "learning_rate": 0.0001, + "loss": 1.601, + "step": 1544 + }, + { + "epoch": 0.8371714982389596, + "grad_norm": 0.11033685505390167, + "learning_rate": 0.0001, + "loss": 1.5819, + "step": 1545 + }, + { + "epoch": 0.8377133568138716, + "grad_norm": 0.1107892394065857, + "learning_rate": 0.0001, + "loss": 1.5048, + "step": 1546 + }, + { + "epoch": 0.8382552153887836, + "grad_norm": 0.11377444863319397, + "learning_rate": 0.0001, + "loss": 1.6175, + "step": 1547 + }, + { + "epoch": 0.8387970739636955, + "grad_norm": 0.10976476967334747, + "learning_rate": 0.0001, + "loss": 1.5409, + "step": 1548 + }, + { + "epoch": 0.8393389325386075, + "grad_norm": 0.11617878079414368, + "learning_rate": 0.0001, + "loss": 1.6056, + "step": 1549 + }, + { + "epoch": 0.8398807911135193, + "grad_norm": 0.11169909685850143, + "learning_rate": 0.0001, + "loss": 1.6166, + "step": 1550 + }, + { + "epoch": 0.8404226496884313, + "grad_norm": 0.1158488541841507, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 1551 + }, + { + "epoch": 0.8409645082633432, + "grad_norm": 0.11124692112207413, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 1552 + }, + { + "epoch": 0.8415063668382552, + "grad_norm": 0.11569403111934662, + "learning_rate": 0.0001, + "loss": 1.5848, + "step": 1553 + }, + { + "epoch": 0.8420482254131672, + "grad_norm": 0.1057923287153244, + "learning_rate": 0.0001, + "loss": 1.5125, + "step": 1554 + }, + { + "epoch": 0.8425900839880791, + "grad_norm": 0.11266665160655975, + "learning_rate": 0.0001, + "loss": 1.5761, + "step": 1555 + }, + { + "epoch": 0.8431319425629911, + "grad_norm": 0.1118483692407608, + "learning_rate": 0.0001, + "loss": 1.6092, + "step": 1556 + }, + { + "epoch": 0.843673801137903, + "grad_norm": 0.12413822114467621, + "learning_rate": 0.0001, + "loss": 1.7189, + "step": 1557 + }, + { + "epoch": 0.844215659712815, + "grad_norm": 0.11185813695192337, + "learning_rate": 0.0001, + "loss": 1.5966, + "step": 1558 + }, + { + "epoch": 0.8447575182877269, + "grad_norm": 0.11938433349132538, + "learning_rate": 0.0001, + "loss": 1.5775, + "step": 1559 + }, + { + "epoch": 0.8452993768626389, + "grad_norm": 0.1135723888874054, + "learning_rate": 0.0001, + "loss": 1.5854, + "step": 1560 + }, + { + "epoch": 0.8458412354375509, + "grad_norm": 0.1225573718547821, + "learning_rate": 0.0001, + "loss": 1.6488, + "step": 1561 + }, + { + "epoch": 0.8463830940124627, + "grad_norm": 0.11226433515548706, + "learning_rate": 0.0001, + "loss": 1.5614, + "step": 1562 + }, + { + "epoch": 0.8469249525873747, + "grad_norm": 0.13214121758937836, + "learning_rate": 0.0001, + "loss": 1.7078, + "step": 1563 + }, + { + "epoch": 0.8474668111622866, + "grad_norm": 0.1112339049577713, + "learning_rate": 0.0001, + "loss": 1.5144, + "step": 1564 + }, + { + "epoch": 0.8480086697371986, + "grad_norm": 0.12047260254621506, + "learning_rate": 0.0001, + "loss": 1.521, + "step": 1565 + }, + { + "epoch": 0.8485505283121105, + "grad_norm": 0.1157664805650711, + "learning_rate": 0.0001, + "loss": 1.5619, + "step": 1566 + }, + { + "epoch": 0.8490923868870225, + "grad_norm": 0.11576071381568909, + "learning_rate": 0.0001, + "loss": 1.6143, + "step": 1567 + }, + { + "epoch": 0.8496342454619344, + "grad_norm": 0.11805963516235352, + "learning_rate": 0.0001, + "loss": 1.5349, + "step": 1568 + }, + { + "epoch": 0.8501761040368464, + "grad_norm": 0.11272211372852325, + "learning_rate": 0.0001, + "loss": 1.6067, + "step": 1569 + }, + { + "epoch": 0.8507179626117584, + "grad_norm": 0.12440693378448486, + "learning_rate": 0.0001, + "loss": 1.5523, + "step": 1570 + }, + { + "epoch": 0.8512598211866703, + "grad_norm": 0.1087651476264, + "learning_rate": 0.0001, + "loss": 1.5478, + "step": 1571 + }, + { + "epoch": 0.8518016797615823, + "grad_norm": 0.130729079246521, + "learning_rate": 0.0001, + "loss": 1.5842, + "step": 1572 + }, + { + "epoch": 0.8523435383364941, + "grad_norm": 0.11392576992511749, + "learning_rate": 0.0001, + "loss": 1.5629, + "step": 1573 + }, + { + "epoch": 0.8528853969114061, + "grad_norm": 0.12628479301929474, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 1574 + }, + { + "epoch": 0.853427255486318, + "grad_norm": 0.11537759751081467, + "learning_rate": 0.0001, + "loss": 1.5839, + "step": 1575 + }, + { + "epoch": 0.85396911406123, + "grad_norm": 0.12094999104738235, + "learning_rate": 0.0001, + "loss": 1.5822, + "step": 1576 + }, + { + "epoch": 0.854510972636142, + "grad_norm": 0.12219975888729095, + "learning_rate": 0.0001, + "loss": 1.6344, + "step": 1577 + }, + { + "epoch": 0.8550528312110539, + "grad_norm": 0.11441099643707275, + "learning_rate": 0.0001, + "loss": 1.6, + "step": 1578 + }, + { + "epoch": 0.8555946897859659, + "grad_norm": 0.11610981822013855, + "learning_rate": 0.0001, + "loss": 1.4884, + "step": 1579 + }, + { + "epoch": 0.8561365483608778, + "grad_norm": 0.11038801074028015, + "learning_rate": 0.0001, + "loss": 1.5624, + "step": 1580 + }, + { + "epoch": 0.8566784069357898, + "grad_norm": 0.1118963435292244, + "learning_rate": 0.0001, + "loss": 1.6111, + "step": 1581 + }, + { + "epoch": 0.8572202655107017, + "grad_norm": 0.10892907530069351, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 1582 + }, + { + "epoch": 0.8577621240856137, + "grad_norm": 0.11459089815616608, + "learning_rate": 0.0001, + "loss": 1.6161, + "step": 1583 + }, + { + "epoch": 0.8583039826605257, + "grad_norm": 0.11651195585727692, + "learning_rate": 0.0001, + "loss": 1.6321, + "step": 1584 + }, + { + "epoch": 0.8588458412354375, + "grad_norm": 0.11360273510217667, + "learning_rate": 0.0001, + "loss": 1.6131, + "step": 1585 + }, + { + "epoch": 0.8593876998103495, + "grad_norm": 0.1199052482843399, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 1586 + }, + { + "epoch": 0.8599295583852614, + "grad_norm": 0.10889112204313278, + "learning_rate": 0.0001, + "loss": 1.605, + "step": 1587 + }, + { + "epoch": 0.8604714169601734, + "grad_norm": 0.11999368667602539, + "learning_rate": 0.0001, + "loss": 1.5743, + "step": 1588 + }, + { + "epoch": 0.8610132755350853, + "grad_norm": 0.11200165003538132, + "learning_rate": 0.0001, + "loss": 1.56, + "step": 1589 + }, + { + "epoch": 0.8615551341099973, + "grad_norm": 0.12450709939002991, + "learning_rate": 0.0001, + "loss": 1.5852, + "step": 1590 + }, + { + "epoch": 0.8620969926849092, + "grad_norm": 0.1191956177353859, + "learning_rate": 0.0001, + "loss": 1.5778, + "step": 1591 + }, + { + "epoch": 0.8626388512598212, + "grad_norm": 0.11459418386220932, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 1592 + }, + { + "epoch": 0.8631807098347332, + "grad_norm": 0.1097928062081337, + "learning_rate": 0.0001, + "loss": 1.5656, + "step": 1593 + }, + { + "epoch": 0.8637225684096451, + "grad_norm": 0.1171281486749649, + "learning_rate": 0.0001, + "loss": 1.6466, + "step": 1594 + }, + { + "epoch": 0.8642644269845571, + "grad_norm": 0.1114104762673378, + "learning_rate": 0.0001, + "loss": 1.5707, + "step": 1595 + }, + { + "epoch": 0.8648062855594689, + "grad_norm": 0.11036089062690735, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 1596 + }, + { + "epoch": 0.8653481441343809, + "grad_norm": 0.11589822173118591, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 1597 + }, + { + "epoch": 0.8658900027092928, + "grad_norm": 0.11399046331644058, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 1598 + }, + { + "epoch": 0.8664318612842048, + "grad_norm": 0.11825186759233475, + "learning_rate": 0.0001, + "loss": 1.5587, + "step": 1599 + }, + { + "epoch": 0.8669737198591168, + "grad_norm": 0.10993462055921555, + "learning_rate": 0.0001, + "loss": 1.5817, + "step": 1600 + }, + { + "epoch": 0.8675155784340287, + "grad_norm": 0.11864310503005981, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 1601 + }, + { + "epoch": 0.8680574370089407, + "grad_norm": 0.11332954466342926, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 1602 + }, + { + "epoch": 0.8685992955838526, + "grad_norm": 0.12260433286428452, + "learning_rate": 0.0001, + "loss": 1.5922, + "step": 1603 + }, + { + "epoch": 0.8691411541587646, + "grad_norm": 0.11385665833950043, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 1604 + }, + { + "epoch": 0.8696830127336765, + "grad_norm": 0.12241163849830627, + "learning_rate": 0.0001, + "loss": 1.5468, + "step": 1605 + }, + { + "epoch": 0.8702248713085885, + "grad_norm": 0.12055647373199463, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 1606 + }, + { + "epoch": 0.8707667298835005, + "grad_norm": 0.11905807256698608, + "learning_rate": 0.0001, + "loss": 1.5488, + "step": 1607 + }, + { + "epoch": 0.8713085884584123, + "grad_norm": 0.11884074658155441, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 1608 + }, + { + "epoch": 0.8718504470333243, + "grad_norm": 0.11740755289793015, + "learning_rate": 0.0001, + "loss": 1.5892, + "step": 1609 + }, + { + "epoch": 0.8723923056082362, + "grad_norm": 0.11741320043802261, + "learning_rate": 0.0001, + "loss": 1.643, + "step": 1610 + }, + { + "epoch": 0.8729341641831482, + "grad_norm": 0.1233908161520958, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 1611 + }, + { + "epoch": 0.8734760227580601, + "grad_norm": 0.12131594866514206, + "learning_rate": 0.0001, + "loss": 1.5793, + "step": 1612 + }, + { + "epoch": 0.8740178813329721, + "grad_norm": 0.119773268699646, + "learning_rate": 0.0001, + "loss": 1.5435, + "step": 1613 + }, + { + "epoch": 0.874559739907884, + "grad_norm": 0.12392249703407288, + "learning_rate": 0.0001, + "loss": 1.6645, + "step": 1614 + }, + { + "epoch": 0.875101598482796, + "grad_norm": 0.11792998760938644, + "learning_rate": 0.0001, + "loss": 1.5462, + "step": 1615 + }, + { + "epoch": 0.875643457057708, + "grad_norm": 0.12786734104156494, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 1616 + }, + { + "epoch": 0.8761853156326199, + "grad_norm": 0.11728377640247345, + "learning_rate": 0.0001, + "loss": 1.6365, + "step": 1617 + }, + { + "epoch": 0.8767271742075319, + "grad_norm": 0.12557296454906464, + "learning_rate": 0.0001, + "loss": 1.5278, + "step": 1618 + }, + { + "epoch": 0.8772690327824438, + "grad_norm": 0.12002141028642654, + "learning_rate": 0.0001, + "loss": 1.5358, + "step": 1619 + }, + { + "epoch": 0.8778108913573557, + "grad_norm": 0.12448980659246445, + "learning_rate": 0.0001, + "loss": 1.5732, + "step": 1620 + }, + { + "epoch": 0.8783527499322676, + "grad_norm": 0.12446106970310211, + "learning_rate": 0.0001, + "loss": 1.6156, + "step": 1621 + }, + { + "epoch": 0.8788946085071796, + "grad_norm": 0.1173003539443016, + "learning_rate": 0.0001, + "loss": 1.5494, + "step": 1622 + }, + { + "epoch": 0.8794364670820916, + "grad_norm": 0.1369483768939972, + "learning_rate": 0.0001, + "loss": 1.6625, + "step": 1623 + }, + { + "epoch": 0.8799783256570035, + "grad_norm": 0.11268099397420883, + "learning_rate": 0.0001, + "loss": 1.5504, + "step": 1624 + }, + { + "epoch": 0.8805201842319155, + "grad_norm": 0.11444398015737534, + "learning_rate": 0.0001, + "loss": 1.5284, + "step": 1625 + }, + { + "epoch": 0.8810620428068274, + "grad_norm": 0.116224005818367, + "learning_rate": 0.0001, + "loss": 1.6135, + "step": 1626 + }, + { + "epoch": 0.8816039013817394, + "grad_norm": 0.1233014464378357, + "learning_rate": 0.0001, + "loss": 1.5768, + "step": 1627 + }, + { + "epoch": 0.8821457599566513, + "grad_norm": 0.11984910815954208, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 1628 + }, + { + "epoch": 0.8826876185315633, + "grad_norm": 0.13135050237178802, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 1629 + }, + { + "epoch": 0.8832294771064753, + "grad_norm": 0.120527483522892, + "learning_rate": 0.0001, + "loss": 1.5724, + "step": 1630 + }, + { + "epoch": 0.8837713356813871, + "grad_norm": 0.13846857845783234, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 1631 + }, + { + "epoch": 0.8843131942562991, + "grad_norm": 0.11715717613697052, + "learning_rate": 0.0001, + "loss": 1.5607, + "step": 1632 + }, + { + "epoch": 0.884855052831211, + "grad_norm": 0.11982149630784988, + "learning_rate": 0.0001, + "loss": 1.6066, + "step": 1633 + }, + { + "epoch": 0.885396911406123, + "grad_norm": 0.11110392957925797, + "learning_rate": 0.0001, + "loss": 1.5525, + "step": 1634 + }, + { + "epoch": 0.8859387699810349, + "grad_norm": 0.12085683643817902, + "learning_rate": 0.0001, + "loss": 1.5901, + "step": 1635 + }, + { + "epoch": 0.8864806285559469, + "grad_norm": 0.12121672928333282, + "learning_rate": 0.0001, + "loss": 1.5523, + "step": 1636 + }, + { + "epoch": 0.8870224871308588, + "grad_norm": 0.12281519919633865, + "learning_rate": 0.0001, + "loss": 1.6853, + "step": 1637 + }, + { + "epoch": 0.8875643457057708, + "grad_norm": 0.10896273702383041, + "learning_rate": 0.0001, + "loss": 1.516, + "step": 1638 + }, + { + "epoch": 0.8881062042806828, + "grad_norm": 0.11349642276763916, + "learning_rate": 0.0001, + "loss": 1.5398, + "step": 1639 + }, + { + "epoch": 0.8886480628555947, + "grad_norm": 0.11644469946622849, + "learning_rate": 0.0001, + "loss": 1.5734, + "step": 1640 + }, + { + "epoch": 0.8891899214305067, + "grad_norm": 0.11534909158945084, + "learning_rate": 0.0001, + "loss": 1.6435, + "step": 1641 + }, + { + "epoch": 0.8897317800054186, + "grad_norm": 0.11221238225698471, + "learning_rate": 0.0001, + "loss": 1.575, + "step": 1642 + }, + { + "epoch": 0.8902736385803305, + "grad_norm": 0.10964131355285645, + "learning_rate": 0.0001, + "loss": 1.5851, + "step": 1643 + }, + { + "epoch": 0.8908154971552424, + "grad_norm": 0.1157451868057251, + "learning_rate": 0.0001, + "loss": 1.5438, + "step": 1644 + }, + { + "epoch": 0.8913573557301544, + "grad_norm": 0.1303514987230301, + "learning_rate": 0.0001, + "loss": 1.5881, + "step": 1645 + }, + { + "epoch": 0.8918992143050664, + "grad_norm": 0.12164656072854996, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 1646 + }, + { + "epoch": 0.8924410728799783, + "grad_norm": 0.12777622044086456, + "learning_rate": 0.0001, + "loss": 1.5742, + "step": 1647 + }, + { + "epoch": 0.8929829314548903, + "grad_norm": 0.12283733487129211, + "learning_rate": 0.0001, + "loss": 1.6093, + "step": 1648 + }, + { + "epoch": 0.8935247900298022, + "grad_norm": 0.12476850301027298, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 1649 + }, + { + "epoch": 0.8940666486047142, + "grad_norm": 0.11374060809612274, + "learning_rate": 0.0001, + "loss": 1.5208, + "step": 1650 + }, + { + "epoch": 0.8946085071796261, + "grad_norm": 0.12660513818264008, + "learning_rate": 0.0001, + "loss": 1.5475, + "step": 1651 + }, + { + "epoch": 0.8951503657545381, + "grad_norm": 0.12348679453134537, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 1652 + }, + { + "epoch": 0.8956922243294501, + "grad_norm": 0.1383376121520996, + "learning_rate": 0.0001, + "loss": 1.5918, + "step": 1653 + }, + { + "epoch": 0.8962340829043619, + "grad_norm": 0.11630037426948547, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 1654 + }, + { + "epoch": 0.8967759414792739, + "grad_norm": 0.12958136200904846, + "learning_rate": 0.0001, + "loss": 1.5429, + "step": 1655 + }, + { + "epoch": 0.8973178000541858, + "grad_norm": 0.11349201202392578, + "learning_rate": 0.0001, + "loss": 1.5671, + "step": 1656 + }, + { + "epoch": 0.8978596586290978, + "grad_norm": 0.11807017773389816, + "learning_rate": 0.0001, + "loss": 1.5202, + "step": 1657 + }, + { + "epoch": 0.8984015172040097, + "grad_norm": 0.1251143217086792, + "learning_rate": 0.0001, + "loss": 1.5952, + "step": 1658 + }, + { + "epoch": 0.8989433757789217, + "grad_norm": 0.11176281422376633, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 1659 + }, + { + "epoch": 0.8994852343538337, + "grad_norm": 0.12428291141986847, + "learning_rate": 0.0001, + "loss": 1.5621, + "step": 1660 + }, + { + "epoch": 0.9000270929287456, + "grad_norm": 0.11493722349405289, + "learning_rate": 0.0001, + "loss": 1.5835, + "step": 1661 + }, + { + "epoch": 0.9005689515036576, + "grad_norm": 0.11907187104225159, + "learning_rate": 0.0001, + "loss": 1.5335, + "step": 1662 + }, + { + "epoch": 0.9011108100785695, + "grad_norm": 0.1341247409582138, + "learning_rate": 0.0001, + "loss": 1.6146, + "step": 1663 + }, + { + "epoch": 0.9016526686534815, + "grad_norm": 0.12102586776018143, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 1664 + }, + { + "epoch": 0.9021945272283934, + "grad_norm": 0.12353098392486572, + "learning_rate": 0.0001, + "loss": 1.6133, + "step": 1665 + }, + { + "epoch": 0.9027363858033053, + "grad_norm": 0.11463508754968643, + "learning_rate": 0.0001, + "loss": 1.5422, + "step": 1666 + }, + { + "epoch": 0.9032782443782172, + "grad_norm": 0.14872030913829803, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 1667 + }, + { + "epoch": 0.9038201029531292, + "grad_norm": 0.12321729958057404, + "learning_rate": 0.0001, + "loss": 1.6017, + "step": 1668 + }, + { + "epoch": 0.9043619615280412, + "grad_norm": 0.1364787518978119, + "learning_rate": 0.0001, + "loss": 1.5988, + "step": 1669 + }, + { + "epoch": 0.9049038201029531, + "grad_norm": 0.145307719707489, + "learning_rate": 0.0001, + "loss": 1.6898, + "step": 1670 + }, + { + "epoch": 0.9054456786778651, + "grad_norm": 0.12828241288661957, + "learning_rate": 0.0001, + "loss": 1.5977, + "step": 1671 + }, + { + "epoch": 0.905987537252777, + "grad_norm": 0.1327933818101883, + "learning_rate": 0.0001, + "loss": 1.5433, + "step": 1672 + }, + { + "epoch": 0.906529395827689, + "grad_norm": 0.11078044027090073, + "learning_rate": 0.0001, + "loss": 1.5736, + "step": 1673 + }, + { + "epoch": 0.907071254402601, + "grad_norm": 0.1254003942012787, + "learning_rate": 0.0001, + "loss": 1.5524, + "step": 1674 + }, + { + "epoch": 0.9076131129775129, + "grad_norm": 0.11752931028604507, + "learning_rate": 0.0001, + "loss": 1.589, + "step": 1675 + }, + { + "epoch": 0.9081549715524249, + "grad_norm": 0.12012698501348495, + "learning_rate": 0.0001, + "loss": 1.6072, + "step": 1676 + }, + { + "epoch": 0.9086968301273368, + "grad_norm": 0.12614655494689941, + "learning_rate": 0.0001, + "loss": 1.5748, + "step": 1677 + }, + { + "epoch": 0.9092386887022487, + "grad_norm": 0.1087908074259758, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 1678 + }, + { + "epoch": 0.9097805472771606, + "grad_norm": 0.1440020352602005, + "learning_rate": 0.0001, + "loss": 1.634, + "step": 1679 + }, + { + "epoch": 0.9103224058520726, + "grad_norm": 0.11258941143751144, + "learning_rate": 0.0001, + "loss": 1.5273, + "step": 1680 + }, + { + "epoch": 0.9108642644269845, + "grad_norm": 0.12051312625408173, + "learning_rate": 0.0001, + "loss": 1.5715, + "step": 1681 + }, + { + "epoch": 0.9114061230018965, + "grad_norm": 0.11230110377073288, + "learning_rate": 0.0001, + "loss": 1.5831, + "step": 1682 + }, + { + "epoch": 0.9119479815768085, + "grad_norm": 0.1212247908115387, + "learning_rate": 0.0001, + "loss": 1.5928, + "step": 1683 + }, + { + "epoch": 0.9124898401517204, + "grad_norm": 0.11118316650390625, + "learning_rate": 0.0001, + "loss": 1.5785, + "step": 1684 + }, + { + "epoch": 0.9130316987266324, + "grad_norm": 0.11062508076429367, + "learning_rate": 0.0001, + "loss": 1.5468, + "step": 1685 + }, + { + "epoch": 0.9135735573015443, + "grad_norm": 0.13190412521362305, + "learning_rate": 0.0001, + "loss": 1.5937, + "step": 1686 + }, + { + "epoch": 0.9141154158764563, + "grad_norm": 0.10946314036846161, + "learning_rate": 0.0001, + "loss": 1.5615, + "step": 1687 + }, + { + "epoch": 0.9146572744513682, + "grad_norm": 0.11208927631378174, + "learning_rate": 0.0001, + "loss": 1.5419, + "step": 1688 + }, + { + "epoch": 0.9151991330262801, + "grad_norm": 0.12088596820831299, + "learning_rate": 0.0001, + "loss": 1.5833, + "step": 1689 + }, + { + "epoch": 0.915740991601192, + "grad_norm": 0.1129712462425232, + "learning_rate": 0.0001, + "loss": 1.5547, + "step": 1690 + }, + { + "epoch": 0.916282850176104, + "grad_norm": 0.12110590934753418, + "learning_rate": 0.0001, + "loss": 1.5354, + "step": 1691 + }, + { + "epoch": 0.916824708751016, + "grad_norm": 0.11715506762266159, + "learning_rate": 0.0001, + "loss": 1.5834, + "step": 1692 + }, + { + "epoch": 0.9173665673259279, + "grad_norm": 0.11173558980226517, + "learning_rate": 0.0001, + "loss": 1.5223, + "step": 1693 + }, + { + "epoch": 0.9179084259008399, + "grad_norm": 0.12259289622306824, + "learning_rate": 0.0001, + "loss": 1.5992, + "step": 1694 + }, + { + "epoch": 0.9184502844757518, + "grad_norm": 0.11358395963907242, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 1695 + }, + { + "epoch": 0.9189921430506638, + "grad_norm": 0.13116388022899628, + "learning_rate": 0.0001, + "loss": 1.5542, + "step": 1696 + }, + { + "epoch": 0.9195340016255757, + "grad_norm": 0.10953519493341446, + "learning_rate": 0.0001, + "loss": 1.486, + "step": 1697 + }, + { + "epoch": 0.9200758602004877, + "grad_norm": 0.11799793690443039, + "learning_rate": 0.0001, + "loss": 1.6026, + "step": 1698 + }, + { + "epoch": 0.9206177187753997, + "grad_norm": 0.1310616433620453, + "learning_rate": 0.0001, + "loss": 1.5784, + "step": 1699 + }, + { + "epoch": 0.9211595773503116, + "grad_norm": 0.11110175400972366, + "learning_rate": 0.0001, + "loss": 1.5298, + "step": 1700 + }, + { + "epoch": 0.9217014359252235, + "grad_norm": 0.12602512538433075, + "learning_rate": 0.0001, + "loss": 1.5275, + "step": 1701 + }, + { + "epoch": 0.9222432945001354, + "grad_norm": 0.11771464347839355, + "learning_rate": 0.0001, + "loss": 1.6476, + "step": 1702 + }, + { + "epoch": 0.9227851530750474, + "grad_norm": 0.11211036890745163, + "learning_rate": 0.0001, + "loss": 1.5361, + "step": 1703 + }, + { + "epoch": 0.9233270116499593, + "grad_norm": 0.1293649971485138, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 1704 + }, + { + "epoch": 0.9238688702248713, + "grad_norm": 0.11830790340900421, + "learning_rate": 0.0001, + "loss": 1.6019, + "step": 1705 + }, + { + "epoch": 0.9244107287997833, + "grad_norm": 0.11535318195819855, + "learning_rate": 0.0001, + "loss": 1.5832, + "step": 1706 + }, + { + "epoch": 0.9249525873746952, + "grad_norm": 0.11281119287014008, + "learning_rate": 0.0001, + "loss": 1.5781, + "step": 1707 + }, + { + "epoch": 0.9254944459496072, + "grad_norm": 0.11184824258089066, + "learning_rate": 0.0001, + "loss": 1.559, + "step": 1708 + }, + { + "epoch": 0.9260363045245191, + "grad_norm": 0.11843252182006836, + "learning_rate": 0.0001, + "loss": 1.6079, + "step": 1709 + }, + { + "epoch": 0.9265781630994311, + "grad_norm": 0.11637989431619644, + "learning_rate": 0.0001, + "loss": 1.5784, + "step": 1710 + }, + { + "epoch": 0.927120021674343, + "grad_norm": 0.11847338825464249, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 1711 + }, + { + "epoch": 0.927661880249255, + "grad_norm": 0.1154191642999649, + "learning_rate": 0.0001, + "loss": 1.5312, + "step": 1712 + }, + { + "epoch": 0.9282037388241668, + "grad_norm": 0.11177432537078857, + "learning_rate": 0.0001, + "loss": 1.5722, + "step": 1713 + }, + { + "epoch": 0.9287455973990788, + "grad_norm": 0.11820822209119797, + "learning_rate": 0.0001, + "loss": 1.5876, + "step": 1714 + }, + { + "epoch": 0.9292874559739908, + "grad_norm": 0.1316254734992981, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 1715 + }, + { + "epoch": 0.9298293145489027, + "grad_norm": 0.12370502203702927, + "learning_rate": 0.0001, + "loss": 1.6166, + "step": 1716 + }, + { + "epoch": 0.9303711731238147, + "grad_norm": 0.13926295936107635, + "learning_rate": 0.0001, + "loss": 1.5673, + "step": 1717 + }, + { + "epoch": 0.9309130316987266, + "grad_norm": 0.11580811440944672, + "learning_rate": 0.0001, + "loss": 1.5704, + "step": 1718 + }, + { + "epoch": 0.9314548902736386, + "grad_norm": 0.14968527853488922, + "learning_rate": 0.0001, + "loss": 1.6003, + "step": 1719 + }, + { + "epoch": 0.9319967488485505, + "grad_norm": 0.11721023172140121, + "learning_rate": 0.0001, + "loss": 1.5907, + "step": 1720 + }, + { + "epoch": 0.9325386074234625, + "grad_norm": 0.13286975026130676, + "learning_rate": 0.0001, + "loss": 1.5621, + "step": 1721 + }, + { + "epoch": 0.9330804659983745, + "grad_norm": 0.1223980113863945, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 1722 + }, + { + "epoch": 0.9336223245732864, + "grad_norm": 0.13075362145900726, + "learning_rate": 0.0001, + "loss": 1.5676, + "step": 1723 + }, + { + "epoch": 0.9341641831481983, + "grad_norm": 0.13382086157798767, + "learning_rate": 0.0001, + "loss": 1.5355, + "step": 1724 + }, + { + "epoch": 0.9347060417231102, + "grad_norm": 0.11884968727827072, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 1725 + }, + { + "epoch": 0.9352479002980222, + "grad_norm": 0.12873336672782898, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 1726 + }, + { + "epoch": 0.9357897588729341, + "grad_norm": 0.11676906049251556, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 1727 + }, + { + "epoch": 0.9363316174478461, + "grad_norm": 0.1319124698638916, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 1728 + }, + { + "epoch": 0.9368734760227581, + "grad_norm": 0.11038797348737717, + "learning_rate": 0.0001, + "loss": 1.45, + "step": 1729 + }, + { + "epoch": 0.93741533459767, + "grad_norm": 0.12079478055238724, + "learning_rate": 0.0001, + "loss": 1.5593, + "step": 1730 + }, + { + "epoch": 0.937957193172582, + "grad_norm": 0.1391608864068985, + "learning_rate": 0.0001, + "loss": 1.5944, + "step": 1731 + }, + { + "epoch": 0.9384990517474939, + "grad_norm": 0.11178398132324219, + "learning_rate": 0.0001, + "loss": 1.5399, + "step": 1732 + }, + { + "epoch": 0.9390409103224059, + "grad_norm": 0.14346595108509064, + "learning_rate": 0.0001, + "loss": 1.6717, + "step": 1733 + }, + { + "epoch": 0.9395827688973178, + "grad_norm": 0.11750081181526184, + "learning_rate": 0.0001, + "loss": 1.6113, + "step": 1734 + }, + { + "epoch": 0.9401246274722298, + "grad_norm": 0.1212116926908493, + "learning_rate": 0.0001, + "loss": 1.5555, + "step": 1735 + }, + { + "epoch": 0.9406664860471416, + "grad_norm": 0.13526900112628937, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 1736 + }, + { + "epoch": 0.9412083446220536, + "grad_norm": 0.11274862289428711, + "learning_rate": 0.0001, + "loss": 1.5397, + "step": 1737 + }, + { + "epoch": 0.9417502031969656, + "grad_norm": 0.1345253586769104, + "learning_rate": 0.0001, + "loss": 1.5766, + "step": 1738 + }, + { + "epoch": 0.9422920617718775, + "grad_norm": 0.1218753382563591, + "learning_rate": 0.0001, + "loss": 1.5997, + "step": 1739 + }, + { + "epoch": 0.9428339203467895, + "grad_norm": 0.1240580603480339, + "learning_rate": 0.0001, + "loss": 1.5754, + "step": 1740 + }, + { + "epoch": 0.9433757789217014, + "grad_norm": 0.13162104785442352, + "learning_rate": 0.0001, + "loss": 1.5921, + "step": 1741 + }, + { + "epoch": 0.9439176374966134, + "grad_norm": 0.1223171278834343, + "learning_rate": 0.0001, + "loss": 1.5899, + "step": 1742 + }, + { + "epoch": 0.9444594960715254, + "grad_norm": 0.1349639743566513, + "learning_rate": 0.0001, + "loss": 1.6118, + "step": 1743 + }, + { + "epoch": 0.9450013546464373, + "grad_norm": 0.12058819830417633, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 1744 + }, + { + "epoch": 0.9455432132213493, + "grad_norm": 0.12474081665277481, + "learning_rate": 0.0001, + "loss": 1.5619, + "step": 1745 + }, + { + "epoch": 0.9460850717962612, + "grad_norm": 0.1322060078382492, + "learning_rate": 0.0001, + "loss": 1.5174, + "step": 1746 + }, + { + "epoch": 0.9466269303711731, + "grad_norm": 0.11868065595626831, + "learning_rate": 0.0001, + "loss": 1.5275, + "step": 1747 + }, + { + "epoch": 0.947168788946085, + "grad_norm": 0.13948048651218414, + "learning_rate": 0.0001, + "loss": 1.5391, + "step": 1748 + }, + { + "epoch": 0.947710647520997, + "grad_norm": 0.11156269907951355, + "learning_rate": 0.0001, + "loss": 1.5508, + "step": 1749 + }, + { + "epoch": 0.9482525060959089, + "grad_norm": 0.14344418048858643, + "learning_rate": 0.0001, + "loss": 1.5386, + "step": 1750 + }, + { + "epoch": 0.9487943646708209, + "grad_norm": 0.1274086982011795, + "learning_rate": 0.0001, + "loss": 1.5993, + "step": 1751 + }, + { + "epoch": 0.9493362232457329, + "grad_norm": 0.1266939342021942, + "learning_rate": 0.0001, + "loss": 1.6089, + "step": 1752 + }, + { + "epoch": 0.9498780818206448, + "grad_norm": 0.14160290360450745, + "learning_rate": 0.0001, + "loss": 1.568, + "step": 1753 + }, + { + "epoch": 0.9504199403955568, + "grad_norm": 0.11376998573541641, + "learning_rate": 0.0001, + "loss": 1.5564, + "step": 1754 + }, + { + "epoch": 0.9509617989704687, + "grad_norm": 0.14705996215343475, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 1755 + }, + { + "epoch": 0.9515036575453807, + "grad_norm": 0.11236948519945145, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 1756 + }, + { + "epoch": 0.9520455161202926, + "grad_norm": 0.12842684984207153, + "learning_rate": 0.0001, + "loss": 1.6045, + "step": 1757 + }, + { + "epoch": 0.9525873746952046, + "grad_norm": 0.12077702581882477, + "learning_rate": 0.0001, + "loss": 1.5668, + "step": 1758 + }, + { + "epoch": 0.9531292332701164, + "grad_norm": 0.11721716821193695, + "learning_rate": 0.0001, + "loss": 1.5491, + "step": 1759 + }, + { + "epoch": 0.9536710918450284, + "grad_norm": 0.12973424792289734, + "learning_rate": 0.0001, + "loss": 1.6049, + "step": 1760 + }, + { + "epoch": 0.9542129504199404, + "grad_norm": 0.11184488981962204, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 1761 + }, + { + "epoch": 0.9547548089948523, + "grad_norm": 0.12777452170848846, + "learning_rate": 0.0001, + "loss": 1.5778, + "step": 1762 + }, + { + "epoch": 0.9552966675697643, + "grad_norm": 0.12511278688907623, + "learning_rate": 0.0001, + "loss": 1.5786, + "step": 1763 + }, + { + "epoch": 0.9558385261446762, + "grad_norm": 0.1258372664451599, + "learning_rate": 0.0001, + "loss": 1.6059, + "step": 1764 + }, + { + "epoch": 0.9563803847195882, + "grad_norm": 0.1296481341123581, + "learning_rate": 0.0001, + "loss": 1.5824, + "step": 1765 + }, + { + "epoch": 0.9569222432945002, + "grad_norm": 0.11928050220012665, + "learning_rate": 0.0001, + "loss": 1.6027, + "step": 1766 + }, + { + "epoch": 0.9574641018694121, + "grad_norm": 0.11857929080724716, + "learning_rate": 0.0001, + "loss": 1.5373, + "step": 1767 + }, + { + "epoch": 0.9580059604443241, + "grad_norm": 0.12479015439748764, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 1768 + }, + { + "epoch": 0.958547819019236, + "grad_norm": 0.11598216742277145, + "learning_rate": 0.0001, + "loss": 1.5384, + "step": 1769 + }, + { + "epoch": 0.959089677594148, + "grad_norm": 0.11646448820829391, + "learning_rate": 0.0001, + "loss": 1.5858, + "step": 1770 + }, + { + "epoch": 0.9596315361690598, + "grad_norm": 0.12080811709165573, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 1771 + }, + { + "epoch": 0.9601733947439718, + "grad_norm": 0.11604882776737213, + "learning_rate": 0.0001, + "loss": 1.5372, + "step": 1772 + }, + { + "epoch": 0.9607152533188837, + "grad_norm": 0.1229996308684349, + "learning_rate": 0.0001, + "loss": 1.5694, + "step": 1773 + }, + { + "epoch": 0.9612571118937957, + "grad_norm": 0.13511215150356293, + "learning_rate": 0.0001, + "loss": 1.6532, + "step": 1774 + }, + { + "epoch": 0.9617989704687077, + "grad_norm": 0.12474493682384491, + "learning_rate": 0.0001, + "loss": 1.6047, + "step": 1775 + }, + { + "epoch": 0.9623408290436196, + "grad_norm": 0.11818260699510574, + "learning_rate": 0.0001, + "loss": 1.5869, + "step": 1776 + }, + { + "epoch": 0.9628826876185316, + "grad_norm": 0.11510884016752243, + "learning_rate": 0.0001, + "loss": 1.5688, + "step": 1777 + }, + { + "epoch": 0.9634245461934435, + "grad_norm": 0.11183018237352371, + "learning_rate": 0.0001, + "loss": 1.585, + "step": 1778 + }, + { + "epoch": 0.9639664047683555, + "grad_norm": 0.12041230499744415, + "learning_rate": 0.0001, + "loss": 1.5651, + "step": 1779 + }, + { + "epoch": 0.9645082633432674, + "grad_norm": 0.11413277685642242, + "learning_rate": 0.0001, + "loss": 1.6082, + "step": 1780 + }, + { + "epoch": 0.9650501219181794, + "grad_norm": 0.11770571023225784, + "learning_rate": 0.0001, + "loss": 1.5329, + "step": 1781 + }, + { + "epoch": 0.9655919804930913, + "grad_norm": 0.11431930214166641, + "learning_rate": 0.0001, + "loss": 1.5738, + "step": 1782 + }, + { + "epoch": 0.9661338390680032, + "grad_norm": 0.12779121100902557, + "learning_rate": 0.0001, + "loss": 1.5681, + "step": 1783 + }, + { + "epoch": 0.9666756976429152, + "grad_norm": 0.1112782433629036, + "learning_rate": 0.0001, + "loss": 1.5436, + "step": 1784 + }, + { + "epoch": 0.9672175562178271, + "grad_norm": 0.11836463958024979, + "learning_rate": 0.0001, + "loss": 1.5875, + "step": 1785 + }, + { + "epoch": 0.9677594147927391, + "grad_norm": 0.11033179610967636, + "learning_rate": 0.0001, + "loss": 1.5581, + "step": 1786 + }, + { + "epoch": 0.968301273367651, + "grad_norm": 0.12732066214084625, + "learning_rate": 0.0001, + "loss": 1.5556, + "step": 1787 + }, + { + "epoch": 0.968843131942563, + "grad_norm": 0.11075282841920853, + "learning_rate": 0.0001, + "loss": 1.5906, + "step": 1788 + }, + { + "epoch": 0.969384990517475, + "grad_norm": 0.11174791306257248, + "learning_rate": 0.0001, + "loss": 1.4853, + "step": 1789 + }, + { + "epoch": 0.9699268490923869, + "grad_norm": 0.10969841480255127, + "learning_rate": 0.0001, + "loss": 1.6018, + "step": 1790 + }, + { + "epoch": 0.9704687076672989, + "grad_norm": 0.12162282317876816, + "learning_rate": 0.0001, + "loss": 1.5783, + "step": 1791 + }, + { + "epoch": 0.9710105662422108, + "grad_norm": 0.11701533943414688, + "learning_rate": 0.0001, + "loss": 1.5762, + "step": 1792 + }, + { + "epoch": 0.9715524248171228, + "grad_norm": 0.12743960320949554, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 1793 + }, + { + "epoch": 0.9720942833920346, + "grad_norm": 0.1182185560464859, + "learning_rate": 0.0001, + "loss": 1.5864, + "step": 1794 + }, + { + "epoch": 0.9726361419669466, + "grad_norm": 0.12218203395605087, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 1795 + }, + { + "epoch": 0.9731780005418585, + "grad_norm": 0.11644001305103302, + "learning_rate": 0.0001, + "loss": 1.5516, + "step": 1796 + }, + { + "epoch": 0.9737198591167705, + "grad_norm": 0.1234685555100441, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 1797 + }, + { + "epoch": 0.9742617176916825, + "grad_norm": 0.11179257184267044, + "learning_rate": 0.0001, + "loss": 1.5292, + "step": 1798 + }, + { + "epoch": 0.9748035762665944, + "grad_norm": 0.11813155561685562, + "learning_rate": 0.0001, + "loss": 1.6825, + "step": 1799 + }, + { + "epoch": 0.9753454348415064, + "grad_norm": 0.11452137678861618, + "learning_rate": 0.0001, + "loss": 1.5874, + "step": 1800 + }, + { + "epoch": 0.9758872934164183, + "grad_norm": 0.1203099936246872, + "learning_rate": 0.0001, + "loss": 1.591, + "step": 1801 + }, + { + "epoch": 0.9764291519913303, + "grad_norm": 0.11747494339942932, + "learning_rate": 0.0001, + "loss": 1.5647, + "step": 1802 + }, + { + "epoch": 0.9769710105662422, + "grad_norm": 0.11051321774721146, + "learning_rate": 0.0001, + "loss": 1.5415, + "step": 1803 + }, + { + "epoch": 0.9775128691411542, + "grad_norm": 0.11837146431207657, + "learning_rate": 0.0001, + "loss": 1.5948, + "step": 1804 + }, + { + "epoch": 0.978054727716066, + "grad_norm": 0.11796199530363083, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 1805 + }, + { + "epoch": 0.978596586290978, + "grad_norm": 0.11934317648410797, + "learning_rate": 0.0001, + "loss": 1.5606, + "step": 1806 + }, + { + "epoch": 0.97913844486589, + "grad_norm": 0.11518701165914536, + "learning_rate": 0.0001, + "loss": 1.5921, + "step": 1807 + }, + { + "epoch": 0.9796803034408019, + "grad_norm": 0.12122237682342529, + "learning_rate": 0.0001, + "loss": 1.604, + "step": 1808 + }, + { + "epoch": 0.9802221620157139, + "grad_norm": 0.1308448165655136, + "learning_rate": 0.0001, + "loss": 1.5844, + "step": 1809 + }, + { + "epoch": 0.9807640205906258, + "grad_norm": 0.12254752963781357, + "learning_rate": 0.0001, + "loss": 1.6409, + "step": 1810 + }, + { + "epoch": 0.9813058791655378, + "grad_norm": 0.12072083353996277, + "learning_rate": 0.0001, + "loss": 1.5395, + "step": 1811 + }, + { + "epoch": 0.9818477377404498, + "grad_norm": 0.12756118178367615, + "learning_rate": 0.0001, + "loss": 1.6052, + "step": 1812 + }, + { + "epoch": 0.9823895963153617, + "grad_norm": 0.11563752591609955, + "learning_rate": 0.0001, + "loss": 1.6181, + "step": 1813 + }, + { + "epoch": 0.9829314548902737, + "grad_norm": 0.12231991440057755, + "learning_rate": 0.0001, + "loss": 1.6162, + "step": 1814 + }, + { + "epoch": 0.9834733134651856, + "grad_norm": 0.11458272486925125, + "learning_rate": 0.0001, + "loss": 1.5106, + "step": 1815 + }, + { + "epoch": 0.9840151720400976, + "grad_norm": 0.12017235159873962, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 1816 + }, + { + "epoch": 0.9845570306150094, + "grad_norm": 0.12292500585317612, + "learning_rate": 0.0001, + "loss": 1.5838, + "step": 1817 + }, + { + "epoch": 0.9850988891899214, + "grad_norm": 0.11395881325006485, + "learning_rate": 0.0001, + "loss": 1.6044, + "step": 1818 + }, + { + "epoch": 0.9856407477648333, + "grad_norm": 0.11829330772161484, + "learning_rate": 0.0001, + "loss": 1.5448, + "step": 1819 + }, + { + "epoch": 0.9861826063397453, + "grad_norm": 0.12044976651668549, + "learning_rate": 0.0001, + "loss": 1.5628, + "step": 1820 + }, + { + "epoch": 0.9867244649146573, + "grad_norm": 0.13126184046268463, + "learning_rate": 0.0001, + "loss": 1.5905, + "step": 1821 + }, + { + "epoch": 0.9872663234895692, + "grad_norm": 0.11481881886720657, + "learning_rate": 0.0001, + "loss": 1.5681, + "step": 1822 + }, + { + "epoch": 0.9878081820644812, + "grad_norm": 0.11921236664056778, + "learning_rate": 0.0001, + "loss": 1.5652, + "step": 1823 + }, + { + "epoch": 0.9883500406393931, + "grad_norm": 0.13419100642204285, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 1824 + }, + { + "epoch": 0.9888918992143051, + "grad_norm": 0.11893222481012344, + "learning_rate": 0.0001, + "loss": 1.6514, + "step": 1825 + }, + { + "epoch": 0.989433757789217, + "grad_norm": 0.12331171333789825, + "learning_rate": 0.0001, + "loss": 1.533, + "step": 1826 + }, + { + "epoch": 0.989975616364129, + "grad_norm": 0.12092404067516327, + "learning_rate": 0.0001, + "loss": 1.6025, + "step": 1827 + }, + { + "epoch": 0.990517474939041, + "grad_norm": 0.13040265440940857, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 1828 + }, + { + "epoch": 0.9910593335139528, + "grad_norm": 0.11708499491214752, + "learning_rate": 0.0001, + "loss": 1.558, + "step": 1829 + }, + { + "epoch": 0.9916011920888648, + "grad_norm": 0.11232500523328781, + "learning_rate": 0.0001, + "loss": 1.4854, + "step": 1830 + }, + { + "epoch": 0.9921430506637767, + "grad_norm": 0.12061132490634918, + "learning_rate": 0.0001, + "loss": 1.5493, + "step": 1831 + }, + { + "epoch": 0.9926849092386887, + "grad_norm": 0.11637751758098602, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 1832 + }, + { + "epoch": 0.9932267678136006, + "grad_norm": 0.12868823111057281, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 1833 + }, + { + "epoch": 0.9937686263885126, + "grad_norm": 0.11612503975629807, + "learning_rate": 0.0001, + "loss": 1.5745, + "step": 1834 + }, + { + "epoch": 0.9943104849634246, + "grad_norm": 0.11189901828765869, + "learning_rate": 0.0001, + "loss": 1.5413, + "step": 1835 + }, + { + "epoch": 0.9948523435383365, + "grad_norm": 0.11703436076641083, + "learning_rate": 0.0001, + "loss": 1.5294, + "step": 1836 + }, + { + "epoch": 0.9953942021132485, + "grad_norm": 0.12222783267498016, + "learning_rate": 0.0001, + "loss": 1.6134, + "step": 1837 + }, + { + "epoch": 0.9959360606881604, + "grad_norm": 0.11635846644639969, + "learning_rate": 0.0001, + "loss": 1.5942, + "step": 1838 + }, + { + "epoch": 0.9964779192630724, + "grad_norm": 0.12245723605155945, + "learning_rate": 0.0001, + "loss": 1.5653, + "step": 1839 + }, + { + "epoch": 0.9970197778379842, + "grad_norm": 0.1175963282585144, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 1840 + }, + { + "epoch": 0.9975616364128962, + "grad_norm": 0.13284112513065338, + "learning_rate": 0.0001, + "loss": 1.5772, + "step": 1841 + }, + { + "epoch": 0.9981034949878081, + "grad_norm": 0.11542721837759018, + "learning_rate": 0.0001, + "loss": 1.5804, + "step": 1842 + }, + { + "epoch": 0.9986453535627201, + "grad_norm": 0.11047124862670898, + "learning_rate": 0.0001, + "loss": 1.5168, + "step": 1843 + }, + { + "epoch": 0.9991872121376321, + "grad_norm": 0.11980320513248444, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 1844 + }, + { + "epoch": 0.999729070712544, + "grad_norm": 0.11151415854692459, + "learning_rate": 0.0001, + "loss": 1.5566, + "step": 1845 + }, + { + "epoch": 0.999729070712544, + "step": 1845, + "total_flos": 2.0947907107325215e+19, + "train_loss": 1.6380996824926155, + "train_runtime": 59531.8826, + "train_samples_per_second": 0.496, + "train_steps_per_second": 0.031 + } + ], + "logging_steps": 1, + "max_steps": 1845, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.0947907107325215e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}