diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,61698 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999467841672254, + "eval_steps": 500, + "global_step": 8808, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00011352710991907646, + "grad_norm": 8.75, + "learning_rate": 0.00020600000000000002, + "loss": 11.9909, + "step": 1 + }, + { + "epoch": 0.00022705421983815292, + "grad_norm": 7.71875, + "learning_rate": 0.00021200000000000003, + "loss": 11.428, + "step": 2 + }, + { + "epoch": 0.00034058132975722935, + "grad_norm": 8.25, + "learning_rate": 0.00021800000000000004, + "loss": 10.9556, + "step": 3 + }, + { + "epoch": 0.00045410843967630583, + "grad_norm": 7.71875, + "learning_rate": 0.000224, + "loss": 10.4866, + "step": 4 + }, + { + "epoch": 0.0005676355495953823, + "grad_norm": 6.40625, + "learning_rate": 0.00023, + "loss": 10.0525, + "step": 5 + }, + { + "epoch": 0.0006811626595144587, + "grad_norm": 5.5, + "learning_rate": 0.00023600000000000002, + "loss": 9.6593, + "step": 6 + }, + { + "epoch": 0.0007946897694335352, + "grad_norm": 6.375, + "learning_rate": 0.00024200000000000003, + "loss": 9.3475, + "step": 7 + }, + { + "epoch": 0.0009082168793526117, + "grad_norm": 4.34375, + "learning_rate": 0.000248, + "loss": 9.0679, + "step": 8 + }, + { + "epoch": 0.001021743989271688, + "grad_norm": 2.984375, + "learning_rate": 0.000254, + "loss": 8.8488, + "step": 9 + }, + { + "epoch": 0.0011352710991907645, + "grad_norm": 2.3125, + "learning_rate": 0.00026000000000000003, + "loss": 8.6739, + "step": 10 + }, + { + "epoch": 0.001248798209109841, + "grad_norm": 2.015625, + "learning_rate": 0.000266, + "loss": 8.5514, + "step": 11 + }, + { + "epoch": 0.0013623253190289174, + "grad_norm": 1.8359375, + "learning_rate": 0.00027200000000000005, + "loss": 8.4544, + "step": 12 + }, + { + "epoch": 0.001475852428947994, + "grad_norm": 2.0625, + "learning_rate": 0.00027800000000000004, + "loss": 8.3934, + "step": 13 + }, + { + "epoch": 0.0015893795388670705, + "grad_norm": 2.296875, + "learning_rate": 0.000284, + "loss": 8.371, + "step": 14 + }, + { + "epoch": 0.001702906648786147, + "grad_norm": 2.03125, + "learning_rate": 0.00029000000000000006, + "loss": 8.3597, + "step": 15 + }, + { + "epoch": 0.0018164337587052233, + "grad_norm": 2.015625, + "learning_rate": 0.00029600000000000004, + "loss": 8.3499, + "step": 16 + }, + { + "epoch": 0.0019299608686242998, + "grad_norm": 2.234375, + "learning_rate": 0.000302, + "loss": 8.3017, + "step": 17 + }, + { + "epoch": 0.002043487978543376, + "grad_norm": 1.890625, + "learning_rate": 0.000308, + "loss": 8.3102, + "step": 18 + }, + { + "epoch": 0.0021570150884624526, + "grad_norm": 2.3125, + "learning_rate": 0.00031400000000000004, + "loss": 8.3016, + "step": 19 + }, + { + "epoch": 0.002270542198381529, + "grad_norm": 2.21875, + "learning_rate": 0.00032, + "loss": 8.2926, + "step": 20 + }, + { + "epoch": 0.0023840693083006055, + "grad_norm": 2.3125, + "learning_rate": 0.00032600000000000006, + "loss": 8.2745, + "step": 21 + }, + { + "epoch": 0.002497596418219682, + "grad_norm": 2.203125, + "learning_rate": 0.00033200000000000005, + "loss": 8.2567, + "step": 22 + }, + { + "epoch": 0.0026111235281387583, + "grad_norm": 2.1875, + "learning_rate": 0.000338, + "loss": 8.238, + "step": 23 + }, + { + "epoch": 0.0027246506380578348, + "grad_norm": 2.34375, + "learning_rate": 0.000344, + "loss": 8.2471, + "step": 24 + }, + { + "epoch": 0.0028381777479769116, + "grad_norm": 2.328125, + "learning_rate": 0.00035, + "loss": 8.2487, + "step": 25 + }, + { + "epoch": 0.002951704857895988, + "grad_norm": 2.671875, + "learning_rate": 0.000356, + "loss": 8.2324, + "step": 26 + }, + { + "epoch": 0.0030652319678150645, + "grad_norm": 2.390625, + "learning_rate": 0.000362, + "loss": 8.2254, + "step": 27 + }, + { + "epoch": 0.003178759077734141, + "grad_norm": 2.859375, + "learning_rate": 0.000368, + "loss": 8.2185, + "step": 28 + }, + { + "epoch": 0.0032922861876532174, + "grad_norm": 2.21875, + "learning_rate": 0.000374, + "loss": 8.2334, + "step": 29 + }, + { + "epoch": 0.003405813297572294, + "grad_norm": 2.84375, + "learning_rate": 0.00038, + "loss": 8.2103, + "step": 30 + }, + { + "epoch": 0.00351934040749137, + "grad_norm": 2.125, + "learning_rate": 0.000386, + "loss": 8.2065, + "step": 31 + }, + { + "epoch": 0.0036328675174104466, + "grad_norm": 2.84375, + "learning_rate": 0.00039200000000000004, + "loss": 8.2215, + "step": 32 + }, + { + "epoch": 0.003746394627329523, + "grad_norm": 2.296875, + "learning_rate": 0.000398, + "loss": 8.2297, + "step": 33 + }, + { + "epoch": 0.0038599217372485995, + "grad_norm": 4.65625, + "learning_rate": 0.000404, + "loss": 8.2004, + "step": 34 + }, + { + "epoch": 0.003973448847167676, + "grad_norm": 3.234375, + "learning_rate": 0.00041000000000000005, + "loss": 8.1734, + "step": 35 + }, + { + "epoch": 0.004086975957086752, + "grad_norm": 3.359375, + "learning_rate": 0.00041600000000000003, + "loss": 8.1699, + "step": 36 + }, + { + "epoch": 0.004200503067005829, + "grad_norm": 3.59375, + "learning_rate": 0.00042200000000000007, + "loss": 8.1847, + "step": 37 + }, + { + "epoch": 0.004314030176924905, + "grad_norm": 2.859375, + "learning_rate": 0.00042800000000000005, + "loss": 8.1821, + "step": 38 + }, + { + "epoch": 0.004427557286843982, + "grad_norm": 3.046875, + "learning_rate": 0.0004340000000000001, + "loss": 8.167, + "step": 39 + }, + { + "epoch": 0.004541084396763058, + "grad_norm": 2.21875, + "learning_rate": 0.00044, + "loss": 8.1414, + "step": 40 + }, + { + "epoch": 0.004654611506682135, + "grad_norm": 3.109375, + "learning_rate": 0.000446, + "loss": 8.1489, + "step": 41 + }, + { + "epoch": 0.004768138616601211, + "grad_norm": 2.75, + "learning_rate": 0.0004520000000000001, + "loss": 8.1023, + "step": 42 + }, + { + "epoch": 0.004881665726520288, + "grad_norm": 3.265625, + "learning_rate": 0.000458, + "loss": 8.1471, + "step": 43 + }, + { + "epoch": 0.004995192836439364, + "grad_norm": 2.1875, + "learning_rate": 0.00046400000000000006, + "loss": 8.117, + "step": 44 + }, + { + "epoch": 0.005108719946358441, + "grad_norm": 2.84375, + "learning_rate": 0.00047000000000000004, + "loss": 8.1023, + "step": 45 + }, + { + "epoch": 0.005222247056277517, + "grad_norm": 1.7421875, + "learning_rate": 0.00047599999999999997, + "loss": 8.0842, + "step": 46 + }, + { + "epoch": 0.0053357741661965935, + "grad_norm": 2.5, + "learning_rate": 0.00048200000000000006, + "loss": 8.071, + "step": 47 + }, + { + "epoch": 0.0054493012761156695, + "grad_norm": 1.3203125, + "learning_rate": 0.00048800000000000004, + "loss": 8.0508, + "step": 48 + }, + { + "epoch": 0.005562828386034746, + "grad_norm": 2.5, + "learning_rate": 0.000494, + "loss": 8.0878, + "step": 49 + }, + { + "epoch": 0.005676355495953823, + "grad_norm": 1.2890625, + "learning_rate": 0.0005, + "loss": 8.0276, + "step": 50 + }, + { + "epoch": 0.005789882605872899, + "grad_norm": 2.78125, + "learning_rate": 0.000506, + "loss": 8.0411, + "step": 51 + }, + { + "epoch": 0.005903409715791976, + "grad_norm": 1.2578125, + "learning_rate": 0.000512, + "loss": 8.0092, + "step": 52 + }, + { + "epoch": 0.006016936825711052, + "grad_norm": 3.5, + "learning_rate": 0.000518, + "loss": 8.0271, + "step": 53 + }, + { + "epoch": 0.006130463935630129, + "grad_norm": 1.5, + "learning_rate": 0.000524, + "loss": 8.009, + "step": 54 + }, + { + "epoch": 0.006243991045549205, + "grad_norm": 3.171875, + "learning_rate": 0.0005300000000000001, + "loss": 7.9902, + "step": 55 + }, + { + "epoch": 0.006357518155468282, + "grad_norm": 1.2578125, + "learning_rate": 0.000536, + "loss": 7.9482, + "step": 56 + }, + { + "epoch": 0.006471045265387358, + "grad_norm": 2.625, + "learning_rate": 0.0005420000000000001, + "loss": 7.9866, + "step": 57 + }, + { + "epoch": 0.006584572375306435, + "grad_norm": 1.1953125, + "learning_rate": 0.0005480000000000001, + "loss": 7.9408, + "step": 58 + }, + { + "epoch": 0.006698099485225511, + "grad_norm": 2.234375, + "learning_rate": 0.000554, + "loss": 7.9441, + "step": 59 + }, + { + "epoch": 0.006811626595144588, + "grad_norm": 1.734375, + "learning_rate": 0.0005600000000000001, + "loss": 7.9147, + "step": 60 + }, + { + "epoch": 0.006925153705063664, + "grad_norm": 2.390625, + "learning_rate": 0.0005660000000000001, + "loss": 7.9292, + "step": 61 + }, + { + "epoch": 0.00703868081498274, + "grad_norm": 1.3828125, + "learning_rate": 0.000572, + "loss": 7.8685, + "step": 62 + }, + { + "epoch": 0.007152207924901816, + "grad_norm": 2.25, + "learning_rate": 0.0005780000000000001, + "loss": 7.8874, + "step": 63 + }, + { + "epoch": 0.007265735034820893, + "grad_norm": 1.5546875, + "learning_rate": 0.0005840000000000001, + "loss": 7.8855, + "step": 64 + }, + { + "epoch": 0.007379262144739969, + "grad_norm": 2.109375, + "learning_rate": 0.0005900000000000001, + "loss": 7.8879, + "step": 65 + }, + { + "epoch": 0.007492789254659046, + "grad_norm": 1.4921875, + "learning_rate": 0.0005960000000000001, + "loss": 7.834, + "step": 66 + }, + { + "epoch": 0.007606316364578123, + "grad_norm": 2.078125, + "learning_rate": 0.0006020000000000001, + "loss": 7.8308, + "step": 67 + }, + { + "epoch": 0.007719843474497199, + "grad_norm": 1.7421875, + "learning_rate": 0.000608, + "loss": 7.8146, + "step": 68 + }, + { + "epoch": 0.007833370584416275, + "grad_norm": 2.109375, + "learning_rate": 0.0006140000000000001, + "loss": 7.8115, + "step": 69 + }, + { + "epoch": 0.007946897694335353, + "grad_norm": 1.6015625, + "learning_rate": 0.0006200000000000001, + "loss": 7.771, + "step": 70 + }, + { + "epoch": 0.008060424804254429, + "grad_norm": 2.046875, + "learning_rate": 0.000626, + "loss": 7.7673, + "step": 71 + }, + { + "epoch": 0.008173951914173505, + "grad_norm": 1.6015625, + "learning_rate": 0.000632, + "loss": 7.769, + "step": 72 + }, + { + "epoch": 0.00828747902409258, + "grad_norm": 1.8671875, + "learning_rate": 0.0006380000000000001, + "loss": 7.7397, + "step": 73 + }, + { + "epoch": 0.008401006134011658, + "grad_norm": 1.4453125, + "learning_rate": 0.000644, + "loss": 7.7263, + "step": 74 + }, + { + "epoch": 0.008514533243930734, + "grad_norm": 2.171875, + "learning_rate": 0.0006500000000000001, + "loss": 7.7123, + "step": 75 + }, + { + "epoch": 0.00862806035384981, + "grad_norm": 1.4296875, + "learning_rate": 0.0006560000000000001, + "loss": 7.7051, + "step": 76 + }, + { + "epoch": 0.008741587463768886, + "grad_norm": 2.046875, + "learning_rate": 0.0006619999999999999, + "loss": 7.6825, + "step": 77 + }, + { + "epoch": 0.008855114573687964, + "grad_norm": 1.640625, + "learning_rate": 0.0006680000000000001, + "loss": 7.6839, + "step": 78 + }, + { + "epoch": 0.00896864168360704, + "grad_norm": 2.15625, + "learning_rate": 0.0006739999999999999, + "loss": 7.6467, + "step": 79 + }, + { + "epoch": 0.009082168793526116, + "grad_norm": 1.5546875, + "learning_rate": 0.0006799999999999999, + "loss": 7.6222, + "step": 80 + }, + { + "epoch": 0.009195695903445192, + "grad_norm": 2.125, + "learning_rate": 0.0006860000000000001, + "loss": 7.6246, + "step": 81 + }, + { + "epoch": 0.00930922301336427, + "grad_norm": 1.421875, + "learning_rate": 0.000692, + "loss": 7.6004, + "step": 82 + }, + { + "epoch": 0.009422750123283346, + "grad_norm": 1.7734375, + "learning_rate": 0.0006979999999999999, + "loss": 7.5864, + "step": 83 + }, + { + "epoch": 0.009536277233202422, + "grad_norm": 1.203125, + "learning_rate": 0.0007040000000000002, + "loss": 7.5924, + "step": 84 + }, + { + "epoch": 0.0096498043431215, + "grad_norm": 1.71875, + "learning_rate": 0.00071, + "loss": 7.5853, + "step": 85 + }, + { + "epoch": 0.009763331453040576, + "grad_norm": 1.5, + "learning_rate": 0.000716, + "loss": 7.5521, + "step": 86 + }, + { + "epoch": 0.009876858562959652, + "grad_norm": 2.046875, + "learning_rate": 0.000722, + "loss": 7.5544, + "step": 87 + }, + { + "epoch": 0.009990385672878728, + "grad_norm": 1.4140625, + "learning_rate": 0.000728, + "loss": 7.5235, + "step": 88 + }, + { + "epoch": 0.010103912782797805, + "grad_norm": 1.6640625, + "learning_rate": 0.000734, + "loss": 7.5459, + "step": 89 + }, + { + "epoch": 0.010217439892716881, + "grad_norm": 1.234375, + "learning_rate": 0.00074, + "loss": 7.5094, + "step": 90 + }, + { + "epoch": 0.010330967002635957, + "grad_norm": 1.625, + "learning_rate": 0.000746, + "loss": 7.491, + "step": 91 + }, + { + "epoch": 0.010444494112555033, + "grad_norm": 1.5625, + "learning_rate": 0.0007520000000000001, + "loss": 7.4972, + "step": 92 + }, + { + "epoch": 0.010558021222474111, + "grad_norm": 1.953125, + "learning_rate": 0.000758, + "loss": 7.4828, + "step": 93 + }, + { + "epoch": 0.010671548332393187, + "grad_norm": 1.390625, + "learning_rate": 0.000764, + "loss": 7.4417, + "step": 94 + }, + { + "epoch": 0.010785075442312263, + "grad_norm": 1.6640625, + "learning_rate": 0.0007700000000000001, + "loss": 7.4512, + "step": 95 + }, + { + "epoch": 0.010898602552231339, + "grad_norm": 1.453125, + "learning_rate": 0.000776, + "loss": 7.4023, + "step": 96 + }, + { + "epoch": 0.011012129662150417, + "grad_norm": 1.6484375, + "learning_rate": 0.000782, + "loss": 7.4279, + "step": 97 + }, + { + "epoch": 0.011125656772069493, + "grad_norm": 1.3984375, + "learning_rate": 0.0007880000000000001, + "loss": 7.4036, + "step": 98 + }, + { + "epoch": 0.011239183881988569, + "grad_norm": 1.5703125, + "learning_rate": 0.0007940000000000001, + "loss": 7.4104, + "step": 99 + }, + { + "epoch": 0.011352710991907647, + "grad_norm": 1.3203125, + "learning_rate": 0.0008, + "loss": 7.3965, + "step": 100 + }, + { + "epoch": 0.011466238101826723, + "grad_norm": 1.3125, + "learning_rate": 0.0008060000000000001, + "loss": 7.3772, + "step": 101 + }, + { + "epoch": 0.011579765211745799, + "grad_norm": 1.359375, + "learning_rate": 0.0008120000000000001, + "loss": 7.3716, + "step": 102 + }, + { + "epoch": 0.011693292321664875, + "grad_norm": 1.515625, + "learning_rate": 0.000818, + "loss": 7.3657, + "step": 103 + }, + { + "epoch": 0.011806819431583952, + "grad_norm": 1.3515625, + "learning_rate": 0.0008240000000000001, + "loss": 7.3203, + "step": 104 + }, + { + "epoch": 0.011920346541503028, + "grad_norm": 1.265625, + "learning_rate": 0.0008300000000000001, + "loss": 7.3213, + "step": 105 + }, + { + "epoch": 0.012033873651422104, + "grad_norm": 1.2265625, + "learning_rate": 0.000836, + "loss": 7.3194, + "step": 106 + }, + { + "epoch": 0.01214740076134118, + "grad_norm": 1.3203125, + "learning_rate": 0.0008420000000000001, + "loss": 7.3077, + "step": 107 + }, + { + "epoch": 0.012260927871260258, + "grad_norm": 1.34375, + "learning_rate": 0.0008480000000000001, + "loss": 7.2861, + "step": 108 + }, + { + "epoch": 0.012374454981179334, + "grad_norm": 1.21875, + "learning_rate": 0.0008540000000000002, + "loss": 7.2577, + "step": 109 + }, + { + "epoch": 0.01248798209109841, + "grad_norm": 1.3671875, + "learning_rate": 0.0008599999999999999, + "loss": 7.2872, + "step": 110 + }, + { + "epoch": 0.012601509201017486, + "grad_norm": 1.25, + "learning_rate": 0.0008660000000000001, + "loss": 7.2687, + "step": 111 + }, + { + "epoch": 0.012715036310936564, + "grad_norm": 1.125, + "learning_rate": 0.0008720000000000002, + "loss": 7.2544, + "step": 112 + }, + { + "epoch": 0.01282856342085564, + "grad_norm": 1.078125, + "learning_rate": 0.0008779999999999999, + "loss": 7.2292, + "step": 113 + }, + { + "epoch": 0.012942090530774716, + "grad_norm": 1.109375, + "learning_rate": 0.0008840000000000001, + "loss": 7.2224, + "step": 114 + }, + { + "epoch": 0.013055617640693792, + "grad_norm": 1.2734375, + "learning_rate": 0.0008900000000000002, + "loss": 7.2196, + "step": 115 + }, + { + "epoch": 0.01316914475061287, + "grad_norm": 1.0859375, + "learning_rate": 0.0008959999999999999, + "loss": 7.1945, + "step": 116 + }, + { + "epoch": 0.013282671860531945, + "grad_norm": 1.1640625, + "learning_rate": 0.0009020000000000001, + "loss": 7.1936, + "step": 117 + }, + { + "epoch": 0.013396198970451021, + "grad_norm": 1.1171875, + "learning_rate": 0.000908, + "loss": 7.189, + "step": 118 + }, + { + "epoch": 0.0135097260803701, + "grad_norm": 1.3359375, + "learning_rate": 0.000914, + "loss": 7.1527, + "step": 119 + }, + { + "epoch": 0.013623253190289175, + "grad_norm": 1.1875, + "learning_rate": 0.0009200000000000001, + "loss": 7.1539, + "step": 120 + }, + { + "epoch": 0.013736780300208251, + "grad_norm": 1.4453125, + "learning_rate": 0.000926, + "loss": 7.1505, + "step": 121 + }, + { + "epoch": 0.013850307410127327, + "grad_norm": 1.0625, + "learning_rate": 0.000932, + "loss": 7.1208, + "step": 122 + }, + { + "epoch": 0.013963834520046405, + "grad_norm": 1.3046875, + "learning_rate": 0.0009379999999999999, + "loss": 7.1369, + "step": 123 + }, + { + "epoch": 0.01407736162996548, + "grad_norm": 1.1171875, + "learning_rate": 0.000944, + "loss": 7.111, + "step": 124 + }, + { + "epoch": 0.014190888739884557, + "grad_norm": 1.3515625, + "learning_rate": 0.00095, + "loss": 7.1177, + "step": 125 + }, + { + "epoch": 0.014304415849803633, + "grad_norm": 0.95703125, + "learning_rate": 0.0009559999999999999, + "loss": 7.0898, + "step": 126 + }, + { + "epoch": 0.01441794295972271, + "grad_norm": 1.296875, + "learning_rate": 0.000962, + "loss": 7.1302, + "step": 127 + }, + { + "epoch": 0.014531470069641787, + "grad_norm": 0.984375, + "learning_rate": 0.000968, + "loss": 7.0921, + "step": 128 + }, + { + "epoch": 0.014644997179560863, + "grad_norm": 1.15625, + "learning_rate": 0.000974, + "loss": 7.0639, + "step": 129 + }, + { + "epoch": 0.014758524289479939, + "grad_norm": 0.8671875, + "learning_rate": 0.00098, + "loss": 7.0577, + "step": 130 + }, + { + "epoch": 0.014872051399399016, + "grad_norm": 1.0546875, + "learning_rate": 0.0009860000000000001, + "loss": 7.0683, + "step": 131 + }, + { + "epoch": 0.014985578509318092, + "grad_norm": 1.203125, + "learning_rate": 0.000992, + "loss": 7.0393, + "step": 132 + }, + { + "epoch": 0.015099105619237168, + "grad_norm": 0.97265625, + "learning_rate": 0.000998, + "loss": 7.0399, + "step": 133 + }, + { + "epoch": 0.015212632729156246, + "grad_norm": 1.171875, + "learning_rate": 0.0010040000000000001, + "loss": 7.0315, + "step": 134 + }, + { + "epoch": 0.015326159839075322, + "grad_norm": 1.046875, + "learning_rate": 0.00101, + "loss": 7.0166, + "step": 135 + }, + { + "epoch": 0.015439686948994398, + "grad_norm": 1.1640625, + "learning_rate": 0.001016, + "loss": 7.0053, + "step": 136 + }, + { + "epoch": 0.015553214058913474, + "grad_norm": 0.8671875, + "learning_rate": 0.0010220000000000001, + "loss": 6.9946, + "step": 137 + }, + { + "epoch": 0.01566674116883255, + "grad_norm": 0.95703125, + "learning_rate": 0.001028, + "loss": 7.0057, + "step": 138 + }, + { + "epoch": 0.015780268278751626, + "grad_norm": 0.859375, + "learning_rate": 0.001034, + "loss": 6.9886, + "step": 139 + }, + { + "epoch": 0.015893795388670705, + "grad_norm": 0.90625, + "learning_rate": 0.0010400000000000001, + "loss": 6.9678, + "step": 140 + }, + { + "epoch": 0.01600732249858978, + "grad_norm": 0.9609375, + "learning_rate": 0.001046, + "loss": 6.9794, + "step": 141 + }, + { + "epoch": 0.016120849608508857, + "grad_norm": 0.9140625, + "learning_rate": 0.001052, + "loss": 6.9735, + "step": 142 + }, + { + "epoch": 0.016234376718427933, + "grad_norm": 0.91796875, + "learning_rate": 0.0010580000000000001, + "loss": 6.9456, + "step": 143 + }, + { + "epoch": 0.01634790382834701, + "grad_norm": 0.9140625, + "learning_rate": 0.001064, + "loss": 6.9397, + "step": 144 + }, + { + "epoch": 0.016461430938266085, + "grad_norm": 1.0078125, + "learning_rate": 0.00107, + "loss": 6.9411, + "step": 145 + }, + { + "epoch": 0.01657495804818516, + "grad_norm": 1.015625, + "learning_rate": 0.0010760000000000001, + "loss": 6.9274, + "step": 146 + }, + { + "epoch": 0.016688485158104237, + "grad_norm": 1.1171875, + "learning_rate": 0.001082, + "loss": 6.921, + "step": 147 + }, + { + "epoch": 0.016802012268023317, + "grad_norm": 0.9609375, + "learning_rate": 0.0010880000000000002, + "loss": 6.9047, + "step": 148 + }, + { + "epoch": 0.016915539377942393, + "grad_norm": 1.1640625, + "learning_rate": 0.0010940000000000001, + "loss": 6.9285, + "step": 149 + }, + { + "epoch": 0.01702906648786147, + "grad_norm": 0.921875, + "learning_rate": 0.0011, + "loss": 6.8916, + "step": 150 + }, + { + "epoch": 0.017142593597780545, + "grad_norm": 1.0703125, + "learning_rate": 0.001106, + "loss": 6.8917, + "step": 151 + }, + { + "epoch": 0.01725612070769962, + "grad_norm": 1.0625, + "learning_rate": 0.0011120000000000001, + "loss": 6.8983, + "step": 152 + }, + { + "epoch": 0.017369647817618697, + "grad_norm": 0.94921875, + "learning_rate": 0.001118, + "loss": 6.872, + "step": 153 + }, + { + "epoch": 0.017483174927537773, + "grad_norm": 0.9296875, + "learning_rate": 0.001124, + "loss": 6.8641, + "step": 154 + }, + { + "epoch": 0.017596702037456852, + "grad_norm": 0.9453125, + "learning_rate": 0.0011300000000000001, + "loss": 6.8508, + "step": 155 + }, + { + "epoch": 0.01771022914737593, + "grad_norm": 1.109375, + "learning_rate": 0.001136, + "loss": 6.8742, + "step": 156 + }, + { + "epoch": 0.017823756257295004, + "grad_norm": 1.109375, + "learning_rate": 0.001142, + "loss": 6.8687, + "step": 157 + }, + { + "epoch": 0.01793728336721408, + "grad_norm": 1.0546875, + "learning_rate": 0.001148, + "loss": 6.8461, + "step": 158 + }, + { + "epoch": 0.018050810477133156, + "grad_norm": 1.015625, + "learning_rate": 0.001154, + "loss": 6.8546, + "step": 159 + }, + { + "epoch": 0.018164337587052232, + "grad_norm": 0.89453125, + "learning_rate": 0.00116, + "loss": 6.8339, + "step": 160 + }, + { + "epoch": 0.01827786469697131, + "grad_norm": 0.8671875, + "learning_rate": 0.001166, + "loss": 6.8353, + "step": 161 + }, + { + "epoch": 0.018391391806890384, + "grad_norm": 1.046875, + "learning_rate": 0.001172, + "loss": 6.8294, + "step": 162 + }, + { + "epoch": 0.018504918916809464, + "grad_norm": 1.0703125, + "learning_rate": 0.001178, + "loss": 6.816, + "step": 163 + }, + { + "epoch": 0.01861844602672854, + "grad_norm": 0.83984375, + "learning_rate": 0.001184, + "loss": 6.8221, + "step": 164 + }, + { + "epoch": 0.018731973136647616, + "grad_norm": 0.93359375, + "learning_rate": 0.00119, + "loss": 6.8141, + "step": 165 + }, + { + "epoch": 0.018845500246566692, + "grad_norm": 0.78515625, + "learning_rate": 0.001196, + "loss": 6.7819, + "step": 166 + }, + { + "epoch": 0.018959027356485768, + "grad_norm": 0.84765625, + "learning_rate": 0.001202, + "loss": 6.8, + "step": 167 + }, + { + "epoch": 0.019072554466404844, + "grad_norm": 0.92578125, + "learning_rate": 0.0012080000000000003, + "loss": 6.7826, + "step": 168 + }, + { + "epoch": 0.01918608157632392, + "grad_norm": 0.98046875, + "learning_rate": 0.001214, + "loss": 6.7639, + "step": 169 + }, + { + "epoch": 0.019299608686243, + "grad_norm": 1.03125, + "learning_rate": 0.00122, + "loss": 6.7705, + "step": 170 + }, + { + "epoch": 0.019413135796162075, + "grad_norm": 0.90234375, + "learning_rate": 0.001226, + "loss": 6.7804, + "step": 171 + }, + { + "epoch": 0.01952666290608115, + "grad_norm": 0.8828125, + "learning_rate": 0.001232, + "loss": 6.7498, + "step": 172 + }, + { + "epoch": 0.019640190016000227, + "grad_norm": 0.8359375, + "learning_rate": 0.001238, + "loss": 6.772, + "step": 173 + }, + { + "epoch": 0.019753717125919303, + "grad_norm": 0.83984375, + "learning_rate": 0.001244, + "loss": 6.7528, + "step": 174 + }, + { + "epoch": 0.01986724423583838, + "grad_norm": 0.83984375, + "learning_rate": 0.00125, + "loss": 6.7417, + "step": 175 + }, + { + "epoch": 0.019980771345757455, + "grad_norm": 0.96484375, + "learning_rate": 0.001256, + "loss": 6.7512, + "step": 176 + }, + { + "epoch": 0.02009429845567653, + "grad_norm": 0.79296875, + "learning_rate": 0.001262, + "loss": 6.7275, + "step": 177 + }, + { + "epoch": 0.02020782556559561, + "grad_norm": 0.92578125, + "learning_rate": 0.001268, + "loss": 6.7165, + "step": 178 + }, + { + "epoch": 0.020321352675514687, + "grad_norm": 0.9765625, + "learning_rate": 0.001274, + "loss": 6.7315, + "step": 179 + }, + { + "epoch": 0.020434879785433763, + "grad_norm": 0.953125, + "learning_rate": 0.00128, + "loss": 6.716, + "step": 180 + }, + { + "epoch": 0.02054840689535284, + "grad_norm": 0.81640625, + "learning_rate": 0.001286, + "loss": 6.7107, + "step": 181 + }, + { + "epoch": 0.020661934005271915, + "grad_norm": 0.72265625, + "learning_rate": 0.001292, + "loss": 6.6907, + "step": 182 + }, + { + "epoch": 0.02077546111519099, + "grad_norm": 0.71875, + "learning_rate": 0.0012980000000000001, + "loss": 6.677, + "step": 183 + }, + { + "epoch": 0.020888988225110067, + "grad_norm": 0.7890625, + "learning_rate": 0.0013039999999999998, + "loss": 6.6887, + "step": 184 + }, + { + "epoch": 0.021002515335029146, + "grad_norm": 0.83203125, + "learning_rate": 0.0013100000000000002, + "loss": 6.699, + "step": 185 + }, + { + "epoch": 0.021116042444948222, + "grad_norm": 0.88671875, + "learning_rate": 0.0013160000000000001, + "loss": 6.6653, + "step": 186 + }, + { + "epoch": 0.021229569554867298, + "grad_norm": 0.8515625, + "learning_rate": 0.0013219999999999998, + "loss": 6.6719, + "step": 187 + }, + { + "epoch": 0.021343096664786374, + "grad_norm": 0.73828125, + "learning_rate": 0.0013280000000000002, + "loss": 6.6611, + "step": 188 + }, + { + "epoch": 0.02145662377470545, + "grad_norm": 0.76171875, + "learning_rate": 0.0013340000000000001, + "loss": 6.6477, + "step": 189 + }, + { + "epoch": 0.021570150884624526, + "grad_norm": 0.80078125, + "learning_rate": 0.0013399999999999998, + "loss": 6.6759, + "step": 190 + }, + { + "epoch": 0.021683677994543602, + "grad_norm": 0.87890625, + "learning_rate": 0.0013460000000000002, + "loss": 6.6546, + "step": 191 + }, + { + "epoch": 0.021797205104462678, + "grad_norm": 0.88671875, + "learning_rate": 0.0013520000000000001, + "loss": 6.6508, + "step": 192 + }, + { + "epoch": 0.021910732214381758, + "grad_norm": 0.8359375, + "learning_rate": 0.0013579999999999998, + "loss": 6.6491, + "step": 193 + }, + { + "epoch": 0.022024259324300834, + "grad_norm": 0.85546875, + "learning_rate": 0.001364, + "loss": 6.6361, + "step": 194 + }, + { + "epoch": 0.02213778643421991, + "grad_norm": 0.84375, + "learning_rate": 0.0013700000000000001, + "loss": 6.6242, + "step": 195 + }, + { + "epoch": 0.022251313544138986, + "grad_norm": 0.89453125, + "learning_rate": 0.0013759999999999998, + "loss": 6.6195, + "step": 196 + }, + { + "epoch": 0.02236484065405806, + "grad_norm": 0.80078125, + "learning_rate": 0.001382, + "loss": 6.6039, + "step": 197 + }, + { + "epoch": 0.022478367763977138, + "grad_norm": 0.703125, + "learning_rate": 0.0013880000000000001, + "loss": 6.6206, + "step": 198 + }, + { + "epoch": 0.022591894873896214, + "grad_norm": 0.765625, + "learning_rate": 0.0013939999999999998, + "loss": 6.6069, + "step": 199 + }, + { + "epoch": 0.022705421983815293, + "grad_norm": 0.8203125, + "learning_rate": 0.0014, + "loss": 6.6141, + "step": 200 + }, + { + "epoch": 0.02281894909373437, + "grad_norm": 0.88671875, + "learning_rate": 0.0014060000000000001, + "loss": 6.5865, + "step": 201 + }, + { + "epoch": 0.022932476203653445, + "grad_norm": 0.83203125, + "learning_rate": 0.0014119999999999998, + "loss": 6.6011, + "step": 202 + }, + { + "epoch": 0.02304600331357252, + "grad_norm": 0.78125, + "learning_rate": 0.001418, + "loss": 6.5957, + "step": 203 + }, + { + "epoch": 0.023159530423491597, + "grad_norm": 0.73828125, + "learning_rate": 0.0014240000000000001, + "loss": 6.588, + "step": 204 + }, + { + "epoch": 0.023273057533410673, + "grad_norm": 0.74609375, + "learning_rate": 0.00143, + "loss": 6.5857, + "step": 205 + }, + { + "epoch": 0.02338658464332975, + "grad_norm": 0.7578125, + "learning_rate": 0.001436, + "loss": 6.582, + "step": 206 + }, + { + "epoch": 0.023500111753248825, + "grad_norm": 0.921875, + "learning_rate": 0.001442, + "loss": 6.5843, + "step": 207 + }, + { + "epoch": 0.023613638863167905, + "grad_norm": 0.98828125, + "learning_rate": 0.001448, + "loss": 6.582, + "step": 208 + }, + { + "epoch": 0.02372716597308698, + "grad_norm": 0.94921875, + "learning_rate": 0.001454, + "loss": 6.5853, + "step": 209 + }, + { + "epoch": 0.023840693083006056, + "grad_norm": 0.875, + "learning_rate": 0.00146, + "loss": 6.5648, + "step": 210 + }, + { + "epoch": 0.023954220192925132, + "grad_norm": 0.8828125, + "learning_rate": 0.001466, + "loss": 6.5536, + "step": 211 + }, + { + "epoch": 0.02406774730284421, + "grad_norm": 0.828125, + "learning_rate": 0.001472, + "loss": 6.571, + "step": 212 + }, + { + "epoch": 0.024181274412763284, + "grad_norm": 0.73828125, + "learning_rate": 0.001478, + "loss": 6.5288, + "step": 213 + }, + { + "epoch": 0.02429480152268236, + "grad_norm": 0.71875, + "learning_rate": 0.001484, + "loss": 6.5409, + "step": 214 + }, + { + "epoch": 0.02440832863260144, + "grad_norm": 0.859375, + "learning_rate": 0.00149, + "loss": 6.5423, + "step": 215 + }, + { + "epoch": 0.024521855742520516, + "grad_norm": 0.8984375, + "learning_rate": 0.001496, + "loss": 6.5535, + "step": 216 + }, + { + "epoch": 0.024635382852439592, + "grad_norm": 0.80078125, + "learning_rate": 0.001502, + "loss": 6.523, + "step": 217 + }, + { + "epoch": 0.024748909962358668, + "grad_norm": 0.76171875, + "learning_rate": 0.001508, + "loss": 6.5229, + "step": 218 + }, + { + "epoch": 0.024862437072277744, + "grad_norm": 0.78515625, + "learning_rate": 0.001514, + "loss": 6.5246, + "step": 219 + }, + { + "epoch": 0.02497596418219682, + "grad_norm": 0.71484375, + "learning_rate": 0.0015199999999999999, + "loss": 6.5105, + "step": 220 + }, + { + "epoch": 0.025089491292115896, + "grad_norm": 0.7421875, + "learning_rate": 0.001526, + "loss": 6.5089, + "step": 221 + }, + { + "epoch": 0.025203018402034972, + "grad_norm": 0.78515625, + "learning_rate": 0.0015320000000000002, + "loss": 6.4997, + "step": 222 + }, + { + "epoch": 0.02531654551195405, + "grad_norm": 0.7265625, + "learning_rate": 0.0015379999999999999, + "loss": 6.4932, + "step": 223 + }, + { + "epoch": 0.025430072621873127, + "grad_norm": 0.71875, + "learning_rate": 0.001544, + "loss": 6.4813, + "step": 224 + }, + { + "epoch": 0.025543599731792203, + "grad_norm": 0.71484375, + "learning_rate": 0.0015500000000000002, + "loss": 6.4868, + "step": 225 + }, + { + "epoch": 0.02565712684171128, + "grad_norm": 0.8046875, + "learning_rate": 0.0015559999999999999, + "loss": 6.495, + "step": 226 + }, + { + "epoch": 0.025770653951630355, + "grad_norm": 0.79296875, + "learning_rate": 0.001562, + "loss": 6.484, + "step": 227 + }, + { + "epoch": 0.02588418106154943, + "grad_norm": 0.6953125, + "learning_rate": 0.0015680000000000002, + "loss": 6.4856, + "step": 228 + }, + { + "epoch": 0.025997708171468507, + "grad_norm": 0.69140625, + "learning_rate": 0.0015739999999999999, + "loss": 6.4888, + "step": 229 + }, + { + "epoch": 0.026111235281387583, + "grad_norm": 0.6640625, + "learning_rate": 0.00158, + "loss": 6.4776, + "step": 230 + }, + { + "epoch": 0.026224762391306663, + "grad_norm": 0.703125, + "learning_rate": 0.0015860000000000002, + "loss": 6.4745, + "step": 231 + }, + { + "epoch": 0.02633828950122574, + "grad_norm": 0.75390625, + "learning_rate": 0.0015919999999999999, + "loss": 6.4713, + "step": 232 + }, + { + "epoch": 0.026451816611144815, + "grad_norm": 0.7109375, + "learning_rate": 0.0015979999999999998, + "loss": 6.4736, + "step": 233 + }, + { + "epoch": 0.02656534372106389, + "grad_norm": 0.7734375, + "learning_rate": 0.0016040000000000002, + "loss": 6.4385, + "step": 234 + }, + { + "epoch": 0.026678870830982967, + "grad_norm": 0.76953125, + "learning_rate": 0.0016099999999999999, + "loss": 6.4602, + "step": 235 + }, + { + "epoch": 0.026792397940902043, + "grad_norm": 0.77734375, + "learning_rate": 0.0016159999999999998, + "loss": 6.4555, + "step": 236 + }, + { + "epoch": 0.02690592505082112, + "grad_norm": 0.8125, + "learning_rate": 0.0016220000000000002, + "loss": 6.4601, + "step": 237 + }, + { + "epoch": 0.0270194521607402, + "grad_norm": 0.80078125, + "learning_rate": 0.0016279999999999999, + "loss": 6.4413, + "step": 238 + }, + { + "epoch": 0.027132979270659274, + "grad_norm": 0.7890625, + "learning_rate": 0.001634, + "loss": 6.4384, + "step": 239 + }, + { + "epoch": 0.02724650638057835, + "grad_norm": 0.7890625, + "learning_rate": 0.0016400000000000002, + "loss": 6.4485, + "step": 240 + }, + { + "epoch": 0.027360033490497426, + "grad_norm": 0.7734375, + "learning_rate": 0.001646, + "loss": 6.4372, + "step": 241 + }, + { + "epoch": 0.027473560600416502, + "grad_norm": 0.77734375, + "learning_rate": 0.001652, + "loss": 6.4507, + "step": 242 + }, + { + "epoch": 0.02758708771033558, + "grad_norm": 0.6875, + "learning_rate": 0.0016580000000000002, + "loss": 6.4199, + "step": 243 + }, + { + "epoch": 0.027700614820254654, + "grad_norm": 0.69921875, + "learning_rate": 0.001664, + "loss": 6.4132, + "step": 244 + }, + { + "epoch": 0.02781414193017373, + "grad_norm": 0.765625, + "learning_rate": 0.00167, + "loss": 6.4206, + "step": 245 + }, + { + "epoch": 0.02792766904009281, + "grad_norm": 0.73046875, + "learning_rate": 0.001676, + "loss": 6.4129, + "step": 246 + }, + { + "epoch": 0.028041196150011886, + "grad_norm": 0.81640625, + "learning_rate": 0.001682, + "loss": 6.4176, + "step": 247 + }, + { + "epoch": 0.02815472325993096, + "grad_norm": 0.87890625, + "learning_rate": 0.001688, + "loss": 6.4286, + "step": 248 + }, + { + "epoch": 0.028268250369850038, + "grad_norm": 0.78125, + "learning_rate": 0.001694, + "loss": 6.4073, + "step": 249 + }, + { + "epoch": 0.028381777479769114, + "grad_norm": 0.83984375, + "learning_rate": 0.0017, + "loss": 6.4126, + "step": 250 + }, + { + "epoch": 0.02849530458968819, + "grad_norm": 0.7890625, + "learning_rate": 0.001706, + "loss": 6.4056, + "step": 251 + }, + { + "epoch": 0.028608831699607266, + "grad_norm": 0.76953125, + "learning_rate": 0.001712, + "loss": 6.3884, + "step": 252 + }, + { + "epoch": 0.028722358809526345, + "grad_norm": 0.74609375, + "learning_rate": 0.001718, + "loss": 6.4034, + "step": 253 + }, + { + "epoch": 0.02883588591944542, + "grad_norm": 0.7890625, + "learning_rate": 0.001724, + "loss": 6.4039, + "step": 254 + }, + { + "epoch": 0.028949413029364497, + "grad_norm": 0.77734375, + "learning_rate": 0.00173, + "loss": 6.3756, + "step": 255 + }, + { + "epoch": 0.029062940139283573, + "grad_norm": 0.69140625, + "learning_rate": 0.0017360000000000001, + "loss": 6.3928, + "step": 256 + }, + { + "epoch": 0.02917646724920265, + "grad_norm": 0.6796875, + "learning_rate": 0.001742, + "loss": 6.3748, + "step": 257 + }, + { + "epoch": 0.029289994359121725, + "grad_norm": 0.7890625, + "learning_rate": 0.001748, + "loss": 6.3946, + "step": 258 + }, + { + "epoch": 0.0294035214690408, + "grad_norm": 0.88671875, + "learning_rate": 0.0017540000000000001, + "loss": 6.3819, + "step": 259 + }, + { + "epoch": 0.029517048578959877, + "grad_norm": 0.76953125, + "learning_rate": 0.00176, + "loss": 6.3675, + "step": 260 + }, + { + "epoch": 0.029630575688878957, + "grad_norm": 0.66796875, + "learning_rate": 0.001766, + "loss": 6.3723, + "step": 261 + }, + { + "epoch": 0.029744102798798033, + "grad_norm": 0.65234375, + "learning_rate": 0.0017720000000000001, + "loss": 6.356, + "step": 262 + }, + { + "epoch": 0.02985762990871711, + "grad_norm": 0.6484375, + "learning_rate": 0.001778, + "loss": 6.3453, + "step": 263 + }, + { + "epoch": 0.029971157018636185, + "grad_norm": 0.6796875, + "learning_rate": 0.001784, + "loss": 6.3625, + "step": 264 + }, + { + "epoch": 0.03008468412855526, + "grad_norm": 0.62109375, + "learning_rate": 0.0017900000000000001, + "loss": 6.358, + "step": 265 + }, + { + "epoch": 0.030198211238474337, + "grad_norm": 0.67578125, + "learning_rate": 0.001796, + "loss": 6.3492, + "step": 266 + }, + { + "epoch": 0.030311738348393413, + "grad_norm": 0.69140625, + "learning_rate": 0.001802, + "loss": 6.3527, + "step": 267 + }, + { + "epoch": 0.030425265458312492, + "grad_norm": 0.66796875, + "learning_rate": 0.0018080000000000001, + "loss": 6.3575, + "step": 268 + }, + { + "epoch": 0.030538792568231568, + "grad_norm": 0.6640625, + "learning_rate": 0.0018139999999999999, + "loss": 6.3469, + "step": 269 + }, + { + "epoch": 0.030652319678150644, + "grad_norm": 0.671875, + "learning_rate": 0.00182, + "loss": 6.3497, + "step": 270 + }, + { + "epoch": 0.03076584678806972, + "grad_norm": 0.70703125, + "learning_rate": 0.0018260000000000001, + "loss": 6.3454, + "step": 271 + }, + { + "epoch": 0.030879373897988796, + "grad_norm": 0.6640625, + "learning_rate": 0.0018319999999999999, + "loss": 6.3361, + "step": 272 + }, + { + "epoch": 0.030992901007907872, + "grad_norm": 0.63671875, + "learning_rate": 0.0018380000000000002, + "loss": 6.3293, + "step": 273 + }, + { + "epoch": 0.031106428117826948, + "grad_norm": 0.6640625, + "learning_rate": 0.0018440000000000002, + "loss": 6.3312, + "step": 274 + }, + { + "epoch": 0.031219955227746024, + "grad_norm": 0.82421875, + "learning_rate": 0.0018499999999999999, + "loss": 6.3324, + "step": 275 + }, + { + "epoch": 0.0313334823376651, + "grad_norm": 0.796875, + "learning_rate": 0.0018560000000000002, + "loss": 6.3218, + "step": 276 + }, + { + "epoch": 0.031447009447584176, + "grad_norm": 0.8125, + "learning_rate": 0.0018620000000000002, + "loss": 6.3279, + "step": 277 + }, + { + "epoch": 0.03156053655750325, + "grad_norm": 0.84765625, + "learning_rate": 0.0018679999999999999, + "loss": 6.3369, + "step": 278 + }, + { + "epoch": 0.03167406366742233, + "grad_norm": 0.828125, + "learning_rate": 0.0018740000000000002, + "loss": 6.3231, + "step": 279 + }, + { + "epoch": 0.03178759077734141, + "grad_norm": 0.87109375, + "learning_rate": 0.0018800000000000002, + "loss": 6.3226, + "step": 280 + }, + { + "epoch": 0.03190111788726049, + "grad_norm": 0.6875, + "learning_rate": 0.0018859999999999999, + "loss": 6.3117, + "step": 281 + }, + { + "epoch": 0.03201464499717956, + "grad_norm": 0.65234375, + "learning_rate": 0.001892, + "loss": 6.2882, + "step": 282 + }, + { + "epoch": 0.03212817210709864, + "grad_norm": 0.73828125, + "learning_rate": 0.0018980000000000002, + "loss": 6.3303, + "step": 283 + }, + { + "epoch": 0.032241699217017715, + "grad_norm": 0.79296875, + "learning_rate": 0.0019039999999999999, + "loss": 6.3005, + "step": 284 + }, + { + "epoch": 0.03235522632693679, + "grad_norm": 0.67578125, + "learning_rate": 0.00191, + "loss": 6.313, + "step": 285 + }, + { + "epoch": 0.03246875343685587, + "grad_norm": 0.62109375, + "learning_rate": 0.0019160000000000002, + "loss": 6.2736, + "step": 286 + }, + { + "epoch": 0.03258228054677494, + "grad_norm": 0.60546875, + "learning_rate": 0.0019219999999999999, + "loss": 6.2803, + "step": 287 + }, + { + "epoch": 0.03269580765669402, + "grad_norm": 0.64453125, + "learning_rate": 0.001928, + "loss": 6.2968, + "step": 288 + }, + { + "epoch": 0.032809334766613095, + "grad_norm": 0.6875, + "learning_rate": 0.0019340000000000002, + "loss": 6.3097, + "step": 289 + }, + { + "epoch": 0.03292286187653217, + "grad_norm": 0.71875, + "learning_rate": 0.0019399999999999999, + "loss": 6.2837, + "step": 290 + }, + { + "epoch": 0.03303638898645125, + "grad_norm": 0.78515625, + "learning_rate": 0.001946, + "loss": 6.2716, + "step": 291 + }, + { + "epoch": 0.03314991609637032, + "grad_norm": 0.73046875, + "learning_rate": 0.0019520000000000002, + "loss": 6.2953, + "step": 292 + }, + { + "epoch": 0.0332634432062894, + "grad_norm": 0.625, + "learning_rate": 0.001958, + "loss": 6.2702, + "step": 293 + }, + { + "epoch": 0.033376970316208475, + "grad_norm": 0.54296875, + "learning_rate": 0.001964, + "loss": 6.2684, + "step": 294 + }, + { + "epoch": 0.03349049742612756, + "grad_norm": 0.59765625, + "learning_rate": 0.00197, + "loss": 6.2785, + "step": 295 + }, + { + "epoch": 0.033604024536046634, + "grad_norm": 0.6328125, + "learning_rate": 0.001976, + "loss": 6.2718, + "step": 296 + }, + { + "epoch": 0.03371755164596571, + "grad_norm": 0.71875, + "learning_rate": 0.001982, + "loss": 6.2632, + "step": 297 + }, + { + "epoch": 0.033831078755884786, + "grad_norm": 0.6875, + "learning_rate": 0.001988, + "loss": 6.253, + "step": 298 + }, + { + "epoch": 0.03394460586580386, + "grad_norm": 0.60546875, + "learning_rate": 0.001994, + "loss": 6.261, + "step": 299 + }, + { + "epoch": 0.03405813297572294, + "grad_norm": 0.61328125, + "learning_rate": 0.002, + "loss": 6.2471, + "step": 300 + }, + { + "epoch": 0.034171660085642014, + "grad_norm": 0.62890625, + "learning_rate": 0.002, + "loss": 6.2568, + "step": 301 + }, + { + "epoch": 0.03428518719556109, + "grad_norm": 0.6328125, + "learning_rate": 0.002, + "loss": 6.2433, + "step": 302 + }, + { + "epoch": 0.034398714305480166, + "grad_norm": 0.6171875, + "learning_rate": 0.002, + "loss": 6.2558, + "step": 303 + }, + { + "epoch": 0.03451224141539924, + "grad_norm": 0.75390625, + "learning_rate": 0.002, + "loss": 6.2437, + "step": 304 + }, + { + "epoch": 0.03462576852531832, + "grad_norm": 0.734375, + "learning_rate": 0.002, + "loss": 6.2434, + "step": 305 + }, + { + "epoch": 0.034739295635237394, + "grad_norm": 0.78125, + "learning_rate": 0.002, + "loss": 6.2332, + "step": 306 + }, + { + "epoch": 0.03485282274515647, + "grad_norm": 0.75, + "learning_rate": 0.002, + "loss": 6.2421, + "step": 307 + }, + { + "epoch": 0.034966349855075546, + "grad_norm": 0.6796875, + "learning_rate": 0.002, + "loss": 6.2396, + "step": 308 + }, + { + "epoch": 0.03507987696499462, + "grad_norm": 0.6875, + "learning_rate": 0.002, + "loss": 6.2284, + "step": 309 + }, + { + "epoch": 0.035193404074913705, + "grad_norm": 0.7421875, + "learning_rate": 0.002, + "loss": 6.2347, + "step": 310 + }, + { + "epoch": 0.03530693118483278, + "grad_norm": 0.78125, + "learning_rate": 0.002, + "loss": 6.2424, + "step": 311 + }, + { + "epoch": 0.03542045829475186, + "grad_norm": 0.71484375, + "learning_rate": 0.002, + "loss": 6.2347, + "step": 312 + }, + { + "epoch": 0.03553398540467093, + "grad_norm": 0.640625, + "learning_rate": 0.002, + "loss": 6.2269, + "step": 313 + }, + { + "epoch": 0.03564751251459001, + "grad_norm": 0.58984375, + "learning_rate": 0.002, + "loss": 6.2191, + "step": 314 + }, + { + "epoch": 0.035761039624509085, + "grad_norm": 0.6328125, + "learning_rate": 0.002, + "loss": 6.2313, + "step": 315 + }, + { + "epoch": 0.03587456673442816, + "grad_norm": 0.625, + "learning_rate": 0.002, + "loss": 6.2311, + "step": 316 + }, + { + "epoch": 0.03598809384434724, + "grad_norm": 0.6484375, + "learning_rate": 0.002, + "loss": 6.2184, + "step": 317 + }, + { + "epoch": 0.03610162095426631, + "grad_norm": 0.58984375, + "learning_rate": 0.002, + "loss": 6.1863, + "step": 318 + }, + { + "epoch": 0.03621514806418539, + "grad_norm": 0.60546875, + "learning_rate": 0.002, + "loss": 6.2005, + "step": 319 + }, + { + "epoch": 0.036328675174104465, + "grad_norm": 0.7109375, + "learning_rate": 0.002, + "loss": 6.2043, + "step": 320 + }, + { + "epoch": 0.03644220228402354, + "grad_norm": 0.76953125, + "learning_rate": 0.002, + "loss": 6.2207, + "step": 321 + }, + { + "epoch": 0.03655572939394262, + "grad_norm": 0.76953125, + "learning_rate": 0.002, + "loss": 6.2231, + "step": 322 + }, + { + "epoch": 0.03666925650386169, + "grad_norm": 0.6953125, + "learning_rate": 0.002, + "loss": 6.1961, + "step": 323 + }, + { + "epoch": 0.03678278361378077, + "grad_norm": 0.703125, + "learning_rate": 0.002, + "loss": 6.1961, + "step": 324 + }, + { + "epoch": 0.03689631072369985, + "grad_norm": 0.80078125, + "learning_rate": 0.002, + "loss": 6.2078, + "step": 325 + }, + { + "epoch": 0.03700983783361893, + "grad_norm": 0.77734375, + "learning_rate": 0.002, + "loss": 6.2047, + "step": 326 + }, + { + "epoch": 0.037123364943538004, + "grad_norm": 0.64453125, + "learning_rate": 0.002, + "loss": 6.1689, + "step": 327 + }, + { + "epoch": 0.03723689205345708, + "grad_norm": 0.60546875, + "learning_rate": 0.002, + "loss": 6.1883, + "step": 328 + }, + { + "epoch": 0.037350419163376156, + "grad_norm": 0.5703125, + "learning_rate": 0.002, + "loss": 6.1699, + "step": 329 + }, + { + "epoch": 0.03746394627329523, + "grad_norm": 0.640625, + "learning_rate": 0.002, + "loss": 6.1786, + "step": 330 + }, + { + "epoch": 0.03757747338321431, + "grad_norm": 0.69921875, + "learning_rate": 0.002, + "loss": 6.1778, + "step": 331 + }, + { + "epoch": 0.037691000493133384, + "grad_norm": 0.64453125, + "learning_rate": 0.002, + "loss": 6.1811, + "step": 332 + }, + { + "epoch": 0.03780452760305246, + "grad_norm": 0.59375, + "learning_rate": 0.002, + "loss": 6.1769, + "step": 333 + }, + { + "epoch": 0.037918054712971536, + "grad_norm": 0.59765625, + "learning_rate": 0.002, + "loss": 6.1642, + "step": 334 + }, + { + "epoch": 0.03803158182289061, + "grad_norm": 0.6171875, + "learning_rate": 0.002, + "loss": 6.1512, + "step": 335 + }, + { + "epoch": 0.03814510893280969, + "grad_norm": 0.61328125, + "learning_rate": 0.002, + "loss": 6.1648, + "step": 336 + }, + { + "epoch": 0.038258636042728764, + "grad_norm": 0.72265625, + "learning_rate": 0.002, + "loss": 6.1735, + "step": 337 + }, + { + "epoch": 0.03837216315264784, + "grad_norm": 0.73828125, + "learning_rate": 0.002, + "loss": 6.1485, + "step": 338 + }, + { + "epoch": 0.038485690262566916, + "grad_norm": 0.671875, + "learning_rate": 0.002, + "loss": 6.1556, + "step": 339 + }, + { + "epoch": 0.038599217372486, + "grad_norm": 0.68359375, + "learning_rate": 0.002, + "loss": 6.1484, + "step": 340 + }, + { + "epoch": 0.038712744482405075, + "grad_norm": 0.62890625, + "learning_rate": 0.002, + "loss": 6.1459, + "step": 341 + }, + { + "epoch": 0.03882627159232415, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 6.1456, + "step": 342 + }, + { + "epoch": 0.03893979870224323, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 6.1399, + "step": 343 + }, + { + "epoch": 0.0390533258121623, + "grad_norm": 0.57421875, + "learning_rate": 0.002, + "loss": 6.1475, + "step": 344 + }, + { + "epoch": 0.03916685292208138, + "grad_norm": 0.5703125, + "learning_rate": 0.002, + "loss": 6.1336, + "step": 345 + }, + { + "epoch": 0.039280380032000455, + "grad_norm": 0.56640625, + "learning_rate": 0.002, + "loss": 6.133, + "step": 346 + }, + { + "epoch": 0.03939390714191953, + "grad_norm": 0.5703125, + "learning_rate": 0.002, + "loss": 6.1262, + "step": 347 + }, + { + "epoch": 0.039507434251838607, + "grad_norm": 0.578125, + "learning_rate": 0.002, + "loss": 6.1377, + "step": 348 + }, + { + "epoch": 0.03962096136175768, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 6.1175, + "step": 349 + }, + { + "epoch": 0.03973448847167676, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 6.1234, + "step": 350 + }, + { + "epoch": 0.039848015581595835, + "grad_norm": 0.625, + "learning_rate": 0.002, + "loss": 6.1384, + "step": 351 + }, + { + "epoch": 0.03996154269151491, + "grad_norm": 0.62109375, + "learning_rate": 0.002, + "loss": 6.1445, + "step": 352 + }, + { + "epoch": 0.040075069801433987, + "grad_norm": 0.59375, + "learning_rate": 0.002, + "loss": 6.1153, + "step": 353 + }, + { + "epoch": 0.04018859691135306, + "grad_norm": 0.60546875, + "learning_rate": 0.002, + "loss": 6.1028, + "step": 354 + }, + { + "epoch": 0.040302124021272145, + "grad_norm": 0.62109375, + "learning_rate": 0.002, + "loss": 6.1172, + "step": 355 + }, + { + "epoch": 0.04041565113119122, + "grad_norm": 0.5859375, + "learning_rate": 0.002, + "loss": 6.1185, + "step": 356 + }, + { + "epoch": 0.0405291782411103, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 6.087, + "step": 357 + }, + { + "epoch": 0.04064270535102937, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 6.0829, + "step": 358 + }, + { + "epoch": 0.04075623246094845, + "grad_norm": 0.55859375, + "learning_rate": 0.002, + "loss": 6.0981, + "step": 359 + }, + { + "epoch": 0.040869759570867525, + "grad_norm": 0.59765625, + "learning_rate": 0.002, + "loss": 6.1014, + "step": 360 + }, + { + "epoch": 0.0409832866807866, + "grad_norm": 0.58984375, + "learning_rate": 0.002, + "loss": 6.0917, + "step": 361 + }, + { + "epoch": 0.04109681379070568, + "grad_norm": 0.703125, + "learning_rate": 0.002, + "loss": 6.1174, + "step": 362 + }, + { + "epoch": 0.04121034090062475, + "grad_norm": 0.6328125, + "learning_rate": 0.002, + "loss": 6.0854, + "step": 363 + }, + { + "epoch": 0.04132386801054383, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 6.081, + "step": 364 + }, + { + "epoch": 0.041437395120462905, + "grad_norm": 0.51953125, + "learning_rate": 0.002, + "loss": 6.1024, + "step": 365 + }, + { + "epoch": 0.04155092223038198, + "grad_norm": 0.6640625, + "learning_rate": 0.002, + "loss": 6.0949, + "step": 366 + }, + { + "epoch": 0.04166444934030106, + "grad_norm": 0.734375, + "learning_rate": 0.002, + "loss": 6.079, + "step": 367 + }, + { + "epoch": 0.04177797645022013, + "grad_norm": 0.62109375, + "learning_rate": 0.002, + "loss": 6.0801, + "step": 368 + }, + { + "epoch": 0.04189150356013921, + "grad_norm": 0.625, + "learning_rate": 0.002, + "loss": 6.0927, + "step": 369 + }, + { + "epoch": 0.04200503067005829, + "grad_norm": 0.61328125, + "learning_rate": 0.002, + "loss": 6.0886, + "step": 370 + }, + { + "epoch": 0.04211855777997737, + "grad_norm": 0.640625, + "learning_rate": 0.002, + "loss": 6.0786, + "step": 371 + }, + { + "epoch": 0.042232084889896444, + "grad_norm": 0.64453125, + "learning_rate": 0.002, + "loss": 6.0789, + "step": 372 + }, + { + "epoch": 0.04234561199981552, + "grad_norm": 0.6328125, + "learning_rate": 0.002, + "loss": 6.0908, + "step": 373 + }, + { + "epoch": 0.042459139109734596, + "grad_norm": 0.61328125, + "learning_rate": 0.002, + "loss": 6.0615, + "step": 374 + }, + { + "epoch": 0.04257266621965367, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 6.0617, + "step": 375 + }, + { + "epoch": 0.04268619332957275, + "grad_norm": 0.5703125, + "learning_rate": 0.002, + "loss": 6.0826, + "step": 376 + }, + { + "epoch": 0.042799720439491824, + "grad_norm": 0.61328125, + "learning_rate": 0.002, + "loss": 6.0556, + "step": 377 + }, + { + "epoch": 0.0429132475494109, + "grad_norm": 0.58203125, + "learning_rate": 0.002, + "loss": 6.0778, + "step": 378 + }, + { + "epoch": 0.043026774659329976, + "grad_norm": 0.60546875, + "learning_rate": 0.002, + "loss": 6.0702, + "step": 379 + }, + { + "epoch": 0.04314030176924905, + "grad_norm": 0.5625, + "learning_rate": 0.002, + "loss": 6.062, + "step": 380 + }, + { + "epoch": 0.04325382887916813, + "grad_norm": 0.58984375, + "learning_rate": 0.002, + "loss": 6.0649, + "step": 381 + }, + { + "epoch": 0.043367355989087204, + "grad_norm": 0.6171875, + "learning_rate": 0.002, + "loss": 6.0691, + "step": 382 + }, + { + "epoch": 0.04348088309900628, + "grad_norm": 0.58203125, + "learning_rate": 0.002, + "loss": 6.0609, + "step": 383 + }, + { + "epoch": 0.043594410208925356, + "grad_norm": 0.546875, + "learning_rate": 0.002, + "loss": 6.0438, + "step": 384 + }, + { + "epoch": 0.04370793731884444, + "grad_norm": 0.546875, + "learning_rate": 0.002, + "loss": 6.059, + "step": 385 + }, + { + "epoch": 0.043821464428763515, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 6.0472, + "step": 386 + }, + { + "epoch": 0.04393499153868259, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 6.0531, + "step": 387 + }, + { + "epoch": 0.04404851864860167, + "grad_norm": 0.51171875, + "learning_rate": 0.002, + "loss": 6.0568, + "step": 388 + }, + { + "epoch": 0.04416204575852074, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 6.047, + "step": 389 + }, + { + "epoch": 0.04427557286843982, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 6.0349, + "step": 390 + }, + { + "epoch": 0.044389099978358895, + "grad_norm": 0.5625, + "learning_rate": 0.002, + "loss": 6.032, + "step": 391 + }, + { + "epoch": 0.04450262708827797, + "grad_norm": 0.58203125, + "learning_rate": 0.002, + "loss": 6.034, + "step": 392 + }, + { + "epoch": 0.04461615419819705, + "grad_norm": 0.609375, + "learning_rate": 0.002, + "loss": 6.0341, + "step": 393 + }, + { + "epoch": 0.04472968130811612, + "grad_norm": 0.6328125, + "learning_rate": 0.002, + "loss": 6.0278, + "step": 394 + }, + { + "epoch": 0.0448432084180352, + "grad_norm": 0.546875, + "learning_rate": 0.002, + "loss": 6.0432, + "step": 395 + }, + { + "epoch": 0.044956735527954275, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 6.0241, + "step": 396 + }, + { + "epoch": 0.04507026263787335, + "grad_norm": 0.5625, + "learning_rate": 0.002, + "loss": 6.0446, + "step": 397 + }, + { + "epoch": 0.04518378974779243, + "grad_norm": 0.57421875, + "learning_rate": 0.002, + "loss": 6.0416, + "step": 398 + }, + { + "epoch": 0.0452973168577115, + "grad_norm": 0.671875, + "learning_rate": 0.002, + "loss": 6.0243, + "step": 399 + }, + { + "epoch": 0.045410843967630586, + "grad_norm": 0.65625, + "learning_rate": 0.002, + "loss": 6.0252, + "step": 400 + }, + { + "epoch": 0.04552437107754966, + "grad_norm": 0.5546875, + "learning_rate": 0.002, + "loss": 6.0338, + "step": 401 + }, + { + "epoch": 0.04563789818746874, + "grad_norm": 0.546875, + "learning_rate": 0.002, + "loss": 6.023, + "step": 402 + }, + { + "epoch": 0.045751425297387814, + "grad_norm": 0.51171875, + "learning_rate": 0.002, + "loss": 6.0366, + "step": 403 + }, + { + "epoch": 0.04586495240730689, + "grad_norm": 0.49609375, + "learning_rate": 0.002, + "loss": 6.048, + "step": 404 + }, + { + "epoch": 0.045978479517225966, + "grad_norm": 0.51171875, + "learning_rate": 0.002, + "loss": 6.0079, + "step": 405 + }, + { + "epoch": 0.04609200662714504, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 5.9989, + "step": 406 + }, + { + "epoch": 0.04620553373706412, + "grad_norm": 0.5625, + "learning_rate": 0.002, + "loss": 6.0113, + "step": 407 + }, + { + "epoch": 0.046319060846983194, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 6.0031, + "step": 408 + }, + { + "epoch": 0.04643258795690227, + "grad_norm": 0.515625, + "learning_rate": 0.002, + "loss": 6.0011, + "step": 409 + }, + { + "epoch": 0.046546115066821346, + "grad_norm": 0.55859375, + "learning_rate": 0.002, + "loss": 6.0037, + "step": 410 + }, + { + "epoch": 0.04665964217674042, + "grad_norm": 0.70703125, + "learning_rate": 0.002, + "loss": 6.0108, + "step": 411 + }, + { + "epoch": 0.0467731692866595, + "grad_norm": 0.72265625, + "learning_rate": 0.002, + "loss": 6.0067, + "step": 412 + }, + { + "epoch": 0.046886696396578574, + "grad_norm": 0.66796875, + "learning_rate": 0.002, + "loss": 6.0059, + "step": 413 + }, + { + "epoch": 0.04700022350649765, + "grad_norm": 0.6484375, + "learning_rate": 0.002, + "loss": 6.0045, + "step": 414 + }, + { + "epoch": 0.04711375061641673, + "grad_norm": 0.59375, + "learning_rate": 0.002, + "loss": 6.0096, + "step": 415 + }, + { + "epoch": 0.04722727772633581, + "grad_norm": 0.546875, + "learning_rate": 0.002, + "loss": 5.9978, + "step": 416 + }, + { + "epoch": 0.047340804836254885, + "grad_norm": 0.474609375, + "learning_rate": 0.002, + "loss": 5.9964, + "step": 417 + }, + { + "epoch": 0.04745433194617396, + "grad_norm": 0.45703125, + "learning_rate": 0.002, + "loss": 5.985, + "step": 418 + }, + { + "epoch": 0.04756785905609304, + "grad_norm": 0.470703125, + "learning_rate": 0.002, + "loss": 5.9858, + "step": 419 + }, + { + "epoch": 0.04768138616601211, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 5.9872, + "step": 420 + }, + { + "epoch": 0.04779491327593119, + "grad_norm": 0.57421875, + "learning_rate": 0.002, + "loss": 5.9986, + "step": 421 + }, + { + "epoch": 0.047908440385850265, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 5.9791, + "step": 422 + }, + { + "epoch": 0.04802196749576934, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 5.9766, + "step": 423 + }, + { + "epoch": 0.04813549460568842, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 5.9913, + "step": 424 + }, + { + "epoch": 0.04824902171560749, + "grad_norm": 0.4921875, + "learning_rate": 0.002, + "loss": 5.9781, + "step": 425 + }, + { + "epoch": 0.04836254882552657, + "grad_norm": 0.484375, + "learning_rate": 0.002, + "loss": 5.9782, + "step": 426 + }, + { + "epoch": 0.048476075935445645, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.9753, + "step": 427 + }, + { + "epoch": 0.04858960304536472, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.968, + "step": 428 + }, + { + "epoch": 0.0487031301552838, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.973, + "step": 429 + }, + { + "epoch": 0.04881665726520288, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 5.9632, + "step": 430 + }, + { + "epoch": 0.048930184375121956, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 5.969, + "step": 431 + }, + { + "epoch": 0.04904371148504103, + "grad_norm": 0.60546875, + "learning_rate": 0.002, + "loss": 5.9568, + "step": 432 + }, + { + "epoch": 0.04915723859496011, + "grad_norm": 0.6328125, + "learning_rate": 0.002, + "loss": 5.9873, + "step": 433 + }, + { + "epoch": 0.049270765704879184, + "grad_norm": 0.6328125, + "learning_rate": 0.002, + "loss": 5.9671, + "step": 434 + }, + { + "epoch": 0.04938429281479826, + "grad_norm": 0.57421875, + "learning_rate": 0.002, + "loss": 5.9707, + "step": 435 + }, + { + "epoch": 0.049497819924717336, + "grad_norm": 0.59375, + "learning_rate": 0.002, + "loss": 5.9794, + "step": 436 + }, + { + "epoch": 0.04961134703463641, + "grad_norm": 0.58984375, + "learning_rate": 0.002, + "loss": 5.9613, + "step": 437 + }, + { + "epoch": 0.04972487414455549, + "grad_norm": 0.6328125, + "learning_rate": 0.002, + "loss": 5.9623, + "step": 438 + }, + { + "epoch": 0.049838401254474564, + "grad_norm": 0.63671875, + "learning_rate": 0.002, + "loss": 5.9661, + "step": 439 + }, + { + "epoch": 0.04995192836439364, + "grad_norm": 0.57421875, + "learning_rate": 0.002, + "loss": 5.971, + "step": 440 + }, + { + "epoch": 0.050065455474312716, + "grad_norm": 0.59375, + "learning_rate": 0.002, + "loss": 5.9666, + "step": 441 + }, + { + "epoch": 0.05017898258423179, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 5.9563, + "step": 442 + }, + { + "epoch": 0.05029250969415087, + "grad_norm": 0.5546875, + "learning_rate": 0.002, + "loss": 5.9467, + "step": 443 + }, + { + "epoch": 0.050406036804069944, + "grad_norm": 0.57421875, + "learning_rate": 0.002, + "loss": 5.961, + "step": 444 + }, + { + "epoch": 0.05051956391398903, + "grad_norm": 0.578125, + "learning_rate": 0.002, + "loss": 5.9395, + "step": 445 + }, + { + "epoch": 0.0506330910239081, + "grad_norm": 0.55078125, + "learning_rate": 0.002, + "loss": 5.9592, + "step": 446 + }, + { + "epoch": 0.05074661813382718, + "grad_norm": 0.625, + "learning_rate": 0.002, + "loss": 5.9602, + "step": 447 + }, + { + "epoch": 0.050860145243746255, + "grad_norm": 0.60546875, + "learning_rate": 0.002, + "loss": 5.9538, + "step": 448 + }, + { + "epoch": 0.05097367235366533, + "grad_norm": 0.56640625, + "learning_rate": 0.002, + "loss": 5.9509, + "step": 449 + }, + { + "epoch": 0.05108719946358441, + "grad_norm": 0.58203125, + "learning_rate": 0.002, + "loss": 5.9407, + "step": 450 + }, + { + "epoch": 0.05120072657350348, + "grad_norm": 0.57421875, + "learning_rate": 0.002, + "loss": 5.9565, + "step": 451 + }, + { + "epoch": 0.05131425368342256, + "grad_norm": 0.5390625, + "learning_rate": 0.002, + "loss": 5.9379, + "step": 452 + }, + { + "epoch": 0.051427780793341635, + "grad_norm": 0.51953125, + "learning_rate": 0.002, + "loss": 5.9503, + "step": 453 + }, + { + "epoch": 0.05154130790326071, + "grad_norm": 0.58203125, + "learning_rate": 0.002, + "loss": 5.9299, + "step": 454 + }, + { + "epoch": 0.05165483501317979, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 5.9241, + "step": 455 + }, + { + "epoch": 0.05176836212309886, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 5.9515, + "step": 456 + }, + { + "epoch": 0.05188188923301794, + "grad_norm": 0.5390625, + "learning_rate": 0.002, + "loss": 5.9157, + "step": 457 + }, + { + "epoch": 0.051995416342937015, + "grad_norm": 0.5390625, + "learning_rate": 0.002, + "loss": 5.9354, + "step": 458 + }, + { + "epoch": 0.05210894345285609, + "grad_norm": 0.515625, + "learning_rate": 0.002, + "loss": 5.9348, + "step": 459 + }, + { + "epoch": 0.05222247056277517, + "grad_norm": 0.474609375, + "learning_rate": 0.002, + "loss": 5.9315, + "step": 460 + }, + { + "epoch": 0.05233599767269425, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.9449, + "step": 461 + }, + { + "epoch": 0.052449524782613326, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.9425, + "step": 462 + }, + { + "epoch": 0.0525630518925324, + "grad_norm": 0.486328125, + "learning_rate": 0.002, + "loss": 5.9275, + "step": 463 + }, + { + "epoch": 0.05267657900245148, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.9382, + "step": 464 + }, + { + "epoch": 0.052790106112370554, + "grad_norm": 0.48828125, + "learning_rate": 0.002, + "loss": 5.9147, + "step": 465 + }, + { + "epoch": 0.05290363322228963, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.9249, + "step": 466 + }, + { + "epoch": 0.053017160332208706, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 5.9074, + "step": 467 + }, + { + "epoch": 0.05313068744212778, + "grad_norm": 0.61328125, + "learning_rate": 0.002, + "loss": 5.9267, + "step": 468 + }, + { + "epoch": 0.05324421455204686, + "grad_norm": 0.625, + "learning_rate": 0.002, + "loss": 5.9167, + "step": 469 + }, + { + "epoch": 0.053357741661965934, + "grad_norm": 0.56640625, + "learning_rate": 0.002, + "loss": 5.9287, + "step": 470 + }, + { + "epoch": 0.05347126877188501, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 5.9367, + "step": 471 + }, + { + "epoch": 0.053584795881804086, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.9199, + "step": 472 + }, + { + "epoch": 0.05369832299172316, + "grad_norm": 0.462890625, + "learning_rate": 0.002, + "loss": 5.9235, + "step": 473 + }, + { + "epoch": 0.05381185010164224, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.8995, + "step": 474 + }, + { + "epoch": 0.053925377211561314, + "grad_norm": 0.490234375, + "learning_rate": 0.002, + "loss": 5.9214, + "step": 475 + }, + { + "epoch": 0.0540389043214804, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.9146, + "step": 476 + }, + { + "epoch": 0.05415243143139947, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.9127, + "step": 477 + }, + { + "epoch": 0.05426595854131855, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.9092, + "step": 478 + }, + { + "epoch": 0.054379485651237625, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.914, + "step": 479 + }, + { + "epoch": 0.0544930127611567, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.9071, + "step": 480 + }, + { + "epoch": 0.05460653987107578, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.9036, + "step": 481 + }, + { + "epoch": 0.05472006698099485, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.912, + "step": 482 + }, + { + "epoch": 0.05483359409091393, + "grad_norm": 0.455078125, + "learning_rate": 0.002, + "loss": 5.9081, + "step": 483 + }, + { + "epoch": 0.054947121200833005, + "grad_norm": 0.486328125, + "learning_rate": 0.002, + "loss": 5.8961, + "step": 484 + }, + { + "epoch": 0.05506064831075208, + "grad_norm": 0.4921875, + "learning_rate": 0.002, + "loss": 5.8945, + "step": 485 + }, + { + "epoch": 0.05517417542067116, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.896, + "step": 486 + }, + { + "epoch": 0.05528770253059023, + "grad_norm": 0.466796875, + "learning_rate": 0.002, + "loss": 5.8671, + "step": 487 + }, + { + "epoch": 0.05540122964050931, + "grad_norm": 0.435546875, + "learning_rate": 0.002, + "loss": 5.9104, + "step": 488 + }, + { + "epoch": 0.055514756750428385, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.908, + "step": 489 + }, + { + "epoch": 0.05562828386034746, + "grad_norm": 0.470703125, + "learning_rate": 0.002, + "loss": 5.8806, + "step": 490 + }, + { + "epoch": 0.05574181097026654, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.9009, + "step": 491 + }, + { + "epoch": 0.05585533808018562, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.8843, + "step": 492 + }, + { + "epoch": 0.055968865190104695, + "grad_norm": 0.51171875, + "learning_rate": 0.002, + "loss": 5.8909, + "step": 493 + }, + { + "epoch": 0.05608239230002377, + "grad_norm": 0.59375, + "learning_rate": 0.002, + "loss": 5.8813, + "step": 494 + }, + { + "epoch": 0.05619591940994285, + "grad_norm": 0.640625, + "learning_rate": 0.002, + "loss": 5.9248, + "step": 495 + }, + { + "epoch": 0.05630944651986192, + "grad_norm": 0.65234375, + "learning_rate": 0.002, + "loss": 5.8991, + "step": 496 + }, + { + "epoch": 0.056422973629781, + "grad_norm": 0.59375, + "learning_rate": 0.002, + "loss": 5.8897, + "step": 497 + }, + { + "epoch": 0.056536500739700075, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 5.9034, + "step": 498 + }, + { + "epoch": 0.05665002784961915, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 5.8889, + "step": 499 + }, + { + "epoch": 0.05676355495953823, + "grad_norm": 0.4921875, + "learning_rate": 0.002, + "loss": 5.891, + "step": 500 + }, + { + "epoch": 0.0568770820694573, + "grad_norm": 0.4921875, + "learning_rate": 0.002, + "loss": 5.8765, + "step": 501 + }, + { + "epoch": 0.05699060917937638, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 5.8805, + "step": 502 + }, + { + "epoch": 0.057104136289295455, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 5.8843, + "step": 503 + }, + { + "epoch": 0.05721766339921453, + "grad_norm": 0.58984375, + "learning_rate": 0.002, + "loss": 5.8774, + "step": 504 + }, + { + "epoch": 0.05733119050913361, + "grad_norm": 0.58984375, + "learning_rate": 0.002, + "loss": 5.8876, + "step": 505 + }, + { + "epoch": 0.05744471761905269, + "grad_norm": 0.55859375, + "learning_rate": 0.002, + "loss": 5.8917, + "step": 506 + }, + { + "epoch": 0.057558244728971766, + "grad_norm": 0.55078125, + "learning_rate": 0.002, + "loss": 5.8807, + "step": 507 + }, + { + "epoch": 0.05767177183889084, + "grad_norm": 0.478515625, + "learning_rate": 0.002, + "loss": 5.8686, + "step": 508 + }, + { + "epoch": 0.05778529894880992, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.8777, + "step": 509 + }, + { + "epoch": 0.057898826058728994, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.878, + "step": 510 + }, + { + "epoch": 0.05801235316864807, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.8672, + "step": 511 + }, + { + "epoch": 0.058125880278567146, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.8738, + "step": 512 + }, + { + "epoch": 0.05823940738848622, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.8602, + "step": 513 + }, + { + "epoch": 0.0583529344984053, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.858, + "step": 514 + }, + { + "epoch": 0.058466461608324374, + "grad_norm": 0.55859375, + "learning_rate": 0.002, + "loss": 5.866, + "step": 515 + }, + { + "epoch": 0.05857998871824345, + "grad_norm": 0.73828125, + "learning_rate": 0.002, + "loss": 5.8628, + "step": 516 + }, + { + "epoch": 0.058693515828162526, + "grad_norm": 0.6328125, + "learning_rate": 0.002, + "loss": 5.8504, + "step": 517 + }, + { + "epoch": 0.0588070429380816, + "grad_norm": 0.498046875, + "learning_rate": 0.002, + "loss": 5.8885, + "step": 518 + }, + { + "epoch": 0.05892057004800068, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.8404, + "step": 519 + }, + { + "epoch": 0.059034097157919754, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.8517, + "step": 520 + }, + { + "epoch": 0.05914762426783884, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.8683, + "step": 521 + }, + { + "epoch": 0.05926115137775791, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.8823, + "step": 522 + }, + { + "epoch": 0.05937467848767699, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.8534, + "step": 523 + }, + { + "epoch": 0.059488205597596065, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.861, + "step": 524 + }, + { + "epoch": 0.05960173270751514, + "grad_norm": 0.462890625, + "learning_rate": 0.002, + "loss": 5.8512, + "step": 525 + }, + { + "epoch": 0.05971525981743422, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.8561, + "step": 526 + }, + { + "epoch": 0.05982878692735329, + "grad_norm": 0.466796875, + "learning_rate": 0.002, + "loss": 5.8532, + "step": 527 + }, + { + "epoch": 0.05994231403727237, + "grad_norm": 0.455078125, + "learning_rate": 0.002, + "loss": 5.854, + "step": 528 + }, + { + "epoch": 0.060055841147191445, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.8502, + "step": 529 + }, + { + "epoch": 0.06016936825711052, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.8568, + "step": 530 + }, + { + "epoch": 0.0602828953670296, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.8416, + "step": 531 + }, + { + "epoch": 0.06039642247694867, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.8455, + "step": 532 + }, + { + "epoch": 0.06050994958686775, + "grad_norm": 0.51171875, + "learning_rate": 0.002, + "loss": 5.8462, + "step": 533 + }, + { + "epoch": 0.060623476696786825, + "grad_norm": 0.66015625, + "learning_rate": 0.002, + "loss": 5.8577, + "step": 534 + }, + { + "epoch": 0.0607370038067059, + "grad_norm": 0.66796875, + "learning_rate": 0.002, + "loss": 5.8566, + "step": 535 + }, + { + "epoch": 0.060850530916624984, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.8322, + "step": 536 + }, + { + "epoch": 0.06096405802654406, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 5.8367, + "step": 537 + }, + { + "epoch": 0.061077585136463136, + "grad_norm": 0.474609375, + "learning_rate": 0.002, + "loss": 5.8477, + "step": 538 + }, + { + "epoch": 0.06119111224638221, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.8544, + "step": 539 + }, + { + "epoch": 0.06130463935630129, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 5.8441, + "step": 540 + }, + { + "epoch": 0.061418166466220364, + "grad_norm": 0.5390625, + "learning_rate": 0.002, + "loss": 5.8381, + "step": 541 + }, + { + "epoch": 0.06153169357613944, + "grad_norm": 0.61328125, + "learning_rate": 0.002, + "loss": 5.834, + "step": 542 + }, + { + "epoch": 0.061645220686058516, + "grad_norm": 0.62109375, + "learning_rate": 0.002, + "loss": 5.8289, + "step": 543 + }, + { + "epoch": 0.06175874779597759, + "grad_norm": 0.66015625, + "learning_rate": 0.002, + "loss": 5.8362, + "step": 544 + }, + { + "epoch": 0.06187227490589667, + "grad_norm": 0.64453125, + "learning_rate": 0.002, + "loss": 5.8187, + "step": 545 + }, + { + "epoch": 0.061985802015815744, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 5.8164, + "step": 546 + }, + { + "epoch": 0.06209932912573482, + "grad_norm": 0.57421875, + "learning_rate": 0.002, + "loss": 5.8342, + "step": 547 + }, + { + "epoch": 0.062212856235653896, + "grad_norm": 0.5546875, + "learning_rate": 0.002, + "loss": 5.8349, + "step": 548 + }, + { + "epoch": 0.06232638334557297, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 5.8105, + "step": 549 + }, + { + "epoch": 0.06243991045549205, + "grad_norm": 0.578125, + "learning_rate": 0.002, + "loss": 5.8258, + "step": 550 + }, + { + "epoch": 0.06255343756541112, + "grad_norm": 0.5625, + "learning_rate": 0.002, + "loss": 5.8211, + "step": 551 + }, + { + "epoch": 0.0626669646753302, + "grad_norm": 0.51171875, + "learning_rate": 0.002, + "loss": 5.8138, + "step": 552 + }, + { + "epoch": 0.06278049178524928, + "grad_norm": 0.48828125, + "learning_rate": 0.002, + "loss": 5.8251, + "step": 553 + }, + { + "epoch": 0.06289401889516835, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.8248, + "step": 554 + }, + { + "epoch": 0.06300754600508743, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.8315, + "step": 555 + }, + { + "epoch": 0.0631210731150065, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.8225, + "step": 556 + }, + { + "epoch": 0.06323460022492558, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.8259, + "step": 557 + }, + { + "epoch": 0.06334812733484466, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.8122, + "step": 558 + }, + { + "epoch": 0.06346165444476375, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.8048, + "step": 559 + }, + { + "epoch": 0.06357518155468282, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.8143, + "step": 560 + }, + { + "epoch": 0.0636887086646019, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.82, + "step": 561 + }, + { + "epoch": 0.06380223577452097, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.8199, + "step": 562 + }, + { + "epoch": 0.06391576288444005, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 5.8225, + "step": 563 + }, + { + "epoch": 0.06402928999435913, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.8071, + "step": 564 + }, + { + "epoch": 0.0641428171042782, + "grad_norm": 0.546875, + "learning_rate": 0.002, + "loss": 5.7962, + "step": 565 + }, + { + "epoch": 0.06425634421419728, + "grad_norm": 0.56640625, + "learning_rate": 0.002, + "loss": 5.8026, + "step": 566 + }, + { + "epoch": 0.06436987132411635, + "grad_norm": 0.5546875, + "learning_rate": 0.002, + "loss": 5.7825, + "step": 567 + }, + { + "epoch": 0.06448339843403543, + "grad_norm": 0.59765625, + "learning_rate": 0.002, + "loss": 5.8015, + "step": 568 + }, + { + "epoch": 0.0645969255439545, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 5.8206, + "step": 569 + }, + { + "epoch": 0.06471045265387358, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 5.8249, + "step": 570 + }, + { + "epoch": 0.06482397976379266, + "grad_norm": 0.49609375, + "learning_rate": 0.002, + "loss": 5.7873, + "step": 571 + }, + { + "epoch": 0.06493750687371173, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.8174, + "step": 572 + }, + { + "epoch": 0.06505103398363081, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.8136, + "step": 573 + }, + { + "epoch": 0.06516456109354989, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.8094, + "step": 574 + }, + { + "epoch": 0.06527808820346896, + "grad_norm": 0.474609375, + "learning_rate": 0.002, + "loss": 5.7936, + "step": 575 + }, + { + "epoch": 0.06539161531338804, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.8102, + "step": 576 + }, + { + "epoch": 0.06550514242330711, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.8202, + "step": 577 + }, + { + "epoch": 0.06561866953322619, + "grad_norm": 0.478515625, + "learning_rate": 0.002, + "loss": 5.7878, + "step": 578 + }, + { + "epoch": 0.06573219664314527, + "grad_norm": 0.494140625, + "learning_rate": 0.002, + "loss": 5.7993, + "step": 579 + }, + { + "epoch": 0.06584572375306434, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 5.7928, + "step": 580 + }, + { + "epoch": 0.06595925086298342, + "grad_norm": 0.55078125, + "learning_rate": 0.002, + "loss": 5.8023, + "step": 581 + }, + { + "epoch": 0.0660727779729025, + "grad_norm": 0.57421875, + "learning_rate": 0.002, + "loss": 5.7986, + "step": 582 + }, + { + "epoch": 0.06618630508282157, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 5.7899, + "step": 583 + }, + { + "epoch": 0.06629983219274065, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.7876, + "step": 584 + }, + { + "epoch": 0.06641335930265972, + "grad_norm": 0.51953125, + "learning_rate": 0.002, + "loss": 5.7819, + "step": 585 + }, + { + "epoch": 0.0665268864125788, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.79, + "step": 586 + }, + { + "epoch": 0.06664041352249787, + "grad_norm": 0.484375, + "learning_rate": 0.002, + "loss": 5.8001, + "step": 587 + }, + { + "epoch": 0.06675394063241695, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.7964, + "step": 588 + }, + { + "epoch": 0.06686746774233604, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.7779, + "step": 589 + }, + { + "epoch": 0.06698099485225512, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.7976, + "step": 590 + }, + { + "epoch": 0.06709452196217419, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.8005, + "step": 591 + }, + { + "epoch": 0.06720804907209327, + "grad_norm": 0.470703125, + "learning_rate": 0.002, + "loss": 5.7917, + "step": 592 + }, + { + "epoch": 0.06732157618201234, + "grad_norm": 0.490234375, + "learning_rate": 0.002, + "loss": 5.8071, + "step": 593 + }, + { + "epoch": 0.06743510329193142, + "grad_norm": 0.498046875, + "learning_rate": 0.002, + "loss": 5.7799, + "step": 594 + }, + { + "epoch": 0.0675486304018505, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.8027, + "step": 595 + }, + { + "epoch": 0.06766215751176957, + "grad_norm": 0.462890625, + "learning_rate": 0.002, + "loss": 5.7886, + "step": 596 + }, + { + "epoch": 0.06777568462168865, + "grad_norm": 0.484375, + "learning_rate": 0.002, + "loss": 5.7979, + "step": 597 + }, + { + "epoch": 0.06788921173160772, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.7848, + "step": 598 + }, + { + "epoch": 0.0680027388415268, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 5.7738, + "step": 599 + }, + { + "epoch": 0.06811626595144588, + "grad_norm": 0.49609375, + "learning_rate": 0.002, + "loss": 5.793, + "step": 600 + }, + { + "epoch": 0.06822979306136495, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.769, + "step": 601 + }, + { + "epoch": 0.06834332017128403, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.78, + "step": 602 + }, + { + "epoch": 0.0684568472812031, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.7849, + "step": 603 + }, + { + "epoch": 0.06857037439112218, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.7999, + "step": 604 + }, + { + "epoch": 0.06868390150104126, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.7781, + "step": 605 + }, + { + "epoch": 0.06879742861096033, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.7623, + "step": 606 + }, + { + "epoch": 0.06891095572087941, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.7853, + "step": 607 + }, + { + "epoch": 0.06902448283079848, + "grad_norm": 0.55078125, + "learning_rate": 0.002, + "loss": 5.7886, + "step": 608 + }, + { + "epoch": 0.06913800994071756, + "grad_norm": 0.55078125, + "learning_rate": 0.002, + "loss": 5.7733, + "step": 609 + }, + { + "epoch": 0.06925153705063664, + "grad_norm": 0.58203125, + "learning_rate": 0.002, + "loss": 5.7728, + "step": 610 + }, + { + "epoch": 0.06936506416055571, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 5.7883, + "step": 611 + }, + { + "epoch": 0.06947859127047479, + "grad_norm": 0.49609375, + "learning_rate": 0.002, + "loss": 5.7865, + "step": 612 + }, + { + "epoch": 0.06959211838039386, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.7726, + "step": 613 + }, + { + "epoch": 0.06970564549031294, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.7789, + "step": 614 + }, + { + "epoch": 0.06981917260023202, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.7612, + "step": 615 + }, + { + "epoch": 0.06993269971015109, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.7471, + "step": 616 + }, + { + "epoch": 0.07004622682007017, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 5.766, + "step": 617 + }, + { + "epoch": 0.07015975392998924, + "grad_norm": 0.62890625, + "learning_rate": 0.002, + "loss": 5.765, + "step": 618 + }, + { + "epoch": 0.07027328103990833, + "grad_norm": 0.62109375, + "learning_rate": 0.002, + "loss": 5.7685, + "step": 619 + }, + { + "epoch": 0.07038680814982741, + "grad_norm": 0.546875, + "learning_rate": 0.002, + "loss": 5.7545, + "step": 620 + }, + { + "epoch": 0.07050033525974649, + "grad_norm": 0.55078125, + "learning_rate": 0.002, + "loss": 5.7547, + "step": 621 + }, + { + "epoch": 0.07061386236966556, + "grad_norm": 0.470703125, + "learning_rate": 0.002, + "loss": 5.7689, + "step": 622 + }, + { + "epoch": 0.07072738947958464, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.7749, + "step": 623 + }, + { + "epoch": 0.07084091658950371, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.7507, + "step": 624 + }, + { + "epoch": 0.07095444369942279, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.7766, + "step": 625 + }, + { + "epoch": 0.07106797080934187, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.7641, + "step": 626 + }, + { + "epoch": 0.07118149791926094, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.7773, + "step": 627 + }, + { + "epoch": 0.07129502502918002, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.7531, + "step": 628 + }, + { + "epoch": 0.0714085521390991, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.7536, + "step": 629 + }, + { + "epoch": 0.07152207924901817, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.7559, + "step": 630 + }, + { + "epoch": 0.07163560635893725, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.779, + "step": 631 + }, + { + "epoch": 0.07174913346885632, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.755, + "step": 632 + }, + { + "epoch": 0.0718626605787754, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.7559, + "step": 633 + }, + { + "epoch": 0.07197618768869447, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.7532, + "step": 634 + }, + { + "epoch": 0.07208971479861355, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.748, + "step": 635 + }, + { + "epoch": 0.07220324190853263, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.7481, + "step": 636 + }, + { + "epoch": 0.0723167690184517, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.7658, + "step": 637 + }, + { + "epoch": 0.07243029612837078, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.7519, + "step": 638 + }, + { + "epoch": 0.07254382323828985, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 5.7332, + "step": 639 + }, + { + "epoch": 0.07265735034820893, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 5.748, + "step": 640 + }, + { + "epoch": 0.072770877458128, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.7558, + "step": 641 + }, + { + "epoch": 0.07288440456804708, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.7501, + "step": 642 + }, + { + "epoch": 0.07299793167796616, + "grad_norm": 0.48046875, + "learning_rate": 0.002, + "loss": 5.7486, + "step": 643 + }, + { + "epoch": 0.07311145878788523, + "grad_norm": 0.51171875, + "learning_rate": 0.002, + "loss": 5.75, + "step": 644 + }, + { + "epoch": 0.07322498589780431, + "grad_norm": 0.51953125, + "learning_rate": 0.002, + "loss": 5.7473, + "step": 645 + }, + { + "epoch": 0.07333851300772339, + "grad_norm": 0.5625, + "learning_rate": 0.002, + "loss": 5.7436, + "step": 646 + }, + { + "epoch": 0.07345204011764246, + "grad_norm": 0.51953125, + "learning_rate": 0.002, + "loss": 5.7425, + "step": 647 + }, + { + "epoch": 0.07356556722756154, + "grad_norm": 0.48046875, + "learning_rate": 0.002, + "loss": 5.7354, + "step": 648 + }, + { + "epoch": 0.07367909433748063, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.7463, + "step": 649 + }, + { + "epoch": 0.0737926214473997, + "grad_norm": 0.51171875, + "learning_rate": 0.002, + "loss": 5.7663, + "step": 650 + }, + { + "epoch": 0.07390614855731878, + "grad_norm": 0.5390625, + "learning_rate": 0.002, + "loss": 5.7306, + "step": 651 + }, + { + "epoch": 0.07401967566723786, + "grad_norm": 0.48046875, + "learning_rate": 0.002, + "loss": 5.7533, + "step": 652 + }, + { + "epoch": 0.07413320277715693, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.7253, + "step": 653 + }, + { + "epoch": 0.07424672988707601, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.7278, + "step": 654 + }, + { + "epoch": 0.07436025699699508, + "grad_norm": 0.494140625, + "learning_rate": 0.002, + "loss": 5.738, + "step": 655 + }, + { + "epoch": 0.07447378410691416, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.7384, + "step": 656 + }, + { + "epoch": 0.07458731121683324, + "grad_norm": 0.451171875, + "learning_rate": 0.002, + "loss": 5.7493, + "step": 657 + }, + { + "epoch": 0.07470083832675231, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.7397, + "step": 658 + }, + { + "epoch": 0.07481436543667139, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.7394, + "step": 659 + }, + { + "epoch": 0.07492789254659046, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.7142, + "step": 660 + }, + { + "epoch": 0.07504141965650954, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.7464, + "step": 661 + }, + { + "epoch": 0.07515494676642862, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.744, + "step": 662 + }, + { + "epoch": 0.07526847387634769, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.7447, + "step": 663 + }, + { + "epoch": 0.07538200098626677, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.7243, + "step": 664 + }, + { + "epoch": 0.07549552809618584, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.7268, + "step": 665 + }, + { + "epoch": 0.07560905520610492, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.7228, + "step": 666 + }, + { + "epoch": 0.075722582316024, + "grad_norm": 0.462890625, + "learning_rate": 0.002, + "loss": 5.76, + "step": 667 + }, + { + "epoch": 0.07583610942594307, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.7159, + "step": 668 + }, + { + "epoch": 0.07594963653586215, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.7309, + "step": 669 + }, + { + "epoch": 0.07606316364578122, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.7336, + "step": 670 + }, + { + "epoch": 0.0761766907557003, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.7507, + "step": 671 + }, + { + "epoch": 0.07629021786561938, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.7326, + "step": 672 + }, + { + "epoch": 0.07640374497553845, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.728, + "step": 673 + }, + { + "epoch": 0.07651727208545753, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.7249, + "step": 674 + }, + { + "epoch": 0.0766307991953766, + "grad_norm": 0.44140625, + "learning_rate": 0.002, + "loss": 5.7227, + "step": 675 + }, + { + "epoch": 0.07674432630529568, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.708, + "step": 676 + }, + { + "epoch": 0.07685785341521476, + "grad_norm": 0.45703125, + "learning_rate": 0.002, + "loss": 5.7337, + "step": 677 + }, + { + "epoch": 0.07697138052513383, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.7041, + "step": 678 + }, + { + "epoch": 0.07708490763505292, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.7493, + "step": 679 + }, + { + "epoch": 0.077198434744972, + "grad_norm": 0.478515625, + "learning_rate": 0.002, + "loss": 5.7239, + "step": 680 + }, + { + "epoch": 0.07731196185489107, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.7318, + "step": 681 + }, + { + "epoch": 0.07742548896481015, + "grad_norm": 0.466796875, + "learning_rate": 0.002, + "loss": 5.7337, + "step": 682 + }, + { + "epoch": 0.07753901607472923, + "grad_norm": 0.48828125, + "learning_rate": 0.002, + "loss": 5.7239, + "step": 683 + }, + { + "epoch": 0.0776525431846483, + "grad_norm": 0.48046875, + "learning_rate": 0.002, + "loss": 5.7422, + "step": 684 + }, + { + "epoch": 0.07776607029456738, + "grad_norm": 0.490234375, + "learning_rate": 0.002, + "loss": 5.7314, + "step": 685 + }, + { + "epoch": 0.07787959740448645, + "grad_norm": 0.51953125, + "learning_rate": 0.002, + "loss": 5.7197, + "step": 686 + }, + { + "epoch": 0.07799312451440553, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 5.7187, + "step": 687 + }, + { + "epoch": 0.0781066516243246, + "grad_norm": 0.515625, + "learning_rate": 0.002, + "loss": 5.7262, + "step": 688 + }, + { + "epoch": 0.07822017873424368, + "grad_norm": 0.4921875, + "learning_rate": 0.002, + "loss": 5.7167, + "step": 689 + }, + { + "epoch": 0.07833370584416276, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.7378, + "step": 690 + }, + { + "epoch": 0.07844723295408183, + "grad_norm": 0.478515625, + "learning_rate": 0.002, + "loss": 5.7229, + "step": 691 + }, + { + "epoch": 0.07856076006400091, + "grad_norm": 0.5859375, + "learning_rate": 0.002, + "loss": 5.7405, + "step": 692 + }, + { + "epoch": 0.07867428717391999, + "grad_norm": 0.59765625, + "learning_rate": 0.002, + "loss": 5.7001, + "step": 693 + }, + { + "epoch": 0.07878781428383906, + "grad_norm": 0.5390625, + "learning_rate": 0.002, + "loss": 5.6996, + "step": 694 + }, + { + "epoch": 0.07890134139375814, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.7106, + "step": 695 + }, + { + "epoch": 0.07901486850367721, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.7326, + "step": 696 + }, + { + "epoch": 0.07912839561359629, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.7346, + "step": 697 + }, + { + "epoch": 0.07924192272351537, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.7037, + "step": 698 + }, + { + "epoch": 0.07935544983343444, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.7097, + "step": 699 + }, + { + "epoch": 0.07946897694335352, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.7179, + "step": 700 + }, + { + "epoch": 0.07958250405327259, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.7063, + "step": 701 + }, + { + "epoch": 0.07969603116319167, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.7066, + "step": 702 + }, + { + "epoch": 0.07980955827311075, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.6934, + "step": 703 + }, + { + "epoch": 0.07992308538302982, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.7136, + "step": 704 + }, + { + "epoch": 0.0800366124929489, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.7259, + "step": 705 + }, + { + "epoch": 0.08015013960286797, + "grad_norm": 0.466796875, + "learning_rate": 0.002, + "loss": 5.7111, + "step": 706 + }, + { + "epoch": 0.08026366671278705, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.7146, + "step": 707 + }, + { + "epoch": 0.08037719382270613, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 5.7274, + "step": 708 + }, + { + "epoch": 0.08049072093262521, + "grad_norm": 0.51953125, + "learning_rate": 0.002, + "loss": 5.7083, + "step": 709 + }, + { + "epoch": 0.08060424804254429, + "grad_norm": 0.51953125, + "learning_rate": 0.002, + "loss": 5.7025, + "step": 710 + }, + { + "epoch": 0.08071777515246337, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.7096, + "step": 711 + }, + { + "epoch": 0.08083130226238244, + "grad_norm": 0.48828125, + "learning_rate": 0.002, + "loss": 5.7122, + "step": 712 + }, + { + "epoch": 0.08094482937230152, + "grad_norm": 0.474609375, + "learning_rate": 0.002, + "loss": 5.7053, + "step": 713 + }, + { + "epoch": 0.0810583564822206, + "grad_norm": 0.455078125, + "learning_rate": 0.002, + "loss": 5.7054, + "step": 714 + }, + { + "epoch": 0.08117188359213967, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.7145, + "step": 715 + }, + { + "epoch": 0.08128541070205875, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.6956, + "step": 716 + }, + { + "epoch": 0.08139893781197782, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.714, + "step": 717 + }, + { + "epoch": 0.0815124649218969, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.6946, + "step": 718 + }, + { + "epoch": 0.08162599203181597, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.7053, + "step": 719 + }, + { + "epoch": 0.08173951914173505, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.7101, + "step": 720 + }, + { + "epoch": 0.08185304625165413, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.7074, + "step": 721 + }, + { + "epoch": 0.0819665733615732, + "grad_norm": 0.546875, + "learning_rate": 0.002, + "loss": 5.7082, + "step": 722 + }, + { + "epoch": 0.08208010047149228, + "grad_norm": 0.49609375, + "learning_rate": 0.002, + "loss": 5.6899, + "step": 723 + }, + { + "epoch": 0.08219362758141135, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.7093, + "step": 724 + }, + { + "epoch": 0.08230715469133043, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.7094, + "step": 725 + }, + { + "epoch": 0.0824206818012495, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.7134, + "step": 726 + }, + { + "epoch": 0.08253420891116858, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.7131, + "step": 727 + }, + { + "epoch": 0.08264773602108766, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.7036, + "step": 728 + }, + { + "epoch": 0.08276126313100673, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.712, + "step": 729 + }, + { + "epoch": 0.08287479024092581, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.6783, + "step": 730 + }, + { + "epoch": 0.08298831735084489, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.6737, + "step": 731 + }, + { + "epoch": 0.08310184446076396, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.6974, + "step": 732 + }, + { + "epoch": 0.08321537157068304, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.7014, + "step": 733 + }, + { + "epoch": 0.08332889868060211, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.7028, + "step": 734 + }, + { + "epoch": 0.08344242579052119, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.6799, + "step": 735 + }, + { + "epoch": 0.08355595290044027, + "grad_norm": 0.45703125, + "learning_rate": 0.002, + "loss": 5.6847, + "step": 736 + }, + { + "epoch": 0.08366948001035934, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.6855, + "step": 737 + }, + { + "epoch": 0.08378300712027842, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.6816, + "step": 738 + }, + { + "epoch": 0.0838965342301975, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.6893, + "step": 739 + }, + { + "epoch": 0.08401006134011658, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.6585, + "step": 740 + }, + { + "epoch": 0.08412358845003566, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.6878, + "step": 741 + }, + { + "epoch": 0.08423711555995474, + "grad_norm": 0.48046875, + "learning_rate": 0.002, + "loss": 5.7034, + "step": 742 + }, + { + "epoch": 0.08435064266987381, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.6927, + "step": 743 + }, + { + "epoch": 0.08446416977979289, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.6771, + "step": 744 + }, + { + "epoch": 0.08457769688971196, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.7051, + "step": 745 + }, + { + "epoch": 0.08469122399963104, + "grad_norm": 0.451171875, + "learning_rate": 0.002, + "loss": 5.6919, + "step": 746 + }, + { + "epoch": 0.08480475110955012, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.6853, + "step": 747 + }, + { + "epoch": 0.08491827821946919, + "grad_norm": 0.462890625, + "learning_rate": 0.002, + "loss": 5.6983, + "step": 748 + }, + { + "epoch": 0.08503180532938827, + "grad_norm": 0.515625, + "learning_rate": 0.002, + "loss": 5.6894, + "step": 749 + }, + { + "epoch": 0.08514533243930734, + "grad_norm": 0.58203125, + "learning_rate": 0.002, + "loss": 5.6809, + "step": 750 + }, + { + "epoch": 0.08525885954922642, + "grad_norm": 0.5546875, + "learning_rate": 0.002, + "loss": 5.6604, + "step": 751 + }, + { + "epoch": 0.0853723866591455, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.6816, + "step": 752 + }, + { + "epoch": 0.08548591376906457, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.672, + "step": 753 + }, + { + "epoch": 0.08559944087898365, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.6886, + "step": 754 + }, + { + "epoch": 0.08571296798890272, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.6917, + "step": 755 + }, + { + "epoch": 0.0858264950988218, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.6703, + "step": 756 + }, + { + "epoch": 0.08594002220874088, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.6828, + "step": 757 + }, + { + "epoch": 0.08605354931865995, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.6771, + "step": 758 + }, + { + "epoch": 0.08616707642857903, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.6718, + "step": 759 + }, + { + "epoch": 0.0862806035384981, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.6722, + "step": 760 + }, + { + "epoch": 0.08639413064841718, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.6887, + "step": 761 + }, + { + "epoch": 0.08650765775833626, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.6836, + "step": 762 + }, + { + "epoch": 0.08662118486825533, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.678, + "step": 763 + }, + { + "epoch": 0.08673471197817441, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.6689, + "step": 764 + }, + { + "epoch": 0.08684823908809348, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.6632, + "step": 765 + }, + { + "epoch": 0.08696176619801256, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.6632, + "step": 766 + }, + { + "epoch": 0.08707529330793164, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.678, + "step": 767 + }, + { + "epoch": 0.08718882041785071, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.6778, + "step": 768 + }, + { + "epoch": 0.08730234752776979, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.6674, + "step": 769 + }, + { + "epoch": 0.08741587463768888, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.6487, + "step": 770 + }, + { + "epoch": 0.08752940174760795, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.6965, + "step": 771 + }, + { + "epoch": 0.08764292885752703, + "grad_norm": 0.640625, + "learning_rate": 0.002, + "loss": 5.6662, + "step": 772 + }, + { + "epoch": 0.0877564559674461, + "grad_norm": 0.625, + "learning_rate": 0.002, + "loss": 5.6936, + "step": 773 + }, + { + "epoch": 0.08786998307736518, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.6424, + "step": 774 + }, + { + "epoch": 0.08798351018728426, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.6758, + "step": 775 + }, + { + "epoch": 0.08809703729720333, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.6707, + "step": 776 + }, + { + "epoch": 0.08821056440712241, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.6686, + "step": 777 + }, + { + "epoch": 0.08832409151704149, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.6656, + "step": 778 + }, + { + "epoch": 0.08843761862696056, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.6793, + "step": 779 + }, + { + "epoch": 0.08855114573687964, + "grad_norm": 0.5859375, + "learning_rate": 0.002, + "loss": 5.6657, + "step": 780 + }, + { + "epoch": 0.08866467284679871, + "grad_norm": 0.62109375, + "learning_rate": 0.002, + "loss": 5.6561, + "step": 781 + }, + { + "epoch": 0.08877819995671779, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.6558, + "step": 782 + }, + { + "epoch": 0.08889172706663687, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.6518, + "step": 783 + }, + { + "epoch": 0.08900525417655594, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.6836, + "step": 784 + }, + { + "epoch": 0.08911878128647502, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.6575, + "step": 785 + }, + { + "epoch": 0.0892323083963941, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.6695, + "step": 786 + }, + { + "epoch": 0.08934583550631317, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.6617, + "step": 787 + }, + { + "epoch": 0.08945936261623225, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.6731, + "step": 788 + }, + { + "epoch": 0.08957288972615132, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.6636, + "step": 789 + }, + { + "epoch": 0.0896864168360704, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 5.6592, + "step": 790 + }, + { + "epoch": 0.08979994394598947, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 5.6409, + "step": 791 + }, + { + "epoch": 0.08991347105590855, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.6797, + "step": 792 + }, + { + "epoch": 0.09002699816582763, + "grad_norm": 0.484375, + "learning_rate": 0.002, + "loss": 5.6531, + "step": 793 + }, + { + "epoch": 0.0901405252757467, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.6628, + "step": 794 + }, + { + "epoch": 0.09025405238566578, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.6652, + "step": 795 + }, + { + "epoch": 0.09036757949558485, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.6619, + "step": 796 + }, + { + "epoch": 0.09048110660550393, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.6807, + "step": 797 + }, + { + "epoch": 0.090594633715423, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.6865, + "step": 798 + }, + { + "epoch": 0.09070816082534208, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.6616, + "step": 799 + }, + { + "epoch": 0.09082168793526117, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.6624, + "step": 800 + }, + { + "epoch": 0.09093521504518025, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.6658, + "step": 801 + }, + { + "epoch": 0.09104874215509932, + "grad_norm": 0.4921875, + "learning_rate": 0.002, + "loss": 5.6552, + "step": 802 + }, + { + "epoch": 0.0911622692650184, + "grad_norm": 0.44140625, + "learning_rate": 0.002, + "loss": 5.6669, + "step": 803 + }, + { + "epoch": 0.09127579637493748, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.675, + "step": 804 + }, + { + "epoch": 0.09138932348485655, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.6735, + "step": 805 + }, + { + "epoch": 0.09150285059477563, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.6651, + "step": 806 + }, + { + "epoch": 0.0916163777046947, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.6541, + "step": 807 + }, + { + "epoch": 0.09172990481461378, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.6562, + "step": 808 + }, + { + "epoch": 0.09184343192453286, + "grad_norm": 0.45703125, + "learning_rate": 0.002, + "loss": 5.6588, + "step": 809 + }, + { + "epoch": 0.09195695903445193, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.6664, + "step": 810 + }, + { + "epoch": 0.09207048614437101, + "grad_norm": 0.490234375, + "learning_rate": 0.002, + "loss": 5.65, + "step": 811 + }, + { + "epoch": 0.09218401325429008, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.6573, + "step": 812 + }, + { + "epoch": 0.09229754036420916, + "grad_norm": 0.435546875, + "learning_rate": 0.002, + "loss": 5.6691, + "step": 813 + }, + { + "epoch": 0.09241106747412824, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.6454, + "step": 814 + }, + { + "epoch": 0.09252459458404731, + "grad_norm": 0.466796875, + "learning_rate": 0.002, + "loss": 5.6413, + "step": 815 + }, + { + "epoch": 0.09263812169396639, + "grad_norm": 0.515625, + "learning_rate": 0.002, + "loss": 5.6545, + "step": 816 + }, + { + "epoch": 0.09275164880388546, + "grad_norm": 0.498046875, + "learning_rate": 0.002, + "loss": 5.6425, + "step": 817 + }, + { + "epoch": 0.09286517591380454, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.6416, + "step": 818 + }, + { + "epoch": 0.09297870302372362, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.6595, + "step": 819 + }, + { + "epoch": 0.09309223013364269, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.6368, + "step": 820 + }, + { + "epoch": 0.09320575724356177, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.6714, + "step": 821 + }, + { + "epoch": 0.09331928435348084, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.6696, + "step": 822 + }, + { + "epoch": 0.09343281146339992, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.6734, + "step": 823 + }, + { + "epoch": 0.093546338573319, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.673, + "step": 824 + }, + { + "epoch": 0.09365986568323807, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.645, + "step": 825 + }, + { + "epoch": 0.09377339279315715, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.6506, + "step": 826 + }, + { + "epoch": 0.09388691990307622, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.6317, + "step": 827 + }, + { + "epoch": 0.0940004470129953, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.6394, + "step": 828 + }, + { + "epoch": 0.09411397412291438, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.6609, + "step": 829 + }, + { + "epoch": 0.09422750123283347, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.6418, + "step": 830 + }, + { + "epoch": 0.09434102834275254, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.6387, + "step": 831 + }, + { + "epoch": 0.09445455545267162, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.6421, + "step": 832 + }, + { + "epoch": 0.0945680825625907, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.656, + "step": 833 + }, + { + "epoch": 0.09468160967250977, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.6557, + "step": 834 + }, + { + "epoch": 0.09479513678242885, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.6472, + "step": 835 + }, + { + "epoch": 0.09490866389234792, + "grad_norm": 0.55078125, + "learning_rate": 0.002, + "loss": 5.6651, + "step": 836 + }, + { + "epoch": 0.095022191002267, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 5.6386, + "step": 837 + }, + { + "epoch": 0.09513571811218607, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.6418, + "step": 838 + }, + { + "epoch": 0.09524924522210515, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.6447, + "step": 839 + }, + { + "epoch": 0.09536277233202423, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.6292, + "step": 840 + }, + { + "epoch": 0.0954762994419433, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.6334, + "step": 841 + }, + { + "epoch": 0.09558982655186238, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 5.6577, + "step": 842 + }, + { + "epoch": 0.09570335366178145, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 5.654, + "step": 843 + }, + { + "epoch": 0.09581688077170053, + "grad_norm": 0.494140625, + "learning_rate": 0.002, + "loss": 5.624, + "step": 844 + }, + { + "epoch": 0.0959304078816196, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.6521, + "step": 845 + }, + { + "epoch": 0.09604393499153868, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.6229, + "step": 846 + }, + { + "epoch": 0.09615746210145776, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.6424, + "step": 847 + }, + { + "epoch": 0.09627098921137683, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.6545, + "step": 848 + }, + { + "epoch": 0.09638451632129591, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.6503, + "step": 849 + }, + { + "epoch": 0.09649804343121499, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.629, + "step": 850 + }, + { + "epoch": 0.09661157054113406, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.6165, + "step": 851 + }, + { + "epoch": 0.09672509765105314, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.6342, + "step": 852 + }, + { + "epoch": 0.09683862476097221, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.6343, + "step": 853 + }, + { + "epoch": 0.09695215187089129, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.6143, + "step": 854 + }, + { + "epoch": 0.09706567898081037, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.6072, + "step": 855 + }, + { + "epoch": 0.09717920609072944, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.6483, + "step": 856 + }, + { + "epoch": 0.09729273320064852, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.6203, + "step": 857 + }, + { + "epoch": 0.0974062603105676, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.6298, + "step": 858 + }, + { + "epoch": 0.09751978742048667, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.6195, + "step": 859 + }, + { + "epoch": 0.09763331453040576, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.6411, + "step": 860 + }, + { + "epoch": 0.09774684164032484, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.6273, + "step": 861 + }, + { + "epoch": 0.09786036875024391, + "grad_norm": 0.6015625, + "learning_rate": 0.002, + "loss": 5.6302, + "step": 862 + }, + { + "epoch": 0.09797389586016299, + "grad_norm": 0.59765625, + "learning_rate": 0.002, + "loss": 5.6143, + "step": 863 + }, + { + "epoch": 0.09808742297008206, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.6288, + "step": 864 + }, + { + "epoch": 0.09820095008000114, + "grad_norm": 0.51171875, + "learning_rate": 0.002, + "loss": 5.6417, + "step": 865 + }, + { + "epoch": 0.09831447718992022, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.6172, + "step": 866 + }, + { + "epoch": 0.09842800429983929, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.6289, + "step": 867 + }, + { + "epoch": 0.09854153140975837, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.6493, + "step": 868 + }, + { + "epoch": 0.09865505851967744, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.6282, + "step": 869 + }, + { + "epoch": 0.09876858562959652, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.6209, + "step": 870 + }, + { + "epoch": 0.0988821127395156, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.6253, + "step": 871 + }, + { + "epoch": 0.09899563984943467, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.6341, + "step": 872 + }, + { + "epoch": 0.09910916695935375, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.6358, + "step": 873 + }, + { + "epoch": 0.09922269406927282, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.6132, + "step": 874 + }, + { + "epoch": 0.0993362211791919, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.6295, + "step": 875 + }, + { + "epoch": 0.09944974828911098, + "grad_norm": 0.56640625, + "learning_rate": 0.002, + "loss": 5.6159, + "step": 876 + }, + { + "epoch": 0.09956327539903005, + "grad_norm": 0.625, + "learning_rate": 0.002, + "loss": 5.6369, + "step": 877 + }, + { + "epoch": 0.09967680250894913, + "grad_norm": 0.6015625, + "learning_rate": 0.002, + "loss": 5.6308, + "step": 878 + }, + { + "epoch": 0.0997903296188682, + "grad_norm": 0.55859375, + "learning_rate": 0.002, + "loss": 5.6488, + "step": 879 + }, + { + "epoch": 0.09990385672878728, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.6272, + "step": 880 + }, + { + "epoch": 0.10001738383870636, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.6217, + "step": 881 + }, + { + "epoch": 0.10013091094862543, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.6349, + "step": 882 + }, + { + "epoch": 0.10024443805854451, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.6253, + "step": 883 + }, + { + "epoch": 0.10035796516846358, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.6104, + "step": 884 + }, + { + "epoch": 0.10047149227838266, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.6363, + "step": 885 + }, + { + "epoch": 0.10058501938830174, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.6375, + "step": 886 + }, + { + "epoch": 0.10069854649822081, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.6283, + "step": 887 + }, + { + "epoch": 0.10081207360813989, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.6314, + "step": 888 + }, + { + "epoch": 0.10092560071805896, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.6346, + "step": 889 + }, + { + "epoch": 0.10103912782797805, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.6193, + "step": 890 + }, + { + "epoch": 0.10115265493789713, + "grad_norm": 0.625, + "learning_rate": 0.002, + "loss": 5.6268, + "step": 891 + }, + { + "epoch": 0.1012661820478162, + "grad_norm": 0.5703125, + "learning_rate": 0.002, + "loss": 5.6322, + "step": 892 + }, + { + "epoch": 0.10137970915773528, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.6032, + "step": 893 + }, + { + "epoch": 0.10149323626765436, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.6263, + "step": 894 + }, + { + "epoch": 0.10160676337757343, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.6228, + "step": 895 + }, + { + "epoch": 0.10172029048749251, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.6125, + "step": 896 + }, + { + "epoch": 0.10183381759741159, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.6185, + "step": 897 + }, + { + "epoch": 0.10194734470733066, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.6267, + "step": 898 + }, + { + "epoch": 0.10206087181724974, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.621, + "step": 899 + }, + { + "epoch": 0.10217439892716881, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.614, + "step": 900 + }, + { + "epoch": 0.10228792603708789, + "grad_norm": 0.53515625, + "learning_rate": 0.002, + "loss": 5.6343, + "step": 901 + }, + { + "epoch": 0.10240145314700697, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 5.6302, + "step": 902 + }, + { + "epoch": 0.10251498025692604, + "grad_norm": 0.455078125, + "learning_rate": 0.002, + "loss": 5.6186, + "step": 903 + }, + { + "epoch": 0.10262850736684512, + "grad_norm": 0.435546875, + "learning_rate": 0.002, + "loss": 5.6047, + "step": 904 + }, + { + "epoch": 0.1027420344767642, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.6217, + "step": 905 + }, + { + "epoch": 0.10285556158668327, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.6354, + "step": 906 + }, + { + "epoch": 0.10296908869660235, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.6317, + "step": 907 + }, + { + "epoch": 0.10308261580652142, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.5977, + "step": 908 + }, + { + "epoch": 0.1031961429164405, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.6094, + "step": 909 + }, + { + "epoch": 0.10330967002635957, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.604, + "step": 910 + }, + { + "epoch": 0.10342319713627865, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.6258, + "step": 911 + }, + { + "epoch": 0.10353672424619773, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.6154, + "step": 912 + }, + { + "epoch": 0.1036502513561168, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.6283, + "step": 913 + }, + { + "epoch": 0.10376377846603588, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.632, + "step": 914 + }, + { + "epoch": 0.10387730557595495, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.6283, + "step": 915 + }, + { + "epoch": 0.10399083268587403, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.6206, + "step": 916 + }, + { + "epoch": 0.1041043597957931, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.6153, + "step": 917 + }, + { + "epoch": 0.10421788690571218, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.5826, + "step": 918 + }, + { + "epoch": 0.10433141401563126, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.6171, + "step": 919 + }, + { + "epoch": 0.10444494112555033, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.6129, + "step": 920 + }, + { + "epoch": 0.10455846823546942, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.6148, + "step": 921 + }, + { + "epoch": 0.1046719953453885, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.6074, + "step": 922 + }, + { + "epoch": 0.10478552245530758, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.5983, + "step": 923 + }, + { + "epoch": 0.10489904956522665, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.6257, + "step": 924 + }, + { + "epoch": 0.10501257667514573, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.5954, + "step": 925 + }, + { + "epoch": 0.1051261037850648, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.6087, + "step": 926 + }, + { + "epoch": 0.10523963089498388, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.618, + "step": 927 + }, + { + "epoch": 0.10535315800490296, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.6049, + "step": 928 + }, + { + "epoch": 0.10546668511482203, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.5961, + "step": 929 + }, + { + "epoch": 0.10558021222474111, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.5927, + "step": 930 + }, + { + "epoch": 0.10569373933466018, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.6119, + "step": 931 + }, + { + "epoch": 0.10580726644457926, + "grad_norm": 0.51953125, + "learning_rate": 0.002, + "loss": 5.5959, + "step": 932 + }, + { + "epoch": 0.10592079355449834, + "grad_norm": 0.455078125, + "learning_rate": 0.002, + "loss": 5.6095, + "step": 933 + }, + { + "epoch": 0.10603432066441741, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.6101, + "step": 934 + }, + { + "epoch": 0.10614784777433649, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.5955, + "step": 935 + }, + { + "epoch": 0.10626137488425556, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.6151, + "step": 936 + }, + { + "epoch": 0.10637490199417464, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.6209, + "step": 937 + }, + { + "epoch": 0.10648842910409372, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.591, + "step": 938 + }, + { + "epoch": 0.10660195621401279, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.6023, + "step": 939 + }, + { + "epoch": 0.10671548332393187, + "grad_norm": 0.546875, + "learning_rate": 0.002, + "loss": 5.6064, + "step": 940 + }, + { + "epoch": 0.10682901043385094, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.6035, + "step": 941 + }, + { + "epoch": 0.10694253754377002, + "grad_norm": 0.435546875, + "learning_rate": 0.002, + "loss": 5.5801, + "step": 942 + }, + { + "epoch": 0.1070560646536891, + "grad_norm": 0.451171875, + "learning_rate": 0.002, + "loss": 5.6222, + "step": 943 + }, + { + "epoch": 0.10716959176360817, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.6049, + "step": 944 + }, + { + "epoch": 0.10728311887352725, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.6068, + "step": 945 + }, + { + "epoch": 0.10739664598344632, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.6129, + "step": 946 + }, + { + "epoch": 0.1075101730933654, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.6057, + "step": 947 + }, + { + "epoch": 0.10762370020328448, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.5867, + "step": 948 + }, + { + "epoch": 0.10773722731320355, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.6051, + "step": 949 + }, + { + "epoch": 0.10785075442312263, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.5915, + "step": 950 + }, + { + "epoch": 0.10796428153304172, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.5859, + "step": 951 + }, + { + "epoch": 0.1080778086429608, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.5956, + "step": 952 + }, + { + "epoch": 0.10819133575287987, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.6042, + "step": 953 + }, + { + "epoch": 0.10830486286279895, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.6142, + "step": 954 + }, + { + "epoch": 0.10841838997271802, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.5827, + "step": 955 + }, + { + "epoch": 0.1085319170826371, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.5824, + "step": 956 + }, + { + "epoch": 0.10864544419255617, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.6083, + "step": 957 + }, + { + "epoch": 0.10875897130247525, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.5804, + "step": 958 + }, + { + "epoch": 0.10887249841239433, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.6095, + "step": 959 + }, + { + "epoch": 0.1089860255223134, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.5909, + "step": 960 + }, + { + "epoch": 0.10909955263223248, + "grad_norm": 0.498046875, + "learning_rate": 0.002, + "loss": 5.5835, + "step": 961 + }, + { + "epoch": 0.10921307974215155, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.6202, + "step": 962 + }, + { + "epoch": 0.10932660685207063, + "grad_norm": 0.498046875, + "learning_rate": 0.002, + "loss": 5.5972, + "step": 963 + }, + { + "epoch": 0.1094401339619897, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.5943, + "step": 964 + }, + { + "epoch": 0.10955366107190878, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.5897, + "step": 965 + }, + { + "epoch": 0.10966718818182786, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.5808, + "step": 966 + }, + { + "epoch": 0.10978071529174693, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.6067, + "step": 967 + }, + { + "epoch": 0.10989424240166601, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.6222, + "step": 968 + }, + { + "epoch": 0.11000776951158509, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.5942, + "step": 969 + }, + { + "epoch": 0.11012129662150416, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.6148, + "step": 970 + }, + { + "epoch": 0.11023482373142324, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.5933, + "step": 971 + }, + { + "epoch": 0.11034835084134231, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.6097, + "step": 972 + }, + { + "epoch": 0.11046187795126139, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.6176, + "step": 973 + }, + { + "epoch": 0.11057540506118047, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.575, + "step": 974 + }, + { + "epoch": 0.11068893217109954, + "grad_norm": 0.66015625, + "learning_rate": 0.002, + "loss": 5.6118, + "step": 975 + }, + { + "epoch": 0.11080245928101862, + "grad_norm": 0.6171875, + "learning_rate": 0.002, + "loss": 5.5853, + "step": 976 + }, + { + "epoch": 0.1109159863909377, + "grad_norm": 0.45703125, + "learning_rate": 0.002, + "loss": 5.5862, + "step": 977 + }, + { + "epoch": 0.11102951350085677, + "grad_norm": 0.4921875, + "learning_rate": 0.002, + "loss": 5.5831, + "step": 978 + }, + { + "epoch": 0.11114304061077585, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.5997, + "step": 979 + }, + { + "epoch": 0.11125656772069492, + "grad_norm": 0.470703125, + "learning_rate": 0.002, + "loss": 5.5824, + "step": 980 + }, + { + "epoch": 0.11137009483061401, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.6081, + "step": 981 + }, + { + "epoch": 0.11148362194053309, + "grad_norm": 0.44140625, + "learning_rate": 0.002, + "loss": 5.5927, + "step": 982 + }, + { + "epoch": 0.11159714905045216, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.5977, + "step": 983 + }, + { + "epoch": 0.11171067616037124, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.5878, + "step": 984 + }, + { + "epoch": 0.11182420327029031, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.5938, + "step": 985 + }, + { + "epoch": 0.11193773038020939, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.5986, + "step": 986 + }, + { + "epoch": 0.11205125749012847, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.5747, + "step": 987 + }, + { + "epoch": 0.11216478460004754, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.5795, + "step": 988 + }, + { + "epoch": 0.11227831170996662, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.5969, + "step": 989 + }, + { + "epoch": 0.1123918388198857, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.602, + "step": 990 + }, + { + "epoch": 0.11250536592980477, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.5899, + "step": 991 + }, + { + "epoch": 0.11261889303972385, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.5868, + "step": 992 + }, + { + "epoch": 0.11273242014964292, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.5909, + "step": 993 + }, + { + "epoch": 0.112845947259562, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.6037, + "step": 994 + }, + { + "epoch": 0.11295947436948107, + "grad_norm": 0.478515625, + "learning_rate": 0.002, + "loss": 5.5669, + "step": 995 + }, + { + "epoch": 0.11307300147940015, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.5906, + "step": 996 + }, + { + "epoch": 0.11318652858931923, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.5805, + "step": 997 + }, + { + "epoch": 0.1133000556992383, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.5668, + "step": 998 + }, + { + "epoch": 0.11341358280915738, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.5836, + "step": 999 + }, + { + "epoch": 0.11352710991907645, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.5779, + "step": 1000 + }, + { + "epoch": 0.11364063702899553, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.5953, + "step": 1001 + }, + { + "epoch": 0.1137541641389146, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.5934, + "step": 1002 + }, + { + "epoch": 0.11386769124883368, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.5934, + "step": 1003 + }, + { + "epoch": 0.11398121835875276, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.5778, + "step": 1004 + }, + { + "epoch": 0.11409474546867183, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.5706, + "step": 1005 + }, + { + "epoch": 0.11420827257859091, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.5717, + "step": 1006 + }, + { + "epoch": 0.11432179968850999, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.5936, + "step": 1007 + }, + { + "epoch": 0.11443532679842906, + "grad_norm": 0.44140625, + "learning_rate": 0.002, + "loss": 5.578, + "step": 1008 + }, + { + "epoch": 0.11454885390834814, + "grad_norm": 0.451171875, + "learning_rate": 0.002, + "loss": 5.5532, + "step": 1009 + }, + { + "epoch": 0.11466238101826721, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.5636, + "step": 1010 + }, + { + "epoch": 0.1147759081281863, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.5852, + "step": 1011 + }, + { + "epoch": 0.11488943523810538, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.5727, + "step": 1012 + }, + { + "epoch": 0.11500296234802446, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.578, + "step": 1013 + }, + { + "epoch": 0.11511648945794353, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.5804, + "step": 1014 + }, + { + "epoch": 0.11523001656786261, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.5649, + "step": 1015 + }, + { + "epoch": 0.11534354367778168, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.5849, + "step": 1016 + }, + { + "epoch": 0.11545707078770076, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.5747, + "step": 1017 + }, + { + "epoch": 0.11557059789761984, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.5729, + "step": 1018 + }, + { + "epoch": 0.11568412500753891, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.5544, + "step": 1019 + }, + { + "epoch": 0.11579765211745799, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.5804, + "step": 1020 + }, + { + "epoch": 0.11591117922737706, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.5695, + "step": 1021 + }, + { + "epoch": 0.11602470633729614, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.5706, + "step": 1022 + }, + { + "epoch": 0.11613823344721522, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.5751, + "step": 1023 + }, + { + "epoch": 0.11625176055713429, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.5699, + "step": 1024 + }, + { + "epoch": 0.11636528766705337, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.5722, + "step": 1025 + }, + { + "epoch": 0.11647881477697244, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.5785, + "step": 1026 + }, + { + "epoch": 0.11659234188689152, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.5789, + "step": 1027 + }, + { + "epoch": 0.1167058689968106, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.5705, + "step": 1028 + }, + { + "epoch": 0.11681939610672967, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.586, + "step": 1029 + }, + { + "epoch": 0.11693292321664875, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.5683, + "step": 1030 + }, + { + "epoch": 0.11704645032656782, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.5833, + "step": 1031 + }, + { + "epoch": 0.1171599774364869, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.5729, + "step": 1032 + }, + { + "epoch": 0.11727350454640598, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.5695, + "step": 1033 + }, + { + "epoch": 0.11738703165632505, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.5532, + "step": 1034 + }, + { + "epoch": 0.11750055876624413, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.5794, + "step": 1035 + }, + { + "epoch": 0.1176140858761632, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.5763, + "step": 1036 + }, + { + "epoch": 0.11772761298608228, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.5686, + "step": 1037 + }, + { + "epoch": 0.11784114009600136, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.5819, + "step": 1038 + }, + { + "epoch": 0.11795466720592043, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.5806, + "step": 1039 + }, + { + "epoch": 0.11806819431583951, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.5665, + "step": 1040 + }, + { + "epoch": 0.1181817214257586, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.5706, + "step": 1041 + }, + { + "epoch": 0.11829524853567767, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.562, + "step": 1042 + }, + { + "epoch": 0.11840877564559675, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.5604, + "step": 1043 + }, + { + "epoch": 0.11852230275551583, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.5737, + "step": 1044 + }, + { + "epoch": 0.1186358298654349, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.5703, + "step": 1045 + }, + { + "epoch": 0.11874935697535398, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.5501, + "step": 1046 + }, + { + "epoch": 0.11886288408527305, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.5648, + "step": 1047 + }, + { + "epoch": 0.11897641119519213, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.5639, + "step": 1048 + }, + { + "epoch": 0.1190899383051112, + "grad_norm": 0.56640625, + "learning_rate": 0.002, + "loss": 5.5697, + "step": 1049 + }, + { + "epoch": 0.11920346541503028, + "grad_norm": 0.5625, + "learning_rate": 0.002, + "loss": 5.5672, + "step": 1050 + }, + { + "epoch": 0.11931699252494936, + "grad_norm": 0.48046875, + "learning_rate": 0.002, + "loss": 5.5607, + "step": 1051 + }, + { + "epoch": 0.11943051963486843, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.573, + "step": 1052 + }, + { + "epoch": 0.11954404674478751, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.5559, + "step": 1053 + }, + { + "epoch": 0.11965757385470659, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.5561, + "step": 1054 + }, + { + "epoch": 0.11977110096462566, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.5514, + "step": 1055 + }, + { + "epoch": 0.11988462807454474, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.5471, + "step": 1056 + }, + { + "epoch": 0.11999815518446381, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.5953, + "step": 1057 + }, + { + "epoch": 0.12011168229438289, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.5661, + "step": 1058 + }, + { + "epoch": 0.12022520940430197, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.5598, + "step": 1059 + }, + { + "epoch": 0.12033873651422104, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.5619, + "step": 1060 + }, + { + "epoch": 0.12045226362414012, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.5565, + "step": 1061 + }, + { + "epoch": 0.1205657907340592, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.5556, + "step": 1062 + }, + { + "epoch": 0.12067931784397827, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.5713, + "step": 1063 + }, + { + "epoch": 0.12079284495389735, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.5582, + "step": 1064 + }, + { + "epoch": 0.12090637206381642, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.5706, + "step": 1065 + }, + { + "epoch": 0.1210198991737355, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.5816, + "step": 1066 + }, + { + "epoch": 0.12113342628365457, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.5739, + "step": 1067 + }, + { + "epoch": 0.12124695339357365, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.574, + "step": 1068 + }, + { + "epoch": 0.12136048050349273, + "grad_norm": 0.5546875, + "learning_rate": 0.002, + "loss": 5.5521, + "step": 1069 + }, + { + "epoch": 0.1214740076134118, + "grad_norm": 0.65625, + "learning_rate": 0.002, + "loss": 5.5583, + "step": 1070 + }, + { + "epoch": 0.12158753472333089, + "grad_norm": 0.60546875, + "learning_rate": 0.002, + "loss": 5.5464, + "step": 1071 + }, + { + "epoch": 0.12170106183324997, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.5743, + "step": 1072 + }, + { + "epoch": 0.12181458894316904, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.5292, + "step": 1073 + }, + { + "epoch": 0.12192811605308812, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.57, + "step": 1074 + }, + { + "epoch": 0.1220416431630072, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.5705, + "step": 1075 + }, + { + "epoch": 0.12215517027292627, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.5686, + "step": 1076 + }, + { + "epoch": 0.12226869738284535, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.5457, + "step": 1077 + }, + { + "epoch": 0.12238222449276442, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.5514, + "step": 1078 + }, + { + "epoch": 0.1224957516026835, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.5715, + "step": 1079 + }, + { + "epoch": 0.12260927871260258, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.5441, + "step": 1080 + }, + { + "epoch": 0.12272280582252165, + "grad_norm": 0.462890625, + "learning_rate": 0.002, + "loss": 5.5699, + "step": 1081 + }, + { + "epoch": 0.12283633293244073, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.5504, + "step": 1082 + }, + { + "epoch": 0.1229498600423598, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.5494, + "step": 1083 + }, + { + "epoch": 0.12306338715227888, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.5575, + "step": 1084 + }, + { + "epoch": 0.12317691426219796, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.563, + "step": 1085 + }, + { + "epoch": 0.12329044137211703, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.5622, + "step": 1086 + }, + { + "epoch": 0.12340396848203611, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.5618, + "step": 1087 + }, + { + "epoch": 0.12351749559195518, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.5402, + "step": 1088 + }, + { + "epoch": 0.12363102270187426, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.5557, + "step": 1089 + }, + { + "epoch": 0.12374454981179334, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.563, + "step": 1090 + }, + { + "epoch": 0.12385807692171241, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.5684, + "step": 1091 + }, + { + "epoch": 0.12397160403163149, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.5484, + "step": 1092 + }, + { + "epoch": 0.12408513114155056, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.5572, + "step": 1093 + }, + { + "epoch": 0.12419865825146964, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.5657, + "step": 1094 + }, + { + "epoch": 0.12431218536138872, + "grad_norm": 0.48828125, + "learning_rate": 0.002, + "loss": 5.5525, + "step": 1095 + }, + { + "epoch": 0.12442571247130779, + "grad_norm": 0.498046875, + "learning_rate": 0.002, + "loss": 5.5524, + "step": 1096 + }, + { + "epoch": 0.12453923958122687, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.5588, + "step": 1097 + }, + { + "epoch": 0.12465276669114594, + "grad_norm": 0.490234375, + "learning_rate": 0.002, + "loss": 5.5512, + "step": 1098 + }, + { + "epoch": 0.12476629380106502, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.5488, + "step": 1099 + }, + { + "epoch": 0.1248798209109841, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.5724, + "step": 1100 + }, + { + "epoch": 0.12499334802090317, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.5408, + "step": 1101 + }, + { + "epoch": 0.12510687513082225, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.5319, + "step": 1102 + }, + { + "epoch": 0.12522040224074132, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.5478, + "step": 1103 + }, + { + "epoch": 0.1253339293506604, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.5648, + "step": 1104 + }, + { + "epoch": 0.12544745646057948, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.5579, + "step": 1105 + }, + { + "epoch": 0.12556098357049855, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.5536, + "step": 1106 + }, + { + "epoch": 0.12567451068041763, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.5541, + "step": 1107 + }, + { + "epoch": 0.1257880377903367, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.5458, + "step": 1108 + }, + { + "epoch": 0.12590156490025578, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.5617, + "step": 1109 + }, + { + "epoch": 0.12601509201017486, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.5634, + "step": 1110 + }, + { + "epoch": 0.12612861912009393, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.5347, + "step": 1111 + }, + { + "epoch": 0.126242146230013, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.5753, + "step": 1112 + }, + { + "epoch": 0.12635567333993208, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.5607, + "step": 1113 + }, + { + "epoch": 0.12646920044985116, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.5263, + "step": 1114 + }, + { + "epoch": 0.12658272755977024, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.5605, + "step": 1115 + }, + { + "epoch": 0.1266962546696893, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.5603, + "step": 1116 + }, + { + "epoch": 0.12680978177960842, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.54, + "step": 1117 + }, + { + "epoch": 0.1269233088895275, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.5457, + "step": 1118 + }, + { + "epoch": 0.12703683599944657, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.5271, + "step": 1119 + }, + { + "epoch": 0.12715036310936564, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.5395, + "step": 1120 + }, + { + "epoch": 0.12726389021928472, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.5356, + "step": 1121 + }, + { + "epoch": 0.1273774173292038, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.5575, + "step": 1122 + }, + { + "epoch": 0.12749094443912287, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.5514, + "step": 1123 + }, + { + "epoch": 0.12760447154904195, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.5386, + "step": 1124 + }, + { + "epoch": 0.12771799865896102, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.5548, + "step": 1125 + }, + { + "epoch": 0.1278315257688801, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.5535, + "step": 1126 + }, + { + "epoch": 0.12794505287879918, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.5349, + "step": 1127 + }, + { + "epoch": 0.12805857998871825, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.5458, + "step": 1128 + }, + { + "epoch": 0.12817210709863733, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.5261, + "step": 1129 + }, + { + "epoch": 0.1282856342085564, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.5365, + "step": 1130 + }, + { + "epoch": 0.12839916131847548, + "grad_norm": 0.466796875, + "learning_rate": 0.002, + "loss": 5.5481, + "step": 1131 + }, + { + "epoch": 0.12851268842839456, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.5552, + "step": 1132 + }, + { + "epoch": 0.12862621553831363, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.5549, + "step": 1133 + }, + { + "epoch": 0.1287397426482327, + "grad_norm": 0.5625, + "learning_rate": 0.002, + "loss": 5.5279, + "step": 1134 + }, + { + "epoch": 0.12885326975815178, + "grad_norm": 0.474609375, + "learning_rate": 0.002, + "loss": 5.5441, + "step": 1135 + }, + { + "epoch": 0.12896679686807086, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.5371, + "step": 1136 + }, + { + "epoch": 0.12908032397798994, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.5476, + "step": 1137 + }, + { + "epoch": 0.129193851087909, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.5427, + "step": 1138 + }, + { + "epoch": 0.1293073781978281, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.5379, + "step": 1139 + }, + { + "epoch": 0.12942090530774716, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.558, + "step": 1140 + }, + { + "epoch": 0.12953443241766624, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.5446, + "step": 1141 + }, + { + "epoch": 0.12964795952758532, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.5415, + "step": 1142 + }, + { + "epoch": 0.1297614866375044, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.5544, + "step": 1143 + }, + { + "epoch": 0.12987501374742347, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.5142, + "step": 1144 + }, + { + "epoch": 0.12998854085734254, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.5177, + "step": 1145 + }, + { + "epoch": 0.13010206796726162, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.5389, + "step": 1146 + }, + { + "epoch": 0.1302155950771807, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.5318, + "step": 1147 + }, + { + "epoch": 0.13032912218709977, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.5477, + "step": 1148 + }, + { + "epoch": 0.13044264929701885, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.5443, + "step": 1149 + }, + { + "epoch": 0.13055617640693792, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.5231, + "step": 1150 + }, + { + "epoch": 0.130669703516857, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.5401, + "step": 1151 + }, + { + "epoch": 0.13078323062677608, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.5374, + "step": 1152 + }, + { + "epoch": 0.13089675773669515, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.5306, + "step": 1153 + }, + { + "epoch": 0.13101028484661423, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 5.5381, + "step": 1154 + }, + { + "epoch": 0.1311238119565333, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.5337, + "step": 1155 + }, + { + "epoch": 0.13123733906645238, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.5434, + "step": 1156 + }, + { + "epoch": 0.13135086617637146, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.5441, + "step": 1157 + }, + { + "epoch": 0.13146439328629053, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.5277, + "step": 1158 + }, + { + "epoch": 0.1315779203962096, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.539, + "step": 1159 + }, + { + "epoch": 0.13169144750612868, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.5387, + "step": 1160 + }, + { + "epoch": 0.13180497461604776, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.5396, + "step": 1161 + }, + { + "epoch": 0.13191850172596684, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.5249, + "step": 1162 + }, + { + "epoch": 0.1320320288358859, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.5466, + "step": 1163 + }, + { + "epoch": 0.132145555945805, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.526, + "step": 1164 + }, + { + "epoch": 0.13225908305572406, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.5321, + "step": 1165 + }, + { + "epoch": 0.13237261016564314, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.5269, + "step": 1166 + }, + { + "epoch": 0.13248613727556222, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.5436, + "step": 1167 + }, + { + "epoch": 0.1325996643854813, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.5382, + "step": 1168 + }, + { + "epoch": 0.13271319149540037, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.5295, + "step": 1169 + }, + { + "epoch": 0.13282671860531944, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.5262, + "step": 1170 + }, + { + "epoch": 0.13294024571523852, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.5363, + "step": 1171 + }, + { + "epoch": 0.1330537728251576, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.5427, + "step": 1172 + }, + { + "epoch": 0.13316729993507667, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.5425, + "step": 1173 + }, + { + "epoch": 0.13328082704499575, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.5189, + "step": 1174 + }, + { + "epoch": 0.13339435415491482, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.554, + "step": 1175 + }, + { + "epoch": 0.1335078812648339, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.4912, + "step": 1176 + }, + { + "epoch": 0.133621408374753, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.5278, + "step": 1177 + }, + { + "epoch": 0.13373493548467208, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.5212, + "step": 1178 + }, + { + "epoch": 0.13384846259459116, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.5424, + "step": 1179 + }, + { + "epoch": 0.13396198970451023, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.5192, + "step": 1180 + }, + { + "epoch": 0.1340755168144293, + "grad_norm": 0.435546875, + "learning_rate": 0.002, + "loss": 5.5283, + "step": 1181 + }, + { + "epoch": 0.13418904392434838, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.537, + "step": 1182 + }, + { + "epoch": 0.13430257103426746, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.5259, + "step": 1183 + }, + { + "epoch": 0.13441609814418654, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.5359, + "step": 1184 + }, + { + "epoch": 0.1345296252541056, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.5311, + "step": 1185 + }, + { + "epoch": 0.1346431523640247, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.5232, + "step": 1186 + }, + { + "epoch": 0.13475667947394376, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.5296, + "step": 1187 + }, + { + "epoch": 0.13487020658386284, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.5201, + "step": 1188 + }, + { + "epoch": 0.13498373369378192, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.5279, + "step": 1189 + }, + { + "epoch": 0.135097260803701, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.5238, + "step": 1190 + }, + { + "epoch": 0.13521078791362007, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.5171, + "step": 1191 + }, + { + "epoch": 0.13532431502353914, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.5474, + "step": 1192 + }, + { + "epoch": 0.13543784213345822, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.5475, + "step": 1193 + }, + { + "epoch": 0.1355513692433773, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.5243, + "step": 1194 + }, + { + "epoch": 0.13566489635329637, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.5344, + "step": 1195 + }, + { + "epoch": 0.13577842346321545, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.53, + "step": 1196 + }, + { + "epoch": 0.13589195057313452, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 5.5222, + "step": 1197 + }, + { + "epoch": 0.1360054776830536, + "grad_norm": 0.59765625, + "learning_rate": 0.002, + "loss": 5.5147, + "step": 1198 + }, + { + "epoch": 0.13611900479297268, + "grad_norm": 0.59375, + "learning_rate": 0.002, + "loss": 5.5287, + "step": 1199 + }, + { + "epoch": 0.13623253190289175, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.5145, + "step": 1200 + }, + { + "epoch": 0.13634605901281083, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.529, + "step": 1201 + }, + { + "epoch": 0.1364595861227299, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.5366, + "step": 1202 + }, + { + "epoch": 0.13657311323264898, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.5282, + "step": 1203 + }, + { + "epoch": 0.13668664034256806, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.5166, + "step": 1204 + }, + { + "epoch": 0.13680016745248713, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.5217, + "step": 1205 + }, + { + "epoch": 0.1369136945624062, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.5257, + "step": 1206 + }, + { + "epoch": 0.13702722167232528, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.5025, + "step": 1207 + }, + { + "epoch": 0.13714074878224436, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.5269, + "step": 1208 + }, + { + "epoch": 0.13725427589216344, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.5391, + "step": 1209 + }, + { + "epoch": 0.1373678030020825, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.5028, + "step": 1210 + }, + { + "epoch": 0.1374813301120016, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.527, + "step": 1211 + }, + { + "epoch": 0.13759485722192066, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.5209, + "step": 1212 + }, + { + "epoch": 0.13770838433183974, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.5134, + "step": 1213 + }, + { + "epoch": 0.13782191144175882, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.5211, + "step": 1214 + }, + { + "epoch": 0.1379354385516779, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 5.5359, + "step": 1215 + }, + { + "epoch": 0.13804896566159697, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.5179, + "step": 1216 + }, + { + "epoch": 0.13816249277151604, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.5178, + "step": 1217 + }, + { + "epoch": 0.13827601988143512, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.4937, + "step": 1218 + }, + { + "epoch": 0.1383895469913542, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.5265, + "step": 1219 + }, + { + "epoch": 0.13850307410127327, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.5118, + "step": 1220 + }, + { + "epoch": 0.13861660121119235, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.5181, + "step": 1221 + }, + { + "epoch": 0.13873012832111142, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.5385, + "step": 1222 + }, + { + "epoch": 0.1388436554310305, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.539, + "step": 1223 + }, + { + "epoch": 0.13895718254094958, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.5237, + "step": 1224 + }, + { + "epoch": 0.13907070965086865, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.5251, + "step": 1225 + }, + { + "epoch": 0.13918423676078773, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.5393, + "step": 1226 + }, + { + "epoch": 0.1392977638707068, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.4971, + "step": 1227 + }, + { + "epoch": 0.13941129098062588, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.5192, + "step": 1228 + }, + { + "epoch": 0.13952481809054496, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.4989, + "step": 1229 + }, + { + "epoch": 0.13963834520046403, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.5368, + "step": 1230 + }, + { + "epoch": 0.1397518723103831, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.5391, + "step": 1231 + }, + { + "epoch": 0.13986539942030218, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.519, + "step": 1232 + }, + { + "epoch": 0.13997892653022126, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.5221, + "step": 1233 + }, + { + "epoch": 0.14009245364014034, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.5069, + "step": 1234 + }, + { + "epoch": 0.1402059807500594, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.53, + "step": 1235 + }, + { + "epoch": 0.1403195078599785, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.5109, + "step": 1236 + }, + { + "epoch": 0.1404330349698976, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.5188, + "step": 1237 + }, + { + "epoch": 0.14054656207981667, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.5151, + "step": 1238 + }, + { + "epoch": 0.14066008918973574, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.5115, + "step": 1239 + }, + { + "epoch": 0.14077361629965482, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.5396, + "step": 1240 + }, + { + "epoch": 0.1408871434095739, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.5101, + "step": 1241 + }, + { + "epoch": 0.14100067051949297, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.5293, + "step": 1242 + }, + { + "epoch": 0.14111419762941205, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.507, + "step": 1243 + }, + { + "epoch": 0.14122772473933112, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.5201, + "step": 1244 + }, + { + "epoch": 0.1413412518492502, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.507, + "step": 1245 + }, + { + "epoch": 0.14145477895916928, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.5111, + "step": 1246 + }, + { + "epoch": 0.14156830606908835, + "grad_norm": 0.55859375, + "learning_rate": 0.002, + "loss": 5.5155, + "step": 1247 + }, + { + "epoch": 0.14168183317900743, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 5.5221, + "step": 1248 + }, + { + "epoch": 0.1417953602889265, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.5118, + "step": 1249 + }, + { + "epoch": 0.14190888739884558, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.5088, + "step": 1250 + }, + { + "epoch": 0.14202241450876466, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.5193, + "step": 1251 + }, + { + "epoch": 0.14213594161868373, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.4872, + "step": 1252 + }, + { + "epoch": 0.1422494687286028, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.5207, + "step": 1253 + }, + { + "epoch": 0.14236299583852188, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.5089, + "step": 1254 + }, + { + "epoch": 0.14247652294844096, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.4982, + "step": 1255 + }, + { + "epoch": 0.14259005005836003, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.5159, + "step": 1256 + }, + { + "epoch": 0.1427035771682791, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.507, + "step": 1257 + }, + { + "epoch": 0.1428171042781982, + "grad_norm": 0.546875, + "learning_rate": 0.002, + "loss": 5.5016, + "step": 1258 + }, + { + "epoch": 0.14293063138811726, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 5.5108, + "step": 1259 + }, + { + "epoch": 0.14304415849803634, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.5205, + "step": 1260 + }, + { + "epoch": 0.14315768560795541, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.5127, + "step": 1261 + }, + { + "epoch": 0.1432712127178745, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.4916, + "step": 1262 + }, + { + "epoch": 0.14338473982779357, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.5037, + "step": 1263 + }, + { + "epoch": 0.14349826693771264, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.518, + "step": 1264 + }, + { + "epoch": 0.14361179404763172, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.4998, + "step": 1265 + }, + { + "epoch": 0.1437253211575508, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.5086, + "step": 1266 + }, + { + "epoch": 0.14383884826746987, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.5099, + "step": 1267 + }, + { + "epoch": 0.14395237537738895, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.5156, + "step": 1268 + }, + { + "epoch": 0.14406590248730802, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4937, + "step": 1269 + }, + { + "epoch": 0.1441794295972271, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.5108, + "step": 1270 + }, + { + "epoch": 0.14429295670714617, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.5208, + "step": 1271 + }, + { + "epoch": 0.14440648381706525, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.517, + "step": 1272 + }, + { + "epoch": 0.14452001092698433, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.514, + "step": 1273 + }, + { + "epoch": 0.1446335380369034, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.5225, + "step": 1274 + }, + { + "epoch": 0.14474706514682248, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.5213, + "step": 1275 + }, + { + "epoch": 0.14486059225674155, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.5253, + "step": 1276 + }, + { + "epoch": 0.14497411936666063, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.4806, + "step": 1277 + }, + { + "epoch": 0.1450876464765797, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.4986, + "step": 1278 + }, + { + "epoch": 0.14520117358649878, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4802, + "step": 1279 + }, + { + "epoch": 0.14531470069641786, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.509, + "step": 1280 + }, + { + "epoch": 0.14542822780633693, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.5083, + "step": 1281 + }, + { + "epoch": 0.145541754916256, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.4918, + "step": 1282 + }, + { + "epoch": 0.1456552820261751, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.5365, + "step": 1283 + }, + { + "epoch": 0.14576880913609416, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.5178, + "step": 1284 + }, + { + "epoch": 0.14588233624601324, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.5097, + "step": 1285 + }, + { + "epoch": 0.14599586335593231, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.5035, + "step": 1286 + }, + { + "epoch": 0.1461093904658514, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.513, + "step": 1287 + }, + { + "epoch": 0.14622291757577047, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.5109, + "step": 1288 + }, + { + "epoch": 0.14633644468568954, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.5049, + "step": 1289 + }, + { + "epoch": 0.14644997179560862, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.5105, + "step": 1290 + }, + { + "epoch": 0.1465634989055277, + "grad_norm": 0.4921875, + "learning_rate": 0.002, + "loss": 5.5136, + "step": 1291 + }, + { + "epoch": 0.14667702601544677, + "grad_norm": 0.59375, + "learning_rate": 0.002, + "loss": 5.5088, + "step": 1292 + }, + { + "epoch": 0.14679055312536585, + "grad_norm": 0.54296875, + "learning_rate": 0.002, + "loss": 5.512, + "step": 1293 + }, + { + "epoch": 0.14690408023528492, + "grad_norm": 0.48046875, + "learning_rate": 0.002, + "loss": 5.4919, + "step": 1294 + }, + { + "epoch": 0.147017607345204, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.4931, + "step": 1295 + }, + { + "epoch": 0.14713113445512307, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.493, + "step": 1296 + }, + { + "epoch": 0.14724466156504215, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.4839, + "step": 1297 + }, + { + "epoch": 0.14735818867496125, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.4777, + "step": 1298 + }, + { + "epoch": 0.14747171578488033, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.5059, + "step": 1299 + }, + { + "epoch": 0.1475852428947994, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.5141, + "step": 1300 + }, + { + "epoch": 0.14769877000471848, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.5259, + "step": 1301 + }, + { + "epoch": 0.14781229711463756, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.5155, + "step": 1302 + }, + { + "epoch": 0.14792582422455663, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.503, + "step": 1303 + }, + { + "epoch": 0.1480393513344757, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.5032, + "step": 1304 + }, + { + "epoch": 0.1481528784443948, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4906, + "step": 1305 + }, + { + "epoch": 0.14826640555431386, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.4984, + "step": 1306 + }, + { + "epoch": 0.14837993266423294, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.5264, + "step": 1307 + }, + { + "epoch": 0.14849345977415201, + "grad_norm": 0.51953125, + "learning_rate": 0.002, + "loss": 5.5082, + "step": 1308 + }, + { + "epoch": 0.1486069868840711, + "grad_norm": 0.515625, + "learning_rate": 0.002, + "loss": 5.4823, + "step": 1309 + }, + { + "epoch": 0.14872051399399017, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 5.5023, + "step": 1310 + }, + { + "epoch": 0.14883404110390924, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.4978, + "step": 1311 + }, + { + "epoch": 0.14894756821382832, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.5204, + "step": 1312 + }, + { + "epoch": 0.1490610953237474, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.4882, + "step": 1313 + }, + { + "epoch": 0.14917462243366647, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.4701, + "step": 1314 + }, + { + "epoch": 0.14928814954358555, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.51, + "step": 1315 + }, + { + "epoch": 0.14940167665350462, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.4809, + "step": 1316 + }, + { + "epoch": 0.1495152037634237, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4878, + "step": 1317 + }, + { + "epoch": 0.14962873087334277, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.4939, + "step": 1318 + }, + { + "epoch": 0.14974225798326185, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.4838, + "step": 1319 + }, + { + "epoch": 0.14985578509318093, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.4949, + "step": 1320 + }, + { + "epoch": 0.1499693122031, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.4901, + "step": 1321 + }, + { + "epoch": 0.15008283931301908, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.4935, + "step": 1322 + }, + { + "epoch": 0.15019636642293815, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.5011, + "step": 1323 + }, + { + "epoch": 0.15030989353285723, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.4881, + "step": 1324 + }, + { + "epoch": 0.1504234206427763, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.4987, + "step": 1325 + }, + { + "epoch": 0.15053694775269538, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.4833, + "step": 1326 + }, + { + "epoch": 0.15065047486261446, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.5303, + "step": 1327 + }, + { + "epoch": 0.15076400197253353, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.5166, + "step": 1328 + }, + { + "epoch": 0.1508775290824526, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.4696, + "step": 1329 + }, + { + "epoch": 0.1509910561923717, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.4704, + "step": 1330 + }, + { + "epoch": 0.15110458330229076, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.4845, + "step": 1331 + }, + { + "epoch": 0.15121811041220984, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.4976, + "step": 1332 + }, + { + "epoch": 0.15133163752212891, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4732, + "step": 1333 + }, + { + "epoch": 0.151445164632048, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.4948, + "step": 1334 + }, + { + "epoch": 0.15155869174196707, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.4892, + "step": 1335 + }, + { + "epoch": 0.15167221885188614, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.4983, + "step": 1336 + }, + { + "epoch": 0.15178574596180522, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4859, + "step": 1337 + }, + { + "epoch": 0.1518992730717243, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.4646, + "step": 1338 + }, + { + "epoch": 0.15201280018164337, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.5193, + "step": 1339 + }, + { + "epoch": 0.15212632729156245, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.5059, + "step": 1340 + }, + { + "epoch": 0.15223985440148152, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.5149, + "step": 1341 + }, + { + "epoch": 0.1523533815114006, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.4976, + "step": 1342 + }, + { + "epoch": 0.15246690862131967, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.491, + "step": 1343 + }, + { + "epoch": 0.15258043573123875, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.4841, + "step": 1344 + }, + { + "epoch": 0.15269396284115783, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.4861, + "step": 1345 + }, + { + "epoch": 0.1528074899510769, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.4922, + "step": 1346 + }, + { + "epoch": 0.15292101706099598, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.5032, + "step": 1347 + }, + { + "epoch": 0.15303454417091505, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.4852, + "step": 1348 + }, + { + "epoch": 0.15314807128083413, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.5097, + "step": 1349 + }, + { + "epoch": 0.1532615983907532, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.4892, + "step": 1350 + }, + { + "epoch": 0.15337512550067228, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.4972, + "step": 1351 + }, + { + "epoch": 0.15348865261059136, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.5134, + "step": 1352 + }, + { + "epoch": 0.15360217972051043, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4984, + "step": 1353 + }, + { + "epoch": 0.1537157068304295, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.4886, + "step": 1354 + }, + { + "epoch": 0.1538292339403486, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.466, + "step": 1355 + }, + { + "epoch": 0.15394276105026766, + "grad_norm": 0.51171875, + "learning_rate": 0.002, + "loss": 5.4912, + "step": 1356 + }, + { + "epoch": 0.15405628816018674, + "grad_norm": 0.55078125, + "learning_rate": 0.002, + "loss": 5.5099, + "step": 1357 + }, + { + "epoch": 0.15416981527010584, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.5082, + "step": 1358 + }, + { + "epoch": 0.15428334238002492, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.4784, + "step": 1359 + }, + { + "epoch": 0.154396869489944, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.5001, + "step": 1360 + }, + { + "epoch": 0.15451039659986307, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.4859, + "step": 1361 + }, + { + "epoch": 0.15462392370978215, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.4926, + "step": 1362 + }, + { + "epoch": 0.15473745081970122, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.4977, + "step": 1363 + }, + { + "epoch": 0.1548509779296203, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.4775, + "step": 1364 + }, + { + "epoch": 0.15496450503953937, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.5012, + "step": 1365 + }, + { + "epoch": 0.15507803214945845, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.5044, + "step": 1366 + }, + { + "epoch": 0.15519155925937753, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.4905, + "step": 1367 + }, + { + "epoch": 0.1553050863692966, + "grad_norm": 0.474609375, + "learning_rate": 0.002, + "loss": 5.5009, + "step": 1368 + }, + { + "epoch": 0.15541861347921568, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.4886, + "step": 1369 + }, + { + "epoch": 0.15553214058913475, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.5077, + "step": 1370 + }, + { + "epoch": 0.15564566769905383, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.4751, + "step": 1371 + }, + { + "epoch": 0.1557591948089729, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.5031, + "step": 1372 + }, + { + "epoch": 0.15587272191889198, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.5057, + "step": 1373 + }, + { + "epoch": 0.15598624902881106, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.4909, + "step": 1374 + }, + { + "epoch": 0.15609977613873013, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.4869, + "step": 1375 + }, + { + "epoch": 0.1562133032486492, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.5038, + "step": 1376 + }, + { + "epoch": 0.1563268303585683, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.4862, + "step": 1377 + }, + { + "epoch": 0.15644035746848736, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.4859, + "step": 1378 + }, + { + "epoch": 0.15655388457840644, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.4768, + "step": 1379 + }, + { + "epoch": 0.15666741168832551, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.4793, + "step": 1380 + }, + { + "epoch": 0.1567809387982446, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.4765, + "step": 1381 + }, + { + "epoch": 0.15689446590816367, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.4869, + "step": 1382 + }, + { + "epoch": 0.15700799301808274, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.5028, + "step": 1383 + }, + { + "epoch": 0.15712152012800182, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.5071, + "step": 1384 + }, + { + "epoch": 0.1572350472379209, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.485, + "step": 1385 + }, + { + "epoch": 0.15734857434783997, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.4737, + "step": 1386 + }, + { + "epoch": 0.15746210145775905, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.4989, + "step": 1387 + }, + { + "epoch": 0.15757562856767812, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4855, + "step": 1388 + }, + { + "epoch": 0.1576891556775972, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.4772, + "step": 1389 + }, + { + "epoch": 0.15780268278751627, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4933, + "step": 1390 + }, + { + "epoch": 0.15791620989743535, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.4855, + "step": 1391 + }, + { + "epoch": 0.15802973700735443, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.4806, + "step": 1392 + }, + { + "epoch": 0.1581432641172735, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.4889, + "step": 1393 + }, + { + "epoch": 0.15825679122719258, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.4864, + "step": 1394 + }, + { + "epoch": 0.15837031833711165, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.4974, + "step": 1395 + }, + { + "epoch": 0.15848384544703073, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.4708, + "step": 1396 + }, + { + "epoch": 0.1585973725569498, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.4889, + "step": 1397 + }, + { + "epoch": 0.15871089966686888, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.4789, + "step": 1398 + }, + { + "epoch": 0.15882442677678796, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.5028, + "step": 1399 + }, + { + "epoch": 0.15893795388670703, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.4884, + "step": 1400 + }, + { + "epoch": 0.1590514809966261, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.4941, + "step": 1401 + }, + { + "epoch": 0.15916500810654519, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.476, + "step": 1402 + }, + { + "epoch": 0.15927853521646426, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.5018, + "step": 1403 + }, + { + "epoch": 0.15939206232638334, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.4796, + "step": 1404 + }, + { + "epoch": 0.15950558943630241, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.4904, + "step": 1405 + }, + { + "epoch": 0.1596191165462215, + "grad_norm": 0.478515625, + "learning_rate": 0.002, + "loss": 5.4859, + "step": 1406 + }, + { + "epoch": 0.15973264365614057, + "grad_norm": 0.51953125, + "learning_rate": 0.002, + "loss": 5.473, + "step": 1407 + }, + { + "epoch": 0.15984617076605964, + "grad_norm": 0.578125, + "learning_rate": 0.002, + "loss": 5.4877, + "step": 1408 + }, + { + "epoch": 0.15995969787597872, + "grad_norm": 0.546875, + "learning_rate": 0.002, + "loss": 5.495, + "step": 1409 + }, + { + "epoch": 0.1600732249858978, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.4757, + "step": 1410 + }, + { + "epoch": 0.16018675209581687, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.4813, + "step": 1411 + }, + { + "epoch": 0.16030027920573595, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.4831, + "step": 1412 + }, + { + "epoch": 0.16041380631565502, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.4812, + "step": 1413 + }, + { + "epoch": 0.1605273334255741, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.4656, + "step": 1414 + }, + { + "epoch": 0.16064086053549317, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.4795, + "step": 1415 + }, + { + "epoch": 0.16075438764541225, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4759, + "step": 1416 + }, + { + "epoch": 0.16086791475533133, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.4771, + "step": 1417 + }, + { + "epoch": 0.16098144186525043, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.5008, + "step": 1418 + }, + { + "epoch": 0.1610949689751695, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.4532, + "step": 1419 + }, + { + "epoch": 0.16120849608508858, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.4976, + "step": 1420 + }, + { + "epoch": 0.16132202319500766, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.4839, + "step": 1421 + }, + { + "epoch": 0.16143555030492673, + "grad_norm": 0.51171875, + "learning_rate": 0.002, + "loss": 5.4972, + "step": 1422 + }, + { + "epoch": 0.1615490774148458, + "grad_norm": 0.4921875, + "learning_rate": 0.002, + "loss": 5.4811, + "step": 1423 + }, + { + "epoch": 0.16166260452476489, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.4947, + "step": 1424 + }, + { + "epoch": 0.16177613163468396, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.4983, + "step": 1425 + }, + { + "epoch": 0.16188965874460304, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.4758, + "step": 1426 + }, + { + "epoch": 0.1620031858545221, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4805, + "step": 1427 + }, + { + "epoch": 0.1621167129644412, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.4665, + "step": 1428 + }, + { + "epoch": 0.16223024007436027, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.4805, + "step": 1429 + }, + { + "epoch": 0.16234376718427934, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.4692, + "step": 1430 + }, + { + "epoch": 0.16245729429419842, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.5048, + "step": 1431 + }, + { + "epoch": 0.1625708214041175, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4789, + "step": 1432 + }, + { + "epoch": 0.16268434851403657, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.4673, + "step": 1433 + }, + { + "epoch": 0.16279787562395565, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.4844, + "step": 1434 + }, + { + "epoch": 0.16291140273387472, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.4937, + "step": 1435 + }, + { + "epoch": 0.1630249298437938, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 5.4909, + "step": 1436 + }, + { + "epoch": 0.16313845695371287, + "grad_norm": 0.578125, + "learning_rate": 0.002, + "loss": 5.4605, + "step": 1437 + }, + { + "epoch": 0.16325198406363195, + "grad_norm": 0.494140625, + "learning_rate": 0.002, + "loss": 5.4732, + "step": 1438 + }, + { + "epoch": 0.16336551117355103, + "grad_norm": 0.462890625, + "learning_rate": 0.002, + "loss": 5.4838, + "step": 1439 + }, + { + "epoch": 0.1634790382834701, + "grad_norm": 0.53125, + "learning_rate": 0.002, + "loss": 5.476, + "step": 1440 + }, + { + "epoch": 0.16359256539338918, + "grad_norm": 0.48046875, + "learning_rate": 0.002, + "loss": 5.4707, + "step": 1441 + }, + { + "epoch": 0.16370609250330825, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.491, + "step": 1442 + }, + { + "epoch": 0.16381961961322733, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.4845, + "step": 1443 + }, + { + "epoch": 0.1639331467231464, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.4903, + "step": 1444 + }, + { + "epoch": 0.16404667383306548, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4855, + "step": 1445 + }, + { + "epoch": 0.16416020094298456, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.4706, + "step": 1446 + }, + { + "epoch": 0.16427372805290363, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.4858, + "step": 1447 + }, + { + "epoch": 0.1643872551628227, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4512, + "step": 1448 + }, + { + "epoch": 0.16450078227274179, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.4715, + "step": 1449 + }, + { + "epoch": 0.16461430938266086, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.4567, + "step": 1450 + }, + { + "epoch": 0.16472783649257994, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.4879, + "step": 1451 + }, + { + "epoch": 0.164841363602499, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.4386, + "step": 1452 + }, + { + "epoch": 0.1649548907124181, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.4727, + "step": 1453 + }, + { + "epoch": 0.16506841782233717, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.4587, + "step": 1454 + }, + { + "epoch": 0.16518194493225624, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.4848, + "step": 1455 + }, + { + "epoch": 0.16529547204217532, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.4719, + "step": 1456 + }, + { + "epoch": 0.1654089991520944, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.4634, + "step": 1457 + }, + { + "epoch": 0.16552252626201347, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.495, + "step": 1458 + }, + { + "epoch": 0.16563605337193255, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.4712, + "step": 1459 + }, + { + "epoch": 0.16574958048185162, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4553, + "step": 1460 + }, + { + "epoch": 0.1658631075917707, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.4866, + "step": 1461 + }, + { + "epoch": 0.16597663470168977, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4752, + "step": 1462 + }, + { + "epoch": 0.16609016181160885, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.4927, + "step": 1463 + }, + { + "epoch": 0.16620368892152793, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.4516, + "step": 1464 + }, + { + "epoch": 0.166317216031447, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.4769, + "step": 1465 + }, + { + "epoch": 0.16643074314136608, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.4526, + "step": 1466 + }, + { + "epoch": 0.16654427025128515, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.4606, + "step": 1467 + }, + { + "epoch": 0.16665779736120423, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4622, + "step": 1468 + }, + { + "epoch": 0.1667713244711233, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.4751, + "step": 1469 + }, + { + "epoch": 0.16688485158104238, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.4731, + "step": 1470 + }, + { + "epoch": 0.16699837869096146, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.4632, + "step": 1471 + }, + { + "epoch": 0.16711190580088053, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.472, + "step": 1472 + }, + { + "epoch": 0.1672254329107996, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.4779, + "step": 1473 + }, + { + "epoch": 0.16733896002071869, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.4662, + "step": 1474 + }, + { + "epoch": 0.16745248713063776, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.4617, + "step": 1475 + }, + { + "epoch": 0.16756601424055684, + "grad_norm": 0.470703125, + "learning_rate": 0.002, + "loss": 5.4877, + "step": 1476 + }, + { + "epoch": 0.1676795413504759, + "grad_norm": 0.49609375, + "learning_rate": 0.002, + "loss": 5.4593, + "step": 1477 + }, + { + "epoch": 0.167793068460395, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.4768, + "step": 1478 + }, + { + "epoch": 0.1679065955703141, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.4696, + "step": 1479 + }, + { + "epoch": 0.16802012268023317, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.4673, + "step": 1480 + }, + { + "epoch": 0.16813364979015225, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.4726, + "step": 1481 + }, + { + "epoch": 0.16824717690007132, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.4702, + "step": 1482 + }, + { + "epoch": 0.1683607040099904, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.485, + "step": 1483 + }, + { + "epoch": 0.16847423111990947, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4598, + "step": 1484 + }, + { + "epoch": 0.16858775822982855, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4861, + "step": 1485 + }, + { + "epoch": 0.16870128533974763, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.4659, + "step": 1486 + }, + { + "epoch": 0.1688148124496667, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.4681, + "step": 1487 + }, + { + "epoch": 0.16892833955958578, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.4692, + "step": 1488 + }, + { + "epoch": 0.16904186666950485, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.4862, + "step": 1489 + }, + { + "epoch": 0.16915539377942393, + "grad_norm": 0.515625, + "learning_rate": 0.002, + "loss": 5.473, + "step": 1490 + }, + { + "epoch": 0.169268920889343, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.476, + "step": 1491 + }, + { + "epoch": 0.16938244799926208, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.4545, + "step": 1492 + }, + { + "epoch": 0.16949597510918116, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.471, + "step": 1493 + }, + { + "epoch": 0.16960950221910023, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.4775, + "step": 1494 + }, + { + "epoch": 0.1697230293290193, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.4582, + "step": 1495 + }, + { + "epoch": 0.16983655643893839, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.4853, + "step": 1496 + }, + { + "epoch": 0.16995008354885746, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.4723, + "step": 1497 + }, + { + "epoch": 0.17006361065877654, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.4633, + "step": 1498 + }, + { + "epoch": 0.1701771377686956, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.4763, + "step": 1499 + }, + { + "epoch": 0.1702906648786147, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.453, + "step": 1500 + }, + { + "epoch": 0.17040419198853377, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4781, + "step": 1501 + }, + { + "epoch": 0.17051771909845284, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.4801, + "step": 1502 + }, + { + "epoch": 0.17063124620837192, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.465, + "step": 1503 + }, + { + "epoch": 0.170744773318291, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4826, + "step": 1504 + }, + { + "epoch": 0.17085830042821007, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.452, + "step": 1505 + }, + { + "epoch": 0.17097182753812915, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.4603, + "step": 1506 + }, + { + "epoch": 0.17108535464804822, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4688, + "step": 1507 + }, + { + "epoch": 0.1711988817579673, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4663, + "step": 1508 + }, + { + "epoch": 0.17131240886788637, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.4648, + "step": 1509 + }, + { + "epoch": 0.17142593597780545, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.4656, + "step": 1510 + }, + { + "epoch": 0.17153946308772453, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.4348, + "step": 1511 + }, + { + "epoch": 0.1716529901976436, + "grad_norm": 0.486328125, + "learning_rate": 0.002, + "loss": 5.4604, + "step": 1512 + }, + { + "epoch": 0.17176651730756268, + "grad_norm": 0.451171875, + "learning_rate": 0.002, + "loss": 5.4631, + "step": 1513 + }, + { + "epoch": 0.17188004441748175, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.4614, + "step": 1514 + }, + { + "epoch": 0.17199357152740083, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.4629, + "step": 1515 + }, + { + "epoch": 0.1721070986373199, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.4654, + "step": 1516 + }, + { + "epoch": 0.17222062574723898, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.477, + "step": 1517 + }, + { + "epoch": 0.17233415285715806, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.4586, + "step": 1518 + }, + { + "epoch": 0.17244767996707713, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.4732, + "step": 1519 + }, + { + "epoch": 0.1725612070769962, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.4437, + "step": 1520 + }, + { + "epoch": 0.17267473418691529, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.4716, + "step": 1521 + }, + { + "epoch": 0.17278826129683436, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.4551, + "step": 1522 + }, + { + "epoch": 0.17290178840675344, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.4778, + "step": 1523 + }, + { + "epoch": 0.1730153155166725, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.4687, + "step": 1524 + }, + { + "epoch": 0.1731288426265916, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.4522, + "step": 1525 + }, + { + "epoch": 0.17324236973651067, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.4479, + "step": 1526 + }, + { + "epoch": 0.17335589684642974, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4743, + "step": 1527 + }, + { + "epoch": 0.17346942395634882, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.472, + "step": 1528 + }, + { + "epoch": 0.1735829510662679, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.4773, + "step": 1529 + }, + { + "epoch": 0.17369647817618697, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.4717, + "step": 1530 + }, + { + "epoch": 0.17381000528610605, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.4617, + "step": 1531 + }, + { + "epoch": 0.17392353239602512, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.4705, + "step": 1532 + }, + { + "epoch": 0.1740370595059442, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.4663, + "step": 1533 + }, + { + "epoch": 0.17415058661586327, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4957, + "step": 1534 + }, + { + "epoch": 0.17426411372578235, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.4572, + "step": 1535 + }, + { + "epoch": 0.17437764083570143, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.4683, + "step": 1536 + }, + { + "epoch": 0.1744911679456205, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.4949, + "step": 1537 + }, + { + "epoch": 0.17460469505553958, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.4521, + "step": 1538 + }, + { + "epoch": 0.17471822216545868, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.4502, + "step": 1539 + }, + { + "epoch": 0.17483174927537776, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.4611, + "step": 1540 + }, + { + "epoch": 0.17494527638529683, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 5.4778, + "step": 1541 + }, + { + "epoch": 0.1750588034952159, + "grad_norm": 0.5390625, + "learning_rate": 0.002, + "loss": 5.4469, + "step": 1542 + }, + { + "epoch": 0.17517233060513498, + "grad_norm": 0.486328125, + "learning_rate": 0.002, + "loss": 5.4497, + "step": 1543 + }, + { + "epoch": 0.17528585771505406, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.4443, + "step": 1544 + }, + { + "epoch": 0.17539938482497314, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.4545, + "step": 1545 + }, + { + "epoch": 0.1755129119348922, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.4699, + "step": 1546 + }, + { + "epoch": 0.1756264390448113, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.4678, + "step": 1547 + }, + { + "epoch": 0.17573996615473036, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.4652, + "step": 1548 + }, + { + "epoch": 0.17585349326464944, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.464, + "step": 1549 + }, + { + "epoch": 0.17596702037456852, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.483, + "step": 1550 + }, + { + "epoch": 0.1760805474844876, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.4593, + "step": 1551 + }, + { + "epoch": 0.17619407459440667, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.4614, + "step": 1552 + }, + { + "epoch": 0.17630760170432574, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.4653, + "step": 1553 + }, + { + "epoch": 0.17642112881424482, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.4821, + "step": 1554 + }, + { + "epoch": 0.1765346559241639, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.4356, + "step": 1555 + }, + { + "epoch": 0.17664818303408297, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.4494, + "step": 1556 + }, + { + "epoch": 0.17676171014400205, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.4647, + "step": 1557 + }, + { + "epoch": 0.17687523725392112, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.4265, + "step": 1558 + }, + { + "epoch": 0.1769887643638402, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.4432, + "step": 1559 + }, + { + "epoch": 0.17710229147375928, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.4406, + "step": 1560 + }, + { + "epoch": 0.17721581858367835, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.4446, + "step": 1561 + }, + { + "epoch": 0.17732934569359743, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.467, + "step": 1562 + }, + { + "epoch": 0.1774428728035165, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.4731, + "step": 1563 + }, + { + "epoch": 0.17755639991343558, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.4614, + "step": 1564 + }, + { + "epoch": 0.17766992702335466, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.456, + "step": 1565 + }, + { + "epoch": 0.17778345413327373, + "grad_norm": 0.498046875, + "learning_rate": 0.002, + "loss": 5.4389, + "step": 1566 + }, + { + "epoch": 0.1778969812431928, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.4437, + "step": 1567 + }, + { + "epoch": 0.17801050835311188, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.4411, + "step": 1568 + }, + { + "epoch": 0.17812403546303096, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.4746, + "step": 1569 + }, + { + "epoch": 0.17823756257295004, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.4455, + "step": 1570 + }, + { + "epoch": 0.1783510896828691, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.4468, + "step": 1571 + }, + { + "epoch": 0.1784646167927882, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.4619, + "step": 1572 + }, + { + "epoch": 0.17857814390270726, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.4728, + "step": 1573 + }, + { + "epoch": 0.17869167101262634, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.4654, + "step": 1574 + }, + { + "epoch": 0.17880519812254542, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.4495, + "step": 1575 + }, + { + "epoch": 0.1789187252324645, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.4541, + "step": 1576 + }, + { + "epoch": 0.17903225234238357, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.4461, + "step": 1577 + }, + { + "epoch": 0.17914577945230264, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.4525, + "step": 1578 + }, + { + "epoch": 0.17925930656222172, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.4405, + "step": 1579 + }, + { + "epoch": 0.1793728336721408, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.45, + "step": 1580 + }, + { + "epoch": 0.17948636078205987, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.4674, + "step": 1581 + }, + { + "epoch": 0.17959988789197895, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.4532, + "step": 1582 + }, + { + "epoch": 0.17971341500189802, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.4626, + "step": 1583 + }, + { + "epoch": 0.1798269421118171, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.4286, + "step": 1584 + }, + { + "epoch": 0.17994046922173618, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4414, + "step": 1585 + }, + { + "epoch": 0.18005399633165525, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.4642, + "step": 1586 + }, + { + "epoch": 0.18016752344157433, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.4289, + "step": 1587 + }, + { + "epoch": 0.1802810505514934, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.4624, + "step": 1588 + }, + { + "epoch": 0.18039457766141248, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.4389, + "step": 1589 + }, + { + "epoch": 0.18050810477133156, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.4468, + "step": 1590 + }, + { + "epoch": 0.18062163188125063, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.4455, + "step": 1591 + }, + { + "epoch": 0.1807351589911697, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.4618, + "step": 1592 + }, + { + "epoch": 0.18084868610108878, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.4449, + "step": 1593 + }, + { + "epoch": 0.18096221321100786, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.4244, + "step": 1594 + }, + { + "epoch": 0.18107574032092694, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.4657, + "step": 1595 + }, + { + "epoch": 0.181189267430846, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.4504, + "step": 1596 + }, + { + "epoch": 0.1813027945407651, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.4436, + "step": 1597 + }, + { + "epoch": 0.18141632165068416, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4427, + "step": 1598 + }, + { + "epoch": 0.18152984876060327, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.4517, + "step": 1599 + }, + { + "epoch": 0.18164337587052234, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.4587, + "step": 1600 + }, + { + "epoch": 0.18175690298044142, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.4272, + "step": 1601 + }, + { + "epoch": 0.1818704300903605, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.4805, + "step": 1602 + }, + { + "epoch": 0.18198395720027957, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4365, + "step": 1603 + }, + { + "epoch": 0.18209748431019865, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.4694, + "step": 1604 + }, + { + "epoch": 0.18221101142011772, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.4463, + "step": 1605 + }, + { + "epoch": 0.1823245385300368, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4463, + "step": 1606 + }, + { + "epoch": 0.18243806563995588, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.4485, + "step": 1607 + }, + { + "epoch": 0.18255159274987495, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.485, + "step": 1608 + }, + { + "epoch": 0.18266511985979403, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.453, + "step": 1609 + }, + { + "epoch": 0.1827786469697131, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.4696, + "step": 1610 + }, + { + "epoch": 0.18289217407963218, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.4293, + "step": 1611 + }, + { + "epoch": 0.18300570118955126, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.4609, + "step": 1612 + }, + { + "epoch": 0.18311922829947033, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.4205, + "step": 1613 + }, + { + "epoch": 0.1832327554093894, + "grad_norm": 0.52734375, + "learning_rate": 0.002, + "loss": 5.4594, + "step": 1614 + }, + { + "epoch": 0.18334628251930848, + "grad_norm": 0.56640625, + "learning_rate": 0.002, + "loss": 5.4291, + "step": 1615 + }, + { + "epoch": 0.18345980962922756, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.4399, + "step": 1616 + }, + { + "epoch": 0.18357333673914664, + "grad_norm": 0.494140625, + "learning_rate": 0.002, + "loss": 5.4586, + "step": 1617 + }, + { + "epoch": 0.1836868638490657, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.4633, + "step": 1618 + }, + { + "epoch": 0.1838003909589848, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.4631, + "step": 1619 + }, + { + "epoch": 0.18391391806890386, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.445, + "step": 1620 + }, + { + "epoch": 0.18402744517882294, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.4526, + "step": 1621 + }, + { + "epoch": 0.18414097228874202, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.4417, + "step": 1622 + }, + { + "epoch": 0.1842544993986611, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.454, + "step": 1623 + }, + { + "epoch": 0.18436802650858017, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.453, + "step": 1624 + }, + { + "epoch": 0.18448155361849924, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.4207, + "step": 1625 + }, + { + "epoch": 0.18459508072841832, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4373, + "step": 1626 + }, + { + "epoch": 0.1847086078383374, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.4356, + "step": 1627 + }, + { + "epoch": 0.18482213494825647, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.4404, + "step": 1628 + }, + { + "epoch": 0.18493566205817555, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.4603, + "step": 1629 + }, + { + "epoch": 0.18504918916809462, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.4375, + "step": 1630 + }, + { + "epoch": 0.1851627162780137, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.4427, + "step": 1631 + }, + { + "epoch": 0.18527624338793278, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.4439, + "step": 1632 + }, + { + "epoch": 0.18538977049785185, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.4491, + "step": 1633 + }, + { + "epoch": 0.18550329760777093, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.4486, + "step": 1634 + }, + { + "epoch": 0.18561682471769, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4597, + "step": 1635 + }, + { + "epoch": 0.18573035182760908, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.4445, + "step": 1636 + }, + { + "epoch": 0.18584387893752816, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4322, + "step": 1637 + }, + { + "epoch": 0.18595740604744723, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4408, + "step": 1638 + }, + { + "epoch": 0.1860709331573663, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.4643, + "step": 1639 + }, + { + "epoch": 0.18618446026728538, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.4536, + "step": 1640 + }, + { + "epoch": 0.18629798737720446, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.4587, + "step": 1641 + }, + { + "epoch": 0.18641151448712354, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.4356, + "step": 1642 + }, + { + "epoch": 0.1865250415970426, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4437, + "step": 1643 + }, + { + "epoch": 0.1866385687069617, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.4518, + "step": 1644 + }, + { + "epoch": 0.18675209581688076, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.4505, + "step": 1645 + }, + { + "epoch": 0.18686562292679984, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.4595, + "step": 1646 + }, + { + "epoch": 0.18697915003671892, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.432, + "step": 1647 + }, + { + "epoch": 0.187092677146638, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.4474, + "step": 1648 + }, + { + "epoch": 0.18720620425655707, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.4421, + "step": 1649 + }, + { + "epoch": 0.18731973136647614, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4705, + "step": 1650 + }, + { + "epoch": 0.18743325847639522, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.4448, + "step": 1651 + }, + { + "epoch": 0.1875467855863143, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.4486, + "step": 1652 + }, + { + "epoch": 0.18766031269623337, + "grad_norm": 0.5390625, + "learning_rate": 0.002, + "loss": 5.4351, + "step": 1653 + }, + { + "epoch": 0.18777383980615245, + "grad_norm": 0.5, + "learning_rate": 0.002, + "loss": 5.4437, + "step": 1654 + }, + { + "epoch": 0.18788736691607152, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.4236, + "step": 1655 + }, + { + "epoch": 0.1880008940259906, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.4349, + "step": 1656 + }, + { + "epoch": 0.18811442113590968, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.4658, + "step": 1657 + }, + { + "epoch": 0.18822794824582875, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.4347, + "step": 1658 + }, + { + "epoch": 0.18834147535574783, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.4195, + "step": 1659 + }, + { + "epoch": 0.18845500246566693, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.4737, + "step": 1660 + }, + { + "epoch": 0.188568529575586, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.4287, + "step": 1661 + }, + { + "epoch": 0.18868205668550508, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.457, + "step": 1662 + }, + { + "epoch": 0.18879558379542416, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.4479, + "step": 1663 + }, + { + "epoch": 0.18890911090534324, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.4372, + "step": 1664 + }, + { + "epoch": 0.1890226380152623, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.4425, + "step": 1665 + }, + { + "epoch": 0.1891361651251814, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.428, + "step": 1666 + }, + { + "epoch": 0.18924969223510046, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.4491, + "step": 1667 + }, + { + "epoch": 0.18936321934501954, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.4045, + "step": 1668 + }, + { + "epoch": 0.18947674645493862, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.4429, + "step": 1669 + }, + { + "epoch": 0.1895902735648577, + "grad_norm": 0.484375, + "learning_rate": 0.002, + "loss": 5.4346, + "step": 1670 + }, + { + "epoch": 0.18970380067477677, + "grad_norm": 0.45703125, + "learning_rate": 0.002, + "loss": 5.4292, + "step": 1671 + }, + { + "epoch": 0.18981732778469584, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.4513, + "step": 1672 + }, + { + "epoch": 0.18993085489461492, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.4545, + "step": 1673 + }, + { + "epoch": 0.190044382004534, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.4336, + "step": 1674 + }, + { + "epoch": 0.19015790911445307, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.4561, + "step": 1675 + }, + { + "epoch": 0.19027143622437215, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.4442, + "step": 1676 + }, + { + "epoch": 0.19038496333429122, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.4376, + "step": 1677 + }, + { + "epoch": 0.1904984904442103, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.4509, + "step": 1678 + }, + { + "epoch": 0.19061201755412938, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.4609, + "step": 1679 + }, + { + "epoch": 0.19072554466404845, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.434, + "step": 1680 + }, + { + "epoch": 0.19083907177396753, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.4381, + "step": 1681 + }, + { + "epoch": 0.1909525988838866, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4327, + "step": 1682 + }, + { + "epoch": 0.19106612599380568, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.4649, + "step": 1683 + }, + { + "epoch": 0.19117965310372476, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.4252, + "step": 1684 + }, + { + "epoch": 0.19129318021364383, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.425, + "step": 1685 + }, + { + "epoch": 0.1914067073235629, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.4499, + "step": 1686 + }, + { + "epoch": 0.19152023443348198, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4485, + "step": 1687 + }, + { + "epoch": 0.19163376154340106, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.4428, + "step": 1688 + }, + { + "epoch": 0.19174728865332014, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4555, + "step": 1689 + }, + { + "epoch": 0.1918608157632392, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.4447, + "step": 1690 + }, + { + "epoch": 0.1919743428731583, + "grad_norm": 0.462890625, + "learning_rate": 0.002, + "loss": 5.4394, + "step": 1691 + }, + { + "epoch": 0.19208786998307736, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.4614, + "step": 1692 + }, + { + "epoch": 0.19220139709299644, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.4278, + "step": 1693 + }, + { + "epoch": 0.19231492420291552, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.4706, + "step": 1694 + }, + { + "epoch": 0.1924284513128346, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.4458, + "step": 1695 + }, + { + "epoch": 0.19254197842275367, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.4402, + "step": 1696 + }, + { + "epoch": 0.19265550553267274, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.4508, + "step": 1697 + }, + { + "epoch": 0.19276903264259182, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.4455, + "step": 1698 + }, + { + "epoch": 0.1928825597525109, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.4254, + "step": 1699 + }, + { + "epoch": 0.19299608686242997, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.44, + "step": 1700 + }, + { + "epoch": 0.19310961397234905, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.451, + "step": 1701 + }, + { + "epoch": 0.19322314108226812, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.436, + "step": 1702 + }, + { + "epoch": 0.1933366681921872, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4104, + "step": 1703 + }, + { + "epoch": 0.19345019530210628, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4251, + "step": 1704 + }, + { + "epoch": 0.19356372241202535, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.4337, + "step": 1705 + }, + { + "epoch": 0.19367724952194443, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.4406, + "step": 1706 + }, + { + "epoch": 0.1937907766318635, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.439, + "step": 1707 + }, + { + "epoch": 0.19390430374178258, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.4343, + "step": 1708 + }, + { + "epoch": 0.19401783085170166, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.4254, + "step": 1709 + }, + { + "epoch": 0.19413135796162073, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4301, + "step": 1710 + }, + { + "epoch": 0.1942448850715398, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.4377, + "step": 1711 + }, + { + "epoch": 0.19435841218145888, + "grad_norm": 0.470703125, + "learning_rate": 0.002, + "loss": 5.4302, + "step": 1712 + }, + { + "epoch": 0.19447193929137796, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.4458, + "step": 1713 + }, + { + "epoch": 0.19458546640129704, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.4197, + "step": 1714 + }, + { + "epoch": 0.1946989935112161, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.4456, + "step": 1715 + }, + { + "epoch": 0.1948125206211352, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.4335, + "step": 1716 + }, + { + "epoch": 0.19492604773105426, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.4399, + "step": 1717 + }, + { + "epoch": 0.19503957484097334, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.4432, + "step": 1718 + }, + { + "epoch": 0.19515310195089242, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.4308, + "step": 1719 + }, + { + "epoch": 0.19526662906081152, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.4433, + "step": 1720 + }, + { + "epoch": 0.1953801561707306, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.4424, + "step": 1721 + }, + { + "epoch": 0.19549368328064967, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4281, + "step": 1722 + }, + { + "epoch": 0.19560721039056875, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.4521, + "step": 1723 + }, + { + "epoch": 0.19572073750048782, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.4382, + "step": 1724 + }, + { + "epoch": 0.1958342646104069, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4436, + "step": 1725 + }, + { + "epoch": 0.19594779172032598, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4182, + "step": 1726 + }, + { + "epoch": 0.19606131883024505, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4215, + "step": 1727 + }, + { + "epoch": 0.19617484594016413, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.425, + "step": 1728 + }, + { + "epoch": 0.1962883730500832, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4334, + "step": 1729 + }, + { + "epoch": 0.19640190016000228, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.4219, + "step": 1730 + }, + { + "epoch": 0.19651542726992136, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.4228, + "step": 1731 + }, + { + "epoch": 0.19662895437984043, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.4405, + "step": 1732 + }, + { + "epoch": 0.1967424814897595, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.4157, + "step": 1733 + }, + { + "epoch": 0.19685600859967858, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.4272, + "step": 1734 + }, + { + "epoch": 0.19696953570959766, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.426, + "step": 1735 + }, + { + "epoch": 0.19708306281951674, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.4255, + "step": 1736 + }, + { + "epoch": 0.1971965899294358, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.44, + "step": 1737 + }, + { + "epoch": 0.1973101170393549, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.4355, + "step": 1738 + }, + { + "epoch": 0.19742364414927396, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.4303, + "step": 1739 + }, + { + "epoch": 0.19753717125919304, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.4257, + "step": 1740 + }, + { + "epoch": 0.19765069836911212, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.4174, + "step": 1741 + }, + { + "epoch": 0.1977642254790312, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.45, + "step": 1742 + }, + { + "epoch": 0.19787775258895027, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4156, + "step": 1743 + }, + { + "epoch": 0.19799127969886934, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.4177, + "step": 1744 + }, + { + "epoch": 0.19810480680878842, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.4242, + "step": 1745 + }, + { + "epoch": 0.1982183339187075, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.416, + "step": 1746 + }, + { + "epoch": 0.19833186102862657, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.4304, + "step": 1747 + }, + { + "epoch": 0.19844538813854565, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.4186, + "step": 1748 + }, + { + "epoch": 0.19855891524846472, + "grad_norm": 0.435546875, + "learning_rate": 0.002, + "loss": 5.4159, + "step": 1749 + }, + { + "epoch": 0.1986724423583838, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.4203, + "step": 1750 + }, + { + "epoch": 0.19878596946830288, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.4175, + "step": 1751 + }, + { + "epoch": 0.19889949657822195, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.4461, + "step": 1752 + }, + { + "epoch": 0.19901302368814103, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.4222, + "step": 1753 + }, + { + "epoch": 0.1991265507980601, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4224, + "step": 1754 + }, + { + "epoch": 0.19924007790797918, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4232, + "step": 1755 + }, + { + "epoch": 0.19935360501789826, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.4121, + "step": 1756 + }, + { + "epoch": 0.19946713212781733, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.4381, + "step": 1757 + }, + { + "epoch": 0.1995806592377364, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.4171, + "step": 1758 + }, + { + "epoch": 0.19969418634765548, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.4242, + "step": 1759 + }, + { + "epoch": 0.19980771345757456, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4195, + "step": 1760 + }, + { + "epoch": 0.19992124056749364, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.4407, + "step": 1761 + }, + { + "epoch": 0.2000347676774127, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.4333, + "step": 1762 + }, + { + "epoch": 0.2001482947873318, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.3916, + "step": 1763 + }, + { + "epoch": 0.20026182189725086, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.4317, + "step": 1764 + }, + { + "epoch": 0.20037534900716994, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.4223, + "step": 1765 + }, + { + "epoch": 0.20048887611708902, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.4211, + "step": 1766 + }, + { + "epoch": 0.2006024032270081, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.443, + "step": 1767 + }, + { + "epoch": 0.20071593033692717, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.4414, + "step": 1768 + }, + { + "epoch": 0.20082945744684624, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.4441, + "step": 1769 + }, + { + "epoch": 0.20094298455676532, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.4197, + "step": 1770 + }, + { + "epoch": 0.2010565116666844, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4335, + "step": 1771 + }, + { + "epoch": 0.20117003877660347, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.4135, + "step": 1772 + }, + { + "epoch": 0.20128356588652255, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.415, + "step": 1773 + }, + { + "epoch": 0.20139709299644162, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.4429, + "step": 1774 + }, + { + "epoch": 0.2015106201063607, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.4239, + "step": 1775 + }, + { + "epoch": 0.20162414721627978, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.4177, + "step": 1776 + }, + { + "epoch": 0.20173767432619885, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.4501, + "step": 1777 + }, + { + "epoch": 0.20185120143611793, + "grad_norm": 0.48828125, + "learning_rate": 0.002, + "loss": 5.4213, + "step": 1778 + }, + { + "epoch": 0.201964728546037, + "grad_norm": 0.435546875, + "learning_rate": 0.002, + "loss": 5.4293, + "step": 1779 + }, + { + "epoch": 0.2020782556559561, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.4367, + "step": 1780 + }, + { + "epoch": 0.20219178276587518, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.4392, + "step": 1781 + }, + { + "epoch": 0.20230530987579426, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.4157, + "step": 1782 + }, + { + "epoch": 0.20241883698571334, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.4281, + "step": 1783 + }, + { + "epoch": 0.2025323640956324, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.3975, + "step": 1784 + }, + { + "epoch": 0.2026458912055515, + "grad_norm": 0.470703125, + "learning_rate": 0.002, + "loss": 5.4414, + "step": 1785 + }, + { + "epoch": 0.20275941831547056, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.4281, + "step": 1786 + }, + { + "epoch": 0.20287294542538964, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.4183, + "step": 1787 + }, + { + "epoch": 0.20298647253530872, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.4122, + "step": 1788 + }, + { + "epoch": 0.2030999996452278, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.4049, + "step": 1789 + }, + { + "epoch": 0.20321352675514687, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.4121, + "step": 1790 + }, + { + "epoch": 0.20332705386506594, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.4204, + "step": 1791 + }, + { + "epoch": 0.20344058097498502, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.4223, + "step": 1792 + }, + { + "epoch": 0.2035541080849041, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.4206, + "step": 1793 + }, + { + "epoch": 0.20366763519482317, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.4167, + "step": 1794 + }, + { + "epoch": 0.20378116230474225, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.4226, + "step": 1795 + }, + { + "epoch": 0.20389468941466132, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.4224, + "step": 1796 + }, + { + "epoch": 0.2040082165245804, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.4074, + "step": 1797 + }, + { + "epoch": 0.20412174363449948, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.4349, + "step": 1798 + }, + { + "epoch": 0.20423527074441855, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3955, + "step": 1799 + }, + { + "epoch": 0.20434879785433763, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.4145, + "step": 1800 + }, + { + "epoch": 0.2044623249642567, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.4225, + "step": 1801 + }, + { + "epoch": 0.20457585207417578, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.3982, + "step": 1802 + }, + { + "epoch": 0.20468937918409486, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.4133, + "step": 1803 + }, + { + "epoch": 0.20480290629401393, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.4301, + "step": 1804 + }, + { + "epoch": 0.204916433403933, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.4216, + "step": 1805 + }, + { + "epoch": 0.20502996051385208, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.4176, + "step": 1806 + }, + { + "epoch": 0.20514348762377116, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.4278, + "step": 1807 + }, + { + "epoch": 0.20525701473369024, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.4071, + "step": 1808 + }, + { + "epoch": 0.2053705418436093, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.4412, + "step": 1809 + }, + { + "epoch": 0.2054840689535284, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.4154, + "step": 1810 + }, + { + "epoch": 0.20559759606344746, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.4196, + "step": 1811 + }, + { + "epoch": 0.20571112317336654, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.4096, + "step": 1812 + }, + { + "epoch": 0.20582465028328562, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.411, + "step": 1813 + }, + { + "epoch": 0.2059381773932047, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.4079, + "step": 1814 + }, + { + "epoch": 0.20605170450312377, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.4057, + "step": 1815 + }, + { + "epoch": 0.20616523161304284, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.4242, + "step": 1816 + }, + { + "epoch": 0.20627875872296192, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.429, + "step": 1817 + }, + { + "epoch": 0.206392285832881, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.4308, + "step": 1818 + }, + { + "epoch": 0.20650581294280007, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.4171, + "step": 1819 + }, + { + "epoch": 0.20661934005271915, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.4254, + "step": 1820 + }, + { + "epoch": 0.20673286716263822, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.4287, + "step": 1821 + }, + { + "epoch": 0.2068463942725573, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.4344, + "step": 1822 + }, + { + "epoch": 0.20695992138247638, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.4188, + "step": 1823 + }, + { + "epoch": 0.20707344849239545, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.4039, + "step": 1824 + }, + { + "epoch": 0.20718697560231453, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.4286, + "step": 1825 + }, + { + "epoch": 0.2073005027122336, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.4135, + "step": 1826 + }, + { + "epoch": 0.20741402982215268, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.4281, + "step": 1827 + }, + { + "epoch": 0.20752755693207176, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4046, + "step": 1828 + }, + { + "epoch": 0.20764108404199083, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.4162, + "step": 1829 + }, + { + "epoch": 0.2077546111519099, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.421, + "step": 1830 + }, + { + "epoch": 0.20786813826182898, + "grad_norm": 0.498046875, + "learning_rate": 0.002, + "loss": 5.4429, + "step": 1831 + }, + { + "epoch": 0.20798166537174806, + "grad_norm": 0.455078125, + "learning_rate": 0.002, + "loss": 5.418, + "step": 1832 + }, + { + "epoch": 0.20809519248166714, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.412, + "step": 1833 + }, + { + "epoch": 0.2082087195915862, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.4358, + "step": 1834 + }, + { + "epoch": 0.2083222467015053, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.4257, + "step": 1835 + }, + { + "epoch": 0.20843577381142436, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.4159, + "step": 1836 + }, + { + "epoch": 0.20854930092134344, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.4374, + "step": 1837 + }, + { + "epoch": 0.20866282803126251, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.4029, + "step": 1838 + }, + { + "epoch": 0.2087763551411816, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3989, + "step": 1839 + }, + { + "epoch": 0.20888988225110067, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.4171, + "step": 1840 + }, + { + "epoch": 0.20900340936101977, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.422, + "step": 1841 + }, + { + "epoch": 0.20911693647093885, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.4012, + "step": 1842 + }, + { + "epoch": 0.20923046358085792, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.4251, + "step": 1843 + }, + { + "epoch": 0.209343990690777, + "grad_norm": 0.47265625, + "learning_rate": 0.002, + "loss": 5.416, + "step": 1844 + }, + { + "epoch": 0.20945751780069607, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.4226, + "step": 1845 + }, + { + "epoch": 0.20957104491061515, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.427, + "step": 1846 + }, + { + "epoch": 0.20968457202053423, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.4247, + "step": 1847 + }, + { + "epoch": 0.2097980991304533, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.4358, + "step": 1848 + }, + { + "epoch": 0.20991162624037238, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.391, + "step": 1849 + }, + { + "epoch": 0.21002515335029145, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.419, + "step": 1850 + }, + { + "epoch": 0.21013868046021053, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4086, + "step": 1851 + }, + { + "epoch": 0.2102522075701296, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.4202, + "step": 1852 + }, + { + "epoch": 0.21036573468004868, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.4064, + "step": 1853 + }, + { + "epoch": 0.21047926178996776, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.4271, + "step": 1854 + }, + { + "epoch": 0.21059278889988683, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.4143, + "step": 1855 + }, + { + "epoch": 0.2107063160098059, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3868, + "step": 1856 + }, + { + "epoch": 0.210819843119725, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.4158, + "step": 1857 + }, + { + "epoch": 0.21093337022964406, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.4217, + "step": 1858 + }, + { + "epoch": 0.21104689733956314, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.4094, + "step": 1859 + }, + { + "epoch": 0.21116042444948221, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.4318, + "step": 1860 + }, + { + "epoch": 0.2112739515594013, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.4094, + "step": 1861 + }, + { + "epoch": 0.21138747866932037, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.4279, + "step": 1862 + }, + { + "epoch": 0.21150100577923944, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.408, + "step": 1863 + }, + { + "epoch": 0.21161453288915852, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.4187, + "step": 1864 + }, + { + "epoch": 0.2117280599990776, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.4209, + "step": 1865 + }, + { + "epoch": 0.21184158710899667, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.412, + "step": 1866 + }, + { + "epoch": 0.21195511421891575, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.4038, + "step": 1867 + }, + { + "epoch": 0.21206864132883482, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.415, + "step": 1868 + }, + { + "epoch": 0.2121821684387539, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.4166, + "step": 1869 + }, + { + "epoch": 0.21229569554867297, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.4355, + "step": 1870 + }, + { + "epoch": 0.21240922265859205, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.423, + "step": 1871 + }, + { + "epoch": 0.21252274976851113, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4043, + "step": 1872 + }, + { + "epoch": 0.2126362768784302, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.4141, + "step": 1873 + }, + { + "epoch": 0.21274980398834928, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.4215, + "step": 1874 + }, + { + "epoch": 0.21286333109826835, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.4197, + "step": 1875 + }, + { + "epoch": 0.21297685820818743, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.4246, + "step": 1876 + }, + { + "epoch": 0.2130903853181065, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.4259, + "step": 1877 + }, + { + "epoch": 0.21320391242802558, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.428, + "step": 1878 + }, + { + "epoch": 0.21331743953794466, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.407, + "step": 1879 + }, + { + "epoch": 0.21343096664786373, + "grad_norm": 0.462890625, + "learning_rate": 0.002, + "loss": 5.4029, + "step": 1880 + }, + { + "epoch": 0.2135444937577828, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.4101, + "step": 1881 + }, + { + "epoch": 0.2136580208677019, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.4078, + "step": 1882 + }, + { + "epoch": 0.21377154797762096, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.441, + "step": 1883 + }, + { + "epoch": 0.21388507508754004, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.4151, + "step": 1884 + }, + { + "epoch": 0.21399860219745911, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.4206, + "step": 1885 + }, + { + "epoch": 0.2141121293073782, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.4111, + "step": 1886 + }, + { + "epoch": 0.21422565641729727, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.4172, + "step": 1887 + }, + { + "epoch": 0.21433918352721634, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.401, + "step": 1888 + }, + { + "epoch": 0.21445271063713542, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.4182, + "step": 1889 + }, + { + "epoch": 0.2145662377470545, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.3957, + "step": 1890 + }, + { + "epoch": 0.21467976485697357, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3993, + "step": 1891 + }, + { + "epoch": 0.21479329196689265, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.4031, + "step": 1892 + }, + { + "epoch": 0.21490681907681172, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4166, + "step": 1893 + }, + { + "epoch": 0.2150203461867308, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4179, + "step": 1894 + }, + { + "epoch": 0.21513387329664987, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.424, + "step": 1895 + }, + { + "epoch": 0.21524740040656895, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.4048, + "step": 1896 + }, + { + "epoch": 0.21536092751648803, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.4076, + "step": 1897 + }, + { + "epoch": 0.2154744546264071, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.4134, + "step": 1898 + }, + { + "epoch": 0.21558798173632618, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.4067, + "step": 1899 + }, + { + "epoch": 0.21570150884624525, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.3943, + "step": 1900 + }, + { + "epoch": 0.21581503595616436, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.406, + "step": 1901 + }, + { + "epoch": 0.21592856306608343, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3992, + "step": 1902 + }, + { + "epoch": 0.2160420901760025, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.4058, + "step": 1903 + }, + { + "epoch": 0.2161556172859216, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.4055, + "step": 1904 + }, + { + "epoch": 0.21626914439584066, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3939, + "step": 1905 + }, + { + "epoch": 0.21638267150575974, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.4179, + "step": 1906 + }, + { + "epoch": 0.21649619861567881, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.4217, + "step": 1907 + }, + { + "epoch": 0.2166097257255979, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.4194, + "step": 1908 + }, + { + "epoch": 0.21672325283551697, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3921, + "step": 1909 + }, + { + "epoch": 0.21683677994543604, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.4091, + "step": 1910 + }, + { + "epoch": 0.21695030705535512, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.41, + "step": 1911 + }, + { + "epoch": 0.2170638341652742, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.394, + "step": 1912 + }, + { + "epoch": 0.21717736127519327, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.4143, + "step": 1913 + }, + { + "epoch": 0.21729088838511235, + "grad_norm": 0.48828125, + "learning_rate": 0.002, + "loss": 5.4048, + "step": 1914 + }, + { + "epoch": 0.21740441549503142, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.4296, + "step": 1915 + }, + { + "epoch": 0.2175179426049505, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.3821, + "step": 1916 + }, + { + "epoch": 0.21763146971486957, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.4009, + "step": 1917 + }, + { + "epoch": 0.21774499682478865, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.4232, + "step": 1918 + }, + { + "epoch": 0.21785852393470773, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4186, + "step": 1919 + }, + { + "epoch": 0.2179720510446268, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.4007, + "step": 1920 + }, + { + "epoch": 0.21808557815454588, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.4072, + "step": 1921 + }, + { + "epoch": 0.21819910526446495, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.375, + "step": 1922 + }, + { + "epoch": 0.21831263237438403, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.4234, + "step": 1923 + }, + { + "epoch": 0.2184261594843031, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3999, + "step": 1924 + }, + { + "epoch": 0.21853968659422218, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3978, + "step": 1925 + }, + { + "epoch": 0.21865321370414126, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.4143, + "step": 1926 + }, + { + "epoch": 0.21876674081406033, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.4105, + "step": 1927 + }, + { + "epoch": 0.2188802679239794, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.4058, + "step": 1928 + }, + { + "epoch": 0.2189937950338985, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.4075, + "step": 1929 + }, + { + "epoch": 0.21910732214381756, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.4129, + "step": 1930 + }, + { + "epoch": 0.21922084925373664, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.4163, + "step": 1931 + }, + { + "epoch": 0.21933437636365571, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.4088, + "step": 1932 + }, + { + "epoch": 0.2194479034735748, + "grad_norm": 0.478515625, + "learning_rate": 0.002, + "loss": 5.3952, + "step": 1933 + }, + { + "epoch": 0.21956143058349387, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.382, + "step": 1934 + }, + { + "epoch": 0.21967495769341294, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.4053, + "step": 1935 + }, + { + "epoch": 0.21978848480333202, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.3779, + "step": 1936 + }, + { + "epoch": 0.2199020119132511, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3973, + "step": 1937 + }, + { + "epoch": 0.22001553902317017, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.4245, + "step": 1938 + }, + { + "epoch": 0.22012906613308925, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.4045, + "step": 1939 + }, + { + "epoch": 0.22024259324300832, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3981, + "step": 1940 + }, + { + "epoch": 0.2203561203529274, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.412, + "step": 1941 + }, + { + "epoch": 0.22046964746284647, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.4019, + "step": 1942 + }, + { + "epoch": 0.22058317457276555, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.4214, + "step": 1943 + }, + { + "epoch": 0.22069670168268463, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3993, + "step": 1944 + }, + { + "epoch": 0.2208102287926037, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.4013, + "step": 1945 + }, + { + "epoch": 0.22092375590252278, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.4114, + "step": 1946 + }, + { + "epoch": 0.22103728301244185, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.4081, + "step": 1947 + }, + { + "epoch": 0.22115081012236093, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.4156, + "step": 1948 + }, + { + "epoch": 0.22126433723228, + "grad_norm": 0.48046875, + "learning_rate": 0.002, + "loss": 5.4123, + "step": 1949 + }, + { + "epoch": 0.22137786434219908, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.4018, + "step": 1950 + }, + { + "epoch": 0.22149139145211816, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.387, + "step": 1951 + }, + { + "epoch": 0.22160491856203723, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3916, + "step": 1952 + }, + { + "epoch": 0.2217184456719563, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.4162, + "step": 1953 + }, + { + "epoch": 0.2218319727818754, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.377, + "step": 1954 + }, + { + "epoch": 0.22194549989179446, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3958, + "step": 1955 + }, + { + "epoch": 0.22205902700171354, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.4014, + "step": 1956 + }, + { + "epoch": 0.22217255411163261, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.4109, + "step": 1957 + }, + { + "epoch": 0.2222860812215517, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.4088, + "step": 1958 + }, + { + "epoch": 0.22239960833147077, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.4198, + "step": 1959 + }, + { + "epoch": 0.22251313544138984, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.4139, + "step": 1960 + }, + { + "epoch": 0.22262666255130895, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3946, + "step": 1961 + }, + { + "epoch": 0.22274018966122802, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.4308, + "step": 1962 + }, + { + "epoch": 0.2228537167711471, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.4132, + "step": 1963 + }, + { + "epoch": 0.22296724388106617, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.4049, + "step": 1964 + }, + { + "epoch": 0.22308077099098525, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.4155, + "step": 1965 + }, + { + "epoch": 0.22319429810090433, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3751, + "step": 1966 + }, + { + "epoch": 0.2233078252108234, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3814, + "step": 1967 + }, + { + "epoch": 0.22342135232074248, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4234, + "step": 1968 + }, + { + "epoch": 0.22353487943066155, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3833, + "step": 1969 + }, + { + "epoch": 0.22364840654058063, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.4179, + "step": 1970 + }, + { + "epoch": 0.2237619336504997, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.388, + "step": 1971 + }, + { + "epoch": 0.22387546076041878, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.4069, + "step": 1972 + }, + { + "epoch": 0.22398898787033786, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.3993, + "step": 1973 + }, + { + "epoch": 0.22410251498025693, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.4097, + "step": 1974 + }, + { + "epoch": 0.224216042090176, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.4118, + "step": 1975 + }, + { + "epoch": 0.22432956920009509, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.4064, + "step": 1976 + }, + { + "epoch": 0.22444309631001416, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.4124, + "step": 1977 + }, + { + "epoch": 0.22455662341993324, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.4048, + "step": 1978 + }, + { + "epoch": 0.2246701505298523, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.4064, + "step": 1979 + }, + { + "epoch": 0.2247836776397714, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3858, + "step": 1980 + }, + { + "epoch": 0.22489720474969047, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.4007, + "step": 1981 + }, + { + "epoch": 0.22501073185960954, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.4041, + "step": 1982 + }, + { + "epoch": 0.22512425896952862, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.3858, + "step": 1983 + }, + { + "epoch": 0.2252377860794477, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.4045, + "step": 1984 + }, + { + "epoch": 0.22535131318936677, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.403, + "step": 1985 + }, + { + "epoch": 0.22546484029928585, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3755, + "step": 1986 + }, + { + "epoch": 0.22557836740920492, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3899, + "step": 1987 + }, + { + "epoch": 0.225691894519124, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.4022, + "step": 1988 + }, + { + "epoch": 0.22580542162904307, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.389, + "step": 1989 + }, + { + "epoch": 0.22591894873896215, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.4022, + "step": 1990 + }, + { + "epoch": 0.22603247584888123, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3991, + "step": 1991 + }, + { + "epoch": 0.2261460029588003, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3909, + "step": 1992 + }, + { + "epoch": 0.22625953006871938, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3888, + "step": 1993 + }, + { + "epoch": 0.22637305717863845, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.4074, + "step": 1994 + }, + { + "epoch": 0.22648658428855753, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.4051, + "step": 1995 + }, + { + "epoch": 0.2266001113984766, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.3952, + "step": 1996 + }, + { + "epoch": 0.22671363850839568, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.4027, + "step": 1997 + }, + { + "epoch": 0.22682716561831476, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.4056, + "step": 1998 + }, + { + "epoch": 0.22694069272823383, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.4077, + "step": 1999 + }, + { + "epoch": 0.2270542198381529, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.4049, + "step": 2000 + }, + { + "epoch": 0.22716774694807199, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.4031, + "step": 2001 + }, + { + "epoch": 0.22728127405799106, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.4046, + "step": 2002 + }, + { + "epoch": 0.22739480116791014, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.3963, + "step": 2003 + }, + { + "epoch": 0.2275083282778292, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.3966, + "step": 2004 + }, + { + "epoch": 0.2276218553877483, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.3972, + "step": 2005 + }, + { + "epoch": 0.22773538249766737, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.4023, + "step": 2006 + }, + { + "epoch": 0.22784890960758644, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3935, + "step": 2007 + }, + { + "epoch": 0.22796243671750552, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.4097, + "step": 2008 + }, + { + "epoch": 0.2280759638274246, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3915, + "step": 2009 + }, + { + "epoch": 0.22818949093734367, + "grad_norm": 0.470703125, + "learning_rate": 0.002, + "loss": 5.4088, + "step": 2010 + }, + { + "epoch": 0.22830301804726275, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.3922, + "step": 2011 + }, + { + "epoch": 0.22841654515718182, + "grad_norm": 0.451171875, + "learning_rate": 0.002, + "loss": 5.3775, + "step": 2012 + }, + { + "epoch": 0.2285300722671009, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.401, + "step": 2013 + }, + { + "epoch": 0.22864359937701997, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.41, + "step": 2014 + }, + { + "epoch": 0.22875712648693905, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.4001, + "step": 2015 + }, + { + "epoch": 0.22887065359685813, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3902, + "step": 2016 + }, + { + "epoch": 0.2289841807067772, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.4054, + "step": 2017 + }, + { + "epoch": 0.22909770781669628, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3748, + "step": 2018 + }, + { + "epoch": 0.22921123492661535, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.3984, + "step": 2019 + }, + { + "epoch": 0.22932476203653443, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.3988, + "step": 2020 + }, + { + "epoch": 0.2294382891464535, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3763, + "step": 2021 + }, + { + "epoch": 0.2295518162563726, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.385, + "step": 2022 + }, + { + "epoch": 0.22966534336629169, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.4149, + "step": 2023 + }, + { + "epoch": 0.22977887047621076, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4083, + "step": 2024 + }, + { + "epoch": 0.22989239758612984, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3961, + "step": 2025 + }, + { + "epoch": 0.2300059246960489, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3852, + "step": 2026 + }, + { + "epoch": 0.230119451805968, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.385, + "step": 2027 + }, + { + "epoch": 0.23023297891588707, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.4009, + "step": 2028 + }, + { + "epoch": 0.23034650602580614, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4018, + "step": 2029 + }, + { + "epoch": 0.23046003313572522, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4244, + "step": 2030 + }, + { + "epoch": 0.2305735602456443, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.4061, + "step": 2031 + }, + { + "epoch": 0.23068708735556337, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3881, + "step": 2032 + }, + { + "epoch": 0.23080061446548245, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.4068, + "step": 2033 + }, + { + "epoch": 0.23091414157540152, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3866, + "step": 2034 + }, + { + "epoch": 0.2310276686853206, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.385, + "step": 2035 + }, + { + "epoch": 0.23114119579523967, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.4174, + "step": 2036 + }, + { + "epoch": 0.23125472290515875, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.411, + "step": 2037 + }, + { + "epoch": 0.23136825001507783, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.3917, + "step": 2038 + }, + { + "epoch": 0.2314817771249969, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.4043, + "step": 2039 + }, + { + "epoch": 0.23159530423491598, + "grad_norm": 0.44140625, + "learning_rate": 0.002, + "loss": 5.4053, + "step": 2040 + }, + { + "epoch": 0.23170883134483505, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.4052, + "step": 2041 + }, + { + "epoch": 0.23182235845475413, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.4109, + "step": 2042 + }, + { + "epoch": 0.2319358855646732, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3953, + "step": 2043 + }, + { + "epoch": 0.23204941267459228, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3913, + "step": 2044 + }, + { + "epoch": 0.23216293978451136, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3846, + "step": 2045 + }, + { + "epoch": 0.23227646689443043, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.4065, + "step": 2046 + }, + { + "epoch": 0.2323899940043495, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3722, + "step": 2047 + }, + { + "epoch": 0.23250352111426859, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.402, + "step": 2048 + }, + { + "epoch": 0.23261704822418766, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3825, + "step": 2049 + }, + { + "epoch": 0.23273057533410674, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3987, + "step": 2050 + }, + { + "epoch": 0.2328441024440258, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.4239, + "step": 2051 + }, + { + "epoch": 0.2329576295539449, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3948, + "step": 2052 + }, + { + "epoch": 0.23307115666386397, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3854, + "step": 2053 + }, + { + "epoch": 0.23318468377378304, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3738, + "step": 2054 + }, + { + "epoch": 0.23329821088370212, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3851, + "step": 2055 + }, + { + "epoch": 0.2334117379936212, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.4038, + "step": 2056 + }, + { + "epoch": 0.23352526510354027, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.401, + "step": 2057 + }, + { + "epoch": 0.23363879221345935, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.3975, + "step": 2058 + }, + { + "epoch": 0.23375231932337842, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3579, + "step": 2059 + }, + { + "epoch": 0.2338658464332975, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.3699, + "step": 2060 + }, + { + "epoch": 0.23397937354321657, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3939, + "step": 2061 + }, + { + "epoch": 0.23409290065313565, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.4082, + "step": 2062 + }, + { + "epoch": 0.23420642776305473, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3757, + "step": 2063 + }, + { + "epoch": 0.2343199548729738, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.3791, + "step": 2064 + }, + { + "epoch": 0.23443348198289288, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.4254, + "step": 2065 + }, + { + "epoch": 0.23454700909281195, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.4129, + "step": 2066 + }, + { + "epoch": 0.23466053620273103, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.4029, + "step": 2067 + }, + { + "epoch": 0.2347740633126501, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3978, + "step": 2068 + }, + { + "epoch": 0.23488759042256918, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.3829, + "step": 2069 + }, + { + "epoch": 0.23500111753248826, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.419, + "step": 2070 + }, + { + "epoch": 0.23511464464240733, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3789, + "step": 2071 + }, + { + "epoch": 0.2352281717523264, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.4004, + "step": 2072 + }, + { + "epoch": 0.23534169886224549, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.4018, + "step": 2073 + }, + { + "epoch": 0.23545522597216456, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.4067, + "step": 2074 + }, + { + "epoch": 0.23556875308208364, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.4032, + "step": 2075 + }, + { + "epoch": 0.2356822801920027, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.4079, + "step": 2076 + }, + { + "epoch": 0.2357958073019218, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.3909, + "step": 2077 + }, + { + "epoch": 0.23590933441184087, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3813, + "step": 2078 + }, + { + "epoch": 0.23602286152175994, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.3918, + "step": 2079 + }, + { + "epoch": 0.23613638863167902, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3936, + "step": 2080 + }, + { + "epoch": 0.2362499157415981, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3989, + "step": 2081 + }, + { + "epoch": 0.2363634428515172, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.4042, + "step": 2082 + }, + { + "epoch": 0.23647696996143627, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.3853, + "step": 2083 + }, + { + "epoch": 0.23659049707135535, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3921, + "step": 2084 + }, + { + "epoch": 0.23670402418127443, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.4004, + "step": 2085 + }, + { + "epoch": 0.2368175512911935, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3765, + "step": 2086 + }, + { + "epoch": 0.23693107840111258, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.4073, + "step": 2087 + }, + { + "epoch": 0.23704460551103165, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.393, + "step": 2088 + }, + { + "epoch": 0.23715813262095073, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.4015, + "step": 2089 + }, + { + "epoch": 0.2372716597308698, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3903, + "step": 2090 + }, + { + "epoch": 0.23738518684078888, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.4097, + "step": 2091 + }, + { + "epoch": 0.23749871395070796, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3742, + "step": 2092 + }, + { + "epoch": 0.23761224106062703, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3701, + "step": 2093 + }, + { + "epoch": 0.2377257681705461, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3913, + "step": 2094 + }, + { + "epoch": 0.23783929528046518, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3918, + "step": 2095 + }, + { + "epoch": 0.23795282239038426, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3823, + "step": 2096 + }, + { + "epoch": 0.23806634950030334, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3811, + "step": 2097 + }, + { + "epoch": 0.2381798766102224, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3858, + "step": 2098 + }, + { + "epoch": 0.2382934037201415, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3962, + "step": 2099 + }, + { + "epoch": 0.23840693083006056, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3937, + "step": 2100 + }, + { + "epoch": 0.23852045793997964, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.4131, + "step": 2101 + }, + { + "epoch": 0.23863398504989872, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3794, + "step": 2102 + }, + { + "epoch": 0.2387475121598178, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3989, + "step": 2103 + }, + { + "epoch": 0.23886103926973687, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3994, + "step": 2104 + }, + { + "epoch": 0.23897456637965594, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3824, + "step": 2105 + }, + { + "epoch": 0.23908809348957502, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.397, + "step": 2106 + }, + { + "epoch": 0.2392016205994941, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.4024, + "step": 2107 + }, + { + "epoch": 0.23931514770941317, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.4112, + "step": 2108 + }, + { + "epoch": 0.23942867481933225, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3826, + "step": 2109 + }, + { + "epoch": 0.23954220192925132, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.376, + "step": 2110 + }, + { + "epoch": 0.2396557290391704, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3851, + "step": 2111 + }, + { + "epoch": 0.23976925614908948, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3876, + "step": 2112 + }, + { + "epoch": 0.23988278325900855, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3778, + "step": 2113 + }, + { + "epoch": 0.23999631036892763, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3819, + "step": 2114 + }, + { + "epoch": 0.2401098374788467, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.392, + "step": 2115 + }, + { + "epoch": 0.24022336458876578, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3932, + "step": 2116 + }, + { + "epoch": 0.24033689169868486, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3922, + "step": 2117 + }, + { + "epoch": 0.24045041880860393, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.408, + "step": 2118 + }, + { + "epoch": 0.240563945918523, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3798, + "step": 2119 + }, + { + "epoch": 0.24067747302844208, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3892, + "step": 2120 + }, + { + "epoch": 0.24079100013836116, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.3991, + "step": 2121 + }, + { + "epoch": 0.24090452724828024, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.3892, + "step": 2122 + }, + { + "epoch": 0.2410180543581993, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.3834, + "step": 2123 + }, + { + "epoch": 0.2411315814681184, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.398, + "step": 2124 + }, + { + "epoch": 0.24124510857803746, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3822, + "step": 2125 + }, + { + "epoch": 0.24135863568795654, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.3883, + "step": 2126 + }, + { + "epoch": 0.24147216279787562, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3846, + "step": 2127 + }, + { + "epoch": 0.2415856899077947, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3829, + "step": 2128 + }, + { + "epoch": 0.24169921701771377, + "grad_norm": 0.4765625, + "learning_rate": 0.002, + "loss": 5.3777, + "step": 2129 + }, + { + "epoch": 0.24181274412763284, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.4042, + "step": 2130 + }, + { + "epoch": 0.24192627123755192, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.3933, + "step": 2131 + }, + { + "epoch": 0.242039798347471, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.3824, + "step": 2132 + }, + { + "epoch": 0.24215332545739007, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3863, + "step": 2133 + }, + { + "epoch": 0.24226685256730915, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3743, + "step": 2134 + }, + { + "epoch": 0.24238037967722822, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.382, + "step": 2135 + }, + { + "epoch": 0.2424939067871473, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3797, + "step": 2136 + }, + { + "epoch": 0.24260743389706638, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3915, + "step": 2137 + }, + { + "epoch": 0.24272096100698545, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3877, + "step": 2138 + }, + { + "epoch": 0.24283448811690453, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3841, + "step": 2139 + }, + { + "epoch": 0.2429480152268236, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3565, + "step": 2140 + }, + { + "epoch": 0.24306154233674268, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.3835, + "step": 2141 + }, + { + "epoch": 0.24317506944666178, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3726, + "step": 2142 + }, + { + "epoch": 0.24328859655658086, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3656, + "step": 2143 + }, + { + "epoch": 0.24340212366649994, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3759, + "step": 2144 + }, + { + "epoch": 0.243515650776419, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3909, + "step": 2145 + }, + { + "epoch": 0.2436291778863381, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3735, + "step": 2146 + }, + { + "epoch": 0.24374270499625716, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3757, + "step": 2147 + }, + { + "epoch": 0.24385623210617624, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.3903, + "step": 2148 + }, + { + "epoch": 0.24396975921609532, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3924, + "step": 2149 + }, + { + "epoch": 0.2440832863260144, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3989, + "step": 2150 + }, + { + "epoch": 0.24419681343593347, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3725, + "step": 2151 + }, + { + "epoch": 0.24431034054585254, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3858, + "step": 2152 + }, + { + "epoch": 0.24442386765577162, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3765, + "step": 2153 + }, + { + "epoch": 0.2445373947656907, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3717, + "step": 2154 + }, + { + "epoch": 0.24465092187560977, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.3982, + "step": 2155 + }, + { + "epoch": 0.24476444898552885, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.3693, + "step": 2156 + }, + { + "epoch": 0.24487797609544792, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.3722, + "step": 2157 + }, + { + "epoch": 0.244991503205367, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.3723, + "step": 2158 + }, + { + "epoch": 0.24510503031528608, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.4009, + "step": 2159 + }, + { + "epoch": 0.24521855742520515, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3717, + "step": 2160 + }, + { + "epoch": 0.24533208453512423, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.378, + "step": 2161 + }, + { + "epoch": 0.2454456116450433, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3653, + "step": 2162 + }, + { + "epoch": 0.24555913875496238, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3694, + "step": 2163 + }, + { + "epoch": 0.24567266586488146, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3713, + "step": 2164 + }, + { + "epoch": 0.24578619297480053, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3894, + "step": 2165 + }, + { + "epoch": 0.2458997200847196, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3828, + "step": 2166 + }, + { + "epoch": 0.24601324719463868, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3866, + "step": 2167 + }, + { + "epoch": 0.24612677430455776, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3773, + "step": 2168 + }, + { + "epoch": 0.24624030141447684, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.3887, + "step": 2169 + }, + { + "epoch": 0.2463538285243959, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.3628, + "step": 2170 + }, + { + "epoch": 0.246467355634315, + "grad_norm": 0.48828125, + "learning_rate": 0.002, + "loss": 5.3883, + "step": 2171 + }, + { + "epoch": 0.24658088274423406, + "grad_norm": 0.490234375, + "learning_rate": 0.002, + "loss": 5.3939, + "step": 2172 + }, + { + "epoch": 0.24669440985415314, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.3899, + "step": 2173 + }, + { + "epoch": 0.24680793696407222, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3814, + "step": 2174 + }, + { + "epoch": 0.2469214640739913, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3869, + "step": 2175 + }, + { + "epoch": 0.24703499118391037, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3761, + "step": 2176 + }, + { + "epoch": 0.24714851829382944, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.4045, + "step": 2177 + }, + { + "epoch": 0.24726204540374852, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.359, + "step": 2178 + }, + { + "epoch": 0.2473755725136676, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3757, + "step": 2179 + }, + { + "epoch": 0.24748909962358667, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3918, + "step": 2180 + }, + { + "epoch": 0.24760262673350575, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3606, + "step": 2181 + }, + { + "epoch": 0.24771615384342482, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3714, + "step": 2182 + }, + { + "epoch": 0.2478296809533439, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3626, + "step": 2183 + }, + { + "epoch": 0.24794320806326298, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3666, + "step": 2184 + }, + { + "epoch": 0.24805673517318205, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3706, + "step": 2185 + }, + { + "epoch": 0.24817026228310113, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3722, + "step": 2186 + }, + { + "epoch": 0.2482837893930202, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3818, + "step": 2187 + }, + { + "epoch": 0.24839731650293928, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3888, + "step": 2188 + }, + { + "epoch": 0.24851084361285836, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.3918, + "step": 2189 + }, + { + "epoch": 0.24862437072277743, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.3825, + "step": 2190 + }, + { + "epoch": 0.2487378978326965, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.3804, + "step": 2191 + }, + { + "epoch": 0.24885142494261558, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.3776, + "step": 2192 + }, + { + "epoch": 0.24896495205253466, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3857, + "step": 2193 + }, + { + "epoch": 0.24907847916245374, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.3733, + "step": 2194 + }, + { + "epoch": 0.2491920062723728, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.3628, + "step": 2195 + }, + { + "epoch": 0.2493055333822919, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3954, + "step": 2196 + }, + { + "epoch": 0.24941906049221096, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.38, + "step": 2197 + }, + { + "epoch": 0.24953258760213004, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.3786, + "step": 2198 + }, + { + "epoch": 0.24964611471204912, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.3641, + "step": 2199 + }, + { + "epoch": 0.2497596418219682, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3888, + "step": 2200 + }, + { + "epoch": 0.24987316893188727, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3921, + "step": 2201 + }, + { + "epoch": 0.24998669604180634, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3643, + "step": 2202 + }, + { + "epoch": 0.2501002231517254, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3762, + "step": 2203 + }, + { + "epoch": 0.2502137502616445, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3679, + "step": 2204 + }, + { + "epoch": 0.2503272773715636, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.358, + "step": 2205 + }, + { + "epoch": 0.25044080448148265, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.3759, + "step": 2206 + }, + { + "epoch": 0.2505543315914017, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.3565, + "step": 2207 + }, + { + "epoch": 0.2506678587013208, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3774, + "step": 2208 + }, + { + "epoch": 0.2507813858112399, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3796, + "step": 2209 + }, + { + "epoch": 0.25089491292115895, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3725, + "step": 2210 + }, + { + "epoch": 0.25100844003107803, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3791, + "step": 2211 + }, + { + "epoch": 0.2511219671409971, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3773, + "step": 2212 + }, + { + "epoch": 0.2512354942509162, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3767, + "step": 2213 + }, + { + "epoch": 0.25134902136083526, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3853, + "step": 2214 + }, + { + "epoch": 0.25146254847075433, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.388, + "step": 2215 + }, + { + "epoch": 0.2515760755806734, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3691, + "step": 2216 + }, + { + "epoch": 0.2516896026905925, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3731, + "step": 2217 + }, + { + "epoch": 0.25180312980051156, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.4074, + "step": 2218 + }, + { + "epoch": 0.25191665691043064, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3913, + "step": 2219 + }, + { + "epoch": 0.2520301840203497, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3942, + "step": 2220 + }, + { + "epoch": 0.2521437111302688, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3758, + "step": 2221 + }, + { + "epoch": 0.25225723824018786, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.372, + "step": 2222 + }, + { + "epoch": 0.25237076535010694, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3535, + "step": 2223 + }, + { + "epoch": 0.252484292460026, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.385, + "step": 2224 + }, + { + "epoch": 0.2525978195699451, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3681, + "step": 2225 + }, + { + "epoch": 0.25271134667986417, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3894, + "step": 2226 + }, + { + "epoch": 0.25282487378978324, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3829, + "step": 2227 + }, + { + "epoch": 0.2529384008997023, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3662, + "step": 2228 + }, + { + "epoch": 0.2530519280096214, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3852, + "step": 2229 + }, + { + "epoch": 0.2531654551195405, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3736, + "step": 2230 + }, + { + "epoch": 0.25327898222945955, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3798, + "step": 2231 + }, + { + "epoch": 0.2533925093393786, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.3794, + "step": 2232 + }, + { + "epoch": 0.25350603644929776, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.3789, + "step": 2233 + }, + { + "epoch": 0.25361956355921683, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3801, + "step": 2234 + }, + { + "epoch": 0.2537330906691359, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.3562, + "step": 2235 + }, + { + "epoch": 0.253846617779055, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3646, + "step": 2236 + }, + { + "epoch": 0.25396014488897406, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.3798, + "step": 2237 + }, + { + "epoch": 0.25407367199889314, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.3857, + "step": 2238 + }, + { + "epoch": 0.2541871991088122, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.3707, + "step": 2239 + }, + { + "epoch": 0.2543007262187313, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.3886, + "step": 2240 + }, + { + "epoch": 0.25441425332865036, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.3673, + "step": 2241 + }, + { + "epoch": 0.25452778043856944, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.3728, + "step": 2242 + }, + { + "epoch": 0.2546413075484885, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.4026, + "step": 2243 + }, + { + "epoch": 0.2547548346584076, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3839, + "step": 2244 + }, + { + "epoch": 0.25486836176832667, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3835, + "step": 2245 + }, + { + "epoch": 0.25498188887824574, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3616, + "step": 2246 + }, + { + "epoch": 0.2550954159881648, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3724, + "step": 2247 + }, + { + "epoch": 0.2552089430980839, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3684, + "step": 2248 + }, + { + "epoch": 0.25532247020800297, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3784, + "step": 2249 + }, + { + "epoch": 0.25543599731792205, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3811, + "step": 2250 + }, + { + "epoch": 0.2555495244278411, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3619, + "step": 2251 + }, + { + "epoch": 0.2556630515377602, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3966, + "step": 2252 + }, + { + "epoch": 0.2557765786476793, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3597, + "step": 2253 + }, + { + "epoch": 0.25589010575759835, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3571, + "step": 2254 + }, + { + "epoch": 0.2560036328675174, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3814, + "step": 2255 + }, + { + "epoch": 0.2561171599774365, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3683, + "step": 2256 + }, + { + "epoch": 0.2562306870873556, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.367, + "step": 2257 + }, + { + "epoch": 0.25634421419727466, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3781, + "step": 2258 + }, + { + "epoch": 0.25645774130719373, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3797, + "step": 2259 + }, + { + "epoch": 0.2565712684171128, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3751, + "step": 2260 + }, + { + "epoch": 0.2566847955270319, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3896, + "step": 2261 + }, + { + "epoch": 0.25679832263695096, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.3775, + "step": 2262 + }, + { + "epoch": 0.25691184974687004, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.3752, + "step": 2263 + }, + { + "epoch": 0.2570253768567891, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.3723, + "step": 2264 + }, + { + "epoch": 0.2571389039667082, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3591, + "step": 2265 + }, + { + "epoch": 0.25725243107662726, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3745, + "step": 2266 + }, + { + "epoch": 0.25736595818654634, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.3634, + "step": 2267 + }, + { + "epoch": 0.2574794852964654, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3927, + "step": 2268 + }, + { + "epoch": 0.2575930124063845, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3695, + "step": 2269 + }, + { + "epoch": 0.25770653951630357, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.3797, + "step": 2270 + }, + { + "epoch": 0.25782006662622264, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3603, + "step": 2271 + }, + { + "epoch": 0.2579335937361417, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.3724, + "step": 2272 + }, + { + "epoch": 0.2580471208460608, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.3726, + "step": 2273 + }, + { + "epoch": 0.25816064795597987, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3601, + "step": 2274 + }, + { + "epoch": 0.25827417506589895, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3731, + "step": 2275 + }, + { + "epoch": 0.258387702175818, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3855, + "step": 2276 + }, + { + "epoch": 0.2585012292857371, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.3519, + "step": 2277 + }, + { + "epoch": 0.2586147563956562, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.3801, + "step": 2278 + }, + { + "epoch": 0.25872828350557525, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.3835, + "step": 2279 + }, + { + "epoch": 0.2588418106154943, + "grad_norm": 0.466796875, + "learning_rate": 0.002, + "loss": 5.3921, + "step": 2280 + }, + { + "epoch": 0.2589553377254134, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.3824, + "step": 2281 + }, + { + "epoch": 0.2590688648353325, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.3546, + "step": 2282 + }, + { + "epoch": 0.25918239194525156, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.3792, + "step": 2283 + }, + { + "epoch": 0.25929591905517063, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.3717, + "step": 2284 + }, + { + "epoch": 0.2594094461650897, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.3755, + "step": 2285 + }, + { + "epoch": 0.2595229732750088, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.3605, + "step": 2286 + }, + { + "epoch": 0.25963650038492786, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3559, + "step": 2287 + }, + { + "epoch": 0.25975002749484694, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3695, + "step": 2288 + }, + { + "epoch": 0.259863554604766, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3625, + "step": 2289 + }, + { + "epoch": 0.2599770817146851, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3741, + "step": 2290 + }, + { + "epoch": 0.26009060882460416, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3482, + "step": 2291 + }, + { + "epoch": 0.26020413593452324, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3818, + "step": 2292 + }, + { + "epoch": 0.2603176630444423, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3687, + "step": 2293 + }, + { + "epoch": 0.2604311901543614, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.3785, + "step": 2294 + }, + { + "epoch": 0.26054471726428047, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3809, + "step": 2295 + }, + { + "epoch": 0.26065824437419954, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.3827, + "step": 2296 + }, + { + "epoch": 0.2607717714841186, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3752, + "step": 2297 + }, + { + "epoch": 0.2608852985940377, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3729, + "step": 2298 + }, + { + "epoch": 0.26099882570395677, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3809, + "step": 2299 + }, + { + "epoch": 0.26111235281387585, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.3625, + "step": 2300 + }, + { + "epoch": 0.2612258799237949, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3655, + "step": 2301 + }, + { + "epoch": 0.261339407033714, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3651, + "step": 2302 + }, + { + "epoch": 0.2614529341436331, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3579, + "step": 2303 + }, + { + "epoch": 0.26156646125355215, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3849, + "step": 2304 + }, + { + "epoch": 0.2616799883634712, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3786, + "step": 2305 + }, + { + "epoch": 0.2617935154733903, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3629, + "step": 2306 + }, + { + "epoch": 0.2619070425833094, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.3586, + "step": 2307 + }, + { + "epoch": 0.26202056969322846, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.381, + "step": 2308 + }, + { + "epoch": 0.26213409680314753, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3641, + "step": 2309 + }, + { + "epoch": 0.2622476239130666, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.37, + "step": 2310 + }, + { + "epoch": 0.2623611510229857, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.393, + "step": 2311 + }, + { + "epoch": 0.26247467813290476, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3671, + "step": 2312 + }, + { + "epoch": 0.26258820524282384, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3687, + "step": 2313 + }, + { + "epoch": 0.2627017323527429, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3809, + "step": 2314 + }, + { + "epoch": 0.262815259462662, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.3712, + "step": 2315 + }, + { + "epoch": 0.26292878657258106, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.3449, + "step": 2316 + }, + { + "epoch": 0.26304231368250014, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.3721, + "step": 2317 + }, + { + "epoch": 0.2631558407924192, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.3897, + "step": 2318 + }, + { + "epoch": 0.2632693679023383, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.3531, + "step": 2319 + }, + { + "epoch": 0.26338289501225737, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3663, + "step": 2320 + }, + { + "epoch": 0.26349642212217644, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3661, + "step": 2321 + }, + { + "epoch": 0.2636099492320955, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3477, + "step": 2322 + }, + { + "epoch": 0.2637234763420146, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3773, + "step": 2323 + }, + { + "epoch": 0.26383700345193367, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3576, + "step": 2324 + }, + { + "epoch": 0.26395053056185275, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3664, + "step": 2325 + }, + { + "epoch": 0.2640640576717718, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3741, + "step": 2326 + }, + { + "epoch": 0.2641775847816909, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3485, + "step": 2327 + }, + { + "epoch": 0.26429111189161, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3624, + "step": 2328 + }, + { + "epoch": 0.26440463900152905, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3582, + "step": 2329 + }, + { + "epoch": 0.2645181661114481, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3683, + "step": 2330 + }, + { + "epoch": 0.2646316932213672, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3427, + "step": 2331 + }, + { + "epoch": 0.2647452203312863, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3618, + "step": 2332 + }, + { + "epoch": 0.26485874744120536, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3724, + "step": 2333 + }, + { + "epoch": 0.26497227455112443, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.3807, + "step": 2334 + }, + { + "epoch": 0.2650858016610435, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3608, + "step": 2335 + }, + { + "epoch": 0.2651993287709626, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3692, + "step": 2336 + }, + { + "epoch": 0.26531285588088166, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3816, + "step": 2337 + }, + { + "epoch": 0.26542638299080074, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3728, + "step": 2338 + }, + { + "epoch": 0.2655399101007198, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3493, + "step": 2339 + }, + { + "epoch": 0.2656534372106389, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.365, + "step": 2340 + }, + { + "epoch": 0.26576696432055796, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.3621, + "step": 2341 + }, + { + "epoch": 0.26588049143047704, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.3872, + "step": 2342 + }, + { + "epoch": 0.2659940185403961, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.356, + "step": 2343 + }, + { + "epoch": 0.2661075456503152, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3688, + "step": 2344 + }, + { + "epoch": 0.26622107276023427, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.3744, + "step": 2345 + }, + { + "epoch": 0.26633459987015334, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3645, + "step": 2346 + }, + { + "epoch": 0.2664481269800724, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3512, + "step": 2347 + }, + { + "epoch": 0.2665616540899915, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3718, + "step": 2348 + }, + { + "epoch": 0.26667518119991057, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3796, + "step": 2349 + }, + { + "epoch": 0.26678870830982965, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.3652, + "step": 2350 + }, + { + "epoch": 0.2669022354197487, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.3977, + "step": 2351 + }, + { + "epoch": 0.2670157625296678, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3615, + "step": 2352 + }, + { + "epoch": 0.26712928963958693, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3442, + "step": 2353 + }, + { + "epoch": 0.267242816749506, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3815, + "step": 2354 + }, + { + "epoch": 0.2673563438594251, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3595, + "step": 2355 + }, + { + "epoch": 0.26746987096934416, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3547, + "step": 2356 + }, + { + "epoch": 0.26758339807926323, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3679, + "step": 2357 + }, + { + "epoch": 0.2676969251891823, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.355, + "step": 2358 + }, + { + "epoch": 0.2678104522991014, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.3619, + "step": 2359 + }, + { + "epoch": 0.26792397940902046, + "grad_norm": 0.466796875, + "learning_rate": 0.002, + "loss": 5.3802, + "step": 2360 + }, + { + "epoch": 0.26803750651893954, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.3428, + "step": 2361 + }, + { + "epoch": 0.2681510336288586, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.3605, + "step": 2362 + }, + { + "epoch": 0.2682645607387777, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3847, + "step": 2363 + }, + { + "epoch": 0.26837808784869677, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.3636, + "step": 2364 + }, + { + "epoch": 0.26849161495861584, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3649, + "step": 2365 + }, + { + "epoch": 0.2686051420685349, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.3644, + "step": 2366 + }, + { + "epoch": 0.268718669178454, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3542, + "step": 2367 + }, + { + "epoch": 0.26883219628837307, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.3625, + "step": 2368 + }, + { + "epoch": 0.26894572339829215, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3676, + "step": 2369 + }, + { + "epoch": 0.2690592505082112, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3512, + "step": 2370 + }, + { + "epoch": 0.2691727776181303, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3529, + "step": 2371 + }, + { + "epoch": 0.2692863047280494, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3475, + "step": 2372 + }, + { + "epoch": 0.26939983183796845, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3737, + "step": 2373 + }, + { + "epoch": 0.2695133589478875, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3523, + "step": 2374 + }, + { + "epoch": 0.2696268860578066, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.3521, + "step": 2375 + }, + { + "epoch": 0.2697404131677257, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3541, + "step": 2376 + }, + { + "epoch": 0.26985394027764475, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3778, + "step": 2377 + }, + { + "epoch": 0.26996746738756383, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3716, + "step": 2378 + }, + { + "epoch": 0.2700809944974829, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3657, + "step": 2379 + }, + { + "epoch": 0.270194521607402, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3762, + "step": 2380 + }, + { + "epoch": 0.27030804871732106, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3495, + "step": 2381 + }, + { + "epoch": 0.27042157582724013, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3728, + "step": 2382 + }, + { + "epoch": 0.2705351029371592, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3674, + "step": 2383 + }, + { + "epoch": 0.2706486300470783, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3564, + "step": 2384 + }, + { + "epoch": 0.27076215715699736, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3791, + "step": 2385 + }, + { + "epoch": 0.27087568426691644, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3709, + "step": 2386 + }, + { + "epoch": 0.2709892113768355, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3542, + "step": 2387 + }, + { + "epoch": 0.2711027384867546, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3665, + "step": 2388 + }, + { + "epoch": 0.27121626559667367, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3555, + "step": 2389 + }, + { + "epoch": 0.27132979270659274, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3479, + "step": 2390 + }, + { + "epoch": 0.2714433198165118, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3534, + "step": 2391 + }, + { + "epoch": 0.2715568469264309, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3619, + "step": 2392 + }, + { + "epoch": 0.27167037403634997, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.368, + "step": 2393 + }, + { + "epoch": 0.27178390114626905, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3835, + "step": 2394 + }, + { + "epoch": 0.2718974282561881, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3651, + "step": 2395 + }, + { + "epoch": 0.2720109553661072, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3643, + "step": 2396 + }, + { + "epoch": 0.2721244824760263, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3777, + "step": 2397 + }, + { + "epoch": 0.27223800958594535, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3618, + "step": 2398 + }, + { + "epoch": 0.2723515366958644, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3679, + "step": 2399 + }, + { + "epoch": 0.2724650638057835, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3504, + "step": 2400 + }, + { + "epoch": 0.2725785909157026, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3545, + "step": 2401 + }, + { + "epoch": 0.27269211802562165, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.326, + "step": 2402 + }, + { + "epoch": 0.27280564513554073, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.3659, + "step": 2403 + }, + { + "epoch": 0.2729191722454598, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.3483, + "step": 2404 + }, + { + "epoch": 0.2730326993553789, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.3616, + "step": 2405 + }, + { + "epoch": 0.27314622646529796, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.3567, + "step": 2406 + }, + { + "epoch": 0.27325975357521703, + "grad_norm": 0.435546875, + "learning_rate": 0.002, + "loss": 5.3584, + "step": 2407 + }, + { + "epoch": 0.2733732806851361, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.3457, + "step": 2408 + }, + { + "epoch": 0.2734868077950552, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.3489, + "step": 2409 + }, + { + "epoch": 0.27360033490497426, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3529, + "step": 2410 + }, + { + "epoch": 0.27371386201489334, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3451, + "step": 2411 + }, + { + "epoch": 0.2738273891248124, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3569, + "step": 2412 + }, + { + "epoch": 0.2739409162347315, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3644, + "step": 2413 + }, + { + "epoch": 0.27405444334465057, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3699, + "step": 2414 + }, + { + "epoch": 0.27416797045456964, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.335, + "step": 2415 + }, + { + "epoch": 0.2742814975644887, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3639, + "step": 2416 + }, + { + "epoch": 0.2743950246744078, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3572, + "step": 2417 + }, + { + "epoch": 0.27450855178432687, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3565, + "step": 2418 + }, + { + "epoch": 0.27462207889424595, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3561, + "step": 2419 + }, + { + "epoch": 0.274735606004165, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3848, + "step": 2420 + }, + { + "epoch": 0.2748491331140841, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.3925, + "step": 2421 + }, + { + "epoch": 0.2749626602240032, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3607, + "step": 2422 + }, + { + "epoch": 0.27507618733392225, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.378, + "step": 2423 + }, + { + "epoch": 0.2751897144438413, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3582, + "step": 2424 + }, + { + "epoch": 0.2753032415537604, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3538, + "step": 2425 + }, + { + "epoch": 0.2754167686636795, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3657, + "step": 2426 + }, + { + "epoch": 0.27553029577359855, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3457, + "step": 2427 + }, + { + "epoch": 0.27564382288351763, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3748, + "step": 2428 + }, + { + "epoch": 0.2757573499934367, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3652, + "step": 2429 + }, + { + "epoch": 0.2758708771033558, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3376, + "step": 2430 + }, + { + "epoch": 0.27598440421327486, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3514, + "step": 2431 + }, + { + "epoch": 0.27609793132319393, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3831, + "step": 2432 + }, + { + "epoch": 0.276211458433113, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3605, + "step": 2433 + }, + { + "epoch": 0.2763249855430321, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3646, + "step": 2434 + }, + { + "epoch": 0.27643851265295116, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3773, + "step": 2435 + }, + { + "epoch": 0.27655203976287024, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3634, + "step": 2436 + }, + { + "epoch": 0.2766655668727893, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3605, + "step": 2437 + }, + { + "epoch": 0.2767790939827084, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3524, + "step": 2438 + }, + { + "epoch": 0.27689262109262747, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3584, + "step": 2439 + }, + { + "epoch": 0.27700614820254654, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.3535, + "step": 2440 + }, + { + "epoch": 0.2771196753124656, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3707, + "step": 2441 + }, + { + "epoch": 0.2772332024223847, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3441, + "step": 2442 + }, + { + "epoch": 0.27734672953230377, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.3677, + "step": 2443 + }, + { + "epoch": 0.27746025664222285, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.3463, + "step": 2444 + }, + { + "epoch": 0.2775737837521419, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.3485, + "step": 2445 + }, + { + "epoch": 0.277687310862061, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.3489, + "step": 2446 + }, + { + "epoch": 0.2778008379719801, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.3562, + "step": 2447 + }, + { + "epoch": 0.27791436508189915, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3534, + "step": 2448 + }, + { + "epoch": 0.2780278921918182, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.339, + "step": 2449 + }, + { + "epoch": 0.2781414193017373, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3937, + "step": 2450 + }, + { + "epoch": 0.2782549464116564, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3663, + "step": 2451 + }, + { + "epoch": 0.27836847352157545, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.355, + "step": 2452 + }, + { + "epoch": 0.27848200063149453, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3379, + "step": 2453 + }, + { + "epoch": 0.2785955277414136, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3444, + "step": 2454 + }, + { + "epoch": 0.2787090548513327, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3576, + "step": 2455 + }, + { + "epoch": 0.27882258196125176, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3529, + "step": 2456 + }, + { + "epoch": 0.27893610907117083, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3495, + "step": 2457 + }, + { + "epoch": 0.2790496361810899, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.3736, + "step": 2458 + }, + { + "epoch": 0.279163163291009, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.3537, + "step": 2459 + }, + { + "epoch": 0.27927669040092806, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3623, + "step": 2460 + }, + { + "epoch": 0.27939021751084714, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3488, + "step": 2461 + }, + { + "epoch": 0.2795037446207662, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3732, + "step": 2462 + }, + { + "epoch": 0.2796172717306853, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3481, + "step": 2463 + }, + { + "epoch": 0.27973079884060437, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3714, + "step": 2464 + }, + { + "epoch": 0.27984432595052344, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.3587, + "step": 2465 + }, + { + "epoch": 0.2799578530604425, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3615, + "step": 2466 + }, + { + "epoch": 0.2800713801703616, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3612, + "step": 2467 + }, + { + "epoch": 0.28018490728028067, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3399, + "step": 2468 + }, + { + "epoch": 0.28029843439019975, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3491, + "step": 2469 + }, + { + "epoch": 0.2804119615001188, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3483, + "step": 2470 + }, + { + "epoch": 0.2805254886100379, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3832, + "step": 2471 + }, + { + "epoch": 0.280639015719957, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3651, + "step": 2472 + }, + { + "epoch": 0.28075254282987605, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3485, + "step": 2473 + }, + { + "epoch": 0.2808660699397952, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3514, + "step": 2474 + }, + { + "epoch": 0.28097959704971426, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.3549, + "step": 2475 + }, + { + "epoch": 0.28109312415963333, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3659, + "step": 2476 + }, + { + "epoch": 0.2812066512695524, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3705, + "step": 2477 + }, + { + "epoch": 0.2813201783794715, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3244, + "step": 2478 + }, + { + "epoch": 0.28143370548939056, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3508, + "step": 2479 + }, + { + "epoch": 0.28154723259930964, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3657, + "step": 2480 + }, + { + "epoch": 0.2816607597092287, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.3507, + "step": 2481 + }, + { + "epoch": 0.2817742868191478, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.3579, + "step": 2482 + }, + { + "epoch": 0.28188781392906687, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.3825, + "step": 2483 + }, + { + "epoch": 0.28200134103898594, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.3445, + "step": 2484 + }, + { + "epoch": 0.282114868148905, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3444, + "step": 2485 + }, + { + "epoch": 0.2822283952588241, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3659, + "step": 2486 + }, + { + "epoch": 0.28234192236874317, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3711, + "step": 2487 + }, + { + "epoch": 0.28245544947866225, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3682, + "step": 2488 + }, + { + "epoch": 0.2825689765885813, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.36, + "step": 2489 + }, + { + "epoch": 0.2826825036985004, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3365, + "step": 2490 + }, + { + "epoch": 0.2827960308084195, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.3482, + "step": 2491 + }, + { + "epoch": 0.28290955791833855, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3279, + "step": 2492 + }, + { + "epoch": 0.2830230850282576, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3502, + "step": 2493 + }, + { + "epoch": 0.2831366121381767, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3568, + "step": 2494 + }, + { + "epoch": 0.2832501392480958, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3691, + "step": 2495 + }, + { + "epoch": 0.28336366635801485, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3665, + "step": 2496 + }, + { + "epoch": 0.28347719346793393, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3511, + "step": 2497 + }, + { + "epoch": 0.283590720577853, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3583, + "step": 2498 + }, + { + "epoch": 0.2837042476877721, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.356, + "step": 2499 + }, + { + "epoch": 0.28381777479769116, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3577, + "step": 2500 + }, + { + "epoch": 0.28393130190761023, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3445, + "step": 2501 + }, + { + "epoch": 0.2840448290175293, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3587, + "step": 2502 + }, + { + "epoch": 0.2841583561274484, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.3454, + "step": 2503 + }, + { + "epoch": 0.28427188323736746, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3489, + "step": 2504 + }, + { + "epoch": 0.28438541034728654, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3634, + "step": 2505 + }, + { + "epoch": 0.2844989374572056, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3582, + "step": 2506 + }, + { + "epoch": 0.2846124645671247, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.3541, + "step": 2507 + }, + { + "epoch": 0.28472599167704377, + "grad_norm": 0.2421875, + "learning_rate": 0.002, + "loss": 5.3565, + "step": 2508 + }, + { + "epoch": 0.28483951878696284, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.3445, + "step": 2509 + }, + { + "epoch": 0.2849530458968819, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3612, + "step": 2510 + }, + { + "epoch": 0.285066573006801, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3661, + "step": 2511 + }, + { + "epoch": 0.28518010011672007, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3323, + "step": 2512 + }, + { + "epoch": 0.28529362722663915, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3676, + "step": 2513 + }, + { + "epoch": 0.2854071543365582, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.3548, + "step": 2514 + }, + { + "epoch": 0.2855206814464773, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.3656, + "step": 2515 + }, + { + "epoch": 0.2856342085563964, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.3388, + "step": 2516 + }, + { + "epoch": 0.28574773566631545, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.3548, + "step": 2517 + }, + { + "epoch": 0.2858612627762345, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.37, + "step": 2518 + }, + { + "epoch": 0.2859747898861536, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3515, + "step": 2519 + }, + { + "epoch": 0.2860883169960727, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3598, + "step": 2520 + }, + { + "epoch": 0.28620184410599175, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3482, + "step": 2521 + }, + { + "epoch": 0.28631537121591083, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3534, + "step": 2522 + }, + { + "epoch": 0.2864288983258299, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.3576, + "step": 2523 + }, + { + "epoch": 0.286542425435749, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.353, + "step": 2524 + }, + { + "epoch": 0.28665595254566806, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.359, + "step": 2525 + }, + { + "epoch": 0.28676947965558713, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3438, + "step": 2526 + }, + { + "epoch": 0.2868830067655062, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3538, + "step": 2527 + }, + { + "epoch": 0.2869965338754253, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3563, + "step": 2528 + }, + { + "epoch": 0.28711006098534436, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3587, + "step": 2529 + }, + { + "epoch": 0.28722358809526344, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3341, + "step": 2530 + }, + { + "epoch": 0.2873371152051825, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3527, + "step": 2531 + }, + { + "epoch": 0.2874506423151016, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3506, + "step": 2532 + }, + { + "epoch": 0.28756416942502067, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.3655, + "step": 2533 + }, + { + "epoch": 0.28767769653493974, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3326, + "step": 2534 + }, + { + "epoch": 0.2877912236448588, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3591, + "step": 2535 + }, + { + "epoch": 0.2879047507547779, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3472, + "step": 2536 + }, + { + "epoch": 0.28801827786469697, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3472, + "step": 2537 + }, + { + "epoch": 0.28813180497461605, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3494, + "step": 2538 + }, + { + "epoch": 0.2882453320845351, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.3477, + "step": 2539 + }, + { + "epoch": 0.2883588591944542, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.3513, + "step": 2540 + }, + { + "epoch": 0.2884723863043733, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.3572, + "step": 2541 + }, + { + "epoch": 0.28858591341429235, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.3518, + "step": 2542 + }, + { + "epoch": 0.2886994405242114, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.3464, + "step": 2543 + }, + { + "epoch": 0.2888129676341305, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3441, + "step": 2544 + }, + { + "epoch": 0.2889264947440496, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3555, + "step": 2545 + }, + { + "epoch": 0.28904002185396865, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3647, + "step": 2546 + }, + { + "epoch": 0.28915354896388773, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3614, + "step": 2547 + }, + { + "epoch": 0.2892670760738068, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3384, + "step": 2548 + }, + { + "epoch": 0.2893806031837259, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3661, + "step": 2549 + }, + { + "epoch": 0.28949413029364496, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3477, + "step": 2550 + }, + { + "epoch": 0.28960765740356403, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3668, + "step": 2551 + }, + { + "epoch": 0.2897211845134831, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3419, + "step": 2552 + }, + { + "epoch": 0.2898347116234022, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3398, + "step": 2553 + }, + { + "epoch": 0.28994823873332126, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3422, + "step": 2554 + }, + { + "epoch": 0.29006176584324034, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3485, + "step": 2555 + }, + { + "epoch": 0.2901752929531594, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3351, + "step": 2556 + }, + { + "epoch": 0.2902888200630785, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3459, + "step": 2557 + }, + { + "epoch": 0.29040234717299757, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3607, + "step": 2558 + }, + { + "epoch": 0.29051587428291664, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3386, + "step": 2559 + }, + { + "epoch": 0.2906294013928357, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.361, + "step": 2560 + }, + { + "epoch": 0.2907429285027548, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3316, + "step": 2561 + }, + { + "epoch": 0.29085645561267387, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3541, + "step": 2562 + }, + { + "epoch": 0.29096998272259295, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3345, + "step": 2563 + }, + { + "epoch": 0.291083509832512, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3556, + "step": 2564 + }, + { + "epoch": 0.2911970369424311, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.3455, + "step": 2565 + }, + { + "epoch": 0.2913105640523502, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.3541, + "step": 2566 + }, + { + "epoch": 0.29142409116226925, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.3269, + "step": 2567 + }, + { + "epoch": 0.2915376182721883, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3498, + "step": 2568 + }, + { + "epoch": 0.2916511453821074, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.361, + "step": 2569 + }, + { + "epoch": 0.2917646724920265, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3471, + "step": 2570 + }, + { + "epoch": 0.29187819960194555, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3524, + "step": 2571 + }, + { + "epoch": 0.29199172671186463, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3462, + "step": 2572 + }, + { + "epoch": 0.2921052538217837, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3409, + "step": 2573 + }, + { + "epoch": 0.2922187809317028, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3553, + "step": 2574 + }, + { + "epoch": 0.29233230804162186, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3301, + "step": 2575 + }, + { + "epoch": 0.29244583515154093, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.3471, + "step": 2576 + }, + { + "epoch": 0.29255936226146, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3513, + "step": 2577 + }, + { + "epoch": 0.2926728893713791, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.356, + "step": 2578 + }, + { + "epoch": 0.29278641648129816, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.373, + "step": 2579 + }, + { + "epoch": 0.29289994359121724, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.342, + "step": 2580 + }, + { + "epoch": 0.2930134707011363, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.3498, + "step": 2581 + }, + { + "epoch": 0.2931269978110554, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.3701, + "step": 2582 + }, + { + "epoch": 0.29324052492097447, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.3586, + "step": 2583 + }, + { + "epoch": 0.29335405203089354, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3188, + "step": 2584 + }, + { + "epoch": 0.2934675791408126, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3679, + "step": 2585 + }, + { + "epoch": 0.2935811062507317, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3458, + "step": 2586 + }, + { + "epoch": 0.29369463336065077, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3558, + "step": 2587 + }, + { + "epoch": 0.29380816047056985, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.331, + "step": 2588 + }, + { + "epoch": 0.2939216875804889, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.3614, + "step": 2589 + }, + { + "epoch": 0.294035214690408, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.3502, + "step": 2590 + }, + { + "epoch": 0.2941487418003271, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.3417, + "step": 2591 + }, + { + "epoch": 0.29426226891024615, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.3353, + "step": 2592 + }, + { + "epoch": 0.2943757960201652, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3257, + "step": 2593 + }, + { + "epoch": 0.2944893231300843, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3525, + "step": 2594 + }, + { + "epoch": 0.29460285024000343, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3291, + "step": 2595 + }, + { + "epoch": 0.2947163773499225, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.349, + "step": 2596 + }, + { + "epoch": 0.2948299044598416, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.345, + "step": 2597 + }, + { + "epoch": 0.29494343156976066, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3508, + "step": 2598 + }, + { + "epoch": 0.29505695867967974, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3662, + "step": 2599 + }, + { + "epoch": 0.2951704857895988, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.3447, + "step": 2600 + }, + { + "epoch": 0.2952840128995179, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.3273, + "step": 2601 + }, + { + "epoch": 0.29539754000943697, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3454, + "step": 2602 + }, + { + "epoch": 0.29551106711935604, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3529, + "step": 2603 + }, + { + "epoch": 0.2956245942292751, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3538, + "step": 2604 + }, + { + "epoch": 0.2957381213391942, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.359, + "step": 2605 + }, + { + "epoch": 0.29585164844911327, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3371, + "step": 2606 + }, + { + "epoch": 0.29596517555903235, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3317, + "step": 2607 + }, + { + "epoch": 0.2960787026689514, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.339, + "step": 2608 + }, + { + "epoch": 0.2961922297788705, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3311, + "step": 2609 + }, + { + "epoch": 0.2963057568887896, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3362, + "step": 2610 + }, + { + "epoch": 0.29641928399870865, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3661, + "step": 2611 + }, + { + "epoch": 0.2965328111086277, + "grad_norm": 0.23046875, + "learning_rate": 0.002, + "loss": 5.3678, + "step": 2612 + }, + { + "epoch": 0.2966463382185468, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.3438, + "step": 2613 + }, + { + "epoch": 0.2967598653284659, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3374, + "step": 2614 + }, + { + "epoch": 0.29687339243838495, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3207, + "step": 2615 + }, + { + "epoch": 0.29698691954830403, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.327, + "step": 2616 + }, + { + "epoch": 0.2971004466582231, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.3606, + "step": 2617 + }, + { + "epoch": 0.2972139737681422, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3338, + "step": 2618 + }, + { + "epoch": 0.29732750087806126, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3313, + "step": 2619 + }, + { + "epoch": 0.29744102798798033, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3456, + "step": 2620 + }, + { + "epoch": 0.2975545550978994, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.33, + "step": 2621 + }, + { + "epoch": 0.2976680822078185, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3694, + "step": 2622 + }, + { + "epoch": 0.29778160931773756, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.3583, + "step": 2623 + }, + { + "epoch": 0.29789513642765664, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.3354, + "step": 2624 + }, + { + "epoch": 0.2980086635375757, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3299, + "step": 2625 + }, + { + "epoch": 0.2981221906474948, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.3533, + "step": 2626 + }, + { + "epoch": 0.29823571775741387, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3374, + "step": 2627 + }, + { + "epoch": 0.29834924486733294, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3585, + "step": 2628 + }, + { + "epoch": 0.298462771977252, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.341, + "step": 2629 + }, + { + "epoch": 0.2985762990871711, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3304, + "step": 2630 + }, + { + "epoch": 0.29868982619709017, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3343, + "step": 2631 + }, + { + "epoch": 0.29880335330700925, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3329, + "step": 2632 + }, + { + "epoch": 0.2989168804169283, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3447, + "step": 2633 + }, + { + "epoch": 0.2990304075268474, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.3568, + "step": 2634 + }, + { + "epoch": 0.2991439346367665, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.3096, + "step": 2635 + }, + { + "epoch": 0.29925746174668555, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.3422, + "step": 2636 + }, + { + "epoch": 0.2993709888566046, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3535, + "step": 2637 + }, + { + "epoch": 0.2994845159665237, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3548, + "step": 2638 + }, + { + "epoch": 0.2995980430764428, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3577, + "step": 2639 + }, + { + "epoch": 0.29971157018636185, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3489, + "step": 2640 + }, + { + "epoch": 0.29982509729628093, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3419, + "step": 2641 + }, + { + "epoch": 0.2999386244062, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.324, + "step": 2642 + }, + { + "epoch": 0.3000521515161191, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3465, + "step": 2643 + }, + { + "epoch": 0.30016567862603816, + "grad_norm": 0.2294921875, + "learning_rate": 0.002, + "loss": 5.3479, + "step": 2644 + }, + { + "epoch": 0.30027920573595723, + "grad_norm": 0.224609375, + "learning_rate": 0.002, + "loss": 5.3225, + "step": 2645 + }, + { + "epoch": 0.3003927328458763, + "grad_norm": 0.21875, + "learning_rate": 0.002, + "loss": 5.3508, + "step": 2646 + }, + { + "epoch": 0.3005062599557954, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.3504, + "step": 2647 + }, + { + "epoch": 0.30061978706571446, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.336, + "step": 2648 + }, + { + "epoch": 0.30073331417563354, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3425, + "step": 2649 + }, + { + "epoch": 0.3008468412855526, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3459, + "step": 2650 + }, + { + "epoch": 0.3009603683954717, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3418, + "step": 2651 + }, + { + "epoch": 0.30107389550539077, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3524, + "step": 2652 + }, + { + "epoch": 0.30118742261530984, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3242, + "step": 2653 + }, + { + "epoch": 0.3013009497252289, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3563, + "step": 2654 + }, + { + "epoch": 0.301414476835148, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3462, + "step": 2655 + }, + { + "epoch": 0.30152800394506707, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3461, + "step": 2656 + }, + { + "epoch": 0.30164153105498615, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.345, + "step": 2657 + }, + { + "epoch": 0.3017550581649052, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3436, + "step": 2658 + }, + { + "epoch": 0.3018685852748243, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3532, + "step": 2659 + }, + { + "epoch": 0.3019821123847434, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3572, + "step": 2660 + }, + { + "epoch": 0.30209563949466245, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3438, + "step": 2661 + }, + { + "epoch": 0.3022091666045815, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3724, + "step": 2662 + }, + { + "epoch": 0.3023226937145006, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.3409, + "step": 2663 + }, + { + "epoch": 0.3024362208244197, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.339, + "step": 2664 + }, + { + "epoch": 0.30254974793433875, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3297, + "step": 2665 + }, + { + "epoch": 0.30266327504425783, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3289, + "step": 2666 + }, + { + "epoch": 0.3027768021541769, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3585, + "step": 2667 + }, + { + "epoch": 0.302890329264096, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3271, + "step": 2668 + }, + { + "epoch": 0.30300385637401506, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.3465, + "step": 2669 + }, + { + "epoch": 0.30311738348393413, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.3523, + "step": 2670 + }, + { + "epoch": 0.3032309105938532, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3516, + "step": 2671 + }, + { + "epoch": 0.3033444377037723, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3454, + "step": 2672 + }, + { + "epoch": 0.30345796481369136, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.339, + "step": 2673 + }, + { + "epoch": 0.30357149192361044, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3395, + "step": 2674 + }, + { + "epoch": 0.3036850190335295, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3396, + "step": 2675 + }, + { + "epoch": 0.3037985461434486, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3409, + "step": 2676 + }, + { + "epoch": 0.30391207325336766, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3437, + "step": 2677 + }, + { + "epoch": 0.30402560036328674, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3397, + "step": 2678 + }, + { + "epoch": 0.3041391274732058, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.3367, + "step": 2679 + }, + { + "epoch": 0.3042526545831249, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.3408, + "step": 2680 + }, + { + "epoch": 0.30436618169304397, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.342, + "step": 2681 + }, + { + "epoch": 0.30447970880296304, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.351, + "step": 2682 + }, + { + "epoch": 0.3045932359128821, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.3537, + "step": 2683 + }, + { + "epoch": 0.3047067630228012, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.3009, + "step": 2684 + }, + { + "epoch": 0.3048202901327203, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3364, + "step": 2685 + }, + { + "epoch": 0.30493381724263935, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3383, + "step": 2686 + }, + { + "epoch": 0.3050473443525584, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3537, + "step": 2687 + }, + { + "epoch": 0.3051608714624775, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.353, + "step": 2688 + }, + { + "epoch": 0.3052743985723966, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3661, + "step": 2689 + }, + { + "epoch": 0.30538792568231565, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3359, + "step": 2690 + }, + { + "epoch": 0.30550145279223473, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3512, + "step": 2691 + }, + { + "epoch": 0.3056149799021538, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.345, + "step": 2692 + }, + { + "epoch": 0.3057285070120729, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.343, + "step": 2693 + }, + { + "epoch": 0.30584203412199196, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3313, + "step": 2694 + }, + { + "epoch": 0.30595556123191103, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3352, + "step": 2695 + }, + { + "epoch": 0.3060690883418301, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3463, + "step": 2696 + }, + { + "epoch": 0.3061826154517492, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.365, + "step": 2697 + }, + { + "epoch": 0.30629614256166826, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.3402, + "step": 2698 + }, + { + "epoch": 0.30640966967158734, + "grad_norm": 0.2158203125, + "learning_rate": 0.002, + "loss": 5.3273, + "step": 2699 + }, + { + "epoch": 0.3065231967815064, + "grad_norm": 0.2158203125, + "learning_rate": 0.002, + "loss": 5.3307, + "step": 2700 + }, + { + "epoch": 0.3066367238914255, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.3397, + "step": 2701 + }, + { + "epoch": 0.30675025100134456, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3251, + "step": 2702 + }, + { + "epoch": 0.30686377811126364, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.3497, + "step": 2703 + }, + { + "epoch": 0.3069773052211827, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.3403, + "step": 2704 + }, + { + "epoch": 0.3070908323311018, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.3256, + "step": 2705 + }, + { + "epoch": 0.30720435944102087, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.3561, + "step": 2706 + }, + { + "epoch": 0.30731788655093994, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3235, + "step": 2707 + }, + { + "epoch": 0.307431413660859, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3304, + "step": 2708 + }, + { + "epoch": 0.3075449407707781, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.342, + "step": 2709 + }, + { + "epoch": 0.3076584678806972, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3503, + "step": 2710 + }, + { + "epoch": 0.30777199499061625, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.3484, + "step": 2711 + }, + { + "epoch": 0.3078855221005353, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3268, + "step": 2712 + }, + { + "epoch": 0.3079990492104544, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.3628, + "step": 2713 + }, + { + "epoch": 0.3081125763203735, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3616, + "step": 2714 + }, + { + "epoch": 0.3082261034302926, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3459, + "step": 2715 + }, + { + "epoch": 0.3083396305402117, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3393, + "step": 2716 + }, + { + "epoch": 0.30845315765013076, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3488, + "step": 2717 + }, + { + "epoch": 0.30856668476004984, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.3409, + "step": 2718 + }, + { + "epoch": 0.3086802118699689, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3536, + "step": 2719 + }, + { + "epoch": 0.308793738979888, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3303, + "step": 2720 + }, + { + "epoch": 0.30890726608980706, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3454, + "step": 2721 + }, + { + "epoch": 0.30902079319972614, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.3333, + "step": 2722 + }, + { + "epoch": 0.3091343203096452, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.3361, + "step": 2723 + }, + { + "epoch": 0.3092478474195643, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.3314, + "step": 2724 + }, + { + "epoch": 0.30936137452948337, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3335, + "step": 2725 + }, + { + "epoch": 0.30947490163940244, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.3457, + "step": 2726 + }, + { + "epoch": 0.3095884287493215, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.3325, + "step": 2727 + }, + { + "epoch": 0.3097019558592406, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.323, + "step": 2728 + }, + { + "epoch": 0.3098154829691597, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3496, + "step": 2729 + }, + { + "epoch": 0.30992901007907875, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3345, + "step": 2730 + }, + { + "epoch": 0.3100425371889978, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3401, + "step": 2731 + }, + { + "epoch": 0.3101560642989169, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.348, + "step": 2732 + }, + { + "epoch": 0.310269591408836, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.3617, + "step": 2733 + }, + { + "epoch": 0.31038311851875505, + "grad_norm": 0.2294921875, + "learning_rate": 0.002, + "loss": 5.3244, + "step": 2734 + }, + { + "epoch": 0.31049664562867413, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3384, + "step": 2735 + }, + { + "epoch": 0.3106101727385932, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3517, + "step": 2736 + }, + { + "epoch": 0.3107236998485123, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3469, + "step": 2737 + }, + { + "epoch": 0.31083722695843136, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.326, + "step": 2738 + }, + { + "epoch": 0.31095075406835043, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3233, + "step": 2739 + }, + { + "epoch": 0.3110642811782695, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3316, + "step": 2740 + }, + { + "epoch": 0.3111778082881886, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3433, + "step": 2741 + }, + { + "epoch": 0.31129133539810766, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3288, + "step": 2742 + }, + { + "epoch": 0.31140486250802674, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3583, + "step": 2743 + }, + { + "epoch": 0.3115183896179458, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3372, + "step": 2744 + }, + { + "epoch": 0.3116319167278649, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3423, + "step": 2745 + }, + { + "epoch": 0.31174544383778396, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3474, + "step": 2746 + }, + { + "epoch": 0.31185897094770304, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.345, + "step": 2747 + }, + { + "epoch": 0.3119724980576221, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3454, + "step": 2748 + }, + { + "epoch": 0.3120860251675412, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3448, + "step": 2749 + }, + { + "epoch": 0.31219955227746027, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.322, + "step": 2750 + }, + { + "epoch": 0.31231307938737934, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.325, + "step": 2751 + }, + { + "epoch": 0.3124266064972984, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3631, + "step": 2752 + }, + { + "epoch": 0.3125401336072175, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3105, + "step": 2753 + }, + { + "epoch": 0.3126536607171366, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.339, + "step": 2754 + }, + { + "epoch": 0.31276718782705565, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3593, + "step": 2755 + }, + { + "epoch": 0.3128807149369747, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.354, + "step": 2756 + }, + { + "epoch": 0.3129942420468938, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3314, + "step": 2757 + }, + { + "epoch": 0.3131077691568129, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3324, + "step": 2758 + }, + { + "epoch": 0.31322129626673195, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.335, + "step": 2759 + }, + { + "epoch": 0.31333482337665103, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3499, + "step": 2760 + }, + { + "epoch": 0.3134483504865701, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3374, + "step": 2761 + }, + { + "epoch": 0.3135618775964892, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3346, + "step": 2762 + }, + { + "epoch": 0.31367540470640826, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3417, + "step": 2763 + }, + { + "epoch": 0.31378893181632733, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.337, + "step": 2764 + }, + { + "epoch": 0.3139024589262464, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.3368, + "step": 2765 + }, + { + "epoch": 0.3140159860361655, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.3337, + "step": 2766 + }, + { + "epoch": 0.31412951314608456, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.3574, + "step": 2767 + }, + { + "epoch": 0.31424304025600364, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3211, + "step": 2768 + }, + { + "epoch": 0.3143565673659227, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3289, + "step": 2769 + }, + { + "epoch": 0.3144700944758418, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3371, + "step": 2770 + }, + { + "epoch": 0.31458362158576086, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3471, + "step": 2771 + }, + { + "epoch": 0.31469714869567994, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3428, + "step": 2772 + }, + { + "epoch": 0.314810675805599, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3197, + "step": 2773 + }, + { + "epoch": 0.3149242029155181, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3241, + "step": 2774 + }, + { + "epoch": 0.31503773002543717, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3403, + "step": 2775 + }, + { + "epoch": 0.31515125713535624, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3214, + "step": 2776 + }, + { + "epoch": 0.3152647842452753, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.32, + "step": 2777 + }, + { + "epoch": 0.3153783113551944, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3383, + "step": 2778 + }, + { + "epoch": 0.31549183846511347, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3028, + "step": 2779 + }, + { + "epoch": 0.31560536557503255, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3259, + "step": 2780 + }, + { + "epoch": 0.3157188926849516, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.339, + "step": 2781 + }, + { + "epoch": 0.3158324197948707, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.3177, + "step": 2782 + }, + { + "epoch": 0.3159459469047898, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3406, + "step": 2783 + }, + { + "epoch": 0.31605947401470885, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3191, + "step": 2784 + }, + { + "epoch": 0.31617300112462793, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3351, + "step": 2785 + }, + { + "epoch": 0.316286528234547, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3275, + "step": 2786 + }, + { + "epoch": 0.3164000553444661, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3255, + "step": 2787 + }, + { + "epoch": 0.31651358245438516, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3377, + "step": 2788 + }, + { + "epoch": 0.31662710956430423, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3617, + "step": 2789 + }, + { + "epoch": 0.3167406366742233, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2955, + "step": 2790 + }, + { + "epoch": 0.3168541637841424, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3278, + "step": 2791 + }, + { + "epoch": 0.31696769089406146, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.349, + "step": 2792 + }, + { + "epoch": 0.31708121800398054, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.342, + "step": 2793 + }, + { + "epoch": 0.3171947451138996, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3315, + "step": 2794 + }, + { + "epoch": 0.3173082722238187, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3461, + "step": 2795 + }, + { + "epoch": 0.31742179933373776, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3226, + "step": 2796 + }, + { + "epoch": 0.31753532644365684, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3329, + "step": 2797 + }, + { + "epoch": 0.3176488535535759, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3572, + "step": 2798 + }, + { + "epoch": 0.317762380663495, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3387, + "step": 2799 + }, + { + "epoch": 0.31787590777341407, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3468, + "step": 2800 + }, + { + "epoch": 0.31798943488333314, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3465, + "step": 2801 + }, + { + "epoch": 0.3181029619932522, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3266, + "step": 2802 + }, + { + "epoch": 0.3182164891031713, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3318, + "step": 2803 + }, + { + "epoch": 0.31833001621309037, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.3378, + "step": 2804 + }, + { + "epoch": 0.31844354332300945, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.3332, + "step": 2805 + }, + { + "epoch": 0.3185570704329285, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3283, + "step": 2806 + }, + { + "epoch": 0.3186705975428476, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.329, + "step": 2807 + }, + { + "epoch": 0.3187841246527667, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.3282, + "step": 2808 + }, + { + "epoch": 0.31889765176268575, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.3157, + "step": 2809 + }, + { + "epoch": 0.31901117887260483, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3247, + "step": 2810 + }, + { + "epoch": 0.3191247059825239, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3271, + "step": 2811 + }, + { + "epoch": 0.319238233092443, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3559, + "step": 2812 + }, + { + "epoch": 0.31935176020236206, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.316, + "step": 2813 + }, + { + "epoch": 0.31946528731228113, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.324, + "step": 2814 + }, + { + "epoch": 0.3195788144222002, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3461, + "step": 2815 + }, + { + "epoch": 0.3196923415321193, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3474, + "step": 2816 + }, + { + "epoch": 0.31980586864203836, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3344, + "step": 2817 + }, + { + "epoch": 0.31991939575195744, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3186, + "step": 2818 + }, + { + "epoch": 0.3200329228618765, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.3265, + "step": 2819 + }, + { + "epoch": 0.3201464499717956, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3307, + "step": 2820 + }, + { + "epoch": 0.32025997708171466, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3419, + "step": 2821 + }, + { + "epoch": 0.32037350419163374, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.341, + "step": 2822 + }, + { + "epoch": 0.3204870313015528, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3275, + "step": 2823 + }, + { + "epoch": 0.3206005584114719, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3393, + "step": 2824 + }, + { + "epoch": 0.32071408552139097, + "grad_norm": 0.2373046875, + "learning_rate": 0.002, + "loss": 5.3532, + "step": 2825 + }, + { + "epoch": 0.32082761263131004, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.3255, + "step": 2826 + }, + { + "epoch": 0.3209411397412291, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3202, + "step": 2827 + }, + { + "epoch": 0.3210546668511482, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3522, + "step": 2828 + }, + { + "epoch": 0.32116819396106727, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3643, + "step": 2829 + }, + { + "epoch": 0.32128172107098635, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.342, + "step": 2830 + }, + { + "epoch": 0.3213952481809054, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3307, + "step": 2831 + }, + { + "epoch": 0.3215087752908245, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3492, + "step": 2832 + }, + { + "epoch": 0.3216223024007436, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.3297, + "step": 2833 + }, + { + "epoch": 0.32173582951066265, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3318, + "step": 2834 + }, + { + "epoch": 0.32184935662058173, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.3228, + "step": 2835 + }, + { + "epoch": 0.32196288373050086, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.321, + "step": 2836 + }, + { + "epoch": 0.32207641084041994, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3407, + "step": 2837 + }, + { + "epoch": 0.322189937950339, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2928, + "step": 2838 + }, + { + "epoch": 0.3223034650602581, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3478, + "step": 2839 + }, + { + "epoch": 0.32241699217017716, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3466, + "step": 2840 + }, + { + "epoch": 0.32253051928009624, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3429, + "step": 2841 + }, + { + "epoch": 0.3226440463900153, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.342, + "step": 2842 + }, + { + "epoch": 0.3227575734999344, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3264, + "step": 2843 + }, + { + "epoch": 0.32287110060985347, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3213, + "step": 2844 + }, + { + "epoch": 0.32298462771977254, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3204, + "step": 2845 + }, + { + "epoch": 0.3230981548296916, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3334, + "step": 2846 + }, + { + "epoch": 0.3232116819396107, + "grad_norm": 0.234375, + "learning_rate": 0.002, + "loss": 5.3355, + "step": 2847 + }, + { + "epoch": 0.32332520904952977, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.3395, + "step": 2848 + }, + { + "epoch": 0.32343873615944885, + "grad_norm": 0.234375, + "learning_rate": 0.002, + "loss": 5.3408, + "step": 2849 + }, + { + "epoch": 0.3235522632693679, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.3461, + "step": 2850 + }, + { + "epoch": 0.323665790379287, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3486, + "step": 2851 + }, + { + "epoch": 0.3237793174892061, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3219, + "step": 2852 + }, + { + "epoch": 0.32389284459912515, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3319, + "step": 2853 + }, + { + "epoch": 0.3240063717090442, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3161, + "step": 2854 + }, + { + "epoch": 0.3241198988189633, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3457, + "step": 2855 + }, + { + "epoch": 0.3242334259288824, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3243, + "step": 2856 + }, + { + "epoch": 0.32434695303880146, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3149, + "step": 2857 + }, + { + "epoch": 0.32446048014872053, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.3407, + "step": 2858 + }, + { + "epoch": 0.3245740072586396, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.3279, + "step": 2859 + }, + { + "epoch": 0.3246875343685587, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.3447, + "step": 2860 + }, + { + "epoch": 0.32480106147847776, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3372, + "step": 2861 + }, + { + "epoch": 0.32491458858839684, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3345, + "step": 2862 + }, + { + "epoch": 0.3250281156983159, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3291, + "step": 2863 + }, + { + "epoch": 0.325141642808235, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.325, + "step": 2864 + }, + { + "epoch": 0.32525516991815406, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3521, + "step": 2865 + }, + { + "epoch": 0.32536869702807314, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3367, + "step": 2866 + }, + { + "epoch": 0.3254822241379922, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.33, + "step": 2867 + }, + { + "epoch": 0.3255957512479113, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3272, + "step": 2868 + }, + { + "epoch": 0.32570927835783037, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3302, + "step": 2869 + }, + { + "epoch": 0.32582280546774944, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3327, + "step": 2870 + }, + { + "epoch": 0.3259363325776685, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3452, + "step": 2871 + }, + { + "epoch": 0.3260498596875876, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3306, + "step": 2872 + }, + { + "epoch": 0.32616338679750667, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3186, + "step": 2873 + }, + { + "epoch": 0.32627691390742575, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3402, + "step": 2874 + }, + { + "epoch": 0.3263904410173448, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3481, + "step": 2875 + }, + { + "epoch": 0.3265039681272639, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3264, + "step": 2876 + }, + { + "epoch": 0.326617495237183, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3282, + "step": 2877 + }, + { + "epoch": 0.32673102234710205, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3183, + "step": 2878 + }, + { + "epoch": 0.3268445494570211, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3151, + "step": 2879 + }, + { + "epoch": 0.3269580765669402, + "grad_norm": 0.248046875, + "learning_rate": 0.002, + "loss": 5.3318, + "step": 2880 + }, + { + "epoch": 0.3270716036768593, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.3209, + "step": 2881 + }, + { + "epoch": 0.32718513078677836, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.337, + "step": 2882 + }, + { + "epoch": 0.32729865789669743, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3284, + "step": 2883 + }, + { + "epoch": 0.3274121850066165, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.3222, + "step": 2884 + }, + { + "epoch": 0.3275257121165356, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3146, + "step": 2885 + }, + { + "epoch": 0.32763923922645466, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.3317, + "step": 2886 + }, + { + "epoch": 0.32775276633637374, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.3342, + "step": 2887 + }, + { + "epoch": 0.3278662934462928, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.3026, + "step": 2888 + }, + { + "epoch": 0.3279798205562119, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3248, + "step": 2889 + }, + { + "epoch": 0.32809334766613096, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3417, + "step": 2890 + }, + { + "epoch": 0.32820687477605004, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3284, + "step": 2891 + }, + { + "epoch": 0.3283204018859691, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3465, + "step": 2892 + }, + { + "epoch": 0.3284339289958882, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3252, + "step": 2893 + }, + { + "epoch": 0.32854745610580727, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3226, + "step": 2894 + }, + { + "epoch": 0.32866098321572634, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3254, + "step": 2895 + }, + { + "epoch": 0.3287745103256454, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3207, + "step": 2896 + }, + { + "epoch": 0.3288880374355645, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3287, + "step": 2897 + }, + { + "epoch": 0.32900156454548357, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3232, + "step": 2898 + }, + { + "epoch": 0.32911509165540265, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.319, + "step": 2899 + }, + { + "epoch": 0.3292286187653217, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3463, + "step": 2900 + }, + { + "epoch": 0.3293421458752408, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3265, + "step": 2901 + }, + { + "epoch": 0.3294556729851599, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3294, + "step": 2902 + }, + { + "epoch": 0.32956920009507895, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3316, + "step": 2903 + }, + { + "epoch": 0.329682727204998, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3094, + "step": 2904 + }, + { + "epoch": 0.3297962543149171, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3288, + "step": 2905 + }, + { + "epoch": 0.3299097814248362, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.317, + "step": 2906 + }, + { + "epoch": 0.33002330853475526, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3368, + "step": 2907 + }, + { + "epoch": 0.33013683564467433, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.352, + "step": 2908 + }, + { + "epoch": 0.3302503627545934, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.336, + "step": 2909 + }, + { + "epoch": 0.3303638898645125, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.3242, + "step": 2910 + }, + { + "epoch": 0.33047741697443156, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.3347, + "step": 2911 + }, + { + "epoch": 0.33059094408435064, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3348, + "step": 2912 + }, + { + "epoch": 0.3307044711942697, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3109, + "step": 2913 + }, + { + "epoch": 0.3308179983041888, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3434, + "step": 2914 + }, + { + "epoch": 0.33093152541410786, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3241, + "step": 2915 + }, + { + "epoch": 0.33104505252402694, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3201, + "step": 2916 + }, + { + "epoch": 0.331158579633946, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3388, + "step": 2917 + }, + { + "epoch": 0.3312721067438651, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3363, + "step": 2918 + }, + { + "epoch": 0.33138563385378417, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3154, + "step": 2919 + }, + { + "epoch": 0.33149916096370324, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3413, + "step": 2920 + }, + { + "epoch": 0.3316126880736223, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3314, + "step": 2921 + }, + { + "epoch": 0.3317262151835414, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3209, + "step": 2922 + }, + { + "epoch": 0.33183974229346047, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3384, + "step": 2923 + }, + { + "epoch": 0.33195326940337955, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3117, + "step": 2924 + }, + { + "epoch": 0.3320667965132986, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.331, + "step": 2925 + }, + { + "epoch": 0.3321803236232177, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.314, + "step": 2926 + }, + { + "epoch": 0.3322938507331368, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3115, + "step": 2927 + }, + { + "epoch": 0.33240737784305585, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3048, + "step": 2928 + }, + { + "epoch": 0.3325209049529749, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3258, + "step": 2929 + }, + { + "epoch": 0.332634432062894, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3133, + "step": 2930 + }, + { + "epoch": 0.3327479591728131, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3123, + "step": 2931 + }, + { + "epoch": 0.33286148628273216, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3258, + "step": 2932 + }, + { + "epoch": 0.33297501339265123, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3353, + "step": 2933 + }, + { + "epoch": 0.3330885405025703, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3366, + "step": 2934 + }, + { + "epoch": 0.3332020676124894, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3252, + "step": 2935 + }, + { + "epoch": 0.33331559472240846, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3358, + "step": 2936 + }, + { + "epoch": 0.33342912183232754, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3205, + "step": 2937 + }, + { + "epoch": 0.3335426489422466, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.3479, + "step": 2938 + }, + { + "epoch": 0.3336561760521657, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3189, + "step": 2939 + }, + { + "epoch": 0.33376970316208476, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3274, + "step": 2940 + }, + { + "epoch": 0.33388323027200384, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3057, + "step": 2941 + }, + { + "epoch": 0.3339967573819229, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3098, + "step": 2942 + }, + { + "epoch": 0.334110284491842, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3237, + "step": 2943 + }, + { + "epoch": 0.33422381160176107, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.3071, + "step": 2944 + }, + { + "epoch": 0.33433733871168014, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.336, + "step": 2945 + }, + { + "epoch": 0.3344508658215992, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.3172, + "step": 2946 + }, + { + "epoch": 0.3345643929315183, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.3317, + "step": 2947 + }, + { + "epoch": 0.33467792004143737, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.3274, + "step": 2948 + }, + { + "epoch": 0.33479144715135645, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.342, + "step": 2949 + }, + { + "epoch": 0.3349049742612755, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.3324, + "step": 2950 + }, + { + "epoch": 0.3350185013711946, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3296, + "step": 2951 + }, + { + "epoch": 0.3351320284811137, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3554, + "step": 2952 + }, + { + "epoch": 0.33524555559103275, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3294, + "step": 2953 + }, + { + "epoch": 0.3353590827009518, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.314, + "step": 2954 + }, + { + "epoch": 0.3354726098108709, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3497, + "step": 2955 + }, + { + "epoch": 0.33558613692079, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3047, + "step": 2956 + }, + { + "epoch": 0.3356996640307091, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3393, + "step": 2957 + }, + { + "epoch": 0.3358131911406282, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3344, + "step": 2958 + }, + { + "epoch": 0.33592671825054726, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3252, + "step": 2959 + }, + { + "epoch": 0.33604024536046634, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3176, + "step": 2960 + }, + { + "epoch": 0.3361537724703854, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.335, + "step": 2961 + }, + { + "epoch": 0.3362672995803045, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.3354, + "step": 2962 + }, + { + "epoch": 0.33638082669022357, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.3451, + "step": 2963 + }, + { + "epoch": 0.33649435380014264, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.3318, + "step": 2964 + }, + { + "epoch": 0.3366078809100617, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3326, + "step": 2965 + }, + { + "epoch": 0.3367214080199808, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3261, + "step": 2966 + }, + { + "epoch": 0.33683493512989987, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.342, + "step": 2967 + }, + { + "epoch": 0.33694846223981895, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3323, + "step": 2968 + }, + { + "epoch": 0.337061989349738, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.3338, + "step": 2969 + }, + { + "epoch": 0.3371755164596571, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.3543, + "step": 2970 + }, + { + "epoch": 0.3372890435695762, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3273, + "step": 2971 + }, + { + "epoch": 0.33740257067949525, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3193, + "step": 2972 + }, + { + "epoch": 0.3375160977894143, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3292, + "step": 2973 + }, + { + "epoch": 0.3376296248993334, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3019, + "step": 2974 + }, + { + "epoch": 0.3377431520092525, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3022, + "step": 2975 + }, + { + "epoch": 0.33785667911917155, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3382, + "step": 2976 + }, + { + "epoch": 0.33797020622909063, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3395, + "step": 2977 + }, + { + "epoch": 0.3380837333390097, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3156, + "step": 2978 + }, + { + "epoch": 0.3381972604489288, + "grad_norm": 0.234375, + "learning_rate": 0.002, + "loss": 5.3168, + "step": 2979 + }, + { + "epoch": 0.33831078755884786, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3191, + "step": 2980 + }, + { + "epoch": 0.33842431466876693, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3107, + "step": 2981 + }, + { + "epoch": 0.338537841778686, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3145, + "step": 2982 + }, + { + "epoch": 0.3386513688886051, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3414, + "step": 2983 + }, + { + "epoch": 0.33876489599852416, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3291, + "step": 2984 + }, + { + "epoch": 0.33887842310844324, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3253, + "step": 2985 + }, + { + "epoch": 0.3389919502183623, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.3406, + "step": 2986 + }, + { + "epoch": 0.3391054773282814, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3289, + "step": 2987 + }, + { + "epoch": 0.33921900443820047, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3112, + "step": 2988 + }, + { + "epoch": 0.33933253154811954, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3254, + "step": 2989 + }, + { + "epoch": 0.3394460586580386, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3211, + "step": 2990 + }, + { + "epoch": 0.3395595857679577, + "grad_norm": 0.2197265625, + "learning_rate": 0.002, + "loss": 5.3269, + "step": 2991 + }, + { + "epoch": 0.33967311287787677, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.3512, + "step": 2992 + }, + { + "epoch": 0.33978663998779585, + "grad_norm": 0.2265625, + "learning_rate": 0.002, + "loss": 5.3346, + "step": 2993 + }, + { + "epoch": 0.3399001670977149, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.3354, + "step": 2994 + }, + { + "epoch": 0.340013694207634, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3292, + "step": 2995 + }, + { + "epoch": 0.3401272213175531, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3184, + "step": 2996 + }, + { + "epoch": 0.34024074842747215, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3216, + "step": 2997 + }, + { + "epoch": 0.3403542755373912, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.3234, + "step": 2998 + }, + { + "epoch": 0.3404678026473103, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.3348, + "step": 2999 + }, + { + "epoch": 0.3405813297572294, + "grad_norm": 0.439453125, + "learning_rate": 0.002, + "loss": 5.3241, + "step": 3000 + }, + { + "epoch": 0.34069485686714845, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.3281, + "step": 3001 + }, + { + "epoch": 0.34080838397706753, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.3485, + "step": 3002 + }, + { + "epoch": 0.3409219110869866, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.338, + "step": 3003 + }, + { + "epoch": 0.3410354381969057, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3191, + "step": 3004 + }, + { + "epoch": 0.34114896530682476, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3209, + "step": 3005 + }, + { + "epoch": 0.34126249241674383, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.308, + "step": 3006 + }, + { + "epoch": 0.3413760195266629, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3311, + "step": 3007 + }, + { + "epoch": 0.341489546636582, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3338, + "step": 3008 + }, + { + "epoch": 0.34160307374650106, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.3379, + "step": 3009 + }, + { + "epoch": 0.34171660085642014, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.3112, + "step": 3010 + }, + { + "epoch": 0.3418301279663392, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3238, + "step": 3011 + }, + { + "epoch": 0.3419436550762583, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3304, + "step": 3012 + }, + { + "epoch": 0.34205718218617737, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3081, + "step": 3013 + }, + { + "epoch": 0.34217070929609644, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3116, + "step": 3014 + }, + { + "epoch": 0.3422842364060155, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3105, + "step": 3015 + }, + { + "epoch": 0.3423977635159346, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3233, + "step": 3016 + }, + { + "epoch": 0.34251129062585367, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2972, + "step": 3017 + }, + { + "epoch": 0.34262481773577275, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.3195, + "step": 3018 + }, + { + "epoch": 0.3427383448456918, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3048, + "step": 3019 + }, + { + "epoch": 0.3428518719556109, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.3248, + "step": 3020 + }, + { + "epoch": 0.34296539906553, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3049, + "step": 3021 + }, + { + "epoch": 0.34307892617544905, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3356, + "step": 3022 + }, + { + "epoch": 0.3431924532853681, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3357, + "step": 3023 + }, + { + "epoch": 0.3433059803952872, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3218, + "step": 3024 + }, + { + "epoch": 0.3434195075052063, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3314, + "step": 3025 + }, + { + "epoch": 0.34353303461512535, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.3361, + "step": 3026 + }, + { + "epoch": 0.34364656172504443, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3372, + "step": 3027 + }, + { + "epoch": 0.3437600888349635, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3272, + "step": 3028 + }, + { + "epoch": 0.3438736159448826, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3178, + "step": 3029 + }, + { + "epoch": 0.34398714305480166, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3228, + "step": 3030 + }, + { + "epoch": 0.34410067016472073, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3156, + "step": 3031 + }, + { + "epoch": 0.3442141972746398, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3386, + "step": 3032 + }, + { + "epoch": 0.3443277243845589, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3128, + "step": 3033 + }, + { + "epoch": 0.34444125149447796, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.309, + "step": 3034 + }, + { + "epoch": 0.34455477860439704, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3168, + "step": 3035 + }, + { + "epoch": 0.3446683057143161, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3249, + "step": 3036 + }, + { + "epoch": 0.3447818328242352, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3292, + "step": 3037 + }, + { + "epoch": 0.34489535993415427, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.3285, + "step": 3038 + }, + { + "epoch": 0.34500888704407334, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.3247, + "step": 3039 + }, + { + "epoch": 0.3451224141539924, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3333, + "step": 3040 + }, + { + "epoch": 0.3452359412639115, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3243, + "step": 3041 + }, + { + "epoch": 0.34534946837383057, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3349, + "step": 3042 + }, + { + "epoch": 0.34546299548374965, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3014, + "step": 3043 + }, + { + "epoch": 0.3455765225936687, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3158, + "step": 3044 + }, + { + "epoch": 0.3456900497035878, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.34, + "step": 3045 + }, + { + "epoch": 0.3458035768135069, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3326, + "step": 3046 + }, + { + "epoch": 0.34591710392342595, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3482, + "step": 3047 + }, + { + "epoch": 0.346030631033345, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3207, + "step": 3048 + }, + { + "epoch": 0.3461441581432641, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3131, + "step": 3049 + }, + { + "epoch": 0.3462576852531832, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3176, + "step": 3050 + }, + { + "epoch": 0.34637121236310225, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3297, + "step": 3051 + }, + { + "epoch": 0.34648473947302133, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3321, + "step": 3052 + }, + { + "epoch": 0.3465982665829404, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.319, + "step": 3053 + }, + { + "epoch": 0.3467117936928595, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.3066, + "step": 3054 + }, + { + "epoch": 0.34682532080277856, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.3165, + "step": 3055 + }, + { + "epoch": 0.34693884791269763, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.3123, + "step": 3056 + }, + { + "epoch": 0.3470523750226167, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3185, + "step": 3057 + }, + { + "epoch": 0.3471659021325358, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3288, + "step": 3058 + }, + { + "epoch": 0.34727942924245486, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3089, + "step": 3059 + }, + { + "epoch": 0.34739295635237394, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.314, + "step": 3060 + }, + { + "epoch": 0.347506483462293, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.314, + "step": 3061 + }, + { + "epoch": 0.3476200105722121, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.3109, + "step": 3062 + }, + { + "epoch": 0.34773353768213117, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.3109, + "step": 3063 + }, + { + "epoch": 0.34784706479205024, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3166, + "step": 3064 + }, + { + "epoch": 0.3479605919019693, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3213, + "step": 3065 + }, + { + "epoch": 0.3480741190118884, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3529, + "step": 3066 + }, + { + "epoch": 0.34818764612180747, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3441, + "step": 3067 + }, + { + "epoch": 0.34830117323172655, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3203, + "step": 3068 + }, + { + "epoch": 0.3484147003416456, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3232, + "step": 3069 + }, + { + "epoch": 0.3485282274515647, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3089, + "step": 3070 + }, + { + "epoch": 0.3486417545614838, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3129, + "step": 3071 + }, + { + "epoch": 0.34875528167140285, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3029, + "step": 3072 + }, + { + "epoch": 0.3488688087813219, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3018, + "step": 3073 + }, + { + "epoch": 0.348982335891241, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.316, + "step": 3074 + }, + { + "epoch": 0.3490958630011601, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2986, + "step": 3075 + }, + { + "epoch": 0.34920939011107915, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.312, + "step": 3076 + }, + { + "epoch": 0.3493229172209983, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3352, + "step": 3077 + }, + { + "epoch": 0.34943644433091736, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3344, + "step": 3078 + }, + { + "epoch": 0.34954997144083644, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.329, + "step": 3079 + }, + { + "epoch": 0.3496634985507555, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.311, + "step": 3080 + }, + { + "epoch": 0.3497770256606746, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3275, + "step": 3081 + }, + { + "epoch": 0.34989055277059367, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3306, + "step": 3082 + }, + { + "epoch": 0.35000407988051274, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3277, + "step": 3083 + }, + { + "epoch": 0.3501176069904318, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.342, + "step": 3084 + }, + { + "epoch": 0.3502311341003509, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3292, + "step": 3085 + }, + { + "epoch": 0.35034466121026997, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3075, + "step": 3086 + }, + { + "epoch": 0.35045818832018905, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.315, + "step": 3087 + }, + { + "epoch": 0.3505717154301081, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3349, + "step": 3088 + }, + { + "epoch": 0.3506852425400272, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3212, + "step": 3089 + }, + { + "epoch": 0.3507987696499463, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3096, + "step": 3090 + }, + { + "epoch": 0.35091229675986535, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.322, + "step": 3091 + }, + { + "epoch": 0.3510258238697844, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3226, + "step": 3092 + }, + { + "epoch": 0.3511393509797035, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3051, + "step": 3093 + }, + { + "epoch": 0.3512528780896226, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3034, + "step": 3094 + }, + { + "epoch": 0.35136640519954165, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3082, + "step": 3095 + }, + { + "epoch": 0.35147993230946073, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3186, + "step": 3096 + }, + { + "epoch": 0.3515934594193798, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.3335, + "step": 3097 + }, + { + "epoch": 0.3517069865292989, + "grad_norm": 0.2421875, + "learning_rate": 0.002, + "loss": 5.2968, + "step": 3098 + }, + { + "epoch": 0.35182051363921796, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.3212, + "step": 3099 + }, + { + "epoch": 0.35193404074913703, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3047, + "step": 3100 + }, + { + "epoch": 0.3520475678590561, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3045, + "step": 3101 + }, + { + "epoch": 0.3521610949689752, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3172, + "step": 3102 + }, + { + "epoch": 0.35227462207889426, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3127, + "step": 3103 + }, + { + "epoch": 0.35238814918881334, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3219, + "step": 3104 + }, + { + "epoch": 0.3525016762987324, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3156, + "step": 3105 + }, + { + "epoch": 0.3526152034086515, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.331, + "step": 3106 + }, + { + "epoch": 0.35272873051857057, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3147, + "step": 3107 + }, + { + "epoch": 0.35284225762848964, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.306, + "step": 3108 + }, + { + "epoch": 0.3529557847384087, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3189, + "step": 3109 + }, + { + "epoch": 0.3530693118483278, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.328, + "step": 3110 + }, + { + "epoch": 0.35318283895824687, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3258, + "step": 3111 + }, + { + "epoch": 0.35329636606816595, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.3152, + "step": 3112 + }, + { + "epoch": 0.353409893178085, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.3245, + "step": 3113 + }, + { + "epoch": 0.3535234202880041, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3179, + "step": 3114 + }, + { + "epoch": 0.3536369473979232, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.3127, + "step": 3115 + }, + { + "epoch": 0.35375047450784225, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3291, + "step": 3116 + }, + { + "epoch": 0.3538640016177613, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3073, + "step": 3117 + }, + { + "epoch": 0.3539775287276804, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3408, + "step": 3118 + }, + { + "epoch": 0.3540910558375995, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3225, + "step": 3119 + }, + { + "epoch": 0.35420458294751855, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3249, + "step": 3120 + }, + { + "epoch": 0.35431811005743763, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3279, + "step": 3121 + }, + { + "epoch": 0.3544316371673567, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3102, + "step": 3122 + }, + { + "epoch": 0.3545451642772758, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3192, + "step": 3123 + }, + { + "epoch": 0.35465869138719486, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3057, + "step": 3124 + }, + { + "epoch": 0.35477221849711393, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.309, + "step": 3125 + }, + { + "epoch": 0.354885745607033, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3038, + "step": 3126 + }, + { + "epoch": 0.3549992727169521, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3342, + "step": 3127 + }, + { + "epoch": 0.35511279982687116, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3238, + "step": 3128 + }, + { + "epoch": 0.35522632693679024, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3152, + "step": 3129 + }, + { + "epoch": 0.3553398540467093, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3117, + "step": 3130 + }, + { + "epoch": 0.3554533811566284, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3244, + "step": 3131 + }, + { + "epoch": 0.35556690826654747, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2969, + "step": 3132 + }, + { + "epoch": 0.35568043537646654, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.3235, + "step": 3133 + }, + { + "epoch": 0.3557939624863856, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.3051, + "step": 3134 + }, + { + "epoch": 0.3559074895963047, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2841, + "step": 3135 + }, + { + "epoch": 0.35602101670622377, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3125, + "step": 3136 + }, + { + "epoch": 0.35613454381614285, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3048, + "step": 3137 + }, + { + "epoch": 0.3562480709260619, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3234, + "step": 3138 + }, + { + "epoch": 0.356361598035981, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3029, + "step": 3139 + }, + { + "epoch": 0.3564751251459001, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.3099, + "step": 3140 + }, + { + "epoch": 0.35658865225581915, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3199, + "step": 3141 + }, + { + "epoch": 0.3567021793657382, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3249, + "step": 3142 + }, + { + "epoch": 0.3568157064756573, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3361, + "step": 3143 + }, + { + "epoch": 0.3569292335855764, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.317, + "step": 3144 + }, + { + "epoch": 0.35704276069549545, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3049, + "step": 3145 + }, + { + "epoch": 0.35715628780541453, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3286, + "step": 3146 + }, + { + "epoch": 0.3572698149153336, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3296, + "step": 3147 + }, + { + "epoch": 0.3573833420252527, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3186, + "step": 3148 + }, + { + "epoch": 0.35749686913517176, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2932, + "step": 3149 + }, + { + "epoch": 0.35761039624509083, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2966, + "step": 3150 + }, + { + "epoch": 0.3577239233550099, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2995, + "step": 3151 + }, + { + "epoch": 0.357837450464929, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3147, + "step": 3152 + }, + { + "epoch": 0.35795097757484806, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3156, + "step": 3153 + }, + { + "epoch": 0.35806450468476714, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2979, + "step": 3154 + }, + { + "epoch": 0.3581780317946862, + "grad_norm": 0.2255859375, + "learning_rate": 0.002, + "loss": 5.3416, + "step": 3155 + }, + { + "epoch": 0.3582915589046053, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.3202, + "step": 3156 + }, + { + "epoch": 0.35840508601452437, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3069, + "step": 3157 + }, + { + "epoch": 0.35851861312444344, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3093, + "step": 3158 + }, + { + "epoch": 0.3586321402343625, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3125, + "step": 3159 + }, + { + "epoch": 0.3587456673442816, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3123, + "step": 3160 + }, + { + "epoch": 0.35885919445420067, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3165, + "step": 3161 + }, + { + "epoch": 0.35897272156411975, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3106, + "step": 3162 + }, + { + "epoch": 0.3590862486740388, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3291, + "step": 3163 + }, + { + "epoch": 0.3591997757839579, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.329, + "step": 3164 + }, + { + "epoch": 0.359313302893877, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3282, + "step": 3165 + }, + { + "epoch": 0.35942683000379605, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.303, + "step": 3166 + }, + { + "epoch": 0.3595403571137151, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.3195, + "step": 3167 + }, + { + "epoch": 0.3596538842236342, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.3166, + "step": 3168 + }, + { + "epoch": 0.3597674113335533, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3063, + "step": 3169 + }, + { + "epoch": 0.35988093844347235, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3031, + "step": 3170 + }, + { + "epoch": 0.35999446555339143, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3288, + "step": 3171 + }, + { + "epoch": 0.3601079926633105, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3227, + "step": 3172 + }, + { + "epoch": 0.3602215197732296, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.3279, + "step": 3173 + }, + { + "epoch": 0.36033504688314866, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.326, + "step": 3174 + }, + { + "epoch": 0.36044857399306773, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3191, + "step": 3175 + }, + { + "epoch": 0.3605621011029868, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3155, + "step": 3176 + }, + { + "epoch": 0.3606756282129059, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3063, + "step": 3177 + }, + { + "epoch": 0.36078915532282496, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3223, + "step": 3178 + }, + { + "epoch": 0.36090268243274404, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3163, + "step": 3179 + }, + { + "epoch": 0.3610162095426631, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3038, + "step": 3180 + }, + { + "epoch": 0.3611297366525822, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3241, + "step": 3181 + }, + { + "epoch": 0.36124326376250127, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3181, + "step": 3182 + }, + { + "epoch": 0.36135679087242034, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3181, + "step": 3183 + }, + { + "epoch": 0.3614703179823394, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3372, + "step": 3184 + }, + { + "epoch": 0.3615838450922585, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3051, + "step": 3185 + }, + { + "epoch": 0.36169737220217757, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.309, + "step": 3186 + }, + { + "epoch": 0.36181089931209665, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3042, + "step": 3187 + }, + { + "epoch": 0.3619244264220157, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2981, + "step": 3188 + }, + { + "epoch": 0.3620379535319348, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3048, + "step": 3189 + }, + { + "epoch": 0.3621514806418539, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3262, + "step": 3190 + }, + { + "epoch": 0.36226500775177295, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3297, + "step": 3191 + }, + { + "epoch": 0.362378534861692, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3192, + "step": 3192 + }, + { + "epoch": 0.3624920619716111, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3149, + "step": 3193 + }, + { + "epoch": 0.3626055890815302, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3346, + "step": 3194 + }, + { + "epoch": 0.36271911619144925, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3045, + "step": 3195 + }, + { + "epoch": 0.36283264330136833, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2903, + "step": 3196 + }, + { + "epoch": 0.3629461704112874, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3244, + "step": 3197 + }, + { + "epoch": 0.36305969752120654, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2937, + "step": 3198 + }, + { + "epoch": 0.3631732246311256, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3175, + "step": 3199 + }, + { + "epoch": 0.3632867517410447, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3192, + "step": 3200 + }, + { + "epoch": 0.36340027885096376, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.305, + "step": 3201 + }, + { + "epoch": 0.36351380596088284, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.3004, + "step": 3202 + }, + { + "epoch": 0.3636273330708019, + "grad_norm": 0.2421875, + "learning_rate": 0.002, + "loss": 5.3369, + "step": 3203 + }, + { + "epoch": 0.363740860180721, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3221, + "step": 3204 + }, + { + "epoch": 0.36385438729064007, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3127, + "step": 3205 + }, + { + "epoch": 0.36396791440055914, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.2887, + "step": 3206 + }, + { + "epoch": 0.3640814415104782, + "grad_norm": 0.462890625, + "learning_rate": 0.002, + "loss": 5.3157, + "step": 3207 + }, + { + "epoch": 0.3641949686203973, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.3085, + "step": 3208 + }, + { + "epoch": 0.3643084957303164, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.3228, + "step": 3209 + }, + { + "epoch": 0.36442202284023545, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3223, + "step": 3210 + }, + { + "epoch": 0.3645355499501545, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3314, + "step": 3211 + }, + { + "epoch": 0.3646490770600736, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3099, + "step": 3212 + }, + { + "epoch": 0.3647626041699927, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3108, + "step": 3213 + }, + { + "epoch": 0.36487613127991175, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.3112, + "step": 3214 + }, + { + "epoch": 0.36498965838983083, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3203, + "step": 3215 + }, + { + "epoch": 0.3651031854997499, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2962, + "step": 3216 + }, + { + "epoch": 0.365216712609669, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3033, + "step": 3217 + }, + { + "epoch": 0.36533023971958806, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.323, + "step": 3218 + }, + { + "epoch": 0.36544376682950713, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3085, + "step": 3219 + }, + { + "epoch": 0.3655572939394262, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3226, + "step": 3220 + }, + { + "epoch": 0.3656708210493453, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.3234, + "step": 3221 + }, + { + "epoch": 0.36578434815926436, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.3133, + "step": 3222 + }, + { + "epoch": 0.36589787526918344, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3283, + "step": 3223 + }, + { + "epoch": 0.3660114023791025, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3181, + "step": 3224 + }, + { + "epoch": 0.3661249294890216, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2906, + "step": 3225 + }, + { + "epoch": 0.36623845659894066, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.311, + "step": 3226 + }, + { + "epoch": 0.36635198370885974, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3213, + "step": 3227 + }, + { + "epoch": 0.3664655108187788, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3002, + "step": 3228 + }, + { + "epoch": 0.3665790379286979, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3069, + "step": 3229 + }, + { + "epoch": 0.36669256503861697, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2891, + "step": 3230 + }, + { + "epoch": 0.36680609214853604, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3018, + "step": 3231 + }, + { + "epoch": 0.3669196192584551, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2983, + "step": 3232 + }, + { + "epoch": 0.3670331463683742, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3005, + "step": 3233 + }, + { + "epoch": 0.3671466734782933, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3322, + "step": 3234 + }, + { + "epoch": 0.36726020058821235, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3175, + "step": 3235 + }, + { + "epoch": 0.3673737276981314, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3278, + "step": 3236 + }, + { + "epoch": 0.3674872548080505, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.3255, + "step": 3237 + }, + { + "epoch": 0.3676007819179696, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.313, + "step": 3238 + }, + { + "epoch": 0.36771430902788865, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.3094, + "step": 3239 + }, + { + "epoch": 0.36782783613780773, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3037, + "step": 3240 + }, + { + "epoch": 0.3679413632477268, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3215, + "step": 3241 + }, + { + "epoch": 0.3680548903576459, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2966, + "step": 3242 + }, + { + "epoch": 0.36816841746756496, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2997, + "step": 3243 + }, + { + "epoch": 0.36828194457748403, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3044, + "step": 3244 + }, + { + "epoch": 0.3683954716874031, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2939, + "step": 3245 + }, + { + "epoch": 0.3685089987973222, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3082, + "step": 3246 + }, + { + "epoch": 0.36862252590724126, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3151, + "step": 3247 + }, + { + "epoch": 0.36873605301716034, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3188, + "step": 3248 + }, + { + "epoch": 0.3688495801270794, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.316, + "step": 3249 + }, + { + "epoch": 0.3689631072369985, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2889, + "step": 3250 + }, + { + "epoch": 0.36907663434691756, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2942, + "step": 3251 + }, + { + "epoch": 0.36919016145683664, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.3072, + "step": 3252 + }, + { + "epoch": 0.3693036885667557, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.32, + "step": 3253 + }, + { + "epoch": 0.3694172156766748, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3133, + "step": 3254 + }, + { + "epoch": 0.36953074278659387, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3143, + "step": 3255 + }, + { + "epoch": 0.36964426989651294, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.3153, + "step": 3256 + }, + { + "epoch": 0.369757797006432, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.3181, + "step": 3257 + }, + { + "epoch": 0.3698713241163511, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.3032, + "step": 3258 + }, + { + "epoch": 0.3699848512262702, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.2921, + "step": 3259 + }, + { + "epoch": 0.37009837833618925, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.3105, + "step": 3260 + }, + { + "epoch": 0.3702119054461083, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3234, + "step": 3261 + }, + { + "epoch": 0.3703254325560274, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3226, + "step": 3262 + }, + { + "epoch": 0.3704389596659465, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3033, + "step": 3263 + }, + { + "epoch": 0.37055248677586555, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3234, + "step": 3264 + }, + { + "epoch": 0.37066601388578463, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3089, + "step": 3265 + }, + { + "epoch": 0.3707795409957037, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3144, + "step": 3266 + }, + { + "epoch": 0.3708930681056228, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3109, + "step": 3267 + }, + { + "epoch": 0.37100659521554186, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.3064, + "step": 3268 + }, + { + "epoch": 0.37112012232546093, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.312, + "step": 3269 + }, + { + "epoch": 0.37123364943538, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.317, + "step": 3270 + }, + { + "epoch": 0.3713471765452991, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3365, + "step": 3271 + }, + { + "epoch": 0.37146070365521816, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2891, + "step": 3272 + }, + { + "epoch": 0.37157423076513724, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3191, + "step": 3273 + }, + { + "epoch": 0.3716877578750563, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3033, + "step": 3274 + }, + { + "epoch": 0.3718012849849754, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3263, + "step": 3275 + }, + { + "epoch": 0.37191481209489446, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.3138, + "step": 3276 + }, + { + "epoch": 0.37202833920481354, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3031, + "step": 3277 + }, + { + "epoch": 0.3721418663147326, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3059, + "step": 3278 + }, + { + "epoch": 0.3722553934246517, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2929, + "step": 3279 + }, + { + "epoch": 0.37236892053457077, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.317, + "step": 3280 + }, + { + "epoch": 0.37248244764448984, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3127, + "step": 3281 + }, + { + "epoch": 0.3725959747544089, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3216, + "step": 3282 + }, + { + "epoch": 0.372709501864328, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2894, + "step": 3283 + }, + { + "epoch": 0.3728230289742471, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2889, + "step": 3284 + }, + { + "epoch": 0.37293655608416615, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3017, + "step": 3285 + }, + { + "epoch": 0.3730500831940852, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3205, + "step": 3286 + }, + { + "epoch": 0.3731636103040043, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3003, + "step": 3287 + }, + { + "epoch": 0.3732771374139234, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3238, + "step": 3288 + }, + { + "epoch": 0.37339066452384245, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2894, + "step": 3289 + }, + { + "epoch": 0.37350419163376153, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3041, + "step": 3290 + }, + { + "epoch": 0.3736177187436806, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2978, + "step": 3291 + }, + { + "epoch": 0.3737312458535997, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3051, + "step": 3292 + }, + { + "epoch": 0.37384477296351876, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3247, + "step": 3293 + }, + { + "epoch": 0.37395830007343783, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2791, + "step": 3294 + }, + { + "epoch": 0.3740718271833569, + "grad_norm": 0.23046875, + "learning_rate": 0.002, + "loss": 5.3095, + "step": 3295 + }, + { + "epoch": 0.374185354293276, + "grad_norm": 0.2236328125, + "learning_rate": 0.002, + "loss": 5.2932, + "step": 3296 + }, + { + "epoch": 0.37429888140319506, + "grad_norm": 0.22265625, + "learning_rate": 0.002, + "loss": 5.3145, + "step": 3297 + }, + { + "epoch": 0.37441240851311414, + "grad_norm": 0.2197265625, + "learning_rate": 0.002, + "loss": 5.3071, + "step": 3298 + }, + { + "epoch": 0.3745259356230332, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2972, + "step": 3299 + }, + { + "epoch": 0.3746394627329523, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3253, + "step": 3300 + }, + { + "epoch": 0.37475298984287136, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3118, + "step": 3301 + }, + { + "epoch": 0.37486651695279044, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2949, + "step": 3302 + }, + { + "epoch": 0.3749800440627095, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3095, + "step": 3303 + }, + { + "epoch": 0.3750935711726286, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.307, + "step": 3304 + }, + { + "epoch": 0.37520709828254767, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3159, + "step": 3305 + }, + { + "epoch": 0.37532062539246674, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2928, + "step": 3306 + }, + { + "epoch": 0.3754341525023858, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.3079, + "step": 3307 + }, + { + "epoch": 0.3755476796123049, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.3211, + "step": 3308 + }, + { + "epoch": 0.375661206722224, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.3178, + "step": 3309 + }, + { + "epoch": 0.37577473383214305, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2976, + "step": 3310 + }, + { + "epoch": 0.3758882609420621, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.3059, + "step": 3311 + }, + { + "epoch": 0.3760017880519812, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3111, + "step": 3312 + }, + { + "epoch": 0.3761153151619003, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3226, + "step": 3313 + }, + { + "epoch": 0.37622884227181935, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3025, + "step": 3314 + }, + { + "epoch": 0.37634236938173843, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2729, + "step": 3315 + }, + { + "epoch": 0.3764558964916575, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.296, + "step": 3316 + }, + { + "epoch": 0.3765694236015766, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.306, + "step": 3317 + }, + { + "epoch": 0.37668295071149566, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3082, + "step": 3318 + }, + { + "epoch": 0.3767964778214148, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3173, + "step": 3319 + }, + { + "epoch": 0.37691000493133386, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3247, + "step": 3320 + }, + { + "epoch": 0.37702353204125294, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3139, + "step": 3321 + }, + { + "epoch": 0.377137059151172, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3161, + "step": 3322 + }, + { + "epoch": 0.3772505862610911, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3052, + "step": 3323 + }, + { + "epoch": 0.37736411337101017, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2994, + "step": 3324 + }, + { + "epoch": 0.37747764048092924, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3116, + "step": 3325 + }, + { + "epoch": 0.3775911675908483, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3029, + "step": 3326 + }, + { + "epoch": 0.3777046947007674, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.317, + "step": 3327 + }, + { + "epoch": 0.37781822181068647, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3163, + "step": 3328 + }, + { + "epoch": 0.37793174892060555, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3218, + "step": 3329 + }, + { + "epoch": 0.3780452760305246, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3068, + "step": 3330 + }, + { + "epoch": 0.3781588031404437, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2996, + "step": 3331 + }, + { + "epoch": 0.3782723302503628, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3009, + "step": 3332 + }, + { + "epoch": 0.37838585736028185, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3158, + "step": 3333 + }, + { + "epoch": 0.37849938447020093, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2897, + "step": 3334 + }, + { + "epoch": 0.37861291158012, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2965, + "step": 3335 + }, + { + "epoch": 0.3787264386900391, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.3016, + "step": 3336 + }, + { + "epoch": 0.37883996579995816, + "grad_norm": 0.2294921875, + "learning_rate": 0.002, + "loss": 5.3235, + "step": 3337 + }, + { + "epoch": 0.37895349290987723, + "grad_norm": 0.234375, + "learning_rate": 0.002, + "loss": 5.2885, + "step": 3338 + }, + { + "epoch": 0.3790670200197963, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.3143, + "step": 3339 + }, + { + "epoch": 0.3791805471297154, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2938, + "step": 3340 + }, + { + "epoch": 0.37929407423963446, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3177, + "step": 3341 + }, + { + "epoch": 0.37940760134955354, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.3067, + "step": 3342 + }, + { + "epoch": 0.3795211284594726, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3083, + "step": 3343 + }, + { + "epoch": 0.3796346555693917, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.3052, + "step": 3344 + }, + { + "epoch": 0.37974818267931076, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.2903, + "step": 3345 + }, + { + "epoch": 0.37986170978922984, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.3206, + "step": 3346 + }, + { + "epoch": 0.3799752368991489, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.3173, + "step": 3347 + }, + { + "epoch": 0.380088764009068, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3131, + "step": 3348 + }, + { + "epoch": 0.38020229111898707, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3136, + "step": 3349 + }, + { + "epoch": 0.38031581822890614, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3236, + "step": 3350 + }, + { + "epoch": 0.3804293453388252, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3102, + "step": 3351 + }, + { + "epoch": 0.3805428724487443, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3174, + "step": 3352 + }, + { + "epoch": 0.38065639955866337, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.3144, + "step": 3353 + }, + { + "epoch": 0.38076992666858245, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.2903, + "step": 3354 + }, + { + "epoch": 0.3808834537785015, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3128, + "step": 3355 + }, + { + "epoch": 0.3809969808884206, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2917, + "step": 3356 + }, + { + "epoch": 0.3811105079983397, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.3145, + "step": 3357 + }, + { + "epoch": 0.38122403510825875, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2955, + "step": 3358 + }, + { + "epoch": 0.38133756221817783, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.2991, + "step": 3359 + }, + { + "epoch": 0.3814510893280969, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.3034, + "step": 3360 + }, + { + "epoch": 0.381564616438016, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3114, + "step": 3361 + }, + { + "epoch": 0.38167814354793506, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3143, + "step": 3362 + }, + { + "epoch": 0.38179167065785413, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2907, + "step": 3363 + }, + { + "epoch": 0.3819051977677732, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3209, + "step": 3364 + }, + { + "epoch": 0.3820187248776923, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2979, + "step": 3365 + }, + { + "epoch": 0.38213225198761136, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2874, + "step": 3366 + }, + { + "epoch": 0.38224577909753044, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.3126, + "step": 3367 + }, + { + "epoch": 0.3823593062074495, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2988, + "step": 3368 + }, + { + "epoch": 0.3824728333173686, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.3126, + "step": 3369 + }, + { + "epoch": 0.38258636042728766, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3096, + "step": 3370 + }, + { + "epoch": 0.38269988753720674, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.3, + "step": 3371 + }, + { + "epoch": 0.3828134146471258, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2919, + "step": 3372 + }, + { + "epoch": 0.3829269417570449, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3093, + "step": 3373 + }, + { + "epoch": 0.38304046886696397, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3092, + "step": 3374 + }, + { + "epoch": 0.38315399597688304, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3262, + "step": 3375 + }, + { + "epoch": 0.3832675230868021, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.302, + "step": 3376 + }, + { + "epoch": 0.3833810501967212, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.3078, + "step": 3377 + }, + { + "epoch": 0.38349457730664027, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2978, + "step": 3378 + }, + { + "epoch": 0.38360810441655935, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3158, + "step": 3379 + }, + { + "epoch": 0.3837216315264784, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3061, + "step": 3380 + }, + { + "epoch": 0.3838351586363975, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2861, + "step": 3381 + }, + { + "epoch": 0.3839486857463166, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.3241, + "step": 3382 + }, + { + "epoch": 0.38406221285623565, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.2907, + "step": 3383 + }, + { + "epoch": 0.38417573996615473, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.309, + "step": 3384 + }, + { + "epoch": 0.3842892670760738, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.3131, + "step": 3385 + }, + { + "epoch": 0.3844027941859929, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.3057, + "step": 3386 + }, + { + "epoch": 0.38451632129591196, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3159, + "step": 3387 + }, + { + "epoch": 0.38462984840583103, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3073, + "step": 3388 + }, + { + "epoch": 0.3847433755157501, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3024, + "step": 3389 + }, + { + "epoch": 0.3848569026256692, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3007, + "step": 3390 + }, + { + "epoch": 0.38497042973558826, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3183, + "step": 3391 + }, + { + "epoch": 0.38508395684550734, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3184, + "step": 3392 + }, + { + "epoch": 0.3851974839554264, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2942, + "step": 3393 + }, + { + "epoch": 0.3853110110653455, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.306, + "step": 3394 + }, + { + "epoch": 0.38542453817526456, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.306, + "step": 3395 + }, + { + "epoch": 0.38553806528518364, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2898, + "step": 3396 + }, + { + "epoch": 0.3856515923951027, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3073, + "step": 3397 + }, + { + "epoch": 0.3857651195050218, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2993, + "step": 3398 + }, + { + "epoch": 0.38587864661494087, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.316, + "step": 3399 + }, + { + "epoch": 0.38599217372485994, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2966, + "step": 3400 + }, + { + "epoch": 0.386105700834779, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.2976, + "step": 3401 + }, + { + "epoch": 0.3862192279446981, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2977, + "step": 3402 + }, + { + "epoch": 0.38633275505461717, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3054, + "step": 3403 + }, + { + "epoch": 0.38644628216453625, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3014, + "step": 3404 + }, + { + "epoch": 0.3865598092744553, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2918, + "step": 3405 + }, + { + "epoch": 0.3866733363843744, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2981, + "step": 3406 + }, + { + "epoch": 0.3867868634942935, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3097, + "step": 3407 + }, + { + "epoch": 0.38690039060421255, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.302, + "step": 3408 + }, + { + "epoch": 0.38701391771413163, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.3007, + "step": 3409 + }, + { + "epoch": 0.3871274448240507, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3025, + "step": 3410 + }, + { + "epoch": 0.3872409719339698, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3125, + "step": 3411 + }, + { + "epoch": 0.38735449904388886, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3009, + "step": 3412 + }, + { + "epoch": 0.38746802615380793, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2904, + "step": 3413 + }, + { + "epoch": 0.387581553263727, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3, + "step": 3414 + }, + { + "epoch": 0.3876950803736461, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3193, + "step": 3415 + }, + { + "epoch": 0.38780860748356516, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3055, + "step": 3416 + }, + { + "epoch": 0.38792213459348424, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3106, + "step": 3417 + }, + { + "epoch": 0.3880356617034033, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2826, + "step": 3418 + }, + { + "epoch": 0.3881491888133224, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2808, + "step": 3419 + }, + { + "epoch": 0.38826271592324146, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3091, + "step": 3420 + }, + { + "epoch": 0.38837624303316054, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.308, + "step": 3421 + }, + { + "epoch": 0.3884897701430796, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3116, + "step": 3422 + }, + { + "epoch": 0.3886032972529987, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2926, + "step": 3423 + }, + { + "epoch": 0.38871682436291777, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3214, + "step": 3424 + }, + { + "epoch": 0.38883035147283684, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.32, + "step": 3425 + }, + { + "epoch": 0.3889438785827559, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3081, + "step": 3426 + }, + { + "epoch": 0.389057405692675, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.3099, + "step": 3427 + }, + { + "epoch": 0.38917093280259407, + "grad_norm": 0.2333984375, + "learning_rate": 0.002, + "loss": 5.3099, + "step": 3428 + }, + { + "epoch": 0.38928445991251315, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3123, + "step": 3429 + }, + { + "epoch": 0.3893979870224322, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3189, + "step": 3430 + }, + { + "epoch": 0.3895115141323513, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2955, + "step": 3431 + }, + { + "epoch": 0.3896250412422704, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2989, + "step": 3432 + }, + { + "epoch": 0.38973856835218945, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3033, + "step": 3433 + }, + { + "epoch": 0.3898520954621085, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3075, + "step": 3434 + }, + { + "epoch": 0.3899656225720276, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3146, + "step": 3435 + }, + { + "epoch": 0.3900791496819467, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2914, + "step": 3436 + }, + { + "epoch": 0.39019267679186576, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3067, + "step": 3437 + }, + { + "epoch": 0.39030620390178483, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.2906, + "step": 3438 + }, + { + "epoch": 0.39041973101170396, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.296, + "step": 3439 + }, + { + "epoch": 0.39053325812162304, + "grad_norm": 0.2275390625, + "learning_rate": 0.002, + "loss": 5.3257, + "step": 3440 + }, + { + "epoch": 0.3906467852315421, + "grad_norm": 0.23046875, + "learning_rate": 0.002, + "loss": 5.3018, + "step": 3441 + }, + { + "epoch": 0.3907603123414612, + "grad_norm": 0.21875, + "learning_rate": 0.002, + "loss": 5.2783, + "step": 3442 + }, + { + "epoch": 0.39087383945138027, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.3081, + "step": 3443 + }, + { + "epoch": 0.39098736656129934, + "grad_norm": 0.23046875, + "learning_rate": 0.002, + "loss": 5.268, + "step": 3444 + }, + { + "epoch": 0.3911008936712184, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.3024, + "step": 3445 + }, + { + "epoch": 0.3912144207811375, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.286, + "step": 3446 + }, + { + "epoch": 0.39132794789105657, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.3, + "step": 3447 + }, + { + "epoch": 0.39144147500097565, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.3078, + "step": 3448 + }, + { + "epoch": 0.3915550021108947, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.3012, + "step": 3449 + }, + { + "epoch": 0.3916685292208138, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.3216, + "step": 3450 + }, + { + "epoch": 0.3917820563307329, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.2945, + "step": 3451 + }, + { + "epoch": 0.39189558344065195, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.299, + "step": 3452 + }, + { + "epoch": 0.392009110550571, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2933, + "step": 3453 + }, + { + "epoch": 0.3921226376604901, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3217, + "step": 3454 + }, + { + "epoch": 0.3922361647704092, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.3, + "step": 3455 + }, + { + "epoch": 0.39234969188032826, + "grad_norm": 0.470703125, + "learning_rate": 0.002, + "loss": 5.2957, + "step": 3456 + }, + { + "epoch": 0.39246321899024733, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.3054, + "step": 3457 + }, + { + "epoch": 0.3925767461001664, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.2942, + "step": 3458 + }, + { + "epoch": 0.3926902732100855, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2916, + "step": 3459 + }, + { + "epoch": 0.39280380032000456, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.3069, + "step": 3460 + }, + { + "epoch": 0.39291732742992364, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.3339, + "step": 3461 + }, + { + "epoch": 0.3930308545398427, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2988, + "step": 3462 + }, + { + "epoch": 0.3931443816497618, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2844, + "step": 3463 + }, + { + "epoch": 0.39325790875968086, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.293, + "step": 3464 + }, + { + "epoch": 0.39337143586959994, + "grad_norm": 0.234375, + "learning_rate": 0.002, + "loss": 5.3167, + "step": 3465 + }, + { + "epoch": 0.393484962979519, + "grad_norm": 0.248046875, + "learning_rate": 0.002, + "loss": 5.2966, + "step": 3466 + }, + { + "epoch": 0.3935984900894381, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.3009, + "step": 3467 + }, + { + "epoch": 0.39371201719935717, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.3213, + "step": 3468 + }, + { + "epoch": 0.39382554430927624, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3021, + "step": 3469 + }, + { + "epoch": 0.3939390714191953, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3151, + "step": 3470 + }, + { + "epoch": 0.3940525985291144, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3003, + "step": 3471 + }, + { + "epoch": 0.39416612563903347, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2806, + "step": 3472 + }, + { + "epoch": 0.39427965274895255, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2983, + "step": 3473 + }, + { + "epoch": 0.3943931798588716, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2978, + "step": 3474 + }, + { + "epoch": 0.3945067069687907, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3034, + "step": 3475 + }, + { + "epoch": 0.3946202340787098, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.3052, + "step": 3476 + }, + { + "epoch": 0.39473376118862885, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.3185, + "step": 3477 + }, + { + "epoch": 0.3948472882985479, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3108, + "step": 3478 + }, + { + "epoch": 0.394960815408467, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3001, + "step": 3479 + }, + { + "epoch": 0.3950743425183861, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3039, + "step": 3480 + }, + { + "epoch": 0.39518786962830516, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3097, + "step": 3481 + }, + { + "epoch": 0.39530139673822423, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.305, + "step": 3482 + }, + { + "epoch": 0.3954149238481433, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2938, + "step": 3483 + }, + { + "epoch": 0.3955284509580624, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2985, + "step": 3484 + }, + { + "epoch": 0.39564197806798146, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.3001, + "step": 3485 + }, + { + "epoch": 0.39575550517790054, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2935, + "step": 3486 + }, + { + "epoch": 0.3958690322878196, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2927, + "step": 3487 + }, + { + "epoch": 0.3959825593977387, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3173, + "step": 3488 + }, + { + "epoch": 0.39609608650765776, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.298, + "step": 3489 + }, + { + "epoch": 0.39620961361757684, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3123, + "step": 3490 + }, + { + "epoch": 0.3963231407274959, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2835, + "step": 3491 + }, + { + "epoch": 0.396436667837415, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2936, + "step": 3492 + }, + { + "epoch": 0.39655019494733407, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2981, + "step": 3493 + }, + { + "epoch": 0.39666372205725314, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3031, + "step": 3494 + }, + { + "epoch": 0.3967772491671722, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2965, + "step": 3495 + }, + { + "epoch": 0.3968907762770913, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.3012, + "step": 3496 + }, + { + "epoch": 0.39700430338701037, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.29, + "step": 3497 + }, + { + "epoch": 0.39711783049692945, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.3111, + "step": 3498 + }, + { + "epoch": 0.3972313576068485, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3024, + "step": 3499 + }, + { + "epoch": 0.3973448847167676, + "grad_norm": 0.2421875, + "learning_rate": 0.002, + "loss": 5.3032, + "step": 3500 + }, + { + "epoch": 0.3974584118266867, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3017, + "step": 3501 + }, + { + "epoch": 0.39757193893660575, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3034, + "step": 3502 + }, + { + "epoch": 0.3976854660465248, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3329, + "step": 3503 + }, + { + "epoch": 0.3977989931564439, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3098, + "step": 3504 + }, + { + "epoch": 0.397912520266363, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.295, + "step": 3505 + }, + { + "epoch": 0.39802604737628206, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3028, + "step": 3506 + }, + { + "epoch": 0.39813957448620113, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2981, + "step": 3507 + }, + { + "epoch": 0.3982531015961202, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2944, + "step": 3508 + }, + { + "epoch": 0.3983666287060393, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3039, + "step": 3509 + }, + { + "epoch": 0.39848015581595836, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.285, + "step": 3510 + }, + { + "epoch": 0.39859368292587743, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.317, + "step": 3511 + }, + { + "epoch": 0.3987072100357965, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2916, + "step": 3512 + }, + { + "epoch": 0.3988207371457156, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.3098, + "step": 3513 + }, + { + "epoch": 0.39893426425563466, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.3116, + "step": 3514 + }, + { + "epoch": 0.39904779136555374, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.3072, + "step": 3515 + }, + { + "epoch": 0.3991613184754728, + "grad_norm": 0.2275390625, + "learning_rate": 0.002, + "loss": 5.2904, + "step": 3516 + }, + { + "epoch": 0.3992748455853919, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2925, + "step": 3517 + }, + { + "epoch": 0.39938837269531097, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2987, + "step": 3518 + }, + { + "epoch": 0.39950189980523004, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.301, + "step": 3519 + }, + { + "epoch": 0.3996154269151491, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2903, + "step": 3520 + }, + { + "epoch": 0.3997289540250682, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.3041, + "step": 3521 + }, + { + "epoch": 0.39984248113498727, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.3045, + "step": 3522 + }, + { + "epoch": 0.39995600824490635, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2957, + "step": 3523 + }, + { + "epoch": 0.4000695353548254, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.3071, + "step": 3524 + }, + { + "epoch": 0.4001830624647445, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2996, + "step": 3525 + }, + { + "epoch": 0.4002965895746636, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3032, + "step": 3526 + }, + { + "epoch": 0.40041011668458265, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2764, + "step": 3527 + }, + { + "epoch": 0.4005236437945017, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2986, + "step": 3528 + }, + { + "epoch": 0.4006371709044208, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3104, + "step": 3529 + }, + { + "epoch": 0.4007506980143399, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2727, + "step": 3530 + }, + { + "epoch": 0.40086422512425895, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.293, + "step": 3531 + }, + { + "epoch": 0.40097775223417803, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.3032, + "step": 3532 + }, + { + "epoch": 0.4010912793440971, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2937, + "step": 3533 + }, + { + "epoch": 0.4012048064540162, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3117, + "step": 3534 + }, + { + "epoch": 0.40131833356393526, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3185, + "step": 3535 + }, + { + "epoch": 0.40143186067385433, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2879, + "step": 3536 + }, + { + "epoch": 0.4015453877837734, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2881, + "step": 3537 + }, + { + "epoch": 0.4016589148936925, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3153, + "step": 3538 + }, + { + "epoch": 0.40177244200361156, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3068, + "step": 3539 + }, + { + "epoch": 0.40188596911353064, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2959, + "step": 3540 + }, + { + "epoch": 0.4019994962234497, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.3149, + "step": 3541 + }, + { + "epoch": 0.4021130233333688, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.3011, + "step": 3542 + }, + { + "epoch": 0.40222655044328787, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.307, + "step": 3543 + }, + { + "epoch": 0.40234007755320694, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.278, + "step": 3544 + }, + { + "epoch": 0.402453604663126, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2931, + "step": 3545 + }, + { + "epoch": 0.4025671317730451, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2876, + "step": 3546 + }, + { + "epoch": 0.40268065888296417, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.3264, + "step": 3547 + }, + { + "epoch": 0.40279418599288325, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3071, + "step": 3548 + }, + { + "epoch": 0.4029077131028023, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3006, + "step": 3549 + }, + { + "epoch": 0.4030212402127214, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2991, + "step": 3550 + }, + { + "epoch": 0.4031347673226405, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.3044, + "step": 3551 + }, + { + "epoch": 0.40324829443255955, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.284, + "step": 3552 + }, + { + "epoch": 0.4033618215424786, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2986, + "step": 3553 + }, + { + "epoch": 0.4034753486523977, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3029, + "step": 3554 + }, + { + "epoch": 0.4035888757623168, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2868, + "step": 3555 + }, + { + "epoch": 0.40370240287223585, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3079, + "step": 3556 + }, + { + "epoch": 0.40381592998215493, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3003, + "step": 3557 + }, + { + "epoch": 0.403929457092074, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3022, + "step": 3558 + }, + { + "epoch": 0.4040429842019931, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.2988, + "step": 3559 + }, + { + "epoch": 0.4041565113119122, + "grad_norm": 0.232421875, + "learning_rate": 0.002, + "loss": 5.3062, + "step": 3560 + }, + { + "epoch": 0.4042700384218313, + "grad_norm": 0.2275390625, + "learning_rate": 0.002, + "loss": 5.2976, + "step": 3561 + }, + { + "epoch": 0.40438356553175037, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2917, + "step": 3562 + }, + { + "epoch": 0.40449709264166944, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2905, + "step": 3563 + }, + { + "epoch": 0.4046106197515885, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.297, + "step": 3564 + }, + { + "epoch": 0.4047241468615076, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3048, + "step": 3565 + }, + { + "epoch": 0.40483767397142667, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2934, + "step": 3566 + }, + { + "epoch": 0.40495120108134575, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.312, + "step": 3567 + }, + { + "epoch": 0.4050647281912648, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3029, + "step": 3568 + }, + { + "epoch": 0.4051782553011839, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2987, + "step": 3569 + }, + { + "epoch": 0.405291782411103, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3009, + "step": 3570 + }, + { + "epoch": 0.40540530952102205, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2972, + "step": 3571 + }, + { + "epoch": 0.4055188366309411, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.294, + "step": 3572 + }, + { + "epoch": 0.4056323637408602, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.308, + "step": 3573 + }, + { + "epoch": 0.4057458908507793, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2801, + "step": 3574 + }, + { + "epoch": 0.40585941796069835, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2828, + "step": 3575 + }, + { + "epoch": 0.40597294507061743, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2875, + "step": 3576 + }, + { + "epoch": 0.4060864721805365, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.3125, + "step": 3577 + }, + { + "epoch": 0.4061999992904556, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2972, + "step": 3578 + }, + { + "epoch": 0.40631352640037466, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.3157, + "step": 3579 + }, + { + "epoch": 0.40642705351029373, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3299, + "step": 3580 + }, + { + "epoch": 0.4065405806202128, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2951, + "step": 3581 + }, + { + "epoch": 0.4066541077301319, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2847, + "step": 3582 + }, + { + "epoch": 0.40676763484005096, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2937, + "step": 3583 + }, + { + "epoch": 0.40688116194997004, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2945, + "step": 3584 + }, + { + "epoch": 0.4069946890598891, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3009, + "step": 3585 + }, + { + "epoch": 0.4071082161698082, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2816, + "step": 3586 + }, + { + "epoch": 0.40722174327972727, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.286, + "step": 3587 + }, + { + "epoch": 0.40733527038964634, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2799, + "step": 3588 + }, + { + "epoch": 0.4074487974995654, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3011, + "step": 3589 + }, + { + "epoch": 0.4075623246094845, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.3049, + "step": 3590 + }, + { + "epoch": 0.40767585171940357, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2948, + "step": 3591 + }, + { + "epoch": 0.40778937882932265, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2978, + "step": 3592 + }, + { + "epoch": 0.4079029059392417, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.3154, + "step": 3593 + }, + { + "epoch": 0.4080164330491608, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3182, + "step": 3594 + }, + { + "epoch": 0.4081299601590799, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2984, + "step": 3595 + }, + { + "epoch": 0.40824348726899895, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3003, + "step": 3596 + }, + { + "epoch": 0.408357014378918, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2951, + "step": 3597 + }, + { + "epoch": 0.4084705414888371, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.3, + "step": 3598 + }, + { + "epoch": 0.4085840685987562, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2876, + "step": 3599 + }, + { + "epoch": 0.40869759570867525, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.3136, + "step": 3600 + }, + { + "epoch": 0.40881112281859433, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2939, + "step": 3601 + }, + { + "epoch": 0.4089246499285134, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.3082, + "step": 3602 + }, + { + "epoch": 0.4090381770384325, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.3045, + "step": 3603 + }, + { + "epoch": 0.40915170414835156, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.3018, + "step": 3604 + }, + { + "epoch": 0.40926523125827063, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2978, + "step": 3605 + }, + { + "epoch": 0.4093787583681897, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.3034, + "step": 3606 + }, + { + "epoch": 0.4094922854781088, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3036, + "step": 3607 + }, + { + "epoch": 0.40960581258802786, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2857, + "step": 3608 + }, + { + "epoch": 0.40971933969794694, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2897, + "step": 3609 + }, + { + "epoch": 0.409832866807866, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.294, + "step": 3610 + }, + { + "epoch": 0.4099463939177851, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.3061, + "step": 3611 + }, + { + "epoch": 0.41005992102770417, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2969, + "step": 3612 + }, + { + "epoch": 0.41017344813762324, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2939, + "step": 3613 + }, + { + "epoch": 0.4102869752475423, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2876, + "step": 3614 + }, + { + "epoch": 0.4104005023574614, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.3014, + "step": 3615 + }, + { + "epoch": 0.41051402946738047, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3089, + "step": 3616 + }, + { + "epoch": 0.41062755657729955, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2978, + "step": 3617 + }, + { + "epoch": 0.4107410836872186, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3136, + "step": 3618 + }, + { + "epoch": 0.4108546107971377, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2935, + "step": 3619 + }, + { + "epoch": 0.4109681379070568, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2758, + "step": 3620 + }, + { + "epoch": 0.41108166501697585, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2908, + "step": 3621 + }, + { + "epoch": 0.4111951921268949, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3007, + "step": 3622 + }, + { + "epoch": 0.411308719236814, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2993, + "step": 3623 + }, + { + "epoch": 0.4114222463467331, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2886, + "step": 3624 + }, + { + "epoch": 0.41153577345665215, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2864, + "step": 3625 + }, + { + "epoch": 0.41164930056657123, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2879, + "step": 3626 + }, + { + "epoch": 0.4117628276764903, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2943, + "step": 3627 + }, + { + "epoch": 0.4118763547864094, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2781, + "step": 3628 + }, + { + "epoch": 0.41198988189632846, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.308, + "step": 3629 + }, + { + "epoch": 0.41210340900624753, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.3124, + "step": 3630 + }, + { + "epoch": 0.4122169361161666, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2937, + "step": 3631 + }, + { + "epoch": 0.4123304632260857, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2811, + "step": 3632 + }, + { + "epoch": 0.41244399033600476, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3014, + "step": 3633 + }, + { + "epoch": 0.41255751744592384, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2935, + "step": 3634 + }, + { + "epoch": 0.4126710445558429, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3082, + "step": 3635 + }, + { + "epoch": 0.412784571665762, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2898, + "step": 3636 + }, + { + "epoch": 0.41289809877568107, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2917, + "step": 3637 + }, + { + "epoch": 0.41301162588560014, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.291, + "step": 3638 + }, + { + "epoch": 0.4131251529955192, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2981, + "step": 3639 + }, + { + "epoch": 0.4132386801054383, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2854, + "step": 3640 + }, + { + "epoch": 0.41335220721535737, + "grad_norm": 0.44140625, + "learning_rate": 0.002, + "loss": 5.3038, + "step": 3641 + }, + { + "epoch": 0.41346573432527645, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.297, + "step": 3642 + }, + { + "epoch": 0.4135792614351955, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.2583, + "step": 3643 + }, + { + "epoch": 0.4136927885451146, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.2793, + "step": 3644 + }, + { + "epoch": 0.4138063156550337, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2934, + "step": 3645 + }, + { + "epoch": 0.41391984276495275, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2896, + "step": 3646 + }, + { + "epoch": 0.4140333698748718, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2604, + "step": 3647 + }, + { + "epoch": 0.4141468969847909, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3003, + "step": 3648 + }, + { + "epoch": 0.41426042409471, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.3102, + "step": 3649 + }, + { + "epoch": 0.41437395120462905, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2925, + "step": 3650 + }, + { + "epoch": 0.41448747831454813, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.3125, + "step": 3651 + }, + { + "epoch": 0.4146010054244672, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2844, + "step": 3652 + }, + { + "epoch": 0.4147145325343863, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2941, + "step": 3653 + }, + { + "epoch": 0.41482805964430536, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.283, + "step": 3654 + }, + { + "epoch": 0.41494158675422443, + "grad_norm": 0.232421875, + "learning_rate": 0.002, + "loss": 5.307, + "step": 3655 + }, + { + "epoch": 0.4150551138641435, + "grad_norm": 0.232421875, + "learning_rate": 0.002, + "loss": 5.2789, + "step": 3656 + }, + { + "epoch": 0.4151686409740626, + "grad_norm": 0.23046875, + "learning_rate": 0.002, + "loss": 5.3001, + "step": 3657 + }, + { + "epoch": 0.41528216808398166, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.2774, + "step": 3658 + }, + { + "epoch": 0.41539569519390074, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3043, + "step": 3659 + }, + { + "epoch": 0.4155092223038198, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3003, + "step": 3660 + }, + { + "epoch": 0.4156227494137389, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2833, + "step": 3661 + }, + { + "epoch": 0.41573627652365797, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2877, + "step": 3662 + }, + { + "epoch": 0.41584980363357704, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2747, + "step": 3663 + }, + { + "epoch": 0.4159633307434961, + "grad_norm": 0.228515625, + "learning_rate": 0.002, + "loss": 5.2928, + "step": 3664 + }, + { + "epoch": 0.4160768578534152, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.2731, + "step": 3665 + }, + { + "epoch": 0.41619038496333427, + "grad_norm": 0.232421875, + "learning_rate": 0.002, + "loss": 5.2717, + "step": 3666 + }, + { + "epoch": 0.41630391207325335, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3072, + "step": 3667 + }, + { + "epoch": 0.4164174391831724, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2901, + "step": 3668 + }, + { + "epoch": 0.4165309662930915, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.3037, + "step": 3669 + }, + { + "epoch": 0.4166444934030106, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.2888, + "step": 3670 + }, + { + "epoch": 0.41675802051292965, + "grad_norm": 0.48046875, + "learning_rate": 0.002, + "loss": 5.3021, + "step": 3671 + }, + { + "epoch": 0.4168715476228487, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.3102, + "step": 3672 + }, + { + "epoch": 0.4169850747327678, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.2874, + "step": 3673 + }, + { + "epoch": 0.4170986018426869, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2871, + "step": 3674 + }, + { + "epoch": 0.41721212895260595, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2872, + "step": 3675 + }, + { + "epoch": 0.41732565606252503, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.291, + "step": 3676 + }, + { + "epoch": 0.4174391831724441, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2627, + "step": 3677 + }, + { + "epoch": 0.4175527102823632, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2903, + "step": 3678 + }, + { + "epoch": 0.41766623739228226, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2882, + "step": 3679 + }, + { + "epoch": 0.41777976450220133, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3105, + "step": 3680 + }, + { + "epoch": 0.41789329161212047, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2805, + "step": 3681 + }, + { + "epoch": 0.41800681872203954, + "grad_norm": 0.248046875, + "learning_rate": 0.002, + "loss": 5.2992, + "step": 3682 + }, + { + "epoch": 0.4181203458319586, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.3232, + "step": 3683 + }, + { + "epoch": 0.4182338729418777, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.301, + "step": 3684 + }, + { + "epoch": 0.41834740005179677, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.271, + "step": 3685 + }, + { + "epoch": 0.41846092716171585, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.3078, + "step": 3686 + }, + { + "epoch": 0.4185744542716349, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3062, + "step": 3687 + }, + { + "epoch": 0.418687981381554, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2966, + "step": 3688 + }, + { + "epoch": 0.4188015084914731, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.302, + "step": 3689 + }, + { + "epoch": 0.41891503560139215, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.305, + "step": 3690 + }, + { + "epoch": 0.4190285627113112, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.296, + "step": 3691 + }, + { + "epoch": 0.4191420898212303, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2692, + "step": 3692 + }, + { + "epoch": 0.4192556169311494, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3002, + "step": 3693 + }, + { + "epoch": 0.41936914404106845, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2997, + "step": 3694 + }, + { + "epoch": 0.41948267115098753, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2662, + "step": 3695 + }, + { + "epoch": 0.4195961982609066, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2947, + "step": 3696 + }, + { + "epoch": 0.4197097253708257, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.3024, + "step": 3697 + }, + { + "epoch": 0.41982325248074476, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2864, + "step": 3698 + }, + { + "epoch": 0.41993677959066383, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2949, + "step": 3699 + }, + { + "epoch": 0.4200503067005829, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2846, + "step": 3700 + }, + { + "epoch": 0.420163833810502, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3051, + "step": 3701 + }, + { + "epoch": 0.42027736092042106, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.3035, + "step": 3702 + }, + { + "epoch": 0.42039088803034014, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.299, + "step": 3703 + }, + { + "epoch": 0.4205044151402592, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2945, + "step": 3704 + }, + { + "epoch": 0.4206179422501783, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2941, + "step": 3705 + }, + { + "epoch": 0.42073146936009737, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3235, + "step": 3706 + }, + { + "epoch": 0.42084499647001644, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2808, + "step": 3707 + }, + { + "epoch": 0.4209585235799355, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3046, + "step": 3708 + }, + { + "epoch": 0.4210720506898546, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2938, + "step": 3709 + }, + { + "epoch": 0.42118557779977367, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2916, + "step": 3710 + }, + { + "epoch": 0.42129910490969275, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.3209, + "step": 3711 + }, + { + "epoch": 0.4214126320196118, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.292, + "step": 3712 + }, + { + "epoch": 0.4215261591295309, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2895, + "step": 3713 + }, + { + "epoch": 0.42163968623945, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2933, + "step": 3714 + }, + { + "epoch": 0.42175321334936905, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2959, + "step": 3715 + }, + { + "epoch": 0.4218667404592881, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2868, + "step": 3716 + }, + { + "epoch": 0.4219802675692072, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2855, + "step": 3717 + }, + { + "epoch": 0.4220937946791263, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2936, + "step": 3718 + }, + { + "epoch": 0.42220732178904535, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3081, + "step": 3719 + }, + { + "epoch": 0.42232084889896443, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3117, + "step": 3720 + }, + { + "epoch": 0.4224343760088835, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.3087, + "step": 3721 + }, + { + "epoch": 0.4225479031188026, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.307, + "step": 3722 + }, + { + "epoch": 0.42266143022872166, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.2946, + "step": 3723 + }, + { + "epoch": 0.42277495733864073, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.3042, + "step": 3724 + }, + { + "epoch": 0.4228884844485598, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.2966, + "step": 3725 + }, + { + "epoch": 0.4230020115584789, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.282, + "step": 3726 + }, + { + "epoch": 0.42311553866839796, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2891, + "step": 3727 + }, + { + "epoch": 0.42322906577831704, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2962, + "step": 3728 + }, + { + "epoch": 0.4233425928882361, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.301, + "step": 3729 + }, + { + "epoch": 0.4234561199981552, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2796, + "step": 3730 + }, + { + "epoch": 0.42356964710807427, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2876, + "step": 3731 + }, + { + "epoch": 0.42368317421799334, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2825, + "step": 3732 + }, + { + "epoch": 0.4237967013279124, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2911, + "step": 3733 + }, + { + "epoch": 0.4239102284378315, + "grad_norm": 0.232421875, + "learning_rate": 0.002, + "loss": 5.3028, + "step": 3734 + }, + { + "epoch": 0.42402375554775057, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.283, + "step": 3735 + }, + { + "epoch": 0.42413728265766965, + "grad_norm": 0.248046875, + "learning_rate": 0.002, + "loss": 5.3015, + "step": 3736 + }, + { + "epoch": 0.4242508097675887, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3062, + "step": 3737 + }, + { + "epoch": 0.4243643368775078, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.3115, + "step": 3738 + }, + { + "epoch": 0.4244778639874269, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3011, + "step": 3739 + }, + { + "epoch": 0.42459139109734595, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2951, + "step": 3740 + }, + { + "epoch": 0.424704918207265, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.3041, + "step": 3741 + }, + { + "epoch": 0.4248184453171841, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2975, + "step": 3742 + }, + { + "epoch": 0.4249319724271032, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2928, + "step": 3743 + }, + { + "epoch": 0.42504549953702225, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2868, + "step": 3744 + }, + { + "epoch": 0.42515902664694133, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3056, + "step": 3745 + }, + { + "epoch": 0.4252725537568604, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.3118, + "step": 3746 + }, + { + "epoch": 0.4253860808667795, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2948, + "step": 3747 + }, + { + "epoch": 0.42549960797669856, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2825, + "step": 3748 + }, + { + "epoch": 0.42561313508661763, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2953, + "step": 3749 + }, + { + "epoch": 0.4257266621965367, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2879, + "step": 3750 + }, + { + "epoch": 0.4258401893064558, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2991, + "step": 3751 + }, + { + "epoch": 0.42595371641637486, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.306, + "step": 3752 + }, + { + "epoch": 0.42606724352629394, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.2958, + "step": 3753 + }, + { + "epoch": 0.426180770636213, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.291, + "step": 3754 + }, + { + "epoch": 0.4262942977461321, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2881, + "step": 3755 + }, + { + "epoch": 0.42640782485605117, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2915, + "step": 3756 + }, + { + "epoch": 0.42652135196597024, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2984, + "step": 3757 + }, + { + "epoch": 0.4266348790758893, + "grad_norm": 0.2373046875, + "learning_rate": 0.002, + "loss": 5.2982, + "step": 3758 + }, + { + "epoch": 0.4267484061858084, + "grad_norm": 0.2138671875, + "learning_rate": 0.002, + "loss": 5.3014, + "step": 3759 + }, + { + "epoch": 0.42686193329572747, + "grad_norm": 0.2197265625, + "learning_rate": 0.002, + "loss": 5.2723, + "step": 3760 + }, + { + "epoch": 0.42697546040564655, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2868, + "step": 3761 + }, + { + "epoch": 0.4270889875155656, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2959, + "step": 3762 + }, + { + "epoch": 0.4272025146254847, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2997, + "step": 3763 + }, + { + "epoch": 0.4273160417354038, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2964, + "step": 3764 + }, + { + "epoch": 0.42742956884532285, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2984, + "step": 3765 + }, + { + "epoch": 0.4275430959552419, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2935, + "step": 3766 + }, + { + "epoch": 0.427656623065161, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2907, + "step": 3767 + }, + { + "epoch": 0.4277701501750801, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2951, + "step": 3768 + }, + { + "epoch": 0.42788367728499915, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2967, + "step": 3769 + }, + { + "epoch": 0.42799720439491823, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2795, + "step": 3770 + }, + { + "epoch": 0.4281107315048373, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.287, + "step": 3771 + }, + { + "epoch": 0.4282242586147564, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2671, + "step": 3772 + }, + { + "epoch": 0.42833778572467546, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.3001, + "step": 3773 + }, + { + "epoch": 0.42845131283459453, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.2821, + "step": 3774 + }, + { + "epoch": 0.4285648399445136, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.3044, + "step": 3775 + }, + { + "epoch": 0.4286783670544327, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2747, + "step": 3776 + }, + { + "epoch": 0.42879189416435176, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.2895, + "step": 3777 + }, + { + "epoch": 0.42890542127427084, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.294, + "step": 3778 + }, + { + "epoch": 0.4290189483841899, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.3025, + "step": 3779 + }, + { + "epoch": 0.429132475494109, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2991, + "step": 3780 + }, + { + "epoch": 0.42924600260402807, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2889, + "step": 3781 + }, + { + "epoch": 0.42935952971394714, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2656, + "step": 3782 + }, + { + "epoch": 0.4294730568238662, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3098, + "step": 3783 + }, + { + "epoch": 0.4295865839337853, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2785, + "step": 3784 + }, + { + "epoch": 0.42970011104370437, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2952, + "step": 3785 + }, + { + "epoch": 0.42981363815362345, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2848, + "step": 3786 + }, + { + "epoch": 0.4299271652635425, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2969, + "step": 3787 + }, + { + "epoch": 0.4300406923734616, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.2926, + "step": 3788 + }, + { + "epoch": 0.4301542194833807, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2859, + "step": 3789 + }, + { + "epoch": 0.43026774659329975, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2677, + "step": 3790 + }, + { + "epoch": 0.4303812737032188, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2999, + "step": 3791 + }, + { + "epoch": 0.4304948008131379, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3005, + "step": 3792 + }, + { + "epoch": 0.430608327923057, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2825, + "step": 3793 + }, + { + "epoch": 0.43072185503297605, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.299, + "step": 3794 + }, + { + "epoch": 0.43083538214289513, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2773, + "step": 3795 + }, + { + "epoch": 0.4309489092528142, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2918, + "step": 3796 + }, + { + "epoch": 0.4310624363627333, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.3051, + "step": 3797 + }, + { + "epoch": 0.43117596347265236, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2897, + "step": 3798 + }, + { + "epoch": 0.43128949058257143, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2962, + "step": 3799 + }, + { + "epoch": 0.4314030176924905, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2889, + "step": 3800 + }, + { + "epoch": 0.43151654480240964, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2915, + "step": 3801 + }, + { + "epoch": 0.4316300719123287, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.284, + "step": 3802 + }, + { + "epoch": 0.4317435990222478, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2944, + "step": 3803 + }, + { + "epoch": 0.43185712613216687, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.2712, + "step": 3804 + }, + { + "epoch": 0.43197065324208594, + "grad_norm": 0.2275390625, + "learning_rate": 0.002, + "loss": 5.31, + "step": 3805 + }, + { + "epoch": 0.432084180352005, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.2562, + "step": 3806 + }, + { + "epoch": 0.4321977074619241, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.295, + "step": 3807 + }, + { + "epoch": 0.4323112345718432, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2754, + "step": 3808 + }, + { + "epoch": 0.43242476168176225, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.297, + "step": 3809 + }, + { + "epoch": 0.4325382887916813, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.283, + "step": 3810 + }, + { + "epoch": 0.4326518159016004, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.3044, + "step": 3811 + }, + { + "epoch": 0.4327653430115195, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3017, + "step": 3812 + }, + { + "epoch": 0.43287887012143855, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2844, + "step": 3813 + }, + { + "epoch": 0.43299239723135763, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.281, + "step": 3814 + }, + { + "epoch": 0.4331059243412767, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2785, + "step": 3815 + }, + { + "epoch": 0.4332194514511958, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2897, + "step": 3816 + }, + { + "epoch": 0.43333297856111486, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2632, + "step": 3817 + }, + { + "epoch": 0.43344650567103393, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2693, + "step": 3818 + }, + { + "epoch": 0.433560032780953, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2938, + "step": 3819 + }, + { + "epoch": 0.4336735598908721, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2851, + "step": 3820 + }, + { + "epoch": 0.43378708700079116, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2821, + "step": 3821 + }, + { + "epoch": 0.43390061411071024, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2816, + "step": 3822 + }, + { + "epoch": 0.4340141412206293, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2922, + "step": 3823 + }, + { + "epoch": 0.4341276683305484, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2939, + "step": 3824 + }, + { + "epoch": 0.43424119544046746, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.274, + "step": 3825 + }, + { + "epoch": 0.43435472255038654, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2963, + "step": 3826 + }, + { + "epoch": 0.4344682496603056, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.305, + "step": 3827 + }, + { + "epoch": 0.4345817767702247, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.3021, + "step": 3828 + }, + { + "epoch": 0.43469530388014377, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2887, + "step": 3829 + }, + { + "epoch": 0.43480883099006284, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2871, + "step": 3830 + }, + { + "epoch": 0.4349223580999819, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2864, + "step": 3831 + }, + { + "epoch": 0.435035885209901, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.308, + "step": 3832 + }, + { + "epoch": 0.4351494123198201, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2937, + "step": 3833 + }, + { + "epoch": 0.43526293942973915, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2912, + "step": 3834 + }, + { + "epoch": 0.4353764665396582, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2915, + "step": 3835 + }, + { + "epoch": 0.4354899936495773, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.3117, + "step": 3836 + }, + { + "epoch": 0.4356035207594964, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2952, + "step": 3837 + }, + { + "epoch": 0.43571704786941545, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2897, + "step": 3838 + }, + { + "epoch": 0.43583057497933453, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2984, + "step": 3839 + }, + { + "epoch": 0.4359441020892536, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2844, + "step": 3840 + }, + { + "epoch": 0.4360576291991727, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2814, + "step": 3841 + }, + { + "epoch": 0.43617115630909176, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2826, + "step": 3842 + }, + { + "epoch": 0.43628468341901083, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2849, + "step": 3843 + }, + { + "epoch": 0.4363982105289299, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.3129, + "step": 3844 + }, + { + "epoch": 0.436511737638849, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2702, + "step": 3845 + }, + { + "epoch": 0.43662526474876806, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.307, + "step": 3846 + }, + { + "epoch": 0.43673879185868714, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2912, + "step": 3847 + }, + { + "epoch": 0.4368523189686062, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2948, + "step": 3848 + }, + { + "epoch": 0.4369658460785253, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2832, + "step": 3849 + }, + { + "epoch": 0.43707937318844436, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.285, + "step": 3850 + }, + { + "epoch": 0.43719290029836344, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2932, + "step": 3851 + }, + { + "epoch": 0.4373064274082825, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.3, + "step": 3852 + }, + { + "epoch": 0.4374199545182016, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2956, + "step": 3853 + }, + { + "epoch": 0.43753348162812067, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3149, + "step": 3854 + }, + { + "epoch": 0.43764700873803974, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.3006, + "step": 3855 + }, + { + "epoch": 0.4377605358479588, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2646, + "step": 3856 + }, + { + "epoch": 0.4378740629578779, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.281, + "step": 3857 + }, + { + "epoch": 0.437987590067797, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2853, + "step": 3858 + }, + { + "epoch": 0.43810111717771605, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.3017, + "step": 3859 + }, + { + "epoch": 0.4382146442876351, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2803, + "step": 3860 + }, + { + "epoch": 0.4383281713975542, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2938, + "step": 3861 + }, + { + "epoch": 0.4384416985074733, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.3087, + "step": 3862 + }, + { + "epoch": 0.43855522561739235, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2709, + "step": 3863 + }, + { + "epoch": 0.43866875272731143, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.279, + "step": 3864 + }, + { + "epoch": 0.4387822798372305, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2843, + "step": 3865 + }, + { + "epoch": 0.4388958069471496, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2765, + "step": 3866 + }, + { + "epoch": 0.43900933405706866, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2856, + "step": 3867 + }, + { + "epoch": 0.43912286116698773, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2983, + "step": 3868 + }, + { + "epoch": 0.4392363882769068, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.2912, + "step": 3869 + }, + { + "epoch": 0.4393499153868259, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2999, + "step": 3870 + }, + { + "epoch": 0.43946344249674496, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2997, + "step": 3871 + }, + { + "epoch": 0.43957696960666404, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2761, + "step": 3872 + }, + { + "epoch": 0.4396904967165831, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2813, + "step": 3873 + }, + { + "epoch": 0.4398040238265022, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2742, + "step": 3874 + }, + { + "epoch": 0.43991755093642126, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2915, + "step": 3875 + }, + { + "epoch": 0.44003107804634034, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2783, + "step": 3876 + }, + { + "epoch": 0.4401446051562594, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2887, + "step": 3877 + }, + { + "epoch": 0.4402581322661785, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2969, + "step": 3878 + }, + { + "epoch": 0.44037165937609757, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2546, + "step": 3879 + }, + { + "epoch": 0.44048518648601664, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2898, + "step": 3880 + }, + { + "epoch": 0.4405987135959357, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2849, + "step": 3881 + }, + { + "epoch": 0.4407122407058548, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2905, + "step": 3882 + }, + { + "epoch": 0.4408257678157739, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2861, + "step": 3883 + }, + { + "epoch": 0.44093929492569295, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2859, + "step": 3884 + }, + { + "epoch": 0.441052822035612, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2764, + "step": 3885 + }, + { + "epoch": 0.4411663491455311, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2883, + "step": 3886 + }, + { + "epoch": 0.4412798762554502, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2796, + "step": 3887 + }, + { + "epoch": 0.44139340336536925, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2888, + "step": 3888 + }, + { + "epoch": 0.44150693047528833, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2913, + "step": 3889 + }, + { + "epoch": 0.4416204575852074, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2887, + "step": 3890 + }, + { + "epoch": 0.4417339846951265, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.297, + "step": 3891 + }, + { + "epoch": 0.44184751180504556, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2846, + "step": 3892 + }, + { + "epoch": 0.44196103891496463, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2845, + "step": 3893 + }, + { + "epoch": 0.4420745660248837, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.3043, + "step": 3894 + }, + { + "epoch": 0.4421880931348028, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2902, + "step": 3895 + }, + { + "epoch": 0.44230162024472186, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2924, + "step": 3896 + }, + { + "epoch": 0.44241514735464094, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.3143, + "step": 3897 + }, + { + "epoch": 0.44252867446456, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3083, + "step": 3898 + }, + { + "epoch": 0.4426422015744791, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2736, + "step": 3899 + }, + { + "epoch": 0.44275572868439816, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2645, + "step": 3900 + }, + { + "epoch": 0.44286925579431724, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2921, + "step": 3901 + }, + { + "epoch": 0.4429827829042363, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2833, + "step": 3902 + }, + { + "epoch": 0.4430963100141554, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.3081, + "step": 3903 + }, + { + "epoch": 0.44320983712407447, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2944, + "step": 3904 + }, + { + "epoch": 0.44332336423399354, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2932, + "step": 3905 + }, + { + "epoch": 0.4434368913439126, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2769, + "step": 3906 + }, + { + "epoch": 0.4435504184538317, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2969, + "step": 3907 + }, + { + "epoch": 0.4436639455637508, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.29, + "step": 3908 + }, + { + "epoch": 0.44377747267366985, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2713, + "step": 3909 + }, + { + "epoch": 0.4438909997835889, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2919, + "step": 3910 + }, + { + "epoch": 0.444004526893508, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.296, + "step": 3911 + }, + { + "epoch": 0.4441180540034271, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2732, + "step": 3912 + }, + { + "epoch": 0.44423158111334615, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2838, + "step": 3913 + }, + { + "epoch": 0.44434510822326523, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2893, + "step": 3914 + }, + { + "epoch": 0.4444586353331843, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2903, + "step": 3915 + }, + { + "epoch": 0.4445721624431034, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2994, + "step": 3916 + }, + { + "epoch": 0.44468568955302246, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.2762, + "step": 3917 + }, + { + "epoch": 0.44479921666294153, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.3021, + "step": 3918 + }, + { + "epoch": 0.4449127437728606, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2769, + "step": 3919 + }, + { + "epoch": 0.4450262708827797, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.2904, + "step": 3920 + }, + { + "epoch": 0.44513979799269876, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2882, + "step": 3921 + }, + { + "epoch": 0.4452533251026179, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2838, + "step": 3922 + }, + { + "epoch": 0.44536685221253697, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2931, + "step": 3923 + }, + { + "epoch": 0.44548037932245604, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2976, + "step": 3924 + }, + { + "epoch": 0.4455939064323751, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2809, + "step": 3925 + }, + { + "epoch": 0.4457074335422942, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2923, + "step": 3926 + }, + { + "epoch": 0.44582096065221327, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3006, + "step": 3927 + }, + { + "epoch": 0.44593448776213235, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2877, + "step": 3928 + }, + { + "epoch": 0.4460480148720514, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2896, + "step": 3929 + }, + { + "epoch": 0.4461615419819705, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2913, + "step": 3930 + }, + { + "epoch": 0.4462750690918896, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.251, + "step": 3931 + }, + { + "epoch": 0.44638859620180865, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2739, + "step": 3932 + }, + { + "epoch": 0.4465021233117277, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2822, + "step": 3933 + }, + { + "epoch": 0.4466156504216468, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2813, + "step": 3934 + }, + { + "epoch": 0.4467291775315659, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2803, + "step": 3935 + }, + { + "epoch": 0.44684270464148496, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2984, + "step": 3936 + }, + { + "epoch": 0.44695623175140403, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2748, + "step": 3937 + }, + { + "epoch": 0.4470697588613231, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.287, + "step": 3938 + }, + { + "epoch": 0.4471832859712422, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2815, + "step": 3939 + }, + { + "epoch": 0.44729681308116126, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2993, + "step": 3940 + }, + { + "epoch": 0.44741034019108034, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2724, + "step": 3941 + }, + { + "epoch": 0.4475238673009994, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2768, + "step": 3942 + }, + { + "epoch": 0.4476373944109185, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2772, + "step": 3943 + }, + { + "epoch": 0.44775092152083756, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2781, + "step": 3944 + }, + { + "epoch": 0.44786444863075664, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.284, + "step": 3945 + }, + { + "epoch": 0.4479779757406757, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2891, + "step": 3946 + }, + { + "epoch": 0.4480915028505948, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.3101, + "step": 3947 + }, + { + "epoch": 0.44820502996051387, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.266, + "step": 3948 + }, + { + "epoch": 0.44831855707043294, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.2694, + "step": 3949 + }, + { + "epoch": 0.448432084180352, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2755, + "step": 3950 + }, + { + "epoch": 0.4485456112902711, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.2878, + "step": 3951 + }, + { + "epoch": 0.44865913840019017, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.3013, + "step": 3952 + }, + { + "epoch": 0.44877266551010925, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2806, + "step": 3953 + }, + { + "epoch": 0.4488861926200283, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.3035, + "step": 3954 + }, + { + "epoch": 0.4489997197299474, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2878, + "step": 3955 + }, + { + "epoch": 0.4491132468398665, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2928, + "step": 3956 + }, + { + "epoch": 0.44922677394978555, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2913, + "step": 3957 + }, + { + "epoch": 0.4493403010597046, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2884, + "step": 3958 + }, + { + "epoch": 0.4494538281696237, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.284, + "step": 3959 + }, + { + "epoch": 0.4495673552795428, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2891, + "step": 3960 + }, + { + "epoch": 0.44968088238946186, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.278, + "step": 3961 + }, + { + "epoch": 0.44979440949938093, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2729, + "step": 3962 + }, + { + "epoch": 0.4499079366093, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2766, + "step": 3963 + }, + { + "epoch": 0.4500214637192191, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.3046, + "step": 3964 + }, + { + "epoch": 0.45013499082913816, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2786, + "step": 3965 + }, + { + "epoch": 0.45024851793905724, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2798, + "step": 3966 + }, + { + "epoch": 0.4503620450489763, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2921, + "step": 3967 + }, + { + "epoch": 0.4504755721588954, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2837, + "step": 3968 + }, + { + "epoch": 0.45058909926881446, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.296, + "step": 3969 + }, + { + "epoch": 0.45070262637873354, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.3064, + "step": 3970 + }, + { + "epoch": 0.4508161534886526, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.2776, + "step": 3971 + }, + { + "epoch": 0.4509296805985717, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2819, + "step": 3972 + }, + { + "epoch": 0.45104320770849077, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.2912, + "step": 3973 + }, + { + "epoch": 0.45115673481840984, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2886, + "step": 3974 + }, + { + "epoch": 0.4512702619283289, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3105, + "step": 3975 + }, + { + "epoch": 0.451383789038248, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2944, + "step": 3976 + }, + { + "epoch": 0.45149731614816707, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2867, + "step": 3977 + }, + { + "epoch": 0.45161084325808615, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2758, + "step": 3978 + }, + { + "epoch": 0.4517243703680052, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2941, + "step": 3979 + }, + { + "epoch": 0.4518378974779243, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2723, + "step": 3980 + }, + { + "epoch": 0.4519514245878434, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2902, + "step": 3981 + }, + { + "epoch": 0.45206495169776245, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.28, + "step": 3982 + }, + { + "epoch": 0.4521784788076815, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2714, + "step": 3983 + }, + { + "epoch": 0.4522920059176006, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.3035, + "step": 3984 + }, + { + "epoch": 0.4524055330275197, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2948, + "step": 3985 + }, + { + "epoch": 0.45251906013743876, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2786, + "step": 3986 + }, + { + "epoch": 0.45263258724735783, + "grad_norm": 0.224609375, + "learning_rate": 0.002, + "loss": 5.2893, + "step": 3987 + }, + { + "epoch": 0.4527461143572769, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2721, + "step": 3988 + }, + { + "epoch": 0.452859641467196, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2958, + "step": 3989 + }, + { + "epoch": 0.45297316857711506, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2866, + "step": 3990 + }, + { + "epoch": 0.45308669568703414, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.3043, + "step": 3991 + }, + { + "epoch": 0.4532002227969532, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2802, + "step": 3992 + }, + { + "epoch": 0.4533137499068723, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2818, + "step": 3993 + }, + { + "epoch": 0.45342727701679136, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.3004, + "step": 3994 + }, + { + "epoch": 0.45354080412671044, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2793, + "step": 3995 + }, + { + "epoch": 0.4536543312366295, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2803, + "step": 3996 + }, + { + "epoch": 0.4537678583465486, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2796, + "step": 3997 + }, + { + "epoch": 0.45388138545646767, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2712, + "step": 3998 + }, + { + "epoch": 0.45399491256638674, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2955, + "step": 3999 + }, + { + "epoch": 0.4541084396763058, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2819, + "step": 4000 + }, + { + "epoch": 0.4542219667862249, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2817, + "step": 4001 + }, + { + "epoch": 0.45433549389614397, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2827, + "step": 4002 + }, + { + "epoch": 0.45444902100606305, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2932, + "step": 4003 + }, + { + "epoch": 0.4545625481159821, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2668, + "step": 4004 + }, + { + "epoch": 0.4546760752259012, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2457, + "step": 4005 + }, + { + "epoch": 0.4547896023358203, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.3042, + "step": 4006 + }, + { + "epoch": 0.45490312944573935, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.284, + "step": 4007 + }, + { + "epoch": 0.4550166565556584, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2686, + "step": 4008 + }, + { + "epoch": 0.4551301836655775, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2903, + "step": 4009 + }, + { + "epoch": 0.4552437107754966, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2873, + "step": 4010 + }, + { + "epoch": 0.45535723788541566, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2706, + "step": 4011 + }, + { + "epoch": 0.45547076499533473, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.301, + "step": 4012 + }, + { + "epoch": 0.4555842921052538, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2909, + "step": 4013 + }, + { + "epoch": 0.4556978192151729, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2811, + "step": 4014 + }, + { + "epoch": 0.45581134632509196, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.289, + "step": 4015 + }, + { + "epoch": 0.45592487343501104, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2723, + "step": 4016 + }, + { + "epoch": 0.4560384005449301, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.2955, + "step": 4017 + }, + { + "epoch": 0.4561519276548492, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.292, + "step": 4018 + }, + { + "epoch": 0.45626545476476826, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2962, + "step": 4019 + }, + { + "epoch": 0.45637898187468734, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2682, + "step": 4020 + }, + { + "epoch": 0.4564925089846064, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.2834, + "step": 4021 + }, + { + "epoch": 0.4566060360945255, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.2766, + "step": 4022 + }, + { + "epoch": 0.45671956320444457, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2896, + "step": 4023 + }, + { + "epoch": 0.45683309031436364, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2806, + "step": 4024 + }, + { + "epoch": 0.4569466174242827, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2889, + "step": 4025 + }, + { + "epoch": 0.4570601445342018, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2772, + "step": 4026 + }, + { + "epoch": 0.45717367164412087, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2828, + "step": 4027 + }, + { + "epoch": 0.45728719875403995, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2946, + "step": 4028 + }, + { + "epoch": 0.457400725863959, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2919, + "step": 4029 + }, + { + "epoch": 0.4575142529738781, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2979, + "step": 4030 + }, + { + "epoch": 0.4576277800837972, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2864, + "step": 4031 + }, + { + "epoch": 0.45774130719371625, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2572, + "step": 4032 + }, + { + "epoch": 0.4578548343036353, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2861, + "step": 4033 + }, + { + "epoch": 0.4579683614135544, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2833, + "step": 4034 + }, + { + "epoch": 0.4580818885234735, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.2831, + "step": 4035 + }, + { + "epoch": 0.45819541563339256, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.2842, + "step": 4036 + }, + { + "epoch": 0.45830894274331163, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2789, + "step": 4037 + }, + { + "epoch": 0.4584224698532307, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3044, + "step": 4038 + }, + { + "epoch": 0.4585359969631498, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2897, + "step": 4039 + }, + { + "epoch": 0.45864952407306886, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2436, + "step": 4040 + }, + { + "epoch": 0.45876305118298794, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2704, + "step": 4041 + }, + { + "epoch": 0.458876578292907, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.26, + "step": 4042 + }, + { + "epoch": 0.45899010540282614, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.2919, + "step": 4043 + }, + { + "epoch": 0.4591036325127452, + "grad_norm": 0.232421875, + "learning_rate": 0.002, + "loss": 5.2863, + "step": 4044 + }, + { + "epoch": 0.4592171596226643, + "grad_norm": 0.2265625, + "learning_rate": 0.002, + "loss": 5.3056, + "step": 4045 + }, + { + "epoch": 0.45933068673258337, + "grad_norm": 0.2109375, + "learning_rate": 0.002, + "loss": 5.2907, + "step": 4046 + }, + { + "epoch": 0.45944421384250245, + "grad_norm": 0.2197265625, + "learning_rate": 0.002, + "loss": 5.2774, + "step": 4047 + }, + { + "epoch": 0.4595577409524215, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2928, + "step": 4048 + }, + { + "epoch": 0.4596712680623406, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.3004, + "step": 4049 + }, + { + "epoch": 0.4597847951722597, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2973, + "step": 4050 + }, + { + "epoch": 0.45989832228217875, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2725, + "step": 4051 + }, + { + "epoch": 0.4600118493920978, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2891, + "step": 4052 + }, + { + "epoch": 0.4601253765020169, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2991, + "step": 4053 + }, + { + "epoch": 0.460238903611936, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2829, + "step": 4054 + }, + { + "epoch": 0.46035243072185505, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2852, + "step": 4055 + }, + { + "epoch": 0.46046595783177413, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2875, + "step": 4056 + }, + { + "epoch": 0.4605794849416932, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.3042, + "step": 4057 + }, + { + "epoch": 0.4606930120516123, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2936, + "step": 4058 + }, + { + "epoch": 0.46080653916153136, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2652, + "step": 4059 + }, + { + "epoch": 0.46092006627145043, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2743, + "step": 4060 + }, + { + "epoch": 0.4610335933813695, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2879, + "step": 4061 + }, + { + "epoch": 0.4611471204912886, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2528, + "step": 4062 + }, + { + "epoch": 0.46126064760120766, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3061, + "step": 4063 + }, + { + "epoch": 0.46137417471112674, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2697, + "step": 4064 + }, + { + "epoch": 0.4614877018210458, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.266, + "step": 4065 + }, + { + "epoch": 0.4616012289309649, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2803, + "step": 4066 + }, + { + "epoch": 0.46171475604088397, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2868, + "step": 4067 + }, + { + "epoch": 0.46182828315080304, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2661, + "step": 4068 + }, + { + "epoch": 0.4619418102607221, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2778, + "step": 4069 + }, + { + "epoch": 0.4620553373706412, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.3047, + "step": 4070 + }, + { + "epoch": 0.46216886448056027, + "grad_norm": 0.248046875, + "learning_rate": 0.002, + "loss": 5.257, + "step": 4071 + }, + { + "epoch": 0.46228239159047935, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.274, + "step": 4072 + }, + { + "epoch": 0.4623959187003984, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2878, + "step": 4073 + }, + { + "epoch": 0.4625094458103175, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2696, + "step": 4074 + }, + { + "epoch": 0.4626229729202366, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2632, + "step": 4075 + }, + { + "epoch": 0.46273650003015565, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.3005, + "step": 4076 + }, + { + "epoch": 0.4628500271400747, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2812, + "step": 4077 + }, + { + "epoch": 0.4629635542499938, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2902, + "step": 4078 + }, + { + "epoch": 0.4630770813599129, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2795, + "step": 4079 + }, + { + "epoch": 0.46319060846983195, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2712, + "step": 4080 + }, + { + "epoch": 0.46330413557975103, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2934, + "step": 4081 + }, + { + "epoch": 0.4634176626896701, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2806, + "step": 4082 + }, + { + "epoch": 0.4635311897995892, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2759, + "step": 4083 + }, + { + "epoch": 0.46364471690950826, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2846, + "step": 4084 + }, + { + "epoch": 0.46375824401942733, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2951, + "step": 4085 + }, + { + "epoch": 0.4638717711293464, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2864, + "step": 4086 + }, + { + "epoch": 0.4639852982392655, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2733, + "step": 4087 + }, + { + "epoch": 0.46409882534918456, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3129, + "step": 4088 + }, + { + "epoch": 0.46421235245910364, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2757, + "step": 4089 + }, + { + "epoch": 0.4643258795690227, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2806, + "step": 4090 + }, + { + "epoch": 0.4644394066789418, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2868, + "step": 4091 + }, + { + "epoch": 0.46455293378886087, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2857, + "step": 4092 + }, + { + "epoch": 0.46466646089877994, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2733, + "step": 4093 + }, + { + "epoch": 0.464779988008699, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.2803, + "step": 4094 + }, + { + "epoch": 0.4648935151186181, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.2663, + "step": 4095 + }, + { + "epoch": 0.46500704222853717, + "grad_norm": 0.2138671875, + "learning_rate": 0.002, + "loss": 5.2682, + "step": 4096 + }, + { + "epoch": 0.46512056933845625, + "grad_norm": 0.228515625, + "learning_rate": 0.002, + "loss": 5.2762, + "step": 4097 + }, + { + "epoch": 0.4652340964483753, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.2764, + "step": 4098 + }, + { + "epoch": 0.4653476235582944, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.258, + "step": 4099 + }, + { + "epoch": 0.4654611506682135, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2764, + "step": 4100 + }, + { + "epoch": 0.46557467777813255, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2613, + "step": 4101 + }, + { + "epoch": 0.4656882048880516, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2942, + "step": 4102 + }, + { + "epoch": 0.4658017319979707, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2912, + "step": 4103 + }, + { + "epoch": 0.4659152591078898, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2793, + "step": 4104 + }, + { + "epoch": 0.46602878621780885, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2731, + "step": 4105 + }, + { + "epoch": 0.46614231332772793, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2958, + "step": 4106 + }, + { + "epoch": 0.466255840437647, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.249, + "step": 4107 + }, + { + "epoch": 0.4663693675475661, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.261, + "step": 4108 + }, + { + "epoch": 0.46648289465748516, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2911, + "step": 4109 + }, + { + "epoch": 0.46659642176740423, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2865, + "step": 4110 + }, + { + "epoch": 0.4667099488773233, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2802, + "step": 4111 + }, + { + "epoch": 0.4668234759872424, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2738, + "step": 4112 + }, + { + "epoch": 0.46693700309716146, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.2809, + "step": 4113 + }, + { + "epoch": 0.46705053020708054, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2591, + "step": 4114 + }, + { + "epoch": 0.4671640573169996, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2916, + "step": 4115 + }, + { + "epoch": 0.4672775844269187, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2841, + "step": 4116 + }, + { + "epoch": 0.46739111153683777, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3, + "step": 4117 + }, + { + "epoch": 0.46750463864675684, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 4118 + }, + { + "epoch": 0.4676181657566759, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2761, + "step": 4119 + }, + { + "epoch": 0.467731692866595, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2845, + "step": 4120 + }, + { + "epoch": 0.46784521997651407, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2878, + "step": 4121 + }, + { + "epoch": 0.46795874708643315, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2692, + "step": 4122 + }, + { + "epoch": 0.4680722741963522, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2755, + "step": 4123 + }, + { + "epoch": 0.4681858013062713, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2861, + "step": 4124 + }, + { + "epoch": 0.4682993284161904, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2918, + "step": 4125 + }, + { + "epoch": 0.46841285552610945, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2607, + "step": 4126 + }, + { + "epoch": 0.4685263826360285, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2812, + "step": 4127 + }, + { + "epoch": 0.4686399097459476, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2799, + "step": 4128 + }, + { + "epoch": 0.4687534368558667, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2948, + "step": 4129 + }, + { + "epoch": 0.46886696396578575, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2733, + "step": 4130 + }, + { + "epoch": 0.46898049107570483, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2945, + "step": 4131 + }, + { + "epoch": 0.4690940181856239, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2952, + "step": 4132 + }, + { + "epoch": 0.469207545295543, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.3019, + "step": 4133 + }, + { + "epoch": 0.46932107240546206, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2648, + "step": 4134 + }, + { + "epoch": 0.46943459951538113, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2838, + "step": 4135 + }, + { + "epoch": 0.4695481266253002, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2817, + "step": 4136 + }, + { + "epoch": 0.4696616537352193, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2793, + "step": 4137 + }, + { + "epoch": 0.46977518084513836, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2858, + "step": 4138 + }, + { + "epoch": 0.46988870795505744, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2728, + "step": 4139 + }, + { + "epoch": 0.4700022350649765, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2713, + "step": 4140 + }, + { + "epoch": 0.4701157621748956, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2692, + "step": 4141 + }, + { + "epoch": 0.47022928928481467, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.276, + "step": 4142 + }, + { + "epoch": 0.47034281639473374, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2783, + "step": 4143 + }, + { + "epoch": 0.4704563435046528, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2683, + "step": 4144 + }, + { + "epoch": 0.4705698706145719, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2619, + "step": 4145 + }, + { + "epoch": 0.47068339772449097, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.271, + "step": 4146 + }, + { + "epoch": 0.47079692483441005, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2878, + "step": 4147 + }, + { + "epoch": 0.4709104519443291, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2918, + "step": 4148 + }, + { + "epoch": 0.4710239790542482, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2767, + "step": 4149 + }, + { + "epoch": 0.4711375061641673, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2808, + "step": 4150 + }, + { + "epoch": 0.47125103327408635, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.273, + "step": 4151 + }, + { + "epoch": 0.4713645603840054, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2629, + "step": 4152 + }, + { + "epoch": 0.4714780874939245, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2921, + "step": 4153 + }, + { + "epoch": 0.4715916146038436, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2972, + "step": 4154 + }, + { + "epoch": 0.47170514171376265, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.3023, + "step": 4155 + }, + { + "epoch": 0.47181866882368173, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2756, + "step": 4156 + }, + { + "epoch": 0.4719321959336008, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2768, + "step": 4157 + }, + { + "epoch": 0.4720457230435199, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2821, + "step": 4158 + }, + { + "epoch": 0.47215925015343896, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2838, + "step": 4159 + }, + { + "epoch": 0.47227277726335803, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2825, + "step": 4160 + }, + { + "epoch": 0.4723863043732771, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.278, + "step": 4161 + }, + { + "epoch": 0.4724998314831962, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2833, + "step": 4162 + }, + { + "epoch": 0.4726133585931153, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2699, + "step": 4163 + }, + { + "epoch": 0.4727268857030344, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.3013, + "step": 4164 + }, + { + "epoch": 0.47284041281295347, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2818, + "step": 4165 + }, + { + "epoch": 0.47295393992287255, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2717, + "step": 4166 + }, + { + "epoch": 0.4730674670327916, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2927, + "step": 4167 + }, + { + "epoch": 0.4731809941427107, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2719, + "step": 4168 + }, + { + "epoch": 0.4732945212526298, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2836, + "step": 4169 + }, + { + "epoch": 0.47340804836254885, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2851, + "step": 4170 + }, + { + "epoch": 0.4735215754724679, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.29, + "step": 4171 + }, + { + "epoch": 0.473635102582387, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2873, + "step": 4172 + }, + { + "epoch": 0.4737486296923061, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3, + "step": 4173 + }, + { + "epoch": 0.47386215680222515, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2803, + "step": 4174 + }, + { + "epoch": 0.47397568391214423, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2982, + "step": 4175 + }, + { + "epoch": 0.4740892110220633, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2887, + "step": 4176 + }, + { + "epoch": 0.4742027381319824, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2971, + "step": 4177 + }, + { + "epoch": 0.47431626524190146, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2501, + "step": 4178 + }, + { + "epoch": 0.47442979235182053, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2807, + "step": 4179 + }, + { + "epoch": 0.4745433194617396, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2766, + "step": 4180 + }, + { + "epoch": 0.4746568465716587, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2766, + "step": 4181 + }, + { + "epoch": 0.47477037368157776, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2923, + "step": 4182 + }, + { + "epoch": 0.47488390079149684, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2686, + "step": 4183 + }, + { + "epoch": 0.4749974279014159, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2881, + "step": 4184 + }, + { + "epoch": 0.475110955011335, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2908, + "step": 4185 + }, + { + "epoch": 0.47522448212125407, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2915, + "step": 4186 + }, + { + "epoch": 0.47533800923117314, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2809, + "step": 4187 + }, + { + "epoch": 0.4754515363410922, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.277, + "step": 4188 + }, + { + "epoch": 0.4755650634510113, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2705, + "step": 4189 + }, + { + "epoch": 0.47567859056093037, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2763, + "step": 4190 + }, + { + "epoch": 0.47579211767084945, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2898, + "step": 4191 + }, + { + "epoch": 0.4759056447807685, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2893, + "step": 4192 + }, + { + "epoch": 0.4760191718906876, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2652, + "step": 4193 + }, + { + "epoch": 0.4761326990006067, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2725, + "step": 4194 + }, + { + "epoch": 0.47624622611052575, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2617, + "step": 4195 + }, + { + "epoch": 0.4763597532204448, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2768, + "step": 4196 + }, + { + "epoch": 0.4764732803303639, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2846, + "step": 4197 + }, + { + "epoch": 0.476586807440283, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2661, + "step": 4198 + }, + { + "epoch": 0.47670033455020205, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2783, + "step": 4199 + }, + { + "epoch": 0.47681386166012113, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.278, + "step": 4200 + }, + { + "epoch": 0.4769273887700402, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2849, + "step": 4201 + }, + { + "epoch": 0.4770409158799593, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2701, + "step": 4202 + }, + { + "epoch": 0.47715444298987836, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2718, + "step": 4203 + }, + { + "epoch": 0.47726797009979743, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2737, + "step": 4204 + }, + { + "epoch": 0.4773814972097165, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2646, + "step": 4205 + }, + { + "epoch": 0.4774950243196356, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2951, + "step": 4206 + }, + { + "epoch": 0.47760855142955466, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2739, + "step": 4207 + }, + { + "epoch": 0.47772207853947374, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.271, + "step": 4208 + }, + { + "epoch": 0.4778356056493928, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2796, + "step": 4209 + }, + { + "epoch": 0.4779491327593119, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2817, + "step": 4210 + }, + { + "epoch": 0.47806265986923097, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2706, + "step": 4211 + }, + { + "epoch": 0.47817618697915004, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.3022, + "step": 4212 + }, + { + "epoch": 0.4782897140890691, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.276, + "step": 4213 + }, + { + "epoch": 0.4784032411989882, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2765, + "step": 4214 + }, + { + "epoch": 0.47851676830890727, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.272, + "step": 4215 + }, + { + "epoch": 0.47863029541882635, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2928, + "step": 4216 + }, + { + "epoch": 0.4787438225287454, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2649, + "step": 4217 + }, + { + "epoch": 0.4788573496386645, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2961, + "step": 4218 + }, + { + "epoch": 0.4789708767485836, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2805, + "step": 4219 + }, + { + "epoch": 0.47908440385850265, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2756, + "step": 4220 + }, + { + "epoch": 0.4791979309684217, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2657, + "step": 4221 + }, + { + "epoch": 0.4793114580783408, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.3049, + "step": 4222 + }, + { + "epoch": 0.4794249851882599, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2901, + "step": 4223 + }, + { + "epoch": 0.47953851229817895, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2723, + "step": 4224 + }, + { + "epoch": 0.47965203940809803, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2568, + "step": 4225 + }, + { + "epoch": 0.4797655665180171, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2746, + "step": 4226 + }, + { + "epoch": 0.4798790936279362, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2832, + "step": 4227 + }, + { + "epoch": 0.47999262073785526, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2877, + "step": 4228 + }, + { + "epoch": 0.48010614784777433, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.3013, + "step": 4229 + }, + { + "epoch": 0.4802196749576934, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2827, + "step": 4230 + }, + { + "epoch": 0.4803332020676125, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2846, + "step": 4231 + }, + { + "epoch": 0.48044672917753156, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2667, + "step": 4232 + }, + { + "epoch": 0.48056025628745064, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.275, + "step": 4233 + }, + { + "epoch": 0.4806737833973697, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2753, + "step": 4234 + }, + { + "epoch": 0.4807873105072888, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2781, + "step": 4235 + }, + { + "epoch": 0.48090083761720787, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2808, + "step": 4236 + }, + { + "epoch": 0.48101436472712694, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2913, + "step": 4237 + }, + { + "epoch": 0.481127891837046, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2696, + "step": 4238 + }, + { + "epoch": 0.4812414189469651, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2676, + "step": 4239 + }, + { + "epoch": 0.48135494605688417, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.285, + "step": 4240 + }, + { + "epoch": 0.48146847316680325, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3024, + "step": 4241 + }, + { + "epoch": 0.4815820002767223, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.3009, + "step": 4242 + }, + { + "epoch": 0.4816955273866414, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2913, + "step": 4243 + }, + { + "epoch": 0.4818090544965605, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2837, + "step": 4244 + }, + { + "epoch": 0.48192258160647955, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2719, + "step": 4245 + }, + { + "epoch": 0.4820361087163986, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2745, + "step": 4246 + }, + { + "epoch": 0.4821496358263177, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2769, + "step": 4247 + }, + { + "epoch": 0.4822631629362368, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2741, + "step": 4248 + }, + { + "epoch": 0.48237669004615585, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2689, + "step": 4249 + }, + { + "epoch": 0.48249021715607493, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2889, + "step": 4250 + }, + { + "epoch": 0.482603744265994, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2725, + "step": 4251 + }, + { + "epoch": 0.4827172713759131, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2645, + "step": 4252 + }, + { + "epoch": 0.48283079848583216, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2783, + "step": 4253 + }, + { + "epoch": 0.48294432559575123, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2719, + "step": 4254 + }, + { + "epoch": 0.4830578527056703, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2734, + "step": 4255 + }, + { + "epoch": 0.4831713798155894, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.264, + "step": 4256 + }, + { + "epoch": 0.48328490692550846, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.2741, + "step": 4257 + }, + { + "epoch": 0.48339843403542754, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.271, + "step": 4258 + }, + { + "epoch": 0.4835119611453466, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.278, + "step": 4259 + }, + { + "epoch": 0.4836254882552657, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2848, + "step": 4260 + }, + { + "epoch": 0.48373901536518477, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2806, + "step": 4261 + }, + { + "epoch": 0.48385254247510384, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2913, + "step": 4262 + }, + { + "epoch": 0.4839660695850229, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2763, + "step": 4263 + }, + { + "epoch": 0.484079596694942, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2919, + "step": 4264 + }, + { + "epoch": 0.48419312380486107, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2589, + "step": 4265 + }, + { + "epoch": 0.48430665091478015, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2554, + "step": 4266 + }, + { + "epoch": 0.4844201780246992, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.281, + "step": 4267 + }, + { + "epoch": 0.4845337051346183, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2589, + "step": 4268 + }, + { + "epoch": 0.4846472322445374, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2809, + "step": 4269 + }, + { + "epoch": 0.48476075935445645, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2884, + "step": 4270 + }, + { + "epoch": 0.4848742864643755, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.3009, + "step": 4271 + }, + { + "epoch": 0.4849878135742946, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2942, + "step": 4272 + }, + { + "epoch": 0.4851013406842137, + "grad_norm": 0.2265625, + "learning_rate": 0.002, + "loss": 5.2842, + "step": 4273 + }, + { + "epoch": 0.48521486779413275, + "grad_norm": 0.220703125, + "learning_rate": 0.002, + "loss": 5.2685, + "step": 4274 + }, + { + "epoch": 0.48532839490405183, + "grad_norm": 0.21484375, + "learning_rate": 0.002, + "loss": 5.2721, + "step": 4275 + }, + { + "epoch": 0.4854419220139709, + "grad_norm": 0.2255859375, + "learning_rate": 0.002, + "loss": 5.2798, + "step": 4276 + }, + { + "epoch": 0.48555544912389, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2674, + "step": 4277 + }, + { + "epoch": 0.48566897623380906, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2881, + "step": 4278 + }, + { + "epoch": 0.48578250334372813, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.289, + "step": 4279 + }, + { + "epoch": 0.4858960304536472, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2972, + "step": 4280 + }, + { + "epoch": 0.4860095575635663, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2825, + "step": 4281 + }, + { + "epoch": 0.48612308467348536, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.2715, + "step": 4282 + }, + { + "epoch": 0.48623661178340444, + "grad_norm": 0.453125, + "learning_rate": 0.002, + "loss": 5.2685, + "step": 4283 + }, + { + "epoch": 0.48635013889332357, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.275, + "step": 4284 + }, + { + "epoch": 0.48646366600324265, + "grad_norm": 0.44140625, + "learning_rate": 0.002, + "loss": 5.2644, + "step": 4285 + }, + { + "epoch": 0.4865771931131617, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.2604, + "step": 4286 + }, + { + "epoch": 0.4866907202230808, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2898, + "step": 4287 + }, + { + "epoch": 0.4868042473329999, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2765, + "step": 4288 + }, + { + "epoch": 0.48691777444291895, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2545, + "step": 4289 + }, + { + "epoch": 0.487031301552838, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.274, + "step": 4290 + }, + { + "epoch": 0.4871448286627571, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.272, + "step": 4291 + }, + { + "epoch": 0.4872583557726762, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2914, + "step": 4292 + }, + { + "epoch": 0.48737188288259525, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2612, + "step": 4293 + }, + { + "epoch": 0.48748540999251433, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2782, + "step": 4294 + }, + { + "epoch": 0.4875989371024334, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2715, + "step": 4295 + }, + { + "epoch": 0.4877124642123525, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2713, + "step": 4296 + }, + { + "epoch": 0.48782599132227156, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.2512, + "step": 4297 + }, + { + "epoch": 0.48793951843219063, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2924, + "step": 4298 + }, + { + "epoch": 0.4880530455421097, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.2789, + "step": 4299 + }, + { + "epoch": 0.4881665726520288, + "grad_norm": 0.234375, + "learning_rate": 0.002, + "loss": 5.2764, + "step": 4300 + }, + { + "epoch": 0.48828009976194786, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2655, + "step": 4301 + }, + { + "epoch": 0.48839362687186694, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2864, + "step": 4302 + }, + { + "epoch": 0.488507153981786, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2633, + "step": 4303 + }, + { + "epoch": 0.4886206810917051, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2908, + "step": 4304 + }, + { + "epoch": 0.48873420820162417, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.3012, + "step": 4305 + }, + { + "epoch": 0.48884773531154324, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2654, + "step": 4306 + }, + { + "epoch": 0.4889612624214623, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2833, + "step": 4307 + }, + { + "epoch": 0.4890747895313814, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.239, + "step": 4308 + }, + { + "epoch": 0.48918831664130047, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2725, + "step": 4309 + }, + { + "epoch": 0.48930184375121955, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2812, + "step": 4310 + }, + { + "epoch": 0.4894153708611386, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2751, + "step": 4311 + }, + { + "epoch": 0.4895288979710577, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2899, + "step": 4312 + }, + { + "epoch": 0.4896424250809768, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.27, + "step": 4313 + }, + { + "epoch": 0.48975595219089585, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2771, + "step": 4314 + }, + { + "epoch": 0.4898694793008149, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2809, + "step": 4315 + }, + { + "epoch": 0.489983006410734, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2721, + "step": 4316 + }, + { + "epoch": 0.4900965335206531, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2596, + "step": 4317 + }, + { + "epoch": 0.49021006063057215, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2667, + "step": 4318 + }, + { + "epoch": 0.49032358774049123, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.2727, + "step": 4319 + }, + { + "epoch": 0.4904371148504103, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2581, + "step": 4320 + }, + { + "epoch": 0.4905506419603294, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.2885, + "step": 4321 + }, + { + "epoch": 0.49066416907024846, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2668, + "step": 4322 + }, + { + "epoch": 0.49077769618016753, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2866, + "step": 4323 + }, + { + "epoch": 0.4908912232900866, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.285, + "step": 4324 + }, + { + "epoch": 0.4910047504000057, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.3013, + "step": 4325 + }, + { + "epoch": 0.49111827750992476, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2688, + "step": 4326 + }, + { + "epoch": 0.49123180461984384, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2867, + "step": 4327 + }, + { + "epoch": 0.4913453317297629, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2655, + "step": 4328 + }, + { + "epoch": 0.491458858839682, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2834, + "step": 4329 + }, + { + "epoch": 0.49157238594960107, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2866, + "step": 4330 + }, + { + "epoch": 0.49168591305952014, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.293, + "step": 4331 + }, + { + "epoch": 0.4917994401694392, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2685, + "step": 4332 + }, + { + "epoch": 0.4919129672793583, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2922, + "step": 4333 + }, + { + "epoch": 0.49202649438927737, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2881, + "step": 4334 + }, + { + "epoch": 0.49214002149919645, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2684, + "step": 4335 + }, + { + "epoch": 0.4922535486091155, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2828, + "step": 4336 + }, + { + "epoch": 0.4923670757190346, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.265, + "step": 4337 + }, + { + "epoch": 0.4924806028289537, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2705, + "step": 4338 + }, + { + "epoch": 0.49259412993887275, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2789, + "step": 4339 + }, + { + "epoch": 0.4927076570487918, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2735, + "step": 4340 + }, + { + "epoch": 0.4928211841587109, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2954, + "step": 4341 + }, + { + "epoch": 0.49293471126863, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2829, + "step": 4342 + }, + { + "epoch": 0.49304823837854905, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2729, + "step": 4343 + }, + { + "epoch": 0.49316176548846813, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2861, + "step": 4344 + }, + { + "epoch": 0.4932752925983872, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2911, + "step": 4345 + }, + { + "epoch": 0.4933888197083063, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2675, + "step": 4346 + }, + { + "epoch": 0.49350234681822536, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2774, + "step": 4347 + }, + { + "epoch": 0.49361587392814443, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2792, + "step": 4348 + }, + { + "epoch": 0.4937294010380635, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2845, + "step": 4349 + }, + { + "epoch": 0.4938429281479826, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.259, + "step": 4350 + }, + { + "epoch": 0.49395645525790166, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2573, + "step": 4351 + }, + { + "epoch": 0.49406998236782074, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.259, + "step": 4352 + }, + { + "epoch": 0.4941835094777398, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2878, + "step": 4353 + }, + { + "epoch": 0.4942970365876589, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2687, + "step": 4354 + }, + { + "epoch": 0.49441056369757796, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2737, + "step": 4355 + }, + { + "epoch": 0.49452409080749704, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2714, + "step": 4356 + }, + { + "epoch": 0.4946376179174161, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2749, + "step": 4357 + }, + { + "epoch": 0.4947511450273352, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.276, + "step": 4358 + }, + { + "epoch": 0.49486467213725427, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2784, + "step": 4359 + }, + { + "epoch": 0.49497819924717334, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2792, + "step": 4360 + }, + { + "epoch": 0.4950917263570924, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2767, + "step": 4361 + }, + { + "epoch": 0.4952052534670115, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2824, + "step": 4362 + }, + { + "epoch": 0.4953187805769306, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.2581, + "step": 4363 + }, + { + "epoch": 0.49543230768684965, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2796, + "step": 4364 + }, + { + "epoch": 0.4955458347967687, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2757, + "step": 4365 + }, + { + "epoch": 0.4956593619066878, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2657, + "step": 4366 + }, + { + "epoch": 0.4957728890166069, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2829, + "step": 4367 + }, + { + "epoch": 0.49588641612652595, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2794, + "step": 4368 + }, + { + "epoch": 0.49599994323644503, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2744, + "step": 4369 + }, + { + "epoch": 0.4961134703463641, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2651, + "step": 4370 + }, + { + "epoch": 0.4962269974562832, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2864, + "step": 4371 + }, + { + "epoch": 0.49634052456620226, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2515, + "step": 4372 + }, + { + "epoch": 0.49645405167612133, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2747, + "step": 4373 + }, + { + "epoch": 0.4965675787860404, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2938, + "step": 4374 + }, + { + "epoch": 0.4966811058959595, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2761, + "step": 4375 + }, + { + "epoch": 0.49679463300587856, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2859, + "step": 4376 + }, + { + "epoch": 0.49690816011579764, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2499, + "step": 4377 + }, + { + "epoch": 0.4970216872257167, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2793, + "step": 4378 + }, + { + "epoch": 0.4971352143356358, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2832, + "step": 4379 + }, + { + "epoch": 0.49724874144555486, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2891, + "step": 4380 + }, + { + "epoch": 0.49736226855547394, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.2661, + "step": 4381 + }, + { + "epoch": 0.497475795665393, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.2564, + "step": 4382 + }, + { + "epoch": 0.4975893227753121, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2822, + "step": 4383 + }, + { + "epoch": 0.49770284988523117, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2446, + "step": 4384 + }, + { + "epoch": 0.49781637699515024, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2813, + "step": 4385 + }, + { + "epoch": 0.4979299041050693, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.277, + "step": 4386 + }, + { + "epoch": 0.4980434312149884, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2752, + "step": 4387 + }, + { + "epoch": 0.4981569583249075, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2691, + "step": 4388 + }, + { + "epoch": 0.49827048543482655, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2692, + "step": 4389 + }, + { + "epoch": 0.4983840125447456, + "grad_norm": 0.2421875, + "learning_rate": 0.002, + "loss": 5.2716, + "step": 4390 + }, + { + "epoch": 0.4984975396546647, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2966, + "step": 4391 + }, + { + "epoch": 0.4986110667645838, + "grad_norm": 0.2373046875, + "learning_rate": 0.002, + "loss": 5.2567, + "step": 4392 + }, + { + "epoch": 0.49872459387450285, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.2874, + "step": 4393 + }, + { + "epoch": 0.49883812098442193, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.297, + "step": 4394 + }, + { + "epoch": 0.498951648094341, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2886, + "step": 4395 + }, + { + "epoch": 0.4990651752042601, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2704, + "step": 4396 + }, + { + "epoch": 0.49917870231417916, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2579, + "step": 4397 + }, + { + "epoch": 0.49929222942409823, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2935, + "step": 4398 + }, + { + "epoch": 0.4994057565340173, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2758, + "step": 4399 + }, + { + "epoch": 0.4995192836439364, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2413, + "step": 4400 + }, + { + "epoch": 0.49963281075385546, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2778, + "step": 4401 + }, + { + "epoch": 0.49974633786377454, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2682, + "step": 4402 + }, + { + "epoch": 0.4998598649736936, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2754, + "step": 4403 + }, + { + "epoch": 0.4999733920836127, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2544, + "step": 4404 + }, + { + "epoch": 0.5000869191935318, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2859, + "step": 4405 + }, + { + "epoch": 0.5002004463034508, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2678, + "step": 4406 + }, + { + "epoch": 0.5003139734133699, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2839, + "step": 4407 + }, + { + "epoch": 0.500427500523289, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2951, + "step": 4408 + }, + { + "epoch": 0.5005410276332081, + "grad_norm": 0.21875, + "learning_rate": 0.002, + "loss": 5.262, + "step": 4409 + }, + { + "epoch": 0.5006545547431271, + "grad_norm": 0.2177734375, + "learning_rate": 0.002, + "loss": 5.2515, + "step": 4410 + }, + { + "epoch": 0.5007680818530462, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.2542, + "step": 4411 + }, + { + "epoch": 0.5008816089629653, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2926, + "step": 4412 + }, + { + "epoch": 0.5009951360728844, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2731, + "step": 4413 + }, + { + "epoch": 0.5011086631828034, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.2616, + "step": 4414 + }, + { + "epoch": 0.5012221902927225, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2803, + "step": 4415 + }, + { + "epoch": 0.5013357174026416, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2849, + "step": 4416 + }, + { + "epoch": 0.5014492445125607, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2535, + "step": 4417 + }, + { + "epoch": 0.5015627716224798, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2637, + "step": 4418 + }, + { + "epoch": 0.5016762987323988, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2848, + "step": 4419 + }, + { + "epoch": 0.5017898258423179, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2842, + "step": 4420 + }, + { + "epoch": 0.501903352952237, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2852, + "step": 4421 + }, + { + "epoch": 0.5020168800621561, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2745, + "step": 4422 + }, + { + "epoch": 0.5021304071720751, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2847, + "step": 4423 + }, + { + "epoch": 0.5022439342819942, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2735, + "step": 4424 + }, + { + "epoch": 0.5023574613919133, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2885, + "step": 4425 + }, + { + "epoch": 0.5024709885018324, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2935, + "step": 4426 + }, + { + "epoch": 0.5025845156117514, + "grad_norm": 0.2421875, + "learning_rate": 0.002, + "loss": 5.2628, + "step": 4427 + }, + { + "epoch": 0.5026980427216705, + "grad_norm": 0.2197265625, + "learning_rate": 0.002, + "loss": 5.2868, + "step": 4428 + }, + { + "epoch": 0.5028115698315896, + "grad_norm": 0.2373046875, + "learning_rate": 0.002, + "loss": 5.2769, + "step": 4429 + }, + { + "epoch": 0.5029250969415087, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2724, + "step": 4430 + }, + { + "epoch": 0.5030386240514277, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2838, + "step": 4431 + }, + { + "epoch": 0.5031521511613468, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2899, + "step": 4432 + }, + { + "epoch": 0.5032656782712659, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2833, + "step": 4433 + }, + { + "epoch": 0.503379205381185, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2755, + "step": 4434 + }, + { + "epoch": 0.503492732491104, + "grad_norm": 0.49609375, + "learning_rate": 0.002, + "loss": 5.2613, + "step": 4435 + }, + { + "epoch": 0.5036062596010231, + "grad_norm": 0.447265625, + "learning_rate": 0.002, + "loss": 5.2894, + "step": 4436 + }, + { + "epoch": 0.5037197867109422, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.2776, + "step": 4437 + }, + { + "epoch": 0.5038333138208613, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.2769, + "step": 4438 + }, + { + "epoch": 0.5039468409307803, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.257, + "step": 4439 + }, + { + "epoch": 0.5040603680406994, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2543, + "step": 4440 + }, + { + "epoch": 0.5041738951506185, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 4441 + }, + { + "epoch": 0.5042874222605376, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2752, + "step": 4442 + }, + { + "epoch": 0.5044009493704567, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2787, + "step": 4443 + }, + { + "epoch": 0.5045144764803757, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2588, + "step": 4444 + }, + { + "epoch": 0.5046280035902948, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2626, + "step": 4445 + }, + { + "epoch": 0.5047415307002139, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2998, + "step": 4446 + }, + { + "epoch": 0.504855057810133, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2842, + "step": 4447 + }, + { + "epoch": 0.504968584920052, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.269, + "step": 4448 + }, + { + "epoch": 0.5050821120299711, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2676, + "step": 4449 + }, + { + "epoch": 0.5051956391398902, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.2915, + "step": 4450 + }, + { + "epoch": 0.5053091662498093, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2783, + "step": 4451 + }, + { + "epoch": 0.5054226933597283, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.251, + "step": 4452 + }, + { + "epoch": 0.5055362204696474, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2532, + "step": 4453 + }, + { + "epoch": 0.5056497475795665, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2629, + "step": 4454 + }, + { + "epoch": 0.5057632746894856, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2785, + "step": 4455 + }, + { + "epoch": 0.5058768017994046, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2722, + "step": 4456 + }, + { + "epoch": 0.5059903289093237, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2931, + "step": 4457 + }, + { + "epoch": 0.5061038560192428, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2488, + "step": 4458 + }, + { + "epoch": 0.5062173831291619, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2554, + "step": 4459 + }, + { + "epoch": 0.506330910239081, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.2653, + "step": 4460 + }, + { + "epoch": 0.506444437349, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2808, + "step": 4461 + }, + { + "epoch": 0.5065579644589191, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2679, + "step": 4462 + }, + { + "epoch": 0.5066714915688382, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2781, + "step": 4463 + }, + { + "epoch": 0.5067850186787572, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2764, + "step": 4464 + }, + { + "epoch": 0.5068985457886764, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2822, + "step": 4465 + }, + { + "epoch": 0.5070120728985955, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2914, + "step": 4466 + }, + { + "epoch": 0.5071256000085146, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2817, + "step": 4467 + }, + { + "epoch": 0.5072391271184337, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2562, + "step": 4468 + }, + { + "epoch": 0.5073526542283527, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2588, + "step": 4469 + }, + { + "epoch": 0.5074661813382718, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2821, + "step": 4470 + }, + { + "epoch": 0.5075797084481909, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2679, + "step": 4471 + }, + { + "epoch": 0.50769323555811, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2778, + "step": 4472 + }, + { + "epoch": 0.507806762668029, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2561, + "step": 4473 + }, + { + "epoch": 0.5079202897779481, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2599, + "step": 4474 + }, + { + "epoch": 0.5080338168878672, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.264, + "step": 4475 + }, + { + "epoch": 0.5081473439977863, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2594, + "step": 4476 + }, + { + "epoch": 0.5082608711077053, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.257, + "step": 4477 + }, + { + "epoch": 0.5083743982176244, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2827, + "step": 4478 + }, + { + "epoch": 0.5084879253275435, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2675, + "step": 4479 + }, + { + "epoch": 0.5086014524374626, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.275, + "step": 4480 + }, + { + "epoch": 0.5087149795473817, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2754, + "step": 4481 + }, + { + "epoch": 0.5088285066573007, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.285, + "step": 4482 + }, + { + "epoch": 0.5089420337672198, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.263, + "step": 4483 + }, + { + "epoch": 0.5090555608771389, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.292, + "step": 4484 + }, + { + "epoch": 0.509169087987058, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2537, + "step": 4485 + }, + { + "epoch": 0.509282615096977, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2826, + "step": 4486 + }, + { + "epoch": 0.5093961422068961, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2706, + "step": 4487 + }, + { + "epoch": 0.5095096693168152, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2787, + "step": 4488 + }, + { + "epoch": 0.5096231964267343, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2816, + "step": 4489 + }, + { + "epoch": 0.5097367235366533, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2761, + "step": 4490 + }, + { + "epoch": 0.5098502506465724, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2401, + "step": 4491 + }, + { + "epoch": 0.5099637777564915, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2777, + "step": 4492 + }, + { + "epoch": 0.5100773048664106, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2774, + "step": 4493 + }, + { + "epoch": 0.5101908319763296, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2584, + "step": 4494 + }, + { + "epoch": 0.5103043590862487, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2725, + "step": 4495 + }, + { + "epoch": 0.5104178861961678, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2768, + "step": 4496 + }, + { + "epoch": 0.5105314133060869, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2623, + "step": 4497 + }, + { + "epoch": 0.5106449404160059, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.249, + "step": 4498 + }, + { + "epoch": 0.510758467525925, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2789, + "step": 4499 + }, + { + "epoch": 0.5108719946358441, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2577, + "step": 4500 + }, + { + "epoch": 0.5109855217457632, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.24, + "step": 4501 + }, + { + "epoch": 0.5110990488556822, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2747, + "step": 4502 + }, + { + "epoch": 0.5112125759656013, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2761, + "step": 4503 + }, + { + "epoch": 0.5113261030755204, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2614, + "step": 4504 + }, + { + "epoch": 0.5114396301854395, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2729, + "step": 4505 + }, + { + "epoch": 0.5115531572953586, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.276, + "step": 4506 + }, + { + "epoch": 0.5116666844052776, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2727, + "step": 4507 + }, + { + "epoch": 0.5117802115151967, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.277, + "step": 4508 + }, + { + "epoch": 0.5118937386251158, + "grad_norm": 0.515625, + "learning_rate": 0.002, + "loss": 5.2764, + "step": 4509 + }, + { + "epoch": 0.5120072657350349, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.2687, + "step": 4510 + }, + { + "epoch": 0.5121207928449539, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2862, + "step": 4511 + }, + { + "epoch": 0.512234319954873, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2527, + "step": 4512 + }, + { + "epoch": 0.5123478470647921, + "grad_norm": 0.2294921875, + "learning_rate": 0.002, + "loss": 5.2478, + "step": 4513 + }, + { + "epoch": 0.5124613741747112, + "grad_norm": 0.2373046875, + "learning_rate": 0.002, + "loss": 5.272, + "step": 4514 + }, + { + "epoch": 0.5125749012846302, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2608, + "step": 4515 + }, + { + "epoch": 0.5126884283945493, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2636, + "step": 4516 + }, + { + "epoch": 0.5128019555044684, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.266, + "step": 4517 + }, + { + "epoch": 0.5129154826143875, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2682, + "step": 4518 + }, + { + "epoch": 0.5130290097243065, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2658, + "step": 4519 + }, + { + "epoch": 0.5131425368342256, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.274, + "step": 4520 + }, + { + "epoch": 0.5132560639441447, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2478, + "step": 4521 + }, + { + "epoch": 0.5133695910540638, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2583, + "step": 4522 + }, + { + "epoch": 0.5134831181639828, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.267, + "step": 4523 + }, + { + "epoch": 0.5135966452739019, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2777, + "step": 4524 + }, + { + "epoch": 0.513710172383821, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2638, + "step": 4525 + }, + { + "epoch": 0.5138236994937401, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2852, + "step": 4526 + }, + { + "epoch": 0.5139372266036591, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2706, + "step": 4527 + }, + { + "epoch": 0.5140507537135782, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2595, + "step": 4528 + }, + { + "epoch": 0.5141642808234973, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2785, + "step": 4529 + }, + { + "epoch": 0.5142778079334164, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.268, + "step": 4530 + }, + { + "epoch": 0.5143913350433355, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2768, + "step": 4531 + }, + { + "epoch": 0.5145048621532545, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2795, + "step": 4532 + }, + { + "epoch": 0.5146183892631736, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2858, + "step": 4533 + }, + { + "epoch": 0.5147319163730927, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2685, + "step": 4534 + }, + { + "epoch": 0.5148454434830118, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.257, + "step": 4535 + }, + { + "epoch": 0.5149589705929308, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2677, + "step": 4536 + }, + { + "epoch": 0.5150724977028499, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2891, + "step": 4537 + }, + { + "epoch": 0.515186024812769, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2767, + "step": 4538 + }, + { + "epoch": 0.5152995519226881, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2675, + "step": 4539 + }, + { + "epoch": 0.5154130790326071, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2758, + "step": 4540 + }, + { + "epoch": 0.5155266061425262, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2728, + "step": 4541 + }, + { + "epoch": 0.5156401332524453, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2665, + "step": 4542 + }, + { + "epoch": 0.5157536603623644, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2827, + "step": 4543 + }, + { + "epoch": 0.5158671874722834, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2654, + "step": 4544 + }, + { + "epoch": 0.5159807145822025, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2621, + "step": 4545 + }, + { + "epoch": 0.5160942416921216, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2575, + "step": 4546 + }, + { + "epoch": 0.5162077688020407, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2613, + "step": 4547 + }, + { + "epoch": 0.5163212959119597, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2578, + "step": 4548 + }, + { + "epoch": 0.5164348230218788, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2759, + "step": 4549 + }, + { + "epoch": 0.5165483501317979, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.2896, + "step": 4550 + }, + { + "epoch": 0.516661877241717, + "grad_norm": 0.248046875, + "learning_rate": 0.002, + "loss": 5.2496, + "step": 4551 + }, + { + "epoch": 0.516775404351636, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2649, + "step": 4552 + }, + { + "epoch": 0.5168889314615551, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2708, + "step": 4553 + }, + { + "epoch": 0.5170024585714742, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2592, + "step": 4554 + }, + { + "epoch": 0.5171159856813933, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.252, + "step": 4555 + }, + { + "epoch": 0.5172295127913124, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2828, + "step": 4556 + }, + { + "epoch": 0.5173430399012314, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2921, + "step": 4557 + }, + { + "epoch": 0.5174565670111505, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2711, + "step": 4558 + }, + { + "epoch": 0.5175700941210696, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2884, + "step": 4559 + }, + { + "epoch": 0.5176836212309887, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.2598, + "step": 4560 + }, + { + "epoch": 0.5177971483409077, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2724, + "step": 4561 + }, + { + "epoch": 0.5179106754508268, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2781, + "step": 4562 + }, + { + "epoch": 0.5180242025607459, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2622, + "step": 4563 + }, + { + "epoch": 0.518137729670665, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2789, + "step": 4564 + }, + { + "epoch": 0.518251256780584, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2813, + "step": 4565 + }, + { + "epoch": 0.5183647838905031, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2625, + "step": 4566 + }, + { + "epoch": 0.5184783110004222, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2652, + "step": 4567 + }, + { + "epoch": 0.5185918381103413, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2826, + "step": 4568 + }, + { + "epoch": 0.5187053652202603, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2973, + "step": 4569 + }, + { + "epoch": 0.5188188923301794, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2557, + "step": 4570 + }, + { + "epoch": 0.5189324194400985, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2761, + "step": 4571 + }, + { + "epoch": 0.5190459465500176, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2556, + "step": 4572 + }, + { + "epoch": 0.5191594736599366, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.262, + "step": 4573 + }, + { + "epoch": 0.5192730007698557, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2994, + "step": 4574 + }, + { + "epoch": 0.5193865278797748, + "grad_norm": 0.2265625, + "learning_rate": 0.002, + "loss": 5.2627, + "step": 4575 + }, + { + "epoch": 0.5195000549896939, + "grad_norm": 0.232421875, + "learning_rate": 0.002, + "loss": 5.2736, + "step": 4576 + }, + { + "epoch": 0.519613582099613, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2822, + "step": 4577 + }, + { + "epoch": 0.519727109209532, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2554, + "step": 4578 + }, + { + "epoch": 0.5198406363194511, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2642, + "step": 4579 + }, + { + "epoch": 0.5199541634293702, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2858, + "step": 4580 + }, + { + "epoch": 0.5200676905392893, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2613, + "step": 4581 + }, + { + "epoch": 0.5201812176492083, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2585, + "step": 4582 + }, + { + "epoch": 0.5202947447591274, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2865, + "step": 4583 + }, + { + "epoch": 0.5204082718690465, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2731, + "step": 4584 + }, + { + "epoch": 0.5205217989789656, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2687, + "step": 4585 + }, + { + "epoch": 0.5206353260888846, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2729, + "step": 4586 + }, + { + "epoch": 0.5207488531988037, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2746, + "step": 4587 + }, + { + "epoch": 0.5208623803087228, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2524, + "step": 4588 + }, + { + "epoch": 0.5209759074186419, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2844, + "step": 4589 + }, + { + "epoch": 0.5210894345285609, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2802, + "step": 4590 + }, + { + "epoch": 0.52120296163848, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2804, + "step": 4591 + }, + { + "epoch": 0.5213164887483991, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2526, + "step": 4592 + }, + { + "epoch": 0.5214300158583182, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2675, + "step": 4593 + }, + { + "epoch": 0.5215435429682372, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.267, + "step": 4594 + }, + { + "epoch": 0.5216570700781563, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2815, + "step": 4595 + }, + { + "epoch": 0.5217705971880754, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.267, + "step": 4596 + }, + { + "epoch": 0.5218841242979945, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2847, + "step": 4597 + }, + { + "epoch": 0.5219976514079135, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2788, + "step": 4598 + }, + { + "epoch": 0.5221111785178326, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2759, + "step": 4599 + }, + { + "epoch": 0.5222247056277517, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2738, + "step": 4600 + }, + { + "epoch": 0.5223382327376708, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2744, + "step": 4601 + }, + { + "epoch": 0.5224517598475898, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.287, + "step": 4602 + }, + { + "epoch": 0.5225652869575089, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2836, + "step": 4603 + }, + { + "epoch": 0.522678814067428, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2817, + "step": 4604 + }, + { + "epoch": 0.5227923411773471, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2503, + "step": 4605 + }, + { + "epoch": 0.5229058682872662, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2808, + "step": 4606 + }, + { + "epoch": 0.5230193953971852, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2864, + "step": 4607 + }, + { + "epoch": 0.5231329225071043, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2666, + "step": 4608 + }, + { + "epoch": 0.5232464496170234, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2798, + "step": 4609 + }, + { + "epoch": 0.5233599767269425, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2871, + "step": 4610 + }, + { + "epoch": 0.5234735038368615, + "grad_norm": 0.232421875, + "learning_rate": 0.002, + "loss": 5.2519, + "step": 4611 + }, + { + "epoch": 0.5235870309467806, + "grad_norm": 0.2333984375, + "learning_rate": 0.002, + "loss": 5.2784, + "step": 4612 + }, + { + "epoch": 0.5237005580566997, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2724, + "step": 4613 + }, + { + "epoch": 0.5238140851666188, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2588, + "step": 4614 + }, + { + "epoch": 0.5239276122765378, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2797, + "step": 4615 + }, + { + "epoch": 0.5240411393864569, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2612, + "step": 4616 + }, + { + "epoch": 0.524154666496376, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2873, + "step": 4617 + }, + { + "epoch": 0.5242681936062951, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2602, + "step": 4618 + }, + { + "epoch": 0.5243817207162141, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2693, + "step": 4619 + }, + { + "epoch": 0.5244952478261332, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2778, + "step": 4620 + }, + { + "epoch": 0.5246087749360523, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2516, + "step": 4621 + }, + { + "epoch": 0.5247223020459714, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.287, + "step": 4622 + }, + { + "epoch": 0.5248358291558904, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2522, + "step": 4623 + }, + { + "epoch": 0.5249493562658095, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2626, + "step": 4624 + }, + { + "epoch": 0.5250628833757286, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2707, + "step": 4625 + }, + { + "epoch": 0.5251764104856477, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.269, + "step": 4626 + }, + { + "epoch": 0.5252899375955667, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2843, + "step": 4627 + }, + { + "epoch": 0.5254034647054858, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2642, + "step": 4628 + }, + { + "epoch": 0.5255169918154049, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2685, + "step": 4629 + }, + { + "epoch": 0.525630518925324, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2693, + "step": 4630 + }, + { + "epoch": 0.525744046035243, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2869, + "step": 4631 + }, + { + "epoch": 0.5258575731451621, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2516, + "step": 4632 + }, + { + "epoch": 0.5259711002550812, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2452, + "step": 4633 + }, + { + "epoch": 0.5260846273650003, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2657, + "step": 4634 + }, + { + "epoch": 0.5261981544749194, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2556, + "step": 4635 + }, + { + "epoch": 0.5263116815848384, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2759, + "step": 4636 + }, + { + "epoch": 0.5264252086947575, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.295, + "step": 4637 + }, + { + "epoch": 0.5265387358046766, + "grad_norm": 0.478515625, + "learning_rate": 0.002, + "loss": 5.2628, + "step": 4638 + }, + { + "epoch": 0.5266522629145957, + "grad_norm": 0.466796875, + "learning_rate": 0.002, + "loss": 5.2665, + "step": 4639 + }, + { + "epoch": 0.5267657900245147, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.2609, + "step": 4640 + }, + { + "epoch": 0.5268793171344338, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.2637, + "step": 4641 + }, + { + "epoch": 0.5269928442443529, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.2693, + "step": 4642 + }, + { + "epoch": 0.527106371354272, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2838, + "step": 4643 + }, + { + "epoch": 0.527219898464191, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2675, + "step": 4644 + }, + { + "epoch": 0.5273334255741101, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2669, + "step": 4645 + }, + { + "epoch": 0.5274469526840292, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2685, + "step": 4646 + }, + { + "epoch": 0.5275604797939483, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2591, + "step": 4647 + }, + { + "epoch": 0.5276740069038673, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2725, + "step": 4648 + }, + { + "epoch": 0.5277875340137864, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2576, + "step": 4649 + }, + { + "epoch": 0.5279010611237055, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2674, + "step": 4650 + }, + { + "epoch": 0.5280145882336246, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2735, + "step": 4651 + }, + { + "epoch": 0.5281281153435436, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2575, + "step": 4652 + }, + { + "epoch": 0.5282416424534627, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2633, + "step": 4653 + }, + { + "epoch": 0.5283551695633818, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2644, + "step": 4654 + }, + { + "epoch": 0.5284686966733009, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.275, + "step": 4655 + }, + { + "epoch": 0.52858222378322, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2638, + "step": 4656 + }, + { + "epoch": 0.528695750893139, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2458, + "step": 4657 + }, + { + "epoch": 0.5288092780030581, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2556, + "step": 4658 + }, + { + "epoch": 0.5289228051129772, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2712, + "step": 4659 + }, + { + "epoch": 0.5290363322228963, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2643, + "step": 4660 + }, + { + "epoch": 0.5291498593328153, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.291, + "step": 4661 + }, + { + "epoch": 0.5292633864427344, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2924, + "step": 4662 + }, + { + "epoch": 0.5293769135526535, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2956, + "step": 4663 + }, + { + "epoch": 0.5294904406625726, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2764, + "step": 4664 + }, + { + "epoch": 0.5296039677724916, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2938, + "step": 4665 + }, + { + "epoch": 0.5297174948824107, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.252, + "step": 4666 + }, + { + "epoch": 0.5298310219923298, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2632, + "step": 4667 + }, + { + "epoch": 0.5299445491022489, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2767, + "step": 4668 + }, + { + "epoch": 0.5300580762121679, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2681, + "step": 4669 + }, + { + "epoch": 0.530171603322087, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2644, + "step": 4670 + }, + { + "epoch": 0.5302851304320061, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2742, + "step": 4671 + }, + { + "epoch": 0.5303986575419252, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2872, + "step": 4672 + }, + { + "epoch": 0.5305121846518442, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2853, + "step": 4673 + }, + { + "epoch": 0.5306257117617633, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2595, + "step": 4674 + }, + { + "epoch": 0.5307392388716824, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2788, + "step": 4675 + }, + { + "epoch": 0.5308527659816015, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.2727, + "step": 4676 + }, + { + "epoch": 0.5309662930915205, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2789, + "step": 4677 + }, + { + "epoch": 0.5310798202014396, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.2691, + "step": 4678 + }, + { + "epoch": 0.5311933473113587, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2821, + "step": 4679 + }, + { + "epoch": 0.5313068744212778, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2646, + "step": 4680 + }, + { + "epoch": 0.5314204015311969, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2663, + "step": 4681 + }, + { + "epoch": 0.5315339286411159, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2468, + "step": 4682 + }, + { + "epoch": 0.531647455751035, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2525, + "step": 4683 + }, + { + "epoch": 0.5317609828609541, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2644, + "step": 4684 + }, + { + "epoch": 0.5318745099708732, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2676, + "step": 4685 + }, + { + "epoch": 0.5319880370807922, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 4686 + }, + { + "epoch": 0.5321015641907113, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2618, + "step": 4687 + }, + { + "epoch": 0.5322150913006304, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2868, + "step": 4688 + }, + { + "epoch": 0.5323286184105495, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.286, + "step": 4689 + }, + { + "epoch": 0.5324421455204685, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2853, + "step": 4690 + }, + { + "epoch": 0.5325556726303876, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2608, + "step": 4691 + }, + { + "epoch": 0.5326691997403067, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2534, + "step": 4692 + }, + { + "epoch": 0.5327827268502258, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 4693 + }, + { + "epoch": 0.5328962539601448, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2674, + "step": 4694 + }, + { + "epoch": 0.5330097810700639, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2653, + "step": 4695 + }, + { + "epoch": 0.533123308179983, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2691, + "step": 4696 + }, + { + "epoch": 0.5332368352899021, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2582, + "step": 4697 + }, + { + "epoch": 0.5333503623998211, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2716, + "step": 4698 + }, + { + "epoch": 0.5334638895097402, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2584, + "step": 4699 + }, + { + "epoch": 0.5335774166196593, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2741, + "step": 4700 + }, + { + "epoch": 0.5336909437295784, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2861, + "step": 4701 + }, + { + "epoch": 0.5338044708394974, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2831, + "step": 4702 + }, + { + "epoch": 0.5339179979494165, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.268, + "step": 4703 + }, + { + "epoch": 0.5340315250593356, + "grad_norm": 0.2275390625, + "learning_rate": 0.002, + "loss": 5.2463, + "step": 4704 + }, + { + "epoch": 0.5341450521692547, + "grad_norm": 0.216796875, + "learning_rate": 0.002, + "loss": 5.2673, + "step": 4705 + }, + { + "epoch": 0.5342585792791739, + "grad_norm": 0.2373046875, + "learning_rate": 0.002, + "loss": 5.289, + "step": 4706 + }, + { + "epoch": 0.5343721063890929, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2744, + "step": 4707 + }, + { + "epoch": 0.534485633499012, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2681, + "step": 4708 + }, + { + "epoch": 0.5345991606089311, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2951, + "step": 4709 + }, + { + "epoch": 0.5347126877188502, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2497, + "step": 4710 + }, + { + "epoch": 0.5348262148287692, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2657, + "step": 4711 + }, + { + "epoch": 0.5349397419386883, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2862, + "step": 4712 + }, + { + "epoch": 0.5350532690486074, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2674, + "step": 4713 + }, + { + "epoch": 0.5351667961585265, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.264, + "step": 4714 + }, + { + "epoch": 0.5352803232684455, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2677, + "step": 4715 + }, + { + "epoch": 0.5353938503783646, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2734, + "step": 4716 + }, + { + "epoch": 0.5355073774882837, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2763, + "step": 4717 + }, + { + "epoch": 0.5356209045982028, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2434, + "step": 4718 + }, + { + "epoch": 0.5357344317081218, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.266, + "step": 4719 + }, + { + "epoch": 0.5358479588180409, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2668, + "step": 4720 + }, + { + "epoch": 0.53596148592796, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2604, + "step": 4721 + }, + { + "epoch": 0.5360750130378791, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2683, + "step": 4722 + }, + { + "epoch": 0.5361885401477982, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2601, + "step": 4723 + }, + { + "epoch": 0.5363020672577172, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2473, + "step": 4724 + }, + { + "epoch": 0.5364155943676363, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2721, + "step": 4725 + }, + { + "epoch": 0.5365291214775554, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.291, + "step": 4726 + }, + { + "epoch": 0.5366426485874745, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2722, + "step": 4727 + }, + { + "epoch": 0.5367561756973935, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2896, + "step": 4728 + }, + { + "epoch": 0.5368697028073126, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2558, + "step": 4729 + }, + { + "epoch": 0.5369832299172317, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2672, + "step": 4730 + }, + { + "epoch": 0.5370967570271508, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.279, + "step": 4731 + }, + { + "epoch": 0.5372102841370698, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2847, + "step": 4732 + }, + { + "epoch": 0.5373238112469889, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2575, + "step": 4733 + }, + { + "epoch": 0.537437338356908, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2789, + "step": 4734 + }, + { + "epoch": 0.5375508654668271, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2528, + "step": 4735 + }, + { + "epoch": 0.5376643925767461, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2638, + "step": 4736 + }, + { + "epoch": 0.5377779196866652, + "grad_norm": 0.478515625, + "learning_rate": 0.002, + "loss": 5.2796, + "step": 4737 + }, + { + "epoch": 0.5378914467965843, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.2742, + "step": 4738 + }, + { + "epoch": 0.5380049739065034, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.3038, + "step": 4739 + }, + { + "epoch": 0.5381185010164224, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2565, + "step": 4740 + }, + { + "epoch": 0.5382320281263415, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2664, + "step": 4741 + }, + { + "epoch": 0.5383455552362606, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2826, + "step": 4742 + }, + { + "epoch": 0.5384590823461797, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2574, + "step": 4743 + }, + { + "epoch": 0.5385726094560987, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2592, + "step": 4744 + }, + { + "epoch": 0.5386861365660178, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 4745 + }, + { + "epoch": 0.5387996636759369, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2667, + "step": 4746 + }, + { + "epoch": 0.538913190785856, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2665, + "step": 4747 + }, + { + "epoch": 0.539026717895775, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2505, + "step": 4748 + }, + { + "epoch": 0.5391402450056941, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2598, + "step": 4749 + }, + { + "epoch": 0.5392537721156132, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2859, + "step": 4750 + }, + { + "epoch": 0.5393672992255323, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2975, + "step": 4751 + }, + { + "epoch": 0.5394808263354514, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2652, + "step": 4752 + }, + { + "epoch": 0.5395943534453704, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2702, + "step": 4753 + }, + { + "epoch": 0.5397078805552895, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2767, + "step": 4754 + }, + { + "epoch": 0.5398214076652086, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2694, + "step": 4755 + }, + { + "epoch": 0.5399349347751277, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.261, + "step": 4756 + }, + { + "epoch": 0.5400484618850467, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2656, + "step": 4757 + }, + { + "epoch": 0.5401619889949658, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2525, + "step": 4758 + }, + { + "epoch": 0.5402755161048849, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2514, + "step": 4759 + }, + { + "epoch": 0.540389043214804, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.27, + "step": 4760 + }, + { + "epoch": 0.540502570324723, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2491, + "step": 4761 + }, + { + "epoch": 0.5406160974346421, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2621, + "step": 4762 + }, + { + "epoch": 0.5407296245445612, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.2698, + "step": 4763 + }, + { + "epoch": 0.5408431516544803, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.2601, + "step": 4764 + }, + { + "epoch": 0.5409566787643993, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2681, + "step": 4765 + }, + { + "epoch": 0.5410702058743184, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2899, + "step": 4766 + }, + { + "epoch": 0.5411837329842375, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2805, + "step": 4767 + }, + { + "epoch": 0.5412972600941566, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2656, + "step": 4768 + }, + { + "epoch": 0.5414107872040756, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2657, + "step": 4769 + }, + { + "epoch": 0.5415243143139947, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2539, + "step": 4770 + }, + { + "epoch": 0.5416378414239138, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.262, + "step": 4771 + }, + { + "epoch": 0.5417513685338329, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2779, + "step": 4772 + }, + { + "epoch": 0.541864895643752, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2721, + "step": 4773 + }, + { + "epoch": 0.541978422753671, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2482, + "step": 4774 + }, + { + "epoch": 0.5420919498635901, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2841, + "step": 4775 + }, + { + "epoch": 0.5422054769735092, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2692, + "step": 4776 + }, + { + "epoch": 0.5423190040834283, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.288, + "step": 4777 + }, + { + "epoch": 0.5424325311933473, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2471, + "step": 4778 + }, + { + "epoch": 0.5425460583032664, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2591, + "step": 4779 + }, + { + "epoch": 0.5426595854131855, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2679, + "step": 4780 + }, + { + "epoch": 0.5427731125231046, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2839, + "step": 4781 + }, + { + "epoch": 0.5428866396330236, + "grad_norm": 0.2373046875, + "learning_rate": 0.002, + "loss": 5.2534, + "step": 4782 + }, + { + "epoch": 0.5430001667429427, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2566, + "step": 4783 + }, + { + "epoch": 0.5431136938528618, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2677, + "step": 4784 + }, + { + "epoch": 0.5432272209627809, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2848, + "step": 4785 + }, + { + "epoch": 0.5433407480726999, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2614, + "step": 4786 + }, + { + "epoch": 0.543454275182619, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2702, + "step": 4787 + }, + { + "epoch": 0.5435678022925381, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2501, + "step": 4788 + }, + { + "epoch": 0.5436813294024572, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2469, + "step": 4789 + }, + { + "epoch": 0.5437948565123762, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2621, + "step": 4790 + }, + { + "epoch": 0.5439083836222953, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.2656, + "step": 4791 + }, + { + "epoch": 0.5440219107322144, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.274, + "step": 4792 + }, + { + "epoch": 0.5441354378421335, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.2694, + "step": 4793 + }, + { + "epoch": 0.5442489649520525, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2536, + "step": 4794 + }, + { + "epoch": 0.5443624920619716, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2815, + "step": 4795 + }, + { + "epoch": 0.5444760191718907, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2686, + "step": 4796 + }, + { + "epoch": 0.5445895462818098, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2558, + "step": 4797 + }, + { + "epoch": 0.5447030733917289, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2573, + "step": 4798 + }, + { + "epoch": 0.5448166005016479, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2705, + "step": 4799 + }, + { + "epoch": 0.544930127611567, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2681, + "step": 4800 + }, + { + "epoch": 0.5450436547214861, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2624, + "step": 4801 + }, + { + "epoch": 0.5451571818314052, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2727, + "step": 4802 + }, + { + "epoch": 0.5452707089413242, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2668, + "step": 4803 + }, + { + "epoch": 0.5453842360512433, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2578, + "step": 4804 + }, + { + "epoch": 0.5454977631611624, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2728, + "step": 4805 + }, + { + "epoch": 0.5456112902710815, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2691, + "step": 4806 + }, + { + "epoch": 0.5457248173810005, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2667, + "step": 4807 + }, + { + "epoch": 0.5458383444909196, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2356, + "step": 4808 + }, + { + "epoch": 0.5459518716008387, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2789, + "step": 4809 + }, + { + "epoch": 0.5460653987107578, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2598, + "step": 4810 + }, + { + "epoch": 0.5461789258206768, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2812, + "step": 4811 + }, + { + "epoch": 0.5462924529305959, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2471, + "step": 4812 + }, + { + "epoch": 0.546405980040515, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2602, + "step": 4813 + }, + { + "epoch": 0.5465195071504341, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.259, + "step": 4814 + }, + { + "epoch": 0.5466330342603531, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2572, + "step": 4815 + }, + { + "epoch": 0.5467465613702722, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2786, + "step": 4816 + }, + { + "epoch": 0.5468600884801913, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2753, + "step": 4817 + }, + { + "epoch": 0.5469736155901104, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2718, + "step": 4818 + }, + { + "epoch": 0.5470871427000294, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2534, + "step": 4819 + }, + { + "epoch": 0.5472006698099485, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.2783, + "step": 4820 + }, + { + "epoch": 0.5473141969198676, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2803, + "step": 4821 + }, + { + "epoch": 0.5474277240297867, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2757, + "step": 4822 + }, + { + "epoch": 0.5475412511397058, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.27, + "step": 4823 + }, + { + "epoch": 0.5476547782496248, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2766, + "step": 4824 + }, + { + "epoch": 0.5477683053595439, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.2794, + "step": 4825 + }, + { + "epoch": 0.547881832469463, + "grad_norm": 0.2333984375, + "learning_rate": 0.002, + "loss": 5.2864, + "step": 4826 + }, + { + "epoch": 0.5479953595793821, + "grad_norm": 0.228515625, + "learning_rate": 0.002, + "loss": 5.2738, + "step": 4827 + }, + { + "epoch": 0.5481088866893011, + "grad_norm": 0.234375, + "learning_rate": 0.002, + "loss": 5.265, + "step": 4828 + }, + { + "epoch": 0.5482224137992202, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2708, + "step": 4829 + }, + { + "epoch": 0.5483359409091393, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2446, + "step": 4830 + }, + { + "epoch": 0.5484494680190584, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2725, + "step": 4831 + }, + { + "epoch": 0.5485629951289774, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2428, + "step": 4832 + }, + { + "epoch": 0.5486765222388965, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2532, + "step": 4833 + }, + { + "epoch": 0.5487900493488156, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2713, + "step": 4834 + }, + { + "epoch": 0.5489035764587347, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2614, + "step": 4835 + }, + { + "epoch": 0.5490171035686537, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2593, + "step": 4836 + }, + { + "epoch": 0.5491306306785728, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 4837 + }, + { + "epoch": 0.5492441577884919, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2648, + "step": 4838 + }, + { + "epoch": 0.549357684898411, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2384, + "step": 4839 + }, + { + "epoch": 0.54947121200833, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2734, + "step": 4840 + }, + { + "epoch": 0.5495847391182491, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2606, + "step": 4841 + }, + { + "epoch": 0.5496982662281682, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2821, + "step": 4842 + }, + { + "epoch": 0.5498117933380873, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2549, + "step": 4843 + }, + { + "epoch": 0.5499253204480063, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2668, + "step": 4844 + }, + { + "epoch": 0.5500388475579254, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2737, + "step": 4845 + }, + { + "epoch": 0.5501523746678445, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2557, + "step": 4846 + }, + { + "epoch": 0.5502659017777636, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2451, + "step": 4847 + }, + { + "epoch": 0.5503794288876827, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2861, + "step": 4848 + }, + { + "epoch": 0.5504929559976017, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2678, + "step": 4849 + }, + { + "epoch": 0.5506064831075208, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2563, + "step": 4850 + }, + { + "epoch": 0.5507200102174399, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2791, + "step": 4851 + }, + { + "epoch": 0.550833537327359, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2773, + "step": 4852 + }, + { + "epoch": 0.550947064437278, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2514, + "step": 4853 + }, + { + "epoch": 0.5510605915471971, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2465, + "step": 4854 + }, + { + "epoch": 0.5511741186571162, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2707, + "step": 4855 + }, + { + "epoch": 0.5512876457670353, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.288, + "step": 4856 + }, + { + "epoch": 0.5514011728769543, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2801, + "step": 4857 + }, + { + "epoch": 0.5515146999868734, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2546, + "step": 4858 + }, + { + "epoch": 0.5516282270967925, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2608, + "step": 4859 + }, + { + "epoch": 0.5517417542067116, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2584, + "step": 4860 + }, + { + "epoch": 0.5518552813166306, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2602, + "step": 4861 + }, + { + "epoch": 0.5519688084265497, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2732, + "step": 4862 + }, + { + "epoch": 0.5520823355364688, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.289, + "step": 4863 + }, + { + "epoch": 0.5521958626463879, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2686, + "step": 4864 + }, + { + "epoch": 0.552309389756307, + "grad_norm": 0.234375, + "learning_rate": 0.002, + "loss": 5.2674, + "step": 4865 + }, + { + "epoch": 0.552422916866226, + "grad_norm": 0.232421875, + "learning_rate": 0.002, + "loss": 5.2552, + "step": 4866 + }, + { + "epoch": 0.5525364439761451, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.2819, + "step": 4867 + }, + { + "epoch": 0.5526499710860642, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2762, + "step": 4868 + }, + { + "epoch": 0.5527634981959832, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2731, + "step": 4869 + }, + { + "epoch": 0.5528770253059023, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.268, + "step": 4870 + }, + { + "epoch": 0.5529905524158214, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2658, + "step": 4871 + }, + { + "epoch": 0.5531040795257405, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2427, + "step": 4872 + }, + { + "epoch": 0.5532176066356596, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2461, + "step": 4873 + }, + { + "epoch": 0.5533311337455786, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2468, + "step": 4874 + }, + { + "epoch": 0.5534446608554977, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2644, + "step": 4875 + }, + { + "epoch": 0.5535581879654168, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2604, + "step": 4876 + }, + { + "epoch": 0.5536717150753359, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2765, + "step": 4877 + }, + { + "epoch": 0.5537852421852549, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2489, + "step": 4878 + }, + { + "epoch": 0.553898769295174, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.249, + "step": 4879 + }, + { + "epoch": 0.5540122964050931, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2556, + "step": 4880 + }, + { + "epoch": 0.5541258235150122, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.252, + "step": 4881 + }, + { + "epoch": 0.5542393506249312, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2476, + "step": 4882 + }, + { + "epoch": 0.5543528777348503, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2588, + "step": 4883 + }, + { + "epoch": 0.5544664048447694, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2708, + "step": 4884 + }, + { + "epoch": 0.5545799319546885, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2757, + "step": 4885 + }, + { + "epoch": 0.5546934590646075, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2743, + "step": 4886 + }, + { + "epoch": 0.5548069861745266, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.2978, + "step": 4887 + }, + { + "epoch": 0.5549205132844457, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.28, + "step": 4888 + }, + { + "epoch": 0.5550340403943648, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2753, + "step": 4889 + }, + { + "epoch": 0.5551475675042838, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2616, + "step": 4890 + }, + { + "epoch": 0.5552610946142029, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2672, + "step": 4891 + }, + { + "epoch": 0.555374621724122, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2489, + "step": 4892 + }, + { + "epoch": 0.5554881488340411, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2742, + "step": 4893 + }, + { + "epoch": 0.5556016759439601, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.259, + "step": 4894 + }, + { + "epoch": 0.5557152030538792, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2753, + "step": 4895 + }, + { + "epoch": 0.5558287301637983, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2791, + "step": 4896 + }, + { + "epoch": 0.5559422572737174, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2427, + "step": 4897 + }, + { + "epoch": 0.5560557843836365, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2631, + "step": 4898 + }, + { + "epoch": 0.5561693114935555, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2643, + "step": 4899 + }, + { + "epoch": 0.5562828386034746, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2618, + "step": 4900 + }, + { + "epoch": 0.5563963657133937, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2575, + "step": 4901 + }, + { + "epoch": 0.5565098928233128, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2599, + "step": 4902 + }, + { + "epoch": 0.5566234199332318, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2515, + "step": 4903 + }, + { + "epoch": 0.5567369470431509, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2714, + "step": 4904 + }, + { + "epoch": 0.55685047415307, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.284, + "step": 4905 + }, + { + "epoch": 0.5569640012629891, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2701, + "step": 4906 + }, + { + "epoch": 0.5570775283729081, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2642, + "step": 4907 + }, + { + "epoch": 0.5571910554828272, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.266, + "step": 4908 + }, + { + "epoch": 0.5573045825927463, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.2678, + "step": 4909 + }, + { + "epoch": 0.5574181097026654, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.2475, + "step": 4910 + }, + { + "epoch": 0.5575316368125844, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.2458, + "step": 4911 + }, + { + "epoch": 0.5576451639225035, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2837, + "step": 4912 + }, + { + "epoch": 0.5577586910324226, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2611, + "step": 4913 + }, + { + "epoch": 0.5578722181423417, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2806, + "step": 4914 + }, + { + "epoch": 0.5579857452522607, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.274, + "step": 4915 + }, + { + "epoch": 0.5580992723621798, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2491, + "step": 4916 + }, + { + "epoch": 0.5582127994720989, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2842, + "step": 4917 + }, + { + "epoch": 0.558326326582018, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2646, + "step": 4918 + }, + { + "epoch": 0.558439853691937, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.254, + "step": 4919 + }, + { + "epoch": 0.5585533808018561, + "grad_norm": 0.2333984375, + "learning_rate": 0.002, + "loss": 5.2335, + "step": 4920 + }, + { + "epoch": 0.5586669079117752, + "grad_norm": 0.2275390625, + "learning_rate": 0.002, + "loss": 5.2735, + "step": 4921 + }, + { + "epoch": 0.5587804350216943, + "grad_norm": 0.2197265625, + "learning_rate": 0.002, + "loss": 5.2751, + "step": 4922 + }, + { + "epoch": 0.5588939621316134, + "grad_norm": 0.224609375, + "learning_rate": 0.002, + "loss": 5.2644, + "step": 4923 + }, + { + "epoch": 0.5590074892415324, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2791, + "step": 4924 + }, + { + "epoch": 0.5591210163514515, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2762, + "step": 4925 + }, + { + "epoch": 0.5592345434613706, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2835, + "step": 4926 + }, + { + "epoch": 0.5593480705712897, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2573, + "step": 4927 + }, + { + "epoch": 0.5594615976812087, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2572, + "step": 4928 + }, + { + "epoch": 0.5595751247911278, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2578, + "step": 4929 + }, + { + "epoch": 0.5596886519010469, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2523, + "step": 4930 + }, + { + "epoch": 0.559802179010966, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2722, + "step": 4931 + }, + { + "epoch": 0.559915706120885, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2724, + "step": 4932 + }, + { + "epoch": 0.5600292332308041, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2436, + "step": 4933 + }, + { + "epoch": 0.5601427603407232, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2627, + "step": 4934 + }, + { + "epoch": 0.5602562874506423, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2679, + "step": 4935 + }, + { + "epoch": 0.5603698145605613, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2673, + "step": 4936 + }, + { + "epoch": 0.5604833416704804, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2613, + "step": 4937 + }, + { + "epoch": 0.5605968687803995, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2709, + "step": 4938 + }, + { + "epoch": 0.5607103958903186, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2534, + "step": 4939 + }, + { + "epoch": 0.5608239230002376, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2902, + "step": 4940 + }, + { + "epoch": 0.5609374501101567, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.2548, + "step": 4941 + }, + { + "epoch": 0.5610509772200758, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.2546, + "step": 4942 + }, + { + "epoch": 0.5611645043299949, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.279, + "step": 4943 + }, + { + "epoch": 0.561278031439914, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2489, + "step": 4944 + }, + { + "epoch": 0.561391558549833, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2567, + "step": 4945 + }, + { + "epoch": 0.5615050856597521, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2749, + "step": 4946 + }, + { + "epoch": 0.5616186127696712, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2626, + "step": 4947 + }, + { + "epoch": 0.5617321398795904, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.272, + "step": 4948 + }, + { + "epoch": 0.5618456669895094, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2681, + "step": 4949 + }, + { + "epoch": 0.5619591940994285, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2593, + "step": 4950 + }, + { + "epoch": 0.5620727212093476, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2599, + "step": 4951 + }, + { + "epoch": 0.5621862483192667, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2606, + "step": 4952 + }, + { + "epoch": 0.5622997754291857, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.2742, + "step": 4953 + }, + { + "epoch": 0.5624133025391048, + "grad_norm": 0.2421875, + "learning_rate": 0.002, + "loss": 5.2869, + "step": 4954 + }, + { + "epoch": 0.5625268296490239, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2519, + "step": 4955 + }, + { + "epoch": 0.562640356758943, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2702, + "step": 4956 + }, + { + "epoch": 0.562753883868862, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2523, + "step": 4957 + }, + { + "epoch": 0.5628674109787811, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2656, + "step": 4958 + }, + { + "epoch": 0.5629809380887002, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2715, + "step": 4959 + }, + { + "epoch": 0.5630944651986193, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2514, + "step": 4960 + }, + { + "epoch": 0.5632079923085384, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2739, + "step": 4961 + }, + { + "epoch": 0.5633215194184574, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2516, + "step": 4962 + }, + { + "epoch": 0.5634350465283765, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2959, + "step": 4963 + }, + { + "epoch": 0.5635485736382956, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2617, + "step": 4964 + }, + { + "epoch": 0.5636621007482147, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2576, + "step": 4965 + }, + { + "epoch": 0.5637756278581337, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2649, + "step": 4966 + }, + { + "epoch": 0.5638891549680528, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2706, + "step": 4967 + }, + { + "epoch": 0.5640026820779719, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2523, + "step": 4968 + }, + { + "epoch": 0.564116209187891, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2626, + "step": 4969 + }, + { + "epoch": 0.56422973629781, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2693, + "step": 4970 + }, + { + "epoch": 0.5643432634077291, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2718, + "step": 4971 + }, + { + "epoch": 0.5644567905176482, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2836, + "step": 4972 + }, + { + "epoch": 0.5645703176275673, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2616, + "step": 4973 + }, + { + "epoch": 0.5646838447374863, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.27, + "step": 4974 + }, + { + "epoch": 0.5647973718474054, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2288, + "step": 4975 + }, + { + "epoch": 0.5649108989573245, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.272, + "step": 4976 + }, + { + "epoch": 0.5650244260672436, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2348, + "step": 4977 + }, + { + "epoch": 0.5651379531771626, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2764, + "step": 4978 + }, + { + "epoch": 0.5652514802870817, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2546, + "step": 4979 + }, + { + "epoch": 0.5653650073970008, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.249, + "step": 4980 + }, + { + "epoch": 0.5654785345069199, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2379, + "step": 4981 + }, + { + "epoch": 0.565592061616839, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2597, + "step": 4982 + }, + { + "epoch": 0.565705588726758, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2864, + "step": 4983 + }, + { + "epoch": 0.5658191158366771, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2508, + "step": 4984 + }, + { + "epoch": 0.5659326429465962, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2698, + "step": 4985 + }, + { + "epoch": 0.5660461700565153, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2628, + "step": 4986 + }, + { + "epoch": 0.5661596971664343, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2731, + "step": 4987 + }, + { + "epoch": 0.5662732242763534, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.276, + "step": 4988 + }, + { + "epoch": 0.5663867513862725, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.287, + "step": 4989 + }, + { + "epoch": 0.5665002784961916, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2646, + "step": 4990 + }, + { + "epoch": 0.5666138056061106, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2592, + "step": 4991 + }, + { + "epoch": 0.5667273327160297, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.2439, + "step": 4992 + }, + { + "epoch": 0.5668408598259488, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2753, + "step": 4993 + }, + { + "epoch": 0.5669543869358679, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.265, + "step": 4994 + }, + { + "epoch": 0.5670679140457869, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2626, + "step": 4995 + }, + { + "epoch": 0.567181441155706, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2856, + "step": 4996 + }, + { + "epoch": 0.5672949682656251, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2589, + "step": 4997 + }, + { + "epoch": 0.5674084953755442, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2614, + "step": 4998 + }, + { + "epoch": 0.5675220224854632, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2521, + "step": 4999 + }, + { + "epoch": 0.5676355495953823, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2686, + "step": 5000 + }, + { + "epoch": 0.5677490767053014, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2578, + "step": 5001 + }, + { + "epoch": 0.5678626038152205, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2686, + "step": 5002 + }, + { + "epoch": 0.5679761309251395, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2643, + "step": 5003 + }, + { + "epoch": 0.5680896580350586, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2545, + "step": 5004 + }, + { + "epoch": 0.5682031851449777, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2767, + "step": 5005 + }, + { + "epoch": 0.5683167122548968, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2592, + "step": 5006 + }, + { + "epoch": 0.5684302393648158, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2521, + "step": 5007 + }, + { + "epoch": 0.5685437664747349, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2628, + "step": 5008 + }, + { + "epoch": 0.568657293584654, + "grad_norm": 0.234375, + "learning_rate": 0.002, + "loss": 5.2698, + "step": 5009 + }, + { + "epoch": 0.5687708206945731, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2457, + "step": 5010 + }, + { + "epoch": 0.5688843478044922, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.2523, + "step": 5011 + }, + { + "epoch": 0.5689978749144112, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2672, + "step": 5012 + }, + { + "epoch": 0.5691114020243303, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2653, + "step": 5013 + }, + { + "epoch": 0.5692249291342494, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2666, + "step": 5014 + }, + { + "epoch": 0.5693384562441685, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2546, + "step": 5015 + }, + { + "epoch": 0.5694519833540875, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2734, + "step": 5016 + }, + { + "epoch": 0.5695655104640066, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2583, + "step": 5017 + }, + { + "epoch": 0.5696790375739257, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2635, + "step": 5018 + }, + { + "epoch": 0.5697925646838448, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2713, + "step": 5019 + }, + { + "epoch": 0.5699060917937638, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.261, + "step": 5020 + }, + { + "epoch": 0.5700196189036829, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.252, + "step": 5021 + }, + { + "epoch": 0.570133146013602, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2588, + "step": 5022 + }, + { + "epoch": 0.5702466731235211, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2708, + "step": 5023 + }, + { + "epoch": 0.5703602002334401, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2664, + "step": 5024 + }, + { + "epoch": 0.5704737273433592, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2676, + "step": 5025 + }, + { + "epoch": 0.5705872544532783, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2706, + "step": 5026 + }, + { + "epoch": 0.5707007815631974, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2566, + "step": 5027 + }, + { + "epoch": 0.5708143086731164, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2639, + "step": 5028 + }, + { + "epoch": 0.5709278357830355, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2503, + "step": 5029 + }, + { + "epoch": 0.5710413628929546, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.2927, + "step": 5030 + }, + { + "epoch": 0.5711548900028737, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.2496, + "step": 5031 + }, + { + "epoch": 0.5712684171127927, + "grad_norm": 0.224609375, + "learning_rate": 0.002, + "loss": 5.2634, + "step": 5032 + }, + { + "epoch": 0.5713819442227118, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.267, + "step": 5033 + }, + { + "epoch": 0.5714954713326309, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2813, + "step": 5034 + }, + { + "epoch": 0.57160899844255, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2546, + "step": 5035 + }, + { + "epoch": 0.571722525552469, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2491, + "step": 5036 + }, + { + "epoch": 0.5718360526623881, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2434, + "step": 5037 + }, + { + "epoch": 0.5719495797723072, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2554, + "step": 5038 + }, + { + "epoch": 0.5720631068822263, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 5039 + }, + { + "epoch": 0.5721766339921454, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2456, + "step": 5040 + }, + { + "epoch": 0.5722901611020644, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2911, + "step": 5041 + }, + { + "epoch": 0.5724036882119835, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2565, + "step": 5042 + }, + { + "epoch": 0.5725172153219026, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2585, + "step": 5043 + }, + { + "epoch": 0.5726307424318217, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2553, + "step": 5044 + }, + { + "epoch": 0.5727442695417407, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2581, + "step": 5045 + }, + { + "epoch": 0.5728577966516598, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2628, + "step": 5046 + }, + { + "epoch": 0.5729713237615789, + "grad_norm": 0.423828125, + "learning_rate": 0.002, + "loss": 5.2466, + "step": 5047 + }, + { + "epoch": 0.573084850871498, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2531, + "step": 5048 + }, + { + "epoch": 0.573198377981417, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2689, + "step": 5049 + }, + { + "epoch": 0.5733119050913361, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2583, + "step": 5050 + }, + { + "epoch": 0.5734254322012552, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.254, + "step": 5051 + }, + { + "epoch": 0.5735389593111743, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2539, + "step": 5052 + }, + { + "epoch": 0.5736524864210933, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2861, + "step": 5053 + }, + { + "epoch": 0.5737660135310124, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2428, + "step": 5054 + }, + { + "epoch": 0.5738795406409315, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2471, + "step": 5055 + }, + { + "epoch": 0.5739930677508506, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2761, + "step": 5056 + }, + { + "epoch": 0.5741065948607696, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2671, + "step": 5057 + }, + { + "epoch": 0.5742201219706887, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2595, + "step": 5058 + }, + { + "epoch": 0.5743336490806078, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2505, + "step": 5059 + }, + { + "epoch": 0.5744471761905269, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2721, + "step": 5060 + }, + { + "epoch": 0.574560703300446, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2642, + "step": 5061 + }, + { + "epoch": 0.574674230410365, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2767, + "step": 5062 + }, + { + "epoch": 0.5747877575202841, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2783, + "step": 5063 + }, + { + "epoch": 0.5749012846302032, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2391, + "step": 5064 + }, + { + "epoch": 0.5750148117401223, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2576, + "step": 5065 + }, + { + "epoch": 0.5751283388500413, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2672, + "step": 5066 + }, + { + "epoch": 0.5752418659599604, + "grad_norm": 0.9296875, + "learning_rate": 0.002, + "loss": 5.277, + "step": 5067 + }, + { + "epoch": 0.5753553930698795, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2663, + "step": 5068 + }, + { + "epoch": 0.5754689201797986, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2694, + "step": 5069 + }, + { + "epoch": 0.5755824472897176, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2438, + "step": 5070 + }, + { + "epoch": 0.5756959743996367, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2638, + "step": 5071 + }, + { + "epoch": 0.5758095015095558, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2409, + "step": 5072 + }, + { + "epoch": 0.5759230286194749, + "grad_norm": 0.248046875, + "learning_rate": 0.002, + "loss": 5.245, + "step": 5073 + }, + { + "epoch": 0.5760365557293939, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.272, + "step": 5074 + }, + { + "epoch": 0.576150082839313, + "grad_norm": 0.2294921875, + "learning_rate": 0.002, + "loss": 5.2674, + "step": 5075 + }, + { + "epoch": 0.5762636099492321, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.261, + "step": 5076 + }, + { + "epoch": 0.5763771370591512, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.258, + "step": 5077 + }, + { + "epoch": 0.5764906641690702, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.2687, + "step": 5078 + }, + { + "epoch": 0.5766041912789893, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2592, + "step": 5079 + }, + { + "epoch": 0.5767177183889084, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2566, + "step": 5080 + }, + { + "epoch": 0.5768312454988275, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2544, + "step": 5081 + }, + { + "epoch": 0.5769447726087465, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2699, + "step": 5082 + }, + { + "epoch": 0.5770582997186656, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2321, + "step": 5083 + }, + { + "epoch": 0.5771718268285847, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2409, + "step": 5084 + }, + { + "epoch": 0.5772853539385038, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2617, + "step": 5085 + }, + { + "epoch": 0.5773988810484229, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2462, + "step": 5086 + }, + { + "epoch": 0.5775124081583419, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2618, + "step": 5087 + }, + { + "epoch": 0.577625935268261, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2622, + "step": 5088 + }, + { + "epoch": 0.5777394623781801, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2508, + "step": 5089 + }, + { + "epoch": 0.5778529894880992, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2617, + "step": 5090 + }, + { + "epoch": 0.5779665165980182, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2326, + "step": 5091 + }, + { + "epoch": 0.5780800437079373, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2541, + "step": 5092 + }, + { + "epoch": 0.5781935708178564, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2562, + "step": 5093 + }, + { + "epoch": 0.5783070979277755, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2614, + "step": 5094 + }, + { + "epoch": 0.5784206250376945, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2566, + "step": 5095 + }, + { + "epoch": 0.5785341521476136, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2463, + "step": 5096 + }, + { + "epoch": 0.5786476792575327, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2718, + "step": 5097 + }, + { + "epoch": 0.5787612063674518, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2538, + "step": 5098 + }, + { + "epoch": 0.5788747334773708, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2363, + "step": 5099 + }, + { + "epoch": 0.5789882605872899, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.2567, + "step": 5100 + }, + { + "epoch": 0.579101787697209, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2499, + "step": 5101 + }, + { + "epoch": 0.5792153148071281, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2622, + "step": 5102 + }, + { + "epoch": 0.5793288419170471, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2555, + "step": 5103 + }, + { + "epoch": 0.5794423690269662, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2627, + "step": 5104 + }, + { + "epoch": 0.5795558961368853, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2719, + "step": 5105 + }, + { + "epoch": 0.5796694232468044, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2705, + "step": 5106 + }, + { + "epoch": 0.5797829503567234, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2734, + "step": 5107 + }, + { + "epoch": 0.5798964774666425, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.2648, + "step": 5108 + }, + { + "epoch": 0.5800100045765616, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.2785, + "step": 5109 + }, + { + "epoch": 0.5801235316864807, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.2624, + "step": 5110 + }, + { + "epoch": 0.5802370587963998, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2666, + "step": 5111 + }, + { + "epoch": 0.5803505859063188, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 5112 + }, + { + "epoch": 0.5804641130162379, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2524, + "step": 5113 + }, + { + "epoch": 0.580577640126157, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2508, + "step": 5114 + }, + { + "epoch": 0.5806911672360761, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2689, + "step": 5115 + }, + { + "epoch": 0.5808046943459951, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2446, + "step": 5116 + }, + { + "epoch": 0.5809182214559142, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2664, + "step": 5117 + }, + { + "epoch": 0.5810317485658333, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.261, + "step": 5118 + }, + { + "epoch": 0.5811452756757524, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2615, + "step": 5119 + }, + { + "epoch": 0.5812588027856714, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2589, + "step": 5120 + }, + { + "epoch": 0.5813723298955905, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2528, + "step": 5121 + }, + { + "epoch": 0.5814858570055096, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2696, + "step": 5122 + }, + { + "epoch": 0.5815993841154287, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2502, + "step": 5123 + }, + { + "epoch": 0.5817129112253477, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2365, + "step": 5124 + }, + { + "epoch": 0.5818264383352668, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2585, + "step": 5125 + }, + { + "epoch": 0.5819399654451859, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2575, + "step": 5126 + }, + { + "epoch": 0.582053492555105, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.2662, + "step": 5127 + }, + { + "epoch": 0.582167019665024, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2464, + "step": 5128 + }, + { + "epoch": 0.5822805467749431, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2513, + "step": 5129 + }, + { + "epoch": 0.5823940738848622, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2852, + "step": 5130 + }, + { + "epoch": 0.5825076009947813, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2704, + "step": 5131 + }, + { + "epoch": 0.5826211281047003, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2662, + "step": 5132 + }, + { + "epoch": 0.5827346552146194, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2529, + "step": 5133 + }, + { + "epoch": 0.5828481823245385, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2883, + "step": 5134 + }, + { + "epoch": 0.5829617094344576, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2539, + "step": 5135 + }, + { + "epoch": 0.5830752365443767, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2347, + "step": 5136 + }, + { + "epoch": 0.5831887636542957, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2561, + "step": 5137 + }, + { + "epoch": 0.5833022907642148, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2385, + "step": 5138 + }, + { + "epoch": 0.5834158178741339, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.225, + "step": 5139 + }, + { + "epoch": 0.583529344984053, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2732, + "step": 5140 + }, + { + "epoch": 0.583642872093972, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.257, + "step": 5141 + }, + { + "epoch": 0.5837563992038911, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2609, + "step": 5142 + }, + { + "epoch": 0.5838699263138102, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2737, + "step": 5143 + }, + { + "epoch": 0.5839834534237293, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 5144 + }, + { + "epoch": 0.5840969805336483, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.2559, + "step": 5145 + }, + { + "epoch": 0.5842105076435674, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.2473, + "step": 5146 + }, + { + "epoch": 0.5843240347534865, + "grad_norm": 0.49609375, + "learning_rate": 0.002, + "loss": 5.2597, + "step": 5147 + }, + { + "epoch": 0.5844375618634056, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.2586, + "step": 5148 + }, + { + "epoch": 0.5845510889733246, + "grad_norm": 0.490234375, + "learning_rate": 0.002, + "loss": 5.2581, + "step": 5149 + }, + { + "epoch": 0.5846646160832437, + "grad_norm": 0.408203125, + "learning_rate": 0.002, + "loss": 5.245, + "step": 5150 + }, + { + "epoch": 0.5847781431931628, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.2721, + "step": 5151 + }, + { + "epoch": 0.5848916703030819, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2573, + "step": 5152 + }, + { + "epoch": 0.5850051974130009, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2539, + "step": 5153 + }, + { + "epoch": 0.58511872452292, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2722, + "step": 5154 + }, + { + "epoch": 0.5852322516328391, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2545, + "step": 5155 + }, + { + "epoch": 0.5853457787427582, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2622, + "step": 5156 + }, + { + "epoch": 0.5854593058526772, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2466, + "step": 5157 + }, + { + "epoch": 0.5855728329625963, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.257, + "step": 5158 + }, + { + "epoch": 0.5856863600725154, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2528, + "step": 5159 + }, + { + "epoch": 0.5857998871824345, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2651, + "step": 5160 + }, + { + "epoch": 0.5859134142923536, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2415, + "step": 5161 + }, + { + "epoch": 0.5860269414022726, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2645, + "step": 5162 + }, + { + "epoch": 0.5861404685121917, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2317, + "step": 5163 + }, + { + "epoch": 0.5862539956221108, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2435, + "step": 5164 + }, + { + "epoch": 0.5863675227320299, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2519, + "step": 5165 + }, + { + "epoch": 0.5864810498419489, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2518, + "step": 5166 + }, + { + "epoch": 0.586594576951868, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2673, + "step": 5167 + }, + { + "epoch": 0.5867081040617871, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2631, + "step": 5168 + }, + { + "epoch": 0.5868216311717062, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.271, + "step": 5169 + }, + { + "epoch": 0.5869351582816252, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2642, + "step": 5170 + }, + { + "epoch": 0.5870486853915443, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2781, + "step": 5171 + }, + { + "epoch": 0.5871622125014634, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2708, + "step": 5172 + }, + { + "epoch": 0.5872757396113825, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2594, + "step": 5173 + }, + { + "epoch": 0.5873892667213015, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2595, + "step": 5174 + }, + { + "epoch": 0.5875027938312206, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2582, + "step": 5175 + }, + { + "epoch": 0.5876163209411397, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.275, + "step": 5176 + }, + { + "epoch": 0.5877298480510588, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2654, + "step": 5177 + }, + { + "epoch": 0.5878433751609778, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.234, + "step": 5178 + }, + { + "epoch": 0.5879569022708969, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2704, + "step": 5179 + }, + { + "epoch": 0.588070429380816, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2387, + "step": 5180 + }, + { + "epoch": 0.5881839564907351, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2716, + "step": 5181 + }, + { + "epoch": 0.5882974836006541, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2482, + "step": 5182 + }, + { + "epoch": 0.5884110107105732, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2657, + "step": 5183 + }, + { + "epoch": 0.5885245378204923, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2374, + "step": 5184 + }, + { + "epoch": 0.5886380649304114, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2697, + "step": 5185 + }, + { + "epoch": 0.5887515920403305, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2309, + "step": 5186 + }, + { + "epoch": 0.5888651191502495, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2389, + "step": 5187 + }, + { + "epoch": 0.5889786462601686, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2659, + "step": 5188 + }, + { + "epoch": 0.5890921733700878, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2663, + "step": 5189 + }, + { + "epoch": 0.5892057004800069, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2498, + "step": 5190 + }, + { + "epoch": 0.5893192275899259, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2371, + "step": 5191 + }, + { + "epoch": 0.589432754699845, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.252, + "step": 5192 + }, + { + "epoch": 0.5895462818097641, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2727, + "step": 5193 + }, + { + "epoch": 0.5896598089196832, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2689, + "step": 5194 + }, + { + "epoch": 0.5897733360296022, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.26, + "step": 5195 + }, + { + "epoch": 0.5898868631395213, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2429, + "step": 5196 + }, + { + "epoch": 0.5900003902494404, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.2411, + "step": 5197 + }, + { + "epoch": 0.5901139173593595, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2626, + "step": 5198 + }, + { + "epoch": 0.5902274444692786, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2747, + "step": 5199 + }, + { + "epoch": 0.5903409715791976, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2296, + "step": 5200 + }, + { + "epoch": 0.5904544986891167, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2764, + "step": 5201 + }, + { + "epoch": 0.5905680257990358, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2299, + "step": 5202 + }, + { + "epoch": 0.5906815529089549, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2282, + "step": 5203 + }, + { + "epoch": 0.5907950800188739, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2665, + "step": 5204 + }, + { + "epoch": 0.590908607128793, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2421, + "step": 5205 + }, + { + "epoch": 0.5910221342387121, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.254, + "step": 5206 + }, + { + "epoch": 0.5911356613486312, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.262, + "step": 5207 + }, + { + "epoch": 0.5912491884585502, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2487, + "step": 5208 + }, + { + "epoch": 0.5913627155684693, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2488, + "step": 5209 + }, + { + "epoch": 0.5914762426783884, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2583, + "step": 5210 + }, + { + "epoch": 0.5915897697883075, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2607, + "step": 5211 + }, + { + "epoch": 0.5917032968982265, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2359, + "step": 5212 + }, + { + "epoch": 0.5918168240081456, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2655, + "step": 5213 + }, + { + "epoch": 0.5919303511180647, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2623, + "step": 5214 + }, + { + "epoch": 0.5920438782279838, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2612, + "step": 5215 + }, + { + "epoch": 0.5921574053379028, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.242, + "step": 5216 + }, + { + "epoch": 0.5922709324478219, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2404, + "step": 5217 + }, + { + "epoch": 0.592384459557741, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2644, + "step": 5218 + }, + { + "epoch": 0.5924979866676601, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2483, + "step": 5219 + }, + { + "epoch": 0.5926115137775791, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2536, + "step": 5220 + }, + { + "epoch": 0.5927250408874982, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2664, + "step": 5221 + }, + { + "epoch": 0.5928385679974173, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2746, + "step": 5222 + }, + { + "epoch": 0.5929520951073364, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 5223 + }, + { + "epoch": 0.5930656222172555, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.258, + "step": 5224 + }, + { + "epoch": 0.5931791493271745, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2746, + "step": 5225 + }, + { + "epoch": 0.5932926764370936, + "grad_norm": 0.228515625, + "learning_rate": 0.002, + "loss": 5.2684, + "step": 5226 + }, + { + "epoch": 0.5934062035470127, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.2521, + "step": 5227 + }, + { + "epoch": 0.5935197306569318, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.2577, + "step": 5228 + }, + { + "epoch": 0.5936332577668508, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.272, + "step": 5229 + }, + { + "epoch": 0.5937467848767699, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2604, + "step": 5230 + }, + { + "epoch": 0.593860311986689, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2616, + "step": 5231 + }, + { + "epoch": 0.5939738390966081, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2644, + "step": 5232 + }, + { + "epoch": 0.5940873662065271, + "grad_norm": 0.23046875, + "learning_rate": 0.002, + "loss": 5.2245, + "step": 5233 + }, + { + "epoch": 0.5942008933164462, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2512, + "step": 5234 + }, + { + "epoch": 0.5943144204263653, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2504, + "step": 5235 + }, + { + "epoch": 0.5944279475362844, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2473, + "step": 5236 + }, + { + "epoch": 0.5945414746462034, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.2347, + "step": 5237 + }, + { + "epoch": 0.5946550017561225, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.2628, + "step": 5238 + }, + { + "epoch": 0.5947685288660416, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.2607, + "step": 5239 + }, + { + "epoch": 0.5948820559759607, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.2365, + "step": 5240 + }, + { + "epoch": 0.5949955830858797, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2566, + "step": 5241 + }, + { + "epoch": 0.5951091101957988, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2744, + "step": 5242 + }, + { + "epoch": 0.5952226373057179, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2475, + "step": 5243 + }, + { + "epoch": 0.595336164415637, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2586, + "step": 5244 + }, + { + "epoch": 0.595449691525556, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2646, + "step": 5245 + }, + { + "epoch": 0.5955632186354751, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2392, + "step": 5246 + }, + { + "epoch": 0.5956767457453942, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2544, + "step": 5247 + }, + { + "epoch": 0.5957902728553133, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.259, + "step": 5248 + }, + { + "epoch": 0.5959037999652324, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2751, + "step": 5249 + }, + { + "epoch": 0.5960173270751514, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2451, + "step": 5250 + }, + { + "epoch": 0.5961308541850705, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2434, + "step": 5251 + }, + { + "epoch": 0.5962443812949896, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2707, + "step": 5252 + }, + { + "epoch": 0.5963579084049087, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2563, + "step": 5253 + }, + { + "epoch": 0.5964714355148277, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2741, + "step": 5254 + }, + { + "epoch": 0.5965849626247468, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2614, + "step": 5255 + }, + { + "epoch": 0.5966984897346659, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2671, + "step": 5256 + }, + { + "epoch": 0.596812016844585, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2739, + "step": 5257 + }, + { + "epoch": 0.596925543954504, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.235, + "step": 5258 + }, + { + "epoch": 0.5970390710644231, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2428, + "step": 5259 + }, + { + "epoch": 0.5971525981743422, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2545, + "step": 5260 + }, + { + "epoch": 0.5972661252842613, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2599, + "step": 5261 + }, + { + "epoch": 0.5973796523941803, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2645, + "step": 5262 + }, + { + "epoch": 0.5974931795040994, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2762, + "step": 5263 + }, + { + "epoch": 0.5976067066140185, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2699, + "step": 5264 + }, + { + "epoch": 0.5977202337239376, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2585, + "step": 5265 + }, + { + "epoch": 0.5978337608338566, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2605, + "step": 5266 + }, + { + "epoch": 0.5979472879437757, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2698, + "step": 5267 + }, + { + "epoch": 0.5980608150536948, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.247, + "step": 5268 + }, + { + "epoch": 0.5981743421636139, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2558, + "step": 5269 + }, + { + "epoch": 0.598287869273533, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2721, + "step": 5270 + }, + { + "epoch": 0.598401396383452, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2664, + "step": 5271 + }, + { + "epoch": 0.5985149234933711, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2497, + "step": 5272 + }, + { + "epoch": 0.5986284506032902, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2392, + "step": 5273 + }, + { + "epoch": 0.5987419777132093, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2861, + "step": 5274 + }, + { + "epoch": 0.5988555048231283, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2516, + "step": 5275 + }, + { + "epoch": 0.5989690319330474, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2648, + "step": 5276 + }, + { + "epoch": 0.5990825590429665, + "grad_norm": 0.2333984375, + "learning_rate": 0.002, + "loss": 5.2501, + "step": 5277 + }, + { + "epoch": 0.5991960861528856, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.2558, + "step": 5278 + }, + { + "epoch": 0.5993096132628046, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.2599, + "step": 5279 + }, + { + "epoch": 0.5994231403727237, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2443, + "step": 5280 + }, + { + "epoch": 0.5995366674826428, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2477, + "step": 5281 + }, + { + "epoch": 0.5996501945925619, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2688, + "step": 5282 + }, + { + "epoch": 0.5997637217024809, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.2672, + "step": 5283 + }, + { + "epoch": 0.5998772488124, + "grad_norm": 0.455078125, + "learning_rate": 0.002, + "loss": 5.2474, + "step": 5284 + }, + { + "epoch": 0.5999907759223191, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.2648, + "step": 5285 + }, + { + "epoch": 0.6001043030322382, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.2432, + "step": 5286 + }, + { + "epoch": 0.6002178301421572, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.267, + "step": 5287 + }, + { + "epoch": 0.6003313572520763, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2419, + "step": 5288 + }, + { + "epoch": 0.6004448843619954, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2454, + "step": 5289 + }, + { + "epoch": 0.6005584114719145, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2602, + "step": 5290 + }, + { + "epoch": 0.6006719385818335, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2598, + "step": 5291 + }, + { + "epoch": 0.6007854656917526, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2727, + "step": 5292 + }, + { + "epoch": 0.6008989928016717, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2549, + "step": 5293 + }, + { + "epoch": 0.6010125199115908, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2785, + "step": 5294 + }, + { + "epoch": 0.6011260470215098, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2637, + "step": 5295 + }, + { + "epoch": 0.6012395741314289, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2356, + "step": 5296 + }, + { + "epoch": 0.601353101241348, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2638, + "step": 5297 + }, + { + "epoch": 0.6014666283512671, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2754, + "step": 5298 + }, + { + "epoch": 0.6015801554611862, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2486, + "step": 5299 + }, + { + "epoch": 0.6016936825711052, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2396, + "step": 5300 + }, + { + "epoch": 0.6018072096810243, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2538, + "step": 5301 + }, + { + "epoch": 0.6019207367909434, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.271, + "step": 5302 + }, + { + "epoch": 0.6020342639008625, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2687, + "step": 5303 + }, + { + "epoch": 0.6021477910107815, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2515, + "step": 5304 + }, + { + "epoch": 0.6022613181207006, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.2502, + "step": 5305 + }, + { + "epoch": 0.6023748452306197, + "grad_norm": 0.232421875, + "learning_rate": 0.002, + "loss": 5.2708, + "step": 5306 + }, + { + "epoch": 0.6024883723405388, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.2513, + "step": 5307 + }, + { + "epoch": 0.6026018994504578, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2462, + "step": 5308 + }, + { + "epoch": 0.6027154265603769, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.26, + "step": 5309 + }, + { + "epoch": 0.602828953670296, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2584, + "step": 5310 + }, + { + "epoch": 0.6029424807802151, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2566, + "step": 5311 + }, + { + "epoch": 0.6030560078901341, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.273, + "step": 5312 + }, + { + "epoch": 0.6031695350000532, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2666, + "step": 5313 + }, + { + "epoch": 0.6032830621099723, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2355, + "step": 5314 + }, + { + "epoch": 0.6033965892198914, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2795, + "step": 5315 + }, + { + "epoch": 0.6035101163298104, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2415, + "step": 5316 + }, + { + "epoch": 0.6036236434397295, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2623, + "step": 5317 + }, + { + "epoch": 0.6037371705496486, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.244, + "step": 5318 + }, + { + "epoch": 0.6038506976595677, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2521, + "step": 5319 + }, + { + "epoch": 0.6039642247694867, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2543, + "step": 5320 + }, + { + "epoch": 0.6040777518794058, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.264, + "step": 5321 + }, + { + "epoch": 0.6041912789893249, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2584, + "step": 5322 + }, + { + "epoch": 0.604304806099244, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2482, + "step": 5323 + }, + { + "epoch": 0.604418333209163, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2522, + "step": 5324 + }, + { + "epoch": 0.6045318603190821, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.242, + "step": 5325 + }, + { + "epoch": 0.6046453874290012, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2637, + "step": 5326 + }, + { + "epoch": 0.6047589145389203, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2611, + "step": 5327 + }, + { + "epoch": 0.6048724416488394, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2394, + "step": 5328 + }, + { + "epoch": 0.6049859687587584, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.2666, + "step": 5329 + }, + { + "epoch": 0.6050994958686775, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2464, + "step": 5330 + }, + { + "epoch": 0.6052130229785966, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2435, + "step": 5331 + }, + { + "epoch": 0.6053265500885157, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2676, + "step": 5332 + }, + { + "epoch": 0.6054400771984347, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2704, + "step": 5333 + }, + { + "epoch": 0.6055536043083538, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2548, + "step": 5334 + }, + { + "epoch": 0.6056671314182729, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2484, + "step": 5335 + }, + { + "epoch": 0.605780658528192, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2498, + "step": 5336 + }, + { + "epoch": 0.605894185638111, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2606, + "step": 5337 + }, + { + "epoch": 0.6060077127480301, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.242, + "step": 5338 + }, + { + "epoch": 0.6061212398579492, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2569, + "step": 5339 + }, + { + "epoch": 0.6062347669678683, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 5340 + }, + { + "epoch": 0.6063482940777873, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2459, + "step": 5341 + }, + { + "epoch": 0.6064618211877064, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.251, + "step": 5342 + }, + { + "epoch": 0.6065753482976255, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.245, + "step": 5343 + }, + { + "epoch": 0.6066888754075446, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2367, + "step": 5344 + }, + { + "epoch": 0.6068024025174636, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.27, + "step": 5345 + }, + { + "epoch": 0.6069159296273827, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2629, + "step": 5346 + }, + { + "epoch": 0.6070294567373018, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2746, + "step": 5347 + }, + { + "epoch": 0.6071429838472209, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.233, + "step": 5348 + }, + { + "epoch": 0.60725651095714, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2547, + "step": 5349 + }, + { + "epoch": 0.607370038067059, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2669, + "step": 5350 + }, + { + "epoch": 0.6074835651769781, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2702, + "step": 5351 + }, + { + "epoch": 0.6075970922868972, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2626, + "step": 5352 + }, + { + "epoch": 0.6077106193968163, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2409, + "step": 5353 + }, + { + "epoch": 0.6078241465067353, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2521, + "step": 5354 + }, + { + "epoch": 0.6079376736166544, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2363, + "step": 5355 + }, + { + "epoch": 0.6080512007265735, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 5356 + }, + { + "epoch": 0.6081647278364926, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2609, + "step": 5357 + }, + { + "epoch": 0.6082782549464116, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2474, + "step": 5358 + }, + { + "epoch": 0.6083917820563307, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.232, + "step": 5359 + }, + { + "epoch": 0.6085053091662498, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2526, + "step": 5360 + }, + { + "epoch": 0.6086188362761689, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2606, + "step": 5361 + }, + { + "epoch": 0.6087323633860879, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2512, + "step": 5362 + }, + { + "epoch": 0.608845890496007, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2506, + "step": 5363 + }, + { + "epoch": 0.6089594176059261, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2475, + "step": 5364 + }, + { + "epoch": 0.6090729447158452, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2536, + "step": 5365 + }, + { + "epoch": 0.6091864718257642, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2398, + "step": 5366 + }, + { + "epoch": 0.6092999989356833, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2631, + "step": 5367 + }, + { + "epoch": 0.6094135260456024, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2641, + "step": 5368 + }, + { + "epoch": 0.6095270531555215, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2621, + "step": 5369 + }, + { + "epoch": 0.6096405802654405, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2609, + "step": 5370 + }, + { + "epoch": 0.6097541073753596, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2591, + "step": 5371 + }, + { + "epoch": 0.6098676344852787, + "grad_norm": 0.23046875, + "learning_rate": 0.002, + "loss": 5.2626, + "step": 5372 + }, + { + "epoch": 0.6099811615951978, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.2579, + "step": 5373 + }, + { + "epoch": 0.6100946887051168, + "grad_norm": 0.2421875, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 5374 + }, + { + "epoch": 0.6102082158150359, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2537, + "step": 5375 + }, + { + "epoch": 0.610321742924955, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2357, + "step": 5376 + }, + { + "epoch": 0.6104352700348741, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2715, + "step": 5377 + }, + { + "epoch": 0.6105487971447932, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2627, + "step": 5378 + }, + { + "epoch": 0.6106623242547122, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2396, + "step": 5379 + }, + { + "epoch": 0.6107758513646313, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2629, + "step": 5380 + }, + { + "epoch": 0.6108893784745504, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2644, + "step": 5381 + }, + { + "epoch": 0.6110029055844695, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2613, + "step": 5382 + }, + { + "epoch": 0.6111164326943885, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.292, + "step": 5383 + }, + { + "epoch": 0.6112299598043076, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2515, + "step": 5384 + }, + { + "epoch": 0.6113434869142267, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2357, + "step": 5385 + }, + { + "epoch": 0.6114570140241458, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2526, + "step": 5386 + }, + { + "epoch": 0.6115705411340648, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2487, + "step": 5387 + }, + { + "epoch": 0.6116840682439839, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2498, + "step": 5388 + }, + { + "epoch": 0.611797595353903, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2431, + "step": 5389 + }, + { + "epoch": 0.6119111224638221, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2743, + "step": 5390 + }, + { + "epoch": 0.6120246495737411, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 5391 + }, + { + "epoch": 0.6121381766836602, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2494, + "step": 5392 + }, + { + "epoch": 0.6122517037935793, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2495, + "step": 5393 + }, + { + "epoch": 0.6123652309034984, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2873, + "step": 5394 + }, + { + "epoch": 0.6124787580134174, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.246, + "step": 5395 + }, + { + "epoch": 0.6125922851233365, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2416, + "step": 5396 + }, + { + "epoch": 0.6127058122332556, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2757, + "step": 5397 + }, + { + "epoch": 0.6128193393431747, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2772, + "step": 5398 + }, + { + "epoch": 0.6129328664530937, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2584, + "step": 5399 + }, + { + "epoch": 0.6130463935630128, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2593, + "step": 5400 + }, + { + "epoch": 0.6131599206729319, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2691, + "step": 5401 + }, + { + "epoch": 0.613273447782851, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.2611, + "step": 5402 + }, + { + "epoch": 0.61338697489277, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.2766, + "step": 5403 + }, + { + "epoch": 0.6135005020026891, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2555, + "step": 5404 + }, + { + "epoch": 0.6136140291126082, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.2561, + "step": 5405 + }, + { + "epoch": 0.6137275562225273, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2548, + "step": 5406 + }, + { + "epoch": 0.6138410833324464, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2744, + "step": 5407 + }, + { + "epoch": 0.6139546104423654, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2686, + "step": 5408 + }, + { + "epoch": 0.6140681375522845, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2413, + "step": 5409 + }, + { + "epoch": 0.6141816646622036, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2812, + "step": 5410 + }, + { + "epoch": 0.6142951917721227, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2492, + "step": 5411 + }, + { + "epoch": 0.6144087188820417, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2491, + "step": 5412 + }, + { + "epoch": 0.6145222459919608, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2778, + "step": 5413 + }, + { + "epoch": 0.6146357731018799, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2514, + "step": 5414 + }, + { + "epoch": 0.614749300211799, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2692, + "step": 5415 + }, + { + "epoch": 0.614862827321718, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2439, + "step": 5416 + }, + { + "epoch": 0.6149763544316371, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2646, + "step": 5417 + }, + { + "epoch": 0.6150898815415562, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2623, + "step": 5418 + }, + { + "epoch": 0.6152034086514753, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2853, + "step": 5419 + }, + { + "epoch": 0.6153169357613943, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2627, + "step": 5420 + }, + { + "epoch": 0.6154304628713134, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 5421 + }, + { + "epoch": 0.6155439899812325, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2636, + "step": 5422 + }, + { + "epoch": 0.6156575170911516, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2643, + "step": 5423 + }, + { + "epoch": 0.6157710442010706, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2554, + "step": 5424 + }, + { + "epoch": 0.6158845713109897, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2655, + "step": 5425 + }, + { + "epoch": 0.6159980984209088, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2571, + "step": 5426 + }, + { + "epoch": 0.6161116255308279, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2751, + "step": 5427 + }, + { + "epoch": 0.616225152640747, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2598, + "step": 5428 + }, + { + "epoch": 0.616338679750666, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2603, + "step": 5429 + }, + { + "epoch": 0.6164522068605852, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2542, + "step": 5430 + }, + { + "epoch": 0.6165657339705043, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2505, + "step": 5431 + }, + { + "epoch": 0.6166792610804234, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2404, + "step": 5432 + }, + { + "epoch": 0.6167927881903424, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2635, + "step": 5433 + }, + { + "epoch": 0.6169063153002615, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2748, + "step": 5434 + }, + { + "epoch": 0.6170198424101806, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2609, + "step": 5435 + }, + { + "epoch": 0.6171333695200997, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2477, + "step": 5436 + }, + { + "epoch": 0.6172468966300187, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2531, + "step": 5437 + }, + { + "epoch": 0.6173604237399378, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2567, + "step": 5438 + }, + { + "epoch": 0.6174739508498569, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2669, + "step": 5439 + }, + { + "epoch": 0.617587477959776, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2463, + "step": 5440 + }, + { + "epoch": 0.617701005069695, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2529, + "step": 5441 + }, + { + "epoch": 0.6178145321796141, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2395, + "step": 5442 + }, + { + "epoch": 0.6179280592895332, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2444, + "step": 5443 + }, + { + "epoch": 0.6180415863994523, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2543, + "step": 5444 + }, + { + "epoch": 0.6181551135093714, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 5445 + }, + { + "epoch": 0.6182686406192904, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.25, + "step": 5446 + }, + { + "epoch": 0.6183821677292095, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2624, + "step": 5447 + }, + { + "epoch": 0.6184956948391286, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2647, + "step": 5448 + }, + { + "epoch": 0.6186092219490477, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2596, + "step": 5449 + }, + { + "epoch": 0.6187227490589667, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2409, + "step": 5450 + }, + { + "epoch": 0.6188362761688858, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2474, + "step": 5451 + }, + { + "epoch": 0.6189498032788049, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2943, + "step": 5452 + }, + { + "epoch": 0.619063330388724, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.268, + "step": 5453 + }, + { + "epoch": 0.619176857498643, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2343, + "step": 5454 + }, + { + "epoch": 0.6192903846085621, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.244, + "step": 5455 + }, + { + "epoch": 0.6194039117184812, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2566, + "step": 5456 + }, + { + "epoch": 0.6195174388284003, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2622, + "step": 5457 + }, + { + "epoch": 0.6196309659383193, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2345, + "step": 5458 + }, + { + "epoch": 0.6197444930482384, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.225, + "step": 5459 + }, + { + "epoch": 0.6198580201581575, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2564, + "step": 5460 + }, + { + "epoch": 0.6199715472680766, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2531, + "step": 5461 + }, + { + "epoch": 0.6200850743779956, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 5462 + }, + { + "epoch": 0.6201986014879147, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2716, + "step": 5463 + }, + { + "epoch": 0.6203121285978338, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2576, + "step": 5464 + }, + { + "epoch": 0.6204256557077529, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2286, + "step": 5465 + }, + { + "epoch": 0.620539182817672, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2555, + "step": 5466 + }, + { + "epoch": 0.620652709927591, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2641, + "step": 5467 + }, + { + "epoch": 0.6207662370375101, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2656, + "step": 5468 + }, + { + "epoch": 0.6208797641474292, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2712, + "step": 5469 + }, + { + "epoch": 0.6209932912573483, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2571, + "step": 5470 + }, + { + "epoch": 0.6211068183672673, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.267, + "step": 5471 + }, + { + "epoch": 0.6212203454771864, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2469, + "step": 5472 + }, + { + "epoch": 0.6213338725871055, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2388, + "step": 5473 + }, + { + "epoch": 0.6214473996970246, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2681, + "step": 5474 + }, + { + "epoch": 0.6215609268069436, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2476, + "step": 5475 + }, + { + "epoch": 0.6216744539168627, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2382, + "step": 5476 + }, + { + "epoch": 0.6217879810267818, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2475, + "step": 5477 + }, + { + "epoch": 0.6219015081367009, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2408, + "step": 5478 + }, + { + "epoch": 0.6220150352466199, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2556, + "step": 5479 + }, + { + "epoch": 0.622128562356539, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2779, + "step": 5480 + }, + { + "epoch": 0.6222420894664581, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2565, + "step": 5481 + }, + { + "epoch": 0.6223556165763772, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2546, + "step": 5482 + }, + { + "epoch": 0.6224691436862962, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2408, + "step": 5483 + }, + { + "epoch": 0.6225826707962153, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2502, + "step": 5484 + }, + { + "epoch": 0.6226961979061344, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2628, + "step": 5485 + }, + { + "epoch": 0.6228097250160535, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2394, + "step": 5486 + }, + { + "epoch": 0.6229232521259725, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2688, + "step": 5487 + }, + { + "epoch": 0.6230367792358916, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2547, + "step": 5488 + }, + { + "epoch": 0.6231503063458107, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2511, + "step": 5489 + }, + { + "epoch": 0.6232638334557298, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2576, + "step": 5490 + }, + { + "epoch": 0.6233773605656489, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2314, + "step": 5491 + }, + { + "epoch": 0.6234908876755679, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.267, + "step": 5492 + }, + { + "epoch": 0.623604414785487, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.253, + "step": 5493 + }, + { + "epoch": 0.6237179418954061, + "grad_norm": 0.400390625, + "learning_rate": 0.002, + "loss": 5.25, + "step": 5494 + }, + { + "epoch": 0.6238314690053252, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.2679, + "step": 5495 + }, + { + "epoch": 0.6239449961152442, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2804, + "step": 5496 + }, + { + "epoch": 0.6240585232251633, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2516, + "step": 5497 + }, + { + "epoch": 0.6241720503350824, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.236, + "step": 5498 + }, + { + "epoch": 0.6242855774450015, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2427, + "step": 5499 + }, + { + "epoch": 0.6243991045549205, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2578, + "step": 5500 + }, + { + "epoch": 0.6245126316648396, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2594, + "step": 5501 + }, + { + "epoch": 0.6246261587747587, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2399, + "step": 5502 + }, + { + "epoch": 0.6247396858846778, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2344, + "step": 5503 + }, + { + "epoch": 0.6248532129945968, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2543, + "step": 5504 + }, + { + "epoch": 0.6249667401045159, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2599, + "step": 5505 + }, + { + "epoch": 0.625080267214435, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2656, + "step": 5506 + }, + { + "epoch": 0.6251937943243541, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2475, + "step": 5507 + }, + { + "epoch": 0.6253073214342731, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2548, + "step": 5508 + }, + { + "epoch": 0.6254208485441922, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2381, + "step": 5509 + }, + { + "epoch": 0.6255343756541113, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2639, + "step": 5510 + }, + { + "epoch": 0.6256479027640304, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2541, + "step": 5511 + }, + { + "epoch": 0.6257614298739494, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2394, + "step": 5512 + }, + { + "epoch": 0.6258749569838685, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.251, + "step": 5513 + }, + { + "epoch": 0.6259884840937876, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2513, + "step": 5514 + }, + { + "epoch": 0.6261020112037067, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2463, + "step": 5515 + }, + { + "epoch": 0.6262155383136258, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2692, + "step": 5516 + }, + { + "epoch": 0.6263290654235448, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2495, + "step": 5517 + }, + { + "epoch": 0.6264425925334639, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2588, + "step": 5518 + }, + { + "epoch": 0.626556119643383, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2438, + "step": 5519 + }, + { + "epoch": 0.6266696467533021, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2592, + "step": 5520 + }, + { + "epoch": 0.6267831738632211, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2738, + "step": 5521 + }, + { + "epoch": 0.6268967009731402, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2529, + "step": 5522 + }, + { + "epoch": 0.6270102280830593, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2675, + "step": 5523 + }, + { + "epoch": 0.6271237551929784, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2409, + "step": 5524 + }, + { + "epoch": 0.6272372823028974, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2557, + "step": 5525 + }, + { + "epoch": 0.6273508094128165, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2528, + "step": 5526 + }, + { + "epoch": 0.6274643365227356, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2369, + "step": 5527 + }, + { + "epoch": 0.6275778636326547, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2477, + "step": 5528 + }, + { + "epoch": 0.6276913907425737, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2557, + "step": 5529 + }, + { + "epoch": 0.6278049178524928, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2609, + "step": 5530 + }, + { + "epoch": 0.6279184449624119, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2338, + "step": 5531 + }, + { + "epoch": 0.628031972072331, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2351, + "step": 5532 + }, + { + "epoch": 0.62814549918225, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.245, + "step": 5533 + }, + { + "epoch": 0.6282590262921691, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2501, + "step": 5534 + }, + { + "epoch": 0.6283725534020882, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.26, + "step": 5535 + }, + { + "epoch": 0.6284860805120073, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2596, + "step": 5536 + }, + { + "epoch": 0.6285996076219263, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2456, + "step": 5537 + }, + { + "epoch": 0.6287131347318454, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2576, + "step": 5538 + }, + { + "epoch": 0.6288266618417645, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2544, + "step": 5539 + }, + { + "epoch": 0.6289401889516836, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.252, + "step": 5540 + }, + { + "epoch": 0.6290537160616027, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2464, + "step": 5541 + }, + { + "epoch": 0.6291672431715217, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2549, + "step": 5542 + }, + { + "epoch": 0.6292807702814408, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.2724, + "step": 5543 + }, + { + "epoch": 0.6293942973913599, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2313, + "step": 5544 + }, + { + "epoch": 0.629507824501279, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2623, + "step": 5545 + }, + { + "epoch": 0.629621351611198, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2549, + "step": 5546 + }, + { + "epoch": 0.6297348787211171, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2753, + "step": 5547 + }, + { + "epoch": 0.6298484058310362, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2642, + "step": 5548 + }, + { + "epoch": 0.6299619329409553, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2473, + "step": 5549 + }, + { + "epoch": 0.6300754600508743, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2174, + "step": 5550 + }, + { + "epoch": 0.6301889871607934, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2524, + "step": 5551 + }, + { + "epoch": 0.6303025142707125, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2636, + "step": 5552 + }, + { + "epoch": 0.6304160413806316, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2374, + "step": 5553 + }, + { + "epoch": 0.6305295684905506, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.2524, + "step": 5554 + }, + { + "epoch": 0.6306430956004697, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.249, + "step": 5555 + }, + { + "epoch": 0.6307566227103888, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2418, + "step": 5556 + }, + { + "epoch": 0.6308701498203079, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2524, + "step": 5557 + }, + { + "epoch": 0.6309836769302269, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2499, + "step": 5558 + }, + { + "epoch": 0.631097204040146, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2548, + "step": 5559 + }, + { + "epoch": 0.6312107311500651, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2683, + "step": 5560 + }, + { + "epoch": 0.6313242582599842, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2556, + "step": 5561 + }, + { + "epoch": 0.6314377853699032, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2494, + "step": 5562 + }, + { + "epoch": 0.6315513124798223, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2292, + "step": 5563 + }, + { + "epoch": 0.6316648395897414, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2361, + "step": 5564 + }, + { + "epoch": 0.6317783666996605, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2515, + "step": 5565 + }, + { + "epoch": 0.6318918938095796, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.267, + "step": 5566 + }, + { + "epoch": 0.6320054209194986, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.254, + "step": 5567 + }, + { + "epoch": 0.6321189480294177, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2403, + "step": 5568 + }, + { + "epoch": 0.6322324751393368, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2366, + "step": 5569 + }, + { + "epoch": 0.6323460022492559, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2506, + "step": 5570 + }, + { + "epoch": 0.6324595293591749, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.26, + "step": 5571 + }, + { + "epoch": 0.632573056469094, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2518, + "step": 5572 + }, + { + "epoch": 0.6326865835790131, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2383, + "step": 5573 + }, + { + "epoch": 0.6328001106889322, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2505, + "step": 5574 + }, + { + "epoch": 0.6329136377988512, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2609, + "step": 5575 + }, + { + "epoch": 0.6330271649087703, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.235, + "step": 5576 + }, + { + "epoch": 0.6331406920186894, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2423, + "step": 5577 + }, + { + "epoch": 0.6332542191286085, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.2496, + "step": 5578 + }, + { + "epoch": 0.6333677462385275, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.2495, + "step": 5579 + }, + { + "epoch": 0.6334812733484466, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.2522, + "step": 5580 + }, + { + "epoch": 0.6335948004583657, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2457, + "step": 5581 + }, + { + "epoch": 0.6337083275682848, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2686, + "step": 5582 + }, + { + "epoch": 0.6338218546782038, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2627, + "step": 5583 + }, + { + "epoch": 0.6339353817881229, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2568, + "step": 5584 + }, + { + "epoch": 0.634048908898042, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2353, + "step": 5585 + }, + { + "epoch": 0.6341624360079611, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2608, + "step": 5586 + }, + { + "epoch": 0.6342759631178801, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.242, + "step": 5587 + }, + { + "epoch": 0.6343894902277992, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2517, + "step": 5588 + }, + { + "epoch": 0.6345030173377183, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2244, + "step": 5589 + }, + { + "epoch": 0.6346165444476374, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2546, + "step": 5590 + }, + { + "epoch": 0.6347300715575565, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2615, + "step": 5591 + }, + { + "epoch": 0.6348435986674755, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2606, + "step": 5592 + }, + { + "epoch": 0.6349571257773946, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2555, + "step": 5593 + }, + { + "epoch": 0.6350706528873137, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2448, + "step": 5594 + }, + { + "epoch": 0.6351841799972328, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2369, + "step": 5595 + }, + { + "epoch": 0.6352977071071518, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.223, + "step": 5596 + }, + { + "epoch": 0.6354112342170709, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.2474, + "step": 5597 + }, + { + "epoch": 0.63552476132699, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.2539, + "step": 5598 + }, + { + "epoch": 0.6356382884369091, + "grad_norm": 0.248046875, + "learning_rate": 0.002, + "loss": 5.2579, + "step": 5599 + }, + { + "epoch": 0.6357518155468281, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2614, + "step": 5600 + }, + { + "epoch": 0.6358653426567472, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.25, + "step": 5601 + }, + { + "epoch": 0.6359788697666663, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2399, + "step": 5602 + }, + { + "epoch": 0.6360923968765854, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2465, + "step": 5603 + }, + { + "epoch": 0.6362059239865044, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2382, + "step": 5604 + }, + { + "epoch": 0.6363194510964235, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2258, + "step": 5605 + }, + { + "epoch": 0.6364329782063426, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2418, + "step": 5606 + }, + { + "epoch": 0.6365465053162617, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2521, + "step": 5607 + }, + { + "epoch": 0.6366600324261807, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2518, + "step": 5608 + }, + { + "epoch": 0.6367735595360998, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2547, + "step": 5609 + }, + { + "epoch": 0.6368870866460189, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2609, + "step": 5610 + }, + { + "epoch": 0.637000613755938, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2567, + "step": 5611 + }, + { + "epoch": 0.637114140865857, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2522, + "step": 5612 + }, + { + "epoch": 0.6372276679757761, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2504, + "step": 5613 + }, + { + "epoch": 0.6373411950856952, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2562, + "step": 5614 + }, + { + "epoch": 0.6374547221956143, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2664, + "step": 5615 + }, + { + "epoch": 0.6375682493055334, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2328, + "step": 5616 + }, + { + "epoch": 0.6376817764154524, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2663, + "step": 5617 + }, + { + "epoch": 0.6377953035253715, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2565, + "step": 5618 + }, + { + "epoch": 0.6379088306352906, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2438, + "step": 5619 + }, + { + "epoch": 0.6380223577452097, + "grad_norm": 0.232421875, + "learning_rate": 0.002, + "loss": 5.235, + "step": 5620 + }, + { + "epoch": 0.6381358848551287, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.2359, + "step": 5621 + }, + { + "epoch": 0.6382494119650478, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2668, + "step": 5622 + }, + { + "epoch": 0.6383629390749669, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2582, + "step": 5623 + }, + { + "epoch": 0.638476466184886, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.265, + "step": 5624 + }, + { + "epoch": 0.638589993294805, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2511, + "step": 5625 + }, + { + "epoch": 0.6387035204047241, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.222, + "step": 5626 + }, + { + "epoch": 0.6388170475146432, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2548, + "step": 5627 + }, + { + "epoch": 0.6389305746245623, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2438, + "step": 5628 + }, + { + "epoch": 0.6390441017344813, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2332, + "step": 5629 + }, + { + "epoch": 0.6391576288444004, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2459, + "step": 5630 + }, + { + "epoch": 0.6392711559543195, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2318, + "step": 5631 + }, + { + "epoch": 0.6393846830642386, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2634, + "step": 5632 + }, + { + "epoch": 0.6394982101741576, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2417, + "step": 5633 + }, + { + "epoch": 0.6396117372840767, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2905, + "step": 5634 + }, + { + "epoch": 0.6397252643939958, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2594, + "step": 5635 + }, + { + "epoch": 0.6398387915039149, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2458, + "step": 5636 + }, + { + "epoch": 0.639952318613834, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2654, + "step": 5637 + }, + { + "epoch": 0.640065845723753, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 5638 + }, + { + "epoch": 0.6401793728336721, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2461, + "step": 5639 + }, + { + "epoch": 0.6402928999435912, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.255, + "step": 5640 + }, + { + "epoch": 0.6404064270535103, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.2404, + "step": 5641 + }, + { + "epoch": 0.6405199541634293, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.267, + "step": 5642 + }, + { + "epoch": 0.6406334812733484, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.2537, + "step": 5643 + }, + { + "epoch": 0.6407470083832675, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2537, + "step": 5644 + }, + { + "epoch": 0.6408605354931866, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.2737, + "step": 5645 + }, + { + "epoch": 0.6409740626031056, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2442, + "step": 5646 + }, + { + "epoch": 0.6410875897130247, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.243, + "step": 5647 + }, + { + "epoch": 0.6412011168229438, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2641, + "step": 5648 + }, + { + "epoch": 0.6413146439328629, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2508, + "step": 5649 + }, + { + "epoch": 0.6414281710427819, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2665, + "step": 5650 + }, + { + "epoch": 0.641541698152701, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2499, + "step": 5651 + }, + { + "epoch": 0.6416552252626201, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2418, + "step": 5652 + }, + { + "epoch": 0.6417687523725392, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2427, + "step": 5653 + }, + { + "epoch": 0.6418822794824582, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2678, + "step": 5654 + }, + { + "epoch": 0.6419958065923773, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2542, + "step": 5655 + }, + { + "epoch": 0.6421093337022964, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2591, + "step": 5656 + }, + { + "epoch": 0.6422228608122155, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2515, + "step": 5657 + }, + { + "epoch": 0.6423363879221345, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2469, + "step": 5658 + }, + { + "epoch": 0.6424499150320536, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 5659 + }, + { + "epoch": 0.6425634421419727, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2342, + "step": 5660 + }, + { + "epoch": 0.6426769692518918, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 5661 + }, + { + "epoch": 0.6427904963618108, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2626, + "step": 5662 + }, + { + "epoch": 0.6429040234717299, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2479, + "step": 5663 + }, + { + "epoch": 0.643017550581649, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2365, + "step": 5664 + }, + { + "epoch": 0.6431310776915681, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2519, + "step": 5665 + }, + { + "epoch": 0.6432446048014872, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2558, + "step": 5666 + }, + { + "epoch": 0.6433581319114062, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2557, + "step": 5667 + }, + { + "epoch": 0.6434716590213253, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2617, + "step": 5668 + }, + { + "epoch": 0.6435851861312444, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2617, + "step": 5669 + }, + { + "epoch": 0.6436987132411635, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2603, + "step": 5670 + }, + { + "epoch": 0.6438122403510825, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2355, + "step": 5671 + }, + { + "epoch": 0.6439257674610017, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2527, + "step": 5672 + }, + { + "epoch": 0.6440392945709208, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2576, + "step": 5673 + }, + { + "epoch": 0.6441528216808399, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.229, + "step": 5674 + }, + { + "epoch": 0.644266348790759, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2531, + "step": 5675 + }, + { + "epoch": 0.644379875900678, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2536, + "step": 5676 + }, + { + "epoch": 0.6444934030105971, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2422, + "step": 5677 + }, + { + "epoch": 0.6446069301205162, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2489, + "step": 5678 + }, + { + "epoch": 0.6447204572304353, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2607, + "step": 5679 + }, + { + "epoch": 0.6448339843403543, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2389, + "step": 5680 + }, + { + "epoch": 0.6449475114502734, + "grad_norm": 0.5234375, + "learning_rate": 0.002, + "loss": 5.2634, + "step": 5681 + }, + { + "epoch": 0.6450610385601925, + "grad_norm": 0.494140625, + "learning_rate": 0.002, + "loss": 5.2509, + "step": 5682 + }, + { + "epoch": 0.6451745656701116, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.2313, + "step": 5683 + }, + { + "epoch": 0.6452880927800306, + "grad_norm": 0.482421875, + "learning_rate": 0.002, + "loss": 5.2576, + "step": 5684 + }, + { + "epoch": 0.6454016198899497, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.2456, + "step": 5685 + }, + { + "epoch": 0.6455151469998688, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.2338, + "step": 5686 + }, + { + "epoch": 0.6456286741097879, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2339, + "step": 5687 + }, + { + "epoch": 0.6457422012197069, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2463, + "step": 5688 + }, + { + "epoch": 0.645855728329626, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2319, + "step": 5689 + }, + { + "epoch": 0.6459692554395451, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2573, + "step": 5690 + }, + { + "epoch": 0.6460827825494642, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2331, + "step": 5691 + }, + { + "epoch": 0.6461963096593832, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2396, + "step": 5692 + }, + { + "epoch": 0.6463098367693023, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2397, + "step": 5693 + }, + { + "epoch": 0.6464233638792214, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2427, + "step": 5694 + }, + { + "epoch": 0.6465368909891405, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2612, + "step": 5695 + }, + { + "epoch": 0.6466504180990595, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2435, + "step": 5696 + }, + { + "epoch": 0.6467639452089786, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2546, + "step": 5697 + }, + { + "epoch": 0.6468774723188977, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2511, + "step": 5698 + }, + { + "epoch": 0.6469909994288168, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2284, + "step": 5699 + }, + { + "epoch": 0.6471045265387358, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2534, + "step": 5700 + }, + { + "epoch": 0.6472180536486549, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2725, + "step": 5701 + }, + { + "epoch": 0.647331580758574, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.244, + "step": 5702 + }, + { + "epoch": 0.6474451078684931, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2709, + "step": 5703 + }, + { + "epoch": 0.6475586349784122, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2625, + "step": 5704 + }, + { + "epoch": 0.6476721620883312, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2444, + "step": 5705 + }, + { + "epoch": 0.6477856891982503, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2418, + "step": 5706 + }, + { + "epoch": 0.6478992163081694, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2516, + "step": 5707 + }, + { + "epoch": 0.6480127434180885, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2276, + "step": 5708 + }, + { + "epoch": 0.6481262705280075, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2341, + "step": 5709 + }, + { + "epoch": 0.6482397976379266, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2489, + "step": 5710 + }, + { + "epoch": 0.6483533247478457, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2395, + "step": 5711 + }, + { + "epoch": 0.6484668518577648, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2449, + "step": 5712 + }, + { + "epoch": 0.6485803789676838, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2309, + "step": 5713 + }, + { + "epoch": 0.6486939060776029, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2451, + "step": 5714 + }, + { + "epoch": 0.648807433187522, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.261, + "step": 5715 + }, + { + "epoch": 0.6489209602974411, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 5716 + }, + { + "epoch": 0.6490344874073601, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.2522, + "step": 5717 + }, + { + "epoch": 0.6491480145172792, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2614, + "step": 5718 + }, + { + "epoch": 0.6492615416271983, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2594, + "step": 5719 + }, + { + "epoch": 0.6493750687371174, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2448, + "step": 5720 + }, + { + "epoch": 0.6494885958470364, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2521, + "step": 5721 + }, + { + "epoch": 0.6496021229569555, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2552, + "step": 5722 + }, + { + "epoch": 0.6497156500668746, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2395, + "step": 5723 + }, + { + "epoch": 0.6498291771767937, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 5724 + }, + { + "epoch": 0.6499427042867127, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2377, + "step": 5725 + }, + { + "epoch": 0.6500562313966318, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2389, + "step": 5726 + }, + { + "epoch": 0.6501697585065509, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2565, + "step": 5727 + }, + { + "epoch": 0.65028328561647, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2521, + "step": 5728 + }, + { + "epoch": 0.650396812726389, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2386, + "step": 5729 + }, + { + "epoch": 0.6505103398363081, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2743, + "step": 5730 + }, + { + "epoch": 0.6506238669462272, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2373, + "step": 5731 + }, + { + "epoch": 0.6507373940561463, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2528, + "step": 5732 + }, + { + "epoch": 0.6508509211660654, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.2502, + "step": 5733 + }, + { + "epoch": 0.6509644482759844, + "grad_norm": 0.22265625, + "learning_rate": 0.002, + "loss": 5.2644, + "step": 5734 + }, + { + "epoch": 0.6510779753859035, + "grad_norm": 0.21875, + "learning_rate": 0.002, + "loss": 5.2405, + "step": 5735 + }, + { + "epoch": 0.6511915024958226, + "grad_norm": 0.2216796875, + "learning_rate": 0.002, + "loss": 5.2586, + "step": 5736 + }, + { + "epoch": 0.6513050296057417, + "grad_norm": 0.2421875, + "learning_rate": 0.002, + "loss": 5.2561, + "step": 5737 + }, + { + "epoch": 0.6514185567156607, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2477, + "step": 5738 + }, + { + "epoch": 0.6515320838255798, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.26, + "step": 5739 + }, + { + "epoch": 0.6516456109354989, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2371, + "step": 5740 + }, + { + "epoch": 0.651759138045418, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2549, + "step": 5741 + }, + { + "epoch": 0.651872665155337, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.2639, + "step": 5742 + }, + { + "epoch": 0.6519861922652561, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2486, + "step": 5743 + }, + { + "epoch": 0.6520997193751752, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 5744 + }, + { + "epoch": 0.6522132464850943, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2502, + "step": 5745 + }, + { + "epoch": 0.6523267735950133, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2675, + "step": 5746 + }, + { + "epoch": 0.6524403007049324, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.236, + "step": 5747 + }, + { + "epoch": 0.6525538278148515, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2457, + "step": 5748 + }, + { + "epoch": 0.6526673549247706, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2436, + "step": 5749 + }, + { + "epoch": 0.6527808820346896, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2464, + "step": 5750 + }, + { + "epoch": 0.6528944091446087, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2498, + "step": 5751 + }, + { + "epoch": 0.6530079362545278, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2416, + "step": 5752 + }, + { + "epoch": 0.6531214633644469, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 5753 + }, + { + "epoch": 0.653234990474366, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2726, + "step": 5754 + }, + { + "epoch": 0.653348517584285, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.269, + "step": 5755 + }, + { + "epoch": 0.6534620446942041, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.248, + "step": 5756 + }, + { + "epoch": 0.6535755718041232, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2545, + "step": 5757 + }, + { + "epoch": 0.6536890989140423, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2515, + "step": 5758 + }, + { + "epoch": 0.6538026260239613, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2563, + "step": 5759 + }, + { + "epoch": 0.6539161531338804, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2698, + "step": 5760 + }, + { + "epoch": 0.6540296802437995, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.2411, + "step": 5761 + }, + { + "epoch": 0.6541432073537186, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2692, + "step": 5762 + }, + { + "epoch": 0.6542567344636376, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2513, + "step": 5763 + }, + { + "epoch": 0.6543702615735567, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2468, + "step": 5764 + }, + { + "epoch": 0.6544837886834758, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2363, + "step": 5765 + }, + { + "epoch": 0.6545973157933949, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.254, + "step": 5766 + }, + { + "epoch": 0.6547108429033139, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2767, + "step": 5767 + }, + { + "epoch": 0.654824370013233, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2168, + "step": 5768 + }, + { + "epoch": 0.6549378971231521, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2608, + "step": 5769 + }, + { + "epoch": 0.6550514242330712, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2596, + "step": 5770 + }, + { + "epoch": 0.6551649513429902, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2526, + "step": 5771 + }, + { + "epoch": 0.6552784784529093, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2665, + "step": 5772 + }, + { + "epoch": 0.6553920055628284, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2638, + "step": 5773 + }, + { + "epoch": 0.6555055326727475, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2483, + "step": 5774 + }, + { + "epoch": 0.6556190597826665, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2619, + "step": 5775 + }, + { + "epoch": 0.6557325868925856, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2591, + "step": 5776 + }, + { + "epoch": 0.6558461140025047, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2255, + "step": 5777 + }, + { + "epoch": 0.6559596411124238, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2452, + "step": 5778 + }, + { + "epoch": 0.6560731682223429, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2466, + "step": 5779 + }, + { + "epoch": 0.6561866953322619, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2522, + "step": 5780 + }, + { + "epoch": 0.656300222442181, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2565, + "step": 5781 + }, + { + "epoch": 0.6564137495521001, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2356, + "step": 5782 + }, + { + "epoch": 0.6565272766620192, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2484, + "step": 5783 + }, + { + "epoch": 0.6566408037719382, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2446, + "step": 5784 + }, + { + "epoch": 0.6567543308818573, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2522, + "step": 5785 + }, + { + "epoch": 0.6568678579917764, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2751, + "step": 5786 + }, + { + "epoch": 0.6569813851016955, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2499, + "step": 5787 + }, + { + "epoch": 0.6570949122116145, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.257, + "step": 5788 + }, + { + "epoch": 0.6572084393215336, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.275, + "step": 5789 + }, + { + "epoch": 0.6573219664314527, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2461, + "step": 5790 + }, + { + "epoch": 0.6574354935413718, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2315, + "step": 5791 + }, + { + "epoch": 0.6575490206512908, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2415, + "step": 5792 + }, + { + "epoch": 0.6576625477612099, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2316, + "step": 5793 + }, + { + "epoch": 0.657776074871129, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2361, + "step": 5794 + }, + { + "epoch": 0.6578896019810481, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2401, + "step": 5795 + }, + { + "epoch": 0.6580031290909671, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2528, + "step": 5796 + }, + { + "epoch": 0.6581166562008862, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2531, + "step": 5797 + }, + { + "epoch": 0.6582301833108053, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2496, + "step": 5798 + }, + { + "epoch": 0.6583437104207244, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2559, + "step": 5799 + }, + { + "epoch": 0.6584572375306434, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2464, + "step": 5800 + }, + { + "epoch": 0.6585707646405625, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2596, + "step": 5801 + }, + { + "epoch": 0.6586842917504816, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.285, + "step": 5802 + }, + { + "epoch": 0.6587978188604007, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.245, + "step": 5803 + }, + { + "epoch": 0.6589113459703198, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2498, + "step": 5804 + }, + { + "epoch": 0.6590248730802388, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2349, + "step": 5805 + }, + { + "epoch": 0.6591384001901579, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.2382, + "step": 5806 + }, + { + "epoch": 0.659251927300077, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.2642, + "step": 5807 + }, + { + "epoch": 0.659365454409996, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2397, + "step": 5808 + }, + { + "epoch": 0.6594789815199151, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.24, + "step": 5809 + }, + { + "epoch": 0.6595925086298342, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2341, + "step": 5810 + }, + { + "epoch": 0.6597060357397533, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2494, + "step": 5811 + }, + { + "epoch": 0.6598195628496724, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2617, + "step": 5812 + }, + { + "epoch": 0.6599330899595914, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2566, + "step": 5813 + }, + { + "epoch": 0.6600466170695105, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2492, + "step": 5814 + }, + { + "epoch": 0.6601601441794296, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2523, + "step": 5815 + }, + { + "epoch": 0.6602736712893487, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2357, + "step": 5816 + }, + { + "epoch": 0.6603871983992677, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.234, + "step": 5817 + }, + { + "epoch": 0.6605007255091868, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2466, + "step": 5818 + }, + { + "epoch": 0.6606142526191059, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2188, + "step": 5819 + }, + { + "epoch": 0.660727779729025, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 5820 + }, + { + "epoch": 0.660841306838944, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2456, + "step": 5821 + }, + { + "epoch": 0.6609548339488631, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2541, + "step": 5822 + }, + { + "epoch": 0.6610683610587822, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.258, + "step": 5823 + }, + { + "epoch": 0.6611818881687013, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2659, + "step": 5824 + }, + { + "epoch": 0.6612954152786203, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2447, + "step": 5825 + }, + { + "epoch": 0.6614089423885394, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2445, + "step": 5826 + }, + { + "epoch": 0.6615224694984585, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2328, + "step": 5827 + }, + { + "epoch": 0.6616359966083776, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2688, + "step": 5828 + }, + { + "epoch": 0.6617495237182967, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2465, + "step": 5829 + }, + { + "epoch": 0.6618630508282157, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2442, + "step": 5830 + }, + { + "epoch": 0.6619765779381348, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2527, + "step": 5831 + }, + { + "epoch": 0.6620901050480539, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2277, + "step": 5832 + }, + { + "epoch": 0.662203632157973, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2274, + "step": 5833 + }, + { + "epoch": 0.662317159267892, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2492, + "step": 5834 + }, + { + "epoch": 0.6624306863778111, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2514, + "step": 5835 + }, + { + "epoch": 0.6625442134877302, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2623, + "step": 5836 + }, + { + "epoch": 0.6626577405976493, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2227, + "step": 5837 + }, + { + "epoch": 0.6627712677075683, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2543, + "step": 5838 + }, + { + "epoch": 0.6628847948174874, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2362, + "step": 5839 + }, + { + "epoch": 0.6629983219274065, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2457, + "step": 5840 + }, + { + "epoch": 0.6631118490373256, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2402, + "step": 5841 + }, + { + "epoch": 0.6632253761472446, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2534, + "step": 5842 + }, + { + "epoch": 0.6633389032571637, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2417, + "step": 5843 + }, + { + "epoch": 0.6634524303670828, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.261, + "step": 5844 + }, + { + "epoch": 0.6635659574770019, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2506, + "step": 5845 + }, + { + "epoch": 0.6636794845869209, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2303, + "step": 5846 + }, + { + "epoch": 0.66379301169684, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2469, + "step": 5847 + }, + { + "epoch": 0.6639065388067591, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2291, + "step": 5848 + }, + { + "epoch": 0.6640200659166782, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2097, + "step": 5849 + }, + { + "epoch": 0.6641335930265972, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2439, + "step": 5850 + }, + { + "epoch": 0.6642471201365163, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2527, + "step": 5851 + }, + { + "epoch": 0.6643606472464354, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2313, + "step": 5852 + }, + { + "epoch": 0.6644741743563545, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2552, + "step": 5853 + }, + { + "epoch": 0.6645877014662736, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.261, + "step": 5854 + }, + { + "epoch": 0.6647012285761926, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.2332, + "step": 5855 + }, + { + "epoch": 0.6648147556861117, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.2473, + "step": 5856 + }, + { + "epoch": 0.6649282827960308, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2228, + "step": 5857 + }, + { + "epoch": 0.6650418099059499, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2657, + "step": 5858 + }, + { + "epoch": 0.6651553370158689, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2418, + "step": 5859 + }, + { + "epoch": 0.665268864125788, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2578, + "step": 5860 + }, + { + "epoch": 0.6653823912357071, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2593, + "step": 5861 + }, + { + "epoch": 0.6654959183456262, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2296, + "step": 5862 + }, + { + "epoch": 0.6656094454555452, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2287, + "step": 5863 + }, + { + "epoch": 0.6657229725654643, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2352, + "step": 5864 + }, + { + "epoch": 0.6658364996753834, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2523, + "step": 5865 + }, + { + "epoch": 0.6659500267853025, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2488, + "step": 5866 + }, + { + "epoch": 0.6660635538952215, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2259, + "step": 5867 + }, + { + "epoch": 0.6661770810051406, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2569, + "step": 5868 + }, + { + "epoch": 0.6662906081150597, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2565, + "step": 5869 + }, + { + "epoch": 0.6664041352249788, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2087, + "step": 5870 + }, + { + "epoch": 0.6665176623348978, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2486, + "step": 5871 + }, + { + "epoch": 0.6666311894448169, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2596, + "step": 5872 + }, + { + "epoch": 0.666744716554736, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2466, + "step": 5873 + }, + { + "epoch": 0.6668582436646551, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2561, + "step": 5874 + }, + { + "epoch": 0.6669717707745741, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2672, + "step": 5875 + }, + { + "epoch": 0.6670852978844932, + "grad_norm": 0.515625, + "learning_rate": 0.002, + "loss": 5.2672, + "step": 5876 + }, + { + "epoch": 0.6671988249944123, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.233, + "step": 5877 + }, + { + "epoch": 0.6673123521043314, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2522, + "step": 5878 + }, + { + "epoch": 0.6674258792142505, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2337, + "step": 5879 + }, + { + "epoch": 0.6675394063241695, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.237, + "step": 5880 + }, + { + "epoch": 0.6676529334340886, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2511, + "step": 5881 + }, + { + "epoch": 0.6677664605440077, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2521, + "step": 5882 + }, + { + "epoch": 0.6678799876539268, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2497, + "step": 5883 + }, + { + "epoch": 0.6679935147638458, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2291, + "step": 5884 + }, + { + "epoch": 0.6681070418737649, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2697, + "step": 5885 + }, + { + "epoch": 0.668220568983684, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2636, + "step": 5886 + }, + { + "epoch": 0.6683340960936031, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2371, + "step": 5887 + }, + { + "epoch": 0.6684476232035221, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2477, + "step": 5888 + }, + { + "epoch": 0.6685611503134412, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2298, + "step": 5889 + }, + { + "epoch": 0.6686746774233603, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.248, + "step": 5890 + }, + { + "epoch": 0.6687882045332794, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2366, + "step": 5891 + }, + { + "epoch": 0.6689017316431984, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 5892 + }, + { + "epoch": 0.6690152587531175, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2462, + "step": 5893 + }, + { + "epoch": 0.6691287858630366, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.234, + "step": 5894 + }, + { + "epoch": 0.6692423129729557, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2532, + "step": 5895 + }, + { + "epoch": 0.6693558400828747, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 5896 + }, + { + "epoch": 0.6694693671927938, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2415, + "step": 5897 + }, + { + "epoch": 0.6695828943027129, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.235, + "step": 5898 + }, + { + "epoch": 0.669696421412632, + "grad_norm": 0.2353515625, + "learning_rate": 0.002, + "loss": 5.2393, + "step": 5899 + }, + { + "epoch": 0.669809948522551, + "grad_norm": 0.2373046875, + "learning_rate": 0.002, + "loss": 5.2381, + "step": 5900 + }, + { + "epoch": 0.6699234756324701, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2339, + "step": 5901 + }, + { + "epoch": 0.6700370027423892, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2339, + "step": 5902 + }, + { + "epoch": 0.6701505298523083, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2733, + "step": 5903 + }, + { + "epoch": 0.6702640569622274, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2526, + "step": 5904 + }, + { + "epoch": 0.6703775840721464, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2446, + "step": 5905 + }, + { + "epoch": 0.6704911111820655, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2448, + "step": 5906 + }, + { + "epoch": 0.6706046382919846, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2374, + "step": 5907 + }, + { + "epoch": 0.6707181654019037, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2521, + "step": 5908 + }, + { + "epoch": 0.6708316925118227, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2458, + "step": 5909 + }, + { + "epoch": 0.6709452196217418, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2434, + "step": 5910 + }, + { + "epoch": 0.6710587467316609, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 5911 + }, + { + "epoch": 0.67117227384158, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2413, + "step": 5912 + }, + { + "epoch": 0.6712858009514991, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2272, + "step": 5913 + }, + { + "epoch": 0.6713993280614182, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.239, + "step": 5914 + }, + { + "epoch": 0.6715128551713373, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2312, + "step": 5915 + }, + { + "epoch": 0.6716263822812564, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2384, + "step": 5916 + }, + { + "epoch": 0.6717399093911754, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2516, + "step": 5917 + }, + { + "epoch": 0.6718534365010945, + "grad_norm": 0.46875, + "learning_rate": 0.002, + "loss": 5.2672, + "step": 5918 + }, + { + "epoch": 0.6719669636110136, + "grad_norm": 0.443359375, + "learning_rate": 0.002, + "loss": 5.2504, + "step": 5919 + }, + { + "epoch": 0.6720804907209327, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.2494, + "step": 5920 + }, + { + "epoch": 0.6721940178308518, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.2597, + "step": 5921 + }, + { + "epoch": 0.6723075449407708, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 5922 + }, + { + "epoch": 0.6724210720506899, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2595, + "step": 5923 + }, + { + "epoch": 0.672534599160609, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2413, + "step": 5924 + }, + { + "epoch": 0.6726481262705281, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2274, + "step": 5925 + }, + { + "epoch": 0.6727616533804471, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2361, + "step": 5926 + }, + { + "epoch": 0.6728751804903662, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2595, + "step": 5927 + }, + { + "epoch": 0.6729887076002853, + "grad_norm": 0.2470703125, + "learning_rate": 0.002, + "loss": 5.2586, + "step": 5928 + }, + { + "epoch": 0.6731022347102044, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2615, + "step": 5929 + }, + { + "epoch": 0.6732157618201234, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2372, + "step": 5930 + }, + { + "epoch": 0.6733292889300425, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2443, + "step": 5931 + }, + { + "epoch": 0.6734428160399616, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2524, + "step": 5932 + }, + { + "epoch": 0.6735563431498807, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2573, + "step": 5933 + }, + { + "epoch": 0.6736698702597997, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2465, + "step": 5934 + }, + { + "epoch": 0.6737833973697188, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2429, + "step": 5935 + }, + { + "epoch": 0.6738969244796379, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2555, + "step": 5936 + }, + { + "epoch": 0.674010451589557, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2504, + "step": 5937 + }, + { + "epoch": 0.674123978699476, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2389, + "step": 5938 + }, + { + "epoch": 0.6742375058093951, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.245, + "step": 5939 + }, + { + "epoch": 0.6743510329193142, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2555, + "step": 5940 + }, + { + "epoch": 0.6744645600292333, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2303, + "step": 5941 + }, + { + "epoch": 0.6745780871391523, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2276, + "step": 5942 + }, + { + "epoch": 0.6746916142490714, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2482, + "step": 5943 + }, + { + "epoch": 0.6748051413589905, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2511, + "step": 5944 + }, + { + "epoch": 0.6749186684689096, + "grad_norm": 0.23046875, + "learning_rate": 0.002, + "loss": 5.2159, + "step": 5945 + }, + { + "epoch": 0.6750321955788287, + "grad_norm": 0.228515625, + "learning_rate": 0.002, + "loss": 5.2413, + "step": 5946 + }, + { + "epoch": 0.6751457226887477, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2412, + "step": 5947 + }, + { + "epoch": 0.6752592497986668, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2137, + "step": 5948 + }, + { + "epoch": 0.6753727769085859, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2364, + "step": 5949 + }, + { + "epoch": 0.675486304018505, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2222, + "step": 5950 + }, + { + "epoch": 0.675599831128424, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.237, + "step": 5951 + }, + { + "epoch": 0.6757133582383431, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2289, + "step": 5952 + }, + { + "epoch": 0.6758268853482622, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2329, + "step": 5953 + }, + { + "epoch": 0.6759404124581813, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.245, + "step": 5954 + }, + { + "epoch": 0.6760539395681003, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2435, + "step": 5955 + }, + { + "epoch": 0.6761674666780194, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2424, + "step": 5956 + }, + { + "epoch": 0.6762809937879385, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2476, + "step": 5957 + }, + { + "epoch": 0.6763945208978576, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.229, + "step": 5958 + }, + { + "epoch": 0.6765080480077766, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2421, + "step": 5959 + }, + { + "epoch": 0.6766215751176957, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2506, + "step": 5960 + }, + { + "epoch": 0.6767351022276148, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2437, + "step": 5961 + }, + { + "epoch": 0.6768486293375339, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2622, + "step": 5962 + }, + { + "epoch": 0.676962156447453, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2488, + "step": 5963 + }, + { + "epoch": 0.677075683557372, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2457, + "step": 5964 + }, + { + "epoch": 0.6771892106672911, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2546, + "step": 5965 + }, + { + "epoch": 0.6773027377772102, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2479, + "step": 5966 + }, + { + "epoch": 0.6774162648871292, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.237, + "step": 5967 + }, + { + "epoch": 0.6775297919970483, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2383, + "step": 5968 + }, + { + "epoch": 0.6776433191069674, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2344, + "step": 5969 + }, + { + "epoch": 0.6777568462168865, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2657, + "step": 5970 + }, + { + "epoch": 0.6778703733268056, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2456, + "step": 5971 + }, + { + "epoch": 0.6779839004367246, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2556, + "step": 5972 + }, + { + "epoch": 0.6780974275466437, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2474, + "step": 5973 + }, + { + "epoch": 0.6782109546565628, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2349, + "step": 5974 + }, + { + "epoch": 0.6783244817664819, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2473, + "step": 5975 + }, + { + "epoch": 0.6784380088764009, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2307, + "step": 5976 + }, + { + "epoch": 0.67855153598632, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2411, + "step": 5977 + }, + { + "epoch": 0.6786650630962391, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2479, + "step": 5978 + }, + { + "epoch": 0.6787785902061582, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2395, + "step": 5979 + }, + { + "epoch": 0.6788921173160772, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2386, + "step": 5980 + }, + { + "epoch": 0.6790056444259963, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2367, + "step": 5981 + }, + { + "epoch": 0.6791191715359154, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2321, + "step": 5982 + }, + { + "epoch": 0.6792326986458345, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.252, + "step": 5983 + }, + { + "epoch": 0.6793462257557535, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2381, + "step": 5984 + }, + { + "epoch": 0.6794597528656726, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2544, + "step": 5985 + }, + { + "epoch": 0.6795732799755917, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2316, + "step": 5986 + }, + { + "epoch": 0.6796868070855108, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2453, + "step": 5987 + }, + { + "epoch": 0.6798003341954298, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2529, + "step": 5988 + }, + { + "epoch": 0.6799138613053489, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2416, + "step": 5989 + }, + { + "epoch": 0.680027388415268, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2154, + "step": 5990 + }, + { + "epoch": 0.6801409155251871, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2423, + "step": 5991 + }, + { + "epoch": 0.6802544426351061, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2402, + "step": 5992 + }, + { + "epoch": 0.6803679697450252, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2468, + "step": 5993 + }, + { + "epoch": 0.6804814968549443, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2356, + "step": 5994 + }, + { + "epoch": 0.6805950239648634, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 5995 + }, + { + "epoch": 0.6807085510747825, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2732, + "step": 5996 + }, + { + "epoch": 0.6808220781847015, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2705, + "step": 5997 + }, + { + "epoch": 0.6809356052946206, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2548, + "step": 5998 + }, + { + "epoch": 0.6810491324045397, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2282, + "step": 5999 + }, + { + "epoch": 0.6811626595144588, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2515, + "step": 6000 + }, + { + "epoch": 0.6812761866243778, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2479, + "step": 6001 + }, + { + "epoch": 0.6813897137342969, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2339, + "step": 6002 + }, + { + "epoch": 0.681503240844216, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.248, + "step": 6003 + }, + { + "epoch": 0.6816167679541351, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2485, + "step": 6004 + }, + { + "epoch": 0.6817302950640541, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 6005 + }, + { + "epoch": 0.6818438221739732, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2382, + "step": 6006 + }, + { + "epoch": 0.6819573492838923, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2447, + "step": 6007 + }, + { + "epoch": 0.6820708763938114, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2375, + "step": 6008 + }, + { + "epoch": 0.6821844035037304, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2495, + "step": 6009 + }, + { + "epoch": 0.6822979306136495, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2572, + "step": 6010 + }, + { + "epoch": 0.6824114577235686, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.235, + "step": 6011 + }, + { + "epoch": 0.6825249848334877, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2647, + "step": 6012 + }, + { + "epoch": 0.6826385119434067, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 6013 + }, + { + "epoch": 0.6827520390533258, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2274, + "step": 6014 + }, + { + "epoch": 0.6828655661632449, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2429, + "step": 6015 + }, + { + "epoch": 0.682979093273164, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2224, + "step": 6016 + }, + { + "epoch": 0.683092620383083, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.229, + "step": 6017 + }, + { + "epoch": 0.6832061474930021, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2476, + "step": 6018 + }, + { + "epoch": 0.6833196746029212, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2536, + "step": 6019 + }, + { + "epoch": 0.6834332017128403, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.261, + "step": 6020 + }, + { + "epoch": 0.6835467288227594, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.252, + "step": 6021 + }, + { + "epoch": 0.6836602559326784, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.2217, + "step": 6022 + }, + { + "epoch": 0.6837737830425975, + "grad_norm": 0.234375, + "learning_rate": 0.002, + "loss": 5.2421, + "step": 6023 + }, + { + "epoch": 0.6838873101525166, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.2475, + "step": 6024 + }, + { + "epoch": 0.6840008372624357, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2412, + "step": 6025 + }, + { + "epoch": 0.6841143643723547, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2385, + "step": 6026 + }, + { + "epoch": 0.6842278914822738, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2638, + "step": 6027 + }, + { + "epoch": 0.6843414185921929, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2516, + "step": 6028 + }, + { + "epoch": 0.684454945702112, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 6029 + }, + { + "epoch": 0.684568472812031, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2436, + "step": 6030 + }, + { + "epoch": 0.6846819999219501, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2438, + "step": 6031 + }, + { + "epoch": 0.6847955270318692, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2309, + "step": 6032 + }, + { + "epoch": 0.6849090541417883, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2346, + "step": 6033 + }, + { + "epoch": 0.6850225812517073, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2245, + "step": 6034 + }, + { + "epoch": 0.6851361083616264, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 6035 + }, + { + "epoch": 0.6852496354715455, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2252, + "step": 6036 + }, + { + "epoch": 0.6853631625814646, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2452, + "step": 6037 + }, + { + "epoch": 0.6854766896913836, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 6038 + }, + { + "epoch": 0.6855902168013027, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2423, + "step": 6039 + }, + { + "epoch": 0.6857037439112218, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2294, + "step": 6040 + }, + { + "epoch": 0.6858172710211409, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2474, + "step": 6041 + }, + { + "epoch": 0.68593079813106, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2509, + "step": 6042 + }, + { + "epoch": 0.686044325240979, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2411, + "step": 6043 + }, + { + "epoch": 0.6861578523508981, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2347, + "step": 6044 + }, + { + "epoch": 0.6862713794608172, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2391, + "step": 6045 + }, + { + "epoch": 0.6863849065707363, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2634, + "step": 6046 + }, + { + "epoch": 0.6864984336806553, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2518, + "step": 6047 + }, + { + "epoch": 0.6866119607905744, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.2406, + "step": 6048 + }, + { + "epoch": 0.6867254879004935, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2374, + "step": 6049 + }, + { + "epoch": 0.6868390150104126, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 6050 + }, + { + "epoch": 0.6869525421203316, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2462, + "step": 6051 + }, + { + "epoch": 0.6870660692302507, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2577, + "step": 6052 + }, + { + "epoch": 0.6871795963401698, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2708, + "step": 6053 + }, + { + "epoch": 0.6872931234500889, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.223, + "step": 6054 + }, + { + "epoch": 0.6874066505600079, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2549, + "step": 6055 + }, + { + "epoch": 0.687520177669927, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2377, + "step": 6056 + }, + { + "epoch": 0.6876337047798461, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2208, + "step": 6057 + }, + { + "epoch": 0.6877472318897652, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2376, + "step": 6058 + }, + { + "epoch": 0.6878607589996842, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2575, + "step": 6059 + }, + { + "epoch": 0.6879742861096033, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2225, + "step": 6060 + }, + { + "epoch": 0.6880878132195224, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2431, + "step": 6061 + }, + { + "epoch": 0.6882013403294415, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2485, + "step": 6062 + }, + { + "epoch": 0.6883148674393605, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2608, + "step": 6063 + }, + { + "epoch": 0.6884283945492796, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2388, + "step": 6064 + }, + { + "epoch": 0.6885419216591987, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2441, + "step": 6065 + }, + { + "epoch": 0.6886554487691178, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.242, + "step": 6066 + }, + { + "epoch": 0.6887689758790368, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2494, + "step": 6067 + }, + { + "epoch": 0.6888825029889559, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2346, + "step": 6068 + }, + { + "epoch": 0.688996030098875, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2521, + "step": 6069 + }, + { + "epoch": 0.6891095572087941, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 6070 + }, + { + "epoch": 0.6892230843187132, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2626, + "step": 6071 + }, + { + "epoch": 0.6893366114286322, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2497, + "step": 6072 + }, + { + "epoch": 0.6894501385385513, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 6073 + }, + { + "epoch": 0.6895636656484704, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2295, + "step": 6074 + }, + { + "epoch": 0.6896771927583895, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2321, + "step": 6075 + }, + { + "epoch": 0.6897907198683085, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2544, + "step": 6076 + }, + { + "epoch": 0.6899042469782276, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2312, + "step": 6077 + }, + { + "epoch": 0.6900177740881467, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2409, + "step": 6078 + }, + { + "epoch": 0.6901313011980658, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2494, + "step": 6079 + }, + { + "epoch": 0.6902448283079848, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2417, + "step": 6080 + }, + { + "epoch": 0.6903583554179039, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2411, + "step": 6081 + }, + { + "epoch": 0.690471882527823, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2486, + "step": 6082 + }, + { + "epoch": 0.6905854096377421, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2362, + "step": 6083 + }, + { + "epoch": 0.6906989367476611, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2228, + "step": 6084 + }, + { + "epoch": 0.6908124638575802, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2461, + "step": 6085 + }, + { + "epoch": 0.6909259909674993, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2488, + "step": 6086 + }, + { + "epoch": 0.6910395180774184, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2303, + "step": 6087 + }, + { + "epoch": 0.6911530451873374, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.25, + "step": 6088 + }, + { + "epoch": 0.6912665722972565, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2485, + "step": 6089 + }, + { + "epoch": 0.6913800994071756, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2409, + "step": 6090 + }, + { + "epoch": 0.6914936265170947, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2462, + "step": 6091 + }, + { + "epoch": 0.6916071536270137, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.242, + "step": 6092 + }, + { + "epoch": 0.6917206807369328, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.245, + "step": 6093 + }, + { + "epoch": 0.6918342078468519, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2544, + "step": 6094 + }, + { + "epoch": 0.691947734956771, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2309, + "step": 6095 + }, + { + "epoch": 0.69206126206669, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2487, + "step": 6096 + }, + { + "epoch": 0.6921747891766091, + "grad_norm": 0.41015625, + "learning_rate": 0.002, + "loss": 5.2367, + "step": 6097 + }, + { + "epoch": 0.6922883162865282, + "grad_norm": 0.44140625, + "learning_rate": 0.002, + "loss": 5.2588, + "step": 6098 + }, + { + "epoch": 0.6924018433964473, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.2361, + "step": 6099 + }, + { + "epoch": 0.6925153705063664, + "grad_norm": 0.421875, + "learning_rate": 0.002, + "loss": 5.2485, + "step": 6100 + }, + { + "epoch": 0.6926288976162854, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.2512, + "step": 6101 + }, + { + "epoch": 0.6927424247262045, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.2459, + "step": 6102 + }, + { + "epoch": 0.6928559518361236, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2693, + "step": 6103 + }, + { + "epoch": 0.6929694789460427, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2527, + "step": 6104 + }, + { + "epoch": 0.6930830060559617, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 6105 + }, + { + "epoch": 0.6931965331658808, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2445, + "step": 6106 + }, + { + "epoch": 0.6933100602757999, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2548, + "step": 6107 + }, + { + "epoch": 0.693423587385719, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2422, + "step": 6108 + }, + { + "epoch": 0.693537114495638, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2399, + "step": 6109 + }, + { + "epoch": 0.6936506416055571, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2513, + "step": 6110 + }, + { + "epoch": 0.6937641687154762, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2447, + "step": 6111 + }, + { + "epoch": 0.6938776958253953, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 6112 + }, + { + "epoch": 0.6939912229353143, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2534, + "step": 6113 + }, + { + "epoch": 0.6941047500452334, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2575, + "step": 6114 + }, + { + "epoch": 0.6942182771551525, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2382, + "step": 6115 + }, + { + "epoch": 0.6943318042650716, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2273, + "step": 6116 + }, + { + "epoch": 0.6944453313749906, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2584, + "step": 6117 + }, + { + "epoch": 0.6945588584849097, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2405, + "step": 6118 + }, + { + "epoch": 0.6946723855948288, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2277, + "step": 6119 + }, + { + "epoch": 0.6947859127047479, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2442, + "step": 6120 + }, + { + "epoch": 0.694899439814667, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2261, + "step": 6121 + }, + { + "epoch": 0.695012966924586, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2474, + "step": 6122 + }, + { + "epoch": 0.6951264940345051, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2591, + "step": 6123 + }, + { + "epoch": 0.6952400211444242, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2372, + "step": 6124 + }, + { + "epoch": 0.6953535482543433, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2371, + "step": 6125 + }, + { + "epoch": 0.6954670753642623, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2758, + "step": 6126 + }, + { + "epoch": 0.6955806024741814, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2541, + "step": 6127 + }, + { + "epoch": 0.6956941295841005, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2535, + "step": 6128 + }, + { + "epoch": 0.6958076566940196, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2459, + "step": 6129 + }, + { + "epoch": 0.6959211838039386, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2402, + "step": 6130 + }, + { + "epoch": 0.6960347109138577, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2125, + "step": 6131 + }, + { + "epoch": 0.6961482380237768, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2431, + "step": 6132 + }, + { + "epoch": 0.6962617651336959, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2259, + "step": 6133 + }, + { + "epoch": 0.6963752922436149, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2648, + "step": 6134 + }, + { + "epoch": 0.696488819353534, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.249, + "step": 6135 + }, + { + "epoch": 0.6966023464634531, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2362, + "step": 6136 + }, + { + "epoch": 0.6967158735733722, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2544, + "step": 6137 + }, + { + "epoch": 0.6968294006832912, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.252, + "step": 6138 + }, + { + "epoch": 0.6969429277932103, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2261, + "step": 6139 + }, + { + "epoch": 0.6970564549031294, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2624, + "step": 6140 + }, + { + "epoch": 0.6971699820130485, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2722, + "step": 6141 + }, + { + "epoch": 0.6972835091229675, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2489, + "step": 6142 + }, + { + "epoch": 0.6973970362328866, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2629, + "step": 6143 + }, + { + "epoch": 0.6975105633428057, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2544, + "step": 6144 + }, + { + "epoch": 0.6976240904527248, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2501, + "step": 6145 + }, + { + "epoch": 0.6977376175626439, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2579, + "step": 6146 + }, + { + "epoch": 0.6978511446725629, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2549, + "step": 6147 + }, + { + "epoch": 0.697964671782482, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2363, + "step": 6148 + }, + { + "epoch": 0.6980781988924011, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2774, + "step": 6149 + }, + { + "epoch": 0.6981917260023202, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2517, + "step": 6150 + }, + { + "epoch": 0.6983052531122392, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2218, + "step": 6151 + }, + { + "epoch": 0.6984187802221583, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2258, + "step": 6152 + }, + { + "epoch": 0.6985323073320774, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2366, + "step": 6153 + }, + { + "epoch": 0.6986458344419966, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.2476, + "step": 6154 + }, + { + "epoch": 0.6987593615519156, + "grad_norm": 0.4375, + "learning_rate": 0.002, + "loss": 5.2622, + "step": 6155 + }, + { + "epoch": 0.6988728886618347, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.2498, + "step": 6156 + }, + { + "epoch": 0.6989864157717538, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.2578, + "step": 6157 + }, + { + "epoch": 0.6990999428816729, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.2555, + "step": 6158 + }, + { + "epoch": 0.699213469991592, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2556, + "step": 6159 + }, + { + "epoch": 0.699326997101511, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2488, + "step": 6160 + }, + { + "epoch": 0.6994405242114301, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2494, + "step": 6161 + }, + { + "epoch": 0.6995540513213492, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2434, + "step": 6162 + }, + { + "epoch": 0.6996675784312683, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2354, + "step": 6163 + }, + { + "epoch": 0.6997811055411873, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2331, + "step": 6164 + }, + { + "epoch": 0.6998946326511064, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2576, + "step": 6165 + }, + { + "epoch": 0.7000081597610255, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2407, + "step": 6166 + }, + { + "epoch": 0.7001216868709446, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2423, + "step": 6167 + }, + { + "epoch": 0.7002352139808636, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2476, + "step": 6168 + }, + { + "epoch": 0.7003487410907827, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2273, + "step": 6169 + }, + { + "epoch": 0.7004622682007018, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2359, + "step": 6170 + }, + { + "epoch": 0.7005757953106209, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2311, + "step": 6171 + }, + { + "epoch": 0.7006893224205399, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2383, + "step": 6172 + }, + { + "epoch": 0.700802849530459, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 6173 + }, + { + "epoch": 0.7009163766403781, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2446, + "step": 6174 + }, + { + "epoch": 0.7010299037502972, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2264, + "step": 6175 + }, + { + "epoch": 0.7011434308602162, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2312, + "step": 6176 + }, + { + "epoch": 0.7012569579701353, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2238, + "step": 6177 + }, + { + "epoch": 0.7013704850800544, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2237, + "step": 6178 + }, + { + "epoch": 0.7014840121899735, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2507, + "step": 6179 + }, + { + "epoch": 0.7015975392998925, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2243, + "step": 6180 + }, + { + "epoch": 0.7017110664098116, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2432, + "step": 6181 + }, + { + "epoch": 0.7018245935197307, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2354, + "step": 6182 + }, + { + "epoch": 0.7019381206296498, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.249, + "step": 6183 + }, + { + "epoch": 0.7020516477395689, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2265, + "step": 6184 + }, + { + "epoch": 0.7021651748494879, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2604, + "step": 6185 + }, + { + "epoch": 0.702278701959407, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2491, + "step": 6186 + }, + { + "epoch": 0.7023922290693261, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 6187 + }, + { + "epoch": 0.7025057561792452, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 6188 + }, + { + "epoch": 0.7026192832891642, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.229, + "step": 6189 + }, + { + "epoch": 0.7027328103990833, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2454, + "step": 6190 + }, + { + "epoch": 0.7028463375090024, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2199, + "step": 6191 + }, + { + "epoch": 0.7029598646189215, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2349, + "step": 6192 + }, + { + "epoch": 0.7030733917288405, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2488, + "step": 6193 + }, + { + "epoch": 0.7031869188387596, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2423, + "step": 6194 + }, + { + "epoch": 0.7033004459486787, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2295, + "step": 6195 + }, + { + "epoch": 0.7034139730585978, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2474, + "step": 6196 + }, + { + "epoch": 0.7035275001685168, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2513, + "step": 6197 + }, + { + "epoch": 0.7036410272784359, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2557, + "step": 6198 + }, + { + "epoch": 0.703754554388355, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2479, + "step": 6199 + }, + { + "epoch": 0.7038680814982741, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2192, + "step": 6200 + }, + { + "epoch": 0.7039816086081931, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2396, + "step": 6201 + }, + { + "epoch": 0.7040951357181122, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2607, + "step": 6202 + }, + { + "epoch": 0.7042086628280313, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2301, + "step": 6203 + }, + { + "epoch": 0.7043221899379504, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2418, + "step": 6204 + }, + { + "epoch": 0.7044357170478694, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2536, + "step": 6205 + }, + { + "epoch": 0.7045492441577885, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.2456, + "step": 6206 + }, + { + "epoch": 0.7046627712677076, + "grad_norm": 0.466796875, + "learning_rate": 0.002, + "loss": 5.2339, + "step": 6207 + }, + { + "epoch": 0.7047762983776267, + "grad_norm": 0.5078125, + "learning_rate": 0.002, + "loss": 5.2302, + "step": 6208 + }, + { + "epoch": 0.7048898254875458, + "grad_norm": 0.44140625, + "learning_rate": 0.002, + "loss": 5.2312, + "step": 6209 + }, + { + "epoch": 0.7050033525974648, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.2403, + "step": 6210 + }, + { + "epoch": 0.7051168797073839, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2503, + "step": 6211 + }, + { + "epoch": 0.705230406817303, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.234, + "step": 6212 + }, + { + "epoch": 0.705343933927222, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2318, + "step": 6213 + }, + { + "epoch": 0.7054574610371411, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2595, + "step": 6214 + }, + { + "epoch": 0.7055709881470602, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.239, + "step": 6215 + }, + { + "epoch": 0.7056845152569793, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2301, + "step": 6216 + }, + { + "epoch": 0.7057980423668984, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2209, + "step": 6217 + }, + { + "epoch": 0.7059115694768174, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 6218 + }, + { + "epoch": 0.7060250965867365, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2491, + "step": 6219 + }, + { + "epoch": 0.7061386236966556, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2531, + "step": 6220 + }, + { + "epoch": 0.7062521508065747, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2298, + "step": 6221 + }, + { + "epoch": 0.7063656779164937, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2367, + "step": 6222 + }, + { + "epoch": 0.7064792050264128, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2367, + "step": 6223 + }, + { + "epoch": 0.7065927321363319, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2328, + "step": 6224 + }, + { + "epoch": 0.706706259246251, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2607, + "step": 6225 + }, + { + "epoch": 0.70681978635617, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2371, + "step": 6226 + }, + { + "epoch": 0.7069333134660891, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2399, + "step": 6227 + }, + { + "epoch": 0.7070468405760082, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2497, + "step": 6228 + }, + { + "epoch": 0.7071603676859273, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2283, + "step": 6229 + }, + { + "epoch": 0.7072738947958463, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2552, + "step": 6230 + }, + { + "epoch": 0.7073874219057654, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2496, + "step": 6231 + }, + { + "epoch": 0.7075009490156845, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2524, + "step": 6232 + }, + { + "epoch": 0.7076144761256036, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2322, + "step": 6233 + }, + { + "epoch": 0.7077280032355227, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2416, + "step": 6234 + }, + { + "epoch": 0.7078415303454417, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2494, + "step": 6235 + }, + { + "epoch": 0.7079550574553608, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2507, + "step": 6236 + }, + { + "epoch": 0.7080685845652799, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2275, + "step": 6237 + }, + { + "epoch": 0.708182111675199, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2449, + "step": 6238 + }, + { + "epoch": 0.708295638785118, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2474, + "step": 6239 + }, + { + "epoch": 0.7084091658950371, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 6240 + }, + { + "epoch": 0.7085226930049562, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2378, + "step": 6241 + }, + { + "epoch": 0.7086362201148753, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2356, + "step": 6242 + }, + { + "epoch": 0.7087497472247943, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2261, + "step": 6243 + }, + { + "epoch": 0.7088632743347134, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2489, + "step": 6244 + }, + { + "epoch": 0.7089768014446325, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2545, + "step": 6245 + }, + { + "epoch": 0.7090903285545516, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2424, + "step": 6246 + }, + { + "epoch": 0.7092038556644706, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2223, + "step": 6247 + }, + { + "epoch": 0.7093173827743897, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2622, + "step": 6248 + }, + { + "epoch": 0.7094309098843088, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2345, + "step": 6249 + }, + { + "epoch": 0.7095444369942279, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2589, + "step": 6250 + }, + { + "epoch": 0.7096579641041469, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2574, + "step": 6251 + }, + { + "epoch": 0.709771491214066, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2437, + "step": 6252 + }, + { + "epoch": 0.7098850183239851, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.231, + "step": 6253 + }, + { + "epoch": 0.7099985454339042, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2562, + "step": 6254 + }, + { + "epoch": 0.7101120725438232, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2438, + "step": 6255 + }, + { + "epoch": 0.7102255996537423, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2579, + "step": 6256 + }, + { + "epoch": 0.7103391267636614, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2265, + "step": 6257 + }, + { + "epoch": 0.7104526538735805, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2682, + "step": 6258 + }, + { + "epoch": 0.7105661809834996, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2451, + "step": 6259 + }, + { + "epoch": 0.7106797080934186, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2502, + "step": 6260 + }, + { + "epoch": 0.7107932352033377, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2373, + "step": 6261 + }, + { + "epoch": 0.7109067623132568, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.2753, + "step": 6262 + }, + { + "epoch": 0.7110202894231759, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.217, + "step": 6263 + }, + { + "epoch": 0.7111338165330949, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.2446, + "step": 6264 + }, + { + "epoch": 0.711247343643014, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.2324, + "step": 6265 + }, + { + "epoch": 0.7113608707529331, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.2599, + "step": 6266 + }, + { + "epoch": 0.7114743978628522, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.217, + "step": 6267 + }, + { + "epoch": 0.7115879249727712, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2671, + "step": 6268 + }, + { + "epoch": 0.7117014520826903, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2372, + "step": 6269 + }, + { + "epoch": 0.7118149791926094, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.244, + "step": 6270 + }, + { + "epoch": 0.7119285063025285, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2523, + "step": 6271 + }, + { + "epoch": 0.7120420334124475, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2354, + "step": 6272 + }, + { + "epoch": 0.7121555605223666, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2386, + "step": 6273 + }, + { + "epoch": 0.7122690876322857, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 6274 + }, + { + "epoch": 0.7123826147422048, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2166, + "step": 6275 + }, + { + "epoch": 0.7124961418521238, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2319, + "step": 6276 + }, + { + "epoch": 0.7126096689620429, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.25, + "step": 6277 + }, + { + "epoch": 0.712723196071962, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2544, + "step": 6278 + }, + { + "epoch": 0.7128367231818811, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2268, + "step": 6279 + }, + { + "epoch": 0.7129502502918001, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2417, + "step": 6280 + }, + { + "epoch": 0.7130637774017192, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2317, + "step": 6281 + }, + { + "epoch": 0.7131773045116383, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2315, + "step": 6282 + }, + { + "epoch": 0.7132908316215574, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2487, + "step": 6283 + }, + { + "epoch": 0.7134043587314765, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2278, + "step": 6284 + }, + { + "epoch": 0.7135178858413955, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2564, + "step": 6285 + }, + { + "epoch": 0.7136314129513146, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.254, + "step": 6286 + }, + { + "epoch": 0.7137449400612337, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.237, + "step": 6287 + }, + { + "epoch": 0.7138584671711528, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2461, + "step": 6288 + }, + { + "epoch": 0.7139719942810718, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2695, + "step": 6289 + }, + { + "epoch": 0.7140855213909909, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2339, + "step": 6290 + }, + { + "epoch": 0.71419904850091, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2607, + "step": 6291 + }, + { + "epoch": 0.7143125756108291, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2145, + "step": 6292 + }, + { + "epoch": 0.7144261027207481, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2186, + "step": 6293 + }, + { + "epoch": 0.7145396298306672, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2618, + "step": 6294 + }, + { + "epoch": 0.7146531569405863, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2191, + "step": 6295 + }, + { + "epoch": 0.7147666840505054, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2542, + "step": 6296 + }, + { + "epoch": 0.7148802111604244, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2508, + "step": 6297 + }, + { + "epoch": 0.7149937382703435, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2608, + "step": 6298 + }, + { + "epoch": 0.7151072653802626, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2516, + "step": 6299 + }, + { + "epoch": 0.7152207924901817, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2422, + "step": 6300 + }, + { + "epoch": 0.7153343196001007, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2375, + "step": 6301 + }, + { + "epoch": 0.7154478467100198, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2288, + "step": 6302 + }, + { + "epoch": 0.7155613738199389, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.2386, + "step": 6303 + }, + { + "epoch": 0.715674900929858, + "grad_norm": 0.419921875, + "learning_rate": 0.002, + "loss": 5.2415, + "step": 6304 + }, + { + "epoch": 0.715788428039777, + "grad_norm": 0.4296875, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 6305 + }, + { + "epoch": 0.7159019551496961, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 6306 + }, + { + "epoch": 0.7160154822596152, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2535, + "step": 6307 + }, + { + "epoch": 0.7161290093695343, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.247, + "step": 6308 + }, + { + "epoch": 0.7162425364794534, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2234, + "step": 6309 + }, + { + "epoch": 0.7163560635893724, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2215, + "step": 6310 + }, + { + "epoch": 0.7164695906992915, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2295, + "step": 6311 + }, + { + "epoch": 0.7165831178092106, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.2615, + "step": 6312 + }, + { + "epoch": 0.7166966449191297, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2268, + "step": 6313 + }, + { + "epoch": 0.7168101720290487, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2612, + "step": 6314 + }, + { + "epoch": 0.7169236991389678, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2545, + "step": 6315 + }, + { + "epoch": 0.7170372262488869, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2335, + "step": 6316 + }, + { + "epoch": 0.717150753358806, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2281, + "step": 6317 + }, + { + "epoch": 0.717264280468725, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.209, + "step": 6318 + }, + { + "epoch": 0.7173778075786441, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2326, + "step": 6319 + }, + { + "epoch": 0.7174913346885632, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.254, + "step": 6320 + }, + { + "epoch": 0.7176048617984823, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.256, + "step": 6321 + }, + { + "epoch": 0.7177183889084013, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.2453, + "step": 6322 + }, + { + "epoch": 0.7178319160183204, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2529, + "step": 6323 + }, + { + "epoch": 0.7179454431282395, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2234, + "step": 6324 + }, + { + "epoch": 0.7180589702381586, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2272, + "step": 6325 + }, + { + "epoch": 0.7181724973480776, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2367, + "step": 6326 + }, + { + "epoch": 0.7182860244579967, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2425, + "step": 6327 + }, + { + "epoch": 0.7183995515679158, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.243, + "step": 6328 + }, + { + "epoch": 0.7185130786778349, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2307, + "step": 6329 + }, + { + "epoch": 0.718626605787754, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 6330 + }, + { + "epoch": 0.718740132897673, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2287, + "step": 6331 + }, + { + "epoch": 0.7188536600075921, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2418, + "step": 6332 + }, + { + "epoch": 0.7189671871175112, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2604, + "step": 6333 + }, + { + "epoch": 0.7190807142274303, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2445, + "step": 6334 + }, + { + "epoch": 0.7191942413373493, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2534, + "step": 6335 + }, + { + "epoch": 0.7193077684472684, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2262, + "step": 6336 + }, + { + "epoch": 0.7194212955571875, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2634, + "step": 6337 + }, + { + "epoch": 0.7195348226671066, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2402, + "step": 6338 + }, + { + "epoch": 0.7196483497770256, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2427, + "step": 6339 + }, + { + "epoch": 0.7197618768869447, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.25, + "step": 6340 + }, + { + "epoch": 0.7198754039968638, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2438, + "step": 6341 + }, + { + "epoch": 0.7199889311067829, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2458, + "step": 6342 + }, + { + "epoch": 0.7201024582167019, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2153, + "step": 6343 + }, + { + "epoch": 0.720215985326621, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2486, + "step": 6344 + }, + { + "epoch": 0.7203295124365401, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2198, + "step": 6345 + }, + { + "epoch": 0.7204430395464592, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2228, + "step": 6346 + }, + { + "epoch": 0.7205565666563782, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2262, + "step": 6347 + }, + { + "epoch": 0.7206700937662973, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2327, + "step": 6348 + }, + { + "epoch": 0.7207836208762164, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2519, + "step": 6349 + }, + { + "epoch": 0.7208971479861355, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2547, + "step": 6350 + }, + { + "epoch": 0.7210106750960545, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2459, + "step": 6351 + }, + { + "epoch": 0.7211242022059736, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2511, + "step": 6352 + }, + { + "epoch": 0.7212377293158927, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2552, + "step": 6353 + }, + { + "epoch": 0.7213512564258118, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2613, + "step": 6354 + }, + { + "epoch": 0.7214647835357308, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2413, + "step": 6355 + }, + { + "epoch": 0.7215783106456499, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.251, + "step": 6356 + }, + { + "epoch": 0.721691837755569, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2418, + "step": 6357 + }, + { + "epoch": 0.7218053648654881, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2294, + "step": 6358 + }, + { + "epoch": 0.7219188919754072, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2562, + "step": 6359 + }, + { + "epoch": 0.7220324190853262, + "grad_norm": 0.40234375, + "learning_rate": 0.002, + "loss": 5.2279, + "step": 6360 + }, + { + "epoch": 0.7221459461952453, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.2671, + "step": 6361 + }, + { + "epoch": 0.7222594733051644, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.23, + "step": 6362 + }, + { + "epoch": 0.7223730004150835, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2403, + "step": 6363 + }, + { + "epoch": 0.7224865275250025, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2304, + "step": 6364 + }, + { + "epoch": 0.7226000546349216, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.253, + "step": 6365 + }, + { + "epoch": 0.7227135817448407, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2448, + "step": 6366 + }, + { + "epoch": 0.7228271088547598, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2385, + "step": 6367 + }, + { + "epoch": 0.7229406359646788, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2347, + "step": 6368 + }, + { + "epoch": 0.7230541630745979, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2655, + "step": 6369 + }, + { + "epoch": 0.723167690184517, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2436, + "step": 6370 + }, + { + "epoch": 0.7232812172944361, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2494, + "step": 6371 + }, + { + "epoch": 0.7233947444043551, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2626, + "step": 6372 + }, + { + "epoch": 0.7235082715142742, + "grad_norm": 0.3828125, + "learning_rate": 0.002, + "loss": 5.2181, + "step": 6373 + }, + { + "epoch": 0.7236217986241933, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2354, + "step": 6374 + }, + { + "epoch": 0.7237353257341124, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.2315, + "step": 6375 + }, + { + "epoch": 0.7238488528440314, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2571, + "step": 6376 + }, + { + "epoch": 0.7239623799539505, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2371, + "step": 6377 + }, + { + "epoch": 0.7240759070638696, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2275, + "step": 6378 + }, + { + "epoch": 0.7241894341737887, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2283, + "step": 6379 + }, + { + "epoch": 0.7243029612837077, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2403, + "step": 6380 + }, + { + "epoch": 0.7244164883936268, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2546, + "step": 6381 + }, + { + "epoch": 0.7245300155035459, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2404, + "step": 6382 + }, + { + "epoch": 0.724643542613465, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2525, + "step": 6383 + }, + { + "epoch": 0.724757069723384, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2326, + "step": 6384 + }, + { + "epoch": 0.7248705968333031, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2494, + "step": 6385 + }, + { + "epoch": 0.7249841239432222, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2328, + "step": 6386 + }, + { + "epoch": 0.7250976510531413, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2292, + "step": 6387 + }, + { + "epoch": 0.7252111781630604, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2334, + "step": 6388 + }, + { + "epoch": 0.7253247052729794, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2517, + "step": 6389 + }, + { + "epoch": 0.7254382323828985, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.265, + "step": 6390 + }, + { + "epoch": 0.7255517594928176, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2655, + "step": 6391 + }, + { + "epoch": 0.7256652866027367, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2321, + "step": 6392 + }, + { + "epoch": 0.7257788137126557, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2326, + "step": 6393 + }, + { + "epoch": 0.7258923408225748, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2581, + "step": 6394 + }, + { + "epoch": 0.7260058679324939, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.245, + "step": 6395 + }, + { + "epoch": 0.7261193950424131, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.254, + "step": 6396 + }, + { + "epoch": 0.7262329221523321, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.249, + "step": 6397 + }, + { + "epoch": 0.7263464492622512, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2758, + "step": 6398 + }, + { + "epoch": 0.7264599763721703, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2435, + "step": 6399 + }, + { + "epoch": 0.7265735034820894, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2326, + "step": 6400 + }, + { + "epoch": 0.7266870305920085, + "grad_norm": 0.396484375, + "learning_rate": 0.002, + "loss": 5.2434, + "step": 6401 + }, + { + "epoch": 0.7268005577019275, + "grad_norm": 0.404296875, + "learning_rate": 0.002, + "loss": 5.2529, + "step": 6402 + }, + { + "epoch": 0.7269140848118466, + "grad_norm": 0.451171875, + "learning_rate": 0.002, + "loss": 5.2284, + "step": 6403 + }, + { + "epoch": 0.7270276119217657, + "grad_norm": 0.458984375, + "learning_rate": 0.002, + "loss": 5.2504, + "step": 6404 + }, + { + "epoch": 0.7271411390316848, + "grad_norm": 0.42578125, + "learning_rate": 0.002, + "loss": 5.2474, + "step": 6405 + }, + { + "epoch": 0.7272546661416038, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2369, + "step": 6406 + }, + { + "epoch": 0.7273681932515229, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.236, + "step": 6407 + }, + { + "epoch": 0.727481720361442, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2436, + "step": 6408 + }, + { + "epoch": 0.7275952474713611, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2522, + "step": 6409 + }, + { + "epoch": 0.7277087745812801, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2544, + "step": 6410 + }, + { + "epoch": 0.7278223016911992, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.223, + "step": 6411 + }, + { + "epoch": 0.7279358288011183, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2486, + "step": 6412 + }, + { + "epoch": 0.7280493559110374, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2143, + "step": 6413 + }, + { + "epoch": 0.7281628830209564, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2352, + "step": 6414 + }, + { + "epoch": 0.7282764101308755, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2504, + "step": 6415 + }, + { + "epoch": 0.7283899372407946, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2362, + "step": 6416 + }, + { + "epoch": 0.7285034643507137, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.2259, + "step": 6417 + }, + { + "epoch": 0.7286169914606327, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2635, + "step": 6418 + }, + { + "epoch": 0.7287305185705518, + "grad_norm": 0.2275390625, + "learning_rate": 0.002, + "loss": 5.2481, + "step": 6419 + }, + { + "epoch": 0.7288440456804709, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.2477, + "step": 6420 + }, + { + "epoch": 0.72895757279039, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2511, + "step": 6421 + }, + { + "epoch": 0.729071099900309, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2473, + "step": 6422 + }, + { + "epoch": 0.7291846270102281, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 6423 + }, + { + "epoch": 0.7292981541201472, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2284, + "step": 6424 + }, + { + "epoch": 0.7294116812300663, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2597, + "step": 6425 + }, + { + "epoch": 0.7295252083399854, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2468, + "step": 6426 + }, + { + "epoch": 0.7296387354499044, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2568, + "step": 6427 + }, + { + "epoch": 0.7297522625598235, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.248, + "step": 6428 + }, + { + "epoch": 0.7298657896697426, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2468, + "step": 6429 + }, + { + "epoch": 0.7299793167796617, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2374, + "step": 6430 + }, + { + "epoch": 0.7300928438895807, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2322, + "step": 6431 + }, + { + "epoch": 0.7302063709994998, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.249, + "step": 6432 + }, + { + "epoch": 0.7303198981094189, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2433, + "step": 6433 + }, + { + "epoch": 0.730433425219338, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2502, + "step": 6434 + }, + { + "epoch": 0.730546952329257, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2283, + "step": 6435 + }, + { + "epoch": 0.7306604794391761, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2358, + "step": 6436 + }, + { + "epoch": 0.7307740065490952, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2377, + "step": 6437 + }, + { + "epoch": 0.7308875336590143, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2598, + "step": 6438 + }, + { + "epoch": 0.7310010607689333, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2473, + "step": 6439 + }, + { + "epoch": 0.7311145878788524, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2511, + "step": 6440 + }, + { + "epoch": 0.7312281149887715, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2519, + "step": 6441 + }, + { + "epoch": 0.7313416420986906, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2497, + "step": 6442 + }, + { + "epoch": 0.7314551692086096, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2443, + "step": 6443 + }, + { + "epoch": 0.7315686963185287, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2332, + "step": 6444 + }, + { + "epoch": 0.7316822234284478, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2344, + "step": 6445 + }, + { + "epoch": 0.7317957505383669, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 6446 + }, + { + "epoch": 0.731909277648286, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2576, + "step": 6447 + }, + { + "epoch": 0.732022804758205, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2264, + "step": 6448 + }, + { + "epoch": 0.7321363318681241, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2429, + "step": 6449 + }, + { + "epoch": 0.7322498589780432, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.256, + "step": 6450 + }, + { + "epoch": 0.7323633860879623, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2203, + "step": 6451 + }, + { + "epoch": 0.7324769131978813, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.236, + "step": 6452 + }, + { + "epoch": 0.7325904403078004, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2515, + "step": 6453 + }, + { + "epoch": 0.7327039674177195, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2408, + "step": 6454 + }, + { + "epoch": 0.7328174945276386, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2276, + "step": 6455 + }, + { + "epoch": 0.7329310216375576, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2239, + "step": 6456 + }, + { + "epoch": 0.7330445487474767, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2579, + "step": 6457 + }, + { + "epoch": 0.7331580758573958, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2555, + "step": 6458 + }, + { + "epoch": 0.7332716029673149, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2549, + "step": 6459 + }, + { + "epoch": 0.7333851300772339, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.2386, + "step": 6460 + }, + { + "epoch": 0.733498657187153, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2336, + "step": 6461 + }, + { + "epoch": 0.7336121842970721, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2418, + "step": 6462 + }, + { + "epoch": 0.7337257114069912, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2433, + "step": 6463 + }, + { + "epoch": 0.7338392385169102, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2411, + "step": 6464 + }, + { + "epoch": 0.7339527656268293, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2435, + "step": 6465 + }, + { + "epoch": 0.7340662927367484, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.235, + "step": 6466 + }, + { + "epoch": 0.7341798198466675, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.237, + "step": 6467 + }, + { + "epoch": 0.7342933469565865, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2561, + "step": 6468 + }, + { + "epoch": 0.7344068740665056, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2398, + "step": 6469 + }, + { + "epoch": 0.7345204011764247, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.231, + "step": 6470 + }, + { + "epoch": 0.7346339282863438, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2583, + "step": 6471 + }, + { + "epoch": 0.7347474553962628, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2392, + "step": 6472 + }, + { + "epoch": 0.7348609825061819, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2508, + "step": 6473 + }, + { + "epoch": 0.734974509616101, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2592, + "step": 6474 + }, + { + "epoch": 0.7350880367260201, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2527, + "step": 6475 + }, + { + "epoch": 0.7352015638359392, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2305, + "step": 6476 + }, + { + "epoch": 0.7353150909458582, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.246, + "step": 6477 + }, + { + "epoch": 0.7354286180557773, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2484, + "step": 6478 + }, + { + "epoch": 0.7355421451656964, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.234, + "step": 6479 + }, + { + "epoch": 0.7356556722756155, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2496, + "step": 6480 + }, + { + "epoch": 0.7357691993855345, + "grad_norm": 0.244140625, + "learning_rate": 0.002, + "loss": 5.2584, + "step": 6481 + }, + { + "epoch": 0.7358827264954536, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2369, + "step": 6482 + }, + { + "epoch": 0.7359962536053727, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2481, + "step": 6483 + }, + { + "epoch": 0.7361097807152918, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.262, + "step": 6484 + }, + { + "epoch": 0.7362233078252108, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2611, + "step": 6485 + }, + { + "epoch": 0.7363368349351299, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2396, + "step": 6486 + }, + { + "epoch": 0.736450362045049, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2505, + "step": 6487 + }, + { + "epoch": 0.7365638891549681, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2574, + "step": 6488 + }, + { + "epoch": 0.7366774162648871, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2449, + "step": 6489 + }, + { + "epoch": 0.7367909433748062, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2528, + "step": 6490 + }, + { + "epoch": 0.7369044704847253, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2301, + "step": 6491 + }, + { + "epoch": 0.7370179975946444, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2427, + "step": 6492 + }, + { + "epoch": 0.7371315247045634, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2409, + "step": 6493 + }, + { + "epoch": 0.7372450518144825, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2462, + "step": 6494 + }, + { + "epoch": 0.7373585789244016, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2261, + "step": 6495 + }, + { + "epoch": 0.7374721060343207, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2608, + "step": 6496 + }, + { + "epoch": 0.7375856331442397, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2577, + "step": 6497 + }, + { + "epoch": 0.7376991602541588, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2387, + "step": 6498 + }, + { + "epoch": 0.7378126873640779, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2269, + "step": 6499 + }, + { + "epoch": 0.737926214473997, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2523, + "step": 6500 + }, + { + "epoch": 0.738039741583916, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2218, + "step": 6501 + }, + { + "epoch": 0.7381532686938351, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2441, + "step": 6502 + }, + { + "epoch": 0.7382667958037542, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2375, + "step": 6503 + }, + { + "epoch": 0.7383803229136733, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2305, + "step": 6504 + }, + { + "epoch": 0.7384938500235924, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2231, + "step": 6505 + }, + { + "epoch": 0.7386073771335114, + "grad_norm": 0.427734375, + "learning_rate": 0.002, + "loss": 5.2292, + "step": 6506 + }, + { + "epoch": 0.7387209042434305, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.2048, + "step": 6507 + }, + { + "epoch": 0.7388344313533496, + "grad_norm": 0.392578125, + "learning_rate": 0.002, + "loss": 5.2132, + "step": 6508 + }, + { + "epoch": 0.7389479584632687, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2315, + "step": 6509 + }, + { + "epoch": 0.7390614855731877, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2397, + "step": 6510 + }, + { + "epoch": 0.7391750126831068, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2287, + "step": 6511 + }, + { + "epoch": 0.7392885397930259, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2405, + "step": 6512 + }, + { + "epoch": 0.739402066902945, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2423, + "step": 6513 + }, + { + "epoch": 0.739515594012864, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2387, + "step": 6514 + }, + { + "epoch": 0.7396291211227831, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2253, + "step": 6515 + }, + { + "epoch": 0.7397426482327022, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2197, + "step": 6516 + }, + { + "epoch": 0.7398561753426213, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2267, + "step": 6517 + }, + { + "epoch": 0.7399697024525403, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2124, + "step": 6518 + }, + { + "epoch": 0.7400832295624594, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2582, + "step": 6519 + }, + { + "epoch": 0.7401967566723785, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2414, + "step": 6520 + }, + { + "epoch": 0.7403102837822976, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.235, + "step": 6521 + }, + { + "epoch": 0.7404238108922166, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2377, + "step": 6522 + }, + { + "epoch": 0.7405373380021357, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2338, + "step": 6523 + }, + { + "epoch": 0.7406508651120548, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2497, + "step": 6524 + }, + { + "epoch": 0.7407643922219739, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2183, + "step": 6525 + }, + { + "epoch": 0.740877919331893, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2492, + "step": 6526 + }, + { + "epoch": 0.740991446441812, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2358, + "step": 6527 + }, + { + "epoch": 0.7411049735517311, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2461, + "step": 6528 + }, + { + "epoch": 0.7412185006616502, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2402, + "step": 6529 + }, + { + "epoch": 0.7413320277715693, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2408, + "step": 6530 + }, + { + "epoch": 0.7414455548814883, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2309, + "step": 6531 + }, + { + "epoch": 0.7415590819914074, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2614, + "step": 6532 + }, + { + "epoch": 0.7416726091013265, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2371, + "step": 6533 + }, + { + "epoch": 0.7417861362112456, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2419, + "step": 6534 + }, + { + "epoch": 0.7418996633211646, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2457, + "step": 6535 + }, + { + "epoch": 0.7420131904310837, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2353, + "step": 6536 + }, + { + "epoch": 0.7421267175410028, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2268, + "step": 6537 + }, + { + "epoch": 0.7422402446509219, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2537, + "step": 6538 + }, + { + "epoch": 0.7423537717608409, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2176, + "step": 6539 + }, + { + "epoch": 0.74246729887076, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2293, + "step": 6540 + }, + { + "epoch": 0.7425808259806791, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2393, + "step": 6541 + }, + { + "epoch": 0.7426943530905982, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.2599, + "step": 6542 + }, + { + "epoch": 0.7428078802005172, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.2475, + "step": 6543 + }, + { + "epoch": 0.7429214073104363, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.2291, + "step": 6544 + }, + { + "epoch": 0.7430349344203554, + "grad_norm": 0.451171875, + "learning_rate": 0.002, + "loss": 5.244, + "step": 6545 + }, + { + "epoch": 0.7431484615302745, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2286, + "step": 6546 + }, + { + "epoch": 0.7432619886401935, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.2353, + "step": 6547 + }, + { + "epoch": 0.7433755157501126, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2229, + "step": 6548 + }, + { + "epoch": 0.7434890428600317, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2331, + "step": 6549 + }, + { + "epoch": 0.7436025699699508, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2244, + "step": 6550 + }, + { + "epoch": 0.7437160970798699, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2461, + "step": 6551 + }, + { + "epoch": 0.7438296241897889, + "grad_norm": 0.248046875, + "learning_rate": 0.002, + "loss": 5.2289, + "step": 6552 + }, + { + "epoch": 0.743943151299708, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2444, + "step": 6553 + }, + { + "epoch": 0.7440566784096271, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2608, + "step": 6554 + }, + { + "epoch": 0.7441702055195462, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.2246, + "step": 6555 + }, + { + "epoch": 0.7442837326294652, + "grad_norm": 0.2392578125, + "learning_rate": 0.002, + "loss": 5.2412, + "step": 6556 + }, + { + "epoch": 0.7443972597393843, + "grad_norm": 0.2412109375, + "learning_rate": 0.002, + "loss": 5.2227, + "step": 6557 + }, + { + "epoch": 0.7445107868493034, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.2438, + "step": 6558 + }, + { + "epoch": 0.7446243139592225, + "grad_norm": 0.234375, + "learning_rate": 0.002, + "loss": 5.2306, + "step": 6559 + }, + { + "epoch": 0.7447378410691415, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2453, + "step": 6560 + }, + { + "epoch": 0.7448513681790606, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.229, + "step": 6561 + }, + { + "epoch": 0.7449648952889797, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2272, + "step": 6562 + }, + { + "epoch": 0.7450784223988988, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2371, + "step": 6563 + }, + { + "epoch": 0.7451919495088178, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2373, + "step": 6564 + }, + { + "epoch": 0.7453054766187369, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2289, + "step": 6565 + }, + { + "epoch": 0.745419003728656, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.224, + "step": 6566 + }, + { + "epoch": 0.7455325308385751, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2315, + "step": 6567 + }, + { + "epoch": 0.7456460579484941, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2401, + "step": 6568 + }, + { + "epoch": 0.7457595850584132, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.241, + "step": 6569 + }, + { + "epoch": 0.7458731121683323, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2505, + "step": 6570 + }, + { + "epoch": 0.7459866392782514, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2421, + "step": 6571 + }, + { + "epoch": 0.7461001663881704, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2558, + "step": 6572 + }, + { + "epoch": 0.7462136934980895, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2396, + "step": 6573 + }, + { + "epoch": 0.7463272206080086, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2588, + "step": 6574 + }, + { + "epoch": 0.7464407477179277, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 6575 + }, + { + "epoch": 0.7465542748278468, + "grad_norm": 0.466796875, + "learning_rate": 0.002, + "loss": 5.2517, + "step": 6576 + }, + { + "epoch": 0.7466678019377658, + "grad_norm": 0.44921875, + "learning_rate": 0.002, + "loss": 5.2457, + "step": 6577 + }, + { + "epoch": 0.7467813290476849, + "grad_norm": 0.4453125, + "learning_rate": 0.002, + "loss": 5.2204, + "step": 6578 + }, + { + "epoch": 0.746894856157604, + "grad_norm": 0.4609375, + "learning_rate": 0.002, + "loss": 5.24, + "step": 6579 + }, + { + "epoch": 0.7470083832675231, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.2324, + "step": 6580 + }, + { + "epoch": 0.7471219103774421, + "grad_norm": 0.3984375, + "learning_rate": 0.002, + "loss": 5.2342, + "step": 6581 + }, + { + "epoch": 0.7472354374873612, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2397, + "step": 6582 + }, + { + "epoch": 0.7473489645972803, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2179, + "step": 6583 + }, + { + "epoch": 0.7474624917071994, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2565, + "step": 6584 + }, + { + "epoch": 0.7475760188171184, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2235, + "step": 6585 + }, + { + "epoch": 0.7476895459270375, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2388, + "step": 6586 + }, + { + "epoch": 0.7478030730369566, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.232, + "step": 6587 + }, + { + "epoch": 0.7479166001468757, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2668, + "step": 6588 + }, + { + "epoch": 0.7480301272567947, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2487, + "step": 6589 + }, + { + "epoch": 0.7481436543667138, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2537, + "step": 6590 + }, + { + "epoch": 0.7482571814766329, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.2132, + "step": 6591 + }, + { + "epoch": 0.748370708586552, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2297, + "step": 6592 + }, + { + "epoch": 0.748484235696471, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2187, + "step": 6593 + }, + { + "epoch": 0.7485977628063901, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2564, + "step": 6594 + }, + { + "epoch": 0.7487112899163092, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2328, + "step": 6595 + }, + { + "epoch": 0.7488248170262283, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.2347, + "step": 6596 + }, + { + "epoch": 0.7489383441361473, + "grad_norm": 0.23828125, + "learning_rate": 0.002, + "loss": 5.2322, + "step": 6597 + }, + { + "epoch": 0.7490518712460664, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.224, + "step": 6598 + }, + { + "epoch": 0.7491653983559855, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2275, + "step": 6599 + }, + { + "epoch": 0.7492789254659046, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.242, + "step": 6600 + }, + { + "epoch": 0.7493924525758237, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2326, + "step": 6601 + }, + { + "epoch": 0.7495059796857427, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2316, + "step": 6602 + }, + { + "epoch": 0.7496195067956618, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2438, + "step": 6603 + }, + { + "epoch": 0.7497330339055809, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.2464, + "step": 6604 + }, + { + "epoch": 0.7498465610155, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2058, + "step": 6605 + }, + { + "epoch": 0.749960088125419, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2333, + "step": 6606 + }, + { + "epoch": 0.7500736152353381, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2363, + "step": 6607 + }, + { + "epoch": 0.7501871423452572, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2124, + "step": 6608 + }, + { + "epoch": 0.7503006694551763, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2463, + "step": 6609 + }, + { + "epoch": 0.7504141965650953, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2565, + "step": 6610 + }, + { + "epoch": 0.7505277236750144, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2413, + "step": 6611 + }, + { + "epoch": 0.7506412507849335, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2392, + "step": 6612 + }, + { + "epoch": 0.7507547778948526, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2116, + "step": 6613 + }, + { + "epoch": 0.7508683050047716, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2397, + "step": 6614 + }, + { + "epoch": 0.7509818321146907, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2337, + "step": 6615 + }, + { + "epoch": 0.7510953592246098, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2508, + "step": 6616 + }, + { + "epoch": 0.7512088863345289, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2494, + "step": 6617 + }, + { + "epoch": 0.751322413444448, + "grad_norm": 0.248046875, + "learning_rate": 0.002, + "loss": 5.2424, + "step": 6618 + }, + { + "epoch": 0.751435940554367, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2054, + "step": 6619 + }, + { + "epoch": 0.7515494676642861, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2499, + "step": 6620 + }, + { + "epoch": 0.7516629947742052, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2385, + "step": 6621 + }, + { + "epoch": 0.7517765218841242, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2374, + "step": 6622 + }, + { + "epoch": 0.7518900489940433, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2377, + "step": 6623 + }, + { + "epoch": 0.7520035761039624, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2351, + "step": 6624 + }, + { + "epoch": 0.7521171032138815, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.227, + "step": 6625 + }, + { + "epoch": 0.7522306303238006, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2301, + "step": 6626 + }, + { + "epoch": 0.7523441574337196, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.237, + "step": 6627 + }, + { + "epoch": 0.7524576845436387, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2243, + "step": 6628 + }, + { + "epoch": 0.7525712116535578, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.257, + "step": 6629 + }, + { + "epoch": 0.7526847387634769, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2434, + "step": 6630 + }, + { + "epoch": 0.7527982658733959, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2383, + "step": 6631 + }, + { + "epoch": 0.752911792983315, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2206, + "step": 6632 + }, + { + "epoch": 0.7530253200932341, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2558, + "step": 6633 + }, + { + "epoch": 0.7531388472031532, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2568, + "step": 6634 + }, + { + "epoch": 0.7532523743130722, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2483, + "step": 6635 + }, + { + "epoch": 0.7533659014229913, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2293, + "step": 6636 + }, + { + "epoch": 0.7534794285329105, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2456, + "step": 6637 + }, + { + "epoch": 0.7535929556428296, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2091, + "step": 6638 + }, + { + "epoch": 0.7537064827527487, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2541, + "step": 6639 + }, + { + "epoch": 0.7538200098626677, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2416, + "step": 6640 + }, + { + "epoch": 0.7539335369725868, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2353, + "step": 6641 + }, + { + "epoch": 0.7540470640825059, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 6642 + }, + { + "epoch": 0.754160591192425, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 6643 + }, + { + "epoch": 0.754274118302344, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2283, + "step": 6644 + }, + { + "epoch": 0.7543876454122631, + "grad_norm": 0.2373046875, + "learning_rate": 0.002, + "loss": 5.2423, + "step": 6645 + }, + { + "epoch": 0.7545011725221822, + "grad_norm": 0.240234375, + "learning_rate": 0.002, + "loss": 5.2412, + "step": 6646 + }, + { + "epoch": 0.7546146996321013, + "grad_norm": 0.236328125, + "learning_rate": 0.002, + "loss": 5.2557, + "step": 6647 + }, + { + "epoch": 0.7547282267420203, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.229, + "step": 6648 + }, + { + "epoch": 0.7548417538519394, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2573, + "step": 6649 + }, + { + "epoch": 0.7549552809618585, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2189, + "step": 6650 + }, + { + "epoch": 0.7550688080717776, + "grad_norm": 0.35546875, + "learning_rate": 0.002, + "loss": 5.2383, + "step": 6651 + }, + { + "epoch": 0.7551823351816966, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.2595, + "step": 6652 + }, + { + "epoch": 0.7552958622916157, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2155, + "step": 6653 + }, + { + "epoch": 0.7554093894015348, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2134, + "step": 6654 + }, + { + "epoch": 0.7555229165114539, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2423, + "step": 6655 + }, + { + "epoch": 0.7556364436213729, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2403, + "step": 6656 + }, + { + "epoch": 0.755749970731292, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2458, + "step": 6657 + }, + { + "epoch": 0.7558634978412111, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2019, + "step": 6658 + }, + { + "epoch": 0.7559770249511302, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2393, + "step": 6659 + }, + { + "epoch": 0.7560905520610492, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2406, + "step": 6660 + }, + { + "epoch": 0.7562040791709683, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2323, + "step": 6661 + }, + { + "epoch": 0.7563176062808874, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2272, + "step": 6662 + }, + { + "epoch": 0.7564311333908065, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2457, + "step": 6663 + }, + { + "epoch": 0.7565446605007256, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2305, + "step": 6664 + }, + { + "epoch": 0.7566581876106446, + "grad_norm": 0.2490234375, + "learning_rate": 0.002, + "loss": 5.2457, + "step": 6665 + }, + { + "epoch": 0.7567717147205637, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.24, + "step": 6666 + }, + { + "epoch": 0.7568852418304828, + "grad_norm": 0.24609375, + "learning_rate": 0.002, + "loss": 5.2461, + "step": 6667 + }, + { + "epoch": 0.7569987689404019, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2423, + "step": 6668 + }, + { + "epoch": 0.7571122960503209, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2406, + "step": 6669 + }, + { + "epoch": 0.75722582316024, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2573, + "step": 6670 + }, + { + "epoch": 0.7573393502701591, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2423, + "step": 6671 + }, + { + "epoch": 0.7574528773800782, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2429, + "step": 6672 + }, + { + "epoch": 0.7575664044899972, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2299, + "step": 6673 + }, + { + "epoch": 0.7576799315999163, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2371, + "step": 6674 + }, + { + "epoch": 0.7577934587098354, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2248, + "step": 6675 + }, + { + "epoch": 0.7579069858197545, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2463, + "step": 6676 + }, + { + "epoch": 0.7580205129296735, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.248, + "step": 6677 + }, + { + "epoch": 0.7581340400395926, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2227, + "step": 6678 + }, + { + "epoch": 0.7582475671495117, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2248, + "step": 6679 + }, + { + "epoch": 0.7583610942594308, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.229, + "step": 6680 + }, + { + "epoch": 0.7584746213693498, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2353, + "step": 6681 + }, + { + "epoch": 0.7585881484792689, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2508, + "step": 6682 + }, + { + "epoch": 0.758701675589188, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2359, + "step": 6683 + }, + { + "epoch": 0.7588152026991071, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2327, + "step": 6684 + }, + { + "epoch": 0.7589287298090261, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2302, + "step": 6685 + }, + { + "epoch": 0.7590422569189452, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2441, + "step": 6686 + }, + { + "epoch": 0.7591557840288643, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.231, + "step": 6687 + }, + { + "epoch": 0.7592693111387834, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2483, + "step": 6688 + }, + { + "epoch": 0.7593828382487025, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.231, + "step": 6689 + }, + { + "epoch": 0.7594963653586215, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2369, + "step": 6690 + }, + { + "epoch": 0.7596098924685406, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2421, + "step": 6691 + }, + { + "epoch": 0.7597234195784597, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.238, + "step": 6692 + }, + { + "epoch": 0.7598369466883788, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2282, + "step": 6693 + }, + { + "epoch": 0.7599504737982978, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2171, + "step": 6694 + }, + { + "epoch": 0.7600640009082169, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2314, + "step": 6695 + }, + { + "epoch": 0.760177528018136, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2506, + "step": 6696 + }, + { + "epoch": 0.7602910551280551, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2679, + "step": 6697 + }, + { + "epoch": 0.7604045822379741, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2272, + "step": 6698 + }, + { + "epoch": 0.7605181093478932, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2184, + "step": 6699 + }, + { + "epoch": 0.7606316364578123, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2413, + "step": 6700 + }, + { + "epoch": 0.7607451635677314, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2358, + "step": 6701 + }, + { + "epoch": 0.7608586906776504, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2341, + "step": 6702 + }, + { + "epoch": 0.7609722177875695, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2514, + "step": 6703 + }, + { + "epoch": 0.7610857448974886, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.2317, + "step": 6704 + }, + { + "epoch": 0.7611992720074077, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.2383, + "step": 6705 + }, + { + "epoch": 0.7613127991173267, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.2513, + "step": 6706 + }, + { + "epoch": 0.7614263262272458, + "grad_norm": 0.431640625, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 6707 + }, + { + "epoch": 0.7615398533371649, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.2501, + "step": 6708 + }, + { + "epoch": 0.761653380447084, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2264, + "step": 6709 + }, + { + "epoch": 0.761766907557003, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2307, + "step": 6710 + }, + { + "epoch": 0.7618804346669221, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2472, + "step": 6711 + }, + { + "epoch": 0.7619939617768412, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2241, + "step": 6712 + }, + { + "epoch": 0.7621074888867603, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2492, + "step": 6713 + }, + { + "epoch": 0.7622210159966794, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2182, + "step": 6714 + }, + { + "epoch": 0.7623345431065984, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2345, + "step": 6715 + }, + { + "epoch": 0.7624480702165175, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.235, + "step": 6716 + }, + { + "epoch": 0.7625615973264366, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2307, + "step": 6717 + }, + { + "epoch": 0.7626751244363557, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2234, + "step": 6718 + }, + { + "epoch": 0.7627886515462747, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 6719 + }, + { + "epoch": 0.7629021786561938, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.237, + "step": 6720 + }, + { + "epoch": 0.7630157057661129, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 6721 + }, + { + "epoch": 0.763129232876032, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2178, + "step": 6722 + }, + { + "epoch": 0.763242759985951, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.223, + "step": 6723 + }, + { + "epoch": 0.7633562870958701, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2444, + "step": 6724 + }, + { + "epoch": 0.7634698142057892, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2394, + "step": 6725 + }, + { + "epoch": 0.7635833413157083, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.234, + "step": 6726 + }, + { + "epoch": 0.7636968684256273, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2378, + "step": 6727 + }, + { + "epoch": 0.7638103955355464, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2362, + "step": 6728 + }, + { + "epoch": 0.7639239226454655, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.218, + "step": 6729 + }, + { + "epoch": 0.7640374497553846, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2459, + "step": 6730 + }, + { + "epoch": 0.7641509768653036, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2241, + "step": 6731 + }, + { + "epoch": 0.7642645039752227, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2279, + "step": 6732 + }, + { + "epoch": 0.7643780310851418, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2338, + "step": 6733 + }, + { + "epoch": 0.7644915581950609, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2399, + "step": 6734 + }, + { + "epoch": 0.76460508530498, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2254, + "step": 6735 + }, + { + "epoch": 0.764718612414899, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2402, + "step": 6736 + }, + { + "epoch": 0.7648321395248181, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2512, + "step": 6737 + }, + { + "epoch": 0.7649456666347372, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2268, + "step": 6738 + }, + { + "epoch": 0.7650591937446563, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2291, + "step": 6739 + }, + { + "epoch": 0.7651727208545753, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2577, + "step": 6740 + }, + { + "epoch": 0.7652862479644944, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2588, + "step": 6741 + }, + { + "epoch": 0.7653997750744135, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2401, + "step": 6742 + }, + { + "epoch": 0.7655133021843326, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2435, + "step": 6743 + }, + { + "epoch": 0.7656268292942516, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2362, + "step": 6744 + }, + { + "epoch": 0.7657403564041707, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2474, + "step": 6745 + }, + { + "epoch": 0.7658538835140898, + "grad_norm": 0.357421875, + "learning_rate": 0.002, + "loss": 5.2224, + "step": 6746 + }, + { + "epoch": 0.7659674106240089, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2369, + "step": 6747 + }, + { + "epoch": 0.7660809377339279, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2333, + "step": 6748 + }, + { + "epoch": 0.766194464843847, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2431, + "step": 6749 + }, + { + "epoch": 0.7663079919537661, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2459, + "step": 6750 + }, + { + "epoch": 0.7664215190636852, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2325, + "step": 6751 + }, + { + "epoch": 0.7665350461736042, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2352, + "step": 6752 + }, + { + "epoch": 0.7666485732835233, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2107, + "step": 6753 + }, + { + "epoch": 0.7667621003934424, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2366, + "step": 6754 + }, + { + "epoch": 0.7668756275033615, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2143, + "step": 6755 + }, + { + "epoch": 0.7669891546132805, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.231, + "step": 6756 + }, + { + "epoch": 0.7671026817231996, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2416, + "step": 6757 + }, + { + "epoch": 0.7672162088331187, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.229, + "step": 6758 + }, + { + "epoch": 0.7673297359430378, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2533, + "step": 6759 + }, + { + "epoch": 0.7674432630529568, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2351, + "step": 6760 + }, + { + "epoch": 0.7675567901628759, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2277, + "step": 6761 + }, + { + "epoch": 0.767670317272795, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2395, + "step": 6762 + }, + { + "epoch": 0.7677838443827141, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.242, + "step": 6763 + }, + { + "epoch": 0.7678973714926332, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2243, + "step": 6764 + }, + { + "epoch": 0.7680108986025522, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2223, + "step": 6765 + }, + { + "epoch": 0.7681244257124713, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2535, + "step": 6766 + }, + { + "epoch": 0.7682379528223904, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2203, + "step": 6767 + }, + { + "epoch": 0.7683514799323095, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.223, + "step": 6768 + }, + { + "epoch": 0.7684650070422285, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2298, + "step": 6769 + }, + { + "epoch": 0.7685785341521476, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.235, + "step": 6770 + }, + { + "epoch": 0.7686920612620667, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2343, + "step": 6771 + }, + { + "epoch": 0.7688055883719858, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.222, + "step": 6772 + }, + { + "epoch": 0.7689191154819048, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2294, + "step": 6773 + }, + { + "epoch": 0.7690326425918239, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2549, + "step": 6774 + }, + { + "epoch": 0.769146169701743, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2303, + "step": 6775 + }, + { + "epoch": 0.7692596968116621, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2419, + "step": 6776 + }, + { + "epoch": 0.7693732239215811, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2335, + "step": 6777 + }, + { + "epoch": 0.7694867510315002, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.241, + "step": 6778 + }, + { + "epoch": 0.7696002781414193, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2332, + "step": 6779 + }, + { + "epoch": 0.7697138052513384, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.237, + "step": 6780 + }, + { + "epoch": 0.7698273323612574, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2282, + "step": 6781 + }, + { + "epoch": 0.7699408594711765, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2345, + "step": 6782 + }, + { + "epoch": 0.7700543865810956, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 6783 + }, + { + "epoch": 0.7701679136910147, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2382, + "step": 6784 + }, + { + "epoch": 0.7702814408009337, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2788, + "step": 6785 + }, + { + "epoch": 0.7703949679108528, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.238, + "step": 6786 + }, + { + "epoch": 0.7705084950207719, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.244, + "step": 6787 + }, + { + "epoch": 0.770622022130691, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2475, + "step": 6788 + }, + { + "epoch": 0.77073554924061, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2095, + "step": 6789 + }, + { + "epoch": 0.7708490763505291, + "grad_norm": 0.25390625, + "learning_rate": 0.002, + "loss": 5.2081, + "step": 6790 + }, + { + "epoch": 0.7709626034604482, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2202, + "step": 6791 + }, + { + "epoch": 0.7710761305703673, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2479, + "step": 6792 + }, + { + "epoch": 0.7711896576802864, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2461, + "step": 6793 + }, + { + "epoch": 0.7713031847902054, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2277, + "step": 6794 + }, + { + "epoch": 0.7714167119001245, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.234, + "step": 6795 + }, + { + "epoch": 0.7715302390100436, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2404, + "step": 6796 + }, + { + "epoch": 0.7716437661199627, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2266, + "step": 6797 + }, + { + "epoch": 0.7717572932298817, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2344, + "step": 6798 + }, + { + "epoch": 0.7718708203398008, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2377, + "step": 6799 + }, + { + "epoch": 0.7719843474497199, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2376, + "step": 6800 + }, + { + "epoch": 0.772097874559639, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2333, + "step": 6801 + }, + { + "epoch": 0.772211401669558, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2518, + "step": 6802 + }, + { + "epoch": 0.7723249287794771, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2407, + "step": 6803 + }, + { + "epoch": 0.7724384558893962, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2413, + "step": 6804 + }, + { + "epoch": 0.7725519829993153, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2209, + "step": 6805 + }, + { + "epoch": 0.7726655101092343, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2597, + "step": 6806 + }, + { + "epoch": 0.7727790372191534, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2248, + "step": 6807 + }, + { + "epoch": 0.7728925643290725, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2286, + "step": 6808 + }, + { + "epoch": 0.7730060914389916, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2331, + "step": 6809 + }, + { + "epoch": 0.7731196185489106, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2193, + "step": 6810 + }, + { + "epoch": 0.7732331456588297, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2528, + "step": 6811 + }, + { + "epoch": 0.7733466727687488, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2491, + "step": 6812 + }, + { + "epoch": 0.7734601998786679, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2199, + "step": 6813 + }, + { + "epoch": 0.773573726988587, + "grad_norm": 0.296875, + "learning_rate": 0.002, + "loss": 5.2408, + "step": 6814 + }, + { + "epoch": 0.773687254098506, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2329, + "step": 6815 + }, + { + "epoch": 0.7738007812084251, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2513, + "step": 6816 + }, + { + "epoch": 0.7739143083183442, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2356, + "step": 6817 + }, + { + "epoch": 0.7740278354282633, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2374, + "step": 6818 + }, + { + "epoch": 0.7741413625381823, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2206, + "step": 6819 + }, + { + "epoch": 0.7742548896481014, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2301, + "step": 6820 + }, + { + "epoch": 0.7743684167580205, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2602, + "step": 6821 + }, + { + "epoch": 0.7744819438679396, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2196, + "step": 6822 + }, + { + "epoch": 0.7745954709778586, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2355, + "step": 6823 + }, + { + "epoch": 0.7747089980877777, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2368, + "step": 6824 + }, + { + "epoch": 0.7748225251976968, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2678, + "step": 6825 + }, + { + "epoch": 0.7749360523076159, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2182, + "step": 6826 + }, + { + "epoch": 0.7750495794175349, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.2361, + "step": 6827 + }, + { + "epoch": 0.775163106527454, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.232, + "step": 6828 + }, + { + "epoch": 0.7752766336373731, + "grad_norm": 0.416015625, + "learning_rate": 0.002, + "loss": 5.2196, + "step": 6829 + }, + { + "epoch": 0.7753901607472922, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2364, + "step": 6830 + }, + { + "epoch": 0.7755036878572112, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.258, + "step": 6831 + }, + { + "epoch": 0.7756172149671303, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2222, + "step": 6832 + }, + { + "epoch": 0.7757307420770494, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2422, + "step": 6833 + }, + { + "epoch": 0.7758442691869685, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2332, + "step": 6834 + }, + { + "epoch": 0.7759577962968875, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2299, + "step": 6835 + }, + { + "epoch": 0.7760713234068066, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2432, + "step": 6836 + }, + { + "epoch": 0.7761848505167257, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2389, + "step": 6837 + }, + { + "epoch": 0.7762983776266448, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2381, + "step": 6838 + }, + { + "epoch": 0.7764119047365639, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2311, + "step": 6839 + }, + { + "epoch": 0.7765254318464829, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2197, + "step": 6840 + }, + { + "epoch": 0.776638958956402, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2363, + "step": 6841 + }, + { + "epoch": 0.7767524860663211, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.245, + "step": 6842 + }, + { + "epoch": 0.7768660131762402, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.2618, + "step": 6843 + }, + { + "epoch": 0.7769795402861592, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.25, + "step": 6844 + }, + { + "epoch": 0.7770930673960783, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.247, + "step": 6845 + }, + { + "epoch": 0.7772065945059974, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2188, + "step": 6846 + }, + { + "epoch": 0.7773201216159165, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.255, + "step": 6847 + }, + { + "epoch": 0.7774336487258355, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2345, + "step": 6848 + }, + { + "epoch": 0.7775471758357546, + "grad_norm": 0.3359375, + "learning_rate": 0.002, + "loss": 5.2509, + "step": 6849 + }, + { + "epoch": 0.7776607029456737, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2479, + "step": 6850 + }, + { + "epoch": 0.7777742300555928, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2289, + "step": 6851 + }, + { + "epoch": 0.7778877571655118, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2379, + "step": 6852 + }, + { + "epoch": 0.7780012842754309, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.2318, + "step": 6853 + }, + { + "epoch": 0.77811481138535, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2342, + "step": 6854 + }, + { + "epoch": 0.7782283384952691, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.271, + "step": 6855 + }, + { + "epoch": 0.7783418656051881, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2582, + "step": 6856 + }, + { + "epoch": 0.7784553927151072, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2279, + "step": 6857 + }, + { + "epoch": 0.7785689198250263, + "grad_norm": 0.248046875, + "learning_rate": 0.002, + "loss": 5.2574, + "step": 6858 + }, + { + "epoch": 0.7786824469349454, + "grad_norm": 0.2431640625, + "learning_rate": 0.002, + "loss": 5.2319, + "step": 6859 + }, + { + "epoch": 0.7787959740448644, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2146, + "step": 6860 + }, + { + "epoch": 0.7789095011547835, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2272, + "step": 6861 + }, + { + "epoch": 0.7790230282647026, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2288, + "step": 6862 + }, + { + "epoch": 0.7791365553746217, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2431, + "step": 6863 + }, + { + "epoch": 0.7792500824845408, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2463, + "step": 6864 + }, + { + "epoch": 0.7793636095944598, + "grad_norm": 0.26171875, + "learning_rate": 0.002, + "loss": 5.2358, + "step": 6865 + }, + { + "epoch": 0.7794771367043789, + "grad_norm": 0.265625, + "learning_rate": 0.002, + "loss": 5.2503, + "step": 6866 + }, + { + "epoch": 0.779590663814298, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2333, + "step": 6867 + }, + { + "epoch": 0.779704190924217, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2463, + "step": 6868 + }, + { + "epoch": 0.7798177180341361, + "grad_norm": 0.298828125, + "learning_rate": 0.002, + "loss": 5.232, + "step": 6869 + }, + { + "epoch": 0.7799312451440552, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2399, + "step": 6870 + }, + { + "epoch": 0.7800447722539743, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2346, + "step": 6871 + }, + { + "epoch": 0.7801582993638934, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2398, + "step": 6872 + }, + { + "epoch": 0.7802718264738124, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2493, + "step": 6873 + }, + { + "epoch": 0.7803853535837315, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2278, + "step": 6874 + }, + { + "epoch": 0.7804988806936506, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2305, + "step": 6875 + }, + { + "epoch": 0.7806124078035697, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2213, + "step": 6876 + }, + { + "epoch": 0.7807259349134887, + "grad_norm": 0.37109375, + "learning_rate": 0.002, + "loss": 5.243, + "step": 6877 + }, + { + "epoch": 0.7808394620234079, + "grad_norm": 0.55859375, + "learning_rate": 0.002, + "loss": 5.2457, + "step": 6878 + }, + { + "epoch": 0.780952989133327, + "grad_norm": 0.50390625, + "learning_rate": 0.002, + "loss": 5.2353, + "step": 6879 + }, + { + "epoch": 0.7810665162432461, + "grad_norm": 0.412109375, + "learning_rate": 0.002, + "loss": 5.2473, + "step": 6880 + }, + { + "epoch": 0.7811800433531652, + "grad_norm": 0.41796875, + "learning_rate": 0.002, + "loss": 5.2453, + "step": 6881 + }, + { + "epoch": 0.7812935704630842, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2442, + "step": 6882 + }, + { + "epoch": 0.7814070975730033, + "grad_norm": 0.3671875, + "learning_rate": 0.002, + "loss": 5.2573, + "step": 6883 + }, + { + "epoch": 0.7815206246829224, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2277, + "step": 6884 + }, + { + "epoch": 0.7816341517928415, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2445, + "step": 6885 + }, + { + "epoch": 0.7817476789027605, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2392, + "step": 6886 + }, + { + "epoch": 0.7818612060126796, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2451, + "step": 6887 + }, + { + "epoch": 0.7819747331225987, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2335, + "step": 6888 + }, + { + "epoch": 0.7820882602325178, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2373, + "step": 6889 + }, + { + "epoch": 0.7822017873424368, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2308, + "step": 6890 + }, + { + "epoch": 0.7823153144523559, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2202, + "step": 6891 + }, + { + "epoch": 0.782428841562275, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2322, + "step": 6892 + }, + { + "epoch": 0.7825423686721941, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2481, + "step": 6893 + }, + { + "epoch": 0.7826558957821131, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2461, + "step": 6894 + }, + { + "epoch": 0.7827694228920322, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2435, + "step": 6895 + }, + { + "epoch": 0.7828829500019513, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2343, + "step": 6896 + }, + { + "epoch": 0.7829964771118704, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2486, + "step": 6897 + }, + { + "epoch": 0.7831100042217894, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2181, + "step": 6898 + }, + { + "epoch": 0.7832235313317085, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2498, + "step": 6899 + }, + { + "epoch": 0.7833370584416276, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2332, + "step": 6900 + }, + { + "epoch": 0.7834505855515467, + "grad_norm": 0.263671875, + "learning_rate": 0.002, + "loss": 5.2143, + "step": 6901 + }, + { + "epoch": 0.7835641126614658, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2363, + "step": 6902 + }, + { + "epoch": 0.7836776397713848, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.241, + "step": 6903 + }, + { + "epoch": 0.7837911668813039, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2391, + "step": 6904 + }, + { + "epoch": 0.783904693991223, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.245, + "step": 6905 + }, + { + "epoch": 0.784018221101142, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2314, + "step": 6906 + }, + { + "epoch": 0.7841317482110611, + "grad_norm": 0.40625, + "learning_rate": 0.002, + "loss": 5.2241, + "step": 6907 + }, + { + "epoch": 0.7842452753209802, + "grad_norm": 0.38671875, + "learning_rate": 0.002, + "loss": 5.2169, + "step": 6908 + }, + { + "epoch": 0.7843588024308993, + "grad_norm": 0.380859375, + "learning_rate": 0.002, + "loss": 5.2409, + "step": 6909 + }, + { + "epoch": 0.7844723295408184, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.236, + "step": 6910 + }, + { + "epoch": 0.7845858566507374, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2588, + "step": 6911 + }, + { + "epoch": 0.7846993837606565, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2417, + "step": 6912 + }, + { + "epoch": 0.7848129108705756, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2552, + "step": 6913 + }, + { + "epoch": 0.7849264379804947, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2175, + "step": 6914 + }, + { + "epoch": 0.7850399650904137, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.2256, + "step": 6915 + }, + { + "epoch": 0.7851534922003328, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2154, + "step": 6916 + }, + { + "epoch": 0.7852670193102519, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2367, + "step": 6917 + }, + { + "epoch": 0.785380546420171, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2481, + "step": 6918 + }, + { + "epoch": 0.78549407353009, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2453, + "step": 6919 + }, + { + "epoch": 0.7856076006400091, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2562, + "step": 6920 + }, + { + "epoch": 0.7857211277499282, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2279, + "step": 6921 + }, + { + "epoch": 0.7858346548598473, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2364, + "step": 6922 + }, + { + "epoch": 0.7859481819697663, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2455, + "step": 6923 + }, + { + "epoch": 0.7860617090796854, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.237, + "step": 6924 + }, + { + "epoch": 0.7861752361896045, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2437, + "step": 6925 + }, + { + "epoch": 0.7862887632995236, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2476, + "step": 6926 + }, + { + "epoch": 0.7864022904094427, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2362, + "step": 6927 + }, + { + "epoch": 0.7865158175193617, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.248, + "step": 6928 + }, + { + "epoch": 0.7866293446292808, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2252, + "step": 6929 + }, + { + "epoch": 0.7867428717391999, + "grad_norm": 0.291015625, + "learning_rate": 0.002, + "loss": 5.2306, + "step": 6930 + }, + { + "epoch": 0.786856398849119, + "grad_norm": 0.30859375, + "learning_rate": 0.002, + "loss": 5.2265, + "step": 6931 + }, + { + "epoch": 0.786969925959038, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2217, + "step": 6932 + }, + { + "epoch": 0.7870834530689571, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2369, + "step": 6933 + }, + { + "epoch": 0.7871969801788762, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2235, + "step": 6934 + }, + { + "epoch": 0.7873105072887953, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2451, + "step": 6935 + }, + { + "epoch": 0.7874240343987143, + "grad_norm": 0.34765625, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 6936 + }, + { + "epoch": 0.7875375615086334, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.2405, + "step": 6937 + }, + { + "epoch": 0.7876510886185525, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.2115, + "step": 6938 + }, + { + "epoch": 0.7877646157284716, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2136, + "step": 6939 + }, + { + "epoch": 0.7878781428383906, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2299, + "step": 6940 + }, + { + "epoch": 0.7879916699483097, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2345, + "step": 6941 + }, + { + "epoch": 0.7881051970582288, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 6942 + }, + { + "epoch": 0.7882187241681479, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2191, + "step": 6943 + }, + { + "epoch": 0.7883322512780669, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2442, + "step": 6944 + }, + { + "epoch": 0.788445778387986, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2374, + "step": 6945 + }, + { + "epoch": 0.7885593054979051, + "grad_norm": 0.353515625, + "learning_rate": 0.002, + "loss": 5.2301, + "step": 6946 + }, + { + "epoch": 0.7886728326078242, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2383, + "step": 6947 + }, + { + "epoch": 0.7887863597177432, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2345, + "step": 6948 + }, + { + "epoch": 0.7888998868276623, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2313, + "step": 6949 + }, + { + "epoch": 0.7890134139375814, + "grad_norm": 0.33984375, + "learning_rate": 0.002, + "loss": 5.2563, + "step": 6950 + }, + { + "epoch": 0.7891269410475005, + "grad_norm": 0.365234375, + "learning_rate": 0.002, + "loss": 5.2591, + "step": 6951 + }, + { + "epoch": 0.7892404681574196, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2393, + "step": 6952 + }, + { + "epoch": 0.7893539952673386, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2363, + "step": 6953 + }, + { + "epoch": 0.7894675223772577, + "grad_norm": 0.373046875, + "learning_rate": 0.002, + "loss": 5.2446, + "step": 6954 + }, + { + "epoch": 0.7895810494871768, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2312, + "step": 6955 + }, + { + "epoch": 0.7896945765970959, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.226, + "step": 6956 + }, + { + "epoch": 0.7898081037070149, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2148, + "step": 6957 + }, + { + "epoch": 0.789921630816934, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2506, + "step": 6958 + }, + { + "epoch": 0.7900351579268531, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.234, + "step": 6959 + }, + { + "epoch": 0.7901486850367722, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2247, + "step": 6960 + }, + { + "epoch": 0.7902622121466912, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.2458, + "step": 6961 + }, + { + "epoch": 0.7903757392566103, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2491, + "step": 6962 + }, + { + "epoch": 0.7904892663665294, + "grad_norm": 0.26953125, + "learning_rate": 0.002, + "loss": 5.247, + "step": 6963 + }, + { + "epoch": 0.7906027934764485, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2535, + "step": 6964 + }, + { + "epoch": 0.7907163205863675, + "grad_norm": 0.28515625, + "learning_rate": 0.002, + "loss": 5.243, + "step": 6965 + }, + { + "epoch": 0.7908298476962866, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2401, + "step": 6966 + }, + { + "epoch": 0.7909433748062057, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2325, + "step": 6967 + }, + { + "epoch": 0.7910569019161248, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.2415, + "step": 6968 + }, + { + "epoch": 0.7911704290260438, + "grad_norm": 0.2578125, + "learning_rate": 0.002, + "loss": 5.2203, + "step": 6969 + }, + { + "epoch": 0.7912839561359629, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.2427, + "step": 6970 + }, + { + "epoch": 0.791397483245882, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2402, + "step": 6971 + }, + { + "epoch": 0.7915110103558011, + "grad_norm": 0.39453125, + "learning_rate": 0.002, + "loss": 5.2275, + "step": 6972 + }, + { + "epoch": 0.7916245374657201, + "grad_norm": 0.4140625, + "learning_rate": 0.002, + "loss": 5.2234, + "step": 6973 + }, + { + "epoch": 0.7917380645756392, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2497, + "step": 6974 + }, + { + "epoch": 0.7918515916855583, + "grad_norm": 0.361328125, + "learning_rate": 0.002, + "loss": 5.2208, + "step": 6975 + }, + { + "epoch": 0.7919651187954774, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.236, + "step": 6976 + }, + { + "epoch": 0.7920786459053965, + "grad_norm": 0.279296875, + "learning_rate": 0.002, + "loss": 5.2368, + "step": 6977 + }, + { + "epoch": 0.7921921730153155, + "grad_norm": 0.259765625, + "learning_rate": 0.002, + "loss": 5.2332, + "step": 6978 + }, + { + "epoch": 0.7923057001252346, + "grad_norm": 0.2451171875, + "learning_rate": 0.002, + "loss": 5.2314, + "step": 6979 + }, + { + "epoch": 0.7924192272351537, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2378, + "step": 6980 + }, + { + "epoch": 0.7925327543450728, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.2369, + "step": 6981 + }, + { + "epoch": 0.7926462814549918, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2576, + "step": 6982 + }, + { + "epoch": 0.7927598085649109, + "grad_norm": 0.310546875, + "learning_rate": 0.002, + "loss": 5.2327, + "step": 6983 + }, + { + "epoch": 0.79287333567483, + "grad_norm": 0.36328125, + "learning_rate": 0.002, + "loss": 5.2248, + "step": 6984 + }, + { + "epoch": 0.7929868627847491, + "grad_norm": 0.376953125, + "learning_rate": 0.002, + "loss": 5.2396, + "step": 6985 + }, + { + "epoch": 0.7931003898946681, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2326, + "step": 6986 + }, + { + "epoch": 0.7932139170045872, + "grad_norm": 0.345703125, + "learning_rate": 0.002, + "loss": 5.2195, + "step": 6987 + }, + { + "epoch": 0.7933274441145063, + "grad_norm": 0.349609375, + "learning_rate": 0.002, + "loss": 5.2251, + "step": 6988 + }, + { + "epoch": 0.7934409712244254, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2463, + "step": 6989 + }, + { + "epoch": 0.7935544983343444, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2387, + "step": 6990 + }, + { + "epoch": 0.7936680254442635, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.262, + "step": 6991 + }, + { + "epoch": 0.7937815525541826, + "grad_norm": 0.322265625, + "learning_rate": 0.002, + "loss": 5.2209, + "step": 6992 + }, + { + "epoch": 0.7938950796641017, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.223, + "step": 6993 + }, + { + "epoch": 0.7940086067740207, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2487, + "step": 6994 + }, + { + "epoch": 0.7941221338839398, + "grad_norm": 0.384765625, + "learning_rate": 0.002, + "loss": 5.2105, + "step": 6995 + }, + { + "epoch": 0.7942356609938589, + "grad_norm": 0.369140625, + "learning_rate": 0.002, + "loss": 5.2564, + "step": 6996 + }, + { + "epoch": 0.794349188103778, + "grad_norm": 0.37890625, + "learning_rate": 0.002, + "loss": 5.2414, + "step": 6997 + }, + { + "epoch": 0.794462715213697, + "grad_norm": 0.341796875, + "learning_rate": 0.002, + "loss": 5.2365, + "step": 6998 + }, + { + "epoch": 0.7945762423236161, + "grad_norm": 0.337890625, + "learning_rate": 0.002, + "loss": 5.2439, + "step": 6999 + }, + { + "epoch": 0.7946897694335352, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2292, + "step": 7000 + }, + { + "epoch": 0.7948032965434543, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2461, + "step": 7001 + }, + { + "epoch": 0.7949168236533734, + "grad_norm": 0.328125, + "learning_rate": 0.002, + "loss": 5.246, + "step": 7002 + }, + { + "epoch": 0.7950303507632924, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2428, + "step": 7003 + }, + { + "epoch": 0.7951438778732115, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2487, + "step": 7004 + }, + { + "epoch": 0.7952574049831306, + "grad_norm": 0.318359375, + "learning_rate": 0.002, + "loss": 5.2452, + "step": 7005 + }, + { + "epoch": 0.7953709320930497, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2282, + "step": 7006 + }, + { + "epoch": 0.7954844592029687, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2371, + "step": 7007 + }, + { + "epoch": 0.7955979863128878, + "grad_norm": 0.251953125, + "learning_rate": 0.002, + "loss": 5.2197, + "step": 7008 + }, + { + "epoch": 0.7957115134228069, + "grad_norm": 0.2314453125, + "learning_rate": 0.002, + "loss": 5.216, + "step": 7009 + }, + { + "epoch": 0.795825040532726, + "grad_norm": 0.2216796875, + "learning_rate": 0.002, + "loss": 5.2231, + "step": 7010 + }, + { + "epoch": 0.795938567642645, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2396, + "step": 7011 + }, + { + "epoch": 0.7960520947525641, + "grad_norm": 0.375, + "learning_rate": 0.002, + "loss": 5.2239, + "step": 7012 + }, + { + "epoch": 0.7961656218624832, + "grad_norm": 0.455078125, + "learning_rate": 0.002, + "loss": 5.2562, + "step": 7013 + }, + { + "epoch": 0.7962791489724023, + "grad_norm": 0.46484375, + "learning_rate": 0.002, + "loss": 5.248, + "step": 7014 + }, + { + "epoch": 0.7963926760823213, + "grad_norm": 0.43359375, + "learning_rate": 0.002, + "loss": 5.2318, + "step": 7015 + }, + { + "epoch": 0.7965062031922404, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2435, + "step": 7016 + }, + { + "epoch": 0.7966197303021595, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2274, + "step": 7017 + }, + { + "epoch": 0.7967332574120786, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2473, + "step": 7018 + }, + { + "epoch": 0.7968467845219976, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2277, + "step": 7019 + }, + { + "epoch": 0.7969603116319167, + "grad_norm": 0.28125, + "learning_rate": 0.002, + "loss": 5.2257, + "step": 7020 + }, + { + "epoch": 0.7970738387418358, + "grad_norm": 0.271484375, + "learning_rate": 0.002, + "loss": 5.2436, + "step": 7021 + }, + { + "epoch": 0.7971873658517549, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.249, + "step": 7022 + }, + { + "epoch": 0.797300892961674, + "grad_norm": 0.2734375, + "learning_rate": 0.002, + "loss": 5.251, + "step": 7023 + }, + { + "epoch": 0.797414420071593, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.232, + "step": 7024 + }, + { + "epoch": 0.7975279471815121, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2185, + "step": 7025 + }, + { + "epoch": 0.7976414742914312, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2482, + "step": 7026 + }, + { + "epoch": 0.7977550014013502, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2369, + "step": 7027 + }, + { + "epoch": 0.7978685285112693, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2301, + "step": 7028 + }, + { + "epoch": 0.7979820556211884, + "grad_norm": 0.306640625, + "learning_rate": 0.002, + "loss": 5.2426, + "step": 7029 + }, + { + "epoch": 0.7980955827311075, + "grad_norm": 0.29296875, + "learning_rate": 0.002, + "loss": 5.2391, + "step": 7030 + }, + { + "epoch": 0.7982091098410266, + "grad_norm": 0.255859375, + "learning_rate": 0.002, + "loss": 5.217, + "step": 7031 + }, + { + "epoch": 0.7983226369509456, + "grad_norm": 0.267578125, + "learning_rate": 0.002, + "loss": 5.2424, + "step": 7032 + }, + { + "epoch": 0.7984361640608647, + "grad_norm": 0.25, + "learning_rate": 0.002, + "loss": 5.2188, + "step": 7033 + }, + { + "epoch": 0.7985496911707838, + "grad_norm": 0.30078125, + "learning_rate": 0.002, + "loss": 5.2291, + "step": 7034 + }, + { + "epoch": 0.7986632182807029, + "grad_norm": 0.326171875, + "learning_rate": 0.002, + "loss": 5.2345, + "step": 7035 + }, + { + "epoch": 0.7987767453906219, + "grad_norm": 0.390625, + "learning_rate": 0.002, + "loss": 5.2464, + "step": 7036 + }, + { + "epoch": 0.798890272500541, + "grad_norm": 0.388671875, + "learning_rate": 0.002, + "loss": 5.2364, + "step": 7037 + }, + { + "epoch": 0.7990037996104601, + "grad_norm": 0.359375, + "learning_rate": 0.002, + "loss": 5.2175, + "step": 7038 + }, + { + "epoch": 0.7991173267203792, + "grad_norm": 0.3515625, + "learning_rate": 0.002, + "loss": 5.2395, + "step": 7039 + }, + { + "epoch": 0.7992308538302982, + "grad_norm": 0.3046875, + "learning_rate": 0.002, + "loss": 5.1986, + "step": 7040 + }, + { + "epoch": 0.7993443809402173, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2579, + "step": 7041 + }, + { + "epoch": 0.7994579080501364, + "grad_norm": 0.27734375, + "learning_rate": 0.002, + "loss": 5.255, + "step": 7042 + }, + { + "epoch": 0.7995714351600555, + "grad_norm": 0.275390625, + "learning_rate": 0.002, + "loss": 5.2412, + "step": 7043 + }, + { + "epoch": 0.7996849622699745, + "grad_norm": 0.302734375, + "learning_rate": 0.002, + "loss": 5.2219, + "step": 7044 + }, + { + "epoch": 0.7997984893798936, + "grad_norm": 0.314453125, + "learning_rate": 0.002, + "loss": 5.232, + "step": 7045 + }, + { + "epoch": 0.7999120164898127, + "grad_norm": 0.33203125, + "learning_rate": 0.002, + "loss": 5.2227, + "step": 7046 + }, + { + "epoch": 0.8000255435997318, + "grad_norm": 0.34375, + "learning_rate": 0.002, + "loss": 5.2288, + "step": 7047 + }, + { + "epoch": 0.8001390707096508, + "grad_norm": 0.333984375, + "learning_rate": 0.002, + "loss": 5.2299, + "step": 7048 + }, + { + "epoch": 0.8002525978195699, + "grad_norm": 0.32421875, + "learning_rate": 0.002, + "loss": 5.2249, + "step": 7049 + }, + { + "epoch": 0.800366124929489, + "grad_norm": 0.330078125, + "learning_rate": 0.002, + "loss": 5.2269, + "step": 7050 + }, + { + "epoch": 0.8004796520394081, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2385, + "step": 7051 + }, + { + "epoch": 0.8005931791493271, + "grad_norm": 0.31640625, + "learning_rate": 0.002, + "loss": 5.2385, + "step": 7052 + }, + { + "epoch": 0.8007067062592462, + "grad_norm": 0.3125, + "learning_rate": 0.002, + "loss": 5.2264, + "step": 7053 + }, + { + "epoch": 0.8008202333691653, + "grad_norm": 0.3203125, + "learning_rate": 0.002, + "loss": 5.2482, + "step": 7054 + }, + { + "epoch": 0.8009337604790844, + "grad_norm": 0.294921875, + "learning_rate": 0.002, + "loss": 5.2317, + "step": 7055 + }, + { + "epoch": 0.8010472875890035, + "grad_norm": 0.287109375, + "learning_rate": 0.002, + "loss": 5.249, + "step": 7056 + }, + { + "epoch": 0.8011608146989225, + "grad_norm": 0.283203125, + "learning_rate": 0.002, + "loss": 5.2467, + "step": 7057 + }, + { + "epoch": 0.8012743418088416, + "grad_norm": 0.2890625, + "learning_rate": 0.002, + "loss": 5.2354, + "step": 7058 + }, + { + "epoch": 0.8013878689187607, + "grad_norm": 0.310546875, + "learning_rate": 0.0019999985497728043, + "loss": 5.2593, + "step": 7059 + }, + { + "epoch": 0.8015013960286798, + "grad_norm": 0.31640625, + "learning_rate": 0.0019999941990958897, + "loss": 5.2228, + "step": 7060 + }, + { + "epoch": 0.8016149231385988, + "grad_norm": 0.361328125, + "learning_rate": 0.0019999869479832783, + "loss": 5.2192, + "step": 7061 + }, + { + "epoch": 0.8017284502485179, + "grad_norm": 0.3515625, + "learning_rate": 0.0019999767964583377, + "loss": 5.2464, + "step": 7062 + }, + { + "epoch": 0.801841977358437, + "grad_norm": 0.3515625, + "learning_rate": 0.001999963744553784, + "loss": 5.235, + "step": 7063 + }, + { + "epoch": 0.8019555044683561, + "grad_norm": 0.3671875, + "learning_rate": 0.00199994779231168, + "loss": 5.2344, + "step": 7064 + }, + { + "epoch": 0.8020690315782751, + "grad_norm": 0.31640625, + "learning_rate": 0.0019999289397834344, + "loss": 5.251, + "step": 7065 + }, + { + "epoch": 0.8021825586881942, + "grad_norm": 0.33984375, + "learning_rate": 0.0019999071870298053, + "loss": 5.2342, + "step": 7066 + }, + { + "epoch": 0.8022960857981133, + "grad_norm": 0.326171875, + "learning_rate": 0.0019998825341208943, + "loss": 5.2311, + "step": 7067 + }, + { + "epoch": 0.8024096129080324, + "grad_norm": 0.322265625, + "learning_rate": 0.0019998549811361525, + "loss": 5.2038, + "step": 7068 + }, + { + "epoch": 0.8025231400179514, + "grad_norm": 0.29296875, + "learning_rate": 0.0019998245281643745, + "loss": 5.2382, + "step": 7069 + }, + { + "epoch": 0.8026366671278705, + "grad_norm": 0.294921875, + "learning_rate": 0.0019997911753037024, + "loss": 5.2386, + "step": 7070 + }, + { + "epoch": 0.8027501942377896, + "grad_norm": 0.30078125, + "learning_rate": 0.0019997549226616236, + "loss": 5.208, + "step": 7071 + }, + { + "epoch": 0.8028637213477087, + "grad_norm": 0.357421875, + "learning_rate": 0.00199971577035497, + "loss": 5.2324, + "step": 7072 + }, + { + "epoch": 0.8029772484576277, + "grad_norm": 0.39453125, + "learning_rate": 0.001999673718509919, + "loss": 5.2286, + "step": 7073 + }, + { + "epoch": 0.8030907755675468, + "grad_norm": 0.42578125, + "learning_rate": 0.0019996287672619925, + "loss": 5.2365, + "step": 7074 + }, + { + "epoch": 0.8032043026774659, + "grad_norm": 0.41015625, + "learning_rate": 0.001999580916756055, + "loss": 5.2468, + "step": 7075 + }, + { + "epoch": 0.803317829787385, + "grad_norm": 0.353515625, + "learning_rate": 0.0019995301671463174, + "loss": 5.2252, + "step": 7076 + }, + { + "epoch": 0.803431356897304, + "grad_norm": 0.357421875, + "learning_rate": 0.0019994765185963304, + "loss": 5.216, + "step": 7077 + }, + { + "epoch": 0.8035448840072231, + "grad_norm": 0.275390625, + "learning_rate": 0.001999419971278989, + "loss": 5.2266, + "step": 7078 + }, + { + "epoch": 0.8036584111171422, + "grad_norm": 0.275390625, + "learning_rate": 0.0019993605253765304, + "loss": 5.2387, + "step": 7079 + }, + { + "epoch": 0.8037719382270613, + "grad_norm": 0.23828125, + "learning_rate": 0.0019992981810805317, + "loss": 5.205, + "step": 7080 + }, + { + "epoch": 0.8038854653369804, + "grad_norm": 0.2265625, + "learning_rate": 0.0019992329385919117, + "loss": 5.2228, + "step": 7081 + }, + { + "epoch": 0.8039989924468994, + "grad_norm": 0.2353515625, + "learning_rate": 0.00199916479812093, + "loss": 5.2332, + "step": 7082 + }, + { + "epoch": 0.8041125195568185, + "grad_norm": 0.2412109375, + "learning_rate": 0.001999093759887183, + "loss": 5.2142, + "step": 7083 + }, + { + "epoch": 0.8042260466667376, + "grad_norm": 0.255859375, + "learning_rate": 0.0019990198241196092, + "loss": 5.2272, + "step": 7084 + }, + { + "epoch": 0.8043395737766567, + "grad_norm": 0.255859375, + "learning_rate": 0.0019989429910564826, + "loss": 5.248, + "step": 7085 + }, + { + "epoch": 0.8044531008865757, + "grad_norm": 0.2578125, + "learning_rate": 0.001998863260945416, + "loss": 5.2242, + "step": 7086 + }, + { + "epoch": 0.8045666279964948, + "grad_norm": 0.2470703125, + "learning_rate": 0.0019987806340433564, + "loss": 5.2396, + "step": 7087 + }, + { + "epoch": 0.8046801551064139, + "grad_norm": 0.24609375, + "learning_rate": 0.001998695110616589, + "loss": 5.2022, + "step": 7088 + }, + { + "epoch": 0.804793682216333, + "grad_norm": 0.255859375, + "learning_rate": 0.0019986066909407317, + "loss": 5.2301, + "step": 7089 + }, + { + "epoch": 0.804907209326252, + "grad_norm": 0.28125, + "learning_rate": 0.0019985153753007373, + "loss": 5.213, + "step": 7090 + }, + { + "epoch": 0.8050207364361711, + "grad_norm": 0.30859375, + "learning_rate": 0.0019984211639908914, + "loss": 5.2323, + "step": 7091 + }, + { + "epoch": 0.8051342635460902, + "grad_norm": 0.353515625, + "learning_rate": 0.001998324057314811, + "loss": 5.25, + "step": 7092 + }, + { + "epoch": 0.8052477906560093, + "grad_norm": 0.359375, + "learning_rate": 0.0019982240555854445, + "loss": 5.2288, + "step": 7093 + }, + { + "epoch": 0.8053613177659283, + "grad_norm": 0.375, + "learning_rate": 0.00199812115912507, + "loss": 5.2443, + "step": 7094 + }, + { + "epoch": 0.8054748448758474, + "grad_norm": 0.3515625, + "learning_rate": 0.001998015368265295, + "loss": 5.2326, + "step": 7095 + }, + { + "epoch": 0.8055883719857665, + "grad_norm": 0.33203125, + "learning_rate": 0.001997906683347055, + "loss": 5.2554, + "step": 7096 + }, + { + "epoch": 0.8057018990956856, + "grad_norm": 0.294921875, + "learning_rate": 0.0019977951047206108, + "loss": 5.2406, + "step": 7097 + }, + { + "epoch": 0.8058154262056046, + "grad_norm": 0.298828125, + "learning_rate": 0.0019976806327455508, + "loss": 5.2463, + "step": 7098 + }, + { + "epoch": 0.8059289533155237, + "grad_norm": 0.294921875, + "learning_rate": 0.001997563267790786, + "loss": 5.2427, + "step": 7099 + }, + { + "epoch": 0.8060424804254428, + "grad_norm": 0.3359375, + "learning_rate": 0.0019974430102345526, + "loss": 5.2501, + "step": 7100 + }, + { + "epoch": 0.8061560075353619, + "grad_norm": 0.384765625, + "learning_rate": 0.001997319860464407, + "loss": 5.2372, + "step": 7101 + }, + { + "epoch": 0.806269534645281, + "grad_norm": 0.41015625, + "learning_rate": 0.001997193818877228, + "loss": 5.2439, + "step": 7102 + }, + { + "epoch": 0.8063830617552, + "grad_norm": 0.396484375, + "learning_rate": 0.001997064885879213, + "loss": 5.2292, + "step": 7103 + }, + { + "epoch": 0.8064965888651191, + "grad_norm": 0.375, + "learning_rate": 0.0019969330618858777, + "loss": 5.2216, + "step": 7104 + }, + { + "epoch": 0.8066101159750382, + "grad_norm": 0.333984375, + "learning_rate": 0.0019967983473220554, + "loss": 5.2187, + "step": 7105 + }, + { + "epoch": 0.8067236430849573, + "grad_norm": 0.31640625, + "learning_rate": 0.0019966607426218936, + "loss": 5.2221, + "step": 7106 + }, + { + "epoch": 0.8068371701948763, + "grad_norm": 0.296875, + "learning_rate": 0.0019965202482288553, + "loss": 5.2452, + "step": 7107 + }, + { + "epoch": 0.8069506973047954, + "grad_norm": 0.28515625, + "learning_rate": 0.001996376864595715, + "loss": 5.2446, + "step": 7108 + }, + { + "epoch": 0.8070642244147145, + "grad_norm": 0.27734375, + "learning_rate": 0.0019962305921845595, + "loss": 5.2246, + "step": 7109 + }, + { + "epoch": 0.8071777515246336, + "grad_norm": 0.29296875, + "learning_rate": 0.001996081431466785, + "loss": 5.2344, + "step": 7110 + }, + { + "epoch": 0.8072912786345526, + "grad_norm": 0.330078125, + "learning_rate": 0.001995929382923095, + "loss": 5.2461, + "step": 7111 + }, + { + "epoch": 0.8074048057444717, + "grad_norm": 0.375, + "learning_rate": 0.0019957744470435017, + "loss": 5.2463, + "step": 7112 + }, + { + "epoch": 0.8075183328543908, + "grad_norm": 0.392578125, + "learning_rate": 0.0019956166243273203, + "loss": 5.2233, + "step": 7113 + }, + { + "epoch": 0.8076318599643099, + "grad_norm": 0.373046875, + "learning_rate": 0.0019954559152831706, + "loss": 5.2364, + "step": 7114 + }, + { + "epoch": 0.8077453870742289, + "grad_norm": 0.390625, + "learning_rate": 0.001995292320428973, + "loss": 5.2483, + "step": 7115 + }, + { + "epoch": 0.807858914184148, + "grad_norm": 0.3515625, + "learning_rate": 0.001995125840291951, + "loss": 5.2371, + "step": 7116 + }, + { + "epoch": 0.8079724412940671, + "grad_norm": 0.36328125, + "learning_rate": 0.001994956475408623, + "loss": 5.2271, + "step": 7117 + }, + { + "epoch": 0.8080859684039862, + "grad_norm": 0.333984375, + "learning_rate": 0.0019947842263248064, + "loss": 5.2429, + "step": 7118 + }, + { + "epoch": 0.8081994955139052, + "grad_norm": 0.326171875, + "learning_rate": 0.001994609093595613, + "loss": 5.2098, + "step": 7119 + }, + { + "epoch": 0.8083130226238244, + "grad_norm": 0.291015625, + "learning_rate": 0.001994431077785448, + "loss": 5.2377, + "step": 7120 + }, + { + "epoch": 0.8084265497337435, + "grad_norm": 0.28515625, + "learning_rate": 0.0019942501794680077, + "loss": 5.2504, + "step": 7121 + }, + { + "epoch": 0.8085400768436626, + "grad_norm": 0.251953125, + "learning_rate": 0.0019940663992262775, + "loss": 5.2359, + "step": 7122 + }, + { + "epoch": 0.8086536039535817, + "grad_norm": 0.26171875, + "learning_rate": 0.0019938797376525316, + "loss": 5.2292, + "step": 7123 + }, + { + "epoch": 0.8087671310635007, + "grad_norm": 0.26171875, + "learning_rate": 0.001993690195348329, + "loss": 5.2436, + "step": 7124 + }, + { + "epoch": 0.8088806581734198, + "grad_norm": 0.2470703125, + "learning_rate": 0.001993497772924513, + "loss": 5.226, + "step": 7125 + }, + { + "epoch": 0.8089941852833389, + "grad_norm": 0.263671875, + "learning_rate": 0.0019933024710012083, + "loss": 5.2094, + "step": 7126 + }, + { + "epoch": 0.809107712393258, + "grad_norm": 0.263671875, + "learning_rate": 0.00199310429020782, + "loss": 5.2277, + "step": 7127 + }, + { + "epoch": 0.809221239503177, + "grad_norm": 0.279296875, + "learning_rate": 0.00199290323118303, + "loss": 5.2299, + "step": 7128 + }, + { + "epoch": 0.8093347666130961, + "grad_norm": 0.271484375, + "learning_rate": 0.001992699294574798, + "loss": 5.2419, + "step": 7129 + }, + { + "epoch": 0.8094482937230152, + "grad_norm": 0.279296875, + "learning_rate": 0.0019924924810403545, + "loss": 5.2246, + "step": 7130 + }, + { + "epoch": 0.8095618208329343, + "grad_norm": 0.283203125, + "learning_rate": 0.0019922827912462046, + "loss": 5.2314, + "step": 7131 + }, + { + "epoch": 0.8096753479428533, + "grad_norm": 0.291015625, + "learning_rate": 0.0019920702258681196, + "loss": 5.2273, + "step": 7132 + }, + { + "epoch": 0.8097888750527724, + "grad_norm": 0.28125, + "learning_rate": 0.001991854785591141, + "loss": 5.2327, + "step": 7133 + }, + { + "epoch": 0.8099024021626915, + "grad_norm": 0.265625, + "learning_rate": 0.0019916364711095743, + "loss": 5.2324, + "step": 7134 + }, + { + "epoch": 0.8100159292726106, + "grad_norm": 0.259765625, + "learning_rate": 0.001991415283126986, + "loss": 5.2381, + "step": 7135 + }, + { + "epoch": 0.8101294563825296, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019911912223562057, + "loss": 5.2377, + "step": 7136 + }, + { + "epoch": 0.8102429834924487, + "grad_norm": 0.2734375, + "learning_rate": 0.00199096428951932, + "loss": 5.2477, + "step": 7137 + }, + { + "epoch": 0.8103565106023678, + "grad_norm": 0.3046875, + "learning_rate": 0.0019907344853476714, + "loss": 5.2349, + "step": 7138 + }, + { + "epoch": 0.8104700377122869, + "grad_norm": 0.318359375, + "learning_rate": 0.001990501810581856, + "loss": 5.2307, + "step": 7139 + }, + { + "epoch": 0.810583564822206, + "grad_norm": 0.333984375, + "learning_rate": 0.001990266265971721, + "loss": 5.2297, + "step": 7140 + }, + { + "epoch": 0.810697091932125, + "grad_norm": 0.3046875, + "learning_rate": 0.0019900278522763624, + "loss": 5.2133, + "step": 7141 + }, + { + "epoch": 0.8108106190420441, + "grad_norm": 0.30078125, + "learning_rate": 0.0019897865702641227, + "loss": 5.244, + "step": 7142 + }, + { + "epoch": 0.8109241461519632, + "grad_norm": 0.271484375, + "learning_rate": 0.001989542420712588, + "loss": 5.2312, + "step": 7143 + }, + { + "epoch": 0.8110376732618823, + "grad_norm": 0.2578125, + "learning_rate": 0.001989295404408585, + "loss": 5.2455, + "step": 7144 + }, + { + "epoch": 0.8111512003718013, + "grad_norm": 0.2421875, + "learning_rate": 0.0019890455221481806, + "loss": 5.2388, + "step": 7145 + }, + { + "epoch": 0.8112647274817204, + "grad_norm": 0.2451171875, + "learning_rate": 0.001988792774736677, + "loss": 5.2303, + "step": 7146 + }, + { + "epoch": 0.8113782545916395, + "grad_norm": 0.23828125, + "learning_rate": 0.00198853716298861, + "loss": 5.2365, + "step": 7147 + }, + { + "epoch": 0.8114917817015586, + "grad_norm": 0.24609375, + "learning_rate": 0.0019882786877277463, + "loss": 5.2388, + "step": 7148 + }, + { + "epoch": 0.8116053088114776, + "grad_norm": 0.251953125, + "learning_rate": 0.001988017349787081, + "loss": 5.2471, + "step": 7149 + }, + { + "epoch": 0.8117188359213967, + "grad_norm": 0.275390625, + "learning_rate": 0.001987753150008836, + "loss": 5.2396, + "step": 7150 + }, + { + "epoch": 0.8118323630313158, + "grad_norm": 0.29296875, + "learning_rate": 0.0019874860892444544, + "loss": 5.2562, + "step": 7151 + }, + { + "epoch": 0.8119458901412349, + "grad_norm": 0.30859375, + "learning_rate": 0.0019872161683545998, + "loss": 5.2307, + "step": 7152 + }, + { + "epoch": 0.8120594172511539, + "grad_norm": 0.349609375, + "learning_rate": 0.001986943388209154, + "loss": 5.2181, + "step": 7153 + }, + { + "epoch": 0.812172944361073, + "grad_norm": 0.369140625, + "learning_rate": 0.001986667749687213, + "loss": 5.2511, + "step": 7154 + }, + { + "epoch": 0.8122864714709921, + "grad_norm": 0.3984375, + "learning_rate": 0.0019863892536770844, + "loss": 5.2599, + "step": 7155 + }, + { + "epoch": 0.8123999985809112, + "grad_norm": 0.37890625, + "learning_rate": 0.001986107901076285, + "loss": 5.2153, + "step": 7156 + }, + { + "epoch": 0.8125135256908302, + "grad_norm": 0.345703125, + "learning_rate": 0.001985823692791537, + "loss": 5.2369, + "step": 7157 + }, + { + "epoch": 0.8126270528007493, + "grad_norm": 0.3515625, + "learning_rate": 0.0019855366297387668, + "loss": 5.2278, + "step": 7158 + }, + { + "epoch": 0.8127405799106684, + "grad_norm": 0.318359375, + "learning_rate": 0.0019852467128430996, + "loss": 5.2292, + "step": 7159 + }, + { + "epoch": 0.8128541070205875, + "grad_norm": 0.34375, + "learning_rate": 0.001984953943038859, + "loss": 5.2276, + "step": 7160 + }, + { + "epoch": 0.8129676341305065, + "grad_norm": 0.361328125, + "learning_rate": 0.001984658321269562, + "loss": 5.258, + "step": 7161 + }, + { + "epoch": 0.8130811612404256, + "grad_norm": 0.388671875, + "learning_rate": 0.001984359848487917, + "loss": 5.2062, + "step": 7162 + }, + { + "epoch": 0.8131946883503447, + "grad_norm": 0.3984375, + "learning_rate": 0.00198405852565582, + "loss": 5.2148, + "step": 7163 + }, + { + "epoch": 0.8133082154602638, + "grad_norm": 0.38671875, + "learning_rate": 0.0019837543537443523, + "loss": 5.2288, + "step": 7164 + }, + { + "epoch": 0.8134217425701828, + "grad_norm": 0.3359375, + "learning_rate": 0.0019834473337337773, + "loss": 5.243, + "step": 7165 + }, + { + "epoch": 0.8135352696801019, + "grad_norm": 0.33203125, + "learning_rate": 0.0019831374666135363, + "loss": 5.2008, + "step": 7166 + }, + { + "epoch": 0.813648796790021, + "grad_norm": 0.326171875, + "learning_rate": 0.0019828247533822466, + "loss": 5.215, + "step": 7167 + }, + { + "epoch": 0.8137623238999401, + "grad_norm": 0.326171875, + "learning_rate": 0.001982509195047698, + "loss": 5.2385, + "step": 7168 + }, + { + "epoch": 0.8138758510098592, + "grad_norm": 0.30859375, + "learning_rate": 0.0019821907926268487, + "loss": 5.2461, + "step": 7169 + }, + { + "epoch": 0.8139893781197782, + "grad_norm": 0.31640625, + "learning_rate": 0.001981869547145822, + "loss": 5.2426, + "step": 7170 + }, + { + "epoch": 0.8141029052296973, + "grad_norm": 0.294921875, + "learning_rate": 0.001981545459639906, + "loss": 5.2326, + "step": 7171 + }, + { + "epoch": 0.8142164323396164, + "grad_norm": 0.296875, + "learning_rate": 0.0019812185311535446, + "loss": 5.2196, + "step": 7172 + }, + { + "epoch": 0.8143299594495355, + "grad_norm": 0.287109375, + "learning_rate": 0.0019808887627403406, + "loss": 5.2386, + "step": 7173 + }, + { + "epoch": 0.8144434865594545, + "grad_norm": 0.27734375, + "learning_rate": 0.001980556155463046, + "loss": 5.256, + "step": 7174 + }, + { + "epoch": 0.8145570136693736, + "grad_norm": 0.287109375, + "learning_rate": 0.0019802207103935647, + "loss": 5.2262, + "step": 7175 + }, + { + "epoch": 0.8146705407792927, + "grad_norm": 0.283203125, + "learning_rate": 0.0019798824286129443, + "loss": 5.2309, + "step": 7176 + }, + { + "epoch": 0.8147840678892118, + "grad_norm": 0.30078125, + "learning_rate": 0.0019795413112113744, + "loss": 5.2336, + "step": 7177 + }, + { + "epoch": 0.8148975949991308, + "grad_norm": 0.326171875, + "learning_rate": 0.001979197359288183, + "loss": 5.2155, + "step": 7178 + }, + { + "epoch": 0.8150111221090499, + "grad_norm": 0.31640625, + "learning_rate": 0.001978850573951834, + "loss": 5.2059, + "step": 7179 + }, + { + "epoch": 0.815124649218969, + "grad_norm": 0.31640625, + "learning_rate": 0.001978500956319921, + "loss": 5.246, + "step": 7180 + }, + { + "epoch": 0.8152381763288881, + "grad_norm": 0.318359375, + "learning_rate": 0.0019781485075191668, + "loss": 5.2322, + "step": 7181 + }, + { + "epoch": 0.8153517034388071, + "grad_norm": 0.283203125, + "learning_rate": 0.0019777932286854185, + "loss": 5.2339, + "step": 7182 + }, + { + "epoch": 0.8154652305487262, + "grad_norm": 0.283203125, + "learning_rate": 0.001977435120963641, + "loss": 5.2343, + "step": 7183 + }, + { + "epoch": 0.8155787576586453, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019770741855079195, + "loss": 5.2315, + "step": 7184 + }, + { + "epoch": 0.8156922847685644, + "grad_norm": 0.248046875, + "learning_rate": 0.001976710423481449, + "loss": 5.2266, + "step": 7185 + }, + { + "epoch": 0.8158058118784834, + "grad_norm": 0.23828125, + "learning_rate": 0.0019763438360565367, + "loss": 5.238, + "step": 7186 + }, + { + "epoch": 0.8159193389884025, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019759744244145926, + "loss": 5.2134, + "step": 7187 + }, + { + "epoch": 0.8160328660983216, + "grad_norm": 0.2431640625, + "learning_rate": 0.0019756021897461306, + "loss": 5.2253, + "step": 7188 + }, + { + "epoch": 0.8161463932082407, + "grad_norm": 0.255859375, + "learning_rate": 0.0019752271332507607, + "loss": 5.2367, + "step": 7189 + }, + { + "epoch": 0.8162599203181597, + "grad_norm": 0.271484375, + "learning_rate": 0.0019748492561371877, + "loss": 5.246, + "step": 7190 + }, + { + "epoch": 0.8163734474280788, + "grad_norm": 0.32421875, + "learning_rate": 0.0019744685596232068, + "loss": 5.213, + "step": 7191 + }, + { + "epoch": 0.8164869745379979, + "grad_norm": 0.349609375, + "learning_rate": 0.0019740850449356984, + "loss": 5.2335, + "step": 7192 + }, + { + "epoch": 0.816600501647917, + "grad_norm": 0.36328125, + "learning_rate": 0.001973698713310626, + "loss": 5.2367, + "step": 7193 + }, + { + "epoch": 0.816714028757836, + "grad_norm": 0.353515625, + "learning_rate": 0.001973309565993032, + "loss": 5.2401, + "step": 7194 + }, + { + "epoch": 0.8168275558677551, + "grad_norm": 0.349609375, + "learning_rate": 0.00197291760423703, + "loss": 5.2281, + "step": 7195 + }, + { + "epoch": 0.8169410829776742, + "grad_norm": 0.322265625, + "learning_rate": 0.0019725228293058076, + "loss": 5.2098, + "step": 7196 + }, + { + "epoch": 0.8170546100875933, + "grad_norm": 0.34375, + "learning_rate": 0.001972125242471615, + "loss": 5.2306, + "step": 7197 + }, + { + "epoch": 0.8171681371975124, + "grad_norm": 0.33203125, + "learning_rate": 0.001971724845015768, + "loss": 5.2341, + "step": 7198 + }, + { + "epoch": 0.8172816643074314, + "grad_norm": 0.345703125, + "learning_rate": 0.001971321638228637, + "loss": 5.2291, + "step": 7199 + }, + { + "epoch": 0.8173951914173505, + "grad_norm": 0.3515625, + "learning_rate": 0.001970915623409648, + "loss": 5.2319, + "step": 7200 + }, + { + "epoch": 0.8175087185272696, + "grad_norm": 0.36328125, + "learning_rate": 0.0019705068018672747, + "loss": 5.2428, + "step": 7201 + }, + { + "epoch": 0.8176222456371887, + "grad_norm": 0.326171875, + "learning_rate": 0.001970095174919039, + "loss": 5.2308, + "step": 7202 + }, + { + "epoch": 0.8177357727471077, + "grad_norm": 0.328125, + "learning_rate": 0.0019696807438915015, + "loss": 5.2295, + "step": 7203 + }, + { + "epoch": 0.8178492998570268, + "grad_norm": 0.31640625, + "learning_rate": 0.0019692635101202604, + "loss": 5.2513, + "step": 7204 + }, + { + "epoch": 0.8179628269669459, + "grad_norm": 0.294921875, + "learning_rate": 0.0019688434749499466, + "loss": 5.2363, + "step": 7205 + }, + { + "epoch": 0.818076354076865, + "grad_norm": 0.2734375, + "learning_rate": 0.001968420639734218, + "loss": 5.2003, + "step": 7206 + }, + { + "epoch": 0.818189881186784, + "grad_norm": 0.27734375, + "learning_rate": 0.0019679950058357583, + "loss": 5.2274, + "step": 7207 + }, + { + "epoch": 0.8183034082967031, + "grad_norm": 0.27734375, + "learning_rate": 0.001967566574626268, + "loss": 5.2399, + "step": 7208 + }, + { + "epoch": 0.8184169354066222, + "grad_norm": 0.279296875, + "learning_rate": 0.001967135347486465, + "loss": 5.2159, + "step": 7209 + }, + { + "epoch": 0.8185304625165413, + "grad_norm": 0.29296875, + "learning_rate": 0.001966701325806076, + "loss": 5.2317, + "step": 7210 + }, + { + "epoch": 0.8186439896264603, + "grad_norm": 0.30859375, + "learning_rate": 0.0019662645109838352, + "loss": 5.2416, + "step": 7211 + }, + { + "epoch": 0.8187575167363794, + "grad_norm": 0.2890625, + "learning_rate": 0.001965824904427477, + "loss": 5.2259, + "step": 7212 + }, + { + "epoch": 0.8188710438462985, + "grad_norm": 0.283203125, + "learning_rate": 0.001965382507553734, + "loss": 5.2457, + "step": 7213 + }, + { + "epoch": 0.8189845709562176, + "grad_norm": 0.28125, + "learning_rate": 0.00196493732178833, + "loss": 5.2565, + "step": 7214 + }, + { + "epoch": 0.8190980980661366, + "grad_norm": 0.26953125, + "learning_rate": 0.001964489348565977, + "loss": 5.2399, + "step": 7215 + }, + { + "epoch": 0.8192116251760557, + "grad_norm": 0.26953125, + "learning_rate": 0.0019640385893303705, + "loss": 5.2222, + "step": 7216 + }, + { + "epoch": 0.8193251522859748, + "grad_norm": 0.259765625, + "learning_rate": 0.0019635850455341857, + "loss": 5.228, + "step": 7217 + }, + { + "epoch": 0.8194386793958939, + "grad_norm": 0.255859375, + "learning_rate": 0.001963128718639069, + "loss": 5.229, + "step": 7218 + }, + { + "epoch": 0.819552206505813, + "grad_norm": 0.251953125, + "learning_rate": 0.0019626696101156384, + "loss": 5.2476, + "step": 7219 + }, + { + "epoch": 0.819665733615732, + "grad_norm": 0.248046875, + "learning_rate": 0.001962207721443475, + "loss": 5.2394, + "step": 7220 + }, + { + "epoch": 0.8197792607256511, + "grad_norm": 0.25, + "learning_rate": 0.00196174305411112, + "loss": 5.2235, + "step": 7221 + }, + { + "epoch": 0.8198927878355702, + "grad_norm": 0.25390625, + "learning_rate": 0.0019612756096160687, + "loss": 5.2381, + "step": 7222 + }, + { + "epoch": 0.8200063149454893, + "grad_norm": 0.267578125, + "learning_rate": 0.0019608053894647685, + "loss": 5.2386, + "step": 7223 + }, + { + "epoch": 0.8201198420554083, + "grad_norm": 0.271484375, + "learning_rate": 0.00196033239517261, + "loss": 5.2171, + "step": 7224 + }, + { + "epoch": 0.8202333691653274, + "grad_norm": 0.2890625, + "learning_rate": 0.0019598566282639242, + "loss": 5.253, + "step": 7225 + }, + { + "epoch": 0.8203468962752465, + "grad_norm": 0.30859375, + "learning_rate": 0.001959378090271979, + "loss": 5.2352, + "step": 7226 + }, + { + "epoch": 0.8204604233851656, + "grad_norm": 0.3046875, + "learning_rate": 0.0019588967827389703, + "loss": 5.233, + "step": 7227 + }, + { + "epoch": 0.8205739504950846, + "grad_norm": 0.296875, + "learning_rate": 0.001958412707216023, + "loss": 5.2228, + "step": 7228 + }, + { + "epoch": 0.8206874776050037, + "grad_norm": 0.263671875, + "learning_rate": 0.001957925865263179, + "loss": 5.2065, + "step": 7229 + }, + { + "epoch": 0.8208010047149228, + "grad_norm": 0.275390625, + "learning_rate": 0.0019574362584493968, + "loss": 5.2348, + "step": 7230 + }, + { + "epoch": 0.8209145318248419, + "grad_norm": 0.3046875, + "learning_rate": 0.0019569438883525465, + "loss": 5.2241, + "step": 7231 + }, + { + "epoch": 0.8210280589347609, + "grad_norm": 0.3515625, + "learning_rate": 0.001956448756559402, + "loss": 5.2316, + "step": 7232 + }, + { + "epoch": 0.82114158604468, + "grad_norm": 0.396484375, + "learning_rate": 0.0019559508646656383, + "loss": 5.2187, + "step": 7233 + }, + { + "epoch": 0.8212551131545991, + "grad_norm": 0.412109375, + "learning_rate": 0.0019554502142758246, + "loss": 5.242, + "step": 7234 + }, + { + "epoch": 0.8213686402645182, + "grad_norm": 0.37109375, + "learning_rate": 0.001954946807003421, + "loss": 5.2443, + "step": 7235 + }, + { + "epoch": 0.8214821673744372, + "grad_norm": 0.345703125, + "learning_rate": 0.0019544406444707715, + "loss": 5.2319, + "step": 7236 + }, + { + "epoch": 0.8215956944843563, + "grad_norm": 0.328125, + "learning_rate": 0.0019539317283091, + "loss": 5.2334, + "step": 7237 + }, + { + "epoch": 0.8217092215942754, + "grad_norm": 0.302734375, + "learning_rate": 0.0019534200601585036, + "loss": 5.2381, + "step": 7238 + }, + { + "epoch": 0.8218227487041945, + "grad_norm": 0.287109375, + "learning_rate": 0.0019529056416679506, + "loss": 5.243, + "step": 7239 + }, + { + "epoch": 0.8219362758141135, + "grad_norm": 0.28515625, + "learning_rate": 0.0019523884744952701, + "loss": 5.2346, + "step": 7240 + }, + { + "epoch": 0.8220498029240326, + "grad_norm": 0.28125, + "learning_rate": 0.0019518685603071512, + "loss": 5.2412, + "step": 7241 + }, + { + "epoch": 0.8221633300339517, + "grad_norm": 0.28515625, + "learning_rate": 0.0019513459007791355, + "loss": 5.2575, + "step": 7242 + }, + { + "epoch": 0.8222768571438708, + "grad_norm": 0.271484375, + "learning_rate": 0.0019508204975956119, + "loss": 5.2204, + "step": 7243 + }, + { + "epoch": 0.8223903842537899, + "grad_norm": 0.2734375, + "learning_rate": 0.001950292352449812, + "loss": 5.2482, + "step": 7244 + }, + { + "epoch": 0.8225039113637089, + "grad_norm": 0.255859375, + "learning_rate": 0.0019497614670438027, + "loss": 5.2401, + "step": 7245 + }, + { + "epoch": 0.822617438473628, + "grad_norm": 0.27734375, + "learning_rate": 0.0019492278430884833, + "loss": 5.2485, + "step": 7246 + }, + { + "epoch": 0.8227309655835471, + "grad_norm": 0.265625, + "learning_rate": 0.0019486914823035782, + "loss": 5.2242, + "step": 7247 + }, + { + "epoch": 0.8228444926934662, + "grad_norm": 0.30078125, + "learning_rate": 0.0019481523864176313, + "loss": 5.2347, + "step": 7248 + }, + { + "epoch": 0.8229580198033852, + "grad_norm": 0.28125, + "learning_rate": 0.001947610557168002, + "loss": 5.2267, + "step": 7249 + }, + { + "epoch": 0.8230715469133043, + "grad_norm": 0.28125, + "learning_rate": 0.0019470659963008582, + "loss": 5.231, + "step": 7250 + }, + { + "epoch": 0.8231850740232234, + "grad_norm": 0.283203125, + "learning_rate": 0.0019465187055711707, + "loss": 5.2222, + "step": 7251 + }, + { + "epoch": 0.8232986011331425, + "grad_norm": 0.26171875, + "learning_rate": 0.001945968686742708, + "loss": 5.2463, + "step": 7252 + }, + { + "epoch": 0.8234121282430615, + "grad_norm": 0.2734375, + "learning_rate": 0.0019454159415880313, + "loss": 5.2299, + "step": 7253 + }, + { + "epoch": 0.8235256553529806, + "grad_norm": 0.26171875, + "learning_rate": 0.0019448604718884867, + "loss": 5.2249, + "step": 7254 + }, + { + "epoch": 0.8236391824628997, + "grad_norm": 0.283203125, + "learning_rate": 0.0019443022794342019, + "loss": 5.2376, + "step": 7255 + }, + { + "epoch": 0.8237527095728188, + "grad_norm": 0.302734375, + "learning_rate": 0.0019437413660240786, + "loss": 5.2104, + "step": 7256 + }, + { + "epoch": 0.8238662366827378, + "grad_norm": 0.34375, + "learning_rate": 0.001943177733465788, + "loss": 5.2189, + "step": 7257 + }, + { + "epoch": 0.8239797637926569, + "grad_norm": 0.37890625, + "learning_rate": 0.0019426113835757635, + "loss": 5.2393, + "step": 7258 + }, + { + "epoch": 0.824093290902576, + "grad_norm": 0.400390625, + "learning_rate": 0.0019420423181791967, + "loss": 5.2351, + "step": 7259 + }, + { + "epoch": 0.8242068180124951, + "grad_norm": 0.396484375, + "learning_rate": 0.0019414705391100299, + "loss": 5.2582, + "step": 7260 + }, + { + "epoch": 0.8243203451224141, + "grad_norm": 0.431640625, + "learning_rate": 0.001940896048210951, + "loss": 5.2146, + "step": 7261 + }, + { + "epoch": 0.8244338722323332, + "grad_norm": 0.38671875, + "learning_rate": 0.0019403188473333872, + "loss": 5.2421, + "step": 7262 + }, + { + "epoch": 0.8245473993422523, + "grad_norm": 0.439453125, + "learning_rate": 0.0019397389383375, + "loss": 5.2155, + "step": 7263 + }, + { + "epoch": 0.8246609264521714, + "grad_norm": 0.39453125, + "learning_rate": 0.001939156323092177, + "loss": 5.2283, + "step": 7264 + }, + { + "epoch": 0.8247744535620904, + "grad_norm": 0.380859375, + "learning_rate": 0.0019385710034750287, + "loss": 5.2023, + "step": 7265 + }, + { + "epoch": 0.8248879806720095, + "grad_norm": 0.37890625, + "learning_rate": 0.0019379829813723807, + "loss": 5.2374, + "step": 7266 + }, + { + "epoch": 0.8250015077819286, + "grad_norm": 0.36328125, + "learning_rate": 0.0019373922586792673, + "loss": 5.2174, + "step": 7267 + }, + { + "epoch": 0.8251150348918477, + "grad_norm": 0.318359375, + "learning_rate": 0.0019367988372994264, + "loss": 5.2171, + "step": 7268 + }, + { + "epoch": 0.8252285620017668, + "grad_norm": 0.3203125, + "learning_rate": 0.0019362027191452936, + "loss": 5.2411, + "step": 7269 + }, + { + "epoch": 0.8253420891116858, + "grad_norm": 0.28125, + "learning_rate": 0.0019356039061379949, + "loss": 5.2315, + "step": 7270 + }, + { + "epoch": 0.8254556162216049, + "grad_norm": 0.2890625, + "learning_rate": 0.0019350024002073411, + "loss": 5.2372, + "step": 7271 + }, + { + "epoch": 0.825569143331524, + "grad_norm": 0.26171875, + "learning_rate": 0.0019343982032918217, + "loss": 5.2191, + "step": 7272 + }, + { + "epoch": 0.8256826704414431, + "grad_norm": 0.26953125, + "learning_rate": 0.0019337913173385988, + "loss": 5.2386, + "step": 7273 + }, + { + "epoch": 0.8257961975513621, + "grad_norm": 0.263671875, + "learning_rate": 0.0019331817443034999, + "loss": 5.2264, + "step": 7274 + }, + { + "epoch": 0.8259097246612812, + "grad_norm": 0.25390625, + "learning_rate": 0.0019325694861510121, + "loss": 5.2239, + "step": 7275 + }, + { + "epoch": 0.8260232517712003, + "grad_norm": 0.267578125, + "learning_rate": 0.0019319545448542772, + "loss": 5.2328, + "step": 7276 + }, + { + "epoch": 0.8261367788811194, + "grad_norm": 0.27734375, + "learning_rate": 0.001931336922395083, + "loss": 5.228, + "step": 7277 + }, + { + "epoch": 0.8262503059910384, + "grad_norm": 0.3046875, + "learning_rate": 0.0019307166207638577, + "loss": 5.2125, + "step": 7278 + }, + { + "epoch": 0.8263638331009575, + "grad_norm": 0.314453125, + "learning_rate": 0.0019300936419596644, + "loss": 5.2516, + "step": 7279 + }, + { + "epoch": 0.8264773602108766, + "grad_norm": 0.298828125, + "learning_rate": 0.0019294679879901938, + "loss": 5.2344, + "step": 7280 + }, + { + "epoch": 0.8265908873207957, + "grad_norm": 0.275390625, + "learning_rate": 0.0019288396608717578, + "loss": 5.2307, + "step": 7281 + }, + { + "epoch": 0.8267044144307147, + "grad_norm": 0.255859375, + "learning_rate": 0.0019282086626292833, + "loss": 5.2337, + "step": 7282 + }, + { + "epoch": 0.8268179415406338, + "grad_norm": 0.2578125, + "learning_rate": 0.0019275749952963057, + "loss": 5.2255, + "step": 7283 + }, + { + "epoch": 0.8269314686505529, + "grad_norm": 0.25390625, + "learning_rate": 0.0019269386609149612, + "loss": 5.2403, + "step": 7284 + }, + { + "epoch": 0.827044995760472, + "grad_norm": 0.275390625, + "learning_rate": 0.001926299661535983, + "loss": 5.2156, + "step": 7285 + }, + { + "epoch": 0.827158522870391, + "grad_norm": 0.27734375, + "learning_rate": 0.0019256579992186905, + "loss": 5.2405, + "step": 7286 + }, + { + "epoch": 0.8272720499803101, + "grad_norm": 0.29296875, + "learning_rate": 0.0019250136760309868, + "loss": 5.2254, + "step": 7287 + }, + { + "epoch": 0.8273855770902292, + "grad_norm": 0.287109375, + "learning_rate": 0.0019243666940493497, + "loss": 5.2372, + "step": 7288 + }, + { + "epoch": 0.8274991042001483, + "grad_norm": 0.279296875, + "learning_rate": 0.001923717055358826, + "loss": 5.2192, + "step": 7289 + }, + { + "epoch": 0.8276126313100673, + "grad_norm": 0.263671875, + "learning_rate": 0.0019230647620530228, + "loss": 5.2281, + "step": 7290 + }, + { + "epoch": 0.8277261584199864, + "grad_norm": 0.255859375, + "learning_rate": 0.0019224098162341043, + "loss": 5.2192, + "step": 7291 + }, + { + "epoch": 0.8278396855299055, + "grad_norm": 0.255859375, + "learning_rate": 0.0019217522200127822, + "loss": 5.2353, + "step": 7292 + }, + { + "epoch": 0.8279532126398246, + "grad_norm": 0.2578125, + "learning_rate": 0.0019210919755083089, + "loss": 5.2337, + "step": 7293 + }, + { + "epoch": 0.8280667397497437, + "grad_norm": 0.2578125, + "learning_rate": 0.0019204290848484728, + "loss": 5.236, + "step": 7294 + }, + { + "epoch": 0.8281802668596627, + "grad_norm": 0.24609375, + "learning_rate": 0.00191976355016959, + "loss": 5.2139, + "step": 7295 + }, + { + "epoch": 0.8282937939695818, + "grad_norm": 0.259765625, + "learning_rate": 0.0019190953736164963, + "loss": 5.2483, + "step": 7296 + }, + { + "epoch": 0.8284073210795009, + "grad_norm": 0.26953125, + "learning_rate": 0.0019184245573425425, + "loss": 5.2305, + "step": 7297 + }, + { + "epoch": 0.82852084818942, + "grad_norm": 0.30859375, + "learning_rate": 0.001917751103509587, + "loss": 5.2192, + "step": 7298 + }, + { + "epoch": 0.828634375299339, + "grad_norm": 0.32421875, + "learning_rate": 0.0019170750142879873, + "loss": 5.203, + "step": 7299 + }, + { + "epoch": 0.8287479024092581, + "grad_norm": 0.359375, + "learning_rate": 0.0019163962918565946, + "loss": 5.223, + "step": 7300 + }, + { + "epoch": 0.8288614295191772, + "grad_norm": 0.373046875, + "learning_rate": 0.0019157149384027457, + "loss": 5.2191, + "step": 7301 + }, + { + "epoch": 0.8289749566290963, + "grad_norm": 0.388671875, + "learning_rate": 0.0019150309561222574, + "loss": 5.2087, + "step": 7302 + }, + { + "epoch": 0.8290884837390153, + "grad_norm": 0.380859375, + "learning_rate": 0.0019143443472194178, + "loss": 5.2119, + "step": 7303 + }, + { + "epoch": 0.8292020108489344, + "grad_norm": 0.37890625, + "learning_rate": 0.0019136551139069793, + "loss": 5.2168, + "step": 7304 + }, + { + "epoch": 0.8293155379588535, + "grad_norm": 0.34375, + "learning_rate": 0.0019129632584061538, + "loss": 5.2104, + "step": 7305 + }, + { + "epoch": 0.8294290650687726, + "grad_norm": 0.33203125, + "learning_rate": 0.0019122687829466023, + "loss": 5.2178, + "step": 7306 + }, + { + "epoch": 0.8295425921786916, + "grad_norm": 0.310546875, + "learning_rate": 0.0019115716897664299, + "loss": 5.2493, + "step": 7307 + }, + { + "epoch": 0.8296561192886107, + "grad_norm": 0.310546875, + "learning_rate": 0.0019108719811121771, + "loss": 5.2356, + "step": 7308 + }, + { + "epoch": 0.8297696463985298, + "grad_norm": 0.28515625, + "learning_rate": 0.0019101696592388148, + "loss": 5.201, + "step": 7309 + }, + { + "epoch": 0.8298831735084489, + "grad_norm": 0.2734375, + "learning_rate": 0.001909464726409734, + "loss": 5.219, + "step": 7310 + }, + { + "epoch": 0.8299967006183679, + "grad_norm": 0.25390625, + "learning_rate": 0.001908757184896741, + "loss": 5.2147, + "step": 7311 + }, + { + "epoch": 0.830110227728287, + "grad_norm": 0.2490234375, + "learning_rate": 0.0019080470369800494, + "loss": 5.2397, + "step": 7312 + }, + { + "epoch": 0.8302237548382061, + "grad_norm": 0.2421875, + "learning_rate": 0.0019073342849482717, + "loss": 5.2212, + "step": 7313 + }, + { + "epoch": 0.8303372819481252, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019066189310984135, + "loss": 5.2484, + "step": 7314 + }, + { + "epoch": 0.8304508090580442, + "grad_norm": 0.2578125, + "learning_rate": 0.001905900977735865, + "loss": 5.2294, + "step": 7315 + }, + { + "epoch": 0.8305643361679633, + "grad_norm": 0.25390625, + "learning_rate": 0.0019051804271743935, + "loss": 5.2219, + "step": 7316 + }, + { + "epoch": 0.8306778632778824, + "grad_norm": 0.255859375, + "learning_rate": 0.0019044572817361375, + "loss": 5.2292, + "step": 7317 + }, + { + "epoch": 0.8307913903878015, + "grad_norm": 0.263671875, + "learning_rate": 0.001903731543751597, + "loss": 5.2066, + "step": 7318 + }, + { + "epoch": 0.8309049174977206, + "grad_norm": 0.27734375, + "learning_rate": 0.0019030032155596276, + "loss": 5.2262, + "step": 7319 + }, + { + "epoch": 0.8310184446076396, + "grad_norm": 0.27734375, + "learning_rate": 0.0019022722995074319, + "loss": 5.2266, + "step": 7320 + }, + { + "epoch": 0.8311319717175587, + "grad_norm": 0.271484375, + "learning_rate": 0.0019015387979505536, + "loss": 5.245, + "step": 7321 + }, + { + "epoch": 0.8312454988274778, + "grad_norm": 0.26953125, + "learning_rate": 0.0019008027132528676, + "loss": 5.2294, + "step": 7322 + }, + { + "epoch": 0.8313590259373969, + "grad_norm": 0.294921875, + "learning_rate": 0.0019000640477865739, + "loss": 5.2244, + "step": 7323 + }, + { + "epoch": 0.8314725530473159, + "grad_norm": 0.31640625, + "learning_rate": 0.00189932280393219, + "loss": 5.239, + "step": 7324 + }, + { + "epoch": 0.831586080157235, + "grad_norm": 0.34375, + "learning_rate": 0.0018985789840785426, + "loss": 5.2362, + "step": 7325 + }, + { + "epoch": 0.8316996072671541, + "grad_norm": 0.34375, + "learning_rate": 0.0018978325906227598, + "loss": 5.2193, + "step": 7326 + }, + { + "epoch": 0.8318131343770732, + "grad_norm": 0.357421875, + "learning_rate": 0.0018970836259702643, + "loss": 5.2269, + "step": 7327 + }, + { + "epoch": 0.8319266614869922, + "grad_norm": 0.34765625, + "learning_rate": 0.0018963320925347647, + "loss": 5.2273, + "step": 7328 + }, + { + "epoch": 0.8320401885969113, + "grad_norm": 0.353515625, + "learning_rate": 0.001895577992738248, + "loss": 5.2182, + "step": 7329 + }, + { + "epoch": 0.8321537157068304, + "grad_norm": 0.3203125, + "learning_rate": 0.0018948213290109722, + "loss": 5.2334, + "step": 7330 + }, + { + "epoch": 0.8322672428167495, + "grad_norm": 0.3125, + "learning_rate": 0.001894062103791458, + "loss": 5.225, + "step": 7331 + }, + { + "epoch": 0.8323807699266685, + "grad_norm": 0.30078125, + "learning_rate": 0.001893300319526481, + "loss": 5.2291, + "step": 7332 + }, + { + "epoch": 0.8324942970365876, + "grad_norm": 0.296875, + "learning_rate": 0.0018925359786710642, + "loss": 5.2212, + "step": 7333 + }, + { + "epoch": 0.8326078241465067, + "grad_norm": 0.271484375, + "learning_rate": 0.0018917690836884693, + "loss": 5.2243, + "step": 7334 + }, + { + "epoch": 0.8327213512564258, + "grad_norm": 0.27734375, + "learning_rate": 0.0018909996370501901, + "loss": 5.2375, + "step": 7335 + }, + { + "epoch": 0.8328348783663448, + "grad_norm": 0.251953125, + "learning_rate": 0.0018902276412359423, + "loss": 5.2317, + "step": 7336 + }, + { + "epoch": 0.8329484054762639, + "grad_norm": 0.259765625, + "learning_rate": 0.0018894530987336585, + "loss": 5.2565, + "step": 7337 + }, + { + "epoch": 0.833061932586183, + "grad_norm": 0.259765625, + "learning_rate": 0.0018886760120394772, + "loss": 5.2575, + "step": 7338 + }, + { + "epoch": 0.8331754596961021, + "grad_norm": 0.267578125, + "learning_rate": 0.0018878963836577373, + "loss": 5.2332, + "step": 7339 + }, + { + "epoch": 0.8332889868060211, + "grad_norm": 0.275390625, + "learning_rate": 0.0018871142161009677, + "loss": 5.2165, + "step": 7340 + }, + { + "epoch": 0.8334025139159402, + "grad_norm": 0.294921875, + "learning_rate": 0.0018863295118898816, + "loss": 5.2251, + "step": 7341 + }, + { + "epoch": 0.8335160410258593, + "grad_norm": 0.296875, + "learning_rate": 0.0018855422735533662, + "loss": 5.2341, + "step": 7342 + }, + { + "epoch": 0.8336295681357784, + "grad_norm": 0.298828125, + "learning_rate": 0.0018847525036284761, + "loss": 5.2221, + "step": 7343 + }, + { + "epoch": 0.8337430952456975, + "grad_norm": 0.28515625, + "learning_rate": 0.0018839602046604239, + "loss": 5.2277, + "step": 7344 + }, + { + "epoch": 0.8338566223556165, + "grad_norm": 0.279296875, + "learning_rate": 0.0018831653792025732, + "loss": 5.2105, + "step": 7345 + }, + { + "epoch": 0.8339701494655356, + "grad_norm": 0.271484375, + "learning_rate": 0.0018823680298164294, + "loss": 5.2287, + "step": 7346 + }, + { + "epoch": 0.8340836765754547, + "grad_norm": 0.28125, + "learning_rate": 0.0018815681590716319, + "loss": 5.2431, + "step": 7347 + }, + { + "epoch": 0.8341972036853738, + "grad_norm": 0.302734375, + "learning_rate": 0.001880765769545946, + "loss": 5.2252, + "step": 7348 + }, + { + "epoch": 0.8343107307952928, + "grad_norm": 0.349609375, + "learning_rate": 0.001879960863825254, + "loss": 5.2205, + "step": 7349 + }, + { + "epoch": 0.8344242579052119, + "grad_norm": 0.375, + "learning_rate": 0.0018791534445035472, + "loss": 5.232, + "step": 7350 + }, + { + "epoch": 0.834537785015131, + "grad_norm": 0.3828125, + "learning_rate": 0.0018783435141829187, + "loss": 5.2107, + "step": 7351 + }, + { + "epoch": 0.8346513121250501, + "grad_norm": 0.361328125, + "learning_rate": 0.0018775310754735517, + "loss": 5.201, + "step": 7352 + }, + { + "epoch": 0.8347648392349691, + "grad_norm": 0.337890625, + "learning_rate": 0.0018767161309937153, + "loss": 5.2314, + "step": 7353 + }, + { + "epoch": 0.8348783663448882, + "grad_norm": 0.357421875, + "learning_rate": 0.0018758986833697527, + "loss": 5.2384, + "step": 7354 + }, + { + "epoch": 0.8349918934548073, + "grad_norm": 0.33984375, + "learning_rate": 0.0018750787352360746, + "loss": 5.2412, + "step": 7355 + }, + { + "epoch": 0.8351054205647264, + "grad_norm": 0.369140625, + "learning_rate": 0.0018742562892351503, + "loss": 5.2106, + "step": 7356 + }, + { + "epoch": 0.8352189476746454, + "grad_norm": 0.330078125, + "learning_rate": 0.0018734313480174983, + "loss": 5.2153, + "step": 7357 + }, + { + "epoch": 0.8353324747845645, + "grad_norm": 0.33984375, + "learning_rate": 0.0018726039142416796, + "loss": 5.2318, + "step": 7358 + }, + { + "epoch": 0.8354460018944836, + "grad_norm": 0.32421875, + "learning_rate": 0.0018717739905742873, + "loss": 5.2403, + "step": 7359 + }, + { + "epoch": 0.8355595290044027, + "grad_norm": 0.322265625, + "learning_rate": 0.0018709415796899383, + "loss": 5.2338, + "step": 7360 + }, + { + "epoch": 0.8356730561143219, + "grad_norm": 0.30078125, + "learning_rate": 0.0018701066842712667, + "loss": 5.2164, + "step": 7361 + }, + { + "epoch": 0.8357865832242409, + "grad_norm": 0.29296875, + "learning_rate": 0.001869269307008912, + "loss": 5.2091, + "step": 7362 + }, + { + "epoch": 0.83590011033416, + "grad_norm": 0.26171875, + "learning_rate": 0.0018684294506015125, + "loss": 5.2272, + "step": 7363 + }, + { + "epoch": 0.8360136374440791, + "grad_norm": 0.2578125, + "learning_rate": 0.0018675871177556967, + "loss": 5.2254, + "step": 7364 + }, + { + "epoch": 0.8361271645539982, + "grad_norm": 0.267578125, + "learning_rate": 0.001866742311186073, + "loss": 5.2338, + "step": 7365 + }, + { + "epoch": 0.8362406916639172, + "grad_norm": 0.27734375, + "learning_rate": 0.0018658950336152228, + "loss": 5.2252, + "step": 7366 + }, + { + "epoch": 0.8363542187738363, + "grad_norm": 0.314453125, + "learning_rate": 0.0018650452877736901, + "loss": 5.2021, + "step": 7367 + }, + { + "epoch": 0.8364677458837554, + "grad_norm": 0.333984375, + "learning_rate": 0.0018641930763999743, + "loss": 5.2147, + "step": 7368 + }, + { + "epoch": 0.8365812729936745, + "grad_norm": 0.3515625, + "learning_rate": 0.0018633384022405197, + "loss": 5.2393, + "step": 7369 + }, + { + "epoch": 0.8366948001035935, + "grad_norm": 0.3515625, + "learning_rate": 0.0018624812680497081, + "loss": 5.2228, + "step": 7370 + }, + { + "epoch": 0.8368083272135126, + "grad_norm": 0.365234375, + "learning_rate": 0.001861621676589849, + "loss": 5.2268, + "step": 7371 + }, + { + "epoch": 0.8369218543234317, + "grad_norm": 0.341796875, + "learning_rate": 0.0018607596306311712, + "loss": 5.2217, + "step": 7372 + }, + { + "epoch": 0.8370353814333508, + "grad_norm": 0.35546875, + "learning_rate": 0.0018598951329518137, + "loss": 5.2215, + "step": 7373 + }, + { + "epoch": 0.8371489085432698, + "grad_norm": 0.33203125, + "learning_rate": 0.0018590281863378163, + "loss": 5.2059, + "step": 7374 + }, + { + "epoch": 0.8372624356531889, + "grad_norm": 0.34375, + "learning_rate": 0.0018581587935831113, + "loss": 5.2297, + "step": 7375 + }, + { + "epoch": 0.837375962763108, + "grad_norm": 0.328125, + "learning_rate": 0.0018572869574895144, + "loss": 5.2313, + "step": 7376 + }, + { + "epoch": 0.8374894898730271, + "grad_norm": 0.337890625, + "learning_rate": 0.0018564126808667156, + "loss": 5.2385, + "step": 7377 + }, + { + "epoch": 0.8376030169829461, + "grad_norm": 0.330078125, + "learning_rate": 0.0018555359665322695, + "loss": 5.2205, + "step": 7378 + }, + { + "epoch": 0.8377165440928652, + "grad_norm": 0.310546875, + "learning_rate": 0.0018546568173115874, + "loss": 5.239, + "step": 7379 + }, + { + "epoch": 0.8378300712027843, + "grad_norm": 0.2734375, + "learning_rate": 0.0018537752360379277, + "loss": 5.225, + "step": 7380 + }, + { + "epoch": 0.8379435983127034, + "grad_norm": 0.255859375, + "learning_rate": 0.0018528912255523855, + "loss": 5.2082, + "step": 7381 + }, + { + "epoch": 0.8380571254226225, + "grad_norm": 0.236328125, + "learning_rate": 0.0018520047887038858, + "loss": 5.2277, + "step": 7382 + }, + { + "epoch": 0.8381706525325415, + "grad_norm": 0.234375, + "learning_rate": 0.0018511159283491722, + "loss": 5.2383, + "step": 7383 + }, + { + "epoch": 0.8382841796424606, + "grad_norm": 0.24609375, + "learning_rate": 0.0018502246473528, + "loss": 5.2243, + "step": 7384 + }, + { + "epoch": 0.8383977067523797, + "grad_norm": 0.259765625, + "learning_rate": 0.001849330948587124, + "loss": 5.2148, + "step": 7385 + }, + { + "epoch": 0.8385112338622988, + "grad_norm": 0.27734375, + "learning_rate": 0.001848434834932291, + "loss": 5.2117, + "step": 7386 + }, + { + "epoch": 0.8386247609722178, + "grad_norm": 0.275390625, + "learning_rate": 0.0018475363092762315, + "loss": 5.2222, + "step": 7387 + }, + { + "epoch": 0.8387382880821369, + "grad_norm": 0.28125, + "learning_rate": 0.0018466353745146481, + "loss": 5.2113, + "step": 7388 + }, + { + "epoch": 0.838851815192056, + "grad_norm": 0.28125, + "learning_rate": 0.0018457320335510078, + "loss": 5.2312, + "step": 7389 + }, + { + "epoch": 0.8389653423019751, + "grad_norm": 0.287109375, + "learning_rate": 0.0018448262892965322, + "loss": 5.2219, + "step": 7390 + }, + { + "epoch": 0.8390788694118941, + "grad_norm": 0.255859375, + "learning_rate": 0.0018439181446701875, + "loss": 5.2092, + "step": 7391 + }, + { + "epoch": 0.8391923965218132, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018430076025986763, + "loss": 5.217, + "step": 7392 + }, + { + "epoch": 0.8393059236317323, + "grad_norm": 0.251953125, + "learning_rate": 0.0018420946660164268, + "loss": 5.2323, + "step": 7393 + }, + { + "epoch": 0.8394194507416514, + "grad_norm": 0.2470703125, + "learning_rate": 0.0018411793378655845, + "loss": 5.2479, + "step": 7394 + }, + { + "epoch": 0.8395329778515704, + "grad_norm": 0.255859375, + "learning_rate": 0.0018402616210960031, + "loss": 5.1887, + "step": 7395 + }, + { + "epoch": 0.8396465049614895, + "grad_norm": 0.255859375, + "learning_rate": 0.0018393415186652323, + "loss": 5.2388, + "step": 7396 + }, + { + "epoch": 0.8397600320714086, + "grad_norm": 0.26171875, + "learning_rate": 0.0018384190335385116, + "loss": 5.2259, + "step": 7397 + }, + { + "epoch": 0.8398735591813277, + "grad_norm": 0.259765625, + "learning_rate": 0.0018374941686887588, + "loss": 5.2321, + "step": 7398 + }, + { + "epoch": 0.8399870862912467, + "grad_norm": 0.26953125, + "learning_rate": 0.0018365669270965607, + "loss": 5.2157, + "step": 7399 + }, + { + "epoch": 0.8401006134011658, + "grad_norm": 0.251953125, + "learning_rate": 0.001835637311750164, + "loss": 5.2224, + "step": 7400 + }, + { + "epoch": 0.8402141405110849, + "grad_norm": 0.263671875, + "learning_rate": 0.0018347053256454657, + "loss": 5.2302, + "step": 7401 + }, + { + "epoch": 0.840327667621004, + "grad_norm": 0.2431640625, + "learning_rate": 0.0018337709717860021, + "loss": 5.2177, + "step": 7402 + }, + { + "epoch": 0.840441194730923, + "grad_norm": 0.2734375, + "learning_rate": 0.0018328342531829407, + "loss": 5.227, + "step": 7403 + }, + { + "epoch": 0.8405547218408421, + "grad_norm": 0.283203125, + "learning_rate": 0.0018318951728550706, + "loss": 5.2082, + "step": 7404 + }, + { + "epoch": 0.8406682489507612, + "grad_norm": 0.326171875, + "learning_rate": 0.0018309537338287904, + "loss": 5.2236, + "step": 7405 + }, + { + "epoch": 0.8407817760606803, + "grad_norm": 0.345703125, + "learning_rate": 0.0018300099391381022, + "loss": 5.2196, + "step": 7406 + }, + { + "epoch": 0.8408953031705994, + "grad_norm": 0.33984375, + "learning_rate": 0.001829063791824598, + "loss": 5.2117, + "step": 7407 + }, + { + "epoch": 0.8410088302805184, + "grad_norm": 0.330078125, + "learning_rate": 0.0018281152949374527, + "loss": 5.2316, + "step": 7408 + }, + { + "epoch": 0.8411223573904375, + "grad_norm": 0.322265625, + "learning_rate": 0.001827164451533413, + "loss": 5.2309, + "step": 7409 + }, + { + "epoch": 0.8412358845003566, + "grad_norm": 0.318359375, + "learning_rate": 0.0018262112646767875, + "loss": 5.2474, + "step": 7410 + }, + { + "epoch": 0.8413494116102757, + "grad_norm": 0.28515625, + "learning_rate": 0.0018252557374394374, + "loss": 5.2165, + "step": 7411 + }, + { + "epoch": 0.8414629387201947, + "grad_norm": 0.2734375, + "learning_rate": 0.001824297872900766, + "loss": 5.2043, + "step": 7412 + }, + { + "epoch": 0.8415764658301138, + "grad_norm": 0.263671875, + "learning_rate": 0.0018233376741477098, + "loss": 5.2403, + "step": 7413 + }, + { + "epoch": 0.8416899929400329, + "grad_norm": 0.263671875, + "learning_rate": 0.0018223751442747271, + "loss": 5.2441, + "step": 7414 + }, + { + "epoch": 0.841803520049952, + "grad_norm": 0.26953125, + "learning_rate": 0.001821410286383789, + "loss": 5.2222, + "step": 7415 + }, + { + "epoch": 0.841917047159871, + "grad_norm": 0.279296875, + "learning_rate": 0.001820443103584369, + "loss": 5.2222, + "step": 7416 + }, + { + "epoch": 0.8420305742697901, + "grad_norm": 0.296875, + "learning_rate": 0.0018194735989934337, + "loss": 5.2188, + "step": 7417 + }, + { + "epoch": 0.8421441013797092, + "grad_norm": 0.294921875, + "learning_rate": 0.0018185017757354313, + "loss": 5.2177, + "step": 7418 + }, + { + "epoch": 0.8422576284896283, + "grad_norm": 0.302734375, + "learning_rate": 0.0018175276369422832, + "loss": 5.2055, + "step": 7419 + }, + { + "epoch": 0.8423711555995473, + "grad_norm": 0.302734375, + "learning_rate": 0.0018165511857533736, + "loss": 5.1936, + "step": 7420 + }, + { + "epoch": 0.8424846827094664, + "grad_norm": 0.3125, + "learning_rate": 0.0018155724253155368, + "loss": 5.2188, + "step": 7421 + }, + { + "epoch": 0.8425982098193855, + "grad_norm": 0.2734375, + "learning_rate": 0.0018145913587830518, + "loss": 5.2243, + "step": 7422 + }, + { + "epoch": 0.8427117369293046, + "grad_norm": 0.287109375, + "learning_rate": 0.0018136079893176279, + "loss": 5.2046, + "step": 7423 + }, + { + "epoch": 0.8428252640392236, + "grad_norm": 0.259765625, + "learning_rate": 0.001812622320088396, + "loss": 5.2183, + "step": 7424 + }, + { + "epoch": 0.8429387911491427, + "grad_norm": 0.2490234375, + "learning_rate": 0.0018116343542719001, + "loss": 5.1986, + "step": 7425 + }, + { + "epoch": 0.8430523182590618, + "grad_norm": 0.2392578125, + "learning_rate": 0.0018106440950520835, + "loss": 5.2239, + "step": 7426 + }, + { + "epoch": 0.8431658453689809, + "grad_norm": 0.2451171875, + "learning_rate": 0.0018096515456202818, + "loss": 5.2253, + "step": 7427 + }, + { + "epoch": 0.8432793724789, + "grad_norm": 0.251953125, + "learning_rate": 0.0018086567091752108, + "loss": 5.2263, + "step": 7428 + }, + { + "epoch": 0.843392899588819, + "grad_norm": 0.283203125, + "learning_rate": 0.001807659588922957, + "loss": 5.2151, + "step": 7429 + }, + { + "epoch": 0.8435064266987381, + "grad_norm": 0.30859375, + "learning_rate": 0.001806660188076967, + "loss": 5.2337, + "step": 7430 + }, + { + "epoch": 0.8436199538086572, + "grad_norm": 0.32421875, + "learning_rate": 0.001805658509858037, + "loss": 5.2317, + "step": 7431 + }, + { + "epoch": 0.8437334809185763, + "grad_norm": 0.345703125, + "learning_rate": 0.0018046545574943022, + "loss": 5.238, + "step": 7432 + }, + { + "epoch": 0.8438470080284953, + "grad_norm": 0.32421875, + "learning_rate": 0.0018036483342212268, + "loss": 5.2044, + "step": 7433 + }, + { + "epoch": 0.8439605351384144, + "grad_norm": 0.33984375, + "learning_rate": 0.0018026398432815947, + "loss": 5.2297, + "step": 7434 + }, + { + "epoch": 0.8440740622483335, + "grad_norm": 0.349609375, + "learning_rate": 0.0018016290879254966, + "loss": 5.2301, + "step": 7435 + }, + { + "epoch": 0.8441875893582526, + "grad_norm": 0.37109375, + "learning_rate": 0.0018006160714103213, + "loss": 5.2368, + "step": 7436 + }, + { + "epoch": 0.8443011164681716, + "grad_norm": 0.375, + "learning_rate": 0.001799600797000744, + "loss": 5.2175, + "step": 7437 + }, + { + "epoch": 0.8444146435780907, + "grad_norm": 0.3828125, + "learning_rate": 0.001798583267968718, + "loss": 5.2314, + "step": 7438 + }, + { + "epoch": 0.8445281706880098, + "grad_norm": 0.36328125, + "learning_rate": 0.0017975634875934605, + "loss": 5.205, + "step": 7439 + }, + { + "epoch": 0.8446416977979289, + "grad_norm": 0.38671875, + "learning_rate": 0.0017965414591614465, + "loss": 5.2173, + "step": 7440 + }, + { + "epoch": 0.8447552249078479, + "grad_norm": 0.359375, + "learning_rate": 0.0017955171859663941, + "loss": 5.2231, + "step": 7441 + }, + { + "epoch": 0.844868752017767, + "grad_norm": 0.34765625, + "learning_rate": 0.0017944906713092566, + "loss": 5.2094, + "step": 7442 + }, + { + "epoch": 0.8449822791276861, + "grad_norm": 0.296875, + "learning_rate": 0.0017934619184982103, + "loss": 5.2393, + "step": 7443 + }, + { + "epoch": 0.8450958062376052, + "grad_norm": 0.279296875, + "learning_rate": 0.001792430930848645, + "loss": 5.2216, + "step": 7444 + }, + { + "epoch": 0.8452093333475242, + "grad_norm": 0.2578125, + "learning_rate": 0.0017913977116831524, + "loss": 5.2167, + "step": 7445 + }, + { + "epoch": 0.8453228604574433, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017903622643315159, + "loss": 5.2135, + "step": 7446 + }, + { + "epoch": 0.8454363875673624, + "grad_norm": 0.26953125, + "learning_rate": 0.0017893245921307, + "loss": 5.2207, + "step": 7447 + }, + { + "epoch": 0.8455499146772815, + "grad_norm": 0.271484375, + "learning_rate": 0.0017882846984248386, + "loss": 5.2202, + "step": 7448 + }, + { + "epoch": 0.8456634417872005, + "grad_norm": 0.283203125, + "learning_rate": 0.0017872425865652257, + "loss": 5.203, + "step": 7449 + }, + { + "epoch": 0.8457769688971196, + "grad_norm": 0.265625, + "learning_rate": 0.001786198259910303, + "loss": 5.2125, + "step": 7450 + }, + { + "epoch": 0.8458904960070387, + "grad_norm": 0.26953125, + "learning_rate": 0.001785151721825651, + "loss": 5.2425, + "step": 7451 + }, + { + "epoch": 0.8460040231169578, + "grad_norm": 0.26953125, + "learning_rate": 0.0017841029756839758, + "loss": 5.2119, + "step": 7452 + }, + { + "epoch": 0.8461175502268768, + "grad_norm": 0.28515625, + "learning_rate": 0.0017830520248651, + "loss": 5.2316, + "step": 7453 + }, + { + "epoch": 0.8462310773367959, + "grad_norm": 0.275390625, + "learning_rate": 0.0017819988727559514, + "loss": 5.2465, + "step": 7454 + }, + { + "epoch": 0.846344604446715, + "grad_norm": 0.279296875, + "learning_rate": 0.0017809435227505522, + "loss": 5.2205, + "step": 7455 + }, + { + "epoch": 0.8464581315566341, + "grad_norm": 0.267578125, + "learning_rate": 0.001779885978250007, + "loss": 5.2195, + "step": 7456 + }, + { + "epoch": 0.8465716586665532, + "grad_norm": 0.267578125, + "learning_rate": 0.0017788262426624935, + "loss": 5.2111, + "step": 7457 + }, + { + "epoch": 0.8466851857764722, + "grad_norm": 0.248046875, + "learning_rate": 0.00177776431940325, + "loss": 5.2225, + "step": 7458 + }, + { + "epoch": 0.8467987128863913, + "grad_norm": 0.244140625, + "learning_rate": 0.0017767002118945652, + "loss": 5.219, + "step": 7459 + }, + { + "epoch": 0.8469122399963104, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017756339235657677, + "loss": 5.22, + "step": 7460 + }, + { + "epoch": 0.8470257671062295, + "grad_norm": 0.2373046875, + "learning_rate": 0.0017745654578532133, + "loss": 5.2136, + "step": 7461 + }, + { + "epoch": 0.8471392942161485, + "grad_norm": 0.236328125, + "learning_rate": 0.0017734948182002762, + "loss": 5.2108, + "step": 7462 + }, + { + "epoch": 0.8472528213260676, + "grad_norm": 0.263671875, + "learning_rate": 0.0017724220080573348, + "loss": 5.2412, + "step": 7463 + }, + { + "epoch": 0.8473663484359867, + "grad_norm": 0.279296875, + "learning_rate": 0.001771347030881764, + "loss": 5.2288, + "step": 7464 + }, + { + "epoch": 0.8474798755459058, + "grad_norm": 0.283203125, + "learning_rate": 0.0017702698901379218, + "loss": 5.2324, + "step": 7465 + }, + { + "epoch": 0.8475934026558248, + "grad_norm": 0.2890625, + "learning_rate": 0.001769190589297139, + "loss": 5.2034, + "step": 7466 + }, + { + "epoch": 0.8477069297657439, + "grad_norm": 0.302734375, + "learning_rate": 0.0017681091318377073, + "loss": 5.2271, + "step": 7467 + }, + { + "epoch": 0.847820456875663, + "grad_norm": 0.294921875, + "learning_rate": 0.0017670255212448693, + "loss": 5.2168, + "step": 7468 + }, + { + "epoch": 0.8479339839855821, + "grad_norm": 0.318359375, + "learning_rate": 0.001765939761010806, + "loss": 5.2402, + "step": 7469 + }, + { + "epoch": 0.8480475110955011, + "grad_norm": 0.294921875, + "learning_rate": 0.001764851854634627, + "loss": 5.2227, + "step": 7470 + }, + { + "epoch": 0.8481610382054202, + "grad_norm": 0.283203125, + "learning_rate": 0.0017637618056223566, + "loss": 5.2304, + "step": 7471 + }, + { + "epoch": 0.8482745653153393, + "grad_norm": 0.265625, + "learning_rate": 0.0017626696174869259, + "loss": 5.2111, + "step": 7472 + }, + { + "epoch": 0.8483880924252584, + "grad_norm": 0.26171875, + "learning_rate": 0.001761575293748159, + "loss": 5.2403, + "step": 7473 + }, + { + "epoch": 0.8485016195351774, + "grad_norm": 0.2431640625, + "learning_rate": 0.001760478837932763, + "loss": 5.2097, + "step": 7474 + }, + { + "epoch": 0.8486151466450965, + "grad_norm": 0.2353515625, + "learning_rate": 0.0017593802535743155, + "loss": 5.2026, + "step": 7475 + }, + { + "epoch": 0.8487286737550156, + "grad_norm": 0.255859375, + "learning_rate": 0.0017582795442132543, + "loss": 5.2282, + "step": 7476 + }, + { + "epoch": 0.8488422008649347, + "grad_norm": 0.240234375, + "learning_rate": 0.001757176713396865, + "loss": 5.2219, + "step": 7477 + }, + { + "epoch": 0.8489557279748537, + "grad_norm": 0.271484375, + "learning_rate": 0.0017560717646792703, + "loss": 5.2334, + "step": 7478 + }, + { + "epoch": 0.8490692550847728, + "grad_norm": 0.259765625, + "learning_rate": 0.0017549647016214186, + "loss": 5.2092, + "step": 7479 + }, + { + "epoch": 0.8491827821946919, + "grad_norm": 0.25390625, + "learning_rate": 0.0017538555277910725, + "loss": 5.2101, + "step": 7480 + }, + { + "epoch": 0.849296309304611, + "grad_norm": 0.259765625, + "learning_rate": 0.0017527442467627954, + "loss": 5.2334, + "step": 7481 + }, + { + "epoch": 0.84940983641453, + "grad_norm": 0.275390625, + "learning_rate": 0.0017516308621179438, + "loss": 5.2092, + "step": 7482 + }, + { + "epoch": 0.8495233635244491, + "grad_norm": 0.2734375, + "learning_rate": 0.0017505153774446526, + "loss": 5.219, + "step": 7483 + }, + { + "epoch": 0.8496368906343682, + "grad_norm": 0.298828125, + "learning_rate": 0.0017493977963378233, + "loss": 5.2115, + "step": 7484 + }, + { + "epoch": 0.8497504177442873, + "grad_norm": 0.2890625, + "learning_rate": 0.0017482781223991163, + "loss": 5.1933, + "step": 7485 + }, + { + "epoch": 0.8498639448542064, + "grad_norm": 0.310546875, + "learning_rate": 0.0017471563592369342, + "loss": 5.2172, + "step": 7486 + }, + { + "epoch": 0.8499774719641254, + "grad_norm": 0.314453125, + "learning_rate": 0.0017460325104664137, + "loss": 5.2017, + "step": 7487 + }, + { + "epoch": 0.8500909990740445, + "grad_norm": 0.30078125, + "learning_rate": 0.0017449065797094131, + "loss": 5.2281, + "step": 7488 + }, + { + "epoch": 0.8502045261839636, + "grad_norm": 0.283203125, + "learning_rate": 0.0017437785705944992, + "loss": 5.194, + "step": 7489 + }, + { + "epoch": 0.8503180532938827, + "grad_norm": 0.265625, + "learning_rate": 0.0017426484867569389, + "loss": 5.2077, + "step": 7490 + }, + { + "epoch": 0.8504315804038017, + "grad_norm": 0.24609375, + "learning_rate": 0.0017415163318386822, + "loss": 5.2195, + "step": 7491 + }, + { + "epoch": 0.8505451075137208, + "grad_norm": 0.244140625, + "learning_rate": 0.001740382109488357, + "loss": 5.2201, + "step": 7492 + }, + { + "epoch": 0.8506586346236399, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017392458233612513, + "loss": 5.2055, + "step": 7493 + }, + { + "epoch": 0.850772161733559, + "grad_norm": 0.2353515625, + "learning_rate": 0.0017381074771193059, + "loss": 5.217, + "step": 7494 + }, + { + "epoch": 0.850885688843478, + "grad_norm": 0.232421875, + "learning_rate": 0.0017369670744310998, + "loss": 5.2222, + "step": 7495 + }, + { + "epoch": 0.8509992159533971, + "grad_norm": 0.232421875, + "learning_rate": 0.00173582461897184, + "loss": 5.202, + "step": 7496 + }, + { + "epoch": 0.8511127430633162, + "grad_norm": 0.2314453125, + "learning_rate": 0.0017346801144233483, + "loss": 5.2117, + "step": 7497 + }, + { + "epoch": 0.8512262701732353, + "grad_norm": 0.224609375, + "learning_rate": 0.0017335335644740504, + "loss": 5.2163, + "step": 7498 + }, + { + "epoch": 0.8513397972831543, + "grad_norm": 0.2197265625, + "learning_rate": 0.0017323849728189645, + "loss": 5.2031, + "step": 7499 + }, + { + "epoch": 0.8514533243930734, + "grad_norm": 0.21875, + "learning_rate": 0.0017312343431596874, + "loss": 5.2112, + "step": 7500 + }, + { + "epoch": 0.8515668515029925, + "grad_norm": 0.22265625, + "learning_rate": 0.0017300816792043849, + "loss": 5.2134, + "step": 7501 + }, + { + "epoch": 0.8516803786129116, + "grad_norm": 0.2294921875, + "learning_rate": 0.0017289269846677779, + "loss": 5.1887, + "step": 7502 + }, + { + "epoch": 0.8517939057228306, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017277702632711323, + "loss": 5.2175, + "step": 7503 + }, + { + "epoch": 0.8519074328327497, + "grad_norm": 0.251953125, + "learning_rate": 0.0017266115187422452, + "loss": 5.2027, + "step": 7504 + }, + { + "epoch": 0.8520209599426688, + "grad_norm": 0.263671875, + "learning_rate": 0.001725450754815434, + "loss": 5.2239, + "step": 7505 + }, + { + "epoch": 0.8521344870525879, + "grad_norm": 0.267578125, + "learning_rate": 0.0017242879752315244, + "loss": 5.2257, + "step": 7506 + }, + { + "epoch": 0.852248014162507, + "grad_norm": 0.265625, + "learning_rate": 0.0017231231837378377, + "loss": 5.2303, + "step": 7507 + }, + { + "epoch": 0.852361541272426, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017219563840881783, + "loss": 5.1914, + "step": 7508 + }, + { + "epoch": 0.8524750683823451, + "grad_norm": 0.251953125, + "learning_rate": 0.0017207875800428235, + "loss": 5.2349, + "step": 7509 + }, + { + "epoch": 0.8525885954922642, + "grad_norm": 0.2275390625, + "learning_rate": 0.00171961677536851, + "loss": 5.2074, + "step": 7510 + }, + { + "epoch": 0.8527021226021833, + "grad_norm": 0.234375, + "learning_rate": 0.0017184439738384214, + "loss": 5.203, + "step": 7511 + }, + { + "epoch": 0.8528156497121023, + "grad_norm": 0.220703125, + "learning_rate": 0.0017172691792321773, + "loss": 5.219, + "step": 7512 + }, + { + "epoch": 0.8529291768220214, + "grad_norm": 0.2265625, + "learning_rate": 0.0017160923953358199, + "loss": 5.2137, + "step": 7513 + }, + { + "epoch": 0.8530427039319405, + "grad_norm": 0.21484375, + "learning_rate": 0.0017149136259418025, + "loss": 5.2038, + "step": 7514 + }, + { + "epoch": 0.8531562310418596, + "grad_norm": 0.2255859375, + "learning_rate": 0.001713732874848977, + "loss": 5.206, + "step": 7515 + }, + { + "epoch": 0.8532697581517786, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017125501458625826, + "loss": 5.2176, + "step": 7516 + }, + { + "epoch": 0.8533832852616977, + "grad_norm": 0.251953125, + "learning_rate": 0.0017113654427942317, + "loss": 5.2352, + "step": 7517 + }, + { + "epoch": 0.8534968123716168, + "grad_norm": 0.26953125, + "learning_rate": 0.0017101787694618988, + "loss": 5.225, + "step": 7518 + }, + { + "epoch": 0.8536103394815359, + "grad_norm": 0.287109375, + "learning_rate": 0.0017089901296899084, + "loss": 5.1893, + "step": 7519 + }, + { + "epoch": 0.8537238665914549, + "grad_norm": 0.287109375, + "learning_rate": 0.0017077995273089219, + "loss": 5.2056, + "step": 7520 + }, + { + "epoch": 0.853837393701374, + "grad_norm": 0.29296875, + "learning_rate": 0.0017066069661559266, + "loss": 5.2175, + "step": 7521 + }, + { + "epoch": 0.8539509208112931, + "grad_norm": 0.28515625, + "learning_rate": 0.0017054124500742204, + "loss": 5.2128, + "step": 7522 + }, + { + "epoch": 0.8540644479212122, + "grad_norm": 0.2734375, + "learning_rate": 0.0017042159829134037, + "loss": 5.2205, + "step": 7523 + }, + { + "epoch": 0.8541779750311312, + "grad_norm": 0.26171875, + "learning_rate": 0.001703017568529363, + "loss": 5.2286, + "step": 7524 + }, + { + "epoch": 0.8542915021410503, + "grad_norm": 0.25390625, + "learning_rate": 0.0017018172107842616, + "loss": 5.2186, + "step": 7525 + }, + { + "epoch": 0.8544050292509694, + "grad_norm": 0.2470703125, + "learning_rate": 0.0017006149135465238, + "loss": 5.2357, + "step": 7526 + }, + { + "epoch": 0.8545185563608885, + "grad_norm": 0.234375, + "learning_rate": 0.0016994106806908266, + "loss": 5.1987, + "step": 7527 + }, + { + "epoch": 0.8546320834708075, + "grad_norm": 0.2451171875, + "learning_rate": 0.0016982045160980831, + "loss": 5.2158, + "step": 7528 + }, + { + "epoch": 0.8547456105807266, + "grad_norm": 0.25, + "learning_rate": 0.0016969964236554329, + "loss": 5.2177, + "step": 7529 + }, + { + "epoch": 0.8548591376906457, + "grad_norm": 0.2578125, + "learning_rate": 0.0016957864072562286, + "loss": 5.2184, + "step": 7530 + }, + { + "epoch": 0.8549726648005648, + "grad_norm": 0.2392578125, + "learning_rate": 0.0016945744708000221, + "loss": 5.211, + "step": 7531 + }, + { + "epoch": 0.8550861919104839, + "grad_norm": 0.26171875, + "learning_rate": 0.0016933606181925547, + "loss": 5.2166, + "step": 7532 + }, + { + "epoch": 0.8551997190204029, + "grad_norm": 0.2412109375, + "learning_rate": 0.0016921448533457415, + "loss": 5.2279, + "step": 7533 + }, + { + "epoch": 0.855313246130322, + "grad_norm": 0.26171875, + "learning_rate": 0.0016909271801776608, + "loss": 5.2373, + "step": 7534 + }, + { + "epoch": 0.8554267732402411, + "grad_norm": 0.2451171875, + "learning_rate": 0.0016897076026125414, + "loss": 5.2019, + "step": 7535 + }, + { + "epoch": 0.8555403003501602, + "grad_norm": 0.2470703125, + "learning_rate": 0.0016884861245807486, + "loss": 5.2134, + "step": 7536 + }, + { + "epoch": 0.8556538274600792, + "grad_norm": 0.2265625, + "learning_rate": 0.0016872627500187727, + "loss": 5.2257, + "step": 7537 + }, + { + "epoch": 0.8557673545699983, + "grad_norm": 0.216796875, + "learning_rate": 0.0016860374828692162, + "loss": 5.2173, + "step": 7538 + }, + { + "epoch": 0.8558808816799174, + "grad_norm": 0.2109375, + "learning_rate": 0.0016848103270807808, + "loss": 5.2107, + "step": 7539 + }, + { + "epoch": 0.8559944087898365, + "grad_norm": 0.2060546875, + "learning_rate": 0.0016835812866082547, + "loss": 5.2267, + "step": 7540 + }, + { + "epoch": 0.8561079358997555, + "grad_norm": 0.2216796875, + "learning_rate": 0.0016823503654125002, + "loss": 5.2244, + "step": 7541 + }, + { + "epoch": 0.8562214630096746, + "grad_norm": 0.2255859375, + "learning_rate": 0.0016811175674604395, + "loss": 5.2292, + "step": 7542 + }, + { + "epoch": 0.8563349901195937, + "grad_norm": 0.26171875, + "learning_rate": 0.001679882896725045, + "loss": 5.2056, + "step": 7543 + }, + { + "epoch": 0.8564485172295128, + "grad_norm": 0.279296875, + "learning_rate": 0.001678646357185323, + "loss": 5.2114, + "step": 7544 + }, + { + "epoch": 0.8565620443394318, + "grad_norm": 0.318359375, + "learning_rate": 0.001677407952826303, + "loss": 5.2037, + "step": 7545 + }, + { + "epoch": 0.8566755714493509, + "grad_norm": 0.30078125, + "learning_rate": 0.0016761676876390246, + "loss": 5.2066, + "step": 7546 + }, + { + "epoch": 0.85678909855927, + "grad_norm": 0.279296875, + "learning_rate": 0.0016749255656205238, + "loss": 5.1757, + "step": 7547 + }, + { + "epoch": 0.8569026256691891, + "grad_norm": 0.283203125, + "learning_rate": 0.0016736815907738207, + "loss": 5.2106, + "step": 7548 + }, + { + "epoch": 0.8570161527791081, + "grad_norm": 0.283203125, + "learning_rate": 0.001672435767107907, + "loss": 5.2196, + "step": 7549 + }, + { + "epoch": 0.8571296798890272, + "grad_norm": 0.265625, + "learning_rate": 0.0016711880986377328, + "loss": 5.2458, + "step": 7550 + }, + { + "epoch": 0.8572432069989463, + "grad_norm": 0.265625, + "learning_rate": 0.0016699385893841924, + "loss": 5.214, + "step": 7551 + }, + { + "epoch": 0.8573567341088654, + "grad_norm": 0.251953125, + "learning_rate": 0.0016686872433741133, + "loss": 5.2182, + "step": 7552 + }, + { + "epoch": 0.8574702612187844, + "grad_norm": 0.26953125, + "learning_rate": 0.0016674340646402424, + "loss": 5.241, + "step": 7553 + }, + { + "epoch": 0.8575837883287035, + "grad_norm": 0.259765625, + "learning_rate": 0.0016661790572212328, + "loss": 5.1946, + "step": 7554 + }, + { + "epoch": 0.8576973154386226, + "grad_norm": 0.263671875, + "learning_rate": 0.0016649222251616305, + "loss": 5.2025, + "step": 7555 + }, + { + "epoch": 0.8578108425485417, + "grad_norm": 0.28125, + "learning_rate": 0.0016636635725118627, + "loss": 5.2113, + "step": 7556 + }, + { + "epoch": 0.8579243696584608, + "grad_norm": 0.28515625, + "learning_rate": 0.0016624031033282232, + "loss": 5.2039, + "step": 7557 + }, + { + "epoch": 0.8580378967683798, + "grad_norm": 0.2890625, + "learning_rate": 0.0016611408216728604, + "loss": 5.2319, + "step": 7558 + }, + { + "epoch": 0.8581514238782989, + "grad_norm": 0.283203125, + "learning_rate": 0.0016598767316137633, + "loss": 5.2122, + "step": 7559 + }, + { + "epoch": 0.858264950988218, + "grad_norm": 0.265625, + "learning_rate": 0.0016586108372247492, + "loss": 5.2349, + "step": 7560 + }, + { + "epoch": 0.858378478098137, + "grad_norm": 0.2490234375, + "learning_rate": 0.0016573431425854503, + "loss": 5.2317, + "step": 7561 + }, + { + "epoch": 0.8584920052080561, + "grad_norm": 0.255859375, + "learning_rate": 0.0016560736517813011, + "loss": 5.2297, + "step": 7562 + }, + { + "epoch": 0.8586055323179752, + "grad_norm": 0.2294921875, + "learning_rate": 0.0016548023689035229, + "loss": 5.2215, + "step": 7563 + }, + { + "epoch": 0.8587190594278943, + "grad_norm": 0.234375, + "learning_rate": 0.0016535292980491146, + "loss": 5.2162, + "step": 7564 + }, + { + "epoch": 0.8588325865378134, + "grad_norm": 0.22265625, + "learning_rate": 0.0016522544433208353, + "loss": 5.2133, + "step": 7565 + }, + { + "epoch": 0.8589461136477324, + "grad_norm": 0.212890625, + "learning_rate": 0.0016509778088271941, + "loss": 5.2195, + "step": 7566 + }, + { + "epoch": 0.8590596407576515, + "grad_norm": 0.20703125, + "learning_rate": 0.001649699398682436, + "loss": 5.2002, + "step": 7567 + }, + { + "epoch": 0.8591731678675706, + "grad_norm": 0.2119140625, + "learning_rate": 0.0016484192170065275, + "loss": 5.2216, + "step": 7568 + }, + { + "epoch": 0.8592866949774897, + "grad_norm": 0.2041015625, + "learning_rate": 0.001647137267925145, + "loss": 5.2125, + "step": 7569 + }, + { + "epoch": 0.8594002220874087, + "grad_norm": 0.2060546875, + "learning_rate": 0.0016458535555696602, + "loss": 5.2175, + "step": 7570 + }, + { + "epoch": 0.8595137491973278, + "grad_norm": 0.2099609375, + "learning_rate": 0.0016445680840771284, + "loss": 5.2155, + "step": 7571 + }, + { + "epoch": 0.8596272763072469, + "grad_norm": 0.208984375, + "learning_rate": 0.001643280857590273, + "loss": 5.2244, + "step": 7572 + }, + { + "epoch": 0.859740803417166, + "grad_norm": 0.216796875, + "learning_rate": 0.001641991880257474, + "loss": 5.2184, + "step": 7573 + }, + { + "epoch": 0.859854330527085, + "grad_norm": 0.232421875, + "learning_rate": 0.001640701156232753, + "loss": 5.2109, + "step": 7574 + }, + { + "epoch": 0.8599678576370041, + "grad_norm": 0.2314453125, + "learning_rate": 0.0016394086896757616, + "loss": 5.2202, + "step": 7575 + }, + { + "epoch": 0.8600813847469232, + "grad_norm": 0.244140625, + "learning_rate": 0.0016381144847517671, + "loss": 5.2075, + "step": 7576 + }, + { + "epoch": 0.8601949118568423, + "grad_norm": 0.2431640625, + "learning_rate": 0.0016368185456316381, + "loss": 5.2054, + "step": 7577 + }, + { + "epoch": 0.8603084389667613, + "grad_norm": 0.248046875, + "learning_rate": 0.0016355208764918334, + "loss": 5.1963, + "step": 7578 + }, + { + "epoch": 0.8604219660766804, + "grad_norm": 0.244140625, + "learning_rate": 0.001634221481514386, + "loss": 5.2361, + "step": 7579 + }, + { + "epoch": 0.8605354931865995, + "grad_norm": 0.255859375, + "learning_rate": 0.001632920364886892, + "loss": 5.224, + "step": 7580 + }, + { + "epoch": 0.8606490202965186, + "grad_norm": 0.248046875, + "learning_rate": 0.0016316175308024943, + "loss": 5.2148, + "step": 7581 + }, + { + "epoch": 0.8607625474064377, + "grad_norm": 0.255859375, + "learning_rate": 0.0016303129834598724, + "loss": 5.2131, + "step": 7582 + }, + { + "epoch": 0.8608760745163567, + "grad_norm": 0.27734375, + "learning_rate": 0.001629006727063226, + "loss": 5.2143, + "step": 7583 + }, + { + "epoch": 0.8609896016262758, + "grad_norm": 0.294921875, + "learning_rate": 0.001627698765822263, + "loss": 5.2074, + "step": 7584 + }, + { + "epoch": 0.8611031287361949, + "grad_norm": 0.298828125, + "learning_rate": 0.0016263891039521854, + "loss": 5.1969, + "step": 7585 + }, + { + "epoch": 0.861216655846114, + "grad_norm": 0.30078125, + "learning_rate": 0.0016250777456736762, + "loss": 5.207, + "step": 7586 + }, + { + "epoch": 0.861330182956033, + "grad_norm": 0.294921875, + "learning_rate": 0.0016237646952128855, + "loss": 5.1918, + "step": 7587 + }, + { + "epoch": 0.8614437100659521, + "grad_norm": 0.30078125, + "learning_rate": 0.001622449956801416, + "loss": 5.2103, + "step": 7588 + }, + { + "epoch": 0.8615572371758712, + "grad_norm": 0.2578125, + "learning_rate": 0.0016211335346763116, + "loss": 5.2023, + "step": 7589 + }, + { + "epoch": 0.8616707642857903, + "grad_norm": 0.26171875, + "learning_rate": 0.0016198154330800407, + "loss": 5.2133, + "step": 7590 + }, + { + "epoch": 0.8617842913957093, + "grad_norm": 0.232421875, + "learning_rate": 0.0016184956562604858, + "loss": 5.1942, + "step": 7591 + }, + { + "epoch": 0.8618978185056284, + "grad_norm": 0.232421875, + "learning_rate": 0.0016171742084709268, + "loss": 5.2057, + "step": 7592 + }, + { + "epoch": 0.8620113456155475, + "grad_norm": 0.2294921875, + "learning_rate": 0.00161585109397003, + "loss": 5.2038, + "step": 7593 + }, + { + "epoch": 0.8621248727254666, + "grad_norm": 0.236328125, + "learning_rate": 0.0016145263170218318, + "loss": 5.2123, + "step": 7594 + }, + { + "epoch": 0.8622383998353856, + "grad_norm": 0.2197265625, + "learning_rate": 0.0016131998818957269, + "loss": 5.1869, + "step": 7595 + }, + { + "epoch": 0.8623519269453047, + "grad_norm": 0.220703125, + "learning_rate": 0.0016118717928664537, + "loss": 5.1967, + "step": 7596 + }, + { + "epoch": 0.8624654540552238, + "grad_norm": 0.220703125, + "learning_rate": 0.0016105420542140809, + "loss": 5.2073, + "step": 7597 + }, + { + "epoch": 0.8625789811651429, + "grad_norm": 0.216796875, + "learning_rate": 0.001609210670223993, + "loss": 5.2152, + "step": 7598 + }, + { + "epoch": 0.8626925082750619, + "grad_norm": 0.220703125, + "learning_rate": 0.0016078776451868774, + "loss": 5.2186, + "step": 7599 + }, + { + "epoch": 0.862806035384981, + "grad_norm": 0.2197265625, + "learning_rate": 0.0016065429833987104, + "loss": 5.2069, + "step": 7600 + }, + { + "epoch": 0.8629195624949001, + "grad_norm": 0.2216796875, + "learning_rate": 0.001605206689160742, + "loss": 5.1865, + "step": 7601 + }, + { + "epoch": 0.8630330896048193, + "grad_norm": 0.2353515625, + "learning_rate": 0.0016038687667794846, + "loss": 5.2122, + "step": 7602 + }, + { + "epoch": 0.8631466167147384, + "grad_norm": 0.2275390625, + "learning_rate": 0.001602529220566696, + "loss": 5.2025, + "step": 7603 + }, + { + "epoch": 0.8632601438246574, + "grad_norm": 0.2255859375, + "learning_rate": 0.0016011880548393693, + "loss": 5.2108, + "step": 7604 + }, + { + "epoch": 0.8633736709345765, + "grad_norm": 0.2275390625, + "learning_rate": 0.0015998452739197145, + "loss": 5.19, + "step": 7605 + }, + { + "epoch": 0.8634871980444956, + "grad_norm": 0.2275390625, + "learning_rate": 0.0015985008821351489, + "loss": 5.2129, + "step": 7606 + }, + { + "epoch": 0.8636007251544147, + "grad_norm": 0.23046875, + "learning_rate": 0.0015971548838182793, + "loss": 5.2129, + "step": 7607 + }, + { + "epoch": 0.8637142522643337, + "grad_norm": 0.216796875, + "learning_rate": 0.0015958072833068924, + "loss": 5.2162, + "step": 7608 + }, + { + "epoch": 0.8638277793742528, + "grad_norm": 0.208984375, + "learning_rate": 0.0015944580849439358, + "loss": 5.2314, + "step": 7609 + }, + { + "epoch": 0.8639413064841719, + "grad_norm": 0.197265625, + "learning_rate": 0.0015931072930775076, + "loss": 5.21, + "step": 7610 + }, + { + "epoch": 0.864054833594091, + "grad_norm": 0.1904296875, + "learning_rate": 0.0015917549120608423, + "loss": 5.1931, + "step": 7611 + }, + { + "epoch": 0.86416836070401, + "grad_norm": 0.193359375, + "learning_rate": 0.001590400946252294, + "loss": 5.2007, + "step": 7612 + }, + { + "epoch": 0.8642818878139291, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015890454000153254, + "loss": 5.203, + "step": 7613 + }, + { + "epoch": 0.8643954149238482, + "grad_norm": 0.22265625, + "learning_rate": 0.0015876882777184915, + "loss": 5.2026, + "step": 7614 + }, + { + "epoch": 0.8645089420337673, + "grad_norm": 0.2158203125, + "learning_rate": 0.0015863295837354279, + "loss": 5.2039, + "step": 7615 + }, + { + "epoch": 0.8646224691436863, + "grad_norm": 0.2412109375, + "learning_rate": 0.001584969322444834, + "loss": 5.2212, + "step": 7616 + }, + { + "epoch": 0.8647359962536054, + "grad_norm": 0.236328125, + "learning_rate": 0.0015836074982304613, + "loss": 5.2077, + "step": 7617 + }, + { + "epoch": 0.8648495233635245, + "grad_norm": 0.25, + "learning_rate": 0.001582244115481097, + "loss": 5.2058, + "step": 7618 + }, + { + "epoch": 0.8649630504734436, + "grad_norm": 0.23828125, + "learning_rate": 0.001580879178590552, + "loss": 5.1931, + "step": 7619 + }, + { + "epoch": 0.8650765775833626, + "grad_norm": 0.244140625, + "learning_rate": 0.001579512691957645, + "loss": 5.218, + "step": 7620 + }, + { + "epoch": 0.8651901046932817, + "grad_norm": 0.2265625, + "learning_rate": 0.00157814465998619, + "loss": 5.2069, + "step": 7621 + }, + { + "epoch": 0.8653036318032008, + "grad_norm": 0.2275390625, + "learning_rate": 0.0015767750870849804, + "loss": 5.2055, + "step": 7622 + }, + { + "epoch": 0.8654171589131199, + "grad_norm": 0.216796875, + "learning_rate": 0.0015754039776677761, + "loss": 5.1989, + "step": 7623 + }, + { + "epoch": 0.865530686023039, + "grad_norm": 0.216796875, + "learning_rate": 0.0015740313361532882, + "loss": 5.2105, + "step": 7624 + }, + { + "epoch": 0.865644213132958, + "grad_norm": 0.21484375, + "learning_rate": 0.0015726571669651664, + "loss": 5.2033, + "step": 7625 + }, + { + "epoch": 0.8657577402428771, + "grad_norm": 0.2197265625, + "learning_rate": 0.0015712814745319822, + "loss": 5.1956, + "step": 7626 + }, + { + "epoch": 0.8658712673527962, + "grad_norm": 0.2236328125, + "learning_rate": 0.0015699042632872173, + "loss": 5.2051, + "step": 7627 + }, + { + "epoch": 0.8659847944627153, + "grad_norm": 0.228515625, + "learning_rate": 0.0015685255376692478, + "loss": 5.2271, + "step": 7628 + }, + { + "epoch": 0.8660983215726343, + "grad_norm": 0.23828125, + "learning_rate": 0.0015671453021213297, + "loss": 5.1899, + "step": 7629 + }, + { + "epoch": 0.8662118486825534, + "grad_norm": 0.2431640625, + "learning_rate": 0.0015657635610915861, + "loss": 5.2224, + "step": 7630 + }, + { + "epoch": 0.8663253757924725, + "grad_norm": 0.234375, + "learning_rate": 0.001564380319032991, + "loss": 5.1975, + "step": 7631 + }, + { + "epoch": 0.8664389029023916, + "grad_norm": 0.2451171875, + "learning_rate": 0.0015629955804033558, + "loss": 5.1879, + "step": 7632 + }, + { + "epoch": 0.8665524300123106, + "grad_norm": 0.2353515625, + "learning_rate": 0.0015616093496653156, + "loss": 5.223, + "step": 7633 + }, + { + "epoch": 0.8666659571222297, + "grad_norm": 0.25390625, + "learning_rate": 0.0015602216312863135, + "loss": 5.2054, + "step": 7634 + }, + { + "epoch": 0.8667794842321488, + "grad_norm": 0.25390625, + "learning_rate": 0.0015588324297385872, + "loss": 5.2198, + "step": 7635 + }, + { + "epoch": 0.8668930113420679, + "grad_norm": 0.2734375, + "learning_rate": 0.0015574417494991545, + "loss": 5.2062, + "step": 7636 + }, + { + "epoch": 0.8670065384519869, + "grad_norm": 0.265625, + "learning_rate": 0.0015560495950497977, + "loss": 5.1861, + "step": 7637 + }, + { + "epoch": 0.867120065561906, + "grad_norm": 0.275390625, + "learning_rate": 0.0015546559708770511, + "loss": 5.1992, + "step": 7638 + }, + { + "epoch": 0.8672335926718251, + "grad_norm": 0.27734375, + "learning_rate": 0.0015532608814721848, + "loss": 5.2023, + "step": 7639 + }, + { + "epoch": 0.8673471197817442, + "grad_norm": 0.28515625, + "learning_rate": 0.0015518643313311915, + "loss": 5.1679, + "step": 7640 + }, + { + "epoch": 0.8674606468916632, + "grad_norm": 0.267578125, + "learning_rate": 0.0015504663249547713, + "loss": 5.1957, + "step": 7641 + }, + { + "epoch": 0.8675741740015823, + "grad_norm": 0.283203125, + "learning_rate": 0.0015490668668483166, + "loss": 5.2118, + "step": 7642 + }, + { + "epoch": 0.8676877011115014, + "grad_norm": 0.26953125, + "learning_rate": 0.0015476659615218998, + "loss": 5.2051, + "step": 7643 + }, + { + "epoch": 0.8678012282214205, + "grad_norm": 0.267578125, + "learning_rate": 0.0015462636134902562, + "loss": 5.185, + "step": 7644 + }, + { + "epoch": 0.8679147553313395, + "grad_norm": 0.265625, + "learning_rate": 0.0015448598272727702, + "loss": 5.2109, + "step": 7645 + }, + { + "epoch": 0.8680282824412586, + "grad_norm": 0.265625, + "learning_rate": 0.0015434546073934622, + "loss": 5.2174, + "step": 7646 + }, + { + "epoch": 0.8681418095511777, + "grad_norm": 0.240234375, + "learning_rate": 0.0015420479583809728, + "loss": 5.1935, + "step": 7647 + }, + { + "epoch": 0.8682553366610968, + "grad_norm": 0.2294921875, + "learning_rate": 0.0015406398847685472, + "loss": 5.2076, + "step": 7648 + }, + { + "epoch": 0.8683688637710159, + "grad_norm": 0.205078125, + "learning_rate": 0.0015392303910940228, + "loss": 5.1892, + "step": 7649 + }, + { + "epoch": 0.8684823908809349, + "grad_norm": 0.193359375, + "learning_rate": 0.0015378194818998125, + "loss": 5.2103, + "step": 7650 + }, + { + "epoch": 0.868595917990854, + "grad_norm": 0.1845703125, + "learning_rate": 0.0015364071617328923, + "loss": 5.2292, + "step": 7651 + }, + { + "epoch": 0.8687094451007731, + "grad_norm": 0.1806640625, + "learning_rate": 0.0015349934351447845, + "loss": 5.2191, + "step": 7652 + }, + { + "epoch": 0.8688229722106922, + "grad_norm": 0.1865234375, + "learning_rate": 0.0015335783066915435, + "loss": 5.2028, + "step": 7653 + }, + { + "epoch": 0.8689364993206112, + "grad_norm": 0.193359375, + "learning_rate": 0.0015321617809337432, + "loss": 5.1941, + "step": 7654 + }, + { + "epoch": 0.8690500264305303, + "grad_norm": 0.1982421875, + "learning_rate": 0.0015307438624364588, + "loss": 5.222, + "step": 7655 + }, + { + "epoch": 0.8691635535404494, + "grad_norm": 0.2177734375, + "learning_rate": 0.0015293245557692547, + "loss": 5.1929, + "step": 7656 + }, + { + "epoch": 0.8692770806503685, + "grad_norm": 0.2236328125, + "learning_rate": 0.0015279038655061688, + "loss": 5.2246, + "step": 7657 + }, + { + "epoch": 0.8693906077602875, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015264817962256988, + "loss": 5.1833, + "step": 7658 + }, + { + "epoch": 0.8695041348702066, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015250583525107855, + "loss": 5.2075, + "step": 7659 + }, + { + "epoch": 0.8696176619801257, + "grad_norm": 0.19140625, + "learning_rate": 0.0015236335389487996, + "loss": 5.1948, + "step": 7660 + }, + { + "epoch": 0.8697311890900448, + "grad_norm": 0.2041015625, + "learning_rate": 0.001522207360131526, + "loss": 5.2117, + "step": 7661 + }, + { + "epoch": 0.8698447161999638, + "grad_norm": 0.19140625, + "learning_rate": 0.0015207798206551503, + "loss": 5.2014, + "step": 7662 + }, + { + "epoch": 0.8699582433098829, + "grad_norm": 0.19140625, + "learning_rate": 0.0015193509251202422, + "loss": 5.2132, + "step": 7663 + }, + { + "epoch": 0.870071770419802, + "grad_norm": 0.2021484375, + "learning_rate": 0.001517920678131743, + "loss": 5.2317, + "step": 7664 + }, + { + "epoch": 0.8701852975297211, + "grad_norm": 0.1962890625, + "learning_rate": 0.0015164890842989474, + "loss": 5.2171, + "step": 7665 + }, + { + "epoch": 0.8702988246396401, + "grad_norm": 0.2060546875, + "learning_rate": 0.0015150561482354921, + "loss": 5.1957, + "step": 7666 + }, + { + "epoch": 0.8704123517495592, + "grad_norm": 0.208984375, + "learning_rate": 0.0015136218745593394, + "loss": 5.2112, + "step": 7667 + }, + { + "epoch": 0.8705258788594783, + "grad_norm": 0.2041015625, + "learning_rate": 0.001512186267892761, + "loss": 5.188, + "step": 7668 + }, + { + "epoch": 0.8706394059693974, + "grad_norm": 0.2001953125, + "learning_rate": 0.0015107493328623258, + "loss": 5.2025, + "step": 7669 + }, + { + "epoch": 0.8707529330793164, + "grad_norm": 0.2060546875, + "learning_rate": 0.0015093110740988837, + "loss": 5.2158, + "step": 7670 + }, + { + "epoch": 0.8708664601892355, + "grad_norm": 0.1923828125, + "learning_rate": 0.0015078714962375498, + "loss": 5.1936, + "step": 7671 + }, + { + "epoch": 0.8709799872991546, + "grad_norm": 0.2060546875, + "learning_rate": 0.0015064306039176906, + "loss": 5.2162, + "step": 7672 + }, + { + "epoch": 0.8710935144090737, + "grad_norm": 0.197265625, + "learning_rate": 0.0015049884017829089, + "loss": 5.2106, + "step": 7673 + }, + { + "epoch": 0.8712070415189928, + "grad_norm": 0.220703125, + "learning_rate": 0.0015035448944810293, + "loss": 5.187, + "step": 7674 + }, + { + "epoch": 0.8713205686289118, + "grad_norm": 0.2255859375, + "learning_rate": 0.0015021000866640806, + "loss": 5.2236, + "step": 7675 + }, + { + "epoch": 0.8714340957388309, + "grad_norm": 0.2490234375, + "learning_rate": 0.001500653982988285, + "loss": 5.2045, + "step": 7676 + }, + { + "epoch": 0.87154762284875, + "grad_norm": 0.240234375, + "learning_rate": 0.00149920658811404, + "loss": 5.2114, + "step": 7677 + }, + { + "epoch": 0.8716611499586691, + "grad_norm": 0.265625, + "learning_rate": 0.001497757906705904, + "loss": 5.186, + "step": 7678 + }, + { + "epoch": 0.8717746770685881, + "grad_norm": 0.255859375, + "learning_rate": 0.0014963079434325822, + "loss": 5.1795, + "step": 7679 + }, + { + "epoch": 0.8718882041785072, + "grad_norm": 0.251953125, + "learning_rate": 0.00149485670296691, + "loss": 5.2045, + "step": 7680 + }, + { + "epoch": 0.8720017312884263, + "grad_norm": 0.2470703125, + "learning_rate": 0.0014934041899858398, + "loss": 5.2013, + "step": 7681 + }, + { + "epoch": 0.8721152583983454, + "grad_norm": 0.26171875, + "learning_rate": 0.001491950409170424, + "loss": 5.2121, + "step": 7682 + }, + { + "epoch": 0.8722287855082644, + "grad_norm": 0.2490234375, + "learning_rate": 0.0014904953652058021, + "loss": 5.2104, + "step": 7683 + }, + { + "epoch": 0.8723423126181835, + "grad_norm": 0.2451171875, + "learning_rate": 0.0014890390627811838, + "loss": 5.1955, + "step": 7684 + }, + { + "epoch": 0.8724558397281026, + "grad_norm": 0.234375, + "learning_rate": 0.0014875815065898339, + "loss": 5.2033, + "step": 7685 + }, + { + "epoch": 0.8725693668380217, + "grad_norm": 0.2470703125, + "learning_rate": 0.001486122701329058, + "loss": 5.1966, + "step": 7686 + }, + { + "epoch": 0.8726828939479407, + "grad_norm": 0.2392578125, + "learning_rate": 0.0014846626517001883, + "loss": 5.2291, + "step": 7687 + }, + { + "epoch": 0.8727964210578598, + "grad_norm": 0.2353515625, + "learning_rate": 0.0014832013624085654, + "loss": 5.1995, + "step": 7688 + }, + { + "epoch": 0.8729099481677789, + "grad_norm": 0.216796875, + "learning_rate": 0.0014817388381635262, + "loss": 5.2014, + "step": 7689 + }, + { + "epoch": 0.873023475277698, + "grad_norm": 0.2216796875, + "learning_rate": 0.0014802750836783877, + "loss": 5.2039, + "step": 7690 + }, + { + "epoch": 0.873137002387617, + "grad_norm": 0.2021484375, + "learning_rate": 0.0014788101036704304, + "loss": 5.1871, + "step": 7691 + }, + { + "epoch": 0.8732505294975361, + "grad_norm": 0.201171875, + "learning_rate": 0.0014773439028608858, + "loss": 5.1953, + "step": 7692 + }, + { + "epoch": 0.8733640566074552, + "grad_norm": 0.1865234375, + "learning_rate": 0.001475876485974918, + "loss": 5.1917, + "step": 7693 + }, + { + "epoch": 0.8734775837173743, + "grad_norm": 0.1826171875, + "learning_rate": 0.0014744078577416122, + "loss": 5.2164, + "step": 7694 + }, + { + "epoch": 0.8735911108272933, + "grad_norm": 0.1845703125, + "learning_rate": 0.0014729380228939559, + "loss": 5.1981, + "step": 7695 + }, + { + "epoch": 0.8737046379372124, + "grad_norm": 0.173828125, + "learning_rate": 0.0014714669861688257, + "loss": 5.1998, + "step": 7696 + }, + { + "epoch": 0.8738181650471315, + "grad_norm": 0.173828125, + "learning_rate": 0.0014699947523069716, + "loss": 5.2217, + "step": 7697 + }, + { + "epoch": 0.8739316921570506, + "grad_norm": 0.1806640625, + "learning_rate": 0.0014685213260530016, + "loss": 5.1956, + "step": 7698 + }, + { + "epoch": 0.8740452192669697, + "grad_norm": 0.18359375, + "learning_rate": 0.0014670467121553662, + "loss": 5.2216, + "step": 7699 + }, + { + "epoch": 0.8741587463768887, + "grad_norm": 0.17578125, + "learning_rate": 0.001465570915366344, + "loss": 5.2032, + "step": 7700 + }, + { + "epoch": 0.8742722734868078, + "grad_norm": 0.1845703125, + "learning_rate": 0.0014640939404420251, + "loss": 5.2171, + "step": 7701 + }, + { + "epoch": 0.8743858005967269, + "grad_norm": 0.1845703125, + "learning_rate": 0.0014626157921422964, + "loss": 5.2001, + "step": 7702 + }, + { + "epoch": 0.874499327706646, + "grad_norm": 0.1865234375, + "learning_rate": 0.001461136475230827, + "loss": 5.2108, + "step": 7703 + }, + { + "epoch": 0.874612854816565, + "grad_norm": 0.1884765625, + "learning_rate": 0.0014596559944750507, + "loss": 5.2096, + "step": 7704 + }, + { + "epoch": 0.8747263819264841, + "grad_norm": 0.205078125, + "learning_rate": 0.001458174354646154, + "loss": 5.1888, + "step": 7705 + }, + { + "epoch": 0.8748399090364032, + "grad_norm": 0.2119140625, + "learning_rate": 0.0014566915605190571, + "loss": 5.1753, + "step": 7706 + }, + { + "epoch": 0.8749534361463223, + "grad_norm": 0.212890625, + "learning_rate": 0.0014552076168724013, + "loss": 5.2088, + "step": 7707 + }, + { + "epoch": 0.8750669632562413, + "grad_norm": 0.2412109375, + "learning_rate": 0.0014537225284885313, + "loss": 5.2031, + "step": 7708 + }, + { + "epoch": 0.8751804903661604, + "grad_norm": 0.2470703125, + "learning_rate": 0.0014522363001534823, + "loss": 5.1987, + "step": 7709 + }, + { + "epoch": 0.8752940174760795, + "grad_norm": 0.255859375, + "learning_rate": 0.0014507489366569625, + "loss": 5.1918, + "step": 7710 + }, + { + "epoch": 0.8754075445859986, + "grad_norm": 0.2294921875, + "learning_rate": 0.0014492604427923381, + "loss": 5.2174, + "step": 7711 + }, + { + "epoch": 0.8755210716959176, + "grad_norm": 0.2353515625, + "learning_rate": 0.0014477708233566191, + "loss": 5.2064, + "step": 7712 + }, + { + "epoch": 0.8756345988058367, + "grad_norm": 0.21875, + "learning_rate": 0.0014462800831504426, + "loss": 5.2025, + "step": 7713 + }, + { + "epoch": 0.8757481259157558, + "grad_norm": 0.2216796875, + "learning_rate": 0.001444788226978057, + "loss": 5.2152, + "step": 7714 + }, + { + "epoch": 0.8758616530256749, + "grad_norm": 0.2099609375, + "learning_rate": 0.0014432952596473074, + "loss": 5.2251, + "step": 7715 + }, + { + "epoch": 0.875975180135594, + "grad_norm": 0.21484375, + "learning_rate": 0.0014418011859696211, + "loss": 5.1927, + "step": 7716 + }, + { + "epoch": 0.876088707245513, + "grad_norm": 0.2138671875, + "learning_rate": 0.001440306010759989, + "loss": 5.2099, + "step": 7717 + }, + { + "epoch": 0.8762022343554321, + "grad_norm": 0.212890625, + "learning_rate": 0.0014388097388369529, + "loss": 5.1982, + "step": 7718 + }, + { + "epoch": 0.8763157614653512, + "grad_norm": 0.2255859375, + "learning_rate": 0.001437312375022589, + "loss": 5.1945, + "step": 7719 + }, + { + "epoch": 0.8764292885752702, + "grad_norm": 0.2255859375, + "learning_rate": 0.0014358139241424923, + "loss": 5.2065, + "step": 7720 + }, + { + "epoch": 0.8765428156851893, + "grad_norm": 0.20703125, + "learning_rate": 0.001434314391025761, + "loss": 5.2015, + "step": 7721 + }, + { + "epoch": 0.8766563427951084, + "grad_norm": 0.2060546875, + "learning_rate": 0.0014328137805049808, + "loss": 5.2127, + "step": 7722 + }, + { + "epoch": 0.8767698699050275, + "grad_norm": 0.2001953125, + "learning_rate": 0.0014313120974162104, + "loss": 5.1895, + "step": 7723 + }, + { + "epoch": 0.8768833970149466, + "grad_norm": 0.2041015625, + "learning_rate": 0.0014298093465989641, + "loss": 5.2186, + "step": 7724 + }, + { + "epoch": 0.8769969241248656, + "grad_norm": 0.2021484375, + "learning_rate": 0.001428305532896198, + "loss": 5.2036, + "step": 7725 + }, + { + "epoch": 0.8771104512347847, + "grad_norm": 0.2177734375, + "learning_rate": 0.0014268006611542936, + "loss": 5.1912, + "step": 7726 + }, + { + "epoch": 0.8772239783447038, + "grad_norm": 0.2099609375, + "learning_rate": 0.0014252947362230412, + "loss": 5.1829, + "step": 7727 + }, + { + "epoch": 0.8773375054546229, + "grad_norm": 0.22265625, + "learning_rate": 0.0014237877629556263, + "loss": 5.2052, + "step": 7728 + }, + { + "epoch": 0.8774510325645419, + "grad_norm": 0.2119140625, + "learning_rate": 0.0014222797462086123, + "loss": 5.2, + "step": 7729 + }, + { + "epoch": 0.877564559674461, + "grad_norm": 0.21875, + "learning_rate": 0.0014207706908419256, + "loss": 5.194, + "step": 7730 + }, + { + "epoch": 0.8776780867843801, + "grad_norm": 0.224609375, + "learning_rate": 0.00141926060171884, + "loss": 5.204, + "step": 7731 + }, + { + "epoch": 0.8777916138942992, + "grad_norm": 0.2353515625, + "learning_rate": 0.0014177494837059608, + "loss": 5.2019, + "step": 7732 + }, + { + "epoch": 0.8779051410042182, + "grad_norm": 0.2294921875, + "learning_rate": 0.0014162373416732087, + "loss": 5.2007, + "step": 7733 + }, + { + "epoch": 0.8780186681141373, + "grad_norm": 0.21875, + "learning_rate": 0.0014147241804938046, + "loss": 5.2203, + "step": 7734 + }, + { + "epoch": 0.8781321952240564, + "grad_norm": 0.2060546875, + "learning_rate": 0.0014132100050442543, + "loss": 5.2002, + "step": 7735 + }, + { + "epoch": 0.8782457223339755, + "grad_norm": 0.19140625, + "learning_rate": 0.0014116948202043322, + "loss": 5.2191, + "step": 7736 + }, + { + "epoch": 0.8783592494438945, + "grad_norm": 0.1796875, + "learning_rate": 0.0014101786308570652, + "loss": 5.2027, + "step": 7737 + }, + { + "epoch": 0.8784727765538136, + "grad_norm": 0.185546875, + "learning_rate": 0.0014086614418887182, + "loss": 5.1952, + "step": 7738 + }, + { + "epoch": 0.8785863036637327, + "grad_norm": 0.1708984375, + "learning_rate": 0.0014071432581887772, + "loss": 5.2017, + "step": 7739 + }, + { + "epoch": 0.8786998307736518, + "grad_norm": 0.1875, + "learning_rate": 0.0014056240846499336, + "loss": 5.2191, + "step": 7740 + }, + { + "epoch": 0.8788133578835708, + "grad_norm": 0.1962890625, + "learning_rate": 0.0014041039261680692, + "loss": 5.1863, + "step": 7741 + }, + { + "epoch": 0.8789268849934899, + "grad_norm": 0.20703125, + "learning_rate": 0.0014025827876422404, + "loss": 5.1991, + "step": 7742 + }, + { + "epoch": 0.879040412103409, + "grad_norm": 0.234375, + "learning_rate": 0.001401060673974661, + "loss": 5.1886, + "step": 7743 + }, + { + "epoch": 0.8791539392133281, + "grad_norm": 0.244140625, + "learning_rate": 0.001399537590070688, + "loss": 5.1917, + "step": 7744 + }, + { + "epoch": 0.8792674663232471, + "grad_norm": 0.2421875, + "learning_rate": 0.0013980135408388056, + "loss": 5.1942, + "step": 7745 + }, + { + "epoch": 0.8793809934331662, + "grad_norm": 0.251953125, + "learning_rate": 0.0013964885311906082, + "loss": 5.235, + "step": 7746 + }, + { + "epoch": 0.8794945205430853, + "grad_norm": 0.255859375, + "learning_rate": 0.0013949625660407859, + "loss": 5.1829, + "step": 7747 + }, + { + "epoch": 0.8796080476530044, + "grad_norm": 0.25, + "learning_rate": 0.0013934356503071078, + "loss": 5.2025, + "step": 7748 + }, + { + "epoch": 0.8797215747629235, + "grad_norm": 0.26171875, + "learning_rate": 0.0013919077889104066, + "loss": 5.2172, + "step": 7749 + }, + { + "epoch": 0.8798351018728425, + "grad_norm": 0.263671875, + "learning_rate": 0.001390378986774563, + "loss": 5.1914, + "step": 7750 + }, + { + "epoch": 0.8799486289827616, + "grad_norm": 0.25390625, + "learning_rate": 0.0013888492488264887, + "loss": 5.1833, + "step": 7751 + }, + { + "epoch": 0.8800621560926807, + "grad_norm": 0.259765625, + "learning_rate": 0.0013873185799961115, + "loss": 5.2095, + "step": 7752 + }, + { + "epoch": 0.8801756832025998, + "grad_norm": 0.248046875, + "learning_rate": 0.00138578698521636, + "loss": 5.2073, + "step": 7753 + }, + { + "epoch": 0.8802892103125188, + "grad_norm": 0.2431640625, + "learning_rate": 0.0013842544694231457, + "loss": 5.192, + "step": 7754 + }, + { + "epoch": 0.8804027374224379, + "grad_norm": 0.2236328125, + "learning_rate": 0.0013827210375553485, + "loss": 5.1857, + "step": 7755 + }, + { + "epoch": 0.880516264532357, + "grad_norm": 0.2490234375, + "learning_rate": 0.0013811866945548017, + "loss": 5.1908, + "step": 7756 + }, + { + "epoch": 0.8806297916422761, + "grad_norm": 0.228515625, + "learning_rate": 0.0013796514453662733, + "loss": 5.2199, + "step": 7757 + }, + { + "epoch": 0.8807433187521951, + "grad_norm": 0.23046875, + "learning_rate": 0.0013781152949374528, + "loss": 5.2044, + "step": 7758 + }, + { + "epoch": 0.8808568458621142, + "grad_norm": 0.2177734375, + "learning_rate": 0.0013765782482189337, + "loss": 5.2011, + "step": 7759 + }, + { + "epoch": 0.8809703729720333, + "grad_norm": 0.22265625, + "learning_rate": 0.0013750403101641983, + "loss": 5.1926, + "step": 7760 + }, + { + "epoch": 0.8810839000819524, + "grad_norm": 0.2236328125, + "learning_rate": 0.001373501485729601, + "loss": 5.2098, + "step": 7761 + }, + { + "epoch": 0.8811974271918714, + "grad_norm": 0.2275390625, + "learning_rate": 0.0013719617798743531, + "loss": 5.1877, + "step": 7762 + }, + { + "epoch": 0.8813109543017905, + "grad_norm": 0.2197265625, + "learning_rate": 0.0013704211975605067, + "loss": 5.1889, + "step": 7763 + }, + { + "epoch": 0.8814244814117096, + "grad_norm": 0.234375, + "learning_rate": 0.0013688797437529376, + "loss": 5.202, + "step": 7764 + }, + { + "epoch": 0.8815380085216287, + "grad_norm": 0.2236328125, + "learning_rate": 0.0013673374234193314, + "loss": 5.1935, + "step": 7765 + }, + { + "epoch": 0.8816515356315477, + "grad_norm": 0.23046875, + "learning_rate": 0.0013657942415301653, + "loss": 5.2271, + "step": 7766 + }, + { + "epoch": 0.8817650627414668, + "grad_norm": 0.2099609375, + "learning_rate": 0.0013642502030586931, + "loss": 5.1846, + "step": 7767 + }, + { + "epoch": 0.8818785898513859, + "grad_norm": 0.2041015625, + "learning_rate": 0.00136270531298093, + "loss": 5.2102, + "step": 7768 + }, + { + "epoch": 0.881992116961305, + "grad_norm": 0.1884765625, + "learning_rate": 0.0013611595762756349, + "loss": 5.185, + "step": 7769 + }, + { + "epoch": 0.882105644071224, + "grad_norm": 0.1865234375, + "learning_rate": 0.0013596129979242951, + "loss": 5.1883, + "step": 7770 + }, + { + "epoch": 0.8822191711811431, + "grad_norm": 0.1806640625, + "learning_rate": 0.001358065582911111, + "loss": 5.202, + "step": 7771 + }, + { + "epoch": 0.8823326982910622, + "grad_norm": 0.1787109375, + "learning_rate": 0.0013565173362229785, + "loss": 5.1774, + "step": 7772 + }, + { + "epoch": 0.8824462254009813, + "grad_norm": 0.1728515625, + "learning_rate": 0.0013549682628494744, + "loss": 5.1813, + "step": 7773 + }, + { + "epoch": 0.8825597525109004, + "grad_norm": 0.16796875, + "learning_rate": 0.0013534183677828397, + "loss": 5.1968, + "step": 7774 + }, + { + "epoch": 0.8826732796208194, + "grad_norm": 0.16015625, + "learning_rate": 0.0013518676560179627, + "loss": 5.2038, + "step": 7775 + }, + { + "epoch": 0.8827868067307385, + "grad_norm": 0.162109375, + "learning_rate": 0.0013503161325523645, + "loss": 5.1878, + "step": 7776 + }, + { + "epoch": 0.8829003338406576, + "grad_norm": 0.1552734375, + "learning_rate": 0.0013487638023861818, + "loss": 5.1836, + "step": 7777 + }, + { + "epoch": 0.8830138609505767, + "grad_norm": 0.16015625, + "learning_rate": 0.0013472106705221512, + "loss": 5.1969, + "step": 7778 + }, + { + "epoch": 0.8831273880604957, + "grad_norm": 0.16015625, + "learning_rate": 0.0013456567419655927, + "loss": 5.1836, + "step": 7779 + }, + { + "epoch": 0.8832409151704148, + "grad_norm": 0.16015625, + "learning_rate": 0.0013441020217243942, + "loss": 5.2085, + "step": 7780 + }, + { + "epoch": 0.8833544422803339, + "grad_norm": 0.1669921875, + "learning_rate": 0.0013425465148089945, + "loss": 5.2036, + "step": 7781 + }, + { + "epoch": 0.883467969390253, + "grad_norm": 0.169921875, + "learning_rate": 0.001340990226232368, + "loss": 5.1866, + "step": 7782 + }, + { + "epoch": 0.883581496500172, + "grad_norm": 0.169921875, + "learning_rate": 0.0013394331610100077, + "loss": 5.1936, + "step": 7783 + }, + { + "epoch": 0.8836950236100911, + "grad_norm": 0.171875, + "learning_rate": 0.0013378753241599104, + "loss": 5.1605, + "step": 7784 + }, + { + "epoch": 0.8838085507200102, + "grad_norm": 0.1787109375, + "learning_rate": 0.001336316720702559, + "loss": 5.1877, + "step": 7785 + }, + { + "epoch": 0.8839220778299293, + "grad_norm": 0.1796875, + "learning_rate": 0.0013347573556609074, + "loss": 5.1972, + "step": 7786 + }, + { + "epoch": 0.8840356049398483, + "grad_norm": 0.17578125, + "learning_rate": 0.0013331972340603628, + "loss": 5.1899, + "step": 7787 + }, + { + "epoch": 0.8841491320497674, + "grad_norm": 0.1669921875, + "learning_rate": 0.0013316363609287723, + "loss": 5.2045, + "step": 7788 + }, + { + "epoch": 0.8842626591596865, + "grad_norm": 0.169921875, + "learning_rate": 0.0013300747412964035, + "loss": 5.1771, + "step": 7789 + }, + { + "epoch": 0.8843761862696056, + "grad_norm": 0.1611328125, + "learning_rate": 0.0013285123801959303, + "loss": 5.1944, + "step": 7790 + }, + { + "epoch": 0.8844897133795246, + "grad_norm": 0.162109375, + "learning_rate": 0.0013269492826624158, + "loss": 5.2104, + "step": 7791 + }, + { + "epoch": 0.8846032404894437, + "grad_norm": 0.15625, + "learning_rate": 0.0013253854537332978, + "loss": 5.2055, + "step": 7792 + }, + { + "epoch": 0.8847167675993628, + "grad_norm": 0.1650390625, + "learning_rate": 0.0013238208984483696, + "loss": 5.1728, + "step": 7793 + }, + { + "epoch": 0.8848302947092819, + "grad_norm": 0.162109375, + "learning_rate": 0.0013222556218497655, + "loss": 5.203, + "step": 7794 + }, + { + "epoch": 0.884943821819201, + "grad_norm": 0.171875, + "learning_rate": 0.0013206896289819456, + "loss": 5.201, + "step": 7795 + }, + { + "epoch": 0.88505734892912, + "grad_norm": 0.16796875, + "learning_rate": 0.0013191229248916772, + "loss": 5.2059, + "step": 7796 + }, + { + "epoch": 0.8851708760390391, + "grad_norm": 0.16796875, + "learning_rate": 0.0013175555146280199, + "loss": 5.1975, + "step": 7797 + }, + { + "epoch": 0.8852844031489582, + "grad_norm": 0.1748046875, + "learning_rate": 0.0013159874032423094, + "loss": 5.197, + "step": 7798 + }, + { + "epoch": 0.8853979302588773, + "grad_norm": 0.1708984375, + "learning_rate": 0.0013144185957881408, + "loss": 5.1856, + "step": 7799 + }, + { + "epoch": 0.8855114573687963, + "grad_norm": 0.1826171875, + "learning_rate": 0.0013128490973213522, + "loss": 5.2092, + "step": 7800 + }, + { + "epoch": 0.8856249844787154, + "grad_norm": 0.193359375, + "learning_rate": 0.0013112789129000094, + "loss": 5.1998, + "step": 7801 + }, + { + "epoch": 0.8857385115886345, + "grad_norm": 0.2158203125, + "learning_rate": 0.0013097080475843875, + "loss": 5.187, + "step": 7802 + }, + { + "epoch": 0.8858520386985536, + "grad_norm": 0.2109375, + "learning_rate": 0.001308136506436957, + "loss": 5.1945, + "step": 7803 + }, + { + "epoch": 0.8859655658084726, + "grad_norm": 0.2294921875, + "learning_rate": 0.0013065642945223664, + "loss": 5.1942, + "step": 7804 + }, + { + "epoch": 0.8860790929183917, + "grad_norm": 0.240234375, + "learning_rate": 0.0013049914169074256, + "loss": 5.1937, + "step": 7805 + }, + { + "epoch": 0.8861926200283108, + "grad_norm": 0.2412109375, + "learning_rate": 0.0013034178786610895, + "loss": 5.195, + "step": 7806 + }, + { + "epoch": 0.8863061471382299, + "grad_norm": 0.2314453125, + "learning_rate": 0.0013018436848544431, + "loss": 5.2196, + "step": 7807 + }, + { + "epoch": 0.8864196742481489, + "grad_norm": 0.2197265625, + "learning_rate": 0.001300268840560683, + "loss": 5.2071, + "step": 7808 + }, + { + "epoch": 0.886533201358068, + "grad_norm": 0.197265625, + "learning_rate": 0.0012986933508551024, + "loss": 5.1843, + "step": 7809 + }, + { + "epoch": 0.8866467284679871, + "grad_norm": 0.19921875, + "learning_rate": 0.0012971172208150754, + "loss": 5.188, + "step": 7810 + }, + { + "epoch": 0.8867602555779062, + "grad_norm": 0.1787109375, + "learning_rate": 0.0012955404555200385, + "loss": 5.1877, + "step": 7811 + }, + { + "epoch": 0.8868737826878252, + "grad_norm": 0.189453125, + "learning_rate": 0.001293963060051476, + "loss": 5.1989, + "step": 7812 + }, + { + "epoch": 0.8869873097977443, + "grad_norm": 0.1767578125, + "learning_rate": 0.0012923850394929031, + "loss": 5.187, + "step": 7813 + }, + { + "epoch": 0.8871008369076634, + "grad_norm": 0.1787109375, + "learning_rate": 0.0012908063989298493, + "loss": 5.1738, + "step": 7814 + }, + { + "epoch": 0.8872143640175825, + "grad_norm": 0.1796875, + "learning_rate": 0.0012892271434498423, + "loss": 5.1949, + "step": 7815 + }, + { + "epoch": 0.8873278911275015, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012876472781423916, + "loss": 5.1852, + "step": 7816 + }, + { + "epoch": 0.8874414182374206, + "grad_norm": 0.16796875, + "learning_rate": 0.0012860668080989723, + "loss": 5.1872, + "step": 7817 + }, + { + "epoch": 0.8875549453473397, + "grad_norm": 0.181640625, + "learning_rate": 0.0012844857384130075, + "loss": 5.1924, + "step": 7818 + }, + { + "epoch": 0.8876684724572588, + "grad_norm": 0.169921875, + "learning_rate": 0.0012829040741798535, + "loss": 5.1844, + "step": 7819 + }, + { + "epoch": 0.8877819995671778, + "grad_norm": 0.181640625, + "learning_rate": 0.0012813218204967823, + "loss": 5.1962, + "step": 7820 + }, + { + "epoch": 0.8878955266770969, + "grad_norm": 0.1884765625, + "learning_rate": 0.0012797389824629661, + "loss": 5.1862, + "step": 7821 + }, + { + "epoch": 0.888009053787016, + "grad_norm": 0.185546875, + "learning_rate": 0.00127815556517946, + "loss": 5.1679, + "step": 7822 + }, + { + "epoch": 0.8881225808969351, + "grad_norm": 0.1826171875, + "learning_rate": 0.0012765715737491857, + "loss": 5.1899, + "step": 7823 + }, + { + "epoch": 0.8882361080068542, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012749870132769147, + "loss": 5.1907, + "step": 7824 + }, + { + "epoch": 0.8883496351167732, + "grad_norm": 0.1845703125, + "learning_rate": 0.001273401888869254, + "loss": 5.1936, + "step": 7825 + }, + { + "epoch": 0.8884631622266923, + "grad_norm": 0.1962890625, + "learning_rate": 0.0012718162056346266, + "loss": 5.1973, + "step": 7826 + }, + { + "epoch": 0.8885766893366114, + "grad_norm": 0.189453125, + "learning_rate": 0.0012702299686832561, + "loss": 5.1864, + "step": 7827 + }, + { + "epoch": 0.8886902164465305, + "grad_norm": 0.1962890625, + "learning_rate": 0.0012686431831271524, + "loss": 5.1934, + "step": 7828 + }, + { + "epoch": 0.8888037435564495, + "grad_norm": 0.193359375, + "learning_rate": 0.0012670558540800916, + "loss": 5.1848, + "step": 7829 + }, + { + "epoch": 0.8889172706663686, + "grad_norm": 0.1943359375, + "learning_rate": 0.0012654679866576021, + "loss": 5.1932, + "step": 7830 + }, + { + "epoch": 0.8890307977762877, + "grad_norm": 0.1962890625, + "learning_rate": 0.0012638795859769476, + "loss": 5.1923, + "step": 7831 + }, + { + "epoch": 0.8891443248862068, + "grad_norm": 0.2001953125, + "learning_rate": 0.00126229065715711, + "loss": 5.2058, + "step": 7832 + }, + { + "epoch": 0.8892578519961258, + "grad_norm": 0.1865234375, + "learning_rate": 0.001260701205318773, + "loss": 5.191, + "step": 7833 + }, + { + "epoch": 0.8893713791060449, + "grad_norm": 0.1943359375, + "learning_rate": 0.0012591112355843062, + "loss": 5.1946, + "step": 7834 + }, + { + "epoch": 0.889484906215964, + "grad_norm": 0.193359375, + "learning_rate": 0.0012575207530777486, + "loss": 5.2004, + "step": 7835 + }, + { + "epoch": 0.8895984333258831, + "grad_norm": 0.203125, + "learning_rate": 0.0012559297629247906, + "loss": 5.1966, + "step": 7836 + }, + { + "epoch": 0.8897119604358021, + "grad_norm": 0.173828125, + "learning_rate": 0.0012543382702527596, + "loss": 5.1583, + "step": 7837 + }, + { + "epoch": 0.8898254875457212, + "grad_norm": 0.1669921875, + "learning_rate": 0.001252746280190602, + "loss": 5.1799, + "step": 7838 + }, + { + "epoch": 0.8899390146556403, + "grad_norm": 0.16015625, + "learning_rate": 0.0012511537978688678, + "loss": 5.203, + "step": 7839 + }, + { + "epoch": 0.8900525417655594, + "grad_norm": 0.158203125, + "learning_rate": 0.0012495608284196925, + "loss": 5.1752, + "step": 7840 + }, + { + "epoch": 0.8901660688754784, + "grad_norm": 0.1484375, + "learning_rate": 0.0012479673769767818, + "loss": 5.1947, + "step": 7841 + }, + { + "epoch": 0.8902795959853975, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012463734486753953, + "loss": 5.1873, + "step": 7842 + }, + { + "epoch": 0.8903931230953166, + "grad_norm": 0.138671875, + "learning_rate": 0.0012447790486523288, + "loss": 5.1958, + "step": 7843 + }, + { + "epoch": 0.8905066502052358, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012431841820458981, + "loss": 5.1801, + "step": 7844 + }, + { + "epoch": 0.8906201773151549, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012415888539959233, + "loss": 5.1912, + "step": 7845 + }, + { + "epoch": 0.8907337044250739, + "grad_norm": 0.14453125, + "learning_rate": 0.0012399930696437114, + "loss": 5.1867, + "step": 7846 + }, + { + "epoch": 0.890847231534993, + "grad_norm": 0.1494140625, + "learning_rate": 0.0012383968341320402, + "loss": 5.228, + "step": 7847 + }, + { + "epoch": 0.8909607586449121, + "grad_norm": 0.1435546875, + "learning_rate": 0.0012368001526051407, + "loss": 5.1707, + "step": 7848 + }, + { + "epoch": 0.8910742857548312, + "grad_norm": 0.171875, + "learning_rate": 0.0012352030302086815, + "loss": 5.1916, + "step": 7849 + }, + { + "epoch": 0.8911878128647502, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012336054720897527, + "loss": 5.1911, + "step": 7850 + }, + { + "epoch": 0.8913013399746693, + "grad_norm": 0.1826171875, + "learning_rate": 0.001232007483396848, + "loss": 5.1767, + "step": 7851 + }, + { + "epoch": 0.8914148670845884, + "grad_norm": 0.1767578125, + "learning_rate": 0.0012304090692798486, + "loss": 5.2039, + "step": 7852 + }, + { + "epoch": 0.8915283941945075, + "grad_norm": 0.1748046875, + "learning_rate": 0.0012288102348900077, + "loss": 5.2016, + "step": 7853 + }, + { + "epoch": 0.8916419213044265, + "grad_norm": 0.1572265625, + "learning_rate": 0.0012272109853799316, + "loss": 5.207, + "step": 7854 + }, + { + "epoch": 0.8917554484143456, + "grad_norm": 0.171875, + "learning_rate": 0.0012256113259035651, + "loss": 5.2131, + "step": 7855 + }, + { + "epoch": 0.8918689755242647, + "grad_norm": 0.1591796875, + "learning_rate": 0.0012240112616161744, + "loss": 5.1855, + "step": 7856 + }, + { + "epoch": 0.8919825026341838, + "grad_norm": 0.158203125, + "learning_rate": 0.0012224107976743297, + "loss": 5.1954, + "step": 7857 + }, + { + "epoch": 0.8920960297441028, + "grad_norm": 0.1591796875, + "learning_rate": 0.00122080993923589, + "loss": 5.1868, + "step": 7858 + }, + { + "epoch": 0.8922095568540219, + "grad_norm": 0.16015625, + "learning_rate": 0.0012192086914599846, + "loss": 5.1843, + "step": 7859 + }, + { + "epoch": 0.892323083963941, + "grad_norm": 0.1611328125, + "learning_rate": 0.0012176070595069988, + "loss": 5.1921, + "step": 7860 + }, + { + "epoch": 0.8924366110738601, + "grad_norm": 0.158203125, + "learning_rate": 0.0012160050485385547, + "loss": 5.1675, + "step": 7861 + }, + { + "epoch": 0.8925501381837792, + "grad_norm": 0.1630859375, + "learning_rate": 0.0012144026637174968, + "loss": 5.1671, + "step": 7862 + }, + { + "epoch": 0.8926636652936982, + "grad_norm": 0.1669921875, + "learning_rate": 0.001212799910207874, + "loss": 5.1872, + "step": 7863 + }, + { + "epoch": 0.8927771924036173, + "grad_norm": 0.1611328125, + "learning_rate": 0.0012111967931749232, + "loss": 5.1853, + "step": 7864 + }, + { + "epoch": 0.8928907195135364, + "grad_norm": 0.1669921875, + "learning_rate": 0.0012095933177850536, + "loss": 5.1713, + "step": 7865 + }, + { + "epoch": 0.8930042466234555, + "grad_norm": 0.1591796875, + "learning_rate": 0.0012079894892058282, + "loss": 5.1619, + "step": 7866 + }, + { + "epoch": 0.8931177737333745, + "grad_norm": 0.1572265625, + "learning_rate": 0.0012063853126059497, + "loss": 5.2004, + "step": 7867 + }, + { + "epoch": 0.8932313008432936, + "grad_norm": 0.1572265625, + "learning_rate": 0.0012047807931552406, + "loss": 5.2044, + "step": 7868 + }, + { + "epoch": 0.8933448279532127, + "grad_norm": 0.154296875, + "learning_rate": 0.0012031759360246297, + "loss": 5.1908, + "step": 7869 + }, + { + "epoch": 0.8934583550631318, + "grad_norm": 0.150390625, + "learning_rate": 0.0012015707463861333, + "loss": 5.1902, + "step": 7870 + }, + { + "epoch": 0.8935718821730508, + "grad_norm": 0.150390625, + "learning_rate": 0.0011999652294128402, + "loss": 5.2204, + "step": 7871 + }, + { + "epoch": 0.8936854092829699, + "grad_norm": 0.140625, + "learning_rate": 0.0011983593902788927, + "loss": 5.1862, + "step": 7872 + }, + { + "epoch": 0.893798936392889, + "grad_norm": 0.15234375, + "learning_rate": 0.0011967532341594727, + "loss": 5.1808, + "step": 7873 + }, + { + "epoch": 0.8939124635028081, + "grad_norm": 0.1474609375, + "learning_rate": 0.0011951467662307822, + "loss": 5.1677, + "step": 7874 + }, + { + "epoch": 0.8940259906127271, + "grad_norm": 0.14453125, + "learning_rate": 0.0011935399916700297, + "loss": 5.1634, + "step": 7875 + }, + { + "epoch": 0.8941395177226462, + "grad_norm": 0.1474609375, + "learning_rate": 0.0011919329156554112, + "loss": 5.1936, + "step": 7876 + }, + { + "epoch": 0.8942530448325653, + "grad_norm": 0.1416015625, + "learning_rate": 0.0011903255433660933, + "loss": 5.1836, + "step": 7877 + }, + { + "epoch": 0.8943665719424844, + "grad_norm": 0.13671875, + "learning_rate": 0.0011887178799821992, + "loss": 5.1945, + "step": 7878 + }, + { + "epoch": 0.8944800990524034, + "grad_norm": 0.1416015625, + "learning_rate": 0.0011871099306847886, + "loss": 5.205, + "step": 7879 + }, + { + "epoch": 0.8945936261623225, + "grad_norm": 0.1396484375, + "learning_rate": 0.0011855017006558437, + "loss": 5.1798, + "step": 7880 + }, + { + "epoch": 0.8947071532722416, + "grad_norm": 0.146484375, + "learning_rate": 0.0011838931950782505, + "loss": 5.1999, + "step": 7881 + }, + { + "epoch": 0.8948206803821607, + "grad_norm": 0.1455078125, + "learning_rate": 0.0011822844191357843, + "loss": 5.1847, + "step": 7882 + }, + { + "epoch": 0.8949342074920797, + "grad_norm": 0.1435546875, + "learning_rate": 0.0011806753780130904, + "loss": 5.1886, + "step": 7883 + }, + { + "epoch": 0.8950477346019988, + "grad_norm": 0.14453125, + "learning_rate": 0.0011790660768956692, + "loss": 5.2067, + "step": 7884 + }, + { + "epoch": 0.8951612617119179, + "grad_norm": 0.150390625, + "learning_rate": 0.0011774565209698587, + "loss": 5.1954, + "step": 7885 + }, + { + "epoch": 0.895274788821837, + "grad_norm": 0.1484375, + "learning_rate": 0.0011758467154228194, + "loss": 5.2096, + "step": 7886 + }, + { + "epoch": 0.895388315931756, + "grad_norm": 0.138671875, + "learning_rate": 0.0011742366654425144, + "loss": 5.1883, + "step": 7887 + }, + { + "epoch": 0.8955018430416751, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011726263762176956, + "loss": 5.1892, + "step": 7888 + }, + { + "epoch": 0.8956153701515942, + "grad_norm": 0.154296875, + "learning_rate": 0.001171015852937886, + "loss": 5.1557, + "step": 7889 + }, + { + "epoch": 0.8957288972615133, + "grad_norm": 0.1611328125, + "learning_rate": 0.0011694051007933627, + "loss": 5.1942, + "step": 7890 + }, + { + "epoch": 0.8958424243714324, + "grad_norm": 0.1630859375, + "learning_rate": 0.0011677941249751396, + "loss": 5.1717, + "step": 7891 + }, + { + "epoch": 0.8959559514813514, + "grad_norm": 0.1708984375, + "learning_rate": 0.0011661829306749525, + "loss": 5.179, + "step": 7892 + }, + { + "epoch": 0.8960694785912705, + "grad_norm": 0.1904296875, + "learning_rate": 0.0011645715230852415, + "loss": 5.1974, + "step": 7893 + }, + { + "epoch": 0.8961830057011896, + "grad_norm": 0.20703125, + "learning_rate": 0.0011629599073991327, + "loss": 5.1684, + "step": 7894 + }, + { + "epoch": 0.8962965328111087, + "grad_norm": 0.205078125, + "learning_rate": 0.0011613480888104243, + "loss": 5.1968, + "step": 7895 + }, + { + "epoch": 0.8964100599210277, + "grad_norm": 0.234375, + "learning_rate": 0.0011597360725135667, + "loss": 5.2139, + "step": 7896 + }, + { + "epoch": 0.8965235870309468, + "grad_norm": 0.2109375, + "learning_rate": 0.0011581238637036499, + "loss": 5.188, + "step": 7897 + }, + { + "epoch": 0.8966371141408659, + "grad_norm": 0.220703125, + "learning_rate": 0.0011565114675763822, + "loss": 5.1818, + "step": 7898 + }, + { + "epoch": 0.896750641250785, + "grad_norm": 0.1904296875, + "learning_rate": 0.0011548988893280761, + "loss": 5.1698, + "step": 7899 + }, + { + "epoch": 0.896864168360704, + "grad_norm": 0.1865234375, + "learning_rate": 0.0011532861341556316, + "loss": 5.1732, + "step": 7900 + }, + { + "epoch": 0.8969776954706231, + "grad_norm": 0.181640625, + "learning_rate": 0.0011516732072565186, + "loss": 5.1746, + "step": 7901 + }, + { + "epoch": 0.8970912225805422, + "grad_norm": 0.173828125, + "learning_rate": 0.0011500601138287596, + "loss": 5.1823, + "step": 7902 + }, + { + "epoch": 0.8972047496904613, + "grad_norm": 0.1669921875, + "learning_rate": 0.0011484468590709153, + "loss": 5.1689, + "step": 7903 + }, + { + "epoch": 0.8973182768003803, + "grad_norm": 0.1806640625, + "learning_rate": 0.0011468334481820656, + "loss": 5.1704, + "step": 7904 + }, + { + "epoch": 0.8974318039102994, + "grad_norm": 0.177734375, + "learning_rate": 0.0011452198863617926, + "loss": 5.1858, + "step": 7905 + }, + { + "epoch": 0.8975453310202185, + "grad_norm": 0.181640625, + "learning_rate": 0.0011436061788101665, + "loss": 5.172, + "step": 7906 + }, + { + "epoch": 0.8976588581301376, + "grad_norm": 0.1640625, + "learning_rate": 0.0011419923307277263, + "loss": 5.1728, + "step": 7907 + }, + { + "epoch": 0.8977723852400566, + "grad_norm": 0.173828125, + "learning_rate": 0.0011403783473154635, + "loss": 5.1778, + "step": 7908 + }, + { + "epoch": 0.8978859123499757, + "grad_norm": 0.15625, + "learning_rate": 0.0011387642337748068, + "loss": 5.1722, + "step": 7909 + }, + { + "epoch": 0.8979994394598948, + "grad_norm": 0.158203125, + "learning_rate": 0.0011371499953076027, + "loss": 5.1951, + "step": 7910 + }, + { + "epoch": 0.8981129665698139, + "grad_norm": 0.1640625, + "learning_rate": 0.0011355356371161025, + "loss": 5.2079, + "step": 7911 + }, + { + "epoch": 0.898226493679733, + "grad_norm": 0.1630859375, + "learning_rate": 0.0011339211644029413, + "loss": 5.1825, + "step": 7912 + }, + { + "epoch": 0.898340020789652, + "grad_norm": 0.1591796875, + "learning_rate": 0.0011323065823711242, + "loss": 5.2076, + "step": 7913 + }, + { + "epoch": 0.8984535478995711, + "grad_norm": 0.158203125, + "learning_rate": 0.0011306918962240083, + "loss": 5.1994, + "step": 7914 + }, + { + "epoch": 0.8985670750094902, + "grad_norm": 0.1513671875, + "learning_rate": 0.0011290771111652866, + "loss": 5.1968, + "step": 7915 + }, + { + "epoch": 0.8986806021194093, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011274622323989712, + "loss": 5.1793, + "step": 7916 + }, + { + "epoch": 0.8987941292293283, + "grad_norm": 0.1494140625, + "learning_rate": 0.0011258472651293748, + "loss": 5.1857, + "step": 7917 + }, + { + "epoch": 0.8989076563392474, + "grad_norm": 0.1533203125, + "learning_rate": 0.0011242322145610965, + "loss": 5.1803, + "step": 7918 + }, + { + "epoch": 0.8990211834491665, + "grad_norm": 0.146484375, + "learning_rate": 0.0011226170858990039, + "loss": 5.2064, + "step": 7919 + }, + { + "epoch": 0.8991347105590856, + "grad_norm": 0.1474609375, + "learning_rate": 0.0011210018843482155, + "loss": 5.1755, + "step": 7920 + }, + { + "epoch": 0.8992482376690046, + "grad_norm": 0.1357421875, + "learning_rate": 0.0011193866151140849, + "loss": 5.1773, + "step": 7921 + }, + { + "epoch": 0.8993617647789237, + "grad_norm": 0.1328125, + "learning_rate": 0.0011177712834021848, + "loss": 5.1708, + "step": 7922 + }, + { + "epoch": 0.8994752918888428, + "grad_norm": 0.1259765625, + "learning_rate": 0.0011161558944182877, + "loss": 5.1751, + "step": 7923 + }, + { + "epoch": 0.8995888189987619, + "grad_norm": 0.1240234375, + "learning_rate": 0.0011145404533683518, + "loss": 5.176, + "step": 7924 + }, + { + "epoch": 0.8997023461086809, + "grad_norm": 0.126953125, + "learning_rate": 0.001112924965458503, + "loss": 5.2079, + "step": 7925 + }, + { + "epoch": 0.8998158732186, + "grad_norm": 0.1298828125, + "learning_rate": 0.0011113094358950176, + "loss": 5.1855, + "step": 7926 + }, + { + "epoch": 0.8999294003285191, + "grad_norm": 0.12451171875, + "learning_rate": 0.0011096938698843064, + "loss": 5.1644, + "step": 7927 + }, + { + "epoch": 0.9000429274384382, + "grad_norm": 0.1318359375, + "learning_rate": 0.0011080782726328982, + "loss": 5.1846, + "step": 7928 + }, + { + "epoch": 0.9001564545483572, + "grad_norm": 0.1318359375, + "learning_rate": 0.0011064626493474219, + "loss": 5.1596, + "step": 7929 + }, + { + "epoch": 0.9002699816582763, + "grad_norm": 0.1328125, + "learning_rate": 0.0011048470052345905, + "loss": 5.1702, + "step": 7930 + }, + { + "epoch": 0.9003835087681954, + "grad_norm": 0.1337890625, + "learning_rate": 0.0011032313455011837, + "loss": 5.1814, + "step": 7931 + }, + { + "epoch": 0.9004970358781145, + "grad_norm": 0.1357421875, + "learning_rate": 0.001101615675354032, + "loss": 5.1957, + "step": 7932 + }, + { + "epoch": 0.9006105629880335, + "grad_norm": 0.13671875, + "learning_rate": 0.0011, + "loss": 5.16, + "step": 7933 + }, + { + "epoch": 0.9007240900979526, + "grad_norm": 0.1318359375, + "learning_rate": 0.001098384324645968, + "loss": 5.1868, + "step": 7934 + }, + { + "epoch": 0.9008376172078717, + "grad_norm": 0.1279296875, + "learning_rate": 0.0010967686544988166, + "loss": 5.1864, + "step": 7935 + }, + { + "epoch": 0.9009511443177908, + "grad_norm": 0.1337890625, + "learning_rate": 0.0010951529947654103, + "loss": 5.1908, + "step": 7936 + }, + { + "epoch": 0.9010646714277099, + "grad_norm": 0.1298828125, + "learning_rate": 0.001093537350652578, + "loss": 5.1767, + "step": 7937 + }, + { + "epoch": 0.9011781985376289, + "grad_norm": 0.1298828125, + "learning_rate": 0.001091921727367102, + "loss": 5.166, + "step": 7938 + }, + { + "epoch": 0.901291725647548, + "grad_norm": 0.12890625, + "learning_rate": 0.0010903061301156937, + "loss": 5.1636, + "step": 7939 + }, + { + "epoch": 0.9014052527574671, + "grad_norm": 0.1298828125, + "learning_rate": 0.0010886905641049828, + "loss": 5.175, + "step": 7940 + }, + { + "epoch": 0.9015187798673862, + "grad_norm": 0.125, + "learning_rate": 0.0010870750345414973, + "loss": 5.1638, + "step": 7941 + }, + { + "epoch": 0.9016323069773052, + "grad_norm": 0.134765625, + "learning_rate": 0.0010854595466316484, + "loss": 5.1865, + "step": 7942 + }, + { + "epoch": 0.9017458340872243, + "grad_norm": 0.13671875, + "learning_rate": 0.0010838441055817127, + "loss": 5.1731, + "step": 7943 + }, + { + "epoch": 0.9018593611971434, + "grad_norm": 0.1533203125, + "learning_rate": 0.0010822287165978156, + "loss": 5.1617, + "step": 7944 + }, + { + "epoch": 0.9019728883070625, + "grad_norm": 0.1689453125, + "learning_rate": 0.001080613384885915, + "loss": 5.1774, + "step": 7945 + }, + { + "epoch": 0.9020864154169815, + "grad_norm": 0.1845703125, + "learning_rate": 0.0010789981156517847, + "loss": 5.179, + "step": 7946 + }, + { + "epoch": 0.9021999425269006, + "grad_norm": 0.173828125, + "learning_rate": 0.0010773829141009963, + "loss": 5.1648, + "step": 7947 + }, + { + "epoch": 0.9023134696368197, + "grad_norm": 0.185546875, + "learning_rate": 0.0010757677854389036, + "loss": 5.1832, + "step": 7948 + }, + { + "epoch": 0.9024269967467388, + "grad_norm": 0.1787109375, + "learning_rate": 0.0010741527348706254, + "loss": 5.1857, + "step": 7949 + }, + { + "epoch": 0.9025405238566578, + "grad_norm": 0.17578125, + "learning_rate": 0.001072537767601029, + "loss": 5.177, + "step": 7950 + }, + { + "epoch": 0.9026540509665769, + "grad_norm": 0.1796875, + "learning_rate": 0.0010709228888347133, + "loss": 5.1861, + "step": 7951 + }, + { + "epoch": 0.902767578076496, + "grad_norm": 0.171875, + "learning_rate": 0.0010693081037759916, + "loss": 5.1883, + "step": 7952 + }, + { + "epoch": 0.9028811051864151, + "grad_norm": 0.1689453125, + "learning_rate": 0.001067693417628876, + "loss": 5.1767, + "step": 7953 + }, + { + "epoch": 0.9029946322963341, + "grad_norm": 0.16796875, + "learning_rate": 0.001066078835597059, + "loss": 5.1763, + "step": 7954 + }, + { + "epoch": 0.9031081594062532, + "grad_norm": 0.15625, + "learning_rate": 0.0010644643628838976, + "loss": 5.1664, + "step": 7955 + }, + { + "epoch": 0.9032216865161723, + "grad_norm": 0.1669921875, + "learning_rate": 0.0010628500046923975, + "loss": 5.1894, + "step": 7956 + }, + { + "epoch": 0.9033352136260914, + "grad_norm": 0.1630859375, + "learning_rate": 0.0010612357662251938, + "loss": 5.2043, + "step": 7957 + }, + { + "epoch": 0.9034487407360104, + "grad_norm": 0.1689453125, + "learning_rate": 0.001059621652684537, + "loss": 5.1603, + "step": 7958 + }, + { + "epoch": 0.9035622678459295, + "grad_norm": 0.150390625, + "learning_rate": 0.0010580076692722738, + "loss": 5.1801, + "step": 7959 + }, + { + "epoch": 0.9036757949558486, + "grad_norm": 0.15234375, + "learning_rate": 0.0010563938211898337, + "loss": 5.1819, + "step": 7960 + }, + { + "epoch": 0.9037893220657677, + "grad_norm": 0.138671875, + "learning_rate": 0.0010547801136382075, + "loss": 5.1908, + "step": 7961 + }, + { + "epoch": 0.9039028491756868, + "grad_norm": 0.1416015625, + "learning_rate": 0.0010531665518179348, + "loss": 5.1787, + "step": 7962 + }, + { + "epoch": 0.9040163762856058, + "grad_norm": 0.1416015625, + "learning_rate": 0.0010515531409290846, + "loss": 5.1688, + "step": 7963 + }, + { + "epoch": 0.9041299033955249, + "grad_norm": 0.1435546875, + "learning_rate": 0.0010499398861712405, + "loss": 5.1948, + "step": 7964 + }, + { + "epoch": 0.904243430505444, + "grad_norm": 0.1435546875, + "learning_rate": 0.0010483267927434818, + "loss": 5.1724, + "step": 7965 + }, + { + "epoch": 0.904356957615363, + "grad_norm": 0.14453125, + "learning_rate": 0.0010467138658443683, + "loss": 5.1701, + "step": 7966 + }, + { + "epoch": 0.9044704847252821, + "grad_norm": 0.1435546875, + "learning_rate": 0.001045101110671924, + "loss": 5.1979, + "step": 7967 + }, + { + "epoch": 0.9045840118352012, + "grad_norm": 0.1474609375, + "learning_rate": 0.0010434885324236182, + "loss": 5.175, + "step": 7968 + }, + { + "epoch": 0.9046975389451203, + "grad_norm": 0.1435546875, + "learning_rate": 0.0010418761362963502, + "loss": 5.1852, + "step": 7969 + }, + { + "epoch": 0.9048110660550394, + "grad_norm": 0.142578125, + "learning_rate": 0.0010402639274864334, + "loss": 5.1887, + "step": 7970 + }, + { + "epoch": 0.9049245931649584, + "grad_norm": 0.1357421875, + "learning_rate": 0.0010386519111895763, + "loss": 5.1572, + "step": 7971 + }, + { + "epoch": 0.9050381202748775, + "grad_norm": 0.1435546875, + "learning_rate": 0.0010370400926008677, + "loss": 5.1821, + "step": 7972 + }, + { + "epoch": 0.9051516473847966, + "grad_norm": 0.130859375, + "learning_rate": 0.0010354284769147587, + "loss": 5.1717, + "step": 7973 + }, + { + "epoch": 0.9052651744947157, + "grad_norm": 0.1396484375, + "learning_rate": 0.0010338170693250475, + "loss": 5.1817, + "step": 7974 + }, + { + "epoch": 0.9053787016046347, + "grad_norm": 0.1318359375, + "learning_rate": 0.0010322058750248605, + "loss": 5.1866, + "step": 7975 + }, + { + "epoch": 0.9054922287145538, + "grad_norm": 0.1357421875, + "learning_rate": 0.0010305948992066377, + "loss": 5.1933, + "step": 7976 + }, + { + "epoch": 0.9056057558244729, + "grad_norm": 0.13671875, + "learning_rate": 0.0010289841470621142, + "loss": 5.1607, + "step": 7977 + }, + { + "epoch": 0.905719282934392, + "grad_norm": 0.15234375, + "learning_rate": 0.0010273736237823045, + "loss": 5.1882, + "step": 7978 + }, + { + "epoch": 0.905832810044311, + "grad_norm": 0.1533203125, + "learning_rate": 0.001025763334557486, + "loss": 5.1713, + "step": 7979 + }, + { + "epoch": 0.9059463371542301, + "grad_norm": 0.150390625, + "learning_rate": 0.0010241532845771814, + "loss": 5.17, + "step": 7980 + }, + { + "epoch": 0.9060598642641492, + "grad_norm": 0.15234375, + "learning_rate": 0.0010225434790301414, + "loss": 5.1675, + "step": 7981 + }, + { + "epoch": 0.9061733913740683, + "grad_norm": 0.162109375, + "learning_rate": 0.0010209339231043314, + "loss": 5.1693, + "step": 7982 + }, + { + "epoch": 0.9062869184839873, + "grad_norm": 0.1455078125, + "learning_rate": 0.00101932462198691, + "loss": 5.1671, + "step": 7983 + }, + { + "epoch": 0.9064004455939064, + "grad_norm": 0.1494140625, + "learning_rate": 0.001017715580864216, + "loss": 5.1734, + "step": 7984 + }, + { + "epoch": 0.9065139727038255, + "grad_norm": 0.154296875, + "learning_rate": 0.0010161068049217494, + "loss": 5.1768, + "step": 7985 + }, + { + "epoch": 0.9066274998137446, + "grad_norm": 0.158203125, + "learning_rate": 0.0010144982993441564, + "loss": 5.1915, + "step": 7986 + }, + { + "epoch": 0.9067410269236637, + "grad_norm": 0.1533203125, + "learning_rate": 0.0010128900693152114, + "loss": 5.1684, + "step": 7987 + }, + { + "epoch": 0.9068545540335827, + "grad_norm": 0.1611328125, + "learning_rate": 0.0010112821200178009, + "loss": 5.1797, + "step": 7988 + }, + { + "epoch": 0.9069680811435018, + "grad_norm": 0.150390625, + "learning_rate": 0.0010096744566339064, + "loss": 5.1773, + "step": 7989 + }, + { + "epoch": 0.9070816082534209, + "grad_norm": 0.1533203125, + "learning_rate": 0.001008067084344589, + "loss": 5.1864, + "step": 7990 + }, + { + "epoch": 0.90719513536334, + "grad_norm": 0.14453125, + "learning_rate": 0.0010064600083299704, + "loss": 5.1773, + "step": 7991 + }, + { + "epoch": 0.907308662473259, + "grad_norm": 0.1474609375, + "learning_rate": 0.001004853233769218, + "loss": 5.1919, + "step": 7992 + }, + { + "epoch": 0.9074221895831781, + "grad_norm": 0.1484375, + "learning_rate": 0.0010032467658405277, + "loss": 5.1791, + "step": 7993 + }, + { + "epoch": 0.9075357166930972, + "grad_norm": 0.1474609375, + "learning_rate": 0.0010016406097211074, + "loss": 5.162, + "step": 7994 + }, + { + "epoch": 0.9076492438030163, + "grad_norm": 0.134765625, + "learning_rate": 0.0010000347705871597, + "loss": 5.1776, + "step": 7995 + }, + { + "epoch": 0.9077627709129353, + "grad_norm": 0.142578125, + "learning_rate": 0.0009984292536138666, + "loss": 5.1733, + "step": 7996 + }, + { + "epoch": 0.9078762980228544, + "grad_norm": 0.1337890625, + "learning_rate": 0.0009968240639753702, + "loss": 5.1716, + "step": 7997 + }, + { + "epoch": 0.9079898251327735, + "grad_norm": 0.130859375, + "learning_rate": 0.0009952192068447597, + "loss": 5.172, + "step": 7998 + }, + { + "epoch": 0.9081033522426926, + "grad_norm": 0.134765625, + "learning_rate": 0.0009936146873940506, + "loss": 5.182, + "step": 7999 + }, + { + "epoch": 0.9082168793526116, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009920105107941717, + "loss": 5.1707, + "step": 8000 + }, + { + "epoch": 0.9083304064625307, + "grad_norm": 0.134765625, + "learning_rate": 0.0009904066822149467, + "loss": 5.175, + "step": 8001 + }, + { + "epoch": 0.9084439335724498, + "grad_norm": 0.142578125, + "learning_rate": 0.0009888032068250767, + "loss": 5.1907, + "step": 8002 + }, + { + "epoch": 0.9085574606823689, + "grad_norm": 0.1435546875, + "learning_rate": 0.0009872000897921261, + "loss": 5.1779, + "step": 8003 + }, + { + "epoch": 0.9086709877922879, + "grad_norm": 0.15625, + "learning_rate": 0.0009855973362825033, + "loss": 5.1776, + "step": 8004 + }, + { + "epoch": 0.908784514902207, + "grad_norm": 0.146484375, + "learning_rate": 0.0009839949514614454, + "loss": 5.1975, + "step": 8005 + }, + { + "epoch": 0.9088980420121261, + "grad_norm": 0.1494140625, + "learning_rate": 0.0009823929404930015, + "loss": 5.1914, + "step": 8006 + }, + { + "epoch": 0.9090115691220452, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009807913085400155, + "loss": 5.1892, + "step": 8007 + }, + { + "epoch": 0.9091250962319642, + "grad_norm": 0.140625, + "learning_rate": 0.0009791900607641103, + "loss": 5.1815, + "step": 8008 + }, + { + "epoch": 0.9092386233418833, + "grad_norm": 0.1416015625, + "learning_rate": 0.0009775892023256704, + "loss": 5.1674, + "step": 8009 + }, + { + "epoch": 0.9093521504518024, + "grad_norm": 0.15234375, + "learning_rate": 0.000975988738383826, + "loss": 5.1825, + "step": 8010 + }, + { + "epoch": 0.9094656775617215, + "grad_norm": 0.14453125, + "learning_rate": 0.0009743886740964352, + "loss": 5.1813, + "step": 8011 + }, + { + "epoch": 0.9095792046716406, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009727890146200687, + "loss": 5.1862, + "step": 8012 + }, + { + "epoch": 0.9096927317815596, + "grad_norm": 0.13671875, + "learning_rate": 0.0009711897651099925, + "loss": 5.189, + "step": 8013 + }, + { + "epoch": 0.9098062588914787, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009695909307201513, + "loss": 5.1727, + "step": 8014 + }, + { + "epoch": 0.9099197860013978, + "grad_norm": 0.142578125, + "learning_rate": 0.0009679925166031525, + "loss": 5.1764, + "step": 8015 + }, + { + "epoch": 0.9100333131113169, + "grad_norm": 0.15234375, + "learning_rate": 0.0009663945279102477, + "loss": 5.1844, + "step": 8016 + }, + { + "epoch": 0.9101468402212359, + "grad_norm": 0.1455078125, + "learning_rate": 0.0009647969697913186, + "loss": 5.1706, + "step": 8017 + }, + { + "epoch": 0.910260367331155, + "grad_norm": 0.140625, + "learning_rate": 0.0009631998473948596, + "loss": 5.1549, + "step": 8018 + }, + { + "epoch": 0.9103738944410741, + "grad_norm": 0.126953125, + "learning_rate": 0.00096160316586796, + "loss": 5.192, + "step": 8019 + }, + { + "epoch": 0.9104874215509932, + "grad_norm": 0.13671875, + "learning_rate": 0.0009600069303562887, + "loss": 5.1895, + "step": 8020 + }, + { + "epoch": 0.9106009486609122, + "grad_norm": 0.130859375, + "learning_rate": 0.0009584111460040767, + "loss": 5.195, + "step": 8021 + }, + { + "epoch": 0.9107144757708313, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009568158179541022, + "loss": 5.1845, + "step": 8022 + }, + { + "epoch": 0.9108280028807504, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009552209513476718, + "loss": 5.1668, + "step": 8023 + }, + { + "epoch": 0.9109415299906695, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009536265513246047, + "loss": 5.1755, + "step": 8024 + }, + { + "epoch": 0.9110550571005885, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009520326230232181, + "loss": 5.1808, + "step": 8025 + }, + { + "epoch": 0.9111685842105076, + "grad_norm": 0.125, + "learning_rate": 0.0009504391715803076, + "loss": 5.1793, + "step": 8026 + }, + { + "epoch": 0.9112821113204267, + "grad_norm": 0.12353515625, + "learning_rate": 0.0009488462021311324, + "loss": 5.1889, + "step": 8027 + }, + { + "epoch": 0.9113956384303458, + "grad_norm": 0.12158203125, + "learning_rate": 0.000947253719809398, + "loss": 5.1645, + "step": 8028 + }, + { + "epoch": 0.9115091655402648, + "grad_norm": 0.11572265625, + "learning_rate": 0.0009456617297472407, + "loss": 5.1742, + "step": 8029 + }, + { + "epoch": 0.9116226926501839, + "grad_norm": 0.1201171875, + "learning_rate": 0.0009440702370752097, + "loss": 5.167, + "step": 8030 + }, + { + "epoch": 0.911736219760103, + "grad_norm": 0.126953125, + "learning_rate": 0.0009424792469222516, + "loss": 5.158, + "step": 8031 + }, + { + "epoch": 0.9118497468700221, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009408887644156938, + "loss": 5.1752, + "step": 8032 + }, + { + "epoch": 0.9119632739799411, + "grad_norm": 0.12158203125, + "learning_rate": 0.000939298794681227, + "loss": 5.1599, + "step": 8033 + }, + { + "epoch": 0.9120768010898602, + "grad_norm": 0.123046875, + "learning_rate": 0.0009377093428428903, + "loss": 5.1586, + "step": 8034 + }, + { + "epoch": 0.9121903281997793, + "grad_norm": 0.11572265625, + "learning_rate": 0.0009361204140230524, + "loss": 5.1629, + "step": 8035 + }, + { + "epoch": 0.9123038553096984, + "grad_norm": 0.123046875, + "learning_rate": 0.000934532013342398, + "loss": 5.161, + "step": 8036 + }, + { + "epoch": 0.9124173824196175, + "grad_norm": 0.126953125, + "learning_rate": 0.0009329441459199089, + "loss": 5.172, + "step": 8037 + }, + { + "epoch": 0.9125309095295365, + "grad_norm": 0.134765625, + "learning_rate": 0.0009313568168728477, + "loss": 5.17, + "step": 8038 + }, + { + "epoch": 0.9126444366394556, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009297700313167439, + "loss": 5.1936, + "step": 8039 + }, + { + "epoch": 0.9127579637493747, + "grad_norm": 0.1328125, + "learning_rate": 0.0009281837943653737, + "loss": 5.1874, + "step": 8040 + }, + { + "epoch": 0.9128714908592938, + "grad_norm": 0.130859375, + "learning_rate": 0.000926598111130746, + "loss": 5.1661, + "step": 8041 + }, + { + "epoch": 0.9129850179692128, + "grad_norm": 0.1318359375, + "learning_rate": 0.0009250129867230852, + "loss": 5.161, + "step": 8042 + }, + { + "epoch": 0.9130985450791319, + "grad_norm": 0.119140625, + "learning_rate": 0.0009234284262508146, + "loss": 5.1861, + "step": 8043 + }, + { + "epoch": 0.913212072189051, + "grad_norm": 0.12255859375, + "learning_rate": 0.0009218444348205401, + "loss": 5.176, + "step": 8044 + }, + { + "epoch": 0.9133255992989701, + "grad_norm": 0.1220703125, + "learning_rate": 0.0009202610175370338, + "loss": 5.1729, + "step": 8045 + }, + { + "epoch": 0.9134391264088891, + "grad_norm": 0.12890625, + "learning_rate": 0.0009186781795032178, + "loss": 5.1907, + "step": 8046 + }, + { + "epoch": 0.9135526535188082, + "grad_norm": 0.134765625, + "learning_rate": 0.0009170959258201468, + "loss": 5.1914, + "step": 8047 + }, + { + "epoch": 0.9136661806287273, + "grad_norm": 0.1357421875, + "learning_rate": 0.0009155142615869927, + "loss": 5.1819, + "step": 8048 + }, + { + "epoch": 0.9137797077386464, + "grad_norm": 0.1259765625, + "learning_rate": 0.0009139331919010281, + "loss": 5.1969, + "step": 8049 + }, + { + "epoch": 0.9138932348485654, + "grad_norm": 0.1298828125, + "learning_rate": 0.0009123527218576085, + "loss": 5.1699, + "step": 8050 + }, + { + "epoch": 0.9140067619584845, + "grad_norm": 0.11962890625, + "learning_rate": 0.0009107728565501581, + "loss": 5.1461, + "step": 8051 + }, + { + "epoch": 0.9141202890684036, + "grad_norm": 0.1279296875, + "learning_rate": 0.0009091936010701511, + "loss": 5.1691, + "step": 8052 + }, + { + "epoch": 0.9142338161783227, + "grad_norm": 0.126953125, + "learning_rate": 0.0009076149605070969, + "loss": 5.1581, + "step": 8053 + }, + { + "epoch": 0.9143473432882417, + "grad_norm": 0.1337890625, + "learning_rate": 0.000906036939948524, + "loss": 5.1846, + "step": 8054 + }, + { + "epoch": 0.9144608703981608, + "grad_norm": 0.11279296875, + "learning_rate": 0.0009044595444799615, + "loss": 5.1796, + "step": 8055 + }, + { + "epoch": 0.9145743975080799, + "grad_norm": 0.1162109375, + "learning_rate": 0.0009028827791849245, + "loss": 5.1764, + "step": 8056 + }, + { + "epoch": 0.914687924617999, + "grad_norm": 0.11572265625, + "learning_rate": 0.0009013066491448976, + "loss": 5.1737, + "step": 8057 + }, + { + "epoch": 0.914801451727918, + "grad_norm": 0.11767578125, + "learning_rate": 0.0008997311594393172, + "loss": 5.1796, + "step": 8058 + }, + { + "epoch": 0.9149149788378371, + "grad_norm": 0.11376953125, + "learning_rate": 0.0008981563151455572, + "loss": 5.1749, + "step": 8059 + }, + { + "epoch": 0.9150285059477562, + "grad_norm": 0.12255859375, + "learning_rate": 0.0008965821213389105, + "loss": 5.1714, + "step": 8060 + }, + { + "epoch": 0.9151420330576753, + "grad_norm": 0.1162109375, + "learning_rate": 0.0008950085830925745, + "loss": 5.1859, + "step": 8061 + }, + { + "epoch": 0.9152555601675944, + "grad_norm": 0.1142578125, + "learning_rate": 0.0008934357054776336, + "loss": 5.1832, + "step": 8062 + }, + { + "epoch": 0.9153690872775134, + "grad_norm": 0.10986328125, + "learning_rate": 0.0008918634935630432, + "loss": 5.1824, + "step": 8063 + }, + { + "epoch": 0.9154826143874325, + "grad_norm": 0.11669921875, + "learning_rate": 0.0008902919524156128, + "loss": 5.1731, + "step": 8064 + }, + { + "epoch": 0.9155961414973516, + "grad_norm": 0.12158203125, + "learning_rate": 0.0008887210870999912, + "loss": 5.1876, + "step": 8065 + }, + { + "epoch": 0.9157096686072707, + "grad_norm": 0.111328125, + "learning_rate": 0.0008871509026786479, + "loss": 5.1597, + "step": 8066 + }, + { + "epoch": 0.9158231957171897, + "grad_norm": 0.1103515625, + "learning_rate": 0.0008855814042118593, + "loss": 5.1837, + "step": 8067 + }, + { + "epoch": 0.9159367228271088, + "grad_norm": 0.1181640625, + "learning_rate": 0.0008840125967576907, + "loss": 5.1709, + "step": 8068 + }, + { + "epoch": 0.9160502499370279, + "grad_norm": 0.1083984375, + "learning_rate": 0.0008824444853719802, + "loss": 5.1555, + "step": 8069 + }, + { + "epoch": 0.916163777046947, + "grad_norm": 0.11328125, + "learning_rate": 0.0008808770751083231, + "loss": 5.15, + "step": 8070 + }, + { + "epoch": 0.916277304156866, + "grad_norm": 0.11328125, + "learning_rate": 0.0008793103710180545, + "loss": 5.1665, + "step": 8071 + }, + { + "epoch": 0.9163908312667851, + "grad_norm": 0.11962890625, + "learning_rate": 0.0008777443781502345, + "loss": 5.1557, + "step": 8072 + }, + { + "epoch": 0.9165043583767042, + "grad_norm": 0.11328125, + "learning_rate": 0.000876179101551631, + "loss": 5.1617, + "step": 8073 + }, + { + "epoch": 0.9166178854866233, + "grad_norm": 0.11767578125, + "learning_rate": 0.0008746145462667026, + "loss": 5.1653, + "step": 8074 + }, + { + "epoch": 0.9167314125965423, + "grad_norm": 0.1162109375, + "learning_rate": 0.0008730507173375841, + "loss": 5.1468, + "step": 8075 + }, + { + "epoch": 0.9168449397064614, + "grad_norm": 0.10986328125, + "learning_rate": 0.0008714876198040699, + "loss": 5.1362, + "step": 8076 + }, + { + "epoch": 0.9169584668163805, + "grad_norm": 0.109375, + "learning_rate": 0.0008699252587035969, + "loss": 5.1758, + "step": 8077 + }, + { + "epoch": 0.9170719939262996, + "grad_norm": 0.1064453125, + "learning_rate": 0.0008683636390712282, + "loss": 5.1589, + "step": 8078 + }, + { + "epoch": 0.9171855210362186, + "grad_norm": 0.10595703125, + "learning_rate": 0.0008668027659396374, + "loss": 5.1656, + "step": 8079 + }, + { + "epoch": 0.9172990481461377, + "grad_norm": 0.111328125, + "learning_rate": 0.000865242644339093, + "loss": 5.1928, + "step": 8080 + }, + { + "epoch": 0.9174125752560568, + "grad_norm": 0.10205078125, + "learning_rate": 0.0008636832792974409, + "loss": 5.1834, + "step": 8081 + }, + { + "epoch": 0.9175261023659759, + "grad_norm": 0.1123046875, + "learning_rate": 0.0008621246758400898, + "loss": 5.1582, + "step": 8082 + }, + { + "epoch": 0.917639629475895, + "grad_norm": 0.1171875, + "learning_rate": 0.0008605668389899925, + "loss": 5.1827, + "step": 8083 + }, + { + "epoch": 0.917753156585814, + "grad_norm": 0.1171875, + "learning_rate": 0.0008590097737676323, + "loss": 5.156, + "step": 8084 + }, + { + "epoch": 0.9178666836957332, + "grad_norm": 0.11181640625, + "learning_rate": 0.0008574534851910059, + "loss": 5.1729, + "step": 8085 + }, + { + "epoch": 0.9179802108056523, + "grad_norm": 0.12255859375, + "learning_rate": 0.0008558979782756062, + "loss": 5.1755, + "step": 8086 + }, + { + "epoch": 0.9180937379155714, + "grad_norm": 0.119140625, + "learning_rate": 0.0008543432580344075, + "loss": 5.1685, + "step": 8087 + }, + { + "epoch": 0.9182072650254904, + "grad_norm": 0.1142578125, + "learning_rate": 0.000852789329477849, + "loss": 5.1662, + "step": 8088 + }, + { + "epoch": 0.9183207921354095, + "grad_norm": 0.1181640625, + "learning_rate": 0.0008512361976138181, + "loss": 5.1779, + "step": 8089 + }, + { + "epoch": 0.9184343192453286, + "grad_norm": 0.11962890625, + "learning_rate": 0.0008496838674476355, + "loss": 5.1755, + "step": 8090 + }, + { + "epoch": 0.9185478463552477, + "grad_norm": 0.12109375, + "learning_rate": 0.0008481323439820374, + "loss": 5.1647, + "step": 8091 + }, + { + "epoch": 0.9186613734651667, + "grad_norm": 0.11572265625, + "learning_rate": 0.0008465816322171605, + "loss": 5.1856, + "step": 8092 + }, + { + "epoch": 0.9187749005750858, + "grad_norm": 0.10595703125, + "learning_rate": 0.0008450317371505258, + "loss": 5.183, + "step": 8093 + }, + { + "epoch": 0.9188884276850049, + "grad_norm": 0.119140625, + "learning_rate": 0.0008434826637770216, + "loss": 5.162, + "step": 8094 + }, + { + "epoch": 0.919001954794924, + "grad_norm": 0.11669921875, + "learning_rate": 0.0008419344170888892, + "loss": 5.1651, + "step": 8095 + }, + { + "epoch": 0.919115481904843, + "grad_norm": 0.11962890625, + "learning_rate": 0.000840387002075705, + "loss": 5.1786, + "step": 8096 + }, + { + "epoch": 0.9192290090147621, + "grad_norm": 0.1162109375, + "learning_rate": 0.0008388404237243652, + "loss": 5.1682, + "step": 8097 + }, + { + "epoch": 0.9193425361246812, + "grad_norm": 0.1279296875, + "learning_rate": 0.00083729468701907, + "loss": 5.1688, + "step": 8098 + }, + { + "epoch": 0.9194560632346003, + "grad_norm": 0.12890625, + "learning_rate": 0.0008357497969413068, + "loss": 5.1825, + "step": 8099 + }, + { + "epoch": 0.9195695903445193, + "grad_norm": 0.12451171875, + "learning_rate": 0.0008342057584698349, + "loss": 5.1891, + "step": 8100 + }, + { + "epoch": 0.9196831174544384, + "grad_norm": 0.11181640625, + "learning_rate": 0.0008326625765806688, + "loss": 5.1981, + "step": 8101 + }, + { + "epoch": 0.9197966445643575, + "grad_norm": 0.11376953125, + "learning_rate": 0.0008311202562470626, + "loss": 5.1757, + "step": 8102 + }, + { + "epoch": 0.9199101716742766, + "grad_norm": 0.10595703125, + "learning_rate": 0.0008295788024394932, + "loss": 5.1916, + "step": 8103 + }, + { + "epoch": 0.9200236987841957, + "grad_norm": 0.107421875, + "learning_rate": 0.0008280382201256469, + "loss": 5.1824, + "step": 8104 + }, + { + "epoch": 0.9201372258941147, + "grad_norm": 0.10205078125, + "learning_rate": 0.0008264985142703991, + "loss": 5.1651, + "step": 8105 + }, + { + "epoch": 0.9202507530040338, + "grad_norm": 0.10546875, + "learning_rate": 0.0008249596898358019, + "loss": 5.1896, + "step": 8106 + }, + { + "epoch": 0.9203642801139529, + "grad_norm": 0.1142578125, + "learning_rate": 0.0008234217517810663, + "loss": 5.1711, + "step": 8107 + }, + { + "epoch": 0.920477807223872, + "grad_norm": 0.11767578125, + "learning_rate": 0.0008218847050625476, + "loss": 5.1697, + "step": 8108 + }, + { + "epoch": 0.920591334333791, + "grad_norm": 0.11474609375, + "learning_rate": 0.000820348554633727, + "loss": 5.1701, + "step": 8109 + }, + { + "epoch": 0.9207048614437101, + "grad_norm": 0.1083984375, + "learning_rate": 0.0008188133054451987, + "loss": 5.1778, + "step": 8110 + }, + { + "epoch": 0.9208183885536292, + "grad_norm": 0.10400390625, + "learning_rate": 0.0008172789624446513, + "loss": 5.1575, + "step": 8111 + }, + { + "epoch": 0.9209319156635483, + "grad_norm": 0.10888671875, + "learning_rate": 0.0008157455305768544, + "loss": 5.1888, + "step": 8112 + }, + { + "epoch": 0.9210454427734673, + "grad_norm": 0.10498046875, + "learning_rate": 0.0008142130147836402, + "loss": 5.1734, + "step": 8113 + }, + { + "epoch": 0.9211589698833864, + "grad_norm": 0.10791015625, + "learning_rate": 0.0008126814200038885, + "loss": 5.1834, + "step": 8114 + }, + { + "epoch": 0.9212724969933055, + "grad_norm": 0.1005859375, + "learning_rate": 0.0008111507511735117, + "loss": 5.1632, + "step": 8115 + }, + { + "epoch": 0.9213860241032246, + "grad_norm": 0.10498046875, + "learning_rate": 0.0008096210132254373, + "loss": 5.1709, + "step": 8116 + }, + { + "epoch": 0.9214995512131436, + "grad_norm": 0.09716796875, + "learning_rate": 0.0008080922110895937, + "loss": 5.1507, + "step": 8117 + }, + { + "epoch": 0.9216130783230627, + "grad_norm": 0.10205078125, + "learning_rate": 0.0008065643496928925, + "loss": 5.1566, + "step": 8118 + }, + { + "epoch": 0.9217266054329818, + "grad_norm": 0.10205078125, + "learning_rate": 0.0008050374339592144, + "loss": 5.1587, + "step": 8119 + }, + { + "epoch": 0.9218401325429009, + "grad_norm": 0.107421875, + "learning_rate": 0.0008035114688093918, + "loss": 5.1667, + "step": 8120 + }, + { + "epoch": 0.92195365965282, + "grad_norm": 0.10498046875, + "learning_rate": 0.0008019864591611945, + "loss": 5.1718, + "step": 8121 + }, + { + "epoch": 0.922067186762739, + "grad_norm": 0.10498046875, + "learning_rate": 0.0008004624099293121, + "loss": 5.1607, + "step": 8122 + }, + { + "epoch": 0.9221807138726581, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007989393260253392, + "loss": 5.1886, + "step": 8123 + }, + { + "epoch": 0.9222942409825772, + "grad_norm": 0.10400390625, + "learning_rate": 0.0007974172123577599, + "loss": 5.1664, + "step": 8124 + }, + { + "epoch": 0.9224077680924962, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007958960738319305, + "loss": 5.1883, + "step": 8125 + }, + { + "epoch": 0.9225212952024153, + "grad_norm": 0.1083984375, + "learning_rate": 0.0007943759153500665, + "loss": 5.156, + "step": 8126 + }, + { + "epoch": 0.9226348223123344, + "grad_norm": 0.11083984375, + "learning_rate": 0.0007928567418112229, + "loss": 5.1808, + "step": 8127 + }, + { + "epoch": 0.9227483494222535, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007913385581112817, + "loss": 5.1537, + "step": 8128 + }, + { + "epoch": 0.9228618765321726, + "grad_norm": 0.0947265625, + "learning_rate": 0.0007898213691429348, + "loss": 5.1715, + "step": 8129 + }, + { + "epoch": 0.9229754036420916, + "grad_norm": 0.09765625, + "learning_rate": 0.0007883051797956679, + "loss": 5.1693, + "step": 8130 + }, + { + "epoch": 0.9230889307520107, + "grad_norm": 0.0986328125, + "learning_rate": 0.000786789994955746, + "loss": 5.196, + "step": 8131 + }, + { + "epoch": 0.9232024578619298, + "grad_norm": 0.10595703125, + "learning_rate": 0.0007852758195061955, + "loss": 5.1696, + "step": 8132 + }, + { + "epoch": 0.9233159849718489, + "grad_norm": 0.10546875, + "learning_rate": 0.0007837626583267916, + "loss": 5.1719, + "step": 8133 + }, + { + "epoch": 0.9234295120817679, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007822505162940392, + "loss": 5.1936, + "step": 8134 + }, + { + "epoch": 0.923543039191687, + "grad_norm": 0.099609375, + "learning_rate": 0.0007807393982811599, + "loss": 5.1906, + "step": 8135 + }, + { + "epoch": 0.9236565663016061, + "grad_norm": 0.10546875, + "learning_rate": 0.0007792293091580745, + "loss": 5.1816, + "step": 8136 + }, + { + "epoch": 0.9237700934115252, + "grad_norm": 0.0966796875, + "learning_rate": 0.0007777202537913878, + "loss": 5.1559, + "step": 8137 + }, + { + "epoch": 0.9238836205214442, + "grad_norm": 0.09912109375, + "learning_rate": 0.000776212237044374, + "loss": 5.1688, + "step": 8138 + }, + { + "epoch": 0.9239971476313633, + "grad_norm": 0.10107421875, + "learning_rate": 0.0007747052637769591, + "loss": 5.185, + "step": 8139 + }, + { + "epoch": 0.9241106747412824, + "grad_norm": 0.10302734375, + "learning_rate": 0.0007731993388457066, + "loss": 5.1774, + "step": 8140 + }, + { + "epoch": 0.9242242018512015, + "grad_norm": 0.09375, + "learning_rate": 0.000771694467103802, + "loss": 5.1767, + "step": 8141 + }, + { + "epoch": 0.9243377289611205, + "grad_norm": 0.09228515625, + "learning_rate": 0.0007701906534010361, + "loss": 5.169, + "step": 8142 + }, + { + "epoch": 0.9244512560710396, + "grad_norm": 0.09326171875, + "learning_rate": 0.0007686879025837898, + "loss": 5.1879, + "step": 8143 + }, + { + "epoch": 0.9245647831809587, + "grad_norm": 0.09765625, + "learning_rate": 0.0007671862194950194, + "loss": 5.1811, + "step": 8144 + }, + { + "epoch": 0.9246783102908778, + "grad_norm": 0.09033203125, + "learning_rate": 0.0007656856089742396, + "loss": 5.1636, + "step": 8145 + }, + { + "epoch": 0.9247918374007968, + "grad_norm": 0.0927734375, + "learning_rate": 0.0007641860758575081, + "loss": 5.1663, + "step": 8146 + }, + { + "epoch": 0.9249053645107159, + "grad_norm": 0.09033203125, + "learning_rate": 0.000762687624977411, + "loss": 5.1732, + "step": 8147 + }, + { + "epoch": 0.925018891620635, + "grad_norm": 0.0966796875, + "learning_rate": 0.0007611902611630473, + "loss": 5.142, + "step": 8148 + }, + { + "epoch": 0.9251324187305541, + "grad_norm": 0.09716796875, + "learning_rate": 0.0007596939892400112, + "loss": 5.1859, + "step": 8149 + }, + { + "epoch": 0.9252459458404731, + "grad_norm": 0.10009765625, + "learning_rate": 0.0007581988140303791, + "loss": 5.17, + "step": 8150 + }, + { + "epoch": 0.9253594729503922, + "grad_norm": 0.09814453125, + "learning_rate": 0.0007567047403526925, + "loss": 5.1652, + "step": 8151 + }, + { + "epoch": 0.9254730000603113, + "grad_norm": 0.0986328125, + "learning_rate": 0.0007552117730219434, + "loss": 5.1747, + "step": 8152 + }, + { + "epoch": 0.9255865271702304, + "grad_norm": 0.09521484375, + "learning_rate": 0.0007537199168495577, + "loss": 5.1422, + "step": 8153 + }, + { + "epoch": 0.9257000542801495, + "grad_norm": 0.095703125, + "learning_rate": 0.0007522291766433809, + "loss": 5.1636, + "step": 8154 + }, + { + "epoch": 0.9258135813900685, + "grad_norm": 0.0947265625, + "learning_rate": 0.0007507395572076621, + "loss": 5.1438, + "step": 8155 + }, + { + "epoch": 0.9259271084999876, + "grad_norm": 0.09814453125, + "learning_rate": 0.0007492510633430378, + "loss": 5.164, + "step": 8156 + }, + { + "epoch": 0.9260406356099067, + "grad_norm": 0.09228515625, + "learning_rate": 0.0007477636998465178, + "loss": 5.1579, + "step": 8157 + }, + { + "epoch": 0.9261541627198258, + "grad_norm": 0.09375, + "learning_rate": 0.0007462774715114688, + "loss": 5.1659, + "step": 8158 + }, + { + "epoch": 0.9262676898297448, + "grad_norm": 0.09228515625, + "learning_rate": 0.0007447923831275989, + "loss": 5.1565, + "step": 8159 + }, + { + "epoch": 0.9263812169396639, + "grad_norm": 0.09423828125, + "learning_rate": 0.0007433084394809431, + "loss": 5.1695, + "step": 8160 + }, + { + "epoch": 0.926494744049583, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007418256453538459, + "loss": 5.1713, + "step": 8161 + }, + { + "epoch": 0.9266082711595021, + "grad_norm": 0.1005859375, + "learning_rate": 0.0007403440055249491, + "loss": 5.1808, + "step": 8162 + }, + { + "epoch": 0.9267217982694211, + "grad_norm": 0.09716796875, + "learning_rate": 0.0007388635247691733, + "loss": 5.175, + "step": 8163 + }, + { + "epoch": 0.9268353253793402, + "grad_norm": 0.1015625, + "learning_rate": 0.0007373842078577038, + "loss": 5.1892, + "step": 8164 + }, + { + "epoch": 0.9269488524892593, + "grad_norm": 0.09716796875, + "learning_rate": 0.0007359060595579752, + "loss": 5.175, + "step": 8165 + }, + { + "epoch": 0.9270623795991784, + "grad_norm": 0.099609375, + "learning_rate": 0.0007344290846336561, + "loss": 5.1682, + "step": 8166 + }, + { + "epoch": 0.9271759067090974, + "grad_norm": 0.09326171875, + "learning_rate": 0.0007329532878446339, + "loss": 5.173, + "step": 8167 + }, + { + "epoch": 0.9272894338190165, + "grad_norm": 0.08935546875, + "learning_rate": 0.0007314786739469987, + "loss": 5.1673, + "step": 8168 + }, + { + "epoch": 0.9274029609289356, + "grad_norm": 0.0927734375, + "learning_rate": 0.0007300052476930286, + "loss": 5.172, + "step": 8169 + }, + { + "epoch": 0.9275164880388547, + "grad_norm": 0.0927734375, + "learning_rate": 0.0007285330138311746, + "loss": 5.1901, + "step": 8170 + }, + { + "epoch": 0.9276300151487737, + "grad_norm": 0.08837890625, + "learning_rate": 0.0007270619771060443, + "loss": 5.1783, + "step": 8171 + }, + { + "epoch": 0.9277435422586928, + "grad_norm": 0.09521484375, + "learning_rate": 0.000725592142258388, + "loss": 5.1678, + "step": 8172 + }, + { + "epoch": 0.9278570693686119, + "grad_norm": 0.08837890625, + "learning_rate": 0.0007241235140250822, + "loss": 5.1475, + "step": 8173 + }, + { + "epoch": 0.927970596478531, + "grad_norm": 0.09765625, + "learning_rate": 0.0007226560971391145, + "loss": 5.1827, + "step": 8174 + }, + { + "epoch": 0.92808412358845, + "grad_norm": 0.09716796875, + "learning_rate": 0.00072118989632957, + "loss": 5.1874, + "step": 8175 + }, + { + "epoch": 0.9281976506983691, + "grad_norm": 0.095703125, + "learning_rate": 0.0007197249163216122, + "loss": 5.1481, + "step": 8176 + }, + { + "epoch": 0.9283111778082882, + "grad_norm": 0.09521484375, + "learning_rate": 0.0007182611618364736, + "loss": 5.1714, + "step": 8177 + }, + { + "epoch": 0.9284247049182073, + "grad_norm": 0.08740234375, + "learning_rate": 0.0007167986375914346, + "loss": 5.1695, + "step": 8178 + }, + { + "epoch": 0.9285382320281264, + "grad_norm": 0.0869140625, + "learning_rate": 0.000715337348299812, + "loss": 5.1633, + "step": 8179 + }, + { + "epoch": 0.9286517591380454, + "grad_norm": 0.0869140625, + "learning_rate": 0.0007138772986709421, + "loss": 5.1806, + "step": 8180 + }, + { + "epoch": 0.9287652862479645, + "grad_norm": 0.083984375, + "learning_rate": 0.0007124184934101665, + "loss": 5.1536, + "step": 8181 + }, + { + "epoch": 0.9288788133578836, + "grad_norm": 0.0859375, + "learning_rate": 0.0007109609372188164, + "loss": 5.1775, + "step": 8182 + }, + { + "epoch": 0.9289923404678027, + "grad_norm": 0.0859375, + "learning_rate": 0.0007095046347941978, + "loss": 5.1505, + "step": 8183 + }, + { + "epoch": 0.9291058675777217, + "grad_norm": 0.0908203125, + "learning_rate": 0.0007080495908295759, + "loss": 5.1821, + "step": 8184 + }, + { + "epoch": 0.9292193946876408, + "grad_norm": 0.09033203125, + "learning_rate": 0.0007065958100141607, + "loss": 5.1717, + "step": 8185 + }, + { + "epoch": 0.9293329217975599, + "grad_norm": 0.09423828125, + "learning_rate": 0.0007051432970330902, + "loss": 5.1634, + "step": 8186 + }, + { + "epoch": 0.929446448907479, + "grad_norm": 0.09375, + "learning_rate": 0.0007036920565674181, + "loss": 5.1572, + "step": 8187 + }, + { + "epoch": 0.929559976017398, + "grad_norm": 0.09228515625, + "learning_rate": 0.0007022420932940962, + "loss": 5.1538, + "step": 8188 + }, + { + "epoch": 0.9296735031273171, + "grad_norm": 0.09326171875, + "learning_rate": 0.00070079341188596, + "loss": 5.1414, + "step": 8189 + }, + { + "epoch": 0.9297870302372362, + "grad_norm": 0.08984375, + "learning_rate": 0.000699346017011715, + "loss": 5.1603, + "step": 8190 + }, + { + "epoch": 0.9299005573471553, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006978999133359192, + "loss": 5.1521, + "step": 8191 + }, + { + "epoch": 0.9300140844570743, + "grad_norm": 0.09375, + "learning_rate": 0.0006964551055189712, + "loss": 5.1734, + "step": 8192 + }, + { + "epoch": 0.9301276115669934, + "grad_norm": 0.09033203125, + "learning_rate": 0.000695011598217091, + "loss": 5.1705, + "step": 8193 + }, + { + "epoch": 0.9302411386769125, + "grad_norm": 0.0947265625, + "learning_rate": 0.0006935693960823097, + "loss": 5.1803, + "step": 8194 + }, + { + "epoch": 0.9303546657868316, + "grad_norm": 0.0849609375, + "learning_rate": 0.0006921285037624507, + "loss": 5.176, + "step": 8195 + }, + { + "epoch": 0.9304681928967506, + "grad_norm": 0.087890625, + "learning_rate": 0.0006906889259011165, + "loss": 5.1605, + "step": 8196 + }, + { + "epoch": 0.9305817200066697, + "grad_norm": 0.0849609375, + "learning_rate": 0.0006892506671376742, + "loss": 5.1814, + "step": 8197 + }, + { + "epoch": 0.9306952471165888, + "grad_norm": 0.0849609375, + "learning_rate": 0.0006878137321072394, + "loss": 5.1576, + "step": 8198 + }, + { + "epoch": 0.9308087742265079, + "grad_norm": 0.0859375, + "learning_rate": 0.0006863781254406611, + "loss": 5.184, + "step": 8199 + }, + { + "epoch": 0.930922301336427, + "grad_norm": 0.08984375, + "learning_rate": 0.0006849438517645077, + "loss": 5.1449, + "step": 8200 + }, + { + "epoch": 0.931035828446346, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006835109157010528, + "loss": 5.1922, + "step": 8201 + }, + { + "epoch": 0.9311493555562651, + "grad_norm": 0.09228515625, + "learning_rate": 0.0006820793218682572, + "loss": 5.1593, + "step": 8202 + }, + { + "epoch": 0.9312628826661842, + "grad_norm": 0.0908203125, + "learning_rate": 0.0006806490748797581, + "loss": 5.1539, + "step": 8203 + }, + { + "epoch": 0.9313764097761033, + "grad_norm": 0.09423828125, + "learning_rate": 0.0006792201793448501, + "loss": 5.1673, + "step": 8204 + }, + { + "epoch": 0.9314899368860223, + "grad_norm": 0.08544921875, + "learning_rate": 0.0006777926398684743, + "loss": 5.1838, + "step": 8205 + }, + { + "epoch": 0.9316034639959414, + "grad_norm": 0.08740234375, + "learning_rate": 0.0006763664610512007, + "loss": 5.1741, + "step": 8206 + }, + { + "epoch": 0.9317169911058605, + "grad_norm": 0.08984375, + "learning_rate": 0.0006749416474892148, + "loss": 5.161, + "step": 8207 + }, + { + "epoch": 0.9318305182157796, + "grad_norm": 0.08544921875, + "learning_rate": 0.0006735182037743013, + "loss": 5.1653, + "step": 8208 + }, + { + "epoch": 0.9319440453256986, + "grad_norm": 0.08837890625, + "learning_rate": 0.000672096134493831, + "loss": 5.1721, + "step": 8209 + }, + { + "epoch": 0.9320575724356177, + "grad_norm": 0.08740234375, + "learning_rate": 0.0006706754442307456, + "loss": 5.1812, + "step": 8210 + }, + { + "epoch": 0.9321710995455368, + "grad_norm": 0.0791015625, + "learning_rate": 0.0006692561375635414, + "loss": 5.1738, + "step": 8211 + }, + { + "epoch": 0.9322846266554559, + "grad_norm": 0.0849609375, + "learning_rate": 0.0006678382190662568, + "loss": 5.1649, + "step": 8212 + }, + { + "epoch": 0.9323981537653749, + "grad_norm": 0.0859375, + "learning_rate": 0.0006664216933084562, + "loss": 5.1495, + "step": 8213 + }, + { + "epoch": 0.932511680875294, + "grad_norm": 0.0927734375, + "learning_rate": 0.0006650065648552158, + "loss": 5.16, + "step": 8214 + }, + { + "epoch": 0.9326252079852131, + "grad_norm": 0.08349609375, + "learning_rate": 0.0006635928382671077, + "loss": 5.1602, + "step": 8215 + }, + { + "epoch": 0.9327387350951322, + "grad_norm": 0.083984375, + "learning_rate": 0.0006621805181001876, + "loss": 5.1514, + "step": 8216 + }, + { + "epoch": 0.9328522622050512, + "grad_norm": 0.08740234375, + "learning_rate": 0.0006607696089059775, + "loss": 5.1794, + "step": 8217 + }, + { + "epoch": 0.9329657893149703, + "grad_norm": 0.0810546875, + "learning_rate": 0.0006593601152314532, + "loss": 5.1523, + "step": 8218 + }, + { + "epoch": 0.9330793164248894, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006579520416190272, + "loss": 5.1533, + "step": 8219 + }, + { + "epoch": 0.9331928435348085, + "grad_norm": 0.08740234375, + "learning_rate": 0.0006565453926065377, + "loss": 5.155, + "step": 8220 + }, + { + "epoch": 0.9333063706447275, + "grad_norm": 0.0888671875, + "learning_rate": 0.0006551401727272299, + "loss": 5.1594, + "step": 8221 + }, + { + "epoch": 0.9334198977546466, + "grad_norm": 0.0849609375, + "learning_rate": 0.0006537363865097438, + "loss": 5.1779, + "step": 8222 + }, + { + "epoch": 0.9335334248645657, + "grad_norm": 0.08251953125, + "learning_rate": 0.0006523340384781003, + "loss": 5.1652, + "step": 8223 + }, + { + "epoch": 0.9336469519744848, + "grad_norm": 0.080078125, + "learning_rate": 0.0006509331331516834, + "loss": 5.1586, + "step": 8224 + }, + { + "epoch": 0.9337604790844038, + "grad_norm": 0.08203125, + "learning_rate": 0.0006495336750452292, + "loss": 5.1692, + "step": 8225 + }, + { + "epoch": 0.9338740061943229, + "grad_norm": 0.08203125, + "learning_rate": 0.0006481356686688084, + "loss": 5.1772, + "step": 8226 + }, + { + "epoch": 0.933987533304242, + "grad_norm": 0.08056640625, + "learning_rate": 0.0006467391185278153, + "loss": 5.1742, + "step": 8227 + }, + { + "epoch": 0.9341010604141611, + "grad_norm": 0.0830078125, + "learning_rate": 0.0006453440291229491, + "loss": 5.1521, + "step": 8228 + }, + { + "epoch": 0.9342145875240802, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006439504049502026, + "loss": 5.1452, + "step": 8229 + }, + { + "epoch": 0.9343281146339992, + "grad_norm": 0.0810546875, + "learning_rate": 0.0006425582505008459, + "loss": 5.1677, + "step": 8230 + }, + { + "epoch": 0.9344416417439183, + "grad_norm": 0.08251953125, + "learning_rate": 0.0006411675702614131, + "loss": 5.1749, + "step": 8231 + }, + { + "epoch": 0.9345551688538374, + "grad_norm": 0.08642578125, + "learning_rate": 0.0006397783687136868, + "loss": 5.1502, + "step": 8232 + }, + { + "epoch": 0.9346686959637565, + "grad_norm": 0.08544921875, + "learning_rate": 0.0006383906503346846, + "loss": 5.1791, + "step": 8233 + }, + { + "epoch": 0.9347822230736755, + "grad_norm": 0.0859375, + "learning_rate": 0.0006370044195966443, + "loss": 5.152, + "step": 8234 + }, + { + "epoch": 0.9348957501835946, + "grad_norm": 0.0849609375, + "learning_rate": 0.0006356196809670089, + "loss": 5.144, + "step": 8235 + }, + { + "epoch": 0.9350092772935137, + "grad_norm": 0.0888671875, + "learning_rate": 0.000634236438908414, + "loss": 5.1706, + "step": 8236 + }, + { + "epoch": 0.9351228044034328, + "grad_norm": 0.0830078125, + "learning_rate": 0.0006328546978786702, + "loss": 5.1641, + "step": 8237 + }, + { + "epoch": 0.9352363315133518, + "grad_norm": 0.08203125, + "learning_rate": 0.0006314744623307526, + "loss": 5.1651, + "step": 8238 + }, + { + "epoch": 0.9353498586232709, + "grad_norm": 0.0849609375, + "learning_rate": 0.0006300957367127829, + "loss": 5.1847, + "step": 8239 + }, + { + "epoch": 0.93546338573319, + "grad_norm": 0.07958984375, + "learning_rate": 0.0006287185254680183, + "loss": 5.1746, + "step": 8240 + }, + { + "epoch": 0.9355769128431091, + "grad_norm": 0.0830078125, + "learning_rate": 0.0006273428330348338, + "loss": 5.16, + "step": 8241 + }, + { + "epoch": 0.9356904399530281, + "grad_norm": 0.0869140625, + "learning_rate": 0.0006259686638467119, + "loss": 5.1647, + "step": 8242 + }, + { + "epoch": 0.9358039670629472, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006245960223322241, + "loss": 5.1594, + "step": 8243 + }, + { + "epoch": 0.9359174941728663, + "grad_norm": 0.083984375, + "learning_rate": 0.0006232249129150201, + "loss": 5.1901, + "step": 8244 + }, + { + "epoch": 0.9360310212827854, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006218553400138102, + "loss": 5.1631, + "step": 8245 + }, + { + "epoch": 0.9361445483927044, + "grad_norm": 0.0810546875, + "learning_rate": 0.0006204873080423549, + "loss": 5.1782, + "step": 8246 + }, + { + "epoch": 0.9362580755026235, + "grad_norm": 0.080078125, + "learning_rate": 0.0006191208214094484, + "loss": 5.1767, + "step": 8247 + }, + { + "epoch": 0.9363716026125426, + "grad_norm": 0.08154296875, + "learning_rate": 0.0006177558845189029, + "loss": 5.1477, + "step": 8248 + }, + { + "epoch": 0.9364851297224617, + "grad_norm": 0.07763671875, + "learning_rate": 0.0006163925017695389, + "loss": 5.177, + "step": 8249 + }, + { + "epoch": 0.9365986568323807, + "grad_norm": 0.078125, + "learning_rate": 0.0006150306775551659, + "loss": 5.1655, + "step": 8250 + }, + { + "epoch": 0.9367121839422998, + "grad_norm": 0.07861328125, + "learning_rate": 0.0006136704162645724, + "loss": 5.17, + "step": 8251 + }, + { + "epoch": 0.9368257110522189, + "grad_norm": 0.08349609375, + "learning_rate": 0.0006123117222815085, + "loss": 5.1784, + "step": 8252 + }, + { + "epoch": 0.936939238162138, + "grad_norm": 0.07763671875, + "learning_rate": 0.0006109545999846751, + "loss": 5.1483, + "step": 8253 + }, + { + "epoch": 0.937052765272057, + "grad_norm": 0.08056640625, + "learning_rate": 0.0006095990537477063, + "loss": 5.1718, + "step": 8254 + }, + { + "epoch": 0.9371662923819761, + "grad_norm": 0.0810546875, + "learning_rate": 0.0006082450879391579, + "loss": 5.1824, + "step": 8255 + }, + { + "epoch": 0.9372798194918952, + "grad_norm": 0.08349609375, + "learning_rate": 0.0006068927069224924, + "loss": 5.1844, + "step": 8256 + }, + { + "epoch": 0.9373933466018143, + "grad_norm": 0.0791015625, + "learning_rate": 0.0006055419150560646, + "loss": 5.1544, + "step": 8257 + }, + { + "epoch": 0.9375068737117334, + "grad_norm": 0.0791015625, + "learning_rate": 0.0006041927166931078, + "loss": 5.1615, + "step": 8258 + }, + { + "epoch": 0.9376204008216524, + "grad_norm": 0.0810546875, + "learning_rate": 0.0006028451161817206, + "loss": 5.1449, + "step": 8259 + }, + { + "epoch": 0.9377339279315715, + "grad_norm": 0.08447265625, + "learning_rate": 0.0006014991178648515, + "loss": 5.1657, + "step": 8260 + }, + { + "epoch": 0.9378474550414906, + "grad_norm": 0.08203125, + "learning_rate": 0.0006001547260802855, + "loss": 5.1387, + "step": 8261 + }, + { + "epoch": 0.9379609821514097, + "grad_norm": 0.08203125, + "learning_rate": 0.0005988119451606312, + "loss": 5.1431, + "step": 8262 + }, + { + "epoch": 0.9380745092613287, + "grad_norm": 0.08203125, + "learning_rate": 0.0005974707794333036, + "loss": 5.1639, + "step": 8263 + }, + { + "epoch": 0.9381880363712478, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005961312332205157, + "loss": 5.1696, + "step": 8264 + }, + { + "epoch": 0.9383015634811669, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005947933108392579, + "loss": 5.1578, + "step": 8265 + }, + { + "epoch": 0.938415090591086, + "grad_norm": 0.080078125, + "learning_rate": 0.0005934570166012898, + "loss": 5.1651, + "step": 8266 + }, + { + "epoch": 0.938528617701005, + "grad_norm": 0.078125, + "learning_rate": 0.0005921223548131225, + "loss": 5.1502, + "step": 8267 + }, + { + "epoch": 0.9386421448109241, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005907893297760069, + "loss": 5.1696, + "step": 8268 + }, + { + "epoch": 0.9387556719208432, + "grad_norm": 0.076171875, + "learning_rate": 0.0005894579457859194, + "loss": 5.1743, + "step": 8269 + }, + { + "epoch": 0.9388691990307623, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005881282071335464, + "loss": 5.1559, + "step": 8270 + }, + { + "epoch": 0.9389827261406813, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005868001181042733, + "loss": 5.1433, + "step": 8271 + }, + { + "epoch": 0.9390962532506004, + "grad_norm": 0.08251953125, + "learning_rate": 0.0005854736829781681, + "loss": 5.1427, + "step": 8272 + }, + { + "epoch": 0.9392097803605195, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005841489060299701, + "loss": 5.1766, + "step": 8273 + }, + { + "epoch": 0.9393233074704386, + "grad_norm": 0.078125, + "learning_rate": 0.000582825791529073, + "loss": 5.1475, + "step": 8274 + }, + { + "epoch": 0.9394368345803576, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005815043437395144, + "loss": 5.1636, + "step": 8275 + }, + { + "epoch": 0.9395503616902767, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005801845669199594, + "loss": 5.1669, + "step": 8276 + }, + { + "epoch": 0.9396638888001958, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005788664653236886, + "loss": 5.1647, + "step": 8277 + }, + { + "epoch": 0.9397774159101149, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005775500431985838, + "loss": 5.1584, + "step": 8278 + }, + { + "epoch": 0.939890943020034, + "grad_norm": 0.07421875, + "learning_rate": 0.0005762353047871148, + "loss": 5.1642, + "step": 8279 + }, + { + "epoch": 0.940004470129953, + "grad_norm": 0.07763671875, + "learning_rate": 0.000574922254326324, + "loss": 5.1611, + "step": 8280 + }, + { + "epoch": 0.9401179972398721, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005736108960478148, + "loss": 5.1681, + "step": 8281 + }, + { + "epoch": 0.9402315243497912, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005723012341777374, + "loss": 5.1757, + "step": 8282 + }, + { + "epoch": 0.9403450514597103, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005709932729367744, + "loss": 5.1648, + "step": 8283 + }, + { + "epoch": 0.9404585785696293, + "grad_norm": 0.076171875, + "learning_rate": 0.0005696870165401276, + "loss": 5.1755, + "step": 8284 + }, + { + "epoch": 0.9405721056795484, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005683824691975056, + "loss": 5.166, + "step": 8285 + }, + { + "epoch": 0.9406856327894675, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005670796351131082, + "loss": 5.1804, + "step": 8286 + }, + { + "epoch": 0.9407991598993866, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005657785184856138, + "loss": 5.1485, + "step": 8287 + }, + { + "epoch": 0.9409126870093056, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005644791235081668, + "loss": 5.1624, + "step": 8288 + }, + { + "epoch": 0.9410262141192247, + "grad_norm": 0.07861328125, + "learning_rate": 0.000563181454368362, + "loss": 5.1624, + "step": 8289 + }, + { + "epoch": 0.9411397412291438, + "grad_norm": 0.08056640625, + "learning_rate": 0.0005618855152482334, + "loss": 5.166, + "step": 8290 + }, + { + "epoch": 0.9412532683390629, + "grad_norm": 0.08154296875, + "learning_rate": 0.0005605913103242381, + "loss": 5.1648, + "step": 8291 + }, + { + "epoch": 0.9413667954489819, + "grad_norm": 0.07763671875, + "learning_rate": 0.000559298843767247, + "loss": 5.1579, + "step": 8292 + }, + { + "epoch": 0.941480322558901, + "grad_norm": 0.07470703125, + "learning_rate": 0.000558008119742526, + "loss": 5.166, + "step": 8293 + }, + { + "epoch": 0.9415938496688201, + "grad_norm": 0.07568359375, + "learning_rate": 0.0005567191424097268, + "loss": 5.1716, + "step": 8294 + }, + { + "epoch": 0.9417073767787392, + "grad_norm": 0.078125, + "learning_rate": 0.0005554319159228716, + "loss": 5.1862, + "step": 8295 + }, + { + "epoch": 0.9418209038886582, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005541464444303397, + "loss": 5.1987, + "step": 8296 + }, + { + "epoch": 0.9419344309985773, + "grad_norm": 0.078125, + "learning_rate": 0.0005528627320748554, + "loss": 5.161, + "step": 8297 + }, + { + "epoch": 0.9420479581084964, + "grad_norm": 0.078125, + "learning_rate": 0.0005515807829934728, + "loss": 5.1589, + "step": 8298 + }, + { + "epoch": 0.9421614852184155, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005503006013175642, + "loss": 5.1627, + "step": 8299 + }, + { + "epoch": 0.9422750123283345, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005490221911728059, + "loss": 5.1718, + "step": 8300 + }, + { + "epoch": 0.9423885394382536, + "grad_norm": 0.076171875, + "learning_rate": 0.0005477455566791649, + "loss": 5.1723, + "step": 8301 + }, + { + "epoch": 0.9425020665481727, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005464707019508856, + "loss": 5.1596, + "step": 8302 + }, + { + "epoch": 0.9426155936580918, + "grad_norm": 0.07421875, + "learning_rate": 0.0005451976310964772, + "loss": 5.162, + "step": 8303 + }, + { + "epoch": 0.9427291207680109, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005439263482186992, + "loss": 5.1533, + "step": 8304 + }, + { + "epoch": 0.9428426478779299, + "grad_norm": 0.076171875, + "learning_rate": 0.0005426568574145496, + "loss": 5.1798, + "step": 8305 + }, + { + "epoch": 0.942956174987849, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005413891627752509, + "loss": 5.1338, + "step": 8306 + }, + { + "epoch": 0.9430697020977681, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005401232683862367, + "loss": 5.1738, + "step": 8307 + }, + { + "epoch": 0.9431832292076872, + "grad_norm": 0.078125, + "learning_rate": 0.0005388591783271399, + "loss": 5.1531, + "step": 8308 + }, + { + "epoch": 0.9432967563176062, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005375968966717769, + "loss": 5.157, + "step": 8309 + }, + { + "epoch": 0.9434102834275253, + "grad_norm": 0.07421875, + "learning_rate": 0.0005363364274881375, + "loss": 5.1576, + "step": 8310 + }, + { + "epoch": 0.9435238105374444, + "grad_norm": 0.07275390625, + "learning_rate": 0.0005350777748383697, + "loss": 5.1734, + "step": 8311 + }, + { + "epoch": 0.9436373376473635, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005338209427787677, + "loss": 5.1639, + "step": 8312 + }, + { + "epoch": 0.9437508647572825, + "grad_norm": 0.076171875, + "learning_rate": 0.0005325659353597575, + "loss": 5.1678, + "step": 8313 + }, + { + "epoch": 0.9438643918672016, + "grad_norm": 0.07666015625, + "learning_rate": 0.0005313127566258869, + "loss": 5.1637, + "step": 8314 + }, + { + "epoch": 0.9439779189771207, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005300614106158077, + "loss": 5.1608, + "step": 8315 + }, + { + "epoch": 0.9440914460870398, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005288119013622674, + "loss": 5.1805, + "step": 8316 + }, + { + "epoch": 0.9442049731969588, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005275642328920929, + "loss": 5.1689, + "step": 8317 + }, + { + "epoch": 0.9443185003068779, + "grad_norm": 0.07763671875, + "learning_rate": 0.0005263184092261792, + "loss": 5.1627, + "step": 8318 + }, + { + "epoch": 0.944432027416797, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005250744343794766, + "loss": 5.1598, + "step": 8319 + }, + { + "epoch": 0.9445455545267161, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005238323123609753, + "loss": 5.1602, + "step": 8320 + }, + { + "epoch": 0.9446590816366351, + "grad_norm": 0.07275390625, + "learning_rate": 0.000522592047173697, + "loss": 5.1608, + "step": 8321 + }, + { + "epoch": 0.9447726087465542, + "grad_norm": 0.07373046875, + "learning_rate": 0.000521353642814677, + "loss": 5.1616, + "step": 8322 + }, + { + "epoch": 0.9448861358564733, + "grad_norm": 0.0732421875, + "learning_rate": 0.0005201171032749552, + "loss": 5.1671, + "step": 8323 + }, + { + "epoch": 0.9449996629663924, + "grad_norm": 0.0751953125, + "learning_rate": 0.0005188824325395605, + "loss": 5.1789, + "step": 8324 + }, + { + "epoch": 0.9451131900763114, + "grad_norm": 0.0732421875, + "learning_rate": 0.0005176496345875004, + "loss": 5.1621, + "step": 8325 + }, + { + "epoch": 0.9452267171862306, + "grad_norm": 0.07177734375, + "learning_rate": 0.0005164187133917455, + "loss": 5.1653, + "step": 8326 + }, + { + "epoch": 0.9453402442961497, + "grad_norm": 0.072265625, + "learning_rate": 0.0005151896729192194, + "loss": 5.164, + "step": 8327 + }, + { + "epoch": 0.9454537714060688, + "grad_norm": 0.07177734375, + "learning_rate": 0.0005139625171307838, + "loss": 5.1466, + "step": 8328 + }, + { + "epoch": 0.9455672985159879, + "grad_norm": 0.07421875, + "learning_rate": 0.0005127372499812276, + "loss": 5.1636, + "step": 8329 + }, + { + "epoch": 0.9456808256259069, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005115138754192517, + "loss": 5.1528, + "step": 8330 + }, + { + "epoch": 0.945794352735826, + "grad_norm": 0.072265625, + "learning_rate": 0.0005102923973874588, + "loss": 5.1666, + "step": 8331 + }, + { + "epoch": 0.9459078798457451, + "grad_norm": 0.0732421875, + "learning_rate": 0.0005090728198223393, + "loss": 5.1435, + "step": 8332 + }, + { + "epoch": 0.9460214069556642, + "grad_norm": 0.07470703125, + "learning_rate": 0.0005078551466542587, + "loss": 5.1327, + "step": 8333 + }, + { + "epoch": 0.9461349340655832, + "grad_norm": 0.07421875, + "learning_rate": 0.0005066393818074457, + "loss": 5.1594, + "step": 8334 + }, + { + "epoch": 0.9462484611755023, + "grad_norm": 0.07177734375, + "learning_rate": 0.0005054255291999777, + "loss": 5.1938, + "step": 8335 + }, + { + "epoch": 0.9463619882854214, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005042135927437716, + "loss": 5.1565, + "step": 8336 + }, + { + "epoch": 0.9464755153953405, + "grad_norm": 0.07275390625, + "learning_rate": 0.000503003576344567, + "loss": 5.1678, + "step": 8337 + }, + { + "epoch": 0.9465890425052595, + "grad_norm": 0.07275390625, + "learning_rate": 0.0005017954839019172, + "loss": 5.1839, + "step": 8338 + }, + { + "epoch": 0.9467025696151786, + "grad_norm": 0.07373046875, + "learning_rate": 0.0005005893193091736, + "loss": 5.168, + "step": 8339 + }, + { + "epoch": 0.9468160967250977, + "grad_norm": 0.072265625, + "learning_rate": 0.0004993850864534762, + "loss": 5.1828, + "step": 8340 + }, + { + "epoch": 0.9469296238350168, + "grad_norm": 0.07421875, + "learning_rate": 0.0004981827892157389, + "loss": 5.1729, + "step": 8341 + }, + { + "epoch": 0.9470431509449359, + "grad_norm": 0.07373046875, + "learning_rate": 0.0004969824314706371, + "loss": 5.1595, + "step": 8342 + }, + { + "epoch": 0.9471566780548549, + "grad_norm": 0.0732421875, + "learning_rate": 0.0004957840170865964, + "loss": 5.1748, + "step": 8343 + }, + { + "epoch": 0.947270205164774, + "grad_norm": 0.07568359375, + "learning_rate": 0.0004945875499257796, + "loss": 5.1526, + "step": 8344 + }, + { + "epoch": 0.9473837322746931, + "grad_norm": 0.072265625, + "learning_rate": 0.0004933930338440739, + "loss": 5.1646, + "step": 8345 + }, + { + "epoch": 0.9474972593846122, + "grad_norm": 0.0732421875, + "learning_rate": 0.0004922004726910779, + "loss": 5.1554, + "step": 8346 + }, + { + "epoch": 0.9476107864945312, + "grad_norm": 0.07275390625, + "learning_rate": 0.0004910098703100919, + "loss": 5.1661, + "step": 8347 + }, + { + "epoch": 0.9477243136044503, + "grad_norm": 0.0751953125, + "learning_rate": 0.0004898212305381015, + "loss": 5.1527, + "step": 8348 + }, + { + "epoch": 0.9478378407143694, + "grad_norm": 0.0732421875, + "learning_rate": 0.0004886345572057683, + "loss": 5.1607, + "step": 8349 + }, + { + "epoch": 0.9479513678242885, + "grad_norm": 0.0771484375, + "learning_rate": 0.00048744985413741725, + "loss": 5.1639, + "step": 8350 + }, + { + "epoch": 0.9480648949342075, + "grad_norm": 0.07373046875, + "learning_rate": 0.0004862671251510229, + "loss": 5.1745, + "step": 8351 + }, + { + "epoch": 0.9481784220441266, + "grad_norm": 0.07275390625, + "learning_rate": 0.00048508637405819763, + "loss": 5.159, + "step": 8352 + }, + { + "epoch": 0.9482919491540457, + "grad_norm": 0.076171875, + "learning_rate": 0.00048390760466418016, + "loss": 5.1807, + "step": 8353 + }, + { + "epoch": 0.9484054762639648, + "grad_norm": 0.07177734375, + "learning_rate": 0.0004827308207678229, + "loss": 5.1392, + "step": 8354 + }, + { + "epoch": 0.9485190033738838, + "grad_norm": 0.072265625, + "learning_rate": 0.0004815560261615786, + "loss": 5.1411, + "step": 8355 + }, + { + "epoch": 0.9486325304838029, + "grad_norm": 0.0732421875, + "learning_rate": 0.00048038322463149025, + "loss": 5.1527, + "step": 8356 + }, + { + "epoch": 0.948746057593722, + "grad_norm": 0.072265625, + "learning_rate": 0.0004792124199571763, + "loss": 5.1696, + "step": 8357 + }, + { + "epoch": 0.9488595847036411, + "grad_norm": 0.06982421875, + "learning_rate": 0.0004780436159118218, + "loss": 5.1748, + "step": 8358 + }, + { + "epoch": 0.9489731118135601, + "grad_norm": 0.07080078125, + "learning_rate": 0.0004768768162621625, + "loss": 5.1834, + "step": 8359 + }, + { + "epoch": 0.9490866389234792, + "grad_norm": 0.07177734375, + "learning_rate": 0.00047571202476847567, + "loss": 5.167, + "step": 8360 + }, + { + "epoch": 0.9492001660333983, + "grad_norm": 0.0712890625, + "learning_rate": 0.00047454924518456595, + "loss": 5.158, + "step": 8361 + }, + { + "epoch": 0.9493136931433174, + "grad_norm": 0.0703125, + "learning_rate": 0.00047338848125775515, + "loss": 5.1563, + "step": 8362 + }, + { + "epoch": 0.9494272202532364, + "grad_norm": 0.07177734375, + "learning_rate": 0.0004722297367288681, + "loss": 5.1888, + "step": 8363 + }, + { + "epoch": 0.9495407473631555, + "grad_norm": 0.072265625, + "learning_rate": 0.0004710730153322224, + "loss": 5.175, + "step": 8364 + }, + { + "epoch": 0.9496542744730746, + "grad_norm": 0.0712890625, + "learning_rate": 0.00046991832079561554, + "loss": 5.1681, + "step": 8365 + }, + { + "epoch": 0.9497678015829937, + "grad_norm": 0.0712890625, + "learning_rate": 0.0004687656568403127, + "loss": 5.1737, + "step": 8366 + }, + { + "epoch": 0.9498813286929128, + "grad_norm": 0.0693359375, + "learning_rate": 0.00046761502718103585, + "loss": 5.1348, + "step": 8367 + }, + { + "epoch": 0.9499948558028318, + "grad_norm": 0.0712890625, + "learning_rate": 0.0004664664355259496, + "loss": 5.141, + "step": 8368 + }, + { + "epoch": 0.9501083829127509, + "grad_norm": 0.0712890625, + "learning_rate": 0.000465319885576652, + "loss": 5.1691, + "step": 8369 + }, + { + "epoch": 0.95022191002267, + "grad_norm": 0.0712890625, + "learning_rate": 0.0004641753810281601, + "loss": 5.154, + "step": 8370 + }, + { + "epoch": 0.9503354371325891, + "grad_norm": 0.072265625, + "learning_rate": 0.00046303292556890005, + "loss": 5.1613, + "step": 8371 + }, + { + "epoch": 0.9504489642425081, + "grad_norm": 0.0712890625, + "learning_rate": 0.0004618925228806939, + "loss": 5.1707, + "step": 8372 + }, + { + "epoch": 0.9505624913524272, + "grad_norm": 0.07275390625, + "learning_rate": 0.0004607541766387488, + "loss": 5.1596, + "step": 8373 + }, + { + "epoch": 0.9506760184623463, + "grad_norm": 0.06982421875, + "learning_rate": 0.0004596178905116432, + "loss": 5.1814, + "step": 8374 + }, + { + "epoch": 0.9507895455722654, + "grad_norm": 0.07080078125, + "learning_rate": 0.00045848366816131803, + "loss": 5.1379, + "step": 8375 + }, + { + "epoch": 0.9509030726821844, + "grad_norm": 0.06884765625, + "learning_rate": 0.00045735151324306154, + "loss": 5.1881, + "step": 8376 + }, + { + "epoch": 0.9510165997921035, + "grad_norm": 0.072265625, + "learning_rate": 0.0004562214294055007, + "loss": 5.1655, + "step": 8377 + }, + { + "epoch": 0.9511301269020226, + "grad_norm": 0.072265625, + "learning_rate": 0.000455093420290587, + "loss": 5.1595, + "step": 8378 + }, + { + "epoch": 0.9512436540119417, + "grad_norm": 0.0693359375, + "learning_rate": 0.00045396748953358614, + "loss": 5.1536, + "step": 8379 + }, + { + "epoch": 0.9513571811218607, + "grad_norm": 0.07080078125, + "learning_rate": 0.000452843640763066, + "loss": 5.1454, + "step": 8380 + }, + { + "epoch": 0.9514707082317798, + "grad_norm": 0.06982421875, + "learning_rate": 0.0004517218776008838, + "loss": 5.173, + "step": 8381 + }, + { + "epoch": 0.9515842353416989, + "grad_norm": 0.07275390625, + "learning_rate": 0.0004506022036621767, + "loss": 5.1873, + "step": 8382 + }, + { + "epoch": 0.951697762451618, + "grad_norm": 0.06982421875, + "learning_rate": 0.0004494846225553477, + "loss": 5.165, + "step": 8383 + }, + { + "epoch": 0.951811289561537, + "grad_norm": 0.07177734375, + "learning_rate": 0.00044836913788205636, + "loss": 5.1473, + "step": 8384 + }, + { + "epoch": 0.9519248166714561, + "grad_norm": 0.07080078125, + "learning_rate": 0.00044725575323720424, + "loss": 5.1566, + "step": 8385 + }, + { + "epoch": 0.9520383437813752, + "grad_norm": 0.07080078125, + "learning_rate": 0.0004461444722089277, + "loss": 5.1857, + "step": 8386 + }, + { + "epoch": 0.9521518708912943, + "grad_norm": 0.06982421875, + "learning_rate": 0.00044503529837858116, + "loss": 5.1842, + "step": 8387 + }, + { + "epoch": 0.9522653980012133, + "grad_norm": 0.072265625, + "learning_rate": 0.00044392823532072976, + "loss": 5.1414, + "step": 8388 + }, + { + "epoch": 0.9523789251111324, + "grad_norm": 0.0712890625, + "learning_rate": 0.0004428232866031352, + "loss": 5.1723, + "step": 8389 + }, + { + "epoch": 0.9524924522210515, + "grad_norm": 0.0703125, + "learning_rate": 0.0004417204557867458, + "loss": 5.1782, + "step": 8390 + }, + { + "epoch": 0.9526059793309706, + "grad_norm": 0.068359375, + "learning_rate": 0.0004406197464256847, + "loss": 5.1488, + "step": 8391 + }, + { + "epoch": 0.9527195064408897, + "grad_norm": 0.0703125, + "learning_rate": 0.000439521162067237, + "loss": 5.1583, + "step": 8392 + }, + { + "epoch": 0.9528330335508087, + "grad_norm": 0.072265625, + "learning_rate": 0.00043842470625184095, + "loss": 5.1665, + "step": 8393 + }, + { + "epoch": 0.9529465606607278, + "grad_norm": 0.07080078125, + "learning_rate": 0.0004373303825130741, + "loss": 5.1728, + "step": 8394 + }, + { + "epoch": 0.9530600877706469, + "grad_norm": 0.06982421875, + "learning_rate": 0.00043623819437764357, + "loss": 5.1462, + "step": 8395 + }, + { + "epoch": 0.953173614880566, + "grad_norm": 0.06982421875, + "learning_rate": 0.0004351481453653733, + "loss": 5.1407, + "step": 8396 + }, + { + "epoch": 0.953287141990485, + "grad_norm": 0.0693359375, + "learning_rate": 0.00043406023898919405, + "loss": 5.1738, + "step": 8397 + }, + { + "epoch": 0.9534006691004041, + "grad_norm": 0.07177734375, + "learning_rate": 0.00043297447875513084, + "loss": 5.1405, + "step": 8398 + }, + { + "epoch": 0.9535141962103232, + "grad_norm": 0.06884765625, + "learning_rate": 0.00043189086816229275, + "loss": 5.1716, + "step": 8399 + }, + { + "epoch": 0.9536277233202423, + "grad_norm": 0.06982421875, + "learning_rate": 0.0004308094107028612, + "loss": 5.1796, + "step": 8400 + }, + { + "epoch": 0.9537412504301613, + "grad_norm": 0.0693359375, + "learning_rate": 0.0004297301098620784, + "loss": 5.1668, + "step": 8401 + }, + { + "epoch": 0.9538547775400804, + "grad_norm": 0.0703125, + "learning_rate": 0.00042865296911823614, + "loss": 5.1635, + "step": 8402 + }, + { + "epoch": 0.9539683046499995, + "grad_norm": 0.06982421875, + "learning_rate": 0.0004275779919426653, + "loss": 5.1601, + "step": 8403 + }, + { + "epoch": 0.9540818317599186, + "grad_norm": 0.0703125, + "learning_rate": 0.0004265051817997242, + "loss": 5.1738, + "step": 8404 + }, + { + "epoch": 0.9541953588698376, + "grad_norm": 0.0693359375, + "learning_rate": 0.0004254345421467867, + "loss": 5.1473, + "step": 8405 + }, + { + "epoch": 0.9543088859797567, + "grad_norm": 0.072265625, + "learning_rate": 0.00042436607643423266, + "loss": 5.1403, + "step": 8406 + }, + { + "epoch": 0.9544224130896758, + "grad_norm": 0.0703125, + "learning_rate": 0.00042329978810543467, + "loss": 5.1473, + "step": 8407 + }, + { + "epoch": 0.9545359401995949, + "grad_norm": 0.0703125, + "learning_rate": 0.0004222356805967502, + "loss": 5.173, + "step": 8408 + }, + { + "epoch": 0.9546494673095139, + "grad_norm": 0.07080078125, + "learning_rate": 0.0004211737573375065, + "loss": 5.1485, + "step": 8409 + }, + { + "epoch": 0.954762994419433, + "grad_norm": 0.06884765625, + "learning_rate": 0.00042011402174999316, + "loss": 5.1655, + "step": 8410 + }, + { + "epoch": 0.9548765215293521, + "grad_norm": 0.0693359375, + "learning_rate": 0.00041905647724944787, + "loss": 5.175, + "step": 8411 + }, + { + "epoch": 0.9549900486392712, + "grad_norm": 0.0712890625, + "learning_rate": 0.00041800112724404845, + "loss": 5.1521, + "step": 8412 + }, + { + "epoch": 0.9551035757491902, + "grad_norm": 0.0693359375, + "learning_rate": 0.0004169479751349001, + "loss": 5.1531, + "step": 8413 + }, + { + "epoch": 0.9552171028591093, + "grad_norm": 0.0654296875, + "learning_rate": 0.00041589702431602443, + "loss": 5.1519, + "step": 8414 + }, + { + "epoch": 0.9553306299690284, + "grad_norm": 0.06787109375, + "learning_rate": 0.0004148482781743491, + "loss": 5.1755, + "step": 8415 + }, + { + "epoch": 0.9554441570789475, + "grad_norm": 0.0693359375, + "learning_rate": 0.0004138017400896968, + "loss": 5.1596, + "step": 8416 + }, + { + "epoch": 0.9555576841888666, + "grad_norm": 0.072265625, + "learning_rate": 0.00041275741343477457, + "loss": 5.1418, + "step": 8417 + }, + { + "epoch": 0.9556712112987856, + "grad_norm": 0.06982421875, + "learning_rate": 0.00041171530157516143, + "loss": 5.1324, + "step": 8418 + }, + { + "epoch": 0.9557847384087047, + "grad_norm": 0.06787109375, + "learning_rate": 0.00041067540786930034, + "loss": 5.1526, + "step": 8419 + }, + { + "epoch": 0.9558982655186238, + "grad_norm": 0.06982421875, + "learning_rate": 0.0004096377356684842, + "loss": 5.1527, + "step": 8420 + }, + { + "epoch": 0.9560117926285429, + "grad_norm": 0.06787109375, + "learning_rate": 0.00040860228831684787, + "loss": 5.1771, + "step": 8421 + }, + { + "epoch": 0.9561253197384619, + "grad_norm": 0.068359375, + "learning_rate": 0.00040756906915135493, + "loss": 5.1582, + "step": 8422 + }, + { + "epoch": 0.956238846848381, + "grad_norm": 0.06884765625, + "learning_rate": 0.0004065380815017898, + "loss": 5.1622, + "step": 8423 + }, + { + "epoch": 0.9563523739583001, + "grad_norm": 0.068359375, + "learning_rate": 0.00040550932869074343, + "loss": 5.175, + "step": 8424 + }, + { + "epoch": 0.9564659010682192, + "grad_norm": 0.06689453125, + "learning_rate": 0.00040448281403360583, + "loss": 5.1768, + "step": 8425 + }, + { + "epoch": 0.9565794281781382, + "grad_norm": 0.06884765625, + "learning_rate": 0.0004034585408385536, + "loss": 5.1851, + "step": 8426 + }, + { + "epoch": 0.9566929552880573, + "grad_norm": 0.07080078125, + "learning_rate": 0.00040243651240653945, + "loss": 5.1442, + "step": 8427 + }, + { + "epoch": 0.9568064823979764, + "grad_norm": 0.0712890625, + "learning_rate": 0.00040141673203128247, + "loss": 5.1728, + "step": 8428 + }, + { + "epoch": 0.9569200095078955, + "grad_norm": 0.06640625, + "learning_rate": 0.00040039920299925593, + "loss": 5.1554, + "step": 8429 + }, + { + "epoch": 0.9570335366178145, + "grad_norm": 0.068359375, + "learning_rate": 0.00039938392858967895, + "loss": 5.1575, + "step": 8430 + }, + { + "epoch": 0.9571470637277336, + "grad_norm": 0.07080078125, + "learning_rate": 0.0003983709120745035, + "loss": 5.1694, + "step": 8431 + }, + { + "epoch": 0.9572605908376527, + "grad_norm": 0.068359375, + "learning_rate": 0.00039736015671840543, + "loss": 5.1523, + "step": 8432 + }, + { + "epoch": 0.9573741179475718, + "grad_norm": 0.0673828125, + "learning_rate": 0.00039635166577877323, + "loss": 5.1434, + "step": 8433 + }, + { + "epoch": 0.9574876450574908, + "grad_norm": 0.0703125, + "learning_rate": 0.0003953454425056984, + "loss": 5.1476, + "step": 8434 + }, + { + "epoch": 0.9576011721674099, + "grad_norm": 0.068359375, + "learning_rate": 0.0003943414901419635, + "loss": 5.1356, + "step": 8435 + }, + { + "epoch": 0.957714699277329, + "grad_norm": 0.06787109375, + "learning_rate": 0.00039333981192303313, + "loss": 5.1555, + "step": 8436 + }, + { + "epoch": 0.9578282263872481, + "grad_norm": 0.06884765625, + "learning_rate": 0.000392340411077043, + "loss": 5.1687, + "step": 8437 + }, + { + "epoch": 0.9579417534971671, + "grad_norm": 0.06982421875, + "learning_rate": 0.0003913432908247892, + "loss": 5.1538, + "step": 8438 + }, + { + "epoch": 0.9580552806070862, + "grad_norm": 0.0673828125, + "learning_rate": 0.0003903484543797184, + "loss": 5.1586, + "step": 8439 + }, + { + "epoch": 0.9581688077170053, + "grad_norm": 0.06884765625, + "learning_rate": 0.0003893559049479167, + "loss": 5.1481, + "step": 8440 + }, + { + "epoch": 0.9582823348269244, + "grad_norm": 0.06787109375, + "learning_rate": 0.0003883656457281003, + "loss": 5.1598, + "step": 8441 + }, + { + "epoch": 0.9583958619368435, + "grad_norm": 0.06591796875, + "learning_rate": 0.000387377679911604, + "loss": 5.1739, + "step": 8442 + }, + { + "epoch": 0.9585093890467625, + "grad_norm": 0.06689453125, + "learning_rate": 0.0003863920106823723, + "loss": 5.1644, + "step": 8443 + }, + { + "epoch": 0.9586229161566816, + "grad_norm": 0.06640625, + "learning_rate": 0.0003854086412169482, + "loss": 5.1607, + "step": 8444 + }, + { + "epoch": 0.9587364432666007, + "grad_norm": 0.06640625, + "learning_rate": 0.0003844275746844632, + "loss": 5.1585, + "step": 8445 + }, + { + "epoch": 0.9588499703765198, + "grad_norm": 0.0673828125, + "learning_rate": 0.0003834488142466266, + "loss": 5.1447, + "step": 8446 + }, + { + "epoch": 0.9589634974864388, + "grad_norm": 0.0673828125, + "learning_rate": 0.00038247236305771683, + "loss": 5.1594, + "step": 8447 + }, + { + "epoch": 0.9590770245963579, + "grad_norm": 0.06787109375, + "learning_rate": 0.0003814982242645688, + "loss": 5.1618, + "step": 8448 + }, + { + "epoch": 0.959190551706277, + "grad_norm": 0.0673828125, + "learning_rate": 0.0003805264010065665, + "loss": 5.1641, + "step": 8449 + }, + { + "epoch": 0.9593040788161961, + "grad_norm": 0.0673828125, + "learning_rate": 0.0003795568964156312, + "loss": 5.1479, + "step": 8450 + }, + { + "epoch": 0.9594176059261151, + "grad_norm": 0.06591796875, + "learning_rate": 0.00037858971361621097, + "loss": 5.1503, + "step": 8451 + }, + { + "epoch": 0.9595311330360342, + "grad_norm": 0.06787109375, + "learning_rate": 0.00037762485572527294, + "loss": 5.1409, + "step": 8452 + }, + { + "epoch": 0.9596446601459533, + "grad_norm": 0.06640625, + "learning_rate": 0.00037666232585229013, + "loss": 5.1789, + "step": 8453 + }, + { + "epoch": 0.9597581872558724, + "grad_norm": 0.06640625, + "learning_rate": 0.000375702127099234, + "loss": 5.1513, + "step": 8454 + }, + { + "epoch": 0.9598717143657914, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003747442625605628, + "loss": 5.155, + "step": 8455 + }, + { + "epoch": 0.9599852414757105, + "grad_norm": 0.06689453125, + "learning_rate": 0.0003737887353232129, + "loss": 5.1621, + "step": 8456 + }, + { + "epoch": 0.9600987685856296, + "grad_norm": 0.06689453125, + "learning_rate": 0.00037283554846658727, + "loss": 5.1607, + "step": 8457 + }, + { + "epoch": 0.9602122956955487, + "grad_norm": 0.06640625, + "learning_rate": 0.00037188470506254747, + "loss": 5.1743, + "step": 8458 + }, + { + "epoch": 0.9603258228054677, + "grad_norm": 0.06591796875, + "learning_rate": 0.0003709362081754021, + "loss": 5.178, + "step": 8459 + }, + { + "epoch": 0.9604393499153868, + "grad_norm": 0.06787109375, + "learning_rate": 0.0003699900608618981, + "loss": 5.1516, + "step": 8460 + }, + { + "epoch": 0.9605528770253059, + "grad_norm": 0.06591796875, + "learning_rate": 0.00036904626617120963, + "loss": 5.18, + "step": 8461 + }, + { + "epoch": 0.960666404135225, + "grad_norm": 0.06689453125, + "learning_rate": 0.00036810482714492965, + "loss": 5.1616, + "step": 8462 + }, + { + "epoch": 0.960779931245144, + "grad_norm": 0.06689453125, + "learning_rate": 0.0003671657468170594, + "loss": 5.1806, + "step": 8463 + }, + { + "epoch": 0.9608934583550631, + "grad_norm": 0.06689453125, + "learning_rate": 0.0003662290282139982, + "loss": 5.1625, + "step": 8464 + }, + { + "epoch": 0.9610069854649822, + "grad_norm": 0.0673828125, + "learning_rate": 0.00036529467435453446, + "loss": 5.1943, + "step": 8465 + }, + { + "epoch": 0.9611205125749013, + "grad_norm": 0.0673828125, + "learning_rate": 0.0003643626882498359, + "loss": 5.1392, + "step": 8466 + }, + { + "epoch": 0.9612340396848204, + "grad_norm": 0.068359375, + "learning_rate": 0.0003634330729034394, + "loss": 5.1505, + "step": 8467 + }, + { + "epoch": 0.9613475667947394, + "grad_norm": 0.0673828125, + "learning_rate": 0.0003625058313112413, + "loss": 5.1745, + "step": 8468 + }, + { + "epoch": 0.9614610939046585, + "grad_norm": 0.06787109375, + "learning_rate": 0.00036158096646148866, + "loss": 5.1437, + "step": 8469 + }, + { + "epoch": 0.9615746210145776, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003606584813347678, + "loss": 5.1422, + "step": 8470 + }, + { + "epoch": 0.9616881481244967, + "grad_norm": 0.06640625, + "learning_rate": 0.000359738378903997, + "loss": 5.1627, + "step": 8471 + }, + { + "epoch": 0.9618016752344157, + "grad_norm": 0.0673828125, + "learning_rate": 0.0003588206621344153, + "loss": 5.17, + "step": 8472 + }, + { + "epoch": 0.9619152023443348, + "grad_norm": 0.06787109375, + "learning_rate": 0.0003579053339835735, + "loss": 5.1463, + "step": 8473 + }, + { + "epoch": 0.9620287294542539, + "grad_norm": 0.068359375, + "learning_rate": 0.00035699239740132397, + "loss": 5.1765, + "step": 8474 + }, + { + "epoch": 0.962142256564173, + "grad_norm": 0.06494140625, + "learning_rate": 0.00035608185532981263, + "loss": 5.1532, + "step": 8475 + }, + { + "epoch": 0.962255783674092, + "grad_norm": 0.06591796875, + "learning_rate": 0.00035517371070346805, + "loss": 5.164, + "step": 8476 + }, + { + "epoch": 0.9623693107840111, + "grad_norm": 0.06640625, + "learning_rate": 0.0003542679664489922, + "loss": 5.1611, + "step": 8477 + }, + { + "epoch": 0.9624828378939302, + "grad_norm": 0.0673828125, + "learning_rate": 0.0003533646254853522, + "loss": 5.1444, + "step": 8478 + }, + { + "epoch": 0.9625963650038493, + "grad_norm": 0.068359375, + "learning_rate": 0.0003524636907237685, + "loss": 5.1598, + "step": 8479 + }, + { + "epoch": 0.9627098921137683, + "grad_norm": 0.068359375, + "learning_rate": 0.0003515651650677092, + "loss": 5.1917, + "step": 8480 + }, + { + "epoch": 0.9628234192236874, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003506690514128763, + "loss": 5.1478, + "step": 8481 + }, + { + "epoch": 0.9629369463336065, + "grad_norm": 0.064453125, + "learning_rate": 0.00034977535264720026, + "loss": 5.1298, + "step": 8482 + }, + { + "epoch": 0.9630504734435256, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003488840716508276, + "loss": 5.1423, + "step": 8483 + }, + { + "epoch": 0.9631640005534446, + "grad_norm": 0.06640625, + "learning_rate": 0.0003479952112961143, + "loss": 5.1577, + "step": 8484 + }, + { + "epoch": 0.9632775276633637, + "grad_norm": 0.06640625, + "learning_rate": 0.0003471087744476148, + "loss": 5.1698, + "step": 8485 + }, + { + "epoch": 0.9633910547732828, + "grad_norm": 0.0654296875, + "learning_rate": 0.00034622476396207254, + "loss": 5.1411, + "step": 8486 + }, + { + "epoch": 0.9635045818832019, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003453431826884125, + "loss": 5.1627, + "step": 8487 + }, + { + "epoch": 0.963618108993121, + "grad_norm": 0.064453125, + "learning_rate": 0.0003444640334677305, + "loss": 5.1594, + "step": 8488 + }, + { + "epoch": 0.96373163610304, + "grad_norm": 0.0654296875, + "learning_rate": 0.00034358731913328465, + "loss": 5.1723, + "step": 8489 + }, + { + "epoch": 0.9638451632129591, + "grad_norm": 0.06640625, + "learning_rate": 0.0003427130425104857, + "loss": 5.1604, + "step": 8490 + }, + { + "epoch": 0.9639586903228782, + "grad_norm": 0.06494140625, + "learning_rate": 0.00034184120641688907, + "loss": 5.1615, + "step": 8491 + }, + { + "epoch": 0.9640722174327973, + "grad_norm": 0.0654296875, + "learning_rate": 0.000340971813662184, + "loss": 5.1714, + "step": 8492 + }, + { + "epoch": 0.9641857445427163, + "grad_norm": 0.0654296875, + "learning_rate": 0.00034010486704818665, + "loss": 5.1526, + "step": 8493 + }, + { + "epoch": 0.9642992716526354, + "grad_norm": 0.06591796875, + "learning_rate": 0.00033924036936882864, + "loss": 5.1567, + "step": 8494 + }, + { + "epoch": 0.9644127987625545, + "grad_norm": 0.06640625, + "learning_rate": 0.00033837832341015105, + "loss": 5.1515, + "step": 8495 + }, + { + "epoch": 0.9645263258724736, + "grad_norm": 0.06591796875, + "learning_rate": 0.0003375187319502919, + "loss": 5.1505, + "step": 8496 + }, + { + "epoch": 0.9646398529823926, + "grad_norm": 0.06640625, + "learning_rate": 0.00033666159775948024, + "loss": 5.1558, + "step": 8497 + }, + { + "epoch": 0.9647533800923117, + "grad_norm": 0.06494140625, + "learning_rate": 0.00033580692360002585, + "loss": 5.1429, + "step": 8498 + }, + { + "epoch": 0.9648669072022308, + "grad_norm": 0.06787109375, + "learning_rate": 0.00033495471222631, + "loss": 5.1636, + "step": 8499 + }, + { + "epoch": 0.9649804343121499, + "grad_norm": 0.068359375, + "learning_rate": 0.0003341049663847775, + "loss": 5.1804, + "step": 8500 + }, + { + "epoch": 0.9650939614220689, + "grad_norm": 0.06640625, + "learning_rate": 0.00033325768881392696, + "loss": 5.1479, + "step": 8501 + }, + { + "epoch": 0.965207488531988, + "grad_norm": 0.06591796875, + "learning_rate": 0.0003324128822443035, + "loss": 5.1508, + "step": 8502 + }, + { + "epoch": 0.9653210156419071, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003315705493984875, + "loss": 5.1564, + "step": 8503 + }, + { + "epoch": 0.9654345427518262, + "grad_norm": 0.06591796875, + "learning_rate": 0.0003307306929910883, + "loss": 5.1587, + "step": 8504 + }, + { + "epoch": 0.9655480698617452, + "grad_norm": 0.064453125, + "learning_rate": 0.00032989331572873335, + "loss": 5.1652, + "step": 8505 + }, + { + "epoch": 0.9656615969716643, + "grad_norm": 0.06689453125, + "learning_rate": 0.00032905842031006184, + "loss": 5.138, + "step": 8506 + }, + { + "epoch": 0.9657751240815834, + "grad_norm": 0.064453125, + "learning_rate": 0.0003282260094257131, + "loss": 5.146, + "step": 8507 + }, + { + "epoch": 0.9658886511915025, + "grad_norm": 0.06640625, + "learning_rate": 0.0003273960857583205, + "loss": 5.1465, + "step": 8508 + }, + { + "epoch": 0.9660021783014215, + "grad_norm": 0.06640625, + "learning_rate": 0.00032656865198250174, + "loss": 5.1534, + "step": 8509 + }, + { + "epoch": 0.9661157054113406, + "grad_norm": 0.06494140625, + "learning_rate": 0.00032574371076484984, + "loss": 5.1687, + "step": 8510 + }, + { + "epoch": 0.9662292325212597, + "grad_norm": 0.064453125, + "learning_rate": 0.0003249212647639255, + "loss": 5.1535, + "step": 8511 + }, + { + "epoch": 0.9663427596311788, + "grad_norm": 0.06494140625, + "learning_rate": 0.0003241013166302474, + "loss": 5.1669, + "step": 8512 + }, + { + "epoch": 0.9664562867410978, + "grad_norm": 0.064453125, + "learning_rate": 0.00032328386900628485, + "loss": 5.1484, + "step": 8513 + }, + { + "epoch": 0.9665698138510169, + "grad_norm": 0.064453125, + "learning_rate": 0.00032246892452644825, + "loss": 5.1524, + "step": 8514 + }, + { + "epoch": 0.966683340960936, + "grad_norm": 0.0654296875, + "learning_rate": 0.00032165648581708163, + "loss": 5.131, + "step": 8515 + }, + { + "epoch": 0.9667968680708551, + "grad_norm": 0.06591796875, + "learning_rate": 0.00032084655549645256, + "loss": 5.141, + "step": 8516 + }, + { + "epoch": 0.9669103951807742, + "grad_norm": 0.064453125, + "learning_rate": 0.0003200391361747462, + "loss": 5.1481, + "step": 8517 + }, + { + "epoch": 0.9670239222906932, + "grad_norm": 0.06494140625, + "learning_rate": 0.0003192342304540542, + "loss": 5.1569, + "step": 8518 + }, + { + "epoch": 0.9671374494006123, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003184318409283685, + "loss": 5.1414, + "step": 8519 + }, + { + "epoch": 0.9672509765105314, + "grad_norm": 0.06494140625, + "learning_rate": 0.00031763197018357093, + "loss": 5.1676, + "step": 8520 + }, + { + "epoch": 0.9673645036204505, + "grad_norm": 0.064453125, + "learning_rate": 0.0003168346207974269, + "loss": 5.1565, + "step": 8521 + }, + { + "epoch": 0.9674780307303695, + "grad_norm": 0.06494140625, + "learning_rate": 0.0003160397953395762, + "loss": 5.1599, + "step": 8522 + }, + { + "epoch": 0.9675915578402886, + "grad_norm": 0.0673828125, + "learning_rate": 0.00031524749637152374, + "loss": 5.1333, + "step": 8523 + }, + { + "epoch": 0.9677050849502077, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003144577264466336, + "loss": 5.1722, + "step": 8524 + }, + { + "epoch": 0.9678186120601268, + "grad_norm": 0.06689453125, + "learning_rate": 0.00031367048811011824, + "loss": 5.1816, + "step": 8525 + }, + { + "epoch": 0.9679321391700458, + "grad_norm": 0.06591796875, + "learning_rate": 0.0003128857838990322, + "loss": 5.1324, + "step": 8526 + }, + { + "epoch": 0.9680456662799649, + "grad_norm": 0.06494140625, + "learning_rate": 0.00031210361634226286, + "loss": 5.1443, + "step": 8527 + }, + { + "epoch": 0.968159193389884, + "grad_norm": 0.064453125, + "learning_rate": 0.000311323987960523, + "loss": 5.1518, + "step": 8528 + }, + { + "epoch": 0.9682727204998031, + "grad_norm": 0.06689453125, + "learning_rate": 0.00031054690126634175, + "loss": 5.1591, + "step": 8529 + }, + { + "epoch": 0.9683862476097221, + "grad_norm": 0.064453125, + "learning_rate": 0.0003097723587640577, + "loss": 5.1504, + "step": 8530 + }, + { + "epoch": 0.9684997747196412, + "grad_norm": 0.06640625, + "learning_rate": 0.0003090003629498101, + "loss": 5.1482, + "step": 8531 + }, + { + "epoch": 0.9686133018295603, + "grad_norm": 0.06640625, + "learning_rate": 0.00030823091631153066, + "loss": 5.1625, + "step": 8532 + }, + { + "epoch": 0.9687268289394794, + "grad_norm": 0.0654296875, + "learning_rate": 0.0003074640213289358, + "loss": 5.1462, + "step": 8533 + }, + { + "epoch": 0.9688403560493984, + "grad_norm": 0.06640625, + "learning_rate": 0.0003066996804735189, + "loss": 5.1582, + "step": 8534 + }, + { + "epoch": 0.9689538831593175, + "grad_norm": 0.0654296875, + "learning_rate": 0.00030593789620854216, + "loss": 5.1627, + "step": 8535 + }, + { + "epoch": 0.9690674102692366, + "grad_norm": 0.06494140625, + "learning_rate": 0.000305178670989028, + "loss": 5.1492, + "step": 8536 + }, + { + "epoch": 0.9691809373791557, + "grad_norm": 0.06591796875, + "learning_rate": 0.00030442200726175207, + "loss": 5.134, + "step": 8537 + }, + { + "epoch": 0.9692944644890747, + "grad_norm": 0.064453125, + "learning_rate": 0.00030366790746523537, + "loss": 5.1613, + "step": 8538 + }, + { + "epoch": 0.9694079915989938, + "grad_norm": 0.0654296875, + "learning_rate": 0.00030291637402973576, + "loss": 5.1403, + "step": 8539 + }, + { + "epoch": 0.9695215187089129, + "grad_norm": 0.064453125, + "learning_rate": 0.00030216740937724016, + "loss": 5.1566, + "step": 8540 + }, + { + "epoch": 0.969635045818832, + "grad_norm": 0.064453125, + "learning_rate": 0.0003014210159214575, + "loss": 5.1305, + "step": 8541 + }, + { + "epoch": 0.969748572928751, + "grad_norm": 0.06591796875, + "learning_rate": 0.00030067719606780995, + "loss": 5.1384, + "step": 8542 + }, + { + "epoch": 0.9698621000386701, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002999359522134262, + "loss": 5.1418, + "step": 8543 + }, + { + "epoch": 0.9699756271485892, + "grad_norm": 0.06640625, + "learning_rate": 0.00029919728674713267, + "loss": 5.1481, + "step": 8544 + }, + { + "epoch": 0.9700891542585083, + "grad_norm": 0.064453125, + "learning_rate": 0.00029846120204944654, + "loss": 5.1791, + "step": 8545 + }, + { + "epoch": 0.9702026813684274, + "grad_norm": 0.06494140625, + "learning_rate": 0.00029772770049256816, + "loss": 5.1543, + "step": 8546 + }, + { + "epoch": 0.9703162084783464, + "grad_norm": 0.06396484375, + "learning_rate": 0.00029699678444037263, + "loss": 5.1671, + "step": 8547 + }, + { + "epoch": 0.9704297355882655, + "grad_norm": 0.06396484375, + "learning_rate": 0.00029626845624840324, + "loss": 5.1639, + "step": 8548 + }, + { + "epoch": 0.9705432626981846, + "grad_norm": 0.06494140625, + "learning_rate": 0.00029554271826386265, + "loss": 5.1705, + "step": 8549 + }, + { + "epoch": 0.9706567898081037, + "grad_norm": 0.06494140625, + "learning_rate": 0.00029481957282560665, + "loss": 5.1375, + "step": 8550 + }, + { + "epoch": 0.9707703169180227, + "grad_norm": 0.064453125, + "learning_rate": 0.00029409902226413523, + "loss": 5.1314, + "step": 8551 + }, + { + "epoch": 0.9708838440279418, + "grad_norm": 0.064453125, + "learning_rate": 0.0002933810689015865, + "loss": 5.1703, + "step": 8552 + }, + { + "epoch": 0.9709973711378609, + "grad_norm": 0.06591796875, + "learning_rate": 0.00029266571505172823, + "loss": 5.1458, + "step": 8553 + }, + { + "epoch": 0.97111089824778, + "grad_norm": 0.064453125, + "learning_rate": 0.0002919529630199507, + "loss": 5.1653, + "step": 8554 + }, + { + "epoch": 0.971224425357699, + "grad_norm": 0.0654296875, + "learning_rate": 0.00029124281510325896, + "loss": 5.1452, + "step": 8555 + }, + { + "epoch": 0.9713379524676181, + "grad_norm": 0.06640625, + "learning_rate": 0.000290535273590266, + "loss": 5.1518, + "step": 8556 + }, + { + "epoch": 0.9714514795775372, + "grad_norm": 0.06396484375, + "learning_rate": 0.00028983034076118547, + "loss": 5.1601, + "step": 8557 + }, + { + "epoch": 0.9715650066874563, + "grad_norm": 0.06396484375, + "learning_rate": 0.00028912801888782293, + "loss": 5.1352, + "step": 8558 + }, + { + "epoch": 0.9716785337973753, + "grad_norm": 0.06494140625, + "learning_rate": 0.00028842831023357025, + "loss": 5.1599, + "step": 8559 + }, + { + "epoch": 0.9717920609072944, + "grad_norm": 0.06396484375, + "learning_rate": 0.00028773121705339755, + "loss": 5.1502, + "step": 8560 + }, + { + "epoch": 0.9719055880172135, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002870367415938462, + "loss": 5.1485, + "step": 8561 + }, + { + "epoch": 0.9720191151271326, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002863448860930206, + "loss": 5.1572, + "step": 8562 + }, + { + "epoch": 0.9721326422370516, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002856556527805826, + "loss": 5.1339, + "step": 8563 + }, + { + "epoch": 0.9722461693469707, + "grad_norm": 0.0654296875, + "learning_rate": 0.00028496904387774263, + "loss": 5.1607, + "step": 8564 + }, + { + "epoch": 0.9723596964568898, + "grad_norm": 0.064453125, + "learning_rate": 0.00028428506159725443, + "loss": 5.1588, + "step": 8565 + }, + { + "epoch": 0.9724732235668089, + "grad_norm": 0.06494140625, + "learning_rate": 0.00028360370814340546, + "loss": 5.1873, + "step": 8566 + }, + { + "epoch": 0.972586750676728, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002829249857120128, + "loss": 5.1185, + "step": 8567 + }, + { + "epoch": 0.9727002777866471, + "grad_norm": 0.064453125, + "learning_rate": 0.00028224889649041296, + "loss": 5.1456, + "step": 8568 + }, + { + "epoch": 0.9728138048965662, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002815754426574574, + "loss": 5.1567, + "step": 8569 + }, + { + "epoch": 0.9729273320064853, + "grad_norm": 0.06396484375, + "learning_rate": 0.00028090462638350394, + "loss": 5.1474, + "step": 8570 + }, + { + "epoch": 0.9730408591164044, + "grad_norm": 0.06689453125, + "learning_rate": 0.0002802364498304102, + "loss": 5.166, + "step": 8571 + }, + { + "epoch": 0.9731543862263234, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002795709151515272, + "loss": 5.1605, + "step": 8572 + }, + { + "epoch": 0.9732679133362425, + "grad_norm": 0.064453125, + "learning_rate": 0.0002789080244916909, + "loss": 5.1688, + "step": 8573 + }, + { + "epoch": 0.9733814404461616, + "grad_norm": 0.06396484375, + "learning_rate": 0.00027824777998721806, + "loss": 5.1593, + "step": 8574 + }, + { + "epoch": 0.9734949675560807, + "grad_norm": 0.0654296875, + "learning_rate": 0.0002775901837658956, + "loss": 5.1504, + "step": 8575 + }, + { + "epoch": 0.9736084946659997, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002769352379469773, + "loss": 5.1656, + "step": 8576 + }, + { + "epoch": 0.9737220217759188, + "grad_norm": 0.06396484375, + "learning_rate": 0.00027628294464117424, + "loss": 5.1571, + "step": 8577 + }, + { + "epoch": 0.9738355488858379, + "grad_norm": 0.0654296875, + "learning_rate": 0.00027563330595065024, + "loss": 5.1568, + "step": 8578 + }, + { + "epoch": 0.973949075995757, + "grad_norm": 0.064453125, + "learning_rate": 0.0002749863239690132, + "loss": 5.1562, + "step": 8579 + }, + { + "epoch": 0.974062603105676, + "grad_norm": 0.064453125, + "learning_rate": 0.00027434200078130954, + "loss": 5.1615, + "step": 8580 + }, + { + "epoch": 0.9741761302155951, + "grad_norm": 0.06396484375, + "learning_rate": 0.00027370033846401716, + "loss": 5.1633, + "step": 8581 + }, + { + "epoch": 0.9742896573255142, + "grad_norm": 0.064453125, + "learning_rate": 0.0002730613390850386, + "loss": 5.1638, + "step": 8582 + }, + { + "epoch": 0.9744031844354333, + "grad_norm": 0.06591796875, + "learning_rate": 0.00027242500470369454, + "loss": 5.1587, + "step": 8583 + }, + { + "epoch": 0.9745167115453524, + "grad_norm": 0.064453125, + "learning_rate": 0.0002717913373707167, + "loss": 5.1436, + "step": 8584 + }, + { + "epoch": 0.9746302386552714, + "grad_norm": 0.06396484375, + "learning_rate": 0.00027116033912824247, + "loss": 5.1434, + "step": 8585 + }, + { + "epoch": 0.9747437657651905, + "grad_norm": 0.0654296875, + "learning_rate": 0.00027053201200980643, + "loss": 5.1793, + "step": 8586 + }, + { + "epoch": 0.9748572928751096, + "grad_norm": 0.064453125, + "learning_rate": 0.0002699063580403359, + "loss": 5.1637, + "step": 8587 + }, + { + "epoch": 0.9749708199850287, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002692833792361424, + "loss": 5.1565, + "step": 8588 + }, + { + "epoch": 0.9750843470949477, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002686630776049172, + "loss": 5.1466, + "step": 8589 + }, + { + "epoch": 0.9751978742048668, + "grad_norm": 0.064453125, + "learning_rate": 0.0002680454551457228, + "loss": 5.1508, + "step": 8590 + }, + { + "epoch": 0.9753114013147859, + "grad_norm": 0.06689453125, + "learning_rate": 0.00026743051384898786, + "loss": 5.1237, + "step": 8591 + }, + { + "epoch": 0.975424928424705, + "grad_norm": 0.06494140625, + "learning_rate": 0.00026681825569650037, + "loss": 5.1547, + "step": 8592 + }, + { + "epoch": 0.975538455534624, + "grad_norm": 0.068359375, + "learning_rate": 0.0002662086826614013, + "loss": 5.1528, + "step": 8593 + }, + { + "epoch": 0.9756519826445431, + "grad_norm": 0.064453125, + "learning_rate": 0.00026560179670817835, + "loss": 5.1782, + "step": 8594 + }, + { + "epoch": 0.9757655097544622, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002649975997926589, + "loss": 5.172, + "step": 8595 + }, + { + "epoch": 0.9758790368643813, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002643960938620053, + "loss": 5.1388, + "step": 8596 + }, + { + "epoch": 0.9759925639743003, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002637972808547065, + "loss": 5.1525, + "step": 8597 + }, + { + "epoch": 0.9761060910842194, + "grad_norm": 0.06396484375, + "learning_rate": 0.00026320116270057384, + "loss": 5.1637, + "step": 8598 + }, + { + "epoch": 0.9762196181941385, + "grad_norm": 0.064453125, + "learning_rate": 0.00026260774132073314, + "loss": 5.1719, + "step": 8599 + }, + { + "epoch": 0.9763331453040576, + "grad_norm": 0.06396484375, + "learning_rate": 0.00026201701862761965, + "loss": 5.1764, + "step": 8600 + }, + { + "epoch": 0.9764466724139766, + "grad_norm": 0.06298828125, + "learning_rate": 0.00026142899652497145, + "loss": 5.1427, + "step": 8601 + }, + { + "epoch": 0.9765601995238957, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002608436769078231, + "loss": 5.1842, + "step": 8602 + }, + { + "epoch": 0.9766737266338148, + "grad_norm": 0.06640625, + "learning_rate": 0.00026026106166250027, + "loss": 5.1527, + "step": 8603 + }, + { + "epoch": 0.9767872537437339, + "grad_norm": 0.0634765625, + "learning_rate": 0.00025968115266661285, + "loss": 5.1621, + "step": 8604 + }, + { + "epoch": 0.976900780853653, + "grad_norm": 0.0625, + "learning_rate": 0.0002591039517890492, + "loss": 5.1766, + "step": 8605 + }, + { + "epoch": 0.977014307963572, + "grad_norm": 0.064453125, + "learning_rate": 0.00025852946088997025, + "loss": 5.1638, + "step": 8606 + }, + { + "epoch": 0.9771278350734911, + "grad_norm": 0.06396484375, + "learning_rate": 0.00025795768182080344, + "loss": 5.1427, + "step": 8607 + }, + { + "epoch": 0.9772413621834102, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002573886164242366, + "loss": 5.1831, + "step": 8608 + }, + { + "epoch": 0.9773548892933293, + "grad_norm": 0.064453125, + "learning_rate": 0.00025682226653421226, + "loss": 5.1355, + "step": 8609 + }, + { + "epoch": 0.9774684164032483, + "grad_norm": 0.064453125, + "learning_rate": 0.00025625863397592133, + "loss": 5.1669, + "step": 8610 + }, + { + "epoch": 0.9775819435131674, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002556977205657982, + "loss": 5.172, + "step": 8611 + }, + { + "epoch": 0.9776954706230865, + "grad_norm": 0.06591796875, + "learning_rate": 0.0002551395281115134, + "loss": 5.1424, + "step": 8612 + }, + { + "epoch": 0.9778089977330056, + "grad_norm": 0.0634765625, + "learning_rate": 0.000254584058411969, + "loss": 5.1695, + "step": 8613 + }, + { + "epoch": 0.9779225248429246, + "grad_norm": 0.06298828125, + "learning_rate": 0.00025403131325729195, + "loss": 5.1417, + "step": 8614 + }, + { + "epoch": 0.9780360519528437, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002534812944288294, + "loss": 5.1522, + "step": 8615 + }, + { + "epoch": 0.9781495790627628, + "grad_norm": 0.0634765625, + "learning_rate": 0.000252934003699142, + "loss": 5.1485, + "step": 8616 + }, + { + "epoch": 0.9782631061726819, + "grad_norm": 0.0634765625, + "learning_rate": 0.000252389442831998, + "loss": 5.1561, + "step": 8617 + }, + { + "epoch": 0.9783766332826009, + "grad_norm": 0.0634765625, + "learning_rate": 0.00025184761358236875, + "loss": 5.1676, + "step": 8618 + }, + { + "epoch": 0.97849016039252, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002513085176964219, + "loss": 5.1676, + "step": 8619 + }, + { + "epoch": 0.9786036875024391, + "grad_norm": 0.064453125, + "learning_rate": 0.0002507721569115168, + "loss": 5.1599, + "step": 8620 + }, + { + "epoch": 0.9787172146123582, + "grad_norm": 0.064453125, + "learning_rate": 0.00025023853295619725, + "loss": 5.1543, + "step": 8621 + }, + { + "epoch": 0.9788307417222772, + "grad_norm": 0.06298828125, + "learning_rate": 0.00024970764755018817, + "loss": 5.1539, + "step": 8622 + }, + { + "epoch": 0.9789442688321963, + "grad_norm": 0.064453125, + "learning_rate": 0.00024917950240438804, + "loss": 5.1604, + "step": 8623 + }, + { + "epoch": 0.9790577959421154, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002486540992208646, + "loss": 5.1331, + "step": 8624 + }, + { + "epoch": 0.9791713230520345, + "grad_norm": 0.06494140625, + "learning_rate": 0.00024813143969284893, + "loss": 5.1693, + "step": 8625 + }, + { + "epoch": 0.9792848501619535, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002476115255047302, + "loss": 5.1358, + "step": 8626 + }, + { + "epoch": 0.9793983772718726, + "grad_norm": 0.0625, + "learning_rate": 0.0002470943583320497, + "loss": 5.1416, + "step": 8627 + }, + { + "epoch": 0.9795119043817917, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002465799398414962, + "loss": 5.1871, + "step": 8628 + }, + { + "epoch": 0.9796254314917108, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002460682716909004, + "loss": 5.1599, + "step": 8629 + }, + { + "epoch": 0.9797389586016299, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002455593555292287, + "loss": 5.1541, + "step": 8630 + }, + { + "epoch": 0.9798524857115489, + "grad_norm": 0.06396484375, + "learning_rate": 0.000245053192996579, + "loss": 5.1649, + "step": 8631 + }, + { + "epoch": 0.979966012821468, + "grad_norm": 0.064453125, + "learning_rate": 0.00024454978572417533, + "loss": 5.1633, + "step": 8632 + }, + { + "epoch": 0.9800795399313871, + "grad_norm": 0.06494140625, + "learning_rate": 0.00024404913533436186, + "loss": 5.1495, + "step": 8633 + }, + { + "epoch": 0.9801930670413062, + "grad_norm": 0.06298828125, + "learning_rate": 0.00024355124344059792, + "loss": 5.1779, + "step": 8634 + }, + { + "epoch": 0.9803065941512252, + "grad_norm": 0.0625, + "learning_rate": 0.00024305611164745358, + "loss": 5.1705, + "step": 8635 + }, + { + "epoch": 0.9804201212611443, + "grad_norm": 0.064453125, + "learning_rate": 0.0002425637415506032, + "loss": 5.1446, + "step": 8636 + }, + { + "epoch": 0.9805336483710634, + "grad_norm": 0.0634765625, + "learning_rate": 0.00024207413473682147, + "loss": 5.1645, + "step": 8637 + }, + { + "epoch": 0.9806471754809825, + "grad_norm": 0.06787109375, + "learning_rate": 0.00024158729278397724, + "loss": 5.1464, + "step": 8638 + }, + { + "epoch": 0.9807607025909015, + "grad_norm": 0.06298828125, + "learning_rate": 0.00024110321726102937, + "loss": 5.1403, + "step": 8639 + }, + { + "epoch": 0.9808742297008206, + "grad_norm": 0.06494140625, + "learning_rate": 0.00024062190972802137, + "loss": 5.1649, + "step": 8640 + }, + { + "epoch": 0.9809877568107397, + "grad_norm": 0.0634765625, + "learning_rate": 0.00024014337173607585, + "loss": 5.1627, + "step": 8641 + }, + { + "epoch": 0.9811012839206588, + "grad_norm": 0.0634765625, + "learning_rate": 0.00023966760482739037, + "loss": 5.1667, + "step": 8642 + }, + { + "epoch": 0.9812148110305778, + "grad_norm": 0.0634765625, + "learning_rate": 0.00023919461053523167, + "loss": 5.1573, + "step": 8643 + }, + { + "epoch": 0.9813283381404969, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002387243903839313, + "loss": 5.1371, + "step": 8644 + }, + { + "epoch": 0.981441865250416, + "grad_norm": 0.0634765625, + "learning_rate": 0.00023825694588888046, + "loss": 5.1593, + "step": 8645 + }, + { + "epoch": 0.9815553923603351, + "grad_norm": 0.06396484375, + "learning_rate": 0.00023779227855652525, + "loss": 5.1372, + "step": 8646 + }, + { + "epoch": 0.9816689194702541, + "grad_norm": 0.06396484375, + "learning_rate": 0.00023733038988436167, + "loss": 5.1428, + "step": 8647 + }, + { + "epoch": 0.9817824465801732, + "grad_norm": 0.0634765625, + "learning_rate": 0.000236871281360931, + "loss": 5.1505, + "step": 8648 + }, + { + "epoch": 0.9818959736900923, + "grad_norm": 0.0634765625, + "learning_rate": 0.00023641495446581447, + "loss": 5.1238, + "step": 8649 + }, + { + "epoch": 0.9820095008000114, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002359614106696293, + "loss": 5.1757, + "step": 8650 + }, + { + "epoch": 0.9821230279099304, + "grad_norm": 0.0634765625, + "learning_rate": 0.00023551065143402318, + "loss": 5.127, + "step": 8651 + }, + { + "epoch": 0.9822365550198495, + "grad_norm": 0.062255859375, + "learning_rate": 0.00023506267821167033, + "loss": 5.1608, + "step": 8652 + }, + { + "epoch": 0.9823500821297686, + "grad_norm": 0.06396484375, + "learning_rate": 0.00023461749244626604, + "loss": 5.1723, + "step": 8653 + }, + { + "epoch": 0.9824636092396877, + "grad_norm": 0.0625, + "learning_rate": 0.00023417509557252267, + "loss": 5.1628, + "step": 8654 + }, + { + "epoch": 0.9825771363496068, + "grad_norm": 0.0634765625, + "learning_rate": 0.00023373548901616464, + "loss": 5.162, + "step": 8655 + }, + { + "epoch": 0.9826906634595258, + "grad_norm": 0.06396484375, + "learning_rate": 0.00023329867419392373, + "loss": 5.1505, + "step": 8656 + }, + { + "epoch": 0.9828041905694449, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002328646525135352, + "loss": 5.1481, + "step": 8657 + }, + { + "epoch": 0.982917717679364, + "grad_norm": 0.064453125, + "learning_rate": 0.0002324334253737321, + "loss": 5.1644, + "step": 8658 + }, + { + "epoch": 0.983031244789283, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002320049941642421, + "loss": 5.1591, + "step": 8659 + }, + { + "epoch": 0.9831447718992021, + "grad_norm": 0.0634765625, + "learning_rate": 0.00023157936026578185, + "loss": 5.1626, + "step": 8660 + }, + { + "epoch": 0.9832582990091212, + "grad_norm": 0.06396484375, + "learning_rate": 0.00023115652505005353, + "loss": 5.1559, + "step": 8661 + }, + { + "epoch": 0.9833718261190403, + "grad_norm": 0.0634765625, + "learning_rate": 0.00023073648987973942, + "loss": 5.1334, + "step": 8662 + }, + { + "epoch": 0.9834853532289594, + "grad_norm": 0.064453125, + "learning_rate": 0.00023031925610849852, + "loss": 5.1693, + "step": 8663 + }, + { + "epoch": 0.9835988803388784, + "grad_norm": 0.06298828125, + "learning_rate": 0.00022990482508096103, + "loss": 5.1548, + "step": 8664 + }, + { + "epoch": 0.9837124074487975, + "grad_norm": 0.06201171875, + "learning_rate": 0.0002294931981327252, + "loss": 5.1424, + "step": 8665 + }, + { + "epoch": 0.9838259345587166, + "grad_norm": 0.064453125, + "learning_rate": 0.0002290843765903526, + "loss": 5.1587, + "step": 8666 + }, + { + "epoch": 0.9839394616686357, + "grad_norm": 0.0634765625, + "learning_rate": 0.00022867836177136312, + "loss": 5.1506, + "step": 8667 + }, + { + "epoch": 0.9840529887785547, + "grad_norm": 0.06396484375, + "learning_rate": 0.00022827515498423205, + "loss": 5.1191, + "step": 8668 + }, + { + "epoch": 0.9841665158884738, + "grad_norm": 0.0634765625, + "learning_rate": 0.00022787475752838448, + "loss": 5.1645, + "step": 8669 + }, + { + "epoch": 0.9842800429983929, + "grad_norm": 0.0625, + "learning_rate": 0.00022747717069419255, + "loss": 5.1468, + "step": 8670 + }, + { + "epoch": 0.984393570108312, + "grad_norm": 0.0634765625, + "learning_rate": 0.00022708239576296987, + "loss": 5.1568, + "step": 8671 + }, + { + "epoch": 0.984507097218231, + "grad_norm": 0.064453125, + "learning_rate": 0.00022669043400696837, + "loss": 5.1601, + "step": 8672 + }, + { + "epoch": 0.9846206243281501, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002263012866893737, + "loss": 5.1506, + "step": 8673 + }, + { + "epoch": 0.9847341514380692, + "grad_norm": 0.064453125, + "learning_rate": 0.00022591495506430156, + "loss": 5.145, + "step": 8674 + }, + { + "epoch": 0.9848476785479883, + "grad_norm": 0.064453125, + "learning_rate": 0.00022553144037679344, + "loss": 5.1336, + "step": 8675 + }, + { + "epoch": 0.9849612056579073, + "grad_norm": 0.062255859375, + "learning_rate": 0.00022515074386281257, + "loss": 5.1797, + "step": 8676 + }, + { + "epoch": 0.9850747327678264, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002247728667492396, + "loss": 5.1442, + "step": 8677 + }, + { + "epoch": 0.9851882598777455, + "grad_norm": 0.06298828125, + "learning_rate": 0.00022439781025386957, + "loss": 5.1528, + "step": 8678 + }, + { + "epoch": 0.9853017869876646, + "grad_norm": 0.06298828125, + "learning_rate": 0.00022402557558540726, + "loss": 5.1608, + "step": 8679 + }, + { + "epoch": 0.9854153140975837, + "grad_norm": 0.06494140625, + "learning_rate": 0.00022365616394346333, + "loss": 5.1588, + "step": 8680 + }, + { + "epoch": 0.9855288412075027, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002232895765185508, + "loss": 5.1482, + "step": 8681 + }, + { + "epoch": 0.9856423683174218, + "grad_norm": 0.0654296875, + "learning_rate": 0.00022292581449208048, + "loss": 5.1762, + "step": 8682 + }, + { + "epoch": 0.9857558954273409, + "grad_norm": 0.0625, + "learning_rate": 0.00022256487903635875, + "loss": 5.1579, + "step": 8683 + }, + { + "epoch": 0.98586942253726, + "grad_norm": 0.06298828125, + "learning_rate": 0.00022220677131458172, + "loss": 5.1438, + "step": 8684 + }, + { + "epoch": 0.985982949647179, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002218514924808329, + "loss": 5.1589, + "step": 8685 + }, + { + "epoch": 0.9860964767570981, + "grad_norm": 0.0654296875, + "learning_rate": 0.00022149904368007903, + "loss": 5.1768, + "step": 8686 + }, + { + "epoch": 0.9862100038670172, + "grad_norm": 0.0634765625, + "learning_rate": 0.00022114942604816633, + "loss": 5.1403, + "step": 8687 + }, + { + "epoch": 0.9863235309769363, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002208026407118172, + "loss": 5.165, + "step": 8688 + }, + { + "epoch": 0.9864370580868553, + "grad_norm": 0.064453125, + "learning_rate": 0.0002204586887886259, + "loss": 5.1627, + "step": 8689 + }, + { + "epoch": 0.9865505851967744, + "grad_norm": 0.064453125, + "learning_rate": 0.0002201175713870558, + "loss": 5.171, + "step": 8690 + }, + { + "epoch": 0.9866641123066935, + "grad_norm": 0.06396484375, + "learning_rate": 0.00021977928960643513, + "loss": 5.156, + "step": 8691 + }, + { + "epoch": 0.9867776394166126, + "grad_norm": 0.06201171875, + "learning_rate": 0.00021944384453695387, + "loss": 5.1631, + "step": 8692 + }, + { + "epoch": 0.9868911665265316, + "grad_norm": 0.0625, + "learning_rate": 0.00021911123725965972, + "loss": 5.1614, + "step": 8693 + }, + { + "epoch": 0.9870046936364507, + "grad_norm": 0.06201171875, + "learning_rate": 0.00021878146884645542, + "loss": 5.1535, + "step": 8694 + }, + { + "epoch": 0.9871182207463698, + "grad_norm": 0.06298828125, + "learning_rate": 0.00021845454036009438, + "loss": 5.1541, + "step": 8695 + }, + { + "epoch": 0.9872317478562889, + "grad_norm": 0.0634765625, + "learning_rate": 0.00021813045285417785, + "loss": 5.173, + "step": 8696 + }, + { + "epoch": 0.9873452749662079, + "grad_norm": 0.06298828125, + "learning_rate": 0.00021780920737315152, + "loss": 5.1459, + "step": 8697 + }, + { + "epoch": 0.987458802076127, + "grad_norm": 0.0625, + "learning_rate": 0.000217490804952302, + "loss": 5.1545, + "step": 8698 + }, + { + "epoch": 0.9875723291860461, + "grad_norm": 0.06396484375, + "learning_rate": 0.00021717524661775322, + "loss": 5.1579, + "step": 8699 + }, + { + "epoch": 0.9876858562959652, + "grad_norm": 0.06396484375, + "learning_rate": 0.00021686253338646378, + "loss": 5.1641, + "step": 8700 + }, + { + "epoch": 0.9877993834058842, + "grad_norm": 0.0654296875, + "learning_rate": 0.00021655266626622295, + "loss": 5.1597, + "step": 8701 + }, + { + "epoch": 0.9879129105158033, + "grad_norm": 0.0634765625, + "learning_rate": 0.00021624564625564782, + "loss": 5.1852, + "step": 8702 + }, + { + "epoch": 0.9880264376257224, + "grad_norm": 0.0625, + "learning_rate": 0.00021594147434418027, + "loss": 5.159, + "step": 8703 + }, + { + "epoch": 0.9881399647356415, + "grad_norm": 0.0634765625, + "learning_rate": 0.00021564015151208314, + "loss": 5.1526, + "step": 8704 + }, + { + "epoch": 0.9882534918455605, + "grad_norm": 0.064453125, + "learning_rate": 0.00021534167873043806, + "loss": 5.157, + "step": 8705 + }, + { + "epoch": 0.9883670189554796, + "grad_norm": 0.064453125, + "learning_rate": 0.00021504605696114095, + "loss": 5.17, + "step": 8706 + }, + { + "epoch": 0.9884805460653987, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002147532871569004, + "loss": 5.1395, + "step": 8707 + }, + { + "epoch": 0.9885940731753178, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002144633702612333, + "loss": 5.1422, + "step": 8708 + }, + { + "epoch": 0.9887076002852369, + "grad_norm": 0.0634765625, + "learning_rate": 0.00021417630720846305, + "loss": 5.152, + "step": 8709 + }, + { + "epoch": 0.9888211273951559, + "grad_norm": 0.0625, + "learning_rate": 0.00021389209892371523, + "loss": 5.1682, + "step": 8710 + }, + { + "epoch": 0.988934654505075, + "grad_norm": 0.06494140625, + "learning_rate": 0.00021361074632291561, + "loss": 5.1635, + "step": 8711 + }, + { + "epoch": 0.9890481816149941, + "grad_norm": 0.06396484375, + "learning_rate": 0.00021333225031278701, + "loss": 5.1526, + "step": 8712 + }, + { + "epoch": 0.9891617087249132, + "grad_norm": 0.06298828125, + "learning_rate": 0.000213056611790846, + "loss": 5.1435, + "step": 8713 + }, + { + "epoch": 0.9892752358348322, + "grad_norm": 0.06396484375, + "learning_rate": 0.00021278383164540028, + "loss": 5.163, + "step": 8714 + }, + { + "epoch": 0.9893887629447513, + "grad_norm": 0.06396484375, + "learning_rate": 0.00021251391075554583, + "loss": 5.1407, + "step": 8715 + }, + { + "epoch": 0.9895022900546704, + "grad_norm": 0.06201171875, + "learning_rate": 0.00021224684999116407, + "loss": 5.1741, + "step": 8716 + }, + { + "epoch": 0.9896158171645895, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002119826502129188, + "loss": 5.1389, + "step": 8717 + }, + { + "epoch": 0.9897293442745085, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002117213122722539, + "loss": 5.1593, + "step": 8718 + }, + { + "epoch": 0.9898428713844276, + "grad_norm": 0.0625, + "learning_rate": 0.00021146283701139028, + "loss": 5.1619, + "step": 8719 + }, + { + "epoch": 0.9899563984943467, + "grad_norm": 0.06298828125, + "learning_rate": 0.00021120722526332328, + "loss": 5.15, + "step": 8720 + }, + { + "epoch": 0.9900699256042658, + "grad_norm": 0.06298828125, + "learning_rate": 0.00021095447785181954, + "loss": 5.1518, + "step": 8721 + }, + { + "epoch": 0.9901834527141848, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002107045955914152, + "loss": 5.1544, + "step": 8722 + }, + { + "epoch": 0.9902969798241039, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002104575792874124, + "loss": 5.1629, + "step": 8723 + }, + { + "epoch": 0.990410506934023, + "grad_norm": 0.0634765625, + "learning_rate": 0.00021021342973587745, + "loss": 5.1611, + "step": 8724 + }, + { + "epoch": 0.9905240340439421, + "grad_norm": 0.06787109375, + "learning_rate": 0.00020997214772363763, + "loss": 5.1418, + "step": 8725 + }, + { + "epoch": 0.9906375611538611, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020973373402827908, + "loss": 5.1578, + "step": 8726 + }, + { + "epoch": 0.9907510882637802, + "grad_norm": 0.0634765625, + "learning_rate": 0.00020949818941814406, + "loss": 5.1534, + "step": 8727 + }, + { + "epoch": 0.9908646153736993, + "grad_norm": 0.0625, + "learning_rate": 0.00020926551465232863, + "loss": 5.1665, + "step": 8728 + }, + { + "epoch": 0.9909781424836184, + "grad_norm": 0.0625, + "learning_rate": 0.00020903571048067992, + "loss": 5.1685, + "step": 8729 + }, + { + "epoch": 0.9910916695935374, + "grad_norm": 0.0634765625, + "learning_rate": 0.00020880877764379414, + "loss": 5.1565, + "step": 8730 + }, + { + "epoch": 0.9912051967034565, + "grad_norm": 0.06982421875, + "learning_rate": 0.00020858471687301393, + "loss": 5.1554, + "step": 8731 + }, + { + "epoch": 0.9913187238133756, + "grad_norm": 0.06591796875, + "learning_rate": 0.000208363528890426, + "loss": 5.1642, + "step": 8732 + }, + { + "epoch": 0.9914322509232947, + "grad_norm": 0.064453125, + "learning_rate": 0.00020814521440885877, + "loss": 5.1496, + "step": 8733 + }, + { + "epoch": 0.9915457780332138, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002079297741318803, + "loss": 5.1448, + "step": 8734 + }, + { + "epoch": 0.9916593051431328, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020771720875379586, + "loss": 5.134, + "step": 8735 + }, + { + "epoch": 0.9917728322530519, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020750751895964547, + "loss": 5.1615, + "step": 8736 + }, + { + "epoch": 0.991886359362971, + "grad_norm": 0.06396484375, + "learning_rate": 0.00020730070542520233, + "loss": 5.1553, + "step": 8737 + }, + { + "epoch": 0.9919998864728901, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020709676881697003, + "loss": 5.1501, + "step": 8738 + }, + { + "epoch": 0.9921134135828091, + "grad_norm": 0.06396484375, + "learning_rate": 0.00020689570979218037, + "loss": 5.1231, + "step": 8739 + }, + { + "epoch": 0.9922269406927282, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020669752899879192, + "loss": 5.1596, + "step": 8740 + }, + { + "epoch": 0.9923404678026473, + "grad_norm": 0.06396484375, + "learning_rate": 0.00020650222707548717, + "loss": 5.1706, + "step": 8741 + }, + { + "epoch": 0.9924539949125664, + "grad_norm": 0.06396484375, + "learning_rate": 0.00020630980465167106, + "loss": 5.1505, + "step": 8742 + }, + { + "epoch": 0.9925675220224854, + "grad_norm": 0.0625, + "learning_rate": 0.0002061202623474684, + "loss": 5.1626, + "step": 8743 + }, + { + "epoch": 0.9926810491324045, + "grad_norm": 0.062255859375, + "learning_rate": 0.00020593360077372249, + "loss": 5.1444, + "step": 8744 + }, + { + "epoch": 0.9927945762423236, + "grad_norm": 0.062255859375, + "learning_rate": 0.00020574982053199245, + "loss": 5.1584, + "step": 8745 + }, + { + "epoch": 0.9929081033522427, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020556892221455195, + "loss": 5.1746, + "step": 8746 + }, + { + "epoch": 0.9930216304621617, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020539090640438691, + "loss": 5.149, + "step": 8747 + }, + { + "epoch": 0.9931351575720808, + "grad_norm": 0.062255859375, + "learning_rate": 0.00020521577367519373, + "loss": 5.1403, + "step": 8748 + }, + { + "epoch": 0.9932486846819999, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002050435245913772, + "loss": 5.1414, + "step": 8749 + }, + { + "epoch": 0.993362211791919, + "grad_norm": 0.06494140625, + "learning_rate": 0.00020487415970804935, + "loss": 5.1603, + "step": 8750 + }, + { + "epoch": 0.993475738901838, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020470767957102694, + "loss": 5.1697, + "step": 8751 + }, + { + "epoch": 0.9935892660117571, + "grad_norm": 0.061767578125, + "learning_rate": 0.00020454408471682988, + "loss": 5.1719, + "step": 8752 + }, + { + "epoch": 0.9937027931216762, + "grad_norm": 0.06396484375, + "learning_rate": 0.00020438337567267993, + "loss": 5.1562, + "step": 8753 + }, + { + "epoch": 0.9938163202315953, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002042255529564982, + "loss": 5.1772, + "step": 8754 + }, + { + "epoch": 0.9939298473415143, + "grad_norm": 0.0634765625, + "learning_rate": 0.00020407061707690467, + "loss": 5.1478, + "step": 8755 + }, + { + "epoch": 0.9940433744514334, + "grad_norm": 0.0634765625, + "learning_rate": 0.000203918568533215, + "loss": 5.1444, + "step": 8756 + }, + { + "epoch": 0.9941569015613525, + "grad_norm": 0.06494140625, + "learning_rate": 0.00020376940781544035, + "loss": 5.1625, + "step": 8757 + }, + { + "epoch": 0.9942704286712716, + "grad_norm": 0.06396484375, + "learning_rate": 0.00020362313540428484, + "loss": 5.153, + "step": 8758 + }, + { + "epoch": 0.9943839557811907, + "grad_norm": 0.0634765625, + "learning_rate": 0.00020347975177114475, + "loss": 5.1682, + "step": 8759 + }, + { + "epoch": 0.9944974828911097, + "grad_norm": 0.0625, + "learning_rate": 0.00020333925737810632, + "loss": 5.1691, + "step": 8760 + }, + { + "epoch": 0.9946110100010288, + "grad_norm": 0.0634765625, + "learning_rate": 0.00020320165267794462, + "loss": 5.1531, + "step": 8761 + }, + { + "epoch": 0.9947245371109479, + "grad_norm": 0.0634765625, + "learning_rate": 0.00020306693811412199, + "loss": 5.1392, + "step": 8762 + }, + { + "epoch": 0.994838064220867, + "grad_norm": 0.061767578125, + "learning_rate": 0.00020293511412078688, + "loss": 5.1369, + "step": 8763 + }, + { + "epoch": 0.994951591330786, + "grad_norm": 0.0625, + "learning_rate": 0.0002028061811227718, + "loss": 5.1646, + "step": 8764 + }, + { + "epoch": 0.9950651184407051, + "grad_norm": 0.062255859375, + "learning_rate": 0.00020268013953559277, + "loss": 5.1592, + "step": 8765 + }, + { + "epoch": 0.9951786455506242, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020255698976544747, + "loss": 5.1502, + "step": 8766 + }, + { + "epoch": 0.9952921726605433, + "grad_norm": 0.0634765625, + "learning_rate": 0.00020243673220921393, + "loss": 5.142, + "step": 8767 + }, + { + "epoch": 0.9954056997704623, + "grad_norm": 0.0634765625, + "learning_rate": 0.00020231936725444938, + "loss": 5.1435, + "step": 8768 + }, + { + "epoch": 0.9955192268803814, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002022048952793891, + "loss": 5.1484, + "step": 8769 + }, + { + "epoch": 0.9956327539903005, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002020933166529451, + "loss": 5.18, + "step": 8770 + }, + { + "epoch": 0.9957462811002196, + "grad_norm": 0.0625, + "learning_rate": 0.00020198463173470477, + "loss": 5.1516, + "step": 8771 + }, + { + "epoch": 0.9958598082101386, + "grad_norm": 0.062255859375, + "learning_rate": 0.00020187884087492991, + "loss": 5.1448, + "step": 8772 + }, + { + "epoch": 0.9959733353200577, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002017759444145556, + "loss": 5.1466, + "step": 8773 + }, + { + "epoch": 0.9960868624299768, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002016759426851891, + "loss": 5.1691, + "step": 8774 + }, + { + "epoch": 0.9962003895398959, + "grad_norm": 0.064453125, + "learning_rate": 0.0002015788360091087, + "loss": 5.1403, + "step": 8775 + }, + { + "epoch": 0.996313916649815, + "grad_norm": 0.064453125, + "learning_rate": 0.00020148462469926265, + "loss": 5.1548, + "step": 8776 + }, + { + "epoch": 0.996427443759734, + "grad_norm": 0.06494140625, + "learning_rate": 0.00020139330905926835, + "loss": 5.1401, + "step": 8777 + }, + { + "epoch": 0.9965409708696531, + "grad_norm": 0.06201171875, + "learning_rate": 0.00020130488938341118, + "loss": 5.1417, + "step": 8778 + }, + { + "epoch": 0.9966544979795722, + "grad_norm": 0.0625, + "learning_rate": 0.00020121936595664358, + "loss": 5.1478, + "step": 8779 + }, + { + "epoch": 0.9967680250894912, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002011367390545843, + "loss": 5.1462, + "step": 8780 + }, + { + "epoch": 0.9968815521994103, + "grad_norm": 0.0625, + "learning_rate": 0.00020105700894351716, + "loss": 5.1421, + "step": 8781 + }, + { + "epoch": 0.9969950793093294, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002009801758803907, + "loss": 5.1539, + "step": 8782 + }, + { + "epoch": 0.9971086064192485, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020090624011281688, + "loss": 5.1587, + "step": 8783 + }, + { + "epoch": 0.9972221335291676, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020083520187907033, + "loss": 5.1473, + "step": 8784 + }, + { + "epoch": 0.9973356606390866, + "grad_norm": 0.0634765625, + "learning_rate": 0.00020076706140808814, + "loss": 5.149, + "step": 8785 + }, + { + "epoch": 0.9974491877490057, + "grad_norm": 0.0625, + "learning_rate": 0.00020070181891946833, + "loss": 5.1587, + "step": 8786 + }, + { + "epoch": 0.9975627148589248, + "grad_norm": 0.06396484375, + "learning_rate": 0.00020063947462346975, + "loss": 5.1448, + "step": 8787 + }, + { + "epoch": 0.9976762419688439, + "grad_norm": 0.062255859375, + "learning_rate": 0.00020058002872101077, + "loss": 5.1528, + "step": 8788 + }, + { + "epoch": 0.9977897690787629, + "grad_norm": 0.06298828125, + "learning_rate": 0.0002005234814036696, + "loss": 5.1646, + "step": 8789 + }, + { + "epoch": 0.997903296188682, + "grad_norm": 0.06396484375, + "learning_rate": 0.0002004698328536827, + "loss": 5.1574, + "step": 8790 + }, + { + "epoch": 0.9980168232986011, + "grad_norm": 0.062255859375, + "learning_rate": 0.0002004190832439447, + "loss": 5.1553, + "step": 8791 + }, + { + "epoch": 0.9981303504085202, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020037123273800782, + "loss": 5.1813, + "step": 8792 + }, + { + "epoch": 0.9982438775184392, + "grad_norm": 0.0625, + "learning_rate": 0.000200326281490081, + "loss": 5.1492, + "step": 8793 + }, + { + "epoch": 0.9983574046283583, + "grad_norm": 0.06201171875, + "learning_rate": 0.00020028422964503008, + "loss": 5.1408, + "step": 8794 + }, + { + "epoch": 0.9984709317382774, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020024507733837644, + "loss": 5.1743, + "step": 8795 + }, + { + "epoch": 0.9985844588481965, + "grad_norm": 0.062255859375, + "learning_rate": 0.00020020882469629753, + "loss": 5.1757, + "step": 8796 + }, + { + "epoch": 0.9986979859581155, + "grad_norm": 0.06201171875, + "learning_rate": 0.00020017547183562553, + "loss": 5.1422, + "step": 8797 + }, + { + "epoch": 0.9988115130680346, + "grad_norm": 0.061767578125, + "learning_rate": 0.00020014501886384765, + "loss": 5.1514, + "step": 8798 + }, + { + "epoch": 0.9989250401779537, + "grad_norm": 0.062255859375, + "learning_rate": 0.00020011746587910557, + "loss": 5.1403, + "step": 8799 + }, + { + "epoch": 0.9990385672878728, + "grad_norm": 0.064453125, + "learning_rate": 0.00020009281297019498, + "loss": 5.1583, + "step": 8800 + }, + { + "epoch": 0.9991520943977918, + "grad_norm": 0.0615234375, + "learning_rate": 0.0002000710602165655, + "loss": 5.1545, + "step": 8801 + }, + { + "epoch": 0.9992656215077109, + "grad_norm": 0.0625, + "learning_rate": 0.0002000522076883203, + "loss": 5.1756, + "step": 8802 + }, + { + "epoch": 0.99937914861763, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020003625544621603, + "loss": 5.1673, + "step": 8803 + }, + { + "epoch": 0.9994926757275491, + "grad_norm": 0.0625, + "learning_rate": 0.00020002320354166228, + "loss": 5.1647, + "step": 8804 + }, + { + "epoch": 0.9996062028374681, + "grad_norm": 0.06298828125, + "learning_rate": 0.00020001305201672182, + "loss": 5.1787, + "step": 8805 + }, + { + "epoch": 0.9997197299473872, + "grad_norm": 0.0634765625, + "learning_rate": 0.00020000580090411026, + "loss": 5.1421, + "step": 8806 + }, + { + "epoch": 0.9998332570573063, + "grad_norm": 0.0625, + "learning_rate": 0.00020000145022719605, + "loss": 5.1743, + "step": 8807 + }, + { + "epoch": 0.9999467841672254, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002, + "loss": 5.143, + "step": 8808 + }, + { + "epoch": 0.9999467841672254, + "step": 8808, + "total_flos": 4.588406021280372e+20, + "train_loss": 5.391583533096487, + "train_runtime": 330234.0549, + "train_samples_per_second": 23.899, + "train_steps_per_second": 0.027 + } + ], + "logging_steps": 1.0, + "max_steps": 8808, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.588406021280372e+20, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}