diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,9194 +3,2327 @@ "best_model_checkpoint": null, "epoch": 0.9999235532451648, "eval_steps": 500, - "global_step": 6540, + "global_step": 1635, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.00015289350967051448, - "grad_norm": 0.25844491471013625, - "learning_rate": 1.5290519877675841e-06, - "loss": 1.5751, + "epoch": 0.0006115740386820579, + "grad_norm": 0.12052220602859784, + "learning_rate": 6.0975609756097564e-06, + "loss": 1.6245, "step": 1 }, { - "epoch": 0.0007644675483525725, - "grad_norm": 0.23350215723561454, - "learning_rate": 7.645259938837921e-06, - "loss": 1.6179, + "epoch": 0.00305787019341029, + "grad_norm": 0.11718955528750491, + "learning_rate": 3.048780487804878e-05, + "loss": 1.5875, "step": 5 }, { - "epoch": 0.001528935096705145, - "grad_norm": 0.23748541894682731, - "learning_rate": 1.5290519877675842e-05, - "loss": 1.5771, + "epoch": 0.00611574038682058, + "grad_norm": 0.07463826304526239, + "learning_rate": 6.097560975609756e-05, + "loss": 1.5734, "step": 10 }, { - "epoch": 0.0022934026450577173, - "grad_norm": 0.20165559806581407, - "learning_rate": 2.2935779816513765e-05, - "loss": 1.5702, + "epoch": 0.009173610580230869, + "grad_norm": 0.07065399016948878, + "learning_rate": 9.146341463414634e-05, + "loss": 1.4926, "step": 15 }, { - "epoch": 0.00305787019341029, - "grad_norm": 0.15999746610192153, - "learning_rate": 3.0581039755351684e-05, - "loss": 1.5676, + "epoch": 0.01223148077364116, + "grad_norm": 0.06072873212977001, + "learning_rate": 0.00012195121951219512, + "loss": 1.458, "step": 20 }, { - "epoch": 0.0038223377417628624, - "grad_norm": 0.15565801558030515, - "learning_rate": 3.82262996941896e-05, - "loss": 1.5135, + "epoch": 0.01528935096705145, + "grad_norm": 0.0562127370758177, + "learning_rate": 0.0001524390243902439, + "loss": 1.4146, "step": 25 }, { - "epoch": 0.0045868052901154345, - "grad_norm": 0.1783724098220412, - "learning_rate": 4.587155963302753e-05, - "loss": 1.5051, + "epoch": 0.018347221160461738, + "grad_norm": 0.0369940258125185, + "learning_rate": 0.00018292682926829268, + "loss": 1.3727, "step": 30 }, { - "epoch": 0.005351272838468007, - "grad_norm": 0.14330557099797261, - "learning_rate": 5.3516819571865445e-05, - "loss": 1.4676, + "epoch": 0.021405091353872027, + "grad_norm": 0.03030856084221372, + "learning_rate": 0.00021341463414634146, + "loss": 1.3452, "step": 35 }, { - "epoch": 0.00611574038682058, - "grad_norm": 0.119715470035023, - "learning_rate": 6.116207951070337e-05, - "loss": 1.4462, + "epoch": 0.02446296154728232, + "grad_norm": 0.029052665253947514, + "learning_rate": 0.00024390243902439024, + "loss": 1.3062, "step": 40 }, { - "epoch": 0.006880207935173152, - "grad_norm": 0.11217846902570104, - "learning_rate": 6.880733944954129e-05, - "loss": 1.3848, + "epoch": 0.027520831740692607, + "grad_norm": 0.027169249076678257, + "learning_rate": 0.00027439024390243905, + "loss": 1.3193, "step": 45 }, { - "epoch": 0.007644675483525725, - "grad_norm": 0.10423836816079957, - "learning_rate": 7.64525993883792e-05, - "loss": 1.3812, + "epoch": 0.0305787019341029, + "grad_norm": 0.026308314290126287, + "learning_rate": 0.0003048780487804878, + "loss": 1.3059, "step": 50 }, { - "epoch": 0.008409143031878296, - "grad_norm": 0.10698713236366712, - "learning_rate": 8.409785932721712e-05, - "loss": 1.352, + "epoch": 0.033636572127513184, + "grad_norm": 0.02558312476230526, + "learning_rate": 0.0003353658536585366, + "loss": 1.2978, "step": 55 }, { - "epoch": 0.009173610580230869, - "grad_norm": 0.09868273967248942, - "learning_rate": 9.174311926605506e-05, - "loss": 1.374, + "epoch": 0.036694442320923476, + "grad_norm": 0.02366166156410773, + "learning_rate": 0.00036585365853658537, + "loss": 1.2835, "step": 60 }, { - "epoch": 0.009938078128583442, - "grad_norm": 0.08791146321468311, - "learning_rate": 9.938837920489297e-05, - "loss": 1.375, + "epoch": 0.03975231251433377, + "grad_norm": 0.023696910212749084, + "learning_rate": 0.0003963414634146342, + "loss": 1.2801, "step": 65 }, { - "epoch": 0.010702545676936013, - "grad_norm": 0.09568154037565704, - "learning_rate": 0.00010703363914373089, - "loss": 1.3381, + "epoch": 0.04281018270774405, + "grad_norm": 0.023961941292451862, + "learning_rate": 0.0004268292682926829, + "loss": 1.2862, "step": 70 }, { - "epoch": 0.011467013225288586, - "grad_norm": 0.09637997956354047, - "learning_rate": 0.00011467889908256881, - "loss": 1.3444, + "epoch": 0.045868052901154345, + "grad_norm": 0.026300989505884765, + "learning_rate": 0.00045731707317073173, + "loss": 1.2738, "step": 75 }, { - "epoch": 0.01223148077364116, - "grad_norm": 0.09217973928630054, - "learning_rate": 0.00012232415902140674, - "loss": 1.3413, + "epoch": 0.04892592309456464, + "grad_norm": 0.024085664746400153, + "learning_rate": 0.0004878048780487805, + "loss": 1.267, "step": 80 }, { - "epoch": 0.01299594832199373, - "grad_norm": 0.08948686274207032, - "learning_rate": 0.00012996941896024463, - "loss": 1.3062, + "epoch": 0.05198379328797492, + "grad_norm": 0.02400704254026435, + "learning_rate": 0.0005182926829268293, + "loss": 1.2687, "step": 85 }, { - "epoch": 0.013760415870346304, - "grad_norm": 0.08961531821702039, - "learning_rate": 0.00013761467889908258, - "loss": 1.3533, + "epoch": 0.055041663481385214, + "grad_norm": 0.028737409033153787, + "learning_rate": 0.0005487804878048781, + "loss": 1.248, "step": 90 }, { - "epoch": 0.014524883418698877, - "grad_norm": 0.09082536551122133, - "learning_rate": 0.0001452599388379205, - "loss": 1.3422, + "epoch": 0.058099533674795506, + "grad_norm": 0.033246208383736264, + "learning_rate": 0.0005792682926829268, + "loss": 1.2693, "step": 95 }, { - "epoch": 0.01528935096705145, - "grad_norm": 0.08401976510617441, - "learning_rate": 0.0001529051987767584, - "loss": 1.3419, + "epoch": 0.0611574038682058, + "grad_norm": 0.02684140817882857, + "learning_rate": 0.0006097560975609756, + "loss": 1.2562, "step": 100 }, { - "epoch": 0.016053818515404022, - "grad_norm": 0.13830177296315044, - "learning_rate": 0.00016055045871559635, - "loss": 1.313, + "epoch": 0.06421527406161609, + "grad_norm": 0.026557034480265206, + "learning_rate": 0.0006402439024390244, + "loss": 1.2286, "step": 105 }, { - "epoch": 0.016818286063756592, - "grad_norm": 0.08531248094678413, - "learning_rate": 0.00016819571865443425, - "loss": 1.308, + "epoch": 0.06727314425502637, + "grad_norm": 0.025820861749653763, + "learning_rate": 0.0006707317073170732, + "loss": 1.2678, "step": 110 }, { - "epoch": 0.017582753612109165, - "grad_norm": 0.09250245345131555, - "learning_rate": 0.00017584097859327217, - "loss": 1.3213, + "epoch": 0.07033101444843666, + "grad_norm": 0.029846828731098105, + "learning_rate": 0.0007012195121951219, + "loss": 1.2335, "step": 115 }, { - "epoch": 0.018347221160461738, - "grad_norm": 0.09366340563313799, - "learning_rate": 0.00018348623853211012, - "loss": 1.3243, + "epoch": 0.07338888464184695, + "grad_norm": 0.03508112839172254, + "learning_rate": 0.0007317073170731707, + "loss": 1.2663, "step": 120 }, { - "epoch": 0.01911168870881431, - "grad_norm": 0.09609974405919121, - "learning_rate": 0.00019113149847094801, - "loss": 1.2916, + "epoch": 0.07644675483525724, + "grad_norm": 0.0352184112282257, + "learning_rate": 0.0007621951219512195, + "loss": 1.2399, "step": 125 }, { - "epoch": 0.019876156257166884, - "grad_norm": 0.09358863492514614, - "learning_rate": 0.00019877675840978594, - "loss": 1.3238, + "epoch": 0.07950462502866754, + "grad_norm": 0.03322649965458742, + "learning_rate": 0.0007926829268292683, + "loss": 1.2503, "step": 130 }, { - "epoch": 0.020640623805519457, - "grad_norm": 0.08913000043934122, - "learning_rate": 0.00020642201834862386, - "loss": 1.2873, + "epoch": 0.08256249522207783, + "grad_norm": 0.028514272146253784, + "learning_rate": 0.000823170731707317, + "loss": 1.2468, "step": 135 }, { - "epoch": 0.021405091353872027, - "grad_norm": 0.0884733062294589, - "learning_rate": 0.00021406727828746178, - "loss": 1.2956, + "epoch": 0.0856203654154881, + "grad_norm": 0.028671295675629906, + "learning_rate": 0.0008536585365853659, + "loss": 1.23, "step": 140 }, { - "epoch": 0.0221695589022246, - "grad_norm": 0.08986088170695355, - "learning_rate": 0.0002217125382262997, - "loss": 1.2232, + "epoch": 0.0886782356088984, + "grad_norm": 0.030269321761170552, + "learning_rate": 0.0008841463414634147, + "loss": 1.2296, "step": 145 }, { - "epoch": 0.022934026450577173, - "grad_norm": 0.09329784590851617, - "learning_rate": 0.00022935779816513763, - "loss": 1.2924, + "epoch": 0.09173610580230869, + "grad_norm": 0.02846427788729512, + "learning_rate": 0.0009146341463414635, + "loss": 1.2241, "step": 150 }, { - "epoch": 0.023698493998929746, - "grad_norm": 0.09051908574519414, - "learning_rate": 0.00023700305810397555, - "loss": 1.2969, + "epoch": 0.09479397599571898, + "grad_norm": 0.03975120343371244, + "learning_rate": 0.0009451219512195122, + "loss": 1.244, "step": 155 }, { - "epoch": 0.02446296154728232, - "grad_norm": 0.08933188925183595, - "learning_rate": 0.00024464831804281347, - "loss": 1.2705, + "epoch": 0.09785184618912927, + "grad_norm": 0.03216136084206205, + "learning_rate": 0.000975609756097561, + "loss": 1.2457, "step": 160 }, { - "epoch": 0.02522742909563489, - "grad_norm": 0.09515017865326594, - "learning_rate": 0.00025229357798165137, - "loss": 1.3025, + "epoch": 0.10090971638253957, + "grad_norm": 0.03152661590457252, + "learning_rate": 0.000999998859712815, + "loss": 1.2326, "step": 165 }, { - "epoch": 0.02599189664398746, - "grad_norm": 0.0954616739977279, - "learning_rate": 0.00025993883792048926, - "loss": 1.2614, + "epoch": 0.10396758657594984, + "grad_norm": 0.031922913335073945, + "learning_rate": 0.0009999589502074485, + "loss": 1.249, "step": 170 }, { - "epoch": 0.026756364192340034, - "grad_norm": 0.08878617974216362, - "learning_rate": 0.00026758409785932727, - "loss": 1.3293, + "epoch": 0.10702545676936014, + "grad_norm": 0.04658476708749259, + "learning_rate": 0.0009998620315437507, + "loss": 1.2447, "step": 175 }, { - "epoch": 0.027520831740692607, - "grad_norm": 0.08664411019437332, - "learning_rate": 0.00027522935779816516, - "loss": 1.2649, + "epoch": 0.11008332696277043, + "grad_norm": 0.03617623229838695, + "learning_rate": 0.000999708114773132, + "loss": 1.2419, "step": 180 }, { - "epoch": 0.02828529928904518, - "grad_norm": 0.08911333992689248, - "learning_rate": 0.00028287461773700306, - "loss": 1.3307, + "epoch": 0.11314119715618072, + "grad_norm": 0.03418014417549059, + "learning_rate": 0.0009994972174463639, + "loss": 1.2197, "step": 185 }, { - "epoch": 0.029049766837397753, - "grad_norm": 0.0919892953815635, - "learning_rate": 0.000290519877675841, - "loss": 1.2688, + "epoch": 0.11619906734959101, + "grad_norm": 0.02577313047796619, + "learning_rate": 0.0009992293636115793, + "loss": 1.2334, "step": 190 }, { - "epoch": 0.029814234385750326, - "grad_norm": 0.08982364488062158, - "learning_rate": 0.0002981651376146789, - "loss": 1.3118, + "epoch": 0.1192569375430013, + "grad_norm": 0.026546715317612456, + "learning_rate": 0.0009989045838115292, + "loss": 1.2381, "step": 195 }, { - "epoch": 0.0305787019341029, - "grad_norm": 0.10317910874972114, - "learning_rate": 0.0003058103975535168, - "loss": 1.2273, + "epoch": 0.1223148077364116, + "grad_norm": 0.0326169458915842, + "learning_rate": 0.0009985229150800998, + "loss": 1.2143, "step": 200 }, { - "epoch": 0.03134316948245547, - "grad_norm": 0.0988002844753528, - "learning_rate": 0.00031345565749235475, - "loss": 1.282, + "epoch": 0.12537267792982187, + "grad_norm": 0.03223237682583807, + "learning_rate": 0.0009980844009380904, + "loss": 1.231, "step": 205 }, { - "epoch": 0.032107637030808045, - "grad_norm": 0.09497408054902434, - "learning_rate": 0.0003211009174311927, - "loss": 1.2672, + "epoch": 0.12843054812323218, + "grad_norm": 0.029643352282643223, + "learning_rate": 0.0009975890913882512, + "loss": 1.2469, "step": 210 }, { - "epoch": 0.032872104579160615, - "grad_norm": 0.0882881624063108, - "learning_rate": 0.0003287461773700306, - "loss": 1.2976, + "epoch": 0.13148841831664246, + "grad_norm": 0.027522481255624117, + "learning_rate": 0.0009970370429095794, + "loss": 1.2062, "step": 215 }, { - "epoch": 0.033636572127513184, - "grad_norm": 0.09864983052632054, - "learning_rate": 0.0003363914373088685, - "loss": 1.2793, + "epoch": 0.13454628851005274, + "grad_norm": 0.033075124541469436, + "learning_rate": 0.0009964283184508817, + "loss": 1.224, "step": 220 }, { - "epoch": 0.03440103967586576, - "grad_norm": 0.08805919903970645, - "learning_rate": 0.00034403669724770644, - "loss": 1.2742, + "epoch": 0.13760415870346304, + "grad_norm": 0.027377314114451212, + "learning_rate": 0.0009957629874235948, + "loss": 1.2086, "step": 225 }, { - "epoch": 0.03516550722421833, - "grad_norm": 0.09477232508209228, - "learning_rate": 0.00035168195718654434, - "loss": 1.2745, + "epoch": 0.14066202889687332, + "grad_norm": 0.03090829597386635, + "learning_rate": 0.000995041125693871, + "loss": 1.217, "step": 230 }, { - "epoch": 0.035929974772570906, - "grad_norm": 0.08374687785846717, - "learning_rate": 0.00035932721712538223, - "loss": 1.2962, + "epoch": 0.14371989909028363, + "grad_norm": 0.035975979448955, + "learning_rate": 0.0009942628155739275, + "loss": 1.2133, "step": 235 }, { - "epoch": 0.036694442320923476, - "grad_norm": 0.09194750042724101, - "learning_rate": 0.00036697247706422024, - "loss": 1.243, + "epoch": 0.1467777692836939, + "grad_norm": 0.029218388374324843, + "learning_rate": 0.00099342814581266, + "loss": 1.2056, "step": 240 }, { - "epoch": 0.03745890986927605, - "grad_norm": 0.09258202800876715, - "learning_rate": 0.00037461773700305813, - "loss": 1.293, + "epoch": 0.1498356394771042, + "grad_norm": 0.028688837478414952, + "learning_rate": 0.000992537211585524, + "loss": 1.2145, "step": 245 }, { - "epoch": 0.03822337741762862, - "grad_norm": 0.107430134301919, - "learning_rate": 0.00038226299694189603, - "loss": 1.2844, + "epoch": 0.1528935096705145, + "grad_norm": 0.0302429094788238, + "learning_rate": 0.0009915901144836802, + "loss": 1.2035, "step": 250 }, { - "epoch": 0.03898784496598119, - "grad_norm": 0.08615411635234742, - "learning_rate": 0.0003899082568807339, - "loss": 1.2506, + "epoch": 0.15595137986392477, + "grad_norm": 0.024901652208543252, + "learning_rate": 0.000990586962502413, + "loss": 1.2202, "step": 255 }, { - "epoch": 0.03975231251433377, - "grad_norm": 0.09531408169170505, - "learning_rate": 0.0003975535168195719, - "loss": 1.2637, + "epoch": 0.15900925005733507, + "grad_norm": 0.027892260766706888, + "learning_rate": 0.0009895278700288131, + "loss": 1.1807, "step": 260 }, { - "epoch": 0.04051678006268634, - "grad_norm": 0.08440291399032539, - "learning_rate": 0.00040519877675840977, - "loss": 1.2847, + "epoch": 0.16206712025074535, + "grad_norm": 0.03226968566991327, + "learning_rate": 0.000988412957828737, + "loss": 1.2072, "step": 265 }, { - "epoch": 0.041281247611038914, - "grad_norm": 0.09782115096835144, - "learning_rate": 0.0004128440366972477, - "loss": 1.2672, + "epoch": 0.16512499044415566, + "grad_norm": 0.03872574944189164, + "learning_rate": 0.000987242353033034, + "loss": 1.2037, "step": 270 }, { - "epoch": 0.042045715159391484, - "grad_norm": 0.10289516485156369, - "learning_rate": 0.00042048929663608567, - "loss": 1.3019, + "epoch": 0.16818286063756593, + "grad_norm": 0.033428446400930414, + "learning_rate": 0.000986016189123051, + "loss": 1.1912, "step": 275 }, { - "epoch": 0.04281018270774405, - "grad_norm": 0.10696722168409797, - "learning_rate": 0.00042813455657492356, - "loss": 1.2832, + "epoch": 0.1712407308309762, + "grad_norm": 0.0385131025139653, + "learning_rate": 0.0009847346059154124, + "loss": 1.2141, "step": 280 }, { - "epoch": 0.04357465025609663, - "grad_norm": 0.10607951895487863, - "learning_rate": 0.00043577981651376146, - "loss": 1.2772, + "epoch": 0.17429860102438652, + "grad_norm": 0.026232059261105282, + "learning_rate": 0.0009833977495460754, + "loss": 1.2019, "step": 285 }, { - "epoch": 0.0443391178044492, - "grad_norm": 0.08619472519717984, - "learning_rate": 0.0004434250764525994, - "loss": 1.2527, + "epoch": 0.1773564712177968, + "grad_norm": 0.024160196147301412, + "learning_rate": 0.000982005772453668, + "loss": 1.1787, "step": 290 }, { - "epoch": 0.045103585352801775, - "grad_norm": 0.09914776526024943, - "learning_rate": 0.0004510703363914373, - "loss": 1.3135, + "epoch": 0.1804143414112071, + "grad_norm": 0.022453529587095535, + "learning_rate": 0.0009805588333621066, + "loss": 1.1845, "step": 295 }, { - "epoch": 0.045868052901154345, - "grad_norm": 0.09807796164274683, - "learning_rate": 0.00045871559633027525, - "loss": 1.2597, + "epoch": 0.18347221160461738, + "grad_norm": 0.02720482538531544, + "learning_rate": 0.0009790570972624964, + "loss": 1.1738, "step": 300 }, { - "epoch": 0.04663252044950692, - "grad_norm": 0.10283330899312029, - "learning_rate": 0.00046636085626911315, - "loss": 1.2876, + "epoch": 0.18653008179802769, + "grad_norm": 0.02771296154155037, + "learning_rate": 0.000977500735394318, + "loss": 1.203, "step": 305 }, { - "epoch": 0.04739698799785949, - "grad_norm": 0.08801929670660691, - "learning_rate": 0.0004740061162079511, - "loss": 1.2711, + "epoch": 0.18958795199143796, + "grad_norm": 0.029629116640802273, + "learning_rate": 0.0009758899252259019, + "loss": 1.1867, "step": 310 }, { - "epoch": 0.04816145554621206, - "grad_norm": 0.10424236208657252, - "learning_rate": 0.000481651376146789, - "loss": 1.238, + "epoch": 0.19264582218484824, + "grad_norm": 0.024073515796522824, + "learning_rate": 0.0009742248504341918, + "loss": 1.1892, "step": 315 }, { - "epoch": 0.04892592309456464, - "grad_norm": 0.08993328037756107, - "learning_rate": 0.0004892966360856269, - "loss": 1.2834, + "epoch": 0.19570369237825855, + "grad_norm": 0.026256282210288567, + "learning_rate": 0.0009725057008838005, + "loss": 1.1895, "step": 320 }, { - "epoch": 0.04969039064291721, - "grad_norm": 0.10465808153748822, - "learning_rate": 0.0004969418960244648, - "loss": 1.2948, + "epoch": 0.19876156257166883, + "grad_norm": 0.027127167143907085, + "learning_rate": 0.0009707326726053596, + "loss": 1.1903, "step": 325 }, { - "epoch": 0.05045485819126978, - "grad_norm": 0.08903569990977848, - "learning_rate": 0.0005045871559633027, - "loss": 1.2672, + "epoch": 0.20181943276507913, + "grad_norm": 0.024066276808509948, + "learning_rate": 0.0009689059677731673, + "loss": 1.1732, "step": 330 }, { - "epoch": 0.05121932573962235, - "grad_norm": 0.10316518201667073, - "learning_rate": 0.0005122324159021407, - "loss": 1.2767, + "epoch": 0.2048773029584894, + "grad_norm": 0.03467571344471047, + "learning_rate": 0.0009670257946821346, + "loss": 1.1817, "step": 335 }, { - "epoch": 0.05198379328797492, - "grad_norm": 0.11171289817500524, - "learning_rate": 0.0005198776758409785, - "loss": 1.2606, + "epoch": 0.2079351731518997, + "grad_norm": 0.029277519444581707, + "learning_rate": 0.0009650923677240338, + "loss": 1.1949, "step": 340 }, { - "epoch": 0.0527482608363275, - "grad_norm": 0.09273950516093567, - "learning_rate": 0.0005275229357798165, - "loss": 1.3147, + "epoch": 0.21099304334531, + "grad_norm": 0.058400563521677724, + "learning_rate": 0.0009631059073630522, + "loss": 1.2189, "step": 345 }, { - "epoch": 0.05351272838468007, - "grad_norm": 0.09177716088521977, - "learning_rate": 0.0005351681957186545, - "loss": 1.2496, + "epoch": 0.21405091353872027, + "grad_norm": 0.035079123932762336, + "learning_rate": 0.0009610666401106527, + "loss": 1.183, "step": 350 }, { - "epoch": 0.054277195933032645, - "grad_norm": 0.11167903832080238, - "learning_rate": 0.0005428134556574924, - "loss": 1.2371, + "epoch": 0.21710878373213058, + "grad_norm": 0.027974026832942114, + "learning_rate": 0.0009589747984997454, + "loss": 1.1889, "step": 355 }, { - "epoch": 0.055041663481385214, - "grad_norm": 0.08734633477134537, - "learning_rate": 0.0005504587155963303, - "loss": 1.2304, + "epoch": 0.22016665392554086, + "grad_norm": 0.02669021024572584, + "learning_rate": 0.0009568306210581728, + "loss": 1.1787, "step": 360 }, { - "epoch": 0.05580613102973779, - "grad_norm": 0.08372782755538714, - "learning_rate": 0.0005581039755351683, - "loss": 1.2885, + "epoch": 0.22322452411895116, + "grad_norm": 0.02179715758089009, + "learning_rate": 0.00095463435228151, + "loss": 1.2111, "step": 365 }, { - "epoch": 0.05657059857809036, - "grad_norm": 0.0978669585781756, - "learning_rate": 0.0005657492354740061, - "loss": 1.2748, + "epoch": 0.22628239431236144, + "grad_norm": 0.02384963533856956, + "learning_rate": 0.0009523862426051868, + "loss": 1.1784, "step": 370 }, { - "epoch": 0.05733506612644293, - "grad_norm": 0.09122050324394194, - "learning_rate": 0.0005733944954128441, - "loss": 1.248, + "epoch": 0.22934026450577172, + "grad_norm": 0.028747252177852817, + "learning_rate": 0.00095008654837593, + "loss": 1.1883, "step": 375 }, { - "epoch": 0.058099533674795506, - "grad_norm": 0.10143711570915029, - "learning_rate": 0.000581039755351682, - "loss": 1.3127, + "epoch": 0.23239813469918202, + "grad_norm": 0.040361089898442656, + "learning_rate": 0.0009477355318225334, + "loss": 1.1769, "step": 380 }, { - "epoch": 0.058864001223148076, - "grad_norm": 0.088650537773865, - "learning_rate": 0.0005886850152905199, - "loss": 1.2848, + "epoch": 0.2354560048925923, + "grad_norm": 0.02974537276209321, + "learning_rate": 0.000945333461025956, + "loss": 1.1765, "step": 385 }, { - "epoch": 0.05962846877150065, - "grad_norm": 0.09185160704256218, - "learning_rate": 0.0005963302752293578, - "loss": 1.3034, + "epoch": 0.2385138750860026, + "grad_norm": 0.02326511823628993, + "learning_rate": 0.0009428806098887542, + "loss": 1.1613, "step": 390 }, { - "epoch": 0.06039293631985322, - "grad_norm": 0.08093356027719242, - "learning_rate": 0.0006039755351681957, - "loss": 1.229, + "epoch": 0.24157174527941289, + "grad_norm": 0.02788935788617548, + "learning_rate": 0.0009403772581038481, + "loss": 1.1885, "step": 395 }, { - "epoch": 0.0611574038682058, - "grad_norm": 0.0860338941976869, - "learning_rate": 0.0006116207951070336, - "loss": 1.269, + "epoch": 0.2446296154728232, + "grad_norm": 0.02733650935761564, + "learning_rate": 0.0009378236911226298, + "loss": 1.1413, "step": 400 }, { - "epoch": 0.06192187141655837, - "grad_norm": 0.08948858211939976, - "learning_rate": 0.0006192660550458716, - "loss": 1.2351, + "epoch": 0.24768748566623347, + "grad_norm": 0.026735407407419445, + "learning_rate": 0.0009352202001224133, + "loss": 1.1689, "step": 405 }, { - "epoch": 0.06268633896491094, - "grad_norm": 0.09429709970238356, - "learning_rate": 0.0006269113149847095, - "loss": 1.2485, + "epoch": 0.25074535585964375, + "grad_norm": 0.02496495083042904, + "learning_rate": 0.000932567081973233, + "loss": 1.1766, "step": 410 }, { - "epoch": 0.06345080651326351, - "grad_norm": 0.08705352394865537, - "learning_rate": 0.0006345565749235475, - "loss": 1.2537, + "epoch": 0.25380322605305405, + "grad_norm": 0.022860603493433784, + "learning_rate": 0.0009298646392039917, + "loss": 1.1601, "step": 415 }, { - "epoch": 0.06421527406161609, - "grad_norm": 0.1082260673439644, - "learning_rate": 0.0006422018348623854, - "loss": 1.2463, + "epoch": 0.25686109624646436, + "grad_norm": 0.023001099418093644, + "learning_rate": 0.0009271131799679638, + "loss": 1.17, "step": 420 }, { - "epoch": 0.06497974160996865, - "grad_norm": 0.09403045912297836, - "learning_rate": 0.0006498470948012232, - "loss": 1.304, + "epoch": 0.2599189664398746, + "grad_norm": 0.027817090057780446, + "learning_rate": 0.0009243130180076583, + "loss": 1.1614, "step": 425 }, { - "epoch": 0.06574420915832123, - "grad_norm": 0.10025637341779624, - "learning_rate": 0.0006574923547400612, - "loss": 1.2816, + "epoch": 0.2629768366332849, + "grad_norm": 0.024779365625928928, + "learning_rate": 0.000921464472619042, + "loss": 1.1744, "step": 430 }, { - "epoch": 0.0665086767066738, - "grad_norm": 0.10084432286928596, - "learning_rate": 0.0006651376146788991, - "loss": 1.3113, + "epoch": 0.2660347068266952, + "grad_norm": 0.02415651195912707, + "learning_rate": 0.0009185678686151322, + "loss": 1.1579, "step": 435 }, { - "epoch": 0.06727314425502637, - "grad_norm": 0.10873149177853551, - "learning_rate": 0.000672782874617737, - "loss": 1.264, + "epoch": 0.2690925770201055, + "grad_norm": 0.022815366652922293, + "learning_rate": 0.0009156235362889584, + "loss": 1.1745, "step": 440 }, { - "epoch": 0.06803761180337894, - "grad_norm": 0.08450706822759692, - "learning_rate": 0.0006804281345565749, - "loss": 1.2385, + "epoch": 0.2721504472135158, + "grad_norm": 0.021130481291703748, + "learning_rate": 0.0009126318113758999, + "loss": 1.1607, "step": 445 }, { - "epoch": 0.06880207935173152, - "grad_norm": 0.1055703401455179, - "learning_rate": 0.0006880733944954129, - "loss": 1.2603, + "epoch": 0.2752083174069261, + "grad_norm": 0.026167022396722542, + "learning_rate": 0.0009095930350154026, + "loss": 1.1874, "step": 450 }, { - "epoch": 0.0695665469000841, - "grad_norm": 0.09853955754584204, - "learning_rate": 0.0006957186544342507, - "loss": 1.2655, + "epoch": 0.2782661876003364, + "grad_norm": 0.03886381769784487, + "learning_rate": 0.0009065075537120796, + "loss": 1.1896, "step": 455 }, { - "epoch": 0.07033101444843666, - "grad_norm": 0.09398452513918377, - "learning_rate": 0.0007033639143730887, - "loss": 1.2665, + "epoch": 0.28132405779374664, + "grad_norm": 0.03341109366532242, + "learning_rate": 0.0009033757192962004, + "loss": 1.1575, "step": 460 }, { - "epoch": 0.07109548199678924, - "grad_norm": 0.0963643560310258, - "learning_rate": 0.0007110091743119266, - "loss": 1.2876, + "epoch": 0.28438192798715695, + "grad_norm": 0.03306894493562745, + "learning_rate": 0.0009001978888835723, + "loss": 1.1452, "step": 465 }, { - "epoch": 0.07185994954514181, - "grad_norm": 0.0856944074129867, - "learning_rate": 0.0007186544342507645, - "loss": 1.3032, + "epoch": 0.28743979818056725, + "grad_norm": 0.02506155876387002, + "learning_rate": 0.0008969744248348192, + "loss": 1.1589, "step": 470 }, { - "epoch": 0.07262441709349438, - "grad_norm": 0.08659127693558222, - "learning_rate": 0.0007262996941896025, - "loss": 1.3092, + "epoch": 0.2904976683739775, + "grad_norm": 0.025436594808100562, + "learning_rate": 0.0008937056947140625, + "loss": 1.188, "step": 475 }, { - "epoch": 0.07338888464184695, - "grad_norm": 0.0832981276048925, - "learning_rate": 0.0007339449541284405, - "loss": 1.2758, + "epoch": 0.2935555385673878, + "grad_norm": 0.023518916924014855, + "learning_rate": 0.0008903920712470089, + "loss": 1.1513, "step": 480 }, { - "epoch": 0.07415335219019953, - "grad_norm": 0.08230388249939921, - "learning_rate": 0.0007415902140672783, - "loss": 1.2788, + "epoch": 0.2966134087607981, + "grad_norm": 0.025275737643640807, + "learning_rate": 0.0008870339322784491, + "loss": 1.1646, "step": 485 }, { - "epoch": 0.0749178197385521, - "grad_norm": 0.10161902796868667, - "learning_rate": 0.0007492354740061163, - "loss": 1.2714, + "epoch": 0.2996712789542084, + "grad_norm": 0.02705678175117531, + "learning_rate": 0.0008836316607291732, + "loss": 1.1468, "step": 490 }, { - "epoch": 0.07568228728690467, - "grad_norm": 0.11416418403977804, - "learning_rate": 0.0007568807339449541, - "loss": 1.2577, + "epoch": 0.30272914914761867, + "grad_norm": 0.03300749247554839, + "learning_rate": 0.0008801856445523064, + "loss": 1.1593, "step": 495 }, { - "epoch": 0.07644675483525724, - "grad_norm": 0.09847812040362659, - "learning_rate": 0.0007645259938837921, - "loss": 1.2771, + "epoch": 0.305787019341029, + "grad_norm": 0.03070807591397227, + "learning_rate": 0.0008766962766890733, + "loss": 1.1494, "step": 500 }, { - "epoch": 0.07721122238360982, - "grad_norm": 0.08845565348706537, - "learning_rate": 0.00077217125382263, - "loss": 1.3169, + "epoch": 0.3088448895344393, + "grad_norm": 0.03171884578840513, + "learning_rate": 0.0008731639550239895, + "loss": 1.1549, "step": 505 }, { - "epoch": 0.07797568993196238, - "grad_norm": 0.09498376109239919, - "learning_rate": 0.0007798165137614678, - "loss": 1.2596, + "epoch": 0.31190275972784953, + "grad_norm": 0.026683099702115617, + "learning_rate": 0.0008695890823394938, + "loss": 1.1599, "step": 510 }, { - "epoch": 0.07874015748031496, - "grad_norm": 0.08367225972468065, - "learning_rate": 0.0007874617737003058, - "loss": 1.2634, + "epoch": 0.31496062992125984, + "grad_norm": 0.02544573139035867, + "learning_rate": 0.0008659720662700183, + "loss": 1.14, "step": 515 }, { - "epoch": 0.07950462502866754, - "grad_norm": 0.07963132772921086, - "learning_rate": 0.0007951070336391437, - "loss": 1.2872, + "epoch": 0.31801850011467014, + "grad_norm": 0.022400712579356837, + "learning_rate": 0.0008623133192555081, + "loss": 1.1631, "step": 520 }, { - "epoch": 0.0802690925770201, - "grad_norm": 0.0903050811552875, - "learning_rate": 0.0008027522935779816, - "loss": 1.2628, + "epoch": 0.3210763703080804, + "grad_norm": 0.021516644941970493, + "learning_rate": 0.0008586132584943902, + "loss": 1.1414, "step": 525 }, { - "epoch": 0.08103356012537268, - "grad_norm": 0.0835760948608985, - "learning_rate": 0.0008103975535168195, - "loss": 1.295, + "epoch": 0.3241342405014907, + "grad_norm": 0.022140987842416558, + "learning_rate": 0.0008548723058960033, + "loss": 1.1514, "step": 530 }, { - "epoch": 0.08179802767372525, - "grad_norm": 0.09170552594575584, - "learning_rate": 0.0008180428134556576, - "loss": 1.2793, + "epoch": 0.327192110694901, + "grad_norm": 0.02141121718297312, + "learning_rate": 0.0008510908880324864, + "loss": 1.1496, "step": 535 }, { - "epoch": 0.08256249522207783, - "grad_norm": 0.10994383729222293, - "learning_rate": 0.0008256880733944954, - "loss": 1.2947, + "epoch": 0.3302499808883113, + "grad_norm": 0.0230067385747688, + "learning_rate": 0.000847269436090139, + "loss": 1.1565, "step": 540 }, { - "epoch": 0.08332696277043039, - "grad_norm": 0.07270122130569258, - "learning_rate": 0.0008333333333333334, - "loss": 1.2592, + "epoch": 0.33330785108172156, + "grad_norm": 0.026857409244505942, + "learning_rate": 0.0008434083858202541, + "loss": 1.1541, "step": 545 }, { - "epoch": 0.08409143031878297, - "grad_norm": 0.09011634909964011, - "learning_rate": 0.0008409785932721713, - "loss": 1.2491, + "epoch": 0.33636572127513187, + "grad_norm": 0.031962019558439914, + "learning_rate": 0.0008395081774894297, + "loss": 1.1369, "step": 550 }, { - "epoch": 0.08485589786713554, - "grad_norm": 0.11082174674890546, - "learning_rate": 0.0008486238532110092, - "loss": 1.2943, + "epoch": 0.3394235914685422, + "grad_norm": 0.025328863714439345, + "learning_rate": 0.0008355692558293664, + "loss": 1.1462, "step": 555 }, { - "epoch": 0.0856203654154881, - "grad_norm": 0.08500973543905206, - "learning_rate": 0.0008562691131498471, - "loss": 1.2768, + "epoch": 0.3424814616619524, + "grad_norm": 0.02322136360750488, + "learning_rate": 0.0008315920699861561, + "loss": 1.1364, "step": 560 }, { - "epoch": 0.08638483296384068, - "grad_norm": 0.08246253797224005, - "learning_rate": 0.0008639143730886851, - "loss": 1.2644, + "epoch": 0.34553933185536273, + "grad_norm": 0.023916932106576035, + "learning_rate": 0.0008275770734690669, + "loss": 1.1376, "step": 565 }, { - "epoch": 0.08714930051219326, - "grad_norm": 0.08202189446656684, - "learning_rate": 0.0008715596330275229, - "loss": 1.2794, + "epoch": 0.34859720204877304, + "grad_norm": 0.027655600181262414, + "learning_rate": 0.0008235247240988294, + "loss": 1.1433, "step": 570 }, { - "epoch": 0.08791376806054584, - "grad_norm": 0.08492301097879416, - "learning_rate": 0.0008792048929663609, - "loss": 1.265, + "epoch": 0.35165507224218334, + "grad_norm": 0.01944294724363515, + "learning_rate": 0.0008194354839554344, + "loss": 1.1422, "step": 575 }, { - "epoch": 0.0886782356088984, - "grad_norm": 0.10488937164616725, - "learning_rate": 0.0008868501529051988, - "loss": 1.2663, + "epoch": 0.3547129424355936, + "grad_norm": 0.024586207516548813, + "learning_rate": 0.0008153098193254413, + "loss": 1.1481, "step": 580 }, { - "epoch": 0.08944270315725097, - "grad_norm": 0.09957818602896881, - "learning_rate": 0.0008944954128440367, - "loss": 1.2554, + "epoch": 0.3577708126290039, + "grad_norm": 0.02722431056639769, + "learning_rate": 0.0008111482006488091, + "loss": 1.1479, "step": 585 }, { - "epoch": 0.09020717070560355, - "grad_norm": 0.09037700230444715, - "learning_rate": 0.0009021406727828746, - "loss": 1.2805, + "epoch": 0.3608286828224142, + "grad_norm": 0.021338457814223475, + "learning_rate": 0.0008069511024652537, + "loss": 1.1444, "step": 590 }, { - "epoch": 0.09097163825395611, - "grad_norm": 0.08940645361248405, - "learning_rate": 0.0009097859327217126, - "loss": 1.2877, + "epoch": 0.36388655301582445, + "grad_norm": 0.025185341202656784, + "learning_rate": 0.0008027190033601363, + "loss": 1.1301, "step": 595 }, { - "epoch": 0.09173610580230869, - "grad_norm": 0.08800769193925378, - "learning_rate": 0.0009174311926605505, - "loss": 1.2453, + "epoch": 0.36694442320923476, + "grad_norm": 0.020971837960705533, + "learning_rate": 0.000798452385909892, + "loss": 1.1431, "step": 600 }, { - "epoch": 0.09250057335066127, - "grad_norm": 0.09302580222012974, - "learning_rate": 0.0009250764525993885, - "loss": 1.2957, + "epoch": 0.37000229340264507, + "grad_norm": 0.02495043534094607, + "learning_rate": 0.0007941517366270032, + "loss": 1.1356, "step": 605 }, { - "epoch": 0.09326504089901384, - "grad_norm": 0.08726126516074638, - "learning_rate": 0.0009327217125382263, - "loss": 1.2796, + "epoch": 0.37306016359605537, + "grad_norm": 0.025056486993634036, + "learning_rate": 0.0007898175459045224, + "loss": 1.1529, "step": 610 }, { - "epoch": 0.0940295084473664, - "grad_norm": 0.09201111299602344, - "learning_rate": 0.0009403669724770643, - "loss": 1.3069, + "epoch": 0.3761180337894656, + "grad_norm": 0.02276962025168977, + "learning_rate": 0.0007854503079601551, + "loss": 1.1236, "step": 615 }, { - "epoch": 0.09479397599571898, - "grad_norm": 0.10065773151168467, - "learning_rate": 0.0009480122324159022, - "loss": 1.2825, + "epoch": 0.37917590398287593, + "grad_norm": 0.021078280161330308, + "learning_rate": 0.0007810505207799046, + "loss": 1.137, "step": 620 }, { - "epoch": 0.09555844354407156, - "grad_norm": 0.07547253089389518, - "learning_rate": 0.00095565749235474, - "loss": 1.2775, + "epoch": 0.38223377417628623, + "grad_norm": 0.02117005306879687, + "learning_rate": 0.000776618686061288, + "loss": 1.1449, "step": 625 }, { - "epoch": 0.09632291109242412, - "grad_norm": 0.0795033812217181, - "learning_rate": 0.000963302752293578, - "loss": 1.3211, + "epoch": 0.3852916443696965, + "grad_norm": 0.024567794869791014, + "learning_rate": 0.0007721553091561288, + "loss": 1.139, "step": 630 }, { - "epoch": 0.0970873786407767, - "grad_norm": 0.08486183593424045, - "learning_rate": 0.0009709480122324159, - "loss": 1.2822, + "epoch": 0.3883495145631068, + "grad_norm": 0.019473444654760176, + "learning_rate": 0.000767660899012933, + "loss": 1.1386, "step": 635 }, { - "epoch": 0.09785184618912927, - "grad_norm": 0.09356435073800304, - "learning_rate": 0.0009785932721712539, - "loss": 1.3051, + "epoch": 0.3914073847565171, + "grad_norm": 0.022412121303153214, + "learning_rate": 0.0007631359681188544, + "loss": 1.1368, "step": 640 }, { - "epoch": 0.09861631373748185, - "grad_norm": 0.08681809761708813, - "learning_rate": 0.0009862385321100918, - "loss": 1.2784, + "epoch": 0.3944652549499274, + "grad_norm": 0.022050962340279187, + "learning_rate": 0.000758581032441257, + "loss": 1.1223, "step": 645 }, { - "epoch": 0.09938078128583441, - "grad_norm": 0.08805484661340418, - "learning_rate": 0.0009938837920489296, - "loss": 1.2898, + "epoch": 0.39752312514333765, + "grad_norm": 0.02203032943298021, + "learning_rate": 0.0007539966113688812, + "loss": 1.1167, "step": 650 }, { - "epoch": 0.10014524883418699, - "grad_norm": 0.10779188617692295, - "learning_rate": 0.0009999999287804496, - "loss": 1.3224, + "epoch": 0.40058099533674796, + "grad_norm": 0.023078837241982387, + "learning_rate": 0.0007493832276526181, + "loss": 1.1344, "step": 655 }, { - "epoch": 0.10090971638253957, - "grad_norm": 0.09449739996386951, - "learning_rate": 0.000999997436098313, - "loss": 1.2743, + "epoch": 0.40363886553015826, + "grad_norm": 0.023966897106968064, + "learning_rate": 0.0007447414073459017, + "loss": 1.1227, "step": 660 }, { - "epoch": 0.10167418393089213, - "grad_norm": 0.0963645222292544, - "learning_rate": 0.0009999913824589408, - "loss": 1.3092, + "epoch": 0.4066967357235685, + "grad_norm": 0.02168228889218337, + "learning_rate": 0.000740071679744725, + "loss": 1.1246, "step": 665 }, { - "epoch": 0.1024386514792447, - "grad_norm": 0.11266749758021803, - "learning_rate": 0.000999981767905447, - "loss": 1.296, + "epoch": 0.4097546059169788, + "grad_norm": 0.020601206009860356, + "learning_rate": 0.0007353745773272844, + "loss": 1.1227, "step": 670 }, { - "epoch": 0.10320311902759728, - "grad_norm": 0.37487450908071945, - "learning_rate": 0.000999968592506306, - "loss": 1.3499, + "epoch": 0.4128124761103891, + "grad_norm": 0.021802890558372243, + "learning_rate": 0.0007306506356932631, + "loss": 1.1037, "step": 675 }, { - "epoch": 0.10396758657594984, - "grad_norm": 0.14259324943638688, - "learning_rate": 0.000999951856355352, - "loss": 1.3757, + "epoch": 0.4158703463037994, + "grad_norm": 0.02158104255188381, + "learning_rate": 0.0007259003935027581, + "loss": 1.1166, "step": 680 }, { - "epoch": 0.10473205412430242, - "grad_norm": 0.30655516403662986, - "learning_rate": 0.0009999315595717797, - "loss": 1.4443, + "epoch": 0.4189282164972097, + "grad_norm": 0.02059398752407913, + "learning_rate": 0.0007211243924148578, + "loss": 1.1195, "step": 685 }, { - "epoch": 0.105496521672655, - "grad_norm": 1.0200782964851494, - "learning_rate": 0.000999907702300141, - "loss": 2.5802, + "epoch": 0.42198608669062, + "grad_norm": 0.022849828674870364, + "learning_rate": 0.0007163231770258777, + "loss": 1.1358, "step": 690 }, { - "epoch": 0.10626098922100757, - "grad_norm": 0.26954529309057906, - "learning_rate": 0.0009998802847103467, - "loss": 1.5332, + "epoch": 0.4250439568840303, + "grad_norm": 0.022923693857214923, + "learning_rate": 0.0007114972948072613, + "loss": 1.1388, "step": 695 }, { - "epoch": 0.10702545676936014, - "grad_norm": 0.2069498533750865, - "learning_rate": 0.0009998493069976635, - "loss": 1.464, + "epoch": 0.42810182707744054, + "grad_norm": 0.02318319704681315, + "learning_rate": 0.0007066472960431541, + "loss": 1.1154, "step": 700 }, { - "epoch": 0.10778992431771271, - "grad_norm": 0.11686570615912077, - "learning_rate": 0.000999814769382713, - "loss": 1.4344, + "epoch": 0.43115969727085085, + "grad_norm": 0.02746835308407499, + "learning_rate": 0.0007017737337676546, + "loss": 1.1222, "step": 705 }, { - "epoch": 0.10855439186606529, - "grad_norm": 0.24123211327141858, - "learning_rate": 0.0009997766721114708, - "loss": 1.4062, + "epoch": 0.43421756746426116, + "grad_norm": 0.02776028621219042, + "learning_rate": 0.0006968771637017541, + "loss": 1.1011, "step": 710 }, { - "epoch": 0.10931885941441785, - "grad_norm": 0.11666901013683244, - "learning_rate": 0.0009997350154552632, - "loss": 1.3302, + "epoch": 0.4372754376576714, + "grad_norm": 0.024266687223156804, + "learning_rate": 0.0006919581441899689, + "loss": 1.1246, "step": 715 }, { - "epoch": 0.11008332696277043, - "grad_norm": 0.12636047196811126, - "learning_rate": 0.000999689799710767, - "loss": 1.3883, + "epoch": 0.4403333078510817, + "grad_norm": 0.02908301149621474, + "learning_rate": 0.0006870172361366733, + "loss": 1.1225, "step": 720 }, { - "epoch": 0.110847794511123, - "grad_norm": 0.12003129463073069, - "learning_rate": 0.000999641025200007, - "loss": 1.3407, + "epoch": 0.443391178044492, + "grad_norm": 0.019862559304176748, + "learning_rate": 0.0006820550029421413, + "loss": 1.1367, "step": 725 }, { - "epoch": 0.11161226205947558, - "grad_norm": 0.09823944621605302, - "learning_rate": 0.0009995886922703524, - "loss": 1.3159, + "epoch": 0.4464490482379023, + "grad_norm": 0.032405405252682935, + "learning_rate": 0.000677072010438303, + "loss": 1.1234, "step": 730 }, { - "epoch": 0.11237672960782814, - "grad_norm": 0.13329784014265705, - "learning_rate": 0.0009995328012945158, - "loss": 1.3444, + "epoch": 0.4495069184313126, + "grad_norm": 0.030753416752446607, + "learning_rate": 0.0006720688268242248, + "loss": 1.1109, "step": 735 }, { - "epoch": 0.11314119715618072, - "grad_norm": 0.0786031784176081, - "learning_rate": 0.0009994733526705502, - "loss": 1.3057, + "epoch": 0.4525647886247229, + "grad_norm": 0.022597155712349652, + "learning_rate": 0.0006670460226013181, + "loss": 1.0944, "step": 740 }, { - "epoch": 0.1139056647045333, - "grad_norm": 0.07828669230906563, - "learning_rate": 0.0009994103468218456, - "loss": 1.3281, + "epoch": 0.4556226588181332, + "grad_norm": 0.022634198101240257, + "learning_rate": 0.0006620041705082871, + "loss": 1.1089, "step": 745 }, { - "epoch": 0.11467013225288586, - "grad_norm": 0.11852517280818008, - "learning_rate": 0.0009993437841971267, - "loss": 1.3017, + "epoch": 0.45868052901154344, + "grad_norm": 0.02271478233831073, + "learning_rate": 0.00065694384545582, + "loss": 1.1023, "step": 750 }, { - "epoch": 0.11543459980123844, - "grad_norm": 0.08558417909039735, - "learning_rate": 0.0009992736652704491, - "loss": 1.3356, + "epoch": 0.46173839920495374, + "grad_norm": 0.027216769976956785, + "learning_rate": 0.000651865624461034, + "loss": 1.1058, "step": 755 }, { - "epoch": 0.11619906734959101, - "grad_norm": 0.08594422889881462, - "learning_rate": 0.0009991999905411967, - "loss": 1.3238, + "epoch": 0.46479626939836405, + "grad_norm": 0.02071517184981896, + "learning_rate": 0.0006467700865816791, + "loss": 1.1083, "step": 760 }, { - "epoch": 0.11696353489794359, - "grad_norm": 0.19597834457509825, - "learning_rate": 0.0009991227605340768, - "loss": 1.3349, + "epoch": 0.46785413959177435, + "grad_norm": 0.02555353317127748, + "learning_rate": 0.0006416578128501092, + "loss": 1.0981, "step": 765 }, { - "epoch": 0.11772800244629615, - "grad_norm": 0.07985410977636408, - "learning_rate": 0.0009990419757991181, - "loss": 1.3364, + "epoch": 0.4709120097851846, + "grad_norm": 0.026631260066706193, + "learning_rate": 0.0006365293862070285, + "loss": 1.1091, "step": 770 }, { - "epoch": 0.11849246999464873, - "grad_norm": 0.1703047512929766, - "learning_rate": 0.0009989576369116653, - "loss": 1.3149, + "epoch": 0.4739698799785949, + "grad_norm": 0.022198028831592595, + "learning_rate": 0.0006313853914350207, + "loss": 1.1265, "step": 775 }, { - "epoch": 0.1192569375430013, - "grad_norm": 0.10335327963273161, - "learning_rate": 0.0009988697444723762, - "loss": 1.3157, + "epoch": 0.4770277501720052, + "grad_norm": 0.020216593671054946, + "learning_rate": 0.0006262264150918667, + "loss": 1.0885, "step": 780 }, { - "epoch": 0.12002140509135387, - "grad_norm": 0.08362146242319474, - "learning_rate": 0.000998778299107216, - "loss": 1.3042, + "epoch": 0.48008562036541547, + "grad_norm": 0.02929161675770138, + "learning_rate": 0.0006210530454436612, + "loss": 1.1049, "step": 785 }, { - "epoch": 0.12078587263970644, - "grad_norm": 0.08660098140550453, - "learning_rate": 0.0009986833014674543, - "loss": 1.3064, + "epoch": 0.48314349055882577, + "grad_norm": 0.022879526501220927, + "learning_rate": 0.0006158658723977341, + "loss": 1.1114, "step": 790 }, { - "epoch": 0.12155034018805902, - "grad_norm": 0.09078797333385379, - "learning_rate": 0.0009985847522296597, - "loss": 1.2792, + "epoch": 0.4862013607522361, + "grad_norm": 0.02011420292019708, + "learning_rate": 0.0006106654874353845, + "loss": 1.1147, "step": 795 }, { - "epoch": 0.1223148077364116, - "grad_norm": 0.09242815151780588, - "learning_rate": 0.000998482652095695, - "loss": 1.2806, + "epoch": 0.4892592309456464, + "grad_norm": 0.020765554185496773, + "learning_rate": 0.0006054524835444363, + "loss": 1.126, "step": 800 }, { - "epoch": 0.12307927528476416, - "grad_norm": 0.09723297340419826, - "learning_rate": 0.0009983770017927122, - "loss": 1.3326, + "epoch": 0.49231710113905663, + "grad_norm": 0.021368017020777876, + "learning_rate": 0.0006002274551516198, + "loss": 1.1141, "step": 805 }, { - "epoch": 0.12384374283311674, - "grad_norm": 0.09272895680580698, - "learning_rate": 0.0009982678020731476, - "loss": 1.3065, + "epoch": 0.49537497133246694, + "grad_norm": 0.023303801824493854, + "learning_rate": 0.0005949909980547917, + "loss": 1.1056, "step": 810 }, { - "epoch": 0.12460821038146931, - "grad_norm": 0.12136097817347706, - "learning_rate": 0.0009981550537147166, - "loss": 1.2986, + "epoch": 0.49843284152587725, + "grad_norm": 0.020988017572428172, + "learning_rate": 0.0005897437093549972, + "loss": 1.1046, "step": 815 }, { - "epoch": 0.12537267792982187, - "grad_norm": 0.08788778565772157, - "learning_rate": 0.000998038757520407, - "loss": 1.2805, + "epoch": 0.5014907117192875, + "grad_norm": 0.0222701220933236, + "learning_rate": 0.0005844861873883838, + "loss": 1.0896, "step": 820 }, { - "epoch": 0.12613714547817445, - "grad_norm": 0.08105841411135437, - "learning_rate": 0.000997918914318475, - "loss": 1.3446, + "epoch": 0.5045485819126978, + "grad_norm": 0.02390246675277257, + "learning_rate": 0.0005792190316579754, + "loss": 1.1003, "step": 825 }, { - "epoch": 0.12690161302652703, - "grad_norm": 0.07619675061909292, - "learning_rate": 0.0009977955249624378, - "loss": 1.316, + "epoch": 0.5076064521061081, + "grad_norm": 0.024599305753849318, + "learning_rate": 0.0005739428427653107, + "loss": 1.1006, "step": 830 }, { - "epoch": 0.1276660805748796, - "grad_norm": 0.0780011000474007, - "learning_rate": 0.0009976685903310685, - "loss": 1.2802, + "epoch": 0.5106643222995184, + "grad_norm": 0.021909626214719365, + "learning_rate": 0.0005686582223419595, + "loss": 1.0978, "step": 835 }, { - "epoch": 0.12843054812323218, - "grad_norm": 0.1123562503265519, - "learning_rate": 0.000997538111328389, - "loss": 1.3186, + "epoch": 0.5137221924929287, + "grad_norm": 0.020358006124918764, + "learning_rate": 0.0005633657729809194, + "loss": 1.0959, "step": 840 }, { - "epoch": 0.12919501567158473, - "grad_norm": 0.09717794248478596, - "learning_rate": 0.0009974040888836647, - "loss": 1.2766, + "epoch": 0.5167800626863389, + "grad_norm": 0.0198487307695255, + "learning_rate": 0.0005580660981679034, + "loss": 1.1, "step": 845 }, { - "epoch": 0.1299594832199373, - "grad_norm": 0.13339328046425755, - "learning_rate": 0.0009972665239513967, - "loss": 1.3051, + "epoch": 0.5198379328797492, + "grad_norm": 0.020615430577747694, + "learning_rate": 0.0005527598022125259, + "loss": 1.095, "step": 850 }, { - "epoch": 0.13072395076828988, - "grad_norm": 0.09665450497996363, - "learning_rate": 0.0009971254175113156, - "loss": 1.277, + "epoch": 0.5228958030731595, + "grad_norm": 0.02292927406023271, + "learning_rate": 0.0005474474901793951, + "loss": 1.1095, "step": 855 }, { - "epoch": 0.13148841831664246, - "grad_norm": 0.10830546508500209, - "learning_rate": 0.000996980770568375, - "loss": 1.2626, + "epoch": 0.5259536732665698, + "grad_norm": 0.023765712603430467, + "learning_rate": 0.0005421297678191189, + "loss": 1.0771, "step": 860 }, { - "epoch": 0.13225288586499503, - "grad_norm": 0.0892103153317403, - "learning_rate": 0.000996832584152743, - "loss": 1.2664, + "epoch": 0.5290115434599801, + "grad_norm": 0.01983795845597764, + "learning_rate": 0.0005368072414992314, + "loss": 1.1005, "step": 865 }, { - "epoch": 0.1330173534133476, - "grad_norm": 0.08476734354247635, - "learning_rate": 0.0009966808593197959, - "loss": 1.3118, + "epoch": 0.5320694136533904, + "grad_norm": 0.026870791253807334, + "learning_rate": 0.0005314805181350515, + "loss": 1.0942, "step": 870 }, { - "epoch": 0.1337818209617002, - "grad_norm": 0.07122127752767866, - "learning_rate": 0.0009965255971501107, - "loss": 1.2838, + "epoch": 0.5351272838468007, + "grad_norm": 0.01859295942083269, + "learning_rate": 0.0005261502051204772, + "loss": 1.0795, "step": 875 }, { - "epoch": 0.13454628851005274, - "grad_norm": 0.07151006579443676, - "learning_rate": 0.0009963667987494568, - "loss": 1.324, + "epoch": 0.538185154040211, + "grad_norm": 0.024323351242619737, + "learning_rate": 0.0005208169102587264, + "loss": 1.0888, "step": 880 }, { - "epoch": 0.1353107560584053, - "grad_norm": 0.08908137143916416, - "learning_rate": 0.000996204465248789, - "loss": 1.2627, + "epoch": 0.5412430242336213, + "grad_norm": 0.02524057653331344, + "learning_rate": 0.0005154812416930298, + "loss": 1.0908, "step": 885 }, { - "epoch": 0.1360752236067579, - "grad_norm": 0.07219573935906125, - "learning_rate": 0.0009960385978042383, - "loss": 1.2644, + "epoch": 0.5443008944270316, + "grad_norm": 0.018971386204347403, + "learning_rate": 0.0005101438078372858, + "loss": 1.0976, "step": 890 }, { - "epoch": 0.13683969115511047, - "grad_norm": 0.07819307629570971, - "learning_rate": 0.0009958691975971044, - "loss": 1.2889, + "epoch": 0.5473587646204419, + "grad_norm": 0.019908423848410486, + "learning_rate": 0.000504805217306685, + "loss": 1.093, "step": 895 }, { - "epoch": 0.13760415870346304, - "grad_norm": 0.07151962644193259, - "learning_rate": 0.0009956962658338472, - "loss": 1.2671, + "epoch": 0.5504166348138522, + "grad_norm": 0.0195002156944646, + "learning_rate": 0.0004994660788483113, + "loss": 1.0982, "step": 900 }, { - "epoch": 0.13836862625181562, - "grad_norm": 0.07987119674459224, - "learning_rate": 0.0009955198037460786, - "loss": 1.2814, + "epoch": 0.5534745050072625, + "grad_norm": 0.019088935901819853, + "learning_rate": 0.0004941270012717269, + "loss": 1.0946, "step": 905 }, { - "epoch": 0.1391330938001682, - "grad_norm": 0.07361684559744329, - "learning_rate": 0.0009953398125905528, - "loss": 1.2827, + "epoch": 0.5565323752006728, + "grad_norm": 0.020956181295062905, + "learning_rate": 0.000488788593379552, + "loss": 1.0927, "step": 910 }, { - "epoch": 0.13989756134852074, - "grad_norm": 0.06549715454074158, - "learning_rate": 0.000995156293649158, - "loss": 1.3029, + "epoch": 0.559590245394083, + "grad_norm": 0.02058468157622052, + "learning_rate": 0.00048345146389804425, + "loss": 1.0778, "step": 915 }, { - "epoch": 0.14066202889687332, - "grad_norm": 0.10854965228474792, - "learning_rate": 0.000994969248228907, - "loss": 1.252, + "epoch": 0.5626481155874933, + "grad_norm": 0.023584369471082292, + "learning_rate": 0.00047811622140768727, + "loss": 1.0821, "step": 920 }, { - "epoch": 0.1414264964452259, - "grad_norm": 0.08012814087734642, - "learning_rate": 0.000994778677661928, - "loss": 1.2825, + "epoch": 0.5657059857809036, + "grad_norm": 0.019349421074761057, + "learning_rate": 0.0004727834742737955, + "loss": 1.107, "step": 925 }, { - "epoch": 0.14219096399357847, - "grad_norm": 0.07656969405845815, - "learning_rate": 0.0009945845833054555, - "loss": 1.2584, + "epoch": 0.5687638559743139, + "grad_norm": 0.020725187288479826, + "learning_rate": 0.0004674538305771433, + "loss": 1.1124, "step": 930 }, { - "epoch": 0.14295543154193105, - "grad_norm": 0.0851834014068238, - "learning_rate": 0.00099438696654182, - "loss": 1.2469, + "epoch": 0.5718217261677242, + "grad_norm": 0.02330784277601695, + "learning_rate": 0.00046212789804462765, + "loss": 1.0776, "step": 935 }, { - "epoch": 0.14371989909028363, - "grad_norm": 0.08868829561728953, - "learning_rate": 0.0009941858287784383, - "loss": 1.3047, + "epoch": 0.5748795963611345, + "grad_norm": 0.020779970593931846, + "learning_rate": 0.0004568062839799696, + "loss": 1.0936, "step": 940 }, { - "epoch": 0.1444843666386362, - "grad_norm": 0.10386951169055887, - "learning_rate": 0.0009939811714478036, - "loss": 1.2649, + "epoch": 0.5779374665545448, + "grad_norm": 0.02310004433856537, + "learning_rate": 0.000451489595194466, + "loss": 1.0816, "step": 945 }, { - "epoch": 0.14524883418698875, - "grad_norm": 0.13301369066388727, - "learning_rate": 0.0009937729960074755, - "loss": 1.253, + "epoch": 0.580995336747955, + "grad_norm": 0.020049876509611084, + "learning_rate": 0.00044617843793779517, + "loss": 1.0776, "step": 950 }, { - "epoch": 0.14601330173534133, - "grad_norm": 0.10263670019334172, - "learning_rate": 0.000993561303940069, - "loss": 1.2709, + "epoch": 0.5840532069413653, + "grad_norm": 0.019669039054560983, + "learning_rate": 0.00044087341782888823, + "loss": 1.0813, "step": 955 }, { - "epoch": 0.1467777692836939, - "grad_norm": 0.10665302368773437, - "learning_rate": 0.0009933460967532454, - "loss": 1.2766, + "epoch": 0.5871110771347756, + "grad_norm": 0.019875654721293167, + "learning_rate": 0.0004355751397868713, + "loss": 1.0899, "step": 960 }, { - "epoch": 0.14754223683204648, - "grad_norm": 0.087814360033473, - "learning_rate": 0.0009931273759796989, - "loss": 1.285, + "epoch": 0.5901689473281859, + "grad_norm": 0.020596024583719397, + "learning_rate": 0.0004302842079620885, + "loss": 1.0753, "step": 965 }, { - "epoch": 0.14830670438039906, - "grad_norm": 0.0975006197140381, - "learning_rate": 0.0009929051431771486, - "loss": 1.2741, + "epoch": 0.5932268175215962, + "grad_norm": 0.020513561070135005, + "learning_rate": 0.0004250012256672117, + "loss": 1.0659, "step": 970 }, { - "epoch": 0.14907117192875163, - "grad_norm": 0.07858075681067718, - "learning_rate": 0.0009926793999283255, - "loss": 1.2972, + "epoch": 0.5962846877150065, + "grad_norm": 0.021144651470144736, + "learning_rate": 0.00041972679530844647, + "loss": 1.0839, "step": 975 }, { - "epoch": 0.1498356394771042, - "grad_norm": 0.0844266755046617, - "learning_rate": 0.0009924501478409617, - "loss": 1.2449, + "epoch": 0.5993425579084168, + "grad_norm": 0.021711998053494025, + "learning_rate": 0.0004144615183168409, + "loss": 1.0849, "step": 980 }, { - "epoch": 0.15060010702545676, - "grad_norm": 0.08128550636606187, - "learning_rate": 0.0009922173885477798, - "loss": 1.2777, + "epoch": 0.602400428101827, + "grad_norm": 0.022407376804965174, + "learning_rate": 0.0004092059950797062, + "loss": 1.0609, "step": 985 }, { - "epoch": 0.15136457457380934, - "grad_norm": 0.08733018440635835, - "learning_rate": 0.0009919811237064798, - "loss": 1.256, + "epoch": 0.6054582982952373, + "grad_norm": 0.02333806103193432, + "learning_rate": 0.0004039608248721548, + "loss": 1.0886, "step": 990 }, { - "epoch": 0.1521290421221619, - "grad_norm": 0.0866321204115484, - "learning_rate": 0.0009917413549997284, - "loss": 1.3139, + "epoch": 0.6085161684886476, + "grad_norm": 0.024102796234038874, + "learning_rate": 0.00039872660578876785, + "loss": 1.0737, "step": 995 }, { - "epoch": 0.1528935096705145, - "grad_norm": 0.08070145917396142, - "learning_rate": 0.0009914980841351465, - "loss": 1.2179, + "epoch": 0.611574038682058, + "grad_norm": 0.019357290501165046, + "learning_rate": 0.00039350393467539515, + "loss": 1.08, "step": 1000 }, { - "epoch": 0.15365797721886706, - "grad_norm": 0.08759371152417407, - "learning_rate": 0.0009912513128452973, - "loss": 1.2498, + "epoch": 0.6146319088754683, + "grad_norm": 0.020142530815836432, + "learning_rate": 0.0003882934070610981, + "loss": 1.0656, "step": 1005 }, { - "epoch": 0.15442244476721964, - "grad_norm": 0.07992027185128928, - "learning_rate": 0.000991001042887674, - "loss": 1.2898, + "epoch": 0.6176897790688786, + "grad_norm": 0.018608575204815583, + "learning_rate": 0.00038309561709024356, + "loss": 1.0637, "step": 1010 }, { - "epoch": 0.15518691231557222, - "grad_norm": 0.09915223808905828, - "learning_rate": 0.0009907472760446868, - "loss": 1.2784, + "epoch": 0.6207476492622889, + "grad_norm": 0.021214222795490626, + "learning_rate": 0.0003779111574547537, + "loss": 1.08, "step": 1015 }, { - "epoch": 0.15595137986392477, - "grad_norm": 0.07529248014649702, - "learning_rate": 0.0009904900141236505, - "loss": 1.3012, + "epoch": 0.6238055194556991, + "grad_norm": 0.022933622407233706, + "learning_rate": 0.00037274061932652413, + "loss": 1.0664, "step": 1020 }, { - "epoch": 0.15671584741227734, - "grad_norm": 0.07116825555952118, - "learning_rate": 0.0009902292589567723, - "loss": 1.225, + "epoch": 0.6268633896491094, + "grad_norm": 0.019706448651309442, + "learning_rate": 0.000367584592290013, + "loss": 1.0806, "step": 1025 }, { - "epoch": 0.15748031496062992, - "grad_norm": 0.0721270831702372, - "learning_rate": 0.0009899650124011376, - "loss": 1.2453, + "epoch": 0.6299212598425197, + "grad_norm": 0.02379452277464871, + "learning_rate": 0.00036244366427501265, + "loss": 1.0664, "step": 1030 }, { - "epoch": 0.1582447825089825, - "grad_norm": 0.08089190163094111, - "learning_rate": 0.0009896972763386973, - "loss": 1.2494, + "epoch": 0.63297913003593, + "grad_norm": 0.022836969922118866, + "learning_rate": 0.0003573184214896087, + "loss": 1.0702, "step": 1035 }, { - "epoch": 0.15900925005733507, - "grad_norm": 0.07205992090968301, - "learning_rate": 0.0009894260526762547, - "loss": 1.2383, + "epoch": 0.6360370002293403, + "grad_norm": 0.0220847265087786, + "learning_rate": 0.0003522094483533369, + "loss": 1.0777, "step": 1040 }, { - "epoch": 0.15977371760568765, - "grad_norm": 0.32600838075966987, - "learning_rate": 0.0009891513433454512, - "loss": 1.2791, + "epoch": 0.6390948704227506, + "grad_norm": 0.021238471439531986, + "learning_rate": 0.000347117327430542, + "loss": 1.0827, "step": 1045 }, { - "epoch": 0.1605381851540402, - "grad_norm": 0.06955922752630836, - "learning_rate": 0.0009888731503027535, - "loss": 1.2568, + "epoch": 0.6421527406161608, + "grad_norm": 0.01967371179793323, + "learning_rate": 0.0003420426393639508, + "loss": 1.0638, "step": 1050 }, { - "epoch": 0.16130265270239277, - "grad_norm": 0.10616255888347151, - "learning_rate": 0.0009885914755294386, - "loss": 1.2747, + "epoch": 0.6452106108095711, + "grad_norm": 0.022296186314483487, + "learning_rate": 0.00033698596280846125, + "loss": 1.0639, "step": 1055 }, { - "epoch": 0.16206712025074535, - "grad_norm": 0.11210822875448243, - "learning_rate": 0.0009883063210315804, - "loss": 1.3125, + "epoch": 0.6482684810029814, + "grad_norm": 0.02241422770345421, + "learning_rate": 0.0003319478743651609, + "loss": 1.0851, "step": 1060 }, { - "epoch": 0.16283158779909793, - "grad_norm": 0.10366169122798494, - "learning_rate": 0.0009880176888400352, - "loss": 1.3034, + "epoch": 0.6513263511963917, + "grad_norm": 0.02523497459965589, + "learning_rate": 0.0003269289485155783, + "loss": 1.0847, "step": 1065 }, { - "epoch": 0.1635960553474505, - "grad_norm": 0.6763547146869965, - "learning_rate": 0.0009877255810104275, - "loss": 1.2877, + "epoch": 0.654384221389802, + "grad_norm": 0.025512702330510183, + "learning_rate": 0.00032192975755617506, + "loss": 1.0662, "step": 1070 }, { - "epoch": 0.16436052289580308, - "grad_norm": 0.12323825764543753, - "learning_rate": 0.000987429999623135, - "loss": 1.2683, + "epoch": 0.6574420915832123, + "grad_norm": 0.019599747988122102, + "learning_rate": 0.00031695087153308976, + "loss": 1.0766, "step": 1075 }, { - "epoch": 0.16512499044415566, - "grad_norm": 0.14377535741587366, - "learning_rate": 0.0009871309467832737, - "loss": 1.3187, + "epoch": 0.6604999617766226, + "grad_norm": 0.01962272505893558, + "learning_rate": 0.0003119928581771362, + "loss": 1.0625, "step": 1080 }, { - "epoch": 0.1658894579925082, - "grad_norm": 0.0913096688213412, - "learning_rate": 0.0009868284246206834, - "loss": 1.2987, + "epoch": 0.6635578319700328, + "grad_norm": 0.019771169590040102, + "learning_rate": 0.0003070562828390662, + "loss": 1.0689, "step": 1085 }, { - "epoch": 0.16665392554086078, - "grad_norm": 0.1261260904406275, - "learning_rate": 0.0009865224352899118, - "loss": 1.2607, + "epoch": 0.6666157021634431, + "grad_norm": 0.02030493911602512, + "learning_rate": 0.00030214170842510435, + "loss": 1.0769, "step": 1090 }, { - "epoch": 0.16741839308921336, - "grad_norm": 0.07986542658284047, - "learning_rate": 0.0009862129809702005, - "loss": 1.2815, + "epoch": 0.6696735723568534, + "grad_norm": 0.024601514965605153, + "learning_rate": 0.0002972496953327607, + "loss": 1.062, "step": 1095 }, { - "epoch": 0.16818286063756593, - "grad_norm": 0.11156681330527404, - "learning_rate": 0.0009859000638654674, - "loss": 1.2537, + "epoch": 0.6727314425502637, + "grad_norm": 0.01994828738024262, + "learning_rate": 0.00029238080138693025, + "loss": 1.0775, "step": 1100 }, { - "epoch": 0.1689473281859185, - "grad_norm": 0.07080066680629181, - "learning_rate": 0.000985583686204293, - "loss": 1.2753, + "epoch": 0.675789312743674, + "grad_norm": 0.028014612142516805, + "learning_rate": 0.0002875355817762856, + "loss": 1.0567, "step": 1105 }, { - "epoch": 0.1697117957342711, - "grad_norm": 0.08751280576746387, - "learning_rate": 0.0009852638502399036, - "loss": 1.2832, + "epoch": 0.6788471829370843, + "grad_norm": 0.02154035563943042, + "learning_rate": 0.0002827145889899698, + "loss": 1.0451, "step": 1110 }, { - "epoch": 0.17047626328262366, - "grad_norm": 0.07513697430712966, - "learning_rate": 0.0009849405582501552, - "loss": 1.2826, + "epoch": 0.6819050531304947, + "grad_norm": 0.019908200712777067, + "learning_rate": 0.0002779183727545973, + "loss": 1.0715, "step": 1115 }, { - "epoch": 0.1712407308309762, - "grad_norm": 0.07040068722014306, - "learning_rate": 0.0009846138125375174, - "loss": 1.3079, + "epoch": 0.6849629233239048, + "grad_norm": 0.019831506624958018, + "learning_rate": 0.0002731474799715701, + "loss": 1.0794, "step": 1120 }, { - "epoch": 0.1720051983793288, - "grad_norm": 0.07554763974923043, - "learning_rate": 0.0009842836154290576, - "loss": 1.2688, + "epoch": 0.6880207935173152, + "grad_norm": 0.020609762986790316, + "learning_rate": 0.00026840245465471535, + "loss": 1.0648, "step": 1125 }, { - "epoch": 0.17276966592768137, - "grad_norm": 0.0756416076694629, - "learning_rate": 0.0009839499692764226, - "loss": 1.2845, + "epoch": 0.6910786637107255, + "grad_norm": 0.018373460203891627, + "learning_rate": 0.0002636838378682534, + "loss": 1.0642, "step": 1130 }, { - "epoch": 0.17353413347603394, - "grad_norm": 0.08613603811511952, - "learning_rate": 0.0009836128764558248, - "loss": 1.2946, + "epoch": 0.6941365339041358, + "grad_norm": 0.021227917010586402, + "learning_rate": 0.00025899216766510114, + "loss": 1.0574, "step": 1135 }, { - "epoch": 0.17429860102438652, - "grad_norm": 0.09369729151974834, - "learning_rate": 0.000983272339368022, - "loss": 1.2453, + "epoch": 0.6971944040975461, + "grad_norm": 0.021984002313747204, + "learning_rate": 0.0002543279790255192, + "loss": 1.0603, "step": 1140 }, { - "epoch": 0.1750630685727391, - "grad_norm": 0.07994080822915721, - "learning_rate": 0.0009829283604383034, - "loss": 1.2671, + "epoch": 0.7002522742909564, + "grad_norm": 0.02355426186863231, + "learning_rate": 0.00024969180379610934, + "loss": 1.0586, "step": 1145 }, { - "epoch": 0.17582753612109167, - "grad_norm": 0.10575664278992154, - "learning_rate": 0.0009825809421164695, - "loss": 1.2497, + "epoch": 0.7033101444843667, + "grad_norm": 0.024678418156208182, + "learning_rate": 0.00024508417062916885, + "loss": 1.0661, "step": 1150 }, { - "epoch": 0.17659200366944422, - "grad_norm": 0.10195388695968033, - "learning_rate": 0.0009822300868768169, - "loss": 1.2472, + "epoch": 0.7063680146777769, + "grad_norm": 0.02082314009145273, + "learning_rate": 0.0002405056049224103, + "loss": 1.0651, "step": 1155 }, { - "epoch": 0.1773564712177968, - "grad_norm": 0.08122506603589001, - "learning_rate": 0.000981875797218119, - "loss": 1.244, + "epoch": 0.7094258848711872, + "grad_norm": 0.02162054379835859, + "learning_rate": 0.0002359566287590511, + "loss": 1.0715, "step": 1160 }, { - "epoch": 0.17812093876614937, - "grad_norm": 0.09114356191080156, - "learning_rate": 0.00098151807566361, - "loss": 1.2631, + "epoch": 0.7124837550645975, + "grad_norm": 0.021178840967147644, + "learning_rate": 0.00023143776084828178, + "loss": 1.045, "step": 1165 }, { - "epoch": 0.17888540631450195, - "grad_norm": 0.12085241472592927, - "learning_rate": 0.0009811569247609646, - "loss": 1.2685, + "epoch": 0.7155416252580078, + "grad_norm": 0.023789035651104056, + "learning_rate": 0.00022694951646611884, + "loss": 1.0618, "step": 1170 }, { - "epoch": 0.17964987386285453, - "grad_norm": 0.10376652091732114, - "learning_rate": 0.0009807923470822825, - "loss": 1.2457, + "epoch": 0.7185994954514181, + "grad_norm": 0.019246434577242334, + "learning_rate": 0.0002224924073966484, + "loss": 1.0638, "step": 1175 }, { - "epoch": 0.1804143414112071, - "grad_norm": 0.08582105488308488, - "learning_rate": 0.0009804243452240676, - "loss": 1.2655, + "epoch": 0.7216573656448284, + "grad_norm": 0.018166777369856113, + "learning_rate": 0.00021806694187366976, + "loss": 1.0551, "step": 1180 }, { - "epoch": 0.18117880895955968, - "grad_norm": 0.08069706067779515, - "learning_rate": 0.0009800529218072112, - "loss": 1.242, + "epoch": 0.7247152358382387, + "grad_norm": 0.01932908428350948, + "learning_rate": 0.00021367362452274198, + "loss": 1.0559, "step": 1185 }, { - "epoch": 0.18194327650791223, - "grad_norm": 0.09899431989367119, - "learning_rate": 0.0009796780794769726, - "loss": 1.2136, + "epoch": 0.7277731060316489, + "grad_norm": 0.019018633917993133, + "learning_rate": 0.00020931295630364222, + "loss": 1.0542, "step": 1190 }, { - "epoch": 0.1827077440562648, - "grad_norm": 0.0721358139689406, - "learning_rate": 0.0009792998209029607, - "loss": 1.2691, + "epoch": 0.7308309762250592, + "grad_norm": 0.019157449963553598, + "learning_rate": 0.00020498543445324346, + "loss": 1.0572, "step": 1195 }, { - "epoch": 0.18347221160461738, - "grad_norm": 0.1851037362834397, - "learning_rate": 0.0009789181487791145, - "loss": 1.252, + "epoch": 0.7338888464184695, + "grad_norm": 0.018701309680460174, + "learning_rate": 0.00020069155242881525, + "loss": 1.064, "step": 1200 }, { - "epoch": 0.18423667915296996, - "grad_norm": 0.07551235120975437, - "learning_rate": 0.0009785330658236841, - "loss": 1.2771, + "epoch": 0.7369467166118798, + "grad_norm": 0.020107359392638562, + "learning_rate": 0.00019643179985175525, + "loss": 1.0427, "step": 1205 }, { - "epoch": 0.18500114670132253, - "grad_norm": 0.07538379469221401, - "learning_rate": 0.0009781445747792114, - "loss": 1.2605, + "epoch": 0.7400045868052901, + "grad_norm": 0.018162781703447546, + "learning_rate": 0.00019220666245176016, + "loss": 1.0486, "step": 1210 }, { - "epoch": 0.1857656142496751, - "grad_norm": 0.07311552086280827, - "learning_rate": 0.0009777526784125105, - "loss": 1.2735, + "epoch": 0.7430624569987004, + "grad_norm": 0.019847164271580075, + "learning_rate": 0.00018801662201143815, + "loss": 1.0353, "step": 1215 }, { - "epoch": 0.18653008179802769, - "grad_norm": 0.06571777631977245, - "learning_rate": 0.0009773573795146485, - "loss": 1.2642, + "epoch": 0.7461203271921107, + "grad_norm": 0.020732633406869302, + "learning_rate": 0.00018386215631137248, + "loss": 1.064, "step": 1220 }, { - "epoch": 0.18729454934638023, - "grad_norm": 0.23163418700965988, - "learning_rate": 0.0009769586809009244, - "loss": 1.2063, + "epoch": 0.7491781973855209, + "grad_norm": 0.020348214250739332, + "learning_rate": 0.00017974373907564167, + "loss": 1.052, "step": 1225 }, { - "epoch": 0.1880590168947328, - "grad_norm": 0.07247911397915323, - "learning_rate": 0.0009765565854108503, - "loss": 1.2543, + "epoch": 0.7522360675789312, + "grad_norm": 0.019382936149259185, + "learning_rate": 0.000175661839917801, + "loss": 1.0632, "step": 1230 }, { - "epoch": 0.1888234844430854, - "grad_norm": 0.07853414468894612, - "learning_rate": 0.0009761510959081305, - "loss": 1.2966, + "epoch": 0.7552939377723416, + "grad_norm": 0.020617077209785874, + "learning_rate": 0.0001716169242873346, + "loss": 1.0603, "step": 1235 }, { - "epoch": 0.18958795199143796, - "grad_norm": 0.07106732796502367, - "learning_rate": 0.0009757422152806414, - "loss": 1.2454, + "epoch": 0.7583518079657519, + "grad_norm": 0.018721534354599865, + "learning_rate": 0.0001676094534165807, + "loss": 1.0504, "step": 1240 }, { - "epoch": 0.19035241953979054, - "grad_norm": 0.06666091797081679, - "learning_rate": 0.0009753299464404109, - "loss": 1.252, + "epoch": 0.7614096781591622, + "grad_norm": 0.019523624424920942, + "learning_rate": 0.00016363988426813865, + "loss": 1.0605, "step": 1245 }, { - "epoch": 0.19111688708814312, - "grad_norm": 0.0809358295124488, - "learning_rate": 0.0009749142923235974, - "loss": 1.2823, + "epoch": 0.7644675483525725, + "grad_norm": 0.021325379023544762, + "learning_rate": 0.00015970866948276208, + "loss": 1.0502, "step": 1250 }, { - "epoch": 0.1918813546364957, - "grad_norm": 0.07398963462471533, - "learning_rate": 0.0009744952558904692, - "loss": 1.2211, + "epoch": 0.7675254185459828, + "grad_norm": 0.018381651158354734, + "learning_rate": 0.00015581625732774562, + "loss": 1.0695, "step": 1255 }, { - "epoch": 0.19264582218484824, - "grad_norm": 0.06648581697987127, - "learning_rate": 0.000974072840125383, - "loss": 1.2452, + "epoch": 0.770583288739393, + "grad_norm": 0.017773314789403843, + "learning_rate": 0.00015196309164581046, + "loss": 1.0564, "step": 1260 }, { - "epoch": 0.19341028973320082, - "grad_norm": 0.08734480773613341, - "learning_rate": 0.0009736470480367634, - "loss": 1.2983, + "epoch": 0.7736411589328033, + "grad_norm": 0.018755748204595284, + "learning_rate": 0.00014814961180449339, + "loss": 1.0372, "step": 1265 }, { - "epoch": 0.1941747572815534, - "grad_norm": 0.0814299487892339, - "learning_rate": 0.000973217882657081, - "loss": 1.2668, + "epoch": 0.7766990291262136, + "grad_norm": 0.025065839963610673, + "learning_rate": 0.0001443762526460468, + "loss": 1.058, "step": 1270 }, { - "epoch": 0.19493922482990597, - "grad_norm": 0.08233314539840832, - "learning_rate": 0.00097278534704283, - "loss": 1.2196, + "epoch": 0.7797568993196239, + "grad_norm": 0.022671037009366964, + "learning_rate": 0.00014064344443785504, + "loss": 1.0486, "step": 1275 }, { - "epoch": 0.19570369237825855, - "grad_norm": 0.06781465946994458, - "learning_rate": 0.0009723494442745084, - "loss": 1.2194, + "epoch": 0.7828147695130342, + "grad_norm": 0.021001365810248977, + "learning_rate": 0.0001369516128233716, + "loss": 1.0684, "step": 1280 }, { - "epoch": 0.19646815992661112, - "grad_norm": 0.09257477131832359, - "learning_rate": 0.0009719101774565941, - "loss": 1.228, + "epoch": 0.7858726397064445, + "grad_norm": 0.02187887791955955, + "learning_rate": 0.00013330117877358383, + "loss": 1.0488, "step": 1285 }, { - "epoch": 0.1972326274749637, - "grad_norm": 0.07505866072283653, - "learning_rate": 0.000971467549717524, - "loss": 1.2683, + "epoch": 0.7889305098998548, + "grad_norm": 0.017361526578702963, + "learning_rate": 0.00012969255853901142, + "loss": 1.0378, "step": 1290 }, { - "epoch": 0.19799709502331625, - "grad_norm": 0.06785480940064653, - "learning_rate": 0.0009710215642096711, - "loss": 1.249, + "epoch": 0.791988380093265, + "grad_norm": 0.01766771162161045, + "learning_rate": 0.00012612616360224138, + "loss": 1.0369, "step": 1295 }, { - "epoch": 0.19876156257166883, - "grad_norm": 0.06775548552563401, - "learning_rate": 0.0009705722241093222, - "loss": 1.2575, + "epoch": 0.7950462502866753, + "grad_norm": 0.021322450785295783, + "learning_rate": 0.00012260240063100846, + "loss": 1.0639, "step": 1300 }, { - "epoch": 0.1995260301200214, - "grad_norm": 0.06486741211895572, - "learning_rate": 0.0009701195326166555, - "loss": 1.2114, + "epoch": 0.7981041204800856, + "grad_norm": 0.020380850502793722, + "learning_rate": 0.00011912167143182279, + "loss": 1.0493, "step": 1305 }, { - "epoch": 0.20029049766837398, - "grad_norm": 0.07620009922689974, - "learning_rate": 0.0009696634929557178, - "loss": 1.2423, + "epoch": 0.8011619906734959, + "grad_norm": 0.018397926034012132, + "learning_rate": 0.00011568437290415395, + "loss": 1.0541, "step": 1310 }, { - "epoch": 0.20105496521672656, - "grad_norm": 0.07533128179658055, - "learning_rate": 0.0009692041083744011, - "loss": 1.1992, + "epoch": 0.8042198608669062, + "grad_norm": 0.017133257784555725, + "learning_rate": 0.00011229089699517242, + "loss": 1.0523, "step": 1315 }, { - "epoch": 0.20181943276507913, - "grad_norm": 0.08244330003493801, - "learning_rate": 0.0009687413821444199, - "loss": 1.2613, + "epoch": 0.8072777310603165, + "grad_norm": 0.01819525737040671, + "learning_rate": 0.00010894163065505753, + "loss": 1.0468, "step": 1320 }, { - "epoch": 0.2025839003134317, - "grad_norm": 0.06717152873434033, - "learning_rate": 0.0009682753175612876, - "loss": 1.2424, + "epoch": 0.8103356012537268, + "grad_norm": 0.02028545883645919, + "learning_rate": 0.00010563695579287402, + "loss": 1.0629, "step": 1325 }, { - "epoch": 0.20334836786178426, - "grad_norm": 0.10045766887445413, - "learning_rate": 0.0009678059179442935, - "loss": 1.2244, + "epoch": 0.813393471447137, + "grad_norm": 0.01787173264557234, + "learning_rate": 0.00010237724923302406, + "loss": 1.0351, "step": 1330 }, { - "epoch": 0.20411283541013683, - "grad_norm": 0.06751762043917765, - "learning_rate": 0.0009673331866364786, - "loss": 1.2375, + "epoch": 0.8164513416405473, + "grad_norm": 0.018194742191756992, + "learning_rate": 9.916288267227824e-05, + "loss": 1.0598, "step": 1335 }, { - "epoch": 0.2048773029584894, - "grad_norm": 0.07281491279798423, - "learning_rate": 0.0009668571270046121, - "loss": 1.2563, + "epoch": 0.8195092118339576, + "grad_norm": 0.019378519353161313, + "learning_rate": 9.599422263739289e-05, + "loss": 1.0608, "step": 1340 }, { - "epoch": 0.205641770506842, - "grad_norm": 0.07449446576944249, - "learning_rate": 0.0009663777424391675, - "loss": 1.2537, + "epoch": 0.822567082027368, + "grad_norm": 0.01902789612845712, + "learning_rate": 9.287163044331498e-05, + "loss": 1.0501, "step": 1345 }, { - "epoch": 0.20640623805519456, - "grad_norm": 0.06486106585475247, - "learning_rate": 0.0009658950363542979, - "loss": 1.2626, + "epoch": 0.8256249522207783, + "grad_norm": 0.018581047910824787, + "learning_rate": 8.979546215198265e-05, + "loss": 1.0458, "step": 1350 }, { - "epoch": 0.20717070560354714, - "grad_norm": 0.068273879782427, - "learning_rate": 0.0009654090121878125, - "loss": 1.2601, + "epoch": 0.8286828224141886, + "grad_norm": 0.01799606775719105, + "learning_rate": 8.676606853172426e-05, + "loss": 1.0403, "step": 1355 }, { - "epoch": 0.2079351731518997, - "grad_norm": 0.07549308029701757, - "learning_rate": 0.0009649196734011519, - "loss": 1.2414, + "epoch": 0.8317406926075988, + "grad_norm": 0.01874997273444636, + "learning_rate": 8.378379501726096e-05, + "loss": 1.0672, "step": 1360 }, { - "epoch": 0.20869964070025226, - "grad_norm": 0.07060653649003629, - "learning_rate": 0.0009644270234793623, - "loss": 1.2503, + "epoch": 0.8347985628010091, + "grad_norm": 0.01729209570770116, + "learning_rate": 8.084898167031746e-05, + "loss": 1.0519, "step": 1365 }, { - "epoch": 0.20946410824860484, - "grad_norm": 0.5778206445656846, - "learning_rate": 0.0009639310659310729, - "loss": 1.2925, + "epoch": 0.8378564329944194, + "grad_norm": 0.020489335055326883, + "learning_rate": 7.796196314084614e-05, + "loss": 1.0332, "step": 1370 }, { - "epoch": 0.21022857579695742, - "grad_norm": 0.086771304690071, - "learning_rate": 0.0009634318042884689, - "loss": 1.286, + "epoch": 0.8409143031878297, + "grad_norm": 0.018824889864362003, + "learning_rate": 7.512306862886681e-05, + "loss": 1.0349, "step": 1375 }, { - "epoch": 0.21099304334531, - "grad_norm": 0.07780731029224541, - "learning_rate": 0.000962929242107267, - "loss": 1.2886, + "epoch": 0.84397217338124, + "grad_norm": 0.018493904385427234, + "learning_rate": 7.233262184692974e-05, + "loss": 1.0458, "step": 1380 }, { - "epoch": 0.21175751089366257, - "grad_norm": 0.3625052155879589, - "learning_rate": 0.0009624233829666907, - "loss": 1.208, + "epoch": 0.8470300435746503, + "grad_norm": 0.021431180850593753, + "learning_rate": 6.959094098320295e-05, + "loss": 1.042, "step": 1385 }, { - "epoch": 0.21252197844201515, - "grad_norm": 0.06645909024594576, - "learning_rate": 0.0009619142304694439, - "loss": 1.2426, + "epoch": 0.8500879137680606, + "grad_norm": 0.01774287881523113, + "learning_rate": 6.68983386651898e-05, + "loss": 1.0463, "step": 1390 }, { - "epoch": 0.2132864459903677, - "grad_norm": 0.10577040811890297, - "learning_rate": 0.0009614017882416859, - "loss": 1.2787, + "epoch": 0.8531457839614708, + "grad_norm": 0.019191338242596227, + "learning_rate": 6.425512192408112e-05, + "loss": 1.0428, "step": 1395 }, { - "epoch": 0.21405091353872027, - "grad_norm": 0.09769123070834913, - "learning_rate": 0.0009608860599330048, - "loss": 1.2174, + "epoch": 0.8562036541548811, + "grad_norm": 0.01894785926804795, + "learning_rate": 6.166159215974521e-05, + "loss": 1.0435, "step": 1400 }, { - "epoch": 0.21481538108707285, - "grad_norm": 0.08553593141915698, - "learning_rate": 0.0009603670492163926, - "loss": 1.2382, + "epoch": 0.8592615243482914, + "grad_norm": 0.016825213446883102, + "learning_rate": 5.911804510635965e-05, + "loss": 1.0447, "step": 1405 }, { - "epoch": 0.21557984863542543, - "grad_norm": 0.08406132520975527, - "learning_rate": 0.0009598447597882181, - "loss": 1.2541, + "epoch": 0.8623193945417017, + "grad_norm": 0.016963551769271477, + "learning_rate": 5.6624770798689665e-05, + "loss": 1.0412, "step": 1410 }, { - "epoch": 0.216344316183778, - "grad_norm": 0.09071815773985022, - "learning_rate": 0.000959319195368201, - "loss": 1.2277, + "epoch": 0.865377264735112, + "grad_norm": 0.016723668733144115, + "learning_rate": 5.418205353901562e-05, + "loss": 1.0684, "step": 1415 }, { - "epoch": 0.21710878373213058, - "grad_norm": 0.073298406707831, - "learning_rate": 0.0009587903596993855, - "loss": 1.2373, + "epoch": 0.8684351349285223, + "grad_norm": 0.019698338107007437, + "learning_rate": 5.179017186471557e-05, + "loss": 1.0379, "step": 1420 }, { - "epoch": 0.21787325128048315, - "grad_norm": 0.08328106823130824, - "learning_rate": 0.000958258256548113, - "loss": 1.2093, + "epoch": 0.8714930051219326, + "grad_norm": 0.017265614379559306, + "learning_rate": 4.94493985165036e-05, + "loss": 1.0454, "step": 1425 }, { - "epoch": 0.2186377188288357, - "grad_norm": 0.076889746663869, - "learning_rate": 0.0009577228897039963, - "loss": 1.2324, + "epoch": 0.8745508753153428, + "grad_norm": 0.01778176463464062, + "learning_rate": 4.7160000407329765e-05, + "loss": 1.0431, "step": 1430 }, { - "epoch": 0.21940218637718828, - "grad_norm": 0.10417983319043374, - "learning_rate": 0.0009571842629798917, - "loss": 1.2314, + "epoch": 0.8776087455087531, + "grad_norm": 0.019057294957546256, + "learning_rate": 4.492223859194533e-05, + "loss": 1.0442, "step": 1435 }, { - "epoch": 0.22016665392554086, - "grad_norm": 0.07670976249694544, - "learning_rate": 0.0009566423802118725, - "loss": 1.2418, + "epoch": 0.8806666157021634, + "grad_norm": 0.018453421610507247, + "learning_rate": 4.273636823713467e-05, + "loss": 1.0437, "step": 1440 }, { - "epoch": 0.22093112147389343, - "grad_norm": 0.06138227389521626, - "learning_rate": 0.0009560972452592009, - "loss": 1.2728, + "epoch": 0.8837244858955737, + "grad_norm": 0.01765155626929793, + "learning_rate": 4.0602638592619257e-05, + "loss": 1.0415, "step": 1445 }, { - "epoch": 0.221695589022246, - "grad_norm": 0.06168948726191564, - "learning_rate": 0.0009555488620043018, - "loss": 1.2739, + "epoch": 0.886782356088984, + "grad_norm": 0.017862026611963264, + "learning_rate": 3.8521292962636625e-05, + "loss": 1.0436, "step": 1450 }, { - "epoch": 0.22246005657059859, - "grad_norm": 0.07139543551614756, - "learning_rate": 0.0009549972343527336, - "loss": 1.2635, + "epoch": 0.8898402262823943, + "grad_norm": 0.017606797127937533, + "learning_rate": 3.649256867819667e-05, + "loss": 1.04, "step": 1455 }, { - "epoch": 0.22322452411895116, - "grad_norm": 0.08125497986153993, - "learning_rate": 0.0009544423662331612, - "loss": 1.2419, + "epoch": 0.8928980964758046, + "grad_norm": 0.017151387009890048, + "learning_rate": 3.451669707001942e-05, + "loss": 1.0578, "step": 1460 }, { - "epoch": 0.2239889916673037, - "grad_norm": 0.06174524531157182, - "learning_rate": 0.0009538842615973284, - "loss": 1.2286, + "epoch": 0.8959559666692148, + "grad_norm": 0.017195927947563305, + "learning_rate": 3.259390344215707e-05, + "loss": 1.0462, "step": 1465 }, { - "epoch": 0.2247534592156563, - "grad_norm": 0.08712335845180429, - "learning_rate": 0.0009533229244200292, - "loss": 1.2095, + "epoch": 0.8990138368626251, + "grad_norm": 0.01845679676674692, + "learning_rate": 3.072440704630258e-05, + "loss": 1.0555, "step": 1470 }, { - "epoch": 0.22551792676400886, - "grad_norm": 0.07266991262135022, - "learning_rate": 0.0009527583586990791, - "loss": 1.2463, + "epoch": 0.9020717070560355, + "grad_norm": 0.016944392213100467, + "learning_rate": 2.8908421056789703e-05, + "loss": 1.033, "step": 1475 }, { - "epoch": 0.22628239431236144, - "grad_norm": 0.07327264508555179, - "learning_rate": 0.0009521905684552877, - "loss": 1.2243, + "epoch": 0.9051295772494458, + "grad_norm": 0.018432906406152354, + "learning_rate": 2.7146152546284663e-05, + "loss": 1.0508, "step": 1480 }, { - "epoch": 0.22704686186071402, - "grad_norm": 0.09100386022395632, - "learning_rate": 0.0009516195577324294, - "loss": 1.2602, + "epoch": 0.9081874474428561, + "grad_norm": 0.017706430050248204, + "learning_rate": 2.5437802462174144e-05, + "loss": 1.0407, "step": 1485 }, { - "epoch": 0.2278113294090666, - "grad_norm": 0.06911064137719194, - "learning_rate": 0.0009510453305972142, - "loss": 1.2512, + "epoch": 0.9112453176362664, + "grad_norm": 0.01825937227605355, + "learning_rate": 2.3783565603652024e-05, + "loss": 1.0191, "step": 1490 }, { - "epoch": 0.22857579695741917, - "grad_norm": 0.10444912049605627, - "learning_rate": 0.0009504678911392601, - "loss": 1.2157, + "epoch": 0.9143031878296767, + "grad_norm": 0.017170141434300536, + "learning_rate": 2.218363059950651e-05, + "loss": 1.0381, "step": 1495 }, { - "epoch": 0.22934026450577172, - "grad_norm": 0.09169840988364335, - "learning_rate": 0.0009498872434710623, - "loss": 1.2266, + "epoch": 0.9173610580230869, + "grad_norm": 0.021018198836607237, + "learning_rate": 2.0638179886611542e-05, + "loss": 1.0342, "step": 1500 }, { - "epoch": 0.2301047320541243, - "grad_norm": 0.06835850906504734, - "learning_rate": 0.000949303391727965, - "loss": 1.1967, + "epoch": 0.9204189282164972, + "grad_norm": 0.018670756565939955, + "learning_rate": 1.9147389689123884e-05, + "loss": 1.038, "step": 1505 }, { - "epoch": 0.23086919960247687, - "grad_norm": 0.08072558468127804, - "learning_rate": 0.000948716340068132, - "loss": 1.2446, + "epoch": 0.9234767984099075, + "grad_norm": 0.01840746015098489, + "learning_rate": 1.771142999838854e-05, + "loss": 1.0576, "step": 1510 }, { - "epoch": 0.23163366715082945, - "grad_norm": 0.09703619217717847, - "learning_rate": 0.0009481260926725161, - "loss": 1.2335, + "epoch": 0.9265346686033178, + "grad_norm": 0.01791437481490544, + "learning_rate": 1.633046455355519e-05, + "loss": 1.0547, "step": 1515 }, { - "epoch": 0.23239813469918202, - "grad_norm": 0.07534829838458987, - "learning_rate": 0.0009475326537448307, - "loss": 1.2297, + "epoch": 0.9295925387967281, + "grad_norm": 0.016561566008186503, + "learning_rate": 1.5004650822907306e-05, + "loss": 1.0429, "step": 1520 }, { - "epoch": 0.2331626022475346, - "grad_norm": 0.08322927425588308, - "learning_rate": 0.0009469360275115185, - "loss": 1.2265, + "epoch": 0.9326504089901384, + "grad_norm": 0.017211669368685612, + "learning_rate": 1.373413998590639e-05, + "loss": 1.0412, "step": 1525 }, { - "epoch": 0.23392706979588718, - "grad_norm": 0.06820300897660575, - "learning_rate": 0.0009463362182217223, - "loss": 1.2338, + "epoch": 0.9357082791835487, + "grad_norm": 0.01917485719407648, + "learning_rate": 1.2519076915953443e-05, + "loss": 1.0329, "step": 1530 }, { - "epoch": 0.23469153734423973, - "grad_norm": 0.07576589358564349, - "learning_rate": 0.0009457332301472545, - "loss": 1.204, + "epoch": 0.9387661493769589, + "grad_norm": 0.019235379531641828, + "learning_rate": 1.1359600163869276e-05, + "loss": 1.0509, "step": 1535 }, { - "epoch": 0.2354560048925923, - "grad_norm": 0.07033059584476081, - "learning_rate": 0.0009451270675825665, - "loss": 1.2368, + "epoch": 0.9418240195703692, + "grad_norm": 0.017132686744869362, + "learning_rate": 1.0255841942096056e-05, + "loss": 1.0402, "step": 1540 }, { - "epoch": 0.23622047244094488, - "grad_norm": 0.07998529784318079, - "learning_rate": 0.0009445177348447187, - "loss": 1.1867, + "epoch": 0.9448818897637795, + "grad_norm": 0.018593609192373937, + "learning_rate": 9.207928109621122e-06, + "loss": 1.0409, "step": 1545 }, { - "epoch": 0.23698493998929746, - "grad_norm": 0.06330372086398396, - "learning_rate": 0.0009439052362733486, - "loss": 1.2153, + "epoch": 0.9479397599571898, + "grad_norm": 0.01776794448761931, + "learning_rate": 8.21597815762587e-06, + "loss": 1.0242, "step": 1550 }, - { - "epoch": 0.23774940753765003, - "grad_norm": 0.0825543412216852, - "learning_rate": 0.0009432895762306415, - "loss": 1.2114, - "step": 1555 - }, - { - "epoch": 0.2385138750860026, - "grad_norm": 0.07266151001303742, - "learning_rate": 0.0009426707591012975, - "loss": 1.2227, - "step": 1560 - }, - { - "epoch": 0.23927834263435518, - "grad_norm": 0.08994379252698968, - "learning_rate": 0.0009420487892925023, - "loss": 1.2153, - "step": 1565 - }, - { - "epoch": 0.24004281018270773, - "grad_norm": 0.06997215800326952, - "learning_rate": 0.0009414236712338941, - "loss": 1.2456, - "step": 1570 - }, - { - "epoch": 0.2408072777310603, - "grad_norm": 0.07195887577227815, - "learning_rate": 0.0009407954093775333, - "loss": 1.2606, - "step": 1575 - }, - { - "epoch": 0.24157174527941289, - "grad_norm": 0.0669981044756401, - "learning_rate": 0.00094016400819787, - "loss": 1.2293, - "step": 1580 - }, - { - "epoch": 0.24233621282776546, - "grad_norm": 0.08395936334147859, - "learning_rate": 0.0009395294721917122, - "loss": 1.1822, - "step": 1585 - }, - { - "epoch": 0.24310068037611804, - "grad_norm": 0.08734808358295865, - "learning_rate": 0.0009388918058781945, - "loss": 1.2026, - "step": 1590 - }, - { - "epoch": 0.24386514792447062, - "grad_norm": 0.06832693115838025, - "learning_rate": 0.0009382510137987452, - "loss": 1.2111, - "step": 1595 - }, - { - "epoch": 0.2446296154728232, - "grad_norm": 0.0706613941441332, - "learning_rate": 0.0009376071005170539, - "loss": 1.1989, - "step": 1600 - }, - { - "epoch": 0.24539408302117574, - "grad_norm": 0.06738770199309628, - "learning_rate": 0.0009369600706190395, - "loss": 1.2405, - "step": 1605 - }, - { - "epoch": 0.24615855056952832, - "grad_norm": 0.0815414234541493, - "learning_rate": 0.0009363099287128173, - "loss": 1.2008, - "step": 1610 - }, - { - "epoch": 0.2469230181178809, - "grad_norm": 0.07295828491472306, - "learning_rate": 0.000935656679428666, - "loss": 1.2355, - "step": 1615 - }, - { - "epoch": 0.24768748566623347, - "grad_norm": 0.06339989075543628, - "learning_rate": 0.0009350003274189949, - "loss": 1.1979, - "step": 1620 - }, - { - "epoch": 0.24845195321458605, - "grad_norm": 0.06230503242046638, - "learning_rate": 0.000934340877358311, - "loss": 1.2229, - "step": 1625 - }, - { - "epoch": 0.24921642076293862, - "grad_norm": 0.07324576424114503, - "learning_rate": 0.0009336783339431851, - "loss": 1.2468, - "step": 1630 - }, - { - "epoch": 0.2499808883112912, - "grad_norm": 0.06752028292519564, - "learning_rate": 0.0009330127018922195, - "loss": 1.2213, - "step": 1635 - }, - { - "epoch": 0.25074535585964375, - "grad_norm": 0.07188634308822926, - "learning_rate": 0.0009323439859460122, - "loss": 1.2177, - "step": 1640 - }, - { - "epoch": 0.2515098234079963, - "grad_norm": 0.06474900340430122, - "learning_rate": 0.000931672190867126, - "loss": 1.2038, - "step": 1645 - }, - { - "epoch": 0.2522742909563489, - "grad_norm": 0.06803113030721539, - "learning_rate": 0.0009309973214400524, - "loss": 1.2429, - "step": 1650 - }, - { - "epoch": 0.2530387585047015, - "grad_norm": 0.0696030870632886, - "learning_rate": 0.0009303193824711783, - "loss": 1.1881, - "step": 1655 - }, - { - "epoch": 0.25380322605305405, - "grad_norm": 0.06427278412655327, - "learning_rate": 0.0009296383787887519, - "loss": 1.1999, - "step": 1660 - }, - { - "epoch": 0.25456769360140663, - "grad_norm": 0.06151334873890928, - "learning_rate": 0.000928954315242848, - "loss": 1.2104, - "step": 1665 - }, - { - "epoch": 0.2553321611497592, - "grad_norm": 0.07147857753540077, - "learning_rate": 0.0009282671967053335, - "loss": 1.1903, - "step": 1670 - }, - { - "epoch": 0.2560966286981118, - "grad_norm": 0.06167324666150717, - "learning_rate": 0.0009275770280698332, - "loss": 1.2496, - "step": 1675 - }, - { - "epoch": 0.25686109624646436, - "grad_norm": 0.06467673259045754, - "learning_rate": 0.0009268838142516944, - "loss": 1.2307, - "step": 1680 - }, - { - "epoch": 0.25762556379481694, - "grad_norm": 0.06363717052606994, - "learning_rate": 0.0009261875601879514, - "loss": 1.2183, - "step": 1685 - }, - { - "epoch": 0.25839003134316946, - "grad_norm": 0.08631057056855553, - "learning_rate": 0.0009254882708372922, - "loss": 1.2058, - "step": 1690 - }, - { - "epoch": 0.25915449889152203, - "grad_norm": 0.059947021170670405, - "learning_rate": 0.0009247859511800207, - "loss": 1.2075, - "step": 1695 - }, - { - "epoch": 0.2599189664398746, - "grad_norm": 0.06378174138594353, - "learning_rate": 0.0009240806062180233, - "loss": 1.2141, - "step": 1700 - }, - { - "epoch": 0.2606834339882272, - "grad_norm": 0.06373001825884964, - "learning_rate": 0.0009233722409747325, - "loss": 1.1995, - "step": 1705 - }, - { - "epoch": 0.26144790153657976, - "grad_norm": 0.07986652766185562, - "learning_rate": 0.0009226608604950905, - "loss": 1.2691, - "step": 1710 - }, - { - "epoch": 0.26221236908493234, - "grad_norm": 0.06532339660591398, - "learning_rate": 0.0009219464698455143, - "loss": 1.2049, - "step": 1715 - }, - { - "epoch": 0.2629768366332849, - "grad_norm": 0.07511908390592772, - "learning_rate": 0.0009212290741138591, - "loss": 1.2109, - "step": 1720 - }, - { - "epoch": 0.2637413041816375, - "grad_norm": 0.06805686624245322, - "learning_rate": 0.0009205086784093823, - "loss": 1.1938, - "step": 1725 - }, - { - "epoch": 0.26450577172999007, - "grad_norm": 0.07608315580128151, - "learning_rate": 0.0009197852878627064, - "loss": 1.2247, - "step": 1730 - }, - { - "epoch": 0.26527023927834265, - "grad_norm": 0.07460173165874026, - "learning_rate": 0.000919058907625784, - "loss": 1.1986, - "step": 1735 - }, - { - "epoch": 0.2660347068266952, - "grad_norm": 0.0653540774845259, - "learning_rate": 0.0009183295428718591, - "loss": 1.2015, - "step": 1740 - }, - { - "epoch": 0.2667991743750478, - "grad_norm": 0.06088986122123603, - "learning_rate": 0.0009175971987954319, - "loss": 1.2347, - "step": 1745 - }, - { - "epoch": 0.2675636419234004, - "grad_norm": 0.06182727669114958, - "learning_rate": 0.0009168618806122209, - "loss": 1.2168, - "step": 1750 - }, - { - "epoch": 0.26832810947175295, - "grad_norm": 0.06945925182347323, - "learning_rate": 0.0009161235935591264, - "loss": 1.2107, - "step": 1755 - }, - { - "epoch": 0.2690925770201055, - "grad_norm": 0.07350549175394744, - "learning_rate": 0.0009153823428941923, - "loss": 1.2252, - "step": 1760 - }, - { - "epoch": 0.26985704456845805, - "grad_norm": 0.06552770776828734, - "learning_rate": 0.0009146381338965698, - "loss": 1.2268, - "step": 1765 - }, - { - "epoch": 0.2706215121168106, - "grad_norm": 0.0866708613189302, - "learning_rate": 0.0009138909718664788, - "loss": 1.1928, - "step": 1770 - }, - { - "epoch": 0.2713859796651632, - "grad_norm": 0.07542325154916106, - "learning_rate": 0.0009131408621251704, - "loss": 1.1992, - "step": 1775 - }, - { - "epoch": 0.2721504472135158, - "grad_norm": 0.08169121697992505, - "learning_rate": 0.0009123878100148899, - "loss": 1.2173, - "step": 1780 - }, - { - "epoch": 0.27291491476186835, - "grad_norm": 0.07641997824597095, - "learning_rate": 0.0009116318208988372, - "loss": 1.2592, - "step": 1785 - }, - { - "epoch": 0.27367938231022093, - "grad_norm": 0.08606275877785033, - "learning_rate": 0.0009108729001611297, - "loss": 1.1776, - "step": 1790 - }, - { - "epoch": 0.2744438498585735, - "grad_norm": 0.07432766120012625, - "learning_rate": 0.000910111053206764, - "loss": 1.2192, - "step": 1795 - }, - { - "epoch": 0.2752083174069261, - "grad_norm": 0.08764002357094997, - "learning_rate": 0.0009093462854615766, - "loss": 1.28, - "step": 1800 - }, - { - "epoch": 0.27597278495527866, - "grad_norm": 0.06475582520577117, - "learning_rate": 0.0009085786023722058, - "loss": 1.2357, - "step": 1805 - }, - { - "epoch": 0.27673725250363124, - "grad_norm": 0.07215552635866873, - "learning_rate": 0.0009078080094060529, - "loss": 1.3359, - "step": 1810 - }, - { - "epoch": 0.2775017200519838, - "grad_norm": 0.07686482863798352, - "learning_rate": 0.0009070345120512436, - "loss": 1.202, - "step": 1815 - }, - { - "epoch": 0.2782661876003364, - "grad_norm": 0.07764075676679777, - "learning_rate": 0.0009062581158165877, - "loss": 1.2308, - "step": 1820 - }, - { - "epoch": 0.2790306551486889, - "grad_norm": 0.06490582289798909, - "learning_rate": 0.0009054788262315414, - "loss": 1.1967, - "step": 1825 - }, - { - "epoch": 0.2797951226970415, - "grad_norm": 0.064554320995236, - "learning_rate": 0.000904696648846167, - "loss": 1.2148, - "step": 1830 - }, - { - "epoch": 0.28055959024539406, - "grad_norm": 0.09976478815425889, - "learning_rate": 0.0009039115892310931, - "loss": 1.2129, - "step": 1835 - }, - { - "epoch": 0.28132405779374664, - "grad_norm": 0.08662063449022793, - "learning_rate": 0.0009031236529774765, - "loss": 1.1943, - "step": 1840 - }, - { - "epoch": 0.2820885253420992, - "grad_norm": 0.06867780089471395, - "learning_rate": 0.0009023328456969598, - "loss": 1.1957, - "step": 1845 - }, - { - "epoch": 0.2828529928904518, - "grad_norm": 0.08054420978971974, - "learning_rate": 0.0009015391730216343, - "loss": 1.1775, - "step": 1850 - }, - { - "epoch": 0.28361746043880437, - "grad_norm": 0.06748059150550935, - "learning_rate": 0.0009007426406039975, - "loss": 1.197, - "step": 1855 - }, - { - "epoch": 0.28438192798715695, - "grad_norm": 0.061636919853196595, - "learning_rate": 0.0008999432541169144, - "loss": 1.1895, - "step": 1860 - }, - { - "epoch": 0.2851463955355095, - "grad_norm": 0.06329361104758795, - "learning_rate": 0.0008991410192535765, - "loss": 1.2115, - "step": 1865 - }, - { - "epoch": 0.2859108630838621, - "grad_norm": 0.06647515121483408, - "learning_rate": 0.000898335941727461, - "loss": 1.17, - "step": 1870 - }, - { - "epoch": 0.2866753306322147, - "grad_norm": 0.08264733362665477, - "learning_rate": 0.0008975280272722906, - "loss": 1.2323, - "step": 1875 - }, - { - "epoch": 0.28743979818056725, - "grad_norm": 0.06871644318138265, - "learning_rate": 0.0008967172816419926, - "loss": 1.2038, - "step": 1880 - }, - { - "epoch": 0.28820426572891983, - "grad_norm": 0.06608040766728891, - "learning_rate": 0.0008959037106106575, - "loss": 1.2133, - "step": 1885 - }, - { - "epoch": 0.2889687332772724, - "grad_norm": 0.0843249746514495, - "learning_rate": 0.0008950873199724985, - "loss": 1.2476, - "step": 1890 - }, - { - "epoch": 0.2897332008256249, - "grad_norm": 0.06391405872287036, - "learning_rate": 0.0008942681155418093, - "loss": 1.2173, - "step": 1895 - }, - { - "epoch": 0.2904976683739775, - "grad_norm": 0.07858223937418263, - "learning_rate": 0.0008934461031529241, - "loss": 1.2367, - "step": 1900 - }, - { - "epoch": 0.2912621359223301, - "grad_norm": 0.09167454894661381, - "learning_rate": 0.0008926212886601749, - "loss": 1.2095, - "step": 1905 - }, - { - "epoch": 0.29202660347068266, - "grad_norm": 0.0652987710283515, - "learning_rate": 0.0008917936779378497, - "loss": 1.1657, - "step": 1910 - }, - { - "epoch": 0.29279107101903523, - "grad_norm": 0.08532622110352704, - "learning_rate": 0.0008909632768801519, - "loss": 1.2182, - "step": 1915 - }, - { - "epoch": 0.2935555385673878, - "grad_norm": 0.13889204437606176, - "learning_rate": 0.0008901300914011569, - "loss": 1.1919, - "step": 1920 - }, - { - "epoch": 0.2943200061157404, - "grad_norm": 0.06762914304742695, - "learning_rate": 0.0008892941274347707, - "loss": 1.2174, - "step": 1925 - }, - { - "epoch": 0.29508447366409296, - "grad_norm": 0.07275455107428999, - "learning_rate": 0.0008884553909346882, - "loss": 1.2023, - "step": 1930 - }, - { - "epoch": 0.29584894121244554, - "grad_norm": 0.0671901981946541, - "learning_rate": 0.0008876138878743493, - "loss": 1.1984, - "step": 1935 - }, - { - "epoch": 0.2966134087607981, - "grad_norm": 0.06303430763742675, - "learning_rate": 0.0008867696242468976, - "loss": 1.2203, - "step": 1940 - }, - { - "epoch": 0.2973778763091507, - "grad_norm": 0.06625636101894779, - "learning_rate": 0.0008859226060651373, - "loss": 1.1984, - "step": 1945 - }, - { - "epoch": 0.29814234385750327, - "grad_norm": 0.06254460803911173, - "learning_rate": 0.0008850728393614902, - "loss": 1.149, - "step": 1950 - }, - { - "epoch": 0.29890681140585584, - "grad_norm": 0.06693727583070225, - "learning_rate": 0.0008842203301879535, - "loss": 1.1963, - "step": 1955 - }, - { - "epoch": 0.2996712789542084, - "grad_norm": 0.06974909946676806, - "learning_rate": 0.0008833650846160555, - "loss": 1.2125, - "step": 1960 - }, - { - "epoch": 0.30043574650256094, - "grad_norm": 5.2895162289649615, - "learning_rate": 0.0008825071087368131, - "loss": 1.2391, - "step": 1965 - }, - { - "epoch": 0.3012002140509135, - "grad_norm": 0.09460329161529786, - "learning_rate": 0.0008816464086606888, - "loss": 1.2074, - "step": 1970 - }, - { - "epoch": 0.3019646815992661, - "grad_norm": 0.1507554888140455, - "learning_rate": 0.0008807829905175462, - "loss": 1.2141, - "step": 1975 - }, - { - "epoch": 0.30272914914761867, - "grad_norm": 0.07284165587078718, - "learning_rate": 0.000879916860456607, - "loss": 1.1938, - "step": 1980 - }, - { - "epoch": 0.30349361669597125, - "grad_norm": 0.08317773836093713, - "learning_rate": 0.0008790480246464071, - "loss": 1.2209, - "step": 1985 - }, - { - "epoch": 0.3042580842443238, - "grad_norm": 0.11170869912072523, - "learning_rate": 0.0008781764892747525, - "loss": 1.1916, - "step": 1990 - }, - { - "epoch": 0.3050225517926764, - "grad_norm": 0.06299927829436214, - "learning_rate": 0.0008773022605486754, - "loss": 1.2109, - "step": 1995 - }, - { - "epoch": 0.305787019341029, - "grad_norm": 0.08391531668184286, - "learning_rate": 0.0008764253446943899, - "loss": 1.1906, - "step": 2000 - }, - { - "epoch": 0.30655148688938155, - "grad_norm": 0.0698777375605405, - "learning_rate": 0.0008755457479572478, - "loss": 1.1927, - "step": 2005 - }, - { - "epoch": 0.30731595443773413, - "grad_norm": 0.0764854535360263, - "learning_rate": 0.0008746634766016941, - "loss": 1.1993, - "step": 2010 - }, - { - "epoch": 0.3080804219860867, - "grad_norm": 0.07915862196006107, - "learning_rate": 0.0008737785369112219, - "loss": 1.2232, - "step": 2015 - }, - { - "epoch": 0.3088448895344393, - "grad_norm": 0.06725444808758432, - "learning_rate": 0.0008728909351883283, - "loss": 1.2007, - "step": 2020 - }, - { - "epoch": 0.30960935708279186, - "grad_norm": 0.07910670587364323, - "learning_rate": 0.0008720006777544696, - "loss": 1.2094, - "step": 2025 - }, - { - "epoch": 0.31037382463114443, - "grad_norm": 0.07747525026031941, - "learning_rate": 0.0008711077709500153, - "loss": 1.2051, - "step": 2030 - }, - { - "epoch": 0.31113829217949696, - "grad_norm": 0.06741852376910797, - "learning_rate": 0.0008702122211342039, - "loss": 1.2331, - "step": 2035 - }, - { - "epoch": 0.31190275972784953, - "grad_norm": 0.06866481465327473, - "learning_rate": 0.0008693140346850975, - "loss": 1.183, - "step": 2040 - }, - { - "epoch": 0.3126672272762021, - "grad_norm": 0.0978479410648741, - "learning_rate": 0.0008684132179995359, - "loss": 1.1953, - "step": 2045 - }, - { - "epoch": 0.3134316948245547, - "grad_norm": 0.06892340870931946, - "learning_rate": 0.0008675097774930912, - "loss": 1.1691, - "step": 2050 - }, - { - "epoch": 0.31419616237290726, - "grad_norm": 0.059001834743335244, - "learning_rate": 0.0008666037196000227, - "loss": 1.1996, - "step": 2055 - }, - { - "epoch": 0.31496062992125984, - "grad_norm": 0.06755590313805718, - "learning_rate": 0.0008656950507732302, - "loss": 1.1778, - "step": 2060 - }, - { - "epoch": 0.3157250974696124, - "grad_norm": 0.06507056486580198, - "learning_rate": 0.0008647837774842085, - "loss": 1.2057, - "step": 2065 - }, - { - "epoch": 0.316489565017965, - "grad_norm": 0.06785984666263632, - "learning_rate": 0.0008638699062230011, - "loss": 1.2116, - "step": 2070 - }, - { - "epoch": 0.31725403256631757, - "grad_norm": 0.08655861530553224, - "learning_rate": 0.0008629534434981547, - "loss": 1.1711, - "step": 2075 - }, - { - "epoch": 0.31801850011467014, - "grad_norm": 0.09502942427023862, - "learning_rate": 0.0008620343958366718, - "loss": 1.2589, - "step": 2080 - }, - { - "epoch": 0.3187829676630227, - "grad_norm": 0.07389015223749948, - "learning_rate": 0.0008611127697839648, - "loss": 1.1662, - "step": 2085 - }, - { - "epoch": 0.3195474352113753, - "grad_norm": 0.06284898980071424, - "learning_rate": 0.0008601885719038092, - "loss": 1.2171, - "step": 2090 - }, - { - "epoch": 0.3203119027597279, - "grad_norm": 0.06925613572330289, - "learning_rate": 0.0008592618087782971, - "loss": 1.1941, - "step": 2095 - }, - { - "epoch": 0.3210763703080804, - "grad_norm": 0.06565728652078977, - "learning_rate": 0.00085833248700779, - "loss": 1.1722, - "step": 2100 - }, - { - "epoch": 0.32184083785643297, - "grad_norm": 0.06497325207608327, - "learning_rate": 0.0008574006132108721, - "loss": 1.2005, - "step": 2105 - }, - { - "epoch": 0.32260530540478555, - "grad_norm": 0.0830860825359688, - "learning_rate": 0.0008564661940243027, - "loss": 1.1849, - "step": 2110 - }, - { - "epoch": 0.3233697729531381, - "grad_norm": 0.06766203461784877, - "learning_rate": 0.0008555292361029696, - "loss": 1.1927, - "step": 2115 - }, - { - "epoch": 0.3241342405014907, - "grad_norm": 0.07153735721703837, - "learning_rate": 0.0008545897461198413, - "loss": 1.2189, - "step": 2120 - }, - { - "epoch": 0.3248987080498433, - "grad_norm": 0.07558372140579875, - "learning_rate": 0.0008536477307659192, - "loss": 1.1919, - "step": 2125 - }, - { - "epoch": 0.32566317559819585, - "grad_norm": 0.08212311724687331, - "learning_rate": 0.0008527031967501906, - "loss": 1.1633, - "step": 2130 - }, - { - "epoch": 0.32642764314654843, - "grad_norm": 0.06765162215684241, - "learning_rate": 0.0008517561507995805, - "loss": 1.2469, - "step": 2135 - }, - { - "epoch": 0.327192110694901, - "grad_norm": 0.06925030760862601, - "learning_rate": 0.0008508065996589037, - "loss": 1.1679, - "step": 2140 - }, - { - "epoch": 0.3279565782432536, - "grad_norm": 0.07384137246492231, - "learning_rate": 0.0008498545500908168, - "loss": 1.1988, - "step": 2145 - }, - { - "epoch": 0.32872104579160616, - "grad_norm": 0.06472585249971541, - "learning_rate": 0.0008489000088757703, - "loss": 1.1971, - "step": 2150 - }, - { - "epoch": 0.32948551333995874, - "grad_norm": 0.08266273525218829, - "learning_rate": 0.0008479429828119598, - "loss": 1.1802, - "step": 2155 - }, - { - "epoch": 0.3302499808883113, - "grad_norm": 0.06160840327052949, - "learning_rate": 0.0008469834787152783, - "loss": 1.2191, - "step": 2160 - }, - { - "epoch": 0.3310144484366639, - "grad_norm": 0.06108701849003786, - "learning_rate": 0.0008460215034192667, - "loss": 1.1599, - "step": 2165 - }, - { - "epoch": 0.3317789159850164, - "grad_norm": 0.0675003939511652, - "learning_rate": 0.000845057063775066, - "loss": 1.2091, - "step": 2170 - }, - { - "epoch": 0.332543383533369, - "grad_norm": 0.06491128889515788, - "learning_rate": 0.0008440901666513681, - "loss": 1.1932, - "step": 2175 - }, - { - "epoch": 0.33330785108172156, - "grad_norm": 0.06739173874329502, - "learning_rate": 0.0008431208189343669, - "loss": 1.2222, - "step": 2180 - }, - { - "epoch": 0.33407231863007414, - "grad_norm": 0.06837629340068371, - "learning_rate": 0.0008421490275277093, - "loss": 1.1827, - "step": 2185 - }, - { - "epoch": 0.3348367861784267, - "grad_norm": 0.07301400162009927, - "learning_rate": 0.000841174799352446, - "loss": 1.1898, - "step": 2190 - }, - { - "epoch": 0.3356012537267793, - "grad_norm": 0.07696744621983317, - "learning_rate": 0.0008401981413469826, - "loss": 1.1807, - "step": 2195 - }, - { - "epoch": 0.33636572127513187, - "grad_norm": 0.06350723808650868, - "learning_rate": 0.0008392190604670293, - "loss": 1.1597, - "step": 2200 - }, - { - "epoch": 0.33713018882348444, - "grad_norm": 0.06598685397103475, - "learning_rate": 0.0008382375636855522, - "loss": 1.1897, - "step": 2205 - }, - { - "epoch": 0.337894656371837, - "grad_norm": 0.0666673189819934, - "learning_rate": 0.0008372536579927233, - "loss": 1.1718, - "step": 2210 - }, - { - "epoch": 0.3386591239201896, - "grad_norm": 0.062110253414163125, - "learning_rate": 0.0008362673503958707, - "loss": 1.1791, - "step": 2215 - }, - { - "epoch": 0.3394235914685422, - "grad_norm": 0.06769528641288612, - "learning_rate": 0.0008352786479194288, - "loss": 1.2181, - "step": 2220 - }, - { - "epoch": 0.34018805901689475, - "grad_norm": 0.08008302678521025, - "learning_rate": 0.000834287557604888, - "loss": 1.1546, - "step": 2225 - }, - { - "epoch": 0.3409525265652473, - "grad_norm": 0.0764574913479765, - "learning_rate": 0.0008332940865107452, - "loss": 1.1988, - "step": 2230 - }, - { - "epoch": 0.3417169941135999, - "grad_norm": 0.07425606538696802, - "learning_rate": 0.000832298241712453, - "loss": 1.19, - "step": 2235 - }, - { - "epoch": 0.3424814616619524, - "grad_norm": 0.06744819104094633, - "learning_rate": 0.0008313000303023688, - "loss": 1.1711, - "step": 2240 - }, - { - "epoch": 0.343245929210305, - "grad_norm": 0.061154631516393325, - "learning_rate": 0.0008302994593897055, - "loss": 1.1914, - "step": 2245 - }, - { - "epoch": 0.3440103967586576, - "grad_norm": 0.07404312882255028, - "learning_rate": 0.0008292965361004801, - "loss": 1.1682, - "step": 2250 - }, - { - "epoch": 0.34477486430701015, - "grad_norm": 0.06742969913673322, - "learning_rate": 0.0008282912675774632, - "loss": 1.2123, - "step": 2255 - }, - { - "epoch": 0.34553933185536273, - "grad_norm": 0.06384028111357246, - "learning_rate": 0.000827283660980128, - "loss": 1.1466, - "step": 2260 - }, - { - "epoch": 0.3463037994037153, - "grad_norm": 0.0807612774368799, - "learning_rate": 0.0008262737234845991, - "loss": 1.1709, - "step": 2265 - }, - { - "epoch": 0.3470682669520679, - "grad_norm": 0.06299523953324049, - "learning_rate": 0.0008252614622836021, - "loss": 1.1561, - "step": 2270 - }, - { - "epoch": 0.34783273450042046, - "grad_norm": 0.07042573805382132, - "learning_rate": 0.0008242468845864115, - "loss": 1.2122, - "step": 2275 - }, - { - "epoch": 0.34859720204877304, - "grad_norm": 0.07326150426716001, - "learning_rate": 0.0008232299976187999, - "loss": 1.191, - "step": 2280 - }, - { - "epoch": 0.3493616695971256, - "grad_norm": 0.07733654467734337, - "learning_rate": 0.0008222108086229865, - "loss": 1.1814, - "step": 2285 - }, - { - "epoch": 0.3501261371454782, - "grad_norm": 0.06653554190977337, - "learning_rate": 0.0008211893248575854, - "loss": 1.1706, - "step": 2290 - }, - { - "epoch": 0.35089060469383077, - "grad_norm": 0.07586818669104102, - "learning_rate": 0.0008201655535975541, - "loss": 1.1816, - "step": 2295 - }, - { - "epoch": 0.35165507224218334, - "grad_norm": 0.0722200099530391, - "learning_rate": 0.0008191395021341408, - "loss": 1.2203, - "step": 2300 - }, - { - "epoch": 0.3524195397905359, - "grad_norm": 0.07729811512497685, - "learning_rate": 0.0008181111777748342, - "loss": 1.2101, - "step": 2305 - }, - { - "epoch": 0.35318400733888844, - "grad_norm": 0.06548329762746453, - "learning_rate": 0.0008170805878433099, - "loss": 1.2126, - "step": 2310 - }, - { - "epoch": 0.353948474887241, - "grad_norm": 0.16878411753529185, - "learning_rate": 0.0008160477396793788, - "loss": 1.1993, - "step": 2315 - }, - { - "epoch": 0.3547129424355936, - "grad_norm": 0.08450927348552127, - "learning_rate": 0.0008150126406389351, - "loss": 1.1903, - "step": 2320 - }, - { - "epoch": 0.35547740998394617, - "grad_norm": 0.07393765853159336, - "learning_rate": 0.0008139752980939033, - "loss": 1.1896, - "step": 2325 - }, - { - "epoch": 0.35624187753229875, - "grad_norm": 0.0645007516624614, - "learning_rate": 0.000812935719432186, - "loss": 1.203, - "step": 2330 - }, - { - "epoch": 0.3570063450806513, - "grad_norm": 0.07480048796944509, - "learning_rate": 0.0008118939120576117, - "loss": 1.2232, - "step": 2335 - }, - { - "epoch": 0.3577708126290039, - "grad_norm": 0.06387184157320323, - "learning_rate": 0.0008108498833898814, - "loss": 1.1844, - "step": 2340 - }, - { - "epoch": 0.3585352801773565, - "grad_norm": 0.0706157487905302, - "learning_rate": 0.0008098036408645161, - "loss": 1.1545, - "step": 2345 - }, - { - "epoch": 0.35929974772570905, - "grad_norm": 0.06346402094887067, - "learning_rate": 0.0008087551919328038, - "loss": 1.1998, - "step": 2350 - }, - { - "epoch": 0.3600642152740616, - "grad_norm": 0.07219725356620699, - "learning_rate": 0.0008077045440617464, - "loss": 1.2208, - "step": 2355 - }, - { - "epoch": 0.3608286828224142, - "grad_norm": 0.06980068763494783, - "learning_rate": 0.0008066517047340065, - "loss": 1.1953, - "step": 2360 - }, - { - "epoch": 0.3615931503707668, - "grad_norm": 0.06836413221657871, - "learning_rate": 0.0008055966814478543, - "loss": 1.1998, - "step": 2365 - }, - { - "epoch": 0.36235761791911936, - "grad_norm": 0.06020785995804865, - "learning_rate": 0.0008045394817171138, - "loss": 1.1495, - "step": 2370 - }, - { - "epoch": 0.36312208546747193, - "grad_norm": 0.06584082195664415, - "learning_rate": 0.0008034801130711101, - "loss": 1.1687, - "step": 2375 - }, - { - "epoch": 0.36388655301582445, - "grad_norm": 0.07621192168498155, - "learning_rate": 0.000802418583054614, - "loss": 1.185, - "step": 2380 - }, - { - "epoch": 0.36465102056417703, - "grad_norm": 0.0858231914116967, - "learning_rate": 0.0008013548992277909, - "loss": 1.1875, - "step": 2385 - }, - { - "epoch": 0.3654154881125296, - "grad_norm": 0.06562775419380161, - "learning_rate": 0.0008002890691661446, - "loss": 1.213, - "step": 2390 - }, - { - "epoch": 0.3661799556608822, - "grad_norm": 0.07523928065889657, - "learning_rate": 0.0007992211004604646, - "loss": 1.1824, - "step": 2395 - }, - { - "epoch": 0.36694442320923476, - "grad_norm": 0.08499994679950856, - "learning_rate": 0.0007981510007167718, - "loss": 1.1871, - "step": 2400 - }, - { - "epoch": 0.36770889075758734, - "grad_norm": 0.06903788845491694, - "learning_rate": 0.0007970787775562641, - "loss": 1.1822, - "step": 2405 - }, - { - "epoch": 0.3684733583059399, - "grad_norm": 0.06597575725856182, - "learning_rate": 0.000796004438615262, - "loss": 1.1925, - "step": 2410 - }, - { - "epoch": 0.3692378258542925, - "grad_norm": 0.07344580250828273, - "learning_rate": 0.0007949279915451553, - "loss": 1.1823, - "step": 2415 - }, - { - "epoch": 0.37000229340264507, - "grad_norm": 0.060426789873964576, - "learning_rate": 0.0007938494440123468, - "loss": 1.1681, - "step": 2420 - }, - { - "epoch": 0.37076676095099764, - "grad_norm": 0.07015273475512843, - "learning_rate": 0.0007927688036981994, - "loss": 1.2012, - "step": 2425 - }, - { - "epoch": 0.3715312284993502, - "grad_norm": 0.07711911062361188, - "learning_rate": 0.0007916860782989806, - "loss": 1.1532, - "step": 2430 - }, - { - "epoch": 0.3722956960477028, - "grad_norm": 0.06654784595079923, - "learning_rate": 0.0007906012755258072, - "loss": 1.2284, - "step": 2435 - }, - { - "epoch": 0.37306016359605537, - "grad_norm": 0.062029588756078474, - "learning_rate": 0.0007895144031045918, - "loss": 1.2247, - "step": 2440 - }, - { - "epoch": 0.3738246311444079, - "grad_norm": 0.0877602095214219, - "learning_rate": 0.0007884254687759862, - "loss": 1.1617, - "step": 2445 - }, - { - "epoch": 0.37458909869276047, - "grad_norm": 0.06287647828722921, - "learning_rate": 0.0007873344802953277, - "loss": 1.1672, - "step": 2450 - }, - { - "epoch": 0.37535356624111305, - "grad_norm": 0.06438896028122171, - "learning_rate": 0.0007862414454325826, - "loss": 1.1483, - "step": 2455 - }, - { - "epoch": 0.3761180337894656, - "grad_norm": 0.0593997017120742, - "learning_rate": 0.0007851463719722913, - "loss": 1.1945, - "step": 2460 - }, - { - "epoch": 0.3768825013378182, - "grad_norm": 0.09414055116242716, - "learning_rate": 0.0007840492677135139, - "loss": 1.2039, - "step": 2465 - }, - { - "epoch": 0.3776469688861708, - "grad_norm": 0.07238337604705601, - "learning_rate": 0.0007829501404697728, - "loss": 1.1972, - "step": 2470 - }, - { - "epoch": 0.37841143643452335, - "grad_norm": 0.05784642990402551, - "learning_rate": 0.0007818489980689985, - "loss": 1.1969, - "step": 2475 - }, - { - "epoch": 0.37917590398287593, - "grad_norm": 0.06420946957423379, - "learning_rate": 0.0007807458483534731, - "loss": 1.1344, - "step": 2480 - }, - { - "epoch": 0.3799403715312285, - "grad_norm": 0.0622062481944738, - "learning_rate": 0.0007796406991797748, - "loss": 1.1632, - "step": 2485 - }, - { - "epoch": 0.3807048390795811, - "grad_norm": 0.06698914216587033, - "learning_rate": 0.0007785335584187218, - "loss": 1.2193, - "step": 2490 - }, - { - "epoch": 0.38146930662793366, - "grad_norm": 0.06501159827802083, - "learning_rate": 0.0007774244339553168, - "loss": 1.1949, - "step": 2495 - }, - { - "epoch": 0.38223377417628623, - "grad_norm": 0.06352771347650159, - "learning_rate": 0.0007763133336886891, - "loss": 1.1694, - "step": 2500 - }, - { - "epoch": 0.3829982417246388, - "grad_norm": 0.062043540050934584, - "learning_rate": 0.0007752002655320411, - "loss": 1.1754, - "step": 2505 - }, - { - "epoch": 0.3837627092729914, - "grad_norm": 0.05993356867245502, - "learning_rate": 0.0007740852374125892, - "loss": 1.1586, - "step": 2510 - }, - { - "epoch": 0.3845271768213439, - "grad_norm": 0.08359997915488934, - "learning_rate": 0.0007729682572715092, - "loss": 1.2118, - "step": 2515 - }, - { - "epoch": 0.3852916443696965, - "grad_norm": 0.07542827308050223, - "learning_rate": 0.0007718493330638788, - "loss": 1.1713, - "step": 2520 - }, - { - "epoch": 0.38605611191804906, - "grad_norm": 0.06013940005700568, - "learning_rate": 0.0007707284727586216, - "loss": 1.2009, - "step": 2525 - }, - { - "epoch": 0.38682057946640164, - "grad_norm": 0.05981452616943288, - "learning_rate": 0.0007696056843384493, - "loss": 1.1794, - "step": 2530 - }, - { - "epoch": 0.3875850470147542, - "grad_norm": 0.06567705884727439, - "learning_rate": 0.0007684809757998066, - "loss": 1.155, - "step": 2535 - }, - { - "epoch": 0.3883495145631068, - "grad_norm": 0.07472356348523346, - "learning_rate": 0.0007673543551528121, - "loss": 1.1884, - "step": 2540 - }, - { - "epoch": 0.38911398211145937, - "grad_norm": 0.06125765376781909, - "learning_rate": 0.0007662258304212033, - "loss": 1.163, - "step": 2545 - }, - { - "epoch": 0.38987844965981194, - "grad_norm": 0.0660322307444375, - "learning_rate": 0.0007650954096422775, - "loss": 1.1842, - "step": 2550 - }, - { - "epoch": 0.3906429172081645, - "grad_norm": 0.07730780676951493, - "learning_rate": 0.0007639631008668364, - "loss": 1.1628, - "step": 2555 - }, - { - "epoch": 0.3914073847565171, - "grad_norm": 0.08716048156438365, - "learning_rate": 0.0007628289121591276, - "loss": 1.2012, - "step": 2560 - }, - { - "epoch": 0.3921718523048697, - "grad_norm": 0.05864132359328279, - "learning_rate": 0.0007616928515967875, - "loss": 1.1638, - "step": 2565 - }, - { - "epoch": 0.39293631985322225, - "grad_norm": 0.06514781329333548, - "learning_rate": 0.0007605549272707835, - "loss": 1.1472, - "step": 2570 - }, - { - "epoch": 0.3937007874015748, - "grad_norm": 0.07818283105138371, - "learning_rate": 0.0007594151472853572, - "loss": 1.1711, - "step": 2575 - }, - { - "epoch": 0.3944652549499274, - "grad_norm": 0.07142906060531046, - "learning_rate": 0.0007582735197579657, - "loss": 1.1789, - "step": 2580 - }, - { - "epoch": 0.3952297224982799, - "grad_norm": 0.06460911519595092, - "learning_rate": 0.0007571300528192242, - "loss": 1.1553, - "step": 2585 - }, - { - "epoch": 0.3959941900466325, - "grad_norm": 0.06458967247713916, - "learning_rate": 0.0007559847546128482, - "loss": 1.1672, - "step": 2590 - }, - { - "epoch": 0.3967586575949851, - "grad_norm": 0.06555873403698737, - "learning_rate": 0.0007548376332955956, - "loss": 1.1389, - "step": 2595 - }, - { - "epoch": 0.39752312514333765, - "grad_norm": 0.06320995634242295, - "learning_rate": 0.0007536886970372079, - "loss": 1.172, - "step": 2600 - }, - { - "epoch": 0.39828759269169023, - "grad_norm": 0.0692346965255682, - "learning_rate": 0.0007525379540203532, - "loss": 1.1449, - "step": 2605 - }, - { - "epoch": 0.3990520602400428, - "grad_norm": 0.06669313758415195, - "learning_rate": 0.0007513854124405664, - "loss": 1.1875, - "step": 2610 - }, - { - "epoch": 0.3998165277883954, - "grad_norm": 0.06168539003076038, - "learning_rate": 0.0007502310805061927, - "loss": 1.1822, - "step": 2615 - }, - { - "epoch": 0.40058099533674796, - "grad_norm": 0.07577936750925265, - "learning_rate": 0.0007490749664383271, - "loss": 1.1878, - "step": 2620 - }, - { - "epoch": 0.40134546288510053, - "grad_norm": 0.07513830844488023, - "learning_rate": 0.0007479170784707574, - "loss": 1.1538, - "step": 2625 - }, - { - "epoch": 0.4021099304334531, - "grad_norm": 0.0704004134116212, - "learning_rate": 0.0007467574248499052, - "loss": 1.159, - "step": 2630 - }, - { - "epoch": 0.4028743979818057, - "grad_norm": 0.06393476454512485, - "learning_rate": 0.0007455960138347665, - "loss": 1.1533, - "step": 2635 - }, - { - "epoch": 0.40363886553015826, - "grad_norm": 0.060988648131048324, - "learning_rate": 0.0007444328536968537, - "loss": 1.1878, - "step": 2640 - }, - { - "epoch": 0.40440333307851084, - "grad_norm": 0.07642776920542559, - "learning_rate": 0.0007432679527201364, - "loss": 1.21, - "step": 2645 - }, - { - "epoch": 0.4051678006268634, - "grad_norm": 0.07143064451662551, - "learning_rate": 0.0007421013192009823, - "loss": 1.146, - "step": 2650 - }, - { - "epoch": 0.40593226817521594, - "grad_norm": 0.06608214029577218, - "learning_rate": 0.000740932961448098, - "loss": 1.142, - "step": 2655 - }, - { - "epoch": 0.4066967357235685, - "grad_norm": 0.2372583228137919, - "learning_rate": 0.0007397628877824702, - "loss": 1.1696, - "step": 2660 - }, - { - "epoch": 0.4074612032719211, - "grad_norm": 0.06401105720414174, - "learning_rate": 0.0007385911065373061, - "loss": 1.1434, - "step": 2665 - }, - { - "epoch": 0.40822567082027367, - "grad_norm": 0.08634710294909215, - "learning_rate": 0.0007374176260579745, - "loss": 1.1806, - "step": 2670 - }, - { - "epoch": 0.40899013836862624, - "grad_norm": 0.0686184603428492, - "learning_rate": 0.0007362424547019457, - "loss": 1.1754, - "step": 2675 - }, - { - "epoch": 0.4097546059169788, - "grad_norm": 0.07213216317785283, - "learning_rate": 0.0007350656008387326, - "loss": 1.1699, - "step": 2680 - }, - { - "epoch": 0.4105190734653314, - "grad_norm": 0.0739348063942258, - "learning_rate": 0.0007338870728498308, - "loss": 1.1276, - "step": 2685 - }, - { - "epoch": 0.411283541013684, - "grad_norm": 0.07052527755070558, - "learning_rate": 0.0007327068791286586, - "loss": 1.1166, - "step": 2690 - }, - { - "epoch": 0.41204800856203655, - "grad_norm": 0.061734383183528134, - "learning_rate": 0.0007315250280804982, - "loss": 1.1694, - "step": 2695 - }, - { - "epoch": 0.4128124761103891, - "grad_norm": 0.06851752144246956, - "learning_rate": 0.0007303415281224346, - "loss": 1.1696, - "step": 2700 - }, - { - "epoch": 0.4135769436587417, - "grad_norm": 0.06924429941802722, - "learning_rate": 0.0007291563876832966, - "loss": 1.134, - "step": 2705 - }, - { - "epoch": 0.4143414112070943, - "grad_norm": 0.062236735706859295, - "learning_rate": 0.0007279696152035963, - "loss": 1.1756, - "step": 2710 - }, - { - "epoch": 0.41510587875544686, - "grad_norm": 0.06605748129048904, - "learning_rate": 0.000726781219135469, - "loss": 1.1616, - "step": 2715 - }, - { - "epoch": 0.4158703463037994, - "grad_norm": 0.05979864244024164, - "learning_rate": 0.0007255912079426136, - "loss": 1.1586, - "step": 2720 - }, - { - "epoch": 0.41663481385215195, - "grad_norm": 0.05911691466595612, - "learning_rate": 0.0007243995901002312, - "loss": 1.203, - "step": 2725 - }, - { - "epoch": 0.41739928140050453, - "grad_norm": 0.07120579590463852, - "learning_rate": 0.0007232063740949656, - "loss": 1.1739, - "step": 2730 - }, - { - "epoch": 0.4181637489488571, - "grad_norm": 0.07696734291297799, - "learning_rate": 0.0007220115684248429, - "loss": 1.0993, - "step": 2735 - }, - { - "epoch": 0.4189282164972097, - "grad_norm": 0.08907278539208283, - "learning_rate": 0.0007208151815992107, - "loss": 1.1619, - "step": 2740 - }, - { - "epoch": 0.41969268404556226, - "grad_norm": 0.062313137263563724, - "learning_rate": 0.000719617222138677, - "loss": 1.1466, - "step": 2745 - }, - { - "epoch": 0.42045715159391484, - "grad_norm": 0.07514342527228114, - "learning_rate": 0.0007184176985750506, - "loss": 1.215, - "step": 2750 - }, - { - "epoch": 0.4212216191422674, - "grad_norm": 0.07162533739741002, - "learning_rate": 0.0007172166194512792, - "loss": 1.1552, - "step": 2755 - }, - { - "epoch": 0.42198608669062, - "grad_norm": 0.06920738404553707, - "learning_rate": 0.0007160139933213898, - "loss": 1.1844, - "step": 2760 - }, - { - "epoch": 0.42275055423897256, - "grad_norm": 0.06239605059074051, - "learning_rate": 0.0007148098287504266, - "loss": 1.1899, - "step": 2765 - }, - { - "epoch": 0.42351502178732514, - "grad_norm": 0.07204617294710451, - "learning_rate": 0.0007136041343143908, - "loss": 1.1943, - "step": 2770 - }, - { - "epoch": 0.4242794893356777, - "grad_norm": 0.07656583544762036, - "learning_rate": 0.0007123969186001791, - "loss": 1.1758, - "step": 2775 - }, - { - "epoch": 0.4250439568840303, - "grad_norm": 0.06931848694667721, - "learning_rate": 0.0007111881902055223, - "loss": 1.1583, - "step": 2780 - }, - { - "epoch": 0.42580842443238287, - "grad_norm": 0.07497458994101201, - "learning_rate": 0.000709977957738925, - "loss": 1.1558, - "step": 2785 - }, - { - "epoch": 0.4265728919807354, - "grad_norm": 0.0798030234936708, - "learning_rate": 0.0007087662298196037, - "loss": 1.1589, - "step": 2790 - }, - { - "epoch": 0.42733735952908797, - "grad_norm": 0.07255931989483083, - "learning_rate": 0.000707553015077425, - "loss": 1.1175, - "step": 2795 - }, - { - "epoch": 0.42810182707744054, - "grad_norm": 0.07101246431967022, - "learning_rate": 0.000706338322152845, - "loss": 1.181, - "step": 2800 - }, - { - "epoch": 0.4288662946257931, - "grad_norm": 0.05920290047242307, - "learning_rate": 0.000705122159696847, - "loss": 1.1552, - "step": 2805 - }, - { - "epoch": 0.4296307621741457, - "grad_norm": 0.07048728251583447, - "learning_rate": 0.0007039045363708807, - "loss": 1.1649, - "step": 2810 - }, - { - "epoch": 0.4303952297224983, - "grad_norm": 0.057337298176797065, - "learning_rate": 0.0007026854608467994, - "loss": 1.1646, - "step": 2815 - }, - { - "epoch": 0.43115969727085085, - "grad_norm": 0.06921836651212089, - "learning_rate": 0.0007014649418067994, - "loss": 1.1547, - "step": 2820 - }, - { - "epoch": 0.4319241648192034, - "grad_norm": 0.06676730022332167, - "learning_rate": 0.0007002429879433577, - "loss": 1.1512, - "step": 2825 - }, - { - "epoch": 0.432688632367556, - "grad_norm": 0.0630615394746687, - "learning_rate": 0.0006990196079591694, - "loss": 1.1271, - "step": 2830 - }, - { - "epoch": 0.4334530999159086, - "grad_norm": 0.06891965405826847, - "learning_rate": 0.000697794810567087, - "loss": 1.1506, - "step": 2835 - }, - { - "epoch": 0.43421756746426116, - "grad_norm": 0.0658246172183584, - "learning_rate": 0.0006965686044900577, - "loss": 1.1165, - "step": 2840 - }, - { - "epoch": 0.43498203501261373, - "grad_norm": 0.08178737314912204, - "learning_rate": 0.0006953409984610607, - "loss": 1.1679, - "step": 2845 - }, - { - "epoch": 0.4357465025609663, - "grad_norm": 0.0658367549932726, - "learning_rate": 0.0006941120012230463, - "loss": 1.1854, - "step": 2850 - }, - { - "epoch": 0.4365109701093189, - "grad_norm": 0.06579939923600828, - "learning_rate": 0.0006928816215288722, - "loss": 1.1589, - "step": 2855 - }, - { - "epoch": 0.4372754376576714, - "grad_norm": 0.06821461913343178, - "learning_rate": 0.0006916498681412429, - "loss": 1.1409, - "step": 2860 - }, - { - "epoch": 0.438039905206024, - "grad_norm": 0.06302865374733113, - "learning_rate": 0.0006904167498326451, - "loss": 1.1557, - "step": 2865 - }, - { - "epoch": 0.43880437275437656, - "grad_norm": 0.05918815408942707, - "learning_rate": 0.0006891822753852874, - "loss": 1.1565, - "step": 2870 - }, - { - "epoch": 0.43956884030272914, - "grad_norm": 0.06542754378140148, - "learning_rate": 0.0006879464535910358, - "loss": 1.1577, - "step": 2875 - }, - { - "epoch": 0.4403333078510817, - "grad_norm": 0.0647221155275831, - "learning_rate": 0.000686709293251353, - "loss": 1.1602, - "step": 2880 - }, - { - "epoch": 0.4410977753994343, - "grad_norm": 0.0579063394917837, - "learning_rate": 0.0006854708031772341, - "loss": 1.1827, - "step": 2885 - }, - { - "epoch": 0.44186224294778687, - "grad_norm": 0.06365215547735684, - "learning_rate": 0.0006842309921891447, - "loss": 1.164, - "step": 2890 - }, - { - "epoch": 0.44262671049613944, - "grad_norm": 0.06261024715809171, - "learning_rate": 0.000682989869116958, - "loss": 1.191, - "step": 2895 - }, - { - "epoch": 0.443391178044492, - "grad_norm": 0.07234668451704347, - "learning_rate": 0.0006817474427998916, - "loss": 1.1505, - "step": 2900 - }, - { - "epoch": 0.4441556455928446, - "grad_norm": 0.07197614974444562, - "learning_rate": 0.0006805037220864449, - "loss": 1.1858, - "step": 2905 - }, - { - "epoch": 0.44492011314119717, - "grad_norm": 0.06560613456796847, - "learning_rate": 0.000679258715834336, - "loss": 1.1559, - "step": 2910 - }, - { - "epoch": 0.44568458068954975, - "grad_norm": 0.06198035141165582, - "learning_rate": 0.000678012432910438, - "loss": 1.1545, - "step": 2915 - }, - { - "epoch": 0.4464490482379023, - "grad_norm": 0.06444936717504604, - "learning_rate": 0.0006767648821907172, - "loss": 1.1336, - "step": 2920 - }, - { - "epoch": 0.4472135157862549, - "grad_norm": 0.0645392975838134, - "learning_rate": 0.0006755160725601685, - "loss": 1.1391, - "step": 2925 - }, - { - "epoch": 0.4479779833346074, - "grad_norm": 0.06065312532733533, - "learning_rate": 0.0006742660129127529, - "loss": 1.1231, - "step": 2930 - }, - { - "epoch": 0.44874245088296, - "grad_norm": 0.05770027644463755, - "learning_rate": 0.000673014712151334, - "loss": 1.1665, - "step": 2935 - }, - { - "epoch": 0.4495069184313126, - "grad_norm": 0.059687456397664214, - "learning_rate": 0.0006717621791876146, - "loss": 1.1478, - "step": 2940 - }, - { - "epoch": 0.45027138597966515, - "grad_norm": 0.0709234344203115, - "learning_rate": 0.0006705084229420729, - "loss": 1.1418, - "step": 2945 - }, - { - "epoch": 0.4510358535280177, - "grad_norm": 0.08126674149433259, - "learning_rate": 0.0006692534523438993, - "loss": 1.079, - "step": 2950 - }, - { - "epoch": 0.4518003210763703, - "grad_norm": 0.07012397401994788, - "learning_rate": 0.0006679972763309333, - "loss": 1.1583, - "step": 2955 - }, - { - "epoch": 0.4525647886247229, - "grad_norm": 0.060697021218761534, - "learning_rate": 0.0006667399038495986, - "loss": 1.1223, - "step": 2960 - }, - { - "epoch": 0.45332925617307546, - "grad_norm": 0.07452330710843885, - "learning_rate": 0.0006654813438548404, - "loss": 1.1536, - "step": 2965 - }, - { - "epoch": 0.45409372372142803, - "grad_norm": 0.06928736586445865, - "learning_rate": 0.0006642216053100616, - "loss": 1.1137, - "step": 2970 - }, - { - "epoch": 0.4548581912697806, - "grad_norm": 0.06759852135632291, - "learning_rate": 0.000662960697187058, - "loss": 1.1315, - "step": 2975 - }, - { - "epoch": 0.4556226588181332, - "grad_norm": 0.06959880066718058, - "learning_rate": 0.0006616986284659557, - "loss": 1.165, - "step": 2980 - }, - { - "epoch": 0.45638712636648576, - "grad_norm": 0.06392136917578246, - "learning_rate": 0.000660435408135146, - "loss": 1.1531, - "step": 2985 - }, - { - "epoch": 0.45715159391483834, - "grad_norm": 0.06630940031172995, - "learning_rate": 0.0006591710451912225, - "loss": 1.1314, - "step": 2990 - }, - { - "epoch": 0.45791606146319086, - "grad_norm": 0.10697784394354062, - "learning_rate": 0.000657905548638916, - "loss": 1.1522, - "step": 2995 - }, - { - "epoch": 0.45868052901154344, - "grad_norm": 0.060670680576808696, - "learning_rate": 0.0006566389274910309, - "loss": 1.0984, - "step": 3000 - }, - { - "epoch": 0.459444996559896, - "grad_norm": 0.06606537631594284, - "learning_rate": 0.000655371190768381, - "loss": 1.1592, - "step": 3005 - }, - { - "epoch": 0.4602094641082486, - "grad_norm": 0.06083196601012183, - "learning_rate": 0.000654102347499725, - "loss": 1.1495, - "step": 3010 - }, - { - "epoch": 0.46097393165660117, - "grad_norm": 0.05898668376641134, - "learning_rate": 0.0006528324067217025, - "loss": 1.1208, - "step": 3015 - }, - { - "epoch": 0.46173839920495374, - "grad_norm": 0.05423508964655473, - "learning_rate": 0.0006515613774787698, - "loss": 1.1107, - "step": 3020 - }, - { - "epoch": 0.4625028667533063, - "grad_norm": 0.06527274534130718, - "learning_rate": 0.0006502892688231342, - "loss": 1.1523, - "step": 3025 - }, - { - "epoch": 0.4632673343016589, - "grad_norm": 0.06470168742423997, - "learning_rate": 0.0006490160898146918, - "loss": 1.1557, - "step": 3030 - }, - { - "epoch": 0.46403180185001147, - "grad_norm": 0.0651661791184414, - "learning_rate": 0.000647741849520961, - "loss": 1.1149, - "step": 3035 - }, - { - "epoch": 0.46479626939836405, - "grad_norm": 0.06518913789788636, - "learning_rate": 0.0006464665570170185, - "loss": 1.1251, - "step": 3040 - }, - { - "epoch": 0.4655607369467166, - "grad_norm": 0.06041300256236077, - "learning_rate": 0.0006451902213854352, - "loss": 1.1574, - "step": 3045 - }, - { - "epoch": 0.4663252044950692, - "grad_norm": 0.07182863661248022, - "learning_rate": 0.0006439128517162109, - "loss": 1.113, - "step": 3050 - }, - { - "epoch": 0.4670896720434218, - "grad_norm": 0.07484708764880833, - "learning_rate": 0.0006426344571067096, - "loss": 1.1027, - "step": 3055 - }, - { - "epoch": 0.46785413959177435, - "grad_norm": 0.06402028003954885, - "learning_rate": 0.0006413550466615952, - "loss": 1.1398, - "step": 3060 - }, - { - "epoch": 0.4686186071401269, - "grad_norm": 0.05903548215228208, - "learning_rate": 0.0006400746294927662, - "loss": 1.1552, - "step": 3065 - }, - { - "epoch": 0.46938307468847945, - "grad_norm": 0.0701643196016173, - "learning_rate": 0.000638793214719291, - "loss": 1.1333, - "step": 3070 - }, - { - "epoch": 0.47014754223683203, - "grad_norm": 0.08476933332297756, - "learning_rate": 0.0006375108114673424, - "loss": 1.0994, - "step": 3075 - }, - { - "epoch": 0.4709120097851846, - "grad_norm": 0.06475816517225963, - "learning_rate": 0.0006362274288701342, - "loss": 1.1759, - "step": 3080 - }, - { - "epoch": 0.4716764773335372, - "grad_norm": 0.06895976508220301, - "learning_rate": 0.0006349430760678538, - "loss": 1.1462, - "step": 3085 - }, - { - "epoch": 0.47244094488188976, - "grad_norm": 0.05934511939754207, - "learning_rate": 0.0006336577622075992, - "loss": 1.1841, - "step": 3090 - }, - { - "epoch": 0.47320541243024233, - "grad_norm": 0.060053855049644576, - "learning_rate": 0.0006323714964433126, - "loss": 1.1397, - "step": 3095 - }, - { - "epoch": 0.4739698799785949, - "grad_norm": 0.0727525002225497, - "learning_rate": 0.0006310842879357157, - "loss": 1.1602, - "step": 3100 - }, - { - "epoch": 0.4747343475269475, - "grad_norm": 0.06560733009273371, - "learning_rate": 0.0006297961458522444, - "loss": 1.145, - "step": 3105 - }, - { - "epoch": 0.47549881507530006, - "grad_norm": 0.06296234629565342, - "learning_rate": 0.0006285070793669836, - "loss": 1.0818, - "step": 3110 - }, - { - "epoch": 0.47626328262365264, - "grad_norm": 0.07048483200966957, - "learning_rate": 0.0006272170976606016, - "loss": 1.1443, - "step": 3115 - }, - { - "epoch": 0.4770277501720052, - "grad_norm": 0.060550118582415664, - "learning_rate": 0.0006259262099202849, - "loss": 1.1041, - "step": 3120 - }, - { - "epoch": 0.4777922177203578, - "grad_norm": 0.05970124171786194, - "learning_rate": 0.0006246344253396727, - "loss": 1.1376, - "step": 3125 - }, - { - "epoch": 0.47855668526871037, - "grad_norm": 0.06257524696187984, - "learning_rate": 0.0006233417531187914, - "loss": 1.1472, - "step": 3130 - }, - { - "epoch": 0.4793211528170629, - "grad_norm": 0.0588918841092477, - "learning_rate": 0.0006220482024639893, - "loss": 1.1095, - "step": 3135 - }, - { - "epoch": 0.48008562036541547, - "grad_norm": 0.07088042128141556, - "learning_rate": 0.0006207537825878707, - "loss": 1.1452, - "step": 3140 - }, - { - "epoch": 0.48085008791376804, - "grad_norm": 0.05691526198340547, - "learning_rate": 0.0006194585027092306, - "loss": 1.141, - "step": 3145 - }, - { - "epoch": 0.4816145554621206, - "grad_norm": 0.062009323742541686, - "learning_rate": 0.0006181623720529888, - "loss": 1.1483, - "step": 3150 - }, - { - "epoch": 0.4823790230104732, - "grad_norm": 0.07737737480019008, - "learning_rate": 0.0006168653998501242, - "loss": 1.1398, - "step": 3155 - }, - { - "epoch": 0.48314349055882577, - "grad_norm": 0.0727754305441971, - "learning_rate": 0.0006155675953376094, - "loss": 1.1404, - "step": 3160 - }, - { - "epoch": 0.48390795810717835, - "grad_norm": 0.057391906142971814, - "learning_rate": 0.0006142689677583447, - "loss": 1.1316, - "step": 3165 - }, - { - "epoch": 0.4846724256555309, - "grad_norm": 0.06229610749138481, - "learning_rate": 0.0006129695263610915, - "loss": 1.1871, - "step": 3170 - }, - { - "epoch": 0.4854368932038835, - "grad_norm": 0.09096557645409899, - "learning_rate": 0.0006116692804004083, - "loss": 1.1198, - "step": 3175 - }, - { - "epoch": 0.4862013607522361, - "grad_norm": 0.06325095081093489, - "learning_rate": 0.0006103682391365828, - "loss": 1.1363, - "step": 3180 - }, - { - "epoch": 0.48696582830058865, - "grad_norm": 0.09162697104182367, - "learning_rate": 0.0006090664118355673, - "loss": 1.1527, - "step": 3185 - }, - { - "epoch": 0.48773029584894123, - "grad_norm": 0.06262487814739563, - "learning_rate": 0.000607763807768912, - "loss": 1.198, - "step": 3190 - }, - { - "epoch": 0.4884947633972938, - "grad_norm": 0.06150141308221176, - "learning_rate": 0.0006064604362136991, - "loss": 1.1479, - "step": 3195 - }, - { - "epoch": 0.4892592309456464, - "grad_norm": 0.07871059881475885, - "learning_rate": 0.000605156306452477, - "loss": 1.1168, - "step": 3200 - }, - { - "epoch": 0.4900236984939989, - "grad_norm": 0.06617353704318331, - "learning_rate": 0.0006038514277731941, - "loss": 1.1265, - "step": 3205 - }, - { - "epoch": 0.4907881660423515, - "grad_norm": 0.06533259508339928, - "learning_rate": 0.0006025458094691323, - "loss": 1.1142, - "step": 3210 - }, - { - "epoch": 0.49155263359070406, - "grad_norm": 0.06562349068949161, - "learning_rate": 0.0006012394608388411, - "loss": 1.1551, - "step": 3215 - }, - { - "epoch": 0.49231710113905663, - "grad_norm": 0.07913282984614999, - "learning_rate": 0.0005999323911860712, - "loss": 1.1805, - "step": 3220 - }, - { - "epoch": 0.4930815686874092, - "grad_norm": 0.06329112653333181, - "learning_rate": 0.0005986246098197093, - "loss": 1.1808, - "step": 3225 - }, - { - "epoch": 0.4938460362357618, - "grad_norm": 0.05983365597317458, - "learning_rate": 0.0005973161260537095, - "loss": 1.1359, - "step": 3230 - }, - { - "epoch": 0.49461050378411436, - "grad_norm": 0.06404699549747081, - "learning_rate": 0.0005960069492070294, - "loss": 1.109, - "step": 3235 - }, - { - "epoch": 0.49537497133246694, - "grad_norm": 0.06392702550097112, - "learning_rate": 0.0005946970886035625, - "loss": 1.1195, - "step": 3240 - }, - { - "epoch": 0.4961394388808195, - "grad_norm": 0.06887946962999059, - "learning_rate": 0.0005933865535720714, - "loss": 1.1338, - "step": 3245 - }, - { - "epoch": 0.4969039064291721, - "grad_norm": 0.06321912983691443, - "learning_rate": 0.0005920753534461225, - "loss": 1.1628, - "step": 3250 - }, - { - "epoch": 0.49766837397752467, - "grad_norm": 0.05892992472619733, - "learning_rate": 0.0005907634975640191, - "loss": 1.1343, - "step": 3255 - }, - { - "epoch": 0.49843284152587725, - "grad_norm": 0.08626283038221394, - "learning_rate": 0.000589450995268734, - "loss": 1.1082, - "step": 3260 - }, - { - "epoch": 0.4991973090742298, - "grad_norm": 0.07608683156808083, - "learning_rate": 0.0005881378559078448, - "loss": 1.0954, - "step": 3265 - }, - { - "epoch": 0.4999617766225824, - "grad_norm": 0.06202931308381165, - "learning_rate": 0.0005868240888334653, - "loss": 1.1253, - "step": 3270 - }, - { - "epoch": 0.5007262441709349, - "grad_norm": 0.0693162463919691, - "learning_rate": 0.0005855097034021803, - "loss": 1.1302, - "step": 3275 - }, - { - "epoch": 0.5014907117192875, - "grad_norm": 0.06857586212270887, - "learning_rate": 0.0005841947089749782, - "loss": 1.1145, - "step": 3280 - }, - { - "epoch": 0.5022551792676401, - "grad_norm": 0.056543922942401514, - "learning_rate": 0.0005828791149171855, - "loss": 1.166, - "step": 3285 - }, - { - "epoch": 0.5030196468159926, - "grad_norm": 0.061216299482396534, - "learning_rate": 0.0005815629305983979, - "loss": 1.1258, - "step": 3290 - }, - { - "epoch": 0.5037841143643452, - "grad_norm": 0.08922514630551934, - "learning_rate": 0.000580246165392416, - "loss": 1.1201, - "step": 3295 - }, - { - "epoch": 0.5045485819126978, - "grad_norm": 0.06366534833418323, - "learning_rate": 0.000578928828677177, - "loss": 1.0926, - "step": 3300 - }, - { - "epoch": 0.5053130494610504, - "grad_norm": 0.05868222940320121, - "learning_rate": 0.0005776109298346885, - "loss": 1.123, - "step": 3305 - }, - { - "epoch": 0.506077517009403, - "grad_norm": 0.055139863246591446, - "learning_rate": 0.0005762924782509612, - "loss": 1.1111, - "step": 3310 - }, - { - "epoch": 0.5068419845577555, - "grad_norm": 0.06573454791906676, - "learning_rate": 0.0005749734833159429, - "loss": 1.113, - "step": 3315 - }, - { - "epoch": 0.5076064521061081, - "grad_norm": 0.08427239378689733, - "learning_rate": 0.0005736539544234508, - "loss": 1.1569, - "step": 3320 - }, - { - "epoch": 0.5083709196544607, - "grad_norm": 0.07558080456630767, - "learning_rate": 0.0005723339009711051, - "loss": 1.1376, - "step": 3325 - }, - { - "epoch": 0.5091353872028133, - "grad_norm": 0.05898916837152998, - "learning_rate": 0.0005710133323602616, - "loss": 1.1007, - "step": 3330 - }, - { - "epoch": 0.5098998547511658, - "grad_norm": 0.06278710607065763, - "learning_rate": 0.0005696922579959455, - "loss": 1.1288, - "step": 3335 - }, - { - "epoch": 0.5106643222995184, - "grad_norm": 0.06074976725732031, - "learning_rate": 0.0005683706872867833, - "loss": 1.131, - "step": 3340 - }, - { - "epoch": 0.511428789847871, - "grad_norm": 0.06500965731679485, - "learning_rate": 0.0005670486296449372, - "loss": 1.1073, - "step": 3345 - }, - { - "epoch": 0.5121932573962236, - "grad_norm": 0.059725258913741155, - "learning_rate": 0.0005657260944860367, - "loss": 1.122, - "step": 3350 - }, - { - "epoch": 0.5129577249445761, - "grad_norm": 0.06081393034395375, - "learning_rate": 0.0005644030912291125, - "loss": 1.1397, - "step": 3355 - }, - { - "epoch": 0.5137221924929287, - "grad_norm": 0.06500953558187635, - "learning_rate": 0.0005630796292965288, - "loss": 1.128, - "step": 3360 - }, - { - "epoch": 0.5144866600412813, - "grad_norm": 0.05991814710809413, - "learning_rate": 0.0005617557181139169, - "loss": 1.115, - "step": 3365 - }, - { - "epoch": 0.5152511275896339, - "grad_norm": 0.06774178943497386, - "learning_rate": 0.000560431367110107, - "loss": 1.1321, - "step": 3370 - }, - { - "epoch": 0.5160155951379863, - "grad_norm": 0.06508939987253937, - "learning_rate": 0.0005591065857170623, - "loss": 1.1377, - "step": 3375 - }, - { - "epoch": 0.5167800626863389, - "grad_norm": 0.06522977314137465, - "learning_rate": 0.000557781383369811, - "loss": 1.117, - "step": 3380 - }, - { - "epoch": 0.5175445302346915, - "grad_norm": 0.0636581730317922, - "learning_rate": 0.000556455769506379, - "loss": 1.0904, - "step": 3385 - }, - { - "epoch": 0.5183089977830441, - "grad_norm": 0.05994794288590292, - "learning_rate": 0.0005551297535677235, - "loss": 1.1361, - "step": 3390 - }, - { - "epoch": 0.5190734653313966, - "grad_norm": 0.07212580912491869, - "learning_rate": 0.000553803344997665, - "loss": 1.1197, - "step": 3395 - }, - { - "epoch": 0.5198379328797492, - "grad_norm": 0.06315090246868217, - "learning_rate": 0.0005524765532428204, - "loss": 1.1205, - "step": 3400 - }, - { - "epoch": 0.5206024004281018, - "grad_norm": 0.07345186841532578, - "learning_rate": 0.0005511493877525352, - "loss": 1.121, - "step": 3405 - }, - { - "epoch": 0.5213668679764544, - "grad_norm": 0.05751935815321791, - "learning_rate": 0.0005498218579788173, - "loss": 1.1279, - "step": 3410 - }, - { - "epoch": 0.522131335524807, - "grad_norm": 0.06091507104043262, - "learning_rate": 0.0005484939733762687, - "loss": 1.1366, - "step": 3415 - }, - { - "epoch": 0.5228958030731595, - "grad_norm": 0.059482594137463464, - "learning_rate": 0.0005471657434020182, - "loss": 1.1536, - "step": 3420 - }, - { - "epoch": 0.5236602706215121, - "grad_norm": 0.06510428118556476, - "learning_rate": 0.0005458371775156548, - "loss": 1.0995, - "step": 3425 - }, - { - "epoch": 0.5244247381698647, - "grad_norm": 0.06702378445812333, - "learning_rate": 0.0005445082851791597, - "loss": 1.0935, - "step": 3430 - }, - { - "epoch": 0.5251892057182173, - "grad_norm": 0.06267153627313525, - "learning_rate": 0.0005431790758568388, - "loss": 1.0971, - "step": 3435 - }, - { - "epoch": 0.5259536732665698, - "grad_norm": 0.06690110297685588, - "learning_rate": 0.0005418495590152557, - "loss": 1.1132, - "step": 3440 - }, - { - "epoch": 0.5267181408149224, - "grad_norm": 0.06857945644968291, - "learning_rate": 0.0005405197441231645, - "loss": 1.129, - "step": 3445 - }, - { - "epoch": 0.527482608363275, - "grad_norm": 0.0679067326661273, - "learning_rate": 0.0005391896406514414, - "loss": 1.1197, - "step": 3450 - }, - { - "epoch": 0.5282470759116276, - "grad_norm": 0.06495452970096306, - "learning_rate": 0.0005378592580730182, - "loss": 1.1191, - "step": 3455 - }, - { - "epoch": 0.5290115434599801, - "grad_norm": 0.07011699388524864, - "learning_rate": 0.0005365286058628144, - "loss": 1.1244, - "step": 3460 - }, - { - "epoch": 0.5297760110083327, - "grad_norm": 0.08305004226196393, - "learning_rate": 0.0005351976934976702, - "loss": 1.1122, - "step": 3465 - }, - { - "epoch": 0.5305404785566853, - "grad_norm": 0.05964304908716454, - "learning_rate": 0.0005338665304562776, - "loss": 1.0962, - "step": 3470 - }, - { - "epoch": 0.5313049461050379, - "grad_norm": 0.06608417735470593, - "learning_rate": 0.0005325351262191149, - "loss": 1.1091, - "step": 3475 - }, - { - "epoch": 0.5320694136533904, - "grad_norm": 0.06157287568970666, - "learning_rate": 0.0005312034902683779, - "loss": 1.1544, - "step": 3480 - }, - { - "epoch": 0.532833881201743, - "grad_norm": 0.0604884131717689, - "learning_rate": 0.0005298716320879124, - "loss": 1.1189, - "step": 3485 - }, - { - "epoch": 0.5335983487500956, - "grad_norm": 0.07023563410010202, - "learning_rate": 0.000528539561163147, - "loss": 1.0744, - "step": 3490 - }, - { - "epoch": 0.5343628162984482, - "grad_norm": 0.06000529959619642, - "learning_rate": 0.000527207286981026, - "loss": 1.1147, - "step": 3495 - }, - { - "epoch": 0.5351272838468007, - "grad_norm": 0.06234895133269599, - "learning_rate": 0.0005258748190299404, - "loss": 1.0933, - "step": 3500 - }, - { - "epoch": 0.5358917513951533, - "grad_norm": 0.0631654264689642, - "learning_rate": 0.0005245421667996618, - "loss": 1.151, - "step": 3505 - }, - { - "epoch": 0.5366562189435059, - "grad_norm": 0.06627341764797444, - "learning_rate": 0.000523209339781274, - "loss": 1.0995, - "step": 3510 - }, - { - "epoch": 0.5374206864918584, - "grad_norm": 0.07201555609957591, - "learning_rate": 0.0005218763474671058, - "loss": 1.0883, - "step": 3515 - }, - { - "epoch": 0.538185154040211, - "grad_norm": 0.0713275731303358, - "learning_rate": 0.000520543199350663, - "loss": 1.105, - "step": 3520 - }, - { - "epoch": 0.5389496215885635, - "grad_norm": 0.06821865094908078, - "learning_rate": 0.0005192099049265613, - "loss": 1.0769, - "step": 3525 - }, - { - "epoch": 0.5397140891369161, - "grad_norm": 0.06557378936774873, - "learning_rate": 0.0005178764736904582, - "loss": 1.1454, - "step": 3530 - }, - { - "epoch": 0.5404785566852687, - "grad_norm": 0.06045084371867397, - "learning_rate": 0.0005165429151389856, - "loss": 1.105, - "step": 3535 - }, - { - "epoch": 0.5412430242336213, - "grad_norm": 0.06242346673470062, - "learning_rate": 0.0005152092387696821, - "loss": 1.1108, - "step": 3540 - }, - { - "epoch": 0.5420074917819738, - "grad_norm": 0.06172204231908025, - "learning_rate": 0.0005138754540809253, - "loss": 1.1299, - "step": 3545 - }, - { - "epoch": 0.5427719593303264, - "grad_norm": 0.05825804942006388, - "learning_rate": 0.0005125415705718646, - "loss": 1.1023, - "step": 3550 - }, - { - "epoch": 0.543536426878679, - "grad_norm": 0.06796344310929125, - "learning_rate": 0.0005112075977423532, - "loss": 1.1216, - "step": 3555 - }, - { - "epoch": 0.5443008944270316, - "grad_norm": 0.058632756425142235, - "learning_rate": 0.0005098735450928799, - "loss": 1.1226, - "step": 3560 - }, - { - "epoch": 0.5450653619753841, - "grad_norm": 0.06265232814361309, - "learning_rate": 0.0005085394221245021, - "loss": 1.1166, - "step": 3565 - }, - { - "epoch": 0.5458298295237367, - "grad_norm": 0.06577998911234535, - "learning_rate": 0.0005072052383387787, - "loss": 1.0894, - "step": 3570 - }, - { - "epoch": 0.5465942970720893, - "grad_norm": 0.07545067157181294, - "learning_rate": 0.0005058710032377008, - "loss": 1.1264, - "step": 3575 - }, - { - "epoch": 0.5473587646204419, - "grad_norm": 0.07311060599396883, - "learning_rate": 0.0005045367263236257, - "loss": 1.1257, - "step": 3580 - }, - { - "epoch": 0.5481232321687944, - "grad_norm": 0.06148576987581805, - "learning_rate": 0.000503202417099208, - "loss": 1.1184, - "step": 3585 - }, - { - "epoch": 0.548887699717147, - "grad_norm": 0.06467766525296764, - "learning_rate": 0.0005018680850673327, - "loss": 1.1132, - "step": 3590 - }, - { - "epoch": 0.5496521672654996, - "grad_norm": 0.06393600821432976, - "learning_rate": 0.0005005337397310469, - "loss": 1.1094, - "step": 3595 - }, - { - "epoch": 0.5504166348138522, - "grad_norm": 0.0621691916365975, - "learning_rate": 0.0004991993905934931, - "loss": 1.1361, - "step": 3600 - }, - { - "epoch": 0.5511811023622047, - "grad_norm": 0.06724351675549163, - "learning_rate": 0.0004978650471578402, - "loss": 1.1147, - "step": 3605 - }, - { - "epoch": 0.5519455699105573, - "grad_norm": 0.05679252783616519, - "learning_rate": 0.000496530718927217, - "loss": 1.1256, - "step": 3610 - }, - { - "epoch": 0.5527100374589099, - "grad_norm": 0.057881098463542474, - "learning_rate": 0.0004951964154046432, - "loss": 1.1106, - "step": 3615 - }, - { - "epoch": 0.5534745050072625, - "grad_norm": 0.06292887711056025, - "learning_rate": 0.0004938621460929639, - "loss": 1.1134, - "step": 3620 - }, - { - "epoch": 0.554238972555615, - "grad_norm": 0.06590848453173402, - "learning_rate": 0.0004925279204947789, - "loss": 1.1275, - "step": 3625 - }, - { - "epoch": 0.5550034401039676, - "grad_norm": 0.06325873950215483, - "learning_rate": 0.0004911937481123783, - "loss": 1.09, - "step": 3630 - }, - { - "epoch": 0.5557679076523202, - "grad_norm": 0.05608984482155, - "learning_rate": 0.0004898596384476718, - "loss": 1.1412, - "step": 3635 - }, - { - "epoch": 0.5565323752006728, - "grad_norm": 0.07138390520772304, - "learning_rate": 0.0004885256010021232, - "loss": 1.087, - "step": 3640 - }, - { - "epoch": 0.5572968427490254, - "grad_norm": 0.06847884401062614, - "learning_rate": 0.0004871916452766822, - "loss": 1.0987, - "step": 3645 - }, - { - "epoch": 0.5580613102973778, - "grad_norm": 0.06778957445525216, - "learning_rate": 0.00048585778077171586, - "loss": 1.0987, - "step": 3650 - }, - { - "epoch": 0.5588257778457304, - "grad_norm": 0.062458614692809604, - "learning_rate": 0.00048452401698694154, - "loss": 1.0803, - "step": 3655 - }, - { - "epoch": 0.559590245394083, - "grad_norm": 0.06609876186419025, - "learning_rate": 0.00048319036342135983, - "loss": 1.1123, - "step": 3660 - }, - { - "epoch": 0.5603547129424356, - "grad_norm": 0.06691503571502525, - "learning_rate": 0.00048185682957318604, - "loss": 1.0857, - "step": 3665 - }, - { - "epoch": 0.5611191804907881, - "grad_norm": 0.06633733246653169, - "learning_rate": 0.00048052342493978275, - "loss": 1.1082, - "step": 3670 - }, - { - "epoch": 0.5618836480391407, - "grad_norm": 0.06504840824047256, - "learning_rate": 0.0004791901590175926, - "loss": 1.1154, - "step": 3675 - }, - { - "epoch": 0.5626481155874933, - "grad_norm": 0.0603761920675834, - "learning_rate": 0.0004778570413020702, - "loss": 1.0871, - "step": 3680 - }, - { - "epoch": 0.5634125831358459, - "grad_norm": 0.058914355257856044, - "learning_rate": 0.0004765240812876141, - "loss": 1.1103, - "step": 3685 - }, - { - "epoch": 0.5641770506841984, - "grad_norm": 0.06502988982340185, - "learning_rate": 0.00047519128846750083, - "loss": 1.15, - "step": 3690 - }, - { - "epoch": 0.564941518232551, - "grad_norm": 0.07280984153816127, - "learning_rate": 0.00047385867233381516, - "loss": 1.1121, - "step": 3695 - }, - { - "epoch": 0.5657059857809036, - "grad_norm": 0.06994408916357317, - "learning_rate": 0.0004725262423773838, - "loss": 1.1293, - "step": 3700 - }, - { - "epoch": 0.5664704533292562, - "grad_norm": 0.0621673770156963, - "learning_rate": 0.0004711940080877079, - "loss": 1.0852, - "step": 3705 - }, - { - "epoch": 0.5672349208776087, - "grad_norm": 0.05817651318162582, - "learning_rate": 0.0004698619789528948, - "loss": 1.1301, - "step": 3710 - }, - { - "epoch": 0.5679993884259613, - "grad_norm": 0.06081671877206097, - "learning_rate": 0.00046853016445959014, - "loss": 1.1398, - "step": 3715 - }, - { - "epoch": 0.5687638559743139, - "grad_norm": 0.06188571604259829, - "learning_rate": 0.00046719857409291226, - "loss": 1.1731, - "step": 3720 - }, - { - "epoch": 0.5695283235226665, - "grad_norm": 0.06243919731382698, - "learning_rate": 0.00046586721733638204, - "loss": 1.0922, - "step": 3725 - }, - { - "epoch": 0.570292791071019, - "grad_norm": 0.06337869980709998, - "learning_rate": 0.00046453610367185695, - "loss": 1.1325, - "step": 3730 - }, - { - "epoch": 0.5710572586193716, - "grad_norm": 0.06302625565447471, - "learning_rate": 0.00046320524257946363, - "loss": 1.0613, - "step": 3735 - }, - { - "epoch": 0.5718217261677242, - "grad_norm": 0.06598219655258462, - "learning_rate": 0.00046187464353752945, - "loss": 1.0962, - "step": 3740 - }, - { - "epoch": 0.5725861937160768, - "grad_norm": 0.06698025179482835, - "learning_rate": 0.0004605443160225152, - "loss": 1.1107, - "step": 3745 - }, - { - "epoch": 0.5733506612644294, - "grad_norm": 0.06756310322497935, - "learning_rate": 0.00045921426950894885, - "loss": 1.1511, - "step": 3750 - }, - { - "epoch": 0.5741151288127819, - "grad_norm": 0.06396486586356562, - "learning_rate": 0.00045788451346935605, - "loss": 1.0729, - "step": 3755 - }, - { - "epoch": 0.5748795963611345, - "grad_norm": 0.0732526041311009, - "learning_rate": 0.0004565550573741942, - "loss": 1.1073, - "step": 3760 - }, - { - "epoch": 0.5756440639094871, - "grad_norm": 0.06998612377117433, - "learning_rate": 0.0004552259106917846, - "loss": 1.128, - "step": 3765 - }, - { - "epoch": 0.5764085314578397, - "grad_norm": 0.06616565417315289, - "learning_rate": 0.0004538970828882447, - "loss": 1.0956, - "step": 3770 - }, - { - "epoch": 0.5771729990061922, - "grad_norm": 0.06108422217657452, - "learning_rate": 0.00045256858342742094, - "loss": 1.0987, - "step": 3775 - }, - { - "epoch": 0.5779374665545448, - "grad_norm": 0.05824837749243198, - "learning_rate": 0.0004512404217708217, - "loss": 1.0806, - "step": 3780 - }, - { - "epoch": 0.5787019341028974, - "grad_norm": 0.06266413521523766, - "learning_rate": 0.00044991260737754877, - "loss": 1.102, - "step": 3785 - }, - { - "epoch": 0.5794664016512499, - "grad_norm": 0.060441791106991334, - "learning_rate": 0.0004485851497042312, - "loss": 1.0884, - "step": 3790 - }, - { - "epoch": 0.5802308691996024, - "grad_norm": 0.060797439754043565, - "learning_rate": 0.0004472580582049578, - "loss": 1.098, - "step": 3795 - }, - { - "epoch": 0.580995336747955, - "grad_norm": 0.062023942489464336, - "learning_rate": 0.000445931342331209, - "loss": 1.0919, - "step": 3800 - }, - { - "epoch": 0.5817598042963076, - "grad_norm": 0.06418430339347835, - "learning_rate": 0.00044460501153179016, - "loss": 1.1017, - "step": 3805 - }, - { - "epoch": 0.5825242718446602, - "grad_norm": 0.060619213747711925, - "learning_rate": 0.00044327907525276456, - "loss": 1.0849, - "step": 3810 - }, - { - "epoch": 0.5832887393930127, - "grad_norm": 0.057326188671437316, - "learning_rate": 0.0004419535429373848, - "loss": 1.1172, - "step": 3815 - }, - { - "epoch": 0.5840532069413653, - "grad_norm": 0.06607758733161419, - "learning_rate": 0.00044062842402602776, - "loss": 1.0928, - "step": 3820 - }, - { - "epoch": 0.5848176744897179, - "grad_norm": 0.07029009256072538, - "learning_rate": 0.000439303727956125, - "loss": 1.095, - "step": 3825 - }, - { - "epoch": 0.5855821420380705, - "grad_norm": 0.0710751111387438, - "learning_rate": 0.0004379794641620969, - "loss": 1.0881, - "step": 3830 - }, - { - "epoch": 0.586346609586423, - "grad_norm": 0.06321187821581618, - "learning_rate": 0.00043665564207528556, - "loss": 1.1249, - "step": 3835 - }, - { - "epoch": 0.5871110771347756, - "grad_norm": 0.06167451377736427, - "learning_rate": 0.0004353322711238869, - "loss": 1.1137, - "step": 3840 - }, - { - "epoch": 0.5878755446831282, - "grad_norm": 0.060565668504396965, - "learning_rate": 0.0004340093607328839, - "loss": 1.1036, - "step": 3845 - }, - { - "epoch": 0.5886400122314808, - "grad_norm": 0.0622976248859124, - "learning_rate": 0.00043268692032397984, - "loss": 1.0913, - "step": 3850 - }, - { - "epoch": 0.5894044797798333, - "grad_norm": 0.06391396857013981, - "learning_rate": 0.0004313649593155301, - "loss": 1.0847, - "step": 3855 - }, - { - "epoch": 0.5901689473281859, - "grad_norm": 0.07423916257372261, - "learning_rate": 0.0004300434871224763, - "loss": 1.0901, - "step": 3860 - }, - { - "epoch": 0.5909334148765385, - "grad_norm": 0.062338256577621556, - "learning_rate": 0.00042872251315627884, - "loss": 1.0926, - "step": 3865 - }, - { - "epoch": 0.5916978824248911, - "grad_norm": 0.06178660389841415, - "learning_rate": 0.0004274020468248494, - "loss": 1.0754, - "step": 3870 - }, - { - "epoch": 0.5924623499732437, - "grad_norm": 0.058847696827950484, - "learning_rate": 0.0004260820975324844, - "loss": 1.0788, - "step": 3875 - }, - { - "epoch": 0.5932268175215962, - "grad_norm": 0.061443746079662805, - "learning_rate": 0.00042476267467979827, - "loss": 1.0796, - "step": 3880 - }, - { - "epoch": 0.5939912850699488, - "grad_norm": 0.06810138628824276, - "learning_rate": 0.0004234437876636557, - "loss": 1.0977, - "step": 3885 - }, - { - "epoch": 0.5947557526183014, - "grad_norm": 0.07494921338482857, - "learning_rate": 0.00042212544587710534, - "loss": 1.0967, - "step": 3890 - }, - { - "epoch": 0.595520220166654, - "grad_norm": 0.06867273849143388, - "learning_rate": 0.00042080765870931294, - "loss": 1.1117, - "step": 3895 - }, - { - "epoch": 0.5962846877150065, - "grad_norm": 0.06708271782909088, - "learning_rate": 0.00041949043554549405, - "loss": 1.0985, - "step": 3900 - }, - { - "epoch": 0.5970491552633591, - "grad_norm": 0.05385745159990605, - "learning_rate": 0.00041817378576684746, - "loss": 1.1013, - "step": 3905 - }, - { - "epoch": 0.5978136228117117, - "grad_norm": 0.059032475231314455, - "learning_rate": 0.0004168577187504884, - "loss": 1.0984, - "step": 3910 - }, - { - "epoch": 0.5985780903600643, - "grad_norm": 0.0527291048836883, - "learning_rate": 0.00041554224386938186, - "loss": 1.0873, - "step": 3915 - }, - { - "epoch": 0.5993425579084168, - "grad_norm": 0.06282472550713358, - "learning_rate": 0.00041422737049227496, - "loss": 1.1133, - "step": 3920 - }, - { - "epoch": 0.6001070254567693, - "grad_norm": 0.06637056841657221, - "learning_rate": 0.000412913107983632, - "loss": 1.0733, - "step": 3925 - }, - { - "epoch": 0.6008714930051219, - "grad_norm": 0.07134078790379195, - "learning_rate": 0.00041159946570356584, - "loss": 1.0589, - "step": 3930 - }, - { - "epoch": 0.6016359605534745, - "grad_norm": 0.059395620562726396, - "learning_rate": 0.0004102864530077724, - "loss": 1.0624, - "step": 3935 - }, - { - "epoch": 0.602400428101827, - "grad_norm": 0.059597386222636405, - "learning_rate": 0.000408974079247464, - "loss": 1.1008, - "step": 3940 - }, - { - "epoch": 0.6031648956501796, - "grad_norm": 0.057839152335023306, - "learning_rate": 0.00040766235376930214, - "loss": 1.1103, - "step": 3945 - }, - { - "epoch": 0.6039293631985322, - "grad_norm": 0.061849685617478466, - "learning_rate": 0.0004063512859153311, - "loss": 1.0816, - "step": 3950 - }, - { - "epoch": 0.6046938307468848, - "grad_norm": 0.06508877768701075, - "learning_rate": 0.00040504088502291234, - "loss": 1.1081, - "step": 3955 - }, - { - "epoch": 0.6054582982952373, - "grad_norm": 0.07683797410703767, - "learning_rate": 0.00040373116042465647, - "loss": 1.1111, - "step": 3960 - }, - { - "epoch": 0.6062227658435899, - "grad_norm": 0.06757768507720088, - "learning_rate": 0.0004024221214483579, - "loss": 1.0925, - "step": 3965 - }, - { - "epoch": 0.6069872333919425, - "grad_norm": 0.06593559024726058, - "learning_rate": 0.00040111377741692834, - "loss": 1.0891, - "step": 3970 - }, - { - "epoch": 0.6077517009402951, - "grad_norm": 0.05952734135732054, - "learning_rate": 0.00039980613764832975, - "loss": 1.0705, - "step": 3975 - }, - { - "epoch": 0.6085161684886476, - "grad_norm": 0.06174361114814803, - "learning_rate": 0.00039849921145550807, - "loss": 1.1021, - "step": 3980 - }, - { - "epoch": 0.6092806360370002, - "grad_norm": 0.07045898974414644, - "learning_rate": 0.00039719300814632823, - "loss": 1.1107, - "step": 3985 - }, - { - "epoch": 0.6100451035853528, - "grad_norm": 0.059203133295731424, - "learning_rate": 0.00039588753702350584, - "loss": 1.1137, - "step": 3990 - }, - { - "epoch": 0.6108095711337054, - "grad_norm": 0.07049531573486653, - "learning_rate": 0.00039458280738454213, - "loss": 1.0764, - "step": 3995 - }, - { - "epoch": 0.611574038682058, - "grad_norm": 0.06407896548037507, - "learning_rate": 0.0003932788285216579, - "loss": 1.0802, - "step": 4000 - }, - { - "epoch": 0.6123385062304105, - "grad_norm": 0.062705551728603, - "learning_rate": 0.0003919756097217266, - "loss": 1.0549, - "step": 4005 - }, - { - "epoch": 0.6131029737787631, - "grad_norm": 0.06559872784175859, - "learning_rate": 0.0003906731602662087, - "loss": 1.1088, - "step": 4010 - }, - { - "epoch": 0.6138674413271157, - "grad_norm": 0.06427225726298613, - "learning_rate": 0.0003893714894310855, - "loss": 1.0621, - "step": 4015 - }, - { - "epoch": 0.6146319088754683, - "grad_norm": 0.06406941756324243, - "learning_rate": 0.0003880706064867926, - "loss": 1.0855, - "step": 4020 - }, - { - "epoch": 0.6153963764238208, - "grad_norm": 0.06576877266118943, - "learning_rate": 0.00038677052069815477, - "loss": 1.0793, - "step": 4025 - }, - { - "epoch": 0.6161608439721734, - "grad_norm": 0.06459197784453244, - "learning_rate": 0.0003854712413243192, - "loss": 1.091, - "step": 4030 - }, - { - "epoch": 0.616925311520526, - "grad_norm": 0.05811327949041547, - "learning_rate": 0.0003841727776186899, - "loss": 1.08, - "step": 4035 - }, - { - "epoch": 0.6176897790688786, - "grad_norm": 0.07215085049566354, - "learning_rate": 0.00038287513882886195, - "loss": 1.0611, - "step": 4040 - }, - { - "epoch": 0.6184542466172311, - "grad_norm": 0.06252405374750149, - "learning_rate": 0.00038157833419655507, - "loss": 1.088, - "step": 4045 - }, - { - "epoch": 0.6192187141655837, - "grad_norm": 0.06008502958042993, - "learning_rate": 0.00038028237295754805, - "loss": 1.1198, - "step": 4050 - }, - { - "epoch": 0.6199831817139363, - "grad_norm": 0.06330961932652555, - "learning_rate": 0.0003789872643416138, - "loss": 1.0562, - "step": 4055 - }, - { - "epoch": 0.6207476492622889, - "grad_norm": 0.058986400518205764, - "learning_rate": 0.00037769301757245214, - "loss": 1.1104, - "step": 4060 - }, - { - "epoch": 0.6215121168106413, - "grad_norm": 0.05855844822094068, - "learning_rate": 0.00037639964186762505, - "loss": 1.0794, - "step": 4065 - }, - { - "epoch": 0.6222765843589939, - "grad_norm": 0.06760118473112167, - "learning_rate": 0.00037510714643849107, - "loss": 1.0467, - "step": 4070 - }, - { - "epoch": 0.6230410519073465, - "grad_norm": 0.05867534449074983, - "learning_rate": 0.00037381554049013946, - "loss": 1.0938, - "step": 4075 - }, - { - "epoch": 0.6238055194556991, - "grad_norm": 0.07682190561405723, - "learning_rate": 0.00037252483322132386, - "loss": 1.0884, - "step": 4080 - }, - { - "epoch": 0.6245699870040516, - "grad_norm": 0.07229420016537297, - "learning_rate": 0.00037123503382439894, - "loss": 1.0984, - "step": 4085 - }, - { - "epoch": 0.6253344545524042, - "grad_norm": 0.06570366560388953, - "learning_rate": 0.00036994615148525236, - "loss": 1.0887, - "step": 4090 - }, - { - "epoch": 0.6260989221007568, - "grad_norm": 0.061864508617144216, - "learning_rate": 0.00036865819538324095, - "loss": 1.1112, - "step": 4095 - }, - { - "epoch": 0.6268633896491094, - "grad_norm": 0.0656439941017718, - "learning_rate": 0.0003673711746911252, - "loss": 1.0741, - "step": 4100 - }, - { - "epoch": 0.627627857197462, - "grad_norm": 0.055905760438808745, - "learning_rate": 0.00036608509857500335, - "loss": 1.0459, - "step": 4105 - }, - { - "epoch": 0.6283923247458145, - "grad_norm": 0.06674862047297604, - "learning_rate": 0.00036479997619424606, - "loss": 1.0918, - "step": 4110 - }, - { - "epoch": 0.6291567922941671, - "grad_norm": 0.060859055527957194, - "learning_rate": 0.0003635158167014326, - "loss": 1.0804, - "step": 4115 - }, - { - "epoch": 0.6299212598425197, - "grad_norm": 0.06537100475249158, - "learning_rate": 0.0003622326292422834, - "loss": 1.0847, - "step": 4120 - }, - { - "epoch": 0.6306857273908723, - "grad_norm": 0.06227343013649914, - "learning_rate": 0.0003609504229555969, - "loss": 1.0853, - "step": 4125 - }, - { - "epoch": 0.6314501949392248, - "grad_norm": 0.06841807299363888, - "learning_rate": 0.0003596692069731835, - "loss": 1.0578, - "step": 4130 - }, - { - "epoch": 0.6322146624875774, - "grad_norm": 0.0736451804635189, - "learning_rate": 0.00035838899041980055, - "loss": 1.0817, - "step": 4135 - }, - { - "epoch": 0.63297913003593, - "grad_norm": 0.05488620850993582, - "learning_rate": 0.0003571097824130873, - "loss": 1.0921, - "step": 4140 - }, - { - "epoch": 0.6337435975842826, - "grad_norm": 0.06282145805968094, - "learning_rate": 0.0003558315920635011, - "loss": 1.0378, - "step": 4145 - }, - { - "epoch": 0.6345080651326351, - "grad_norm": 0.06445119265430188, - "learning_rate": 0.0003545544284742506, - "loss": 1.0929, - "step": 4150 - }, - { - "epoch": 0.6352725326809877, - "grad_norm": 0.056909601872114204, - "learning_rate": 0.0003532783007412321, - "loss": 1.1056, - "step": 4155 - }, - { - "epoch": 0.6360370002293403, - "grad_norm": 0.060124580046915954, - "learning_rate": 0.0003520032179529652, - "loss": 1.1178, - "step": 4160 - }, - { - "epoch": 0.6368014677776929, - "grad_norm": 0.06314252699342483, - "learning_rate": 0.00035072918919052683, - "loss": 1.0837, - "step": 4165 - }, - { - "epoch": 0.6375659353260454, - "grad_norm": 0.06686385252802637, - "learning_rate": 0.0003494562235274873, - "loss": 1.0949, - "step": 4170 - }, - { - "epoch": 0.638330402874398, - "grad_norm": 0.06101338870198361, - "learning_rate": 0.0003481843300298459, - "loss": 1.0786, - "step": 4175 - }, - { - "epoch": 0.6390948704227506, - "grad_norm": 0.060474410756929865, - "learning_rate": 0.0003469135177559656, - "loss": 1.1157, - "step": 4180 - }, - { - "epoch": 0.6398593379711032, - "grad_norm": 0.06235109250261373, - "learning_rate": 0.00034564379575650916, - "loss": 1.0814, - "step": 4185 - }, - { - "epoch": 0.6406238055194557, - "grad_norm": 0.07102337030993396, - "learning_rate": 0.0003443751730743745, - "loss": 1.0545, - "step": 4190 - }, - { - "epoch": 0.6413882730678083, - "grad_norm": 0.06922307618811946, - "learning_rate": 0.00034310765874463007, - "loss": 1.0813, - "step": 4195 - }, - { - "epoch": 0.6421527406161608, - "grad_norm": 0.06734659315916373, - "learning_rate": 0.000341841261794451, - "loss": 1.0821, - "step": 4200 - }, - { - "epoch": 0.6429172081645134, - "grad_norm": 0.07173326298860931, - "learning_rate": 0.0003405759912430543, - "loss": 1.0853, - "step": 4205 - }, - { - "epoch": 0.6436816757128659, - "grad_norm": 0.060997631416030235, - "learning_rate": 0.00033931185610163437, - "loss": 1.0501, - "step": 4210 - }, - { - "epoch": 0.6444461432612185, - "grad_norm": 0.0678413835009176, - "learning_rate": 0.0003380488653733004, - "loss": 1.0596, - "step": 4215 - }, - { - "epoch": 0.6452106108095711, - "grad_norm": 0.06764818450679011, - "learning_rate": 0.00033678702805301007, - "loss": 1.1031, - "step": 4220 - }, - { - "epoch": 0.6459750783579237, - "grad_norm": 0.06064356181998237, - "learning_rate": 0.0003355263531275069, - "loss": 1.0926, - "step": 4225 - }, - { - "epoch": 0.6467395459062762, - "grad_norm": 0.07643917264349942, - "learning_rate": 0.0003342668495752561, - "loss": 1.102, - "step": 4230 - }, - { - "epoch": 0.6475040134546288, - "grad_norm": 0.06014169982393995, - "learning_rate": 0.00033300852636638026, - "loss": 1.097, - "step": 4235 - }, - { - "epoch": 0.6482684810029814, - "grad_norm": 0.07020185439225457, - "learning_rate": 0.00033175139246259533, - "loss": 1.081, - "step": 4240 - }, - { - "epoch": 0.649032948551334, - "grad_norm": 0.061835432421758194, - "learning_rate": 0.0003304954568171478, - "loss": 1.151, - "step": 4245 - }, - { - "epoch": 0.6497974160996866, - "grad_norm": 0.05936020939229027, - "learning_rate": 0.00032924072837474936, - "loss": 1.06, - "step": 4250 - }, - { - "epoch": 0.6505618836480391, - "grad_norm": 0.06073312626281398, - "learning_rate": 0.0003279872160715146, - "loss": 1.0694, - "step": 4255 - }, - { - "epoch": 0.6513263511963917, - "grad_norm": 0.05551866637743777, - "learning_rate": 0.00032673492883489693, - "loss": 1.0963, - "step": 4260 - }, - { - "epoch": 0.6520908187447443, - "grad_norm": 0.057095759169672695, - "learning_rate": 0.0003254838755836245, - "loss": 1.0736, - "step": 4265 - }, - { - "epoch": 0.6528552862930969, - "grad_norm": 0.05824058789550292, - "learning_rate": 0.0003242340652276371, - "loss": 1.0632, - "step": 4270 - }, - { - "epoch": 0.6536197538414494, - "grad_norm": 0.06447238459426781, - "learning_rate": 0.00032298550666802315, - "loss": 1.0641, - "step": 4275 - }, - { - "epoch": 0.654384221389802, - "grad_norm": 0.07348007672974653, - "learning_rate": 0.0003217382087969554, - "loss": 1.0919, - "step": 4280 - }, - { - "epoch": 0.6551486889381546, - "grad_norm": 0.06202495691783003, - "learning_rate": 0.000320492180497628, - "loss": 1.1166, - "step": 4285 - }, - { - "epoch": 0.6559131564865072, - "grad_norm": 0.05877415292293241, - "learning_rate": 0.0003192474306441936, - "loss": 1.045, - "step": 4290 - }, - { - "epoch": 0.6566776240348597, - "grad_norm": 0.05983915724082635, - "learning_rate": 0.00031800396810169963, - "loss": 1.0807, - "step": 4295 - }, - { - "epoch": 0.6574420915832123, - "grad_norm": 0.06659604980593338, - "learning_rate": 0.00031676180172602523, - "loss": 1.101, - "step": 4300 - }, - { - "epoch": 0.6582065591315649, - "grad_norm": 0.06638853925039498, - "learning_rate": 0.00031552094036381873, - "loss": 1.0828, - "step": 4305 - }, - { - "epoch": 0.6589710266799175, - "grad_norm": 0.06147261804251727, - "learning_rate": 0.0003142813928524338, - "loss": 1.0734, - "step": 4310 - }, - { - "epoch": 0.65973549422827, - "grad_norm": 0.06859661731494095, - "learning_rate": 0.000313043168019867, - "loss": 1.0989, - "step": 4315 - }, - { - "epoch": 0.6604999617766226, - "grad_norm": 0.06412265796659967, - "learning_rate": 0.00031180627468469494, - "loss": 1.0289, - "step": 4320 - }, - { - "epoch": 0.6612644293249752, - "grad_norm": 0.06624448181748696, - "learning_rate": 0.0003105707216560114, - "loss": 1.0555, - "step": 4325 - }, - { - "epoch": 0.6620288968733278, - "grad_norm": 0.05973287720890624, - "learning_rate": 0.0003093365177333644, - "loss": 1.0933, - "step": 4330 - }, - { - "epoch": 0.6627933644216804, - "grad_norm": 0.06311393586816254, - "learning_rate": 0.0003081036717066938, - "loss": 1.0869, - "step": 4335 - }, - { - "epoch": 0.6635578319700328, - "grad_norm": 0.06408202960534053, - "learning_rate": 0.0003068721923562688, - "loss": 1.0702, - "step": 4340 - }, - { - "epoch": 0.6643222995183854, - "grad_norm": 0.05725889171250293, - "learning_rate": 0.0003056420884526244, - "loss": 1.0778, - "step": 4345 - }, - { - "epoch": 0.665086767066738, - "grad_norm": 0.05862869882873112, - "learning_rate": 0.00030441336875650115, - "loss": 1.0706, - "step": 4350 - }, - { - "epoch": 0.6658512346150905, - "grad_norm": 0.061265898924405884, - "learning_rate": 0.00030318604201877994, - "loss": 1.0869, - "step": 4355 - }, - { - "epoch": 0.6666157021634431, - "grad_norm": 0.06660471122968116, - "learning_rate": 0.0003019601169804216, - "loss": 1.096, - "step": 4360 - }, - { - "epoch": 0.6673801697117957, - "grad_norm": 0.06883224741443655, - "learning_rate": 0.0003007356023724045, - "loss": 1.0725, - "step": 4365 - }, - { - "epoch": 0.6681446372601483, - "grad_norm": 0.06613070267430747, - "learning_rate": 0.00029951250691566156, - "loss": 1.0636, - "step": 4370 - }, - { - "epoch": 0.6689091048085009, - "grad_norm": 0.05514526491107288, - "learning_rate": 0.00029829083932101844, - "loss": 1.0718, - "step": 4375 - }, - { - "epoch": 0.6696735723568534, - "grad_norm": 0.06035585077122042, - "learning_rate": 0.00029707060828913225, - "loss": 1.0601, - "step": 4380 - }, - { - "epoch": 0.670438039905206, - "grad_norm": 0.06401962624174869, - "learning_rate": 0.0002958518225104283, - "loss": 1.1028, - "step": 4385 - }, - { - "epoch": 0.6712025074535586, - "grad_norm": 0.06137496192489166, - "learning_rate": 0.00029463449066503936, - "loss": 1.0915, - "step": 4390 - }, - { - "epoch": 0.6719669750019112, - "grad_norm": 0.061351092117864735, - "learning_rate": 0.00029341862142274306, - "loss": 1.1007, - "step": 4395 - }, - { - "epoch": 0.6727314425502637, - "grad_norm": 0.06025596628948086, - "learning_rate": 0.00029220422344290053, - "loss": 1.0421, - "step": 4400 - }, - { - "epoch": 0.6734959100986163, - "grad_norm": 0.057708276936275546, - "learning_rate": 0.0002909913053743949, - "loss": 1.073, - "step": 4405 - }, - { - "epoch": 0.6742603776469689, - "grad_norm": 0.05934715222749201, - "learning_rate": 0.000289779875855569, - "loss": 1.0912, - "step": 4410 - }, - { - "epoch": 0.6750248451953215, - "grad_norm": 0.05766840269365502, - "learning_rate": 0.00028856994351416433, - "loss": 1.0883, - "step": 4415 - }, - { - "epoch": 0.675789312743674, - "grad_norm": 0.05304655893207912, - "learning_rate": 0.0002873615169672601, - "loss": 1.0014, - "step": 4420 - }, - { - "epoch": 0.6765537802920266, - "grad_norm": 0.06298752379204364, - "learning_rate": 0.00028615460482121096, - "loss": 1.0319, - "step": 4425 - }, - { - "epoch": 0.6773182478403792, - "grad_norm": 0.06572666933184156, - "learning_rate": 0.00028494921567158557, - "loss": 1.0627, - "step": 4430 - }, - { - "epoch": 0.6780827153887318, - "grad_norm": 0.05564925869133632, - "learning_rate": 0.0002837453581031068, - "loss": 1.0647, - "step": 4435 - }, - { - "epoch": 0.6788471829370843, - "grad_norm": 0.08038005236660152, - "learning_rate": 0.00028254304068958924, - "loss": 1.0436, - "step": 4440 - }, - { - "epoch": 0.6796116504854369, - "grad_norm": 0.07054627355167405, - "learning_rate": 0.0002813422719938784, - "loss": 1.0817, - "step": 4445 - }, - { - "epoch": 0.6803761180337895, - "grad_norm": 0.06023512683419835, - "learning_rate": 0.00028014306056779026, - "loss": 1.0844, - "step": 4450 - }, - { - "epoch": 0.6811405855821421, - "grad_norm": 0.06141926824245988, - "learning_rate": 0.0002789454149520497, - "loss": 1.0579, - "step": 4455 - }, - { - "epoch": 0.6819050531304947, - "grad_norm": 0.05608369336806669, - "learning_rate": 0.00027774934367622993, - "loss": 1.0871, - "step": 4460 - }, - { - "epoch": 0.6826695206788472, - "grad_norm": 0.06081877401068415, - "learning_rate": 0.0002765548552586926, - "loss": 1.0833, - "step": 4465 - }, - { - "epoch": 0.6834339882271998, - "grad_norm": 0.058671855702705646, - "learning_rate": 0.00027536195820652504, - "loss": 1.0955, - "step": 4470 - }, - { - "epoch": 0.6841984557755524, - "grad_norm": 0.06570010670273176, - "learning_rate": 0.0002741706610154816, - "loss": 1.0776, - "step": 4475 - }, - { - "epoch": 0.6849629233239048, - "grad_norm": 0.0621757019848029, - "learning_rate": 0.00027298097216992283, - "loss": 1.0787, - "step": 4480 - }, - { - "epoch": 0.6857273908722574, - "grad_norm": 0.05545832112022665, - "learning_rate": 0.00027179290014275447, - "loss": 1.0497, - "step": 4485 - }, - { - "epoch": 0.68649185842061, - "grad_norm": 0.06138638133688741, - "learning_rate": 0.0002706064533953667, - "loss": 1.0874, - "step": 4490 - }, - { - "epoch": 0.6872563259689626, - "grad_norm": 0.06238495792553165, - "learning_rate": 0.00026942164037757567, - "loss": 1.0474, - "step": 4495 - }, - { - "epoch": 0.6880207935173152, - "grad_norm": 0.06011230852606334, - "learning_rate": 0.00026823846952756127, - "loss": 1.0881, - "step": 4500 - }, - { - "epoch": 0.6887852610656677, - "grad_norm": 0.059382096793669986, - "learning_rate": 0.00026705694927180856, - "loss": 1.0691, - "step": 4505 - }, - { - "epoch": 0.6895497286140203, - "grad_norm": 0.06765967195924834, - "learning_rate": 0.00026587708802504675, - "loss": 1.0529, - "step": 4510 - }, - { - "epoch": 0.6903141961623729, - "grad_norm": 0.06898816086964697, - "learning_rate": 0.00026469889419018983, - "loss": 1.0509, - "step": 4515 - }, - { - "epoch": 0.6910786637107255, - "grad_norm": 0.06243515555844469, - "learning_rate": 0.0002635223761582763, - "loss": 1.0923, - "step": 4520 - }, - { - "epoch": 0.691843131259078, - "grad_norm": 0.06953610098318412, - "learning_rate": 0.00026234754230841076, - "loss": 1.0357, - "step": 4525 - }, - { - "epoch": 0.6926075988074306, - "grad_norm": 0.057115769917451306, - "learning_rate": 0.0002611744010077014, - "loss": 1.0823, - "step": 4530 - }, - { - "epoch": 0.6933720663557832, - "grad_norm": 0.05928833387518554, - "learning_rate": 0.0002600029606112033, - "loss": 1.0685, - "step": 4535 - }, - { - "epoch": 0.6941365339041358, - "grad_norm": 0.06514424505474277, - "learning_rate": 0.00025883322946185775, - "loss": 1.0522, - "step": 4540 - }, - { - "epoch": 0.6949010014524883, - "grad_norm": 0.06345812694465712, - "learning_rate": 0.00025766521589043267, - "loss": 1.063, - "step": 4545 - }, - { - "epoch": 0.6956654690008409, - "grad_norm": 0.06710277327176996, - "learning_rate": 0.00025649892821546334, - "loss": 1.102, - "step": 4550 - }, - { - "epoch": 0.6964299365491935, - "grad_norm": 0.0601077328567939, - "learning_rate": 0.0002553343747431934, - "loss": 1.0402, - "step": 4555 - }, - { - "epoch": 0.6971944040975461, - "grad_norm": 0.06375489248294168, - "learning_rate": 0.0002541715637675156, - "loss": 1.048, - "step": 4560 - }, - { - "epoch": 0.6979588716458986, - "grad_norm": 0.061468942934610936, - "learning_rate": 0.0002530105035699131, - "loss": 1.0518, - "step": 4565 - }, - { - "epoch": 0.6987233391942512, - "grad_norm": 0.05855568239256093, - "learning_rate": 0.0002518512024193993, - "loss": 1.0779, - "step": 4570 - }, - { - "epoch": 0.6994878067426038, - "grad_norm": 0.06351375706901624, - "learning_rate": 0.00025069366857246, - "loss": 1.0445, - "step": 4575 - }, - { - "epoch": 0.7002522742909564, - "grad_norm": 0.06505940038931915, - "learning_rate": 0.00024953791027299503, - "loss": 1.0701, - "step": 4580 - }, - { - "epoch": 0.701016741839309, - "grad_norm": 0.054690941924002506, - "learning_rate": 0.0002483839357522582, - "loss": 1.0496, - "step": 4585 - }, - { - "epoch": 0.7017812093876615, - "grad_norm": 0.0596197660604312, - "learning_rate": 0.0002472317532287994, - "loss": 1.0715, - "step": 4590 - }, - { - "epoch": 0.7025456769360141, - "grad_norm": 0.06143144522893759, - "learning_rate": 0.00024608137090840614, - "loss": 1.0836, - "step": 4595 - }, - { - "epoch": 0.7033101444843667, - "grad_norm": 0.06261739468488044, - "learning_rate": 0.0002449327969840449, - "loss": 1.0627, - "step": 4600 - }, - { - "epoch": 0.7040746120327193, - "grad_norm": 0.06305255828688207, - "learning_rate": 0.00024378603963580293, - "loss": 1.058, - "step": 4605 - }, - { - "epoch": 0.7048390795810718, - "grad_norm": 0.06250340445854399, - "learning_rate": 0.00024264110703082982, - "loss": 1.0866, - "step": 4610 - }, - { - "epoch": 0.7056035471294243, - "grad_norm": 0.06906200872639721, - "learning_rate": 0.00024149800732327942, - "loss": 1.0653, - "step": 4615 - }, - { - "epoch": 0.7063680146777769, - "grad_norm": 0.06481175868184377, - "learning_rate": 0.00024035674865425177, - "loss": 1.0565, - "step": 4620 - }, - { - "epoch": 0.7071324822261295, - "grad_norm": 0.0624819873899462, - "learning_rate": 0.00023921733915173565, - "loss": 1.069, - "step": 4625 - }, - { - "epoch": 0.707896949774482, - "grad_norm": 0.0701032532427006, - "learning_rate": 0.0002380797869305491, - "loss": 1.1104, - "step": 4630 - }, - { - "epoch": 0.7086614173228346, - "grad_norm": 0.06214752273965523, - "learning_rate": 0.0002369441000922834, - "loss": 1.0545, - "step": 4635 - }, - { - "epoch": 0.7094258848711872, - "grad_norm": 0.06024114616320267, - "learning_rate": 0.00023581028672524485, - "loss": 1.0555, - "step": 4640 - }, - { - "epoch": 0.7101903524195398, - "grad_norm": 0.05406841159348691, - "learning_rate": 0.00023467835490439647, - "loss": 1.0367, - "step": 4645 - }, - { - "epoch": 0.7109548199678923, - "grad_norm": 0.059729814605345245, - "learning_rate": 0.00023354831269130133, - "loss": 1.0416, - "step": 4650 - }, - { - "epoch": 0.7117192875162449, - "grad_norm": 0.0636881199097813, - "learning_rate": 0.0002324201681340646, - "loss": 1.0688, - "step": 4655 - }, - { - "epoch": 0.7124837550645975, - "grad_norm": 0.06458838613385967, - "learning_rate": 0.0002312939292672765, - "loss": 1.0565, - "step": 4660 - }, - { - "epoch": 0.7132482226129501, - "grad_norm": 0.055775934963736715, - "learning_rate": 0.00023016960411195508, - "loss": 1.0496, - "step": 4665 - }, - { - "epoch": 0.7140126901613026, - "grad_norm": 0.06470166002706611, - "learning_rate": 0.00022904720067548907, - "loss": 1.073, - "step": 4670 - }, - { - "epoch": 0.7147771577096552, - "grad_norm": 0.06899482171245135, - "learning_rate": 0.00022792672695158057, - "loss": 1.0684, - "step": 4675 - }, - { - "epoch": 0.7155416252580078, - "grad_norm": 0.054926399175138495, - "learning_rate": 0.00022680819092018851, - "loss": 1.0778, - "step": 4680 - }, - { - "epoch": 0.7163060928063604, - "grad_norm": 0.06076409853777293, - "learning_rate": 0.00022569160054747195, - "loss": 1.0726, - "step": 4685 - }, - { - "epoch": 0.717070560354713, - "grad_norm": 0.05595718010117591, - "learning_rate": 0.00022457696378573285, - "loss": 1.0407, - "step": 4690 - }, - { - "epoch": 0.7178350279030655, - "grad_norm": 0.05621732756049289, - "learning_rate": 0.00022346428857335904, - "loss": 1.0532, - "step": 4695 - }, - { - "epoch": 0.7185994954514181, - "grad_norm": 0.06019576108707039, - "learning_rate": 0.00022235358283476936, - "loss": 1.0965, - "step": 4700 - }, - { - "epoch": 0.7193639629997707, - "grad_norm": 0.08373809700675898, - "learning_rate": 0.00022124485448035553, - "loss": 1.0519, - "step": 4705 - }, - { - "epoch": 0.7201284305481233, - "grad_norm": 0.07459730823974277, - "learning_rate": 0.00022013811140642652, - "loss": 1.0614, - "step": 4710 - }, - { - "epoch": 0.7208928980964758, - "grad_norm": 0.0571730569494354, - "learning_rate": 0.00021903336149515224, - "loss": 1.0561, - "step": 4715 - }, - { - "epoch": 0.7216573656448284, - "grad_norm": 0.05546776042814926, - "learning_rate": 0.00021793061261450748, - "loss": 1.0567, - "step": 4720 - }, - { - "epoch": 0.722421833193181, - "grad_norm": 0.06438680600968821, - "learning_rate": 0.00021682987261821546, - "loss": 1.0629, - "step": 4725 - }, - { - "epoch": 0.7231863007415336, - "grad_norm": 0.0743342279298406, - "learning_rate": 0.0002157311493456931, - "loss": 1.0397, - "step": 4730 - }, - { - "epoch": 0.7239507682898861, - "grad_norm": 0.06983357774921824, - "learning_rate": 0.0002146344506219931, - "loss": 1.0536, - "step": 4735 - }, - { - "epoch": 0.7247152358382387, - "grad_norm": 0.059060993062700295, - "learning_rate": 0.00021353978425775005, - "loss": 1.0728, - "step": 4740 - }, - { - "epoch": 0.7254797033865913, - "grad_norm": 0.0579147373891631, - "learning_rate": 0.0002124471580491245, - "loss": 1.0557, - "step": 4745 - }, - { - "epoch": 0.7262441709349439, - "grad_norm": 0.0621748767051743, - "learning_rate": 0.00021135657977774665, - "loss": 1.0392, - "step": 4750 - }, - { - "epoch": 0.7270086384832963, - "grad_norm": 0.05605518237597559, - "learning_rate": 0.00021026805721066134, - "loss": 1.0644, - "step": 4755 - }, - { - "epoch": 0.7277731060316489, - "grad_norm": 0.058785481188473786, - "learning_rate": 0.0002091815981002731, - "loss": 1.0571, - "step": 4760 - }, - { - "epoch": 0.7285375735800015, - "grad_norm": 0.06048464507849506, - "learning_rate": 0.0002080972101842904, - "loss": 1.0716, - "step": 4765 - }, - { - "epoch": 0.7293020411283541, - "grad_norm": 0.06504524837348831, - "learning_rate": 0.00020701490118567079, - "loss": 1.0539, - "step": 4770 - }, - { - "epoch": 0.7300665086767066, - "grad_norm": 0.058155612162554556, - "learning_rate": 0.000205934678812566, - "loss": 1.0634, - "step": 4775 - }, - { - "epoch": 0.7308309762250592, - "grad_norm": 0.08474842750770611, - "learning_rate": 0.00020485655075826663, - "loss": 1.0494, - "step": 4780 - }, - { - "epoch": 0.7315954437734118, - "grad_norm": 0.06240607837640539, - "learning_rate": 0.00020378052470114823, - "loss": 1.0964, - "step": 4785 - }, - { - "epoch": 0.7323599113217644, - "grad_norm": 0.05804984432313353, - "learning_rate": 0.0002027066083046155, - "loss": 1.0112, - "step": 4790 - }, - { - "epoch": 0.7331243788701169, - "grad_norm": 0.05649066467343644, - "learning_rate": 0.000201634809217048, - "loss": 1.0986, - "step": 4795 - }, - { - "epoch": 0.7338888464184695, - "grad_norm": 0.05637504209092696, - "learning_rate": 0.00020056513507174685, - "loss": 1.0532, - "step": 4800 - }, - { - "epoch": 0.7346533139668221, - "grad_norm": 0.06298093925215675, - "learning_rate": 0.00019949759348687857, - "loss": 1.0139, - "step": 4805 - }, - { - "epoch": 0.7354177815151747, - "grad_norm": 0.06517336083169355, - "learning_rate": 0.00019843219206542212, - "loss": 1.0883, - "step": 4810 - }, - { - "epoch": 0.7361822490635272, - "grad_norm": 0.05724115630119351, - "learning_rate": 0.00019736893839511423, - "loss": 1.0196, - "step": 4815 - }, - { - "epoch": 0.7369467166118798, - "grad_norm": 0.06914879934829934, - "learning_rate": 0.0001963078400483953, - "loss": 1.048, - "step": 4820 - }, - { - "epoch": 0.7377111841602324, - "grad_norm": 0.07095787700138627, - "learning_rate": 0.00019524890458235566, - "loss": 1.0561, - "step": 4825 - }, - { - "epoch": 0.738475651708585, - "grad_norm": 0.06227294786877535, - "learning_rate": 0.00019419213953868235, - "loss": 0.9981, - "step": 4830 - }, - { - "epoch": 0.7392401192569376, - "grad_norm": 0.05688377294396278, - "learning_rate": 0.0001931375524436037, - "loss": 1.0683, - "step": 4835 - }, - { - "epoch": 0.7400045868052901, - "grad_norm": 0.06299988853768748, - "learning_rate": 0.00019208515080783723, - "loss": 1.0661, - "step": 4840 - }, - { - "epoch": 0.7407690543536427, - "grad_norm": 0.05374647681244872, - "learning_rate": 0.0001910349421265362, - "loss": 1.0292, - "step": 4845 - }, - { - "epoch": 0.7415335219019953, - "grad_norm": 0.05994041675068698, - "learning_rate": 0.00018998693387923542, - "loss": 1.0409, - "step": 4850 - }, - { - "epoch": 0.7422979894503479, - "grad_norm": 0.06160205659605142, - "learning_rate": 0.00018894113352979774, - "loss": 1.0132, - "step": 4855 - }, - { - "epoch": 0.7430624569987004, - "grad_norm": 0.06087875817228139, - "learning_rate": 0.00018789754852636243, - "loss": 1.0508, - "step": 4860 - }, - { - "epoch": 0.743826924547053, - "grad_norm": 0.05831613296562593, - "learning_rate": 0.00018685618630129064, - "loss": 1.0738, - "step": 4865 - }, - { - "epoch": 0.7445913920954056, - "grad_norm": 0.05710988576473131, - "learning_rate": 0.00018581705427111306, - "loss": 1.0502, - "step": 4870 - }, - { - "epoch": 0.7453558596437582, - "grad_norm": 0.055938286857235675, - "learning_rate": 0.00018478015983647717, - "loss": 1.0545, - "step": 4875 - }, - { - "epoch": 0.7461203271921107, - "grad_norm": 0.06130999146989328, - "learning_rate": 0.0001837455103820942, - "loss": 1.0672, - "step": 4880 - }, - { - "epoch": 0.7468847947404633, - "grad_norm": 0.0700486673269101, - "learning_rate": 0.0001827131132766867, - "loss": 1.0598, - "step": 4885 - }, - { - "epoch": 0.7476492622888158, - "grad_norm": 0.061673007556164776, - "learning_rate": 0.0001816829758729368, - "loss": 1.0591, - "step": 4890 - }, - { - "epoch": 0.7484137298371684, - "grad_norm": 0.05301433905384343, - "learning_rate": 0.00018065510550743213, - "loss": 1.032, - "step": 4895 - }, - { - "epoch": 0.7491781973855209, - "grad_norm": 0.05637326883587409, - "learning_rate": 0.000179629509500615, - "loss": 1.0485, - "step": 4900 - }, - { - "epoch": 0.7499426649338735, - "grad_norm": 0.06567737463780914, - "learning_rate": 0.0001786061951567303, - "loss": 1.0517, - "step": 4905 - }, - { - "epoch": 0.7507071324822261, - "grad_norm": 0.06049809996712097, - "learning_rate": 0.00017758516976377247, - "loss": 1.0563, - "step": 4910 - }, - { - "epoch": 0.7514716000305787, - "grad_norm": 0.0576348511640816, - "learning_rate": 0.00017656644059343423, - "loss": 1.0455, - "step": 4915 - }, - { - "epoch": 0.7522360675789312, - "grad_norm": 0.05822968830584341, - "learning_rate": 0.00017555001490105488, - "loss": 1.0848, - "step": 4920 - }, - { - "epoch": 0.7530005351272838, - "grad_norm": 0.06417289661323497, - "learning_rate": 0.00017453589992556833, - "loss": 1.0481, - "step": 4925 - }, - { - "epoch": 0.7537650026756364, - "grad_norm": 0.059833408626991574, - "learning_rate": 0.0001735241028894518, - "loss": 1.0682, - "step": 4930 - }, - { - "epoch": 0.754529470223989, - "grad_norm": 0.061813707081001805, - "learning_rate": 0.00017251463099867415, - "loss": 1.0578, - "step": 4935 - }, - { - "epoch": 0.7552939377723416, - "grad_norm": 0.06410222905138455, - "learning_rate": 0.0001715074914426446, - "loss": 1.0629, - "step": 4940 - }, - { - "epoch": 0.7560584053206941, - "grad_norm": 0.062252515513016034, - "learning_rate": 0.00017050269139416198, - "loss": 1.0241, - "step": 4945 - }, - { - "epoch": 0.7568228728690467, - "grad_norm": 0.06040469252714623, - "learning_rate": 0.00016950023800936298, - "loss": 1.0177, - "step": 4950 - }, - { - "epoch": 0.7575873404173993, - "grad_norm": 0.05861174886953368, - "learning_rate": 0.00016850013842767102, - "loss": 1.0575, - "step": 4955 - }, - { - "epoch": 0.7583518079657519, - "grad_norm": 0.05545703925778717, - "learning_rate": 0.0001675023997717468, - "loss": 1.0852, - "step": 4960 - }, - { - "epoch": 0.7591162755141044, - "grad_norm": 0.061973289366438845, - "learning_rate": 0.000166507029147436, - "loss": 1.0398, - "step": 4965 - }, - { - "epoch": 0.759880743062457, - "grad_norm": 0.05581024876428727, - "learning_rate": 0.00016551403364371936, - "loss": 1.0494, - "step": 4970 - }, - { - "epoch": 0.7606452106108096, - "grad_norm": 0.05851335959521461, - "learning_rate": 0.0001645234203326622, - "loss": 1.0588, - "step": 4975 - }, - { - "epoch": 0.7614096781591622, - "grad_norm": 0.06189891483918274, - "learning_rate": 0.00016353519626936396, - "loss": 1.0855, - "step": 4980 - }, - { - "epoch": 0.7621741457075147, - "grad_norm": 0.06925049708184426, - "learning_rate": 0.00016254936849190795, - "loss": 1.0521, - "step": 4985 - }, - { - "epoch": 0.7629386132558673, - "grad_norm": 0.05519111923786063, - "learning_rate": 0.00016156594402131158, - "loss": 1.0116, - "step": 4990 - }, - { - "epoch": 0.7637030808042199, - "grad_norm": 0.0661038161121552, - "learning_rate": 0.00016058492986147538, - "loss": 1.0273, - "step": 4995 - }, - { - "epoch": 0.7644675483525725, - "grad_norm": 0.062383945489791016, - "learning_rate": 0.00015960633299913406, - "loss": 1.0873, - "step": 5000 - }, - { - "epoch": 0.765232015900925, - "grad_norm": 0.06897279137476997, - "learning_rate": 0.00015863016040380685, - "loss": 1.0594, - "step": 5005 - }, - { - "epoch": 0.7659964834492776, - "grad_norm": 0.056082673963443117, - "learning_rate": 0.000157656419027747, - "loss": 1.0708, - "step": 5010 - }, - { - "epoch": 0.7667609509976302, - "grad_norm": 0.058420523194276514, - "learning_rate": 0.00015668511580589284, - "loss": 1.0628, - "step": 5015 - }, - { - "epoch": 0.7675254185459828, - "grad_norm": 0.06310313725455063, - "learning_rate": 0.0001557162576558183, - "loss": 1.0722, - "step": 5020 - }, - { - "epoch": 0.7682898860943354, - "grad_norm": 0.06596435296148517, - "learning_rate": 0.00015474985147768367, - "loss": 1.0406, - "step": 5025 - }, - { - "epoch": 0.7690543536426878, - "grad_norm": 0.06971977148141095, - "learning_rate": 0.00015378590415418637, - "loss": 1.0339, - "step": 5030 - }, - { - "epoch": 0.7698188211910404, - "grad_norm": 0.05474565393634749, - "learning_rate": 0.00015282442255051205, - "loss": 1.0795, - "step": 5035 - }, - { - "epoch": 0.770583288739393, - "grad_norm": 0.06320387891306647, - "learning_rate": 0.00015186541351428545, - "loss": 1.0629, - "step": 5040 - }, - { - "epoch": 0.7713477562877455, - "grad_norm": 0.06290589598630533, - "learning_rate": 0.00015090888387552187, - "loss": 1.045, - "step": 5045 - }, - { - "epoch": 0.7721122238360981, - "grad_norm": 0.06282443758043982, - "learning_rate": 0.0001499548404465788, - "loss": 1.0133, - "step": 5050 - }, - { - "epoch": 0.7728766913844507, - "grad_norm": 0.061142142507514305, - "learning_rate": 0.00014900329002210682, - "loss": 1.0481, - "step": 5055 - }, - { - "epoch": 0.7736411589328033, - "grad_norm": 0.056267952874468424, - "learning_rate": 0.00014805423937900086, - "loss": 1.0295, - "step": 5060 - }, - { - "epoch": 0.7744056264811559, - "grad_norm": 0.056082796704510815, - "learning_rate": 0.00014710769527635372, - "loss": 1.0509, - "step": 5065 - }, - { - "epoch": 0.7751700940295084, - "grad_norm": 0.06610725348457465, - "learning_rate": 0.00014616366445540575, - "loss": 1.0911, - "step": 5070 - }, - { - "epoch": 0.775934561577861, - "grad_norm": 0.06318782224901359, - "learning_rate": 0.0001452221536394983, - "loss": 1.0382, - "step": 5075 - }, - { - "epoch": 0.7766990291262136, - "grad_norm": 0.08384451404877709, - "learning_rate": 0.00014428316953402526, - "loss": 1.0365, - "step": 5080 - }, - { - "epoch": 0.7774634966745662, - "grad_norm": 0.0671394576521587, - "learning_rate": 0.00014334671882638538, - "loss": 1.0328, - "step": 5085 - }, - { - "epoch": 0.7782279642229187, - "grad_norm": 0.055317503030393636, - "learning_rate": 0.00014241280818593472, - "loss": 1.0427, - "step": 5090 - }, - { - "epoch": 0.7789924317712713, - "grad_norm": 0.06850422084885686, - "learning_rate": 0.0001414814442639391, - "loss": 1.0653, - "step": 5095 - }, - { - "epoch": 0.7797568993196239, - "grad_norm": 0.06250617863046679, - "learning_rate": 0.00014055263369352672, - "loss": 1.0339, - "step": 5100 - }, - { - "epoch": 0.7805213668679765, - "grad_norm": 0.06251159274247055, - "learning_rate": 0.00013962638308964083, - "loss": 1.0615, - "step": 5105 - }, - { - "epoch": 0.781285834416329, - "grad_norm": 0.06035781872130478, - "learning_rate": 0.00013870269904899302, - "loss": 1.0662, - "step": 5110 - }, - { - "epoch": 0.7820503019646816, - "grad_norm": 0.05488178375326448, - "learning_rate": 0.0001377815881500159, - "loss": 1.0777, - "step": 5115 - }, - { - "epoch": 0.7828147695130342, - "grad_norm": 0.056280611763081405, - "learning_rate": 0.0001368630569528156, - "loss": 1.0451, - "step": 5120 - }, - { - "epoch": 0.7835792370613868, - "grad_norm": 0.059108351931859225, - "learning_rate": 0.0001359471119991268, - "loss": 1.0645, - "step": 5125 - }, - { - "epoch": 0.7843437046097393, - "grad_norm": 0.05661993372727011, - "learning_rate": 0.0001350337598122644, - "loss": 1.0362, - "step": 5130 - }, - { - "epoch": 0.7851081721580919, - "grad_norm": 0.05986614836392417, - "learning_rate": 0.0001341230068970779, - "loss": 1.0175, - "step": 5135 - }, - { - "epoch": 0.7858726397064445, - "grad_norm": 0.06809753874365711, - "learning_rate": 0.00013321485973990493, - "loss": 1.0583, - "step": 5140 - }, - { - "epoch": 0.7866371072547971, - "grad_norm": 0.06323544711504882, - "learning_rate": 0.00013230932480852485, - "loss": 1.036, - "step": 5145 - }, - { - "epoch": 0.7874015748031497, - "grad_norm": 0.06793389651951727, - "learning_rate": 0.00013140640855211345, - "loss": 1.0233, - "step": 5150 - }, - { - "epoch": 0.7881660423515022, - "grad_norm": 0.06817907964044204, - "learning_rate": 0.000130506117401196, - "loss": 1.0297, - "step": 5155 - }, - { - "epoch": 0.7889305098998548, - "grad_norm": 0.0603564707452043, - "learning_rate": 0.00012960845776760154, - "loss": 1.0359, - "step": 5160 - }, - { - "epoch": 0.7896949774482073, - "grad_norm": 0.05755456407134064, - "learning_rate": 0.00012871343604441837, - "loss": 1.0425, - "step": 5165 - }, - { - "epoch": 0.7904594449965598, - "grad_norm": 0.06924984234647509, - "learning_rate": 0.00012782105860594724, - "loss": 1.0133, - "step": 5170 - }, - { - "epoch": 0.7912239125449124, - "grad_norm": 0.06525653855106175, - "learning_rate": 0.00012693133180765654, - "loss": 1.0236, - "step": 5175 - }, - { - "epoch": 0.791988380093265, - "grad_norm": 0.058531934314296985, - "learning_rate": 0.0001260442619861369, - "loss": 1.0413, - "step": 5180 - }, - { - "epoch": 0.7927528476416176, - "grad_norm": 0.06357634178206084, - "learning_rate": 0.00012515985545905606, - "loss": 1.0336, - "step": 5185 - }, - { - "epoch": 0.7935173151899702, - "grad_norm": 0.06604250681255697, - "learning_rate": 0.00012427811852511395, - "loss": 1.1083, - "step": 5190 - }, - { - "epoch": 0.7942817827383227, - "grad_norm": 0.055731873100012756, - "learning_rate": 0.0001233990574639981, - "loss": 1.0716, - "step": 5195 - }, - { - "epoch": 0.7950462502866753, - "grad_norm": 0.05731005090974728, - "learning_rate": 0.00012252267853633797, - "loss": 1.0179, - "step": 5200 - }, - { - "epoch": 0.7958107178350279, - "grad_norm": 0.05717571227226772, - "learning_rate": 0.00012164898798366126, - "loss": 1.0229, - "step": 5205 - }, - { - "epoch": 0.7965751853833805, - "grad_norm": 0.06450794458214343, - "learning_rate": 0.0001207779920283496, - "loss": 1.0382, - "step": 5210 - }, - { - "epoch": 0.797339652931733, - "grad_norm": 0.06110455802138656, - "learning_rate": 0.00011990969687359349, - "loss": 1.072, - "step": 5215 - }, - { - "epoch": 0.7981041204800856, - "grad_norm": 0.06285806039928286, - "learning_rate": 0.00011904410870334803, - "loss": 1.038, - "step": 5220 - }, - { - "epoch": 0.7988685880284382, - "grad_norm": 0.05602690523165866, - "learning_rate": 0.00011818123368229022, - "loss": 1.0422, - "step": 5225 - }, - { - "epoch": 0.7996330555767908, - "grad_norm": 0.05976622450742704, - "learning_rate": 0.00011732107795577345, - "loss": 1.051, - "step": 5230 - }, - { - "epoch": 0.8003975231251433, - "grad_norm": 0.06267233414280757, - "learning_rate": 0.00011646364764978467, - "loss": 1.0601, - "step": 5235 - }, - { - "epoch": 0.8011619906734959, - "grad_norm": 0.07549985414221129, - "learning_rate": 0.00011560894887090051, - "loss": 1.0348, - "step": 5240 - }, - { - "epoch": 0.8019264582218485, - "grad_norm": 0.059342180679561214, - "learning_rate": 0.00011475698770624382, - "loss": 1.0652, - "step": 5245 - }, - { - "epoch": 0.8026909257702011, - "grad_norm": 0.06346928729977709, - "learning_rate": 0.00011390777022344006, - "loss": 1.0449, - "step": 5250 - }, - { - "epoch": 0.8034553933185536, - "grad_norm": 0.054691568332852346, - "learning_rate": 0.000113061302470575, - "loss": 1.0218, - "step": 5255 - }, - { - "epoch": 0.8042198608669062, - "grad_norm": 0.055409675325726666, - "learning_rate": 0.00011221759047615004, - "loss": 1.0436, - "step": 5260 - }, - { - "epoch": 0.8049843284152588, - "grad_norm": 0.06398528486658486, - "learning_rate": 0.00011137664024904065, - "loss": 1.0385, - "step": 5265 - }, - { - "epoch": 0.8057487959636114, - "grad_norm": 0.06396202261992998, - "learning_rate": 0.0001105384577784534, - "loss": 1.0752, - "step": 5270 - }, - { - "epoch": 0.806513263511964, - "grad_norm": 0.05484742690897295, - "learning_rate": 0.00010970304903388267, - "loss": 1.0219, - "step": 5275 - }, - { - "epoch": 0.8072777310603165, - "grad_norm": 0.062189301119376504, - "learning_rate": 0.00010887041996506857, - "loss": 1.0211, - "step": 5280 - }, - { - "epoch": 0.8080421986086691, - "grad_norm": 0.062008186236770825, - "learning_rate": 0.00010804057650195448, - "loss": 1.069, - "step": 5285 - }, - { - "epoch": 0.8088066661570217, - "grad_norm": 0.06107669674272538, - "learning_rate": 0.000107213524554645, - "loss": 1.1088, - "step": 5290 - }, - { - "epoch": 0.8095711337053743, - "grad_norm": 0.05868772060077093, - "learning_rate": 0.00010638927001336341, - "loss": 1.007, - "step": 5295 - }, - { - "epoch": 0.8103356012537268, - "grad_norm": 0.06532420618997664, - "learning_rate": 0.00010556781874841026, - "loss": 1.0363, - "step": 5300 - }, - { - "epoch": 0.8111000688020793, - "grad_norm": 0.05416277654812083, - "learning_rate": 0.00010474917661012118, - "loss": 1.0184, - "step": 5305 - }, - { - "epoch": 0.8118645363504319, - "grad_norm": 0.05527840984915126, - "learning_rate": 0.00010393334942882521, - "loss": 1.024, - "step": 5310 - }, - { - "epoch": 0.8126290038987845, - "grad_norm": 0.06124101383734983, - "learning_rate": 0.000103120343014804, - "loss": 1.0251, - "step": 5315 - }, - { - "epoch": 0.813393471447137, - "grad_norm": 0.06459260135386527, - "learning_rate": 0.00010231016315824875, - "loss": 1.0474, - "step": 5320 - }, - { - "epoch": 0.8141579389954896, - "grad_norm": 0.05584252284948542, - "learning_rate": 0.0001015028156292212, - "loss": 1.0966, - "step": 5325 - }, - { - "epoch": 0.8149224065438422, - "grad_norm": 0.06552357487141015, - "learning_rate": 0.00010069830617761067, - "loss": 1.0496, - "step": 5330 - }, - { - "epoch": 0.8156868740921948, - "grad_norm": 0.05524468981155268, - "learning_rate": 9.98966405330941e-05, - "loss": 1.0519, - "step": 5335 - }, - { - "epoch": 0.8164513416405473, - "grad_norm": 0.05208274069273543, - "learning_rate": 9.909782440509491e-05, - "loss": 1.0089, - "step": 5340 - }, - { - "epoch": 0.8172158091888999, - "grad_norm": 0.05988429182702099, - "learning_rate": 9.830186348274239e-05, - "loss": 1.0319, - "step": 5345 - }, - { - "epoch": 0.8179802767372525, - "grad_norm": 0.06029492738382687, - "learning_rate": 9.750876343483112e-05, - "loss": 1.0574, - "step": 5350 - }, - { - "epoch": 0.8187447442856051, - "grad_norm": 0.05562867844505153, - "learning_rate": 9.671852990978109e-05, - "loss": 1.0655, - "step": 5355 - }, - { - "epoch": 0.8195092118339576, - "grad_norm": 0.055724731096407874, - "learning_rate": 9.593116853559646e-05, - "loss": 1.0604, - "step": 5360 - }, - { - "epoch": 0.8202736793823102, - "grad_norm": 0.05764217796697168, - "learning_rate": 9.514668491982631e-05, - "loss": 1.0664, - "step": 5365 - }, - { - "epoch": 0.8210381469306628, - "grad_norm": 0.05752272726794207, - "learning_rate": 9.43650846495247e-05, - "loss": 1.0412, - "step": 5370 - }, - { - "epoch": 0.8218026144790154, - "grad_norm": 0.06966191316852606, - "learning_rate": 9.35863732912104e-05, - "loss": 1.0369, - "step": 5375 - }, - { - "epoch": 0.822567082027368, - "grad_norm": 0.0683006602529788, - "learning_rate": 9.281055639082747e-05, - "loss": 1.023, - "step": 5380 - }, - { - "epoch": 0.8233315495757205, - "grad_norm": 0.06139287047512268, - "learning_rate": 9.203763947370591e-05, - "loss": 1.0404, - "step": 5385 - }, - { - "epoch": 0.8240960171240731, - "grad_norm": 0.057208454420155645, - "learning_rate": 9.12676280445221e-05, - "loss": 1.0402, - "step": 5390 - }, - { - "epoch": 0.8248604846724257, - "grad_norm": 0.060773388016504705, - "learning_rate": 9.050052758725975e-05, - "loss": 1.053, - "step": 5395 - }, - { - "epoch": 0.8256249522207783, - "grad_norm": 0.06256383640892278, - "learning_rate": 8.973634356517063e-05, - "loss": 1.0165, - "step": 5400 - }, - { - "epoch": 0.8263894197691308, - "grad_norm": 0.05964678294589494, - "learning_rate": 8.89750814207359e-05, - "loss": 1.0403, - "step": 5405 - }, - { - "epoch": 0.8271538873174834, - "grad_norm": 0.05381486961488011, - "learning_rate": 8.821674657562723e-05, - "loss": 1.0269, - "step": 5410 - }, - { - "epoch": 0.827918354865836, - "grad_norm": 0.06186356736730895, - "learning_rate": 8.746134443066839e-05, - "loss": 1.0329, - "step": 5415 - }, - { - "epoch": 0.8286828224141886, - "grad_norm": 0.0762563854805148, - "learning_rate": 8.670888036579639e-05, - "loss": 1.026, - "step": 5420 - }, - { - "epoch": 0.8294472899625411, - "grad_norm": 0.06234950390412603, - "learning_rate": 8.595935974002317e-05, - "loss": 1.074, - "step": 5425 - }, - { - "epoch": 0.8302117575108937, - "grad_norm": 0.06064159367934313, - "learning_rate": 8.521278789139813e-05, - "loss": 1.0455, - "step": 5430 - }, - { - "epoch": 0.8309762250592463, - "grad_norm": 0.05867647677565349, - "learning_rate": 8.446917013696937e-05, - "loss": 1.0575, - "step": 5435 - }, - { - "epoch": 0.8317406926075988, - "grad_norm": 0.06005112304825767, - "learning_rate": 8.372851177274604e-05, - "loss": 1.0533, - "step": 5440 - }, - { - "epoch": 0.8325051601559513, - "grad_norm": 0.056801197118919115, - "learning_rate": 8.299081807366076e-05, - "loss": 1.0202, - "step": 5445 - }, - { - "epoch": 0.8332696277043039, - "grad_norm": 0.06433320913665841, - "learning_rate": 8.225609429353187e-05, - "loss": 1.0427, - "step": 5450 - }, - { - "epoch": 0.8340340952526565, - "grad_norm": 0.056930526224854554, - "learning_rate": 8.152434566502609e-05, - "loss": 1.0571, - "step": 5455 - }, - { - "epoch": 0.8347985628010091, - "grad_norm": 0.06701689056833854, - "learning_rate": 8.07955773996213e-05, - "loss": 1.056, - "step": 5460 - }, - { - "epoch": 0.8355630303493616, - "grad_norm": 0.0619431235582593, - "learning_rate": 8.006979468756942e-05, - "loss": 1.0356, - "step": 5465 - }, - { - "epoch": 0.8363274978977142, - "grad_norm": 0.07144070886689532, - "learning_rate": 7.934700269785921e-05, - "loss": 1.0338, - "step": 5470 - }, - { - "epoch": 0.8370919654460668, - "grad_norm": 0.05933587591040727, - "learning_rate": 7.862720657818001e-05, - "loss": 0.9977, - "step": 5475 - }, - { - "epoch": 0.8378564329944194, - "grad_norm": 0.060067430203855944, - "learning_rate": 7.791041145488453e-05, - "loss": 1.0338, - "step": 5480 - }, - { - "epoch": 0.8386209005427719, - "grad_norm": 0.05476141839391333, - "learning_rate": 7.719662243295217e-05, - "loss": 1.0267, - "step": 5485 - }, - { - "epoch": 0.8393853680911245, - "grad_norm": 0.060883085756603644, - "learning_rate": 7.64858445959537e-05, - "loss": 1.0211, - "step": 5490 - }, - { - "epoch": 0.8401498356394771, - "grad_norm": 0.06166847844911407, - "learning_rate": 7.57780830060139e-05, - "loss": 0.9904, - "step": 5495 - }, - { - "epoch": 0.8409143031878297, - "grad_norm": 0.056351980585675226, - "learning_rate": 7.50733427037762e-05, - "loss": 1.0611, - "step": 5500 - }, - { - "epoch": 0.8416787707361822, - "grad_norm": 0.06572816953871387, - "learning_rate": 7.437162870836638e-05, - "loss": 1.0406, - "step": 5505 - }, - { - "epoch": 0.8424432382845348, - "grad_norm": 0.05899387419863725, - "learning_rate": 7.367294601735708e-05, - "loss": 1.0148, - "step": 5510 - }, - { - "epoch": 0.8432077058328874, - "grad_norm": 0.07468273941671154, - "learning_rate": 7.297729960673244e-05, - "loss": 1.063, - "step": 5515 - }, - { - "epoch": 0.84397217338124, - "grad_norm": 0.05921366189210043, - "learning_rate": 7.228469443085206e-05, - "loss": 1.0227, - "step": 5520 - }, - { - "epoch": 0.8447366409295926, - "grad_norm": 0.0582034769285556, - "learning_rate": 7.15951354224157e-05, - "loss": 1.0282, - "step": 5525 - }, - { - "epoch": 0.8455011084779451, - "grad_norm": 0.06412673335503208, - "learning_rate": 7.090862749242921e-05, - "loss": 1.0661, - "step": 5530 - }, - { - "epoch": 0.8462655760262977, - "grad_norm": 0.06369336136081105, - "learning_rate": 7.022517553016827e-05, - "loss": 1.0106, - "step": 5535 - }, - { - "epoch": 0.8470300435746503, - "grad_norm": 0.058369722400650574, - "learning_rate": 6.954478440314427e-05, - "loss": 1.0188, - "step": 5540 - }, - { - "epoch": 0.8477945111230029, - "grad_norm": 0.05858354379438574, - "learning_rate": 6.886745895706947e-05, - "loss": 1.0354, - "step": 5545 - }, - { - "epoch": 0.8485589786713554, - "grad_norm": 0.058696989044599954, - "learning_rate": 6.819320401582258e-05, - "loss": 1.0528, - "step": 5550 - }, - { - "epoch": 0.849323446219708, - "grad_norm": 0.053612155113326045, - "learning_rate": 6.752202438141402e-05, - "loss": 1.0318, - "step": 5555 - }, - { - "epoch": 0.8500879137680606, - "grad_norm": 0.06426653835618115, - "learning_rate": 6.685392483395259e-05, - "loss": 1.0242, - "step": 5560 - }, - { - "epoch": 0.8508523813164132, - "grad_norm": 0.0635526144399548, - "learning_rate": 6.618891013161026e-05, - "loss": 1.0592, - "step": 5565 - }, - { - "epoch": 0.8516168488647657, - "grad_norm": 0.06555734215101829, - "learning_rate": 6.552698501058918e-05, - "loss": 1.0422, - "step": 5570 - }, - { - "epoch": 0.8523813164131183, - "grad_norm": 0.055809657555843446, - "learning_rate": 6.486815418508774e-05, - "loss": 0.9978, - "step": 5575 - }, - { - "epoch": 0.8531457839614708, - "grad_norm": 0.05874038849008624, - "learning_rate": 6.421242234726682e-05, - "loss": 1.0349, - "step": 5580 - }, - { - "epoch": 0.8539102515098234, - "grad_norm": 0.06670573325924009, - "learning_rate": 6.355979416721614e-05, - "loss": 1.0444, - "step": 5585 - }, - { - "epoch": 0.8546747190581759, - "grad_norm": 0.05580177701866413, - "learning_rate": 6.29102742929219e-05, - "loss": 0.9998, - "step": 5590 - }, - { - "epoch": 0.8554391866065285, - "grad_norm": 0.06253027310359607, - "learning_rate": 6.22638673502327e-05, - "loss": 1.0142, - "step": 5595 - }, - { - "epoch": 0.8562036541548811, - "grad_norm": 0.06370909036637372, - "learning_rate": 6.162057794282716e-05, - "loss": 1.0731, - "step": 5600 - }, - { - "epoch": 0.8569681217032337, - "grad_norm": 0.06829313690074416, - "learning_rate": 6.098041065218091e-05, - "loss": 1.0409, - "step": 5605 - }, - { - "epoch": 0.8577325892515862, - "grad_norm": 0.9460628196157908, - "learning_rate": 6.034337003753393e-05, - "loss": 1.0322, - "step": 5610 - }, - { - "epoch": 0.8584970567999388, - "grad_norm": 0.05588947262819443, - "learning_rate": 5.970946063585825e-05, - "loss": 1.0452, - "step": 5615 - }, - { - "epoch": 0.8592615243482914, - "grad_norm": 0.057798441761553625, - "learning_rate": 5.9078686961825836e-05, - "loss": 1.0114, - "step": 5620 - }, - { - "epoch": 0.860025991896644, - "grad_norm": 0.05559558436221299, - "learning_rate": 5.845105350777552e-05, - "loss": 1.0428, - "step": 5625 - }, - { - "epoch": 0.8607904594449965, - "grad_norm": 0.0605007497185841, - "learning_rate": 5.782656474368209e-05, - "loss": 1.033, - "step": 5630 - }, - { - "epoch": 0.8615549269933491, - "grad_norm": 0.05966863081807496, - "learning_rate": 5.720522511712406e-05, - "loss": 1.0186, - "step": 5635 - }, - { - "epoch": 0.8623193945417017, - "grad_norm": 0.06499735693928305, - "learning_rate": 5.6587039053251856e-05, - "loss": 1.0291, - "step": 5640 - }, - { - "epoch": 0.8630838620900543, - "grad_norm": 0.0625076833633183, - "learning_rate": 5.5972010954756015e-05, - "loss": 1.0514, - "step": 5645 - }, - { - "epoch": 0.8638483296384069, - "grad_norm": 0.05810931772236003, - "learning_rate": 5.5360145201836745e-05, - "loss": 1.0511, - "step": 5650 - }, - { - "epoch": 0.8646127971867594, - "grad_norm": 0.059530519769012705, - "learning_rate": 5.4751446152171866e-05, - "loss": 1.0486, - "step": 5655 - }, - { - "epoch": 0.865377264735112, - "grad_norm": 0.06283023161053312, - "learning_rate": 5.414591814088626e-05, - "loss": 1.0857, - "step": 5660 - }, - { - "epoch": 0.8661417322834646, - "grad_norm": 0.05570338995935867, - "learning_rate": 5.35435654805207e-05, - "loss": 1.0054, - "step": 5665 - }, - { - "epoch": 0.8669061998318172, - "grad_norm": 0.05444953831071218, - "learning_rate": 5.29443924610013e-05, - "loss": 1.0478, - "step": 5670 - }, - { - "epoch": 0.8676706673801697, - "grad_norm": 0.061787584612685256, - "learning_rate": 5.2348403349608834e-05, - "loss": 1.0321, - "step": 5675 - }, - { - "epoch": 0.8684351349285223, - "grad_norm": 0.05892829060325815, - "learning_rate": 5.17556023909489e-05, - "loss": 1.0268, - "step": 5680 - }, - { - "epoch": 0.8691996024768749, - "grad_norm": 0.06444520050715744, - "learning_rate": 5.1165993806920494e-05, - "loss": 1.0257, - "step": 5685 - }, - { - "epoch": 0.8699640700252275, - "grad_norm": 0.058670167066170946, - "learning_rate": 5.057958179668709e-05, - "loss": 1.0266, - "step": 5690 - }, - { - "epoch": 0.87072853757358, - "grad_norm": 0.058886717538286906, - "learning_rate": 4.9996370536646464e-05, - "loss": 1.0171, - "step": 5695 - }, - { - "epoch": 0.8714930051219326, - "grad_norm": 0.05872445786329025, - "learning_rate": 4.9416364180400574e-05, - "loss": 1.0689, - "step": 5700 - }, - { - "epoch": 0.8722574726702852, - "grad_norm": 0.06068126926877506, - "learning_rate": 4.883956685872626e-05, - "loss": 1.0528, - "step": 5705 - }, - { - "epoch": 0.8730219402186378, - "grad_norm": 0.06024646060572423, - "learning_rate": 4.826598267954574e-05, - "loss": 1.0091, - "step": 5710 - }, - { - "epoch": 0.8737864077669902, - "grad_norm": 0.056061378995899455, - "learning_rate": 4.7695615727897376e-05, - "loss": 1.0327, - "step": 5715 - }, - { - "epoch": 0.8745508753153428, - "grad_norm": 0.05642355016792108, - "learning_rate": 4.712847006590693e-05, - "loss": 1.0372, - "step": 5720 - }, - { - "epoch": 0.8753153428636954, - "grad_norm": 0.063941135432836, - "learning_rate": 4.6564549732757644e-05, - "loss": 1.0343, - "step": 5725 - }, - { - "epoch": 0.876079810412048, - "grad_norm": 0.06401648968226159, - "learning_rate": 4.6003858744662565e-05, - "loss": 1.0077, - "step": 5730 - }, - { - "epoch": 0.8768442779604005, - "grad_norm": 0.05861465222047972, - "learning_rate": 4.544640109483561e-05, - "loss": 1.0679, - "step": 5735 - }, - { - "epoch": 0.8776087455087531, - "grad_norm": 0.06161017849684759, - "learning_rate": 4.4892180753462744e-05, - "loss": 1.0213, - "step": 5740 - }, - { - "epoch": 0.8783732130571057, - "grad_norm": 0.06386075268338762, - "learning_rate": 4.434120166767408e-05, - "loss": 1.0436, - "step": 5745 - }, - { - "epoch": 0.8791376806054583, - "grad_norm": 0.06479505346064894, - "learning_rate": 4.37934677615156e-05, - "loss": 1.0518, - "step": 5750 - }, - { - "epoch": 0.8799021481538108, - "grad_norm": 0.06233866178912154, - "learning_rate": 4.32489829359215e-05, - "loss": 1.0067, - "step": 5755 - }, - { - "epoch": 0.8806666157021634, - "grad_norm": 0.0630500847286222, - "learning_rate": 4.270775106868585e-05, - "loss": 1.0303, - "step": 5760 - }, - { - "epoch": 0.881431083250516, - "grad_norm": 0.056777284717789075, - "learning_rate": 4.2169776014435554e-05, - "loss": 1.0409, - "step": 5765 - }, - { - "epoch": 0.8821955507988686, - "grad_norm": 0.05842993078979994, - "learning_rate": 4.163506160460262e-05, - "loss": 1.0421, - "step": 5770 - }, - { - "epoch": 0.8829600183472212, - "grad_norm": 0.054652077273351, - "learning_rate": 4.1103611647396735e-05, - "loss": 1.0526, - "step": 5775 - }, - { - "epoch": 0.8837244858955737, - "grad_norm": 0.05789228866767471, - "learning_rate": 4.0575429927778684e-05, - "loss": 0.9877, - "step": 5780 - }, - { - "epoch": 0.8844889534439263, - "grad_norm": 0.06435394005015357, - "learning_rate": 4.005052020743261e-05, - "loss": 1.0197, - "step": 5785 - }, - { - "epoch": 0.8852534209922789, - "grad_norm": 0.0668458189028938, - "learning_rate": 3.952888622473977e-05, - "loss": 1.0316, - "step": 5790 - }, - { - "epoch": 0.8860178885406315, - "grad_norm": 0.058224692824791244, - "learning_rate": 3.901053169475194e-05, - "loss": 1.0214, - "step": 5795 - }, - { - "epoch": 0.886782356088984, - "grad_norm": 0.062009025031709276, - "learning_rate": 3.849546030916473e-05, - "loss": 1.0556, - "step": 5800 - }, - { - "epoch": 0.8875468236373366, - "grad_norm": 0.05649898959762725, - "learning_rate": 3.798367573629113e-05, - "loss": 1.0078, - "step": 5805 - }, - { - "epoch": 0.8883112911856892, - "grad_norm": 0.05872883539211935, - "learning_rate": 3.747518162103597e-05, - "loss": 1.0756, - "step": 5810 - }, - { - "epoch": 0.8890757587340418, - "grad_norm": 0.059954064410168935, - "learning_rate": 3.696998158486925e-05, - "loss": 1.039, - "step": 5815 - }, - { - "epoch": 0.8898402262823943, - "grad_norm": 0.06002820321062443, - "learning_rate": 3.646807922580098e-05, - "loss": 0.9925, - "step": 5820 - }, - { - "epoch": 0.8906046938307469, - "grad_norm": 0.06355876238149161, - "learning_rate": 3.5969478118355156e-05, - "loss": 1.0486, - "step": 5825 - }, - { - "epoch": 0.8913691613790995, - "grad_norm": 0.05532906568084624, - "learning_rate": 3.5474181813544335e-05, - "loss": 1.0117, - "step": 5830 - }, - { - "epoch": 0.8921336289274521, - "grad_norm": 0.06045611131725339, - "learning_rate": 3.4982193838844544e-05, - "loss": 1.0701, - "step": 5835 - }, - { - "epoch": 0.8928980964758046, - "grad_norm": 0.0609764116986175, - "learning_rate": 3.4493517698170164e-05, - "loss": 1.0551, - "step": 5840 - }, - { - "epoch": 0.8936625640241572, - "grad_norm": 0.05428762758654894, - "learning_rate": 3.400815687184872e-05, - "loss": 1.0155, - "step": 5845 - }, - { - "epoch": 0.8944270315725098, - "grad_norm": 0.05965962849250994, - "learning_rate": 3.352611481659595e-05, - "loss": 1.0511, - "step": 5850 - }, - { - "epoch": 0.8951914991208623, - "grad_norm": 0.056939343112244536, - "learning_rate": 3.304739496549203e-05, - "loss": 1.0378, - "step": 5855 - }, - { - "epoch": 0.8959559666692148, - "grad_norm": 0.055743517095070166, - "learning_rate": 3.257200072795619e-05, - "loss": 1.0267, - "step": 5860 - }, - { - "epoch": 0.8967204342175674, - "grad_norm": 0.06219829393402533, - "learning_rate": 3.2099935489722896e-05, - "loss": 1.0407, - "step": 5865 - }, - { - "epoch": 0.89748490176592, - "grad_norm": 0.060687226345486525, - "learning_rate": 3.16312026128176e-05, - "loss": 1.0483, - "step": 5870 - }, - { - "epoch": 0.8982493693142726, - "grad_norm": 0.05981997075175033, - "learning_rate": 3.1165805435532936e-05, - "loss": 1.0382, - "step": 5875 - }, - { - "epoch": 0.8990138368626251, - "grad_norm": 0.057428646353623006, - "learning_rate": 3.070374727240466e-05, - "loss": 1.0479, - "step": 5880 - }, - { - "epoch": 0.8997783044109777, - "grad_norm": 0.06072172144354419, - "learning_rate": 3.0245031414188663e-05, - "loss": 0.9941, - "step": 5885 - }, - { - "epoch": 0.9005427719593303, - "grad_norm": 0.056470945217116215, - "learning_rate": 2.9789661127836432e-05, - "loss": 1.0396, - "step": 5890 - }, - { - "epoch": 0.9013072395076829, - "grad_norm": 0.05688971005581688, - "learning_rate": 2.933763965647307e-05, - "loss": 1.0013, - "step": 5895 - }, - { - "epoch": 0.9020717070560355, - "grad_norm": 0.06009286163449355, - "learning_rate": 2.8888970219373312e-05, - "loss": 1.0559, - "step": 5900 - }, - { - "epoch": 0.902836174604388, - "grad_norm": 0.07379788163258001, - "learning_rate": 2.8443656011938878e-05, - "loss": 0.9891, - "step": 5905 - }, - { - "epoch": 0.9036006421527406, - "grad_norm": 0.056877317710044964, - "learning_rate": 2.8001700205675663e-05, - "loss": 1.0763, - "step": 5910 - }, - { - "epoch": 0.9043651097010932, - "grad_norm": 0.05890117332064067, - "learning_rate": 2.7563105948171352e-05, - "loss": 1.0734, - "step": 5915 - }, - { - "epoch": 0.9051295772494458, - "grad_norm": 0.05414882138341149, - "learning_rate": 2.7127876363072734e-05, - "loss": 1.015, - "step": 5920 - }, - { - "epoch": 0.9058940447977983, - "grad_norm": 0.055796033504062224, - "learning_rate": 2.669601455006354e-05, - "loss": 1.0413, - "step": 5925 - }, - { - "epoch": 0.9066585123461509, - "grad_norm": 0.07287607712954004, - "learning_rate": 2.626752358484247e-05, - "loss": 1.0328, - "step": 5930 - }, - { - "epoch": 0.9074229798945035, - "grad_norm": 0.0544793234349126, - "learning_rate": 2.5842406519101135e-05, - "loss": 1.0159, - "step": 5935 - }, - { - "epoch": 0.9081874474428561, - "grad_norm": 0.061462858436265366, - "learning_rate": 2.54206663805025e-05, - "loss": 1.0265, - "step": 5940 - }, - { - "epoch": 0.9089519149912086, - "grad_norm": 0.05724469979574169, - "learning_rate": 2.500230617265925e-05, - "loss": 0.9856, - "step": 5945 - }, - { - "epoch": 0.9097163825395612, - "grad_norm": 0.0544995228115746, - "learning_rate": 2.4587328875112002e-05, - "loss": 0.9981, - "step": 5950 - }, - { - "epoch": 0.9104808500879138, - "grad_norm": 0.053865460569715765, - "learning_rate": 2.4175737443308975e-05, - "loss": 1.0465, - "step": 5955 - }, - { - "epoch": 0.9112453176362664, - "grad_norm": 0.055490991260585244, - "learning_rate": 2.3767534808584125e-05, - "loss": 0.9892, - "step": 5960 - }, - { - "epoch": 0.912009785184619, - "grad_norm": 0.058919461159295805, - "learning_rate": 2.3362723878136592e-05, - "loss": 1.0345, - "step": 5965 - }, - { - "epoch": 0.9127742527329715, - "grad_norm": 0.05703229353352163, - "learning_rate": 2.2961307535009966e-05, - "loss": 1.0403, - "step": 5970 - }, - { - "epoch": 0.9135387202813241, - "grad_norm": 0.06454466876602472, - "learning_rate": 2.2563288638071778e-05, - "loss": 0.9969, - "step": 5975 - }, - { - "epoch": 0.9143031878296767, - "grad_norm": 0.06269175867935511, - "learning_rate": 2.2168670021993076e-05, - "loss": 1.0319, - "step": 5980 - }, - { - "epoch": 0.9150676553780293, - "grad_norm": 0.054325191582656224, - "learning_rate": 2.1777454497228456e-05, - "loss": 1.0105, - "step": 5985 - }, - { - "epoch": 0.9158321229263817, - "grad_norm": 0.05741162522426686, - "learning_rate": 2.1389644849995506e-05, - "loss": 1.0486, - "step": 5990 - }, - { - "epoch": 0.9165965904747343, - "grad_norm": 0.059760672805936554, - "learning_rate": 2.100524384225555e-05, - "loss": 1.0196, - "step": 5995 - }, - { - "epoch": 0.9173610580230869, - "grad_norm": 0.05529093705054985, - "learning_rate": 2.0624254211693892e-05, - "loss": 1.0128, - "step": 6000 - }, - { - "epoch": 0.9181255255714394, - "grad_norm": 0.0560885235521163, - "learning_rate": 2.0246678671699936e-05, - "loss": 1.0228, - "step": 6005 - }, - { - "epoch": 0.918889993119792, - "grad_norm": 0.06287594156329433, - "learning_rate": 1.987251991134803e-05, - "loss": 1.0095, - "step": 6010 - }, - { - "epoch": 0.9196544606681446, - "grad_norm": 0.0571702237792438, - "learning_rate": 1.9501780595378715e-05, - "loss": 1.0274, - "step": 6015 - }, - { - "epoch": 0.9204189282164972, - "grad_norm": 0.06161643032987793, - "learning_rate": 1.9134463364179176e-05, - "loss": 1.0419, - "step": 6020 - }, - { - "epoch": 0.9211833957648498, - "grad_norm": 0.062217905827859084, - "learning_rate": 1.8770570833764712e-05, - "loss": 1.0231, - "step": 6025 - }, - { - "epoch": 0.9219478633132023, - "grad_norm": 0.056345001582428215, - "learning_rate": 1.8410105595760063e-05, - "loss": 1.0567, - "step": 6030 - }, - { - "epoch": 0.9227123308615549, - "grad_norm": 0.06041690355549645, - "learning_rate": 1.8053070217381007e-05, - "loss": 1.0428, - "step": 6035 - }, - { - "epoch": 0.9234767984099075, - "grad_norm": 0.059578321030006376, - "learning_rate": 1.7699467241416023e-05, - "loss": 1.0511, - "step": 6040 - }, - { - "epoch": 0.9242412659582601, - "grad_norm": 0.053569365750345925, - "learning_rate": 1.7349299186208258e-05, - "loss": 1.0511, - "step": 6045 - }, - { - "epoch": 0.9250057335066126, - "grad_norm": 0.055706266872641214, - "learning_rate": 1.7002568545637308e-05, - "loss": 1.0615, - "step": 6050 - }, - { - "epoch": 0.9257702010549652, - "grad_norm": 0.06210337329106453, - "learning_rate": 1.6659277789101757e-05, - "loss": 1.04, - "step": 6055 - }, - { - "epoch": 0.9265346686033178, - "grad_norm": 0.05720829659281351, - "learning_rate": 1.6319429361501713e-05, - "loss": 1.0181, - "step": 6060 - }, - { - "epoch": 0.9272991361516704, - "grad_norm": 0.05466393755249225, - "learning_rate": 1.5983025683220964e-05, - "loss": 1.024, - "step": 6065 - }, - { - "epoch": 0.9280636037000229, - "grad_norm": 0.06279571773285912, - "learning_rate": 1.5650069150110023e-05, - "loss": 1.0058, - "step": 6070 - }, - { - "epoch": 0.9288280712483755, - "grad_norm": 0.06240450137014157, - "learning_rate": 1.5320562133468997e-05, - "loss": 1.0495, - "step": 6075 - }, - { - "epoch": 0.9295925387967281, - "grad_norm": 0.0647440463087532, - "learning_rate": 1.4994506980030576e-05, - "loss": 1.0403, - "step": 6080 - }, - { - "epoch": 0.9303570063450807, - "grad_norm": 0.05313875552378329, - "learning_rate": 1.4671906011943848e-05, - "loss": 1.0331, - "step": 6085 - }, - { - "epoch": 0.9311214738934332, - "grad_norm": 0.05521374207034005, - "learning_rate": 1.4352761526756907e-05, - "loss": 1.0075, - "step": 6090 - }, - { - "epoch": 0.9318859414417858, - "grad_norm": 0.05982237948906845, - "learning_rate": 1.4037075797401155e-05, - "loss": 1.0629, - "step": 6095 - }, - { - "epoch": 0.9326504089901384, - "grad_norm": 0.06261396303304399, - "learning_rate": 1.3724851072174916e-05, - "loss": 1.0106, - "step": 6100 - }, - { - "epoch": 0.933414876538491, - "grad_norm": 0.05700687805485836, - "learning_rate": 1.3416089574727396e-05, - "loss": 1.0337, - "step": 6105 - }, - { - "epoch": 0.9341793440868436, - "grad_norm": 0.057915780300408407, - "learning_rate": 1.3110793504042873e-05, - "loss": 1.0059, - "step": 6110 - }, - { - "epoch": 0.9349438116351961, - "grad_norm": 0.05380257804448096, - "learning_rate": 1.2808965034424913e-05, - "loss": 1.0018, - "step": 6115 - }, - { - "epoch": 0.9357082791835487, - "grad_norm": 0.060812954266141986, - "learning_rate": 1.2510606315481121e-05, - "loss": 1.0407, - "step": 6120 - }, - { - "epoch": 0.9364727467319013, - "grad_norm": 0.059692939386243565, - "learning_rate": 1.22157194721077e-05, - "loss": 1.0539, - "step": 6125 - }, - { - "epoch": 0.9372372142802537, - "grad_norm": 0.06410611806486131, - "learning_rate": 1.1924306604474245e-05, - "loss": 1.0402, - "step": 6130 - }, - { - "epoch": 0.9380016818286063, - "grad_norm": 0.05657159861756608, - "learning_rate": 1.1636369788008972e-05, - "loss": 1.0383, - "step": 6135 - }, - { - "epoch": 0.9387661493769589, - "grad_norm": 0.06587261877884344, - "learning_rate": 1.1351911073383681e-05, - "loss": 1.0233, - "step": 6140 - }, - { - "epoch": 0.9395306169253115, - "grad_norm": 0.06201339266734815, - "learning_rate": 1.1070932486499542e-05, - "loss": 1.0289, - "step": 6145 - }, - { - "epoch": 0.9402950844736641, - "grad_norm": 0.07085274861735513, - "learning_rate": 1.0793436028472214e-05, - "loss": 1.0377, - "step": 6150 - }, - { - "epoch": 0.9410595520220166, - "grad_norm": 0.056399221307596455, - "learning_rate": 1.0519423675617811e-05, - "loss": 1.0336, - "step": 6155 - }, - { - "epoch": 0.9418240195703692, - "grad_norm": 0.05838303840904757, - "learning_rate": 1.0248897379438905e-05, - "loss": 1.0088, - "step": 6160 - }, - { - "epoch": 0.9425884871187218, - "grad_norm": 0.05511937754131694, - "learning_rate": 9.981859066610643e-06, - "loss": 1.0325, - "step": 6165 - }, - { - "epoch": 0.9433529546670744, - "grad_norm": 0.05862068288977634, - "learning_rate": 9.718310638966609e-06, - "loss": 1.0566, - "step": 6170 - }, - { - "epoch": 0.9441174222154269, - "grad_norm": 0.06229665145510315, - "learning_rate": 9.45825397348593e-06, - "loss": 1.0129, - "step": 6175 - }, - { - "epoch": 0.9448818897637795, - "grad_norm": 0.058461286059371405, - "learning_rate": 9.201690922279405e-06, - "loss": 1.0108, - "step": 6180 - }, - { - "epoch": 0.9456463573121321, - "grad_norm": 0.060236921179087834, - "learning_rate": 8.948623312576454e-06, - "loss": 0.9994, - "step": 6185 - }, - { - "epoch": 0.9464108248604847, - "grad_norm": 0.061654492292078825, - "learning_rate": 8.699052946712248e-06, - "loss": 1.0435, - "step": 6190 - }, - { - "epoch": 0.9471752924088372, - "grad_norm": 0.05414771023362534, - "learning_rate": 8.452981602114717e-06, - "loss": 1.0159, - "step": 6195 - }, - { - "epoch": 0.9479397599571898, - "grad_norm": 0.05913688312421346, - "learning_rate": 8.210411031291776e-06, - "loss": 0.9874, - "step": 6200 - }, - { - "epoch": 0.9487042275055424, - "grad_norm": 0.05691389065492228, - "learning_rate": 7.97134296181934e-06, - "loss": 1.0112, - "step": 6205 - }, - { - "epoch": 0.949468695053895, - "grad_norm": 0.055429285880686574, - "learning_rate": 7.735779096328389e-06, - "loss": 1.017, - "step": 6210 - }, - { - "epoch": 0.9502331626022475, - "grad_norm": 0.06097722521863537, - "learning_rate": 7.5037211124932e-06, - "loss": 1.0011, - "step": 6215 - }, { "epoch": 0.9509976301506001, - "grad_norm": 0.05532466685476817, - "learning_rate": 7.275170663019415e-06, - "loss": 1.0269, - "step": 6220 - }, - { - "epoch": 0.9517620976989527, - "grad_norm": 0.06042708468611293, - "learning_rate": 7.0501293756320975e-06, - "loss": 1.0602, - "step": 6225 - }, - { - "epoch": 0.9525265652473053, - "grad_norm": 0.05543873760281028, - "learning_rate": 6.828598853064249e-06, - "loss": 1.0048, - "step": 6230 - }, - { - "epoch": 0.9532910327956579, - "grad_norm": 0.0674868225950688, - "learning_rate": 6.610580673045485e-06, - "loss": 1.0069, - "step": 6235 + "grad_norm": 0.019644926869063512, + "learning_rate": 7.280105195860254e-06, + "loss": 1.0306, + "step": 1555 }, { "epoch": 0.9540555003440104, - "grad_norm": 0.05549130127970468, - "learning_rate": 6.396076388290484e-06, - "loss": 0.9908, - "step": 6240 - }, - { - "epoch": 0.954819967892363, - "grad_norm": 0.055652396632378566, - "learning_rate": 6.185087526488331e-06, - "loss": 1.0001, - "step": 6245 - }, - { - "epoch": 0.9555844354407156, - "grad_norm": 0.05267551540657268, - "learning_rate": 5.977615590291363e-06, - "loss": 1.0544, - "step": 6250 - }, - { - "epoch": 0.9563489029890682, - "grad_norm": 0.05592750269276289, - "learning_rate": 5.773662057304452e-06, - "loss": 1.0415, - "step": 6255 + "grad_norm": 0.016710041757227387, + "learning_rate": 6.400415939745275e-06, + "loss": 1.0281, + "step": 1560 }, { "epoch": 0.9571133705374207, - "grad_norm": 0.05646475680673014, - "learning_rate": 5.573228380074735e-06, - "loss": 1.0531, - "step": 6260 - }, - { - "epoch": 0.9578778380857733, - "grad_norm": 0.06321256859481447, - "learning_rate": 5.376315986081071e-06, - "loss": 0.9959, - "step": 6265 - }, - { - "epoch": 0.9586423056341258, - "grad_norm": 0.06453658248744666, - "learning_rate": 5.1829262777238205e-06, - "loss": 1.0466, - "step": 6270 - }, - { - "epoch": 0.9594067731824784, - "grad_norm": 0.05935378265190903, - "learning_rate": 4.993060632314972e-06, - "loss": 1.0193, - "step": 6275 + "grad_norm": 0.018527383142328113, + "learning_rate": 5.577010698204488e-06, + "loss": 1.0493, + "step": 1565 }, { "epoch": 0.9601712407308309, - "grad_norm": 0.056930907655133194, - "learning_rate": 4.806720402068476e-06, - "loss": 1.0026, - "step": 6280 - }, - { - "epoch": 0.9609357082791835, - "grad_norm": 0.05573893520025273, - "learning_rate": 4.623906914090203e-06, - "loss": 0.9942, - "step": 6285 - }, - { - "epoch": 0.9617001758275361, - "grad_norm": 0.059293037343727206, - "learning_rate": 4.444621470368893e-06, - "loss": 1.0574, - "step": 6290 - }, - { - "epoch": 0.9624646433758887, - "grad_norm": 0.057319030664173376, - "learning_rate": 4.268865347766715e-06, - "loss": 1.047, - "step": 6295 + "grad_norm": 0.01587941593451175, + "learning_rate": 4.809983362225878e-06, + "loss": 1.0287, + "step": 1570 }, { "epoch": 0.9632291109242412, - "grad_norm": 0.05986032444262332, - "learning_rate": 4.09663979801006e-06, - "loss": 1.0445, - "step": 6300 - }, - { - "epoch": 0.9639935784725938, - "grad_norm": 0.06772012126843469, - "learning_rate": 3.927946047680986e-06, - "loss": 1.0249, - "step": 6305 - }, - { - "epoch": 0.9647580460209464, - "grad_norm": 0.052915625380592655, - "learning_rate": 3.7627852982081156e-06, - "loss": 1.0115, - "step": 6310 - }, - { - "epoch": 0.965522513569299, - "grad_norm": 0.05719483772507953, - "learning_rate": 3.601158725858034e-06, - "loss": 1.0089, - "step": 6315 + "grad_norm": 0.01784011524370286, + "learning_rate": 4.0994213941557e-06, + "loss": 1.0494, + "step": 1575 }, { "epoch": 0.9662869811176515, - "grad_norm": 0.06742198460058717, - "learning_rate": 3.4430674817274575e-06, - "loss": 0.9958, - "step": 6320 - }, - { - "epoch": 0.9670514486660041, - "grad_norm": 0.05934089002940797, - "learning_rate": 3.288512691734413e-06, - "loss": 1.0607, - "step": 6325 - }, - { - "epoch": 0.9678159162143567, - "grad_norm": 0.05426755164335042, - "learning_rate": 3.1374954566105173e-06, - "loss": 1.0188, - "step": 6330 - }, - { - "epoch": 0.9685803837627093, - "grad_norm": 0.05770507635439541, - "learning_rate": 2.9900168518931513e-06, - "loss": 1.0476, - "step": 6335 + "grad_norm": 0.022229608533606754, + "learning_rate": 3.4454058177253998e-06, + "loss": 1.0245, + "step": 1580 }, { "epoch": 0.9693448513110619, - "grad_norm": 0.061035126427298775, - "learning_rate": 2.8460779279176897e-06, - "loss": 1.0285, - "step": 6340 - }, - { - "epoch": 0.9701093188594144, - "grad_norm": 0.05994621621761424, - "learning_rate": 2.705679709810116e-06, - "loss": 1.0273, - "step": 6345 - }, - { - "epoch": 0.970873786407767, - "grad_norm": 0.05622211095511261, - "learning_rate": 2.5688231974796418e-06, - "loss": 1.0306, - "step": 6350 - }, - { - "epoch": 0.9716382539561196, - "grad_norm": 0.05672371489636327, - "learning_rate": 2.4355093656116546e-06, - "loss": 1.0404, - "step": 6355 + "grad_norm": 0.01771675468251995, + "learning_rate": 2.8480112088128396e-06, + "loss": 1.0539, + "step": 1585 }, { "epoch": 0.9724027215044722, - "grad_norm": 0.05819240760299045, - "learning_rate": 2.3057391636606697e-06, - "loss": 0.9942, - "step": 6360 - }, - { - "epoch": 0.9731671890528247, - "grad_norm": 0.053968015068043766, - "learning_rate": 2.1795135158438338e-06, - "loss": 1.0023, - "step": 6365 - }, - { - "epoch": 0.9739316566011773, - "grad_norm": 0.05692302172149137, - "learning_rate": 2.0568333211338774e-06, - "loss": 1.0448, - "step": 6370 - }, - { - "epoch": 0.9746961241495299, - "grad_norm": 0.056053764675146125, - "learning_rate": 1.937699453253228e-06, - "loss": 1.0052, - "step": 6375 + "grad_norm": 0.019318982313098143, + "learning_rate": 2.307305686938266e-06, + "loss": 1.0364, + "step": 1590 }, { "epoch": 0.9754605916978825, - "grad_norm": 0.05688214554755461, - "learning_rate": 1.8221127606674603e-06, - "loss": 1.0529, - "step": 6380 - }, - { - "epoch": 0.976225059246235, - "grad_norm": 0.059143411541020596, - "learning_rate": 1.710074066579248e-06, - "loss": 1.0184, - "step": 6385 - }, - { - "epoch": 0.9769895267945876, - "grad_norm": 0.05396274376529374, - "learning_rate": 1.6015841689227539e-06, - "loss": 1.0269, - "step": 6390 - }, - { - "epoch": 0.9777539943429402, - "grad_norm": 0.06599763151002173, - "learning_rate": 1.4966438403577475e-06, - "loss": 1.0267, - "step": 6395 + "grad_norm": 0.01763551427180363, + "learning_rate": 1.8233509074969124e-06, + "loss": 1.0387, + "step": 1595 }, { "epoch": 0.9785184618912928, - "grad_norm": 0.054046587912488314, - "learning_rate": 1.3952538282639982e-06, - "loss": 1.0196, - "step": 6400 - }, - { - "epoch": 0.9792829294396452, - "grad_norm": 0.05563950879198749, - "learning_rate": 1.297414854736223e-06, - "loss": 1.0423, - "step": 6405 - }, - { - "epoch": 0.9800473969879978, - "grad_norm": 0.05499476733743179, - "learning_rate": 1.2031276165789252e-06, - "loss": 1.0444, - "step": 6410 - }, - { - "epoch": 0.9808118645363504, - "grad_norm": 0.05728822187639338, - "learning_rate": 1.1123927853010085e-06, - "loss": 1.008, - "step": 6415 + "grad_norm": 0.020265580264344555, + "learning_rate": 1.3962020547287345e-06, + "loss": 1.0375, + "step": 1600 }, { "epoch": 0.981576332084703, - "grad_norm": 0.05951022694716498, - "learning_rate": 1.025211007111615e-06, - "loss": 1.0416, - "step": 6420 - }, - { - "epoch": 0.9823407996330555, - "grad_norm": 0.05918659466424548, - "learning_rate": 9.415829029150724e-07, - "loss": 1.0156, - "step": 6425 - }, - { - "epoch": 0.9831052671814081, - "grad_norm": 0.056548103687334934, - "learning_rate": 8.615090683066206e-07, - "loss": 1.0567, - "step": 6430 - }, - { - "epoch": 0.9838697347297607, - "grad_norm": 0.06106341663875317, - "learning_rate": 7.84990073568248e-07, - "loss": 1.0436, - "step": 6435 + "grad_norm": 0.017705249951308937, + "learning_rate": 1.0259078354257212e-06, + "loss": 1.0463, + "step": 1605 }, { "epoch": 0.9846342022781133, - "grad_norm": 0.05745985936443068, - "learning_rate": 7.120264636643614e-07, - "loss": 1.0382, - "step": 6440 - }, - { - "epoch": 0.9853986698264658, - "grad_norm": 0.06113618554750734, - "learning_rate": 6.426187582383447e-07, - "loss": 0.9907, - "step": 6445 - }, - { - "epoch": 0.9861631373748184, - "grad_norm": 0.060054142652089436, - "learning_rate": 5.767674516083954e-07, - "loss": 1.0316, - "step": 6450 - }, - { - "epoch": 0.986927604923171, - "grad_norm": 0.05945875123547078, - "learning_rate": 5.144730127643605e-07, - "loss": 1.0362, - "step": 6455 + "grad_norm": 0.01899318507681047, + "learning_rate": 7.12510473377892e-07, + "loss": 1.0499, + "step": 1610 }, { "epoch": 0.9876920724715236, - "grad_norm": 0.06106867404334743, - "learning_rate": 4.5573588536407253e-07, - "loss": 1.0046, - "step": 6460 - }, - { - "epoch": 0.9884565400198762, - "grad_norm": 0.058622274212216934, - "learning_rate": 4.005564877305745e-07, - "loss": 1.0808, - "step": 6465 - }, - { - "epoch": 0.9892210075682287, - "grad_norm": 0.055370471737170614, - "learning_rate": 3.4893521284878884e-07, - "loss": 1.0308, - "step": 6470 - }, - { - "epoch": 0.9899854751165813, - "grad_norm": 0.058651345695096446, - "learning_rate": 3.0087242836285275e-07, - "loss": 1.0341, - "step": 6475 + "grad_norm": 0.20527421150604633, + "learning_rate": 4.5604570455887175e-07, + "loss": 1.0321, + "step": 1615 }, { "epoch": 0.9907499426649339, - "grad_norm": 0.05881488906932957, - "learning_rate": 2.5636847657367624e-07, - "loss": 1.0514, - "step": 6480 - }, - { - "epoch": 0.9915144102132865, - "grad_norm": 0.05718945757039933, - "learning_rate": 2.1542367443616595e-07, - "loss": 1.0102, - "step": 6485 - }, - { - "epoch": 0.992278877761639, - "grad_norm": 0.05591730925908283, - "learning_rate": 1.780383135571717e-07, - "loss": 1.0249, - "step": 6490 - }, - { - "epoch": 0.9930433453099916, - "grad_norm": 0.06348320627570112, - "learning_rate": 1.4421266019348787e-07, - "loss": 1.057, - "step": 6495 + "grad_norm": 0.01969626150373465, + "learning_rate": 2.565427730508163e-07, + "loss": 1.0622, + "step": 1620 }, { "epoch": 0.9938078128583442, - "grad_norm": 0.06390840223331544, - "learning_rate": 1.1394695524963305e-07, - "loss": 1.0271, - "step": 6500 - }, - { - "epoch": 0.9945722804066968, - "grad_norm": 0.1081874050336723, - "learning_rate": 8.724141427657317e-08, - "loss": 1.0346, - "step": 6505 - }, - { - "epoch": 0.9953367479550493, - "grad_norm": 0.05829569266484604, - "learning_rate": 6.409622746977872e-08, - "loss": 1.0286, - "step": 6510 - }, - { - "epoch": 0.9961012155034019, - "grad_norm": 0.0578107659452591, - "learning_rate": 4.4511559668225507e-08, - "loss": 1.0164, - "step": 6515 + "grad_norm": 0.01846687245559012, + "learning_rate": 1.1402442770985788e-07, + "loss": 1.0443, + "step": 1625 }, { "epoch": 0.9968656830517545, - "grad_norm": 0.06124806496876297, - "learning_rate": 2.8487550352951362e-08, - "loss": 1.0361, - "step": 6520 - }, - { - "epoch": 0.9976301506001071, - "grad_norm": 0.0593758021573862, - "learning_rate": 1.6024313646056942e-08, - "loss": 1.0053, - "step": 6525 - }, - { - "epoch": 0.9983946181484596, - "grad_norm": 0.06060901602693129, - "learning_rate": 7.121938310261644e-09, - "loss": 1.0459, - "step": 6530 - }, - { - "epoch": 0.9991590856968122, - "grad_norm": 0.05833116627694336, - "learning_rate": 1.7804877476823578e-09, - "loss": 1.0245, - "step": 6535 + "grad_norm": 0.01754718213075641, + "learning_rate": 2.8506919571902325e-08, + "loss": 1.0413, + "step": 1630 }, { "epoch": 0.9999235532451648, - "grad_norm": 0.055856565745703204, + "grad_norm": 0.01654015476154834, "learning_rate": 0.0, - "loss": 0.9952, - "step": 6540 + "loss": 1.031, + "step": 1635 }, { "epoch": 0.9999235532451648, - "eval_loss": 1.4338810443878174, - "eval_runtime": 608.009, - "eval_samples_per_second": 34.46, - "eval_steps_per_second": 8.615, - "step": 6540 + "eval_loss": 1.415837049484253, + "eval_runtime": 621.6735, + "eval_samples_per_second": 33.703, + "eval_steps_per_second": 8.426, + "step": 1635 }, { "epoch": 0.9999235532451648, - "step": 6540, - "total_flos": 1.024299027881001e+17, - "train_loss": 1.1466178722155569, - "train_runtime": 11966.5261, - "train_samples_per_second": 8.745, - "train_steps_per_second": 0.547 + "step": 1635, + "total_flos": 1.1949402256048128e+17, + "train_loss": 1.1248935338554031, + "train_runtime": 10312.3192, + "train_samples_per_second": 10.148, + "train_steps_per_second": 0.159 } ], "logging_steps": 5, - "max_steps": 6540, + "max_steps": 1635, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, @@ -9206,8 +2339,8 @@ "attributes": {} } }, - "total_flos": 1.024299027881001e+17, - "train_batch_size": 4, + "total_flos": 1.1949402256048128e+17, + "train_batch_size": 8, "trial_name": null, "trial_params": null }