{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990982867448152, "eval_steps": 500, "global_step": 554, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018034265103697023, "grad_norm": 6.674532137525891, "learning_rate": 0.0, "loss": 1.2076, "step": 1 }, { "epoch": 0.0036068530207394047, "grad_norm": 6.154259029326631, "learning_rate": 2.2522522522522524e-08, "loss": 1.1491, "step": 2 }, { "epoch": 0.005410279531109108, "grad_norm": 6.771653034612783, "learning_rate": 4.504504504504505e-08, "loss": 1.0534, "step": 3 }, { "epoch": 0.007213706041478809, "grad_norm": 6.67369914164215, "learning_rate": 6.756756756756757e-08, "loss": 1.1268, "step": 4 }, { "epoch": 0.009017132551848512, "grad_norm": 6.948175490186477, "learning_rate": 9.00900900900901e-08, "loss": 1.0109, "step": 5 }, { "epoch": 0.010820559062218215, "grad_norm": 6.285358425270715, "learning_rate": 1.1261261261261262e-07, "loss": 1.1015, "step": 6 }, { "epoch": 0.012623985572587917, "grad_norm": 7.480900019360095, "learning_rate": 1.3513513513513515e-07, "loss": 1.1338, "step": 7 }, { "epoch": 0.014427412082957619, "grad_norm": 5.985730068048292, "learning_rate": 1.5765765765765766e-07, "loss": 1.0889, "step": 8 }, { "epoch": 0.016230838593327322, "grad_norm": 6.74360726738279, "learning_rate": 1.801801801801802e-07, "loss": 1.0402, "step": 9 }, { "epoch": 0.018034265103697024, "grad_norm": 6.603771834682011, "learning_rate": 2.0270270270270273e-07, "loss": 1.1394, "step": 10 }, { "epoch": 0.019837691614066726, "grad_norm": 6.070271548824436, "learning_rate": 2.2522522522522524e-07, "loss": 1.1439, "step": 11 }, { "epoch": 0.02164111812443643, "grad_norm": 6.814219599011481, "learning_rate": 2.477477477477478e-07, "loss": 1.0881, "step": 12 }, { "epoch": 0.023444544634806132, "grad_norm": 6.428167151788823, "learning_rate": 2.702702702702703e-07, "loss": 1.1163, "step": 13 }, { "epoch": 0.025247971145175834, "grad_norm": 7.047010957583219, "learning_rate": 2.927927927927928e-07, "loss": 1.0923, "step": 14 }, { "epoch": 0.027051397655545536, "grad_norm": 5.53060795190606, "learning_rate": 3.153153153153153e-07, "loss": 1.0118, "step": 15 }, { "epoch": 0.028854824165915238, "grad_norm": 6.682099184154056, "learning_rate": 3.378378378378379e-07, "loss": 1.1213, "step": 16 }, { "epoch": 0.030658250676284943, "grad_norm": 5.6679773328714615, "learning_rate": 3.603603603603604e-07, "loss": 1.2125, "step": 17 }, { "epoch": 0.032461677186654644, "grad_norm": 5.547383052372404, "learning_rate": 3.828828828828829e-07, "loss": 1.0804, "step": 18 }, { "epoch": 0.034265103697024346, "grad_norm": 7.200179452941777, "learning_rate": 4.0540540540540546e-07, "loss": 1.0663, "step": 19 }, { "epoch": 0.03606853020739405, "grad_norm": 5.325648740758234, "learning_rate": 4.27927927927928e-07, "loss": 1.2159, "step": 20 }, { "epoch": 0.03787195671776375, "grad_norm": 5.0110630414633475, "learning_rate": 4.504504504504505e-07, "loss": 0.9838, "step": 21 }, { "epoch": 0.03967538322813345, "grad_norm": 5.657518684511818, "learning_rate": 4.7297297297297305e-07, "loss": 1.0797, "step": 22 }, { "epoch": 0.04147880973850315, "grad_norm": 4.5143785306650575, "learning_rate": 4.954954954954956e-07, "loss": 1.1578, "step": 23 }, { "epoch": 0.04328223624887286, "grad_norm": 5.287373714805066, "learning_rate": 5.180180180180181e-07, "loss": 1.1259, "step": 24 }, { "epoch": 0.04508566275924256, "grad_norm": 4.812185404720589, "learning_rate": 5.405405405405406e-07, "loss": 1.0263, "step": 25 }, { "epoch": 0.046889089269612265, "grad_norm": 4.813260658431154, "learning_rate": 5.630630630630631e-07, "loss": 1.0294, "step": 26 }, { "epoch": 0.04869251577998197, "grad_norm": 5.38161770029296, "learning_rate": 5.855855855855856e-07, "loss": 1.0928, "step": 27 }, { "epoch": 0.05049594229035167, "grad_norm": 4.1250087598910365, "learning_rate": 6.081081081081082e-07, "loss": 1.0323, "step": 28 }, { "epoch": 0.05229936880072137, "grad_norm": 3.4592927519497496, "learning_rate": 6.306306306306306e-07, "loss": 1.0056, "step": 29 }, { "epoch": 0.05410279531109107, "grad_norm": 3.904048770191826, "learning_rate": 6.531531531531532e-07, "loss": 1.0514, "step": 30 }, { "epoch": 0.05590622182146077, "grad_norm": 3.648999951860501, "learning_rate": 6.756756756756758e-07, "loss": 1.0958, "step": 31 }, { "epoch": 0.057709648331830475, "grad_norm": 3.466281275771792, "learning_rate": 6.981981981981982e-07, "loss": 1.0257, "step": 32 }, { "epoch": 0.059513074842200184, "grad_norm": 3.392213870901326, "learning_rate": 7.207207207207208e-07, "loss": 0.9459, "step": 33 }, { "epoch": 0.061316501352569885, "grad_norm": 3.5029298346782385, "learning_rate": 7.432432432432434e-07, "loss": 0.9415, "step": 34 }, { "epoch": 0.06311992786293959, "grad_norm": 3.0634308438792632, "learning_rate": 7.657657657657658e-07, "loss": 1.0153, "step": 35 }, { "epoch": 0.06492335437330929, "grad_norm": 2.9484149128045, "learning_rate": 7.882882882882883e-07, "loss": 0.9878, "step": 36 }, { "epoch": 0.06672678088367899, "grad_norm": 3.0610426789398195, "learning_rate": 8.108108108108109e-07, "loss": 0.9123, "step": 37 }, { "epoch": 0.06853020739404869, "grad_norm": 3.32199769361744, "learning_rate": 8.333333333333333e-07, "loss": 1.0099, "step": 38 }, { "epoch": 0.0703336339044184, "grad_norm": 3.0709465427851046, "learning_rate": 8.55855855855856e-07, "loss": 1.0385, "step": 39 }, { "epoch": 0.0721370604147881, "grad_norm": 3.0428201478943575, "learning_rate": 8.783783783783785e-07, "loss": 0.9887, "step": 40 }, { "epoch": 0.0739404869251578, "grad_norm": 3.11038611613558, "learning_rate": 9.00900900900901e-07, "loss": 0.8805, "step": 41 }, { "epoch": 0.0757439134355275, "grad_norm": 3.5117708849754283, "learning_rate": 9.234234234234235e-07, "loss": 0.9708, "step": 42 }, { "epoch": 0.0775473399458972, "grad_norm": 3.436408499708477, "learning_rate": 9.459459459459461e-07, "loss": 0.991, "step": 43 }, { "epoch": 0.0793507664562669, "grad_norm": 2.707066762216591, "learning_rate": 9.684684684684686e-07, "loss": 0.8664, "step": 44 }, { "epoch": 0.0811541929666366, "grad_norm": 2.9154636312948647, "learning_rate": 9.909909909909911e-07, "loss": 0.9008, "step": 45 }, { "epoch": 0.0829576194770063, "grad_norm": 2.9028667627025726, "learning_rate": 1.0135135135135136e-06, "loss": 1.0705, "step": 46 }, { "epoch": 0.08476104598737602, "grad_norm": 2.6634992062941736, "learning_rate": 1.0360360360360361e-06, "loss": 0.891, "step": 47 }, { "epoch": 0.08656447249774572, "grad_norm": 2.738023098685531, "learning_rate": 1.0585585585585587e-06, "loss": 0.9246, "step": 48 }, { "epoch": 0.08836789900811542, "grad_norm": 2.5938725151636435, "learning_rate": 1.0810810810810812e-06, "loss": 0.9308, "step": 49 }, { "epoch": 0.09017132551848513, "grad_norm": 2.732422906982916, "learning_rate": 1.1036036036036037e-06, "loss": 1.0283, "step": 50 }, { "epoch": 0.09197475202885483, "grad_norm": 2.5138095481285814, "learning_rate": 1.1261261261261262e-06, "loss": 1.0285, "step": 51 }, { "epoch": 0.09377817853922453, "grad_norm": 2.5550555806196265, "learning_rate": 1.148648648648649e-06, "loss": 0.9065, "step": 52 }, { "epoch": 0.09558160504959423, "grad_norm": 2.3645335521201702, "learning_rate": 1.1711711711711712e-06, "loss": 0.8516, "step": 53 }, { "epoch": 0.09738503155996393, "grad_norm": 2.409700298550962, "learning_rate": 1.1936936936936937e-06, "loss": 0.8294, "step": 54 }, { "epoch": 0.09918845807033363, "grad_norm": 2.3183367981378145, "learning_rate": 1.2162162162162164e-06, "loss": 0.9365, "step": 55 }, { "epoch": 0.10099188458070334, "grad_norm": 2.1828402934512776, "learning_rate": 1.2387387387387387e-06, "loss": 0.8918, "step": 56 }, { "epoch": 0.10279531109107304, "grad_norm": 2.3691895978895094, "learning_rate": 1.2612612612612613e-06, "loss": 0.9768, "step": 57 }, { "epoch": 0.10459873760144274, "grad_norm": 2.3204193779879208, "learning_rate": 1.2837837837837838e-06, "loss": 0.7925, "step": 58 }, { "epoch": 0.10640216411181244, "grad_norm": 2.334168434235552, "learning_rate": 1.3063063063063065e-06, "loss": 0.855, "step": 59 }, { "epoch": 0.10820559062218214, "grad_norm": 2.256611178722444, "learning_rate": 1.328828828828829e-06, "loss": 0.8408, "step": 60 }, { "epoch": 0.11000901713255185, "grad_norm": 2.4778146158109924, "learning_rate": 1.3513513513513515e-06, "loss": 0.8964, "step": 61 }, { "epoch": 0.11181244364292155, "grad_norm": 2.4357880480005756, "learning_rate": 1.373873873873874e-06, "loss": 0.9621, "step": 62 }, { "epoch": 0.11361587015329125, "grad_norm": 2.2150394871764294, "learning_rate": 1.3963963963963963e-06, "loss": 0.8501, "step": 63 }, { "epoch": 0.11541929666366095, "grad_norm": 2.118545319018784, "learning_rate": 1.418918918918919e-06, "loss": 0.9133, "step": 64 }, { "epoch": 0.11722272317403065, "grad_norm": 2.1649234330413587, "learning_rate": 1.4414414414414416e-06, "loss": 0.7883, "step": 65 }, { "epoch": 0.11902614968440037, "grad_norm": 2.260747898334313, "learning_rate": 1.463963963963964e-06, "loss": 1.0857, "step": 66 }, { "epoch": 0.12082957619477007, "grad_norm": 2.3422569755909257, "learning_rate": 1.4864864864864868e-06, "loss": 0.9595, "step": 67 }, { "epoch": 0.12263300270513977, "grad_norm": 2.1879879443879067, "learning_rate": 1.5090090090090093e-06, "loss": 0.9373, "step": 68 }, { "epoch": 0.12443642921550947, "grad_norm": 2.1212636698318565, "learning_rate": 1.5315315315315316e-06, "loss": 0.8465, "step": 69 }, { "epoch": 0.12623985572587917, "grad_norm": 2.093523831224821, "learning_rate": 1.5540540540540541e-06, "loss": 0.8851, "step": 70 }, { "epoch": 0.12804328223624886, "grad_norm": 2.117245792873491, "learning_rate": 1.5765765765765766e-06, "loss": 0.8836, "step": 71 }, { "epoch": 0.12984670874661858, "grad_norm": 2.297950888317582, "learning_rate": 1.5990990990990993e-06, "loss": 0.8671, "step": 72 }, { "epoch": 0.13165013525698827, "grad_norm": 2.136681162174477, "learning_rate": 1.6216216216216219e-06, "loss": 0.9114, "step": 73 }, { "epoch": 0.13345356176735798, "grad_norm": 2.377418938286004, "learning_rate": 1.6441441441441444e-06, "loss": 0.9153, "step": 74 }, { "epoch": 0.13525698827772767, "grad_norm": 2.14684216322763, "learning_rate": 1.6666666666666667e-06, "loss": 0.895, "step": 75 }, { "epoch": 0.13706041478809738, "grad_norm": 2.0578144463585395, "learning_rate": 1.6891891891891894e-06, "loss": 0.7646, "step": 76 }, { "epoch": 0.1388638412984671, "grad_norm": 2.1370659943028256, "learning_rate": 1.711711711711712e-06, "loss": 0.963, "step": 77 }, { "epoch": 0.1406672678088368, "grad_norm": 2.1407789023578805, "learning_rate": 1.7342342342342344e-06, "loss": 0.8181, "step": 78 }, { "epoch": 0.1424706943192065, "grad_norm": 2.224908436029519, "learning_rate": 1.756756756756757e-06, "loss": 0.7726, "step": 79 }, { "epoch": 0.1442741208295762, "grad_norm": 2.4321949851329627, "learning_rate": 1.7792792792792792e-06, "loss": 0.857, "step": 80 }, { "epoch": 0.1460775473399459, "grad_norm": 2.144226568602669, "learning_rate": 1.801801801801802e-06, "loss": 0.8378, "step": 81 }, { "epoch": 0.1478809738503156, "grad_norm": 1.9826711249103168, "learning_rate": 1.8243243243243245e-06, "loss": 0.7902, "step": 82 }, { "epoch": 0.1496844003606853, "grad_norm": 2.291008678375686, "learning_rate": 1.846846846846847e-06, "loss": 0.8824, "step": 83 }, { "epoch": 0.151487826871055, "grad_norm": 2.145067975437641, "learning_rate": 1.8693693693693697e-06, "loss": 0.7856, "step": 84 }, { "epoch": 0.1532912533814247, "grad_norm": 2.123969288662637, "learning_rate": 1.8918918918918922e-06, "loss": 0.7462, "step": 85 }, { "epoch": 0.1550946798917944, "grad_norm": 2.2295818146317776, "learning_rate": 1.9144144144144145e-06, "loss": 0.8886, "step": 86 }, { "epoch": 0.15689810640216412, "grad_norm": 2.140597774817277, "learning_rate": 1.9369369369369372e-06, "loss": 0.8319, "step": 87 }, { "epoch": 0.1587015329125338, "grad_norm": 2.344451235287885, "learning_rate": 1.9594594594594595e-06, "loss": 0.8575, "step": 88 }, { "epoch": 0.16050495942290352, "grad_norm": 1.961958906755019, "learning_rate": 1.9819819819819822e-06, "loss": 0.8305, "step": 89 }, { "epoch": 0.1623083859332732, "grad_norm": 2.082591039262878, "learning_rate": 2.0045045045045045e-06, "loss": 0.8032, "step": 90 }, { "epoch": 0.16411181244364292, "grad_norm": 1.8951491866286936, "learning_rate": 2.0270270270270273e-06, "loss": 0.9459, "step": 91 }, { "epoch": 0.1659152389540126, "grad_norm": 2.1428859812782344, "learning_rate": 2.0495495495495496e-06, "loss": 0.8839, "step": 92 }, { "epoch": 0.16771866546438233, "grad_norm": 2.2152288593264173, "learning_rate": 2.0720720720720723e-06, "loss": 0.8265, "step": 93 }, { "epoch": 0.16952209197475204, "grad_norm": 2.0213826178716254, "learning_rate": 2.0945945945945946e-06, "loss": 0.8098, "step": 94 }, { "epoch": 0.17132551848512173, "grad_norm": 2.0901306331246374, "learning_rate": 2.1171171171171173e-06, "loss": 0.8728, "step": 95 }, { "epoch": 0.17312894499549145, "grad_norm": 2.164683159815703, "learning_rate": 2.13963963963964e-06, "loss": 0.7473, "step": 96 }, { "epoch": 0.17493237150586113, "grad_norm": 2.128063710011363, "learning_rate": 2.1621621621621623e-06, "loss": 0.7631, "step": 97 }, { "epoch": 0.17673579801623085, "grad_norm": 2.2572109322776446, "learning_rate": 2.1846846846846846e-06, "loss": 0.933, "step": 98 }, { "epoch": 0.17853922452660054, "grad_norm": 2.1363963838074325, "learning_rate": 2.2072072072072073e-06, "loss": 0.7977, "step": 99 }, { "epoch": 0.18034265103697025, "grad_norm": 2.146510752101339, "learning_rate": 2.22972972972973e-06, "loss": 0.8328, "step": 100 }, { "epoch": 0.18214607754733994, "grad_norm": 2.128787213407692, "learning_rate": 2.2522522522522524e-06, "loss": 0.769, "step": 101 }, { "epoch": 0.18394950405770966, "grad_norm": 2.1474901480116384, "learning_rate": 2.274774774774775e-06, "loss": 0.8078, "step": 102 }, { "epoch": 0.18575293056807934, "grad_norm": 2.2077628269004306, "learning_rate": 2.297297297297298e-06, "loss": 0.869, "step": 103 }, { "epoch": 0.18755635707844906, "grad_norm": 2.1596373889839353, "learning_rate": 2.31981981981982e-06, "loss": 0.9066, "step": 104 }, { "epoch": 0.18935978358881875, "grad_norm": 2.227258779710617, "learning_rate": 2.3423423423423424e-06, "loss": 0.8736, "step": 105 }, { "epoch": 0.19116321009918846, "grad_norm": 2.0265448039731604, "learning_rate": 2.364864864864865e-06, "loss": 0.8269, "step": 106 }, { "epoch": 0.19296663660955815, "grad_norm": 2.090824078885953, "learning_rate": 2.3873873873873874e-06, "loss": 0.8771, "step": 107 }, { "epoch": 0.19477006311992787, "grad_norm": 1.9761919651706363, "learning_rate": 2.40990990990991e-06, "loss": 0.8337, "step": 108 }, { "epoch": 0.19657348963029755, "grad_norm": 2.1515967867262455, "learning_rate": 2.432432432432433e-06, "loss": 0.8635, "step": 109 }, { "epoch": 0.19837691614066727, "grad_norm": 2.0366179273737943, "learning_rate": 2.454954954954955e-06, "loss": 0.7951, "step": 110 }, { "epoch": 0.20018034265103696, "grad_norm": 2.2337095952708568, "learning_rate": 2.4774774774774775e-06, "loss": 0.8296, "step": 111 }, { "epoch": 0.20198376916140667, "grad_norm": 2.314845611994883, "learning_rate": 2.5e-06, "loss": 0.8512, "step": 112 }, { "epoch": 0.2037871956717764, "grad_norm": 2.1397074134865623, "learning_rate": 2.5225225225225225e-06, "loss": 0.7915, "step": 113 }, { "epoch": 0.20559062218214608, "grad_norm": 2.2454332127701644, "learning_rate": 2.5450450450450452e-06, "loss": 0.7976, "step": 114 }, { "epoch": 0.2073940486925158, "grad_norm": 2.184763023914372, "learning_rate": 2.5675675675675675e-06, "loss": 0.9853, "step": 115 }, { "epoch": 0.20919747520288548, "grad_norm": 2.1965096069781653, "learning_rate": 2.5900900900900907e-06, "loss": 0.8754, "step": 116 }, { "epoch": 0.2110009017132552, "grad_norm": 2.1197328540143405, "learning_rate": 2.612612612612613e-06, "loss": 0.873, "step": 117 }, { "epoch": 0.21280432822362488, "grad_norm": 2.1479255682477656, "learning_rate": 2.6351351351351353e-06, "loss": 0.811, "step": 118 }, { "epoch": 0.2146077547339946, "grad_norm": 1.907529342641289, "learning_rate": 2.657657657657658e-06, "loss": 0.8094, "step": 119 }, { "epoch": 0.2164111812443643, "grad_norm": 2.0692795464883162, "learning_rate": 2.6801801801801803e-06, "loss": 0.7645, "step": 120 }, { "epoch": 0.218214607754734, "grad_norm": 2.525383768634248, "learning_rate": 2.702702702702703e-06, "loss": 0.7909, "step": 121 }, { "epoch": 0.2200180342651037, "grad_norm": 2.193135508499143, "learning_rate": 2.7252252252252253e-06, "loss": 0.8109, "step": 122 }, { "epoch": 0.2218214607754734, "grad_norm": 2.2119521513341263, "learning_rate": 2.747747747747748e-06, "loss": 0.864, "step": 123 }, { "epoch": 0.2236248872858431, "grad_norm": 2.2411527155988966, "learning_rate": 2.7702702702702703e-06, "loss": 0.7952, "step": 124 }, { "epoch": 0.2254283137962128, "grad_norm": 2.0883350487153693, "learning_rate": 2.7927927927927926e-06, "loss": 1.0036, "step": 125 }, { "epoch": 0.2272317403065825, "grad_norm": 2.109829568194192, "learning_rate": 2.8153153153153158e-06, "loss": 0.7232, "step": 126 }, { "epoch": 0.2290351668169522, "grad_norm": 1.993689884083202, "learning_rate": 2.837837837837838e-06, "loss": 0.7923, "step": 127 }, { "epoch": 0.2308385933273219, "grad_norm": 1.8907292753085065, "learning_rate": 2.860360360360361e-06, "loss": 0.7996, "step": 128 }, { "epoch": 0.23264201983769162, "grad_norm": 2.0881747024356367, "learning_rate": 2.882882882882883e-06, "loss": 0.8358, "step": 129 }, { "epoch": 0.2344454463480613, "grad_norm": 2.2364232617934583, "learning_rate": 2.9054054054054054e-06, "loss": 0.855, "step": 130 }, { "epoch": 0.23624887285843102, "grad_norm": 2.1751800704208017, "learning_rate": 2.927927927927928e-06, "loss": 0.988, "step": 131 }, { "epoch": 0.23805229936880073, "grad_norm": 2.5339818982016244, "learning_rate": 2.9504504504504504e-06, "loss": 0.9803, "step": 132 }, { "epoch": 0.23985572587917042, "grad_norm": 2.1208235838666276, "learning_rate": 2.9729729729729736e-06, "loss": 0.8555, "step": 133 }, { "epoch": 0.24165915238954014, "grad_norm": 2.2141319147659404, "learning_rate": 2.995495495495496e-06, "loss": 0.8157, "step": 134 }, { "epoch": 0.24346257889990983, "grad_norm": 2.2533639584780403, "learning_rate": 3.0180180180180186e-06, "loss": 0.7266, "step": 135 }, { "epoch": 0.24526600541027954, "grad_norm": 1.9934582570878943, "learning_rate": 3.040540540540541e-06, "loss": 0.6843, "step": 136 }, { "epoch": 0.24706943192064923, "grad_norm": 2.3507505242464286, "learning_rate": 3.063063063063063e-06, "loss": 0.7358, "step": 137 }, { "epoch": 0.24887285843101895, "grad_norm": 2.333362017875557, "learning_rate": 3.085585585585586e-06, "loss": 0.7389, "step": 138 }, { "epoch": 0.25067628494138866, "grad_norm": 2.2614223566969707, "learning_rate": 3.1081081081081082e-06, "loss": 0.7867, "step": 139 }, { "epoch": 0.25247971145175835, "grad_norm": 1.9874595550084149, "learning_rate": 3.130630630630631e-06, "loss": 0.8633, "step": 140 }, { "epoch": 0.25428313796212804, "grad_norm": 2.078601890935306, "learning_rate": 3.1531531531531532e-06, "loss": 0.7232, "step": 141 }, { "epoch": 0.2560865644724977, "grad_norm": 2.059190081358862, "learning_rate": 3.1756756756756755e-06, "loss": 0.716, "step": 142 }, { "epoch": 0.25788999098286747, "grad_norm": 2.2762951186816474, "learning_rate": 3.1981981981981987e-06, "loss": 0.8357, "step": 143 }, { "epoch": 0.25969341749323716, "grad_norm": 2.2293473455945882, "learning_rate": 3.220720720720721e-06, "loss": 0.9155, "step": 144 }, { "epoch": 0.26149684400360684, "grad_norm": 2.138416887954227, "learning_rate": 3.2432432432432437e-06, "loss": 0.8325, "step": 145 }, { "epoch": 0.26330027051397653, "grad_norm": 2.124046946880288, "learning_rate": 3.265765765765766e-06, "loss": 0.8651, "step": 146 }, { "epoch": 0.2651036970243463, "grad_norm": 2.1097037863696015, "learning_rate": 3.2882882882882887e-06, "loss": 0.748, "step": 147 }, { "epoch": 0.26690712353471596, "grad_norm": 1.9680662328568495, "learning_rate": 3.310810810810811e-06, "loss": 0.7221, "step": 148 }, { "epoch": 0.26871055004508565, "grad_norm": 2.046832628017909, "learning_rate": 3.3333333333333333e-06, "loss": 0.76, "step": 149 }, { "epoch": 0.27051397655545534, "grad_norm": 2.318638863913297, "learning_rate": 3.3558558558558565e-06, "loss": 0.8807, "step": 150 }, { "epoch": 0.2723174030658251, "grad_norm": 2.012178712794308, "learning_rate": 3.3783783783783788e-06, "loss": 0.8326, "step": 151 }, { "epoch": 0.27412082957619477, "grad_norm": 2.066509449866673, "learning_rate": 3.4009009009009015e-06, "loss": 0.8075, "step": 152 }, { "epoch": 0.27592425608656446, "grad_norm": 2.1828393174475735, "learning_rate": 3.423423423423424e-06, "loss": 0.8072, "step": 153 }, { "epoch": 0.2777276825969342, "grad_norm": 2.3767082165962368, "learning_rate": 3.445945945945946e-06, "loss": 0.97, "step": 154 }, { "epoch": 0.2795311091073039, "grad_norm": 2.1245505290732853, "learning_rate": 3.468468468468469e-06, "loss": 0.8154, "step": 155 }, { "epoch": 0.2813345356176736, "grad_norm": 2.0827910518003523, "learning_rate": 3.490990990990991e-06, "loss": 0.7921, "step": 156 }, { "epoch": 0.28313796212804326, "grad_norm": 2.0364316538096863, "learning_rate": 3.513513513513514e-06, "loss": 0.9461, "step": 157 }, { "epoch": 0.284941388638413, "grad_norm": 2.3511021810109383, "learning_rate": 3.536036036036036e-06, "loss": 0.7942, "step": 158 }, { "epoch": 0.2867448151487827, "grad_norm": 1.9833910918431235, "learning_rate": 3.5585585585585584e-06, "loss": 0.8312, "step": 159 }, { "epoch": 0.2885482416591524, "grad_norm": 1.9342863774694277, "learning_rate": 3.5810810810810816e-06, "loss": 0.8015, "step": 160 }, { "epoch": 0.29035166816952207, "grad_norm": 2.033383216145857, "learning_rate": 3.603603603603604e-06, "loss": 0.7695, "step": 161 }, { "epoch": 0.2921550946798918, "grad_norm": 2.374348511862132, "learning_rate": 3.6261261261261266e-06, "loss": 0.762, "step": 162 }, { "epoch": 0.2939585211902615, "grad_norm": 2.114360094597133, "learning_rate": 3.648648648648649e-06, "loss": 0.8444, "step": 163 }, { "epoch": 0.2957619477006312, "grad_norm": 1.9931929796238907, "learning_rate": 3.6711711711711716e-06, "loss": 0.7882, "step": 164 }, { "epoch": 0.2975653742110009, "grad_norm": 2.0730938718533145, "learning_rate": 3.693693693693694e-06, "loss": 0.7745, "step": 165 }, { "epoch": 0.2993688007213706, "grad_norm": 1.8554364231513298, "learning_rate": 3.7162162162162162e-06, "loss": 0.7253, "step": 166 }, { "epoch": 0.3011722272317403, "grad_norm": 2.149623516434781, "learning_rate": 3.7387387387387394e-06, "loss": 0.8954, "step": 167 }, { "epoch": 0.30297565374211, "grad_norm": 2.4856316208076503, "learning_rate": 3.7612612612612612e-06, "loss": 0.8402, "step": 168 }, { "epoch": 0.3047790802524797, "grad_norm": 2.1406112105466035, "learning_rate": 3.7837837837837844e-06, "loss": 0.8636, "step": 169 }, { "epoch": 0.3065825067628494, "grad_norm": 2.2289790923203205, "learning_rate": 3.8063063063063067e-06, "loss": 0.7428, "step": 170 }, { "epoch": 0.3083859332732191, "grad_norm": 2.004209812667466, "learning_rate": 3.828828828828829e-06, "loss": 0.7797, "step": 171 }, { "epoch": 0.3101893597835888, "grad_norm": 2.006314497006802, "learning_rate": 3.851351351351352e-06, "loss": 0.7593, "step": 172 }, { "epoch": 0.31199278629395855, "grad_norm": 2.282382563008822, "learning_rate": 3.8738738738738744e-06, "loss": 0.9733, "step": 173 }, { "epoch": 0.31379621280432823, "grad_norm": 2.0355833568890946, "learning_rate": 3.896396396396397e-06, "loss": 0.8561, "step": 174 }, { "epoch": 0.3155996393146979, "grad_norm": 2.259718701083019, "learning_rate": 3.918918918918919e-06, "loss": 0.797, "step": 175 }, { "epoch": 0.3174030658250676, "grad_norm": 2.1729996844233455, "learning_rate": 3.941441441441442e-06, "loss": 0.7527, "step": 176 }, { "epoch": 0.31920649233543735, "grad_norm": 2.60117835410255, "learning_rate": 3.9639639639639645e-06, "loss": 1.0225, "step": 177 }, { "epoch": 0.32100991884580704, "grad_norm": 2.2528379596704604, "learning_rate": 3.986486486486487e-06, "loss": 0.7965, "step": 178 }, { "epoch": 0.32281334535617673, "grad_norm": 2.3132904967648082, "learning_rate": 4.009009009009009e-06, "loss": 0.8112, "step": 179 }, { "epoch": 0.3246167718665464, "grad_norm": 2.5263030575643564, "learning_rate": 4.031531531531531e-06, "loss": 0.8432, "step": 180 }, { "epoch": 0.32642019837691616, "grad_norm": 2.2940008917196817, "learning_rate": 4.0540540540540545e-06, "loss": 0.7679, "step": 181 }, { "epoch": 0.32822362488728585, "grad_norm": 2.1976286649954355, "learning_rate": 4.076576576576577e-06, "loss": 0.853, "step": 182 }, { "epoch": 0.33002705139765554, "grad_norm": 2.287412594205084, "learning_rate": 4.099099099099099e-06, "loss": 0.8528, "step": 183 }, { "epoch": 0.3318304779080252, "grad_norm": 2.265413975048022, "learning_rate": 4.121621621621622e-06, "loss": 0.8891, "step": 184 }, { "epoch": 0.33363390441839497, "grad_norm": 2.1347188948409626, "learning_rate": 4.1441441441441446e-06, "loss": 0.7172, "step": 185 }, { "epoch": 0.33543733092876465, "grad_norm": 1.9036460590607482, "learning_rate": 4.166666666666667e-06, "loss": 0.8139, "step": 186 }, { "epoch": 0.33724075743913434, "grad_norm": 2.1982101741220723, "learning_rate": 4.189189189189189e-06, "loss": 0.7872, "step": 187 }, { "epoch": 0.3390441839495041, "grad_norm": 1.9974084871264948, "learning_rate": 4.2117117117117115e-06, "loss": 0.7211, "step": 188 }, { "epoch": 0.3408476104598738, "grad_norm": 2.343692275926038, "learning_rate": 4.234234234234235e-06, "loss": 0.8724, "step": 189 }, { "epoch": 0.34265103697024346, "grad_norm": 2.4031986369572103, "learning_rate": 4.256756756756757e-06, "loss": 0.8742, "step": 190 }, { "epoch": 0.34445446348061315, "grad_norm": 2.077375945110708, "learning_rate": 4.27927927927928e-06, "loss": 0.7802, "step": 191 }, { "epoch": 0.3462578899909829, "grad_norm": 2.027445526646734, "learning_rate": 4.301801801801802e-06, "loss": 0.8748, "step": 192 }, { "epoch": 0.3480613165013526, "grad_norm": 2.4821342969963522, "learning_rate": 4.324324324324325e-06, "loss": 0.7775, "step": 193 }, { "epoch": 0.34986474301172227, "grad_norm": 2.356163875058872, "learning_rate": 4.346846846846847e-06, "loss": 0.7257, "step": 194 }, { "epoch": 0.35166816952209196, "grad_norm": 2.295664379900927, "learning_rate": 4.369369369369369e-06, "loss": 0.7341, "step": 195 }, { "epoch": 0.3534715960324617, "grad_norm": 2.3623157091199882, "learning_rate": 4.391891891891892e-06, "loss": 0.8198, "step": 196 }, { "epoch": 0.3552750225428314, "grad_norm": 2.186111001087259, "learning_rate": 4.414414414414415e-06, "loss": 0.8505, "step": 197 }, { "epoch": 0.3570784490532011, "grad_norm": 2.2466120322229504, "learning_rate": 4.436936936936938e-06, "loss": 0.8352, "step": 198 }, { "epoch": 0.35888187556357076, "grad_norm": 2.2300706402504837, "learning_rate": 4.45945945945946e-06, "loss": 0.9199, "step": 199 }, { "epoch": 0.3606853020739405, "grad_norm": 2.175470606319692, "learning_rate": 4.4819819819819824e-06, "loss": 0.6704, "step": 200 }, { "epoch": 0.3624887285843102, "grad_norm": 2.0680624661556397, "learning_rate": 4.504504504504505e-06, "loss": 0.8349, "step": 201 }, { "epoch": 0.3642921550946799, "grad_norm": 2.1728800893786895, "learning_rate": 4.527027027027027e-06, "loss": 0.8035, "step": 202 }, { "epoch": 0.36609558160504957, "grad_norm": 2.360603677367448, "learning_rate": 4.54954954954955e-06, "loss": 0.7997, "step": 203 }, { "epoch": 0.3678990081154193, "grad_norm": 2.0528022407082425, "learning_rate": 4.5720720720720725e-06, "loss": 0.7377, "step": 204 }, { "epoch": 0.369702434625789, "grad_norm": 2.145107444388918, "learning_rate": 4.594594594594596e-06, "loss": 0.7246, "step": 205 }, { "epoch": 0.3715058611361587, "grad_norm": 2.142567114305303, "learning_rate": 4.617117117117118e-06, "loss": 0.767, "step": 206 }, { "epoch": 0.37330928764652843, "grad_norm": 2.250353037529415, "learning_rate": 4.63963963963964e-06, "loss": 0.744, "step": 207 }, { "epoch": 0.3751127141568981, "grad_norm": 2.4107500279982577, "learning_rate": 4.6621621621621625e-06, "loss": 0.9702, "step": 208 }, { "epoch": 0.3769161406672678, "grad_norm": 1.83721607411279, "learning_rate": 4.684684684684685e-06, "loss": 0.7841, "step": 209 }, { "epoch": 0.3787195671776375, "grad_norm": 2.187844750445605, "learning_rate": 4.707207207207208e-06, "loss": 0.7828, "step": 210 }, { "epoch": 0.38052299368800724, "grad_norm": 2.6155119945345913, "learning_rate": 4.72972972972973e-06, "loss": 0.7754, "step": 211 }, { "epoch": 0.3823264201983769, "grad_norm": 2.087136361991544, "learning_rate": 4.7522522522522526e-06, "loss": 0.7961, "step": 212 }, { "epoch": 0.3841298467087466, "grad_norm": 2.052469543045352, "learning_rate": 4.774774774774775e-06, "loss": 0.74, "step": 213 }, { "epoch": 0.3859332732191163, "grad_norm": 2.326611456516733, "learning_rate": 4.797297297297297e-06, "loss": 0.8593, "step": 214 }, { "epoch": 0.38773669972948605, "grad_norm": 2.161826276327704, "learning_rate": 4.81981981981982e-06, "loss": 0.8495, "step": 215 }, { "epoch": 0.38954012623985573, "grad_norm": 2.2046675857827416, "learning_rate": 4.842342342342343e-06, "loss": 0.7909, "step": 216 }, { "epoch": 0.3913435527502254, "grad_norm": 1.9987211123805613, "learning_rate": 4.864864864864866e-06, "loss": 0.7073, "step": 217 }, { "epoch": 0.3931469792605951, "grad_norm": 2.060842447295872, "learning_rate": 4.887387387387388e-06, "loss": 0.7356, "step": 218 }, { "epoch": 0.39495040577096485, "grad_norm": 2.167011002864499, "learning_rate": 4.90990990990991e-06, "loss": 0.9376, "step": 219 }, { "epoch": 0.39675383228133454, "grad_norm": 2.120387815326822, "learning_rate": 4.932432432432433e-06, "loss": 0.923, "step": 220 }, { "epoch": 0.3985572587917042, "grad_norm": 2.0360713757641675, "learning_rate": 4.954954954954955e-06, "loss": 0.7894, "step": 221 }, { "epoch": 0.4003606853020739, "grad_norm": 1.9555130956875506, "learning_rate": 4.977477477477478e-06, "loss": 0.7949, "step": 222 }, { "epoch": 0.40216411181244366, "grad_norm": 2.1707142856979553, "learning_rate": 5e-06, "loss": 0.8106, "step": 223 }, { "epoch": 0.40396753832281335, "grad_norm": 2.0454935807171566, "learning_rate": 5.022522522522523e-06, "loss": 0.7753, "step": 224 }, { "epoch": 0.40577096483318303, "grad_norm": 1.9818753837111296, "learning_rate": 5.045045045045045e-06, "loss": 0.7425, "step": 225 }, { "epoch": 0.4075743913435528, "grad_norm": 2.089730216802721, "learning_rate": 5.067567567567568e-06, "loss": 0.7953, "step": 226 }, { "epoch": 0.40937781785392247, "grad_norm": 2.1221432507716593, "learning_rate": 5.0900900900900905e-06, "loss": 0.7228, "step": 227 }, { "epoch": 0.41118124436429215, "grad_norm": 2.302405327887679, "learning_rate": 5.112612612612613e-06, "loss": 0.8949, "step": 228 }, { "epoch": 0.41298467087466184, "grad_norm": 2.1247546433849203, "learning_rate": 5.135135135135135e-06, "loss": 0.7007, "step": 229 }, { "epoch": 0.4147880973850316, "grad_norm": 2.043647163631905, "learning_rate": 5.157657657657657e-06, "loss": 0.7338, "step": 230 }, { "epoch": 0.4165915238954013, "grad_norm": 2.270245087265323, "learning_rate": 5.180180180180181e-06, "loss": 0.8341, "step": 231 }, { "epoch": 0.41839495040577096, "grad_norm": 2.183816726602172, "learning_rate": 5.202702702702704e-06, "loss": 0.8531, "step": 232 }, { "epoch": 0.42019837691614065, "grad_norm": 2.1547309877335903, "learning_rate": 5.225225225225226e-06, "loss": 0.7347, "step": 233 }, { "epoch": 0.4220018034265104, "grad_norm": 2.206709828896525, "learning_rate": 5.247747747747748e-06, "loss": 0.8665, "step": 234 }, { "epoch": 0.4238052299368801, "grad_norm": 2.0559406646994987, "learning_rate": 5.2702702702702705e-06, "loss": 0.6986, "step": 235 }, { "epoch": 0.42560865644724977, "grad_norm": 2.018292507433674, "learning_rate": 5.292792792792794e-06, "loss": 0.6952, "step": 236 }, { "epoch": 0.42741208295761945, "grad_norm": 2.0090659177061805, "learning_rate": 5.315315315315316e-06, "loss": 0.8291, "step": 237 }, { "epoch": 0.4292155094679892, "grad_norm": 2.301997115792778, "learning_rate": 5.337837837837838e-06, "loss": 0.7145, "step": 238 }, { "epoch": 0.4310189359783589, "grad_norm": 2.067738383833435, "learning_rate": 5.360360360360361e-06, "loss": 0.7995, "step": 239 }, { "epoch": 0.4328223624887286, "grad_norm": 2.2635903833648245, "learning_rate": 5.382882882882884e-06, "loss": 0.7224, "step": 240 }, { "epoch": 0.43462578899909826, "grad_norm": 2.275286557637392, "learning_rate": 5.405405405405406e-06, "loss": 0.8183, "step": 241 }, { "epoch": 0.436429215509468, "grad_norm": 2.1179111948903513, "learning_rate": 5.427927927927928e-06, "loss": 0.772, "step": 242 }, { "epoch": 0.4382326420198377, "grad_norm": 2.164539734222491, "learning_rate": 5.450450450450451e-06, "loss": 0.7319, "step": 243 }, { "epoch": 0.4400360685302074, "grad_norm": 2.1650273688319515, "learning_rate": 5.472972972972973e-06, "loss": 0.8215, "step": 244 }, { "epoch": 0.4418394950405771, "grad_norm": 2.075382076411821, "learning_rate": 5.495495495495496e-06, "loss": 0.7398, "step": 245 }, { "epoch": 0.4436429215509468, "grad_norm": 1.8310224669393116, "learning_rate": 5.518018018018018e-06, "loss": 0.7462, "step": 246 }, { "epoch": 0.4454463480613165, "grad_norm": 2.1020377635825955, "learning_rate": 5.540540540540541e-06, "loss": 0.7952, "step": 247 }, { "epoch": 0.4472497745716862, "grad_norm": 2.20596373228597, "learning_rate": 5.563063063063063e-06, "loss": 0.7155, "step": 248 }, { "epoch": 0.44905320108205593, "grad_norm": 2.157295752855383, "learning_rate": 5.585585585585585e-06, "loss": 0.7448, "step": 249 }, { "epoch": 0.4508566275924256, "grad_norm": 2.0756794403814767, "learning_rate": 5.608108108108109e-06, "loss": 0.9216, "step": 250 }, { "epoch": 0.4526600541027953, "grad_norm": 2.3192109805255123, "learning_rate": 5.6306306306306316e-06, "loss": 0.7884, "step": 251 }, { "epoch": 0.454463480613165, "grad_norm": 2.0220751392261467, "learning_rate": 5.653153153153154e-06, "loss": 0.7508, "step": 252 }, { "epoch": 0.45626690712353474, "grad_norm": 2.13660207924998, "learning_rate": 5.675675675675676e-06, "loss": 0.7559, "step": 253 }, { "epoch": 0.4580703336339044, "grad_norm": 2.114868597507177, "learning_rate": 5.6981981981981985e-06, "loss": 0.8145, "step": 254 }, { "epoch": 0.4598737601442741, "grad_norm": 2.1123935906108313, "learning_rate": 5.720720720720722e-06, "loss": 0.8049, "step": 255 }, { "epoch": 0.4616771866546438, "grad_norm": 2.4676890144062957, "learning_rate": 5.743243243243244e-06, "loss": 0.7957, "step": 256 }, { "epoch": 0.46348061316501354, "grad_norm": 2.168073489314107, "learning_rate": 5.765765765765766e-06, "loss": 0.7666, "step": 257 }, { "epoch": 0.46528403967538323, "grad_norm": 2.072398238803128, "learning_rate": 5.7882882882882885e-06, "loss": 0.8192, "step": 258 }, { "epoch": 0.4670874661857529, "grad_norm": 2.563194398373102, "learning_rate": 5.810810810810811e-06, "loss": 0.8409, "step": 259 }, { "epoch": 0.4688908926961226, "grad_norm": 2.304094514669754, "learning_rate": 5.833333333333334e-06, "loss": 0.8541, "step": 260 }, { "epoch": 0.47069431920649235, "grad_norm": 2.198147090270184, "learning_rate": 5.855855855855856e-06, "loss": 0.6985, "step": 261 }, { "epoch": 0.47249774571686204, "grad_norm": 2.1753059174557112, "learning_rate": 5.8783783783783786e-06, "loss": 0.8182, "step": 262 }, { "epoch": 0.4743011722272317, "grad_norm": 2.2589974392486356, "learning_rate": 5.900900900900901e-06, "loss": 0.755, "step": 263 }, { "epoch": 0.47610459873760147, "grad_norm": 2.1007335416554507, "learning_rate": 5.923423423423423e-06, "loss": 0.748, "step": 264 }, { "epoch": 0.47790802524797116, "grad_norm": 1.9793246514216147, "learning_rate": 5.945945945945947e-06, "loss": 0.6935, "step": 265 }, { "epoch": 0.47971145175834085, "grad_norm": 2.310796934006057, "learning_rate": 5.9684684684684694e-06, "loss": 0.7839, "step": 266 }, { "epoch": 0.48151487826871053, "grad_norm": 2.1733242596190374, "learning_rate": 5.990990990990992e-06, "loss": 0.7748, "step": 267 }, { "epoch": 0.4833183047790803, "grad_norm": 2.134235761560956, "learning_rate": 6.013513513513514e-06, "loss": 0.7545, "step": 268 }, { "epoch": 0.48512173128944996, "grad_norm": 2.279296657289438, "learning_rate": 6.036036036036037e-06, "loss": 0.7399, "step": 269 }, { "epoch": 0.48692515779981965, "grad_norm": 2.2785713614109566, "learning_rate": 6.0585585585585595e-06, "loss": 0.9031, "step": 270 }, { "epoch": 0.48872858431018934, "grad_norm": 2.17376130509456, "learning_rate": 6.081081081081082e-06, "loss": 0.8876, "step": 271 }, { "epoch": 0.4905320108205591, "grad_norm": 2.2837346570989694, "learning_rate": 6.103603603603604e-06, "loss": 0.852, "step": 272 }, { "epoch": 0.49233543733092877, "grad_norm": 2.308367261822732, "learning_rate": 6.126126126126126e-06, "loss": 0.7471, "step": 273 }, { "epoch": 0.49413886384129846, "grad_norm": 2.353472342894518, "learning_rate": 6.1486486486486495e-06, "loss": 0.865, "step": 274 }, { "epoch": 0.49594229035166815, "grad_norm": 2.2188631595778077, "learning_rate": 6.171171171171172e-06, "loss": 0.8253, "step": 275 }, { "epoch": 0.4977457168620379, "grad_norm": 2.4928969764456212, "learning_rate": 6.193693693693694e-06, "loss": 0.9809, "step": 276 }, { "epoch": 0.4995491433724076, "grad_norm": 2.429996582097567, "learning_rate": 6.2162162162162164e-06, "loss": 0.9529, "step": 277 }, { "epoch": 0.5013525698827773, "grad_norm": 2.156174833500389, "learning_rate": 6.238738738738739e-06, "loss": 0.7549, "step": 278 }, { "epoch": 0.503155996393147, "grad_norm": 2.1145480790559916, "learning_rate": 6.261261261261262e-06, "loss": 0.7325, "step": 279 }, { "epoch": 0.5049594229035167, "grad_norm": 2.3827039996906887, "learning_rate": 6.283783783783784e-06, "loss": 0.8234, "step": 280 }, { "epoch": 0.5067628494138864, "grad_norm": 2.2520674713452635, "learning_rate": 6.3063063063063065e-06, "loss": 0.805, "step": 281 }, { "epoch": 0.5085662759242561, "grad_norm": 2.2751328483189344, "learning_rate": 6.328828828828829e-06, "loss": 0.7916, "step": 282 }, { "epoch": 0.5103697024346258, "grad_norm": 2.105893153039127, "learning_rate": 6.351351351351351e-06, "loss": 0.7339, "step": 283 }, { "epoch": 0.5121731289449954, "grad_norm": 2.3088480635629853, "learning_rate": 6.373873873873875e-06, "loss": 0.7908, "step": 284 }, { "epoch": 0.5139765554553652, "grad_norm": 2.2019643640954567, "learning_rate": 6.396396396396397e-06, "loss": 0.8165, "step": 285 }, { "epoch": 0.5157799819657349, "grad_norm": 2.2224375489982195, "learning_rate": 6.41891891891892e-06, "loss": 0.7884, "step": 286 }, { "epoch": 0.5175834084761046, "grad_norm": 2.2123927819948257, "learning_rate": 6.441441441441442e-06, "loss": 0.8261, "step": 287 }, { "epoch": 0.5193868349864743, "grad_norm": 2.2449799654503093, "learning_rate": 6.463963963963964e-06, "loss": 0.8287, "step": 288 }, { "epoch": 0.521190261496844, "grad_norm": 2.244952248633715, "learning_rate": 6.486486486486487e-06, "loss": 0.823, "step": 289 }, { "epoch": 0.5229936880072137, "grad_norm": 2.146102311067904, "learning_rate": 6.50900900900901e-06, "loss": 0.8201, "step": 290 }, { "epoch": 0.5247971145175834, "grad_norm": 2.2107121939036642, "learning_rate": 6.531531531531532e-06, "loss": 0.7452, "step": 291 }, { "epoch": 0.5266005410279531, "grad_norm": 2.274570701724603, "learning_rate": 6.554054054054054e-06, "loss": 0.7995, "step": 292 }, { "epoch": 0.5284039675383229, "grad_norm": 2.3901970457801323, "learning_rate": 6.5765765765765775e-06, "loss": 0.8293, "step": 293 }, { "epoch": 0.5302073940486925, "grad_norm": 2.2046674887614617, "learning_rate": 6.5990990990991e-06, "loss": 0.7711, "step": 294 }, { "epoch": 0.5320108205590622, "grad_norm": 2.181130141644271, "learning_rate": 6.621621621621622e-06, "loss": 0.7467, "step": 295 }, { "epoch": 0.5338142470694319, "grad_norm": 2.1545307052885434, "learning_rate": 6.644144144144144e-06, "loss": 0.7591, "step": 296 }, { "epoch": 0.5356176735798016, "grad_norm": 2.000955616731471, "learning_rate": 6.666666666666667e-06, "loss": 0.7405, "step": 297 }, { "epoch": 0.5374211000901713, "grad_norm": 2.270342762369627, "learning_rate": 6.689189189189191e-06, "loss": 0.8734, "step": 298 }, { "epoch": 0.539224526600541, "grad_norm": 1.9914609602909024, "learning_rate": 6.711711711711713e-06, "loss": 0.9029, "step": 299 }, { "epoch": 0.5410279531109107, "grad_norm": 2.1862011960901238, "learning_rate": 6.734234234234235e-06, "loss": 0.7847, "step": 300 }, { "epoch": 0.5428313796212805, "grad_norm": 2.162455670849857, "learning_rate": 6.7567567567567575e-06, "loss": 0.796, "step": 301 }, { "epoch": 0.5446348061316502, "grad_norm": 2.190782292923182, "learning_rate": 6.77927927927928e-06, "loss": 0.8361, "step": 302 }, { "epoch": 0.5464382326420198, "grad_norm": 2.343114673195786, "learning_rate": 6.801801801801803e-06, "loss": 0.9578, "step": 303 }, { "epoch": 0.5482416591523895, "grad_norm": 2.137122549596483, "learning_rate": 6.824324324324325e-06, "loss": 0.8094, "step": 304 }, { "epoch": 0.5500450856627592, "grad_norm": 2.106947969785909, "learning_rate": 6.846846846846848e-06, "loss": 0.7836, "step": 305 }, { "epoch": 0.5518485121731289, "grad_norm": 2.146424998051745, "learning_rate": 6.86936936936937e-06, "loss": 0.7997, "step": 306 }, { "epoch": 0.5536519386834986, "grad_norm": 1.8042726808487144, "learning_rate": 6.891891891891892e-06, "loss": 0.7629, "step": 307 }, { "epoch": 0.5554553651938684, "grad_norm": 2.142256130584483, "learning_rate": 6.914414414414415e-06, "loss": 0.8131, "step": 308 }, { "epoch": 0.5572587917042381, "grad_norm": 2.0191516225293116, "learning_rate": 6.936936936936938e-06, "loss": 0.8088, "step": 309 }, { "epoch": 0.5590622182146078, "grad_norm": 2.2871792098661015, "learning_rate": 6.95945945945946e-06, "loss": 0.7785, "step": 310 }, { "epoch": 0.5608656447249775, "grad_norm": 2.2023691318993905, "learning_rate": 6.981981981981982e-06, "loss": 0.828, "step": 311 }, { "epoch": 0.5626690712353472, "grad_norm": 2.234138281725447, "learning_rate": 7.0045045045045045e-06, "loss": 0.7213, "step": 312 }, { "epoch": 0.5644724977457168, "grad_norm": 2.2818031085235932, "learning_rate": 7.027027027027028e-06, "loss": 0.6795, "step": 313 }, { "epoch": 0.5662759242560865, "grad_norm": 2.0842673839335846, "learning_rate": 7.04954954954955e-06, "loss": 0.8452, "step": 314 }, { "epoch": 0.5680793507664562, "grad_norm": 2.002642371369536, "learning_rate": 7.072072072072072e-06, "loss": 0.89, "step": 315 }, { "epoch": 0.569882777276826, "grad_norm": 2.083265325972135, "learning_rate": 7.0945945945945946e-06, "loss": 0.7551, "step": 316 }, { "epoch": 0.5716862037871957, "grad_norm": 2.0768528412350586, "learning_rate": 7.117117117117117e-06, "loss": 0.7289, "step": 317 }, { "epoch": 0.5734896302975654, "grad_norm": 1.9695169006653106, "learning_rate": 7.139639639639641e-06, "loss": 0.7184, "step": 318 }, { "epoch": 0.5752930568079351, "grad_norm": 2.084828562576803, "learning_rate": 7.162162162162163e-06, "loss": 0.7489, "step": 319 }, { "epoch": 0.5770964833183048, "grad_norm": 2.209350364597537, "learning_rate": 7.1846846846846855e-06, "loss": 0.7564, "step": 320 }, { "epoch": 0.5788999098286745, "grad_norm": 2.2105476735413054, "learning_rate": 7.207207207207208e-06, "loss": 0.84, "step": 321 }, { "epoch": 0.5807033363390441, "grad_norm": 2.5857780263589616, "learning_rate": 7.229729729729731e-06, "loss": 0.8624, "step": 322 }, { "epoch": 0.5825067628494139, "grad_norm": 2.19851358126889, "learning_rate": 7.252252252252253e-06, "loss": 0.6745, "step": 323 }, { "epoch": 0.5843101893597836, "grad_norm": 2.144886414168463, "learning_rate": 7.2747747747747755e-06, "loss": 0.8314, "step": 324 }, { "epoch": 0.5861136158701533, "grad_norm": 2.11225650931259, "learning_rate": 7.297297297297298e-06, "loss": 0.9043, "step": 325 }, { "epoch": 0.587917042380523, "grad_norm": 2.301105964642942, "learning_rate": 7.31981981981982e-06, "loss": 0.7197, "step": 326 }, { "epoch": 0.5897204688908927, "grad_norm": 2.3975525696500806, "learning_rate": 7.342342342342343e-06, "loss": 0.9116, "step": 327 }, { "epoch": 0.5915238954012624, "grad_norm": 2.1452666571693255, "learning_rate": 7.3648648648648655e-06, "loss": 0.7906, "step": 328 }, { "epoch": 0.5933273219116321, "grad_norm": 2.4409882639138134, "learning_rate": 7.387387387387388e-06, "loss": 0.8716, "step": 329 }, { "epoch": 0.5951307484220018, "grad_norm": 2.171908727845542, "learning_rate": 7.40990990990991e-06, "loss": 0.7304, "step": 330 }, { "epoch": 0.5969341749323716, "grad_norm": 2.247976351955023, "learning_rate": 7.4324324324324324e-06, "loss": 0.7739, "step": 331 }, { "epoch": 0.5987376014427412, "grad_norm": 2.238977362911319, "learning_rate": 7.4549549549549564e-06, "loss": 0.6977, "step": 332 }, { "epoch": 0.6005410279531109, "grad_norm": 1.9416451363003897, "learning_rate": 7.477477477477479e-06, "loss": 0.7653, "step": 333 }, { "epoch": 0.6023444544634806, "grad_norm": 2.184729395722401, "learning_rate": 7.500000000000001e-06, "loss": 0.7434, "step": 334 }, { "epoch": 0.6041478809738503, "grad_norm": 2.1317078889032173, "learning_rate": 7.5225225225225225e-06, "loss": 0.7047, "step": 335 }, { "epoch": 0.60595130748422, "grad_norm": 1.9907910766284589, "learning_rate": 7.545045045045045e-06, "loss": 0.7499, "step": 336 }, { "epoch": 0.6077547339945897, "grad_norm": 2.1264240696103487, "learning_rate": 7.567567567567569e-06, "loss": 0.8601, "step": 337 }, { "epoch": 0.6095581605049594, "grad_norm": 2.2046257254141035, "learning_rate": 7.590090090090091e-06, "loss": 0.8269, "step": 338 }, { "epoch": 0.6113615870153292, "grad_norm": 2.1215302945356695, "learning_rate": 7.612612612612613e-06, "loss": 0.7938, "step": 339 }, { "epoch": 0.6131650135256989, "grad_norm": 2.229799150273438, "learning_rate": 7.635135135135135e-06, "loss": 0.7993, "step": 340 }, { "epoch": 0.6149684400360685, "grad_norm": 2.129796061875063, "learning_rate": 7.657657657657658e-06, "loss": 0.8045, "step": 341 }, { "epoch": 0.6167718665464382, "grad_norm": 2.0716978158297685, "learning_rate": 7.680180180180181e-06, "loss": 0.7925, "step": 342 }, { "epoch": 0.6185752930568079, "grad_norm": 2.130760036578581, "learning_rate": 7.702702702702704e-06, "loss": 0.8987, "step": 343 }, { "epoch": 0.6203787195671776, "grad_norm": 1.9658518773305242, "learning_rate": 7.725225225225226e-06, "loss": 0.7385, "step": 344 }, { "epoch": 0.6221821460775473, "grad_norm": 2.2057377247232557, "learning_rate": 7.747747747747749e-06, "loss": 0.8462, "step": 345 }, { "epoch": 0.6239855725879171, "grad_norm": 1.998255103995078, "learning_rate": 7.77027027027027e-06, "loss": 0.726, "step": 346 }, { "epoch": 0.6257889990982868, "grad_norm": 2.1024774999508384, "learning_rate": 7.792792792792793e-06, "loss": 0.7351, "step": 347 }, { "epoch": 0.6275924256086565, "grad_norm": 2.045648302941062, "learning_rate": 7.815315315315317e-06, "loss": 0.7605, "step": 348 }, { "epoch": 0.6293958521190262, "grad_norm": 2.2083257683921373, "learning_rate": 7.837837837837838e-06, "loss": 0.7318, "step": 349 }, { "epoch": 0.6311992786293958, "grad_norm": 2.187495516104006, "learning_rate": 7.860360360360361e-06, "loss": 0.8159, "step": 350 }, { "epoch": 0.6330027051397655, "grad_norm": 2.0804434740408007, "learning_rate": 7.882882882882884e-06, "loss": 0.7396, "step": 351 }, { "epoch": 0.6348061316501352, "grad_norm": 2.037209239741434, "learning_rate": 7.905405405405406e-06, "loss": 0.7953, "step": 352 }, { "epoch": 0.6366095581605049, "grad_norm": 2.0556472196259055, "learning_rate": 7.927927927927929e-06, "loss": 0.6783, "step": 353 }, { "epoch": 0.6384129846708747, "grad_norm": 2.0562270484236898, "learning_rate": 7.95045045045045e-06, "loss": 0.7568, "step": 354 }, { "epoch": 0.6402164111812444, "grad_norm": 1.9938898178619702, "learning_rate": 7.972972972972974e-06, "loss": 0.8204, "step": 355 }, { "epoch": 0.6420198376916141, "grad_norm": 2.072139406380031, "learning_rate": 7.995495495495497e-06, "loss": 0.8035, "step": 356 }, { "epoch": 0.6438232642019838, "grad_norm": 2.0697820853812674, "learning_rate": 8.018018018018018e-06, "loss": 0.7801, "step": 357 }, { "epoch": 0.6456266907123535, "grad_norm": 2.0412202940720623, "learning_rate": 8.040540540540541e-06, "loss": 0.7014, "step": 358 }, { "epoch": 0.6474301172227231, "grad_norm": 1.9405663633560892, "learning_rate": 8.063063063063063e-06, "loss": 0.6928, "step": 359 }, { "epoch": 0.6492335437330928, "grad_norm": 2.1239135328030234, "learning_rate": 8.085585585585586e-06, "loss": 0.887, "step": 360 }, { "epoch": 0.6510369702434626, "grad_norm": 2.436623349264573, "learning_rate": 8.108108108108109e-06, "loss": 0.8074, "step": 361 }, { "epoch": 0.6528403967538323, "grad_norm": 2.0833195573627195, "learning_rate": 8.130630630630632e-06, "loss": 0.7037, "step": 362 }, { "epoch": 0.654643823264202, "grad_norm": 1.9625571637730252, "learning_rate": 8.153153153153154e-06, "loss": 0.7199, "step": 363 }, { "epoch": 0.6564472497745717, "grad_norm": 1.980886318284568, "learning_rate": 8.175675675675677e-06, "loss": 0.749, "step": 364 }, { "epoch": 0.6582506762849414, "grad_norm": 2.3686023326598593, "learning_rate": 8.198198198198198e-06, "loss": 1.0184, "step": 365 }, { "epoch": 0.6600541027953111, "grad_norm": 2.2671248043012264, "learning_rate": 8.220720720720721e-06, "loss": 0.7957, "step": 366 }, { "epoch": 0.6618575293056808, "grad_norm": 2.2527130831079027, "learning_rate": 8.243243243243245e-06, "loss": 0.8575, "step": 367 }, { "epoch": 0.6636609558160504, "grad_norm": 2.5533424372075446, "learning_rate": 8.265765765765766e-06, "loss": 0.7706, "step": 368 }, { "epoch": 0.6654643823264202, "grad_norm": 2.467984541574478, "learning_rate": 8.288288288288289e-06, "loss": 0.7647, "step": 369 }, { "epoch": 0.6672678088367899, "grad_norm": 2.183449489939818, "learning_rate": 8.31081081081081e-06, "loss": 0.8173, "step": 370 }, { "epoch": 0.6690712353471596, "grad_norm": 2.2867069876523582, "learning_rate": 8.333333333333334e-06, "loss": 0.8269, "step": 371 }, { "epoch": 0.6708746618575293, "grad_norm": 2.2573156909093957, "learning_rate": 8.355855855855857e-06, "loss": 0.7816, "step": 372 }, { "epoch": 0.672678088367899, "grad_norm": 1.9766214188381033, "learning_rate": 8.378378378378378e-06, "loss": 0.7515, "step": 373 }, { "epoch": 0.6744815148782687, "grad_norm": 2.1750484801074057, "learning_rate": 8.400900900900901e-06, "loss": 0.8656, "step": 374 }, { "epoch": 0.6762849413886384, "grad_norm": 2.230627699222089, "learning_rate": 8.423423423423423e-06, "loss": 0.773, "step": 375 }, { "epoch": 0.6780883678990082, "grad_norm": 2.0314632652565763, "learning_rate": 8.445945945945948e-06, "loss": 0.7375, "step": 376 }, { "epoch": 0.6798917944093779, "grad_norm": 2.005821931578866, "learning_rate": 8.46846846846847e-06, "loss": 0.7262, "step": 377 }, { "epoch": 0.6816952209197475, "grad_norm": 2.1743917104398647, "learning_rate": 8.490990990990992e-06, "loss": 0.7824, "step": 378 }, { "epoch": 0.6834986474301172, "grad_norm": 2.0955934020895066, "learning_rate": 8.513513513513514e-06, "loss": 0.785, "step": 379 }, { "epoch": 0.6853020739404869, "grad_norm": 2.000365871785507, "learning_rate": 8.536036036036037e-06, "loss": 0.648, "step": 380 }, { "epoch": 0.6871055004508566, "grad_norm": 2.1478172637074744, "learning_rate": 8.55855855855856e-06, "loss": 0.8075, "step": 381 }, { "epoch": 0.6889089269612263, "grad_norm": 2.134460577230095, "learning_rate": 8.581081081081082e-06, "loss": 0.9026, "step": 382 }, { "epoch": 0.690712353471596, "grad_norm": 2.14542331689987, "learning_rate": 8.603603603603605e-06, "loss": 0.8901, "step": 383 }, { "epoch": 0.6925157799819658, "grad_norm": 2.135300301139234, "learning_rate": 8.626126126126126e-06, "loss": 0.7259, "step": 384 }, { "epoch": 0.6943192064923355, "grad_norm": 2.474623212671607, "learning_rate": 8.64864864864865e-06, "loss": 0.8629, "step": 385 }, { "epoch": 0.6961226330027052, "grad_norm": 1.975908066289463, "learning_rate": 8.671171171171172e-06, "loss": 0.7249, "step": 386 }, { "epoch": 0.6979260595130748, "grad_norm": 2.1653693128183016, "learning_rate": 8.693693693693694e-06, "loss": 0.8081, "step": 387 }, { "epoch": 0.6997294860234445, "grad_norm": 1.8567902438166204, "learning_rate": 8.716216216216217e-06, "loss": 0.7579, "step": 388 }, { "epoch": 0.7015329125338142, "grad_norm": 2.2215481111685484, "learning_rate": 8.738738738738739e-06, "loss": 0.9716, "step": 389 }, { "epoch": 0.7033363390441839, "grad_norm": 2.3046170296242, "learning_rate": 8.761261261261262e-06, "loss": 0.7795, "step": 390 }, { "epoch": 0.7051397655545536, "grad_norm": 2.131248198058394, "learning_rate": 8.783783783783785e-06, "loss": 0.9155, "step": 391 }, { "epoch": 0.7069431920649234, "grad_norm": 1.924396723021384, "learning_rate": 8.806306306306306e-06, "loss": 0.7556, "step": 392 }, { "epoch": 0.7087466185752931, "grad_norm": 2.1117533927836996, "learning_rate": 8.82882882882883e-06, "loss": 0.8406, "step": 393 }, { "epoch": 0.7105500450856628, "grad_norm": 2.075709429966764, "learning_rate": 8.851351351351351e-06, "loss": 0.745, "step": 394 }, { "epoch": 0.7123534715960325, "grad_norm": 2.16985925608763, "learning_rate": 8.873873873873876e-06, "loss": 0.6691, "step": 395 }, { "epoch": 0.7141568981064021, "grad_norm": 2.0797926880074846, "learning_rate": 8.896396396396397e-06, "loss": 0.8238, "step": 396 }, { "epoch": 0.7159603246167718, "grad_norm": 2.0186248223482997, "learning_rate": 8.91891891891892e-06, "loss": 0.7898, "step": 397 }, { "epoch": 0.7177637511271415, "grad_norm": 2.185076250626117, "learning_rate": 8.941441441441442e-06, "loss": 0.7142, "step": 398 }, { "epoch": 0.7195671776375113, "grad_norm": 2.1357619098512384, "learning_rate": 8.963963963963965e-06, "loss": 0.723, "step": 399 }, { "epoch": 0.721370604147881, "grad_norm": 2.117174241205152, "learning_rate": 8.986486486486488e-06, "loss": 0.7863, "step": 400 }, { "epoch": 0.7231740306582507, "grad_norm": 2.0651532522605214, "learning_rate": 9.00900900900901e-06, "loss": 0.7213, "step": 401 }, { "epoch": 0.7249774571686204, "grad_norm": 1.9757749532276578, "learning_rate": 9.031531531531533e-06, "loss": 0.7742, "step": 402 }, { "epoch": 0.7267808836789901, "grad_norm": 2.3393951402241755, "learning_rate": 9.054054054054054e-06, "loss": 0.7721, "step": 403 }, { "epoch": 0.7285843101893598, "grad_norm": 2.2574331805115064, "learning_rate": 9.076576576576577e-06, "loss": 0.9634, "step": 404 }, { "epoch": 0.7303877366997295, "grad_norm": 2.0915118092689524, "learning_rate": 9.0990990990991e-06, "loss": 0.8077, "step": 405 }, { "epoch": 0.7321911632100991, "grad_norm": 2.1653381189020524, "learning_rate": 9.121621621621622e-06, "loss": 0.7777, "step": 406 }, { "epoch": 0.7339945897204689, "grad_norm": 2.1328560039152458, "learning_rate": 9.144144144144145e-06, "loss": 0.7882, "step": 407 }, { "epoch": 0.7357980162308386, "grad_norm": 2.201274651740219, "learning_rate": 9.166666666666666e-06, "loss": 0.7608, "step": 408 }, { "epoch": 0.7376014427412083, "grad_norm": 2.2359271481989587, "learning_rate": 9.189189189189191e-06, "loss": 0.8347, "step": 409 }, { "epoch": 0.739404869251578, "grad_norm": 2.1161842611073034, "learning_rate": 9.211711711711713e-06, "loss": 0.7557, "step": 410 }, { "epoch": 0.7412082957619477, "grad_norm": 2.028791176838769, "learning_rate": 9.234234234234236e-06, "loss": 0.7183, "step": 411 }, { "epoch": 0.7430117222723174, "grad_norm": 2.0656955597804503, "learning_rate": 9.256756756756757e-06, "loss": 0.7699, "step": 412 }, { "epoch": 0.7448151487826871, "grad_norm": 2.087727535028477, "learning_rate": 9.27927927927928e-06, "loss": 0.7679, "step": 413 }, { "epoch": 0.7466185752930569, "grad_norm": 2.49764840147548, "learning_rate": 9.301801801801804e-06, "loss": 0.8287, "step": 414 }, { "epoch": 0.7484220018034266, "grad_norm": 1.9741285413389515, "learning_rate": 9.324324324324325e-06, "loss": 0.8161, "step": 415 }, { "epoch": 0.7502254283137962, "grad_norm": 1.9406848284047182, "learning_rate": 9.346846846846848e-06, "loss": 0.768, "step": 416 }, { "epoch": 0.7520288548241659, "grad_norm": 2.244408508249851, "learning_rate": 9.36936936936937e-06, "loss": 0.8611, "step": 417 }, { "epoch": 0.7538322813345356, "grad_norm": 2.158384016489991, "learning_rate": 9.391891891891893e-06, "loss": 0.8361, "step": 418 }, { "epoch": 0.7556357078449053, "grad_norm": 2.066053768199076, "learning_rate": 9.414414414414416e-06, "loss": 0.819, "step": 419 }, { "epoch": 0.757439134355275, "grad_norm": 2.1350022569990603, "learning_rate": 9.436936936936937e-06, "loss": 0.7075, "step": 420 }, { "epoch": 0.7592425608656447, "grad_norm": 2.0051316619920745, "learning_rate": 9.45945945945946e-06, "loss": 0.8319, "step": 421 }, { "epoch": 0.7610459873760145, "grad_norm": 2.225160013360467, "learning_rate": 9.481981981981982e-06, "loss": 0.7308, "step": 422 }, { "epoch": 0.7628494138863842, "grad_norm": 2.276993744188313, "learning_rate": 9.504504504504505e-06, "loss": 0.8014, "step": 423 }, { "epoch": 0.7646528403967539, "grad_norm": 1.8858762886928577, "learning_rate": 9.527027027027028e-06, "loss": 0.7923, "step": 424 }, { "epoch": 0.7664562669071235, "grad_norm": 2.0252801103636195, "learning_rate": 9.54954954954955e-06, "loss": 0.76, "step": 425 }, { "epoch": 0.7682596934174932, "grad_norm": 2.0418148296691503, "learning_rate": 9.572072072072073e-06, "loss": 0.7714, "step": 426 }, { "epoch": 0.7700631199278629, "grad_norm": 2.133301976541229, "learning_rate": 9.594594594594594e-06, "loss": 0.918, "step": 427 }, { "epoch": 0.7718665464382326, "grad_norm": 1.8904070256803192, "learning_rate": 9.617117117117117e-06, "loss": 0.8019, "step": 428 }, { "epoch": 0.7736699729486023, "grad_norm": 2.1802687710567445, "learning_rate": 9.63963963963964e-06, "loss": 0.7124, "step": 429 }, { "epoch": 0.7754733994589721, "grad_norm": 2.0041551174293883, "learning_rate": 9.662162162162164e-06, "loss": 0.7317, "step": 430 }, { "epoch": 0.7772768259693418, "grad_norm": 2.1685058642707085, "learning_rate": 9.684684684684685e-06, "loss": 0.8182, "step": 431 }, { "epoch": 0.7790802524797115, "grad_norm": 2.0821300887019394, "learning_rate": 9.707207207207208e-06, "loss": 0.7483, "step": 432 }, { "epoch": 0.7808836789900812, "grad_norm": 2.017254787966865, "learning_rate": 9.729729729729732e-06, "loss": 0.7439, "step": 433 }, { "epoch": 0.7826871055004508, "grad_norm": 2.0938808603633965, "learning_rate": 9.752252252252253e-06, "loss": 0.727, "step": 434 }, { "epoch": 0.7844905320108205, "grad_norm": 2.0745407772671784, "learning_rate": 9.774774774774776e-06, "loss": 0.8423, "step": 435 }, { "epoch": 0.7862939585211902, "grad_norm": 2.080200509416444, "learning_rate": 9.797297297297298e-06, "loss": 0.7642, "step": 436 }, { "epoch": 0.78809738503156, "grad_norm": 2.0994306655505537, "learning_rate": 9.81981981981982e-06, "loss": 0.839, "step": 437 }, { "epoch": 0.7899008115419297, "grad_norm": 2.2979389003423423, "learning_rate": 9.842342342342344e-06, "loss": 1.0683, "step": 438 }, { "epoch": 0.7917042380522994, "grad_norm": 2.0022713778993046, "learning_rate": 9.864864864864865e-06, "loss": 0.8482, "step": 439 }, { "epoch": 0.7935076645626691, "grad_norm": 2.079766390913082, "learning_rate": 9.887387387387388e-06, "loss": 0.8196, "step": 440 }, { "epoch": 0.7953110910730388, "grad_norm": 2.2693808506736555, "learning_rate": 9.90990990990991e-06, "loss": 0.8566, "step": 441 }, { "epoch": 0.7971145175834085, "grad_norm": 2.0568829176639767, "learning_rate": 9.932432432432433e-06, "loss": 0.6902, "step": 442 }, { "epoch": 0.7989179440937781, "grad_norm": 2.0605295508114687, "learning_rate": 9.954954954954956e-06, "loss": 0.7233, "step": 443 }, { "epoch": 0.8007213706041478, "grad_norm": 2.0444950860289532, "learning_rate": 9.97747747747748e-06, "loss": 0.7836, "step": 444 }, { "epoch": 0.8025247971145176, "grad_norm": 2.110829091240123, "learning_rate": 1e-05, "loss": 0.8386, "step": 445 }, { "epoch": 0.8043282236248873, "grad_norm": 1.9884942336111662, "learning_rate": 9.999998454785508e-06, "loss": 0.7082, "step": 446 }, { "epoch": 0.806131650135257, "grad_norm": 2.259740074055523, "learning_rate": 9.999993819142988e-06, "loss": 0.7289, "step": 447 }, { "epoch": 0.8079350766456267, "grad_norm": 1.9397862708682205, "learning_rate": 9.999986093075303e-06, "loss": 0.8564, "step": 448 }, { "epoch": 0.8097385031559964, "grad_norm": 2.024093634732536, "learning_rate": 9.99997527658723e-06, "loss": 0.8771, "step": 449 }, { "epoch": 0.8115419296663661, "grad_norm": 2.1014829667276866, "learning_rate": 9.999961369685454e-06, "loss": 0.7321, "step": 450 }, { "epoch": 0.8133453561767358, "grad_norm": 2.048448841813617, "learning_rate": 9.999944372378571e-06, "loss": 0.7546, "step": 451 }, { "epoch": 0.8151487826871056, "grad_norm": 2.1737934515944346, "learning_rate": 9.999924284677087e-06, "loss": 0.7508, "step": 452 }, { "epoch": 0.8169522091974752, "grad_norm": 2.1001584800928037, "learning_rate": 9.999901106593418e-06, "loss": 0.7644, "step": 453 }, { "epoch": 0.8187556357078449, "grad_norm": 1.8767527278907021, "learning_rate": 9.999874838141888e-06, "loss": 0.7667, "step": 454 }, { "epoch": 0.8205590622182146, "grad_norm": 2.04063723684597, "learning_rate": 9.999845479338735e-06, "loss": 0.8819, "step": 455 }, { "epoch": 0.8223624887285843, "grad_norm": 2.0555425814497728, "learning_rate": 9.999813030202106e-06, "loss": 0.7877, "step": 456 }, { "epoch": 0.824165915238954, "grad_norm": 1.967379340855907, "learning_rate": 9.999777490752056e-06, "loss": 0.8801, "step": 457 }, { "epoch": 0.8259693417493237, "grad_norm": 2.2507392839674556, "learning_rate": 9.99973886101055e-06, "loss": 0.7568, "step": 458 }, { "epoch": 0.8277727682596934, "grad_norm": 2.5630927142344935, "learning_rate": 9.99969714100147e-06, "loss": 0.9138, "step": 459 }, { "epoch": 0.8295761947700632, "grad_norm": 2.173332163298101, "learning_rate": 9.999652330750595e-06, "loss": 0.8281, "step": 460 }, { "epoch": 0.8313796212804329, "grad_norm": 1.916637925944768, "learning_rate": 9.999604430285628e-06, "loss": 0.7754, "step": 461 }, { "epoch": 0.8331830477908025, "grad_norm": 2.1152249416855384, "learning_rate": 9.999553439636171e-06, "loss": 0.8997, "step": 462 }, { "epoch": 0.8349864743011722, "grad_norm": 2.0689074493196955, "learning_rate": 9.999499358833745e-06, "loss": 0.7964, "step": 463 }, { "epoch": 0.8367899008115419, "grad_norm": 2.0154413915192064, "learning_rate": 9.999442187911774e-06, "loss": 0.7699, "step": 464 }, { "epoch": 0.8385933273219116, "grad_norm": 1.9151923147532852, "learning_rate": 9.999381926905592e-06, "loss": 0.6932, "step": 465 }, { "epoch": 0.8403967538322813, "grad_norm": 2.1665705435782336, "learning_rate": 9.999318575852451e-06, "loss": 1.0093, "step": 466 }, { "epoch": 0.842200180342651, "grad_norm": 1.8047363755961323, "learning_rate": 9.999252134791504e-06, "loss": 0.6659, "step": 467 }, { "epoch": 0.8440036068530208, "grad_norm": 2.151615247814168, "learning_rate": 9.999182603763816e-06, "loss": 0.7546, "step": 468 }, { "epoch": 0.8458070333633905, "grad_norm": 2.4423751661740503, "learning_rate": 9.999109982812368e-06, "loss": 0.9198, "step": 469 }, { "epoch": 0.8476104598737602, "grad_norm": 1.9717826223860573, "learning_rate": 9.99903427198204e-06, "loss": 0.7544, "step": 470 }, { "epoch": 0.8494138863841298, "grad_norm": 2.0312411419769005, "learning_rate": 9.99895547131963e-06, "loss": 0.8107, "step": 471 }, { "epoch": 0.8512173128944995, "grad_norm": 2.157429067167941, "learning_rate": 9.998873580873848e-06, "loss": 0.6818, "step": 472 }, { "epoch": 0.8530207394048692, "grad_norm": 2.165363885628237, "learning_rate": 9.998788600695304e-06, "loss": 0.7382, "step": 473 }, { "epoch": 0.8548241659152389, "grad_norm": 2.147934133999578, "learning_rate": 9.998700530836525e-06, "loss": 0.8056, "step": 474 }, { "epoch": 0.8566275924256087, "grad_norm": 1.9152522216267172, "learning_rate": 9.998609371351944e-06, "loss": 0.8791, "step": 475 }, { "epoch": 0.8584310189359784, "grad_norm": 2.0560094856153976, "learning_rate": 9.998515122297909e-06, "loss": 0.8172, "step": 476 }, { "epoch": 0.8602344454463481, "grad_norm": 2.123184949091883, "learning_rate": 9.99841778373267e-06, "loss": 0.8243, "step": 477 }, { "epoch": 0.8620378719567178, "grad_norm": 2.178911188445628, "learning_rate": 9.998317355716393e-06, "loss": 0.9132, "step": 478 }, { "epoch": 0.8638412984670875, "grad_norm": 2.2802080985811766, "learning_rate": 9.99821383831115e-06, "loss": 0.8641, "step": 479 }, { "epoch": 0.8656447249774571, "grad_norm": 2.0900308677818287, "learning_rate": 9.998107231580925e-06, "loss": 0.7905, "step": 480 }, { "epoch": 0.8674481514878268, "grad_norm": 1.9731645490377003, "learning_rate": 9.99799753559161e-06, "loss": 0.639, "step": 481 }, { "epoch": 0.8692515779981965, "grad_norm": 2.253348194223949, "learning_rate": 9.997884750411004e-06, "loss": 0.7037, "step": 482 }, { "epoch": 0.8710550045085663, "grad_norm": 2.052696164388112, "learning_rate": 9.99776887610882e-06, "loss": 0.7247, "step": 483 }, { "epoch": 0.872858431018936, "grad_norm": 2.0018489890660196, "learning_rate": 9.997649912756678e-06, "loss": 0.8574, "step": 484 }, { "epoch": 0.8746618575293057, "grad_norm": 2.168583333160758, "learning_rate": 9.997527860428108e-06, "loss": 0.7786, "step": 485 }, { "epoch": 0.8764652840396754, "grad_norm": 1.943218814932057, "learning_rate": 9.99740271919855e-06, "loss": 0.8582, "step": 486 }, { "epoch": 0.8782687105500451, "grad_norm": 2.3671546883192964, "learning_rate": 9.997274489145348e-06, "loss": 0.8454, "step": 487 }, { "epoch": 0.8800721370604148, "grad_norm": 1.970853464509309, "learning_rate": 9.997143170347762e-06, "loss": 0.8135, "step": 488 }, { "epoch": 0.8818755635707844, "grad_norm": 2.0194288814353505, "learning_rate": 9.997008762886957e-06, "loss": 0.8322, "step": 489 }, { "epoch": 0.8836789900811542, "grad_norm": 1.7905144760024025, "learning_rate": 9.99687126684601e-06, "loss": 0.6747, "step": 490 }, { "epoch": 0.8854824165915239, "grad_norm": 1.8598765372123462, "learning_rate": 9.996730682309905e-06, "loss": 0.7077, "step": 491 }, { "epoch": 0.8872858431018936, "grad_norm": 2.0349723629280194, "learning_rate": 9.996587009365534e-06, "loss": 1.0192, "step": 492 }, { "epoch": 0.8890892696122633, "grad_norm": 1.8670659449601439, "learning_rate": 9.9964402481017e-06, "loss": 0.7877, "step": 493 }, { "epoch": 0.890892696122633, "grad_norm": 1.9920058723596443, "learning_rate": 9.996290398609115e-06, "loss": 0.7732, "step": 494 }, { "epoch": 0.8926961226330027, "grad_norm": 2.115518704658833, "learning_rate": 9.996137460980397e-06, "loss": 0.9214, "step": 495 }, { "epoch": 0.8944995491433724, "grad_norm": 1.8785646679648142, "learning_rate": 9.995981435310078e-06, "loss": 0.7817, "step": 496 }, { "epoch": 0.8963029756537421, "grad_norm": 2.055980396750251, "learning_rate": 9.99582232169459e-06, "loss": 0.7329, "step": 497 }, { "epoch": 0.8981064021641119, "grad_norm": 2.3245341827701016, "learning_rate": 9.995660120232282e-06, "loss": 0.7507, "step": 498 }, { "epoch": 0.8999098286744815, "grad_norm": 1.9959708222467396, "learning_rate": 9.99549483102341e-06, "loss": 0.8384, "step": 499 }, { "epoch": 0.9017132551848512, "grad_norm": 1.8680585762600073, "learning_rate": 9.995326454170132e-06, "loss": 0.7024, "step": 500 }, { "epoch": 0.9035166816952209, "grad_norm": 2.0117030514545378, "learning_rate": 9.995154989776523e-06, "loss": 0.7802, "step": 501 }, { "epoch": 0.9053201082055906, "grad_norm": 2.137679667033616, "learning_rate": 9.994980437948563e-06, "loss": 0.8063, "step": 502 }, { "epoch": 0.9071235347159603, "grad_norm": 2.1841726174477225, "learning_rate": 9.994802798794138e-06, "loss": 0.8739, "step": 503 }, { "epoch": 0.90892696122633, "grad_norm": 2.0156304452286316, "learning_rate": 9.994622072423046e-06, "loss": 0.8506, "step": 504 }, { "epoch": 0.9107303877366997, "grad_norm": 1.9906410530703365, "learning_rate": 9.99443825894699e-06, "loss": 0.756, "step": 505 }, { "epoch": 0.9125338142470695, "grad_norm": 2.112608845026693, "learning_rate": 9.994251358479583e-06, "loss": 0.8051, "step": 506 }, { "epoch": 0.9143372407574392, "grad_norm": 2.043964300441665, "learning_rate": 9.994061371136347e-06, "loss": 0.7568, "step": 507 }, { "epoch": 0.9161406672678089, "grad_norm": 1.938078803529045, "learning_rate": 9.993868297034709e-06, "loss": 0.6958, "step": 508 }, { "epoch": 0.9179440937781785, "grad_norm": 2.136141218098241, "learning_rate": 9.993672136294004e-06, "loss": 0.8964, "step": 509 }, { "epoch": 0.9197475202885482, "grad_norm": 2.159727291798854, "learning_rate": 9.993472889035478e-06, "loss": 0.7743, "step": 510 }, { "epoch": 0.9215509467989179, "grad_norm": 2.067924171823381, "learning_rate": 9.993270555382283e-06, "loss": 0.7229, "step": 511 }, { "epoch": 0.9233543733092876, "grad_norm": 1.9675288285614405, "learning_rate": 9.99306513545948e-06, "loss": 0.7454, "step": 512 }, { "epoch": 0.9251577998196574, "grad_norm": 2.022409969863871, "learning_rate": 9.99285662939403e-06, "loss": 0.8741, "step": 513 }, { "epoch": 0.9269612263300271, "grad_norm": 2.1801414103496084, "learning_rate": 9.992645037314815e-06, "loss": 0.9204, "step": 514 }, { "epoch": 0.9287646528403968, "grad_norm": 2.243366786461632, "learning_rate": 9.992430359352613e-06, "loss": 0.7942, "step": 515 }, { "epoch": 0.9305680793507665, "grad_norm": 2.035046137492824, "learning_rate": 9.992212595640115e-06, "loss": 0.7946, "step": 516 }, { "epoch": 0.9323715058611362, "grad_norm": 1.9398861696717984, "learning_rate": 9.991991746311916e-06, "loss": 0.8198, "step": 517 }, { "epoch": 0.9341749323715058, "grad_norm": 2.069297097067726, "learning_rate": 9.991767811504522e-06, "loss": 0.7359, "step": 518 }, { "epoch": 0.9359783588818755, "grad_norm": 2.0501351425908516, "learning_rate": 9.991540791356342e-06, "loss": 0.7781, "step": 519 }, { "epoch": 0.9377817853922452, "grad_norm": 2.0883364441479713, "learning_rate": 9.991310686007694e-06, "loss": 0.7445, "step": 520 }, { "epoch": 0.939585211902615, "grad_norm": 2.000915313932848, "learning_rate": 9.991077495600806e-06, "loss": 0.877, "step": 521 }, { "epoch": 0.9413886384129847, "grad_norm": 2.0041196859921637, "learning_rate": 9.990841220279805e-06, "loss": 0.7847, "step": 522 }, { "epoch": 0.9431920649233544, "grad_norm": 2.0817088464090614, "learning_rate": 9.990601860190732e-06, "loss": 0.769, "step": 523 }, { "epoch": 0.9449954914337241, "grad_norm": 1.8694308179676222, "learning_rate": 9.990359415481532e-06, "loss": 0.7341, "step": 524 }, { "epoch": 0.9467989179440938, "grad_norm": 1.929784513064164, "learning_rate": 9.990113886302057e-06, "loss": 0.9216, "step": 525 }, { "epoch": 0.9486023444544635, "grad_norm": 2.0430562116596698, "learning_rate": 9.989865272804064e-06, "loss": 0.9328, "step": 526 }, { "epoch": 0.9504057709648331, "grad_norm": 2.0821273642223366, "learning_rate": 9.989613575141216e-06, "loss": 0.7013, "step": 527 }, { "epoch": 0.9522091974752029, "grad_norm": 2.107240374281522, "learning_rate": 9.989358793469089e-06, "loss": 0.8061, "step": 528 }, { "epoch": 0.9540126239855726, "grad_norm": 1.9040690842756598, "learning_rate": 9.989100927945155e-06, "loss": 0.6969, "step": 529 }, { "epoch": 0.9558160504959423, "grad_norm": 1.9025952752616262, "learning_rate": 9.988839978728798e-06, "loss": 0.8238, "step": 530 }, { "epoch": 0.957619477006312, "grad_norm": 2.26685993097902, "learning_rate": 9.988575945981308e-06, "loss": 0.72, "step": 531 }, { "epoch": 0.9594229035166817, "grad_norm": 1.9202409462283874, "learning_rate": 9.98830882986588e-06, "loss": 0.6961, "step": 532 }, { "epoch": 0.9612263300270514, "grad_norm": 1.9580497289053733, "learning_rate": 9.988038630547613e-06, "loss": 0.7772, "step": 533 }, { "epoch": 0.9630297565374211, "grad_norm": 2.1460745134447143, "learning_rate": 9.987765348193517e-06, "loss": 0.7882, "step": 534 }, { "epoch": 0.9648331830477908, "grad_norm": 2.232600000911527, "learning_rate": 9.9874889829725e-06, "loss": 0.7665, "step": 535 }, { "epoch": 0.9666366095581606, "grad_norm": 2.169430275754146, "learning_rate": 9.98720953505538e-06, "loss": 0.8654, "step": 536 }, { "epoch": 0.9684400360685302, "grad_norm": 2.1645017061078096, "learning_rate": 9.986927004614881e-06, "loss": 0.7641, "step": 537 }, { "epoch": 0.9702434625788999, "grad_norm": 1.964406868016901, "learning_rate": 9.986641391825633e-06, "loss": 0.74, "step": 538 }, { "epoch": 0.9720468890892696, "grad_norm": 2.024095725628653, "learning_rate": 9.986352696864165e-06, "loss": 0.7718, "step": 539 }, { "epoch": 0.9738503155996393, "grad_norm": 2.256779549272981, "learning_rate": 9.986060919908917e-06, "loss": 0.8262, "step": 540 }, { "epoch": 0.975653742110009, "grad_norm": 1.9660904201916358, "learning_rate": 9.985766061140233e-06, "loss": 0.6275, "step": 541 }, { "epoch": 0.9774571686203787, "grad_norm": 2.1280476336617378, "learning_rate": 9.985468120740361e-06, "loss": 0.768, "step": 542 }, { "epoch": 0.9792605951307484, "grad_norm": 1.938265598688848, "learning_rate": 9.985167098893452e-06, "loss": 0.9998, "step": 543 }, { "epoch": 0.9810640216411182, "grad_norm": 2.1379975513910487, "learning_rate": 9.984862995785564e-06, "loss": 0.8202, "step": 544 }, { "epoch": 0.9828674481514879, "grad_norm": 2.0638541854893884, "learning_rate": 9.984555811604662e-06, "loss": 0.7729, "step": 545 }, { "epoch": 0.9846708746618575, "grad_norm": 2.34875633165973, "learning_rate": 9.984245546540606e-06, "loss": 0.8073, "step": 546 }, { "epoch": 0.9864743011722272, "grad_norm": 2.1131918995078056, "learning_rate": 9.983932200785173e-06, "loss": 0.7262, "step": 547 }, { "epoch": 0.9882777276825969, "grad_norm": 1.995224926283804, "learning_rate": 9.983615774532031e-06, "loss": 0.8007, "step": 548 }, { "epoch": 0.9900811541929666, "grad_norm": 1.8849007313400998, "learning_rate": 9.983296267976766e-06, "loss": 0.6879, "step": 549 }, { "epoch": 0.9918845807033363, "grad_norm": 2.051745716721103, "learning_rate": 9.982973681316854e-06, "loss": 0.7265, "step": 550 }, { "epoch": 0.9936880072137061, "grad_norm": 2.0528171651989693, "learning_rate": 9.982648014751685e-06, "loss": 0.7505, "step": 551 }, { "epoch": 0.9954914337240758, "grad_norm": 2.2911439260287336, "learning_rate": 9.982319268482547e-06, "loss": 0.8454, "step": 552 }, { "epoch": 0.9972948602344455, "grad_norm": 2.2047269517513257, "learning_rate": 9.981987442712634e-06, "loss": 0.8355, "step": 553 }, { "epoch": 0.9990982867448152, "grad_norm": 1.7988349615615926, "learning_rate": 9.981652537647041e-06, "loss": 0.6762, "step": 554 } ], "logging_steps": 1, "max_steps": 4440, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 277, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 260908552552448.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }