{ "best_metric": 0.009820309467613697, "best_model_checkpoint": "/workspace/previous_works/RadFM/output/RadFM-Llama3-8B-pretrain-0002-embed_tokens-depth32-lora-10ep/checkpoint-10000", "epoch": 3.0, "eval_steps": 10000, "global_step": 14319, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0031426775612822125, "grad_norm": 38.333740234375, "learning_rate": 3.488372093023256e-06, "loss": 2.6324, "step": 15 }, { "epoch": 0.006285355122564425, "grad_norm": 23.8914794921875, "learning_rate": 6.976744186046512e-06, "loss": 2.3565, "step": 30 }, { "epoch": 0.009428032683846637, "grad_norm": 6.890503883361816, "learning_rate": 1.0465116279069768e-05, "loss": 1.8897, "step": 45 }, { "epoch": 0.01257071024512885, "grad_norm": 3.9464468955993652, "learning_rate": 1.3953488372093024e-05, "loss": 1.3707, "step": 60 }, { "epoch": 0.01571338780641106, "grad_norm": 4.443431854248047, "learning_rate": 1.744186046511628e-05, "loss": 1.055, "step": 75 }, { "epoch": 0.018856065367693273, "grad_norm": 3.5747361183166504, "learning_rate": 2.0930232558139536e-05, "loss": 0.9048, "step": 90 }, { "epoch": 0.02199874292897549, "grad_norm": 4.540731430053711, "learning_rate": 2.441860465116279e-05, "loss": 0.9143, "step": 105 }, { "epoch": 0.0251414204902577, "grad_norm": 4.121450424194336, "learning_rate": 2.7906976744186048e-05, "loss": 0.7641, "step": 120 }, { "epoch": 0.028284098051539912, "grad_norm": 3.1179299354553223, "learning_rate": 3.13953488372093e-05, "loss": 0.7784, "step": 135 }, { "epoch": 0.03142677561282212, "grad_norm": 2.9703869819641113, "learning_rate": 3.488372093023256e-05, "loss": 0.7299, "step": 150 }, { "epoch": 0.034569453174104335, "grad_norm": 2.706854820251465, "learning_rate": 3.837209302325582e-05, "loss": 0.6778, "step": 165 }, { "epoch": 0.03771213073538655, "grad_norm": 3.361267328262329, "learning_rate": 4.186046511627907e-05, "loss": 0.7222, "step": 180 }, { "epoch": 0.04085480829666876, "grad_norm": 4.040229797363281, "learning_rate": 4.5348837209302326e-05, "loss": 0.6684, "step": 195 }, { "epoch": 0.04399748585795098, "grad_norm": 2.817627429962158, "learning_rate": 4.883720930232558e-05, "loss": 0.7458, "step": 210 }, { "epoch": 0.04714016341923319, "grad_norm": 2.8800182342529297, "learning_rate": 5.232558139534884e-05, "loss": 0.6338, "step": 225 }, { "epoch": 0.0502828409805154, "grad_norm": 2.436993360519409, "learning_rate": 5.5813953488372095e-05, "loss": 0.6299, "step": 240 }, { "epoch": 0.05342551854179761, "grad_norm": 3.5814456939697266, "learning_rate": 5.9302325581395356e-05, "loss": 0.5728, "step": 255 }, { "epoch": 0.056568196103079824, "grad_norm": 2.8744938373565674, "learning_rate": 6.27906976744186e-05, "loss": 0.59, "step": 270 }, { "epoch": 0.059710873664362035, "grad_norm": 2.679749011993408, "learning_rate": 6.627906976744186e-05, "loss": 0.6016, "step": 285 }, { "epoch": 0.06285355122564425, "grad_norm": 3.1333463191986084, "learning_rate": 6.976744186046513e-05, "loss": 0.6569, "step": 300 }, { "epoch": 0.06599622878692646, "grad_norm": 2.2865939140319824, "learning_rate": 7.325581395348837e-05, "loss": 0.6385, "step": 315 }, { "epoch": 0.06913890634820867, "grad_norm": 2.9787251949310303, "learning_rate": 7.674418604651163e-05, "loss": 0.6307, "step": 330 }, { "epoch": 0.07228158390949088, "grad_norm": 2.078509569168091, "learning_rate": 8.023255813953489e-05, "loss": 0.5454, "step": 345 }, { "epoch": 0.0754242614707731, "grad_norm": 2.6606740951538086, "learning_rate": 8.372093023255814e-05, "loss": 0.6211, "step": 360 }, { "epoch": 0.0785669390320553, "grad_norm": 1.9346429109573364, "learning_rate": 8.72093023255814e-05, "loss": 0.5954, "step": 375 }, { "epoch": 0.08170961659333752, "grad_norm": 2.2432360649108887, "learning_rate": 9.069767441860465e-05, "loss": 0.5385, "step": 390 }, { "epoch": 0.08485229415461974, "grad_norm": 2.1645498275756836, "learning_rate": 9.418604651162792e-05, "loss": 0.592, "step": 405 }, { "epoch": 0.08799497171590195, "grad_norm": 2.1806533336639404, "learning_rate": 9.767441860465116e-05, "loss": 0.5372, "step": 420 }, { "epoch": 0.09113764927718417, "grad_norm": 2.445610761642456, "learning_rate": 9.999996802299678e-05, "loss": 0.6487, "step": 435 }, { "epoch": 0.09428032683846638, "grad_norm": 2.3592734336853027, "learning_rate": 9.999948836876656e-05, "loss": 0.5957, "step": 450 }, { "epoch": 0.09742300439974859, "grad_norm": 2.3027069568634033, "learning_rate": 9.999843313485898e-05, "loss": 0.5835, "step": 465 }, { "epoch": 0.1005656819610308, "grad_norm": 2.6429057121276855, "learning_rate": 9.999680233342161e-05, "loss": 0.592, "step": 480 }, { "epoch": 0.10370835952231301, "grad_norm": 2.0832202434539795, "learning_rate": 9.999459598322778e-05, "loss": 0.6203, "step": 495 }, { "epoch": 0.10685103708359522, "grad_norm": 2.481870412826538, "learning_rate": 9.999181410967633e-05, "loss": 0.5428, "step": 510 }, { "epoch": 0.10999371464487744, "grad_norm": 1.9621151685714722, "learning_rate": 9.99884567447914e-05, "loss": 0.6101, "step": 525 }, { "epoch": 0.11313639220615965, "grad_norm": 2.8833186626434326, "learning_rate": 9.998452392722198e-05, "loss": 0.5577, "step": 540 }, { "epoch": 0.11627906976744186, "grad_norm": 2.4447429180145264, "learning_rate": 9.998001570224158e-05, "loss": 0.566, "step": 555 }, { "epoch": 0.11942174732872407, "grad_norm": 2.141496419906616, "learning_rate": 9.997493212174753e-05, "loss": 0.6211, "step": 570 }, { "epoch": 0.12256442489000628, "grad_norm": 2.389796495437622, "learning_rate": 9.996927324426057e-05, "loss": 0.5937, "step": 585 }, { "epoch": 0.1257071024512885, "grad_norm": 2.1194262504577637, "learning_rate": 9.996303913492408e-05, "loss": 0.5847, "step": 600 }, { "epoch": 0.12884978001257072, "grad_norm": 1.7767274379730225, "learning_rate": 9.99562298655033e-05, "loss": 0.518, "step": 615 }, { "epoch": 0.13199245757385292, "grad_norm": 2.0348453521728516, "learning_rate": 9.994884551438458e-05, "loss": 0.5941, "step": 630 }, { "epoch": 0.13513513513513514, "grad_norm": 1.443819284439087, "learning_rate": 9.994088616657444e-05, "loss": 0.5022, "step": 645 }, { "epoch": 0.13827781269641734, "grad_norm": 2.1748251914978027, "learning_rate": 9.993235191369861e-05, "loss": 0.5369, "step": 660 }, { "epoch": 0.14142049025769957, "grad_norm": 1.9295774698257446, "learning_rate": 9.99232428540009e-05, "loss": 0.607, "step": 675 }, { "epoch": 0.14456316781898176, "grad_norm": 1.7530088424682617, "learning_rate": 9.991355909234224e-05, "loss": 0.5417, "step": 690 }, { "epoch": 0.147705845380264, "grad_norm": 10.02226448059082, "learning_rate": 9.990330074019925e-05, "loss": 0.5901, "step": 705 }, { "epoch": 0.1508485229415462, "grad_norm": 1.3864644765853882, "learning_rate": 9.989246791566314e-05, "loss": 0.678, "step": 720 }, { "epoch": 0.1539912005028284, "grad_norm": 1.6103929281234741, "learning_rate": 9.988106074343823e-05, "loss": 0.4741, "step": 735 }, { "epoch": 0.1571338780641106, "grad_norm": 1.5933347940444946, "learning_rate": 9.986907935484064e-05, "loss": 0.5391, "step": 750 }, { "epoch": 0.16027655562539284, "grad_norm": 1.5971338748931885, "learning_rate": 9.985652388779663e-05, "loss": 0.5782, "step": 765 }, { "epoch": 0.16341923318667503, "grad_norm": 1.559793472290039, "learning_rate": 9.984339448684113e-05, "loss": 0.5227, "step": 780 }, { "epoch": 0.16656191074795726, "grad_norm": 1.3077164888381958, "learning_rate": 9.982969130311597e-05, "loss": 0.5203, "step": 795 }, { "epoch": 0.16970458830923948, "grad_norm": 1.6828336715698242, "learning_rate": 9.98154144943683e-05, "loss": 0.5471, "step": 810 }, { "epoch": 0.17284726587052168, "grad_norm": 1.387099266052246, "learning_rate": 9.98005642249486e-05, "loss": 0.5399, "step": 825 }, { "epoch": 0.1759899434318039, "grad_norm": 1.723253607749939, "learning_rate": 9.978514066580886e-05, "loss": 0.5606, "step": 840 }, { "epoch": 0.1791326209930861, "grad_norm": 1.22931706905365, "learning_rate": 9.976914399450068e-05, "loss": 0.5024, "step": 855 }, { "epoch": 0.18227529855436833, "grad_norm": 1.4278538227081299, "learning_rate": 9.97525743951731e-05, "loss": 0.5983, "step": 870 }, { "epoch": 0.18541797611565053, "grad_norm": 1.4029372930526733, "learning_rate": 9.973543205857057e-05, "loss": 0.5699, "step": 885 }, { "epoch": 0.18856065367693275, "grad_norm": 1.3018133640289307, "learning_rate": 9.971771718203072e-05, "loss": 0.4936, "step": 900 }, { "epoch": 0.19170333123821495, "grad_norm": 1.3082265853881836, "learning_rate": 9.969942996948209e-05, "loss": 0.5025, "step": 915 }, { "epoch": 0.19484600879949718, "grad_norm": 1.2923167943954468, "learning_rate": 9.968057063144182e-05, "loss": 0.5779, "step": 930 }, { "epoch": 0.19798868636077938, "grad_norm": 1.2902971506118774, "learning_rate": 9.966113938501313e-05, "loss": 0.5373, "step": 945 }, { "epoch": 0.2011313639220616, "grad_norm": 1.391560673713684, "learning_rate": 9.964113645388293e-05, "loss": 0.5858, "step": 960 }, { "epoch": 0.2042740414833438, "grad_norm": 1.3245513439178467, "learning_rate": 9.96205620683192e-05, "loss": 0.6043, "step": 975 }, { "epoch": 0.20741671904462602, "grad_norm": 1.4998241662979126, "learning_rate": 9.95994164651683e-05, "loss": 0.5785, "step": 990 }, { "epoch": 0.21055939660590822, "grad_norm": 1.090804934501648, "learning_rate": 9.957769988785236e-05, "loss": 0.6439, "step": 1005 }, { "epoch": 0.21370207416719045, "grad_norm": 1.1564654111862183, "learning_rate": 9.955541258636631e-05, "loss": 0.5091, "step": 1020 }, { "epoch": 0.21684475172847265, "grad_norm": 1.1778066158294678, "learning_rate": 9.953255481727513e-05, "loss": 0.5456, "step": 1035 }, { "epoch": 0.21998742928975487, "grad_norm": 1.3568626642227173, "learning_rate": 9.950912684371088e-05, "loss": 0.5208, "step": 1050 }, { "epoch": 0.2231301068510371, "grad_norm": 1.804425597190857, "learning_rate": 9.948512893536961e-05, "loss": 0.4956, "step": 1065 }, { "epoch": 0.2262727844123193, "grad_norm": 1.226159930229187, "learning_rate": 9.946056136850833e-05, "loss": 0.5812, "step": 1080 }, { "epoch": 0.22941546197360152, "grad_norm": 1.1530790328979492, "learning_rate": 9.943542442594177e-05, "loss": 0.4742, "step": 1095 }, { "epoch": 0.23255813953488372, "grad_norm": 1.390417218208313, "learning_rate": 9.940971839703916e-05, "loss": 0.619, "step": 1110 }, { "epoch": 0.23570081709616594, "grad_norm": 1.4010789394378662, "learning_rate": 9.938344357772087e-05, "loss": 0.6086, "step": 1125 }, { "epoch": 0.23884349465744814, "grad_norm": 1.6488044261932373, "learning_rate": 9.935660027045506e-05, "loss": 0.551, "step": 1140 }, { "epoch": 0.24198617221873037, "grad_norm": 1.0560044050216675, "learning_rate": 9.932918878425412e-05, "loss": 0.532, "step": 1155 }, { "epoch": 0.24512884978001256, "grad_norm": 1.0651888847351074, "learning_rate": 9.930120943467117e-05, "loss": 0.5012, "step": 1170 }, { "epoch": 0.2482715273412948, "grad_norm": 1.0553079843521118, "learning_rate": 9.927266254379642e-05, "loss": 0.5576, "step": 1185 }, { "epoch": 0.251414204902577, "grad_norm": 1.007480263710022, "learning_rate": 9.924354844025339e-05, "loss": 0.4839, "step": 1200 }, { "epoch": 0.2545568824638592, "grad_norm": 1.0924334526062012, "learning_rate": 9.921386745919528e-05, "loss": 0.595, "step": 1215 }, { "epoch": 0.25769956002514144, "grad_norm": 1.3309390544891357, "learning_rate": 9.918361994230097e-05, "loss": 0.5224, "step": 1230 }, { "epoch": 0.2608422375864236, "grad_norm": 0.9702763557434082, "learning_rate": 9.915280623777114e-05, "loss": 0.4871, "step": 1245 }, { "epoch": 0.26398491514770583, "grad_norm": 1.0511876344680786, "learning_rate": 9.912142670032427e-05, "loss": 0.5861, "step": 1260 }, { "epoch": 0.26712759270898806, "grad_norm": 1.396050214767456, "learning_rate": 9.908948169119251e-05, "loss": 0.4651, "step": 1275 }, { "epoch": 0.2702702702702703, "grad_norm": 0.985396683216095, "learning_rate": 9.905697157811761e-05, "loss": 0.4302, "step": 1290 }, { "epoch": 0.27341294783155246, "grad_norm": 0.9169828295707703, "learning_rate": 9.902389673534659e-05, "loss": 0.5212, "step": 1305 }, { "epoch": 0.2765556253928347, "grad_norm": 0.9107710123062134, "learning_rate": 9.899025754362751e-05, "loss": 0.4941, "step": 1320 }, { "epoch": 0.2796983029541169, "grad_norm": 0.9720286726951599, "learning_rate": 9.8956054390205e-05, "loss": 0.5169, "step": 1335 }, { "epoch": 0.28284098051539913, "grad_norm": 1.1490366458892822, "learning_rate": 9.892128766881596e-05, "loss": 0.4973, "step": 1350 }, { "epoch": 0.28598365807668136, "grad_norm": 1.2628952264785767, "learning_rate": 9.888595777968479e-05, "loss": 0.5194, "step": 1365 }, { "epoch": 0.2891263356379635, "grad_norm": 1.1610651016235352, "learning_rate": 9.885006512951897e-05, "loss": 0.4994, "step": 1380 }, { "epoch": 0.29226901319924575, "grad_norm": 1.054768681526184, "learning_rate": 9.881361013150436e-05, "loss": 0.4664, "step": 1395 }, { "epoch": 0.295411690760528, "grad_norm": 1.0745666027069092, "learning_rate": 9.877659320530037e-05, "loss": 0.5306, "step": 1410 }, { "epoch": 0.2985543683218102, "grad_norm": 1.3258591890335083, "learning_rate": 9.873901477703516e-05, "loss": 0.5076, "step": 1425 }, { "epoch": 0.3016970458830924, "grad_norm": 1.222783088684082, "learning_rate": 9.870087527930077e-05, "loss": 0.4581, "step": 1440 }, { "epoch": 0.3048397234443746, "grad_norm": 0.9374076724052429, "learning_rate": 9.866217515114805e-05, "loss": 0.4643, "step": 1455 }, { "epoch": 0.3079824010056568, "grad_norm": 1.3485162258148193, "learning_rate": 9.862291483808173e-05, "loss": 0.5551, "step": 1470 }, { "epoch": 0.31112507856693905, "grad_norm": 0.9162548780441284, "learning_rate": 9.858309479205519e-05, "loss": 0.5592, "step": 1485 }, { "epoch": 0.3142677561282212, "grad_norm": 1.1385138034820557, "learning_rate": 9.854271547146531e-05, "loss": 0.477, "step": 1500 }, { "epoch": 0.31741043368950345, "grad_norm": 1.0023164749145508, "learning_rate": 9.850177734114718e-05, "loss": 0.4972, "step": 1515 }, { "epoch": 0.32055311125078567, "grad_norm": 2.540215492248535, "learning_rate": 9.846028087236873e-05, "loss": 0.5007, "step": 1530 }, { "epoch": 0.3236957888120679, "grad_norm": 1.2012773752212524, "learning_rate": 9.841822654282533e-05, "loss": 0.5481, "step": 1545 }, { "epoch": 0.32683846637335007, "grad_norm": 0.9517608284950256, "learning_rate": 9.837561483663429e-05, "loss": 0.567, "step": 1560 }, { "epoch": 0.3299811439346323, "grad_norm": 1.0308321714401245, "learning_rate": 9.833244624432927e-05, "loss": 0.4856, "step": 1575 }, { "epoch": 0.3331238214959145, "grad_norm": 1.118574857711792, "learning_rate": 9.828872126285465e-05, "loss": 0.465, "step": 1590 }, { "epoch": 0.33626649905719674, "grad_norm": 1.0821537971496582, "learning_rate": 9.824444039555977e-05, "loss": 0.4394, "step": 1605 }, { "epoch": 0.33940917661847897, "grad_norm": 0.8795451521873474, "learning_rate": 9.81996041521932e-05, "loss": 0.4383, "step": 1620 }, { "epoch": 0.34255185417976114, "grad_norm": 1.1455141305923462, "learning_rate": 9.815421304889687e-05, "loss": 0.4805, "step": 1635 }, { "epoch": 0.34569453174104336, "grad_norm": 1.1445369720458984, "learning_rate": 9.81082676082e-05, "loss": 0.5315, "step": 1650 }, { "epoch": 0.3488372093023256, "grad_norm": 1.0800312757492065, "learning_rate": 9.806176835901328e-05, "loss": 0.5205, "step": 1665 }, { "epoch": 0.3519798868636078, "grad_norm": 0.7038319706916809, "learning_rate": 9.801471583662263e-05, "loss": 0.515, "step": 1680 }, { "epoch": 0.35512256442489, "grad_norm": 0.9790651202201843, "learning_rate": 9.796711058268313e-05, "loss": 0.504, "step": 1695 }, { "epoch": 0.3582652419861722, "grad_norm": 1.1764894723892212, "learning_rate": 9.791895314521267e-05, "loss": 0.4806, "step": 1710 }, { "epoch": 0.36140791954745444, "grad_norm": 0.9900022745132446, "learning_rate": 9.787024407858582e-05, "loss": 0.5358, "step": 1725 }, { "epoch": 0.36455059710873666, "grad_norm": 0.8621386289596558, "learning_rate": 9.782098394352725e-05, "loss": 0.5494, "step": 1740 }, { "epoch": 0.36769327467001883, "grad_norm": 0.8717844486236572, "learning_rate": 9.777117330710547e-05, "loss": 0.4967, "step": 1755 }, { "epoch": 0.37083595223130106, "grad_norm": 0.9800569415092468, "learning_rate": 9.772081274272611e-05, "loss": 0.4538, "step": 1770 }, { "epoch": 0.3739786297925833, "grad_norm": 0.9540134072303772, "learning_rate": 9.766990283012544e-05, "loss": 0.5149, "step": 1785 }, { "epoch": 0.3771213073538655, "grad_norm": 1.0856047868728638, "learning_rate": 9.761844415536372e-05, "loss": 0.5042, "step": 1800 }, { "epoch": 0.3802639849151477, "grad_norm": 1.0914040803909302, "learning_rate": 9.756643731081833e-05, "loss": 0.5059, "step": 1815 }, { "epoch": 0.3834066624764299, "grad_norm": 1.2371134757995605, "learning_rate": 9.751388289517704e-05, "loss": 0.4506, "step": 1830 }, { "epoch": 0.38654934003771213, "grad_norm": 1.0402591228485107, "learning_rate": 9.746078151343116e-05, "loss": 0.5535, "step": 1845 }, { "epoch": 0.38969201759899436, "grad_norm": 0.6260209083557129, "learning_rate": 9.740713377686843e-05, "loss": 0.4436, "step": 1860 }, { "epoch": 0.3928346951602766, "grad_norm": 0.9588780999183655, "learning_rate": 9.735294030306611e-05, "loss": 0.5573, "step": 1875 }, { "epoch": 0.39597737272155875, "grad_norm": 1.0838474035263062, "learning_rate": 9.729820171588384e-05, "loss": 0.4627, "step": 1890 }, { "epoch": 0.399120050282841, "grad_norm": 1.0682798624038696, "learning_rate": 9.724291864545643e-05, "loss": 0.4893, "step": 1905 }, { "epoch": 0.4022627278441232, "grad_norm": 0.9129301309585571, "learning_rate": 9.718709172818661e-05, "loss": 0.4898, "step": 1920 }, { "epoch": 0.40540540540540543, "grad_norm": 1.0116883516311646, "learning_rate": 9.713072160673777e-05, "loss": 0.4615, "step": 1935 }, { "epoch": 0.4085480829666876, "grad_norm": 1.057822823524475, "learning_rate": 9.707380893002646e-05, "loss": 0.4899, "step": 1950 }, { "epoch": 0.4116907605279698, "grad_norm": 0.6419869661331177, "learning_rate": 9.7016354353215e-05, "loss": 0.4348, "step": 1965 }, { "epoch": 0.41483343808925205, "grad_norm": 0.961713433265686, "learning_rate": 9.695835853770387e-05, "loss": 0.4921, "step": 1980 }, { "epoch": 0.4179761156505343, "grad_norm": 0.9473373889923096, "learning_rate": 9.689982215112417e-05, "loss": 0.4926, "step": 1995 }, { "epoch": 0.42111879321181644, "grad_norm": 1.2034335136413574, "learning_rate": 9.684074586732987e-05, "loss": 0.5042, "step": 2010 }, { "epoch": 0.42426147077309867, "grad_norm": 0.9373855590820312, "learning_rate": 9.678113036639014e-05, "loss": 0.5076, "step": 2025 }, { "epoch": 0.4274041483343809, "grad_norm": 1.016756296157837, "learning_rate": 9.672097633458136e-05, "loss": 0.4805, "step": 2040 }, { "epoch": 0.4305468258956631, "grad_norm": 0.7454690337181091, "learning_rate": 9.666028446437942e-05, "loss": 0.5382, "step": 2055 }, { "epoch": 0.4336895034569453, "grad_norm": 0.8196286559104919, "learning_rate": 9.659905545445159e-05, "loss": 0.4613, "step": 2070 }, { "epoch": 0.4368321810182275, "grad_norm": 0.9132091403007507, "learning_rate": 9.653729000964857e-05, "loss": 0.4595, "step": 2085 }, { "epoch": 0.43997485857950974, "grad_norm": 0.8063992857933044, "learning_rate": 9.647498884099633e-05, "loss": 0.4139, "step": 2100 }, { "epoch": 0.44311753614079197, "grad_norm": 0.9756997227668762, "learning_rate": 9.641215266568794e-05, "loss": 0.3941, "step": 2115 }, { "epoch": 0.4462602137020742, "grad_norm": 0.6542510390281677, "learning_rate": 9.634878220707531e-05, "loss": 0.4768, "step": 2130 }, { "epoch": 0.44940289126335636, "grad_norm": 0.9039008617401123, "learning_rate": 9.628487819466086e-05, "loss": 0.4248, "step": 2145 }, { "epoch": 0.4525455688246386, "grad_norm": 1.1151047945022583, "learning_rate": 9.622044136408914e-05, "loss": 0.5041, "step": 2160 }, { "epoch": 0.4556882463859208, "grad_norm": 0.8580663800239563, "learning_rate": 9.615547245713836e-05, "loss": 0.4766, "step": 2175 }, { "epoch": 0.45883092394720304, "grad_norm": 0.9799042344093323, "learning_rate": 9.608997222171178e-05, "loss": 0.4714, "step": 2190 }, { "epoch": 0.4619736015084852, "grad_norm": 0.8485172986984253, "learning_rate": 9.602394141182927e-05, "loss": 0.4556, "step": 2205 }, { "epoch": 0.46511627906976744, "grad_norm": 0.9632934927940369, "learning_rate": 9.595738078761837e-05, "loss": 0.4791, "step": 2220 }, { "epoch": 0.46825895663104966, "grad_norm": 0.8843478560447693, "learning_rate": 9.589029111530586e-05, "loss": 0.4603, "step": 2235 }, { "epoch": 0.4714016341923319, "grad_norm": 1.1230348348617554, "learning_rate": 9.582267316720861e-05, "loss": 0.491, "step": 2250 }, { "epoch": 0.47454431175361406, "grad_norm": 0.8234013915061951, "learning_rate": 9.575452772172495e-05, "loss": 0.44, "step": 2265 }, { "epoch": 0.4776869893148963, "grad_norm": 0.6838919520378113, "learning_rate": 9.568585556332559e-05, "loss": 0.4456, "step": 2280 }, { "epoch": 0.4808296668761785, "grad_norm": 0.8424423336982727, "learning_rate": 9.561665748254456e-05, "loss": 0.4556, "step": 2295 }, { "epoch": 0.48397234443746073, "grad_norm": 0.6735498905181885, "learning_rate": 9.554693427597024e-05, "loss": 0.5184, "step": 2310 }, { "epoch": 0.4871150219987429, "grad_norm": 0.8868768811225891, "learning_rate": 9.5476686746236e-05, "loss": 0.5403, "step": 2325 }, { "epoch": 0.49025769956002513, "grad_norm": 0.9957670569419861, "learning_rate": 9.540591570201116e-05, "loss": 0.4997, "step": 2340 }, { "epoch": 0.49340037712130735, "grad_norm": 0.76320481300354, "learning_rate": 9.533462195799157e-05, "loss": 0.4534, "step": 2355 }, { "epoch": 0.4965430546825896, "grad_norm": 0.8841500282287598, "learning_rate": 9.526280633489018e-05, "loss": 0.4724, "step": 2370 }, { "epoch": 0.4996857322438718, "grad_norm": 0.8852142095565796, "learning_rate": 9.519046965942776e-05, "loss": 0.4655, "step": 2385 }, { "epoch": 0.502828409805154, "grad_norm": 0.839430570602417, "learning_rate": 9.511761276432321e-05, "loss": 0.4386, "step": 2400 }, { "epoch": 0.5059710873664363, "grad_norm": 0.7581266760826111, "learning_rate": 9.50442364882841e-05, "loss": 0.4774, "step": 2415 }, { "epoch": 0.5091137649277184, "grad_norm": 0.8754017949104309, "learning_rate": 9.497034167599691e-05, "loss": 0.4744, "step": 2430 }, { "epoch": 0.5122564424890006, "grad_norm": 0.9099476337432861, "learning_rate": 9.48959291781174e-05, "loss": 0.4292, "step": 2445 }, { "epoch": 0.5153991200502829, "grad_norm": 0.9721155166625977, "learning_rate": 9.482099985126079e-05, "loss": 0.4137, "step": 2460 }, { "epoch": 0.518541797611565, "grad_norm": 0.8385334014892578, "learning_rate": 9.474555455799181e-05, "loss": 0.471, "step": 2475 }, { "epoch": 0.5216844751728472, "grad_norm": 0.9853966236114502, "learning_rate": 9.466959416681495e-05, "loss": 0.4233, "step": 2490 }, { "epoch": 0.5248271527341295, "grad_norm": 1.1044224500656128, "learning_rate": 9.459311955216428e-05, "loss": 0.5188, "step": 2505 }, { "epoch": 0.5279698302954117, "grad_norm": 0.870677649974823, "learning_rate": 9.451613159439349e-05, "loss": 0.4676, "step": 2520 }, { "epoch": 0.531112507856694, "grad_norm": 0.8571140170097351, "learning_rate": 9.443863117976573e-05, "loss": 0.4863, "step": 2535 }, { "epoch": 0.5342551854179761, "grad_norm": 1.0573495626449585, "learning_rate": 9.436061920044341e-05, "loss": 0.5057, "step": 2550 }, { "epoch": 0.5373978629792583, "grad_norm": 0.9805963635444641, "learning_rate": 9.42820965544779e-05, "loss": 0.468, "step": 2565 }, { "epoch": 0.5405405405405406, "grad_norm": 0.8198602199554443, "learning_rate": 9.420306414579925e-05, "loss": 0.5054, "step": 2580 }, { "epoch": 0.5436832181018227, "grad_norm": 0.9718137979507446, "learning_rate": 9.412352288420572e-05, "loss": 0.4824, "step": 2595 }, { "epoch": 0.5468258956631049, "grad_norm": 1.0223153829574585, "learning_rate": 9.404347368535337e-05, "loss": 0.4502, "step": 2610 }, { "epoch": 0.5499685732243872, "grad_norm": 0.9398010969161987, "learning_rate": 9.396291747074547e-05, "loss": 0.4761, "step": 2625 }, { "epoch": 0.5531112507856694, "grad_norm": 0.9091777801513672, "learning_rate": 9.38818551677219e-05, "loss": 0.4033, "step": 2640 }, { "epoch": 0.5562539283469516, "grad_norm": 1.06580650806427, "learning_rate": 9.380028770944849e-05, "loss": 0.4052, "step": 2655 }, { "epoch": 0.5593966059082338, "grad_norm": 0.7236329913139343, "learning_rate": 9.371821603490627e-05, "loss": 0.4677, "step": 2670 }, { "epoch": 0.562539283469516, "grad_norm": 0.8263210654258728, "learning_rate": 9.363564108888069e-05, "loss": 0.4576, "step": 2685 }, { "epoch": 0.5656819610307983, "grad_norm": 1.022448182106018, "learning_rate": 9.355256382195068e-05, "loss": 0.4963, "step": 2700 }, { "epoch": 0.5688246385920804, "grad_norm": 0.9639766812324524, "learning_rate": 9.346898519047775e-05, "loss": 0.4113, "step": 2715 }, { "epoch": 0.5719673161533627, "grad_norm": 1.1044561862945557, "learning_rate": 9.338490615659499e-05, "loss": 0.5023, "step": 2730 }, { "epoch": 0.5751099937146449, "grad_norm": 0.8272239565849304, "learning_rate": 9.330032768819596e-05, "loss": 0.4699, "step": 2745 }, { "epoch": 0.578252671275927, "grad_norm": 0.7692523002624512, "learning_rate": 9.321525075892356e-05, "loss": 0.4292, "step": 2760 }, { "epoch": 0.5813953488372093, "grad_norm": 0.9032982587814331, "learning_rate": 9.312967634815888e-05, "loss": 0.4432, "step": 2775 }, { "epoch": 0.5845380263984915, "grad_norm": 0.7676737904548645, "learning_rate": 9.304360544100982e-05, "loss": 0.4311, "step": 2790 }, { "epoch": 0.5876807039597737, "grad_norm": 0.9019532799720764, "learning_rate": 9.29570390282998e-05, "loss": 0.4464, "step": 2805 }, { "epoch": 0.590823381521056, "grad_norm": 0.9738386869430542, "learning_rate": 9.286997810655638e-05, "loss": 0.5019, "step": 2820 }, { "epoch": 0.5939660590823381, "grad_norm": 0.7886769771575928, "learning_rate": 9.278242367799978e-05, "loss": 0.4919, "step": 2835 }, { "epoch": 0.5971087366436204, "grad_norm": 0.9002622365951538, "learning_rate": 9.269437675053129e-05, "loss": 0.4695, "step": 2850 }, { "epoch": 0.6002514142049026, "grad_norm": 0.7023227214813232, "learning_rate": 9.260583833772172e-05, "loss": 0.4338, "step": 2865 }, { "epoch": 0.6033940917661847, "grad_norm": 0.9442479014396667, "learning_rate": 9.251680945879975e-05, "loss": 0.4907, "step": 2880 }, { "epoch": 0.606536769327467, "grad_norm": 0.6304488778114319, "learning_rate": 9.24272911386401e-05, "loss": 0.4612, "step": 2895 }, { "epoch": 0.6096794468887492, "grad_norm": 0.731960117816925, "learning_rate": 9.233728440775185e-05, "loss": 0.4207, "step": 2910 }, { "epoch": 0.6128221244500315, "grad_norm": 1.083849549293518, "learning_rate": 9.224679030226648e-05, "loss": 0.4775, "step": 2925 }, { "epoch": 0.6159648020113137, "grad_norm": 0.6792687177658081, "learning_rate": 9.215580986392607e-05, "loss": 0.4708, "step": 2940 }, { "epoch": 0.6191074795725958, "grad_norm": 0.7582160830497742, "learning_rate": 9.20643441400711e-05, "loss": 0.4352, "step": 2955 }, { "epoch": 0.6222501571338781, "grad_norm": 0.7785065174102783, "learning_rate": 9.197239418362862e-05, "loss": 0.4199, "step": 2970 }, { "epoch": 0.6253928346951603, "grad_norm": 0.9076778292655945, "learning_rate": 9.187996105309995e-05, "loss": 0.4937, "step": 2985 }, { "epoch": 0.6285355122564424, "grad_norm": 0.9189762473106384, "learning_rate": 9.178704581254865e-05, "loss": 0.4553, "step": 3000 }, { "epoch": 0.6316781898177247, "grad_norm": 0.8485803008079529, "learning_rate": 9.169364953158812e-05, "loss": 0.4799, "step": 3015 }, { "epoch": 0.6348208673790069, "grad_norm": 0.8296557068824768, "learning_rate": 9.15997732853694e-05, "loss": 0.4799, "step": 3030 }, { "epoch": 0.6379635449402892, "grad_norm": 0.9346463680267334, "learning_rate": 9.150541815456874e-05, "loss": 0.4707, "step": 3045 }, { "epoch": 0.6411062225015713, "grad_norm": 1.0045510530471802, "learning_rate": 9.141058522537515e-05, "loss": 0.5216, "step": 3060 }, { "epoch": 0.6442489000628535, "grad_norm": 0.5840141773223877, "learning_rate": 9.131527558947796e-05, "loss": 0.429, "step": 3075 }, { "epoch": 0.6473915776241358, "grad_norm": 0.8743481040000916, "learning_rate": 9.121949034405417e-05, "loss": 0.4734, "step": 3090 }, { "epoch": 0.650534255185418, "grad_norm": 0.9631288051605225, "learning_rate": 9.112323059175588e-05, "loss": 0.4856, "step": 3105 }, { "epoch": 0.6536769327467001, "grad_norm": 0.7583104372024536, "learning_rate": 9.102649744069758e-05, "loss": 0.4428, "step": 3120 }, { "epoch": 0.6568196103079824, "grad_norm": 0.9227087497711182, "learning_rate": 9.092929200444337e-05, "loss": 0.4622, "step": 3135 }, { "epoch": 0.6599622878692646, "grad_norm": 0.720124363899231, "learning_rate": 9.083161540199417e-05, "loss": 0.4136, "step": 3150 }, { "epoch": 0.6631049654305469, "grad_norm": 0.6481117010116577, "learning_rate": 9.073346875777487e-05, "loss": 0.5445, "step": 3165 }, { "epoch": 0.666247642991829, "grad_norm": 0.6970652937889099, "learning_rate": 9.063485320162126e-05, "loss": 0.4247, "step": 3180 }, { "epoch": 0.6693903205531112, "grad_norm": 0.5132230520248413, "learning_rate": 9.053576986876718e-05, "loss": 0.4415, "step": 3195 }, { "epoch": 0.6725329981143935, "grad_norm": 0.7673790454864502, "learning_rate": 9.043621989983135e-05, "loss": 0.5188, "step": 3210 }, { "epoch": 0.6756756756756757, "grad_norm": 0.8441967368125916, "learning_rate": 9.033620444080428e-05, "loss": 0.4343, "step": 3225 }, { "epoch": 0.6788183532369579, "grad_norm": 0.8746171593666077, "learning_rate": 9.023572464303506e-05, "loss": 0.4114, "step": 3240 }, { "epoch": 0.6819610307982401, "grad_norm": 0.7494221925735474, "learning_rate": 9.013478166321812e-05, "loss": 0.4334, "step": 3255 }, { "epoch": 0.6851037083595223, "grad_norm": 0.7263948917388916, "learning_rate": 9.00333766633799e-05, "loss": 0.4322, "step": 3270 }, { "epoch": 0.6882463859208046, "grad_norm": 0.852172315120697, "learning_rate": 8.99315108108655e-05, "loss": 0.4506, "step": 3285 }, { "epoch": 0.6913890634820867, "grad_norm": 0.7959320545196533, "learning_rate": 8.98291852783252e-05, "loss": 0.4456, "step": 3300 }, { "epoch": 0.6945317410433689, "grad_norm": 0.5918748378753662, "learning_rate": 8.9726401243701e-05, "loss": 0.4181, "step": 3315 }, { "epoch": 0.6976744186046512, "grad_norm": 0.9726805090904236, "learning_rate": 8.962315989021304e-05, "loss": 0.4964, "step": 3330 }, { "epoch": 0.7008170961659334, "grad_norm": 0.8826568126678467, "learning_rate": 8.951946240634596e-05, "loss": 0.4702, "step": 3345 }, { "epoch": 0.7039597737272156, "grad_norm": 0.7354099154472351, "learning_rate": 8.941530998583527e-05, "loss": 0.4258, "step": 3360 }, { "epoch": 0.7071024512884978, "grad_norm": 0.9217835664749146, "learning_rate": 8.931070382765359e-05, "loss": 0.5185, "step": 3375 }, { "epoch": 0.71024512884978, "grad_norm": 0.7444872260093689, "learning_rate": 8.920564513599679e-05, "loss": 0.4534, "step": 3390 }, { "epoch": 0.7133878064110623, "grad_norm": 0.7847276926040649, "learning_rate": 8.910013512027022e-05, "loss": 0.4232, "step": 3405 }, { "epoch": 0.7165304839723444, "grad_norm": 0.8024355173110962, "learning_rate": 8.899417499507471e-05, "loss": 0.4579, "step": 3420 }, { "epoch": 0.7196731615336267, "grad_norm": 0.7088613510131836, "learning_rate": 8.888776598019266e-05, "loss": 0.4437, "step": 3435 }, { "epoch": 0.7228158390949089, "grad_norm": 0.6009235382080078, "learning_rate": 8.87809093005739e-05, "loss": 0.397, "step": 3450 }, { "epoch": 0.725958516656191, "grad_norm": 0.8743120431900024, "learning_rate": 8.867360618632172e-05, "loss": 0.5056, "step": 3465 }, { "epoch": 0.7291011942174733, "grad_norm": 0.899148166179657, "learning_rate": 8.856585787267856e-05, "loss": 0.4521, "step": 3480 }, { "epoch": 0.7322438717787555, "grad_norm": 0.8690171837806702, "learning_rate": 8.845766560001193e-05, "loss": 0.4708, "step": 3495 }, { "epoch": 0.7353865493400377, "grad_norm": 0.9699186682701111, "learning_rate": 8.834903061380002e-05, "loss": 0.4534, "step": 3510 }, { "epoch": 0.73852922690132, "grad_norm": 0.8577262163162231, "learning_rate": 8.823995416461744e-05, "loss": 0.4096, "step": 3525 }, { "epoch": 0.7416719044626021, "grad_norm": 0.7458922266960144, "learning_rate": 8.81304375081208e-05, "loss": 0.46, "step": 3540 }, { "epoch": 0.7448145820238844, "grad_norm": 0.7347140908241272, "learning_rate": 8.802048190503423e-05, "loss": 0.4684, "step": 3555 }, { "epoch": 0.7479572595851666, "grad_norm": 0.7161451578140259, "learning_rate": 8.79100886211349e-05, "loss": 0.4715, "step": 3570 }, { "epoch": 0.7510999371464487, "grad_norm": 0.8321588039398193, "learning_rate": 8.779925892723842e-05, "loss": 0.3598, "step": 3585 }, { "epoch": 0.754242614707731, "grad_norm": 0.9462142586708069, "learning_rate": 8.768799409918423e-05, "loss": 0.4404, "step": 3600 }, { "epoch": 0.7573852922690132, "grad_norm": 0.6842710971832275, "learning_rate": 8.75762954178209e-05, "loss": 0.4648, "step": 3615 }, { "epoch": 0.7605279698302954, "grad_norm": 0.8573241829872131, "learning_rate": 8.746416416899145e-05, "loss": 0.4592, "step": 3630 }, { "epoch": 0.7636706473915776, "grad_norm": 0.751291811466217, "learning_rate": 8.735160164351841e-05, "loss": 0.5319, "step": 3645 }, { "epoch": 0.7668133249528598, "grad_norm": 0.731086790561676, "learning_rate": 8.72386091371891e-05, "loss": 0.4629, "step": 3660 }, { "epoch": 0.7699560025141421, "grad_norm": 0.9289976358413696, "learning_rate": 8.712518795074063e-05, "loss": 0.4427, "step": 3675 }, { "epoch": 0.7730986800754243, "grad_norm": 0.7036064267158508, "learning_rate": 8.701133938984496e-05, "loss": 0.4679, "step": 3690 }, { "epoch": 0.7762413576367064, "grad_norm": 0.778161346912384, "learning_rate": 8.689706476509385e-05, "loss": 0.4489, "step": 3705 }, { "epoch": 0.7793840351979887, "grad_norm": 0.8694556951522827, "learning_rate": 8.678236539198382e-05, "loss": 0.4048, "step": 3720 }, { "epoch": 0.7825267127592709, "grad_norm": 0.5768362283706665, "learning_rate": 8.666724259090092e-05, "loss": 0.4434, "step": 3735 }, { "epoch": 0.7856693903205532, "grad_norm": 0.604917585849762, "learning_rate": 8.655169768710562e-05, "loss": 0.4669, "step": 3750 }, { "epoch": 0.7888120678818353, "grad_norm": 0.833985447883606, "learning_rate": 8.643573201071748e-05, "loss": 0.4267, "step": 3765 }, { "epoch": 0.7919547454431175, "grad_norm": 0.7951568365097046, "learning_rate": 8.631934689669992e-05, "loss": 0.4028, "step": 3780 }, { "epoch": 0.7950974230043998, "grad_norm": 0.7703410983085632, "learning_rate": 8.620254368484474e-05, "loss": 0.4153, "step": 3795 }, { "epoch": 0.798240100565682, "grad_norm": 0.8545910716056824, "learning_rate": 8.608532371975684e-05, "loss": 0.4949, "step": 3810 }, { "epoch": 0.8013827781269641, "grad_norm": 0.8206099271774292, "learning_rate": 8.59676883508386e-05, "loss": 0.4714, "step": 3825 }, { "epoch": 0.8045254556882464, "grad_norm": 0.7841479182243347, "learning_rate": 8.584963893227442e-05, "loss": 0.4888, "step": 3840 }, { "epoch": 0.8076681332495286, "grad_norm": 0.7417731285095215, "learning_rate": 8.573117682301514e-05, "loss": 0.4951, "step": 3855 }, { "epoch": 0.8108108108108109, "grad_norm": 0.9013925194740295, "learning_rate": 8.561230338676239e-05, "loss": 0.4542, "step": 3870 }, { "epoch": 0.813953488372093, "grad_norm": 1.2146642208099365, "learning_rate": 8.549301999195283e-05, "loss": 0.4606, "step": 3885 }, { "epoch": 0.8170961659333752, "grad_norm": 0.8740483522415161, "learning_rate": 8.537332801174245e-05, "loss": 0.4562, "step": 3900 }, { "epoch": 0.8202388434946575, "grad_norm": 0.7769590020179749, "learning_rate": 8.525322882399082e-05, "loss": 0.4385, "step": 3915 }, { "epoch": 0.8233815210559396, "grad_norm": 0.7966271042823792, "learning_rate": 8.513272381124511e-05, "loss": 0.4011, "step": 3930 }, { "epoch": 0.8265241986172219, "grad_norm": 0.6132526397705078, "learning_rate": 8.501181436072422e-05, "loss": 0.393, "step": 3945 }, { "epoch": 0.8296668761785041, "grad_norm": 0.6438138484954834, "learning_rate": 8.489050186430285e-05, "loss": 0.4226, "step": 3960 }, { "epoch": 0.8328095537397863, "grad_norm": 0.8362025022506714, "learning_rate": 8.476878771849545e-05, "loss": 0.4216, "step": 3975 }, { "epoch": 0.8359522313010685, "grad_norm": 0.770706057548523, "learning_rate": 8.464667332444012e-05, "loss": 0.4278, "step": 3990 }, { "epoch": 0.8390949088623507, "grad_norm": 0.8944802284240723, "learning_rate": 8.452416008788254e-05, "loss": 0.4609, "step": 4005 }, { "epoch": 0.8422375864236329, "grad_norm": 0.9292035102844238, "learning_rate": 8.440124941915972e-05, "loss": 0.4124, "step": 4020 }, { "epoch": 0.8453802639849152, "grad_norm": 0.6450730562210083, "learning_rate": 8.427794273318377e-05, "loss": 0.4124, "step": 4035 }, { "epoch": 0.8485229415461973, "grad_norm": 1.0732468366622925, "learning_rate": 8.415424144942569e-05, "loss": 0.4678, "step": 4050 }, { "epoch": 0.8516656191074796, "grad_norm": 0.900360107421875, "learning_rate": 8.403014699189892e-05, "loss": 0.4299, "step": 4065 }, { "epoch": 0.8548082966687618, "grad_norm": 0.7163972854614258, "learning_rate": 8.39056607891431e-05, "loss": 0.4651, "step": 4080 }, { "epoch": 0.857950974230044, "grad_norm": 0.6078224182128906, "learning_rate": 8.378078427420739e-05, "loss": 0.4612, "step": 4095 }, { "epoch": 0.8610936517913262, "grad_norm": 0.7975668907165527, "learning_rate": 8.365551888463423e-05, "loss": 0.4521, "step": 4110 }, { "epoch": 0.8642363293526084, "grad_norm": 0.7620348930358887, "learning_rate": 8.352986606244262e-05, "loss": 0.4527, "step": 4125 }, { "epoch": 0.8673790069138906, "grad_norm": 0.7811437249183655, "learning_rate": 8.340382725411155e-05, "loss": 0.4639, "step": 4140 }, { "epoch": 0.8705216844751729, "grad_norm": 0.46538805961608887, "learning_rate": 8.327740391056343e-05, "loss": 0.3793, "step": 4155 }, { "epoch": 0.873664362036455, "grad_norm": 0.893225371837616, "learning_rate": 8.315059748714728e-05, "loss": 0.4824, "step": 4170 }, { "epoch": 0.8768070395977373, "grad_norm": 0.8325145244598389, "learning_rate": 8.302340944362205e-05, "loss": 0.4623, "step": 4185 }, { "epoch": 0.8799497171590195, "grad_norm": 0.7328510880470276, "learning_rate": 8.289584124413978e-05, "loss": 0.4075, "step": 4200 }, { "epoch": 0.8830923947203017, "grad_norm": 0.35754507780075073, "learning_rate": 8.276789435722875e-05, "loss": 0.3328, "step": 4215 }, { "epoch": 0.8862350722815839, "grad_norm": 0.78349369764328, "learning_rate": 8.263957025577663e-05, "loss": 0.4962, "step": 4230 }, { "epoch": 0.8893777498428661, "grad_norm": 0.644481360912323, "learning_rate": 8.251087041701339e-05, "loss": 0.3977, "step": 4245 }, { "epoch": 0.8925204274041484, "grad_norm": 0.618881344795227, "learning_rate": 8.238179632249443e-05, "loss": 0.3967, "step": 4260 }, { "epoch": 0.8956631049654306, "grad_norm": 0.7603642344474792, "learning_rate": 8.22523494580835e-05, "loss": 0.4413, "step": 4275 }, { "epoch": 0.8988057825267127, "grad_norm": 0.6301630735397339, "learning_rate": 8.212253131393549e-05, "loss": 0.4333, "step": 4290 }, { "epoch": 0.901948460087995, "grad_norm": 0.7729358077049255, "learning_rate": 8.199234338447942e-05, "loss": 0.4633, "step": 4305 }, { "epoch": 0.9050911376492772, "grad_norm": 0.9121199250221252, "learning_rate": 8.186178716840118e-05, "loss": 0.4411, "step": 4320 }, { "epoch": 0.9082338152105593, "grad_norm": 0.5462374091148376, "learning_rate": 8.17308641686262e-05, "loss": 0.4659, "step": 4335 }, { "epoch": 0.9113764927718416, "grad_norm": 0.7599003911018372, "learning_rate": 8.15995758923023e-05, "loss": 0.4015, "step": 4350 }, { "epoch": 0.9145191703331238, "grad_norm": 0.8557884693145752, "learning_rate": 8.14679238507822e-05, "loss": 0.4574, "step": 4365 }, { "epoch": 0.9176618478944061, "grad_norm": 0.7987812757492065, "learning_rate": 8.133590955960619e-05, "loss": 0.4501, "step": 4380 }, { "epoch": 0.9208045254556882, "grad_norm": 0.8603717088699341, "learning_rate": 8.120353453848471e-05, "loss": 0.4201, "step": 4395 }, { "epoch": 0.9239472030169704, "grad_norm": 0.7066472768783569, "learning_rate": 8.107080031128078e-05, "loss": 0.4035, "step": 4410 }, { "epoch": 0.9270898805782527, "grad_norm": 0.6430373191833496, "learning_rate": 8.09377084059925e-05, "loss": 0.4141, "step": 4425 }, { "epoch": 0.9302325581395349, "grad_norm": 0.6911259889602661, "learning_rate": 8.080426035473549e-05, "loss": 0.4431, "step": 4440 }, { "epoch": 0.933375235700817, "grad_norm": 0.8445611000061035, "learning_rate": 8.067045769372515e-05, "loss": 0.4469, "step": 4455 }, { "epoch": 0.9365179132620993, "grad_norm": 0.9317618012428284, "learning_rate": 8.053630196325914e-05, "loss": 0.4051, "step": 4470 }, { "epoch": 0.9396605908233815, "grad_norm": 0.8286532163619995, "learning_rate": 8.040179470769946e-05, "loss": 0.4158, "step": 4485 }, { "epoch": 0.9428032683846638, "grad_norm": 0.7000495195388794, "learning_rate": 8.026693747545486e-05, "loss": 0.4202, "step": 4500 }, { "epoch": 0.9459459459459459, "grad_norm": 0.8104173541069031, "learning_rate": 8.013173181896283e-05, "loss": 0.4369, "step": 4515 }, { "epoch": 0.9490886235072281, "grad_norm": 0.864750862121582, "learning_rate": 7.999617929467187e-05, "loss": 0.4152, "step": 4530 }, { "epoch": 0.9522313010685104, "grad_norm": 0.7788864970207214, "learning_rate": 7.98602814630235e-05, "loss": 0.492, "step": 4545 }, { "epoch": 0.9553739786297926, "grad_norm": 0.707156777381897, "learning_rate": 7.972403988843435e-05, "loss": 0.4105, "step": 4560 }, { "epoch": 0.9585166561910748, "grad_norm": 0.8454593420028687, "learning_rate": 7.958745613927809e-05, "loss": 0.4622, "step": 4575 }, { "epoch": 0.961659333752357, "grad_norm": 0.8026373982429504, "learning_rate": 7.945053178786744e-05, "loss": 0.4236, "step": 4590 }, { "epoch": 0.9648020113136392, "grad_norm": 0.786409318447113, "learning_rate": 7.931326841043596e-05, "loss": 0.4677, "step": 4605 }, { "epoch": 0.9679446888749215, "grad_norm": 0.5381405353546143, "learning_rate": 7.917566758712005e-05, "loss": 0.443, "step": 4620 }, { "epoch": 0.9710873664362036, "grad_norm": 0.6609058380126953, "learning_rate": 7.903773090194069e-05, "loss": 0.4573, "step": 4635 }, { "epoch": 0.9742300439974858, "grad_norm": 0.7192760705947876, "learning_rate": 7.889945994278514e-05, "loss": 0.4387, "step": 4650 }, { "epoch": 0.9773727215587681, "grad_norm": 0.7502164244651794, "learning_rate": 7.87608563013888e-05, "loss": 0.399, "step": 4665 }, { "epoch": 0.9805153991200503, "grad_norm": 0.7829092144966125, "learning_rate": 7.86219215733168e-05, "loss": 0.3705, "step": 4680 }, { "epoch": 0.9836580766813325, "grad_norm": 0.791359007358551, "learning_rate": 7.848265735794558e-05, "loss": 0.4434, "step": 4695 }, { "epoch": 0.9868007542426147, "grad_norm": 0.7627493739128113, "learning_rate": 7.834306525844461e-05, "loss": 0.4496, "step": 4710 }, { "epoch": 0.9899434318038969, "grad_norm": 0.679959237575531, "learning_rate": 7.820314688175784e-05, "loss": 0.4815, "step": 4725 }, { "epoch": 0.9930861093651792, "grad_norm": 0.8766529560089111, "learning_rate": 7.806290383858523e-05, "loss": 0.4704, "step": 4740 }, { "epoch": 0.9962287869264613, "grad_norm": 1.1642574071884155, "learning_rate": 7.792233774336423e-05, "loss": 0.4974, "step": 4755 }, { "epoch": 0.9993714644877436, "grad_norm": 0.7194317579269409, "learning_rate": 7.778145021425114e-05, "loss": 0.4423, "step": 4770 }, { "epoch": 1.0025141420490258, "grad_norm": 0.7814803719520569, "learning_rate": 7.764024287310252e-05, "loss": 0.4194, "step": 4785 }, { "epoch": 1.005656819610308, "grad_norm": 0.8891781568527222, "learning_rate": 7.749871734545652e-05, "loss": 0.3977, "step": 4800 }, { "epoch": 1.0087994971715901, "grad_norm": 0.7444355487823486, "learning_rate": 7.735687526051418e-05, "loss": 0.3924, "step": 4815 }, { "epoch": 1.0119421747328725, "grad_norm": 0.9248786568641663, "learning_rate": 7.721471825112062e-05, "loss": 0.4273, "step": 4830 }, { "epoch": 1.0150848522941547, "grad_norm": 0.6513450741767883, "learning_rate": 7.70722479537463e-05, "loss": 0.3909, "step": 4845 }, { "epoch": 1.0182275298554369, "grad_norm": 0.8597205877304077, "learning_rate": 7.692946600846818e-05, "loss": 0.4027, "step": 4860 }, { "epoch": 1.021370207416719, "grad_norm": 0.9086320996284485, "learning_rate": 7.678637405895076e-05, "loss": 0.4225, "step": 4875 }, { "epoch": 1.0245128849780012, "grad_norm": 0.8219915628433228, "learning_rate": 7.66429737524273e-05, "loss": 0.4055, "step": 4890 }, { "epoch": 1.0276555625392834, "grad_norm": 0.9232605695724487, "learning_rate": 7.649926673968069e-05, "loss": 0.3801, "step": 4905 }, { "epoch": 1.0307982401005658, "grad_norm": 0.8866775035858154, "learning_rate": 7.635525467502462e-05, "loss": 0.3887, "step": 4920 }, { "epoch": 1.033940917661848, "grad_norm": 0.6395006775856018, "learning_rate": 7.62109392162844e-05, "loss": 0.4018, "step": 4935 }, { "epoch": 1.03708359522313, "grad_norm": 0.8276055455207825, "learning_rate": 7.60663220247779e-05, "loss": 0.3875, "step": 4950 }, { "epoch": 1.0402262727844123, "grad_norm": 0.8251763582229614, "learning_rate": 7.592140476529652e-05, "loss": 0.3912, "step": 4965 }, { "epoch": 1.0433689503456944, "grad_norm": 0.8321304321289062, "learning_rate": 7.577618910608591e-05, "loss": 0.4317, "step": 4980 }, { "epoch": 1.0465116279069768, "grad_norm": 0.6474670171737671, "learning_rate": 7.56306767188268e-05, "loss": 0.4594, "step": 4995 }, { "epoch": 1.049654305468259, "grad_norm": 0.6989348530769348, "learning_rate": 7.548486927861582e-05, "loss": 0.3744, "step": 5010 }, { "epoch": 1.0527969830295412, "grad_norm": 0.8184515237808228, "learning_rate": 7.533876846394613e-05, "loss": 0.3364, "step": 5025 }, { "epoch": 1.0559396605908233, "grad_norm": 0.7965102195739746, "learning_rate": 7.519237595668811e-05, "loss": 0.3934, "step": 5040 }, { "epoch": 1.0590823381521055, "grad_norm": 0.731299638748169, "learning_rate": 7.504569344207007e-05, "loss": 0.4161, "step": 5055 }, { "epoch": 1.062225015713388, "grad_norm": 0.9074578881263733, "learning_rate": 7.489872260865877e-05, "loss": 0.4103, "step": 5070 }, { "epoch": 1.06536769327467, "grad_norm": 0.8735909461975098, "learning_rate": 7.475146514834001e-05, "loss": 0.3686, "step": 5085 }, { "epoch": 1.0685103708359522, "grad_norm": 0.7814076542854309, "learning_rate": 7.460392275629918e-05, "loss": 0.3943, "step": 5100 }, { "epoch": 1.0716530483972344, "grad_norm": 0.8307476043701172, "learning_rate": 7.445609713100171e-05, "loss": 0.3999, "step": 5115 }, { "epoch": 1.0747957259585166, "grad_norm": 0.7908287048339844, "learning_rate": 7.430798997417353e-05, "loss": 0.4104, "step": 5130 }, { "epoch": 1.077938403519799, "grad_norm": 0.8598707914352417, "learning_rate": 7.415960299078143e-05, "loss": 0.3976, "step": 5145 }, { "epoch": 1.0810810810810811, "grad_norm": 0.5163241028785706, "learning_rate": 7.40109378890136e-05, "loss": 0.3506, "step": 5160 }, { "epoch": 1.0842237586423633, "grad_norm": 0.8642787933349609, "learning_rate": 7.386199638025973e-05, "loss": 0.31, "step": 5175 }, { "epoch": 1.0873664362036455, "grad_norm": 0.7603743076324463, "learning_rate": 7.371278017909148e-05, "loss": 0.4695, "step": 5190 }, { "epoch": 1.0905091137649277, "grad_norm": 0.7949853539466858, "learning_rate": 7.356329100324273e-05, "loss": 0.4076, "step": 5205 }, { "epoch": 1.0936517913262098, "grad_norm": 0.8560110926628113, "learning_rate": 7.341353057358966e-05, "loss": 0.3833, "step": 5220 }, { "epoch": 1.0967944688874922, "grad_norm": 0.632763147354126, "learning_rate": 7.326350061413114e-05, "loss": 0.4128, "step": 5235 }, { "epoch": 1.0999371464487744, "grad_norm": 0.9416031837463379, "learning_rate": 7.311320285196875e-05, "loss": 0.3665, "step": 5250 }, { "epoch": 1.1030798240100566, "grad_norm": 0.6195524334907532, "learning_rate": 7.296263901728694e-05, "loss": 0.362, "step": 5265 }, { "epoch": 1.1062225015713387, "grad_norm": 0.8545498251914978, "learning_rate": 7.281181084333311e-05, "loss": 0.361, "step": 5280 }, { "epoch": 1.109365179132621, "grad_norm": 0.75226229429245, "learning_rate": 7.26607200663977e-05, "loss": 0.3948, "step": 5295 }, { "epoch": 1.1125078566939033, "grad_norm": 0.877756655216217, "learning_rate": 7.250936842579407e-05, "loss": 0.4061, "step": 5310 }, { "epoch": 1.1156505342551855, "grad_norm": 0.5953283309936523, "learning_rate": 7.235775766383862e-05, "loss": 0.3273, "step": 5325 }, { "epoch": 1.1187932118164676, "grad_norm": 0.8206706643104553, "learning_rate": 7.220588952583071e-05, "loss": 0.3757, "step": 5340 }, { "epoch": 1.1219358893777498, "grad_norm": 0.7466344237327576, "learning_rate": 7.205376576003247e-05, "loss": 0.3892, "step": 5355 }, { "epoch": 1.125078566939032, "grad_norm": 0.8034494519233704, "learning_rate": 7.190138811764882e-05, "loss": 0.4043, "step": 5370 }, { "epoch": 1.1282212445003144, "grad_norm": 0.9050668478012085, "learning_rate": 7.174875835280716e-05, "loss": 0.3812, "step": 5385 }, { "epoch": 1.1313639220615965, "grad_norm": 0.8540876507759094, "learning_rate": 7.159587822253733e-05, "loss": 0.3645, "step": 5400 }, { "epoch": 1.1345065996228787, "grad_norm": 0.7688354849815369, "learning_rate": 7.14427494867512e-05, "loss": 0.3683, "step": 5415 }, { "epoch": 1.1376492771841609, "grad_norm": 0.6950829029083252, "learning_rate": 7.128937390822261e-05, "loss": 0.3347, "step": 5430 }, { "epoch": 1.140791954745443, "grad_norm": 0.8212427496910095, "learning_rate": 7.113575325256694e-05, "loss": 0.3775, "step": 5445 }, { "epoch": 1.1439346323067254, "grad_norm": 0.8312988877296448, "learning_rate": 7.098188928822084e-05, "loss": 0.4325, "step": 5460 }, { "epoch": 1.1470773098680076, "grad_norm": 0.9646623134613037, "learning_rate": 7.082778378642184e-05, "loss": 0.3898, "step": 5475 }, { "epoch": 1.1502199874292898, "grad_norm": 0.8333424925804138, "learning_rate": 7.0673438521188e-05, "loss": 0.4068, "step": 5490 }, { "epoch": 1.153362664990572, "grad_norm": 0.918892502784729, "learning_rate": 7.051885526929747e-05, "loss": 0.3968, "step": 5505 }, { "epoch": 1.156505342551854, "grad_norm": 0.5460782647132874, "learning_rate": 7.0364035810268e-05, "loss": 0.3672, "step": 5520 }, { "epoch": 1.1596480201131363, "grad_norm": 0.876811683177948, "learning_rate": 7.020898192633655e-05, "loss": 0.408, "step": 5535 }, { "epoch": 1.1627906976744187, "grad_norm": 0.6740222573280334, "learning_rate": 7.005369540243864e-05, "loss": 0.2995, "step": 5550 }, { "epoch": 1.1659333752357008, "grad_norm": 0.8702965378761292, "learning_rate": 6.989817802618792e-05, "loss": 0.3307, "step": 5565 }, { "epoch": 1.169076052796983, "grad_norm": 0.8837511539459229, "learning_rate": 6.974243158785554e-05, "loss": 0.3864, "step": 5580 }, { "epoch": 1.1722187303582652, "grad_norm": 0.4050454795360565, "learning_rate": 6.958645788034952e-05, "loss": 0.3525, "step": 5595 }, { "epoch": 1.1753614079195476, "grad_norm": 0.8361005187034607, "learning_rate": 6.943025869919418e-05, "loss": 0.3747, "step": 5610 }, { "epoch": 1.1785040854808297, "grad_norm": 0.841556191444397, "learning_rate": 6.92738358425094e-05, "loss": 0.406, "step": 5625 }, { "epoch": 1.181646763042112, "grad_norm": 0.629443883895874, "learning_rate": 6.911719111098996e-05, "loss": 0.4175, "step": 5640 }, { "epoch": 1.184789440603394, "grad_norm": 0.7146449685096741, "learning_rate": 6.896032630788476e-05, "loss": 0.3511, "step": 5655 }, { "epoch": 1.1879321181646763, "grad_norm": 0.8358393311500549, "learning_rate": 6.880324323897617e-05, "loss": 0.3851, "step": 5670 }, { "epoch": 1.1910747957259584, "grad_norm": 0.742857813835144, "learning_rate": 6.864594371255913e-05, "loss": 0.3821, "step": 5685 }, { "epoch": 1.1942174732872408, "grad_norm": 0.7099196910858154, "learning_rate": 6.848842953942036e-05, "loss": 0.3789, "step": 5700 }, { "epoch": 1.197360150848523, "grad_norm": 0.754542350769043, "learning_rate": 6.83307025328176e-05, "loss": 0.3472, "step": 5715 }, { "epoch": 1.2005028284098052, "grad_norm": 0.7466986775398254, "learning_rate": 6.817276450845856e-05, "loss": 0.3393, "step": 5730 }, { "epoch": 1.2036455059710873, "grad_norm": 0.7026840448379517, "learning_rate": 6.801461728448022e-05, "loss": 0.3891, "step": 5745 }, { "epoch": 1.2067881835323695, "grad_norm": 1.1348669528961182, "learning_rate": 6.785626268142777e-05, "loss": 0.3802, "step": 5760 }, { "epoch": 1.2099308610936519, "grad_norm": 0.7511578798294067, "learning_rate": 6.769770252223369e-05, "loss": 0.4252, "step": 5775 }, { "epoch": 1.213073538654934, "grad_norm": 0.8412914276123047, "learning_rate": 6.753893863219675e-05, "loss": 0.3813, "step": 5790 }, { "epoch": 1.2162162162162162, "grad_norm": 0.8765383958816528, "learning_rate": 6.737997283896103e-05, "loss": 0.3712, "step": 5805 }, { "epoch": 1.2193588937774984, "grad_norm": 0.7843053340911865, "learning_rate": 6.722080697249487e-05, "loss": 0.3776, "step": 5820 }, { "epoch": 1.2225015713387806, "grad_norm": 1.0745536088943481, "learning_rate": 6.706144286506978e-05, "loss": 0.3499, "step": 5835 }, { "epoch": 1.2256442489000627, "grad_norm": 0.7722020745277405, "learning_rate": 6.690188235123934e-05, "loss": 0.4211, "step": 5850 }, { "epoch": 1.2287869264613451, "grad_norm": 0.9631087183952332, "learning_rate": 6.674212726781814e-05, "loss": 0.3772, "step": 5865 }, { "epoch": 1.2319296040226273, "grad_norm": 0.8981698751449585, "learning_rate": 6.65821794538606e-05, "loss": 0.4598, "step": 5880 }, { "epoch": 1.2350722815839095, "grad_norm": 0.778362512588501, "learning_rate": 6.642204075063974e-05, "loss": 0.4179, "step": 5895 }, { "epoch": 1.2382149591451916, "grad_norm": 0.8421118259429932, "learning_rate": 6.626171300162615e-05, "loss": 0.3583, "step": 5910 }, { "epoch": 1.241357636706474, "grad_norm": 1.0227240324020386, "learning_rate": 6.610119805246653e-05, "loss": 0.3919, "step": 5925 }, { "epoch": 1.2445003142677562, "grad_norm": 0.5748106837272644, "learning_rate": 6.594049775096268e-05, "loss": 0.3571, "step": 5940 }, { "epoch": 1.2476429918290384, "grad_norm": 0.6924661993980408, "learning_rate": 6.577961394705008e-05, "loss": 0.3812, "step": 5955 }, { "epoch": 1.2507856693903205, "grad_norm": 0.7702043056488037, "learning_rate": 6.561854849277664e-05, "loss": 0.331, "step": 5970 }, { "epoch": 1.2539283469516027, "grad_norm": 0.6666329503059387, "learning_rate": 6.545730324228136e-05, "loss": 0.3266, "step": 5985 }, { "epoch": 1.2570710245128849, "grad_norm": 0.9120034575462341, "learning_rate": 6.529588005177305e-05, "loss": 0.4188, "step": 6000 }, { "epoch": 1.260213702074167, "grad_norm": 0.7251651287078857, "learning_rate": 6.513428077950886e-05, "loss": 0.4067, "step": 6015 }, { "epoch": 1.2633563796354494, "grad_norm": 0.6845729947090149, "learning_rate": 6.497250728577296e-05, "loss": 0.4266, "step": 6030 }, { "epoch": 1.2664990571967316, "grad_norm": 0.7530787587165833, "learning_rate": 6.481056143285512e-05, "loss": 0.3302, "step": 6045 }, { "epoch": 1.2696417347580138, "grad_norm": 0.7474608421325684, "learning_rate": 6.464844508502927e-05, "loss": 0.4305, "step": 6060 }, { "epoch": 1.2727844123192962, "grad_norm": 0.8672669529914856, "learning_rate": 6.448616010853199e-05, "loss": 0.4267, "step": 6075 }, { "epoch": 1.2759270898805783, "grad_norm": 0.7703887224197388, "learning_rate": 6.432370837154109e-05, "loss": 0.3531, "step": 6090 }, { "epoch": 1.2790697674418605, "grad_norm": 0.7432886958122253, "learning_rate": 6.416109174415406e-05, "loss": 0.3189, "step": 6105 }, { "epoch": 1.2822124450031427, "grad_norm": 0.9600912928581238, "learning_rate": 6.399831209836659e-05, "loss": 0.4036, "step": 6120 }, { "epoch": 1.2853551225644249, "grad_norm": 0.7727882862091064, "learning_rate": 6.383537130805098e-05, "loss": 0.3857, "step": 6135 }, { "epoch": 1.288497800125707, "grad_norm": 0.7628008723258972, "learning_rate": 6.367227124893455e-05, "loss": 0.4229, "step": 6150 }, { "epoch": 1.2916404776869892, "grad_norm": 0.9682219624519348, "learning_rate": 6.350901379857814e-05, "loss": 0.3544, "step": 6165 }, { "epoch": 1.2947831552482716, "grad_norm": 0.7553837895393372, "learning_rate": 6.334560083635434e-05, "loss": 0.3968, "step": 6180 }, { "epoch": 1.2979258328095538, "grad_norm": 0.7951422333717346, "learning_rate": 6.318203424342605e-05, "loss": 0.2946, "step": 6195 }, { "epoch": 1.301068510370836, "grad_norm": 0.9351706504821777, "learning_rate": 6.301831590272465e-05, "loss": 0.4203, "step": 6210 }, { "epoch": 1.304211187932118, "grad_norm": 0.8283166289329529, "learning_rate": 6.28544476989284e-05, "loss": 0.4166, "step": 6225 }, { "epoch": 1.3073538654934005, "grad_norm": 0.7889246940612793, "learning_rate": 6.269043151844081e-05, "loss": 0.4084, "step": 6240 }, { "epoch": 1.3104965430546827, "grad_norm": 0.7893148064613342, "learning_rate": 6.252626924936876e-05, "loss": 0.3327, "step": 6255 }, { "epoch": 1.3136392206159648, "grad_norm": 0.9599968194961548, "learning_rate": 6.236196278150092e-05, "loss": 0.3987, "step": 6270 }, { "epoch": 1.316781898177247, "grad_norm": 0.7326962351799011, "learning_rate": 6.219751400628593e-05, "loss": 0.3872, "step": 6285 }, { "epoch": 1.3199245757385292, "grad_norm": 0.7666275501251221, "learning_rate": 6.203292481681061e-05, "loss": 0.2906, "step": 6300 }, { "epoch": 1.3230672532998113, "grad_norm": 0.7648006081581116, "learning_rate": 6.186819710777819e-05, "loss": 0.4077, "step": 6315 }, { "epoch": 1.3262099308610937, "grad_norm": 0.8993086218833923, "learning_rate": 6.170333277548653e-05, "loss": 0.3334, "step": 6330 }, { "epoch": 1.329352608422376, "grad_norm": 0.8966405987739563, "learning_rate": 6.153833371780622e-05, "loss": 0.3772, "step": 6345 }, { "epoch": 1.332495285983658, "grad_norm": 0.955697774887085, "learning_rate": 6.137320183415877e-05, "loss": 0.3652, "step": 6360 }, { "epoch": 1.3356379635449402, "grad_norm": 0.913931667804718, "learning_rate": 6.120793902549478e-05, "loss": 0.3943, "step": 6375 }, { "epoch": 1.3387806411062226, "grad_norm": 0.471160352230072, "learning_rate": 6.1042547194272e-05, "loss": 0.3656, "step": 6390 }, { "epoch": 1.3419233186675048, "grad_norm": 0.7883521914482117, "learning_rate": 6.0877028244433444e-05, "loss": 0.3494, "step": 6405 }, { "epoch": 1.345065996228787, "grad_norm": 0.8015203475952148, "learning_rate": 6.071138408138547e-05, "loss": 0.3498, "step": 6420 }, { "epoch": 1.3482086737900691, "grad_norm": 0.8431302905082703, "learning_rate": 6.0545616611975886e-05, "loss": 0.3726, "step": 6435 }, { "epoch": 1.3513513513513513, "grad_norm": 0.6410717964172363, "learning_rate": 6.0379727744471936e-05, "loss": 0.3793, "step": 6450 }, { "epoch": 1.3544940289126335, "grad_norm": 0.8410218358039856, "learning_rate": 6.021371938853839e-05, "loss": 0.4294, "step": 6465 }, { "epoch": 1.3576367064739157, "grad_norm": 0.622178852558136, "learning_rate": 6.004759345521552e-05, "loss": 0.3373, "step": 6480 }, { "epoch": 1.360779384035198, "grad_norm": 0.8277848362922668, "learning_rate": 5.988135185689712e-05, "loss": 0.3796, "step": 6495 }, { "epoch": 1.3639220615964802, "grad_norm": 0.799150824546814, "learning_rate": 5.9714996507308465e-05, "loss": 0.3361, "step": 6510 }, { "epoch": 1.3670647391577624, "grad_norm": 0.8518102765083313, "learning_rate": 5.954852932148433e-05, "loss": 0.3913, "step": 6525 }, { "epoch": 1.3702074167190446, "grad_norm": 0.7465687990188599, "learning_rate": 5.9381952215746905e-05, "loss": 0.3546, "step": 6540 }, { "epoch": 1.373350094280327, "grad_norm": 0.7342978119850159, "learning_rate": 5.921526710768376e-05, "loss": 0.3832, "step": 6555 }, { "epoch": 1.3764927718416091, "grad_norm": 0.6754856109619141, "learning_rate": 5.9048475916125723e-05, "loss": 0.4051, "step": 6570 }, { "epoch": 1.3796354494028913, "grad_norm": 0.6392863988876343, "learning_rate": 5.888158056112486e-05, "loss": 0.3828, "step": 6585 }, { "epoch": 1.3827781269641735, "grad_norm": 0.897132933139801, "learning_rate": 5.871458296393231e-05, "loss": 0.405, "step": 6600 }, { "epoch": 1.3859208045254556, "grad_norm": 0.7124328017234802, "learning_rate": 5.854748504697624e-05, "loss": 0.3712, "step": 6615 }, { "epoch": 1.3890634820867378, "grad_norm": 0.8436194062232971, "learning_rate": 5.8380288733839585e-05, "loss": 0.3773, "step": 6630 }, { "epoch": 1.3922061596480202, "grad_norm": 0.780944287776947, "learning_rate": 5.8212995949238083e-05, "loss": 0.3529, "step": 6645 }, { "epoch": 1.3953488372093024, "grad_norm": 1.0335406064987183, "learning_rate": 5.804560861899795e-05, "loss": 0.4262, "step": 6660 }, { "epoch": 1.3984915147705845, "grad_norm": 0.7593971490859985, "learning_rate": 5.7878128670033826e-05, "loss": 0.4079, "step": 6675 }, { "epoch": 1.4016341923318667, "grad_norm": 0.7240027189254761, "learning_rate": 5.7710558030326545e-05, "loss": 0.3835, "step": 6690 }, { "epoch": 1.404776869893149, "grad_norm": 1.530868411064148, "learning_rate": 5.754289862890093e-05, "loss": 0.4294, "step": 6705 }, { "epoch": 1.4079195474544313, "grad_norm": 0.6043078899383545, "learning_rate": 5.7375152395803624e-05, "loss": 0.3343, "step": 6720 }, { "epoch": 1.4110622250157134, "grad_norm": 0.8058659434318542, "learning_rate": 5.720732126208082e-05, "loss": 0.4533, "step": 6735 }, { "epoch": 1.4142049025769956, "grad_norm": 0.7185141444206238, "learning_rate": 5.7039407159756106e-05, "loss": 0.42, "step": 6750 }, { "epoch": 1.4173475801382778, "grad_norm": 1.0086369514465332, "learning_rate": 5.687141202180817e-05, "loss": 0.3701, "step": 6765 }, { "epoch": 1.42049025769956, "grad_norm": 1.0289742946624756, "learning_rate": 5.67033377821485e-05, "loss": 0.4565, "step": 6780 }, { "epoch": 1.4236329352608421, "grad_norm": 1.1389039754867554, "learning_rate": 5.6535186375599266e-05, "loss": 0.3555, "step": 6795 }, { "epoch": 1.4267756128221245, "grad_norm": 0.887610673904419, "learning_rate": 5.636695973787093e-05, "loss": 0.368, "step": 6810 }, { "epoch": 1.4299182903834067, "grad_norm": 0.9625629186630249, "learning_rate": 5.619865980553994e-05, "loss": 0.3962, "step": 6825 }, { "epoch": 1.4330609679446888, "grad_norm": 0.8793766498565674, "learning_rate": 5.6030288516026564e-05, "loss": 0.3979, "step": 6840 }, { "epoch": 1.436203645505971, "grad_norm": 0.7626388669013977, "learning_rate": 5.586184780757251e-05, "loss": 0.345, "step": 6855 }, { "epoch": 1.4393463230672534, "grad_norm": 1.109713077545166, "learning_rate": 5.5693339619218534e-05, "loss": 0.4446, "step": 6870 }, { "epoch": 1.4424890006285356, "grad_norm": 0.9758956432342529, "learning_rate": 5.552476589078231e-05, "loss": 0.401, "step": 6885 }, { "epoch": 1.4456316781898177, "grad_norm": 0.923329770565033, "learning_rate": 5.5356128562835904e-05, "loss": 0.385, "step": 6900 }, { "epoch": 1.4487743557511, "grad_norm": 0.7539265155792236, "learning_rate": 5.518742957668359e-05, "loss": 0.3274, "step": 6915 }, { "epoch": 1.451917033312382, "grad_norm": 0.8187793493270874, "learning_rate": 5.5018670874339386e-05, "loss": 0.3677, "step": 6930 }, { "epoch": 1.4550597108736643, "grad_norm": 0.9522603750228882, "learning_rate": 5.484985439850473e-05, "loss": 0.3319, "step": 6945 }, { "epoch": 1.4582023884349467, "grad_norm": 0.8808611631393433, "learning_rate": 5.468098209254622e-05, "loss": 0.4311, "step": 6960 }, { "epoch": 1.4613450659962288, "grad_norm": 0.6949836611747742, "learning_rate": 5.4512055900473035e-05, "loss": 0.3679, "step": 6975 }, { "epoch": 1.464487743557511, "grad_norm": 0.783545196056366, "learning_rate": 5.434307776691479e-05, "loss": 0.3552, "step": 6990 }, { "epoch": 1.4676304211187932, "grad_norm": 0.8342312574386597, "learning_rate": 5.417404963709894e-05, "loss": 0.3755, "step": 7005 }, { "epoch": 1.4707730986800756, "grad_norm": 0.7615540027618408, "learning_rate": 5.400497345682857e-05, "loss": 0.3605, "step": 7020 }, { "epoch": 1.4739157762413577, "grad_norm": 0.8944594860076904, "learning_rate": 5.3835851172459794e-05, "loss": 0.3948, "step": 7035 }, { "epoch": 1.47705845380264, "grad_norm": 0.8412215113639832, "learning_rate": 5.36666847308796e-05, "loss": 0.3658, "step": 7050 }, { "epoch": 1.480201131363922, "grad_norm": 0.8457724452018738, "learning_rate": 5.34974760794832e-05, "loss": 0.4327, "step": 7065 }, { "epoch": 1.4833438089252042, "grad_norm": 0.7231891751289368, "learning_rate": 5.332822716615172e-05, "loss": 0.3489, "step": 7080 }, { "epoch": 1.4864864864864864, "grad_norm": 0.8975026607513428, "learning_rate": 5.315893993922986e-05, "loss": 0.331, "step": 7095 }, { "epoch": 1.4896291640477686, "grad_norm": 0.871842086315155, "learning_rate": 5.2989616347503244e-05, "loss": 0.4056, "step": 7110 }, { "epoch": 1.492771841609051, "grad_norm": 0.5846161246299744, "learning_rate": 5.282025834017623e-05, "loss": 0.381, "step": 7125 }, { "epoch": 1.4959145191703331, "grad_norm": 0.6650387644767761, "learning_rate": 5.265086786684929e-05, "loss": 0.34, "step": 7140 }, { "epoch": 1.4990571967316153, "grad_norm": 0.862241804599762, "learning_rate": 5.2481446877496665e-05, "loss": 0.354, "step": 7155 }, { "epoch": 1.5021998742928977, "grad_norm": 0.8328828811645508, "learning_rate": 5.231199732244386e-05, "loss": 0.3772, "step": 7170 }, { "epoch": 1.5053425518541799, "grad_norm": 0.5438669323921204, "learning_rate": 5.214252115234527e-05, "loss": 0.3493, "step": 7185 }, { "epoch": 1.508485229415462, "grad_norm": 0.7722681760787964, "learning_rate": 5.197302031816165e-05, "loss": 0.3494, "step": 7200 }, { "epoch": 1.5116279069767442, "grad_norm": 0.9693325161933899, "learning_rate": 5.180349677113762e-05, "loss": 0.3512, "step": 7215 }, { "epoch": 1.5147705845380264, "grad_norm": 1.0208348035812378, "learning_rate": 5.163395246277938e-05, "loss": 0.2772, "step": 7230 }, { "epoch": 1.5179132620993085, "grad_norm": 0.8255509734153748, "learning_rate": 5.1464389344832024e-05, "loss": 0.3491, "step": 7245 }, { "epoch": 1.5210559396605907, "grad_norm": 0.723574697971344, "learning_rate": 5.1294809369257244e-05, "loss": 0.3894, "step": 7260 }, { "epoch": 1.5241986172218729, "grad_norm": 0.8955418467521667, "learning_rate": 5.112521448821076e-05, "loss": 0.3722, "step": 7275 }, { "epoch": 1.5273412947831553, "grad_norm": 0.9446234703063965, "learning_rate": 5.0955606654019895e-05, "loss": 0.3602, "step": 7290 }, { "epoch": 1.5304839723444374, "grad_norm": 0.7256786227226257, "learning_rate": 5.078598781916107e-05, "loss": 0.3488, "step": 7305 }, { "epoch": 1.5336266499057196, "grad_norm": 0.775834858417511, "learning_rate": 5.0616359936237355e-05, "loss": 0.3983, "step": 7320 }, { "epoch": 1.536769327467002, "grad_norm": 0.7684575915336609, "learning_rate": 5.044672495795598e-05, "loss": 0.3992, "step": 7335 }, { "epoch": 1.5399120050282842, "grad_norm": 0.7569010853767395, "learning_rate": 5.0277084837105826e-05, "loss": 0.352, "step": 7350 }, { "epoch": 1.5430546825895664, "grad_norm": 0.7330282926559448, "learning_rate": 5.010744152653501e-05, "loss": 0.3486, "step": 7365 }, { "epoch": 1.5461973601508485, "grad_norm": 0.8921106457710266, "learning_rate": 4.993779697912837e-05, "loss": 0.3107, "step": 7380 }, { "epoch": 1.5493400377121307, "grad_norm": 0.7190592288970947, "learning_rate": 4.976815314778493e-05, "loss": 0.3429, "step": 7395 }, { "epoch": 1.5524827152734129, "grad_norm": 0.8145999312400818, "learning_rate": 4.9598511985395535e-05, "loss": 0.3455, "step": 7410 }, { "epoch": 1.555625392834695, "grad_norm": 0.7628950476646423, "learning_rate": 4.942887544482029e-05, "loss": 0.3362, "step": 7425 }, { "epoch": 1.5587680703959774, "grad_norm": 0.5859194993972778, "learning_rate": 4.925924547886603e-05, "loss": 0.3723, "step": 7440 }, { "epoch": 1.5619107479572596, "grad_norm": 0.7906526327133179, "learning_rate": 4.9089624040264013e-05, "loss": 0.3511, "step": 7455 }, { "epoch": 1.5650534255185418, "grad_norm": 0.7591722011566162, "learning_rate": 4.892001308164727e-05, "loss": 0.4439, "step": 7470 }, { "epoch": 1.5681961030798242, "grad_norm": 0.9237760901451111, "learning_rate": 4.875041455552817e-05, "loss": 0.3638, "step": 7485 }, { "epoch": 1.5713387806411063, "grad_norm": 0.734752893447876, "learning_rate": 4.858083041427599e-05, "loss": 0.4047, "step": 7500 }, { "epoch": 1.5744814582023885, "grad_norm": 0.676703155040741, "learning_rate": 4.8411262610094445e-05, "loss": 0.3566, "step": 7515 }, { "epoch": 1.5776241357636707, "grad_norm": 0.8751126527786255, "learning_rate": 4.824171309499913e-05, "loss": 0.3743, "step": 7530 }, { "epoch": 1.5807668133249528, "grad_norm": 0.6884835958480835, "learning_rate": 4.807218382079511e-05, "loss": 0.3821, "step": 7545 }, { "epoch": 1.583909490886235, "grad_norm": 0.8230961561203003, "learning_rate": 4.790267673905447e-05, "loss": 0.3193, "step": 7560 }, { "epoch": 1.5870521684475172, "grad_norm": 0.8046270608901978, "learning_rate": 4.7733193801093803e-05, "loss": 0.3714, "step": 7575 }, { "epoch": 1.5901948460087993, "grad_norm": 0.895897626876831, "learning_rate": 4.756373695795177e-05, "loss": 0.386, "step": 7590 }, { "epoch": 1.5933375235700817, "grad_norm": 0.8858537077903748, "learning_rate": 4.7394308160366617e-05, "loss": 0.3755, "step": 7605 }, { "epoch": 1.596480201131364, "grad_norm": 0.6874979138374329, "learning_rate": 4.722490935875377e-05, "loss": 0.3547, "step": 7620 }, { "epoch": 1.5996228786926463, "grad_norm": 0.8027022480964661, "learning_rate": 4.705554250318335e-05, "loss": 0.3702, "step": 7635 }, { "epoch": 1.6027655562539285, "grad_norm": 0.9383290410041809, "learning_rate": 4.688620954335766e-05, "loss": 0.4038, "step": 7650 }, { "epoch": 1.6059082338152106, "grad_norm": 0.8475779294967651, "learning_rate": 4.671691242858891e-05, "loss": 0.3257, "step": 7665 }, { "epoch": 1.6090509113764928, "grad_norm": 0.702893853187561, "learning_rate": 4.654765310777659e-05, "loss": 0.3642, "step": 7680 }, { "epoch": 1.612193588937775, "grad_norm": 0.7762289047241211, "learning_rate": 4.6378433529385157e-05, "loss": 0.3859, "step": 7695 }, { "epoch": 1.6153362664990571, "grad_norm": 0.7309826016426086, "learning_rate": 4.620925564142151e-05, "loss": 0.3427, "step": 7710 }, { "epoch": 1.6184789440603393, "grad_norm": 0.655974805355072, "learning_rate": 4.60401213914127e-05, "loss": 0.3893, "step": 7725 }, { "epoch": 1.6216216216216215, "grad_norm": 0.7434260845184326, "learning_rate": 4.5871032726383386e-05, "loss": 0.3528, "step": 7740 }, { "epoch": 1.6247642991829039, "grad_norm": 0.981696605682373, "learning_rate": 4.570199159283345e-05, "loss": 0.3792, "step": 7755 }, { "epoch": 1.627906976744186, "grad_norm": 0.5884058475494385, "learning_rate": 4.553299993671567e-05, "loss": 0.3082, "step": 7770 }, { "epoch": 1.6310496543054682, "grad_norm": 0.9349349737167358, "learning_rate": 4.536405970341317e-05, "loss": 0.3736, "step": 7785 }, { "epoch": 1.6341923318667506, "grad_norm": 0.8422302603721619, "learning_rate": 4.519517283771717e-05, "loss": 0.3897, "step": 7800 }, { "epoch": 1.6373350094280328, "grad_norm": 0.7569222450256348, "learning_rate": 4.502634128380448e-05, "loss": 0.3581, "step": 7815 }, { "epoch": 1.640477686989315, "grad_norm": 0.8034069538116455, "learning_rate": 4.4857566985215276e-05, "loss": 0.3542, "step": 7830 }, { "epoch": 1.6436203645505971, "grad_norm": 0.5547857284545898, "learning_rate": 4.4688851884830516e-05, "loss": 0.3089, "step": 7845 }, { "epoch": 1.6467630421118793, "grad_norm": 0.8145669102668762, "learning_rate": 4.452019792484975e-05, "loss": 0.3391, "step": 7860 }, { "epoch": 1.6499057196731615, "grad_norm": 0.672332227230072, "learning_rate": 4.4351607046768704e-05, "loss": 0.3866, "step": 7875 }, { "epoch": 1.6530483972344436, "grad_norm": 0.7952318787574768, "learning_rate": 4.418308119135686e-05, "loss": 0.4221, "step": 7890 }, { "epoch": 1.6561910747957258, "grad_norm": 0.7489158511161804, "learning_rate": 4.401462229863526e-05, "loss": 0.3687, "step": 7905 }, { "epoch": 1.6593337523570082, "grad_norm": 0.8457122445106506, "learning_rate": 4.3846232307854e-05, "loss": 0.3888, "step": 7920 }, { "epoch": 1.6624764299182904, "grad_norm": 0.7040199637413025, "learning_rate": 4.36779131574701e-05, "loss": 0.3437, "step": 7935 }, { "epoch": 1.6656191074795728, "grad_norm": 1.0369516611099243, "learning_rate": 4.3509666785125005e-05, "loss": 0.3557, "step": 7950 }, { "epoch": 1.668761785040855, "grad_norm": 0.7418217062950134, "learning_rate": 4.334149512762238e-05, "loss": 0.351, "step": 7965 }, { "epoch": 1.671904462602137, "grad_norm": 0.6527841687202454, "learning_rate": 4.3173400120905824e-05, "loss": 0.3286, "step": 7980 }, { "epoch": 1.6750471401634193, "grad_norm": 0.9062017798423767, "learning_rate": 4.3005383700036525e-05, "loss": 0.3828, "step": 7995 }, { "epoch": 1.6781898177247014, "grad_norm": 0.6981047987937927, "learning_rate": 4.283744779917102e-05, "loss": 0.3689, "step": 8010 }, { "epoch": 1.6813324952859836, "grad_norm": 0.8865767121315002, "learning_rate": 4.26695943515389e-05, "loss": 0.3912, "step": 8025 }, { "epoch": 1.6844751728472658, "grad_norm": 0.5835604667663574, "learning_rate": 4.250182528942065e-05, "loss": 0.317, "step": 8040 }, { "epoch": 1.687617850408548, "grad_norm": 0.869529128074646, "learning_rate": 4.233414254412525e-05, "loss": 0.4031, "step": 8055 }, { "epoch": 1.6907605279698303, "grad_norm": 0.7666299939155579, "learning_rate": 4.216654804596808e-05, "loss": 0.3635, "step": 8070 }, { "epoch": 1.6939032055311125, "grad_norm": 0.6868289709091187, "learning_rate": 4.199904372424858e-05, "loss": 0.3554, "step": 8085 }, { "epoch": 1.6970458830923947, "grad_norm": 0.7406291961669922, "learning_rate": 4.183163150722822e-05, "loss": 0.3216, "step": 8100 }, { "epoch": 1.700188560653677, "grad_norm": 0.7962248921394348, "learning_rate": 4.166431332210807e-05, "loss": 0.3398, "step": 8115 }, { "epoch": 1.7033312382149592, "grad_norm": 1.02495276927948, "learning_rate": 4.149709109500678e-05, "loss": 0.3817, "step": 8130 }, { "epoch": 1.7064739157762414, "grad_norm": 0.7741113305091858, "learning_rate": 4.13299667509384e-05, "loss": 0.4072, "step": 8145 }, { "epoch": 1.7096165933375236, "grad_norm": 0.7952526807785034, "learning_rate": 4.1162942213790086e-05, "loss": 0.3441, "step": 8160 }, { "epoch": 1.7127592708988058, "grad_norm": 0.7849689722061157, "learning_rate": 4.0996019406300126e-05, "loss": 0.3417, "step": 8175 }, { "epoch": 1.715901948460088, "grad_norm": 0.7431788444519043, "learning_rate": 4.082920025003567e-05, "loss": 0.3995, "step": 8190 }, { "epoch": 1.71904462602137, "grad_norm": 0.7709872126579285, "learning_rate": 4.0662486665370734e-05, "loss": 0.4069, "step": 8205 }, { "epoch": 1.7221873035826523, "grad_norm": 0.6013693809509277, "learning_rate": 4.049588057146394e-05, "loss": 0.3877, "step": 8220 }, { "epoch": 1.7253299811439347, "grad_norm": 0.7985032796859741, "learning_rate": 4.032938388623657e-05, "loss": 0.3407, "step": 8235 }, { "epoch": 1.7284726587052168, "grad_norm": 0.6259362101554871, "learning_rate": 4.01629985263504e-05, "loss": 0.3167, "step": 8250 }, { "epoch": 1.7316153362664992, "grad_norm": 0.7632457613945007, "learning_rate": 3.999672640718567e-05, "loss": 0.365, "step": 8265 }, { "epoch": 1.7347580138277814, "grad_norm": 0.9532593488693237, "learning_rate": 3.983056944281901e-05, "loss": 0.427, "step": 8280 }, { "epoch": 1.7379006913890636, "grad_norm": 0.7168596386909485, "learning_rate": 3.966452954600142e-05, "loss": 0.3776, "step": 8295 }, { "epoch": 1.7410433689503457, "grad_norm": 0.753966748714447, "learning_rate": 3.94986086281363e-05, "loss": 0.3792, "step": 8310 }, { "epoch": 1.744186046511628, "grad_norm": 0.38063740730285645, "learning_rate": 3.933280859925734e-05, "loss": 0.3499, "step": 8325 }, { "epoch": 1.74732872407291, "grad_norm": 0.8001086711883545, "learning_rate": 3.916713136800659e-05, "loss": 0.3491, "step": 8340 }, { "epoch": 1.7504714016341922, "grad_norm": 0.7394033074378967, "learning_rate": 3.900157884161255e-05, "loss": 0.3383, "step": 8355 }, { "epoch": 1.7536140791954744, "grad_norm": 0.7337818741798401, "learning_rate": 3.8836152925868114e-05, "loss": 0.3705, "step": 8370 }, { "epoch": 1.7567567567567568, "grad_norm": 0.7671971917152405, "learning_rate": 3.867085552510864e-05, "loss": 0.3125, "step": 8385 }, { "epoch": 1.759899434318039, "grad_norm": 0.8018542528152466, "learning_rate": 3.850568854219011e-05, "loss": 0.3678, "step": 8400 }, { "epoch": 1.7630421118793211, "grad_norm": 0.8364083766937256, "learning_rate": 3.834065387846718e-05, "loss": 0.4179, "step": 8415 }, { "epoch": 1.7661847894406035, "grad_norm": 0.8526837825775146, "learning_rate": 3.817575343377122e-05, "loss": 0.3881, "step": 8430 }, { "epoch": 1.7693274670018857, "grad_norm": 0.6416676640510559, "learning_rate": 3.8010989106388554e-05, "loss": 0.3099, "step": 8445 }, { "epoch": 1.7724701445631679, "grad_norm": 0.7990739941596985, "learning_rate": 3.784636279303858e-05, "loss": 0.3598, "step": 8460 }, { "epoch": 1.77561282212445, "grad_norm": 0.8872657418251038, "learning_rate": 3.76818763888519e-05, "loss": 0.3882, "step": 8475 }, { "epoch": 1.7787554996857322, "grad_norm": 0.8712546229362488, "learning_rate": 3.7517531787348484e-05, "loss": 0.3773, "step": 8490 }, { "epoch": 1.7818981772470144, "grad_norm": 0.7423908710479736, "learning_rate": 3.735333088041596e-05, "loss": 0.3777, "step": 8505 }, { "epoch": 1.7850408548082966, "grad_norm": 0.9166727066040039, "learning_rate": 3.718927555828779e-05, "loss": 0.4059, "step": 8520 }, { "epoch": 1.7881835323695787, "grad_norm": 0.7207896113395691, "learning_rate": 3.702536770952148e-05, "loss": 0.3754, "step": 8535 }, { "epoch": 1.7913262099308611, "grad_norm": 0.844727635383606, "learning_rate": 3.6861609220976846e-05, "loss": 0.3328, "step": 8550 }, { "epoch": 1.7944688874921433, "grad_norm": 0.7674320340156555, "learning_rate": 3.6698001977794366e-05, "loss": 0.3806, "step": 8565 }, { "epoch": 1.7976115650534257, "grad_norm": 0.6307094693183899, "learning_rate": 3.6534547863373394e-05, "loss": 0.3694, "step": 8580 }, { "epoch": 1.8007542426147078, "grad_norm": 0.767432451248169, "learning_rate": 3.63712487593505e-05, "loss": 0.4028, "step": 8595 }, { "epoch": 1.80389692017599, "grad_norm": 0.8937990665435791, "learning_rate": 3.6208106545577824e-05, "loss": 0.3372, "step": 8610 }, { "epoch": 1.8070395977372722, "grad_norm": 0.590930163860321, "learning_rate": 3.604512310010146e-05, "loss": 0.3684, "step": 8625 }, { "epoch": 1.8101822752985544, "grad_norm": 0.8184636831283569, "learning_rate": 3.58823002991398e-05, "loss": 0.373, "step": 8640 }, { "epoch": 1.8133249528598365, "grad_norm": 0.9741955399513245, "learning_rate": 3.5719640017061885e-05, "loss": 0.3374, "step": 8655 }, { "epoch": 1.8164676304211187, "grad_norm": 1.0014973878860474, "learning_rate": 3.555714412636595e-05, "loss": 0.3848, "step": 8670 }, { "epoch": 1.8196103079824009, "grad_norm": 0.6335365772247314, "learning_rate": 3.53948144976578e-05, "loss": 0.3689, "step": 8685 }, { "epoch": 1.8227529855436833, "grad_norm": 0.5687909722328186, "learning_rate": 3.523265299962924e-05, "loss": 0.4178, "step": 8700 }, { "epoch": 1.8258956631049654, "grad_norm": 0.8622750043869019, "learning_rate": 3.507066149903662e-05, "loss": 0.3899, "step": 8715 }, { "epoch": 1.8290383406662476, "grad_norm": 0.7984293699264526, "learning_rate": 3.490884186067935e-05, "loss": 0.4353, "step": 8730 }, { "epoch": 1.83218101822753, "grad_norm": 0.7962972521781921, "learning_rate": 3.474719594737842e-05, "loss": 0.3324, "step": 8745 }, { "epoch": 1.8353236957888122, "grad_norm": 0.7194257974624634, "learning_rate": 3.4585725619954864e-05, "loss": 0.3765, "step": 8760 }, { "epoch": 1.8384663733500943, "grad_norm": 0.6931387782096863, "learning_rate": 3.442443273720853e-05, "loss": 0.3183, "step": 8775 }, { "epoch": 1.8416090509113765, "grad_norm": 0.7540430426597595, "learning_rate": 3.426331915589651e-05, "loss": 0.3975, "step": 8790 }, { "epoch": 1.8447517284726587, "grad_norm": 0.7310993671417236, "learning_rate": 3.410238673071185e-05, "loss": 0.3975, "step": 8805 }, { "epoch": 1.8478944060339408, "grad_norm": 0.7351768612861633, "learning_rate": 3.394163731426216e-05, "loss": 0.3558, "step": 8820 }, { "epoch": 1.851037083595223, "grad_norm": 0.7860934138298035, "learning_rate": 3.378107275704834e-05, "loss": 0.3601, "step": 8835 }, { "epoch": 1.8541797611565052, "grad_norm": 0.6049594283103943, "learning_rate": 3.362069490744322e-05, "loss": 0.3692, "step": 8850 }, { "epoch": 1.8573224387177876, "grad_norm": 0.9184178709983826, "learning_rate": 3.346050561167029e-05, "loss": 0.3518, "step": 8865 }, { "epoch": 1.8604651162790697, "grad_norm": 0.7558075189590454, "learning_rate": 3.3300506713782495e-05, "loss": 0.3587, "step": 8880 }, { "epoch": 1.8636077938403521, "grad_norm": 0.7545658349990845, "learning_rate": 3.314070005564097e-05, "loss": 0.3679, "step": 8895 }, { "epoch": 1.8667504714016343, "grad_norm": 0.9135695695877075, "learning_rate": 3.2981087476893853e-05, "loss": 0.3725, "step": 8910 }, { "epoch": 1.8698931489629165, "grad_norm": 0.9788998961448669, "learning_rate": 3.2821670814955026e-05, "loss": 0.3149, "step": 8925 }, { "epoch": 1.8730358265241986, "grad_norm": 0.7953155636787415, "learning_rate": 3.266245190498311e-05, "loss": 0.3461, "step": 8940 }, { "epoch": 1.8761785040854808, "grad_norm": 0.9166163802146912, "learning_rate": 3.250343257986027e-05, "loss": 0.3866, "step": 8955 }, { "epoch": 1.879321181646763, "grad_norm": 0.9379754066467285, "learning_rate": 3.2344614670171025e-05, "loss": 0.3928, "step": 8970 }, { "epoch": 1.8824638592080452, "grad_norm": 0.8782539963722229, "learning_rate": 3.2186000004181314e-05, "loss": 0.3959, "step": 8985 }, { "epoch": 1.8856065367693273, "grad_norm": 0.7237117886543274, "learning_rate": 3.2027590407817407e-05, "loss": 0.3458, "step": 9000 }, { "epoch": 1.8887492143306097, "grad_norm": 0.8787809014320374, "learning_rate": 3.186938770464486e-05, "loss": 0.4081, "step": 9015 }, { "epoch": 1.8918918918918919, "grad_norm": 0.7628602981567383, "learning_rate": 3.1711393715847476e-05, "loss": 0.3928, "step": 9030 }, { "epoch": 1.895034569453174, "grad_norm": 0.9172194600105286, "learning_rate": 3.15536102602065e-05, "loss": 0.3777, "step": 9045 }, { "epoch": 1.8981772470144564, "grad_norm": 0.8413445353507996, "learning_rate": 3.13960391540795e-05, "loss": 0.36, "step": 9060 }, { "epoch": 1.9013199245757386, "grad_norm": 0.9793257117271423, "learning_rate": 3.1238682211379586e-05, "loss": 0.3801, "step": 9075 }, { "epoch": 1.9044626021370208, "grad_norm": 0.7620652318000793, "learning_rate": 3.1081541243554427e-05, "loss": 0.3689, "step": 9090 }, { "epoch": 1.907605279698303, "grad_norm": 0.8353012800216675, "learning_rate": 3.092461805956551e-05, "loss": 0.3961, "step": 9105 }, { "epoch": 1.9107479572595851, "grad_norm": 0.8704758882522583, "learning_rate": 3.0767914465867246e-05, "loss": 0.3168, "step": 9120 }, { "epoch": 1.9138906348208673, "grad_norm": 0.6754759550094604, "learning_rate": 3.061143226638611e-05, "loss": 0.3407, "step": 9135 }, { "epoch": 1.9170333123821495, "grad_norm": 0.9682889580726624, "learning_rate": 3.0455173262500093e-05, "loss": 0.4251, "step": 9150 }, { "epoch": 1.9201759899434316, "grad_norm": 0.8114556670188904, "learning_rate": 3.0299139253017695e-05, "loss": 0.3397, "step": 9165 }, { "epoch": 1.923318667504714, "grad_norm": 0.8123522996902466, "learning_rate": 3.014333203415741e-05, "loss": 0.3372, "step": 9180 }, { "epoch": 1.9264613450659962, "grad_norm": 0.6080268025398254, "learning_rate": 2.9987753399526934e-05, "loss": 0.3506, "step": 9195 }, { "epoch": 1.9296040226272786, "grad_norm": 0.8804168701171875, "learning_rate": 2.9832405140102637e-05, "loss": 0.3689, "step": 9210 }, { "epoch": 1.9327467001885608, "grad_norm": 0.8579033613204956, "learning_rate": 2.9677289044208833e-05, "loss": 0.3875, "step": 9225 }, { "epoch": 1.935889377749843, "grad_norm": 0.9520317316055298, "learning_rate": 2.952240689749722e-05, "loss": 0.422, "step": 9240 }, { "epoch": 1.939032055311125, "grad_norm": 0.9517824053764343, "learning_rate": 2.9367760482926393e-05, "loss": 0.3917, "step": 9255 }, { "epoch": 1.9421747328724073, "grad_norm": 0.8813058733940125, "learning_rate": 2.921335158074122e-05, "loss": 0.3551, "step": 9270 }, { "epoch": 1.9453174104336894, "grad_norm": 0.8402652144432068, "learning_rate": 2.905918196845242e-05, "loss": 0.3468, "step": 9285 }, { "epoch": 1.9484600879949716, "grad_norm": 0.855032205581665, "learning_rate": 2.8905253420816035e-05, "loss": 0.3534, "step": 9300 }, { "epoch": 1.9516027655562538, "grad_norm": 0.7760915756225586, "learning_rate": 2.875156770981311e-05, "loss": 0.348, "step": 9315 }, { "epoch": 1.9547454431175362, "grad_norm": 0.946934163570404, "learning_rate": 2.8598126604629195e-05, "loss": 0.3556, "step": 9330 }, { "epoch": 1.9578881206788183, "grad_norm": 0.7589976191520691, "learning_rate": 2.844493187163395e-05, "loss": 0.3944, "step": 9345 }, { "epoch": 1.9610307982401005, "grad_norm": 0.8831868171691895, "learning_rate": 2.8291985274360983e-05, "loss": 0.3192, "step": 9360 }, { "epoch": 1.964173475801383, "grad_norm": 0.8260477781295776, "learning_rate": 2.8139288573487337e-05, "loss": 0.3476, "step": 9375 }, { "epoch": 1.967316153362665, "grad_norm": 0.9583712816238403, "learning_rate": 2.7986843526813343e-05, "loss": 0.3112, "step": 9390 }, { "epoch": 1.9704588309239472, "grad_norm": 0.8534590005874634, "learning_rate": 2.783465188924239e-05, "loss": 0.3738, "step": 9405 }, { "epoch": 1.9736015084852294, "grad_norm": 0.8562766909599304, "learning_rate": 2.7682715412760696e-05, "loss": 0.3831, "step": 9420 }, { "epoch": 1.9767441860465116, "grad_norm": 0.649868905544281, "learning_rate": 2.7531035846417107e-05, "loss": 0.379, "step": 9435 }, { "epoch": 1.9798868636077938, "grad_norm": 0.7702896595001221, "learning_rate": 2.7379614936302982e-05, "loss": 0.3617, "step": 9450 }, { "epoch": 1.983029541169076, "grad_norm": 0.9378584623336792, "learning_rate": 2.7228454425532157e-05, "loss": 0.3681, "step": 9465 }, { "epoch": 1.9861722187303583, "grad_norm": 1.0069222450256348, "learning_rate": 2.7077556054220804e-05, "loss": 0.3356, "step": 9480 }, { "epoch": 1.9893148962916405, "grad_norm": 0.9345496892929077, "learning_rate": 2.6926921559467412e-05, "loss": 0.3974, "step": 9495 }, { "epoch": 1.9924575738529227, "grad_norm": 0.8090453147888184, "learning_rate": 2.6776552675332768e-05, "loss": 0.3397, "step": 9510 }, { "epoch": 1.995600251414205, "grad_norm": 0.647416353225708, "learning_rate": 2.6626451132820085e-05, "loss": 0.3259, "step": 9525 }, { "epoch": 1.9987429289754872, "grad_norm": 0.7810280323028564, "learning_rate": 2.6476618659855023e-05, "loss": 0.3234, "step": 9540 }, { "epoch": 2.0018856065367694, "grad_norm": 0.7231355309486389, "learning_rate": 2.6327056981265708e-05, "loss": 0.3276, "step": 9555 }, { "epoch": 2.0050282840980516, "grad_norm": 0.7072864174842834, "learning_rate": 2.6177767818763062e-05, "loss": 0.2683, "step": 9570 }, { "epoch": 2.0081709616593337, "grad_norm": 0.8502817749977112, "learning_rate": 2.6028752890920783e-05, "loss": 0.2844, "step": 9585 }, { "epoch": 2.011313639220616, "grad_norm": 0.6001257300376892, "learning_rate": 2.5880013913155743e-05, "loss": 0.2582, "step": 9600 }, { "epoch": 2.014456316781898, "grad_norm": 1.037467360496521, "learning_rate": 2.5731552597708086e-05, "loss": 0.2666, "step": 9615 }, { "epoch": 2.0175989943431802, "grad_norm": 0.990047812461853, "learning_rate": 2.5583370653621652e-05, "loss": 0.3042, "step": 9630 }, { "epoch": 2.0207416719044624, "grad_norm": 1.0518317222595215, "learning_rate": 2.5435469786724204e-05, "loss": 0.2543, "step": 9645 }, { "epoch": 2.023884349465745, "grad_norm": 1.225774884223938, "learning_rate": 2.528785169960779e-05, "loss": 0.3183, "step": 9660 }, { "epoch": 2.027027027027027, "grad_norm": 0.9525572061538696, "learning_rate": 2.5140518091609256e-05, "loss": 0.3426, "step": 9675 }, { "epoch": 2.0301697045883094, "grad_norm": 1.0750566720962524, "learning_rate": 2.4993470658790573e-05, "loss": 0.3172, "step": 9690 }, { "epoch": 2.0333123821495915, "grad_norm": 0.8268773555755615, "learning_rate": 2.484671109391933e-05, "loss": 0.31, "step": 9705 }, { "epoch": 2.0364550597108737, "grad_norm": 0.679678201675415, "learning_rate": 2.470024108644925e-05, "loss": 0.2868, "step": 9720 }, { "epoch": 2.039597737272156, "grad_norm": 0.997440755367279, "learning_rate": 2.4554062322500797e-05, "loss": 0.3291, "step": 9735 }, { "epoch": 2.042740414833438, "grad_norm": 0.9968817830085754, "learning_rate": 2.4408176484841732e-05, "loss": 0.2664, "step": 9750 }, { "epoch": 2.04588309239472, "grad_norm": 1.0939124822616577, "learning_rate": 2.4262585252867686e-05, "loss": 0.2895, "step": 9765 }, { "epoch": 2.0490257699560024, "grad_norm": 1.0220900774002075, "learning_rate": 2.4117290302582872e-05, "loss": 0.3191, "step": 9780 }, { "epoch": 2.0521684475172846, "grad_norm": 0.635898768901825, "learning_rate": 2.397229330658084e-05, "loss": 0.307, "step": 9795 }, { "epoch": 2.0553111250785667, "grad_norm": 1.112257719039917, "learning_rate": 2.382759593402517e-05, "loss": 0.2748, "step": 9810 }, { "epoch": 2.0584538026398493, "grad_norm": 0.9440275430679321, "learning_rate": 2.3683199850630213e-05, "loss": 0.2893, "step": 9825 }, { "epoch": 2.0615964802011315, "grad_norm": 1.2118226289749146, "learning_rate": 2.3539106718642034e-05, "loss": 0.2791, "step": 9840 }, { "epoch": 2.0647391577624137, "grad_norm": 1.1374374628067017, "learning_rate": 2.339531819681914e-05, "loss": 0.2777, "step": 9855 }, { "epoch": 2.067881835323696, "grad_norm": 0.6932136416435242, "learning_rate": 2.3251835940413517e-05, "loss": 0.2828, "step": 9870 }, { "epoch": 2.071024512884978, "grad_norm": 1.0308489799499512, "learning_rate": 2.310866160115146e-05, "loss": 0.2947, "step": 9885 }, { "epoch": 2.07416719044626, "grad_norm": 1.063235878944397, "learning_rate": 2.2965796827214665e-05, "loss": 0.3204, "step": 9900 }, { "epoch": 2.0773098680075424, "grad_norm": 1.1612193584442139, "learning_rate": 2.282324326322115e-05, "loss": 0.2976, "step": 9915 }, { "epoch": 2.0804525455688245, "grad_norm": 0.8928938508033752, "learning_rate": 2.2681002550206355e-05, "loss": 0.2921, "step": 9930 }, { "epoch": 2.0835952231301067, "grad_norm": 1.066124677658081, "learning_rate": 2.253907632560439e-05, "loss": 0.298, "step": 9945 }, { "epoch": 2.086737900691389, "grad_norm": 0.8713576197624207, "learning_rate": 2.2397466223228947e-05, "loss": 0.275, "step": 9960 }, { "epoch": 2.0898805782526715, "grad_norm": 1.1056296825408936, "learning_rate": 2.2256173873254643e-05, "loss": 0.3266, "step": 9975 }, { "epoch": 2.0930232558139537, "grad_norm": 0.9172502160072327, "learning_rate": 2.211520090219821e-05, "loss": 0.2731, "step": 9990 }, { "epoch": 2.0951183741881416, "eval_accuracy": 0.009820309467613697, "eval_loss": 0.4190310835838318, "eval_runtime": 424.9528, "eval_samples_per_second": 11.26, "eval_steps_per_second": 2.817, "step": 10000 }, { "epoch": 2.096165933375236, "grad_norm": 0.9003602862358093, "learning_rate": 2.1974548932899814e-05, "loss": 0.2534, "step": 10005 }, { "epoch": 2.099308610936518, "grad_norm": 1.0138850212097168, "learning_rate": 2.1834219584504345e-05, "loss": 0.2847, "step": 10020 }, { "epoch": 2.1024512884978, "grad_norm": 0.8467048406600952, "learning_rate": 2.169421447244272e-05, "loss": 0.3011, "step": 10035 }, { "epoch": 2.1055939660590823, "grad_norm": 1.1273193359375, "learning_rate": 2.1554535208413406e-05, "loss": 0.3181, "step": 10050 }, { "epoch": 2.1087366436203645, "grad_norm": 1.1201776266098022, "learning_rate": 2.1415183400363748e-05, "loss": 0.3122, "step": 10065 }, { "epoch": 2.1118793211816467, "grad_norm": 1.0749905109405518, "learning_rate": 2.1276160652471555e-05, "loss": 0.3357, "step": 10080 }, { "epoch": 2.115021998742929, "grad_norm": 0.874462366104126, "learning_rate": 2.1137468565126543e-05, "loss": 0.3014, "step": 10095 }, { "epoch": 2.118164676304211, "grad_norm": 1.0569285154342651, "learning_rate": 2.099910873491202e-05, "loss": 0.2945, "step": 10110 }, { "epoch": 2.121307353865493, "grad_norm": 0.9067788124084473, "learning_rate": 2.0861082754586382e-05, "loss": 0.3218, "step": 10125 }, { "epoch": 2.124450031426776, "grad_norm": 1.2187013626098633, "learning_rate": 2.0723392213064884e-05, "loss": 0.3065, "step": 10140 }, { "epoch": 2.127592708988058, "grad_norm": 1.0931589603424072, "learning_rate": 2.0586038695401317e-05, "loss": 0.2792, "step": 10155 }, { "epoch": 2.13073538654934, "grad_norm": 1.2825082540512085, "learning_rate": 2.0449023782769706e-05, "loss": 0.3138, "step": 10170 }, { "epoch": 2.1338780641106223, "grad_norm": 1.0086079835891724, "learning_rate": 2.031234905244618e-05, "loss": 0.3079, "step": 10185 }, { "epoch": 2.1370207416719045, "grad_norm": 0.7740280032157898, "learning_rate": 2.017601607779074e-05, "loss": 0.2704, "step": 10200 }, { "epoch": 2.1401634192331866, "grad_norm": 0.7861264944076538, "learning_rate": 2.0040026428229313e-05, "loss": 0.296, "step": 10215 }, { "epoch": 2.143306096794469, "grad_norm": 0.8179210424423218, "learning_rate": 1.9904381669235456e-05, "loss": 0.296, "step": 10230 }, { "epoch": 2.146448774355751, "grad_norm": 1.410079002380371, "learning_rate": 1.976908336231245e-05, "loss": 0.2836, "step": 10245 }, { "epoch": 2.149591451917033, "grad_norm": 1.082899570465088, "learning_rate": 1.9634133064975402e-05, "loss": 0.2848, "step": 10260 }, { "epoch": 2.1527341294783153, "grad_norm": 0.9219628572463989, "learning_rate": 1.9499532330733135e-05, "loss": 0.3255, "step": 10275 }, { "epoch": 2.155876807039598, "grad_norm": 0.9849101901054382, "learning_rate": 1.9365282709070487e-05, "loss": 0.3336, "step": 10290 }, { "epoch": 2.15901948460088, "grad_norm": 0.8761511445045471, "learning_rate": 1.9231385745430308e-05, "loss": 0.3128, "step": 10305 }, { "epoch": 2.1621621621621623, "grad_norm": 1.1564205884933472, "learning_rate": 1.9097842981195834e-05, "loss": 0.291, "step": 10320 }, { "epoch": 2.1653048397234445, "grad_norm": 0.6984158158302307, "learning_rate": 1.8964655953672784e-05, "loss": 0.2761, "step": 10335 }, { "epoch": 2.1684475172847266, "grad_norm": 0.7349433898925781, "learning_rate": 1.883182619607179e-05, "loss": 0.3066, "step": 10350 }, { "epoch": 2.171590194846009, "grad_norm": 0.9663205742835999, "learning_rate": 1.8699355237490694e-05, "loss": 0.2644, "step": 10365 }, { "epoch": 2.174732872407291, "grad_norm": 1.194226861000061, "learning_rate": 1.856724460289692e-05, "loss": 0.3112, "step": 10380 }, { "epoch": 2.177875549968573, "grad_norm": 1.0187724828720093, "learning_rate": 1.8435495813109938e-05, "loss": 0.2779, "step": 10395 }, { "epoch": 2.1810182275298553, "grad_norm": 0.7448340654373169, "learning_rate": 1.8304110384783806e-05, "loss": 0.2723, "step": 10410 }, { "epoch": 2.1841609050911375, "grad_norm": 1.0969903469085693, "learning_rate": 1.8173089830389662e-05, "loss": 0.2824, "step": 10425 }, { "epoch": 2.1873035826524196, "grad_norm": 1.0222073793411255, "learning_rate": 1.8042435658198286e-05, "loss": 0.303, "step": 10440 }, { "epoch": 2.1904462602137023, "grad_norm": 0.9316915273666382, "learning_rate": 1.7912149372262793e-05, "loss": 0.2562, "step": 10455 }, { "epoch": 2.1935889377749844, "grad_norm": 0.6998715996742249, "learning_rate": 1.77822324724013e-05, "loss": 0.298, "step": 10470 }, { "epoch": 2.1967316153362666, "grad_norm": 0.9719591736793518, "learning_rate": 1.7652686454179686e-05, "loss": 0.2887, "step": 10485 }, { "epoch": 2.1998742928975488, "grad_norm": 0.8645143508911133, "learning_rate": 1.7523512808894288e-05, "loss": 0.2532, "step": 10500 }, { "epoch": 2.203016970458831, "grad_norm": 1.1070195436477661, "learning_rate": 1.739471302355482e-05, "loss": 0.2999, "step": 10515 }, { "epoch": 2.206159648020113, "grad_norm": 0.8601672053337097, "learning_rate": 1.7266288580867258e-05, "loss": 0.3209, "step": 10530 }, { "epoch": 2.2093023255813953, "grad_norm": 1.0818884372711182, "learning_rate": 1.713824095921668e-05, "loss": 0.3079, "step": 10545 }, { "epoch": 2.2124450031426774, "grad_norm": 0.7250615954399109, "learning_rate": 1.701057163265038e-05, "loss": 0.3364, "step": 10560 }, { "epoch": 2.2155876807039596, "grad_norm": 0.9716282486915588, "learning_rate": 1.6883282070860763e-05, "loss": 0.2898, "step": 10575 }, { "epoch": 2.218730358265242, "grad_norm": 1.0294605493545532, "learning_rate": 1.675637373916855e-05, "loss": 0.3075, "step": 10590 }, { "epoch": 2.2218730358265244, "grad_norm": 1.0724180936813354, "learning_rate": 1.662984809850579e-05, "loss": 0.3068, "step": 10605 }, { "epoch": 2.2250157133878066, "grad_norm": 0.9719418883323669, "learning_rate": 1.6503706605399156e-05, "loss": 0.3153, "step": 10620 }, { "epoch": 2.2281583909490887, "grad_norm": 0.8698229193687439, "learning_rate": 1.6377950711953115e-05, "loss": 0.2597, "step": 10635 }, { "epoch": 2.231301068510371, "grad_norm": 0.9012719988822937, "learning_rate": 1.6252581865833198e-05, "loss": 0.3284, "step": 10650 }, { "epoch": 2.234443746071653, "grad_norm": 0.8515365123748779, "learning_rate": 1.612760151024936e-05, "loss": 0.3147, "step": 10665 }, { "epoch": 2.2375864236329353, "grad_norm": 1.1416083574295044, "learning_rate": 1.6003011083939396e-05, "loss": 0.2958, "step": 10680 }, { "epoch": 2.2407291011942174, "grad_norm": 0.9006314873695374, "learning_rate": 1.5878812021152334e-05, "loss": 0.2757, "step": 10695 }, { "epoch": 2.2438717787554996, "grad_norm": 1.1663639545440674, "learning_rate": 1.5755005751631922e-05, "loss": 0.3064, "step": 10710 }, { "epoch": 2.2470144563167818, "grad_norm": 1.0664478540420532, "learning_rate": 1.563159370060019e-05, "loss": 0.2878, "step": 10725 }, { "epoch": 2.250157133878064, "grad_norm": 0.7780718207359314, "learning_rate": 1.5508577288741056e-05, "loss": 0.3065, "step": 10740 }, { "epoch": 2.253299811439346, "grad_norm": 1.1266307830810547, "learning_rate": 1.5385957932183954e-05, "loss": 0.3004, "step": 10755 }, { "epoch": 2.2564424890006287, "grad_norm": 0.7767760157585144, "learning_rate": 1.5263737042487514e-05, "loss": 0.291, "step": 10770 }, { "epoch": 2.259585166561911, "grad_norm": 0.6928930878639221, "learning_rate": 1.514191602662332e-05, "loss": 0.2945, "step": 10785 }, { "epoch": 2.262727844123193, "grad_norm": 1.177262544631958, "learning_rate": 1.5020496286959752e-05, "loss": 0.3168, "step": 10800 }, { "epoch": 2.2658705216844752, "grad_norm": 1.1784379482269287, "learning_rate": 1.4899479221245827e-05, "loss": 0.342, "step": 10815 }, { "epoch": 2.2690131992457574, "grad_norm": 1.4985358715057373, "learning_rate": 1.477886622259504e-05, "loss": 0.3073, "step": 10830 }, { "epoch": 2.2721558768070396, "grad_norm": 1.0009207725524902, "learning_rate": 1.4658658679469445e-05, "loss": 0.2888, "step": 10845 }, { "epoch": 2.2752985543683217, "grad_norm": 1.0263885259628296, "learning_rate": 1.4538857975663567e-05, "loss": 0.3153, "step": 10860 }, { "epoch": 2.278441231929604, "grad_norm": 0.8072161078453064, "learning_rate": 1.4419465490288508e-05, "loss": 0.2481, "step": 10875 }, { "epoch": 2.281583909490886, "grad_norm": 0.8211586475372314, "learning_rate": 1.430048259775611e-05, "loss": 0.2738, "step": 10890 }, { "epoch": 2.2847265870521687, "grad_norm": 1.0490375757217407, "learning_rate": 1.418191066776311e-05, "loss": 0.3005, "step": 10905 }, { "epoch": 2.287869264613451, "grad_norm": 0.9059322476387024, "learning_rate": 1.4063751065275315e-05, "loss": 0.2578, "step": 10920 }, { "epoch": 2.291011942174733, "grad_norm": 0.9448453187942505, "learning_rate": 1.3946005150511948e-05, "loss": 0.3033, "step": 10935 }, { "epoch": 2.294154619736015, "grad_norm": 0.9595757126808167, "learning_rate": 1.3828674278930009e-05, "loss": 0.3092, "step": 10950 }, { "epoch": 2.2972972972972974, "grad_norm": 0.6836899518966675, "learning_rate": 1.371175980120864e-05, "loss": 0.2354, "step": 10965 }, { "epoch": 2.3004399748585795, "grad_norm": 1.1870014667510986, "learning_rate": 1.3595263063233538e-05, "loss": 0.339, "step": 10980 }, { "epoch": 2.3035826524198617, "grad_norm": 0.9335547685623169, "learning_rate": 1.3479185406081519e-05, "loss": 0.2667, "step": 10995 }, { "epoch": 2.306725329981144, "grad_norm": 1.0864135026931763, "learning_rate": 1.3363528166005068e-05, "loss": 0.2993, "step": 11010 }, { "epoch": 2.309868007542426, "grad_norm": 1.3026399612426758, "learning_rate": 1.3248292674416968e-05, "loss": 0.2838, "step": 11025 }, { "epoch": 2.313010685103708, "grad_norm": 0.7582332491874695, "learning_rate": 1.3133480257874902e-05, "loss": 0.2746, "step": 11040 }, { "epoch": 2.3161533626649904, "grad_norm": 1.0766429901123047, "learning_rate": 1.3019092238066304e-05, "loss": 0.2915, "step": 11055 }, { "epoch": 2.3192960402262726, "grad_norm": 0.7966647148132324, "learning_rate": 1.2905129931793009e-05, "loss": 0.2586, "step": 11070 }, { "epoch": 2.322438717787555, "grad_norm": 1.0455411672592163, "learning_rate": 1.2791594650956212e-05, "loss": 0.2867, "step": 11085 }, { "epoch": 2.3255813953488373, "grad_norm": 0.9847836494445801, "learning_rate": 1.267848770254127e-05, "loss": 0.3219, "step": 11100 }, { "epoch": 2.3287240729101195, "grad_norm": 0.9694182276725769, "learning_rate": 1.256581038860275e-05, "loss": 0.2558, "step": 11115 }, { "epoch": 2.3318667504714017, "grad_norm": 1.4064688682556152, "learning_rate": 1.2453564006249352e-05, "loss": 0.2609, "step": 11130 }, { "epoch": 2.335009428032684, "grad_norm": 0.8352707028388977, "learning_rate": 1.2341749847628997e-05, "loss": 0.2985, "step": 11145 }, { "epoch": 2.338152105593966, "grad_norm": 1.016571044921875, "learning_rate": 1.2230369199914066e-05, "loss": 0.2673, "step": 11160 }, { "epoch": 2.341294783155248, "grad_norm": 0.9296002984046936, "learning_rate": 1.211942334528639e-05, "loss": 0.2685, "step": 11175 }, { "epoch": 2.3444374607165304, "grad_norm": 1.4591748714447021, "learning_rate": 1.200891356092263e-05, "loss": 0.2773, "step": 11190 }, { "epoch": 2.3475801382778125, "grad_norm": 0.9775596261024475, "learning_rate": 1.1898841118979504e-05, "loss": 0.2976, "step": 11205 }, { "epoch": 2.350722815839095, "grad_norm": 1.2126258611679077, "learning_rate": 1.1789207286579201e-05, "loss": 0.3298, "step": 11220 }, { "epoch": 2.3538654934003773, "grad_norm": 1.3125213384628296, "learning_rate": 1.1680013325794776e-05, "loss": 0.2639, "step": 11235 }, { "epoch": 2.3570081709616595, "grad_norm": 1.0396140813827515, "learning_rate": 1.1571260493635561e-05, "loss": 0.292, "step": 11250 }, { "epoch": 2.3601508485229417, "grad_norm": 0.9269897937774658, "learning_rate": 1.1462950042032767e-05, "loss": 0.3426, "step": 11265 }, { "epoch": 2.363293526084224, "grad_norm": 1.1665176153182983, "learning_rate": 1.1355083217825052e-05, "loss": 0.2794, "step": 11280 }, { "epoch": 2.366436203645506, "grad_norm": 1.0097540616989136, "learning_rate": 1.1247661262744175e-05, "loss": 0.2986, "step": 11295 }, { "epoch": 2.369578881206788, "grad_norm": 1.1132863759994507, "learning_rate": 1.1140685413400648e-05, "loss": 0.3229, "step": 11310 }, { "epoch": 2.3727215587680703, "grad_norm": 1.2184104919433594, "learning_rate": 1.1034156901269598e-05, "loss": 0.2708, "step": 11325 }, { "epoch": 2.3758642363293525, "grad_norm": 1.0664645433425903, "learning_rate": 1.0928076952676474e-05, "loss": 0.2728, "step": 11340 }, { "epoch": 2.3790069138906347, "grad_norm": 1.2971463203430176, "learning_rate": 1.0822446788783058e-05, "loss": 0.3048, "step": 11355 }, { "epoch": 2.382149591451917, "grad_norm": 0.9727672338485718, "learning_rate": 1.0717267625573279e-05, "loss": 0.2918, "step": 11370 }, { "epoch": 2.385292269013199, "grad_norm": 1.0206960439682007, "learning_rate": 1.0612540673839322e-05, "loss": 0.2885, "step": 11385 }, { "epoch": 2.3884349465744816, "grad_norm": 1.1079341173171997, "learning_rate": 1.0508267139167615e-05, "loss": 0.309, "step": 11400 }, { "epoch": 2.391577624135764, "grad_norm": 1.1144444942474365, "learning_rate": 1.0404448221924961e-05, "loss": 0.2268, "step": 11415 }, { "epoch": 2.394720301697046, "grad_norm": 1.1846858263015747, "learning_rate": 1.030108511724483e-05, "loss": 0.2822, "step": 11430 }, { "epoch": 2.397862979258328, "grad_norm": 1.063310146331787, "learning_rate": 1.019817901501341e-05, "loss": 0.2883, "step": 11445 }, { "epoch": 2.4010056568196103, "grad_norm": 1.1355246305465698, "learning_rate": 1.0095731099856049e-05, "loss": 0.2975, "step": 11460 }, { "epoch": 2.4041483343808925, "grad_norm": 1.017663836479187, "learning_rate": 9.993742551123558e-06, "loss": 0.2883, "step": 11475 }, { "epoch": 2.4072910119421747, "grad_norm": 1.3695423603057861, "learning_rate": 9.892214542878686e-06, "loss": 0.343, "step": 11490 }, { "epoch": 2.410433689503457, "grad_norm": 1.0663484334945679, "learning_rate": 9.79114824388257e-06, "loss": 0.26, "step": 11505 }, { "epoch": 2.413576367064739, "grad_norm": 1.0500160455703735, "learning_rate": 9.690544817581243e-06, "loss": 0.2877, "step": 11520 }, { "epoch": 2.4167190446260216, "grad_norm": 1.0720367431640625, "learning_rate": 9.590405422092336e-06, "loss": 0.2561, "step": 11535 }, { "epoch": 2.4198617221873038, "grad_norm": 0.9935043454170227, "learning_rate": 9.49073121019164e-06, "loss": 0.2764, "step": 11550 }, { "epoch": 2.423004399748586, "grad_norm": 1.2285892963409424, "learning_rate": 9.391523329299928e-06, "loss": 0.303, "step": 11565 }, { "epoch": 2.426147077309868, "grad_norm": 1.2495083808898926, "learning_rate": 9.292782921469673e-06, "loss": 0.3252, "step": 11580 }, { "epoch": 2.4292897548711503, "grad_norm": 1.0354247093200684, "learning_rate": 9.194511123371963e-06, "loss": 0.2692, "step": 11595 }, { "epoch": 2.4324324324324325, "grad_norm": 1.0744938850402832, "learning_rate": 9.096709066283354e-06, "loss": 0.2793, "step": 11610 }, { "epoch": 2.4355751099937146, "grad_norm": 1.145193338394165, "learning_rate": 8.9993778760729e-06, "loss": 0.3108, "step": 11625 }, { "epoch": 2.438717787554997, "grad_norm": 0.7168245911598206, "learning_rate": 8.902518673189192e-06, "loss": 0.3088, "step": 11640 }, { "epoch": 2.441860465116279, "grad_norm": 0.9759941697120667, "learning_rate": 8.806132572647386e-06, "loss": 0.2771, "step": 11655 }, { "epoch": 2.445003142677561, "grad_norm": 0.9443902373313904, "learning_rate": 8.710220684016462e-06, "loss": 0.2593, "step": 11670 }, { "epoch": 2.4481458202388433, "grad_norm": 0.9628651142120361, "learning_rate": 8.614784111406365e-06, "loss": 0.267, "step": 11685 }, { "epoch": 2.4512884978001255, "grad_norm": 1.0149531364440918, "learning_rate": 8.519823953455424e-06, "loss": 0.2929, "step": 11700 }, { "epoch": 2.454431175361408, "grad_norm": 0.9107941389083862, "learning_rate": 8.425341303317536e-06, "loss": 0.2911, "step": 11715 }, { "epoch": 2.4575738529226903, "grad_norm": 1.1681251525878906, "learning_rate": 8.33133724864969e-06, "loss": 0.2939, "step": 11730 }, { "epoch": 2.4607165304839724, "grad_norm": 0.8774799704551697, "learning_rate": 8.237812871599448e-06, "loss": 0.2612, "step": 11745 }, { "epoch": 2.4638592080452546, "grad_norm": 0.8654860854148865, "learning_rate": 8.144769248792417e-06, "loss": 0.2924, "step": 11760 }, { "epoch": 2.4670018856065368, "grad_norm": 1.062782645225525, "learning_rate": 8.052207451319954e-06, "loss": 0.2466, "step": 11775 }, { "epoch": 2.470144563167819, "grad_norm": 0.8732921481132507, "learning_rate": 7.960128544726724e-06, "loss": 0.2318, "step": 11790 }, { "epoch": 2.473287240729101, "grad_norm": 1.191798210144043, "learning_rate": 7.86853358899855e-06, "loss": 0.3097, "step": 11805 }, { "epoch": 2.4764299182903833, "grad_norm": 0.9445894360542297, "learning_rate": 7.777423638550096e-06, "loss": 0.2935, "step": 11820 }, { "epoch": 2.4795725958516655, "grad_norm": 0.9677672386169434, "learning_rate": 7.68679974221282e-06, "loss": 0.2949, "step": 11835 }, { "epoch": 2.482715273412948, "grad_norm": 0.756100058555603, "learning_rate": 7.596662943222877e-06, "loss": 0.2685, "step": 11850 }, { "epoch": 2.4858579509742302, "grad_norm": 1.2218337059020996, "learning_rate": 7.507014279209057e-06, "loss": 0.3395, "step": 11865 }, { "epoch": 2.4890006285355124, "grad_norm": 1.1206847429275513, "learning_rate": 7.417854782180894e-06, "loss": 0.2641, "step": 11880 }, { "epoch": 2.4921433060967946, "grad_norm": 1.095615029335022, "learning_rate": 7.329185478516798e-06, "loss": 0.3021, "step": 11895 }, { "epoch": 2.4952859836580767, "grad_norm": 0.9641756415367126, "learning_rate": 7.241007388952209e-06, "loss": 0.2847, "step": 11910 }, { "epoch": 2.498428661219359, "grad_norm": 0.9637003540992737, "learning_rate": 7.153321528567819e-06, "loss": 0.2775, "step": 11925 }, { "epoch": 2.501571338780641, "grad_norm": 0.8976852297782898, "learning_rate": 7.066128906777941e-06, "loss": 0.2636, "step": 11940 }, { "epoch": 2.5047140163419233, "grad_norm": 1.006549596786499, "learning_rate": 6.97943052731887e-06, "loss": 0.2616, "step": 11955 }, { "epoch": 2.5078566939032054, "grad_norm": 1.004257321357727, "learning_rate": 6.893227388237345e-06, "loss": 0.2579, "step": 11970 }, { "epoch": 2.5109993714644876, "grad_norm": 0.8972447514533997, "learning_rate": 6.807520481879004e-06, "loss": 0.2469, "step": 11985 }, { "epoch": 2.5141420490257698, "grad_norm": 0.8245068192481995, "learning_rate": 6.722310794877002e-06, "loss": 0.3258, "step": 12000 }, { "epoch": 2.517284726587052, "grad_norm": 1.2819231748580933, "learning_rate": 6.637599308140685e-06, "loss": 0.2503, "step": 12015 }, { "epoch": 2.520427404148334, "grad_norm": 0.9961299896240234, "learning_rate": 6.553386996844208e-06, "loss": 0.2766, "step": 12030 }, { "epoch": 2.5235700817096167, "grad_norm": 0.7203584909439087, "learning_rate": 6.469674830415412e-06, "loss": 0.3168, "step": 12045 }, { "epoch": 2.526712759270899, "grad_norm": 0.8977159261703491, "learning_rate": 6.386463772524576e-06, "loss": 0.2573, "step": 12060 }, { "epoch": 2.529855436832181, "grad_norm": 1.2124725580215454, "learning_rate": 6.303754781073395e-06, "loss": 0.3008, "step": 12075 }, { "epoch": 2.5329981143934632, "grad_norm": 0.7577414512634277, "learning_rate": 6.2215488081838854e-06, "loss": 0.2492, "step": 12090 }, { "epoch": 2.5361407919547454, "grad_norm": 1.308779001235962, "learning_rate": 6.139846800187493e-06, "loss": 0.3002, "step": 12105 }, { "epoch": 2.5392834695160276, "grad_norm": 1.0538486242294312, "learning_rate": 6.058649697614149e-06, "loss": 0.3068, "step": 12120 }, { "epoch": 2.5424261470773097, "grad_norm": 1.1852937936782837, "learning_rate": 5.9779584351814636e-06, "loss": 0.308, "step": 12135 }, { "epoch": 2.5455688246385924, "grad_norm": 0.9339080452919006, "learning_rate": 5.897773941783935e-06, "loss": 0.297, "step": 12150 }, { "epoch": 2.5487115021998745, "grad_norm": 0.8344528079032898, "learning_rate": 5.8180971404823205e-06, "loss": 0.2789, "step": 12165 }, { "epoch": 2.5518541797611567, "grad_norm": 1.3588929176330566, "learning_rate": 5.738928948492966e-06, "loss": 0.296, "step": 12180 }, { "epoch": 2.554996857322439, "grad_norm": 1.0490657091140747, "learning_rate": 5.660270277177243e-06, "loss": 0.2864, "step": 12195 }, { "epoch": 2.558139534883721, "grad_norm": 1.2904434204101562, "learning_rate": 5.582122032031051e-06, "loss": 0.2966, "step": 12210 }, { "epoch": 2.561282212445003, "grad_norm": 0.7123144268989563, "learning_rate": 5.5044851126744404e-06, "loss": 0.2733, "step": 12225 }, { "epoch": 2.5644248900062854, "grad_norm": 1.2593188285827637, "learning_rate": 5.4273604128412315e-06, "loss": 0.2873, "step": 12240 }, { "epoch": 2.5675675675675675, "grad_norm": 0.9681785106658936, "learning_rate": 5.35074882036869e-06, "loss": 0.2596, "step": 12255 }, { "epoch": 2.5707102451288497, "grad_norm": 0.944814145565033, "learning_rate": 5.2746512171873485e-06, "loss": 0.2871, "step": 12270 }, { "epoch": 2.573852922690132, "grad_norm": 1.0654292106628418, "learning_rate": 5.199068479310865e-06, "loss": 0.2856, "step": 12285 }, { "epoch": 2.576995600251414, "grad_norm": 1.4697771072387695, "learning_rate": 5.12400147682589e-06, "loss": 0.3125, "step": 12300 }, { "epoch": 2.5801382778126962, "grad_norm": 1.1471614837646484, "learning_rate": 5.0494510738820836e-06, "loss": 0.2712, "step": 12315 }, { "epoch": 2.5832809553739784, "grad_norm": 1.2926499843597412, "learning_rate": 4.9754181286821855e-06, "loss": 0.2721, "step": 12330 }, { "epoch": 2.586423632935261, "grad_norm": 1.1065871715545654, "learning_rate": 4.901903493472071e-06, "loss": 0.3443, "step": 12345 }, { "epoch": 2.589566310496543, "grad_norm": 1.0714068412780762, "learning_rate": 4.8289080145309974e-06, "loss": 0.2963, "step": 12360 }, { "epoch": 2.5927089880578253, "grad_norm": 0.8245282769203186, "learning_rate": 4.756432532161858e-06, "loss": 0.2564, "step": 12375 }, { "epoch": 2.5958516656191075, "grad_norm": 1.266921043395996, "learning_rate": 4.684477880681492e-06, "loss": 0.2712, "step": 12390 }, { "epoch": 2.5989943431803897, "grad_norm": 1.2646595239639282, "learning_rate": 4.613044888411067e-06, "loss": 0.2845, "step": 12405 }, { "epoch": 2.602137020741672, "grad_norm": 1.0433062314987183, "learning_rate": 4.542134377666562e-06, "loss": 0.309, "step": 12420 }, { "epoch": 2.605279698302954, "grad_norm": 0.9236804246902466, "learning_rate": 4.471747164749318e-06, "loss": 0.2576, "step": 12435 }, { "epoch": 2.608422375864236, "grad_norm": 0.8656274676322937, "learning_rate": 4.401884059936618e-06, "loss": 0.2695, "step": 12450 }, { "epoch": 2.611565053425519, "grad_norm": 1.226678729057312, "learning_rate": 4.332545867472354e-06, "loss": 0.2993, "step": 12465 }, { "epoch": 2.614707730986801, "grad_norm": 1.1997127532958984, "learning_rate": 4.263733385557767e-06, "loss": 0.2832, "step": 12480 }, { "epoch": 2.617850408548083, "grad_norm": 1.113054871559143, "learning_rate": 4.195447406342301e-06, "loss": 0.2429, "step": 12495 }, { "epoch": 2.6209930861093653, "grad_norm": 1.1524410247802734, "learning_rate": 4.127688715914446e-06, "loss": 0.3216, "step": 12510 }, { "epoch": 2.6241357636706475, "grad_norm": 1.1508104801177979, "learning_rate": 4.060458094292663e-06, "loss": 0.2685, "step": 12525 }, { "epoch": 2.6272784412319297, "grad_norm": 1.1233001947402954, "learning_rate": 3.993756315416486e-06, "loss": 0.2525, "step": 12540 }, { "epoch": 2.630421118793212, "grad_norm": 1.041908621788025, "learning_rate": 3.927584147137514e-06, "loss": 0.2833, "step": 12555 }, { "epoch": 2.633563796354494, "grad_norm": 1.2598505020141602, "learning_rate": 3.8619423512106734e-06, "loss": 0.2895, "step": 12570 }, { "epoch": 2.636706473915776, "grad_norm": 1.137080430984497, "learning_rate": 3.7968316832853456e-06, "loss": 0.29, "step": 12585 }, { "epoch": 2.6398491514770583, "grad_norm": 1.0239893198013306, "learning_rate": 3.7322528928967703e-06, "loss": 0.2548, "step": 12600 }, { "epoch": 2.6429918290383405, "grad_norm": 0.9820106625556946, "learning_rate": 3.668206723457329e-06, "loss": 0.3135, "step": 12615 }, { "epoch": 2.6461345065996227, "grad_norm": 0.8583505153656006, "learning_rate": 3.604693912248025e-06, "loss": 0.2581, "step": 12630 }, { "epoch": 2.649277184160905, "grad_norm": 1.1391513347625732, "learning_rate": 3.541715190410022e-06, "loss": 0.2878, "step": 12645 }, { "epoch": 2.6524198617221875, "grad_norm": 1.0786199569702148, "learning_rate": 3.4792712829361917e-06, "loss": 0.2667, "step": 12660 }, { "epoch": 2.6555625392834696, "grad_norm": 0.9973167777061462, "learning_rate": 3.4173629086627633e-06, "loss": 0.2455, "step": 12675 }, { "epoch": 2.658705216844752, "grad_norm": 0.8622914552688599, "learning_rate": 3.355990780261059e-06, "loss": 0.2264, "step": 12690 }, { "epoch": 2.661847894406034, "grad_norm": 0.9155644774436951, "learning_rate": 3.295155604229322e-06, "loss": 0.3147, "step": 12705 }, { "epoch": 2.664990571967316, "grad_norm": 1.313897728919983, "learning_rate": 3.234858080884545e-06, "loss": 0.2793, "step": 12720 }, { "epoch": 2.6681332495285983, "grad_norm": 1.0417330265045166, "learning_rate": 3.1750989043543843e-06, "loss": 0.3048, "step": 12735 }, { "epoch": 2.6712759270898805, "grad_norm": 1.175787091255188, "learning_rate": 3.1158787625692632e-06, "loss": 0.2897, "step": 12750 }, { "epoch": 2.6744186046511627, "grad_norm": 1.1047790050506592, "learning_rate": 3.05719833725433e-06, "loss": 0.3, "step": 12765 }, { "epoch": 2.6775612822124453, "grad_norm": 0.8376184701919556, "learning_rate": 2.9990583039217203e-06, "loss": 0.2654, "step": 12780 }, { "epoch": 2.6807039597737274, "grad_norm": 0.6929535269737244, "learning_rate": 2.941459331862706e-06, "loss": 0.3012, "step": 12795 }, { "epoch": 2.6838466373350096, "grad_norm": 0.832949161529541, "learning_rate": 2.8844020841400364e-06, "loss": 0.2765, "step": 12810 }, { "epoch": 2.686989314896292, "grad_norm": 0.9470664858818054, "learning_rate": 2.827887217580266e-06, "loss": 0.2729, "step": 12825 }, { "epoch": 2.690131992457574, "grad_norm": 0.7952046394348145, "learning_rate": 2.771915382766238e-06, "loss": 0.2464, "step": 12840 }, { "epoch": 2.693274670018856, "grad_norm": 1.0609912872314453, "learning_rate": 2.7164872240295458e-06, "loss": 0.3087, "step": 12855 }, { "epoch": 2.6964173475801383, "grad_norm": 0.9275609850883484, "learning_rate": 2.6616033794431614e-06, "loss": 0.2575, "step": 12870 }, { "epoch": 2.6995600251414205, "grad_norm": 1.464107871055603, "learning_rate": 2.607264480814059e-06, "loss": 0.2919, "step": 12885 }, { "epoch": 2.7027027027027026, "grad_norm": 1.1258777379989624, "learning_rate": 2.5534711536759404e-06, "loss": 0.265, "step": 12900 }, { "epoch": 2.705845380263985, "grad_norm": 1.169700264930725, "learning_rate": 2.5002240172820823e-06, "loss": 0.2849, "step": 12915 }, { "epoch": 2.708988057825267, "grad_norm": 1.3186782598495483, "learning_rate": 2.4475236845981465e-06, "loss": 0.2806, "step": 12930 }, { "epoch": 2.712130735386549, "grad_norm": 1.4104660749435425, "learning_rate": 2.395370762295135e-06, "loss": 0.3004, "step": 12945 }, { "epoch": 2.7152734129478313, "grad_norm": 1.2798209190368652, "learning_rate": 2.343765850742441e-06, "loss": 0.2887, "step": 12960 }, { "epoch": 2.718416090509114, "grad_norm": 1.0648716688156128, "learning_rate": 2.2927095440009093e-06, "loss": 0.2842, "step": 12975 }, { "epoch": 2.721558768070396, "grad_norm": 1.0158684253692627, "learning_rate": 2.2422024298160147e-06, "loss": 0.2977, "step": 12990 }, { "epoch": 2.7247014456316783, "grad_norm": 0.6185563802719116, "learning_rate": 2.1922450896110614e-06, "loss": 0.2967, "step": 13005 }, { "epoch": 2.7278441231929604, "grad_norm": 1.0942654609680176, "learning_rate": 2.142838098480543e-06, "loss": 0.277, "step": 13020 }, { "epoch": 2.7309868007542426, "grad_norm": 1.0424152612686157, "learning_rate": 2.0939820251834717e-06, "loss": 0.2908, "step": 13035 }, { "epoch": 2.7341294783155248, "grad_norm": 1.048524022102356, "learning_rate": 2.0456774321368666e-06, "loss": 0.3442, "step": 13050 }, { "epoch": 2.737272155876807, "grad_norm": 0.8081900477409363, "learning_rate": 1.9979248754092517e-06, "loss": 0.2707, "step": 13065 }, { "epoch": 2.740414833438089, "grad_norm": 1.3440662622451782, "learning_rate": 1.950724904714285e-06, "loss": 0.3337, "step": 13080 }, { "epoch": 2.7435575109993717, "grad_norm": 0.9911431670188904, "learning_rate": 1.904078063404391e-06, "loss": 0.2852, "step": 13095 }, { "epoch": 2.746700188560654, "grad_norm": 1.150423526763916, "learning_rate": 1.8579848884645534e-06, "loss": 0.2571, "step": 13110 }, { "epoch": 2.749842866121936, "grad_norm": 1.1156803369522095, "learning_rate": 1.8124459105060942e-06, "loss": 0.2896, "step": 13125 }, { "epoch": 2.7529855436832182, "grad_norm": 1.040390133857727, "learning_rate": 1.767461653760588e-06, "loss": 0.278, "step": 13140 }, { "epoch": 2.7561282212445004, "grad_norm": 1.0304458141326904, "learning_rate": 1.723032636073807e-06, "loss": 0.2613, "step": 13155 }, { "epoch": 2.7592708988057826, "grad_norm": 1.1717437505722046, "learning_rate": 1.679159368899763e-06, "loss": 0.3064, "step": 13170 }, { "epoch": 2.7624135763670647, "grad_norm": 0.9141078591346741, "learning_rate": 1.63584235729487e-06, "loss": 0.2837, "step": 13185 }, { "epoch": 2.765556253928347, "grad_norm": 1.1188409328460693, "learning_rate": 1.593082099912052e-06, "loss": 0.2932, "step": 13200 }, { "epoch": 2.768698931489629, "grad_norm": 1.0684481859207153, "learning_rate": 1.5508790889950441e-06, "loss": 0.267, "step": 13215 }, { "epoch": 2.7718416090509113, "grad_norm": 0.976677417755127, "learning_rate": 1.5092338103727344e-06, "loss": 0.2897, "step": 13230 }, { "epoch": 2.7749842866121934, "grad_norm": 1.081978678703308, "learning_rate": 1.4681467434535356e-06, "loss": 0.2592, "step": 13245 }, { "epoch": 2.7781269641734756, "grad_norm": 1.090117335319519, "learning_rate": 1.4276183612199178e-06, "loss": 0.2923, "step": 13260 }, { "epoch": 2.7812696417347578, "grad_norm": 1.1117249727249146, "learning_rate": 1.3876491302229011e-06, "loss": 0.2701, "step": 13275 }, { "epoch": 2.7844123192960404, "grad_norm": 1.4228675365447998, "learning_rate": 1.3482395105767543e-06, "loss": 0.3066, "step": 13290 }, { "epoch": 2.7875549968573226, "grad_norm": 0.9276790618896484, "learning_rate": 1.3093899559536272e-06, "loss": 0.2437, "step": 13305 }, { "epoch": 2.7906976744186047, "grad_norm": 1.1724159717559814, "learning_rate": 1.2711009135783825e-06, "loss": 0.3051, "step": 13320 }, { "epoch": 2.793840351979887, "grad_norm": 0.9188593029975891, "learning_rate": 1.2333728242234333e-06, "loss": 0.3214, "step": 13335 }, { "epoch": 2.796983029541169, "grad_norm": 1.084934949874878, "learning_rate": 1.196206122203647e-06, "loss": 0.2653, "step": 13350 }, { "epoch": 2.8001257071024512, "grad_norm": 1.041142225265503, "learning_rate": 1.1596012353713604e-06, "loss": 0.2879, "step": 13365 }, { "epoch": 2.8032683846637334, "grad_norm": 1.026824951171875, "learning_rate": 1.1235585851114726e-06, "loss": 0.3006, "step": 13380 }, { "epoch": 2.8064110622250156, "grad_norm": 1.143835425376892, "learning_rate": 1.0880785863365718e-06, "loss": 0.305, "step": 13395 }, { "epoch": 2.809553739786298, "grad_norm": 0.5169873833656311, "learning_rate": 1.0531616474821649e-06, "loss": 0.2878, "step": 13410 }, { "epoch": 2.8126964173475804, "grad_norm": 1.1536767482757568, "learning_rate": 1.0188081705019558e-06, "loss": 0.2877, "step": 13425 }, { "epoch": 2.8158390949088625, "grad_norm": 0.9985389113426208, "learning_rate": 9.850185508632704e-07, "loss": 0.3113, "step": 13440 }, { "epoch": 2.8189817724701447, "grad_norm": 0.9148264527320862, "learning_rate": 9.517931775424593e-07, "loss": 0.3117, "step": 13455 }, { "epoch": 2.822124450031427, "grad_norm": 1.1424579620361328, "learning_rate": 9.191324330204199e-07, "loss": 0.2721, "step": 13470 }, { "epoch": 2.825267127592709, "grad_norm": 1.054230809211731, "learning_rate": 8.870366932782093e-07, "loss": 0.303, "step": 13485 }, { "epoch": 2.828409805153991, "grad_norm": 1.211416482925415, "learning_rate": 8.555063277927378e-07, "loss": 0.2932, "step": 13500 }, { "epoch": 2.8315524827152734, "grad_norm": 1.4953478574752808, "learning_rate": 8.24541699532455e-07, "loss": 0.3246, "step": 13515 }, { "epoch": 2.8346951602765555, "grad_norm": 0.773501455783844, "learning_rate": 7.94143164953226e-07, "loss": 0.2777, "step": 13530 }, { "epoch": 2.8378378378378377, "grad_norm": 0.6173717379570007, "learning_rate": 7.643110739942172e-07, "loss": 0.3181, "step": 13545 }, { "epoch": 2.84098051539912, "grad_norm": 1.1255333423614502, "learning_rate": 7.350457700738389e-07, "loss": 0.2954, "step": 13560 }, { "epoch": 2.844123192960402, "grad_norm": 1.1932814121246338, "learning_rate": 7.063475900858263e-07, "loss": 0.314, "step": 13575 }, { "epoch": 2.8472658705216842, "grad_norm": 1.5271681547164917, "learning_rate": 6.782168643953312e-07, "loss": 0.3197, "step": 13590 }, { "epoch": 2.850408548082967, "grad_norm": 0.9488076567649841, "learning_rate": 6.506539168351699e-07, "loss": 0.2993, "step": 13605 }, { "epoch": 2.853551225644249, "grad_norm": 1.015404462814331, "learning_rate": 6.236590647020202e-07, "loss": 0.2831, "step": 13620 }, { "epoch": 2.856693903205531, "grad_norm": 0.6510112881660461, "learning_rate": 5.972326187528299e-07, "loss": 0.2806, "step": 13635 }, { "epoch": 2.8598365807668134, "grad_norm": 1.1119881868362427, "learning_rate": 5.7137488320122e-07, "loss": 0.2625, "step": 13650 }, { "epoch": 2.8629792583280955, "grad_norm": 1.0891669988632202, "learning_rate": 5.460861557139818e-07, "loss": 0.2913, "step": 13665 }, { "epoch": 2.8661219358893777, "grad_norm": 1.3575654029846191, "learning_rate": 5.213667274076461e-07, "loss": 0.3209, "step": 13680 }, { "epoch": 2.86926461345066, "grad_norm": 0.7372342944145203, "learning_rate": 4.972168828451251e-07, "loss": 0.2798, "step": 13695 }, { "epoch": 2.872407291011942, "grad_norm": 1.258745551109314, "learning_rate": 4.736369000324703e-07, "loss": 0.3125, "step": 13710 }, { "epoch": 2.8755499685732246, "grad_norm": 0.7658424973487854, "learning_rate": 4.506270504156307e-07, "loss": 0.2501, "step": 13725 }, { "epoch": 2.878692646134507, "grad_norm": 1.225644826889038, "learning_rate": 4.281875988773554e-07, "loss": 0.2975, "step": 13740 }, { "epoch": 2.881835323695789, "grad_norm": 1.0335606336593628, "learning_rate": 4.063188037341348e-07, "loss": 0.2852, "step": 13755 }, { "epoch": 2.884978001257071, "grad_norm": 0.8567134737968445, "learning_rate": 3.8502091673322526e-07, "loss": 0.2584, "step": 13770 }, { "epoch": 2.8881206788183533, "grad_norm": 0.8661710023880005, "learning_rate": 3.642941830497515e-07, "loss": 0.3128, "step": 13785 }, { "epoch": 2.8912633563796355, "grad_norm": 1.1629458665847778, "learning_rate": 3.441388412838864e-07, "loss": 0.2919, "step": 13800 }, { "epoch": 2.8944060339409177, "grad_norm": 0.9116327166557312, "learning_rate": 3.2455512345811457e-07, "loss": 0.2464, "step": 13815 }, { "epoch": 2.8975487115022, "grad_norm": 0.8351930975914001, "learning_rate": 3.055432550145398e-07, "loss": 0.3138, "step": 13830 }, { "epoch": 2.900691389063482, "grad_norm": 0.8611274361610413, "learning_rate": 2.871034548122986e-07, "loss": 0.2675, "step": 13845 }, { "epoch": 2.903834066624764, "grad_norm": 1.021216630935669, "learning_rate": 2.692359351250506e-07, "loss": 0.2545, "step": 13860 }, { "epoch": 2.9069767441860463, "grad_norm": 0.9064350128173828, "learning_rate": 2.5194090163853103e-07, "loss": 0.2813, "step": 13875 }, { "epoch": 2.9101194217473285, "grad_norm": 0.7603162527084351, "learning_rate": 2.3521855344816323e-07, "loss": 0.2837, "step": 13890 }, { "epoch": 2.9132620993086107, "grad_norm": 1.0929245948791504, "learning_rate": 2.1906908305679986e-07, "loss": 0.3017, "step": 13905 }, { "epoch": 2.9164047768698933, "grad_norm": 1.078133225440979, "learning_rate": 2.0349267637247982e-07, "loss": 0.2812, "step": 13920 }, { "epoch": 2.9195474544311755, "grad_norm": 0.6622474789619446, "learning_rate": 1.8848951270630244e-07, "loss": 0.2775, "step": 13935 }, { "epoch": 2.9226901319924576, "grad_norm": 0.8766260147094727, "learning_rate": 1.7405976477035124e-07, "loss": 0.2694, "step": 13950 }, { "epoch": 2.92583280955374, "grad_norm": 1.1658825874328613, "learning_rate": 1.6020359867572333e-07, "loss": 0.2946, "step": 13965 }, { "epoch": 2.928975487115022, "grad_norm": 1.0801419019699097, "learning_rate": 1.469211739306031e-07, "loss": 0.3458, "step": 13980 }, { "epoch": 2.932118164676304, "grad_norm": 1.0484652519226074, "learning_rate": 1.3421264343843054e-07, "loss": 0.3075, "step": 13995 }, { "epoch": 2.9352608422375863, "grad_norm": 1.168779730796814, "learning_rate": 1.2207815349614128e-07, "loss": 0.2848, "step": 14010 }, { "epoch": 2.9384035197988685, "grad_norm": 0.684332013130188, "learning_rate": 1.105178437924792e-07, "loss": 0.2766, "step": 14025 }, { "epoch": 2.941546197360151, "grad_norm": 0.8341169953346252, "learning_rate": 9.953184740639222e-08, "loss": 0.2733, "step": 14040 }, { "epoch": 2.9446888749214333, "grad_norm": 0.9254804849624634, "learning_rate": 8.91202908055e-08, "loss": 0.2501, "step": 14055 }, { "epoch": 2.9478315524827154, "grad_norm": 1.1126680374145508, "learning_rate": 7.928329384463418e-08, "loss": 0.3106, "step": 14070 }, { "epoch": 2.9509742300439976, "grad_norm": 0.7974324226379395, "learning_rate": 7.002096976446715e-08, "loss": 0.2767, "step": 14085 }, { "epoch": 2.95411690760528, "grad_norm": 1.1926546096801758, "learning_rate": 6.133342519020202e-08, "loss": 0.3035, "step": 14100 }, { "epoch": 2.957259585166562, "grad_norm": 1.0937379598617554, "learning_rate": 5.322076013034027e-08, "loss": 0.2464, "step": 14115 }, { "epoch": 2.960402262727844, "grad_norm": 0.7003195285797119, "learning_rate": 4.568306797554378e-08, "loss": 0.2879, "step": 14130 }, { "epoch": 2.9635449402891263, "grad_norm": 1.247180461883545, "learning_rate": 3.872043549754678e-08, "loss": 0.2745, "step": 14145 }, { "epoch": 2.9666876178504085, "grad_norm": 1.021088719367981, "learning_rate": 3.233294284816224e-08, "loss": 0.2661, "step": 14160 }, { "epoch": 2.9698302954116906, "grad_norm": 1.149630069732666, "learning_rate": 2.652066355836591e-08, "loss": 0.3032, "step": 14175 }, { "epoch": 2.972972972972973, "grad_norm": 1.1792501211166382, "learning_rate": 2.128366453743591e-08, "loss": 0.2652, "step": 14190 }, { "epoch": 2.976115650534255, "grad_norm": 1.1864453554153442, "learning_rate": 1.662200607219777e-08, "loss": 0.2712, "step": 14205 }, { "epoch": 2.979258328095537, "grad_norm": 1.6070250272750854, "learning_rate": 1.2535741826313897e-08, "loss": 0.2848, "step": 14220 }, { "epoch": 2.9824010056568198, "grad_norm": 0.9383937120437622, "learning_rate": 9.024918839678486e-09, "loss": 0.2689, "step": 14235 }, { "epoch": 2.985543683218102, "grad_norm": 0.9039358496665955, "learning_rate": 6.089577527873535e-09, "loss": 0.2109, "step": 14250 }, { "epoch": 2.988686360779384, "grad_norm": 0.8809177279472351, "learning_rate": 3.729751681702531e-09, "loss": 0.2992, "step": 14265 }, { "epoch": 2.9918290383406663, "grad_norm": 1.0034148693084717, "learning_rate": 1.94546846679633e-09, "loss": 0.2721, "step": 14280 }, { "epoch": 2.9949717159019484, "grad_norm": 0.8567355871200562, "learning_rate": 7.367484233133937e-10, "loss": 0.2449, "step": 14295 }, { "epoch": 2.9981143934632306, "grad_norm": 1.462417721748352, "learning_rate": 1.0360546568444207e-10, "loss": 0.2634, "step": 14310 }, { "epoch": 3.0, "step": 14319, "total_flos": 2.7323814530514944e+18, "train_loss": 0.39260489724686265, "train_runtime": 12699.4551, "train_samples_per_second": 4.51, "train_steps_per_second": 1.128 } ], "logging_steps": 15, "max_steps": 14319, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.7323814530514944e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }