diff --git "a/checkpoint-10000/trainer_state.json" "b/checkpoint-10000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10000/trainer_state.json" @@ -0,0 +1,4704 @@ +{ + "best_metric": 0.009820309467613697, + "best_model_checkpoint": "/workspace/previous_works/RadFM/output/RadFM-Llama3-8B-pretrain-0002-embed_tokens-depth32-lora-10ep/checkpoint-10000", + "epoch": 2.0951183741881416, + "eval_steps": 10000, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0031426775612822125, + "grad_norm": 38.333740234375, + "learning_rate": 3.488372093023256e-06, + "loss": 2.6324, + "step": 15 + }, + { + "epoch": 0.006285355122564425, + "grad_norm": 23.8914794921875, + "learning_rate": 6.976744186046512e-06, + "loss": 2.3565, + "step": 30 + }, + { + "epoch": 0.009428032683846637, + "grad_norm": 6.890503883361816, + "learning_rate": 1.0465116279069768e-05, + "loss": 1.8897, + "step": 45 + }, + { + "epoch": 0.01257071024512885, + "grad_norm": 3.9464468955993652, + "learning_rate": 1.3953488372093024e-05, + "loss": 1.3707, + "step": 60 + }, + { + "epoch": 0.01571338780641106, + "grad_norm": 4.443431854248047, + "learning_rate": 1.744186046511628e-05, + "loss": 1.055, + "step": 75 + }, + { + "epoch": 0.018856065367693273, + "grad_norm": 3.5747361183166504, + "learning_rate": 2.0930232558139536e-05, + "loss": 0.9048, + "step": 90 + }, + { + "epoch": 0.02199874292897549, + "grad_norm": 4.540731430053711, + "learning_rate": 2.441860465116279e-05, + "loss": 0.9143, + "step": 105 + }, + { + "epoch": 0.0251414204902577, + "grad_norm": 4.121450424194336, + "learning_rate": 2.7906976744186048e-05, + "loss": 0.7641, + "step": 120 + }, + { + "epoch": 0.028284098051539912, + "grad_norm": 3.1179299354553223, + "learning_rate": 3.13953488372093e-05, + "loss": 0.7784, + "step": 135 + }, + { + "epoch": 0.03142677561282212, + "grad_norm": 2.9703869819641113, + "learning_rate": 3.488372093023256e-05, + "loss": 0.7299, + "step": 150 + }, + { + "epoch": 0.034569453174104335, + "grad_norm": 2.706854820251465, + "learning_rate": 3.837209302325582e-05, + "loss": 0.6778, + "step": 165 + }, + { + "epoch": 0.03771213073538655, + "grad_norm": 3.361267328262329, + "learning_rate": 4.186046511627907e-05, + "loss": 0.7222, + "step": 180 + }, + { + "epoch": 0.04085480829666876, + "grad_norm": 4.040229797363281, + "learning_rate": 4.5348837209302326e-05, + "loss": 0.6684, + "step": 195 + }, + { + "epoch": 0.04399748585795098, + "grad_norm": 2.817627429962158, + "learning_rate": 4.883720930232558e-05, + "loss": 0.7458, + "step": 210 + }, + { + "epoch": 0.04714016341923319, + "grad_norm": 2.8800182342529297, + "learning_rate": 5.232558139534884e-05, + "loss": 0.6338, + "step": 225 + }, + { + "epoch": 0.0502828409805154, + "grad_norm": 2.436993360519409, + "learning_rate": 5.5813953488372095e-05, + "loss": 0.6299, + "step": 240 + }, + { + "epoch": 0.05342551854179761, + "grad_norm": 3.5814456939697266, + "learning_rate": 5.9302325581395356e-05, + "loss": 0.5728, + "step": 255 + }, + { + "epoch": 0.056568196103079824, + "grad_norm": 2.8744938373565674, + "learning_rate": 6.27906976744186e-05, + "loss": 0.59, + "step": 270 + }, + { + "epoch": 0.059710873664362035, + "grad_norm": 2.679749011993408, + "learning_rate": 6.627906976744186e-05, + "loss": 0.6016, + "step": 285 + }, + { + "epoch": 0.06285355122564425, + "grad_norm": 3.1333463191986084, + "learning_rate": 6.976744186046513e-05, + "loss": 0.6569, + "step": 300 + }, + { + "epoch": 0.06599622878692646, + "grad_norm": 2.2865939140319824, + "learning_rate": 7.325581395348837e-05, + "loss": 0.6385, + "step": 315 + }, + { + "epoch": 0.06913890634820867, + "grad_norm": 2.9787251949310303, + "learning_rate": 7.674418604651163e-05, + "loss": 0.6307, + "step": 330 + }, + { + "epoch": 0.07228158390949088, + "grad_norm": 2.078509569168091, + "learning_rate": 8.023255813953489e-05, + "loss": 0.5454, + "step": 345 + }, + { + "epoch": 0.0754242614707731, + "grad_norm": 2.6606740951538086, + "learning_rate": 8.372093023255814e-05, + "loss": 0.6211, + "step": 360 + }, + { + "epoch": 0.0785669390320553, + "grad_norm": 1.9346429109573364, + "learning_rate": 8.72093023255814e-05, + "loss": 0.5954, + "step": 375 + }, + { + "epoch": 0.08170961659333752, + "grad_norm": 2.2432360649108887, + "learning_rate": 9.069767441860465e-05, + "loss": 0.5385, + "step": 390 + }, + { + "epoch": 0.08485229415461974, + "grad_norm": 2.1645498275756836, + "learning_rate": 9.418604651162792e-05, + "loss": 0.592, + "step": 405 + }, + { + "epoch": 0.08799497171590195, + "grad_norm": 2.1806533336639404, + "learning_rate": 9.767441860465116e-05, + "loss": 0.5372, + "step": 420 + }, + { + "epoch": 0.09113764927718417, + "grad_norm": 2.445610761642456, + "learning_rate": 9.999996802299678e-05, + "loss": 0.6487, + "step": 435 + }, + { + "epoch": 0.09428032683846638, + "grad_norm": 2.3592734336853027, + "learning_rate": 9.999948836876656e-05, + "loss": 0.5957, + "step": 450 + }, + { + "epoch": 0.09742300439974859, + "grad_norm": 2.3027069568634033, + "learning_rate": 9.999843313485898e-05, + "loss": 0.5835, + "step": 465 + }, + { + "epoch": 0.1005656819610308, + "grad_norm": 2.6429057121276855, + "learning_rate": 9.999680233342161e-05, + "loss": 0.592, + "step": 480 + }, + { + "epoch": 0.10370835952231301, + "grad_norm": 2.0832202434539795, + "learning_rate": 9.999459598322778e-05, + "loss": 0.6203, + "step": 495 + }, + { + "epoch": 0.10685103708359522, + "grad_norm": 2.481870412826538, + "learning_rate": 9.999181410967633e-05, + "loss": 0.5428, + "step": 510 + }, + { + "epoch": 0.10999371464487744, + "grad_norm": 1.9621151685714722, + "learning_rate": 9.99884567447914e-05, + "loss": 0.6101, + "step": 525 + }, + { + "epoch": 0.11313639220615965, + "grad_norm": 2.8833186626434326, + "learning_rate": 9.998452392722198e-05, + "loss": 0.5577, + "step": 540 + }, + { + "epoch": 0.11627906976744186, + "grad_norm": 2.4447429180145264, + "learning_rate": 9.998001570224158e-05, + "loss": 0.566, + "step": 555 + }, + { + "epoch": 0.11942174732872407, + "grad_norm": 2.141496419906616, + "learning_rate": 9.997493212174753e-05, + "loss": 0.6211, + "step": 570 + }, + { + "epoch": 0.12256442489000628, + "grad_norm": 2.389796495437622, + "learning_rate": 9.996927324426057e-05, + "loss": 0.5937, + "step": 585 + }, + { + "epoch": 0.1257071024512885, + "grad_norm": 2.1194262504577637, + "learning_rate": 9.996303913492408e-05, + "loss": 0.5847, + "step": 600 + }, + { + "epoch": 0.12884978001257072, + "grad_norm": 1.7767274379730225, + "learning_rate": 9.99562298655033e-05, + "loss": 0.518, + "step": 615 + }, + { + "epoch": 0.13199245757385292, + "grad_norm": 2.0348453521728516, + "learning_rate": 9.994884551438458e-05, + "loss": 0.5941, + "step": 630 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 1.443819284439087, + "learning_rate": 9.994088616657444e-05, + "loss": 0.5022, + "step": 645 + }, + { + "epoch": 0.13827781269641734, + "grad_norm": 2.1748251914978027, + "learning_rate": 9.993235191369861e-05, + "loss": 0.5369, + "step": 660 + }, + { + "epoch": 0.14142049025769957, + "grad_norm": 1.9295774698257446, + "learning_rate": 9.99232428540009e-05, + "loss": 0.607, + "step": 675 + }, + { + "epoch": 0.14456316781898176, + "grad_norm": 1.7530088424682617, + "learning_rate": 9.991355909234224e-05, + "loss": 0.5417, + "step": 690 + }, + { + "epoch": 0.147705845380264, + "grad_norm": 10.02226448059082, + "learning_rate": 9.990330074019925e-05, + "loss": 0.5901, + "step": 705 + }, + { + "epoch": 0.1508485229415462, + "grad_norm": 1.3864644765853882, + "learning_rate": 9.989246791566314e-05, + "loss": 0.678, + "step": 720 + }, + { + "epoch": 0.1539912005028284, + "grad_norm": 1.6103929281234741, + "learning_rate": 9.988106074343823e-05, + "loss": 0.4741, + "step": 735 + }, + { + "epoch": 0.1571338780641106, + "grad_norm": 1.5933347940444946, + "learning_rate": 9.986907935484064e-05, + "loss": 0.5391, + "step": 750 + }, + { + "epoch": 0.16027655562539284, + "grad_norm": 1.5971338748931885, + "learning_rate": 9.985652388779663e-05, + "loss": 0.5782, + "step": 765 + }, + { + "epoch": 0.16341923318667503, + "grad_norm": 1.559793472290039, + "learning_rate": 9.984339448684113e-05, + "loss": 0.5227, + "step": 780 + }, + { + "epoch": 0.16656191074795726, + "grad_norm": 1.3077164888381958, + "learning_rate": 9.982969130311597e-05, + "loss": 0.5203, + "step": 795 + }, + { + "epoch": 0.16970458830923948, + "grad_norm": 1.6828336715698242, + "learning_rate": 9.98154144943683e-05, + "loss": 0.5471, + "step": 810 + }, + { + "epoch": 0.17284726587052168, + "grad_norm": 1.387099266052246, + "learning_rate": 9.98005642249486e-05, + "loss": 0.5399, + "step": 825 + }, + { + "epoch": 0.1759899434318039, + "grad_norm": 1.723253607749939, + "learning_rate": 9.978514066580886e-05, + "loss": 0.5606, + "step": 840 + }, + { + "epoch": 0.1791326209930861, + "grad_norm": 1.22931706905365, + "learning_rate": 9.976914399450068e-05, + "loss": 0.5024, + "step": 855 + }, + { + "epoch": 0.18227529855436833, + "grad_norm": 1.4278538227081299, + "learning_rate": 9.97525743951731e-05, + "loss": 0.5983, + "step": 870 + }, + { + "epoch": 0.18541797611565053, + "grad_norm": 1.4029372930526733, + "learning_rate": 9.973543205857057e-05, + "loss": 0.5699, + "step": 885 + }, + { + "epoch": 0.18856065367693275, + "grad_norm": 1.3018133640289307, + "learning_rate": 9.971771718203072e-05, + "loss": 0.4936, + "step": 900 + }, + { + "epoch": 0.19170333123821495, + "grad_norm": 1.3082265853881836, + "learning_rate": 9.969942996948209e-05, + "loss": 0.5025, + "step": 915 + }, + { + "epoch": 0.19484600879949718, + "grad_norm": 1.2923167943954468, + "learning_rate": 9.968057063144182e-05, + "loss": 0.5779, + "step": 930 + }, + { + "epoch": 0.19798868636077938, + "grad_norm": 1.2902971506118774, + "learning_rate": 9.966113938501313e-05, + "loss": 0.5373, + "step": 945 + }, + { + "epoch": 0.2011313639220616, + "grad_norm": 1.391560673713684, + "learning_rate": 9.964113645388293e-05, + "loss": 0.5858, + "step": 960 + }, + { + "epoch": 0.2042740414833438, + "grad_norm": 1.3245513439178467, + "learning_rate": 9.96205620683192e-05, + "loss": 0.6043, + "step": 975 + }, + { + "epoch": 0.20741671904462602, + "grad_norm": 1.4998241662979126, + "learning_rate": 9.95994164651683e-05, + "loss": 0.5785, + "step": 990 + }, + { + "epoch": 0.21055939660590822, + "grad_norm": 1.090804934501648, + "learning_rate": 9.957769988785236e-05, + "loss": 0.6439, + "step": 1005 + }, + { + "epoch": 0.21370207416719045, + "grad_norm": 1.1564654111862183, + "learning_rate": 9.955541258636631e-05, + "loss": 0.5091, + "step": 1020 + }, + { + "epoch": 0.21684475172847265, + "grad_norm": 1.1778066158294678, + "learning_rate": 9.953255481727513e-05, + "loss": 0.5456, + "step": 1035 + }, + { + "epoch": 0.21998742928975487, + "grad_norm": 1.3568626642227173, + "learning_rate": 9.950912684371088e-05, + "loss": 0.5208, + "step": 1050 + }, + { + "epoch": 0.2231301068510371, + "grad_norm": 1.804425597190857, + "learning_rate": 9.948512893536961e-05, + "loss": 0.4956, + "step": 1065 + }, + { + "epoch": 0.2262727844123193, + "grad_norm": 1.226159930229187, + "learning_rate": 9.946056136850833e-05, + "loss": 0.5812, + "step": 1080 + }, + { + "epoch": 0.22941546197360152, + "grad_norm": 1.1530790328979492, + "learning_rate": 9.943542442594177e-05, + "loss": 0.4742, + "step": 1095 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 1.390417218208313, + "learning_rate": 9.940971839703916e-05, + "loss": 0.619, + "step": 1110 + }, + { + "epoch": 0.23570081709616594, + "grad_norm": 1.4010789394378662, + "learning_rate": 9.938344357772087e-05, + "loss": 0.6086, + "step": 1125 + }, + { + "epoch": 0.23884349465744814, + "grad_norm": 1.6488044261932373, + "learning_rate": 9.935660027045506e-05, + "loss": 0.551, + "step": 1140 + }, + { + "epoch": 0.24198617221873037, + "grad_norm": 1.0560044050216675, + "learning_rate": 9.932918878425412e-05, + "loss": 0.532, + "step": 1155 + }, + { + "epoch": 0.24512884978001256, + "grad_norm": 1.0651888847351074, + "learning_rate": 9.930120943467117e-05, + "loss": 0.5012, + "step": 1170 + }, + { + "epoch": 0.2482715273412948, + "grad_norm": 1.0553079843521118, + "learning_rate": 9.927266254379642e-05, + "loss": 0.5576, + "step": 1185 + }, + { + "epoch": 0.251414204902577, + "grad_norm": 1.007480263710022, + "learning_rate": 9.924354844025339e-05, + "loss": 0.4839, + "step": 1200 + }, + { + "epoch": 0.2545568824638592, + "grad_norm": 1.0924334526062012, + "learning_rate": 9.921386745919528e-05, + "loss": 0.595, + "step": 1215 + }, + { + "epoch": 0.25769956002514144, + "grad_norm": 1.3309390544891357, + "learning_rate": 9.918361994230097e-05, + "loss": 0.5224, + "step": 1230 + }, + { + "epoch": 0.2608422375864236, + "grad_norm": 0.9702763557434082, + "learning_rate": 9.915280623777114e-05, + "loss": 0.4871, + "step": 1245 + }, + { + "epoch": 0.26398491514770583, + "grad_norm": 1.0511876344680786, + "learning_rate": 9.912142670032427e-05, + "loss": 0.5861, + "step": 1260 + }, + { + "epoch": 0.26712759270898806, + "grad_norm": 1.396050214767456, + "learning_rate": 9.908948169119251e-05, + "loss": 0.4651, + "step": 1275 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 0.985396683216095, + "learning_rate": 9.905697157811761e-05, + "loss": 0.4302, + "step": 1290 + }, + { + "epoch": 0.27341294783155246, + "grad_norm": 0.9169828295707703, + "learning_rate": 9.902389673534659e-05, + "loss": 0.5212, + "step": 1305 + }, + { + "epoch": 0.2765556253928347, + "grad_norm": 0.9107710123062134, + "learning_rate": 9.899025754362751e-05, + "loss": 0.4941, + "step": 1320 + }, + { + "epoch": 0.2796983029541169, + "grad_norm": 0.9720286726951599, + "learning_rate": 9.8956054390205e-05, + "loss": 0.5169, + "step": 1335 + }, + { + "epoch": 0.28284098051539913, + "grad_norm": 1.1490366458892822, + "learning_rate": 9.892128766881596e-05, + "loss": 0.4973, + "step": 1350 + }, + { + "epoch": 0.28598365807668136, + "grad_norm": 1.2628952264785767, + "learning_rate": 9.888595777968479e-05, + "loss": 0.5194, + "step": 1365 + }, + { + "epoch": 0.2891263356379635, + "grad_norm": 1.1610651016235352, + "learning_rate": 9.885006512951897e-05, + "loss": 0.4994, + "step": 1380 + }, + { + "epoch": 0.29226901319924575, + "grad_norm": 1.054768681526184, + "learning_rate": 9.881361013150436e-05, + "loss": 0.4664, + "step": 1395 + }, + { + "epoch": 0.295411690760528, + "grad_norm": 1.0745666027069092, + "learning_rate": 9.877659320530037e-05, + "loss": 0.5306, + "step": 1410 + }, + { + "epoch": 0.2985543683218102, + "grad_norm": 1.3258591890335083, + "learning_rate": 9.873901477703516e-05, + "loss": 0.5076, + "step": 1425 + }, + { + "epoch": 0.3016970458830924, + "grad_norm": 1.222783088684082, + "learning_rate": 9.870087527930077e-05, + "loss": 0.4581, + "step": 1440 + }, + { + "epoch": 0.3048397234443746, + "grad_norm": 0.9374076724052429, + "learning_rate": 9.866217515114805e-05, + "loss": 0.4643, + "step": 1455 + }, + { + "epoch": 0.3079824010056568, + "grad_norm": 1.3485162258148193, + "learning_rate": 9.862291483808173e-05, + "loss": 0.5551, + "step": 1470 + }, + { + "epoch": 0.31112507856693905, + "grad_norm": 0.9162548780441284, + "learning_rate": 9.858309479205519e-05, + "loss": 0.5592, + "step": 1485 + }, + { + "epoch": 0.3142677561282212, + "grad_norm": 1.1385138034820557, + "learning_rate": 9.854271547146531e-05, + "loss": 0.477, + "step": 1500 + }, + { + "epoch": 0.31741043368950345, + "grad_norm": 1.0023164749145508, + "learning_rate": 9.850177734114718e-05, + "loss": 0.4972, + "step": 1515 + }, + { + "epoch": 0.32055311125078567, + "grad_norm": 2.540215492248535, + "learning_rate": 9.846028087236873e-05, + "loss": 0.5007, + "step": 1530 + }, + { + "epoch": 0.3236957888120679, + "grad_norm": 1.2012773752212524, + "learning_rate": 9.841822654282533e-05, + "loss": 0.5481, + "step": 1545 + }, + { + "epoch": 0.32683846637335007, + "grad_norm": 0.9517608284950256, + "learning_rate": 9.837561483663429e-05, + "loss": 0.567, + "step": 1560 + }, + { + "epoch": 0.3299811439346323, + "grad_norm": 1.0308321714401245, + "learning_rate": 9.833244624432927e-05, + "loss": 0.4856, + "step": 1575 + }, + { + "epoch": 0.3331238214959145, + "grad_norm": 1.118574857711792, + "learning_rate": 9.828872126285465e-05, + "loss": 0.465, + "step": 1590 + }, + { + "epoch": 0.33626649905719674, + "grad_norm": 1.0821537971496582, + "learning_rate": 9.824444039555977e-05, + "loss": 0.4394, + "step": 1605 + }, + { + "epoch": 0.33940917661847897, + "grad_norm": 0.8795451521873474, + "learning_rate": 9.81996041521932e-05, + "loss": 0.4383, + "step": 1620 + }, + { + "epoch": 0.34255185417976114, + "grad_norm": 1.1455141305923462, + "learning_rate": 9.815421304889687e-05, + "loss": 0.4805, + "step": 1635 + }, + { + "epoch": 0.34569453174104336, + "grad_norm": 1.1445369720458984, + "learning_rate": 9.81082676082e-05, + "loss": 0.5315, + "step": 1650 + }, + { + "epoch": 0.3488372093023256, + "grad_norm": 1.0800312757492065, + "learning_rate": 9.806176835901328e-05, + "loss": 0.5205, + "step": 1665 + }, + { + "epoch": 0.3519798868636078, + "grad_norm": 0.7038319706916809, + "learning_rate": 9.801471583662263e-05, + "loss": 0.515, + "step": 1680 + }, + { + "epoch": 0.35512256442489, + "grad_norm": 0.9790651202201843, + "learning_rate": 9.796711058268313e-05, + "loss": 0.504, + "step": 1695 + }, + { + "epoch": 0.3582652419861722, + "grad_norm": 1.1764894723892212, + "learning_rate": 9.791895314521267e-05, + "loss": 0.4806, + "step": 1710 + }, + { + "epoch": 0.36140791954745444, + "grad_norm": 0.9900022745132446, + "learning_rate": 9.787024407858582e-05, + "loss": 0.5358, + "step": 1725 + }, + { + "epoch": 0.36455059710873666, + "grad_norm": 0.8621386289596558, + "learning_rate": 9.782098394352725e-05, + "loss": 0.5494, + "step": 1740 + }, + { + "epoch": 0.36769327467001883, + "grad_norm": 0.8717844486236572, + "learning_rate": 9.777117330710547e-05, + "loss": 0.4967, + "step": 1755 + }, + { + "epoch": 0.37083595223130106, + "grad_norm": 0.9800569415092468, + "learning_rate": 9.772081274272611e-05, + "loss": 0.4538, + "step": 1770 + }, + { + "epoch": 0.3739786297925833, + "grad_norm": 0.9540134072303772, + "learning_rate": 9.766990283012544e-05, + "loss": 0.5149, + "step": 1785 + }, + { + "epoch": 0.3771213073538655, + "grad_norm": 1.0856047868728638, + "learning_rate": 9.761844415536372e-05, + "loss": 0.5042, + "step": 1800 + }, + { + "epoch": 0.3802639849151477, + "grad_norm": 1.0914040803909302, + "learning_rate": 9.756643731081833e-05, + "loss": 0.5059, + "step": 1815 + }, + { + "epoch": 0.3834066624764299, + "grad_norm": 1.2371134757995605, + "learning_rate": 9.751388289517704e-05, + "loss": 0.4506, + "step": 1830 + }, + { + "epoch": 0.38654934003771213, + "grad_norm": 1.0402591228485107, + "learning_rate": 9.746078151343116e-05, + "loss": 0.5535, + "step": 1845 + }, + { + "epoch": 0.38969201759899436, + "grad_norm": 0.6260209083557129, + "learning_rate": 9.740713377686843e-05, + "loss": 0.4436, + "step": 1860 + }, + { + "epoch": 0.3928346951602766, + "grad_norm": 0.9588780999183655, + "learning_rate": 9.735294030306611e-05, + "loss": 0.5573, + "step": 1875 + }, + { + "epoch": 0.39597737272155875, + "grad_norm": 1.0838474035263062, + "learning_rate": 9.729820171588384e-05, + "loss": 0.4627, + "step": 1890 + }, + { + "epoch": 0.399120050282841, + "grad_norm": 1.0682798624038696, + "learning_rate": 9.724291864545643e-05, + "loss": 0.4893, + "step": 1905 + }, + { + "epoch": 0.4022627278441232, + "grad_norm": 0.9129301309585571, + "learning_rate": 9.718709172818661e-05, + "loss": 0.4898, + "step": 1920 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 1.0116883516311646, + "learning_rate": 9.713072160673777e-05, + "loss": 0.4615, + "step": 1935 + }, + { + "epoch": 0.4085480829666876, + "grad_norm": 1.057822823524475, + "learning_rate": 9.707380893002646e-05, + "loss": 0.4899, + "step": 1950 + }, + { + "epoch": 0.4116907605279698, + "grad_norm": 0.6419869661331177, + "learning_rate": 9.7016354353215e-05, + "loss": 0.4348, + "step": 1965 + }, + { + "epoch": 0.41483343808925205, + "grad_norm": 0.961713433265686, + "learning_rate": 9.695835853770387e-05, + "loss": 0.4921, + "step": 1980 + }, + { + "epoch": 0.4179761156505343, + "grad_norm": 0.9473373889923096, + "learning_rate": 9.689982215112417e-05, + "loss": 0.4926, + "step": 1995 + }, + { + "epoch": 0.42111879321181644, + "grad_norm": 1.2034335136413574, + "learning_rate": 9.684074586732987e-05, + "loss": 0.5042, + "step": 2010 + }, + { + "epoch": 0.42426147077309867, + "grad_norm": 0.9373855590820312, + "learning_rate": 9.678113036639014e-05, + "loss": 0.5076, + "step": 2025 + }, + { + "epoch": 0.4274041483343809, + "grad_norm": 1.016756296157837, + "learning_rate": 9.672097633458136e-05, + "loss": 0.4805, + "step": 2040 + }, + { + "epoch": 0.4305468258956631, + "grad_norm": 0.7454690337181091, + "learning_rate": 9.666028446437942e-05, + "loss": 0.5382, + "step": 2055 + }, + { + "epoch": 0.4336895034569453, + "grad_norm": 0.8196286559104919, + "learning_rate": 9.659905545445159e-05, + "loss": 0.4613, + "step": 2070 + }, + { + "epoch": 0.4368321810182275, + "grad_norm": 0.9132091403007507, + "learning_rate": 9.653729000964857e-05, + "loss": 0.4595, + "step": 2085 + }, + { + "epoch": 0.43997485857950974, + "grad_norm": 0.8063992857933044, + "learning_rate": 9.647498884099633e-05, + "loss": 0.4139, + "step": 2100 + }, + { + "epoch": 0.44311753614079197, + "grad_norm": 0.9756997227668762, + "learning_rate": 9.641215266568794e-05, + "loss": 0.3941, + "step": 2115 + }, + { + "epoch": 0.4462602137020742, + "grad_norm": 0.6542510390281677, + "learning_rate": 9.634878220707531e-05, + "loss": 0.4768, + "step": 2130 + }, + { + "epoch": 0.44940289126335636, + "grad_norm": 0.9039008617401123, + "learning_rate": 9.628487819466086e-05, + "loss": 0.4248, + "step": 2145 + }, + { + "epoch": 0.4525455688246386, + "grad_norm": 1.1151047945022583, + "learning_rate": 9.622044136408914e-05, + "loss": 0.5041, + "step": 2160 + }, + { + "epoch": 0.4556882463859208, + "grad_norm": 0.8580663800239563, + "learning_rate": 9.615547245713836e-05, + "loss": 0.4766, + "step": 2175 + }, + { + "epoch": 0.45883092394720304, + "grad_norm": 0.9799042344093323, + "learning_rate": 9.608997222171178e-05, + "loss": 0.4714, + "step": 2190 + }, + { + "epoch": 0.4619736015084852, + "grad_norm": 0.8485172986984253, + "learning_rate": 9.602394141182927e-05, + "loss": 0.4556, + "step": 2205 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 0.9632934927940369, + "learning_rate": 9.595738078761837e-05, + "loss": 0.4791, + "step": 2220 + }, + { + "epoch": 0.46825895663104966, + "grad_norm": 0.8843478560447693, + "learning_rate": 9.589029111530586e-05, + "loss": 0.4603, + "step": 2235 + }, + { + "epoch": 0.4714016341923319, + "grad_norm": 1.1230348348617554, + "learning_rate": 9.582267316720861e-05, + "loss": 0.491, + "step": 2250 + }, + { + "epoch": 0.47454431175361406, + "grad_norm": 0.8234013915061951, + "learning_rate": 9.575452772172495e-05, + "loss": 0.44, + "step": 2265 + }, + { + "epoch": 0.4776869893148963, + "grad_norm": 0.6838919520378113, + "learning_rate": 9.568585556332559e-05, + "loss": 0.4456, + "step": 2280 + }, + { + "epoch": 0.4808296668761785, + "grad_norm": 0.8424423336982727, + "learning_rate": 9.561665748254456e-05, + "loss": 0.4556, + "step": 2295 + }, + { + "epoch": 0.48397234443746073, + "grad_norm": 0.6735498905181885, + "learning_rate": 9.554693427597024e-05, + "loss": 0.5184, + "step": 2310 + }, + { + "epoch": 0.4871150219987429, + "grad_norm": 0.8868768811225891, + "learning_rate": 9.5476686746236e-05, + "loss": 0.5403, + "step": 2325 + }, + { + "epoch": 0.49025769956002513, + "grad_norm": 0.9957670569419861, + "learning_rate": 9.540591570201116e-05, + "loss": 0.4997, + "step": 2340 + }, + { + "epoch": 0.49340037712130735, + "grad_norm": 0.76320481300354, + "learning_rate": 9.533462195799157e-05, + "loss": 0.4534, + "step": 2355 + }, + { + "epoch": 0.4965430546825896, + "grad_norm": 0.8841500282287598, + "learning_rate": 9.526280633489018e-05, + "loss": 0.4724, + "step": 2370 + }, + { + "epoch": 0.4996857322438718, + "grad_norm": 0.8852142095565796, + "learning_rate": 9.519046965942776e-05, + "loss": 0.4655, + "step": 2385 + }, + { + "epoch": 0.502828409805154, + "grad_norm": 0.839430570602417, + "learning_rate": 9.511761276432321e-05, + "loss": 0.4386, + "step": 2400 + }, + { + "epoch": 0.5059710873664363, + "grad_norm": 0.7581266760826111, + "learning_rate": 9.50442364882841e-05, + "loss": 0.4774, + "step": 2415 + }, + { + "epoch": 0.5091137649277184, + "grad_norm": 0.8754017949104309, + "learning_rate": 9.497034167599691e-05, + "loss": 0.4744, + "step": 2430 + }, + { + "epoch": 0.5122564424890006, + "grad_norm": 0.9099476337432861, + "learning_rate": 9.48959291781174e-05, + "loss": 0.4292, + "step": 2445 + }, + { + "epoch": 0.5153991200502829, + "grad_norm": 0.9721155166625977, + "learning_rate": 9.482099985126079e-05, + "loss": 0.4137, + "step": 2460 + }, + { + "epoch": 0.518541797611565, + "grad_norm": 0.8385334014892578, + "learning_rate": 9.474555455799181e-05, + "loss": 0.471, + "step": 2475 + }, + { + "epoch": 0.5216844751728472, + "grad_norm": 0.9853966236114502, + "learning_rate": 9.466959416681495e-05, + "loss": 0.4233, + "step": 2490 + }, + { + "epoch": 0.5248271527341295, + "grad_norm": 1.1044224500656128, + "learning_rate": 9.459311955216428e-05, + "loss": 0.5188, + "step": 2505 + }, + { + "epoch": 0.5279698302954117, + "grad_norm": 0.870677649974823, + "learning_rate": 9.451613159439349e-05, + "loss": 0.4676, + "step": 2520 + }, + { + "epoch": 0.531112507856694, + "grad_norm": 0.8571140170097351, + "learning_rate": 9.443863117976573e-05, + "loss": 0.4863, + "step": 2535 + }, + { + "epoch": 0.5342551854179761, + "grad_norm": 1.0573495626449585, + "learning_rate": 9.436061920044341e-05, + "loss": 0.5057, + "step": 2550 + }, + { + "epoch": 0.5373978629792583, + "grad_norm": 0.9805963635444641, + "learning_rate": 9.42820965544779e-05, + "loss": 0.468, + "step": 2565 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.8198602199554443, + "learning_rate": 9.420306414579925e-05, + "loss": 0.5054, + "step": 2580 + }, + { + "epoch": 0.5436832181018227, + "grad_norm": 0.9718137979507446, + "learning_rate": 9.412352288420572e-05, + "loss": 0.4824, + "step": 2595 + }, + { + "epoch": 0.5468258956631049, + "grad_norm": 1.0223153829574585, + "learning_rate": 9.404347368535337e-05, + "loss": 0.4502, + "step": 2610 + }, + { + "epoch": 0.5499685732243872, + "grad_norm": 0.9398010969161987, + "learning_rate": 9.396291747074547e-05, + "loss": 0.4761, + "step": 2625 + }, + { + "epoch": 0.5531112507856694, + "grad_norm": 0.9091777801513672, + "learning_rate": 9.38818551677219e-05, + "loss": 0.4033, + "step": 2640 + }, + { + "epoch": 0.5562539283469516, + "grad_norm": 1.06580650806427, + "learning_rate": 9.380028770944849e-05, + "loss": 0.4052, + "step": 2655 + }, + { + "epoch": 0.5593966059082338, + "grad_norm": 0.7236329913139343, + "learning_rate": 9.371821603490627e-05, + "loss": 0.4677, + "step": 2670 + }, + { + "epoch": 0.562539283469516, + "grad_norm": 0.8263210654258728, + "learning_rate": 9.363564108888069e-05, + "loss": 0.4576, + "step": 2685 + }, + { + "epoch": 0.5656819610307983, + "grad_norm": 1.022448182106018, + "learning_rate": 9.355256382195068e-05, + "loss": 0.4963, + "step": 2700 + }, + { + "epoch": 0.5688246385920804, + "grad_norm": 0.9639766812324524, + "learning_rate": 9.346898519047775e-05, + "loss": 0.4113, + "step": 2715 + }, + { + "epoch": 0.5719673161533627, + "grad_norm": 1.1044561862945557, + "learning_rate": 9.338490615659499e-05, + "loss": 0.5023, + "step": 2730 + }, + { + "epoch": 0.5751099937146449, + "grad_norm": 0.8272239565849304, + "learning_rate": 9.330032768819596e-05, + "loss": 0.4699, + "step": 2745 + }, + { + "epoch": 0.578252671275927, + "grad_norm": 0.7692523002624512, + "learning_rate": 9.321525075892356e-05, + "loss": 0.4292, + "step": 2760 + }, + { + "epoch": 0.5813953488372093, + "grad_norm": 0.9032982587814331, + "learning_rate": 9.312967634815888e-05, + "loss": 0.4432, + "step": 2775 + }, + { + "epoch": 0.5845380263984915, + "grad_norm": 0.7676737904548645, + "learning_rate": 9.304360544100982e-05, + "loss": 0.4311, + "step": 2790 + }, + { + "epoch": 0.5876807039597737, + "grad_norm": 0.9019532799720764, + "learning_rate": 9.29570390282998e-05, + "loss": 0.4464, + "step": 2805 + }, + { + "epoch": 0.590823381521056, + "grad_norm": 0.9738386869430542, + "learning_rate": 9.286997810655638e-05, + "loss": 0.5019, + "step": 2820 + }, + { + "epoch": 0.5939660590823381, + "grad_norm": 0.7886769771575928, + "learning_rate": 9.278242367799978e-05, + "loss": 0.4919, + "step": 2835 + }, + { + "epoch": 0.5971087366436204, + "grad_norm": 0.9002622365951538, + "learning_rate": 9.269437675053129e-05, + "loss": 0.4695, + "step": 2850 + }, + { + "epoch": 0.6002514142049026, + "grad_norm": 0.7023227214813232, + "learning_rate": 9.260583833772172e-05, + "loss": 0.4338, + "step": 2865 + }, + { + "epoch": 0.6033940917661847, + "grad_norm": 0.9442479014396667, + "learning_rate": 9.251680945879975e-05, + "loss": 0.4907, + "step": 2880 + }, + { + "epoch": 0.606536769327467, + "grad_norm": 0.6304488778114319, + "learning_rate": 9.24272911386401e-05, + "loss": 0.4612, + "step": 2895 + }, + { + "epoch": 0.6096794468887492, + "grad_norm": 0.731960117816925, + "learning_rate": 9.233728440775185e-05, + "loss": 0.4207, + "step": 2910 + }, + { + "epoch": 0.6128221244500315, + "grad_norm": 1.083849549293518, + "learning_rate": 9.224679030226648e-05, + "loss": 0.4775, + "step": 2925 + }, + { + "epoch": 0.6159648020113137, + "grad_norm": 0.6792687177658081, + "learning_rate": 9.215580986392607e-05, + "loss": 0.4708, + "step": 2940 + }, + { + "epoch": 0.6191074795725958, + "grad_norm": 0.7582160830497742, + "learning_rate": 9.20643441400711e-05, + "loss": 0.4352, + "step": 2955 + }, + { + "epoch": 0.6222501571338781, + "grad_norm": 0.7785065174102783, + "learning_rate": 9.197239418362862e-05, + "loss": 0.4199, + "step": 2970 + }, + { + "epoch": 0.6253928346951603, + "grad_norm": 0.9076778292655945, + "learning_rate": 9.187996105309995e-05, + "loss": 0.4937, + "step": 2985 + }, + { + "epoch": 0.6285355122564424, + "grad_norm": 0.9189762473106384, + "learning_rate": 9.178704581254865e-05, + "loss": 0.4553, + "step": 3000 + }, + { + "epoch": 0.6316781898177247, + "grad_norm": 0.8485803008079529, + "learning_rate": 9.169364953158812e-05, + "loss": 0.4799, + "step": 3015 + }, + { + "epoch": 0.6348208673790069, + "grad_norm": 0.8296557068824768, + "learning_rate": 9.15997732853694e-05, + "loss": 0.4799, + "step": 3030 + }, + { + "epoch": 0.6379635449402892, + "grad_norm": 0.9346463680267334, + "learning_rate": 9.150541815456874e-05, + "loss": 0.4707, + "step": 3045 + }, + { + "epoch": 0.6411062225015713, + "grad_norm": 1.0045510530471802, + "learning_rate": 9.141058522537515e-05, + "loss": 0.5216, + "step": 3060 + }, + { + "epoch": 0.6442489000628535, + "grad_norm": 0.5840141773223877, + "learning_rate": 9.131527558947796e-05, + "loss": 0.429, + "step": 3075 + }, + { + "epoch": 0.6473915776241358, + "grad_norm": 0.8743481040000916, + "learning_rate": 9.121949034405417e-05, + "loss": 0.4734, + "step": 3090 + }, + { + "epoch": 0.650534255185418, + "grad_norm": 0.9631288051605225, + "learning_rate": 9.112323059175588e-05, + "loss": 0.4856, + "step": 3105 + }, + { + "epoch": 0.6536769327467001, + "grad_norm": 0.7583104372024536, + "learning_rate": 9.102649744069758e-05, + "loss": 0.4428, + "step": 3120 + }, + { + "epoch": 0.6568196103079824, + "grad_norm": 0.9227087497711182, + "learning_rate": 9.092929200444337e-05, + "loss": 0.4622, + "step": 3135 + }, + { + "epoch": 0.6599622878692646, + "grad_norm": 0.720124363899231, + "learning_rate": 9.083161540199417e-05, + "loss": 0.4136, + "step": 3150 + }, + { + "epoch": 0.6631049654305469, + "grad_norm": 0.6481117010116577, + "learning_rate": 9.073346875777487e-05, + "loss": 0.5445, + "step": 3165 + }, + { + "epoch": 0.666247642991829, + "grad_norm": 0.6970652937889099, + "learning_rate": 9.063485320162126e-05, + "loss": 0.4247, + "step": 3180 + }, + { + "epoch": 0.6693903205531112, + "grad_norm": 0.5132230520248413, + "learning_rate": 9.053576986876718e-05, + "loss": 0.4415, + "step": 3195 + }, + { + "epoch": 0.6725329981143935, + "grad_norm": 0.7673790454864502, + "learning_rate": 9.043621989983135e-05, + "loss": 0.5188, + "step": 3210 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.8441967368125916, + "learning_rate": 9.033620444080428e-05, + "loss": 0.4343, + "step": 3225 + }, + { + "epoch": 0.6788183532369579, + "grad_norm": 0.8746171593666077, + "learning_rate": 9.023572464303506e-05, + "loss": 0.4114, + "step": 3240 + }, + { + "epoch": 0.6819610307982401, + "grad_norm": 0.7494221925735474, + "learning_rate": 9.013478166321812e-05, + "loss": 0.4334, + "step": 3255 + }, + { + "epoch": 0.6851037083595223, + "grad_norm": 0.7263948917388916, + "learning_rate": 9.00333766633799e-05, + "loss": 0.4322, + "step": 3270 + }, + { + "epoch": 0.6882463859208046, + "grad_norm": 0.852172315120697, + "learning_rate": 8.99315108108655e-05, + "loss": 0.4506, + "step": 3285 + }, + { + "epoch": 0.6913890634820867, + "grad_norm": 0.7959320545196533, + "learning_rate": 8.98291852783252e-05, + "loss": 0.4456, + "step": 3300 + }, + { + "epoch": 0.6945317410433689, + "grad_norm": 0.5918748378753662, + "learning_rate": 8.9726401243701e-05, + "loss": 0.4181, + "step": 3315 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 0.9726805090904236, + "learning_rate": 8.962315989021304e-05, + "loss": 0.4964, + "step": 3330 + }, + { + "epoch": 0.7008170961659334, + "grad_norm": 0.8826568126678467, + "learning_rate": 8.951946240634596e-05, + "loss": 0.4702, + "step": 3345 + }, + { + "epoch": 0.7039597737272156, + "grad_norm": 0.7354099154472351, + "learning_rate": 8.941530998583527e-05, + "loss": 0.4258, + "step": 3360 + }, + { + "epoch": 0.7071024512884978, + "grad_norm": 0.9217835664749146, + "learning_rate": 8.931070382765359e-05, + "loss": 0.5185, + "step": 3375 + }, + { + "epoch": 0.71024512884978, + "grad_norm": 0.7444872260093689, + "learning_rate": 8.920564513599679e-05, + "loss": 0.4534, + "step": 3390 + }, + { + "epoch": 0.7133878064110623, + "grad_norm": 0.7847276926040649, + "learning_rate": 8.910013512027022e-05, + "loss": 0.4232, + "step": 3405 + }, + { + "epoch": 0.7165304839723444, + "grad_norm": 0.8024355173110962, + "learning_rate": 8.899417499507471e-05, + "loss": 0.4579, + "step": 3420 + }, + { + "epoch": 0.7196731615336267, + "grad_norm": 0.7088613510131836, + "learning_rate": 8.888776598019266e-05, + "loss": 0.4437, + "step": 3435 + }, + { + "epoch": 0.7228158390949089, + "grad_norm": 0.6009235382080078, + "learning_rate": 8.87809093005739e-05, + "loss": 0.397, + "step": 3450 + }, + { + "epoch": 0.725958516656191, + "grad_norm": 0.8743120431900024, + "learning_rate": 8.867360618632172e-05, + "loss": 0.5056, + "step": 3465 + }, + { + "epoch": 0.7291011942174733, + "grad_norm": 0.899148166179657, + "learning_rate": 8.856585787267856e-05, + "loss": 0.4521, + "step": 3480 + }, + { + "epoch": 0.7322438717787555, + "grad_norm": 0.8690171837806702, + "learning_rate": 8.845766560001193e-05, + "loss": 0.4708, + "step": 3495 + }, + { + "epoch": 0.7353865493400377, + "grad_norm": 0.9699186682701111, + "learning_rate": 8.834903061380002e-05, + "loss": 0.4534, + "step": 3510 + }, + { + "epoch": 0.73852922690132, + "grad_norm": 0.8577262163162231, + "learning_rate": 8.823995416461744e-05, + "loss": 0.4096, + "step": 3525 + }, + { + "epoch": 0.7416719044626021, + "grad_norm": 0.7458922266960144, + "learning_rate": 8.81304375081208e-05, + "loss": 0.46, + "step": 3540 + }, + { + "epoch": 0.7448145820238844, + "grad_norm": 0.7347140908241272, + "learning_rate": 8.802048190503423e-05, + "loss": 0.4684, + "step": 3555 + }, + { + "epoch": 0.7479572595851666, + "grad_norm": 0.7161451578140259, + "learning_rate": 8.79100886211349e-05, + "loss": 0.4715, + "step": 3570 + }, + { + "epoch": 0.7510999371464487, + "grad_norm": 0.8321588039398193, + "learning_rate": 8.779925892723842e-05, + "loss": 0.3598, + "step": 3585 + }, + { + "epoch": 0.754242614707731, + "grad_norm": 0.9462142586708069, + "learning_rate": 8.768799409918423e-05, + "loss": 0.4404, + "step": 3600 + }, + { + "epoch": 0.7573852922690132, + "grad_norm": 0.6842710971832275, + "learning_rate": 8.75762954178209e-05, + "loss": 0.4648, + "step": 3615 + }, + { + "epoch": 0.7605279698302954, + "grad_norm": 0.8573241829872131, + "learning_rate": 8.746416416899145e-05, + "loss": 0.4592, + "step": 3630 + }, + { + "epoch": 0.7636706473915776, + "grad_norm": 0.751291811466217, + "learning_rate": 8.735160164351841e-05, + "loss": 0.5319, + "step": 3645 + }, + { + "epoch": 0.7668133249528598, + "grad_norm": 0.731086790561676, + "learning_rate": 8.72386091371891e-05, + "loss": 0.4629, + "step": 3660 + }, + { + "epoch": 0.7699560025141421, + "grad_norm": 0.9289976358413696, + "learning_rate": 8.712518795074063e-05, + "loss": 0.4427, + "step": 3675 + }, + { + "epoch": 0.7730986800754243, + "grad_norm": 0.7036064267158508, + "learning_rate": 8.701133938984496e-05, + "loss": 0.4679, + "step": 3690 + }, + { + "epoch": 0.7762413576367064, + "grad_norm": 0.778161346912384, + "learning_rate": 8.689706476509385e-05, + "loss": 0.4489, + "step": 3705 + }, + { + "epoch": 0.7793840351979887, + "grad_norm": 0.8694556951522827, + "learning_rate": 8.678236539198382e-05, + "loss": 0.4048, + "step": 3720 + }, + { + "epoch": 0.7825267127592709, + "grad_norm": 0.5768362283706665, + "learning_rate": 8.666724259090092e-05, + "loss": 0.4434, + "step": 3735 + }, + { + "epoch": 0.7856693903205532, + "grad_norm": 0.604917585849762, + "learning_rate": 8.655169768710562e-05, + "loss": 0.4669, + "step": 3750 + }, + { + "epoch": 0.7888120678818353, + "grad_norm": 0.833985447883606, + "learning_rate": 8.643573201071748e-05, + "loss": 0.4267, + "step": 3765 + }, + { + "epoch": 0.7919547454431175, + "grad_norm": 0.7951568365097046, + "learning_rate": 8.631934689669992e-05, + "loss": 0.4028, + "step": 3780 + }, + { + "epoch": 0.7950974230043998, + "grad_norm": 0.7703410983085632, + "learning_rate": 8.620254368484474e-05, + "loss": 0.4153, + "step": 3795 + }, + { + "epoch": 0.798240100565682, + "grad_norm": 0.8545910716056824, + "learning_rate": 8.608532371975684e-05, + "loss": 0.4949, + "step": 3810 + }, + { + "epoch": 0.8013827781269641, + "grad_norm": 0.8206099271774292, + "learning_rate": 8.59676883508386e-05, + "loss": 0.4714, + "step": 3825 + }, + { + "epoch": 0.8045254556882464, + "grad_norm": 0.7841479182243347, + "learning_rate": 8.584963893227442e-05, + "loss": 0.4888, + "step": 3840 + }, + { + "epoch": 0.8076681332495286, + "grad_norm": 0.7417731285095215, + "learning_rate": 8.573117682301514e-05, + "loss": 0.4951, + "step": 3855 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.9013925194740295, + "learning_rate": 8.561230338676239e-05, + "loss": 0.4542, + "step": 3870 + }, + { + "epoch": 0.813953488372093, + "grad_norm": 1.2146642208099365, + "learning_rate": 8.549301999195283e-05, + "loss": 0.4606, + "step": 3885 + }, + { + "epoch": 0.8170961659333752, + "grad_norm": 0.8740483522415161, + "learning_rate": 8.537332801174245e-05, + "loss": 0.4562, + "step": 3900 + }, + { + "epoch": 0.8202388434946575, + "grad_norm": 0.7769590020179749, + "learning_rate": 8.525322882399082e-05, + "loss": 0.4385, + "step": 3915 + }, + { + "epoch": 0.8233815210559396, + "grad_norm": 0.7966271042823792, + "learning_rate": 8.513272381124511e-05, + "loss": 0.4011, + "step": 3930 + }, + { + "epoch": 0.8265241986172219, + "grad_norm": 0.6132526397705078, + "learning_rate": 8.501181436072422e-05, + "loss": 0.393, + "step": 3945 + }, + { + "epoch": 0.8296668761785041, + "grad_norm": 0.6438138484954834, + "learning_rate": 8.489050186430285e-05, + "loss": 0.4226, + "step": 3960 + }, + { + "epoch": 0.8328095537397863, + "grad_norm": 0.8362025022506714, + "learning_rate": 8.476878771849545e-05, + "loss": 0.4216, + "step": 3975 + }, + { + "epoch": 0.8359522313010685, + "grad_norm": 0.770706057548523, + "learning_rate": 8.464667332444012e-05, + "loss": 0.4278, + "step": 3990 + }, + { + "epoch": 0.8390949088623507, + "grad_norm": 0.8944802284240723, + "learning_rate": 8.452416008788254e-05, + "loss": 0.4609, + "step": 4005 + }, + { + "epoch": 0.8422375864236329, + "grad_norm": 0.9292035102844238, + "learning_rate": 8.440124941915972e-05, + "loss": 0.4124, + "step": 4020 + }, + { + "epoch": 0.8453802639849152, + "grad_norm": 0.6450730562210083, + "learning_rate": 8.427794273318377e-05, + "loss": 0.4124, + "step": 4035 + }, + { + "epoch": 0.8485229415461973, + "grad_norm": 1.0732468366622925, + "learning_rate": 8.415424144942569e-05, + "loss": 0.4678, + "step": 4050 + }, + { + "epoch": 0.8516656191074796, + "grad_norm": 0.900360107421875, + "learning_rate": 8.403014699189892e-05, + "loss": 0.4299, + "step": 4065 + }, + { + "epoch": 0.8548082966687618, + "grad_norm": 0.7163972854614258, + "learning_rate": 8.39056607891431e-05, + "loss": 0.4651, + "step": 4080 + }, + { + "epoch": 0.857950974230044, + "grad_norm": 0.6078224182128906, + "learning_rate": 8.378078427420739e-05, + "loss": 0.4612, + "step": 4095 + }, + { + "epoch": 0.8610936517913262, + "grad_norm": 0.7975668907165527, + "learning_rate": 8.365551888463423e-05, + "loss": 0.4521, + "step": 4110 + }, + { + "epoch": 0.8642363293526084, + "grad_norm": 0.7620348930358887, + "learning_rate": 8.352986606244262e-05, + "loss": 0.4527, + "step": 4125 + }, + { + "epoch": 0.8673790069138906, + "grad_norm": 0.7811437249183655, + "learning_rate": 8.340382725411155e-05, + "loss": 0.4639, + "step": 4140 + }, + { + "epoch": 0.8705216844751729, + "grad_norm": 0.46538805961608887, + "learning_rate": 8.327740391056343e-05, + "loss": 0.3793, + "step": 4155 + }, + { + "epoch": 0.873664362036455, + "grad_norm": 0.893225371837616, + "learning_rate": 8.315059748714728e-05, + "loss": 0.4824, + "step": 4170 + }, + { + "epoch": 0.8768070395977373, + "grad_norm": 0.8325145244598389, + "learning_rate": 8.302340944362205e-05, + "loss": 0.4623, + "step": 4185 + }, + { + "epoch": 0.8799497171590195, + "grad_norm": 0.7328510880470276, + "learning_rate": 8.289584124413978e-05, + "loss": 0.4075, + "step": 4200 + }, + { + "epoch": 0.8830923947203017, + "grad_norm": 0.35754507780075073, + "learning_rate": 8.276789435722875e-05, + "loss": 0.3328, + "step": 4215 + }, + { + "epoch": 0.8862350722815839, + "grad_norm": 0.78349369764328, + "learning_rate": 8.263957025577663e-05, + "loss": 0.4962, + "step": 4230 + }, + { + "epoch": 0.8893777498428661, + "grad_norm": 0.644481360912323, + "learning_rate": 8.251087041701339e-05, + "loss": 0.3977, + "step": 4245 + }, + { + "epoch": 0.8925204274041484, + "grad_norm": 0.618881344795227, + "learning_rate": 8.238179632249443e-05, + "loss": 0.3967, + "step": 4260 + }, + { + "epoch": 0.8956631049654306, + "grad_norm": 0.7603642344474792, + "learning_rate": 8.22523494580835e-05, + "loss": 0.4413, + "step": 4275 + }, + { + "epoch": 0.8988057825267127, + "grad_norm": 0.6301630735397339, + "learning_rate": 8.212253131393549e-05, + "loss": 0.4333, + "step": 4290 + }, + { + "epoch": 0.901948460087995, + "grad_norm": 0.7729358077049255, + "learning_rate": 8.199234338447942e-05, + "loss": 0.4633, + "step": 4305 + }, + { + "epoch": 0.9050911376492772, + "grad_norm": 0.9121199250221252, + "learning_rate": 8.186178716840118e-05, + "loss": 0.4411, + "step": 4320 + }, + { + "epoch": 0.9082338152105593, + "grad_norm": 0.5462374091148376, + "learning_rate": 8.17308641686262e-05, + "loss": 0.4659, + "step": 4335 + }, + { + "epoch": 0.9113764927718416, + "grad_norm": 0.7599003911018372, + "learning_rate": 8.15995758923023e-05, + "loss": 0.4015, + "step": 4350 + }, + { + "epoch": 0.9145191703331238, + "grad_norm": 0.8557884693145752, + "learning_rate": 8.14679238507822e-05, + "loss": 0.4574, + "step": 4365 + }, + { + "epoch": 0.9176618478944061, + "grad_norm": 0.7987812757492065, + "learning_rate": 8.133590955960619e-05, + "loss": 0.4501, + "step": 4380 + }, + { + "epoch": 0.9208045254556882, + "grad_norm": 0.8603717088699341, + "learning_rate": 8.120353453848471e-05, + "loss": 0.4201, + "step": 4395 + }, + { + "epoch": 0.9239472030169704, + "grad_norm": 0.7066472768783569, + "learning_rate": 8.107080031128078e-05, + "loss": 0.4035, + "step": 4410 + }, + { + "epoch": 0.9270898805782527, + "grad_norm": 0.6430373191833496, + "learning_rate": 8.09377084059925e-05, + "loss": 0.4141, + "step": 4425 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.6911259889602661, + "learning_rate": 8.080426035473549e-05, + "loss": 0.4431, + "step": 4440 + }, + { + "epoch": 0.933375235700817, + "grad_norm": 0.8445611000061035, + "learning_rate": 8.067045769372515e-05, + "loss": 0.4469, + "step": 4455 + }, + { + "epoch": 0.9365179132620993, + "grad_norm": 0.9317618012428284, + "learning_rate": 8.053630196325914e-05, + "loss": 0.4051, + "step": 4470 + }, + { + "epoch": 0.9396605908233815, + "grad_norm": 0.8286532163619995, + "learning_rate": 8.040179470769946e-05, + "loss": 0.4158, + "step": 4485 + }, + { + "epoch": 0.9428032683846638, + "grad_norm": 0.7000495195388794, + "learning_rate": 8.026693747545486e-05, + "loss": 0.4202, + "step": 4500 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.8104173541069031, + "learning_rate": 8.013173181896283e-05, + "loss": 0.4369, + "step": 4515 + }, + { + "epoch": 0.9490886235072281, + "grad_norm": 0.864750862121582, + "learning_rate": 7.999617929467187e-05, + "loss": 0.4152, + "step": 4530 + }, + { + "epoch": 0.9522313010685104, + "grad_norm": 0.7788864970207214, + "learning_rate": 7.98602814630235e-05, + "loss": 0.492, + "step": 4545 + }, + { + "epoch": 0.9553739786297926, + "grad_norm": 0.707156777381897, + "learning_rate": 7.972403988843435e-05, + "loss": 0.4105, + "step": 4560 + }, + { + "epoch": 0.9585166561910748, + "grad_norm": 0.8454593420028687, + "learning_rate": 7.958745613927809e-05, + "loss": 0.4622, + "step": 4575 + }, + { + "epoch": 0.961659333752357, + "grad_norm": 0.8026373982429504, + "learning_rate": 7.945053178786744e-05, + "loss": 0.4236, + "step": 4590 + }, + { + "epoch": 0.9648020113136392, + "grad_norm": 0.786409318447113, + "learning_rate": 7.931326841043596e-05, + "loss": 0.4677, + "step": 4605 + }, + { + "epoch": 0.9679446888749215, + "grad_norm": 0.5381405353546143, + "learning_rate": 7.917566758712005e-05, + "loss": 0.443, + "step": 4620 + }, + { + "epoch": 0.9710873664362036, + "grad_norm": 0.6609058380126953, + "learning_rate": 7.903773090194069e-05, + "loss": 0.4573, + "step": 4635 + }, + { + "epoch": 0.9742300439974858, + "grad_norm": 0.7192760705947876, + "learning_rate": 7.889945994278514e-05, + "loss": 0.4387, + "step": 4650 + }, + { + "epoch": 0.9773727215587681, + "grad_norm": 0.7502164244651794, + "learning_rate": 7.87608563013888e-05, + "loss": 0.399, + "step": 4665 + }, + { + "epoch": 0.9805153991200503, + "grad_norm": 0.7829092144966125, + "learning_rate": 7.86219215733168e-05, + "loss": 0.3705, + "step": 4680 + }, + { + "epoch": 0.9836580766813325, + "grad_norm": 0.791359007358551, + "learning_rate": 7.848265735794558e-05, + "loss": 0.4434, + "step": 4695 + }, + { + "epoch": 0.9868007542426147, + "grad_norm": 0.7627493739128113, + "learning_rate": 7.834306525844461e-05, + "loss": 0.4496, + "step": 4710 + }, + { + "epoch": 0.9899434318038969, + "grad_norm": 0.679959237575531, + "learning_rate": 7.820314688175784e-05, + "loss": 0.4815, + "step": 4725 + }, + { + "epoch": 0.9930861093651792, + "grad_norm": 0.8766529560089111, + "learning_rate": 7.806290383858523e-05, + "loss": 0.4704, + "step": 4740 + }, + { + "epoch": 0.9962287869264613, + "grad_norm": 1.1642574071884155, + "learning_rate": 7.792233774336423e-05, + "loss": 0.4974, + "step": 4755 + }, + { + "epoch": 0.9993714644877436, + "grad_norm": 0.7194317579269409, + "learning_rate": 7.778145021425114e-05, + "loss": 0.4423, + "step": 4770 + }, + { + "epoch": 1.0025141420490258, + "grad_norm": 0.7814803719520569, + "learning_rate": 7.764024287310252e-05, + "loss": 0.4194, + "step": 4785 + }, + { + "epoch": 1.005656819610308, + "grad_norm": 0.8891781568527222, + "learning_rate": 7.749871734545652e-05, + "loss": 0.3977, + "step": 4800 + }, + { + "epoch": 1.0087994971715901, + "grad_norm": 0.7444355487823486, + "learning_rate": 7.735687526051418e-05, + "loss": 0.3924, + "step": 4815 + }, + { + "epoch": 1.0119421747328725, + "grad_norm": 0.9248786568641663, + "learning_rate": 7.721471825112062e-05, + "loss": 0.4273, + "step": 4830 + }, + { + "epoch": 1.0150848522941547, + "grad_norm": 0.6513450741767883, + "learning_rate": 7.70722479537463e-05, + "loss": 0.3909, + "step": 4845 + }, + { + "epoch": 1.0182275298554369, + "grad_norm": 0.8597205877304077, + "learning_rate": 7.692946600846818e-05, + "loss": 0.4027, + "step": 4860 + }, + { + "epoch": 1.021370207416719, + "grad_norm": 0.9086320996284485, + "learning_rate": 7.678637405895076e-05, + "loss": 0.4225, + "step": 4875 + }, + { + "epoch": 1.0245128849780012, + "grad_norm": 0.8219915628433228, + "learning_rate": 7.66429737524273e-05, + "loss": 0.4055, + "step": 4890 + }, + { + "epoch": 1.0276555625392834, + "grad_norm": 0.9232605695724487, + "learning_rate": 7.649926673968069e-05, + "loss": 0.3801, + "step": 4905 + }, + { + "epoch": 1.0307982401005658, + "grad_norm": 0.8866775035858154, + "learning_rate": 7.635525467502462e-05, + "loss": 0.3887, + "step": 4920 + }, + { + "epoch": 1.033940917661848, + "grad_norm": 0.6395006775856018, + "learning_rate": 7.62109392162844e-05, + "loss": 0.4018, + "step": 4935 + }, + { + "epoch": 1.03708359522313, + "grad_norm": 0.8276055455207825, + "learning_rate": 7.60663220247779e-05, + "loss": 0.3875, + "step": 4950 + }, + { + "epoch": 1.0402262727844123, + "grad_norm": 0.8251763582229614, + "learning_rate": 7.592140476529652e-05, + "loss": 0.3912, + "step": 4965 + }, + { + "epoch": 1.0433689503456944, + "grad_norm": 0.8321304321289062, + "learning_rate": 7.577618910608591e-05, + "loss": 0.4317, + "step": 4980 + }, + { + "epoch": 1.0465116279069768, + "grad_norm": 0.6474670171737671, + "learning_rate": 7.56306767188268e-05, + "loss": 0.4594, + "step": 4995 + }, + { + "epoch": 1.049654305468259, + "grad_norm": 0.6989348530769348, + "learning_rate": 7.548486927861582e-05, + "loss": 0.3744, + "step": 5010 + }, + { + "epoch": 1.0527969830295412, + "grad_norm": 0.8184515237808228, + "learning_rate": 7.533876846394613e-05, + "loss": 0.3364, + "step": 5025 + }, + { + "epoch": 1.0559396605908233, + "grad_norm": 0.7965102195739746, + "learning_rate": 7.519237595668811e-05, + "loss": 0.3934, + "step": 5040 + }, + { + "epoch": 1.0590823381521055, + "grad_norm": 0.731299638748169, + "learning_rate": 7.504569344207007e-05, + "loss": 0.4161, + "step": 5055 + }, + { + "epoch": 1.062225015713388, + "grad_norm": 0.9074578881263733, + "learning_rate": 7.489872260865877e-05, + "loss": 0.4103, + "step": 5070 + }, + { + "epoch": 1.06536769327467, + "grad_norm": 0.8735909461975098, + "learning_rate": 7.475146514834001e-05, + "loss": 0.3686, + "step": 5085 + }, + { + "epoch": 1.0685103708359522, + "grad_norm": 0.7814076542854309, + "learning_rate": 7.460392275629918e-05, + "loss": 0.3943, + "step": 5100 + }, + { + "epoch": 1.0716530483972344, + "grad_norm": 0.8307476043701172, + "learning_rate": 7.445609713100171e-05, + "loss": 0.3999, + "step": 5115 + }, + { + "epoch": 1.0747957259585166, + "grad_norm": 0.7908287048339844, + "learning_rate": 7.430798997417353e-05, + "loss": 0.4104, + "step": 5130 + }, + { + "epoch": 1.077938403519799, + "grad_norm": 0.8598707914352417, + "learning_rate": 7.415960299078143e-05, + "loss": 0.3976, + "step": 5145 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.5163241028785706, + "learning_rate": 7.40109378890136e-05, + "loss": 0.3506, + "step": 5160 + }, + { + "epoch": 1.0842237586423633, + "grad_norm": 0.8642787933349609, + "learning_rate": 7.386199638025973e-05, + "loss": 0.31, + "step": 5175 + }, + { + "epoch": 1.0873664362036455, + "grad_norm": 0.7603743076324463, + "learning_rate": 7.371278017909148e-05, + "loss": 0.4695, + "step": 5190 + }, + { + "epoch": 1.0905091137649277, + "grad_norm": 0.7949853539466858, + "learning_rate": 7.356329100324273e-05, + "loss": 0.4076, + "step": 5205 + }, + { + "epoch": 1.0936517913262098, + "grad_norm": 0.8560110926628113, + "learning_rate": 7.341353057358966e-05, + "loss": 0.3833, + "step": 5220 + }, + { + "epoch": 1.0967944688874922, + "grad_norm": 0.632763147354126, + "learning_rate": 7.326350061413114e-05, + "loss": 0.4128, + "step": 5235 + }, + { + "epoch": 1.0999371464487744, + "grad_norm": 0.9416031837463379, + "learning_rate": 7.311320285196875e-05, + "loss": 0.3665, + "step": 5250 + }, + { + "epoch": 1.1030798240100566, + "grad_norm": 0.6195524334907532, + "learning_rate": 7.296263901728694e-05, + "loss": 0.362, + "step": 5265 + }, + { + "epoch": 1.1062225015713387, + "grad_norm": 0.8545498251914978, + "learning_rate": 7.281181084333311e-05, + "loss": 0.361, + "step": 5280 + }, + { + "epoch": 1.109365179132621, + "grad_norm": 0.75226229429245, + "learning_rate": 7.26607200663977e-05, + "loss": 0.3948, + "step": 5295 + }, + { + "epoch": 1.1125078566939033, + "grad_norm": 0.877756655216217, + "learning_rate": 7.250936842579407e-05, + "loss": 0.4061, + "step": 5310 + }, + { + "epoch": 1.1156505342551855, + "grad_norm": 0.5953283309936523, + "learning_rate": 7.235775766383862e-05, + "loss": 0.3273, + "step": 5325 + }, + { + "epoch": 1.1187932118164676, + "grad_norm": 0.8206706643104553, + "learning_rate": 7.220588952583071e-05, + "loss": 0.3757, + "step": 5340 + }, + { + "epoch": 1.1219358893777498, + "grad_norm": 0.7466344237327576, + "learning_rate": 7.205376576003247e-05, + "loss": 0.3892, + "step": 5355 + }, + { + "epoch": 1.125078566939032, + "grad_norm": 0.8034494519233704, + "learning_rate": 7.190138811764882e-05, + "loss": 0.4043, + "step": 5370 + }, + { + "epoch": 1.1282212445003144, + "grad_norm": 0.9050668478012085, + "learning_rate": 7.174875835280716e-05, + "loss": 0.3812, + "step": 5385 + }, + { + "epoch": 1.1313639220615965, + "grad_norm": 0.8540876507759094, + "learning_rate": 7.159587822253733e-05, + "loss": 0.3645, + "step": 5400 + }, + { + "epoch": 1.1345065996228787, + "grad_norm": 0.7688354849815369, + "learning_rate": 7.14427494867512e-05, + "loss": 0.3683, + "step": 5415 + }, + { + "epoch": 1.1376492771841609, + "grad_norm": 0.6950829029083252, + "learning_rate": 7.128937390822261e-05, + "loss": 0.3347, + "step": 5430 + }, + { + "epoch": 1.140791954745443, + "grad_norm": 0.8212427496910095, + "learning_rate": 7.113575325256694e-05, + "loss": 0.3775, + "step": 5445 + }, + { + "epoch": 1.1439346323067254, + "grad_norm": 0.8312988877296448, + "learning_rate": 7.098188928822084e-05, + "loss": 0.4325, + "step": 5460 + }, + { + "epoch": 1.1470773098680076, + "grad_norm": 0.9646623134613037, + "learning_rate": 7.082778378642184e-05, + "loss": 0.3898, + "step": 5475 + }, + { + "epoch": 1.1502199874292898, + "grad_norm": 0.8333424925804138, + "learning_rate": 7.0673438521188e-05, + "loss": 0.4068, + "step": 5490 + }, + { + "epoch": 1.153362664990572, + "grad_norm": 0.918892502784729, + "learning_rate": 7.051885526929747e-05, + "loss": 0.3968, + "step": 5505 + }, + { + "epoch": 1.156505342551854, + "grad_norm": 0.5460782647132874, + "learning_rate": 7.0364035810268e-05, + "loss": 0.3672, + "step": 5520 + }, + { + "epoch": 1.1596480201131363, + "grad_norm": 0.876811683177948, + "learning_rate": 7.020898192633655e-05, + "loss": 0.408, + "step": 5535 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 0.6740222573280334, + "learning_rate": 7.005369540243864e-05, + "loss": 0.2995, + "step": 5550 + }, + { + "epoch": 1.1659333752357008, + "grad_norm": 0.8702965378761292, + "learning_rate": 6.989817802618792e-05, + "loss": 0.3307, + "step": 5565 + }, + { + "epoch": 1.169076052796983, + "grad_norm": 0.8837511539459229, + "learning_rate": 6.974243158785554e-05, + "loss": 0.3864, + "step": 5580 + }, + { + "epoch": 1.1722187303582652, + "grad_norm": 0.4050454795360565, + "learning_rate": 6.958645788034952e-05, + "loss": 0.3525, + "step": 5595 + }, + { + "epoch": 1.1753614079195476, + "grad_norm": 0.8361005187034607, + "learning_rate": 6.943025869919418e-05, + "loss": 0.3747, + "step": 5610 + }, + { + "epoch": 1.1785040854808297, + "grad_norm": 0.841556191444397, + "learning_rate": 6.92738358425094e-05, + "loss": 0.406, + "step": 5625 + }, + { + "epoch": 1.181646763042112, + "grad_norm": 0.629443883895874, + "learning_rate": 6.911719111098996e-05, + "loss": 0.4175, + "step": 5640 + }, + { + "epoch": 1.184789440603394, + "grad_norm": 0.7146449685096741, + "learning_rate": 6.896032630788476e-05, + "loss": 0.3511, + "step": 5655 + }, + { + "epoch": 1.1879321181646763, + "grad_norm": 0.8358393311500549, + "learning_rate": 6.880324323897617e-05, + "loss": 0.3851, + "step": 5670 + }, + { + "epoch": 1.1910747957259584, + "grad_norm": 0.742857813835144, + "learning_rate": 6.864594371255913e-05, + "loss": 0.3821, + "step": 5685 + }, + { + "epoch": 1.1942174732872408, + "grad_norm": 0.7099196910858154, + "learning_rate": 6.848842953942036e-05, + "loss": 0.3789, + "step": 5700 + }, + { + "epoch": 1.197360150848523, + "grad_norm": 0.754542350769043, + "learning_rate": 6.83307025328176e-05, + "loss": 0.3472, + "step": 5715 + }, + { + "epoch": 1.2005028284098052, + "grad_norm": 0.7466986775398254, + "learning_rate": 6.817276450845856e-05, + "loss": 0.3393, + "step": 5730 + }, + { + "epoch": 1.2036455059710873, + "grad_norm": 0.7026840448379517, + "learning_rate": 6.801461728448022e-05, + "loss": 0.3891, + "step": 5745 + }, + { + "epoch": 1.2067881835323695, + "grad_norm": 1.1348669528961182, + "learning_rate": 6.785626268142777e-05, + "loss": 0.3802, + "step": 5760 + }, + { + "epoch": 1.2099308610936519, + "grad_norm": 0.7511578798294067, + "learning_rate": 6.769770252223369e-05, + "loss": 0.4252, + "step": 5775 + }, + { + "epoch": 1.213073538654934, + "grad_norm": 0.8412914276123047, + "learning_rate": 6.753893863219675e-05, + "loss": 0.3813, + "step": 5790 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.8765383958816528, + "learning_rate": 6.737997283896103e-05, + "loss": 0.3712, + "step": 5805 + }, + { + "epoch": 1.2193588937774984, + "grad_norm": 0.7843053340911865, + "learning_rate": 6.722080697249487e-05, + "loss": 0.3776, + "step": 5820 + }, + { + "epoch": 1.2225015713387806, + "grad_norm": 1.0745536088943481, + "learning_rate": 6.706144286506978e-05, + "loss": 0.3499, + "step": 5835 + }, + { + "epoch": 1.2256442489000627, + "grad_norm": 0.7722020745277405, + "learning_rate": 6.690188235123934e-05, + "loss": 0.4211, + "step": 5850 + }, + { + "epoch": 1.2287869264613451, + "grad_norm": 0.9631087183952332, + "learning_rate": 6.674212726781814e-05, + "loss": 0.3772, + "step": 5865 + }, + { + "epoch": 1.2319296040226273, + "grad_norm": 0.8981698751449585, + "learning_rate": 6.65821794538606e-05, + "loss": 0.4598, + "step": 5880 + }, + { + "epoch": 1.2350722815839095, + "grad_norm": 0.778362512588501, + "learning_rate": 6.642204075063974e-05, + "loss": 0.4179, + "step": 5895 + }, + { + "epoch": 1.2382149591451916, + "grad_norm": 0.8421118259429932, + "learning_rate": 6.626171300162615e-05, + "loss": 0.3583, + "step": 5910 + }, + { + "epoch": 1.241357636706474, + "grad_norm": 1.0227240324020386, + "learning_rate": 6.610119805246653e-05, + "loss": 0.3919, + "step": 5925 + }, + { + "epoch": 1.2445003142677562, + "grad_norm": 0.5748106837272644, + "learning_rate": 6.594049775096268e-05, + "loss": 0.3571, + "step": 5940 + }, + { + "epoch": 1.2476429918290384, + "grad_norm": 0.6924661993980408, + "learning_rate": 6.577961394705008e-05, + "loss": 0.3812, + "step": 5955 + }, + { + "epoch": 1.2507856693903205, + "grad_norm": 0.7702043056488037, + "learning_rate": 6.561854849277664e-05, + "loss": 0.331, + "step": 5970 + }, + { + "epoch": 1.2539283469516027, + "grad_norm": 0.6666329503059387, + "learning_rate": 6.545730324228136e-05, + "loss": 0.3266, + "step": 5985 + }, + { + "epoch": 1.2570710245128849, + "grad_norm": 0.9120034575462341, + "learning_rate": 6.529588005177305e-05, + "loss": 0.4188, + "step": 6000 + }, + { + "epoch": 1.260213702074167, + "grad_norm": 0.7251651287078857, + "learning_rate": 6.513428077950886e-05, + "loss": 0.4067, + "step": 6015 + }, + { + "epoch": 1.2633563796354494, + "grad_norm": 0.6845729947090149, + "learning_rate": 6.497250728577296e-05, + "loss": 0.4266, + "step": 6030 + }, + { + "epoch": 1.2664990571967316, + "grad_norm": 0.7530787587165833, + "learning_rate": 6.481056143285512e-05, + "loss": 0.3302, + "step": 6045 + }, + { + "epoch": 1.2696417347580138, + "grad_norm": 0.7474608421325684, + "learning_rate": 6.464844508502927e-05, + "loss": 0.4305, + "step": 6060 + }, + { + "epoch": 1.2727844123192962, + "grad_norm": 0.8672669529914856, + "learning_rate": 6.448616010853199e-05, + "loss": 0.4267, + "step": 6075 + }, + { + "epoch": 1.2759270898805783, + "grad_norm": 0.7703887224197388, + "learning_rate": 6.432370837154109e-05, + "loss": 0.3531, + "step": 6090 + }, + { + "epoch": 1.2790697674418605, + "grad_norm": 0.7432886958122253, + "learning_rate": 6.416109174415406e-05, + "loss": 0.3189, + "step": 6105 + }, + { + "epoch": 1.2822124450031427, + "grad_norm": 0.9600912928581238, + "learning_rate": 6.399831209836659e-05, + "loss": 0.4036, + "step": 6120 + }, + { + "epoch": 1.2853551225644249, + "grad_norm": 0.7727882862091064, + "learning_rate": 6.383537130805098e-05, + "loss": 0.3857, + "step": 6135 + }, + { + "epoch": 1.288497800125707, + "grad_norm": 0.7628008723258972, + "learning_rate": 6.367227124893455e-05, + "loss": 0.4229, + "step": 6150 + }, + { + "epoch": 1.2916404776869892, + "grad_norm": 0.9682219624519348, + "learning_rate": 6.350901379857814e-05, + "loss": 0.3544, + "step": 6165 + }, + { + "epoch": 1.2947831552482716, + "grad_norm": 0.7553837895393372, + "learning_rate": 6.334560083635434e-05, + "loss": 0.3968, + "step": 6180 + }, + { + "epoch": 1.2979258328095538, + "grad_norm": 0.7951422333717346, + "learning_rate": 6.318203424342605e-05, + "loss": 0.2946, + "step": 6195 + }, + { + "epoch": 1.301068510370836, + "grad_norm": 0.9351706504821777, + "learning_rate": 6.301831590272465e-05, + "loss": 0.4203, + "step": 6210 + }, + { + "epoch": 1.304211187932118, + "grad_norm": 0.8283166289329529, + "learning_rate": 6.28544476989284e-05, + "loss": 0.4166, + "step": 6225 + }, + { + "epoch": 1.3073538654934005, + "grad_norm": 0.7889246940612793, + "learning_rate": 6.269043151844081e-05, + "loss": 0.4084, + "step": 6240 + }, + { + "epoch": 1.3104965430546827, + "grad_norm": 0.7893148064613342, + "learning_rate": 6.252626924936876e-05, + "loss": 0.3327, + "step": 6255 + }, + { + "epoch": 1.3136392206159648, + "grad_norm": 0.9599968194961548, + "learning_rate": 6.236196278150092e-05, + "loss": 0.3987, + "step": 6270 + }, + { + "epoch": 1.316781898177247, + "grad_norm": 0.7326962351799011, + "learning_rate": 6.219751400628593e-05, + "loss": 0.3872, + "step": 6285 + }, + { + "epoch": 1.3199245757385292, + "grad_norm": 0.7666275501251221, + "learning_rate": 6.203292481681061e-05, + "loss": 0.2906, + "step": 6300 + }, + { + "epoch": 1.3230672532998113, + "grad_norm": 0.7648006081581116, + "learning_rate": 6.186819710777819e-05, + "loss": 0.4077, + "step": 6315 + }, + { + "epoch": 1.3262099308610937, + "grad_norm": 0.8993086218833923, + "learning_rate": 6.170333277548653e-05, + "loss": 0.3334, + "step": 6330 + }, + { + "epoch": 1.329352608422376, + "grad_norm": 0.8966405987739563, + "learning_rate": 6.153833371780622e-05, + "loss": 0.3772, + "step": 6345 + }, + { + "epoch": 1.332495285983658, + "grad_norm": 0.955697774887085, + "learning_rate": 6.137320183415877e-05, + "loss": 0.3652, + "step": 6360 + }, + { + "epoch": 1.3356379635449402, + "grad_norm": 0.913931667804718, + "learning_rate": 6.120793902549478e-05, + "loss": 0.3943, + "step": 6375 + }, + { + "epoch": 1.3387806411062226, + "grad_norm": 0.471160352230072, + "learning_rate": 6.1042547194272e-05, + "loss": 0.3656, + "step": 6390 + }, + { + "epoch": 1.3419233186675048, + "grad_norm": 0.7883521914482117, + "learning_rate": 6.0877028244433444e-05, + "loss": 0.3494, + "step": 6405 + }, + { + "epoch": 1.345065996228787, + "grad_norm": 0.8015203475952148, + "learning_rate": 6.071138408138547e-05, + "loss": 0.3498, + "step": 6420 + }, + { + "epoch": 1.3482086737900691, + "grad_norm": 0.8431302905082703, + "learning_rate": 6.0545616611975886e-05, + "loss": 0.3726, + "step": 6435 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.6410717964172363, + "learning_rate": 6.0379727744471936e-05, + "loss": 0.3793, + "step": 6450 + }, + { + "epoch": 1.3544940289126335, + "grad_norm": 0.8410218358039856, + "learning_rate": 6.021371938853839e-05, + "loss": 0.4294, + "step": 6465 + }, + { + "epoch": 1.3576367064739157, + "grad_norm": 0.622178852558136, + "learning_rate": 6.004759345521552e-05, + "loss": 0.3373, + "step": 6480 + }, + { + "epoch": 1.360779384035198, + "grad_norm": 0.8277848362922668, + "learning_rate": 5.988135185689712e-05, + "loss": 0.3796, + "step": 6495 + }, + { + "epoch": 1.3639220615964802, + "grad_norm": 0.799150824546814, + "learning_rate": 5.9714996507308465e-05, + "loss": 0.3361, + "step": 6510 + }, + { + "epoch": 1.3670647391577624, + "grad_norm": 0.8518102765083313, + "learning_rate": 5.954852932148433e-05, + "loss": 0.3913, + "step": 6525 + }, + { + "epoch": 1.3702074167190446, + "grad_norm": 0.7465687990188599, + "learning_rate": 5.9381952215746905e-05, + "loss": 0.3546, + "step": 6540 + }, + { + "epoch": 1.373350094280327, + "grad_norm": 0.7342978119850159, + "learning_rate": 5.921526710768376e-05, + "loss": 0.3832, + "step": 6555 + }, + { + "epoch": 1.3764927718416091, + "grad_norm": 0.6754856109619141, + "learning_rate": 5.9048475916125723e-05, + "loss": 0.4051, + "step": 6570 + }, + { + "epoch": 1.3796354494028913, + "grad_norm": 0.6392863988876343, + "learning_rate": 5.888158056112486e-05, + "loss": 0.3828, + "step": 6585 + }, + { + "epoch": 1.3827781269641735, + "grad_norm": 0.897132933139801, + "learning_rate": 5.871458296393231e-05, + "loss": 0.405, + "step": 6600 + }, + { + "epoch": 1.3859208045254556, + "grad_norm": 0.7124328017234802, + "learning_rate": 5.854748504697624e-05, + "loss": 0.3712, + "step": 6615 + }, + { + "epoch": 1.3890634820867378, + "grad_norm": 0.8436194062232971, + "learning_rate": 5.8380288733839585e-05, + "loss": 0.3773, + "step": 6630 + }, + { + "epoch": 1.3922061596480202, + "grad_norm": 0.780944287776947, + "learning_rate": 5.8212995949238083e-05, + "loss": 0.3529, + "step": 6645 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 1.0335406064987183, + "learning_rate": 5.804560861899795e-05, + "loss": 0.4262, + "step": 6660 + }, + { + "epoch": 1.3984915147705845, + "grad_norm": 0.7593971490859985, + "learning_rate": 5.7878128670033826e-05, + "loss": 0.4079, + "step": 6675 + }, + { + "epoch": 1.4016341923318667, + "grad_norm": 0.7240027189254761, + "learning_rate": 5.7710558030326545e-05, + "loss": 0.3835, + "step": 6690 + }, + { + "epoch": 1.404776869893149, + "grad_norm": 1.530868411064148, + "learning_rate": 5.754289862890093e-05, + "loss": 0.4294, + "step": 6705 + }, + { + "epoch": 1.4079195474544313, + "grad_norm": 0.6043078899383545, + "learning_rate": 5.7375152395803624e-05, + "loss": 0.3343, + "step": 6720 + }, + { + "epoch": 1.4110622250157134, + "grad_norm": 0.8058659434318542, + "learning_rate": 5.720732126208082e-05, + "loss": 0.4533, + "step": 6735 + }, + { + "epoch": 1.4142049025769956, + "grad_norm": 0.7185141444206238, + "learning_rate": 5.7039407159756106e-05, + "loss": 0.42, + "step": 6750 + }, + { + "epoch": 1.4173475801382778, + "grad_norm": 1.0086369514465332, + "learning_rate": 5.687141202180817e-05, + "loss": 0.3701, + "step": 6765 + }, + { + "epoch": 1.42049025769956, + "grad_norm": 1.0289742946624756, + "learning_rate": 5.67033377821485e-05, + "loss": 0.4565, + "step": 6780 + }, + { + "epoch": 1.4236329352608421, + "grad_norm": 1.1389039754867554, + "learning_rate": 5.6535186375599266e-05, + "loss": 0.3555, + "step": 6795 + }, + { + "epoch": 1.4267756128221245, + "grad_norm": 0.887610673904419, + "learning_rate": 5.636695973787093e-05, + "loss": 0.368, + "step": 6810 + }, + { + "epoch": 1.4299182903834067, + "grad_norm": 0.9625629186630249, + "learning_rate": 5.619865980553994e-05, + "loss": 0.3962, + "step": 6825 + }, + { + "epoch": 1.4330609679446888, + "grad_norm": 0.8793766498565674, + "learning_rate": 5.6030288516026564e-05, + "loss": 0.3979, + "step": 6840 + }, + { + "epoch": 1.436203645505971, + "grad_norm": 0.7626388669013977, + "learning_rate": 5.586184780757251e-05, + "loss": 0.345, + "step": 6855 + }, + { + "epoch": 1.4393463230672534, + "grad_norm": 1.109713077545166, + "learning_rate": 5.5693339619218534e-05, + "loss": 0.4446, + "step": 6870 + }, + { + "epoch": 1.4424890006285356, + "grad_norm": 0.9758956432342529, + "learning_rate": 5.552476589078231e-05, + "loss": 0.401, + "step": 6885 + }, + { + "epoch": 1.4456316781898177, + "grad_norm": 0.923329770565033, + "learning_rate": 5.5356128562835904e-05, + "loss": 0.385, + "step": 6900 + }, + { + "epoch": 1.4487743557511, + "grad_norm": 0.7539265155792236, + "learning_rate": 5.518742957668359e-05, + "loss": 0.3274, + "step": 6915 + }, + { + "epoch": 1.451917033312382, + "grad_norm": 0.8187793493270874, + "learning_rate": 5.5018670874339386e-05, + "loss": 0.3677, + "step": 6930 + }, + { + "epoch": 1.4550597108736643, + "grad_norm": 0.9522603750228882, + "learning_rate": 5.484985439850473e-05, + "loss": 0.3319, + "step": 6945 + }, + { + "epoch": 1.4582023884349467, + "grad_norm": 0.8808611631393433, + "learning_rate": 5.468098209254622e-05, + "loss": 0.4311, + "step": 6960 + }, + { + "epoch": 1.4613450659962288, + "grad_norm": 0.6949836611747742, + "learning_rate": 5.4512055900473035e-05, + "loss": 0.3679, + "step": 6975 + }, + { + "epoch": 1.464487743557511, + "grad_norm": 0.783545196056366, + "learning_rate": 5.434307776691479e-05, + "loss": 0.3552, + "step": 6990 + }, + { + "epoch": 1.4676304211187932, + "grad_norm": 0.8342312574386597, + "learning_rate": 5.417404963709894e-05, + "loss": 0.3755, + "step": 7005 + }, + { + "epoch": 1.4707730986800756, + "grad_norm": 0.7615540027618408, + "learning_rate": 5.400497345682857e-05, + "loss": 0.3605, + "step": 7020 + }, + { + "epoch": 1.4739157762413577, + "grad_norm": 0.8944594860076904, + "learning_rate": 5.3835851172459794e-05, + "loss": 0.3948, + "step": 7035 + }, + { + "epoch": 1.47705845380264, + "grad_norm": 0.8412215113639832, + "learning_rate": 5.36666847308796e-05, + "loss": 0.3658, + "step": 7050 + }, + { + "epoch": 1.480201131363922, + "grad_norm": 0.8457724452018738, + "learning_rate": 5.34974760794832e-05, + "loss": 0.4327, + "step": 7065 + }, + { + "epoch": 1.4833438089252042, + "grad_norm": 0.7231891751289368, + "learning_rate": 5.332822716615172e-05, + "loss": 0.3489, + "step": 7080 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.8975026607513428, + "learning_rate": 5.315893993922986e-05, + "loss": 0.331, + "step": 7095 + }, + { + "epoch": 1.4896291640477686, + "grad_norm": 0.871842086315155, + "learning_rate": 5.2989616347503244e-05, + "loss": 0.4056, + "step": 7110 + }, + { + "epoch": 1.492771841609051, + "grad_norm": 0.5846161246299744, + "learning_rate": 5.282025834017623e-05, + "loss": 0.381, + "step": 7125 + }, + { + "epoch": 1.4959145191703331, + "grad_norm": 0.6650387644767761, + "learning_rate": 5.265086786684929e-05, + "loss": 0.34, + "step": 7140 + }, + { + "epoch": 1.4990571967316153, + "grad_norm": 0.862241804599762, + "learning_rate": 5.2481446877496665e-05, + "loss": 0.354, + "step": 7155 + }, + { + "epoch": 1.5021998742928977, + "grad_norm": 0.8328828811645508, + "learning_rate": 5.231199732244386e-05, + "loss": 0.3772, + "step": 7170 + }, + { + "epoch": 1.5053425518541799, + "grad_norm": 0.5438669323921204, + "learning_rate": 5.214252115234527e-05, + "loss": 0.3493, + "step": 7185 + }, + { + "epoch": 1.508485229415462, + "grad_norm": 0.7722681760787964, + "learning_rate": 5.197302031816165e-05, + "loss": 0.3494, + "step": 7200 + }, + { + "epoch": 1.5116279069767442, + "grad_norm": 0.9693325161933899, + "learning_rate": 5.180349677113762e-05, + "loss": 0.3512, + "step": 7215 + }, + { + "epoch": 1.5147705845380264, + "grad_norm": 1.0208348035812378, + "learning_rate": 5.163395246277938e-05, + "loss": 0.2772, + "step": 7230 + }, + { + "epoch": 1.5179132620993085, + "grad_norm": 0.8255509734153748, + "learning_rate": 5.1464389344832024e-05, + "loss": 0.3491, + "step": 7245 + }, + { + "epoch": 1.5210559396605907, + "grad_norm": 0.723574697971344, + "learning_rate": 5.1294809369257244e-05, + "loss": 0.3894, + "step": 7260 + }, + { + "epoch": 1.5241986172218729, + "grad_norm": 0.8955418467521667, + "learning_rate": 5.112521448821076e-05, + "loss": 0.3722, + "step": 7275 + }, + { + "epoch": 1.5273412947831553, + "grad_norm": 0.9446234703063965, + "learning_rate": 5.0955606654019895e-05, + "loss": 0.3602, + "step": 7290 + }, + { + "epoch": 1.5304839723444374, + "grad_norm": 0.7256786227226257, + "learning_rate": 5.078598781916107e-05, + "loss": 0.3488, + "step": 7305 + }, + { + "epoch": 1.5336266499057196, + "grad_norm": 0.775834858417511, + "learning_rate": 5.0616359936237355e-05, + "loss": 0.3983, + "step": 7320 + }, + { + "epoch": 1.536769327467002, + "grad_norm": 0.7684575915336609, + "learning_rate": 5.044672495795598e-05, + "loss": 0.3992, + "step": 7335 + }, + { + "epoch": 1.5399120050282842, + "grad_norm": 0.7569010853767395, + "learning_rate": 5.0277084837105826e-05, + "loss": 0.352, + "step": 7350 + }, + { + "epoch": 1.5430546825895664, + "grad_norm": 0.7330282926559448, + "learning_rate": 5.010744152653501e-05, + "loss": 0.3486, + "step": 7365 + }, + { + "epoch": 1.5461973601508485, + "grad_norm": 0.8921106457710266, + "learning_rate": 4.993779697912837e-05, + "loss": 0.3107, + "step": 7380 + }, + { + "epoch": 1.5493400377121307, + "grad_norm": 0.7190592288970947, + "learning_rate": 4.976815314778493e-05, + "loss": 0.3429, + "step": 7395 + }, + { + "epoch": 1.5524827152734129, + "grad_norm": 0.8145999312400818, + "learning_rate": 4.9598511985395535e-05, + "loss": 0.3455, + "step": 7410 + }, + { + "epoch": 1.555625392834695, + "grad_norm": 0.7628950476646423, + "learning_rate": 4.942887544482029e-05, + "loss": 0.3362, + "step": 7425 + }, + { + "epoch": 1.5587680703959774, + "grad_norm": 0.5859194993972778, + "learning_rate": 4.925924547886603e-05, + "loss": 0.3723, + "step": 7440 + }, + { + "epoch": 1.5619107479572596, + "grad_norm": 0.7906526327133179, + "learning_rate": 4.9089624040264013e-05, + "loss": 0.3511, + "step": 7455 + }, + { + "epoch": 1.5650534255185418, + "grad_norm": 0.7591722011566162, + "learning_rate": 4.892001308164727e-05, + "loss": 0.4439, + "step": 7470 + }, + { + "epoch": 1.5681961030798242, + "grad_norm": 0.9237760901451111, + "learning_rate": 4.875041455552817e-05, + "loss": 0.3638, + "step": 7485 + }, + { + "epoch": 1.5713387806411063, + "grad_norm": 0.734752893447876, + "learning_rate": 4.858083041427599e-05, + "loss": 0.4047, + "step": 7500 + }, + { + "epoch": 1.5744814582023885, + "grad_norm": 0.676703155040741, + "learning_rate": 4.8411262610094445e-05, + "loss": 0.3566, + "step": 7515 + }, + { + "epoch": 1.5776241357636707, + "grad_norm": 0.8751126527786255, + "learning_rate": 4.824171309499913e-05, + "loss": 0.3743, + "step": 7530 + }, + { + "epoch": 1.5807668133249528, + "grad_norm": 0.6884835958480835, + "learning_rate": 4.807218382079511e-05, + "loss": 0.3821, + "step": 7545 + }, + { + "epoch": 1.583909490886235, + "grad_norm": 0.8230961561203003, + "learning_rate": 4.790267673905447e-05, + "loss": 0.3193, + "step": 7560 + }, + { + "epoch": 1.5870521684475172, + "grad_norm": 0.8046270608901978, + "learning_rate": 4.7733193801093803e-05, + "loss": 0.3714, + "step": 7575 + }, + { + "epoch": 1.5901948460087993, + "grad_norm": 0.895897626876831, + "learning_rate": 4.756373695795177e-05, + "loss": 0.386, + "step": 7590 + }, + { + "epoch": 1.5933375235700817, + "grad_norm": 0.8858537077903748, + "learning_rate": 4.7394308160366617e-05, + "loss": 0.3755, + "step": 7605 + }, + { + "epoch": 1.596480201131364, + "grad_norm": 0.6874979138374329, + "learning_rate": 4.722490935875377e-05, + "loss": 0.3547, + "step": 7620 + }, + { + "epoch": 1.5996228786926463, + "grad_norm": 0.8027022480964661, + "learning_rate": 4.705554250318335e-05, + "loss": 0.3702, + "step": 7635 + }, + { + "epoch": 1.6027655562539285, + "grad_norm": 0.9383290410041809, + "learning_rate": 4.688620954335766e-05, + "loss": 0.4038, + "step": 7650 + }, + { + "epoch": 1.6059082338152106, + "grad_norm": 0.8475779294967651, + "learning_rate": 4.671691242858891e-05, + "loss": 0.3257, + "step": 7665 + }, + { + "epoch": 1.6090509113764928, + "grad_norm": 0.702893853187561, + "learning_rate": 4.654765310777659e-05, + "loss": 0.3642, + "step": 7680 + }, + { + "epoch": 1.612193588937775, + "grad_norm": 0.7762289047241211, + "learning_rate": 4.6378433529385157e-05, + "loss": 0.3859, + "step": 7695 + }, + { + "epoch": 1.6153362664990571, + "grad_norm": 0.7309826016426086, + "learning_rate": 4.620925564142151e-05, + "loss": 0.3427, + "step": 7710 + }, + { + "epoch": 1.6184789440603393, + "grad_norm": 0.655974805355072, + "learning_rate": 4.60401213914127e-05, + "loss": 0.3893, + "step": 7725 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.7434260845184326, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.3528, + "step": 7740 + }, + { + "epoch": 1.6247642991829039, + "grad_norm": 0.981696605682373, + "learning_rate": 4.570199159283345e-05, + "loss": 0.3792, + "step": 7755 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 0.5884058475494385, + "learning_rate": 4.553299993671567e-05, + "loss": 0.3082, + "step": 7770 + }, + { + "epoch": 1.6310496543054682, + "grad_norm": 0.9349349737167358, + "learning_rate": 4.536405970341317e-05, + "loss": 0.3736, + "step": 7785 + }, + { + "epoch": 1.6341923318667506, + "grad_norm": 0.8422302603721619, + "learning_rate": 4.519517283771717e-05, + "loss": 0.3897, + "step": 7800 + }, + { + "epoch": 1.6373350094280328, + "grad_norm": 0.7569222450256348, + "learning_rate": 4.502634128380448e-05, + "loss": 0.3581, + "step": 7815 + }, + { + "epoch": 1.640477686989315, + "grad_norm": 0.8034069538116455, + "learning_rate": 4.4857566985215276e-05, + "loss": 0.3542, + "step": 7830 + }, + { + "epoch": 1.6436203645505971, + "grad_norm": 0.5547857284545898, + "learning_rate": 4.4688851884830516e-05, + "loss": 0.3089, + "step": 7845 + }, + { + "epoch": 1.6467630421118793, + "grad_norm": 0.8145669102668762, + "learning_rate": 4.452019792484975e-05, + "loss": 0.3391, + "step": 7860 + }, + { + "epoch": 1.6499057196731615, + "grad_norm": 0.672332227230072, + "learning_rate": 4.4351607046768704e-05, + "loss": 0.3866, + "step": 7875 + }, + { + "epoch": 1.6530483972344436, + "grad_norm": 0.7952318787574768, + "learning_rate": 4.418308119135686e-05, + "loss": 0.4221, + "step": 7890 + }, + { + "epoch": 1.6561910747957258, + "grad_norm": 0.7489158511161804, + "learning_rate": 4.401462229863526e-05, + "loss": 0.3687, + "step": 7905 + }, + { + "epoch": 1.6593337523570082, + "grad_norm": 0.8457122445106506, + "learning_rate": 4.3846232307854e-05, + "loss": 0.3888, + "step": 7920 + }, + { + "epoch": 1.6624764299182904, + "grad_norm": 0.7040199637413025, + "learning_rate": 4.36779131574701e-05, + "loss": 0.3437, + "step": 7935 + }, + { + "epoch": 1.6656191074795728, + "grad_norm": 1.0369516611099243, + "learning_rate": 4.3509666785125005e-05, + "loss": 0.3557, + "step": 7950 + }, + { + "epoch": 1.668761785040855, + "grad_norm": 0.7418217062950134, + "learning_rate": 4.334149512762238e-05, + "loss": 0.351, + "step": 7965 + }, + { + "epoch": 1.671904462602137, + "grad_norm": 0.6527841687202454, + "learning_rate": 4.3173400120905824e-05, + "loss": 0.3286, + "step": 7980 + }, + { + "epoch": 1.6750471401634193, + "grad_norm": 0.9062017798423767, + "learning_rate": 4.3005383700036525e-05, + "loss": 0.3828, + "step": 7995 + }, + { + "epoch": 1.6781898177247014, + "grad_norm": 0.6981047987937927, + "learning_rate": 4.283744779917102e-05, + "loss": 0.3689, + "step": 8010 + }, + { + "epoch": 1.6813324952859836, + "grad_norm": 0.8865767121315002, + "learning_rate": 4.26695943515389e-05, + "loss": 0.3912, + "step": 8025 + }, + { + "epoch": 1.6844751728472658, + "grad_norm": 0.5835604667663574, + "learning_rate": 4.250182528942065e-05, + "loss": 0.317, + "step": 8040 + }, + { + "epoch": 1.687617850408548, + "grad_norm": 0.869529128074646, + "learning_rate": 4.233414254412525e-05, + "loss": 0.4031, + "step": 8055 + }, + { + "epoch": 1.6907605279698303, + "grad_norm": 0.7666299939155579, + "learning_rate": 4.216654804596808e-05, + "loss": 0.3635, + "step": 8070 + }, + { + "epoch": 1.6939032055311125, + "grad_norm": 0.6868289709091187, + "learning_rate": 4.199904372424858e-05, + "loss": 0.3554, + "step": 8085 + }, + { + "epoch": 1.6970458830923947, + "grad_norm": 0.7406291961669922, + "learning_rate": 4.183163150722822e-05, + "loss": 0.3216, + "step": 8100 + }, + { + "epoch": 1.700188560653677, + "grad_norm": 0.7962248921394348, + "learning_rate": 4.166431332210807e-05, + "loss": 0.3398, + "step": 8115 + }, + { + "epoch": 1.7033312382149592, + "grad_norm": 1.02495276927948, + "learning_rate": 4.149709109500678e-05, + "loss": 0.3817, + "step": 8130 + }, + { + "epoch": 1.7064739157762414, + "grad_norm": 0.7741113305091858, + "learning_rate": 4.13299667509384e-05, + "loss": 0.4072, + "step": 8145 + }, + { + "epoch": 1.7096165933375236, + "grad_norm": 0.7952526807785034, + "learning_rate": 4.1162942213790086e-05, + "loss": 0.3441, + "step": 8160 + }, + { + "epoch": 1.7127592708988058, + "grad_norm": 0.7849689722061157, + "learning_rate": 4.0996019406300126e-05, + "loss": 0.3417, + "step": 8175 + }, + { + "epoch": 1.715901948460088, + "grad_norm": 0.7431788444519043, + "learning_rate": 4.082920025003567e-05, + "loss": 0.3995, + "step": 8190 + }, + { + "epoch": 1.71904462602137, + "grad_norm": 0.7709872126579285, + "learning_rate": 4.0662486665370734e-05, + "loss": 0.4069, + "step": 8205 + }, + { + "epoch": 1.7221873035826523, + "grad_norm": 0.6013693809509277, + "learning_rate": 4.049588057146394e-05, + "loss": 0.3877, + "step": 8220 + }, + { + "epoch": 1.7253299811439347, + "grad_norm": 0.7985032796859741, + "learning_rate": 4.032938388623657e-05, + "loss": 0.3407, + "step": 8235 + }, + { + "epoch": 1.7284726587052168, + "grad_norm": 0.6259362101554871, + "learning_rate": 4.01629985263504e-05, + "loss": 0.3167, + "step": 8250 + }, + { + "epoch": 1.7316153362664992, + "grad_norm": 0.7632457613945007, + "learning_rate": 3.999672640718567e-05, + "loss": 0.365, + "step": 8265 + }, + { + "epoch": 1.7347580138277814, + "grad_norm": 0.9532593488693237, + "learning_rate": 3.983056944281901e-05, + "loss": 0.427, + "step": 8280 + }, + { + "epoch": 1.7379006913890636, + "grad_norm": 0.7168596386909485, + "learning_rate": 3.966452954600142e-05, + "loss": 0.3776, + "step": 8295 + }, + { + "epoch": 1.7410433689503457, + "grad_norm": 0.753966748714447, + "learning_rate": 3.94986086281363e-05, + "loss": 0.3792, + "step": 8310 + }, + { + "epoch": 1.744186046511628, + "grad_norm": 0.38063740730285645, + "learning_rate": 3.933280859925734e-05, + "loss": 0.3499, + "step": 8325 + }, + { + "epoch": 1.74732872407291, + "grad_norm": 0.8001086711883545, + "learning_rate": 3.916713136800659e-05, + "loss": 0.3491, + "step": 8340 + }, + { + "epoch": 1.7504714016341922, + "grad_norm": 0.7394033074378967, + "learning_rate": 3.900157884161255e-05, + "loss": 0.3383, + "step": 8355 + }, + { + "epoch": 1.7536140791954744, + "grad_norm": 0.7337818741798401, + "learning_rate": 3.8836152925868114e-05, + "loss": 0.3705, + "step": 8370 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.7671971917152405, + "learning_rate": 3.867085552510864e-05, + "loss": 0.3125, + "step": 8385 + }, + { + "epoch": 1.759899434318039, + "grad_norm": 0.8018542528152466, + "learning_rate": 3.850568854219011e-05, + "loss": 0.3678, + "step": 8400 + }, + { + "epoch": 1.7630421118793211, + "grad_norm": 0.8364083766937256, + "learning_rate": 3.834065387846718e-05, + "loss": 0.4179, + "step": 8415 + }, + { + "epoch": 1.7661847894406035, + "grad_norm": 0.8526837825775146, + "learning_rate": 3.817575343377122e-05, + "loss": 0.3881, + "step": 8430 + }, + { + "epoch": 1.7693274670018857, + "grad_norm": 0.6416676640510559, + "learning_rate": 3.8010989106388554e-05, + "loss": 0.3099, + "step": 8445 + }, + { + "epoch": 1.7724701445631679, + "grad_norm": 0.7990739941596985, + "learning_rate": 3.784636279303858e-05, + "loss": 0.3598, + "step": 8460 + }, + { + "epoch": 1.77561282212445, + "grad_norm": 0.8872657418251038, + "learning_rate": 3.76818763888519e-05, + "loss": 0.3882, + "step": 8475 + }, + { + "epoch": 1.7787554996857322, + "grad_norm": 0.8712546229362488, + "learning_rate": 3.7517531787348484e-05, + "loss": 0.3773, + "step": 8490 + }, + { + "epoch": 1.7818981772470144, + "grad_norm": 0.7423908710479736, + "learning_rate": 3.735333088041596e-05, + "loss": 0.3777, + "step": 8505 + }, + { + "epoch": 1.7850408548082966, + "grad_norm": 0.9166727066040039, + "learning_rate": 3.718927555828779e-05, + "loss": 0.4059, + "step": 8520 + }, + { + "epoch": 1.7881835323695787, + "grad_norm": 0.7207896113395691, + "learning_rate": 3.702536770952148e-05, + "loss": 0.3754, + "step": 8535 + }, + { + "epoch": 1.7913262099308611, + "grad_norm": 0.844727635383606, + "learning_rate": 3.6861609220976846e-05, + "loss": 0.3328, + "step": 8550 + }, + { + "epoch": 1.7944688874921433, + "grad_norm": 0.7674320340156555, + "learning_rate": 3.6698001977794366e-05, + "loss": 0.3806, + "step": 8565 + }, + { + "epoch": 1.7976115650534257, + "grad_norm": 0.6307094693183899, + "learning_rate": 3.6534547863373394e-05, + "loss": 0.3694, + "step": 8580 + }, + { + "epoch": 1.8007542426147078, + "grad_norm": 0.767432451248169, + "learning_rate": 3.63712487593505e-05, + "loss": 0.4028, + "step": 8595 + }, + { + "epoch": 1.80389692017599, + "grad_norm": 0.8937990665435791, + "learning_rate": 3.6208106545577824e-05, + "loss": 0.3372, + "step": 8610 + }, + { + "epoch": 1.8070395977372722, + "grad_norm": 0.590930163860321, + "learning_rate": 3.604512310010146e-05, + "loss": 0.3684, + "step": 8625 + }, + { + "epoch": 1.8101822752985544, + "grad_norm": 0.8184636831283569, + "learning_rate": 3.58823002991398e-05, + "loss": 0.373, + "step": 8640 + }, + { + "epoch": 1.8133249528598365, + "grad_norm": 0.9741955399513245, + "learning_rate": 3.5719640017061885e-05, + "loss": 0.3374, + "step": 8655 + }, + { + "epoch": 1.8164676304211187, + "grad_norm": 1.0014973878860474, + "learning_rate": 3.555714412636595e-05, + "loss": 0.3848, + "step": 8670 + }, + { + "epoch": 1.8196103079824009, + "grad_norm": 0.6335365772247314, + "learning_rate": 3.53948144976578e-05, + "loss": 0.3689, + "step": 8685 + }, + { + "epoch": 1.8227529855436833, + "grad_norm": 0.5687909722328186, + "learning_rate": 3.523265299962924e-05, + "loss": 0.4178, + "step": 8700 + }, + { + "epoch": 1.8258956631049654, + "grad_norm": 0.8622750043869019, + "learning_rate": 3.507066149903662e-05, + "loss": 0.3899, + "step": 8715 + }, + { + "epoch": 1.8290383406662476, + "grad_norm": 0.7984293699264526, + "learning_rate": 3.490884186067935e-05, + "loss": 0.4353, + "step": 8730 + }, + { + "epoch": 1.83218101822753, + "grad_norm": 0.7962972521781921, + "learning_rate": 3.474719594737842e-05, + "loss": 0.3324, + "step": 8745 + }, + { + "epoch": 1.8353236957888122, + "grad_norm": 0.7194257974624634, + "learning_rate": 3.4585725619954864e-05, + "loss": 0.3765, + "step": 8760 + }, + { + "epoch": 1.8384663733500943, + "grad_norm": 0.6931387782096863, + "learning_rate": 3.442443273720853e-05, + "loss": 0.3183, + "step": 8775 + }, + { + "epoch": 1.8416090509113765, + "grad_norm": 0.7540430426597595, + "learning_rate": 3.426331915589651e-05, + "loss": 0.3975, + "step": 8790 + }, + { + "epoch": 1.8447517284726587, + "grad_norm": 0.7310993671417236, + "learning_rate": 3.410238673071185e-05, + "loss": 0.3975, + "step": 8805 + }, + { + "epoch": 1.8478944060339408, + "grad_norm": 0.7351768612861633, + "learning_rate": 3.394163731426216e-05, + "loss": 0.3558, + "step": 8820 + }, + { + "epoch": 1.851037083595223, + "grad_norm": 0.7860934138298035, + "learning_rate": 3.378107275704834e-05, + "loss": 0.3601, + "step": 8835 + }, + { + "epoch": 1.8541797611565052, + "grad_norm": 0.6049594283103943, + "learning_rate": 3.362069490744322e-05, + "loss": 0.3692, + "step": 8850 + }, + { + "epoch": 1.8573224387177876, + "grad_norm": 0.9184178709983826, + "learning_rate": 3.346050561167029e-05, + "loss": 0.3518, + "step": 8865 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 0.7558075189590454, + "learning_rate": 3.3300506713782495e-05, + "loss": 0.3587, + "step": 8880 + }, + { + "epoch": 1.8636077938403521, + "grad_norm": 0.7545658349990845, + "learning_rate": 3.314070005564097e-05, + "loss": 0.3679, + "step": 8895 + }, + { + "epoch": 1.8667504714016343, + "grad_norm": 0.9135695695877075, + "learning_rate": 3.2981087476893853e-05, + "loss": 0.3725, + "step": 8910 + }, + { + "epoch": 1.8698931489629165, + "grad_norm": 0.9788998961448669, + "learning_rate": 3.2821670814955026e-05, + "loss": 0.3149, + "step": 8925 + }, + { + "epoch": 1.8730358265241986, + "grad_norm": 0.7953155636787415, + "learning_rate": 3.266245190498311e-05, + "loss": 0.3461, + "step": 8940 + }, + { + "epoch": 1.8761785040854808, + "grad_norm": 0.9166163802146912, + "learning_rate": 3.250343257986027e-05, + "loss": 0.3866, + "step": 8955 + }, + { + "epoch": 1.879321181646763, + "grad_norm": 0.9379754066467285, + "learning_rate": 3.2344614670171025e-05, + "loss": 0.3928, + "step": 8970 + }, + { + "epoch": 1.8824638592080452, + "grad_norm": 0.8782539963722229, + "learning_rate": 3.2186000004181314e-05, + "loss": 0.3959, + "step": 8985 + }, + { + "epoch": 1.8856065367693273, + "grad_norm": 0.7237117886543274, + "learning_rate": 3.2027590407817407e-05, + "loss": 0.3458, + "step": 9000 + }, + { + "epoch": 1.8887492143306097, + "grad_norm": 0.8787809014320374, + "learning_rate": 3.186938770464486e-05, + "loss": 0.4081, + "step": 9015 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.7628602981567383, + "learning_rate": 3.1711393715847476e-05, + "loss": 0.3928, + "step": 9030 + }, + { + "epoch": 1.895034569453174, + "grad_norm": 0.9172194600105286, + "learning_rate": 3.15536102602065e-05, + "loss": 0.3777, + "step": 9045 + }, + { + "epoch": 1.8981772470144564, + "grad_norm": 0.8413445353507996, + "learning_rate": 3.13960391540795e-05, + "loss": 0.36, + "step": 9060 + }, + { + "epoch": 1.9013199245757386, + "grad_norm": 0.9793257117271423, + "learning_rate": 3.1238682211379586e-05, + "loss": 0.3801, + "step": 9075 + }, + { + "epoch": 1.9044626021370208, + "grad_norm": 0.7620652318000793, + "learning_rate": 3.1081541243554427e-05, + "loss": 0.3689, + "step": 9090 + }, + { + "epoch": 1.907605279698303, + "grad_norm": 0.8353012800216675, + "learning_rate": 3.092461805956551e-05, + "loss": 0.3961, + "step": 9105 + }, + { + "epoch": 1.9107479572595851, + "grad_norm": 0.8704758882522583, + "learning_rate": 3.0767914465867246e-05, + "loss": 0.3168, + "step": 9120 + }, + { + "epoch": 1.9138906348208673, + "grad_norm": 0.6754759550094604, + "learning_rate": 3.061143226638611e-05, + "loss": 0.3407, + "step": 9135 + }, + { + "epoch": 1.9170333123821495, + "grad_norm": 0.9682889580726624, + "learning_rate": 3.0455173262500093e-05, + "loss": 0.4251, + "step": 9150 + }, + { + "epoch": 1.9201759899434316, + "grad_norm": 0.8114556670188904, + "learning_rate": 3.0299139253017695e-05, + "loss": 0.3397, + "step": 9165 + }, + { + "epoch": 1.923318667504714, + "grad_norm": 0.8123522996902466, + "learning_rate": 3.014333203415741e-05, + "loss": 0.3372, + "step": 9180 + }, + { + "epoch": 1.9264613450659962, + "grad_norm": 0.6080268025398254, + "learning_rate": 2.9987753399526934e-05, + "loss": 0.3506, + "step": 9195 + }, + { + "epoch": 1.9296040226272786, + "grad_norm": 0.8804168701171875, + "learning_rate": 2.9832405140102637e-05, + "loss": 0.3689, + "step": 9210 + }, + { + "epoch": 1.9327467001885608, + "grad_norm": 0.8579033613204956, + "learning_rate": 2.9677289044208833e-05, + "loss": 0.3875, + "step": 9225 + }, + { + "epoch": 1.935889377749843, + "grad_norm": 0.9520317316055298, + "learning_rate": 2.952240689749722e-05, + "loss": 0.422, + "step": 9240 + }, + { + "epoch": 1.939032055311125, + "grad_norm": 0.9517824053764343, + "learning_rate": 2.9367760482926393e-05, + "loss": 0.3917, + "step": 9255 + }, + { + "epoch": 1.9421747328724073, + "grad_norm": 0.8813058733940125, + "learning_rate": 2.921335158074122e-05, + "loss": 0.3551, + "step": 9270 + }, + { + "epoch": 1.9453174104336894, + "grad_norm": 0.8402652144432068, + "learning_rate": 2.905918196845242e-05, + "loss": 0.3468, + "step": 9285 + }, + { + "epoch": 1.9484600879949716, + "grad_norm": 0.855032205581665, + "learning_rate": 2.8905253420816035e-05, + "loss": 0.3534, + "step": 9300 + }, + { + "epoch": 1.9516027655562538, + "grad_norm": 0.7760915756225586, + "learning_rate": 2.875156770981311e-05, + "loss": 0.348, + "step": 9315 + }, + { + "epoch": 1.9547454431175362, + "grad_norm": 0.946934163570404, + "learning_rate": 2.8598126604629195e-05, + "loss": 0.3556, + "step": 9330 + }, + { + "epoch": 1.9578881206788183, + "grad_norm": 0.7589976191520691, + "learning_rate": 2.844493187163395e-05, + "loss": 0.3944, + "step": 9345 + }, + { + "epoch": 1.9610307982401005, + "grad_norm": 0.8831868171691895, + "learning_rate": 2.8291985274360983e-05, + "loss": 0.3192, + "step": 9360 + }, + { + "epoch": 1.964173475801383, + "grad_norm": 0.8260477781295776, + "learning_rate": 2.8139288573487337e-05, + "loss": 0.3476, + "step": 9375 + }, + { + "epoch": 1.967316153362665, + "grad_norm": 0.9583712816238403, + "learning_rate": 2.7986843526813343e-05, + "loss": 0.3112, + "step": 9390 + }, + { + "epoch": 1.9704588309239472, + "grad_norm": 0.8534590005874634, + "learning_rate": 2.783465188924239e-05, + "loss": 0.3738, + "step": 9405 + }, + { + "epoch": 1.9736015084852294, + "grad_norm": 0.8562766909599304, + "learning_rate": 2.7682715412760696e-05, + "loss": 0.3831, + "step": 9420 + }, + { + "epoch": 1.9767441860465116, + "grad_norm": 0.649868905544281, + "learning_rate": 2.7531035846417107e-05, + "loss": 0.379, + "step": 9435 + }, + { + "epoch": 1.9798868636077938, + "grad_norm": 0.7702896595001221, + "learning_rate": 2.7379614936302982e-05, + "loss": 0.3617, + "step": 9450 + }, + { + "epoch": 1.983029541169076, + "grad_norm": 0.9378584623336792, + "learning_rate": 2.7228454425532157e-05, + "loss": 0.3681, + "step": 9465 + }, + { + "epoch": 1.9861722187303583, + "grad_norm": 1.0069222450256348, + "learning_rate": 2.7077556054220804e-05, + "loss": 0.3356, + "step": 9480 + }, + { + "epoch": 1.9893148962916405, + "grad_norm": 0.9345496892929077, + "learning_rate": 2.6926921559467412e-05, + "loss": 0.3974, + "step": 9495 + }, + { + "epoch": 1.9924575738529227, + "grad_norm": 0.8090453147888184, + "learning_rate": 2.6776552675332768e-05, + "loss": 0.3397, + "step": 9510 + }, + { + "epoch": 1.995600251414205, + "grad_norm": 0.647416353225708, + "learning_rate": 2.6626451132820085e-05, + "loss": 0.3259, + "step": 9525 + }, + { + "epoch": 1.9987429289754872, + "grad_norm": 0.7810280323028564, + "learning_rate": 2.6476618659855023e-05, + "loss": 0.3234, + "step": 9540 + }, + { + "epoch": 2.0018856065367694, + "grad_norm": 0.7231355309486389, + "learning_rate": 2.6327056981265708e-05, + "loss": 0.3276, + "step": 9555 + }, + { + "epoch": 2.0050282840980516, + "grad_norm": 0.7072864174842834, + "learning_rate": 2.6177767818763062e-05, + "loss": 0.2683, + "step": 9570 + }, + { + "epoch": 2.0081709616593337, + "grad_norm": 0.8502817749977112, + "learning_rate": 2.6028752890920783e-05, + "loss": 0.2844, + "step": 9585 + }, + { + "epoch": 2.011313639220616, + "grad_norm": 0.6001257300376892, + "learning_rate": 2.5880013913155743e-05, + "loss": 0.2582, + "step": 9600 + }, + { + "epoch": 2.014456316781898, + "grad_norm": 1.037467360496521, + "learning_rate": 2.5731552597708086e-05, + "loss": 0.2666, + "step": 9615 + }, + { + "epoch": 2.0175989943431802, + "grad_norm": 0.990047812461853, + "learning_rate": 2.5583370653621652e-05, + "loss": 0.3042, + "step": 9630 + }, + { + "epoch": 2.0207416719044624, + "grad_norm": 1.0518317222595215, + "learning_rate": 2.5435469786724204e-05, + "loss": 0.2543, + "step": 9645 + }, + { + "epoch": 2.023884349465745, + "grad_norm": 1.225774884223938, + "learning_rate": 2.528785169960779e-05, + "loss": 0.3183, + "step": 9660 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.9525572061538696, + "learning_rate": 2.5140518091609256e-05, + "loss": 0.3426, + "step": 9675 + }, + { + "epoch": 2.0301697045883094, + "grad_norm": 1.0750566720962524, + "learning_rate": 2.4993470658790573e-05, + "loss": 0.3172, + "step": 9690 + }, + { + "epoch": 2.0333123821495915, + "grad_norm": 0.8268773555755615, + "learning_rate": 2.484671109391933e-05, + "loss": 0.31, + "step": 9705 + }, + { + "epoch": 2.0364550597108737, + "grad_norm": 0.679678201675415, + "learning_rate": 2.470024108644925e-05, + "loss": 0.2868, + "step": 9720 + }, + { + "epoch": 2.039597737272156, + "grad_norm": 0.997440755367279, + "learning_rate": 2.4554062322500797e-05, + "loss": 0.3291, + "step": 9735 + }, + { + "epoch": 2.042740414833438, + "grad_norm": 0.9968817830085754, + "learning_rate": 2.4408176484841732e-05, + "loss": 0.2664, + "step": 9750 + }, + { + "epoch": 2.04588309239472, + "grad_norm": 1.0939124822616577, + "learning_rate": 2.4262585252867686e-05, + "loss": 0.2895, + "step": 9765 + }, + { + "epoch": 2.0490257699560024, + "grad_norm": 1.0220900774002075, + "learning_rate": 2.4117290302582872e-05, + "loss": 0.3191, + "step": 9780 + }, + { + "epoch": 2.0521684475172846, + "grad_norm": 0.635898768901825, + "learning_rate": 2.397229330658084e-05, + "loss": 0.307, + "step": 9795 + }, + { + "epoch": 2.0553111250785667, + "grad_norm": 1.112257719039917, + "learning_rate": 2.382759593402517e-05, + "loss": 0.2748, + "step": 9810 + }, + { + "epoch": 2.0584538026398493, + "grad_norm": 0.9440275430679321, + "learning_rate": 2.3683199850630213e-05, + "loss": 0.2893, + "step": 9825 + }, + { + "epoch": 2.0615964802011315, + "grad_norm": 1.2118226289749146, + "learning_rate": 2.3539106718642034e-05, + "loss": 0.2791, + "step": 9840 + }, + { + "epoch": 2.0647391577624137, + "grad_norm": 1.1374374628067017, + "learning_rate": 2.339531819681914e-05, + "loss": 0.2777, + "step": 9855 + }, + { + "epoch": 2.067881835323696, + "grad_norm": 0.6932136416435242, + "learning_rate": 2.3251835940413517e-05, + "loss": 0.2828, + "step": 9870 + }, + { + "epoch": 2.071024512884978, + "grad_norm": 1.0308489799499512, + "learning_rate": 2.310866160115146e-05, + "loss": 0.2947, + "step": 9885 + }, + { + "epoch": 2.07416719044626, + "grad_norm": 1.063235878944397, + "learning_rate": 2.2965796827214665e-05, + "loss": 0.3204, + "step": 9900 + }, + { + "epoch": 2.0773098680075424, + "grad_norm": 1.1612193584442139, + "learning_rate": 2.282324326322115e-05, + "loss": 0.2976, + "step": 9915 + }, + { + "epoch": 2.0804525455688245, + "grad_norm": 0.8928938508033752, + "learning_rate": 2.2681002550206355e-05, + "loss": 0.2921, + "step": 9930 + }, + { + "epoch": 2.0835952231301067, + "grad_norm": 1.066124677658081, + "learning_rate": 2.253907632560439e-05, + "loss": 0.298, + "step": 9945 + }, + { + "epoch": 2.086737900691389, + "grad_norm": 0.8713576197624207, + "learning_rate": 2.2397466223228947e-05, + "loss": 0.275, + "step": 9960 + }, + { + "epoch": 2.0898805782526715, + "grad_norm": 1.1056296825408936, + "learning_rate": 2.2256173873254643e-05, + "loss": 0.3266, + "step": 9975 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 0.9172502160072327, + "learning_rate": 2.211520090219821e-05, + "loss": 0.2731, + "step": 9990 + }, + { + "epoch": 2.0951183741881416, + "eval_accuracy": 0.009820309467613697, + "eval_loss": 0.4190310835838318, + "eval_runtime": 424.9528, + "eval_samples_per_second": 11.26, + "eval_steps_per_second": 2.817, + "step": 10000 + } + ], + "logging_steps": 15, + "max_steps": 14319, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9082208625284874e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}