diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3886 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1098, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00546448087431694, + "grad_norm": 0.25466734170913696, + "learning_rate": 3.6363636363636366e-07, + "loss": 2.7662577629089355, + "step": 2 + }, + { + "epoch": 0.01092896174863388, + "grad_norm": 0.7515560388565063, + "learning_rate": 1.090909090909091e-06, + "loss": 1.9380992650985718, + "step": 4 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 0.1754308044910431, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.9071111679077148, + "step": 6 + }, + { + "epoch": 0.02185792349726776, + "grad_norm": 0.16241183876991272, + "learning_rate": 2.5454545454545456e-06, + "loss": 1.805609107017517, + "step": 8 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 0.20075678825378418, + "learning_rate": 3.272727272727273e-06, + "loss": 1.8223026990890503, + "step": 10 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 0.21740451455116272, + "learning_rate": 4.000000000000001e-06, + "loss": 1.6317622661590576, + "step": 12 + }, + { + "epoch": 0.03825136612021858, + "grad_norm": 0.3697981834411621, + "learning_rate": 4.727272727272728e-06, + "loss": 1.529943823814392, + "step": 14 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 0.1746780127286911, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.3002747297286987, + "step": 16 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 0.32101982831954956, + "learning_rate": 6.181818181818182e-06, + "loss": 1.1705005168914795, + "step": 18 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 0.11055011302232742, + "learning_rate": 6.90909090909091e-06, + "loss": 1.4407516717910767, + "step": 20 + }, + { + "epoch": 0.060109289617486336, + "grad_norm": 0.11553582549095154, + "learning_rate": 7.636363636363638e-06, + "loss": 1.3156378269195557, + "step": 22 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 0.09201869368553162, + "learning_rate": 8.363636363636365e-06, + "loss": 1.4316177368164062, + "step": 24 + }, + { + "epoch": 0.07103825136612021, + "grad_norm": 0.3089788854122162, + "learning_rate": 9.090909090909091e-06, + "loss": 1.6130497455596924, + "step": 26 + }, + { + "epoch": 0.07650273224043716, + "grad_norm": 0.10925760865211487, + "learning_rate": 9.81818181818182e-06, + "loss": 1.3906177282333374, + "step": 28 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.11519195884466171, + "learning_rate": 1.0545454545454546e-05, + "loss": 1.2700624465942383, + "step": 30 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 0.15188704431056976, + "learning_rate": 1.1272727272727272e-05, + "loss": 1.3382295370101929, + "step": 32 + }, + { + "epoch": 0.09289617486338798, + "grad_norm": 0.2614683210849762, + "learning_rate": 1.2e-05, + "loss": 1.021398663520813, + "step": 34 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 0.15062183141708374, + "learning_rate": 1.2727272727272728e-05, + "loss": 1.12600576877594, + "step": 36 + }, + { + "epoch": 0.10382513661202186, + "grad_norm": 0.1506468653678894, + "learning_rate": 1.3454545454545455e-05, + "loss": 1.3920522928237915, + "step": 38 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 0.1137060672044754, + "learning_rate": 1.4181818181818183e-05, + "loss": 1.3287949562072754, + "step": 40 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 0.2205193191766739, + "learning_rate": 1.4909090909090911e-05, + "loss": 0.9591144323348999, + "step": 42 + }, + { + "epoch": 0.12021857923497267, + "grad_norm": 0.11356563866138458, + "learning_rate": 1.563636363636364e-05, + "loss": 0.8748141527175903, + "step": 44 + }, + { + "epoch": 0.12568306010928962, + "grad_norm": 0.09480801969766617, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.3462021350860596, + "step": 46 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.08694199472665787, + "learning_rate": 1.7090909090909092e-05, + "loss": 1.3080401420593262, + "step": 48 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 0.16311120986938477, + "learning_rate": 1.781818181818182e-05, + "loss": 1.2748043537139893, + "step": 50 + }, + { + "epoch": 0.14207650273224043, + "grad_norm": 0.12218938022851944, + "learning_rate": 1.8545454545454545e-05, + "loss": 1.247350811958313, + "step": 52 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 0.5999717712402344, + "learning_rate": 1.9272727272727275e-05, + "loss": 0.9628254771232605, + "step": 54 + }, + { + "epoch": 0.15300546448087432, + "grad_norm": 0.11117656528949738, + "learning_rate": 2e-05, + "loss": 1.4265263080596924, + "step": 56 + }, + { + "epoch": 0.15846994535519127, + "grad_norm": 0.1765168160200119, + "learning_rate": 1.9998327792599505e-05, + "loss": 1.4137494564056396, + "step": 58 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 0.10895860195159912, + "learning_rate": 1.999331179179304e-05, + "loss": 1.3032466173171997, + "step": 60 + }, + { + "epoch": 0.16939890710382513, + "grad_norm": 0.15461023151874542, + "learning_rate": 1.9984953861534752e-05, + "loss": 1.5170549154281616, + "step": 62 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 0.21557451784610748, + "learning_rate": 1.997325710764527e-05, + "loss": 1.141064167022705, + "step": 64 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 0.14256301522254944, + "learning_rate": 1.9958225876657575e-05, + "loss": 1.3047987222671509, + "step": 66 + }, + { + "epoch": 0.18579234972677597, + "grad_norm": 0.12212738394737244, + "learning_rate": 1.9939865754201825e-05, + "loss": 1.273051381111145, + "step": 68 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 0.1700674444437027, + "learning_rate": 1.9918183562929717e-05, + "loss": 1.3256914615631104, + "step": 70 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 0.11881201714277267, + "learning_rate": 1.9893187359979183e-05, + "loss": 1.141993522644043, + "step": 72 + }, + { + "epoch": 0.20218579234972678, + "grad_norm": 0.11009667813777924, + "learning_rate": 1.986488643398035e-05, + "loss": 2.432406187057495, + "step": 74 + }, + { + "epoch": 0.20765027322404372, + "grad_norm": 0.1322188526391983, + "learning_rate": 1.9833291301603863e-05, + "loss": 1.325785756111145, + "step": 76 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 0.5970381498336792, + "learning_rate": 1.9798413703652867e-05, + "loss": 1.2421168088912964, + "step": 78 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 0.08399596810340881, + "learning_rate": 1.976026660070012e-05, + "loss": 0.5391249656677246, + "step": 80 + }, + { + "epoch": 0.22404371584699453, + "grad_norm": 0.13643911480903625, + "learning_rate": 1.9718864168271823e-05, + "loss": 1.3353921175003052, + "step": 82 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 0.10358614474534988, + "learning_rate": 1.9674221791579946e-05, + "loss": 0.7132644653320312, + "step": 84 + }, + { + "epoch": 0.23497267759562843, + "grad_norm": 0.1321418434381485, + "learning_rate": 1.9626356059805085e-05, + "loss": 1.2667148113250732, + "step": 86 + }, + { + "epoch": 0.24043715846994534, + "grad_norm": 0.11468269675970078, + "learning_rate": 1.957528475993189e-05, + "loss": 1.2476862668991089, + "step": 88 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 0.5918356776237488, + "learning_rate": 1.952102687013938e-05, + "loss": 0.8590590357780457, + "step": 90 + }, + { + "epoch": 0.25136612021857924, + "grad_norm": 0.36314836144447327, + "learning_rate": 1.946360255274863e-05, + "loss": 1.3177051544189453, + "step": 92 + }, + { + "epoch": 0.2568306010928962, + "grad_norm": 0.29460352659225464, + "learning_rate": 1.9403033146730424e-05, + "loss": 1.1159619092941284, + "step": 94 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.19202913343906403, + "learning_rate": 1.9339341159775647e-05, + "loss": 1.5400570631027222, + "step": 96 + }, + { + "epoch": 0.2677595628415301, + "grad_norm": 0.10346690565347672, + "learning_rate": 1.9272550259931398e-05, + "loss": 0.9743759632110596, + "step": 98 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 0.0876322016119957, + "learning_rate": 1.9202685266805896e-05, + "loss": 1.0140562057495117, + "step": 100 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 0.1559719294309616, + "learning_rate": 1.9129772142345484e-05, + "loss": 1.2331827878952026, + "step": 102 + }, + { + "epoch": 0.28415300546448086, + "grad_norm": 0.12013057619333267, + "learning_rate": 1.9053837981187125e-05, + "loss": 0.9312763214111328, + "step": 104 + }, + { + "epoch": 0.2896174863387978, + "grad_norm": 0.1158582791686058, + "learning_rate": 1.897491100058998e-05, + "loss": 1.2368842363357544, + "step": 106 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 0.1803876906633377, + "learning_rate": 1.8893020529949838e-05, + "loss": 1.394473910331726, + "step": 108 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 0.25411710143089294, + "learning_rate": 1.880819699990027e-05, + "loss": 1.2550570964813232, + "step": 110 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 0.10351160913705826, + "learning_rate": 1.8720471931004526e-05, + "loss": 1.2556068897247314, + "step": 112 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 0.2302931249141693, + "learning_rate": 1.8629877922042485e-05, + "loss": 1.3221981525421143, + "step": 114 + }, + { + "epoch": 0.31693989071038253, + "grad_norm": 0.12091614305973053, + "learning_rate": 1.8536448637896866e-05, + "loss": 1.243666172027588, + "step": 116 + }, + { + "epoch": 0.3224043715846995, + "grad_norm": 0.133016899228096, + "learning_rate": 1.84402187970433e-05, + "loss": 0.9432613253593445, + "step": 118 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 0.4293932616710663, + "learning_rate": 1.834122415864891e-05, + "loss": 1.2485939264297485, + "step": 120 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.3474442660808563, + "learning_rate": 1.8239501509284123e-05, + "loss": 1.3357336521148682, + "step": 122 + }, + { + "epoch": 0.33879781420765026, + "grad_norm": 0.14584510028362274, + "learning_rate": 1.8135088649252725e-05, + "loss": 0.9912499189376831, + "step": 124 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 0.1797986924648285, + "learning_rate": 1.8028024378545224e-05, + "loss": 0.9850834012031555, + "step": 126 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 0.16606612503528595, + "learning_rate": 1.7918348482420692e-05, + "loss": 1.30361008644104, + "step": 128 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 0.10997994989156723, + "learning_rate": 1.7806101716622486e-05, + "loss": 1.2278146743774414, + "step": 130 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 0.10381340980529785, + "learning_rate": 1.7691325792233378e-05, + "loss": 1.17452073097229, + "step": 132 + }, + { + "epoch": 0.366120218579235, + "grad_norm": 0.1617143303155899, + "learning_rate": 1.7574063360175625e-05, + "loss": 1.0645220279693604, + "step": 134 + }, + { + "epoch": 0.37158469945355194, + "grad_norm": 0.20978182554244995, + "learning_rate": 1.745435799536183e-05, + "loss": 0.939949095249176, + "step": 136 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 0.15356120467185974, + "learning_rate": 1.7332254180502407e-05, + "loss": 0.9277665019035339, + "step": 138 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 0.07884714752435684, + "learning_rate": 1.7207797289575777e-05, + "loss": 0.5311431288719177, + "step": 140 + }, + { + "epoch": 0.3879781420765027, + "grad_norm": 0.17396573722362518, + "learning_rate": 1.708103357096728e-05, + "loss": 1.2086787223815918, + "step": 142 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.17656747996807098, + "learning_rate": 1.695201013028322e-05, + "loss": 1.2440763711929321, + "step": 144 + }, + { + "epoch": 0.3989071038251366, + "grad_norm": 0.10827220976352692, + "learning_rate": 1.6820774912846335e-05, + "loss": 1.2088078260421753, + "step": 146 + }, + { + "epoch": 0.40437158469945356, + "grad_norm": 0.1017991229891777, + "learning_rate": 1.668737668587926e-05, + "loss": 1.2017563581466675, + "step": 148 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.12564772367477417, + "learning_rate": 1.655186502038251e-05, + "loss": 1.1395862102508545, + "step": 150 + }, + { + "epoch": 0.41530054644808745, + "grad_norm": 0.09512261301279068, + "learning_rate": 1.641429027271384e-05, + "loss": 1.2394720315933228, + "step": 152 + }, + { + "epoch": 0.4207650273224044, + "grad_norm": 0.2284335345029831, + "learning_rate": 1.6274703565875736e-05, + "loss": 1.2160933017730713, + "step": 154 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 0.0999346673488617, + "learning_rate": 1.613315677051801e-05, + "loss": 1.2623002529144287, + "step": 156 + }, + { + "epoch": 0.43169398907103823, + "grad_norm": 0.19460758566856384, + "learning_rate": 1.598970248566261e-05, + "loss": 0.942702054977417, + "step": 158 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 0.6756497025489807, + "learning_rate": 1.5844394019157697e-05, + "loss": 1.1925864219665527, + "step": 160 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 0.15877823531627655, + "learning_rate": 1.5697285367868393e-05, + "loss": 0.5849096179008484, + "step": 162 + }, + { + "epoch": 0.44808743169398907, + "grad_norm": 0.15297530591487885, + "learning_rate": 1.5548431197611448e-05, + "loss": 1.5736088752746582, + "step": 164 + }, + { + "epoch": 0.453551912568306, + "grad_norm": 0.30064284801483154, + "learning_rate": 1.539788682284133e-05, + "loss": 0.6945763230323792, + "step": 166 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 0.11130578815937042, + "learning_rate": 1.5245708186095275e-05, + "loss": 1.1386805772781372, + "step": 168 + }, + { + "epoch": 0.4644808743169399, + "grad_norm": 0.0942344143986702, + "learning_rate": 1.5091951837204973e-05, + "loss": 1.219637155532837, + "step": 170 + }, + { + "epoch": 0.46994535519125685, + "grad_norm": 0.27630358934402466, + "learning_rate": 1.4936674912282525e-05, + "loss": 1.2905319929122925, + "step": 172 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 0.2305569052696228, + "learning_rate": 1.4779935112488597e-05, + "loss": 1.4264920949935913, + "step": 174 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 0.1555745154619217, + "learning_rate": 1.4621790682590556e-05, + "loss": 1.2014399766921997, + "step": 176 + }, + { + "epoch": 0.48633879781420764, + "grad_norm": 0.17775969207286835, + "learning_rate": 1.4462300389318635e-05, + "loss": 1.1622732877731323, + "step": 178 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.13497698307037354, + "learning_rate": 1.4301523499528099e-05, + "loss": 1.174301266670227, + "step": 180 + }, + { + "epoch": 0.4972677595628415, + "grad_norm": 0.1448894590139389, + "learning_rate": 1.4139519758175602e-05, + "loss": 1.1337958574295044, + "step": 182 + }, + { + "epoch": 0.5027322404371585, + "grad_norm": 0.2996978461742401, + "learning_rate": 1.3976349366117861e-05, + "loss": 1.3451225757598877, + "step": 184 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 0.20243562757968903, + "learning_rate": 1.3812072957740898e-05, + "loss": 1.141178846359253, + "step": 186 + }, + { + "epoch": 0.5136612021857924, + "grad_norm": 0.2731589674949646, + "learning_rate": 1.3646751578428231e-05, + "loss": 1.1806496381759644, + "step": 188 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 0.11246220767498016, + "learning_rate": 1.3480446661876295e-05, + "loss": 1.2235163450241089, + "step": 190 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.1825818121433258, + "learning_rate": 1.3313220007265572e-05, + "loss": 1.1772065162658691, + "step": 192 + }, + { + "epoch": 0.5300546448087432, + "grad_norm": 0.09880517423152924, + "learning_rate": 1.3145133756295936e-05, + "loss": 1.1989942789077759, + "step": 194 + }, + { + "epoch": 0.5355191256830601, + "grad_norm": 0.13294869661331177, + "learning_rate": 1.2976250370094668e-05, + "loss": 1.2548409700393677, + "step": 196 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 0.14556096494197845, + "learning_rate": 1.2806632606005822e-05, + "loss": 1.0411778688430786, + "step": 198 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 0.08977798372507095, + "learning_rate": 1.2636343494269479e-05, + "loss": 1.2309821844100952, + "step": 200 + }, + { + "epoch": 0.5519125683060109, + "grad_norm": 0.2739904522895813, + "learning_rate": 1.2465446314599609e-05, + "loss": 1.0581748485565186, + "step": 202 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 0.38823750615119934, + "learning_rate": 1.2294004572669228e-05, + "loss": 0.8346990346908569, + "step": 204 + }, + { + "epoch": 0.5628415300546448, + "grad_norm": 0.3511664867401123, + "learning_rate": 1.2122081976511581e-05, + "loss": 1.215368390083313, + "step": 206 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 0.08836658298969269, + "learning_rate": 1.1949742412846142e-05, + "loss": 1.0212361812591553, + "step": 208 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 0.15098384022712708, + "learning_rate": 1.177704992333818e-05, + "loss": 1.203520655632019, + "step": 210 + }, + { + "epoch": 0.5792349726775956, + "grad_norm": 0.15484996140003204, + "learning_rate": 1.1604068680800809e-05, + "loss": 1.4717365503311157, + "step": 212 + }, + { + "epoch": 0.5846994535519126, + "grad_norm": 0.09766274690628052, + "learning_rate": 1.1430862965348224e-05, + "loss": 1.187837839126587, + "step": 214 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 0.24046754837036133, + "learning_rate": 1.1257497140509141e-05, + "loss": 1.5469108819961548, + "step": 216 + }, + { + "epoch": 0.5956284153005464, + "grad_norm": 0.09375728666782379, + "learning_rate": 1.1084035629309176e-05, + "loss": 1.1628564596176147, + "step": 218 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 0.25373145937919617, + "learning_rate": 1.0910542890331162e-05, + "loss": 1.2555179595947266, + "step": 220 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 0.11713021248579025, + "learning_rate": 1.0737083393762213e-05, + "loss": 1.0514938831329346, + "step": 222 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 0.27219468355178833, + "learning_rate": 1.0563721597436525e-05, + "loss": 1.0099146366119385, + "step": 224 + }, + { + "epoch": 0.6174863387978142, + "grad_norm": 0.2724650204181671, + "learning_rate": 1.039052192288271e-05, + "loss": 1.2064918279647827, + "step": 226 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 0.3544275760650635, + "learning_rate": 1.0217548731384677e-05, + "loss": 1.2222727537155151, + "step": 228 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 0.10047902166843414, + "learning_rate": 1.0044866300064842e-05, + "loss": 0.6619182825088501, + "step": 230 + }, + { + "epoch": 0.6338797814207651, + "grad_norm": 0.1497628390789032, + "learning_rate": 9.872538797998672e-06, + "loss": 1.2109484672546387, + "step": 232 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 0.25251680612564087, + "learning_rate": 9.700630262369337e-06, + "loss": 0.8720806837081909, + "step": 234 + }, + { + "epoch": 0.644808743169399, + "grad_norm": 0.3295302093029022, + "learning_rate": 9.529204574671391e-06, + "loss": 0.7993478178977966, + "step": 236 + }, + { + "epoch": 0.6502732240437158, + "grad_norm": 0.2178184986114502, + "learning_rate": 9.3583254369723e-06, + "loss": 1.282462239265442, + "step": 238 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.9069607257843018, + "learning_rate": 9.188056348240655e-06, + "loss": 1.2156498432159424, + "step": 240 + }, + { + "epoch": 0.6612021857923497, + "grad_norm": 0.175712451338768, + "learning_rate": 9.018460580749842e-06, + "loss": 1.1425039768218994, + "step": 242 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.16185066103935242, + "learning_rate": 8.849601156565972e-06, + "loss": 0.9641758799552917, + "step": 244 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 0.11022090166807175, + "learning_rate": 8.68154082412877e-06, + "loss": 1.1621077060699463, + "step": 246 + }, + { + "epoch": 0.6775956284153005, + "grad_norm": 0.12610098719596863, + "learning_rate": 8.514342034934159e-06, + "loss": 1.2263044118881226, + "step": 248 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 0.11280061304569244, + "learning_rate": 8.348066920327163e-06, + "loss": 1.193541407585144, + "step": 250 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 0.15432216227054596, + "learning_rate": 8.182777268413822e-06, + "loss": 1.0459729433059692, + "step": 252 + }, + { + "epoch": 0.6939890710382514, + "grad_norm": 0.09402735531330109, + "learning_rate": 8.018534501100611e-06, + "loss": 1.1481513977050781, + "step": 254 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 0.16630694270133972, + "learning_rate": 7.855399651269982e-06, + "loss": 1.1385172605514526, + "step": 256 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 0.13937430083751678, + "learning_rate": 7.6934333401004e-06, + "loss": 1.087058663368225, + "step": 258 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 0.1182989850640297, + "learning_rate": 7.53269575453947e-06, + "loss": 1.1448062658309937, + "step": 260 + }, + { + "epoch": 0.7158469945355191, + "grad_norm": 0.15738993883132935, + "learning_rate": 7.373246624938324e-06, + "loss": 1.2247816324234009, + "step": 262 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 0.08486029505729675, + "learning_rate": 7.215145202855746e-06, + "loss": 1.1820663213729858, + "step": 264 + }, + { + "epoch": 0.726775956284153, + "grad_norm": 0.08663325756788254, + "learning_rate": 7.0584502390401865e-06, + "loss": 1.001596450805664, + "step": 266 + }, + { + "epoch": 0.73224043715847, + "grad_norm": 0.3671172559261322, + "learning_rate": 6.903219961597891e-06, + "loss": 1.1261245012283325, + "step": 268 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.7661083936691284, + "learning_rate": 6.7495120543552475e-06, + "loss": 1.171128511428833, + "step": 270 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 0.1354326456785202, + "learning_rate": 6.59738363542336e-06, + "loss": 1.1350687742233276, + "step": 272 + }, + { + "epoch": 0.7486338797814208, + "grad_norm": 0.1703643500804901, + "learning_rate": 6.446891235972894e-06, + "loss": 1.1624363660812378, + "step": 274 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 0.11271324008703232, + "learning_rate": 6.298090779226977e-06, + "loss": 1.2078073024749756, + "step": 276 + }, + { + "epoch": 0.7595628415300546, + "grad_norm": 0.15839166939258575, + "learning_rate": 6.151037559680047e-06, + "loss": 1.1765800714492798, + "step": 278 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 0.1356314718723297, + "learning_rate": 6.005786222550319e-06, + "loss": 0.7959855794906616, + "step": 280 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 0.1252662092447281, + "learning_rate": 5.8623907434735515e-06, + "loss": 1.2027335166931152, + "step": 282 + }, + { + "epoch": 0.7759562841530054, + "grad_norm": 0.11218217760324478, + "learning_rate": 5.720904408445589e-06, + "loss": 1.1648608446121216, + "step": 284 + }, + { + "epoch": 0.7814207650273224, + "grad_norm": 0.12568356096744537, + "learning_rate": 5.581379794021202e-06, + "loss": 1.1810613870620728, + "step": 286 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.37117138504981995, + "learning_rate": 5.443868747776579e-06, + "loss": 1.148728847503662, + "step": 288 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 0.09700842201709747, + "learning_rate": 5.308422369042644e-06, + "loss": 1.0601937770843506, + "step": 290 + }, + { + "epoch": 0.7978142076502732, + "grad_norm": 2.1059610843658447, + "learning_rate": 5.175090989916483e-06, + "loss": 1.1344964504241943, + "step": 292 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 0.15745721757411957, + "learning_rate": 5.043924156557844e-06, + "loss": 1.1499444246292114, + "step": 294 + }, + { + "epoch": 0.8087431693989071, + "grad_norm": 0.17999355494976044, + "learning_rate": 4.914970610777725e-06, + "loss": 1.4922648668289185, + "step": 296 + }, + { + "epoch": 0.8142076502732241, + "grad_norm": 1.251465916633606, + "learning_rate": 4.788278271925802e-06, + "loss": 0.8081488013267517, + "step": 298 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.11098221689462662, + "learning_rate": 4.663894219083548e-06, + "loss": 1.5515803098678589, + "step": 300 + }, + { + "epoch": 0.825136612021858, + "grad_norm": 0.1215251013636589, + "learning_rate": 4.541864673569551e-06, + "loss": 1.1884891986846924, + "step": 302 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 0.11028771847486496, + "learning_rate": 4.422234981763613e-06, + "loss": 0.7176008224487305, + "step": 304 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 0.44399887323379517, + "learning_rate": 4.305049598255946e-06, + "loss": 1.520970106124878, + "step": 306 + }, + { + "epoch": 0.8415300546448088, + "grad_norm": 0.3971186578273773, + "learning_rate": 4.190352069327777e-06, + "loss": 0.8190991282463074, + "step": 308 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 0.13053210079669952, + "learning_rate": 4.078185016769484e-06, + "loss": 1.185870885848999, + "step": 310 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 0.19934000074863434, + "learning_rate": 3.968590122042265e-06, + "loss": 1.1845072507858276, + "step": 312 + }, + { + "epoch": 0.8579234972677595, + "grad_norm": 0.11231542378664017, + "learning_rate": 3.861608110789228e-06, + "loss": 1.195508599281311, + "step": 314 + }, + { + "epoch": 0.8633879781420765, + "grad_norm": 0.20832614600658417, + "learning_rate": 3.757278737701697e-06, + "loss": 1.2070856094360352, + "step": 316 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 0.15592867136001587, + "learning_rate": 3.6556407717462856e-06, + "loss": 1.026039719581604, + "step": 318 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 0.13188278675079346, + "learning_rate": 3.5567319817582944e-06, + "loss": 1.371668815612793, + "step": 320 + }, + { + "epoch": 0.8797814207650273, + "grad_norm": 0.10445486754179001, + "learning_rate": 3.4605891224067423e-06, + "loss": 1.0845658779144287, + "step": 322 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 0.2320113480091095, + "learning_rate": 3.3672479205362764e-06, + "loss": 0.45486584305763245, + "step": 324 + }, + { + "epoch": 0.8907103825136612, + "grad_norm": 0.12340901046991348, + "learning_rate": 3.276743061891014e-06, + "loss": 1.1611615419387817, + "step": 326 + }, + { + "epoch": 0.8961748633879781, + "grad_norm": 0.4475763738155365, + "learning_rate": 3.1891081782252726e-06, + "loss": 1.044767141342163, + "step": 328 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.19148172438144684, + "learning_rate": 3.1043758348059384e-06, + "loss": 1.2087295055389404, + "step": 330 + }, + { + "epoch": 0.907103825136612, + "grad_norm": 0.20862823724746704, + "learning_rate": 3.0225775183111784e-06, + "loss": 1.248867392539978, + "step": 332 + }, + { + "epoch": 0.912568306010929, + "grad_norm": 0.11509155482053757, + "learning_rate": 2.943743625129917e-06, + "loss": 0.9973466396331787, + "step": 334 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.10567598789930344, + "learning_rate": 2.867903450066513e-06, + "loss": 1.1415375471115112, + "step": 336 + }, + { + "epoch": 0.9234972677595629, + "grad_norm": 0.17730312049388885, + "learning_rate": 2.795085175454741e-06, + "loss": 1.1459949016571045, + "step": 338 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 0.202916219830513, + "learning_rate": 2.7253158606851983e-06, + "loss": 1.1517586708068848, + "step": 340 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 0.14017514884471893, + "learning_rate": 2.6586214321499952e-06, + "loss": 1.2387312650680542, + "step": 342 + }, + { + "epoch": 0.9398907103825137, + "grad_norm": 0.3595944941043854, + "learning_rate": 2.5950266736084558e-06, + "loss": 1.1301581859588623, + "step": 344 + }, + { + "epoch": 0.9453551912568307, + "grad_norm": 0.14350801706314087, + "learning_rate": 2.5345552169774413e-06, + "loss": 1.5077723264694214, + "step": 346 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 0.22342601418495178, + "learning_rate": 2.477229533549685e-06, + "loss": 0.7664333581924438, + "step": 348 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 0.09661981463432312, + "learning_rate": 2.423070925643422e-06, + "loss": 0.756706178188324, + "step": 350 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 0.08707380294799805, + "learning_rate": 2.372099518686416e-06, + "loss": 1.1494992971420288, + "step": 352 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 0.08544003218412399, + "learning_rate": 2.324334253737321e-06, + "loss": 0.7893691062927246, + "step": 354 + }, + { + "epoch": 0.9726775956284153, + "grad_norm": 0.12082793563604355, + "learning_rate": 2.2797928804471413e-06, + "loss": 1.0328351259231567, + "step": 356 + }, + { + "epoch": 0.9781420765027322, + "grad_norm": 0.11313844472169876, + "learning_rate": 2.2384919504634465e-06, + "loss": 0.6884475946426392, + "step": 358 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.15572156012058258, + "learning_rate": 2.2004468112797345e-06, + "loss": 1.1554869413375854, + "step": 360 + }, + { + "epoch": 0.9890710382513661, + "grad_norm": 0.5943642854690552, + "learning_rate": 2.165671600532298e-06, + "loss": 0.8945509791374207, + "step": 362 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 0.11099452525377274, + "learning_rate": 2.134179240746638e-06, + "loss": 1.134549617767334, + "step": 364 + }, + { + "epoch": 1.0, + "grad_norm": 0.1020437702536583, + "learning_rate": 2.1059814345354434e-06, + "loss": 1.1114323139190674, + "step": 366 + }, + { + "epoch": 1.005464480874317, + "grad_norm": 0.1056353822350502, + "learning_rate": 2.0810886602498733e-06, + "loss": 1.0149329900741577, + "step": 368 + }, + { + "epoch": 1.010928961748634, + "grad_norm": 0.0995660275220871, + "learning_rate": 2.059510168085791e-06, + "loss": 1.0914201736450195, + "step": 370 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 0.11507035046815872, + "learning_rate": 2.0412539766463697e-06, + "loss": 1.0559698343276978, + "step": 372 + }, + { + "epoch": 1.0218579234972678, + "grad_norm": 0.11144529283046722, + "learning_rate": 2.0263268699623746e-06, + "loss": 1.0247032642364502, + "step": 374 + }, + { + "epoch": 1.0273224043715847, + "grad_norm": 0.24577614665031433, + "learning_rate": 2.0147343949711965e-06, + "loss": 0.818856418132782, + "step": 376 + }, + { + "epoch": 1.0327868852459017, + "grad_norm": 0.17577126622200012, + "learning_rate": 2.0064808594556066e-06, + "loss": 1.0067471265792847, + "step": 378 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 0.1252138763666153, + "learning_rate": 2.0015693304429757e-06, + "loss": 0.6751760840415955, + "step": 380 + }, + { + "epoch": 1.0437158469945356, + "grad_norm": 0.29696500301361084, + "learning_rate": 2.000001633065562e-06, + "loss": 0.9073832631111145, + "step": 382 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 0.11161016672849655, + "learning_rate": 2.0017783498822896e-06, + "loss": 1.0375909805297852, + "step": 384 + }, + { + "epoch": 1.0546448087431695, + "grad_norm": 0.14083045721054077, + "learning_rate": 2.006898820662268e-06, + "loss": 0.6373446583747864, + "step": 386 + }, + { + "epoch": 1.0601092896174864, + "grad_norm": 0.9407859444618225, + "learning_rate": 2.0153611426301325e-06, + "loss": 0.513404369354248, + "step": 388 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.2054053246974945, + "learning_rate": 2.027162171173126e-06, + "loss": 1.122929573059082, + "step": 390 + }, + { + "epoch": 1.0710382513661203, + "grad_norm": 0.14908728003501892, + "learning_rate": 2.0422975210096317e-06, + "loss": 0.9799243807792664, + "step": 392 + }, + { + "epoch": 1.0765027322404372, + "grad_norm": 0.5126852989196777, + "learning_rate": 2.0607615678187605e-06, + "loss": 0.5636054277420044, + "step": 394 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 0.23881854116916656, + "learning_rate": 2.082547450330353e-06, + "loss": 1.0122809410095215, + "step": 396 + }, + { + "epoch": 1.0874316939890711, + "grad_norm": 0.14029423892498016, + "learning_rate": 2.1076470728746407e-06, + "loss": 1.0476458072662354, + "step": 398 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 0.23039187490940094, + "learning_rate": 2.136051108390608e-06, + "loss": 1.099108338356018, + "step": 400 + }, + { + "epoch": 1.098360655737705, + "grad_norm": 0.2124757319688797, + "learning_rate": 2.167749001891944e-06, + "loss": 0.941685140132904, + "step": 402 + }, + { + "epoch": 1.1038251366120218, + "grad_norm": 0.12573638558387756, + "learning_rate": 2.202728974389296e-06, + "loss": 1.043279767036438, + "step": 404 + }, + { + "epoch": 1.1092896174863387, + "grad_norm": 0.6265265941619873, + "learning_rate": 2.240978027267357e-06, + "loss": 1.12416410446167, + "step": 406 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 0.6654527187347412, + "learning_rate": 2.2824819471151736e-06, + "loss": 0.6356739401817322, + "step": 408 + }, + { + "epoch": 1.1202185792349726, + "grad_norm": 0.10931819677352905, + "learning_rate": 2.327225311007878e-06, + "loss": 1.186576247215271, + "step": 410 + }, + { + "epoch": 1.1256830601092895, + "grad_norm": 0.26741528511047363, + "learning_rate": 2.3751914922378623e-06, + "loss": 0.6458518505096436, + "step": 412 + }, + { + "epoch": 1.1311475409836065, + "grad_norm": 0.12008926272392273, + "learning_rate": 2.4263626664932998e-06, + "loss": 0.73695307970047, + "step": 414 + }, + { + "epoch": 1.1366120218579234, + "grad_norm": 0.23112480342388153, + "learning_rate": 2.4807198184816817e-06, + "loss": 0.6092378497123718, + "step": 416 + }, + { + "epoch": 1.1420765027322404, + "grad_norm": 0.17145994305610657, + "learning_rate": 2.5382427489959373e-06, + "loss": 1.0369067192077637, + "step": 418 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.14935195446014404, + "learning_rate": 2.5989100824204876e-06, + "loss": 0.8705060482025146, + "step": 420 + }, + { + "epoch": 1.1530054644808743, + "grad_norm": 0.11667577922344208, + "learning_rate": 2.662699274674462e-06, + "loss": 1.0095133781433105, + "step": 422 + }, + { + "epoch": 1.1584699453551912, + "grad_norm": 0.14248596131801605, + "learning_rate": 2.7295866215891107e-06, + "loss": 0.7098819017410278, + "step": 424 + }, + { + "epoch": 1.1639344262295082, + "grad_norm": 0.09048722684383392, + "learning_rate": 2.799547267716326e-06, + "loss": 0.6562694311141968, + "step": 426 + }, + { + "epoch": 1.169398907103825, + "grad_norm": 0.18223534524440765, + "learning_rate": 2.872555215564946e-06, + "loss": 1.1051146984100342, + "step": 428 + }, + { + "epoch": 1.174863387978142, + "grad_norm": 0.17947758734226227, + "learning_rate": 2.9485833352614895e-06, + "loss": 1.0301352739334106, + "step": 430 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 0.16275085508823395, + "learning_rate": 3.027603374631647e-06, + "loss": 0.9983370304107666, + "step": 432 + }, + { + "epoch": 1.185792349726776, + "grad_norm": 0.37115633487701416, + "learning_rate": 3.1095859696988273e-06, + "loss": 0.7356457710266113, + "step": 434 + }, + { + "epoch": 1.1912568306010929, + "grad_norm": 0.1627861112356186, + "learning_rate": 3.1945006555958885e-06, + "loss": 1.066249966621399, + "step": 436 + }, + { + "epoch": 1.1967213114754098, + "grad_norm": 0.08598164469003677, + "learning_rate": 3.2823158778858976e-06, + "loss": 0.9931395649909973, + "step": 438 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 0.20133376121520996, + "learning_rate": 3.372999004287839e-06, + "loss": 0.8884505033493042, + "step": 440 + }, + { + "epoch": 1.2076502732240437, + "grad_norm": 0.1404731720685959, + "learning_rate": 3.4665163368028044e-06, + "loss": 1.0539674758911133, + "step": 442 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 0.13216909766197205, + "learning_rate": 3.562833124236238e-06, + "loss": 1.0272293090820312, + "step": 444 + }, + { + "epoch": 1.2185792349726776, + "grad_norm": 0.09942961484193802, + "learning_rate": 3.6619135751115325e-06, + "loss": 0.5767819881439209, + "step": 446 + }, + { + "epoch": 1.2240437158469946, + "grad_norm": 0.1461215615272522, + "learning_rate": 3.763720870970201e-06, + "loss": 0.9723450541496277, + "step": 448 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 0.539188802242279, + "learning_rate": 3.86821718005367e-06, + "loss": 1.0379064083099365, + "step": 450 + }, + { + "epoch": 1.2349726775956285, + "grad_norm": 0.1771521270275116, + "learning_rate": 3.975363671361641e-06, + "loss": 0.9944263696670532, + "step": 452 + }, + { + "epoch": 1.2404371584699454, + "grad_norm": 0.15524691343307495, + "learning_rate": 4.0851205290817254e-06, + "loss": 0.3145610988140106, + "step": 454 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 0.27492523193359375, + "learning_rate": 4.197446967385105e-06, + "loss": 0.7276042103767395, + "step": 456 + }, + { + "epoch": 1.2513661202185793, + "grad_norm": 0.36312294006347656, + "learning_rate": 4.312301245582571e-06, + "loss": 0.599205493927002, + "step": 458 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 0.12119816243648529, + "learning_rate": 4.429640683635466e-06, + "loss": 1.0522774457931519, + "step": 460 + }, + { + "epoch": 1.2622950819672132, + "grad_norm": 0.2748584747314453, + "learning_rate": 4.549421678015633e-06, + "loss": 0.6640270352363586, + "step": 462 + }, + { + "epoch": 1.2677595628415301, + "grad_norm": 0.29243332147598267, + "learning_rate": 4.671599717908582e-06, + "loss": 0.9783296585083008, + "step": 464 + }, + { + "epoch": 1.273224043715847, + "grad_norm": 0.2020227164030075, + "learning_rate": 4.796129401753752e-06, + "loss": 1.0049470663070679, + "step": 466 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 0.10091632604598999, + "learning_rate": 4.922964454115837e-06, + "loss": 0.9500146508216858, + "step": 468 + }, + { + "epoch": 1.2841530054644807, + "grad_norm": 0.15708519518375397, + "learning_rate": 5.0520577428807835e-06, + "loss": 0.9865996837615967, + "step": 470 + }, + { + "epoch": 1.289617486338798, + "grad_norm": 0.18902111053466797, + "learning_rate": 5.183361296770197e-06, + "loss": 1.3013302087783813, + "step": 472 + }, + { + "epoch": 1.2950819672131146, + "grad_norm": 0.15106923878192902, + "learning_rate": 5.316826323167505e-06, + "loss": 0.9929630160331726, + "step": 474 + }, + { + "epoch": 1.3005464480874318, + "grad_norm": 0.18238449096679688, + "learning_rate": 5.4524032262494175e-06, + "loss": 0.5130822658538818, + "step": 476 + }, + { + "epoch": 1.3060109289617485, + "grad_norm": 0.12673920392990112, + "learning_rate": 5.590041625415783e-06, + "loss": 1.0614360570907593, + "step": 478 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.13248680531978607, + "learning_rate": 5.7296903740111076e-06, + "loss": 1.066565752029419, + "step": 480 + }, + { + "epoch": 1.3169398907103824, + "grad_norm": 0.11556583642959595, + "learning_rate": 5.87129757833077e-06, + "loss": 0.8726215362548828, + "step": 482 + }, + { + "epoch": 1.3224043715846996, + "grad_norm": 0.23890437185764313, + "learning_rate": 6.014810616904747e-06, + "loss": 0.6259342432022095, + "step": 484 + }, + { + "epoch": 1.3278688524590163, + "grad_norm": 0.19614221155643463, + "learning_rate": 6.160176160051906e-06, + "loss": 0.9607924222946167, + "step": 486 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.12293434143066406, + "learning_rate": 6.307340189697344e-06, + "loss": 1.0918643474578857, + "step": 488 + }, + { + "epoch": 1.3387978142076502, + "grad_norm": 0.4499771296977997, + "learning_rate": 6.456248019445626e-06, + "loss": 0.6185516119003296, + "step": 490 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 0.11939382553100586, + "learning_rate": 6.606844314902321e-06, + "loss": 1.0591408014297485, + "step": 492 + }, + { + "epoch": 1.349726775956284, + "grad_norm": 0.0514669343829155, + "learning_rate": 6.7590731142363915e-06, + "loss": 0.8746728897094727, + "step": 494 + }, + { + "epoch": 1.355191256830601, + "grad_norm": 0.1703745722770691, + "learning_rate": 6.912877848975638e-06, + "loss": 1.0505259037017822, + "step": 496 + }, + { + "epoch": 1.360655737704918, + "grad_norm": 0.2271912842988968, + "learning_rate": 7.068201365027712e-06, + "loss": 0.888632595539093, + "step": 498 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 0.19478864967823029, + "learning_rate": 7.2249859439185875e-06, + "loss": 1.2047241926193237, + "step": 500 + }, + { + "epoch": 1.3715846994535519, + "grad_norm": 0.19422981142997742, + "learning_rate": 7.3831733242409285e-06, + "loss": 0.9257214069366455, + "step": 502 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 0.1071656197309494, + "learning_rate": 7.5427047233040485e-06, + "loss": 0.5969194173812866, + "step": 504 + }, + { + "epoch": 1.3825136612021858, + "grad_norm": 0.15044689178466797, + "learning_rate": 7.703520858977702e-06, + "loss": 0.8187384009361267, + "step": 506 + }, + { + "epoch": 1.3879781420765027, + "grad_norm": 0.15576544404029846, + "learning_rate": 7.865561971721389e-06, + "loss": 1.0144892930984497, + "step": 508 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 0.16742777824401855, + "learning_rate": 8.02876784679115e-06, + "loss": 1.0460524559020996, + "step": 510 + }, + { + "epoch": 1.3989071038251366, + "grad_norm": 0.09324585646390915, + "learning_rate": 8.193077836615386e-06, + "loss": 0.954236626625061, + "step": 512 + }, + { + "epoch": 1.4043715846994536, + "grad_norm": 0.056807976216077805, + "learning_rate": 8.35843088333168e-06, + "loss": 0.7853972315788269, + "step": 514 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 0.09889108687639236, + "learning_rate": 8.524765541475935e-06, + "loss": 0.7305459976196289, + "step": 516 + }, + { + "epoch": 1.4153005464480874, + "grad_norm": 0.09034071862697601, + "learning_rate": 8.692020000815627e-06, + "loss": 1.0243470668792725, + "step": 518 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 0.27085310220718384, + "learning_rate": 8.860132109318622e-06, + "loss": 0.6142768859863281, + "step": 520 + }, + { + "epoch": 1.4262295081967213, + "grad_norm": 0.15680420398712158, + "learning_rate": 9.029039396248916e-06, + "loss": 1.0847419500350952, + "step": 522 + }, + { + "epoch": 1.4316939890710383, + "grad_norm": 0.38691043853759766, + "learning_rate": 9.198679095380924e-06, + "loss": 0.9868083596229553, + "step": 524 + }, + { + "epoch": 1.4371584699453552, + "grad_norm": 0.2185872495174408, + "learning_rate": 9.368988168323451e-06, + "loss": 0.959234356880188, + "step": 526 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 0.15318599343299866, + "learning_rate": 9.539903327944926e-06, + "loss": 0.6547545790672302, + "step": 528 + }, + { + "epoch": 1.4480874316939891, + "grad_norm": 0.2598091661930084, + "learning_rate": 9.711361061890942e-06, + "loss": 0.7329921722412109, + "step": 530 + }, + { + "epoch": 1.453551912568306, + "grad_norm": 0.8608449101448059, + "learning_rate": 9.8832976561856e-06, + "loss": 0.6598019599914551, + "step": 532 + }, + { + "epoch": 1.459016393442623, + "grad_norm": 0.11225418001413345, + "learning_rate": 1.0055649218907688e-05, + "loss": 1.0508843660354614, + "step": 534 + }, + { + "epoch": 1.46448087431694, + "grad_norm": 0.16358889639377594, + "learning_rate": 1.0228351703933075e-05, + "loss": 1.0427049398422241, + "step": 536 + }, + { + "epoch": 1.469945355191257, + "grad_norm": 0.10920920968055725, + "learning_rate": 1.0401340934734287e-05, + "loss": 1.0863603353500366, + "step": 538 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 0.11286948621273041, + "learning_rate": 1.0574552628228691e-05, + "loss": 1.0349069833755493, + "step": 540 + }, + { + "epoch": 1.4808743169398908, + "grad_norm": 0.12279583513736725, + "learning_rate": 1.0747922418666115e-05, + "loss": 1.0405045747756958, + "step": 542 + }, + { + "epoch": 1.4863387978142075, + "grad_norm": 0.1787271350622177, + "learning_rate": 1.0921385881547311e-05, + "loss": 0.893326461315155, + "step": 544 + }, + { + "epoch": 1.4918032786885247, + "grad_norm": 0.23371008038520813, + "learning_rate": 1.1094878557564217e-05, + "loss": 1.134016513824463, + "step": 546 + }, + { + "epoch": 1.4972677595628414, + "grad_norm": 0.5070299506187439, + "learning_rate": 1.1268335976553098e-05, + "loss": 0.3684206008911133, + "step": 548 + }, + { + "epoch": 1.5027322404371586, + "grad_norm": 0.11523240804672241, + "learning_rate": 1.144169368145179e-05, + "loss": 1.0376405715942383, + "step": 550 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 0.1450342983007431, + "learning_rate": 1.1614887252252076e-05, + "loss": 1.0525881052017212, + "step": 552 + }, + { + "epoch": 1.5136612021857925, + "grad_norm": 0.1141965240240097, + "learning_rate": 1.1787852329938198e-05, + "loss": 1.0597068071365356, + "step": 554 + }, + { + "epoch": 1.5191256830601092, + "grad_norm": 0.10889161378145218, + "learning_rate": 1.1960524640402862e-05, + "loss": 1.1244990825653076, + "step": 556 + }, + { + "epoch": 1.5245901639344264, + "grad_norm": 0.34256356954574585, + "learning_rate": 1.2132840018331514e-05, + "loss": 0.7808042764663696, + "step": 558 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 0.15591228008270264, + "learning_rate": 1.2304734431046335e-05, + "loss": 0.5251950025558472, + "step": 560 + }, + { + "epoch": 1.5355191256830603, + "grad_norm": 0.27672791481018066, + "learning_rate": 1.2476144002300864e-05, + "loss": 0.889411985874176, + "step": 562 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 0.15575620532035828, + "learning_rate": 1.264700503601655e-05, + "loss": 0.7815580368041992, + "step": 564 + }, + { + "epoch": 1.5464480874316942, + "grad_norm": 0.12009876221418381, + "learning_rate": 1.2817254039952253e-05, + "loss": 1.0448942184448242, + "step": 566 + }, + { + "epoch": 1.5519125683060109, + "grad_norm": 0.20458699762821198, + "learning_rate": 1.2986827749298138e-05, + "loss": 0.7897263169288635, + "step": 568 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 0.16608871519565582, + "learning_rate": 1.3155663150184942e-05, + "loss": 1.145054578781128, + "step": 570 + }, + { + "epoch": 1.5628415300546448, + "grad_norm": 0.875208854675293, + "learning_rate": 1.3323697503100035e-05, + "loss": 1.0145237445831299, + "step": 572 + }, + { + "epoch": 1.5683060109289617, + "grad_norm": 0.07434671372175217, + "learning_rate": 1.3490868366201527e-05, + "loss": 1.0764760971069336, + "step": 574 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 0.33626464009284973, + "learning_rate": 1.3657113618521763e-05, + "loss": 0.7892511487007141, + "step": 576 + }, + { + "epoch": 1.5792349726775956, + "grad_norm": 0.14133699238300323, + "learning_rate": 1.3822371483051593e-05, + "loss": 1.094011902809143, + "step": 578 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 0.06214473024010658, + "learning_rate": 1.3986580549696777e-05, + "loss": 0.5204868912696838, + "step": 580 + }, + { + "epoch": 1.5901639344262295, + "grad_norm": 0.10695597529411316, + "learning_rate": 1.4149679798098097e-05, + "loss": 0.6737838983535767, + "step": 582 + }, + { + "epoch": 1.5956284153005464, + "grad_norm": 0.1742696315050125, + "learning_rate": 1.4311608620306626e-05, + "loss": 0.6672943830490112, + "step": 584 + }, + { + "epoch": 1.6010928961748634, + "grad_norm": 0.11992673575878143, + "learning_rate": 1.447230684330573e-05, + "loss": 1.0576859712600708, + "step": 586 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 0.09333043545484543, + "learning_rate": 1.4631714751371456e-05, + "loss": 1.0113446712493896, + "step": 588 + }, + { + "epoch": 1.6120218579234973, + "grad_norm": 0.08862563222646713, + "learning_rate": 1.4789773108263016e-05, + "loss": 0.7977145314216614, + "step": 590 + }, + { + "epoch": 1.6174863387978142, + "grad_norm": 0.09326809644699097, + "learning_rate": 1.4946423179235068e-05, + "loss": 1.077767252922058, + "step": 592 + }, + { + "epoch": 1.6229508196721312, + "grad_norm": 1.102271556854248, + "learning_rate": 1.5101606752863606e-05, + "loss": 0.8312599062919617, + "step": 594 + }, + { + "epoch": 1.6284153005464481, + "grad_norm": 0.12699337303638458, + "learning_rate": 1.5255266162677466e-05, + "loss": 0.3103083670139313, + "step": 596 + }, + { + "epoch": 1.633879781420765, + "grad_norm": 0.19393262267112732, + "learning_rate": 1.540734430858725e-05, + "loss": 1.186922550201416, + "step": 598 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 0.13817955553531647, + "learning_rate": 1.5557784678103852e-05, + "loss": 1.1361396312713623, + "step": 600 + }, + { + "epoch": 1.644808743169399, + "grad_norm": 0.11928141117095947, + "learning_rate": 1.5706531367338546e-05, + "loss": 1.0724998712539673, + "step": 602 + }, + { + "epoch": 1.650273224043716, + "grad_norm": 0.29511600732803345, + "learning_rate": 1.5853529101776985e-05, + "loss": 1.1022595167160034, + "step": 604 + }, + { + "epoch": 1.6557377049180326, + "grad_norm": 0.1801217645406723, + "learning_rate": 1.5998723256819298e-05, + "loss": 1.1575597524642944, + "step": 606 + }, + { + "epoch": 1.6612021857923498, + "grad_norm": 0.13056795299053192, + "learning_rate": 1.614205987807872e-05, + "loss": 0.9522284269332886, + "step": 608 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.10384301096200943, + "learning_rate": 1.628348570143105e-05, + "loss": 1.1397171020507812, + "step": 610 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 0.4214940071105957, + "learning_rate": 1.6422948172807745e-05, + "loss": 0.6794083118438721, + "step": 612 + }, + { + "epoch": 1.6775956284153004, + "grad_norm": 0.11825451999902725, + "learning_rate": 1.6560395467725086e-05, + "loss": 1.0216362476348877, + "step": 614 + }, + { + "epoch": 1.6830601092896176, + "grad_norm": 0.10102783888578415, + "learning_rate": 1.6695776510542253e-05, + "loss": 1.1043617725372314, + "step": 616 + }, + { + "epoch": 1.6885245901639343, + "grad_norm": 0.39107218384742737, + "learning_rate": 1.6829040993441085e-05, + "loss": 0.6350641846656799, + "step": 618 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 0.20578241348266602, + "learning_rate": 1.696013939512057e-05, + "loss": 1.098326325416565, + "step": 620 + }, + { + "epoch": 1.6994535519125682, + "grad_norm": 0.146490678191185, + "learning_rate": 1.7089022999199064e-05, + "loss": 1.2161624431610107, + "step": 622 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 0.16978783905506134, + "learning_rate": 1.7215643912317323e-05, + "loss": 0.7078676223754883, + "step": 624 + }, + { + "epoch": 1.710382513661202, + "grad_norm": 0.08843975514173508, + "learning_rate": 1.73399550819358e-05, + "loss": 0.6535137891769409, + "step": 626 + }, + { + "epoch": 1.7158469945355193, + "grad_norm": 0.08357886970043182, + "learning_rate": 1.746191031381943e-05, + "loss": 1.08620023727417, + "step": 628 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 0.07610905915498734, + "learning_rate": 1.7581464289203475e-05, + "loss": 0.7033694386482239, + "step": 630 + }, + { + "epoch": 1.7267759562841531, + "grad_norm": 0.08110252767801285, + "learning_rate": 1.7698572581634083e-05, + "loss": 1.0666342973709106, + "step": 632 + }, + { + "epoch": 1.7322404371584699, + "grad_norm": 0.12325672805309296, + "learning_rate": 1.781319167347718e-05, + "loss": 1.1530389785766602, + "step": 634 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 0.09884843975305557, + "learning_rate": 1.7925278972089748e-05, + "loss": 0.9689383506774902, + "step": 636 + }, + { + "epoch": 1.7431693989071038, + "grad_norm": 0.1466217190027237, + "learning_rate": 1.8034792825647287e-05, + "loss": 1.014373779296875, + "step": 638 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 0.286197692155838, + "learning_rate": 1.8141692538621716e-05, + "loss": 0.9180911183357239, + "step": 640 + }, + { + "epoch": 1.7540983606557377, + "grad_norm": 0.20502841472625732, + "learning_rate": 1.8245938386903896e-05, + "loss": 1.1219123601913452, + "step": 642 + }, + { + "epoch": 1.7595628415300546, + "grad_norm": 0.09796746075153351, + "learning_rate": 1.8347491632565156e-05, + "loss": 1.0469398498535156, + "step": 644 + }, + { + "epoch": 1.7650273224043715, + "grad_norm": 0.05621729791164398, + "learning_rate": 1.8446314538252407e-05, + "loss": 0.6228929162025452, + "step": 646 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 0.21613766252994537, + "learning_rate": 1.8542370381211374e-05, + "loss": 1.0182948112487793, + "step": 648 + }, + { + "epoch": 1.7759562841530054, + "grad_norm": 0.12267361581325531, + "learning_rate": 1.8635623466932843e-05, + "loss": 1.0775526762008667, + "step": 650 + }, + { + "epoch": 1.7814207650273224, + "grad_norm": 0.08999711275100708, + "learning_rate": 1.8726039142416796e-05, + "loss": 1.1355713605880737, + "step": 652 + }, + { + "epoch": 1.7868852459016393, + "grad_norm": 0.12249153107404709, + "learning_rate": 1.881358380904954e-05, + "loss": 0.9938265681266785, + "step": 654 + }, + { + "epoch": 1.7923497267759563, + "grad_norm": 0.30137544870376587, + "learning_rate": 1.889822493508897e-05, + "loss": 0.9946900010108948, + "step": 656 + }, + { + "epoch": 1.7978142076502732, + "grad_norm": 0.8713083863258362, + "learning_rate": 1.897993106775346e-05, + "loss": 0.9141952991485596, + "step": 658 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 1.5324283838272095, + "learning_rate": 1.9058671844909742e-05, + "loss": 1.079390525817871, + "step": 660 + }, + { + "epoch": 1.8087431693989071, + "grad_norm": 0.2170521765947342, + "learning_rate": 1.9134418006355532e-05, + "loss": 1.1160584688186646, + "step": 662 + }, + { + "epoch": 1.814207650273224, + "grad_norm": 0.14580038189888, + "learning_rate": 1.9207141404692667e-05, + "loss": 1.0880683660507202, + "step": 664 + }, + { + "epoch": 1.819672131147541, + "grad_norm": 0.10861846804618835, + "learning_rate": 1.927681501578672e-05, + "loss": 1.4276467561721802, + "step": 666 + }, + { + "epoch": 1.825136612021858, + "grad_norm": 0.07761558145284653, + "learning_rate": 1.934341294880924e-05, + "loss": 1.1454670429229736, + "step": 668 + }, + { + "epoch": 1.830601092896175, + "grad_norm": 1.6005865335464478, + "learning_rate": 1.9406910455858783e-05, + "loss": 0.7400512099266052, + "step": 670 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 0.21596352756023407, + "learning_rate": 1.9467283941157304e-05, + "loss": 1.2740768194198608, + "step": 672 + }, + { + "epoch": 1.8415300546448088, + "grad_norm": 0.09257059544324875, + "learning_rate": 1.952451096981838e-05, + "loss": 1.1218076944351196, + "step": 674 + }, + { + "epoch": 1.8469945355191257, + "grad_norm": 0.0891253873705864, + "learning_rate": 1.957857027618405e-05, + "loss": 1.1445086002349854, + "step": 676 + }, + { + "epoch": 1.8524590163934427, + "grad_norm": 0.03821035847067833, + "learning_rate": 1.9629441771727166e-05, + "loss": 0.24173806607723236, + "step": 678 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 0.11186153441667557, + "learning_rate": 1.9677106552516317e-05, + "loss": 1.0836678743362427, + "step": 680 + }, + { + "epoch": 1.8633879781420766, + "grad_norm": 0.09181064367294312, + "learning_rate": 1.9721546906240577e-05, + "loss": 1.086242914199829, + "step": 682 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 0.12407488375902176, + "learning_rate": 1.976274631879142e-05, + "loss": 1.0517431497573853, + "step": 684 + }, + { + "epoch": 1.8743169398907105, + "grad_norm": 0.6223235726356506, + "learning_rate": 1.9800689480399383e-05, + "loss": 0.7017934918403625, + "step": 686 + }, + { + "epoch": 1.8797814207650272, + "grad_norm": 0.13767960667610168, + "learning_rate": 1.9835362291323222e-05, + "loss": 1.1536179780960083, + "step": 688 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 0.07212366908788681, + "learning_rate": 1.9866751867089363e-05, + "loss": 1.1063528060913086, + "step": 690 + }, + { + "epoch": 1.890710382513661, + "grad_norm": 0.14884713292121887, + "learning_rate": 1.9894846543279838e-05, + "loss": 0.7882944941520691, + "step": 692 + }, + { + "epoch": 1.8961748633879782, + "grad_norm": 0.11355551332235336, + "learning_rate": 1.991963587986677e-05, + "loss": 1.1226757764816284, + "step": 694 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 0.0906631201505661, + "learning_rate": 1.9941110665091922e-05, + "loss": 1.1270076036453247, + "step": 696 + }, + { + "epoch": 1.9071038251366121, + "grad_norm": 0.1339959055185318, + "learning_rate": 1.9959262918889774e-05, + "loss": 1.000802993774414, + "step": 698 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 0.06641183793544769, + "learning_rate": 1.9974085895852973e-05, + "loss": 1.1165053844451904, + "step": 700 + }, + { + "epoch": 1.918032786885246, + "grad_norm": 0.0869305208325386, + "learning_rate": 1.99855740877389e-05, + "loss": 1.0390774011611938, + "step": 702 + }, + { + "epoch": 1.9234972677595628, + "grad_norm": 0.13396522402763367, + "learning_rate": 1.9993723225516553e-05, + "loss": 1.1478846073150635, + "step": 704 + }, + { + "epoch": 1.92896174863388, + "grad_norm": 0.09904704242944717, + "learning_rate": 1.9998530280952938e-05, + "loss": 1.1377912759780884, + "step": 706 + }, + { + "epoch": 1.9344262295081966, + "grad_norm": 0.09361512959003448, + "learning_rate": 1.9999993467738345e-05, + "loss": 1.172654151916504, + "step": 708 + }, + { + "epoch": 1.9398907103825138, + "grad_norm": 0.09594118595123291, + "learning_rate": 1.9998112242150162e-05, + "loss": 0.893622100353241, + "step": 710 + }, + { + "epoch": 1.9453551912568305, + "grad_norm": 0.17272037267684937, + "learning_rate": 1.999288730325491e-05, + "loss": 0.610184371471405, + "step": 712 + }, + { + "epoch": 1.9508196721311475, + "grad_norm": 0.08503665030002594, + "learning_rate": 1.9984320592648474e-05, + "loss": 1.1162121295928955, + "step": 714 + }, + { + "epoch": 1.9562841530054644, + "grad_norm": 0.10477106273174286, + "learning_rate": 1.9972415293734607e-05, + "loss": 0.8185825943946838, + "step": 716 + }, + { + "epoch": 1.9617486338797814, + "grad_norm": 0.09983915090560913, + "learning_rate": 1.995717583054196e-05, + "loss": 1.1097755432128906, + "step": 718 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 0.10685881227254868, + "learning_rate": 1.9938607866080114e-05, + "loss": 1.1009999513626099, + "step": 720 + }, + { + "epoch": 1.9726775956284153, + "grad_norm": 0.2453712522983551, + "learning_rate": 1.991671830023521e-05, + "loss": 1.1338697671890259, + "step": 722 + }, + { + "epoch": 1.9781420765027322, + "grad_norm": 0.08788733184337616, + "learning_rate": 1.989151526720591e-05, + "loss": 1.0071790218353271, + "step": 724 + }, + { + "epoch": 1.9836065573770492, + "grad_norm": 0.18296760320663452, + "learning_rate": 1.986300813248073e-05, + "loss": 1.0721267461776733, + "step": 726 + }, + { + "epoch": 1.989071038251366, + "grad_norm": 0.08143667131662369, + "learning_rate": 1.9831207489357825e-05, + "loss": 1.0946375131607056, + "step": 728 + }, + { + "epoch": 1.994535519125683, + "grad_norm": 0.1328088641166687, + "learning_rate": 1.979612515500847e-05, + "loss": 1.0233464241027832, + "step": 730 + }, + { + "epoch": 2.0, + "grad_norm": 0.2603437006473541, + "learning_rate": 1.97577741660858e-05, + "loss": 1.1380540132522583, + "step": 732 + }, + { + "epoch": 2.0054644808743167, + "grad_norm": 0.0938432514667511, + "learning_rate": 1.9716168773880382e-05, + "loss": 0.4869334399700165, + "step": 734 + }, + { + "epoch": 2.010928961748634, + "grad_norm": 0.15073847770690918, + "learning_rate": 1.9671324439024374e-05, + "loss": 0.8568433523178101, + "step": 736 + }, + { + "epoch": 2.0163934426229506, + "grad_norm": 0.1493358016014099, + "learning_rate": 1.9623257825746357e-05, + "loss": 0.8497028946876526, + "step": 738 + }, + { + "epoch": 2.021857923497268, + "grad_norm": 0.11990120261907578, + "learning_rate": 1.9571986795678878e-05, + "loss": 0.8644115924835205, + "step": 740 + }, + { + "epoch": 2.0273224043715845, + "grad_norm": 0.6325967311859131, + "learning_rate": 1.951753040122102e-05, + "loss": 0.559238076210022, + "step": 742 + }, + { + "epoch": 2.0327868852459017, + "grad_norm": 0.3195425868034363, + "learning_rate": 1.9459908878458532e-05, + "loss": 0.9297751188278198, + "step": 744 + }, + { + "epoch": 2.0382513661202184, + "grad_norm": 0.11186884343624115, + "learning_rate": 1.939914363964402e-05, + "loss": 0.7149038314819336, + "step": 746 + }, + { + "epoch": 2.0437158469945356, + "grad_norm": 0.12103074043989182, + "learning_rate": 1.9335257265240168e-05, + "loss": 1.0219731330871582, + "step": 748 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 0.12488316744565964, + "learning_rate": 1.9268273495528768e-05, + "loss": 0.644228994846344, + "step": 750 + }, + { + "epoch": 2.0546448087431695, + "grad_norm": 0.1779249608516693, + "learning_rate": 1.9198217221788806e-05, + "loss": 0.5474087595939636, + "step": 752 + }, + { + "epoch": 2.060109289617486, + "grad_norm": 0.4773656725883484, + "learning_rate": 1.9125114477046807e-05, + "loss": 0.6247202754020691, + "step": 754 + }, + { + "epoch": 2.0655737704918034, + "grad_norm": 0.7489643692970276, + "learning_rate": 1.9048992426402947e-05, + "loss": 1.2044504880905151, + "step": 756 + }, + { + "epoch": 2.07103825136612, + "grad_norm": 0.3501991033554077, + "learning_rate": 1.896987935693643e-05, + "loss": 0.9943795204162598, + "step": 758 + }, + { + "epoch": 2.0765027322404372, + "grad_norm": 0.13691188395023346, + "learning_rate": 1.888780466719397e-05, + "loss": 0.8962281346321106, + "step": 760 + }, + { + "epoch": 2.081967213114754, + "grad_norm": 0.20719239115715027, + "learning_rate": 1.8802798856265254e-05, + "loss": 0.9988420009613037, + "step": 762 + }, + { + "epoch": 2.087431693989071, + "grad_norm": 0.10589145869016647, + "learning_rate": 1.8714893512449424e-05, + "loss": 0.9582113027572632, + "step": 764 + }, + { + "epoch": 2.092896174863388, + "grad_norm": 0.09363008290529251, + "learning_rate": 1.8624121301516808e-05, + "loss": 1.0755091905593872, + "step": 766 + }, + { + "epoch": 2.098360655737705, + "grad_norm": 0.1002376526594162, + "learning_rate": 1.853051595457026e-05, + "loss": 0.8220664262771606, + "step": 768 + }, + { + "epoch": 2.1038251366120218, + "grad_norm": 0.11234933882951736, + "learning_rate": 1.843411225551065e-05, + "loss": 0.9647861123085022, + "step": 770 + }, + { + "epoch": 2.109289617486339, + "grad_norm": 1.0863375663757324, + "learning_rate": 1.8334946028111088e-05, + "loss": 0.8694019317626953, + "step": 772 + }, + { + "epoch": 2.1147540983606556, + "grad_norm": 0.08745839446783066, + "learning_rate": 1.8233054122704765e-05, + "loss": 0.8390365839004517, + "step": 774 + }, + { + "epoch": 2.120218579234973, + "grad_norm": 0.10107838362455368, + "learning_rate": 1.8128474402491286e-05, + "loss": 0.9191421270370483, + "step": 776 + }, + { + "epoch": 2.1256830601092895, + "grad_norm": 0.671896755695343, + "learning_rate": 1.802124572946668e-05, + "loss": 0.6683709621429443, + "step": 778 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 0.1272987574338913, + "learning_rate": 1.791140794998219e-05, + "loss": 0.9340140223503113, + "step": 780 + }, + { + "epoch": 2.1366120218579234, + "grad_norm": 0.060741033405065536, + "learning_rate": 1.7799001879937294e-05, + "loss": 0.9635948538780212, + "step": 782 + }, + { + "epoch": 2.1420765027322406, + "grad_norm": 0.0766320452094078, + "learning_rate": 1.768406928961248e-05, + "loss": 1.02224862575531, + "step": 784 + }, + { + "epoch": 2.1475409836065573, + "grad_norm": 0.08025658875703812, + "learning_rate": 1.7566652888147328e-05, + "loss": 0.6348251104354858, + "step": 786 + }, + { + "epoch": 2.1530054644808745, + "grad_norm": 0.748442530632019, + "learning_rate": 1.7446796307669725e-05, + "loss": 0.8085255026817322, + "step": 788 + }, + { + "epoch": 2.158469945355191, + "grad_norm": 0.14406074583530426, + "learning_rate": 1.732454408708209e-05, + "loss": 1.0787290334701538, + "step": 790 + }, + { + "epoch": 2.1639344262295084, + "grad_norm": 0.12041930854320526, + "learning_rate": 1.719994165551063e-05, + "loss": 0.6151443719863892, + "step": 792 + }, + { + "epoch": 2.169398907103825, + "grad_norm": 0.2303697168827057, + "learning_rate": 1.7073035315423838e-05, + "loss": 0.7541511058807373, + "step": 794 + }, + { + "epoch": 2.1748633879781423, + "grad_norm": 0.10725667327642441, + "learning_rate": 1.6943872225426396e-05, + "loss": 0.9673095941543579, + "step": 796 + }, + { + "epoch": 2.180327868852459, + "grad_norm": 0.08471065014600754, + "learning_rate": 1.6812500382734977e-05, + "loss": 0.9283774495124817, + "step": 798 + }, + { + "epoch": 2.185792349726776, + "grad_norm": 0.2112588882446289, + "learning_rate": 1.6678968605342348e-05, + "loss": 0.8556913137435913, + "step": 800 + }, + { + "epoch": 2.191256830601093, + "grad_norm": 0.10837297886610031, + "learning_rate": 1.6543326513876602e-05, + "loss": 0.9529185891151428, + "step": 802 + }, + { + "epoch": 2.19672131147541, + "grad_norm": 0.2664050757884979, + "learning_rate": 1.6405624513162002e-05, + "loss": 0.8970465064048767, + "step": 804 + }, + { + "epoch": 2.202185792349727, + "grad_norm": 0.11209579557180405, + "learning_rate": 1.6265913773488456e-05, + "loss": 0.9722681641578674, + "step": 806 + }, + { + "epoch": 2.2076502732240435, + "grad_norm": 0.07670333236455917, + "learning_rate": 1.6124246211596606e-05, + "loss": 0.5162777304649353, + "step": 808 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 0.08938929438591003, + "learning_rate": 1.598067447138542e-05, + "loss": 0.6079915761947632, + "step": 810 + }, + { + "epoch": 2.2185792349726774, + "grad_norm": 0.09473609179258347, + "learning_rate": 1.5835251904349688e-05, + "loss": 0.9542949199676514, + "step": 812 + }, + { + "epoch": 2.2240437158469946, + "grad_norm": 0.11124635487794876, + "learning_rate": 1.5688032549754453e-05, + "loss": 0.866733729839325, + "step": 814 + }, + { + "epoch": 2.2295081967213113, + "grad_norm": 0.13868460059165955, + "learning_rate": 1.553907111455401e-05, + "loss": 0.7844435572624207, + "step": 816 + }, + { + "epoch": 2.2349726775956285, + "grad_norm": 0.09409452229738235, + "learning_rate": 1.538842295306264e-05, + "loss": 0.9567630290985107, + "step": 818 + }, + { + "epoch": 2.240437158469945, + "grad_norm": 0.09976345300674438, + "learning_rate": 1.5236144046384917e-05, + "loss": 0.9797232151031494, + "step": 820 + }, + { + "epoch": 2.2459016393442623, + "grad_norm": 0.18227122724056244, + "learning_rate": 1.5082290981612987e-05, + "loss": 1.0215951204299927, + "step": 822 + }, + { + "epoch": 2.251366120218579, + "grad_norm": 0.13870660960674286, + "learning_rate": 1.4926920930798736e-05, + "loss": 0.7462106943130493, + "step": 824 + }, + { + "epoch": 2.2568306010928962, + "grad_norm": 0.8586530089378357, + "learning_rate": 1.4770091629708562e-05, + "loss": 0.5034890174865723, + "step": 826 + }, + { + "epoch": 2.262295081967213, + "grad_norm": 0.1805536448955536, + "learning_rate": 1.461186135636868e-05, + "loss": 0.554444432258606, + "step": 828 + }, + { + "epoch": 2.26775956284153, + "grad_norm": 0.1220104768872261, + "learning_rate": 1.4452288909408864e-05, + "loss": 0.8765403628349304, + "step": 830 + }, + { + "epoch": 2.273224043715847, + "grad_norm": 0.19856558740139008, + "learning_rate": 1.4291433586212831e-05, + "loss": 0.4287130534648895, + "step": 832 + }, + { + "epoch": 2.278688524590164, + "grad_norm": 0.11485260725021362, + "learning_rate": 1.4129355160883216e-05, + "loss": 0.6764381527900696, + "step": 834 + }, + { + "epoch": 2.2841530054644807, + "grad_norm": 0.15756544470787048, + "learning_rate": 1.3966113862029429e-05, + "loss": 0.9859319925308228, + "step": 836 + }, + { + "epoch": 2.289617486338798, + "grad_norm": 0.8009629845619202, + "learning_rate": 1.3801770350386568e-05, + "loss": 0.9443151354789734, + "step": 838 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 0.07390843331813812, + "learning_rate": 1.363638569627384e-05, + "loss": 0.7474254369735718, + "step": 840 + }, + { + "epoch": 2.300546448087432, + "grad_norm": 0.06552885472774506, + "learning_rate": 1.3470021356900696e-05, + "loss": 0.9358803629875183, + "step": 842 + }, + { + "epoch": 2.3060109289617485, + "grad_norm": 0.07640265673398972, + "learning_rate": 1.3302739153529252e-05, + "loss": 1.0211539268493652, + "step": 844 + }, + { + "epoch": 2.3114754098360657, + "grad_norm": 0.06588945537805557, + "learning_rate": 1.3134601248501366e-05, + "loss": 0.4891568124294281, + "step": 846 + }, + { + "epoch": 2.3169398907103824, + "grad_norm": 0.08077123761177063, + "learning_rate": 1.2965670122139071e-05, + "loss": 0.9966482520103455, + "step": 848 + }, + { + "epoch": 2.3224043715846996, + "grad_norm": 0.07753586769104004, + "learning_rate": 1.2796008549526752e-05, + "loss": 0.9455719590187073, + "step": 850 + }, + { + "epoch": 2.3278688524590163, + "grad_norm": 0.16270698606967926, + "learning_rate": 1.262567957718378e-05, + "loss": 0.8000156283378601, + "step": 852 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.17322923243045807, + "learning_rate": 1.2454746499636408e-05, + "loss": 1.0176856517791748, + "step": 854 + }, + { + "epoch": 2.33879781420765, + "grad_norm": 0.09006354957818985, + "learning_rate": 1.2283272835897359e-05, + "loss": 0.9464543461799622, + "step": 856 + }, + { + "epoch": 2.3442622950819674, + "grad_norm": 0.06936459243297577, + "learning_rate": 1.2111322305862088e-05, + "loss": 0.6206764578819275, + "step": 858 + }, + { + "epoch": 2.349726775956284, + "grad_norm": 0.07615432143211365, + "learning_rate": 1.1938958806630322e-05, + "loss": 0.8551270365715027, + "step": 860 + }, + { + "epoch": 2.3551912568306013, + "grad_norm": 0.1207866445183754, + "learning_rate": 1.1766246388761841e-05, + "loss": 0.987690806388855, + "step": 862 + }, + { + "epoch": 2.360655737704918, + "grad_norm": 0.07359467446804047, + "learning_rate": 1.1593249232475162e-05, + "loss": 0.6516119837760925, + "step": 864 + }, + { + "epoch": 2.366120218579235, + "grad_norm": 0.1848146617412567, + "learning_rate": 1.142003162379808e-05, + "loss": 1.2323834896087646, + "step": 866 + }, + { + "epoch": 2.371584699453552, + "grad_norm": 0.09937203675508499, + "learning_rate": 1.1246657930678817e-05, + "loss": 0.9654973745346069, + "step": 868 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 0.13023817539215088, + "learning_rate": 1.1073192579066867e-05, + "loss": 0.8113493323326111, + "step": 870 + }, + { + "epoch": 2.3825136612021858, + "grad_norm": 0.07656283676624298, + "learning_rate": 1.0899700028972169e-05, + "loss": 0.8552543520927429, + "step": 872 + }, + { + "epoch": 2.387978142076503, + "grad_norm": 0.08033186942338943, + "learning_rate": 1.072624475051166e-05, + "loss": 0.9093409180641174, + "step": 874 + }, + { + "epoch": 2.3934426229508197, + "grad_norm": 0.3393533229827881, + "learning_rate": 1.055289119995206e-05, + "loss": 0.7607249021530151, + "step": 876 + }, + { + "epoch": 2.3989071038251364, + "grad_norm": 0.06451606750488281, + "learning_rate": 1.0379703795757853e-05, + "loss": 0.8950807452201843, + "step": 878 + }, + { + "epoch": 2.4043715846994536, + "grad_norm": 0.08689342439174652, + "learning_rate": 1.0206746894653252e-05, + "loss": 0.9779506325721741, + "step": 880 + }, + { + "epoch": 2.4098360655737707, + "grad_norm": 0.06411824375391006, + "learning_rate": 1.0034084767707164e-05, + "loss": 0.5035781264305115, + "step": 882 + }, + { + "epoch": 2.4153005464480874, + "grad_norm": 0.08929432928562164, + "learning_rate": 9.861781576449879e-06, + "loss": 0.8264143466949463, + "step": 884 + }, + { + "epoch": 2.420765027322404, + "grad_norm": 0.06747736781835556, + "learning_rate": 9.689901349030646e-06, + "loss": 0.9163289666175842, + "step": 886 + }, + { + "epoch": 2.4262295081967213, + "grad_norm": 0.08119731396436691, + "learning_rate": 9.518507956424643e-06, + "loss": 0.6026349067687988, + "step": 888 + }, + { + "epoch": 2.431693989071038, + "grad_norm": 0.10195915400981903, + "learning_rate": 9.347665088698444e-06, + "loss": 0.5477317571640015, + "step": 890 + }, + { + "epoch": 2.4371584699453552, + "grad_norm": 0.14393250644207, + "learning_rate": 9.177436231342623e-06, + "loss": 0.8474670648574829, + "step": 892 + }, + { + "epoch": 2.442622950819672, + "grad_norm": 0.11053544282913208, + "learning_rate": 9.00788464168054e-06, + "loss": 0.5279150605201721, + "step": 894 + }, + { + "epoch": 2.448087431693989, + "grad_norm": 0.07670940458774567, + "learning_rate": 8.839073325361751e-06, + "loss": 0.7849703431129456, + "step": 896 + }, + { + "epoch": 2.453551912568306, + "grad_norm": 0.07591420412063599, + "learning_rate": 8.67106501294902e-06, + "loss": 0.8677206635475159, + "step": 898 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 0.09224042296409607, + "learning_rate": 8.503922136607536e-06, + "loss": 0.9447459578514099, + "step": 900 + }, + { + "epoch": 2.4644808743169397, + "grad_norm": 0.11288972944021225, + "learning_rate": 8.337706806905029e-06, + "loss": 0.9016860127449036, + "step": 902 + }, + { + "epoch": 2.469945355191257, + "grad_norm": 0.0662192702293396, + "learning_rate": 8.172480789731374e-06, + "loss": 0.802085816860199, + "step": 904 + }, + { + "epoch": 2.4754098360655736, + "grad_norm": 0.42237725853919983, + "learning_rate": 8.00830548334625e-06, + "loss": 0.8318672180175781, + "step": 906 + }, + { + "epoch": 2.480874316939891, + "grad_norm": 0.04947488009929657, + "learning_rate": 7.84524189556352e-06, + "loss": 0.39941924810409546, + "step": 908 + }, + { + "epoch": 2.4863387978142075, + "grad_norm": 0.04675549268722534, + "learning_rate": 7.68335062108057e-06, + "loss": 0.8486447930335999, + "step": 910 + }, + { + "epoch": 2.4918032786885247, + "grad_norm": 0.10684451460838318, + "learning_rate": 7.522691818961252e-06, + "loss": 0.4767865240573883, + "step": 912 + }, + { + "epoch": 2.4972677595628414, + "grad_norm": 0.10145942121744156, + "learning_rate": 7.3633251902806165e-06, + "loss": 0.9361868500709534, + "step": 914 + }, + { + "epoch": 2.5027322404371586, + "grad_norm": 0.07360153645277023, + "learning_rate": 7.205309955939983e-06, + "loss": 0.7450263500213623, + "step": 916 + }, + { + "epoch": 2.5081967213114753, + "grad_norm": 0.08711640536785126, + "learning_rate": 7.048704834660296e-06, + "loss": 0.9111591577529907, + "step": 918 + }, + { + "epoch": 2.5136612021857925, + "grad_norm": 0.05997847765684128, + "learning_rate": 6.8935680211621715e-06, + "loss": 0.8339736461639404, + "step": 920 + }, + { + "epoch": 2.519125683060109, + "grad_norm": 0.05966237187385559, + "learning_rate": 6.739957164540634e-06, + "loss": 0.862764835357666, + "step": 922 + }, + { + "epoch": 2.5245901639344264, + "grad_norm": 0.09305696934461594, + "learning_rate": 6.587929346842625e-06, + "loss": 0.5352113246917725, + "step": 924 + }, + { + "epoch": 2.530054644808743, + "grad_norm": 0.06322740018367767, + "learning_rate": 6.437541061855222e-06, + "loss": 0.4757808446884155, + "step": 926 + }, + { + "epoch": 2.5355191256830603, + "grad_norm": 0.14923587441444397, + "learning_rate": 6.288848194112459e-06, + "loss": 0.5935426354408264, + "step": 928 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 0.12613911926746368, + "learning_rate": 6.141905998128495e-06, + "loss": 0.962956428527832, + "step": 930 + }, + { + "epoch": 2.546448087431694, + "grad_norm": 0.11086481809616089, + "learning_rate": 5.996769077865029e-06, + "loss": 0.5794277787208557, + "step": 932 + }, + { + "epoch": 2.551912568306011, + "grad_norm": 0.09779225289821625, + "learning_rate": 5.853491366440313e-06, + "loss": 0.8937954902648926, + "step": 934 + }, + { + "epoch": 2.557377049180328, + "grad_norm": 0.11617986857891083, + "learning_rate": 5.712126106087557e-06, + "loss": 0.6793417930603027, + "step": 936 + }, + { + "epoch": 2.5628415300546448, + "grad_norm": 0.24284426867961884, + "learning_rate": 5.572725828369961e-06, + "loss": 0.538112223148346, + "step": 938 + }, + { + "epoch": 2.5683060109289615, + "grad_norm": 0.1071031242609024, + "learning_rate": 5.4353423346599944e-06, + "loss": 0.9442830681800842, + "step": 940 + }, + { + "epoch": 2.5737704918032787, + "grad_norm": 0.256465345621109, + "learning_rate": 5.30002667688986e-06, + "loss": 0.5235742926597595, + "step": 942 + }, + { + "epoch": 2.579234972677596, + "grad_norm": 0.07830263674259186, + "learning_rate": 5.1668291385804995e-06, + "loss": 0.9773924350738525, + "step": 944 + }, + { + "epoch": 2.5846994535519126, + "grad_norm": 0.08557162433862686, + "learning_rate": 5.03579921615621e-06, + "loss": 0.22877348959445953, + "step": 946 + }, + { + "epoch": 2.5901639344262293, + "grad_norm": 0.06623120605945587, + "learning_rate": 4.906985600551651e-06, + "loss": 0.8310664892196655, + "step": 948 + }, + { + "epoch": 2.5956284153005464, + "grad_norm": 0.07215957343578339, + "learning_rate": 4.780436159118221e-06, + "loss": 0.6763550043106079, + "step": 950 + }, + { + "epoch": 2.6010928961748636, + "grad_norm": 0.07515008747577667, + "learning_rate": 4.656197917836474e-06, + "loss": 0.873980700969696, + "step": 952 + }, + { + "epoch": 2.6065573770491803, + "grad_norm": 0.10116924345493317, + "learning_rate": 4.5343170438411885e-06, + "loss": 0.4812062084674835, + "step": 954 + }, + { + "epoch": 2.612021857923497, + "grad_norm": 0.08510662615299225, + "learning_rate": 4.414838828265581e-06, + "loss": 0.3283853828907013, + "step": 956 + }, + { + "epoch": 2.6174863387978142, + "grad_norm": 0.07895616441965103, + "learning_rate": 4.297807669411057e-06, + "loss": 0.5227924585342407, + "step": 958 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 0.06837275624275208, + "learning_rate": 4.183267056248689e-06, + "loss": 0.8052303791046143, + "step": 960 + }, + { + "epoch": 2.628415300546448, + "grad_norm": 0.5316947102546692, + "learning_rate": 4.071259552258709e-06, + "loss": 0.6300249695777893, + "step": 962 + }, + { + "epoch": 2.633879781420765, + "grad_norm": 0.1823725551366806, + "learning_rate": 3.961826779613801e-06, + "loss": 0.9274300932884216, + "step": 964 + }, + { + "epoch": 2.639344262295082, + "grad_norm": 0.09565065801143646, + "learning_rate": 3.85500940371226e-06, + "loss": 0.8221227526664734, + "step": 966 + }, + { + "epoch": 2.644808743169399, + "grad_norm": 0.07037303596735, + "learning_rate": 3.750847118066614e-06, + "loss": 0.6738592982292175, + "step": 968 + }, + { + "epoch": 2.650273224043716, + "grad_norm": 0.15274615585803986, + "learning_rate": 3.6493786295535234e-06, + "loss": 0.6995601654052734, + "step": 970 + }, + { + "epoch": 2.6557377049180326, + "grad_norm": 0.07629843801259995, + "learning_rate": 3.5506416440301885e-06, + "loss": 1.0434399843215942, + "step": 972 + }, + { + "epoch": 2.66120218579235, + "grad_norm": 0.07291587442159653, + "learning_rate": 3.4546728523228067e-06, + "loss": 0.6066082119941711, + "step": 974 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.07083304971456528, + "learning_rate": 3.361507916592206e-06, + "loss": 0.8686235547065735, + "step": 976 + }, + { + "epoch": 2.6721311475409837, + "grad_norm": 0.05435602739453316, + "learning_rate": 3.271181457081715e-06, + "loss": 0.9038045406341553, + "step": 978 + }, + { + "epoch": 2.6775956284153004, + "grad_norm": 0.06579703837633133, + "learning_rate": 3.1837270392522456e-06, + "loss": 0.8090689182281494, + "step": 980 + }, + { + "epoch": 2.6830601092896176, + "grad_norm": 0.07758979499340057, + "learning_rate": 3.0991771613092686e-06, + "loss": 0.791861891746521, + "step": 982 + }, + { + "epoch": 2.6885245901639343, + "grad_norm": 0.127408966422081, + "learning_rate": 3.017563242126483e-06, + "loss": 0.5899272561073303, + "step": 984 + }, + { + "epoch": 2.6939890710382515, + "grad_norm": 0.10469768196344376, + "learning_rate": 2.9389156095704764e-06, + "loss": 1.0097025632858276, + "step": 986 + }, + { + "epoch": 2.699453551912568, + "grad_norm": 0.08178994059562683, + "learning_rate": 2.8632634892308535e-06, + "loss": 0.8643054962158203, + "step": 988 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 0.07229630649089813, + "learning_rate": 2.7906349935599326e-06, + "loss": 0.7919317483901978, + "step": 990 + }, + { + "epoch": 2.710382513661202, + "grad_norm": 0.07362597435712814, + "learning_rate": 2.721057111426154e-06, + "loss": 0.48871687054634094, + "step": 992 + }, + { + "epoch": 2.7158469945355193, + "grad_norm": 0.09497088938951492, + "learning_rate": 2.6545556980849417e-06, + "loss": 0.9226855039596558, + "step": 994 + }, + { + "epoch": 2.721311475409836, + "grad_norm": 0.054573237895965576, + "learning_rate": 2.591155465570866e-06, + "loss": 0.8928760886192322, + "step": 996 + }, + { + "epoch": 2.726775956284153, + "grad_norm": 0.07322686165571213, + "learning_rate": 2.5308799735145813e-06, + "loss": 0.838208794593811, + "step": 998 + }, + { + "epoch": 2.73224043715847, + "grad_norm": 0.20288613438606262, + "learning_rate": 2.473751620388069e-06, + "loss": 1.0185120105743408, + "step": 1000 + }, + { + "epoch": 2.737704918032787, + "grad_norm": 0.06770773977041245, + "learning_rate": 2.419791635181301e-06, + "loss": 0.881740152835846, + "step": 1002 + }, + { + "epoch": 2.7431693989071038, + "grad_norm": 0.08364085108041763, + "learning_rate": 2.369020069513521e-06, + "loss": 0.8283625841140747, + "step": 1004 + }, + { + "epoch": 2.748633879781421, + "grad_norm": 0.46381357312202454, + "learning_rate": 2.3214557901820258e-06, + "loss": 0.35482144355773926, + "step": 1006 + }, + { + "epoch": 2.7540983606557377, + "grad_norm": 0.1369582712650299, + "learning_rate": 2.27711647215124e-06, + "loss": 1.0208179950714111, + "step": 1008 + }, + { + "epoch": 2.7595628415300544, + "grad_norm": 0.08924005925655365, + "learning_rate": 2.2360185919846593e-06, + "loss": 1.0277652740478516, + "step": 1010 + }, + { + "epoch": 2.7650273224043715, + "grad_norm": 0.16718317568302155, + "learning_rate": 2.1981774217221474e-06, + "loss": 0.7020498514175415, + "step": 1012 + }, + { + "epoch": 2.7704918032786887, + "grad_norm": 0.07670430839061737, + "learning_rate": 2.1636070232047966e-06, + "loss": 0.9566575884819031, + "step": 1014 + }, + { + "epoch": 2.7759562841530054, + "grad_norm": 0.086872898042202, + "learning_rate": 2.1323202428495544e-06, + "loss": 0.7534717917442322, + "step": 1016 + }, + { + "epoch": 2.781420765027322, + "grad_norm": 0.06671146303415298, + "learning_rate": 2.104328706875452e-06, + "loss": 0.9348956346511841, + "step": 1018 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 0.09823765605688095, + "learning_rate": 2.079642816983293e-06, + "loss": 0.6335864663124084, + "step": 1020 + }, + { + "epoch": 2.7923497267759565, + "grad_norm": 0.08545304834842682, + "learning_rate": 2.0582717464903546e-06, + "loss": 0.9375832676887512, + "step": 1022 + }, + { + "epoch": 2.797814207650273, + "grad_norm": 0.12878002226352692, + "learning_rate": 2.040223436921581e-06, + "loss": 0.4505836069583893, + "step": 1024 + }, + { + "epoch": 2.80327868852459, + "grad_norm": 0.07770732790231705, + "learning_rate": 2.025504595058489e-06, + "loss": 0.9441653490066528, + "step": 1026 + }, + { + "epoch": 2.808743169398907, + "grad_norm": 0.25983214378356934, + "learning_rate": 2.0141206904469206e-06, + "loss": 0.7026041150093079, + "step": 1028 + }, + { + "epoch": 2.8142076502732243, + "grad_norm": 0.0644698366522789, + "learning_rate": 2.006075953364551e-06, + "loss": 0.7949016094207764, + "step": 1030 + }, + { + "epoch": 2.819672131147541, + "grad_norm": 0.06279131770133972, + "learning_rate": 2.0013733732489103e-06, + "loss": 0.7919459939002991, + "step": 1032 + }, + { + "epoch": 2.8251366120218577, + "grad_norm": 0.18770284950733185, + "learning_rate": 2.000014697586502e-06, + "loss": 0.6975997090339661, + "step": 1034 + }, + { + "epoch": 2.830601092896175, + "grad_norm": 0.08828295767307281, + "learning_rate": 2.0020004312634374e-06, + "loss": 1.0092765092849731, + "step": 1036 + }, + { + "epoch": 2.836065573770492, + "grad_norm": 0.059960268437862396, + "learning_rate": 2.0073298363778166e-06, + "loss": 0.8346893191337585, + "step": 1038 + }, + { + "epoch": 2.841530054644809, + "grad_norm": 0.07700161635875702, + "learning_rate": 2.016000932513934e-06, + "loss": 0.9381151795387268, + "step": 1040 + }, + { + "epoch": 2.8469945355191255, + "grad_norm": 0.09279518574476242, + "learning_rate": 2.0280104974782058e-06, + "loss": 0.9051158428192139, + "step": 1042 + }, + { + "epoch": 2.8524590163934427, + "grad_norm": 0.07361223548650742, + "learning_rate": 2.043354068496541e-06, + "loss": 0.7462697625160217, + "step": 1044 + }, + { + "epoch": 2.8579234972677594, + "grad_norm": 0.07207613438367844, + "learning_rate": 2.0620259438727168e-06, + "loss": 0.9937452077865601, + "step": 1046 + }, + { + "epoch": 2.8633879781420766, + "grad_norm": 0.046472106128931046, + "learning_rate": 2.084019185107135e-06, + "loss": 0.465142160654068, + "step": 1048 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 0.10938539355993271, + "learning_rate": 2.1093256194751822e-06, + "loss": 0.8489465713500977, + "step": 1050 + }, + { + "epoch": 2.8743169398907105, + "grad_norm": 0.076163150370121, + "learning_rate": 2.137935843064233e-06, + "loss": 0.8256275057792664, + "step": 1052 + }, + { + "epoch": 2.879781420765027, + "grad_norm": 0.09961618483066559, + "learning_rate": 2.1698392242681502e-06, + "loss": 0.9924526214599609, + "step": 1054 + }, + { + "epoch": 2.8852459016393444, + "grad_norm": 0.16896775364875793, + "learning_rate": 2.2050239077380097e-06, + "loss": 0.7573210000991821, + "step": 1056 + }, + { + "epoch": 2.890710382513661, + "grad_norm": 0.17918655276298523, + "learning_rate": 2.2434768187875723e-06, + "loss": 0.5940418839454651, + "step": 1058 + }, + { + "epoch": 2.8961748633879782, + "grad_norm": 0.06816217303276062, + "learning_rate": 2.285183668251853e-06, + "loss": 0.8905007839202881, + "step": 1060 + }, + { + "epoch": 2.901639344262295, + "grad_norm": 0.05951293557882309, + "learning_rate": 2.3301289577970028e-06, + "loss": 0.8955813646316528, + "step": 1062 + }, + { + "epoch": 2.907103825136612, + "grad_norm": 0.059989165514707565, + "learning_rate": 2.3782959856795113e-06, + "loss": 0.9200108051300049, + "step": 1064 + }, + { + "epoch": 2.912568306010929, + "grad_norm": 0.07120663672685623, + "learning_rate": 2.4296668529525998e-06, + "loss": 0.5274271965026855, + "step": 1066 + }, + { + "epoch": 2.918032786885246, + "grad_norm": 0.08395183831453323, + "learning_rate": 2.4842224701175147e-06, + "loss": 0.5822516083717346, + "step": 1068 + }, + { + "epoch": 2.9234972677595628, + "grad_norm": 0.1801450401544571, + "learning_rate": 2.541942564217196e-06, + "loss": 0.7497321963310242, + "step": 1070 + }, + { + "epoch": 2.92896174863388, + "grad_norm": 0.10151448100805283, + "learning_rate": 2.6028056863697506e-06, + "loss": 0.976716935634613, + "step": 1072 + }, + { + "epoch": 2.9344262295081966, + "grad_norm": 0.3388555347919464, + "learning_rate": 2.6667892197388884e-06, + "loss": 0.5633800029754639, + "step": 1074 + }, + { + "epoch": 2.939890710382514, + "grad_norm": 0.11042678356170654, + "learning_rate": 2.7338693879383967e-06, + "loss": 0.5925998687744141, + "step": 1076 + }, + { + "epoch": 2.9453551912568305, + "grad_norm": 0.10288611799478531, + "learning_rate": 2.8040212638674506e-06, + "loss": 0.5275436043739319, + "step": 1078 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 0.15759417414665222, + "learning_rate": 2.877218778973578e-06, + "loss": 0.9885872006416321, + "step": 1080 + }, + { + "epoch": 2.9562841530054644, + "grad_norm": 0.09481590986251831, + "learning_rate": 2.9534347329398027e-06, + "loss": 0.6088681817054749, + "step": 1082 + }, + { + "epoch": 2.9617486338797816, + "grad_norm": 0.08891811966896057, + "learning_rate": 3.0326408037922827e-06, + "loss": 0.9159113764762878, + "step": 1084 + }, + { + "epoch": 2.9672131147540983, + "grad_norm": 0.059494610875844955, + "learning_rate": 3.1148075584248306e-06, + "loss": 0.5118452906608582, + "step": 1086 + }, + { + "epoch": 2.972677595628415, + "grad_norm": 0.07500632852315903, + "learning_rate": 3.199904463536296e-06, + "loss": 1.0341871976852417, + "step": 1088 + }, + { + "epoch": 2.978142076502732, + "grad_norm": 0.08454718440771103, + "learning_rate": 3.2878998969767954e-06, + "loss": 0.7394497394561768, + "step": 1090 + }, + { + "epoch": 2.9836065573770494, + "grad_norm": 0.06789242476224899, + "learning_rate": 3.378761159498547e-06, + "loss": 0.7495979070663452, + "step": 1092 + }, + { + "epoch": 2.989071038251366, + "grad_norm": 0.16882292926311493, + "learning_rate": 3.472454486906972e-06, + "loss": 0.8230282068252563, + "step": 1094 + }, + { + "epoch": 2.994535519125683, + "grad_norm": 1.5026551485061646, + "learning_rate": 3.5689450626075132e-06, + "loss": 0.5486379861831665, + "step": 1096 + }, + { + "epoch": 3.0, + "grad_norm": 0.15779562294483185, + "learning_rate": 3.668197030543573e-06, + "loss": 0.8555684089660645, + "step": 1098 + }, + { + "epoch": 3.0, + "step": 1098, + "total_flos": 4.842473815785079e+18, + "train_loss": 0.9743298643925151, + "train_runtime": 37826.3024, + "train_samples_per_second": 1.742, + "train_steps_per_second": 0.029 + } + ], + "logging_steps": 2, + "max_steps": 1098, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.842473815785079e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}